initial release

3f1316d5 · traveller59 · a347176a · 3f1316d5 · 3f1316d5 · 3f1316d5
Commit 3f1316d5 authored Jan 20, 2019 by traveller59
20 changed files
--- a/include/spconv/spconv_ops.h
+++ b/include/spconv/spconv_ops.h
+// Copyright 2019 Yan Yan
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// 
+//     http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef SPARSE_CONV_OP_H_
+#define SPARSE_CONV_OP_H_
+
+#include <cuda_runtime_api.h>
+#include <spconv/indice.h>
+#include <spconv/reordering.h>
+#include <torch/script.h>
+#include <torch_utils.h>
+#include <utility/timer.h>
+
+namespace spconv {
+/// Builds the gather/scatter index pairs for one sparse convolution.
+///
+/// torch.jit only supports int64 for integer arguments, so the shape-like
+/// vectors arrive as int64 and are narrowed to int32 before being handed to
+/// the index-pair functors.
+///
+/// @tparam NDim            number of spatial dimensions; must equal
+///                         indices.size(1) - 1.
+/// @param indices          active-site coordinates, read as int32; each row is
+///                         (batchIdx, spatial coords...).
+/// @param batchSize        number of batches; sizes the dense lookup grid.
+/// @param outSpatialShape  output spatial extent, NDim entries.
+/// @param spatialShape     not used by this function.
+/// @param kernelSize       kernel extent per dimension; product must be <= 256.
+/// @param stride           per-dimension stride (replaced by 1 when subM).
+/// @param padding          per-dimension padding (replaced by kernelSize / 2
+///                         when subM).
+/// @param dilation         per-dimension dilation.
+/// @param outPadding       only its length is validated here.
+/// @param subM             selects the SubM functor; the input indices are
+///                         returned as the output indices.
+/// @param transpose        forwarded unchanged to the functors.
+/// @return {outputIndices, indicePairs, indiceNum}. For subM the first entry
+///         is `indices` itself; otherwise it is the first numActOut rows of
+///         the freshly computed output indices.
+/// @throws std::runtime_error (through TV_ASSERT_RT_ERR) when a length check
+///         fails or the kernel volume exceeds 256.
+template <unsigned NDim>
+std::vector<torch::Tensor>
+getIndicePair(torch::Tensor indices, int64_t batchSize,
+        std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
+        std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
+        std::vector<int64_t> padding, std::vector<int64_t> dilation,
+        std::vector<int64_t> outPadding, bool subM, bool transpose) {
+  auto numAct = indices.size(0);
+  auto coorDim = indices.size(1) - 1; // batchIdx + xyz
+  TV_ASSERT_RT_ERR(NDim == coorDim, "error");
+  TV_ASSERT_RT_ERR(kernelSize.size() == coorDim, "error");
+  TV_ASSERT_RT_ERR(outSpatialShape.size() == coorDim, "error");
+  TV_ASSERT_RT_ERR(stride.size() == coorDim, "error");
+  TV_ASSERT_RT_ERR(padding.size() == coorDim, "error");
+  TV_ASSERT_RT_ERR(outPadding.size() == coorDim, "error");
+  TV_ASSERT_RT_ERR(dilation.size() == coorDim, "error");
+  auto kernelVolume = kernelSize[0];
+  for (size_t i = 1; i < kernelSize.size(); ++i) {
+    kernelVolume *= kernelSize[i];
+  }
+  TV_ASSERT_RT_ERR(kernelVolume <= 256, "error");
+  auto outputVolume = outSpatialShape[0];
+  for (size_t i = 1; i < outSpatialShape.size(); ++i) {
+    outputVolume *= outSpatialShape[i];
+  }
+  // Every helper tensor is int32 and lives on the same device as `indices`.
+  const auto int32Options =
+      torch::dtype(torch::kInt32).device(indices.device());
+  torch::Tensor indicePairs =
+      torch::full({kernelVolume, 2, numAct}, -1, int32Options);
+  torch::Tensor indiceNum = torch::zeros({kernelVolume}, int32Options);
+  torch::Tensor gridOut =
+      torch::full({batchSize * outputVolume}, -1, int32Options);
+  int64_t numActOut = -1;
+  tv::SimpleVector<int, NDim> outSpatialShape32;
+  tv::SimpleVector<int, NDim> kernelSize32;
+  tv::SimpleVector<int, NDim> stride32;
+  tv::SimpleVector<int, NDim> padding32;
+  tv::SimpleVector<int, NDim> dilation32;
+  for (unsigned i = 0; i < NDim; ++i) {
+    outSpatialShape32.push_back(outSpatialShape[i]);
+    kernelSize32.push_back(kernelSize[i]);
+    dilation32.push_back(dilation[i]);
+    if (subM) {
+      // SubM ignores the caller's stride/padding.
+      stride32.push_back(1);
+      padding32.push_back(kernelSize[i] / 2);
+    } else {
+      stride32.push_back(stride[i]);
+      padding32.push_back(padding[i]);
+    }
+  }
+  const bool onCpu = indices.device().type() == torch::kCPU;
+  if (subM) {
+    if (onCpu) {
+      auto getIndicePairFtor =
+          functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim>();
+      numActOut = getIndicePairFtor(
+          tv::CPU(), tv::torch2tv<const int>(indices), tv::torch2tv<int>(gridOut),
+          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum), kernelSize32,
+          stride32, padding32, dilation32, outSpatialShape32, transpose);
+    } else {
+      auto getIndicePairFtor =
+          functor::CreateSubMIndicePairFunctor<tv::GPU, int, int, NDim>();
+      numActOut = getIndicePairFtor(
+          tv::TorchGPU(), tv::torch2tv<const int>(indices), tv::torch2tv<int>(gridOut),
+          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum), kernelSize32,
+          stride32, padding32, dilation32, outSpatialShape32, transpose);
+    }
+    return {indices, indicePairs, indiceNum};
+  }
+  // Regular convolution: the buffer has room for numAct * kernelVolume output
+  // rows; only the first numActOut rows are returned.
+  torch::Tensor outInds =
+      torch::zeros({numAct * kernelVolume, coorDim + 1}, int32Options);
+  if (onCpu) {
+    auto getIndicePairFtor = functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();
+    numActOut = getIndicePairFtor(
+        tv::CPU(), tv::torch2tv<const int>(indices),
+        tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
+        tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum), kernelSize32,
+        stride32, padding32, dilation32, outSpatialShape32, transpose);
+  } else {
+    // Only the two-phase GPU path needs this scratch tensor, so it is
+    // allocated here instead of on every call.
+    auto indicePairUnique =
+        torch::full({indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),
+                    int32Options);
+    auto getIndicePairFtorP1 =
+        functor::CreateConvIndicePairFunctorP1<tv::GPU, int, int, NDim>();
+    auto getIndicePairFtorP2 =
+        functor::CreateConvIndicePairFunctorP2<tv::GPU, int, int, NDim>();
+    numActOut =
+        getIndicePairFtorP1(tv::TorchGPU(), tv::torch2tv<const int>(indices),
+                      tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
+                      tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
+                      tv::torch2tv<int>(indicePairUnique), kernelSize32, stride32,
+                      padding32, dilation32, outSpatialShape32, transpose);
+    if (numActOut > 0) {
+      // Phase 2 consumes the de-duplicated values written by phase 1.
+      auto res = torch::_unique(indicePairUnique);
+      indicePairUnique = std::get<0>(res);
+      numActOut = getIndicePairFtorP2(
+          tv::TorchGPU(), tv::torch2tv<const int>(indices),
+          tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
+          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
+          tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose);
+    }
+  }
+  return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum};
+}
+
+/// Variant of getIndicePair that reuses a caller-provided lookup grid instead
+/// of allocating a fresh one on every call.
+///
+/// @tparam NDim            number of spatial dimensions; must equal
+///                         indices.size(1) - 1.
+/// @param indices          active-site coordinates, read as int32; each row is
+///                         (batchIdx, spatial coords...).
+/// @param gridOut          caller-owned int32 scratch grid with at least
+///                         batchSize * prod(outSpatialShape) elements. The CPU
+///                         paths refill it with -1 after use; the GPU paths
+///                         pass a trailing `true` to the functors instead
+///                         (presumably a "reset grid" flag - confirm against
+///                         the functor declarations).
+/// @param batchSize        number of batches.
+/// @param outSpatialShape  output spatial extent, NDim entries.
+/// @param spatialShape     not used by this function.
+/// @param kernelSize       kernel extent per dimension; product must be <= 256.
+/// @param stride           per-dimension stride (replaced by 1 when subM).
+/// @param padding          per-dimension padding (replaced by kernelSize / 2
+///                         when subM).
+/// @param dilation         per-dimension dilation.
+/// @param outPadding       only its length is validated here.
+/// @param subM             selects the SubM functor; the input indices are
+///                         returned as the output indices.
+/// @param transpose        forwarded unchanged to the functors.
+/// @return {outputIndices, indicePairs, indiceNum}, as for getIndicePair.
+/// @throws std::runtime_error on a failed length/volume check and
+///         std::invalid_argument when gridOut is too small.
+template <unsigned NDim>
+std::vector<torch::Tensor>
+getIndicePairPreGrid(torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
+        std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
+        std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
+        std::vector<int64_t> padding, std::vector<int64_t> dilation,
+        std::vector<int64_t> outPadding, bool subM, bool transpose) {
+  auto numAct = indices.size(0);
+  auto coorDim = indices.size(1) - 1; // batchIdx + xyz
+  TV_ASSERT_RT_ERR(NDim == coorDim, "error");
+  TV_ASSERT_RT_ERR(kernelSize.size() == coorDim, "error");
+  TV_ASSERT_RT_ERR(outSpatialShape.size() == coorDim, "error");
+  TV_ASSERT_RT_ERR(stride.size() == coorDim, "error");
+  TV_ASSERT_RT_ERR(padding.size() == coorDim, "error");
+  TV_ASSERT_RT_ERR(outPadding.size() == coorDim, "error");
+  TV_ASSERT_RT_ERR(dilation.size() == coorDim, "error");
+  auto kernelVolume = kernelSize[0];
+  for (size_t i = 1; i < kernelSize.size(); ++i) {
+    kernelVolume *= kernelSize[i];
+  }
+  TV_ASSERT_RT_ERR(kernelVolume <= 256, "error");
+  auto outputVolume = outSpatialShape[0];
+  for (size_t i = 1; i < outSpatialShape.size(); ++i) {
+    outputVolume *= outSpatialShape[i];
+  }
+  TV_ASSERT_INVALID_ARG(gridOut.numel() >= outputVolume * batchSize, "error");
+  // Every helper tensor is int32 and lives on the same device as `indices`.
+  const auto int32Options =
+      torch::dtype(torch::kInt32).device(indices.device());
+  torch::Tensor indicePairs =
+      torch::full({kernelVolume, 2, numAct}, -1, int32Options);
+  torch::Tensor indiceNum = torch::zeros({kernelVolume}, int32Options);
+  int64_t numActOut = -1;
+  tv::SimpleVector<int, NDim> outSpatialShape32;
+  tv::SimpleVector<int, NDim> kernelSize32;
+  tv::SimpleVector<int, NDim> stride32;
+  tv::SimpleVector<int, NDim> padding32;
+  tv::SimpleVector<int, NDim> dilation32;
+  for (unsigned i = 0; i < NDim; ++i) {
+    outSpatialShape32.push_back(outSpatialShape[i]);
+    kernelSize32.push_back(kernelSize[i]);
+    dilation32.push_back(dilation[i]);
+    if (subM) {
+      // SubM ignores the caller's stride/padding.
+      stride32.push_back(1);
+      padding32.push_back(kernelSize[i] / 2);
+    } else {
+      stride32.push_back(stride[i]);
+      padding32.push_back(padding[i]);
+    }
+  }
+  const bool onCpu = indices.device().type() == torch::kCPU;
+  if (subM) {
+    if (onCpu) {
+      auto getIndicePairFtor =
+          functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim>();
+      numActOut = getIndicePairFtor(
+          tv::CPU(), tv::torch2tv<const int>(indices), tv::torch2tv<int>(gridOut),
+          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum), kernelSize32,
+          stride32, padding32, dilation32, outSpatialShape32, transpose);
+      // Leave the shared grid clean for the next caller.
+      gridOut.fill_(-1);
+    } else {
+      auto getIndicePairFtor =
+          functor::CreateSubMIndicePairFunctor<tv::GPU, int, int, NDim>();
+      numActOut = getIndicePairFtor(
+          tv::TorchGPU(), tv::torch2tv<const int>(indices), tv::torch2tv<int>(gridOut),
+          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum), kernelSize32,
+          stride32, padding32, dilation32, outSpatialShape32, transpose, true);
+    }
+    return {indices, indicePairs, indiceNum};
+  }
+  // Regular convolution: the buffer has room for numAct * kernelVolume output
+  // rows; only the first numActOut rows are returned.
+  torch::Tensor outInds =
+      torch::zeros({numAct * kernelVolume, coorDim + 1}, int32Options);
+  if (onCpu) {
+    auto getIndicePairFtor = functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();
+    numActOut = getIndicePairFtor(
+        tv::CPU(), tv::torch2tv<const int>(indices),
+        tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
+        tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum), kernelSize32,
+        stride32, padding32, dilation32, outSpatialShape32, transpose, true);
+    gridOut.fill_(-1);
+  } else {
+    // Only the two-phase GPU path needs this scratch tensor, so it is
+    // allocated here instead of on every call.
+    auto indicePairUnique =
+        torch::full({indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),
+                    int32Options);
+    auto getIndicePairFtorP1 =
+        functor::CreateConvIndicePairFunctorP1<tv::GPU, int, int, NDim>();
+    auto getIndicePairFtorP2 =
+        functor::CreateConvIndicePairFunctorP2<tv::GPU, int, int, NDim>();
+    numActOut =
+        getIndicePairFtorP1(tv::TorchGPU(), tv::torch2tv<const int>(indices),
+                      tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
+                      tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
+                      tv::torch2tv<int>(indicePairUnique), kernelSize32, stride32,
+                      padding32, dilation32, outSpatialShape32, transpose);
+    if (numActOut > 0) {
+      // Phase 2 consumes the de-duplicated values written by phase 1.
+      auto res = torch::_unique(indicePairUnique);
+      indicePairUnique = std::get<0>(res);
+      numActOut = getIndicePairFtorP2(
+          tv::TorchGPU(), tv::torch2tv<const int>(indices),
+          tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
+          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
+          tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose, true);
+    }
+  }
+  return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum};
+}
+
+
+/// Forward pass of a sparse convolution using gather -> GEMM -> scatter-add,
+/// one kernel offset at a time.
+///
+/// @tparam T           element type of features/filters.
+/// @param features     input features; features.size(1) is the input plane
+///                     count.
+/// @param filters      kernel weights; viewed as
+///                     {-1, numInPlanes, numOutPlanes}.
+/// @param indicePairs  int32 pair table; indicePairs.size(0) is the kernel
+///                     volume. Sub-row `inverse` is gathered from and
+///                     sub-row `!inverse` is scattered to.
+/// @param indiceNum    int32 count of valid pairs per kernel offset.
+/// @param numActOut    number of output rows.
+/// @param inverse      swaps the gather/scatter roles of the two pair rows.
+/// @param subm         when true, the offset with the most pairs is treated
+///                     as the kernel centre and computed with a single GEMM
+///                     over all features.
+/// @return {numActOut, numOutPlanes} output features.
+/// @throws std::runtime_error when indicePairs has no kernel offsets or
+///         indiceNum holds fewer entries than the kernel volume.
+template <typename T>
+torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
+                       torch::Tensor indicePairs, torch::Tensor indiceNum,
+                       int64_t numActOut, bool inverse, bool subm) {
+  auto device = features.device().type();
+  auto ndim = filters.dim() - 2;
+  auto kernelVolume = indicePairs.size(0);
+  // std::max_element on an empty range returns the end iterator, which must
+  // not be dereferenced; reading past indiceNum would be out of bounds.
+  TV_ASSERT_RT_ERR(kernelVolume > 0,
+                   "indicePairs must contain at least one kernel offset");
+  TV_ASSERT_RT_ERR(indiceNum.numel() >= kernelVolume,
+                   "indiceNum must have one entry per kernel offset");
+  auto numInPlanes = features.size(1);
+  auto numOutPlanes = filters.size(ndim + 1);
+  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
+  const int *pairCounts = indicePairNumCpu.data<int>();
+  auto indicePairMaxSizeIter =
+      std::max_element(pairCounts, pairCounts + kernelVolume);
+  int indicePairMaxOffset = indicePairMaxSizeIter - pairCounts;
+  int indicePairMaxSize = *indicePairMaxSizeIter;
+
+  auto options =
+      torch::TensorOptions().dtype(features.dtype()).device(features.device());
+
+  torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
+  // Scratch buffers sized for the busiest offset and reused by every offset.
+  torch::Tensor inputBuffer = torch::zeros({indicePairMaxSize, numInPlanes}, options);
+  torch::Tensor outputBuffer =
+      torch::zeros({indicePairMaxSize, numOutPlanes}, options);
+  filters = filters.view({-1, numInPlanes, numOutPlanes});
+  if (subm) { // the center index of subm conv don't need gather and scatter
+              // add.
+    torch::mm_out(output, features, filters[indicePairMaxOffset]);
+  }
+  for (int i = 0; i < kernelVolume; ++i) {
+    auto nHot = pairCounts[i];
+    if (nHot <= 0 || (subm && i == indicePairMaxOffset)) {
+      continue;
+    }
+    // Views over the first nHot rows of the scratch buffers.
+    auto outputBufferBlob =
+        torch::from_blob(outputBuffer.data<T>(), {nHot, numOutPlanes}, options);
+    auto inputBufferBlob =
+        torch::from_blob(inputBuffer.data<T>(), {nHot, numInPlanes}, options);
+
+    // Gather the input rows referenced by this offset. The original author
+    // noted that torch::index_select_out was slower than this functor,
+    // possibly because of the int -> long index conversion.
+    if (device == torch::kCPU) {
+      functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
+      gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
+                 tv::torch2tv<const T>(features),
+                 tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
+    } else {
+      functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
+      gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
+                 tv::torch2tv<const T>(features),
+                 tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
+      TV_CHECK_CUDA_ERR();
+    }
+    torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
+
+    // Accumulate the GEMM result into the output rows for this offset.
+    if (device == torch::kCPU) {
+      functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
+      scatterFtor(tv::CPU(), tv::torch2tv<T>(output),
+                  tv::torch2tv<const T>(outputBuffer),
+                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
+                  true);
+    } else {
+      functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
+      scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
+                  tv::torch2tv<const T>(outputBuffer),
+                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
+                  true);
+      TV_CHECK_CUDA_ERR();
+    }
+  }
+  return output;
+}
+
+/// Backward pass matching indiceConv: computes the gradients with respect to
+/// the input features and the filters from the output gradient.
+///
+/// For each kernel offset the referenced feature rows and output-gradient rows
+/// are gathered into scratch buffers, two GEMMs produce the filter gradient
+/// slice and the per-row input gradient, and the latter is scatter-added into
+/// the input gradient.
+///
+/// @return {inputGrad, filtersGrad}; filtersGrad has the shape of `filters`
+///         as it was passed in.
+template <typename T>
+std::vector<torch::Tensor>
+indiceConvBackward(torch::Tensor features, torch::Tensor filters,
+                 torch::Tensor outGrad, torch::Tensor indicePairs, torch::Tensor indiceNum,
+                 bool inverse, bool subm) {
+  const auto deviceType = features.device().type();
+  const auto spatialDims = filters.dim() - 2;
+  const auto kernelVolume = indicePairs.size(0);
+  const auto numInPlanes = features.size(1);
+  const auto numOutPlanes = filters.size(spatialDims + 1);
+
+  // Per-offset pair counts are inspected on the host.
+  auto countsOnHost = indiceNum.to({torch::kCPU});
+  auto busiestIter = std::max_element(
+      countsOnHost.data<int>(), countsOnHost.data<int>() + kernelVolume);
+  int busiestOffset = busiestIter - countsOnHost.data<int>();
+  int busiestCount = *busiestIter;
+
+  auto options =
+      torch::TensorOptions().dtype(features.dtype()).device(features.device());
+  auto filterShape = filters.sizes();
+  torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
+  torch::Tensor filtersGrad = torch::zeros(filterShape, options);
+  // Scratch space large enough for the busiest offset.
+  torch::Tensor inputBuffer = torch::zeros({busiestCount, numInPlanes}, options);
+  torch::Tensor outputBuffer =
+      torch::zeros({busiestCount, numOutPlanes}, options);
+
+  filters = filters.view({-1, numInPlanes, numOutPlanes});
+  filtersGrad = filtersGrad.view({-1, numInPlanes, numOutPlanes});
+  if (subm) {
+    // The busiest offset is handled densely, without gather/scatter.
+    auto centreFilterGrad = filtersGrad[busiestOffset];
+    torch::mm_out(centreFilterGrad, features.t(), outGrad);
+    torch::mm_out(inputGrad, outGrad, filters[busiestOffset].t());
+  }
+  for (int k = 0; k < kernelVolume; ++k) {
+    auto nHot = countsOnHost.data<int>()[k];
+    const bool handledDensely = subm && k == busiestOffset;
+    if (nHot <= 0 || handledDensely) {
+      continue;
+    }
+    if (deviceType == torch::kCPU) {
+      functor::SparseGatherFunctor<tv::CPU, T, int> gatherIn;
+      functor::SparseGatherFunctor<tv::CPU, T, int> gatherOut;
+      gatherIn(tv::CPU(), tv::torch2tv<T>(inputBuffer),
+               tv::torch2tv<const T>(features),
+               tv::torch2tv<const int>(indicePairs).subview(k, inverse), nHot);
+      gatherOut(tv::CPU(), tv::torch2tv<T>(outputBuffer),
+                tv::torch2tv<const T>(outGrad),
+                tv::torch2tv<const int>(indicePairs).subview(k, !inverse), nHot);
+    } else {
+      functor::SparseGatherFunctor<tv::GPU, T, int> gatherIn;
+      functor::SparseGatherFunctor<tv::GPU, T, int> gatherOut;
+      gatherIn(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
+               tv::torch2tv<const T>(features),
+               tv::torch2tv<const int>(indicePairs).subview(k, inverse), nHot);
+      TV_CHECK_CUDA_ERR();
+      gatherOut(tv::TorchGPU(), tv::torch2tv<T>(outputBuffer),
+                tv::torch2tv<const T>(outGrad),
+                tv::torch2tv<const int>(indicePairs).subview(k, !inverse), nHot);
+      TV_CHECK_CUDA_ERR();
+    }
+    auto offsetFilterGrad = filtersGrad[k];
+    // Views over the first nHot rows of the scratch buffers.
+    auto outGradRows =
+        torch::from_blob(outputBuffer.data<T>(), {nHot, numOutPlanes}, options);
+    auto inRows =
+        torch::from_blob(inputBuffer.data<T>(), {nHot, numInPlanes}, options);
+
+    torch::mm_out(offsetFilterGrad, inRows.t(), outGradRows);
+    // The gathered features are no longer needed, so the same rows are
+    // overwritten with the per-row input gradient.
+    torch::mm_out(inRows, outGradRows, filters[k].t());
+    if (deviceType == torch::kCPU) {
+      functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
+      scatterFtor(tv::CPU(), tv::torch2tv<T>(inputGrad),
+                  tv::torch2tv<const T>(inputBuffer),
+                  tv::torch2tv<const int>(indicePairs).subview(k, inverse), nHot);
+    } else {
+      functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
+      scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(inputGrad),
+                  tv::torch2tv<const T>(inputBuffer),
+                  tv::torch2tv<const int>(indicePairs).subview(k, inverse), nHot);
+      TV_CHECK_CUDA_ERR();
+    }
+  }
+  return {inputGrad, filtersGrad.view(filterShape)};
+}
+
+/// Experimental forward pass (as the name says, not for production use) that
+/// runs the gather, GEMM and scatter-add stages as three separate loops over
+/// the kernel offsets, with one scratch slice per offset.
+///
+/// Prints the buffer creation time to std::cout.
+///
+/// @param features     input features; features.size(1) is the input plane
+///                     count.
+/// @param filters      kernel weights; viewed as
+///                     {-1, numInPlanes, numOutPlanes}.
+/// @param indicePairs  int32 pair table; indicePairs.size(0) is the kernel
+///                     volume.
+/// @param indiceNum    int32 count of valid pairs per kernel offset.
+/// @param numActOut    number of output rows.
+/// @param inverse      swaps the gather/scatter roles of the two pair rows.
+/// @param subm         when true, the busiest offset is computed with a
+///                     single dense GEMM and the scratch slices are sized for
+///                     the second-busiest offset.
+/// @return {numActOut, numOutPlanes} output features.
+/// @throws std::runtime_error when indicePairs has no kernel offsets.
+template <typename T>
+torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor filters,
+                         torch::Tensor indicePairs, torch::Tensor indiceNum,
+                         int64_t numActOut, bool inverse, bool subm) {
+  auto device = features.device().type();
+  auto ndim = filters.dim() - 2;
+  auto kernelVolume = indicePairs.size(0);
+  // An empty range would make max_element return end() and erase() below
+  // operate on an invalid iterator.
+  TV_ASSERT_RT_ERR(kernelVolume > 0,
+                   "indicePairs must contain at least one kernel offset");
+  auto numInPlanes = features.size(1);
+  auto numOutPlanes = filters.size(ndim + 1);
+  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
+  const int *pairCounts = indicePairNumCpu.data<int>();
+  auto indicePairMaxSizeIter =
+      std::max_element(pairCounts, pairCounts + kernelVolume);
+  int indicePairMaxOffset = indicePairMaxSizeIter - pairCounts;
+  int indicePairMaxSize = *indicePairMaxSizeIter;
+  // Largest count among the remaining offsets once the busiest is removed.
+  std::vector<int> indicePairNumVec(pairCounts, pairCounts + kernelVolume);
+  indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
+  // With a single kernel offset nothing is left; dereferencing max_element
+  // on the empty vector would be undefined behaviour.
+  int subRuleMaxSize =
+      indicePairNumVec.empty()
+          ? 0
+          : *std::max_element(indicePairNumVec.begin(), indicePairNumVec.end());
+  if (subm) {
+    indicePairMaxSize = subRuleMaxSize;
+  }
+  auto timer = spconv::CudaContextTimer<>();
+  auto options =
+      torch::TensorOptions().dtype(features.dtype()).device(features.device());
+
+  torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
+  torch::Tensor inputBuffer =
+      torch::zeros({kernelVolume, indicePairMaxSize, numInPlanes}, options);
+  torch::Tensor outputBuffer =
+      torch::zeros({kernelVolume, indicePairMaxSize, numOutPlanes}, options);
+  filters = filters.view({-1, numInPlanes, numOutPlanes});
+  std::cout << "create time " << timer.report()/1000.0 << std::endl;
+  if (subm) { // the center index of subm conv don't need gather and scatter
+              // add.
+    torch::mm_out(output, features, filters[indicePairMaxOffset]);
+  }
+  // Stage 1: gather the input rows of every offset into its own slice.
+  for (int i = 0; i < kernelVolume; ++i) {
+    auto nHot = pairCounts[i];
+    if (nHot <= 0 || (subm && i == indicePairMaxOffset)) {
+      continue;
+    }
+    auto inputBufferBlob = torch::from_blob(inputBuffer[i].data<T>(),
+                                            {nHot, numInPlanes}, options);
+    if (device == torch::kCPU) {
+      functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
+      gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBufferBlob),
+                 tv::torch2tv<const T>(features),
+                 tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
+    } else {
+      functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
+      gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBufferBlob),
+                 tv::torch2tv<const T>(features),
+                 tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
+      TV_CHECK_CUDA_ERR();
+    }
+  }
+  // Stage 2: one GEMM per offset.
+  for (int i = 0; i < kernelVolume; ++i) {
+    auto nHot = pairCounts[i];
+    if (nHot <= 0 || (subm && i == indicePairMaxOffset)) {
+      continue;
+    }
+    auto outputBufferBlob = torch::from_blob(outputBuffer[i].data<T>(),
+                                             {nHot, numOutPlanes}, options);
+    auto inputBufferBlob = torch::from_blob(inputBuffer[i].data<T>(),
+                                            {nHot, numInPlanes}, options);
+
+    torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
+  }
+  // Stage 3: scatter-add every offset's result into the output.
+  for (int i = 0; i < kernelVolume; ++i) {
+    auto nHot = pairCounts[i];
+    if (nHot <= 0 || (subm && i == indicePairMaxOffset)) {
+      continue;
+    }
+    auto outputBufferBlob = torch::from_blob(outputBuffer[i].data<T>(),
+                                             {nHot, numOutPlanes}, options);
+
+    if (device == torch::kCPU) {
+      functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
+      scatterFtor(tv::CPU(), tv::torch2tv<T>(output),
+                  tv::torch2tv<const T>(outputBufferBlob),
+                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
+                  true);
+    } else {
+      functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
+      scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
+                  tv::torch2tv<const T>(outputBufferBlob),
+                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
+                  true);
+      TV_CHECK_CUDA_ERR();
+    }
+  }
+  return output;
+}
+
+} // namespace spconv
+
+#endif
\ No newline at end of file
--- a/include/tensorview/helper_kernel.cu.h
+++ b/include/tensorview/helper_kernel.cu.h
+#pragma once
+// from tensorflow
+namespace tv
+{
+namespace detail
+{
+
+// Range object for device code: iterating it yields begin, begin + delta,
+// begin + 2 * delta, ... for as long as the value stays below end.
+template <typename T>
+class KernelLoop
+{
+  // Iterator over the index sequence. A delta of 0 marks an end iterator.
+  struct Iterator
+  {
+    __forceinline__ __device__ Iterator(T index, T delta) : index_(index), delta_(delta) {}
+    __forceinline__ __device__ T operator*() const { return index_; }
+    __forceinline__ __device__ Iterator &operator++()
+    {
+      index_ += delta_;
+      return *this;
+    }
+    __forceinline__ __device__ bool operator!=(const Iterator &other) const
+    {
+      // Anything past an end iterator (delta_ == 0) is equal.
+      // In range-based for loops, the first branch is the one taken.
+      if (!other.delta_)
+      {
+        return index_ < other.index_;
+      }
+      if (!delta_)
+      {
+        return index_ > other.index_;
+      }
+      return index_ < other.index_ || index_ > other.index_;
+    }
+
+  private:
+    T index_;
+    const T delta_;
+  };
+
+public:
+  __forceinline__ __device__ KernelLoop(T begin, T delta, T end)
+      : begin_(begin), delta_(delta), end_(end) {}
+
+  // First index, advancing by delta_ on each increment.
+  __forceinline__ __device__ Iterator begin() const { return Iterator{begin_, delta_}; }
+  // Sentinel: delta 0 identifies it as the end iterator.
+  __forceinline__ __device__ Iterator end() const { return Iterator{end_, 0}; }
+
+private:
+  T begin_;
+  T delta_;
+  T end_;
+};
+
+} // namespace detail
+// Helper to visit indices below count using the x-coordinate: starts at this
+// thread's global x index and advances by the x grid size times NumILP.
+// Usage: for(int i : KernelLoopX(count)) { visit(i); }
+template <typename T, int NumILP=1>
+__forceinline__ __device__ detail::KernelLoop<T> KernelLoopX(T count)
+{
+  const T first = blockIdx.x * blockDim.x + threadIdx.x;
+  const T step = gridDim.x * blockDim.x * NumILP;
+  return detail::KernelLoop<T>(first, step, count);
+}
+
+// Helper to visit indices below count using the y-coordinate: starts at this
+// thread's global y index and advances by the y grid size times NumILP.
+// Usage: for(int i : KernelLoopY(count)) { visit(i); }
+template <typename T, int NumILP=1>
+__forceinline__ __device__ detail::KernelLoop<T> KernelLoopY(T count)
+{
+  const T first = blockIdx.y * blockDim.y + threadIdx.y;
+  const T step = gridDim.y * blockDim.y * NumILP;
+  return detail::KernelLoop<T>(first, step, count);
+}
+
+// Helper to visit indices below count using the z-coordinate: starts at this
+// thread's global z index and advances by the z grid size times NumILP.
+// Usage: for(int i : KernelLoopZ(count)) { visit(i); }
+template <typename T, int NumILP=1>
+__forceinline__ __device__ detail::KernelLoop<T> KernelLoopZ(T count)
+{
+  const T first = blockIdx.z * blockDim.z + threadIdx.z;
+  const T step = gridDim.z * blockDim.z * NumILP;
+  return detail::KernelLoop<T>(first, step, count);
+}
+
+} // namespace tv
\ No newline at end of file
--- a/include/tensorview/helper_launch.h
+++ b/include/tensorview/helper_launch.h
+#pragma once
+// from pytorch.aten
+#include "tensorview.h"
+namespace tv
+{
+namespace launch
+{
+
+// Ceiling division for positive operands: (a + b - 1) / b, converted to int.
+template <typename T1, typename T2>
+inline int DivUp(const T1 a, const T2 b)
+{
+    const auto roundedUp = a + b - 1;
+    return roundedUp / b;
+}
+
+// Use 1024 threads per block, which requires cuda sm_2x or above
+constexpr int CUDA_NUM_THREADS = 1024;
+
+// CUDA: number of blocks needed to cover N work items with CUDA_NUM_THREADS
+// threads per block. Throws std::runtime_error when N is not positive.
+inline int getBlocks(const int N)
+{
+    const bool hasWork = N > 0;
+    TV_ASSERT_RT_ERR(hasWork, "CUDA kernel launch blocks must be positive, but got N=", N);
+    const int numBlocks = DivUp(N, CUDA_NUM_THREADS);
+    return numBlocks;
+}
+} // namespace launch
+} // namespace tv
\ No newline at end of file
--- a/include/tensorview/tensorview.h
+++ b/include/tensorview/tensorview.h
+// Copyright 2019 Yan Yan
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <algorithm>
+#include <cassert>
+#include <cstdlib>
+#include <cuda_runtime_api.h>
+#include <iostream>
+#include <memory>
+#include <prettyprint.h>
+#include <sstream>
+#include <type_traits>
+#include <vector>
+
+namespace tv {
+
+// Function-qualifier and assertion macros, chosen by the compiler in use:
+//   - __NVCC__:        host+device / device-only qualifiers.
+//   - __CUDACC_RTC__:  TV_HOST_DEVICE_INLINE is device-only in this branch.
+//   - otherwise:       plain host build; the qualifiers expand to `inline` or
+//                      nothing, and TV_DEVICE_INLINE is left undefined.
+// TV_ASSERT maps to assert() in every branch.
+#ifdef __NVCC__
+#define TV_HOST_DEVICE_INLINE __forceinline__ __device__ __host__
+#define TV_DEVICE_INLINE __forceinline__ __device__
+#define TV_HOST_DEVICE __device__ __host__
+#define TV_ASSERT(expr) assert(expr)
+#elif defined(__CUDACC_RTC__)
+#define TV_ASSERT(expr) assert(expr)
+#define TV_HOST_DEVICE_INLINE __forceinline__ __device__
+#define TV_DEVICE_INLINE __forceinline__ __device__
+#define TV_HOST_DEVICE __device__ __host__
+#else
+#define TV_ASSERT(x) assert(x)
+#define TV_HOST_DEVICE_INLINE inline
+#define TV_HOST_DEVICE
+#endif
+
+// Checks `expr`; on failure prints the printf-style message given in the
+// remaining arguments and then triggers assert(expr). `expr` is evaluated a
+// second time by the assert on the failure path. Expands to a brace block, so
+// take care when using it as the sole statement of an if/else.
+#define TV_REQUIRE(expr, ...)                                                  \
+  {                                                                            \
+    if (!(expr)) {                                                             \
+      printf(__VA_ARGS__);                                                     \
+      assert(expr);                                                            \
+    }                                                                          \
+  }
+
+// Device-side check: when `expr` is false, only the thread with
+// threadIdx.x == 0 prints the printf-style message. The assert(expr) is not
+// part of the `if`, so it is evaluated by every thread that runs the macro.
+#define TV_DEVICE_REQUIRE(expr, ...)                                           \
+  {                                                                            \
+    if (!(expr) && threadIdx.x == 0)                                           \
+      printf(__VA_ARGS__);                                                     \
+    assert(expr);                                                              \
+  }
+
+// Base case: streams the final value with no trailing separator.
+template <class SStream, class T>
+void sstream_print(SStream &ss, T val) {
+  ss << val;
+}
+
+// Recursive case: streams the first value followed by a single space, then
+// hands the remaining values to the next overload.
+template <class SStream, class T, class... TArgs>
+void sstream_print(SStream &ss, T val, TArgs... args) {
+  ss << val;
+  ss << " ";
+  sstream_print(ss, args...);
+}
+
+// TV_ASSERT_RT_ERR(expr, ...): throws std::runtime_error when `expr` is
+// false. The message holds __FILE__ and __LINE__, the stringified expression
+// and the extra arguments joined by spaces via tv::sstream_print. At least
+// one extra argument is needed. The local uses a non-reserved name (no
+// double underscore).
+#define TV_ASSERT_RT_ERR(expr, ...)                                            \
+  {                                                                            \
+    if (!(expr)) {                                                             \
+      std::stringstream tv_macro_ss_;                                          \
+      tv_macro_ss_ << __FILE__ << " " << __LINE__ << "\n";                     \
+      tv_macro_ss_ << #expr << " assert failed. ";                             \
+      tv::sstream_print(tv_macro_ss_, __VA_ARGS__);                            \
+      throw std::runtime_error(tv_macro_ss_.str());                            \
+    }                                                                          \
+  }
+
+// TV_ASSERT_INVALID_ARG(expr, ...): throws std::invalid_argument when `expr`
+// is false. The message holds __FILE__ and __LINE__, the stringified
+// expression and the extra arguments joined by spaces via tv::sstream_print.
+// At least one extra argument is needed. The local uses a non-reserved name
+// (no double underscore).
+#define TV_ASSERT_INVALID_ARG(expr, ...)                                       \
+  {                                                                            \
+    if (!(expr)) {                                                             \
+      std::stringstream tv_macro_ss_;                                          \
+      tv_macro_ss_ << __FILE__ << " " << __LINE__ << "\n";                     \
+      tv_macro_ss_ << #expr << " assert failed. ";                             \
+      tv::sstream_print(tv_macro_ss_, __VA_ARGS__);                            \
+      throw std::invalid_argument(tv_macro_ss_.str());                         \
+    }                                                                          \
+  }
+
+// TV_CHECK_CUDA_ERR(): reads cudaGetLastError() and throws
+// std::runtime_error when it is not cudaSuccess. The message holds __FILE__,
+// __LINE__, the numeric error code and the text from cudaGetErrorString.
+#define TV_CHECK_CUDA_ERR()                                                    \
+  {                                                                            \
+    const cudaError_t tv_cuda_err_ = cudaGetLastError();                       \
+    if (tv_cuda_err_ != cudaSuccess) {                                         \
+      std::stringstream tv_macro_ss_;                                          \
+      tv_macro_ss_ << __FILE__ << " " << __LINE__ << "\n";                     \
+      tv_macro_ss_ << "cuda execution failed with error " << tv_cuda_err_      \
+                   << " (" << cudaGetErrorString(tv_cuda_err_) << ")";         \
+      throw std::runtime_error(tv_macro_ss_.str());                            \
+    }                                                                          \
+  }
+
+// Tag type carrying a CUDA stream handle; the handle defaults to 0.
+struct GPU {
+  cudaStream_t mStream = 0;
+
+  GPU(cudaStream_t s = 0) { mStream = s; }
+
+  // Returns the stream handle stored at construction.
+  cudaStream_t stream() const { return mStream; }
+};
+// Empty tag type (presumably the host-side counterpart of GPU; confirm with
+// the call sites).
+struct CPU {};
+
+// Default capacity (MaxDim) of SimpleVector and ShapeBase, and the capacity
+// used by the Shape alias.
+#define TV_MAX_DIM 6
+/*
+template <typename T>
+constexpr size_t calc_align(size_t ndim)
+{
+  if (ndim * sizeof(T) == 1)
+    return 1;
+  else if (ndim * sizeof(T) == 2)
+    return 2;
+  else if (ndim * sizeof(T) <= 4 && ndim * sizeof(T) > 2)
+    return 4;
+  else if (ndim * sizeof(T) <= 8 && ndim * sizeof(T) > 4)
+    return 8;
+  else if (ndim * sizeof(T) <= 16 && ndim * sizeof(T) > 8)
+    return 16;
+  else if (ndim * sizeof(T) <= 32 && ndim * sizeof(T) > 16)
+    return 32;
+  else
+    return 64;
+}
+*/
+/// Fixed-capacity sequence that stores up to MaxDim elements of T inline (no
+/// dynamic allocation). Apart from the std::vector constructor, every member
+/// is marked TV_HOST_DEVICE_INLINE.
+///
+/// Capacity is only checked through TV_ASSERT in the constructors; element
+/// access, push_back and pop_back are only checked when TV_DEBUG is defined.
+template <typename T, size_t MaxDim = TV_MAX_DIM>
+struct /*alignas(calc_align<T>(MaxDim))*/ SimpleVector {
+public:
+  typedef size_t size_type;
+
+  /// Creates an empty vector; the element storage is default-initialized.
+  TV_HOST_DEVICE_INLINE SimpleVector() {}
+
+  /// Copies the elements of `q`; `q.size()` must not exceed MaxDim.
+  TV_HOST_DEVICE_INLINE SimpleVector(std::initializer_list<T> q) {
+    TV_ASSERT(q.size() <= MaxDim);
+    mSize = 0;
+    for (const T &s : q) {
+      mArray[mSize++] = s;
+    }
+  }
+
+  /// Host-only: copies the elements of `arr`; `arr.size()` must not exceed
+  /// MaxDim.
+  SimpleVector(const std::vector<T> &arr) {
+    TV_ASSERT(arr.size() <= MaxDim);
+    for (size_t i = 0; i < arr.size(); ++i) {
+      mArray[i] = arr[i];
+    }
+    mSize = arr.size();
+  }
+
+  /// Copies only the first `arr.size()` elements, not the whole storage.
+  TV_HOST_DEVICE_INLINE SimpleVector(const SimpleVector<T, MaxDim> &arr) {
+    TV_ASSERT(arr.size() <= MaxDim);
+    for (size_t i = 0; i < arr.size(); ++i) {
+      mArray[i] = arr[i];
+    }
+    mSize = arr.size();
+  }
+
+  /// Element access; range-checked only when TV_DEBUG is defined.
+  TV_HOST_DEVICE_INLINE T &operator[](int idx) {
+#ifdef TV_DEBUG
+    TV_ASSERT(idx >= 0 && static_cast<size_t>(idx) < mSize);
+#endif
+    return mArray[idx];
+  }
+  TV_HOST_DEVICE_INLINE const T &operator[](int idx) const {
+#ifdef TV_DEBUG
+    TV_ASSERT(idx >= 0 && static_cast<size_t>(idx) < mSize);
+#endif
+    return mArray[idx];
+  }
+
+  /// Appends `s`; the capacity check is active only when TV_DEBUG is defined.
+  TV_HOST_DEVICE_INLINE void push_back(T s) {
+#ifdef TV_DEBUG
+    TV_ASSERT(mSize < MaxDim);
+#endif
+    mArray[mSize] = s;
+    mSize++;
+  }
+
+  /// Drops the last element by shrinking the size; the slot is not reset.
+  TV_HOST_DEVICE_INLINE void pop_back() {
+#ifdef TV_DEBUG
+    TV_ASSERT(mSize > 0);
+#endif
+    mSize--;
+  }
+
+  TV_HOST_DEVICE_INLINE size_t size() const { return mSize; }
+  TV_HOST_DEVICE_INLINE const T *data() const { return mArray; }
+  /// True when no element is stored. Returns bool (was size_t).
+  TV_HOST_DEVICE_INLINE bool empty() const { return mSize == 0; }
+
+  /// Mutable forward iterator: a thin wrapper around a T*.
+  class iterator {
+  public:
+    typedef iterator self_type;
+    typedef T value_type;
+    typedef T &reference;
+    typedef T *pointer;
+    typedef std::forward_iterator_tag iterator_category;
+    typedef std::ptrdiff_t difference_type;
+    TV_HOST_DEVICE_INLINE iterator(pointer ptr) : ptr_(ptr) {}
+    /// Post-increment: advances and returns the previous position.
+    TV_HOST_DEVICE_INLINE self_type operator++(int) {
+      self_type previous = *this;
+      ++ptr_;
+      return previous;
+    }
+    /// Pre-increment: advances and returns *this by reference.
+    TV_HOST_DEVICE_INLINE self_type &operator++() {
+      ++ptr_;
+      return *this;
+    }
+    TV_HOST_DEVICE_INLINE reference operator*() const { return *ptr_; }
+    TV_HOST_DEVICE_INLINE pointer operator->() const { return ptr_; }
+    TV_HOST_DEVICE_INLINE bool operator==(const self_type &rhs) const {
+      return ptr_ == rhs.ptr_;
+    }
+    TV_HOST_DEVICE_INLINE bool operator!=(const self_type &rhs) const {
+      return ptr_ != rhs.ptr_;
+    }
+
+  private:
+    pointer ptr_;
+  };
+
+  /// Read-only forward iterator: a thin wrapper around a const T*.
+  class const_iterator {
+  public:
+    typedef const_iterator self_type;
+    typedef T value_type;
+    typedef const T &reference;
+    typedef const T *pointer;
+    typedef std::ptrdiff_t difference_type;
+    typedef std::forward_iterator_tag iterator_category;
+    TV_HOST_DEVICE_INLINE const_iterator(pointer ptr) : ptr_(ptr) {}
+    /// Post-increment: advances and returns the previous position.
+    TV_HOST_DEVICE_INLINE self_type operator++(int) {
+      self_type previous = *this;
+      ++ptr_;
+      return previous;
+    }
+    /// Pre-increment: advances and returns *this by reference.
+    TV_HOST_DEVICE_INLINE self_type &operator++() {
+      ++ptr_;
+      return *this;
+    }
+    TV_HOST_DEVICE_INLINE reference operator*() const { return *ptr_; }
+    TV_HOST_DEVICE_INLINE pointer operator->() const { return ptr_; }
+    TV_HOST_DEVICE_INLINE bool operator==(const self_type &rhs) const {
+      return ptr_ == rhs.ptr_;
+    }
+    TV_HOST_DEVICE_INLINE bool operator!=(const self_type &rhs) const {
+      return ptr_ != rhs.ptr_;
+    }
+
+  private:
+    pointer ptr_;
+  };
+
+  TV_HOST_DEVICE_INLINE iterator begin() { return iterator(mArray); }
+
+  TV_HOST_DEVICE_INLINE iterator end() { return iterator(mArray + mSize); }
+
+  TV_HOST_DEVICE_INLINE const_iterator begin() const {
+    return const_iterator(mArray);
+  }
+
+  TV_HOST_DEVICE_INLINE const_iterator end() const {
+    return const_iterator(mArray + mSize);
+  }
+  TV_HOST_DEVICE_INLINE const_iterator cbegin() const {
+    return const_iterator(mArray);
+  }
+
+  TV_HOST_DEVICE_INLINE const_iterator cend() const {
+    return const_iterator(mArray + mSize);
+  }
+
+protected:
+  T mArray[MaxDim];  // inline element storage; only [0, mSize) is meaningful
+  size_t mSize = 0;  // number of elements currently stored
+};
+
+/// Equality of two SimpleVectors: the sizes must match and no pair of
+/// corresponding elements may differ (elements are compared with `!=`).
+template <typename T, size_t MaxDim>
+bool operator==(const SimpleVector<T, MaxDim> &lfs,
+                const SimpleVector<T, MaxDim> &rfs) {
+  if (lfs.size() != rfs.size()) {
+    return false;
+  }
+  size_t pos = 0;
+  while (pos < lfs.size()) {
+    if (lfs[pos] != rfs[pos]) {
+      return false;
+    }
+    ++pos;
+  }
+  return true;
+}
+
+/// Inequality of two SimpleVectors, defined as the negation of operator==.
+template <typename T, size_t MaxDim>
+bool operator!=(const SimpleVector<T, MaxDim> &lfs,
+                const SimpleVector<T, MaxDim> &rfs) {
+  const bool same = (lfs == rfs);
+  return !same;
+}
+
+/// Holds up to three ints describing a slice. Slots that were not supplied
+/// keep the sentinel value -1.
+struct Slice {
+  /// All three slots are set to -1.
+  TV_HOST_DEVICE_INLINE Slice() {
+    for (int slot = 0; slot < 3; ++slot) {
+      mSlices[slot] = -1;
+    }
+  }
+
+  /// Takes at most three values (checked at compile time); each is converted
+  /// to int and stored in order, remaining slots stay -1.
+  template <class... Integers> TV_HOST_DEVICE_INLINE Slice(Integers... ints) {
+    static_assert(sizeof...(ints) <= 3, "slice init must smaller than 3");
+    SimpleVector<int, 3> given{int(ints)...};
+    for (int slot = 0; slot < 3; ++slot) {
+      mSlices[slot] = -1;
+    }
+    size_t pos = 0;
+    while (pos < given.size()) {
+      mSlices[pos] = given[pos];
+      ++pos;
+    }
+  }
+
+  /// Same as the variadic constructor, but the element count is only checked
+  /// through TV_ASSERT.
+  template <typename T>
+  TV_HOST_DEVICE_INLINE Slice(std::initializer_list<T> slice) {
+    for (int slot = 0; slot < 3; ++slot) {
+      mSlices[slot] = -1;
+    }
+    TV_ASSERT(slice.size() <= 3);
+    int slot = 0;
+    for (auto it = slice.begin(); it != slice.end(); ++it, ++slot) {
+      mSlices[slot] = int(*it);
+    }
+  }
+
+  /// Slot access; range-checked only when TV_DEBUG is defined.
+  TV_HOST_DEVICE_INLINE int &operator[](int idx) {
+#ifdef TV_DEBUG
+    TV_ASSERT(idx >= 0 && idx < 3);
+#endif
+    return mSlices[idx];
+  }
+  TV_HOST_DEVICE_INLINE const int &operator[](int idx) const {
+#ifdef TV_DEBUG
+    TV_ASSERT(idx >= 0 && idx < 3);
+#endif
+    return mSlices[idx];
+  }
+
+protected:
+  int mSlices[3];
+};
+
+/// Tensor shape: a SimpleVector<int, MaxDim> holding one extent per
+/// dimension.
+///
+/// Note that size() is redefined here as the product of all extents and hides
+/// SimpleVector::size(); ndim() gives the number of dimensions.
+template <size_t MaxDim = TV_MAX_DIM>
+struct ShapeBase : public SimpleVector<int, MaxDim> {
+  TV_HOST_DEVICE_INLINE ShapeBase() : SimpleVector<int, MaxDim>() {}
+  TV_HOST_DEVICE_INLINE ShapeBase(std::initializer_list<int> shape)
+      : SimpleVector<int, MaxDim>(shape) {}
+
+  /// Host-only: forwards `shape` to the SimpleVector constructors.
+  template <typename T, template <class...> class Container>
+  ShapeBase(Container<T> shape) : SimpleVector<int, MaxDim>(shape) {}
+  TV_HOST_DEVICE_INLINE ShapeBase(const ShapeBase<MaxDim> &shape)
+      : SimpleVector<int, MaxDim>(shape) {}
+  ShapeBase(const std::vector<int> &arr) : SimpleVector<int, MaxDim>(arr) {}
+
+  ShapeBase<MaxDim> &operator=(const ShapeBase<MaxDim> &shape) = default;
+
+  /// Returns the extents in [start, end). `end` is exclusive, so
+  /// `end == ndim()` is valid; the range is checked only under TV_DEBUG.
+  TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> subshape(int start, int end) const {
+#ifdef TV_DEBUG
+    TV_ASSERT(start >= 0 && end > start &&
+              static_cast<size_t>(end) <= this->mSize);
+#endif
+    ShapeBase<MaxDim> shape;
+    for (int i = start; i < end; ++i) {
+      shape.push_back(this->mArray[i]);
+    }
+    return shape;
+  }
+
+  /// Returns the extents from `start` to the last dimension.
+  TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> subshape(int start) const {
+#ifdef TV_DEBUG
+    TV_ASSERT(start >= 0 && static_cast<size_t>(start) <= this->mSize);
+#endif
+    ShapeBase<MaxDim> shape;
+    // The comparison is done in size_t, exactly as the implicit conversion
+    // did before; it is only spelled out here.
+    for (int i = start; static_cast<size_t>(i) < this->mSize; ++i) {
+      shape.push_back(this->mArray[i]);
+    }
+    return shape;
+  }
+
+  /// Product of all extents; 0 for a shape with no dimensions.
+  TV_HOST_DEVICE_INLINE size_t size() const {
+    if (this->mSize == 0)
+      return 0;
+    size_t s = 1;
+    for (int i = 0; i < int(this->mSize); ++i) {
+      s *= this->mArray[i];
+    }
+    return s;
+  }
+
+  /// Number of dimensions (the element count of the underlying vector).
+  TV_HOST_DEVICE_INLINE size_t ndim() const { return this->mSize; }
+
+  /// Returns a copy without the dimensions whose extent is 1.
+  TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> squeeze() const {
+    ShapeBase<MaxDim> shape;
+    for (int i = 0; static_cast<size_t>(i) < this->mSize; ++i) {
+      if (this->mArray[i] != 1)
+        shape.push_back(this->mArray[i]);
+    }
+    return shape;
+  }
+
+  /// Returns a copy without dimension `dim`, but only if its extent is 1;
+  /// otherwise the copy is unchanged.
+  TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> squeeze(int dim) const {
+    ShapeBase<MaxDim> shape;
+    for (int i = 0; static_cast<size_t>(i) < this->mSize; ++i) {
+      if (i != dim || this->mArray[i] != 1)
+        shape.push_back(this->mArray[i]);
+    }
+    return shape;
+  }
+};
+
+using Shape = ShapeBase<TV_MAX_DIM>;
+
+/// Row-major flat offset of the given indexes inside `shape` (std::vector
+/// shape, indexes passed as separate arguments). The last index varies
+/// fastest; arithmetic is done in `unsigned`.
+template <class... Inds>
+TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(std::vector<int> &shape,
+                                           Inds... indexes) {
+  int picked[sizeof...(indexes)] = {indexes...};
+#ifdef TV_DEBUG
+  TV_ASSERT(sizeof...(indexes) == shape.size());
+#endif
+  unsigned flat = 0;
+  unsigned stride = 1;
+#pragma unroll
+  for (int axis = sizeof...(indexes) - 1; axis >= 0; --axis) {
+    flat += stride * picked[axis];
+    stride *= shape[axis];
+  }
+  return flat;
+}
+
+/// Row-major flat offset of `indexes_vec` inside `shape` (both given as
+/// std::vector). Walks the dimensions from last to first.
+TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(std::vector<int> &shape,
+                                           std::vector<int> &indexes_vec) {
+  unsigned flat = 0;
+  unsigned stride = 1;
+  int axis = shape.size() - 1;
+  while (axis >= 0) {
+    flat += stride * indexes_vec[axis];
+    stride *= shape[axis];
+    --axis;
+  }
+  return flat;
+}
+
+/// Row-major flat offset of the given indexes inside a Shape (indexes passed
+/// as separate arguments). The last index varies fastest.
+template <class... Inds>
+TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(const Shape &shape,
+                                           Inds... indexes) {
+  int picked[sizeof...(indexes)] = {indexes...};
+  unsigned flat = 0;
+  unsigned stride = 1;
+#pragma unroll
+  for (int axis = sizeof...(indexes) - 1; axis >= 0; --axis) {
+    flat += stride * picked[axis];
+    stride *= shape[axis];
+  }
+  return flat;
+}
+
+/// Row-major flat offset of `indexes_vec` inside `shape`. The number of
+/// dimensions walked is taken from `indexes_vec`, from last to first.
+TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(const Shape &shape,
+                                           const Shape &indexes_vec) {
+  unsigned flat = 0;
+  unsigned stride = 1;
+  int axis = indexes_vec.ndim() - 1;
+  while (axis >= 0) {
+    flat += stride * indexes_vec[axis];
+    stride *= shape[axis];
+    --axis;
+  }
+  return flat;
+}
+
+/// Row-major flat offset for raw arrays: `indexes` and `shape` are each read
+/// at positions [0, NDim), from last to first.
+template <typename Index, unsigned NDim>
+TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(const Index *indexes,
+                                           const Index *shape) {
+  unsigned flat = 0;
+  unsigned stride = 1;
+#pragma unroll
+  for (int axis = NDim - 1; axis >= 0; --axis) {
+    flat += stride * indexes[axis];
+    stride *= shape[axis];
+  }
+  return flat;
+}
+
+/// Inverse of the row-major offset: splits `index` into NDim per-dimension
+/// coordinates written to `output` (last dimension first). Returns what is
+/// left of `index` after dividing by every extent.
+template <typename Index, unsigned NDim>
+TV_HOST_DEVICE_INLINE Index rowArrayIdxInv(Index index, Index *output,
+                                           const Index *shape) {
+#pragma unroll
+  for (int axis = NDim - 1; axis >= 0; --axis) {
+    output[axis] = index % shape[axis];
+    index = (index - output[axis]) / shape[axis];
+  }
+  return index;
+}
+
+/// Compile-time recursion computing the row-major offset of the first N
+/// indexes, following the pattern
+/// ((i1 * mShape[1] + i2) * mShape[2] + i3) * mShape[3] + i4.
+template <int N> struct ArrayIndexRowMajor {
+  TV_HOST_DEVICE_INLINE static unsigned run(const Shape &shape,
+                                            const Shape &indexes) {
+    const unsigned outer = ArrayIndexRowMajor<N - 1>::run(shape, indexes);
+    return indexes[N - 1] + shape[N - 1] * outer;
+  }
+};
+
+/// Recursion anchor: no indexes contribute an offset of 0.
+template <> struct ArrayIndexRowMajor<0> {
+  TV_HOST_DEVICE_INLINE static unsigned run(const Shape &, const Shape &) {
+    return 0;
+  }
+};
+
+namespace detail {
+// Maps a type to a short dtype label. Only the specializations below are
+// defined; the primary template is declared without a body.
+template <typename T> constexpr const char *simpleTypeName(T val = T());
+template <> constexpr const char *simpleTypeName<float>(float) {
+  return "float32";
+}
+template <> constexpr const char *simpleTypeName<double>(double) {
+  return "float64";
+}
+template <> constexpr const char *simpleTypeName<int>(int) { return "int32"; }
+template <> constexpr const char *simpleTypeName<unsigned>(unsigned) {
+  return "uint32";
+}
+template <> constexpr const char *simpleTypeName<long>(long) {
+  return "int64";
+}
+template <>
+constexpr const char *simpleTypeName<unsigned long>(unsigned long) {
+  return "uint64";
+}
+} // namespace detail
+
+template <typename T, int Rank = -1> struct TensorView {
+  // Default constructor: sets nothing explicitly.
+  TV_HOST_DEVICE_INLINE TensorView() {}
+  // Wraps `ptr` with the given shape; no data is copied.
+  explicit TV_HOST_DEVICE_INLINE TensorView(T *ptr, Shape shape)
+      : mPtr(ptr), mShape(shape) {}
+  // explicit TV_HOST_DEVICE_INLINE TensorView(const
+  // TensorView<std::remove_const_t<T>> &tview) : mPtr(tview.data()),
+  // mShape(tview.shape()) {}
+  // Wraps `ptr`; the extents are given as separate arguments and converted
+  // to int.
+  template <class... Integers>
+  explicit TV_HOST_DEVICE_INLINE TensorView(T *ptr, Integers... shapes)
+      : mPtr(ptr) {
+    this->mShape = {int(shapes)...};
+  }
+
+  // Copies every element of `tensor` into this view, in flat order. The
+  // shapes must be equal (checked with TV_REQUIRE). Returns *this.
+  TV_HOST_DEVICE_INLINE TensorView<T, Rank> &
+  assign(const TensorView<T, Rank> &tensor) {
+    TV_REQUIRE(tensor.shape() == shape(), "you must provide same input size%s",
+               "\n");
+    const T *src = tensor.data();
+    T *dst = mPtr;
+    for (size_t k = 0; k < size(); ++k) {
+      dst[k] = src[k];
+    }
+    return *this;
+  }
+
+  // Fills this view, in flat order, from `seq`, converting each value to T.
+  // `seq.size()` must equal size() (checked with TV_REQUIRE). Returns *this.
+  template <typename T1>
+  TV_HOST_DEVICE_INLINE TensorView<T, Rank> &
+  assign(std::initializer_list<T1> seq) {
+    TV_REQUIRE(seq.size() == size(), "you must provide same input size%s",
+               "\n");
+    T *dst = mPtr;
+    for (auto it = seq.begin(); it != seq.end(); ++it, ++dst) {
+      *dst = T(*it);
+    }
+    return *this;
+  }
+
+  // Generic element access: each index is converted to int and the flat
+  // offset comes from rowArrayIdx(mShape, ...). With TV_DEBUG the index count
+  // and every index range are checked. The counts are cast to int so they
+  // match the %d conversions in the message.
+  template <class... Inds> TV_HOST_DEVICE_INLINE T &operator()(Inds... inds) {
+#ifdef TV_DEBUG
+    int idxes[sizeof...(Inds)]{int(inds)...};
+    TV_REQUIRE(sizeof...(inds) == mShape.ndim(),
+               "you provide %d indexes, but dim is %d\n",
+               int(sizeof...(inds)), int(mShape.ndim()));
+    for (int i = 0; i < int(sizeof...(inds)); ++i) {
+      TV_REQUIRE(idxes[i] >= 0 && idxes[i] < mShape[i],
+                 "index-%d(%d) out-of-range: [0, %d)\n", i, idxes[i],
+                 mShape[i]);
+    }
+#endif
+    return mPtr[rowArrayIdx(mShape, int(inds)...)];
+  }
+  // Read-only generic element access: each index is converted to int and the
+  // flat offset comes from rowArrayIdx(mShape, ...). With TV_DEBUG the index
+  // count and every index range are checked. The counts are cast to int so
+  // they match the %d conversions in the message.
+  template <class... Inds>
+  TV_HOST_DEVICE_INLINE const T &operator()(Inds... inds) const {
+#ifdef TV_DEBUG
+    int idxes[sizeof...(Inds)]{int(inds)...};
+    TV_REQUIRE(sizeof...(inds) == mShape.ndim(),
+               "you provide %d indexes, but dim is %d\n",
+               int(sizeof...(inds)), int(mShape.ndim()));
+    for (int i = 0; i < int(sizeof...(inds)); ++i) {
+      TV_REQUIRE(idxes[i] >= 0 && idxes[i] < mShape[i],
+                 "index-%d(%d) out-of-range: [0, %d)\n", i, idxes[i],
+                 mShape[i]);
+    }
+#endif
+    return mPtr[rowArrayIdx(mShape, int(inds)...)];
+  }
+  // Scalar access: returns the element at offset 0. With TV_DEBUG the view
+  // must have a non-null pointer and zero dimensions.
+  TV_HOST_DEVICE_INLINE T &operator()() {
+#ifdef TV_DEBUG
+#if !defined(__CUDA_ARCH__)
+    TV_REQUIRE(mPtr != nullptr, "you want get value but the view is empty.%s",
+               "\n");
+    TV_REQUIRE(mShape.ndim() == 0, "you provide 0 indexes, but dim is %ld\n",
+               mShape.ndim());
+#else
+    TV_DEVICE_REQUIRE(mPtr != nullptr,
+                      "you want get value but the view is empty.%s", "\n");
+    TV_DEVICE_REQUIRE(mShape.ndim() == 0,
+                      "you provide 0 indexes, but dim is %ld\n", mShape.ndim());
+#endif
+#endif
+    return *mPtr;
+  }
+  // Read-only scalar access: returns the element at offset 0. With TV_DEBUG
+  // the view must have a non-null pointer and zero dimensions.
+  TV_HOST_DEVICE_INLINE const T &operator()() const {
+#ifdef TV_DEBUG
+#if !defined(__CUDA_ARCH__)
+    TV_REQUIRE(mPtr != nullptr, "you want get value but the view is empty.%s",
+               "\n");
+    TV_REQUIRE(mShape.ndim() == 0, "you provide 0 indexes, but dim is %ld\n",
+               mShape.ndim());
+#else
+    TV_DEVICE_REQUIRE(mPtr != nullptr,
+                      "you want get value but the view is empty.%s", "\n");
+    TV_DEVICE_REQUIRE(mShape.ndim() == 0,
+                      "you provide 0 indexes, but dim is %ld\n", mShape.ndim());
+#endif
+#endif
+    return *mPtr;
+  }
+
+  // 1-D element access: returns mPtr[i1]. With TV_DEBUG the view must be
+  // one-dimensional and i1 inside [0, mShape[0]). The index is cast to int
+  // for the %d conversion, as the other overloads do.
+  template <class T1> TV_HOST_DEVICE_INLINE T &operator()(T1 i1) {
+#if defined TV_DEBUG
+#if defined(__CUDA_ARCH__)
+    TV_DEVICE_REQUIRE(mShape.ndim() == 1,
+                      "you provide 1 indexes, but dim is %ld\n", mShape.ndim());
+    TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
+                      "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
+                      mShape[0]);
+#else
+    TV_REQUIRE(mShape.ndim() == 1, "you provide 1 indexes, but dim is %ld\n",
+               mShape.ndim());
+    TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
+               "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
+#endif
+#endif
+    return mPtr[i1];
+  }
+  // 2-D element access at row-major offset i1 * mShape[1] + i2. With
+  // TV_DEBUG the dimension count and both index ranges are checked.
+  template <class T1, class T2>
+  TV_HOST_DEVICE_INLINE T &operator()(T1 i1, T2 i2) {
+#ifdef TV_DEBUG
+#if !defined(__CUDA_ARCH__)
+    TV_REQUIRE(mShape.ndim() == 2, "you provide 2 indexes, but dim is %ld\n",
+               mShape.ndim());
+    TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
+               "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
+    TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
+               "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
+#else
+    TV_DEVICE_REQUIRE(mShape.ndim() == 2,
+                      "you provide 2 indexes, but dim is %ld\n", mShape.ndim());
+    TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
+                      "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
+                      mShape[0]);
+    TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
+                      "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
+                      mShape[1]);
+#endif
+#endif
+    const auto flat = i1 * mShape[1] + i2;
+    return mPtr[flat];
+  }
+  // 3-D element access at row-major offset
+  // (i1 * mShape[1] + i2) * mShape[2] + i3. With TV_DEBUG the dimension
+  // count and all index ranges are checked.
+  template <class T1, class T2, class T3>
+  TV_HOST_DEVICE_INLINE T &operator()(T1 i1, T2 i2, T3 i3) {
+#ifdef TV_DEBUG
+#if !defined(__CUDA_ARCH__)
+    TV_REQUIRE(mShape.ndim() == 3, "you provide 3 indexes, but dim is %ld\n",
+               mShape.ndim());
+    TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
+               "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
+    TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
+               "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
+    TV_REQUIRE(i3 >= 0 && i3 < mShape[2],
+               "index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), mShape[2]);
+#else
+    TV_DEVICE_REQUIRE(mShape.ndim() == 3,
+                      "you provide 3 indexes, but dim is %ld\n", mShape.ndim());
+    TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
+                      "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
+                      mShape[0]);
+    TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
+                      "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
+                      mShape[1]);
+    TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],
+                      "index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3),
+                      mShape[2]);
+#endif
+#endif
+    const auto flat = (i1 * mShape[1] + i2) * mShape[2] + i3;
+    return mPtr[flat];
+  }
+  // 4-D element access at row-major offset
+  // ((i1 * mShape[1] + i2) * mShape[2] + i3) * mShape[3] + i4. With TV_DEBUG
+  // the dimension count and all index ranges are checked.
+  template <class T1, class T2, class T3, class T4>
+  TV_HOST_DEVICE_INLINE T &operator()(T1 i1, T2 i2, T3 i3, T4 i4) {
+#ifdef TV_DEBUG
+#if !defined(__CUDA_ARCH__)
+    TV_REQUIRE(mShape.ndim() == 4, "you provide 4 indexes, but dim is %ld\n",
+               mShape.ndim());
+    TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
+               "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
+    TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
+               "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
+    TV_REQUIRE(i3 >= 0 && i3 < mShape[2],
+               "index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), mShape[2]);
+    TV_REQUIRE(i4 >= 0 && i4 < mShape[3],
+               "index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4), mShape[3]);
+#else
+    TV_DEVICE_REQUIRE(mShape.ndim() == 4,
+                      "you provide 4 indexes, but dim is %ld\n", mShape.ndim());
+    TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
+                      "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
+                      mShape[0]);
+    TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
+                      "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
+                      mShape[1]);
+    TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],
+                      "index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3),
+                      mShape[2]);
+    TV_DEVICE_REQUIRE(i4 >= 0 && i4 < mShape[3],
+                      "index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4),
+                      mShape[3]);
+#endif
+#endif
+    const auto flat =
+        ((i1 * mShape[1] + i2) * mShape[2] + i3) * mShape[3] + i4;
+    return mPtr[flat];
+  }
+
+  // Read-only 1-D element access: returns mPtr[i1]. With TV_DEBUG the view
+  // must be one-dimensional and i1 inside [0, mShape[0]).
+  template <class T1> TV_HOST_DEVICE_INLINE const T &operator()(T1 i1) const {
+#ifdef TV_DEBUG
+#if !defined(__CUDA_ARCH__)
+    TV_REQUIRE(mShape.ndim() == 1, "you provide 1 indexes, but dim is %ld\n",
+               mShape.ndim());
+    TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
+               "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
+#else
+    TV_DEVICE_REQUIRE(mShape.ndim() == 1,
+                      "you provide 1 indexes, but dim is %ld\n", mShape.ndim());
+    TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
+                      "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
+                      mShape[0]);
+#endif
+#endif
+    return *(mPtr + i1);
+  }
+  // Read-only 2-D element access at row-major offset i1 * mShape[1] + i2.
+  // With TV_DEBUG the dimension count and both index ranges are checked.
+  template <class T1, class T2>
+  TV_HOST_DEVICE_INLINE const T &operator()(T1 i1, T2 i2) const {
+#ifdef TV_DEBUG
+#if !defined(__CUDA_ARCH__)
+    TV_REQUIRE(mShape.ndim() == 2, "you provide 2 indexes, but dim is %ld\n",
+               mShape.ndim());
+    TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
+               "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
+    TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
+               "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
+#else
+    TV_DEVICE_REQUIRE(mShape.ndim() == 2,
+                      "you provide 2 indexes, but dim is %ld\n", mShape.ndim());
+    TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
+                      "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
+                      mShape[0]);
+    TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
+                      "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
+                      mShape[1]);
+#endif
+#endif
+    const auto flat = i1 * mShape[1] + i2;
+    return mPtr[flat];
+  }
+  // Read-only 3-D element access at row-major offset
+  // (i1 * mShape[1] + i2) * mShape[2] + i3. With TV_DEBUG the dimension
+  // count and all index ranges are checked.
+  template <class T1, class T2, class T3>
+  TV_HOST_DEVICE_INLINE const T &operator()(T1 i1, T2 i2, T3 i3) const {
+#ifdef TV_DEBUG
+#if !defined(__CUDA_ARCH__)
+    TV_REQUIRE(mShape.ndim() == 3, "you provide 3 indexes, but dim is %ld\n",
+               mShape.ndim());
+    TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
+               "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
+    TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
+               "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
+    TV_REQUIRE(i3 >= 0 && i3 < mShape[2],
+               "index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), mShape[2]);
+#else
+    TV_DEVICE_REQUIRE(mShape.ndim() == 3,
+                      "you provide 3 indexes, but dim is %ld\n", mShape.ndim());
+    TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
+                      "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
+                      mShape[0]);
+    TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
+                      "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
+                      mShape[1]);
+    TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],
+                      "index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3),
+                      mShape[2]);
+#endif
+#endif
+    const auto flat = (i1 * mShape[1] + i2) * mShape[2] + i3;
+    return mPtr[flat];
+  }
+  // Read-only 4-D element access at row-major offset
+  // ((i1 * mShape[1] + i2) * mShape[2] + i3) * mShape[3] + i4. With TV_DEBUG
+  // the dimension count and all index ranges are checked.
+  template <class T1, class T2, class T3, class T4>
+  TV_HOST_DEVICE_INLINE const T &operator()(T1 i1, T2 i2, T3 i3, T4 i4) const {
+#ifdef TV_DEBUG
+#if !defined(__CUDA_ARCH__)
+    TV_REQUIRE(mShape.ndim() == 4, "you provide 4 indexes, but dim is %ld\n",
+               mShape.ndim());
+    TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
+               "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
+    TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
+               "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
+    TV_REQUIRE(i3 >= 0 && i3 < mShape[2],
+               "index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), mShape[2]);
+    TV_REQUIRE(i4 >= 0 && i4 < mShape[3],
+               "index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4), mShape[3]);
+#else
+    TV_DEVICE_REQUIRE(mShape.ndim() == 4,
+                      "you provide 4 indexes, but dim is %ld\n", mShape.ndim());
+    TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
+                      "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
+                      mShape[0]);
+    TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
+                      "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
+                      mShape[1]);
+    TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],
+                      "index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3),
+                      mShape[2]);
+    TV_DEVICE_REQUIRE(i4 >= 0 && i4 < mShape[3],
+                      "index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4),
+                      mShape[3]);
+#endif
+#endif
+    const auto flat =
+        ((i1 * mShape[1] + i2) * mShape[2] + i3) * mShape[3] + i4;
+    return mPtr[flat];
+  }
+
+  // Flat element access: returns the element at offset `idx` from the data
+  // pointer, ignoring the shape. With TV_DEBUG, idx must be in [0, size()).
+  TV_HOST_DEVICE_INLINE T &operator[](int idx) {
+#ifdef TV_DEBUG
+#if !defined(__CUDA_ARCH__)
+    TV_REQUIRE(idx >= 0 && idx < size(), "index(%d) out-of-range: [0, %ld)\n",
+               int(idx), size());
+#else
+    TV_DEVICE_REQUIRE(idx >= 0 && idx < size(),
+                      "index(%d) out-of-range: [0, %ld)\n", int(idx), size());
+#endif
+#endif
+    return *(mPtr + idx);
+  }
+  // TODO: this conflicts with operator[](SimpleVector<Slice> slice_vec).
+  /*TV_HOST_DEVICE_INLINE T &operator[](const Shape index) {
+    int idx = rowArrayIdx(mShape, index);
+#ifdef TV_DEBUG
+    TV_REQUIRE(idx >= 0 && idx < size(), "index(%d) out-of-range: [0, %ld)\n",
+                int(idx), size());
+#endif
+    return mPtr[idx];
+  }
+  TV_HOST_DEVICE_INLINE const T &operator[](const Shape index) const {
+    int idx = rowArrayIdx(mShape, index);
+#ifdef TV_DEBUG
+    TV_REQUIRE(idx >= 0 && idx < size(), "index(%d) out-of-range: [0, %ld)\n",
+                int(idx), size());
+#endif
+    return mPtr[idx];
+  }*/
+  // Slice-based indexing: forwards the slice list to _subview.
+  TV_HOST_DEVICE_INLINE TensorView<T, Rank>
+  operator[](SimpleVector<Slice> slice_vec) {
+    return this->_subview(slice_vec);
+  }
+  // NOTE(review): the visible declaration of _subview carries no const
+  // qualifier; confirm that this const overload can actually be instantiated.
+  TV_HOST_DEVICE_INLINE const TensorView<T, Rank>
+  operator[](SimpleVector<Slice> slice_vec) const {
+    return this->_subview(slice_vec);
+  }
+  // True when the data pointer is null.
+  TV_HOST_DEVICE_INLINE bool empty() const { return nullptr == mPtr; }
+  // Pointer to the first element (mutable and read-only versions).
+  TV_HOST_DEVICE_INLINE T *data() { return this->mPtr; }
+  TV_HOST_DEVICE_INLINE const T *data() const { return this->mPtr; }
+  // The shape describing this view.
+  TV_HOST_DEVICE_INLINE const Shape &shape() const { return this->mShape; }
+  // Extent of dimension `idx`.
+  TV_HOST_DEVICE_INLINE int dim(int idx) const { return this->mShape[idx]; }
+  // Number of dimensions, converted to int.
+  TV_HOST_DEVICE_INLINE int ndim() const { return this->mShape.ndim(); }
+  // Replaces the shape in place with the given extents (each converted to
+  // int). The element count must stay the same (TV_ASSERT). Returns *this.
+  template <class... Inds>
+  TV_HOST_DEVICE_INLINE TensorView<T, Rank> &reshape(Inds... newShapes) {
+    Shape requested{int(newShapes)...};
+    TV_ASSERT(requested.size() == this->size());
+    this->mShape = requested;
+    return *this;
+  }
+  // Replaces the shape in place with `shapes`. The element count must stay
+  // the same (TV_ASSERT). Returns *this.
+  TV_HOST_DEVICE_INLINE TensorView<T, Rank> &reshape(Shape shapes) {
+    TV_ASSERT(shapes.size() == this->size());
+    this->mShape = shapes;
+    return *this;
+  }
+  // Returns a new view over the same data with the given extents. The first
+  // extent equal to -1 (if any) is replaced by size() divided by the product
+  // of the other extents. The resulting element count must equal size()
+  // (TV_ASSERT).
+  template <class... Inds>
+  TV_HOST_DEVICE_INLINE TensorView<T, Rank> view(Inds... newShapes) const {
+    Shape target{int(newShapes)...};
+    const size_t rank = target.ndim();
+    size_t axis = 0;
+    while (axis < rank && target[axis] != -1) {
+      ++axis;
+    }
+    if (axis < rank) {
+      // Temporarily use 1 so target.size() is the product of the others.
+      target[axis] = 1;
+      target[axis] = size() / target.size();
+    }
+    TV_ASSERT(target.size() == size());
+    return TensorView<T, Rank>(mPtr, target);
+  }
+  // Returns a new view over the same data with shape `shapes`; its element
+  // count must equal size() (TV_ASSERT).
+  TV_HOST_DEVICE_INLINE TensorView<T, Rank> view(Shape shapes) const {
+    TV_ASSERT(shapes.size() == this->size());
+    TensorView<T, Rank> reshaped(mPtr, shapes);
+    return reshaped;
+  }
+  // New view over the same data with all extent-1 dimensions removed.
+  TV_HOST_DEVICE_INLINE TensorView<T, Rank> squeeze() const {
+    const Shape squeezed = mShape.squeeze();
+    return TensorView<T, Rank>(mPtr, squeezed);
+  }
+  // New view over the same data with dimension `dim` removed if its extent
+  // is 1.
+  TV_HOST_DEVICE_INLINE TensorView<T, Rank> squeeze(int dim) const {
+    const Shape squeezed = mShape.squeeze(dim);
+    return TensorView<T, Rank>(mPtr, squeezed);
+  }
+  // Element count, as reported by the shape.
+  TV_HOST_DEVICE_INLINE size_t size() const { return this->mShape.size(); }
+
+  // Overload taking a Slice first: forwards every argument to the generic
+  // subview template, passing float as its first template argument.
+  template <class... Slices>
+  TV_HOST_DEVICE_INLINE TensorView<T, Rank> subview(Slice slice,
+                                                    Slices... slices) const {
+    return this->template subview<float, Slice, Slices...>(slice, slices...);
+  }
+  // Generic sub-view over the leading dimensions. Each argument is turned
+  // into a Slice by to_slice (defined outside this excerpt). Slot [0] of a
+  // slice is used as the start along its dimension; slot [1], when not -1,
+  // is used as the exclusive end. A slice whose slot [1] is -1 selects one
+  // position and that dimension is dropped from the result. Dimensions
+  // beyond the supplied slices are kept whole. T2 is not used in this body.
+  template <class T2 = float, class... Slices>
+  TV_HOST_DEVICE_INLINE TensorView<T, Rank> subview(Slices... slices) const {
+    Slice slice_vec[sizeof...(Slices)] = {to_slice(slices)...};
+    // Both shapes start out holding slot [0] of every slice; new_shape is
+    // overwritten with extents below, start keeps the start positions.
+    Shape new_shape{to_slice(slices)[0]...};
+    Shape start{to_slice(slices)[0]...};
+    TV_ASSERT(new_shape.ndim() <= mShape.ndim());
+    TV_ASSERT(new_shape.ndim() != 0);
+    size_t idxsize = new_shape.ndim();
+    // Pad both to the full rank; trailing dimensions start at 0.
+    for (size_t i = idxsize; i < mShape.ndim(); ++i) {
+      new_shape.push_back(0);
+      start.push_back(0);
+    }
+    // Extent of each sliced dimension: end - start, or 1 for a single index.
+#pragma unroll
+    for (size_t i = 0; i < sizeof...(Slices); ++i) {
+      if (slice_vec[i][1] != -1) {
+        new_shape[i] = slice_vec[i][1] - slice_vec[i][0];
+        TV_ASSERT(new_shape[i] >= 0);
+      } else {
+        new_shape[i] = 1; // reduce dim
+      }
+    }
+    // Flat offset of the first selected element in the original layout.
+    auto offset = rowArrayIdx(mShape, start);
+    // Unsliced trailing dimensions keep their original extent.
+#pragma unroll
+    for (size_t i = sizeof...(Slices); i < mShape.ndim(); ++i) {
+      new_shape[i] = mShape[i];
+      TV_ASSERT(new_shape[i] >= 0);
+    }
+    // Result shape: ranged dimensions first, single-index ones omitted ...
+    Shape reduced_shape;
+#pragma unroll
+    for (size_t i = 0; i < sizeof...(Slices); ++i) {
+      if (slice_vec[i][1] != -1) {
+        reduced_shape.push_back(new_shape[i]);
+      }
+    }
+    // ... followed by the untouched trailing dimensions.
+#pragma unroll
+    for (size_t i = sizeof...(Slices); i < mShape.ndim(); ++i) {
+      reduced_shape.push_back(new_shape[i]);
+    }
+    return TensorView<T, Rank>(mPtr + offset, reduced_shape);
+  }
+
+  // Fixes the leading dimensions at the given integer indexes and returns a
+  // view of the remaining trailing dimensions, starting at that element.
+  template <class... Integers>
+  TV_HOST_DEVICE_INLINE TensorView<T, Rank> subview(int id, Integers... ints) {
+    Shape start = {id, ints...};
+    // Remaining dimensions start at position 0.
+    int axis = 1 + sizeof...(ints);
+    while (axis < ndim()) {
+      start.push_back(0);
+      ++axis;
+    }
+    T *first = mPtr + rowArrayIdx(mShape, start);
+    return TensorView<T, Rank>(first, mShape.subshape(sizeof...(ints) + 1));
+  }
+
+  // Builds a textual dump of the view. Returns "" when empty() is true. A
+  // 0-dim view yields the single element followed by the dtype suffix.
+  // Otherwise the elements are written in row-major order inside nested
+  // brackets, with ", " between neighbours and a newline after each group of
+  // closing brackets; the "Tensor: dtype=..." suffix is appended directly
+  // after the final "]".
+  std::string repr() const {
+    std::ostringstream ss;
+    if (empty())
+      return "";
+    if (mShape.ndim() == 0) {
+      ss << *mPtr;
+      // ss << fmt::format("\nTensor: shape={}, dtype={}", mShape,
+      // detail::simpleTypeName<T>());
+      ss << "Tensor: dtype=" << detail::simpleTypeName<T>();
+      return ss.str();
+    }
+    // counter holds the multi-dimensional index of the element being printed.
+    Shape counter = mShape;
+    auto tensor_flat = this->view(-1);
+    // Reset the index to all zeros and open one bracket per dimension.
+    for (int i = 0; i < counter.ndim(); ++i) {
+      counter[i] = 0;
+      ss << "[";
+    }
+    for (size_t i = 0; i < this->size(); ++i) {
+      ss << tensor_flat(rowArrayIdx(mShape, counter));
+      counter[counter.ndim() - 1] += 1;
+      // inc_count = number of dimensions that wrapped around on this step.
+      int inc_count = 0;
+      bool print_comma = true;
+      // Propagate the carry from the innermost dimension outwards; dimension
+      // 0 is never wrapped (c > 0).
+      for (int c = counter.ndim() - 1; c >= 0; --c) {
+        if (counter[c] == this->dim(c) && c > 0) {
+          ++inc_count;
+          counter[c - 1] += 1;
+          counter[c] = 0;
+          print_comma = false;
+        }
+      }
+      if (print_comma && i != this->size() - 1)
+        ss << ", ";
+      // Close one bracket for every dimension that just wrapped.
+      for (int j = 0; j < inc_count; ++j) {
+        ss << "]";
+      }
+      // Unless this was the last element, reopen the same number of brackets.
+      if (i != this->size() - 1) {
+        if (inc_count != 0)
+          ss << "\n";
+        for (int j = 0; j < inc_count; ++j) {
+          ss << "[";
+        }
+      }
+    }
+    // Closes the outermost bracket (dimension 0 never wraps in the loop).
+    ss << "]";
+    // ss << fmt::format("\nTensor: shape={}, dtype={}", mShape,
+    // detail::simpleTypeName<T>());
+    ss << "Tensor: dtype=" << detail::simpleTypeName<T>();
+    return ss.str();
+  }
+
+protected:
+  // TODO: make this function public.
+  // currently this function is called unexpectedly when using subview({0, 0}).
+  // Sub-view driven by an explicit vector of slices. The logic mirrors the
+  // variadic subview(Slices...): a slice whose element [1] is -1 picks one
+  // index and drops that dimension, any other slice keeps extent [1] - [0],
+  // and dimensions past slice_vec.size() are kept whole.
+  TV_HOST_DEVICE_INLINE TensorView<T, Rank>
+  _subview(SimpleVector<Slice> slice_vec) {
+    // Collect the start index of every slice.
+    Shape new_shape;
+    for (int i = 0; i < slice_vec.size(); ++i) {
+      new_shape.push_back(slice_vec[i][0]);
+    }
+    Shape start = new_shape;
+    TV_ASSERT(new_shape.ndim() <= mShape.ndim());
+    TV_ASSERT(new_shape.ndim() != 0);
+    size_t idxsize = new_shape.ndim();
+    // Pad with zeros so that both shapes have as many entries as mShape.
+    for (size_t i = idxsize; i < mShape.ndim(); ++i) {
+      new_shape.push_back(0);
+      start.push_back(0);
+    }
+    for (size_t i = 0; i < slice_vec.size(); ++i) {
+      if (slice_vec[i][1] != -1) {
+        new_shape[i] = slice_vec[i][1] - slice_vec[i][0];
+        TV_ASSERT(new_shape[i] >= 0);
+      } else {
+        new_shape[i] = 1; // reduce dim
+      }
+    }
+    // Element offset of the first selected element inside the parent view.
+    auto offset = rowArrayIdx(mShape, start);
+    for (size_t i = slice_vec.size(); i < mShape.ndim(); ++i) {
+      new_shape[i] = mShape[i];
+      TV_ASSERT(new_shape[i] >= 0);
+    }
+    // Final shape: leave out the dimensions selected by a single index.
+    Shape reduced_shape;
+    for (size_t i = 0; i < slice_vec.size(); ++i) {
+      if (slice_vec[i][1] != -1) {
+        reduced_shape.push_back(new_shape[i]);
+      }
+    }
+    for (size_t i = slice_vec.size(); i < mShape.ndim(); ++i) {
+      reduced_shape.push_back(new_shape[i]);
+    }
+    return TensorView<T, Rank>(mPtr + offset, reduced_shape);
+  }
+  // Wraps a single index into a Slice whose second and third entries are -1;
+  // subview treats such a slice as "select one index and drop the dimension".
+  template <typename T1> TV_HOST_DEVICE_INLINE Slice to_slice(T1 s) const {
+    const int index = int(s);
+    return Slice{index, -1, -1};
+  }
+
+  // Slice arguments pass through as a copy.
+  TV_HOST_DEVICE_INLINE Slice to_slice(Slice s) const {
+    Slice copy(s);
+    return copy;
+  }
+
+  T *mPtr = nullptr;
+  Shape mShape;
+};
+
+// Streams the repr() text of a TensorView into any stream-like object.
+template <typename Os, typename T, int Rank>
+Os &operator<<(Os &os, const TensorView<T, Rank> &dt) {
+  auto text = dt.repr();
+  os << text;
+  return os;
+}
+
+// Same as above for views over const elements.
+template <typename Os, typename T, int Rank>
+Os &operator<<(Os &os, const TensorView<const T, Rank> &dt) {
+  auto text = dt.repr();
+  os << text;
+  return os;
+}
+
+namespace detail {
+// Maps an element type to the printf conversion used by printTensorView.
+// Only the explicit specializations below provide a definition.
+template <typename T> constexpr const char *printfTypeFormat(T val = T());
+
+// Floating point values are printed with two decimals.
+template <> constexpr const char *printfTypeFormat(float) { return "%.2f"; }
+template <> constexpr const char *printfTypeFormat(double) { return "%.2f"; }
+
+// Integer types.
+template <> constexpr const char *printfTypeFormat(int) { return "%d"; }
+template <> constexpr const char *printfTypeFormat(unsigned) { return "%u"; }
+template <> constexpr const char *printfTypeFormat(long) { return "%ld"; }
+template <> constexpr const char *printfTypeFormat(unsigned long) {
+  return "%lu";
+}
+} // namespace detail
+
+// printf-based dump of a TensorView. `format` is the printf conversion
+// applied to each element. Nothing is printed for an empty view; a 0-dim
+// view prints its single element followed by a newline. Otherwise elements
+// are printed in row-major order inside nested brackets, with ", " between
+// neighbours and a newline after each group of closing brackets.
+template <typename T>
+TV_HOST_DEVICE void printTensorView(const TensorView<T> tensor,
+                                    const char *format) {
+  if (tensor.empty())
+    return;
+  if (tensor.ndim() == 0) {
+    printf(format, tensor());
+    printf("\n");
+    return;
+  }
+  // counter holds the multi-dimensional index of the element being printed.
+  Shape counter = tensor.shape();
+  auto tensor_flat = tensor.view(-1);
+  // Reset the index to all zeros and open one bracket per dimension.
+  for (int i = 0; i < counter.ndim(); ++i) {
+    counter[i] = 0;
+    printf("[");
+  }
+  for (size_t i = 0; i < tensor.size(); ++i) {
+    printf(format, tensor_flat(rowArrayIdx(tensor.shape(), counter)));
+    counter[counter.ndim() - 1] += 1;
+    // inc_count = number of dimensions that wrapped around on this step.
+    int inc_count = 0;
+    bool print_comma = true;
+    // Propagate the carry outwards; dimension 0 is never wrapped (c > 0).
+    for (int c = counter.ndim() - 1; c >= 0; --c) {
+      if (counter[c] == tensor.dim(c) && c > 0) {
+        ++inc_count;
+        counter[c - 1] += 1;
+        counter[c] = 0;
+        print_comma = false;
+      }
+    }
+    if (print_comma && i != tensor.size() - 1)
+      printf(", ");
+    // Close one bracket for every dimension that just wrapped.
+    for (int j = 0; j < inc_count; ++j) {
+      printf("]");
+    }
+    // Unless this was the last element, reopen the same number of brackets.
+    if (i != tensor.size() - 1) {
+      if (inc_count != 0)
+        printf("\n");
+      for (int j = 0; j < inc_count; ++j) {
+        printf("[");
+      }
+    }
+  }
+  // Closes the outermost bracket.
+  printf("]\n");
+}
+
+// Prints a view using the default printf format of its (non-const) element
+// type, as provided by detail::printfTypeFormat.
+template <typename T>
+TV_HOST_DEVICE void printTensorView(TensorView<T> tensor) {
+  const char *format =
+      detail::printfTypeFormat<typename std::remove_const<T>::type>();
+  printTensorView(tensor, format);
+}
+// Prints raw memory interpreted with the given shape, using the default
+// printf format of the element type.
+template <typename T>
+TV_HOST_DEVICE void printTensorView(const T *ptr, Shape shape) {
+  const char *format =
+      detail::printfTypeFormat<typename std::remove_const<T>::type>();
+  TensorView<const T> view(ptr, shape);
+  printTensorView(view, format);
+}
+// Prints raw memory interpreted with the given shape and an explicit printf
+// format for the elements.
+template <typename T>
+TV_HOST_DEVICE void printTensorView(const T *ptr, Shape shape,
+                                    const char *format) {
+  TensorView<const T> view(ptr, shape);
+  printTensorView(view, format);
+}
+
+} // namespace tv
\ No newline at end of file
--- a/include/torch_utils.h
+++ b/include/torch_utils.h
+// Copyright 2019 Yan Yan
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// 
+//     http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <tensorview/tensorview.h>
+#include <torch/script.h>
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+
+namespace tv {
+
+// tv::GPU whose stream is taken from at::cuda::getCurrentCUDAStream() at
+// construction time.
+struct TorchGPU : public tv::GPU {
+  TorchGPU() : tv::GPU() {
+    this->mStream = at::cuda::getCurrentCUDAStream();
+  }
+};
+
+// Asserts (via TV_ASSERT_RT_ERR) that the tensor's scalar type corresponds
+// to T with const removed. Double, Float, Int and Half are recognised; any
+// other scalar type fails the assertion.
+template <typename T> void check_torch_dtype(const torch::Tensor &tensor) {
+  using Raw = std::remove_const_t<T>;
+  const auto dtype = tensor.type().scalarType();
+  if (dtype == at::ScalarType::Double) {
+    auto val = std::is_same<Raw, double>::value;
+    TV_ASSERT_RT_ERR(val, "error");
+  } else if (dtype == at::ScalarType::Float) {
+    auto val = std::is_same<Raw, float>::value;
+    TV_ASSERT_RT_ERR(val, "error");
+  } else if (dtype == at::ScalarType::Int) {
+    auto val = std::is_same<Raw, int>::value;
+    TV_ASSERT_RT_ERR(val, "error");
+  } else if (dtype == at::ScalarType::Half) {
+    auto val = std::is_same<Raw, at::Half>::value;
+    TV_ASSERT_RT_ERR(val, "error");
+  } else {
+    TV_ASSERT_RT_ERR(false, "error");
+  }
+}
+
+// Wraps a torch tensor in a tv::TensorView<T> with the same sizes, after
+// checking that the tensor's scalar type matches T. No data is copied; the
+// view points at the tensor's own storage.
+template <typename T>
+tv::TensorView<T> torch2tv(const torch::Tensor &tensor) {
+  check_torch_dtype<T>(tensor);
+  tv::Shape shape;
+  const auto sizes = tensor.sizes();
+  for (size_t d = 0; d < sizes.size(); ++d) {
+    shape.push_back(sizes[d]);
+  }
+  auto *data = tensor.data<std::remove_const_t<T>>();
+  return tv::TensorView<T>(data, shape);
+}
+} // namespace tv
\ No newline at end of file
--- a/include/utility/timer.h
+++ b/include/utility/timer.h
+// Copyright 2019 Yan Yan
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <chrono>
+#include <cuda_runtime_api.h>
+#include <iostream>
+
+namespace spconv {
+
+// Interval timer that calls cudaDeviceSynchronize() before every clock
+// read. report() returns the time since construction or the previous
+// report(), expressed in TimeT units, and restarts the interval.
+template <typename TimeT = std::chrono::microseconds> struct CudaContextTimer {
+  CudaContextTimer() {
+    cudaDeviceSynchronize();
+    mCurTime = std::chrono::steady_clock::now();
+  }
+
+  typename TimeT::rep report() {
+    cudaDeviceSynchronize();
+    const auto elapsed = std::chrono::steady_clock::now() - mCurTime;
+    const auto ticks = std::chrono::duration_cast<TimeT>(elapsed).count();
+    mCurTime = std::chrono::steady_clock::now();
+    return ticks;
+  }
+
+private:
+  std::chrono::steady_clock::time_point mCurTime;
+};
+
+// Host-only interval timer. report() returns the time since construction or
+// the previous report(), expressed in TimeT units, and restarts the interval.
+template <typename TimeT = std::chrono::microseconds> struct CPUTimer {
+  CPUTimer() : mCurTime(std::chrono::steady_clock::now()) {}
+
+  typename TimeT::rep report() {
+    const auto elapsed = std::chrono::steady_clock::now() - mCurTime;
+    const auto ticks = std::chrono::duration_cast<TimeT>(elapsed).count();
+    mCurTime = std::chrono::steady_clock::now();
+    return ticks;
+  }
+
+private:
+  std::chrono::steady_clock::time_point mCurTime;
+};
+
+} // namespace spconv
--- a/setup.py
+++ b/setup.py
+import os
+import re
+import sys
+import platform
+import subprocess
+
+from setuptools import setup, Extension, find_packages
+from setuptools.command.build_ext import build_ext
+from distutils.version import LooseVersion
+
+if 'LIBTORCH_ROOT' not in os.environ:
+    raise ValueError("You must set LIBTORCH_ROOT to your torch c++ library.")
+
+PYTHON_VERSION = "{}.{}".format(sys.version_info.major, sys.version_info.minor)
+
+
+class CMakeExtension(Extension):
+    def __init__(self, name, sourcedir='', library_dirs=[]):
+        Extension.__init__(self, name, sources=[], library_dirs=library_dirs)
+        self.sourcedir = os.path.abspath(sourcedir)
+
+
+class CMakeBuild(build_ext):
+    def run(self):
+        try:
+            out = subprocess.check_output(['cmake', '--version'])
+        except OSError:
+            raise RuntimeError("CMake must be installed to build the following extensions: " +
+                               ", ".join(e.name for e in self.extensions))
+
+        if platform.system() == "Windows":
+            raise NotImplementedError
+
+        for ext in self.extensions:
+            self.build_extension(ext)
+
+    def build_extension(self, ext):
+        extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name)))
+        print(extdir)
+        cmake_args = ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + extdir + "/spconv",
+                      '-DCMAKE_PREFIX_PATH=' + os.environ["LIBTORCH_ROOT"],
+                      '-DPYBIND11_PYTHON_VERSION={}'.format(PYTHON_VERSION),
+                      '-DSPCONV_BuildTests=OFF',
+                      '-DCMAKE_CUDA_FLAGS="--expt-relaxed-constexpr"']
+
+        cfg = 'Debug' if self.debug else 'Release'
+        # cfg = 'Debug'
+        build_args = ['--config', cfg]
+        print(cfg)
+        if platform.system() == "Windows":
+            cmake_args += ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{}={}'.format(cfg.upper(), extdir)]
+            if sys.maxsize > 2**32:
+                cmake_args += ['-A', 'x64']
+            build_args += ['--', '/m']
+        else:
+            cmake_args += ['-DCMAKE_BUILD_TYPE=' + cfg]
+            build_args += ['--', '-j4']
+
+        env = os.environ.copy()
+        env['CXXFLAGS'] = '{} -DVERSION_INFO=\\"{}\\"'.format(env.get('CXXFLAGS', ''),
+                                                              self.distribution.get_version())
+        if not os.path.exists(self.build_temp):
+            os.makedirs(self.build_temp)
+        
+        subprocess.check_call(['cmake', ext.sourcedir] + cmake_args, cwd=self.build_temp, env=env)
+        subprocess.check_call(['cmake', '--build', '.'] + build_args, cwd=self.build_temp)
+
+
+packages = find_packages(exclude=('tools', 'tools.*'))
+setup(
+    name='spconv',
+    version='1.0',
+    author='Yan Yan',
+    author_email='scrin@foxmail.com',
+    description='spatial sparse convolution for pytorch',
+    long_description='',
+    setup_requires = ['torch>=1.0.0'],
+    packages=packages,
+    package_dir = {'spconv': 'spconv'},
+    ext_modules=[CMakeExtension('spconv', library_dirs=[])],
+    cmdclass=dict(build_ext=CMakeBuild),
+    zip_safe=False,
+)
+
--- a/spconv/__init__.py
+++ b/spconv/__init__.py
+# Copyright 2019 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pathlib import Path
+
+import numpy as np
+import torch
+from spconv import utils
+from spconv.conv import SparseConv2d, SparseConv3d, SubMConv2d, SubMConv3d
+from spconv.conv import SparseConvTranspose2d, SparseConvTranspose3d
+from spconv.conv import SparseInverseConv2d, SparseInverseConv3d
+from spconv.modules import SparseModule, SparseSequential
+from spconv.pool import SparseMaxPool2d, SparseMaxPool3d
+
+_LIB_PATH = str(Path(__file__).parent / "libspconv.so")
+torch.ops.load_library(_LIB_PATH)
+
+def scatter_nd(indices, updates, shape):
+    """pytorch edition of tensorflow scatter_nd.
+    this function don't contain except handle code. so use this carefully
+    when indice repeats, don't support repeat add which is supported
+    in tensorflow.
+    """
+    ret = torch.zeros(*shape, dtype=updates.dtype, device=updates.device)
+    ndim = indices.shape[-1]
+    output_shape = list(indices.shape[:-1]) + shape[indices.shape[-1]:]
+    flatted_indices = indices.view(-1, ndim)
+    slices = [flatted_indices[:, i] for i in range(ndim)]
+    slices += [Ellipsis]
+    ret[slices] = updates.view(*output_shape)
+    return ret
+
+class SparseConvTensor(object):
+    def __init__(self, features, indices, spatial_shape, batch_size, grid=None):
+        """
+        Args:
+            grid: pre-allocated grid tensor. should be used when the volume of spatial shape
+                is very large.
+        """
+        self.features = features
+        self.indices = indices 
+        if self.indices.dtype != torch.int32:
+            self.indices.int()
+        self.spatial_shape = spatial_shape
+        self.batch_size = batch_size
+        self.indice_dict = {}
+        self.grid = grid
+
+    @property
+    def spatial_size(self):
+        return np.prod(self.spatial_shape)
+
+    def find_indice_pair(self, key):
+        if key is None:
+            return None 
+        if key in self.indice_dict:
+            return self.indice_dict[key]
+        return None
+
+    def dense(self, channels_first=True):
+        output_shape = [self.batch_size] + list(self.spatial_shape) + [self.features.shape[1]]
+        res = scatter_nd(self.indices.long(), self.features, output_shape)
+        if not channels_first:
+            return res
+        ndim = len(self.spatial_shape)
+        trans_params = list(range(0, ndim + 1))
+        trans_params.insert(1, ndim + 1)
+        return res.permute(*trans_params).contiguous()
+
+    @property
+    def sparity(self):
+        return self.indices.shape[0] / np.prod(self.spatial_shape) / self.batch_size
+
+
+class ToDense(SparseModule):
+    """convert SparseConvTensor to NCHW dense tensor.
+    """
+    def forward(self, x: SparseConvTensor):
+        return x.dense()
+
+class RemoveGrid(SparseModule):
+    """remove pre-allocated grid buffer.
+    """
+    def forward(self, x: SparseConvTensor):
+        x.grid = None
+        return x
\ No newline at end of file
--- a/spconv/conv.py
+++ b/spconv/conv.py
+# Copyright 2019 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import time
+
+import numpy as np
+import spconv
+import spconv.functional as Fsp
+import torch
+from spconv import ops
+from spconv.modules import SparseModule
+from torch import nn
+from torch.nn import init
+from torch.nn.parameter import Parameter
+
+
+def _calculate_fan_in_and_fan_out_hwio(tensor):
+    dimensions = tensor.ndimension()
+    if dimensions < 2:
+        raise ValueError(
+            "Fan in and fan out can not be computed for tensor with fewer than 2 dimensions"
+        )
+
+    if dimensions == 2:  # Linear
+        fan_in = tensor.size(-2)
+        fan_out = tensor.size(-1)
+    else:
+        num_input_fmaps = tensor.size(-2)
+        num_output_fmaps = tensor.size(-1)
+        receptive_field_size = 1
+        if tensor.dim() > 2:
+            receptive_field_size = tensor[..., 0, 0].numel()
+        fan_in = num_input_fmaps * receptive_field_size
+        fan_out = num_output_fmaps * receptive_field_size
+
+    return fan_in, fan_out
+
+
+class SparseConvolution(SparseModule):
+    def __init__(self,
+                 ndim,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 padding=0,
+                 dilation=1,
+                 groups=1,
+                 bias=True,
+                 subm=False,
+                 output_padding=0,
+                 transposed=False,
+                 inverse=False,
+                 indice_key=None):
+        super(SparseConvolution, self).__init__()
+        assert groups == 1
+        if not isinstance(kernel_size, (list, tuple)):
+            kernel_size = [kernel_size] * ndim
+        if not isinstance(stride, (list, tuple)):
+            stride = [stride] * ndim
+        if not isinstance(padding, (list, tuple)):
+            padding = [padding] * ndim
+        if not isinstance(dilation, (list, tuple)):
+            dilation = [dilation] * ndim
+        if not isinstance(output_padding, (list, tuple)):
+            output_padding = [output_padding] * ndim
+
+        for d, s in zip(dilation, stride):
+            assert any([s == 1, d == 1]), "don't support this."
+
+        self.ndim = ndim
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size = kernel_size
+        self.conv1x1 = np.prod(kernel_size) == 1
+        self.stride = stride
+        self.padding = padding
+        self.dilation = dilation
+        self.transposed = transposed
+        self.inverse = inverse
+        self.output_padding = output_padding
+        self.groups = groups
+        self.subm = subm
+        self.indice_key = indice_key
+
+        self.weight = Parameter(
+            torch.Tensor(*kernel_size, in_channels, out_channels))
+        if bias:
+            self.bias = Parameter(torch.Tensor(out_channels))
+        else:
+            self.register_parameter('bias', None)
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        n = self.in_channels
+        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
+        if self.bias is not None:
+            fan_in, _ = _calculate_fan_in_and_fan_out_hwio(self.weight)
+            bound = 1 / math.sqrt(fan_in)
+            init.uniform_(self.bias, -bound, bound)
+
+    def forward(self, input):
+        assert isinstance(input, spconv.SparseConvTensor)
+        features = input.features
+        device = features.device
+        indices = input.indices
+        spatial_shape = input.spatial_shape
+        batch_size = input.batch_size
+        if not self.subm:
+            if self.transposed:
+                out_spatial_shape = ops.get_deconv_output_size(
+                    spatial_shape, self.kernel_size, self.stride, self.padding, self.dilation, self.output_padding)
+            else:
+                out_spatial_shape = ops.get_conv_output_size(
+                    spatial_shape, self.kernel_size, self.stride, self.padding, self.dilation)
+
+        else:
+            out_spatial_shape = spatial_shape
+        # input.update_grid(out_spatial_shape)
+        # t = time.time()
+        if self.conv1x1:
+            input.features = torch.mm(
+                input.features,
+                self.weight.view(self.in_channels, self.out_channels))
+            if self.bias:
+                input.features += self.bias
+            return input
+        datas = input.find_indice_pair(self.indice_key)
+        if self.inverse:
+            assert datas is not None and self.indice_key is not None
+            _, outids, indice_pairs, indice_pair_num, out_spatial_shape = datas
+        else:
+            if self.indice_key is not None and datas is not None:
+                outids, _, indice_pairs, indice_pair_num, _ = datas
+            else:
+                outids, indice_pairs, indice_pair_num = ops.get_indice_pairs(
+                    indices, batch_size, spatial_shape, self.kernel_size,
+                    self.stride, self.padding, self.dilation, self.output_padding, self.subm, self.transposed, grid=input.grid)
+                input.indice_dict[self.indice_key] = (outids, indices, indice_pairs, indice_pair_num, spatial_shape)
+        if self.subm:
+            out_features = Fsp.indice_subm_conv(features, self.weight,
+                                              indice_pairs.to(device),
+                                              indice_pair_num,
+                                              outids.shape[0])
+        else:
+            if self.inverse:
+                out_features = Fsp.indice_inverse_conv(features,
+                                            self.weight, indice_pairs.to(device),
+                                            indice_pair_num, outids.shape[0])
+            else:
+                out_features = Fsp.indice_conv(features,
+                                            self.weight, indice_pairs.to(device),
+                                            indice_pair_num, outids.shape[0])
+
+        if self.bias:
+            out_features += self.bias
+        out_tensor = spconv.SparseConvTensor(out_features, outids,
+                                             out_spatial_shape, batch_size)
+        out_tensor.indice_dict = input.indice_dict
+        out_tensor.grid = input.grid
+        return out_tensor
+
+
+class SparseConv2d(SparseConvolution):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 padding=0,
+                 dilation=1,
+                 groups=1,
+                 bias=True,
+                 indice_key=None):
+        super(SparseConv2d, self).__init__(
+            2,
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            groups,
+            bias,
+            indice_key=indice_key)
+
+
+class SparseConv3d(SparseConvolution):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 padding=0,
+                 dilation=1,
+                 groups=1,
+                 bias=True,
+                 indice_key=None):
+        super(SparseConv3d, self).__init__(
+            3,
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            groups,
+            bias,
+            indice_key=indice_key)
+
+class SparseConvTranspose2d(SparseConvolution):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 padding=0,
+                 dilation=1,
+                 groups=1,
+                 bias=True,
+                 indice_key=None):
+        super(SparseConvTranspose2d, self).__init__(
+            2,
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            groups,
+            bias,
+            transposed=True,
+            indice_key=indice_key)
+
+
+class SparseConvTranspose3d(SparseConvolution):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 padding=0,
+                 dilation=1,
+                 groups=1,
+                 bias=True,
+                 
+                 indice_key=None):
+        super(SparseConvTranspose3d, self).__init__(
+            3,
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            groups,
+            bias,
+            transposed=True,
+            indice_key=indice_key)
+
+class SparseInverseConv2d(SparseConvolution):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 indice_key,
+                 bias=True):
+        super(SparseInverseConv2d, self).__init__(
+            2,
+            in_channels,
+            out_channels,
+            bias=bias,
+            inverse=True,
+            indice_key=indice_key)
+
+
+class SparseInverseConv3d(SparseConvolution):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 indice_key,
+                 bias=True):
+        super(SparseInverseConv3d, self).__init__(
+            3,
+            in_channels,
+            out_channels,
+            bias=bias,
+            inverse=True,
+            indice_key=indice_key)
+
+
+class SubMConv2d(SparseConvolution):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 padding=0,
+                 dilation=1,
+                 groups=1,
+                 bias=True,
+                 indice_key=None):
+        super(SubMConv2d, self).__init__(
+            2,
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            groups,
+            bias,
+            True,
+            indice_key=indice_key)
+
+
+class SubMConv3d(SparseConvolution):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 padding=0,
+                 dilation=1,
+                 groups=1,
+                 bias=True,
+                 indice_key=None):
+        super(SubMConv3d, self).__init__(
+            3,
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            groups,
+            bias,
+            True,
+            indice_key=indice_key)
--- a/spconv/functional.py
+++ b/spconv/functional.py
+
+# Copyright 2019 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import spconv.ops as ops
+import torch
+from torch import nn
+from torch.autograd import Function
+
+
+class SparseConvFunction(Function):
+    @staticmethod
+    def forward(
+            ctx,
+            features,
+            filters,
+            indice_pairs,
+            indice_pair_num,
+            num_activate_out):
+        ctx.save_for_backward(
+            indice_pairs,
+            indice_pair_num,
+            features,
+            filters)
+        return ops.indice_conv(features, filters, indice_pairs, indice_pair_num, num_activate_out, False)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors
+        input_bp, filters_bp = ops.indice_conv_backward(features, filters, grad_output, indice_pairs, indice_pair_num, False)
+        
+        return input_bp, filters_bp, None, None, None
+
+class SparseInverseConvFunction(Function):
+    @staticmethod
+    def forward(
+            ctx,
+            features,
+            filters,
+            indice_pairs,
+            indice_pair_num,
+            num_activate_out):
+        ctx.save_for_backward(
+            indice_pairs,
+            indice_pair_num,
+            features,
+            filters)
+        return ops.indice_conv(features, filters, indice_pairs, indice_pair_num, num_activate_out, True, False)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors
+        input_bp, filters_bp = ops.indice_conv_backward(features, filters, grad_output, indice_pairs, indice_pair_num, True, False)
+        
+        return input_bp, filters_bp, None, None, None
+
+
+class SubMConvFunction(Function):
+    @staticmethod
+    def forward(
+            ctx,
+            features,
+            filters,
+            indice_pairs,
+            indice_pair_num,
+            num_activate_out):
+        ctx.save_for_backward(
+            indice_pairs,
+            indice_pair_num,
+            features,
+            filters)
+        return ops.indice_conv(features, filters, indice_pairs, indice_pair_num, num_activate_out, False, True)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors
+        input_bp, filters_bp = ops.indice_conv_backward(features, filters, grad_output, indice_pairs, indice_pair_num, False, True)
+        
+        return input_bp, filters_bp, None, None, None
+
+
+class SparseMaxPoolFunction(Function):
+    @staticmethod
+    def forward(
+            ctx,
+            features,
+            indice_pairs,
+            indice_pair_num,
+            num_activate_out):
+        out = ops.indice_maxpool(features, indice_pairs, indice_pair_num, num_activate_out)
+        ctx.save_for_backward(
+            indice_pairs,
+            indice_pair_num,
+            features,
+            out)
+        return out
+    @staticmethod
+    def backward(ctx, grad_output):
+        indice_pairs, indice_pair_num, features, out = ctx.saved_tensors
+        input_bp = ops.indice_maxpool_backward(features, out, grad_output, indice_pairs, indice_pair_num)
+        return input_bp, None, None, None
+
+
+# Functional-style entry points: each name is bound to the ``apply``
+# attribute of the corresponding Function subclass.
+indice_conv = SparseConvFunction.apply
+indice_inverse_conv = SparseInverseConvFunction.apply
+indice_subm_conv = SubMConvFunction.apply
+indice_maxpool = SparseMaxPoolFunction.apply
--- a/spconv/modules.py
+++ b/spconv/modules.py
+# Copyright 2019 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import OrderedDict
+
+import spconv
+import torch
+from torch import nn
+import time 
+
+def is_spconv_module(module):
+    spconv_modules = (SparseModule,)
+    return isinstance(module, spconv_modules)
+
+def _mean_update(vals, m_vals, t):
+    outputs = []
+    if not isinstance(vals, list):
+        vals = [vals]
+    if not isinstance(m_vals, list):
+        m_vals = [m_vals]
+    for val, m_val in zip(vals, m_vals):
+        output = t / float(t + 1) * m_val + 1 / float(t + 1) * val
+        outputs.append(output)
+    if len(outputs) == 1:
+        outputs = outputs[0]
+    return outputs
+
+
+class SparseModule(nn.Module):
+    """ place holder, all module subclass from this will take sptensor in SparseSequential.
+    """
+    pass
+
+
+class SparseSequential(SparseModule):
+    r"""A sequential container.
+    Modules will be added to it in the order they are passed in the constructor.
+    Alternatively, an ordered dict of modules can also be passed in.
+
+    To make it easier to understand, given is a small example::
+
+        # Example of using Sequential
+        model = SparseSequential(
+                  SparseConv2d(1,20,5),
+                  nn.ReLU(),
+                  SparseConv2d(20,64,5),
+                  nn.ReLU()
+                )
+
+        # Example of using Sequential with OrderedDict
+        model = SparseSequential(OrderedDict([
+                  ('conv1', SparseConv2d(1,20,5)),
+                  ('relu1', nn.ReLU()),
+                  ('conv2', SparseConv2d(20,64,5)),
+                  ('relu2', nn.ReLU())
+                ]))
+        
+        # Example of using Sequential with kwargs(python 3.6+)
+        model = SparseSequential(
+                  conv1=SparseConv2d(1,20,5),
+                  relu1=nn.ReLU(),
+                  conv2=SparseConv2d(20,64,5),
+                  relu2=nn.ReLU()
+                )
+    """
+
+    def __init__(self, *args, **kwargs):
+        super(SparseSequential, self).__init__()
+        if len(args) == 1 and isinstance(args[0], OrderedDict):
+            for key, module in args[0].items():
+                self.add_module(key, module)
+        else:
+            for idx, module in enumerate(args):
+                self.add_module(str(idx), module)
+        for name, module in kwargs.items():
+            if sys.version_info < (3, 6):
+                raise ValueError("kwargs only supported in py36+")
+            if name in self._modules:
+                raise ValueError("name exists.")
+            self.add_module(name, module)
+        self._sparity_dict = {}
+
+    def __getitem__(self, idx):
+        if not (-len(self) <= idx < len(self)):
+            raise IndexError('index {} is out of range'.format(idx))
+        if idx < 0:
+            idx += len(self)
+        it = iter(self._modules.values())
+        for i in range(idx):
+            next(it)
+        return next(it)
+
+    def __len__(self):
+        return len(self._modules)
+
+    @property 
+    def sparity_dict(self):
+        return self._sparity_dict
+
+    def add(self, module, name=None):
+        if name is None:
+            name = str(len(self._modules))
+            if name in self._modules:
+                raise KeyError("name exists")
+        self.add_module(name, module)
+
+    def forward(self, input):
+        for k, module in self._modules.items():
+            if is_spconv_module(module): # use SpConvTensor as input
+                assert isinstance(input, spconv.SparseConvTensor)
+                self._sparity_dict[k] = input.sparity
+                input = module(input)
+            else:
+                if isinstance(input, spconv.SparseConvTensor):
+                    if input.indices.shape[0] != 0:
+                        input.features = module(input.features)
+                else:
+                    input = module(input)
+        return input
--- a/spconv/ops.py
+++ b/spconv/ops.py
+# Copyright 2019 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import spconv
+import torch
+
+
+def get_conv_output_size(input_size, kernel_size, stride, padding, dilation):
+    ndim = len(input_size)
+    output_size = []
+    for i in range(ndim):
+        size = (input_size[i] + 2 * padding[i] - dilation[i] *
+                (kernel_size[i] - 1) - 1) // stride[i] + 1
+        if kernel_size[i] == -1:
+            output_size.append(1)
+        else:
+            output_size.append(size)
+    return output_size
+
+
+def get_deconv_output_size(input_size, kernel_size, stride, padding, dilation,
+                            output_padding):
+    ndim = len(input_size)
+    output_size = []
+    for i in range(ndim):
+        if kernel_size[i] == -1:
+            raise ValueError("deconv don't support kernel_size < 0")
+        size = (input_size[i] - 1) * stride[i] - 2 * padding[i] + kernel_size[
+            i] + output_padding[i]
+        output_size.append(size)
+    return output_size
+
+
+def get_indice_pairs(indices,
+             batch_size,
+             spatial_shape,
+             ksize=3,
+             stride=1,
+             padding=0,
+             dilation=1,
+             out_padding=0,
+             subm=False,
+             transpose=False,
+             grid=None):
+    ndim = indices.shape[1] - 1
+    if not isinstance(ksize, (list, tuple)):
+        ksize = [ksize] * ndim
+    if not isinstance(stride, (list, tuple)):
+        stride = [stride] * ndim
+    if not isinstance(padding, (list, tuple)):
+        padding = [padding] * ndim
+    if not isinstance(dilation, (list, tuple)):
+        dilation = [dilation] * ndim
+    if not isinstance(out_padding, (list, tuple)):
+        out_padding = [out_padding] * ndim
+
+    for d, s in zip(dilation, stride):
+        assert any([s == 1, d == 1]), "don't support this."
+    
+    if not subm:
+        if transpose:
+            out_shape = get_deconv_output_size(spatial_shape, ksize, stride, padding,
+                                            dilation, out_padding)
+        else:
+            out_shape = get_conv_output_size(spatial_shape, ksize, stride, padding,
+                                            dilation)
+
+    else:
+        out_shape = spatial_shape
+    if grid is None:
+        if ndim == 2:
+            get_indice_pairs_func = torch.ops.spconv.get_indice_pairs_2d
+        elif ndim == 3:
+            get_indice_pairs_func = torch.ops.spconv.get_indice_pairs_3d
+        else:
+            raise NotImplementedError
+        return get_indice_pairs_func(indices, batch_size, out_shape, spatial_shape, ksize,
+                            stride, padding, dilation, out_padding, subm, transpose)
+    else:
+        if ndim == 2:
+            get_indice_pairs_func = torch.ops.spconv.get_indice_pairs_grid_2d
+        elif ndim == 3:
+            get_indice_pairs_func = torch.ops.spconv.get_indice_pairs_grid_3d
+        else:
+            raise NotImplementedError
+        return get_indice_pairs_func(indices, grid, batch_size, out_shape, spatial_shape, ksize,
+                            stride, padding, dilation, out_padding, subm, transpose)
+
+
+
+def indice_conv(features,
+              filters,
+              indice_pairs,
+              indice_pair_num,
+              num_activate_out,
+              inverse=False,
+              subm=False):
+    if filters.dtype == torch.float32:
+        return torch.ops.spconv.indice_conv_fp32(features, filters, indice_pairs,
+                                               indice_pair_num, num_activate_out,
+                                               inverse, subm)
+    elif filters.dtype == torch.half:
+        return torch.ops.spconv.indice_conv_half(features, filters, indice_pairs,
+                                               indice_pair_num, num_activate_out,
+                                               inverse, subm)
+    else:
+        raise NotImplementedError
+
+
+def indice_conv_backward(features,
+                       filters,
+                       out_bp,
+                       indice_pairs,
+                       indice_pair_num,
+                       inverse=False,
+                       subm=False):
+    if filters.dtype == torch.float32:
+        return torch.ops.spconv.indice_conv_backward_fp32(
+            features, filters, out_bp, indice_pairs, indice_pair_num, inverse, subm)
+    elif filters.dtype == torch.half:
+        return torch.ops.spconv.indice_conv_backward_half(
+            features, filters, out_bp, indice_pairs, indice_pair_num, inverse, subm)
+    else:
+        raise NotImplementedError
+
+
+def indice_maxpool(features, indice_pairs, indice_pair_num, num_activate_out):
+    if features.dtype == torch.float32:
+        return torch.ops.spconv.indice_maxpool_fp32(features, indice_pairs, indice_pair_num,
+                                                  num_activate_out)
+    elif features.dtype == torch.half:
+        return torch.ops.spconv.indice_maxpool_half(features, indice_pairs, indice_pair_num,
+                                                  num_activate_out)
+    else:
+        raise NotImplementedError
+
+
+def indice_maxpool_backward(features, out_features, out_bp, indice_pairs, indice_pair_num):
+    if features.dtype == torch.float32:
+        return torch.ops.spconv.indice_maxpool_backward_fp32(
+            features, out_features, out_bp, indice_pairs, indice_pair_num)
+    elif features.dtype == torch.half:
+        return torch.ops.spconv.indice_maxpool_backward_half(
+            features, out_features, out_bp, indice_pairs, indice_pair_num)
+    else:
+        raise NotImplementedError
--- a/spconv/pool.py
+++ b/spconv/pool.py
+# Copyright 2019 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import math
+import time
+
+import numpy as np
+import spconv
+import spconv.functional as Fsp
+import torch
+from spconv import ops
+from spconv.modules import SparseModule
+from torch import nn
+from torch.nn import init
+from torch.nn.parameter import Parameter
+
+
+class SparseMaxPool(SparseModule):
+    def __init__(self,
+                 ndim,
+                 kernel_size,
+                 stride=1,
+                 padding=0,
+                 dilation=1,
+                 subm=False):
+        super(SparseMaxPool, self).__init__()
+        if not isinstance(kernel_size, (list, tuple)):
+            kernel_size = [kernel_size] * ndim
+        if not isinstance(stride, (list, tuple)):
+            stride = [stride] * ndim
+        if not isinstance(padding, (list, tuple)):
+            padding = [padding] * ndim
+        if not isinstance(dilation, (list, tuple)):
+            dilation = [dilation] * ndim
+
+        self.ndim = ndim
+        self.kernel_size = kernel_size
+        self.stride = stride
+        self.padding = padding
+        self.subm = subm
+        self.dilation = dilation
+
+    def forward(self, input):
+        assert isinstance(input, spconv.SparseConvTensor)
+        features = input.features
+        device = features.device
+        indices = input.indices
+        spatial_shape = input.spatial_shape
+        batch_size = input.batch_size
+        if not self.subm:
+            out_spatial_shape = ops.get_conv_output_size(
+                spatial_shape, self.kernel_size, self.stride, self.padding, self.dilation)
+        else:
+            out_spatial_shape = spatial_shape
+        outids, indice_pairs, indice_pairs_num = ops.get_indice_pairs(
+            indices, batch_size, spatial_shape, self.kernel_size,
+            self.stride, self.padding, self.dilation, 0, self.subm)
+        
+        out_features = Fsp.indice_maxpool(features, indice_pairs.to(device),
+                                        indice_pairs_num.to(device), outids.shape[0])
+        out_tensor = spconv.SparseConvTensor(out_features, outids,
+                                             out_spatial_shape, batch_size)
+        out_tensor.indice_dict = input.indice_dict
+        out_tensor.grid = input.grid
+        return out_tensor
+
+
+class SparseMaxPool2d(SparseMaxPool):
+    def __init__(self,
+                 kernel_size,
+                 stride=1,
+                 padding=0,
+                 dilation=1):
+        super(SparseMaxPool2d, self).__init__(
+            2,
+            kernel_size,
+            stride,
+            padding,
+            dilation)
+
+
+class SparseMaxPool3d(SparseMaxPool):
+    def __init__(self,
+                 kernel_size,
+                 stride=1,
+                 padding=0,
+                 dilation=1):
+        super(SparseMaxPool3d, self).__init__(
+            3,
+            kernel_size,
+            stride,
+            padding,
+            dilation)
--- a/spconv/test_utils.py
+++ b/spconv/test_utils.py
+# Copyright 2019 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+
+class TestCase(unittest.TestCase):
+    def _GetNdArray(self, a):
+        if not isinstance(a, np.ndarray):
+            a = np.array(a)
+        return a
+
+    def assertAllEqual(self, a, b):
+        """Asserts that two numpy arrays have the same values.
+        Args:
+        a: the expected numpy ndarray or anything can be converted to one.
+        b: the actual numpy ndarray or anything can be converted to one.
+        """
+        a = self._GetNdArray(a)
+        b = self._GetNdArray(b)
+        self.assertEqual(a.shape, b.shape,
+                         "Shape mismatch: expected %s, got %s." % (a.shape,
+                                                                   b.shape))
+        same = (a == b)
+
+        if a.dtype == np.float32 or a.dtype == np.float64:
+            same = np.logical_or(same, np.logical_and(
+                np.isnan(a), np.isnan(b)))
+        if not np.all(same):
+            # Prints more details than np.testing.assert_array_equal.
+            diff = np.logical_not(same)
+            if a.ndim:
+                x = a[np.where(diff)]
+                y = b[np.where(diff)]
+                print("not equal where = ", np.where(diff))
+            else:
+                # np.where is broken for scalars
+                x, y = a, b
+            print("not equal lhs = ", x)
+            print("not equal rhs = ", y)
+            np.testing.assert_array_equal(a, b)
+
+    def assertAllClose(self, a, b, rtol=1e-6, atol=1e-6):
+        """Asserts that two numpy arrays, or dicts of same, have near values.
+        This does not support nested dicts.
+        Args:
+        a: The expected numpy ndarray (or anything can be converted to one), or
+            dict of same. Must be a dict iff `b` is a dict.
+        b: The actual numpy ndarray (or anything can be converted to one), or
+            dict of same. Must be a dict iff `a` is a dict.
+        rtol: relative tolerance.
+        atol: absolute tolerance.
+        Raises:
+        ValueError: if only one of `a` and `b` is a dict.
+        """
+        is_a_dict = isinstance(a, dict)
+        if is_a_dict != isinstance(b, dict):
+            raise ValueError("Can't compare dict to non-dict, %s vs %s." % (a,
+                                                                            b))
+        if is_a_dict:
+            self.assertCountEqual(
+                a.keys(),
+                b.keys(),
+                msg="mismatched keys, expected %s, got %s" % (a.keys(),
+                                                              b.keys()))
+            for k in a:
+                self._assertArrayLikeAllClose(
+                    a[k],
+                    b[k],
+                    rtol=rtol,
+                    atol=atol,
+                    msg="%s: expected %s, got %s." % (k, a, b))
+        else:
+            self._assertArrayLikeAllClose(a, b, rtol=rtol, atol=atol)
+
+    def _assertArrayLikeAllClose(self, a, b, rtol=1e-6, atol=1e-6, msg=None):
+        a = self._GetNdArray(a)
+        b = self._GetNdArray(b)
+        self.assertEqual(a.shape, b.shape,
+                         "Shape mismatch: expected %s, got %s." % (a.shape,
+                                                                   b.shape))
+        if not np.allclose(a, b, rtol=rtol, atol=atol):
+            # Prints more details than np.testing.assert_allclose.
+            #
+            # NOTE: numpy.allclose (and numpy.testing.assert_allclose)
+            # checks whether two arrays are element-wise equal within a
+            # tolerance. The relative difference (rtol * abs(b)) and the
+            # absolute difference atol are added together to compare against
+            # the absolute difference between a and b.  Here, we want to
+            # print out which elements violate such conditions.
+            cond = np.logical_or(
+                np.abs(a - b) > atol + rtol * np.abs(b),
+                np.isnan(a) != np.isnan(b))
+            if a.ndim:
+                x = a[np.where(cond)]
+                y = b[np.where(cond)]
+                print("not close where = ", np.where(cond))
+            else:
+                # np.where is broken for scalars
+                x, y = a, b
+            print("not close lhs = ", x)
+            print("not close rhs = ", y)
+            print("not close dif = ", np.abs(x - y))
+            print("not close tol = ", atol + rtol * np.abs(y))
+            print("dtype = %s, shape = %s" % (a.dtype, a.shape))
+            np.testing.assert_allclose(a, b, rtol=rtol, atol=atol, err_msg=msg)
+
+def params_grid(*params):
+    size = len(params)
+    length = 1
+    for p in params:
+        length *= len(p)
+    sizes = [len(p) for p in params]
+    counter = [0] * size
+    total = []
+    for i in range(length):
+        total.append([0]* size)
+    for i in range(length):
+        for j in range(size):
+            total[i][j] = params[j][counter[j]]
+        counter[size - 1] += 1
+        for c in range(size - 1, -1, -1):
+            if (counter[c] == sizes[c] and c > 0):
+                counter[c - 1] += 1
+                counter[c] = 0
+    return total
+
+def generate_sparse_data(shape,
+                    num_points,
+                    num_channels,
+                    integer=False,
+                    data_range=(-1, 1),
+                    with_dense=True,
+                    dtype=np.float32):
+    dense_shape = shape
+    ndim = len(dense_shape)
+    # num_points = np.random.randint(10, 100, size=[batch_size, ndim])
+    num_points = np.array(num_points)
+    # num_points = np.array([3, 2])
+    batch_size = len(num_points)
+    batch_indices = []
+    coors_total = np.stack(
+        np.meshgrid(*[np.arange(0, s) for s in shape]), axis=-1)
+    coors_total = coors_total.reshape(-1, ndim)
+    for i in range(batch_size):
+        np.random.shuffle(coors_total)
+        inds_total = coors_total[:num_points[i]]
+        inds_total = np.pad(
+            inds_total, ((0, 0), (0, 1)), mode="constant", constant_values=i)
+        batch_indices.append(inds_total)
+    if integer:
+        sparse_data = np.random.randint(
+            data_range[0], data_range[1], size=[num_points.sum(), num_channels]).astype(dtype)
+    else:
+        sparse_data = np.random.uniform(
+            data_range[0], data_range[1], size=[num_points.sum(), num_channels]).astype(dtype)
+
+    # sparse_data = np.arange(1, num_points.sum() + 1).astype(np.float32).reshape(5, 1)
+    
+        
+    res =  {
+        "features": sparse_data.astype(dtype),
+        
+    }
+    if with_dense:
+        dense_data = np.zeros(
+            [batch_size, num_channels, *dense_shape], dtype=sparse_data.dtype)
+        start = 0
+        for i, inds in enumerate(batch_indices):
+            for j, ind in enumerate(inds):
+                dense_slice = (i, slice(None), *ind[:-1])
+                dense_data[dense_slice] = sparse_data[start + j]
+            start += len(inds)
+        res["features_dense"] = dense_data.astype(dtype)
+    batch_indices = np.concatenate(batch_indices, axis=0)
+    res["indices"] = batch_indices.astype(np.int32)
+    return res 
--- a/spconv/utils/__init__.py
+++ b/spconv/utils/__init__.py
+# Copyright 2019 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+from spconv import spconv_utils
+from spconv.spconv_utils import (non_max_suppression, non_max_suppression_cpu,
+                                 points_to_voxel_3d_np, rbbox_iou,
+                                 rotate_non_max_suppression_cpu)
+
+def points_to_voxel(points,
+                     voxel_size,
+                     coors_range,
+                     coor_to_voxelidx,
+                     max_points=35,
+                     max_voxels=20000):
+    """convert 3d points(N, >=3) to voxels. This version calculate
+    everything in one loop. now it takes only 0.8ms(~6k voxels) 
+    with c++ and 3.2ghz cpu.
+
+    Args:
+        points: [N, ndim] float tensor. points[:, :3] contain xyz points and
+            points[:, 3:] contain other information such as reflectivity.
+        voxel_size: [3] list/tuple or array, float. xyz, indicate voxel size
+        coors_range: [6] list/tuple or array, float. indicate voxel range.
+            format: xyzxyz, minmax
+        coor_to_voxelidx: int array. used as a dense map.
+        max_points: int. indicate maximum points contained in a voxel.
+        max_voxels: int. indicate maximum voxels this function create.
+            for voxelnet, 20000 is a good choice. you should shuffle points
+            before call this function because max_voxels may drop some points.
+
+    Returns:
+        voxels: [M, max_points, ndim] float tensor. only contain points.
+        coordinates: [M, 3] int32 tensor. zyx format.
+        num_points_per_voxel: [M] int32 tensor.
+    """
+    if not isinstance(voxel_size, np.ndarray):
+        voxel_size = np.array(voxel_size, dtype=points.dtype)
+    if not isinstance(coors_range, np.ndarray):
+        coors_range = np.array(coors_range, dtype=points.dtype)
+    voxelmap_shape = (coors_range[3:] - coors_range[:3]) / voxel_size
+    voxelmap_shape = tuple(np.round(voxelmap_shape).astype(np.int32).tolist())
+    voxelmap_shape = voxelmap_shape[::-1]
+    num_points_per_voxel = np.zeros(shape=(max_voxels, ), dtype=np.int32)
+    voxels = np.zeros(
+        shape=(max_voxels, max_points, points.shape[-1]), dtype=points.dtype)
+    coors = np.zeros(shape=(max_voxels, 3), dtype=np.int32)
+    voxel_num = points_to_voxel_3d_np(
+        points, voxels, coors, num_points_per_voxel, coor_to_voxelidx,
+        voxel_size.tolist(), coors_range.tolist(), max_points, max_voxels)
+    coors = coors[:voxel_num]
+    voxels = voxels[:voxel_num]
+    num_points_per_voxel = num_points_per_voxel[:voxel_num]
+    return voxels, coors, num_points_per_voxel
+
+class VoxelGenerator:
+    def __init__(self,
+                 voxel_size,
+                 point_cloud_range,
+                 max_num_points,
+                 max_voxels=20000):
+        point_cloud_range = np.array(point_cloud_range, dtype=np.float32)
+        # [0, -40, -3, 70.4, 40, 1]
+        voxel_size = np.array(voxel_size, dtype=np.float32)
+        grid_size = (
+            point_cloud_range[3:] - point_cloud_range[:3]) / voxel_size
+        grid_size = np.round(grid_size).astype(np.int64)
+        voxelmap_shape = tuple(np.round(grid_size).astype(np.int32).tolist())
+        voxelmap_shape = voxelmap_shape[::-1]
+
+        self._coor_to_voxelidx = np.full(voxelmap_shape, -1, dtype=np.int32)
+        self._voxel_size = voxel_size
+        self._point_cloud_range = point_cloud_range
+        self._max_num_points = max_num_points
+        self._max_voxels = max_voxels
+        self._grid_size = grid_size
+
+    def generate(self, points, max_voxels=None):
+        res = points_to_voxel(
+            points, self._voxel_size, self._point_cloud_range, self._coor_to_voxelidx,
+            self._max_num_points, max_voxels or self._max_voxels)
+        return res 
+
+
+    @property
+    def voxel_size(self):
+        return self._voxel_size
+
+    @property
+    def max_num_points_per_voxel(self):
+        return self._max_num_points
+
+
+    @property
+    def point_cloud_range(self):
+        return self._point_cloud_range
+
+    @property
+    def grid_size(self):
+        return self._grid_size
\ No newline at end of file
--- a/src/spconv/CMakeLists.txt
+++ b/src/spconv/CMakeLists.txt
+# Build the spconv shared library from the host (.cc) and CUDA (.cu) sources.
+add_library(spconv SHARED all.cc indice.cc indice.cu 
+            reordering.cc reordering.cu maxpool.cc maxpool.cu)
+
+# ALL_INCLUDE and ALL_LIBS are not set in this file; they are expected to be
+# defined before this directory is processed.
+target_include_directories(spconv PRIVATE ${ALL_INCLUDE} )
+# Compile both the CUDA and the C++ sources as C++14.
+set_property(TARGET spconv PROPERTY CUDA_STANDARD 14)
+set_property(TARGET spconv PROPERTY CXX_STANDARD 14)
+set_target_properties(spconv PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+target_link_libraries(spconv PRIVATE ${ALL_LIBS})
+# Installed under <prefix>/lib.
+install (TARGETS spconv DESTINATION lib)
--- a/src/spconv/all.cc
+++ b/src/spconv/all.cc
+// Copyright 2019 Yan Yan
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// 
+//     http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cuda_runtime_api.h>
+#include <spconv/pool_ops.h>
+#include <spconv/spconv_ops.h>
+
+// Registers the spconv operators through torch::jit::RegisterOperators when
+// this static variable is initialized. Each "spconv::<name>" string is bound
+// to one template instantiation: 2-D/3-D for the indice-pair builders,
+// float/at::Half for the conv and max-pool forward/backward ops.
+static auto registry =
+    torch::jit::RegisterOperators("spconv::get_indice_pairs_2d", &spconv::getIndicePair<2>)
+        .op("spconv::get_indice_pairs_3d", &spconv::getIndicePair<3>)
+        .op("spconv::get_indice_pairs_grid_2d", &spconv::getIndicePairPreGrid<2>)
+        .op("spconv::get_indice_pairs_grid_3d", &spconv::getIndicePairPreGrid<3>)
+        .op("spconv::indice_conv_fp32", &spconv::indiceConv<float>)
+        .op("spconv::indice_conv_backward_fp32", &spconv::indiceConvBackward<float>)
+        .op("spconv::indice_conv_half", &spconv::indiceConv<at::Half>)
+        .op("spconv::indice_conv_backward_half",
+            &spconv::indiceConvBackward<at::Half>)
+        .op("spconv::indice_maxpool_fp32", &spconv::indiceMaxPool<float>)
+        .op("spconv::indice_maxpool_backward_fp32",
+            &spconv::indiceMaxPoolBackward<float>)
+        .op("spconv::indice_maxpool_half", &spconv::indiceMaxPool<at::Half>)
+        .op("spconv::indice_maxpool_backward_half",
+            &spconv::indiceMaxPoolBackward<at::Half>);
\ No newline at end of file
--- a/src/spconv/indice.cc
+++ b/src/spconv/indice.cc
+// Copyright 2019 Yan Yan
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// 
+//     http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <spconv/geometry.h>
+#include <spconv/indice.h>
+#include <spconv/spconv_ops.h>
+#include <torch/script.h>
+
+namespace spconv {
+
+namespace functor {
+// CPU specialization: forwards to getIndicePairsConv, or to
+// getIndicePairsDeConv when `transpose` is set, and returns that call's
+// result. The `d` and `resetGrid` arguments are not used here.
+template <typename Index, typename IndexGrid, unsigned NDim>
+struct CreateConvIndicePairFunctor<tv::CPU, Index, IndexGrid, NDim> {
+  Index operator()(const tv::CPU& d, tv::TensorView<const Index> indicesIn,
+                   tv::TensorView<Index> indicesOut,
+                   tv::TensorView<IndexGrid> gridsOut,
+                   tv::TensorView<Index> indicePairs,
+                   tv::TensorView<Index> indiceNum,
+                   const tv::SimpleVector<Index, NDim> kernelSize,
+                   const tv::SimpleVector<Index, NDim> stride,
+                   const tv::SimpleVector<Index, NDim> padding,
+                   const tv::SimpleVector<Index, NDim> dilation,
+                   const tv::SimpleVector<Index, NDim> outSpatialShape,
+                   bool transpose, bool resetGrid) {
+    if (!transpose) {
+      return getIndicePairsConv<Index, IndexGrid, NDim>(
+          indicesIn, indicesOut, gridsOut, indicePairs, indiceNum,
+          kernelSize.data(), stride.data(), padding.data(), dilation.data(),
+          outSpatialShape.data());
+    }
+    return getIndicePairsDeConv<Index, IndexGrid, NDim>(
+        indicesIn, indicesOut, gridsOut, indicePairs, indiceNum,
+        kernelSize.data(), stride.data(), padding.data(), dilation.data(),
+        outSpatialShape.data());
+  }
+};
+/// CPU specialization of the SubM index-pair functor.
+///
+/// Forwards every geometry argument to getIndicePairsSubM and returns its
+/// result. `d`, `transpose` and `resetGrid` are not read in this
+/// specialization.
+template <typename Index, typename IndexGrid, unsigned NDim>
+struct CreateSubMIndicePairFunctor<tv::CPU, Index, IndexGrid, NDim> {
+  Index operator()(const tv::CPU& d, tv::TensorView<const Index> indicesIn,
+                   tv::TensorView<IndexGrid> gridsOut,
+                   tv::TensorView<Index> indicePairs,
+                   tv::TensorView<Index> indiceNum,
+                   const tv::SimpleVector<Index, NDim> kernelSize,
+                   const tv::SimpleVector<Index, NDim> stride,
+                   const tv::SimpleVector<Index, NDim> padding,
+                   const tv::SimpleVector<Index, NDim> dilation,
+                   const tv::SimpleVector<Index, NDim> outSpatialShape,
+                   bool transpose, bool resetGrid) {
+    const Index result = getIndicePairsSubM<Index, IndexGrid, NDim>(
+        indicesIn, gridsOut, indicePairs, indiceNum,
+        kernelSize.data(), stride.data(), padding.data(), dilation.data(),
+        outSpatialShape.data());
+    return result;
+  }
+};
+} // namespace functor
+
+// Explicitly instantiates both CPU functors for one (Index, NDIM) pair;
+// IndexGrid is fixed to int. The use site supplies the final semicolon.
+#define DECLARE_CPU_SPECS_INDEX_NDIM(Index, NDIM)                              \
+  template struct functor::CreateConvIndicePairFunctor<tv::CPU, Index, int,    \
+                                                       NDIM>;                  \
+  template struct functor::CreateSubMIndicePairFunctor<tv::CPU, Index, int,    \
+                                                       NDIM>
+
+// Instantiates the CPU functors for NDIM = 1, 2, 3 and 4.
+#define DECLARE_CPU_INDEX(Index)                                               \
+  DECLARE_CPU_SPECS_INDEX_NDIM(Index, 1);                                      \
+  DECLARE_CPU_SPECS_INDEX_NDIM(Index, 2);                                      \
+  DECLARE_CPU_SPECS_INDEX_NDIM(Index, 3);                                      \
+  DECLARE_CPU_SPECS_INDEX_NDIM(Index, 4)
+
+DECLARE_CPU_INDEX(int);
+DECLARE_CPU_INDEX(long);
+
+// The helper macros are local to this translation unit.
+#undef DECLARE_CPU_INDEX
+#undef DECLARE_CPU_SPECS_INDEX_NDIM
+
+} // namespace spconv
+
--- a/src/spconv/indice.cu
+++ b/src/spconv/indice.cu
+// Copyright 2019 Yan Yan
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <ATen/ATen.h>
+#include <chrono>
+#include <limits>
+#include <spconv/mp_helper.h>
+#include <spconv/indice.h>
+#include <spconv/indice.cu.h>
+#include <tensorview/helper_launch.h>
+#include <tensorview/tensorview.h>
+#include <type_traits>
+#include <utility/timer.h>
+
+namespace spconv {
+namespace functor {
+/// GPU specialization, phase 1 of conv index-pair generation.
+///
+/// Launches prepareDeConvIndicePairsKernel (when `transpose` is true) or
+/// prepareIndicePairsKernel on the stream of `d`, with the grid sized from
+/// indicesIn.dim(0). No CUDA error check is performed after the launch.
+///
+/// @return 0 without launching anything when indicesIn.dim(0) == 0,
+///         otherwise 1.
+template <typename Index, typename IndexGrid, unsigned NDim>
+struct CreateConvIndicePairFunctorP1<tv::GPU, Index, IndexGrid, NDim> {
+  Index operator()(const tv::GPU &d, tv::TensorView<const Index> indicesIn,
+                   tv::TensorView<Index> indicesOut,
+                   tv::TensorView<IndexGrid> gridsOut,
+                   tv::TensorView<Index> indicePairs,
+                   tv::TensorView<Index> indiceNum,
+                   tv::TensorView<Index> indicePairUnique,
+                   const tv::SimpleVector<Index, NDim> kernelSize,
+                   const tv::SimpleVector<Index, NDim> stride,
+                   const tv::SimpleVector<Index, NDim> padding,
+                   const tv::SimpleVector<Index, NDim> dilation,
+                   const tv::SimpleVector<Index, NDim> outSpatialShape,
+                   bool transpose) {
+    auto numActIn = indicesIn.dim(0);
+    // Nothing to prepare for an empty input; skip the launch entirely.
+    if (numActIn == 0)
+      return 0;
+    // auto timer = spconv::CudaContextTimer<>();
+    if (transpose)
+      prepareDeConvIndicePairsKernel<Index, IndexGrid, NDim, 256>
+          <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
+             d.stream()>>>(indicesIn, indicesOut, gridsOut, indicePairs,
+                           indiceNum, indicePairUnique, kernelSize, stride,
+                           padding, dilation, outSpatialShape);
+    else
+      prepareIndicePairsKernel<Index, IndexGrid, NDim, 256>
+          <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
+             d.stream()>>>(indicesIn, indicesOut, gridsOut, indicePairs,
+                           indiceNum, indicePairUnique, kernelSize, stride,
+                           padding, dilation, outSpatialShape);
+    // std::cout << "p1 gene time " << timer.report() / 1000.0 << std::endl;
+    return 1;
+  }
+};
+
+/// GPU specialization, phase 2 of conv index-pair generation.
+///
+/// With numAct = indicePairUnique.dim(0) - 1, launches on the stream of `d`:
+///   1. assignGridAndIndiceOutKernel, grid sized from numAct;
+///   2. assignIndicePairsKernel, grid sized from indicesIn.dim(0);
+///   3. resetGridKernel, grid sized from numAct, only when `resetGrid` is set.
+/// `indiceNum` and `transpose` are not read in this specialization, and no
+/// CUDA error check is performed after the launches.
+///
+/// @return 0 without launching anything when indicesIn.dim(0) == 0,
+///         otherwise numAct.
+template <typename Index, typename IndexGrid, unsigned NDim>
+struct CreateConvIndicePairFunctorP2<tv::GPU, Index, IndexGrid, NDim> {
+  Index operator()(const tv::GPU &d, tv::TensorView<const Index> indicesIn,
+                   tv::TensorView<Index> indicesOut,
+                   tv::TensorView<IndexGrid> gridsOut,
+                   tv::TensorView<Index> indicePairs,
+                   tv::TensorView<Index> indiceNum,
+                   tv::TensorView<Index> indicePairUnique,
+                   const tv::SimpleVector<Index, NDim> outSpatialShape,
+                   bool transpose, bool resetGrid) {
+    Index batchSize = gridsOut.dim(0);
+    auto numActIn = indicesIn.dim(0);
+    if (numActIn == 0)
+      return 0;
+    // NOTE(review): if indicePairUnique holds a single element, numAct is 0;
+    // confirm tv::launch::getBlocks(0) yields a valid launch configuration.
+    Index numAct = indicePairUnique.dim(0) - 1;
+    assignGridAndIndiceOutKernel<Index, IndexGrid, NDim>
+        <<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
+           d.stream()>>>(indicesOut, gridsOut, numAct, indicePairs,
+                         indicePairUnique, outSpatialShape, batchSize);
+    assignIndicePairsKernel<Index, IndexGrid, NDim>
+        <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
+           d.stream()>>>(indicesOut, gridsOut, numActIn, indicePairs,
+                         indicePairUnique, outSpatialShape);
+    if (resetGrid) {
+      resetGridKernel<Index, IndexGrid, NDim>
+          <<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
+             d.stream()>>>(indicePairUnique.data(), gridsOut, numAct);
+    }
+    return numAct;
+  }
+};
+
+/// GPU specialization of the SubM index-pair functor.
+///
+/// Launches, in order and on the stream of `d`, prepareSubMGridKernel,
+/// getSubMIndicePairsKernel and, when `resetGrid` is set,
+/// resetGridSubMKernel. Every launch uses a grid sized from
+/// indicesIn.dim(0). `transpose` is not read in this specialization.
+///
+/// @return 0 without launching anything when indicesIn.dim(0) == 0,
+///         otherwise indicesIn.dim(0).
+template <typename Index, typename IndexGrid, unsigned NDim>
+struct CreateSubMIndicePairFunctor<tv::GPU, Index, IndexGrid, NDim> {
+  Index operator()(const tv::GPU &d, tv::TensorView<const Index> indicesIn,
+                   tv::TensorView<IndexGrid> gridsOut,
+                   tv::TensorView<Index> indicePairs,
+                   tv::TensorView<Index> indiceNum,
+                   const tv::SimpleVector<Index, NDim> kernelSize,
+                   const tv::SimpleVector<Index, NDim> stride,
+                   const tv::SimpleVector<Index, NDim> padding,
+                   const tv::SimpleVector<Index, NDim> dilation,
+                   const tv::SimpleVector<Index, NDim> outSpatialShape,
+                   bool transpose, bool resetGrid) {
+    auto numActIn = indicesIn.dim(0);
+    if (numActIn == 0) {
+      return 0;
+    }
+    // All three kernels share one launch configuration.
+    const auto numBlocks = tv::launch::getBlocks(numActIn);
+    auto stream = d.stream();
+
+    prepareSubMGridKernel<Index, IndexGrid, NDim>
+        <<<numBlocks, tv::launch::CUDA_NUM_THREADS, 0, stream>>>(
+            indicesIn, gridsOut, outSpatialShape);
+    getSubMIndicePairsKernel<Index, IndexGrid, NDim>
+        <<<numBlocks, tv::launch::CUDA_NUM_THREADS, 0, stream>>>(
+            indicesIn, gridsOut, indicePairs, indiceNum, kernelSize, stride,
+            padding, dilation, outSpatialShape);
+    if (resetGrid) {
+      resetGridSubMKernel<Index, IndexGrid, NDim>
+          <<<numBlocks, tv::launch::CUDA_NUM_THREADS, 0, stream>>>(
+              indicesIn.data(), gridsOut, outSpatialShape, numActIn);
+    }
+    return numActIn;
+  }
+};
+} // namespace functor
+
+// Explicitly instantiates the GPU functors for one (Index, NDIM) pair;
+// IndexGrid is fixed to int. The use site supplies the final semicolon.
+// NOTE(review): this file defines no tv::GPU specialization of
+// CreateConvIndicePairFunctor, so the first line instantiates whatever the
+// primary template provides -- confirm it is still needed.
+#define DECLARE_GPU_SPECS_INDEX_NDIM(Index, NDIM)                              \
+  template struct functor::CreateConvIndicePairFunctor<tv::GPU, Index, int,    \
+                                                       NDIM>;                  \
+  template struct functor::CreateConvIndicePairFunctorP1<tv::GPU, Index, int,  \
+                                                         NDIM>;                \
+  template struct functor::CreateConvIndicePairFunctorP2<tv::GPU, Index, int,  \
+                                                         NDIM>;                \
+  template struct functor::CreateSubMIndicePairFunctor<tv::GPU, Index, int,    \
+                                                       NDIM>
+
+// Instantiates the GPU functors for NDIM = 1, 2, 3 and 4.
+#define DECLARE_GPU_INDEX(Index)                                               \
+  DECLARE_GPU_SPECS_INDEX_NDIM(Index, 1);                                      \
+  DECLARE_GPU_SPECS_INDEX_NDIM(Index, 2);                                      \
+  DECLARE_GPU_SPECS_INDEX_NDIM(Index, 3);                                      \
+  DECLARE_GPU_SPECS_INDEX_NDIM(Index, 4)
+
+// Only the int index type is instantiated for the GPU here.
+DECLARE_GPU_INDEX(int);
+
+// The helper macros are local to this translation unit.
+#undef DECLARE_GPU_INDEX
+#undef DECLARE_GPU_SPECS_INDEX_NDIM
+} // namespace spconv
\ No newline at end of file
--- a/src/spconv/maxpool.cc
+++ b/src/spconv/maxpool.cc
+// Copyright 2019 Yan Yan
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <spconv/maxpool.h>
+#include <torch/script.h>
+
+namespace spconv {
+
+namespace functor {
+/// CPU specialization of the sparse max-pool forward pass.
+///
+/// For each of the first `size` index pairs (input row taken from
+/// indices.subview(0), output row from indices.subview(1)), every element of
+/// the output row is overwritten with the matching input element when the
+/// input is strictly greater. `outFeatures` is updated in place, the row
+/// width is outFeatures.dim(1), and `d` is not read.
+template <typename T, typename Index>
+struct SparseMaxPoolForwardFunctor<tv::CPU, T, Index> {
+  void operator()(const tv::CPU &d, tv::TensorView<T> outFeatures,
+                  tv::TensorView<const T> inFeatures,
+                  tv::TensorView<const Index> indices, int size) {
+    const int numPlanes = outFeatures.dim(1);
+    auto dst = outFeatures.data();
+    auto src = inFeatures.data();
+    auto inRows = indices.subview(0).data();
+    auto outRows = indices.subview(1).data();
+    for (int pair = 0; pair < size; ++pair) {
+      // Flat element offsets of the paired rows, held in Index.
+      const Index srcBase = inRows[pair] * numPlanes;
+      const Index dstBase = outRows[pair] * numPlanes;
+      for (int plane = 0; plane < numPlanes; ++plane) {
+        T &outVal = dst[dstBase + plane];
+        const T &inVal = src[srcBase + plane];
+        if (outVal < inVal) {
+          outVal = inVal;
+        }
+      }
+    }
+  }
+};
+
+/// CPU specialization of the sparse max-pool backward pass.
+///
+/// For each of the first `size` index pairs (input row taken from
+/// indices.subview(0), output row from indices.subview(1)), the output
+/// gradient element is accumulated into `din` wherever the input element
+/// compares equal to the output element. The row width is
+/// outFeatures.dim(1), and `d` is not read.
+template <typename T, typename Index>
+struct SparseMaxPoolBackwardFunctor<tv::CPU, T, Index> {
+  void operator()(const tv::CPU &d, tv::TensorView<const T> outFeatures,
+                  tv::TensorView<const T> inFeatures,
+                  tv::TensorView<const T> dout, tv::TensorView<T> din,
+                  tv::TensorView<const Index> indices, int size) {
+    const int numPlanes = outFeatures.dim(1);
+    auto pooled = outFeatures.data();
+    auto src = inFeatures.data();
+    auto gradOut = dout.data();
+    auto gradIn = din.data();
+    auto inRows = indices.subview(0).data();
+    auto outRows = indices.subview(1).data();
+    for (int pair = 0; pair < size; ++pair) {
+      // Flat element offsets of the paired rows, held in Index.
+      const Index srcBase = inRows[pair] * numPlanes;
+      const Index dstBase = outRows[pair] * numPlanes;
+      for (int plane = 0; plane < numPlanes; ++plane) {
+        // Gradient goes only to inputs whose value equals the pooled output.
+        if (pooled[dstBase + plane] == src[srcBase + plane]) {
+          gradIn[srcBase + plane] += gradOut[dstBase + plane];
+        }
+      }
+    }
+  }
+};
+} // namespace functor
+
+// Explicitly instantiates the CPU max-pool forward and backward functors for
+// one (T, Index) pair. The use site supplies the final semicolon.
+#define DECLARE_CPU_SPECS_T_INDEX(T, Index)                                    \
+  template struct functor::SparseMaxPoolForwardFunctor<tv::CPU, T, Index>;     \
+  template struct functor::SparseMaxPoolBackwardFunctor<tv::CPU, T, Index>
+
+// Instantiates both functors for the int and long index types.
+#define DECLARE_CPU_SPECS(T)                                                   \
+  DECLARE_CPU_SPECS_T_INDEX(T, int);                                           \
+  DECLARE_CPU_SPECS_T_INDEX(T, long)
+
+DECLARE_CPU_SPECS(float);
+DECLARE_CPU_SPECS(double);
+DECLARE_CPU_SPECS(at::Half);
+
+// The helper macros are local to this translation unit.
+#undef DECLARE_CPU_SPECS
+#undef DECLARE_CPU_SPECS_T_INDEX
+
+} // namespace spconv