working on tensor core test

01ed382c · yan.yan · 3517290c · 3517290c · 3517290c · 3517290c
Commit 01ed382c authored Oct 18, 2021 by yan.yan
20 changed files
--- a/src/spconv/all.cc
+++ b/src/spconv/all.cc
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <spconv/fused_spconv_ops.h>
-#include <spconv/nms_ops.h>
-#include <spconv/pillar_scatter_ops.h>
-#include <spconv/point2voxel_ops.h>
-#include <spconv/pool_ops.h>
-#include <spconv/spconv_ops.h>
-#include <torch/script.h>
-
-static auto registry =
-    torch::RegisterOperators()
-        .op("spconv::points_to_voxel", &spconv::pointsToVoxel)
-        .op("spconv::get_indice_pairs", &spconv::getIndicePairs)
-        .op("spconv::indice_conv", &spconv::indiceConv)
-        .op("spconv::indice_conv_backward", &spconv::indiceConvBackward)
-        .op("spconv::fused_indice_conv_bn", &spconv::fusedIndiceConvBatchNorm)
-        .op("spconv::indice_maxpool", &spconv::indiceMaxPool)
-        .op("spconv::indice_maxpool_backward", &spconv::indiceMaxPoolBackward)
-        .op("spconv::nms", &spconv::nonMaxSuppression<float>)
-        .op("spconv::pillar_scatter_float", &spconv::pointPillarScatter<float>)
-        .op("spconv::pillar_scatter_half",
-            &spconv::pointPillarScatter<at::Half>);
--- a/src/spconv/cublas_gemm.cc
+++ b/src/spconv/cublas_gemm.cc
-#include <ATen/ATen.h>
-#include <spconv/cublas_gemm.h>
-
-namespace spconv {
-template <>
-cublasStatus_t cublasTgemm(cublasHandle_t handle, cublasOperation_t transa,
-                           cublasOperation_t transb, int m, int n, int k,
-                           const float *alpha, const float *A, int lda,
-                           const float *B, int ldb, const float *beta, float *C,
-                           int ldc) {
-  return cublasSgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb,
-                     beta, C, ldc);
-}
-
-template <>
-cublasStatus_t cublasTgemm(cublasHandle_t handle, cublasOperation_t transa,
-                           cublasOperation_t transb, int m, int n, int k,
-                           const __half *alpha, const __half *A, int lda,
-                           const __half *B, int ldb, const __half *beta,
-                           __half *C, int ldc) {
-  return cublasHgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb,
-                     beta, C, ldc);
-}
-
-template <>
-cublasStatus_t cublasTgemm(cublasHandle_t handle, cublasOperation_t transa,
-                           cublasOperation_t transb, int m, int n, int k,
-                           const at::Half *alpha, const at::Half *A, int lda,
-                           const at::Half *B, int ldb, const at::Half *beta,
-                           at::Half *C, int ldc) {
-  return cublasHgemm(handle, transa, transb, m, n, k,
-                     reinterpret_cast<const __half *>(alpha),
-                     reinterpret_cast<const __half *>(A), lda,
-                     reinterpret_cast<const __half *>(B), ldb,
-                     reinterpret_cast<const __half *>(beta),
-                     reinterpret_cast<__half *>(C), ldc);
-}
-
-template <>
-cublasStatus_t cublasTgemm(cublasHandle_t handle, cublasOperation_t transa,
-                           cublasOperation_t transb, int m, int n, int k,
-                           const double *alpha, const double *A, int lda,
-                           const double *B, int ldb, const double *beta,
-                           double *C, int ldc) {
-  return cublasDgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb,
-                     beta, C, ldc);
-}
-
-template <> inline __half constant_scalar(float data) {
-  return __float2half(data);
-}
-
-} // namespace spconv
\ No newline at end of file
--- a/src/spconv/fused_conv.cu
+++ b/src/spconv/fused_conv.cu
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <ATen/ATen.h>
-#include <spconv/fused_conv.cu.h>
-#include <spconv/fused_conv.h>
-#include <spconv/minkowski.cu.h>
-#include <tensorview/torch_utils.h>
-
-namespace spconv {
-void fused_conv_cuda(torch::Tensor output, torch::Tensor features,
-                     torch::Tensor filters, torch::Tensor indicesIn,
-                     torch::Tensor indicesOut, int nHot) {
-  auto dtype = output.scalar_type();
-  auto input_nPlanes = features.size(1);
-  auto output_nPlanes = output.size(1);
-
-  auto stream = at::cuda::getCurrentCUDAStream();
-
-  tv::dispatch_torch<float, at::Half>(dtype, [&](auto I) {
-    using T = decltype(I);
-    dConvolution_forward2(stream, features.data_ptr<T>(), output.data_ptr<T>(),
-                          filters.data_ptr<T>(), indicesIn.data_ptr<int32_t>(),
-                          indicesOut.data_ptr<int32_t>(), nHot, input_nPlanes,
-                          input_nPlanes, output_nPlanes, output_nPlanes, 1);
-  });
-}
-
-void fused_conv_backward_cuda(torch::Tensor features, torch::Tensor din,
-                              torch::Tensor dout, torch::Tensor filters,
-                              torch::Tensor dfilters, torch::Tensor indicesIn,
-                              torch::Tensor indicesOut, int nHot) {
-  auto dtype = features.scalar_type();
-  auto input_nPlanes = features.size(1);
-  auto output_nPlanes = dout.size(1);
-  auto stream = at::cuda::getCurrentCUDAStream();
-  tv::dispatch_torch<float>(dtype, [&](auto I) {
-    using T = decltype(I);
-    dConvolution_backward_dW2(
-        stream, features.data_ptr<T>(), din.data_ptr<T>(), dout.data_ptr<T>(),
-        filters.data_ptr<T>(), dfilters.data_ptr<T>(),
-        indicesIn.data_ptr<int32_t>(), indicesOut.data_ptr<int32_t>(), nHot,
-        input_nPlanes, input_nPlanes, output_nPlanes, output_nPlanes, 1);
-  });
-}
-
-void fused_conv_cuda_minkowski(torch::Tensor output, torch::Tensor features,
-                               torch::Tensor filters, torch::Tensor indicesIn,
-                               torch::Tensor indicesOut, int nHot) {
-  auto dtype = output.scalar_type();
-  auto in_nchannel = features.size(1);
-  auto out_nchannel = output.size(1);
-  int shared_mem_size = -1;
-  if ((in_nchannel > 16 && out_nchannel > 16 &&
-       in_nchannel * out_nchannel >= 512) ||
-      (in_nchannel > 24 && out_nchannel > 24))
-    shared_mem_size = 32;
-  else if (in_nchannel % 24 == 0 && out_nchannel % 24 == 0)
-    shared_mem_size = 24;
-  else if ((in_nchannel > 8 && out_nchannel > 8) ||
-           (in_nchannel % 16 == 0 && out_nchannel % 16 == 0))
-    shared_mem_size = 16;
-  else
-    shared_mem_size = 8;
-  constexpr int MAX_GRID = 65535;
-  auto stream = at::cuda::getCurrentCUDAStream();
-  using shmem_sizes_t = tv::mp_list_c<int, 32, 24, 16, 8>;
-  int num_grid = (nHot + shared_mem_size - 1) / shared_mem_size;
-  int num_div = (num_grid + MAX_GRID - 1) / MAX_GRID;
-  int step = (nHot + num_div - 1) / num_div;
-  dim3 threads(shared_mem_size, shared_mem_size);
-
-  tv::dispatch_torch<float>(dtype, [&](auto I) {
-    using T = decltype(I);
-    tv::DispatchInt<shmem_sizes_t>()(shared_mem_size, [&](auto ShSizeValue) {
-      constexpr int ShmemSize = decltype(ShSizeValue)::value;
-      for (int s = 0; s < num_div; s++) {
-        int remainder = nHot - step * s;
-        int curr_num_active = remainder < step ? remainder : step;
-        dim3 grid((out_nchannel + threads.x - 1) / threads.x,
-                  (curr_num_active + threads.y - 1) / threads.y);
-        matmul<T, int32_t, ShmemSize><<<grid, threads, 0, stream>>>(
-            features.data_ptr<T>(), in_nchannel, curr_num_active,
-            filters.data_ptr<T>(), out_nchannel, in_nchannel,
-            output.data_ptr<T>(), indicesIn.data_ptr<int32_t>(),
-            indicesOut.data_ptr<int32_t>());
-      }
-    });
-  });
-}
-void fused_conv_backward_cuda_minkowski(torch::Tensor features,
-                                        torch::Tensor din, torch::Tensor dout,
-                                        torch::Tensor filters,
-                                        torch::Tensor dfilters,
-                                        torch::Tensor indicesIn,
-                                        torch::Tensor indicesOut, int nHot) {
-  auto dtype = features.scalar_type();
-  auto in_nchannel = features.size(1);
-  auto out_nchannel = dout.size(1);
-  int shared_mem_size = -1;
-  if ((in_nchannel > 16 && out_nchannel > 16 &&
-       in_nchannel * out_nchannel >= 512) ||
-      (in_nchannel % 32 == 0 && out_nchannel % 32 == 0))
-    shared_mem_size = 32;
-  else if (in_nchannel % 24 == 0 && out_nchannel % 24 == 0)
-    shared_mem_size = 24;
-  else if ((in_nchannel > 8 && out_nchannel > 8) ||
-           (in_nchannel % 16 == 0 && out_nchannel % 16 == 0))
-    shared_mem_size = 16;
-  else
-    shared_mem_size = 8;
-  dim3 threads(shared_mem_size, shared_mem_size);
-
-  constexpr int MAX_GRID = 65535;
-  auto stream = at::cuda::getCurrentCUDAStream();
-  using shmem_sizes_t = tv::mp_list_c<int, 32, 24, 16, 8>;
-
-  int num_grid = (nHot + shared_mem_size - 1) / shared_mem_size;
-  int num_div = (num_grid + MAX_GRID - 1) / MAX_GRID;
-  int step = (nHot + num_div - 1) / num_div;
-
-  tv::dispatch_torch<float>(dtype, [&](auto I) {
-    using T = decltype(I);
-    tv::DispatchInt<shmem_sizes_t>()(shared_mem_size, [&](auto ShSizeValue) {
-      constexpr int ShmemSize = decltype(ShSizeValue)::value;
-      for (int s = 0; s < num_div; s++) {
-        int remainder = nHot - step * s;
-        int curr_num_active = remainder < step ? remainder : step;
-        dim3 grid((in_nchannel + threads.x - 1) / threads.x,
-                  (curr_num_active + threads.y - 1) / threads.y);
-        matmul2<T, int32_t, ShmemSize><<<grid, threads, 0, stream>>>(
-            dout.data_ptr<T>(), out_nchannel, curr_num_active, // A
-            filters.data_ptr<T>(), out_nchannel,
-            in_nchannel,                                          // B
-            features.data_ptr<T>(), in_nchannel, curr_num_active, // D
-            din.data_ptr<T>(),                                    // C
-            dfilters.data_ptr<T>(),                               // E
-            indicesIn.data_ptr<int32_t>(), indicesOut.data_ptr<int32_t>());
-      }
-    });
-  });
-}
-
-} // namespace spconv
\ No newline at end of file
--- a/src/spconv/indice.cc
+++ b/src/spconv/indice.cc
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <ATen/Parallel.h>
-#include <spconv/geometry.h>
-#include <spconv/indice.h>
-#include <spconv/spconv_ops.h>
-#include <tensorview/tensor.h>
-#include <torch/script.h>
-
-namespace spconv {
-
-template <typename Index, typename IndexGrid, unsigned NDim>
-Index getIndicePairsConv(tv::TensorView<const Index> indicesIn,
-                         tv::TensorView<Index> indicesOut,
-                         tv::TensorView<IndexGrid> gridsOut,
-                         tv::TensorView<Index> indicePairs,
-                         tv::TensorView<Index> indiceNum,
-                         const Index *kernelSize, const Index *stride,
-                         const Index *padding, const Index *dilation,
-                         const Index *outSpatialShape) {
-  // indicesOut: num_active * kernelVolume * (NDim + 1)
-  Index numAct = 0;
-  auto numActIn = indicesIn.dim(0);
-  Index batchIdx = 0;
-  Index spatialVolume = 1;
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    spatialVolume *= outSpatialShape[i];
-  }
-  Index kernelVolume = 1;
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    kernelVolume *= kernelSize[i];
-  }
-  Index numValidPoints = 0;
-  std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
-  Index *validPoints = validPoints_.data();
-  Index *pointPtr = nullptr;
-  Index hashval;
-  tsl::robin_map<Index, Index> hash;
-  for (int j = 0; j < numActIn; ++j) {
-    batchIdx = indicesIn(j, 0);
-    numValidPoints = getValidOutPos<Index, NDim>(
-        indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,
-        dilation, outSpatialShape, validPoints);
-    for (Index i = 0; i < numValidPoints; ++i) {
-      pointPtr = validPoints + i * (NDim + 1);
-      auto offset = pointPtr[NDim];
-      auto index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
-                   spatialVolume * batchIdx;
-      auto iter = hash.find(index);
-      if (iter == hash.end()) {
-        for (unsigned k = 1; k < NDim + 1; ++k) {
-          indicesOut(numAct, k) = pointPtr[k - 1];
-        }
-        indicesOut(numAct, 0) = batchIdx;
-        hashval = numAct++;
-        hash[index] = hashval;
-      } else {
-        hashval = iter->second;
-      }
-      // indicePairs: [K, 2, L]
-      indicePairs(0, offset, indiceNum[offset]) = j;
-      indicePairs(1, offset, indiceNum[offset]++) = hashval;
-    }
-  }
-  return numAct;
-}
-
-template <typename Index, typename IndexGrid, unsigned NDim>
-Index getIndicePairsDeConv(tv::TensorView<const Index> indicesIn,
-                           tv::TensorView<Index> indicesOut,
-                           tv::TensorView<IndexGrid> gridsOut,
-                           tv::TensorView<Index> indicePairs,
-                           tv::TensorView<Index> indiceNum,
-                           const Index *kernelSize, const Index *stride,
-                           const Index *padding, const Index *dilation,
-                           const Index *outSpatialShape) {
-  Index numAct = 0;
-  auto numActIn = indicesIn.dim(0);
-  Index batchIdx = 0;
-  Index spatialVolume = 1;
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    spatialVolume *= outSpatialShape[i];
-  }
-  Index kernelVolume = 1;
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    kernelVolume *= kernelSize[i];
-  }
-  Index numValidPoints = 0;
-  std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
-  Index *validPoints = validPoints_.data();
-  Index *pointPtr = nullptr;
-  Index hashval;
-  tsl::robin_map<Index, Index> hash;
-  for (int j = 0; j < numActIn; ++j) {
-    batchIdx = indicesIn(j, 0);
-    numValidPoints = getValidOutPosTranspose<Index, NDim>(
-        indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,
-        dilation, outSpatialShape, validPoints);
-    for (Index i = 0; i < numValidPoints; ++i) {
-      pointPtr = validPoints + i * (NDim + 1);
-      auto offset = pointPtr[NDim];
-      auto index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
-                   spatialVolume * batchIdx;
-
-      auto iter = hash.find(index);
-      if (iter == hash.end()) {
-        for (unsigned k = 1; k < NDim + 1; ++k) {
-          indicesOut(numAct, k) = pointPtr[k - 1];
-        }
-        indicesOut(numAct, 0) = batchIdx;
-        hashval = numAct++;
-        hash[index] = hashval;
-      } else {
-        hashval = iter->second;
-      }
-      // indicePairs: [K, 2, L]
-      indicePairs(0, offset, indiceNum[offset]) = j;
-      indicePairs(1, offset, indiceNum[offset]++) = hashval;
-    }
-  }
-  return numAct;
-}
-
-#ifndef TV_WINDOWS
-template <typename Index, typename IndexGrid, unsigned NDim>
-Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
-                         tv::TensorView<IndexGrid> gridsOut,
-                         tv::TensorView<Index> indicePairs,
-                         tv::TensorView<Index> indiceNum,
-                         const Index *const kernelSize,
-                         const Index *const stride, const Index *const padding,
-                         const Index *dilation,
-                         const Index *const outSpatialShape) {
-  Index numAct = 0;
-  auto numActIn = indicesIn.dim(0);
-  Index batchIdx = 0;
-  Index spatialVolume = 1;
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    spatialVolume *= outSpatialShape[i];
-  }
-  Index kernelVolume = 1;
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    kernelVolume *= kernelSize[i];
-  }
-  tsl::robin_map<Index, Index> hash;
-  for (int j = 0; j < numActIn; ++j) {
-    Index index = 0;
-    index = tv::rowArrayIdx<Index, NDim>(indicesIn.data() + j * (NDim + 1) + 1,
-                                         outSpatialShape) +
-            spatialVolume * indicesIn(j, 0);
-    hash[index] = j;
-  }
-
-  at::parallel_for(0, numActIn, 0, [&](int64_t begin, int64_t end) {
-    Index index = 0;
-    Index numValidPoints = 0;
-    std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
-    Index *validPoints = validPoints_.data();
-    Index *pointPtr = nullptr;
-    Index oldOffset = 0;
-    for (int j = begin; j < end; ++j) {
-      numValidPoints = getValidOutPos<Index, NDim>(
-          indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,
-          dilation, outSpatialShape, validPoints);
-      for (Index i = 0; i < numValidPoints; ++i) {
-        pointPtr = validPoints + i * (NDim + 1);
-        auto offset = pointPtr[NDim];
-        index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
-                spatialVolume * indicesIn(j, 0);
-        auto iter = hash.find(index);
-        if (iter != hash.end()) {
-#pragma omp atomic capture
-          oldOffset = indiceNum[offset]++;
-          indicePairs(0, offset, oldOffset) = j;
-          indicePairs(1, offset, oldOffset) = iter->second;
-        }
-      }
-    }
-  });
-  return numActIn;
-}
-#else
-template <typename Index, typename IndexGrid, unsigned NDim>
-Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
-                         tv::TensorView<IndexGrid> gridsOut,
-                         tv::TensorView<Index> indicePairs,
-                         tv::TensorView<Index> indiceNum,
-                         const Index *const kernelSize,
-                         const Index *const stride, const Index *const padding,
-                         const Index *dilation,
-                         const Index *const outSpatialShape) {
-  Index numAct = 0;
-  auto numActIn = indicesIn.dim(0);
-  Index batchIdx = 0;
-  Index spatialVolume = 1;
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    spatialVolume *= outSpatialShape[i];
-  }
-  Index kernelVolume = 1;
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    kernelVolume *= kernelSize[i];
-  }
-  Index numValidPoints = 0;
-  // Index validPoints[kernelVolume * (NDim + 1)];
-  std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
-  Index *validPoints = validPoints_.data();
-  Index *pointPtr = nullptr;
-  tsl::robin_map<Index, Index> hash;
-  for (int j = 0; j < numActIn; ++j) {
-    Index index = 0;
-    index = tv::rowArrayIdx<Index, NDim>(indicesIn.data() + j * (NDim + 1) + 1,
-                                         outSpatialShape) +
-            spatialVolume * indicesIn(j, 0);
-    hash[index] = j;
-  }
-  Index index = 0;
-  for (int j = 0; j < numActIn; ++j) {
-    numValidPoints = getValidOutPos<Index, NDim>(
-        indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,
-        dilation, outSpatialShape, validPoints);
-    for (Index i = 0; i < numValidPoints; ++i) {
-      pointPtr = validPoints + i * (NDim + 1);
-      auto offset = pointPtr[NDim];
-      index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
-              spatialVolume * indicesIn(j, 0);
-      auto iter = hash.find(index);
-      if (iter != hash.end()) {
-        indicePairs(0, offset, indiceNum[offset]) = j;
-        indicePairs(1, offset, indiceNum[offset]++) = iter->second;
-      }
-    }
-  }
-  return numActIn;
-}
-#endif
-
-int create_conv_indice_pair_cpu(
-    torch::Tensor indicesIn, torch::Tensor indicesOut, torch::Tensor gridsOut,
-    torch::Tensor indicePairs, torch::Tensor indiceNum,
-    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
-    std::vector<int64_t> padding, std::vector<int64_t> dilation,
-    std::vector<int64_t> outSpatialShape, bool transpose, bool resetGrid,
-    bool useHash) {
-  auto ndim = outSpatialShape.size();
-  auto numActIn = indicesIn.size(0);
-  int batchSize = gridsOut.size(0);
-  auto kernelVolume = indiceNum.size(0);
-  if (numActIn == 0)
-    return 0;
-  tv::dispatch_torch<int32_t, int64_t>(indicesIn.scalar_type(), [&](auto V) {
-    using Index = decltype(V);
-    using IndexGrid = int32_t;
-    tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
-      constexpr int NDim = decltype(I)::value;
-      tv::SimpleVector<Index, NDim> ks(kernelSize.begin(), kernelSize.end());
-      tv::SimpleVector<Index, NDim> st(stride.begin(), stride.end());
-      tv::SimpleVector<Index, NDim> pa(padding.begin(), padding.end());
-      tv::SimpleVector<Index, NDim> di(dilation.begin(), dilation.end());
-      tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(),
-                                       outSpatialShape.end());
-      if (transpose)
-        numActIn = getIndicePairsDeConv<Index, IndexGrid, NDim>(
-            tv::torch2tv<Index>(indicesIn), tv::torch2tv<Index>(indicesOut),
-            tv::torch2tv<IndexGrid>(gridsOut), tv::torch2tv<Index>(indicePairs),
-            tv::torch2tv<Index>(indiceNum), ks.data(), st.data(), pa.data(),
-            di.data(), ou.data());
-      else
-        numActIn = getIndicePairsConv<Index, IndexGrid, NDim>(
-            tv::torch2tv<Index>(indicesIn), tv::torch2tv<Index>(indicesOut),
-            tv::torch2tv<IndexGrid>(gridsOut), tv::torch2tv<Index>(indicePairs),
-            tv::torch2tv<Index>(indiceNum), ks.data(), st.data(), pa.data(),
-            di.data(), ou.data());
-    });
-  });
-  return numActIn;
-}
-
-int create_submconv_indice_pair_cpu(
-    torch::Tensor indicesIn, torch::Tensor gridsOut, torch::Tensor indicePairs,
-    torch::Tensor indiceNum, std::vector<int64_t> kernelSize,
-    std::vector<int64_t> stride, std::vector<int64_t> padding,
-    std::vector<int64_t> dilation, std::vector<int64_t> outSpatialShape,
-    bool transpose, bool resetGrid, bool useHash) {
-  auto ndim = outSpatialShape.size();
-  auto numActIn = indicesIn.size(0);
-  int batchSize = gridsOut.size(0);
-  auto kernelVolume = indiceNum.size(0);
-  if (numActIn == 0)
-    return 0;
-  tv::dispatch_torch<int32_t, int64_t>(indicesIn.scalar_type(), [&](auto V) {
-    using Index = decltype(V);
-    using IndexGrid = int32_t;
-    tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
-      constexpr int NDim = decltype(I)::value;
-      tv::SimpleVector<Index, NDim> ks(kernelSize.begin(), kernelSize.end());
-      tv::SimpleVector<Index, NDim> st(stride.begin(), stride.end());
-      tv::SimpleVector<Index, NDim> pa(padding.begin(), padding.end());
-      tv::SimpleVector<Index, NDim> di(dilation.begin(), dilation.end());
-      tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(),
-                                       outSpatialShape.end());
-      numActIn = getIndicePairsSubM<Index, IndexGrid, NDim>(
-          tv::torch2tv<Index>(indicesIn), tv::torch2tv<IndexGrid>(gridsOut),
-          tv::torch2tv<Index>(indicePairs), tv::torch2tv<Index>(indiceNum),
-          ks.data(), st.data(), pa.data(), di.data(), ou.data());
-    });
-  });
-  return numActIn;
-}
-
-} // namespace spconv
--- a/src/spconv/indice.cu
+++ b/src/spconv/indice.cu
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <ATen/ATen.h>
-#include <boost/mp11.hpp>
-#include <chrono>
-#include <cuhash/hash_table.h>
-#include <limits>
-#include <spconv/indice.cu.h>
-#include <spconv/indice.h>
-#include <tensorview/cuda_utils.h>
-#include <tensorview/mp_helper.h>
-#include <tensorview/tensor.h>
-#include <tensorview/tensorview.h>
-#include <tensorview/torch_utils.h>
-#include <thrust/copy.h>
-#include <thrust/execution_policy.h>
-#include <type_traits>
-#include <utility/timer.h>
-namespace spconv {
-
-using max_kernel_vol_t = tv::mp_list_c<int, 9, 16, 27, 32, 128, 256, 4096>;
-
-int create_conv_indice_pair_p1_cuda(
-    torch::Tensor indicesIn, torch::Tensor indicePairs, torch::Tensor indiceNum,
-    torch::Tensor indicePairUnique, std::vector<int64_t> kernelSize,
-    std::vector<int64_t> stride, std::vector<int64_t> padding,
-    std::vector<int64_t> dilation, std::vector<int64_t> outSpatialShape,
-    bool transpose) {
-  auto stream = at::cuda::getCurrentCUDAStream();
-  auto ndim = kernelSize.size();
-  auto numActIn = indicesIn.size(0);
-  auto kernelVolume = indiceNum.size(0);
-  // auto timer = spconv::CudaContextTimer<>();
-
-  if (numActIn == 0)
-    return 0;
-  tv::dispatch_torch<int32_t>(indicesIn.scalar_type(), [&](auto IndexValue) {
-    using Index = decltype(IndexValue);
-    using IndexGrid = int32_t;
-    tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
-      constexpr int NDim = decltype(I)::value;
-      tv::SimpleVector<Index, NDim> ks(kernelSize.begin(), kernelSize.end());
-      tv::SimpleVector<Index, NDim> st(stride.begin(), stride.end());
-      tv::SimpleVector<Index, NDim> pa(padding.begin(), padding.end());
-      tv::SimpleVector<Index, NDim> di(dilation.begin(), dilation.end());
-      tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(),
-                                       outSpatialShape.end());
-      tv::DispatchInt<max_kernel_vol_t>()(
-          kernelVolume, std::less_equal<int>(), [&](auto I2) {
-            constexpr int MaxKernelVolume = decltype(I2)::value;
-            tv::dispatch_int<0, 1>(int(transpose), [&](auto I) {
-              constexpr bool UseDeconv = decltype(I)::value;
-              prepareIndicePairsKernel<Index, NDim, UseDeconv, MaxKernelVolume>
-                  <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS,
-                     0, stream>>>(tv::torch2tv<Index>(indicesIn),
-                                  tv::torch2tv<Index>(indicePairs),
-                                  tv::torch2tv<Index>(indiceNum),
-                                  tv::torch2tv<Index>(indicePairUnique), ks, st,
-                                  pa, di, ou);
-              TV_CHECK_CUDA_ERR_V2("prepareIndicePairsKernel failed");
-            });
-#ifdef TV_LOG_KERNEL_INFO
-            cudaFuncAttributes attr;
-            checkCudaErrors(cudaFuncGetAttributes(
-                &attr,
-                prepareDeConvIndicePairsKernel<Index, NDim, MaxKernelVolume>));
-            tv::ssprint("prepareIndicePairsKernel<", tv::type_s<Index>, NDim,
-                        MaxKernelVolume, ">", attr.numRegs);
-#endif
-          });
-    });
-  });
-  return 1;
-}
-
-int create_conv_indice_pair_p2_cuda(
-    torch::Tensor indicesIn, torch::Tensor indicesOut, torch::Tensor gridsOut,
-    torch::Tensor indicePairs, torch::Tensor indiceNum,
-    torch::Tensor indicePairUnique, std::vector<int64_t> outSpatialShape,
-    bool transpose, bool resetGrid, bool useHash) {
-  auto stream = at::cuda::getCurrentCUDAStream();
-  auto ndim = outSpatialShape.size();
-  auto numActIn = indicesIn.size(0);
-  int batchSize = gridsOut.size(0);
-  int numAct = indicePairUnique.size(0) - 1;
-
-  auto kernelVolume = indiceNum.size(0);
-  if (numActIn == 0)
-    return 0;
-  tv::dispatch_torch<int32_t>(indicesIn.scalar_type(), [&](auto IndexValue) {
-    using Index = decltype(IndexValue);
-    using IndexGrid = int32_t;
-    tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
-      constexpr int NDim = decltype(I)::value;
-      using IndexGrid = int32_t;
-      tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(),
-                                       outSpatialShape.end());
-      if (useHash) {
-        auto table = cuhash::HashTable();
-        // std::cout << "create " << numAct << " size table..." << std::endl;
-        table.Initialize(numAct, 2.0, 4);
-        unsigned *d_values = nullptr;
-        cudaMalloc((void **)&d_values, sizeof(unsigned) * numAct);
-        TV_CHECK_CUDA_ERR_V2("cudaMalloc failed");
-        arangeKernel<unsigned>
-            <<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,
-               stream>>>(d_values, numAct);
-        TV_CHECK_CUDA_ERR_V2("arangeKernel failed");
-        bool res = table.Build(
-            numAct,
-            reinterpret_cast<unsigned *>(indicePairUnique.data_ptr<Index>()),
-            d_values);
-        cudaFree(d_values);
-        TV_CHECK_CUDA_ERR_V2("cudaFree failed");
-        if (!res) {
-          return -1; // use -1 to tell outside use CPU implementation
-        }
-        assignIndiceOutKernel<Index, NDim>
-            <<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,
-               stream>>>(tv::torch2tv<Index>(indicesOut), numAct,
-                         tv::torch2tv<Index>(indicePairUnique), ou, batchSize);
-        TV_CHECK_CUDA_ERR_V2("assignIndiceOutKernel failed");
-
-        auto tableSize = table.get_table_size();
-        auto tableData = table.data();
-        auto constants = table.get_constants_4();
-        auto stash_constants = table.get_stash_constants();
-        auto stash_count = table.get_stash_count();
-        assignIndicePairsHashKernel<Index, NDim>
-            <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
-               stream>>>(tv::torch2tv<Index>(indicesOut), numActIn,
-                         tv::torch2tv<Index>(indicePairs),
-                         tv::torch2tv<Index>(indicePairUnique), tableSize,
-                         tableData, constants, stash_constants, stash_count);
-        TV_CHECK_CUDA_ERR_V2("assignIndicePairsHashKernel failed");
-
-      } else {
-        assignGridAndIndiceOutKernel<Index, IndexGrid, NDim>
-            <<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,
-               stream>>>(tv::torch2tv<Index>(indicesOut),
-                         tv::torch2tv<IndexGrid>(gridsOut), numAct,
-                         tv::torch2tv<Index>(indicePairs),
-                         tv::torch2tv<Index>(indicePairUnique), ou, batchSize);
-        TV_CHECK_CUDA_ERR_V2("assignGridAndIndiceOutKernel failed");
-        assignIndicePairsKernel<Index, IndexGrid, NDim>
-            <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
-               stream>>>(tv::torch2tv<Index>(indicesOut),
-                         tv::torch2tv<IndexGrid>(gridsOut), numActIn,
-                         tv::torch2tv<Index>(indicePairs),
-                         tv::torch2tv<Index>(indicePairUnique), ou);
-        TV_CHECK_CUDA_ERR_V2("assignIndicePairsKernel failed");
-#ifdef TV_LOG_KERNEL_INFO
-        cudaFuncAttributes attr;
-        checkCudaErrors(cudaFuncGetAttributes(
-            &attr, assignGridAndIndiceOutKernel<Index, IndexGrid, NDim>));
-        tv::ssprint("assignGridAndIndiceOutKernel<", tv::type_s<Index>, NDim,
-                    ">", attr.numRegs);
-        cudaFuncAttributes attr2;
-        checkCudaErrors(cudaFuncGetAttributes(
-            &attr2, assignIndicePairsKernel<Index, IndexGrid, NDim>));
-        tv::ssprint("assignIndicePairsKernel<", tv::type_s<Index>, NDim, ">",
-                    attr2.numRegs);
-#endif
-      }
-
-      if (resetGrid && (!useHash)) {
-        resetGridKernel<Index, IndexGrid, NDim>
-            <<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,
-               stream>>>(indicePairUnique.data_ptr<Index>(),
-                         tv::torch2tv<IndexGrid>(gridsOut), numAct);
-        TV_CHECK_CUDA_ERR_V2("resetGridKernel failed");
-      }
-    });
-  });
-  return numAct;
-}
-
-int create_submconv_indice_pair_cuda(
-    torch::Tensor indicesIn, torch::Tensor gridsOut, torch::Tensor indicePairs,
-    torch::Tensor indiceNum, std::vector<int64_t> kernelSize,
-    std::vector<int64_t> stride, std::vector<int64_t> padding,
-    std::vector<int64_t> dilation, std::vector<int64_t> outSpatialShape,
-    bool transpose, bool resetGrid, bool useHash) {
-  auto stream = at::cuda::getCurrentCUDAStream();
-  auto ndim = outSpatialShape.size();
-  auto numActIn = indicesIn.size(0);
-  int batchSize = gridsOut.size(0);
-
-  auto kernelVolume = indiceNum.size(0);
-  if (numActIn == 0)
-    return 0;
-  tv::dispatch_torch<int32_t>(indicesIn.scalar_type(), [&](auto IndexValue) {
-    using Index = decltype(IndexValue);
-    using IndexGrid = int32_t;
-    tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
-      constexpr int NDim = decltype(I)::value;
-      tv::SimpleVector<Index, NDim> ks(kernelSize.begin(), kernelSize.end());
-      tv::SimpleVector<Index, NDim> st(stride.begin(), stride.end());
-      tv::SimpleVector<Index, NDim> pa(padding.begin(), padding.end());
-      tv::SimpleVector<Index, NDim> di(dilation.begin(), dilation.end());
-      tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(),
-                                       outSpatialShape.end());
-      Index spatialVolume = 1;
-      for (int i = 0; i < NDim; ++i) {
-        spatialVolume *= outSpatialShape[i];
-      }
-      auto dispatcher = tv::DispatchIntNoexcept<tv::mp_list_c<int, 1, 3, 5>>();
-      namespace mp11 = boost::mp11;
-
-      using kernel2_candidates_t =
-          mp11::mp_product<tv::mp_list, tv::mp_list_c<int, 1, 3, 5>,
-                           tv::mp_list_c<int, 1, 3, 5>>;
-      using kernel3_candidates_t =
-          mp11::mp_product<tv::mp_list, tv::mp_list_c<int, 1, 3, 5>,
-                           tv::mp_list_c<int, 1, 3, 5>,
-                           tv::mp_list_c<int, 1, 3, 5>>;
-      using kernel3_candidates_final_t =
-          mp11::mp_push_back<kernel3_candidates_t>;
-      auto dispatcher2 = tv::DispatchContainerNoexcept<kernel2_candidates_t>();
-      auto dispatcher3 =
-          tv::DispatchContainerNoexcept<kernel3_candidates_final_t>();
-
-      if (useHash) {
-        auto table = cuhash::HashTable();
-        // std::cout << "create " << numAct << " size table..." << std::endl;
-        table.Initialize(numActIn, 2.0, 4);
-        unsigned *d_keyvalues = nullptr;
-        cudaMalloc((void **)&d_keyvalues, sizeof(unsigned) * numActIn * 2);
-        unsigned *d_values = d_keyvalues + numActIn;
-        TV_CHECK_CUDA_ERR_V2("cudaMalloc failed");
-        prepareSubMHashKernel<Index, NDim>
-            <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
-               stream>>>(tv::torch2tv<Index>(indicesIn), d_keyvalues, d_values,
-                         ou);
-        TV_CHECK_CUDA_ERR_V2("prepareSubMHashKernel failed");
-        bool res =
-            table.Build(numActIn, reinterpret_cast<unsigned *>(d_keyvalues),
-                        reinterpret_cast<unsigned *>(d_values));
-        cudaFree(d_keyvalues);
-        TV_CHECK_CUDA_ERR_V2("cudaFree failed");
-        if (!res) {
-          return -1; // use -1 to tell outside use CPU implementation
-        }
-        auto tableSize = table.get_table_size();
-        auto tableData = table.data();
-        auto constants = table.get_constants_4();
-        auto stash_constants = table.get_stash_constants();
-        auto stash_count = table.get_stash_count();
-        bool dilation_one = true;
-        for (int i = 0; i < NDim; ++i) {
-          dilation_one &= di[i] == 1;
-        }
-        auto found = false;
-        if (dilation_one && (NDim == 2 || NDim == 3)) {
-          auto indiceNumCpu = indiceNum.cpu();
-          if (NDim == 2) {
-            tv::SimpleVector<Index, 2> ou_(outSpatialShape.begin(),
-                                           outSpatialShape.end());
-            dispatcher2(kernelSize.begin(), kernelSize.end(), [&](auto K) {
-              constexpr int K0 = mp11::mp_at_c<decltype(K), 0>::value;
-              constexpr int K1 = mp11::mp_at_c<decltype(K), 1>::value;
-              found = true;
-              getSubMIndicePairsHashUnrollKernel2<Index, K0, K1>
-                  <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS,
-                     0, stream>>>(tv::torch2tv<Index>(indicesIn),
-                                  tv::torch2tv<Index>(indicePairs),
-                                  tv::torch2tv<Index>(indiceNum), ou_,
-                                  spatialVolume, tableSize, tableData,
-                                  constants, stash_constants, stash_count);
-            });
-          } else if (NDim == 3) {
-            tv::SimpleVector<Index, 3> ou_(outSpatialShape.begin(),
-                                           outSpatialShape.end());
-            dispatcher3(kernelSize.begin(), kernelSize.end(), [&](auto K) {
-              constexpr int K0 = mp11::mp_at_c<decltype(K), 0>::value;
-              constexpr int K1 = mp11::mp_at_c<decltype(K), 1>::value;
-              constexpr int K2 = mp11::mp_at_c<decltype(K), 2>::value;
-              found = true;
-              getSubMIndicePairsHashUnrollKernel3<Index, K0, K1, K2>
-                  <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS,
-                     0, stream>>>(tv::torch2tv<Index>(indicesIn),
-                                  tv::torch2tv<Index>(indicePairs),
-                                  tv::torch2tv<Index>(indiceNum), ou_,
-                                  spatialVolume, tableSize, tableData,
-                                  constants, stash_constants, stash_count);
-            });
-          }
-        }
-        if (!found) {
-          tv::DispatchInt<max_kernel_vol_t>()(
-              kernelVolume, std::less_equal<int>(), [&](auto I2) {
-                constexpr int MaxKernelVolume = decltype(I2)::value;
-                getSubMIndicePairsHashKernel<Index, NDim, MaxKernelVolume>
-                    <<<tv::cuda::getBlocks(numActIn),
-                       tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
-                        tv::torch2tv<Index>(indicesIn),
-                        tv::torch2tv<Index>(indicePairs),
-                        tv::torch2tv<Index>(indiceNum), ks, st, pa, di, ou,
-                        tableSize, tableData, constants, stash_constants,
-                        stash_count);
-                TV_CHECK_CUDA_ERR_V2("getSubMIndicePairsHashKernel failed");
-              });
-        }
-      } else {
-        // auto timer = spconv::CudaContextTimer<>();
-        prepareSubMGridKernel<Index, IndexGrid, NDim>
-            <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
-               stream>>>(tv::torch2tv<Index>(indicesIn),
-                         tv::torch2tv<IndexGrid>(gridsOut), ou, spatialVolume);
-        // tv::ssprint("prepareSubMGridKernel", timer.report() / 1000.0);
-        TV_CHECK_CUDA_ERR_V2("prepareSubMGridKernel failed");
-        // when dilation all one, we use a simple kernel to calc result
-        bool dilation_one = true;
-        for (int i = 0; i < NDim; ++i) {
-          dilation_one &= di[i] == 1;
-        }
-        auto found = false;
-        if (dilation_one && (NDim == 2 || NDim == 3)) {
-          auto indiceNumCpu = indiceNum.cpu();
-          if (NDim == 2) {
-            tv::SimpleVector<Index, 2> ou_(outSpatialShape.begin(),
-                                           outSpatialShape.end());
-            dispatcher2(kernelSize.begin(), kernelSize.end(), [&](auto K) {
-              constexpr int K0 = mp11::mp_at_c<decltype(K), 0>::value;
-              constexpr int K1 = mp11::mp_at_c<decltype(K), 1>::value;
-              found = true;
-              getSubMIndicePairsUnrollKernel2<Index, IndexGrid, K0, K1>
-                  <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS,
-                     0, stream>>>(tv::torch2tv<Index>(indicesIn),
-                                  tv::torch2tv<IndexGrid>(gridsOut),
-                                  tv::torch2tv<Index>(indicePairs),
-                                  tv::torch2tv<Index>(indiceNum), ou_,
-                                  spatialVolume);
-            });
-          } else if (NDim == 3) {
-            tv::SimpleVector<Index, 3> ou_(outSpatialShape.begin(),
-                                           outSpatialShape.end());
-            dispatcher3(kernelSize.begin(), kernelSize.end(), [&](auto K) {
-              constexpr int K0 = mp11::mp_at_c<decltype(K), 0>::value;
-              constexpr int K1 = mp11::mp_at_c<decltype(K), 1>::value;
-              constexpr int K2 = mp11::mp_at_c<decltype(K), 2>::value;
-              found = true;
-              getSubMIndicePairsUnrollKernel3<Index, IndexGrid, K0, K1, K2>
-                  <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS,
-                     0, stream>>>(tv::torch2tv<Index>(indicesIn),
-                                  tv::torch2tv<IndexGrid>(gridsOut),
-                                  tv::torch2tv<Index>(indicePairs),
-                                  tv::torch2tv<Index>(indiceNum), ou_,
-                                  spatialVolume);
-            });
-            /*
-            dispatcher(kernelSize[0], [&](auto K0C) {
-              dispatcher(kernelSize[1], [&](auto K1C) {
-                dispatcher(kernelSize[2], [&](auto K2C) {
-                  constexpr int K0 = decltype(K0C)::value;
-                  constexpr int K1 = decltype(K1C)::value;
-                  constexpr int K2 = decltype(K2C)::value;
-                  found = true;
-                  getSubMIndicePairsUnrollKernel3<Index, IndexGrid, K0, K1, K2>
-                      <<<tv::cuda::getBlocks(numActIn),
-                         tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
-                          tv::torch2tv<Index>(indicesIn),
-                          tv::torch2tv<IndexGrid>(gridsOut),
-                          tv::torch2tv<Index>(indicePairs),
-                          tv::torch2tv<Index>(indiceNum), ou_, spatialVolume);
-                });
-              });
-            });*/
-          }
-        }
-        if (!found) {
-          tv::DispatchInt<
-              max_kernel_vol_t>()(ndim, std::less_equal<int>(), [&](auto I2) {
-            constexpr int MaxKernelVolume = decltype(I2)::value;
-            getSubMIndicePairsKernel<Index, IndexGrid, NDim, MaxKernelVolume>
-                <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
-                   stream>>>(tv::torch2tv<Index>(indicesIn),
-                             tv::torch2tv<IndexGrid>(gridsOut),
-                             tv::torch2tv<Index>(indicePairs),
-                             tv::torch2tv<Index>(indiceNum), ks, st, pa, di,
-                             ou);
-            TV_CHECK_CUDA_ERR_V2("getSubMIndicePairsKernel failed");
-          });
-        }
-        // tv::ssprint("getSubMIndicePairsKernel", timer.report() / 1000.0);
-      }
-
-      if (resetGrid && (!useHash)) {
-        resetGridSubMKernel<Index, IndexGrid, NDim>
-            <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
-               stream>>>(indicesIn.data_ptr<Index>(),
-                         tv::torch2tv<IndexGrid>(gridsOut), ou, numActIn,
-                         spatialVolume);
-        TV_CHECK_CUDA_ERR_V2("resetGridKernel failed");
-      }
-    });
-  });
-  return numActIn;
-}
-
-} // namespace spconv
\ No newline at end of file
--- a/src/spconv/maxpool.cc
+++ b/src/spconv/maxpool.cc
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <spconv/maxpool.h>
-#include <torch/script.h>
-
-namespace spconv {
-
-using float_types_t = tv::mp_list<float, double, at::Half>;
-using int_types_t = tv::mp_list<int32_t, int64_t>;
-
-void maxpool_fwd_cpu(torch::Tensor outFeatures, torch::Tensor inFeatures,
-                     torch::Tensor indicesIn, torch::Tensor indicesOut,
-                     int size) {
-  if (size <= 0)
-    return;
-  int stride = inFeatures.size(1);
-  auto dtype = inFeatures.scalar_type();
-  auto int_dtype = indicesIn.scalar_type();
-  tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
-    using T = decltype(TValue);
-    tv::DispatchTorch<int_types_t>()(int_dtype, [&](auto IndexValue) {
-      using Index = decltype(IndexValue);
-      auto outFeaturesData = outFeatures.data_ptr<T>();
-      auto inFeaturesData = inFeatures.data_ptr<T>();
-      auto indicesInData = indicesIn.data_ptr<Index>();
-      auto indicesOutData = indicesOut.data_ptr<Index>();
-      Index idxi, idxo;
-      for (int row = 0; row < size; row++) {
-        idxi = indicesInData[row] * stride;
-        idxo = indicesOutData[row] * stride;
-        for (int plane = 0; plane < stride; ++plane)
-          if (outFeaturesData[idxo + plane] < inFeaturesData[idxi + plane])
-            outFeaturesData[idxo + plane] = inFeaturesData[idxi + plane];
-      }
-    });
-  });
-}
-
-void maxpool_bwd_cpu(torch::Tensor outFeatures, torch::Tensor inFeatures,
-                     torch::Tensor dout, torch::Tensor din,
-                     torch::Tensor indicesIn, torch::Tensor indicesOut,
-                     int size) {
-  if (size <= 0)
-    return;
-  int stride = inFeatures.size(1);
-  auto dtype = inFeatures.scalar_type();
-  auto int_dtype = indicesIn.scalar_type();
-  tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
-    using T = decltype(TValue);
-    tv::DispatchTorch<int_types_t>()(int_dtype, [&](auto IndexValue) {
-      using Index = decltype(IndexValue);
-      auto outFeaturesData = outFeatures.data_ptr<T>();
-      auto inFeaturesData = inFeatures.data_ptr<T>();
-      auto doutData = dout.data_ptr<T>();
-      auto dinData = din.data_ptr<T>();
-      auto indicesInData = indicesIn.data_ptr<Index>();
-      auto indicesOutData = indicesOut.data_ptr<Index>();
-      Index idxi, idxo;
-      for (int row = 0; row < size; row++) {
-        idxi = indicesInData[row] * stride;
-        idxo = indicesOutData[row] * stride;
-        for (int plane = 0; plane < stride; ++plane)
-          if (outFeaturesData[idxo + plane] == inFeaturesData[idxi + plane])
-            dinData[idxi + plane] += doutData[idxo + plane];
-      }
-    });
-  });
-}
-
-} // namespace spconv
--- a/src/spconv/maxpool.cu
+++ b/src/spconv/maxpool.cu
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <ATen/ATen.h>
-#include <chrono>
-#include <limits>
-#include <spconv/maxpool.h>
-#include <tensorview/cuda_utils.h>
-#include <tensorview/kernel_utils.h>
-#include <tensorview/mp_helper.h>
-#include <tensorview/tensorview.h>
-#include <type_traits>
-
-namespace spconv {
-template <typename T, typename Index, int NumTLP, int NumILP>
-__global__ void maxPoolFwdBlockKernel(T *outFeatures, const T *inFeatures,
-                                      const Index *indicesIn,
-                                      const Index *indicesOut, int numHot,
-                                      int numPlanes) {
-  T in, out;
-  int ILPStrideY[NumILP];
-  Index idxo, idxi;
-#pragma unroll
-  for (int ilp = 0; ilp < NumILP; ilp++)
-    ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
-  outFeatures += blockIdx.y * NumTLP;
-  inFeatures += blockIdx.y * NumTLP;
-  for (int ix = blockIdx.x * blockDim.x; ix < numHot;
-       ix += blockDim.x * gridDim.x) {
-    {
-#pragma unroll
-      for (int ilp = 0; ilp < NumILP; ++ilp) {
-        idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
-        idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
-        in = inFeatures[idxi];
-        out = outFeatures[idxo];
-        if (in > out) {
-          outFeatures[idxo] = in;
-        }
-      }
-    }
-  }
-}
-
-template <typename T, typename Index, int NumTLP, int NumILP>
-__global__ void
-maxPoolFwdGenericBlockKernel(T *outFeatures, const T *inFeatures,
-                             const Index *indicesIn, const Index *indicesOut,
-                             int numHot, int numPlanes) {
-  // see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
-  int ILPStrideX[NumILP];
-  Index RI[NumILP];
-  Index RO[NumILP];
-  T in, out;
-#pragma unroll
-  for (int ilp = 0; ilp < NumILP; ilp++)
-    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
-  for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
-#pragma unroll
-    for (int ilp = 0; ilp < NumILP; ilp++) {
-      RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
-      RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
-    }
-    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
-#pragma unroll
-      for (int ilp = 0; ilp < NumILP; ++ilp) {
-        in = inFeatures[RI[ilp] + iy];
-        out = outFeatures[RO[ilp] + iy];
-        if (in > out) {
-          outFeatures[RO[ilp] + iy] = in;
-        }
-      }
-    }
-  }
-}
-
-template <typename T, typename Index, int NumTLP, int NumILP, typename VecType>
-__global__ void maxPoolFwdVecBlockKernel(T *outFeatures, const T *inFeatures,
-                                         const Index *indicesIn,
-                                         const Index *indicesOut, int numHot,
-                                         int numPlanes) {
-  // see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
-  int ILPStrideY[NumILP];
-  constexpr int vecloadFactor = sizeof(VecType) / sizeof(T);
-  T bufi[vecloadFactor];
-  T bufo[vecloadFactor];
-  Index idxi, idxo;
-#pragma unroll
-  for (int ilp = 0; ilp < NumILP; ilp++)
-    ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
-  outFeatures += blockIdx.y * NumTLP;
-  inFeatures += blockIdx.y * NumTLP;
-  for (int ix = blockIdx.x * blockDim.x * vecloadFactor; ix < numHot;
-       ix += blockDim.x * gridDim.x * vecloadFactor) {
-#pragma unroll
-    for (int ilp = 0; ilp < NumILP; ++ilp) {
-      idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
-      idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
-      reinterpret_cast<VecType *>(bufo)[0] =
-          reinterpret_cast<VecType *>(outFeatures)[idxo];
-      reinterpret_cast<VecType *>(bufi)[0] =
-          reinterpret_cast<const VecType *>(inFeatures)[idxi];
-#pragma unroll
-      for (int i = 0; i < vecloadFactor; i++) {
-        if (bufi[i] > bufo[i]) {
-          bufo[i] = bufi[i];
-        }
-      }
-      reinterpret_cast<VecType *>(outFeatures)[idxo] =
-          reinterpret_cast<VecType *>(bufo)[0];
-    }
-  }
-}
-
-template <typename T, typename Index, int NumTLP, int NumILP>
-__global__ void maxPoolFwdGenericKernel(T *outFeatures, const T *inFeatures,
-                                        const Index *indicesIn,
-                                        const Index *indicesOut, int numHot,
-                                        int numPlanes) {
-  // see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
-  int ILPStrideX[NumILP];
-  Index RI[NumILP];
-  Index RO[NumILP];
-  T in, out;
-#pragma unroll
-  for (int ilp = 0; ilp < NumILP; ilp++)
-    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
-  for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
-#pragma unroll
-    for (int ilp = 0; ilp < NumILP; ilp++) {
-      if (ix + ILPStrideX[ilp] < numHot) {
-        RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
-        RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
-      }
-    }
-    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
-#pragma unroll
-      for (int ilp = 0; ilp < NumILP; ++ilp) {
-        if (ix + ILPStrideX[ilp] < numHot) {
-          in = inFeatures[RI[ilp] + iy];
-          out = outFeatures[RO[ilp] + iy];
-          if (in > out) {
-            outFeatures[RO[ilp] + iy] = in;
-          }
-        }
-      }
-    }
-  }
-}
-
-template <typename T, typename Index, int NumTLP, int NumILP>
-__global__ void
-maxPoolBwdBlockKernel(const T *outFeatures, const T *inFeatures, const T *dout,
-                      T *din, const Index *indicesIn, const Index *indicesOut,
-                      int numHot, int numPlanes) {
-  // see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
-  T in, out;
-  Index idxo, idxi;
-  int ILPStrideY[NumILP];
-#pragma unroll
-  for (int ilp = 0; ilp < NumILP; ilp++)
-    ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
-  outFeatures += blockIdx.y * NumTLP;
-  inFeatures += blockIdx.y * NumTLP;
-  dout += blockIdx.y * NumTLP;
-  din += blockIdx.y * NumTLP;
-  for (int ix = blockIdx.x * blockDim.x; ix < numHot;
-       ix += blockDim.x * gridDim.x) {
-    {
-#pragma unroll
-      for (int ilp = 0; ilp < NumILP; ++ilp) {
-        idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
-        idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
-        in = inFeatures[idxi];
-        out = outFeatures[idxo];
-        if (in == out) {
-          din[idxi] += dout[idxo];
-        }
-      }
-    }
-  }
-}
-
-template <typename T, typename Index, int NumTLP, int NumILP>
-__global__ void maxPoolBwdGenericBlockKernel(const T *outFeatures,
-                                             const T *inFeatures, const T *dout,
-                                             T *din, const Index *indicesIn,
-                                             const Index *indicesOut,
-                                             int numHot, int numPlanes) {
-  // see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
-  int ILPStrideX[NumILP];
-  Index RI[NumILP];
-  Index RO[NumILP];
-  T in, out;
-#pragma unroll
-  for (int ilp = 0; ilp < NumILP; ilp++)
-    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
-  for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
-#pragma unroll
-    for (int ilp = 0; ilp < NumILP; ilp++) {
-      RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
-      RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
-    }
-    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
-#pragma unroll
-      for (int ilp = 0; ilp < NumILP; ++ilp) {
-        in = inFeatures[RI[ilp] + iy];
-        out = outFeatures[RO[ilp] + iy];
-        if (in == out) {
-          din[RI[ilp] + iy] += dout[RO[ilp] + iy];
-        }
-      }
-    }
-  }
-}
-
-template <typename T, typename Index, int NumTLP, int NumILP, typename VecType>
-__global__ void
-maxPoolBwdVecBlockKernel(const T *outFeatures, const T *inFeatures,
-                         const T *dout, T *din, const Index *indicesIn,
-                         const Index *indicesOut, int numHot, int numPlanes) {
-  // see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
-  int ILPStrideY[NumILP];
-  constexpr int vecloadFactor = sizeof(VecType) / sizeof(T);
-  T bufi[vecloadFactor];
-  T bufo[vecloadFactor];
-  T bufdi[vecloadFactor];
-  T bufdo[vecloadFactor];
-  Index idxi, idxo;
-#pragma unroll
-  for (int ilp = 0; ilp < NumILP; ilp++)
-    ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
-  outFeatures += blockIdx.y * NumTLP;
-  inFeatures += blockIdx.y * NumTLP;
-  for (int ix = blockIdx.x * blockDim.x * vecloadFactor; ix < numHot;
-       ix += blockDim.x * gridDim.x * vecloadFactor) {
-#pragma unroll
-    for (int ilp = 0; ilp < NumILP; ++ilp) {
-      idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
-      idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
-      reinterpret_cast<VecType *>(bufo)[0] =
-          reinterpret_cast<const VecType *>(outFeatures)[idxo];
-      reinterpret_cast<VecType *>(bufi)[0] =
-          reinterpret_cast<const VecType *>(inFeatures)[idxi];
-      reinterpret_cast<VecType *>(bufdo)[0] =
-          reinterpret_cast<const VecType *>(dout)[idxo];
-      reinterpret_cast<VecType *>(bufdi)[0] =
-          reinterpret_cast<VecType *>(din)[idxi];
-
-#pragma unroll
-      for (int i = 0; i < vecloadFactor; i++) {
-        if (bufi[i] == bufo[i]) {
-          bufdi[i] += bufdo[i];
-        }
-      }
-      reinterpret_cast<VecType *>(din)[idxi] =
-          reinterpret_cast<VecType *>(bufdi)[0];
-    }
-  }
-}
-
-template <typename T, typename Index, int NumTLP, int NumILP>
-__global__ void
-maxPoolBwdGenericKernel(const T *outFeatures, const T *inFeatures,
-                        const T *dout, T *din, const Index *indicesIn,
-                        const Index *indicesOut, int numHot, int numPlanes) {
-  // see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
-  int ILPStrideX[NumILP];
-  Index RI[NumILP];
-  Index RO[NumILP];
-  T in, out;
-#pragma unroll
-  for (int ilp = 0; ilp < NumILP; ilp++)
-    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
-  for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
-#pragma unroll
-    for (int ilp = 0; ilp < NumILP; ilp++) {
-      if (ix + ILPStrideX[ilp] < numHot) {
-        RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
-        RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
-      }
-    }
-    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
-#pragma unroll
-      for (int ilp = 0; ilp < NumILP; ++ilp) {
-        if (ix + ILPStrideX[ilp] < numHot) {
-          in = inFeatures[RI[ilp] + iy];
-          out = outFeatures[RO[ilp] + iy];
-          if (in == out) {
-            din[RI[ilp] + iy] += dout[RO[ilp] + iy];
-          }
-        }
-      }
-    }
-  }
-}
-
-using float_types_t = tv::mp_list<float, double, at::Half>;
-using int_types_t = tv::mp_list<int32_t, int64_t>;
-
-void maxpool_fwd_cuda(torch::Tensor outFeatures, torch::Tensor inFeatures,
-                      torch::Tensor indicesIn, torch::Tensor indicesOut,
-                      int size) {
-  if (size <= 0)
-    return;
-  int numPlanes = inFeatures.size(1);
-  auto dtype = inFeatures.scalar_type();
-  auto int_dtype = indicesIn.scalar_type();
-  auto stream = at::cuda::getCurrentCUDAStream();
-
-  tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
-    using T = decltype(TValue);
-    using vecload_type_t =
-        std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
-    using kernel_block_t = tv::mp_list_c<int, 64, 32, 16>;
-
-    tv::DispatchTorch<int_types_t>()(int_dtype, [&](auto IndexValue) {
-      using Index = decltype(IndexValue);
-      bool notFound = true;
-      constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
-      tv::mp_for_each<kernel_block_t>([=, &outFeatures, &inFeatures, &indicesIn,
-                                       &indicesOut, &notFound](auto NumTLP) {
-        constexpr int NumILP = NumTLP / 4;
-
-        int numHotBlock = (size / NumTLP) * NumTLP;
-        if (notFound) {
-          if (numPlanes % NumTLP == 0) {
-            if (numHotBlock >= NumTLP) {
-              maxPoolFwdVecBlockKernel<T, Index, int(NumTLP), NumILP,
-                                       vecload_type_t>
-                  <<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),
-                     dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
-                     stream>>>(
-                      outFeatures.data_ptr<T>(), inFeatures.data_ptr<T>(),
-                      indicesIn.data_ptr<Index>(), indicesOut.data_ptr<Index>(),
-                      numHotBlock, numPlanes / vecloadFactor);
-              TV_CHECK_CUDA_ERR();
-            }
-
-            if (size > numHotBlock) {
-              maxPoolFwdGenericKernel<T, Index, int(NumTLP), NumILP>
-                  <<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
-                     0, stream>>>(outFeatures.data_ptr<T>(),
-                                  inFeatures.data_ptr<T>(),
-                                  indicesIn.data_ptr<Index>() + numHotBlock,
-                                  indicesOut.data_ptr<Index>() + numHotBlock,
-                                  size - numHotBlock, numPlanes);
-              TV_CHECK_CUDA_ERR();
-            }
-            notFound = false;
-          }
-        }
-      });
-
-      if (notFound) {
-        constexpr int NumTLP = 64;
-        constexpr int NumILP = NumTLP / 4;
-        int numHotBlock = (size / NumTLP) * NumTLP;
-        if (numHotBlock >= NumTLP) {
-          maxPoolFwdGenericBlockKernel<T, Index, NumTLP, NumILP>
-              <<<dim3(size / NumTLP, tv::cuda::DivUp(numPlanes, NumTLP)),
-                 dim3(NumTLP / NumILP, NumTLP), 0, stream>>>(
-                  outFeatures.data_ptr<T>(), inFeatures.data_ptr<T>(),
-                  indicesIn.data_ptr<Index>(), indicesOut.data_ptr<Index>(),
-                  numHotBlock, numPlanes);
-          TV_CHECK_CUDA_ERR();
-        }
-
-        if (size > numHotBlock) {
-          maxPoolFwdGenericKernel<T, Index, NumTLP, NumILP>
-              <<<dim3(1, tv::cuda::DivUp(numPlanes, NumTLP)),
-                 dim3(NumTLP / NumILP, NumTLP), 0, stream>>>(
-                  outFeatures.data_ptr<T>(), inFeatures.data_ptr<T>(),
-                  indicesIn.data_ptr<Index>() + numHotBlock,
-                  indicesOut.data_ptr<Index>() + numHotBlock,
-                  size - numHotBlock, numPlanes);
-          TV_CHECK_CUDA_ERR();
-        }
-      }
-    });
-  });
-}
-
-void maxpool_bwd_cuda(torch::Tensor outFeatures, torch::Tensor inFeatures,
-                      torch::Tensor dout, torch::Tensor din,
-                      torch::Tensor indicesIn, torch::Tensor indicesOut,
-                      int size) {
-  if (size <= 0)
-    return;
-  int numPlanes = inFeatures.size(1);
-  auto dtype = inFeatures.scalar_type();
-  auto int_dtype = indicesIn.scalar_type();
-  auto stream = at::cuda::getCurrentCUDAStream();
-
-  tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
-    using T = decltype(TValue);
-    using vecload_type_t =
-        std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
-    using kernel_block_t = tv::mp_list_c<int, 64, 32, 16>;
-    tv::DispatchTorch<int_types_t>()(int_dtype, [&](auto IndexValue) {
-      using Index = decltype(IndexValue);
-      bool notFound = true;
-      constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
-      tv::mp_for_each<kernel_block_t>([=, &outFeatures, &inFeatures, &dout,
-                                       &din, &indicesIn, &indicesOut,
-                                       &notFound](auto NumTLP) {
-        constexpr int NumILP = NumTLP / 4;
-        int numHotBlock = (size / NumTLP) * NumTLP;
-        if (notFound) {
-          if (numPlanes % NumTLP == 0) {
-            if (numHotBlock >= NumTLP) {
-              maxPoolBwdVecBlockKernel<T, Index, int(NumTLP), NumILP,
-                                       vecload_type_t>
-                  <<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),
-                     dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
-                     stream>>>(outFeatures.data_ptr<T>(),
-                               inFeatures.data_ptr<T>(), dout.data_ptr<T>(),
-                               din.data_ptr<T>(), indicesIn.data_ptr<Index>(),
-                               indicesOut.data_ptr<Index>(), numHotBlock,
-                               numPlanes / vecloadFactor);
-              TV_CHECK_CUDA_ERR();
-            }
-
-            if (size > numHotBlock) {
-              maxPoolBwdGenericKernel<T, Index, int(NumTLP), NumILP>
-                  <<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
-                     0, stream>>>(outFeatures.data_ptr<T>(),
-                                  inFeatures.data_ptr<T>(), dout.data_ptr<T>(),
-                                  din.data_ptr<T>(),
-                                  indicesIn.data_ptr<Index>() + numHotBlock,
-                                  indicesOut.data_ptr<Index>() + numHotBlock,
-                                  size - numHotBlock, numPlanes);
-              TV_CHECK_CUDA_ERR();
-            }
-            notFound = false;
-          }
-        }
-      });
-
-      if (notFound) {
-        constexpr int NumTLP = 64;
-        constexpr int NumILP = NumTLP / 4;
-        int numHotBlock = (size / NumTLP) * NumTLP;
-        if (numHotBlock >= NumTLP) {
-          maxPoolBwdGenericBlockKernel<T, Index, NumTLP, NumILP>
-              <<<dim3(size / NumTLP, tv::cuda::DivUp(numPlanes, NumTLP)),
-                 dim3(NumTLP / NumILP, NumTLP), 0, stream>>>(
-                  outFeatures.data_ptr<T>(), inFeatures.data_ptr<T>(),
-                  dout.data_ptr<T>(), din.data_ptr<T>(),
-                  indicesIn.data_ptr<Index>(), indicesOut.data_ptr<Index>(),
-                  numHotBlock, numPlanes);
-          TV_CHECK_CUDA_ERR();
-        }
-
-        if (size > numHotBlock) {
-          maxPoolBwdGenericKernel<T, Index, NumTLP, NumILP>
-              <<<dim3(1, tv::cuda::DivUp(numPlanes, NumTLP)),
-                 dim3(NumTLP / NumILP, NumTLP), 0, stream>>>(
-                  outFeatures.data_ptr<T>(), inFeatures.data_ptr<T>(),
-                  dout.data_ptr<T>(), din.data_ptr<T>(),
-                  indicesIn.data_ptr<Index>() + numHotBlock,
-                  indicesOut.data_ptr<Index>() + numHotBlock,
-                  size - numHotBlock, numPlanes);
-          TV_CHECK_CUDA_ERR();
-        }
-      }
-    });
-  });
-}
-
-} // namespace spconv
\ No newline at end of file
--- a/src/spconv/nms.cc
+++ b/src/spconv/nms.cc
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <boost/geometry.hpp>
-#include <spconv/nms_functor.h>
-#include <torch/script.h>
-#include <vector>
-
-namespace spconv {
-
-namespace functor {
-template <typename T, typename Index>
-struct NonMaxSupressionFunctor<tv::CPU, T, Index> {
-  Index operator()(const tv::CPU &d, tv::TensorView<Index> keep,
-                   tv::TensorView<const T> boxes, T threshold, T eps) {
-    auto ndets = boxes.dim(0);
-    auto suppressed = std::vector<Index>(ndets);
-    auto area = std::vector<T>(ndets);
-    for (int i = 0; i < ndets; ++i) {
-      area[i] =
-          (boxes(i, 2) - boxes(i, 0) + eps) * (boxes(i, 3) - boxes(i, 1) + eps);
-    }
-    int i, j;
-    T xx1, xx2, w, h, inter, ovr;
-    int keepNum = 0;
-    for (int _i = 0; _i < ndets; ++_i) {
-      i = _i;
-      if (suppressed[i] == 1)
-        continue;
-      keep[keepNum] = i;
-      keepNum += 1;
-      for (int _j = _i + 1; _j < ndets; ++_j) {
-        j = _j;
-        if (suppressed[j] == 1)
-          continue;
-        xx2 = std::min(boxes(i, 2), boxes(j, 2));
-        xx1 = std::max(boxes(i, 0), boxes(j, 0));
-        w = xx2 - xx1 + eps;
-        if (w > 0) {
-          xx2 = std::min(boxes(i, 3), boxes(j, 3));
-          xx1 = std::max(boxes(i, 1), boxes(j, 1));
-          h = xx2 - xx1 + eps;
-          if (h > 0) {
-            inter = w * h;
-            ovr = inter / (area[i] + area[j] - inter);
-            if (ovr >= threshold)
-              suppressed[j] = 1;
-          }
-        }
-      }
-    }
-    return keepNum;
-  }
-};
-
-template <typename T, typename Index>
-struct rotateNonMaxSupressionFunctor<tv::CPU, T, Index> {
-  Index operator()(const tv::CPU &d, tv::TensorView<Index> keep,
-                   tv::TensorView<const T> boxCorners,
-                   tv::TensorView<const T> standupIoU, T threshold) {
-    auto ndets = boxCorners.dim(0);
-    auto suppressed = std::vector<Index>(ndets);
-    int i, j;
-    namespace bg = boost::geometry;
-    typedef bg::model::point<T, 2, bg::cs::cartesian> point_t;
-    typedef bg::model::polygon<point_t> polygon_t;
-    polygon_t poly, qpoly;
-    std::vector<polygon_t> poly_inter, poly_union;
-    T inter_area, union_area, overlap;
-    int keepNum = 0;
-    for (int _i = 0; _i < ndets; ++_i) {
-      i = _i;
-      if (suppressed[i] == 1)
-        continue;
-      keep[keepNum] = i;
-      keepNum += 1;
-      for (int _j = _i + 1; _j < ndets; ++_j) {
-        j = _j;
-        if (suppressed[j] == 1)
-          continue;
-        if (standupIoU(i, j) <= 0.0)
-          continue;
-        bg::append(poly, point_t(boxCorners(i, 0, 0), boxCorners(i, 0, 1)));
-        bg::append(poly, point_t(boxCorners(i, 1, 0), boxCorners(i, 1, 1)));
-        bg::append(poly, point_t(boxCorners(i, 2, 0), boxCorners(i, 2, 1)));
-        bg::append(poly, point_t(boxCorners(i, 3, 0), boxCorners(i, 3, 1)));
-        bg::append(poly, point_t(boxCorners(i, 0, 0), boxCorners(i, 0, 1)));
-        bg::append(qpoly, point_t(boxCorners(j, 0, 0), boxCorners(j, 0, 1)));
-        bg::append(qpoly, point_t(boxCorners(j, 1, 0), boxCorners(j, 1, 1)));
-        bg::append(qpoly, point_t(boxCorners(j, 2, 0), boxCorners(j, 2, 1)));
-        bg::append(qpoly, point_t(boxCorners(j, 3, 0), boxCorners(j, 3, 1)));
-        bg::append(qpoly, point_t(boxCorners(j, 0, 0), boxCorners(j, 0, 1)));
-        bg::intersection(poly, qpoly, poly_inter);
-
-        if (!poly_inter.empty()) {
-          inter_area = bg::area(poly_inter.front());
-          bg::union_(poly, qpoly, poly_union);
-          if (!poly_union.empty()) { // ignore invalid box
-            union_area = bg::area(poly_union.front());
-            overlap = inter_area / union_area;
-            if (overlap >= threshold)
-              suppressed[j] = 1;
-            poly_union.clear();
-          }
-        }
-        poly.clear();
-        qpoly.clear();
-        poly_inter.clear();
-      }
-    }
-    return keepNum;
-  }
-};
-
-} // namespace functor
-
-#define DECLARE_CPU_T_INDEX(T, Index)                                          \
-  template struct functor::NonMaxSupressionFunctor<tv::CPU, T, Index>;         \
-  template struct functor::rotateNonMaxSupressionFunctor<tv::CPU, T, Index>;
-
-#define DECLARE_CPU_INDEX(Index)                                               \
-  DECLARE_CPU_T_INDEX(float, Index);                                           \
-  DECLARE_CPU_T_INDEX(double, Index);
-
-DECLARE_CPU_INDEX(int);
-DECLARE_CPU_INDEX(long);
-
-#undef DECLARE_CPU_INDEX
-#undef DECLARE_CPU_T_INDEX
-
-} // namespace spconv
--- a/src/spconv/nms.cu
+++ b/src/spconv/nms.cu
-// ------------------------------------------------------------------
-// Deformable Convolutional Networks
-// Copyright (c) 2015 Microsoft
-// Licensed under The MIT License
-// Modified from MATLAB Faster R-CNN
-// (https://github.com/shaoqingren/faster_rcnn)
-// ------------------------------------------------------------------
-
-#include <ATen/ATen.h>
-#include <chrono>
-#include <limits>
-#include <spconv/reordering.cu.h>
-#include <spconv/reordering.h>
-#include <tensorview/cuda_utils.h>
-#include <tensorview/kernel_utils.h>
-#include <tensorview/mp_helper.h>
-#include <tensorview/tensorview.h>
-#include <type_traits>
-#include <utility/timer.h>
-
-#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
-int const threadsPerBlock = sizeof(unsigned long long) * 8;
-
-template <typename DType>
-__device__ inline DType devIoU(DType const *const a, DType const *const b) {
-  DType left = max(a[0], b[0]), right = min(a[2], b[2]);
-  DType top = max(a[1], b[1]), bottom = min(a[3], b[3]);
-  DType width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
-  DType interS = width * height;
-  DType Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
-  DType Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
-  return interS / (Sa + Sb - interS);
-}
-
-template <typename DType, int BLOCK_THREADS>
-__global__ void nms_kernel(const int n_boxes, const DType nms_overlap_thresh,
-                           const DType *dev_boxes,
-                           unsigned long long *dev_mask) {
-  const int row_start = blockIdx.y;
-  const int col_start = blockIdx.x;
-
-  // if (row_start > col_start) return;
-
-  const int row_size = min(n_boxes - row_start * BLOCK_THREADS, BLOCK_THREADS);
-  const int col_size = min(n_boxes - col_start * BLOCK_THREADS, BLOCK_THREADS);
-
-  __shared__ DType block_boxes[BLOCK_THREADS * 5];
-  if (threadIdx.x < col_size) {
-#pragma unroll
-    for (int i = 0; i < 5; ++i) {
-      block_boxes[threadIdx.x * 5 + i] =
-          dev_boxes[(BLOCK_THREADS * col_start + threadIdx.x) * 5 + i];
-    }
-  }
-  __syncthreads();
-
-  if (threadIdx.x < row_size) {
-    const int cur_box_idx = BLOCK_THREADS * row_start + threadIdx.x;
-    const DType *cur_box = dev_boxes + cur_box_idx * 5;
-    unsigned long long t = 0;
-    int start = 0;
-    if (row_start == col_start) {
-      start = threadIdx.x + 1;
-    }
-    for (int i = start; i < col_size; i++) {
-      if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
-        t |= 1ULL << i;
-      }
-    }
-    const int col_blocks = DIVUP(n_boxes, BLOCK_THREADS);
-    dev_mask[cur_box_idx * col_blocks + col_start] = t;
-  }
-}
--- a/src/spconv/pillar_scatter.cu
+++ b/src/spconv/pillar_scatter.cu
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <ATen/ATen.h>
-#include <chrono>
-#include <limits>
-#include <spconv/pillar_scatter_functor.h>
-#include <tensorview/cuda_utils.h>
-#include <tensorview/kernel_utils.h>
-#include <tensorview/mp_helper.h>
-#include <tensorview/tensorview.h>
-#include <type_traits>
-#include <utility/timer.h>
-
-namespace spconv {
-template <typename T, typename Index>
-__global__ void pointPillarsScatterKernel(tv::TensorView<T> canvas,
-                                          tv::TensorView<const T> features,
-                                          tv::TensorView<const T> coors) {
-  auto numFeatures = features.dim(0);
-  auto numPoints = features.dim(1);
-  for (int i : tv::KernelLoopX<int>(numPoints)) {
-    for (int ifeature : tv::KernelLoopY<int>(numFeatures)) {
-      canvas(int(coors(0, i)), ifeature, int(coors(2, i)), int(coors(3, i))) =
-          features(ifeature, i);
-    }
-  }
-}
-namespace functor {
-template <typename T, typename Index>
-struct PointPillarScatter<tv::GPU, T, Index> {
-  void operator()(const tv::GPU &d, tv::TensorView<T> canvas,
-                  tv::TensorView<const T> features,
-                  tv::TensorView<const T> coors) {
-    auto grid = dim3(tv::cuda::DivUp(features.dim(1), 32),
-                     tv::cuda::DivUp(features.dim(0), 32));
-    pointPillarsScatterKernel<T, Index>
-        <<<grid, dim3(32, 32), 0, d.getStream()>>>(canvas, features, coors);
-    TV_CHECK_CUDA_ERR();
-  }
-};
-} // namespace functor
-
-#define DECLARE_GPU_SPECS_T_INDEX(T, Index)                                    \
-  template struct functor::PointPillarScatter<tv::GPU, T, Index>;
-
-#define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPECS_T_INDEX(T, int);
-
-DECLARE_GPU_SPECS(float);
-DECLARE_GPU_SPECS(double);
-DECLARE_GPU_SPECS(at::Half);
-
-#undef DECLARE_GPU_SPECS
-#undef DECLARE_GPU_SPECS_T_INDEX
-} // namespace spconv
\ No newline at end of file
--- a/src/spconv/point2voxel.cu
+++ b/src/spconv/point2voxel.cu
-
-#include <ATen/ATen.h>
-#include <spconv/point2voxel.cu.h>
-//#include <spconv/point2voxel.h>
-#include <tensorview/cuda_utils.h>
-#include <tensorview/mp_helper.h>
-#include <tensorview/tensor.h>
-#include <tensorview/tensorview.h>
-#include <tensorview/torch_utils.h>
-
-namespace spconv {
-
-void scatter_point_to_grid_cuda(torch::Tensor points, torch::Tensor indexes,
-                                torch::Tensor grids,
-                                torch::Tensor numPointsPerGrid,
-                                torch::Tensor pointIndex,
-                                std::vector<int64_t> gridShape,
-                                const int ndim) {
-  auto stream = at::cuda::getCurrentCUDAStream();
-  auto num_points = points.size(0);
-  auto num_features = points.size(1);
-  tv::dispatch_torch<int32_t>(pointIndex.scalar_type(), [&](auto IndexValue) {
-    using Index = decltype(IndexValue);
-    tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
-      constexpr int NDim = decltype(I)::value;
-      tv::SimpleVector<Index, NDim> gs(gridShape.begin(), gridShape.end());
-      scatterPointToGridKernel<Index, NDim>
-          <<<tv::cuda::getBlocks(num_points), tv::cuda::CUDA_NUM_THREADS, 0,
-             stream>>>(tv::torch2tv<float>(points),
-                       tv::torch2tv<Index>(indexes), tv::torch2tv<float>(grids),
-                       tv::torch2tv<Index>(numPointsPerGrid),
-                       tv::torch2tv<Index>(pointIndex), gs);
-      TV_CHECK_CUDA_ERR_V2("scatterPointToGridKernel failed");
-#ifdef TV_LOG_KERNEL_INFO
-      cudaFuncAttributes attr;
-      checkCudaErrors(
-          cudaFuncGetAttributes(&attr, scatterPointToGridKernel<Index, NDim>));
-      tv::ssprint("scatterPointToGridKernel<", tv::type_s<Index>, NDim, ">",
-                  attr.numRegs);
-#endif
-    });
-  });
-}
-
-void gather_point_from_grid_cuda(torch::Tensor grids,
-                                 torch::Tensor numPointsPerGrid,
-                                 torch::Tensor pointIndex,
-                                 torch::Tensor pointIndexUnique,
-                                 torch::Tensor voxels, torch::Tensor coors,
-                                 std::vector<int64_t> gridShape,
-                                 const int ndim) {
-  auto stream = at::cuda::getCurrentCUDAStream();
-  auto num_voxel = voxels.size(0);
-  auto num_max_points = pointIndex.size(0) - 1;
-  auto grid_volume = grids.size(0);
-  tv::dispatch_torch<int32_t>(
-      pointIndexUnique.scalar_type(), [&](auto IndexValue) {
-        using Index = decltype(IndexValue);
-        tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
-          constexpr int NDim = decltype(I)::value;
-          tv::SimpleVector<Index, NDim> gs(gridShape.begin(), gridShape.end());
-
-          resetPointIndexKernel<Index>
-              <<<tv::cuda::getBlocks(num_max_points),
-                 tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
-                  tv::torch2tv<Index>(pointIndex), grid_volume);
-          TV_CHECK_CUDA_ERR_V2("resetPointIndexKernel failed");
-#ifdef TV_LOG_KERNEL_INFO
-          cudaFuncAttributes attr0;
-          checkCudaErrors(cudaFuncGetAttributes(
-              &attr0, resetPointIndexKernel<Index, NDim>));
-          tv::ssprint("resetPointIndexKernel<", tv::type_s<Index>, NDim, ">",
-                      attr0.numRegs);
-#endif
-
-          gatherPointFromGridKernel<Index, NDim>
-              <<<tv::cuda::getBlocks(num_voxel), tv::cuda::CUDA_NUM_THREADS, 0,
-                 stream>>>(tv::torch2tv<float>(grids),
-                           tv::torch2tv<Index>(numPointsPerGrid),
-                           tv::torch2tv<Index>(pointIndexUnique),
-                           tv::torch2tv<float>(voxels),
-                           tv::torch2tv<Index>(coors), gs);
-          TV_CHECK_CUDA_ERR_V2("gatherPointFromGridKernel failed");
-#ifdef TV_LOG_KERNEL_INFO
-          cudaFuncAttributes attr1;
-          checkCudaErrors(cudaFuncGetAttributes(
-              &attr1, gatherPointFromGridKernel<Index, NDim>));
-          tv::ssprint("gatherPointFromGridKernel<", tv::type_s<Index>, NDim,
-                      ">", attr1.numRegs);
-#endif
-
-          resetGridKernel<Index><<<tv::cuda::getBlocks(num_voxel),
-                                   tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
-              tv::torch2tv<float>(grids), tv::torch2tv<Index>(numPointsPerGrid),
-              tv::torch2tv<Index>(pointIndexUnique));
-          TV_CHECK_CUDA_ERR_V2("resetGridKernel failed");
-#ifdef TV_LOG_KERNEL_INFO
-          cudaFuncAttributes attr2;
-          checkCudaErrors(
-              cudaFuncGetAttributes(&attr2, resetGridKernel<Index, NDim>));
-          tv::ssprint("resetGridKernel<", tv::type_s<Index>, NDim, ">",
-                      attr2.numRegs);
-#endif
-        });
-      });
-}
-
-} // namespace spconv
--- a/src/spconv/point2voxel_ops.cc
+++ b/src/spconv/point2voxel_ops.cc
-#include <spconv/point2voxel_ops.h>
-//#include <spconv/point2voxel.cu.h>
-
-namespace spconv {
-
-int64_t pointsToVoxel(torch::Tensor points, torch::Tensor indexes,
-                      torch::Tensor pointIndex, torch::Tensor grids,
-                      torch::Tensor numPointsPerGrid, torch::Tensor voxels,
-                      torch::Tensor coors, std::vector<int64_t> gridShape,
-                      const int64_t ndim) {
-  if (points.device().type() == torch::kCPU) {
-    TV_THROW_INVALID_ARG("not support cpu currently");
-  }
-#ifdef TV_CUDA
-  else if (points.device().type() == torch::kCUDA) {
-    scatter_point_to_grid_cuda(points, indexes, grids, numPointsPerGrid,
-                               pointIndex, gridShape, ndim);
-  }
-#endif
-  else {
-    TV_THROW_INVALID_ARG("unknown device type");
-  }
-  auto res = torch::_unique(pointIndex);
-  auto pointIndexUnique = std::get<0>(res);
-  auto num_voxel = pointIndexUnique.size(0) - 1;
-  if (points.device().type() == torch::kCPU) {
-    TV_THROW_INVALID_ARG("not support cpu currently");
-  }
-#ifdef TV_CUDA
-  else if (points.device().type() == torch::kCUDA) {
-    gather_point_from_grid_cuda(grids, numPointsPerGrid, pointIndex,
-                                pointIndexUnique, voxels, coors, gridShape,
-                                ndim);
-  }
-#endif
-  else {
-    TV_THROW_INVALID_ARG("unknown device type");
-  }
-  return num_voxel;
-}
-
-} // namespace spconv
--- a/src/spconv/pool_ops.cc
+++ b/src/spconv/pool_ops.cc
-#include <spconv/pool_ops.h>
-
-namespace spconv {
-torch::Tensor indiceMaxPool(torch::Tensor features, torch::Tensor indicePairs,
-                            torch::Tensor indiceNum, int64_t numAct) {
-  auto device = features.device().type();
-  auto kernelVolume = indiceNum.size(0);
-  auto numInPlanes = features.size(1);
-  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
-  auto options =
-      torch::TensorOptions().dtype(features.dtype()).device(features.device());
-  torch::Tensor output = torch::zeros({numAct, numInPlanes}, options);
-  double totalTime = 0;
-  for (int i = 0; i < kernelVolume; ++i) {
-    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
-    if (nHot <= 0) {
-      continue;
-    }
-    // auto timer = spconv::CudaContextTimer<>();
-    if (device == torch::kCPU) {
-      maxpool_fwd_cpu(output, features, indicePairs[0][i], indicePairs[1][i],
-                      nHot);
-    }
-#ifdef TV_CUDA
-    else if (device == torch::kCUDA) {
-      maxpool_fwd_cuda(output, features, indicePairs[0][i], indicePairs[1][i],
-                       nHot);
-    }
-#endif
-    else {
-      TV_ASSERT_INVALID_ARG(false, "unknown device type");
-    }
-    // totalTime += timer.report() / 1000.0;
-  }
-  // std::cout << "maxpool forward time " << totalTime << std::endl;
-  return output;
-}
-
-torch::Tensor indiceMaxPoolBackward(torch::Tensor features,
-                                    torch::Tensor outFeatures,
-                                    torch::Tensor outGrad,
-                                    torch::Tensor indicePairs,
-                                    torch::Tensor indiceNum) {
-  auto device = features.device().type();
-  auto numInPlanes = features.size(1);
-  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
-  auto options =
-      torch::TensorOptions().dtype(features.dtype()).device(features.device());
-  torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
-  auto kernelVolume = indiceNum.size(0);
-  for (int i = 0; i < kernelVolume; ++i) {
-    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
-    if (nHot <= 0) {
-      continue;
-    }
-    if (device == torch::kCPU) {
-      maxpool_bwd_cpu(outFeatures, features, outGrad, inputGrad,
-                      indicePairs[0][i], indicePairs[1][i], nHot);
-    }
-#ifdef TV_CUDA
-    else if (device == torch::kCUDA) {
-      maxpool_bwd_cuda(outFeatures, features, outGrad, inputGrad,
-                       indicePairs[0][i], indicePairs[1][i], nHot);
-    }
-#endif
-    else {
-      TV_ASSERT_INVALID_ARG(false, "unknown device type");
-    }
-  }
-  return inputGrad;
-}
-
-} // namespace spconv
\ No newline at end of file
--- a/src/spconv/reordering.cc
+++ b/src/spconv/reordering.cc
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <ATen/Parallel.h>
-#include <spconv/reordering.h>
-#include <tensorview/torch_utils.h>
-#include <torch/script.h>
-
-namespace spconv {
-using float_types_t = tv::mp_list<float, double, at::Half>;
-using int_types_t = tv::mp_list<int32_t, int64_t>;
-void sparse_gather_cpu(torch::Tensor buffer, torch::Tensor features,
-                       torch::Tensor indices, int size) {
-  int numPlanes = features.size(1);
-  auto dtype = features.scalar_type();
-  auto int_dtype = indices.scalar_type();
-  tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
-    using T = decltype(TValue);
-    tv::DispatchTorch<int_types_t>()(int_dtype, [&](auto IndexValue) {
-      using Index = decltype(IndexValue);
-      Index *indices_data = indices.data_ptr<Index>();
-      T *buffer_data = buffer.data_ptr<T>();
-      const T *features_data = features.data_ptr<T>();
-      at::parallel_for(0, size, 0, [&](int64_t begin, int64_t end) {
-        for (int i = begin; i < end; ++i) {
-          std::memcpy(buffer_data + i * numPlanes,
-                      features_data + indices_data[i] * numPlanes,
-                      sizeof(T) * numPlanes);
-        }
-      });
-    });
-  });
-}
-
-void sparse_scatter_add_cpu(torch::Tensor buffer, torch::Tensor outFeatures,
-                            torch::Tensor indices, int size) {
-  int numPlanes = outFeatures.size(1);
-  auto dtype = outFeatures.scalar_type();
-  auto int_dtype = indices.scalar_type();
-
-  tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
-    using T = decltype(TValue);
-    tv::DispatchTorch<int_types_t>()(int_dtype, [&](auto IndexValue) {
-      using Index = decltype(IndexValue);
-      Index *indices_data = indices.data_ptr<Index>();
-      const T *buffer_data = buffer.data_ptr<T>();
-      T *features_data = outFeatures.data_ptr<T>();
-      at::parallel_for(0, size, 0, [&](int64_t begin, int64_t end) {
-        const T *buf = buffer.data_ptr<T>();
-        T *out = outFeatures.data_ptr<T>();
-        for (int i = begin; i < end; ++i) {
-          buf = buffer_data + i * numPlanes;
-          out = features_data + indices_data[i] * numPlanes;
-          for (int j = 0; j < numPlanes; ++j) {
-            out[j] += buf[j];
-          }
-        }
-      });
-    });
-  });
-}
-
-} // namespace spconv
--- a/src/spconv/reordering.cu
+++ b/src/spconv/reordering.cu
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <ATen/ATen.h>
-#include <chrono>
-#include <limits>
-#include <spconv/reordering.cu.h>
-#include <spconv/reordering.h>
-#include <tensorview/cuda_utils.h>
-#include <tensorview/kernel_utils.h>
-#include <tensorview/mp_helper.h>
-#include <tensorview/tensor.h>
-#include <tensorview/tensorview.h>
-#include <tensorview/torch_utils.h>
-#include <type_traits>
-#include <utility/timer.h>
-namespace spconv {
-
-using float_types_t = tv::mp_list<float, double, at::Half>;
-using int_types_t = tv::mp_list<int32_t, int64_t>;
-template <typename T>
-using half_vec_t =
-    std::conditional_t<std::is_same<T, at::Half>::value, int4, int4>;
-template <typename T>
-using half_vec_sadd_t =
-    std::conditional_t<std::is_same<T, at::Half>::value, int4, int4>;
-using kernel_block_t = tv::mp_list_c<int, 64, 32, 16, 8>;
-
-void sparse_gather_cuda(cudaStream_t stream, torch::Tensor buffer,
-                        torch::Tensor features, torch::Tensor indices,
-                        int size) {
-  if (size <= 0)
-    return;
-  int numPlanes = features.size(1);
-  auto dtype = features.scalar_type();
-  auto inds_dtype = indices.scalar_type();
-  // auto timer = spconv::CudaContextTimer<>();
-  tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
-    using T = decltype(TValue);
-    using vecload_type_t = half_vec_t<T>;
-    tv::DispatchTorch<int_types_t>()(inds_dtype, [&](auto IndexValue) {
-      using Index = decltype(IndexValue);
-      bool notFound = true;
-      constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
-
-      tv::mp_for_each<kernel_block_t>([&](auto NumTLP) {
-        constexpr int NumILP = NumTLP / 4;
-        // constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
-        int nHotBlock = (size / NumTLP) * NumTLP;
-        if (notFound) {
-          if (numPlanes % NumTLP == 0) {
-            if (nHotBlock >= NumTLP) {
-              gatherVecBlockKernel<T, Index, int(NumTLP), NumILP,
-                                   vecload_type_t>
-                  <<<dim3(size / NumTLP, numPlanes / NumTLP),
-                     dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
-                     stream>>>(buffer.data_ptr<T>(), features.data_ptr<T>(),
-                               indices.data_ptr<Index>(), nHotBlock,
-                               numPlanes / vecloadFactor);
-#ifdef TV_LOG_KERNEL_INFO
-              cudaFuncAttributes attr;
-              checkCudaErrors(cudaFuncGetAttributes(
-                  &attr, gatherVecBlockKernel<T, Index, int(NumTLP), NumILP,
-                                              vecload_type_t>));
-              tv::ssprint("gatherVecBlockKernel<", tv::type_s<T>,
-                          tv::type_s<Index>, int(NumTLP), NumILP, ">",
-                          attr.numRegs);
-#endif
-              TV_CHECK_CUDA_ERR();
-            }
-            if (size - nHotBlock > 0) {
-              gatherVecKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
-                  <<<dim3(1, numPlanes / NumTLP),
-                     dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
-                     stream>>>(buffer.data_ptr<T>() + nHotBlock * numPlanes,
-                               features.data_ptr<T>(),
-                               indices.data_ptr<Index>() + nHotBlock,
-                               size - nHotBlock, numPlanes / vecloadFactor);
-
-#ifdef TV_LOG_KERNEL_INFO
-              cudaFuncAttributes attr;
-              checkCudaErrors(cudaFuncGetAttributes(
-                  &attr, gatherVecKernel<T, Index, int(NumTLP), NumILP,
-                                         vecload_type_t>));
-              tv::ssprint("gatherVecKernel<", tv::type_s<T>, tv::type_s<Index>,
-                          int(NumTLP), NumILP, ">", attr.numRegs);
-#endif
-
-              TV_CHECK_CUDA_ERR();
-            }
-            notFound = false;
-          }
-        }
-      });
-      if (notFound) {
-        constexpr int NumTLP = 64;
-        constexpr int NumILP = NumTLP / 4;
-        gatherGenericKernel<T, Index, NumTLP, NumILP>
-            <<<dim3(tv::cuda::DivUp(size, NumTLP),
-                    tv::cuda::DivUp(numPlanes, NumTLP)),
-               dim3(NumTLP / NumILP, NumTLP), 0, stream>>>(
-                buffer.data_ptr<T>(), features.data_ptr<T>(),
-                indices.data_ptr<Index>(), size, numPlanes);
-#ifdef TV_LOG_KERNEL_INFO
-        cudaFuncAttributes attr;
-        checkCudaErrors(cudaFuncGetAttributes(
-            &attr, gatherGenericKernel<T, Index, NumTLP, NumILP>));
-        tv::ssprint("gatherGenericKernel<", tv::type_s<T>, tv::type_s<Index>,
-                    int(NumTLP), NumILP, ">", attr.numRegs);
-#endif
-
-        TV_CHECK_CUDA_ERR();
-      }
-    });
-  });
-}
-
-void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
-                        torch::Tensor indices, int size) {
-  auto stream = at::cuda::getCurrentCUDAStream();
-  return sparse_gather_cuda(stream, buffer, features, indices, size);
-}
-
-void sparse_scatter_add_cuda(cudaStream_t stream, torch::Tensor buffer,
-                             torch::Tensor outFeatures, torch::Tensor indices,
-                             int size) {
-  if (size <= 0)
-    return;
-  int numPlanes = outFeatures.size(1);
-  auto dtype = outFeatures.scalar_type();
-  auto inds_dtype = indices.scalar_type();
-
-  tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
-    using T = decltype(TValue);
-    using vecload_type_t = half_vec_sadd_t<T>;
-    tv::DispatchTorch<int_types_t>()(inds_dtype, [&](auto IndexValue) {
-      using Index = decltype(IndexValue);
-      bool notFound = true;
-      constexpr int vecloadFactor =
-          sizeof(vecload_type_t) / sizeof(T); // important for half.
-
-      tv::mp_for_each<kernel_block_t>([&](auto NumTLP) {
-        // constexpr int NumILP = NumTLP / (64 / (NumTLP /
-        // vecloadFactor));
-        constexpr int NumILP = NumTLP / 4;
-        int nHotBlock = (size / NumTLP) * NumTLP;
-        if (notFound) {
-          if (numPlanes % NumTLP == 0) {
-            if (nHotBlock >= NumTLP) {
-              scatterAddVecBlockKernel<T, Index, int(NumTLP), NumILP,
-                                       vecload_type_t>
-                  <<<dim3(size / NumTLP, numPlanes / NumTLP),
-                     dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
-                     stream>>>(outFeatures.data_ptr<T>(), buffer.data_ptr<T>(),
-                               indices.data_ptr<Index>(), nHotBlock,
-                               numPlanes / vecloadFactor);
-#ifdef TV_LOG_KERNEL_INFO
-              cudaFuncAttributes attr;
-              checkCudaErrors(cudaFuncGetAttributes(
-                  &attr, scatterAddVecBlockKernel<T, Index, int(NumTLP), NumILP,
-                                                  vecload_type_t>));
-              tv::ssprint("scatterAddVecBlockKernel<", tv::type_s<T>,
-                          tv::type_s<Index>, int(NumTLP), NumILP, ">",
-                          attr.numRegs);
-#endif
-
-              TV_CHECK_CUDA_ERR();
-            }
-            if (size - nHotBlock > 0) {
-              scatterAddGenericKernel<T, Index, int(NumTLP), NumILP>
-                  <<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
-                     0, stream>>>(outFeatures.data_ptr<T>(),
-                                  buffer.data_ptr<T>() + nHotBlock * numPlanes,
-                                  indices.data_ptr<Index>() + nHotBlock,
-                                  size - nHotBlock, numPlanes);
-#ifdef TV_LOG_KERNEL_INFO
-              cudaFuncAttributes attr;
-              checkCudaErrors(cudaFuncGetAttributes(
-                  &attr,
-                  scatterAddGenericKernel<T, Index, int(NumTLP), NumILP>));
-              tv::ssprint("scatterAddGenericKernel<", tv::type_s<T>,
-                          tv::type_s<Index>, int(NumTLP), NumILP, ">",
-                          attr.numRegs);
-#endif
-              TV_CHECK_CUDA_ERR();
-            }
-            notFound = false;
-          }
-        }
-      });
-      if (notFound) {
-        constexpr int NumTLP = 64;
-        constexpr int NumILP = NumTLP / 4;
-        scatterAddGenericKernel<T, Index, NumTLP, NumILP>
-            <<<dim3(tv::cuda::DivUp(size, NumTLP),
-                    tv::cuda::DivUp(numPlanes, NumTLP)),
-               dim3(NumTLP / NumILP, NumTLP), 0, stream>>>(
-                outFeatures.data_ptr<T>(), buffer.data_ptr<T>(),
-                indices.data_ptr<Index>(), size, numPlanes);
-#ifdef TV_LOG_KERNEL_INFO
-        cudaFuncAttributes attr;
-        checkCudaErrors(cudaFuncGetAttributes(
-            &attr, scatterAddGenericKernel<T, Index, int(NumTLP), NumILP>));
-        tv::ssprint("notfound scatterAddGenericKernel<", tv::type_s<T>,
-                    tv::type_s<Index>, int(NumTLP), NumILP, ">", attr.numRegs);
-#endif
-
-        TV_CHECK_CUDA_ERR();
-      }
-    });
-  });
-}
-
-void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,
-                             torch::Tensor indices, int size) {
-  auto stream = at::cuda::getCurrentCUDAStream();
-  return sparse_scatter_add_cuda(stream, buffer, outFeatures, indices, size);
-}
-
-void batch_sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
-                              torch::Tensor indices, int size) {
-  // indices: [volume, inds_stride]
-  // buffer: [volume, num_points, num_features]
-  // size == volume * num_points
-  if (size <= 0)
-    return;
-  int numPlanes = features.size(1);
-  auto stream = at::cuda::getCurrentCUDAStream();
-  auto dtype = features.scalar_type();
-  auto inds_dtype = indices.scalar_type();
-  int inds_stride = indices.size(1);
-  int feature_stride = buffer.size(1);
-  tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
-    using T = decltype(TValue);
-    using vecload_type_t = half_vec_t<T>;
-    tv::DispatchTorch<int_types_t>()(inds_dtype, [&](auto IndexValue) {
-      using Index = decltype(IndexValue);
-      bool notFound = true;
-      constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
-      tv::mp_for_each<kernel_block_t>(
-          [=, &buffer, &features, &indices, &notFound](auto NumTLP) {
-            constexpr int NumILP = NumTLP / 4;
-            // constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
-            int nHotBlock = (size / NumTLP) * NumTLP;
-            if (notFound) {
-              if (numPlanes % NumTLP == 0) {
-                if (nHotBlock >= NumTLP) {
-                  batchGatherVecBlockKernel<T, Index, int(NumTLP), NumILP,
-                                            vecload_type_t>
-                      <<<dim3(size / NumTLP, numPlanes / NumTLP),
-                         dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
-                         stream>>>(buffer.data_ptr<T>(), features.data_ptr<T>(),
-                                   indices.data_ptr<Index>(), nHotBlock,
-                                   numPlanes / vecloadFactor, inds_stride,
-                                   feature_stride);
-                  TV_CHECK_CUDA_ERR_V2("batchGatherVecBlockKernel");
-                }
-                if (size - nHotBlock > 0) {
-                  batchGatherVecKernel<T, Index, int(NumTLP), NumILP,
-                                       vecload_type_t>
-                      <<<dim3(1, numPlanes / NumTLP),
-                         dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
-                         stream>>>(buffer.data_ptr<T>() + nHotBlock * numPlanes,
-                                   features.data_ptr<T>(),
-                                   indices.data_ptr<Index>(), size - nHotBlock,
-                                   nHotBlock, numPlanes / vecloadFactor,
-                                   inds_stride, feature_stride);
-                  TV_CHECK_CUDA_ERR_V2("batchGatherVecKernel");
-                }
-                notFound = false;
-              }
-            }
-          });
-
-      if (notFound) {
-        constexpr int NumTLP = 64;
-        constexpr int NumILP = NumTLP / 4;
-        batchGatherGenericKernel<T, Index, NumTLP, NumILP>
-            <<<dim3(tv::cuda::DivUp(size, NumTLP),
-                    tv::cuda::DivUp(numPlanes, NumTLP)),
-               dim3(NumTLP / NumILP, NumTLP), 0, stream>>>(
-                buffer.data_ptr<T>(), features.data_ptr<T>(),
-                indices.data_ptr<Index>(), size, numPlanes, inds_stride,
-                feature_stride);
-        TV_CHECK_CUDA_ERR();
-      }
-    });
-  });
-}
-
-void batch_sparse_scatter_add_cuda(torch::Tensor buffer,
-                                   torch::Tensor outFeatures,
-                                   torch::Tensor indices, int size) {
-  // indices: [volume, inds_stride]
-  // buffer: [volume, num_points, num_features]
-  // size == volume * num_points
-  if (size <= 0)
-    return;
-  int numPlanes = outFeatures.size(1);
-  auto stream = at::cuda::getCurrentCUDAStream();
-  auto dtype = outFeatures.scalar_type();
-  auto inds_dtype = indices.scalar_type();
-  int inds_stride = indices.size(1);
-  int feature_stride = buffer.size(1);
-
-  tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
-    using T = decltype(TValue);
-    using vecload_type_t = half_vec_sadd_t<T>;
-    tv::DispatchTorch<int_types_t>()(inds_dtype, [&](auto IndexValue) {
-      using Index = decltype(IndexValue);
-      bool notFound = true;
-      constexpr int vecloadFactor = 1; // important for half.
-
-      tv::mp_for_each<kernel_block_t>([=, &outFeatures, &buffer, &indices,
-                                       &notFound](auto NumTLP) {
-        // constexpr int NumILP = NumTLP / (64 / (NumTLP /
-        // vecloadFactor));
-        constexpr int NumILP = NumTLP / 4;
-        int nHotBlock = (size / NumTLP) * NumTLP;
-        if (notFound) {
-          if (numPlanes % NumTLP == 0) {
-            if (nHotBlock >= NumTLP) {
-              batchScatterAddBlockKernel<T, Index, int(NumTLP), NumILP>
-                  <<<dim3(size / NumTLP, numPlanes / NumTLP),
-                     dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
-                     stream>>>(outFeatures.data_ptr<T>(), buffer.data_ptr<T>(),
-                               indices.data_ptr<Index>(), nHotBlock,
-                               numPlanes / vecloadFactor, inds_stride,
-                               feature_stride);
-              TV_CHECK_CUDA_ERR();
-            }
-            if (size - nHotBlock > 0) {
-              batchScatterAddGenericKernel<T, Index, int(NumTLP), NumILP>
-                  <<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
-                     0, stream>>>(outFeatures.data_ptr<T>(),
-                                  buffer.data_ptr<T>() + nHotBlock * numPlanes,
-                                  indices.data_ptr<Index>(), size - nHotBlock,
-                                  nHotBlock, numPlanes, inds_stride,
-                                  feature_stride);
-              TV_CHECK_CUDA_ERR();
-            }
-            notFound = false;
-          }
-        }
-      });
-
-      if (notFound) {
-        constexpr int NumTLP = 64;
-        constexpr int NumILP = NumTLP / 4;
-        batchScatterAddGenericKernel<T, Index, NumTLP, NumILP>
-            <<<dim3(tv::cuda::DivUp(size, NumTLP),
-                    tv::cuda::DivUp(numPlanes, NumTLP)),
-               dim3(NumTLP / NumILP, NumTLP), 0, stream>>>(
-                outFeatures.data_ptr<T>(), buffer.data_ptr<T>(),
-                indices.data_ptr<Index>(), size, 0, numPlanes, inds_stride,
-                feature_stride);
-        TV_CHECK_CUDA_ERR();
-      }
-    });
-  });
-}
-
-} // namespace spconv
--- a/src/spconv/spconv_ops.cc
+++ b/src/spconv/spconv_ops.cc
-#include <spconv/fused_conv.h>
-#include <spconv/spconv_ops.h>
-#include <spgemm/gemm_th.h>
-#include <tensorview/tensor.h>
-
-namespace spconv {
-
-std::vector<torch::Tensor>
-getIndicePairs(torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
-               std::vector<int64_t> outSpatialShape,
-               std::vector<int64_t> spatialShape,
-               std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
-               std::vector<int64_t> padding, std::vector<int64_t> dilation,
-               std::vector<int64_t> outPadding, int64_t _subM,
-               int64_t _transpose, int64_t _useHash) {
-  // auto timer = spconv::CudaContextTimer<>();
-  bool subM = _subM != 0;
-  bool transpose = _transpose != 0;
-  auto NDim = kernelSize.size();
-  // CPU always use hash (tsl::robin_map).
-  bool useHash = _useHash != 0 || indices.device().type() == torch::kCPU;
-  auto numAct = indices.size(0);
-  auto coorDim = indices.size(1) - 1; // batchIdx + xyz
-  TV_ASSERT_RT_ERR(NDim == coorDim, "error");
-  TV_ASSERT_RT_ERR(kernelSize.size() == coorDim, "error");
-  TV_ASSERT_RT_ERR(outSpatialShape.size() == coorDim, "error");
-  TV_ASSERT_RT_ERR(stride.size() == coorDim, "error");
-  TV_ASSERT_RT_ERR(padding.size() == coorDim, "error");
-  TV_ASSERT_RT_ERR(outPadding.size() == coorDim, "error");
-  TV_ASSERT_RT_ERR(dilation.size() == coorDim, "error");
-  auto kernelVolume = kernelSize[0];
-  for (int i = 1; i < kernelSize.size(); ++i) {
-    kernelVolume *= kernelSize[i];
-  }
-  TV_ASSERT_RT_ERR(kernelVolume <= 4096, "error");
-  auto outputVolume = outSpatialShape[0];
-  for (int i = 1; i < outSpatialShape.size(); ++i) {
-    outputVolume *= outSpatialShape[i];
-  }
-  std::string msg = "due to limits of cuda hash, the volume of dense space "
-                    "include batch size ";
-  msg += "must less than std::numeric_limits<int>::max() = 2e9";
-  TV_ASSERT_RT_ERR(batchSize * outputVolume < std::numeric_limits<int>::max(),
-                   msg);
-  torch::Tensor indicePairs =
-      torch::full({2, kernelVolume, numAct}, -1,
-                  torch::dtype(torch::kInt32).device(indices.device()));
-  torch::Tensor indiceNum = torch::zeros(
-      {kernelVolume}, torch::dtype(torch::kInt32).device(indices.device()));
-  auto gridSize = batchSize * outputVolume;
-  if (useHash) {
-    gridSize = batchSize;
-  }
-  bool resetGrid = gridOut.numel() != 0;
-  if (!resetGrid) {
-    gridOut = torch::full({gridSize}, -1,
-                          torch::dtype(torch::kInt32).device(indices.device()));
-  }
-  gridOut = gridOut.view({batchSize, -1});
-  int64_t numActOut = -1;
-  for (int i = 0; i < NDim; ++i) {
-    if (subM) {
-      padding[i] = kernelSize[i] / 2;
-      stride[i] = 1;
-    }
-  }
-  // tv::ssprint("prepare", timer.report() / 1000.0);
-  if (subM) {
-    if (indices.device().type() == torch::kCPU) {
-      numActOut = create_submconv_indice_pair_cpu(
-          indices, gridOut, indicePairs, indiceNum, kernelSize, stride, padding,
-          dilation, outSpatialShape, transpose, false, useHash);
-    }
-#ifdef TV_CUDA
-    else if (indices.device().type() == torch::kCUDA) {
-      numActOut = create_submconv_indice_pair_cuda(
-          indices, gridOut, indicePairs, indiceNum, kernelSize, stride, padding,
-          dilation, outSpatialShape, transpose, resetGrid, useHash);
-      if (numActOut == -1) {
-        auto device = indices.device();
-        indicePairs = indicePairs.to({torch::kCPU});
-        indiceNum = indiceNum.to({torch::kCPU});
-        indices = indices.to({torch::kCPU});
-        numActOut = create_submconv_indice_pair_cpu(
-            indices, gridOut, indicePairs, indiceNum, kernelSize, stride,
-            padding, dilation, outSpatialShape, transpose, false, useHash);
-        return {indices.to(device), indicePairs.to(device),
-                indiceNum.to(device)};
-      }
-
-    }
-#endif
-    else {
-      TV_THROW_INVALID_ARG("unknown device type");
-    }
-    // tv::ssprint("subm", timer.report() / 1000.0);
-    return {indices, indicePairs, indiceNum};
-  } else {
-    auto indicePairUnique = torch::full(
-        {indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),
-        torch::dtype(torch::kInt32).device(indices.device()));
-    torch::Tensor outInds =
-        torch::zeros({numAct * kernelVolume, coorDim + 1},
-                     torch::dtype(torch::kInt32).device(indices.device()));
-    if (indices.device().type() == torch::kCPU) {
-      numActOut = create_conv_indice_pair_cpu(
-          indices, outInds, gridOut, indicePairs, indiceNum, kernelSize, stride,
-          padding, dilation, outSpatialShape, transpose, resetGrid, useHash);
-    }
-#ifdef TV_CUDA
-    else if (indices.device().type() == torch::kCUDA) {
-      numActOut = create_conv_indice_pair_p1_cuda(
-          indices, indicePairs, indiceNum, indicePairUnique, kernelSize, stride,
-          padding, dilation, outSpatialShape, transpose);
-      if (numActOut > 0) {
-        auto res = torch::_unique(indicePairUnique);
-        indicePairUnique = std::get<0>(res);
-        numActOut = create_conv_indice_pair_p2_cuda(
-            indices, outInds, gridOut, indicePairs, indiceNum, indicePairUnique,
-            outSpatialShape, transpose, resetGrid, useHash);
-        if (numActOut == -1) {
-          auto device = indices.device();
-          outInds = outInds.to({torch::kCPU});
-          indicePairs = indicePairs.to({torch::kCPU});
-          indiceNum = indiceNum.to({torch::kCPU});
-          indices = indices.to({torch::kCPU});
-          numActOut = create_conv_indice_pair_cpu(
-              indices, outInds, gridOut, indicePairs, indiceNum, kernelSize,
-              stride, padding, dilation, outSpatialShape, transpose, false,
-              useHash);
-
-          return {outInds.to(device).slice(0, 0, numActOut),
-                  indicePairs.to(device), indiceNum.to(device)};
-        }
-      }
-    }
-#endif
-    else {
-      TV_THROW_INVALID_ARG("unknown device type");
-    }
-    return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum};
-  }
-}
-
-torch::Tensor indiceConvNative(torch::Tensor features, torch::Tensor filters,
-                               torch::Tensor indicePairs,
-                               torch::Tensor indiceNum, int64_t numActOut,
-                               int64_t _inverse, int64_t _subM) {
-  auto kernelVolume = indiceNum.size(0);
-  // auto timer = spconv::CudaContextTimer<>();
-
-  bool subM = _subM != 0;
-  bool inverse = _inverse != 0;
-  auto device = features.device().type();
-  auto ndim = filters.dim() - 2;
-  auto numInPlanes = features.size(1);
-  auto numOutPlanes = filters.size(ndim + 1);
-  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
-
-  auto options =
-      torch::TensorOptions().dtype(features.dtype()).device(features.device());
-  torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
-  filters = filters.view({-1, numInPlanes, numOutPlanes});
-
-  // init for subM
-  int indicePairMaxOffset = kernelVolume / 2;
-  int indicePairMaxSize = numActOut;
-  if (subM) { // the center index of subm conv don't need gather and scatter
-    // add.
-    torch::mm_out(output, features, filters[indicePairMaxOffset]);
-
-    // get indice pair second max size based on subM symmetric property
-    indicePairMaxSize = *std::max_element(indicePairNumCpu.data_ptr<int>(),
-                                          indicePairNumCpu.data_ptr<int>() +
-                                              indicePairMaxOffset);
-    if (indicePairMaxSize == 0) {
-      return output;
-    }
-  } else {
-    indicePairMaxSize =
-        *std::max_element(indicePairNumCpu.data_ptr<int>(),
-                          indicePairNumCpu.data_ptr<int>() + kernelVolume);
-  }
-
-  torch::Tensor inputBuffer =
-      torch::empty({indicePairMaxSize, numInPlanes}, options);
-  torch::Tensor outputBuffer =
-      torch::empty({indicePairMaxSize, numOutPlanes}, options);
-
-  double totalGatherTime = 0;
-  double totalGEMMTime = 0;
-  double totalSAddTime = 0;
-  // tv::ssprint("first subm gemm time", timer.report() / 1000.0,
-  // std::vector<int>(indicePairNumCpu.data_ptr<int>(),
-  //                      indicePairNumCpu.data_ptr<int>() + kernelVolume));
-
-  for (int i = 0; i < kernelVolume; ++i) {
-    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
-    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
-      continue;
-    }
-    // TODO torch::from_blob is a little slow
-    auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr(),
-                                             {nHot, numOutPlanes}, options);
-    auto inputBufferBlob =
-        torch::from_blob(inputBuffer.data_ptr(), {nHot, numInPlanes}, options);
-
-    if (device == torch::kCPU) {
-      sparse_gather_cpu(inputBuffer, features, indicePairs[inverse][i], nHot);
-    }
-#ifdef TV_CUDA
-    else if (device == torch::kCUDA) {
-      sparse_gather_cuda(inputBuffer, features, indicePairs[inverse][i], nHot);
-      /* slower than SparseGatherFunctor, may due to int->long conversion
-      auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
-      auto indicePairBlob = torch::from_blob(indicePairLong.data<long>(),
-      {nHot}, indicePairOptions); torch::index_select_out(inputBufferBlob,
-      features, 0, indicePairBlob);*/
-    }
-#endif
-    else {
-      TV_THROW_INVALID_ARG("unknown device type");
-    }
-    // totalGatherTime += timer.report() / 1000.0;
-    torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
-    // totalGEMMTime += timer.report() / 1000.0;
-
-    if (device == torch::kCPU) {
-      sparse_scatter_add_cpu(outputBuffer, output, indicePairs[!inverse][i],
-                             nHot);
-    }
-#ifdef TV_CUDA
-    else if (device == torch::kCUDA) {
-      sparse_scatter_add_cuda(outputBuffer, output, indicePairs[!inverse][i],
-                              nHot);
-    }
-#endif
-    else {
-      TV_THROW_INVALID_ARG("unknown device type");
-    }
-    // totalSAddTime += timer.report() / 1000.0;
-  }
-  // tv::ssprint(totalGatherTime, totalGEMMTime, totalSAddTime);
-  // tv::ssprint("final subm gemm time", timer.report() / 1000.0);
-
-  return output;
-}
-
-template <int Algo>
-torch::Tensor indiceConvFused(torch::Tensor features, torch::Tensor filters,
-                              torch::Tensor indicePairs,
-                              torch::Tensor indiceNum, int64_t numActOut,
-                              int64_t _inverse, int64_t _subM) {
-  auto kernelVolume = indiceNum.size(0);
-  // auto timer = spconv::CudaContextTimer<>();
-  bool subM = _subM != 0;
-  bool inverse = _inverse != 0;
-  auto device = features.device().type();
-  auto ndim = filters.dim() - 2;
-  auto numInPlanes = features.size(1);
-  auto numOutPlanes = filters.size(ndim + 1);
-  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
-
-  auto options =
-      torch::TensorOptions().dtype(features.dtype()).device(features.device());
-  torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
-  filters = filters.view({-1, numInPlanes, numOutPlanes});
-
-  // init for subM
-  int indicePairMaxOffset = kernelVolume / 2;
-  if (subM) { // the center index of subm conv don't need gather and scatter
-    // add.
-    torch::mm_out(output, features, filters[indicePairMaxOffset]);
-  }
-  for (int i = 0; i < kernelVolume; ++i) {
-    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
-    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
-      continue;
-    }
-    if (device == torch::kCPU) {
-      TV_THROW_INVALID_ARG("fused only support gpu");
-    }
-#ifdef TV_CUDA
-    else if (device == torch::kCUDA) {
-      FusedConvDispatch<Algo>::fwd(output, features, filters[i],
-                                   indicePairs[inverse][i],
-                                   indicePairs[!inverse][i], nHot);
-    }
-#endif
-    else {
-      TV_THROW_INVALID_ARG("unknown device type");
-    }
-  }
-  return output;
-}
-
-template <bool BatchScatter>
-torch::Tensor indiceConvBatch(torch::Tensor features, torch::Tensor filters,
-                              torch::Tensor indicePairs,
-                              torch::Tensor indiceNum, int64_t numActOut,
-                              int64_t _inverse, int64_t _subM) {
-  bool subM = _subM != 0;
-  auto batchScatter = BatchScatter;
-  bool inverse = _inverse != 0;
-  auto device = features.device().type();
-  auto ndim = filters.dim() - 2;
-  auto kernelVolume = indiceNum.size(0);
-  TV_ASSERT_INVALID_ARG(kernelVolume > 1, "error");
-  auto numInPlanes = features.size(1);
-  auto numOutPlanes = filters.size(ndim + 1);
-  // auto timer = spconv::CudaContextTimer<>();
-  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
-  auto indicePairNumVec =
-      std::vector<int>(indicePairNumCpu.data_ptr<int>(),
-                       indicePairNumCpu.data_ptr<int>() + kernelVolume);
-  auto indicePairMaxSizeIter =
-      std::max_element(indicePairNumVec.begin(), indicePairNumVec.end());
-  int indicePairMaxOffset = indicePairMaxSizeIter - indicePairNumVec.begin();
-  int indicePairMaxSize = *indicePairMaxSizeIter;
-  std::nth_element(indicePairNumVec.begin(), indicePairNumVec.begin() + 1,
-                   indicePairNumVec.end(), std::greater<int>());
-  int indicePairTop2Size = indicePairNumVec[1];
-  auto options =
-      torch::TensorOptions().dtype(features.dtype()).device(features.device());
-  auto indice_dtype = indicePairs.scalar_type();
-  torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
-  // we cant use batch conv in subm directly because
-  // number of indice in the center of filter is much more than other
-  // filter location.
-  // so we first use top2 indice num to do batch conv, then
-  // do native conv (gemm) in center.
-  int bufferSize = subM ? indicePairTop2Size : indicePairMaxSize;
-  int maxKernelVolumePart = kernelVolume;
-  std::vector<std::pair<int, int>> part_ranges = {{0, kernelVolume}};
-  filters = filters.view({kernelVolume, numInPlanes, numOutPlanes});
-
-  if (subM) {
-    maxKernelVolumePart = std::max(indicePairMaxOffset,
-                                   int(kernelVolume - indicePairMaxOffset - 1));
-    part_ranges = {{0, indicePairMaxOffset},
-                   {indicePairMaxOffset + 1, kernelVolume}};
-    torch::mm_out(output, features, filters[indicePairMaxOffset]);
-    if (indicePairTop2Size == 0) {
-      return output;
-    }
-  }
-  // tv::ssprint("first subm gemm time", timer.report() / 1000.0);
-  double totalGatherTime = 0;
-  double totalGEMMTime = 0;
-  double totalSAddTime = 0;
-
-  torch::Tensor inputBuffer =
-      torch::empty({maxKernelVolumePart, bufferSize, numInPlanes}, options);
-  torch::Tensor outputBuffer =
-      torch::empty({maxKernelVolumePart, bufferSize, numOutPlanes}, options);
-  for (auto &range : part_ranges) {
-    int start = range.first;
-    int end = range.second;
-    int length = end - start;
-    int64_t size = length * bufferSize;
-    auto inputBufferPart = tv::torch_slice_first_axis(inputBuffer, 0, length);
-    auto outputBufferPart = tv::torch_slice_first_axis(outputBuffer, 0, length);
-    auto indicePairs1Part =
-        tv::torch_slice_first_axis(indicePairs[inverse], start, end);
-    auto indicePairs2Part =
-        tv::torch_slice_first_axis(indicePairs[!inverse], start, end);
-    auto filtersPart = tv::torch_slice_first_axis(filters, start, end);
-    if (device == torch::kCPU) {
-      TV_THROW_INVALID_ARG("unknown device type");
-    }
-#ifdef TV_CUDA
-    else if (device == torch::kCUDA) {
-      batch_sparse_gather_cuda(inputBufferPart, features, indicePairs1Part,
-                               size);
-    }
-#endif
-    else {
-      TV_THROW_INVALID_ARG("unknown device type");
-    }
-    // totalGatherTime += timer.report() / 1000.0;
-
-    torch::bmm_out(outputBufferPart, inputBufferPart, filtersPart);
-    // totalGEMMTime += timer.report() / 1000.0;
-
-    if (batchScatter) {
-      if (device == torch::kCPU) {
-        TV_THROW_INVALID_ARG("unknown device type");
-      }
-#ifdef TV_CUDA
-      else if (device == torch::kCUDA) {
-        batch_sparse_scatter_add_cuda(outputBufferPart, output,
-                                      indicePairs2Part, size);
-      }
-#endif
-      else {
-        TV_THROW_INVALID_ARG("unknown device type");
-      }
-    } else {
-      for (int i = 0; i < length; ++i) {
-        auto nHot = indicePairNumCpu.data_ptr<int>()[i + start];
-        if (nHot <= 0) {
-          continue;
-        }
-        if (device == torch::kCPU) {
-          sparse_scatter_add_cpu(outputBufferPart[i], output,
-                                 indicePairs2Part[i], nHot);
-        }
-#ifdef TV_CUDA
-        else if (device == torch::kCUDA) {
-          sparse_scatter_add_cuda(outputBufferPart[i], output,
-                                  indicePairs2Part[i], nHot);
-        }
-#endif
-        else {
-          TV_THROW_INVALID_ARG("unknown device type");
-        }
-      }
-    }
-    // totalSAddTime += timer.report() / 1000.0;
-  }
-  // tv::ssprint(totalGatherTime, totalGEMMTime, totalSAddTime);
-
-  return output;
-}
-
-std::vector<torch::Tensor>
-indiceConvBwNative(torch::Tensor features, torch::Tensor filters,
-                   torch::Tensor outGrad, torch::Tensor indicePairs,
-                   torch::Tensor indiceNum, int64_t _inverse, int64_t _subM) {
-  auto kernelVolume = indiceNum.size(0);
-  bool subM = _subM != 0;
-  bool inverse = _inverse != 0;
-
-  auto device = features.device().type();
-  auto ndim = filters.dim() - 2;
-  auto numInPlanes = features.size(1);
-  auto numOutPlanes = filters.size(ndim + 1);
-  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
-  auto options =
-      torch::TensorOptions().dtype(features.dtype()).device(features.device());
-  auto filterShape = filters.sizes();
-  torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
-  torch::Tensor filtersGrad = torch::empty(filterShape, options);
-
-  filters = filters.view({-1, numInPlanes, numOutPlanes});
-  filtersGrad = filtersGrad.view({-1, numInPlanes, numOutPlanes});
-
-  // init for subM
-  int indicePairMaxOffset = kernelVolume / 2;
-  int indicePairMaxSize = indicePairNumCpu.data_ptr<int>()[indicePairMaxOffset];
-  if (subM) {
-    auto filterGradSub = filtersGrad[indicePairMaxOffset];
-    torch::mm_out(filterGradSub, features.t(), outGrad);
-    torch::mm_out(inputGrad, outGrad, filters[indicePairMaxOffset].t());
-
-    // get indice pair second max size based on subM symmetric property
-    indicePairMaxSize = *std::max_element(indicePairNumCpu.data_ptr<int>(),
-                                          indicePairNumCpu.data_ptr<int>() +
-                                              indicePairMaxOffset);
-    if (indicePairMaxSize == 0) {
-      return {inputGrad, filtersGrad.view(filterShape)};
-    }
-  } else {
-    indicePairMaxSize =
-        *std::max_element(indicePairNumCpu.data_ptr<int>(),
-                          indicePairNumCpu.data_ptr<int>() + kernelVolume);
-  }
-
-  torch::Tensor inputBuffer =
-      torch::empty({indicePairMaxSize, numInPlanes}, options);
-  torch::Tensor outputBuffer =
-      torch::empty({indicePairMaxSize, numOutPlanes}, options);
-
-  for (int i = 0; i < kernelVolume; ++i) {
-    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
-    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
-      continue;
-    }
-    if (device == torch::kCPU) {
-      sparse_gather_cpu(inputBuffer, features, indicePairs[inverse][i], nHot);
-      sparse_gather_cpu(outputBuffer, outGrad, indicePairs[!inverse][i], nHot);
-    }
-#ifdef TV_CUDA
-    else if (device == torch::kCUDA) {
-      sparse_gather_cuda(inputBuffer, features, indicePairs[inverse][i], nHot);
-      sparse_gather_cuda(outputBuffer, outGrad, indicePairs[!inverse][i], nHot);
-    }
-#endif
-    else {
-      TV_THROW_INVALID_ARG("unknown device type");
-    }
-
-    auto filterGradSub = filtersGrad[i];
-    auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr(),
-                                             {nHot, numOutPlanes}, options);
-    auto inputBufferBlob =
-        torch::from_blob(inputBuffer.data_ptr(), {nHot, numInPlanes}, options);
-
-    torch::mm_out(filterGradSub, inputBufferBlob.t(), outputBufferBlob);
-    torch::mm_out(inputBufferBlob, outputBufferBlob, filters[i].t());
-    if (device == torch::kCPU) {
-      sparse_scatter_add_cpu(inputBuffer, inputGrad, indicePairs[inverse][i],
-                             nHot);
-    }
-#ifdef TV_CUDA
-    else if (device == torch::kCUDA) {
-      sparse_scatter_add_cuda(inputBuffer, inputGrad, indicePairs[inverse][i],
-                              nHot);
-    }
-#endif
-    else {
-      TV_THROW_INVALID_ARG("unknown device type");
-    }
-  }
-  return {inputGrad, filtersGrad.view(filterShape)};
-}
-
-template <int Algo>
-std::vector<torch::Tensor>
-indiceConvBwFused(torch::Tensor features, torch::Tensor filters,
-                  torch::Tensor outGrad, torch::Tensor indicePairs,
-                  torch::Tensor indiceNum, int64_t _inverse, int64_t _subM) {
-  auto kernelVolume = indiceNum.size(0);
-  bool subM = _subM != 0;
-  bool inverse = _inverse != 0;
-
-  auto device = features.device().type();
-  auto ndim = filters.dim() - 2;
-  auto numInPlanes = features.size(1);
-  auto numOutPlanes = filters.size(ndim + 1);
-  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
-  auto options =
-      torch::TensorOptions().dtype(features.dtype()).device(features.device());
-  auto filterShape = filters.sizes();
-  torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
-  torch::Tensor filtersGrad = torch::zeros(filterShape, options);
-
-  filters = filters.view({-1, numInPlanes, numOutPlanes});
-  filtersGrad = filtersGrad.view({-1, numInPlanes, numOutPlanes});
-
-  // init for subM
-  int indicePairMaxOffset = kernelVolume / 2;
-  int indicePairMaxSize = indicePairNumCpu.data_ptr<int>()[indicePairMaxOffset];
-  if (subM) {
-    auto filterGradSub = filtersGrad[indicePairMaxOffset];
-    torch::mm_out(filterGradSub, features.t(), outGrad);
-    torch::mm_out(inputGrad, outGrad, filters[indicePairMaxOffset].t());
-  }
-  for (int i = 0; i < kernelVolume; ++i) {
-    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
-    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
-      continue;
-    }
-    if (device == torch::kCPU) {
-      TV_THROW_INVALID_ARG("unknown device type");
-    }
-#ifdef TV_CUDA
-    else if (device == torch::kCUDA) {
-      FusedConvDispatch<Algo>::bwd(features, inputGrad, outGrad, filters[i],
-                                   filtersGrad[i], indicePairs[inverse][i],
-                                   indicePairs[!inverse][i], nHot);
-    }
-#endif
-    else {
-      TV_THROW_INVALID_ARG("unknown device type");
-    }
-  }
-  return {inputGrad, filtersGrad.view(filterShape)};
-}
-
-template <bool BatchScatter>
-std::vector<torch::Tensor>
-indiceConvBwBatch(torch::Tensor features, torch::Tensor filters,
-                  torch::Tensor outGrad, torch::Tensor indicePairs,
-                  torch::Tensor indiceNum, int64_t _inverse, int64_t _subM) {
-  bool subM = _subM != 0;
-  bool inverse = _inverse != 0;
-  auto batchScatter = BatchScatter;
-  auto device = features.device().type();
-  auto ndim = filters.dim() - 2;
-  auto kernelVolume = indiceNum.size(0);
-  TV_ASSERT_INVALID_ARG(kernelVolume > 1, "error");
-  auto numInPlanes = features.size(1);
-  auto numOutPlanes = filters.size(ndim + 1);
-  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
-  auto indicePairNumVec =
-      std::vector<int>(indicePairNumCpu.data_ptr<int>(),
-                       indicePairNumCpu.data_ptr<int>() + kernelVolume);
-  auto indicePairMaxSizeIter =
-      std::max_element(indicePairNumVec.begin(), indicePairNumVec.end());
-  int indicePairMaxOffset = indicePairMaxSizeIter - indicePairNumVec.begin();
-  int indicePairMaxSize = *indicePairMaxSizeIter;
-  std::nth_element(indicePairNumVec.begin(), indicePairNumVec.begin() + 1,
-                   indicePairNumVec.end(), std::greater<int>());
-  int indicePairTop2Size = indicePairNumVec[1];
-  auto options =
-      torch::TensorOptions().dtype(features.dtype()).device(features.device());
-  auto indice_dtype = indicePairs.scalar_type();
-  auto filterShape = filters.sizes();
-  torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
-  torch::Tensor filtersGrad = torch::zeros(filterShape, options);
-  int bufferSize = subM ? indicePairTop2Size : indicePairMaxSize;
-
-  filters = filters.view({-1, numInPlanes, numOutPlanes});
-  filtersGrad = filtersGrad.view({-1, numInPlanes, numOutPlanes});
-
-  std::vector<std::pair<int, int>> part_ranges = {{0, kernelVolume}};
-  int maxKernelVolumePart = kernelVolume;
-  if (subM) {
-    maxKernelVolumePart = std::max(indicePairMaxOffset,
-                                   int(kernelVolume - indicePairMaxOffset - 1));
-    part_ranges = {{0, indicePairMaxOffset},
-                   {indicePairMaxOffset + 1, kernelVolume}};
-    auto filtersGradSub = filtersGrad[indicePairMaxOffset];
-    auto filtersSub = filters[indicePairMaxOffset];
-    torch::mm_out(filtersGradSub, features.t(), outGrad);
-    torch::mm_out(inputGrad, outGrad, filtersSub.t());
-    if (indicePairTop2Size == 0) {
-      return {inputGrad, filtersGrad.view(filterShape)};
-    }
-  }
-  torch::Tensor inputBuffer =
-      torch::zeros({maxKernelVolumePart, bufferSize, numInPlanes}, options);
-  torch::Tensor outputBuffer =
-      torch::zeros({maxKernelVolumePart, bufferSize, numOutPlanes}, options);
-
-  for (auto &range : part_ranges) {
-    int start = range.first;
-    int end = range.second;
-    int length = end - start;
-    int64_t size = length * bufferSize;
-    auto inputBufferPart = tv::torch_slice_first_axis(inputBuffer, 0, length);
-    auto outputBufferPart = tv::torch_slice_first_axis(outputBuffer, 0, length);
-    auto indicePairs1Part =
-        tv::torch_slice_first_axis(indicePairs[inverse], start, end);
-    auto indicePairs2Part =
-        tv::torch_slice_first_axis(indicePairs[!inverse], start, end);
-    auto filtersPart = tv::torch_slice_first_axis(filters, start, end);
-    auto filtersGradPart = tv::torch_slice_first_axis(filtersGrad, start, end);
-
-    if (device == torch::kCPU) {
-      TV_THROW_INVALID_ARG("unknown device type");
-    }
-#ifdef TV_CUDA
-    else if (device == torch::kCUDA) {
-      batch_sparse_gather_cuda(inputBufferPart, features, indicePairs1Part,
-                               size);
-      batch_sparse_gather_cuda(outputBufferPart, outGrad, indicePairs2Part,
-                               size);
-    }
-#endif
-    else {
-      TV_THROW_INVALID_ARG("unknown device type");
-    }
-    // filters: KV, I, O, inputBuffer: [KV, buffer, I]
-    // outputBuffer: [KV, buffer, O]
-    torch::bmm_out(filtersGradPart, inputBufferPart.permute({0, 2, 1}),
-                   outputBufferPart);
-    torch::bmm_out(inputBuffer, outputBufferPart,
-                   filtersPart.permute({0, 2, 1}));
-    if (batchScatter) {
-      if (device == torch::kCPU) {
-        TV_THROW_INVALID_ARG("unknown device type");
-      }
-#ifdef TV_CUDA
-      else if (device == torch::kCUDA) {
-        batch_sparse_scatter_add_cuda(inputBufferPart, inputGrad,
-                                      indicePairs1Part, size);
-      }
-#endif
-      else {
-        TV_THROW_INVALID_ARG("unknown device type");
-      }
-    } else {
-      for (int i = 0; i < length; ++i) {
-        auto nHot = indicePairNumCpu.data_ptr<int>()[i + start];
-        if (nHot <= 0) {
-          continue;
-        }
-        if (device == torch::kCPU) {
-          sparse_scatter_add_cpu(inputBufferPart[i], inputGrad,
-                                 indicePairs1Part[i], nHot);
-        }
-#ifdef TV_CUDA
-        else if (device == torch::kCUDA) {
-          sparse_scatter_add_cuda(inputBufferPart[i], inputGrad,
-                                  indicePairs1Part[i], nHot);
-        }
-#endif
-        else {
-          TV_THROW_INVALID_ARG("unknown device type");
-        }
-      }
-    }
-  }
-  return {inputGrad, filtersGrad.view(filterShape)};
-}
-
-template <int Algo> struct ConvDispatch;
-
-template <> struct ConvDispatch<kNative> {
-  constexpr static auto *fwd = indiceConvNative;
-  constexpr static auto *bwd = indiceConvBwNative;
-};
-
-template <> struct ConvDispatch<kBatch> {
-  constexpr static auto *fwd = indiceConvBatch<false>;
-  constexpr static auto *bwd = indiceConvBwBatch<false>;
-};
-
-template <> struct ConvDispatch<kBatchGemmGather> {
-  constexpr static auto *fwd = indiceConvBatch<true>;
-  constexpr static auto *bwd = indiceConvBwBatch<true>;
-};
-
-template <> struct ConvDispatch<kSparseConvNet> {
-  constexpr static auto *fwd = indiceConvFused<kFSparseConvNet>;
-  constexpr static auto *bwd = indiceConvBwFused<kFSparseConvNet>;
-};
-
-template <> struct ConvDispatch<kMinkowskiEngine> {
-  constexpr static auto *fwd = indiceConvFused<kFMinkowskiEngine>;
-  constexpr static auto *bwd = indiceConvBwFused<kFMinkowskiEngine>;
-};
-
-torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
-                         torch::Tensor indicePairs, torch::Tensor indiceNum,
-                         int64_t numActOut, int64_t _inverse, int64_t _subM,
-                         int64_t algo) {
-  torch::Tensor res;
-  tv::DispatchInt<all_conv_algos_t>()(algo, [&](auto I) {
-    constexpr int AlgoValue = decltype(I)::value;
-    res = ConvDispatch<AlgoValue>::fwd(features, filters, indicePairs,
-                                       indiceNum, numActOut, _inverse, _subM);
-  });
-  return res;
-}
-
-std::vector<torch::Tensor>
-indiceConvBackward(torch::Tensor features, torch::Tensor filters,
-                   torch::Tensor outGrad, torch::Tensor indicePairs,
-                   torch::Tensor indiceNum, int64_t _inverse, int64_t _subM,
-                   int64_t algo) {
-  std::vector<torch::Tensor> res;
-  tv::DispatchInt<all_conv_algos_t>()(algo, [&](auto I) {
-    constexpr int AlgoValue = decltype(I)::value;
-    res = ConvDispatch<AlgoValue>::bwd(features, filters, outGrad, indicePairs,
-                                       indiceNum, _inverse, _subM);
-  });
-  return res;
-}
-} // namespace spconv
--- a/src/spgemm/CMakeLists.txt
+++ b/src/spgemm/CMakeLists.txt
-set(ALL_FILES ${ALL_FILES} gemm.cu)
-
-add_library(spgemm SHARED ${ALL_FILES})
-
-target_include_directories(spgemm PRIVATE ${ALL_INCLUDE} ${MP11_INCLUDE} ${CUTLASS_INCLUDE} )
-set_property(TARGET spgemm PROPERTY CUDA_STANDARD 14)
-set_property(TARGET spgemm PROPERTY CXX_STANDARD 14)
-set_target_properties(spgemm PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-target_link_libraries(spgemm PRIVATE ${ALL_LIBS})
-install (TARGETS spgemm DESTINATION lib)
--- a/src/spgemm/gemm.cu
+++ b/src/spgemm/gemm.cu
-#include <spgemm/gemm.h>
-#include <spgemm/gemm_th.h>
-namespace spconv {
-template <typename T>
-using determine_half_t =
-    std::conditional_t<std::is_same<T, at::Half>::value, cutlass::half_t, T>;
-
-void cutlass_mm_out(cudaStream_t stream, torch::Tensor c, torch::Tensor a,
-                    torch::Tensor b) {
-  TV_ASSERT_RT_ERR(c.dtype() == a.dtype() && c.dtype() == b.dtype(),
-                   "dtype must be same");
-  TV_ASSERT_RT_ERR(c.is_contiguous() && b.is_contiguous() && a.is_contiguous(),
-                   "error");
-  auto M = a.size(0);
-  auto K = a.size(1);
-  auto N = b.size(1);
-  TV_ASSERT_RT_ERR(b.size(0) == K && c.size(0) == M && c.size(1) == N, "error");
-  tv::dispatch_torch<float, at::Half>(c.scalar_type(), [&](auto I) {
-    using T = decltype(I);
-    using HalfT = determine_half_t<T>;
-    auto status = cutlassGemm<HalfT, false, false, false>(
-        stream, M, N, K, HalfT(1.0), reinterpret_cast<HalfT *>(a.data_ptr<T>()),
-        a.size(1), reinterpret_cast<HalfT *>(b.data_ptr<T>()), b.size(1),
-        HalfT(0.0), reinterpret_cast<HalfT *>(c.data_ptr<T>()), c.size(1));
-    TV_ASSERT_RT_ERR(status == cudaSuccess, "error");
-  });
-}
-
-void cutlass_mm_out(torch::Tensor c, torch::Tensor a, torch::Tensor b) {
-  auto stream = at::cuda::getCurrentCUDAStream();
-  return cutlass_mm_out(stream, c, a, b);
-}
-
-} // namespace spconv
\ No newline at end of file
--- a/src/spgemm/torchdev_cutlass.cu
+++ b/src/spgemm/torchdev_cutlass.cu
-#define TV_CUDA
-#include <cutlass/gemm/device/gemm.h>
-#include <spgemm/gemm.h>
-#include <tensorview/cuda_utils.h>
-#include <tensorview/kernel_utils.h>
-#include <tensorview/tensor.h>
-#include <tensorview/torch_utils.h>
-#include <torch/script.h>
-#include <utility/timer.h>
-
-int main() {
-  auto M = 100000;
-  auto N = 128;
-  auto K = 128;
-  auto a =
-      torch::rand({M, K}, torch::dtype(torch::kFloat32).device(torch::kCUDA));
-  auto b = torch::rand({K, N}, a.options());
-  auto c = torch::zeros({a.size(0), b.size(1)}, a.options());
-  auto c2 = torch::zeros({a.size(0), b.size(1)}, a.options());
-  torch::mm_out(c, a, b);
-  auto status = spconv::cutlassGemm<float, false, false, false>(
-      0, M, N, K, 1.0, a.data_ptr<float>(), a.size(1), b.data_ptr<float>(),
-      b.size(1), 0.0, c2.data_ptr<float>(), c2.size(1));
-  auto err = torch::norm(c2 - c);
-  tv::ssprint(status, "linalg norm", err);
-  tv::ssprint((c.view({-1}) == 0).sum());
-  auto timer = spconv::CudaContextTimer<>();
-  for (int i = 0; i < 10; ++i) {
-    torch::mm_out(c, a, b);
-    tv::ssprint("mm", timer.report() / 1000.0);
-    spconv::cutlassGemm<float, false, false, false>(
-        0, M, N, K, 1.0, a.data_ptr<float>(), a.size(1), b.data_ptr<float>(),
-        b.size(1), 0.0, c2.data_ptr<float>(), c2.size(1));
-    tv::ssprint("cutlass_mm", timer.report() / 1000.0);
-  }
-
-  return 0;
-}
\ No newline at end of file
--- a/src/utils/CMakeLists.txt
+++ b/src/utils/CMakeLists.txt
-if (SPCONV_BuildCUDA)
-    add_library(spconv_nms STATIC nms.cu)
-    set_target_properties(spconv_nms PROPERTIES VERSION ${PROJECT_VERSION})
-    set_target_properties(spconv_nms PROPERTIES SOVERSION 1)
-    target_include_directories(spconv_nms PRIVATE ${ALL_INCLUDE})
-    set_property(TARGET spconv_nms PROPERTY CXX_STANDARD 14)
-    set_property(TARGET spconv_nms PROPERTY CUDA_STANDARD 14)
-    set_property(TARGET spconv_nms PROPERTY POSITION_INDEPENDENT_CODE ON)
-    target_link_libraries(spconv_nms ${CUDA_CUDART})
-    install (TARGETS spconv_nms DESTINATION lib)
-endif()
-
-add_library(spconv_utils SHARED all.cc)
-set_target_properties(spconv_utils PROPERTIES VERSION ${PROJECT_VERSION})
-set_target_properties(spconv_utils PROPERTIES SOVERSION 1)
-target_include_directories(spconv_utils PRIVATE ${ALL_INCLUDE}
-                    ${PROJECT_SOURCE_DIR}/third_party/pybind11/include)
-set_property(TARGET spconv_utils PROPERTY CXX_STANDARD 14)
-set_property(TARGET spconv_utils PROPERTY CUDA_STANDARD 14)
-set_target_properties(spconv_utils PROPERTIES PREFIX "${PYTHON_MODULE_PREFIX}"
-                                         SUFFIX "${PYTHON_MODULE_EXTENSION}")
-if (SPCONV_BuildCUDA)
-    target_link_libraries(spconv_utils ${CUDA_CUDART} pybind11::module spconv_nms)
-else()
-    target_link_libraries(spconv_utils pybind11::module)
-endif()
-install (TARGETS spconv_utils DESTINATION lib)