format code with clang-format, better c++ code

19e73bbe · Yan Yan · c336139f · 19e73bbe · 19e73bbe · 19e73bbe
Commit 19e73bbe authored May 20, 2020 by Yan Yan
17 changed files
--- a/src/cuhash/main.cc
+++ b/src/cuhash/main.cc
-#include <cuhash/hash_table.h>
 #include <cuda.h>
+#include <cuhash/hash_table.h>

-int main(){
-    auto table = cuhash::HashTable();
-    table.Initialize(10, 2.0);
-    const int N = 10;
-
-    // ハッシュテーブルに格納するデータ
-    int keys[N] = {1, 6, 4, 9, 0, 3, 7, 2, 5, 8};
-    int vals[N] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
-
-    // デバイスメモリにコピー
-    int *d_keys, *d_vals;
-    cudaMalloc((void**)&d_keys, sizeof(int) * N);
-    cudaMemcpy(d_keys, keys, sizeof(int) * N, cudaMemcpyHostToDevice);
-    cudaMalloc((void**)&d_vals, sizeof(int) * N);
-    cudaMemcpy(d_vals, vals, sizeof(int) * N, cudaMemcpyHostToDevice);
-
-    // ハッシュテーブルにクエリするデータ
-    int input[N] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
-    int output[N];
-
-    // デバイスメモリにコピー
-    int *d_input, *d_output;
-    cudaMalloc((void**)&d_input, sizeof(int) * N);
-    cudaMemcpy(d_input, input, sizeof(int) * N, cudaMemcpyHostToDevice);
-    cudaMalloc((void**)&d_output, sizeof(int) * N);
-    cudaMemset(d_output, 0, sizeof(int) * N);
-    bool s = table.Build(N, (const unsigned int *) d_keys, 
-                            (const unsigned int *) d_vals);
-
-    std::cout << s << std::endl;
-    table.Retrieve(N, (const unsigned int *) d_input,
-                            (unsigned int *) d_output);
-
-    std::cout << s << std::endl;
-    cudaMemcpy(output, d_output, sizeof(int) * N, cudaMemcpyDeviceToHost);
-    for (int i = 0; i < N; ++i) {
-        printf("%d\n", output[i]);
-    }
-
-    return 0;
+int main() {
+  auto table = cuhash::HashTable();
+  table.Initialize(10, 2.0);
+  const int N = 10;
+
+  // ハッシュテーブルに格納するデータ
+  int keys[N] = {1, 6, 4, 9, 0, 3, 7, 2, 5, 8};
+  int vals[N] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+
+  // デバイスメモリにコピー
+  int *d_keys, *d_vals;
+  cudaMalloc((void **)&d_keys, sizeof(int) * N);
+  cudaMemcpy(d_keys, keys, sizeof(int) * N, cudaMemcpyHostToDevice);
+  cudaMalloc((void **)&d_vals, sizeof(int) * N);
+  cudaMemcpy(d_vals, vals, sizeof(int) * N, cudaMemcpyHostToDevice);
+
+  // ハッシュテーブルにクエリするデータ
+  int input[N] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+  int output[N];
+
+  // デバイスメモリにコピー
+  int *d_input, *d_output;
+  cudaMalloc((void **)&d_input, sizeof(int) * N);
+  cudaMemcpy(d_input, input, sizeof(int) * N, cudaMemcpyHostToDevice);
+  cudaMalloc((void **)&d_output, sizeof(int) * N);
+  cudaMemset(d_output, 0, sizeof(int) * N);
+  bool s = table.Build(N, (const unsigned int *)d_keys,
+                       (const unsigned int *)d_vals);
+
+  std::cout << s << std::endl;
+  table.Retrieve(N, (const unsigned int *)d_input, (unsigned int *)d_output);
+
+  std::cout << s << std::endl;
+  cudaMemcpy(output, d_output, sizeof(int) * N, cudaMemcpyDeviceToHost);
+  for (int i = 0; i < N; ++i) {
+    printf("%d\n", output[i]);
+  }
+
+  return 0;
 }
\ No newline at end of file
--- a/src/spconv/all.cc
+++ b/src/spconv/all.cc
 // Copyright 2019 Yan Yan
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
-// 
+//
 //     http://www.apache.org/licenses/LICENSE-2.0
-// 
+//
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include <torch/script.h>
-#include <spconv/pool_ops.h>
-#include <spconv/spconv_ops.h>
-#include <spconv/pillar_scatter_ops.h>
 #include <spconv/fused_spconv_ops.h>
 #include <spconv/nms_ops.h>
+#include <spconv/pillar_scatter_ops.h>
+#include <spconv/pool_ops.h>
+#include <spconv/spconv_ops.h>
+#include <torch/script.h>

 static auto registry =
    torch::RegisterOperators()
        .op("spconv::get_indice_pairs_2d", &spconv::getIndicePair<2>)
        .op("spconv::get_indice_pairs_3d", &spconv::getIndicePair<3>)
        .op("spconv::get_indice_pairs_4d", &spconv::getIndicePair<4>)
-        .op("spconv::get_indice_pairs_grid_2d", &spconv::getIndicePairPreGrid<2>)
-        .op("spconv::get_indice_pairs_grid_3d", &spconv::getIndicePairPreGrid<3>)
+        .op("spconv::get_indice_pairs_grid_2d",
+            &spconv::getIndicePairPreGrid<2>)
+        .op("spconv::get_indice_pairs_grid_3d",
+            &spconv::getIndicePairPreGrid<3>)
        .op("spconv::indice_conv", &spconv::indiceConv)
        .op("spconv::indice_conv_backward", &spconv::indiceConvBackward)
-        .op("spconv::fused_indice_conv_fp32", &spconv::fusedIndiceConvBatchNorm<float>)
-        .op("spconv::fused_indice_conv_half", &spconv::fusedIndiceConvBatchNorm<at::Half>)
+        .op("spconv::fused_indice_conv_fp32",
+            &spconv::fusedIndiceConvBatchNorm<float>)
+        .op("spconv::fused_indice_conv_half",
+            &spconv::fusedIndiceConvBatchNorm<at::Half>)
        .op("spconv::indice_maxpool_fp32", &spconv::indiceMaxPool<float>)
        .op("spconv::indice_maxpool_backward_fp32",
            &spconv::indiceMaxPoolBackward<float>)
@@ -38,4 +42,5 @@ static auto registry =
            &spconv::indiceMaxPoolBackward<at::Half>)
        .op("spconv::nms", &spconv::nonMaxSuppression<float>)
        .op("spconv::pillar_scatter_float", &spconv::pointPillarScatter<float>)
-        .op("spconv::pillar_scatter_half", &spconv::pointPillarScatter<at::Half>);
+        .op("spconv::pillar_scatter_half",
+            &spconv::pointPillarScatter<at::Half>);
--- a/src/spconv/indice.cc
+++ b/src/spconv/indice.cc
 // Copyright 2019 Yan Yan
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
-// 
+//
 //     http://www.apache.org/licenses/LICENSE-2.0
-// 
+//
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include <ATen/Parallel.h>
 #include <spconv/geometry.h>
 #include <spconv/indice.h>
 #include <spconv/spconv_ops.h>
 #include <torch/script.h>
-#include <ATen/Parallel.h>

 namespace spconv {

@@ -45,7 +45,7 @@ Index getIndicePairsConv(tv::TensorView<const Index> indicesIn,
  }
  Index numValidPoints = 0;
  std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
-  Index* validPoints = validPoints_.data();
+  Index *validPoints = validPoints_.data();
  Index *pointPtr = nullptr;
  Index hashval;
  tsl::robin_map<Index, Index> hash;
@@ -67,7 +67,7 @@ Index getIndicePairsConv(tv::TensorView<const Index> indicesIn,
        indicesOut(numAct, 0) = batchIdx;
        hashval = numAct++;
        hash[index] = hashval;
-      }else{
+      } else {
        hashval = iter->second;
      }
      // indicePairs: [K, 2, L]
@@ -102,7 +102,7 @@ Index getIndicePairsDeConv(tv::TensorView<const Index> indicesIn,
  }
  Index numValidPoints = 0;
  std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
-  Index* validPoints = validPoints_.data();
+  Index *validPoints = validPoints_.data();
  Index *pointPtr = nullptr;
  Index hashval;
  tsl::robin_map<Index, Index> hash;
@@ -125,7 +125,7 @@ Index getIndicePairsDeConv(tv::TensorView<const Index> indicesIn,
        indicesOut(numAct, 0) = batchIdx;
        hashval = numAct++;
        hash[index] = hashval;
-      }else{
+      } else {
        hashval = iter->second;
      }
      // indicePairs: [K, 2, L]
@@ -136,7 +136,6 @@ Index getIndicePairsDeConv(tv::TensorView<const Index> indicesIn,
  return numAct;
 }

-
 #ifndef TV_WINDOWS
 template <typename Index, typename IndexGrid, unsigned NDim>
 Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
@@ -145,7 +144,8 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
                         tv::TensorView<Index> indiceNum,
                         const Index *const kernelSize,
                         const Index *const stride, const Index *const padding,
-                         const Index *dilation, const Index *const outSpatialShape) {
+                         const Index *dilation,
+                         const Index *const outSpatialShape) {
  Index numAct = 0;
  auto numActIn = indicesIn.dim(0);
  Index batchIdx = 0;
@@ -167,12 +167,12 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
            spatialVolume * indicesIn(j, 0);
    hash[index] = j;
  }
-  
-  at::parallel_for(0, numActIn, 0, [&](int64_t begin, int64_t end){
+
+  at::parallel_for(0, numActIn, 0, [&](int64_t begin, int64_t end) {
    Index index = 0;
    Index numValidPoints = 0;
    std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
-    Index* validPoints = validPoints_.data();
+    Index *validPoints = validPoints_.data();
    Index *pointPtr = nullptr;
    Index oldOffset = 0;
    for (int j = begin; j < end; ++j) {
@@ -186,7 +186,7 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
                spatialVolume * indicesIn(j, 0);
        auto iter = hash.find(index);
        if (iter != hash.end()) {
-          #pragma omp atomic capture
+#pragma omp atomic capture
          oldOffset = indiceNum[offset]++;
          indicePairs(offset, 0, oldOffset) = j;
          indicePairs(offset, 1, oldOffset) = iter->second;
@@ -196,7 +196,7 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
  });
  return numActIn;
 }
-#else 
+#else
 template <typename Index, typename IndexGrid, unsigned NDim>
 Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
                         tv::TensorView<IndexGrid> gridsOut,
@@ -204,7 +204,8 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
                         tv::TensorView<Index> indiceNum,
                         const Index *const kernelSize,
                         const Index *const stride, const Index *const padding,
-                         const Index *dilation, const Index *const outSpatialShape) {
+                         const Index *dilation,
+                         const Index *const outSpatialShape) {
  Index numAct = 0;
  auto numActIn = indicesIn.dim(0);
  Index batchIdx = 0;
@@ -221,7 +222,7 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
  Index numValidPoints = 0;
  // Index validPoints[kernelVolume * (NDim + 1)];
  std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
-  Index* validPoints = validPoints_.data();
+  Index *validPoints = validPoints_.data();
  Index *pointPtr = nullptr;
  tsl::robin_map<Index, Index> hash;
  for (int j = 0; j < numActIn; ++j) {
@@ -255,57 +256,53 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
 namespace functor {
 template <typename Index, typename IndexGrid, unsigned NDim>
 struct CreateConvIndicePairFunctor<tv::CPU, Index, IndexGrid, NDim> {
-  Index operator()(const tv::CPU& d, tv::TensorView<const Index> indicesIn,
-                     tv::TensorView<Index> indicesOut,
-                     tv::TensorView<IndexGrid> gridsOut,
-                     tv::TensorView<Index> indicePairs,
-                     tv::TensorView<Index> indiceNum,
-                     const tv::SimpleVector<Index, NDim> kernelSize,
-                     const tv::SimpleVector<Index, NDim> stride,
-                     const tv::SimpleVector<Index, NDim> padding,
-                     const tv::SimpleVector<Index, NDim> dilation,
-                     const tv::SimpleVector<Index, NDim> outSpatialShape,
-                     bool transpose, bool resetGrid, bool useHash) {
+  Index operator()(const tv::CPU &d, tv::TensorView<const Index> indicesIn,
+                   tv::TensorView<Index> indicesOut,
+                   tv::TensorView<IndexGrid> gridsOut,
+                   tv::TensorView<Index> indicePairs,
+                   tv::TensorView<Index> indiceNum,
+                   const tv::SimpleVector<Index, NDim> kernelSize,
+                   const tv::SimpleVector<Index, NDim> stride,
+                   const tv::SimpleVector<Index, NDim> padding,
+                   const tv::SimpleVector<Index, NDim> dilation,
+                   const tv::SimpleVector<Index, NDim> outSpatialShape,
+                   bool transpose, bool resetGrid, bool useHash) {
    if (transpose)
      return getIndicePairsDeConv<Index, IndexGrid, NDim>(
-          indicesIn, indicesOut,
-          gridsOut, indicePairs, indiceNum,
+          indicesIn, indicesOut, gridsOut, indicePairs, indiceNum,
          kernelSize.data(), stride.data(), padding.data(), dilation.data(),
          outSpatialShape.data());
    else
      return getIndicePairsConv<Index, IndexGrid, NDim>(
-          indicesIn, indicesOut,
-          gridsOut, indicePairs, indiceNum,
+          indicesIn, indicesOut, gridsOut, indicePairs, indiceNum,
          kernelSize.data(), stride.data(), padding.data(), dilation.data(),
          outSpatialShape.data());
-
  }
 };
 template <typename Index, typename IndexGrid, unsigned NDim>
 struct CreateSubMIndicePairFunctor<tv::CPU, Index, IndexGrid, NDim> {
-  Index operator()(const tv::CPU& d, tv::TensorView<const Index> indicesIn,
-                     tv::TensorView<IndexGrid> gridsOut,
-                     tv::TensorView<Index> indicePairs,
-                     tv::TensorView<Index> indiceNum,
-                     const tv::SimpleVector<Index, NDim> kernelSize,
-                     const tv::SimpleVector<Index, NDim> stride,
-                     const tv::SimpleVector<Index, NDim> padding,
-                     const tv::SimpleVector<Index, NDim> dilation,
-                     const tv::SimpleVector<Index, NDim> outSpatialShape,
-                     bool transpose, bool resetGrid, bool useHash) {
+  Index operator()(const tv::CPU &d, tv::TensorView<const Index> indicesIn,
+                   tv::TensorView<IndexGrid> gridsOut,
+                   tv::TensorView<Index> indicePairs,
+                   tv::TensorView<Index> indiceNum,
+                   const tv::SimpleVector<Index, NDim> kernelSize,
+                   const tv::SimpleVector<Index, NDim> stride,
+                   const tv::SimpleVector<Index, NDim> padding,
+                   const tv::SimpleVector<Index, NDim> dilation,
+                   const tv::SimpleVector<Index, NDim> outSpatialShape,
+                   bool transpose, bool resetGrid, bool useHash) {
    return getIndicePairsSubM<Index, IndexGrid, NDim>(
-        indicesIn,
-        gridsOut, indicePairs, indiceNum,
-        kernelSize.data(), stride.data(), padding.data(), dilation.data(), outSpatialShape.data());
+        indicesIn, gridsOut, indicePairs, indiceNum, kernelSize.data(),
+        stride.data(), padding.data(), dilation.data(), outSpatialShape.data());
  }
 };
 } // namespace functor

 #define DECLARE_CPU_SPECS_INDEX_NDIM(Index, NDIM)                              \
-  template struct functor::CreateConvIndicePairFunctor<tv::CPU, Index, int, NDIM>;      \
-  template struct functor::CreateSubMIndicePairFunctor<tv::CPU, Index, int,  \
-                                                         NDIM>;
-
+  template struct functor::CreateConvIndicePairFunctor<tv::CPU, Index, int,    \
+                                                       NDIM>;                  \
+  template struct functor::CreateSubMIndicePairFunctor<tv::CPU, Index, int,    \
+                                                       NDIM>;

 #define DECLARE_CPU_INDEX(Index)                                               \
  DECLARE_CPU_SPECS_INDEX_NDIM(Index, 1);                                      \
@@ -320,4 +317,3 @@ DECLARE_CPU_INDEX(long);
 #undef DECLARE_CPU_SPECS_INDEX_NDIM

 } // namespace spconv
-
--- a/src/spconv/indice.cu
+++ b/src/spconv/indice.cu
@@ -14,17 +14,240 @@

 #include <ATen/ATen.h>
 #include <chrono>
+#include <cuhash/hash_table.h>
 #include <limits>
-#include <spconv/mp_helper.h>
-#include <spconv/indice.h>
 #include <spconv/indice.cu.h>
-#include <tensorview/helper_launch.h>
+#include <spconv/indice.h>
+#include <tensorview/cuda_utils.h>
+#include <tensorview/mp_helper.h>
+#include <tensorview/torch_utils.h>
+
+#include <tensorview/tensor.h>
 #include <tensorview/tensorview.h>
 #include <type_traits>
 #include <utility/timer.h>
-#include <cuhash/hash_table.h>

 namespace spconv {
+
+int create_conv_indice_pair_p1_cuda(
+    torch::Tensor indicesIn, torch::Tensor indicePairs, torch::Tensor indiceNum,
+    torch::Tensor indicePairUnique, std::vector<int64_t> kernelSize,
+    std::vector<int64_t> stride, std::vector<int64_t> padding,
+    std::vector<int64_t> dilation, std::vector<int64_t> outSpatialShape,
+    bool transpose) {
+  auto stream = at::cuda::getCurrentCUDAStream();
+  auto ndim = kernelSize.size();
+  auto numActIn = indicesIn.size(0);
+  if (numActIn == 0)
+    return 0;
+  // dispatch_torch must be in outside, this is a gcc bug, fixed in gcc 8.
+  tv::dispatch_torch<int32_t>(indicesIn.scalar_type(), [&](auto V) {
+    using Index = decltype(V);
+    using IndexGrid = int32_t;
+    tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
+      constexpr int NDim = I;
+      tv::SimpleVector<Index, NDim> ks(kernelSize.begin(), kernelSize.end());
+      tv::SimpleVector<Index, NDim> st(stride.begin(), stride.end());
+      tv::SimpleVector<Index, NDim> pa(padding.begin(), padding.end());
+      tv::SimpleVector<Index, NDim> di(dilation.begin(), dilation.end());
+      tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(),
+                                       outSpatialShape.end());
+      if (transpose) {
+        prepareDeConvIndicePairsKernel<Index, NDim, 4096>
+            <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
+               stream>>>(tv::torch2tv<Index>(indicesIn),
+                         tv::torch2tv<Index>(indicePairs),
+                         tv::torch2tv<Index>(indiceNum),
+                         tv::torch2tv<Index>(indicePairUnique), ks, st, pa, di,
+                         ou);
+      } else {
+        prepareIndicePairsKernel<Index, NDim, 4096>
+            <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
+               stream>>>(tv::torch2tv<Index>(indicesIn),
+                         tv::torch2tv<Index>(indicePairs),
+                         tv::torch2tv<Index>(indiceNum),
+                         tv::torch2tv<Index>(indicePairUnique), ks, st, pa, di,
+                         ou);
+      }
+    });
+  });
+  return 1;
+}
+
+int create_conv_indice_pair_p2_cuda(
+    torch::Tensor indicesIn, torch::Tensor indicesOut, torch::Tensor gridsOut,
+    torch::Tensor indicePairs, torch::Tensor indiceNum,
+    torch::Tensor indicePairUnique, std::vector<int64_t> outSpatialShape,
+    bool transpose, bool resetGrid, bool useHash) {
+  auto stream = at::cuda::getCurrentCUDAStream();
+  auto ndim = outSpatialShape.size();
+  auto numActIn = indicesIn.size(0);
+  int batchSize = gridsOut.size(0);
+  int numAct = indicePairUnique.size(0) - 1;
+
+  auto kernelVolume = indicePairs.size(0);
+  if (numActIn == 0)
+    return 0;
+  // dispatch_torch must be in outside, this is a gcc bug, fixed in gcc 8.
+  tv::dispatch_torch<int32_t>(indicesIn.scalar_type(), [&](auto V) {
+    using Index = decltype(V);
+    using IndexGrid = int32_t;
+    tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
+      constexpr int NDim = I;
+      using IndexGrid = int32_t;
+      tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(),
+                                       outSpatialShape.end());
+      if (useHash) {
+        auto table = cuhash::HashTable();
+        // std::cout << "create " << numAct << " size table..." << std::endl;
+        table.Initialize(numAct, 2.0, 4);
+        unsigned *d_values = nullptr;
+        cudaMalloc((void **)&d_values, sizeof(unsigned) * numAct);
+        TV_CHECK_CUDA_ERR_V2("cudaMalloc failed");
+        arangeKernel<unsigned>
+            <<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,
+               stream>>>(d_values, numAct);
+        TV_CHECK_CUDA_ERR_V2("arangeKernel failed");
+        bool res =
+            table.Build(numAct,
+                        reinterpret_cast<unsigned *>(
+                            tv::torch2tv<Index>(indicePairUnique).data()),
+                        d_values);
+        cudaFree(d_values);
+        TV_CHECK_CUDA_ERR_V2("cudaFree failed");
+        if (!res) {
+          return -1; // use -1 to tell outside use CPU implementation
+        }
+        assignIndiceOutKernel<Index, NDim>
+            <<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,
+               stream>>>(tv::torch2tv<Index>(indicesOut), numAct,
+                         tv::torch2tv<Index>(indicePairUnique), ou, batchSize);
+        auto tableSize = table.get_table_size();
+        auto tableData = table.data();
+        auto constants = table.get_constants_4();
+        auto stash_constants = table.get_stash_constants();
+        auto stash_count = table.get_stash_count();
+        assignIndicePairsHashKernel<Index, NDim>
+            <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
+               stream>>>(tv::torch2tv<Index>(indicesOut), numActIn,
+                         tv::torch2tv<Index>(indicePairs),
+                         tv::torch2tv<Index>(indicePairUnique), tableSize,
+                         tableData, constants, stash_constants, stash_count);
+
+      } else {
+        assignGridAndIndiceOutKernel<Index, IndexGrid, NDim>
+            <<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,
+               stream>>>(tv::torch2tv<Index>(indicesOut),
+                         tv::torch2tv<IndexGrid>(gridsOut), numAct,
+                         tv::torch2tv<Index>(indicePairs),
+                         tv::torch2tv<Index>(indicePairUnique), ou, batchSize);
+        TV_CHECK_CUDA_ERR_V2("assignGridAndIndiceOutKernel failed");
+        assignIndicePairsKernel<Index, IndexGrid, NDim>
+            <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
+               stream>>>(tv::torch2tv<Index>(indicesOut),
+                         tv::torch2tv<IndexGrid>(gridsOut), numAct,
+                         tv::torch2tv<Index>(indicePairs),
+                         tv::torch2tv<Index>(indicePairUnique), ou);
+        TV_CHECK_CUDA_ERR_V2("assignIndicePairsKernel failed");
+      }
+
+      if (resetGrid && (!useHash)) {
+        resetGridKernel<Index, IndexGrid, NDim>
+            <<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,
+               stream>>>(indicePairUnique.data_ptr<Index>(),
+                         tv::torch2tv<IndexGrid>(gridsOut), numAct);
+        TV_CHECK_CUDA_ERR_V2("resetGridKernel failed");
+      }
+    });
+  });
+  return numAct;
+}
+
+int create_submconv_indice_pair_cuda(
+    torch::Tensor indicesIn, torch::Tensor gridsOut, torch::Tensor indicePairs,
+    torch::Tensor indiceNum, std::vector<int64_t> kernelSize,
+    std::vector<int64_t> stride, std::vector<int64_t> padding,
+    std::vector<int64_t> dilation, std::vector<int64_t> outSpatialShape,
+    bool transpose, bool resetGrid, bool useHash) {
+  auto stream = at::cuda::getCurrentCUDAStream();
+  auto ndim = outSpatialShape.size();
+  auto numActIn = indicesIn.size(0);
+  int batchSize = gridsOut.size(0);
+
+  auto kernelVolume = indicePairs.size(0);
+  if (numActIn == 0)
+    return 0;
+  tv::dispatch_torch<int32_t>(indicesIn.scalar_type(), [&](auto V) {
+    using Index = decltype(V);
+    using IndexGrid = int32_t;
+    tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
+      constexpr int NDim = I;
+      tv::SimpleVector<Index, NDim> ks(kernelSize.begin(), kernelSize.end());
+      tv::SimpleVector<Index, NDim> st(stride.begin(), stride.end());
+      tv::SimpleVector<Index, NDim> pa(padding.begin(), padding.end());
+      tv::SimpleVector<Index, NDim> di(dilation.begin(), dilation.end());
+      tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(),
+                                       outSpatialShape.end());
+      if (useHash) {
+        auto table = cuhash::HashTable();
+        // std::cout << "create " << numAct << " size table..." << std::endl;
+        table.Initialize(numActIn, 2.0, 4);
+        unsigned *d_keyvalues = nullptr;
+        cudaMalloc((void **)&d_keyvalues, sizeof(unsigned) * numActIn * 2);
+        unsigned *d_values = d_keyvalues + numActIn;
+        TV_CHECK_CUDA_ERR_V2("cudaMalloc failed");
+        prepareSubMHashKernel<Index, NDim>
+            <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
+               stream>>>(tv::torch2tv<Index>(indicesIn), d_keyvalues, d_values,
+                         ou);
+        TV_CHECK_CUDA_ERR_V2("prepareSubMHashKernel failed");
+        bool res =
+            table.Build(numActIn, reinterpret_cast<unsigned *>(d_keyvalues),
+                        reinterpret_cast<unsigned *>(d_values));
+        cudaFree(d_values);
+        TV_CHECK_CUDA_ERR_V2("cudaFree failed");
+        if (!res) {
+          return -1; // use -1 to tell outside use CPU implementation
+        }
+        auto tableSize = table.get_table_size();
+        auto tableData = table.data();
+        auto constants = table.get_constants_4();
+        auto stash_constants = table.get_stash_constants();
+        auto stash_count = table.get_stash_count();
+        getSubMIndicePairsHashKernel<Index, NDim, 4096>
+            <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
+               stream>>>(tv::torch2tv<Index>(indicesIn),
+                         tv::torch2tv<Index>(indicePairs),
+                         tv::torch2tv<Index>(indiceNum), ks, st, pa, di, ou,
+                         tableSize, tableData, constants, stash_constants,
+                         stash_count);
+      } else {
+        prepareSubMGridKernel<Index, IndexGrid, NDim>
+            <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
+               stream>>>(tv::torch2tv<Index>(indicesIn),
+                         tv::torch2tv<IndexGrid>(gridsOut), ou);
+        TV_CHECK_CUDA_ERR_V2("prepareSubMGridKernel failed");
+        getSubMIndicePairsKernel<Index, IndexGrid, NDim, 4096>
+            <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
+               stream>>>(tv::torch2tv<Index>(indicesIn),
+                         tv::torch2tv<IndexGrid>(gridsOut),
+                         tv::torch2tv<Index>(indicePairs),
+                         tv::torch2tv<Index>(indiceNum), ks, st, pa, di, ou);
+        TV_CHECK_CUDA_ERR_V2("assignIndicePairsKernel failed");
+      }
+
+      if (resetGrid && (!useHash)) {
+        resetGridSubMKernel<Index, IndexGrid, NDim>
+            <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
+               stream>>>(indicesIn.data_ptr<Index>(),
+                         tv::torch2tv<IndexGrid>(gridsOut), ou, numActIn);
+        TV_CHECK_CUDA_ERR_V2("resetGridKernel failed");
+      }
+    });
+  });
+  return numActIn;
+}
+
 namespace functor {
 template <typename Index, typename IndexGrid, unsigned NDim>
 struct CreateConvIndicePairFunctorP1<tv::GPU, Index, IndexGrid, NDim> {
@@ -46,17 +269,17 @@ struct CreateConvIndicePairFunctorP1<tv::GPU, Index, IndexGrid, NDim> {
      return 0;
    // auto timer = spconv::CudaContextTimer<>();
    if (transpose)
-      prepareDeConvIndicePairsKernel<Index, IndexGrid, NDim, 4096>
-          <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
-             d.getStream()>>>(indicesIn, indicesOut, gridsOut, indicePairs,
-                           indiceNum, indicePairUnique, kernelSize, stride,
-                           padding, dilation, outSpatialShape);
+      prepareDeConvIndicePairsKernel<Index, NDim, 4096>
+          <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
+             d.getStream()>>>(indicesIn, indicePairs, indiceNum,
+                              indicePairUnique, kernelSize, stride, padding,
+                              dilation, outSpatialShape);
    else
-      prepareIndicePairsKernel<Index, IndexGrid, NDim, 4096>
-          <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
-             d.getStream()>>>(indicesIn, indicesOut, gridsOut, indicePairs,
-                           indiceNum, indicePairUnique, kernelSize, stride,
-                           padding, dilation, outSpatialShape);
+      prepareIndicePairsKernel<Index, NDim, 4096>
+          <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
+             d.getStream()>>>(indicesIn, indicePairs, indiceNum,
+                              indicePairUnique, kernelSize, stride, padding,
+                              dilation, outSpatialShape);
    TV_CHECK_CUDA_ERR();
    // std::cout << "p1 gene time " << timer.report() / 1000.0 << std::endl;
    return 1;
@@ -78,57 +301,58 @@ struct CreateConvIndicePairFunctorP2<tv::GPU, Index, IndexGrid, NDim> {
    auto numActIn = indicesIn.dim(0);
    if (numActIn == 0)
      return 0;
-    // after unique, there is a std::numeric_limits<int>::max() in the end of indicePairUnique
-    Index numAct = indicePairUnique.dim(0) - 1; 
-    if (useHash){
+    // after unique, there is a std::numeric_limits<int>::max() in the end of
+    // indicePairUnique
+    Index numAct = indicePairUnique.dim(0) - 1;
+    if (useHash) {
      auto table = cuhash::HashTable();
      // std::cout << "create " << numAct << " size table..." << std::endl;
      table.Initialize(numAct, 2.0, 4);
      unsigned *d_values = nullptr;
-      cudaMalloc((void**)&d_values, sizeof(unsigned) * numAct);
+      cudaMalloc((void **)&d_values, sizeof(unsigned) * numAct);
      TV_CHECK_CUDA_ERR_V2("cudaMalloc failed");
-      arangeKernel<unsigned><<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
-            d.getStream()>>>(d_values, numAct);
-      bool res = table.Build(numAct, reinterpret_cast<unsigned*>(indicePairUnique.data()), 
-                d_values);
+      arangeKernel<unsigned>
+          <<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,
+             d.getStream()>>>(d_values, numAct);
+      bool res = table.Build(
+          numAct, reinterpret_cast<unsigned *>(indicePairUnique.data()),
+          d_values);
      cudaFree(d_values);
-      if (!res){
-        return -1; //use -1 to tell outside use CPU implementation
+      if (!res) {
+        return -1; // use -1 to tell outside use CPU implementation
      }
      assignIndiceOutKernel<Index, NDim>
-          <<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
-            d.getStream()>>>(indicesOut, numAct,
-                          indicePairUnique, outSpatialShape, batchSize);
+          <<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,
+             d.getStream()>>>(indicesOut, numAct, indicePairUnique,
+                              outSpatialShape, batchSize);
      TV_CHECK_CUDA_ERR_V2("assignGridAndIndiceOutKernel failed");
      auto tableSize = table.get_table_size();
      auto tableData = table.data();
      auto constants = table.get_constants_4();
      auto stash_constants = table.get_stash_constants();
      auto stash_count = table.get_stash_count();
-      assignIndicePairsHashKernel<Index, IndexGrid, NDim>
-          <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
-            d.getStream()>>>(indicesOut, numActIn, indicePairs,
-                          indicePairUnique,
-                          tableSize, tableData, constants, stash_constants,
-                          stash_count);
+      assignIndicePairsHashKernel<Index, NDim>
+          <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
+             d.getStream()>>>(indicesOut, numActIn, indicePairs,
+                              indicePairUnique, tableSize, tableData, constants,
+                              stash_constants, stash_count);
      TV_CHECK_CUDA_ERR_V2("assignIndicePairsKernel failed");
-    }else{
+    } else {
      assignGridAndIndiceOutKernel<Index, IndexGrid, NDim>
-          <<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
-            d.getStream()>>>(indicesOut, gridsOut, numAct, indicePairs,
-                          indicePairUnique, outSpatialShape, batchSize);
+          <<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,
+             d.getStream()>>>(indicesOut, gridsOut, numAct, indicePairs,
+                              indicePairUnique, outSpatialShape, batchSize);
      TV_CHECK_CUDA_ERR_V2("assignGridAndIndiceOutKernel failed");
      assignIndicePairsKernel<Index, IndexGrid, NDim>
-          <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
-            d.getStream()>>>(indicesOut, gridsOut, numActIn, indicePairs,
-                          indicePairUnique, outSpatialShape);
+          <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
+             d.getStream()>>>(indicesOut, gridsOut, numActIn, indicePairs,
+                              indicePairUnique, outSpatialShape);
      TV_CHECK_CUDA_ERR_V2("assignIndicePairsKernel failed");
-
    }

    if (resetGrid && (!useHash)) {
      resetGridKernel<Index, IndexGrid, NDim>
-          <<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
+          <<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,
             d.getStream()>>>(indicePairUnique.data(), gridsOut, numAct);
      TV_CHECK_CUDA_ERR_V2("resetGridKernel failed");
    }
@@ -152,22 +376,25 @@ struct CreateSubMIndicePairFunctor<tv::GPU, Index, IndexGrid, NDim> {
    if (numActIn == 0)
      return 0;
    // auto timer = spconv::CudaContextTimer<>();
-    if (useHash){
+    if (useHash) {
      auto table = cuhash::HashTable();
-      // std::cout << "subm create " << numActIn << " size table..." << std::endl;
+      // std::cout << "subm create " << numActIn << " size table..." <<
+      // std::endl;
      table.Initialize(numActIn, 2.0, 4);
      unsigned *d_keyvalues = nullptr;
-      cudaMalloc((void**)&d_keyvalues, sizeof(unsigned) * numActIn * 2);
+      cudaMalloc((void **)&d_keyvalues, sizeof(unsigned) * numActIn * 2);
      unsigned *d_values = d_keyvalues + numActIn;
      prepareSubMHashKernel<Index, NDim>
-          <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
-            d.getStream()>>>(indicesIn, d_keyvalues, d_values, outSpatialShape);
+          <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
+             d.getStream()>>>(indicesIn, d_keyvalues, d_values,
+                              outSpatialShape);
      TV_CHECK_CUDA_ERR_V2("prepareSubMHashKernel failed");
-      bool res = table.Build(numActIn, reinterpret_cast<unsigned*>(d_keyvalues), 
-                reinterpret_cast<unsigned*>(d_values));
+      bool res =
+          table.Build(numActIn, reinterpret_cast<unsigned *>(d_keyvalues),
+                      reinterpret_cast<unsigned *>(d_values));
      cudaFree(d_keyvalues);
-      if (!res){
-        return -1; //use -1 to tell outside use CPU implementation
+      if (!res) {
+        return -1; // use -1 to tell outside use CPU implementation
      }
      auto tableSize = table.get_table_size();
      auto tableData = table.data();
@@ -175,28 +402,30 @@ struct CreateSubMIndicePairFunctor<tv::GPU, Index, IndexGrid, NDim> {
      auto stash_constants = table.get_stash_constants();
      auto stash_count = table.get_stash_count();
      getSubMIndicePairsHashKernel<Index, NDim, 4096>
-          <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
-            d.getStream()>>>(indicesIn, indicePairs, indiceNum,
-                          kernelSize, stride, padding, dilation, outSpatialShape,
-                          tableSize, tableData, constants, stash_constants,
-                          stash_count);
+          <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
+             d.getStream()>>>(indicesIn, indicePairs, indiceNum, kernelSize,
+                              stride, padding, dilation, outSpatialShape,
+                              tableSize, tableData, constants, stash_constants,
+                              stash_count);
      TV_CHECK_CUDA_ERR_V2("getSubMIndicePairsHashKernel failed");
-    }else{
+    } else {
      prepareSubMGridKernel<Index, IndexGrid, NDim>
-          <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
-            d.getStream()>>>(indicesIn, gridsOut, outSpatialShape);
+          <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
+             d.getStream()>>>(indicesIn, gridsOut, outSpatialShape);
      TV_CHECK_CUDA_ERR();
      getSubMIndicePairsKernel<Index, IndexGrid, NDim, 4096>
-          <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
-            d.getStream()>>>(indicesIn, gridsOut, indicePairs, indiceNum,
-                          kernelSize, stride, padding, dilation, outSpatialShape);
+          <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
+             d.getStream()>>>(indicesIn, gridsOut, indicePairs, indiceNum,
+                              kernelSize, stride, padding, dilation,
+                              outSpatialShape);
      TV_CHECK_CUDA_ERR();
    }
    // std::cout << "subm gene time " << timer.report() / 1000.0 << std::endl;
    if (resetGrid && (!useHash)) {
      resetGridSubMKernel<Index, IndexGrid, NDim>
-          <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
-             d.getStream()>>>(indicesIn.data(), gridsOut, outSpatialShape, numActIn);
+          <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
+             d.getStream()>>>(indicesIn.data(), gridsOut, outSpatialShape,
+                              numActIn);
      TV_CHECK_CUDA_ERR();
    }
    return numActIn;

--- a/src/spconv/maxpool.cu
+++ b/src/spconv/maxpool.cu
@@ -16,9 +16,9 @@
 #include <chrono>
 #include <limits>
 #include <spconv/maxpool.h>
-#include <spconv/mp_helper.h>
-#include <tensorview/helper_kernel.cu.h>
-#include <tensorview/helper_launch.h>
+#include <tensorview/cuda_utils.h>
+#include <tensorview/kernel_utils.h>
+#include <tensorview/mp_helper.h>
 #include <tensorview/tensorview.h>
 #include <type_traits>

@@ -255,7 +255,8 @@ maxPoolBwdVecBlockKernel(const T *outFeatures, const T *inFeatures,
          reinterpret_cast<const VecType *>(inFeatures)[idxi];
      reinterpret_cast<VecType *>(bufdo)[0] =
          reinterpret_cast<const VecType *>(dout)[idxo];
-      reinterpret_cast<VecType *>(bufdi)[0] = reinterpret_cast<VecType *>(din)[idxi];
+      reinterpret_cast<VecType *>(bufdi)[0] =
+          reinterpret_cast<VecType *>(din)[idxi];

 #pragma unroll
      for (int i = 0; i < vecloadFactor; i++) {
@@ -263,7 +264,8 @@ maxPoolBwdVecBlockKernel(const T *outFeatures, const T *inFeatures,
          bufdi[i] += bufdo[i];
        }
      }
-      reinterpret_cast<VecType *>(din)[idxi] = reinterpret_cast<VecType *>(bufdi)[0];
+      reinterpret_cast<VecType *>(din)[idxi] =
+          reinterpret_cast<VecType *>(bufdi)[0];
    }
  }
 }
@@ -309,7 +311,7 @@ template <typename T, typename Index>
 struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> {
  using vecload_type_t =
      std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
-  using kernel_block_t = mp_list_c<int, 64, 32, 16>;
+  using kernel_block_t = tv::mp_list_c<int, 64, 32, 16>;
  void operator()(const tv::GPU &d, tv::TensorView<T> outFeatures,
                  tv::TensorView<const T> inFeatures,
                  tv::TensorView<const Index> indices, int size) {
@@ -318,21 +320,22 @@ struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> {
    int numPlanes = inFeatures.dim(1);
    bool notFound = true;
    constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
-    mp_for_each<kernel_block_t>([=, &outFeatures, &inFeatures, &indices,
-                                 &notFound](auto NumTLP) {
+    tv::mp_for_each<kernel_block_t>([=, &outFeatures, &inFeatures, &indices,
+                                     &notFound](auto NumTLP) {
      constexpr int NumILP = NumTLP / 4;

      int numHotBlock = (size / NumTLP) * NumTLP;
      if (notFound) {
        if (numPlanes % NumTLP == 0) {
          if (numHotBlock >= NumTLP) {
-            maxPoolFwdVecBlockKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
+            maxPoolFwdVecBlockKernel<T, Index, int(NumTLP), NumILP,
+                                     vecload_type_t>
                <<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),
                   dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
                   d.getStream()>>>(outFeatures.data(), inFeatures.data(),
-                                 indices.subview(0).data(),
-                                 indices.subview(1).data(), numHotBlock,
-                                 numPlanes / vecloadFactor);
+                                    indices.subview(0).data(),
+                                    indices.subview(1).data(), numHotBlock,
+                                    numPlanes / vecloadFactor);
            TV_CHECK_CUDA_ERR();
          }

@@ -340,9 +343,9 @@ struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> {
            maxPoolFwdGenericKernel<T, Index, int(NumTLP), NumILP>
                <<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
                   0, d.getStream()>>>(outFeatures.data(), inFeatures.data(),
-                                    indices.subview(0).data() + numHotBlock,
-                                    indices.subview(1).data() + numHotBlock,
-                                    size - numHotBlock, numPlanes);
+                                       indices.subview(0).data() + numHotBlock,
+                                       indices.subview(1).data() + numHotBlock,
+                                       size - numHotBlock, numPlanes);
            TV_CHECK_CUDA_ERR();
          }
          notFound = false;
@@ -356,7 +359,7 @@ struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> {
      int numHotBlock = (size / NumTLP) * NumTLP;
      if (numHotBlock >= NumTLP) {
        maxPoolFwdGenericBlockKernel<T, Index, NumTLP, NumILP>
-            <<<dim3(size / NumTLP, tv::launch::DivUp(numPlanes, NumTLP)),
+            <<<dim3(size / NumTLP, tv::cuda::DivUp(numPlanes, NumTLP)),
               dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
                outFeatures.data(), inFeatures.data(),
                indices.subview(0).data(), indices.subview(1).data(),
@@ -366,7 +369,7 @@ struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> {

      if (size > numHotBlock) {
        maxPoolFwdGenericKernel<T, Index, NumTLP, NumILP>
-            <<<dim3(1, tv::launch::DivUp(numPlanes, NumTLP)),
+            <<<dim3(1, tv::cuda::DivUp(numPlanes, NumTLP)),
               dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
                outFeatures.data(), inFeatures.data(),
                indices.subview(0).data() + numHotBlock,
@@ -382,7 +385,7 @@ template <typename T, typename Index>
 struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {
  using vecload_type_t =
      std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
-  using kernel_block_t = mp_list_c<int, 64, 32, 16>;
+  using kernel_block_t = tv::mp_list_c<int, 64, 32, 16>;
  void operator()(const tv::GPU &d, tv::TensorView<const T> outFeatures,
                  tv::TensorView<const T> inFeatures,
                  tv::TensorView<const T> dout, tv::TensorView<T> din,
@@ -392,22 +395,23 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {
    int numPlanes = inFeatures.dim(1);
    bool notFound = true;
    constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
-    mp_for_each<kernel_block_t>([=, &outFeatures, &inFeatures, &dout, &din,
-                                 &indices, &notFound](auto NumTLP) {
+    tv::mp_for_each<kernel_block_t>([=, &outFeatures, &inFeatures, &dout, &din,
+                                     &indices, &notFound](auto NumTLP) {
      constexpr int NumILP = NumTLP / 4;

      int numHotBlock = (size / NumTLP) * NumTLP;
      if (notFound) {
        if (numPlanes % NumTLP == 0) {
          if (numHotBlock >= NumTLP) {
-            maxPoolBwdVecBlockKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
+            maxPoolBwdVecBlockKernel<T, Index, int(NumTLP), NumILP,
+                                     vecload_type_t>
                <<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),
                   dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
                   d.getStream()>>>(outFeatures.data(), inFeatures.data(),
-                                 dout.data(), din.data(),
-                                 indices.subview(0).data(),
-                                 indices.subview(1).data(), numHotBlock,
-                                 numPlanes / vecloadFactor);
+                                    dout.data(), din.data(),
+                                    indices.subview(0).data(),
+                                    indices.subview(1).data(), numHotBlock,
+                                    numPlanes / vecloadFactor);
            TV_CHECK_CUDA_ERR();
          }

@@ -415,10 +419,10 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {
            maxPoolBwdGenericKernel<T, Index, int(NumTLP), NumILP>
                <<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
                   0, d.getStream()>>>(outFeatures.data(), inFeatures.data(),
-                                    dout.data(), din.data(),
-                                    indices.subview(0).data() + numHotBlock,
-                                    indices.subview(1).data() + numHotBlock,
-                                    size - numHotBlock, numPlanes);
+                                       dout.data(), din.data(),
+                                       indices.subview(0).data() + numHotBlock,
+                                       indices.subview(1).data() + numHotBlock,
+                                       size - numHotBlock, numPlanes);
            TV_CHECK_CUDA_ERR();
          }
          notFound = false;
@@ -432,7 +436,7 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {
      int numHotBlock = (size / NumTLP) * NumTLP;
      if (numHotBlock >= NumTLP) {
        maxPoolBwdGenericBlockKernel<T, Index, NumTLP, NumILP>
-            <<<dim3(size / NumTLP, tv::launch::DivUp(numPlanes, NumTLP)),
+            <<<dim3(size / NumTLP, tv::cuda::DivUp(numPlanes, NumTLP)),
               dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
                outFeatures.data(), inFeatures.data(), dout.data(), din.data(),
                indices.subview(0).data(), indices.subview(1).data(),
@@ -442,7 +446,7 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {

      if (size > numHotBlock) {
        maxPoolBwdGenericKernel<T, Index, NumTLP, NumILP>
-            <<<dim3(1, tv::launch::DivUp(numPlanes, NumTLP)),
+            <<<dim3(1, tv::cuda::DivUp(numPlanes, NumTLP)),
               dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
                outFeatures.data(), inFeatures.data(), dout.data(), din.data(),
                indices.subview(0).data() + numHotBlock,

--- a/src/spconv/nms.cc
+++ b/src/spconv/nms.cc
@@ -21,7 +21,7 @@ namespace spconv {

 namespace functor {
 template <typename T, typename Index>
-struct NonMaxSupressionFunctor<tv::CPU, T, Index>  {
+struct NonMaxSupressionFunctor<tv::CPU, T, Index> {
  Index operator()(const tv::CPU &d, tv::TensorView<Index> keep,
                   tv::TensorView<const T> boxes, T threshold, T eps) {
    auto ndets = boxes.dim(0);
@@ -131,7 +131,7 @@ struct rotateNonMaxSupressionFunctor<tv::CPU, T, Index> {

 #define DECLARE_CPU_INDEX(Index)                                               \
  DECLARE_CPU_T_INDEX(float, Index);                                           \
-  DECLARE_CPU_T_INDEX(double, Index);                                          
+  DECLARE_CPU_T_INDEX(double, Index);

 DECLARE_CPU_INDEX(int);
 DECLARE_CPU_INDEX(long);

--- a/src/spconv/nms.cu
+++ b/src/spconv/nms.cu
@@ -2,18 +2,18 @@
 // Deformable Convolutional Networks
 // Copyright (c) 2015 Microsoft
 // Licensed under The MIT License
-// Modified from MATLAB Faster R-CNN (https://github.com/shaoqingren/faster_rcnn)
+// Modified from MATLAB Faster R-CNN
+// (https://github.com/shaoqingren/faster_rcnn)
 // ------------------------------------------------------------------

-
 #include <ATen/ATen.h>
 #include <chrono>
 #include <limits>
-#include <spconv/mp_helper.h>
-#include <spconv/reordering.h>
 #include <spconv/reordering.cu.h>
-#include <tensorview/helper_kernel.cu.h>
-#include <tensorview/helper_launch.h>
+#include <spconv/reordering.h>
+#include <tensorview/cuda_utils.h>
+#include <tensorview/kernel_utils.h>
+#include <tensorview/mp_helper.h>
 #include <tensorview/tensorview.h>
 #include <type_traits>
 #include <utility/timer.h>
@@ -22,8 +22,7 @@
 int const threadsPerBlock = sizeof(unsigned long long) * 8;

 template <typename DType>
-__device__ inline DType devIoU(DType const *const a, DType const *const b)
-{
+__device__ inline DType devIoU(DType const *const a, DType const *const b) {
  DType left = max(a[0], b[0]), right = min(a[2], b[2]);
  DType top = max(a[1], b[1]), bottom = min(a[3], b[3]);
  DType width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
@@ -35,44 +34,36 @@ __device__ inline DType devIoU(DType const *const a, DType const *const b)

 template <typename DType, int BLOCK_THREADS>
 __global__ void nms_kernel(const int n_boxes, const DType nms_overlap_thresh,
-                           const DType *dev_boxes, unsigned long long *dev_mask)
-{
+                           const DType *dev_boxes,
+                           unsigned long long *dev_mask) {
  const int row_start = blockIdx.y;
  const int col_start = blockIdx.x;

  // if (row_start > col_start) return;

-  const int row_size =
-      min(n_boxes - row_start * BLOCK_THREADS, BLOCK_THREADS);
-  const int col_size =
-      min(n_boxes - col_start * BLOCK_THREADS, BLOCK_THREADS);
+  const int row_size = min(n_boxes - row_start * BLOCK_THREADS, BLOCK_THREADS);
+  const int col_size = min(n_boxes - col_start * BLOCK_THREADS, BLOCK_THREADS);

  __shared__ DType block_boxes[BLOCK_THREADS * 5];
-  if (threadIdx.x < col_size)
-  {
+  if (threadIdx.x < col_size) {
 #pragma unroll
-    for (int i = 0; i < 5; ++i)
-    {
+    for (int i = 0; i < 5; ++i) {
      block_boxes[threadIdx.x * 5 + i] =
          dev_boxes[(BLOCK_THREADS * col_start + threadIdx.x) * 5 + i];
    }
  }
  __syncthreads();

-  if (threadIdx.x < row_size)
-  {
+  if (threadIdx.x < row_size) {
    const int cur_box_idx = BLOCK_THREADS * row_start + threadIdx.x;
    const DType *cur_box = dev_boxes + cur_box_idx * 5;
    unsigned long long t = 0;
    int start = 0;
-    if (row_start == col_start)
-    {
+    if (row_start == col_start) {
      start = threadIdx.x + 1;
    }
-    for (int i = start; i < col_size; i++)
-    {
-      if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh)
-      {
+    for (int i = start; i < col_size; i++) {
+      if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
        t |= 1ULL << i;
      }
    }
@@ -80,4 +71,3 @@ __global__ void nms_kernel(const int n_boxes, const DType nms_overlap_thresh,
    dev_mask[cur_box_idx * col_blocks + col_start] = t;
  }
 }
-
--- a/src/spconv/pillar_scatter.cu
+++ b/src/spconv/pillar_scatter.cu
@@ -15,10 +15,10 @@
 #include <ATen/ATen.h>
 #include <chrono>
 #include <limits>
-#include <spconv/mp_helper.h>
 #include <spconv/pillar_scatter_functor.h>
-#include <tensorview/helper_kernel.cu.h>
-#include <tensorview/helper_launch.h>
+#include <tensorview/cuda_utils.h>
+#include <tensorview/kernel_utils.h>
+#include <tensorview/mp_helper.h>
 #include <tensorview/tensorview.h>
 #include <type_traits>
 #include <utility/timer.h>
@@ -43,8 +43,8 @@ struct PointPillarScatter<tv::GPU, T, Index> {
  void operator()(const tv::GPU &d, tv::TensorView<T> canvas,
                  tv::TensorView<const T> features,
                  tv::TensorView<const T> coors) {
-    auto grid = dim3(tv::launch::DivUp(features.dim(1), 32),
-                     tv::launch::DivUp(features.dim(0), 32));
+    auto grid = dim3(tv::cuda::DivUp(features.dim(1), 32),
+                     tv::cuda::DivUp(features.dim(0), 32));
    pointPillarsScatterKernel<T, Index>
        <<<grid, dim3(32, 32), 0, d.getStream()>>>(canvas, features, coors);
    TV_CHECK_CUDA_ERR();

--- a/src/spconv/reordering.cc
+++ b/src/spconv/reordering.cc
 // Copyright 2019 Yan Yan
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
-// 
+//
 //     http://www.apache.org/licenses/LICENSE-2.0
-// 
+//
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include <ATen/Parallel.h>
 #include <spconv/reordering.h>
 #include <torch/script.h>
-#include <ATen/Parallel.h>

 namespace spconv {
 namespace functor {
 template <typename T, typename Index>
 struct SparseGatherFunctor<tv::CPU, T, Index> {
-  void operator()(const tv::CPU& d, tv::TensorView<T> buffer, tv::TensorView<const T> features,
+  void operator()(const tv::CPU &d, tv::TensorView<T> buffer,
+                  tv::TensorView<const T> features,
                  tv::TensorView<const Index> indices, int size) {
    int numPlanes = features.dim(1);
-    at::parallel_for(0, size, 0, [&](int64_t begin, int64_t end){
+    at::parallel_for(0, size, 0, [&](int64_t begin, int64_t end) {
      for (int i = begin; i < end; ++i) {
        std::memcpy(buffer.data() + i * numPlanes,
                    features.data() + indices[i] * numPlanes,
@@ -35,16 +36,16 @@ struct SparseGatherFunctor<tv::CPU, T, Index> {

 template <typename T, typename Index>
 struct SparseScatterAddFunctor<tv::CPU, T, Index> {
-  void operator()(const tv::CPU& d, tv::TensorView<T> outFeatures,
-                  tv::TensorView<const T> buffer, tv::TensorView<const Index> indices,
-                  int size, bool stable) {
+  void operator()(const tv::CPU &d, tv::TensorView<T> outFeatures,
+                  tv::TensorView<const T> buffer,
+                  tv::TensorView<const Index> indices, int size, bool stable) {
    int numPlanes = outFeatures.dim(1);
-    const T* buf = buffer.data();
-    T* out = outFeatures.data();
+    const T *buf = buffer.data();
+    T *out = outFeatures.data();
    for (int i = 0; i < size; ++i) {
      buf = buffer.data() + i * numPlanes;
      out = outFeatures.data() + indices[i] * numPlanes;
-      for (int j = 0; j < numPlanes; ++j){
+      for (int j = 0; j < numPlanes; ++j) {
        out[j] += buf[j];
      }
    }
@@ -53,9 +54,8 @@ struct SparseScatterAddFunctor<tv::CPU, T, Index> {

 } // namespace functor

-
-#define DECLARE_CPU_SPECS_T_INDEX(T, Index)               \
-  template struct functor::SparseGatherFunctor<tv::CPU, T, Index>;  \
+#define DECLARE_CPU_SPECS_T_INDEX(T, Index)                                    \
+  template struct functor::SparseGatherFunctor<tv::CPU, T, Index>;             \
  template struct functor::SparseScatterAddFunctor<tv::CPU, T, Index>;

 #define DECLARE_CPU_SPECS(T)                                                   \
@@ -70,4 +70,3 @@ DECLARE_CPU_SPECS(at::Half);
 #undef DECLARE_CPU_SPECS_T_INDEX

 } // namespace spconv
-
--- a/src/spconv/reordering.cu
+++ b/src/spconv/reordering.cu
@@ -15,11 +15,11 @@
 #include <ATen/ATen.h>
 #include <chrono>
 #include <limits>
-#include <spconv/mp_helper.h>
-#include <spconv/reordering.h>
 #include <spconv/reordering.cu.h>
-#include <tensorview/helper_kernel.cu.h>
-#include <tensorview/helper_launch.h>
+#include <spconv/reordering.h>
+#include <tensorview/cuda_utils.h>
+#include <tensorview/kernel_utils.h>
+#include <tensorview/mp_helper.h>
 #include <tensorview/tensorview.h>
 #include <type_traits>
 #include <utility/timer.h>
@@ -30,7 +30,7 @@ template <typename T, typename Index>
 struct SparseGatherFunctor<tv::GPU, T, Index> {
  using vecload_type_t =
      std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
-  using kernel_block_t = mp_list_c<int, 64, 32, 16>;
+  using kernel_block_t = tv::mp_list_c<int, 64, 32, 16>;
  void operator()(const tv::GPU &d, tv::TensorView<T> buffer,
                  tv::TensorView<const T> features,
                  tv::TensorView<const Index> indices, int size) {
@@ -39,8 +39,8 @@ struct SparseGatherFunctor<tv::GPU, T, Index> {
    int numPlanes = features.dim(1);
    bool notFound = true;
    constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
-    mp_for_each<kernel_block_t>([=, &buffer, &features, &indices,
-                                 &notFound](auto NumTLP) {
+    tv::mp_for_each<kernel_block_t>([=, &buffer, &features, &indices,
+                                     &notFound](auto NumTLP) {
      constexpr int NumILP = NumTLP / 4;
      // constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
      int nHotBlock = (size / NumTLP) * NumTLP;
@@ -50,8 +50,9 @@ struct SparseGatherFunctor<tv::GPU, T, Index> {
            gatherVecBlockKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
                <<<dim3(numPlanes / NumTLP, size / NumTLP),
                   dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
-                   d.getStream()>>>(buffer.data(), features.data(), indices.data(),
-                                 nHotBlock, numPlanes / vecloadFactor);
+                   d.getStream()>>>(buffer.data(), features.data(),
+                                    indices.data(), nHotBlock,
+                                    numPlanes / vecloadFactor);

            TV_CHECK_CUDA_ERR();
          }
@@ -60,8 +61,9 @@ struct SparseGatherFunctor<tv::GPU, T, Index> {
                <<<dim3(1, numPlanes / NumTLP),
                   dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
                   d.getStream()>>>(buffer.data() + nHotBlock * numPlanes,
-                                 features.data(), indices.data() + nHotBlock,
-                                 size - nHotBlock, numPlanes / vecloadFactor);
+                                    features.data(), indices.data() + nHotBlock,
+                                    size - nHotBlock,
+                                    numPlanes / vecloadFactor);
            TV_CHECK_CUDA_ERR();
          }
          notFound = false;
@@ -73,8 +75,8 @@ struct SparseGatherFunctor<tv::GPU, T, Index> {
      constexpr int NumTLP = 64;
      constexpr int NumILP = NumTLP / 4;
      gatherGenericKernel<T, Index, NumTLP, NumILP>
-          <<<dim3(tv::launch::DivUp(size, NumTLP),
-                  tv::launch::DivUp(numPlanes, NumTLP)),
+          <<<dim3(tv::cuda::DivUp(size, NumTLP),
+                  tv::cuda::DivUp(numPlanes, NumTLP)),
             dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
              buffer.data(), features.data(), indices.data(), size, numPlanes);
      TV_CHECK_CUDA_ERR();
@@ -85,7 +87,7 @@ template <typename T, typename Index>
 struct SparseScatterAddFunctor<tv::GPU, T, Index> {
  using vecload_type_t =
      std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
-  using kernel_block_t = mp_list_c<int, 64, 32, 16>;
+  using kernel_block_t = tv::mp_list_c<int, 64, 32, 16>;
  void operator()(const tv::GPU &d, tv::TensorView<T> outFeatures,
                  tv::TensorView<const T> buffer,
                  tv::TensorView<const Index> indices, int size, bool stable) {
@@ -95,8 +97,8 @@ struct SparseScatterAddFunctor<tv::GPU, T, Index> {
    bool notFound = true;
    constexpr int vecloadFactor =
        sizeof(vecload_type_t) / sizeof(T); // important for half.
-    mp_for_each<kernel_block_t>([=, &d, &outFeatures, &buffer, &indices,
-                                 &notFound](auto NumTLP) {
+    tv::mp_for_each<kernel_block_t>([=, &d, &outFeatures, &buffer, &indices,
+                                     &notFound](auto NumTLP) {
      // constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
      constexpr int NumILP = NumTLP / 4;
      int nHotBlock = (size / NumTLP) * NumTLP;
@@ -108,8 +110,8 @@ struct SparseScatterAddFunctor<tv::GPU, T, Index> {
                <<<dim3(numPlanes / NumTLP, size / NumTLP),
                   dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
                   d.getStream()>>>(outFeatures.data(), buffer.data(),
-                                 indices.data(), nHotBlock,
-                                 numPlanes / vecloadFactor);
+                                    indices.data(), nHotBlock,
+                                    numPlanes / vecloadFactor);
            TV_CHECK_CUDA_ERR();
          }
          if (size - nHotBlock > 0) {
@@ -128,8 +130,8 @@ struct SparseScatterAddFunctor<tv::GPU, T, Index> {
      constexpr int NumTLP = 64;
      constexpr int NumILP = NumTLP / 4;
      scatterAddGenericKernel<T, Index, NumTLP, NumILP>
-          <<<dim3(tv::launch::DivUp(size, NumTLP),
-                  tv::launch::DivUp(numPlanes, NumTLP)),
+          <<<dim3(tv::cuda::DivUp(size, NumTLP),
+                  tv::cuda::DivUp(numPlanes, NumTLP)),
             dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
              outFeatures.data(), buffer.data(), indices.data(), size,
              numPlanes);
@@ -139,7 +141,6 @@ struct SparseScatterAddFunctor<tv::GPU, T, Index> {
 };
 } // namespace functor

-
 #define DECLARE_GPU_SPECS_T_INDEX(T, Index)                                    \
  template struct functor::SparseGatherFunctor<tv::GPU, T, Index>;             \
  template struct functor::SparseScatterAddFunctor<tv::GPU, T, Index>;

--- a/src/spconv/spconv_ops.cc
+++ b/src/spconv/spconv_ops.cc
@@ -47,7 +47,7 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
  double totalGatherTime = 0;
  double totalGEMMTime = 0;
  double totalSAddTime = 0;
-  tv::torch_dispatch<float, double, at::Half>(
+  tv::dispatch_torch<float, double, at::Half>(
      features.scalar_type(), [&](auto I) {
        using T = decltype(I);
        for (int i = 0; i < kernelVolume; ++i) {
@@ -68,7 +68,7 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
                       tv::torch2tv<const int>(indicePairs).subview(i, inverse),
                       nHot);
          }
-#ifdef SPCONV_CUDA
+#ifdef TV_CUDA
          else if (device == torch::kCUDA) {
            functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
            gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
@@ -99,7 +99,7 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
                tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
                true);
          }
-#ifdef SPCONV_CUDA
+#ifdef TV_CUDA
          else if (device == torch::kCUDA) {
            functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
            scatterFtor(
@@ -158,7 +158,7 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
    torch::mm_out(filterGradSub, features.t(), outGrad);
    torch::mm_out(inputGrad, outGrad, filters[indicePairMaxOffset].t());
  }
-  tv::torch_dispatch<float, double,
+  tv::dispatch_torch<float, double,
                     at::Half>(features.scalar_type(), [&](auto I) {
    using T = decltype(I);
    for (int i = 0; i < kernelVolume; ++i) {
@@ -178,7 +178,7 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
                      tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
                      nHot);
      }
-#ifdef SPCONV_CUDA
+#ifdef TV_CUDA
      else if (device == torch::kCUDA) {
        functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
        functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtorOut;
@@ -213,7 +213,7 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
                    tv::torch2tv<const int>(indicePairs).subview(i, inverse),
                    nHot);
      }
-#ifdef SPCONV_CUDA
+#ifdef TV_CUDA
      else if (device == torch::kCUDA) {
        functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
        scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(inputGrad),

--- a/src/utils/all.cc
+++ b/src/utils/all.cc
@@ -20,7 +20,7 @@ using namespace pybind11::literals;

 PYBIND11_MODULE(spconv_utils, m) {
  m.doc() = "util pybind11 functions for spconv";
-#ifdef SPCONV_CUDA
+#ifdef TV_CUDA
  m.def("non_max_suppression", &spconv::non_max_suppression<double>,
        py::return_value_policy::reference_internal, "bbox iou", "boxes"_a = 1,
        "keep_out"_a = 2, "nms_overlap_thresh"_a = 3, "device_id"_a = 4);

--- a/src/utils/nms.cu
+++ b/src/utils/nms.cu
@@ -2,30 +2,28 @@
 // Deformable Convolutional Networks
 // Copyright (c) 2015 Microsoft
 // Licensed under The MIT License
-// Modified from MATLAB Faster R-CNN (https://github.com/shaoqingren/faster_rcnn)
+// Modified from MATLAB Faster R-CNN
+// (https://github.com/shaoqingren/faster_rcnn)
 // ------------------------------------------------------------------
-#include <vector>
-#include <iostream>
 #include <cuda_runtime.h>
+#include <iostream>
 #include <spconv/nms_gpu.h>
+#include <vector>

-#define CUDA_CHECK(condition)                               \
-  /* Code block avoids redefinition of cudaError_t error */ \
-  do                                                        \
-  {                                                         \
-    cudaError_t error = condition;                          \
-    if (error != cudaSuccess)                               \
-    {                                                       \
-      std::cout << cudaGetErrorString(error) << std::endl;  \
-    }                                                       \
+#define CUDA_CHECK(condition)                                                  \
+  /* Code block avoids redefinition of cudaError_t error */                    \
+  do {                                                                         \
+    cudaError_t error = condition;                                             \
+    if (error != cudaSuccess) {                                                \
+      std::cout << cudaGetErrorString(error) << std::endl;                     \
+    }                                                                          \
  } while (0)

 #define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
 int const threadsPerBlock = sizeof(unsigned long long) * 8;

 template <typename DType>
-__device__ inline DType devIoU(DType const *const a, DType const *const b)
-{
+__device__ inline DType devIoU(DType const *const a, DType const *const b) {
  DType left = max(a[0], b[0]), right = min(a[2], b[2]);
  DType top = max(a[1], b[1]), bottom = min(a[3], b[3]);
  DType width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
@@ -37,44 +35,36 @@ __device__ inline DType devIoU(DType const *const a, DType const *const b)

 template <typename DType, int BLOCK_THREADS>
 __global__ void nms_kernel(const int n_boxes, const DType nms_overlap_thresh,
-                           const DType *dev_boxes, unsigned long long *dev_mask)
-{
+                           const DType *dev_boxes,
+                           unsigned long long *dev_mask) {
  const int row_start = blockIdx.y;
  const int col_start = blockIdx.x;

  // if (row_start > col_start) return;

-  const int row_size =
-      min(n_boxes - row_start * BLOCK_THREADS, BLOCK_THREADS);
-  const int col_size =
-      min(n_boxes - col_start * BLOCK_THREADS, BLOCK_THREADS);
+  const int row_size = min(n_boxes - row_start * BLOCK_THREADS, BLOCK_THREADS);
+  const int col_size = min(n_boxes - col_start * BLOCK_THREADS, BLOCK_THREADS);

  __shared__ DType block_boxes[BLOCK_THREADS * 5];
-  if (threadIdx.x < col_size)
-  {
+  if (threadIdx.x < col_size) {
 #pragma unroll
-    for (int i = 0; i < 5; ++i)
-    {
+    for (int i = 0; i < 5; ++i) {
      block_boxes[threadIdx.x * 5 + i] =
          dev_boxes[(BLOCK_THREADS * col_start + threadIdx.x) * 5 + i];
    }
  }
  __syncthreads();

-  if (threadIdx.x < row_size)
-  {
+  if (threadIdx.x < row_size) {
    const int cur_box_idx = BLOCK_THREADS * row_start + threadIdx.x;
    const DType *cur_box = dev_boxes + cur_box_idx * 5;
    unsigned long long t = 0;
    int start = 0;
-    if (row_start == col_start)
-    {
+    if (row_start == col_start) {
      start = threadIdx.x + 1;
    }
-    for (int i = start; i < col_size; i++)
-    {
-      if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh)
-      {
+    for (int i = start; i < col_size; i++) {
+      if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
        t |= 1ULL << i;
      }
    }
@@ -83,12 +73,10 @@ __global__ void nms_kernel(const int n_boxes, const DType nms_overlap_thresh,
  }
 }

-void _set_device(int device_id)
-{
+void _set_device(int device_id) {
  int current_device;
  CUDA_CHECK(cudaGetDevice(&current_device));
-  if (current_device == device_id)
-  {
+  if (current_device == device_id) {
    return;
  }
  // The call to cudaSetDevice must come before any calls to Get, which
@@ -98,8 +86,7 @@ void _set_device(int device_id)

 template <typename DType, int BLOCK_THREADS>
 int _nms_gpu(int *keep_out, const DType *boxes_host, int boxes_num,
-          int boxes_dim, DType nms_overlap_thresh, int device_id)
-{
+             int boxes_dim, DType nms_overlap_thresh, int device_id) {
  _set_device(device_id);

  DType *boxes_dev = NULL;
@@ -107,27 +94,21 @@ int _nms_gpu(int *keep_out, const DType *boxes_host, int boxes_num,

  const int col_blocks = DIVUP(boxes_num, BLOCK_THREADS);

-  CUDA_CHECK(cudaMalloc(&boxes_dev,
-                        boxes_num * boxes_dim * sizeof(DType)));
-  CUDA_CHECK(cudaMemcpy(boxes_dev,
-                        boxes_host,
+  CUDA_CHECK(cudaMalloc(&boxes_dev, boxes_num * boxes_dim * sizeof(DType)));
+  CUDA_CHECK(cudaMemcpy(boxes_dev, boxes_host,
                        boxes_num * boxes_dim * sizeof(DType),
                        cudaMemcpyHostToDevice));

  CUDA_CHECK(cudaMalloc(&mask_dev,
                        boxes_num * col_blocks * sizeof(unsigned long long)));

-  dim3 blocks(DIVUP(boxes_num, BLOCK_THREADS),
-              DIVUP(boxes_num, BLOCK_THREADS));
+  dim3 blocks(DIVUP(boxes_num, BLOCK_THREADS), DIVUP(boxes_num, BLOCK_THREADS));
  dim3 threads(BLOCK_THREADS);
-  nms_kernel<DType, BLOCK_THREADS><<<blocks, threads>>>(boxes_num,
-                                  nms_overlap_thresh,
-                                  boxes_dev,
-                                  mask_dev);
+  nms_kernel<DType, BLOCK_THREADS>
+      <<<blocks, threads>>>(boxes_num, nms_overlap_thresh, boxes_dev, mask_dev);

  std::vector<unsigned long long> mask_host(boxes_num * col_blocks);
-  CUDA_CHECK(cudaMemcpy(&mask_host[0],
-                        mask_dev,
+  CUDA_CHECK(cudaMemcpy(&mask_host[0], mask_dev,
                        sizeof(unsigned long long) * boxes_num * col_blocks,
                        cudaMemcpyDeviceToHost));

@@ -135,17 +116,14 @@ int _nms_gpu(int *keep_out, const DType *boxes_host, int boxes_num,
  memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);

  int num_to_keep = 0;
-  for (int i = 0; i < boxes_num; i++)
-  {
+  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / BLOCK_THREADS;
    int inblock = i % BLOCK_THREADS;

-    if (!(remv[nblock] & (1ULL << inblock)))
-    {
+    if (!(remv[nblock] & (1ULL << inblock))) {
      keep_out[num_to_keep++] = i;
      unsigned long long *p = &mask_host[0] + i * col_blocks;
-      for (int j = nblock; j < col_blocks; j++)
-      {
+      for (int j = nblock; j < col_blocks; j++) {
        remv[j] |= p[j];
      }
    }
@@ -156,10 +134,15 @@ int _nms_gpu(int *keep_out, const DType *boxes_host, int boxes_num,
  return num_to_keep;
 }

-
-//template<>
-template int _nms_gpu<float, threadsPerBlock>(int *keep_out, const float *boxes_host, int boxes_num,
-          int boxes_dim, float nms_overlap_thresh, int device_id);
-//template<>
-template int _nms_gpu<double, threadsPerBlock>(int *keep_out, const double *boxes_host, int boxes_num,
-          int boxes_dim, double nms_overlap_thresh, int device_id);
\ No newline at end of file
+// template<>
+template int _nms_gpu<float, threadsPerBlock>(int *keep_out,
+                                              const float *boxes_host,
+                                              int boxes_num, int boxes_dim,
+                                              float nms_overlap_thresh,
+                                              int device_id);
+// template<>
+template int _nms_gpu<double, threadsPerBlock>(int *keep_out,
+                                               const double *boxes_host,
+                                               int boxes_num, int boxes_dim,
+                                               double nms_overlap_thresh,
+                                               int device_id);
\ No newline at end of file
--- a/test/fake_dist_train.py
+++ b/test/fake_dist_train.py
-import horovod.torch as hvd
 import time
 from pathlib import Path

@@ -12,6 +11,7 @@ from torch.utils import data
 from torch.utils.data import DataLoader, Dataset
 from torchvision import datasets, transforms

+import horovod.torch as hvd
 import spconv
 from spconv.test_utils import generate_sparse_data

@@ -53,25 +53,47 @@ class FakeClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = spconv.SparseSequential(
-            spconv.SubMConv3d(3, 8, 3, indice_key="subm1", padding=1, use_hash=False),
+            spconv.SubMConv3d(3,
+                              8,
+                              3,
+                              indice_key="subm1",
+                              padding=1,
+                              use_hash=False),
            nn.BatchNorm1d(8),
            nn.ReLU(),
            spconv.SparseConv3d(8, 16, 3, stride=2, padding=1, use_hash=False),
            nn.BatchNorm1d(16),
            nn.ReLU(),
-            spconv.SubMConv3d(16, 16, 3, indice_key="subm2", padding=1, use_hash=False),
+            spconv.SubMConv3d(16,
+                              16,
+                              3,
+                              indice_key="subm2",
+                              padding=1,
+                              use_hash=False),
            nn.BatchNorm1d(16),
            nn.ReLU(),
-            spconv.SparseConv3d(16, 32, 3, stride=2, padding=1, use_hash=False),
+            spconv.SparseConv3d(16, 32, 3, stride=2, padding=1,
+                                use_hash=False),
            nn.BatchNorm1d(32),
            nn.ReLU(),
-            spconv.SubMConv3d(32, 32, 3, indice_key="subm3", padding=1, use_hash=False),
+            spconv.SubMConv3d(32,
+                              32,
+                              3,
+                              indice_key="subm3",
+                              padding=1,
+                              use_hash=False),
            nn.BatchNorm1d(32),
            nn.ReLU(),
-            spconv.SparseConv3d(32, 64, 3, stride=2, padding=1, use_hash=False),
+            spconv.SparseConv3d(32, 64, 3, stride=2, padding=1,
+                                use_hash=False),
            nn.BatchNorm1d(64),
            nn.ReLU(),
-            spconv.SubMConv3d(64, 64, 3, indice_key="subm4", padding=1, use_hash=False),
+            spconv.SubMConv3d(64,
+                              64,
+                              3,
+                              indice_key="subm4",
+                              padding=1,
+                              use_hash=False),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            spconv.ToDense()  # [64, 2, 8, 8]
@@ -100,15 +122,16 @@ def run():
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)
    compression = hvd.Compression.none

-    optimizer = hvd.DistributedOptimizer(optimizer,
-                                         named_parameters=model.named_parameters(),
-                                         compression=compression,
-                                         op=hvd.Average)
+    optimizer = hvd.DistributedOptimizer(
+        optimizer,
+        named_parameters=model.named_parameters(),
+        compression=compression,
+        op=hvd.Average)

    for i in tqdm.tqdm(list(range(100))):
        # for j in range(4):
        #     features, indices, label = ds[(i * 4 + j) % len(ds)]
-        
+
        features, indices, label = ds[i % len(ds)]
        features_t = torch.from_numpy(features)
        indices_t = torch.from_numpy(indices)

--- a/test/fake_train.py
+++ b/test/fake_train.py
@@ -52,25 +52,47 @@ class FakeClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = spconv.SparseSequential(
-            spconv.SubMConv3d(3, 8, 3, indice_key="subm1", padding=1, use_hash=False),
+            spconv.SubMConv3d(3,
+                              8,
+                              3,
+                              indice_key="subm1",
+                              padding=1,
+                              use_hash=False),
            nn.BatchNorm1d(8),
            nn.ReLU(),
            spconv.SparseConv3d(8, 16, 3, stride=2, padding=1, use_hash=False),
            nn.BatchNorm1d(16),
            nn.ReLU(),
-            spconv.SubMConv3d(16, 16, 3, indice_key="subm2", padding=1, use_hash=False),
+            spconv.SubMConv3d(16,
+                              16,
+                              3,
+                              indice_key="subm2",
+                              padding=1,
+                              use_hash=False),
            nn.BatchNorm1d(16),
            nn.ReLU(),
-            spconv.SparseConv3d(16, 32, 3, stride=2, padding=1, use_hash=False),
+            spconv.SparseConv3d(16, 32, 3, stride=2, padding=1,
+                                use_hash=False),
            nn.BatchNorm1d(32),
            nn.ReLU(),
-            spconv.SubMConv3d(32, 32, 3, indice_key="subm3", padding=1, use_hash=False),
+            spconv.SubMConv3d(32,
+                              32,
+                              3,
+                              indice_key="subm3",
+                              padding=1,
+                              use_hash=False),
            nn.BatchNorm1d(32),
            nn.ReLU(),
-            spconv.SparseConv3d(32, 64, 3, stride=2, padding=1, use_hash=False),
+            spconv.SparseConv3d(32, 64, 3, stride=2, padding=1,
+                                use_hash=False),
            nn.BatchNorm1d(64),
            nn.ReLU(),
-            spconv.SubMConv3d(64, 64, 3, indice_key="subm4", padding=1, use_hash=False),
+            spconv.SubMConv3d(64,
+                              64,
+                              3,
+                              indice_key="subm4",
+                              padding=1,
+                              use_hash=False),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            spconv.ToDense()  # [64, 2, 8, 8]
@@ -97,7 +119,7 @@ def run():
    for i in tqdm.tqdm(list(range(100))):
        # for j in range(4):
        #     features, indices, label = ds[(i * 4 + j) % len(ds)]
-        
+
        features, indices, label = ds[i % len(ds)]
        features_t = torch.from_numpy(features)
        indices_t = torch.from_numpy(indices)

--- a/test/test_conv.py
+++ b/test/test_conv.py
 # Copyright 2019 Yan Yan
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import time
+import unittest
 from pathlib import Path
-import spconv 
+
+import numpy as np
 import torch
-from torch import nn 
-import numpy as np 
-import time 
-from spconv.test_utils import params_grid, generate_sparse_data, TestCase
-import unittest
-# import sparseconvnet as scn 
+from torch import nn
+
+import spconv
+from spconv.test_utils import TestCase, generate_sparse_data, params_grid
+
+# import sparseconvnet as scn
+

 class SparseConv3dTestTorch(nn.Module):
-    def __init__(self, num_layers, ndim, shape, in_channels, out_channels, kernel_size,
-                 stride, padding, dilation):
+    def __init__(self, num_layers, ndim, shape, in_channels, out_channels,
+                 kernel_size, stride, padding, dilation):
        super().__init__()
-        layers = [spconv.SparseConv3d(
-                in_channels,
-                out_channels,
-                kernel_size,
-                stride,
-                padding=padding,
-                dilation=dilation,
-                bias=False,
-                use_hash=True)]
+        layers = [
+            spconv.SparseConv3d(in_channels,
+                                out_channels,
+                                kernel_size,
+                                stride,
+                                padding=padding,
+                                dilation=dilation,
+                                bias=False,
+                                use_hash=True)
+        ]
        for i in range(1, num_layers):
-            layers.append(spconv.SparseConv3d(
-                out_channels,
-                out_channels,
-                kernel_size,
-                stride,
-                padding=padding,
-                dilation=dilation,
-                bias=False))
-        self.net = spconv.SparseSequential(
-            *layers,
-        )
+            layers.append(
+                spconv.SparseConv3d(out_channels,
+                                    out_channels,
+                                    kernel_size,
+                                    stride,
+                                    padding=padding,
+                                    dilation=dilation,
+                                    bias=False))
+        self.net = spconv.SparseSequential(*layers, )
        # self.grid = torch.full([3, *shape], -1, dtype=torch.int32).cuda()
        self.grid = None
        self.shape = shape

    def forward(self, features, coors, batch_size):
        coors = coors.int()
-        x = spconv.SparseConvTensor(features, coors,self.shape, batch_size, self.grid)
-        return self.net(x)# .dense()
+        x = spconv.SparseConvTensor(features, coors, self.shape, batch_size,
+                                    self.grid)
+        return self.net(x)  # .dense()
+

 class SubMConv3dTestTorch(nn.Module):
-    def __init__(self, num_layers, ndim, shape, in_channels, out_channels, kernel_size,
-                 stride, padding, dilation):
+    def __init__(self, num_layers, ndim, shape, in_channels, out_channels,
+                 kernel_size, stride, padding, dilation):
        super().__init__()
-        layers = [spconv.SubMConv3d(
-                in_channels,
-                out_channels,
-                kernel_size,
-                stride,
-                padding=padding,
-                dilation=dilation,
-                bias=False)]
+        layers = [
+            spconv.SubMConv3d(in_channels,
+                              out_channels,
+                              kernel_size,
+                              stride,
+                              padding=padding,
+                              dilation=dilation,
+                              bias=False)
+        ]
        for i in range(1, num_layers):
-            layers.append(spconv.SubMConv3d(
-                out_channels,
-                out_channels,
-                kernel_size,
-                stride,
-                padding=padding,
-                dilation=dilation,
-                bias=False))
-        self.net = spconv.SparseSequential(
-            *layers,
-        )
+            layers.append(
+                spconv.SubMConv3d(out_channels,
+                                  out_channels,
+                                  kernel_size,
+                                  stride,
+                                  padding=padding,
+                                  dilation=dilation,
+                                  bias=False))
+        self.net = spconv.SparseSequential(*layers, )
        self.grid = torch.full([3, *shape], -1, dtype=torch.int32).cuda()
        self.shape = shape

    def forward(self, features, coors, batch_size):
        coors = coors.int()
-        x = spconv.SparseConvTensor(features, coors,self.shape, batch_size, self.grid)
-        return self.net(x)# .dense()
+        x = spconv.SparseConvTensor(features, coors, self.shape, batch_size,
+                                    self.grid)
+        return self.net(x)  # .dense()


 class Conv3dTestTorch(nn.Module):
-    def __init__(self, num_layers, ndim, shape, in_channels, out_channels, kernel_size,
-                 stride, padding, dilation):
+    def __init__(self, num_layers, ndim, shape, in_channels, out_channels,
+                 kernel_size, stride, padding, dilation):
        super().__init__()
-        layers = [nn.Conv3d(
-                in_channels,
-                out_channels,
-                kernel_size,
-                stride,
-                padding=padding,
-                dilation=dilation,
-                bias=False)]
+        layers = [
+            nn.Conv3d(in_channels,
+                      out_channels,
+                      kernel_size,
+                      stride,
+                      padding=padding,
+                      dilation=dilation,
+                      bias=False)
+        ]
        for i in range(1, num_layers):
-            layers.append(nn.Conv3d(
-                out_channels,
-                out_channels,
-                kernel_size,
-                stride,
-                padding=padding,
-                dilation=dilation,
-                bias=False))
-        self.net = nn.Sequential(
-            *layers,
-        )
+            layers.append(
+                nn.Conv3d(out_channels,
+                          out_channels,
+                          kernel_size,
+                          stride,
+                          padding=padding,
+                          dilation=dilation,
+                          bias=False))
+        self.net = nn.Sequential(*layers, )
        self.shape = shape

    def forward(self, x):
-        return self.net(x)# .dense()
+        return self.net(x)  # .dense()


 class SparseDeConv3dTestTorch(nn.Module):
-    def __init__(self, num_layers, ndim, shape, in_channels, out_channels, kernel_size,
-                 stride, padding, dilation):
+    def __init__(self, num_layers, ndim, shape, in_channels, out_channels,
+                 kernel_size, stride, padding, dilation):
        super().__init__()
-        layers = [spconv.SparseConvTranspose3d(
-                in_channels,
-                out_channels,
-                kernel_size,
-                stride,
-                padding=padding,
-                dilation=dilation,
-                bias=False)]
+        layers = [
+            spconv.SparseConvTranspose3d(in_channels,
+                                         out_channels,
+                                         kernel_size,
+                                         stride,
+                                         padding=padding,
+                                         dilation=dilation,
+                                         bias=False)
+        ]
        for i in range(1, num_layers):
-            layers.append(spconv.SparseConvTranspose3d(
-                out_channels,
-                out_channels,
-                kernel_size,
-                stride,
-                padding=padding,
-                dilation=dilation,
-                bias=False))
-        self.net = spconv.SparseSequential(
-            *layers,
-        )
+            layers.append(
+                spconv.SparseConvTranspose3d(out_channels,
+                                             out_channels,
+                                             kernel_size,
+                                             stride,
+                                             padding=padding,
+                                             dilation=dilation,
+                                             bias=False))
+        self.net = spconv.SparseSequential(*layers, )
        self.shape = shape

    def forward(self, features, coors, batch_size):
        coors = coors.int()
-        x = spconv.SparseConvTensor(features, coors,self.shape, batch_size)
-        return self.net(x)# .dense()
+        x = spconv.SparseConvTensor(features, coors, self.shape, batch_size)
+        return self.net(x)  # .dense()
+

 class DeConv3dTestTorch(nn.Module):
-    def __init__(self, num_layers, ndim, shape, in_channels, out_channels, kernel_size,
-                 stride, padding, dilation):
+    def __init__(self, num_layers, ndim, shape, in_channels, out_channels,
+                 kernel_size, stride, padding, dilation):
        super().__init__()
-        layers = [nn.ConvTranspose3d(
-                in_channels,
-                out_channels,
-                kernel_size,
-                stride,
-                padding=padding,
-                dilation=dilation,
-                bias=False)]
+        layers = [
+            nn.ConvTranspose3d(in_channels,
+                               out_channels,
+                               kernel_size,
+                               stride,
+                               padding=padding,
+                               dilation=dilation,
+                               bias=False)
+        ]
        for i in range(1, num_layers):
-            layers.append(nn.ConvTranspose3d(
-                out_channels,
-                out_channels,
-                kernel_size,
-                stride,
-                padding=padding,
-                dilation=dilation,
-                bias=False))
-        self.net = nn.Sequential(
-            *layers,
-        )
+            layers.append(
+                nn.ConvTranspose3d(out_channels,
+                                   out_channels,
+                                   kernel_size,
+                                   stride,
+                                   padding=padding,
+                                   dilation=dilation,
+                                   bias=False))
+        self.net = nn.Sequential(*layers, )
        self.shape = shape

    def forward(self, x):
-        return self.net(x)# .dense()
+        return self.net(x)  # .dense()


 class SparseMaxPoolTestTorch(nn.Module):
-    def __init__(self, num_layers, ndim, shape, kernel_size,
-                 stride, padding, dilation):
+    def __init__(self, num_layers, ndim, shape, kernel_size, stride, padding,
+                 dilation):
        super().__init__()
-        layers = [spconv.SparseMaxPool3d(
-                kernel_size,
-                stride, padding, dilation)]
+        layers = [
+            spconv.SparseMaxPool3d(kernel_size, stride, padding, dilation)
+        ]
        for i in range(1, num_layers):
-            layers.append(spconv.SparseMaxPool3d(
-                kernel_size,
-                stride, padding, dilation))
-        self.net = spconv.SparseSequential(
-            *layers,
-        )
+            layers.append(
+                spconv.SparseMaxPool3d(kernel_size, stride, padding, dilation))
+        self.net = spconv.SparseSequential(*layers, )
        self.shape = shape

    def forward(self, features, coors, batch_size):
        coors = coors.int()
-        x = spconv.SparseConvTensor(features, coors, self.shape, batch_size )
-        return self.net(x)# .dense()
+        x = spconv.SparseConvTensor(features, coors, self.shape, batch_size)
+        return self.net(x)  # .dense()
+

 class MaxPool3dTestTorch(nn.Module):
-    def __init__(self, num_layers, ndim, shape, kernel_size,
-                 stride, padding, dilation):
+    def __init__(self, num_layers, ndim, shape, kernel_size, stride, padding,
+                 dilation):
        super().__init__()
-        layers = [nn.MaxPool3d(
-                kernel_size,
-                stride, padding, dilation)]
+        layers = [nn.MaxPool3d(kernel_size, stride, padding, dilation)]
        for i in range(1, num_layers):
-            layers.append(nn.MaxPool3d(
-                kernel_size,
-                stride, padding, dilation))
-        self.net = nn.Sequential(
-            *layers,
-        )
+            layers.append(nn.MaxPool3d(kernel_size, stride, padding, dilation))
+        self.net = nn.Sequential(*layers, )
        self.shape = shape

    def forward(self, x):
-        return self.net(x)# .dense()
+        return self.net(x)  # .dense()


 class SubmanifoldConvTestTorch(nn.Module):
-    def __init__(self, num_layers, ndim, shape, in_channels, out_channels, kernel_size, stride):
+    def __init__(self, num_layers, ndim, shape, in_channels, out_channels,
+                 kernel_size, stride):
        super().__init__()
-        layers = [spconv.SubMConv3d(
-                in_channels, out_channels, kernel_size, bias=False, indice_key="subm0")]
+        layers = [
+            spconv.SubMConv3d(in_channels,
+                              out_channels,
+                              kernel_size,
+                              bias=False,
+                              indice_key="subm0")
+        ]
        for i in range(1, num_layers):
-            layers.append(spconv.SubMConv3d(
-                out_channels, out_channels, kernel_size, bias=False))
-        self.net = nn.Sequential(
-            *layers,
-        )
+            layers.append(
+                spconv.SubMConv3d(out_channels,
+                                  out_channels,
+                                  kernel_size,
+                                  bias=False))
+        self.net = nn.Sequential(*layers, )
        self.shape = shape

    def forward(self, features, coors, batch_size):
        coors = coors.int()
-        x = spconv.SparseConvTensor(features, coors, self.shape, batch_size )
+        x = spconv.SparseConvTensor(features, coors, self.shape, batch_size)
        return self.net(x)

+
 class SCNCoupleDeConvTest(nn.Module):
-    def __init__(self, num_layers, ndim, shape, in_channels, out_channels, kernel_size,
-                 stride):
+    def __init__(self, num_layers, ndim, shape, in_channels, out_channels,
+                 kernel_size, stride):
        super().__init__()
        self.scn_input = scn.InputLayer(ndim, shape, mode=0)
        self.net = nn.Sequential(
-            scn.Convolution(
-                ndim,
-                in_channels,
-                out_channels,
-                kernel_size,
-                stride,
-                bias=False),
-            scn.Deconvolution(
-                ndim,
-                out_channels,
-                in_channels,
-                kernel_size,
-                stride,
-                bias=False),
+            scn.Convolution(ndim,
+                            in_channels,
+                            out_channels,
+                            kernel_size,
+                            stride,
+                            bias=False),
+            scn.Deconvolution(ndim,
+                              out_channels,
+                              in_channels,
+                              kernel_size,
+                              stride,
+                              bias=False),
            scn.SparseToDense(ndim, in_channels),
        )

@@ -267,44 +268,44 @@ class SCNCoupleDeConvTest(nn.Module):
        x = self.scn_input((coors, features))
        return self.net(x)

+
 class SparseCoupleDeConvTest(nn.Module):
-    def __init__(self, num_layers, ndim, shape, in_channels, out_channels, kernel_size,
-                 stride):
+    def __init__(self, num_layers, ndim, shape, in_channels, out_channels,
+                 kernel_size, stride):
        super().__init__()
        self.net = spconv.SparseSequential(
-            spconv.SparseConv3d(
-                in_channels,
-                out_channels,
-                kernel_size,
-                stride,
-                indice_key="cp0",
-                bias=False),
-            spconv.SparseInverseConv3d(
-                out_channels,
-                in_channels,
-                kernel_size,
-                indice_key="cp0",
-                bias=False),
-            
+            spconv.SparseConv3d(in_channels,
+                                out_channels,
+                                kernel_size,
+                                stride,
+                                indice_key="cp0",
+                                bias=False),
+            spconv.SparseInverseConv3d(out_channels,
+                                       in_channels,
+                                       kernel_size,
+                                       indice_key="cp0",
+                                       bias=False),
        )
        self.todense = spconv.ToDense()
        self.shape = shape

    def forward(self, features, coors, batch_size):
        coors = coors.int()
-        x = spconv.SparseConvTensor(features, coors,self.shape, batch_size )
-        return self.todense(self.net(x))# .dense()
+        x = spconv.SparseConvTensor(features, coors, self.shape, batch_size)
+        return self.todense(self.net(x))  # .dense()


 def gather_nd(params, indices):
    # this function has a limit that MAX_ADVINDEX_CALC_DIMS=5
    ndim = indices.shape[-1]
-    output_shape = list(indices.shape[:-1]) + list(params.shape[indices.shape[-1]:])
+    output_shape = list(indices.shape[:-1]) + list(
+        params.shape[indices.shape[-1]:])
    flatted_indices = indices.view(-1, ndim)
    slices = [flatted_indices[:, i] for i in range(ndim)]
    slices += [Ellipsis]
    return params[slices].view(*output_shape)

+
 def scatter_nd(indices, updates, shape):
    """pytorch edition of tensorflow scatter_nd.
    this function don't contain except handle code. so use this carefully
@@ -322,7 +323,6 @@ def scatter_nd(indices, updates, shape):


 class TestSpConv(TestCase):
-    
    def testSpConv3d(self):
        np.random.seed(484)
        devices = ["cpu:0"]
@@ -337,36 +337,44 @@ class TestSpConv(TestCase):
        dilations = [1, 2, 3]

        for dev, shape, bs, IC, OC, k, s, p, d in params_grid(
-            devices, shapes, batchsizes, in_channels, out_channels, ksizes, 
-            strides, paddings, dilations):
+                devices, shapes, batchsizes, in_channels, out_channels, ksizes,
+                strides, paddings, dilations):
            if all([s > 1, d > 1]):
-                continue # don't support this.
+                continue  # don't support this.
            device = torch.device(dev)
            num_points = [1000] * bs

            sparse_dict = generate_sparse_data(shape, num_points, IC)

-            features = np.ascontiguousarray(sparse_dict["features"]).astype(np.float32)
-            indices = np.ascontiguousarray(sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
+            features = np.ascontiguousarray(sparse_dict["features"]).astype(
+                np.float32)
+            indices = np.ascontiguousarray(
+                sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
            features_dense = sparse_dict["features_dense"].astype(np.float32)
-            filters = np.random.uniform(0, 1, size=[k, k, k, IC, OC]).astype(np.float32)
+            filters = np.random.uniform(0, 1, size=[k, k, k, IC,
+                                                    OC]).astype(np.float32)
            indices_t = torch.from_numpy(indices).int().to(device)
            features_t = torch.from_numpy(features).to(device)
            features_t.requires_grad = True
            features_dense_t = torch.from_numpy(features_dense).to(device)
            features_dense_t.requires_grad = True
-            net = SparseConv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d).to(device)
-            net_ref = Conv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d).to(device)
+            net = SparseConv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
+                                        d).to(device)
+            net_ref = Conv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
+                                      d).to(device)
            filters_t = torch.from_numpy(filters).to(device)
-            net_ref.net[0].weight.data[:] = filters_t.permute(4, 3, 0, 1, 2).contiguous()
+            net_ref.net[0].weight.data[:] = filters_t.permute(4, 3, 0, 1,
+                                                              2).contiguous()
            net.net[0].weight.data[:] = filters_t
            out_ref = net_ref(features_dense_t)
            out = net(features_t, indices_t, bs).dense()
-            dout = np.random.uniform(-0.2, 0.2, out_ref.shape).astype(features.dtype)
+            dout = np.random.uniform(-0.2, 0.2,
+                                     out_ref.shape).astype(features.dtype)
            dout_t = torch.from_numpy(dout).to(device)
            out.backward(dout_t)
            out_ref.backward(dout_t)
-            din_dense = features_dense_t.grad.detach().permute(0, 2, 3, 4, 1).contiguous()
+            din_dense = features_dense_t.grad.detach().permute(0, 2, 3, 4,
+                                                               1).contiguous()
            din_sparse = gather_nd(din_dense, indices_t.long())
            din = features_t.grad.detach()
            din_np = din.cpu().numpy()
@@ -381,7 +389,7 @@ class TestSpConv(TestCase):
            out_np = out.detach().cpu().numpy()
            out_ref_np = out_ref.detach().cpu().numpy()
            self.assertAllClose(out_np, out_ref_np, atol=1e-4)
-    
+
    def testSpDeConv3d(self):
        np.random.seed(484)
        devices = ["cuda:0", "cpu:0"]
@@ -396,36 +404,44 @@ class TestSpConv(TestCase):
        dilations = [1, 2, 3]

        for dev, shape, bs, IC, OC, k, s, p, d in params_grid(
-            devices, shapes, batchsizes, in_channels, out_channels, ksizes, 
-            strides, paddings, dilations):
+                devices, shapes, batchsizes, in_channels, out_channels, ksizes,
+                strides, paddings, dilations):
            if all([s > 1, d > 1]):
-                continue # don't support this.
+                continue  # don't support this.
            device = torch.device(dev)
            num_points = [1000] * bs

            sparse_dict = generate_sparse_data(shape, num_points, IC)

-            features = np.ascontiguousarray(sparse_dict["features"]).astype(np.float32)
-            indices = np.ascontiguousarray(sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
+            features = np.ascontiguousarray(sparse_dict["features"]).astype(
+                np.float32)
+            indices = np.ascontiguousarray(
+                sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
            features_dense = sparse_dict["features_dense"].astype(np.float32)
-            filters = np.random.uniform(0, 1, size=[k, k, k, IC, OC]).astype(np.float32)
+            filters = np.random.uniform(0, 1, size=[k, k, k, IC,
+                                                    OC]).astype(np.float32)
            indices_t = torch.from_numpy(indices).int().to(device)
            features_t = torch.from_numpy(features).to(device)
            features_t.requires_grad = True
            features_dense_t = torch.from_numpy(features_dense).to(device)
            features_dense_t.requires_grad = True
-            net = SparseDeConv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d).to(device)
-            net_ref = DeConv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d).to(device)
+            net = SparseDeConv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
+                                          d).to(device)
+            net_ref = DeConv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
+                                        d).to(device)
            filters_t = torch.from_numpy(filters).to(device)
-            net_ref.net[0].weight.data[:] = filters_t.permute(3, 4, 0, 1, 2).contiguous()
+            net_ref.net[0].weight.data[:] = filters_t.permute(3, 4, 0, 1,
+                                                              2).contiguous()
            net.net[0].weight.data[:] = filters_t
            out_ref = net_ref(features_dense_t)
            out = net(features_t, indices_t, bs).dense()
-            dout = np.random.uniform(-0.2, 0.2, out_ref.shape).astype(features.dtype)
+            dout = np.random.uniform(-0.2, 0.2,
+                                     out_ref.shape).astype(features.dtype)
            dout_t = torch.from_numpy(dout).to(device)
            out.backward(dout_t)
            out_ref.backward(dout_t)
-            din_dense = features_dense_t.grad.detach().permute(0, 2, 3, 4, 1).contiguous()
+            din_dense = features_dense_t.grad.detach().permute(0, 2, 3, 4,
+                                                               1).contiguous()
            din_sparse = gather_nd(din_dense, indices_t.long())
            din = features_t.grad.detach()
            din_np = din.cpu().numpy()
@@ -440,7 +456,7 @@ class TestSpConv(TestCase):
            out_np = out.detach().cpu().numpy()
            out_ref_np = out_ref.detach().cpu().numpy()
            self.assertAllClose(out_np, out_ref_np, atol=1e-4)
-    
+
    def testSpCpConv3d(self):
        np.random.seed(484)
        devices = ["cuda:0", "cpu:0"]
@@ -455,19 +471,23 @@ class TestSpConv(TestCase):
        dilations = [1, 2, 3]

        for dev, shape, bs, IC, OC, k, s in params_grid(
-            devices, shapes, batchsizes, in_channels, out_channels, ksizes, 
-            strides):
+                devices, shapes, batchsizes, in_channels, out_channels, ksizes,
+                strides):
            device = torch.device(dev)
            num_points = [1000] * bs

            sparse_dict = generate_sparse_data(shape, num_points, IC)

-            features = np.ascontiguousarray(sparse_dict["features"]).astype(np.float32)
-            indices = np.ascontiguousarray(sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
+            features = np.ascontiguousarray(sparse_dict["features"]).astype(
+                np.float32)
+            indices = np.ascontiguousarray(
+                sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
            features_dense = sparse_dict["features_dense"].astype(np.float32)
-            filters = np.random.uniform(0, 1, size=[k, k, k, IC, OC]).astype(np.float32)
+            filters = np.random.uniform(0, 1, size=[k, k, k, IC,
+                                                    OC]).astype(np.float32)
            indices_t = torch.from_numpy(indices).int().to(device)
-            indices_scn_t = torch.from_numpy(indices[:, [1, 2, 3, 0]]).int().to(device)
+            indices_scn_t = torch.from_numpy(
+                indices[:, [1, 2, 3, 0]]).int().to(device)
            features_t = torch.from_numpy(features).to(device)
            features_t.requires_grad = True
            features_ref_t = torch.from_numpy(features).to(device)
@@ -475,11 +495,14 @@ class TestSpConv(TestCase):

            net_ref = SCNCoupleDeConvTest(1, 3, shape, IC, OC, k, s).to(device)
            net = SparseCoupleDeConvTest(1, 3, shape, IC, OC, k, s).to(device)
-            net_ref.net[0].weight.data[:] = net.net[0].weight.data[:].view(*net_ref.net[0].weight.shape)
-            net_ref.net[1].weight.data[:] = net.net[1].weight.data[:].view(*net_ref.net[1].weight.shape)
+            net_ref.net[0].weight.data[:] = net.net[0].weight.data[:].view(
+                *net_ref.net[0].weight.shape)
+            net_ref.net[1].weight.data[:] = net.net[1].weight.data[:].view(
+                *net_ref.net[1].weight.shape)
            out_ref = net_ref(features_ref_t, indices_scn_t, bs)
            out = net(features_t, indices_t, bs)
-            dout = np.random.uniform(-0.2, 0.2, out_ref.shape).astype(features.dtype)
+            dout = np.random.uniform(-0.2, 0.2,
+                                     out_ref.shape).astype(features.dtype)
            dout_t = torch.from_numpy(dout).to(device)
            out.backward(dout_t)
            out_ref.backward(dout_t)
@@ -490,15 +513,14 @@ class TestSpConv(TestCase):
            self.assertAllClose(din_ref_np, din_np, atol=1e-4)
            for layer, layer_ref in zip(net.net, net_ref.net):
                dw = layer.weight.grad.detach().cpu().numpy()
-                dw_ref = layer_ref.weight.grad.detach().cpu().view(*dw.shape).numpy()
+                dw_ref = layer_ref.weight.grad.detach().cpu().view(
+                    *dw.shape).numpy()
                self.assertAllClose(dw, dw_ref, atol=1e-4)

            out_np = out.detach().cpu().numpy()
            out_ref_np = out_ref.detach().cpu().numpy()
            self.assertAllClose(out_np, out_ref_np, atol=1e-4)

-
-    
    def testSpMaxPool3d(self):
        np.random.seed(485)
        devices = ["cuda:0", "cpu:0"]
@@ -513,19 +535,25 @@ class TestSpConv(TestCase):
        dilations = [1, 2, 3]

        for dev, shape, bs, IC, OC, k, s, p, d in params_grid(
-            devices, shapes, batchsizes, in_channels, out_channels, ksizes, 
-            strides, paddings, dilations):
+                devices, shapes, batchsizes, in_channels, out_channels, ksizes,
+                strides, paddings, dilations):
            if all([s > 1, d > 1]):
-                continue # don't support this.
+                continue  # don't support this.
            device = torch.device(dev)
            num_points = [1000] * bs
            # when data contains negative, sparse maxpool is not equal to dense maxpool.
-            sparse_dict = generate_sparse_data(shape, num_points, IC, data_range=[0.1, 1])
-
-            features = np.ascontiguousarray(sparse_dict["features"]).astype(np.float32)
-            indices = np.ascontiguousarray(sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
+            sparse_dict = generate_sparse_data(shape,
+                                               num_points,
+                                               IC,
+                                               data_range=[0.1, 1])
+
+            features = np.ascontiguousarray(sparse_dict["features"]).astype(
+                np.float32)
+            indices = np.ascontiguousarray(
+                sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
            features_dense = sparse_dict["features_dense"].astype(np.float32)
-            filters = np.random.uniform(0, 1, size=[k, k, k, IC, OC]).astype(np.float32)
+            filters = np.random.uniform(0, 1, size=[k, k, k, IC,
+                                                    OC]).astype(np.float32)
            indices_t = torch.from_numpy(indices).int().to(device)
            features_t = torch.from_numpy(features).to(device)
            features_t.requires_grad = True
@@ -540,24 +568,27 @@ class TestSpConv(TestCase):
            outfeatures = out.features
            out_dense = out.dense(channels_first=False)
            out = out_dense.permute(0, 4, 1, 2, 3).contiguous()
-            
-            dout_sparse = np.random.uniform(-0.2, 0.2, outfeatures.shape).astype(features.dtype)
+
+            dout_sparse = np.random.uniform(
+                -0.2, 0.2, outfeatures.shape).astype(features.dtype)
            dout_sparse_t = torch.from_numpy(dout_sparse).to(device)
-            dout_t = scatter_nd(outids.long(), dout_sparse_t, list(out_dense.shape))
+            dout_t = scatter_nd(outids.long(), dout_sparse_t,
+                                list(out_dense.shape))
            dout_t = dout_t.permute(0, 4, 1, 2, 3).contiguous()
            out.backward(dout_t)
            out_ref.backward(dout_t)
-            din_dense = features_dense_t.grad.detach().permute(0, 2, 3, 4, 1).contiguous()
+            din_dense = features_dense_t.grad.detach().permute(0, 2, 3, 4,
+                                                               1).contiguous()
            din_sparse = gather_nd(din_dense, indices_t.long())
            din = features_t.grad.detach()
            din_np = din.cpu().numpy()
            din_sparse_np = din_sparse.cpu().numpy()
            self.assertAllClose(din_np, din_sparse_np, atol=1e-4)
-            
+
            out_np = out.detach().cpu().numpy()
            out_ref_np = out_ref.detach().cpu().numpy()
            self.assertAllClose(out_np, out_ref_np, atol=1e-4)
-    
+

 def main():
    # function for develop.
@@ -567,7 +598,6 @@ def main():
    shapes = [[50, 30, 30]]
    batchsizes = [2]

-    
    in_channels = [256]
    out_channels = [256]
    ksizes = [(3, 1, 1)]
@@ -576,8 +606,8 @@ def main():
    dilations = [1]

    for dev, shape, bs, IC, OC, k, s, p, d in params_grid(
-        devices, shapes, batchsizes, in_channels, out_channels, ksizes, 
-        strides, paddings, dilations):
+            devices, shapes, batchsizes, in_channels, out_channels, ksizes,
+            strides, paddings, dilations):
        if all([s > 1, d > 1]):
            continue
        device = torch.device(dev)
@@ -585,19 +615,25 @@ def main():

        sparse_dict = generate_sparse_data(shape, num_points, IC)

-        features = np.ascontiguousarray(sparse_dict["features"]).astype(np.float32)
-        indices = np.ascontiguousarray(sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
+        features = np.ascontiguousarray(sparse_dict["features"]).astype(
+            np.float32)
+        indices = np.ascontiguousarray(
+            sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
        features_dense = sparse_dict["features_dense"].astype(np.float32)
        indices_t = torch.from_numpy(indices)
-        filters = np.random.uniform(0, 1, size=[k[0], 1, 1, IC, OC]).astype(np.float32)
+        filters = np.random.uniform(0, 1, size=[k[0], 1, 1, IC,
+                                                OC]).astype(np.float32)
        indices_t = torch.from_numpy(indices).int().to(device).float()
        features_t = torch.from_numpy(features).to(device).float()
-        
+
        features_dense_t = torch.from_numpy(features_dense).to(device).float()
-        net = SparseConv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d).to(device).float()
-        net_ref = Conv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d).to(device).float()
+        net = SparseConv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
+                                    d).to(device).float()
+        net_ref = Conv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
+                                  d).to(device).float()
        filters_t = torch.from_numpy(filters).to(device).float()
-        net_ref.net[0].weight[:] = filters_t.permute(4, 3, 0, 1, 2).contiguous()
+        net_ref.net[0].weight[:] = filters_t.permute(4, 3, 0, 1,
+                                                     2).contiguous()
        net.net[0].weight[:] = filters_t
        out_ref = net_ref(features_dense_t)
        times = []
@@ -607,16 +643,16 @@ def main():
            torch.cuda.synchronize()
            times.append(time.time() - t)
        # print((net.grid == -1).float().sum(), net.grid.numel())
-            # print("spconv time", time.time() - t)
+        # print("spconv time", time.time() - t)
        print("spconv time", np.mean(times[2:]))
        out = net(features_t, indices_t, bs)
        # print(out.indices)
        out = out.dense()
-        print(np.linalg.norm(out.detach().cpu().numpy() - out_ref.detach().cpu().numpy()))
+        print(
+            np.linalg.norm(out.detach().cpu().numpy() -
+                           out_ref.detach().cpu().numpy()))


 if __name__ == '__main__':
    main()
    # unittest.main()
-
-
--- a/pybind11 @ 3b1dbeba
+++ b/pybind11 @ 3b1dbeba
-Subproject commit 085a29436a8c472caaaf7157aa644b571079bcaa
+Subproject commit 3b1dbebabc801c9cf6f0953a4c20b904d444f879