Merge branch 'develop'

a6abf55d · yan.yan · fad30002 · 79a3eaf2 · fad30002 · fad30002
Commit a6abf55d authored Oct 20, 2021 by yan.yan
20 changed files
--- a/src/spconv/pillar_scatter.cu
+++ b/src/spconv/pillar_scatter.cu
-// Copyright 2019 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include <ATen/ATen.h>
-#include <chrono>
-#include <limits>
-#include <spconv/pillar_scatter_functor.h>
-#include <tensorview/cuda_utils.h>
-#include <tensorview/kernel_utils.h>
-#include <tensorview/mp_helper.h>
-#include <tensorview/tensorview.h>
-#include <type_traits>
-#include <utility/timer.h>
-namespace spconv {
-template <typename T, typename Index> // Index is not referenced inside the kernel body
-__global__ void pointPillarsScatterKernel(tv::TensorView<T> canvas, // CUDA kernel: writes features into canvas at cells given by coors
-                                          tv::TensorView<const T> features,
-                                          tv::TensorView<const T> coors) {
-  auto numFeatures = features.dim(0); // features is indexed as (feature, point)
-  auto numPoints = features.dim(1);
-  for (int i : tv::KernelLoopX<int>(numPoints)) { // presumably splits points over the x launch dimension; confirm in kernel_utils.h
-    for (int ifeature : tv::KernelLoopY<int>(numFeatures)) { // presumably splits features over the y launch dimension; confirm
-      canvas(int(coors(0, i)), ifeature, int(coors(2, i)), int(coors(3, i))) = // coors rows 0, 2 and 3 are cast to int; row 1 is never read
-          features(ifeature, i);
-    }
-  }
-}
-namespace functor {
-template <typename T, typename Index>
-struct PointPillarScatter<tv::GPU, T, Index> { // GPU specialization: launches pointPillarsScatterKernel
-  void operator()(const tv::GPU &d, tv::TensorView<T> canvas,
-                  tv::TensorView<const T> features,
-                  tv::TensorView<const T> coors) {
-    auto grid = dim3(tv::cuda::DivUp(features.dim(1), 32), // grid.x is sized from features.dim(1) in tiles of 32
-                     tv::cuda::DivUp(features.dim(0), 32)); // grid.y is sized from features.dim(0) in tiles of 32
-    pointPillarsScatterKernel<T, Index>
-        <<<grid, dim3(32, 32), 0, d.getStream()>>>(canvas, features, coors); // 32x32 threads per block, no dynamic shared memory, stream taken from d
-    TV_CHECK_CUDA_ERR(); // NOTE(review): presumably reports an error from the launch above; confirm macro semantics
-  }
-};
-} // namespace functor
-#define DECLARE_GPU_SPECS_T_INDEX(T, Index)                                    \
-  template struct functor::PointPillarScatter<tv::GPU, T, Index>; // explicit instantiation for one (T, Index) pair
-#define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPECS_T_INDEX(T, int); // Index is fixed to int
-DECLARE_GPU_SPECS(float); // instantiated for float, double and at::Half
-DECLARE_GPU_SPECS(double);
-DECLARE_GPU_SPECS(at::Half);
-#undef DECLARE_GPU_SPECS // both helper macros are removed again right after use
-#undef DECLARE_GPU_SPECS_T_INDEX
-} // namespace spconv
\ No newline at end of file
--- a/src/spconv/pool_ops.cc
+++ b/src/spconv/pool_ops.cc
-#include <spconv/pool_ops.h>
-namespace spconv {
-torch::Tensor indiceMaxPool(torch::Tensor features, torch::Tensor indicePairs, // forward max-pool driven by indice pairs; returns a new tensor
-                            torch::Tensor indiceNum, int64_t numAct) {
-  auto device = features.device().type();
-  auto kernelVolume = indiceNum.size(0); // one entry of indiceNum per kernel offset
-  auto numInPlanes = features.size(1);
-  auto indicePairNumCpu = indiceNum.to({torch::kCPU}); // host copy so the counts can be read in the loop below
-  auto options =
-      torch::TensorOptions().dtype(features.dtype()).device(features.device());
-  torch::Tensor output = torch::zeros({numAct, numInPlanes}, options); // zero-filled [numAct, numInPlanes], same dtype/device as features
-  double totalTime = 0; // only referenced by the commented-out timing code
-  for (int i = 0; i < kernelVolume; ++i) {
-    auto nHot = indicePairNumCpu.data_ptr<int>()[i]; // count for offset i, read as a 32-bit int
-    if (nHot <= 0) { // nothing recorded for this offset
-      continue;
-    }
-    // auto timer = spconv::CudaContextTimer<>();
-    if (device == torch::kCPU) {
-      maxpool_fwd_cpu(output, features, indicePairs[0][i], indicePairs[1][i], // NOTE(review): [0][i] / [1][i] are presumably input / output indices; confirm
-                      nHot);
-    }
-#ifdef TV_CUDA
-    else if (device == torch::kCUDA) {
-      maxpool_fwd_cuda(output, features, indicePairs[0][i], indicePairs[1][i],
-                       nHot);
-    }
-#endif
-    else {
-      TV_ASSERT_INVALID_ARG(false, "unknown device type"); // also reached for CUDA tensors when TV_CUDA is not defined
-    }
-    // totalTime += timer.report() / 1000.0;
-  }
-  // std::cout << "maxpool forward time " << totalTime << std::endl;
-  return output;
-}
-torch::Tensor indiceMaxPoolBackward(torch::Tensor features, // backward pass of indiceMaxPool; returns a gradient tensor shaped like features
-                                    torch::Tensor outFeatures,
-                                    torch::Tensor outGrad,
-                                    torch::Tensor indicePairs,
-                                    torch::Tensor indiceNum) {
-  auto device = features.device().type();
-  auto numInPlanes = features.size(1); // NOTE(review): not used again inside this function
-  auto indicePairNumCpu = indiceNum.to({torch::kCPU}); // host copy so the counts can be read in the loop below
-  auto options =
-      torch::TensorOptions().dtype(features.dtype()).device(features.device());
-  torch::Tensor inputGrad = torch::zeros(features.sizes(), options); // zero-filled, same shape/dtype/device as features
-  auto kernelVolume = indiceNum.size(0); // one entry of indiceNum per kernel offset
-  for (int i = 0; i < kernelVolume; ++i) {
-    auto nHot = indicePairNumCpu.data_ptr<int>()[i]; // count for offset i, read as a 32-bit int
-    if (nHot <= 0) { // nothing recorded for this offset
-      continue;
-    }
-    if (device == torch::kCPU) {
-      maxpool_bwd_cpu(outFeatures, features, outGrad, inputGrad, // inputGrad is passed in to be filled by the callee
-                      indicePairs[0][i], indicePairs[1][i], nHot);
-    }
-#ifdef TV_CUDA
-    else if (device == torch::kCUDA) {
-      maxpool_bwd_cuda(outFeatures, features, outGrad, inputGrad,
-                       indicePairs[0][i], indicePairs[1][i], nHot);
-    }
-#endif
-    else {
-      TV_ASSERT_INVALID_ARG(false, "unknown device type"); // also reached for CUDA tensors when TV_CUDA is not defined
-    }
-  }
-  return inputGrad;
-}
-} // namespace spconv
\ No newline at end of file
--- a/src/spconv/reordering.cc
+++ b/src/spconv/reordering.cc
-// Copyright 2019 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include <ATen/Parallel.h>
-#include <spconv/reordering.h>
-#include <tensorview/torch_utils.h>
-#include <torch/script.h>
-namespace spconv {
-using float_types_t = tv::mp_list<float, double, at::Half>; // element types handed to tv::DispatchTorch in this file
-using int_types_t = tv::mp_list<int32_t, int64_t>; // index types handed to tv::DispatchTorch in this file
-void sparse_gather_cpu(torch::Tensor buffer, torch::Tensor features, // copies the rows of features selected by indices into rows 0..size-1 of buffer
-                       torch::Tensor indices, int size) {
-  int numPlanes = features.size(1); // row width in elements
-  auto dtype = features.scalar_type();
-  auto int_dtype = indices.scalar_type();
-  tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) { // outer dispatch: element type T from the features dtype
-    using T = TV_DECLTYPE(TValue);
-    tv::DispatchTorch<int_types_t>()(int_dtype, [&](auto IndexValue) { // inner dispatch: Index from the indices dtype
-      using Index = TV_DECLTYPE(IndexValue);
-      Index *indices_data = indices.data_ptr<Index>();
-      T *buffer_data = buffer.data_ptr<T>();
-      const T *features_data = features.data_ptr<T>();
-      at::parallel_for(0, size, 0, [&](int64_t begin, int64_t end) { // range [0, size), third argument (grain size) is 0
-        for (int i = begin; i < end; ++i) { // NOTE(review): i is int while begin/end are int64_t
-          std::memcpy(buffer_data + i * numPlanes, // destination: row i of buffer; i * numPlanes is computed in int
-                      features_data + indices_data[i] * numPlanes, // source: row indices_data[i] of features
-                      sizeof(T) * numPlanes);
-        }
-      });
-    });
-  });
-}
-void sparse_scatter_add_cpu(torch::Tensor buffer, torch::Tensor outFeatures, // adds row i of buffer into row indices[i] of outFeatures for i in [0, size)
-                            torch::Tensor indices, int size) {
-  int numPlanes = outFeatures.size(1); // row width in elements
-  auto dtype = outFeatures.scalar_type();
-  auto int_dtype = indices.scalar_type();
-  tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) { // outer dispatch: element type T from the outFeatures dtype
-    using T = TV_DECLTYPE(TValue);
-    tv::DispatchTorch<int_types_t>()(int_dtype, [&](auto IndexValue) { // inner dispatch: Index from the indices dtype
-      using Index = TV_DECLTYPE(IndexValue);
-      Index *indices_data = indices.data_ptr<Index>();
-      const T *buffer_data = buffer.data_ptr<T>();
-      T *features_data = outFeatures.data_ptr<T>();
-      at::parallel_for(0, size, 0, [&](int64_t begin, int64_t end) { // range [0, size), third argument (grain size) is 0
-        const T *buf = buffer.data_ptr<T>(); // initial value is overwritten before first use in the loop
-        T *out = outFeatures.data_ptr<T>(); // initial value is overwritten before first use in the loop
-        for (int i = begin; i < end; ++i) { // NOTE(review): i is int while begin/end are int64_t
-          buf = buffer_data + i * numPlanes; // row i of buffer
-          out = features_data + indices_data[i] * numPlanes; // row indices_data[i] of outFeatures
-          for (int j = 0; j < numPlanes; ++j) {
-            out[j] += buf[j]; // NOTE(review): plain +=; assumes no output row is shared between concurrently processed ranges; confirm
-          }
-        }
-      });
-    });
-  });
-}
-} // namespace spconv
--- a/src/spconv/reordering.cu
+++ b/src/spconv/reordering.cu
-// Copyright 2019 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include <ATen/ATen.h>
-#include <chrono>
-#include <limits>
-#include <spconv/reordering.cu.h>
-#include <spconv/reordering.h>
-#include <tensorview/cuda_utils.h>
-#include <tensorview/kernel_utils.h>
-#include <tensorview/mp_helper.h>
-#include <tensorview/tensor.h>
-#include <tensorview/tensorview.h>
-#include <tensorview/torch_utils.h>
-#include <type_traits>
-#include <utility/timer.h>
-namespace spconv {
-using float_types_t = tv::mp_list<float, double, at::Half>; // element types handed to tv::DispatchTorch in this file
-using int_types_t = tv::mp_list<int32_t, int64_t>; // index types handed to tv::DispatchTorch in this file
-template <typename T>
-struct half_vec{ // selects the vector load type used by batch_sparse_gather_cuda
-  using type = typename std::conditional_t<std::is_same<T, at::Half>::value, int4, int4>; // both branches are int4, so type is int4 for every T
-};
-template <typename T>
-struct half_vec_sadd{ // selects the vector load type used by the gather/scatter-add launchers that name it
-  using type = typename std::conditional_t<std::is_same<T, at::Half>::value, int4, int4>; // both branches are int4, so type is int4 for every T
-};
-using kernel_block_t = tv::mp_list_c<int, 64, 32, 16>; // candidate NumTLP tile sizes visited with tv::mp_for_each by the CUDA launchers
-void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features, // launches CUDA gather kernels that fill buffer from features using indices
-                        torch::Tensor indices, int size) {
-  if (size <= 0) // nothing to launch
-    return;
-  int numPlanes = features.size(1); // row width in elements
-  auto stream = at::cuda::getCurrentCUDAStream(); // all kernels below are launched on this stream
-  auto dtype = features.scalar_type();
-  auto inds_dtype = indices.scalar_type();
-  tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
-    using T = TV_DECLTYPE(TValue);
-    using vecload_type_t = typename half_vec_sadd<T>::type; // half_vec_sadd is used here; it yields int4 just like half_vec
-    tv::DispatchTorch<int_types_t>()(inds_dtype, [&](auto IndexValue) {
-      using Index = TV_DECLTYPE(IndexValue);
-      bool notFound = true; // stays true until one tile size from kernel_block_t has been used
-      constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T); // number of T elements per vecload_type_t
-      tv::mp_for_each<kernel_block_t>([=, &buffer, &features, &indices,
-                                       &notFound](auto NumTLP) { // notFound is captured by reference so later tile sizes see the update
-        constexpr int NumILP = NumTLP / 4;
-        // constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
-        int nHotBlock = (size / NumTLP) * NumTLP; // size rounded down to a multiple of NumTLP
-        if (notFound) {
-          if (numPlanes % NumTLP == 0) { // vectorized kernels are only used when numPlanes is a multiple of the tile size
-            if (nHotBlock >= NumTLP) { // at least one full tile of rows
-              gatherVecBlockKernel<T, Index, int(NumTLP), NumILP,
-                                   vecload_type_t>
-                  <<<dim3(size / NumTLP, numPlanes / NumTLP),
-                     dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
-                     stream>>>(buffer.data_ptr<T>(), features.data_ptr<T>(),
-                               indices.data_ptr<Index>(), nHotBlock,
-                               numPlanes / vecloadFactor); // plane count is passed in units of vecload_type_t
-#ifdef TV_LOG_KERNEL_INFO
-              cudaFuncAttributes attr;
-              checkCudaErrors(cudaFuncGetAttributes(
-                  &attr, gatherVecBlockKernel<T, Index, int(NumTLP), NumILP,
-                                              vecload_type_t>));
-              tv::ssprint("gatherVecBlockKernel<", tv::type_s<T>,
-                          tv::type_s<Index>, int(NumTLP), NumILP, ">",
-                          attr.numRegs);
-#endif
-              TV_CHECK_CUDA_ERR();
-            }
-            if (size - nHotBlock > 0) { // rows left over after the full tiles
-              gatherVecKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
-                  <<<dim3(1, numPlanes / NumTLP),
-                     dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
-                     stream>>>(buffer.data_ptr<T>() + nHotBlock * numPlanes, // skip the buffer rows covered by the block kernel
-                               features.data_ptr<T>(),
-                               indices.data_ptr<Index>() + nHotBlock, // skip the indices covered by the block kernel
-                               size - nHotBlock, numPlanes / vecloadFactor);
-#ifdef TV_LOG_KERNEL_INFO
-              cudaFuncAttributes attr;
-              checkCudaErrors(cudaFuncGetAttributes(
-                  &attr, gatherVecKernel<T, Index, int(NumTLP), NumILP,
-                                         vecload_type_t>));
-              tv::ssprint("gatherVecKernel<", tv::type_s<T>, tv::type_s<Index>,
-                          int(NumTLP), NumILP, ">", attr.numRegs);
-#endif
-              TV_CHECK_CUDA_ERR();
-            }
-            notFound = false; // remaining tile sizes become no-ops
-          }
-        }
-      });
-      if (notFound) { // numPlanes is not a multiple of 64, 32 or 16: use the generic kernel
-        constexpr int NumTLP = 64;
-        constexpr int NumILP = NumTLP / 4;
-        gatherGenericKernel<T, Index, NumTLP, NumILP>
-            <<<dim3(tv::cuda::DivUp(size, NumTLP),
-                    tv::cuda::DivUp(numPlanes, NumTLP)),
-               dim3(NumTLP / NumILP, NumTLP), 0, stream>>>(
-                buffer.data_ptr<T>(), features.data_ptr<T>(),
-                indices.data_ptr<Index>(), size, numPlanes); // full size and unscaled numPlanes
-#ifdef TV_LOG_KERNEL_INFO
-        cudaFuncAttributes attr;
-        checkCudaErrors(cudaFuncGetAttributes(
-            &attr, gatherGenericKernel<T, Index, NumTLP, NumILP>));
-        tv::ssprint("gatherGenericKernel<", tv::type_s<T>, tv::type_s<Index>,
-                    int(NumTLP), NumILP, ">", attr.numRegs);
-#endif
-        TV_CHECK_CUDA_ERR();
-      }
-    });
-  });
-}
-void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures, // launches CUDA scatter-add kernels that accumulate buffer into outFeatures using indices
-                             torch::Tensor indices, int size) {
-  if (size <= 0) // nothing to launch
-    return;
-  int numPlanes = outFeatures.size(1); // row width in elements
-  auto stream = at::cuda::getCurrentCUDAStream(); // all kernels below are launched on this stream
-  auto dtype = outFeatures.scalar_type();
-  auto inds_dtype = indices.scalar_type();
-  tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
-    using T = TV_DECLTYPE(TValue);
-    using vecload_type_t = typename half_vec_sadd<T>::type;
-    tv::DispatchTorch<int_types_t>()(inds_dtype, [&](auto IndexValue) {
-      using Index = TV_DECLTYPE(IndexValue);
-      bool notFound = true; // stays true until one tile size from kernel_block_t has been used
-      constexpr int vecloadFactor =
-          sizeof(vecload_type_t) / sizeof(T); // important for half.
-      tv::mp_for_each<kernel_block_t>([=, &outFeatures, &buffer, &indices,
-                                       &notFound](auto NumTLP) { // notFound is captured by reference so later tile sizes see the update
-        // constexpr int NumILP = NumTLP / (64 / (NumTLP /
-        // vecloadFactor));
-        constexpr int NumILP = NumTLP / 4;
-        int nHotBlock = (size / NumTLP) * NumTLP; // size rounded down to a multiple of NumTLP
-        if (notFound) {
-          if (numPlanes % NumTLP == 0) { // block kernel is only used when numPlanes is a multiple of the tile size
-            if (nHotBlock >= NumTLP) { // at least one full tile of rows
-              scatterAddVecBlockKernel<T, Index, int(NumTLP), NumILP,
-                                       vecload_type_t>
-                  <<<dim3(size / NumTLP, numPlanes / NumTLP),
-                     dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
-                     stream>>>(outFeatures.data_ptr<T>(), buffer.data_ptr<T>(),
-                               indices.data_ptr<Index>(), nHotBlock,
-                               numPlanes / vecloadFactor); // plane count is passed in units of vecload_type_t
-#ifdef TV_LOG_KERNEL_INFO
-              cudaFuncAttributes attr;
-              checkCudaErrors(cudaFuncGetAttributes(
-                  &attr, scatterAddVecBlockKernel<T, Index, int(NumTLP), NumILP,
-                                                  vecload_type_t>));
-              tv::ssprint("scatterAddVecBlockKernel<", tv::type_s<T>,
-                          tv::type_s<Index>, int(NumTLP), NumILP, ">",
-                          attr.numRegs);
-#endif
-              TV_CHECK_CUDA_ERR();
-            }
-            if (size - nHotBlock > 0) { // rows left over after the full tiles
-              scatterAddGenericKernel<T, Index, int(NumTLP), NumILP> // leftover rows use the generic kernel with unscaled numPlanes
-                  <<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
-                     0, stream>>>(outFeatures.data_ptr<T>(),
-                                  buffer.data_ptr<T>() + nHotBlock * numPlanes, // skip the buffer rows covered by the block kernel
-                                  indices.data_ptr<Index>() + nHotBlock, // skip the indices covered by the block kernel
-                                  size - nHotBlock, numPlanes);
-#ifdef TV_LOG_KERNEL_INFO
-              cudaFuncAttributes attr;
-              checkCudaErrors(cudaFuncGetAttributes(
-                  &attr,
-                  scatterAddGenericKernel<T, Index, int(NumTLP), NumILP>));
-              tv::ssprint("scatterAddGenericKernel<", tv::type_s<T>,
-                          tv::type_s<Index>, int(NumTLP), NumILP, ">",
-                          attr.numRegs);
-#endif
-              TV_CHECK_CUDA_ERR();
-            }
-            notFound = false; // remaining tile sizes become no-ops
-          }
-        }
-      });
-      if (notFound) { // numPlanes is not a multiple of 64, 32 or 16: use the generic kernel for everything
-        constexpr int NumTLP = 64;
-        constexpr int NumILP = NumTLP / 4;
-        scatterAddGenericKernel<T, Index, NumTLP, NumILP>
-            <<<dim3(tv::cuda::DivUp(size, NumTLP),
-                    tv::cuda::DivUp(numPlanes, NumTLP)),
-               dim3(NumTLP / NumILP, NumTLP), 0, stream>>>(
-                outFeatures.data_ptr<T>(), buffer.data_ptr<T>(),
-                indices.data_ptr<Index>(), size, numPlanes); // full size and unscaled numPlanes
-#ifdef TV_LOG_KERNEL_INFO
-        cudaFuncAttributes attr;
-        checkCudaErrors(cudaFuncGetAttributes(
-            &attr, scatterAddGenericKernel<T, Index, int(NumTLP), NumILP>));
-        tv::ssprint("notfound scatterAddGenericKernel<", tv::type_s<T>,
-                    tv::type_s<Index>, int(NumTLP), NumILP, ">", attr.numRegs);
-#endif
-        TV_CHECK_CUDA_ERR();
-      }
-    });
-  });
-}
-void batch_sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features, // batched variant of the gather launcher; also forwards inds_stride and feature_stride
-                              torch::Tensor indices, int size) {
-  // indices: [volume, inds_stride]
-  // buffer: [volume, num_points, num_features]
-  // size == volume * num_points
-  if (size <= 0) // nothing to launch
-    return;
-  int numPlanes = features.size(1); // row width in elements
-  auto stream = at::cuda::getCurrentCUDAStream(); // all kernels below are launched on this stream
-  auto dtype = features.scalar_type();
-  auto inds_dtype = indices.scalar_type();
-  int inds_stride = indices.size(1); // second dimension of indices
-  int feature_stride = buffer.size(1); // second dimension of buffer (num_points per the layout comment above)
-  tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
-    using T = TV_DECLTYPE(TValue);
-    using vecload_type_t = typename half_vec<T>::type;
-    tv::DispatchTorch<int_types_t>()(inds_dtype, [&](auto IndexValue) {
-      using Index = TV_DECLTYPE(IndexValue);
-      bool notFound = true; // stays true until one tile size from kernel_block_t has been used
-      constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T); // number of T elements per vecload_type_t
-      tv::mp_for_each<kernel_block_t>(
-          [=, &buffer, &features, &indices, &notFound](auto NumTLP) { // notFound is captured by reference so later tile sizes see the update
-            constexpr int NumILP = NumTLP / 4;
-            // constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
-            int nHotBlock = (size / NumTLP) * NumTLP; // size rounded down to a multiple of NumTLP
-            if (notFound) {
-              if (numPlanes % NumTLP == 0) { // vectorized kernels are only used when numPlanes is a multiple of the tile size
-                if (nHotBlock >= NumTLP) { // at least one full tile of rows
-                  batchGatherVecBlockKernel<T, Index, int(NumTLP), NumILP,
-                                            vecload_type_t>
-                      <<<dim3(size / NumTLP, numPlanes / NumTLP),
-                         dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
-                         stream>>>(buffer.data_ptr<T>(), features.data_ptr<T>(),
-                                   indices.data_ptr<Index>(), nHotBlock,
-                                   numPlanes / vecloadFactor, inds_stride,
-                                   feature_stride);
-                  TV_CHECK_CUDA_ERR_V2("batchGatherVecBlockKernel");
-                }
-                if (size - nHotBlock > 0) { // rows left over after the full tiles
-                  batchGatherVecKernel<T, Index, int(NumTLP), NumILP,
-                                       vecload_type_t>
-                      <<<dim3(1, numPlanes / NumTLP),
-                         dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
-                         stream>>>(buffer.data_ptr<T>() + nHotBlock * numPlanes, // skip the buffer rows covered by the block kernel
-                                   features.data_ptr<T>(),
-                                   indices.data_ptr<Index>(), size - nHotBlock, // indices pointer is not offset here
-                                   nHotBlock, numPlanes / vecloadFactor, // nHotBlock is passed as a separate argument instead
-                                   inds_stride, feature_stride);
-                  TV_CHECK_CUDA_ERR_V2("batchGatherVecKernel");
-                }
-                notFound = false; // remaining tile sizes become no-ops
-              }
-            }
-          });
-      if (notFound) { // numPlanes is not a multiple of 64, 32 or 16: use the generic kernel
-        constexpr int NumTLP = 64;
-        constexpr int NumILP = NumTLP / 4;
-        batchGatherGenericKernel<T, Index, NumTLP, NumILP>
-            <<<dim3(tv::cuda::DivUp(size, NumTLP),
-                    tv::cuda::DivUp(numPlanes, NumTLP)),
-               dim3(NumTLP / NumILP, NumTLP), 0, stream>>>(
-                buffer.data_ptr<T>(), features.data_ptr<T>(),
-                indices.data_ptr<Index>(), size, numPlanes, inds_stride,
-                feature_stride);
-        TV_CHECK_CUDA_ERR();
-      }
-    });
-  });
-}
-void batch_sparse_scatter_add_cuda(torch::Tensor buffer, // batched variant of the scatter-add launcher; also forwards inds_stride and feature_stride
-                                   torch::Tensor outFeatures,
-                                   torch::Tensor indices, int size) {
-  // indices: [volume, inds_stride]
-  // buffer: [volume, num_points, num_features]
-  // size == volume * num_points
-  if (size <= 0) // nothing to launch
-    return;
-  int numPlanes = outFeatures.size(1); // row width in elements
-  auto stream = at::cuda::getCurrentCUDAStream(); // all kernels below are launched on this stream
-  auto dtype = outFeatures.scalar_type();
-  auto inds_dtype = indices.scalar_type();
-  int inds_stride = indices.size(1); // second dimension of indices
-  int feature_stride = buffer.size(1); // second dimension of buffer (num_points per the layout comment above)
-  tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
-    using T = TV_DECLTYPE(TValue);
-    using vecload_type_t = typename half_vec_sadd<T>::type; // NOTE(review): declared but not referenced again in this function
-    tv::DispatchTorch<int_types_t>()(inds_dtype, [&](auto IndexValue) {
-      using Index = TV_DECLTYPE(IndexValue);
-      bool notFound = true; // stays true until one tile size from kernel_block_t has been used
-      constexpr int vecloadFactor = 1; // important for half.
-      tv::mp_for_each<kernel_block_t>([=, &outFeatures, &buffer, &indices,
-                                       &notFound](auto NumTLP) { // notFound is captured by reference so later tile sizes see the update
-        // constexpr int NumILP = NumTLP / (64 / (NumTLP /
-        // vecloadFactor));
-        constexpr int NumILP = NumTLP / 4;
-        int nHotBlock = (size / NumTLP) * NumTLP; // size rounded down to a multiple of NumTLP
-        if (notFound) {
-          if (numPlanes % NumTLP == 0) { // block kernel is only used when numPlanes is a multiple of the tile size
-            if (nHotBlock >= NumTLP) { // at least one full tile of rows
-              batchScatterAddBlockKernel<T, Index, int(NumTLP), NumILP>
-                  <<<dim3(size / NumTLP, numPlanes / NumTLP),
-                     dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0, // vecloadFactor is 1 here, so this division changes nothing
-                     stream>>>(outFeatures.data_ptr<T>(), buffer.data_ptr<T>(),
-                               indices.data_ptr<Index>(), nHotBlock,
-                               numPlanes / vecloadFactor, inds_stride,
-                               feature_stride);
-              TV_CHECK_CUDA_ERR();
-            }
-            if (size - nHotBlock > 0) { // rows left over after the full tiles
-              batchScatterAddGenericKernel<T, Index, int(NumTLP), NumILP>
-                  <<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
-                     0, stream>>>(outFeatures.data_ptr<T>(),
-                                  buffer.data_ptr<T>() + nHotBlock * numPlanes, // skip the buffer rows covered by the block kernel
-                                  indices.data_ptr<Index>(), size - nHotBlock, // indices pointer is not offset here
-                                  nHotBlock, numPlanes, inds_stride, // nHotBlock is passed as a separate argument instead
-                                  feature_stride);
-              TV_CHECK_CUDA_ERR();
-            }
-            notFound = false; // remaining tile sizes become no-ops
-          }
-        }
-      });
-      if (notFound) { // numPlanes is not a multiple of 64, 32 or 16: use the generic kernel for everything
-        constexpr int NumTLP = 64;
-        constexpr int NumILP = NumTLP / 4;
-        batchScatterAddGenericKernel<T, Index, NumTLP, NumILP>
-            <<<dim3(tv::cuda::DivUp(size, NumTLP),
-                    tv::cuda::DivUp(numPlanes, NumTLP)),
-               dim3(NumTLP / NumILP, NumTLP), 0, stream>>>(
-                outFeatures.data_ptr<T>(), buffer.data_ptr<T>(),
-                indices.data_ptr<Index>(), size, 0, numPlanes, inds_stride, // 0 fills the argument slot that receives nHotBlock in the remainder launch
-                feature_stride);
-        TV_CHECK_CUDA_ERR();
-      }
-    });
-  });
-}
-} // namespace spconv
--- a/src/spconv/spconv_ops.cc
+++ b/src/spconv/spconv_ops.cc
-#include <spconv/spconv_ops.h>
-namespace spconv {
-std::vector<torch::Tensor> // returns three tensors: output indices, indicePairs, indiceNum
-getIndicePairs(torch::Tensor indices, int64_t batchSize,
-               std::vector<int64_t> outSpatialShape,
-               std::vector<int64_t> spatialShape,
-               std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
-               std::vector<int64_t> padding, std::vector<int64_t> dilation,
-               std::vector<int64_t> outPadding, int64_t _subM,
-               int64_t _transpose, int64_t _useHash) {
-  // auto timer = spconv::CudaContextTimer<>();
-  bool subM = _subM != 0; // integer flags are converted to bool
-  bool transpose = _transpose != 0;
-  auto NDim = kernelSize.size();
-  // CPU always use hash (tsl::robin_map).
-  bool useHash = _useHash != 0 || indices.device().type() == torch::kCPU;
-  auto numAct = indices.size(0); // number of rows in indices
-  auto coorDim = indices.size(1) - 1; // batchIdx + xyz
-  TV_ASSERT_RT_ERR(NDim == coorDim, "error"); // every per-dimension vector must have coorDim entries
-  TV_ASSERT_RT_ERR(kernelSize.size() == coorDim, "error");
-  TV_ASSERT_RT_ERR(outSpatialShape.size() == coorDim, "error");
-  TV_ASSERT_RT_ERR(stride.size() == coorDim, "error");
-  TV_ASSERT_RT_ERR(padding.size() == coorDim, "error");
-  TV_ASSERT_RT_ERR(outPadding.size() == coorDim, "error");
-  TV_ASSERT_RT_ERR(dilation.size() == coorDim, "error");
-  auto kernelVolume = kernelSize[0]; // product of all kernelSize entries after the loop below
-  for (int i = 1; i < kernelSize.size(); ++i) {
-    kernelVolume *= kernelSize[i];
-  }
-  TV_ASSERT_RT_ERR(kernelVolume <= 4096, "error"); // kernel volume is capped at 4096
-  auto outputVolume = outSpatialShape[0]; // product of all outSpatialShape entries after the loop below
-  for (int i = 1; i < outSpatialShape.size(); ++i) {
-    outputVolume *= outSpatialShape[i];
-  }
-  std::string msg = "due to limits of cuda hash, the volume of dense space "
-                    "include batch size ";
-  msg += "must less than std::numeric_limits<int>::max() = 2e9";
-  TV_ASSERT_RT_ERR(batchSize * outputVolume < std::numeric_limits<int>::max(), // dense volume including batch must fit in an int
-                   msg);
-  torch::Tensor indicePairs =
-      torch::full({2, kernelVolume, numAct}, -1, // [2, kernelVolume, numAct] int32, filled with -1
-                  torch::dtype(torch::kInt32).device(indices.device()));
-  torch::Tensor indiceNum = torch::zeros(
-      {kernelVolume}, torch::dtype(torch::kInt32).device(indices.device())); // one int32 counter per kernel offset, on the indices device
-  auto gridSize = batchSize * outputVolume;
-  if (useHash) {
-    gridSize = batchSize; // with hashing gridOut holds only batchSize entries
-  }
-  torch::Tensor gridOut = torch::full(
-      {gridSize}, -1, torch::dtype(torch::kInt32).device(indices.device()));
-  gridOut = gridOut.view({batchSize, -1}); // reshaped to batchSize rows
-  int64_t numActOut = -1;
-  for (int i = 0; i < NDim; ++i) {
-    if (subM) { // subM overrides the caller's padding and stride
-      padding[i] = kernelSize[i] / 2;
-      stride[i] = 1;
-    }
-  }
-  // tv::ssprint("prepare", timer.report() / 1000.0);
-  if (subM) {
-    if (indices.device().type() == torch::kCPU) {
-      numActOut = create_submconv_indice_pair_cpu(
-          indices, gridOut, indicePairs, indiceNum, kernelSize, stride, padding,
-          dilation, outSpatialShape, transpose, false, useHash);
-    }
-#ifdef TV_CUDA
-    else if (indices.device().type() == torch::kCUDA) {
-      numActOut = create_submconv_indice_pair_cuda(
-          indices, gridOut, indicePairs, indiceNum, kernelSize, stride, padding,
-          dilation, outSpatialShape, transpose, false, useHash);
-      if (numActOut == -1) { // -1 from the CUDA path triggers a retry on the CPU
-        auto device = indices.device(); // remembered so results can be moved back
-        indicePairs = indicePairs.to({torch::kCPU});
-        indiceNum = indiceNum.to({torch::kCPU});
-        indices = indices.to({torch::kCPU}); // NOTE(review): gridOut is not moved to the CPU; confirm the CPU routine does not read it
-        numActOut = create_submconv_indice_pair_cpu(
-            indices, gridOut, indicePairs, indiceNum, kernelSize, stride,
-            padding, dilation, outSpatialShape, transpose, false, useHash);
-        return {indices.to(device), indicePairs.to(device),
-                indiceNum.to(device)};
-      }
-    }
-#endif
-    else {
-      TV_THROW_INVALID_ARG("unknown device type"); // also reached for CUDA tensors when TV_CUDA is not defined
-    }
-    // tv::ssprint("subm", timer.report() / 1000.0);
-    return {indices, indicePairs, indiceNum}; // subM returns the input indices as the first element
-  } else {
-    auto indicePairUnique = torch::full(
-        {indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(), // kernelVolume * numAct + 1 entries, filled with INT_MAX
-        torch::dtype(torch::kInt32).device(indices.device()));
-    torch::Tensor outInds =
-        torch::zeros({numAct * kernelVolume, coorDim + 1}, // room for numAct * kernelVolume output rows
-                     torch::dtype(torch::kInt32).device(indices.device()));
-    if (indices.device().type() == torch::kCPU) {
-      numActOut = create_conv_indice_pair_cpu(
-          indices, outInds, gridOut, indicePairs, indiceNum, kernelSize, stride,
-          padding, dilation, outSpatialShape, transpose, false, useHash);
-    }
-#ifdef TV_CUDA
-    else if (indices.device().type() == torch::kCUDA) {
-      numActOut = create_conv_indice_pair_p1_cuda( // phase 1 of the CUDA path
-          indices, indicePairs, indiceNum, indicePairUnique, kernelSize, stride,
-          padding, dilation, outSpatialShape, transpose);
-      if (numActOut > 0) { // phase 2 only runs when phase 1 returned a positive count
-        auto res = torch::_unique(indicePairUnique);
-        indicePairUnique = std::get<0>(res); // keep only the first element of the _unique result
-        numActOut = create_conv_indice_pair_p2_cuda(
-            indices, outInds, gridOut, indicePairs, indiceNum, indicePairUnique,
-            outSpatialShape, transpose, false, useHash);
-        if (numActOut == -1) { // -1 from phase 2 triggers a retry on the CPU
-          auto device = indices.device(); // remembered so results can be moved back
-          outInds = outInds.to({torch::kCPU});
-          indicePairs = indicePairs.to({torch::kCPU});
-          indiceNum = indiceNum.to({torch::kCPU});
-          indices = indices.to({torch::kCPU}); // NOTE(review): gridOut is not moved to the CPU; confirm the CPU routine does not read it
-          numActOut = create_conv_indice_pair_cpu(
-              indices, outInds, gridOut, indicePairs, indiceNum, kernelSize,
-              stride, padding, dilation, outSpatialShape, transpose, false,
-              useHash);
-          return {outInds.to(device).slice(0, 0, numActOut),
-                  indicePairs.to(device), indiceNum.to(device)};
-        }
-      }
-    }
-#endif
-    else {
-      TV_THROW_INVALID_ARG("unknown device type"); // also reached for CUDA tensors when TV_CUDA is not defined
-    }
-    return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum}; // only the first numActOut rows of outInds are returned
-  }
-}
-// Sparse convolution forward pass driven by precomputed indice pairs.
-//
-// For algo == kBatch / kBatchGemmGather with more than one kernel offset the
-// call is forwarded to indiceConvBatch. Otherwise (kNative, or a kernel with
-// a single offset) every kernel offset is handled in turn: gather the
-// participating input rows, multiply them by that offset's filter slice and
-// scatter-add the product into the output.
-//
-// features:    features.size(1) is taken as the number of input planes.
-// filters:     viewed as {-1, numInPlanes, numOutPlanes}; the last dimension
-//              is the number of output planes.
-// indicePairs: indicePairs[inverse][i] is used on the gather side and
-//              indicePairs[!inverse][i] on the scatter side of offset i.
-// indiceNum:   per-offset pair counts, read as int; size(0) is the kernel
-//              volume.
-// numActOut:   number of rows of the returned tensor.
-// _inverse, _subM: flags, non-zero means true.
-// algo:        kNative, kBatch or kBatchGemmGather; anything else throws.
-//
-// Returns a zero-initialised-then-accumulated {numActOut, numOutPlanes}
-// tensor with the dtype and device of features. Throws on an unknown algo or
-// an unsupported device type.
-torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
-                         torch::Tensor indicePairs, torch::Tensor indiceNum,
-                         int64_t numActOut, int64_t _inverse, int64_t _subM,
-                         int64_t algo) {
-  auto kernelVolume = indiceNum.size(0);
-  switch (algo) {
-  case kBatchGemmGather:
-  case kBatch: {
-    if (kernelVolume != 1) {
-      return indiceConvBatch(features, filters, indicePairs, indiceNum,
-                             numActOut, _inverse, _subM,
-                             algo != kBatchGemmGather);
-    } else {
-      // a single-offset kernel gains nothing from batching: use native path
-      break;
-    }
-  }
-  case kNative:
-    break;
-  default:
-    TV_THROW_RT_ERR("unknown algo");
-  }
-  bool subM = _subM != 0;
-  bool inverse = _inverse != 0;
-  auto device = features.device().type();
-  auto ndim = filters.dim() - 2;
-  auto numInPlanes = features.size(1);
-  auto numOutPlanes = filters.size(ndim + 1);
-  // pair counts are consulted on the host to size buffers and skip offsets
-  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
-  const int *pairCounts = indicePairNumCpu.data_ptr<int>();
-  auto options =
-      torch::TensorOptions().dtype(features.dtype()).device(features.device());
-  torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
-  filters = filters.view({-1, numInPlanes, numOutPlanes});
-  // init for subM
-  int indicePairMaxOffset = kernelVolume / 2;
-  int indicePairMaxSize = 0;
-  if (subM) {
-    // the center index of subm conv doesn't need gather and scatter add.
-    torch::mm_out(output, features, filters[indicePairMaxOffset]);
-    // get indice pair second max size based on subM symmetric property:
-    // only the offsets before the center are scanned.
-    indicePairMaxSize =
-        *std::max_element(pairCounts, pairCounts + indicePairMaxOffset);
-    if (indicePairMaxSize == 0) {
-      return output;
-    }
-  } else {
-    indicePairMaxSize =
-        *std::max_element(pairCounts, pairCounts + kernelVolume);
-  }
-  // scratch buffers sized for the largest offset; reused by every offset
-  torch::Tensor inputBuffer =
-      torch::empty({indicePairMaxSize, numInPlanes}, options);
-  torch::Tensor outputBuffer =
-      torch::empty({indicePairMaxSize, numOutPlanes}, options);
-  for (int i = 0; i < kernelVolume; ++i) {
-    auto nHot = pairCounts[i];
-    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
-      continue;
-    }
-    // views over the first nHot rows of the scratch buffers
-    // TODO torch::from_blob is a little slow
-    auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr(),
-                                             {nHot, numOutPlanes}, options);
-    auto inputBufferBlob =
-        torch::from_blob(inputBuffer.data_ptr(), {nHot, numInPlanes}, options);
-    if (device == torch::kCPU) {
-      sparse_gather_cpu(inputBuffer, features, indicePairs[inverse][i], nHot);
-    }
-#ifdef TV_CUDA
-    else if (device == torch::kCUDA) {
-      // torch::index_select_out was tried here and found slower, possibly
-      // because of the int->long index conversion it needs.
-      sparse_gather_cuda(inputBuffer, features, indicePairs[inverse][i], nHot);
-    }
-#endif
-    else {
-      TV_THROW_INVALID_ARG("unknown device type");
-    }
-    torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
-    if (device == torch::kCPU) {
-      sparse_scatter_add_cpu(outputBuffer, output, indicePairs[!inverse][i],
-                             nHot);
-    }
-#ifdef TV_CUDA
-    else if (device == torch::kCUDA) {
-      sparse_scatter_add_cuda(outputBuffer, output, indicePairs[!inverse][i],
-                              nHot);
-    }
-#endif
-    else {
-      TV_THROW_INVALID_ARG("unknown device type");
-    }
-  }
-  return output;
-}
-// Batched sparse convolution forward pass (CUDA only).
-//
-// All kernel offsets of a part are gathered into one
-// {offsets, bufferSize, numInPlanes} buffer and multiplied with their filter
-// slices by a single bmm. For subM the offset with the largest pair count is
-// computed with a plain mm on the full feature matrix and the remaining
-// offsets are split into the parts before and after it.
-//
-// features, filters, indicePairs, indiceNum, numActOut, _inverse, _subM have
-// the same meaning as in indiceConv. indiceNum must hold more than one entry.
-// batchScatter: true  -> one batched scatter-add per part,
-//               false -> one scatter-add per kernel offset.
-//
-// Returns a {numActOut, numOutPlanes} tensor with the dtype and device of
-// features. Throws when features are not on a CUDA device.
-torch::Tensor indiceConvBatch(torch::Tensor features, torch::Tensor filters,
-                              torch::Tensor indicePairs,
-                              torch::Tensor indiceNum, int64_t numActOut,
-                              int64_t _inverse, int64_t _subM,
-                              bool batchScatter) {
-  bool subM = _subM != 0;
-  bool inverse = _inverse != 0;
-  auto device = features.device().type();
-  auto ndim = filters.dim() - 2;
-  auto kernelVolume = indiceNum.size(0);
-  TV_ASSERT_INVALID_ARG(kernelVolume > 1, "error");
-  auto numInPlanes = features.size(1);
-  auto numOutPlanes = filters.size(ndim + 1);
-  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
-  // work on a copy: nth_element below reorders it
-  auto indicePairNumVec =
-      std::vector<int>(indicePairNumCpu.data_ptr<int>(),
-                       indicePairNumCpu.data_ptr<int>() + kernelVolume);
-  auto indicePairMaxSizeIter =
-      std::max_element(indicePairNumVec.begin(), indicePairNumVec.end());
-  int indicePairMaxOffset = indicePairMaxSizeIter - indicePairNumVec.begin();
-  int indicePairMaxSize = *indicePairMaxSizeIter;
-  // second largest pair count ends up at index 1
-  std::nth_element(indicePairNumVec.begin(), indicePairNumVec.begin() + 1,
-                   indicePairNumVec.end(), std::greater<int>());
-  int indicePairTop2Size = indicePairNumVec[1];
-  auto options =
-      torch::TensorOptions().dtype(features.dtype()).device(features.device());
-  torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
-  // we cant use batch conv in subm directly because
-  // number of indice in the center of filter is much more than other
-  // filter location.
-  // so we first use top2 indice num to do batch conv, then
-  // do native conv (gemm) in center.
-  int bufferSize = subM ? indicePairTop2Size : indicePairMaxSize;
-  int maxKernelVolumePart = kernelVolume;
-  std::vector<std::pair<int, int>> part_ranges = {{0, kernelVolume}};
-  filters = filters.view({kernelVolume, numInPlanes, numOutPlanes});
-  if (subM) {
-    maxKernelVolumePart = std::max(indicePairMaxOffset,
-                                   int(kernelVolume - indicePairMaxOffset - 1));
-    part_ranges = {{0, indicePairMaxOffset},
-                   {indicePairMaxOffset + 1, kernelVolume}};
-    torch::mm_out(output, features, filters[indicePairMaxOffset]);
-    if (indicePairTop2Size == 0) {
-      return output;
-    }
-  }
-  torch::Tensor inputBuffer =
-      torch::empty({maxKernelVolumePart, bufferSize, numInPlanes}, options);
-  torch::Tensor outputBuffer =
-      torch::empty({maxKernelVolumePart, bufferSize, numOutPlanes}, options);
-  for (auto &range : part_ranges) {
-    int start = range.first;
-    int end = range.second;
-    int length = end - start;
-    // widen before multiplying so the element count cannot overflow int
-    int64_t size = static_cast<int64_t>(length) * bufferSize;
-    auto inputBufferPart = tv::torch_slice_first_axis(inputBuffer, 0, length);
-    auto outputBufferPart = tv::torch_slice_first_axis(outputBuffer, 0, length);
-    auto indicePairs1Part =
-        tv::torch_slice_first_axis(indicePairs[inverse], start, end);
-    auto indicePairs2Part =
-        tv::torch_slice_first_axis(indicePairs[!inverse], start, end);
-    auto filtersPart = tv::torch_slice_first_axis(filters, start, end);
-    if (device == torch::kCPU) {
-      TV_THROW_INVALID_ARG("batch conv algorithms are not implemented for CPU");
-    }
-#ifdef TV_CUDA
-    else if (device == torch::kCUDA) {
-      batch_sparse_gather_cuda(inputBufferPart, features, indicePairs1Part,
-                               size);
-    }
-#endif
-    else {
-      TV_THROW_INVALID_ARG("unknown device type");
-    }
-    torch::bmm_out(outputBufferPart, inputBufferPart, filtersPart);
-    if (batchScatter) {
-      if (device == torch::kCPU) {
-        TV_THROW_INVALID_ARG(
-            "batch conv algorithms are not implemented for CPU");
-      }
-#ifdef TV_CUDA
-      else if (device == torch::kCUDA) {
-        batch_sparse_scatter_add_cuda(outputBufferPart, output,
-                                      indicePairs2Part, size);
-      }
-#endif
-      else {
-        TV_THROW_INVALID_ARG("unknown device type");
-      }
-    } else {
-      // scatter offset by offset, skipping offsets without any pair
-      for (int i = 0; i < length; ++i) {
-        auto nHot = indicePairNumCpu.data_ptr<int>()[i + start];
-        if (nHot <= 0) {
-          continue;
-        }
-        if (device == torch::kCPU) {
-          sparse_scatter_add_cpu(outputBufferPart[i], output,
-                                 indicePairs2Part[i], nHot);
-        }
-#ifdef TV_CUDA
-        else if (device == torch::kCUDA) {
-          sparse_scatter_add_cuda(outputBufferPart[i], output,
-                                  indicePairs2Part[i], nHot);
-        }
-#endif
-        else {
-          TV_THROW_INVALID_ARG("unknown device type");
-        }
-      }
-    }
-  }
-  return output;
-}
-// Backward pass of indiceConv.
-//
-// For algo == kBatch / kBatchGemmGather with more than one kernel offset the
-// call is forwarded to indiceConvBackwardBatch. Otherwise every kernel offset
-// is handled in turn: gather the paired rows of features and outGrad, compute
-// that offset's filter gradient and the input-gradient contribution with two
-// mm calls, and scatter-add the contribution into inputGrad.
-//
-// features, filters, indicePairs, indiceNum, _inverse, _subM, algo have the
-// same meaning as in indiceConv; outGrad is the gradient w.r.t. its output.
-//
-// Returns {inputGrad, filtersGrad}: inputGrad has the shape of features,
-// filtersGrad the original shape of filters. Filter slices of offsets that
-// have no pairs are zero. Throws on an unknown algo or device type.
-std::vector<torch::Tensor>
-indiceConvBackward(torch::Tensor features, torch::Tensor filters,
-                   torch::Tensor outGrad, torch::Tensor indicePairs,
-                   torch::Tensor indiceNum, int64_t _inverse, int64_t _subM,
-                   int64_t algo) {
-  auto kernelVolume = indiceNum.size(0);
-  switch (algo) {
-  case kBatchGemmGather:
-  case kBatch: {
-    if (kernelVolume != 1) {
-      return indiceConvBackwardBatch(features, filters, outGrad, indicePairs,
-                                     indiceNum, _inverse, _subM,
-                                     algo != kBatchGemmGather);
-    } else {
-      break;
-    }
-  }
-  case kNative:
-    break;
-  default:
-    TV_THROW_RT_ERR("unknown algo");
-  }
-  bool subM = _subM != 0;
-  bool inverse = _inverse != 0;
-  auto device = features.device().type();
-  auto ndim = filters.dim() - 2;
-  auto numInPlanes = features.size(1);
-  auto numOutPlanes = filters.size(ndim + 1);
-  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
-  const int *pairCounts = indicePairNumCpu.data_ptr<int>();
-  auto options =
-      torch::TensorOptions().dtype(features.dtype()).device(features.device());
-  auto filterShape = filters.sizes();
-  torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
-  // zeros, not empty: slices of offsets that are skipped below (no pairs, or
-  // the subM early return) are never written and must not expose
-  // uninitialised memory as a gradient.
-  torch::Tensor filtersGrad = torch::zeros(filterShape, options);
-  filters = filters.view({-1, numInPlanes, numOutPlanes});
-  filtersGrad = filtersGrad.view({-1, numInPlanes, numOutPlanes});
-  // init for subM
-  int indicePairMaxOffset = kernelVolume / 2;
-  int indicePairMaxSize = 0;
-  if (subM) {
-    // center offset: dense gradients, no gather/scatter needed
-    auto filterGradSub = filtersGrad[indicePairMaxOffset];
-    torch::mm_out(filterGradSub, features.t(), outGrad);
-    torch::mm_out(inputGrad, outGrad, filters[indicePairMaxOffset].t());
-    // get indice pair second max size based on subM symmetric property
-    indicePairMaxSize =
-        *std::max_element(pairCounts, pairCounts + indicePairMaxOffset);
-    if (indicePairMaxSize == 0) {
-      return {inputGrad, filtersGrad.view(filterShape)};
-    }
-  } else {
-    indicePairMaxSize =
-        *std::max_element(pairCounts, pairCounts + kernelVolume);
-  }
-  // scratch buffers sized for the largest offset; reused by every offset
-  torch::Tensor inputBuffer =
-      torch::empty({indicePairMaxSize, numInPlanes}, options);
-  torch::Tensor outputBuffer =
-      torch::empty({indicePairMaxSize, numOutPlanes}, options);
-  for (int i = 0; i < kernelVolume; ++i) {
-    auto nHot = pairCounts[i];
-    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
-      continue;
-    }
-    if (device == torch::kCPU) {
-      sparse_gather_cpu(inputBuffer, features, indicePairs[inverse][i], nHot);
-      sparse_gather_cpu(outputBuffer, outGrad, indicePairs[!inverse][i], nHot);
-    }
-#ifdef TV_CUDA
-    else if (device == torch::kCUDA) {
-      sparse_gather_cuda(inputBuffer, features, indicePairs[inverse][i], nHot);
-      sparse_gather_cuda(outputBuffer, outGrad, indicePairs[!inverse][i], nHot);
-    }
-#endif
-    else {
-      TV_THROW_INVALID_ARG("unknown device type");
-    }
-    auto filterGradSub = filtersGrad[i];
-    // views over the first nHot rows of the scratch buffers
-    auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr(),
-                                             {nHot, numOutPlanes}, options);
-    auto inputBufferBlob =
-        torch::from_blob(inputBuffer.data_ptr(), {nHot, numInPlanes}, options);
-    torch::mm_out(filterGradSub, inputBufferBlob.t(), outputBufferBlob);
-    // inputBuffer is overwritten with the input-gradient contribution
-    torch::mm_out(inputBufferBlob, outputBufferBlob, filters[i].t());
-    if (device == torch::kCPU) {
-      sparse_scatter_add_cpu(inputBuffer, inputGrad, indicePairs[inverse][i],
-                             nHot);
-    }
-#ifdef TV_CUDA
-    else if (device == torch::kCUDA) {
-      sparse_scatter_add_cuda(inputBuffer, inputGrad, indicePairs[inverse][i],
-                              nHot);
-    }
-#endif
-    else {
-      TV_THROW_INVALID_ARG("unknown device type");
-    }
-  }
-  return {inputGrad, filtersGrad.view(filterShape)};
-}
-// Batched backward pass of the sparse convolution (CUDA only).
-//
-// Mirrors indiceConvBatch: for every part of the kernel offsets the paired
-// rows of features and outGrad are gathered into batched buffers, the filter
-// gradients and input-gradient contributions are computed with two bmm calls
-// and the contributions are scatter-added into inputGrad. For subM the
-// offset with the largest pair count is handled densely with plain mm calls.
-//
-// features, filters, outGrad, indicePairs, indiceNum, _inverse, _subM have
-// the same meaning as in indiceConvBackward. indiceNum must hold more than
-// one entry.
-// batchScatter: true  -> one batched scatter-add per part,
-//               false -> one scatter-add per kernel offset.
-//
-// Returns {inputGrad, filtersGrad}: inputGrad has the shape of features,
-// filtersGrad the original shape of filters. Throws when features are not on
-// a CUDA device.
-std::vector<torch::Tensor>
-indiceConvBackwardBatch(torch::Tensor features, torch::Tensor filters,
-                        torch::Tensor outGrad, torch::Tensor indicePairs,
-                        torch::Tensor indiceNum, int64_t _inverse,
-                        int64_t _subM, bool batchScatter) {
-  bool subM = _subM != 0;
-  bool inverse = _inverse != 0;
-  auto device = features.device().type();
-  auto ndim = filters.dim() - 2;
-  auto kernelVolume = indiceNum.size(0);
-  TV_ASSERT_INVALID_ARG(kernelVolume > 1, "error");
-  auto numInPlanes = features.size(1);
-  auto numOutPlanes = filters.size(ndim + 1);
-  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
-  // work on a copy: nth_element below reorders it
-  auto indicePairNumVec =
-      std::vector<int>(indicePairNumCpu.data_ptr<int>(),
-                       indicePairNumCpu.data_ptr<int>() + kernelVolume);
-  auto indicePairMaxSizeIter =
-      std::max_element(indicePairNumVec.begin(), indicePairNumVec.end());
-  int indicePairMaxOffset = indicePairMaxSizeIter - indicePairNumVec.begin();
-  int indicePairMaxSize = *indicePairMaxSizeIter;
-  // second largest pair count ends up at index 1
-  std::nth_element(indicePairNumVec.begin(), indicePairNumVec.begin() + 1,
-                   indicePairNumVec.end(), std::greater<int>());
-  int indicePairTop2Size = indicePairNumVec[1];
-  auto options =
-      torch::TensorOptions().dtype(features.dtype()).device(features.device());
-  auto filterShape = filters.sizes();
-  torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
-  torch::Tensor filtersGrad = torch::zeros(filterShape, options);
-  int bufferSize = subM ? indicePairTop2Size : indicePairMaxSize;
-  filters = filters.view({-1, numInPlanes, numOutPlanes});
-  filtersGrad = filtersGrad.view({-1, numInPlanes, numOutPlanes});
-  std::vector<std::pair<int, int>> part_ranges = {{0, kernelVolume}};
-  int maxKernelVolumePart = kernelVolume;
-  if (subM) {
-    maxKernelVolumePart = std::max(indicePairMaxOffset,
-                                   int(kernelVolume - indicePairMaxOffset - 1));
-    part_ranges = {{0, indicePairMaxOffset},
-                   {indicePairMaxOffset + 1, kernelVolume}};
-    // largest offset: dense gradients, no gather/scatter needed
-    auto filtersGradSub = filtersGrad[indicePairMaxOffset];
-    auto filtersSub = filters[indicePairMaxOffset];
-    torch::mm_out(filtersGradSub, features.t(), outGrad);
-    torch::mm_out(inputGrad, outGrad, filtersSub.t());
-    if (indicePairTop2Size == 0) {
-      return {inputGrad, filtersGrad.view(filterShape)};
-    }
-  }
-  torch::Tensor inputBuffer =
-      torch::zeros({maxKernelVolumePart, bufferSize, numInPlanes}, options);
-  torch::Tensor outputBuffer =
-      torch::zeros({maxKernelVolumePart, bufferSize, numOutPlanes}, options);
-  for (auto &range : part_ranges) {
-    int start = range.first;
-    int end = range.second;
-    int length = end - start;
-    // widen before multiplying so the element count cannot overflow int
-    int64_t size = static_cast<int64_t>(length) * bufferSize;
-    auto inputBufferPart = tv::torch_slice_first_axis(inputBuffer, 0, length);
-    auto outputBufferPart = tv::torch_slice_first_axis(outputBuffer, 0, length);
-    auto indicePairs1Part =
-        tv::torch_slice_first_axis(indicePairs[inverse], start, end);
-    auto indicePairs2Part =
-        tv::torch_slice_first_axis(indicePairs[!inverse], start, end);
-    auto filtersPart = tv::torch_slice_first_axis(filters, start, end);
-    auto filtersGradPart = tv::torch_slice_first_axis(filtersGrad, start, end);
-    if (device == torch::kCPU) {
-      TV_THROW_INVALID_ARG("batch conv algorithms are not implemented for CPU");
-    }
-#ifdef TV_CUDA
-    else if (device == torch::kCUDA) {
-      batch_sparse_gather_cuda(inputBufferPart, features, indicePairs1Part,
-                               size);
-      batch_sparse_gather_cuda(outputBufferPart, outGrad, indicePairs2Part,
-                               size);
-    }
-#endif
-    else {
-      TV_THROW_INVALID_ARG("unknown device type");
-    }
-    // filters: KV, I, O, inputBuffer: [KV, buffer, I]
-    // outputBuffer: [KV, buffer, O]
-    torch::bmm_out(filtersGradPart, inputBufferPart.permute({0, 2, 1}),
-                   outputBufferPart);
-    // Write into the part view, not the whole buffer: the product has
-    // `length` batches, and targeting inputBuffer would resize the shared
-    // buffer whenever length < maxKernelVolumePart.
-    torch::bmm_out(inputBufferPart, outputBufferPart,
-                   filtersPart.permute({0, 2, 1}));
-    if (batchScatter) {
-      if (device == torch::kCPU) {
-        TV_THROW_INVALID_ARG(
-            "batch conv algorithms are not implemented for CPU");
-      }
-#ifdef TV_CUDA
-      else if (device == torch::kCUDA) {
-        batch_sparse_scatter_add_cuda(inputBufferPart, inputGrad,
-                                      indicePairs1Part, size);
-      }
-#endif
-      else {
-        TV_THROW_INVALID_ARG("unknown device type");
-      }
-    } else {
-      // scatter offset by offset, skipping offsets without any pair
-      for (int i = 0; i < length; ++i) {
-        auto nHot = indicePairNumCpu.data_ptr<int>()[i + start];
-        if (nHot <= 0) {
-          continue;
-        }
-        if (device == torch::kCPU) {
-          sparse_scatter_add_cpu(inputBufferPart[i], inputGrad,
-                                 indicePairs1Part[i], nHot);
-        }
-#ifdef TV_CUDA
-        else if (device == torch::kCUDA) {
-          sparse_scatter_add_cuda(inputBufferPart[i], inputGrad,
-                                  indicePairs1Part[i], nHot);
-        }
-#endif
-        else {
-          TV_THROW_INVALID_ARG("unknown device type");
-        }
-      }
-    }
-  }
-  return {inputGrad, filtersGrad.view(filterShape)};
-}
-} // namespace spconv
--- a/src/utils/CMakeLists.txt
+++ b/src/utils/CMakeLists.txt
-# Static CUDA NMS helper library; only built when CUDA support is enabled.
-if (SPCONV_BuildCUDA)
-    add_library(spconv_nms STATIC nms.cu)
-    target_include_directories(spconv_nms PRIVATE ${ALL_INCLUDE})
-    set_target_properties(spconv_nms PROPERTIES
-        SOVERSION 1
-        CXX_STANDARD 14
-        CUDA_STANDARD 14
-        POSITION_INDEPENDENT_CODE ON
-        VERSION ${PROJECT_VERSION})
-    target_link_libraries(spconv_nms ${CUDA_CUDART})
-    install (TARGETS spconv_nms DESTINATION lib)
-endif()
-
-# Python extension module exposing the utility functions through pybind11.
-add_library(spconv_utils SHARED all.cc)
-target_include_directories(spconv_utils PRIVATE
-    ${ALL_INCLUDE}
-    ${PROJECT_SOURCE_DIR}/third_party/pybind11/include)
-# PREFIX/SUFFIX make the output file name importable as a Python module.
-set_target_properties(spconv_utils PROPERTIES
-    SOVERSION 1
-    CXX_STANDARD 14
-    CUDA_STANDARD 14
-    PREFIX "${PYTHON_MODULE_PREFIX}"
-    SUFFIX "${PYTHON_MODULE_EXTENSION}"
-    VERSION ${PROJECT_VERSION})
-if (SPCONV_BuildCUDA)
-    target_link_libraries(spconv_utils ${CUDA_CUDART} pybind11::module spconv_nms)
-else()
-    target_link_libraries(spconv_utils pybind11::module)
-endif()
-install (TARGETS spconv_utils DESTINATION lib)
--- a/src/utils/all.cc
+++ b/src/utils/all.cc
-// Copyright 2019 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include <spconv/box_iou.h>
-#include <spconv/nms.h>
-#include <spconv/point2voxel.h>
-namespace py = pybind11;
-using namespace pybind11::literals;
-// Python bindings of the spconv utility functions (NMS, rotated box IoU and
-// point-to-voxel conversion). Every function is registered once per
-// supported scalar type so that pybind11 overload resolution can select the
-// instantiation matching the dtype of the arrays passed in. The CUDA NMS
-// entry points are only bound when TV_CUDA is defined.
-PYBIND11_MODULE(spconv_utils, m) {
-  m.doc() = "util pybind11 functions for spconv";
-#ifdef TV_CUDA
-  m.def("non_max_suppression", &spconv::non_max_suppression<double>,
-        py::return_value_policy::reference_internal, "bbox iou", "boxes"_a = 1,
-        "keep_out"_a = 2, "nms_overlap_thresh"_a = 3, "device_id"_a = 4);
-  m.def("non_max_suppression", &spconv::non_max_suppression<float>,
-        py::return_value_policy::reference_internal, "bbox iou", "boxes"_a = 1,
-        "keep_out"_a = 2, "nms_overlap_thresh"_a = 3, "device_id"_a = 4);
-#endif
-  m.def("non_max_suppression_cpu", &spconv::non_max_suppression_cpu<double>,
-        py::return_value_policy::reference_internal, "bbox iou", "boxes"_a = 1,
-        "order"_a = 2, "nms_overlap_thresh"_a = 3, "eps"_a = 4);
-  m.def("non_max_suppression_cpu", &spconv::non_max_suppression_cpu<float>,
-        py::return_value_policy::reference_internal, "bbox iou", "boxes"_a = 1,
-        "order"_a = 2, "nms_overlap_thresh"_a = 3, "eps"_a = 4);
-  m.def("rotate_non_max_suppression_cpu",
-        &spconv::rotate_non_max_suppression_cpu<float>,
-        py::return_value_policy::reference_internal, "bbox iou",
-        "box_corners"_a = 1, "order"_a = 2, "standup_iou"_a = 3,
-        "thresh"_a = 4);
-  m.def("rotate_non_max_suppression_cpu",
-        &spconv::rotate_non_max_suppression_cpu<double>,
-        py::return_value_policy::reference_internal, "bbox iou",
-        "box_corners"_a = 1, "order"_a = 2, "standup_iou"_a = 3,
-        "thresh"_a = 4);
-  m.def("rbbox_iou", &spconv::rbbox_iou<double>,
-        py::return_value_policy::reference_internal, "rbbox iou",
-        "box_corners"_a = 1, "qbox_corners"_a = 2, "standup_iou"_a = 3,
-        "standup_thresh"_a = 4);
-  m.def("rbbox_iou", &spconv::rbbox_iou<float>,
-        py::return_value_policy::reference_internal, "rbbox iou",
-        "box_corners"_a = 1, "qbox_corners"_a = 2, "standup_iou"_a = 3,
-        "standup_thresh"_a = 4);
-  m.def("rbbox_intersection", &spconv::rbbox_intersection<double>,
-        py::return_value_policy::reference_internal, "rbbox iou",
-        "box_corners"_a = 1, "qbox_corners"_a = 2, "standup_iou"_a = 3,
-        "standup_thresh"_a = 4);
-  m.def("rbbox_intersection", &spconv::rbbox_intersection<float>,
-        py::return_value_policy::reference_internal, "rbbox iou",
-        "box_corners"_a = 1, "qbox_corners"_a = 2, "standup_iou"_a = 3,
-        "standup_thresh"_a = 4);
-  m.def("points_to_voxel_3d_np", &spconv::points_to_voxel_3d_np<float, 3>,
-        "matrix tensor_square", "points"_a = 1, "voxels"_a = 2,
-        "voxel_point_mask"_a = 3, "coors"_a = 4, "num_points_per_voxel"_a = 5,
-        "coor_to_voxelidx"_a = 6, "voxel_size"_a = 7, "coors_range"_a = 8,
-        "max_points"_a = 9, "max_voxels"_a = 10);
-  m.def("points_to_voxel_3d_np", &spconv::points_to_voxel_3d_np<double, 3>,
-        "matrix tensor_square", "points"_a = 1, "voxels"_a = 2,
-        "voxel_point_mask"_a = 3, "coors"_a = 4, "num_points_per_voxel"_a = 5,
-        "coor_to_voxelidx"_a = 6, "voxel_size"_a = 7, "coors_range"_a = 8,
-        "max_points"_a = 9, "max_voxels"_a = 10);
-  m.def("points_to_voxel_3d_np_mean",
-        &spconv::points_to_voxel_3d_np_mean<float, 3>, "matrix tensor_square",
-        "points"_a = 1, "voxels"_a = 2, "voxel_point_mask"_a = 3, "means"_a = 4,
-        "coors"_a = 5, "num_points_per_voxel"_a = 6, "coor_to_voxelidx"_a = 7,
-        "voxel_size"_a = 8, "coors_range"_a = 9, "max_points"_a = 10,
-        "max_voxels"_a = 11);
-  m.def("points_to_voxel_3d_np_mean",
-        &spconv::points_to_voxel_3d_np_mean<double, 3>, "matrix tensor_square",
-        "points"_a = 1, "voxels"_a = 2, "voxel_point_mask"_a = 3, "means"_a = 4,
-        "coors"_a = 5, "num_points_per_voxel"_a = 6, "coor_to_voxelidx"_a = 7,
-        "voxel_size"_a = 8, "coors_range"_a = 9, "max_points"_a = 10,
-        "max_voxels"_a = 11);
-  m.def("points_to_voxel_3d_with_filtering",
-        &spconv::points_to_voxel_3d_with_filtering<float, 3>,
-        "matrix tensor_square", "points"_a = 1, "voxels"_a = 2,
-        "voxel_point_mask"_a = 3, "voxel_mask"_a = 4, "mins"_a = 5,
-        "maxs"_a = 6, "coors"_a = 7, "num_points_per_voxel"_a = 8,
-        "coor_to_voxelidx"_a = 9, "voxel_size"_a = 10, "coors_range"_a = 11,
-        "max_points"_a = 12, "max_voxels"_a = 13, "block_factor"_a = 14,
-        "block_size"_a = 15, "height_threshold"_a = 16,
-        "height_high_threshold"_a = 17);
-  // double overload; this was a second registration of the float
-  // instantiation, leaving double inputs without a matching overload.
-  m.def("points_to_voxel_3d_with_filtering",
-        &spconv::points_to_voxel_3d_with_filtering<double, 3>,
-        "matrix tensor_square", "points"_a = 1, "voxels"_a = 2,
-        "voxel_point_mask"_a = 3, "voxel_mask"_a = 4, "mins"_a = 5,
-        "maxs"_a = 6, "coors"_a = 7, "num_points_per_voxel"_a = 8,
-        "coor_to_voxelidx"_a = 9, "voxel_size"_a = 10, "coors_range"_a = 11,
-        "max_points"_a = 12, "max_voxels"_a = 13, "block_factor"_a = 14,
-        "block_size"_a = 15, "height_threshold"_a = 16,
-        "height_high_threshold"_a = 17);
-}
\ No newline at end of file
--- a/src/utils/nms.cu
+++ b/src/utils/nms.cu
-// ------------------------------------------------------------------
-// Deformable Convolutional Networks
-// Copyright (c) 2015 Microsoft
-// Licensed under The MIT License
-// Modified from MATLAB Faster R-CNN
-// (https://github.com/shaoqingren/faster_rcnn)
-// ------------------------------------------------------------------
-#include <cuda_runtime.h>
-#include <iostream>
-#include <spconv/nms_gpu.h>
-#include <vector>
-#define CUDA_CHECK(condition)                                                  \
-  /* Code block avoids redefinition of cudaError_t error */                    \
-  do {                                                                         \
-    cudaError_t error = condition;                                             \
-    if (error != cudaSuccess) {                                                \
-      std::cout << cudaGetErrorString(error) << std::endl;                     \
-    }                                                                          \
-  } while (0)
-#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
-int const threadsPerBlock = sizeof(unsigned long long) * 8;
-// Intersection-over-union of two axis-aligned boxes whose first four values
-// are read as (left, top, right, bottom). Widths and heights use the
-// inclusive "+ 1" convention; a negative overlap extent is clamped to zero.
-template <typename DType>
-__device__ inline DType devIoU(DType const *const a, DType const *const b) {
-  const DType x_lo = max(a[0], b[0]);
-  const DType x_hi = min(a[2], b[2]);
-  const DType y_lo = max(a[1], b[1]);
-  const DType y_hi = min(a[3], b[3]);
-  const DType overlap_w = max(x_hi - x_lo + 1, 0.f);
-  const DType overlap_h = max(y_hi - y_lo + 1, 0.f);
-  const DType intersection = overlap_w * overlap_h;
-  const DType area_a = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
-  const DType area_b = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
-  return intersection / (area_a + area_b - intersection);
-}
-// Computes one tile of the pairwise suppression mask.
-//
-// Block (x = column tile, y = row tile) compares up to BLOCK_THREADS "row"
-// boxes against up to BLOCK_THREADS "column" boxes. Each thread owns one row
-// box and writes a 64-bit word whose bit j is set when the IoU with column
-// box j of the tile exceeds nms_overlap_thresh. On the diagonal tile only
-// column boxes after the row box are tested.
-//
-// dev_boxes is read with a stride of 5 values per box; dev_mask is indexed
-// as [box index * number of column tiles + column tile].
-template <typename DType, int BLOCK_THREADS>
-__global__ void nms_kernel(const int n_boxes, const DType nms_overlap_thresh,
-                           const DType *dev_boxes,
-                           unsigned long long *dev_mask) {
-  const int tile_row = blockIdx.y;
-  const int tile_col = blockIdx.x;
-  const unsigned int lane = threadIdx.x;
-  // the last tile in each direction may be only partially filled
-  const int rows_in_tile =
-      min(n_boxes - tile_row * BLOCK_THREADS, BLOCK_THREADS);
-  const int cols_in_tile =
-      min(n_boxes - tile_col * BLOCK_THREADS, BLOCK_THREADS);
-  // stage the column boxes of this tile in shared memory
-  __shared__ DType tile_boxes[BLOCK_THREADS * 5];
-  if (lane < cols_in_tile) {
-    const DType *src = dev_boxes + (BLOCK_THREADS * tile_col + lane) * 5;
-#pragma unroll
-    for (int k = 0; k < 5; ++k) {
-      tile_boxes[lane * 5 + k] = src[k];
-    }
-  }
-  __syncthreads();
-  // threads without a row box are done once the staging barrier has passed
-  if (!(lane < rows_in_tile)) {
-    return;
-  }
-  const int row_box_idx = BLOCK_THREADS * tile_row + lane;
-  const DType *row_box = dev_boxes + row_box_idx * 5;
-  int first_col = 0;
-  if (tile_row == tile_col) {
-    first_col = lane + 1;
-  }
-  unsigned long long suppressed = 0;
-  for (int j = first_col; j < cols_in_tile; j++) {
-    if (devIoU(row_box, tile_boxes + j * 5) > nms_overlap_thresh) {
-      suppressed |= 1ULL << j;
-    }
-  }
-  const int col_tiles = DIVUP(n_boxes, BLOCK_THREADS);
-  dev_mask[row_box_idx * col_tiles + tile_col] = suppressed;
-}
-// Makes device_id the current CUDA device unless it already is.
-// CUDA errors are only reported by CUDA_CHECK, not propagated.
-void _set_device(int device_id) {
-  // -1 never matches a valid id, so a failed cudaGetDevice (which leaves the
-  // variable untouched) falls through to cudaSetDevice instead of comparing
-  // against an uninitialised value.
-  int current_device = -1;
-  CUDA_CHECK(cudaGetDevice(&current_device));
-  if (current_device == device_id) {
-    return;
-  }
-  // The call to cudaSetDevice must come before any calls to Get, which
-  // may perform initialization using the GPU.
-  CUDA_CHECK(cudaSetDevice(device_id));
-}
-// Greedy non-maximum suppression with the pairwise IoU test done on the GPU.
-//
-// keep_out:   receives the indices of the kept boxes; must have room for
-//             boxes_num entries.
-// boxes_host: host array of boxes_num * boxes_dim values. The kernel reads
-//             boxes with a stride of 5 values -- NOTE(review): boxes_dim is
-//             presumably expected to be 5; confirm against callers.
-// nms_overlap_thresh: IoU above which a later box is suppressed by an
-//             earlier kept one.
-// device_id:  CUDA device to run on.
-//
-// Boxes are visited in index order, so callers decide the priority through
-// the order of boxes_host. Returns the number of indices written to keep_out.
-// CUDA errors are reported by CUDA_CHECK but not propagated.
-template <typename DType, int BLOCK_THREADS>
-int _nms_gpu(int *keep_out, const DType *boxes_host, int boxes_num,
-             int boxes_dim, DType nms_overlap_thresh, int device_id) {
-  _set_device(device_id);
-  // nothing to suppress; also avoids a zero-sized grid launch and indexing
-  // into empty host vectors below
-  if (boxes_num <= 0) {
-    return 0;
-  }
-  const int col_blocks = DIVUP(boxes_num, BLOCK_THREADS);
-  // size arithmetic is done in size_t so it cannot overflow int
-  const size_t boxes_bytes =
-      static_cast<size_t>(boxes_num) * boxes_dim * sizeof(DType);
-  const size_t mask_words = static_cast<size_t>(boxes_num) * col_blocks;
-  const size_t mask_bytes = mask_words * sizeof(unsigned long long);
-  DType *boxes_dev = NULL;
-  unsigned long long *mask_dev = NULL;
-  CUDA_CHECK(cudaMalloc(&boxes_dev, boxes_bytes));
-  CUDA_CHECK(cudaMemcpy(boxes_dev, boxes_host, boxes_bytes,
-                        cudaMemcpyHostToDevice));
-  CUDA_CHECK(cudaMalloc(&mask_dev, mask_bytes));
-  dim3 blocks(col_blocks, col_blocks);
-  dim3 threads(BLOCK_THREADS);
-  nms_kernel<DType, BLOCK_THREADS>
-      <<<blocks, threads>>>(boxes_num, nms_overlap_thresh, boxes_dev, mask_dev);
-  // surface launch-configuration errors, which the launch itself never
-  // returns
-  CUDA_CHECK(cudaGetLastError());
-  std::vector<unsigned long long> mask_host(mask_words);
-  CUDA_CHECK(cudaMemcpy(mask_host.data(), mask_dev, mask_bytes,
-                        cudaMemcpyDeviceToHost));
-  // remv: one bit per box, set once the box has been suppressed
-  std::vector<unsigned long long> remv(col_blocks, 0);
-  int num_to_keep = 0;
-  for (int i = 0; i < boxes_num; i++) {
-    int nblock = i / BLOCK_THREADS;
-    int inblock = i % BLOCK_THREADS;
-    if (!(remv[nblock] & (1ULL << inblock))) {
-      keep_out[num_to_keep++] = i;
-      // a kept box suppresses every later box it overlaps too much with
-      const unsigned long long *p =
-          mask_host.data() + static_cast<size_t>(i) * col_blocks;
-      for (int j = nblock; j < col_blocks; j++) {
-        remv[j] |= p[j];
-      }
-    }
-  }
-  CUDA_CHECK(cudaFree(boxes_dev));
-  CUDA_CHECK(cudaFree(mask_dev));
-  return num_to_keep;
-}
-// Explicit instantiations of _nms_gpu for float and double boxes, both with
-// BLOCK_THREADS = threadsPerBlock.
-template int _nms_gpu<float, threadsPerBlock>(
-    int *keep_out, const float *boxes_host, int boxes_num, int boxes_dim,
-    float nms_overlap_thresh, int device_id);
-template int _nms_gpu<double, threadsPerBlock>(
-    int *keep_out, const double *boxes_host, int boxes_num, int boxes_dim,
-    double nms_overlap_thresh, int device_id);
\ No newline at end of file
--- a/test/aaa.py
+++ b/test/aaa.py
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Recorded measurements, one "TAG value" pair per line, in the format that
+# _handle_lines parses (tag, single space, float).
+# NOTE(review): values look like elapsed times in seconds and "BWG" is
+# presumably a benchmark stage label -- confirm with the author.
+STR = """
+BWG 0.0008761882781982422
+BWG 0.0008311271667480469
+BWG 0.002079486846923828
+BWG 0.002329587936401367
+BWG 0.0025458335876464844
+BWG 0.0026700496673583984
+BWG 0.002583742141723633
+BWG 0.0025262832641601562
+BWG 0.003481149673461914
+BWG 0.003238201141357422
+BWG 0.005095958709716797
+BWG 0.0037899017333984375
+BWG 0.003931283950805664
+BWG 0.003300189971923828
+"""
+# Unassigned string literal: evaluated and discarded when the module is
+# imported. Holds a second series of untagged values kept for reference.
+"""
+0.003921985626220703
+0.0049707889556884766
+0.0052530765533447266
+0.0060312747955322266
+0.0036766529083251953
+0.00421142578125
+0.002129793167114258
+0.0023038387298583984
+0.0013151168823242188
+0.0015285015106201172
+0.0008392333984375
+0.0008127689361572266
+0.0002486705780029297
+0.00030994415283203125
+"""
+# Recorded measurements, one "TAG value" pair per line, using the tags SUBM,
+# G, REGU and M in a repeating pattern.
+# NOTE(review): tags presumably name per-layer stages of a profiled run and
+# the values elapsed seconds -- confirm with the author.
+STR1 = """
+SUBM 0.00036716461181640625
+G 0.0010955333709716797
+G 0.0010745525360107422
+REGU 0.0006923675537109375
+M 0.0005242824554443359
+SUBM 0.0003108978271484375
+G 0.0010905265808105469
+G 0.0011067390441894531
+REGU 0.00058746337890625
+M 0.0005304813385009766
+SUBM 0.0002682209014892578
+G 0.0010945796966552734
+G 0.0011165142059326172
+REGU 0.0005419254302978516
+M 0.0005164146423339844
+SUBM 0.00021505355834960938
+G 0.0010805130004882812
+G 0.0010516643524169922
+REGU 0.00052642822265625
+M 0.0004677772521972656
+SUBM 0.0002262592315673828
+G 0.0010986328125
+G 0.0010256767272949219
+REGU 0.0005693435668945312
+M 0.00048661231994628906
+SUBM 0.0002319812774658203
+G 0.0011110305786132812
+G 0.0011196136474609375
+REGU 0.0005295276641845703
+M 0.0005729198455810547
+SUBM 0.00023889541625976562
+G 0.0005326271057128906
+G 0.0005140304565429688
+"""
+# Second set of labelled readings, using the same labels and line format as
+# STR1. Passed to print_str at the end of the file so the two per-label totals
+# can be compared.
+STR2 = """
+SUBM 0.0003352165222167969
+G 0.001149892807006836
+G 0.0017066001892089844
+REGU 0.0006349086761474609
+M 0.00048804283142089844
+SUBM 0.00029850006103515625
+G 0.001767873764038086
+G 0.0020656585693359375
+REGU 0.0005462169647216797
+M 0.0005753040313720703
+SUBM 0.0002789497375488281
+G 0.0012230873107910156
+G 0.0014438629150390625
+REGU 0.0005102157592773438
+M 0.0005676746368408203
+SUBM 0.00020241737365722656
+G 0.00102996826171875
+G 0.0011174678802490234
+REGU 0.0005424022674560547
+M 0.0005102157592773438
+SUBM 0.0001976490020751953
+G 0.0010385513305664062
+G 0.0010204315185546875
+REGU 0.0005321502685546875
+M 0.00047278404235839844
+SUBM 0.00021529197692871094
+G 0.0010280609130859375
+G 0.0010151863098144531
+REGU 0.0004942417144775391
+M 0.0004811286926269531
+SUBM 0.00020694732666015625
+G 0.0005142688751220703
+G 0.0005171298980712891
+"""
+def _handle_lines(s: str):
+    """Parse one "<label> <value>" line into a ``(label, value)`` tuple.
+
+    Args:
+        s: A single line whose fields are separated by single spaces.
+
+    Returns:
+        ``(first_field, float(last_field))``. When the line contains no
+        space, the first and last field are the same text.
+
+    Raises:
+        ValueError: If the last field cannot be converted by ``float``.
+    """
+    arr = s.split(" ")
+    return (arr[0], float(arr[-1]))
+from cumm.gemm.codeops import group_by
+def print_str(s: str):
+    """Print the per-label sum of the readings contained in ``s``.
+
+    Args:
+        s: Multi-line text with one "<label> <value>" pair per line; leading
+            and trailing whitespace is stripped before splitting on newlines.
+    """
+    nums = list(map(_handle_lines, s.strip().split("\n")))
+    # NOTE(review): group_by presumably returns a dict mapping each label to
+    # the list of (label, value) tuples sharing it -- confirm in
+    # cumm.gemm.codeops.
+    num_dict = group_by(lambda x: x[0], nums)
+    # Collapse every group to the total of its values.
+    num_dict_ = {k: sum([vv[1] for vv in v]) for k, v in num_dict.items()}
+    print(num_dict_)
+# Runs at import time: print the per-label totals of both labelled data sets.
+print_str(STR1)
+print_str(STR2)
\ No newline at end of file
--- a/test/benchmark.py
+++ b/test/benchmark.py
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import time
+from pathlib import Path
+import numpy as np
+import torch
+from torch import nn
+from cumm import tensorview as tv 
+import spconv.pytorch as spconv
+from spconv.utils import Point2VoxelCPU3d
+def waymo_data(batch_size=1):
+    """Voxelize the bundled point cloud and return inputs for the benchmark.
+
+    Loads ``data/benchmark-pc.npz`` from the directory containing this file,
+    runs its ``"pc"`` array through a ``Point2VoxelCPU3d`` generator and
+    prepends an all-zero column to the resulting voxel indices.
+
+    Args:
+        batch_size: Not referenced anywhere in the body.
+
+    Returns:
+        Tuple ``(voxels, coors, grid_size)``: the voxel array reshaped to
+        ``(-1, 3)``, the index array with the extra leading zero column, and
+        ``gen.grid_size``.
+    """
+    # NOTE(review): the positional arguments are presumably voxel size,
+    # point-cloud range, features per point, max voxels and max points per
+    # voxel -- confirm against Point2VoxelCPU3d.
+    gen = Point2VoxelCPU3d([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6], 3,
+                           150000, 1)
+    # gen = VoxelGeneratorV2([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6], 1,
+    #                        150000)
+    data = np.load(Path(__file__).parent / "data" / "benchmark-pc.npz")
+    pc = np.ascontiguousarray(data["pc"])
+    print(pc.shape)
+    # The third value returned by point_to_voxel is discarded.
+    voxels_tv, indices_tv, _ = gen.point_to_voxel(tv.from_numpy(pc))
+    voxels = voxels_tv.numpy().reshape(-1, 3)
+    coors = indices_tv.numpy()
+    N = coors.shape[0]
+    # Prepend a column of zeros to every index row.
+    # NOTE(review): presumably the batch index of a single sample -- confirm.
+    coors = np.concatenate([np.full([N, 1], 0, coors.dtype), coors], axis=1)
+    return voxels, coors, gen.grid_size
+class Net(nn.Module):
+    """Sparse 3D network used by the benchmark in ``main``.
+
+    Seven stages of two bias-free ``SubMConv3d`` layers with kernel size 3
+    (output channels 64, 96, 128, 160, 192, 224, 256), with a
+    ``SparseMaxPool3d(2, 2)`` between consecutive stages. Both layers of a
+    stage share one ``indice_key`` ("c0" to "c6").
+    """
+    def __init__(self, shape, algo):
+        """Build the layer stack and pre-allocate the index grid.
+
+        Args:
+            shape: Spatial shape; stored on the module and used as the
+                trailing dimensions of the pre-allocated grid.
+            algo: Forwarded as ``algo=`` to every ``SubMConv3d`` layer.
+        """
+        super().__init__()
+        self.net = spconv.SparseSequential(
+            spconv.SubMConv3d(3, 64, 3, bias=False, indice_key="c0",
+                              algo=algo),
+            # spconv.SubMConv3d(32,
+            #                   32,
+            #                   3,
+            #                   bias=False,
+            #                   indice_key="c0",
+            #                   algo=algo),
+            # # nn.BatchNorm1d(32),
+            # # nn.ReLU(),
+            # # spconv.SparseConv3d(64, 64, 2, 2, bias=False,
+            # #                   algo=algo),
+            # spconv.SubMConv3d(32, 64, 3, bias=False, indice_key="c0",
+            #                   algo=algo),
+            spconv.SubMConv3d(64,
+                              64,
+                              3,
+                              bias=False,
+                              indice_key="c0",
+                              algo=algo),
+            spconv.SparseMaxPool3d(2, 2),
+            spconv.SubMConv3d(64,
+                              96,
+                              3,
+                              bias=False,
+                              indice_key="c1",
+                              algo=algo),
+            spconv.SubMConv3d(96,
+                              96,
+                              3,
+                              bias=False,
+                              indice_key="c1",
+                              algo=algo),
+            # nn.BatchNorm1d(64),
+            # nn.ReLU(),
+            spconv.SparseMaxPool3d(2, 2),
+            spconv.SubMConv3d(96,
+                              128,
+                              3,
+                              bias=False,
+                              indice_key="c2",
+                              algo=algo),
+            spconv.SubMConv3d(128,
+                              128,
+                              3,
+                              bias=False,
+                              indice_key="c2",
+                              algo=algo),
+            # nn.BatchNorm1d(128),
+            # nn.ReLU(),
+            spconv.SparseMaxPool3d(2, 2),
+            spconv.SubMConv3d(128,
+                              160,
+                              3,
+                              bias=False,
+                              indice_key="c3",
+                              algo=algo),
+            spconv.SubMConv3d(160,
+                              160,
+                              3,
+                              bias=False,
+                              indice_key="c3",
+                              algo=algo),
+            # nn.BatchNorm1d(128),
+            # nn.ReLU(),
+            spconv.SparseMaxPool3d(2, 2),
+            spconv.SubMConv3d(160,
+                              192,
+                              3,
+                              bias=False,
+                              indice_key="c4",
+                              algo=algo),
+            spconv.SubMConv3d(192,
+                              192,
+                              3,
+                              bias=False,
+                              indice_key="c4",
+                              algo=algo),
+            # nn.BatchNorm1d(128),
+            # nn.ReLU(),
+            spconv.SparseMaxPool3d(2, 2),
+            spconv.SubMConv3d(192,
+                              224,
+                              3,
+                              bias=False,
+                              indice_key="c5",
+                              algo=algo),
+            spconv.SubMConv3d(224,
+                              224,
+                              3,
+                              bias=False,
+                              indice_key="c5",
+                              algo=algo),
+            # nn.BatchNorm1d(128),
+            # nn.ReLU(),
+            spconv.SparseMaxPool3d(2, 2),
+            spconv.SubMConv3d(224,
+                              256,
+                              3,
+                              bias=False,
+                              indice_key="c6",
+                              algo=algo),
+            spconv.SubMConv3d(256,
+                              256,
+                              3,
+                              bias=False,
+                              indice_key="c6",
+                              algo=algo),
+        )
+        max_batch_size = 1
+        # The grid (a dense map) is used for index generation; using a
+        # pre-allocated grid can run faster. It is an int32 tensor filled
+        # with -1, created once here and moved to the GPU.
+        self.grid = torch.full([max_batch_size, *shape], -1,
+                               dtype=torch.int32).cuda()
+        # self.grid = None
+        self.shape = shape
+    def forward(self, features, coors, batch_size):
+        """Wrap the inputs in a ``SparseConvTensor`` and run the layer stack.
+
+        Args:
+            features: Passed as the first argument of ``SparseConvTensor``.
+            coors: Passed as the second argument of ``SparseConvTensor``.
+            batch_size: Passed to ``SparseConvTensor`` after ``self.shape``.
+
+        Returns:
+            The result of ``self.net`` applied to the sparse tensor.
+        """
+        x = spconv.SparseConvTensor(features, coors, self.shape, batch_size,
+                                    self.grid)
+        return self.net(x)
+class Net2(nn.Module):
+    """Shallower, wider variant of ``Net``; not instantiated by ``main``.
+
+    Two stages of two bias-free ``SubMConv3d`` layers with kernel size 3
+    (output channels 256, then 512), separated by one
+    ``SparseMaxPool3d(2, 2)``. The stages use ``indice_key`` "c0" and "c1".
+    """
+    def __init__(self, shape, algo):
+        """Build the layer stack and pre-allocate the index grid.
+
+        Args:
+            shape: Spatial shape; stored on the module and used as the
+                trailing dimensions of the pre-allocated grid.
+            algo: Forwarded as ``algo=`` to every ``SubMConv3d`` layer.
+        """
+        super().__init__()
+        self.net = spconv.SparseSequential(
+            spconv.SubMConv3d(3, 256, 3, bias=False, indice_key="c0",
+                              algo=algo),
+            # spconv.SubMConv3d(32,
+            #                   32,
+            #                   3,
+            #                   bias=False,
+            #                   indice_key="c0",
+            #                   algo=algo),
+            # # nn.BatchNorm1d(32),
+            # # nn.ReLU(),
+            # # spconv.SparseConv3d(64, 64, 2, 2, bias=False,
+            # #                   algo=algo),
+            # spconv.SubMConv3d(32, 64, 3, bias=False, indice_key="c0",
+            #                   algo=algo),
+            spconv.SubMConv3d(256,
+                              256,
+                              3,
+                              bias=False,
+                              indice_key="c0",
+                              algo=algo),
+            # nn.BatchNorm1d(32),
+            # nn.ReLU(),
+            spconv.SparseMaxPool3d(2, 2),
+            spconv.SubMConv3d(256,
+                              512,
+                              3,
+                              bias=False,
+                              indice_key="c1",
+                              algo=algo),
+            spconv.SubMConv3d(512,
+                              512,
+                              3,
+                              bias=False,
+                              indice_key="c1",
+                              algo=algo),
+        )
+        max_batch_size = 1
+        # The grid (a dense map) is used for index generation; using a
+        # pre-allocated grid can run faster. It is an int32 tensor filled
+        # with -1, created once here and moved to the GPU.
+        self.grid = torch.full([max_batch_size, *shape], -1,
+                               dtype=torch.int32).cuda()
+        # self.grid = None
+        self.shape = shape
+    def forward(self, features, coors, batch_size):
+        """Wrap the inputs in a ``SparseConvTensor`` and run the layer stack.
+
+        Args:
+            features: Passed as the first argument of ``SparseConvTensor``.
+            coors: Passed as the second argument of ``SparseConvTensor``.
+            batch_size: Passed to ``SparseConvTensor`` after ``self.shape``.
+
+        Returns:
+            The result of ``self.net`` applied to the sparse tensor.
+        """
+        x = spconv.SparseConvTensor(features, coors, self.shape, batch_size,
+                                    self.grid)
+        return self.net(x)
+def main():
+    """Time the forward and backward passes of ``Net`` on a pickled sample.
+
+    Loads ``(voxels, coors, spatial_shape)`` from ``data/test_spconv.pkl``
+    next to this file, moves the arrays to the GPU, and prints the mean
+    forward time and the mean backward time. The first half of each timing
+    run is left out of the mean.
+    """
+    import pickle 
+    # Fixed seeds so the random upstream gradient is reproducible.
+    np.random.seed(50051)
+    torch.manual_seed(50051)
+    # voxels, coors, spatial_shape = waymo_data()
+    # with open("/home/yy/test_spconv.pkl", "wb") as f:
+    #     pickle.dump((voxels, coors, spatial_shape), f)
+    # NOTE(review): pickle.load can execute arbitrary code while loading;
+    # only run this against a trusted data file.
+    with open(Path(__file__).parent / "data" / "test_spconv.pkl", "rb") as f:
+        (voxels, coors, spatial_shape) = pickle.load(f)
+    print(spatial_shape)
+    print(voxels.shape)
+    # voxels = voxels[:100]
+    # coors = coors[:100]
+    dtype = torch.float32
+    voxels_th = torch.from_numpy(voxels).cuda().to(dtype)
+    coors_th = torch.from_numpy(coors).cuda().int()
+    # Gradients w.r.t. the input features are needed for the backward timing.
+    voxels_th.requires_grad = True
+    algo = spconv.ConvAlgo.Native
+    net = Net(spatial_shape, algo).cuda().eval().to(dtype)
+    print(coors_th.shape)
+    # One untimed forward pass; its output shape sizes the random gradient.
+    out = net(voxels_th, coors_th, 1)
+    print(out.spatial_shape)
+    print(voxels.mean(),  voxels.max(), voxels.min())
+    dout = np.random.uniform(-0.2, 0.2,
+                                out.features.shape).astype(np.float32)
+    dout_t = torch.from_numpy(dout).cuda().to(dtype)
+    print(out.spatial_shape, out.features.mean(),  out.features.max(),  out.features.min())
+    # Forward timing: 20 runs without autograd, synchronizing before and
+    # after each run so the measured interval covers the queued GPU work.
+    times = []
+    with torch.no_grad():
+        for i in range(20):
+            print("------------")
+            torch.cuda.synchronize()
+            t = time.time()
+            out_nograd = net(voxels_th, coors_th, 1)
+            torch.cuda.synchronize()
+            times.append(time.time() - t)
+    # The first 10 runs are treated as warm-up and excluded from the mean.
+    print("spconv time", np.mean(times[10:]))
+    # Backward timing: each iteration does a fresh forward pass, which is
+    # synchronized before the timer starts, so only backward() is measured.
+    times = []
+    for i in range(10):
+        out = net(voxels_th, coors_th, 1)
+        print("------------")
+        torch.cuda.synchronize()
+        t = time.time()
+        out.features.backward(dout_t)
+        torch.cuda.synchronize()
+        times.append(time.time() - t)
+    # print((net.grid == -1).float().sum(), net.grid.numel())
+    # print("spconv time", time.time() - t)
+    # The first 5 runs are treated as warm-up and excluded from the mean.
+    print("spconv bw time", np.mean(times[5:]))
+# Run the benchmark only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
--- a/test/data/benchmark-pc.npz
+++ b/test/data/benchmark-pc.npz
--- a/test/data/test_spconv.pkl
+++ b/test/data/test_spconv.pkl
--- a/test/src/catch_main.cpp
+++ b/test/src/catch_main.cpp
-// 000-CatchMain.cpp
-// In a Catch project with multiple files, dedicate one file to compile the
-// source code of Catch itself and reuse the resulting object file for linking.
-// Let Catch provide main():
-#define CATCH_CONFIG_MAIN
-#include "catch.hpp"
-// That's it
-// Compile implementation of Catch for use with files that do contain tests:
-// - g++ -std=c++11 -Wall -I$(CATCH_SINGLE_INCLUDE) -c 000-CatchMain.cpp
-// - cl -EHsc -I%CATCH_SINGLE_INCLUDE% -c 000-CatchMain.cpp
--- a/test/src/test_conv_rule.cpp
+++ b/test/src/test_conv_rule.cpp
-#include <algorithm>
-#include <iostream>
-#include <map>
-#include "catch.hpp"
-#include <prettyprint.h>
-#include <string>
-#include <vector>
-#include <exception>
-#include <numeric>
-#include <pybind11/embed.h> // everything needed for embedding
-#include <pybind11/functional.h>
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <tuple>
-#include <pybind11_utils.h>
-#include <spconv/spconv_ops.h>
-namespace py = pybind11;
-TEST_CASE("GetConvIndPair", "[SpConvNet]")
-{
-    using namespace py::literals;
-    py::scoped_interpreter guard{}; // start the interpreter and keep it alive
-    py::exec(R"(
-    from __future__ import print_function
-    import numpy as np
-    import math
-    # import spconv
-    # import torch
-    def get_convolution_output_size(input_size,
-                                    kernel_size,
-                                    stride,
-                                    padding=None,
-                                    rate=None):
-        ndim = len(input_size)
-        if padding is None:
-            padding = [0] * ndim
-        output_size = []
-        for i in range(ndim):
-            output_size.append((input_size[i] + 2 * padding[i] - (
-                (kernel_size[i] - 1) + 1)) // stride[i] + 1)
-        return output_size
-    def get_test_sparse_data(shape,
-                            num_points,
-                            num_channels,
-                            integer=False,
-                            dtype=np.float32):
-        dense_shape = shape
-        ndim = len(dense_shape)
-        # num_points = np.random.randint(10, 100, size=[batch_size, ndim])
-        num_points = np.array(num_points)
-        # num_points = np.array([3, 2])
-        batch_size = len(num_points)
-        batch_indices = []
-        coors_total = np.stack(
-            np.meshgrid(*[np.arange(0, s) for s in shape]), axis=-1)
-        coors_total = coors_total.reshape(-1, ndim)
-        for i in range(batch_size):
-            np.random.shuffle(coors_total)
-            inds_total = coors_total[:num_points[i]]
-            inds_total = np.pad(
-                inds_total, ((0, 0), (0, 1)), mode="constant", constant_values=i)
-            batch_indices.append(inds_total)
-        if integer:
-            sparse_data = np.random.randint(
-                20, 100, size=[num_points.sum(), num_channels]).astype(dtype)
-        else:
-            sparse_data = np.random.uniform(
-                -1, 1, size=[num_points.sum(), num_channels]).astype(dtype)
-        # sparse_data = np.arange(1, num_points.sum() + 1).astype(np.float32).reshape(5, 1)
-        dense_data = np.zeros(
-            [batch_size, num_channels, *dense_shape], dtype=sparse_data.dtype)
-        start = 0
-        for i, inds in enumerate(batch_indices):
-            for j, ind in enumerate(inds):
-                dense_slice = (i, slice(None), *ind[:-1])
-                dense_data[dense_slice] = sparse_data[start + j]
-            start += len(inds)
-        batch_indices = np.concatenate(batch_indices, axis=0)
-        return {
-            "features": sparse_data.astype(dtype),
-            "indices": batch_indices.astype(np.int32),
-            "features_dense": dense_data.astype(dtype),
-        }
-    shape = [50, 30, 30]
-    num_points = [5000] * 1
-    # np.random.seed(np.random.randint(1, 100000))
-    in_channels = 64
-    sparse_dict = get_test_sparse_data(shape, num_points, in_channels)
-    features = np.ascontiguousarray(sparse_dict["features"]).astype(np.float32)
-    indices = np.ascontiguousarray(sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
-    features_dense = sparse_dict["features_dense"]
-    # indices_t = torch.from_numpy(indices)
-    filters = np.random.uniform(0, 1, size=[3, 3, 3, 64, 64]).astype(np.float32)
-    # print(outids.shape)
-    )");
-    SECTION("DebugTest"){
-        auto inds = array2TensorView<int>(py::array(py::globals()["indices"]));
-        auto inds_tensor = torch::from_blob(inds.data(), {inds.dim(0), inds.dim(1)}, torch::dtype(torch::kInt32));
-        auto inds_gpu = inds_tensor.to(torch::Device(torch::kCPU));
-        auto features = array2TensorView<float>(py::array(py::globals()["features"]));
-        auto features_tensor = torch::from_blob(features.data(), {features.dim(0), features.dim(1)}, torch::dtype(torch::kFloat));
-        auto features_gpu = features_tensor.to(torch::Device(torch::kCUDA, 0));
-        auto filters = array2TensorView<float>(py::array(py::globals()["filters"]));
-        auto filters_tensor = torch::from_blob(filters.data(), {filters.dim(0), filters.dim(1), filters.dim(2), filters.dim(3), filters.dim(4)}, torch::dtype(torch::kFloat));
-        auto filters_gpu = filters_tensor.to(torch::Device(torch::kCUDA, 0));
-        auto outputs = spconv::getIndicePair<3>(inds_gpu, 1, {46, 26, 26}, {50, 30, 30}, {3, 3, 3},
-            {1, 1, 1}, {0, 0, 0}, {2, 2, 2}, {0, 0, 0}, 0, 0, 0);
-        // std::cout << outputs[2] << std::endl;
-        /*
-        auto output = spconv::indiceConv<float>(features_gpu, filters_gpu, outputs[1], outputs[2], outputs[0].size(0), false);
-        std::cout << output << std::endl;*/
-    }
-}
\ No newline at end of file
--- a/test/test_conv.py
+++ b/test/test_conv.py
-# Copyright 2019 Yan Yan
+# Copyright 2021 Yan Yan
-#
+# 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-#
+# 
 #     http://www.apache.org/licenses/LICENSE-2.0
-#
+# 
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -20,9 +20,9 @@ import numpy as np
 import torch
 from torch import nn
-import spconv
+import spconv.pytorch as spconv
 from spconv.test_utils import TestCase, generate_sparse_data, params_grid
+from spconv.constants import FILTER_HWIO
 # import sparseconvnet as scn
@@ -47,7 +47,6 @@ class SparseConv3dTestTorch(nn.Module):
                                padding=padding,
                                dilation=dilation,
                                bias=False,
-                                use_hash=False,
                                algo=algo)
        ]
        for i in range(1, num_layers):
@@ -59,7 +58,6 @@ class SparseConv3dTestTorch(nn.Module):
                                    padding=padding,
                                    dilation=dilation,
                                    bias=False,
-                                    use_hash=False,
                                    algo=algo))
        self.net = spconv.SparseSequential(*layers, )
        # self.grid = torch.full([3, *shape], -1, dtype=torch.int32).cuda()
@@ -349,16 +347,19 @@ def scatter_nd(indices, updates, shape):
 class TestSpConv(TestCase):
    def testSpConv3d(self):
        np.random.seed(484)
-        devices = ["cpu:0"]
+        devices = ["cuda:0"]
        shapes = [[19, 18, 17]]
        batchsizes = [1, 2]
-        in_channels = [64]
+        in_channels = [32]
        out_channels = [32, 48, 64]
        ksizes = [2, 3]
        strides = [1, 2, 3]
        paddings = [0, 1, 2]
        dilations = [1, 2, 3]
+        # strides = [1]
+        # paddings = [0]
+        # dilations = [1]
        for dev, shape, bs, IC, OC, k, s, p, d in params_grid(
                devices, shapes, batchsizes, in_channels, out_channels, ksizes,
@@ -367,7 +368,6 @@ class TestSpConv(TestCase):
                continue  # don't support this.
            device = torch.device(dev)
            num_points = [1000] * bs
            sparse_dict = generate_sparse_data(shape, num_points, IC)
            features = np.ascontiguousarray(sparse_dict["features"]).astype(
@@ -375,23 +375,36 @@ class TestSpConv(TestCase):
            indices = np.ascontiguousarray(
                sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
            features_dense = sparse_dict["features_dense"].astype(np.float32)
-            filters = np.random.uniform(0, 1, size=[k, k, k, IC,
+            if FILTER_HWIO:
-                                                    OC]).astype(np.float32)
+                filters = np.random.uniform(0, 1, size=[k, k, k, IC,
+                                                        OC]).astype(np.float32)
+            else:
+                filters = np.random.uniform(0, 1, size=[k, k, k, OC,
+                                                        IC]).astype(np.float32)
+            dtype = torch.float16
            indices_t = torch.from_numpy(indices).int().to(device)
-            features_t = torch.from_numpy(features).to(device)
+            features_t = torch.from_numpy(features).to(device).to(dtype)
            features_t.requires_grad = True
-            features_dense_t = torch.from_numpy(features_dense).to(device)
+            features_dense_t = torch.from_numpy(features_dense).to(device).to(dtype)
            features_dense_t.requires_grad = True
            net = SparseConv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
-                                        d).to(device)
+                                        d).to(device).to(dtype)
            net_ref = Conv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
-                                      d).to(device)
+                                      d).to(device).to(dtype)
-            filters_t = torch.from_numpy(filters).to(device)
+            filters_t = torch.from_numpy(filters).to(device).to(dtype)
-            net_ref.net[0].weight.data[:] = filters_t.permute(4, 3, 0, 1,
+            if FILTER_HWIO:
-                                                              2).contiguous()
+                net_ref.net[0].weight.data[:] = filters_t.permute(4, 3, 0, 1,
+                                                                2).contiguous()
+            else:
+                net_ref.net[0].weight.data[:] = filters_t.permute(3, 4, 0, 1,
+                                                                2).contiguous()
            net.net[0].weight.data[:] = filters_t
            out_ref = net_ref(features_dense_t)
            out = net(features_t, indices_t, bs).dense()
+            out_np = out.detach().cpu().numpy()
+            out_ref_np = out_ref.detach().cpu().numpy()
+            self.assertAllClose(out_np, out_ref_np, atol=1e-4)
            dout = np.random.uniform(-0.2, 0.2,
                                     out_ref.shape).astype(features.dtype)
            dout_t = torch.from_numpy(dout).to(device)
@@ -401,18 +414,21 @@ class TestSpConv(TestCase):
                                                               1).contiguous()
            din_sparse = gather_nd(din_dense, indices_t.long())
            din = features_t.grad.detach()
            din_np = din.cpu().numpy()
            din_sparse_np = din_sparse.cpu().numpy()
-            self.assertAllClose(din_np, din_sparse_np, atol=1e-4)
            for layer, layer_ref in zip(net.net, net_ref.net):
                dw = layer.weight.grad.detach().cpu().numpy()
                dw_ref = layer_ref.weight.grad.detach().cpu().numpy()
-                dw = dw.transpose(4, 3, 0, 1, 2)
+                if FILTER_HWIO:
+                    dw = dw.transpose(4, 3, 0, 1, 2)
+                else:
+                    dw = dw.transpose(3, 4, 0, 1, 2)
                self.assertAllClose(dw, dw_ref, atol=1e-4)
+            self.assertAllClose(din_np, din_sparse_np, atol=1e-4)
-            out_np = out.detach().cpu().numpy()
-            out_ref_np = out_ref.detach().cpu().numpy()
-            self.assertAllClose(out_np, out_ref_np, atol=1e-4)
    def testSpDeConv3d(self):
        np.random.seed(484)
@@ -426,6 +442,11 @@ class TestSpConv(TestCase):
        strides = [2, 3]
        paddings = [0, 1, 2]
        dilations = [1, 2, 3]
+        ksizes = [3]
+        strides = [1]
+        paddings = [0]
+        dilations = [1]
        for dev, shape, bs, IC, OC, k, s, p, d in params_grid(
                devices, shapes, batchsizes, in_channels, out_channels, ksizes,
@@ -442,8 +463,13 @@ class TestSpConv(TestCase):
            indices = np.ascontiguousarray(
                sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
            features_dense = sparse_dict["features_dense"].astype(np.float32)
-            filters = np.random.uniform(0, 1, size=[k, k, k, IC,
+            if FILTER_HWIO:
-                                                    OC]).astype(np.float32)
+                filters = np.random.uniform(0, 1, size=[k, k, k, IC,
+                                                        OC]).astype(np.float32)
+            else:
+                filters = np.random.uniform(0, 1, size=[k, k, k, OC,
+                                                        IC]).astype(np.float32)
            indices_t = torch.from_numpy(indices).int().to(device)
            features_t = torch.from_numpy(features).to(device)
            features_t.requires_grad = True
@@ -454,11 +480,20 @@ class TestSpConv(TestCase):
            net_ref = DeConv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
                                        d).to(device)
            filters_t = torch.from_numpy(filters).to(device)
-            net_ref.net[0].weight.data[:] = filters_t.permute(3, 4, 0, 1,
+            print(net_ref.net[0].weight.shape)
-                                                              2).contiguous()
+            if FILTER_HWIO:
+                net_ref.net[0].weight.data[:] = filters_t.permute(3, 4, 0, 1,
+                                                                2).contiguous()
+            else:
+                net_ref.net[0].weight.data[:] = filters_t.permute(4, 3, 0, 1,
+                                                                2).contiguous()
            net.net[0].weight.data[:] = filters_t
            out_ref = net_ref(features_dense_t)
            out = net(features_t, indices_t, bs).dense()
+            out_np = out.detach().cpu().numpy()
+            out_ref_np = out_ref.detach().cpu().numpy()
+            self.assertAllClose(out_np, out_ref_np, atol=1e-4)
            dout = np.random.uniform(-0.2, 0.2,
                                     out_ref.shape).astype(features.dtype)
            dout_t = torch.from_numpy(dout).to(device)
@@ -474,12 +509,12 @@ class TestSpConv(TestCase):
            for layer, layer_ref in zip(net.net, net_ref.net):
                dw = layer.weight.grad.detach().cpu().numpy()
                dw_ref = layer_ref.weight.grad.detach().cpu().numpy()
-                dw = dw.transpose(3, 4, 0, 1, 2)
+                if FILTER_HWIO:
+                    dw = dw.transpose(3, 4, 0, 1, 2)
+                else:
+                    dw = dw.transpose(4, 3, 0, 1, 2)
                self.assertAllClose(dw, dw_ref, atol=1e-4)
-            out_np = out.detach().cpu().numpy()
-            out_ref_np = out_ref.detach().cpu().numpy()
-            self.assertAllClose(out_np, out_ref_np, atol=1e-4)
    def testSpCpConv3d(self):
        np.random.seed(484)
@@ -551,12 +586,16 @@ class TestSpConv(TestCase):
        shapes = [[19, 18, 17]]
        batchsizes = [1, 2]
-        in_channels = [62]
+        in_channels = [64]
-        out_channels = [62]
+        out_channels = [64]
        ksizes = [2, 3]
        strides = [1, 2, 3]
        paddings = [0, 1]
        dilations = [1, 2, 3]
+        ksizes = [2]
+        strides = [2]
+        paddings = [0]
+        dilations = [1]
        for dev, shape, bs, IC, OC, k, s, p, d in params_grid(
                devices, shapes, batchsizes, in_channels, out_channels, ksizes,
@@ -565,6 +604,7 @@ class TestSpConv(TestCase):
                continue  # don't support this.
            device = torch.device(dev)
            num_points = [1000] * bs
            # when data contains negative, sparse maxpool is not equal to dense maxpool.
            sparse_dict = generate_sparse_data(shape,
                                               num_points,
@@ -576,8 +616,8 @@ class TestSpConv(TestCase):
            indices = np.ascontiguousarray(
                sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
            features_dense = sparse_dict["features_dense"].astype(np.float32)
-            filters = np.random.uniform(0, 1, size=[k, k, k, IC,
+            filters = np.random.uniform(0, 1, size=[k, k, k, OC,
-                                                    OC]).astype(np.float32)
+                                                    IC]).astype(np.float32)
            indices_t = torch.from_numpy(indices).int().to(device)
            features_t = torch.from_numpy(features).to(device)
            features_t.requires_grad = True
@@ -588,11 +628,15 @@ class TestSpConv(TestCase):
            out_ref = net_ref(features_dense_t)
            out = net(features_t, indices_t, bs)
            outids = out.indices
            outfeatures = out.features
            outids_dev = outids.float()
            out_dense = out.dense(channels_first=False)
            out = out_dense.permute(0, 4, 1, 2, 3).contiguous()
+            out_np = out.detach().cpu().numpy()
+            out_ref_np = out_ref.detach().cpu().numpy()
+            self.assertAllClose(out_np, out_ref_np, atol=1e-4)
            dout_sparse = np.random.uniform(
                -0.2, 0.2, outfeatures.shape).astype(features.dtype)
@@ -607,9 +651,6 @@ class TestSpConv(TestCase):
            din_sparse = gather_nd(din_dense, indices_t.long())
            din = features_t.grad.detach()
-            out_np = out.detach().cpu().numpy()
-            out_ref_np = out_ref.detach().cpu().numpy()
-            self.assertAllClose(out_np, out_ref_np, atol=1e-4)
            din_np = din.cpu().numpy()
            din_sparse_np = din_sparse.cpu().numpy()
            self.assertAllClose(din_np, din_sparse_np, atol=1e-4)
@@ -623,8 +664,8 @@ def main(algo=spconv.ConvAlgo.Native, dtype=torch.float32):
    shapes = [[400, 400, 15]]
    batchsizes = [2]
-    in_channels = [32]
+    in_channels = [19]
-    out_channels = [64]
+    out_channels = [17]
    ksizes = [(3, 3, 3)]
    strides = [1]
    paddings = [0]
@@ -752,8 +793,8 @@ def main_subm(algo, dtype=torch.float32):
 if __name__ == '__main__':
-    main_subm(algo=spconv.ConvAlgo.Native, dtype=torch.float32)
+    # main_subm(algo=spconv.ConvAlgo.SparseConvNet, dtype=torch.float32)
-    main_subm(algo=spconv.ConvAlgo.Native, dtype=torch.half)
+    # main(algo=spconv.ConvAlgo.SparseConvNet, dtype=torch.float32)
    # TestCase().assertAllClose(out_my, out_ref)
    # unittest.main()
-    # TestSpConv().testSpConv3d()
+    TestSpConv().testSpConv3d()
--- a/third_party/catch2/catch.hpp
+++ b/third_party/catch2/catch.hpp
--- a/pybind11 @ 3b1dbeba
+++ b/pybind11 @ 3b1dbeba
-Subproject commit 3b1dbebabc801c9cf6f0953a4c20b904d444f879
--- a/tools/README.md
+++ b/tools/README.md
+<!--
+ Copyright 2021 Yan Yan
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+     http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+## How to debug manylinux build
+```Bash
+docker run --rm -it -e PLAT=manylinux2014_x86_64 -v `pwd`:/io -v $HOME:/myhome scrin/manylinux2014-cuda:cu114-devel bash
+/io/tools/build-wheels.sh
+```
+## Windows C++ Tips
+* CUDA attributes such as ```__device__``` must be placed before the return type. If you see ```warning: __declspec attributes ignored```, it means ```__device__``` was ignored because it was placed after the return type, which then causes an error.
--- a/tools/build-wheels.sh
+++ b/tools/build-wheels.sh
+#!/bin/bash
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e -u -x
+# Repair a single wheel with auditwheel.
+#   $1 - path of the wheel to inspect
+#   $2 - directory that receives the repaired wheel
+# A wheel that `auditwheel show` rejects is reported and left untouched.
+# The target platform tag is taken from $PLAT.
+function repair_wheel {
+    wheel="$1"
+    outpath="$2"
+    if auditwheel show "$wheel"; then
+        auditwheel repair "$wheel" --plat "$PLAT" -w "$outpath"
+    else
+        echo "Skipping non-platform wheel $wheel"
+    fi
+}
+export SPCONV_DISABLE_JIT="1"
+export CUMM_CUDA_ARCH_LIST="all"
+# export SPCONV_PYTHON_LIST="3.7;3.8;3.9;3.10"
+# Compile wheels, we only support 3.6-3.10.
+# "/opt/python/cp36-cp36m/bin/pip" wheel /io/ --no-deps -w /io/wheelhouse_tmp
+for PYVER in ${SPCONV_PYTHON_LIST//;/ }
+do
+    PYVER2=`echo "$PYVER" | sed 's/\.//'`
+    PYVER_CP="cp$PYVER2-cp$PYVER2"
+    if [ "$PYVER2" = "36" ]; then
+        PYVER_CP="cp$PYVER2-cp${PYVER2}m"
+    fi
+    if [ "$PYVER2" = "37" ]; then
+        PYVER_CP="cp$PYVER2-cp${PYVER2}m"
+    fi
+    "/opt/python/$PYVER_CP/bin/pip" wheel /io/  -v --no-deps -w /io/wheelhouse_tmp
+done
+# Bundle external shared libraries into the wheels
+for whl in /io/wheelhouse_tmp/*.whl; do
+    repair_wheel "$whl" /io/dist
+done
+rm -rf /io/wheelhouse_tmp
\ No newline at end of file
--- a/tools/install_windows_cuda.ps1
+++ b/tools/install_windows_cuda.ps1
+## -------------------
+## Constants
+## -------------------
+# Dictionary of known CUDA versions and their download URLs, which do not follow a consistent pattern :(
+# Every entry uses HTTPS: the downloaded installer is executed later, so it should not be fetched over plain HTTP.
+$CUDA_KNOWN_URLS = @{
+    "10.2" = "https://developer.download.nvidia.com/compute/cuda/10.2/Prod/network_installers/cuda_10.2.89_win10_network.exe";
+    "11.0" = "https://developer.download.nvidia.com/compute/cuda/11.0.3/network_installers/cuda_11.0.3_win10_network.exe";
+    "11.1" = "https://developer.download.nvidia.com/compute/cuda/11.1.1/network_installers/cuda_11.1.1_win10_network.exe";
+    "11.2" = "https://developer.download.nvidia.com/compute/cuda/11.2.2/network_installers/cuda_11.2.2_win10_network.exe";
+    "11.3" = "https://developer.download.nvidia.com/compute/cuda/11.3.1/network_installers/cuda_11.3.1_win10_network.exe";
+    "11.4" = "https://developer.download.nvidia.com/compute/cuda/11.4.2/network_installers/cuda_11.4.2_win10_network.exe";
+}
+# Installer sub-packages to request. cuda_runtime.h is in nvcc for CUDA <= 10.2, but in cudart for CUDA >= 11.0.
+# @todo - make this list easier to vary per CUDA version.
+$CUDA_PACKAGES_IN = @(
+    "nvcc";
+    "visual_studio_integration";
+    "curand_dev";
+    "nvrtc_dev";
+    "cudart";
+)
+## -------------------
+## Select CUDA version
+## -------------------
+# Read the requested CUDA version from the `cuda` environment variable ($env:cuda).
+$CUDA_VERSION_FULL = $env:cuda
+# Make sure CUDA_VERSION_FULL is set and valid; otherwise print a message and exit with code 1.
+# Validate the version with a regex; its named groups (major, minor) are read back from $Matches below.
+$cuda_ver_matched = $CUDA_VERSION_FULL -match "^(?<major>[1-9][0-9]*)\.(?<minor>[0-9]+)$"
+if(-not $cuda_ver_matched){
+    Write-Output "Invalid CUDA version specified, <major>.<minor> required. '$CUDA_VERSION_FULL'."
+    exit 1
+}
+$CUDA_MAJOR=$Matches.major
+$CUDA_MINOR=$Matches.minor
+## ------------------------------------------------
+## Select CUDA packages to install from environment
+## ------------------------------------------------
+$CUDA_PACKAGES = ""
+# For CUDA >= 11 cudart is a required package (the check below is disabled; cudart is listed in $CUDA_PACKAGES_IN directly).
+# if([version]$CUDA_VERSION_FULL -ge [version]"11.0") {
+#     if(-not $CUDA_PACKAGES_IN -contains "cudart") {
+#         $CUDA_PACKAGES_IN += 'cudart'
+#     }
+# }
+Foreach ($package in $CUDA_PACKAGES_IN) {
+    # The package is called "compiler" before CUDA 9.1 and "nvcc" from 9.1 on; map to the name matching the requested version.
+    if($package -eq "nvcc" -and [version]$CUDA_VERSION_FULL -lt [version]"9.1"){
+        $package="compiler"
+    } elseif($package -eq "compiler" -and [version]$CUDA_VERSION_FULL -ge [version]"9.1") {
+        $package="nvcc"
+    }
+    $CUDA_PACKAGES += " $($package)_$($CUDA_MAJOR).$($CUDA_MINOR)"
+}
+echo "$($CUDA_PACKAGES)"
+## -----------------
+## Prepare download
+## -----------------
+# Select the download link if known, otherwise have a guess.
+$CUDA_REPO_PKG_REMOTE=""
+if($CUDA_KNOWN_URLS.containsKey($CUDA_VERSION_FULL)){
+    $CUDA_REPO_PKG_REMOTE=$CUDA_KNOWN_URLS[$CUDA_VERSION_FULL]
+} else{
+    # Guess what the url is given the most recent pattern (at the time of writing, 10.1)
+    # $(...) is required here: "${$CUDA_VERSION_FULL}" names a variable literally called
+    # '$CUDA_VERSION_FULL', which does not exist, so the version was missing from the note.
+    Write-Output "note: URL for CUDA $($CUDA_VERSION_FULL) not known, estimating."
+    # Use HTTPS because the downloaded installer is executed later.
+    $CUDA_REPO_PKG_REMOTE="https://developer.download.nvidia.com/compute/cuda/$($CUDA_MAJOR).$($CUDA_MINOR)/Prod/network_installers/cuda_$($CUDA_VERSION_FULL)_win10_network.exe"
+}
+# File name the installer is saved under (relative to the current directory).
+$CUDA_REPO_PKG_LOCAL="cuda_$($CUDA_VERSION_FULL)_win10_network.exe"
+## ------------
+## Install CUDA
+## ------------
+# Download the CUDA network installer to $CUDA_REPO_PKG_LOCAL, then confirm the file exists; exit with code 1 if it does not.
+Write-Output "Downloading CUDA Network Installer for $($CUDA_VERSION_FULL) from: $($CUDA_REPO_PKG_REMOTE)"
+Invoke-WebRequest $CUDA_REPO_PKG_REMOTE -OutFile $CUDA_REPO_PKG_LOCAL | Out-Null
+if(Test-Path -Path $CUDA_REPO_PKG_LOCAL){
+    Write-Output "Downloading Complete"
+} else {
+    Write-Output "Error: Failed to download $($CUDA_REPO_PKG_LOCAL) from $($CUDA_REPO_PKG_REMOTE)"
+    exit 1
+}
+# Invoke silent install of CUDA (via network installer)
+Write-Output "Installing CUDA $($CUDA_VERSION_FULL). Subpackages $($CUDA_PACKAGES)"
+# -PassThru returns the process object so the installer's own exit code can be inspected.
+# Start-Process does not populate $LASTEXITCODE, and $? only reports whether the process could be started.
+$cuda_install = Start-Process -Wait -PassThru -FilePath .\"$($CUDA_REPO_PKG_LOCAL)" -ArgumentList "-s $($CUDA_PACKAGES)"
+# Check the return status of the CUDA installer (a missing process object means it never started).
+if (-not $cuda_install -or $cuda_install.ExitCode -ne 0) {
+    Write-Output "Error: CUDA installer reported error. $($cuda_install.ExitCode)"
+    exit 1
+}
+# Build the CUDA install path and the name of the version-specific variable (CUDA_PATH_V<major>_<minor>), to be forwarded in the action.
+$CUDA_PATH = "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v$($CUDA_MAJOR).$($CUDA_MINOR)"
+$CUDA_PATH_VX_Y = "CUDA_PATH_V$($CUDA_MAJOR)_$($CUDA_MINOR)" 
+# Set environment variables in this session. Note: $env:CUDA_PATH_VX_Y holds the versioned variable's name, not a path.
+$env:CUDA_PATH = "$($CUDA_PATH)"
+$env:CUDA_PATH_VX_Y = "$($CUDA_PATH_VX_Y)"
+Write-Output "CUDA_PATH $($CUDA_PATH)"
+Write-Output "CUDA_PATH_VX_Y $($CUDA_PATH_VX_Y)"
+# PATH needs updating elsewhere; anything set in this session won't persist.
+# When the GITHUB_ACTIONS environment variable exists, the block below appends:
+#   - CUDA_PATH=<path> and <versioned variable name>=<path> to the file named by $env:GITHUB_ENV,
+#   - <path>/bin to the file named by $env:GITHUB_PATH.
+if (Test-Path "env:GITHUB_ACTIONS") { 
+    # Write the values for subsequent steps, using $env:CUDA_PATH; $env:CUDA_PATH_VX_Y supplies the versioned variable's name.
+    echo "Adding CUDA to CUDA_PATH, CUDA_PATH_X_Y and PATH"
+    echo "CUDA_PATH=$env:CUDA_PATH" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
+    echo "$env:CUDA_PATH_VX_Y=$env:CUDA_PATH" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
+    echo "$env:CUDA_PATH/bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+}