working on tensor core test

01ed382c · yan.yan · 3517290c · 3517290c · 3517290c · 3517290c
Commit 01ed382c authored Oct 18, 2021 by yan.yan
20 changed files
--- a/include/cuhash/hash_table.h
+++ b/include/cuhash/hash_table.h
-// -------------------------------------------------------------
-// cuDPP -- CUDA Data Parallel Primitives library
-// -------------------------------------------------------------
-// $Revision:$
-// $Date:$
-// -------------------------------------------------------------
-// This source code is distributed under the terms of license.txt in
-// the root directory of this source distribution.
-// -------------------------------------------------------------
-
-/**
- * @file hash_table.h
- *
- * @brief Header for a basic hash table that stores one value per key.
- */
-
-#ifndef CUDAHT__CUCKOO__SRC__LIBRARY__HASH_TABLE__H
-#define CUDAHT__CUCKOO__SRC__LIBRARY__HASH_TABLE__H
-
-#include "definitions.h"
-#include "hash_functions.h"
-
-#include <cstdio>
-
-/** \addtogroup cudpp_app
- * @{
- */
-
-/** \addtogroup cudpp_hash_data_structures
- * @{
- */
-
-/* --------------------------------------------------------------------------
-   Doxygen definitions.
-   -------------------------------------------------------------------------- */
-/*! @namespace CudaHT
- *  @brief Encapsulates the hash table library.
- */
-
-/*! @namespace CuckooHashing
- *  @brief Encapsulates the cuckoo hash table that uses stashes.
- */
-
-/* -------------------------------------------------------------------------
-   Hash table code.
-   ------------------------------------------------------------------------- */
-namespace cuhash {
-
-//! Compute how many thread blocks are required for the given number of threads.
-dim3 ComputeGridDim(unsigned threads);
-
-//! Compute how long an eviction chain is allowed to become for a given input
-//! size.
-/*! \param[in] num_keys       Number of keys in the input.
- *  \param[in] table_size     Number of slots in the hash table.
- *  \param[in] num_functions  Number of hash functions being used.
- *  \returns The number of iterations that should be allowed.
- *
- *  The latter two parameters are only needed when using an empirical
- *  formula for computing the chain length.
- */
-unsigned ComputeMaxIterations(const unsigned num_keys,
-                              const unsigned table_size,
-                              const unsigned num_functions);
-
-//! Basic hash table that stores one value for each key.
-/*! The input consists of two unsigned arrays of keys and values.
- *  None of the keys are expected to be repeated.
- *
- *  @todo Templatize the interface without forcing the header file to
- *  have CUDA calls.
- *  @ingroup cudpp_app
- */
-class HashTable {
-public:
-  HashTable();
-
-  virtual ~HashTable() { Release(); }
-
-  //! Initialize the hash table's memory. Must be called before \ref
-  //! Build() and after the random number generator has been seeded.
-  /*! @param[in] max_input_size   Largest expected number of items in the input.
-   *  @param[in] space_usage Size of the hash table relative to the
-   *                         input. Bigger tables are faster to build
-   *                         and retrieve from.
-   *  @param[in] num_functions Number of hash functions to use. May be
-   *                           2-5. More hash functions make it easier
-   *                           to build the table, but increase
-   *                           retrieval times.
-   *  @returns Whether the hash table was initialized successfully (true)
-   *           or not (false).
-   *
-   *  The minimum space usage is dependent on the number of functions
-   *  being used; for two through five functions, the minimum space
-   *  usage is 2.1, 1.1, 1.03, and 1.02 respectively.
-   */
-  virtual bool Initialize(const unsigned max_input_size,
-                          const float space_usage = 1.25,
-                          const unsigned num_functions = 4);
-
-  //! Free all memory.
-  virtual void Release();
-
-  //! Build the hash table.
-  /*! @param[in] input_size   Number of key-value pairs being inserted.
-   *  @param[in] d_keys       Device memory array containing all of the input
-   *                          keys.
-   *  @param[in] d_vals       Device memory array containing the keys' values.
-   *  @returns Whether the hash table was built successfully (true) or
-   *           not (false).
-   *
-   *  Several attempts are allowed to build the hash table in case of failure.
-   *  The input keys are expected to be completely unique.
-   *  To reduce the chance of a failure, increase the space usage or number of
-   *  functions.
-   *  Keys are not allowed to be equal to cuhash::kKeyEmpty.
-   */
-  virtual bool Build(const unsigned input_size, const unsigned *d_keys,
-                     const unsigned *d_vals);
-
-  //! Query the hash table.
-  /*! @param[in] n_queries        Number of keys in the query set.
-   *  @param[in] d_query_keys     Device memory array containing all of
-   *                              the query keys.
-   *  @param[in] d_query_results  Values for the query keys.
-   *
-   *  kNotFound is returned for any query key that failed to be found
-   *  in the table.
-   */
-  virtual void Retrieve(const unsigned n_queries, const unsigned *d_query_keys,
-                        unsigned *d_query_results);
-
-  //! @name Accessors
-  /// @brief Mainly needed to use the __device__ CudaHT::retrieve()
-  /// function directly.
-  /// @{
-
-  //! Returns how many slots the hash table has.
-  inline unsigned get_table_size() const { return table_size_; }
-
-  //! Returns how many items are stored in the stash.
-  inline unsigned get_stash_count() const { return stash_count_; }
-
-  //! Returns the constants used by the stash.
-  inline uint2 get_stash_constants() const { return stash_constants_; }
-
-  //! Returns the hash table contents.
-  inline const Entry *get_contents() const { return d_contents_; }
-
-  //! Returns the number of hash functions being used.
-  inline unsigned get_num_hash_functions() const { return num_hash_functions_; }
-
-  //! When using two hash functions, returns the constants.
-  inline Functions<2> get_constants_2() const { return constants_2_; }
-
-  //! When using three hash functions, returns the constants.
-  inline Functions<3> get_constants_3() const { return constants_3_; }
-
-  //! When using four hash functions, returns the constants.
-  inline Functions<4> get_constants_4() const { return constants_4_; }
-
-  //! When using five hash functions, returns the constants.
-  inline Functions<5> get_constants_5() const { return constants_5_; }
-
-  /// @}
-  inline Entry *data() { return d_contents_; }
-  inline const Entry *data() const { return d_contents_; }
-
-protected:
-  unsigned table_size_;         //!< Size of the hash table.
-  unsigned num_hash_functions_; //!< Number of hash functions being used.
-  Entry *d_contents_; //!< Device memory: The hash table contents.  The stash is
-                      //!< stored at the end.
-  unsigned stash_count_;  //!< Number of key-value pairs currently stored.
-  uint2 stash_constants_; //!< Hash function constants for the stash.
-
-  Functions<2> constants_2_; //!< Constants for a set of two hash functions.
-  Functions<3> constants_3_; //!< Constants for a set of three hash functions.
-  Functions<4> constants_4_; //!< Constants for a set of four hash functions.
-  Functions<5> constants_5_; //!< Constants for a set of five hash functions.
-
-  unsigned *d_failures_; //!< Device memory: General use error flag.
-};
-
-/*! @name Internal
- *  @{
- */
-namespace CUDAWrapper {
-//! Fills a 64-bit array with a particular value.
-void ClearTable(const unsigned slots_in_table, const Entry fill_value,
-                Entry *d_array);
-
-//! Calls the Cuckoo Hash construction kernel.
-void CallCuckooHash(const unsigned n_entries, const unsigned num_hash_functions,
-                    const unsigned *d_keys, const unsigned *d_values,
-                    const unsigned table_size, const Functions<2> constants_2,
-                    const Functions<3> constants_3,
-                    const Functions<4> constants_4,
-                    const Functions<5> constants_5,
-                    const unsigned max_iteration_attempts, Entry *d_contents,
-                    uint2 stash_constants, unsigned *d_stash_count,
-                    unsigned *d_failures, unsigned *d_iterations_taken);
-
-//! Calls the kernel that performs retrievals.
-void CallHashRetrieve(const unsigned n_queries,
-                      const unsigned num_hash_functions,
-                      const unsigned *keys_in, const unsigned table_size,
-                      const Entry *table, const Functions<2> constants_2,
-                      const Functions<3> constants_3,
-                      const Functions<4> constants_4,
-                      const Functions<5> constants_5,
-                      const uint2 stash_constants, const unsigned stash_count,
-                      unsigned *values_out);
-}; // namespace CUDAWrapper
-/// @}
-
-}; // namespace cuhash
-
-/** @} */ // end hash table data structures
-/** @} */ // end cudpp_app
-
-#endif
-
-// Leave this at the end of the file
-// Local Variables:
-// mode:c++
-// c-file-style: "NVIDIA"
-// End:
--- a/include/paramsgrid.h
+++ b/include/paramsgrid.h
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// This file is used for c++ unit test, but pytorch jit ops don't support c++
-// debug build.
-
-#ifndef PARAMS_GRID_H_
-#define PARAMS_GRID_H_
-#include <tuple>
-#include <vector>
-
-namespace detail {
-template <class T> int getTotalSize(std::vector<T> arg) { return arg.size(); }
-
-template <class T, class... TArgs>
-int getTotalSize(std::vector<T> arg, std::vector<TArgs>... args) {
-  return arg.size() * getTotalSize(args...);
-}
-template <typename T> int getSize(std::vector<T> arg) { return arg.size(); }
-
-template <int Idx, class TT, class T>
-void assigner(TT &src, std::vector<int> counter, std::vector<T> &arg) {
-  std::get<Idx>(src) = arg[counter[Idx]];
-}
-
-template <int Idx, class TT, class T, class... TArgs>
-void assigner(TT &src, std::vector<int> counter, std::vector<T> &arg,
-              std::vector<TArgs> &... args) {
-  std::get<Idx>(src) = arg[counter[Idx]];
-  assigner<Idx + 1>(src, counter, args...);
-}
-} // namespace detail
-template <class... TArgs>
-std::vector<std::tuple<TArgs...>> paramsGrid(std::vector<TArgs>... args) {
-  int length = detail::getTotalSize(args...);
-  std::vector<int> sizes = {detail::getSize(args)...};
-  int size = sizes.size();
-
-  std::vector<std::tuple<TArgs...>> params(length);
-  std::vector<int> counter(size);
-  for (int i = 0; i < length; ++i) {
-    detail::assigner<0>(params[i], counter, args...);
-    counter[size - 1] += 1;
-    for (int c = size - 1; c >= 0; --c) {
-      if (counter[c] == sizes[c] && c > 0) {
-        counter[c - 1] += 1;
-        counter[c] = 0;
-      }
-    }
-  }
-  return params;
-}
-
-#endif
\ No newline at end of file
--- a/include/spconv/box_iou.h
+++ b/include/spconv/box_iou.h
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef BOX_IOU_H
-#define BOX_IOU_H
-
-#include <pybind11/pybind11.h>
-// must include pybind11/eigen.h if using eigen matrix as arguments.
-#include <algorithm>
-#include <boost/geometry.hpp>
-#include <pybind11/numpy.h>
-
-namespace spconv {
-// #include "voxelnet/core/cc/pybind11_helper.h"
-namespace py = pybind11;
-using namespace pybind11::literals;
-template <typename DType, typename ShapeContainer>
-inline py::array_t<DType> constant(ShapeContainer shape, DType value) {
-  // create ROWMAJOR array.
-  py::array_t<DType> array(shape);
-  std::fill(array.mutable_data(), array.mutable_data() + array.size(), value);
-  return array;
-}
-
-template <typename DType>
-inline py::array_t<DType> zeros(std::vector<long int> shape) {
-  return constant<DType, std::vector<long int>>(shape, 0);
-}
-
-template <typename DType>
-py::array_t<DType>
-rbbox_iou(py::array_t<DType> box_corners, py::array_t<DType> qbox_corners,
-          py::array_t<DType> standup_iou, DType standup_thresh) {
-  namespace bg = boost::geometry;
-  typedef bg::model::point<DType, 2, bg::cs::cartesian> point_t;
-  typedef bg::model::polygon<point_t> polygon_t;
-  polygon_t poly, qpoly;
-  std::vector<polygon_t> poly_inter, poly_union;
-  DType inter_area, union_area;
-  auto box_corners_r = box_corners.template unchecked<3>();
-  auto qbox_corners_r = qbox_corners.template unchecked<3>();
-  auto standup_iou_r = standup_iou.template unchecked<2>();
-  auto N = box_corners_r.shape(0);
-  auto K = qbox_corners_r.shape(0);
-  py::array_t<DType> overlaps = zeros<DType>({int(N), int(K)});
-  auto overlaps_rw = overlaps.template mutable_unchecked<2>();
-  if (N == 0 || K == 0) {
-    return overlaps;
-  }
-  for (int k = 0; k < K; ++k) {
-    for (int n = 0; n < N; ++n) {
-      if (standup_iou_r(n, k) <= standup_thresh)
-        continue;
-      bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1)));
-      bg::append(poly, point_t(box_corners_r(n, 1, 0), box_corners_r(n, 1, 1)));
-      bg::append(poly, point_t(box_corners_r(n, 2, 0), box_corners_r(n, 2, 1)));
-      bg::append(poly, point_t(box_corners_r(n, 3, 0), box_corners_r(n, 3, 1)));
-      bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1)));
-      bg::append(qpoly,
-                 point_t(qbox_corners_r(k, 0, 0), qbox_corners_r(k, 0, 1)));
-      bg::append(qpoly,
-                 point_t(qbox_corners_r(k, 1, 0), qbox_corners_r(k, 1, 1)));
-      bg::append(qpoly,
-                 point_t(qbox_corners_r(k, 2, 0), qbox_corners_r(k, 2, 1)));
-      bg::append(qpoly,
-                 point_t(qbox_corners_r(k, 3, 0), qbox_corners_r(k, 3, 1)));
-      bg::append(qpoly,
-                 point_t(qbox_corners_r(k, 0, 0), qbox_corners_r(k, 0, 1)));
-
-      bg::intersection(poly, qpoly, poly_inter);
-
-      if (!poly_inter.empty()) {
-        inter_area = bg::area(poly_inter.front());
-        bg::union_(poly, qpoly, poly_union);
-        if (!poly_union.empty()) {
-          union_area = bg::area(poly_union.front());
-          overlaps_rw(n, k) = inter_area / union_area;
-        }
-        poly_union.clear();
-      }
-      poly.clear();
-      qpoly.clear();
-      poly_inter.clear();
-    }
-  }
-  return overlaps;
-}
-
-template <typename DType>
-py::array_t<DType> rbbox_intersection(py::array_t<DType> box_corners,
-                                      py::array_t<DType> qbox_corners,
-                                      py::array_t<DType> standup_iou,
-                                      DType standup_thresh) {
-  namespace bg = boost::geometry;
-  typedef bg::model::point<DType, 2, bg::cs::cartesian> point_t;
-  typedef bg::model::polygon<point_t> polygon_t;
-  polygon_t poly, qpoly;
-  std::vector<polygon_t> poly_inter, poly_union;
-  DType inter_area, union_area;
-  auto box_corners_r = box_corners.template unchecked<3>();
-  auto qbox_corners_r = qbox_corners.template unchecked<3>();
-  auto standup_iou_r = standup_iou.template unchecked<2>();
-  auto N = box_corners_r.shape(0);
-  auto K = qbox_corners_r.shape(0);
-  py::array_t<DType> overlaps = zeros<DType>({int(N), int(K)});
-  auto overlaps_rw = overlaps.template mutable_unchecked<2>();
-  if (N == 0 || K == 0) {
-    return overlaps;
-  }
-  for (int k = 0; k < K; ++k) {
-    for (int n = 0; n < N; ++n) {
-      if (standup_iou_r(n, k) <= standup_thresh)
-        continue;
-      bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1)));
-      bg::append(poly, point_t(box_corners_r(n, 1, 0), box_corners_r(n, 1, 1)));
-      bg::append(poly, point_t(box_corners_r(n, 2, 0), box_corners_r(n, 2, 1)));
-      bg::append(poly, point_t(box_corners_r(n, 3, 0), box_corners_r(n, 3, 1)));
-      bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1)));
-      bg::append(qpoly,
-                 point_t(qbox_corners_r(k, 0, 0), qbox_corners_r(k, 0, 1)));
-      bg::append(qpoly,
-                 point_t(qbox_corners_r(k, 1, 0), qbox_corners_r(k, 1, 1)));
-      bg::append(qpoly,
-                 point_t(qbox_corners_r(k, 2, 0), qbox_corners_r(k, 2, 1)));
-      bg::append(qpoly,
-                 point_t(qbox_corners_r(k, 3, 0), qbox_corners_r(k, 3, 1)));
-      bg::append(qpoly,
-                 point_t(qbox_corners_r(k, 0, 0), qbox_corners_r(k, 0, 1)));
-
-      bg::intersection(poly, qpoly, poly_inter);
-
-      if (!poly_inter.empty()) {
-        inter_area = bg::area(poly_inter.front());
-        overlaps_rw(n, k) = inter_area;
-      }
-      poly.clear();
-      qpoly.clear();
-      poly_inter.clear();
-    }
-  }
-  return overlaps;
-}
-
-} // namespace spconv
-#endif
\ No newline at end of file
--- a/include/spconv/cublas_gemm.h
+++ b/include/spconv/cublas_gemm.h
-#pragma once
-#include <cublas_v2.h>
-#include <tensorview/tensorview.h>
-
-namespace spconv {
-
-template <class T>
-cublasStatus_t cublasTgemm(cublasHandle_t handle, cublasOperation_t transa,
-                           cublasOperation_t transb, int m, int n, int k,
-                           const T *alpha, const T *A, int lda, const T *B,
-                           int ldb, const T *beta, T *C, int ldc);
-
-template <class T>
-cublasStatus_t cublasTgemmRow(cublasHandle_t handle, cublasOperation_t transa,
-                              cublasOperation_t transb, int m, int n, int k,
-                              const T *alpha, const T *A, int lda, const T *B,
-                              int ldb, const T *beta, T *C, int ldc) {
-  return cublasTgemm<T>(handle, transb, transa, n, m, k, alpha, B, ldb, A, lda,
-                        beta, C, ldc);
-}
-
-template <class T> inline T constant_scalar(float data) { return T(data); }
-
-template <class T>
-cublasStatus_t gemm(cublasHandle_t handle, bool transa, bool transb,
-                    const tv::TensorView<T> A, const tv::TensorView<T> B,
-                    tv::TensorView<T> C) {
-  TV_ASSERT_RT_ERR(A.ndim() == 2, "error");
-  TV_ASSERT_RT_ERR(B.ndim() == 2, "error");
-  auto transa_cublas = transa ? CUBLAS_OP_T : CUBLAS_OP_N;
-  auto transb_cublas = transb ? CUBLAS_OP_T : CUBLAS_OP_N;
-  int m = transa ? A.dim(1) : A.dim(0);
-  int n = transb ? B.dim(0) : B.dim(1);
-  int ka = transa ? A.dim(0) : A.dim(1);
-  int kb = transb ? B.dim(1) : B.dim(0);
-  int lda = transa ? m : ka;
-  int ldb = transb ? ka : n;
-  int ldc = n;
-  TV_ASSERT_RT_ERR(ka == kb, "error");
-  T alpha = constant_scalar<T>(1);
-  T beta = constant_scalar<T>(0);
-  return cublasTgemmRow<T>(handle, transa_cublas, transb_cublas, m, n, ka,
-                           &alpha, A.data(), lda, B.data(), ldb, &beta,
-                           C.data(), ldc);
-}
-
-} // namespace spconv
--- a/include/spconv/fused_conv.cu.h
+++ b/include/spconv/fused_conv.cu.h
--- a/include/spconv/fused_conv.h
+++ b/include/spconv/fused_conv.h
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-#include <cuda_runtime_api.h>
-#include <tensorview/tensor.h>
-#include <torch/script.h>
-namespace spconv {
-
-enum FusedConvAlgo { kFSparseConvNet, kFMinkowskiEngine };
-using all_fused_conv_algos_t =
-    tv::mp_list_c<int, kFSparseConvNet, kFMinkowskiEngine>;
-
-void fused_conv_cuda(torch::Tensor output, torch::Tensor features,
-                     torch::Tensor filters, torch::Tensor indicesIn,
-                     torch::Tensor indicesOut, int nHot);
-
-void fused_conv_backward_cuda(torch::Tensor features, torch::Tensor din,
-                              torch::Tensor dout, torch::Tensor filters,
-                              torch::Tensor dfilters, torch::Tensor indicesIn,
-                              torch::Tensor indicesOut, int nHot);
-
-void fused_conv_cuda_minkowski(torch::Tensor output, torch::Tensor features,
-                               torch::Tensor filters, torch::Tensor indicesIn,
-                               torch::Tensor indicesOut, int nHot);
-void fused_conv_backward_cuda_minkowski(torch::Tensor features,
-                                        torch::Tensor din, torch::Tensor dout,
-                                        torch::Tensor filters,
-                                        torch::Tensor dfilters,
-                                        torch::Tensor indicesIn,
-                                        torch::Tensor indicesOut, int nHot);
-
-template <int Algo> struct FusedConvDispatch;
-
-template <> struct FusedConvDispatch<kFSparseConvNet> {
-  constexpr static auto *fwd = fused_conv_cuda;
-  constexpr static auto *bwd = fused_conv_backward_cuda;
-};
-
-template <> struct FusedConvDispatch<kFMinkowskiEngine> {
-  constexpr static auto *fwd = fused_conv_cuda_minkowski;
-  constexpr static auto *bwd = fused_conv_backward_cuda_minkowski;
-};
-
-} // namespace spconv
--- a/include/spconv/fused_spconv_ops.h
+++ b/include/spconv/fused_spconv_ops.h
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef FUSED_SPARSE_CONV_OP_H_
-#define FUSED_SPARSE_CONV_OP_H_
-
-#include <spconv/indice.h>
-#include <spconv/reordering.h>
-#include <tensorview/torch_utils.h>
-#include <torch/script.h>
-#include <utility/timer.h>
-
-namespace spconv {
-// torch.jit's doc says only support int64, so we need to convert to int32.
-
-torch::Tensor
-fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor filters,
-                         torch::Tensor bias, torch::Tensor indicePairs,
-                         torch::Tensor indiceNum, int64_t numActOut,
-                         int64_t _inverse, int64_t _subM) {
-  bool subM = _subM != 0;
-  bool inverse = _inverse != 0;
-  auto device = features.device().type();
-  auto ndim = filters.dim() - 2;
-  auto kernelVolume = indicePairs.size(0);
-  auto numInPlanes = features.size(1);
-  auto numOutPlanes = filters.size(ndim + 1);
-  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
-  auto indicePairMaxSizeIter =
-      std::max_element(indicePairNumCpu.data_ptr<int>(),
-                       indicePairNumCpu.data_ptr<int>() + kernelVolume);
-  int indicePairMaxOffset =
-      indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
-  int indicePairMaxSize = *indicePairMaxSizeIter;
-
-  /*if (_subM){
-    std::vector<int> indicePairNumVec(indicePairNumCpu.data_ptr<int>(),
-  indicePairNumCpu.data_ptr<int>() + kernelVolume);
-    indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
-
-    auto indicePairVecMaxSizeIter = std::max_element(
-        indicePairNumVec.begin(), indicePairNumVec.end());
-    indicePairMaxSize = *indicePairVecMaxSizeIter;
-  }*/
-
-  auto options =
-      torch::TensorOptions().dtype(features.dtype()).device(features.device());
-  // auto indicePairOptions =
-  //     torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device());
-
-  torch::Tensor output =
-      torch::zeros({numActOut, numOutPlanes}, options).copy_(bias);
-  torch::Tensor inputBuffer =
-      torch::zeros({indicePairMaxSize, numInPlanes}, options);
-  torch::Tensor outputBuffer =
-      torch::zeros({indicePairMaxSize, numOutPlanes}, options);
-  filters = filters.view({-1, numInPlanes, numOutPlanes});
-  if (subM) { // the center index of subm conv don't need gather and scatter
-              // add.
-    torch::mm_out(output, features, filters[indicePairMaxOffset]);
-  }
-  double totalGatherTime = 0;
-  double totalGEMMTime = 0;
-  double totalSAddTime = 0;
-  for (int i = 0; i < kernelVolume; ++i) {
-    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
-    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
-      continue;
-    }
-    // auto timer = spconv::CudaContextTimer<>();
-    auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr(),
-                                             {nHot, numOutPlanes}, options);
-    auto inputBufferBlob =
-        torch::from_blob(inputBuffer.data_ptr(), {nHot, numInPlanes}, options);
-
-    if (device == torch::kCPU) {
-      sparse_gather_cpu(inputBuffer, features, indicePairs[i][inverse], nHot);
-    }
-#ifdef TV_CUDA
-    else if (device == torch::kCUDA) {
-      sparse_gather_cuda(inputBuffer, features, indicePairs[i][inverse], nHot);
-    }
-#endif
-    else {
-      TV_ASSERT_INVALID_ARG(false, "unknown device type");
-    }
-
-    // totalGatherTime += timer.report() / 1000.0;
-    torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
-    // totalGEMMTime += timer.report() / 1000.0;
-
-    if (device == torch::kCPU) {
-      sparse_scatter_add_cpu(outputBuffer, output, indicePairs[i][!inverse],
-                             nHot);
-    }
-#ifdef TV_CUDA
-    else if (device == torch::kCUDA) {
-      sparse_scatter_add_cuda(outputBuffer, output, indicePairs[i][!inverse],
-                              nHot);
-    }
-#endif
-    else {
-      TV_ASSERT_INVALID_ARG(false, "unknown device type");
-    }
-
-    // totalSAddTime += timer.report() / 1000.0;
-  }
-  // std::cout << "gather time " << totalGatherTime << std::endl;
-  // std::cout << "gemm time " << totalGEMMTime << std::endl;
-  // std::cout << "scatteradd time " << totalSAddTime << std::endl;
-  return output;
-}
-} // namespace spconv
-
-#endif
\ No newline at end of file
--- a/include/spconv/geometry.h
+++ b/include/spconv/geometry.h
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef SPCONV_GEOMETRY_H_
-#define SPCONV_GEOMETRY_H_
-
-#include <iostream>
-#include <limits>
-#include <tensorview/tensorview.h>
-#include <tsl/robin_map.h>
-#include <unordered_map>
-namespace spconv {
-
-namespace detail {
-
-template <typename T> struct ToUnsigned;
-
-template <> struct ToUnsigned<int> { using type = uint32_t; };
-
-template <> struct ToUnsigned<long> { using type = uint64_t; };
-
-template <typename T> struct FNVInternal;
-template <> struct FNVInternal<uint32_t> {
-  constexpr static uint32_t defaultOffsetBasis = 0x811C9DC5;
-  constexpr static uint32_t prime = 0x01000193;
-};
-
-template <> struct FNVInternal<uint64_t> {
-  constexpr static uint64_t defaultOffsetBasis = 0xcbf29ce484222325;
-  constexpr static uint64_t prime = 0x100000001b3;
-};
-
-} // namespace detail
-template <typename T>
-using to_unsigned_t = typename detail::ToUnsigned<std::remove_const_t<T>>::type;
-
-template <typename T> struct FNV1a : detail::FNVInternal<T> {
-  std::size_t operator()(const T *data, std::size_t size) {
-    to_unsigned_t<T> hash = detail::FNVInternal<T>::defaultOffsetBasis;
-    for (std::size_t i = 0; i < size; ++i) {
-      hash *= detail::FNVInternal<T>::prime;
-      hash ^= static_cast<to_unsigned_t<T>>(data[i]);
-    }
-    return hash;
-  }
-};
-
-template <typename Index, unsigned NDim>
-TV_HOST_DEVICE Index getValidOutPos(const Index *input_pos,
-                                    const Index *kernelSize,
-                                    const Index *stride, const Index *padding,
-                                    const Index *dilation,
-                                    const Index *outSpatialShape, Index *out) {
-  Index lowers[NDim];
-  Index uppers[NDim];
-  Index counter[NDim];
-  Index counterSize[NDim];
-  Index pointCounter = 0;
-  Index val;
-  Index numPoints = 1;
-  Index m, offset;
-  bool valid = false;
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    lowers[i] = (input_pos[i] - (kernelSize[i] - 1) * dilation[i] - 1 +
-                 stride[i] + padding[i]) /
-                stride[i];
-    uppers[i] = (input_pos[i] + padding[i]) / stride[i];
-  }
-
-#pragma unroll
-  for (unsigned i = 0; i < NDim; ++i) {
-    counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1);
-    numPoints *= counterSize[i];
-  }
-
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    counter[i] = 0;
-  }
-  for (int i = 0; i < numPoints; ++i) {
-    valid = true;
-    m = 1;
-    offset = 0;
-#pragma unroll
-    for (int j = NDim - 1; j >= 0; --j) {
-      val = uppers[j] - counter[j] * dilation[j];
-      out[pointCounter * (NDim + 1) + j] = val;
-      if (val < 0 || (val > outSpatialShape[j] - 1)) {
-        valid = false;
-        // break;
-      }
-      offset += m * (input_pos[j] - val * stride[j] + padding[j]) / dilation[j];
-      m *= kernelSize[j];
-    }
-
-    out[pointCounter * (NDim + 1) + NDim] = offset;
-    if (valid)
-      ++pointCounter;
-    counter[NDim - 1] += 1;
-#pragma unroll
-    for (int c = NDim - 1; c >= 0; --c) {
-      if (counter[c] == counterSize[c] && c > 0) {
-        counter[c - 1] += 1;
-        counter[c] = 0;
-      }
-    }
-  }
-  return pointCounter;
-}
-
-// Enumerates, for a transposed convolution, every output coordinate that
-// one input coordinate can contribute to.
-//
-// Along each axis the candidates start at input_pos * stride - padding and
-// end (kernelSize - 1) * dilation above that, spaced by dilation. They are
-// visited from the upper corner downwards, last axis varying fastest.
-//
-// Every accepted candidate fills one row of NDim + 1 entries in `out`: the
-// NDim coordinates followed by the kernel offset flattened row-major over
-// kernelSize. A candidate is accepted only if each coordinate lies in
-// [0, outSpatialShape - 1]. Rejected candidates are still written into the
-// current row before being dropped, so the row at index <return value> may
-// hold leftover data and `out` needs room for it.
-//
-// All pointer arguments except `out` are read as arrays of NDim entries.
-// Returns the number of accepted rows.
-template <typename Index, unsigned NDim>
-TV_HOST_DEVICE Index getValidOutPosTranspose(
-    const Index *input_pos, const Index *kernelSize, const Index *stride,
-    const Index *padding, const Index *dilation, const Index *outSpatialShape,
-    Index *out) {
-  Index first[NDim];  // lowest candidate coordinate per axis
-  Index last[NDim];   // highest candidate coordinate per axis
-  Index extent[NDim]; // number of candidates per axis
-  Index steps[NDim];  // odometer: how far below `last` we currently are
-  Index total = 1;
-#pragma unroll
-  for (unsigned d = 0; d < NDim; ++d) {
-    first[d] = input_pos[d] * stride[d] - padding[d];
-    last[d] = first[d] + (kernelSize[d] - 1) * dilation[d];
-    extent[d] = ((last[d] - first[d]) / dilation[d] + 1);
-    total *= extent[d];
-    steps[d] = 0;
-  }
-
-  Index accepted = 0;
-  for (int n = 0; n < total; ++n) {
-    Index *row = out + accepted * (NDim + 1);
-    bool inside = true;
-    Index scale = 1;
-    Index kernelOffset = 0;
-#pragma unroll
-    for (int d = NDim - 1; d >= 0; --d) {
-      const Index coord = last[d] - steps[d] * dilation[d];
-      row[d] = coord;
-      inside = inside && !(coord < 0 || (coord > outSpatialShape[d] - 1));
-      kernelOffset += scale * (coord - first[d]) / dilation[d];
-      scale *= kernelSize[d];
-    }
-    row[NDim] = kernelOffset;
-    // Only an accepted candidate claims its row; otherwise the next
-    // candidate overwrites it.
-    if (inside) {
-      ++accepted;
-    }
-
-    // Advance the odometer, carrying from the last axis towards the first.
-    // Axis 0 never carries out.
-    steps[NDim - 1] += 1;
-#pragma unroll
-    for (int d = NDim - 1; d > 0; --d) {
-      if (steps[d] == extent[d]) {
-        steps[d - 1] += 1;
-        steps[d] = 0;
-      }
-    }
-  }
-  return accepted;
-}
-
-} // namespace spconv
-
-#endif
\ No newline at end of file
--- a/include/spconv/indice.cu.h
+++ b/include/spconv/indice.cu.h
--- a/include/spconv/indice.h
+++ b/include/spconv/indice.h
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef SPARSE_CONV_INDICE_FUNCTOR_H_
-#define SPARSE_CONV_INDICE_FUNCTOR_H_
-#include <tensorview/tensorview.h>
-#include <torch/script.h>
-
-namespace spconv {
-// Declarations of the index-pair construction entry points. Only the
-// signatures are visible in this header: every function returns an int and
-// takes torch::Tensor buffers plus per-axis std::vector<int64_t> geometry
-// (kernel size, stride, padding, dilation, output spatial shape).
-// NOTE(review): the meaning of the int return value and which tensors are
-// inputs versus outputs is not shown here; confirm against the definitions.
-
-// CUDA, first pass of the regular (non-submanifold) convolution index build.
-// NOTE(review): "p1" presumably means phase 1 of a two-step build that
-// create_conv_indice_pair_p2_cuda completes; confirm.
-int create_conv_indice_pair_p1_cuda(
-    torch::Tensor indicesIn, torch::Tensor indicePairs, torch::Tensor indiceNum,
-    torch::Tensor indicePairUnique, std::vector<int64_t> kernelSize,
-    std::vector<int64_t> stride, std::vector<int64_t> padding,
-    std::vector<int64_t> dilation, std::vector<int64_t> outSpatialShape,
-    bool transpose);
-
-// CUDA, second pass. Takes the pair buffers from the first pass together
-// with indicesOut / gridsOut, and the resetGrid / useHash switches.
-int create_conv_indice_pair_p2_cuda(
-    torch::Tensor indicesIn, torch::Tensor indicesOut, torch::Tensor gridsOut,
-    torch::Tensor indicePairs, torch::Tensor indiceNum,
-    torch::Tensor indicePairUnique, std::vector<int64_t> outSpatialShape,
-    bool transpose, bool resetGrid, bool useHash);
-
-// CUDA, submanifold convolution variant (no indicesOut argument).
-int create_submconv_indice_pair_cuda(
-    torch::Tensor indicesIn, torch::Tensor gridsOut, torch::Tensor indicePairs,
-    torch::Tensor indiceNum, std::vector<int64_t> kernelSize,
-    std::vector<int64_t> stride, std::vector<int64_t> padding,
-    std::vector<int64_t> dilation, std::vector<int64_t> outSpatialShape,
-    bool transpose, bool resetGrid, bool useHash);
-
-// CPU counterpart of the regular convolution index build, done in one call.
-int create_conv_indice_pair_cpu(
-    torch::Tensor indicesIn, torch::Tensor indicesOut, torch::Tensor gridsOut,
-    torch::Tensor indicePairs, torch::Tensor indiceNum,
-    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
-    std::vector<int64_t> padding, std::vector<int64_t> dilation,
-    std::vector<int64_t> outSpatialShape, bool transpose, bool resetGrid,
-    bool useHash);
-
-// CPU counterpart of the submanifold convolution index build; same
-// parameter list as create_submconv_indice_pair_cuda.
-int create_submconv_indice_pair_cpu(
-    torch::Tensor indicesIn, torch::Tensor gridsOut, torch::Tensor indicePairs,
-    torch::Tensor indiceNum, std::vector<int64_t> kernelSize,
-    std::vector<int64_t> stride, std::vector<int64_t> padding,
-    std::vector<int64_t> dilation, std::vector<int64_t> outSpatialShape,
-    bool transpose, bool resetGrid, bool useHash);
-
-} // namespace spconv
-
-#endif
\ No newline at end of file
--- a/include/spconv/maxpool.h
+++ b/include/spconv/maxpool.h
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef SPARSE_MAXPOOL_FUNCTOR_H_
-#define SPARSE_MAXPOOL_FUNCTOR_H_
-#include <tensorview/mp_helper.h>
-#include <tensorview/tensor.h>
-#include <tensorview/torch_utils.h>
-#include <torch/script.h>
-
-namespace spconv {
-
-// Sparse max-pooling entry points, declared here for CPU and CUDA. The
-// forward and backward variants take the same tensors on either device;
-// the backward variants additionally take `dout` and `din`.
-// NOTE(review): by name, indicesIn / indicesOut are presumably the paired
-// input/output row indices and `size` the number of pairs to process;
-// confirm against the definitions, which are not in this header.
-
-// Backward pass on the CPU.
-void maxpool_bwd_cpu(torch::Tensor outFeatures, torch::Tensor inFeatures,
-                     torch::Tensor dout, torch::Tensor din,
-                     torch::Tensor indicesIn, torch::Tensor indicesOut,
-                     int size);
-
-// Forward pass on the CPU.
-void maxpool_fwd_cpu(torch::Tensor outFeatures, torch::Tensor inFeatures,
-                     torch::Tensor indicesIn, torch::Tensor indicesOut,
-                     int size);
-
-// Backward pass on CUDA.
-void maxpool_bwd_cuda(torch::Tensor outFeatures, torch::Tensor inFeatures,
-                      torch::Tensor dout, torch::Tensor din,
-                      torch::Tensor indicesIn, torch::Tensor indicesOut,
-                      int size);
-
-// Forward pass on CUDA.
-void maxpool_fwd_cuda(torch::Tensor outFeatures, torch::Tensor inFeatures,
-                      torch::Tensor indicesIn, torch::Tensor indicesOut,
-                      int size);
-
-} // namespace spconv
-
-#endif
\ No newline at end of file
--- a/include/spconv/minkowski.cu.h
+++ b/include/spconv/minkowski.cu.h
-/* Copyright (c) Chris Choy (chrischoy@ai.stanford.edu).
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * Please cite "4D Spatio-Temporal ConvNets: Minkowski Convolutional Neural
- * Networks", CVPR'19 (https://arxiv.org/abs/1904.08755) if you use any part
- * of the code.
- */
-
-// Tiled matrix product with a row gather on the input and a row scatter on
-// the output: for every y < hA, row in_map[y] of A (width wA) is multiplied
-// by B (hB x wB) and the result is atomically added to row out_map[y] of C
-// (width wB).
-// NOTE(review): the indexing assumes a BLOCK_SIZE x BLOCK_SIZE thread block;
-// the launch configuration is not visible here.
-template <typename Dtype, typename Itype, int BLOCK_SIZE>
-__global__ void matmul(const Dtype *A, const int wA, const int hA,
-                       const Dtype *B, const int wB, const int hB, Dtype *C,
-                       const Itype *in_map, const Itype *out_map) {
-  // Use in_feat as A and kernel as B
-
-  // Block index
-  const int bx = blockIdx.x;
-  const int by = blockIdx.y;
-
-  // Thread index
-  const int tx = threadIdx.x;
-  const int ty = threadIdx.y;
-
-  // Coordinate. y is for rows (it indexes in_map / out_map), x is for
-  // columns (it is bounded by wB).
-  const int x = BLOCK_SIZE * bx + tx;
-  const int y = BLOCK_SIZE * by + ty;
-
-  // Csub is used to store the element of the block sub-matrix
-  // that is computed by the thread
-  Dtype Csub = 0;
-
-  // Out-of-range rows fall back to index 0; their results are never
-  // written because of the y < hA guards below.
-  const Itype in_row = y < hA ? in_map[y] : 0;
-  const Itype out_row = y < hA ? out_map[y] : 0;
-
-  // Loop over all the sub-matrices of A and B
-  // required to compute the block sub-matrix
-  for (int s = 0; s < wA; s += BLOCK_SIZE) {
-    // Declaration of the shared memory array As used to
-    // store the sub-matrix of A
-    __shared__ Dtype As[BLOCK_SIZE][BLOCK_SIZE];
-
-    // Declaration of the shared memory array Bs used to
-    // store the sub-matrix of B
-    __shared__ Dtype Bs[BLOCK_SIZE][BLOCK_SIZE];
-
-    // Load the matrices from device memory
-    // to shared memory; each thread loads
-    // one element of each matrix. Elements outside either matrix are
-    // loaded as 0 so they do not contribute to the sum.
-    As[ty][tx] = ((s + tx) < wA && y < hA) ? A[wA * in_row + s + tx] : 0;
-    Bs[ty][tx] = ((s + ty) < hB && x < wB) ? B[wB * (s + ty) + x] : 0;
-
-    // Synchronize to make sure the matrices are loaded
-    __syncthreads();
-
-    // Multiply the two matrices together;
-    // each thread computes one element
-    // of the block sub-matrix
-#pragma unroll
-    for (int k = 0; k < BLOCK_SIZE; ++k) {
-      Csub += As[ty][k] * Bs[k][tx];
-    }
-
-    // Synchronize to make sure that the preceding
-    // computation is done before loading two new
-    // sub-matrices of A and B in the next iteration
-    __syncthreads();
-  }
-
-  // Write the block sub-matrix to device memory;
-  // each thread writes one element. atomicAdd is used instead of the plain
-  // accumulation shown below it.
-  // NOTE(review): presumably because several y can map to the same out_row;
-  // confirm against the caller.
-  if (y < hA && x < wB)
-    atomicAdd(&C[wB * out_row + x], Csub);
-  // C[wB * out_row + x] += Csub;
-}
-
-// Fused tiled kernel producing two accumulations in one pass:
-//   C[in_map[y], x]  += sum_k A[out_map[y], k] * B[x, k]   (A times B^T)
-//   E[d, a]          += sum_y D[in_map[y], d] * A[out_map[y], a]
-// Both results are added with atomicAdd.
-// NOTE(review): the indexing assumes a BLOCK_SIZE x BLOCK_SIZE thread block;
-// the launch configuration is not visible here.
-template <typename Dtype, typename Itype, int BLOCK_SIZE>
-__global__ void matmul2(const Dtype *A, const int wA, const int hA,
-                        const Dtype *B, const int wB, const int hB,
-                        const Dtype *D, const int wD, const int hD, Dtype *C,
-                        Dtype *E, const Itype *in_map, const Itype *out_map) {
-  // Use grad_out_feat as A, transposed kernel weight as B, and in_feat as D
-
-  // Block index
-  const int bx = blockIdx.x;
-  const int by = blockIdx.y;
-
-  // Thread index
-  const int tx = threadIdx.x;
-  const int ty = threadIdx.y;
-
-  // Coordinate. y is for rows, x is for columns.
-  const int x = BLOCK_SIZE * bx + tx;
-  const int y = BLOCK_SIZE * by + ty;
-
-  // Out-of-range rows fall back to index 0; the guards below keep them from
-  // contributing.
-  const Itype in_row = y < hA ? in_map[y] : 0;
-  const Itype out_row = y < hA ? out_map[y] : 0;
-
-  // Csub is used to store the element of the block sub-matrix
-  // that is computed by the thread
-  Dtype Csub = 0;
-  Dtype Esub = 0;
-
-  // Declaration of the shared memory array As used to
-  // store the sub-matrix of A
-  __shared__ Dtype As[BLOCK_SIZE][BLOCK_SIZE];
-
-  // Declaration of the shared memory array BTs used to
-  // store the sub-matrix of B (read transposed)
-  __shared__ Dtype BTs[BLOCK_SIZE][BLOCK_SIZE];
-
-  // Declaration of the shared memory array DTs used to
-  // store the sub-matrix of D
-  __shared__ Dtype DTs[BLOCK_SIZE][BLOCK_SIZE];
-
-  // For DTs = D^T[...:..., ...:...], use the transposed grid dimension for A.
-  // This tile is loaded once, before the loop; the first __syncthreads()
-  // inside the loop makes it visible to the whole block.
-  DTs[ty][tx] = (x < wD && y < hD) ? D[wD * in_row + x] : 0;
-
-  // Loop over all the sub-matrices of A and B
-  // required to compute the block sub-matrix
-  for (int s = 0; s < wA; s += BLOCK_SIZE) {
-    // Load the matrices from device memory
-    // to shared memory; each thread loads
-    // one element of each matrix
-    As[ty][tx] = ((s + tx) < wA && y < hA) ? A[wA * out_row + s + tx] : 0;
-
-    // Transposed kernel
-    BTs[ty][tx] = ((s + ty) < wB && x < hB) ? B[wB * x + s + ty] : 0;
-
-    // Synchronize to make sure the matrices are loaded
-    __syncthreads();
-
-    // Multiply the two matrices together;
-    // each thread computes one element
-    // of the block sub-matrix
-#pragma unroll
-    for (int k = 0; k < BLOCK_SIZE; ++k) {
-      Csub += As[ty][k] * BTs[k][tx];
-    }
-
-    // For Esub, reset to 0: unlike Csub it is flushed to E once per tile.
-    Esub = 0;
-#pragma unroll
-    for (int k = 0; k < BLOCK_SIZE; ++k) {
-      Esub += DTs[k][ty] * As[k][tx];
-    }
-
-    // Synchronize to make sure that the preceding
-    // computation is done before loading two new
-    // sub-matrices of A and B in the next iteration
-    __syncthreads();
-
-    // For the E matrix, which requires accumulation over multiple blocks, use
-    // atomic addition. This can be replaced with a more sophisticated
-    // reduction algorithm.
-    if ((bx * BLOCK_SIZE + ty) < wD && (s + tx) < wA)
-      atomicAdd(&E[wA * (bx * BLOCK_SIZE + ty) + (s + tx)], Esub);
-  }
-
-  // Write the block sub-matrix to device memory;
-  // each thread writes one element
-  if (y < hA && x < hB)
-    atomicAdd(&C[hB * in_row + x], Csub);
-}
--- a/include/spconv/nms.h
+++ b/include/spconv/nms.h
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef NMS_CPU_H
-#define NMS_CPU_H
-#include <pybind11/pybind11.h>
-// must include pybind11/stl.h if using containers in STL in arguments.
-#include "box_iou.h"
-#include "nms_gpu.h"
-#include <algorithm>
-#include <boost/geometry.hpp>
-#include <pybind11/numpy.h>
-#include <pybind11/stl.h>
-#include <vector>
-namespace spconv {
-namespace py = pybind11;
-using namespace pybind11::literals;
-
-// Greedy non-maximum suppression for axis-aligned boxes on the CPU.
-//
-// `boxes` is read as a 2-D array whose columns 0..3 are x1, y1, x2, y2.
-// `order` lists box indices in the order they should be considered. A box
-// is kept unless an earlier kept box overlaps it with IoU >= thresh. `eps`
-// is added to every width and height (pass 1 for inclusive pixel extents).
-// Returns the indices of the kept boxes, in the order they were visited.
-template <typename DType>
-std::vector<int> non_max_suppression_cpu(py::array_t<DType> boxes,
-                                         py::array_t<int> order, DType thresh,
-                                         DType eps = 0) {
-  auto ndets = boxes.shape(0);
-  auto box = boxes.template unchecked<2>();
-  auto rank = order.template unchecked<1>();
-  auto removed_arr = zeros<int>({int(ndets)});
-  auto removed = removed_arr.template mutable_unchecked<1>();
-  auto area_arr = zeros<DType>({int(ndets)});
-  auto box_area = area_arr.template mutable_unchecked<1>();
-
-  // Pre-compute every box area once.
-  for (int b = 0; b < ndets; ++b) {
-    const DType width = box(b, 2) - box(b, 0) + eps;
-    const DType height = box(b, 3) - box(b, 1) + eps;
-    box_area(b) = width * height;
-  }
-
-  std::vector<int> keep;
-  for (int pos = 0; pos < ndets; ++pos) {
-    const int cur = rank(pos);
-    if (removed(cur) == 1)
-      continue;
-    keep.push_back(cur);
-
-    for (int later = pos + 1; later < ndets; ++later) {
-      const int other = rank(later);
-      if (removed(other) == 1)
-        continue;
-
-      // Overlap along x; a non-positive extent means the boxes are disjoint.
-      const DType right = std::min(box(cur, 2), box(other, 2));
-      const DType left = std::max(box(cur, 0), box(other, 0));
-      const DType w = right - left + eps;
-      if (!(w > 0))
-        continue;
-
-      // Overlap along y.
-      const DType bottom = std::min(box(cur, 3), box(other, 3));
-      const DType top = std::max(box(cur, 1), box(other, 1));
-      const DType h = bottom - top + eps;
-      if (!(h > 0))
-        continue;
-
-      const DType inter = w * h;
-      const DType ovr = inter / (box_area(cur) + box_area(other) - inter);
-      if (ovr >= thresh)
-        removed(other) = 1;
-    }
-  }
-  return keep;
-}
-
-// Greedy non-maximum suppression for rotated boxes on the CPU.
-//
-// `box_corners` is read as a 3-D array indexed (box, corner 0..3, x/y).
-// `order` lists box indices in the order they should be considered.
-// `standup_iou` is read as a 2-D array indexed (box, box); pairs whose
-// entry is <= 0 are skipped without building any polygon. For the remaining
-// pairs the exact polygon IoU is computed with Boost.Geometry and the later
-// box is suppressed when IoU >= thresh.
-//
-// If Boost.Geometry throws while the polygons are built or intersected,
-// both boxes are printed to stdout and the pair is skipped.
-// Returns the indices of the kept boxes, in the order they were visited.
-template <typename DType>
-std::vector<int> rotate_non_max_suppression_cpu(py::array_t<DType> box_corners,
-                                                py::array_t<int> order,
-                                                py::array_t<DType> standup_iou,
-                                                DType thresh) {
-  auto ndets = box_corners.shape(0);
-  auto box_corners_r = box_corners.template unchecked<3>();
-  auto order_r = order.template unchecked<1>();
-  auto suppressed = zeros<int>({int(ndets)});
-  auto suppressed_rw = suppressed.template mutable_unchecked<1>();
-  auto standup_iou_r = standup_iou.template unchecked<2>();
-  std::vector<int> keep;
-
-  namespace bg = boost::geometry;
-  typedef bg::model::point<DType, 2, bg::cs::cartesian> point_t;
-  typedef bg::model::polygon<point_t> polygon_t;
-  polygon_t poly, qpoly;
-  std::vector<polygon_t> poly_inter, poly_union;
-
-  // Appends the four corners of box `b`, then the first corner again so the
-  // ring is closed.
-  auto append_box = [&box_corners_r](polygon_t &ring, int b) {
-    for (int k = 0; k < 4; ++k) {
-      bg::append(ring,
-                 point_t(box_corners_r(b, k, 0), box_corners_r(b, k, 1)));
-    }
-    bg::append(ring, point_t(box_corners_r(b, 0, 0), box_corners_r(b, 0, 1)));
-  };
-
-  // Prints the four corners of box `b` under the given title.
-  auto dump_box = [&box_corners_r](const char *title, int b) {
-    std::cout << title << std::endl;
-    for (int k = 0; k < 4; ++k) {
-      std::cout << box_corners_r(b, k, 0) << " " << box_corners_r(b, k, 1)
-                << std::endl;
-    }
-  };
-
-  for (int _i = 0; _i < ndets; ++_i) {
-    const int i = order_r(_i);
-    if (suppressed_rw(i) == 1)
-      continue;
-    keep.push_back(i);
-    for (int _j = _i + 1; _j < ndets; ++_j) {
-      const int j = order_r(_j);
-      if (suppressed_rw(j) == 1)
-        continue;
-      if (standup_iou_r(i, j) <= 0.0)
-        continue;
-
-      // Start every pair from empty geometry. Clearing here rather than at
-      // the bottom of the loop guarantees that a pair skipped by the
-      // exception handler cannot leave half-built rings behind for the next
-      // pair to append onto.
-      poly.clear();
-      qpoly.clear();
-      poly_inter.clear();
-      poly_union.clear();
-
-      try {
-        append_box(poly, i);
-        append_box(qpoly, j);
-        bg::intersection(poly, qpoly, poly_inter);
-      } catch (const std::exception &) {
-        dump_box("box i corners:", i);
-        dump_box("box j corners:", j);
-        continue;
-      }
-
-      if (poly_inter.empty())
-        continue;
-      const DType inter_area = bg::area(poly_inter.front());
-
-      bg::union_(poly, qpoly, poly_union);
-      if (poly_union.empty()) // ignore invalid box
-        continue;
-      const DType union_area = bg::area(poly_union.front());
-
-      const DType overlap = inter_area / union_area;
-      if (overlap >= thresh)
-        suppressed_rw(j) = 1;
-    }
-  }
-  return keep;
-}
-#ifdef TV_CUDA
-constexpr int const threadsPerBlock = sizeof(unsigned long long) * 8;
-
-// Python-facing wrapper around _nms_gpu: hands the raw buffers of `boxes`
-// and `keep_out` to the GPU implementation, together with the two
-// dimensions of `boxes`, and returns whatever _nms_gpu returns.
-template <typename DType>
-int non_max_suppression(py::array_t<DType> boxes, py::array_t<int> keep_out,
-                        DType nms_overlap_thresh, int device_id) {
-  py::buffer_info boxes_buf = boxes.request();
-  DType *boxes_data = static_cast<DType *>(boxes_buf.ptr);
-
-  py::buffer_info keep_buf = keep_out.request();
-  int *keep_data = static_cast<int *>(keep_buf.ptr);
-
-  const auto num_boxes = boxes.shape(0);
-  const auto box_dim = boxes.shape(1);
-  return _nms_gpu<DType, threadsPerBlock>(keep_data, boxes_data, num_boxes,
-                                          box_dim, nms_overlap_thresh,
-                                          device_id);
-}
-#endif
-
-} // namespace spconv
-#endif
--- a/include/spconv/nms_functor.h
+++ b/include/spconv/nms_functor.h
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef NMS_FUNCTOR_H_
-#define NMS_FUNCTOR_H_
-#include <tensorview/tensorview.h>
-
-namespace spconv {
-namespace functor {
-// Device-dispatched NMS for axis-aligned boxes. Only the call operator is
-// declared here; specialisations per Device are defined elsewhere.
-// NOTE(review): the Index return value is presumably the number of entries
-// written to `keep`; confirm against the specialisations.
-template <typename Device, typename T, typename Index>
-struct NonMaxSupressionFunctor {
-  Index operator()(const Device &d, tv::TensorView<Index> keep,
-                   tv::TensorView<const T> boxes, T threshold, T eps);
-};
-
-// Device-dispatched NMS for rotated boxes. Takes the box corners and a
-// stand-up IoU view instead of plain boxes, and has no eps argument.
-template <typename Device, typename T, typename Index>
-struct rotateNonMaxSupressionFunctor {
-  Index operator()(const Device &d, tv::TensorView<Index> keep,
-                   tv::TensorView<const T> boxCorners,
-                   tv::TensorView<const T> standupIoU, T threshold);
-};
-
-} // namespace functor
-} // namespace spconv
-
-#endif
\ No newline at end of file
--- a/include/spconv/nms_gpu.h
+++ b/include/spconv/nms_gpu.h
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-// GPU non-maximum suppression entry point (declaration only).
-// `boxes_host` holds boxes_num x boxes_dim values and `keep_out` receives
-// the result; `device_id` selects the GPU.
-// NOTE(review): the parameter name suggests boxes_host is host memory, and
-// the int return is presumably the number of kept boxes; confirm against
-// the definition.
-template <typename DType, int BLOCK_THREADS>
-int _nms_gpu(int *keep_out, const DType *boxes_host, int boxes_num,
-             int boxes_dim, DType nms_overlap_thresh, int device_id);
--- a/include/spconv/nms_ops.h
+++ b/include/spconv/nms_ops.h
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef NMS_TORCH_OP_H_
-#define NMS_TORCH_OP_H_
-
-#include <spconv/indice.h>
-#include <spconv/nms_functor.h>
-#include <spconv/reordering.h>
-#include <tensorview/torch_utils.h>
-#include <torch/script.h>
-#include <utility/timer.h>
-
-namespace spconv {
-// torch.jit's documentation says it only supports int64, so we need to
-// convert to int32 ourselves.
-// Score-ordered non-maximum suppression exposed as a torch op.
-//
-// boxes        boxes to filter; dtype must match T.
-// scores       one score per box.
-// preMaxSize   if > 0, only the preMaxSize highest-scoring boxes are
-//              considered; otherwise all boxes are considered.
-// postMaxSize  if > 0, at most this many results are returned.
-// thresh, eps  forwarded to NonMaxSupressionFunctor.
-//
-// Returns an int64 tensor of indices into the ORIGINAL `boxes` argument,
-// on the same device as `boxes`. Only CPU tensors are implemented; any
-// other device raises via TV_ASSERT_RT_ERR.
-//
-// NOTE(review): the functor receives boxes but no scores, so this function
-// assumes it gives priority to earlier rows. That is why both branches
-// below order the boxes by descending score; confirm against the functor.
-template <typename T>
-torch::Tensor nonMaxSuppression(torch::Tensor boxes, torch::Tensor scores,
-                                int64_t preMaxSize, int64_t postMaxSize,
-                                double thresh, double eps) {
-  tv::check_torch_dtype<T>(boxes);
-  auto resOptions =
-      torch::TensorOptions().dtype(torch::kInt64).device(boxes.device());
-  if (boxes.size(0) == 0) {
-    return torch::zeros({0}, resOptions);
-  }
-
-  // `indices` maps a row of the reordered boxes back to its original row.
-  torch::Tensor indices;
-  if (preMaxSize > 0) {
-    preMaxSize = std::min(scores.size(0), preMaxSize);
-    // topk returns its results sorted from highest to lowest score.
-    indices = std::get<1>(torch::topk(scores, preMaxSize));
-  } else {
-    // torch::sort is ascending by default; request descending explicitly so
-    // this branch orders boxes the same way as the topk branch.
-    indices =
-        std::get<1>(torch::sort(scores, /*dim=*/-1, /*descending=*/true));
-  }
-  boxes = torch::index_select(boxes, 0, indices);
-  if (boxes.size(0) == 0)
-    return torch::zeros({0}, resOptions);
-
-  auto keep = torch::zeros({boxes.size(0)}, resOptions);
-  int64_t keepNum = 0;
-  if (boxes.device().type() == torch::kCPU) {
-    auto nmsFunctor = functor::NonMaxSupressionFunctor<tv::CPU, T, int64_t>();
-    keepNum = nmsFunctor(tv::CPU(), tv::torch2tv<int64_t>(keep),
-                         tv::torch2tv<const T>(boxes), T(thresh), T(eps));
-  } else {
-    TV_ASSERT_RT_ERR(false, "not implemented");
-  }
-  if (postMaxSize <= 0) {
-    postMaxSize = keepNum;
-  }
-  keep = keep.slice(0, 0, std::min(keepNum, postMaxSize));
-  // `keep` holds rows of the reordered boxes. Translate them back to rows
-  // of the caller's tensor in every case, not only when preMaxSize > 0.
-  return torch::index_select(indices, 0, keep);
-}
-
-} // namespace spconv
-
-#endif
\ No newline at end of file
--- a/include/spconv/pillar_scatter_functor.h
+++ b/include/spconv/pillar_scatter_functor.h
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef POINTPILLARS_SCATTER_FUNCTOR_H_
-#define POINTPILLARS_SCATTER_FUNCTOR_H_
-#include <tensorview/tensorview.h>
-
-namespace spconv {
-namespace functor {
-// Device-dispatched scatter of pillar features into a canvas. Only the
-// call operator is declared here; `canvas` is the sole writable view,
-// `features` and `coors` are read-only views of element type T.
-// NOTE(review): the Index template parameter does not appear in the
-// signature; its role is only visible in the specialisations.
-template <typename Device, typename T, typename Index>
-struct PointPillarScatter {
-  void operator()(const Device &d, tv::TensorView<T> canvas,
-                  tv::TensorView<const T> features,
-                  tv::TensorView<const T> coors);
-};
-
-} // namespace functor
-} // namespace spconv
-
-#endif
\ No newline at end of file
--- a/include/spconv/pillar_scatter_ops.h
+++ b/include/spconv/pillar_scatter_ops.h
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef PILLAR_SCATTER_OP_H_
-#define PILLAR_SCATTER_OP_H_
-
-#include <spconv/pillar_scatter_functor.h>
-#include <tensorview/torch_utils.h>
-#include <torch/script.h>
-#include <utility/timer.h>
-
-namespace spconv {
-// torch.jit's documentation says it only supports int64, so we need to
-// convert to int32 ourselves.
-
-// Scatters pillar features into a freshly allocated, zero-filled 4-D
-// canvas and returns it.
-//
-// features  CUDA tensor with at least 3 dims; dim 0 must be 1 and dim 1
-//           must equal shape[1].
-// coors     tensor of dtype T; dim 0 must be 1 and dim 2 must match
-//           features.
-// shape     1-D CPU int tensor with 4 entries giving the canvas size.
-//
-// The canvas uses the options (dtype/device) of `features`. Any violated
-// precondition raises via TV_ASSERT_RT_ERR. A build without TV_CUDA has no
-// scatter implementation and raises instead of returning an empty canvas.
-template <typename T>
-torch::Tensor pointPillarScatter(torch::Tensor features, torch::Tensor coors,
-                                 torch::Tensor shape) {
-  TV_ASSERT_RT_ERR(shape.device().type() == torch::kCPU,
-                   "shape must be a CPU tensor");
-  TV_ASSERT_RT_ERR(features.device().type() == torch::kCUDA,
-                   "features must be a CUDA tensor");
-  TV_ASSERT_RT_ERR(shape.dim() == 1, "shape must be 1-D");
-  TV_ASSERT_RT_ERR(shape.size(0) == 4, "shape must have 4 entries");
-  TV_ASSERT_RT_ERR(features.dim() >= 3, "features must have at least 3 dims");
-  TV_ASSERT_RT_ERR(features.size(0) == 1, "feature first dim must be 1");
-  TV_ASSERT_RT_ERR(coors.size(0) == 1, "coors first dim must be 1");
-  TV_ASSERT_RT_ERR(features.size(2) == coors.size(2),
-                   "features and coors must agree on dim 2");
-
-  tv::check_torch_dtype<int>(shape);
-  tv::check_torch_dtype<T>(coors);
-  auto shapeData = shape.data_ptr<int>();
-  torch::Tensor canvas =
-      torch::zeros({shapeData[0], shapeData[1], shapeData[2], shapeData[3]},
-                   features.options());
-  TV_ASSERT_RT_ERR(shapeData[1] == features.size(1),
-                   "shape[1] must equal features.size(1)");
-#ifdef TV_CUDA
-  functor::PointPillarScatter<tv::GPU, T, int> ftor;
-  ftor(tv::TorchGPU(), tv::torch2tv<T>(canvas),
-       tv::torch2tv<const T>(features.squeeze()),
-       tv::torch2tv<const T>(coors.squeeze()));
-#else
-  // Without the CUDA functor nothing would be scattered; fail loudly rather
-  // than hand back an all-zero canvas that looks like a valid result.
-  TV_ASSERT_RT_ERR(false, "pointPillarScatter requires a CUDA build");
-#endif
-  return canvas;
-}
-
-} // namespace spconv
-
-#endif
\ No newline at end of file
--- a/include/spconv/point2voxel.cu.h
+++ b/include/spconv/point2voxel.cu.h
-#pragma once
-
-#include <tensorview/kernel_utils.h>
-#include <tensorview/tensorview.h>
-#include <torch/script.h>
-
-namespace spconv {
-// Accumulates every point into its grid cell.
-//
-// For point p, the NDim coordinates at indexes[p * NDim ...] are flattened
-// against gridShape into a linear cell index. That index is stored in
-// pointIndex(p), the cell's counter in numPointsPerGrid is incremented, and
-// each of the point's features is added to the cell's row in `grids`.
-// All accumulation uses atomicAdd.
-template <typename Index, unsigned NDim>
-__global__ void scatterPointToGridKernel(
-    tv::TensorView<const float> points, tv::TensorView<const Index> indexes,
-    tv::TensorView<float> grids, tv::TensorView<Index> numPointsPerGrid,
-    tv::TensorView<Index> pointIndex,
-    const tv::SimpleVector<Index, NDim> gridShape) {
-  const int pointCount = points.dim(0);
-  const int featureCount = points.dim(1);
-
-  for (int p : tv::KernelLoopX<int>(pointCount)) {
-    const Index cell = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
-        indexes.data() + p * NDim, gridShape.data(), 0);
-    pointIndex(p) = cell;
-    atomicAdd(numPointsPerGrid.data() + cell, Index(1));
-
-    // Feature rows of the source point and of the destination cell.
-    auto src = points.data() + p * featureCount;
-    auto dst = grids.data() + cell * featureCount;
-#pragma unroll
-    for (int f = 0; f != featureCount; ++f) {
-      atomicAdd(dst + f, *(src + f));
-    }
-  }
-}
-
-// Produces one averaged voxel per entry of pointIndexUnique.
-//
-// For voxel v, the linear cell index pointIndexUnique(v) selects a row of
-// `grids`; each feature is divided by that cell's point count and written
-// to voxels(v, ...). The cell index is then expanded against gridShape into
-// NDim coordinates stored at coors[v * NDim ...].
-template <typename Index, unsigned NDim>
-__global__ void
-gatherPointFromGridKernel(tv::TensorView<const float> grids,
-                          tv::TensorView<const Index> numPointsPerGrid,
-                          tv::TensorView<const Index> pointIndexUnique,
-                          tv::TensorView<float> voxels,
-                          tv::TensorView<Index> coors,
-                          const tv::SimpleVector<Index, NDim> gridShape) {
-  const int voxelCount = voxels.dim(0);
-  const int featureCount = grids.dim(1);
-
-  for (int v : tv::KernelLoopX<int>(voxelCount)) {
-    Index cell = pointIndexUnique(v);
-#pragma unroll
-    for (int f = 0; f != featureCount; ++f) {
-      voxels(v, f) = grids(cell, f) / numPointsPerGrid(cell);
-    }
-    // Called for its write into `coors`; the returned value is not needed.
-    tv::rowArrayIdxInv<Index, NDim>(cell, coors.data() + v * NDim,
-                                    gridShape.data());
-  }
-}
-
-// Zeroes the accumulators of every cell listed in pointIndexUnique: the
-// cell's feature row in `grids` and its counter in numPointsPerGrid.
-//
-// The last entry of pointIndexUnique is deliberately not processed.
-// NOTE(review): presumably that entry is an out-of-range sentinel (see
-// resetPointIndexKernel, which fills with gridVolume); confirm with the
-// caller.
-template <typename Index>
-__global__ void resetGridKernel(tv::TensorView<float> grids,
-                                tv::TensorView<Index> numPointsPerGrid,
-                                tv::TensorView<Index> pointIndexUnique) {
-  const int numVoxels = pointIndexUnique.dim(0) - 1;
-  const int numFeatures = grids.dim(1);
-
-  for (int ix : tv::KernelLoopX<int>(numVoxels)) {
-    const Index index = pointIndexUnique(ix);
-    // Reset the counter once per cell, outside the feature loop, so it is
-    // cleared even when there are no features and is not rewritten for
-    // every feature.
-    numPointsPerGrid(index) = 0;
-#pragma unroll
-    for (int k = 0; k != numFeatures; ++k) {
-      grids(index, k) = 0;
-    }
-  }
-}
-
-// Fills pointIndex with gridVolume, leaving its final element untouched.
-// NOTE(review): gridVolume is presumably one past the largest valid cell
-// index and so acts as an "unassigned" marker; confirm with the caller.
-template <typename Index>
-__global__ void resetPointIndexKernel(tv::TensorView<Index> pointIndex,
-                                      const Index gridVolume) {
-  const int fillCount = pointIndex.dim(0) - 1;
-
-  for (int slot : tv::KernelLoopX<int>(fillCount)) {
-    pointIndex(slot) = gridVolume;
-  }
-}
-} // namespace spconv
--- a/include/spconv/point2voxel.h
+++ b/include/spconv/point2voxel.h
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <pybind11/pybind11.h>
-// must include pybind11/eigen.h if using eigen matrix as arguments.
-// must include pybind11/stl.h if using containers in STL in arguments.
-#include <algorithm>
-#include <pybind11/numpy.h>
-#include <pybind11/stl.h>
-// #include <vector>
-#include <iostream>
-#include <math.h>
-
-namespace spconv {
-namespace py = pybind11;
-using namespace pybind11::literals;
-
-/**
- * Groups points into voxels of a regular grid and copies their features into
- * per-voxel slots.
- *
- * Points are visited in order. The first NDim columns of a point are
- * converted to integer grid coordinates; a point falling outside the grid
- * (or whose coordinate is NaN) is skipped. The first point that lands in an
- * unseen cell creates a new voxel, unless max_voxels voxels already exist, in
- * which case the point is dropped. A point is stored only while its voxel
- * holds fewer than max_points points.
- *
- * @tparam DType element type of the point and voxel arrays.
- * @tparam NDim  number of grid dimensions; only 3 is supported because the
- *               lookup grid is indexed with exactly three coordinates.
- * @param points               2-D array (point, feature), read only.
- * @param voxels               3-D array (voxel, slot, feature), written.
- * @param voxel_point_mask     2-D array (voxel, slot); set to 1 for every
- *                             filled slot.
- * @param coors                2-D array (voxel, axis); receives the grid
- *                             coordinates of each voxel in reversed axis
- *                             order (last point column first).
- * @param num_points_per_voxel 1-D array; read as the current fill level of a
- *                             voxel and incremented per stored point.
- *                             NOTE(review): presumably zero-filled by the
- *                             caller -- confirm.
- * @param coor_to_voxelidx     NDim-D lookup grid; a cell holding -1 is
- *                             treated as empty. Every cell touched here is
- *                             set back to -1 before returning.
- * @param voxel_size           cell size per axis (NDim values).
- * @param coors_range          NDim lower bounds followed by NDim upper
- *                             bounds.
- * @param max_points           maximum number of points stored per voxel.
- * @param max_voxels           maximum number of voxels created.
- * @return number of voxels created.
- */
-template <typename DType, int NDim>
-int points_to_voxel_3d_np(py::array_t<DType> points, py::array_t<DType> voxels,
-                          py::array_t<DType> voxel_point_mask,
-                          py::array_t<int> coors,
-                          py::array_t<int> num_points_per_voxel,
-                          py::array_t<int> coor_to_voxelidx,
-                          std::vector<DType> voxel_size,
-                          std::vector<DType> coors_range, int max_points,
-                          int max_voxels) {
-  static_assert(NDim == 3,
-                "points_to_voxel_3d_np indexes the lookup grid with exactly "
-                "three coordinates");
-  auto points_rw = points.template mutable_unchecked<2>();
-  auto voxels_rw = voxels.template mutable_unchecked<3>();
-  auto voxel_point_mask_rw = voxel_point_mask.template mutable_unchecked<2>();
-  auto coors_rw = coors.mutable_unchecked<2>();
-  auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
-  auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
-  const py::ssize_t N = points_rw.shape(0);
-  const py::ssize_t num_features = points_rw.shape(1);
-  constexpr int ndim_minus_1 = NDim - 1;
-
-  int grid_size[NDim];
-  for (int i = 0; i < NDim; ++i) {
-    grid_size[i] =
-        round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
-  }
-
-  int voxel_num = 0;
-  int coor[NDim];
-  for (py::ssize_t i = 0; i < N; ++i) {
-    bool failed = false;
-    for (int j = 0; j < NDim; ++j) {
-      // Range-check in floating point before narrowing: converting NaN or an
-      // out-of-range value to int is undefined behaviour.
-      const auto cell =
-          floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
-      if (!(cell >= 0 && cell < grid_size[j])) {
-        failed = true;
-        break;
-      }
-      // Grid coordinates are stored in reversed axis order.
-      coor[ndim_minus_1 - j] = static_cast<int>(cell);
-    }
-    if (failed)
-      continue;
-
-    int voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
-    if (voxelidx == -1) {
-      // Keep scanning when the voxel budget is exhausted: later points may
-      // still belong to voxels that already exist.
-      if (voxel_num >= max_voxels)
-        continue;
-      voxelidx = voxel_num;
-      voxel_num += 1;
-      coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
-      for (int k = 0; k < NDim; ++k) {
-        coors_rw(voxelidx, k) = coor[k];
-      }
-    }
-
-    const int num = num_points_per_voxel_rw(voxelidx);
-    if (num < max_points) {
-      voxel_point_mask_rw(voxelidx, num) = DType(1);
-      for (py::ssize_t k = 0; k < num_features; ++k) {
-        voxels_rw(voxelidx, num, k) = points_rw(i, k);
-      }
-      num_points_per_voxel_rw(voxelidx) += 1;
-    }
-  }
-
-  // Restore the lookup grid so it can be reused by the next call.
-  for (int i = 0; i < voxel_num; ++i) {
-    coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1;
-  }
-  return voxel_num;
-}
-
-/**
- * Groups points into voxels of a regular grid, tracks the per-voxel feature
- * mean, and pads unused voxel slots with that mean.
- *
- * Points are visited in order. The first NDim columns of a point are
- * converted to integer grid coordinates; a point falling outside the grid
- * (or whose coordinate is NaN) is skipped. The first point that lands in an
- * unseen cell creates a new voxel, unless max_voxels voxels already exist, in
- * which case the point is dropped. A point is stored only while its voxel
- * holds fewer than max_points points; each stored point updates the running
- * mean of its voxel. After all points are processed, every slot of a created
- * voxel that received no point is filled with the voxel's mean.
- *
- * @tparam DType element type of the point, voxel and mean arrays.
- * @tparam NDim  number of grid dimensions; only 3 is supported because the
- *               lookup grid is indexed with exactly three coordinates.
- * @param points               2-D array (point, feature), read only.
- * @param voxel_point_mask     2-D array (voxel, slot); set to 1 for every
- *                             slot filled by a real point.
- * @param voxels               3-D array (voxel, slot, feature), written.
- * @param means                2-D array (voxel, feature); updated
- *                             incrementally. NOTE(review): presumably
- *                             zero-filled by the caller -- confirm.
- * @param coors                2-D array (voxel, axis); receives the grid
- *                             coordinates of each voxel in reversed axis
- *                             order (last point column first).
- * @param num_points_per_voxel 1-D array; read as the current fill level of a
- *                             voxel and incremented per stored point.
- *                             NOTE(review): presumably zero-filled by the
- *                             caller -- confirm.
- * @param coor_to_voxelidx     NDim-D lookup grid; a cell holding -1 is
- *                             treated as empty. Every cell touched here is
- *                             set back to -1 before returning.
- * @param voxel_size           cell size per axis (NDim values).
- * @param coors_range          NDim lower bounds followed by NDim upper
- *                             bounds.
- * @param max_points           maximum number of points stored per voxel.
- * @param max_voxels           maximum number of voxels created.
- * @return number of voxels created.
- */
-template <typename DType, int NDim>
-int points_to_voxel_3d_np_mean(
-    py::array_t<DType> points, py::array_t<DType> voxel_point_mask,
-    py::array_t<DType> voxels, py::array_t<DType> means, py::array_t<int> coors,
-    py::array_t<int> num_points_per_voxel, py::array_t<int> coor_to_voxelidx,
-    std::vector<DType> voxel_size, std::vector<DType> coors_range,
-    int max_points, int max_voxels) {
-  static_assert(NDim == 3,
-                "points_to_voxel_3d_np_mean indexes the lookup grid with "
-                "exactly three coordinates");
-  auto points_rw = points.template mutable_unchecked<2>();
-  auto means_rw = means.template mutable_unchecked<2>();
-  auto voxels_rw = voxels.template mutable_unchecked<3>();
-  auto voxel_point_mask_rw = voxel_point_mask.template mutable_unchecked<2>();
-  auto coors_rw = coors.mutable_unchecked<2>();
-  auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
-  auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
-  const py::ssize_t N = points_rw.shape(0);
-  const py::ssize_t num_features = points_rw.shape(1);
-  constexpr int ndim_minus_1 = NDim - 1;
-
-  int grid_size[NDim];
-  for (int i = 0; i < NDim; ++i) {
-    grid_size[i] =
-        round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
-  }
-
-  int voxel_num = 0;
-  int coor[NDim];
-  for (py::ssize_t i = 0; i < N; ++i) {
-    bool failed = false;
-    for (int j = 0; j < NDim; ++j) {
-      // Range-check in floating point before narrowing: converting NaN or an
-      // out-of-range value to int is undefined behaviour.
-      const auto cell =
-          floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
-      if (!(cell >= 0 && cell < grid_size[j])) {
-        failed = true;
-        break;
-      }
-      // Grid coordinates are stored in reversed axis order.
-      coor[ndim_minus_1 - j] = static_cast<int>(cell);
-    }
-    if (failed)
-      continue;
-
-    int voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
-    if (voxelidx == -1) {
-      // Keep scanning when the voxel budget is exhausted: later points may
-      // still belong to voxels that already exist.
-      if (voxel_num >= max_voxels)
-        continue;
-      voxelidx = voxel_num;
-      voxel_num += 1;
-      coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
-      for (int k = 0; k < NDim; ++k) {
-        coors_rw(voxelidx, k) = coor[k];
-      }
-    }
-
-    const int num = num_points_per_voxel_rw(voxelidx);
-    if (num < max_points) {
-      voxel_point_mask_rw(voxelidx, num) = DType(1);
-      for (py::ssize_t k = 0; k < num_features; ++k) {
-        voxels_rw(voxelidx, num, k) = points_rw(i, k);
-      }
-      num_points_per_voxel_rw(voxelidx) += 1;
-      // Incremental mean: mean += (x - mean) / count, with count == num + 1
-      // after this point has been added.
-      for (py::ssize_t k = 0; k < num_features; ++k) {
-        means_rw(voxelidx, k) +=
-            (points_rw(i, k) - means_rw(voxelidx, k)) / DType(num + 1);
-      }
-    }
-  }
-
-  for (int i = 0; i < voxel_num; ++i) {
-    // Restore the lookup grid so it can be reused by the next call.
-    coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1;
-    // Pad the slots that received no point with the voxel's mean features.
-    const int filled = num_points_per_voxel_rw(i);
-    for (int j = filled; j < max_points; ++j) {
-      for (py::ssize_t k = 0; k < num_features; ++k) {
-        voxels_rw(i, j, k) = means_rw(i, k);
-      }
-    }
-  }
-  return voxel_num;
-}
-
-/**
- * Groups points into voxels of a regular grid and flags each voxel according
- * to the spread of the third point column within its neighbourhood of
- * blocks.
- *
- * Voxelisation: points are visited in order. The first NDim columns of a
- * point are converted to integer grid coordinates; a point falling outside
- * the grid (or whose coordinate is NaN) is skipped. The first point that
- * lands in an unseen cell creates a new voxel, unless max_voxels voxels
- * already exist, in which case the point is dropped. A point is stored only
- * while its voxel holds fewer than max_points points. Each stored point also
- * updates the running minimum / maximum of its third column (points(i, 2))
- * for the block that contains the voxel, where a block index is the voxel's
- * coors(.,1) and coors(.,2) divided by block_factor.
- *
- * Filtering: for every created voxel, the minimum and maximum are taken over
- * a window of block_size x block_size blocks around the voxel's block
- * (clipped to the block grid), and voxel_mask is set to 1 when
- * height_threshold < (max - min) < height_high_threshold, else 0.
- *
- * NOTE(review): the block grid extent is grid_size / block_factor rounded
- * down, while a voxel's block index can reach that value when the grid size
- * is not a multiple of block_factor -- confirm that mins / maxs are
- * allocated large enough.
- *
- * @tparam DType element type of the point, voxel and min/max arrays.
- * @tparam NDim  number of grid dimensions; only 3 is supported because the
- *               lookup grid is indexed with exactly three coordinates.
- * @param points               2-D array (point, feature), read only.
- * @param voxels               3-D array (voxel, slot, feature), written.
- * @param voxel_point_mask     2-D array (voxel, slot); set to 1 for every
- *                             filled slot.
- * @param voxel_mask           1-D array (voxel); receives the 0/1 filter
- *                             result.
- * @param mins                 2-D array (block, block) of running minima.
- * @param maxs                 2-D array (block, block) of running maxima.
- *                             NOTE(review): mins / maxs are presumably
- *                             pre-filled with suitable extreme values by the
- *                             caller -- confirm.
- * @param coors                2-D array (voxel, axis); receives the grid
- *                             coordinates of each voxel in reversed axis
- *                             order (last point column first).
- * @param num_points_per_voxel 1-D array; read as the current fill level of a
- *                             voxel and incremented per stored point.
- * @param coor_to_voxelidx     NDim-D lookup grid; a cell holding -1 is
- *                             treated as empty. Every cell touched here is
- *                             set back to -1 before returning.
- * @param voxel_size           cell size per axis (NDim values).
- * @param coors_range          NDim lower bounds followed by NDim upper
- *                             bounds.
- * @param max_points           maximum number of points stored per voxel.
- * @param max_voxels           maximum number of voxels created.
- * @param block_factor         number of voxels per block along each of the
- *                             two blocked axes; must be positive.
- * @param block_size           width, in blocks, of the neighbourhood window.
- * @param height_threshold     exclusive lower bound on (max - min).
- * @param height_high_threshold exclusive upper bound on (max - min).
- * @return number of voxels created.
- * @throws py::value_error if block_factor is not positive.
- */
-template <typename DType, int NDim>
-int points_to_voxel_3d_with_filtering(
-    py::array_t<DType> points, py::array_t<DType> voxels,
-    py::array_t<DType> voxel_point_mask, py::array_t<int> voxel_mask,
-    py::array_t<DType> mins, py::array_t<DType> maxs, py::array_t<int> coors,
-    py::array_t<int> num_points_per_voxel, py::array_t<int> coor_to_voxelidx,
-    std::vector<DType> voxel_size, std::vector<DType> coors_range,
-    int max_points, int max_voxels, int block_factor, int block_size,
-    DType height_threshold, DType height_high_threshold) {
-  static_assert(NDim == 3,
-                "points_to_voxel_3d_with_filtering indexes the lookup grid "
-                "with exactly three coordinates");
-  // block_factor is used as a divisor for grid sizes and voxel coordinates;
-  // zero would divide by zero and a negative value would yield negative
-  // block indices.
-  if (block_factor <= 0) {
-    throw py::value_error("block_factor must be positive");
-  }
-  auto points_rw = points.template mutable_unchecked<2>();
-  auto mins_rw = mins.template mutable_unchecked<2>();
-  auto maxs_rw = maxs.template mutable_unchecked<2>();
-  auto voxels_rw = voxels.template mutable_unchecked<3>();
-  auto voxel_point_mask_rw = voxel_point_mask.template mutable_unchecked<2>();
-  auto voxel_mask_rw = voxel_mask.template mutable_unchecked<1>();
-  auto coors_rw = coors.mutable_unchecked<2>();
-  auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
-  auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
-  const py::ssize_t N = points_rw.shape(0);
-  const py::ssize_t num_features = points_rw.shape(1);
-  constexpr int ndim_minus_1 = NDim - 1;
-
-  int grid_size[NDim];
-  for (int i = 0; i < NDim; ++i) {
-    grid_size[i] =
-        round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
-  }
-  // Extent of the block grid along the two blocked axes.
-  const int block_shape_H = grid_size[1] / block_factor;
-  const int block_shape_W = grid_size[0] / block_factor;
-
-  int voxel_num = 0;
-  int coor[NDim];
-  for (py::ssize_t i = 0; i < N; ++i) {
-    bool failed = false;
-    for (int j = 0; j < NDim; ++j) {
-      // Range-check in floating point before narrowing: converting NaN or an
-      // out-of-range value to int is undefined behaviour.
-      const auto cell =
-          floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
-      if (!(cell >= 0 && cell < grid_size[j])) {
-        failed = true;
-        break;
-      }
-      // Grid coordinates are stored in reversed axis order.
-      coor[ndim_minus_1 - j] = static_cast<int>(cell);
-    }
-    if (failed)
-      continue;
-
-    int voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
-    if (voxelidx == -1) {
-      // Keep scanning when the voxel budget is exhausted: later points may
-      // still belong to voxels that already exist.
-      if (voxel_num >= max_voxels)
-        continue;
-      voxelidx = voxel_num;
-      voxel_num += 1;
-      coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
-      for (int k = 0; k < NDim; ++k) {
-        coors_rw(voxelidx, k) = coor[k];
-      }
-    }
-
-    const int num = num_points_per_voxel_rw(voxelidx);
-    if (num < max_points) {
-      voxel_point_mask_rw(voxelidx, num) = DType(1);
-      for (py::ssize_t k = 0; k < num_features; ++k) {
-        voxels_rw(voxelidx, num, k) = points_rw(i, k);
-      }
-      // Fold the point's third column into the min / max of its block.
-      const int block_row = coor[1] / block_factor;
-      const int block_col = coor[2] / block_factor;
-      mins_rw(block_row, block_col) =
-          std::min(points_rw(i, 2), mins_rw(block_row, block_col));
-      maxs_rw(block_row, block_col) =
-          std::max(points_rw(i, 2), maxs_rw(block_row, block_col));
-      num_points_per_voxel_rw(voxelidx) += 1;
-    }
-  }
-
-  for (int i = 0; i < voxel_num; ++i) {
-    const int voxel_row = coors_rw(i, 1);
-    const int voxel_col = coors_rw(i, 2);
-    // Restore the lookup grid so it can be reused by the next call.
-    coor_to_voxelidx_rw(coors_rw(i, 0), voxel_row, voxel_col) = -1;
-
-    const int block_row = voxel_row / block_factor;
-    const int block_col = voxel_col / block_factor;
-    DType min_value = mins_rw(block_row, block_col);
-    DType max_value = maxs_rw(block_row, block_col);
-
-    // Neighbourhood window of block_size blocks per axis, clipped to the
-    // block grid.
-    const int row_begin = std::max(0, block_row - block_size / 2);
-    const int row_end =
-        std::min(block_shape_H, block_row + block_size - block_size / 2);
-    const int col_begin = std::max(0, block_col - block_size / 2);
-    const int col_end =
-        std::min(block_shape_W, block_col + block_size - block_size / 2);
-
-    for (int r = row_begin; r < row_end; ++r) {
-      for (int c = col_begin; c < col_end; ++c) {
-        min_value = std::min(min_value, mins_rw(r, c));
-        max_value = std::max(max_value, maxs_rw(r, c));
-      }
-    }
-    const DType spread = max_value - min_value;
-    voxel_mask_rw(i) =
-        (spread > height_threshold) && (spread < height_high_threshold);
-  }
-  return voxel_num;
-}
-
-} // namespace spconv