working on tensor core test

01ed382c · yan.yan · 3517290c · 3517290c · 3517290c · 3517290c
Commit 01ed382c authored Oct 18, 2021 by yan.yan
20 changed files
--- a/include/cuhash/hash_table.h
+++ b/include/cuhash/hash_table.h
-// -------------------------------------------------------------
-// cuDPP -- CUDA Data Parallel Primitives library
-// -------------------------------------------------------------
-// $Revision:$
-// $Date:$
-// -------------------------------------------------------------
-// This source code is distributed under the terms of license.txt in
-// the root directory of this source distribution.
-// -------------------------------------------------------------
-
-/**
- * @file hash_table.h
- *
- * @brief Header for a basic hash table that stores one value per key.
- */
-
-#ifndef CUDAHT__CUCKOO__SRC__LIBRARY__HASH_TABLE__H
-#define CUDAHT__CUCKOO__SRC__LIBRARY__HASH_TABLE__H
-
-#include "definitions.h"
-#include "hash_functions.h"
-
-#include <cstdio>
-
-/** \addtogroup cudpp_app
- * @{
- */
-
-/** \addtogroup cudpp_hash_data_structures
- * @{
- */
-
-/* --------------------------------------------------------------------------
-   Doxygen definitions.
-   -------------------------------------------------------------------------- */
-/*! @namespace CudaHT
- *  @brief Encapsulates the hash table library.
- */
-
-/*! @namespace CuckooHashing
- *  @brief Encapsulates the cuckoo hash table that uses stashes.
- */
-
-/* -------------------------------------------------------------------------
-   Hash table code.
-   ------------------------------------------------------------------------- */
-namespace cuhash {
-
-//! Compute how many thread blocks are required for the given number of threads.
-dim3 ComputeGridDim(unsigned threads);
-
-//! Compute how long an eviction chain is allowed to become for a given input
-//! size.
-/*! \param[in] num_keys       Number of keys in the input.
- *  \param[in] table_size     Number of slots in the hash table.
- *  \param[in] num_functions  Number of hash functions being used.
- *  \returns The number of iterations that should be allowed.
- *
- *  The latter two parameters are only needed when using an empirical
- *  formula for computing the chain length.
- */
-unsigned ComputeMaxIterations(const unsigned num_keys,
-                              const unsigned table_size,
-                              const unsigned num_functions);
-
-//! Basic hash table that stores one value for each key.
-/*! The input consists of two unsigned arrays of keys and values.
- *  None of the keys are expected to be repeated.
- *
- *  @todo Templatize the interface without forcing the header file to
- *  have CUDA calls.
- *  @ingroup cudpp_app
- */
-class HashTable {
-public:
-  HashTable();
-
-  virtual ~HashTable() { Release(); }
-
-  //! Initialize the hash table's memory. Must be called before \ref
-  //! Build() and after the random number generator has been seeded.
-  /*! @param[in] max_input_size   Largest expected number of items in the input.
-   *  @param[in] space_usage Size of the hash table relative to the
-   *                         input. Bigger tables are faster to build
-   *                         and retrieve from.
-   *  @param[in] num_functions Number of hash functions to use. May be
-   *                           2-5. More hash functions make it easier
-   *                           to build the table, but increase
-   *                           retrieval times.
-   *  @returns Whether the hash table was initialized successfully (true)
-   *           or not (false).
-   *
-   *  The minimum space usage is dependent on the number of functions
-   *  being used; for two through five functions, the minimum space
-   *  usage is 2.1, 1.1, 1.03, and 1.02 respectively.
-   */
-  virtual bool Initialize(const unsigned max_input_size,
-                          const float space_usage = 1.25,
-                          const unsigned num_functions = 4);
-
-  //! Free all memory.
-  virtual void Release();
-
-  //! Build the hash table.
-  /*! @param[in] input_size   Number of key-value pairs being inserted.
-   *  @param[in] d_keys       Device memory array containing all of the input
-   *                          keys.
-   *  @param[in] d_vals       Device memory array containing the keys' values.
-   *  @returns Whether the hash table was built successfully (true) or
-   *           not (false).
-   *
-   *  Several attempts are allowed to build the hash table in case of failure.
-   *  The input keys are expected to be completely unique.
-   *  To reduce the chance of a failure, increase the space usage or number of
-   *  functions.
-   *  Keys are not allowed to be equal to cuhash::kKeyEmpty.
-   */
-  virtual bool Build(const unsigned input_size, const unsigned *d_keys,
-                     const unsigned *d_vals);
-
-  //! Query the hash table.
-  /*! @param[in] n_queries        Number of keys in the query set.
-   *  @param[in] d_query_keys     Device memory array containing all of
-   *                              the query keys.
-   *  @param[in] d_query_results  Values for the query keys.
-   *
-   *  kNotFound is returned for any query key that failed to be found
-   *  in the table.
-   */
-  virtual void Retrieve(const unsigned n_queries, const unsigned *d_query_keys,
-                        unsigned *d_query_results);
-
-  //! @name Accessors
-  /// @brief Mainly needed to use the __device__ CudaHT::retrieve()
-  /// function directly.
-  /// @{
-
-  //! Returns how many slots the hash table has.
-  inline unsigned get_table_size() const { return table_size_; }
-
-  //! Returns how many items are stored in the stash.
-  inline unsigned get_stash_count() const { return stash_count_; }
-
-  //! Returns the constants used by the stash.
-  inline uint2 get_stash_constants() const { return stash_constants_; }
-
-  //! Returns the hash table contents.
-  inline const Entry *get_contents() const { return d_contents_; }
-
-  //! Returns the number of hash functions being used.
-  inline unsigned get_num_hash_functions() const { return num_hash_functions_; }
-
-  //! When using two hash functions, returns the constants.
-  inline Functions<2> get_constants_2() const { return constants_2_; }
-
-  //! When using three hash functions, returns the constants.
-  inline Functions<3> get_constants_3() const { return constants_3_; }
-
-  //! When using four hash functions, returns the constants.
-  inline Functions<4> get_constants_4() const { return constants_4_; }
-
-  //! When using five hash functions, returns the constants.
-  inline Functions<5> get_constants_5() const { return constants_5_; }
-
-  /// @}
-  inline Entry *data() { return d_contents_; }
-  inline const Entry *data() const { return d_contents_; }
-
-protected:
-  unsigned table_size_;         //!< Size of the hash table.
-  unsigned num_hash_functions_; //!< Number of hash functions being used.
-  Entry *d_contents_; //!< Device memory: The hash table contents.  The stash is
-                      //!< stored at the end.
-  unsigned stash_count_;  //!< Number of key-value pairs currently stored.
-  uint2 stash_constants_; //!< Hash function constants for the stash.
-
-  Functions<2> constants_2_; //!< Constants for a set of two hash functions.
-  Functions<3> constants_3_; //!< Constants for a set of three hash functions.
-  Functions<4> constants_4_; //!< Constants for a set of four hash functions.
-  Functions<5> constants_5_; //!< Constants for a set of five hash functions.
-
-  unsigned *d_failures_; //!< Device memory: General use error flag.
-};
-
-/*! @name Internal
- *  @{
- */
-namespace CUDAWrapper {
-//! Fills a 64-bit array with a particular value.
-void ClearTable(const unsigned slots_in_table, const Entry fill_value,
-                Entry *d_array);
-
-//! Calls the Cuckoo Hash construction kernel.
-void CallCuckooHash(const unsigned n_entries, const unsigned num_hash_functions,
-                    const unsigned *d_keys, const unsigned *d_values,
-                    const unsigned table_size, const Functions<2> constants_2,
-                    const Functions<3> constants_3,
-                    const Functions<4> constants_4,
-                    const Functions<5> constants_5,
-                    const unsigned max_iteration_attempts, Entry *d_contents,
-                    uint2 stash_constants, unsigned *d_stash_count,
-                    unsigned *d_failures, unsigned *d_iterations_taken);
-
-//! Calls the kernel that performs retrievals.
-void CallHashRetrieve(const unsigned n_queries,
-                      const unsigned num_hash_functions,
-                      const unsigned *keys_in, const unsigned table_size,
-                      const Entry *table, const Functions<2> constants_2,
-                      const Functions<3> constants_3,
-                      const Functions<4> constants_4,
-                      const Functions<5> constants_5,
-                      const uint2 stash_constants, const unsigned stash_count,
-                      unsigned *values_out);
-}; // namespace CUDAWrapper
-/// @}
-
-}; // namespace cuhash
-
-/** @} */ // end hash table data structures
-/** @} */ // end cudpp_app
-
-#endif
-
-// Leave this at the end of the file
-// Local Variables:
-// mode:c++
-// c-file-style: "NVIDIA"
-// End:
--- a/include/paramsgrid.h
+++ b/include/paramsgrid.h
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// This file is used for c++ unit test, but pytorch jit ops don't support c++
-// debug build.
-
-#ifndef PARAMS_GRID_H_
-#define PARAMS_GRID_H_
-#include <tuple>
-#include <vector>
-
-namespace detail {
-template <class T> int getTotalSize(std::vector<T> arg) { return arg.size(); }
-
-template <class T, class... TArgs>
-int getTotalSize(std::vector<T> arg, std::vector<TArgs>... args) {
-  return arg.size() * getTotalSize(args...);
-}
-template <typename T> int getSize(std::vector<T> arg) { return arg.size(); }
-
-template <int Idx, class TT, class T>
-void assigner(TT &src, std::vector<int> counter, std::vector<T> &arg) {
-  std::get<Idx>(src) = arg[counter[Idx]];
-}
-
-template <int Idx, class TT, class T, class... TArgs>
-void assigner(TT &src, std::vector<int> counter, std::vector<T> &arg,
-              std::vector<TArgs> &... args) {
-  std::get<Idx>(src) = arg[counter[Idx]];
-  assigner<Idx + 1>(src, counter, args...);
-}
-} // namespace detail
-template <class... TArgs>
-std::vector<std::tuple<TArgs...>> paramsGrid(std::vector<TArgs>... args) {
-  int length = detail::getTotalSize(args...);
-  std::vector<int> sizes = {detail::getSize(args)...};
-  int size = sizes.size();
-
-  std::vector<std::tuple<TArgs...>> params(length);
-  std::vector<int> counter(size);
-  for (int i = 0; i < length; ++i) {
-    detail::assigner<0>(params[i], counter, args...);
-    counter[size - 1] += 1;
-    for (int c = size - 1; c >= 0; --c) {
-      if (counter[c] == sizes[c] && c > 0) {
-        counter[c - 1] += 1;
-        counter[c] = 0;
-      }
-    }
-  }
-  return params;
-}
-
-#endif
\ No newline at end of file
--- a/include/spconv/box_iou.h
+++ b/include/spconv/box_iou.h
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef BOX_IOU_H
-#define BOX_IOU_H
-
-#include <pybind11/pybind11.h>
-// must include pybind11/eigen.h if using eigen matrix as arguments.
-#include <algorithm>
-#include <boost/geometry.hpp>
-#include <pybind11/numpy.h>
-
-namespace spconv {
-// #include "voxelnet/core/cc/pybind11_helper.h"
-namespace py = pybind11;
-using namespace pybind11::literals;
-template <typename DType, typename ShapeContainer>
-inline py::array_t<DType> constant(ShapeContainer shape, DType value) {
-  // create ROWMAJOR array.
-  py::array_t<DType> array(shape);
-  std::fill(array.mutable_data(), array.mutable_data() + array.size(), value);
-  return array;
-}
-
-template <typename DType>
-inline py::array_t<DType> zeros(std::vector<long int> shape) {
-  return constant<DType, std::vector<long int>>(shape, 0);
-}
-
-template <typename DType>
-py::array_t<DType>
-rbbox_iou(py::array_t<DType> box_corners, py::array_t<DType> qbox_corners,
-          py::array_t<DType> standup_iou, DType standup_thresh) {
-  namespace bg = boost::geometry;
-  typedef bg::model::point<DType, 2, bg::cs::cartesian> point_t;
-  typedef bg::model::polygon<point_t> polygon_t;
-  polygon_t poly, qpoly;
-  std::vector<polygon_t> poly_inter, poly_union;
-  DType inter_area, union_area;
-  auto box_corners_r = box_corners.template unchecked<3>();
-  auto qbox_corners_r = qbox_corners.template unchecked<3>();
-  auto standup_iou_r = standup_iou.template unchecked<2>();
-  auto N = box_corners_r.shape(0);
-  auto K = qbox_corners_r.shape(0);
-  py::array_t<DType> overlaps = zeros<DType>({int(N), int(K)});
-  auto overlaps_rw = overlaps.template mutable_unchecked<2>();
-  if (N == 0 || K == 0) {
-    return overlaps;
-  }
-  for (int k = 0; k < K; ++k) {
-    for (int n = 0; n < N; ++n) {
-      if (standup_iou_r(n, k) <= standup_thresh)
-        continue;
-      bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1)));
-      bg::append(poly, point_t(box_corners_r(n, 1, 0), box_corners_r(n, 1, 1)));
-      bg::append(poly, point_t(box_corners_r(n, 2, 0), box_corners_r(n, 2, 1)));
-      bg::append(poly, point_t(box_corners_r(n, 3, 0), box_corners_r(n, 3, 1)));
-      bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1)));
-      bg::append(qpoly,
-                 point_t(qbox_corners_r(k, 0, 0), qbox_corners_r(k, 0, 1)));
-      bg::append(qpoly,
-                 point_t(qbox_corners_r(k, 1, 0), qbox_corners_r(k, 1, 1)));
-      bg::append(qpoly,
-                 point_t(qbox_corners_r(k, 2, 0), qbox_corners_r(k, 2, 1)));
-      bg::append(qpoly,
-                 point_t(qbox_corners_r(k, 3, 0), qbox_corners_r(k, 3, 1)));
-      bg::append(qpoly,
-                 point_t(qbox_corners_r(k, 0, 0), qbox_corners_r(k, 0, 1)));
-
-      bg::intersection(poly, qpoly, poly_inter);
-
-      if (!poly_inter.empty()) {
-        inter_area = bg::area(poly_inter.front());
-        bg::union_(poly, qpoly, poly_union);
-        if (!poly_union.empty()) {
-          union_area = bg::area(poly_union.front());
-          overlaps_rw(n, k) = inter_area / union_area;
-        }
-        poly_union.clear();
-      }
-      poly.clear();
-      qpoly.clear();
-      poly_inter.clear();
-    }
-  }
-  return overlaps;
-}
-
-template <typename DType>
-py::array_t<DType> rbbox_intersection(py::array_t<DType> box_corners,
-                                      py::array_t<DType> qbox_corners,
-                                      py::array_t<DType> standup_iou,
-                                      DType standup_thresh) {
-  namespace bg = boost::geometry;
-  typedef bg::model::point<DType, 2, bg::cs::cartesian> point_t;
-  typedef bg::model::polygon<point_t> polygon_t;
-  polygon_t poly, qpoly;
-  std::vector<polygon_t> poly_inter, poly_union;
-  DType inter_area, union_area;
-  auto box_corners_r = box_corners.template unchecked<3>();
-  auto qbox_corners_r = qbox_corners.template unchecked<3>();
-  auto standup_iou_r = standup_iou.template unchecked<2>();
-  auto N = box_corners_r.shape(0);
-  auto K = qbox_corners_r.shape(0);
-  py::array_t<DType> overlaps = zeros<DType>({int(N), int(K)});
-  auto overlaps_rw = overlaps.template mutable_unchecked<2>();
-  if (N == 0 || K == 0) {
-    return overlaps;
-  }
-  for (int k = 0; k < K; ++k) {
-    for (int n = 0; n < N; ++n) {
-      if (standup_iou_r(n, k) <= standup_thresh)
-        continue;
-      bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1)));
-      bg::append(poly, point_t(box_corners_r(n, 1, 0), box_corners_r(n, 1, 1)));
-      bg::append(poly, point_t(box_corners_r(n, 2, 0), box_corners_r(n, 2, 1)));
-      bg::append(poly, point_t(box_corners_r(n, 3, 0), box_corners_r(n, 3, 1)));
-      bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1)));
-      bg::append(qpoly,
-                 point_t(qbox_corners_r(k, 0, 0), qbox_corners_r(k, 0, 1)));
-      bg::append(qpoly,
-                 point_t(qbox_corners_r(k, 1, 0), qbox_corners_r(k, 1, 1)));
-      bg::append(qpoly,
-                 point_t(qbox_corners_r(k, 2, 0), qbox_corners_r(k, 2, 1)));
-      bg::append(qpoly,
-                 point_t(qbox_corners_r(k, 3, 0), qbox_corners_r(k, 3, 1)));
-      bg::append(qpoly,
-                 point_t(qbox_corners_r(k, 0, 0), qbox_corners_r(k, 0, 1)));
-
-      bg::intersection(poly, qpoly, poly_inter);
-
-      if (!poly_inter.empty()) {
-        inter_area = bg::area(poly_inter.front());
-        overlaps_rw(n, k) = inter_area;
-      }
-      poly.clear();
-      qpoly.clear();
-      poly_inter.clear();
-    }
-  }
-  return overlaps;
-}
-
-} // namespace spconv
-#endif
\ No newline at end of file
--- a/include/spconv/cublas_gemm.h
+++ b/include/spconv/cublas_gemm.h
-#pragma once
-#include <cublas_v2.h>
-#include <tensorview/tensorview.h>
-
-namespace spconv {
-
-template <class T>
-cublasStatus_t cublasTgemm(cublasHandle_t handle, cublasOperation_t transa,
-                           cublasOperation_t transb, int m, int n, int k,
-                           const T *alpha, const T *A, int lda, const T *B,
-                           int ldb, const T *beta, T *C, int ldc);
-
-template <class T>
-cublasStatus_t cublasTgemmRow(cublasHandle_t handle, cublasOperation_t transa,
-                              cublasOperation_t transb, int m, int n, int k,
-                              const T *alpha, const T *A, int lda, const T *B,
-                              int ldb, const T *beta, T *C, int ldc) {
-  return cublasTgemm<T>(handle, transb, transa, n, m, k, alpha, B, ldb, A, lda,
-                        beta, C, ldc);
-}
-
-template <class T> inline T constant_scalar(float data) { return T(data); }
-
-template <class T>
-cublasStatus_t gemm(cublasHandle_t handle, bool transa, bool transb,
-                    const tv::TensorView<T> A, const tv::TensorView<T> B,
-                    tv::TensorView<T> C) {
-  TV_ASSERT_RT_ERR(A.ndim() == 2, "error");
-  TV_ASSERT_RT_ERR(B.ndim() == 2, "error");
-  auto transa_cublas = transa ? CUBLAS_OP_T : CUBLAS_OP_N;
-  auto transb_cublas = transb ? CUBLAS_OP_T : CUBLAS_OP_N;
-  int m = transa ? A.dim(1) : A.dim(0);
-  int n = transb ? B.dim(0) : B.dim(1);
-  int ka = transa ? A.dim(0) : A.dim(1);
-  int kb = transb ? B.dim(1) : B.dim(0);
-  int lda = transa ? m : ka;
-  int ldb = transb ? ka : n;
-  int ldc = n;
-  TV_ASSERT_RT_ERR(ka == kb, "error");
-  T alpha = constant_scalar<T>(1);
-  T beta = constant_scalar<T>(0);
-  return cublasTgemmRow<T>(handle, transa_cublas, transb_cublas, m, n, ka,
-                           &alpha, A.data(), lda, B.data(), ldb, &beta,
-                           C.data(), ldc);
-}
-
-} // namespace spconv
--- a/include/spconv/fused_conv.cu.h
+++ b/include/spconv/fused_conv.cu.h
-
-/*
-BSD License
-
-For SparseConvNet software
-
-Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice, this
-   list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright notice,
-   this list of conditions and the following disclaimer in the documentation
-   and/or other materials provided with the distribution.
-
- * Neither the name Facebook nor the names of its contributors may be used to
-   endorse or promote products derived from this software without specific
-   prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#define TACC double
-
-template <typename T, int32_t K, int32_t V>
-__global__ void
-dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w,
-                            int32_t *rulesIn, int32_t *rulesOut, int32_t nHot,
-                            int32_t input_nPlanes, int32_t input_stride,
-                            int32_t output_nPlanes, int32_t output_stride) {
-  // nHot must be a multiple of K!!
-
-  // Input x Weight -> Output
-  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,nGroups) Volkov-blocks
-  // K is a multiple of V,
-
-  // nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
-
-  int32_t M = input_nPlanes / K;
-  // N = gridDim.y == output_nPlanes/K
-  int32_t n = blockIdx.y;
-  int32_t g = blockIdx.z;
-  inFeatures += g * input_nPlanes;
-  outFeatures += n * K + g * output_nPlanes;
-  w += n * K + g * input_nPlanes * output_nPlanes;
-
-  TACC O[V];
-  __shared__ T W[K][K];
-  __shared__ T I[K][K];
-  int32_t R0[V];
-  int32_t R1[V];
-  const int32_t tx = threadIdx.x;
-  int32_t ty[V];
-#pragma unroll
-  for (int32_t v = 0; v < V; v++)
-    ty[v] = threadIdx.y + v * (K / V);
-
-  for (int32_t m = 0; m < M; m++) {
-// Read w
-#pragma unroll
-    for (int32_t v = 0; v < V; v++)
-      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
-
-    for (int32_t s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
-#pragma unroll
-      for (int32_t v = 0; v < V; v++) {
-        R0[v] = rulesIn[s + ty[v]];
-        R1[v] = rulesOut[s + ty[v]];
-      }
-      __syncthreads();
-
-// Read input, reset O[]
-#pragma unroll
-      for (int32_t v = 0; v < V; v++) {
-        I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
-        O[v] = 0;
-      }
-      __syncthreads();
-
-#pragma unroll
-      for (int32_t k = 0; k < K; k++)
-#pragma unroll
-        for (int32_t v = 0; v < V; v++)
-          O[v] += I[ty[v]][k] * W[k][tx];
-
-#pragma unroll
-      for (int32_t v = 0; v < V; v++)
-        O[v] += outFeatures[R1[v] * output_stride + tx];
-#pragma unroll
-      for (int32_t v = 0; v < V; v++)
-        outFeatures[R1[v] * output_stride + tx] = O[v];
-      __syncthreads();
-    }
-    w += K * output_nPlanes;
-    inFeatures += K;
-  }
-}
-template <typename T, int32_t K, int32_t V>
-__global__ void
-dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w,
-                            int32_t *rulesIn, int32_t *rulesOut, int32_t nHot,
-                            int32_t input_nPlanes, int32_t input_stride,
-                            int32_t output_nPlanes, int32_t output_stride) {
-  // Input x Weight -> Output
-  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,nGroups) Volkov-blocks
-  // K is a multiple of V,
-
-  // nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
-
-  int32_t M = input_nPlanes / K;
-  // N = gridDim.y == output_nPlanes/K
-  int32_t n = blockIdx.y;
-  int32_t g = blockIdx.z;
-  inFeatures += g * input_nPlanes;
-  outFeatures += n * K + g * output_nPlanes;
-  w += n * K + g * input_nPlanes * output_nPlanes;
-
-  TACC O[V];
-  __shared__ T W[K][K];
-  __shared__ T I[K][K];
-  int32_t R0[V];
-  int32_t R1[V];
-  const int32_t tx = threadIdx.x;
-  int32_t ty[V];
-#pragma unroll
-  for (int32_t v = 0; v < V; v++)
-    ty[v] = threadIdx.y + v * (K / V);
-
-  for (int32_t m = 0; m < M; m++) {
-// Read w
-#pragma unroll
-    for (int32_t v = 0; v < V; v++)
-      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
-
-    for (int32_t s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
-#pragma unroll
-      for (int32_t v = 0; v < V; v++) {
-        if (s + ty[v] < nHot) {
-          R0[v] = rulesIn[s + ty[v]];
-          R1[v] = rulesOut[s + ty[v]];
-        }
-      }
-      __syncthreads();
-
-// Read input, reset O[]
-#pragma unroll
-      for (int32_t v = 0; v < V; v++) {
-        if (s + ty[v] < nHot)
-          I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
-        O[v] = 0;
-      }
-      __syncthreads();
-
-#pragma unroll
-      for (int32_t k = 0; k < K; k++)
-#pragma unroll
-        for (int32_t v = 0; v < V; v++)
-          O[v] += I[ty[v]][k] * W[k][tx];
-
-#pragma unroll
-      for (int32_t v = 0; v < V; v++)
-        if (s + ty[v] < nHot)
-          O[v] += outFeatures[R1[v] * output_stride + tx];
-#pragma unroll
-      for (int32_t v = 0; v < V; v++)
-        if (s + ty[v] < nHot)
-          outFeatures[R1[v] * output_stride + tx] = O[v];
-      __syncthreads();
-    }
-    w += K * output_nPlanes;
-    inFeatures += K;
-  }
-}
-
-#define FOO(T, K, V)                                                           \
-  {                                                                            \
-    if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {                  \
-      int32_t o = (nHot / K) * K;                                              \
-      if (o >= K)                                                              \
-        dConvolution_KMxKN_forwardA<T, K, V>                                   \
-            <<<dim3(std::min(o / K, (int32_t)512), output_nPlanes / K,         \
-                    nGroups),                                                  \
-               dim3(K, K / V), 0, s>>>(                                        \
-                inFeatures, outFeatures, w, rulesIn, rulesOut, o,              \
-                input_nPlanes, input_stride, output_nPlanes, output_stride);   \
-      if (nHot > o)                                                            \
-        dConvolution_KMxKN_forwardB<T, K, V>                                   \
-            <<<dim3(1, output_nPlanes / K, nGroups), dim3(K, K / V), 0, s>>>(  \
-                inFeatures, outFeatures, w, rulesIn + o, rulesOut + o,         \
-                nHot - o, input_nPlanes, input_stride, output_nPlanes,         \
-                output_stride);                                                \
-      return;                                                                  \
-    }                                                                          \
-  }
-template <typename T>
-void dConvolution_forward(cudaStream_t s, T *inFeatures, T *outFeatures, T *w,
-                          int32_t *rulesIn, int32_t *rulesOut, int32_t nHot,
-                          int32_t input_nPlanes, int32_t input_stride,
-                          int32_t output_nPlanes, int32_t output_stride,
-                          int32_t nGroups) {
-  FOO(T, 64, 16)
-  FOO(T, 32, 8)
-  FOO(T, 16, 4)
-  FOO(T, 8, 2)
-  assert(false);
-}
-template <>
-void dConvolution_forward<double>(cudaStream_t s, double *inFeatures,
-                                  double *outFeatures, double *w,
-                                  int32_t *rulesIn, int32_t *rulesOut,
-                                  int32_t nHot, int32_t input_nPlanes,
-                                  int32_t input_stride, int32_t output_nPlanes,
-                                  int32_t output_stride, int32_t nGroups) {
-  FOO(double, 32, 8)
-  FOO(double, 16, 4)
-  FOO(double, 8, 2)
-  assert(false);
-}
-#undef FOO
-// dOutput x W^T -> dInput and
-// Input^T x dOutput -> dW
-// blockDim=(K,K/V,1), gridDim=(nBlocks,M,nGroups)
-template <typename T, int32_t K, int32_t V>
-__global__ void dConvolution_KMxKN_backward_dW_A(
-    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw,
-    int32_t *rulesIn, int32_t *rulesOut, int32_t nHot, int32_t input_nPlanes,
-    int32_t input_stride, int32_t output_nPlanes, int32_t output_stride) {
-  // M = gridDim.y == input_nPlanes / K
-  int32_t N = output_nPlanes / K;
-  int32_t m = blockIdx.y;
-  int32_t g = blockIdx.z;
-  inFeatures += m * K + g * input_nPlanes;
-  dInFeatures += m * K + g * input_nPlanes;
-  dOutFeatures += g * output_nPlanes;
-  w += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes;
-  dw += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes;
-  TACC dI[V];
-  TACC dW[V];
-  __shared__ T I[K][K];
-  __shared__ T dO[K][K];
-  __shared__ T W[K][K];
-  int32_t R0[V];
-  int32_t R1[V];
-  const int32_t tx = threadIdx.x;
-  int32_t ty[V];
-#pragma unroll
-  for (int32_t v = 0; v < V; v++)
-    ty[v] = threadIdx.y + v * (K / V);
-  for (int32_t n = 0; n < N; n++) {
-// Read w, reset dW
-#pragma unroll
-    for (int32_t v = 0; v < V; v++) {
-      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
-      dW[v] = 0;
-    }
-    for (int32_t s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
-#pragma unroll
-      for (int32_t v = 0; v < V; v++) {
-        R0[v] = rulesIn[s + ty[v]];
-        R1[v] = rulesOut[s + ty[v]];
-        dI[v] = 0;
-      }
-      __syncthreads();
-// Read input and dOutput
-#pragma unroll
-      for (int32_t v = 0; v < V; v++) {
-        I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
-        dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
-      }
-      __syncthreads();
-#pragma unroll
-      for (int32_t k = 0; k < K; k++)
-#pragma unroll
-        for (int32_t v = 0; v < V; v++) {
-          dI[v] += dO[ty[v]][k] * W[tx][k];
-          dW[v] += I[k][ty[v]] * dO[k][tx];
-        }
-#pragma unroll
-      for (int32_t v = 0; v < V; v++)
-        dI[v] += dInFeatures[R0[v] * input_stride + tx];
-#pragma unroll
-      for (int32_t v = 0; v < V; v++)
-        dInFeatures[R0[v] * input_stride + tx] = dI[v];
-      __syncthreads();
-    }
-#pragma unroll
-    for (int32_t v = 0; v < V; v++)
-      atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
-    w += K;
-    dw += K;
-    dOutFeatures += K;
-  }
-}
-// dOutput x W^T -> dInput and
-// Input^T x dOutput -> dW
-// blockDim=(K,K/V,1), gridDim=(nBlocks,M,nGroups)
-template <typename T, int32_t K, int32_t V>
-__global__ void dConvolution_KMxKN_backward_dW_B(
-    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw,
-    int32_t *rulesIn, int32_t *rulesOut, int32_t nHot, int32_t input_nPlanes,
-    int32_t input_stride, int32_t output_nPlanes, int32_t output_stride) {
-  // M = gridDim.y == input_nPlanes / K
-  int32_t N = output_nPlanes / K;
-  int32_t m = blockIdx.y;
-  int32_t g = blockIdx.z;
-  inFeatures += m * K + g * input_nPlanes;
-  dInFeatures += m * K + g * input_nPlanes;
-  dOutFeatures += g * output_nPlanes;
-  w += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes;
-  dw += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes;
-  TACC dI[V];
-  TACC dW[V];
-  __shared__ T I[K][K];
-  __shared__ T dO[K][K];
-  __shared__ T W[K][K];
-  int32_t R0[V];
-  int32_t R1[V];
-  const int32_t tx = threadIdx.x;
-  int32_t ty[V];
-#pragma unroll
-  for (int32_t v = 0; v < V; v++)
-    ty[v] = threadIdx.y + v * (K / V);
-  for (int32_t n = 0; n < N; n++) {
-// Read w, reset dW
-#pragma unroll
-    for (int32_t v = 0; v < V; v++) {
-      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
-      dW[v] = 0;
-    }
-    for (int32_t s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
-#pragma unroll
-      for (int32_t v = 0; v < V; v++) {
-        if (s + ty[v] < nHot) {
-          R0[v] = rulesIn[s + ty[v]];
-          R1[v] = rulesOut[s + ty[v]];
-        }
-        dI[v] = 0;
-      }
-      __syncthreads();
-// Read input and dOutput
-#pragma unroll
-      for (int32_t v = 0; v < V; v++)
-        if (s + ty[v] < nHot) {
-          I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
-          dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
-        } else {
-          I[ty[v]][tx] = 0;
-          dO[ty[v]][tx] = 0;
-        }
-      __syncthreads();
-#pragma unroll
-      for (int32_t k = 0; k < K; k++)
-#pragma unroll
-        for (int32_t v = 0; v < V; v++) {
-          dI[v] += dO[ty[v]][k] * W[tx][k];
-          dW[v] += I[k][ty[v]] * dO[k][tx];
-        }
-#pragma unroll
-      for (int32_t v = 0; v < V; v++)
-        if (s + ty[v] < nHot)
-          dI[v] += dInFeatures[R0[v] * input_stride + tx];
-#pragma unroll
-      for (int32_t v = 0; v < V; v++)
-        if (s + ty[v] < nHot)
-          dInFeatures[R0[v] * input_stride + tx] = dI[v];
-      __syncthreads();
-    }
-#pragma unroll
-    for (int32_t v = 0; v < V; v++)
-      atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
-    w += K;
-    dw += K;
-    dOutFeatures += K;
-  }
-}
-#define FOO(T, K, V)                                                           \
-  {                                                                            \
-    if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {                  \
-      int32_t o = (nHot / K) * K;                                              \
-      if (o >= K)                                                              \
-        dConvolution_KMxKN_backward_dW_A<T, K, V>                              \
-            <<<dim3(std::min(o / K, (int32_t)512), input_nPlanes / K,          \
-                    nGroups),                                                  \
-               dim3(K, K / V), 0, s>>>(inFeatures, dInFeatures, dOutFeatures,  \
-                                       w, dw, rulesIn, rulesOut, o,            \
-                                       input_nPlanes, input_stride,            \
-                                       output_nPlanes, output_stride);         \
-      if (nHot > o)                                                            \
-        dConvolution_KMxKN_backward_dW_B<T, K, V>                              \
-            <<<dim3(1, input_nPlanes / K, nGroups), dim3(K, K / V), 0, s>>>(   \
-                inFeatures, dInFeatures, dOutFeatures, w, dw, rulesIn + o,     \
-                rulesOut + o, nHot - o, input_nPlanes, input_stride,           \
-                output_nPlanes, output_stride);                                \
-      return;                                                                  \
-    }                                                                          \
-  }
-template <typename T>
-void dConvolution_backward_dW(cudaStream_t s, T *inFeatures, T *dInFeatures,
-                              T *dOutFeatures, T *w, T *dw, int32_t *rulesIn,
-                              int32_t *rulesOut, int32_t nHot,
-                              int32_t input_nPlanes, int32_t input_stride,
-                              int32_t output_nPlanes, int32_t output_stride,
-                              int32_t nGroups) {
-  FOO(T, 32, 8)
-  FOO(T, 16, 4)
-  FOO(T, 8, 2)
-  assert(false);
-}
-#undef FOO
-template <typename T, int32_t K, int32_t V>
-__global__ void
-dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w,
-                            int32_t *rulesIn, int32_t *rulesOut, int32_t nHot,
-                            int32_t input_nPlanes, int32_t input_stride,
-                            int32_t output_nPlanes, int32_t output_stride) {
-  // Input x Weight -> Output
-  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,nGroups) Volkov-blocks
-  // K is a multiple of V,
-  // nHot x input_nplanes<=KM -> nHot x output_nPlanes<=KN
-  // - parallel over N,nHot - loop over M
-  int32_t M = (input_nPlanes + K - 1) / K;
-  // N = gridDim.y ~ output_nPlanes/K
-  int32_t n = blockIdx.y;
-  int32_t g = blockIdx.z;
-  inFeatures += g * input_nPlanes;
-  outFeatures += n * K + g * output_nPlanes;
-  w += n * K + g * input_nPlanes * output_nPlanes;
-  int32_t KO = min(K, output_nPlanes - K * n);
-  TACC O[V];
-  __shared__ T W[K][K];
-  __shared__ T I[K][K];
-  __shared__ int32_t R[K * 2];
-  const int32_t tx = threadIdx.x;
-  int32_t ty[V];
-#pragma unroll
-  for (int32_t v = 0; v < V; v++)
-    ty[v] = threadIdx.y + v * (K / V);
-  for (int32_t m = 0; m < M; m++) {
-    int32_t KI = min(K, input_nPlanes - K * m);
-// Read w
-#pragma unroll
-    for (int32_t v = 0; v < V; v++)
-      if (ty[v] < KI and tx < KO)
-        W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
-    for (int32_t s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
-// Read rules for K input/output pairs
-#pragma unroll
-      for (int32_t v = 0; v < V; v++) {
-        if (ty[v] < 1) {
-          if (s + tx < nHot) {
-            R[2 * tx] = rulesIn[s + tx];
-            R[2 * tx + 1] = rulesOut[s + tx];
-          }
-          // R[q] = rules[2 * s + q];
-        }
-      }
-      __syncthreads();
-// Read input, reset O[]
-#pragma unroll
-      for (int32_t v = 0; v < V; v++) {
-        if (tx < KI and s + ty[v] < nHot)
-          I[ty[v]][tx] = inFeatures[R[2 * ty[v]] * input_stride + tx];
-        O[v] = 0;
-      }
-      __syncthreads();
-#pragma unroll
-      for (int32_t k = 0; k < KI; k++)
-#pragma unroll
-        for (int32_t v = 0; v < V; v++)
-          O[v] += I[ty[v]][k] * W[k][tx];
-      __syncthreads();
-#pragma unroll
-      for (int32_t v = 0; v < V; v++)
-        if (tx < KO and s + ty[v] < nHot)
-          outFeatures[R[2 * ty[v] + 1] * output_stride + tx] += O[v];
-      __syncthreads();
-    }
-    w += K * output_nPlanes;
-    inFeatures += K;
-  }
-}
-// dOutput x W^T -> dInput and
-// Input^T x dOutput -> dW
-// blockDim=(K,K/V,1), gridDim=(nBlocks,M,nGroups)
-template <typename T, int32_t K, int32_t V>
-__global__ void dConvolution_KMxKN_backward_dW2(
-    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw,
-    int32_t *rulesIn, int32_t *rulesOut, int32_t nHot, int32_t input_nPlanes,
-    int32_t input_stride, int32_t output_nPlanes, int32_t output_stride) {
-  // M = gridDim.y == input_nPlanes / K
-  int32_t N = (output_nPlanes + K - 1) / K;
-  int32_t m = blockIdx.y;
-  int32_t g = blockIdx.z;
-  inFeatures += m * K + g * input_nPlanes;
-  dInFeatures += m * K + g * input_nPlanes;
-  dOutFeatures += g * output_nPlanes;
-  w += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes;
-  dw += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes;
-  int32_t KI = min(K, input_nPlanes - K * m);
-  TACC dI[V];
-  TACC dW[V];
-  __shared__ T I[K][K];
-  __shared__ T dO[K][K];
-  __shared__ T W[K][K];
-  __shared__ int32_t R[K * 2];
-  const int32_t tx = threadIdx.x;
-  int32_t ty[V];
-#pragma unroll
-  for (int32_t v = 0; v < V; v++)
-    ty[v] = threadIdx.y + v * (K / V);
-  for (int32_t n = 0; n < N; n++) {
-    int32_t KO = min(K, output_nPlanes - K * n);
-// Read w, reset dW
-#pragma unroll
-    for (int32_t v = 0; v < V; v++) {
-      if (ty[v] < KI and tx < KO)
-        W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
-      dW[v] = 0;
-    }
-    for (int32_t s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
-// Read rules for K input/output pairs, reset dI[]
-#pragma unroll
-      for (int32_t v = 0; v < V; v++) {
-        if (ty[v] < 1) {
-          if (s + tx < nHot) {
-            R[2 * tx] = rulesIn[s + tx];
-            R[2 * tx + 1] = rulesOut[s + tx];
-          }
-          // R[q] = rules[2 * s + q];
-        }
-        dI[v] = 0;
-      }
-      __syncthreads();
-// Read input and dOutput
-#pragma unroll
-      for (int32_t v = 0; v < V; v++) {
-        if (tx < KI and s + ty[v] < nHot)
-          I[ty[v]][tx] = inFeatures[R[2 * ty[v]] * input_stride + tx];
-        else
-          I[ty[v]][tx] = 0;
-        if (tx < KO and s + ty[v] < nHot)
-          dO[ty[v]][tx] = dOutFeatures[R[2 * ty[v] + 1] * output_stride + tx];
-        else
-          dO[ty[v]][tx] = 0;
-      }
-      __syncthreads();
-#pragma unroll
-      for (int32_t k = 0; k < KO; k++)
-#pragma unroll
-        for (int32_t v = 0; v < V; v++)
-          dI[v] += dO[ty[v]][k] * W[tx][k];
-#pragma unroll
-      for (int32_t k = 0; k < K; k++)
-#pragma unroll
-        for (int32_t v = 0; v < V; v++)
-          dW[v] += I[k][ty[v]] * dO[k][tx];
-      __syncthreads();
-#pragma unroll
-      for (int32_t v = 0; v < V; v++)
-        if (tx < KI and s + ty[v] < nHot)
-          dInFeatures[R[2 * ty[v]] * input_stride + tx] += dI[v];
-      __syncthreads();
-    }
-#pragma unroll
-    for (int32_t v = 0; v < V; v++)
-      if (ty[v] < KI and tx < KO)
-        atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
-    w += K;
-    dw += K;
-    dOutFeatures += K;
-  }
-}
-template <typename T>
-void dConvolution_forward2(cudaStream_t s, T *inFeatures, T *outFeatures, T *w,
-                           int32_t *rulesIn, int32_t *rulesOut, int32_t nHot,
-                           int32_t input_nPlanes, int32_t input_stride,
-                           int32_t output_nPlanes, int32_t output_stride,
-                           int32_t nGroups) {
-  int32_t c = input_nPlanes * output_nPlanes * nGroups;
-  if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
-    const int32_t K = 16;
-    const int32_t V = 4;
-    dConvolution_KMxKN_forward2<T, K, V>
-        <<<dim3(128, (output_nPlanes + K - 1) / K, nGroups), dim3(K, K / V), 0,
-           s>>>(inFeatures, outFeatures, w, rulesIn, rulesOut, nHot,
-                input_nPlanes, input_stride, output_nPlanes, output_stride);
-
-  } else {
-    dConvolution_forward(s, inFeatures, outFeatures, w, rulesIn, rulesOut, nHot,
-                         input_nPlanes, input_stride, output_nPlanes,
-                         output_stride, nGroups);
-  }
-}
-template <typename T>
-void dConvolution_backward_dW2(cudaStream_t s, T *inFeatures, T *dInFeatures,
-                               T *dOutFeatures, T *w, T *dw, int32_t *rulesIn,
-                               int32_t *rulesOut, int32_t nHot,
-                               int32_t input_nPlanes, int32_t input_stride,
-                               int32_t output_nPlanes, int32_t output_stride,
-                               int32_t nGroups) {
-  int32_t c = input_nPlanes * output_nPlanes * nGroups;
-  if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
-    const int32_t K = 16;
-    const int32_t V = 4;
-    dConvolution_KMxKN_backward_dW2<T, K, V>
-        <<<dim3(128, (input_nPlanes + K - 1) / K, nGroups), dim3(K, K / V), 0,
-           s>>>(inFeatures, dInFeatures, dOutFeatures, w, dw, rulesIn, rulesOut,
-                nHot, input_nPlanes, input_stride, output_nPlanes,
-                output_stride);
-  } else {
-    dConvolution_backward_dW(s, inFeatures, dInFeatures, dOutFeatures, w, dw,
-                             rulesIn, rulesOut, nHot, input_nPlanes,
-                             input_stride, output_nPlanes, output_stride,
-                             nGroups);
-  }
-}
-#undef TACC
\ No newline at end of file
--- a/include/spconv/fused_conv.h
+++ b/include/spconv/fused_conv.h
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-#include <cuda_runtime_api.h>
-#include <tensorview/tensor.h>
-#include <torch/script.h>
-namespace spconv {
-
-enum FusedConvAlgo { kFSparseConvNet, kFMinkowskiEngine };
-using all_fused_conv_algos_t =
-    tv::mp_list_c<int, kFSparseConvNet, kFMinkowskiEngine>;
-
-void fused_conv_cuda(torch::Tensor output, torch::Tensor features,
-                     torch::Tensor filters, torch::Tensor indicesIn,
-                     torch::Tensor indicesOut, int nHot);
-
-void fused_conv_backward_cuda(torch::Tensor features, torch::Tensor din,
-                              torch::Tensor dout, torch::Tensor filters,
-                              torch::Tensor dfilters, torch::Tensor indicesIn,
-                              torch::Tensor indicesOut, int nHot);
-
-void fused_conv_cuda_minkowski(torch::Tensor output, torch::Tensor features,
-                               torch::Tensor filters, torch::Tensor indicesIn,
-                               torch::Tensor indicesOut, int nHot);
-void fused_conv_backward_cuda_minkowski(torch::Tensor features,
-                                        torch::Tensor din, torch::Tensor dout,
-                                        torch::Tensor filters,
-                                        torch::Tensor dfilters,
-                                        torch::Tensor indicesIn,
-                                        torch::Tensor indicesOut, int nHot);
-
-template <int Algo> struct FusedConvDispatch;
-
-template <> struct FusedConvDispatch<kFSparseConvNet> {
-  constexpr static auto *fwd = fused_conv_cuda;
-  constexpr static auto *bwd = fused_conv_backward_cuda;
-};
-
-template <> struct FusedConvDispatch<kFMinkowskiEngine> {
-  constexpr static auto *fwd = fused_conv_cuda_minkowski;
-  constexpr static auto *bwd = fused_conv_backward_cuda_minkowski;
-};
-
-} // namespace spconv
--- a/include/spconv/fused_spconv_ops.h
+++ b/include/spconv/fused_spconv_ops.h
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef FUSED_SPARSE_CONV_OP_H_
-#define FUSED_SPARSE_CONV_OP_H_
-
-#include <spconv/indice.h>
-#include <spconv/reordering.h>
-#include <tensorview/torch_utils.h>
-#include <torch/script.h>
-#include <utility/timer.h>
-
-namespace spconv {
-// torch.jit's doc says only support int64, so we need to convert to int32.
-
-torch::Tensor
-fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor filters,
-                         torch::Tensor bias, torch::Tensor indicePairs,
-                         torch::Tensor indiceNum, int64_t numActOut,
-                         int64_t _inverse, int64_t _subM) {
-  bool subM = _subM != 0;
-  bool inverse = _inverse != 0;
-  auto device = features.device().type();
-  auto ndim = filters.dim() - 2;
-  auto kernelVolume = indicePairs.size(0);
-  auto numInPlanes = features.size(1);
-  auto numOutPlanes = filters.size(ndim + 1);
-  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
-  auto indicePairMaxSizeIter =
-      std::max_element(indicePairNumCpu.data_ptr<int>(),
-                       indicePairNumCpu.data_ptr<int>() + kernelVolume);
-  int indicePairMaxOffset =
-      indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
-  int indicePairMaxSize = *indicePairMaxSizeIter;
-
-  /*if (_subM){
-    std::vector<int> indicePairNumVec(indicePairNumCpu.data_ptr<int>(),
-  indicePairNumCpu.data_ptr<int>() + kernelVolume);
-    indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
-
-    auto indicePairVecMaxSizeIter = std::max_element(
-        indicePairNumVec.begin(), indicePairNumVec.end());
-    indicePairMaxSize = *indicePairVecMaxSizeIter;
-  }*/
-
-  auto options =
-      torch::TensorOptions().dtype(features.dtype()).device(features.device());
-  // auto indicePairOptions =
-  //     torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device());
-
-  torch::Tensor output =
-      torch::zeros({numActOut, numOutPlanes}, options).copy_(bias);
-  torch::Tensor inputBuffer =
-      torch::zeros({indicePairMaxSize, numInPlanes}, options);
-  torch::Tensor outputBuffer =
-      torch::zeros({indicePairMaxSize, numOutPlanes}, options);
-  filters = filters.view({-1, numInPlanes, numOutPlanes});
-  if (subM) { // the center index of subm conv don't need gather and scatter
-              // add.
-    torch::mm_out(output, features, filters[indicePairMaxOffset]);
-  }
-  double totalGatherTime = 0;
-  double totalGEMMTime = 0;
-  double totalSAddTime = 0;
-  for (int i = 0; i < kernelVolume; ++i) {
-    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
-    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
-      continue;
-    }
-    // auto timer = spconv::CudaContextTimer<>();
-    auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr(),
-                                             {nHot, numOutPlanes}, options);
-    auto inputBufferBlob =
-        torch::from_blob(inputBuffer.data_ptr(), {nHot, numInPlanes}, options);
-
-    if (device == torch::kCPU) {
-      sparse_gather_cpu(inputBuffer, features, indicePairs[i][inverse], nHot);
-    }
-#ifdef TV_CUDA
-    else if (device == torch::kCUDA) {
-      sparse_gather_cuda(inputBuffer, features, indicePairs[i][inverse], nHot);
-    }
-#endif
-    else {
-      TV_ASSERT_INVALID_ARG(false, "unknown device type");
-    }
-
-    // totalGatherTime += timer.report() / 1000.0;
-    torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
-    // totalGEMMTime += timer.report() / 1000.0;
-
-    if (device == torch::kCPU) {
-      sparse_scatter_add_cpu(outputBuffer, output, indicePairs[i][!inverse],
-                             nHot);
-    }
-#ifdef TV_CUDA
-    else if (device == torch::kCUDA) {
-      sparse_scatter_add_cuda(outputBuffer, output, indicePairs[i][!inverse],
-                              nHot);
-    }
-#endif
-    else {
-      TV_ASSERT_INVALID_ARG(false, "unknown device type");
-    }
-
-    // totalSAddTime += timer.report() / 1000.0;
-  }
-  // std::cout << "gather time " << totalGatherTime << std::endl;
-  // std::cout << "gemm time " << totalGEMMTime << std::endl;
-  // std::cout << "scatteradd time " << totalSAddTime << std::endl;
-  return output;
-}
-} // namespace spconv
-
-#endif
\ No newline at end of file
--- a/include/spconv/geometry.h
+++ b/include/spconv/geometry.h
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef SPCONV_GEOMETRY_H_
-#define SPCONV_GEOMETRY_H_
-
-#include <iostream>
-#include <limits>
-#include <tensorview/tensorview.h>
-#include <tsl/robin_map.h>
-#include <unordered_map>
-namespace spconv {
-
-namespace detail {
-
-template <typename T> struct ToUnsigned;
-
-template <> struct ToUnsigned<int> { using type = uint32_t; };
-
-template <> struct ToUnsigned<long> { using type = uint64_t; };
-
-template <typename T> struct FNVInternal;
-template <> struct FNVInternal<uint32_t> {
-  constexpr static uint32_t defaultOffsetBasis = 0x811C9DC5;
-  constexpr static uint32_t prime = 0x01000193;
-};
-
-template <> struct FNVInternal<uint64_t> {
-  constexpr static uint64_t defaultOffsetBasis = 0xcbf29ce484222325;
-  constexpr static uint64_t prime = 0x100000001b3;
-};
-
-} // namespace detail
-template <typename T>
-using to_unsigned_t = typename detail::ToUnsigned<std::remove_const_t<T>>::type;
-
-template <typename T> struct FNV1a : detail::FNVInternal<T> {
-  std::size_t operator()(const T *data, std::size_t size) {
-    to_unsigned_t<T> hash = detail::FNVInternal<T>::defaultOffsetBasis;
-    for (std::size_t i = 0; i < size; ++i) {
-      hash *= detail::FNVInternal<T>::prime;
-      hash ^= static_cast<to_unsigned_t<T>>(data[i]);
-    }
-    return hash;
-  }
-};
-
-template <typename Index, unsigned NDim>
-TV_HOST_DEVICE Index getValidOutPos(const Index *input_pos,
-                                    const Index *kernelSize,
-                                    const Index *stride, const Index *padding,
-                                    const Index *dilation,
-                                    const Index *outSpatialShape, Index *out) {
-  Index lowers[NDim];
-  Index uppers[NDim];
-  Index counter[NDim];
-  Index counterSize[NDim];
-  Index pointCounter = 0;
-  Index val;
-  Index numPoints = 1;
-  Index m, offset;
-  bool valid = false;
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    lowers[i] = (input_pos[i] - (kernelSize[i] - 1) * dilation[i] - 1 +
-                 stride[i] + padding[i]) /
-                stride[i];
-    uppers[i] = (input_pos[i] + padding[i]) / stride[i];
-  }
-
-#pragma unroll
-  for (unsigned i = 0; i < NDim; ++i) {
-    counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1);
-    numPoints *= counterSize[i];
-  }
-
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    counter[i] = 0;
-  }
-  for (int i = 0; i < numPoints; ++i) {
-    valid = true;
-    m = 1;
-    offset = 0;
-#pragma unroll
-    for (int j = NDim - 1; j >= 0; --j) {
-      val = uppers[j] - counter[j] * dilation[j];
-      out[pointCounter * (NDim + 1) + j] = val;
-      if (val < 0 || (val > outSpatialShape[j] - 1)) {
-        valid = false;
-        // break;
-      }
-      offset += m * (input_pos[j] - val * stride[j] + padding[j]) / dilation[j];
-      m *= kernelSize[j];
-    }
-
-    out[pointCounter * (NDim + 1) + NDim] = offset;
-    if (valid)
-      ++pointCounter;
-    counter[NDim - 1] += 1;
-#pragma unroll
-    for (int c = NDim - 1; c >= 0; --c) {
-      if (counter[c] == counterSize[c] && c > 0) {
-        counter[c - 1] += 1;
-        counter[c] = 0;
-      }
-    }
-  }
-  return pointCounter;
-}
-
-template <typename Index, unsigned NDim>
-TV_HOST_DEVICE Index getValidOutPosTranspose(
-    const Index *input_pos, const Index *kernelSize, const Index *stride,
-    const Index *padding, const Index *dilation, const Index *outSpatialShape,
-    Index *out) {
-  Index lowers[NDim];
-  Index uppers[NDim];
-  Index counter[NDim];
-  Index counterSize[NDim];
-  Index pointCounter = 0;
-  Index val;
-  Index numPoints = 1;
-  Index m, offset;
-  bool valid = false;
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    lowers[i] = input_pos[i] * stride[i] - padding[i];
-    uppers[i] = lowers[i] + (kernelSize[i] - 1) * dilation[i];
-  }
-#pragma unroll
-  for (unsigned i = 0; i < NDim; ++i) {
-    counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1);
-    numPoints *= counterSize[i];
-  }
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    counter[i] = 0;
-  }
-  for (int i = 0; i < numPoints; ++i) {
-    valid = true;
-    m = 1;
-    offset = 0;
-#pragma unroll
-    for (int j = NDim - 1; j >= 0; --j) {
-      val = uppers[j] - counter[j] * dilation[j];
-      out[pointCounter * (NDim + 1) + j] = val;
-      if (val < 0 || (val > outSpatialShape[j] - 1)) {
-        valid = false;
-        // break;
-      }
-      offset += m * (val - lowers[j]) / dilation[j];
-      m *= kernelSize[j];
-    }
-    out[pointCounter * (NDim + 1) + NDim] = offset;
-    if (valid)
-      ++pointCounter;
-    counter[NDim - 1] += 1;
-#pragma unroll
-    for (int c = NDim - 1; c >= 0; --c) {
-      if (counter[c] == counterSize[c] && c > 0) {
-        counter[c - 1] += 1;
-        counter[c] = 0;
-      }
-    }
-  }
-  return pointCounter;
-}
-
-} // namespace spconv
-
-#endif
\ No newline at end of file
--- a/include/spconv/indice.cu.h
+++ b/include/spconv/indice.cu.h
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef INDICE_CU_H_
-#define INDICE_CU_H_
-#include <cuhash/hash_table.cuh>
-#include <spconv/geometry.h>
-#include <tensorview/kernel_utils.h>
-#include <tensorview/tensorview.h>
-
-namespace spconv {
-
-template <bool UseDeconv, typename Index, unsigned NDim>
-struct ConvIndiceDispatch;
-
-template <typename Index, unsigned NDim>
-struct ConvIndiceDispatch<true, Index, NDim> {
-  constexpr static auto *func = getValidOutPosTranspose<Index, NDim>;
-};
-template <typename Index, unsigned NDim>
-struct ConvIndiceDispatch<false, Index, NDim> {
-  constexpr static auto *func = getValidOutPos<Index, NDim>;
-};
-
-template <typename Index, unsigned NDim, bool UseDeconv,
-          int KernelMaxVolume = 256, typename Index1D = int>
-__global__ void prepareIndicePairsKernel(
-    tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicePairs,
-    tv::TensorView<Index> indiceNum, tv::TensorView<Index1D> indicePairUnique,
-    const tv::SimpleVector<Index, NDim> kernelSize,
-    const tv::SimpleVector<Index, NDim> stride,
-    const tv::SimpleVector<Index, NDim> padding,
-    const tv::SimpleVector<Index, NDim> dilation,
-    const tv::SimpleVector<Index, NDim> outSpatialShape) {
-  auto numActIn = indicesIn.dim(0);
-  Index spatialVolume = 1;
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    spatialVolume *= outSpatialShape[i];
-  }
-  Index kernelVolume = 1;
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    kernelVolume *= kernelSize[i];
-  }
-  Index numValidPoints = 0;
-  Index validPoints[KernelMaxVolume * (NDim + 1)];
-  Index *pointPtr = nullptr;
-  auto indicePairsDim2 = indicePairs.dim(2);
-  Index index;
-  for (int ix : tv::KernelLoopX<int>(numActIn)) {
-    numValidPoints = ConvIndiceDispatch<UseDeconv, Index, NDim>::func(
-        indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
-        stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
-        validPoints);
-    for (Index i = 0; i < numValidPoints; ++i) {
-      pointPtr = validPoints + i * (NDim + 1);
-      auto offset = pointPtr[NDim];
-      Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
-      indicePairs(0, offset, oldNum) = ix;
-      index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
-                  pointPtr, outSpatialShape.data(), 0) +
-              spatialVolume * indicesIn(ix, 0);
-      indicePairs(1, offset, oldNum) = index;
-      indicePairUnique[offset * indicePairsDim2 + oldNum] = index;
-    }
-  }
-}
-
-template <typename Index, typename IndexGrid, unsigned NDim>
-__global__ void assignGridAndIndiceOutKernel(
-    tv::TensorView<Index> indicesOut, tv::TensorView<IndexGrid> gridsOut,
-    int numAct, tv::TensorView<Index> indicePairs,
-    tv::TensorView<Index> indicePairUnique,
-    const tv::SimpleVector<Index, NDim> outSpatialShape, int batchSize) {
-
-  Index index;
-  auto indicesOutPtr = indicesOut.data();
-  for (int ix : tv::KernelLoopX<int>(numAct)) {
-    index = indicePairUnique[ix];
-    gridsOut[index] = ix;
-    index = tv::rowArrayIdxInv<Index, NDim>(
-        index, indicesOutPtr + ix * (NDim + 1) + 1, outSpatialShape.data());
-    indicesOut[ix * (NDim + 1)] = index % batchSize;
-  }
-}
-
-template <typename Index, unsigned NDim, unsigned kNumHashFunctions = 4>
-__global__ void
-assignIndiceOutKernel(tv::TensorView<Index> indicesOut, int numAct,
-                      tv::TensorView<Index> indicePairUnique,
-                      const tv::SimpleVector<Index, NDim> outSpatialShape,
-                      int batchSize) {
-
-  Index index;
-  auto indicesOutPtr = indicesOut.data();
-  for (unsigned ix : tv::KernelLoopX<unsigned>(numAct)) {
-    index = indicePairUnique[ix];
-    index = tv::rowArrayIdxInv<Index, NDim>(
-        index, indicesOutPtr + ix * (NDim + 1) + 1, outSpatialShape.data());
-    indicesOut[ix * (NDim + 1)] = index % batchSize;
-  }
-}
-
-template <typename Index, unsigned NDim, unsigned kNumHashFunctions = 4>
-__global__ void
-assignIndicePairsHashKernel(tv::TensorView<Index> indicesOut, int numActIn,
-                            tv::TensorView<Index> indicePairs,
-                            tv::TensorView<Index> indicePairUnique,
-                            unsigned table_size, const cuhash::Entry *table,
-                            cuhash::Functions<kNumHashFunctions> constants,
-                            uint2 stash_constants, unsigned stash_count) {
-
-  Index index;
-  int kernelVolume = indicePairs.dim(1);
-  auto indicePairsOut = indicePairs.subview(1);
-  for (int ix : tv::KernelLoopX<int>(numActIn)) {
-    for (int i = 0; i < kernelVolume; ++i) {
-      index = indicePairsOut(i, ix);
-      if (index > -1) {
-        auto val = cuhash::retrieve((unsigned)(index), table_size, table,
-                                    constants, stash_constants, stash_count);
-        assert(val != cuhash::kNotFound);
-        indicePairsOut(i, ix) = (unsigned)val;
-      }
-    }
-  }
-}
-
-template <typename Index, typename IndexGrid, unsigned NDim>
-__global__ void
-assignIndicePairsKernel(tv::TensorView<Index> indicesOut,
-                        tv::TensorView<IndexGrid> gridsOut, int numActIn,
-                        tv::TensorView<Index> indicePairs,
-                        tv::TensorView<Index> indicePairUnique,
-                        const tv::SimpleVector<Index, NDim> outSpatialShape) {
-
-  Index index;
-  int kernelVolume = indicePairs.dim(1);
-  auto indicePairsOut = indicePairs.subview(1);
-
-  for (int ix : tv::KernelLoopX<int>(numActIn)) {
-    for (int i = 0; i < kernelVolume; ++i) {
-      index = indicePairsOut(i, ix);
-      if (index > -1) {
-        indicePairsOut(i, ix) = gridsOut[index];
-      }
-    }
-  }
-}
-
-template <typename Index, typename IndexGrid, unsigned NDim>
-__global__ void
-assignIndicePairsLimitedKernel(tv::TensorView<IndexGrid> gridsOut, int numActIn,
-                               tv::TensorView<Index> indicePairs,
-                               tv::TensorView<Index> indiceNum) {
-
-  Index index, val;
-  int kernelVolume = indicePairs.dim(0);
-  for (int ix : tv::KernelLoopX<int>(numActIn)) {
-    for (int i = 0; i < kernelVolume; ++i) {
-      index = indicePairs(i, 1, ix);
-      if (index != -1) {
-        val = gridsOut[index];
-        if (val != -1) {
-          auto oldNum = atomicAdd(indiceNum.data() + i, Index(1));
-          indicePairs(i, 0, oldNum) = indicePairs(i, 0, ix);
-          indicePairs(i, 1, oldNum) = val;
-        }
-      }
-    }
-  }
-}
-
-template <typename Index, typename IndexGrid, unsigned NDim>
-__global__ void prepareSubMGridKernel(
-    tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
-    const tv::SimpleVector<Index, NDim> outSpatialShape, Index spatialVolume) {
-  auto numActIn = indicesIn.dim(0);
-  Index index = 0;
-  for (int ix : tv::KernelLoopX<int>(numActIn)) {
-    index =
-        tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
-            indicesIn.data() + ix * (NDim + 1) + 1, outSpatialShape.data(), 0) +
-        spatialVolume * indicesIn(ix, 0);
-    gridsOut[index] = ix;
-  }
-}
-
-template <typename Index, unsigned NDim>
-__global__ void
-prepareSubMHashKernel(tv::TensorView<const Index> indicesIn, unsigned *keys,
-                      unsigned *values,
-                      const tv::SimpleVector<Index, NDim> outSpatialShape) {
-  auto numActIn = indicesIn.dim(0);
-  Index spatialVolume = 1;
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    spatialVolume *= outSpatialShape[i];
-  }
-  Index index = 0;
-  for (int ix : tv::KernelLoopX<int>(numActIn)) {
-    index = tv::rowArrayIdx<Index, NDim>(indicesIn.data() + ix * (NDim + 1) + 1,
-                                         outSpatialShape.data()) +
-            spatialVolume * indicesIn(ix, 0);
-    keys[ix] = index;
-    values[ix] = ix;
-  }
-}
-
-template <typename Index, typename IndexGrid, unsigned NDim,
-          int KernelMaxVolume = 256>
-__global__ void getSubMIndicePairsKernel(
-    tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
-    tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum,
-    const tv::SimpleVector<Index, NDim> kernelSize,
-    const tv::SimpleVector<Index, NDim> stride,
-    const tv::SimpleVector<Index, NDim> padding,
-    const tv::SimpleVector<Index, NDim> dilation,
-    const tv::SimpleVector<Index, NDim> outSpatialShape) {
-  auto numActIn = indicesIn.dim(0);
-  Index spatialVolume = 1;
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    spatialVolume *= outSpatialShape[i];
-  }
-  Index numValidPoints = 0;
-  Index validPoints[KernelMaxVolume * (NDim + 1)];
-  Index *pointPtr = nullptr;
-  Index index = 0;
-  for (int ix : tv::KernelLoopX<int>(numActIn)) {
-    numValidPoints = getValidOutPos<Index, NDim>(
-        indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
-        stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
-        validPoints);
-    for (int i = 0; i < numValidPoints; ++i) {
-      pointPtr = validPoints + i * (NDim + 1);
-      auto offset = pointPtr[NDim];
-      index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
-                  pointPtr, outSpatialShape.data(), 0) +
-              spatialVolume * indicesIn(ix, 0);
-      if (gridsOut[index] > -1) {
-        Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
-        indicePairs(1, offset, oldNum) = gridsOut[index];
-        indicePairs(0, offset, oldNum) = ix;
-      }
-    }
-  }
-}
-
-template <typename Index, typename IndexGrid, unsigned K0, unsigned K1,
-          unsigned K2>
-__global__ void getSubMIndicePairsUnrollKernel3(
-    tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
-    tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum,
-    const tv::SimpleVector<Index, 3> outSpatialShape, Index spatialVolume) {
-  auto numActIn = indicesIn.dim(0);
-
-  Index point[3];
-  Index index = 0;
-  Index offset;
-  constexpr unsigned KV = K0 * K1 * K2;
-  constexpr unsigned center = KV / 2;
-  *(indiceNum.data() + center) = numActIn;
-  for (int ix : tv::KernelLoopX<int>(numActIn)) {
-    const Index *indice_data = indicesIn.data() + ix * (3 + 1);
-#pragma unroll
-    for (int i = 0; i < K0; ++i) {
-#pragma unroll
-      for (int j = 0; j < K1; ++j) {
-#pragma unroll
-        for (int k = 0; k < K2; ++k) {
-          offset = i * K1 * K2 + j * K2 + k;
-          if (offset > center) {
-            continue;
-          }
-          if (center == offset) {
-            // center of subm indice pairs dont need atomicadd
-            indicePairs(1, offset, ix) = ix;
-            indicePairs(0, offset, ix) = ix;
-          } else {
-            point[2] = indice_data[3] - k + K2 / 2;
-            point[1] = indice_data[2] - j + K1 / 2;
-            point[0] = indice_data[1] - i + K0 / 2;
-            if (point[1] >= 0 && point[1] < outSpatialShape[1] &&
-                point[2] >= 0 && point[2] < outSpatialShape[2] &&
-                point[0] >= 0 && point[0] < outSpatialShape[0]) {
-              index = tv::ArrayIndexRowMajor<3, 3>::runPtrs(
-                          point, outSpatialShape.data(), 0) +
-                      spatialVolume * indice_data[0];
-              if (gridsOut[index] != -1) {
-                // for subm: indicePairs[0, i] = indicePairs[1, kernelVolume - i
-                // - 1]
-                Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
-                atomicAdd(indiceNum.data() + KV - offset - 1, Index(1));
-                indicePairs(1, offset, oldNum) = gridsOut[index];
-                indicePairs(0, offset, oldNum) = ix;
-                indicePairs(1, KV - offset - 1, oldNum) = ix;
-                indicePairs(0, KV - offset - 1, oldNum) = gridsOut[index];
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-template <typename Index, typename IndexGrid, unsigned K0, unsigned K1>
-__global__ void getSubMIndicePairsUnrollKernel2(
-    tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
-    tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum,
-    const tv::SimpleVector<Index, 2> outSpatialShape, Index spatialVolume) {
-  auto numActIn = indicesIn.dim(0);
-  Index point[2];
-  Index index = 0;
-  Index offset;
-  constexpr unsigned KV = K0 * K1;
-  constexpr unsigned center = KV / 2;
-  *(indiceNum.data() + center) = numActIn;
-
-  for (int ix : tv::KernelLoopX<int>(numActIn)) {
-    const Index *indice_data = indicesIn.data() + ix * (2 + 1);
-#pragma unroll
-    for (int i = 0; i < K0; ++i) {
-#pragma unroll
-      for (int j = 0; j < K1; ++j) {
-        offset = i * K1 + j;
-        if (offset > center) {
-          continue;
-        }
-        if (center == offset) {
-          // center of subm indice pairs dont need atomicadd
-          indicePairs(1, offset, ix) = ix;
-          indicePairs(0, offset, ix) = ix;
-        } else {
-          point[1] = indice_data[2] - j + K1 / 2;
-          point[0] = indice_data[1] - i + K0 / 2;
-          if (point[1] >= 0 && point[1] < outSpatialShape[1] && point[0] >= 0 &&
-              point[0] < outSpatialShape[0]) {
-            index = tv::ArrayIndexRowMajor<2, 2>::runPtrs(
-                        point, outSpatialShape.data(), 0) +
-                    spatialVolume * indice_data[0];
-            if (gridsOut[index] > -1) {
-              Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
-              atomicAdd(indiceNum.data() + KV - offset - 1, Index(1));
-              indicePairs(1, offset, oldNum) = gridsOut[index];
-              indicePairs(0, offset, oldNum) = ix;
-              indicePairs(1, KV - offset - 1, oldNum) = ix;
-              indicePairs(0, KV - offset - 1, oldNum) = gridsOut[index];
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-template <typename Index, unsigned NDim, int KernelMaxVolume = 256,
-          unsigned kNumHashFunctions = 4>
-__global__ void getSubMIndicePairsHashKernel(
-    tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicePairs,
-    tv::TensorView<Index> indiceNum,
-    const tv::SimpleVector<Index, NDim> kernelSize,
-    const tv::SimpleVector<Index, NDim> stride,
-    const tv::SimpleVector<Index, NDim> padding,
-    const tv::SimpleVector<Index, NDim> dilation,
-    const tv::SimpleVector<Index, NDim> outSpatialShape, unsigned table_size,
-    const cuhash::Entry *table, cuhash::Functions<kNumHashFunctions> constants,
-    uint2 stash_constants, unsigned stash_count) {
-  auto numActIn = indicesIn.dim(0);
-  Index spatialVolume = 1;
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    spatialVolume *= outSpatialShape[i];
-  }
-  Index numValidPoints = 0;
-  Index validPoints[KernelMaxVolume * (NDim + 1)];
-  Index *pointPtr = nullptr;
-  Index index = 0;
-  for (int ix : tv::KernelLoopX<int>(numActIn)) {
-    numValidPoints = getValidOutPos<Index, NDim>(
-        indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
-        stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
-        validPoints);
-    for (int i = 0; i < numValidPoints; ++i) {
-      pointPtr = validPoints + i * (NDim + 1);
-      auto offset = pointPtr[NDim];
-      index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
-                  pointPtr, outSpatialShape.data(), 0) +
-              spatialVolume * indicesIn(ix, 0);
-      auto val = cuhash::retrieve((unsigned)(index), table_size, table,
-                                  constants, stash_constants, stash_count);
-      if (val != cuhash::kNotFound) {
-        Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
-        indicePairs(1, offset, oldNum) = val;
-        indicePairs(0, offset, oldNum) = ix;
-      }
-    }
-  }
-}
-
-template <typename Index, unsigned K0, unsigned K1, unsigned K2,
-          unsigned kNumHashFunctions = 4>
-__global__ void getSubMIndicePairsHashUnrollKernel3(
-    tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicePairs,
-    tv::TensorView<Index> indiceNum,
-    const tv::SimpleVector<Index, 3> outSpatialShape, Index spatialVolume,
-    unsigned table_size, const cuhash::Entry *table,
-    cuhash::Functions<kNumHashFunctions> constants, uint2 stash_constants,
-    unsigned stash_count) {
-  auto numActIn = indicesIn.dim(0);
-  Index index = 0;
-  Index offset;
-  Index point[3];
-  constexpr unsigned KV = K0 * K1 * K2;
-  constexpr unsigned center = KV / 2;
-  *(indiceNum.data() + center) = numActIn;
-  for (int ix : tv::KernelLoopX<int>(numActIn)) {
-    const Index *indice_data = indicesIn.data() + ix * (3 + 1);
-#pragma unroll
-    for (int i = 0; i < K0; ++i) {
-#pragma unroll
-      for (int j = 0; j < K1; ++j) {
-#pragma unroll
-        for (int k = 0; k < K2; ++k) {
-          offset = i * K1 * K2 + j * K2 + k;
-          if (offset > center) {
-            continue;
-          }
-          if (center == offset) {
-            // center of subm indice pairs dont need atomicadd
-            indicePairs(1, offset, ix) = ix;
-            indicePairs(0, offset, ix) = ix;
-          } else {
-            point[2] = indice_data[3] - k + K2 / 2;
-            point[1] = indice_data[2] - j + K1 / 2;
-            point[0] = indice_data[1] - i + K0 / 2;
-            if (point[1] >= 0 && point[1] < outSpatialShape[1] &&
-                point[2] >= 0 && point[2] < outSpatialShape[2] &&
-                point[0] >= 0 && point[0] < outSpatialShape[0]) {
-              index = tv::ArrayIndexRowMajor<3, 3>::runPtrs(
-                          point, outSpatialShape.data(), 0) +
-                      spatialVolume * indice_data[0];
-              auto val =
-                  cuhash::retrieve((unsigned)(index), table_size, table,
-                                   constants, stash_constants, stash_count);
-
-              if (val != cuhash::kNotFound) {
-                // for subm: indicePairs[0, i] = indicePairs[1, kernelVolume - i
-                // - 1]
-                Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
-                atomicAdd(indiceNum.data() + KV - offset - 1, Index(1));
-                indicePairs(1, offset, oldNum) = val;
-                indicePairs(0, offset, oldNum) = ix;
-                indicePairs(1, KV - offset - 1, oldNum) = ix;
-                indicePairs(0, KV - offset - 1, oldNum) = val;
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-template <typename Index, unsigned K0, unsigned K1,
-          unsigned kNumHashFunctions = 4>
-__global__ void getSubMIndicePairsHashUnrollKernel2(
-    tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicePairs,
-    tv::TensorView<Index> indiceNum,
-    const tv::SimpleVector<Index, 2> outSpatialShape, Index spatialVolume,
-    unsigned table_size, const cuhash::Entry *table,
-    cuhash::Functions<kNumHashFunctions> constants, uint2 stash_constants,
-    unsigned stash_count) {
-  auto numActIn = indicesIn.dim(0);
-  Index index = 0;
-  Index offset;
-  Index point[2];
-  constexpr unsigned KV = K0 * K1;
-  constexpr unsigned center = KV / 2;
-  *(indiceNum.data() + center) = numActIn;
-  for (int ix : tv::KernelLoopX<int>(numActIn)) {
-    const Index *indice_data = indicesIn.data() + ix * (2 + 1);
-#pragma unroll
-    for (int i = 0; i < K0; ++i) {
-#pragma unroll
-      for (int j = 0; j < K1; ++j) {
-        offset = i * K1 + j;
-        if (offset > center) {
-          continue;
-        }
-        if (center == offset) {
-          // center of subm indice pairs dont need atomicadd
-          indicePairs(1, offset, ix) = ix;
-          indicePairs(0, offset, ix) = ix;
-        } else {
-          point[1] = indice_data[2] - j + K1 / 2;
-          point[0] = indice_data[1] - i + K0 / 2;
-          if (point[1] >= 0 && point[1] < outSpatialShape[1] && point[0] >= 0 &&
-              point[0] < outSpatialShape[0]) {
-            index = tv::ArrayIndexRowMajor<2, 2>::runPtrs(
-                        point, outSpatialShape.data(), 0) +
-                    spatialVolume * indice_data[0];
-            auto val =
-                cuhash::retrieve((unsigned)(index), table_size, table,
-                                 constants, stash_constants, stash_count);
-
-            if (val != cuhash::kNotFound) {
-              // for subm: indicePairs[0, i] = indicePairs[1, kernelVolume - i -
-              // 1]
-              Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
-              atomicAdd(indiceNum.data() + KV - offset - 1, Index(1));
-              indicePairs(1, offset, oldNum) = val;
-              indicePairs(0, offset, oldNum) = ix;
-              indicePairs(1, KV - offset - 1, oldNum) = ix;
-              indicePairs(0, KV - offset - 1, oldNum) = val;
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-template <typename Index, typename IndexGrid, unsigned NDim>
-__global__ void resetGridKernel(const Index *indicePairUnique,
-                                tv::TensorView<IndexGrid> gridsOut,
-                                int numAct) {
-  for (int ix : tv::KernelLoopX<int>(numAct)) {
-    gridsOut[indicePairUnique[ix]] = -1;
-  }
-}
-
-template <typename T> __global__ void arangeKernel(T *data, int size) {
-  for (int ix : tv::KernelLoopX<int>(size)) {
-    data[ix] = ix;
-  }
-}
-
-template <typename Index, typename IndexGrid, unsigned NDim>
-__global__ void
-resetGridSubMKernel(const Index *indices, tv::TensorView<IndexGrid> gridsOut,
-                    const tv::SimpleVector<Index, NDim> outSpatialShape,
-                    int numAct, Index spatialVolume) {
-  auto indsPtr = indices;
-  Index index;
-  for (int ix : tv::KernelLoopX<int>(numAct)) {
-    indsPtr = indices + ix * (NDim + 1);
-    index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
-        indsPtr + 1, outSpatialShape.data(), 0);
-    gridsOut[index + spatialVolume * indsPtr[0]] = -1;
-  }
-}
-
-} // namespace spconv
-
-#undef atomicAdd
-
-#endif
\ No newline at end of file
--- a/include/spconv/indice.h
+++ b/include/spconv/indice.h
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef SPARSE_CONV_INDICE_FUNCTOR_H_
-#define SPARSE_CONV_INDICE_FUNCTOR_H_
-#include <tensorview/tensorview.h>
-#include <torch/script.h>
-
-namespace spconv {
-int create_conv_indice_pair_p1_cuda(
-    torch::Tensor indicesIn, torch::Tensor indicePairs, torch::Tensor indiceNum,
-    torch::Tensor indicePairUnique, std::vector<int64_t> kernelSize,
-    std::vector<int64_t> stride, std::vector<int64_t> padding,
-    std::vector<int64_t> dilation, std::vector<int64_t> outSpatialShape,
-    bool transpose);
-
-int create_conv_indice_pair_p2_cuda(
-    torch::Tensor indicesIn, torch::Tensor indicesOut, torch::Tensor gridsOut,
-    torch::Tensor indicePairs, torch::Tensor indiceNum,
-    torch::Tensor indicePairUnique, std::vector<int64_t> outSpatialShape,
-    bool transpose, bool resetGrid, bool useHash);
-
-int create_submconv_indice_pair_cuda(
-    torch::Tensor indicesIn, torch::Tensor gridsOut, torch::Tensor indicePairs,
-    torch::Tensor indiceNum, std::vector<int64_t> kernelSize,
-    std::vector<int64_t> stride, std::vector<int64_t> padding,
-    std::vector<int64_t> dilation, std::vector<int64_t> outSpatialShape,
-    bool transpose, bool resetGrid, bool useHash);
-
-int create_conv_indice_pair_cpu(
-    torch::Tensor indicesIn, torch::Tensor indicesOut, torch::Tensor gridsOut,
-    torch::Tensor indicePairs, torch::Tensor indiceNum,
-    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
-    std::vector<int64_t> padding, std::vector<int64_t> dilation,
-    std::vector<int64_t> outSpatialShape, bool transpose, bool resetGrid,
-    bool useHash);
-
-int create_submconv_indice_pair_cpu(
-    torch::Tensor indicesIn, torch::Tensor gridsOut, torch::Tensor indicePairs,
-    torch::Tensor indiceNum, std::vector<int64_t> kernelSize,
-    std::vector<int64_t> stride, std::vector<int64_t> padding,
-    std::vector<int64_t> dilation, std::vector<int64_t> outSpatialShape,
-    bool transpose, bool resetGrid, bool useHash);
-
-} // namespace spconv
-
-#endif
\ No newline at end of file
--- a/include/spconv/maxpool.h
+++ b/include/spconv/maxpool.h
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef SPARSE_MAXPOOL_FUNCTOR_H_
-#define SPARSE_MAXPOOL_FUNCTOR_H_
-#include <tensorview/mp_helper.h>
-#include <tensorview/tensor.h>
-#include <tensorview/torch_utils.h>
-#include <torch/script.h>
-
-namespace spconv {
-
-void maxpool_bwd_cpu(torch::Tensor outFeatures, torch::Tensor inFeatures,
-                     torch::Tensor dout, torch::Tensor din,
-                     torch::Tensor indicesIn, torch::Tensor indicesOut,
-                     int size);
-
-void maxpool_fwd_cpu(torch::Tensor outFeatures, torch::Tensor inFeatures,
-                     torch::Tensor indicesIn, torch::Tensor indicesOut,
-                     int size);
-
-void maxpool_bwd_cuda(torch::Tensor outFeatures, torch::Tensor inFeatures,
-                      torch::Tensor dout, torch::Tensor din,
-                      torch::Tensor indicesIn, torch::Tensor indicesOut,
-                      int size);
-
-void maxpool_fwd_cuda(torch::Tensor outFeatures, torch::Tensor inFeatures,
-                      torch::Tensor indicesIn, torch::Tensor indicesOut,
-                      int size);
-
-} // namespace spconv
-
-#endif
\ No newline at end of file
--- a/include/spconv/minkowski.cu.h
+++ b/include/spconv/minkowski.cu.h
-/* Copyright (c) Chris Choy (chrischoy@ai.stanford.edu).
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * Please cite "4D Spatio-Temporal ConvNets: Minkowski Convolutional Neural
- * Networks", CVPR'19 (https://arxiv.org/abs/1904.08755) if you use any part
- * of the code.
- */
-
-template <typename Dtype, typename Itype, int BLOCK_SIZE>
-__global__ void matmul(const Dtype *A, const int wA, const int hA,
-                       const Dtype *B, const int wB, const int hB, Dtype *C,
-                       const Itype *in_map, const Itype *out_map) {
-  // Use in_feat as A and kernel as B
-
-  // Block index
-  const int bx = blockIdx.x;
-  const int by = blockIdx.y;
-
-  // Thread index
-  const int tx = threadIdx.x;
-  const int ty = threadIdx.y;
-
-  // Coordinate. x is for rows, y is for columns.
-  const int x = BLOCK_SIZE * bx + tx;
-  const int y = BLOCK_SIZE * by + ty;
-
-  // Csub is used to store the element of the block sub-matrix
-  // that is computed by the thread
-  Dtype Csub = 0;
-
-  const Itype in_row = y < hA ? in_map[y] : 0;
-  const Itype out_row = y < hA ? out_map[y] : 0;
-
-  // Loop over all the sub-matrices of A and B
-  // required to compute the block sub-matrix
-  for (int s = 0; s < wA; s += BLOCK_SIZE) {
-    // Declaration of the shared memory array As used to
-    // store the sub-matrix of A
-    __shared__ Dtype As[BLOCK_SIZE][BLOCK_SIZE];
-
-    // Declaration of the shared memory array Bs used to
-    // store the sub-matrix of B
-    __shared__ Dtype Bs[BLOCK_SIZE][BLOCK_SIZE];
-
-    // Load the matrices from device memory
-    // to shared memory; each thread loads
-    // one element of each matrix
-    As[ty][tx] = ((s + tx) < wA && y < hA) ? A[wA * in_row + s + tx] : 0;
-    Bs[ty][tx] = ((s + ty) < hB && x < wB) ? B[wB * (s + ty) + x] : 0;
-
-    // Synchronize to make sure the matrices are loaded
-    __syncthreads();
-
-    // Multiply the two matrices together;
-    // each thread computes one element
-    // of the block sub-matrix
-#pragma unroll
-    for (int k = 0; k < BLOCK_SIZE; ++k) {
-      Csub += As[ty][k] * Bs[k][tx];
-    }
-
-    // Synchronize to make sure that the preceding
-    // computation is done before loading two new
-    // sub-matrices of A and B in the next iteration
-    __syncthreads();
-  }
-
-  // Write the block sub-matrix to device memory;
-  // each thread writes one element
-  if (y < hA && x < wB)
-    atomicAdd(&C[wB * out_row + x], Csub);
-  // C[wB * out_row + x] += Csub;
-}
-
-template <typename Dtype, typename Itype, int BLOCK_SIZE>
-__global__ void matmul2(const Dtype *A, const int wA, const int hA,
-                        const Dtype *B, const int wB, const int hB,
-                        const Dtype *D, const int wD, const int hD, Dtype *C,
-                        Dtype *E, const Itype *in_map, const Itype *out_map) {
-  // Use grad_out_feat as A, transposed kernel weight as B, and in_feat as D
-
-  // Block index
-  const int bx = blockIdx.x;
-  const int by = blockIdx.y;
-
-  // Thread index
-  const int tx = threadIdx.x;
-  const int ty = threadIdx.y;
-
-  // Coordinate. y is for rows, x is for columns.
-  const int x = BLOCK_SIZE * bx + tx;
-  const int y = BLOCK_SIZE * by + ty;
-
-  const Itype in_row = y < hA ? in_map[y] : 0;
-  const Itype out_row = y < hA ? out_map[y] : 0;
-
-  // Csub is used to store the element of the block sub-matrix
-  // that is computed by the thread
-  Dtype Csub = 0;
-  Dtype Esub = 0;
-
-  // Declaration of the shared memory array As used to
-  // store the sub-matrix of A
-  __shared__ Dtype As[BLOCK_SIZE][BLOCK_SIZE];
-
-  // Declaration of the shared memory array Bs used to
-  // store the sub-matrix of B
-  __shared__ Dtype BTs[BLOCK_SIZE][BLOCK_SIZE];
-
-  // Declaration of the shared memory array Ds used to
-  // store the sub-matrix of D
-  __shared__ Dtype DTs[BLOCK_SIZE][BLOCK_SIZE];
-
-  // For Ds = D^T[...:..., ...:...], use the transposed grid dimension for A
-  DTs[ty][tx] = (x < wD && y < hD) ? D[wD * in_row + x] : 0;
-
-  // Loop over all the sub-matrices of A and B
-  // required to compute the block sub-matrix
-  for (int s = 0; s < wA; s += BLOCK_SIZE) {
-    // Load the matrices from device memory
-    // to shared memory; each thread loads
-    // one element of each matrix
-    As[ty][tx] = ((s + tx) < wA && y < hA) ? A[wA * out_row + s + tx] : 0;
-
-    // Transposed kernel
-    BTs[ty][tx] = ((s + ty) < wB && x < hB) ? B[wB * x + s + ty] : 0;
-
-    // Synchronize to make sure the matrices are loaded
-    __syncthreads();
-
-    // Multiply the two matrices together;
-    // each thread computes one element
-    // of the block sub-matrix
-#pragma unroll
-    for (int k = 0; k < BLOCK_SIZE; ++k) {
-      Csub += As[ty][k] * BTs[k][tx];
-    }
-
-    // For Esub, reset to 0
-    Esub = 0;
-#pragma unroll
-    for (int k = 0; k < BLOCK_SIZE; ++k) {
-      Esub += DTs[k][ty] * As[k][tx];
-    }
-
-    // Synchronize to make sure that the preceding
-    // computation is done before loading two new
-    // sub-matrices of A and B in the next iteration
-    __syncthreads();
-
-    // For the E matrix which requires accmulation of multiple blocks, use
-    // atomic addition. This can be replaced with a more sophisticaed reduction
-    // algorithm.
-    if ((bx * BLOCK_SIZE + ty) < wD && (s + tx) < wA)
-      atomicAdd(&E[wA * (bx * BLOCK_SIZE + ty) + (s + tx)], Esub);
-  }
-
-  // Write the block sub-matrix to device memory;
-  // each thread writes one element
-  if (y < hA && x < hB)
-    atomicAdd(&C[hB * in_row + x], Csub);
-}
--- a/include/spconv/nms.h
+++ b/include/spconv/nms.h
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef NMS_CPU_H
-#define NMS_CPU_H
-#include <pybind11/pybind11.h>
-// must include pybind11/stl.h if using containers in STL in arguments.
-#include "box_iou.h"
-#include "nms_gpu.h"
-#include <algorithm>
-#include <boost/geometry.hpp>
-#include <pybind11/numpy.h>
-#include <pybind11/stl.h>
-#include <vector>
-namespace spconv {
-namespace py = pybind11;
-using namespace pybind11::literals;
-
-template <typename DType>
-std::vector<int> non_max_suppression_cpu(py::array_t<DType> boxes,
-                                         py::array_t<int> order, DType thresh,
-                                         DType eps = 0) {
-  auto ndets = boxes.shape(0);
-  auto boxes_r = boxes.template unchecked<2>();
-  auto order_r = order.template unchecked<1>();
-  auto suppressed = zeros<int>({int(ndets)});
-  auto suppressed_rw = suppressed.template mutable_unchecked<1>();
-  auto area = zeros<DType>({int(ndets)});
-  auto area_rw = area.template mutable_unchecked<1>();
-  // get areas
-  for (int i = 0; i < ndets; ++i) {
-    area_rw(i) = (boxes_r(i, 2) - boxes_r(i, 0) + eps) *
-                 (boxes_r(i, 3) - boxes_r(i, 1) + eps);
-  }
-  std::vector<int> keep;
-  int i, j;
-  DType xx1, xx2, w, h, inter, ovr;
-  for (int _i = 0; _i < ndets; ++_i) {
-    i = order_r(_i);
-    if (suppressed_rw(i) == 1)
-      continue;
-    keep.push_back(i);
-    for (int _j = _i + 1; _j < ndets; ++_j) {
-      j = order_r(_j);
-      if (suppressed_rw(j) == 1)
-        continue;
-      xx2 = std::min(boxes_r(i, 2), boxes_r(j, 2));
-      xx1 = std::max(boxes_r(i, 0), boxes_r(j, 0));
-      w = xx2 - xx1 + eps;
-      if (w > 0) {
-        xx2 = std::min(boxes_r(i, 3), boxes_r(j, 3));
-        xx1 = std::max(boxes_r(i, 1), boxes_r(j, 1));
-        h = xx2 - xx1 + eps;
-        if (h > 0) {
-          inter = w * h;
-          ovr = inter / (area_rw(i) + area_rw(j) - inter);
-          if (ovr >= thresh)
-            suppressed_rw(j) = 1;
-        }
-      }
-    }
-  }
-  return keep;
-}
-
-template <typename DType>
-std::vector<int> rotate_non_max_suppression_cpu(py::array_t<DType> box_corners,
-                                                py::array_t<int> order,
-                                                py::array_t<DType> standup_iou,
-                                                DType thresh) {
-  auto ndets = box_corners.shape(0);
-  auto box_corners_r = box_corners.template unchecked<3>();
-  auto order_r = order.template unchecked<1>();
-  auto suppressed = zeros<int>({int(ndets)});
-  auto suppressed_rw = suppressed.template mutable_unchecked<1>();
-  auto standup_iou_r = standup_iou.template unchecked<2>();
-  std::vector<int> keep;
-  int i, j;
-
-  namespace bg = boost::geometry;
-  typedef bg::model::point<DType, 2, bg::cs::cartesian> point_t;
-  typedef bg::model::polygon<point_t> polygon_t;
-  polygon_t poly, qpoly;
-  std::vector<polygon_t> poly_inter, poly_union;
-  DType inter_area, union_area, overlap;
-
-  for (int _i = 0; _i < ndets; ++_i) {
-    i = order_r(_i);
-    if (suppressed_rw(i) == 1)
-      continue;
-    keep.push_back(i);
-    for (int _j = _i + 1; _j < ndets; ++_j) {
-      j = order_r(_j);
-      if (suppressed_rw(j) == 1)
-        continue;
-      if (standup_iou_r(i, j) <= 0.0)
-        continue;
-      // std::cout << "pre_poly" << std::endl;
-      try {
-        bg::append(poly,
-                   point_t(box_corners_r(i, 0, 0), box_corners_r(i, 0, 1)));
-        bg::append(poly,
-                   point_t(box_corners_r(i, 1, 0), box_corners_r(i, 1, 1)));
-        bg::append(poly,
-                   point_t(box_corners_r(i, 2, 0), box_corners_r(i, 2, 1)));
-        bg::append(poly,
-                   point_t(box_corners_r(i, 3, 0), box_corners_r(i, 3, 1)));
-        bg::append(poly,
-                   point_t(box_corners_r(i, 0, 0), box_corners_r(i, 0, 1)));
-        bg::append(qpoly,
-                   point_t(box_corners_r(j, 0, 0), box_corners_r(j, 0, 1)));
-        bg::append(qpoly,
-                   point_t(box_corners_r(j, 1, 0), box_corners_r(j, 1, 1)));
-        bg::append(qpoly,
-                   point_t(box_corners_r(j, 2, 0), box_corners_r(j, 2, 1)));
-        bg::append(qpoly,
-                   point_t(box_corners_r(j, 3, 0), box_corners_r(j, 3, 1)));
-        bg::append(qpoly,
-                   point_t(box_corners_r(j, 0, 0), box_corners_r(j, 0, 1)));
-        bg::intersection(poly, qpoly, poly_inter);
-      } catch (const std::exception &e) {
-        std::cout << "box i corners:" << std::endl;
-        for (int k = 0; k < 4; ++k) {
-          std::cout << box_corners_r(i, k, 0) << " " << box_corners_r(i, k, 1)
-                    << std::endl;
-        }
-        std::cout << "box j corners:" << std::endl;
-        for (int k = 0; k < 4; ++k) {
-          std::cout << box_corners_r(j, k, 0) << " " << box_corners_r(j, k, 1)
-                    << std::endl;
-        }
-        // throw e;
-        continue;
-      }
-      // std::cout << "post_poly" << std::endl;
-      // std::cout << "post_intsec" << std::endl;
-      if (!poly_inter.empty()) {
-        inter_area = bg::area(poly_inter.front());
-        // std::cout << "pre_union" << " " << inter_area << std::endl;
-        bg::union_(poly, qpoly, poly_union);
-        /*
-        if (poly_union.empty()){
-            std::cout << "intsec area:" << " " << inter_area << std::endl;
-            std::cout << "box i corners:" << std::endl;
-            for(int k = 0; k < 4; ++k){
-                std::cout << box_corners_r(i, k, 0) << " " << box_corners_r(i,
-        k, 1) << std::endl;
-            }
-            std::cout << "box j corners:" <<  std::endl;
-            for(int k = 0; k < 4; ++k){
-                std::cout << box_corners_r(j, k, 0) << " " << box_corners_r(j,
-        k, 1) << std::endl;
-            }
-        }*/
-        // std::cout << "post_union" << poly_union.empty() << std::endl;
-        if (!poly_union.empty()) { // ignore invalid box
-          union_area = bg::area(poly_union.front());
-          // std::cout << "post union area" << std::endl;
-          // std::cout << union_area << "debug" << std::endl;
-          overlap = inter_area / union_area;
-          if (overlap >= thresh)
-            suppressed_rw(j) = 1;
-          poly_union.clear();
-        }
-      }
-      poly.clear();
-      qpoly.clear();
-      poly_inter.clear();
-    }
-  }
-  return keep;
-}
-#ifdef TV_CUDA
-constexpr int const threadsPerBlock = sizeof(unsigned long long) * 8;
-
-template <typename DType>
-int non_max_suppression(py::array_t<DType> boxes, py::array_t<int> keep_out,
-                        DType nms_overlap_thresh, int device_id) {
-  py::buffer_info info = boxes.request();
-  auto boxes_ptr = static_cast<DType *>(info.ptr);
-  py::buffer_info info_k = keep_out.request();
-  auto keep_out_ptr = static_cast<int *>(info_k.ptr);
-
-  return _nms_gpu<DType, threadsPerBlock>(keep_out_ptr, boxes_ptr,
-                                          boxes.shape(0), boxes.shape(1),
-                                          nms_overlap_thresh, device_id);
-}
-#endif
-
-} // namespace spconv
-#endif
--- a/include/spconv/nms_functor.h
+++ b/include/spconv/nms_functor.h
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef NMS_FUNCTOR_H_
-#define NMS_FUNCTOR_H_
-#include <tensorview/tensorview.h>
-
-namespace spconv {
-namespace functor {
-template <typename Device, typename T, typename Index>
-struct NonMaxSupressionFunctor {
-  Index operator()(const Device &d, tv::TensorView<Index> keep,
-                   tv::TensorView<const T> boxes, T threshold, T eps);
-};
-
-template <typename Device, typename T, typename Index>
-struct rotateNonMaxSupressionFunctor {
-  Index operator()(const Device &d, tv::TensorView<Index> keep,
-                   tv::TensorView<const T> boxCorners,
-                   tv::TensorView<const T> standupIoU, T threshold);
-};
-
-} // namespace functor
-} // namespace spconv
-
-#endif
\ No newline at end of file
--- a/include/spconv/nms_gpu.h
+++ b/include/spconv/nms_gpu.h
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-template <typename DType, int BLOCK_THREADS>
-int _nms_gpu(int *keep_out, const DType *boxes_host, int boxes_num,
-             int boxes_dim, DType nms_overlap_thresh, int device_id);
--- a/include/spconv/nms_ops.h
+++ b/include/spconv/nms_ops.h
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef NMS_TORCH_OP_H_
-#define NMS_TORCH_OP_H_
-
-#include <spconv/indice.h>
-#include <spconv/nms_functor.h>
-#include <spconv/reordering.h>
-#include <tensorview/torch_utils.h>
-#include <torch/script.h>
-#include <utility/timer.h>
-
-namespace spconv {
-// torch.jit's doc says only support int64, so we need to convert to int32.
-template <typename T>
-torch::Tensor nonMaxSuppression(torch::Tensor boxes, torch::Tensor scores,
-                                int64_t preMaxSize, int64_t postMaxSize,
-                                double thresh, double eps) {
-  // auto timer = spconv::CudaContextTimer<>();
-  tv::check_torch_dtype<T>(boxes);
-  auto resOptions =
-      torch::TensorOptions().dtype(torch::kInt64).device(boxes.device());
-  if (boxes.size(0) == 0) {
-    return torch::zeros({0}, resOptions);
-  }
-  torch::Tensor indices;
-  if (preMaxSize > 0) {
-    auto numKeepedScores = scores.size(0);
-    preMaxSize = std::min(numKeepedScores, preMaxSize);
-    auto res = torch::topk(scores, preMaxSize);
-    indices = std::get<1>(res);
-    boxes = torch::index_select(boxes, 0, indices);
-  } else {
-    indices = std::get<1>(torch::sort(scores));
-    boxes = torch::index_select(boxes, 0, indices);
-  }
-  if (boxes.size(0) == 0)
-    return torch::zeros({0}, resOptions);
-
-  auto keep = torch::zeros({boxes.size(0)}, resOptions);
-  int64_t keepNum = 0;
-  if (boxes.device().type() == torch::kCPU) {
-    auto nmsFunctor = functor::NonMaxSupressionFunctor<tv::CPU, T, int64_t>();
-    keepNum = nmsFunctor(tv::CPU(), tv::torch2tv<int64_t>(keep),
-                         tv::torch2tv<const T>(boxes), T(thresh), T(eps));
-  } else {
-    TV_ASSERT_RT_ERR(false, "not implemented");
-  }
-  if (postMaxSize <= 0) {
-    postMaxSize = keepNum;
-  }
-  // std::cout << keep << std::endl;
-  keep = keep.slice(0, 0, std::min(keepNum, postMaxSize));
-  if (preMaxSize > 0) {
-    return torch::index_select(indices, 0, keep);
-  }
-  return keep;
-}
-
-} // namespace spconv
-
-#endif
\ No newline at end of file
--- a/include/spconv/pillar_scatter_functor.h
+++ b/include/spconv/pillar_scatter_functor.h
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef POINTPILLARS_SCATTER_FUNCTOR_H_
-#define POINTPILLARS_SCATTER_FUNCTOR_H_
-#include <tensorview/tensorview.h>
-
-namespace spconv {
-namespace functor {
-template <typename Device, typename T, typename Index>
-struct PointPillarScatter {
-  void operator()(const Device &d, tv::TensorView<T> canvas,
-                  tv::TensorView<const T> features,
-                  tv::TensorView<const T> coors);
-};
-
-} // namespace functor
-} // namespace spconv
-
-#endif
\ No newline at end of file
--- a/include/spconv/pillar_scatter_ops.h
+++ b/include/spconv/pillar_scatter_ops.h
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef PILLAR_SCATTER_OP_H_
-#define PILLAR_SCATTER_OP_H_
-
-#include <spconv/pillar_scatter_functor.h>
-#include <tensorview/torch_utils.h>
-#include <torch/script.h>
-#include <utility/timer.h>
-
-namespace spconv {
-// torch.jit's doc says only support int64, so we need to convert to int32.
-
-template <typename T>
-torch::Tensor pointPillarScatter(torch::Tensor features, torch::Tensor coors,
-                                 torch::Tensor shape) {
-  TV_ASSERT_RT_ERR(shape.device().type() == torch::kCPU, "error");
-  TV_ASSERT_RT_ERR(features.device().type() == torch::kCUDA, "error");
-  TV_ASSERT_RT_ERR(shape.dim() == 1, "error");
-  TV_ASSERT_RT_ERR(shape.size(0) == 4, "error");
-  TV_ASSERT_RT_ERR(features.dim() >= 3, "error");
-  TV_ASSERT_RT_ERR(features.size(0) == 1, "feature first dim must be 1");
-  TV_ASSERT_RT_ERR(coors.size(0) == 1, "coors first dim must be 1");
-  TV_ASSERT_RT_ERR(features.size(2) == coors.size(2), "err");
-
-  tv::check_torch_dtype<int>(shape);
-  tv::check_torch_dtype<T>(coors);
-  auto shapeData = shape.data_ptr<int>();
-  torch::Tensor canvas =
-      torch::zeros({shapeData[0], shapeData[1], shapeData[2], shapeData[3]},
-                   features.options());
-  TV_ASSERT_RT_ERR(shapeData[1] == features.size(1), "error");
-#ifdef TV_CUDA
-  functor::PointPillarScatter<tv::GPU, T, int> ftor;
-  ftor(tv::TorchGPU(), tv::torch2tv<T>(canvas),
-       tv::torch2tv<const T>(features.squeeze()),
-       tv::torch2tv<const T>(coors.squeeze()));
-#endif
-  return canvas;
-}
-
-} // namespace spconv
-
-#endif
\ No newline at end of file
--- a/include/spconv/point2voxel.cu.h
+++ b/include/spconv/point2voxel.cu.h
-#pragma once
-
-#include <tensorview/kernel_utils.h>
-#include <tensorview/tensorview.h>
-#include <torch/script.h>
-
-namespace spconv {
-template <typename Index, unsigned NDim>
-__global__ void scatterPointToGridKernel(
-    tv::TensorView<const float> points, tv::TensorView<const Index> indexes,
-    tv::TensorView<float> grids, tv::TensorView<Index> numPointsPerGrid,
-    tv::TensorView<Index> pointIndex,
-    const tv::SimpleVector<Index, NDim> gridShape) {
-  Index index;
-  int numPoints = points.dim(0);
-  int numFeatures = points.dim(1);
-
-  for (int ix : tv::KernelLoopX<int>(numPoints)) {
-    index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
-        indexes.data() + ix * NDim, gridShape.data(), 0);
-    pointIndex(ix) = index;
-    atomicAdd(numPointsPerGrid.data() + index, Index(1));
-#pragma unroll
-    for (int k = 0; k != numFeatures; ++k) {
-      atomicAdd(grids.data() + index * numFeatures + k,
-                *(points.data() + ix * numFeatures + k));
-    }
-  }
-}
-
-template <typename Index, unsigned NDim>
-__global__ void
-gatherPointFromGridKernel(tv::TensorView<const float> grids,
-                          tv::TensorView<const Index> numPointsPerGrid,
-                          tv::TensorView<const Index> pointIndexUnique,
-                          tv::TensorView<float> voxels,
-                          tv::TensorView<Index> coors,
-                          const tv::SimpleVector<Index, NDim> gridShape) {
-  Index index;
-  int numVoxels = voxels.dim(0);
-  int numFeatures = grids.dim(1);
-
-  for (int ix : tv::KernelLoopX<int>(numVoxels)) {
-    index = pointIndexUnique(ix);
-#pragma unroll
-    for (int k = 0; k != numFeatures; ++k) {
-      voxels(ix, k) = grids(index, k) / numPointsPerGrid(index);
-    }
-    index = tv::rowArrayIdxInv<Index, NDim>(index, coors.data() + ix * NDim,
-                                            gridShape.data());
-  }
-}
-
-template <typename Index>
-__global__ void resetGridKernel(tv::TensorView<float> grids,
-                                tv::TensorView<Index> numPointsPerGrid,
-                                tv::TensorView<Index> pointIndexUnique) {
-  Index index;
-  int numVoxels = pointIndexUnique.dim(0) - 1;
-  int numFeatures = grids.dim(1);
-
-  for (int ix : tv::KernelLoopX<int>(numVoxels)) {
-    index = pointIndexUnique(ix);
-#pragma unroll
-    for (int k = 0; k != numFeatures; ++k) {
-      grids(index, k) = 0;
-      numPointsPerGrid(index) = 0;
-    }
-  }
-}
-
-template <typename Index>
-__global__ void resetPointIndexKernel(tv::TensorView<Index> pointIndex,
-                                      const Index gridVolume) {
-  int num_max_points = pointIndex.dim(0) - 1;
-
-  for (int ix : tv::KernelLoopX<int>(num_max_points)) {
-    pointIndex(ix) = gridVolume;
-  }
-}
-} // namespace spconv
--- a/include/spconv/point2voxel.h
+++ b/include/spconv/point2voxel.h
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <pybind11/pybind11.h>
-// must include pybind11/eigen.h if using eigen matrix as arguments.
-// must include pybind11/stl.h if using containers in STL in arguments.
-#include <algorithm>
-#include <pybind11/numpy.h>
-#include <pybind11/stl.h>
-// #include <vector>
-#include <iostream>
-#include <math.h>
-
-namespace spconv {
-namespace py = pybind11;
-using namespace pybind11::literals;
-
-template <typename DType, int NDim>
-int points_to_voxel_3d_np(py::array_t<DType> points, py::array_t<DType> voxels,
-                          py::array_t<DType> voxel_point_mask,
-                          py::array_t<int> coors,
-                          py::array_t<int> num_points_per_voxel,
-                          py::array_t<int> coor_to_voxelidx,
-                          std::vector<DType> voxel_size,
-                          std::vector<DType> coors_range, int max_points,
-                          int max_voxels) {
-  auto points_rw = points.template mutable_unchecked<2>();
-  auto voxels_rw = voxels.template mutable_unchecked<3>();
-  auto voxel_point_mask_rw = voxel_point_mask.template mutable_unchecked<2>();
-  auto coors_rw = coors.mutable_unchecked<2>();
-  auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
-  auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
-  auto N = points_rw.shape(0);
-  auto num_features = points_rw.shape(1);
-  // auto ndim = points_rw.shape(1) - 1;
-  constexpr int ndim_minus_1 = NDim - 1;
-  int voxel_num = 0;
-  bool failed = false;
-  int coor[NDim];
-  int c;
-  int grid_size[NDim];
-  for (int i = 0; i < NDim; ++i) {
-    grid_size[i] =
-        round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
-  }
-  int voxelidx, num;
-  for (int i = 0; i < N; ++i) {
-    failed = false;
-    for (int j = 0; j < NDim; ++j) {
-      c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
-      if ((c < 0 || c >= grid_size[j])) {
-        failed = true;
-        break;
-      }
-      coor[ndim_minus_1 - j] = c;
-    }
-    if (failed)
-      continue;
-    voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
-    if (voxelidx == -1) {
-      voxelidx = voxel_num;
-      if (voxel_num >= max_voxels)
-        continue;
-      voxel_num += 1;
-      coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
-      for (int k = 0; k < NDim; ++k) {
-        coors_rw(voxelidx, k) = coor[k];
-      }
-    }
-    num = num_points_per_voxel_rw(voxelidx);
-    if (num < max_points) {
-      voxel_point_mask_rw(voxelidx, num) = DType(1);
-      for (int k = 0; k < num_features; ++k) {
-        voxels_rw(voxelidx, num, k) = points_rw(i, k);
-      }
-      num_points_per_voxel_rw(voxelidx) += 1;
-    }
-  }
-  for (int i = 0; i < voxel_num; ++i) {
-    coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1;
-  }
-  return voxel_num;
-}
-
-template <typename DType, int NDim>
-int points_to_voxel_3d_np_mean(
-    py::array_t<DType> points, py::array_t<DType> voxel_point_mask,
-    py::array_t<DType> voxels, py::array_t<DType> means, py::array_t<int> coors,
-    py::array_t<int> num_points_per_voxel, py::array_t<int> coor_to_voxelidx,
-    std::vector<DType> voxel_size, std::vector<DType> coors_range,
-    int max_points, int max_voxels) {
-  auto points_rw = points.template mutable_unchecked<2>();
-  auto means_rw = means.template mutable_unchecked<2>();
-  auto voxels_rw = voxels.template mutable_unchecked<3>();
-  auto voxel_point_mask_rw = voxel_point_mask.template mutable_unchecked<2>();
-  auto coors_rw = coors.mutable_unchecked<2>();
-  auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
-  auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
-  auto N = points_rw.shape(0);
-  auto num_features = points_rw.shape(1);
-  // auto ndim = points_rw.shape(1) - 1;
-  constexpr int ndim_minus_1 = NDim - 1;
-  int voxel_num = 0;
-  bool failed = false;
-  int coor[NDim];
-  int c;
-  int grid_size[NDim];
-  for (int i = 0; i < NDim; ++i) {
-    grid_size[i] =
-        round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
-  }
-  int voxelidx, num;
-  for (int i = 0; i < N; ++i) {
-    failed = false;
-    for (int j = 0; j < NDim; ++j) {
-      c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
-      if ((c < 0 || c >= grid_size[j])) {
-        failed = true;
-        break;
-      }
-      coor[ndim_minus_1 - j] = c;
-    }
-    if (failed)
-      continue;
-    voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
-    if (voxelidx == -1) {
-      voxelidx = voxel_num;
-      if (voxel_num >= max_voxels)
-        continue;
-      voxel_num += 1;
-      coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
-      for (int k = 0; k < NDim; ++k) {
-        coors_rw(voxelidx, k) = coor[k];
-      }
-    }
-    num = num_points_per_voxel_rw(voxelidx);
-    if (num < max_points) {
-      voxel_point_mask_rw(voxelidx, num) = DType(1);
-      for (int k = 0; k < num_features; ++k) {
-        voxels_rw(voxelidx, num, k) = points_rw(i, k);
-      }
-      num_points_per_voxel_rw(voxelidx) += 1;
-      for (int k = 0; k < num_features; ++k) {
-        means_rw(voxelidx, k) +=
-            (points_rw(i, k) - means_rw(voxelidx, k)) / DType(num + 1);
-      }
-    }
-  }
-  for (int i = 0; i < voxel_num; ++i) {
-    coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1;
-    num = num_points_per_voxel_rw(i);
-    for (int j = num; j < max_points; ++j) {
-      for (int k = 0; k < num_features; ++k) {
-        voxels_rw(i, j, k) = means_rw(i, k);
-      }
-    }
-  }
-  return voxel_num;
-}
-
-template <typename DType, int NDim>
-int points_to_voxel_3d_with_filtering(
-    py::array_t<DType> points, py::array_t<DType> voxels,
-    py::array_t<DType> voxel_point_mask, py::array_t<int> voxel_mask,
-    py::array_t<DType> mins, py::array_t<DType> maxs, py::array_t<int> coors,
-    py::array_t<int> num_points_per_voxel, py::array_t<int> coor_to_voxelidx,
-    std::vector<DType> voxel_size, std::vector<DType> coors_range,
-    int max_points, int max_voxels, int block_factor, int block_size,
-    DType height_threshold, DType height_high_threshold) {
-  auto points_rw = points.template mutable_unchecked<2>();
-  auto mins_rw = mins.template mutable_unchecked<2>();
-  auto maxs_rw = maxs.template mutable_unchecked<2>();
-  auto voxels_rw = voxels.template mutable_unchecked<3>();
-  auto voxel_point_mask_rw = voxel_point_mask.template mutable_unchecked<2>();
-  auto voxel_mask_rw = voxel_mask.template mutable_unchecked<1>();
-  auto coors_rw = coors.mutable_unchecked<2>();
-  auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
-  auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
-  auto N = points_rw.shape(0);
-  auto num_features = points_rw.shape(1);
-  // auto ndim = points_rw.shape(1) - 1;
-  constexpr int ndim_minus_1 = NDim - 1;
-  int voxel_num = 0;
-  bool failed = false;
-  int coor[NDim];
-  int c;
-  int grid_size[NDim];
-
-  DType max_value, min_value;
-  for (int i = 0; i < NDim; ++i) {
-    grid_size[i] =
-        round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
-  }
-  int block_shape_H = grid_size[1] / block_factor;
-  int block_shape_W = grid_size[0] / block_factor;
-  int voxelidx, num;
-  int block_coor[2];
-  int startx, stopx, starty, stopy;
-  for (int i = 0; i < N; ++i) {
-    failed = false;
-    for (int j = 0; j < NDim; ++j) {
-      c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
-      if ((c < 0 || c >= grid_size[j])) {
-        failed = true;
-        break;
-      }
-      coor[ndim_minus_1 - j] = c;
-    }
-    if (failed)
-      continue;
-    voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
-    if (voxelidx == -1) {
-      voxelidx = voxel_num;
-      if (voxel_num >= max_voxels)
-        continue;
-      voxel_num += 1;
-      coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
-      for (int k = 0; k < NDim; ++k) {
-        coors_rw(voxelidx, k) = coor[k];
-      }
-    }
-    num = num_points_per_voxel_rw(voxelidx);
-    if (num < max_points) {
-      voxel_point_mask_rw(voxelidx, num) = DType(1);
-      for (int k = 0; k < num_features; ++k) {
-        voxels_rw(voxelidx, num, k) = points_rw(i, k);
-      }
-      block_coor[0] = coor[1] / block_factor;
-      block_coor[1] = coor[2] / block_factor;
-      mins_rw(block_coor[0], block_coor[1]) =
-          std::min(points_rw(i, 2), mins_rw(block_coor[0], block_coor[1]));
-      maxs_rw(block_coor[0], block_coor[1]) =
-          std::max(points_rw(i, 2), maxs_rw(block_coor[0], block_coor[1]));
-      num_points_per_voxel_rw(voxelidx) += 1;
-    }
-  }
-  for (int i = 0; i < voxel_num; ++i) {
-    coor[1] = coors_rw(i, 1);
-    coor[2] = coors_rw(i, 2);
-    coor_to_voxelidx_rw(coors_rw(i, 0), coor[1], coor[2]) = -1;
-    block_coor[0] = coor[1] / block_factor;
-    block_coor[1] = coor[2] / block_factor;
-    min_value = mins_rw(block_coor[0], block_coor[1]);
-    max_value = maxs_rw(block_coor[0], block_coor[1]);
-    startx = std::max(0, block_coor[0] - block_size / 2);
-    stopx =
-        std::min(block_shape_H, block_coor[0] + block_size - block_size / 2);
-    starty = std::max(0, block_coor[1] - block_size / 2);
-    stopy =
-        std::min(block_shape_W, block_coor[1] + block_size - block_size / 2);
-
-    for (int j = startx; j < stopx; ++j) {
-      for (int k = starty; k < stopy; ++k) {
-        min_value = std::min(min_value, mins_rw(j, k));
-        max_value = std::max(max_value, maxs_rw(j, k));
-      }
-    }
-    voxel_mask_rw(i) = ((max_value - min_value) > height_threshold) &&
-                       ((max_value - min_value) < height_high_threshold);
-  }
-  return voxel_num;
-}
-
-} // namespace spconv