Merge branch 'develop'

a6abf55d · yan.yan · fad30002 · 79a3eaf2 · fad30002 · fad30002
Commit a6abf55d authored Oct 20, 2021 by yan.yan
20 changed files
--- a/include/paramsgrid.h
+++ b/include/paramsgrid.h
-// Copyright 2019 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// This file is used for c++ unit test, but pytorch jit ops don't support c++
-// debug build.
-
-#ifndef PARAMS_GRID_H_
-#define PARAMS_GRID_H_
-#include <tuple>
-#include <vector>
-
-namespace detail {
-template <class T> int getTotalSize(std::vector<T> arg) { return arg.size(); }
-
-template <class T, class... TArgs>
-int getTotalSize(std::vector<T> arg, std::vector<TArgs>... args) {
-  return arg.size() * getTotalSize(args...);
-}
-template <typename T> int getSize(std::vector<T> arg) { return arg.size(); }
-
-template <int Idx, class TT, class T>
-void assigner(TT &src, std::vector<int> counter, std::vector<T> &arg) {
-  std::get<Idx>(src) = arg[counter[Idx]];
-}
-
-template <int Idx, class TT, class T, class... TArgs>
-void assigner(TT &src, std::vector<int> counter, std::vector<T> &arg,
-              std::vector<TArgs> &... args) {
-  std::get<Idx>(src) = arg[counter[Idx]];
-  assigner<Idx + 1>(src, counter, args...);
-}
-} // namespace detail
-template <class... TArgs>
-std::vector<std::tuple<TArgs...>> paramsGrid(std::vector<TArgs>... args) {
-  int length = detail::getTotalSize(args...);
-  std::vector<int> sizes = {detail::getSize(args)...};
-  int size = sizes.size();
-
-  std::vector<std::tuple<TArgs...>> params(length);
-  std::vector<int> counter(size);
-  for (int i = 0; i < length; ++i) {
-    detail::assigner<0>(params[i], counter, args...);
-    counter[size - 1] += 1;
-    for (int c = size - 1; c >= 0; --c) {
-      if (counter[c] == sizes[c] && c > 0) {
-        counter[c - 1] += 1;
-        counter[c] = 0;
-      }
-    }
-  }
-  return params;
-}
-
-#endif
\ No newline at end of file
--- a/include/spconv/box_iou.h
+++ b/include/spconv/box_iou.h
-// Copyright 2019 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef BOX_IOU_H
-#define BOX_IOU_H
-
-#include <pybind11/pybind11.h>
-// must include pybind11/eigen.h if using eigen matrix as arguments.
-#include <algorithm>
-#include <boost/geometry.hpp>
-#include <pybind11/numpy.h>
-
-namespace spconv {
-// #include "voxelnet/core/cc/pybind11_helper.h"
-namespace py = pybind11;
-using namespace pybind11::literals;
-template <typename DType, typename ShapeContainer>
-inline py::array_t<DType> constant(ShapeContainer shape, DType value) {
-  // create ROWMAJOR array.
-  py::array_t<DType> array(shape);
-  std::fill(array.mutable_data(), array.mutable_data() + array.size(), value);
-  return array;
-}
-
-template <typename DType>
-inline py::array_t<DType> zeros(std::vector<long int> shape) {
-  return constant<DType, std::vector<long int>>(shape, 0);
-}
-
-template <typename DType>
-py::array_t<DType>
-rbbox_iou(py::array_t<DType> box_corners, py::array_t<DType> qbox_corners,
-          py::array_t<DType> standup_iou, DType standup_thresh) {
-  namespace bg = boost::geometry;
-  typedef bg::model::point<DType, 2, bg::cs::cartesian> point_t;
-  typedef bg::model::polygon<point_t> polygon_t;
-  polygon_t poly, qpoly;
-  std::vector<polygon_t> poly_inter, poly_union;
-  DType inter_area, union_area;
-  auto box_corners_r = box_corners.template unchecked<3>();
-  auto qbox_corners_r = qbox_corners.template unchecked<3>();
-  auto standup_iou_r = standup_iou.template unchecked<2>();
-  auto N = box_corners_r.shape(0);
-  auto K = qbox_corners_r.shape(0);
-  py::array_t<DType> overlaps = zeros<DType>({int(N), int(K)});
-  auto overlaps_rw = overlaps.template mutable_unchecked<2>();
-  if (N == 0 || K == 0) {
-    return overlaps;
-  }
-  for (int k = 0; k < K; ++k) {
-    for (int n = 0; n < N; ++n) {
-      if (standup_iou_r(n, k) <= standup_thresh)
-        continue;
-      bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1)));
-      bg::append(poly, point_t(box_corners_r(n, 1, 0), box_corners_r(n, 1, 1)));
-      bg::append(poly, point_t(box_corners_r(n, 2, 0), box_corners_r(n, 2, 1)));
-      bg::append(poly, point_t(box_corners_r(n, 3, 0), box_corners_r(n, 3, 1)));
-      bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1)));
-      bg::append(qpoly,
-                 point_t(qbox_corners_r(k, 0, 0), qbox_corners_r(k, 0, 1)));
-      bg::append(qpoly,
-                 point_t(qbox_corners_r(k, 1, 0), qbox_corners_r(k, 1, 1)));
-      bg::append(qpoly,
-                 point_t(qbox_corners_r(k, 2, 0), qbox_corners_r(k, 2, 1)));
-      bg::append(qpoly,
-                 point_t(qbox_corners_r(k, 3, 0), qbox_corners_r(k, 3, 1)));
-      bg::append(qpoly,
-                 point_t(qbox_corners_r(k, 0, 0), qbox_corners_r(k, 0, 1)));
-
-      bg::intersection(poly, qpoly, poly_inter);
-
-      if (!poly_inter.empty()) {
-        inter_area = bg::area(poly_inter.front());
-        bg::union_(poly, qpoly, poly_union);
-        if (!poly_union.empty()) {
-          union_area = bg::area(poly_union.front());
-          overlaps_rw(n, k) = inter_area / union_area;
-        }
-        poly_union.clear();
-      }
-      poly.clear();
-      qpoly.clear();
-      poly_inter.clear();
-    }
-  }
-  return overlaps;
-}
-
-template <typename DType>
-py::array_t<DType> rbbox_intersection(py::array_t<DType> box_corners,
-                                      py::array_t<DType> qbox_corners,
-                                      py::array_t<DType> standup_iou,
-                                      DType standup_thresh) {
-  namespace bg = boost::geometry;
-  typedef bg::model::point<DType, 2, bg::cs::cartesian> point_t;
-  typedef bg::model::polygon<point_t> polygon_t;
-  polygon_t poly, qpoly;
-  std::vector<polygon_t> poly_inter, poly_union;
-  DType inter_area, union_area;
-  auto box_corners_r = box_corners.template unchecked<3>();
-  auto qbox_corners_r = qbox_corners.template unchecked<3>();
-  auto standup_iou_r = standup_iou.template unchecked<2>();
-  auto N = box_corners_r.shape(0);
-  auto K = qbox_corners_r.shape(0);
-  py::array_t<DType> overlaps = zeros<DType>({int(N), int(K)});
-  auto overlaps_rw = overlaps.template mutable_unchecked<2>();
-  if (N == 0 || K == 0) {
-    return overlaps;
-  }
-  for (int k = 0; k < K; ++k) {
-    for (int n = 0; n < N; ++n) {
-      if (standup_iou_r(n, k) <= standup_thresh)
-        continue;
-      bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1)));
-      bg::append(poly, point_t(box_corners_r(n, 1, 0), box_corners_r(n, 1, 1)));
-      bg::append(poly, point_t(box_corners_r(n, 2, 0), box_corners_r(n, 2, 1)));
-      bg::append(poly, point_t(box_corners_r(n, 3, 0), box_corners_r(n, 3, 1)));
-      bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1)));
-      bg::append(qpoly,
-                 point_t(qbox_corners_r(k, 0, 0), qbox_corners_r(k, 0, 1)));
-      bg::append(qpoly,
-                 point_t(qbox_corners_r(k, 1, 0), qbox_corners_r(k, 1, 1)));
-      bg::append(qpoly,
-                 point_t(qbox_corners_r(k, 2, 0), qbox_corners_r(k, 2, 1)));
-      bg::append(qpoly,
-                 point_t(qbox_corners_r(k, 3, 0), qbox_corners_r(k, 3, 1)));
-      bg::append(qpoly,
-                 point_t(qbox_corners_r(k, 0, 0), qbox_corners_r(k, 0, 1)));
-
-      bg::intersection(poly, qpoly, poly_inter);
-
-      if (!poly_inter.empty()) {
-        inter_area = bg::area(poly_inter.front());
-        overlaps_rw(n, k) = inter_area;
-      }
-      poly.clear();
-      qpoly.clear();
-      poly_inter.clear();
-    }
-  }
-  return overlaps;
-}
-
-} // namespace spconv
-#endif
\ No newline at end of file
--- a/include/spconv/cublas_gemm.h
+++ b/include/spconv/cublas_gemm.h
-#pragma once
-#include <cublas_v2.h>
-#include <tensorview/tensorview.h>
-
-namespace spconv {
-
-template <class T>
-cublasStatus_t cublasTgemm(cublasHandle_t handle, cublasOperation_t transa,
-                           cublasOperation_t transb, int m, int n, int k,
-                           const T *alpha, const T *A, int lda, const T *B,
-                           int ldb, const T *beta, T *C, int ldc);
-
-template <class T>
-cublasStatus_t cublasTgemmRow(cublasHandle_t handle, cublasOperation_t transa,
-                              cublasOperation_t transb, int m, int n, int k,
-                              const T *alpha, const T *A, int lda, const T *B,
-                              int ldb, const T *beta, T *C, int ldc) {
-  return cublasTgemm<T>(handle, transb, transa, n, m, k, alpha, B, ldb, A, lda,
-                        beta, C, ldc);
-}
-
-template <class T> inline T constant_scalar(float data) { return T(data); }
-
-template <class T>
-cublasStatus_t gemm(cublasHandle_t handle, bool transa, bool transb,
-                    const tv::TensorView<T> A, const tv::TensorView<T> B,
-                    tv::TensorView<T> C) {
-  TV_ASSERT_RT_ERR(A.ndim() == 2, "error");
-  TV_ASSERT_RT_ERR(B.ndim() == 2, "error");
-  auto transa_cublas = transa ? CUBLAS_OP_T : CUBLAS_OP_N;
-  auto transb_cublas = transb ? CUBLAS_OP_T : CUBLAS_OP_N;
-  int m = transa ? A.dim(1) : A.dim(0);
-  int n = transb ? B.dim(0) : B.dim(1);
-  int ka = transa ? A.dim(0) : A.dim(1);
-  int kb = transb ? B.dim(1) : B.dim(0);
-  int lda = transa ? m : ka;
-  int ldb = transb ? ka : n;
-  int ldc = n;
-  TV_ASSERT_RT_ERR(ka == kb, "error");
-  T alpha = constant_scalar<T>(1);
-  T beta = constant_scalar<T>(0);
-  return cublasTgemmRow<T>(handle, transa_cublas, transb_cublas, m, n, ka,
-                           &alpha, A.data(), lda, B.data(), ldb, &beta,
-                           C.data(), ldc);
-}
-
-} // namespace spconv
--- a/include/spconv/fused_spconv_ops.h
+++ b/include/spconv/fused_spconv_ops.h
-// Copyright 2019 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef FUSED_SPARSE_CONV_OP_H_
-#define FUSED_SPARSE_CONV_OP_H_
-
-#include <spconv/indice.h>
-#include <spconv/reordering.h>
-#include <tensorview/torch_utils.h>
-#include <torch/script.h>
-#include <utility/timer.h>
-
-namespace spconv {
-// torch.jit's doc says only support int64, so we need to convert to int32.
-
-torch::Tensor
-fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor filters,
-                         torch::Tensor bias, torch::Tensor indicePairs,
-                         torch::Tensor indiceNum, int64_t numActOut,
-                         int64_t _inverse, int64_t _subM) {
-  bool subM = _subM != 0;
-  bool inverse = _inverse != 0;
-  auto device = features.device().type();
-  auto ndim = filters.dim() - 2;
-  auto kernelVolume = indicePairs.size(0);
-  auto numInPlanes = features.size(1);
-  auto numOutPlanes = filters.size(ndim + 1);
-  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
-  auto indicePairMaxSizeIter =
-      std::max_element(indicePairNumCpu.data_ptr<int>(),
-                       indicePairNumCpu.data_ptr<int>() + kernelVolume);
-  int indicePairMaxOffset =
-      indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
-  int indicePairMaxSize = *indicePairMaxSizeIter;
-
-  /*if (_subM){
-    std::vector<int> indicePairNumVec(indicePairNumCpu.data_ptr<int>(),
-  indicePairNumCpu.data_ptr<int>() + kernelVolume);
-    indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
-
-    auto indicePairVecMaxSizeIter = std::max_element(
-        indicePairNumVec.begin(), indicePairNumVec.end());
-    indicePairMaxSize = *indicePairVecMaxSizeIter;
-  }*/
-
-  auto options =
-      torch::TensorOptions().dtype(features.dtype()).device(features.device());
-  // auto indicePairOptions =
-  //     torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device());
-
-  torch::Tensor output =
-      torch::zeros({numActOut, numOutPlanes}, options).copy_(bias);
-  torch::Tensor inputBuffer =
-      torch::zeros({indicePairMaxSize, numInPlanes}, options);
-  torch::Tensor outputBuffer =
-      torch::zeros({indicePairMaxSize, numOutPlanes}, options);
-  filters = filters.view({-1, numInPlanes, numOutPlanes});
-  if (subM) { // the center index of subm conv don't need gather and scatter
-              // add.
-    torch::mm_out(output, features, filters[indicePairMaxOffset]);
-  }
-  double totalGatherTime = 0;
-  double totalGEMMTime = 0;
-  double totalSAddTime = 0;
-  for (int i = 0; i < kernelVolume; ++i) {
-    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
-    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
-      continue;
-    }
-    // auto timer = spconv::CudaContextTimer<>();
-    auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr(),
-                                             {nHot, numOutPlanes}, options);
-    auto inputBufferBlob =
-        torch::from_blob(inputBuffer.data_ptr(), {nHot, numInPlanes}, options);
-
-    if (device == torch::kCPU) {
-      sparse_gather_cpu(inputBuffer, features, indicePairs[i][inverse], nHot);
-    }
-#ifdef TV_CUDA
-    else if (device == torch::kCUDA) {
-      sparse_gather_cuda(inputBuffer, features, indicePairs[i][inverse], nHot);
-    }
-#endif
-    else {
-      TV_ASSERT_INVALID_ARG(false, "unknown device type");
-    }
-
-    // totalGatherTime += timer.report() / 1000.0;
-    torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
-    // totalGEMMTime += timer.report() / 1000.0;
-
-    if (device == torch::kCPU) {
-      sparse_scatter_add_cpu(outputBuffer, output, indicePairs[i][!inverse],
-                             nHot);
-    }
-#ifdef TV_CUDA
-    else if (device == torch::kCUDA) {
-      sparse_scatter_add_cuda(outputBuffer, output, indicePairs[i][!inverse],
-                              nHot);
-    }
-#endif
-    else {
-      TV_ASSERT_INVALID_ARG(false, "unknown device type");
-    }
-
-    // totalSAddTime += timer.report() / 1000.0;
-  }
-  // std::cout << "gather time " << totalGatherTime << std::endl;
-  // std::cout << "gemm time " << totalGEMMTime << std::endl;
-  // std::cout << "scatteradd time " << totalSAddTime << std::endl;
-  return output;
-}
-} // namespace spconv
-
-#endif
\ No newline at end of file
--- a/include/spconv/geometry.h
+++ b/include/spconv/geometry.h
-// Copyright 2019 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef SPCONV_GEOMETRY_H_
-#define SPCONV_GEOMETRY_H_
-
-#include <iostream>
-#include <limits>
-#include <tensorview/tensorview.h>
-#include <tsl/robin_map.h>
-#include <unordered_map>
-namespace spconv {
-
-namespace detail {
-
-template <typename T> struct ToUnsigned;
-
-template <> struct ToUnsigned<int> { using type = uint32_t; };
-
-template <> struct ToUnsigned<long> { using type = uint64_t; };
-
-template <typename T> struct FNVInternal;
-template <> struct FNVInternal<uint32_t> {
-  constexpr static uint32_t defaultOffsetBasis = 0x811C9DC5;
-  constexpr static uint32_t prime = 0x01000193;
-};
-
-template <> struct FNVInternal<uint64_t> {
-  constexpr static uint64_t defaultOffsetBasis = 0xcbf29ce484222325;
-  constexpr static uint64_t prime = 0x100000001b3;
-};
-
-} // namespace detail
-template <typename T>
-using to_unsigned_t = typename detail::ToUnsigned<std::remove_const_t<T>>::type;
-
-template <typename T> struct FNV1a : detail::FNVInternal<T> {
-  std::size_t operator()(const T *data, std::size_t size) {
-    to_unsigned_t<T> hash = detail::FNVInternal<T>::defaultOffsetBasis;
-    for (std::size_t i = 0; i < size; ++i) {
-      hash *= detail::FNVInternal<T>::prime;
-      hash ^= static_cast<to_unsigned_t<T>>(data[i]);
-    }
-    return hash;
-  }
-};
-
-template <typename Index, unsigned NDim>
-TV_HOST_DEVICE Index getValidOutPos(const Index *input_pos,
-                                    const Index *kernelSize,
-                                    const Index *stride, const Index *padding,
-                                    const Index *dilation,
-                                    const Index *outSpatialShape, Index *out) {
-  Index lowers[NDim];
-  Index uppers[NDim];
-  Index counter[NDim];
-  Index counterSize[NDim];
-  Index pointCounter = 0;
-  Index val;
-  Index numPoints = 1;
-  Index m, offset;
-  bool valid = false;
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    lowers[i] = (input_pos[i] - (kernelSize[i] - 1) * dilation[i] - 1 +
-                 stride[i] + padding[i]) /
-                stride[i];
-    uppers[i] = (input_pos[i] + padding[i]) / stride[i];
-  }
-
-#pragma unroll
-  for (unsigned i = 0; i < NDim; ++i) {
-    counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1);
-    numPoints *= counterSize[i];
-  }
-
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    counter[i] = 0;
-  }
-  for (int i = 0; i < numPoints; ++i) {
-    valid = true;
-    m = 1;
-    offset = 0;
-#pragma unroll
-    for (int j = NDim - 1; j >= 0; --j) {
-      val = uppers[j] - counter[j] * dilation[j];
-      out[pointCounter * (NDim + 1) + j] = val;
-      if (val < 0 || (val > outSpatialShape[j] - 1)) {
-        valid = false;
-        // break;
-      }
-      offset += m * (input_pos[j] - val * stride[j] + padding[j]) / dilation[j];
-      m *= kernelSize[j];
-    }
-
-    out[pointCounter * (NDim + 1) + NDim] = offset;
-    if (valid)
-      ++pointCounter;
-    counter[NDim - 1] += 1;
-#pragma unroll
-    for (int c = NDim - 1; c >= 0; --c) {
-      if (counter[c] == counterSize[c] && c > 0) {
-        counter[c - 1] += 1;
-        counter[c] = 0;
-      }
-    }
-  }
-  return pointCounter;
-}
-
-template <typename Index, unsigned NDim>
-TV_HOST_DEVICE Index getValidOutPosTranspose(
-    const Index *input_pos, const Index *kernelSize, const Index *stride,
-    const Index *padding, const Index *dilation, const Index *outSpatialShape,
-    Index *out) {
-  Index lowers[NDim];
-  Index uppers[NDim];
-  Index counter[NDim];
-  Index counterSize[NDim];
-  Index pointCounter = 0;
-  Index val;
-  Index numPoints = 1;
-  Index m, offset;
-  bool valid = false;
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    lowers[i] = input_pos[i] * stride[i] - padding[i];
-    uppers[i] = lowers[i] + (kernelSize[i] - 1) * dilation[i];
-  }
-#pragma unroll
-  for (unsigned i = 0; i < NDim; ++i) {
-    counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1);
-    numPoints *= counterSize[i];
-  }
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    counter[i] = 0;
-  }
-  for (int i = 0; i < numPoints; ++i) {
-    valid = true;
-    m = 1;
-    offset = 0;
-#pragma unroll
-    for (int j = NDim - 1; j >= 0; --j) {
-      val = uppers[j] - counter[j] * dilation[j];
-      out[pointCounter * (NDim + 1) + j] = val;
-      if (val < 0 || (val > outSpatialShape[j] - 1)) {
-        valid = false;
-        // break;
-      }
-      offset += m * (val - lowers[j]) / dilation[j];
-      m *= kernelSize[j];
-    }
-    out[pointCounter * (NDim + 1) + NDim] = offset;
-    if (valid)
-      ++pointCounter;
-    counter[NDim - 1] += 1;
-#pragma unroll
-    for (int c = NDim - 1; c >= 0; --c) {
-      if (counter[c] == counterSize[c] && c > 0) {
-        counter[c - 1] += 1;
-        counter[c] = 0;
-      }
-    }
-  }
-  return pointCounter;
-}
-
-} // namespace spconv
-
-#endif
\ No newline at end of file
--- a/include/spconv/indice.cu.h
+++ b/include/spconv/indice.cu.h
-// Copyright 2019 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef INDICE_CU_H_
-#define INDICE_CU_H_
-#include <cuhash/hash_table.cuh>
-#include <spconv/geometry.h>
-#include <tensorview/kernel_utils.h>
-#include <tensorview/tensorview.h>
-
-namespace spconv {
-template <typename Index, unsigned NDim, int KernelMaxVolume = 256,
-          typename Index1D = int>
-__global__ void prepareIndicePairsKernel(
-    tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicePairs,
-    tv::TensorView<Index> indiceNum, tv::TensorView<Index1D> indicePairUnique,
-    const tv::SimpleVector<Index, NDim> kernelSize,
-    const tv::SimpleVector<Index, NDim> stride,
-    const tv::SimpleVector<Index, NDim> padding,
-    const tv::SimpleVector<Index, NDim> dilation,
-    const tv::SimpleVector<Index, NDim> outSpatialShape) {
-  auto numActIn = indicesIn.dim(0);
-  Index spatialVolume = 1;
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    spatialVolume *= outSpatialShape[i];
-  }
-  Index kernelVolume = 1;
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    kernelVolume *= kernelSize[i];
-  }
-  Index numValidPoints = 0;
-  Index validPoints[KernelMaxVolume * (NDim + 1)];
-  Index *pointPtr = nullptr;
-  auto indicePairsDim2 = indicePairs.dim(2);
-  Index index;
-  for (int ix : tv::KernelLoopX<int>(numActIn)) {
-    numValidPoints = getValidOutPos<Index, NDim>(
-        indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
-        stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
-        validPoints);
-    for (Index i = 0; i < numValidPoints; ++i) {
-      pointPtr = validPoints + i * (NDim + 1);
-      auto offset = pointPtr[NDim];
-      Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
-      indicePairs(0, offset, oldNum) = ix;
-      index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
-                  pointPtr, outSpatialShape.data(), 0) +
-              spatialVolume * indicesIn(ix, 0);
-      indicePairs(1, offset, oldNum) = index;
-      indicePairUnique[offset * indicePairsDim2 + oldNum] = index;
-    }
-  }
-}
-
-template <typename Index, unsigned NDim, int KernelMaxVolume = 256>
-__global__ void prepareDeConvIndicePairsKernel(
-    tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicePairs,
-    tv::TensorView<Index> indiceNum, tv::TensorView<Index> indicePairUnique,
-    const tv::SimpleVector<Index, NDim> kernelSize,
-    const tv::SimpleVector<Index, NDim> stride,
-    const tv::SimpleVector<Index, NDim> padding,
-    const tv::SimpleVector<Index, NDim> dilation,
-    const tv::SimpleVector<Index, NDim> outSpatialShape) {
-  auto numActIn = indicesIn.dim(0);
-  Index spatialVolume = 1;
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    spatialVolume *= outSpatialShape[i];
-  }
-  Index kernelVolume = 1;
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    kernelVolume *= kernelSize[i];
-  }
-  Index numValidPoints = 0;
-  Index validPoints[KernelMaxVolume * (NDim + 1)];
-  Index *pointPtr = nullptr;
-  auto indicePairsDim2 = indicePairs.dim(2);
-  Index index;
-  for (int ix : tv::KernelLoopX<int>(numActIn)) {
-    numValidPoints = getValidOutPosTranspose<Index, NDim>(
-        indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
-        stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
-        validPoints);
-    for (Index i = 0; i < numValidPoints; ++i) {
-      pointPtr = validPoints + i * (NDim + 1);
-      auto offset = pointPtr[NDim];
-      Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
-      indicePairs(0, offset, oldNum) = ix;
-      index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
-                  pointPtr, outSpatialShape.data(), 0) +
-              spatialVolume * indicesIn(ix, 0);
-      indicePairs(1, offset, oldNum) = index;
-      indicePairUnique[offset * indicePairsDim2 + oldNum] = index;
-    }
-  }
-}
-
-template <typename Index, typename IndexGrid, unsigned NDim>
-__global__ void assignGridAndIndiceOutKernel(
-    tv::TensorView<Index> indicesOut, tv::TensorView<IndexGrid> gridsOut,
-    int numAct, tv::TensorView<Index> indicePairs,
-    tv::TensorView<Index> indicePairUnique,
-    const tv::SimpleVector<Index, NDim> outSpatialShape, int batchSize) {
-
-  Index index;
-  auto indicesOutPtr = indicesOut.data();
-  for (int ix : tv::KernelLoopX<int>(numAct)) {
-    index = indicePairUnique[ix];
-    gridsOut[index] = ix;
-    index = tv::rowArrayIdxInv<Index, NDim>(
-        index, indicesOutPtr + ix * (NDim + 1) + 1, outSpatialShape.data());
-    indicesOut[ix * (NDim + 1)] = index % batchSize;
-  }
-}
-
-template <typename Index, unsigned NDim, unsigned kNumHashFunctions = 4>
-__global__ void
-assignIndiceOutKernel(tv::TensorView<Index> indicesOut, int numAct,
-                      tv::TensorView<Index> indicePairUnique,
-                      const tv::SimpleVector<Index, NDim> outSpatialShape,
-                      int batchSize) {
-
-  Index index;
-  auto indicesOutPtr = indicesOut.data();
-  for (unsigned ix : tv::KernelLoopX<unsigned>(numAct)) {
-    index = indicePairUnique[ix];
-    index = tv::rowArrayIdxInv<Index, NDim>(
-        index, indicesOutPtr + ix * (NDim + 1) + 1, outSpatialShape.data());
-    indicesOut[ix * (NDim + 1)] = index % batchSize;
-  }
-}
-
-template <typename Index, unsigned NDim, unsigned kNumHashFunctions = 4>
-__global__ void
-assignIndicePairsHashKernel(tv::TensorView<Index> indicesOut, int numActIn,
-                            tv::TensorView<Index> indicePairs,
-                            tv::TensorView<Index> indicePairUnique,
-                            unsigned table_size, const cuhash::Entry *table,
-                            cuhash::Functions<kNumHashFunctions> constants,
-                            uint2 stash_constants, unsigned stash_count) {
-
-  Index index;
-  int kernelVolume = indicePairs.dim(1);
-  auto indicePairsOut = indicePairs.subview(1);
-  for (int ix : tv::KernelLoopX<int>(numActIn)) {
-    for (int i = 0; i < kernelVolume; ++i) {
-      index = indicePairsOut(i, ix);
-      if (index > -1) {
-        auto val = cuhash::retrieve((unsigned)(index), table_size, table,
-                                    constants, stash_constants, stash_count);
-        assert(val != cuhash::kNotFound);
-        indicePairsOut(i, ix) = (unsigned)val;
-      }
-    }
-  }
-}
-
-template <typename Index, typename IndexGrid, unsigned NDim>
-__global__ void
-assignIndicePairsKernel(tv::TensorView<Index> indicesOut,
-                        tv::TensorView<IndexGrid> gridsOut, int numActIn,
-                        tv::TensorView<Index> indicePairs,
-                        tv::TensorView<Index> indicePairUnique,
-                        const tv::SimpleVector<Index, NDim> outSpatialShape) {
-
-  Index index;
-  int kernelVolume = indicePairs.dim(1);
-  auto indicePairsOut = indicePairs.subview(1);
-
-  for (int ix : tv::KernelLoopX<int>(numActIn)) {
-    for (int i = 0; i < kernelVolume; ++i) {
-      index = indicePairsOut(i, ix);
-      if (index > -1) {
-        indicePairsOut(i, ix) = gridsOut[index];
-      }
-    }
-  }
-}
-
-template <typename Index, typename IndexGrid, unsigned NDim>
-__global__ void prepareSubMGridKernel(
-    tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
-    const tv::SimpleVector<Index, NDim> outSpatialShape, Index spatialVolume) {
-  auto numActIn = indicesIn.dim(0);
-  Index index = 0;
-  for (int ix : tv::KernelLoopX<int>(numActIn)) {
-    index =
-        tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
-            indicesIn.data() + ix * (NDim + 1) + 1, outSpatialShape.data(), 0) +
-        spatialVolume * indicesIn(ix, 0);
-    gridsOut[index] = ix;
-  }
-}
-
-template <typename Index, unsigned NDim>
-__global__ void
-prepareSubMHashKernel(tv::TensorView<const Index> indicesIn, unsigned *keys,
-                      unsigned *values,
-                      const tv::SimpleVector<Index, NDim> outSpatialShape) {
-  auto numActIn = indicesIn.dim(0);
-  Index spatialVolume = 1;
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    spatialVolume *= outSpatialShape[i];
-  }
-  Index index = 0;
-  for (int ix : tv::KernelLoopX<int>(numActIn)) {
-    index = tv::rowArrayIdx<Index, NDim>(indicesIn.data() + ix * (NDim + 1) + 1,
-                                         outSpatialShape.data()) +
-            spatialVolume * indicesIn(ix, 0);
-    keys[ix] = index;
-    values[ix] = ix;
-  }
-}
-
-template <typename Index, typename IndexGrid, unsigned NDim,
-          int KernelMaxVolume = 256>
-__global__ void getSubMIndicePairsKernel(
-    tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
-    tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum,
-    const tv::SimpleVector<Index, NDim> kernelSize,
-    const tv::SimpleVector<Index, NDim> stride,
-    const tv::SimpleVector<Index, NDim> padding,
-    const tv::SimpleVector<Index, NDim> dilation,
-    const tv::SimpleVector<Index, NDim> outSpatialShape) {
-  auto numActIn = indicesIn.dim(0);
-  Index spatialVolume = 1;
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    spatialVolume *= outSpatialShape[i];
-  }
-  Index numValidPoints = 0;
-  Index validPoints[KernelMaxVolume * (NDim + 1)];
-  Index *pointPtr = nullptr;
-  Index index = 0;
-  for (int ix : tv::KernelLoopX<int>(numActIn)) {
-    numValidPoints = getValidOutPos<Index, NDim>(
-        indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
-        stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
-        validPoints);
-    for (int i = 0; i < numValidPoints; ++i) {
-      pointPtr = validPoints + i * (NDim + 1);
-      auto offset = pointPtr[NDim];
-      index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
-                  pointPtr, outSpatialShape.data(), 0) +
-              spatialVolume * indicesIn(ix, 0);
-      if (gridsOut[index] > -1) {
-        Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
-        indicePairs(1, offset, oldNum) = gridsOut[index];
-        indicePairs(0, offset, oldNum) = ix;
-      }
-    }
-  }
-}
-
-template <typename Index, typename IndexGrid, unsigned K0, unsigned K1,
-          unsigned K2>
-__global__ void getSubMIndicePairsKernel3(
-    tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
-    tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum,
-    const tv::SimpleVector<Index, 3> outSpatialShape, Index spatialVolume) {
-  auto numActIn = indicesIn.dim(0);
-
-  Index point[3];
-  Index index = 0;
-  Index offset;
-  constexpr unsigned KV = K0 * K1 * K2;
-  constexpr unsigned center = KV / 2;
-  *(indiceNum.data() + center) = numActIn;
-  for (int ix : tv::KernelLoopX<int>(numActIn)) {
-    const Index *indice_data = indicesIn.data() + ix * (3 + 1);
-#pragma unroll
-    for (int i = 0; i < K0; ++i) {
-#pragma unroll
-      for (int j = 0; j < K1; ++j) {
-#pragma unroll
-        for (int k = 0; k < K2; ++k) {
-          offset = i * K1 * K2 + j * K2 + k;
-          if (offset > center){
-            continue;
-          }
-          if (center == offset){
-              // center of subm indice pairs dont need atomicadd
-              indicePairs(1, offset, ix) = ix;
-              indicePairs(0, offset, ix) = ix;
-          }else{
-            point[2] = indice_data[3] - k + K2 / 2;
-            point[1] = indice_data[2] - j + K1 / 2;
-            point[0] = indice_data[1] - i + K0 / 2;
-            if (point[1] >= 0 && point[1] < outSpatialShape[1] && point[2] >= 0 &&
-                point[2] < outSpatialShape[2] && point[0] >= 0 &&
-                point[0] < outSpatialShape[0]) {
-              index = tv::ArrayIndexRowMajor<3, 3>::runPtrs(
-                          point, outSpatialShape.data(), 0) +
-                      spatialVolume * indice_data[0];
-              if (gridsOut[index] != -1) {
-                // for subm: indicePairs[0, i] = indicePairs[1, kernelVolume - i - 1]
-                Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
-                atomicAdd(indiceNum.data() + KV - offset - 1, Index(1));
-                indicePairs(1, offset, oldNum) = gridsOut[index];
-                indicePairs(0, offset, oldNum) = ix;
-                indicePairs(1, KV - offset - 1, oldNum) = ix;
-                indicePairs(0, KV - offset - 1, oldNum) = gridsOut[index];
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-template <typename Index, typename IndexGrid, unsigned K0, unsigned K1>
-__global__ void getSubMIndicePairsKernel2(
-    tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
-    tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum,
-    const tv::SimpleVector<Index, 2> outSpatialShape, Index spatialVolume) {
-  auto numActIn = indicesIn.dim(0);
-  Index point[2];
-  Index index = 0;
-  Index offset;
-  constexpr unsigned KV = K0 * K1;
-  constexpr unsigned center = KV / 2;
-  *(indiceNum.data() + center) = numActIn;
-
-  for (int ix : tv::KernelLoopX<int>(numActIn)) {
-    const Index *indice_data = indicesIn.data() + ix * (2 + 1);
-#pragma unroll
-    for (int i = 0; i < K0; ++i) {
-#pragma unroll
-      for (int j = 0; j < K1; ++j) {
-        offset = i * K1 + j;
-        if (offset > center){
-          continue;
-        }
-        if (center == offset){
-            // center of subm indice pairs dont need atomicadd
-            indicePairs(1, offset, ix) = ix;
-            indicePairs(0, offset, ix) = ix;
-        }else{
-          point[1] = indice_data[2] - j + K1 / 2;
-          point[0] = indice_data[1] - i + K0 / 2;
-          if (point[1] >= 0 && point[1] < outSpatialShape[1] && point[0] >= 0 &&
-              point[0] < outSpatialShape[0]) {
-            index = tv::ArrayIndexRowMajor<2, 2>::runPtrs(
-                        point, outSpatialShape.data(), 0) +
-                    spatialVolume * indice_data[0];
-            if (gridsOut[index] > -1) {
-              Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
-              atomicAdd(indiceNum.data() + KV - offset - 1, Index(1));
-              indicePairs(1, offset, oldNum) = gridsOut[index];
-              indicePairs(0, offset, oldNum) = ix;
-              indicePairs(1, KV - offset - 1, oldNum) = ix;
-              indicePairs(0, KV - offset - 1, oldNum) = gridsOut[index];
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-template <typename Index, unsigned NDim, int KernelMaxVolume = 256,
-          unsigned kNumHashFunctions = 4>
-__global__ void getSubMIndicePairsHashKernel(
-    tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicePairs,
-    tv::TensorView<Index> indiceNum,
-    const tv::SimpleVector<Index, NDim> kernelSize,
-    const tv::SimpleVector<Index, NDim> stride,
-    const tv::SimpleVector<Index, NDim> padding,
-    const tv::SimpleVector<Index, NDim> dilation,
-    const tv::SimpleVector<Index, NDim> outSpatialShape, unsigned table_size,
-    const cuhash::Entry *table, cuhash::Functions<kNumHashFunctions> constants,
-    uint2 stash_constants, unsigned stash_count) {
-  auto numActIn = indicesIn.dim(0);
-  Index spatialVolume = 1;
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    spatialVolume *= outSpatialShape[i];
-  }
-  Index numValidPoints = 0;
-  Index validPoints[KernelMaxVolume * (NDim + 1)];
-  Index *pointPtr = nullptr;
-  Index index = 0;
-  for (int ix : tv::KernelLoopX<int>(numActIn)) {
-    numValidPoints = getValidOutPos<Index, NDim>(
-        indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
-        stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
-        validPoints);
-    for (int i = 0; i < numValidPoints; ++i) {
-      pointPtr = validPoints + i * (NDim + 1);
-      auto offset = pointPtr[NDim];
-      index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
-                  pointPtr, outSpatialShape.data(), 0) +
-              spatialVolume * indicesIn(ix, 0);
-      auto val = cuhash::retrieve((unsigned)(index), table_size, table,
-                                  constants, stash_constants, stash_count);
-      if (val != cuhash::kNotFound) {
-        Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
-        indicePairs(1, offset, oldNum) = val;
-        indicePairs(0, offset, oldNum) = ix;
-      }
-    }
-  }
-}
-
-template <typename Index, typename IndexGrid, unsigned NDim>
-__global__ void resetGridKernel(const Index *indicePairUnique,
-                                tv::TensorView<IndexGrid> gridsOut,
-                                int numAct) {
-  for (int ix : tv::KernelLoopX<int>(numAct)) {
-    gridsOut[indicePairUnique[ix]] = -1;
-  }
-}
-
-template <typename T> __global__ void arangeKernel(T *data, int size) {
-  for (int ix : tv::KernelLoopX<int>(size)) {
-    data[ix] = ix;
-  }
-}
-
-template <typename Index, typename IndexGrid, unsigned NDim>
-__global__ void
-resetGridSubMKernel(const Index *indices, tv::TensorView<IndexGrid> gridsOut,
-                    const tv::SimpleVector<Index, NDim> outSpatialShape,
-                    int numAct) {
-  Index outSpatialShapeReg[NDim];
-  for (int i = 0; i < NDim; ++i) {
-    outSpatialShapeReg[i] = outSpatialShape[i];
-  }
-  Index spatialVolume = 1;
-  auto indsPtr = indices;
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    spatialVolume *= outSpatialShape[i];
-  }
-  Index index;
-  for (int ix : tv::KernelLoopX<int>(numAct)) {
-    indsPtr = indices + ix * (NDim + 1);
-    index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(indsPtr + 1,
-                                                        outSpatialShapeReg, 0);
-    gridsOut[index + spatialVolume * indsPtr[0]] = -1;
-  }
-}
-
-} // namespace spconv
-
-#undef atomicAdd
-
-#endif
\ No newline at end of file
--- a/include/spconv/indice.h
+++ b/include/spconv/indice.h
-// Copyright 2019 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef SPARSE_CONV_INDICE_FUNCTOR_H_
-#define SPARSE_CONV_INDICE_FUNCTOR_H_
-#include <tensorview/tensorview.h>
-#include <torch/script.h>
-
-namespace spconv {
-int create_conv_indice_pair_p1_cuda(
-    torch::Tensor indicesIn, torch::Tensor indicePairs, torch::Tensor indiceNum,
-    torch::Tensor indicePairUnique, std::vector<int64_t> kernelSize,
-    std::vector<int64_t> stride, std::vector<int64_t> padding,
-    std::vector<int64_t> dilation, std::vector<int64_t> outSpatialShape,
-    bool transpose);
-
-int create_conv_indice_pair_p2_cuda(
-    torch::Tensor indicesIn, torch::Tensor indicesOut, torch::Tensor gridsOut,
-    torch::Tensor indicePairs, torch::Tensor indiceNum,
-    torch::Tensor indicePairUnique, std::vector<int64_t> outSpatialShape,
-    bool transpose, bool resetGrid, bool useHash);
-
-int create_submconv_indice_pair_cuda(
-    torch::Tensor indicesIn, torch::Tensor gridsOut, torch::Tensor indicePairs,
-    torch::Tensor indiceNum, std::vector<int64_t> kernelSize,
-    std::vector<int64_t> stride, std::vector<int64_t> padding,
-    std::vector<int64_t> dilation, std::vector<int64_t> outSpatialShape,
-    bool transpose, bool resetGrid, bool useHash);
-
-int create_conv_indice_pair_cpu(
-    torch::Tensor indicesIn, torch::Tensor indicesOut, torch::Tensor gridsOut,
-    torch::Tensor indicePairs, torch::Tensor indiceNum,
-    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
-    std::vector<int64_t> padding, std::vector<int64_t> dilation,
-    std::vector<int64_t> outSpatialShape, bool transpose, bool resetGrid,
-    bool useHash);
-
-int create_submconv_indice_pair_cpu(
-    torch::Tensor indicesIn, torch::Tensor gridsOut, torch::Tensor indicePairs,
-    torch::Tensor indiceNum, std::vector<int64_t> kernelSize,
-    std::vector<int64_t> stride, std::vector<int64_t> padding,
-    std::vector<int64_t> dilation, std::vector<int64_t> outSpatialShape,
-    bool transpose, bool resetGrid, bool useHash);
-
-} // namespace spconv
-
-#endif
\ No newline at end of file
--- a/include/spconv/maxpool.h
+++ b/include/spconv/maxpool.h
-// Copyright 2019 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef SPARSE_MAXPOOL_FUNCTOR_H_
-#define SPARSE_MAXPOOL_FUNCTOR_H_
-#include <tensorview/mp_helper.h>
-#include <tensorview/tensor.h>
-#include <tensorview/torch_utils.h>
-#include <torch/script.h>
-
-namespace spconv {
-
-void maxpool_bwd_cpu(torch::Tensor outFeatures, torch::Tensor inFeatures,
-                     torch::Tensor dout, torch::Tensor din,
-                     torch::Tensor indicesIn, torch::Tensor indicesOut,
-                     int size);
-
-void maxpool_fwd_cpu(torch::Tensor outFeatures, torch::Tensor inFeatures,
-                     torch::Tensor indicesIn, torch::Tensor indicesOut,
-                     int size);
-
-void maxpool_bwd_cuda(torch::Tensor outFeatures, torch::Tensor inFeatures,
-                      torch::Tensor dout, torch::Tensor din,
-                      torch::Tensor indicesIn, torch::Tensor indicesOut,
-                      int size);
-
-void maxpool_fwd_cuda(torch::Tensor outFeatures, torch::Tensor inFeatures,
-                      torch::Tensor indicesIn, torch::Tensor indicesOut,
-                      int size);
-
-} // namespace spconv
-
-#endif
\ No newline at end of file
--- a/include/spconv/nms.h
+++ b/include/spconv/nms.h
-// Copyright 2019 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef NMS_CPU_H
-#define NMS_CPU_H
-#include <pybind11/pybind11.h>
-// must include pybind11/stl.h if using containers in STL in arguments.
-#include "box_iou.h"
-#include "nms_gpu.h"
-#include <algorithm>
-#include <boost/geometry.hpp>
-#include <pybind11/numpy.h>
-#include <pybind11/stl.h>
-#include <vector>
-namespace spconv {
-namespace py = pybind11;
-using namespace pybind11::literals;
-
-template <typename DType>
-std::vector<int> non_max_suppression_cpu(py::array_t<DType> boxes,
-                                         py::array_t<int> order, DType thresh,
-                                         DType eps = 0) {
-  auto ndets = boxes.shape(0);
-  auto boxes_r = boxes.template unchecked<2>();
-  auto order_r = order.template unchecked<1>();
-  auto suppressed = zeros<int>({int(ndets)});
-  auto suppressed_rw = suppressed.template mutable_unchecked<1>();
-  auto area = zeros<DType>({int(ndets)});
-  auto area_rw = area.template mutable_unchecked<1>();
-  // get areas
-  for (int i = 0; i < ndets; ++i) {
-    area_rw(i) = (boxes_r(i, 2) - boxes_r(i, 0) + eps) *
-                 (boxes_r(i, 3) - boxes_r(i, 1) + eps);
-  }
-  std::vector<int> keep;
-  int i, j;
-  DType xx1, xx2, w, h, inter, ovr;
-  for (int _i = 0; _i < ndets; ++_i) {
-    i = order_r(_i);
-    if (suppressed_rw(i) == 1)
-      continue;
-    keep.push_back(i);
-    for (int _j = _i + 1; _j < ndets; ++_j) {
-      j = order_r(_j);
-      if (suppressed_rw(j) == 1)
-        continue;
-      xx2 = std::min(boxes_r(i, 2), boxes_r(j, 2));
-      xx1 = std::max(boxes_r(i, 0), boxes_r(j, 0));
-      w = xx2 - xx1 + eps;
-      if (w > 0) {
-        xx2 = std::min(boxes_r(i, 3), boxes_r(j, 3));
-        xx1 = std::max(boxes_r(i, 1), boxes_r(j, 1));
-        h = xx2 - xx1 + eps;
-        if (h > 0) {
-          inter = w * h;
-          ovr = inter / (area_rw(i) + area_rw(j) - inter);
-          if (ovr >= thresh)
-            suppressed_rw(j) = 1;
-        }
-      }
-    }
-  }
-  return keep;
-}
-
-template <typename DType>
-std::vector<int> rotate_non_max_suppression_cpu(py::array_t<DType> box_corners,
-                                                py::array_t<int> order,
-                                                py::array_t<DType> standup_iou,
-                                                DType thresh) {
-  auto ndets = box_corners.shape(0);
-  auto box_corners_r = box_corners.template unchecked<3>();
-  auto order_r = order.template unchecked<1>();
-  auto suppressed = zeros<int>({int(ndets)});
-  auto suppressed_rw = suppressed.template mutable_unchecked<1>();
-  auto standup_iou_r = standup_iou.template unchecked<2>();
-  std::vector<int> keep;
-  int i, j;
-
-  namespace bg = boost::geometry;
-  typedef bg::model::point<DType, 2, bg::cs::cartesian> point_t;
-  typedef bg::model::polygon<point_t> polygon_t;
-  polygon_t poly, qpoly;
-  std::vector<polygon_t> poly_inter, poly_union;
-  DType inter_area, union_area, overlap;
-
-  for (int _i = 0; _i < ndets; ++_i) {
-    i = order_r(_i);
-    if (suppressed_rw(i) == 1)
-      continue;
-    keep.push_back(i);
-    for (int _j = _i + 1; _j < ndets; ++_j) {
-      j = order_r(_j);
-      if (suppressed_rw(j) == 1)
-        continue;
-      if (standup_iou_r(i, j) <= 0.0)
-        continue;
-      // std::cout << "pre_poly" << std::endl;
-      try {
-        bg::append(poly,
-                   point_t(box_corners_r(i, 0, 0), box_corners_r(i, 0, 1)));
-        bg::append(poly,
-                   point_t(box_corners_r(i, 1, 0), box_corners_r(i, 1, 1)));
-        bg::append(poly,
-                   point_t(box_corners_r(i, 2, 0), box_corners_r(i, 2, 1)));
-        bg::append(poly,
-                   point_t(box_corners_r(i, 3, 0), box_corners_r(i, 3, 1)));
-        bg::append(poly,
-                   point_t(box_corners_r(i, 0, 0), box_corners_r(i, 0, 1)));
-        bg::append(qpoly,
-                   point_t(box_corners_r(j, 0, 0), box_corners_r(j, 0, 1)));
-        bg::append(qpoly,
-                   point_t(box_corners_r(j, 1, 0), box_corners_r(j, 1, 1)));
-        bg::append(qpoly,
-                   point_t(box_corners_r(j, 2, 0), box_corners_r(j, 2, 1)));
-        bg::append(qpoly,
-                   point_t(box_corners_r(j, 3, 0), box_corners_r(j, 3, 1)));
-        bg::append(qpoly,
-                   point_t(box_corners_r(j, 0, 0), box_corners_r(j, 0, 1)));
-        bg::intersection(poly, qpoly, poly_inter);
-      } catch (const std::exception &e) {
-        std::cout << "box i corners:" << std::endl;
-        for (int k = 0; k < 4; ++k) {
-          std::cout << box_corners_r(i, k, 0) << " " << box_corners_r(i, k, 1)
-                    << std::endl;
-        }
-        std::cout << "box j corners:" << std::endl;
-        for (int k = 0; k < 4; ++k) {
-          std::cout << box_corners_r(j, k, 0) << " " << box_corners_r(j, k, 1)
-                    << std::endl;
-        }
-        // throw e;
-        continue;
-      }
-      // std::cout << "post_poly" << std::endl;
-      // std::cout << "post_intsec" << std::endl;
-      if (!poly_inter.empty()) {
-        inter_area = bg::area(poly_inter.front());
-        // std::cout << "pre_union" << " " << inter_area << std::endl;
-        bg::union_(poly, qpoly, poly_union);
-        /*
-        if (poly_union.empty()){
-            std::cout << "intsec area:" << " " << inter_area << std::endl;
-            std::cout << "box i corners:" << std::endl;
-            for(int k = 0; k < 4; ++k){
-                std::cout << box_corners_r(i, k, 0) << " " << box_corners_r(i,
-        k, 1) << std::endl;
-            }
-            std::cout << "box j corners:" <<  std::endl;
-            for(int k = 0; k < 4; ++k){
-                std::cout << box_corners_r(j, k, 0) << " " << box_corners_r(j,
-        k, 1) << std::endl;
-            }
-        }*/
-        // std::cout << "post_union" << poly_union.empty() << std::endl;
-        if (!poly_union.empty()) { // ignore invalid box
-          union_area = bg::area(poly_union.front());
-          // std::cout << "post union area" << std::endl;
-          // std::cout << union_area << "debug" << std::endl;
-          overlap = inter_area / union_area;
-          if (overlap >= thresh)
-            suppressed_rw(j) = 1;
-          poly_union.clear();
-        }
-      }
-      poly.clear();
-      qpoly.clear();
-      poly_inter.clear();
-    }
-  }
-  return keep;
-}
-#ifdef TV_CUDA
-constexpr int const threadsPerBlock = sizeof(unsigned long long) * 8;
-
-template <typename DType>
-int non_max_suppression(py::array_t<DType> boxes, py::array_t<int> keep_out,
-                        DType nms_overlap_thresh, int device_id) {
-  py::buffer_info info = boxes.request();
-  auto boxes_ptr = static_cast<DType *>(info.ptr);
-  py::buffer_info info_k = keep_out.request();
-  auto keep_out_ptr = static_cast<int *>(info_k.ptr);
-
-  return _nms_gpu<DType, threadsPerBlock>(keep_out_ptr, boxes_ptr,
-                                          boxes.shape(0), boxes.shape(1),
-                                          nms_overlap_thresh, device_id);
-}
-#endif
-
-} // namespace spconv
-#endif
--- a/include/spconv/nms_functor.h
+++ b/include/spconv/nms_functor.h
-// Copyright 2019 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef NMS_FUNCTOR_H_
-#define NMS_FUNCTOR_H_
-#include <tensorview/tensorview.h>
-
-namespace spconv {
-namespace functor {
-template <typename Device, typename T, typename Index>
-struct NonMaxSupressionFunctor {
-  Index operator()(const Device &d, tv::TensorView<Index> keep,
-                   tv::TensorView<const T> boxes, T threshold, T eps);
-};
-
-template <typename Device, typename T, typename Index>
-struct rotateNonMaxSupressionFunctor {
-  Index operator()(const Device &d, tv::TensorView<Index> keep,
-                   tv::TensorView<const T> boxCorners,
-                   tv::TensorView<const T> standupIoU, T threshold);
-};
-
-} // namespace functor
-} // namespace spconv
-
-#endif
\ No newline at end of file
--- a/include/spconv/nms_gpu.h
+++ b/include/spconv/nms_gpu.h
-// Copyright 2019 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-template <typename DType, int BLOCK_THREADS>
-int _nms_gpu(int *keep_out, const DType *boxes_host, int boxes_num,
-             int boxes_dim, DType nms_overlap_thresh, int device_id);
--- a/include/spconv/nms_ops.h
+++ b/include/spconv/nms_ops.h
-// Copyright 2019 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef NMS_TORCH_OP_H_
-#define NMS_TORCH_OP_H_
-
-#include <spconv/indice.h>
-#include <spconv/nms_functor.h>
-#include <spconv/reordering.h>
-#include <tensorview/torch_utils.h>
-#include <torch/script.h>
-#include <utility/timer.h>
-
-namespace spconv {
-// torch.jit's doc says only support int64, so we need to convert to int32.
-template <typename T>
-torch::Tensor nonMaxSuppression(torch::Tensor boxes, torch::Tensor scores,
-                                int64_t preMaxSize, int64_t postMaxSize,
-                                double thresh, double eps) {
-  // auto timer = spconv::CudaContextTimer<>();
-  tv::check_torch_dtype<T>(boxes);
-  auto resOptions =
-      torch::TensorOptions().dtype(torch::kInt64).device(boxes.device());
-  if (boxes.size(0) == 0) {
-    return torch::zeros({0}, resOptions);
-  }
-  torch::Tensor indices;
-  if (preMaxSize > 0) {
-    auto numKeepedScores = scores.size(0);
-    preMaxSize = std::min(numKeepedScores, preMaxSize);
-    auto res = torch::topk(scores, preMaxSize);
-    indices = std::get<1>(res);
-    boxes = torch::index_select(boxes, 0, indices);
-  } else {
-    indices = std::get<1>(torch::sort(scores));
-    boxes = torch::index_select(boxes, 0, indices);
-  }
-  if (boxes.size(0) == 0)
-    return torch::zeros({0}, resOptions);
-
-  auto keep = torch::zeros({boxes.size(0)}, resOptions);
-  int64_t keepNum = 0;
-  if (boxes.device().type() == torch::kCPU) {
-    auto nmsFunctor = functor::NonMaxSupressionFunctor<tv::CPU, T, int64_t>();
-    keepNum = nmsFunctor(tv::CPU(), tv::torch2tv<int64_t>(keep),
-                         tv::torch2tv<const T>(boxes), T(thresh), T(eps));
-  } else {
-    TV_ASSERT_RT_ERR(false, "not implemented");
-  }
-  if (postMaxSize <= 0) {
-    postMaxSize = keepNum;
-  }
-  // std::cout << keep << std::endl;
-  keep = keep.slice(0, 0, std::min(keepNum, postMaxSize));
-  if (preMaxSize > 0) {
-    return torch::index_select(indices, 0, keep);
-  }
-  return keep;
-}
-
-} // namespace spconv
-
-#endif
\ No newline at end of file
--- a/include/spconv/pillar_scatter_functor.h
+++ b/include/spconv/pillar_scatter_functor.h
-// Copyright 2019 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef POINTPILLARS_SCATTER_FUNCTOR_H_
-#define POINTPILLARS_SCATTER_FUNCTOR_H_
-#include <tensorview/tensorview.h>
-
-namespace spconv {
-namespace functor {
-template <typename Device, typename T, typename Index>
-struct PointPillarScatter {
-  void operator()(const Device &d, tv::TensorView<T> canvas,
-                  tv::TensorView<const T> features,
-                  tv::TensorView<const T> coors);
-};
-
-} // namespace functor
-} // namespace spconv
-
-#endif
\ No newline at end of file
--- a/include/spconv/pillar_scatter_ops.h
+++ b/include/spconv/pillar_scatter_ops.h
-// Copyright 2019 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef PILLAR_SCATTER_OP_H_
-#define PILLAR_SCATTER_OP_H_
-
-#include <spconv/pillar_scatter_functor.h>
-#include <tensorview/torch_utils.h>
-#include <torch/script.h>
-#include <utility/timer.h>
-
-namespace spconv {
-// torch.jit's doc says only support int64, so we need to convert to int32.
-
-template <typename T>
-torch::Tensor pointPillarScatter(torch::Tensor features, torch::Tensor coors,
-                                 torch::Tensor shape) {
-  TV_ASSERT_RT_ERR(shape.device().type() == torch::kCPU, "error");
-  TV_ASSERT_RT_ERR(features.device().type() == torch::kCUDA, "error");
-  TV_ASSERT_RT_ERR(shape.dim() == 1, "error");
-  TV_ASSERT_RT_ERR(shape.size(0) == 4, "error");
-  TV_ASSERT_RT_ERR(features.dim() >= 3, "error");
-  TV_ASSERT_RT_ERR(features.size(0) == 1, "feature first dim must be 1");
-  TV_ASSERT_RT_ERR(coors.size(0) == 1, "coors first dim must be 1");
-  TV_ASSERT_RT_ERR(features.size(2) == coors.size(2), "err");
-
-  tv::check_torch_dtype<int>(shape);
-  tv::check_torch_dtype<T>(coors);
-  auto shapeData = shape.data_ptr<int>();
-  torch::Tensor canvas =
-      torch::zeros({shapeData[0], shapeData[1], shapeData[2], shapeData[3]},
-                   features.options());
-  TV_ASSERT_RT_ERR(shapeData[1] == features.size(1), "error");
-#ifdef TV_CUDA
-  functor::PointPillarScatter<tv::GPU, T, int> ftor;
-  ftor(tv::TorchGPU(), tv::torch2tv<T>(canvas),
-       tv::torch2tv<const T>(features.squeeze()),
-       tv::torch2tv<const T>(coors.squeeze()));
-#endif
-  return canvas;
-}
-
-} // namespace spconv
-
-#endif
\ No newline at end of file
--- a/include/spconv/point2voxel.h
+++ b/include/spconv/point2voxel.h
-// Copyright 2019 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <pybind11/pybind11.h>
-// must include pybind11/eigen.h if using eigen matrix as arguments.
-// must include pybind11/stl.h if using containers in STL in arguments.
-#include <algorithm>
-#include <pybind11/numpy.h>
-#include <pybind11/stl.h>
-// #include <vector>
-#include <iostream>
-#include <math.h>
-
-namespace spconv {
-namespace py = pybind11;
-using namespace pybind11::literals;
-
-template <typename DType, int NDim>
-int points_to_voxel_3d_np(py::array_t<DType> points, py::array_t<DType> voxels,
-                          py::array_t<DType> voxel_point_mask,
-                          py::array_t<int> coors,
-                          py::array_t<int> num_points_per_voxel,
-                          py::array_t<int> coor_to_voxelidx,
-                          std::vector<DType> voxel_size,
-                          std::vector<DType> coors_range, int max_points,
-                          int max_voxels) {
-  auto points_rw = points.template mutable_unchecked<2>();
-  auto voxels_rw = voxels.template mutable_unchecked<3>();
-  auto voxel_point_mask_rw = voxel_point_mask.template mutable_unchecked<2>();
-  auto coors_rw = coors.mutable_unchecked<2>();
-  auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
-  auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
-  auto N = points_rw.shape(0);
-  auto num_features = points_rw.shape(1);
-  // auto ndim = points_rw.shape(1) - 1;
-  constexpr int ndim_minus_1 = NDim - 1;
-  int voxel_num = 0;
-  bool failed = false;
-  int coor[NDim];
-  int c;
-  int grid_size[NDim];
-  for (int i = 0; i < NDim; ++i) {
-    grid_size[i] =
-        round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
-  }
-  int voxelidx, num;
-  for (int i = 0; i < N; ++i) {
-    failed = false;
-    for (int j = 0; j < NDim; ++j) {
-      c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
-      if ((c < 0 || c >= grid_size[j])) {
-        failed = true;
-        break;
-      }
-      coor[ndim_minus_1 - j] = c;
-    }
-    if (failed)
-      continue;
-    voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
-    if (voxelidx == -1) {
-      voxelidx = voxel_num;
-      if (voxel_num >= max_voxels)
-        continue;
-      voxel_num += 1;
-      coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
-      for (int k = 0; k < NDim; ++k) {
-        coors_rw(voxelidx, k) = coor[k];
-      }
-    }
-    num = num_points_per_voxel_rw(voxelidx);
-    if (num < max_points) {
-      voxel_point_mask_rw(voxelidx, num) = DType(1);
-      for (int k = 0; k < num_features; ++k) {
-        voxels_rw(voxelidx, num, k) = points_rw(i, k);
-      }
-      num_points_per_voxel_rw(voxelidx) += 1;
-    }
-  }
-  for (int i = 0; i < voxel_num; ++i) {
-    coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1;
-  }
-  return voxel_num;
-}
-
-/// Groups points into voxels and keeps a running per-voxel feature mean.
-///
-/// A point is accepted when each of its first NDim features maps to a grid
-/// cell inside `coors_range`; the cell index is stored with the axis order
-/// reversed. The first point seen in a cell claims the next free voxel slot
-/// (at most `max_voxels` slots), and each slot keeps at most `max_points`
-/// points. Once all points are processed, the unused point rows of every
-/// produced voxel are filled with that voxel's mean.
-///
-/// `coor_to_voxelidx` is used as a lookup table: -1 marks an unclaimed cell,
-/// and every cell claimed here is reset to -1 before returning. The lookup
-/// always passes exactly three indices.
-///
-/// NOTE(review): `means` and `num_points_per_voxel` are accumulated into and
-/// never cleared here -- presumably the caller zero-initialises them; confirm.
-///
-/// @return the number of voxels produced.
-template <typename DType, int NDim>
-int points_to_voxel_3d_np_mean(
-    py::array_t<DType> points, py::array_t<DType> voxel_point_mask,
-    py::array_t<DType> voxels, py::array_t<DType> means, py::array_t<int> coors,
-    py::array_t<int> num_points_per_voxel, py::array_t<int> coor_to_voxelidx,
-    std::vector<DType> voxel_size, std::vector<DType> coors_range,
-    int max_points, int max_voxels) {
-  auto pts = points.template mutable_unchecked<2>();
-  auto mean_acc = means.template mutable_unchecked<2>();
-  auto voxel_buf = voxels.template mutable_unchecked<3>();
-  auto point_mask = voxel_point_mask.template mutable_unchecked<2>();
-  auto voxel_coors = coors.mutable_unchecked<2>();
-  auto point_counts = num_points_per_voxel.mutable_unchecked<1>();
-  auto cell_to_slot = coor_to_voxelidx.mutable_unchecked<NDim>();
-  const auto total_points = pts.shape(0);
-  const auto feature_count = pts.shape(1);
-  constexpr int last_axis = NDim - 1;
-
-  // Number of cells along each axis of the range.
-  int grid_extent[NDim];
-  for (int axis = 0; axis < NDim; ++axis) {
-    grid_extent[axis] = round(
-        (coors_range[NDim + axis] - coors_range[axis]) / voxel_size[axis]);
-  }
-
-  int voxel_num = 0;
-  for (int p = 0; p < total_points; ++p) {
-    // Quantise the point; drop it as soon as one axis is out of range.
-    int cell_idx[NDim];
-    bool inside = true;
-    for (int axis = 0; axis < NDim; ++axis) {
-      const int cell =
-          floor((pts(p, axis) - coors_range[axis]) / voxel_size[axis]);
-      if (cell < 0 || cell >= grid_extent[axis]) {
-        inside = false;
-        break;
-      }
-      cell_idx[last_axis - axis] = cell;
-    }
-    if (!inside)
-      continue;
-
-    int slot = cell_to_slot(cell_idx[0], cell_idx[1], cell_idx[2]);
-    if (slot == -1) {
-      // Unclaimed cell: take the next slot unless the budget is exhausted.
-      if (voxel_num >= max_voxels)
-        continue;
-      slot = voxel_num;
-      voxel_num += 1;
-      cell_to_slot(cell_idx[0], cell_idx[1], cell_idx[2]) = slot;
-      for (int d = 0; d < NDim; ++d) {
-        voxel_coors(slot, d) = cell_idx[d];
-      }
-    }
-
-    const int filled = point_counts(slot);
-    if (filled >= max_points)
-      continue;
-    point_mask(slot, filled) = DType(1);
-    for (int f = 0; f < feature_count; ++f) {
-      voxel_buf(slot, filled, f) = pts(p, f);
-    }
-    point_counts(slot) += 1;
-    // Incremental mean: `filled` is the point count before this point.
-    for (int f = 0; f < feature_count; ++f) {
-      mean_acc(slot, f) += (pts(p, f) - mean_acc(slot, f)) / DType(filled + 1);
-    }
-  }
-
-  for (int v = 0; v < voxel_num; ++v) {
-    // Restore the lookup table, then pad the empty rows with the mean.
-    cell_to_slot(voxel_coors(v, 0), voxel_coors(v, 1), voxel_coors(v, 2)) = -1;
-    const int filled = point_counts(v);
-    for (int row = filled; row < max_points; ++row) {
-      for (int f = 0; f < feature_count; ++f) {
-        voxel_buf(v, row, f) = mean_acc(v, f);
-      }
-    }
-  }
-  return voxel_num;
-}
-
-/// Groups points into voxels and flags voxels by local height range.
-///
-/// Voxel assignment: a point is accepted when each of its first NDim features
-/// maps to a grid cell inside `coors_range` (cell index stored with the axis
-/// order reversed); the first point in a cell claims the next free slot, up
-/// to `max_voxels`, and each slot keeps at most `max_points` points.
-///
-/// While points are stored, `mins`/`maxs` are updated with feature 2 of the
-/// point at block (cell[1] / block_factor, cell[2] / block_factor).
-/// Afterwards, for every produced voxel, the min and max are taken over its
-/// own block and a clamped window of `block_size` blocks around it, and
-/// `voxel_mask` is set to whether (max - min) lies strictly between
-/// `height_threshold` and `height_high_threshold`.
-///
-/// `coor_to_voxelidx` is used as a lookup table: -1 marks an unclaimed cell,
-/// and every cell claimed here is reset to -1 before returning.
-///
-/// NOTE(review): `mins`, `maxs` and `num_points_per_voxel` are only updated,
-/// never initialised here -- presumably the caller prepares them; confirm.
-///
-/// @return the number of voxels produced.
-template <typename DType, int NDim>
-int points_to_voxel_3d_with_filtering(
-    py::array_t<DType> points, py::array_t<DType> voxels,
-    py::array_t<DType> voxel_point_mask, py::array_t<int> voxel_mask,
-    py::array_t<DType> mins, py::array_t<DType> maxs, py::array_t<int> coors,
-    py::array_t<int> num_points_per_voxel, py::array_t<int> coor_to_voxelidx,
-    std::vector<DType> voxel_size, std::vector<DType> coors_range,
-    int max_points, int max_voxels, int block_factor, int block_size,
-    DType height_threshold, DType height_high_threshold) {
-  auto pts = points.template mutable_unchecked<2>();
-  auto block_min = mins.template mutable_unchecked<2>();
-  auto block_max = maxs.template mutable_unchecked<2>();
-  auto voxel_buf = voxels.template mutable_unchecked<3>();
-  auto point_mask = voxel_point_mask.template mutable_unchecked<2>();
-  auto keep_mask = voxel_mask.template mutable_unchecked<1>();
-  auto voxel_coors = coors.mutable_unchecked<2>();
-  auto point_counts = num_points_per_voxel.mutable_unchecked<1>();
-  auto cell_to_slot = coor_to_voxelidx.mutable_unchecked<NDim>();
-  const auto total_points = pts.shape(0);
-  const auto feature_count = pts.shape(1);
-  constexpr int last_axis = NDim - 1;
-
-  // Number of cells along each axis of the range.
-  int grid_extent[NDim];
-  for (int axis = 0; axis < NDim; ++axis) {
-    grid_extent[axis] = round(
-        (coors_range[NDim + axis] - coors_range[axis]) / voxel_size[axis]);
-  }
-  const int block_shape_H = grid_extent[1] / block_factor;
-  const int block_shape_W = grid_extent[0] / block_factor;
-
-  int voxel_num = 0;
-  for (int p = 0; p < total_points; ++p) {
-    // Quantise the point; drop it as soon as one axis is out of range.
-    int cell_idx[NDim];
-    bool inside = true;
-    for (int axis = 0; axis < NDim; ++axis) {
-      const int cell =
-          floor((pts(p, axis) - coors_range[axis]) / voxel_size[axis]);
-      if (cell < 0 || cell >= grid_extent[axis]) {
-        inside = false;
-        break;
-      }
-      cell_idx[last_axis - axis] = cell;
-    }
-    if (!inside)
-      continue;
-
-    int slot = cell_to_slot(cell_idx[0], cell_idx[1], cell_idx[2]);
-    if (slot == -1) {
-      // Unclaimed cell: take the next slot unless the budget is exhausted.
-      if (voxel_num >= max_voxels)
-        continue;
-      slot = voxel_num;
-      voxel_num += 1;
-      cell_to_slot(cell_idx[0], cell_idx[1], cell_idx[2]) = slot;
-      for (int d = 0; d < NDim; ++d) {
-        voxel_coors(slot, d) = cell_idx[d];
-      }
-    }
-
-    const int filled = point_counts(slot);
-    if (filled >= max_points)
-      continue;
-    point_mask(slot, filled) = DType(1);
-    for (int f = 0; f < feature_count; ++f) {
-      voxel_buf(slot, filled, f) = pts(p, f);
-    }
-    // Track the extremes of feature 2 for the block containing this cell.
-    const int block_a = cell_idx[1] / block_factor;
-    const int block_b = cell_idx[2] / block_factor;
-    block_min(block_a, block_b) =
-        std::min(pts(p, 2), block_min(block_a, block_b));
-    block_max(block_a, block_b) =
-        std::max(pts(p, 2), block_max(block_a, block_b));
-    point_counts(slot) += 1;
-  }
-
-  const int half_window = block_size / 2;
-  for (int v = 0; v < voxel_num; ++v) {
-    const int c1 = voxel_coors(v, 1);
-    const int c2 = voxel_coors(v, 2);
-    // Restore the lookup table for the next call.
-    cell_to_slot(voxel_coors(v, 0), c1, c2) = -1;
-
-    const int block_a = c1 / block_factor;
-    const int block_b = c2 / block_factor;
-    DType min_value = block_min(block_a, block_b);
-    DType max_value = block_max(block_a, block_b);
-
-    // Window of blocks around the voxel's block, clamped to the block grid.
-    const int a_begin = std::max(0, block_a - half_window);
-    const int a_end =
-        std::min(block_shape_H, block_a + block_size - half_window);
-    const int b_begin = std::max(0, block_b - half_window);
-    const int b_end =
-        std::min(block_shape_W, block_b + block_size - half_window);
-    for (int a = a_begin; a < a_end; ++a) {
-      for (int b = b_begin; b < b_end; ++b) {
-        min_value = std::min(min_value, block_min(a, b));
-        max_value = std::max(max_value, block_max(a, b));
-      }
-    }
-
-    const bool above_low = (max_value - min_value) > height_threshold;
-    const bool below_high = (max_value - min_value) < height_high_threshold;
-    keep_mask(v) = above_low && below_high;
-  }
-  return voxel_num;
-}
-
-} // namespace spconv
--- a/include/spconv/pool_ops.h
+++ b/include/spconv/pool_ops.h
-// Copyright 2019 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef SPARSE_POOL_OP_H_
-#define SPARSE_POOL_OP_H_
-
-#include <spconv/maxpool.h>
-#include <tensorview/torch_utils.h>
-#include <torch/script.h>
-#include <utility/timer.h>
-
-namespace spconv {
-
-/// Declaration only; the definition is not part of this header.
-/// NOTE(review): presumably the forward pass of sparse max pooling driven by
-/// `indicePairs` / `indiceNum`, producing `numAct` output rows -- confirm
-/// against the implementation.
-torch::Tensor indiceMaxPool(torch::Tensor features,
-                            torch::Tensor indicePairs,
-                            torch::Tensor indiceNum,
-                            int64_t numAct);
-
-/// Declaration only; the definition is not part of this header.
-/// NOTE(review): presumably the backward pass matching indiceMaxPool, taking
-/// the forward inputs/outputs plus the output gradient -- confirm against the
-/// implementation.
-torch::Tensor indiceMaxPoolBackward(torch::Tensor features,
-                                    torch::Tensor outFeatures,
-                                    torch::Tensor outGrad,
-                                    torch::Tensor indicePairs,
-                                    torch::Tensor indiceNum);
-
-} // namespace spconv
-
-#endif
\ No newline at end of file
--- a/include/spconv/reordering.cu.h
+++ b/include/spconv/reordering.cu.h
-// Copyright 2019 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef REORDERING_CU_H_
-#define REORDERING_CU_H_
-#include <THC/THCAtomics.cuh>
-#include <THC/THCNumerics.cuh>
-#include <cuda_fp16.h>
-#include <tensorview/kernel_utils.h>
-
-#if PYTORCH_VERSION < 10500
-#define TH_ATOMIC_ADD atomicAdd
-#else
-#define TH_ATOMIC_ADD gpuAtomicAdd
-#endif
-
-// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
-namespace spconv {
-
-/// Copies selected feature rows into a dense buffer:
-/// buffer[r, :] = features[indices[r], :] for every row r < size.
-///
-/// Both arrays are addressed as rows of `numPlanes` elements. For each `ix`
-/// produced by tv::KernelLoopX a thread handles up to NumILP rows spaced
-/// gridDim.x * blockDim.x apart; `iy` from tv::KernelLoopY is the column.
-/// NumTLP is not used in the body.
-template <typename T, typename Index, int NumTLP, int NumILP>
-__global__ void gatherGenericKernel(T *buffer, const T *features,
-                                    const Index *indices, int size,
-                                    int numPlanes) {
-  int rowStride[NumILP];
-  Index srcBase[NumILP];
-#pragma unroll
-  for (int ilp = 0; ilp < NumILP; ilp++)
-    rowStride[ilp] = ilp * gridDim.x * blockDim.x;
-
-  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
-    // Resolve the source row offsets once per batch of rows.
-#pragma unroll
-    for (int ilp = 0; ilp < NumILP; ilp++) {
-      const int row = ix + rowStride[ilp];
-      if (row < size)
-        srcBase[ilp] = indices[row] * numPlanes;
-    }
-    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
-#pragma unroll
-      for (int ilp = 0; ilp < NumILP; ++ilp) {
-        const int row = ix + rowStride[ilp];
-        if (row < size)
-          buffer[row * numPlanes + iy] = features[srcBase[ilp] + iy];
-      }
-    }
-  }
-}
-
-/// Vectorised variant of the row gather: same access pattern as the generic
-/// kernel, but `buffer` and `features` are reinterpreted as arrays of VecType
-/// before indexing, so `numPlanes` and the column index count VecType
-/// elements rather than T elements.
-/// NumTLP is not used in the body.
-template <typename T, typename Index, int NumTLP, int NumILP, typename VecType>
-__global__ void gatherVecKernel(T *buffer, const T *features,
-                                const Index *indices, int size, int numPlanes) {
-  int rowStride[NumILP];
-  Index srcBase[NumILP];
-#pragma unroll
-  for (int ilp = 0; ilp < NumILP; ilp++)
-    rowStride[ilp] = ilp * gridDim.x * blockDim.x;
-
-  VecType *dstVec = reinterpret_cast<VecType *>(buffer);
-  const VecType *srcVec = reinterpret_cast<const VecType *>(features);
-
-  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
-#pragma unroll
-    for (int ilp = 0; ilp < NumILP; ilp++) {
-      const int row = ix + rowStride[ilp];
-      if (row < size)
-        srcBase[ilp] = indices[row] * numPlanes;
-    }
-    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
-#pragma unroll
-      for (int ilp = 0; ilp < NumILP; ++ilp) {
-        const int row = ix + rowStride[ilp];
-        if (row < size)
-          dstVec[row * numPlanes + iy] = srcVec[srcBase[ilp] + iy];
-      }
-    }
-  }
-}
-
-/// Block variant of the vectorised row gather.
-///
-/// `features` and `buffer` are first advanced by blockIdx.y * NumTLP elements
-/// of T, then reinterpreted as VecType arrays; threadIdx.y selects the
-/// VecType column within a row of `numPlanes` VecType elements.
-///
-/// There is no `row < size` guard in this kernel.
-/// NOTE(review): presumably the launch covers an exact multiple of the rows
-/// handled per pass and a separate kernel handles the remainder -- confirm
-/// at the call site.
-template <typename T, typename Index, int NumTLP, int NumILP,
-          typename VecType = int4>
-__global__ void gatherVecBlockKernel(T *buffer, const T *features,
-                                     const Index *indices, int size,
-                                     int numPlanes) {
-  int rowStride[NumILP];
-#pragma unroll
-  for (int ilp = 0; ilp < NumILP; ilp++)
-    rowStride[ilp] = ilp * gridDim.x * blockDim.x;
-
-  const VecType *srcVec =
-      reinterpret_cast<const VecType *>(features + blockIdx.y * NumTLP);
-  VecType *dstVec = reinterpret_cast<VecType *>(buffer + blockIdx.y * NumTLP);
-
-  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
-#pragma unroll
-    for (int ilp = 0; ilp < NumILP; ++ilp) {
-      const int row = ix + rowStride[ilp];
-      dstVec[row * numPlanes + threadIdx.y] =
-          srcVec[indices[row] * numPlanes + threadIdx.y];
-    }
-  }
-}
-
-/// Batched row gather with zero fill.
-///
-/// For every output row r < size the source row is looked up at
-/// indices[(r / feature_batch_stride) * indice_batch_stride +
-///         r % feature_batch_stride].
-/// A looked-up value of -1 writes T(0) into the whole output row; any other
-/// value copies that row of `features`. Rows hold `numPlanes` elements.
-/// NumTLP is not used in the body.
-template <typename T, typename Index, int NumTLP, int NumILP>
-__global__ void batchGatherGenericKernel(T *buffer, const T *features,
-                                         const Index *indices, int size,
-                                         int numPlanes, int indice_batch_stride,
-                                         int feature_batch_stride) {
-  // size: max indice num * kernel volume
-  // inds: [volume, num_elems]
-  int rowStride[NumILP];
-  Index srcRow[NumILP];
-#pragma unroll
-  for (int ilp = 0; ilp < NumILP; ilp++)
-    rowStride[ilp] = ilp * gridDim.x * blockDim.x;
-
-  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
-#pragma unroll
-    for (int ilp = 0; ilp < NumILP; ilp++) {
-      if (ix + rowStride[ilp] < size) {
-        const Index elem = ix + rowStride[ilp];
-        srcRow[ilp] =
-            indices[(elem / feature_batch_stride) * indice_batch_stride +
-                    elem % feature_batch_stride];
-      }
-    }
-    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
-#pragma unroll
-      for (int ilp = 0; ilp < NumILP; ++ilp) {
-        const int row = ix + rowStride[ilp];
-        if (row >= size)
-          continue;
-        if (srcRow[ilp] != -1) {
-          buffer[row * numPlanes + iy] = features[srcRow[ilp] * numPlanes + iy];
-        } else {
-          // -1 marks a missing source row.
-          buffer[row * numPlanes + iy] = T(0);
-        }
-      }
-    }
-  }
-}
-
-/// Vectorised batched row gather with zero fill.
-///
-/// For every output row r < size the source row is looked up at
-/// indices[(e / feature_batch_stride) * indice_batch_stride +
-///         e % feature_batch_stride], with e = r + feature_offset.
-/// A looked-up value of -1 writes a zero vector; any other value copies the
-/// matching VecType element of `features`. `buffer` and `features` are
-/// reinterpreted as VecType arrays, so `numPlanes` and the column index count
-/// VecType elements. NumTLP is not used in the body.
-///
-/// The zero vector is built from elements of T and aligned for VecType, so it
-/// always spans exactly one VecType regardless of sizeof(Index).
-template <typename T, typename Index, int NumTLP, int NumILP, typename VecType>
-__global__ void
-batchGatherVecKernel(T *buffer, const T *features, const Index *indices,
-                     int size, int feature_offset, int numPlanes,
-                     int indice_batch_stride, int feature_batch_stride) {
-  constexpr int kVecElems = sizeof(VecType) / sizeof(T);
-  int ILPStrideX[NumILP];
-  Index inds[NumILP];
-  // Element type must be T (not Index): kVecElems * sizeof(T) is exactly
-  // sizeof(VecType), which is what the vector read below requires.
-  alignas(VecType) T zero[kVecElems];
-#pragma unroll
-  for (int i = 0; i < kVecElems; ++i) {
-    zero[i] = T(0);
-  }
-
-  Index inds_elem;
-#pragma unroll
-  for (int ilp = 0; ilp < NumILP; ilp++)
-    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
-
-  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
-#pragma unroll
-    for (int ilp = 0; ilp < NumILP; ilp++) {
-      if (ix + ILPStrideX[ilp] < size) {
-        inds_elem = ix + ILPStrideX[ilp] + feature_offset;
-        inds[ilp] =
-            indices[(inds_elem / feature_batch_stride) * indice_batch_stride +
-                    inds_elem % feature_batch_stride];
-      }
-    }
-    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
-#pragma unroll
-      for (int ilp = 0; ilp < NumILP; ++ilp) {
-        if (ix + ILPStrideX[ilp] < size) {
-          if (inds[ilp] != -1) {
-            reinterpret_cast<VecType *>(
-                buffer)[(ix + ILPStrideX[ilp]) * numPlanes + iy] =
-                reinterpret_cast<const VecType *>(
-                    features)[inds[ilp] * numPlanes + iy];
-
-          } else {
-            // -1 marks a missing source row.
-            reinterpret_cast<VecType *>(
-                buffer)[(ix + ILPStrideX[ilp]) * numPlanes + iy] =
-                reinterpret_cast<const VecType *>(zero)[0];
-          }
-        }
-      }
-    }
-  }
-}
-
-/// Block variant of the vectorised batched row gather with zero fill.
-///
-/// `features` and `buffer` are first advanced by blockIdx.y * NumTLP elements
-/// of T, then reinterpreted as VecType arrays; threadIdx.y selects the
-/// VecType column. The source row for output row r is looked up at
-/// indices[(r / feature_batch_stride) * indice_batch_stride +
-///         r % feature_batch_stride]; -1 writes a zero vector.
-///
-/// There is no `row < size` guard in this kernel.
-/// NOTE(review): presumably the launch covers an exact multiple of the rows
-/// handled per pass -- confirm at the call site.
-///
-/// The zero vector is built from elements of T and aligned for VecType, so it
-/// always spans exactly one VecType regardless of sizeof(Index).
-template <typename T, typename Index, int NumTLP, int NumILP,
-          typename VecType = int4>
-__global__ void
-batchGatherVecBlockKernel(T *buffer, const T *features, const Index *indices,
-                          int size, int numPlanes, int indice_batch_stride,
-                          int feature_batch_stride) {
-  constexpr int kVecElems = sizeof(VecType) / sizeof(T);
-  int ILPStrideX[NumILP];
-  Index inds;
-#pragma unroll
-  for (int ilp = 0; ilp < NumILP; ilp++)
-    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
-  features += blockIdx.y * NumTLP;
-  buffer += blockIdx.y * NumTLP;
-
-  Index inds_elem;
-  // Element type must be T (not Index): kVecElems * sizeof(T) is exactly
-  // sizeof(VecType), which is what the vector read below requires.
-  alignas(VecType) T zero[kVecElems];
-#pragma unroll
-  for (int i = 0; i < kVecElems; ++i) {
-    zero[i] = T(0);
-  }
-
-  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
-
-#pragma unroll
-    for (int ilp = 0; ilp < NumILP; ++ilp) {
-      inds_elem = ix + ILPStrideX[ilp];
-      inds = indices[(inds_elem / feature_batch_stride) * indice_batch_stride +
-                     inds_elem % feature_batch_stride];
-
-      if (inds != -1) {
-        reinterpret_cast<VecType *>(
-            buffer)[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y] =
-            reinterpret_cast<const VecType *>(
-                features)[inds * numPlanes + threadIdx.y];
-      } else {
-        // -1 marks a missing source row.
-        reinterpret_cast<VecType *>(
-            buffer)[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y] =
-            reinterpret_cast<const VecType *>(zero)[0];
-      }
-    }
-  }
-}
-
-/// Accumulates buffer rows into selected output rows:
-/// outFeatures[indices[r], :] += buffer[r, :] for every row r < size.
-///
-/// Rows hold `numPlanes` elements. The update is a plain `+=`, not an atomic.
-/// NOTE(review): presumably `indices` holds no duplicates within one launch;
-/// confirm at the call site.
-/// NumTLP is not used in the body.
-template <typename T, typename Index, int NumTLP, int NumILP>
-__global__ void scatterAddGenericKernel(T *outFeatures, const T *buffer,
-                                        const Index *indices, int size,
-                                        int numPlanes) {
-  int rowStride[NumILP];
-  Index dstBase[NumILP];
-#pragma unroll
-  for (int ilp = 0; ilp < NumILP; ilp++)
-    rowStride[ilp] = ilp * gridDim.x * blockDim.x;
-  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
-    // Resolve the destination row offsets once per batch of rows.
-#pragma unroll
-    for (int ilp = 0; ilp < NumILP; ilp++) {
-      const int row = ix + rowStride[ilp];
-      if (row < size)
-        dstBase[ilp] = indices[row] * numPlanes;
-    }
-    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
-#pragma unroll
-      for (int ilp = 0; ilp < NumILP; ++ilp) {
-        const int row = ix + rowStride[ilp];
-        if (row < size) {
-          outFeatures[dstBase[ilp] + iy] += buffer[row * numPlanes + iy];
-        }
-      }
-    }
-  }
-}
-
-/// Block variant of scatter-add that moves data one VecType at a time.
-///
-/// `outFeatures` and `buffer` are first advanced by blockIdx.y * NumTLP
-/// elements of T. For each row, one VecType is loaded from the destination
-/// and one from `buffer`, the two are added element by element, and the sum
-/// is stored back to the destination. When T is at::Half and the device
-/// architecture is at least 530, the addition uses __hadd2 on __half2 pairs;
-/// otherwise it is a per-element `+=`.
-///
-/// There is no `row < size` guard, and the read-modify-write is not atomic.
-/// NOTE(review): presumably the launch covers an exact multiple of the rows
-/// handled per pass and `indices` holds no duplicates -- confirm at the call
-/// site.
-template <typename T, typename Index, int NumTLP, int NumILP,
-          typename VecType = int4>
-__global__ void scatterAddVecBlockKernel(T *outFeatures, const T *buffer,
-                                         const Index *indices, int size,
-                                         int numPlanes) {
-  int rowStride[NumILP];
-  constexpr int elemsPerVec = sizeof(VecType) / sizeof(T);
-  constexpr int half2PerVec = sizeof(VecType) / sizeof(__half2);
-
-#pragma unroll
-  for (int ilp = 0; ilp < NumILP; ilp++)
-    rowStride[ilp] = ilp * gridDim.x * blockDim.x;
-  outFeatures += blockIdx.y * NumTLP;
-  buffer += blockIdx.y * NumTLP;
-  T acc[elemsPerVec];
-  T addend[elemsPerVec];
-  Index dstIdx;
-  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
-#pragma unroll
-    for (int ilp = 0; ilp < NumILP; ++ilp) {
-      const int row = ix + rowStride[ilp];
-      dstIdx = indices[row] * numPlanes + threadIdx.y;
-      // Load the current destination vector and the vector to add.
-      reinterpret_cast<VecType *>(acc)[0] =
-          reinterpret_cast<VecType *>(outFeatures)[dstIdx];
-      reinterpret_cast<VecType *>(addend)[0] =
-          reinterpret_cast<const VecType *>(
-              buffer)[row * numPlanes + threadIdx.y];
-      if (std::is_same<T, at::Half>::value) {
-#if __CUDA_ARCH__ >= 530
-#pragma unroll
-        for (int h = 0; h < half2PerVec; h++) {
-          reinterpret_cast<__half2 *>(acc)[h] =
-              __hadd2(reinterpret_cast<__half2 *>(acc)[h],
-                      reinterpret_cast<__half2 *>(addend)[h]);
-        }
-#else
-#pragma unroll
-        for (int e = 0; e < elemsPerVec; e++) {
-          acc[e] += addend[e];
-        }
-#endif
-      } else {
-#pragma unroll
-        for (int e = 0; e < elemsPerVec; e++) {
-          acc[e] += addend[e];
-        }
-      }
-      reinterpret_cast<VecType *>(outFeatures)[dstIdx] =
-          reinterpret_cast<VecType *>(acc)[0];
-    }
-  }
-}
-
-/// Block variant of scatter-add, one element of T per thread and row.
-///
-/// `outFeatures` and `buffer` are first advanced by blockIdx.y * NumTLP
-/// elements; threadIdx.y selects the column within a row of `numPlanes`
-/// elements. For each row r:
-/// outFeatures[indices[r], threadIdx.y] += buffer[r, threadIdx.y].
-///
-/// There is no `row < size` guard, and the update is a plain `+=`.
-/// NOTE(review): presumably the launch covers an exact multiple of the rows
-/// handled per pass and `indices` holds no duplicates -- confirm at the call
-/// site.
-template <typename T, typename Index, int NumTLP, int NumILP>
-__global__ void scatterAddBlockKernel(T *outFeatures, const T *buffer,
-                                      const Index *indices, int size,
-                                      int numPlanes) {
-  int rowStride[NumILP];
-#pragma unroll
-  for (int ilp = 0; ilp < NumILP; ilp++)
-    rowStride[ilp] = ilp * gridDim.x * blockDim.x;
-
-  T *dst = outFeatures + blockIdx.y * NumTLP;
-  const T *src = buffer + blockIdx.y * NumTLP;
-  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
-#pragma unroll
-    for (int ilp = 0; ilp < NumILP; ++ilp) {
-      const int row = ix + rowStride[ilp];
-      dst[indices[row] * numPlanes + threadIdx.y] +=
-          src[row * numPlanes + threadIdx.y];
-    }
-  }
-}
-
-#if __CUDA_ARCH__ >= 530
-/// Block variant of scatter-add that adds __half2 pairs with __hadd2.
-///
-/// `outFeatures` and `buffer` are first advanced by blockIdx.y * NumTLP
-/// elements of T, then reinterpreted as __half2 arrays; `numPlanes` and
-/// threadIdx.y therefore count __half2 elements. For each row r:
-/// out[indices[r], threadIdx.y] =
-///     __hadd2(out[indices[r], threadIdx.y], buffer[r, threadIdx.y]).
-///
-/// Only declared when __CUDA_ARCH__ >= 530. There is no `row < size` guard,
-/// and the read-modify-write is not atomic.
-/// NOTE(review): presumably the launch covers an exact multiple of the rows
-/// handled per pass and `indices` holds no duplicates -- confirm at the call
-/// site.
-template <typename T, typename Index, int NumTLP, int NumILP>
-__global__ void scatterAddHalfBlockKernel(T *outFeatures, const T *buffer,
-                                          const Index *indices, int size,
-                                          int numPlanes) {
-  int ILPStrideX[NumILP];
-#pragma unroll
-  for (int ilp = 0; ilp < NumILP; ilp++)
-    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
-  outFeatures += blockIdx.y * NumTLP;
-  buffer += blockIdx.y * NumTLP;
-  Index idx;
-  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
-#pragma unroll
-    for (int ilp = 0; ilp < NumILP; ++ilp) {
-      idx = indices[ix + ILPStrideX[ilp]] * numPlanes + threadIdx.y;
-      // `buffer` is const, so it is viewed as const __half2; a cast to a
-      // non-const pointer would drop the qualifier, which reinterpret_cast
-      // does not allow.
-      reinterpret_cast<__half2 *>(outFeatures)[idx] = __hadd2(
-          reinterpret_cast<__half2 *>(outFeatures)[idx],
-          reinterpret_cast<const __half2 *>(
-              buffer)[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y]);
-    }
-  }
-}
-#endif
-
-/// Batched scatter-add using TH_ATOMIC_ADD.
-///
-/// For every buffer row r < size the destination row is looked up at
-/// indices[(e / feature_batch_stride) * indice_batch_stride +
-///         e % feature_batch_stride], with e = r + feature_offset.
-/// Rows whose looked-up value is -1 are skipped; otherwise each element of
-/// the buffer row is added to the destination row through TH_ATOMIC_ADD.
-/// Rows hold `numPlanes` elements. NumTLP is not used in the body.
-template <typename T, typename Index, int NumTLP, int NumILP>
-__global__ void batchScatterAddGenericKernel(T *outFeatures, const T *buffer,
-                                             const Index *indices, int size,
-                                             int feature_offset, int numPlanes,
-                                             int indice_batch_stride,
-                                             int feature_batch_stride) {
-  // Batch scatter-add is much slower than the native scatter when the number
-  // of points is large (possibly because of atomicAdd?), and much faster than
-  // native when the number of points is small.
-  int rowStride[NumILP];
-  Index dstRow[NumILP];
-#pragma unroll
-  for (int ilp = 0; ilp < NumILP; ilp++)
-    rowStride[ilp] = ilp * gridDim.x * blockDim.x;
-  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
-#pragma unroll
-    for (int ilp = 0; ilp < NumILP; ilp++) {
-      if (ix + rowStride[ilp] < size) {
-        const Index elem = ix + rowStride[ilp] + feature_offset;
-        dstRow[ilp] =
-            indices[(elem / feature_batch_stride) * indice_batch_stride +
-                    elem % feature_batch_stride];
-      }
-    }
-    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
-#pragma unroll
-      for (int ilp = 0; ilp < NumILP; ++ilp) {
-        const int row = ix + rowStride[ilp];
-        // The bounds test must come first: dstRow is only set for valid rows.
-        if (row < size && dstRow[ilp] != -1) {
-          TH_ATOMIC_ADD(outFeatures + dstRow[ilp] * numPlanes + iy,
-                        buffer[row * numPlanes + iy]);
-        }
-      }
-    }
-  }
-}
-
-/// Block variant of the batched scatter-add using TH_ATOMIC_ADD.
-///
-/// `outFeatures` and `buffer` are first advanced by blockIdx.y * NumTLP
-/// elements; threadIdx.y selects the column within a row of `numPlanes`
-/// elements. The destination row for buffer row r is looked up at
-/// indices[(r / feature_batch_stride) * indice_batch_stride +
-///         r % feature_batch_stride]; rows that look up -1 are skipped.
-///
-/// There is no `row < size` guard in this kernel.
-/// NOTE(review): presumably the launch covers an exact multiple of the rows
-/// handled per pass -- confirm at the call site.
-template <typename T, typename Index, int NumTLP, int NumILP>
-__global__ void
-batchScatterAddBlockKernel(T *outFeatures, const T *buffer,
-                           const Index *indices, int size, int numPlanes,
-                           int indice_batch_stride, int feature_batch_stride) {
-  int rowStride[NumILP];
-#pragma unroll
-  for (int ilp = 0; ilp < NumILP; ilp++)
-    rowStride[ilp] = ilp * gridDim.x * blockDim.x;
-
-  T *dst = outFeatures + blockIdx.y * NumTLP;
-  const T *src = buffer + blockIdx.y * NumTLP;
-  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
-#pragma unroll
-    for (int ilp = 0; ilp < NumILP; ++ilp) {
-      const int row = ix + rowStride[ilp];
-      const Index elem = row;
-      const Index dstRow =
-          indices[(elem / feature_batch_stride) * indice_batch_stride +
-                  elem % feature_batch_stride];
-      if (dstRow != -1) {
-        TH_ATOMIC_ADD(dst + dstRow * numPlanes + threadIdx.y,
-                      src[row * numPlanes + threadIdx.y]);
-      }
-    }
-  }
-}
-
-} // namespace spconv
-
-#undef TH_ATOMIC_ADD
-
-#endif
\ No newline at end of file
--- a/include/spconv/reordering.h
+++ b/include/spconv/reordering.h
-// Copyright 2019 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef SPARSE_REORDERING_FUNCTOR_H_
-#define SPARSE_REORDERING_FUNCTOR_H_
-#include <tensorview/tensorview.h>
-#include <torch/script.h>
-
-namespace spconv {
-
-// Declarations only; the definitions are not part of this header.
-// NOTE(review): going by the names, the *_gather functions presumably copy
-// the rows of `features` selected by `indices` into `buffer`, and the
-// *_scatter_add functions presumably add the rows of `buffer` into the rows
-// of `outFeatures` selected by `indices`, for `size` rows -- confirm against
-// the implementations.
-
-// Batched CUDA variants.
-void batch_sparse_gather_cuda(torch::Tensor buffer,
-                              torch::Tensor features,
-                              torch::Tensor indices,
-                              int size);
-void batch_sparse_scatter_add_cuda(torch::Tensor buffer,
-                                   torch::Tensor outFeatures,
-                                   torch::Tensor indices,
-                                   int size);
-
-// CUDA variants.
-void sparse_gather_cuda(torch::Tensor buffer,
-                        torch::Tensor features,
-                        torch::Tensor indices,
-                        int size);
-void sparse_scatter_add_cuda(torch::Tensor buffer,
-                             torch::Tensor outFeatures,
-                             torch::Tensor indices,
-                             int size);
-
-// CPU variants.
-void sparse_gather_cpu(torch::Tensor buffer,
-                       torch::Tensor features,
-                       torch::Tensor indices,
-                       int size);
-void sparse_scatter_add_cpu(torch::Tensor buffer,
-                            torch::Tensor outFeatures,
-                            torch::Tensor indices,
-                            int size);
-
-} // namespace spconv
-
-#endif
\ No newline at end of file
--- a/include/spconv/spconv_ops.h
+++ b/include/spconv/spconv_ops.h
-// Copyright 2019 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef SPARSE_CONV_OP_H_
-#define SPARSE_CONV_OP_H_
-
-#include <spconv/indice.h>
-#include <spconv/reordering.h>
-#include <tensorview/torch_utils.h>
-#include <torch/script.h>
-#include <utility/timer.h>
-
-namespace spconv {
-
-// Selector values for the `algo` argument of the convolution entry points.
-// NOTE(review): the meaning of each algorithm is not visible in this header.
-enum ConvAlgo {
-  kNative = 0,
-  kBatch = 1,
-  kBatchGemmGather = 2
-};
-
-// All functions below are declarations only; the definitions are not part of
-// this header.
-
-// torch.jit's documentation says only int64 is supported, so we need to
-// convert to int32.
-std::vector<torch::Tensor>
-getIndicePairs(torch::Tensor indices,
-               int64_t batchSize,
-               std::vector<int64_t> outSpatialShape,
-               std::vector<int64_t> spatialShape,
-               std::vector<int64_t> kernelSize,
-               std::vector<int64_t> stride,
-               std::vector<int64_t> padding,
-               std::vector<int64_t> dilation,
-               std::vector<int64_t> outPadding,
-               int64_t _subM,
-               int64_t _transpose,
-               int64_t _useHash);
-
-// Forward convolution, batch variant (takes a `batchScatter` flag instead of
-// an `algo` selector).
-torch::Tensor indiceConvBatch(torch::Tensor features,
-                              torch::Tensor filters,
-                              torch::Tensor indicePairs,
-                              torch::Tensor indiceNum,
-                              int64_t numActOut,
-                              int64_t _inverse,
-                              int64_t _subM,
-                              bool batchScatter);
-
-// Forward convolution; `algo` is presumably a ConvAlgo value passed as int64.
-torch::Tensor indiceConv(torch::Tensor features,
-                         torch::Tensor filters,
-                         torch::Tensor indicePairs,
-                         torch::Tensor indiceNum,
-                         int64_t numActOut,
-                         int64_t _inverse,
-                         int64_t _subM,
-                         int64_t algo);
-
-// Backward pass counterpart of indiceConv; returns a list of tensors.
-// NOTE(review): the order of the returned tensors is not visible here.
-std::vector<torch::Tensor>
-indiceConvBackward(torch::Tensor features,
-                   torch::Tensor filters,
-                   torch::Tensor outGrad,
-                   torch::Tensor indicePairs,
-                   torch::Tensor indiceNum,
-                   int64_t _inverse,
-                   int64_t _subM,
-                   int64_t algo);
-
-// Backward pass counterpart of indiceConvBatch; returns a list of tensors.
-// NOTE(review): the order of the returned tensors is not visible here.
-std::vector<torch::Tensor>
-indiceConvBackwardBatch(torch::Tensor features,
-                        torch::Tensor filters,
-                        torch::Tensor outGrad,
-                        torch::Tensor indicePairs,
-                        torch::Tensor indiceNum,
-                        int64_t _inverse,
-                        int64_t _subM,
-                        bool batchScatter);
-
-} // namespace spconv
-
-#endif
\ No newline at end of file
--- a/include/tensorrt/inference.h
+++ b/include/tensorrt/inference.h
-#include "NvInfer.h"
-#include <memory>
-#include <tensorview/tensor.h>
-#include <unordered_map>
-#include <vector>
-
-namespace trt {
-
-/// Maps a TensorRT data type onto the matching tensorview dtype.
-///
-/// kFLOAT -> tv::float32, kHALF -> tv::float16, kINT32 -> tv::int32,
-/// kINT8 -> tv::int8. Any other value reaches
-/// TV_THROW_INVALID_ARG("unknown trt dtype").
-template <typename T> tv::DType trt_dtype_to_tv(T trt_dtype) {
-  if (trt_dtype == nvinfer1::DataType::kFLOAT) {
-    return tv::float32;
-  }
-  if (trt_dtype == nvinfer1::DataType::kHALF) {
-    return tv::float16;
-  }
-  if (trt_dtype == nvinfer1::DataType::kINT32) {
-    return tv::int32;
-  }
-  if (trt_dtype == nvinfer1::DataType::kINT8) {
-    return tv::int8;
-  }
-  TV_THROW_INVALID_ARG("unknown trt dtype");
-}
-
-/// Deleter for std::unique_ptr: releases an object by calling its destroy()
-/// member; a null pointer is ignored.
-struct InferDeleter {
-  template <typename T> void operator()(T *obj) const {
-    if (!obj) {
-      return;
-    }
-    obj->destroy();
-  }
-};
-
-/// unique_ptr whose pointee is released through InferDeleter.
-template <typename T> using trt_unique_ptr_t = std::unique_ptr<T, InferDeleter>;
-
-/// nvinfer1::ILogger implementation that prints messages to std::cerr.
-///
-/// Messages whose severity value is greater than `reportableSeverity` are
-/// dropped; the rest are written as "<LABEL>: <msg>" followed by std::endl.
-class Logger : public nvinfer1::ILogger {
-public:
-  Logger(Severity severity = Severity::kWARNING)
-      : reportableSeverity(severity) {}
-
-  void log(Severity severity, const char *msg) override {
-    // Suppress messages with a severity value above the reportable one.
-    if (severity > reportableSeverity)
-      return;
-
-    // Any severity without its own label is reported as UNKNOWN.
-    const char *label = "UNKNOWN: ";
-    switch (severity) {
-    case Severity::kINTERNAL_ERROR:
-      label = "INTERNAL_ERROR: ";
-      break;
-    case Severity::kERROR:
-      label = "ERROR: ";
-      break;
-    case Severity::kWARNING:
-      label = "WARNING: ";
-      break;
-    case Severity::kINFO:
-      label = "INFO: ";
-      break;
-    default:
-      break;
-    }
-    std::cerr << label << msg << std::endl;
-  }
-
-  /// Largest severity value that is still printed.
-  Severity reportableSeverity;
-};
-
-class InferenceContext {
-public:
-  /// Deserialises a TensorRT engine and allocates buffers for every binding.
-  ///
-  /// @param engine_bin serialized engine bytes.
-  /// @param device CUDA device ordinal; must be in [0, device count).
-  ///
-  /// For each binding a host tensor and a device tensor are created with
-  /// shape [max batch size, binding dims...], and the device tensor's raw
-  /// pointer is appended to bindings_ in binding order. Binding indices are
-  /// split into inp_idxes_ and out_idxes_.
-  ///
-  /// Fails through TV_ASSERT_INVALID_ARG / TV_THROW_INVALID_ARG when the
-  /// device id is invalid or when the runtime, engine or execution context
-  /// cannot be created; CUDA calls are wrapped in checkCudaErrors.
-  explicit InferenceContext(const std::string &engine_bin, int device)
-      : logger_(nvinfer1::ILogger::Severity::kINFO), device_(device) {
-    TV_ASSERT_INVALID_ARG(device >= 0, "invalid device id");
-    // Initialised so a failed query can never leave an indeterminate count.
-    int deviceCount = 0;
-    checkCudaErrors(cudaGetDeviceCount(&deviceCount));
-    if (device >= deviceCount) {
-      TV_THROW_INVALID_ARG("you provide device ", device, " but you only have ",
-                           deviceCount, " device.");
-    }
-    checkCudaErrors(cudaSetDevice(device));
-    auto runtime = trt_unique_ptr_t<nvinfer1::IRuntime>(
-        nvinfer1::createInferRuntime(logger_));
-    TV_ASSERT_INVALID_ARG(runtime != nullptr,
-                          "failed to create TensorRT runtime");
-    engine_ =
-        trt_unique_ptr_t<nvinfer1::ICudaEngine>(runtime->deserializeCudaEngine(
-            engine_bin.c_str(), engine_bin.size(), nullptr));
-    // A null engine here means engine_bin could not be deserialised.
-    TV_ASSERT_INVALID_ARG(engine_ != nullptr,
-                          "failed to deserialize TensorRT engine");
-    ctx_ = trt_unique_ptr_t<nvinfer1::IExecutionContext>(
-        engine_->createExecutionContext());
-    TV_ASSERT_INVALID_ARG(ctx_ != nullptr,
-                          "failed to create TensorRT execution context");
-
-    max_batch_size_ = engine_->getMaxBatchSize();
-    for (int i = 0; i < engine_->getNbBindings(); ++i) {
-      auto dims = engine_->getBindingDimensions(i);
-      std::vector<int> shape_vec(dims.d, dims.d + dims.nbDims);
-      // Prepend the batch axis: buffers are sized for the largest batch.
-      shape_vec.insert(shape_vec.begin(), {max_batch_size_});
-      tv::TensorShape shape(shape_vec);
-      std::string name = engine_->getBindingName(i);
-      auto trt_dtype = engine_->getBindingDataType(i);
-      auto tv_dtype = trt_dtype_to_tv(trt_dtype);
-      bool isInput = engine_->bindingIsInput(i);
-      name_to_idx_[name] = i;
-      idx_to_name_[i] = name;
-      // NOTE(review): the third tv::Tensor argument is -1 for the host copy
-      // and the literal 0 for the device copy; if it is a device ordinal it
-      // should probably be `device` -- confirm against tv::Tensor.
-      name_to_host_mem_.insert({name, tv::Tensor(shape, tv_dtype, -1)});
-      name_to_dev_mem_.insert({name, tv::Tensor(shape, tv_dtype, 0)});
-      if (isInput)
-        inp_idxes_.push_back(i);
-      else
-        out_idxes_.push_back(i);
-      bindings_.push_back(name_to_dev_mem_[name].raw_data());
-    }
-    checkCudaErrors(cudaStreamCreate(&stream_));
-  }
-
-  std::unordered_map<std::string, tv::Tensor>
-  operator()(std::vector<tv::Tensor> inputs) {
-    TV_ASSERT_INVALID_ARG(inputs.size() == inp_idxes_.size(), "must provide",
-                          inp_idxes_.size(), "inputs, but got", inputs.size());
-    // inference batch size
-    int bs = inputs[0].dim(0);
-    for (auto &inp : inputs) {
-      TV_ASSERT_INVALID_ARG(inp.dim(0) == bs,
-                            "batch sizes of all input must same");
-    }
-    TV_ASSERT_INVALID_ARG(bs <= max_batch_size_, "your batchsize too large", bs,
-                          max_batch_size_);
-    for (int i = 0; i < inputs.size(); ++i) {
-      auto &dev_mem = name_to_dev_mem_[idx_to_name_[i]];
-      auto shape_inp = inputs[i].shape().subshape(1);
-      auto shape_dev = dev_mem.shape().subshape(1);
-      TV_ASSERT_INVALID_ARG(shape_inp == shape_dev,
-                            "shape except batch must same", shape_inp,
-                            shape_dev);
-      dev_mem.slice_first_axis(0, bs).copy_(inputs[i].slice_first_axis(0, bs),
-                                            stream_);
-    }
-
-    ctx_->enqueue(bs, bindings_.data(), stream_, nullptr);
-
-    for (int i : out_idxes_) {
-      name_to_host_mem_[idx_to_name_[i]].slice_first_axis(0, bs).copy_(
-          name_to_dev_mem_[idx_to_name_[i]].slice_first_axis(0, bs), stream_);
-    }
-    checkCudaErrors(cudaStreamSynchronize(stream_));
-    std::unordered_map<std::string, tv::Tensor> output_map;
-    for (int i = 0; i < out_idxes_.size(); ++i) {
-      auto name = idx_to_name_[out_idxes_[i]];
-      output_map[name] = name_to_host_mem_[name].slice_first_axis(0, bs);
-    }
-    return output_map;
-  }
-
-  std::unordered_map<std::string, tv::Tensor>
-  operator()(std::unordered_map<std::string, tv::Tensor> inputs) {
-    std::vector<tv::Tensor> inputs_vec(inp_idxes_.size());
-    int count = 0;
-    for (auto &p : inputs) {
-      auto iter = name_to_idx_.find(p.first);
-      TV_ASSERT_INVALID_ARG(iter != name_to_idx_.end(), "cant find your name",
-                            p.first);
-      inputs_vec[name_to_idx_[p.first]] = p.second;
-    }
-    TV_ASSERT_INVALID_ARG(count == inp_idxes_.size(), "your inp not enough");
-    return (*this)(inputs_vec);
-  }
-
-  tv::Tensor operator[](std::string name) {
-    auto iter = name_to_host_mem_.find(name);
-    if (iter == name_to_host_mem_.end()) {
-      TV_THROW_INVALID_ARG(name, "not found.");
-    }
-    return iter->second;
-  }
-
-  std::string repr() {
-    std::stringstream ss;
-    ss << "InferenceContext[gpu=" << device_ << "]";
-    ss << "\n  Inputs:";
-    std::string name;
-    for (auto &i : inp_idxes_) {
-      name = idx_to_name_[i];
-      auto &mem = name_to_host_mem_[name];
-      ss << "\n    " << name << "[" << tv::detail::typeString(mem.dtype())
-         << "]: " << mem.shape();
-    }
-    ss << "\n  Outputs:";
-    for (auto &i : out_idxes_) {
-      name = idx_to_name_[i];
-      auto &mem = name_to_host_mem_[name];
-      ss << "\n    " << name << "[" << tv::detail::typeString(mem.dtype())
-         << "]: " << mem.shape();
-    }
-    return ss.str();
-  }
-
-private:
-  Logger logger_;
-  trt_unique_ptr_t<nvinfer1::ICudaEngine> engine_;
-  trt_unique_ptr_t<nvinfer1::IExecutionContext> ctx_;
-  std::unordered_map<std::string, tv::Tensor> name_to_dev_mem_;
-  std::unordered_map<std::string, tv::Tensor> name_to_host_mem_;
-  std::unordered_map<std::string, int> name_to_idx_;
-  std::unordered_map<int, std::string> idx_to_name_;
-  std::vector<int> inp_idxes_;
-  std::vector<int> out_idxes_;
-  std::vector<void *> bindings_;
-  cudaStream_t stream_;
-  int max_batch_size_;
-  int device_;
-};
-
-} // namespace trt