Commit d1aac35d authored by zhangwenwei

Initial commit
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <pybind11/pybind11.h>
// must include pybind11/eigen.h if using Eigen matrices as arguments.
// must include pybind11/stl.h if using STL containers as arguments.
#include <algorithm>
#include <pybind11/numpy.h>
#include <pybind11/stl.h>
// #include <vector>
#include <iostream>
#include <math.h>
namespace spconv {
namespace py = pybind11;
using namespace pybind11::literals;
template <typename DType, int NDim>
int points_to_voxel_3d_np(py::array_t<DType> points, py::array_t<DType> voxels,
py::array_t<int> coors,
py::array_t<int> num_points_per_voxel,
py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size,
std::vector<DType> coors_range, int max_points,
int max_voxels) {
auto points_rw = points.template mutable_unchecked<2>();
auto voxels_rw = voxels.template mutable_unchecked<3>();
auto coors_rw = coors.mutable_unchecked<2>();
auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
auto N = points_rw.shape(0);
auto num_features = points_rw.shape(1);
// auto ndim = points_rw.shape(1) - 1;
constexpr int ndim_minus_1 = NDim - 1;
int voxel_num = 0;
bool failed = false;
int coor[NDim];
int c;
int grid_size[NDim];
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
int voxelidx, num;
for (int i = 0; i < N; ++i) {
failed = false;
for (int j = 0; j < NDim; ++j) {
c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
if ((c < 0 || c >= grid_size[j])) {
failed = true;
break;
}
coor[ndim_minus_1 - j] = c;
}
if (failed)
continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
if (voxelidx == -1) {
voxelidx = voxel_num;
if (voxel_num >= max_voxels)
break;
voxel_num += 1;
coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
for (int k = 0; k < NDim; ++k) {
coors_rw(voxelidx, k) = coor[k];
}
}
num = num_points_per_voxel_rw(voxelidx);
if (num < max_points) {
for (int k = 0; k < num_features; ++k) {
voxels_rw(voxelidx, num, k) = points_rw(i, k);
}
num_points_per_voxel_rw(voxelidx) += 1;
}
}
for (int i = 0; i < voxel_num; ++i) {
coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1;
}
return voxel_num;
}
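// Example binding (sketch; the module name and argument keywords are
// illustrative assumptions): this template is meant to be instantiated and
// exposed through pybind11, e.g. for the float / 3D case:
//
//   PYBIND11_MODULE(points2voxel, m) {
//     m.def("points_to_voxel_3d_np", &spconv::points_to_voxel_3d_np<float, 3>,
//           "points"_a, "voxels"_a, "coors"_a, "num_points_per_voxel"_a,
//           "coor_to_voxelidx"_a, "voxel_size"_a, "coors_range"_a,
//           "max_points"_a, "max_voxels"_a);
//   }
//
// The caller is expected to pass coor_to_voxelidx pre-filled with -1; the
// function restores that state for the voxels it touched before returning the
// number of non-empty voxels.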
template <typename DType, int NDim>
int points_to_voxel_3d_np_mean(py::array_t<DType> points, py::array_t<DType> voxels,
py::array_t<DType> means,
py::array_t<int> coors,
py::array_t<int> num_points_per_voxel,
py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size,
std::vector<DType> coors_range, int max_points,
int max_voxels) {
auto points_rw = points.template mutable_unchecked<2>();
auto means_rw = means.template mutable_unchecked<2>();
auto voxels_rw = voxels.template mutable_unchecked<3>();
auto coors_rw = coors.mutable_unchecked<2>();
auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
auto N = points_rw.shape(0);
auto num_features = points_rw.shape(1);
// auto ndim = points_rw.shape(1) - 1;
constexpr int ndim_minus_1 = NDim - 1;
int voxel_num = 0;
bool failed = false;
int coor[NDim];
int c;
int grid_size[NDim];
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
int voxelidx, num;
for (int i = 0; i < N; ++i) {
failed = false;
for (int j = 0; j < NDim; ++j) {
c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
if ((c < 0 || c >= grid_size[j])) {
failed = true;
break;
}
coor[ndim_minus_1 - j] = c;
}
if (failed)
continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
if (voxelidx == -1) {
voxelidx = voxel_num;
if (voxel_num >= max_voxels)
break;
voxel_num += 1;
coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
for (int k = 0; k < NDim; ++k) {
coors_rw(voxelidx, k) = coor[k];
}
}
num = num_points_per_voxel_rw(voxelidx);
if (num < max_points) {
for (int k = 0; k < num_features; ++k) {
voxels_rw(voxelidx, num, k) = points_rw(i, k);
}
num_points_per_voxel_rw(voxelidx) += 1;
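// running-mean update: mean += (x - mean) / (num + 1), where num points were
// already stored in this voxel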
for (int k = 0; k < num_features; ++k) {
means_rw(voxelidx, k) += (points_rw(i, k) - means_rw(voxelidx, k)) / DType(num + 1);
}
}
}
for (int i = 0; i < voxel_num; ++i) {
coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1;
num = num_points_per_voxel_rw(i);
for (int j = num; j < max_points; ++j){
for (int k = 0; k < num_features; ++k) {
voxels_rw(i, j, k) = means_rw(i, k);
}
}
}
return voxel_num;
}
template <typename DType, int NDim>
int points_to_voxel_3d_np_height(py::array_t<DType> points, py::array_t<DType> voxels,
py::array_t<DType> height,
py::array_t<DType> maxs,
py::array_t<int> coors,
py::array_t<int> num_points_per_voxel,
py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size,
std::vector<DType> coors_range, int max_points,
int max_voxels) {
auto points_rw = points.template mutable_unchecked<2>();
auto height_rw = height.template mutable_unchecked<2>();
auto maxs_rw = maxs.template mutable_unchecked<2>();
auto voxels_rw = voxels.template mutable_unchecked<3>();
auto coors_rw = coors.mutable_unchecked<2>();
auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
auto N = points_rw.shape(0);
auto num_features = points_rw.shape(1);
// auto ndim = points_rw.shape(1) - 1;
constexpr int ndim_minus_1 = NDim - 1;
int voxel_num = 0;
bool failed = false;
int coor[NDim];
int c;
int grid_size[NDim];
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
int voxelidx, num;
for (int i = 0; i < N; ++i) {
failed = false;
for (int j = 0; j < NDim; ++j) {
c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
if ((c < 0 || c >= grid_size[j])) {
failed = true;
break;
}
coor[ndim_minus_1 - j] = c;
}
if (failed)
continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
if (voxelidx == -1) {
voxelidx = voxel_num;
if (voxel_num >= max_voxels)
break;
voxel_num += 1;
coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
for (int k = 0; k < NDim; ++k) {
coors_rw(voxelidx, k) = coor[k];
}
}
num = num_points_per_voxel_rw(voxelidx);
if (num < max_points) {
for (int k = 0; k < num_features; ++k) {
voxels_rw(voxelidx, num, k) = points_rw(i, k);
height_rw(voxelidx, k) = std::min(points_rw(i, k), height_rw(voxelidx, k));
maxs_rw(voxelidx, k) = std::max(points_rw(i, k), maxs_rw(voxelidx, k));
}
num_points_per_voxel_rw(voxelidx) += 1;
}
}
for (int i = 0; i < voxel_num; ++i) {
coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1;
for (int k = 0; k < num_features; ++k) {
height_rw(i, k) = maxs_rw(i, k) - height_rw(i, k);
}
}
return voxel_num;
}
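// block_filtering (below) appears to implement a simple flat-region filter:
// it records the min and max z value (points_rw(i, 2)) seen in each voxel and
// then zeroes mask[i] for points whose voxel has a z-range smaller than eps.
// height, maxs, mask and coor_to_voxelidx are presumably pre-initialized by
// the caller (large/small sentinels, ones, and -1 respectively).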
template <typename DType, int NDim>
int block_filtering(py::array_t<DType> points,
py::array_t<int> mask,
py::array_t<DType> height,
py::array_t<DType> maxs,
py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size,
std::vector<DType> coors_range,
int max_voxels,
DType eps) {
auto points_rw = points.template mutable_unchecked<2>();
auto mask_rw = mask.mutable_unchecked<1>();
auto height_rw = height.template mutable_unchecked<1>();
auto maxs_rw = maxs.template mutable_unchecked<1>();
auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
auto N = points_rw.shape(0);
auto num_features = points_rw.shape(1);
// auto ndim = points_rw.shape(1) - 1;
constexpr int ndim_minus_1 = NDim - 1;
int voxel_num = 0;
bool failed = false;
int coor[NDim];
int c;
int grid_size[NDim];
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
int voxelidx, num;
for (int i = 0; i < N; ++i) {
failed = false;
for (int j = 0; j < NDim; ++j) {
c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
if ((c < 0 || c >= grid_size[j])) {
failed = true;
break;
}
coor[ndim_minus_1 - j] = c;
}
if (failed)
continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
if (voxelidx == -1) {
voxelidx = voxel_num;
voxel_num += 1;
coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
}
height_rw(voxelidx) = std::min(points_rw(i, 2), height_rw(voxelidx));
maxs_rw(voxelidx) = std::max(points_rw(i, 2), maxs_rw(voxelidx));
}
for (int i = 0; i < N; ++i) {
failed = false;
for (int j = 0; j < NDim; ++j) {
c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
if ((c < 0 || c >= grid_size[j])) {
failed = true;
break;
}
coor[ndim_minus_1 - j] = c;
}
if (failed)
continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
if ((maxs_rw(voxelidx) - height_rw(voxelidx)) < eps){
mask_rw(i) = 0;
}
}
return voxel_num;
}
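// points_to_voxel_3d_with_filtering (below) combines voxelization with a
// block-based height filter: while filling voxels it tracks per-block
// (block_factor x block_factor voxels) min/max z in mins/maxs, and finally
// sets voxel_mask[i] only if the z-range within a block_size x block_size
// neighbourhood of blocks exceeds height_threshold.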
template <typename DType, int NDim>
int points_to_voxel_3d_with_filtering(py::array_t<DType> points, py::array_t<DType> voxels,
py::array_t<int> voxel_mask,
py::array_t<DType> mins,
py::array_t<DType> maxs,
py::array_t<int> coors,
py::array_t<int> num_points_per_voxel,
py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size,
std::vector<DType> coors_range,
int max_points,
int max_voxels,
int block_factor,
int block_size,
DType height_threshold) {
auto points_rw = points.template mutable_unchecked<2>();
auto mins_rw = mins.template mutable_unchecked<2>();
auto maxs_rw = maxs.template mutable_unchecked<2>();
auto voxels_rw = voxels.template mutable_unchecked<3>();
auto voxel_mask_rw = voxel_mask.template mutable_unchecked<1>();
auto coors_rw = coors.mutable_unchecked<2>();
auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
auto N = points_rw.shape(0);
auto num_features = points_rw.shape(1);
// auto ndim = points_rw.shape(1) - 1;
constexpr int ndim_minus_1 = NDim - 1;
int voxel_num = 0;
bool failed = false;
int coor[NDim];
int c;
int grid_size[NDim];
DType max_value, min_value;
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
int block_shape_H = grid_size[1] / block_factor;
int block_shape_W = grid_size[0] / block_factor;
int voxelidx, num;
int block_coor[2];
int startx, stopx, starty, stopy;
for (int i = 0; i < N; ++i) {
failed = false;
for (int j = 0; j < NDim; ++j) {
c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
if ((c < 0 || c >= grid_size[j])) {
failed = true;
break;
}
coor[ndim_minus_1 - j] = c;
}
if (failed)
continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
if (voxelidx == -1) {
voxelidx = voxel_num;
if (voxel_num >= max_voxels)
break;
voxel_num += 1;
coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
for (int k = 0; k < NDim; ++k) {
coors_rw(voxelidx, k) = coor[k];
}
}
num = num_points_per_voxel_rw(voxelidx);
if (num < max_points) {
for (int k = 0; k < num_features; ++k) {
voxels_rw(voxelidx, num, k) = points_rw(i, k);
}
block_coor[0] = coor[1] / block_factor;
block_coor[1] = coor[2] / block_factor;
mins_rw(block_coor[0], block_coor[1]) = std::min(points_rw(i, 2), mins_rw(block_coor[0], block_coor[1]));
maxs_rw(block_coor[0], block_coor[1]) = std::max(points_rw(i, 2), maxs_rw(block_coor[0], block_coor[1]));
num_points_per_voxel_rw(voxelidx) += 1;
}
}
for (int i = 0; i < voxel_num; ++i) {
coor[1] = coors_rw(i, 1);
coor[2] = coors_rw(i, 2);
coor_to_voxelidx_rw(coors_rw(i, 0), coor[1], coor[2]) = -1;
block_coor[0] = coor[1] / block_factor;
block_coor[1] = coor[2] / block_factor;
min_value = mins_rw(block_coor[0], block_coor[1]);
max_value = maxs_rw(block_coor[0], block_coor[1]);
startx = std::max(0, block_coor[0]-block_size/2);
stopx = std::min(block_shape_H, block_coor[0]+block_size-block_size/2);
starty = std::max(0, block_coor[1]-block_size/2);
stopy = std::min(block_shape_W, block_coor[1]+block_size-block_size/2);
for (int j = startx; j < stopx; ++j){
for (int k = starty; k < stopy; ++k){
min_value = std::min(min_value, mins_rw(j, k));
max_value = std::max(max_value, maxs_rw(j, k));
}
}
voxel_mask_rw(i) = (max_value - min_value) > height_threshold;
}
return voxel_num;
}
} // namespace spconv
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPARSE_POOL_OP_H_
#define SPARSE_POOL_OP_H_
#include <cuda_runtime_api.h>
#include <spconv/maxpool.h>
#include <torch/script.h>
#include <torch_utils.h>
#include <utility/timer.h>
namespace spconv {
template <typename T>
torch::Tensor indiceMaxPool(torch::Tensor features, torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t numAct) {
auto device = features.device().type();
auto kernelVolume = indicePairs.size(0);
auto numInPlanes = features.size(1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
torch::Tensor output = torch::zeros({numAct, numInPlanes}, options);
double totalTime = 0;
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data<int>()[i];
if (nHot <= 0) {
continue;
}
// auto timer = spconv::CudaContextTimer<>();
if (device == torch::kCPU) {
functor::SparseMaxPoolForwardFunctor<tv::CPU, T, int> forwardFtor;
forwardFtor(tv::CPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i), nHot);
} else {
functor::SparseMaxPoolForwardFunctor<tv::GPU, T, int> forwardFtor;
forwardFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i), nHot);
TV_CHECK_CUDA_ERR();
}
// totalTime += timer.report() / 1000.0;
}
// std::cout << "maxpool forward time " << totalTime << std::endl;
return output;
}
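// Usage sketch (illustrative; variable names are assumptions): given index
// pairs such as those produced by getIndicePair, the forward pass is
//
//   auto out = spconv::indiceMaxPool<float>(features, indicePairs, indiceNum,
//                                           numActOut);
//
// where features is [numActIn, numPlanes], indicePairs is
// [kernelVolume, 2, numActIn] and indiceNum holds the number of valid pairs
// per kernel offset.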
template <typename T>
torch::Tensor indiceMaxPoolBackward(torch::Tensor features,
torch::Tensor outFeatures,
torch::Tensor outGrad, torch::Tensor indicePairs,
torch::Tensor indiceNum) {
auto device = features.device().type();
auto numInPlanes = features.size(1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
auto kernelVolume = indicePairs.size(0);
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data<int>()[i];
if (nHot <= 0) {
continue;
}
if (device == torch::kCPU) {
functor::SparseMaxPoolBackwardFunctor<tv::CPU, T, int> backwardFtor;
backwardFtor(tv::CPU(), tv::torch2tv<const T>(outFeatures),
tv::torch2tv<const T>(features),
tv::torch2tv<const T>(outGrad), tv::torch2tv<T>(inputGrad),
tv::torch2tv<const int>(indicePairs).subview(i), nHot);
} else {
functor::SparseMaxPoolBackwardFunctor<tv::GPU, T, int> backwardFtor;
backwardFtor(tv::TorchGPU(), tv::torch2tv<const T>(outFeatures),
tv::torch2tv<const T>(features),
tv::torch2tv<const T>(outGrad), tv::torch2tv<T>(inputGrad),
tv::torch2tv<const int>(indicePairs).subview(i), nHot);
TV_CHECK_CUDA_ERR();
}
}
return inputGrad;
}
} // namespace spconv
#endif
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef REORDERING_CU_H_
#define REORDERING_CU_H_
#include <tensorview/helper_kernel.cu.h>
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
namespace spconv {
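// These kernels implement the gather / scatter-add pattern used by the sparse
// convolution: gather copies the feature rows selected by `indices` into a
// dense buffer so a regular GEMM can run on it, and scatter-add accumulates
// buffer rows back into the output features at the indexed positions.
// NumTLP / NumILP are tiling and instruction-level-parallelism parameters.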
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void gatherGenericKernel(T *buffer, const T *features,
const Index *indices, int size,
int numPlanes) {
int ILPStrideX[NumILP];
Index inds[NumILP];
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) {
if (ix + ILPStrideX[ilp] < size)
inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes;
}
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
if (ix + ILPStrideX[ilp] < size)
buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy] =
features[inds[ilp] + iy];
}
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP, typename VecType>
__global__ void gatherVecKernel(T *buffer, const T *features,
const Index *indices, int size, int numPlanes) {
int ILPStrideX[NumILP];
Index inds[NumILP];
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) {
if (ix + ILPStrideX[ilp] < size)
inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes;
}
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
if (ix + ILPStrideX[ilp] < size)
reinterpret_cast<VecType *>(
buffer)[(ix + ILPStrideX[ilp]) * numPlanes + iy] =
reinterpret_cast<const VecType *>(features)[inds[ilp] + iy];
}
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP,
typename VecType = int4>
__global__ void gatherVecBlockKernel(T *buffer, const T *features,
const Index *indices, int size,
int numPlanes) {
int ILPStrideY[NumILP];
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideY[ilp] = ilp * gridDim.y * blockDim.y;
features += blockIdx.x * NumTLP;
buffer += blockIdx.x * NumTLP;
for (int iy : tv::KernelLoopY<int, NumILP>(size)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
reinterpret_cast<VecType *>(
buffer)[(iy + ILPStrideY[ilp]) * numPlanes + threadIdx.x] =
reinterpret_cast<const VecType *>(
features)[indices[iy + ILPStrideY[ilp]] * numPlanes +
threadIdx.x];
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void scatterAddGenericKernel(T *outFeatures, const T *buffer,
const Index *indices, int size,
int numPlanes) {
int ILPStrideX[NumILP];
Index inds[NumILP];
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) {
if (ix + ILPStrideX[ilp] < size)
inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes;
}
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
if (ix + ILPStrideX[ilp] < size) {
outFeatures[inds[ilp] + iy] +=
buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy];
}
}
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP,
typename VecType = int4>
__global__ void scatterAddVecBlockKernel(T *outFeatures, const T *buffer,
const Index *indices, int size,
int numPlanes) {
int ILPStrideY[NumILP];
constexpr int vecloadFactor = sizeof(VecType) / sizeof(T);
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideY[ilp] = ilp * gridDim.y * blockDim.y;
outFeatures += blockIdx.x * NumTLP;
buffer += blockIdx.x * NumTLP;
T buf[vecloadFactor];
T buf2[vecloadFactor];
Index idx;
for (int iy : tv::KernelLoopY<int, NumILP>(size)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
idx = indices[iy + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
reinterpret_cast<VecType *>(buf)[0] =
reinterpret_cast<VecType *>(outFeatures)[idx];
reinterpret_cast<VecType *>(buf2)[0] = reinterpret_cast<const VecType *>(
buffer)[(iy + ILPStrideY[ilp]) * numPlanes + threadIdx.x];
#pragma unroll
for (int i = 0; i < vecloadFactor; i++) {
buf[i] += buf2[i];
}
reinterpret_cast<VecType *>(outFeatures)[idx] =
reinterpret_cast<VecType *>(buf)[0];
}
}
}
} // namespace spconv
#endif
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPARSE_REORDERING_FUNCTOR_H_
#define SPARSE_REORDERING_FUNCTOR_H_
#include <tensorview/tensorview.h>
namespace spconv
{
namespace functor
{
template <typename Device, typename T, typename Index>
struct SparseGatherFunctor
{
void operator()(const Device& d, tv::TensorView<T> buffer, tv::TensorView<const T> features,
tv::TensorView<const Index> indices, int size);
};
template <typename Device, typename T, typename Index>
struct SparseScatterAddFunctor
{
void operator()(const Device& d, tv::TensorView<T> out_features,
tv::TensorView<const T> buffer, tv::TensorView<const Index> indices,
int size, bool stable=false);
};
} // namespace functor
} // namespace spconv
#endif
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPARSE_CONV_OP_H_
#define SPARSE_CONV_OP_H_
#include <cuda_runtime_api.h>
#include <spconv/indice.h>
#include <spconv/reordering.h>
#include <torch/script.h>
#include <torch_utils.h>
#include <utility/timer.h>
namespace spconv {
// torch.jit only supports int64 for integer arguments, so they are converted to int32 before use.
template <unsigned NDim>
std::vector<torch::Tensor>
getIndicePair(torch::Tensor indices, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {
// auto timer = spconv::CudaContextTimer<>();
bool subM = _subM != 0;
bool transpose = _transpose != 0;
auto numAct = indices.size(0);
auto coorDim = indices.size(1) - 1; // batchIdx + xyz
TV_ASSERT_RT_ERR(NDim == coorDim, "error");
TV_ASSERT_RT_ERR(kernelSize.size() == coorDim, "error");
TV_ASSERT_RT_ERR(outSpatialShape.size() == coorDim, "error");
TV_ASSERT_RT_ERR(stride.size() == coorDim, "error");
TV_ASSERT_RT_ERR(padding.size() == coorDim, "error");
TV_ASSERT_RT_ERR(outPadding.size() == coorDim, "error");
TV_ASSERT_RT_ERR(dilation.size() == coorDim, "error");
auto kernelVolume = kernelSize[0];
for (int i = 1; i < kernelSize.size(); ++i) {
kernelVolume *= kernelSize[i];
}
TV_ASSERT_RT_ERR(kernelVolume <= 4096, "error");
auto outputVolume = outSpatialShape[0];
for (int i = 1; i < outSpatialShape.size(); ++i) {
outputVolume *= outSpatialShape[i];
}
torch::Tensor indicePairs =
torch::full({kernelVolume, 2, numAct}, -1,
torch::dtype(torch::kInt32).device(indices.device()));
torch::Tensor indiceNum = torch::zeros(
{kernelVolume}, torch::dtype(torch::kInt32).device(indices.device()));
torch::Tensor gridOut =
torch::full({batchSize * outputVolume}, -1,
torch::dtype(torch::kInt32).device(indices.device()));
// std::cout << "full time " << timer.report() / 1000.0 << std::endl;
int64_t numActOut = -1;
tv::SimpleVector<int, NDim> outSpatialShape32;
tv::SimpleVector<int, NDim> kernelSize32;
tv::SimpleVector<int, NDim> stride32;
tv::SimpleVector<int, NDim> padding32;
tv::SimpleVector<int, NDim> dilation32;
auto indicePairUnique =
torch::full({indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),
torch::dtype(torch::kInt32).device(indices.device()));
for (int i = 0; i < NDim; ++i) {
outSpatialShape32.push_back(outSpatialShape[i]);
kernelSize32.push_back(kernelSize[i]);
if (subM) {
stride32.push_back(1);
padding32.push_back(kernelSize[i] / 2);
dilation32.push_back(dilation[i]);
} else {
stride32.push_back(stride[i]);
padding32.push_back(padding[i]);
dilation32.push_back(dilation[i]);
}
}
if (subM) {
if (indices.device().type() == torch::kCPU) {
auto getIndicePairFtor =
functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim>();
numActOut = getIndicePairFtor(
tv::CPU(), tv::torch2tv<const int>(indices), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum), kernelSize32,
stride32, padding32, dilation32, outSpatialShape32, transpose);
} else {
auto getIndicePairFtor =
functor::CreateSubMIndicePairFunctor<tv::GPU, int, int, NDim>();
numActOut = getIndicePairFtor(
tv::TorchGPU(), tv::torch2tv<const int>(indices), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum), kernelSize32,
stride32, padding32, dilation32, outSpatialShape32, transpose);
}
return {indices, indicePairs, indiceNum};
} else {
torch::Tensor outInds =
torch::zeros({numAct * kernelVolume, coorDim + 1},
torch::dtype(torch::kInt32).device(indices.device()));
if (indices.device().type() == torch::kCPU) {
auto getIndicePairFtor = functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();
numActOut = getIndicePairFtor(
tv::CPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum), kernelSize32,
stride32, padding32, dilation32, outSpatialShape32, transpose);
} else {
auto getIndicePairFtorP1 =
functor::CreateConvIndicePairFunctorP1<tv::GPU, int, int, NDim>();
auto getIndicePairFtorP2 =
functor::CreateConvIndicePairFunctorP2<tv::GPU, int, int, NDim>();
numActOut =
getIndicePairFtorP1(tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
tv::torch2tv<int>(indicePairUnique), kernelSize32, stride32,
padding32, dilation32, outSpatialShape32, transpose);
if (numActOut > 0) {
auto res = torch::_unique(indicePairUnique);
indicePairUnique = std::get<0>(res);
numActOut = getIndicePairFtorP2(
tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose);
}
}
return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum};
}
}
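// Sketch (an assumption for illustration; the real registration lives
// elsewhere in the project): these templates are typically registered as
// TorchScript custom ops so they can be called from Python, e.g. along the
// lines of
//
//   static auto registry = torch::RegisterOperators()
//       .op("spconv::get_indice_pair_3d", &getIndicePair<3>);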
template <unsigned NDim>
std::vector<torch::Tensor>
getIndicePairPreGrid(torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {
// auto timer = spconv::CudaContextTimer<>();
bool subM = _subM != 0;
bool transpose = _transpose != 0;
auto numAct = indices.size(0);
auto coorDim = indices.size(1) - 1; // batchIdx + xyz
TV_ASSERT_RT_ERR(NDim == coorDim, "error");
TV_ASSERT_RT_ERR(kernelSize.size() == coorDim, "error");
TV_ASSERT_RT_ERR(outSpatialShape.size() == coorDim, "error");
TV_ASSERT_RT_ERR(stride.size() == coorDim, "error");
TV_ASSERT_RT_ERR(padding.size() == coorDim, "error");
TV_ASSERT_RT_ERR(outPadding.size() == coorDim, "error");
TV_ASSERT_RT_ERR(dilation.size() == coorDim, "error");
auto kernelVolume = kernelSize[0];
for (int i = 1; i < kernelSize.size(); ++i) {
kernelVolume *= kernelSize[i];
}
TV_ASSERT_RT_ERR(kernelVolume <= 4096, "error");
auto outputVolume = outSpatialShape[0];
for (int i = 1; i < outSpatialShape.size(); ++i) {
outputVolume *= outSpatialShape[i];
}
TV_ASSERT_INVALID_ARG(gridOut.numel() >= outputVolume * batchSize, "error");
torch::Tensor indicePairs =
torch::full({kernelVolume, 2, numAct}, -1,
torch::dtype(torch::kInt32).device(indices.device()));
torch::Tensor indiceNum = torch::zeros(
{kernelVolume}, torch::dtype(torch::kInt32).device(indices.device()));
// std::cout << "full time " << timer.report() / 1000.0 << std::endl;
int64_t numActOut = -1;
tv::SimpleVector<int, NDim> outSpatialShape32;
tv::SimpleVector<int, NDim> kernelSize32;
tv::SimpleVector<int, NDim> stride32;
tv::SimpleVector<int, NDim> padding32;
tv::SimpleVector<int, NDim> dilation32;
auto indicePairUnique =
torch::full({indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),
torch::dtype(torch::kInt32).device(indices.device()));
for (int i = 0; i < NDim; ++i) {
outSpatialShape32.push_back(outSpatialShape[i]);
kernelSize32.push_back(kernelSize[i]);
if (subM) {
stride32.push_back(1);
padding32.push_back(kernelSize[i] / 2);
dilation32.push_back(dilation[i]);
} else {
stride32.push_back(stride[i]);
padding32.push_back(padding[i]);
dilation32.push_back(dilation[i]);
}
}
if (subM) {
if (indices.device().type() == torch::kCPU) {
auto getIndicePairFtor =
functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim>();
numActOut = getIndicePairFtor(
tv::CPU(), tv::torch2tv<const int>(indices), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum), kernelSize32,
stride32, padding32, dilation32, outSpatialShape32, transpose);
gridOut.fill_(-1);
} else {
auto getIndicePairFtor =
functor::CreateSubMIndicePairFunctor<tv::GPU, int, int, NDim>();
numActOut = getIndicePairFtor(
tv::TorchGPU(), tv::torch2tv<const int>(indices), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum), kernelSize32,
stride32, padding32, dilation32, outSpatialShape32, transpose, true);
}
return {indices, indicePairs, indiceNum};
} else {
torch::Tensor outInds =
torch::zeros({numAct * kernelVolume, coorDim + 1},
torch::dtype(torch::kInt32).device(indices.device()));
if (indices.device().type() == torch::kCPU) {
auto getIndicePairFtor = functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();
numActOut = getIndicePairFtor(
tv::CPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum), kernelSize32,
stride32, padding32, dilation32, outSpatialShape32, transpose, true);
gridOut.fill_(-1);
} else {
auto getIndicePairFtorP1 =
functor::CreateConvIndicePairFunctorP1<tv::GPU, int, int, NDim>();
auto getIndicePairFtorP2 =
functor::CreateConvIndicePairFunctorP2<tv::GPU, int, int, NDim>();
numActOut =
getIndicePairFtorP1(tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
tv::torch2tv<int>(indicePairUnique), kernelSize32, stride32,
padding32, dilation32, outSpatialShape32, transpose);
if (numActOut > 0) {
auto res = torch::_unique(indicePairUnique);
indicePairUnique = std::get<0>(res);
numActOut = getIndicePairFtorP2(
tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose, true);
}
}
return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum};
}
}
template <typename T>
torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
torch::Tensor indicePairs, torch::Tensor indiceNum,
int64_t numActOut, int64_t _inverse, int64_t _subM) {
bool subM = _subM != 0;
bool inverse = _inverse != 0;
auto device = features.device().type();
auto ndim = filters.dim() - 2;
auto kernelVolume = indicePairs.size(0);
auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto indicePairMaxSizeIter = std::max_element(
indicePairNumCpu.data<int>(), indicePairNumCpu.data<int>() + kernelVolume);
int indicePairMaxOffset = indicePairMaxSizeIter - indicePairNumCpu.data<int>();
int indicePairMaxSize = *indicePairMaxSizeIter;
/*if (_subM){
std::vector<int> indicePairNumVec(indicePairNumCpu.data<int>(), indicePairNumCpu.data<int>() + kernelVolume);
indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
auto indicePairVecMaxSizeIter = std::max_element(
indicePairNumVec.begin(), indicePairNumVec.end());
indicePairMaxSize = *indicePairVecMaxSizeIter;
}*/
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
// auto indicePairOptions =
// torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device());
torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
torch::Tensor inputBuffer = torch::zeros({indicePairMaxSize, numInPlanes}, options);
torch::Tensor outputBuffer =
torch::zeros({indicePairMaxSize, numOutPlanes}, options);
filters = filters.view({-1, numInPlanes, numOutPlanes});
if (subM) { // the center index of subm conv doesn't need gather and
// scatter add.
torch::mm_out(output, features, filters[indicePairMaxOffset]);
}
double totalGatherTime = 0;
double totalGEMMTime = 0;
double totalSAddTime = 0;
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
// auto timer = spconv::CudaContextTimer<>();
auto outputBufferBlob =
torch::from_blob(outputBuffer.data<T>(), {nHot, numOutPlanes}, options);
auto inputBufferBlob =
torch::from_blob(inputBuffer.data<T>(), {nHot, numInPlanes}, options);
if (device == torch::kCPU) {
functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
} else {
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
TV_CHECK_CUDA_ERR();
/* slower than SparseGatherFunctor, may due to int->long conversion
auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
auto indicePairBlob = torch::from_blob(indicePairLong.data<long>(), {nHot},
indicePairOptions);
torch::index_select_out(inputBufferBlob, features, 0,
indicePairBlob);*/
}
// totalGatherTime += timer.report() / 1000.0;
torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
// totalGEMMTime += timer.report() / 1000.0;
if (device == torch::kCPU) {
functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
scatterFtor(tv::CPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(outputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
true);
} else {
functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(outputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
true);
TV_CHECK_CUDA_ERR();
}
// totalSAddTime += timer.report() / 1000.0;
}
// std::cout << "gather time " << totalGatherTime << std::endl;
// std::cout << "gemm time " << totalGEMMTime << std::endl;
// std::cout << "scatteradd time " << totalSAddTime << std::endl;
return output;
}
template <typename T>
std::vector<torch::Tensor>
indiceConvBackward(torch::Tensor features, torch::Tensor filters,
torch::Tensor outGrad, torch::Tensor indicePairs, torch::Tensor indiceNum,
int64_t _inverse, int64_t _subM) {
bool subM = _subM != 0;
bool inverse = _inverse != 0;
auto device = features.device().type();
auto ndim = filters.dim() - 2;
auto kernelVolume = indicePairs.size(0);
auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto indicePairMaxSizeIter = std::max_element(
indicePairNumCpu.data<int>(), indicePairNumCpu.data<int>() + kernelVolume);
int indicePairMaxOffset = indicePairMaxSizeIter - indicePairNumCpu.data<int>();
int indicePairMaxSize = *indicePairMaxSizeIter;
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
auto filterShape = filters.sizes();
torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
torch::Tensor filtersGrad = torch::zeros(filterShape, options);
torch::Tensor inputBuffer = torch::zeros({indicePairMaxSize, numInPlanes}, options);
torch::Tensor outputBuffer =
torch::zeros({indicePairMaxSize, numOutPlanes}, options);
filters = filters.view({-1, numInPlanes, numOutPlanes});
filtersGrad = filtersGrad.view({-1, numInPlanes, numOutPlanes});
if (subM) {
auto filterGradSub = filtersGrad[indicePairMaxOffset];
torch::mm_out(filterGradSub, features.t(), outGrad);
torch::mm_out(inputGrad, outGrad, filters[indicePairMaxOffset].t());
}
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
if (device == torch::kCPU) {
functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtorOut;
gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
gatherFtorOut(tv::CPU(), tv::torch2tv<T>(outputBuffer),
tv::torch2tv<const T>(outGrad),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot);
} else {
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtorOut;
gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
TV_CHECK_CUDA_ERR();
gatherFtorOut(tv::TorchGPU(), tv::torch2tv<T>(outputBuffer),
tv::torch2tv<const T>(outGrad),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot);
TV_CHECK_CUDA_ERR();
}
auto filterGradSub = filtersGrad[i];
auto outputBufferBlob =
torch::from_blob(outputBuffer.data<T>(), {nHot, numOutPlanes}, options);
auto inputBufferBlob =
torch::from_blob(inputBuffer.data<T>(), {nHot, numInPlanes}, options);
torch::mm_out(filterGradSub, inputBufferBlob.t(), outputBufferBlob);
torch::mm_out(inputBufferBlob, outputBufferBlob, filters[i].t());
if (device == torch::kCPU) {
functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
scatterFtor(tv::CPU(), tv::torch2tv<T>(inputGrad),
tv::torch2tv<const T>(inputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
} else {
functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(inputGrad),
tv::torch2tv<const T>(inputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
TV_CHECK_CUDA_ERR();
}
}
return {inputGrad, filtersGrad.view(filterShape)};
}
template <typename T>
torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor filters,
torch::Tensor indicePairs, torch::Tensor indiceNum,
int64_t numActOut, int64_t _inverse, int64_t _subM) {
bool subM = _subM != 0;
bool inverse = _inverse != 0;
auto device = features.device().type();
auto ndim = filters.dim() - 2;
auto kernelVolume = indicePairs.size(0);
auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto totalActsTen = indicePairNumCpu.sum();
auto totalActs = indicePairNumCpu.data<int>()[0];
auto indicePairMaxSizeIter = std::max_element(
indicePairNumCpu.data<int>(), indicePairNumCpu.data<int>() + kernelVolume);
int indicePairMaxOffset = indicePairMaxSizeIter - indicePairNumCpu.data<int>();
int indicePairMaxSize = *indicePairMaxSizeIter;
std::vector<int> indicePairNumVec(indicePairNumCpu.data<int>(),
indicePairNumCpu.data<int>() + kernelVolume);
indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
int subRuleMaxSize = *std::max_element(indicePairNumVec.begin(), indicePairNumVec.end());
if (subM) {
indicePairMaxSize = subRuleMaxSize;
}
auto timer = spconv::CudaContextTimer<>();
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
// auto indicePairOptions =
// torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device());
torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
torch::Tensor inputBuffer =
torch::zeros({kernelVolume, indicePairMaxSize, numInPlanes}, options);
torch::Tensor outputBuffer =
torch::zeros({kernelVolume, indicePairMaxSize, numOutPlanes}, options);
filters = filters.view({-1, numInPlanes, numOutPlanes});
std::cout << "create time " << timer.report()/1000.0 << std::endl;
if (subM) { // the center index of subm conv doesn't need gather and
// scatter add.
torch::mm_out(output, features, filters[indicePairMaxOffset]);
}
double totalGatherTime = 0;
double totalGEMMTime = 0;
double totalSAddTime = 0;
// auto timer = spconv::CudaContextTimer<>();
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
//
auto outputBufferBlob = torch::from_blob(outputBuffer[i].data<T>(),
{nHot, numOutPlanes}, options);
auto inputBufferBlob = torch::from_blob(inputBuffer[i].data<T>(),
{nHot, numInPlanes}, options);
if (device == torch::kCPU) {
functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBufferBlob),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
} else {
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBufferBlob),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
TV_CHECK_CUDA_ERR();
}
// }
// for (int i = 0; i < kernelVolume; ++i) {
// totalGatherTime += timer.report() / 1000.0;
// auto outputBufferBlob = torch::from_blob(outputBuffer[i].data<T>(),
// {nHot, numOutPlanes}, options);
}
// totalGatherTime += timer.report() / 1000.0;
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
auto outputBufferBlob = torch::from_blob(outputBuffer[i].data<T>(),
{nHot, numOutPlanes}, options);
auto inputBufferBlob = torch::from_blob(inputBuffer[i].data<T>(),
{nHot, numInPlanes}, options);
torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
}
// totalGEMMTime += timer.report() / 1000.0;
// totalGEMMTime += timer.report() / 1000.0;
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
auto outputBufferBlob = torch::from_blob(outputBuffer[i].data<T>(),
{nHot, numOutPlanes}, options);
auto inputBufferBlob = torch::from_blob(inputBuffer[i].data<T>(),
{nHot, numInPlanes}, options);
if (device == torch::kCPU) {
functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
scatterFtor(tv::CPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(outputBufferBlob),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
true);
} else {
functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(outputBufferBlob),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
true);
TV_CHECK_CUDA_ERR();
}
// totalSAddTime += timer.report() / 1000.0;
}
// totalSAddTime += timer.report() / 1000.0;
// std::cout << "gather time " << totalGatherTime << std::endl;
// std::cout << "gemm time " << totalGEMMTime << std::endl;
// std::cout << "scatteradd time " << totalSAddTime << std::endl;
return output;
}
} // namespace spconv
#endif
#pragma once
// from tensorflow
namespace tv
{
namespace detail
{
template <typename T>
class KernelLoop
{
struct Iterator
{
__forceinline__ __device__ Iterator(T index, T delta) : index_(index), delta_(delta) {}
__forceinline__ __device__ T operator*() const { return index_; }
__forceinline__ __device__ Iterator &operator++()
{
index_ += delta_;
return *this;
}
__forceinline__ __device__ bool operator!=(const Iterator &other) const
{
bool greater = index_ > other.index_;
bool less = index_ < other.index_;
// Anything past an end iterator (delta_ == 0) is equal.
// In range-based for loops, this optimizes to 'return less'.
if (!other.delta_)
{
return less;
}
if (!delta_)
{
return greater;
}
return less || greater;
}
private:
T index_;
const T delta_;
};
public:
__forceinline__ __device__ KernelLoop(T begin, T delta, T end)
: begin_(begin), delta_(delta), end_(end) {}
__forceinline__ __device__ Iterator begin() const { return Iterator{begin_, delta_}; }
__forceinline__ __device__ Iterator end() const { return Iterator{end_, 0}; }
private:
T begin_;
T delta_;
T end_;
};
} // namespace detail
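// Helper to visit indices in the range 0 <= i < count using the x-coordinate.
// Usage: for(int i : KernelLoopX(count)) { visit(i); }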
template <typename T, int NumILP=1>
__forceinline__ __device__ detail::KernelLoop<T> KernelLoopX(T count)
{
return detail::KernelLoop<T>(blockIdx.x * blockDim.x + threadIdx.x,
gridDim.x * blockDim.x * NumILP, count);
}
// Helper to visit indices in the range 0 <= i < count using the y-coordinate.
// Usage: for(int i : KernelLoopY(count)) { visit(i); }
template <typename T, int NumILP=1>
__forceinline__ __device__ detail::KernelLoop<T> KernelLoopY(T count)
{
return detail::KernelLoop<T>(blockIdx.y * blockDim.y + threadIdx.y,
gridDim.y * blockDim.y * NumILP, count);
}
// Helper to visit indices in the range 0 <= i < count using the z-coordinate.
// Usage: for(int i : KernelLoopZ(count)) { visit(i); }
template <typename T, int NumILP=1>
__forceinline__ __device__ detail::KernelLoop<T> KernelLoopZ(T count)
{
return detail::KernelLoop<T>(blockIdx.z * blockDim.z + threadIdx.z,
gridDim.z * blockDim.z * NumILP, count);
}
} // namespace tv
#pragma once
// from pytorch.aten
#include "tensorview.h"
namespace tv
{
namespace launch
{
template <typename T1, typename T2>
inline int DivUp(const T1 a, const T2 b) { return (a + b - 1) / b; }
// Use 1024 threads per block, which requires cuda sm_2x or above
constexpr int CUDA_NUM_THREADS = 1024;
// CUDA: number of blocks for threads.
inline int getBlocks(const int N)
{
TV_ASSERT_RT_ERR(N > 0, "CUDA kernel launch blocks must be positive, but got N=", N);
return DivUp(N, CUDA_NUM_THREADS);
}
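// Typical launch pattern (illustrative; myKernel is a placeholder name):
//   myKernel<<<getBlocks(N), CUDA_NUM_THREADS, 0, stream>>>(...);
// where the kernel body then walks its indices with tv::KernelLoopX(N).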
} // namespace launch
} // namespace tv
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <cuda_runtime_api.h>
#include <iostream>
#include <memory>
// #include <prettyprint.h>
#include <sstream>
#include <type_traits>
#include <vector>
namespace tv {
#ifdef __NVCC__
#define TV_HOST_DEVICE_INLINE __forceinline__ __device__ __host__
#define TV_DEVICE_INLINE __forceinline__ __device__
#define TV_HOST_DEVICE __device__ __host__
#define TV_ASSERT(expr) assert(expr)
#elif defined(__CUDACC_RTC__)
#define TV_ASSERT(expr) assert(expr)
#define TV_HOST_DEVICE_INLINE __forceinline__ __device__
#define TV_DEVICE_INLINE __forceinline__ __device__
#define TV_HOST_DEVICE __device__ __host__
#else
#define TV_ASSERT(x) assert(x)
#define TV_HOST_DEVICE_INLINE inline
#define TV_HOST_DEVICE
#endif
#define TV_REQUIRE(expr, ...) \
{ \
if (!(expr)) { \
printf(__VA_ARGS__); \
assert(expr); \
} \
}
#define TV_DEVICE_REQUIRE(expr, ...) \
{ \
if (!(expr) && threadIdx.x == 0) \
printf(__VA_ARGS__); \
assert(expr); \
}
template <class SStream, class T> void sstream_print(SStream &ss, T val) {
ss << val;
}
template <class SStream, class T, class... TArgs>
void sstream_print(SStream &ss, T val, TArgs... args) {
ss << val << " ";
sstream_print(ss, args...);
}
#define TV_ASSERT_RT_ERR(expr, ...) \
{ \
if (!(expr)) { \
std::stringstream __macro_s; \
__macro_s << __FILE__ << " " << __LINE__ << "\n"; \
__macro_s << #expr << " assert failed. "; \
tv::sstream_print(__macro_s, __VA_ARGS__); \
throw std::runtime_error(__macro_s.str()); \
} \
}
#define TV_ASSERT_INVALID_ARG(expr, ...) \
{ \
if (!(expr)) { \
std::stringstream __macro_s; \
__macro_s << __FILE__ << " " << __LINE__ << "\n"; \
__macro_s << #expr << " assert failed. "; \
tv::sstream_print(__macro_s, __VA_ARGS__); \
throw std::invalid_argument(__macro_s.str()); \
} \
}
#define TV_CHECK_CUDA_ERR() \
{ \
auto err = cudaGetLastError(); \
if (err != cudaSuccess) { \
std::stringstream __macro_s; \
__macro_s << __FILE__ << " " << __LINE__ << "\n"; \
__macro_s << "cuda execution failed with error " << err; \
throw std::runtime_error(__macro_s.str()); \
} \
}
struct GPU {
GPU(cudaStream_t s = 0) : mStream(s) {}
virtual cudaStream_t getStream() const { return mStream; }
cudaStream_t mStream = 0;
};
struct CPU {};
#define TV_MAX_DIM 6
/*
template <typename T>
constexpr size_t calc_align(size_t ndim)
{
if (ndim * sizeof(T) == 1)
return 1;
else if (ndim * sizeof(T) == 2)
return 2;
else if (ndim * sizeof(T) <= 4 && ndim * sizeof(T) > 2)
return 4;
else if (ndim * sizeof(T) <= 8 && ndim * sizeof(T) > 4)
return 8;
else if (ndim * sizeof(T) <= 16 && ndim * sizeof(T) > 8)
return 16;
else if (ndim * sizeof(T) <= 32 && ndim * sizeof(T) > 16)
return 32;
else
return 64;
}
*/
template <typename T, size_t MaxDim = TV_MAX_DIM>
struct /*alignas(calc_align<T>(MaxDim))*/ SimpleVector {
public:
TV_HOST_DEVICE_INLINE SimpleVector(){};
TV_HOST_DEVICE_INLINE SimpleVector(std::initializer_list<T> q) {
TV_ASSERT(q.size() <= MaxDim);
mSize = 0;
for (T s : q) {
mArray[mSize++] = s;
}
mSize = q.size();
}
SimpleVector(const std::vector<T> &arr) {
TV_ASSERT(arr.size() <= MaxDim);
for (size_t i = 0; i < arr.size(); ++i) {
mArray[i] = arr[i];
}
mSize = arr.size();
}
TV_HOST_DEVICE_INLINE SimpleVector(const SimpleVector<T, MaxDim> &arr) {
TV_ASSERT(arr.size() <= MaxDim);
for (size_t i = 0; i < arr.size(); ++i) {
mArray[i] = arr[i];
}
mSize = arr.size();
}
TV_HOST_DEVICE_INLINE T &operator[](int idx) {
#ifdef TV_DEBUG
TV_ASSERT(idx >= 0 && idx < mSize);
#endif
return mArray[idx];
}
TV_HOST_DEVICE_INLINE const T &operator[](int idx) const {
#ifdef TV_DEBUG
TV_ASSERT(idx >= 0 && idx < mSize);
#endif
return mArray[idx];
}
TV_HOST_DEVICE_INLINE void push_back(T s) {
#ifdef TV_DEBUG
TV_ASSERT(mSize < MaxDim);
#endif
mArray[mSize] = s;
mSize++;
}
TV_HOST_DEVICE_INLINE void pop_back() {
#ifdef TV_DEBUG
TV_ASSERT(mSize > 0);
#endif
mSize--;
}
TV_HOST_DEVICE_INLINE size_t size() const { return mSize; }
TV_HOST_DEVICE_INLINE const T *data() const { return mArray; }
TV_HOST_DEVICE_INLINE bool empty() const { return mSize == 0; }
typedef size_t size_type;
class iterator {
public:
typedef iterator self_type;
typedef T value_type;
typedef T &reference;
typedef T *pointer;
typedef std::forward_iterator_tag iterator_category;
typedef std::ptrdiff_t difference_type;
TV_HOST_DEVICE_INLINE iterator(pointer ptr) : ptr_(ptr) {}
TV_HOST_DEVICE_INLINE self_type operator++(int junk) {
self_type i = *this;
ptr_++;
return i;
}
TV_HOST_DEVICE_INLINE self_type operator++() {
ptr_++;
return *this;
}
TV_HOST_DEVICE_INLINE reference operator*() { return *ptr_; }
TV_HOST_DEVICE_INLINE pointer operator->() { return ptr_; }
TV_HOST_DEVICE_INLINE bool operator==(const self_type &rhs) {
return ptr_ == rhs.ptr_;
}
TV_HOST_DEVICE_INLINE bool operator!=(const self_type &rhs) {
return ptr_ != rhs.ptr_;
}
private:
pointer ptr_;
};
class const_iterator {
public:
typedef const_iterator self_type;
typedef T value_type;
typedef const T &reference;
typedef const T *pointer;
typedef std::ptrdiff_t difference_type;
typedef std::forward_iterator_tag iterator_category;
TV_HOST_DEVICE_INLINE const_iterator(pointer ptr) : ptr_(ptr) {}
TV_HOST_DEVICE_INLINE self_type operator++(int junk) {
self_type i = *this;
ptr_++;
return i;
}
TV_HOST_DEVICE_INLINE self_type operator++() {
ptr_++;
return *this;
}
TV_HOST_DEVICE_INLINE reference operator*() { return *ptr_; }
TV_HOST_DEVICE_INLINE pointer operator->() { return ptr_; }
TV_HOST_DEVICE_INLINE bool operator==(const self_type &rhs) {
return ptr_ == rhs.ptr_;
}
TV_HOST_DEVICE_INLINE bool operator!=(const self_type &rhs) {
return ptr_ != rhs.ptr_;
}
private:
pointer ptr_;
};
TV_HOST_DEVICE_INLINE iterator begin() { return iterator(mArray); }
TV_HOST_DEVICE_INLINE iterator end() { return iterator(mArray + mSize); }
TV_HOST_DEVICE_INLINE const_iterator begin() const {
return const_iterator(mArray);
}
TV_HOST_DEVICE_INLINE const_iterator end() const {
return const_iterator(mArray + mSize);
}
TV_HOST_DEVICE_INLINE const_iterator cbegin() const {
return const_iterator(mArray);
}
TV_HOST_DEVICE_INLINE const_iterator cend() const {
return const_iterator(mArray + mSize);
}
protected:
T mArray[MaxDim];
size_t mSize = 0;
};
template <typename T, size_t MaxDim>
bool operator==(const SimpleVector<T, MaxDim> &lfs,
const SimpleVector<T, MaxDim> &rfs) {
if (lfs.size() != rfs.size())
return false;
for (size_t i = 0; i < lfs.size(); ++i) {
if (lfs[i] != rfs[i])
return false;
}
return true;
}
template <typename T, size_t MaxDim>
bool operator!=(const SimpleVector<T, MaxDim> &lfs,
const SimpleVector<T, MaxDim> &rfs) {
return !(lfs == rfs);
}
struct Slice {
template <class... Integers> TV_HOST_DEVICE_INLINE Slice(Integers... ints) {
static_assert(sizeof...(ints) <= 3, "slice init must not have more than 3 values");
SimpleVector<int, 3> slices{int(ints)...};
mSlices[0] = -1;
mSlices[1] = -1;
mSlices[2] = -1;
for (size_t i = 0; i < slices.size(); ++i) {
mSlices[i] = slices[i];
}
}
TV_HOST_DEVICE_INLINE Slice() {
mSlices[0] = -1;
mSlices[1] = -1;
mSlices[2] = -1;
}
template <typename T>
TV_HOST_DEVICE_INLINE Slice(std::initializer_list<T> slice) {
mSlices[0] = -1;
mSlices[1] = -1;
mSlices[2] = -1;
TV_ASSERT(slice.size() <= 3);
int idx = 0;
for (T s : slice) {
mSlices[idx] = int(s);
++idx;
}
}
TV_HOST_DEVICE_INLINE int &operator[](int idx) {
#ifdef TV_DEBUG
TV_ASSERT(idx >= 0 && idx < 3);
#endif
return mSlices[idx];
}
TV_HOST_DEVICE_INLINE const int &operator[](int idx) const {
#ifdef TV_DEBUG
TV_ASSERT(idx >= 0 && idx < 3);
#endif
return mSlices[idx];
}
protected:
int mSlices[3];
};
template <size_t MaxDim = TV_MAX_DIM>
struct ShapeBase : public SimpleVector<int, MaxDim> {
TV_HOST_DEVICE_INLINE ShapeBase() : SimpleVector<int, MaxDim>(){};
TV_HOST_DEVICE_INLINE ShapeBase(std::initializer_list<int> shape)
: SimpleVector<int, MaxDim>(shape) {}
template <typename T, template <class...> class Container>
ShapeBase(Container<T> shape) : SimpleVector<int, MaxDim>(shape) {}
TV_HOST_DEVICE_INLINE ShapeBase(const ShapeBase<MaxDim> &shape)
: SimpleVector<int, MaxDim>(shape) {}
ShapeBase(const std::vector<int> &arr) : SimpleVector<int, MaxDim>(arr) {}
ShapeBase<MaxDim> &operator=(const ShapeBase<MaxDim> &shape) = default;
TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> subshape(int start, int end) const {
#ifdef TV_DEBUG
TV_ASSERT(start >= 0 && end < this->mSize && end > start);
#endif
ShapeBase<MaxDim> shape;
for (int i = start; i < end; ++i) {
shape.push_back(this->mArray[i]);
}
return shape;
}
TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> subshape(int start) const {
#ifdef TV_DEBUG
TV_ASSERT(start >= 0 && start <= this->mSize);
#endif
ShapeBase<MaxDim> shape;
for (int i = start; i < this->mSize; ++i) {
shape.push_back(this->mArray[i]);
}
return shape;
}
TV_HOST_DEVICE_INLINE size_t size() const {
if (this->mSize == 0)
return 0;
size_t s = 1;
for (int i = 0; i < int(this->mSize); ++i) {
s *= this->mArray[i];
}
return s;
}
TV_HOST_DEVICE_INLINE size_t ndim() const { return this->mSize; }
TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> squeeze() const {
ShapeBase<MaxDim> shape;
for (int i = 0; i < this->mSize; ++i) {
if (this->mArray[i] != 1)
shape.push_back(this->mArray[i]);
}
return shape;
}
TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> squeeze(int dim) const {
ShapeBase<MaxDim> shape;
for (int i = 0; i < this->mSize; ++i) {
if (i != dim || this->mArray[i] != 1)
shape.push_back(this->mArray[i]);
}
return shape;
}
};
using Shape = ShapeBase<TV_MAX_DIM>;
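// Example: for Shape{2, 3, 4}, ndim() == 3, size() == 24 (product of the dims),
// and squeeze() drops dimensions equal to 1.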
template <class... Inds>
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(std::vector<int> &shape,
Inds... indexes) {
unsigned offset = 0;
unsigned m = 1;
int indexes_vec[sizeof...(indexes)] = {indexes...};
#ifdef TV_DEBUG
TV_ASSERT(sizeof...(indexes) == shape.size());
#endif
#pragma unroll
for (int i = sizeof...(indexes) - 1; i >= 0; --i) {
offset += m * indexes_vec[i];
m *= shape[i];
}
return offset;
}
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(std::vector<int> &shape,
std::vector<int> &indexes_vec) {
unsigned offset = 0;
unsigned m = 1;
for (int i = shape.size() - 1; i >= 0; --i) {
offset += m * indexes_vec[i];
m *= shape[i];
}
return offset;
}
template <class... Inds>
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(const Shape &shape,
Inds... indexes) {
unsigned offset = 0;
unsigned m = 1;
int indexes_vec[sizeof...(indexes)] = {indexes...};
#pragma unroll
for (int i = sizeof...(indexes) - 1; i >= 0; --i) {
offset += m * indexes_vec[i];
m *= shape[i];
}
return offset;
}
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(const Shape &shape,
const Shape &indexes_vec) {
unsigned offset = 0;
unsigned m = 1;
for (int i = indexes_vec.ndim() - 1; i >= 0; --i) {
offset += m * indexes_vec[i];
m *= shape[i];
}
return offset;
}
template <typename Index, unsigned NDim>
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(const Index *indexes,
const Index *shape) {
unsigned offset = 0;
unsigned m = 1;
#pragma unroll
for (int i = NDim - 1; i >= 0; --i) {
offset += m * indexes[i];
m *= shape[i];
}
return offset;
}
template <typename Index, unsigned NDim>
TV_HOST_DEVICE_INLINE Index rowArrayIdxInv(Index index, Index *output,
const Index *shape) {
#pragma unroll
for (int i = NDim - 1; i >= 0; --i) {
output[i] = index % shape[i];
index -= output[i];
index /= shape[i];
}
return index;
}
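// rowArrayIdxInv inverts rowArrayIdx: it decomposes a linear row-major offset
// into per-dimension indices (written to `output`) and returns whatever remains
// after the most significant dimension (0 for in-range offsets).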
template <int N> struct ArrayIndexRowMajor {
// mPtr[((i1 * mShape[1] + i2) * mShape[2] + i3) * mShape[3] + i4];
TV_HOST_DEVICE_INLINE static unsigned run(const Shape &shape,
const Shape &indexes) {
return indexes[N - 1] +
shape[N - 1] * ArrayIndexRowMajor<N - 1>::run(shape, indexes);
}
};
template <> struct ArrayIndexRowMajor<0> {
TV_HOST_DEVICE_INLINE static unsigned run(const Shape &shape,
const Shape &indexes) {
return 0;
}
};
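// For N == 4 the recursion above unrolls to
//   ((i0 * shape[1] + i1) * shape[2] + i2) * shape[3] + i3,
// i.e. the standard row-major (C-order) offset; shape[0] never enters the product.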
namespace detail {
template <typename T> constexpr const char *simpleTypeName(T val = T());
template <> constexpr const char *simpleTypeName(float val) {
return "float32";
}
template <> constexpr const char *simpleTypeName(double val) {
return "float64";
}
template <> constexpr const char *simpleTypeName(int val) { return "int32"; }
template <> constexpr const char *simpleTypeName(unsigned val) {
return "uint32";
}
template <> constexpr const char *simpleTypeName(long val) { return "int64"; }
template <> constexpr const char *simpleTypeName(unsigned long val) {
return "uint64";
}
}; // namespace detail
template <typename T, int Rank = -1> struct TensorView {
TV_HOST_DEVICE_INLINE TensorView() {}
explicit TV_HOST_DEVICE_INLINE TensorView(T *ptr, Shape shape)
: mPtr(ptr), mShape(shape) {}
// explicit TV_HOST_DEVICE_INLINE TensorView(const
// TensorView<std::remove_const_t<T>> &tview) : mPtr(tview.data()),
// mShape(tview.shape()) {}
template <class... Integers>
explicit TV_HOST_DEVICE_INLINE TensorView(T *ptr, Integers... shapes)
: mPtr(ptr) {
mShape = {int(shapes)...};
}
TV_HOST_DEVICE_INLINE TensorView<T, Rank> &
assign(const TensorView<T, Rank> &tensor) {
TV_REQUIRE(tensor.shape() == shape(), "you must provide same input size%s",
"\n");
T *ptr = mPtr;
const T *other_ptr = tensor.data();
for (size_t i = 0; i < size(); ++i)
*(ptr++) = *(other_ptr++);
return *this;
}
template <typename T1>
TV_HOST_DEVICE_INLINE TensorView<T, Rank> &
assign(std::initializer_list<T1> seq) {
TV_REQUIRE(seq.size() == size(), "you must provide same input size%s",
"\n");
T *ptr = mPtr;
for (const T1 &s : seq)
*(ptr++) = T(s);
return *this;
}
template <class... Inds> TV_HOST_DEVICE_INLINE T &operator()(Inds... inds) {
#ifdef TV_DEBUG
int idxes[sizeof...(Inds)]{int(inds)...};
TV_REQUIRE(sizeof...(inds) == mShape.ndim(),
"you provide %d indexes, but dim is %d\n", sizeof...(inds),
mShape.ndim());
for (int i = 0; i < sizeof...(inds); ++i) {
TV_REQUIRE(idxes[i] >= 0 && idxes[i] < mShape[i],
"index-%d(%d) out-of-range: [0, %d)\n", i, idxes[i],
mShape[i]);
}
#endif
return mPtr[rowArrayIdx(mShape, int(inds)...)];
}
template <class... Inds>
TV_HOST_DEVICE_INLINE const T &operator()(Inds... inds) const {
#ifdef TV_DEBUG
int idxes[sizeof...(Inds)]{int(inds)...};
TV_REQUIRE(sizeof...(inds) == mShape.ndim(),
"you provide %d indexes, but dim is %d\n", sizeof...(inds),
mShape.ndim());
for (int i = 0; i < sizeof...(inds); ++i) {
TV_REQUIRE(idxes[i] >= 0 && idxes[i] < mShape[i],
"index-%d(%d) out-of-range: [0, %d)\n", i, idxes[i],
mShape[i]);
}
#endif
return mPtr[rowArrayIdx(mShape, int(inds)...)];
}
TV_HOST_DEVICE_INLINE T &operator()() {
#if defined TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mPtr != nullptr,
"you want get value but the view is empty.%s", "\n");
TV_DEVICE_REQUIRE(mShape.ndim() == 0,
"you provide 0 indexes, but dim is %ld\n", mShape.ndim());
#else
TV_REQUIRE(mPtr != nullptr, "you want get value but the view is empty.%s",
"\n");
TV_REQUIRE(mShape.ndim() == 0, "you provide 0 indexes, but dim is %ld\n",
mShape.ndim());
#endif
#endif
return mPtr[0];
}
TV_HOST_DEVICE_INLINE const T &operator()() const {
#if defined TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mPtr != nullptr,
"you want get value but the view is empty.%s", "\n");
TV_DEVICE_REQUIRE(mShape.ndim() == 0,
"you provide 0 indexes, but dim is %ld\n", mShape.ndim());
#else
TV_REQUIRE(mPtr != nullptr, "you want get value but the view is empty.%s",
"\n");
TV_REQUIRE(mShape.ndim() == 0, "you provide 0 indexes, but dim is %ld\n",
mShape.ndim());
#endif
#endif
return mPtr[0];
}
template <class T1> TV_HOST_DEVICE_INLINE T &operator()(T1 i1) {
#if defined TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 1,
"you provide 1 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, i1, mShape[0]);
#else
TV_REQUIRE(mShape.ndim() == 1, "you provide 1 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, i1, mShape[0]);
#endif
#endif
return mPtr[i1];
}
template <class T1, class T2>
TV_HOST_DEVICE_INLINE T &operator()(T1 i1, T2 i2) {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 2,
"you provide 2 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
mShape[1]);
#else
TV_REQUIRE(mShape.ndim() == 2, "you provide 2 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
#endif
#endif
return mPtr[i1 * mShape[1] + i2];
}
template <class T1, class T2, class T3>
TV_HOST_DEVICE_INLINE T &operator()(T1 i1, T2 i2, T3 i3) {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 3,
"you provide 3 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
mShape[1]);
TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3),
mShape[2]);
#else
TV_REQUIRE(mShape.ndim() == 3, "you provide 3 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
TV_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), mShape[2]);
#endif
#endif
return mPtr[(i1 * mShape[1] + i2) * mShape[2] + i3];
}
template <class T1, class T2, class T3, class T4>
TV_HOST_DEVICE_INLINE T &operator()(T1 i1, T2 i2, T3 i3, T4 i4) {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 4,
"you provide 4 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
mShape[1]);
TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3),
mShape[2]);
TV_DEVICE_REQUIRE(i4 >= 0 && i4 < mShape[3],
"index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4),
mShape[3]);
#else
TV_REQUIRE(mShape.ndim() == 4, "you provide 4 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
TV_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), mShape[2]);
TV_REQUIRE(i4 >= 0 && i4 < mShape[3],
"index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4), mShape[3]);
#endif
#endif
return mPtr[((i1 * mShape[1] + i2) * mShape[2] + i3) * mShape[3] + i4];
}
template <class T1> TV_HOST_DEVICE_INLINE const T &operator()(T1 i1) const {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 1,
"you provide 1 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
#else
TV_REQUIRE(mShape.ndim() == 1, "you provide 1 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
#endif
#endif
return mPtr[i1];
}
template <class T1, class T2>
TV_HOST_DEVICE_INLINE const T &operator()(T1 i1, T2 i2) const {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 2,
"you provide 2 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
mShape[1]);
#else
TV_REQUIRE(mShape.ndim() == 2, "you provide 2 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
#endif
#endif
return mPtr[i1 * mShape[1] + i2];
}
template <class T1, class T2, class T3>
TV_HOST_DEVICE_INLINE const T &operator()(T1 i1, T2 i2, T3 i3) const {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 3,
"you provide 3 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
mShape[1]);
TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3),
mShape[2]);
#else
TV_REQUIRE(mShape.ndim() == 3, "you provide 3 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
TV_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), mShape[2]);
#endif
#endif
return mPtr[(i1 * mShape[1] + i2) * mShape[2] + i3];
}
template <class T1, class T2, class T3, class T4>
TV_HOST_DEVICE_INLINE const T &operator()(T1 i1, T2 i2, T3 i3, T4 i4) const {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 4,
"you provide 4 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
mShape[1]);
TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3),
mShape[2]);
TV_DEVICE_REQUIRE(i4 >= 0 && i4 < mShape[3],
"index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4),
mShape[3]);
#else
TV_REQUIRE(mShape.ndim() == 4, "you provide 4 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
TV_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), mShape[2]);
TV_REQUIRE(i4 >= 0 && i4 < mShape[3],
"index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4), mShape[3]);
#endif
#endif
return mPtr[((i1 * mShape[1] + i2) * mShape[2] + i3) * mShape[3] + i4];
}
TV_HOST_DEVICE_INLINE T &operator[](int idx) {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(idx >= 0 && idx < size(),
"index(%d) out-of-range: [0, %ld)\n", int(idx), size());
#else
TV_REQUIRE(idx >= 0 && idx < size(), "index(%d) out-of-range: [0, %ld)\n",
int(idx), size());
#endif
#endif
return mPtr[idx];
}
  // TODO: this conflicts with operator[](SimpleVector<Slice> slice_vec).
/*TV_HOST_DEVICE_INLINE T &operator[](const Shape index) {
int idx = rowArrayIdx(mShape, index);
#ifdef TV_DEBUG
TV_REQUIRE(idx >= 0 && idx < size(), "index(%d) out-of-range: [0, %ld)\n",
int(idx), size());
#endif
return mPtr[idx];
}
TV_HOST_DEVICE_INLINE const T &operator[](const Shape index) const {
int idx = rowArrayIdx(mShape, index);
#ifdef TV_DEBUG
TV_REQUIRE(idx >= 0 && idx < size(), "index(%d) out-of-range: [0, %ld)\n",
int(idx), size());
#endif
return mPtr[idx];
}*/
TV_HOST_DEVICE_INLINE TensorView<T, Rank>
operator[](SimpleVector<Slice> slice_vec) {
return _subview(slice_vec);
}
TV_HOST_DEVICE_INLINE const TensorView<T, Rank>
operator[](SimpleVector<Slice> slice_vec) const {
return _subview(slice_vec);
}
TV_HOST_DEVICE_INLINE bool empty() const { return mPtr == nullptr; }
TV_HOST_DEVICE_INLINE T *data() { return mPtr; }
TV_HOST_DEVICE_INLINE const T *data() const { return mPtr; }
TV_HOST_DEVICE_INLINE const Shape &shape() const { return mShape; }
TV_HOST_DEVICE_INLINE int dim(int idx) const { return mShape[idx]; }
TV_HOST_DEVICE_INLINE int ndim() const { return mShape.ndim(); }
template <class... Inds>
TV_HOST_DEVICE_INLINE TensorView<T, Rank> &reshape(Inds... newShapes) {
Shape shapes{int(newShapes)...};
TV_ASSERT(shapes.size() == size());
mShape = shapes;
return *this;
}
TV_HOST_DEVICE_INLINE TensorView<T, Rank> &reshape(Shape shapes) {
TV_ASSERT(shapes.size() == size());
mShape = shapes;
return *this;
}
template <class... Inds>
TV_HOST_DEVICE_INLINE TensorView<T, Rank> view(Inds... newShapes) const {
Shape shapes{int(newShapes)...};
for (size_t i = 0; i < shapes.ndim(); ++i) {
      if (shapes[i] == -1) {
        // Infer this dimension: temporarily set it to 1 so shapes.size() gives
        // the product of the known dims, then divide the total element count.
        shapes[i] = 1;
        shapes[i] = size() / shapes.size();
        break;
}
}
TV_ASSERT(shapes.size() == size());
return TensorView<T, Rank>(mPtr, shapes);
}
TV_HOST_DEVICE_INLINE TensorView<T, Rank> view(Shape shapes) const {
TV_ASSERT(shapes.size() == size());
return TensorView<T, Rank>(mPtr, shapes);
}
TV_HOST_DEVICE_INLINE TensorView<T, Rank> squeeze() const {
return TensorView<T, Rank>(mPtr, mShape.squeeze());
}
TV_HOST_DEVICE_INLINE TensorView<T, Rank> squeeze(int dim) const {
return TensorView<T, Rank>(mPtr, mShape.squeeze(dim));
}
TV_HOST_DEVICE_INLINE size_t size() const { return mShape.size(); }
template <class... Slices>
TV_HOST_DEVICE_INLINE TensorView<T, Rank> subview(Slice slice,
Slices... slices) const {
return subview<float, Slice, Slices...>(slice, slices...);
}
template <class T2 = float, class... Slices>
TV_HOST_DEVICE_INLINE TensorView<T, Rank> subview(Slices... slices) const {
Slice slice_vec[sizeof...(Slices)] = {to_slice(slices)...};
Shape new_shape{to_slice(slices)[0]...};
Shape start{to_slice(slices)[0]...};
TV_ASSERT(new_shape.ndim() <= mShape.ndim());
TV_ASSERT(new_shape.ndim() != 0);
size_t idxsize = new_shape.ndim();
for (size_t i = idxsize; i < mShape.ndim(); ++i) {
new_shape.push_back(0);
start.push_back(0);
}
#pragma unroll
for (size_t i = 0; i < sizeof...(Slices); ++i) {
if (slice_vec[i][1] != -1) {
new_shape[i] = slice_vec[i][1] - slice_vec[i][0];
TV_ASSERT(new_shape[i] >= 0);
} else {
new_shape[i] = 1; // reduce dim
}
}
auto offset = rowArrayIdx(mShape, start);
#pragma unroll
for (size_t i = sizeof...(Slices); i < mShape.ndim(); ++i) {
new_shape[i] = mShape[i];
TV_ASSERT(new_shape[i] >= 0);
}
Shape reduced_shape;
#pragma unroll
for (size_t i = 0; i < sizeof...(Slices); ++i) {
if (slice_vec[i][1] != -1) {
reduced_shape.push_back(new_shape[i]);
}
}
#pragma unroll
for (size_t i = sizeof...(Slices); i < mShape.ndim(); ++i) {
reduced_shape.push_back(new_shape[i]);
}
return TensorView<T, Rank>(mPtr + offset, reduced_shape);
}
template <class... Integers>
TV_HOST_DEVICE_INLINE TensorView<T, Rank> subview(int id, Integers... ints) {
Shape start = {id, ints...};
for (int i = 1 + sizeof...(ints); i < ndim(); ++i) {
start.push_back(0);
}
return TensorView<T, Rank>(mPtr + rowArrayIdx(mShape, start),
mShape.subshape(sizeof...(ints) + 1));
}
std::string repr() const {
std::ostringstream ss;
if (empty())
return "";
if (mShape.ndim() == 0) {
ss << *mPtr;
// ss << fmt::format("\nTensor: shape={}, dtype={}", mShape,
// detail::simpleTypeName<T>());
ss << "Tensor: dtype=" << detail::simpleTypeName<T>();
return ss.str();
}
Shape counter = mShape;
auto tensor_flat = this->view(-1);
for (int i = 0; i < counter.ndim(); ++i) {
counter[i] = 0;
ss << "[";
}
for (size_t i = 0; i < this->size(); ++i) {
ss << tensor_flat(rowArrayIdx(mShape, counter));
counter[counter.ndim() - 1] += 1;
int inc_count = 0;
bool print_comma = true;
for (int c = counter.ndim() - 1; c >= 0; --c) {
if (counter[c] == this->dim(c) && c > 0) {
++inc_count;
counter[c - 1] += 1;
counter[c] = 0;
print_comma = false;
}
}
if (print_comma && i != this->size() - 1)
ss << ", ";
for (int j = 0; j < inc_count; ++j) {
ss << "]";
}
if (i != this->size() - 1) {
if (inc_count != 0)
ss << "\n";
for (int j = 0; j < inc_count; ++j) {
ss << "[";
}
}
}
ss << "]";
// ss << fmt::format("\nTensor: shape={}, dtype={}", mShape,
// detail::simpleTypeName<T>());
ss << "Tensor: dtype=" << detail::simpleTypeName<T>();
return ss.str();
}
protected:
// TODO: make this function public.
// currently this function is called unexpectedly when using subview({0, 0}).
TV_HOST_DEVICE_INLINE TensorView<T, Rank>
_subview(SimpleVector<Slice> slice_vec) {
Shape new_shape;
for (int i = 0; i < slice_vec.size(); ++i) {
new_shape.push_back(slice_vec[i][0]);
}
Shape start = new_shape;
TV_ASSERT(new_shape.ndim() <= mShape.ndim());
TV_ASSERT(new_shape.ndim() != 0);
size_t idxsize = new_shape.ndim();
for (size_t i = idxsize; i < mShape.ndim(); ++i) {
new_shape.push_back(0);
start.push_back(0);
}
for (size_t i = 0; i < slice_vec.size(); ++i) {
if (slice_vec[i][1] != -1) {
new_shape[i] = slice_vec[i][1] - slice_vec[i][0];
TV_ASSERT(new_shape[i] >= 0);
} else {
new_shape[i] = 1; // reduce dim
}
}
auto offset = rowArrayIdx(mShape, start);
for (size_t i = slice_vec.size(); i < mShape.ndim(); ++i) {
new_shape[i] = mShape[i];
TV_ASSERT(new_shape[i] >= 0);
}
Shape reduced_shape;
for (size_t i = 0; i < slice_vec.size(); ++i) {
if (slice_vec[i][1] != -1) {
reduced_shape.push_back(new_shape[i]);
}
}
for (size_t i = slice_vec.size(); i < mShape.ndim(); ++i) {
reduced_shape.push_back(new_shape[i]);
}
return TensorView<T, Rank>(mPtr + offset, reduced_shape);
}
template <typename T1> TV_HOST_DEVICE_INLINE Slice to_slice(T1 s) const {
return Slice{int(s), -1, -1};
}
TV_HOST_DEVICE_INLINE Slice to_slice(Slice s) const { return Slice(s); }
T *mPtr = nullptr;
Shape mShape;
};
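// Usage sketch (host side): wrap an existing buffer without copying it.
//   float data[6] = {1, 2, 3, 4, 5, 6};
//   TensorView<float> view(data, 2, 3);  // shape (2, 3), row-major
//   view(1, 2) = 42.0f;                  // same element as data[1 * 3 + 2]
//   auto row1 = view.subview(1);         // 1-D view of the second row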
template <typename Os, typename T, int Rank>
Os &operator<<(Os &os, const TensorView<T, Rank> &dt) {
os << dt.repr();
return os;
}
template <typename Os, typename T, int Rank>
Os &operator<<(Os &os, const TensorView<const T, Rank> &dt) {
os << dt.repr();
return os;
}
namespace detail {
template <typename T> constexpr const char *printfTypeFormat(T val = T());
template <> constexpr const char *printfTypeFormat(float val) { return "%.2f"; }
template <> constexpr const char *printfTypeFormat(double val) {
return "%.2f";
}
template <> constexpr const char *printfTypeFormat(int val) { return "%d"; }
template <> constexpr const char *printfTypeFormat(unsigned val) {
return "%u";
}
template <> constexpr const char *printfTypeFormat(long val) { return "%ld"; }
template <> constexpr const char *printfTypeFormat(unsigned long val) {
return "%lu";
}
}; // namespace detail
template <typename T>
TV_HOST_DEVICE void printTensorView(const TensorView<T> tensor,
const char *format) {
if (tensor.empty())
return;
if (tensor.ndim() == 0) {
printf(format, tensor());
printf("\n");
return;
}
Shape counter = tensor.shape();
auto tensor_flat = tensor.view(-1);
for (int i = 0; i < counter.ndim(); ++i) {
counter[i] = 0;
printf("[");
}
for (size_t i = 0; i < tensor.size(); ++i) {
printf(format, tensor_flat(rowArrayIdx(tensor.shape(), counter)));
counter[counter.ndim() - 1] += 1;
int inc_count = 0;
bool print_comma = true;
for (int c = counter.ndim() - 1; c >= 0; --c) {
if (counter[c] == tensor.dim(c) && c > 0) {
++inc_count;
counter[c - 1] += 1;
counter[c] = 0;
print_comma = false;
}
}
if (print_comma && i != tensor.size() - 1)
printf(", ");
for (int j = 0; j < inc_count; ++j) {
printf("]");
}
if (i != tensor.size() - 1) {
if (inc_count != 0)
printf("\n");
for (int j = 0; j < inc_count; ++j) {
printf("[");
}
}
}
printf("]\n");
}
template <typename T>
TV_HOST_DEVICE void printTensorView(TensorView<T> tensor) {
using Traw = typename std::remove_const<T>::type;
return printTensorView(tensor, detail::printfTypeFormat<Traw>());
}
template <typename T>
TV_HOST_DEVICE void printTensorView(const T *ptr, Shape shape) {
using Traw = typename std::remove_const<T>::type;
return printTensorView(TensorView<const T>(ptr, shape),
detail::printfTypeFormat<Traw>());
}
template <typename T>
TV_HOST_DEVICE void printTensorView(const T *ptr, Shape shape,
const char *format) {
return printTensorView(TensorView<const T>(ptr, shape), format);
}
} // namespace tv
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <tensorview/tensorview.h>
#include <torch/script.h>
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
namespace tv {
struct TorchGPU: public tv::GPU {
virtual cudaStream_t getStream() const override {
return at::cuda::getCurrentCUDAStream();
}
};
template <typename T> void check_torch_dtype(const torch::Tensor &tensor) {
  switch (tensor.type().scalarType()) {
  case at::ScalarType::Double: {
    auto val = std::is_same<std::remove_const_t<T>, double>::value;
    TV_ASSERT_RT_ERR(val, "tensor dtype (double) doesn't match expected type");
    break;
  }
  case at::ScalarType::Float: {
    auto val = std::is_same<std::remove_const_t<T>, float>::value;
    TV_ASSERT_RT_ERR(val, "tensor dtype (float) doesn't match expected type");
    break;
  }
  case at::ScalarType::Int: {
    auto val = std::is_same<std::remove_const_t<T>, int>::value;
    TV_ASSERT_RT_ERR(val, "tensor dtype (int) doesn't match expected type");
    break;
  }
  case at::ScalarType::Half: {
    auto val = std::is_same<std::remove_const_t<T>, at::Half>::value;
    TV_ASSERT_RT_ERR(val, "tensor dtype (half) doesn't match expected type");
    break;
  }
  case at::ScalarType::Long: {
    auto val = std::is_same<std::remove_const_t<T>, long>::value;
    TV_ASSERT_RT_ERR(val, "tensor dtype (long) doesn't match expected type");
    break;
  }
  default:
    TV_ASSERT_RT_ERR(false, "unsupported tensor dtype");
}
}
template <typename T>
tv::TensorView<T> torch2tv(const torch::Tensor &tensor) {
check_torch_dtype<T>(tensor);
tv::Shape shape;
for (auto i : tensor.sizes()) {
shape.push_back(i);
}
return tv::TensorView<T>(tensor.data<std::remove_const_t<T>>(), shape);
}
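// Usage sketch (assuming `tensor` is a contiguous float tensor):
//   auto view = torch2tv<const float>(tensor);
// This wraps the tensor's storage without copying; view.shape() mirrors
// tensor.sizes().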
} // namespace tv
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <chrono>
#include <cuda_runtime_api.h>
#include <iostream>
namespace spconv {
template <typename TimeT = std::chrono::microseconds> struct CudaContextTimer {
CudaContextTimer() {
cudaDeviceSynchronize();
mCurTime = std::chrono::steady_clock::now();
}
typename TimeT::rep report() {
cudaDeviceSynchronize();
auto duration = std::chrono::duration_cast<TimeT>(
std::chrono::steady_clock::now() - mCurTime);
auto res = duration.count();
mCurTime = std::chrono::steady_clock::now();
return res;
}
private:
std::chrono::time_point<std::chrono::steady_clock> mCurTime;
};
template <typename TimeT = std::chrono::microseconds> struct CPUTimer {
CPUTimer() { mCurTime = std::chrono::steady_clock::now(); }
typename TimeT::rep report() {
auto duration = std::chrono::duration_cast<TimeT>(
std::chrono::steady_clock::now() - mCurTime);
auto res = duration.count();
mCurTime = std::chrono::steady_clock::now();
return res;
}
private:
std::chrono::time_point<std::chrono::steady_clock> mCurTime;
};
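// Usage sketch (mirrors the commented-out timing code in indice.cu):
//   auto timer = spconv::CudaContextTimer<>();
//   // ... launch kernels ...
//   std::cout << timer.report() / 1000.0 << " ms" << std::endl;  // report() returns microseconds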
} // namespace spconv
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
from collections import OrderedDict
import torch
from torch import nn
from .structure import SparseConvTensor
def is_spconv_module(module):
spconv_modules = (SparseModule, )
return isinstance(module, spconv_modules)
def is_sparse_conv(module):
from .conv import SparseConvolution
return isinstance(module, SparseConvolution)
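# Incremental running-average update used below: with t previous samples,
#   new_mean = t / (t + 1) * old_mean + 1 / (t + 1) * new_value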
def _mean_update(vals, m_vals, t):
outputs = []
if not isinstance(vals, list):
vals = [vals]
if not isinstance(m_vals, list):
m_vals = [m_vals]
for val, m_val in zip(vals, m_vals):
output = t / float(t + 1) * m_val + 1 / float(t + 1) * val
outputs.append(output)
if len(outputs) == 1:
outputs = outputs[0]
return outputs
class SparseModule(nn.Module):
""" place holder,
All module subclass from this will take sptensor in SparseSequential.
"""
pass
class SparseSequential(SparseModule):
r"""A sequential container.
Modules will be added to it in the order they are passed in the
constructor.
Alternatively, an ordered dict of modules can also be passed in.
    To make it easier to understand, here is a small example::
# Example of using Sequential
model = SparseSequential(
SparseConv2d(1,20,5),
nn.ReLU(),
SparseConv2d(20,64,5),
nn.ReLU()
)
# Example of using Sequential with OrderedDict
model = SparseSequential(OrderedDict([
('conv1', SparseConv2d(1,20,5)),
('relu1', nn.ReLU()),
('conv2', SparseConv2d(20,64,5)),
('relu2', nn.ReLU())
]))
# Example of using Sequential with kwargs(python 3.6+)
model = SparseSequential(
conv1=SparseConv2d(1,20,5),
relu1=nn.ReLU(),
conv2=SparseConv2d(20,64,5),
relu2=nn.ReLU()
)
"""
def __init__(self, *args, **kwargs):
super(SparseSequential, self).__init__()
if len(args) == 1 and isinstance(args[0], OrderedDict):
for key, module in args[0].items():
self.add_module(key, module)
else:
for idx, module in enumerate(args):
self.add_module(str(idx), module)
for name, module in kwargs.items():
if sys.version_info < (3, 6):
raise ValueError('kwargs only supported in py36+')
if name in self._modules:
raise ValueError('name exists.')
self.add_module(name, module)
self._sparity_dict = {}
def __getitem__(self, idx):
if not (-len(self) <= idx < len(self)):
raise IndexError('index {} is out of range'.format(idx))
if idx < 0:
idx += len(self)
it = iter(self._modules.values())
for i in range(idx):
next(it)
return next(it)
def __len__(self):
return len(self._modules)
@property
def sparity_dict(self):
return self._sparity_dict
def add(self, module, name=None):
if name is None:
name = str(len(self._modules))
if name in self._modules:
raise KeyError('name exists')
self.add_module(name, module)
def forward(self, input):
for k, module in self._modules.items():
if is_spconv_module(module): # use SpConvTensor as input
assert isinstance(input, SparseConvTensor)
self._sparity_dict[k] = input.sparity
input = module(input)
else:
if isinstance(input, SparseConvTensor):
if input.indices.shape[0] != 0:
input.features = module(input.features)
else:
input = module(input)
return input
def fused(self):
"""don't use this. no effect.
"""
from .conv import SparseConvolution
mods = [v for k, v in self._modules.items()]
fused_mods = []
idx = 0
while idx < len(mods):
if is_sparse_conv(mods[idx]):
if idx < len(mods) - 1 and isinstance(mods[idx + 1],
nn.BatchNorm1d):
new_module = SparseConvolution(
ndim=mods[idx].ndim,
in_channels=mods[idx].in_channels,
out_channels=mods[idx].out_channels,
kernel_size=mods[idx].kernel_size,
stride=mods[idx].stride,
padding=mods[idx].padding,
dilation=mods[idx].dilation,
groups=mods[idx].groups,
bias=True,
subm=mods[idx].subm,
output_padding=mods[idx].output_padding,
transposed=mods[idx].transposed,
inverse=mods[idx].inverse,
indice_key=mods[idx].indice_key,
fused_bn=True,
)
new_module.load_state_dict(mods[idx].state_dict(), False)
new_module.to(mods[idx].weight.device)
conv = new_module
bn = mods[idx + 1]
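                    # Fold the BatchNorm into the conv (standard BN folding):
                    #   w' = w * gamma / sqrt(running_var + eps)
                    #   b' = (b - running_mean) * gamma / sqrt(running_var + eps) + beta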
conv.bias.data.zero_()
                    conv.weight.data[:] = conv.weight.data * bn.weight.data / (
                        torch.sqrt(bn.running_var + bn.eps))
                    conv.bias.data[:] = (
                        conv.bias.data - bn.running_mean) * bn.weight.data / (
                            torch.sqrt(bn.running_var + bn.eps)) + bn.bias.data
fused_mods.append(conv)
idx += 2
else:
fused_mods.append(mods[idx])
idx += 1
else:
fused_mods.append(mods[idx])
idx += 1
return SparseSequential(*fused_mods)
class ToDense(SparseModule):
"""convert SparseConvTensor to NCHW dense tensor.
"""
def forward(self, x: SparseConvTensor):
return x.dense()
class RemoveGrid(SparseModule):
"""remove pre-allocated grid buffer.
"""
def forward(self, x: SparseConvTensor):
x.grid = None
return x
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from . import sparse_conv_ext
def get_conv_output_size(input_size, kernel_size, stride, padding, dilation):
ndim = len(input_size)
output_size = []
for i in range(ndim):
size = (input_size[i] + 2 * padding[i] - dilation[i] *
(kernel_size[i] - 1) - 1) // stride[i] + 1
if kernel_size[i] == -1:
output_size.append(1)
else:
output_size.append(size)
return output_size
def get_deconv_output_size(input_size, kernel_size, stride, padding, dilation,
output_padding):
ndim = len(input_size)
output_size = []
for i in range(ndim):
if kernel_size[i] == -1:
raise ValueError("deconv don't support kernel_size < 0")
size = (input_size[i] - 1) * stride[i] - 2 * padding[i] + kernel_size[
i] + output_padding[i]
output_size.append(size)
return output_size
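# Reference formulas implemented above (per dimension):
#   conv:   out = (in + 2 * pad - dilation * (k - 1) - 1) // stride + 1
#   deconv: out = (in - 1) * stride - 2 * pad + k + output_padding
# e.g. in = 41, k = 3, stride = 2, pad = 1, dilation = 1 gives out = 21.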
def get_indice_pairs(indices,
batch_size,
spatial_shape,
ksize=3,
stride=1,
padding=0,
dilation=1,
out_padding=0,
subm=False,
transpose=False,
grid=None):
ndim = indices.shape[1] - 1
if not isinstance(ksize, (list, tuple)):
ksize = [ksize] * ndim
if not isinstance(stride, (list, tuple)):
stride = [stride] * ndim
if not isinstance(padding, (list, tuple)):
padding = [padding] * ndim
if not isinstance(dilation, (list, tuple)):
dilation = [dilation] * ndim
if not isinstance(out_padding, (list, tuple)):
out_padding = [out_padding] * ndim
for d, s in zip(dilation, stride):
        assert any([s == 1, d == 1]), "stride and dilation cannot both be greater than 1"
if not subm:
if transpose:
out_shape = get_deconv_output_size(spatial_shape, ksize, stride,
padding, dilation, out_padding)
else:
out_shape = get_conv_output_size(spatial_shape, ksize, stride,
padding, dilation)
else:
out_shape = spatial_shape
if grid is None:
if ndim == 2:
get_indice_pairs_func = sparse_conv_ext.get_indice_pairs_2d
elif ndim == 3:
get_indice_pairs_func = sparse_conv_ext.get_indice_pairs_3d
elif ndim == 4:
get_indice_pairs_func = sparse_conv_ext.get_indice_pairs_4d
else:
raise NotImplementedError
return get_indice_pairs_func(indices, batch_size, out_shape,
spatial_shape, ksize, stride, padding,
dilation, out_padding, int(subm),
int(transpose))
else:
if ndim == 2:
get_indice_pairs_func = sparse_conv_ext.get_indice_pairs_grid_2d
elif ndim == 3:
get_indice_pairs_func = sparse_conv_ext.get_indice_pairs_grid_3d
else:
raise NotImplementedError
return get_indice_pairs_func(indices, grid, batch_size, out_shape,
spatial_shape, ksize, stride, padding,
dilation, out_padding, int(subm),
int(transpose))
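# Note: the sparse_conv_ext functions above return (outids, indice_pairs,
# indice_pair_num); see SparseMaxPool.forward below for how they are consumed.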
def indice_conv(features,
filters,
indice_pairs,
indice_pair_num,
num_activate_out,
inverse=False,
subm=False):
if filters.dtype == torch.float32:
return sparse_conv_ext.indice_conv_fp32(features, filters,
indice_pairs, indice_pair_num,
num_activate_out, int(inverse),
int(subm))
elif filters.dtype == torch.half:
return sparse_conv_ext.indice_conv_half(features, filters,
indice_pairs, indice_pair_num,
num_activate_out, int(inverse),
int(subm))
else:
raise NotImplementedError
def fused_indice_conv(features, filters, bias, indice_pairs, indice_pair_num,
num_activate_out, inverse, subm):
if features.dtype == torch.half:
func = sparse_conv_ext.fused_indice_conv_half
elif filters.dtype == torch.float32:
func = sparse_conv_ext.fused_indice_conv_fp32
else:
raise NotImplementedError
return func(features, filters, bias, indice_pairs, indice_pair_num,
num_activate_out, int(inverse), int(subm))
def indice_conv_backward(features,
filters,
out_bp,
indice_pairs,
indice_pair_num,
inverse=False,
subm=False):
if filters.dtype == torch.float32:
return sparse_conv_ext.indice_conv_backward_fp32(
features, filters, out_bp, indice_pairs, indice_pair_num,
int(inverse), int(subm))
elif filters.dtype == torch.half:
return sparse_conv_ext.indice_conv_backward_half(
features, filters, out_bp, indice_pairs, indice_pair_num,
int(inverse), int(subm))
else:
raise NotImplementedError
def indice_maxpool(features, indice_pairs, indice_pair_num, num_activate_out):
if features.dtype == torch.float32:
return sparse_conv_ext.indice_maxpool_fp32(features, indice_pairs,
indice_pair_num,
num_activate_out)
elif features.dtype == torch.half:
return sparse_conv_ext.indice_maxpool_half(features, indice_pairs,
indice_pair_num,
num_activate_out)
else:
raise NotImplementedError
def indice_maxpool_backward(features, out_features, out_bp, indice_pairs,
indice_pair_num):
if features.dtype == torch.float32:
return sparse_conv_ext.indice_maxpool_backward_fp32(
features, out_features, out_bp, indice_pairs, indice_pair_num)
elif features.dtype == torch.half:
return sparse_conv_ext.indice_maxpool_backward_half(
features, out_features, out_bp, indice_pairs, indice_pair_num)
else:
raise NotImplementedError
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import functional as Fsp
from . import ops
from .modules import SparseModule
from .structure import SparseConvTensor
class SparseMaxPool(SparseModule):
def __init__(self,
ndim,
kernel_size,
stride=1,
padding=0,
dilation=1,
subm=False):
super(SparseMaxPool, self).__init__()
if not isinstance(kernel_size, (list, tuple)):
kernel_size = [kernel_size] * ndim
if not isinstance(stride, (list, tuple)):
stride = [stride] * ndim
if not isinstance(padding, (list, tuple)):
padding = [padding] * ndim
if not isinstance(dilation, (list, tuple)):
dilation = [dilation] * ndim
self.ndim = ndim
self.kernel_size = kernel_size
self.stride = stride
self.padding = padding
self.subm = subm
self.dilation = dilation
def forward(self, input):
assert isinstance(input, SparseConvTensor)
features = input.features
device = features.device
indices = input.indices
spatial_shape = input.spatial_shape
batch_size = input.batch_size
if not self.subm:
out_spatial_shape = ops.get_conv_output_size(
spatial_shape, self.kernel_size, self.stride, self.padding,
self.dilation)
else:
out_spatial_shape = spatial_shape
outids, indice_pairs, indice_pairs_num = ops.get_indice_pairs(
indices, batch_size, spatial_shape, self.kernel_size, self.stride,
self.padding, self.dilation, 0, self.subm)
out_features = Fsp.indice_maxpool(features, indice_pairs.to(device),
indice_pairs_num.to(device),
outids.shape[0])
out_tensor = SparseConvTensor(out_features, outids, out_spatial_shape,
batch_size)
out_tensor.indice_dict = input.indice_dict
out_tensor.grid = input.grid
return out_tensor
class SparseMaxPool2d(SparseMaxPool):
def __init__(self, kernel_size, stride=1, padding=0, dilation=1):
super(SparseMaxPool2d, self).__init__(2, kernel_size, stride, padding,
dilation)
class SparseMaxPool3d(SparseMaxPool):
def __init__(self, kernel_size, stride=1, padding=0, dilation=1):
super(SparseMaxPool3d, self).__init__(3, kernel_size, stride, padding,
dilation)
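# Minimal usage sketch (assuming `x` is a valid SparseConvTensor):
#   pool = SparseMaxPool3d(kernel_size=2, stride=2)
#   y = pool(x)  # y.spatial_shape follows get_conv_output_size(...)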
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cuda_runtime_api.h>
#include <spconv/fused_spconv_ops.h>
#include <spconv/pool_ops.h>
#include <spconv/spconv_ops.h>
#include <torch/extension.h>
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("get_indice_pairs_2d", &spconv::getIndicePair<2>,
"get_indice_pairs_2d");
m.def("get_indice_pairs_3d", &spconv::getIndicePair<3>,
"get_indice_pairs_3d");
m.def("get_indice_pairs_4d", &spconv::getIndicePair<4>,
"get_indice_pairs_4d");
m.def("get_indice_pairs_grid_2d", &spconv::getIndicePairPreGrid<2>,
"get_indice_pairs_grid_2d");
m.def("get_indice_pairs_grid_3d", &spconv::getIndicePairPreGrid<3>,
"get_indice_pairs_grid_3d");
m.def("indice_conv_fp32", &spconv::indiceConv<float>, "indice_conv_fp32");
m.def("indice_conv_backward_fp32", &spconv::indiceConvBackward<float>,
"indice_conv_backward_fp32");
m.def("indice_conv_half", &spconv::indiceConv<at::Half>, "indice_conv_half");
m.def("indice_conv_backward_half", &spconv::indiceConvBackward<at::Half>,
"indice_conv_backward_half");
m.def("fused_indice_conv_fp32", &spconv::fusedIndiceConvBatchNorm<float>,
"fused_indice_conv_fp32");
m.def("fused_indice_conv_half", &spconv::fusedIndiceConvBatchNorm<at::Half>,
"fused_indice_conv_half");
m.def("indice_maxpool_fp32", &spconv::indiceMaxPool<float>,
"indice_maxpool_fp32");
m.def("indice_maxpool_backward_fp32", &spconv::indiceMaxPoolBackward<float>,
"indice_maxpool_backward_fp32");
m.def("indice_maxpool_half", &spconv::indiceMaxPool<at::Half>,
"indice_maxpool_half");
m.def("indice_maxpool_backward_half",
&spconv::indiceMaxPoolBackward<at::Half>,
"indice_maxpool_backward_half");
}
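// These bindings are built as the `sparse_conv_ext` extension module and are
// consumed on the Python side via `from . import sparse_conv_ext` (see ops.py).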
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <spconv/geometry.h>
#include <spconv/indice.h>
#include <spconv/spconv_ops.h>
#include <torch/script.h>
namespace spconv {
namespace functor {
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctor<tv::CPU, Index, IndexGrid, NDim> {
Index operator()(const tv::CPU& d, tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid) {
if (transpose)
return getIndicePairsDeConv<Index, IndexGrid, NDim>(
indicesIn, indicesOut,
gridsOut, indicePairs, indiceNum,
kernelSize.data(), stride.data(), padding.data(), dilation.data(),
outSpatialShape.data());
else
return getIndicePairsConv<Index, IndexGrid, NDim>(
indicesIn, indicesOut,
gridsOut, indicePairs, indiceNum,
kernelSize.data(), stride.data(), padding.data(), dilation.data(),
outSpatialShape.data());
}
};
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateSubMIndicePairFunctor<tv::CPU, Index, IndexGrid, NDim> {
Index operator()(const tv::CPU& d, tv::TensorView<const Index> indicesIn,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid) {
return getIndicePairsSubM<Index, IndexGrid, NDim>(
indicesIn,
gridsOut, indicePairs, indiceNum,
kernelSize.data(), stride.data(), padding.data(), dilation.data(), outSpatialShape.data());
}
};
} // namespace functor
#define DECLARE_CPU_SPECS_INDEX_NDIM(Index, NDIM) \
template struct functor::CreateConvIndicePairFunctor<tv::CPU, Index, int, NDIM>; \
template struct functor::CreateSubMIndicePairFunctor<tv::CPU, Index, int, \
NDIM>;
#define DECLARE_CPU_INDEX(Index) \
DECLARE_CPU_SPECS_INDEX_NDIM(Index, 1); \
DECLARE_CPU_SPECS_INDEX_NDIM(Index, 2); \
DECLARE_CPU_SPECS_INDEX_NDIM(Index, 3); \
DECLARE_CPU_SPECS_INDEX_NDIM(Index, 4);
DECLARE_CPU_INDEX(int);
DECLARE_CPU_INDEX(long);
#undef DECLARE_CPU_INDEX
#undef DECLARE_CPU_SPECS_INDEX_NDIM
} // namespace spconv
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <ATen/ATen.h>
#include <chrono>
#include <limits>
#include <spconv/mp_helper.h>
#include <spconv/indice.h>
#include <spconv/indice.cu.h>
#include <tensorview/helper_launch.h>
#include <tensorview/tensorview.h>
#include <type_traits>
#include <utility/timer.h>
namespace spconv {
namespace functor {
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctorP1<tv::GPU, Index, IndexGrid, NDim> {
Index operator()(const tv::GPU &d, tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
tv::TensorView<Index> indicePairUnique,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose) {
Index batchSize = gridsOut.dim(0);
auto numActIn = indicesIn.dim(0);
if (numActIn == 0)
return 0;
// auto timer = spconv::CudaContextTimer<>();
if (transpose)
prepareDeConvIndicePairsKernel<Index, IndexGrid, NDim, 4096>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, indicesOut, gridsOut, indicePairs,
indiceNum, indicePairUnique, kernelSize, stride,
padding, dilation, outSpatialShape);
else
prepareIndicePairsKernel<Index, IndexGrid, NDim, 4096>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, indicesOut, gridsOut, indicePairs,
indiceNum, indicePairUnique, kernelSize, stride,
padding, dilation, outSpatialShape);
TV_CHECK_CUDA_ERR();
// std::cout << "p1 gene time " << timer.report() / 1000.0 << std::endl;
return 1;
}
};
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctorP2<tv::GPU, Index, IndexGrid, NDim> {
Index operator()(const tv::GPU &d, tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
tv::TensorView<Index> indicePairUnique,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid) {
Index batchSize = gridsOut.dim(0);
auto kernelVolume = indicePairs.dim(0);
auto numActIn = indicesIn.dim(0);
if (numActIn == 0)
return 0;
Index numAct = indicePairUnique.dim(0) - 1;
assignGridAndIndiceOutKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesOut, gridsOut, numAct, indicePairs,
indicePairUnique, outSpatialShape, batchSize);
TV_CHECK_CUDA_ERR();
assignIndicePairsKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesOut, gridsOut, numActIn, indicePairs,
indicePairUnique, outSpatialShape);
TV_CHECK_CUDA_ERR();
if (resetGrid) {
resetGridKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicePairUnique.data(), gridsOut, numAct);
TV_CHECK_CUDA_ERR();
}
return numAct;
}
};
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateSubMIndicePairFunctor<tv::GPU, Index, IndexGrid, NDim> {
Index operator()(const tv::GPU &d, tv::TensorView<const Index> indicesIn,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid) {
auto numActIn = indicesIn.dim(0);
if (numActIn == 0)
return 0;
// auto timer = spconv::CudaContextTimer<>();
prepareSubMGridKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, gridsOut, outSpatialShape);
TV_CHECK_CUDA_ERR();
getSubMIndicePairsKernel<Index, IndexGrid, NDim, 4096>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, gridsOut, indicePairs, indiceNum,
kernelSize, stride, padding, dilation, outSpatialShape);
TV_CHECK_CUDA_ERR();
// std::cout << "subm gene time " << timer.report() / 1000.0 << std::endl;
if (resetGrid) {
resetGridSubMKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn.data(), gridsOut, outSpatialShape, numActIn);
TV_CHECK_CUDA_ERR();
}
return numActIn;
}
};
} // namespace functor
#define DECLARE_GPU_SPECS_INDEX_NDIM(Index, NDIM) \
template struct functor::CreateConvIndicePairFunctor<tv::GPU, Index, int, \
NDIM>; \
template struct functor::CreateConvIndicePairFunctorP1<tv::GPU, Index, int, \
NDIM>; \
template struct functor::CreateConvIndicePairFunctorP2<tv::GPU, Index, int, \
NDIM>; \
template struct functor::CreateSubMIndicePairFunctor<tv::GPU, Index, int, \
NDIM>;
#define DECLARE_GPU_INDEX(Index) \
DECLARE_GPU_SPECS_INDEX_NDIM(Index, 1); \
DECLARE_GPU_SPECS_INDEX_NDIM(Index, 2); \
DECLARE_GPU_SPECS_INDEX_NDIM(Index, 3); \
DECLARE_GPU_SPECS_INDEX_NDIM(Index, 4);
DECLARE_GPU_INDEX(int);
#undef DECLARE_GPU_INDEX
#undef DECLARE_GPU_SPECS_INDEX_NDIM
} // namespace spconv
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <spconv/maxpool.h>
#include <torch/script.h>
namespace spconv {
namespace functor {
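// CPU reference implementations: the forward functor takes an element-wise max
// from each gathered input row into its paired output row; the backward functor
// routes dout back to the input positions whose values equal the pooled output.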
template <typename T, typename Index>
struct SparseMaxPoolForwardFunctor<tv::CPU, T, Index> {
void operator()(const tv::CPU &d, tv::TensorView<T> outFeatures,
tv::TensorView<const T> inFeatures,
tv::TensorView<const Index> indices, int size) {
int stride = outFeatures.dim(1);
auto outFeaturesData = outFeatures.data();
auto inFeaturesData = inFeatures.data();
auto indicesIn = indices.subview(0).data();
auto indicesOut = indices.subview(1).data();
Index idxi, idxo;
for (int row = 0; row < size; row++) {
idxi = indicesIn[row] * stride;
idxo = indicesOut[row] * stride;
for (int plane = 0; plane < stride; ++plane)
if (outFeaturesData[idxo + plane] < inFeaturesData[idxi + plane])
outFeaturesData[idxo + plane] = inFeaturesData[idxi + plane];
}
}
};
template <typename T, typename Index>
struct SparseMaxPoolBackwardFunctor<tv::CPU, T, Index> {
void operator()(const tv::CPU &d, tv::TensorView<const T> outFeatures,
tv::TensorView<const T> inFeatures,
tv::TensorView<const T> dout, tv::TensorView<T> din,
tv::TensorView<const Index> indices, int size) {
int stride = outFeatures.dim(1);
auto outFeaturesData = outFeatures.data();
auto inFeaturesData = inFeatures.data();
auto doutData = dout.data();
auto dinData = din.data();
auto indicesIn = indices.subview(0).data();
auto indicesOut = indices.subview(1).data();
Index idxi, idxo;
for (int row = 0; row < size; row++) {
idxi = indicesIn[row] * stride;
idxo = indicesOut[row] * stride;
for (int plane = 0; plane < stride; ++plane)
if (outFeaturesData[idxo + plane] == inFeaturesData[idxi + plane])
dinData[idxi + plane] += doutData[idxo + plane];
}
}
};
} // namespace functor
#define DECLARE_CPU_SPECS_T_INDEX(T, Index) \
template struct functor::SparseMaxPoolForwardFunctor<tv::CPU, T, Index>; \
template struct functor::SparseMaxPoolBackwardFunctor<tv::CPU, T, Index>;
#define DECLARE_CPU_SPECS(T) \
DECLARE_CPU_SPECS_T_INDEX(T, int); \
DECLARE_CPU_SPECS_T_INDEX(T, long);
DECLARE_CPU_SPECS(float);
DECLARE_CPU_SPECS(double);
DECLARE_CPU_SPECS(at::Half);
#undef DECLARE_CPU_SPECS
#undef DECLARE_CPU_SPECS_T_INDEX
} // namespace spconv
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <ATen/ATen.h>
#include <chrono>
#include <limits>
#include <spconv/maxpool.h>
#include <spconv/mp_helper.h>
#include <tensorview/helper_kernel.cu.h>
#include <tensorview/helper_launch.h>
#include <tensorview/tensorview.h>
#include <type_traits>
namespace spconv {
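// Tuning parameters used by the kernels below (interpretation from their use):
// NumTLP is the tile width along the feature/plane dimension handled per block
// (note the `blockIdx.y * NumTLP` offsets), and NumILP is the number of rows each
// thread processes to expose instruction-level parallelism (see the GTC-2010
// slides referenced in the kernels).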
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolFwdBlockKernel(T *outFeatures, const T *inFeatures,
const Index *indicesIn,
const Index *indicesOut, int numHot,
int numPlanes) {
T in, out;
int ILPStrideY[NumILP];
Index idxo, idxi;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
outFeatures += blockIdx.y * NumTLP;
inFeatures += blockIdx.y * NumTLP;
for (int ix = blockIdx.x * blockDim.x; ix < numHot;
ix += blockDim.x * gridDim.x) {
{
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
in = inFeatures[idxi];
out = outFeatures[idxo];
if (in > out) {
outFeatures[idxo] = in;
}
}
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void
maxPoolFwdGenericBlockKernel(T *outFeatures, const T *inFeatures,
const Index *indicesIn, const Index *indicesOut,
int numHot, int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
int ILPStrideX[NumILP];
Index RI[NumILP];
Index RO[NumILP];
T in, out;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) {
RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
}
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
in = inFeatures[RI[ilp] + iy];
out = outFeatures[RO[ilp] + iy];
if (in > out) {
outFeatures[RO[ilp] + iy] = in;
}
}
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP, typename VecType>
__global__ void maxPoolFwdVecBlockKernel(T *outFeatures, const T *inFeatures,
const Index *indicesIn,
const Index *indicesOut, int numHot,
int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
int ILPStrideY[NumILP];
constexpr int vecloadFactor = sizeof(VecType) / sizeof(T);
T bufi[vecloadFactor];
T bufo[vecloadFactor];
Index idxi, idxo;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
outFeatures += blockIdx.y * NumTLP;
inFeatures += blockIdx.y * NumTLP;
for (int ix = blockIdx.x * blockDim.x * vecloadFactor; ix < numHot;
ix += blockDim.x * gridDim.x * vecloadFactor) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
reinterpret_cast<VecType *>(bufo)[0] =
reinterpret_cast<VecType *>(outFeatures)[idxo];
reinterpret_cast<VecType *>(bufi)[0] =
reinterpret_cast<const VecType *>(inFeatures)[idxi];
#pragma unroll
for (int i = 0; i < vecloadFactor; i++) {
if (bufi[i] > bufo[i]) {
bufo[i] = bufi[i];
}
}
reinterpret_cast<VecType *>(outFeatures)[idxo] =
reinterpret_cast<VecType *>(bufo)[0];
}
}
}
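// Tail kernel for the forward pass: same structure as the generic block
// kernel, but with explicit (ix + ILPStrideX[ilp] < numHot) checks so it can
// process the size - numHotBlock indices left over after tiling.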
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolFwdGenericKernel(T *outFeatures, const T *inFeatures,
const Index *indicesIn,
const Index *indicesOut, int numHot,
int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
int ILPStrideX[NumILP];
Index RI[NumILP];
Index RO[NumILP];
T in, out;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) {
if (ix + ILPStrideX[ilp] < numHot) {
RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
}
}
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
if (ix + ILPStrideX[ilp] < numHot) {
in = inFeatures[RI[ilp] + iy];
out = outFeatures[RO[ilp] + iy];
if (in > out) {
outFeatures[RO[ilp] + iy] = in;
}
}
}
}
}
}
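// Backward kernels implement the usual max-pool gradient rule: for each
// (input, output) index pair, dout is accumulated into din only where the
// stored input feature equals the pooled output feature, i.e. where that
// input produced the maximum in the forward pass.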
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void
maxPoolBwdBlockKernel(const T *outFeatures, const T *inFeatures, const T *dout,
T *din, const Index *indicesIn, const Index *indicesOut,
int numHot, int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
T in, out;
Index idxo, idxi;
int ILPStrideY[NumILP];
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
outFeatures += blockIdx.y * NumTLP;
inFeatures += blockIdx.y * NumTLP;
dout += blockIdx.y * NumTLP;
din += blockIdx.y * NumTLP;
for (int ix = blockIdx.x * blockDim.x; ix < numHot;
ix += blockDim.x * gridDim.x) {
{
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
in = inFeatures[idxi];
out = outFeatures[idxo];
if (in == out) {
din[idxi] += dout[idxo];
}
}
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolBwdGenericBlockKernel(const T *outFeatures,
const T *inFeatures, const T *dout,
T *din, const Index *indicesIn,
const Index *indicesOut,
int numHot, int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
int ILPStrideX[NumILP];
Index RI[NumILP];
Index RO[NumILP];
T in, out;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) {
RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
}
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
in = inFeatures[RI[ilp] + iy];
out = outFeatures[RO[ilp] + iy];
if (in == out) {
din[RI[ilp] + iy] += dout[RO[ilp] + iy];
}
}
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP, typename VecType>
__global__ void
maxPoolBwdVecBlockKernel(const T *outFeatures, const T *inFeatures,
const T *dout, T *din, const Index *indicesIn,
const Index *indicesOut, int numHot, int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
int ILPStrideY[NumILP];
constexpr int vecloadFactor = sizeof(VecType) / sizeof(T);
T bufi[vecloadFactor];
T bufo[vecloadFactor];
T bufdi[vecloadFactor];
T bufdo[vecloadFactor];
Index idxi, idxo;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
  outFeatures += blockIdx.y * NumTLP;
  inFeatures += blockIdx.y * NumTLP;
  dout += blockIdx.y * NumTLP;
  din += blockIdx.y * NumTLP;
for (int ix = blockIdx.x * blockDim.x * vecloadFactor; ix < numHot;
ix += blockDim.x * gridDim.x * vecloadFactor) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
reinterpret_cast<VecType *>(bufo)[0] =
reinterpret_cast<const VecType *>(outFeatures)[idxo];
reinterpret_cast<VecType *>(bufi)[0] =
reinterpret_cast<const VecType *>(inFeatures)[idxi];
reinterpret_cast<VecType *>(bufdo)[0] =
reinterpret_cast<const VecType *>(dout)[idxo];
      reinterpret_cast<VecType *>(bufdi)[0] =
          reinterpret_cast<VecType *>(din)[idxi];
#pragma unroll
for (int i = 0; i < vecloadFactor; i++) {
if (bufi[i] == bufo[i]) {
bufdi[i] += bufdo[i];
}
}
      reinterpret_cast<VecType *>(din)[idxi] =
          reinterpret_cast<VecType *>(bufdi)[0];
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void
maxPoolBwdGenericKernel(const T *outFeatures, const T *inFeatures,
const T *dout, T *din, const Index *indicesIn,
const Index *indicesOut, int numHot, int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
int ILPStrideX[NumILP];
Index RI[NumILP];
Index RO[NumILP];
T in, out;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) {
if (ix + ILPStrideX[ilp] < numHot) {
RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
}
}
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
if (ix + ILPStrideX[ilp] < numHot) {
in = inFeatures[RI[ilp] + iy];
out = outFeatures[RO[ilp] + iy];
if (in == out) {
din[RI[ilp] + iy] += dout[RO[ilp] + iy];
}
}
}
}
}
}
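// Functor dispatch: mp_for_each walks the compile-time tile sizes {64, 32,
// 16} and picks the first NumTLP that divides numPlanes, launching the
// vectorized block kernel for the tiled part of the indices (numHotBlock)
// and the bounds-checked generic kernel for the remainder. If no tile size
// divides numPlanes, a fully generic path with NumTLP = 64 is used.
// indices.subview(0) and indices.subview(1) hold the input and output index
// columns passed to the kernels as indicesIn and indicesOut.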
namespace functor {
template <typename T, typename Index>
struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> {
using vecload_type_t =
std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
using kernel_block_t = mp_list_c<int, 64, 32, 16>;
void operator()(const tv::GPU &d, tv::TensorView<T> outFeatures,
tv::TensorView<const T> inFeatures,
tv::TensorView<const Index> indices, int size) {
if (size <= 0)
return;
int numPlanes = inFeatures.dim(1);
bool notFound = true;
constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
mp_for_each<kernel_block_t>([=, &outFeatures, &inFeatures, &indices,
&notFound](auto NumTLP) {
constexpr int NumILP = NumTLP / 4;
int numHotBlock = (size / NumTLP) * NumTLP;
if (notFound) {
if (numPlanes % NumTLP == 0) {
if (numHotBlock >= NumTLP) {
maxPoolFwdVecBlockKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
<<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
d.getStream()>>>(outFeatures.data(), inFeatures.data(),
indices.subview(0).data(),
indices.subview(1).data(), numHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
if (size > numHotBlock) {
maxPoolFwdGenericKernel<T, Index, int(NumTLP), NumILP>
<<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
0, d.getStream()>>>(outFeatures.data(), inFeatures.data(),
indices.subview(0).data() + numHotBlock,
indices.subview(1).data() + numHotBlock,
size - numHotBlock, numPlanes);
TV_CHECK_CUDA_ERR();
}
notFound = false;
}
}
});
if (notFound) {
constexpr int NumTLP = 64;
constexpr int NumILP = NumTLP / 4;
int numHotBlock = (size / NumTLP) * NumTLP;
if (numHotBlock >= NumTLP) {
maxPoolFwdGenericBlockKernel<T, Index, NumTLP, NumILP>
<<<dim3(size / NumTLP, tv::launch::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
outFeatures.data(), inFeatures.data(),
indices.subview(0).data(), indices.subview(1).data(),
numHotBlock, numPlanes);
TV_CHECK_CUDA_ERR();
}
if (size > numHotBlock) {
maxPoolFwdGenericKernel<T, Index, NumTLP, NumILP>
<<<dim3(1, tv::launch::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
outFeatures.data(), inFeatures.data(),
indices.subview(0).data() + numHotBlock,
indices.subview(1).data() + numHotBlock, size - numHotBlock,
numPlanes);
TV_CHECK_CUDA_ERR();
}
}
}
};
template <typename T, typename Index>
struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {
using vecload_type_t =
std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
using kernel_block_t = mp_list_c<int, 64, 32, 16>;
void operator()(const tv::GPU &d, tv::TensorView<const T> outFeatures,
tv::TensorView<const T> inFeatures,
tv::TensorView<const T> dout, tv::TensorView<T> din,
tv::TensorView<const Index> indices, int size) {
if (size <= 0)
return;
int numPlanes = inFeatures.dim(1);
bool notFound = true;
constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
mp_for_each<kernel_block_t>([=, &outFeatures, &inFeatures, &dout, &din,
&indices, &notFound](auto NumTLP) {
constexpr int NumILP = NumTLP / 4;
int numHotBlock = (size / NumTLP) * NumTLP;
if (notFound) {
if (numPlanes % NumTLP == 0) {
if (numHotBlock >= NumTLP) {
maxPoolBwdVecBlockKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
<<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
d.getStream()>>>(outFeatures.data(), inFeatures.data(),
dout.data(), din.data(),
indices.subview(0).data(),
indices.subview(1).data(), numHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
if (size > numHotBlock) {
maxPoolBwdGenericKernel<T, Index, int(NumTLP), NumILP>
<<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
0, d.getStream()>>>(outFeatures.data(), inFeatures.data(),
dout.data(), din.data(),
indices.subview(0).data() + numHotBlock,
indices.subview(1).data() + numHotBlock,
size - numHotBlock, numPlanes);
TV_CHECK_CUDA_ERR();
}
notFound = false;
}
}
});
if (notFound) {
constexpr int NumTLP = 64;
constexpr int NumILP = NumTLP / 4;
int numHotBlock = (size / NumTLP) * NumTLP;
if (numHotBlock >= NumTLP) {
maxPoolBwdGenericBlockKernel<T, Index, NumTLP, NumILP>
<<<dim3(size / NumTLP, tv::launch::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
outFeatures.data(), inFeatures.data(), dout.data(), din.data(),
indices.subview(0).data(), indices.subview(1).data(),
numHotBlock, numPlanes);
TV_CHECK_CUDA_ERR();
}
if (size > numHotBlock) {
maxPoolBwdGenericKernel<T, Index, NumTLP, NumILP>
<<<dim3(1, tv::launch::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
outFeatures.data(), inFeatures.data(), dout.data(), din.data(),
indices.subview(0).data() + numHotBlock,
indices.subview(1).data() + numHotBlock, size - numHotBlock,
numPlanes);
TV_CHECK_CUDA_ERR();
}
}
}
};
} // namespace functor
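// Explicit instantiations of the GPU max-pool functors for float, double and
// at::Half features with int indices.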
#define DECLARE_GPU_SPECS_T_INDEX(T, Index) \
template struct functor::SparseMaxPoolForwardFunctor<tv::GPU, T, Index>; \
template struct functor::SparseMaxPoolBackwardFunctor<tv::GPU, T, Index>;
#define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPECS_T_INDEX(T, int);
DECLARE_GPU_SPECS(float);
DECLARE_GPU_SPECS(double);
DECLARE_GPU_SPECS(at::Half);
#undef DECLARE_GPU_SPECS
#undef DECLARE_GPU_SPECS_T_INDEX
} // namespace spconv
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cstring>
#include <spconv/reordering.h>
#include <torch/script.h>
namespace spconv {
namespace functor {
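// CPU gather: copies the rows of `features` selected by `indices` into the
// contiguous `buffer`, one memcpy of numPlanes elements per active index.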
template <typename T, typename Index>
struct SparseGatherFunctor<tv::CPU, T, Index> {
void operator()(const tv::CPU& d, tv::TensorView<T> buffer, tv::TensorView<const T> features,
tv::TensorView<const Index> indices, int size) {
int numPlanes = features.dim(1);
for (int i = 0; i < size; ++i) {
std::memcpy(buffer.data() + i * numPlanes,
features.data() + indices[i] * numPlanes,
sizeof(T) * numPlanes);
}
}
};
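// CPU scatter-add: the accumulation counterpart of the gather above. Row i
// of `buffer` is added element-wise into the row of `outFeatures` addressed
// by indices[i]. The `stable` flag is part of the interface but is not used
// by this CPU implementation.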
template <typename T, typename Index>
struct SparseScatterAddFunctor<tv::CPU, T, Index> {
void operator()(const tv::CPU& d, tv::TensorView<T> outFeatures,
tv::TensorView<const T> buffer, tv::TensorView<const Index> indices,
int size, bool stable) {
    int numPlanes = outFeatures.dim(1);
    for (int i = 0; i < size; ++i) {
      const T* buf = buffer.data() + i * numPlanes;
      T* out = outFeatures.data() + indices[i] * numPlanes;
      for (int j = 0; j < numPlanes; ++j) {
        out[j] += buf[j];
      }
    }
}
};
} // namespace functor
#define DECLARE_CPU_SPECS_T_INDEX(T, Index) \
template struct functor::SparseGatherFunctor<tv::CPU, T, Index>; \
template struct functor::SparseScatterAddFunctor<tv::CPU, T, Index>;
#define DECLARE_CPU_SPECS(T) \
DECLARE_CPU_SPECS_T_INDEX(T, int); \
DECLARE_CPU_SPECS_T_INDEX(T, long);
DECLARE_CPU_SPECS(float);
DECLARE_CPU_SPECS(double);
DECLARE_CPU_SPECS(at::Half);
#undef DECLARE_CPU_SPECS
#undef DECLARE_CPU_SPECS_T_INDEX
} // namespace spconv
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <ATen/ATen.h>
#include <chrono>
#include <limits>
#include <spconv/mp_helper.h>
#include <spconv/reordering.h>
#include <spconv/reordering.cu.h>
#include <tensorview/helper_kernel.cu.h>
#include <tensorview/helper_launch.h>
#include <tensorview/tensorview.h>
#include <type_traits>
#include <utility/timer.h>
namespace spconv {
namespace functor {
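// GPU gather: copies indexed rows of `features` into `buffer`. Dispatch
// mirrors the pooling functors: mp_for_each walks the tile sizes {64, 32,
// 16}, uses the first NumTLP that divides numPlanes, and launches a
// vectorized block kernel for the tiled portion plus a second kernel for the
// remainder; if no tile size fits, a generic kernel handles the full range.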
template <typename T, typename Index>
struct SparseGatherFunctor<tv::GPU, T, Index> {
using vecload_type_t =
std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
using kernel_block_t = mp_list_c<int, 64, 32, 16>;
void operator()(const tv::GPU &d, tv::TensorView<T> buffer,
tv::TensorView<const T> features,
tv::TensorView<const Index> indices, int size) {
if (size <= 0)
return;
int numPlanes = features.dim(1);
bool notFound = true;
constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
mp_for_each<kernel_block_t>([=, &buffer, &features, &indices,
&notFound](auto NumTLP) {
constexpr int NumILP = NumTLP / 4;
// constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
int nHotBlock = (size / NumTLP) * NumTLP;
if (notFound) {
if (numPlanes % NumTLP == 0) {
if (nHotBlock >= NumTLP) {
gatherVecBlockKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
<<<dim3(numPlanes / NumTLP, size / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
d.getStream()>>>(buffer.data(), features.data(), indices.data(),
nHotBlock, numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
if (size - nHotBlock > 0) {
gatherVecKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
<<<dim3(1, numPlanes / NumTLP),
dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
d.getStream()>>>(buffer.data() + nHotBlock * numPlanes,
features.data(), indices.data() + nHotBlock,
size - nHotBlock, numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
notFound = false;
}
}
});
if (notFound) {
constexpr int NumTLP = 64;
constexpr int NumILP = NumTLP / 4;
gatherGenericKernel<T, Index, NumTLP, NumILP>
<<<dim3(tv::launch::DivUp(size, NumTLP),
tv::launch::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
buffer.data(), features.data(), indices.data(), size, numPlanes);
TV_CHECK_CUDA_ERR();
}
}
};
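// GPU scatter-add: row i of `buffer` is added into the row of `outFeatures`
// given by indices[i]. vecload_type_t is chosen so that the at::Half
// specialization loads int2 (four halfs) per access instead of int4, which
// is what the "important for half" note below refers to.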
template <typename T, typename Index>
struct SparseScatterAddFunctor<tv::GPU, T, Index> {
using vecload_type_t =
std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
using kernel_block_t = mp_list_c<int, 64, 32, 16>;
void operator()(const tv::GPU &d, tv::TensorView<T> outFeatures,
tv::TensorView<const T> buffer,
tv::TensorView<const Index> indices, int size, bool stable) {
if (size <= 0)
return;
int numPlanes = outFeatures.dim(1);
bool notFound = true;
constexpr int vecloadFactor =
sizeof(vecload_type_t) / sizeof(T); // important for half.
mp_for_each<kernel_block_t>([=, &d, &outFeatures, &buffer, &indices,
&notFound](auto NumTLP) {
// constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
constexpr int NumILP = NumTLP / 4;
int nHotBlock = (size / NumTLP) * NumTLP;
if (notFound) {
if (numPlanes % NumTLP == 0) {
if (nHotBlock >= NumTLP) {
scatterAddVecBlockKernel<T, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(numPlanes / NumTLP, size / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
d.getStream()>>>(outFeatures.data(), buffer.data(),
indices.data(), nHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
if (size - nHotBlock > 0) {
scatterAddGenericKernel<T, Index, int(NumTLP), NumILP>
<<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
0, d.getStream()>>>(
outFeatures.data(), buffer.data() + nHotBlock * numPlanes,
indices.data() + nHotBlock, size - nHotBlock, numPlanes);
TV_CHECK_CUDA_ERR();
}
notFound = false;
}
}
});
if (notFound) {
constexpr int NumTLP = 64;
constexpr int NumILP = NumTLP / 4;
scatterAddGenericKernel<T, Index, NumTLP, NumILP>
<<<dim3(tv::launch::DivUp(size, NumTLP),
tv::launch::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
outFeatures.data(), buffer.data(), indices.data(), size,
numPlanes);
TV_CHECK_CUDA_ERR();
}
}
};
} // namespace functor
#define DECLARE_GPU_SPECS_T_INDEX(T, Index) \
template struct functor::SparseGatherFunctor<tv::GPU, T, Index>; \
template struct functor::SparseScatterAddFunctor<tv::GPU, T, Index>;
#define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPECS_T_INDEX(T, int);
DECLARE_GPU_SPECS(float);
DECLARE_GPU_SPECS(double);
DECLARE_GPU_SPECS(at::Half);
#undef DECLARE_GPU_SPECS
#undef DECLARE_GPU_SPECS_T_INDEX
} // namespace spconv