Unverified commit c1de4c9b authored by Wenhao Wu, committed by GitHub

[Feature] Add spconv ops from mmdet3d (#1581)



* add ops (spconv) of mmdet3d

* fix typo

* refactor code

* resolve comments in #1452

* fix compile error

* fix bugs

* fix bug

* transform from 'types.h' to 'extension.h'

* fix bug

* transform from 'types.h' to 'extension.h' in parrots

* add extension.h in pybind.cpp

* add unittest

* Recover code

* (1) Remove prettyprint.h
(2) Switch `T` to `scalar_t`
(3) Remove useless lines
(4) Refine example in docstring of sparse_modules.py

* (1) rename from `cu.h` to `cuh`
(2) remove useless files
(3) move cpu files to `pytorch/cpu`

* reorganize files

* Add docstring for sparse_functional.py

* use dispatcher

* remove template

* use dispatch in cuda ops

* resolve Segmentation fault

* remove useless files

* fix lint

* fix lint

* fix lint

* fix unittest in test_build_layers.py

* add tensorview into include_dirs when compiling

* recover all deleted files

* fix lint and comments

* recover setup.py

* replace tv::GPU as tv::TorchGPU & support device guard

* fix lint
Co-authored-by: hdc <hudingchang.vendor@sensetime.com>
Co-authored-by: grimoire <yaoqian@sensetime.com>
parent 33c83b5a
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <torch/script.h>
#include <utils/spconv/spconv/maxpool.h>
#include "pytorch_cpp_helper.hpp"
namespace functor {
template <typename scalar_t, typename Index>
struct SparseMaxPoolForwardFunctor<tv::CPU, scalar_t, Index> {
void operator()(const tv::CPU &d, tv::TensorView<scalar_t> outFeatures,
tv::TensorView<const scalar_t> inFeatures,
tv::TensorView<const Index> indices, int size) {
int stride = outFeatures.dim(1);
auto outFeaturesData = outFeatures.data();
auto inFeaturesData = inFeatures.data();
auto indicesIn = indices.subview(0).data();
auto indicesOut = indices.subview(1).data();
Index idxi, idxo;
for (int row = 0; row < size; row++) {
idxi = indicesIn[row] * stride;
idxo = indicesOut[row] * stride;
for (int plane = 0; plane < stride; ++plane)
if (outFeaturesData[idxo + plane] < inFeaturesData[idxi + plane])
outFeaturesData[idxo + plane] = inFeaturesData[idxi + plane];
}
}
};
template <typename scalar_t, typename Index>
struct SparseMaxPoolBackwardFunctor<tv::CPU, scalar_t, Index> {
void operator()(const tv::CPU &d, tv::TensorView<const scalar_t> outFeatures,
tv::TensorView<const scalar_t> inFeatures,
tv::TensorView<const scalar_t> fout,
tv::TensorView<scalar_t> fin,
tv::TensorView<const Index> indices, int size) {
int stride = outFeatures.dim(1);
auto outFeaturesData = outFeatures.data();
auto inFeaturesData = inFeatures.data();
auto foutData = fout.data();
auto finData = fin.data();
auto indicesIn = indices.subview(0).data();
auto indicesOut = indices.subview(1).data();
Index idxi, idxo;
for (int row = 0; row < size; row++) {
idxi = indicesIn[row] * stride;
idxo = indicesOut[row] * stride;
for (int plane = 0; plane < stride; ++plane)
if (outFeaturesData[idxo + plane] == inFeaturesData[idxi + plane])
finData[idxi + plane] += foutData[idxo + plane];
}
}
};
} // namespace functor
#define DECLARE_CPU_SPECS_T_INDEX(T, Index) \
template struct functor::SparseMaxPoolForwardFunctor<tv::CPU, T, Index>; \
template struct functor::SparseMaxPoolBackwardFunctor<tv::CPU, T, Index>;
#define DECLARE_CPU_SPECS(T) \
DECLARE_CPU_SPECS_T_INDEX(T, int); \
DECLARE_CPU_SPECS_T_INDEX(T, long);
DECLARE_CPU_SPECS(float);
DECLARE_CPU_SPECS(double);
DECLARE_CPU_SPECS(at::Half);
#undef DECLARE_CPU_SPECS
#undef DECLARE_CPU_SPECS_T_INDEX
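// A minimal standalone sketch of what the CPU functors above compute, using
// plain pointers instead of tv::TensorView (names here are illustrative):
// the forward pass takes a running element-wise max over gathered index
// pairs, and the backward pass routes the gradient to whichever input element
// produced the maximum (the out == in comparison).
template <typename T>
void sparse_maxpool_fwd_ref(T *out, const T *in, const int *idxIn,
                            const int *idxOut, int size, int numPlanes) {
  for (int row = 0; row < size; ++row)
    for (int c = 0; c < numPlanes; ++c) {
      T &o = out[idxOut[row] * numPlanes + c];
      const T i = in[idxIn[row] * numPlanes + c];
      if (o < i) o = i;
    }
}
template <typename T>
void sparse_maxpool_bwd_ref(T *gradIn, const T *gradOut, const T *out,
                            const T *in, const int *idxIn, const int *idxOut,
                            int size, int numPlanes) {
  for (int row = 0; row < size; ++row)
    for (int c = 0; c < numPlanes; ++c)
      if (out[idxOut[row] * numPlanes + c] == in[idxIn[row] * numPlanes + c])
        gradIn[idxIn[row] * numPlanes + c] +=
            gradOut[idxOut[row] * numPlanes + c];
}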
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <torch/script.h>
#include <utils/spconv/spconv/reordering.h>
#include "pytorch_cpp_helper.hpp"
namespace functor {
template <typename scalar_t, typename Index>
struct SparseGatherFunctor<tv::CPU, scalar_t, Index> {
void operator()(const tv::CPU& d, tv::TensorView<scalar_t> buffer,
tv::TensorView<const scalar_t> features,
tv::TensorView<const Index> indices, int size) {
int numPlanes = features.dim(1);
for (int i = 0; i < size; ++i) {
std::memcpy(buffer.data() + i * numPlanes,
features.data() + indices[i] * numPlanes,
sizeof(scalar_t) * numPlanes);
}
}
};
template <typename scalar_t, typename Index>
struct SparseScatterAddFunctor<tv::CPU, scalar_t, Index> {
void operator()(const tv::CPU& d, tv::TensorView<scalar_t> outFeatures,
tv::TensorView<const scalar_t> buffer,
tv::TensorView<const Index> indices, int size, bool stable) {
int numPlanes = outFeatures.dim(1);
const scalar_t* buf = buffer.data();
scalar_t* out = outFeatures.data();
for (int i = 0; i < size; ++i) {
buf = buffer.data() + i * numPlanes;
out = outFeatures.data() + indices[i] * numPlanes;
for (int j = 0; j < numPlanes; ++j) {
out[j] += buf[j];
}
}
}
};
} // namespace functor
#define DECLARE_CPU_SPECS_T_INDEX(scalar_t, Index) \
template struct functor::SparseGatherFunctor<tv::CPU, scalar_t, Index>; \
template struct functor::SparseScatterAddFunctor<tv::CPU, scalar_t, Index>;
#define DECLARE_CPU_SPECS(scalar_t) \
DECLARE_CPU_SPECS_T_INDEX(scalar_t, int); \
DECLARE_CPU_SPECS_T_INDEX(scalar_t, long);
DECLARE_CPU_SPECS(float);
DECLARE_CPU_SPECS(double);
DECLARE_CPU_SPECS(at::Half);
#undef DECLARE_CPU_SPECS
#undef DECLARE_CPU_SPECS_T_INDEX
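// A standalone sketch of the two reordering primitives above (illustrative
// names, plain pointers instead of tv::TensorView): gather copies selected
// rows of features into a dense buffer so a GEMM can run on contiguous
// memory; scatter-add accumulates the GEMM result back into the output rows.
#include <cstring>
template <typename T>
void gather_ref(T *buffer, const T *features, const int *indices, int size,
                int numPlanes) {
  for (int i = 0; i < size; ++i)
    std::memcpy(buffer + i * numPlanes, features + indices[i] * numPlanes,
                sizeof(T) * numPlanes);
}
template <typename T>
void scatter_add_ref(T *out, const T *buffer, const int *indices, int size,
                     int numPlanes) {
  for (int i = 0; i < size; ++i)
    for (int j = 0; j < numPlanes; ++j)
      out[indices[i] * numPlanes + j] += buffer[i * numPlanes + j];
}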
...
@@ -1501,6 +1501,117 @@ void points_in_polygons_forward_impl(const Tensor points, const Tensor polygons,
REGISTER_DEVICE_IMPL(points_in_polygons_forward_impl, CUDA,
                     points_in_polygons_forward_cuda);
torch::Tensor IndiceMaxpoolForwardCUDAKernelLauncher(torch::Tensor features,
torch::Tensor indicePairs,
torch::Tensor indiceNum,
int64_t numAct);
torch::Tensor indice_maxpool_forward_cuda(torch::Tensor features,
torch::Tensor indicePairs,
torch::Tensor indiceNum,
int64_t numAct) {
return IndiceMaxpoolForwardCUDAKernelLauncher(features, indicePairs,
indiceNum, numAct);
};
torch::Tensor indice_maxpool_forward_impl(torch::Tensor features,
torch::Tensor indicePairs,
torch::Tensor indiceNum,
int64_t numAct);
REGISTER_DEVICE_IMPL(indice_maxpool_forward_impl, CUDA,
indice_maxpool_forward_cuda);
torch::Tensor IndiceMaxpoolBackwardCUDAKernelLauncher(torch::Tensor features,
torch::Tensor outFeatures,
torch::Tensor outGrad,
torch::Tensor indicePairs,
torch::Tensor indiceNum);
torch::Tensor indice_maxpool_backward_cuda(torch::Tensor features,
torch::Tensor outFeatures,
torch::Tensor outGrad,
torch::Tensor indicePairs,
torch::Tensor indiceNum) {
return IndiceMaxpoolBackwardCUDAKernelLauncher(features, outFeatures, outGrad,
indicePairs, indiceNum);
};
torch::Tensor indice_maxpool_backward_impl(torch::Tensor features,
torch::Tensor outFeatures,
torch::Tensor outGrad,
torch::Tensor indicePairs,
torch::Tensor indiceNum);
REGISTER_DEVICE_IMPL(indice_maxpool_backward_impl, CUDA,
indice_maxpool_backward_cuda)
torch::Tensor IndiceConvForwardCUDAKernelLauncher(
torch::Tensor features, torch::Tensor filters, torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t numActOut, int64_t _inverse,
int64_t _subM);
torch::Tensor indice_conv_forward_cuda(torch::Tensor features,
torch::Tensor filters,
torch::Tensor indicePairs,
torch::Tensor indiceNum,
int64_t numActOut, int64_t _inverse,
int64_t _subM) {
return IndiceConvForwardCUDAKernelLauncher(
features, filters, indicePairs, indiceNum, numActOut, _inverse, _subM);
};
torch::Tensor indice_conv_forward_impl(torch::Tensor features,
torch::Tensor filters,
torch::Tensor indicePairs,
torch::Tensor indiceNum,
int64_t numActOut, int64_t _inverse,
int64_t _subM);
REGISTER_DEVICE_IMPL(indice_conv_forward_impl, CUDA, indice_conv_forward_cuda);
std::vector<torch::Tensor> IndiceConvBackwardCUDAKernelLauncher(
torch::Tensor features, torch::Tensor filters, torch::Tensor outGrad,
torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t _inverse,
int64_t _subM);
std::vector<torch::Tensor> indice_conv_backward_cuda(
torch::Tensor features, torch::Tensor filters, torch::Tensor outGrad,
torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t _inverse,
int64_t _subM) {
return IndiceConvBackwardCUDAKernelLauncher(
features, filters, outGrad, indicePairs, indiceNum, _inverse, _subM);
};
std::vector<torch::Tensor> indice_conv_backward_impl(
torch::Tensor features, torch::Tensor filters, torch::Tensor outGrad,
torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t _inverse,
int64_t _subM);
REGISTER_DEVICE_IMPL(indice_conv_backward_impl, CUDA,
indice_conv_backward_cuda);
torch::Tensor FusedIndiceConvBatchnormCUDAKernelLauncher(
torch::Tensor features, torch::Tensor filters, torch::Tensor bias,
torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,
int64_t _inverse, int64_t _subM);
torch::Tensor fused_indice_conv_batchnorm_forward_cuda(
torch::Tensor features, torch::Tensor filters, torch::Tensor bias,
torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,
int64_t _inverse, int64_t _subM) {
return FusedIndiceConvBatchnormCUDAKernelLauncher(features, filters, bias,
indicePairs, indiceNum,
numActOut, _inverse, _subM);
};
torch::Tensor fused_indice_conv_batchnorm_forward_impl(
torch::Tensor features, torch::Tensor filters, torch::Tensor bias,
torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,
int64_t _inverse, int64_t _subM);
REGISTER_DEVICE_IMPL(fused_indice_conv_batchnorm_forward_impl, CUDA,
fused_indice_conv_batchnorm_forward_cuda)
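// How the blocks above fit together: each op pairs a CUDA kernel launcher, a
// thin *_cuda wrapper, a declaration of the device-agnostic *_impl, and a
// REGISTER_DEVICE_IMPL(impl, CUDA, wrapper) call that binds the wrapper to
// that impl at load time. A much-simplified sketch of the idea (not the
// registry mmcv actually uses; all names below are illustrative):
#include <map>
#include <string>
template <typename FnPtr>
struct SimpleRegistry {
  std::map<std::string, FnPtr> table;
  void add(const std::string &device, FnPtr fn) { table[device] = fn; }
  FnPtr find(const std::string &device) const { return table.at(device); }
};
// A registration along the lines of
//   REGISTER_DEVICE_IMPL(indice_maxpool_forward_impl, CUDA,
//                        indice_maxpool_forward_cuda)
// then amounts to something like
//   static SimpleRegistry<decltype(&indice_maxpool_forward_impl)> registry;
//   struct Registerer {
//     Registerer() { registry.add("CUDA", indice_maxpool_forward_cuda); }
//   } static_registerer;
// and the impl dispatches on the input tensor's device key at call time.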
void MinAreaPolygonsCUDAKernelLauncher(const Tensor pointsets, Tensor polygons);
void min_area_polygons_cuda(const Tensor pointsets, Tensor polygons) {
...
#include <cuda_runtime_api.h>
#include <torch/script.h>
#include <utils/spconv/spconv/indice.h>
#include <utils/spconv/spconv/reordering.h>
#include "../spconv_utils.h"
#include "pytorch_cuda_helper.hpp"
torch::Tensor FusedIndiceConvBatchnormCUDAKernelLauncher(
torch::Tensor features, torch::Tensor filters, torch::Tensor bias,
torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,
int64_t _inverse, int64_t _subM) {
at::cuda::CUDAGuard device_guard(features.device());
bool subM = _subM != 0;
bool inverse = _inverse != 0;
auto device = features.device().type();
auto ndim = filters.dim() - 2;
auto kernelVolume = indicePairs.size(0);
auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto indicePairMaxSizeIter =
std::max_element(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
int indicePairMaxOffset =
indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
int indicePairMaxSize = *indicePairMaxSizeIter;
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
torch::Tensor output =
torch::zeros({numActOut, numOutPlanes}, options).copy_(bias);
torch::Tensor inputBuffer =
torch::zeros({indicePairMaxSize, numInPlanes}, options);
torch::Tensor outputBuffer =
torch::zeros({indicePairMaxSize, numOutPlanes}, options);
filters = filters.view({-1, numInPlanes, numOutPlanes});
if (subM) {  // the center index of subM conv doesn't need gather and
             // scatter-add.
torch::mm_out(output, features, filters[indicePairMaxOffset]);
}
double totalGatherTime = 0;
double totalGEMMTime = 0;
double totalSAddTime = 0;
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
features.scalar_type(), "FusedIndiceConvBatchnormKernel", [&] {
auto outputBufferBlob = torch::from_blob(
outputBuffer.data_ptr<scalar_t>(), {nHot, numOutPlanes}, options);
auto inputBufferBlob = torch::from_blob(
inputBuffer.data_ptr<scalar_t>(), {nHot, numInPlanes}, options);
if (device == torch::kCPU) {
functor::SparseGatherFunctor<tv::CPU, scalar_t, int> gatherFtor;
gatherFtor(tv::CPU(), tv::torch2tv<scalar_t>(inputBuffer),
tv::torch2tv<const scalar_t>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
} else {
functor::SparseGatherFunctor<tv::TorchGPU, scalar_t, int>
gatherFtor;
gatherFtor(tv::TorchGPU(), tv::torch2tv<scalar_t>(inputBuffer),
tv::torch2tv<const scalar_t>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR();
/* slower than SparseGatherFunctor, possibly due to the int->long conversion
auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
auto indicePairBlob =
torch::from_blob(indicePairLong.data_ptr<long>(), {nHot},
indicePairOptions); torch::index_select_out(inputBufferBlob,
features, 0, indicePairBlob);*/
}
torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
if (device == torch::kCPU) {
functor::SparseScatterAddFunctor<tv::CPU, scalar_t, int>
scatterFtor;
scatterFtor(
tv::CPU(), tv::torch2tv<scalar_t>(output),
tv::torch2tv<const scalar_t>(outputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
true);
} else {
functor::SparseScatterAddFunctor<tv::TorchGPU, scalar_t, int>
scatterFtor;
scatterFtor(
tv::TorchGPU(), tv::torch2tv<scalar_t>(output),
tv::torch2tv<const scalar_t>(outputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
true);
TV_CHECK_CUDA_ERR();
}
});
}
return output;
}
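// What the launcher above does, per kernel offset k with nHot active pairs:
// gather the matching input rows, multiply them by filters[k] (shape
// Cin x Cout after the view above), and scatter-add the product into the
// matching output rows. The output is pre-filled with bias, so if the batch
// norm has been folded into filters/bias on the Python side (an assumption
// here, not shown in this file), the result is conv + bn in one pass. A dense
// reference of the per-offset step, with plain arrays and illustrative names:
template <typename T>
void conv_offset_ref(T *output, const T *features, const T *filterK,
                     const int *idxIn, const int *idxOut, int nHot, int Cin,
                     int Cout) {
  for (int r = 0; r < nHot; ++r)        // one gathered input row per pair
    for (int co = 0; co < Cout; ++co) {
      T acc = T(0);
      for (int ci = 0; ci < Cin; ++ci)
        acc += features[idxIn[r] * Cin + ci] * filterK[ci * Cout + co];
      output[idxOut[r] * Cout + co] += acc;  // scatter-add into the output row
    }
}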
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <ATen/ATen.h>
#include <utils/spconv/spconv/indice.h>
#include <utils/spconv/spconv/mp_helper.h>
#include <utils/spconv/tensorview/helper_launch.h>
#include <utils/spconv/tensorview/tensorview.h>
#include <chrono>
#include <limits>
#include <spconv/indice.cuh>
#include <type_traits>
#include "../spconv_utils.h"
#include "pytorch_cuda_helper.hpp"
namespace functor {
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctorP1<tv::TorchGPU, Index, IndexGrid, NDim> {
Index operator()(const tv::TorchGPU &d, tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
tv::TensorView<Index> indicePairUnique,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose) {
Index batchSize = gridsOut.dim(0);
auto numActIn = indicesIn.dim(0);
if (numActIn == 0) return 0;
if (transpose)
prepareDeConvIndicePairsKernel<Index, IndexGrid, NDim, 4096>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, indicesOut, gridsOut, indicePairs,
indiceNum, indicePairUnique, kernelSize, stride,
padding, dilation, outSpatialShape);
else
prepareIndicePairsKernel<Index, IndexGrid, NDim, 4096>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, indicesOut, gridsOut, indicePairs,
indiceNum, indicePairUnique, kernelSize, stride,
padding, dilation, outSpatialShape);
TV_CHECK_CUDA_ERR();
return 1;
}
};
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctorP2<tv::TorchGPU, Index, IndexGrid, NDim> {
Index operator()(const tv::TorchGPU &d, tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
tv::TensorView<Index> indicePairUnique,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid) {
Index batchSize = gridsOut.dim(0);
auto kernelVolume = indicePairs.dim(0);
auto numActIn = indicesIn.dim(0);
if (numActIn == 0) return 0;
Index numAct = indicePairUnique.dim(0) - 1;
assignGridAndIndiceOutKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesOut, gridsOut, numAct, indicePairs,
indicePairUnique, outSpatialShape, batchSize);
TV_CHECK_CUDA_ERR();
assignIndicePairsKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesOut, gridsOut, numActIn, indicePairs,
indicePairUnique, outSpatialShape);
TV_CHECK_CUDA_ERR();
if (resetGrid) {
resetGridKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicePairUnique.data(), gridsOut, numAct);
TV_CHECK_CUDA_ERR();
}
return numAct;
}
};
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateSubMIndicePairFunctor<tv::TorchGPU, Index, IndexGrid, NDim> {
Index operator()(const tv::TorchGPU &d, tv::TensorView<const Index> indicesIn,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid) {
auto numActIn = indicesIn.dim(0);
if (numActIn == 0) return 0;
prepareSubMGridKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, gridsOut, outSpatialShape);
TV_CHECK_CUDA_ERR();
getSubMIndicePairsKernel<Index, IndexGrid, NDim, 4096>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, gridsOut, indicePairs, indiceNum,
kernelSize, stride, padding, dilation,
outSpatialShape);
TV_CHECK_CUDA_ERR();
if (resetGrid) {
resetGridSubMKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn.data(), gridsOut, outSpatialShape,
numActIn);
TV_CHECK_CUDA_ERR();
}
return numActIn;
}
};
} // namespace functor
#define DECLARE_GPU_SPECS_INDEX_NDIM(Index, NDIM) \
template struct functor::CreateConvIndicePairFunctor<tv::TorchGPU, Index, \
int, NDIM>; \
template struct functor::CreateConvIndicePairFunctorP1<tv::TorchGPU, Index, \
int, NDIM>; \
template struct functor::CreateConvIndicePairFunctorP2<tv::TorchGPU, Index, \
int, NDIM>; \
template struct functor::CreateSubMIndicePairFunctor<tv::TorchGPU, Index, \
int, NDIM>;
#define DECLARE_GPU_INDEX(Index) \
DECLARE_GPU_SPECS_INDEX_NDIM(Index, 1); \
DECLARE_GPU_SPECS_INDEX_NDIM(Index, 2); \
DECLARE_GPU_SPECS_INDEX_NDIM(Index, 3); \
DECLARE_GPU_SPECS_INDEX_NDIM(Index, 4);
DECLARE_GPU_INDEX(int);
#undef DECLARE_GPU_INDEX
#undef DECLARE_GPU_SPECS_INDEX_NDIM
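// For reference: the index-pair functors above fill, for each of the
// kernelVolume offsets, an (input index, output index) pair per active site
// plus a per-offset count (indiceNum); the gather/GEMM/scatter stage consumes
// exactly these pairs. For a regular (non-subM) convolution the output extent
// per spatial dimension follows the usual formula; a minimal helper
// (illustrative only):
inline int conv_out_size(int in, int kernel, int stride, int padding,
                         int dilation) {
  return (in + 2 * padding - dilation * (kernel - 1) - 1) / stride + 1;
}
// e.g. conv_out_size(41, 3, 2, 1, 1) == 21 for a k=3, stride-2, pad-1 conv.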
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <ATen/ATen.h>
#include <utils/spconv/spconv/maxpool.h>
#include <utils/spconv/spconv/mp_helper.h>
#include <utils/spconv/tensorview/helper_launch.h>
#include <utils/spconv/tensorview/tensorview.h>
#include <chrono>
#include <limits>
#include <type_traits>
#include <utils/spconv/tensorview/helper_kernel.cuh>
#include "../spconv_utils.h"
#include "pytorch_cuda_helper.hpp"
template <typename scalar_t, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolFwdBlockKernel(scalar_t *outFeatures,
const scalar_t *inFeatures,
const Index *indicesIn,
const Index *indicesOut, int numHot,
int numPlanes) {
scalar_t in, out;
int ILPStrideY[NumILP];
Index idxo, idxi;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
outFeatures += blockIdx.y * NumTLP;
inFeatures += blockIdx.y * NumTLP;
for (int ix = blockIdx.x * blockDim.x; ix < numHot;
ix += blockDim.x * gridDim.x) {
{
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
in = inFeatures[idxi];
out = outFeatures[idxo];
if (in > out) {
outFeatures[idxo] = in;
}
}
}
}
}
template <typename scalar_t, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolFwdGenericBlockKernel(scalar_t *outFeatures,
const scalar_t *inFeatures,
const Index *indicesIn,
const Index *indicesOut,
int numHot, int numPlanes) {
int ILPStrideX[NumILP];
Index RI[NumILP];
Index RO[NumILP];
scalar_t in, out;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) {
RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
}
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
in = inFeatures[RI[ilp] + iy];
out = outFeatures[RO[ilp] + iy];
if (in > out) {
outFeatures[RO[ilp] + iy] = in;
}
}
}
}
}
template <typename scalar_t, typename Index, int NumTLP, int NumILP,
typename VecType>
__global__ void maxPoolFwdVecBlockKernel(scalar_t *outFeatures,
const scalar_t *inFeatures,
const Index *indicesIn,
const Index *indicesOut, int numHot,
int numPlanes) {
int ILPStrideY[NumILP];
constexpr int vecloadFactor = sizeof(VecType) / sizeof(scalar_t);
scalar_t bufi[vecloadFactor];
scalar_t bufo[vecloadFactor];
Index idxi, idxo;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
outFeatures += blockIdx.y * NumTLP;
inFeatures += blockIdx.y * NumTLP;
for (int ix = blockIdx.x * blockDim.x * vecloadFactor; ix < numHot;
ix += blockDim.x * gridDim.x * vecloadFactor) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
reinterpret_cast<VecType *>(bufo)[0] =
reinterpret_cast<VecType *>(outFeatures)[idxo];
reinterpret_cast<VecType *>(bufi)[0] =
reinterpret_cast<const VecType *>(inFeatures)[idxi];
#pragma unroll
for (int i = 0; i < vecloadFactor; i++) {
if (bufi[i] > bufo[i]) {
bufo[i] = bufi[i];
}
}
reinterpret_cast<VecType *>(outFeatures)[idxo] =
reinterpret_cast<VecType *>(bufo)[0];
}
}
}
template <typename scalar_t, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolFwdGenericKernel(scalar_t *outFeatures,
const scalar_t *inFeatures,
const Index *indicesIn,
const Index *indicesOut, int numHot,
int numPlanes) {
int ILPStrideX[NumILP];
Index RI[NumILP];
Index RO[NumILP];
scalar_t in, out;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) {
if (ix + ILPStrideX[ilp] < numHot) {
RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
}
}
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
if (ix + ILPStrideX[ilp] < numHot) {
in = inFeatures[RI[ilp] + iy];
out = outFeatures[RO[ilp] + iy];
if (in > out) {
outFeatures[RO[ilp] + iy] = in;
}
}
}
}
}
}
template <typename scalar_t, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolBwdBlockKernel(const scalar_t *outFeatures,
const scalar_t *inFeatures,
const scalar_t *fout, scalar_t *fin,
const Index *indicesIn,
const Index *indicesOut, int numHot,
int numPlanes) {
scalar_t in, out;
Index idxo, idxi;
int ILPStrideY[NumILP];
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
outFeatures += blockIdx.y * NumTLP;
inFeatures += blockIdx.y * NumTLP;
fout += blockIdx.y * NumTLP;
fin += blockIdx.y * NumTLP;
for (int ix = blockIdx.x * blockDim.x; ix < numHot;
ix += blockDim.x * gridDim.x) {
{
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
in = inFeatures[idxi];
out = outFeatures[idxo];
if (in == out) {
fin[idxi] += fout[idxo];
}
}
}
}
}
template <typename scalar_t, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolBwdGenericBlockKernel(
const scalar_t *outFeatures, const scalar_t *inFeatures,
const scalar_t *fout, scalar_t *fin, const Index *indicesIn,
const Index *indicesOut, int numHot, int numPlanes) {
int ILPStrideX[NumILP];
Index RI[NumILP];
Index RO[NumILP];
scalar_t in, out;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) {
RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
}
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
in = inFeatures[RI[ilp] + iy];
out = outFeatures[RO[ilp] + iy];
if (in == out) {
fin[RI[ilp] + iy] += fout[RO[ilp] + iy];
}
}
}
}
}
template <typename scalar_t, typename Index, int NumTLP, int NumILP,
typename VecType>
__global__ void maxPoolBwdVecBlockKernel(const scalar_t *outFeatures,
const scalar_t *inFeatures,
const scalar_t *fout, scalar_t *fin,
const Index *indicesIn,
const Index *indicesOut, int numHot,
int numPlanes) {
int ILPStrideY[NumILP];
constexpr int vecloadFactor = sizeof(VecType) / sizeof(scalar_t);
scalar_t bufi[vecloadFactor];
scalar_t bufo[vecloadFactor];
scalar_t bufdi[vecloadFactor];
scalar_t bufdo[vecloadFactor];
Index idxi, idxo;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
outFeatures += blockIdx.y * NumTLP;
inFeatures += blockIdx.y * NumTLP;
for (int ix = blockIdx.x * blockDim.x * vecloadFactor; ix < numHot;
ix += blockDim.x * gridDim.x * vecloadFactor) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
reinterpret_cast<VecType *>(bufo)[0] =
reinterpret_cast<const VecType *>(outFeatures)[idxo];
reinterpret_cast<VecType *>(bufi)[0] =
reinterpret_cast<const VecType *>(inFeatures)[idxi];
reinterpret_cast<VecType *>(bufdo)[0] =
reinterpret_cast<const VecType *>(fout)[idxo];
reinterpret_cast<VecType *>(bufdi)[0] =
reinterpret_cast<VecType *>(fin)[idxi];
#pragma unroll
for (int i = 0; i < vecloadFactor; i++) {
if (bufi[i] == bufo[i]) {
bufdi[i] += bufdo[i];
}
}
reinterpret_cast<VecType *>(fin)[idxi] =
reinterpret_cast<VecType *>(bufdi)[0];
}
}
}
template <typename scalar_t, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolBwdGenericKernel(const scalar_t *outFeatures,
const scalar_t *inFeatures,
const scalar_t *fout, scalar_t *fin,
const Index *indicesIn,
const Index *indicesOut, int numHot,
int numPlanes) {
int ILPStrideX[NumILP];
Index RI[NumILP];
Index RO[NumILP];
scalar_t in, out;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) {
if (ix + ILPStrideX[ilp] < numHot) {
RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
}
}
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
if (ix + ILPStrideX[ilp] < numHot) {
in = inFeatures[RI[ilp] + iy];
out = outFeatures[RO[ilp] + iy];
if (in == out) {
fin[RI[ilp] + iy] += fout[RO[ilp] + iy];
}
}
}
}
}
}
namespace functor {
template <typename scalar_t, typename Index>
struct SparseMaxPoolForwardFunctor<tv::TorchGPU, scalar_t, Index> {
using vecload_type_t =
std::conditional_t<std::is_same<scalar_t, at::Half>::value, int2, int4>;
using kernel_block_t = mp_list_c<int, 64, 32, 16>;
void operator()(const tv::TorchGPU &d, tv::TensorView<scalar_t> outFeatures,
tv::TensorView<const scalar_t> inFeatures,
tv::TensorView<const Index> indices, int size) {
if (size <= 0) return;
int numPlanes = inFeatures.dim(1);
bool notFound = true;
constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(scalar_t);
mp_for_each<kernel_block_t>([=, &outFeatures, &inFeatures, &indices,
&notFound](auto NumTLP) {
constexpr int NumILP = NumTLP / 4;
int numHotBlock = (size / NumTLP) * NumTLP;
if (notFound) {
if (numPlanes % NumTLP == 0) {
if (numHotBlock >= NumTLP) {
maxPoolFwdVecBlockKernel<scalar_t, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
d.getStream()>>>(outFeatures.data(), inFeatures.data(),
indices.subview(0).data(),
indices.subview(1).data(), numHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
if (size > numHotBlock) {
maxPoolFwdGenericKernel<scalar_t, Index, int(NumTLP), NumILP>
<<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
0, d.getStream()>>>(outFeatures.data(), inFeatures.data(),
indices.subview(0).data() + numHotBlock,
indices.subview(1).data() + numHotBlock,
size - numHotBlock, numPlanes);
TV_CHECK_CUDA_ERR();
}
notFound = false;
}
}
});
if (notFound) {
constexpr int NumTLP = 64;
constexpr int NumILP = NumTLP / 4;
int numHotBlock = (size / NumTLP) * NumTLP;
if (numHotBlock >= NumTLP) {
maxPoolFwdGenericBlockKernel<scalar_t, Index, NumTLP, NumILP>
<<<dim3(size / NumTLP, tv::launch::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
outFeatures.data(), inFeatures.data(),
indices.subview(0).data(), indices.subview(1).data(),
numHotBlock, numPlanes);
TV_CHECK_CUDA_ERR();
}
if (size > numHotBlock) {
maxPoolFwdGenericKernel<scalar_t, Index, NumTLP, NumILP>
<<<dim3(1, tv::launch::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
outFeatures.data(), inFeatures.data(),
indices.subview(0).data() + numHotBlock,
indices.subview(1).data() + numHotBlock, size - numHotBlock,
numPlanes);
TV_CHECK_CUDA_ERR();
}
}
}
};
template <typename scalar_t, typename Index>
struct SparseMaxPoolBackwardFunctor<tv::TorchGPU, scalar_t, Index> {
using vecload_type_t =
std::conditional_t<std::is_same<scalar_t, at::Half>::value, int2, int4>;
using kernel_block_t = mp_list_c<int, 64, 32, 16>;
void operator()(const tv::TorchGPU &d,
tv::TensorView<const scalar_t> outFeatures,
tv::TensorView<const scalar_t> inFeatures,
tv::TensorView<const scalar_t> fout,
tv::TensorView<scalar_t> fin,
tv::TensorView<const Index> indices, int size) {
if (size <= 0) return;
int numPlanes = inFeatures.dim(1);
bool notFound = true;
constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(scalar_t);
mp_for_each<kernel_block_t>([=, &outFeatures, &inFeatures, &fout, &fin,
&indices, &notFound](auto NumTLP) {
constexpr int NumILP = NumTLP / 4;
int numHotBlock = (size / NumTLP) * NumTLP;
if (notFound) {
if (numPlanes % NumTLP == 0) {
if (numHotBlock >= NumTLP) {
maxPoolBwdVecBlockKernel<scalar_t, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
d.getStream()>>>(outFeatures.data(), inFeatures.data(),
fout.data(), fin.data(),
indices.subview(0).data(),
indices.subview(1).data(), numHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
if (size > numHotBlock) {
maxPoolBwdGenericKernel<scalar_t, Index, int(NumTLP), NumILP>
<<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
0, d.getStream()>>>(outFeatures.data(), inFeatures.data(),
fout.data(), fin.data(),
indices.subview(0).data() + numHotBlock,
indices.subview(1).data() + numHotBlock,
size - numHotBlock, numPlanes);
TV_CHECK_CUDA_ERR();
}
notFound = false;
}
}
});
if (notFound) {
constexpr int NumTLP = 64;
constexpr int NumILP = NumTLP / 4;
int numHotBlock = (size / NumTLP) * NumTLP;
if (numHotBlock >= NumTLP) {
maxPoolBwdGenericBlockKernel<scalar_t, Index, NumTLP, NumILP>
<<<dim3(size / NumTLP, tv::launch::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
outFeatures.data(), inFeatures.data(), fout.data(), fin.data(),
indices.subview(0).data(), indices.subview(1).data(),
numHotBlock, numPlanes);
TV_CHECK_CUDA_ERR();
}
if (size > numHotBlock) {
maxPoolBwdGenericKernel<scalar_t, Index, NumTLP, NumILP>
<<<dim3(1, tv::launch::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
outFeatures.data(), inFeatures.data(), fout.data(), fin.data(),
indices.subview(0).data() + numHotBlock,
indices.subview(1).data() + numHotBlock, size - numHotBlock,
numPlanes);
TV_CHECK_CUDA_ERR();
}
}
}
};
} // namespace functor
#define DECLARE_GPU_SPECS_T_INDEX(scalar_t, Index) \
template struct functor::SparseMaxPoolForwardFunctor<tv::TorchGPU, scalar_t, \
Index>; \
template struct functor::SparseMaxPoolBackwardFunctor<tv::TorchGPU, \
scalar_t, Index>;
#define DECLARE_GPU_SPECS(scalar_t) DECLARE_GPU_SPECS_T_INDEX(scalar_t, int);
DECLARE_GPU_SPECS(float);
DECLARE_GPU_SPECS(double);
DECLARE_GPU_SPECS(at::Half);
#undef DECLARE_GPU_SPECS
#undef DECLARE_GPU_SPECS_T_INDEX
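// The vectorized kernels above widen memory transactions by loading features
// through int2 (for half) or int4 (for float/double); vecloadFactor is the
// number of scalars packed into one such vector. A quick check of that
// arithmetic (at::Half is 2 bytes, like __half):
#include <cuda_fp16.h>
#include <vector_types.h>
static_assert(sizeof(int2) / sizeof(__half) == 4, "4 halves per 8-byte load");
static_assert(sizeof(int4) / sizeof(float) == 4, "4 floats per 16-byte load");
static_assert(sizeof(int4) / sizeof(double) == 2, "2 doubles per 16-byte load");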
#include <cuda_runtime_api.h>
#include <torch/script.h>
#include <utils/spconv/spconv/maxpool.h>
#include "../spconv_utils.h"
#include "pytorch_cuda_helper.hpp"
torch::Tensor IndiceMaxpoolForwardCUDAKernelLauncher(torch::Tensor features,
torch::Tensor indicePairs,
torch::Tensor indiceNum,
int64_t numAct) {
at::cuda::CUDAGuard device_guard(features.device());
auto device = features.device().type();
auto kernelVolume = indicePairs.size(0);
auto numInPlanes = features.size(1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
torch::Tensor output = torch::zeros({numAct, numInPlanes}, options);
double totalTime = 0;
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0) {
continue;
}
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
features.scalar_type(), "IndiceMaxpoolForwardKernel", [&] {
if (device == torch::kCPU) {
functor::SparseMaxPoolForwardFunctor<tv::CPU, scalar_t, int>
forwardFtor;
forwardFtor(tv::CPU(), tv::torch2tv<scalar_t>(output),
tv::torch2tv<const scalar_t>(features),
tv::torch2tv<const int>(indicePairs).subview(i), nHot);
} else {
functor::SparseMaxPoolForwardFunctor<tv::TorchGPU, scalar_t, int>
forwardFtor;
forwardFtor(tv::TorchGPU(), tv::torch2tv<scalar_t>(output),
tv::torch2tv<const scalar_t>(features),
tv::torch2tv<const int>(indicePairs).subview(i), nHot);
TV_CHECK_CUDA_ERR();
}
});
}
return output;
}
torch::Tensor IndiceMaxpoolBackwardCUDAKernelLauncher(torch::Tensor features,
torch::Tensor outFeatures,
torch::Tensor outGrad,
torch::Tensor indicePairs,
torch::Tensor indiceNum) {
at::cuda::CUDAGuard device_guard(features.device());
auto device = features.device().type();
auto numInPlanes = features.size(1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
auto kernelVolume = indicePairs.size(0);
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0) {
continue;
}
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
features.scalar_type(), "IndiceMaxpoolBackwardKernel", [&] {
if (device == torch::kCPU) {
functor::SparseMaxPoolBackwardFunctor<tv::CPU, scalar_t, int>
backwardFtor;
backwardFtor(tv::CPU(), tv::torch2tv<const scalar_t>(outFeatures),
tv::torch2tv<const scalar_t>(features),
tv::torch2tv<const scalar_t>(outGrad),
tv::torch2tv<scalar_t>(inputGrad),
tv::torch2tv<const int>(indicePairs).subview(i), nHot);
} else {
functor::SparseMaxPoolBackwardFunctor<tv::TorchGPU, scalar_t, int>
backwardFtor;
backwardFtor(tv::TorchGPU(),
tv::torch2tv<const scalar_t>(outFeatures),
tv::torch2tv<const scalar_t>(features),
tv::torch2tv<const scalar_t>(outGrad),
tv::torch2tv<scalar_t>(inputGrad),
tv::torch2tv<const int>(indicePairs).subview(i), nHot);
TV_CHECK_CUDA_ERR();
}
});
}
return inputGrad;
}
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <ATen/ATen.h>
#include <utils/spconv/spconv/mp_helper.h>
#include <utils/spconv/spconv/reordering.h>
#include <utils/spconv/tensorview/helper_launch.h>
#include <utils/spconv/tensorview/tensorview.h>
#include <chrono>
#include <limits>
#include <spconv/reordering.cuh>
#include <type_traits>
#include <utils/spconv/tensorview/helper_kernel.cuh>
#include "../spconv_utils.h"
#include "pytorch_cuda_helper.hpp"
namespace functor {
template <typename scalar_t, typename Index>
struct SparseGatherFunctor<tv::TorchGPU, scalar_t, Index> {
using vecload_type_t =
std::conditional_t<std::is_same<scalar_t, at::Half>::value, int2, int4>;
using kernel_block_t = mp_list_c<int, 64, 32, 16>;
void operator()(const tv::TorchGPU &d, tv::TensorView<scalar_t> buffer,
tv::TensorView<const scalar_t> features,
tv::TensorView<const Index> indices, int size) {
if (size <= 0) return;
int numPlanes = features.dim(1);
bool notFound = true;
constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(scalar_t);
mp_for_each<kernel_block_t>([=, &buffer, &features, &indices,
&notFound](auto NumTLP) {
constexpr int NumILP = NumTLP / 4;
int nHotBlock = (size / NumTLP) * NumTLP;
if (notFound) {
if (numPlanes % NumTLP == 0) {
if (nHotBlock >= NumTLP) {
gatherVecBlockKernel<scalar_t, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(numPlanes / NumTLP, size / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
d.getStream()>>>(buffer.data(), features.data(),
indices.data(), nHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
if (size - nHotBlock > 0) {
gatherVecKernel<scalar_t, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(1, numPlanes / NumTLP),
dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
d.getStream()>>>(buffer.data() + nHotBlock * numPlanes,
features.data(), indices.data() + nHotBlock,
size - nHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
notFound = false;
}
}
});
if (notFound) {
constexpr int NumTLP = 64;
constexpr int NumILP = NumTLP / 4;
gatherGenericKernel<scalar_t, Index, NumTLP, NumILP>
<<<dim3(tv::launch::DivUp(size, NumTLP),
tv::launch::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
buffer.data(), features.data(), indices.data(), size, numPlanes);
TV_CHECK_CUDA_ERR();
}
}
};
template <typename scalar_t, typename Index>
struct SparseScatterAddFunctor<tv::TorchGPU, scalar_t, Index> {
using vecload_type_t =
std::conditional_t<std::is_same<scalar_t, at::Half>::value, int2, int4>;
using kernel_block_t = mp_list_c<int, 64, 32, 16>;
void operator()(const tv::TorchGPU &d, tv::TensorView<scalar_t> outFeatures,
tv::TensorView<const scalar_t> buffer,
tv::TensorView<const Index> indices, int size, bool stable) {
if (size <= 0) return;
int numPlanes = outFeatures.dim(1);
bool notFound = true;
constexpr int vecloadFactor =
sizeof(vecload_type_t) / sizeof(scalar_t); // important for half.
mp_for_each<kernel_block_t>([=, &d, &outFeatures, &buffer, &indices,
&notFound](auto NumTLP) {
constexpr int NumILP = NumTLP / 4;
int nHotBlock = (size / NumTLP) * NumTLP;
if (notFound) {
if (numPlanes % NumTLP == 0) {
if (nHotBlock >= NumTLP) {
scatterAddVecBlockKernel<scalar_t, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(numPlanes / NumTLP, size / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
d.getStream()>>>(outFeatures.data(), buffer.data(),
indices.data(), nHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
if (size - nHotBlock > 0) {
scatterAddGenericKernel<scalar_t, Index, int(NumTLP), NumILP>
<<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
0, d.getStream()>>>(
outFeatures.data(), buffer.data() + nHotBlock * numPlanes,
indices.data() + nHotBlock, size - nHotBlock, numPlanes);
TV_CHECK_CUDA_ERR();
}
notFound = false;
}
}
});
if (notFound) {
constexpr int NumTLP = 64;
constexpr int NumILP = NumTLP / 4;
scatterAddGenericKernel<scalar_t, Index, NumTLP, NumILP>
<<<dim3(tv::launch::DivUp(size, NumTLP),
tv::launch::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
outFeatures.data(), buffer.data(), indices.data(), size,
numPlanes);
TV_CHECK_CUDA_ERR();
}
}
};
} // namespace functor
#define DECLARE_GPU_SPECS_T_INDEX(scalar_t, Index) \
template struct functor::SparseGatherFunctor<tv::TorchGPU, scalar_t, Index>; \
template struct functor::SparseScatterAddFunctor<tv::TorchGPU, scalar_t, \
Index>;
#define DECLARE_GPU_SPECS(scalar_t) DECLARE_GPU_SPECS_T_INDEX(scalar_t, int);
DECLARE_GPU_SPECS(float);
DECLARE_GPU_SPECS(double);
DECLARE_GPU_SPECS(at::Half);
#undef DECLARE_GPU_SPECS
#undef DECLARE_GPU_SPECS_T_INDEX
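// Both reordering functors above (and the max-pool ones earlier) share a tile
// selection scheme: probe NumTLP in {64, 32, 16} and take the first value that
// divides numPlanes so the fully-tiled portion can use the vectorized kernels;
// if none divides, fall back to the boundary-checked generic kernel. Boiled
// down (illustrative only):
#include <initializer_list>
inline int pick_tile_width(int numPlanes) {
  for (int tlp : {64, 32, 16})
    if (numPlanes % tlp == 0) return tlp;
  return 0;  // no aligned tile width -> use the generic kernel
}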
#include <cuda_runtime_api.h>
#include <torch/script.h>
#include <utils/spconv/spconv/indice.h>
#include <utils/spconv/spconv/reordering.h>
#include "../spconv_utils.h"
#include "pytorch_cuda_helper.hpp"
template <unsigned NDim>
std::vector<torch::Tensor> GetIndicePairsForwardCUDAKernelLauncher(
torch::Tensor indices, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {
at::cuda::CUDAGuard device_guard(indices.device());
bool subM = _subM != 0;
bool transpose = _transpose != 0;
auto numAct = indices.size(0);
auto coorDim = indices.size(1) - 1;
TV_ASSERT_RT_ERR(NDim == coorDim, "error");
TV_ASSERT_RT_ERR(kernelSize.size() == coorDim, "error");
TV_ASSERT_RT_ERR(outSpatialShape.size() == coorDim, "error");
TV_ASSERT_RT_ERR(stride.size() == coorDim, "error");
TV_ASSERT_RT_ERR(padding.size() == coorDim, "error");
TV_ASSERT_RT_ERR(outPadding.size() == coorDim, "error");
TV_ASSERT_RT_ERR(dilation.size() == coorDim, "error");
auto kernelVolume = kernelSize[0];
for (int i = 1; i < kernelSize.size(); ++i) {
kernelVolume *= kernelSize[i];
}
TV_ASSERT_RT_ERR(kernelVolume <= 4096, "error");
auto outputVolume = outSpatialShape[0];
for (int i = 1; i < outSpatialShape.size(); ++i) {
outputVolume *= outSpatialShape[i];
}
torch::Tensor indicePairs =
torch::full({kernelVolume, 2, numAct}, -1,
torch::dtype(torch::kInt32).device(indices.device()));
torch::Tensor indiceNum = torch::zeros(
{kernelVolume}, torch::dtype(torch::kInt32).device(indices.device()));
torch::Tensor gridOut =
torch::full({batchSize * outputVolume}, -1,
torch::dtype(torch::kInt32).device(indices.device()));
int64_t numActOut = -1;
tv::SimpleVector<int, NDim> outSpatialShape32;
tv::SimpleVector<int, NDim> kernelSize32;
tv::SimpleVector<int, NDim> stride32;
tv::SimpleVector<int, NDim> padding32;
tv::SimpleVector<int, NDim> dilation32;
auto indicePairUnique = torch::full(
{indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),
torch::dtype(torch::kInt32).device(indices.device()));
for (int i = 0; i < NDim; ++i) {
outSpatialShape32.push_back(outSpatialShape[i]);
kernelSize32.push_back(kernelSize[i]);
if (subM) {
stride32.push_back(1);
padding32.push_back(kernelSize[i] / 2);
dilation32.push_back(dilation[i]);
} else {
stride32.push_back(stride[i]);
padding32.push_back(padding[i]);
dilation32.push_back(dilation[i]);
}
}
if (subM) {
if (indices.device().type() == torch::kCPU) {
auto getIndicePairFtor =
functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim>();
numActOut = getIndicePairFtor(
tv::CPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
dilation32, outSpatialShape32, transpose);
} else {
auto getIndicePairFtor =
functor::CreateSubMIndicePairFunctor<tv::TorchGPU, int, int, NDim>();
numActOut = getIndicePairFtor(
tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
dilation32, outSpatialShape32, transpose);
}
return {indices, indicePairs, indiceNum};
} else {
torch::Tensor outInds =
torch::zeros({numAct * kernelVolume, coorDim + 1},
torch::dtype(torch::kInt32).device(indices.device()));
if (indices.device().type() == torch::kCPU) {
auto getIndicePairFtor =
functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();
numActOut = getIndicePairFtor(
tv::CPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
kernelSize32, stride32, padding32, dilation32, outSpatialShape32,
transpose);
} else {
auto getIndicePairFtorP1 =
functor::CreateConvIndicePairFunctorP1<tv::TorchGPU, int, int,
NDim>();
auto getIndicePairFtorP2 =
functor::CreateConvIndicePairFunctorP2<tv::TorchGPU, int, int,
NDim>();
numActOut = getIndicePairFtorP1(
tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
tv::torch2tv<int>(indicePairUnique), kernelSize32, stride32,
padding32, dilation32, outSpatialShape32, transpose);
if (numActOut > 0) {
auto res = torch::_unique(indicePairUnique);
indicePairUnique = std::get<0>(res);
numActOut = getIndicePairFtorP2(
tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose);
}
}
return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum};
}
}
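// Note on the subM branch above: stride is forced to 1 and padding to
// kernelSize / 2, which (with dilation 1 and odd kernel sizes, the usual subM
// setting) keeps the output spatial extent equal to the input extent, so
// output coordinates coincide with the active input coordinates and indices
// can be returned directly as the output indices. The arithmetic, as a check:
inline int same_size_out(int in, int k) {  // stride = 1, dilation = 1
  return (in + 2 * (k / 2) - (k - 1) - 1) / 1 + 1;  // == in for odd k
}
// e.g. same_size_out(100, 3) == 100 and same_size_out(100, 5) == 100.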
template <unsigned NDim>
std::vector<torch::Tensor> GetIndicePairsBackwardCUDAKernelLauncher(
torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {
at::cuda::CUDAGuard device_guard(indices.device());
bool subM = _subM != 0;
bool transpose = _transpose != 0;
auto numAct = indices.size(0);
auto coorDim = indices.size(1) - 1;
TV_ASSERT_RT_ERR(NDim == coorDim, "error");
TV_ASSERT_RT_ERR(kernelSize.size() == coorDim, "error");
TV_ASSERT_RT_ERR(outSpatialShape.size() == coorDim, "error");
TV_ASSERT_RT_ERR(stride.size() == coorDim, "error");
TV_ASSERT_RT_ERR(padding.size() == coorDim, "error");
TV_ASSERT_RT_ERR(outPadding.size() == coorDim, "error");
TV_ASSERT_RT_ERR(dilation.size() == coorDim, "error");
auto kernelVolume = kernelSize[0];
for (int i = 1; i < kernelSize.size(); ++i) {
kernelVolume *= kernelSize[i];
}
TV_ASSERT_RT_ERR(kernelVolume <= 4096, "error");
auto outputVolume = outSpatialShape[0];
for (int i = 1; i < outSpatialShape.size(); ++i) {
outputVolume *= outSpatialShape[i];
}
TV_ASSERT_INVALID_ARG(gridOut.numel() >= outputVolume * batchSize, "error");
torch::Tensor indicePairs =
torch::full({kernelVolume, 2, numAct}, -1,
torch::dtype(torch::kInt32).device(indices.device()));
torch::Tensor indiceNum = torch::zeros(
{kernelVolume}, torch::dtype(torch::kInt32).device(indices.device()));
int64_t numActOut = -1;
tv::SimpleVector<int, NDim> outSpatialShape32;
tv::SimpleVector<int, NDim> kernelSize32;
tv::SimpleVector<int, NDim> stride32;
tv::SimpleVector<int, NDim> padding32;
tv::SimpleVector<int, NDim> dilation32;
auto indicePairUnique = torch::full(
{indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),
torch::dtype(torch::kInt32).device(indices.device()));
for (int i = 0; i < NDim; ++i) {
outSpatialShape32.push_back(outSpatialShape[i]);
kernelSize32.push_back(kernelSize[i]);
if (subM) {
stride32.push_back(1);
padding32.push_back(kernelSize[i] / 2);
dilation32.push_back(dilation[i]);
} else {
stride32.push_back(stride[i]);
padding32.push_back(padding[i]);
dilation32.push_back(dilation[i]);
}
}
if (subM) {
if (indices.device().type() == torch::kCPU) {
auto getIndicePairFtor =
functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim>();
numActOut = getIndicePairFtor(
tv::CPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
dilation32, outSpatialShape32, transpose);
gridOut.fill_(-1);
} else {
auto getIndicePairFtor =
functor::CreateSubMIndicePairFunctor<tv::TorchGPU, int, int, NDim>();
numActOut = getIndicePairFtor(
tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
dilation32, outSpatialShape32, transpose, true);
}
return {indices, indicePairs, indiceNum};
} else {
torch::Tensor outInds =
torch::zeros({numAct * kernelVolume, coorDim + 1},
torch::dtype(torch::kInt32).device(indices.device()));
if (indices.device().type() == torch::kCPU) {
auto getIndicePairFtor =
functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();
numActOut = getIndicePairFtor(
tv::CPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
kernelSize32, stride32, padding32, dilation32, outSpatialShape32,
transpose, true);
gridOut.fill_(-1);
} else {
auto getIndicePairFtorP1 =
functor::CreateConvIndicePairFunctorP1<tv::TorchGPU, int, int,
NDim>();
auto getIndicePairFtorP2 =
functor::CreateConvIndicePairFunctorP2<tv::TorchGPU, int, int,
NDim>();
numActOut = getIndicePairFtorP1(
tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
tv::torch2tv<int>(indicePairUnique), kernelSize32, stride32,
padding32, dilation32, outSpatialShape32, transpose);
if (numActOut > 0) {
auto res = torch::_unique(indicePairUnique);
indicePairUnique = std::get<0>(res);
numActOut = getIndicePairFtorP2(
tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose,
true);
}
}
return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum};
}
}
torch::Tensor IndiceConvForwardCUDAKernelLauncher(
torch::Tensor features, torch::Tensor filters, torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t numActOut, int64_t _inverse,
int64_t _subM) {
at::cuda::CUDAGuard device_guard(features.device());
bool subM = _subM != 0;
bool inverse = _inverse != 0;
auto device = features.device().type();
auto ndim = filters.dim() - 2;
auto kernelVolume = indicePairs.size(0);
auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto indicePairMaxSizeIter =
std::max_element(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
int indicePairMaxOffset =
indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
int indicePairMaxSize = *indicePairMaxSizeIter;
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
torch::Tensor inputBuffer =
torch::zeros({indicePairMaxSize, numInPlanes}, options);
torch::Tensor outputBuffer =
torch::zeros({indicePairMaxSize, numOutPlanes}, options);
filters = filters.view({-1, numInPlanes, numOutPlanes});
if (subM) {
torch::mm_out(output, features, filters[indicePairMaxOffset]);
}
double totalGatherTime = 0;
double totalGEMMTime = 0;
double totalSAddTime = 0;
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
features.scalar_type(), "IndiceConvForwardKernel", [&] {
auto outputBufferBlob = torch::from_blob(
outputBuffer.data_ptr<scalar_t>(), {nHot, numOutPlanes}, options);
auto inputBufferBlob = torch::from_blob(
inputBuffer.data_ptr<scalar_t>(), {nHot, numInPlanes}, options);
if (device == torch::kCPU) {
functor::SparseGatherFunctor<tv::CPU, scalar_t, int> gatherFtor;
gatherFtor(tv::CPU(), tv::torch2tv<scalar_t>(inputBuffer),
tv::torch2tv<const scalar_t>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
} else {
functor::SparseGatherFunctor<tv::TorchGPU, scalar_t, int>
gatherFtor;
gatherFtor(tv::TorchGPU(), tv::torch2tv<scalar_t>(inputBuffer),
tv::torch2tv<const scalar_t>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR();
/* slower than SparseGatherFunctor, possibly due to the int->long conversion
auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
auto indicePairBlob =
torch::from_blob(indicePairLong.data_ptr<long>(), {nHot},
indicePairOptions); torch::index_select_out(inputBufferBlob,
features, 0, indicePairBlob);*/
}
torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
if (device == torch::kCPU) {
functor::SparseScatterAddFunctor<tv::CPU, scalar_t, int>
scatterFtor;
scatterFtor(
tv::CPU(), tv::torch2tv<scalar_t>(output),
tv::torch2tv<const scalar_t>(outputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
true);
} else {
functor::SparseScatterAddFunctor<tv::TorchGPU, scalar_t, int>
scatterFtor;
scatterFtor(
tv::TorchGPU(), tv::torch2tv<scalar_t>(output),
tv::torch2tv<const scalar_t>(outputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
true);
TV_CHECK_CUDA_ERR();
}
});
}
return output;
}
std::vector<torch::Tensor> IndiceConvBackwardCUDAKernelLauncher(
torch::Tensor features, torch::Tensor filters, torch::Tensor outGrad,
torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t _inverse,
int64_t _subM) {
at::cuda::CUDAGuard device_guard(features.device());
bool subM = _subM != 0;
bool inverse = _inverse != 0;
auto device = features.device().type();
auto ndim = filters.dim() - 2;
auto kernelVolume = indicePairs.size(0);
auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto indicePairMaxSizeIter =
std::max_element(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
int indicePairMaxOffset =
indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
int indicePairMaxSize = *indicePairMaxSizeIter;
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
auto filterShape = filters.sizes();
torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
torch::Tensor filtersGrad = torch::zeros(filterShape, options);
torch::Tensor inputBuffer =
torch::zeros({indicePairMaxSize, numInPlanes}, options);
torch::Tensor outputBuffer =
torch::zeros({indicePairMaxSize, numOutPlanes}, options);
filters = filters.view({-1, numInPlanes, numOutPlanes});
filtersGrad = filtersGrad.view({-1, numInPlanes, numOutPlanes});
if (subM) {
auto filterGradSub = filtersGrad[indicePairMaxOffset];
torch::mm_out(filterGradSub, features.t(), outGrad);
torch::mm_out(inputGrad, outGrad, filters[indicePairMaxOffset].t());
}
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
features.scalar_type(), "IndiceConvBackwardKernel", [&] {
if (device == torch::kCPU) {
functor::SparseGatherFunctor<tv::CPU, scalar_t, int> gatherFtor;
functor::SparseGatherFunctor<tv::CPU, scalar_t, int> gatherFtorOut;
gatherFtor(tv::CPU(), tv::torch2tv<scalar_t>(inputBuffer),
tv::torch2tv<const scalar_t>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
gatherFtorOut(
tv::CPU(), tv::torch2tv<scalar_t>(outputBuffer),
tv::torch2tv<const scalar_t>(outGrad),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot);
} else {
functor::SparseGatherFunctor<tv::TorchGPU, scalar_t, int>
gatherFtor;
functor::SparseGatherFunctor<tv::TorchGPU, scalar_t, int>
gatherFtorOut;
gatherFtor(tv::TorchGPU(), tv::torch2tv<scalar_t>(inputBuffer),
tv::torch2tv<const scalar_t>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR();
gatherFtorOut(
tv::TorchGPU(), tv::torch2tv<scalar_t>(outputBuffer),
tv::torch2tv<const scalar_t>(outGrad),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot);
TV_CHECK_CUDA_ERR();
}
auto filterGradSub = filtersGrad[i];
auto outputBufferBlob = torch::from_blob(
outputBuffer.data_ptr<scalar_t>(), {nHot, numOutPlanes}, options);
auto inputBufferBlob = torch::from_blob(
inputBuffer.data_ptr<scalar_t>(), {nHot, numInPlanes}, options);
torch::mm_out(filterGradSub, inputBufferBlob.t(), outputBufferBlob);
torch::mm_out(inputBufferBlob, outputBufferBlob, filters[i].t());
if (device == torch::kCPU) {
functor::SparseScatterAddFunctor<tv::CPU, scalar_t, int>
scatterFtor;
scatterFtor(
tv::CPU(), tv::torch2tv<scalar_t>(inputGrad),
tv::torch2tv<const scalar_t>(inputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
} else {
functor::SparseScatterAddFunctor<tv::TorchGPU, scalar_t, int>
scatterFtor;
scatterFtor(
tv::TorchGPU(), tv::torch2tv<scalar_t>(inputGrad),
tv::torch2tv<const scalar_t>(inputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
TV_CHECK_CUDA_ERR();
}
});
}
return {inputGrad, filtersGrad.view(filterShape)};
}
template std::vector<torch::Tensor> GetIndicePairsForwardCUDAKernelLauncher<2>(
torch::Tensor indices, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);
template std::vector<torch::Tensor> GetIndicePairsForwardCUDAKernelLauncher<3>(
torch::Tensor indices, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);
template std::vector<torch::Tensor> GetIndicePairsForwardCUDAKernelLauncher<4>(
torch::Tensor indices, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);
template std::vector<torch::Tensor> GetIndicePairsBackwardCUDAKernelLauncher<2>(
torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);
template std::vector<torch::Tensor> GetIndicePairsBackwardCUDAKernelLauncher<3>(
torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
torch::Tensor fused_indice_conv_batchnorm_forward_impl(
torch::Tensor features, torch::Tensor filters, torch::Tensor bias,
torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,
int64_t _inverse, int64_t _subM) {
return DISPATCH_DEVICE_IMPL(fused_indice_conv_batchnorm_forward_impl,
features, filters, bias, indicePairs, indiceNum,
numActOut, _inverse, _subM);
}
torch::Tensor fused_indice_conv_batchnorm_forward(
torch::Tensor features, torch::Tensor filters, torch::Tensor bias,
torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,
int64_t _inverse, int64_t _subM) {
return fused_indice_conv_batchnorm_forward_impl(features, filters, bias,
indicePairs, indiceNum,
numActOut, _inverse, _subM);
}
// Copyright (c) OpenMMLab. All rights reserved
#include <torch/extension.h>
#include "pytorch_cpp_helper.hpp" #include "pytorch_cpp_helper.hpp"
std::string get_compiler_version(); std::string get_compiler_version();
...@@ -240,6 +242,43 @@ void ball_query_forward(Tensor new_xyz_tensor, Tensor xyz_tensor, ...@@ -240,6 +242,43 @@ void ball_query_forward(Tensor new_xyz_tensor, Tensor xyz_tensor,
Tensor idx_tensor, int b, int n, int m, Tensor idx_tensor, int b, int n, int m,
float min_radius, float max_radius, int nsample); float min_radius, float max_radius, int nsample);
template <unsigned NDim>
std::vector<torch::Tensor> get_indice_pairs_forward(
torch::Tensor indices, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);
template <unsigned NDim>
std::vector<Tensor> get_indice_pairs_backward(
Tensor indices, Tensor gridOut, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);
Tensor indice_conv_forward(Tensor features, Tensor filters, Tensor indicePairs,
Tensor indiceNum, int64_t numActOut,
int64_t _inverse, int64_t _subM);
std::vector<Tensor> indice_conv_backward(Tensor features, Tensor filters,
Tensor outGrad, Tensor indicePairs,
Tensor indiceNum, int64_t _inverse,
int64_t _subM);
Tensor fused_indice_conv_batchnorm_forward(Tensor features, Tensor filters,
Tensor bias, Tensor indicePairs,
Tensor indiceNum, int64_t numActOut,
int64_t _inverse, int64_t _subM);
Tensor indice_maxpool_forward(Tensor features, Tensor indicePairs,
Tensor indiceNum, int64_t numAct);
Tensor indice_maxpool_backward(Tensor features, Tensor outFeatures,
Tensor outGrad, Tensor indicePairs,
Tensor indiceNum);
Tensor bottom_pool_forward(Tensor input);
Tensor bottom_pool_backward(Tensor input, Tensor grad_output);
...@@ -602,6 +641,54 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
        "sync_bn backward_data", py::arg("grad_output"), py::arg("weight"),
        py::arg("grad_weight"), py::arg("grad_bias"), py::arg("norm"),
        py::arg("std"), py::arg("grad_input"));
m.def("get_indice_pairs_2d_forward", &get_indice_pairs_forward<2>,
"get_indice_pairs_2d_forward", py::arg("indices"), py::arg("batchSize"),
py::arg("outSpatialShape"), py::arg("spatialShape"),
py::arg("kernelSize"), py::arg("stride"), py::arg("padding"),
py::arg("dilation"), py::arg("outPadding"), py::arg("_subM"),
py::arg("_transpose"));
m.def("get_indice_pairs_3d_forward", &get_indice_pairs_forward<3>,
"get_indice_pairs_3d_forward", py::arg("indices"), py::arg("batchSize"),
py::arg("outSpatialShape"), py::arg("spatialShape"),
py::arg("kernelSize"), py::arg("stride"), py::arg("padding"),
py::arg("dilation"), py::arg("outPadding"), py::arg("_subM"),
py::arg("_transpose"));
m.def("get_indice_pairs_4d_forward", &get_indice_pairs_forward<4>,
"get_indice_pairs_4d_forward", py::arg("indices"), py::arg("batchSize"),
py::arg("outSpatialShape"), py::arg("spatialShape"),
py::arg("kernelSize"), py::arg("stride"), py::arg("padding"),
py::arg("dilation"), py::arg("outPadding"), py::arg("_subM"),
py::arg("_transpose"));
m.def("get_indice_pairs_2d_backward", &get_indice_pairs_backward<2>,
"get_indice_pairs_2d_backward", py::arg("indices"), py::arg("gridOut"),
py::arg("batchSize"), py::arg("outSpatialShape"),
py::arg("spatialShape"), py::arg("kernelSize"), py::arg("stride"),
py::arg("padding"), py::arg("dilation"), py::arg("outPadding"),
py::arg("_subM"), py::arg("_transpose"));
m.def("get_indice_pairs_3d_backward", &get_indice_pairs_backward<3>,
"get_indice_pairs_3d_backward", py::arg("indices"), py::arg("gridOut"),
py::arg("batchSize"), py::arg("outSpatialShape"),
py::arg("spatialShape"), py::arg("kernelSize"), py::arg("stride"),
py::arg("padding"), py::arg("dilation"), py::arg("outPadding"),
py::arg("_subM"), py::arg("_transpose"));
m.def("indice_conv_forward", &indice_conv_forward, "indice_conv_forward",
py::arg("features"), py::arg("filters"), py::arg("indicePairs"),
py::arg("indiceNum"), py::arg("numActOut"), py::arg("_inverse"),
py::arg("_subM"));
m.def("indice_conv_backward", &indice_conv_backward, "indice_conv_backward",
py::arg("features"), py::arg("filters"), py::arg("outGrad"),
py::arg("indicePairs"), py::arg("indiceNum"), py::arg("_inverse"),
py::arg("_subM"));
m.def("fused_indice_conv_forward", &fused_indice_conv_batchnorm_forward,
"fused_indice_conv_forward", py::arg("features"), py::arg("filters"),
py::arg("bias"), py::arg("indicePairs"), py::arg("indiceNum"),
py::arg("numActOut"), py::arg("_inverse"), py::arg("_subM"));
m.def("indice_maxpool_forward", &indice_maxpool_forward,
"indice_maxpool_forward", py::arg("features"), py::arg("indicePairs"),
py::arg("indiceNum"), py::arg("numAct"));
m.def("indice_maxpool_backward", &indice_maxpool_backward,
"indice_maxpool_backward", py::arg("features"), py::arg("outFeatures"),
py::arg("outGrad"), py::arg("indicePairs"), py::arg("indiceNum"));
m.def("psamask_forward", &psamask_forward, "PSAMASK forward (CPU/CUDA)", m.def("psamask_forward", &psamask_forward, "PSAMASK forward (CPU/CUDA)",
py::arg("input"), py::arg("output"), py::arg("psa_type"), py::arg("input"), py::arg("output"), py::arg("psa_type"),
py::arg("num_"), py::arg("h_feature"), py::arg("w_feature"), py::arg("num_"), py::arg("h_feature"), py::arg("w_feature"),
......
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
torch::Tensor indice_maxpool_forward_impl(torch::Tensor features,
torch::Tensor indicePairs,
torch::Tensor indiceNum,
int64_t numAct) {
return DISPATCH_DEVICE_IMPL(indice_maxpool_forward_impl, features,
indicePairs, indiceNum, numAct);
}
torch::Tensor indice_maxpool_forward(torch::Tensor features,
torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t numAct) {
return indice_maxpool_forward_impl(features, indicePairs, indiceNum, numAct);
}
torch::Tensor indice_maxpool_backward_impl(torch::Tensor features,
torch::Tensor outFeatures,
torch::Tensor outGrad,
torch::Tensor indicePairs,
torch::Tensor indiceNum) {
return DISPATCH_DEVICE_IMPL(indice_maxpool_backward_impl, features,
outFeatures, outGrad, indicePairs, indiceNum);
}
torch::Tensor indice_maxpool_backward(torch::Tensor features,
torch::Tensor outFeatures,
torch::Tensor outGrad,
torch::Tensor indicePairs,
torch::Tensor indiceNum) {
return indice_maxpool_backward_impl(features, outFeatures, outGrad,
indicePairs, indiceNum);
}
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
template <unsigned NDim>
std::vector<torch::Tensor> GetIndicePairsForwardCUDAKernelLauncher(
torch::Tensor indices, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);
template <unsigned NDim>
std::vector<torch::Tensor> get_indice_pairs_forward_cuda(
torch::Tensor indices, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {
return GetIndicePairsForwardCUDAKernelLauncher<NDim>(
indices, batchSize, outSpatialShape, spatialShape, kernelSize, stride,
padding, dilation, outPadding, _subM, _transpose);
};
template <unsigned NDim>
std::vector<torch::Tensor> GetIndicePairsBackwardCUDAKernelLauncher(
torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);
template <unsigned NDim>
std::vector<torch::Tensor> get_indice_pairs_backward_cuda(
torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {
return GetIndicePairsBackwardCUDAKernelLauncher<NDim>(
indices, gridOut, batchSize, outSpatialShape, spatialShape, kernelSize,
stride, padding, dilation, outPadding, _subM, _transpose);
};
template <unsigned NDim>
std::vector<torch::Tensor> get_indice_pairs_forward(
torch::Tensor indices, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {
if (indices.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(indices);
return get_indice_pairs_forward_cuda<NDim>(
indices, batchSize, outSpatialShape, spatialShape, kernelSize, stride,
padding, dilation, outPadding, _subM, _transpose);
#else
AT_ERROR("get_indice_pairs is not compiled with GPU support");
#endif
} else {
AT_ERROR("get_indice_pairs is not implemented on CPU");
}
}
template <unsigned NDim>
std::vector<torch::Tensor> get_indice_pairs_backward(
torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {
if (indices.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(indices);
CHECK_CUDA_INPUT(gridOut);
return get_indice_pairs_backward_cuda<NDim>(
indices, gridOut, batchSize, outSpatialShape, spatialShape, kernelSize,
stride, padding, dilation, outPadding, _subM, _transpose);
#else
AT_ERROR("get_indice_pairs is not compiled with GPU support");
#endif
} else {
AT_ERROR("get_indice_pairs is not implemented on CPU");
}
}
torch::Tensor indice_conv_forward_impl(torch::Tensor features,
torch::Tensor filters,
torch::Tensor indicePairs,
torch::Tensor indiceNum,
int64_t numActOut, int64_t _inverse,
int64_t _subM) {
return DISPATCH_DEVICE_IMPL(indice_conv_forward_impl, features, filters,
indicePairs, indiceNum, numActOut, _inverse,
_subM);
}
torch::Tensor indice_conv_forward(torch::Tensor features, torch::Tensor filters,
torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t numActOut,
int64_t _inverse, int64_t _subM) {
return indice_conv_forward_impl(features, filters, indicePairs, indiceNum,
numActOut, _inverse, _subM);
}
std::vector<torch::Tensor> indice_conv_backward_impl(
torch::Tensor features, torch::Tensor filters, torch::Tensor outGrad,
torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t _inverse,
int64_t _subM) {
return DISPATCH_DEVICE_IMPL(indice_conv_backward_impl, features, filters,
outGrad, indicePairs, indiceNum, _inverse, _subM);
}
std::vector<torch::Tensor> indice_conv_backward(
torch::Tensor features, torch::Tensor filters, torch::Tensor outGrad,
torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t _inverse,
int64_t _subM) {
return indice_conv_backward_impl(features, filters, outGrad, indicePairs,
indiceNum, _inverse, _subM);
}
template std::vector<torch::Tensor> get_indice_pairs_forward<2>(
torch::Tensor indices, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);
template std::vector<torch::Tensor> get_indice_pairs_forward<3>(
torch::Tensor indices, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);
template std::vector<torch::Tensor> get_indice_pairs_forward<4>(
torch::Tensor indices, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);
template std::vector<torch::Tensor> get_indice_pairs_backward<2>(
torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);
template std::vector<torch::Tensor> get_indice_pairs_backward<3>(
torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <torch/script.h>
#include <utils/spconv/tensorview/tensorview.h>
#include "pytorch_cuda_helper.hpp"
namespace tv {
struct GPU {
GPU(cudaStream_t s = 0) : mStream(s) {}
virtual cudaStream_t getStream() const { return mStream; }
cudaStream_t mStream = 0;
};
struct TorchGPU : public tv::GPU {
virtual cudaStream_t getStream() const override {
return at::cuda::getCurrentCUDAStream();
}
};
template <typename scalar_t>
void check_torch_dtype(const torch::Tensor &tensor) {
switch (tensor.type().scalarType()) {
case at::ScalarType::Double: {
auto val = std::is_same<std::remove_const_t<scalar_t>, double>::value;
TV_ASSERT_RT_ERR(val, "error");
break;
}
case at::ScalarType::Float: {
auto val = std::is_same<std::remove_const_t<scalar_t>, float>::value;
TV_ASSERT_RT_ERR(val, "error");
break;
}
case at::ScalarType::Int: {
auto val = std::is_same<std::remove_const_t<scalar_t>, int>::value;
TV_ASSERT_RT_ERR(val, "error");
break;
}
case at::ScalarType::Half: {
auto val = std::is_same<std::remove_const_t<scalar_t>, at::Half>::value;
TV_ASSERT_RT_ERR(val, "error");
break;
}
case at::ScalarType::Long: {
auto val = std::is_same<std::remove_const_t<scalar_t>, long>::value;
TV_ASSERT_RT_ERR(val, "error");
break;
}
default:
TV_ASSERT_RT_ERR(false, "error");
}
}
template <typename scalar_t>
tv::TensorView<scalar_t> torch2tv(const torch::Tensor &tensor) {
check_torch_dtype<scalar_t>(tensor);
tv::Shape shape;
for (auto i : tensor.sizes()) {
shape.push_back(i);
}
return tv::TensorView<scalar_t>(
tensor.data_ptr<std::remove_const_t<scalar_t>>(), shape);
}
} // namespace tv
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import numpy as np
import torch
from torch.nn import init
from torch.nn.parameter import Parameter
from ..cnn import CONV_LAYERS
from . import sparse_functional as Fsp
from . import sparse_ops as ops
from .sparse_modules import SparseModule
from .sparse_structure import SparseConvTensor
def _calculate_fan_in_and_fan_out_hwio(tensor):
dimensions = tensor.ndimension()
if dimensions < 2:
        raise ValueError('fan in and fan out can not be computed for tensor '
                         'with fewer than 2 dimensions')
if dimensions == 2: # Linear
fan_in = tensor.size(-2)
fan_out = tensor.size(-1)
else:
num_input_fmaps = tensor.size(-2)
num_output_fmaps = tensor.size(-1)
receptive_field_size = 1
if tensor.dim() > 2:
receptive_field_size = tensor[..., 0, 0].numel()
fan_in = num_input_fmaps * receptive_field_size
fan_out = num_output_fmaps * receptive_field_size
return fan_in, fan_out
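# Worked example (illustrative, not part of the original file): for a
# SubMConv3d weight of shape (3, 3, 3, 4, 16), receptive_field_size is
# 3 * 3 * 3 = 27, so fan_in = 4 * 27 = 108 and fan_out = 16 * 27 = 432.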
class SparseConvolution(SparseModule):
def __init__(self,
ndim,
in_channels,
out_channels,
kernel_size=3,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
subm=False,
output_padding=0,
transposed=False,
inverse=False,
indice_key=None,
fused_bn=False):
super(SparseConvolution, self).__init__()
assert groups == 1
if not isinstance(kernel_size, (list, tuple)):
kernel_size = [kernel_size] * ndim
if not isinstance(stride, (list, tuple)):
stride = [stride] * ndim
if not isinstance(padding, (list, tuple)):
padding = [padding] * ndim
if not isinstance(dilation, (list, tuple)):
dilation = [dilation] * ndim
if not isinstance(output_padding, (list, tuple)):
output_padding = [output_padding] * ndim
for d, s in zip(dilation, stride):
            assert any([s == 1, d == 1]), \
                'stride > 1 and dilation > 1 are not supported together'
self.ndim = ndim
self.in_channels = in_channels
self.out_channels = out_channels
self.kernel_size = kernel_size
self.conv1x1 = np.prod(kernel_size) == 1
self.stride = stride
self.padding = padding
self.dilation = dilation
self.transposed = transposed
self.inverse = inverse
self.output_padding = output_padding
self.groups = groups
self.subm = subm
self.indice_key = indice_key
self.fused_bn = fused_bn
self.weight = Parameter(
torch.Tensor(*kernel_size, in_channels, out_channels))
if bias:
self.bias = Parameter(torch.Tensor(out_channels))
else:
self.register_parameter('bias', None)
self.reset_parameters()
def reset_parameters(self):
init.kaiming_uniform_(self.weight, a=math.sqrt(5))
if self.bias is not None:
fan_in, _ = _calculate_fan_in_and_fan_out_hwio(self.weight)
bound = 1 / math.sqrt(fan_in)
init.uniform_(self.bias, -bound, bound)
def forward(self, input):
assert isinstance(input, SparseConvTensor)
features = input.features
device = features.device
indices = input.indices
spatial_shape = input.spatial_shape
batch_size = input.batch_size
if not self.subm:
if self.transposed:
out_spatial_shape = ops.get_deconv_output_size(
spatial_shape, self.kernel_size, self.stride, self.padding,
self.dilation, self.output_padding)
else:
out_spatial_shape = ops.get_conv_output_size(
spatial_shape, self.kernel_size, self.stride, self.padding,
self.dilation)
else:
out_spatial_shape = spatial_shape
if self.conv1x1:
features = torch.mm(
input.features,
self.weight.view(self.in_channels, self.out_channels))
if self.bias is not None:
features += self.bias
out_tensor = SparseConvTensor(features, input.indices,
input.spatial_shape,
input.batch_size)
out_tensor.indice_dict = input.indice_dict
out_tensor.grid = input.grid
return out_tensor
data = input.find_indice_pair(self.indice_key)
if self.inverse:
assert data is not None and self.indice_key is not None
_, outids, indice_pairs, indice_pair_num, out_spatial_shape = data
assert indice_pairs.shape[0] == np.prod(
self.kernel_size
            ), 'inverse conv must have the same kernel size as its paired conv'
else:
if self.indice_key is not None and data is not None:
outids, _, indice_pairs, indice_pair_num, _ = data
else:
outids, indice_pairs, indice_pair_num = ops.get_indice_pairs(
indices,
batch_size,
spatial_shape,
self.kernel_size,
self.stride,
self.padding,
self.dilation,
self.output_padding,
self.subm,
self.transposed,
grid=input.grid)
input.indice_dict[self.indice_key] = (outids, indices,
indice_pairs,
indice_pair_num,
spatial_shape)
if self.fused_bn:
assert self.bias is not None
out_features = ops.fused_indice_conv(features, self.weight,
self.bias,
indice_pairs.to(device),
indice_pair_num,
outids.shape[0], self.inverse,
self.subm)
else:
if self.subm:
out_features = Fsp.indice_subm_conv(features, self.weight,
indice_pairs.to(device),
indice_pair_num,
outids.shape[0])
else:
if self.inverse:
out_features = Fsp.indice_inverse_conv(
features, self.weight, indice_pairs.to(device),
indice_pair_num, outids.shape[0])
else:
out_features = Fsp.indice_conv(features, self.weight,
indice_pairs.to(device),
indice_pair_num,
outids.shape[0])
if self.bias is not None:
out_features += self.bias
out_tensor = SparseConvTensor(out_features, outids, out_spatial_shape,
batch_size)
out_tensor.indice_dict = input.indice_dict
out_tensor.grid = input.grid
return out_tensor
@CONV_LAYERS.register_module()
class SparseConv2d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
indice_key=None):
super(SparseConv2d, self).__init__(
2,
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
indice_key=indice_key)
@CONV_LAYERS.register_module()
class SparseConv3d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
indice_key=None):
super(SparseConv3d, self).__init__(
3,
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
indice_key=indice_key)
@CONV_LAYERS.register_module()
class SparseConv4d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
indice_key=None):
super(SparseConv4d, self).__init__(
4,
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
indice_key=indice_key)
@CONV_LAYERS.register_module()
class SparseConvTranspose2d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
indice_key=None):
super(SparseConvTranspose2d, self).__init__(
2,
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
transposed=True,
indice_key=indice_key)
@CONV_LAYERS.register_module()
class SparseConvTranspose3d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
indice_key=None):
super(SparseConvTranspose3d, self).__init__(
3,
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
transposed=True,
indice_key=indice_key)
@CONV_LAYERS.register_module()
class SparseInverseConv2d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
indice_key=None,
bias=True):
super(SparseInverseConv2d, self).__init__(
2,
in_channels,
out_channels,
kernel_size,
bias=bias,
inverse=True,
indice_key=indice_key)
@CONV_LAYERS.register_module()
class SparseInverseConv3d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
indice_key=None,
bias=True):
super(SparseInverseConv3d, self).__init__(
3,
in_channels,
out_channels,
kernel_size,
bias=bias,
inverse=True,
indice_key=indice_key)
@CONV_LAYERS.register_module()
class SubMConv2d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
indice_key=None):
super(SubMConv2d, self).__init__(
2,
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
True,
indice_key=indice_key)
@CONV_LAYERS.register_module()
class SubMConv3d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
indice_key=None):
super(SubMConv3d, self).__init__(
3,
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
True,
indice_key=indice_key)
@CONV_LAYERS.register_module()
class SubMConv4d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
indice_key=None):
super(SubMConv4d, self).__init__(
4,
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
True,
indice_key=indice_key)
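# A minimal usage sketch (illustrative only, not part of the original file).
# It assumes a CUDA build of the spconv extension and uses the
# (batch_idx, z, y, x) coordinate convention expected by SparseConvTensor.
def _subm_conv3d_sketch():
    coords = torch.tensor([[0, 0, 0, 0], [0, 1, 2, 3], [0, 4, 5, 6]],
                          dtype=torch.int32, device='cuda')  # (N, 1 + ndim)
    features = torch.randn(3, 4, device='cuda')              # (N, in_channels)
    x = SparseConvTensor(features, coords, spatial_shape=[10, 10, 10],
                         batch_size=1)
    conv = SubMConv3d(4, 16, kernel_size=3, indice_key='subm1').cuda()
    return conv(x)  # SparseConvTensor with 16-channel features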
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from torch.autograd import Function
from . import sparse_ops as ops
class SparseConvFunction(Function):
"""Sparse Convolution.
Please refer to `SECOND <https://www.mdpi.com/1424-8220/18/10/3337>`_ for
more details.
"""
@staticmethod
def forward(ctx, features, filters, indice_pairs, indice_pair_num,
num_activate_out):
"""
Args:
            features (torch.Tensor): Features to be convolved.
            filters (torch.nn.parameter.Parameter): Convolution filters.
            indice_pairs (torch.Tensor): Indice pairs between input locations
                and output locations.
            indice_pair_num (torch.Tensor): Number of valid indice pairs for
                each kernel offset.
            num_activate_out (int): Number of active output features.
Returns:
torch.Tensor: Output features from gather-gemm-scatter.
"""
ctx.save_for_backward(indice_pairs, indice_pair_num, features, filters)
return ops.indice_conv(features, filters, indice_pairs,
indice_pair_num, num_activate_out, False)
@staticmethod
def backward(ctx, grad_output):
indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors
input_bp, filters_bp = ops.indice_conv_backward(
features, filters, grad_output, indice_pairs, indice_pair_num,
False)
return input_bp, filters_bp, None, None, None
class SparseInverseConvFunction(Function):
@staticmethod
def forward(ctx, features, filters, indice_pairs, indice_pair_num,
num_activate_out):
"""
Args:
            features (torch.Tensor): Features to be convolved.
            filters (torch.nn.parameter.Parameter): Convolution filters.
            indice_pairs (torch.Tensor): Indice pairs between input locations
                and output locations.
            indice_pair_num (torch.Tensor): Number of valid indice pairs for
                each kernel offset.
            num_activate_out (int): Number of active output features.
Returns:
torch.Tensor: Output features from gather-gemm-scatter.
"""
ctx.save_for_backward(indice_pairs, indice_pair_num, features, filters)
return ops.indice_conv(features, filters, indice_pairs,
indice_pair_num, num_activate_out, True, False)
@staticmethod
def backward(ctx, grad_output):
indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors
input_bp, filters_bp = ops.indice_conv_backward(
features, filters, grad_output, indice_pairs, indice_pair_num,
True, False)
return input_bp, filters_bp, None, None, None
class SubMConvFunction(Function):
@staticmethod
def forward(ctx, features, filters, indice_pairs, indice_pair_num,
num_activate_out):
"""
Args:
            features (torch.Tensor): Features to be convolved.
            filters (torch.nn.parameter.Parameter): Convolution filters.
            indice_pairs (torch.Tensor): Indice pairs between input locations
                and output locations.
            indice_pair_num (torch.Tensor): Number of valid indice pairs for
                each kernel offset.
            num_activate_out (int): Number of active output features.
Returns:
torch.Tensor: Output features from gather-gemm-scatter.
"""
ctx.save_for_backward(indice_pairs, indice_pair_num, features, filters)
return ops.indice_conv(features, filters, indice_pairs,
indice_pair_num, num_activate_out, False, True)
@staticmethod
def backward(ctx, grad_output):
indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors
input_bp, filters_bp = ops.indice_conv_backward(
features, filters, grad_output, indice_pairs, indice_pair_num,
False, True)
return input_bp, filters_bp, None, None, None
class SparseMaxPoolFunction(Function):
@staticmethod
def forward(ctx, features, indice_pairs, indice_pair_num,
num_activate_out):
"""
Args:
            features (torch.Tensor): Features to be pooled.
            indice_pairs (torch.Tensor): Indice pairs between input locations
                and output locations.
            indice_pair_num (torch.Tensor): Number of valid indice pairs for
                each kernel offset.
            num_activate_out (int): Number of active output features.
Returns:
torch.Tensor: Output features from sparse maxpooling.
"""
out = ops.indice_maxpool(features, indice_pairs, indice_pair_num,
num_activate_out)
ctx.save_for_backward(indice_pairs, indice_pair_num, features, out)
return out
@staticmethod
def backward(ctx, grad_output):
indice_pairs, indice_pair_num, features, out = ctx.saved_tensors
input_bp = ops.indice_maxpool_backward(features, out, grad_output,
indice_pairs, indice_pair_num)
return input_bp, None, None, None
indice_conv = SparseConvFunction.apply
indice_inverse_conv = SparseInverseConvFunction.apply
indice_subm_conv = SubMConvFunction.apply
indice_maxpool = SparseMaxPoolFunction.apply
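# A minimal autograd sketch (illustrative only, not part of the original file).
# It assumes `indice_pairs` / `indice_pair_num` were precomputed with
# sparse_ops.get_indice_pairs and that the compiled _ext ops are available.
def _indice_conv_autograd_sketch(features, filters, indice_pairs,
                                 indice_pair_num, num_activate_out):
    # features: (num_act_in, C_in); filters: (*kernel_size, C_in, C_out).
    features = features.detach().requires_grad_()
    out = indice_conv(features, filters, indice_pairs, indice_pair_num,
                      num_activate_out)
    # Gradients flow back through SparseConvFunction.backward, i.e. the
    # gather-GEMM-scatter of ops.indice_conv_backward.
    out.sum().backward()
    return features.grad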
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
from collections import OrderedDict
import torch
from torch import nn
from .sparse_structure import SparseConvTensor
def is_spconv_module(module):
spconv_modules = (SparseModule, )
return isinstance(module, spconv_modules)
def is_sparse_conv(module):
from .sparse_conv import SparseConvolution
return isinstance(module, SparseConvolution)
def _mean_update(vals, m_vals, t):
outputs = []
if not isinstance(vals, list):
vals = [vals]
if not isinstance(m_vals, list):
m_vals = [m_vals]
for val, m_val in zip(vals, m_vals):
output = t / float(t + 1) * m_val + 1 / float(t + 1) * val
outputs.append(output)
if len(outputs) == 1:
outputs = outputs[0]
return outputs
class SparseModule(nn.Module):
"""place holder, All module subclass from this will take sptensor in
SparseSequential."""
pass
class SparseSequential(SparseModule):
r"""A sequential container.
Modules will be added to it in the order they are passed in the
constructor.
Alternatively, an ordered dict of modules can also be passed in.
    To make it easier to understand, here is a small example::
Example:
>>> # using Sequential:
>>> from mmcv.ops import SparseSequential
>>> model = SparseSequential(
SparseConv2d(1,20,5),
nn.ReLU(),
SparseConv2d(20,64,5),
nn.ReLU()
)
>>> # using Sequential with OrderedDict
>>> model = SparseSequential(OrderedDict([
('conv1', SparseConv2d(1,20,5)),
('relu1', nn.ReLU()),
('conv2', SparseConv2d(20,64,5)),
('relu2', nn.ReLU())
]))
>>> # using Sequential with kwargs(python 3.6+)
>>> model = SparseSequential(
conv1=SparseConv2d(1,20,5),
relu1=nn.ReLU(),
conv2=SparseConv2d(20,64,5),
relu2=nn.ReLU()
)
"""
def __init__(self, *args, **kwargs):
super(SparseSequential, self).__init__()
if len(args) == 1 and isinstance(args[0], OrderedDict):
for key, module in args[0].items():
self.add_module(key, module)
else:
for idx, module in enumerate(args):
self.add_module(str(idx), module)
for name, module in kwargs.items():
if sys.version_info < (3, 6):
raise ValueError('kwargs only supported in py36+')
if name in self._modules:
raise ValueError('name exists.')
self.add_module(name, module)
self._sparity_dict = {}
def __getitem__(self, idx):
if not (-len(self) <= idx < len(self)):
raise IndexError('index {} is out of range'.format(idx))
if idx < 0:
idx += len(self)
it = iter(self._modules.values())
for i in range(idx):
next(it)
return next(it)
def __len__(self):
return len(self._modules)
@property
def sparity_dict(self):
return self._sparity_dict
def add(self, module, name=None):
if name is None:
name = str(len(self._modules))
if name in self._modules:
raise KeyError('name exists')
self.add_module(name, module)
def forward(self, input):
for k, module in self._modules.items():
if is_spconv_module(module):
assert isinstance(input, SparseConvTensor)
self._sparity_dict[k] = input.sparity
input = module(input)
else:
if isinstance(input, SparseConvTensor):
if input.indices.shape[0] != 0:
input.features = module(input.features)
else:
input = module(input)
return input
def fused(self):
from .sparse_conv import SparseConvolution
mods = [v for k, v in self._modules.items()]
fused_mods = []
idx = 0
while idx < len(mods):
if is_sparse_conv(mods[idx]):
if idx < len(mods) - 1 and isinstance(mods[idx + 1],
nn.BatchNorm1d):
new_module = SparseConvolution(
ndim=mods[idx].ndim,
in_channels=mods[idx].in_channels,
out_channels=mods[idx].out_channels,
kernel_size=mods[idx].kernel_size,
stride=mods[idx].stride,
padding=mods[idx].padding,
dilation=mods[idx].dilation,
groups=mods[idx].groups,
bias=True,
subm=mods[idx].subm,
output_padding=mods[idx].output_padding,
transposed=mods[idx].transposed,
inverse=mods[idx].inverse,
indice_key=mods[idx].indice_key,
fused_bn=True,
)
new_module.load_state_dict(mods[idx].state_dict(), False)
new_module.to(mods[idx].weight.device)
conv = new_module
bn = mods[idx + 1]
conv.bias.data.zero_()
                # Standard BN folding: w' = w * gamma / sqrt(var + eps),
                # b' = (b - mean) * gamma / sqrt(var + eps) + beta.
                conv.weight.data[:] = conv.weight.data * bn.weight.data / (
                    torch.sqrt(bn.running_var + bn.eps))
                conv.bias.data[:] = (
                    conv.bias.data - bn.running_mean) * bn.weight.data / (
                        torch.sqrt(bn.running_var + bn.eps)) + bn.bias.data
fused_mods.append(conv)
idx += 2
else:
fused_mods.append(mods[idx])
idx += 1
else:
fused_mods.append(mods[idx])
idx += 1
return SparseSequential(*fused_mods)
class ToDense(SparseModule):
"""convert SparseConvTensor to NCHW dense tensor."""
def forward(self, x: SparseConvTensor):
return x.dense()
class RemoveGrid(SparseModule):
"""remove pre-allocated grid buffer."""
def forward(self, x: SparseConvTensor):
x.grid = None
return x
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext', [
'get_indice_pairs_2d_forward', 'get_indice_pairs_3d_forward',
'get_indice_pairs_4d_forward', 'get_indice_pairs_2d_backward',
'get_indice_pairs_3d_backward', 'indice_conv_forward',
'indice_conv_backward', 'fused_indice_conv_forward',
'indice_maxpool_forward', 'indice_maxpool_backward'
])
def get_conv_output_size(input_size, kernel_size, stride, padding, dilation):
ndim = len(input_size)
output_size = []
for i in range(ndim):
size = (input_size[i] + 2 * padding[i] - dilation[i] *
(kernel_size[i] - 1) - 1) // stride[i] + 1
if kernel_size[i] == -1:
output_size.append(1)
else:
output_size.append(size)
return output_size
def get_deconv_output_size(input_size, kernel_size, stride, padding, dilation,
output_padding):
ndim = len(input_size)
output_size = []
for i in range(ndim):
if kernel_size[i] == -1:
raise ValueError("deconv don't support kernel_size < 0")
size = (input_size[i] - 1) * stride[i] - 2 * padding[i] + kernel_size[
i] + output_padding[i]
output_size.append(size)
return output_size
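# Worked example (illustrative, not part of the original file): for a 3D conv
# with kernel 3, stride 2, padding 1, dilation 1 on spatial_shape
# [41, 1600, 1408], the formula above gives (41 + 2 - 2 - 1) // 2 + 1 = 21,
# (1600 + 2 - 2 - 1) // 2 + 1 = 800 and (1408 + 2 - 2 - 1) // 2 + 1 = 704,
# i.e. get_conv_output_size([41, 1600, 1408], [3] * 3, [2] * 3, [1] * 3,
# [1] * 3) == [21, 800, 704].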
def get_indice_pairs(indices,
batch_size,
spatial_shape,
ksize=3,
stride=1,
padding=0,
dilation=1,
out_padding=0,
subm=False,
transpose=False,
grid=None):
ndim = indices.shape[1] - 1
if not isinstance(ksize, (list, tuple)):
ksize = [ksize] * ndim
if not isinstance(stride, (list, tuple)):
stride = [stride] * ndim
if not isinstance(padding, (list, tuple)):
padding = [padding] * ndim
if not isinstance(dilation, (list, tuple)):
dilation = [dilation] * ndim
if not isinstance(out_padding, (list, tuple)):
out_padding = [out_padding] * ndim
for d, s in zip(dilation, stride):
        assert any([s == 1, d == 1]), \
            'stride > 1 and dilation > 1 are not supported together'
if not subm:
if transpose:
out_shape = get_deconv_output_size(spatial_shape, ksize, stride,
padding, dilation, out_padding)
else:
out_shape = get_conv_output_size(spatial_shape, ksize, stride,
padding, dilation)
else:
out_shape = spatial_shape
if grid is None:
if ndim == 2:
get_indice_pairs_func = ext_module.get_indice_pairs_2d_forward
elif ndim == 3:
get_indice_pairs_func = ext_module.get_indice_pairs_3d_forward
elif ndim == 4:
get_indice_pairs_func = ext_module.get_indice_pairs_4d_forward
else:
raise NotImplementedError
return get_indice_pairs_func(indices, batch_size, out_shape,
spatial_shape, ksize, stride, padding,
dilation, out_padding, int(subm),
int(transpose))
else:
if ndim == 2:
get_indice_pairs_func = ext_module.get_indice_pairs_2d_backward
elif ndim == 3:
get_indice_pairs_func = ext_module.get_indice_pairs_3d_backward
else:
raise NotImplementedError
return get_indice_pairs_func(indices, grid, batch_size, out_shape,
spatial_shape, ksize, stride, padding,
dilation, out_padding, int(subm),
int(transpose))
def indice_conv(features,
filters,
indice_pairs,
indice_pair_num,
num_activate_out,
inverse=False,
subm=False):
if filters.dtype == torch.float32 or filters.dtype == torch.half:
return ext_module.indice_conv_forward(features, filters, indice_pairs,
indice_pair_num,
num_activate_out, int(inverse),
int(subm))
else:
raise NotImplementedError
def fused_indice_conv(features, filters, bias, indice_pairs, indice_pair_num,
num_activate_out, inverse, subm):
    if features.dtype == torch.half or filters.dtype == torch.float32:
func = ext_module.fused_indice_conv_forward
else:
raise NotImplementedError
return func(features, filters, bias, indice_pairs, indice_pair_num,
num_activate_out, int(inverse), int(subm))
def indice_conv_backward(features,
filters,
out_bp,
indice_pairs,
indice_pair_num,
inverse=False,
subm=False):
if filters.dtype == torch.float32 or filters.dtype == torch.half:
return ext_module.indice_conv_backward(features, filters, out_bp,
indice_pairs, indice_pair_num,
int(inverse), int(subm))
else:
raise NotImplementedError
def indice_maxpool(features, indice_pairs, indice_pair_num, num_activate_out):
if features.dtype == torch.float32 or features.dtype == torch.half:
return ext_module.indice_maxpool_forward(features, indice_pairs,
indice_pair_num,
num_activate_out)
else:
raise NotImplementedError
def indice_maxpool_backward(features, out_features, out_bp, indice_pairs,
indice_pair_num):
if features.dtype == torch.float32 or features.dtype == torch.half:
return ext_module.indice_maxpool_backward(features, out_features,
out_bp, indice_pairs,
indice_pair_num)
else:
raise NotImplementedError
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# import sparse_functional as Fsp
# import sparse_ops as ops
from .sparse_functional import indice_maxpool
from .sparse_modules import SparseModule
from .sparse_ops import get_conv_output_size, get_indice_pairs
from .sparse_structure import SparseConvTensor
class SparseMaxPool(SparseModule):
def __init__(self,
ndim,
kernel_size,
stride=1,
padding=0,
dilation=1,
subm=False):
super(SparseMaxPool, self).__init__()
if not isinstance(kernel_size, (list, tuple)):
kernel_size = [kernel_size] * ndim
if not isinstance(stride, (list, tuple)):
stride = [stride] * ndim
if not isinstance(padding, (list, tuple)):
padding = [padding] * ndim
if not isinstance(dilation, (list, tuple)):
dilation = [dilation] * ndim
self.ndim = ndim
self.kernel_size = kernel_size
self.stride = stride
self.padding = padding
self.subm = subm
self.dilation = dilation
def forward(self, input):
assert isinstance(input, SparseConvTensor)
features = input.features
device = features.device
indices = input.indices
spatial_shape = input.spatial_shape
batch_size = input.batch_size
if not self.subm:
out_spatial_shape = get_conv_output_size(spatial_shape,
self.kernel_size,
self.stride, self.padding,
self.dilation)
else:
out_spatial_shape = spatial_shape
outids, indice_pairs, indice_pairs_num = get_indice_pairs(
indices, batch_size, spatial_shape, self.kernel_size, self.stride,
self.padding, self.dilation, 0, self.subm)
out_features = indice_maxpool(features, indice_pairs.to(device),
indice_pairs_num.to(device),
outids.shape[0])
out_tensor = SparseConvTensor(out_features, outids, out_spatial_shape,
batch_size)
out_tensor.indice_dict = input.indice_dict
out_tensor.grid = input.grid
return out_tensor
class SparseMaxPool2d(SparseMaxPool):
def __init__(self, kernel_size, stride=1, padding=0, dilation=1):
super(SparseMaxPool2d, self).__init__(2, kernel_size, stride, padding,
dilation)
class SparseMaxPool3d(SparseMaxPool):
def __init__(self, kernel_size, stride=1, padding=0, dilation=1):
super(SparseMaxPool3d, self).__init__(3, kernel_size, stride, padding,
dilation)
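# A minimal usage sketch (illustrative only, not part of the original file);
# it assumes `x` is a SparseConvTensor with three spatial dims and that the
# compiled _ext ops are available.
def _sparse_maxpool3d_sketch(x):
    # Kernel 2 / stride 2 halves the spatial shape; features mapping to the
    # same output location are max-reduced by the indice_maxpool op.
    pool = SparseMaxPool3d(kernel_size=2, stride=2)
    return pool(x)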
import numpy as np
import torch
def scatter_nd(indices, updates, shape):
"""pytorch edition of tensorflow scatter_nd.
this function don't contain except handle code. so use this carefully when
indice repeats, don't support repeat add which is supported in tensorflow.
"""
ret = torch.zeros(*shape, dtype=updates.dtype, device=updates.device)
ndim = indices.shape[-1]
output_shape = list(indices.shape[:-1]) + shape[indices.shape[-1]:]
flatted_indices = indices.view(-1, ndim)
slices = [flatted_indices[:, i] for i in range(ndim)]
slices += [Ellipsis]
ret[slices] = updates.view(*output_shape)
return ret
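# A small self-contained example (illustrative only, not part of the original
# file): scatter two feature rows into a 4 x 4 x 2 dense grid.
def _scatter_nd_example():
    indices = torch.tensor([[0, 1], [2, 3]])      # (N, ndim)
    updates = torch.tensor([[1., 2.], [3., 4.]])  # (N, C)
    dense = scatter_nd(indices, updates, [4, 4, 2])
    assert dense[0, 1].tolist() == [1., 2.]
    return dense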
class SparseConvTensor(object):
def __init__(self,
features,
indices,
spatial_shape,
batch_size,
grid=None):
self.features = features
self.indices = indices
if self.indices.dtype != torch.int32:
            self.indices = self.indices.int()
self.spatial_shape = spatial_shape
self.batch_size = batch_size
self.indice_dict = {}
self.grid = grid
@property
def spatial_size(self):
return np.prod(self.spatial_shape)
def find_indice_pair(self, key):
if key is None:
return None
if key in self.indice_dict:
return self.indice_dict[key]
return None
def dense(self, channels_first=True):
output_shape = [self.batch_size] + list(
self.spatial_shape) + [self.features.shape[1]]
res = scatter_nd(self.indices.long(), self.features, output_shape)
if not channels_first:
return res
ndim = len(self.spatial_shape)
trans_params = list(range(0, ndim + 1))
trans_params.insert(1, ndim + 1)
return res.permute(*trans_params).contiguous()
@property
def sparity(self):
return (self.indices.shape[0] / np.prod(self.spatial_shape) /
self.batch_size)
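# A minimal usage sketch (illustrative only, not part of the original file):
# two active sites in a single-sample batch on a 3 x 4 grid, densified to a
# channels-first (batch, C, 3, 4) tensor.
def _sparse_conv_tensor_example():
    features = torch.tensor([[1.], [2.]])                    # (N, C)
    indices = torch.tensor([[0, 1, 2], [0, 2, 3]],
                           dtype=torch.int32)                # (N, 1 + ndim)
    x = SparseConvTensor(features, indices, spatial_shape=[3, 4], batch_size=1)
    dense = x.dense()
    assert dense.shape == (1, 1, 3, 4)
    return dense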