Release v1.0.0rc1

333536f6 · Wenwei Zhang · GitHub · 9c7270d0 · f747daab · 333536f6
Unverified Commit 333536f6 authored Apr 06, 2022 by Wenwei Zhang Committed by GitHub Apr 06, 2022
20 changed files
--- a/mmdet3d/ops/sparse_block.py
+++ b/mmdet3d/ops/sparse_block.py
 # Copyright (c) OpenMMLab. All rights reserved.
 from mmcv.cnn import build_conv_layer, build_norm_layer
+from mmcv.ops import SparseModule, SparseSequential
 from torch import nn

-from mmdet3d.ops import spconv
 from mmdet.models.backbones.resnet import BasicBlock, Bottleneck


-class SparseBottleneck(Bottleneck, spconv.SparseModule):
+class SparseBottleneck(Bottleneck, SparseModule):
    """Sparse bottleneck block for PartA^2.

    Bottleneck block implemented with submanifold sparse convolution.
@@ -32,7 +32,7 @@ class SparseBottleneck(Bottleneck, spconv.SparseModule):
                 conv_cfg=None,
                 norm_cfg=None):

-        spconv.SparseModule.__init__(self)
+        SparseModule.__init__(self)
        Bottleneck.__init__(
            self,
            inplanes,
@@ -65,7 +65,7 @@ class SparseBottleneck(Bottleneck, spconv.SparseModule):
        return out


-class SparseBasicBlock(BasicBlock, spconv.SparseModule):
+class SparseBasicBlock(BasicBlock, SparseModule):
    """Sparse basic block for PartA^2.

    Sparse basic block implemented with submanifold sparse convolution.
@@ -90,7 +90,7 @@ class SparseBasicBlock(BasicBlock, spconv.SparseModule):
                 downsample=None,
                 conv_cfg=None,
                 norm_cfg=None):
-        spconv.SparseModule.__init__(self)
+        SparseModule.__init__(self)
        BasicBlock.__init__(
            self,
            inplanes,
@@ -182,5 +182,5 @@ def make_sparse_convmodule(in_channels,
        elif layer == 'act':
            layers.append(nn.ReLU(inplace=True))

-    layers = spconv.SparseSequential(*layers)
+    layers = SparseSequential(*layers)
    return layers
--- a/mmdet3d/ops/spconv/__init__.py
+++ b/mmdet3d/ops/spconv/__init__.py
-# Copyright 2019 Yan Yan
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from .conv import (SparseConv2d, SparseConv3d, SparseConvTranspose2d,
-                   SparseConvTranspose3d, SparseInverseConv2d,
-                   SparseInverseConv3d, SubMConv2d, SubMConv3d)
-from .modules import SparseModule, SparseSequential
-from .pool import SparseMaxPool2d, SparseMaxPool3d
-from .structure import SparseConvTensor, scatter_nd
-
-__all__ = [
-    'SparseConv2d',
-    'SparseConv3d',
-    'SubMConv2d',
-    'SubMConv3d',
-    'SparseConvTranspose2d',
-    'SparseConvTranspose3d',
-    'SparseInverseConv2d',
-    'SparseInverseConv3d',
-    'SparseModule',
-    'SparseSequential',
-    'SparseMaxPool2d',
-    'SparseMaxPool3d',
-    'SparseConvTensor',
-    'scatter_nd',
-]
--- a/mmdet3d/ops/spconv/conv.py
+++ b/mmdet3d/ops/spconv/conv.py
-# Copyright 2019 Yan Yan
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import math
-
-import numpy as np
-import torch
-from mmcv.cnn import CONV_LAYERS
-from torch.nn import init
-from torch.nn.parameter import Parameter
-
-from . import functional as Fsp
-from . import ops
-from .modules import SparseModule
-from .structure import SparseConvTensor
-
-
-def _calculate_fan_in_and_fan_out_hwio(tensor):
-    dimensions = tensor.ndimension()
-    if dimensions < 2:
-        raise ValueError('fan in and fan out can not be computed for tensor'
-                         'with fewer than 2 dimensions')
-
-    if dimensions == 2:  # Linear
-        fan_in = tensor.size(-2)
-        fan_out = tensor.size(-1)
-    else:
-        num_input_fmaps = tensor.size(-2)
-        num_output_fmaps = tensor.size(-1)
-        receptive_field_size = 1
-        if tensor.dim() > 2:
-            receptive_field_size = tensor[..., 0, 0].numel()
-        fan_in = num_input_fmaps * receptive_field_size
-        fan_out = num_output_fmaps * receptive_field_size
-
-    return fan_in, fan_out
-
-
-class SparseConvolution(SparseModule):
-
-    def __init__(self,
-                 ndim,
-                 in_channels,
-                 out_channels,
-                 kernel_size=3,
-                 stride=1,
-                 padding=0,
-                 dilation=1,
-                 groups=1,
-                 bias=True,
-                 subm=False,
-                 output_padding=0,
-                 transposed=False,
-                 inverse=False,
-                 indice_key=None,
-                 fused_bn=False):
-        super(SparseConvolution, self).__init__()
-        assert groups == 1
-        if not isinstance(kernel_size, (list, tuple)):
-            kernel_size = [kernel_size] * ndim
-        if not isinstance(stride, (list, tuple)):
-            stride = [stride] * ndim
-        if not isinstance(padding, (list, tuple)):
-            padding = [padding] * ndim
-        if not isinstance(dilation, (list, tuple)):
-            dilation = [dilation] * ndim
-        if not isinstance(output_padding, (list, tuple)):
-            output_padding = [output_padding] * ndim
-
-        for d, s in zip(dilation, stride):
-            assert any([s == 1, d == 1]), "don't support this."
-
-        self.ndim = ndim
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-        self.kernel_size = kernel_size
-        self.conv1x1 = np.prod(kernel_size) == 1
-        self.stride = stride
-        self.padding = padding
-        self.dilation = dilation
-        self.transposed = transposed
-        self.inverse = inverse
-        self.output_padding = output_padding
-        self.groups = groups
-        self.subm = subm
-        self.indice_key = indice_key
-        self.fused_bn = fused_bn
-
-        self.weight = Parameter(
-            torch.Tensor(*kernel_size, in_channels, out_channels))
-        if bias:
-            self.bias = Parameter(torch.Tensor(out_channels))
-        else:
-            self.register_parameter('bias', None)
-        self.reset_parameters()
-
-    def reset_parameters(self):
-        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
-        if self.bias is not None:
-            fan_in, _ = _calculate_fan_in_and_fan_out_hwio(self.weight)
-            bound = 1 / math.sqrt(fan_in)
-            init.uniform_(self.bias, -bound, bound)
-
-    def forward(self, input):
-        assert isinstance(input, SparseConvTensor)
-        features = input.features
-        device = features.device
-        indices = input.indices
-        spatial_shape = input.spatial_shape
-        batch_size = input.batch_size
-        if not self.subm:
-            if self.transposed:
-                out_spatial_shape = ops.get_deconv_output_size(
-                    spatial_shape, self.kernel_size, self.stride, self.padding,
-                    self.dilation, self.output_padding)
-            else:
-                out_spatial_shape = ops.get_conv_output_size(
-                    spatial_shape, self.kernel_size, self.stride, self.padding,
-                    self.dilation)
-
-        else:
-            out_spatial_shape = spatial_shape
-        # input.update_grid(out_spatial_shape)
-        # t = time.time()
-        if self.conv1x1:
-            features = torch.mm(
-                input.features,
-                self.weight.view(self.in_channels, self.out_channels))
-            if self.bias is not None:
-                features += self.bias
-            out_tensor = SparseConvTensor(features, input.indices,
-                                          input.spatial_shape,
-                                          input.batch_size)
-            out_tensor.indice_dict = input.indice_dict
-            out_tensor.grid = input.grid
-            return out_tensor
-        data = input.find_indice_pair(self.indice_key)
-        if self.inverse:
-            assert data is not None and self.indice_key is not None
-            _, outids, indice_pairs, indice_pair_num, out_spatial_shape = data
-            assert indice_pairs.shape[0] == np.prod(
-                self.kernel_size
-            ), 'inverse conv must have same kernel size as its couple conv'
-        else:
-            if self.indice_key is not None and data is not None:
-                outids, _, indice_pairs, indice_pair_num, _ = data
-            else:
-                outids, indice_pairs, indice_pair_num = ops.get_indice_pairs(
-                    indices,
-                    batch_size,
-                    spatial_shape,
-                    self.kernel_size,
-                    self.stride,
-                    self.padding,
-                    self.dilation,
-                    self.output_padding,
-                    self.subm,
-                    self.transposed,
-                    grid=input.grid)
-                input.indice_dict[self.indice_key] = (outids, indices,
-                                                      indice_pairs,
-                                                      indice_pair_num,
-                                                      spatial_shape)
-        if self.fused_bn:
-            assert self.bias is not None
-            out_features = ops.fused_indice_conv(features, self.weight,
-                                                 self.bias,
-                                                 indice_pairs.to(device),
-                                                 indice_pair_num,
-                                                 outids.shape[0], self.inverse,
-                                                 self.subm)
-        else:
-            if self.subm:
-                out_features = Fsp.indice_subm_conv(features, self.weight,
-                                                    indice_pairs.to(device),
-                                                    indice_pair_num,
-                                                    outids.shape[0])
-            else:
-                if self.inverse:
-                    out_features = Fsp.indice_inverse_conv(
-                        features, self.weight, indice_pairs.to(device),
-                        indice_pair_num, outids.shape[0])
-                else:
-                    out_features = Fsp.indice_conv(features, self.weight,
-                                                   indice_pairs.to(device),
-                                                   indice_pair_num,
-                                                   outids.shape[0])
-
-            if self.bias is not None:
-                out_features += self.bias
-        out_tensor = SparseConvTensor(out_features, outids, out_spatial_shape,
-                                      batch_size)
-        out_tensor.indice_dict = input.indice_dict
-        out_tensor.grid = input.grid
-        return out_tensor
-
-
-@CONV_LAYERS.register_module(force=True)
-class SparseConv2d(SparseConvolution):
-
-    def __init__(self,
-                 in_channels,
-                 out_channels,
-                 kernel_size,
-                 stride=1,
-                 padding=0,
-                 dilation=1,
-                 groups=1,
-                 bias=True,
-                 indice_key=None):
-        super(SparseConv2d, self).__init__(
-            2,
-            in_channels,
-            out_channels,
-            kernel_size,
-            stride,
-            padding,
-            dilation,
-            groups,
-            bias,
-            indice_key=indice_key)
-
-
-@CONV_LAYERS.register_module(force=True)
-class SparseConv3d(SparseConvolution):
-
-    def __init__(self,
-                 in_channels,
-                 out_channels,
-                 kernel_size,
-                 stride=1,
-                 padding=0,
-                 dilation=1,
-                 groups=1,
-                 bias=True,
-                 indice_key=None):
-        super(SparseConv3d, self).__init__(
-            3,
-            in_channels,
-            out_channels,
-            kernel_size,
-            stride,
-            padding,
-            dilation,
-            groups,
-            bias,
-            indice_key=indice_key)
-
-
-@CONV_LAYERS.register_module(force=True)
-class SparseConv4d(SparseConvolution):
-
-    def __init__(self,
-                 in_channels,
-                 out_channels,
-                 kernel_size,
-                 stride=1,
-                 padding=0,
-                 dilation=1,
-                 groups=1,
-                 bias=True,
-                 indice_key=None):
-        super(SparseConv4d, self).__init__(
-            4,
-            in_channels,
-            out_channels,
-            kernel_size,
-            stride,
-            padding,
-            dilation,
-            groups,
-            bias,
-            indice_key=indice_key)
-
-
-@CONV_LAYERS.register_module(force=True)
-class SparseConvTranspose2d(SparseConvolution):
-
-    def __init__(self,
-                 in_channels,
-                 out_channels,
-                 kernel_size,
-                 stride=1,
-                 padding=0,
-                 dilation=1,
-                 groups=1,
-                 bias=True,
-                 indice_key=None):
-        super(SparseConvTranspose2d, self).__init__(
-            2,
-            in_channels,
-            out_channels,
-            kernel_size,
-            stride,
-            padding,
-            dilation,
-            groups,
-            bias,
-            transposed=True,
-            indice_key=indice_key)
-
-
-@CONV_LAYERS.register_module(force=True)
-class SparseConvTranspose3d(SparseConvolution):
-
-    def __init__(self,
-                 in_channels,
-                 out_channels,
-                 kernel_size,
-                 stride=1,
-                 padding=0,
-                 dilation=1,
-                 groups=1,
-                 bias=True,
-                 indice_key=None):
-        super(SparseConvTranspose3d, self).__init__(
-            3,
-            in_channels,
-            out_channels,
-            kernel_size,
-            stride,
-            padding,
-            dilation,
-            groups,
-            bias,
-            transposed=True,
-            indice_key=indice_key)
-
-
-@CONV_LAYERS.register_module(force=True)
-class SparseInverseConv2d(SparseConvolution):
-
-    def __init__(self,
-                 in_channels,
-                 out_channels,
-                 kernel_size,
-                 indice_key,
-                 bias=True):
-        super(SparseInverseConv2d, self).__init__(
-            2,
-            in_channels,
-            out_channels,
-            kernel_size,
-            bias=bias,
-            inverse=True,
-            indice_key=indice_key)
-
-
-@CONV_LAYERS.register_module(force=True)
-class SparseInverseConv3d(SparseConvolution):
-
-    def __init__(self,
-                 in_channels,
-                 out_channels,
-                 kernel_size,
-                 indice_key,
-                 bias=True):
-        super(SparseInverseConv3d, self).__init__(
-            3,
-            in_channels,
-            out_channels,
-            kernel_size,
-            bias=bias,
-            inverse=True,
-            indice_key=indice_key)
-
-
-@CONV_LAYERS.register_module(force=True)
-class SubMConv2d(SparseConvolution):
-
-    def __init__(self,
-                 in_channels,
-                 out_channels,
-                 kernel_size,
-                 stride=1,
-                 padding=0,
-                 dilation=1,
-                 groups=1,
-                 bias=True,
-                 indice_key=None):
-        super(SubMConv2d, self).__init__(
-            2,
-            in_channels,
-            out_channels,
-            kernel_size,
-            stride,
-            padding,
-            dilation,
-            groups,
-            bias,
-            True,
-            indice_key=indice_key)
-
-
-@CONV_LAYERS.register_module(force=True)
-class SubMConv3d(SparseConvolution):
-
-    def __init__(self,
-                 in_channels,
-                 out_channels,
-                 kernel_size,
-                 stride=1,
-                 padding=0,
-                 dilation=1,
-                 groups=1,
-                 bias=True,
-                 indice_key=None):
-        super(SubMConv3d, self).__init__(
-            3,
-            in_channels,
-            out_channels,
-            kernel_size,
-            stride,
-            padding,
-            dilation,
-            groups,
-            bias,
-            True,
-            indice_key=indice_key)
-
-
-@CONV_LAYERS.register_module(force=True)
-class SubMConv4d(SparseConvolution):
-
-    def __init__(self,
-                 in_channels,
-                 out_channels,
-                 kernel_size,
-                 stride=1,
-                 padding=0,
-                 dilation=1,
-                 groups=1,
-                 bias=True,
-                 indice_key=None):
-        super(SubMConv4d, self).__init__(
-            4,
-            in_channels,
-            out_channels,
-            kernel_size,
-            stride,
-            padding,
-            dilation,
-            groups,
-            bias,
-            True,
-            indice_key=indice_key)
--- a/mmdet3d/ops/spconv/functional.py
+++ b/mmdet3d/ops/spconv/functional.py
-# Copyright 2019 Yan Yan
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from torch.autograd import Function
-
-from . import ops as ops
-
-
-class SparseConvFunction(Function):
-
-    @staticmethod
-    def forward(ctx, features, filters, indice_pairs, indice_pair_num,
-                num_activate_out):
-        ctx.save_for_backward(indice_pairs, indice_pair_num, features, filters)
-        return ops.indice_conv(features, filters, indice_pairs,
-                               indice_pair_num, num_activate_out, False)
-
-    @staticmethod
-    def backward(ctx, grad_output):
-        indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors
-        input_bp, filters_bp = ops.indice_conv_backward(
-            features, filters, grad_output, indice_pairs, indice_pair_num,
-            False)
-
-        return input_bp, filters_bp, None, None, None
-
-
-class SparseInverseConvFunction(Function):
-
-    @staticmethod
-    def forward(ctx, features, filters, indice_pairs, indice_pair_num,
-                num_activate_out):
-        ctx.save_for_backward(indice_pairs, indice_pair_num, features, filters)
-        return ops.indice_conv(features, filters, indice_pairs,
-                               indice_pair_num, num_activate_out, True, False)
-
-    @staticmethod
-    def backward(ctx, grad_output):
-        indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors
-        input_bp, filters_bp = ops.indice_conv_backward(
-            features, filters, grad_output, indice_pairs, indice_pair_num,
-            True, False)
-
-        return input_bp, filters_bp, None, None, None
-
-
-class SubMConvFunction(Function):
-
-    @staticmethod
-    def forward(ctx, features, filters, indice_pairs, indice_pair_num,
-                num_activate_out):
-        ctx.save_for_backward(indice_pairs, indice_pair_num, features, filters)
-        return ops.indice_conv(features, filters, indice_pairs,
-                               indice_pair_num, num_activate_out, False, True)
-
-    @staticmethod
-    def backward(ctx, grad_output):
-        indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors
-        input_bp, filters_bp = ops.indice_conv_backward(
-            features, filters, grad_output, indice_pairs, indice_pair_num,
-            False, True)
-
-        return input_bp, filters_bp, None, None, None
-
-
-class SparseMaxPoolFunction(Function):
-
-    @staticmethod
-    def forward(ctx, features, indice_pairs, indice_pair_num,
-                num_activate_out):
-        out = ops.indice_maxpool(features, indice_pairs, indice_pair_num,
-                                 num_activate_out)
-        ctx.save_for_backward(indice_pairs, indice_pair_num, features, out)
-        return out
-
-    @staticmethod
-    def backward(ctx, grad_output):
-        indice_pairs, indice_pair_num, features, out = ctx.saved_tensors
-        input_bp = ops.indice_maxpool_backward(features, out, grad_output,
-                                               indice_pairs, indice_pair_num)
-        return input_bp, None, None, None
-
-
-indice_conv = SparseConvFunction.apply
-indice_inverse_conv = SparseInverseConvFunction.apply
-indice_subm_conv = SubMConvFunction.apply
-indice_maxpool = SparseMaxPoolFunction.apply
--- a/mmdet3d/ops/spconv/include/paramsgrid.h
+++ b/mmdet3d/ops/spconv/include/paramsgrid.h
-// Copyright 2019 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef PARAMS_GRID_H_
-#define PARAMS_GRID_H_
-#include <tuple>
-#include <vector>
-
-namespace detail {
-template <class T>
-int getTotalSize(std::vector<T> arg) {
-  return arg.size();
-}
-
-template <class T, class... TArgs>
-int getTotalSize(std::vector<T> arg, std::vector<TArgs>... args) {
-  return arg.size() * getTotalSize(args...);
-}
-template <typename T>
-int getSize(std::vector<T> arg) {
-  return arg.size();
-}
-
-template <int Idx, class TT, class T>
-void assigner(TT &src, std::vector<int> counter, std::vector<T> &arg) {
-  std::get<Idx>(src) = arg[counter[Idx]];
-}
-
-template <int Idx, class TT, class T, class... TArgs>
-void assigner(TT &src, std::vector<int> counter, std::vector<T> &arg,
-              std::vector<TArgs> &... args) {
-  std::get<Idx>(src) = arg[counter[Idx]];
-  assigner<Idx + 1>(src, counter, args...);
-}
-}  // namespace detail
-template <class... TArgs>
-std::vector<std::tuple<TArgs...>> paramsGrid(std::vector<TArgs>... args) {
-  int length = detail::getTotalSize(args...);
-  std::vector<int> sizes = {detail::getSize(args)...};
-  int size = sizes.size();
-
-  std::vector<std::tuple<TArgs...>> params(length);
-  std::vector<int> counter(size);
-  for (int i = 0; i < length; ++i) {
-    detail::assigner<0>(params[i], counter, args...);
-    counter[size - 1] += 1;
-    for (int c = size - 1; c >= 0; --c) {
-      if (counter[c] == sizes[c] && c > 0) {
-        counter[c - 1] += 1;
-        counter[c] = 0;
-      }
-    }
-  }
-  return params;
-}
-
-#endif
--- a/mmdet3d/ops/spconv/include/prettyprint.h
+++ b/mmdet3d/ops/spconv/include/prettyprint.h
-//          Copyright Louis Delacroix 2010 - 2014.
-// Distributed under the Boost Software License, Version 1.0.
-//    (See accompanying file LICENSE_1_0.txt or copy at
-//          http://www.boost.org/LICENSE_1_0.txt)
-//
-// A pretty printing library for C++
-//
-// Usage:
-// Include this header, and operator<< will "just work".
-
-#ifndef H_PRETTY_PRINT
-#define H_PRETTY_PRINT
-
-#include <cstddef>
-#include <iterator>
-#include <memory>
-#include <ostream>
-#include <set>
-#include <tuple>
-#include <type_traits>
-#include <unordered_set>
-#include <utility>
-#include <valarray>
-
-namespace pretty_print {
-namespace detail {
-// SFINAE type trait to detect whether T::const_iterator exists.
-
-struct sfinae_base {
-  using yes = char;
-  using no = yes[2];
-};
-
-template <typename T>
-struct has_const_iterator : private sfinae_base {
- private:
-  template <typename C>
-  static yes &test(typename C::const_iterator *);
-  template <typename C>
-  static no &test(...);
-
- public:
-  static const bool value = sizeof(test<T>(nullptr)) == sizeof(yes);
-  using type = T;
-};
-
-template <typename T>
-struct has_begin_end : private sfinae_base {
- private:
-  template <typename C>
-  static yes &
-  f(typename std::enable_if<
-      std::is_same<decltype(static_cast<typename C::const_iterator (C::*)()
-                                            const>(&C::begin)),
-                   typename C::const_iterator (C::*)() const>::value>::type *);
-
-  template <typename C>
-  static no &f(...);
-
-  template <typename C>
-  static yes &g(typename std::enable_if<
-                std::is_same<decltype(static_cast<typename C::const_iterator (
-                                          C::*)() const>(&C::end)),
-                             typename C::const_iterator (C::*)() const>::value,
-                void>::type *);
-
-  template <typename C>
-  static no &g(...);
-
- public:
-  static bool const beg_value = sizeof(f<T>(nullptr)) == sizeof(yes);
-  static bool const end_value = sizeof(g<T>(nullptr)) == sizeof(yes);
-};
-
-}  // namespace detail
-
-// Holds the delimiter values for a specific character type
-
-template <typename TChar>
-struct delimiters_values {
-  using char_type = TChar;
-  const char_type *prefix;
-  const char_type *delimiter;
-  const char_type *postfix;
-};
-
-// Defines the delimiter values for a specific container and character type
-
-template <typename T, typename TChar>
-struct delimiters {
-  using type = delimiters_values<TChar>;
-  static const type values;
-};
-
-// Functor to print containers. You can use this directly if you want
-// to specify a non-default delimiters type. The printing logic can
-// be customized by specializing the nested template.
-
-template <typename T, typename TChar = char,
-          typename TCharTraits = ::std::char_traits<TChar>,
-          typename TDelimiters = delimiters<T, TChar>>
-struct print_container_helper {
-  using delimiters_type = TDelimiters;
-  using ostream_type = std::basic_ostream<TChar, TCharTraits>;
-
-  template <typename U>
-  struct printer {
-    static void print_body(const U &c, ostream_type &stream) {
-      using std::begin;
-      using std::end;
-
-      auto it = begin(c);
-      const auto the_end = end(c);
-
-      if (it != the_end) {
-        for (;;) {
-          stream << *it;
-
-          if (++it == the_end) break;
-
-          if (delimiters_type::values.delimiter != NULL)
-            stream << delimiters_type::values.delimiter;
-        }
-      }
-    }
-  };
-
-  print_container_helper(const T &container) : container_(container) {}
-
-  inline void operator()(ostream_type &stream) const {
-    if (delimiters_type::values.prefix != NULL)
-      stream << delimiters_type::values.prefix;
-
-    printer<T>::print_body(container_, stream);
-
-    if (delimiters_type::values.postfix != NULL)
-      stream << delimiters_type::values.postfix;
-  }
-
- private:
-  const T &container_;
-};
-
-// Specialization for pairs
-
-template <typename T, typename TChar, typename TCharTraits,
-          typename TDelimiters>
-template <typename T1, typename T2>
-struct print_container_helper<T, TChar, TCharTraits,
-                              TDelimiters>::printer<std::pair<T1, T2>> {
-  using ostream_type =
-      typename print_container_helper<T, TChar, TCharTraits,
-                                      TDelimiters>::ostream_type;
-
-  static void print_body(const std::pair<T1, T2> &c, ostream_type &stream) {
-    stream << c.first;
-    if (print_container_helper<T, TChar, TCharTraits,
-                               TDelimiters>::delimiters_type::values
-            .delimiter != NULL)
-      stream << print_container_helper<T, TChar, TCharTraits,
-                                       TDelimiters>::delimiters_type::values
-                    .delimiter;
-    stream << c.second;
-  }
-};
-
-// Specialization for tuples
-
-template <typename T, typename TChar, typename TCharTraits,
-          typename TDelimiters>
-template <typename... Args>
-struct print_container_helper<T, TChar, TCharTraits,
-                              TDelimiters>::printer<std::tuple<Args...>> {
-  using ostream_type =
-      typename print_container_helper<T, TChar, TCharTraits,
-                                      TDelimiters>::ostream_type;
-  using element_type = std::tuple<Args...>;
-
-  template <std::size_t I>
-  struct Int {};
-
-  static void print_body(const element_type &c, ostream_type &stream) {
-    tuple_print(c, stream, Int<0>());
-  }
-
-  static void tuple_print(const element_type &, ostream_type &,
-                          Int<sizeof...(Args)>) {}
-
-  static void tuple_print(
-      const element_type &c, ostream_type &stream,
-      typename std::conditional<sizeof...(Args) != 0, Int<0>,
-                                std::nullptr_t>::type) {
-    stream << std::get<0>(c);
-    tuple_print(c, stream, Int<1>());
-  }
-
-  template <std::size_t N>
-  static void tuple_print(const element_type &c, ostream_type &stream, Int<N>) {
-    if (print_container_helper<T, TChar, TCharTraits,
-                               TDelimiters>::delimiters_type::values
-            .delimiter != NULL)
-      stream << print_container_helper<T, TChar, TCharTraits,
-                                       TDelimiters>::delimiters_type::values
-                    .delimiter;
-
-    stream << std::get<N>(c);
-
-    tuple_print(c, stream, Int<N + 1>());
-  }
-};
-
-// Prints a print_container_helper to the specified stream.
-
-template <typename T, typename TChar, typename TCharTraits,
-          typename TDelimiters>
-inline std::basic_ostream<TChar, TCharTraits> &operator<<(
-    std::basic_ostream<TChar, TCharTraits> &stream,
-    const print_container_helper<T, TChar, TCharTraits, TDelimiters> &helper) {
-  helper(stream);
-  return stream;
-}
-
-// Basic is_container template; specialize to derive from std::true_type for all
-// desired container types
-
-// Primary template: true only when detail::has_const_iterator<T>::value and
-// both detail::has_begin_end<T> flags (beg_value, end_value) are true.
-template <typename T>
-struct is_container
-    : public std::integral_constant<bool,
-                                    detail::has_const_iterator<T>::value &&
-                                        detail::has_begin_end<T>::beg_value &&
-                                        detail::has_begin_end<T>::end_value> {};
-
-// Built-in arrays of known bound are treated as containers...
-template <typename T, std::size_t N>
-struct is_container<T[N]> : std::true_type {};
-
-// ...except char arrays, which are explicitly excluded (presumably so that
-// string literals keep their ordinary text output -- confirm).
-template <std::size_t N>
-struct is_container<char[N]> : std::false_type {};
-
-// std::valarray, std::pair and std::tuple are opted in explicitly.
-template <typename T>
-struct is_container<std::valarray<T>> : std::true_type {};
-
-template <typename T1, typename T2>
-struct is_container<std::pair<T1, T2>> : std::true_type {};
-
-template <typename... Args>
-struct is_container<std::tuple<Args...>> : std::true_type {};
-
-// Default delimiters
-
-// Fallback delimiters for narrow-character streams, used for any T that has
-// no more specific delimiters specialization.
-template <typename T>
-struct delimiters<T, char> {
-  static const delimiters_values<char> values;
-};
-// NOTE(review): the three initializers presumably map to prefix, delimiter
-// and postfix in that order -- confirm against delimiters_values.
-template <typename T>
-const delimiters_values<char> delimiters<T, char>::values = {"[", ", ", "]"};
-// Wide-character counterpart of the fallback delimiters.
-template <typename T>
-struct delimiters<T, wchar_t> {
-  static const delimiters_values<wchar_t> values;
-};
-template <typename T>
-const delimiters_values<wchar_t> delimiters<T, wchar_t>::values = {L"[", L", ",
-                                                                   L"]"};
-
-// Delimiters for (multi)set and unordered_(multi)set
-
-// std::set, narrow characters: braces instead of the default brackets.
-template <typename T, typename TComp, typename TAllocator>
-struct delimiters<::std::set<T, TComp, TAllocator>, char> {
-  static const delimiters_values<char> values;
-};
-
-template <typename T, typename TComp, typename TAllocator>
-const delimiters_values<char>
-    delimiters<::std::set<T, TComp, TAllocator>, char>::values = {"{", ", ",
-                                                                  "}"};
-
-// std::set, wide characters.
-template <typename T, typename TComp, typename TAllocator>
-struct delimiters<::std::set<T, TComp, TAllocator>, wchar_t> {
-  static const delimiters_values<wchar_t> values;
-};
-
-template <typename T, typename TComp, typename TAllocator>
-const delimiters_values<wchar_t>
-    delimiters<::std::set<T, TComp, TAllocator>, wchar_t>::values = {
-        L"{", L", ", L"}"};
-
-// std::multiset, narrow characters.
-template <typename T, typename TComp, typename TAllocator>
-struct delimiters<::std::multiset<T, TComp, TAllocator>, char> {
-  static const delimiters_values<char> values;
-};
-
-template <typename T, typename TComp, typename TAllocator>
-const delimiters_values<char>
-    delimiters<::std::multiset<T, TComp, TAllocator>, char>::values = {
-        "{", ", ", "}"};
-
-// std::multiset, wide characters.
-template <typename T, typename TComp, typename TAllocator>
-struct delimiters<::std::multiset<T, TComp, TAllocator>, wchar_t> {
-  static const delimiters_values<wchar_t> values;
-};
-
-template <typename T, typename TComp, typename TAllocator>
-const delimiters_values<wchar_t>
-    delimiters<::std::multiset<T, TComp, TAllocator>, wchar_t>::values = {
-        L"{", L", ", L"}"};
-
-// std::unordered_set, narrow characters.
-template <typename T, typename THash, typename TEqual, typename TAllocator>
-struct delimiters<::std::unordered_set<T, THash, TEqual, TAllocator>, char> {
-  static const delimiters_values<char> values;
-};
-
-template <typename T, typename THash, typename TEqual, typename TAllocator>
-const delimiters_values<char> delimiters<
-    ::std::unordered_set<T, THash, TEqual, TAllocator>, char>::values = {
-    "{", ", ", "}"};
-
-// std::unordered_set, wide characters.
-template <typename T, typename THash, typename TEqual, typename TAllocator>
-struct delimiters<::std::unordered_set<T, THash, TEqual, TAllocator>, wchar_t> {
-  static const delimiters_values<wchar_t> values;
-};
-
-template <typename T, typename THash, typename TEqual, typename TAllocator>
-const delimiters_values<wchar_t> delimiters<
-    ::std::unordered_set<T, THash, TEqual, TAllocator>, wchar_t>::values = {
-    L"{", L", ", L"}"};
-
-// std::unordered_multiset, narrow characters.
-template <typename T, typename THash, typename TEqual, typename TAllocator>
-struct delimiters<::std::unordered_multiset<T, THash, TEqual, TAllocator>,
-                  char> {
-  static const delimiters_values<char> values;
-};
-
-template <typename T, typename THash, typename TEqual, typename TAllocator>
-const delimiters_values<char> delimiters<
-    ::std::unordered_multiset<T, THash, TEqual, TAllocator>, char>::values = {
-    "{", ", ", "}"};
-
-// std::unordered_multiset, wide characters.
-template <typename T, typename THash, typename TEqual, typename TAllocator>
-struct delimiters<::std::unordered_multiset<T, THash, TEqual, TAllocator>,
-                  wchar_t> {
-  static const delimiters_values<wchar_t> values;
-};
-
-template <typename T, typename THash, typename TEqual, typename TAllocator>
-const delimiters_values<wchar_t>
-    delimiters<::std::unordered_multiset<T, THash, TEqual, TAllocator>,
-               wchar_t>::values = {L"{", L", ", L"}"};
-
-// Delimiters for pair and tuple
-
-// std::pair, narrow characters: parentheses instead of the default brackets.
-template <typename T1, typename T2>
-struct delimiters<std::pair<T1, T2>, char> {
-  static const delimiters_values<char> values;
-};
-template <typename T1, typename T2>
-const delimiters_values<char> delimiters<std::pair<T1, T2>, char>::values = {
-    "(", ", ", ")"};
-// std::pair, wide characters.
-template <typename T1, typename T2>
-struct delimiters<::std::pair<T1, T2>, wchar_t> {
-  static const delimiters_values<wchar_t> values;
-};
-template <typename T1, typename T2>
-const delimiters_values<wchar_t>
-    delimiters<::std::pair<T1, T2>, wchar_t>::values = {L"(", L", ", L")"};
-
-// std::tuple, narrow characters: same parentheses as std::pair.
-template <typename... Args>
-struct delimiters<std::tuple<Args...>, char> {
-  static const delimiters_values<char> values;
-};
-template <typename... Args>
-const delimiters_values<char> delimiters<std::tuple<Args...>, char>::values = {
-    "(", ", ", ")"};
-// std::tuple, wide characters.
-template <typename... Args>
-struct delimiters<::std::tuple<Args...>, wchar_t> {
-  static const delimiters_values<wchar_t> values;
-};
-template <typename... Args>
-const delimiters_values<wchar_t>
-    delimiters<::std::tuple<Args...>, wchar_t>::values = {L"(", L", ", L")"};
-
-// Type-erasing helper class for easy use of custom delimiters.
-// Requires TCharTraits = std::char_traits<TChar> and TChar = char or wchar_t,
-// and MyDelims needs to be defined for TChar. Usage: "cout <<
-// pretty_print::custom_delims<MyDelims>(x)".
-
-// Abstract interface that hides the wrapped container type: one stream()
-// overload per supported stream kind (narrow and wide).
-struct custom_delims_base {
-  // Virtual so that deleting through a base pointer destroys the wrapper.
-  virtual ~custom_delims_base() {}
-  virtual std::ostream &stream(::std::ostream &) = 0;
-  virtual std::wostream &stream(::std::wostream &) = 0;
-};
-
-// Concrete implementation of custom_delims_base for a container of type T
-// printed with the delimiter set Delims.
-template <typename T, typename Delims>
-struct custom_delims_wrapper : custom_delims_base {
-  // Binds to t_ by reference; no copy of the container is made.
-  custom_delims_wrapper(const T &t_) : t(t_) {}
-
-  // Narrow-stream output through print_container_helper with Delims.
-  std::ostream &stream(std::ostream &s) {
-    return s << print_container_helper<T, char, std::char_traits<char>, Delims>(
-               t);
-  }
-
-  // Wide-stream output through print_container_helper with Delims.
-  std::wostream &stream(std::wostream &s) {
-    return s << print_container_helper<T, wchar_t, std::char_traits<wchar_t>,
-                                       Delims>(t);
-  }
-
- private:
-  // Reference member: the referenced container must outlive this wrapper.
-  const T &t;
-};
-
-// User-facing handle: constructible from any container type and owning a
-// heap-allocated custom_delims_wrapper for it behind the base interface.
-template <typename Delims>
-struct custom_delims {
-  // Allocates the wrapper for Container; the wrapper keeps a reference to c.
-  template <typename Container>
-  custom_delims(const Container &c)
-      : base(new custom_delims_wrapper<Container, Delims>(c)) {}
-
-  // Owning pointer to the type-erased wrapper.
-  std::unique_ptr<custom_delims_base> base;
-};
-
-// Stream insertion for custom_delims: forwards to the wrapper's virtual
-// stream() overload that matches the stream type and returns its result.
-template <typename TChar, typename TCharTraits, typename Delims>
-inline std::basic_ostream<TChar, TCharTraits> &operator<<(
-    std::basic_ostream<TChar, TCharTraits> &s, const custom_delims<Delims> &p) {
-  return p.base->stream(s);
-}
-
-// A wrapper for a C-style array given as pointer-plus-size.
-// Usage: std::cout << pretty_print_array(arr, n) << std::endl;
-
-// Non-owning view over `n` elements starting at `a`, exposing the
-// const_iterator / begin() / end() members that is_container looks for.
-template <typename T>
-struct array_wrapper_n {
-  typedef const T *const_iterator;
-  typedef T value_type;
-
-  // Stores the pointer and length only; the elements are not copied.
-  array_wrapper_n(const T *const a, size_t n) : _array(a), _n(n) {}
-  inline const_iterator begin() const { return _array; }
-  // One past the last element: _array + _n.
-  inline const_iterator end() const { return _array + _n; }
-
- private:
-  const T *const _array;
-  size_t _n;
-};
-
-// A wrapper for hash-table based containers that offer local iterators to each
-// bucket. Usage: std::cout << bucket_print(m, 4) << std::endl;  (Prints the
-// bucket at index 4, i.e. the fifth bucket, of container m.)
-
-// Non-owning view over one bucket of container T, iterated through T's
-// local (per-bucket) const iterators.
-template <typename T>
-struct bucket_print_wrapper {
-  typedef typename T::const_local_iterator const_iterator;
-  typedef typename T::size_type size_type;
-
-  // First element of bucket n.
-  const_iterator begin() const { return m_map.cbegin(n); }
-
-  // Past-the-end iterator of bucket n.
-  const_iterator end() const { return m_map.cend(n); }
-
-  // Keeps a reference to m and the bucket index; nothing is copied.
-  bucket_print_wrapper(const T &m, size_type bucket) : m_map(m), n(bucket) {}
-
- private:
-  // Reference member: the container must outlive this wrapper.
-  const T &m_map;
-  const size_type n;
-};
-
-}  // namespace pretty_print
-
-// Global accessor functions for the convenience wrappers
-
-// Convenience factory: wraps the pointer/length pair (a, n) in a
-// pretty_print::array_wrapper_n<T> so that T is deduced from the pointer.
-template <typename T>
-inline pretty_print::array_wrapper_n<T> pretty_print_array(const T *const a,
-                                                           size_t n) {
-  typedef pretty_print::array_wrapper_n<T> wrapper_type;
-  return wrapper_type(a, n);
-}
-
-// Convenience factory: builds a pretty_print::bucket_print_wrapper<T> for
-// bucket index n of container m, deducing T from the container argument.
-template <typename T>
-pretty_print::bucket_print_wrapper<T> bucket_print(const T &m,
-                                                   typename T::size_type n) {
-  typedef pretty_print::bucket_print_wrapper<T> wrapper_type;
-  return wrapper_type(m, n);
-}
-
-// Main magic entry point: An overload snuck into namespace std.
-// Can we do better?
-
-namespace std {
-// Prints a container to the stream using default delimiters
-
-// Enabled only when ::pretty_print::is_container<T>::value is true; for any
-// other T the enable_if return type removes this overload from consideration.
-template <typename T, typename TChar, typename TCharTraits>
-inline typename enable_if<::pretty_print::is_container<T>::value,
-                          basic_ostream<TChar, TCharTraits> &>::type
-operator<<(basic_ostream<TChar, TCharTraits> &stream, const T &container) {
-  // Wrap the container in a helper (TDelimiters left to its default) and
-  // stream that; the helper's own operator<< does the formatting.
-  return stream
-         << ::pretty_print::print_container_helper<T, TChar, TCharTraits>(
-                container);
-}
-}  // namespace std
-
-#endif  // H_PRETTY_PRINT
--- a/mmdet3d/ops/spconv/include/pybind11_utils.h
+++ b/mmdet3d/ops/spconv/include/pybind11_utils.h
-// Copyright 2019 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <algorithm>
-#include <iostream>
-#include <pybind11/embed.h> // everything needed for embedding
-#include <pybind11/functional.h>
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-
-#include <tensorview/tensorview.h>
-
-namespace py = pybind11;
-
-// Copies the elements of a Python array-like object into a std::vector<T>.
-// The element count is read from the object's Python-level "size" attribute.
-template <typename T, typename TPyObject>
-std::vector<T> array2Vector(TPyObject arr){
-    py::array arr_np = arr;
-    size_t size = arr.attr("size").template cast<size_t>();
-    // Typed handle for element type T.
-    // NOTE(review): presumably this converts when the dtype differs -- confirm
-    // against the pybind11 array_t documentation.
-    py::array_t<T> arr_cc = arr_np;
-    // The vector owns its own copy of the `size` elements.
-    std::vector<T> data(arr_cc.data(), arr_cc.data() + size);
-    return data;
-}
-
-// Copies every element of an already-typed py::array_t<T> into a new
-// std::vector<T>; the range is [arr.data(), arr.data() + arr.size()).
-template <typename T>
-std::vector<T> arrayT2Vector(py::array_t<T> arr)
-{
-  const T *first = arr.data();
-  const T *last = first + arr.size();
-  return std::vector<T>(first, last);
-}
-
-// Builds a tv::TensorView<T> over the data of a Python array-like object,
-// with one shape entry per array dimension.
-// NOTE(review): the view is created from arr_cc.mutable_data() and arr_cc is
-// a local. If constructing py::array_t<T> from arr_np yields a new array
-// rather than sharing arr's buffer, the returned view would refer to storage
-// released on return -- confirm callers always pass arrays of dtype T.
-template <typename T, typename TPyObject>
-tv::TensorView<T> array2TensorView(TPyObject arr){
-    py::array arr_np = arr;
-    py::array_t<T> arr_cc = arr_np;
-    tv::Shape shape;
-    for (int i = 0; i < arr_cc.ndim(); ++i){
-        shape.push_back(arr_cc.shape(i));
-    }
-    // Non-owning view: only the pointer and the shape are stored.
-    return tv::TensorView<T>(arr_cc.mutable_data(), shape);
-}
-// Wraps the buffer of a typed py::array_t<T> in a tv::TensorView<T> whose
-// shape lists the array's extents in dimension order. No data is copied.
-template <typename T>
-tv::TensorView<T> arrayT2TensorView(py::array_t<T> arr){
-    tv::Shape extents;
-    int axis = 0;
-    while (axis < arr.ndim()){
-        extents.push_back(arr.shape(axis));
-        ++axis;
-    }
-    T *buffer = arr.mutable_data();
-    return tv::TensorView<T>(buffer, extents);
-}
--- a/mmdet3d/ops/spconv/include/spconv/fused_spconv_ops.h
+++ b/mmdet3d/ops/spconv/include/spconv/fused_spconv_ops.h
-// Copyright 2019 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef FUSED_SPARSE_CONV_OP_H_
-#define FUSED_SPARSE_CONV_OP_H_
-
-#include <cuda_runtime_api.h>
-#include <spconv/indice.h>
-#include <spconv/reordering.h>
-#include <torch/script.h>
-#include <torch_utils.h>
-#include <utility/timer.h>
-
-namespace spconv {
-// torch.jit's documentation says only int64 integers are supported, so integer
-// arguments are taken as int64_t and converted inside the function.
-
-// Sparse convolution with an additive per-output-channel bias, computed as
-// gather -> GEMM -> scatter-add for every kernel offset.
-//
-//   features     input feature matrix; size(1) is the number of input planes.
-//   filters      kernel weights; viewed here as
-//                [kernelVolume, numInPlanes, numOutPlanes].
-//   bias         copied into every output row before any accumulation.
-//   indicePairs  indexed as [kernel offset][0 or 1][pair]; read as int.
-//   indiceNum    number of pairs per kernel offset; read as int.
-//   numActOut    number of rows of the returned tensor.
-//   _inverse     non-zero swaps which half of indicePairs is used for the
-//                gather and which for the scatter.
-//   _subM        non-zero enables the submanifold shortcut described below.
-//
-// Returns a [numActOut, numOutPlanes] tensor with the dtype and device of
-// `features`.
-template <typename T>
-torch::Tensor fusedIndiceConvBatchNorm(
-    torch::Tensor features, torch::Tensor filters, torch::Tensor bias,
-    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,
-    int64_t _inverse, int64_t _subM) {
-  bool subM = _subM != 0;
-  bool inverse = _inverse != 0;
-  auto device = features.device().type();
-  auto ndim = filters.dim() - 2;
-  auto kernelVolume = indicePairs.size(0);
-  auto numInPlanes = features.size(1);
-  auto numOutPlanes = filters.size(ndim + 1);
-  // The per-offset pair counts are inspected on the host.
-  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
-  auto indicePairMaxSizeIter =
-      std::max_element(indicePairNumCpu.data_ptr<int>(),
-                       indicePairNumCpu.data_ptr<int>() + kernelVolume);
-  // Kernel offset with the most pairs, and that pair count. The count sizes
-  // the scratch buffers below so they fit every offset.
-  int indicePairMaxOffset =
-      indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
-  int indicePairMaxSize = *indicePairMaxSizeIter;
-
-  auto options =
-      torch::TensorOptions().dtype(features.dtype()).device(features.device());
-
-  // Every output row starts out equal to the bias.
-  torch::Tensor output =
-      torch::zeros({numActOut, numOutPlanes}, options).copy_(bias);
-  torch::Tensor inputBuffer =
-      torch::zeros({indicePairMaxSize, numInPlanes}, options);
-  torch::Tensor outputBuffer =
-      torch::zeros({indicePairMaxSize, numOutPlanes}, options);
-  filters = filters.view({-1, numInPlanes, numOutPlanes});
-  if (subM) {  // the center index of subm conv don't need gather and scatter
-               // add.
-    // Accumulate onto the bias already stored in `output`. Writing the
-    // product with mm_out would overwrite it and silently drop the bias.
-    output.addmm_(features, filters[indicePairMaxOffset]);
-  }
-  for (int i = 0; i < kernelVolume; ++i) {
-    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
-    // Skip empty offsets and, for subM, the offset handled by the shortcut.
-    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
-      continue;
-    }
-    // nHot-row views that alias the start of the scratch buffers.
-    auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr<T>(),
-                                             {nHot, numOutPlanes}, options);
-    auto inputBufferBlob = torch::from_blob(inputBuffer.data_ptr<T>(),
-                                            {nHot, numInPlanes}, options);
-
-    // Gather the input rows referenced by this offset into inputBuffer.
-    // (Original author's note: torch::index_select_out was slower than
-    // SparseGatherFunctor here, possibly due to the int->long conversion.)
-    if (device == torch::kCPU) {
-      functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
-      gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
-                 tv::torch2tv<const T>(features),
-                 tv::torch2tv<const int>(indicePairs).subview(i, inverse),
-                 nHot);
-    } else {
-      functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
-      gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
-                 tv::torch2tv<const T>(features),
-                 tv::torch2tv<const int>(indicePairs).subview(i, inverse),
-                 nHot);
-      TV_CHECK_CUDA_ERR();
-    }
-    // GEMM for this offset: [nHot, in] x [in, out] -> [nHot, out].
-    torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
-
-    // Scatter-add the products into the output rows named by the other
-    // half of the pair table.
-    if (device == torch::kCPU) {
-      functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
-      scatterFtor(tv::CPU(), tv::torch2tv<T>(output),
-                  tv::torch2tv<const T>(outputBuffer),
-                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
-                  nHot, true);
-    } else {
-      functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
-      scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
-                  tv::torch2tv<const T>(outputBuffer),
-                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
-                  nHot, true);
-      TV_CHECK_CUDA_ERR();
-    }
-  }
-  return output;
-}
-}  // namespace spconv
-
-#endif
--- a/mmdet3d/ops/spconv/include/spconv/geometry.h
+++ b/mmdet3d/ops/spconv/include/spconv/geometry.h
-// Copyright 2019 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef SPCONV_GEOMETRY_H_
-#define SPCONV_GEOMETRY_H_
-
-#include <tensorview/tensorview.h>
-
-#include <iostream>
-#include <limits>
-
-namespace spconv {
-// For one input coordinate, enumerates the candidate output coordinates of a
-// (strided, padded, dilated) convolution and records the in-range ones.
-//
-// `out` receives one record of NDim + 1 values per valid candidate: the NDim
-// output coordinates followed by the flattened kernel offset. The return
-// value is the number of valid records written.
-// NOTE(review): `out` must have room for every record; the callers visible in
-// this file pass a buffer sized from the kernel volume -- confirm that always
-// covers numPoints.
-template <typename Index, unsigned NDim>
-TV_HOST_DEVICE Index getValidOutPos(const Index *input_pos,
-                                    const Index *kernelSize,
-                                    const Index *stride, const Index *padding,
-                                    const Index *dilation,
-                                    const Index *outSpatialShape, Index *out) {
-  Index lowers[NDim];
-  Index uppers[NDim];
-  Index counter[NDim];
-  Index counterSize[NDim];
-  Index pointCounter = 0;
-  Index val;
-  Index numPoints = 1;
-  Index m, offset;
-  bool valid = false;
-  // Per-dimension range [lowers, uppers] of candidate output coordinates.
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    lowers[i] = (input_pos[i] - (kernelSize[i] - 1) * dilation[i] - 1 +
-                 stride[i] + padding[i]) /
-                stride[i];
-    uppers[i] = (input_pos[i] + padding[i]) / stride[i];
-  }
-
-  // Number of candidates per dimension and in total.
-#pragma unroll
-  for (unsigned i = 0; i < NDim; ++i) {
-    counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1);
-    numPoints *= counterSize[i];
-  }
-
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    counter[i] = 0;
-  }
-  for (int i = 0; i < numPoints; ++i) {
-    valid = true;
-    m = 1;
-    offset = 0;
-#pragma unroll
-    for (int j = NDim - 1; j >= 0; --j) {
-      // Candidates are walked downwards from uppers[j] in steps of dilation.
-      val = uppers[j] - counter[j] * dilation[j];
-      out[pointCounter * (NDim + 1) + j] = val;
-      if (val < 0 || (val > outSpatialShape[j] - 1)) {
-        valid = false;
-        // break;
-      }
-      // Accumulate the kernel offset; m is the product of the kernel sizes
-      // of the dimensions already visited (those after j).
-      offset += m * (input_pos[j] - val * stride[j] + padding[j]) / dilation[j];
-      m *= kernelSize[j];
-    }
-
-    out[pointCounter * (NDim + 1) + NDim] = offset;
-    // An invalid record is not counted, so the next candidate overwrites it.
-    if (valid) ++pointCounter;
-    // Advance the multi-dimensional counter, carrying into earlier dims.
-    counter[NDim - 1] += 1;
-#pragma unroll
-    for (int c = NDim - 1; c >= 0; --c) {
-      if (counter[c] == counterSize[c] && c > 0) {
-        counter[c - 1] += 1;
-        counter[c] = 0;
-      }
-    }
-  }
-  return pointCounter;
-}
-
-// Transposed-convolution counterpart of getValidOutPos: for one input
-// coordinate, enumerates the output coordinates it contributes to and records
-// the in-range ones.
-//
-// `out` receives one record of NDim + 1 values per valid candidate: the NDim
-// output coordinates followed by the flattened kernel offset. The return
-// value is the number of valid records written.
-template <typename Index, unsigned NDim>
-TV_HOST_DEVICE Index getValidOutPosTranspose(
-    const Index *input_pos, const Index *kernelSize, const Index *stride,
-    const Index *padding, const Index *dilation, const Index *outSpatialShape,
-    Index *out) {
-  Index lowers[NDim];
-  Index uppers[NDim];
-  Index counter[NDim];
-  Index counterSize[NDim];
-  Index pointCounter = 0;
-  Index val;
-  Index numPoints = 1;
-  Index m, offset;
-  bool valid = false;
-  // Output range covered by the kernel when anchored at this input point.
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    lowers[i] = input_pos[i] * stride[i] - padding[i];
-    uppers[i] = lowers[i] + (kernelSize[i] - 1) * dilation[i];
-  }
-  // Number of candidates per dimension and in total.
-#pragma unroll
-  for (unsigned i = 0; i < NDim; ++i) {
-    counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1);
-    numPoints *= counterSize[i];
-  }
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    counter[i] = 0;
-  }
-  for (int i = 0; i < numPoints; ++i) {
-    valid = true;
-    m = 1;
-    offset = 0;
-#pragma unroll
-    for (int j = NDim - 1; j >= 0; --j) {
-      // Candidates are walked downwards from uppers[j] in steps of dilation.
-      val = uppers[j] - counter[j] * dilation[j];
-      out[pointCounter * (NDim + 1) + j] = val;
-      if (val < 0 || (val > outSpatialShape[j] - 1)) {
-        valid = false;
-        // break;
-      }
-      // Accumulate the kernel offset; m is the product of the kernel sizes
-      // of the dimensions already visited (those after j).
-      offset += m * (val - lowers[j]) / dilation[j];
-      m *= kernelSize[j];
-    }
-    out[pointCounter * (NDim + 1) + NDim] = offset;
-    // An invalid record is not counted, so the next candidate overwrites it.
-    if (valid) ++pointCounter;
-    // Advance the multi-dimensional counter, carrying into earlier dims.
-    counter[NDim - 1] += 1;
-#pragma unroll
-    for (int c = NDim - 1; c >= 0; --c) {
-      if (counter[c] == counterSize[c] && c > 0) {
-        counter[c - 1] += 1;
-        counter[c] = 0;
-      }
-    }
-  }
-  return pointCounter;
-}
-
-// Host-side construction of the input/output pair table for a regular sparse
-// convolution.
-//
-// Each row of indicesIn is read as (batch index, NDim spatial coordinates).
-// For every input row the valid output positions are enumerated with
-// getValidOutPos; an output position seen for the first time gets the next
-// free output row (its coordinates go to indicesOut, its row number to
-// gridsOut), and the pair (input row, output row) is appended to indicePairs
-// under its kernel offset, with indiceNum counting pairs per offset.
-// Returns the number of distinct active output positions.
-// NOTE(review): relies on gridsOut holding -1 for unseen cells and indiceNum
-// starting at zero -- presumably initialised by the caller; confirm.
-template <typename Index, typename IndexGrid, unsigned NDim>
-Index getIndicePairsConv(tv::TensorView<const Index> indicesIn,
-                         tv::TensorView<Index> indicesOut,
-                         tv::TensorView<IndexGrid> gridsOut,
-                         tv::TensorView<Index> indicePairs,
-                         tv::TensorView<Index> indiceNum,
-                         const Index *kernelSize, const Index *stride,
-                         const Index *padding, const Index *dilation,
-                         const Index *outSpatialShape) {
-  // indicesOut: num_active * kernelVolume * (NDim + 1)
-  Index numAct = 0;
-  auto numActIn = indicesIn.dim(0);
-  Index batchIdx = 0;
-  Index spatialVolume = 1;
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    spatialVolume *= outSpatialShape[i];
-  }
-  Index kernelVolume = 1;
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    kernelVolume *= kernelSize[i];
-  }
-  Index numValidPoints = 0;
-  // Scratch space for the records produced by getValidOutPos.
-  std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
-  Index *validPoints = validPoints_.data();
-  Index *pointPtr = nullptr;
-  for (int j = 0; j < numActIn; ++j) {
-    batchIdx = indicesIn(j, 0);
-    // "+ 1" skips the batch column so only spatial coordinates are passed.
-    numValidPoints = getValidOutPos<Index, NDim>(
-        indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,
-        dilation, outSpatialShape, validPoints);
-    for (Index i = 0; i < numValidPoints; ++i) {
-      pointPtr = validPoints + i * (NDim + 1);
-      auto offset = pointPtr[NDim];
-      // Flat grid cell of this output position within its batch element.
-      auto index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
-                   spatialVolume * batchIdx;
-      if (gridsOut[index] == -1) {
-        // First visit: register a new active output row.
-        for (unsigned k = 1; k < NDim + 1; ++k) {
-          indicesOut(numAct, k) = pointPtr[k - 1];
-        }
-        indicesOut(numAct, 0) = batchIdx;
-        gridsOut[index] = numAct++;
-      }
-      // indicePairs: [K, 2, L]
-      indicePairs(offset, 0, indiceNum[offset]) = j;
-      indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index];
-    }
-  }
-  return numAct;
-}
-
-// Host-side construction of the input/output pair table for a transposed
-// sparse convolution. Identical in structure to getIndicePairsConv except
-// that candidate outputs come from getValidOutPosTranspose.
-//
-// Each row of indicesIn is read as (batch index, NDim spatial coordinates).
-// Returns the number of distinct active output positions.
-// NOTE(review): relies on gridsOut holding -1 for unseen cells and indiceNum
-// starting at zero -- presumably initialised by the caller; confirm.
-template <typename Index, typename IndexGrid, unsigned NDim>
-Index getIndicePairsDeConv(tv::TensorView<const Index> indicesIn,
-                           tv::TensorView<Index> indicesOut,
-                           tv::TensorView<IndexGrid> gridsOut,
-                           tv::TensorView<Index> indicePairs,
-                           tv::TensorView<Index> indiceNum,
-                           const Index *kernelSize, const Index *stride,
-                           const Index *padding, const Index *dilation,
-                           const Index *outSpatialShape) {
-  Index numAct = 0;
-  auto numActIn = indicesIn.dim(0);
-  Index batchIdx = 0;
-  Index spatialVolume = 1;
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    spatialVolume *= outSpatialShape[i];
-  }
-  Index kernelVolume = 1;
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    kernelVolume *= kernelSize[i];
-  }
-  Index numValidPoints = 0;
-  // Scratch space for the records produced by getValidOutPosTranspose.
-  std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
-  Index *validPoints = validPoints_.data();
-  Index *pointPtr = nullptr;
-  for (int j = 0; j < numActIn; ++j) {
-    batchIdx = indicesIn(j, 0);
-    // "+ 1" skips the batch column so only spatial coordinates are passed.
-    numValidPoints = getValidOutPosTranspose<Index, NDim>(
-        indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,
-        dilation, outSpatialShape, validPoints);
-    for (Index i = 0; i < numValidPoints; ++i) {
-      pointPtr = validPoints + i * (NDim + 1);
-      auto offset = pointPtr[NDim];
-      // Flat grid cell of this output position within its batch element.
-      auto index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
-                   spatialVolume * batchIdx;
-      if (gridsOut[index] == -1) {
-        // First visit: register a new active output row.
-        for (unsigned k = 1; k < NDim + 1; ++k) {
-          indicesOut(numAct, k) = pointPtr[k - 1];
-        }
-        indicesOut(numAct, 0) = batchIdx;
-        gridsOut[index] = numAct++;
-      }
-      // indicePairs: [K, 2, L]
-      indicePairs(offset, 0, indiceNum[offset]) = j;
-      indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index];
-    }
-  }
-  return numAct;
-}
-
-// Host-side construction of the pair table for a submanifold convolution,
-// where the set of active outputs is the set of active inputs.
-//
-// Pass 1 stores each input row number in gridsOut at the row's own flat
-// position. Pass 2 enumerates candidate outputs for each input and records a
-// pair only when the candidate cell already holds an input row
-// (gridsOut > -1). Returns numActIn.
-// NOTE(review): relies on gridsOut holding -1 for inactive cells and
-// indiceNum starting at zero -- presumably initialised by the caller; confirm.
-template <typename Index, typename IndexGrid, unsigned NDim>
-Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
-                         tv::TensorView<IndexGrid> gridsOut,
-                         tv::TensorView<Index> indicePairs,
-                         tv::TensorView<Index> indiceNum,
-                         const Index *const kernelSize,
-                         const Index *const stride, const Index *const padding,
-                         const Index *dilation,
-                         const Index *const outSpatialShape) {
-  Index numAct = 0;  // not used below; the function returns numActIn
-  auto numActIn = indicesIn.dim(0);
-  Index batchIdx = 0;  // not used below
-  Index spatialVolume = 1;
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    spatialVolume *= outSpatialShape[i];
-  }
-  Index kernelVolume = 1;
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    kernelVolume *= kernelSize[i];
-  }
-  Index numValidPoints = 0;
-  // Index validPoints[kernelVolume * (NDim + 1)];
-  std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
-  Index *validPoints = validPoints_.data();
-  Index *pointPtr = nullptr;
-  Index index = 0;
-  // Pass 1: mark every active input position with its row number.
-  for (int j = 0; j < numActIn; ++j) {
-    index = tv::rowArrayIdx<Index, NDim>(indicesIn.data() + j * (NDim + 1) + 1,
-                                         outSpatialShape) +
-            spatialVolume * indicesIn(j, 0);
-    gridsOut[index] = j;
-  }
-  // Pass 2: pair each input with the active positions it can reach.
-  for (int j = 0; j < numActIn; ++j) {
-    numValidPoints = getValidOutPos<Index, NDim>(
-        indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,
-        dilation, outSpatialShape, validPoints);
-    for (Index i = 0; i < numValidPoints; ++i) {
-      pointPtr = validPoints + i * (NDim + 1);
-      auto offset = pointPtr[NDim];
-      index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
-              spatialVolume * indicesIn(j, 0);
-      if (gridsOut[index] > -1) {
-        indicePairs(offset, 0, indiceNum[offset]) = j;
-        indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index];
-      }
-    }
-  }
-  return numActIn;
-}
-
-}  // namespace spconv
-
-#endif
--- a/mmdet3d/ops/spconv/include/spconv/indice.cu.h
+++ b/mmdet3d/ops/spconv/include/spconv/indice.cu.h
-// Copyright 2019 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef INDICE_CU_H_
-#define INDICE_CU_H_
-#include <spconv/geometry.h>
-#include <tensorview/helper_kernel.cu.h>
-#include <tensorview/tensorview.h>
-
-namespace spconv {
-// CUDA kernel, first stage of pair-table construction for a regular sparse
-// convolution. For every input row it enumerates the valid output positions
-// and appends, per kernel offset, the input row number and the *flat grid
-// index* of the output position (not yet an output row number). The same
-// flat index is also written to indicePairUnique.
-//
-// indicesOut and gridsOut are accepted but not touched in this kernel.
-// NOTE(review): validPoints is a fixed-size per-thread array sized by
-// KernelMaxVolume, and nothing visible here checks the actual kernel volume
-// against it -- confirm the launcher enforces that bound.
-template <typename Index, typename IndexGrid, unsigned NDim,
-          int KernelMaxVolume = 256>
-__global__ void prepareIndicePairsKernel(
-    tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicesOut,
-    tv::TensorView<IndexGrid> gridsOut, tv::TensorView<Index> indicePairs,
-    tv::TensorView<Index> indiceNum, tv::TensorView<Index> indicePairUnique,
-    const tv::SimpleVector<Index, NDim> kernelSize,
-    const tv::SimpleVector<Index, NDim> stride,
-    const tv::SimpleVector<Index, NDim> padding,
-    const tv::SimpleVector<Index, NDim> dilation,
-    const tv::SimpleVector<Index, NDim> outSpatialShape) {
-  auto numActIn = indicesIn.dim(0);
-  Index spatialVolume = 1;
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    spatialVolume *= outSpatialShape[i];
-  }
-  Index kernelVolume = 1;  // computed but not read afterwards
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    kernelVolume *= kernelSize[i];
-  }
-  Index numValidPoints = 0;
-  Index validPoints[KernelMaxVolume * (NDim + 1)];
-  Index *pointPtr = nullptr;
-  auto indicePairsDim2 = indicePairs.dim(2);
-  Index index;
-  // Each thread handles the input rows handed to it by tv::KernelLoopX.
-  for (int ix : tv::KernelLoopX<int>(numActIn)) {
-    numValidPoints = getValidOutPos<Index, NDim>(
-        indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
-        stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
-        validPoints);
-    for (Index i = 0; i < numValidPoints; ++i) {
-      pointPtr = validPoints + i * (NDim + 1);
-      auto offset = pointPtr[NDim];
-      // Atomically reserve the next free slot for this kernel offset.
-      auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
-      indicePairs(offset, 0, oldNum) = ix;
-      index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape.data()) +
-              spatialVolume * indicesIn(ix, 0);
-      indicePairs(offset, 1, oldNum) = index;
-      indicePairUnique[offset * indicePairsDim2 + oldNum] = index;
-    }
-  }
-}
-
-// CUDA kernel, first stage of pair-table construction for a transposed
-// sparse convolution. Same structure as prepareIndicePairsKernel, except
-// that candidate outputs come from getValidOutPosTranspose. The output half
-// of each pair holds a flat grid index, which is also written to
-// indicePairUnique.
-//
-// indicesOut and gridsOut are accepted but not touched in this kernel.
-// NOTE(review): validPoints is a fixed-size per-thread array sized by
-// KernelMaxVolume, and nothing visible here checks the actual kernel volume
-// against it -- confirm the launcher enforces that bound.
-template <typename Index, typename IndexGrid, unsigned NDim,
-          int KernelMaxVolume = 256>
-__global__ void prepareDeConvIndicePairsKernel(
-    tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicesOut,
-    tv::TensorView<IndexGrid> gridsOut, tv::TensorView<Index> indicePairs,
-    tv::TensorView<Index> indiceNum, tv::TensorView<Index> indicePairUnique,
-    const tv::SimpleVector<Index, NDim> kernelSize,
-    const tv::SimpleVector<Index, NDim> stride,
-    const tv::SimpleVector<Index, NDim> padding,
-    const tv::SimpleVector<Index, NDim> dilation,
-    const tv::SimpleVector<Index, NDim> outSpatialShape) {
-  auto numActIn = indicesIn.dim(0);
-  Index spatialVolume = 1;
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    spatialVolume *= outSpatialShape[i];
-  }
-  Index kernelVolume = 1;  // computed but not read afterwards
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    kernelVolume *= kernelSize[i];
-  }
-  Index numValidPoints = 0;
-  Index validPoints[KernelMaxVolume * (NDim + 1)];
-  Index *pointPtr = nullptr;
-  auto indicePairsDim2 = indicePairs.dim(2);
-  Index index;
-  // Each thread handles the input rows handed to it by tv::KernelLoopX.
-  for (int ix : tv::KernelLoopX<int>(numActIn)) {
-    numValidPoints = getValidOutPosTranspose<Index, NDim>(
-        indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
-        stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
-        validPoints);
-    for (Index i = 0; i < numValidPoints; ++i) {
-      pointPtr = validPoints + i * (NDim + 1);
-      auto offset = pointPtr[NDim];
-      // Atomically reserve the next free slot for this kernel offset.
-      auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
-      indicePairs(offset, 0, oldNum) = ix;
-      index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape.data()) +
-              spatialVolume * indicesIn(ix, 0);
-      indicePairs(offset, 1, oldNum) = index;
-      indicePairUnique[offset * indicePairsDim2 + oldNum] = index;
-    }
-  }
-}
-
-// For every entry of indicePairUnique: store its position in gridsOut at the
-// flattened index, decode that index into columns 1..NDim of the matching
-// indicesOut row, and write (decoded remainder % batchSize) into column 0.
-// indicePairs is accepted but not referenced here.
-template <typename Index, typename IndexGrid, unsigned NDim>
-__global__ void assignGridAndIndiceOutKernel(
-    tv::TensorView<Index> indicesOut, tv::TensorView<IndexGrid> gridsOut,
-    int numAct, tv::TensorView<Index> indicePairs,
-    tv::TensorView<Index> indicePairUnique,
-    const tv::SimpleVector<Index, NDim> outSpatialShape, int batchSize) {
-  // Kept unsigned so row offsets are computed exactly as `ix * (NDim + 1)`.
-  constexpr unsigned kRowWidth = NDim + 1;
-  auto outBase = indicesOut.data();
-  for (int ix : tv::KernelLoopX<int>(numAct)) {
-    Index flat = indicePairUnique[ix];
-    gridsOut[flat] = ix;
-    auto coordDst = outBase + ix * kRowWidth + 1;
-    Index remainder = tv::rowArrayIdxInv<Index, NDim>(flat, coordDst,
-                                                      outSpatialShape.data());
-    indicesOut[ix * kRowWidth] = remainder % batchSize;
-  }
-}
-
-// Rewrite the output side of every index pair: entries of
-// indicePairs(k, 1, ix) that are greater than -1 are replaced by the value
-// gridsOut holds at that index. indicesOut, indicePairUnique and
-// outSpatialShape are accepted but not referenced here.
-template <typename Index, typename IndexGrid, unsigned NDim>
-__global__ void assignIndicePairsKernel(
-    tv::TensorView<Index> indicesOut, tv::TensorView<IndexGrid> gridsOut,
-    int numActIn, tv::TensorView<Index> indicePairs,
-    tv::TensorView<Index> indicePairUnique,
-    const tv::SimpleVector<Index, NDim> outSpatialShape) {
-  const int numOffsets = indicePairs.dim(0);
-  for (int ix : tv::KernelLoopX<int>(numActIn)) {
-    for (int k = 0; k < numOffsets; ++k) {
-      const Index flat = indicePairs(k, 1, ix);
-      if (flat <= -1) {
-        continue;  // slot holds no valid index
-      }
-      indicePairs(k, 1, ix) = gridsOut[flat];
-    }
-  }
-}
-
-// Fill gridsOut so that the flattened position of every input row maps back
-// to that row's number. The flattened position is the row-major index of
-// columns 1..NDim plus spatialVolume times column 0.
-template <typename Index, typename IndexGrid, unsigned NDim>
-__global__ void prepareSubMGridKernel(
-    tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
-    const tv::SimpleVector<Index, NDim> outSpatialShape) {
-  auto numActIn = indicesIn.dim(0);
-  Index spatialVolume = 1;
-#pragma unroll
-  for (int d = 0; d < NDim; ++d) {
-    spatialVolume *= outSpatialShape[d];
-  }
-  for (int ix : tv::KernelLoopX<int>(numActIn)) {
-    auto coords = indicesIn.data() + ix * (NDim + 1) + 1;
-    const Index flat =
-        tv::rowArrayIdx<Index, NDim>(coords, outSpatialShape.data()) +
-        spatialVolume * indicesIn(ix, 0);
-    gridsOut[flat] = ix;
-  }
-}
-
-/**
- * For each active input row, list candidate positions with getValidOutPos
- * and keep only those whose gridsOut entry is greater than -1.
- *
- * For every kept candidate a slot is reserved by atomically incrementing
- * indiceNum[offset]; the gridsOut entry is written to
- * indicePairs(offset, 1, slot) and the input row to
- * indicePairs(offset, 0, slot).
- */ template <typename Index, typename IndexGrid, unsigned NDim,
-          int KernelMaxVolume = 256>
-__global__ void getSubMIndicePairsKernel(
-    tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
-    tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum,
-    const tv::SimpleVector<Index, NDim> kernelSize,
-    const tv::SimpleVector<Index, NDim> stride,
-    const tv::SimpleVector<Index, NDim> padding,
-    const tv::SimpleVector<Index, NDim> dilation,
-    const tv::SimpleVector<Index, NDim> outSpatialShape) {
-  auto numActIn = indicesIn.dim(0);
-  Index spatialVolume = 1;  // product of all output spatial extents
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    spatialVolume *= outSpatialShape[i];
-  }
-  Index numValidPoints = 0;
-  Index validPoints[KernelMaxVolume * (NDim + 1)];  // records of NDim coords + 1 kernel offset
-  Index *pointPtr = nullptr;
-  Index index = 0;
-  for (int ix : tv::KernelLoopX<int>(numActIn)) {
-    numValidPoints = getValidOutPos<Index, NDim>(
-        indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),  // skip column 0 of the input row
-        stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
-        validPoints);
-    for (int i = 0; i < numValidPoints; ++i) {
-      pointPtr = validPoints + i * (NDim + 1);
-      auto offset = pointPtr[NDim];  // last field of each record is the kernel offset
-      index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape.data()) +
-              spatialVolume * indicesIn(ix, 0);  // column 0 is presumably the batch index - TODO confirm
-      if (gridsOut[index] > -1) {  // -1 is the value the resetGrid* kernels write into gridsOut
-        auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1));  // reserve a slot for this offset
-        indicePairs(offset, 1, oldNum) = gridsOut[index];
-        indicePairs(offset, 0, oldNum) = ix;
-      }
-    }
-  }
-}
-
-// Write -1 into gridsOut at each of the first numAct indices listed in
-// indicePairUnique.
-template <typename Index, typename IndexGrid, unsigned NDim>
-__global__ void resetGridKernel(const Index *indicePairUnique,
-                                tv::TensorView<IndexGrid> gridsOut,
-                                int numAct) {
-  for (int ix : tv::KernelLoopX<int>(numAct)) {
-    const Index cell = indicePairUnique[ix];
-    gridsOut[cell] = -1;
-  }
-}
-
-// Write -1 into gridsOut at the flattened position of each of the first
-// numAct rows of `indices`. A row holds NDim + 1 values: column 0 scales
-// spatialVolume, columns 1..NDim are flattened with rowArrayIdx.
-template <typename Index, typename IndexGrid, unsigned NDim>
-__global__ void resetGridSubMKernel(
-    const Index *indices, tv::TensorView<IndexGrid> gridsOut,
-    const tv::SimpleVector<Index, NDim> outSpatialShape, int numAct) {
-  // Local int copy of the shape, handed to rowArrayIdx below.
-  int shapeLocal[NDim];
-  Index spatialVolume = 1;
-#pragma unroll
-  for (int d = 0; d < NDim; ++d) {
-    shapeLocal[d] = outSpatialShape[d];
-    spatialVolume *= outSpatialShape[d];
-  }
-  for (int ix : tv::KernelLoopX<int>(numAct)) {
-    const Index *row = indices + ix * (NDim + 1);
-    const Index spatial = tv::rowArrayIdx<Index, NDim>(row + 1, shapeLocal);
-    gridsOut[spatial + spatialVolume * row[0]] = -1;
-  }
-}
-
-}  // namespace spconv
-
-#endif
--- a/mmdet3d/ops/spconv/include/spconv/indice.h
+++ b/mmdet3d/ops/spconv/include/spconv/indice.h
-// Copyright 2019 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef SPARSE_CONV_INDICE_FUNCTOR_H_
-#define SPARSE_CONV_INDICE_FUNCTOR_H_
-#include <tensorview/tensorview.h>
-
-namespace spconv {
-namespace functor {
-/**
- * Device-dispatched functor interface; only operator() is declared here and
- * no definition is visible in this header.
- *
- * Takes input/output index views, a grid view, the index-pair and pair-count
- * views, a scratch view for unique indices, the convolution geometry
- * (kernel size, stride, padding, dilation, output shape) and a transpose
- * flag. Returns an Index.
- * NOTE(review): presumably the first of two phases ("P1") of index-pair
- * generation - confirm against the implementation.
- */ template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
-struct CreateConvIndicePairFunctorP1 {
-  Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
-                   tv::TensorView<Index> indicesOut,
-                   tv::TensorView<IndexGrid> gridsOut,
-                   tv::TensorView<Index> indicePairs,
-                   tv::TensorView<Index> indiceNum,
-                   tv::TensorView<Index> indicePairUnique,
-                   const tv::SimpleVector<Index, NDim> kernelSize,
-                   const tv::SimpleVector<Index, NDim> stride,
-                   const tv::SimpleVector<Index, NDim> padding,
-                   const tv::SimpleVector<Index, NDim> dilation,
-                   const tv::SimpleVector<Index, NDim> outSpatialShape,
-                   bool transpose);
-};
-
-/**
- * Device-dispatched functor interface; only operator() is declared here and
- * no definition is visible in this header.
- *
- * Takes the same index, grid, pair and unique-index views as the P1 functor
- * plus the output shape, but no kernel geometry. resetGrid defaults to
- * false. Returns an Index.
- * NOTE(review): presumably the second phase ("P2") run after P1 - confirm
- * against the implementation.
- */ template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
-struct CreateConvIndicePairFunctorP2 {
-  Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
-                   tv::TensorView<Index> indicesOut,
-                   tv::TensorView<IndexGrid> gridsOut,
-                   tv::TensorView<Index> indicePairs,
-                   tv::TensorView<Index> indiceNum,
-                   tv::TensorView<Index> indicePairUnique,
-                   const tv::SimpleVector<Index, NDim> outSpatialShape,
-                   bool transpose, bool resetGrid = false);
-};
-
-/**
- * Device-dispatched functor interface; only operator() is declared here and
- * no definition is visible in this header.
- *
- * Single-call variant: takes input/output index views, a grid view, the
- * index-pair and pair-count views and the full convolution geometry, with
- * no separate unique-index scratch view. resetGrid defaults to false.
- * Returns an Index.
- */ template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
-struct CreateConvIndicePairFunctor {
-  Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
-                   tv::TensorView<Index> indicesOut,
-                   tv::TensorView<IndexGrid> gridsOut,
-                   tv::TensorView<Index> indicePairs,
-                   tv::TensorView<Index> indiceNum,
-                   const tv::SimpleVector<Index, NDim> kernelSize,
-                   const tv::SimpleVector<Index, NDim> stride,
-                   const tv::SimpleVector<Index, NDim> padding,
-                   const tv::SimpleVector<Index, NDim> dilation,
-                   const tv::SimpleVector<Index, NDim> outSpatialShape,
-                   bool transpose, bool resetGrid = false);
-};
-
-/**
- * Device-dispatched functor interface; only operator() is declared here and
- * no definition is visible in this header.
- *
- * Unlike the other index-pair functors it takes no indicesOut view: the
- * arguments are the input indices, a grid view, the index-pair and
- * pair-count views and the convolution geometry. resetGrid defaults to
- * false. Returns an Index.
- * NOTE(review): "SubM" presumably stands for submanifold convolution -
- * confirm against the implementation.
- */ template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
-struct CreateSubMIndicePairFunctor {
-  Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
-                   tv::TensorView<IndexGrid> gridsOut,
-                   tv::TensorView<Index> indicePairs,
-                   tv::TensorView<Index> indiceNum,
-                   const tv::SimpleVector<Index, NDim> kernelSize,
-                   const tv::SimpleVector<Index, NDim> stride,
-                   const tv::SimpleVector<Index, NDim> padding,
-                   const tv::SimpleVector<Index, NDim> dilation,
-                   const tv::SimpleVector<Index, NDim> outSpatialShape,
-                   bool transpose, bool resetGrid = false);
-};
-}  // namespace functor
-}  // namespace spconv
-
-#endif
--- a/mmdet3d/ops/spconv/include/spconv/maxpool.h
+++ b/mmdet3d/ops/spconv/include/spconv/maxpool.h
-// Copyright 2019 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef SPARSE_MAXPOOL_FUNCTOR_H_
-#define SPARSE_MAXPOOL_FUNCTOR_H_
-#include <tensorview/tensorview.h>
-
-namespace spconv {
-namespace functor {
-/**
- * Device-dispatched forward max-pool functor interface; only operator() is
- * declared here and no definition is visible in this header.
- *
- * Arguments: a writable output feature view, a read-only input feature
- * view, a read-only index view and an element count `size`.
- */ template <typename Device, typename T, typename Index>
-struct SparseMaxPoolForwardFunctor {
-  void operator()(const Device& d, tv::TensorView<T> outFeatures,
-                  tv::TensorView<const T> inFeatures,
-                  tv::TensorView<const Index> indices, int size);
-};
-
-/**
- * Device-dispatched backward max-pool functor interface; only operator() is
- * declared here and no definition is visible in this header.
- *
- * Arguments: read-only views of the forward output, the forward input and
- * the incoming gradient `dout`, a writable gradient view `din`, a read-only
- * index view and an element count `size`.
- */ template <typename Device, typename T, typename Index>
-struct SparseMaxPoolBackwardFunctor {
-  void operator()(const Device& d, tv::TensorView<const T> outFeatures,
-                  tv::TensorView<const T> inFeatures,
-                  tv::TensorView<const T> dout, tv::TensorView<T> din,
-                  tv::TensorView<const Index> indices, int size);
-};
-
-}  // namespace functor
-}  // namespace spconv
-
-#endif
--- a/mmdet3d/ops/spconv/include/spconv/mp_helper.h
+++ b/mmdet3d/ops/spconv/include/spconv/mp_helper.h
-#ifndef MP_HELPER_H_
-#define MP_HELPER_H_
-#include <type_traits>
-#include <utility>
-
-namespace spconv {
-/** Empty tag type that carries a pack of types as a compile-time list. */ template <class... T>
-struct mp_list {};
-
-/** mp_list whose elements are std::integral_constant<T, I> for each value I. */ template <class T, T... I>
-using mp_list_c = mp_list<std::integral_constant<T, I>...>;
-
-namespace detail {
-
-/**
- * Call f once with a default-constructed object of each type in the list,
- * then return f. The calls are expanded inside a braced initializer list,
- * and the comma expression keeps the body a single return statement.
- * NOTE(review): std::initializer_list is used, but this header includes
- * only <type_traits> and <utility>.
- */ template <class... T, class F>
-constexpr F mp_for_each_impl(mp_list<T...>, F &&f) {
-  return std::initializer_list<int>{(f(T()), 0)...}, std::forward<F>(f);
-}
-
-/** Overload for the empty list: nothing to call, just hand f back. */ template <class F>
-constexpr F mp_for_each_impl(mp_list<>, F &&f) {
-  return std::forward<F>(f);
-}
-
-}  // namespace detail
-
-namespace detail {
-
-/**
- * Primary template: deliberately has no nested `type`, so using it with a
- * first argument that is not a class-template instantiation fails.
- */ template <class A, template <class...> class B>
-struct mp_rename_impl {
-  // An error "no type named 'type'" here means that the first argument to
-  // mp_rename is not a list
-};
-
-/**
- * Specialization for A<T...>: exposes B<T...>, i.e. the same type
- * arguments re-wrapped in template B.
- */ template <template <class...> class A, class... T, template <class...> class B>
-struct mp_rename_impl<A<T...>, B> {
-  using type = B<T...>;
-};
-
-}  // namespace detail
-
-/** Alias for B<T...> when A is some X<T...>; see detail::mp_rename_impl. */ template <class A, template <class...> class B>
-using mp_rename = typename detail::mp_rename_impl<A, B>::type;
-
-/**
- * Call f with a default-constructed object of every element type of list L
- * and return f. L is first re-wrapped as an mp_list, so any class-template
- * instantiation can serve as the list.
- */ template <class L, class F>
-constexpr F mp_for_each(F &&f) {
-  return detail::mp_for_each_impl(mp_rename<L, mp_list>(), std::forward<F>(f));
-}
-}  // namespace spconv
-
-#endif
--- a/mmdet3d/ops/spconv/include/spconv/point2voxel.h
+++ b/mmdet3d/ops/spconv/include/spconv/point2voxel.h
-// Copyright 2019 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <pybind11/pybind11.h>
-// must include pybind11/eigen.h if using eigen matrix as arguments.
-// must include pybind11/stl.h if using containers in STL in arguments.
-#include <pybind11/numpy.h>
-#include <pybind11/stl.h>
-
-#include <algorithm>
-// #include <vector>
-#include <math.h>
-
-#include <iostream>
-
-namespace spconv {
-namespace py = pybind11;
-using namespace pybind11::literals;
-
-// Bin points into voxels.
-//
-// Each point is mapped to an integer grid cell from its first NDim columns;
-// points outside the grid are skipped. The first point seen in a cell opens
-// a new voxel (at most max_voxels are opened) and records the cell in
-// `coors`, with the axis order reversed relative to the point columns. Up to
-// max_points points are copied into each voxel and counted in
-// num_points_per_voxel. coor_to_voxelidx serves as the cell -> voxel lookup
-// and is set back to -1 for every opened voxel before returning.
-//
-// Returns the number of voxels opened.
-template <typename DType, int NDim>
-int points_to_voxel_3d_np(py::array_t<DType> points, py::array_t<DType> voxels,
-                          py::array_t<int> coors,
-                          py::array_t<int> num_points_per_voxel,
-                          py::array_t<int> coor_to_voxelidx,
-                          std::vector<DType> voxel_size,
-                          std::vector<DType> coors_range, int max_points,
-                          int max_voxels) {
-  auto pts = points.template mutable_unchecked<2>();
-  auto vox = voxels.template mutable_unchecked<3>();
-  auto vox_coors = coors.mutable_unchecked<2>();
-  auto counts = num_points_per_voxel.mutable_unchecked<1>();
-  auto lookup = coor_to_voxelidx.mutable_unchecked<NDim>();
-  auto num_pts = pts.shape(0);
-  auto num_feats = pts.shape(1);
-
-  int grid_size[NDim];
-  for (int d = 0; d < NDim; ++d) {
-    grid_size[d] =
-        round((coors_range[NDim + d] - coors_range[d]) / voxel_size[d]);
-  }
-
-  int cell[NDim];
-  int opened = 0;
-  for (int p = 0; p < num_pts; ++p) {
-    // Locate the grid cell; stop at the first axis that falls outside.
-    bool inside = true;
-    for (int d = 0; d < NDim; ++d) {
-      const int bin = floor((pts(p, d) - coors_range[d]) / voxel_size[d]);
-      if (bin < 0 || bin >= grid_size[d]) {
-        inside = false;
-        break;
-      }
-      cell[NDim - 1 - d] = bin;
-    }
-    if (!inside) continue;
-
-    int slot = lookup(cell[0], cell[1], cell[2]);
-    if (slot == -1) {
-      if (opened >= max_voxels) continue;
-      slot = opened;
-      ++opened;
-      lookup(cell[0], cell[1], cell[2]) = slot;
-      for (int d = 0; d < NDim; ++d) {
-        vox_coors(slot, d) = cell[d];
-      }
-    }
-
-    const int filled = counts(slot);
-    if (filled >= max_points) continue;
-    for (int f = 0; f < num_feats; ++f) {
-      vox(slot, filled, f) = pts(p, f);
-    }
-    counts(slot) += 1;
-  }
-
-  // Leave the lookup table as it was found (-1) for the cells that were used.
-  for (int v = 0; v < opened; ++v) {
-    lookup(vox_coors(v, 0), vox_coors(v, 1), vox_coors(v, 2)) = -1;
-  }
-  return opened;
-}
-
-// Bin points into voxels and keep a running per-voxel feature mean.
-//
-// Binning follows the same scheme as points_to_voxel_3d_np: out-of-grid
-// points are skipped, at most max_voxels voxels are opened and at most
-// max_points points are stored per voxel. Every stored point also updates
-// `means` incrementally. After binning, the unused point slots of each
-// voxel are filled with that voxel's mean and the lookup table entries are
-// set back to -1.
-//
-// Returns the number of voxels opened.
-template <typename DType, int NDim>
-int points_to_voxel_3d_np_mean(py::array_t<DType> points,
-                               py::array_t<DType> voxels,
-                               py::array_t<DType> means, py::array_t<int> coors,
-                               py::array_t<int> num_points_per_voxel,
-                               py::array_t<int> coor_to_voxelidx,
-                               std::vector<DType> voxel_size,
-                               std::vector<DType> coors_range, int max_points,
-                               int max_voxels) {
-  auto pts = points.template mutable_unchecked<2>();
-  auto mean_acc = means.template mutable_unchecked<2>();
-  auto vox = voxels.template mutable_unchecked<3>();
-  auto vox_coors = coors.mutable_unchecked<2>();
-  auto counts = num_points_per_voxel.mutable_unchecked<1>();
-  auto lookup = coor_to_voxelidx.mutable_unchecked<NDim>();
-  auto num_pts = pts.shape(0);
-  auto num_feats = pts.shape(1);
-
-  int grid_size[NDim];
-  for (int d = 0; d < NDim; ++d) {
-    grid_size[d] =
-        round((coors_range[NDim + d] - coors_range[d]) / voxel_size[d]);
-  }
-
-  int cell[NDim];
-  int opened = 0;
-  for (int p = 0; p < num_pts; ++p) {
-    // Locate the grid cell; stop at the first axis that falls outside.
-    bool inside = true;
-    for (int d = 0; d < NDim; ++d) {
-      const int bin = floor((pts(p, d) - coors_range[d]) / voxel_size[d]);
-      if (bin < 0 || bin >= grid_size[d]) {
-        inside = false;
-        break;
-      }
-      cell[NDim - 1 - d] = bin;
-    }
-    if (!inside) continue;
-
-    int slot = lookup(cell[0], cell[1], cell[2]);
-    if (slot == -1) {
-      if (opened >= max_voxels) continue;
-      slot = opened;
-      ++opened;
-      lookup(cell[0], cell[1], cell[2]) = slot;
-      for (int d = 0; d < NDim; ++d) {
-        vox_coors(slot, d) = cell[d];
-      }
-    }
-
-    const int filled = counts(slot);
-    if (filled >= max_points) continue;
-    for (int f = 0; f < num_feats; ++f) {
-      vox(slot, filled, f) = pts(p, f);
-    }
-    counts(slot) += 1;
-    // Incremental mean over the (filled + 1) points stored so far.
-    for (int f = 0; f < num_feats; ++f) {
-      mean_acc(slot, f) +=
-          (pts(p, f) - mean_acc(slot, f)) / DType(filled + 1);
-    }
-  }
-
-  for (int v = 0; v < opened; ++v) {
-    lookup(vox_coors(v, 0), vox_coors(v, 1), vox_coors(v, 2)) = -1;
-    // Pad the empty tail of the voxel with its mean feature vector.
-    for (int s = counts(v); s < max_points; ++s) {
-      for (int f = 0; f < num_feats; ++f) {
-        vox(v, s, f) = mean_acc(v, f);
-      }
-    }
-  }
-  return opened;
-}
-
-// Bin points into voxels and compute a per-voxel, per-feature value range.
-//
-// Binning follows the same scheme as points_to_voxel_3d_np. While points
-// are stored, `height` tracks the per-feature minimum and `maxs` the
-// per-feature maximum of each voxel; after binning `height` is overwritten
-// with (max - min) and the lookup table entries are set back to -1.
-// NOTE(review): `height` and `maxs` are only updated through std::min and
-// std::max, so the caller presumably pre-fills them - confirm.
-//
-// Returns the number of voxels opened.
-template <typename DType, int NDim>
-int points_to_voxel_3d_np_height(
-    py::array_t<DType> points, py::array_t<DType> voxels,
-    py::array_t<DType> height, py::array_t<DType> maxs, py::array_t<int> coors,
-    py::array_t<int> num_points_per_voxel, py::array_t<int> coor_to_voxelidx,
-    std::vector<DType> voxel_size, std::vector<DType> coors_range,
-    int max_points, int max_voxels) {
-  auto pts = points.template mutable_unchecked<2>();
-  auto lows = height.template mutable_unchecked<2>();
-  auto highs = maxs.template mutable_unchecked<2>();
-  auto vox = voxels.template mutable_unchecked<3>();
-  auto vox_coors = coors.mutable_unchecked<2>();
-  auto counts = num_points_per_voxel.mutable_unchecked<1>();
-  auto lookup = coor_to_voxelidx.mutable_unchecked<NDim>();
-  auto num_pts = pts.shape(0);
-  auto num_feats = pts.shape(1);
-
-  int grid_size[NDim];
-  for (int d = 0; d < NDim; ++d) {
-    grid_size[d] =
-        round((coors_range[NDim + d] - coors_range[d]) / voxel_size[d]);
-  }
-
-  int cell[NDim];
-  int opened = 0;
-  for (int p = 0; p < num_pts; ++p) {
-    // Locate the grid cell; stop at the first axis that falls outside.
-    bool inside = true;
-    for (int d = 0; d < NDim; ++d) {
-      const int bin = floor((pts(p, d) - coors_range[d]) / voxel_size[d]);
-      if (bin < 0 || bin >= grid_size[d]) {
-        inside = false;
-        break;
-      }
-      cell[NDim - 1 - d] = bin;
-    }
-    if (!inside) continue;
-
-    int slot = lookup(cell[0], cell[1], cell[2]);
-    if (slot == -1) {
-      if (opened >= max_voxels) continue;
-      slot = opened;
-      ++opened;
-      lookup(cell[0], cell[1], cell[2]) = slot;
-      for (int d = 0; d < NDim; ++d) {
-        vox_coors(slot, d) = cell[d];
-      }
-    }
-
-    const int filled = counts(slot);
-    if (filled >= max_points) continue;
-    for (int f = 0; f < num_feats; ++f) {
-      vox(slot, filled, f) = pts(p, f);
-      lows(slot, f) = std::min(pts(p, f), lows(slot, f));
-      highs(slot, f) = std::max(pts(p, f), highs(slot, f));
-    }
-    counts(slot) += 1;
-  }
-
-  for (int v = 0; v < opened; ++v) {
-    lookup(vox_coors(v, 0), vox_coors(v, 1), vox_coors(v, 2)) = -1;
-    // Turn the stored minimum into the extent (max - min).
-    for (int f = 0; f < num_feats; ++f) {
-      lows(v, f) = highs(v, f) - lows(v, f);
-    }
-  }
-  return opened;
-}
-
-// Clear the mask entry of every point that lies in a voxel whose extent
-// along point column 2 is smaller than eps.
-//
-// Pass 1 assigns a voxel id to each occupied grid cell and tracks the
-// minimum (`height`) and maximum (`maxs`) of column 2 per voxel. Pass 2
-// revisits the in-grid points and writes 0 into `mask` where
-// (max - min) < eps. Points outside the grid are skipped in both passes.
-//
-// Fixes over the previous revision, which could not be instantiated:
-//   * `mask` is written through an accessor (py::array_t has no operator());
-//   * `height` is indexed with one index, matching its 1-D accessor;
-//   * the function returns the number of voxels found instead of falling
-//     off the end of a non-void function.
-//
-// NOTE(review): `height`/`maxs` are only updated through std::min/std::max,
-// so the caller presumably pre-fills them; `max_voxels` is not enforced and
-// coor_to_voxelidx is not reset to -1 afterwards, unlike the
-// points_to_voxel_* functions - confirm both are intended.
-//
-// Returns the number of distinct occupied voxels.
-template <typename DType, int NDim>
-int block_filtering(py::array_t<DType> points, py::array_t<int> mask,
-                    py::array_t<DType> height, py::array_t<DType> maxs,
-                    py::array_t<int> coor_to_voxelidx,
-                    std::vector<DType> voxel_size,
-                    std::vector<DType> coors_range, int max_voxels, DType eps) {
-  auto points_rw = points.template mutable_unchecked<2>();
-  auto mask_rw = mask.mutable_unchecked<1>();
-  auto height_rw = height.template mutable_unchecked<1>();
-  auto maxs_rw = maxs.template mutable_unchecked<1>();
-  auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
-  auto N = points_rw.shape(0);
-  constexpr int ndim_minus_1 = NDim - 1;
-  (void)max_voxels;  // kept for interface compatibility; see NOTE above
-
-  int grid_size[NDim];
-  for (int i = 0; i < NDim; ++i) {
-    grid_size[i] =
-        round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
-  }
-
-  int coor[NDim];
-  // Compute the grid cell of point i into `coor` (axis order reversed);
-  // returns false when the point falls outside the grid.
-  auto locate = [&](decltype(N) i) -> bool {
-    for (int j = 0; j < NDim; ++j) {
-      int c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
-      if (c < 0 || c >= grid_size[j]) {
-        return false;
-      }
-      coor[ndim_minus_1 - j] = c;
-    }
-    return true;
-  };
-
-  int voxel_num = 0;
-  // Pass 1: assign voxel ids and accumulate the column-2 range per voxel.
-  for (decltype(N) i = 0; i < N; ++i) {
-    if (!locate(i)) continue;
-    int voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
-    if (voxelidx == -1) {
-      voxelidx = voxel_num;
-      voxel_num += 1;
-      coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
-    }
-    height_rw(voxelidx) = std::min(points_rw(i, 2), height_rw(voxelidx));
-    maxs_rw(voxelidx) = std::max(points_rw(i, 2), maxs_rw(voxelidx));
-  }
-
-  // Pass 2: mask out points whose voxel is flatter than eps.
-  for (decltype(N) i = 0; i < N; ++i) {
-    if (!locate(i)) continue;
-    int voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
-    if ((maxs_rw(voxelidx) - height_rw(voxelidx)) < eps) {
-      mask_rw(i) = 0;
-    }
-  }
-  return voxel_num;
-}
-
-// Bin points into voxels and flag voxels whose neighbourhood spans more
-// than height_threshold along point column 2.
-//
-// Binning follows the same scheme as points_to_voxel_3d_np. In addition,
-// cells are grouped into coarse blocks of block_factor cells along cell
-// axes 1 and 2, and `mins`/`maxs` track the column-2 range of the points
-// stored in each block. After binning, each voxel looks at a window of
-// block_size blocks around its own block (clamped to the block grid) and
-// voxel_mask is set to whether (max - min) over that window exceeds
-// height_threshold. Lookup table entries are set back to -1.
-// NOTE(review): `mins`/`maxs` are only updated through std::min/std::max,
-// so the caller presumably pre-fills them - confirm.
-//
-// Returns the number of voxels opened.
-template <typename DType, int NDim>
-int points_to_voxel_3d_with_filtering(
-    py::array_t<DType> points, py::array_t<DType> voxels,
-    py::array_t<int> voxel_mask, py::array_t<DType> mins,
-    py::array_t<DType> maxs, py::array_t<int> coors,
-    py::array_t<int> num_points_per_voxel, py::array_t<int> coor_to_voxelidx,
-    std::vector<DType> voxel_size, std::vector<DType> coors_range,
-    int max_points, int max_voxels, int block_factor, int block_size,
-    DType height_threshold) {
-  auto pts = points.template mutable_unchecked<2>();
-  auto block_min = mins.template mutable_unchecked<2>();
-  auto block_max = maxs.template mutable_unchecked<2>();
-  auto vox = voxels.template mutable_unchecked<3>();
-  auto keep = voxel_mask.template mutable_unchecked<1>();
-  auto vox_coors = coors.mutable_unchecked<2>();
-  auto counts = num_points_per_voxel.mutable_unchecked<1>();
-  auto lookup = coor_to_voxelidx.mutable_unchecked<NDim>();
-  auto num_pts = pts.shape(0);
-  auto num_feats = pts.shape(1);
-
-  int grid_size[NDim];
-  for (int d = 0; d < NDim; ++d) {
-    grid_size[d] =
-        round((coors_range[NDim + d] - coors_range[d]) / voxel_size[d]);
-  }
-  // Extent of the coarse block grid along its two axes.
-  const int blocks_h = grid_size[1] / block_factor;
-  const int blocks_w = grid_size[0] / block_factor;
-
-  int cell[NDim];
-  int opened = 0;
-  for (int p = 0; p < num_pts; ++p) {
-    // Locate the grid cell; stop at the first axis that falls outside.
-    bool inside = true;
-    for (int d = 0; d < NDim; ++d) {
-      const int bin = floor((pts(p, d) - coors_range[d]) / voxel_size[d]);
-      if (bin < 0 || bin >= grid_size[d]) {
-        inside = false;
-        break;
-      }
-      cell[NDim - 1 - d] = bin;
-    }
-    if (!inside) continue;
-
-    int slot = lookup(cell[0], cell[1], cell[2]);
-    if (slot == -1) {
-      if (opened >= max_voxels) continue;
-      slot = opened;
-      ++opened;
-      lookup(cell[0], cell[1], cell[2]) = slot;
-      for (int d = 0; d < NDim; ++d) {
-        vox_coors(slot, d) = cell[d];
-      }
-    }
-
-    const int filled = counts(slot);
-    if (filled >= max_points) continue;
-    for (int f = 0; f < num_feats; ++f) {
-      vox(slot, filled, f) = pts(p, f);
-    }
-    // Fold this point's column-2 value into its block's range.
-    const int by = cell[1] / block_factor;
-    const int bx = cell[2] / block_factor;
-    block_min(by, bx) = std::min(pts(p, 2), block_min(by, bx));
-    block_max(by, bx) = std::max(pts(p, 2), block_max(by, bx));
-    counts(slot) += 1;
-  }
-
-  const int half = block_size / 2;
-  for (int v = 0; v < opened; ++v) {
-    const int c1 = vox_coors(v, 1);
-    const int c2 = vox_coors(v, 2);
-    lookup(vox_coors(v, 0), c1, c2) = -1;
-
-    const int by = c1 / block_factor;
-    const int bx = c2 / block_factor;
-    DType lo = block_min(by, bx);
-    DType hi = block_max(by, bx);
-
-    // Window of block_size blocks around (by, bx), clamped to the grid.
-    const int row_begin = std::max(0, by - half);
-    const int row_end = std::min(blocks_h, by + block_size - half);
-    const int col_begin = std::max(0, bx - half);
-    const int col_end = std::min(blocks_w, bx + block_size - half);
-    for (int r = row_begin; r < row_end; ++r) {
-      for (int c = col_begin; c < col_end; ++c) {
-        lo = std::min(lo, block_min(r, c));
-        hi = std::max(hi, block_max(r, c));
-      }
-    }
-    keep(v) = (hi - lo) > height_threshold;
-  }
-  return opened;
-}
-
-}  // namespace spconv
--- a/mmdet3d/ops/spconv/include/spconv/pool_ops.h
+++ b/mmdet3d/ops/spconv/include/spconv/pool_ops.h
-// Copyright 2019 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef SPARSE_POOL_OP_H_
-#define SPARSE_POOL_OP_H_
-
-#include <cuda_runtime_api.h>
-#include <spconv/maxpool.h>
-#include <torch/script.h>
-#include <torch_utils.h>
-#include <utility/timer.h>
-
-namespace spconv {
-// Forward sparse max pooling.
-//
-// Allocates a zero-initialised {numAct, features.size(1)} output with the
-// dtype and device of `features`, then, for every kernel offset i with a
-// positive pair count, runs SparseMaxPoolForwardFunctor on the i-th
-// subview of indicePairs. Pair counts are copied to the CPU first so they
-// can be read on the host; they are read as `int`. A CPU `features` tensor
-// selects the CPU functor; any other device takes the GPU path and checks
-// for CUDA errors after each launch.
-//
-// Returns the pooled output tensor.
-template <typename T>
-torch::Tensor indiceMaxPool(torch::Tensor features, torch::Tensor indicePairs,
-                            torch::Tensor indiceNum, int64_t numAct) {
-  auto device = features.device().type();
-  auto kernelVolume = indicePairs.size(0);
-  auto numInPlanes = features.size(1);
-  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
-  auto options =
-      torch::TensorOptions().dtype(features.dtype()).device(features.device());
-  torch::Tensor output = torch::zeros({numAct, numInPlanes}, options);
-  for (int i = 0; i < kernelVolume; ++i) {
-    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
-    if (nHot <= 0) {
-      continue;  // no pairs recorded for this kernel offset
-    }
-    if (device == torch::kCPU) {
-      functor::SparseMaxPoolForwardFunctor<tv::CPU, T, int> forwardFtor;
-      forwardFtor(tv::CPU(), tv::torch2tv<T>(output),
-                  tv::torch2tv<const T>(features),
-                  tv::torch2tv<const int>(indicePairs).subview(i), nHot);
-    } else {
-      functor::SparseMaxPoolForwardFunctor<tv::GPU, T, int> forwardFtor;
-      forwardFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
-                  tv::torch2tv<const T>(features),
-                  tv::torch2tv<const int>(indicePairs).subview(i), nHot);
-      TV_CHECK_CUDA_ERR();
-    }
-  }
-  return output;
-}
-
-// Backward sparse max pooling.
-//
-// Allocates a zero-initialised gradient shaped like `features` and, for
-// every kernel offset k with a positive pair count, runs
-// SparseMaxPoolBackwardFunctor on the k-th subview of indicePairs. Pair
-// counts are copied to the CPU first and read as `int`. A CPU `features`
-// tensor selects the CPU functor; any other device takes the GPU path and
-// checks for CUDA errors after each launch.
-//
-// Returns the gradient with respect to `features`.
-template <typename T>
-torch::Tensor indiceMaxPoolBackward(torch::Tensor features,
-                                    torch::Tensor outFeatures,
-                                    torch::Tensor outGrad,
-                                    torch::Tensor indicePairs,
-                                    torch::Tensor indiceNum) {
-  const auto deviceType = features.device().type();
-  // Not used below; the call is kept so the function still queries
-  // dimension 1 of `features` exactly as before.
-  auto numInPlanes = features.size(1);
-  auto pairCountsCpu = indiceNum.to({torch::kCPU});
-  auto gradOptions =
-      torch::TensorOptions().dtype(features.dtype()).device(features.device());
-  torch::Tensor inputGrad = torch::zeros(features.sizes(), gradOptions);
-  const auto kernelVolume = indicePairs.size(0);
-  const bool onCpu = (deviceType == torch::kCPU);
-  for (int k = 0; k < kernelVolume; ++k) {
-    const auto pairCount = pairCountsCpu.data_ptr<int>()[k];
-    if (pairCount > 0) {
-      if (onCpu) {
-        functor::SparseMaxPoolBackwardFunctor<tv::CPU, T, int> backwardFtor;
-        backwardFtor(tv::CPU(), tv::torch2tv<const T>(outFeatures),
-                     tv::torch2tv<const T>(features),
-                     tv::torch2tv<const T>(outGrad),
-                     tv::torch2tv<T>(inputGrad),
-                     tv::torch2tv<const int>(indicePairs).subview(k),
-                     pairCount);
-      } else {
-        functor::SparseMaxPoolBackwardFunctor<tv::GPU, T, int> backwardFtor;
-        backwardFtor(tv::TorchGPU(), tv::torch2tv<const T>(outFeatures),
-                     tv::torch2tv<const T>(features),
-                     tv::torch2tv<const T>(outGrad),
-                     tv::torch2tv<T>(inputGrad),
-                     tv::torch2tv<const int>(indicePairs).subview(k),
-                     pairCount);
-        TV_CHECK_CUDA_ERR();
-      }
-    }
-  }
-  return inputGrad;
-}
-
-}  // namespace spconv
-
-#endif
--- a/mmdet3d/ops/spconv/include/spconv/reordering.cu.h
+++ b/mmdet3d/ops/spconv/include/spconv/reordering.cu.h
-// Copyright 2019 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef REORDERING_CU_H_
-#define REORDERING_CU_H_
-#include <tensorview/helper_kernel.cu.h>
-
-// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
-namespace spconv {
-/**
- * Gather rows of `features` into `buffer`: for every r below `size`,
- * buffer row r receives features row indices[r], each row being numPlanes
- * elements long.
- *
- * Each pass of the outer loop handles up to NumILP rows per thread, spaced
- * gridDim.x * blockDim.x apart; rows at or beyond `size` are skipped by the
- * bounds checks. NumTLP is not referenced in the body.
- */ template <typename T, typename Index, int NumTLP, int NumILP>
-__global__ void gatherGenericKernel(T *buffer, const T *features,
-                                    const Index *indices, int size,
-                                    int numPlanes) {
-  int ILPStrideX[NumILP];  // row distance of each of the NumILP rows from ix
-  Index inds[NumILP];  // start offset in `features` of each gathered row
-#pragma unroll
-  for (int ilp = 0; ilp < NumILP; ilp++)
-    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
-
-  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
-#pragma unroll
-    for (int ilp = 0; ilp < NumILP; ilp++) {
-      if (ix + ILPStrideX[ilp] < size)
-        inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes;  // row index -> element offset
-    }
-    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
-#pragma unroll
-      for (int ilp = 0; ilp < NumILP; ++ilp) {
-        if (ix + ILPStrideX[ilp] < size)
-          buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy] =
-              features[inds[ilp] + iy];
-      }
-    }
-  }
-}
-
-template <typename T, typename Index, int NumTLP, int NumILP, typename VecType>
-__global__ void gatherVecKernel(T *buffer, const T *features,
-                                const Index *indices, int size, int numPlanes) {
-  int ILPStrideX[NumILP];
-  Index inds[NumILP];
-#pragma unroll
-  for (int ilp = 0; ilp < NumILP; ilp++)
-    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
-
-  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
-#pragma unroll
-    for (int ilp = 0; ilp < NumILP; ilp++) {
-      if (ix + ILPStrideX[ilp] < size)
-        inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes;
-    }
-    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
-#pragma unroll
-      for (int ilp = 0; ilp < NumILP; ++ilp) {
-        if (ix + ILPStrideX[ilp] < size)
-          reinterpret_cast<VecType *>(
-              buffer)[(ix + ILPStrideX[ilp]) * numPlanes + iy] =
-              reinterpret_cast<const VecType *>(features)[inds[ilp] + iy];
-      }
-    }
-  }
-}
-
-template <typename T, typename Index, int NumTLP, int NumILP,
-          typename VecType = int4>
-__global__ void gatherVecBlockKernel(T *buffer, const T *features,
-                                     const Index *indices, int size,
-                                     int numPlanes) {
-  int ILPStrideY[NumILP];
-#pragma unroll
-  for (int ilp = 0; ilp < NumILP; ilp++)
-    ILPStrideY[ilp] = ilp * gridDim.y * blockDim.y;
-  features += blockIdx.x * NumTLP;
-  buffer += blockIdx.x * NumTLP;
-
-  for (int iy : tv::KernelLoopY<int, NumILP>(size)) {
-#pragma unroll
-    for (int ilp = 0; ilp < NumILP; ++ilp) {
-      reinterpret_cast<VecType *>(
-          buffer)[(iy + ILPStrideY[ilp]) * numPlanes + threadIdx.x] =
-          reinterpret_cast<const VecType *>(
-              features)[indices[iy + ILPStrideY[ilp]] * numPlanes +
-                        threadIdx.x];
-    }
-  }
-}
-
-template <typename T, typename Index, int NumTLP, int NumILP>
-__global__ void scatterAddGenericKernel(T *outFeatures, const T *buffer,
-                                        const Index *indices, int size,
-                                        int numPlanes) {
-  int ILPStrideX[NumILP];
-  Index inds[NumILP];
-#pragma unroll
-  for (int ilp = 0; ilp < NumILP; ilp++)
-    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
-  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
-#pragma unroll
-    for (int ilp = 0; ilp < NumILP; ilp++) {
-      if (ix + ILPStrideX[ilp] < size)
-        inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes;
-    }
-    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
-#pragma unroll
-      for (int ilp = 0; ilp < NumILP; ++ilp) {
-        if (ix + ILPStrideX[ilp] < size) {
-          outFeatures[inds[ilp] + iy] +=
-              buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy];
-        }
-      }
-    }
-  }
-}
-
-template <typename T, typename Index, int NumTLP, int NumILP,
-          typename VecType = int4>
-__global__ void scatterAddVecBlockKernel(T *outFeatures, const T *buffer,
-                                         const Index *indices, int size,
-                                         int numPlanes) {
-  int ILPStrideY[NumILP];
-  constexpr int vecloadFactor = sizeof(VecType) / sizeof(T);
-#pragma unroll
-  for (int ilp = 0; ilp < NumILP; ilp++)
-    ILPStrideY[ilp] = ilp * gridDim.y * blockDim.y;
-  outFeatures += blockIdx.x * NumTLP;
-  buffer += blockIdx.x * NumTLP;
-  T buf[vecloadFactor];
-  T buf2[vecloadFactor];
-  Index idx;
-  for (int iy : tv::KernelLoopY<int, NumILP>(size)) {
-#pragma unroll
-    for (int ilp = 0; ilp < NumILP; ++ilp) {
-      idx = indices[iy + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
-      reinterpret_cast<VecType *>(buf)[0] =
-          reinterpret_cast<VecType *>(outFeatures)[idx];
-      reinterpret_cast<VecType *>(buf2)[0] = reinterpret_cast<const VecType *>(
-          buffer)[(iy + ILPStrideY[ilp]) * numPlanes + threadIdx.x];
-#pragma unroll
-      for (int i = 0; i < vecloadFactor; i++) {
-        buf[i] += buf2[i];
-      }
-      reinterpret_cast<VecType *>(outFeatures)[idx] =
-          reinterpret_cast<VecType *>(buf)[0];
-    }
-  }
-}
-
-}  // namespace spconv
-
-#endif
--- a/mmdet3d/ops/spconv/include/spconv/reordering.h
+++ b/mmdet3d/ops/spconv/include/spconv/reordering.h
-// Copyright 2019 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef SPARSE_REORDERING_FUNCTOR_H_
-#define SPARSE_REORDERING_FUNCTOR_H_
-#include <tensorview/tensorview.h>
-
-namespace spconv {
-namespace functor {
-template <typename Device, typename T, typename Index>
-struct SparseGatherFunctor {
-  void operator()(const Device& d, tv::TensorView<T> buffer,
-                  tv::TensorView<const T> features,
-                  tv::TensorView<const Index> indices, int size);
-};
-
-template <typename Device, typename T, typename Index>
-struct SparseScatterAddFunctor {
-  void operator()(const Device& d, tv::TensorView<T> out_features,
-                  tv::TensorView<const T> buffer,
-                  tv::TensorView<const Index> indices, int size,
-                  bool stable = false);
-};
-}  // namespace functor
-}  // namespace spconv
-
-#endif
--- a/mmdet3d/ops/spconv/include/spconv/spconv_ops.h
+++ b/mmdet3d/ops/spconv/include/spconv/spconv_ops.h
-// Copyright 2019 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef SPARSE_CONV_OP_H_
-#define SPARSE_CONV_OP_H_
-
-#include <cuda_runtime_api.h>
-#include <spconv/indice.h>
-#include <spconv/reordering.h>
-#include <torch/script.h>
-#include <torch_utils.h>
-#include <utility/timer.h>
-
-namespace spconv {
-// torch.jit's doc says only support int64, so we need to convert to int32.
-template <unsigned NDim>
-std::vector<torch::Tensor> getIndicePair(
-    torch::Tensor indices, int64_t batchSize,
-    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
-    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
-    std::vector<int64_t> padding, std::vector<int64_t> dilation,
-    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {
-  // auto timer = spconv::CudaContextTimer<>();
-  bool subM = _subM != 0;
-  bool transpose = _transpose != 0;
-  auto numAct = indices.size(0);
-  auto coorDim = indices.size(1) - 1;  // batchIdx + xyz
-  TV_ASSERT_RT_ERR(NDim == coorDim, "error");
-  TV_ASSERT_RT_ERR(kernelSize.size() == coorDim, "error");
-  TV_ASSERT_RT_ERR(outSpatialShape.size() == coorDim, "error");
-  TV_ASSERT_RT_ERR(stride.size() == coorDim, "error");
-  TV_ASSERT_RT_ERR(padding.size() == coorDim, "error");
-  TV_ASSERT_RT_ERR(outPadding.size() == coorDim, "error");
-  TV_ASSERT_RT_ERR(dilation.size() == coorDim, "error");
-  auto kernelVolume = kernelSize[0];
-  for (int i = 1; i < kernelSize.size(); ++i) {
-    kernelVolume *= kernelSize[i];
-  }
-  TV_ASSERT_RT_ERR(kernelVolume <= 4096, "error");
-  auto outputVolume = outSpatialShape[0];
-  for (int i = 1; i < outSpatialShape.size(); ++i) {
-    outputVolume *= outSpatialShape[i];
-  }
-  torch::Tensor indicePairs =
-      torch::full({kernelVolume, 2, numAct}, -1,
-                  torch::dtype(torch::kInt32).device(indices.device()));
-  torch::Tensor indiceNum = torch::zeros(
-      {kernelVolume}, torch::dtype(torch::kInt32).device(indices.device()));
-  torch::Tensor gridOut =
-      torch::full({batchSize * outputVolume}, -1,
-                  torch::dtype(torch::kInt32).device(indices.device()));
-  // std::cout << "full time " << timer.report() / 1000.0 << std::endl;
-  int64_t numActOut = -1;
-  tv::SimpleVector<int, NDim> outSpatialShape32;
-  tv::SimpleVector<int, NDim> kernelSize32;
-  tv::SimpleVector<int, NDim> stride32;
-  tv::SimpleVector<int, NDim> padding32;
-  tv::SimpleVector<int, NDim> dilation32;
-  auto indicePairUnique = torch::full(
-      {indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),
-      torch::dtype(torch::kInt32).device(indices.device()));
-  for (int i = 0; i < NDim; ++i) {
-    outSpatialShape32.push_back(outSpatialShape[i]);
-    kernelSize32.push_back(kernelSize[i]);
-    if (subM) {
-      stride32.push_back(1);
-      padding32.push_back(kernelSize[i] / 2);
-      dilation32.push_back(dilation[i]);
-    } else {
-      stride32.push_back(stride[i]);
-      padding32.push_back(padding[i]);
-      dilation32.push_back(dilation[i]);
-    }
-  }
-  if (subM) {
-    if (indices.device().type() == torch::kCPU) {
-      auto getIndicePairFtor =
-          functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim>();
-      numActOut = getIndicePairFtor(
-          tv::CPU(), tv::torch2tv<const int>(indices),
-          tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
-          tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
-          dilation32, outSpatialShape32, transpose);
-    } else {
-      auto getIndicePairFtor =
-          functor::CreateSubMIndicePairFunctor<tv::GPU, int, int, NDim>();
-      numActOut = getIndicePairFtor(
-          tv::TorchGPU(), tv::torch2tv<const int>(indices),
-          tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
-          tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
-          dilation32, outSpatialShape32, transpose);
-    }
-    return {indices, indicePairs, indiceNum};
-  } else {
-    torch::Tensor outInds =
-        torch::zeros({numAct * kernelVolume, coorDim + 1},
-                     torch::dtype(torch::kInt32).device(indices.device()));
-    if (indices.device().type() == torch::kCPU) {
-      auto getIndicePairFtor =
-          functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();
-      numActOut = getIndicePairFtor(
-          tv::CPU(), tv::torch2tv<const int>(indices),
-          tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
-          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
-          kernelSize32, stride32, padding32, dilation32, outSpatialShape32,
-          transpose);
-    } else {
-      auto getIndicePairFtorP1 =
-          functor::CreateConvIndicePairFunctorP1<tv::GPU, int, int, NDim>();
-      auto getIndicePairFtorP2 =
-          functor::CreateConvIndicePairFunctorP2<tv::GPU, int, int, NDim>();
-      numActOut = getIndicePairFtorP1(
-          tv::TorchGPU(), tv::torch2tv<const int>(indices),
-          tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
-          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
-          tv::torch2tv<int>(indicePairUnique), kernelSize32, stride32,
-          padding32, dilation32, outSpatialShape32, transpose);
-      if (numActOut > 0) {
-        auto res = torch::_unique(indicePairUnique);
-        indicePairUnique = std::get<0>(res);
-        numActOut = getIndicePairFtorP2(
-            tv::TorchGPU(), tv::torch2tv<const int>(indices),
-            tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
-            tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
-            tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose);
-      }
-    }
-    return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum};
-  }
-}
-
-template <unsigned NDim>
-std::vector<torch::Tensor> getIndicePairPreGrid(
-    torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
-    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
-    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
-    std::vector<int64_t> padding, std::vector<int64_t> dilation,
-    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {
-  // auto timer = spconv::CudaContextTimer<>();
-  bool subM = _subM != 0;
-  bool transpose = _transpose != 0;
-  auto numAct = indices.size(0);
-  auto coorDim = indices.size(1) - 1;  // batchIdx + xyz
-  TV_ASSERT_RT_ERR(NDim == coorDim, "error");
-  TV_ASSERT_RT_ERR(kernelSize.size() == coorDim, "error");
-  TV_ASSERT_RT_ERR(outSpatialShape.size() == coorDim, "error");
-  TV_ASSERT_RT_ERR(stride.size() == coorDim, "error");
-  TV_ASSERT_RT_ERR(padding.size() == coorDim, "error");
-  TV_ASSERT_RT_ERR(outPadding.size() == coorDim, "error");
-  TV_ASSERT_RT_ERR(dilation.size() == coorDim, "error");
-  auto kernelVolume = kernelSize[0];
-  for (int i = 1; i < kernelSize.size(); ++i) {
-    kernelVolume *= kernelSize[i];
-  }
-  TV_ASSERT_RT_ERR(kernelVolume <= 4096, "error");
-  auto outputVolume = outSpatialShape[0];
-  for (int i = 1; i < outSpatialShape.size(); ++i) {
-    outputVolume *= outSpatialShape[i];
-  }
-  TV_ASSERT_INVALID_ARG(gridOut.numel() >= outputVolume * batchSize, "error");
-  torch::Tensor indicePairs =
-      torch::full({kernelVolume, 2, numAct}, -1,
-                  torch::dtype(torch::kInt32).device(indices.device()));
-  torch::Tensor indiceNum = torch::zeros(
-      {kernelVolume}, torch::dtype(torch::kInt32).device(indices.device()));
-  // std::cout << "full time " << timer.report() / 1000.0 << std::endl;
-  int64_t numActOut = -1;
-  tv::SimpleVector<int, NDim> outSpatialShape32;
-  tv::SimpleVector<int, NDim> kernelSize32;
-  tv::SimpleVector<int, NDim> stride32;
-  tv::SimpleVector<int, NDim> padding32;
-  tv::SimpleVector<int, NDim> dilation32;
-  auto indicePairUnique = torch::full(
-      {indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),
-      torch::dtype(torch::kInt32).device(indices.device()));
-  for (int i = 0; i < NDim; ++i) {
-    outSpatialShape32.push_back(outSpatialShape[i]);
-    kernelSize32.push_back(kernelSize[i]);
-    if (subM) {
-      stride32.push_back(1);
-      padding32.push_back(kernelSize[i] / 2);
-      dilation32.push_back(dilation[i]);
-    } else {
-      stride32.push_back(stride[i]);
-      padding32.push_back(padding[i]);
-      dilation32.push_back(dilation[i]);
-    }
-  }
-  if (subM) {
-    if (indices.device().type() == torch::kCPU) {
-      auto getIndicePairFtor =
-          functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim>();
-      numActOut = getIndicePairFtor(
-          tv::CPU(), tv::torch2tv<const int>(indices),
-          tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
-          tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
-          dilation32, outSpatialShape32, transpose);
-      gridOut.fill_(-1);
-    } else {
-      auto getIndicePairFtor =
-          functor::CreateSubMIndicePairFunctor<tv::GPU, int, int, NDim>();
-      numActOut = getIndicePairFtor(
-          tv::TorchGPU(), tv::torch2tv<const int>(indices),
-          tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
-          tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
-          dilation32, outSpatialShape32, transpose, true);
-    }
-    return {indices, indicePairs, indiceNum};
-  } else {
-    torch::Tensor outInds =
-        torch::zeros({numAct * kernelVolume, coorDim + 1},
-                     torch::dtype(torch::kInt32).device(indices.device()));
-    if (indices.device().type() == torch::kCPU) {
-      auto getIndicePairFtor =
-          functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();
-      numActOut = getIndicePairFtor(
-          tv::CPU(), tv::torch2tv<const int>(indices),
-          tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
-          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
-          kernelSize32, stride32, padding32, dilation32, outSpatialShape32,
-          transpose, true);
-      gridOut.fill_(-1);
-    } else {
-      auto getIndicePairFtorP1 =
-          functor::CreateConvIndicePairFunctorP1<tv::GPU, int, int, NDim>();
-      auto getIndicePairFtorP2 =
-          functor::CreateConvIndicePairFunctorP2<tv::GPU, int, int, NDim>();
-      numActOut = getIndicePairFtorP1(
-          tv::TorchGPU(), tv::torch2tv<const int>(indices),
-          tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
-          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
-          tv::torch2tv<int>(indicePairUnique), kernelSize32, stride32,
-          padding32, dilation32, outSpatialShape32, transpose);
-      if (numActOut > 0) {
-        auto res = torch::_unique(indicePairUnique);
-        indicePairUnique = std::get<0>(res);
-        numActOut = getIndicePairFtorP2(
-            tv::TorchGPU(), tv::torch2tv<const int>(indices),
-            tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
-            tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
-            tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose,
-            true);
-      }
-    }
-    return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum};
-  }
-}
-
-template <typename T>
-torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
-                         torch::Tensor indicePairs, torch::Tensor indiceNum,
-                         int64_t numActOut, int64_t _inverse, int64_t _subM) {
-  bool subM = _subM != 0;
-  bool inverse = _inverse != 0;
-  auto device = features.device().type();
-  auto ndim = filters.dim() - 2;
-  auto kernelVolume = indicePairs.size(0);
-  auto numInPlanes = features.size(1);
-  auto numOutPlanes = filters.size(ndim + 1);
-  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
-  auto indicePairMaxSizeIter =
-      std::max_element(indicePairNumCpu.data_ptr<int>(),
-                       indicePairNumCpu.data_ptr<int>() + kernelVolume);
-  int indicePairMaxOffset =
-      indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
-  int indicePairMaxSize = *indicePairMaxSizeIter;
-
-  /*if (_subM){
-    std::vector<int> indicePairNumVec(indicePairNumCpu.data_ptr<int>(),
-  indicePairNumCpu.data_ptr<int>() + kernelVolume);
-    indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
-
-    auto indicePairVecMaxSizeIter = std::max_element(
-        indicePairNumVec.begin(), indicePairNumVec.end());
-    indicePairMaxSize = *indicePairVecMaxSizeIter;
-  }*/
-
-  auto options =
-      torch::TensorOptions().dtype(features.dtype()).device(features.device());
-  // auto indicePairOptions =
-  //     torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device());
-
-  torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
-  torch::Tensor inputBuffer =
-      torch::zeros({indicePairMaxSize, numInPlanes}, options);
-  torch::Tensor outputBuffer =
-      torch::zeros({indicePairMaxSize, numOutPlanes}, options);
-  filters = filters.view({-1, numInPlanes, numOutPlanes});
-  if (subM) {  // the center index of subm conv don't need gather and scatter
-               // add.
-    torch::mm_out(output, features, filters[indicePairMaxOffset]);
-  }
-  double totalGatherTime = 0;
-  double totalGEMMTime = 0;
-  double totalSAddTime = 0;
-  for (int i = 0; i < kernelVolume; ++i) {
-    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
-    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
-      continue;
-    }
-    // auto timer = spconv::CudaContextTimer<>();
-    auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr<T>(),
-                                             {nHot, numOutPlanes}, options);
-    auto inputBufferBlob = torch::from_blob(inputBuffer.data_ptr<T>(),
-                                            {nHot, numInPlanes}, options);
-
-    if (device == torch::kCPU) {
-      functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
-      gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
-                 tv::torch2tv<const T>(features),
-                 tv::torch2tv<const int>(indicePairs).subview(i, inverse),
-                 nHot);
-    } else {
-      functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
-      gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
-                 tv::torch2tv<const T>(features),
-                 tv::torch2tv<const int>(indicePairs).subview(i, inverse),
-                 nHot);
-      TV_CHECK_CUDA_ERR();
-      /* slower than SparseGatherFunctor, may due to int->long conversion
-      auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
-      auto indicePairBlob = torch::from_blob(indicePairLong.data_ptr<long>(),
-      {nHot}, indicePairOptions); torch::index_select_out(inputBufferBlob,
-      features, 0, indicePairBlob);*/
-    }
-    // totalGatherTime += timer.report() / 1000.0;
-    torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
-    // totalGEMMTime += timer.report() / 1000.0;
-
-    if (device == torch::kCPU) {
-      functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
-      scatterFtor(tv::CPU(), tv::torch2tv<T>(output),
-                  tv::torch2tv<const T>(outputBuffer),
-                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
-                  nHot, true);
-    } else {
-      functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
-      scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
-                  tv::torch2tv<const T>(outputBuffer),
-                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
-                  nHot, true);
-      TV_CHECK_CUDA_ERR();
-    }
-    // totalSAddTime += timer.report() / 1000.0;
-  }
-  // std::cout << "gather time " << totalGatherTime << std::endl;
-  // std::cout << "gemm time " << totalGEMMTime << std::endl;
-  // std::cout << "scatteradd time " << totalSAddTime << std::endl;
-  return output;
-}
-
-template <typename T>
-std::vector<torch::Tensor> indiceConvBackward(torch::Tensor features,
-                                              torch::Tensor filters,
-                                              torch::Tensor outGrad,
-                                              torch::Tensor indicePairs,
-                                              torch::Tensor indiceNum,
-                                              int64_t _inverse, int64_t _subM) {
-  bool subM = _subM != 0;
-  bool inverse = _inverse != 0;
-
-  auto device = features.device().type();
-  auto ndim = filters.dim() - 2;
-  auto kernelVolume = indicePairs.size(0);
-  auto numInPlanes = features.size(1);
-  auto numOutPlanes = filters.size(ndim + 1);
-  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
-  auto indicePairMaxSizeIter =
-      std::max_element(indicePairNumCpu.data_ptr<int>(),
-                       indicePairNumCpu.data_ptr<int>() + kernelVolume);
-  int indicePairMaxOffset =
-      indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
-  int indicePairMaxSize = *indicePairMaxSizeIter;
-  auto options =
-      torch::TensorOptions().dtype(features.dtype()).device(features.device());
-  auto filterShape = filters.sizes();
-  torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
-  torch::Tensor filtersGrad = torch::zeros(filterShape, options);
-  torch::Tensor inputBuffer =
-      torch::zeros({indicePairMaxSize, numInPlanes}, options);
-  torch::Tensor outputBuffer =
-      torch::zeros({indicePairMaxSize, numOutPlanes}, options);
-
-  filters = filters.view({-1, numInPlanes, numOutPlanes});
-  filtersGrad = filtersGrad.view({-1, numInPlanes, numOutPlanes});
-  if (subM) {
-    auto filterGradSub = filtersGrad[indicePairMaxOffset];
-    torch::mm_out(filterGradSub, features.t(), outGrad);
-    torch::mm_out(inputGrad, outGrad, filters[indicePairMaxOffset].t());
-  }
-  for (int i = 0; i < kernelVolume; ++i) {
-    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
-    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
-      continue;
-    }
-    if (device == torch::kCPU) {
-      functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
-      functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtorOut;
-      gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
-                 tv::torch2tv<const T>(features),
-                 tv::torch2tv<const int>(indicePairs).subview(i, inverse),
-                 nHot);
-      gatherFtorOut(tv::CPU(), tv::torch2tv<T>(outputBuffer),
-                    tv::torch2tv<const T>(outGrad),
-                    tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
-                    nHot);
-    } else {
-      functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
-      functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtorOut;
-      gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
-                 tv::torch2tv<const T>(features),
-                 tv::torch2tv<const int>(indicePairs).subview(i, inverse),
-                 nHot);
-      TV_CHECK_CUDA_ERR();
-      gatherFtorOut(tv::TorchGPU(), tv::torch2tv<T>(outputBuffer),
-                    tv::torch2tv<const T>(outGrad),
-                    tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
-                    nHot);
-      TV_CHECK_CUDA_ERR();
-    }
-    auto filterGradSub = filtersGrad[i];
-    auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr<T>(),
-                                             {nHot, numOutPlanes}, options);
-    auto inputBufferBlob = torch::from_blob(inputBuffer.data_ptr<T>(),
-                                            {nHot, numInPlanes}, options);
-
-    torch::mm_out(filterGradSub, inputBufferBlob.t(), outputBufferBlob);
-    torch::mm_out(inputBufferBlob, outputBufferBlob, filters[i].t());
-    if (device == torch::kCPU) {
-      functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
-      scatterFtor(tv::CPU(), tv::torch2tv<T>(inputGrad),
-                  tv::torch2tv<const T>(inputBuffer),
-                  tv::torch2tv<const int>(indicePairs).subview(i, inverse),
-                  nHot);
-    } else {
-      functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
-      scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(inputGrad),
-                  tv::torch2tv<const T>(inputBuffer),
-                  tv::torch2tv<const int>(indicePairs).subview(i, inverse),
-                  nHot);
-      TV_CHECK_CUDA_ERR();
-    }
-  }
-  return {inputGrad, filtersGrad.view(filterShape)};
-}
-
-template <typename T>
-torch::Tensor indiceConvDevelopDontUse(torch::Tensor features,
-                                       torch::Tensor filters,
-                                       torch::Tensor indicePairs,
-                                       torch::Tensor indiceNum,
-                                       int64_t numActOut, int64_t _inverse,
-                                       int64_t _subM) {
-  bool subM = _subM != 0;
-  bool inverse = _inverse != 0;
-
-  auto device = features.device().type();
-  auto ndim = filters.dim() - 2;
-  auto kernelVolume = indicePairs.size(0);
-  auto numInPlanes = features.size(1);
-  auto numOutPlanes = filters.size(ndim + 1);
-  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
-  auto totalActsTen = indicePairNumCpu.sum();
-  auto totalActs = indicePairNumCpu.data_ptr<int>()[0];
-  auto indicePairMaxSizeIter =
-      std::max_element(indicePairNumCpu.data_ptr<int>(),
-                       indicePairNumCpu.data_ptr<int>() + kernelVolume);
-  int indicePairMaxOffset =
-      indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
-  int indicePairMaxSize = *indicePairMaxSizeIter;
-  std::vector<int> indicePairNumVec(
-      indicePairNumCpu.data_ptr<int>(),
-      indicePairNumCpu.data_ptr<int>() + kernelVolume);
-  indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
-  int subRuleMaxSize =
-      *std::max_element(indicePairNumVec.begin(), indicePairNumVec.end());
-  if (subM) {
-    indicePairMaxSize = subRuleMaxSize;
-  }
-  auto timer = spconv::CudaContextTimer<>();
-  auto options =
-      torch::TensorOptions().dtype(features.dtype()).device(features.device());
-  // auto indicePairOptions =
-  //     torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device());
-
-  torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
-  torch::Tensor inputBuffer =
-      torch::zeros({kernelVolume, indicePairMaxSize, numInPlanes}, options);
-  torch::Tensor outputBuffer =
-      torch::zeros({kernelVolume, indicePairMaxSize, numOutPlanes}, options);
-  filters = filters.view({-1, numInPlanes, numOutPlanes});
-  std::cout << "create time " << timer.report() / 1000.0 << std::endl;
-  if (subM) {  // the center index of subm conv don't need gather and scatter
-               // add.
-    torch::mm_out(output, features, filters[indicePairMaxOffset]);
-  }
-  double totalGatherTime = 0;
-  double totalGEMMTime = 0;
-  double totalSAddTime = 0;
-  // auto timer = spconv::CudaContextTimer<>();
-  for (int i = 0; i < kernelVolume; ++i) {
-    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
-    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
-      continue;
-    }
-    //
-    auto outputBufferBlob = torch::from_blob(outputBuffer[i].data_ptr<T>(),
-                                             {nHot, numOutPlanes}, options);
-    auto inputBufferBlob = torch::from_blob(inputBuffer[i].data_ptr<T>(),
-                                            {nHot, numInPlanes}, options);
-    if (device == torch::kCPU) {
-      functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
-      gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBufferBlob),
-                 tv::torch2tv<const T>(features),
-                 tv::torch2tv<const int>(indicePairs).subview(i, inverse),
-                 nHot);
-    } else {
-      functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
-      gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBufferBlob),
-                 tv::torch2tv<const T>(features),
-                 tv::torch2tv<const int>(indicePairs).subview(i, inverse),
-                 nHot);
-      TV_CHECK_CUDA_ERR();
-    }
-    // }
-    // for (int i = 0; i < kernelVolume; ++i) {
-    // totalGatherTime += timer.report() / 1000.0;
-    // auto outputBufferBlob = torch::from_blob(outputBuffer[i].data_ptr<T>(),
-    // {nHot, numOutPlanes}, options);
-  }
-  // totalGatherTime += timer.report() / 1000.0;
-  for (int i = 0; i < kernelVolume; ++i) {
-    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
-    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
-      continue;
-    }
-    auto outputBufferBlob = torch::from_blob(outputBuffer[i].data_ptr<T>(),
-                                             {nHot, numOutPlanes}, options);
-    auto inputBufferBlob = torch::from_blob(inputBuffer[i].data_ptr<T>(),
-                                            {nHot, numInPlanes}, options);
-
-    torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
-  }
-  // totalGEMMTime += timer.report() / 1000.0;
-  // totalGEMMTime += timer.report() / 1000.0;
-  for (int i = 0; i < kernelVolume; ++i) {
-    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
-    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
-      continue;
-    }
-    auto outputBufferBlob = torch::from_blob(outputBuffer[i].data_ptr<T>(),
-                                             {nHot, numOutPlanes}, options);
-    auto inputBufferBlob = torch::from_blob(inputBuffer[i].data_ptr<T>(),
-                                            {nHot, numInPlanes}, options);
-
-    if (device == torch::kCPU) {
-      functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
-      scatterFtor(tv::CPU(), tv::torch2tv<T>(output),
-                  tv::torch2tv<const T>(outputBufferBlob),
-                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
-                  nHot, true);
-    } else {
-      functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
-      scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
-                  tv::torch2tv<const T>(outputBufferBlob),
-                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
-                  nHot, true);
-      TV_CHECK_CUDA_ERR();
-    }
-    // totalSAddTime += timer.report() / 1000.0;
-  }
-  // totalSAddTime += timer.report() / 1000.0;
-  // std::cout << "gather time " << totalGatherTime << std::endl;
-  // std::cout << "gemm time " << totalGEMMTime << std::endl;
-  // std::cout << "scatteradd time " << totalSAddTime << std::endl;
-  return output;
-}
-
-}  // namespace spconv
-
-#endif
--- a/mmdet3d/ops/spconv/include/tensorview/helper_kernel.cu.h
+++ b/mmdet3d/ops/spconv/include/tensorview/helper_kernel.cu.h
-#pragma once
-// from tensorflow
-namespace tv {
-namespace detail {
-
-// Range object over the indices begin, begin + delta, begin + 2 * delta, ...
-// that are below `end`; meant to be consumed by a range-based for loop in
-// device code.
-template <typename T>
-class KernelLoop {
-  // Cursor handed out by begin() / end(). A cursor whose delta is zero marks
-  // the end of the range.
-  struct Iterator {
-    __forceinline__ __device__ Iterator(T index, T delta)
-        : index_(index), delta_(delta) {}
-
-    __forceinline__ __device__ T operator*() const { return index_; }
-
-    __forceinline__ __device__ Iterator &operator++() {
-      index_ += delta_;
-      return *this;
-    }
-
-    __forceinline__ __device__ bool operator!=(const Iterator &other) const {
-      const bool below = index_ < other.index_;
-      // `other` is an end cursor (delta_ == 0): any position at or beyond it
-      // compares equal, so only "strictly below" counts as different. This is
-      // the comparison a range-based for loop performs.
-      if (!other.delta_) {
-        return below;
-      }
-      const bool above = index_ > other.index_;
-      // Mirror case: this cursor is the end marker.
-      if (!delta_) {
-        return above;
-      }
-      return below || above;
-    }
-
-   private:
-    T index_;
-    const T delta_;
-  };
-
- public:
-  __forceinline__ __device__ KernelLoop(T begin, T delta, T end)
-      : begin_(begin), delta_(delta), end_(end) {}
-
-  __forceinline__ __device__ Iterator begin() const {
-    return Iterator(begin_, delta_);
-  }
-  __forceinline__ __device__ Iterator end() const {
-    return Iterator(end_, 0);
-  }
-
- private:
-  T begin_;
-  T delta_;
-  T end_;
-};
-
-}  // namespace detail
-
-// Helper to visit indices in the range 0 <= i < count using the x-coordinate.
-// Each thread starts at blockIdx.x * blockDim.x + threadIdx.x and advances by
-// gridDim.x * blockDim.x * NumILP.
-// Usage: for(int i : KernelLoopX(count)) { visit(i); }
-template <typename T, int NumILP = 1>
-__forceinline__ __device__ detail::KernelLoop<T> KernelLoopX(T count) {
-  const T first = blockIdx.x * blockDim.x + threadIdx.x;
-  const T step = gridDim.x * blockDim.x * NumILP;
-  return detail::KernelLoop<T>(first, step, count);
-}
-
-// Helper to visit indices in the range 0 <= i < count using the y-coordinate.
-// Each thread starts at blockIdx.y * blockDim.y + threadIdx.y and advances by
-// gridDim.y * blockDim.y * NumILP.
-// Usage: for(int i : KernelLoopY(count)) { visit(i); }
-template <typename T, int NumILP = 1>
-__forceinline__ __device__ detail::KernelLoop<T> KernelLoopY(T count) {
-  const T first = blockIdx.y * blockDim.y + threadIdx.y;
-  const T step = gridDim.y * blockDim.y * NumILP;
-  return detail::KernelLoop<T>(first, step, count);
-}
-
-// Helper to visit indices in the range 0 <= i < count using the z-coordinate.
-// Each thread starts at blockIdx.z * blockDim.z + threadIdx.z and advances by
-// gridDim.z * blockDim.z * NumILP.
-// Usage: for(int i : KernelLoopZ(count)) { visit(i); }
-template <typename T, int NumILP = 1>
-__forceinline__ __device__ detail::KernelLoop<T> KernelLoopZ(T count) {
-  const T first = blockIdx.z * blockDim.z + threadIdx.z;
-  const T step = gridDim.z * blockDim.z * NumILP;
-  return detail::KernelLoop<T>(first, step, count);
-}
-
-}  // namespace tv
--- a/mmdet3d/ops/spconv/include/tensorview/helper_launch.h
+++ b/mmdet3d/ops/spconv/include/tensorview/helper_launch.h
-#pragma once
-// from pytorch.aten
-#include "tensorview.h"
-namespace tv
-{
-namespace launch
-{
-
-// Ceiling division for positive operands: returns a / b rounded up to the
-// next whole number. The arithmetic is done in the common type of T1 and T2
-// and the result is converted to int on return.
-template <typename T1, typename T2>
-inline int DivUp(const T1 a, const T2 b) { return (a + b - 1) / b; }
-
-// Use 1024 threads per block, which requires cuda sm_2x or above
-constexpr int CUDA_NUM_THREADS = 1024;
-
-// CUDA: number of blocks of CUDA_NUM_THREADS threads needed to cover N
-// threads, i.e. N / CUDA_NUM_THREADS rounded up.
-//
-// N is checked to be positive through TV_ASSERT_RT_ERR (not defined in this
-// header; presumably provided by tensorview.h).
-inline int getBlocks(const int N)
-{
-    TV_ASSERT_RT_ERR(N > 0, "CUDA kernel launch blocks must be positive, but got N=", N);
-    // Widen N before rounding up: in plain int, N + CUDA_NUM_THREADS - 1
-    // overflows (undefined behaviour) once N exceeds INT_MAX - 1023. The
-    // quotient always fits in int again, so the result is otherwise unchanged.
-    return DivUp(static_cast<long long>(N), CUDA_NUM_THREADS);
-}
-} // namespace launch
-} // namespace tv