v2.0.2: fix a serious bug, and other small change

324235d0 · yan.yan · 68cbfcbc · 324235d0 · 324235d0 · 324235d0
Commit 324235d0 authored Oct 26, 2021 by yan.yan
12 changed files
--- a/README.md
+++ b/README.md
@@ -18,6 +18,9 @@

 [![Build Status](https://github.com/traveller59/spconv/workflows/build/badge.svg)](https://github.com/traveller59/spconv/actions?query=workflow%3Abuild)

+# !!!!!!!!!!
+If you are using spconv < 2.0.2, update after spconv 2.0.2 build success or check [this issue](https://github.com/traveller59/spconv/issues/340#issuecomment-951493008) to fix a serious bug, I'm so sorry for this stupid bug.
+
 ## Breaking changes in Spconv 2.x

 * ```spconv.xxx``` move to ```spconv.pytorch.xxx```, change all ```import spconv``` to ```import spconv.pytorch as spconv``` and ```from spconv.xxx import``` to ```from spconv.pytorch.xxx import```.
@@ -42,6 +45,13 @@
 * doesn't depend on pytorch binary. 
 * since spconv 2.x doesn't depend on pytorch binary (never in future), it's impossible to support torch.jit/libtorch inference.

+Spconv 2.1.0 vs 1.x speed:
+
+|                | 1080Ti Spconv 1.x F32 | 1080Ti Spconv 2.0 F32 | 3080M* Spconv 2.1 F16  |
+| -------------- |:---------------------:| ---------------------:| ----------:|
+| 27x128x128 Fwd | 11ms                  | 5.4ms                 | 1.4ms      |
+
+\* 3080M (Laptop) ~= 3070 Desktop

 ## Usage


--- a/spconv/csrc/sparse/indices.py
+++ b/spconv/csrc/sparse/indices.py
@@ -716,7 +716,7 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
            if (indice_pair_mask.ndim() == 2 && indice_pair_mask.dim(0) == 2){{
                auto mask_0 = indice_pair_mask[0];
                tv::cuda::Launch lanucher_fill(mask_0.size(), custream);
-                lanucher_fill(cudakers::fill_kernel<int>, mask_0.data_ptr<int>(), (1 << (kv / 2)), mask_0.size());
+                lanucher_fill(cudakers::fill_kernel<uint32_t>, mask_0.data_ptr<uint32_t>(), (1 << (kv / 2)), mask_0.size());
                indice_pair_mask[1].zero_(ctx);
                auto kernel = &calc_subm_conv_indices_split_mask<table_t>;
                launcher_num_act_in(kernel, loc_iter, hash,  
@@ -725,7 +725,7 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
                    indices.dim(0), indice_pairs.dim(2), kv);
            }}else{{
                tv::cuda::Launch lanucher_fill(indice_pair_mask.size(), custream);
-                lanucher_fill(cudakers::fill_kernel<int>, indice_pair_mask.data_ptr<int>(), (1 << (kv / 2)), indice_pair_mask.size());
+                lanucher_fill(cudakers::fill_kernel<uint32_t>, indice_pair_mask.data_ptr<uint32_t>(), (1 << (kv / 2)), indice_pair_mask.size());
                TV_ASSERT_RT_ERR(indice_pair_mask.ndim() == 1, "error");
                launcher_num_act_in(calc_subm_conv_indices_mask<table_t>, loc_iter, hash, 
                    indices.data_ptr<int>(), indice_pairs.data_ptr<int>(), 

--- a/spconv/pytorch/constants.py
+++ b/spconv/pytorch/constants.py
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch 
+try:
+    remove_plus = torch.__version__.find("+")
+    remove_dotdev = torch.__version__.find(".dev")
+
+    PYTORCH_VERSION = torch.__version__
+    if remove_plus != -1:
+        PYTORCH_VERSION = torch.__version__[:remove_plus]
+    if remove_dotdev != -1:
+        PYTORCH_VERSION = torch.__version__[:remove_dotdev]
+
+    PYTORCH_VERSION = list(map(int, PYTORCH_VERSION.split(".")))
+except:
+    # for unknown errors, just set a version
+    PYTORCH_VERSION = [1, 8, 0]
\ No newline at end of file
--- a/spconv/pytorch/conv.py
+++ b/spconv/pytorch/conv.py
@@ -191,6 +191,7 @@ class SparseConvolution(SparseModule):
        datas = input.find_indice_pair(self.indice_key)
        if self.inverse:
            assert datas is not None and self.indice_key is not None
+            assert datas.is_subm is False, "inverse conv can only be used with standard conv and pool ops."
            outids = datas.indices
            indice_pairs = datas.indice_pairs
            indice_pair_num = datas.indice_pair_num
@@ -226,7 +227,7 @@ class SparseConvolution(SparseModule):
                        self.name]["indice_gen_time"].append(interval)

                indice_data = IndiceData(outids, indices, indice_pairs,
-                                         indice_pair_num, spatial_shape)
+                                         indice_pair_num, spatial_shape, is_subm=self.subm)
                input.indice_dict[self.indice_key] = indice_data
        if input.benchmark:
            torch.cuda.synchronize()

--- a/spconv/pytorch/core.py
+++ b/spconv/pytorch/core.py
@@ -12,18 +12,153 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import torch 
-remove_plus = torch.__version__.find("+")
-remove_dotdev = torch.__version__.find(".dev")
+from typing import List, Optional

-PYTORCH_VERSION = torch.__version__
-if remove_plus != -1:
-    PYTORCH_VERSION = torch.__version__[:remove_plus]
-if remove_dotdev != -1:
-    PYTORCH_VERSION = torch.__version__[:remove_dotdev]
+import numpy as np
+import torch
+from spconv.pytorch.constants import PYTORCH_VERSION

-PYTORCH_VERSION = list(map(int, PYTORCH_VERSION.split(".")))
 if PYTORCH_VERSION >= [1, 8, 0]:
-    from .core_fx import *
+    try:
+        import torch.fx
+        if PYTORCH_VERSION >= [1, 10, 0]:
+            from torch.fx import ProxyableClassMeta
+        else:
+            from torch.fx.symbolic_trace import ProxyableClassMeta
+        SpConvTensorMeta = ProxyableClassMeta
+    except:
+        class SpConvTensorMeta(type):
+            pass
 else:
-    from .core import *
+    class SpConvTensorMeta(type):
+        pass
+
+class IndiceData(object):
+    def __init__(self, out_indices, indices, indice_pairs, indice_pair_num,
+                 out_spatial_shape, is_subm: bool):
+        self.out_indices = out_indices
+        self.indices = indices
+        self.indice_pairs = indice_pairs
+        self.indice_pair_num = indice_pair_num
+        self.out_spatial_shape = out_spatial_shape
+        self.is_subm = is_subm
+
+def scatter_nd(indices, updates, shape):
+    """pytorch edition of tensorflow scatter_nd.
+    this function don't contain except handle code. so use this carefully
+    when indice repeats, don't support repeat add which is supported
+    in tensorflow.
+    """
+    ret = torch.zeros(*shape, dtype=updates.dtype, device=updates.device)
+    ndim = indices.shape[-1]
+    output_shape = list(indices.shape[:-1]) + shape[indices.shape[-1]:]
+    flatted_indices = indices.view(-1, ndim)
+    slices = [flatted_indices[:, i] for i in range(ndim)]
+    slices += [Ellipsis]
+    ret[slices] = updates.view(*output_shape)
+    return ret
+
+# ProxyableClassMeta is used for TensorRT conversion in future.
+class SparseConvTensor(metaclass=SpConvTensorMeta):
+    def __init__(self,
+                 features: torch.Tensor,
+                 indices: torch.Tensor,
+                 spatial_shape: List[int],
+                 batch_size: int,
+                 grid: Optional[torch.Tensor]=None,
+                 voxel_num: Optional[torch.Tensor]=None,
+                 indice_dict: Optional[dict] = None,
+                 benchmark: bool=False):
+        """
+        Args:
+            features: [num_points, num_features] feature tensor
+            indices: [num_points, ndim + 1] indice tensor. batch index saved in indices[:, 0]
+            spatial_shape: spatial shape of your sparse data
+            batch_size: batch size of your sparse data
+            grid: pre-allocated grid tensor. should be used when the volume of spatial shape
+                is very large.
+            benchmark: whether to enable benchmark. if enabled, all sparse operators will be record to
+                SparseConvTensor.
+        """
+        self._features = features
+        self.indices = indices
+        self.spatial_shape = spatial_shape
+        self.batch_size = batch_size
+        if indice_dict is None:
+            indice_dict = {}
+        self.indice_dict = indice_dict
+        if grid is None:
+            grid = torch.Tensor()  # empty tensor
+        self.grid = grid
+        self.voxel_num = voxel_num # for tensorrt
+        self.benchmark = benchmark
+        self.benchmark_record = {}
+
+    def replace_feature(self, feature):
+        """we need to replace x.features = F.relu(x) with x = x.replace_feature(F.relu(x.features))
+        due to limit of torch.fx
+        """
+        new_spt = SparseConvTensor(feature, self.indices, self.spatial_shape, self.batch_size, self.grid, self.voxel_num, self.indice_dict)
+        new_spt.benchmark = self.benchmark
+        new_spt.benchmark_record = self.benchmark_record
+        return new_spt
+
+    @property
+    def features(self):
+        return self._features
+
+    @features.setter
+    def features(self, val):
+        msg = ("you can't set feature directly, use 'x = x.replace_feature(your_new_feature)'"
+                " to generate new SparseConvTensor instead.")
+        raise ValueError(msg)
+
+    @classmethod
+    def from_dense(cls, x: torch.Tensor):
+        """create sparse tensor fron channel last dense tensor by to_sparse
+        x must be NHWC tensor, channel last
+        """
+        x_sp = x.to_sparse(x.ndim - 1)
+        spatial_shape = list(x_sp.shape[1:-1])
+        batch_size = x_sp.shape[0]
+        indices_th = x_sp.indices().permute(1, 0).contiguous().int()
+        features_th = x_sp.values()
+        return cls(features_th, indices_th, spatial_shape, batch_size)
+
+    @property
+    def spatial_size(self):
+        return np.prod(self.spatial_shape)
+
+    def find_indice_pair(self, key) -> Optional[IndiceData]:
+        if key is None:
+            return None
+        if key in self.indice_dict:
+            return self.indice_dict[key]
+        return None
+
+    def dense(self, channels_first: bool=True):
+        output_shape = [self.batch_size] + list(
+            self.spatial_shape) + [self.features.shape[1]]
+        res = scatter_nd(
+            self.indices.to(self.features.device).long(), self.features,
+            output_shape)
+        if not channels_first:
+            return res
+        ndim = len(self.spatial_shape)
+        trans_params = list(range(0, ndim + 1))
+        trans_params.insert(1, ndim + 1)
+        return res.permute(*trans_params).contiguous()
+
+    # remove this due to limit of torch.fx
+    # @property
+    # def sparity(self):
+    #     return self.indices.shape[0] / np.prod(
+    #         self.spatial_shape) / self.batch_size
+
+    def shadow_copy(self) -> "SparseConvTensor":
+        """create a new spconv tensor with all member unchanged"""
+        tensor = SparseConvTensor(self.features, self.indices,
+                                  self.spatial_shape, self.batch_size,
+                                  self.grid, self.voxel_num, self.indice_dict, self.benchmark)
+        tensor.benchmark_record = self.benchmark_record
+        return tensor
--- a/spconv/pytorch/core_fx.py
+++ b/spconv/pytorch/core_fx.py
-# Copyright 2021 Yan Yan
-# 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# 
-#     http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import List, Optional
-
-import numpy as np
-import torch
-import torch.fx
-from torch.fx.symbolic_trace import ProxyableClassMeta
-
-class IndiceData(object):
-    def __init__(self, out_indices, indices, indice_pairs, indice_pair_num,
-                 out_spatial_shape):
-        self.out_indices = out_indices
-        self.indices = indices
-        self.indice_pairs = indice_pairs
-        self.indice_pair_num = indice_pair_num
-        self.out_spatial_shape = out_spatial_shape
-
-
-def scatter_nd(indices, updates, shape):
-    """pytorch edition of tensorflow scatter_nd.
-    this function don't contain except handle code. so use this carefully
-    when indice repeats, don't support repeat add which is supported
-    in tensorflow.
-    """
-    ret = torch.zeros(*shape, dtype=updates.dtype, device=updates.device)
-    ndim = indices.shape[-1]
-    output_shape = list(indices.shape[:-1]) + shape[indices.shape[-1]:]
-    flatted_indices = indices.view(-1, ndim)
-    slices = [flatted_indices[:, i] for i in range(ndim)]
-    slices += [Ellipsis]
-    ret[slices] = updates.view(*output_shape)
-    return ret
-
-
-class SparseConvTensor(metaclass=ProxyableClassMeta):
-    def __init__(self,
-                 features: torch.Tensor,
-                 indices: torch.Tensor,
-                 spatial_shape: List[int],
-                 batch_size: int,
-                 grid: Optional[torch.Tensor]=None,
-                 voxel_num: Optional[torch.Tensor]=None,
-                 indice_dict: Optional[dict] = None,
-                 benchmark: bool=False):
-        """
-        Args:
-            features: [num_points, num_features] feature tensor
-            indices: [num_points, ndim + 1] indice tensor. batch index saved in indices[:, 0]
-            spatial_shape: spatial shape of your sparse data
-            batch_size: batch size of your sparse data
-            grid: pre-allocated grid tensor. should be used when the volume of spatial shape
-                is very large.
-            benchmark: whether to enable benchmark. if enabled, all sparse operators will be record to
-                SparseConvTensor.
-        """
-        self._features = features
-        self.indices = indices
-        self.spatial_shape = spatial_shape
-        self.batch_size = batch_size
-        if indice_dict is None:
-            indice_dict = {}
-        self.indice_dict = indice_dict
-        if grid is None:
-            grid = torch.Tensor()  # empty tensor
-        self.grid = grid
-        self.voxel_num = voxel_num # for tensorrt
-        self.benchmark = benchmark
-        self.benchmark_record = {}
-
-    def replace_feature(self, feature):
-        """we need to replace x.features = F.relu(x) with x = x.replace_feature(F.relu(x.features))
-        due to limit of torch.fx
-        """
-        new_spt = SparseConvTensor(feature, self.indices, self.spatial_shape, self.batch_size, self.grid, self.voxel_num, self.indice_dict)
-        new_spt.benchmark = self.benchmark
-        new_spt.benchmark_record = self.benchmark_record
-        return new_spt
-
-    @property
-    def features(self):
-        return self._features
-
-    @features.setter
-    def features(self, val):
-        msg = ("you can't set feature directly, use 'x = x.replace_feature(your_new_feature)'"
-                " to generate new SparseConvTensor instead.")
-        raise ValueError(msg)
-
-    @classmethod
-    def from_dense(cls, x: torch.Tensor):
-        """create sparse tensor fron channel last dense tensor by to_sparse
-        x must be NHWC tensor, channel last
-        """
-        x_sp = x.to_sparse(x.ndim - 1)
-        spatial_shape = list(x_sp.shape[1:-1])
-        batch_size = x_sp.shape[0]
-        indices_th = x_sp.indices().permute(1, 0).contiguous().int()
-        features_th = x_sp.values()
-        return cls(features_th, indices_th, spatial_shape, batch_size)
-
-    @property
-    def spatial_size(self):
-        return np.prod(self.spatial_shape)
-
-    def find_indice_pair(self, key) -> Optional[IndiceData]:
-        if key is None:
-            return None
-        if key in self.indice_dict:
-            return self.indice_dict[key]
-        return None
-
-    def dense(self, channels_first: bool=True):
-        output_shape = [self.batch_size] + list(
-            self.spatial_shape) + [self.features.shape[1]]
-        res = scatter_nd(
-            self.indices.to(self.features.device).long(), self.features,
-            output_shape)
-        if not channels_first:
-            return res
-        ndim = len(self.spatial_shape)
-        trans_params = list(range(0, ndim + 1))
-        trans_params.insert(1, ndim + 1)
-        return res.permute(*trans_params).contiguous()
-
-    # remove this due to limit of torch.fx
-    # @property
-    # def sparity(self):
-    #     return self.indices.shape[0] / np.prod(
-    #         self.spatial_shape) / self.batch_size
-
-    def shadow_copy(self) -> "SparseConvTensor":
-        """create a new spconv tensor with all member unchanged"""
-        tensor = SparseConvTensor(self.features, self.indices,
-                                  self.spatial_shape, self.batch_size,
-                                  self.grid, self.voxel_num, self.indice_dict, self.benchmark)
-        tensor.benchmark_record = self.benchmark_record
-        return tensor
--- a/spconv/pytorch/core_nofx.py
+++ b/spconv/pytorch/core_nofx.py
-# Copyright 2021 Yan Yan
-# 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# 
-#     http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import List, Optional
-
-import numpy as np
-import torch
-
-
-class IndiceData(object):
-    def __init__(self, out_indices, indices, indice_pairs, indice_pair_num,
-                 out_spatial_shape):
-        self.out_indices = out_indices
-        self.indices = indices
-        self.indice_pairs = indice_pairs
-        self.indice_pair_num = indice_pair_num
-        self.out_spatial_shape = out_spatial_shape
-
-
-def scatter_nd(indices, updates, shape):
-    """pytorch edition of tensorflow scatter_nd.
-    this function don't contain except handle code. so use this carefully
-    when indice repeats, don't support repeat add which is supported
-    in tensorflow.
-    """
-    ret = torch.zeros(*shape, dtype=updates.dtype, device=updates.device)
-    ndim = indices.shape[-1]
-    output_shape = list(indices.shape[:-1]) + shape[indices.shape[-1]:]
-    flatted_indices = indices.view(-1, ndim)
-    slices = [flatted_indices[:, i] for i in range(ndim)]
-    slices += [Ellipsis]
-    ret[slices] = updates.view(*output_shape)
-    return ret
-
-
-class SparseConvTensor:
-    def __init__(self,
-                 features: torch.Tensor,
-                 indices: torch.Tensor,
-                 spatial_shape: List[int],
-                 batch_size: int,
-                 grid: Optional[torch.Tensor]=None,
-                 voxel_num: Optional[torch.Tensor]=None,
-                 indice_dict: Optional[dict] = None,
-                 benchmark: bool=False):
-        """
-        Args:
-            features: [num_points, num_features] feature tensor
-            indices: [num_points, ndim + 1] indice tensor. batch index saved in indices[:, 0]
-            spatial_shape: spatial shape of your sparse data
-            batch_size: batch size of your sparse data
-            grid: pre-allocated grid tensor. should be used when the volume of spatial shape
-                is very large.
-            benchmark: whether to enable benchmark. if enabled, all sparse operators will be record to
-                SparseConvTensor.
-        """
-        self._features = features
-        self.indices = indices
-        self.spatial_shape = spatial_shape
-        self.batch_size = batch_size
-        if indice_dict is None:
-            indice_dict = {}
-        self.indice_dict = indice_dict
-        if grid is None:
-            grid = torch.Tensor()  # empty tensor
-        self.grid = grid
-        self.voxel_num = voxel_num # for tensorrt
-        self.benchmark = benchmark
-        self.benchmark_record = {}
-
-    def replace_feature(self, feature):
-        """we need to replace x.features = F.relu(x) with x = x.replace_feature(F.relu(x.features))
-        due to limit of torch.fx
-        """
-        new_spt = SparseConvTensor(feature, self.indices, self.spatial_shape, self.batch_size, self.grid, self.voxel_num, self.indice_dict)
-        new_spt.benchmark = self.benchmark
-        new_spt.benchmark_record = self.benchmark_record
-        return new_spt
-
-    @property
-    def features(self):
-        return self._features
-
-    @features.setter
-    def features(self, val):
-        msg = ("you can't set feature directly, use 'x = x.replace_feature(your_new_feature)'"
-                " to generate new SparseConvTensor instead.")
-        raise ValueError(msg)
-
-    @classmethod
-    def from_dense(cls, x: torch.Tensor):
-        """create sparse tensor fron channel last dense tensor by to_sparse
-        x must be NHWC tensor, channel last
-        """
-        x_sp = x.to_sparse(x.ndim - 1)
-        spatial_shape = list(x_sp.shape[1:-1])
-        batch_size = x_sp.shape[0]
-        indices_th = x_sp.indices().permute(1, 0).contiguous().int()
-        features_th = x_sp.values()
-        return cls(features_th, indices_th, spatial_shape, batch_size)
-
-    @property
-    def spatial_size(self):
-        return np.prod(self.spatial_shape)
-
-    def find_indice_pair(self, key) -> Optional[IndiceData]:
-        if key is None:
-            return None
-        if key in self.indice_dict:
-            return self.indice_dict[key]
-        return None
-
-    def dense(self, channels_first: bool=True):
-        output_shape = [self.batch_size] + list(
-            self.spatial_shape) + [self.features.shape[1]]
-        res = scatter_nd(
-            self.indices.to(self.features.device).long(), self.features,
-            output_shape)
-        if not channels_first:
-            return res
-        ndim = len(self.spatial_shape)
-        trans_params = list(range(0, ndim + 1))
-        trans_params.insert(1, ndim + 1)
-        return res.permute(*trans_params).contiguous()
-
-    # remove this due to limit of torch.fx
-    # @property
-    # def sparity(self):
-    #     return self.indices.shape[0] / np.prod(
-    #         self.spatial_shape) / self.batch_size
-
-    def shadow_copy(self) -> "SparseConvTensor":
-        """create a new spconv tensor with all member unchanged"""
-        tensor = SparseConvTensor(self.features, self.indices,
-                                  self.spatial_shape, self.batch_size,
-                                  self.grid, self.voxel_num, self.indice_dict, self.benchmark)
-        tensor.benchmark_record = self.benchmark_record
-        return tensor
--- a/spconv/pytorch/modules.py
+++ b/spconv/pytorch/modules.py
@@ -138,7 +138,7 @@ class SparseSequential(SparseModule):
            else:
                if isinstance(input, spconv.SparseConvTensor):
                    if input.indices.shape[0] != 0:
-                        input.replace_feature(module(input.features))
+                        input = input.replace_feature(module(input.features))
                else:
                    input = module(input)
        return input

--- a/spconv/pytorch/ops.py
+++ b/spconv/pytorch/ops.py
@@ -26,7 +26,8 @@ from spconv.core_cc.csrc.sparse.all import SpconvOps
 from spconv.algo import GEMM  # , GATHER, SCATTER
 import time
 from spconv.constants import FILTER_HWIO
-
+import pickle 
+from pathlib import Path 

 def get_conv_output_size(input_size, kernel_size, stride, padding, dilation):
    ndim = len(input_size)
@@ -78,6 +79,8 @@ def get_indice_pairs(indices: torch.Tensor,
                                             padding, dilation)
    else:
        out_shape = spatial_shape
+    if any([x == 0 for x in out_shape]):
+        raise ValueError(f"your out spatial shape {out_shape} reach zero!!! input shape: {spatial_shape}")
    assert algo == ConvAlgo.Native, "TODO"
    stream = get_current_stream()

@@ -205,6 +208,9 @@ def indice_conv(features: torch.Tensor,

    stream = get_current_stream()
    indice_pair_num_cpu = indice_pair_num.cpu().tolist()
+    if subm and all(x == 0 for x in indice_pair_num_cpu):
+        return out_features
+
    arch = torch.cuda.get_device_capability()
    inited: bool = subm
    a = torch_tensor_to_tv(features)
@@ -214,7 +220,14 @@ def indice_conv(features: torch.Tensor,
        profile_idx = kv_center - 1
    # profile_idx = first_n
    nhot_profile = indice_pair_num_cpu[profile_idx]
-
+    if nhot_profile == 0:
+        # find a non-zero profile index
+        profile_idx = 0
+        for i, nhot in enumerate(indice_pair_num_cpu):
+            if nhot > nhot_profile:
+                nhot_profile = nhot
+                profile_idx = i
+    assert nhot_profile > 0, "this shouldn't happen"
    # print(nhot_profile, indice_pair_num_cpu)
    profile_res = GEMM.get_profiled_algo(
        a.shape,
@@ -294,8 +307,6 @@ def indice_conv(features: torch.Tensor,
    # # print(stream, valid_count, maxnhot, features.shape[0], features.shape[1], out_channel, time.time() - t, total_times, txt)
    # # print(algo_desp, profile_res.external_gather, profile_res.splitk, features.shape[0], features.shape[1], out_channel, time.time() - t)

-    # # print(indice_pair_num_cpu)
-    # print("G", time.time() - t)
    return out_features


@@ -312,15 +323,14 @@ def indice_conv_backward(features: torch.Tensor,
                         inverse: bool = False,
                         subm: bool = False,
                         algo: ConvAlgo = ConvAlgo.Native):
-    # torch.cuda.synchronize()
-    # t = time.time()
-
    num_activate_out = out_bp.shape[0]
    out_channel = out_bp.shape[-1]
    filters_shape = filters.shape
    filters = filters.reshape(-1, *filters.shape[-2:])
    kv = filters.shape[0]
    kv_center = kv // 2
+    if not out_bp.is_contiguous():
+        out_bp = out_bp.contiguous()
    assert out_bp.is_contiguous()
    assert filters.is_contiguous()
    assert features.is_contiguous()
@@ -349,6 +359,9 @@ def indice_conv_backward(features: torch.Tensor,

    stream = get_current_stream()
    indice_pair_num_cpu = indice_pair_num.cpu().tolist()
+    if subm and all(x == 0 for x in indice_pair_num_cpu):
+        return (din, dfilters.reshape(filters_shape))
+
    arch = torch.cuda.get_device_capability()
    filters_tv = torch_tensor_to_tv(filters)

@@ -359,10 +372,18 @@ def indice_conv_backward(features: torch.Tensor,
    din_tv = torch_tensor_to_tv(din)

    profile_idx = kv_center
-    if subm:
+    if subm or indice_pair_num_cpu[profile_idx] == 0:
        profile_idx = kv_center - 1
    # profile_idx = first_n
    nhot_profile = indice_pair_num_cpu[profile_idx]
+    if nhot_profile == 0:
+        # find a non-zero profile index
+        profile_idx = 0
+        for i, nhot in enumerate(indice_pair_num_cpu):
+            if nhot > nhot_profile:
+                nhot_profile = nhot
+                profile_idx = i
+    assert nhot_profile > 0, "this shouldn't happen"

    # print(nhot_profile, indice_pair_num_cpu)
    profile_res_dgrad = GEMM.get_profiled_algo(
@@ -549,11 +570,12 @@ def indice_maxpool(features, indice_pairs, indice_pair_num, num_activate_out):
    indice_pair_num_cpu = indice_pair_num.cpu().tolist()
    out_features_tv = torch_tensor_to_tv(out_features)
    features_tv = torch_tensor_to_tv(features)
+    indice_pairs_tv = torch_tensor_to_tv(indice_pairs)
    for i, nhot in enumerate(indice_pair_num_cpu):
        if nhot <= 0:
            continue
-        inp_indices = torch_tensor_to_tv(indice_pairs[0][i, :nhot])
-        out_indices = torch_tensor_to_tv(indice_pairs[1][i, :nhot])
+        inp_indices = indice_pairs_tv[0][i].slice_first_axis(0, nhot)
+        out_indices = indice_pairs_tv[1][i].slice_first_axis(0, nhot)
        SpconvOps.maxpool_forward(out_features_tv, features_tv, out_indices,
                                  inp_indices, stream)
    # torch.cuda.synchronize()
@@ -568,15 +590,18 @@ def indice_maxpool_backward(features, out_features, out_bp, indice_pairs,
    din = torch.zeros_like(features)
    stream = get_current_stream()
    indice_pair_num_cpu = indice_pair_num.cpu().tolist()
+    if not out_bp.is_contiguous():
+        out_bp = out_bp.contiguous()
    out_features_tv = torch_tensor_to_tv(out_features)
    features_tv = torch_tensor_to_tv(features)
    out_bp_tv = torch_tensor_to_tv(out_bp)
    din_tv = torch_tensor_to_tv(din)
+    indice_pairs_tv = torch_tensor_to_tv(indice_pairs)
    for i, nhot in enumerate(indice_pair_num_cpu):
        if nhot <= 0:
            continue
-        inp_indices = torch_tensor_to_tv(indice_pairs[0][i, :nhot])
-        out_indices = torch_tensor_to_tv(indice_pairs[1][i, :nhot])
+        inp_indices = indice_pairs_tv[0][i].slice_first_axis(0, nhot)
+        out_indices = indice_pairs_tv[1][i].slice_first_axis(0, nhot)
        SpconvOps.maxpool_backward(out_features_tv, features_tv, out_bp_tv,
                                   din_tv, out_indices, inp_indices, stream)


--- a/spconv/pytorch/pool.py
+++ b/spconv/pytorch/pool.py
@@ -119,7 +119,7 @@ class SparseMaxPool(SparseModule):
            datas = input.find_indice_pair(self.indice_key)
            if datas is None:
                indice_data = IndiceData(outids, indices, indice_pairs,
-                                         indice_pairs_num, spatial_shape)
+                                         indice_pairs_num, spatial_shape, is_subm=False)
                input.indice_dict[self.indice_key] = indice_data
            else:
                raise ValueError("indice data exists")
@@ -147,12 +147,14 @@ class SparseMaxPool1d(SparseMaxPool):
                 stride=None,
                 padding=0,
                 dilation=1,
+                 indice_key=None,
                 name=None):
        super(SparseMaxPool1d, self).__init__(1,
                                              kernel_size,
                                              stride,
                                              padding,
                                              dilation,
+                                              indice_key=indice_key,
                                              name=name)

 class SparseMaxPool2d(SparseMaxPool):
@@ -161,12 +163,14 @@ class SparseMaxPool2d(SparseMaxPool):
                 stride=None,
                 padding=0,
                 dilation=1,
+                 indice_key=None,
                 name=None):
        super(SparseMaxPool2d, self).__init__(2,
                                              kernel_size,
                                              stride,
                                              padding,
                                              dilation,
+                                              indice_key=indice_key,
                                              name=name)


@@ -176,12 +180,14 @@ class SparseMaxPool3d(SparseMaxPool):
                 stride=None,
                 padding=0,
                 dilation=1,
+                 indice_key=None,
                 name=None):
        super(SparseMaxPool3d, self).__init__(3,
                                              kernel_size,
                                              stride,
                                              padding,
                                              dilation,
+                                              indice_key=indice_key,
                                              name=name)

 class SparseMaxPool4d(SparseMaxPool):
@@ -190,10 +196,12 @@ class SparseMaxPool4d(SparseMaxPool):
                 stride=None,
                 padding=0,
                 dilation=1,
+                 indice_key=None,
                 name=None):
        super(SparseMaxPool4d, self).__init__(4,
                                              kernel_size,
                                              stride,
                                              padding,
                                              dilation,
+                                              indice_key=indice_key,
                                              name=name)
--- a/test/benchmark.py
+++ b/test/benchmark.py
@@ -62,8 +62,11 @@ class Net(nn.Module):
                              bias=False,
                              indice_key="c0",
                              algo=algo),
+            # nn.BatchNorm1d(32),
+            # nn.ReLU(),
+            spconv.SparseConv3d(64, 64, 2, 2, bias=False, indice_key="m0"),

-            spconv.SparseMaxPool3d(2, 2),
+            # spconv.SparseMaxPool3d(2, 2),
            spconv.SubMConv3d(64,
                              96,
                              3,
@@ -78,7 +81,9 @@ class Net(nn.Module):
                              algo=algo),
            # nn.BatchNorm1d(64),
            # nn.ReLU(),
-            spconv.SparseMaxPool3d(2, 2),
+            spconv.SparseConv3d(96, 96, 2, 2, bias=False, indice_key="m1"),
+
+            # spconv.SparseMaxPool3d(2, 2),
            spconv.SubMConv3d(96,
                              128,
                              3,
@@ -93,7 +98,9 @@ class Net(nn.Module):
                              algo=algo),
            # nn.BatchNorm1d(128),
            # nn.ReLU(),
-            spconv.SparseMaxPool3d(2, 2),
+            spconv.SparseConv3d(128, 128, 2, 2, bias=False, indice_key="m2"),
+
+            # spconv.SparseMaxPool3d(2, 2),
            spconv.SubMConv3d(128,
                              160,
                              3,
@@ -108,7 +115,9 @@ class Net(nn.Module):
                              algo=algo),
            # nn.BatchNorm1d(128),
            # nn.ReLU(),
-            spconv.SparseMaxPool3d(2, 2),
+            spconv.SparseConv3d(160, 160, 2, 2, bias=False, indice_key="m3"),
+
+            # spconv.SparseMaxPool3d(2, 2),
            spconv.SubMConv3d(160,
                              192,
                              3,
@@ -123,7 +132,9 @@ class Net(nn.Module):
                              algo=algo),
            # nn.BatchNorm1d(128),
            # nn.ReLU(),
-            spconv.SparseMaxPool3d(2, 2),
+            # spconv.SparseMaxPool3d(2, 2, indice_key="m4"),
+            spconv.SparseConv3d(192, 192, 2, 2, bias=False, indice_key="m4"),
+
            spconv.SubMConv3d(192,
                              224,
                              3,
@@ -136,9 +147,10 @@ class Net(nn.Module):
                              bias=False,
                              indice_key="c5",
                              algo=algo),
-            # nn.BatchNorm1d(128),
-            # nn.ReLU(),
-            spconv.SparseMaxPool3d(2, 2),
+            nn.BatchNorm1d(224),
+            nn.ReLU(),
+            spconv.SparseConv3d(224, 224, 2, 2, bias=False, indice_key="m5"),
+            # spconv.SparseMaxPool3d(2, 2, indice_key="m5"),
            spconv.SubMConv3d(224,
                              256,
                              3,
@@ -151,8 +163,15 @@ class Net(nn.Module):
                              bias=False,
                              indice_key="c6",
                              algo=algo),
-            spconv.SparseInverseConv3d(256, 128, 3, indice_key="c6", bias=False),
-            spconv.SparseInverseConv3d(128, 64, 3, indice_key="c5", bias=False),
+
+            nn.BatchNorm1d(256),
+            nn.ReLU(),
+
+            spconv.SparseInverseConv3d(256, 128, 2, indice_key="m5", bias=False),
+            nn.BatchNorm1d(128),
+            nn.ReLU(),
+
+            spconv.SparseInverseConv3d(128, 64, 2, indice_key="m4", bias=False),

        )
        max_batch_size = 1
@@ -171,7 +190,7 @@ class Net2(nn.Module):
    def __init__(self, shape, algo):
        super().__init__()
        self.net = spconv.SparseSequential(
-            spconv.SubMConv3d(3, 256, 3, bias=False, indice_key="c0",
+            spconv.SubMConv3d(3, 128, 3, bias=False, indice_key="c0",
                              algo=algo),
            # spconv.SubMConv3d(32,
            #                   32,
@@ -185,27 +204,27 @@ class Net2(nn.Module):
            # #                   algo=algo),
            # spconv.SubMConv3d(32, 64, 3, bias=False, indice_key="c0",
            #                   algo=algo),
-            spconv.SubMConv3d(256,
-                              256,
+            spconv.SubMConv3d(128,
+                              128,
                              3,
                              bias=False,
                              indice_key="c0",
                              algo=algo),
            # nn.BatchNorm1d(32),
            # nn.ReLU(),
-            spconv.SparseMaxPool3d(2, 2),
-            spconv.SubMConv3d(256,
-                              512,
-                              3,
-                              bias=False,
-                              indice_key="c1",
-                              algo=algo),
-            spconv.SubMConv3d(512,
-                              512,
-                              3,
-                              bias=False,
-                              indice_key="c1",
-                              algo=algo),
+            # spconv.SparseMaxPool3d(2, 2),
+            # spconv.SubMConv3d(256,
+            #                   512,
+            #                   3,
+            #                   bias=False,
+            #                   indice_key="c1",
+            #                   algo=algo),
+            # spconv.SubMConv3d(512,
+            #                   512,
+            #                   3,
+            #                   bias=False,
+            #                   indice_key="c1",
+            #                   algo=algo),
        )
        max_batch_size = 1
        # grid (dense map) is used for indice generation. use pre-allocated grid can run faster.
@@ -249,19 +268,19 @@ def main():
    dout_t = torch.from_numpy(dout).cuda().to(dtype)

    print(out.spatial_shape, out.features.mean(),  out.features.max(),  out.features.min())
-    times = []
-    with torch.no_grad():
-        for i in range(20):
-            print("------------")
-            torch.cuda.synchronize()
-            t = time.time()
-            out_nograd = net(voxels_th, coors_th, 1)
-            torch.cuda.synchronize()
-            times.append(time.time() - t)
-    print("spconv time", np.mean(times[10:]))
+    # times = []
+    # with torch.no_grad():
+    #     for i in range(20):
+    #         print("------------")
+    #         torch.cuda.synchronize()
+    #         t = time.time()
+    #         out_nograd = net(voxels_th, coors_th, 1)
+    #         torch.cuda.synchronize()
+    #         times.append(time.time() - t)
+    # print("spconv time", np.mean(times[10:]))
    times = []

-    for i in range(10):
+    for i in range(1):
        out = net(voxels_th, coors_th, 1)
        print("------------")
        torch.cuda.synchronize()
@@ -270,9 +289,9 @@ def main():
        torch.cuda.synchronize()
        times.append(time.time() - t)

-    # print((net.grid == -1).float().sum(), net.grid.numel())
-    # print("spconv time", time.time() - t)
-    print("spconv bw time", np.mean(times[5:]))
+    # # print((net.grid == -1).float().sum(), net.grid.numel())
+    # # print("spconv time", time.time() - t)
+    # print("spconv bw time", np.mean(times[5:]))


 if __name__ == "__main__":

--- a/version.txt
+++ b/version.txt
-2.0.1
\ No newline at end of file
+2.0.2
\ No newline at end of file