v2.1.5: add profile tool and python 3.6 for linux

82fd7a8b · yan.yan · f31eee3a · 82fd7a8b · 82fd7a8b · 82fd7a8b
Commit 82fd7a8b authored Nov 10, 2021 by yan.yan
20 changed files
--- a/spconv/csrc/sparse/pointops.py
+++ b/spconv/csrc/sparse/pointops.py
 # Copyright 2021 Yan Yan
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -15,27 +15,27 @@
 import contextlib
 from cumm.gemm.core.metaarray import MetaArray, seq
 from cumm import dtypes
-import pccm 
+import pccm
 from cumm.gemm.layout import TensorGeneric, to_stride
 from cumm.common import TensorView, TensorViewHashKernel
 from cumm.gemm import codeops
-from typing import List 
+from typing import List
 from cumm.conv.params import ConvProblem
-import numpy as np 
+import numpy as np
+

 class Point2VoxelCommon(pccm.ParameterizedClass):
    def __init__(self, dtype: dtypes.DType, ndim: int, zyx: bool = True):
        super().__init__()
        self.add_dependency(TensorView)
-        self.dtype = dtype 
-        self.ndim = ndim 
+        self.dtype = dtype
+        self.ndim = ndim
        self.zyx = zyx
        ret_str = f"std::array<int, {self.ndim}>"
        retf_str = f"std::array<float, {self.ndim}>"
        retf2_str = f"std::array<float, {self.ndim * 2}>"
        self.calc_meta_ret = f"std::tuple<{retf_str}, {ret_str}, {ret_str}, {retf2_str}>"

-    @pccm.pybind.mark
    @pccm.static_function
    def calc_meta_data(self):
        code = pccm.FunctionCode()
@@ -80,7 +80,8 @@ class Point2VoxelCommon(pccm.ParameterizedClass):
        retf_str = f"std::array<float, {self.ndim}>"
        retf2_str = f"std::array<float, {self.ndim * 2}>"

-        return code.ret(f"std::tuple<{retf_str}, {ret_str}, {ret_str}, {retf2_str}>")
+        return code.ret(
+            f"std::tuple<{retf_str}, {ret_str}, {ret_str}, {retf2_str}>")

    @pccm.static_function
    def array2tvarray(self):
@@ -112,16 +113,21 @@ class Point2VoxelCommon(pccm.ParameterizedClass):
        """)
        return code.ret("std::array<T, N>")

+
 class Point2VoxelKernel(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
    """this class don't support multi-thread. 
    create p2v for every thread.
    """
-    def __init__(self, dtype: dtypes.DType, ndim: int, layout: TensorGeneric, zyx: bool = True):
+    def __init__(self,
+                 dtype: dtypes.DType,
+                 ndim: int,
+                 layout: TensorGeneric,
+                 zyx: bool = True):
        super().__init__()
        self.add_dependency(TensorView, TensorViewHashKernel)
        self.add_param_class("layout_ns", layout, "Layout")
-        self.dtype = dtype 
-        self.ndim = ndim 
+        self.dtype = dtype
+        self.ndim = ndim
        self.zyx = zyx

    @pccm.cuda.cuda_global_function
@@ -142,7 +148,7 @@ class Point2VoxelKernel(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
        point_xyz = f"{self.ndim - 1} - j"
        if not self.zyx:
            point_xyz = f"j"
-        # if zyx, the coors_range and grid_bound is zyx too, 
+        # if zyx, the coors_range and grid_bound is zyx too,
        # generated indices is zyx.
        code.raw(f"""
        for (int i : tv::KernelLoopX<int>(num_points)){{
@@ -166,7 +172,7 @@ class Point2VoxelKernel(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
            }}
        }}
        """)
-        return code 
+        return code

    @pccm.cuda.cuda_global_function
    def assign_table(self):
@@ -190,7 +196,7 @@ class Point2VoxelKernel(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
            }}
        }}
        """)
-        return code 
+        return code

    @pccm.cuda.cuda_global_function
    def generate_voxel(self):
@@ -231,7 +237,7 @@ class Point2VoxelKernel(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
            }}
        }}
        """)
-        return code 
+        return code

    @pccm.cuda.cuda_global_function
    def voxel_empty_fill_mean(self):
@@ -263,7 +269,7 @@ class Point2VoxelKernel(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
            }}
        }}
        """)
-        return code 
+        return code

    @pccm.cuda.cuda_global_function
    def limit_num_per_voxel_value(self):
@@ -276,7 +282,8 @@ class Point2VoxelKernel(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
            num_per_voxel[i] = count;
        }}
        """)
-        return code 
+        return code
+

 class Point2Voxel(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
    def __init__(self, dtype: dtypes.DType, ndim: int, zyx: bool = True):
@@ -286,14 +293,23 @@ class Point2Voxel(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
        self.add_param_class("p2v_c", self.p2v_c, "Point2VoxelCommon")
        layout = TensorGeneric(ndim, True)
        self.add_param_class("layout_ns", layout, "Layout")
-        self.dtype = dtype 
-        self.ndim = ndim 
+        self.dtype = dtype
+        self.ndim = ndim
        self.zyx = zyx
-        cuda_funcs = [self.point_to_voxel_hash, self.point_to_voxel_hash_static]
-        self.add_impl_only_param_class(cuda_funcs, "kernel", Point2VoxelKernel(dtype, ndim, layout, zyx))
-
-        self.add_pybind_member("hashdata", "tv::Tensor", readwrite=False, pyanno="cumm.tensorview.Tensor")
-        self.add_pybind_member("point_indice_data", "tv::Tensor", readwrite=False, pyanno="cumm.tensorview.Tensor")
+        cuda_funcs = [
+            self.point_to_voxel_hash, self.point_to_voxel_hash_static
+        ]
+        self.add_impl_only_param_class(
+            cuda_funcs, "kernel", Point2VoxelKernel(dtype, ndim, layout, zyx))
+
+        self.add_pybind_member("hashdata",
+                               "tv::Tensor",
+                               readwrite=False,
+                               pyanno="cumm.tensorview.Tensor")
+        self.add_pybind_member("point_indice_data",
+                               "tv::Tensor",
+                               readwrite=False,
+                               pyanno="cumm.tensorview.Tensor")

        self.add_pybind_member("voxels", "tv::Tensor", readwrite=False)
        self.add_pybind_member("indices", "tv::Tensor", readwrite=False)
@@ -357,7 +373,7 @@ class Point2Voxel(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
        hashdata = tv::zeros({{1}}, tv::custom128, 0);
        point_indice_data = tv::zeros({{1}}, tv::int64, 0);
        """)
-        return code 
+        return code

    @pccm.pybind.mark
    @pccm.cuda.member_function
@@ -439,13 +455,13 @@ class Point2Voxel(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
        """)
        return code.ret("std::tuple<tv::Tensor, tv::Tensor, tv::Tensor>")

-
    @pccm.pybind.mark
    @pccm.cuda.static_function
    def point_to_voxel_hash_static(self):
        code = pccm.FunctionCode()
        code.arg("points", "tv::Tensor")
-        code.arg("voxels, indices, num_per_voxel, hashdata, point_indice_data", "tv::Tensor")
+        code.arg("voxels, indices, num_per_voxel, hashdata, point_indice_data",
+                 "tv::Tensor")
        code.arg("vsize", f"std::array<float, {self.ndim}>")
        code.arg("grid_size, grid_stride", f"std::array<int, {self.ndim}>")
        code.arg("coors_range", f"std::array<float, {self.ndim * 2}>")
@@ -527,13 +543,16 @@ class Point2VoxelCPU(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
        self.add_dependency(TensorView)
        layout = TensorGeneric(ndim, True)
        self.add_param_class("layout_ns", layout, "Layout")
-        self.dtype = dtype 
-        self.ndim = ndim 
+        self.dtype = dtype
+        self.ndim = ndim
        self.zyx = zyx
        self.p2v_c = Point2VoxelCommon(dtype, ndim, zyx)
        self.add_param_class("p2v_c", self.p2v_c, "Point2VoxelCommon")

-        self.add_pybind_member("densehashdata", "tv::Tensor", readwrite=False, pyanno="cumm.tensorview.Tensor")
+        self.add_pybind_member("densehashdata",
+                               "tv::Tensor",
+                               readwrite=False,
+                               pyanno="cumm.tensorview.Tensor")

        self.add_pybind_member("voxels", "tv::Tensor", readwrite=False)
        self.add_pybind_member("indices", "tv::Tensor", readwrite=False)
@@ -568,7 +587,6 @@ class Point2VoxelCPU(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
        """)
        return code.ret(self.p2v_c.calc_meta_ret)

-
    @pccm.pybind.mark
    @pccm.constructor
    def ctor(self):
@@ -613,7 +631,7 @@ class Point2VoxelCPU(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
            densehashdata_ptr[i] = -1;
        }}
        """)
-        return code 
+        return code

    def point_to_voxel_static_template(self, mean: bool = False):
        code = pccm.FunctionCode()

--- a/spconv/pytorch/__init__.py
+++ b/spconv/pytorch/__init__.py
@@ -4,13 +4,14 @@ from pathlib import Path
 import numpy as np
 import torch

-from spconv.pytorch import ops
-from spconv.pytorch.conv import (SparseConv2d, SparseConv3d, SparseConvTranspose2d,
-                         SparseConvTranspose3d, SparseInverseConv2d,
-                         SparseInverseConv3d, SubMConv2d, SubMConv3d)
+from spconv.pytorch import ops, functional
+from spconv.pytorch.conv import (SparseConv2d, SparseConv3d,
+                                 SparseConvTranspose2d, SparseConvTranspose3d,
+                                 SparseInverseConv2d, SparseInverseConv3d,
+                                 SubMConv2d, SubMConv3d)
 from spconv.pytorch.core import SparseConvTensor
 from spconv.pytorch.identity import Identity
-from spconv.pytorch.modules import SparseModule, SparseSequential
+from spconv.pytorch.modules import SparseModule, SparseSequential, assign_name_for_sparse_modules
 from spconv.pytorch.ops import ConvAlgo
 from spconv.pytorch.pool import SparseMaxPool2d, SparseMaxPool3d
 from spconv.pytorch.tables import AddTable, ConcatTable, JoinTable

--- a/spconv/pytorch/constants.py
+++ b/spconv/pytorch/constants.py
 # Copyright 2021 Yan Yan
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import torch 
+import torch
 try:
    remove_plus = torch.__version__.find("+")
    remove_dotdev = torch.__version__.find(".dev")
@@ -26,4 +26,4 @@ try:
    PYTORCH_VERSION = list(map(int, PYTORCH_VERSION.split(".")))
 except:
    # for unknown errors, just set a version
-    PYTORCH_VERSION = [1, 8, 0]
\ No newline at end of file
+    PYTORCH_VERSION = [1, 8, 0]
--- a/spconv/pytorch/conv.py
+++ b/spconv/pytorch/conv.py
@@ -24,12 +24,13 @@ from torch.nn.parameter import Parameter

 from spconv import pytorch as spconv
 from spconv.core import ConvAlgo
-import spconv.pytorch.functional as Fsp
+from spconv.pytorch import functional as Fsp
 from spconv.pytorch import ops
 from spconv.cppconstants import CPU_ONLY_BUILD
 from spconv.pytorch.core import IndiceData, SparseConvTensor, ImplicitGemmIndiceData
 from spconv.pytorch.modules import SparseModule
 from spconv.constants import FILTER_HWIO
+from spconv.utils import nullcontext


 def _calculate_fan_in_and_fan_out_hwio(tensor, algo: ConvAlgo):
@@ -205,6 +206,7 @@ class SparseConvolution(SparseModule):
                    self.dilation)
        else:
            out_spatial_shape = spatial_shape
+        # print(self._sparse_unique_name, spatial_shape, out_spatial_shape)
        # input.update_grid(out_spatial_shape)
        # t = time.time()
        out_tensor = input.shadow_copy()
@@ -247,158 +249,165 @@ class SparseConvolution(SparseModule):
            out_tensor = out_tensor.replace_feature(features)
            return out_tensor
        indice_dict = input.indice_dict.copy()
-        
+
        algo = self.algo
-        if self.indice_key is not None :
+        if self.indice_key is not None:
            datas = input.find_indice_pair(self.indice_key)
            if datas is not None:
                msg = "due to limitation of pytorch, you must provide same algo to layers share same indice key."
                assert algo == datas.algo, msg
                # algo = datas.algo
-        if algo == ConvAlgo.Native:
-            datas = input.find_indice_pair(self.indice_key)
-            if datas is not None:
-                assert isinstance(datas, IndiceData)
-            if self.inverse:
-                assert datas is not None and self.indice_key is not None
-                assert datas.is_subm is False, "inverse conv can only be used with standard conv and pool ops."
-
-                outids = datas.indices
-                indice_pairs = datas.indice_pairs
-                indice_pair_num = datas.indice_pair_num
-                out_spatial_shape = datas.out_spatial_shape
-                assert indice_pair_num.shape[0] == np.prod(
-                    self.kernel_size
-                ), "inverse conv must have same kernel size as its couple conv"
-            else:
-                if self.indice_key is not None and datas is not None:
-                    outids = datas.out_indices
+        profile_ctx = nullcontext()
+        if input._timer is not None and self._sparse_unique_name:
+            profile_ctx = input._timer.namespace(self._sparse_unique_name)
+        with profile_ctx:
+            if algo == ConvAlgo.Native:
+                datas = input.find_indice_pair(self.indice_key)
+                if datas is not None:
+                    assert isinstance(datas, IndiceData)
+                if self.inverse:
+                    assert datas is not None and self.indice_key is not None
+                    assert datas.is_subm is False, "inverse conv can only be used with standard conv and pool ops."
+
+                    outids = datas.indices
                    indice_pairs = datas.indice_pairs
                    indice_pair_num = datas.indice_pair_num
+                    out_spatial_shape = datas.out_spatial_shape
+                    assert indice_pair_num.shape[0] == np.prod(
+                        self.kernel_size
+                    ), "inverse conv must have same kernel size as its couple conv"
                else:
-                    if input.benchmark:
-                        torch.cuda.synchronize()
-                        t = time.time()
-                    outids, indice_pairs, indice_pair_num = ops.get_indice_pairs(
-                        indices, batch_size, spatial_shape, algo,
-                        self.kernel_size, self.stride, self.padding,
-                        self.dilation, self.output_padding, self.subm,
-                        self.transposed)
-                    if input.benchmark:
-                        torch.cuda.synchronize()
-                        interval = time.time() - t
-                        out_tensor.benchmark_record[
-                            self.name]["indice_gen_time"].append(interval)
-
-                    indice_data = IndiceData(outids,
-                                             indices,
-                                             indice_pairs,
-                                             indice_pair_num,
-                                             spatial_shape,
-                                             is_subm=self.subm,
-                                             algo=algo)
-                    if self.indice_key is not None:
-                        msg = f"your indice key {self.indice_key} already exists in this sparse tensor."
-                        assert self.indice_key not in indice_dict, msg
-                        indice_dict[self.indice_key] = indice_data
-            if input.benchmark:
-                torch.cuda.synchronize()
-                t = time.time()
-            indice_pairs_calc = indice_pairs
-            if indice_pairs.device != features.device:
-                indice_pairs_calc = indice_pairs.to(features.device)
-            if self.subm:
-                out_features = Fsp.indice_subm_conv(features, self.weight,
-                                                    indice_pairs_calc,
-                                                    indice_pair_num,
-                                                    outids.shape[0], algo)
-            else:
-                if self.inverse:
-                    out_features = Fsp.indice_inverse_conv(
+                    if self.indice_key is not None and datas is not None:
+                        outids = datas.out_indices
+                        indice_pairs = datas.indice_pairs
+                        indice_pair_num = datas.indice_pair_num
+                    else:
+                        if input.benchmark:
+                            torch.cuda.synchronize()
+                            t = time.time()
+                        outids, indice_pairs, indice_pair_num = ops.get_indice_pairs(
+                            indices, batch_size, spatial_shape, algo,
+                            self.kernel_size, self.stride, self.padding,
+                            self.dilation, self.output_padding, self.subm,
+                            self.transposed)
+                        if input.benchmark:
+                            torch.cuda.synchronize()
+                            interval = time.time() - t
+                            out_tensor.benchmark_record[
+                                self.name]["indice_gen_time"].append(interval)
+
+                        indice_data = IndiceData(outids,
+                                                 indices,
+                                                 indice_pairs,
+                                                 indice_pair_num,
+                                                 spatial_shape,
+                                                 is_subm=self.subm,
+                                                 algo=algo)
+                        if self.indice_key is not None:
+                            msg = f"your indice key {self.indice_key} already exists in this sparse tensor."
+                            assert self.indice_key not in indice_dict, msg
+                            indice_dict[self.indice_key] = indice_data
+                if input.benchmark:
+                    torch.cuda.synchronize()
+                    t = time.time()
+                indice_pairs_calc = indice_pairs
+                if indice_pairs.device != features.device:
+                    indice_pairs_calc = indice_pairs.to(features.device)
+                if self.subm:
+                    out_features = Fsp.indice_subm_conv(
                        features, self.weight, indice_pairs_calc,
-                        indice_pair_num, outids.shape[0], algo)
+                        indice_pair_num, outids.shape[0], algo, input._timer)
                else:
-                    out_features = Fsp.indice_conv(features, self.weight,
-                                                   indice_pairs_calc,
-                                                   indice_pair_num,
-                                                   outids.shape[0], algo)
-
-        else:
-            datas = input.find_indice_pair(self.indice_key)
-            if datas is not None:
-                assert isinstance(datas, ImplicitGemmIndiceData)
-            if self.inverse:
-                assert datas is not None and self.indice_key is not None
-                assert datas.is_subm is False, "inverse conv can only be used with standard conv and pool ops."
-                outids = datas.indices
-                pair_fwd = datas.pair_bwd
-                pair_bwd = datas.pair_fwd
-                pair_mask_fwd_splits = datas.pair_mask_bwd_splits
-                pair_mask_bwd_splits = datas.pair_mask_fwd_splits
-                mask_argsort_fwd_splits = datas.mask_argsort_bwd_splits
-                mask_argsort_bwd_splits = datas.mask_argsort_fwd_splits
-                masks = datas.masks
+                    if self.inverse:
+                        out_features = Fsp.indice_inverse_conv(
+                            features, self.weight, indice_pairs_calc,
+                            indice_pair_num, outids.shape[0], algo)
+                    else:
+                        out_features = Fsp.indice_conv(features, self.weight,
+                                                       indice_pairs_calc,
+                                                       indice_pair_num,
+                                                       outids.shape[0], algo,
+                                                       input._timer)

            else:
-                if self.indice_key is not None and datas is not None:
-                    outids = datas.out_indices
-                    pair_fwd = datas.pair_fwd
-                    pair_bwd = datas.pair_bwd
-                    pair_mask_fwd_splits = datas.pair_mask_fwd_splits
-                    pair_mask_bwd_splits = datas.pair_mask_bwd_splits
-                    mask_argsort_fwd_splits = datas.mask_argsort_fwd_splits
-                    mask_argsort_bwd_splits = datas.mask_argsort_bwd_splits
+                datas = input.find_indice_pair(self.indice_key)
+                if datas is not None:
+                    assert isinstance(datas, ImplicitGemmIndiceData)
+                if self.inverse:
+                    assert datas is not None and self.indice_key is not None
+                    assert datas.is_subm is False, "inverse conv can only be used with standard conv and pool ops."
+                    outids = datas.indices
+                    pair_fwd = datas.pair_bwd
+                    pair_bwd = datas.pair_fwd
+                    pair_mask_fwd_splits = datas.pair_mask_bwd_splits
+                    pair_mask_bwd_splits = datas.pair_mask_fwd_splits
+                    mask_argsort_fwd_splits = datas.mask_argsort_bwd_splits
+                    mask_argsort_bwd_splits = datas.mask_argsort_fwd_splits
                    masks = datas.masks
+
                else:
-                    res = ops.get_indice_pairs_implicit_gemm(
-                        indices,
-                        batch_size,
-                        spatial_shape,
-                        algo,
-                        ksize=self.kernel_size,
-                        stride=self.stride,
-                        padding=self.padding,
-                        dilation=self.dilation,
-                        out_padding=self.output_padding,
-                        subm=self.subm,
-                        transpose=self.transposed,
-                        is_train=self.training,
-                        alloc=input.thrust_allocator)
-                    outids = res[0]
-                    num_inds_per_loc = res[1]
-                    pair_fwd = res[2]
-                    pair_bwd = res[3]
-                    pair_mask_fwd_splits = res[4]
-                    pair_mask_bwd_splits = res[5]
-                    mask_argsort_fwd_splits = res[6]
-                    mask_argsort_bwd_splits = res[7]
-                    masks = res[8]
-                    if self.indice_key is not None:
-                        indice_data = ImplicitGemmIndiceData(
-                            outids,
-                            indices,
-                            pair_fwd,
-                            pair_bwd,
-                            pair_mask_fwd_splits=pair_mask_fwd_splits,
-                            pair_mask_bwd_splits=pair_mask_bwd_splits,
-                            mask_argsort_fwd_splits=mask_argsort_fwd_splits,
-                            mask_argsort_bwd_splits=mask_argsort_bwd_splits,
-                            masks=masks,
-                            is_subm=self.subm,
-                            out_spatial_shape=out_spatial_shape,
-                            algo=algo)
-                        msg = f"your indice key {self.indice_key} already exists in this sparse tensor."
-                        assert self.indice_key not in indice_dict, msg
-                        indice_dict[self.indice_key] = indice_data
-            if input.benchmark:
-                torch.cuda.synchronize()
-                t = time.time()
-            num_activate_out = outids.shape[0]
-            out_features = Fsp.implicit_gemm(
-                features, self.weight, pair_fwd, pair_bwd,
-                pair_mask_fwd_splits, pair_mask_bwd_splits,
-                mask_argsort_fwd_splits, mask_argsort_bwd_splits,
-                num_activate_out, masks, self.training, self.subm)
+                    if self.indice_key is not None and datas is not None:
+                        outids = datas.out_indices
+                        pair_fwd = datas.pair_fwd
+                        pair_bwd = datas.pair_bwd
+                        pair_mask_fwd_splits = datas.pair_mask_fwd_splits
+                        pair_mask_bwd_splits = datas.pair_mask_bwd_splits
+                        mask_argsort_fwd_splits = datas.mask_argsort_fwd_splits
+                        mask_argsort_bwd_splits = datas.mask_argsort_bwd_splits
+                        masks = datas.masks
+                    else:
+                        with input._timer.namespace("gen_pairs"):
+                            res = ops.get_indice_pairs_implicit_gemm(
+                                indices,
+                                batch_size,
+                                spatial_shape,
+                                algo,
+                                ksize=self.kernel_size,
+                                stride=self.stride,
+                                padding=self.padding,
+                                dilation=self.dilation,
+                                out_padding=self.output_padding,
+                                subm=self.subm,
+                                transpose=self.transposed,
+                                is_train=self.training,
+                                alloc=input.thrust_allocator,
+                                timer=input._timer)
+                        outids = res[0]
+                        num_inds_per_loc = res[1]
+                        pair_fwd = res[2]
+                        pair_bwd = res[3]
+                        pair_mask_fwd_splits = res[4]
+                        pair_mask_bwd_splits = res[5]
+                        mask_argsort_fwd_splits = res[6]
+                        mask_argsort_bwd_splits = res[7]
+                        masks = res[8]
+                        if self.indice_key is not None:
+                            indice_data = ImplicitGemmIndiceData(
+                                outids,
+                                indices,
+                                pair_fwd,
+                                pair_bwd,
+                                pair_mask_fwd_splits=pair_mask_fwd_splits,
+                                pair_mask_bwd_splits=pair_mask_bwd_splits,
+                                mask_argsort_fwd_splits=mask_argsort_fwd_splits,
+                                mask_argsort_bwd_splits=mask_argsort_bwd_splits,
+                                masks=masks,
+                                is_subm=self.subm,
+                                out_spatial_shape=out_spatial_shape,
+                                algo=algo)
+                            msg = f"your indice key {self.indice_key} already exists in this sparse tensor."
+                            assert self.indice_key not in indice_dict, msg
+                            indice_dict[self.indice_key] = indice_data
+                if input.benchmark:
+                    torch.cuda.synchronize()
+                    t = time.time()
+                num_activate_out = outids.shape[0]
+                out_features = Fsp.implicit_gemm(
+                    features, self.weight, pair_fwd, pair_bwd,
+                    pair_mask_fwd_splits, pair_mask_bwd_splits,
+                    mask_argsort_fwd_splits, mask_argsort_bwd_splits,
+                    num_activate_out, masks, self.training, self.subm,
+                    input._timer)
        if self.bias is not None:
            out_features += self.bias
        if input.benchmark:

--- a/spconv/pytorch/core.py
+++ b/spconv/pytorch/core.py
@@ -19,6 +19,7 @@ import torch
 from spconv.core import ConvAlgo
 from spconv.pytorch.constants import PYTORCH_VERSION
 from spconv.pytorch.ops import ThrustSortAllocator
+from spconv.tools import CUDAKernelTimer

 if PYTORCH_VERSION >= [1, 8, 0]:
    try:
@@ -51,13 +52,14 @@ class IndiceData(object):


 class ImplicitGemmIndiceData(object):
-    def __init__(self, out_indices: torch.Tensor, indices: torch.Tensor, pair_fwd: torch.Tensor,
-                 pair_bwd: torch.Tensor,
+    def __init__(self, out_indices: torch.Tensor, indices: torch.Tensor,
+                 pair_fwd: torch.Tensor, pair_bwd: torch.Tensor,
                 pair_mask_fwd_splits: List[torch.Tensor],
                 pair_mask_bwd_splits: List[torch.Tensor],
                 mask_argsort_fwd_splits: List[torch.Tensor],
                 mask_argsort_bwd_splits: List[torch.Tensor],
-                 masks: List[np.ndarray], out_spatial_shape, is_subm: bool, algo: ConvAlgo):
+                 masks: List[np.ndarray], out_spatial_shape, is_subm: bool,
+                 algo: ConvAlgo):
        self.out_indices = out_indices
        self.indices = indices
        self.pair_fwd = pair_fwd
@@ -99,7 +101,8 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):
                 voxel_num: Optional[torch.Tensor] = None,
                 indice_dict: Optional[dict] = None,
                 benchmark: bool = False,
-                 permanent_thrust_allocator: bool = False):
+                 permanent_thrust_allocator: bool = False,
+                 enable_timer: bool = False):
        """
        Args:
            features: [num_points, num_features] feature tensor
@@ -130,9 +133,10 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):
        self.voxel_num = voxel_num  # for tensorrt
        self.benchmark = benchmark
        self.benchmark_record = {}
-        self.thrust_allocator: Optional[ThrustSortAllocator] = None 
+        self.thrust_allocator: Optional[ThrustSortAllocator] = None
        if permanent_thrust_allocator:
            self.thrust_allocator = ThrustSortAllocator(features.device)
+        self._timer = CUDAKernelTimer(enable_timer)

    def replace_feature(self, feature):
        """we need to replace x.features = F.relu(x.features) with x = x.replace_feature(F.relu(x.features))
@@ -144,7 +148,7 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):
        new_spt.benchmark = self.benchmark
        new_spt.benchmark_record = self.benchmark_record
        new_spt.thrust_allocator = self.thrust_allocator
-
+        new_spt._timer = self._timer
        return new_spt

    @property
@@ -174,7 +178,8 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):
    def spatial_size(self):
        return np.prod(self.spatial_shape)

-    def find_indice_pair(self, key) -> Optional[Union[IndiceData, ImplicitGemmIndiceData]]:
+    def find_indice_pair(
+            self, key) -> Optional[Union[IndiceData, ImplicitGemmIndiceData]]:
        if key is None:
            return None
        if key in self.indice_dict:
@@ -208,4 +213,5 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):
                                  self.benchmark)
        tensor.benchmark_record = self.benchmark_record
        tensor.thrust_allocator = self.thrust_allocator
+        tensor._timer = self._timer
        return tensor
--- a/spconv/pytorch/cppcore.py
+++ b/spconv/pytorch/cppcore.py
 # Copyright 2021 Yan Yan
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from cumm import tensorview as tv 
-import torch 
+from cumm import tensorview as tv
+import torch
 from typing import Optional, List
+
 _TORCH_DTYPE_TO_TV = {
    torch.float32: tv.float32,
    torch.float64: tv.float64,
@@ -26,10 +27,13 @@ _TORCH_DTYPE_TO_TV = {
    torch.uint8: tv.uint8,
 }

-def torch_tensor_to_tv(ten: torch.Tensor, dtype: Optional[int] = None, shape: Optional[List[int]] = None):
+
+def torch_tensor_to_tv(ten: torch.Tensor,
+                       dtype: Optional[int] = None,
+                       shape: Optional[List[int]] = None):
    assert ten.is_contiguous(), "must be contiguous tensor"
    ptr = ten.data_ptr()
-    device = ten.device 
+    device = ten.device
    if device.type == "cpu":
        tv_device = -1
    elif device.type == "cuda":
@@ -42,10 +46,12 @@ def torch_tensor_to_tv(ten: torch.Tensor, dtype: Optional[int] = None, shape: Op
        dtype = _TORCH_DTYPE_TO_TV[ten.dtype]
    return tv.from_blob(ptr, shape, dtype, tv_device)

+
 def get_current_stream():
     """Return the ``cuda_stream`` attribute of PyTorch's current CUDA stream.
     NOTE(review): presumably this is the raw stream handle handed to the
     C++ ops as ``stream`` / ``stream_int`` -- confirm against the callers.
     """
     return torch.cuda.current_stream().cuda_stream

+
 if __name__ == "__main__":
    a = torch.rand(2, 2)
    atv = torch_tensor_to_tv(a)
-    print(atv.numpy_view())
\ No newline at end of file
+    print(atv.numpy_view())
--- a/spconv/pytorch/functional.py
+++ b/spconv/pytorch/functional.py
@@ -15,8 +15,9 @@
 import torch
 from torch import nn
 from torch.autograd import Function
-
-import spconv.pytorch.ops as ops
+from typing import Optional
+from spconv.tools import CUDAKernelTimer
+from spconv.pytorch import ops
 import torch.cuda.amp as amp
 from torch.autograd.function import once_differentiable
 import numpy as np
@@ -27,23 +28,32 @@ from typing import List
 class SparseConvFunction(Function):
     @staticmethod
     @amp.custom_fwd(cast_inputs=torch.float16)
-    def forward(ctx, features, filters, indice_pairs, indice_pair_num,
-                num_activate_out, algo):
+    def forward(ctx,
+                features,
+                filters,
+                indice_pairs,
+                indice_pair_num,
+                num_activate_out,
+                algo,
+                timer: CUDAKernelTimer = CUDAKernelTimer(False)):
+        """Run ``ops.indice_conv`` and stash what ``backward`` needs on ``ctx``.
+        The four tensors go through ``save_for_backward``; ``algo`` and
+        ``timer`` are kept as plain ``ctx`` attributes.
+        NOTE: the default ``timer`` is built once, when this ``def`` is
+        evaluated, so every call that omits ``timer`` shares that single
+        ``CUDAKernelTimer(False)`` instance.
+        """
         ctx.save_for_backward(indice_pairs, indice_pair_num, features, filters)
         ctx.algo = algo
+        ctx.timer = timer
         return ops.indice_conv(features,
                                filters,
                                indice_pairs,
                                indice_pair_num,
                                num_activate_out,
                                # positional flag after num_activate_out;
                                # presumably ``inverse`` -- confirm against
                                # the ops.indice_conv signature.
                                False,
-                               algo=algo)
+                               algo=algo,
+                               timer=timer)

    @staticmethod
    @once_differentiable
    @amp.custom_bwd
    def backward(ctx, grad_output):
        indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors
+        timer = ctx.timer

        input_bp, filters_bp = ops.indice_conv_backward(features,
                                                        filters,
@@ -51,18 +61,27 @@ class SparseConvFunction(Function):
                                                        indice_pairs,
                                                        indice_pair_num,
                                                        False,
-                                                        algo=ctx.algo)
+                                                        algo=ctx.algo,
+                                                        timer=timer)

-        return input_bp, filters_bp, None, None, None, None
+        return input_bp, filters_bp, None, None, None, None, None


 class SparseInverseConvFunction(Function):
    @staticmethod
    @amp.custom_fwd(cast_inputs=torch.float16)
-    def forward(ctx, features, filters, indice_pairs, indice_pair_num,
-                num_activate_out, algo):
+    def forward(ctx,
+                features,
+                filters,
+                indice_pairs,
+                indice_pair_num,
+                num_activate_out,
+                algo,
+                timer: CUDAKernelTimer = CUDAKernelTimer(False)):
        ctx.save_for_backward(indice_pairs, indice_pair_num, features, filters)
        ctx.algo = algo
+        ctx.timer = timer
+
        return ops.indice_conv(features,
                               filters,
                               indice_pairs,
@@ -70,13 +89,16 @@ class SparseInverseConvFunction(Function):
                               num_activate_out,
                               True,
                               False,
-                               algo=algo)
+                               algo=algo,
+                               timer=timer)

    @staticmethod
    @once_differentiable
    @amp.custom_bwd
    def backward(ctx, grad_output):
        indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors
+        timer = ctx.timer
+
        input_bp, filters_bp = ops.indice_conv_backward(features,
                                                        filters,
                                                        grad_output,
@@ -84,29 +106,40 @@ class SparseInverseConvFunction(Function):
                                                        indice_pair_num,
                                                        True,
                                                        False,
-                                                        algo=ctx.algo)
+                                                        algo=ctx.algo,
+                                                        timer=timer)

-        return input_bp, filters_bp, None, None, None, None
+        return input_bp, filters_bp, None, None, None, None, None


 class SparseImplicitGemmFunction(Function):
    @staticmethod
    @amp.custom_fwd(cast_inputs=torch.float16)
-    def forward(ctx, features: torch.Tensor, filters: torch.Tensor,
-                pair_fwd: torch.Tensor, pair_bwd: torch.Tensor,
+    def forward(ctx,
+                features: torch.Tensor,
+                filters: torch.Tensor,
+                pair_fwd: torch.Tensor,
+                pair_bwd: torch.Tensor,
                pair_mask_fwd_splits: List[torch.Tensor],
                pair_mask_bwd_splits: List[torch.Tensor],
                mask_argsort_fwd_splits: List[torch.Tensor],
                mask_argsort_bwd_splits: List[torch.Tensor],
-                num_activate_out: int, masks: List[np.ndarray], is_train: bool,
-                is_subm: bool):
+                num_activate_out: int,
+                masks: List[np.ndarray],
+                is_train: bool,
+                is_subm: bool,
+                timer: CUDAKernelTimer = CUDAKernelTimer(False)):

-        out, mask_out, mask_width = ops.implicit_gemm(
-            features, filters, pair_fwd, pair_mask_fwd_splits,
-            mask_argsort_fwd_splits, num_activate_out, masks, is_train, is_subm)
+        out, mask_out, mask_width = ops.implicit_gemm(features, filters,
+                                                      pair_fwd,
+                                                      pair_mask_fwd_splits,
+                                                      mask_argsort_fwd_splits,
+                                                      num_activate_out, masks,
+                                                      is_train, is_subm, timer)
        ctx.save_for_backward(features, filters, pair_fwd, pair_bwd)
        ctx.mask_width = mask_width
        ctx.mask_out = mask_out
+        ctx.timer = timer
        ctx.pair_mask_fwd_splits = pair_mask_fwd_splits
        ctx.mask_argsort_fwd_splits = mask_argsort_fwd_splits
        ctx.pair_mask_bwd_splits = pair_mask_bwd_splits
@@ -130,30 +163,40 @@ class SparseImplicitGemmFunction(Function):
        # num_activate_out = ctx.num_activate_out
        masks = ctx.masks
        is_subm = ctx.is_subm
-
-        input_bp, filters_bp = ops.implicit_gemm_backward(features,
-                                                        filters,
-                                                        grad_output,
-                                                        pair_fwd,
-                                                        pair_bwd,
-                                                        pair_mask_fwd_splits,
-                                                        pair_mask_bwd_splits,
-                                                        mask_argsort_fwd_splits,
-                                                        mask_argsort_bwd_splits,
-                                                        mask_output_fwd=mask_out,
-                                                        masks=masks,
-                                                        mask_width=mask_width,
-                                                        is_subm=is_subm)
-        None_9 = [None] * 10
+        timer = ctx.timer
+        input_bp, filters_bp = ops.implicit_gemm_backward(
+            features,
+            filters,
+            grad_output,
+            pair_fwd,
+            pair_bwd,
+            pair_mask_fwd_splits,
+            pair_mask_bwd_splits,
+            mask_argsort_fwd_splits,
+            mask_argsort_bwd_splits,
+            mask_output_fwd=mask_out,
+            masks=masks,
+            mask_width=mask_width,
+            is_subm=is_subm,
+            timer=timer)
+        None_9 = [None] * 11
        return (input_bp, filters_bp, *None_9)

+
 class SubMConvFunction(Function):
    @staticmethod
    @amp.custom_fwd(cast_inputs=torch.float16)
-    def forward(ctx, features, filters, indice_pairs, indice_pair_num,
-                num_activate_out, algo):
+    def forward(ctx,
+                features,
+                filters,
+                indice_pairs,
+                indice_pair_num,
+                num_activate_out,
+                algo,
+                timer: CUDAKernelTimer = CUDAKernelTimer(False)):
        ctx.save_for_backward(indice_pairs, indice_pair_num, features, filters)
        ctx.algo = algo
+        ctx.timer = timer
        return ops.indice_conv(features,
                               filters,
                               indice_pairs,
@@ -161,13 +204,16 @@ class SubMConvFunction(Function):
                               num_activate_out,
                               False,
                               True,
-                               algo=algo)
+                               algo=algo,
+                               timer=timer)

    @staticmethod
    @once_differentiable
    @amp.custom_bwd
    def backward(ctx, grad_output):
        indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors
+        timer = ctx.timer
+
        input_bp, filters_bp = ops.indice_conv_backward(features,
                                                        filters,
                                                        grad_output,
@@ -175,9 +221,10 @@ class SubMConvFunction(Function):
                                                        indice_pair_num,
                                                        False,
                                                        True,
-                                                        algo=ctx.algo)
+                                                        algo=ctx.algo,
+                                                        timer=timer)

-        return input_bp, filters_bp, None, None, None, None
+        return input_bp, filters_bp, None, None, None, None, None


 class SparseMaxPoolFunction(Function):
@@ -199,12 +246,14 @@ class SparseMaxPoolFunction(Function):
                                               indice_pairs, indice_pair_num)
        return input_bp, None, None, None

+
 class SparseMaxPoolImplicitGemmFunction(Function):
     @staticmethod
     @amp.custom_fwd(cast_inputs=torch.float16)
-    def forward(ctx, features: torch.Tensor, indice_pairs_fwd: torch.Tensor, indice_pairs_bwd: torch.Tensor,
-                   num_activate_out: int):
-        out = ops.indice_maxpool_implicit_gemm(features, indice_pairs_fwd, num_activate_out)
+    def forward(ctx, features: torch.Tensor, indice_pairs_fwd: torch.Tensor,
+                indice_pairs_bwd: torch.Tensor, num_activate_out: int):
+        """Return ``ops.indice_maxpool_implicit_gemm(features,
+        indice_pairs_fwd, num_activate_out)`` and save tensors for backward.
+        """
+        out = ops.indice_maxpool_implicit_gemm(features, indice_pairs_fwd,
+                                               num_activate_out)
         # Only the backward pairs, the input features and the pooled output
         # are saved; indice_pairs_fwd is not kept on ctx.
         ctx.save_for_backward(indice_pairs_bwd, features, out)
         return out

@@ -213,10 +262,11 @@ class SparseMaxPoolImplicitGemmFunction(Function):
    @amp.custom_bwd
    def backward(ctx, grad_output):
        indice_pairs_bwd, features, out = ctx.saved_tensors
-        input_bp = ops.indice_maxpool_implicit_gemm_backward(features, out, grad_output,
-                                               indice_pairs_bwd)
+        input_bp = ops.indice_maxpool_implicit_gemm_backward(
+            features, out, grad_output, indice_pairs_bwd)
        return input_bp, None, None, None

+
 indice_conv = SparseConvFunction.apply
 implicit_gemm = SparseImplicitGemmFunction.apply
 indice_inverse_conv = SparseInverseConvFunction.apply

--- a/spconv/pytorch/modules.py
+++ b/spconv/pytorch/modules.py
 # Copyright 2021 Yan Yan
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

-
 import sys
 import time
 from collections import OrderedDict
@@ -53,6 +52,7 @@ class SparseModule(nn.Module):
     def __init__(self, name=None):
         """Initialise the module and remember the optional ``name``."""
         super().__init__()
         self.name = name
+        # Starts empty; assign_name_for_sparse_modules() in this file
+        # overwrites it with the module's name from named_modules().
+        self._sparse_unique_name = ""


 class SparseSequential(SparseModule):
@@ -143,3 +143,8 @@ class SparseSequential(SparseModule):
                    input = module(input)
        return input

+
+def assign_name_for_sparse_modules(module: nn.Module):
+    """Tag each SparseModule under ``module`` with its ``named_modules()`` name.
+    The name is written to ``_sparse_unique_name``; modules that are not
+    SparseModule instances are left untouched. Nothing is returned.
+    """
+    for k, n in module.named_modules():
+        # k is the name yielded by named_modules(), n the module itself.
+        if isinstance(n, SparseModule):
+            n._sparse_unique_name = k
--- a/spconv/pytorch/ops.py
+++ b/spconv/pytorch/ops.py
@@ -26,14 +26,19 @@ from spconv.pytorch.cppcore import torch_tensor_to_tv, get_current_stream
 from spconv.core_cc.csrc.sparse.all import SpconvOps
 import spconv.core_cc as _ext

+from spconv.utils import nullcontext
+
 if hasattr(_ext, "cumm"):
+    CPU_ONLY_BUILD = False
    from spconv.algo import GEMM, CONV  # , GATHER, SCATTER
 else:
-    GEMM = None 
-    CONV = None 
+    CPU_ONLY_BUILD = True
+    GEMM = None
+    CONV = None
 import time
 from spconv.constants import FILTER_HWIO
 from cumm.gemm import codeops
+from spconv.tools import CUDAKernelTimer

 DEBUG = False

@@ -240,19 +245,21 @@ def get_indice_pairs(indices: torch.Tensor,
    return out_inds, pair, indice_num_per_loc


-def get_indice_pairs_implicit_gemm(indices: torch.Tensor,
-                                   batch_size: int,
-                                   spatial_shape: List[int],
-                                   algo: ConvAlgo,
-                                   ksize: List[int],
-                                   stride: List[int],
-                                   padding: List[int],
-                                   dilation: List[int],
-                                   out_padding: List[int],
-                                   subm: bool = False,
-                                   transpose: bool = False,
-                                   is_train: bool = True,
-                                   alloc: Optional[ThrustSortAllocator] = None):
+def get_indice_pairs_implicit_gemm(
+    indices: torch.Tensor,
+    batch_size: int,
+    spatial_shape: List[int],
+    algo: ConvAlgo,
+    ksize: List[int],
+    stride: List[int],
+    padding: List[int],
+    dilation: List[int],
+    out_padding: List[int],
+    subm: bool = False,
+    transpose: bool = False,
+    is_train: bool = True,
+    alloc: Optional[ThrustSortAllocator] = None,
+    timer: CUDAKernelTimer = CUDAKernelTimer(False)):
    """
    Why return tuple? because pytorch seems don't support custom object in autograd.
    return: (
@@ -336,18 +343,18 @@ def get_indice_pairs_implicit_gemm(indices: torch.Tensor,
        out_inds_tv = torch_tensor_to_tv(out_inds)
        hashdata_tv = torch_tensor_to_tv(hashdata, dtype=tv.custom64)
        pair_mask_tv = torch_tensor_to_tv(pair_mask, dtype=tv.uint32)
-
-        SpconvOps.generate_subm_conv_inds(inds_tv,
-                                          hashdata_tv,
-                                          pair_tv,
-                                          out_inds_tv,
-                                          indice_num_per_loc_tv,
-                                          batch_size=batch_size,
-                                          input_dims=spatial_shape,
-                                          ksize=ksize,
-                                          dilation=dilation,
-                                          indice_pair_mask=pair_mask_tv,
-                                          stream_int=stream)
+        with timer.record("gen_subm_inds", stream):
+            SpconvOps.generate_subm_conv_inds(inds_tv,
+                                              hashdata_tv,
+                                              pair_tv,
+                                              out_inds_tv,
+                                              indice_num_per_loc_tv,
+                                              batch_size=batch_size,
+                                              input_dims=spatial_shape,
+                                              ksize=ksize,
+                                              dilation=dilation,
+                                              indice_pair_mask=pair_mask_tv,
+                                              stream_int=stream)
        # torch.cuda.synchronize()
        # print("SUBM0", time.time() - t)
        # CONV.stream_synchronize(stream)
@@ -358,13 +365,15 @@ def get_indice_pairs_implicit_gemm(indices: torch.Tensor,
        mask_argsort_tv = torch_tensor_to_tv(mask_argsort)
        if alloc is None:
            alloc = ThrustSortAllocator(indices.device)
-        for j in range(mask_split_count):
-            # thrust don't provide two-step sort (first step return workspace size)
-            # so I use this stupid hack to use torch allocator without touch
-            # pytorch binary (c++).
-            # f**k thrust
-            SpconvOps.sort_1d_by_key_allocator(pair_mask_tv[j], alloc.alloc,
-                                               mask_argsort_tv[j], stream)
+        with timer.record("gen_subm_inds_sort", stream):
+            for j in range(mask_split_count):
+                # thrust don't provide two-step sort (first step return workspace size)
+                # so I use this stupid hack to use torch allocator without touch
+                # pytorch binary (c++).
+                # f**k thrust
+                SpconvOps.sort_1d_by_key_allocator(pair_mask_tv[j],
+                                                   alloc.alloc,
+                                                   mask_argsort_tv[j], stream)
        # CONV.stream_synchronize(stream)
        pair_mask_in_splits = [pair_mask[i] for i in range(mask_split_count)]
        mask_argsort_in_splits = [
@@ -391,20 +400,20 @@ def get_indice_pairs_implicit_gemm(indices: torch.Tensor,
                                        dtype=indices.dtype,
                                        device=indices.device)
        indice_pairs_uniq_tv = torch_tensor_to_tv(indice_pairs_uniq)
-
-        SpconvOps.generate_conv_inds_mask_stage1(inds_tv,
-                                                 pair_bwd_tv,
-                                                 indice_pairs_uniq_tv,
-                                                 indice_num_per_loc_tv,
-                                                 batch_size=batch_size,
-                                                 output_dims=out_shape,
-                                                 input_dims=spatial_shape,
-                                                 ksize=ksize,
-                                                 stride=stride,
-                                                 padding=padding,
-                                                 dilation=dilation,
-                                                 transposed=transpose,
-                                                 stream_int=stream)
+        with timer.record("gen_conv_inds_stage1", stream):
+            SpconvOps.generate_conv_inds_mask_stage1(inds_tv,
+                                                     pair_bwd_tv,
+                                                     indice_pairs_uniq_tv,
+                                                     indice_num_per_loc_tv,
+                                                     batch_size=batch_size,
+                                                     output_dims=out_shape,
+                                                     input_dims=spatial_shape,
+                                                     ksize=ksize,
+                                                     stride=stride,
+                                                     padding=padding,
+                                                     dilation=dilation,
+                                                     transposed=transpose,
+                                                     stream_int=stream)
        if DEBUG:

            CONV.stream_synchronize(stream)
@@ -452,25 +461,25 @@ def get_indice_pairs_implicit_gemm(indices: torch.Tensor,
            CONV.stream_synchronize(stream)
            print("REGU_S2_PREPARE", time.time() - t)
            t = time.time()
-
-        SpconvOps.generate_conv_inds_mask_stage2(inds_tv,
-                                                 hashdata_tv,
-                                                 pair_fwd_tv,
-                                                 pair_bwd_tv,
-                                                 uniq_res_tv,
-                                                 out_inds_tv,
-                                                 pair_mask_fwd_tv,
-                                                 pair_mask_bwd_tv,
-                                                 num_out_act=num_act_out,
-                                                 batch_size=batch_size,
-                                                 output_dims=out_shape,
-                                                 input_dims=spatial_shape,
-                                                 ksize=ksize,
-                                                 stride=stride,
-                                                 padding=padding,
-                                                 dilation=dilation,
-                                                 transposed=transpose,
-                                                 stream_int=stream)
+        with timer.record("gen_conv_inds_stage2", stream):
+            SpconvOps.generate_conv_inds_mask_stage2(inds_tv,
+                                                     hashdata_tv,
+                                                     pair_fwd_tv,
+                                                     pair_bwd_tv,
+                                                     uniq_res_tv,
+                                                     out_inds_tv,
+                                                     pair_mask_fwd_tv,
+                                                     pair_mask_bwd_tv,
+                                                     num_out_act=num_act_out,
+                                                     batch_size=batch_size,
+                                                     output_dims=out_shape,
+                                                     input_dims=spatial_shape,
+                                                     ksize=ksize,
+                                                     stride=stride,
+                                                     padding=padding,
+                                                     dilation=dilation,
+                                                     transposed=transpose,
+                                                     stream_int=stream)
        if DEBUG:

            CONV.stream_synchronize(stream)
@@ -492,62 +501,61 @@ def get_indice_pairs_implicit_gemm(indices: torch.Tensor,
            mask_argsort_bwd_tv = torch_tensor_to_tv(mask_argsort_bwd)
        if alloc is None:
            alloc = ThrustSortAllocator(indices.device)
-
-        if is_mask_split:
-            for j in range(mask_split_count):
-                mask_tv = tv.from_numpy(masks[j])
-                # here we try to ensure only call allocator once.
-                if not is_train:
-                    SpconvOps.sort_1d_by_key_split_allocator(
-                        pair_mask_fwd_tv[j], alloc.alloc, mask_tv,
-                        mask_argsort_fwd_tv[j], stream)
-                else:
-                    if pair_mask_bwd_tv.dim(1) > pair_mask_fwd_tv.dim(1):
-                        SpconvOps.sort_1d_by_key_split_allocator(
-                            pair_mask_bwd_tv[j], alloc.alloc, mask_tv,
-                            mask_argsort_bwd_tv[j], stream)
+        with timer.record("gen_conv_inds_sort", stream):
+            if is_mask_split:
+                for j in range(mask_split_count):
+                    mask_tv = tv.from_numpy(masks[j])
+                    # here we try to ensure only call allocator once.
+                    if not is_train:
                        SpconvOps.sort_1d_by_key_split_allocator(
                            pair_mask_fwd_tv[j], alloc.alloc, mask_tv,
                            mask_argsort_fwd_tv[j], stream)
                    else:
-                        SpconvOps.sort_1d_by_key_split_allocator(
-                            pair_mask_fwd_tv[j], alloc.alloc, mask_tv,
-                            mask_argsort_fwd_tv[j], stream)
-                        SpconvOps.sort_1d_by_key_split_allocator(
-                            pair_mask_bwd_tv[j], alloc.alloc, mask_tv,
-                            mask_argsort_bwd_tv[j], stream)
-
-                # SpconvOps.sort_1d_by_key_split(pair_mask_fwd_tv[j], mask_tv,
-                #                                mask_argsort_fwd_tv[j], stream)
-                # if is_train:
-                #     SpconvOps.sort_1d_by_key_split(pair_mask_bwd_tv[j],
-                #                                    mask_tv,
-                #                                    mask_argsort_bwd_tv[j],
-                #                                    stream)
+                        if pair_mask_bwd_tv.dim(1) > pair_mask_fwd_tv.dim(1):
+                            SpconvOps.sort_1d_by_key_split_allocator(
+                                pair_mask_bwd_tv[j], alloc.alloc, mask_tv,
+                                mask_argsort_bwd_tv[j], stream)
+                            SpconvOps.sort_1d_by_key_split_allocator(
+                                pair_mask_fwd_tv[j], alloc.alloc, mask_tv,
+                                mask_argsort_fwd_tv[j], stream)
+                        else:
+                            SpconvOps.sort_1d_by_key_split_allocator(
+                                pair_mask_fwd_tv[j], alloc.alloc, mask_tv,
+                                mask_argsort_fwd_tv[j], stream)
+                            SpconvOps.sort_1d_by_key_split_allocator(
+                                pair_mask_bwd_tv[j], alloc.alloc, mask_tv,
+                                mask_argsort_bwd_tv[j], stream)
+
+                    # SpconvOps.sort_1d_by_key_split(pair_mask_fwd_tv[j], mask_tv,
+                    #                                mask_argsort_fwd_tv[j], stream)
+                    # if is_train:
+                    #     SpconvOps.sort_1d_by_key_split(pair_mask_bwd_tv[j],
+                    #                                    mask_tv,
+                    #                                    mask_argsort_bwd_tv[j],
+                    #                                    stream)

-        else:
-            # if pair_mask_bwd_tv.dim(1) > pair_mask_fwd_tv.dim(1):
-            if not is_train:
-                SpconvOps.sort_1d_by_key_allocator(pair_mask_fwd_tv[0],
-                                                alloc.alloc,
-                                                mask_argsort_fwd_tv[0], stream)
            else:
-                if pair_mask_bwd_tv.dim(1) > pair_mask_fwd_tv.dim(1):
-                    SpconvOps.sort_1d_by_key_allocator(pair_mask_bwd_tv[0],
-                                                    alloc.alloc,
-                                                    mask_argsort_bwd_tv[0],
-                                                    stream)
+                # if pair_mask_bwd_tv.dim(1) > pair_mask_fwd_tv.dim(1):
+                if not is_train:
                    SpconvOps.sort_1d_by_key_allocator(pair_mask_fwd_tv[0],
-                                                    alloc.alloc,
-                                                    mask_argsort_fwd_tv[0], stream)
+                                                       alloc.alloc,
+                                                       mask_argsort_fwd_tv[0],
+                                                       stream)
                else:
-                    SpconvOps.sort_1d_by_key_allocator(pair_mask_fwd_tv[0],
-                                                    alloc.alloc,
-                                                    mask_argsort_fwd_tv[0], stream)
-                    SpconvOps.sort_1d_by_key_allocator(pair_mask_bwd_tv[0],
-                                                    alloc.alloc,
-                                                    mask_argsort_bwd_tv[0],
-                                                    stream)
+                    if pair_mask_bwd_tv.dim(1) > pair_mask_fwd_tv.dim(1):
+                        SpconvOps.sort_1d_by_key_allocator(
+                            pair_mask_bwd_tv[0], alloc.alloc,
+                            mask_argsort_bwd_tv[0], stream)
+                        SpconvOps.sort_1d_by_key_allocator(
+                            pair_mask_fwd_tv[0], alloc.alloc,
+                            mask_argsort_fwd_tv[0], stream)
+                    else:
+                        SpconvOps.sort_1d_by_key_allocator(
+                            pair_mask_fwd_tv[0], alloc.alloc,
+                            mask_argsort_fwd_tv[0], stream)
+                        SpconvOps.sort_1d_by_key_allocator(
+                            pair_mask_bwd_tv[0], alloc.alloc,
+                            mask_argsort_bwd_tv[0], stream)
        if DEBUG:
            CONV.stream_synchronize(stream)
            print("REGU_S2_FINISH", time.time() - t)
@@ -587,7 +595,8 @@ def indice_conv(features: torch.Tensor,
                num_activate_out: int,
                inverse: bool = False,
                subm: bool = False,
-                algo: ConvAlgo = ConvAlgo.Native):
+                algo: ConvAlgo = ConvAlgo.Native,
+                timer: CUDAKernelTimer = CUDAKernelTimer(False)):
    # filters: RSKC
    # stream = get_current_stream()
    # CONV.stream_synchronize(stream)
@@ -717,38 +726,38 @@ def indice_conv(features: torch.Tensor,
            stream=stream)
    # CONV.stream_synchronize(stream)
    # t = time.time()
-
-    for i, nhot in enumerate(indice_pair_num_cpu):
-        if subm and i == kv_center:
-            continue
-        if subm and i > kv_center:
-            nhot = indice_pair_num_cpu[kv - i - 1]
-        if nhot <= 0:
-            continue
-        inp_indices = pair_in[i].slice_first_axis(0, nhot)
-        out_indices = pair_out[i].slice_first_axis(0, nhot)
-        b = filters_tv[i]
-        # inp @ filter.T, NC @ KC
-        beta = 1.0 if inited else 0.0
-        algo_desp = GEMM.run_with_tuned_result(
-            tuned_res,
-            a,
-            b,
-            c,
-            False,
-            False if FILTER_HWIO else True,
-            False,
-            arch=arch,
-            stream=stream,
-            shuffle_type=ShuffleStrideType.ShuffleAC,
-            a_inds=inp_indices,
-            c_inds=out_indices,
-            hint=AlgoHint.Fowrard.value,
-            alpha=1.0,
-            beta=beta)
-
-        # gather_times += gather_time
-        inited = True
+    with timer.record("forward", stream):
+        for i, nhot in enumerate(indice_pair_num_cpu):
+            if subm and i == kv_center:
+                continue
+            if subm and i > kv_center:
+                nhot = indice_pair_num_cpu[kv - i - 1]
+            if nhot <= 0:
+                continue
+            inp_indices = pair_in[i].slice_first_axis(0, nhot)
+            out_indices = pair_out[i].slice_first_axis(0, nhot)
+            b = filters_tv[i]
+            # inp @ filter.T, NC @ KC
+            beta = 1.0 if inited else 0.0
+            algo_desp = GEMM.run_with_tuned_result(
+                tuned_res,
+                a,
+                b,
+                c,
+                False,
+                False if FILTER_HWIO else True,
+                False,
+                arch=arch,
+                stream=stream,
+                shuffle_type=ShuffleStrideType.ShuffleAC,
+                a_inds=inp_indices,
+                c_inds=out_indices,
+                hint=AlgoHint.Fowrard.value,
+                alpha=1.0,
+                beta=beta)
+
+            # gather_times += gather_time
+            inited = True
    # CONV.stream_synchronize(stream)
    # print(out_features.mean(), out_features.max(), out_features.min())

@@ -770,7 +779,8 @@ def indice_conv_backward(features: torch.Tensor,
                         indice_pair_num: torch.Tensor,
                         inverse: bool = False,
                         subm: bool = False,
-                         algo: ConvAlgo = ConvAlgo.Native):
+                         algo: ConvAlgo = ConvAlgo.Native,
+                         timer: CUDAKernelTimer = CUDAKernelTimer(False)):
    # print(out_bp.mean(), out_bp.max(), out_bp.min())

    num_activate_out = out_bp.shape[0]
@@ -1046,12 +1056,16 @@ def indice_conv_backward(features: torch.Tensor,
    return (din, dfilters.reshape(filters_shape))


-def implicit_gemm(features: torch.Tensor, filters: torch.Tensor,
+def implicit_gemm(features: torch.Tensor,
+                  filters: torch.Tensor,
                  pair_fwd: torch.Tensor,
                  pair_mask_fwd_splits: List[torch.Tensor],
                  mask_argsort_fwd_splits: List[torch.Tensor],
-                  num_activate_out: int, masks: List[np.ndarray],
-                  is_train: bool, is_subm: bool):
+                  num_activate_out: int,
+                  masks: List[np.ndarray],
+                  is_train: bool,
+                  is_subm: bool,
+                  timer: CUDAKernelTimer = CUDAKernelTimer(False)):
    stream = get_current_stream()
    # if DEBUG:

@@ -1136,24 +1150,25 @@ def implicit_gemm(features: torch.Tensor, filters: torch.Tensor,
    # CONV.stream_synchronize(stream)

    # t = time.time()
-
-    for j in range(num_split):
-        beta = 0 if j == 0 else 1
-        CONV.run_with_tuned_result(tune_res,
-                                   ConvOpType.kForward,
-                                   features_tv,
-                                   filters_tv,
-                                   out_features_tv,
-                                   mask=pair_mask_fwd_split_tvs[j],
-                                   mask_argsort=mask_argsort_fwd_split_tvs[j],
-                                   mask_output=mask_output_fwd_tvs[j],
-                                   indices=pair_fwd_tv,
-                                   reverse_mask=False,
-                                   mask_filter=masks_ints[j],
-                                   mask_width=-1,
-                                   beta=beta,
-                                   stream=stream,
-                                   verbose=False)
+    with timer.record("implicit_gemm", stream):
+        for j in range(num_split):
+            beta = 0 if j == 0 else 1
+            CONV.run_with_tuned_result(
+                tune_res,
+                ConvOpType.kForward,
+                features_tv,
+                filters_tv,
+                out_features_tv,
+                mask=pair_mask_fwd_split_tvs[j],
+                mask_argsort=mask_argsort_fwd_split_tvs[j],
+                mask_output=mask_output_fwd_tvs[j],
+                indices=pair_fwd_tv,
+                reverse_mask=False,
+                mask_filter=masks_ints[j],
+                mask_width=-1,
+                beta=beta,
+                stream=stream,
+                verbose=False)

    # torch.cuda.synchronize()
    # if DEBUG:
@@ -1166,16 +1181,20 @@ def implicit_gemm(features: torch.Tensor, filters: torch.Tensor,
    return out_features, mask_output_fwd, mask_width


-def implicit_gemm_backward(features: torch.Tensor, filters: torch.Tensor,
-                           out_bp: torch.Tensor, pair_fwd: torch.Tensor,
+def implicit_gemm_backward(features: torch.Tensor,
+                           filters: torch.Tensor,
+                           out_bp: torch.Tensor,
+                           pair_fwd: torch.Tensor,
                           pair_bwd: torch.Tensor,
                           pair_mask_fwd_splits: List[torch.Tensor],
                           pair_mask_bwd_splits: List[torch.Tensor],
                           mask_argsort_fwd_splits: List[torch.Tensor],
                           mask_argsort_bwd_splits: List[torch.Tensor],
                           mask_output_fwd: torch.Tensor,
-                           masks: List[np.ndarray], mask_width: int,
-                           is_subm: bool):
+                           masks: List[np.ndarray],
+                           mask_width: int,
+                           is_subm: bool,
+                           timer: CUDAKernelTimer = CUDAKernelTimer(False)):
    # print(out_bp.mean(), out_bp.max(), out_bp.min())
    if features.dtype == torch.int8 or features.dtype == torch.qint8:
        raise NotImplementedError("work in progress")
@@ -1287,44 +1306,46 @@ def implicit_gemm_backward(features: torch.Tensor, filters: torch.Tensor,
                                dtype=torch.int8,
                                device=features.device)
        workspace_tv = torch_tensor_to_tv(workspace)
-    for j in range(num_split):
-        beta = 0 if j == 0 else 1
-        if is_subm:
-            mask = pair_mask_fwd_split_tvs[j]
-            mask_argsort = mask_argsort_fwd_split_tvs[j]
-        else:
-            mask = pair_mask_bwd_split_tvs[j]
-            mask_argsort = mask_argsort_bwd_split_tvs[j]
-
-        CONV.run_with_tuned_result(dgrad_tune_res,
-                                   ConvOpType.kBackwardInput,
-                                   din_tv,
-                                   filters_tv,
-                                   dout_tv,
-                                   mask=mask,
-                                   mask_argsort=mask_argsort,
-                                   mask_output=tv.Tensor(),
-                                   indices=pair_bwd_tv,
-                                   reverse_mask=is_subm,
-                                   mask_filter=masks[j].item(),
-                                   mask_width=-1,
-                                   beta=beta,
-                                   stream=stream)
-        CONV.run_with_tuned_result(wgrad_tune_res,
-                                   ConvOpType.kBackwardWeight,
-                                   features_tv,
-                                   dfilters_tv,
-                                   dout_tv,
-                                   mask=mask_output_fwd_tv[j],
-                                   mask_argsort=mask_argsort_fwd_split_tvs[j],
-                                   mask_output=tv.Tensor(),
-                                   indices=pair_fwd_tv,
-                                   reverse_mask=False,
-                                   mask_filter=masks[j].item(),
-                                   mask_width=mask_width,
-                                   beta=beta,
-                                   workspace=workspace_tv,
-                                   stream=stream)
+    with timer.record("implicit_gemm_backward", stream):
+        for j in range(num_split):
+            beta = 0 if j == 0 else 1
+            if is_subm:
+                mask = pair_mask_fwd_split_tvs[j]
+                mask_argsort = mask_argsort_fwd_split_tvs[j]
+            else:
+                mask = pair_mask_bwd_split_tvs[j]
+                mask_argsort = mask_argsort_bwd_split_tvs[j]
+
+            CONV.run_with_tuned_result(dgrad_tune_res,
+                                       ConvOpType.kBackwardInput,
+                                       din_tv,
+                                       filters_tv,
+                                       dout_tv,
+                                       mask=mask,
+                                       mask_argsort=mask_argsort,
+                                       mask_output=tv.Tensor(),
+                                       indices=pair_bwd_tv,
+                                       reverse_mask=is_subm,
+                                       mask_filter=masks[j].item(),
+                                       mask_width=-1,
+                                       beta=beta,
+                                       stream=stream)
+            CONV.run_with_tuned_result(
+                wgrad_tune_res,
+                ConvOpType.kBackwardWeight,
+                features_tv,
+                dfilters_tv,
+                dout_tv,
+                mask=mask_output_fwd_tv[j],
+                mask_argsort=mask_argsort_fwd_split_tvs[j],
+                mask_output=tv.Tensor(),
+                indices=pair_fwd_tv,
+                reverse_mask=False,
+                mask_filter=masks[j].item(),
+                mask_width=mask_width,
+                beta=beta,
+                workspace=workspace_tv,
+                stream=stream)

    return (din, dfilters.reshape(filters_shape))

@@ -1445,4 +1466,3 @@ def indice_maxpool_implicit_gemm_backward(features, out_features, out_bp,
                                             out_bp_tv, din_tv,
                                             indice_pairs_tv, stream)
    return din
-
--- a/spconv/pytorch/pool.py
+++ b/spconv/pytorch/pool.py
@@ -24,11 +24,12 @@ from typing import List, Optional, Tuple, Union

 from spconv import pytorch as spconv
 from spconv.core import ConvAlgo
-import spconv.pytorch.functional as Fsp
+from spconv.pytorch import functional as Fsp
 from spconv.pytorch import ops
 from spconv.pytorch.core import IndiceData, ImplicitGemmIndiceData
 from spconv.pytorch.modules import SparseModule
 from spconv.cppconstants import CPU_ONLY_BUILD
+from spconv.utils import nullcontext


 class SparseMaxPool(SparseModule):
@@ -126,79 +127,87 @@ class SparseMaxPool(SparseModule):
        if input.benchmark:
            torch.cuda.synchronize()
            t = time.time()
-        out_padding = [0] * self.ndim 
+        out_padding = [0] * self.ndim
        indice_dict = input.indice_dict.copy()
-        if self.algo == ConvAlgo.Native:
-            outids, indice_pairs, indice_pairs_num = ops.get_indice_pairs(
-                indices, batch_size, spatial_shape, ConvAlgo.Native,
-                self.kernel_size, self.stride, self.padding, self.dilation, out_padding,
-                False)
-            if input.benchmark:
-                torch.cuda.synchronize()
-                interval = time.time() - t
-                out_tensor.benchmark_record[
-                    self.name]["indice_gen_time"].append(interval)
-                t = time.time()
+        profile_ctx = nullcontext()
+        if input._timer is not None and self._sparse_unique_name:
+            profile_ctx = input._timer.namespace(self._sparse_unique_name)
+        with profile_ctx:
+            if self.algo == ConvAlgo.Native:
+                outids, indice_pairs, indice_pairs_num = ops.get_indice_pairs(
+                    indices, batch_size, spatial_shape, ConvAlgo.Native,
+                    self.kernel_size, self.stride, self.padding, self.dilation,
+                    out_padding, False)
+                if input.benchmark:
+                    torch.cuda.synchronize()
+                    interval = time.time() - t
+                    out_tensor.benchmark_record[
+                        self.name]["indice_gen_time"].append(interval)
+                    t = time.time()

-            if self.indice_key is not None:
-                datas = input.find_indice_pair(self.indice_key)
-                if datas is None:
-                    indice_data = IndiceData(outids,
-                                             indices,
-                                             indice_pairs,
-                                             indice_pairs_num,
-                                             spatial_shape,
-                                             is_subm=False,
-                                             algo=self.algo)
-                    indice_dict[self.indice_key] = indice_data
-                else:
-                    raise ValueError(f"indice key {self.indice_key} exists")
+                if self.indice_key is not None:
+                    datas = input.find_indice_pair(self.indice_key)
+                    if datas is None:
+                        indice_data = IndiceData(outids,
+                                                 indices,
+                                                 indice_pairs,
+                                                 indice_pairs_num,
+                                                 spatial_shape,
+                                                 is_subm=False,
+                                                 algo=self.algo)
+                        indice_dict[self.indice_key] = indice_data
+                    else:
+                        raise ValueError(
+                            f"indice key {self.indice_key} exists")

-            out_features = Fsp.indice_maxpool(features,
-                                              indice_pairs.to(device),
-                                              indice_pairs_num.to(device),
-                                              outids.shape[0])
-        else:
-            res = ops.get_indice_pairs_implicit_gemm(indices,
-                                                     batch_size,
-                                                     spatial_shape,
-                                                     self.algo,
-                                                     ksize=self.kernel_size,
-                                                     stride=self.stride,
-                                                     padding=self.padding,
-                                                     dilation=self.dilation,
-                                                     out_padding=out_padding,
-                                                     subm=self.subm,
-                                                     is_train=self.training,
-                                                     alloc=input.thrust_allocator)
-            outids = res[0]
-            num_inds_per_loc = res[1]
-            pair_fwd = res[2]
-            pair_bwd = res[3]
-            pair_mask_fwd_splits = res[4]
-            pair_mask_bwd_splits = res[5]
-            mask_argsort_fwd_splits = res[6]
-            mask_argsort_bwd_splits = res[7]
-            masks = res[8]
-            if self.indice_key is not None:
-                indice_data = ImplicitGemmIndiceData(
-                    outids,
-                    indices,
-                    pair_fwd,
-                    pair_bwd,
-                    pair_mask_fwd_splits=pair_mask_fwd_splits,
-                    pair_mask_bwd_splits=pair_mask_bwd_splits,
-                    mask_argsort_fwd_splits=mask_argsort_fwd_splits,
-                    mask_argsort_bwd_splits=mask_argsort_bwd_splits,
-                    masks=masks,
-                    is_subm=self.subm,
-                    out_spatial_shape=out_spatial_shape,
-                    algo=self.algo)
-                msg = f"your indice key {self.indice_key} already exists in this sparse tensor."
-                assert self.indice_key not in indice_dict, msg
-                indice_dict[self.indice_key] = indice_data
-            out_features = Fsp.indice_maxpool_implicit_gemm(
-                features, pair_fwd, pair_bwd, outids.shape[0])
+                out_features = Fsp.indice_maxpool(features,
+                                                  indice_pairs.to(device),
+                                                  indice_pairs_num.to(device),
+                                                  outids.shape[0])
+            else:
+                with input._timer.namespace("gen_pairs"):
+                    res = ops.get_indice_pairs_implicit_gemm(
+                        indices,
+                        batch_size,
+                        spatial_shape,
+                        self.algo,
+                        ksize=self.kernel_size,
+                        stride=self.stride,
+                        padding=self.padding,
+                        dilation=self.dilation,
+                        out_padding=out_padding,
+                        subm=self.subm,
+                        is_train=self.training,
+                        alloc=input.thrust_allocator,
+                        timer=input._timer)
+                outids = res[0]
+                num_inds_per_loc = res[1]
+                pair_fwd = res[2]
+                pair_bwd = res[3]
+                pair_mask_fwd_splits = res[4]
+                pair_mask_bwd_splits = res[5]
+                mask_argsort_fwd_splits = res[6]
+                mask_argsort_bwd_splits = res[7]
+                masks = res[8]
+                if self.indice_key is not None:
+                    indice_data = ImplicitGemmIndiceData(
+                        outids,
+                        indices,
+                        pair_fwd,
+                        pair_bwd,
+                        pair_mask_fwd_splits=pair_mask_fwd_splits,
+                        pair_mask_bwd_splits=pair_mask_bwd_splits,
+                        mask_argsort_fwd_splits=mask_argsort_fwd_splits,
+                        mask_argsort_bwd_splits=mask_argsort_bwd_splits,
+                        masks=masks,
+                        is_subm=self.subm,
+                        out_spatial_shape=out_spatial_shape,
+                        algo=self.algo)
+                    msg = f"your indice key {self.indice_key} already exists in this sparse tensor."
+                    assert self.indice_key not in indice_dict, msg
+                    indice_dict[self.indice_key] = indice_data
+                out_features = Fsp.indice_maxpool_implicit_gemm(
+                    features, pair_fwd, pair_bwd, outids.shape[0])

        if input.benchmark:
            torch.cuda.synchronize()

--- a/spconv/pytorch/spatial.py
+++ b/spconv/pytorch/spatial.py
 # Copyright 2021 Yan Yan
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

--- a/spconv/pytorch/tables.py
+++ b/spconv/pytorch/tables.py
 # Copyright 2021 Yan Yan
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -15,18 +15,18 @@
 import torch
 from torch.autograd import Function

-import spconv.pytorch as spconv
 #from torch.nn import Module
 from spconv.pytorch.modules import SparseModule
 from spconv.pytorch.core import SparseConvTensor
-from typing import List 
+from typing import List
+

 class JoinTable(SparseModule):  # Module):
    def forward(self, input: List[SparseConvTensor]):
-        output = spconv.SparseConvTensor(
-            torch.cat([i.features for i in input], 1), input[0].indices,
-            input[0].spatial_shape, input[0].batch_size, input[0].grid, input[0].voxel_num,
-            input[0].indice_dict)
+        output = SparseConvTensor(torch.cat([i.features for i in input], 1),
+                                  input[0].indices, input[0].spatial_shape,
+                                  input[0].batch_size, input[0].grid,
+                                  input[0].voxel_num, input[0].indice_dict)
        output.benchmark_record = input[1].benchmark_record
        output.thrust_allocator = input[1].thrust_allocator
        return output
@@ -37,10 +37,10 @@ class JoinTable(SparseModule):  # Module):

 class AddTable(SparseModule):  # Module):
    def forward(self, input: List[SparseConvTensor]):
-        output = spconv.SparseConvTensor(
-            sum([i.features for i in input]), input[0].indices,
-            input[0].spatial_shape, input[0].batch_size, input[0].grid, input[0].voxel_num,
-            input[0].indice_dict)
+        output = SparseConvTensor(sum([i.features for i in input]),
+                                  input[0].indices, input[0].spatial_shape,
+                                  input[0].batch_size, input[0].grid,
+                                  input[0].voxel_num, input[0].indice_dict)
        output.benchmark_record = input[1].benchmark_record
        output.thrust_allocator = input[1].thrust_allocator
        return output

--- a/spconv/pytorch/utils.py
+++ b/spconv/pytorch/utils.py
@@ -82,24 +82,25 @@ class PointToVoxel(object):

                if self.point_indice_data.shape[0] < pc.shape[0]:
                    self.point_indice_data = torch.empty([pc.shape[0]],
-                                                            dtype=torch.int64,
-                                                            device=self.device)
+                                                         dtype=torch.int64,
+                                                         device=self.device)
                pc_tv = torch_tensor_to_tv(pc)
                stream = get_current_stream()
                voxels_tv = torch_tensor_to_tv(self.voxels)
                indices_tv = torch_tensor_to_tv(self.indices)
                num_per_voxel_tv = torch_tensor_to_tv(self.num_per_voxel)
-                hashdata_tv = torch_tensor_to_tv(self.hashdata,
-                                                dtype=tv.custom128,
-                                                shape=[self.hashdata.shape[0]])
-                point_indice_data_tv = torch_tensor_to_tv(self.point_indice_data)
+                hashdata_tv = torch_tensor_to_tv(
+                    self.hashdata,
+                    dtype=tv.custom128,
+                    shape=[self.hashdata.shape[0]])
+                point_indice_data_tv = torch_tensor_to_tv(
+                    self.point_indice_data)

-                res = SpconvOps.point2voxel_cuda(pc_tv, voxels_tv, indices_tv,
-                                                num_per_voxel_tv, hashdata_tv,
-                                                point_indice_data_tv, self.vsize,
-                                                self.grid_size, self.grid_stride,
-                                                self.coors_range, empty_mean,
-                                                clear_voxels, stream)
+                res = SpconvOps.point2voxel_cuda(
+                    pc_tv, voxels_tv, indices_tv, num_per_voxel_tv,
+                    hashdata_tv, point_indice_data_tv, self.vsize,
+                    self.grid_size, self.grid_stride, self.coors_range,
+                    empty_mean, clear_voxels, stream)
                num_voxels = res[0].shape[0]
            else:
                pc_tv = torch_tensor_to_tv(pc)
@@ -111,8 +112,9 @@ class PointToVoxel(object):
                res = SpconvOps.point2voxel_cpu(pc_tv, voxels_tv, indices_tv,
                                                num_per_voxel_tv, hashdata_tv,
                                                self.vsize, self.grid_size,
-                                                self.grid_stride, self.coors_range,
-                                                empty_mean, clear_voxels)
+                                                self.grid_stride,
+                                                self.coors_range, empty_mean,
+                                                clear_voxels)
                num_voxels = res[0].shape[0]

            return (self.voxels[:num_voxels], self.indices[:num_voxels],

--- a/spconv/test_utils.py
+++ b/spconv/test_utils.py
 # Copyright 2021 Yan Yan
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

--- a/spconv/tools.py
+++ b/spconv/tools.py
+# Copyright 2021 Yan Yan
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict
+from spconv.cppconstants import CPU_ONLY_BUILD
+import contextlib
+from spconv.utils import nullcontext
+if not CPU_ONLY_BUILD:
+    from cumm.tensorview import CUDAKernelTimer as _CUDAKernelTimer
+
+
+class CUDAKernelTimer:
+    """Optional wrapper around cumm's ``CUDAKernelTimer``.
+
+    When ``enable`` is false, or the build is CPU-only, no underlying timer
+    is created: ``namespace`` and ``record`` return a no-op context manager
+    and ``get_all_pair_time`` returns an empty dict, so callers can use the
+    same code path whether or not profiling is on.
+    """
+    def __init__(self, enable: bool = True) -> None:
+        # Profiling is forced off in CPU-only builds, where the cumm timer
+        # class is not imported at all.
+        self.enable = enable and not CPU_ONLY_BUILD
+        if self.enable:
+            self._timer = _CUDAKernelTimer(enable)
+        else:
+            self._timer = None
+
+    @contextlib.contextmanager
+    def _namespace(self, name: str):
+        """Push ``name`` on the underlying timer for the with-body.
+
+        The matching ``pop`` runs in a ``finally`` clause, so the scope is
+        left even when the body raises.
+        """
+        assert self._timer is not None
+        self._timer.push(name)
+        try:
+            yield
+        finally:
+            self._timer.pop()
+
+    @contextlib.contextmanager
+    def _record(self, name: str, stream: int = 0):
+        """Enter scope ``name`` and record "start"/"stop" marks on ``stream``.
+
+        The scope is always popped on exit. The "stop" mark is recorded only
+        when the with-body finishes without raising.
+        """
+        assert self._timer is not None
+        self._timer.push(name)
+        try:
+            # NOTE(review): presumably registers "start"/"stop" as a timed
+            # pair under the current scope - confirm against cumm.
+            self._timer.insert_pair("", "start", "stop")
+            self._timer.record("start", stream)
+            yield
+            # Skipped if the with-body raised: the exception surfaces at the
+            # yield above and control jumps straight to the finally clause.
+            self._timer.record("stop", stream)
+        finally:
+            self._timer.pop()
+
+    def namespace(self, name: str):
+        """Return a context manager that scopes timings under ``name``.
+
+        Returns a ``nullcontext`` when the timer is disabled.
+        """
+        if self.enable:
+            return self._namespace(name)
+        else:
+            return nullcontext()
+
+    def record(self, name: str, stream: int = 0):
+        """Return a context manager that times the with-body as ``name``.
+
+        ``stream`` is passed through to the underlying timer's ``record``
+        calls. Returns a ``nullcontext`` when the timer is disabled.
+        """
+        if self.enable:
+            return self._record(name, stream)
+        else:
+            return nullcontext()
+
+    def get_all_pair_time(self) -> Dict[str, float]:
+        """Return the underlying timer's ``get_all_pair_duration()`` result.
+
+        Returns an empty dict when the timer is disabled.
+        NOTE(review): keys look like dot-separated scope paths (see
+        ``collect_by_name``); the time unit is not visible here - confirm
+        against cumm.
+        """
+        if self.enable:
+            assert self._timer is not None
+            return self._timer.get_all_pair_duration()
+        else:
+            return {}
+
+    @staticmethod
+    def collect_by_name(name: str, res: Dict[str, float]):
+        """Return the entries of ``res`` whose key contains ``name``.
+
+        Each key is split on "." and ``name`` must equal one whole
+        component, so "forward" matches "a.forward.b" but not "a.forward2".
+        """
+        filtered_res: Dict[str, float] = {}
+        for k, v in res.items():
+            k_split = k.split(".")
+            if name in k_split:
+                filtered_res[k] = v
+        return filtered_res
--- a/spconv/utils/__init__.py
+++ b/spconv/utils/__init__.py
 # Copyright 2021 Yan Yan
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -13,18 +13,37 @@
 # limitations under the License.

 import numpy as np
-from cumm import tensorview as tv 
+from cumm import tensorview as tv
+from contextlib import AbstractContextManager
+from spconv.cppconstants import CPU_ONLY_BUILD

 from spconv.core_cc.csrc.sparse.all.ops_cpu1d import Point2VoxelCPU as Point2VoxelCPU1d
 from spconv.core_cc.csrc.sparse.all.ops_cpu2d import Point2VoxelCPU as Point2VoxelCPU2d
 from spconv.core_cc.csrc.sparse.all.ops_cpu3d import Point2VoxelCPU as Point2VoxelCPU3d
 from spconv.core_cc.csrc.sparse.all.ops_cpu4d import Point2VoxelCPU as Point2VoxelCPU4d
-import spconv.core_cc.csrc.sparse.all as __all

-IS_CPU_ONLY_BUILD = hasattr(__all, "ops1d")
-
-if IS_CPU_ONLY_BUILD:
+if not CPU_ONLY_BUILD:
    from spconv.core_cc.csrc.sparse.all.ops1d import Point2Voxel as Point2VoxelGPU1d
    from spconv.core_cc.csrc.sparse.all.ops2d import Point2Voxel as Point2VoxelGPU2d
    from spconv.core_cc.csrc.sparse.all.ops3d import Point2Voxel as Point2VoxelGPU3d
    from spconv.core_cc.csrc.sparse.all.ops4d import Point2Voxel as Point2VoxelGPU4d
+
+
+class nullcontext(AbstractContextManager):
+    """No-op context manager.
+
+    Stands in for a real context manager when one is only needed
+    conditionally, so the ``with`` statement itself stays unconditional:
+
+    cm = optional_cm if condition else nullcontext()
+    with cm:
+        # runs under optional_cm when condition is true, otherwise bare
+
+    ``enter_result`` (default ``None``) is the value bound by ``as``.
+    """
+    def __init__(self, enter_result=None):
+        # Value handed back from __enter__.
+        self.enter_result = enter_result
+
+    def __enter__(self):
+        return self.enter_result
+
+    def __exit__(self, *exc_details):
+        # Returning None (falsy) lets any exception from the body propagate.
+        return None
--- a/test/aaa.py
+++ b/test/aaa.py
-# Copyright 2021 Yan Yan
-# 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# 
-#     http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-STR = """
-BWG 0.0008761882781982422
-BWG 0.0008311271667480469
-BWG 0.002079486846923828
-BWG 0.002329587936401367
-BWG 0.0025458335876464844
-BWG 0.0026700496673583984
-BWG 0.002583742141723633
-BWG 0.0025262832641601562
-BWG 0.003481149673461914
-BWG 0.003238201141357422
-BWG 0.005095958709716797
-BWG 0.0037899017333984375
-BWG 0.003931283950805664
-BWG 0.003300189971923828
-"""
-"""
-0.003921985626220703
-0.0049707889556884766
-0.0052530765533447266
-0.0060312747955322266
-0.0036766529083251953
-0.00421142578125
-
-0.002129793167114258
-0.0023038387298583984
-0.0013151168823242188
-0.0015285015106201172
-0.0008392333984375
-0.0008127689361572266
-0.0002486705780029297
-0.00030994415283203125
-"""
-
-STR1 = """
-SUBM 0.0005137920379638672
-F 0.0012662410736083984
-F 0.0016875267028808594
-REGU 0.0009055137634277344
-M 0.0009114742279052734
-SUBM 0.00037789344787597656
-F 0.0020329952239990234
-F 0.001947641372680664
-REGU 0.0009374618530273438
-M 0.00045609474182128906
-SUBM 0.0009856224060058594
-F 0.0009992122650146484
-F 0.0010600090026855469
-REGU 0.0006346702575683594
-M 0.0004057884216308594
-SUBM 0.0006394386291503906
-F 0.0008478164672851562
-F 0.0008838176727294922
-REGU 0.0007183551788330078
-M 0.00025177001953125
-SUBM 0.0009539127349853516
-F 0.0009481906890869141
-F 0.0010502338409423828
-REGU 0.0007147789001464844
-M 0.000274658203125
-SUBM 0.0007004737854003906
-F 0.0009715557098388672
-F 0.0012331008911132812
-REGU 0.0008800029754638672
-M 0.0002167224884033203
-SUBM 0.00045108795166015625
-F 0.0006735324859619141
-F 0.0008375644683837891
-"""
-STR2 = """
-F Turing_f16f16f16f16f16tnt_m32n64k32m32n32k16A0T1688_NS00_C3_01LLL_1 0.0007038116455078125
-F Turing_f16f16f16f16f16tnt_m32n64k32m32n32k16A1T1688_NS00_C3_01LLL_1 0.0007627010345458984
-F Turing_f16f16f16f16f16tnt_m64n128k32m32n64k32A1T1688_NS00_C3_01LLL_1 0.0007650852203369141
-F Turing_f16f16f16f16f16tnt_m64n128k32m32n64k32A1T1688_NS00_C3_01LLL_1 0.0008864402770996094
-F Turing_f16f16f16f16f16tnt_m64n128k32m32n64k32A1T1688_NS00_C3_01LLL_1 0.0004017353057861328
-F Turing_f16f16f16f16f16tnt_m32n128k64m32n32k32A1T1688_NS00_C3_01LLL_1 0.0006165504455566406
-F Turing_f16f16f16f16f16tnt_m64n64k32m32n32k32A1T1688_NS00_C3_01LLL_1 0.0005872249603271484
-F Turing_f16f16f16f16f16tnt_m64n64k32m32n32k32A1T1688_NS00_C3_01LLL_1 0.0006289482116699219
-F Turing_f16f16f16f16f16tnt_m32n64k32m32n32k16A1T1688_NS00_C3_01LLL_1 0.0002968311309814453
-F Turing_f16f16f16f16f16tnt_m64n64k32m32n32k32A1T1688_NS00_C3_01LLL_1 0.0003299713134765625
-F Turing_f16f16f16f16f16tnt_m64n128k64m32n64k32A1T1688_NS00_C3_01LLL_1 0.0002288818359375
-F Turing_f16f16f16f16f16tnt_m32n64k32m32n32k16A1T1688_NS00_C3_01LLL_1 0.0002830028533935547
-F Turing_f16f16f16f16f16tnt_m32n64k32m32n32k16A1T1688_NS00_C3_01LLL_1 0.0001780986785888672
-F Turing_f16f16f16f16f16tnt_m32n64k32m32n32k16A1T1688_NS00_C3_01LLL_1 0.0003058910369873047
-"""
-def _handle_lines(s: str):
-    arr = s.split(" ")
-    return (arr[0], float(arr[-1]))
-from cumm.gemm.codeops import group_by
-def print_str(s: str):
-
-    nums = list(map(_handle_lines, s.strip().split("\n")))
-    num_dict = group_by(lambda x: x[0], nums)
-    num_dict_ = {k: sum([vv[1] for vv in v]) for k, v in num_dict.items()}
-    print(num_dict_)
-
-print_str(STR1)
-print_str(STR2)
\ No newline at end of file
--- a/test/benchmark.py
+++ b/test/benchmark.py
 # Copyright 2021 Yan Yan
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -19,10 +19,12 @@ import numpy as np
 import torch
 from torch import nn
 from cumm import tensorview as tv
-from spconv.core import ConvAlgo 
+from spconv.core import ConvAlgo

 import spconv.pytorch as spconv
 from spconv.utils import Point2VoxelCPU3d
+
+
 def waymo_data(batch_size=1):
    gen = Point2VoxelCPU3d([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6], 3,
                           150000, 1)
@@ -42,7 +44,7 @@ def waymo_data(batch_size=1):
 class Net(nn.Module):
    def __init__(self, shape, algo):
        super().__init__()
-        pool_algo = algo 
+        pool_algo = algo
        # pool_algo = ConvAlgo.Native
        self.net = spconv.SparseSequential(
            spconv.SubMConv3d(3, 64, 3, bias=False, indice_key="c0",
@@ -68,7 +70,6 @@ class Net(nn.Module):
            # nn.BatchNorm1d(32),
            # nn.ReLU(),
            # spconv.SparseConv3d(64, 64, 2, 2, bias=False, indice_key="m0"),
-
            spconv.SparseMaxPool3d(2, 2, algo=pool_algo),
            spconv.SubMConv3d(64,
                              96,
@@ -101,7 +102,6 @@ class Net(nn.Module):
            # nn.BatchNorm1d(128),
            # nn.ReLU(),
            # spconv.SparseConv3d(128, 128, 2, 2, bias=False, indice_key="m2"),
-
            spconv.SparseMaxPool3d(2, 2, algo=pool_algo),
            spconv.SubMConv3d(128,
                              160,
@@ -118,7 +118,6 @@ class Net(nn.Module):
            # nn.BatchNorm1d(128),
            # nn.ReLU(),
            # spconv.SparseConv3d(160, 160, 2, 2, bias=False, indice_key="m3"),
-
            spconv.SparseMaxPool3d(2, 2, algo=pool_algo),
            spconv.SubMConv3d(160,
                              192,
@@ -136,7 +135,6 @@ class Net(nn.Module):
            # nn.ReLU(),
            spconv.SparseMaxPool3d(2, 2, indice_key="m4", algo=pool_algo),
            # spconv.SparseConv3d(192, 192, 2, 2, bias=False, indice_key="m4"),
-
            spconv.SubMConv3d(192,
                              224,
                              3,
@@ -174,7 +172,6 @@ class Net(nn.Module):
            # # nn.ReLU(),

            # spconv.SparseInverseConv3d(128, 64, 2, indice_key="m4", bias=False, algo=algo),
-
        )
        max_batch_size = 1
        # grid (dense map) is used for indice generation. use pre-allocated grid can run faster.
@@ -183,16 +180,25 @@ class Net(nn.Module):
        # self.grid = None
        self.shape = shape

-    def forward(self, features, coors, batch_size):
-        x = spconv.SparseConvTensor(features, coors, self.shape, batch_size,
-                                    self.grid)
+    def forward(self, features, coors, batch_size, enable_timer: bool = False):
+        """Wrap the inputs in a ``SparseConvTensor`` and run ``self.net`` on it.
+
+        The tensor is built from ``features``, ``coors``, ``self.shape``,
+        ``batch_size`` and ``self.grid``; ``enable_timer`` is forwarded to
+        the ``SparseConvTensor`` constructor.
+        """
+        x = spconv.SparseConvTensor(features,
+                                    coors,
+                                    self.shape,
+                                    batch_size,
+                                    self.grid,
+                                    enable_timer=enable_timer)
         return self.net(x)

+
 class Net2(nn.Module):
    def __init__(self, shape, algo):
        super().__init__()
        self.net = spconv.SparseSequential(
-            spconv.SubMConv3d(3, 128, 3, bias=False, indice_key="c0",
+            spconv.SubMConv3d(3,
+                              128,
+                              3,
+                              bias=False,
+                              indice_key="c0",
                              algo=algo),
            # spconv.SubMConv3d(32,
            #                   32,
@@ -240,20 +246,22 @@ class Net2(nn.Module):
                                    self.grid)
        return self.net(x)

-import numpy as np 
-from cumm import tensorview as tv 
+
+import numpy as np
+from cumm import tensorview as tv
 from spconv.core_cc.csrc.sparse.all import SpconvOps
-import pickle 
+import pickle
 import torch

-from spconv.pytorch.cppcore import torch_tensor_to_tv 
+from spconv.pytorch.cppcore import torch_tensor_to_tv
+

 def sort_bench():
    with open("/home/yy/asd.pkl", "rb") as f:
        a_th = pickle.load(f)
    mask_argsort = torch.empty((1, a_th.shape[1]),
-                                dtype=torch.int32,
-                                device=a_th.device)
+                               dtype=torch.int32,
+                               device=a_th.device)

    a = a_th.cpu().numpy()[0]
    a_tv = torch_tensor_to_tv(a_th)
@@ -262,8 +270,9 @@ def sort_bench():
        a_tv_1 = a_tv.clone()
        SpconvOps.sort_1d_by_key(a_tv_1[0], mask_argsort_tv[0])

+
 def main():
-    import pickle 
+    import pickle
    np.random.seed(50051)
    torch.manual_seed(50051)
    # voxels, coors, spatial_shape = waymo_data()
@@ -280,24 +289,55 @@ def main():
    voxels_th = torch.from_numpy(voxels).to(device).to(dtype)
    coors_th = torch.from_numpy(coors).to(device).int()
    voxels_th.requires_grad = True
-    algo = spconv.ConvAlgo.MaskImplicitGemm
+    algo = spconv.ConvAlgo.Native
+    # 3080 Laptop
+    # MaskImpGemm: 11.2ms
+    # MaskSplitImpGemm: 12.2ms
+    # Native: 13.7ms
+    # F32
+    # MaskSplitImpGemm: 22ms
+    # MaskImplicitGemm: 23.5ms
+    # Native: 21.7ms
+    # Pure Gemm
+    # Native: 6.6ms
+    # MaskImpGemm: 4.3ms
+    # MaskSplitImpGemm: 4.0ms
+    # F16 Bwd
+    # MaskSplitImpGemm: 12.2ms
+    # MaskImpGemm: 13.8ms
+    # Native: 25.2ms
+
+    # F32 Bwd
+    # Native: 41.9ms
+    # MaskImpGemm: 51.0ms
+    # MaskSplitImpGemm: 41.1ms
+    # algo = None
    net = Net(spatial_shape, algo).to(device).eval().to(dtype).train()
+    spconv.assign_name_for_sparse_modules(net)
    print(coors_th.shape)
    out = net(voxels_th, coors_th, 1)
    print(out.spatial_shape)
-    print(voxels.mean(),  voxels.max(), voxels.min())
-    dout = np.random.uniform(-0.2, 0.2,
-                                out.features.shape).astype(np.float32)
+    print(voxels.mean(), voxels.max(), voxels.min())
+    dout = np.random.uniform(-0.2, 0.2, out.features.shape).astype(np.float32)
    dout_t = torch.from_numpy(dout).to(device).to(dtype)

-    print(out.spatial_shape, out.features.mean(),  out.features.max(),  out.features.min())
+    print(out.spatial_shape, out.features.mean(), out.features.max(),
+          out.features.min())
    times = []
    with torch.no_grad():
        for i in range(20):
            print("------------")
            torch.cuda.synchronize()
            t = time.time()
-            out_nograd = net(voxels_th, coors_th, 1)
+            out_nograd = net(voxels_th, coors_th, 1, True)
+            timer = out_nograd._timer
+            res = timer.collect_by_name("forward", timer.get_all_pair_time())
+            res2 = timer.collect_by_name("forward0", timer.get_all_pair_time())
+
+            print(sum(res.values()) + sum(res2.values()))
+            # print(timer.get_all_pair_time())
+
+            # print(sum(timer.get_all_pair_time().values()))
            torch.cuda.synchronize()
            # sort_bench()
            times.append(time.time() - t)
@@ -313,8 +353,8 @@ def main():
    #     torch.cuda.synchronize()
    #     times.append(time.time() - t)

-    # print((net.grid == -1).float().sum(), net.grid.numel())
-    # print("spconv time", time.time() - t)
+    # # # print((net.grid == -1).float().sum(), net.grid.numel())
+    # # # print("spconv time", time.time() - t)
    # print("spconv bw time", np.mean(times[5:]))



--- a/test/test_conv.py
+++ b/test/test_conv.py
 # Copyright 2021 Yan Yan
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -30,6 +30,7 @@ from spconv.constants import FILTER_HWIO
 torch.backends.cuda.matmul.allow_tf32 = False
 torch.backends.cudnn.allow_tf32 = False

+
 class SparseConv3dTestTorch(nn.Module):
    def __init__(self,
                 num_layers,
@@ -363,7 +364,10 @@ class TestSpConv(TestCase):
        strides = [1, 2, 3]
        paddings = [0, 1, 2]
        dilations = [1, 2, 3]
-        algos = [ConvAlgo.Native, ConvAlgo.MaskImplicitGemm, ConvAlgo.MaskSplitImplicitGemm]
+        algos = [
+            ConvAlgo.Native, ConvAlgo.MaskImplicitGemm,
+            ConvAlgo.MaskSplitImplicitGemm
+        ]
        algos = [ConvAlgo.MaskSplitImplicitGemm]

        for dev, shape, bs, IC, OC, k, s, p, d, al in params_grid(
@@ -375,8 +379,16 @@ class TestSpConv(TestCase):
            device = torch.device(dev)
            num_points = [1000] * bs
            dtype = torch.float32
-            net = SparseConv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
-                                        d, algo=al).to(device).to(dtype)
+            net = SparseConv3dTestTorch(1,
+                                        3,
+                                        shape,
+                                        IC,
+                                        OC,
+                                        k,
+                                        s,
+                                        p,
+                                        d,
+                                        algo=al).to(device).to(dtype)
            net_ref = Conv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
                                      d).to(device).to(dtype)

@@ -390,27 +402,32 @@ class TestSpConv(TestCase):
            indices_t = torch.from_numpy(indices).int().to(device)
            features_t = torch.from_numpy(features).to(device).to(dtype)
            features_t.requires_grad = True
-            features_dense_t = torch.from_numpy(features_dense).to(device).to(dtype)
+            features_dense_t = torch.from_numpy(features_dense).to(device).to(
+                dtype)
            features_dense_t.requires_grad = True
            if net.algo == ConvAlgo.Native:
                if FILTER_HWIO:
-                    filters = np.random.uniform(-1, 1, size=[k, k, k, IC,
-                                                            OC]).astype(np.float32)
+                    filters = np.random.uniform(-1, 1,
+                                                size=[k, k, k, IC,
+                                                      OC]).astype(np.float32)
                else:
-                    filters = np.random.uniform(-1, 1, size=[k, k, k, OC,
-                                                            IC]).astype(np.float32)
+                    filters = np.random.uniform(-1, 1,
+                                                size=[k, k, k, OC,
+                                                      IC]).astype(np.float32)
                filters_t = torch.from_numpy(filters).to(device).to(dtype)
                if FILTER_HWIO:
-                    net_ref.net[0].weight.data[:] = filters_t.permute(4, 3, 0, 1,
-                                                                    2).contiguous()
+                    net_ref.net[0].weight.data[:] = filters_t.permute(
+                        4, 3, 0, 1, 2).contiguous()
                else:
-                    net_ref.net[0].weight.data[:] = filters_t.permute(3, 4, 0, 1,
-                                                                    2).contiguous()
+                    net_ref.net[0].weight.data[:] = filters_t.permute(
+                        3, 4, 0, 1, 2).contiguous()
            else:
-                filters = np.random.uniform(-1, 1, size=[OC, k, k, k, IC]).astype(np.float32)
+                filters = np.random.uniform(-1, 1,
+                                            size=[OC, k, k, k,
+                                                  IC]).astype(np.float32)
                filters_t = torch.from_numpy(filters).to(device).to(dtype)
-                net_ref.net[0].weight.data[:] = filters_t.permute(0, 4, 1, 2,
-                                                                3).contiguous()
+                net_ref.net[0].weight.data[:] = filters_t.permute(
+                    0, 4, 1, 2, 3).contiguous()

            net.net[0].weight.data[:] = filters_t
            out_ref = net_ref(features_dense_t)
@@ -446,7 +463,6 @@ class TestSpConv(TestCase):
                self.assertAllClose(dw, dw_ref, atol=1e-4)
            self.assertAllClose(din_np, din_sparse_np, atol=1e-4)

-
    def testSpDeConv3d(self):
        np.random.seed(484)
        devices = ["cuda:0"]
@@ -499,11 +515,11 @@ class TestSpConv(TestCase):
            filters_t = torch.from_numpy(filters).to(device)
            print(net_ref.net[0].weight.shape)
            if FILTER_HWIO:
-                net_ref.net[0].weight.data[:] = filters_t.permute(3, 4, 0, 1,
-                                                                2).contiguous()
+                net_ref.net[0].weight.data[:] = filters_t.permute(
+                    3, 4, 0, 1, 2).contiguous()
            else:
-                net_ref.net[0].weight.data[:] = filters_t.permute(4, 3, 0, 1,
-                                                                2).contiguous()
+                net_ref.net[0].weight.data[:] = filters_t.permute(
+                    4, 3, 0, 1, 2).contiguous()
            net.net[0].weight.data[:] = filters_t
            out_ref = net_ref(features_dense_t)
            out = net(features_t, indices_t, bs).dense()
@@ -532,7 +548,6 @@ class TestSpConv(TestCase):
                    dw = dw.transpose(4, 3, 0, 1, 2)
                self.assertAllClose(dw, dw_ref, atol=1e-4)

-
    def testSpCpConv3d(self):
        np.random.seed(484)
        devices = ["cuda:0", "cpu:0"]

--- a/version.txt
+++ b/version.txt
-2.1.3
\ No newline at end of file
+2.1.5
\ No newline at end of file