Merge branch 'develop'

a6abf55d · yan.yan · fad30002 · 79a3eaf2 · a6abf55d · a6abf55d
Commit a6abf55d authored Oct 20, 2021 by yan.yan
20 changed files
--- a/spconv/build.py
+++ b/spconv/build.py
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from pathlib import Path
+import pccm
+from pccm.utils import project_is_editable, project_is_installed
+from .constants import PACKAGE_NAME, PACKAGE_ROOT
+# Build the extension only when pccm reports the package as both installed and
+# editable; in every other case importing this module does nothing.
+if project_is_installed(PACKAGE_NAME) and project_is_editable(PACKAGE_NAME):
+    # Imported inside the guard so these modules are only loaded when a build
+    # is actually going to run.
+    from spconv.core import SHUFFLE_SIMT_PARAMS, SHUFFLE_VOLTA_PARAMS, SHUFFLE_TURING_PARAMS
+    from cumm.gemm.main import GemmMainUnitTest
+    from spconv.csrc.sparse.all import SpconvOps
+    # A single GEMM code unit built from the concatenated SIMT, Volta and
+    # Turing parameter lists.
+    cu = GemmMainUnitTest(SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS + SHUFFLE_TURING_PARAMS)
+    # NOTE(review): presumably pins the generated module path to
+    # core_cc/cumm/gemm/main — confirm against pccm's namespace handling.
+    cu.namespace = "cumm.gemm.main"
+    # Build both code units into PACKAGE_ROOT / "core_cc"; load_library=False
+    # is passed, so the built library is not loaded by this call.
+    pccm.builder.build_pybind([cu, SpconvOps()],
+                              PACKAGE_ROOT / "core_cc",
+                              namespace_root=PACKAGE_ROOT,
+                              objects_folder="objects",
+                              load_library=False)
--- a/spconv/constants.py
+++ b/spconv/constants.py
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from pathlib import Path
+from typing import List
+from pccm.utils import project_is_editable, project_is_installed
+# Package name handed to the pccm install/editable checks below.
+PACKAGE_NAME = "spconv"
+# Absolute, resolved directory that contains this file.
+PACKAGE_ROOT = Path(__file__).parent.resolve()
+# True only when pccm reports the package as both installed and editable.
+EDITABLE_INSTALLED = project_is_installed(PACKAGE_NAME) and project_is_editable(PACKAGE_NAME)
+# Raw value of the SPCONV_FILTER_HWIO environment variable ("0" when unset).
+_filter_hwio_env = os.getenv("SPCONV_FILTER_HWIO", "0")
+# Enabled only by the exact string "1"; any other value leaves it False.
+FILTER_HWIO = _filter_hwio_env == "1"
\ No newline at end of file
--- a/spconv/core.py
+++ b/spconv/core.py
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from enum import Enum
+from cumm.gemm.main import gen_shuffle_params, GemmAlgoParams
+from cumm.gemm import kernel
+from typing import List
+from cumm.gemm.algospec.core import TensorOpParams
+class ConvAlgo(Enum):
+    """Names of the available convolution algorithms.
+
+    Each member's value is the same string as its name.
+    """
+    Native = "Native"
+    MaskImplicitGemm = "MaskImplicitGemm"
+    MaskSplitImplicitGemm = "MaskSplitImplicitGemm"
+class AlgoHint(Enum):
+    """Algorithm hint values.
+
+    ``NoHint`` is 0 and every other member occupies its own bit
+    (0b001, 0b010, 0b100).
+    """
+    NoHint = 0b000
+    # Historical misspelling of "Forward". It stays the canonical member so
+    # existing callers, iteration order and lookups by value are unchanged.
+    Fowrard = 0b001
+    # Correctly spelled alias: it repeats the value above, so Enum treats it
+    # as a second name for the same member (AlgoHint.Forward is
+    # AlgoHint.Fowrard) and it does not appear when iterating the enum.
+    Forward = 0b001
+    BackwardInput = 0b010
+    BackwardWeight = 0b100
+# We can't add more kernels here because the build in GitHub Actions is very slow.
+# TODO: two-step build: build the gemm kernels first, then bind them for each Python version.
+# Kernel parameter list for the SIMT algorithms (kernel.GemmAlgo.Simt and
+# kernel.GemmAlgo.SimtDP4A). Every entry passes None as the last argument.
+# Each gen_shuffle_params(...) result is unpacked in place with `*`, so the
+# final list is flat and keeps the order written here.
+# NOTE(review): the positional arguments look like (tile shape, warp tile
+# shape, dtype spec strings, stage count, algorithm, tensor-op params) —
+# confirm against cumm.gemm.main.gen_shuffle_params.
+SHUFFLE_SIMT_PARAMS: List[GemmAlgoParams] = [
+    # --- s8 entries (SimtDP4A) ---
+    *gen_shuffle_params(
+        (64, 128, 32), (32, 64, 32), ["s8,s8,s32,s32,s32"],
+        2, kernel.GemmAlgo.SimtDP4A, None),
+    *gen_shuffle_params(
+        (128, 64, 32), (64, 32, 32), ["s8,s8,s32,s32,s32"],
+        2, kernel.GemmAlgo.SimtDP4A, None),
+    *gen_shuffle_params(
+        (128, 128, 32),
+        (32, 64, 32), ["s8,s8,s32,s32,s32"], 2,
+        kernel.GemmAlgo.SimtDP4A, None),
+    # The only s8 entry that also lists the "s8,s8,s8,s32,s32" dtype spec.
+    *gen_shuffle_params(
+        (128, 128, 32),
+        (64, 32, 32), ["s8,s8,s8,s32,s32", "s8,s8,s32,s32,s32"], 2,
+        kernel.GemmAlgo.SimtDP4A, None),
+    *gen_shuffle_params(
+        (64, 64, 32), (32, 32, 32), ["s8,s8,s32,s32,s32"],
+        2, kernel.GemmAlgo.SimtDP4A, None),
+    # --- f32 entries (Simt) ---
+    *gen_shuffle_params(
+        (64, 256, 8),
+        (32, 64, 8), ["f32,f32,f32,f32,f32"], 2, kernel.GemmAlgo.Simt, None),
+    # *gen_shuffle_params(
+    #     (64, 256, 8),
+    #     (64, 32, 8), ["f32,f32,f32,f32,f32"], 2, kernel.GemmAlgo.Simt, None),
+    *gen_shuffle_params(
+        (32, 128, 16),
+        (32, 32, 8), ["f32,f32,f32,f32,f32"], 2, kernel.GemmAlgo.Simt, None),
+    *gen_shuffle_params(
+        (32, 512, 8),
+        (32, 64, 8), ["f32,f32,f32,f32,f32"], 2, kernel.GemmAlgo.Simt, None),
+    # *gen_shuffle_params(
+    #     (128, 128, 8),
+    #     (64, 32, 8), ["f32,f32,f32,f32,f32"], 2, kernel.GemmAlgo.Simt, None),
+    *gen_shuffle_params(
+        (128, 128, 8),
+        (32, 64, 8), ["f32,f32,f32,f32,f32"], 2, kernel.GemmAlgo.Simt, None),
+    *gen_shuffle_params(
+        (64, 128, 8),
+        (32, 64, 8), ["f32,f32,f32,f32,f32"], 2, kernel.GemmAlgo.Simt, None),
+    # *gen_shuffle_params(
+    #     (64, 128, 8),
+    #     (64, 32, 8), ["f32,f32,f32,f32,f32"], 2, kernel.GemmAlgo.Simt, None),
+    # *gen_shuffle_params(
+    #     (128, 64, 8),
+    #     (32, 64, 8), ["f32,f32,f32,f32,f32"], 2, kernel.GemmAlgo.Simt, None),
+    *gen_shuffle_params(
+        (128, 64, 8),
+        (64, 32, 8), ["f32,f32,f32,f32,f32"], 2, kernel.GemmAlgo.Simt, None),
+    *gen_shuffle_params(
+        (64, 64, 8),
+        (32, 32, 8), ["f32,f32,f32,f32,f32"], 2, kernel.GemmAlgo.Simt, None),
+    *gen_shuffle_params(
+        (32, 64, 16),
+        (32, 32, 8), ["f32,f32,f32,f32,f32"], 2, kernel.GemmAlgo.Simt, None),
+    *gen_shuffle_params(
+        (64, 32, 16),
+        (32, 32, 8), ["f32,f32,f32,f32,f32"], 2, kernel.GemmAlgo.Simt, None),
+    *gen_shuffle_params(
+        (32, 32, 32),
+        (32, 32, 8), ["f32,f32,f32,f32,f32"], 2, kernel.GemmAlgo.Simt, None),
+    # --- f16 entries (Simt) ---
+    # Fallback kernels for half precision when the matrix is misaligned.
+    # *gen_shuffle_params(
+    #     (128, 128, 8),
+    #     (32, 64, 8), ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"], 2, kernel.GemmAlgo.Simt, None),
+    *gen_shuffle_params(
+        (32, 64, 32),
+        (32, 32, 8), ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"], 2, kernel.GemmAlgo.Simt, None),
+    *gen_shuffle_params(
+        (32, 32, 32),
+        (32, 32, 8), ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"], 2, kernel.GemmAlgo.Simt, None),
+    # *gen_shuffle_params(
+    #     (64, 64, 16),
+    #     (32, 32, 8), ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"], 2, kernel.GemmAlgo.Simt, None),
+    *gen_shuffle_params(
+        (64, 128, 16),
+        (32, 64, 8), ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"], 2, kernel.GemmAlgo.Simt, None),
+    *gen_shuffle_params(
+        (64, 64, 8),
+        (32, 32, 8), ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"], 2, kernel.GemmAlgo.Simt, None),
+]
+# Kernel parameter list for kernel.GemmAlgo.Volta. Every entry uses the two
+# f16 dtype specs and TensorOpParams((8, 8, 4)); results are unpacked in place
+# with `*`, so the final list is flat and keeps the order written here.
+# NOTE(review): the positional arguments look like (tile shape, warp tile
+# shape, dtype spec strings, stage count, algorithm, tensor-op params) —
+# confirm against cumm.gemm.main.gen_shuffle_params.
+SHUFFLE_VOLTA_PARAMS: List[GemmAlgoParams] = [
+    *gen_shuffle_params(
+        (64, 64, 32),
+        (32, 32, 32), ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"], 2,
+        kernel.GemmAlgo.Volta, TensorOpParams((8, 8, 4))),
+    # *gen_shuffle_params(
+    #     (128, 128, 32),
+    #     (64, 64, 32), ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"], 2,
+    #     kernel.GemmAlgo.Volta, TensorOpParams((8, 8, 4))),
+    *gen_shuffle_params(
+        (128, 256, 32),
+        (64, 64, 32), ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"], 2,
+        kernel.GemmAlgo.Volta, TensorOpParams((8, 8, 4))),
+    *gen_shuffle_params(
+        (256, 128, 32),
+        (64, 64, 32), ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"], 2,
+        kernel.GemmAlgo.Volta, TensorOpParams((8, 8, 4))),
+    *gen_shuffle_params(
+        (128, 64, 32),
+        (64, 32, 32), ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"], 2,
+        kernel.GemmAlgo.Volta, TensorOpParams((8, 8, 4))),
+    *gen_shuffle_params(
+        (64, 128, 32),
+        (32, 64, 32), ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"], 2,
+        kernel.GemmAlgo.Volta, TensorOpParams((8, 8, 4))),
+]
+# SHUFFLE_VOLTA_PARAMS = []
+# Kernel parameter list for kernel.GemmAlgo.Turing. The f16 entries use
+# TensorOpParams((16, 8, 8)) and the s8 entries use TensorOpParams((8, 8, 16)).
+# Results are unpacked in place with `*`, so the final list is flat and keeps
+# the order written here.
+# NOTE(review): the positional arguments look like (tile shape, warp tile
+# shape, dtype spec strings, stage count, algorithm, tensor-op params) —
+# confirm against cumm.gemm.main.gen_shuffle_params.
+SHUFFLE_TURING_PARAMS: List[GemmAlgoParams] = [
+    # --- f16 entries ---
+    *gen_shuffle_params(
+        (64, 64, 32),
+        (32, 32, 32), ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"], 2,
+        kernel.GemmAlgo.Turing, TensorOpParams((16, 8, 8))),
+    *gen_shuffle_params(
+        (128, 128, 32),
+        (32, 64, 32), ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"], 2,
+        kernel.GemmAlgo.Turing, TensorOpParams((16, 8, 8))),
+    # *gen_shuffle_params(
+    #     (128, 128, 32),
+    #     (64, 32, 32), ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"], 2,
+    #     kernel.GemmAlgo.Turing, TensorOpParams((16, 8, 8))),
+    *gen_shuffle_params(
+        (64, 64, 64),
+        (32, 32, 32), ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"], 2,
+        kernel.GemmAlgo.Turing, TensorOpParams((16, 8, 8))),
+    *gen_shuffle_params(
+        (64, 128, 64),
+        (32, 64, 32), ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"], 2,
+        kernel.GemmAlgo.Turing, TensorOpParams((16, 8, 8))),
+    *gen_shuffle_params(
+        (128, 256, 32),
+        (64, 64, 32), ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"], 2,
+        kernel.GemmAlgo.Turing, TensorOpParams((16, 8, 8))),
+    *gen_shuffle_params(
+        (256, 128, 32),
+        (64, 64, 32), ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"], 2,
+        kernel.GemmAlgo.Turing, TensorOpParams((16, 8, 8))),
+    *gen_shuffle_params(
+        (128, 64, 32),
+        (64, 32, 32), ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"], 2,
+        kernel.GemmAlgo.Turing, TensorOpParams((16, 8, 8))),
+    *gen_shuffle_params(
+        (64, 128, 32),
+        (32, 64, 32), ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"], 2,
+        kernel.GemmAlgo.Turing, TensorOpParams((16, 8, 8))),
+    # --- s8 entries ---
+    *gen_shuffle_params(
+        (64, 64, 32), (32, 32, 32), ["s8,s8,s32,s32,s32"],
+        2, kernel.GemmAlgo.Turing, TensorOpParams((8, 8, 16))),
+    *gen_shuffle_params(
+        (128, 128, 32),
+        (32, 64, 32), ["s8,s8,s32,s32,s32"], 2,
+        kernel.GemmAlgo.Turing, TensorOpParams((8, 8, 16))),
+    # *gen_shuffle_params(
+    #     (128, 128, 32),
+    #     (64, 32, 32), ["s8,s8,s8,s32,s32", "s8,s8,s32,s32,s32"], 2,
+    #     kernel.GemmAlgo.Turing, TensorOpParams((8, 8, 16))),
+    *gen_shuffle_params(
+        (128, 256, 32),
+        (64, 64, 32), ["s8,s8,s32,s32,s32"], 2,
+        kernel.GemmAlgo.Turing, TensorOpParams((8, 8, 16))),
+    *gen_shuffle_params(
+        (256, 128, 32),
+        (64, 64, 32), ["s8,s8,s32,s32,s32"], 2,
+        kernel.GemmAlgo.Turing, TensorOpParams((8, 8, 16))),
+    *gen_shuffle_params(
+        (128, 64, 32), (64, 32, 32), ["s8,s8,s32,s32,s32"],
+        2, kernel.GemmAlgo.Turing, TensorOpParams((8, 8, 16))),
+    *gen_shuffle_params(
+        (64, 128, 32), (32, 64, 32), ["s8,s8,s32,s32,s32"],
+        2, kernel.GemmAlgo.Turing, TensorOpParams((8, 8, 16))),
+]
--- a/spconv/core_cc/__init__.pyi
+++ b/spconv/core_cc/__init__.pyi
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/spconv/core_cc/csrc/__init__.pyi
+++ b/spconv/core_cc/csrc/__init__.pyi
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/spconv/core_cc/csrc/sparse/__init__.pyi
+++ b/spconv/core_cc/csrc/sparse/__init__.pyi
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/spconv/core_cc/csrc/sparse/all/__init__.pyi
+++ b/spconv/core_cc/csrc/sparse/all/__init__.pyi
+from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
+from pccm.stubs import EnumValue, EnumClassValue
+from cumm.tensorview import Tensor
+class SpconvOps:
+    # Type stub for the SpconvOps binding: every member is a static method with
+    # a placeholder body, so only names, types and defaults are visible here.
+    @staticmethod
+    def generate_conv_inds_stage1(indices: Tensor, indice_pairs: Tensor, indice_pairs_uniq: Tensor, indice_num_per_loc: Tensor, batch_size: int, output_dims: List[int], input_dims: List[int], ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], transposed: bool = False, stream_int: int = 0) -> None: 
+        """
+        Stage 1 of convolution index generation; returns nothing.
+
+        NOTE(review): descriptions are inferred from parameter names only —
+        confirm against the native implementation.
+
+        Args:
+            indices: presumably coordinates of the active input locations.
+            indice_pairs: presumably the buffer receiving index pairs.
+            indice_pairs_uniq: presumably a buffer of candidate output
+                locations to be made unique in a later stage.
+            indice_num_per_loc: presumably per-kernel-location pair counts.
+            batch_size: presumably the number of samples in the batch.
+            output_dims: presumably the output spatial shape.
+            input_dims: presumably the input spatial shape.
+            ksize: presumably the kernel size per spatial dimension.
+            stride: presumably the stride per spatial dimension.
+            padding: presumably the padding per spatial dimension.
+            dilation: presumably the dilation per spatial dimension.
+            transposed: defaults to False; presumably selects transposed
+                convolution.
+            stream_int: defaults to 0; presumably a stream handle passed as
+                an integer.
+        """
+        ...
+    @staticmethod
+    def generate_conv_inds_stage1_5(indice_pairs_uniq: Tensor, ndim: int, uniq_size: int, stream_int: int = 0) -> int: 
+        """
+        Stage 1.5 of convolution index generation; returns an int.
+
+        NOTE(review): descriptions are inferred from parameter names only —
+        confirm against the native implementation, including what the
+        returned int means.
+
+        Args:
+            indice_pairs_uniq: presumably the candidate output locations
+                produced by stage 1.
+            ndim: presumably the number of spatial dimensions.
+            uniq_size: presumably the number of entries to process.
+            stream_int: defaults to 0; presumably a stream handle passed as
+                an integer.
+        """
+        ...
+    @staticmethod
+    def generate_conv_inds_stage2(indices: Tensor, hashdata: Tensor, indice_pairs: Tensor, indice_pairs_uniq: Tensor, out_inds: Tensor, num_out_act: int, batch_size: int, output_dims: List[int], input_dims: List[int], ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], transposed: bool = False, stream_int: int = 0) -> int: 
+        """
+        Stage 2 of convolution index generation; returns an int.
+
+        NOTE(review): descriptions are inferred from parameter names only —
+        confirm against the native implementation, including what the
+        returned int means.
+
+        Args:
+            indices: presumably coordinates of the active input locations.
+            hashdata: presumably scratch storage for a hash table.
+            indice_pairs: presumably the index-pair buffer from stage 1.
+            indice_pairs_uniq: presumably the unique output locations.
+            out_inds: presumably the buffer receiving output coordinates.
+            num_out_act: presumably the number of active output locations.
+            batch_size: presumably the number of samples in the batch.
+            output_dims: presumably the output spatial shape.
+            input_dims: presumably the input spatial shape.
+            ksize: presumably the kernel size per spatial dimension.
+            stride: presumably the stride per spatial dimension.
+            padding: presumably the padding per spatial dimension.
+            dilation: presumably the dilation per spatial dimension.
+            transposed: defaults to False; presumably selects transposed
+                convolution.
+            stream_int: defaults to 0; presumably a stream handle passed as
+                an integer.
+        """
+        ...
+    @staticmethod
+    def generate_subm_conv_inds(indices: Tensor, hashdata: Tensor, indice_pairs: Tensor, out_inds: Tensor, indice_num_per_loc: Tensor, batch_size: int, input_dims: List[int], ksize: List[int], dilation: List[int], indice_pair_mask: Tensor =  Tensor(), backward: bool = False, stream_int: int =  0) -> int: 
+        """
+        Index generation for the "subm" convolution variant; returns an int.
+
+        NOTE(review): descriptions are inferred from parameter names only —
+        confirm against the native implementation, including what the
+        returned int means.
+
+        Args:
+            indices: presumably coordinates of the active input locations.
+            hashdata: presumably scratch storage for a hash table.
+            indice_pairs: presumably the buffer receiving index pairs.
+            out_inds: presumably the buffer receiving output coordinates.
+            indice_num_per_loc: presumably per-kernel-location pair counts.
+            batch_size: presumably the number of samples in the batch.
+            input_dims: presumably the input spatial shape.
+            ksize: presumably the kernel size per spatial dimension.
+            dilation: presumably the dilation per spatial dimension.
+            indice_pair_mask: defaults to a default-constructed Tensor;
+                presumably an optional mask buffer.
+            backward: defaults to False.
+            stream_int: defaults to 0; presumably a stream handle passed as
+                an integer.
+        """
+        ...
+    @staticmethod
+    def maxpool_forward(out: Tensor, inp: Tensor, out_inds: Tensor, in_inds: Tensor, stream: int = 0) -> None: 
+        """
+        Max-pool forward pass; returns nothing.
+
+        NOTE(review): descriptions are inferred from parameter names only —
+        confirm against the native implementation.
+
+        Args:
+            out: presumably the output features, written by the call.
+            inp: presumably the input features.
+            out_inds: presumably output-side indices of the pooling pairs.
+            in_inds: presumably input-side indices of the pooling pairs.
+            stream: defaults to 0; presumably a stream handle.
+        """
+        ...
+    @staticmethod
+    def maxpool_backward(out: Tensor, inp: Tensor, dout: Tensor, dinp: Tensor, out_inds: Tensor, in_inds: Tensor, stream: int = 0) -> None: 
+        """
+        Max-pool backward pass; returns nothing.
+
+        NOTE(review): descriptions are inferred from parameter names only —
+        confirm against the native implementation.
+
+        Args:
+            out: presumably the forward output features.
+            inp: presumably the forward input features.
+            dout: presumably the gradient with respect to ``out``.
+            dinp: presumably the gradient with respect to ``inp``, written
+                by the call.
+            out_inds: presumably output-side indices of the pooling pairs.
+            in_inds: presumably input-side indices of the pooling pairs.
+            stream: defaults to 0; presumably a stream handle.
+        """
+        ...
+    @staticmethod
+    def sort_1d_by_key(data: Tensor) -> Tensor: 
+        """
+        Takes one Tensor and returns a Tensor.
+
+        NOTE(review): the name suggests a 1-D sort by key; whether the
+        result holds sorted values or a permutation is not visible here —
+        confirm against the native implementation.
+
+        Args:
+            data: the Tensor to process.
+        """
+        ...
--- a/spconv/core_cc/csrc/sparse/all/ops1d.pyi
+++ b/spconv/core_cc/csrc/sparse/all/ops1d.pyi
+from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
+from pccm.stubs import EnumValue, EnumClassValue
+from cumm.tensorview import Tensor
+class Point2Voxel:
+    # Type stub for the Point2Voxel binding declared in ops1d.pyi; only names,
+    # types and defaults are visible here.
+    # The five Tensor attributes below are exposed by the binding; their
+    # contents are not described by this stub.
+    hashdata: Tensor
+    point_indice_data: Tensor
+    voxels: Tensor
+    indices: Tensor
+    num_per_voxel: Tensor
+    @property
+    def grid_size(self) -> List[int]: ...
+    def __init__(self, vsize_xyz: List[float], coors_range_xyz: List[float], num_point_features: int, max_num_voxels: int, max_num_points_per_voxel: int) -> None: 
+        """
+        Create a converter from the given voxelization settings.
+
+        NOTE(review): descriptions are inferred from parameter names only —
+        confirm against the native implementation.
+
+        Args:
+            vsize_xyz: presumably the voxel size per axis.
+            coors_range_xyz: presumably the coordinate range to voxelize.
+            num_point_features: presumably the feature count per point.
+            max_num_voxels: presumably the cap on the number of voxels.
+            max_num_points_per_voxel: presumably the cap on points kept in
+                one voxel.
+        """
+        ...
+    def point_to_voxel_hash(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
+        """
+        Convert ``points`` and return a tuple of three Tensors.
+
+        NOTE(review): the tuple presumably holds voxels, indices and
+        num_per_voxel — confirm the contents and order.
+
+        Args:
+            points: the input point Tensor.
+            clear_voxels: defaults to True; presumably resets the voxel
+                buffer before it is filled.
+        """
+        ...
--- a/spconv/core_cc/csrc/sparse/all/ops2d.pyi
+++ b/spconv/core_cc/csrc/sparse/all/ops2d.pyi
+from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
+from pccm.stubs import EnumValue, EnumClassValue
+from cumm.tensorview import Tensor
+class Point2Voxel:
+    # Type stub for the Point2Voxel binding declared in ops2d.pyi; only names,
+    # types and defaults are visible here.
+    # The five Tensor attributes below are exposed by the binding; their
+    # contents are not described by this stub.
+    hashdata: Tensor
+    point_indice_data: Tensor
+    voxels: Tensor
+    indices: Tensor
+    num_per_voxel: Tensor
+    @property
+    def grid_size(self) -> List[int]: ...
+    def __init__(self, vsize_xyz: List[float], coors_range_xyz: List[float], num_point_features: int, max_num_voxels: int, max_num_points_per_voxel: int) -> None: 
+        """
+        Create a converter from the given voxelization settings.
+
+        NOTE(review): descriptions are inferred from parameter names only —
+        confirm against the native implementation.
+
+        Args:
+            vsize_xyz: presumably the voxel size per axis.
+            coors_range_xyz: presumably the coordinate range to voxelize.
+            num_point_features: presumably the feature count per point.
+            max_num_voxels: presumably the cap on the number of voxels.
+            max_num_points_per_voxel: presumably the cap on points kept in
+                one voxel.
+        """
+        ...
+    def point_to_voxel_hash(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
+        """
+        Convert ``points`` and return a tuple of three Tensors.
+
+        NOTE(review): the tuple presumably holds voxels, indices and
+        num_per_voxel — confirm the contents and order.
+
+        Args:
+            points: the input point Tensor.
+            clear_voxels: defaults to True; presumably resets the voxel
+                buffer before it is filled.
+        """
+        ...
--- a/spconv/core_cc/csrc/sparse/all/ops3d.pyi
+++ b/spconv/core_cc/csrc/sparse/all/ops3d.pyi
+from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
+from pccm.stubs import EnumValue, EnumClassValue
+from cumm.tensorview import Tensor
+class Point2Voxel:
+    # Type stub for the Point2Voxel binding declared in ops3d.pyi; only names,
+    # types and defaults are visible here.
+    # The five Tensor attributes below are exposed by the binding; their
+    # contents are not described by this stub.
+    hashdata: Tensor
+    point_indice_data: Tensor
+    voxels: Tensor
+    indices: Tensor
+    num_per_voxel: Tensor
+    @property
+    def grid_size(self) -> List[int]: ...
+    def __init__(self, vsize_xyz: List[float], coors_range_xyz: List[float], num_point_features: int, max_num_voxels: int, max_num_points_per_voxel: int) -> None: 
+        """
+        Create a converter from the given voxelization settings.
+
+        NOTE(review): descriptions are inferred from parameter names only —
+        confirm against the native implementation.
+
+        Args:
+            vsize_xyz: presumably the voxel size per axis.
+            coors_range_xyz: presumably the coordinate range to voxelize.
+            num_point_features: presumably the feature count per point.
+            max_num_voxels: presumably the cap on the number of voxels.
+            max_num_points_per_voxel: presumably the cap on points kept in
+                one voxel.
+        """
+        ...
+    def point_to_voxel_hash(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
+        """
+        Convert ``points`` and return a tuple of three Tensors.
+
+        NOTE(review): the tuple presumably holds voxels, indices and
+        num_per_voxel — confirm the contents and order.
+
+        Args:
+            points: the input point Tensor.
+            clear_voxels: defaults to True; presumably resets the voxel
+                buffer before it is filled.
+        """
+        ...
--- a/spconv/core_cc/csrc/sparse/all/ops4d.pyi
+++ b/spconv/core_cc/csrc/sparse/all/ops4d.pyi
+from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
+from pccm.stubs import EnumValue, EnumClassValue
+from cumm.tensorview import Tensor
+class Point2Voxel:
+    # Type stub for the Point2Voxel binding declared in ops4d.pyi; only names,
+    # types and defaults are visible here.
+    # The five Tensor attributes below are exposed by the binding; their
+    # contents are not described by this stub.
+    hashdata: Tensor
+    point_indice_data: Tensor
+    voxels: Tensor
+    indices: Tensor
+    num_per_voxel: Tensor
+    @property
+    def grid_size(self) -> List[int]: ...
+    def __init__(self, vsize_xyz: List[float], coors_range_xyz: List[float], num_point_features: int, max_num_voxels: int, max_num_points_per_voxel: int) -> None: 
+        """
+        Create a converter from the given voxelization settings.
+
+        NOTE(review): descriptions are inferred from parameter names only —
+        confirm against the native implementation.
+
+        Args:
+            vsize_xyz: presumably the voxel size per axis.
+            coors_range_xyz: presumably the coordinate range to voxelize.
+            num_point_features: presumably the feature count per point.
+            max_num_voxels: presumably the cap on the number of voxels.
+            max_num_points_per_voxel: presumably the cap on points kept in
+                one voxel.
+        """
+        ...
+    def point_to_voxel_hash(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
+        """
+        Convert ``points`` and return a tuple of three Tensors.
+
+        NOTE(review): the tuple presumably holds voxels, indices and
+        num_per_voxel — confirm the contents and order.
+
+        Args:
+            points: the input point Tensor.
+            clear_voxels: defaults to True; presumably resets the voxel
+                buffer before it is filled.
+        """
+        ...
--- a/spconv/core_cc/csrc/sparse/all/ops_cpu1d.pyi
+++ b/spconv/core_cc/csrc/sparse/all/ops_cpu1d.pyi
+from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
+from pccm.stubs import EnumValue, EnumClassValue
+from cumm.tensorview import Tensor
+class Point2VoxelCPU:
+    # Type stub for the Point2VoxelCPU binding declared in ops_cpu1d.pyi; only
+    # names, types and defaults are visible here.
+    # The four Tensor attributes below are exposed by the binding; their
+    # contents are not described by this stub.
+    densehashdata: Tensor
+    voxels: Tensor
+    indices: Tensor
+    num_per_voxel: Tensor
+    @property
+    def grid_size(self) -> List[int]: ...
+    def __init__(self, vsize_xyz: List[float], coors_range_xyz: List[float], num_point_features: int, max_num_voxels: int, max_num_points_per_voxel: int) -> None: 
+        """
+        Create a converter from the given voxelization settings.
+
+        NOTE(review): descriptions are inferred from parameter names only —
+        confirm against the native implementation.
+
+        Args:
+            vsize_xyz: presumably the voxel size per axis.
+            coors_range_xyz: presumably the coordinate range to voxelize.
+            num_point_features: presumably the feature count per point.
+            max_num_voxels: presumably the cap on the number of voxels.
+            max_num_points_per_voxel: presumably the cap on points kept in
+                one voxel.
+        """
+        ...
+    def point_to_voxel(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
+        """
+        Convert ``points`` and return a tuple of three Tensors.
+
+        NOTE(review): the tuple presumably holds voxels, indices and
+        num_per_voxel — confirm the contents and order.
+
+        Args:
+            points: the input point Tensor.
+            clear_voxels: defaults to True; presumably resets the voxel
+                buffer before it is filled.
+        """
+        ...
+    def point_to_voxel_empty_mean(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
+        """
+        Variant of ``point_to_voxel`` with the same signature.
+
+        NOTE(review): how the "empty mean" behaviour differs from
+        ``point_to_voxel`` is not visible in this stub — confirm against
+        the native implementation.
+
+        Args:
+            points: the input point Tensor.
+            clear_voxels: defaults to True; presumably resets the voxel
+                buffer before it is filled.
+        """
+        ...
--- a/spconv/core_cc/csrc/sparse/all/ops_cpu2d.pyi
+++ b/spconv/core_cc/csrc/sparse/all/ops_cpu2d.pyi
+from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
+from pccm.stubs import EnumValue, EnumClassValue
+from cumm.tensorview import Tensor
+class Point2VoxelCPU:
+    # Type stub for the Point2VoxelCPU binding declared in ops_cpu2d.pyi; only
+    # names, types and defaults are visible here.
+    # The four Tensor attributes below are exposed by the binding; their
+    # contents are not described by this stub.
+    densehashdata: Tensor
+    voxels: Tensor
+    indices: Tensor
+    num_per_voxel: Tensor
+    @property
+    def grid_size(self) -> List[int]: ...
+    def __init__(self, vsize_xyz: List[float], coors_range_xyz: List[float], num_point_features: int, max_num_voxels: int, max_num_points_per_voxel: int) -> None: 
+        """
+        Create a converter from the given voxelization settings.
+
+        NOTE(review): descriptions are inferred from parameter names only —
+        confirm against the native implementation.
+
+        Args:
+            vsize_xyz: presumably the voxel size per axis.
+            coors_range_xyz: presumably the coordinate range to voxelize.
+            num_point_features: presumably the feature count per point.
+            max_num_voxels: presumably the cap on the number of voxels.
+            max_num_points_per_voxel: presumably the cap on points kept in
+                one voxel.
+        """
+        ...
+    def point_to_voxel(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
+        """
+        Convert ``points`` and return a tuple of three Tensors.
+
+        NOTE(review): the tuple presumably holds voxels, indices and
+        num_per_voxel — confirm the contents and order.
+
+        Args:
+            points: the input point Tensor.
+            clear_voxels: defaults to True; presumably resets the voxel
+                buffer before it is filled.
+        """
+        ...
+    def point_to_voxel_empty_mean(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
+        """
+        Variant of ``point_to_voxel`` with the same signature.
+
+        NOTE(review): how the "empty mean" behaviour differs from
+        ``point_to_voxel`` is not visible in this stub — confirm against
+        the native implementation.
+
+        Args:
+            points: the input point Tensor.
+            clear_voxels: defaults to True; presumably resets the voxel
+                buffer before it is filled.
+        """
+        ...
--- a/spconv/core_cc/csrc/sparse/all/ops_cpu3d.pyi
+++ b/spconv/core_cc/csrc/sparse/all/ops_cpu3d.pyi
+from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
+from pccm.stubs import EnumValue, EnumClassValue
+from cumm.tensorview import Tensor
+class Point2VoxelCPU:
+    # Type stub for the Point2VoxelCPU binding declared in ops_cpu3d.pyi; only
+    # names, types and defaults are visible here.
+    # The four Tensor attributes below are exposed by the binding; their
+    # contents are not described by this stub.
+    densehashdata: Tensor
+    voxels: Tensor
+    indices: Tensor
+    num_per_voxel: Tensor
+    @property
+    def grid_size(self) -> List[int]: ...
+    def __init__(self, vsize_xyz: List[float], coors_range_xyz: List[float], num_point_features: int, max_num_voxels: int, max_num_points_per_voxel: int) -> None: 
+        """
+        Create a converter from the given voxelization settings.
+
+        NOTE(review): descriptions are inferred from parameter names only —
+        confirm against the native implementation.
+
+        Args:
+            vsize_xyz: presumably the voxel size per axis.
+            coors_range_xyz: presumably the coordinate range to voxelize.
+            num_point_features: presumably the feature count per point.
+            max_num_voxels: presumably the cap on the number of voxels.
+            max_num_points_per_voxel: presumably the cap on points kept in
+                one voxel.
+        """
+        ...
+    def point_to_voxel(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
+        """
+        Convert ``points`` and return a tuple of three Tensors.
+
+        NOTE(review): the tuple presumably holds voxels, indices and
+        num_per_voxel — confirm the contents and order.
+
+        Args:
+            points: the input point Tensor.
+            clear_voxels: defaults to True; presumably resets the voxel
+                buffer before it is filled.
+        """
+        ...
+    def point_to_voxel_empty_mean(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
+        """
+        Variant of ``point_to_voxel`` with the same signature.
+
+        NOTE(review): how the "empty mean" behaviour differs from
+        ``point_to_voxel`` is not visible in this stub — confirm against
+        the native implementation.
+
+        Args:
+            points: the input point Tensor.
+            clear_voxels: defaults to True; presumably resets the voxel
+                buffer before it is filled.
+        """
+        ...
--- a/spconv/core_cc/csrc/sparse/all/ops_cpu4d.pyi
+++ b/spconv/core_cc/csrc/sparse/all/ops_cpu4d.pyi
+from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
+from pccm.stubs import EnumValue, EnumClassValue
+from cumm.tensorview import Tensor
+class Point2VoxelCPU:
+    # Type stub for the Point2VoxelCPU binding declared in ops_cpu4d.pyi; only
+    # names, types and defaults are visible here.
+    # The four Tensor attributes below are exposed by the binding; their
+    # contents are not described by this stub.
+    densehashdata: Tensor
+    voxels: Tensor
+    indices: Tensor
+    num_per_voxel: Tensor
+    @property
+    def grid_size(self) -> List[int]: ...
+    def __init__(self, vsize_xyz: List[float], coors_range_xyz: List[float], num_point_features: int, max_num_voxels: int, max_num_points_per_voxel: int) -> None: 
+        """
+        Create a converter from the given voxelization settings.
+
+        NOTE(review): descriptions are inferred from parameter names only —
+        confirm against the native implementation.
+
+        Args:
+            vsize_xyz: presumably the voxel size per axis.
+            coors_range_xyz: presumably the coordinate range to voxelize.
+            num_point_features: presumably the feature count per point.
+            max_num_voxels: presumably the cap on the number of voxels.
+            max_num_points_per_voxel: presumably the cap on points kept in
+                one voxel.
+        """
+        ...
+    def point_to_voxel(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
+        """
+        Convert ``points`` and return a tuple of three Tensors.
+
+        NOTE(review): the tuple presumably holds voxels, indices and
+        num_per_voxel — confirm the contents and order.
+
+        Args:
+            points: the input point Tensor.
+            clear_voxels: defaults to True; presumably resets the voxel
+                buffer before it is filled.
+        """
+        ...
+    def point_to_voxel_empty_mean(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
+        """
+        Variant of ``point_to_voxel`` with the same signature.
+
+        NOTE(review): how the "empty mean" behaviour differs from
+        ``point_to_voxel`` is not visible in this stub — confirm against
+        the native implementation.
+
+        Args:
+            points: the input point Tensor.
+            clear_voxels: defaults to True; presumably resets the voxel
+                buffer before it is filled.
+        """
+        ...
--- a/spconv/core_cc/cumm/__init__.pyi
+++ b/spconv/core_cc/cumm/__init__.pyi
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/spconv/core_cc/cumm/gemm/__init__.pyi
+++ b/spconv/core_cc/cumm/gemm/__init__.pyi
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/spconv/core_cc/cumm/gemm/gather.pyi
+++ b/spconv/core_cc/cumm/gemm/gather.pyi
+from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
+from pccm.stubs import EnumValue, EnumClassValue
+from cumm.tensorview import Tensor
+class ScatterAll:
+    # Type stub for the ScatterAll binding; only names, types and defaults are
+    # visible here.
+    def __init__(self) -> None: ...
+    @staticmethod
+    def get_all_scatter_params() -> List[Tuple[int, int, int, int]]: ...
+    def supported_scatter(self, tile_m: int, tile_k_bytes: int, bytes_per_access: int, num_threads: int, channel_size: int, dtype: int) -> bool: 
+        """
+        Report whether a scatter configuration is supported.
+
+        NOTE(review): descriptions are inferred from parameter names only —
+        confirm against the native implementation.
+
+        Args:
+            tile_m: presumably the tile extent along M.
+            tile_k_bytes: presumably the tile extent along K, in bytes.
+            bytes_per_access: presumably the width of one memory access.
+            num_threads: presumably the thread count of the kernel.
+            channel_size: presumably the channel count of the data.
+            dtype: an integer dtype code.
+        """
+        ...
+    @staticmethod
+    def stream_synchronize(stream: int = 0) -> None: 
+        """
+        Presumably blocks until work on ``stream`` has finished — confirm.
+
+        Args:
+            stream: defaults to 0; presumably a stream handle.
+        """
+        ...
+    def scatter(self, output: Tensor, input: Tensor, indices: Tensor, tile_m: int, tile_k_bytes: int, bytes_per_access: int, num_threads: int, stream: int = 0) -> None: 
+        """
+        Scatter with an explicit kernel configuration; returns nothing.
+
+        NOTE(review): descriptions are inferred from parameter names only —
+        confirm against the native implementation.
+
+        Args:
+            output: presumably the destination Tensor.
+            input: presumably the source Tensor.
+            indices: presumably the destination rows for each source row.
+            tile_m: presumably the tile extent along M.
+            tile_k_bytes: presumably the tile extent along K, in bytes.
+            bytes_per_access: presumably the width of one memory access.
+            num_threads: presumably the thread count of the kernel.
+            stream: defaults to 0; presumably a stream handle.
+        """
+        ...
+    def scatter2(self, output: Tensor, input: Tensor, indices: Tensor, size: int, stream: int = 0) -> None: 
+        """
+        Scatter variant that takes ``size`` instead of tile settings.
+
+        NOTE(review): descriptions are inferred from parameter names only —
+        confirm against the native implementation.
+
+        Args:
+            output: presumably the destination Tensor.
+            input: presumably the source Tensor.
+            indices: presumably the destination rows for each source row.
+            size: presumably the number of entries to process.
+            stream: defaults to 0; presumably a stream handle.
+        """
+        ...
+class GatherAll:
+    # Type stub for the GatherAll binding; only names, types and defaults are
+    # visible here.
+    def __init__(self) -> None: ...
+    @staticmethod
+    def get_all_gather_params() -> List[Tuple[int, int, int, int]]: ...
+    @staticmethod
+    def supported(bytes_per_access: int, channel_size: int, dtype: int) -> bool: 
+        """
+        Report whether a gather configuration is supported.
+
+        NOTE(review): descriptions are inferred from parameter names only —
+        confirm against the native implementation.
+
+        Args:
+            bytes_per_access: presumably the width of one memory access.
+            channel_size: presumably the channel count of the data.
+            dtype: an integer dtype code.
+        """
+        ...
+    @staticmethod
+    def stream_synchronize(stream: int = 0) -> None: 
+        """
+        Presumably blocks until work on ``stream`` has finished — confirm.
+
+        Args:
+            stream: defaults to 0; presumably a stream handle.
+        """
+        ...
+    def gather(self, output: Tensor, input: Tensor, indices: Tensor, tile_m: int, tile_k_bytes: int, bytes_per_access: int, num_threads: int, stream: int = 0) -> None: 
+        """
+        Gather with an explicit kernel configuration; returns nothing.
+
+        NOTE(review): descriptions are inferred from parameter names only —
+        confirm against the native implementation.
+
+        Args:
+            output: presumably the destination Tensor.
+            input: presumably the source Tensor.
+            indices: presumably the source rows to read.
+            tile_m: presumably the tile extent along M.
+            tile_k_bytes: presumably the tile extent along K, in bytes.
+            bytes_per_access: presumably the width of one memory access.
+            num_threads: presumably the thread count of the kernel.
+            stream: defaults to 0; presumably a stream handle.
+        """
+        ...
+    def gather2(self, output: Tensor, input: Tensor, indices: Tensor, size: int, stream: int = 0) -> None: 
+        """
+        Gather variant that takes ``size`` instead of tile settings.
+
+        NOTE(review): descriptions are inferred from parameter names only —
+        confirm against the native implementation.
+
+        Args:
+            output: presumably the destination Tensor.
+            input: presumably the source Tensor.
+            indices: presumably the source rows to read.
+            size: presumably the number of entries to process.
+            stream: defaults to 0; presumably a stream handle.
+        """
+        ...
--- a/spconv/core_cc/cumm/gemm/main.pyi
+++ b/spconv/core_cc/cumm/gemm/main.pyi
+from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
+from pccm.stubs import EnumValue, EnumClassValue
+from cumm.tensorview import Tensor
+class GemmAlgoDesp:
+    # Type stub for the GemmAlgoDesp binding, which describes one GEMM
+    # algorithm: dtypes, tile shapes, stage count, algorithm name and
+    # split-k settings. Only names and types are visible here.
+    # dtype_* / dacc / dcomp are integer codes; their meaning is not shown in
+    # this stub.
+    dtype_a: int
+    dtype_b: int
+    dtype_c: int
+    tile_shape: Tuple[int, int, int]
+    warp_tile_shape: Tuple[int, int, int]
+    num_stage: int
+    dacc: int
+    dcomp: int
+    algo: str
+    tensorop: List[int]
+    # Integer fields; the bool properties split_k_serial / split_k_parallel
+    # below presumably wrap them — confirm.
+    split_k_serial_: int
+    split_k_parallel_: int
+    shuffle_type: str
+    element_per_access_a: int
+    element_per_access_b: int
+    element_per_access_c: int
+    def __init__(self) -> None: ...
+    def __repr__(self) -> str: ...
+    @property
+    def split_k_serial(self) -> bool: ...
+    @split_k_serial.setter
+    def split_k_serial(self, val: bool) -> None: 
+        """
+        Set the ``split_k_serial`` flag.
+
+        Args:
+            val: the new flag value.
+        """
+        ...
+    @property
+    def split_k_parallel(self) -> bool: ...
+    @split_k_parallel.setter
+    def split_k_parallel(self, val: bool) -> None: 
+        """
+        Set the ``split_k_parallel`` flag.
+
+        Args:
+            val: the new flag value.
+        """
+        ...
+    # Returns None; presumably raises when the description is invalid —
+    # confirm against the native implementation.
+    def check_valid(self) -> None: ...
+    @property
+    def trans_a(self) -> bool: ...
+    @trans_a.setter
+    def trans_a(self, val: bool) -> None: 
+        """
+        Set the ``trans_a`` flag.
+
+        Args:
+            val: the new flag value.
+        """
+        ...
+    @property
+    def trans_b(self) -> bool: ...
+    @trans_b.setter
+    def trans_b(self, val: bool) -> None: 
+        """
+        Set the ``trans_b`` flag.
+
+        Args:
+            val: the new flag value.
+        """
+        ...
+    @property
+    def trans_c(self) -> bool: ...
+    @trans_c.setter
+    def trans_c(self, val: bool) -> None: 
+        """
+        Set the ``trans_c`` flag.
+
+        Args:
+            val: the new flag value.
+        """
+        ...
+    def query_workspace_size(self, m: int, n: int, k: int, split_k_slices: int) -> int: 
+        """
+        Return the workspace size for a problem as an int.
+
+        NOTE(review): the unit of the result (presumably bytes) and the
+        parameter meanings are inferred from names — confirm.
+
+        Args:
+            m: presumably the M extent of the GEMM problem.
+            n: presumably the N extent of the GEMM problem.
+            k: presumably the K extent of the GEMM problem.
+            split_k_slices: presumably the number of split-k slices.
+        """
+        ...
+    def supported(self, m: int, n: int, k: int) -> bool: 
+        """
+        Report whether this algorithm supports the given problem size.
+
+        NOTE(review): parameter meanings are inferred from names — confirm.
+
+        Args:
+            m: presumably the M extent of the GEMM problem.
+            n: presumably the N extent of the GEMM problem.
+            k: presumably the K extent of the GEMM problem.
+        """
+        ...
+    def supported_ldx(self, lda: int, ldb: int, ldc: int) -> bool: 
+        """
+        Report whether this algorithm supports the given ``ld*`` values.
+
+        NOTE(review): the values are presumably the leading dimensions of
+        A, B and C — confirm.
+
+        Args:
+            lda: presumably the leading dimension of A.
+            ldb: presumably the leading dimension of B.
+            ldc: presumably the leading dimension of C.
+        """
+        ...
+class GemmParams:
+    algo_desp: GemmAlgoDesp
+    split_k_slices: int
+    workspace: Tensor =  Tensor()
+    a_inds: Tensor =  Tensor()
+    b_inds: Tensor =  Tensor()
+    c_inds: Tensor =  Tensor()
+    alpha: float
+    beta: float
+    stream: int
+    def __init__(self) -> None: ...
+    def check_valid(self) -> None: ...
+    @property
+    def a(self) -> Tensor: ...
+    @a.setter
+    def a(self, val: Tensor) -> None: 
+        """
+        Args:
+            val: 
+        """
+        ...
+    @property
+    def b(self) -> Tensor: ...
+    @b.setter
+    def b(self, val: Tensor) -> None: 
+        """
+        Args:
+            val: 
+        """
+        ...
+    @property
+    def c(self) -> Tensor: ...
+    @c.setter
+    def c(self, val: Tensor) -> None: 
+        """
+        Args:
+            val: 
+        """
+        ...
+class GemmMainUnitTest:
+    @staticmethod
+    def get_all_algo_desp() -> List[GemmAlgoDesp]: ...
+    @staticmethod
+    def extract_mnk(a_shape: List[int], b_shape: List[int], trans_a: bool, trans_b: bool, trans_c: bool, shuffle_type: str = "NS", a_inds_shape: List[int] =  [], b_inds_shape: List[int] =  [], c_inds_shape: List[int] =  []) -> Tuple[int, int, int]: 
+        """
+        Args:
+            a_shape: 
+            b_shape: 
+            trans_a: 
+            trans_b: 
+            trans_c: 
+            shuffle_type: 
+            a_inds_shape: 
+            b_inds_shape: 
+            c_inds_shape: 
+        """
+        ...
+    @staticmethod
+    def align_to_power2(val: int) -> int: 
+        """
+        Args:
+            val: 
+        """
+        ...
+    @staticmethod
+    def device_synchronize() -> None: ...
+    @staticmethod
+    def stream_synchronize(stream: int) -> None: 
+        """
+        Args:
+            stream: 
+        """
+        ...
+    @staticmethod
+    def simple_select_tile_shape(m: int, n: int, k: int, tile_ms: List[int], tile_ns: List[int], tile_ks: List[int], tile_shape_to_algos: Dict[int, List[int]], large_k_first: bool) -> List[int]: 
+        """
+        Args:
+            m: 
+            n: 
+            k: 
+            tile_ms: 
+            tile_ns: 
+            tile_ks: 
+            tile_shape_to_algos: 
+            large_k_first: 
+        """
+        ...
+    @staticmethod
+    def matmul2(params: GemmParams) -> None: 
+        """
+        Args:
+            params: 
+        """
+        ...
--- a/spconv/csrc/__init__.py
+++ b/spconv/csrc/__init__.py
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.