v2.1.5: add profile tool and python 3.6 for linux

82fd7a8b · yan.yan · f31eee3a · f31eee3a · f31eee3a · 82fd7a8b
Commit 82fd7a8b authored Nov 10, 2021 by yan.yan
20 changed files
--- a/spconv/core_cc/csrc/sparse/all/ops_cpu3d/__init__.pyi
+++ b/spconv/core_cc/csrc/sparse/all/ops_cpu3d/__init__.pyi
-from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
-from pccm.stubs import EnumValue, EnumClassValue
-from cumm.tensorview import Tensor
-class Point2VoxelCPU:
-    densehashdata: Tensor
-    voxels: Tensor
-    indices: Tensor
-    num_per_voxel: Tensor
-    @property
-    def grid_size(self) -> List[int]: ...
-    @staticmethod
-    def calc_meta_data(vsize_xyz: List[float], coors_range_xyz: List[float]) -> Tuple[List[float], List[int], List[int], List[float]]: 
-        """
-        Args:
-            vsize_xyz: 
-            coors_range_xyz: 
-        """
-        ...
-    def __init__(self, vsize_xyz: List[float], coors_range_xyz: List[float], num_point_features: int, max_num_voxels: int, max_num_points_per_voxel: int) -> None: 
-        """
-        Args:
-            vsize_xyz: 
-            coors_range_xyz: 
-            num_point_features: 
-            max_num_voxels: 
-            max_num_points_per_voxel: 
-        """
-        ...
-    @staticmethod
-    def point_to_voxel_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
-        """
-        Args:
-            points: 
-            voxels: 
-            indices: 
-            num_per_voxel: 
-            densehashdata: 
-            vsize: 
-            grid_size: 
-            grid_stride: 
-            coors_range: 
-            clear_voxels: 
-        """
-        ...
-    @staticmethod
-    def point_to_voxel_empty_mean_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
-        """
-        Args:
-            points: 
-            voxels: 
-            indices: 
-            num_per_voxel: 
-            densehashdata: 
-            vsize: 
-            grid_size: 
-            grid_stride: 
-            coors_range: 
-            clear_voxels: 
-        """
-        ...
-    def point_to_voxel(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
-        """
-        Args:
-            points: 
-            clear_voxels: 
-        """
-        ...
-    def point_to_voxel_empty_mean(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
-        """
-        Args:
-            points: 
-            clear_voxels: 
-        """
-        ...
--- a/spconv/core_cc/csrc/sparse/all/ops_cpu3d/p2v_c.pyi
+++ b/spconv/core_cc/csrc/sparse/all/ops_cpu3d/p2v_c.pyi
-from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
-from pccm.stubs import EnumValue, EnumClassValue
-class Point2VoxelCommon:
-    @staticmethod
-    def calc_meta_data(vsize_xyz: List[float], coors_range_xyz: List[float]) -> Tuple[List[float], List[int], List[int], List[float]]: 
-        """
-        Args:
-            vsize_xyz: 
-            coors_range_xyz: 
-        """
-        ...
--- a/spconv/core_cc/csrc/sparse/all/ops_cpu4d.pyi
+++ b/spconv/core_cc/csrc/sparse/all/ops_cpu4d.pyi
@@ -9,14 +9,11 @@ class Point2VoxelCPU:
    @property
    def grid_size(self) -> List[int]: ...
    @staticmethod
-    def calc_meta_data(vsize_xyz: List[float], coors_range_xyz: List[float], num_point_features: int, max_num_voxels: int, max_num_points_per_voxel: int) -> Tuple[List[float], List[int], List[int], List[float]]: 
+    def calc_meta_data(vsize_xyz: List[float], coors_range_xyz: List[float]) -> Tuple[List[float], List[int], List[int], List[float]]: 
        """
        Args:
            vsize_xyz: 
            coors_range_xyz: 
-            num_point_features: 
-            max_num_voxels: 
-            max_num_points_per_voxel: 
        """
        ...
    def __init__(self, vsize_xyz: List[float], coors_range_xyz: List[float], num_point_features: int, max_num_voxels: int, max_num_points_per_voxel: int) -> None: 
@@ -30,7 +27,7 @@ class Point2VoxelCPU:
        """
        ...
    @staticmethod
-    def point_to_voxel_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, mean_per_voxel: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
+    def point_to_voxel_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
        """
        Args:
            points: 
@@ -38,7 +35,6 @@ class Point2VoxelCPU:
            indices: 
            num_per_voxel: 
            densehashdata: 
-            mean_per_voxel: 
            vsize: 
            grid_size: 
            grid_stride: 
@@ -47,7 +43,7 @@ class Point2VoxelCPU:
        """
        ...
    @staticmethod
-    def point_to_voxel_empty_mean_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, mean_per_voxel: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
+    def point_to_voxel_empty_mean_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
        """
        Args:
            points: 
@@ -55,7 +51,6 @@ class Point2VoxelCPU:
            indices: 
            num_per_voxel: 
            densehashdata: 
-            mean_per_voxel: 
            vsize: 
            grid_size: 
            grid_stride: 

--- a/spconv/core_cc/csrc/sparse/all/ops_cpu4d/__init__.pyi
+++ b/spconv/core_cc/csrc/sparse/all/ops_cpu4d/__init__.pyi
-from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
-from pccm.stubs import EnumValue, EnumClassValue
-from cumm.tensorview import Tensor
-class Point2VoxelCPU:
-    densehashdata: Tensor
-    voxels: Tensor
-    indices: Tensor
-    num_per_voxel: Tensor
-    @property
-    def grid_size(self) -> List[int]: ...
-    @staticmethod
-    def calc_meta_data(vsize_xyz: List[float], coors_range_xyz: List[float]) -> Tuple[List[float], List[int], List[int], List[float]]: 
-        """
-        Args:
-            vsize_xyz: 
-            coors_range_xyz: 
-        """
-        ...
-    def __init__(self, vsize_xyz: List[float], coors_range_xyz: List[float], num_point_features: int, max_num_voxels: int, max_num_points_per_voxel: int) -> None: 
-        """
-        Args:
-            vsize_xyz: 
-            coors_range_xyz: 
-            num_point_features: 
-            max_num_voxels: 
-            max_num_points_per_voxel: 
-        """
-        ...
-    @staticmethod
-    def point_to_voxel_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
-        """
-        Args:
-            points: 
-            voxels: 
-            indices: 
-            num_per_voxel: 
-            densehashdata: 
-            vsize: 
-            grid_size: 
-            grid_stride: 
-            coors_range: 
-            clear_voxels: 
-        """
-        ...
-    @staticmethod
-    def point_to_voxel_empty_mean_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
-        """
-        Args:
-            points: 
-            voxels: 
-            indices: 
-            num_per_voxel: 
-            densehashdata: 
-            vsize: 
-            grid_size: 
-            grid_stride: 
-            coors_range: 
-            clear_voxels: 
-        """
-        ...
-    def point_to_voxel(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
-        """
-        Args:
-            points: 
-            clear_voxels: 
-        """
-        ...
-    def point_to_voxel_empty_mean(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
-        """
-        Args:
-            points: 
-            clear_voxels: 
-        """
-        ...
--- a/spconv/core_cc/csrc/sparse/all/ops_cpu4d/p2v_c.pyi
+++ b/spconv/core_cc/csrc/sparse/all/ops_cpu4d/p2v_c.pyi
-from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
-from pccm.stubs import EnumValue, EnumClassValue
-class Point2VoxelCommon:
-    @staticmethod
-    def calc_meta_data(vsize_xyz: List[float], coors_range_xyz: List[float]) -> Tuple[List[float], List[int], List[int], List[float]]: 
-        """
-        Args:
-            vsize_xyz: 
-            coors_range_xyz: 
-        """
-        ...
--- a/spconv/core_cc/cumm/__init__.pyi
+++ b/spconv/core_cc/cumm/__init__.pyi
-# Copyright 2021 Yan Yan
-# 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# 
-#     http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
--- a/spconv/core_cc/cumm/conv/main.pyi
+++ b/spconv/core_cc/cumm/conv/main.pyi
@@ -2,6 +2,7 @@ from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Ty
 from pccm.stubs import EnumValue, EnumClassValue
 from ...cumm.gemm.main import GemmAlgoDesp
 from cumm.tensorview import Tensor
+from cumm.tensorview import CUDAKernelTimer
 class ConvAlgoDesp(GemmAlgoDesp):
    ndim: int
    op_type: int
@@ -86,17 +87,19 @@ class ConvParams:
    mask_filter: int
    reverse_mask: bool
    verbose: bool
+    timer: CUDAKernelTimer
    workspace: Tensor =  Tensor()
    mask: Tensor =  Tensor()
    mask_argsort: Tensor =  Tensor()
    indices: Tensor =  Tensor()
    mask_output: Tensor =  Tensor()
    stream: int
-    def __init__(self, ndim: int, op_type: int) -> None: 
+    def __init__(self, ndim: int, op_type: int, timer: CUDAKernelTimer =  CUDAKernelTimer(False)) -> None: 
        """
        Args:
            ndim: 
            op_type: 
+            timer: 
        """
        ...
 class ConvMainUnitTest:

--- a/spconv/core_cc/cumm/gemm/__init__.pyi
+++ b/spconv/core_cc/cumm/gemm/__init__.pyi
-# Copyright 2021 Yan Yan
-# 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# 
-#     http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
--- a/spconv/core_cc/cumm/gemm/main.pyi
+++ b/spconv/core_cc/cumm/gemm/main.pyi
 from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
 from pccm.stubs import EnumValue, EnumClassValue
 from cumm.tensorview import Tensor
+from cumm.tensorview import CUDAKernelTimer
 class GemmAlgoDesp:
    dtype_a: int
    dtype_b: int
@@ -102,7 +103,13 @@ class GemmParams:
    alpha: float
    beta: float
    stream: int
-    def __init__(self) -> None: ...
+    timer: CUDAKernelTimer
+    def __init__(self, timer: CUDAKernelTimer =  CUDAKernelTimer(False)) -> None: 
+        """
+        Args:
+            timer: 
+        """
+        ...
    def check_valid(self) -> None: ...
    @property
    def a(self) -> Tensor: ...

--- a/spconv/core_cc/cumm/tools/__init__.pyi
+++ b/spconv/core_cc/cumm/tools/__init__.pyi
--- a/spconv/core_cc/cumm/tools/cuda.pyi
+++ b/spconv/core_cc/cumm/tools/cuda.pyi
+from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
+from pccm.stubs import EnumValue, EnumClassValue
+class CUDAEvent:
+    # Type stub only: every body below is `...`, so the behaviour lives in the
+    # native extension (presumably generated by pccm - confirm before editing
+    # this file by hand).
+    # Visible interface: construct from a name, record() on a stream
+    # (default 0), sync(), and the static duration(start, stop) -> float.
+    # NOTE(review): the unit of duration() is not stated here - TODO confirm
+    # against the cumm implementation.
+    def __init__(self, name: str) -> None: 
+        """
+        Args:
+            name: 
+        """
+        ...
+    def record(self, stream: int = 0) -> None: 
+        """
+        Args:
+            stream: 
+        """
+        ...
+    def sync(self) -> None: ...
+    @staticmethod
+    def duration(start: "CUDAEvent", stop: "CUDAEvent") -> float: 
+        """
+        Args:
+            start: 
+            stop: 
+        """
+        ...
+class CUDAKernelTimer:
+    # Type stub only: every body below is `...`; the implementation is native.
+    # Visible interface: an `enable` flag (constructor default True),
+    # push(name) / pop(), record(name, stream=0), insert_pair(name, start,
+    # stop), get_all_pair_duration() -> Dict[str, float], and sync().
+    # NOTE(review): presumably `start` / `stop` in insert_pair() are names
+    # previously passed to record(), and the returned dict is keyed by the
+    # pair name - verify against the cumm implementation.
+    enable: bool
+    def __init__(self, enable: bool = True) -> None: 
+        """
+        Args:
+            enable: 
+        """
+        ...
+    def push(self, name: str) -> None: 
+        """
+        Args:
+            name: 
+        """
+        ...
+    def pop(self) -> None: ...
+    def record(self, name: str, stream: int = 0) -> None: 
+        """
+        Args:
+            name: 
+            stream: 
+        """
+        ...
+    def insert_pair(self, name: str, start: str, stop: str) -> None: 
+        """
+        Args:
+            name: 
+            start: 
+            stop: 
+        """
+        ...
+    def get_all_pair_duration(self) -> Dict[str, float]: ...
+    def sync(self) -> None: ...
--- a/spconv/cppconstants.py
+++ b/spconv/cppconstants.py
 # Copyright 2021 Yan Yan
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -17,4 +17,4 @@ import spconv.core_cc as _ext
 if hasattr(_ext, "cumm"):
    CPU_ONLY_BUILD = False
 else:
-    CPU_ONLY_BUILD = True 
+    CPU_ONLY_BUILD = True
--- a/spconv/csrc/__init__.py
+++ b/spconv/csrc/__init__.py
 # Copyright 2021 Yan Yan
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
--- a/spconv/csrc/sparse/__init__.py
+++ b/spconv/csrc/sparse/__init__.py
 # Copyright 2021 Yan Yan
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
--- a/spconv/csrc/sparse/all.py
+++ b/spconv/csrc/sparse/all.py
 # Copyright 2021 Yan Yan
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -17,13 +17,14 @@ from cumm.conv.bases import ConvOpType, NHWC
 from cumm.conv.params import ConvProblem
 from cumm import dtypes
 from cumm.constants import CUMM_CPU_ONLY_BUILD
-import pccm 
+import pccm
 from ccimport import compat
 from .pointops import Point2Voxel, Point2VoxelCPU
 from .indices import SparseConvIndicesKernel, CudaCommonKernel, SparseConvIndicesCPU
 from .maxpool import IndiceMaxPool, IndiceMaxPoolCPU
 from .gather import GatherCPU

+
 class CustomThrustLib(pccm.Class):
    def __init__(self):
        super().__init__()
@@ -32,12 +33,15 @@ class CustomThrustLib(pccm.Class):
        if compat.InLinux:
            self.build_meta.add_cflags("nvcc", "-Xcompiler", "-fno-gnu-unique")

+
 class ThrustCustomAllocatorV2(pccm.Class, pccm.pybind.PybindClassMixin):
    def __init__(self):
        super().__init__()
        self.add_dependency(TensorView)
        self.add_include("functional", "memory")
-        self.add_pybind_member("alloc_func", "std::function<std::uintptr_t(std::size_t)>", pyanno="Callable[[int], int]")
+        self.add_pybind_member("alloc_func",
+                               "std::function<std::uintptr_t(std::size_t)>",
+                               pyanno="Callable[[int], int]")
        self.add_typedef("value_type", "char")

    @pccm.member_function
@@ -54,14 +58,15 @@ class ThrustCustomAllocatorV2(pccm.Class, pccm.pybind.PybindClassMixin):
            TV_THROW_RT_ERR("set alloc function first.");
        }}
        """)
-        return code 
+        return code

    @pccm.member_function
    def deallocate(self):
        code = pccm.FunctionCode()
        code.arg("ptr", "char *")
        code.arg("num_bytes", "size_t")
-        return code 
+        return code
+

 class SpconvOps(pccm.Class):
    def __init__(self):
@@ -69,28 +74,38 @@ class SpconvOps(pccm.Class):
        self.add_dependency(ThrustCustomAllocatorV2)
        self.ndims = [1, 2, 3, 4]
        for ndim in self.ndims:
-            p2v = Point2Voxel(dtypes.float32,  ndim)
+            p2v = Point2Voxel(dtypes.float32, ndim)
            p2v_cpu = Point2VoxelCPU(dtypes.float32, ndim)
-            self.add_param_class(f"ops_cpu{ndim}d", p2v_cpu, f"Point2Voxel{ndim}DCPU")
+            self.add_param_class(f"ops_cpu{ndim}d", p2v_cpu,
+                                 f"Point2Voxel{ndim}DCPU")

            problem = ConvProblem(ndim, ConvOpType.kForward, NHWC, NHWC, NHWC)
            indices = SparseConvIndicesKernel(problem, dtypes.int32)
            indices_cpu = SparseConvIndicesCPU(problem, dtypes.int32)
-            self.add_param_class(f"ops_cpu{ndim}d", indices_cpu, f"SpconvIndicesCPU{ndim}D")
+            self.add_param_class(f"ops_cpu{ndim}d", indices_cpu,
+                                 f"SpconvIndicesCPU{ndim}D")
            # self.add_param_class("ops", indices, "SpconvIndices")
            if not CUMM_CPU_ONLY_BUILD:
                self.add_param_class(f"ops{ndim}d", p2v, f"Point2Voxel{ndim}D")
-                cuda_funcs = [self.generate_subm_conv_inds, 
-                    self.generate_conv_inds_stage1, self.generate_conv_inds_stage1_5, self.generate_conv_inds_stage2, self.sort_1d_by_key,
-                    self.generate_conv_inds_mask_stage1, self.generate_conv_inds_mask_stage2]
-                self.add_impl_only_param_class(cuda_funcs, f"ops{ndim}d", indices, f"SpconvIndices{ndim}D")
+                cuda_funcs = [
+                    self.generate_subm_conv_inds,
+                    self.generate_conv_inds_stage1,
+                    self.generate_conv_inds_stage1_5,
+                    self.generate_conv_inds_stage2, self.sort_1d_by_key,
+                    self.generate_conv_inds_mask_stage1,
+                    self.generate_conv_inds_mask_stage2
+                ]
+                self.add_impl_only_param_class(cuda_funcs, f"ops{ndim}d",
+                                               indices,
+                                               f"SpconvIndices{ndim}D")

    @pccm.pybind.mark
    @pccm.cuda.static_function
    def generate_conv_inds_stage1(self):
        code = pccm.FunctionCode()
        code.arg("indices", "tv::Tensor")
-        code.arg("indice_pairs, indice_pairs_uniq, indice_num_per_loc", "tv::Tensor")
+        code.arg("indice_pairs, indice_pairs_uniq, indice_num_per_loc",
+                 "tv::Tensor")
        code.arg("batch_size", "int")
        code.arg("output_dims, input_dims", f"std::vector<int>")
        code.arg("ksize, stride, padding, dilation", f"std::vector<int>")
@@ -127,7 +142,7 @@ class SpconvOps(pccm.Class):
            """)
        code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""")

-        return code# .ret("int")
+        return code  # .ret("int")

    @pccm.pybind.mark
    @pccm.cuda.static_function
@@ -201,7 +216,8 @@ class SpconvOps(pccm.Class):
            return code.make_invalid()

        code.arg("indices", "tv::Tensor")
-        code.arg("indice_pairs_bwd, indice_pairs_uniq, indice_num_per_loc", "tv::Tensor")
+        code.arg("indice_pairs_bwd, indice_pairs_uniq, indice_num_per_loc",
+                 "tv::Tensor")
        code.arg("batch_size", "int")
        code.arg("output_dims, input_dims", f"std::vector<int>")
        code.arg("ksize, stride, padding, dilation", f"std::vector<int>")
@@ -236,7 +252,7 @@ class SpconvOps(pccm.Class):
            """)
        code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""")

-        return code# .ret("int")
+        return code  # .ret("int")

    @pccm.pybind.mark
    @pccm.cuda.static_function
@@ -245,7 +261,9 @@ class SpconvOps(pccm.Class):
        if CUMM_CPU_ONLY_BUILD:
            return code.make_invalid()
        code.arg("indices, hashdata", "tv::Tensor")
-        code.arg("indice_pairs_fwd, indice_pairs_bwd, indice_pairs_uniq, out_inds", "tv::Tensor")
+        code.arg(
+            "indice_pairs_fwd, indice_pairs_bwd, indice_pairs_uniq, out_inds",
+            "tv::Tensor")
        code.arg("mask_fwd, mask_bwd", "tv::Tensor")
        code.arg("num_out_act", "int")
        code.arg("batch_size", "int")
@@ -294,7 +312,8 @@ class SpconvOps(pccm.Class):
        code.arg("batch_size", "int")
        code.arg("input_dims", f"std::vector<int>")
        code.arg("ksize, dilation", f"std::vector<int>")
-        code.arg("indice_pair_mask", "tv::Tensor", "tv::Tensor()", "cumm.tensorview.Tensor = Tensor()")
+        code.arg("indice_pair_mask", "tv::Tensor", "tv::Tensor()",
+                 "cumm.tensorview.Tensor = Tensor()")
        code.arg("backward", "bool", "false")
        code.arg("stream_int", f"std::uintptr_t", "0", pyanno="int = 0")
        code.raw(f"""
@@ -529,7 +548,10 @@ class SpconvOps(pccm.Class):
        if CUMM_CPU_ONLY_BUILD:
            return code.make_invalid()
        code.arg("data", "tv::Tensor")
-        code.arg("indices", "tv::Tensor", "tv::Tensor()", pyanno="cumm.tensorview.Tensor = Tensor()")
+        code.arg("indices",
+                 "tv::Tensor",
+                 "tv::Tensor()",
+                 pyanno="cumm.tensorview.Tensor = Tensor()")
        code.arg("stream", "std::uintptr_t", "0", pyanno="int")
        code.code_after_include = f"""
        template <typename T> struct SmallOrEqualTo {{
@@ -575,7 +597,10 @@ class SpconvOps(pccm.Class):
        code.arg("data", "tv::Tensor")
        code.arg("alloc_func", "std::function<std::uintptr_t(std::size_t)>")

-        code.arg("indices", "tv::Tensor", "tv::Tensor()", pyanno="cumm.tensorview.Tensor = Tensor()")
+        code.arg("indices",
+                 "tv::Tensor",
+                 "tv::Tensor()",
+                 pyanno="cumm.tensorview.Tensor = Tensor()")
        code.arg("stream", "std::uintptr_t", "0", pyanno="int")
        code.code_after_include = f"""
        template <typename T> struct SmallOrEqualTo {{
@@ -613,7 +638,6 @@ class SpconvOps(pccm.Class):
        """)
        return code.ret("tv::Tensor")

-
    @pccm.pybind.mark
    @pccm.cuda.static_function
    def sort_1d_by_key_split(self):
@@ -623,7 +647,10 @@ class SpconvOps(pccm.Class):
        code.arg("data", "tv::Tensor")
        code.arg("mask", "tv::Tensor")

-        code.arg("indices", "tv::Tensor", "tv::Tensor()", pyanno="cumm.tensorview.Tensor = Tensor()")
+        code.arg("indices",
+                 "tv::Tensor",
+                 "tv::Tensor()",
+                 pyanno="cumm.tensorview.Tensor = Tensor()")
        code.arg("stream", "std::uintptr_t", "0", pyanno="int")
        code.arg("mask_output", "bool", "false")

@@ -678,7 +705,10 @@ class SpconvOps(pccm.Class):

        code.arg("mask", "tv::Tensor")

-        code.arg("indices", "tv::Tensor", "tv::Tensor()", pyanno="cumm.tensorview.Tensor = Tensor()")
+        code.arg("indices",
+                 "tv::Tensor",
+                 "tv::Tensor()",
+                 pyanno="cumm.tensorview.Tensor = Tensor()")
        code.arg("stream", "std::uintptr_t", "0", pyanno="int")
        code.arg("mask_output", "bool", "false")

@@ -821,9 +851,10 @@ class SpconvOps(pccm.Class):
            }}
            """)
        code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""")
-        return code.ret("std::tuple<std::vector<float>, std::vector<int>, std::vector<int>, std::vector<float>>")
-    
-    
+        return code.ret(
+            "std::tuple<std::vector<float>, std::vector<int>, std::vector<int>, std::vector<float>>"
+        )
+
    @pccm.pybind.mark
    @pccm.static_function
    def point2voxel_cpu(self):
@@ -876,7 +907,8 @@ class SpconvOps(pccm.Class):
    def point2voxel_cuda(self):
        code = pccm.FunctionCode()
        code.arg("points", "tv::Tensor")
-        code.arg("voxels, indices, num_per_voxel, hashdata, point_indice_data", "tv::Tensor")
+        code.arg("voxels, indices, num_per_voxel, hashdata, point_indice_data",
+                 "tv::Tensor")
        code.arg("vsize", f"std::vector<float>")
        code.arg("grid_size, grid_stride", f"std::vector<int>")
        code.arg("coors_range", f"std::vector<float>")
@@ -914,4 +946,4 @@ class SpconvOps(pccm.Class):
            }}
            """)
        code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""")
-        return code.ret("std::tuple<tv::Tensor, tv::Tensor, tv::Tensor>")
\ No newline at end of file
+        return code.ret("std::tuple<tv::Tensor, tv::Tensor, tv::Tensor>")
--- a/spconv/csrc/sparse/cpu_core.py
+++ b/spconv/csrc/sparse/cpu_core.py
+# Copyright 2021 Yan Yan
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pccm
+from ccimport import compat
+from cumm.common import TensorView
+
+
+class OMPLib(pccm.Class):
+    """pccm class that carries the OpenMP compiler switches.
+
+    Depends on ``TensorView``, includes ``tensorview/parallel/all.h`` and
+    registers one flag per compiler: ``/openmp`` for ``cl`` when
+    ``compat.InWindows`` is true, ``-fopenmp`` for ``g++`` and ``clang++``
+    otherwise.
+
+    NOTE(review): presumably pccm applies these flags to classes that list
+    ``OMPLib`` as a dependency - confirm against the pccm build-meta docs.
+    """
+    def __init__(self):
+        super().__init__()
+        self.add_dependency(TensorView)
+        self.add_include("tensorview/parallel/all.h")
+        if compat.InWindows:
+            # MSVC spelling of the OpenMP switch.
+            self.build_meta.add_cflags("cl", "/openmp")
+        else:
+            # Everything that is not Windows: register the flag for both
+            # g++ and clang++.
+            self.build_meta.add_cflags("g++", "-fopenmp")
+            self.build_meta.add_cflags("clang++", "-fopenmp")
--- a/spconv/csrc/sparse/devleop/sort_bench.py
+++ b/spconv/csrc/sparse/devleop/sort_bench.py
-import torch 
-import time 
+import torch
+import time
+

 def main():

@@ -34,4 +35,4 @@ def main():


 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
--- a/spconv/csrc/sparse/gather.py
+++ b/spconv/csrc/sparse/gather.py
 # Copyright 2021 Yan Yan
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import pccm 
+import pccm
 from cumm.common import TensorView
-from typing import List 
+from cumm.constants import CUMM_CPU_ONLY_BUILD
+from spconv.csrc.sparse.cpu_core import OMPLib
+from typing import List
+

 class GatherCPU(pccm.Class):
    def __init__(self):
        super().__init__()
+        if CUMM_CPU_ONLY_BUILD:
+            self.add_dependency(OMPLib)
        self.add_dependency(TensorView)
-    
+        self.add_include("tensorview/parallel/all.h")
+
    @pccm.static_function
    def gather(self):
        code = pccm.FunctionCode()
@@ -35,15 +41,16 @@ class GatherCPU(pccm.Class):
        int channel = in.dim(1);
        tv::dispatch<float, double>(out.dtype(), [&](auto I){{
            auto indices_data = inds.data_ptr<const int>();
-
            using T = TV_DECLTYPE(I);
            T *buffer_data = out.data_ptr<T>();
            const T *features_data = in.data_ptr<const T>();
-            for (int i = 0; i < nhot; ++i) {{
-                std::memcpy(buffer_data + i * channel,
-                            features_data + indices_data[i] * channel,
-                            sizeof(T) * channel);
-            }}
+            tv::kernel_1d(out.device(), nhot, [&](int begin, int end, int step){{
+                for (int i = begin; i < end; i += step) {{
+                    std::memcpy(buffer_data + i * channel,
+                                features_data + indices_data[i] * channel,
+                                sizeof(T) * channel);
+                }}
+            }});
        }});
        """)
        return code
@@ -65,13 +72,15 @@ class GatherCPU(pccm.Class):
            T *features_data = out.data_ptr<T>();
            const T *buf = in.data_ptr<const T>();
            T *out_ptr = out.data_ptr<T>();
-            for (int i = 0; i < nhot; ++i) {{
-                buf = buffer_data + i * channel;
-                out_ptr = features_data + indices_data[i] * channel;
-                for (int j = 0; j < channel; ++j) {{
-                    out_ptr[j] = out_ptr[j] + buf[j];
+            tv::kernel_1d(out.device(), nhot, [&](int begin, int end, int step){{
+                for (int i = begin; i < end; i += step) {{
+                    const T *buf = buffer_data + i * channel;  // loop-local: with the by-reference capture a pointer declared outside would be shared by every worker running this lambda
+                    T *out_ptr = features_data + indices_data[i] * channel;  // loop-local for the same reason; NOTE(review): duplicate indices would still collide if ranges run concurrently - confirm
+                    for (int j = 0; j < channel; ++j) {{
+                        out_ptr[j] = out_ptr[j] + buf[j];
+                    }}
                }}
-            }}
+            }});
        }});
        """)
        return code
--- a/spconv/csrc/sparse/indices.py
+++ b/spconv/csrc/sparse/indices.py
--- a/spconv/csrc/sparse/maxpool.py
+++ b/spconv/csrc/sparse/maxpool.py
 # Copyright 2021 Yan Yan
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -16,15 +16,18 @@ import contextlib
 from cumm.conv.bases import ConvEnum
 from cumm.gemm.core.metaarray import MetaArray, seq
 from cumm import dtypes
-import pccm 
+import pccm
 from cumm.gemm.layout import TensorGeneric, to_stride
 from cumm.common import TensorView, TensorViewHashKernel, TensorViewKernel, ThrustLib, GemmBasic
 from cumm.gemm import codeops
-from typing import List 
+from typing import List
 from cumm.conv.params import ConvProblem
 from cumm.gemm.mask_iters import MaskTileIterator, MaskTileIteratorParams
-import numpy as np 
+import numpy as np
 from cumm.gemm import (thread_map)
+from spconv.csrc.sparse.cpu_core import OMPLib
+from cumm.constants import CUMM_CPU_ONLY_BUILD
+

 class IndiceMaxPool(pccm.Class):
    # TODO optimize this function
@@ -32,13 +35,13 @@ class IndiceMaxPool(pccm.Class):
        super().__init__()
        self.add_include("limits")
        self.add_dependency(TensorViewKernel, TensorView, GemmBasic)
-    
+
    @pccm.cuda.cuda_global_function
    def forward_kernel(self):
        code = pccm.FunctionCode()
        code.targ("T")

-        code.arg("out_features", f"T*") 
+        code.arg("out_features", f"T*")
        code.arg("in_features", f"const T*")
        code.arg("out_indices", "const int*")
        code.arg("in_indices", "const int*")
@@ -67,7 +70,7 @@ class IndiceMaxPool(pccm.Class):
        code = pccm.FunctionCode()
        code.targ("T")

-        code.arg("out_features", f"T*") 
+        code.arg("out_features", f"T*")
        code.arg("in_features", f"const T*")
        code.arg("indices", "const int*")
        code.arg("num_features", "int")
@@ -104,9 +107,9 @@ class IndiceMaxPool(pccm.Class):
    def backward_kernel(self):
        code = pccm.FunctionCode()
        code.targ("T")
-        code.arg("out_features", f"const T*") 
+        code.arg("out_features", f"const T*")
        code.arg("in_features", f"const T*")
-        code.arg("dout_features", f"const T*") 
+        code.arg("dout_features", f"const T*")
        code.arg("din_features", f"T*")
        code.arg("out_indices", "const int*")
        code.arg("in_indices", "const int*")
@@ -137,9 +140,9 @@ class IndiceMaxPool(pccm.Class):
        code = pccm.FunctionCode()
        code.targ("T")

-        code.arg("out_features", f"const T*") 
+        code.arg("out_features", f"const T*")
        code.arg("in_features", f"const T*")
-        code.arg("dout_features", f"const T*") 
+        code.arg("dout_features", f"const T*")
        code.arg("din_features", f"T*")
        code.arg("indices_bwd", "const int*")
        code.arg("num_features", "int")
@@ -351,6 +354,9 @@ class IndiceMaxPoolCPU(pccm.Class):
    def __init__(self):
        super().__init__()
        self.add_dependency(TensorView)
+        if CUMM_CPU_ONLY_BUILD:  # OMPLib is registered only when CUMM_CPU_ONLY_BUILD is truthy
+            self.add_dependency(OMPLib)
+        self.add_include("tensorview/parallel/all.h")  # NOTE(review): presumably the header declaring tv::kernel_1d used by the CPU kernels in this class -- confirm

    @pccm.static_function
    def forward(self):
@@ -371,20 +377,21 @@ class IndiceMaxPoolCPU(pccm.Class):

            auto in_indices = in_inds.data_ptr<const int>();
            auto out_indices = out_inds.data_ptr<const int>();
-
-            for (int i = 0; i < nhot; ++i) {{
-                int in_idx = in_indices[i];
-                int out_idx = out_indices[i];
-                auto in_ptr = in_features + in_idx * num_features;
-                auto out_ptr = out_features + out_idx * num_features;
-                for (int j = 0; j < num_features; ++j) {{
-                    auto in = in_ptr[j];
-                    auto out = out_ptr[j];
-                    if (in > out){{
-                        out_ptr[j] = in;
+            tv::kernel_1d(out.device(), nhot, [&](int begin, int end, int step){{
+                for (int i = begin; i < end; i += step) {{
+                    int in_idx = in_indices[i];
+                    int out_idx = out_indices[i];
+                    auto in_ptr = in_features + in_idx * num_features;
+                    auto out_ptr = out_features + out_idx * num_features;
+                    for (int j = 0; j < num_features; ++j) {{
+                        auto in = in_ptr[j];
+                        auto out = out_ptr[j];
+                        if (in > out){{
+                            out_ptr[j] = in;
+                        }}
                    }}
                }}
-            }}
+            }});
        }});
        """)
        return code
@@ -412,22 +419,24 @@ class IndiceMaxPoolCPU(pccm.Class):

            auto in_indices = in_inds.data_ptr<const int>();
            auto out_indices = out_inds.data_ptr<const int>();
-
-            for (int i = 0; i < nhot; ++i) {{
-                int in_idx_offset = in_indices[i] * num_features;
-                int out_idx_offset = out_indices[i] * num_features;
-                auto in_ptr = in_features + in_idx_offset;
-                auto out_ptr = out_features + out_idx_offset;
-                auto din_ptr = din_features + in_idx_offset;
-                auto dout_ptr = dout_features + out_idx_offset;
-                for (int j = 0; j < num_features; ++j) {{
-                    auto in = in_ptr[j];
-                    auto out = out_ptr[j];
-                    if (in == out){{
-                        din_ptr[j] = din_ptr[j] + dout_ptr[j];
+            tv::kernel_1d(out.device(), nhot, [&](int begin, int end, int step){{
+                for (int i = begin; i < end; i += step) {{
+                    int in_idx_offset = in_indices[i] * num_features;
+                    int out_idx_offset = out_indices[i] * num_features;
+                    auto in_ptr = in_features + in_idx_offset;
+                    auto out_ptr = out_features + out_idx_offset;
+                    auto din_ptr = din_features + in_idx_offset;
+                    auto dout_ptr = dout_features + out_idx_offset;
+                    for (int j = 0; j < num_features; ++j) {{
+                        auto in = in_ptr[j];
+                        auto out = out_ptr[j];
+                        if (in == out){{
+                            din_ptr[j] = din_ptr[j] + dout_ptr[j];
+                        }}
                    }}
                }}
-            }}
+            }});
+
        }});
        """)
        return code