v2.1.5: add profile tool and python 3.6 for linux

82fd7a8b · yan.yan · f31eee3a · f31eee3a · f31eee3a · 82fd7a8b
Commit 82fd7a8b authored Nov 10, 2021 by yan.yan
20 changed files
--- a/spconv/core_cc/csrc/sparse/all/ops_cpu3d/__init__.pyi
+++ b/spconv/core_cc/csrc/sparse/all/ops_cpu3d/__init__.pyi
-from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
-from pccm.stubs import EnumValue, EnumClassValue
-from cumm.tensorview import Tensor
-class Point2VoxelCPU:
-    densehashdata: Tensor
-    voxels: Tensor
-    indices: Tensor
-    num_per_voxel: Tensor
-    @property
-    def grid_size(self) -> List[int]: ...
-    @staticmethod
-    def calc_meta_data(vsize_xyz: List[float], coors_range_xyz: List[float]) -> Tuple[List[float], List[int], List[int], List[float]]: 
-        """
-        Args:
-            vsize_xyz: 
-            coors_range_xyz: 
-        """
-        ...
-    def __init__(self, vsize_xyz: List[float], coors_range_xyz: List[float], num_point_features: int, max_num_voxels: int, max_num_points_per_voxel: int) -> None: 
-        """
-        Args:
-            vsize_xyz: 
-            coors_range_xyz: 
-            num_point_features: 
-            max_num_voxels: 
-            max_num_points_per_voxel: 
-        """
-        ...
-    @staticmethod
-    def point_to_voxel_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
-        """
-        Args:
-            points: 
-            voxels: 
-            indices: 
-            num_per_voxel: 
-            densehashdata: 
-            vsize: 
-            grid_size: 
-            grid_stride: 
-            coors_range: 
-            clear_voxels: 
-        """
-        ...
-    @staticmethod
-    def point_to_voxel_empty_mean_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
-        """
-        Args:
-            points: 
-            voxels: 
-            indices: 
-            num_per_voxel: 
-            densehashdata: 
-            vsize: 
-            grid_size: 
-            grid_stride: 
-            coors_range: 
-            clear_voxels: 
-        """
-        ...
-    def point_to_voxel(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
-        """
-        Args:
-            points: 
-            clear_voxels: 
-        """
-        ...
-    def point_to_voxel_empty_mean(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
-        """
-        Args:
-            points: 
-            clear_voxels: 
-        """
-        ...
--- a/spconv/core_cc/csrc/sparse/all/ops_cpu3d/p2v_c.pyi
+++ b/spconv/core_cc/csrc/sparse/all/ops_cpu3d/p2v_c.pyi
-from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
-from pccm.stubs import EnumValue, EnumClassValue
-class Point2VoxelCommon:
-    @staticmethod
-    def calc_meta_data(vsize_xyz: List[float], coors_range_xyz: List[float]) -> Tuple[List[float], List[int], List[int], List[float]]: 
-        """
-        Args:
-            vsize_xyz: 
-            coors_range_xyz: 
-        """
-        ...
--- a/spconv/core_cc/csrc/sparse/all/ops_cpu4d.pyi
+++ b/spconv/core_cc/csrc/sparse/all/ops_cpu4d.pyi
@@ -9,14 +9,11 @@ class Point2VoxelCPU:
    @property
    def grid_size(self) -> List[int]: ...
    @staticmethod
-    def calc_meta_data(vsize_xyz: List[float], coors_range_xyz: List[float], num_point_features: int, max_num_voxels: int, max_num_points_per_voxel: int) -> Tuple[List[float], List[int], List[int], List[float]]: 
+    def calc_meta_data(vsize_xyz: List[float], coors_range_xyz: List[float]) -> Tuple[List[float], List[int], List[int], List[float]]: 
        """
        Args:
            vsize_xyz: 
            coors_range_xyz: 
-            num_point_features: 
-            max_num_voxels: 
-            max_num_points_per_voxel: 
        """
        ...
    def __init__(self, vsize_xyz: List[float], coors_range_xyz: List[float], num_point_features: int, max_num_voxels: int, max_num_points_per_voxel: int) -> None: 
@@ -30,7 +27,7 @@ class Point2VoxelCPU:
        """
        ...
    @staticmethod
-    def point_to_voxel_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, mean_per_voxel: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
+    def point_to_voxel_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
        """
        Args:
            points: 
@@ -38,7 +35,6 @@ class Point2VoxelCPU:
            indices: 
            num_per_voxel: 
            densehashdata: 
-            mean_per_voxel: 
            vsize: 
            grid_size: 
            grid_stride: 
@@ -47,7 +43,7 @@ class Point2VoxelCPU:
        """
        ...
    @staticmethod
-    def point_to_voxel_empty_mean_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, mean_per_voxel: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
+    def point_to_voxel_empty_mean_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
        """
        Args:
            points: 
@@ -55,7 +51,6 @@ class Point2VoxelCPU:
            indices: 
            num_per_voxel: 
            densehashdata: 
-            mean_per_voxel: 
            vsize: 
            grid_size: 
            grid_stride: 

--- a/spconv/core_cc/csrc/sparse/all/ops_cpu4d/__init__.pyi
+++ b/spconv/core_cc/csrc/sparse/all/ops_cpu4d/__init__.pyi
-from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
-from pccm.stubs import EnumValue, EnumClassValue
-from cumm.tensorview import Tensor
-class Point2VoxelCPU:
-    densehashdata: Tensor
-    voxels: Tensor
-    indices: Tensor
-    num_per_voxel: Tensor
-    @property
-    def grid_size(self) -> List[int]: ...
-    @staticmethod
-    def calc_meta_data(vsize_xyz: List[float], coors_range_xyz: List[float]) -> Tuple[List[float], List[int], List[int], List[float]]: 
-        """
-        Args:
-            vsize_xyz: 
-            coors_range_xyz: 
-        """
-        ...
-    def __init__(self, vsize_xyz: List[float], coors_range_xyz: List[float], num_point_features: int, max_num_voxels: int, max_num_points_per_voxel: int) -> None: 
-        """
-        Args:
-            vsize_xyz: 
-            coors_range_xyz: 
-            num_point_features: 
-            max_num_voxels: 
-            max_num_points_per_voxel: 
-        """
-        ...
-    @staticmethod
-    def point_to_voxel_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
-        """
-        Args:
-            points: 
-            voxels: 
-            indices: 
-            num_per_voxel: 
-            densehashdata: 
-            vsize: 
-            grid_size: 
-            grid_stride: 
-            coors_range: 
-            clear_voxels: 
-        """
-        ...
-    @staticmethod
-    def point_to_voxel_empty_mean_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
-        """
-        Args:
-            points: 
-            voxels: 
-            indices: 
-            num_per_voxel: 
-            densehashdata: 
-            vsize: 
-            grid_size: 
-            grid_stride: 
-            coors_range: 
-            clear_voxels: 
-        """
-        ...
-    def point_to_voxel(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
-        """
-        Args:
-            points: 
-            clear_voxels: 
-        """
-        ...
-    def point_to_voxel_empty_mean(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
-        """
-        Args:
-            points: 
-            clear_voxels: 
-        """
-        ...
--- a/spconv/core_cc/csrc/sparse/all/ops_cpu4d/p2v_c.pyi
+++ b/spconv/core_cc/csrc/sparse/all/ops_cpu4d/p2v_c.pyi
-from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
-from pccm.stubs import EnumValue, EnumClassValue
-class Point2VoxelCommon:
-    @staticmethod
-    def calc_meta_data(vsize_xyz: List[float], coors_range_xyz: List[float]) -> Tuple[List[float], List[int], List[int], List[float]]: 
-        """
-        Args:
-            vsize_xyz: 
-            coors_range_xyz: 
-        """
-        ...
--- a/spconv/core_cc/cumm/__init__.pyi
+++ b/spconv/core_cc/cumm/__init__.pyi
-# Copyright 2021 Yan Yan
-# 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# 
-#     http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
--- a/spconv/core_cc/cumm/conv/main.pyi
+++ b/spconv/core_cc/cumm/conv/main.pyi
@@ -2,6 +2,7 @@ from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Ty
 from pccm.stubs import EnumValue, EnumClassValue
 from ...cumm.gemm.main import GemmAlgoDesp
 from cumm.tensorview import Tensor
+from cumm.tensorview import CUDAKernelTimer
 class ConvAlgoDesp(GemmAlgoDesp):
    ndim: int
    op_type: int
@@ -86,17 +87,19 @@ class ConvParams:
    mask_filter: int
    reverse_mask: bool
    verbose: bool
+    timer: CUDAKernelTimer
    workspace: Tensor =  Tensor()
    mask: Tensor =  Tensor()
    mask_argsort: Tensor =  Tensor()
    indices: Tensor =  Tensor()
    mask_output: Tensor =  Tensor()
    stream: int
-    def __init__(self, ndim: int, op_type: int) -> None: 
+    def __init__(self, ndim: int, op_type: int, timer: CUDAKernelTimer =  CUDAKernelTimer(False)) -> None: 
        """
        Args:
            ndim: 
            op_type: 
+            timer: 
        """
        ...
 class ConvMainUnitTest:

--- a/spconv/core_cc/cumm/gemm/__init__.pyi
+++ b/spconv/core_cc/cumm/gemm/__init__.pyi
-# Copyright 2021 Yan Yan
-# 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# 
-#     http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
--- a/spconv/core_cc/cumm/gemm/main.pyi
+++ b/spconv/core_cc/cumm/gemm/main.pyi
 from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
 from pccm.stubs import EnumValue, EnumClassValue
 from cumm.tensorview import Tensor
+from cumm.tensorview import CUDAKernelTimer
 class GemmAlgoDesp:
    dtype_a: int
    dtype_b: int
@@ -102,7 +103,13 @@ class GemmParams:
    alpha: float
    beta: float
    stream: int
-    def __init__(self) -> None: ...
+    timer: CUDAKernelTimer
+    def __init__(self, timer: CUDAKernelTimer =  CUDAKernelTimer(False)) -> None: 
+        """
+        Args:
+            timer: 
+        """
+        ...
    def check_valid(self) -> None: ...
    @property
    def a(self) -> Tensor: ...

--- a/spconv/core_cc/cumm/tools/__init__.pyi
+++ b/spconv/core_cc/cumm/tools/__init__.pyi
--- a/spconv/core_cc/cumm/tools/cuda.pyi
+++ b/spconv/core_cc/cumm/tools/cuda.pyi
+from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
+from pccm.stubs import EnumValue, EnumClassValue
+class CUDAEvent:
+    # Type stub for a named event object. It declares a constructor taking a
+    # name, ``record`` (with an integer ``stream`` defaulting to 0), ``sync``,
+    # and a static ``duration`` that takes two events and returns a float.
+    # NOTE(review): presumably a binding over a native CUDA event wrapper;
+    # confirm against the cumm tensorview sources.
+    def __init__(self, name: str) -> None: 
+        """
+        Args:
+            name: string identifying this event.
+        """
+        ...
+    def record(self, stream: int = 0) -> None: 
+        """
+        Args:
+            stream: integer stream handle; defaults to 0.
+                NOTE(review): presumably a raw CUDA stream pointer passed
+                as an int, with 0 meaning the default stream -- confirm.
+        """
+        ...
+    # Takes no arguments and returns None.
+    # NOTE(review): presumably blocks until the recorded event completes -- confirm.
+    def sync(self) -> None: ...
+    @staticmethod
+    def duration(start: "CUDAEvent", stop: "CUDAEvent") -> float: 
+        """
+        Args:
+            start: first event of the pair.
+            stop: second event of the pair.
+
+        Returns:
+            A float computed from the two events.
+            NOTE(review): presumably elapsed time between them; unit is not
+            visible here (likely milliseconds) -- confirm.
+        """
+        ...
+class CUDAKernelTimer:
+    # Type stub for a timer object with an ``enable`` flag. It declares
+    # ``push``/``pop`` taking a name, ``record`` taking a name and a stream,
+    # ``insert_pair`` associating a name with a start and a stop name,
+    # ``get_all_pair_duration`` returning a str -> float mapping, and ``sync``.
+    # NOTE(review): the semantics of each call live in the native
+    # implementation, which is not visible here; descriptions below are hedged.
+
+    # Boolean flag; also the sole constructor argument.
+    enable: bool
+    def __init__(self, enable: bool = True) -> None: 
+        """
+        Args:
+            enable: boolean flag, defaults to True.
+                NOTE(review): presumably turns timing on or off -- confirm.
+        """
+        ...
+    def push(self, name: str) -> None: 
+        """
+        Args:
+            name: string pushed onto the timer.
+                NOTE(review): presumably a namespace/prefix for subsequent
+                records, undone by ``pop`` -- confirm.
+        """
+        ...
+    # Takes no arguments; counterpart of ``push`` by name.
+    def pop(self) -> None: ...
+    def record(self, name: str, stream: int = 0) -> None: 
+        """
+        Args:
+            name: string identifying the record.
+            stream: integer stream handle; defaults to 0.
+                NOTE(review): presumably a raw CUDA stream pointer -- confirm.
+        """
+        ...
+    def insert_pair(self, name: str, start: str, stop: str) -> None: 
+        """
+        Args:
+            name: string identifying the pair.
+            start: string naming the start of the pair.
+            stop: string naming the stop of the pair.
+                NOTE(review): ``start``/``stop`` presumably refer to names
+                previously passed to ``record`` -- confirm.
+        """
+        ...
+    # Returns a dict mapping str keys to float values.
+    # NOTE(review): presumably pair name -> measured duration; unit not visible here.
+    def get_all_pair_duration(self) -> Dict[str, float]: ...
+    # Takes no arguments and returns None.
+    def sync(self) -> None: ...
--- a/spconv/cppconstants.py
+++ b/spconv/cppconstants.py
 # Copyright 2021 Yan Yan
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -17,4 +17,4 @@ import spconv.core_cc as _ext
 if hasattr(_ext, "cumm"):
    CPU_ONLY_BUILD = False
 else:
-    CPU_ONLY_BUILD = True 
+    CPU_ONLY_BUILD = True
--- a/spconv/csrc/__init__.py
+++ b/spconv/csrc/__init__.py
 # Copyright 2021 Yan Yan
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
--- a/spconv/csrc/sparse/__init__.py
+++ b/spconv/csrc/sparse/__init__.py
 # Copyright 2021 Yan Yan
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
--- a/spconv/csrc/sparse/all.py
+++ b/spconv/csrc/sparse/all.py
 # Copyright 2021 Yan Yan
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -17,13 +17,14 @@ from cumm.conv.bases import ConvOpType, NHWC
 from cumm.conv.params import ConvProblem
 from cumm import dtypes
 from cumm.constants import CUMM_CPU_ONLY_BUILD
-import pccm 
+import pccm
 from ccimport import compat
 from .pointops import Point2Voxel, Point2VoxelCPU
 from .indices import SparseConvIndicesKernel, CudaCommonKernel, SparseConvIndicesCPU
 from .maxpool import IndiceMaxPool, IndiceMaxPoolCPU
 from .gather import GatherCPU

+
 class CustomThrustLib(pccm.Class):
    def __init__(self):
        super().__init__()
@@ -32,12 +33,15 @@ class CustomThrustLib(pccm.Class):
        if compat.InLinux:
            self.build_meta.add_cflags("nvcc", "-Xcompiler", "-fno-gnu-unique")

+
 class ThrustCustomAllocatorV2(pccm.Class, pccm.pybind.PybindClassMixin):
    def __init__(self):
        super().__init__()
        self.add_dependency(TensorView)
        self.add_include("functional", "memory")
-        self.add_pybind_member("alloc_func", "std::function<std::uintptr_t(std::size_t)>", pyanno="Callable[[int], int]")
+        self.add_pybind_member("alloc_func",
+                               "std::function<std::uintptr_t(std::size_t)>",
+                               pyanno="Callable[[int], int]")
        self.add_typedef("value_type", "char")

    @pccm.member_function
@@ -54,14 +58,15 @@ class ThrustCustomAllocatorV2(pccm.Class, pccm.pybind.PybindClassMixin):
            TV_THROW_RT_ERR("set alloc function first.");
        }}
        """)
-        return code 
+        return code

    @pccm.member_function
    def deallocate(self):
        code = pccm.FunctionCode()
        code.arg("ptr", "char *")
        code.arg("num_bytes", "size_t")
-        return code 
+        return code
+

 class SpconvOps(pccm.Class):
    def __init__(self):
@@ -69,28 +74,38 @@ class SpconvOps(pccm.Class):
        self.add_dependency(ThrustCustomAllocatorV2)
        self.ndims = [1, 2, 3, 4]
        for ndim in self.ndims:
-            p2v = Point2Voxel(dtypes.float32,  ndim)
+            p2v = Point2Voxel(dtypes.float32, ndim)
            p2v_cpu = Point2VoxelCPU(dtypes.float32, ndim)
-            self.add_param_class(f"ops_cpu{ndim}d", p2v_cpu, f"Point2Voxel{ndim}DCPU")
+            self.add_param_class(f"ops_cpu{ndim}d", p2v_cpu,
+                                 f"Point2Voxel{ndim}DCPU")

            problem = ConvProblem(ndim, ConvOpType.kForward, NHWC, NHWC, NHWC)
            indices = SparseConvIndicesKernel(problem, dtypes.int32)
            indices_cpu = SparseConvIndicesCPU(problem, dtypes.int32)
-            self.add_param_class(f"ops_cpu{ndim}d", indices_cpu, f"SpconvIndicesCPU{ndim}D")
+            self.add_param_class(f"ops_cpu{ndim}d", indices_cpu,
+                                 f"SpconvIndicesCPU{ndim}D")
            # self.add_param_class("ops", indices, "SpconvIndices")
            if not CUMM_CPU_ONLY_BUILD:
                self.add_param_class(f"ops{ndim}d", p2v, f"Point2Voxel{ndim}D")
-                cuda_funcs = [self.generate_subm_conv_inds, 
-                    self.generate_conv_inds_stage1, self.generate_conv_inds_stage1_5, self.generate_conv_inds_stage2, self.sort_1d_by_key,
-                    self.generate_conv_inds_mask_stage1, self.generate_conv_inds_mask_stage2]
-                self.add_impl_only_param_class(cuda_funcs, f"ops{ndim}d", indices, f"SpconvIndices{ndim}D")
+                cuda_funcs = [
+                    self.generate_subm_conv_inds,
+                    self.generate_conv_inds_stage1,
+                    self.generate_conv_inds_stage1_5,
+                    self.generate_conv_inds_stage2, self.sort_1d_by_key,
+                    self.generate_conv_inds_mask_stage1,
+                    self.generate_conv_inds_mask_stage2
+                ]
+                self.add_impl_only_param_class(cuda_funcs, f"ops{ndim}d",
+                                               indices,
+                                               f"SpconvIndices{ndim}D")

    @pccm.pybind.mark
    @pccm.cuda.static_function
    def generate_conv_inds_stage1(self):
        code = pccm.FunctionCode()
        code.arg("indices", "tv::Tensor")
-        code.arg("indice_pairs, indice_pairs_uniq, indice_num_per_loc", "tv::Tensor")
+        code.arg("indice_pairs, indice_pairs_uniq, indice_num_per_loc",
+                 "tv::Tensor")
        code.arg("batch_size", "int")
        code.arg("output_dims, input_dims", f"std::vector<int>")
        code.arg("ksize, stride, padding, dilation", f"std::vector<int>")
@@ -127,7 +142,7 @@ class SpconvOps(pccm.Class):
            """)
        code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""")

-        return code# .ret("int")
+        return code  # .ret("int")

    @pccm.pybind.mark
    @pccm.cuda.static_function
@@ -201,7 +216,8 @@ class SpconvOps(pccm.Class):
            return code.make_invalid()

        code.arg("indices", "tv::Tensor")
-        code.arg("indice_pairs_bwd, indice_pairs_uniq, indice_num_per_loc", "tv::Tensor")
+        code.arg("indice_pairs_bwd, indice_pairs_uniq, indice_num_per_loc",
+                 "tv::Tensor")
        code.arg("batch_size", "int")
        code.arg("output_dims, input_dims", f"std::vector<int>")
        code.arg("ksize, stride, padding, dilation", f"std::vector<int>")
@@ -236,7 +252,7 @@ class SpconvOps(pccm.Class):
            """)
        code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""")

-        return code# .ret("int")
+        return code  # .ret("int")

    @pccm.pybind.mark
    @pccm.cuda.static_function
@@ -245,7 +261,9 @@ class SpconvOps(pccm.Class):
        if CUMM_CPU_ONLY_BUILD:
            return code.make_invalid()
        code.arg("indices, hashdata", "tv::Tensor")
-        code.arg("indice_pairs_fwd, indice_pairs_bwd, indice_pairs_uniq, out_inds", "tv::Tensor")
+        code.arg(
+            "indice_pairs_fwd, indice_pairs_bwd, indice_pairs_uniq, out_inds",
+            "tv::Tensor")
        code.arg("mask_fwd, mask_bwd", "tv::Tensor")
        code.arg("num_out_act", "int")
        code.arg("batch_size", "int")
@@ -294,7 +312,8 @@ class SpconvOps(pccm.Class):
        code.arg("batch_size", "int")
        code.arg("input_dims", f"std::vector<int>")
        code.arg("ksize, dilation", f"std::vector<int>")
-        code.arg("indice_pair_mask", "tv::Tensor", "tv::Tensor()", "cumm.tensorview.Tensor = Tensor()")
+        code.arg("indice_pair_mask", "tv::Tensor", "tv::Tensor()",
+                 "cumm.tensorview.Tensor = Tensor()")
        code.arg("backward", "bool", "false")
        code.arg("stream_int", f"std::uintptr_t", "0", pyanno="int = 0")
        code.raw(f"""
@@ -529,7 +548,10 @@ class SpconvOps(pccm.Class):
        if CUMM_CPU_ONLY_BUILD:
            return code.make_invalid()
        code.arg("data", "tv::Tensor")
-        code.arg("indices", "tv::Tensor", "tv::Tensor()", pyanno="cumm.tensorview.Tensor = Tensor()")
+        code.arg("indices",
+                 "tv::Tensor",
+                 "tv::Tensor()",
+                 pyanno="cumm.tensorview.Tensor = Tensor()")
        code.arg("stream", "std::uintptr_t", "0", pyanno="int")
        code.code_after_include = f"""
        template <typename T> struct SmallOrEqualTo {{
@@ -575,7 +597,10 @@ class SpconvOps(pccm.Class):
        code.arg("data", "tv::Tensor")
        code.arg("alloc_func", "std::function<std::uintptr_t(std::size_t)>")

-        code.arg("indices", "tv::Tensor", "tv::Tensor()", pyanno="cumm.tensorview.Tensor = Tensor()")
+        code.arg("indices",
+                 "tv::Tensor",
+                 "tv::Tensor()",
+                 pyanno="cumm.tensorview.Tensor = Tensor()")
        code.arg("stream", "std::uintptr_t", "0", pyanno="int")
        code.code_after_include = f"""
        template <typename T> struct SmallOrEqualTo {{
@@ -613,7 +638,6 @@ class SpconvOps(pccm.Class):
        """)
        return code.ret("tv::Tensor")

-
    @pccm.pybind.mark
    @pccm.cuda.static_function
    def sort_1d_by_key_split(self):
@@ -623,7 +647,10 @@ class SpconvOps(pccm.Class):
        code.arg("data", "tv::Tensor")
        code.arg("mask", "tv::Tensor")

-        code.arg("indices", "tv::Tensor", "tv::Tensor()", pyanno="cumm.tensorview.Tensor = Tensor()")
+        code.arg("indices",
+                 "tv::Tensor",
+                 "tv::Tensor()",
+                 pyanno="cumm.tensorview.Tensor = Tensor()")
        code.arg("stream", "std::uintptr_t", "0", pyanno="int")
        code.arg("mask_output", "bool", "false")

@@ -678,7 +705,10 @@ class SpconvOps(pccm.Class):

        code.arg("mask", "tv::Tensor")

-        code.arg("indices", "tv::Tensor", "tv::Tensor()", pyanno="cumm.tensorview.Tensor = Tensor()")
+        code.arg("indices",
+                 "tv::Tensor",
+                 "tv::Tensor()",
+                 pyanno="cumm.tensorview.Tensor = Tensor()")
        code.arg("stream", "std::uintptr_t", "0", pyanno="int")
        code.arg("mask_output", "bool", "false")

@@ -821,9 +851,10 @@ class SpconvOps(pccm.Class):
            }}
            """)
        code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""")
-        return code.ret("std::tuple<std::vector<float>, std::vector<int>, std::vector<int>, std::vector<float>>")
-    
-    
+        return code.ret(
+            "std::tuple<std::vector<float>, std::vector<int>, std::vector<int>, std::vector<float>>"
+        )
+
    @pccm.pybind.mark
    @pccm.static_function
    def point2voxel_cpu(self):
@@ -876,7 +907,8 @@ class SpconvOps(pccm.Class):
    def point2voxel_cuda(self):
        code = pccm.FunctionCode()
        code.arg("points", "tv::Tensor")
-        code.arg("voxels, indices, num_per_voxel, hashdata, point_indice_data", "tv::Tensor")
+        code.arg("voxels, indices, num_per_voxel, hashdata, point_indice_data",
+                 "tv::Tensor")
        code.arg("vsize", f"std::vector<float>")
        code.arg("grid_size, grid_stride", f"std::vector<int>")
        code.arg("coors_range", f"std::vector<float>")
@@ -914,4 +946,4 @@ class SpconvOps(pccm.Class):
            }}
            """)
        code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""")
-        return code.ret("std::tuple<tv::Tensor, tv::Tensor, tv::Tensor>")
\ No newline at end of file
+        return code.ret("std::tuple<tv::Tensor, tv::Tensor, tv::Tensor>")
--- a/spconv/csrc/sparse/cpu_core.py
+++ b/spconv/csrc/sparse/cpu_core.py
+# Copyright 2021 Yan Yan
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pccm
+from ccimport import compat
+from cumm.common import TensorView
+
+
+class OMPLib(pccm.Class):
+    """pccm class that pulls in the tensorview parallel header with OpenMP flags.
+
+    On construction it declares a dependency on ``TensorView``, includes
+    ``tensorview/parallel/all.h`` and registers a compiler flag: ``/openmp``
+    for ``cl`` when ``compat.InWindows`` is true, otherwise ``-fopenmp`` for
+    both ``g++`` and ``clang++``.
+    """
+    def __init__(self):
+        super().__init__()
+        self.add_dependency(TensorView)
+        self.add_include("tensorview/parallel/all.h")
+        # The OpenMP switch is spelled differently by MSVC and by GCC/Clang,
+        # so the flag is registered separately under each compiler name.
+        if compat.InWindows:
+            self.build_meta.add_cflags("cl", "/openmp")
+        else:
+            self.build_meta.add_cflags("g++", "-fopenmp")
+            self.build_meta.add_cflags("clang++", "-fopenmp")
--- a/spconv/csrc/sparse/devleop/sort_bench.py
+++ b/spconv/csrc/sparse/devleop/sort_bench.py
-import torch 
-import time 
+import torch
+import time
+

 def main():

@@ -34,4 +35,4 @@ def main():


 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
--- a/spconv/csrc/sparse/gather.py
+++ b/spconv/csrc/sparse/gather.py
 # Copyright 2021 Yan Yan
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import pccm 
+import pccm
 from cumm.common import TensorView
-from typing import List 
+from cumm.constants import CUMM_CPU_ONLY_BUILD
+from spconv.csrc.sparse.cpu_core import OMPLib
+from typing import List
+

 class GatherCPU(pccm.Class):
    def __init__(self):
        super().__init__()
+        if CUMM_CPU_ONLY_BUILD:
+            self.add_dependency(OMPLib)
        self.add_dependency(TensorView)
-    
+        self.add_include("tensorview/parallel/all.h")
+
    @pccm.static_function
    def gather(self):
        code = pccm.FunctionCode()
@@ -35,15 +41,16 @@ class GatherCPU(pccm.Class):
        int channel = in.dim(1);
        tv::dispatch<float, double>(out.dtype(), [&](auto I){{
            auto indices_data = inds.data_ptr<const int>();
-
            using T = TV_DECLTYPE(I);
            T *buffer_data = out.data_ptr<T>();
            const T *features_data = in.data_ptr<const T>();
-            for (int i = 0; i < nhot; ++i) {{
-                std::memcpy(buffer_data + i * channel,
-                            features_data + indices_data[i] * channel,
-                            sizeof(T) * channel);
-            }}
+            tv::kernel_1d(out.device(), nhot, [&](int begin, int end, int step){{
+                for (int i = begin; i < end; i += step) {{
+                    std::memcpy(buffer_data + i * channel,
+                                features_data + indices_data[i] * channel,
+                                sizeof(T) * channel);
+                }}
+            }});
        }});
        """)
        return code
@@ -65,13 +72,15 @@ class GatherCPU(pccm.Class):
            T *features_data = out.data_ptr<T>();
            const T *buf = in.data_ptr<const T>();
            T *out_ptr = out.data_ptr<T>();
-            for (int i = 0; i < nhot; ++i) {{
-                buf = buffer_data + i * channel;
-                out_ptr = features_data + indices_data[i] * channel;
-                for (int j = 0; j < channel; ++j) {{
-                    out_ptr[j] = out_ptr[j] + buf[j];
+            tv::kernel_1d(out.device(), nhot, [&](int begin, int end, int step){{
+                for (int i = begin; i < end; i += step) {{
+                    buf = buffer_data + i * channel;
+                    out_ptr = features_data + indices_data[i] * channel;
+                    for (int j = 0; j < channel; ++j) {{
+                        out_ptr[j] = out_ptr[j] + buf[j];
+                    }}
                }}
-            }}
+            }});
        }});
        """)
        return code
--- a/spconv/csrc/sparse/indices.py
+++ b/spconv/csrc/sparse/indices.py
 # Copyright 2021 Yan Yan
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -16,13 +16,14 @@ import contextlib
 from cumm.conv.bases import ConvEnum
 from cumm.gemm.core.metaarray import MetaArray, seq
 from cumm import dtypes
-import pccm 
+import pccm
 from cumm.gemm.layout import TensorGeneric, to_stride
 from cumm.common import TensorView, TensorViewHashKernel, TensorViewKernel, ThrustLib
 from cumm.gemm import codeops
-from typing import List 
+from typing import List
 from cumm.conv.params import ConvProblem
-import numpy as np 
+import numpy as np
+

 class CudaCommonKernel(pccm.ParameterizedClass):
    # we need to use PClass instead of Class
@@ -31,8 +32,8 @@ class CudaCommonKernel(pccm.ParameterizedClass):
    def arange_kernel(self):
        code = pccm.FunctionCode()
        code.targ("T")
-        code.arg("data", f"T*") 
-        code.arg("size", f"int") 
+        code.arg("data", f"T*")
+        code.arg("size", f"int")
        code.raw(f"""
        for (int i : tv::KernelLoopX<int>(size)) {{
            data[i] = T(i);
@@ -44,9 +45,9 @@ class CudaCommonKernel(pccm.ParameterizedClass):
    def fill_kernel(self):
        code = pccm.FunctionCode()
        code.targ("T")
-        code.arg("data", f"T*") 
+        code.arg("data", f"T*")
        code.arg("val", f"T")
-        code.arg("size", f"int") 
+        code.arg("size", f"int")
        code.raw(f"""
        for (int i : tv::KernelLoopX<int>(size)) {{
            data[i] = T(val);
@@ -66,7 +67,7 @@ class ConvOutLocIter(pccm.ParameterizedClass):
        self.add_param_class("lociter", layout_npq, "LayoutNPQ")
        self.add_param_class("lociter_rs", layout_rs, "LayoutRS")

-        self.ndim = problem.ndim 
+        self.ndim = problem.ndim
        self.add_member("problem_", f"ConvProblem")
        self.add_member("count_", f"tv::array<int, {self.ndim}>")
        self.add_member("layout_npq", f"LayoutNPQ")
@@ -82,13 +83,15 @@ class ConvOutLocIter(pccm.ParameterizedClass):
        pqs = codeops.unpack("problem.output_dims", range(self.ndim))
        rss = codeops.unpack("problem.ksize", range(self.ndim))

-        code.ctor_init("layout_npq", f"LayoutNPQ::from_shape({{problem.N, {pqs}}})")
+        code.ctor_init("layout_npq",
+                       f"LayoutNPQ::from_shape({{problem.N, {pqs}}})")
        code.ctor_init("layout_rs", f"LayoutRS::from_shape({{{rss}}})")
-        
-        return code 

-    @pccm.member_function(header_only=True, attrs=["TV_HOST_DEVICE_INLINE"],
-                               name="operator++")
+        return code
+
+    @pccm.member_function(header_only=True,
+                          attrs=["TV_HOST_DEVICE_INLINE"],
+                          name="operator++")
    def increment(self):
        code = pccm.FunctionCode()
        for i in range(self.ndim - 1, -1, -1):
@@ -110,8 +113,9 @@ class ConvOutLocIter(pccm.ParameterizedClass):
        """)
        return code

-    @pccm.member_function(header_only=True, attrs=["TV_HOST_DEVICE_INLINE"],
-                               const=True)
+    @pccm.member_function(header_only=True,
+                          attrs=["TV_HOST_DEVICE_INLINE"],
+                          const=True)
    def nhw_to_npq(self):
        code = pccm.FunctionCode()
        code.arg("nhw_offset", "const int*")
@@ -128,8 +132,9 @@ class ConvOutLocIter(pccm.ParameterizedClass):
        """)
        return code.ret(f"tv::array<int, {self.ndim + 1}>")

-    @pccm.member_function(header_only=True, attrs=["TV_HOST_DEVICE_INLINE"],
-                               const=True)
+    @pccm.member_function(header_only=True,
+                          attrs=["TV_HOST_DEVICE_INLINE"],
+                          const=True)
    def npq_to_nhw(self):
        code = pccm.FunctionCode()
        code.arg("npq_offset", "const int*")
@@ -144,9 +149,9 @@ class ConvOutLocIter(pccm.ParameterizedClass):
        """)
        return code.ret(f"tv::array<int, {self.ndim + 1}>")

-
-    @pccm.member_function(header_only=True, attrs=["TV_HOST_DEVICE_INLINE"],
-                               const=True)
+    @pccm.member_function(header_only=True,
+                          attrs=["TV_HOST_DEVICE_INLINE"],
+                          const=True)
    def query_npq(self):
        code = pccm.FunctionCode()
        code.arg("nhw_offset", "const int*")
@@ -156,22 +161,27 @@ class ConvOutLocIter(pccm.ParameterizedClass):
        auto npq_no_stride = nhw_to_npq<true>(nhw_offset);
        npq_offset[0] = npq_no_stride[0];
        """)
-        hw_valid = [] # type: List[str]
-        stride_valid = [] # type: List[str]
+        hw_valid = []  # type: List[str]
+        stride_valid = []  # type: List[str]
        for i in range(self.ndim):
-            code.raw(f"npq_offset[{i + 1}] = npq_no_stride[{i + 1}] / problem_.stride[{i}];")
-            hw_valid.append((f"npq_offset[{i + 1}] >= 0 && "
-                            f"npq_offset[{i + 1}] < problem_.output_dims[{i}]"))
-            stride_valid.append(f"!(npq_no_stride[{i + 1}] % problem_.stride[{i}])")
+            code.raw(
+                f"npq_offset[{i + 1}] = npq_no_stride[{i + 1}] / problem_.stride[{i}];"
+            )
+            hw_valid.append(
+                (f"npq_offset[{i + 1}] >= 0 && "
+                 f"npq_offset[{i + 1}] < problem_.output_dims[{i}]"))
+            stride_valid.append(
+                f"!(npq_no_stride[{i + 1}] % problem_.stride[{i}])")
        code.raw(f"""
        return npq_no_stride[0] < problem_.N && 
            {' && '.join(hw_valid)} &&
            {' && '.join(stride_valid)};
        """)
-        return code 
+        return code

-    @pccm.member_function(header_only=True, attrs=["TV_HOST_DEVICE_INLINE"],
-                               const=True)
+    @pccm.member_function(header_only=True,
+                          attrs=["TV_HOST_DEVICE_INLINE"],
+                          const=True)
    def query_npq_no_stride(self):
        code = pccm.FunctionCode()
        code.arg("nhw_offset", "const int*")
@@ -180,18 +190,20 @@ class ConvOutLocIter(pccm.ParameterizedClass):
        code.raw(f"""
        npq_offset = nhw_to_npq<true>(nhw_offset);
        """)
-        hw_valid = [] # type: List[str]
+        hw_valid = []  # type: List[str]
        for i in range(self.ndim):
-            hw_valid.append((f"npq_offset[{i + 1}] >= 0 && "
-                            f"npq_offset[{i + 1}] < problem_.output_dims[{i}]"))
+            hw_valid.append(
+                (f"npq_offset[{i + 1}] >= 0 && "
+                 f"npq_offset[{i + 1}] < problem_.output_dims[{i}]"))
        code.raw(f"""
        return npq_offset[0] < problem_.N && 
            {' && '.join(hw_valid)};
        """)
-        return code 
+        return code

-    @pccm.member_function(header_only=True, attrs=["TV_HOST_DEVICE_INLINE"],
-                               const=True)
+    @pccm.member_function(header_only=True,
+                          attrs=["TV_HOST_DEVICE_INLINE"],
+                          const=True)
    def query_nhw(self):
        code = pccm.FunctionCode()
        code.arg("npq_offset", "const int*")
@@ -200,18 +212,20 @@ class ConvOutLocIter(pccm.ParameterizedClass):
        code.raw(f"""
        nhw_offset = npq_to_nhw(npq_offset);
        """)
-        hw_valid = [] # type: List[str]
+        hw_valid = []  # type: List[str]
        for i in range(self.ndim):
-            hw_valid.append((f"nhw_offset[{i + 1}] >= 0 && "
-                            f"nhw_offset[{i + 1}] < problem_.input_dims[{i}]"))
+            hw_valid.append(
+                (f"nhw_offset[{i + 1}] >= 0 && "
+                 f"nhw_offset[{i + 1}] < problem_.input_dims[{i}]"))
        code.raw(f"""
        return nhw_offset[0] < problem_.N && 
            {' && '.join(hw_valid)};
        """)
-        return code 
+        return code

-    @pccm.member_function(header_only=True, attrs=["TV_HOST_DEVICE_INLINE"],
-                               const=True)
+    @pccm.member_function(header_only=True,
+                          attrs=["TV_HOST_DEVICE_INLINE"],
+                          const=True)
    def query_nhw_out(self):
        code = pccm.FunctionCode()
        code.arg("npq_offset", "const int*")
@@ -220,41 +234,45 @@ class ConvOutLocIter(pccm.ParameterizedClass):
        code.raw(f"""
        nhw_offset = npq_to_nhw(npq_offset);
        """)
-        hw_valid = [] # type: List[str]
+        hw_valid = []  # type: List[str]
        for i in range(self.ndim):
-            hw_valid.append((f"nhw_offset[{i + 1}] >= 0 && "
-                            f"nhw_offset[{i + 1}] < problem_.output_dims[{i}]"))
+            hw_valid.append(
+                (f"nhw_offset[{i + 1}] >= 0 && "
+                 f"nhw_offset[{i + 1}] < problem_.output_dims[{i}]"))
        code.raw(f"""
        return nhw_offset[0] < problem_.N && 
            {' && '.join(hw_valid)};
        """)
-        return code 
+        return code
+

 class SparseConvIndicesKernel(pccm.ParameterizedClass):
    def __init__(self, problem: ConvProblem, dtype_indices: dtypes.DType):
        super().__init__()
-        self.add_dependency(TensorView, TensorViewKernel, TensorViewHashKernel, ThrustLib)
+        self.add_dependency(TensorView, TensorViewKernel, TensorViewHashKernel,
+                            ThrustLib)
        self.loc_iter = ConvOutLocIter(problem)
        self.add_param_class("spinds", self.loc_iter, "ConvLocIter")
-        self.add_param_class("spinds", problem, "ConvProblem")        
-        self.add_param_class("cudakers", CudaCommonKernel())        
+        self.add_param_class("spinds", problem, "ConvProblem")
+        self.add_param_class("cudakers", CudaCommonKernel())

-        self.ndim = problem.ndim 
+        self.ndim = problem.ndim
        self.dtype_indices = dtype_indices
        self.dtype_indices_uniq = dtype_indices

        assert dtype_indices == dtypes.int32 or dtype_indices == dtypes.int64

-
    @pccm.cuda.cuda_global_function
    def calc_conv_indices_stage1(self):
        code = pccm.FunctionCode()
-        code.arg("loc_iter", f"ConvLocIter") # [N, ndim + 1]
+        code.arg("loc_iter", f"ConvLocIter")  # [N, ndim + 1]

-        code.arg("indices_in", f"const int*") # [N, ndim + 1]
-        code.arg("indice_pairs", f"{self.dtype_indices}*") # [2, kernelProd, MaxSize]
-        code.arg("indice_pairs_for_uniq", f"{self.dtype_indices}*") # [2, kernelProd, MaxSize]
-        code.arg("indice_num_per_loc", f"int*") # [kernelProd]
+        code.arg("indices_in", f"const int*")  # [N, ndim + 1]
+        code.arg("indice_pairs",
+                 f"{self.dtype_indices}*")  # [2, kernelProd, MaxSize]
+        code.arg("indice_pairs_for_uniq",
+                 f"{self.dtype_indices}*")  # [2, kernelProd, MaxSize]
+        code.arg("indice_num_per_loc", f"int*")  # [kernelProd]

        code.arg("num_indices_in", "int")
        code.arg("indices_pair_size", "int")
@@ -288,17 +306,18 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
        """)
        return code

-
    @pccm.cuda.cuda_global_function
    def build_conv_hash_table(self):
        code = pccm.FunctionCode()
        code.targ("TTable")

-        code.arg("table", f"TTable") # [N, ndim + 1]
-        code.arg("indices_out", f"int*") # [N, ndim + 1]
-        code.arg("indice_pairs_for_uniq", f"const {self.dtype_indices}*") # [2, kernelProd, MaxSize]
+        code.arg("table", f"TTable")  # [N, ndim + 1]
+        code.arg("indices_out", f"int*")  # [N, ndim + 1]
+        code.arg("indice_pairs_for_uniq",
+                 f"const {self.dtype_indices}*")  # [2, kernelProd, MaxSize]

-        code.arg("layout_npq", f"spinds::LayoutNPQ") # [2, kernelProd, MaxSize]
+        code.arg("layout_npq",
+                 f"spinds::LayoutNPQ")  # [2, kernelProd, MaxSize]

        code.arg("num_indices", "int")

@@ -315,8 +334,8 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
    def calc_conv_indices_stage2(self):
        code = pccm.FunctionCode()
        code.targ("TTable")
-        code.arg("table", f"TTable") # [N, ndim + 1]
-        code.arg("indice_pairs_out_part", f"int*") # [2, kernelProd, MaxSize]
+        code.arg("table", f"TTable")  # [N, ndim + 1]
+        code.arg("indice_pairs_out_part", f"int*")  # [2, kernelProd, MaxSize]
        code.arg("num_indices_in", "int")
        code.arg("indices_pair_size", "int")
        # TODO use block instead of filter_offset?
@@ -338,12 +357,14 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
    @pccm.cuda.cuda_global_function
    def calc_conv_indices_stage1_mask(self):
        code = pccm.FunctionCode()
-        code.arg("loc_iter", f"ConvLocIter") # [N, ndim + 1]
+        code.arg("loc_iter", f"ConvLocIter")  # [N, ndim + 1]

-        code.arg("indices_in", f"const int*") # [N, ndim + 1]
-        code.arg("indice_pairs_bwd", f"{self.dtype_indices}*") # [2, kernelProd, MaxSize]
-        code.arg("indice_pairs_for_uniq", f"{self.dtype_indices}*") # [2, kernelProd, MaxSize]
-        code.arg("indice_num_per_loc", f"int*") # [kernelProd]
+        code.arg("indices_in", f"const int*")  # [N, ndim + 1]
+        code.arg("indice_pairs_bwd",
+                 f"{self.dtype_indices}*")  # [2, kernelProd, MaxSize]
+        code.arg("indice_pairs_for_uniq",
+                 f"{self.dtype_indices}*")  # [2, kernelProd, MaxSize]
+        code.arg("indice_num_per_loc", f"int*")  # [kernelProd]

        code.arg("num_indices_in", "int")

@@ -381,11 +402,13 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
    def calc_conv_indices_stage2_mask(self):
        code = pccm.FunctionCode()
        code.targ("TTable")
-        code.arg("table", f"TTable") # [N, ndim + 1]
-        code.arg("indice_pairs_fwd", f"int*") # [kernelProd, MaxSize], inp -> out
-        code.arg("indice_pairs_bwd", f"int*") # [kernelProd, MaxSize], out -> inp
-        code.arg("mask_fwd", f"uint32_t*") # [kernelProd]
-        code.arg("mask_bwd", f"uint32_t*") # [kernelProd]
+        code.arg("table", f"TTable")  # [N, ndim + 1]
+        code.arg("indice_pairs_fwd",
+                 f"int*")  # [kernelProd, MaxSize], inp -> out
+        code.arg("indice_pairs_bwd",
+                 f"int*")  # [kernelProd, MaxSize], out -> inp
+        code.arg("mask_fwd", f"uint32_t*")  # [kernelProd]
+        code.arg("mask_bwd", f"uint32_t*")  # [kernelProd]

        code.arg("num_indices_in", "int")
        code.arg("num_indices_out", "int")
@@ -418,8 +441,9 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
    @pccm.cuda.cuda_global_function
    def calc_conv_indices_stage2_mask_output(self):
        code = pccm.FunctionCode()
-        code.arg("indice_pairs_bwd", f"int*") # [kernelProd, MaxSize], out -> inp
-        code.arg("mask_bwd", f"uint32_t*") # [kernelProd]
+        code.arg("indice_pairs_bwd",
+                 f"int*")  # [kernelProd, MaxSize], out -> inp
+        code.arg("mask_bwd", f"uint32_t*")  # [kernelProd]

        code.arg("num_indices_in", "int")
        code.arg("kv", "int")
@@ -441,10 +465,12 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
    def calc_conv_indices_stage2_inference_mask(self):
        code = pccm.FunctionCode()
        code.targ("TTable")
-        code.arg("table", f"TTable") # [N, ndim + 1]
-        code.arg("indice_pairs_fwd", f"int*") # [kernelProd, MaxSize], inp -> out
-        code.arg("indice_pairs_bwd", f"int*") # [kernelProd, MaxSize], out -> inp
-        code.arg("mask_fwd", f"uint32_t*") # [kernelProd]
+        code.arg("table", f"TTable")  # [N, ndim + 1]
+        code.arg("indice_pairs_fwd",
+                 f"int*")  # [kernelProd, MaxSize], inp -> out
+        code.arg("indice_pairs_bwd",
+                 f"int*")  # [kernelProd, MaxSize], out -> inp
+        code.arg("mask_fwd", f"uint32_t*")  # [kernelProd]
        code.arg("num_indices_in", "int")
        code.arg("num_indices_out", "int")

@@ -469,16 +495,15 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
        """)
        return code

-
    @pccm.cuda.cuda_global_function
    def build_subm_conv_hash_table(self):
        code = pccm.FunctionCode()
        code.targ("TTable")

-        code.arg("table", f"TTable") # [N, ndim + 1]
-        code.arg("indices_in", f"const int*") # [N, ndim + 1]
+        code.arg("table", f"TTable")  # [N, ndim + 1]
+        code.arg("indices_in", f"const int*")  # [N, ndim + 1]

-        code.arg("layout_npq", f"spinds::LayoutNPQ") 
+        code.arg("layout_npq", f"spinds::LayoutNPQ")

        code.arg("num_indices", "int")

@@ -493,8 +518,8 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
    @pccm.cuda.cuda_global_function
    def clean_indices_uniq(self):
        code = pccm.FunctionCode()
-        code.arg("indice_pairs_for_uniq", f"{self.dtype_indices}*") 
-        code.arg("size", f"{self.dtype_indices}") 
+        code.arg("indice_pairs_for_uniq", f"{self.dtype_indices}*")
+        code.arg("size", f"{self.dtype_indices}")
        code.raw(f"""
        for ({self.dtype_indices} i : tv::KernelLoopX<{self.dtype_indices}>(size)) {{
            indice_pairs_for_uniq[i] = std::numeric_limits<{self.dtype_indices}>::max();
@@ -506,12 +531,13 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
    def calc_subm_conv_indices(self):
        code = pccm.FunctionCode()
        code.targ("TTable")
-        code.arg("loc_iter", f"ConvLocIter") # [N, ndim + 1]
-        code.arg("table", f"TTable") # [N, ndim + 1]
+        code.arg("loc_iter", f"ConvLocIter")  # [N, ndim + 1]
+        code.arg("table", f"TTable")  # [N, ndim + 1]

-        code.arg("indices_in", f"const int*") # [N, ndim + 1]
-        code.arg("indice_pairs", f"{self.dtype_indices}*") # [2, kernelProd, MaxSize]
-        code.arg("indice_num_per_loc", f"int*") # [kernelProd]
+        code.arg("indices_in", f"const int*")  # [N, ndim + 1]
+        code.arg("indice_pairs",
+                 f"{self.dtype_indices}*")  # [2, kernelProd, MaxSize]
+        code.arg("indice_num_per_loc", f"int*")  # [kernelProd]

        code.arg("num_indices_in", "int")
        code.arg("indices_pair_size", "int")
@@ -552,12 +578,13 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
    def calc_subm_conv_indices_mask(self):
        code = pccm.FunctionCode()
        code.targ("TTable")
-        code.arg("loc_iter", f"ConvLocIter") # [N, ndim + 1]
-        code.arg("table", f"TTable") # [N, ndim + 1]
+        code.arg("loc_iter", f"ConvLocIter")  # [N, ndim + 1]
+        code.arg("table", f"TTable")  # [N, ndim + 1]

-        code.arg("indices_in", f"const int*") # [N, ndim + 1]
-        code.arg("indice_pairs", f"{self.dtype_indices}*") # [2, kernelProd, MaxSize]
-        code.arg("mask", f"uint32_t*") # [kernelProd]
+        code.arg("indices_in", f"const int*")  # [N, ndim + 1]
+        code.arg("indice_pairs",
+                 f"{self.dtype_indices}*")  # [2, kernelProd, MaxSize]
+        code.arg("mask", f"uint32_t*")  # [kernelProd]

        code.arg("num_indices", "int")
        code.arg("indices_pair_size", "int")
@@ -609,13 +636,14 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
    def calc_subm_conv_indices_split_mask(self):
        code = pccm.FunctionCode()
        code.targ("TTable")
-        code.arg("loc_iter", f"ConvLocIter") # [N, ndim + 1]
-        code.arg("table", f"TTable") # [N, ndim + 1]
+        code.arg("loc_iter", f"ConvLocIter")  # [N, ndim + 1]
+        code.arg("table", f"TTable")  # [N, ndim + 1]

-        code.arg("indices_in", f"const int*") # [N, ndim + 1]
-        code.arg("indice_pairs", f"{self.dtype_indices}*") # [2, kernelProd, MaxSize]
-        code.arg("mask1", f"uint32_t*") # [kernelProd]
-        code.arg("mask2", f"uint32_t*") # [kernelProd]
+        code.arg("indices_in", f"const int*")  # [N, ndim + 1]
+        code.arg("indice_pairs",
+                 f"{self.dtype_indices}*")  # [2, kernelProd, MaxSize]
+        code.arg("mask1", f"uint32_t*")  # [kernelProd]
+        code.arg("mask2", f"uint32_t*")  # [kernelProd]

        code.arg("num_indices", "int")
        code.arg("indices_pair_size", "int")
@@ -665,10 +693,12 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
    def generate_conv_inds_stage1(self):
        code = pccm.FunctionCode()
        code.arg("indices", "tv::Tensor")
-        code.arg("indice_pairs, indice_pairs_uniq, indice_num_per_loc", "tv::Tensor")
+        code.arg("indice_pairs, indice_pairs_uniq, indice_num_per_loc",
+                 "tv::Tensor")
        code.arg("batch_size", "int")
        code.arg("output_dims, input_dims", f"tv::array<int, {self.ndim}>")
-        code.arg("ksize, stride, padding, dilation", f"tv::array<int, {self.ndim}>")
+        code.arg("ksize, stride, padding, dilation",
+                 f"tv::array<int, {self.ndim}>")
        code.arg("transposed", f"bool", "false")

        code.arg("stream_int", f"std::uintptr_t", "0")
@@ -706,9 +736,7 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
        // auto num_out_act = new_end - ptr_tr - 1;
        // return num_out_act;
        """)
-        return code# .ret("int")
-
-
+        return code  # .ret("int")

    @pccm.cuda.static_function
    def generate_conv_inds_stage1_5(self):
@@ -726,7 +754,6 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
        """)
        return code.ret("int")

-
    @pccm.cuda.static_function
    def generate_conv_inds_stage2(self):
        code = pccm.FunctionCode()
@@ -735,7 +762,8 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
        code.arg("num_out_act", "int")
        code.arg("batch_size", "int")
        code.arg("output_dims, input_dims", f"tv::array<int, {self.ndim}>")
-        code.arg("ksize, stride, padding, dilation", f"tv::array<int, {self.ndim}>")
+        code.arg("ksize, stride, padding, dilation",
+                 f"tv::array<int, {self.ndim}>")
        code.arg("transposed", f"bool", "false")
        code.arg("stream_int", f"std::uintptr_t", "0")
        code.raw(f"""
@@ -783,10 +811,12 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
    def generate_conv_inds_mask_stage1(self):
        code = pccm.FunctionCode()
        code.arg("indices", "tv::Tensor")
-        code.arg("indice_pairs_bwd, indice_pairs_uniq, indice_num_per_loc", "tv::Tensor")
+        code.arg("indice_pairs_bwd, indice_pairs_uniq, indice_num_per_loc",
+                 "tv::Tensor")
        code.arg("batch_size", "int")
        code.arg("output_dims, input_dims", f"tv::array<int, {self.ndim}>")
-        code.arg("ksize, stride, padding, dilation", f"tv::array<int, {self.ndim}>")
+        code.arg("ksize, stride, padding, dilation",
+                 f"tv::array<int, {self.ndim}>")
        code.arg("transposed", f"bool", "false")

        code.arg("stream_int", f"std::uintptr_t", "0")
@@ -817,21 +847,23 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
            indice_pairs_bwd.data_ptr<{self.dtype_indices}>(), 
            indice_pairs_uniq.data_ptr<{self.dtype_indices}>(), indice_num_per_loc.data_ptr<int>(), indices.dim(0),
            kv, transposed);
-        auto timer = tv::CudaContextTimer<>();
        """)
-        return code# .ret("int")
+        return code  # .ret("int")

    @pccm.cuda.static_function
    def generate_conv_inds_stage2_mask(self):
        code = pccm.FunctionCode()
        code.arg("indices, hashdata", "tv::Tensor")
-        code.arg("indice_pairs_fwd, indice_pairs_bwd, indice_pairs_uniq, out_inds", "tv::Tensor")
+        code.arg(
+            "indice_pairs_fwd, indice_pairs_bwd, indice_pairs_uniq, out_inds",
+            "tv::Tensor")
        code.arg("mask_fwd, mask_bwd", "tv::Tensor")

        code.arg("num_out_act", "int")
        code.arg("batch_size", "int")
        code.arg("output_dims, input_dims", f"tv::array<int, {self.ndim}>")
-        code.arg("ksize, stride, padding, dilation", f"tv::array<int, {self.ndim}>")
+        code.arg("ksize, stride, padding, dilation",
+                 f"tv::array<int, {self.ndim}>")
        code.arg("transposed", f"bool", "false")
        code.arg("stream_int", f"std::uintptr_t", "0")
        code.raw(f"""
@@ -903,7 +935,6 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
        """)
        return code.ret("int")

-
    @pccm.cuda.static_function
    def generate_subm_conv_inds(self):
        code = pccm.FunctionCode()
@@ -912,7 +943,8 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
        code.arg("batch_size", "int")
        code.arg("input_dims", f"tv::array<int, {self.ndim}>")
        code.arg("ksize, dilation", f"tv::array<int, {self.ndim}>")
-        code.arg("indice_pair_mask", "tv::Tensor", "tv::Tensor()", "cumm.tensorview.Tensor = Tensor()")
+        code.arg("indice_pair_mask", "tv::Tensor", "tv::Tensor()",
+                 "cumm.tensorview.Tensor = Tensor()")
        code.arg("backward", "bool", "false")
        code.arg("stream_int", f"std::uintptr_t", "0")

@@ -993,6 +1025,7 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):

        return code.ret("int")

+
 class SparseConvIndicesCPU(pccm.ParameterizedClass):
    def __init__(self, problem: ConvProblem, dtype_indices: dtypes.DType):
        super().__init__()
@@ -1000,9 +1033,9 @@ class SparseConvIndicesCPU(pccm.ParameterizedClass):
        self.add_include("unordered_map")
        self.loc_iter = ConvOutLocIter(problem)
        self.add_param_class("spinds", self.loc_iter, "ConvLocIter")
-        self.add_param_class("spinds", problem, "ConvProblem")        
+        self.add_param_class("spinds", problem, "ConvProblem")

-        self.ndim = problem.ndim 
+        self.ndim = problem.ndim
        self.dtype_indices = dtype_indices
        self.dtype_indices_uniq = dtype_indices

@@ -1016,7 +1049,7 @@ class SparseConvIndicesCPU(pccm.ParameterizedClass):
        code.arg("batch_size", "int")
        code.arg("input_dims", f"tv::array<int, {self.ndim}>")
        code.arg("ksize, dilation", f"tv::array<int, {self.ndim}>")
-    
+
        code.raw(f"""
        tv::array<int, {self.ndim}> stride, padding;
        for (int i = 0; i < {self.ndim}; ++i){{
@@ -1079,7 +1112,8 @@ class SparseConvIndicesCPU(pccm.ParameterizedClass):
        code.arg("indice_pairs, out_inds, indice_num_per_loc", "tv::Tensor")
        code.arg("batch_size", "int")
        code.arg("output_dims, input_dims", f"tv::array<int, {self.ndim}>")
-        code.arg("ksize, stride, padding, dilation", f"tv::array<int, {self.ndim}>")
+        code.arg("ksize, stride, padding, dilation",
+                 f"tv::array<int, {self.ndim}>")
        code.arg("transposed", f"bool", "false")
        code.raw(f"""
        int kv = tv::arrayops::prod(ksize);

--- a/spconv/csrc/sparse/maxpool.py
+++ b/spconv/csrc/sparse/maxpool.py
 # Copyright 2021 Yan Yan
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -16,15 +16,18 @@ import contextlib
 from cumm.conv.bases import ConvEnum
 from cumm.gemm.core.metaarray import MetaArray, seq
 from cumm import dtypes
-import pccm 
+import pccm
 from cumm.gemm.layout import TensorGeneric, to_stride
 from cumm.common import TensorView, TensorViewHashKernel, TensorViewKernel, ThrustLib, GemmBasic
 from cumm.gemm import codeops
-from typing import List 
+from typing import List
 from cumm.conv.params import ConvProblem
 from cumm.gemm.mask_iters import MaskTileIterator, MaskTileIteratorParams
-import numpy as np 
+import numpy as np
 from cumm.gemm import (thread_map)
+from spconv.csrc.sparse.cpu_core import OMPLib
+from cumm.constants import CUMM_CPU_ONLY_BUILD
+

 class IndiceMaxPool(pccm.Class):
    # TODO optimize this function
@@ -32,13 +35,13 @@ class IndiceMaxPool(pccm.Class):
        super().__init__()
        self.add_include("limits")
        self.add_dependency(TensorViewKernel, TensorView, GemmBasic)
-    
+
    @pccm.cuda.cuda_global_function
    def forward_kernel(self):
        code = pccm.FunctionCode()
        code.targ("T")

-        code.arg("out_features", f"T*") 
+        code.arg("out_features", f"T*")
        code.arg("in_features", f"const T*")
        code.arg("out_indices", "const int*")
        code.arg("in_indices", "const int*")
@@ -67,7 +70,7 @@ class IndiceMaxPool(pccm.Class):
        code = pccm.FunctionCode()
        code.targ("T")

-        code.arg("out_features", f"T*") 
+        code.arg("out_features", f"T*")
        code.arg("in_features", f"const T*")
        code.arg("indices", "const int*")
        code.arg("num_features", "int")
@@ -104,9 +107,9 @@ class IndiceMaxPool(pccm.Class):
    def backward_kernel(self):
        code = pccm.FunctionCode()
        code.targ("T")
-        code.arg("out_features", f"const T*") 
+        code.arg("out_features", f"const T*")
        code.arg("in_features", f"const T*")
-        code.arg("dout_features", f"const T*") 
+        code.arg("dout_features", f"const T*")
        code.arg("din_features", f"T*")
        code.arg("out_indices", "const int*")
        code.arg("in_indices", "const int*")
@@ -137,9 +140,9 @@ class IndiceMaxPool(pccm.Class):
        code = pccm.FunctionCode()
        code.targ("T")

-        code.arg("out_features", f"const T*") 
+        code.arg("out_features", f"const T*")
        code.arg("in_features", f"const T*")
-        code.arg("dout_features", f"const T*") 
+        code.arg("dout_features", f"const T*")
        code.arg("din_features", f"T*")
        code.arg("indices_bwd", "const int*")
        code.arg("num_features", "int")
@@ -351,6 +354,9 @@ class IndiceMaxPoolCPU(pccm.Class):
    def __init__(self):
        super().__init__()
        self.add_dependency(TensorView)
+        if CUMM_CPU_ONLY_BUILD:  # OMPLib is added as a dependency only when the CUMM_CPU_ONLY_BUILD flag is set
+            self.add_dependency(OMPLib)
+        self.add_include("tensorview/parallel/all.h")  # NOTE(review): presumably declares the tv::kernel_1d used by the generated loops - confirm

    @pccm.static_function
    def forward(self):
@@ -371,20 +377,21 @@ class IndiceMaxPoolCPU(pccm.Class):

            auto in_indices = in_inds.data_ptr<const int>();
            auto out_indices = out_inds.data_ptr<const int>();
-
-            for (int i = 0; i < nhot; ++i) {{
-                int in_idx = in_indices[i];
-                int out_idx = out_indices[i];
-                auto in_ptr = in_features + in_idx * num_features;
-                auto out_ptr = out_features + out_idx * num_features;
-                for (int j = 0; j < num_features; ++j) {{
-                    auto in = in_ptr[j];
-                    auto out = out_ptr[j];
-                    if (in > out){{
-                        out_ptr[j] = in;
+            tv::kernel_1d(out.device(), nhot, [&](int begin, int end, int step){{
+                for (int i = begin; i < end; i += step) {{
+                    int in_idx = in_indices[i];
+                    int out_idx = out_indices[i];
+                    auto in_ptr = in_features + in_idx * num_features;
+                    auto out_ptr = out_features + out_idx * num_features;
+                    for (int j = 0; j < num_features; ++j) {{
+                        auto in = in_ptr[j];
+                        auto out = out_ptr[j];
+                        if (in > out){{
+                            out_ptr[j] = in;
+                        }}
                    }}
                }}
-            }}
+            }});
        }});
        """)
        return code
@@ -412,22 +419,24 @@ class IndiceMaxPoolCPU(pccm.Class):

            auto in_indices = in_inds.data_ptr<const int>();
            auto out_indices = out_inds.data_ptr<const int>();
-
-            for (int i = 0; i < nhot; ++i) {{
-                int in_idx_offset = in_indices[i] * num_features;
-                int out_idx_offset = out_indices[i] * num_features;
-                auto in_ptr = in_features + in_idx_offset;
-                auto out_ptr = out_features + out_idx_offset;
-                auto din_ptr = din_features + in_idx_offset;
-                auto dout_ptr = dout_features + out_idx_offset;
-                for (int j = 0; j < num_features; ++j) {{
-                    auto in = in_ptr[j];
-                    auto out = out_ptr[j];
-                    if (in == out){{
-                        din_ptr[j] = din_ptr[j] + dout_ptr[j];
+            tv::kernel_1d(out.device(), nhot, [&](int begin, int end, int step){{
+                for (int i = begin; i < end; i += step) {{
+                    int in_idx_offset = in_indices[i] * num_features;
+                    int out_idx_offset = out_indices[i] * num_features;
+                    auto in_ptr = in_features + in_idx_offset;
+                    auto out_ptr = out_features + out_idx_offset;
+                    auto din_ptr = din_features + in_idx_offset;
+                    auto dout_ptr = dout_features + out_idx_offset;
+                    for (int j = 0; j < num_features; ++j) {{
+                        auto in = in_ptr[j];
+                        auto out = out_ptr[j];
+                        if (in == out){{
+                            din_ptr[j] = din_ptr[j] + dout_ptr[j];
+                        }}
                    }}
                }}
-            }}
+            }});
+
        }});
        """)
        return code