v2.1.5: add profile tool and python 3.6 for linux

82fd7a8b · yan.yan · f31eee3a · f31eee3a · f31eee3a · 82fd7a8b
Commit 82fd7a8b authored Nov 10, 2021 by yan.yan
20 changed files
--- a/spconv/core_cc/csrc/sparse/all/ops_cpu3d/__init__.pyi
+++ b/spconv/core_cc/csrc/sparse/all/ops_cpu3d/__init__.pyi
-from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
-from pccm.stubs import EnumValue, EnumClassValue
-from cumm.tensorview import Tensor
-class Point2VoxelCPU:
-    densehashdata: Tensor
-    voxels: Tensor
-    indices: Tensor
-    num_per_voxel: Tensor
-    @property
-    def grid_size(self) -> List[int]: ...
-    @staticmethod
-    def calc_meta_data(vsize_xyz: List[float], coors_range_xyz: List[float]) -> Tuple[List[float], List[int], List[int], List[float]]: 
-        """
-        Args:
-            vsize_xyz: 
-            coors_range_xyz: 
-        """
-        ...
-    def __init__(self, vsize_xyz: List[float], coors_range_xyz: List[float], num_point_features: int, max_num_voxels: int, max_num_points_per_voxel: int) -> None: 
-        """
-        Args:
-            vsize_xyz: 
-            coors_range_xyz: 
-            num_point_features: 
-            max_num_voxels: 
-            max_num_points_per_voxel: 
-        """
-        ...
-    @staticmethod
-    def point_to_voxel_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
-        """
-        Args:
-            points: 
-            voxels: 
-            indices: 
-            num_per_voxel: 
-            densehashdata: 
-            vsize: 
-            grid_size: 
-            grid_stride: 
-            coors_range: 
-            clear_voxels: 
-        """
-        ...
-    @staticmethod
-    def point_to_voxel_empty_mean_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
-        """
-        Args:
-            points: 
-            voxels: 
-            indices: 
-            num_per_voxel: 
-            densehashdata: 
-            vsize: 
-            grid_size: 
-            grid_stride: 
-            coors_range: 
-            clear_voxels: 
-        """
-        ...
-    def point_to_voxel(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
-        """
-        Args:
-            points: 
-            clear_voxels: 
-        """
-        ...
-    def point_to_voxel_empty_mean(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
-        """
-        Args:
-            points: 
-            clear_voxels: 
-        """
-        ...
--- a/spconv/core_cc/csrc/sparse/all/ops_cpu3d/p2v_c.pyi
+++ b/spconv/core_cc/csrc/sparse/all/ops_cpu3d/p2v_c.pyi
-from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
-from pccm.stubs import EnumValue, EnumClassValue
-class Point2VoxelCommon:
-    @staticmethod
-    def calc_meta_data(vsize_xyz: List[float], coors_range_xyz: List[float]) -> Tuple[List[float], List[int], List[int], List[float]]: 
-        """
-        Args:
-            vsize_xyz: 
-            coors_range_xyz: 
-        """
-        ...
--- a/spconv/core_cc/csrc/sparse/all/ops_cpu4d.pyi
+++ b/spconv/core_cc/csrc/sparse/all/ops_cpu4d.pyi
@@ -9,14 +9,11 @@ class Point2VoxelCPU:
    @property
    def grid_size(self) -> List[int]: ...
    @staticmethod
-    def calc_meta_data(vsize_xyz: List[float], coors_range_xyz: List[float], num_point_features: int, max_num_voxels: int, max_num_points_per_voxel: int) -> Tuple[List[float], List[int], List[int], List[float]]: 
+    def calc_meta_data(vsize_xyz: List[float], coors_range_xyz: List[float]) -> Tuple[List[float], List[int], List[int], List[float]]: 
        """
        Args:
            vsize_xyz: 
            coors_range_xyz: 
-            num_point_features: 
-            max_num_voxels: 
-            max_num_points_per_voxel: 
        """
        ...
    def __init__(self, vsize_xyz: List[float], coors_range_xyz: List[float], num_point_features: int, max_num_voxels: int, max_num_points_per_voxel: int) -> None: 
@@ -30,7 +27,7 @@ class Point2VoxelCPU:
        """
        ...
    @staticmethod
-    def point_to_voxel_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, mean_per_voxel: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
+    def point_to_voxel_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
        """
        Args:
            points: 
@@ -38,7 +35,6 @@ class Point2VoxelCPU:
            indices: 
            num_per_voxel: 
            densehashdata: 
-            mean_per_voxel: 
            vsize: 
            grid_size: 
            grid_stride: 
@@ -47,7 +43,7 @@ class Point2VoxelCPU:
        """
        ...
    @staticmethod
-    def point_to_voxel_empty_mean_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, mean_per_voxel: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
+    def point_to_voxel_empty_mean_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
        """
        Args:
            points: 
@@ -55,7 +51,6 @@ class Point2VoxelCPU:
            indices: 
            num_per_voxel: 
            densehashdata: 
-            mean_per_voxel: 
            vsize: 
            grid_size: 
            grid_stride: 

--- a/spconv/core_cc/csrc/sparse/all/ops_cpu4d/__init__.pyi
+++ b/spconv/core_cc/csrc/sparse/all/ops_cpu4d/__init__.pyi
-from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
-from pccm.stubs import EnumValue, EnumClassValue
-from cumm.tensorview import Tensor
-class Point2VoxelCPU:
-    densehashdata: Tensor
-    voxels: Tensor
-    indices: Tensor
-    num_per_voxel: Tensor
-    @property
-    def grid_size(self) -> List[int]: ...
-    @staticmethod
-    def calc_meta_data(vsize_xyz: List[float], coors_range_xyz: List[float]) -> Tuple[List[float], List[int], List[int], List[float]]: 
-        """
-        Args:
-            vsize_xyz: 
-            coors_range_xyz: 
-        """
-        ...
-    def __init__(self, vsize_xyz: List[float], coors_range_xyz: List[float], num_point_features: int, max_num_voxels: int, max_num_points_per_voxel: int) -> None: 
-        """
-        Args:
-            vsize_xyz: 
-            coors_range_xyz: 
-            num_point_features: 
-            max_num_voxels: 
-            max_num_points_per_voxel: 
-        """
-        ...
-    @staticmethod
-    def point_to_voxel_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
-        """
-        Args:
-            points: 
-            voxels: 
-            indices: 
-            num_per_voxel: 
-            densehashdata: 
-            vsize: 
-            grid_size: 
-            grid_stride: 
-            coors_range: 
-            clear_voxels: 
-        """
-        ...
-    @staticmethod
-    def point_to_voxel_empty_mean_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
-        """
-        Args:
-            points: 
-            voxels: 
-            indices: 
-            num_per_voxel: 
-            densehashdata: 
-            vsize: 
-            grid_size: 
-            grid_stride: 
-            coors_range: 
-            clear_voxels: 
-        """
-        ...
-    def point_to_voxel(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
-        """
-        Args:
-            points: 
-            clear_voxels: 
-        """
-        ...
-    def point_to_voxel_empty_mean(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
-        """
-        Args:
-            points: 
-            clear_voxels: 
-        """
-        ...
--- a/spconv/core_cc/csrc/sparse/all/ops_cpu4d/p2v_c.pyi
+++ b/spconv/core_cc/csrc/sparse/all/ops_cpu4d/p2v_c.pyi
-from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
-from pccm.stubs import EnumValue, EnumClassValue
-class Point2VoxelCommon:
-    @staticmethod
-    def calc_meta_data(vsize_xyz: List[float], coors_range_xyz: List[float]) -> Tuple[List[float], List[int], List[int], List[float]]: 
-        """
-        Args:
-            vsize_xyz: 
-            coors_range_xyz: 
-        """
-        ...
--- a/spconv/core_cc/cumm/__init__.pyi
+++ b/spconv/core_cc/cumm/__init__.pyi
-# Copyright 2021 Yan Yan
-# 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# 
-#     http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
--- a/spconv/core_cc/cumm/conv/main.pyi
+++ b/spconv/core_cc/cumm/conv/main.pyi
@@ -2,6 +2,7 @@ from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Ty
 from pccm.stubs import EnumValue, EnumClassValue
 from ...cumm.gemm.main import GemmAlgoDesp
 from cumm.tensorview import Tensor
+from cumm.tensorview import CUDAKernelTimer
 class ConvAlgoDesp(GemmAlgoDesp):
    ndim: int
    op_type: int
@@ -86,17 +87,19 @@ class ConvParams:
    mask_filter: int
    reverse_mask: bool
    verbose: bool
+    timer: CUDAKernelTimer
    workspace: Tensor =  Tensor()
    mask: Tensor =  Tensor()
    mask_argsort: Tensor =  Tensor()
    indices: Tensor =  Tensor()
    mask_output: Tensor =  Tensor()
    stream: int
-    def __init__(self, ndim: int, op_type: int) -> None: 
+    def __init__(self, ndim: int, op_type: int, timer: CUDAKernelTimer =  CUDAKernelTimer(False)) -> None: 
        """
        Args:
            ndim: 
            op_type: 
+            timer: 
        """
        ...
 class ConvMainUnitTest:

--- a/spconv/core_cc/cumm/gemm/__init__.pyi
+++ b/spconv/core_cc/cumm/gemm/__init__.pyi
-# Copyright 2021 Yan Yan
-# 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# 
-#     http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
--- a/spconv/core_cc/cumm/gemm/main.pyi
+++ b/spconv/core_cc/cumm/gemm/main.pyi
 from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
 from pccm.stubs import EnumValue, EnumClassValue
 from cumm.tensorview import Tensor
+from cumm.tensorview import CUDAKernelTimer
 class GemmAlgoDesp:
    dtype_a: int
    dtype_b: int
@@ -102,7 +103,13 @@ class GemmParams:
    alpha: float
    beta: float
    stream: int
-    def __init__(self) -> None: ...
+    timer: CUDAKernelTimer
+    def __init__(self, timer: CUDAKernelTimer =  CUDAKernelTimer(False)) -> None: 
+        """
+        Args:
+            timer: 
+        """
+        ...
    def check_valid(self) -> None: ...
    @property
    def a(self) -> Tensor: ...

--- a/spconv/core_cc/cumm/tools/__init__.pyi
+++ b/spconv/core_cc/cumm/tools/__init__.pyi
--- a/spconv/core_cc/cumm/tools/cuda.pyi
+++ b/spconv/core_cc/cumm/tools/cuda.pyi
+from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
+from pccm.stubs import EnumValue, EnumClassValue
+class CUDAEvent:
+    def __init__(self, name: str) -> None: 
+        """
+        Args:
+            name: 
+        """
+        ...
+    def record(self, stream: int = 0) -> None: 
+        """
+        Args:
+            stream: 
+        """
+        ...
+    def sync(self) -> None: ...
+    @staticmethod
+    def duration(start: "CUDAEvent", stop: "CUDAEvent") -> float: 
+        """
+        Args:
+            start: 
+            stop: 
+        """
+        ...
+class CUDAKernelTimer:
+    enable: bool
+    def __init__(self, enable: bool = True) -> None: 
+        """
+        Args:
+            enable: 
+        """
+        ...
+    def push(self, name: str) -> None: 
+        """
+        Args:
+            name: 
+        """
+        ...
+    def pop(self) -> None: ...
+    def record(self, name: str, stream: int = 0) -> None: 
+        """
+        Args:
+            name: 
+            stream: 
+        """
+        ...
+    def insert_pair(self, name: str, start: str, stop: str) -> None: 
+        """
+        Args:
+            name: 
+            start: 
+            stop: 
+        """
+        ...
+    def get_all_pair_duration(self) -> Dict[str, float]: ...
+    def sync(self) -> None: ...
--- a/spconv/cppconstants.py
+++ b/spconv/cppconstants.py
--- a/spconv/csrc/__init__.py
+++ b/spconv/csrc/__init__.py
@@ -11,4 +11,3 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
--- a/spconv/csrc/sparse/__init__.py
+++ b/spconv/csrc/sparse/__init__.py
@@ -11,4 +11,3 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
--- a/spconv/csrc/sparse/all.py
+++ b/spconv/csrc/sparse/all.py
@@ -24,6 +24,7 @@ from .indices import SparseConvIndicesKernel, CudaCommonKernel, SparseConvIndice
 from .maxpool import IndiceMaxPool, IndiceMaxPoolCPU
 from .gather import GatherCPU
 class CustomThrustLib(pccm.Class):
    def __init__(self):
        super().__init__()
@@ -32,12 +33,15 @@ class CustomThrustLib(pccm.Class):
        if compat.InLinux:
            self.build_meta.add_cflags("nvcc", "-Xcompiler", "-fno-gnu-unique")
 class ThrustCustomAllocatorV2(pccm.Class, pccm.pybind.PybindClassMixin):
    def __init__(self):
        super().__init__()
        self.add_dependency(TensorView)
        self.add_include("functional", "memory")
-        self.add_pybind_member("alloc_func", "std::function<std::uintptr_t(std::size_t)>", pyanno="Callable[[int], int]")
+        self.add_pybind_member("alloc_func",
+                               "std::function<std::uintptr_t(std::size_t)>",
+                               pyanno="Callable[[int], int]")
        self.add_typedef("value_type", "char")
    @pccm.member_function
@@ -63,6 +67,7 @@ class ThrustCustomAllocatorV2(pccm.Class, pccm.pybind.PybindClassMixin):
        code.arg("num_bytes", "size_t")
        return code
 class SpconvOps(pccm.Class):
    def __init__(self):
        super().__init__()
@@ -71,26 +76,36 @@ class SpconvOps(pccm.Class):
        for ndim in self.ndims:
            p2v = Point2Voxel(dtypes.float32, ndim)
            p2v_cpu = Point2VoxelCPU(dtypes.float32, ndim)
-            self.add_param_class(f"ops_cpu{ndim}d", p2v_cpu, f"Point2Voxel{ndim}DCPU")
+            self.add_param_class(f"ops_cpu{ndim}d", p2v_cpu,
+                                 f"Point2Voxel{ndim}DCPU")
            problem = ConvProblem(ndim, ConvOpType.kForward, NHWC, NHWC, NHWC)
            indices = SparseConvIndicesKernel(problem, dtypes.int32)
            indices_cpu = SparseConvIndicesCPU(problem, dtypes.int32)
-            self.add_param_class(f"ops_cpu{ndim}d", indices_cpu, f"SpconvIndicesCPU{ndim}D")
+            self.add_param_class(f"ops_cpu{ndim}d", indices_cpu,
+                                 f"SpconvIndicesCPU{ndim}D")
            # self.add_param_class("ops", indices, "SpconvIndices")
            if not CUMM_CPU_ONLY_BUILD:
                self.add_param_class(f"ops{ndim}d", p2v, f"Point2Voxel{ndim}D")
-                cuda_funcs = [self.generate_subm_conv_inds, 
+                cuda_funcs = [
-                    self.generate_conv_inds_stage1, self.generate_conv_inds_stage1_5, self.generate_conv_inds_stage2, self.sort_1d_by_key,
+                    self.generate_subm_conv_inds,
-                    self.generate_conv_inds_mask_stage1, self.generate_conv_inds_mask_stage2]
+                    self.generate_conv_inds_stage1,
-                self.add_impl_only_param_class(cuda_funcs, f"ops{ndim}d", indices, f"SpconvIndices{ndim}D")
+                    self.generate_conv_inds_stage1_5,
+                    self.generate_conv_inds_stage2, self.sort_1d_by_key,
+                    self.generate_conv_inds_mask_stage1,
+                    self.generate_conv_inds_mask_stage2
+                ]
+                self.add_impl_only_param_class(cuda_funcs, f"ops{ndim}d",
+                                               indices,
+                                               f"SpconvIndices{ndim}D")
    @pccm.pybind.mark
    @pccm.cuda.static_function
    def generate_conv_inds_stage1(self):
        code = pccm.FunctionCode()
        code.arg("indices", "tv::Tensor")
-        code.arg("indice_pairs, indice_pairs_uniq, indice_num_per_loc", "tv::Tensor")
+        code.arg("indice_pairs, indice_pairs_uniq, indice_num_per_loc",
+                 "tv::Tensor")
        code.arg("batch_size", "int")
        code.arg("output_dims, input_dims", f"std::vector<int>")
        code.arg("ksize, stride, padding, dilation", f"std::vector<int>")
@@ -127,7 +142,7 @@ class SpconvOps(pccm.Class):
            """)
        code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""")
-        return code# .ret("int")
+        return code  # .ret("int")
    @pccm.pybind.mark
    @pccm.cuda.static_function
@@ -201,7 +216,8 @@ class SpconvOps(pccm.Class):
            return code.make_invalid()
        code.arg("indices", "tv::Tensor")
-        code.arg("indice_pairs_bwd, indice_pairs_uniq, indice_num_per_loc", "tv::Tensor")
+        code.arg("indice_pairs_bwd, indice_pairs_uniq, indice_num_per_loc",
+                 "tv::Tensor")
        code.arg("batch_size", "int")
        code.arg("output_dims, input_dims", f"std::vector<int>")
        code.arg("ksize, stride, padding, dilation", f"std::vector<int>")
@@ -236,7 +252,7 @@ class SpconvOps(pccm.Class):
            """)
        code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""")
-        return code# .ret("int")
+        return code  # .ret("int")
    @pccm.pybind.mark
    @pccm.cuda.static_function
@@ -245,7 +261,9 @@ class SpconvOps(pccm.Class):
        if CUMM_CPU_ONLY_BUILD:
            return code.make_invalid()
        code.arg("indices, hashdata", "tv::Tensor")
-        code.arg("indice_pairs_fwd, indice_pairs_bwd, indice_pairs_uniq, out_inds", "tv::Tensor")
+        code.arg(
+            "indice_pairs_fwd, indice_pairs_bwd, indice_pairs_uniq, out_inds",
+            "tv::Tensor")
        code.arg("mask_fwd, mask_bwd", "tv::Tensor")
        code.arg("num_out_act", "int")
        code.arg("batch_size", "int")
@@ -294,7 +312,8 @@ class SpconvOps(pccm.Class):
        code.arg("batch_size", "int")
        code.arg("input_dims", f"std::vector<int>")
        code.arg("ksize, dilation", f"std::vector<int>")
-        code.arg("indice_pair_mask", "tv::Tensor", "tv::Tensor()", "cumm.tensorview.Tensor = Tensor()")
+        code.arg("indice_pair_mask", "tv::Tensor", "tv::Tensor()",
+                 "cumm.tensorview.Tensor = Tensor()")
        code.arg("backward", "bool", "false")
        code.arg("stream_int", f"std::uintptr_t", "0", pyanno="int = 0")
        code.raw(f"""
@@ -529,7 +548,10 @@ class SpconvOps(pccm.Class):
        if CUMM_CPU_ONLY_BUILD:
            return code.make_invalid()
        code.arg("data", "tv::Tensor")
-        code.arg("indices", "tv::Tensor", "tv::Tensor()", pyanno="cumm.tensorview.Tensor = Tensor()")
+        code.arg("indices",
+                 "tv::Tensor",
+                 "tv::Tensor()",
+                 pyanno="cumm.tensorview.Tensor = Tensor()")
        code.arg("stream", "std::uintptr_t", "0", pyanno="int")
        code.code_after_include = f"""
        template <typename T> struct SmallOrEqualTo {{
@@ -575,7 +597,10 @@ class SpconvOps(pccm.Class):
        code.arg("data", "tv::Tensor")
        code.arg("alloc_func", "std::function<std::uintptr_t(std::size_t)>")
-        code.arg("indices", "tv::Tensor", "tv::Tensor()", pyanno="cumm.tensorview.Tensor = Tensor()")
+        code.arg("indices",
+                 "tv::Tensor",
+                 "tv::Tensor()",
+                 pyanno="cumm.tensorview.Tensor = Tensor()")
        code.arg("stream", "std::uintptr_t", "0", pyanno="int")
        code.code_after_include = f"""
        template <typename T> struct SmallOrEqualTo {{
@@ -613,7 +638,6 @@ class SpconvOps(pccm.Class):
        """)
        return code.ret("tv::Tensor")
    @pccm.pybind.mark
    @pccm.cuda.static_function
    def sort_1d_by_key_split(self):
@@ -623,7 +647,10 @@ class SpconvOps(pccm.Class):
        code.arg("data", "tv::Tensor")
        code.arg("mask", "tv::Tensor")
-        code.arg("indices", "tv::Tensor", "tv::Tensor()", pyanno="cumm.tensorview.Tensor = Tensor()")
+        code.arg("indices",
+                 "tv::Tensor",
+                 "tv::Tensor()",
+                 pyanno="cumm.tensorview.Tensor = Tensor()")
        code.arg("stream", "std::uintptr_t", "0", pyanno="int")
        code.arg("mask_output", "bool", "false")
@@ -678,7 +705,10 @@ class SpconvOps(pccm.Class):
        code.arg("mask", "tv::Tensor")
-        code.arg("indices", "tv::Tensor", "tv::Tensor()", pyanno="cumm.tensorview.Tensor = Tensor()")
+        code.arg("indices",
+                 "tv::Tensor",
+                 "tv::Tensor()",
+                 pyanno="cumm.tensorview.Tensor = Tensor()")
        code.arg("stream", "std::uintptr_t", "0", pyanno="int")
        code.arg("mask_output", "bool", "false")
@@ -821,8 +851,9 @@ class SpconvOps(pccm.Class):
            }}
            """)
        code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""")
-        return code.ret("std::tuple<std::vector<float>, std::vector<int>, std::vector<int>, std::vector<float>>")
+        return code.ret(
+            "std::tuple<std::vector<float>, std::vector<int>, std::vector<int>, std::vector<float>>"
+        )
    @pccm.pybind.mark
    @pccm.static_function
@@ -876,7 +907,8 @@ class SpconvOps(pccm.Class):
    def point2voxel_cuda(self):
        code = pccm.FunctionCode()
        code.arg("points", "tv::Tensor")
-        code.arg("voxels, indices, num_per_voxel, hashdata, point_indice_data", "tv::Tensor")
+        code.arg("voxels, indices, num_per_voxel, hashdata, point_indice_data",
+                 "tv::Tensor")
        code.arg("vsize", f"std::vector<float>")
        code.arg("grid_size, grid_stride", f"std::vector<int>")
        code.arg("coors_range", f"std::vector<float>")

--- a/spconv/csrc/sparse/cpu_core.py
+++ b/spconv/csrc/sparse/cpu_core.py
+# Copyright 2021 Yan Yan
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import pccm
+from ccimport import compat
+from cumm.common import TensorView
+class OMPLib(pccm.Class):
+    def __init__(self):
+        super().__init__()
+        self.add_dependency(TensorView)
+        self.add_include("tensorview/parallel/all.h")
+        if compat.InWindows:
+            self.build_meta.add_cflags("cl", "/openmp")
+        else:
+            self.build_meta.add_cflags("g++", "-fopenmp")
+            self.build_meta.add_cflags("clang++", "-fopenmp")
--- a/spconv/csrc/sparse/devleop/sort_bench.py
+++ b/spconv/csrc/sparse/devleop/sort_bench.py
 import torch
 import time
 def main():
    arr = torch.randint(0, 130000, size=[130000]).to(torch.int32).cuda()

--- a/spconv/csrc/sparse/gather.py
+++ b/spconv/csrc/sparse/gather.py
@@ -14,12 +14,18 @@
 import pccm
 from cumm.common import TensorView
+from cumm.constants import CUMM_CPU_ONLY_BUILD
+from spconv.csrc.sparse.cpu_core import OMPLib
 from typing import List
 class GatherCPU(pccm.Class):
    def __init__(self):
        super().__init__()
+        if CUMM_CPU_ONLY_BUILD:
+            self.add_dependency(OMPLib)
        self.add_dependency(TensorView)
+        self.add_include("tensorview/parallel/all.h")
    @pccm.static_function
    def gather(self):
@@ -35,16 +41,17 @@ class GatherCPU(pccm.Class):
        int channel = in.dim(1);
        tv::dispatch<float, double>(out.dtype(), [&](auto I){{
            auto indices_data = inds.data_ptr<const int>();
            using T = TV_DECLTYPE(I);
            T *buffer_data = out.data_ptr<T>();
            const T *features_data = in.data_ptr<const T>();
-            for (int i = 0; i < nhot; ++i) {{
+            tv::kernel_1d(out.device(), nhot, [&](int begin, int end, int step){{
+                for (int i = begin; i < end; i += step) {{
                    std::memcpy(buffer_data + i * channel,
                                features_data + indices_data[i] * channel,
                                sizeof(T) * channel);
                }}
            }});
+        }});
        """)
        return code
@@ -65,7 +72,8 @@ class GatherCPU(pccm.Class):
            T *features_data = out.data_ptr<T>();
            const T *buf = in.data_ptr<const T>();
            T *out_ptr = out.data_ptr<T>();
-            for (int i = 0; i < nhot; ++i) {{
+            tv::kernel_1d(out.device(), nhot, [&](int begin, int end, int step){{
+                for (int i = begin; i < end; i += step) {{
                    buf = buffer_data + i * channel;
                    out_ptr = features_data + indices_data[i] * channel;
                    for (int j = 0; j < channel; ++j) {{
@@ -73,5 +81,6 @@ class GatherCPU(pccm.Class):
                    }}
                }}
            }});
+        }});
        """)
        return code
--- a/spconv/csrc/sparse/indices.py
+++ b/spconv/csrc/sparse/indices.py
@@ -24,6 +24,7 @@ from typing import List
 from cumm.conv.params import ConvProblem
 import numpy as np
 class CudaCommonKernel(pccm.ParameterizedClass):
    # we need to use PClass instead of Class
    # because cuda global function can't be put in class body.
@@ -82,12 +83,14 @@ class ConvOutLocIter(pccm.ParameterizedClass):
        pqs = codeops.unpack("problem.output_dims", range(self.ndim))
        rss = codeops.unpack("problem.ksize", range(self.ndim))
-        code.ctor_init("layout_npq", f"LayoutNPQ::from_shape({{problem.N, {pqs}}})")
+        code.ctor_init("layout_npq",
+                       f"LayoutNPQ::from_shape({{problem.N, {pqs}}})")
        code.ctor_init("layout_rs", f"LayoutRS::from_shape({{{rss}}})")
        return code
-    @pccm.member_function(header_only=True, attrs=["TV_HOST_DEVICE_INLINE"],
+    @pccm.member_function(header_only=True,
+                          attrs=["TV_HOST_DEVICE_INLINE"],
                          name="operator++")
    def increment(self):
        code = pccm.FunctionCode()
@@ -110,7 +113,8 @@ class ConvOutLocIter(pccm.ParameterizedClass):
        """)
        return code
-    @pccm.member_function(header_only=True, attrs=["TV_HOST_DEVICE_INLINE"],
+    @pccm.member_function(header_only=True,
+                          attrs=["TV_HOST_DEVICE_INLINE"],
                          const=True)
    def nhw_to_npq(self):
        code = pccm.FunctionCode()
@@ -128,7 +132,8 @@ class ConvOutLocIter(pccm.ParameterizedClass):
        """)
        return code.ret(f"tv::array<int, {self.ndim + 1}>")
-    @pccm.member_function(header_only=True, attrs=["TV_HOST_DEVICE_INLINE"],
+    @pccm.member_function(header_only=True,
+                          attrs=["TV_HOST_DEVICE_INLINE"],
                          const=True)
    def npq_to_nhw(self):
        code = pccm.FunctionCode()
@@ -144,8 +149,8 @@ class ConvOutLocIter(pccm.ParameterizedClass):
        """)
        return code.ret(f"tv::array<int, {self.ndim + 1}>")
+    @pccm.member_function(header_only=True,
-    @pccm.member_function(header_only=True, attrs=["TV_HOST_DEVICE_INLINE"],
+                          attrs=["TV_HOST_DEVICE_INLINE"],
                          const=True)
    def query_npq(self):
        code = pccm.FunctionCode()
@@ -159,10 +164,14 @@ class ConvOutLocIter(pccm.ParameterizedClass):
        hw_valid = []  # type: List[str]
        stride_valid = []  # type: List[str]
        for i in range(self.ndim):
-            code.raw(f"npq_offset[{i + 1}] = npq_no_stride[{i + 1}] / problem_.stride[{i}];")
+            code.raw(
-            hw_valid.append((f"npq_offset[{i + 1}] >= 0 && "
+                f"npq_offset[{i + 1}] = npq_no_stride[{i + 1}] / problem_.stride[{i}];"
+            )
+            hw_valid.append(
+                (f"npq_offset[{i + 1}] >= 0 && "
                 f"npq_offset[{i + 1}] < problem_.output_dims[{i}]"))
-            stride_valid.append(f"!(npq_no_stride[{i + 1}] % problem_.stride[{i}])")
+            stride_valid.append(
+                f"!(npq_no_stride[{i + 1}] % problem_.stride[{i}])")
        code.raw(f"""
        return npq_no_stride[0] < problem_.N && 
            {' && '.join(hw_valid)} &&
@@ -170,7 +179,8 @@ class ConvOutLocIter(pccm.ParameterizedClass):
        """)
        return code
-    @pccm.member_function(header_only=True, attrs=["TV_HOST_DEVICE_INLINE"],
+    @pccm.member_function(header_only=True,
+                          attrs=["TV_HOST_DEVICE_INLINE"],
                          const=True)
    def query_npq_no_stride(self):
        code = pccm.FunctionCode()
@@ -182,7 +192,8 @@ class ConvOutLocIter(pccm.ParameterizedClass):
        """)
        hw_valid = []  # type: List[str]
        for i in range(self.ndim):
-            hw_valid.append((f"npq_offset[{i + 1}] >= 0 && "
+            hw_valid.append(
+                (f"npq_offset[{i + 1}] >= 0 && "
                 f"npq_offset[{i + 1}] < problem_.output_dims[{i}]"))
        code.raw(f"""
        return npq_offset[0] < problem_.N && 
@@ -190,7 +201,8 @@ class ConvOutLocIter(pccm.ParameterizedClass):
        """)
        return code
-    @pccm.member_function(header_only=True, attrs=["TV_HOST_DEVICE_INLINE"],
+    @pccm.member_function(header_only=True,
+                          attrs=["TV_HOST_DEVICE_INLINE"],
                          const=True)
    def query_nhw(self):
        code = pccm.FunctionCode()
@@ -202,7 +214,8 @@ class ConvOutLocIter(pccm.ParameterizedClass):
        """)
        hw_valid = []  # type: List[str]
        for i in range(self.ndim):
-            hw_valid.append((f"nhw_offset[{i + 1}] >= 0 && "
+            hw_valid.append(
+                (f"nhw_offset[{i + 1}] >= 0 && "
                 f"nhw_offset[{i + 1}] < problem_.input_dims[{i}]"))
        code.raw(f"""
        return nhw_offset[0] < problem_.N && 
@@ -210,7 +223,8 @@ class ConvOutLocIter(pccm.ParameterizedClass):
        """)
        return code
-    @pccm.member_function(header_only=True, attrs=["TV_HOST_DEVICE_INLINE"],
+    @pccm.member_function(header_only=True,
+                          attrs=["TV_HOST_DEVICE_INLINE"],
                          const=True)
    def query_nhw_out(self):
        code = pccm.FunctionCode()
@@ -222,7 +236,8 @@ class ConvOutLocIter(pccm.ParameterizedClass):
        """)
        hw_valid = []  # type: List[str]
        for i in range(self.ndim):
-            hw_valid.append((f"nhw_offset[{i + 1}] >= 0 && "
+            hw_valid.append(
+                (f"nhw_offset[{i + 1}] >= 0 && "
                 f"nhw_offset[{i + 1}] < problem_.output_dims[{i}]"))
        code.raw(f"""
        return nhw_offset[0] < problem_.N && 
@@ -230,10 +245,12 @@ class ConvOutLocIter(pccm.ParameterizedClass):
        """)
        return code
 class SparseConvIndicesKernel(pccm.ParameterizedClass):
    def __init__(self, problem: ConvProblem, dtype_indices: dtypes.DType):
        super().__init__()
-        self.add_dependency(TensorView, TensorViewKernel, TensorViewHashKernel, ThrustLib)
+        self.add_dependency(TensorView, TensorViewKernel, TensorViewHashKernel,
+                            ThrustLib)
        self.loc_iter = ConvOutLocIter(problem)
        self.add_param_class("spinds", self.loc_iter, "ConvLocIter")
        self.add_param_class("spinds", problem, "ConvProblem")
@@ -245,15 +262,16 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
        assert dtype_indices == dtypes.int32 or dtype_indices == dtypes.int64
    @pccm.cuda.cuda_global_function
    def calc_conv_indices_stage1(self):
        code = pccm.FunctionCode()
        code.arg("loc_iter", f"ConvLocIter")  # [N, ndim + 1]
        code.arg("indices_in", f"const int*")  # [N, ndim + 1]
-        code.arg("indice_pairs", f"{self.dtype_indices}*") # [2, kernelProd, MaxSize]
+        code.arg("indice_pairs",
-        code.arg("indice_pairs_for_uniq", f"{self.dtype_indices}*") # [2, kernelProd, MaxSize]
+                 f"{self.dtype_indices}*")  # [2, kernelProd, MaxSize]
+        code.arg("indice_pairs_for_uniq",
+                 f"{self.dtype_indices}*")  # [2, kernelProd, MaxSize]
        code.arg("indice_num_per_loc", f"int*")  # [kernelProd]
        code.arg("num_indices_in", "int")
@@ -288,7 +306,6 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
        """)
        return code
    @pccm.cuda.cuda_global_function
    def build_conv_hash_table(self):
        code = pccm.FunctionCode()
@@ -296,9 +313,11 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
        code.arg("table", f"TTable")  # [N, ndim + 1]
        code.arg("indices_out", f"int*")  # [N, ndim + 1]
-        code.arg("indice_pairs_for_uniq", f"const {self.dtype_indices}*") # [2, kernelProd, MaxSize]
+        code.arg("indice_pairs_for_uniq",
+                 f"const {self.dtype_indices}*")  # [2, kernelProd, MaxSize]
-        code.arg("layout_npq", f"spinds::LayoutNPQ") # [2, kernelProd, MaxSize]
+        code.arg("layout_npq",
+                 f"spinds::LayoutNPQ")  # [2, kernelProd, MaxSize]
        code.arg("num_indices", "int")
@@ -341,8 +360,10 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
        code.arg("loc_iter", f"ConvLocIter")  # [N, ndim + 1]
        code.arg("indices_in", f"const int*")  # [N, ndim + 1]
-        code.arg("indice_pairs_bwd", f"{self.dtype_indices}*") # [2, kernelProd, MaxSize]
+        code.arg("indice_pairs_bwd",
-        code.arg("indice_pairs_for_uniq", f"{self.dtype_indices}*") # [2, kernelProd, MaxSize]
+                 f"{self.dtype_indices}*")  # [2, kernelProd, MaxSize]
+        code.arg("indice_pairs_for_uniq",
+                 f"{self.dtype_indices}*")  # [2, kernelProd, MaxSize]
        code.arg("indice_num_per_loc", f"int*")  # [kernelProd]
        code.arg("num_indices_in", "int")
@@ -382,8 +403,10 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
        code = pccm.FunctionCode()
        code.targ("TTable")
        code.arg("table", f"TTable")  # [N, ndim + 1]
-        code.arg("indice_pairs_fwd", f"int*") # [kernelProd, MaxSize], inp -> out
+        code.arg("indice_pairs_fwd",
-        code.arg("indice_pairs_bwd", f"int*") # [kernelProd, MaxSize], out -> inp
+                 f"int*")  # [kernelProd, MaxSize], inp -> out
+        code.arg("indice_pairs_bwd",
+                 f"int*")  # [kernelProd, MaxSize], out -> inp
        code.arg("mask_fwd", f"uint32_t*")  # [kernelProd]
        code.arg("mask_bwd", f"uint32_t*")  # [kernelProd]
@@ -418,7 +441,8 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
    @pccm.cuda.cuda_global_function
    def calc_conv_indices_stage2_mask_output(self):
        code = pccm.FunctionCode()
-        code.arg("indice_pairs_bwd", f"int*") # [kernelProd, MaxSize], out -> inp
+        code.arg("indice_pairs_bwd",
+                 f"int*")  # [kernelProd, MaxSize], out -> inp
        code.arg("mask_bwd", f"uint32_t*")  # [kernelProd]
        code.arg("num_indices_in", "int")
@@ -442,8 +466,10 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
        code = pccm.FunctionCode()
        code.targ("TTable")
        code.arg("table", f"TTable")  # [N, ndim + 1]
-        code.arg("indice_pairs_fwd", f"int*") # [kernelProd, MaxSize], inp -> out
+        code.arg("indice_pairs_fwd",
-        code.arg("indice_pairs_bwd", f"int*") # [kernelProd, MaxSize], out -> inp
+                 f"int*")  # [kernelProd, MaxSize], inp -> out
+        code.arg("indice_pairs_bwd",
+                 f"int*")  # [kernelProd, MaxSize], out -> inp
        code.arg("mask_fwd", f"uint32_t*")  # [kernelProd]
        code.arg("num_indices_in", "int")
        code.arg("num_indices_out", "int")
@@ -469,7 +495,6 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
        """)
        return code
    @pccm.cuda.cuda_global_function
    def build_subm_conv_hash_table(self):
        code = pccm.FunctionCode()
@@ -510,7 +535,8 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
        code.arg("table", f"TTable")  # [N, ndim + 1]
        code.arg("indices_in", f"const int*")  # [N, ndim + 1]
-        code.arg("indice_pairs", f"{self.dtype_indices}*") # [2, kernelProd, MaxSize]
+        code.arg("indice_pairs",
+                 f"{self.dtype_indices}*")  # [2, kernelProd, MaxSize]
        code.arg("indice_num_per_loc", f"int*")  # [kernelProd]
        code.arg("num_indices_in", "int")
@@ -556,7 +582,8 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
        code.arg("table", f"TTable")  # [N, ndim + 1]
        code.arg("indices_in", f"const int*")  # [N, ndim + 1]
-        code.arg("indice_pairs", f"{self.dtype_indices}*") # [2, kernelProd, MaxSize]
+        code.arg("indice_pairs",
+                 f"{self.dtype_indices}*")  # [2, kernelProd, MaxSize]
        code.arg("mask", f"uint32_t*")  # [kernelProd]
        code.arg("num_indices", "int")
@@ -613,7 +640,8 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
        code.arg("table", f"TTable")  # [N, ndim + 1]
        code.arg("indices_in", f"const int*")  # [N, ndim + 1]
-        code.arg("indice_pairs", f"{self.dtype_indices}*") # [2, kernelProd, MaxSize]
+        code.arg("indice_pairs",
+                 f"{self.dtype_indices}*")  # [2, kernelProd, MaxSize]
        code.arg("mask1", f"uint32_t*")  # [kernelProd]
        code.arg("mask2", f"uint32_t*")  # [kernelProd]
@@ -665,10 +693,12 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
    def generate_conv_inds_stage1(self):
        code = pccm.FunctionCode()
        code.arg("indices", "tv::Tensor")
-        code.arg("indice_pairs, indice_pairs_uniq, indice_num_per_loc", "tv::Tensor")
+        code.arg("indice_pairs, indice_pairs_uniq, indice_num_per_loc",
+                 "tv::Tensor")
        code.arg("batch_size", "int")
        code.arg("output_dims, input_dims", f"tv::array<int, {self.ndim}>")
-        code.arg("ksize, stride, padding, dilation", f"tv::array<int, {self.ndim}>")
+        code.arg("ksize, stride, padding, dilation",
+                 f"tv::array<int, {self.ndim}>")
        code.arg("transposed", f"bool", "false")
        code.arg("stream_int", f"std::uintptr_t", "0")
@@ -706,9 +736,7 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
        // auto num_out_act = new_end - ptr_tr - 1;
        // return num_out_act;
        """)
-        return code# .ret("int")
+        return code  # .ret("int")
    @pccm.cuda.static_function
    def generate_conv_inds_stage1_5(self):
@@ -726,7 +754,6 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
        """)
        return code.ret("int")
    @pccm.cuda.static_function
    def generate_conv_inds_stage2(self):
        code = pccm.FunctionCode()
@@ -735,7 +762,8 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
        code.arg("num_out_act", "int")
        code.arg("batch_size", "int")
        code.arg("output_dims, input_dims", f"tv::array<int, {self.ndim}>")
-        code.arg("ksize, stride, padding, dilation", f"tv::array<int, {self.ndim}>")
+        code.arg("ksize, stride, padding, dilation",
+                 f"tv::array<int, {self.ndim}>")
        code.arg("transposed", f"bool", "false")
        code.arg("stream_int", f"std::uintptr_t", "0")
        code.raw(f"""
@@ -783,10 +811,12 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
    def generate_conv_inds_mask_stage1(self):
        code = pccm.FunctionCode()
        code.arg("indices", "tv::Tensor")
-        code.arg("indice_pairs_bwd, indice_pairs_uniq, indice_num_per_loc", "tv::Tensor")
+        code.arg("indice_pairs_bwd, indice_pairs_uniq, indice_num_per_loc",
+                 "tv::Tensor")
        code.arg("batch_size", "int")
        code.arg("output_dims, input_dims", f"tv::array<int, {self.ndim}>")
-        code.arg("ksize, stride, padding, dilation", f"tv::array<int, {self.ndim}>")
+        code.arg("ksize, stride, padding, dilation",
+                 f"tv::array<int, {self.ndim}>")
        code.arg("transposed", f"bool", "false")
        code.arg("stream_int", f"std::uintptr_t", "0")
@@ -817,21 +847,23 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
            indice_pairs_bwd.data_ptr<{self.dtype_indices}>(), 
            indice_pairs_uniq.data_ptr<{self.dtype_indices}>(), indice_num_per_loc.data_ptr<int>(), indices.dim(0),
            kv, transposed);
-        auto timer = tv::CudaContextTimer<>();
        """)
-        return code# .ret("int")
+        return code  # .ret("int")
    @pccm.cuda.static_function
    def generate_conv_inds_stage2_mask(self):
        code = pccm.FunctionCode()
        code.arg("indices, hashdata", "tv::Tensor")
-        code.arg("indice_pairs_fwd, indice_pairs_bwd, indice_pairs_uniq, out_inds", "tv::Tensor")
+        code.arg(
+            "indice_pairs_fwd, indice_pairs_bwd, indice_pairs_uniq, out_inds",
+            "tv::Tensor")
        code.arg("mask_fwd, mask_bwd", "tv::Tensor")
        code.arg("num_out_act", "int")
        code.arg("batch_size", "int")
        code.arg("output_dims, input_dims", f"tv::array<int, {self.ndim}>")
-        code.arg("ksize, stride, padding, dilation", f"tv::array<int, {self.ndim}>")
+        code.arg("ksize, stride, padding, dilation",
+                 f"tv::array<int, {self.ndim}>")
        code.arg("transposed", f"bool", "false")
        code.arg("stream_int", f"std::uintptr_t", "0")
        code.raw(f"""
@@ -903,7 +935,6 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
        """)
        return code.ret("int")
    @pccm.cuda.static_function
    def generate_subm_conv_inds(self):
        code = pccm.FunctionCode()
@@ -912,7 +943,8 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
        code.arg("batch_size", "int")
        code.arg("input_dims", f"tv::array<int, {self.ndim}>")
        code.arg("ksize, dilation", f"tv::array<int, {self.ndim}>")
-        code.arg("indice_pair_mask", "tv::Tensor", "tv::Tensor()", "cumm.tensorview.Tensor = Tensor()")
+        code.arg("indice_pair_mask", "tv::Tensor", "tv::Tensor()",
+                 "cumm.tensorview.Tensor = Tensor()")
        code.arg("backward", "bool", "false")
        code.arg("stream_int", f"std::uintptr_t", "0")
@@ -993,6 +1025,7 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
        return code.ret("int")
 class SparseConvIndicesCPU(pccm.ParameterizedClass):
    def __init__(self, problem: ConvProblem, dtype_indices: dtypes.DType):
        super().__init__()
@@ -1079,7 +1112,8 @@ class SparseConvIndicesCPU(pccm.ParameterizedClass):
        code.arg("indice_pairs, out_inds, indice_num_per_loc", "tv::Tensor")
        code.arg("batch_size", "int")
        code.arg("output_dims, input_dims", f"tv::array<int, {self.ndim}>")
-        code.arg("ksize, stride, padding, dilation", f"tv::array<int, {self.ndim}>")
+        code.arg("ksize, stride, padding, dilation",
+                 f"tv::array<int, {self.ndim}>")
        code.arg("transposed", f"bool", "false")
        code.raw(f"""
        int kv = tv::arrayops::prod(ksize);

--- a/spconv/csrc/sparse/maxpool.py
+++ b/spconv/csrc/sparse/maxpool.py
@@ -25,6 +25,9 @@ from cumm.conv.params import ConvProblem
 from cumm.gemm.mask_iters import MaskTileIterator, MaskTileIteratorParams
 import numpy as np
 from cumm.gemm import (thread_map)
+from spconv.csrc.sparse.cpu_core import OMPLib
+from cumm.constants import CUMM_CPU_ONLY_BUILD
 class IndiceMaxPool(pccm.Class):
    # TODO optimize this function
@@ -351,6 +354,9 @@ class IndiceMaxPoolCPU(pccm.Class):
    def __init__(self):
        super().__init__()
        self.add_dependency(TensorView)
+        if CUMM_CPU_ONLY_BUILD:
+            self.add_dependency(OMPLib)
+        self.add_include("tensorview/parallel/all.h")
    @pccm.static_function
    def forward(self):
@@ -371,8 +377,8 @@ class IndiceMaxPoolCPU(pccm.Class):
            auto in_indices = in_inds.data_ptr<const int>();
            auto out_indices = out_inds.data_ptr<const int>();
+            tv::kernel_1d(out.device(), nhot, [&](int begin, int end, int step){{
-            for (int i = 0; i < nhot; ++i) {{
+                for (int i = begin; i < end; i += step) {{
                    int in_idx = in_indices[i];
                    int out_idx = out_indices[i];
                    auto in_ptr = in_features + in_idx * num_features;
@@ -386,6 +392,7 @@ class IndiceMaxPoolCPU(pccm.Class):
                    }}
                }}
            }});
+        }});
        """)
        return code
@@ -412,8 +419,8 @@ class IndiceMaxPoolCPU(pccm.Class):
            auto in_indices = in_inds.data_ptr<const int>();
            auto out_indices = out_inds.data_ptr<const int>();
+            tv::kernel_1d(out.device(), nhot, [&](int begin, int end, int step){{
-            for (int i = 0; i < nhot; ++i) {{
+                for (int i = begin; i < end; i += step) {{
                    int in_idx_offset = in_indices[i] * num_features;
                    int out_idx_offset = out_indices[i] * num_features;
                    auto in_ptr = in_features + in_idx_offset;
@@ -429,5 +436,7 @@ class IndiceMaxPoolCPU(pccm.Class):
                    }}
                }}
            }});
+        }});
        """)
        return code