Commit 82fd7a8b authored by yan.yan
Browse files

v2.1.5: add profile tool and python 3.6 for linux

parent f31eee3a
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview import Tensor
class Point2VoxelCPU:
    """Stub declaring the interface of the CPU point-to-voxel converter.

    Every method body is ``...``: this block only carries signatures and
    type information, not an implementation.

    NOTE(review): the parameter descriptions in this class are inferred from
    names and annotations alone - confirm them against the implementation
    before relying on them.
    """

    # Tensor-typed attributes declared on the class.
    densehashdata: Tensor
    voxels: Tensor
    indices: Tensor
    num_per_voxel: Tensor

    @property
    def grid_size(self) -> List[int]: ...

    @staticmethod
    def calc_meta_data(
        vsize_xyz: List[float],
        coors_range_xyz: List[float],
    ) -> Tuple[List[float], List[int], List[int], List[float]]:
        """Declare the static meta-data helper.

        Args:
            vsize_xyz: list of floats; presumably the voxel size per axis
                (TODO confirm ordering).
            coors_range_xyz: list of floats; presumably the coordinate range
                bounds (TODO confirm layout).

        Returns:
            A 4-tuple of ``List[float]``, ``List[int]``, ``List[int]`` and
            ``List[float]``. NOTE(review): these look like they line up with
            the ``vsize``, ``grid_size``, ``grid_stride`` and ``coors_range``
            parameters of the static conversion methods - confirm.
        """
        ...

    def __init__(
        self,
        vsize_xyz: List[float],
        coors_range_xyz: List[float],
        num_point_features: int,
        max_num_voxels: int,
        max_num_points_per_voxel: int,
    ) -> None:
        """Declare the constructor.

        Args:
            vsize_xyz: list of floats; presumably the voxel size per axis
                (TODO confirm ordering).
            coors_range_xyz: list of floats; presumably the coordinate range
                bounds (TODO confirm layout).
            num_point_features: integer; presumably the number of features
                stored per point (TODO confirm).
            max_num_voxels: integer; presumably an upper bound on the number
                of voxels (TODO confirm).
            max_num_points_per_voxel: integer; presumably an upper bound on
                the points kept per voxel (TODO confirm).
        """
        ...

    @staticmethod
    def point_to_voxel_static(
        points: Tensor,
        voxels: Tensor,
        indices: Tensor,
        num_per_voxel: Tensor,
        densehashdata: Tensor,
        vsize: List[float],
        grid_size: List[int],
        grid_stride: List[int],
        coors_range: List[float],
        clear_voxels: bool = True,
    ) -> Tuple[Tensor, Tensor, Tensor]:
        """Declare the static conversion entry point taking explicit buffers.

        Args:
            points: input tensor.
            voxels: tensor; shares its name with the ``voxels`` attribute.
            indices: tensor; shares its name with the ``indices`` attribute.
            num_per_voxel: tensor; shares its name with the
                ``num_per_voxel`` attribute.
            densehashdata: tensor; shares its name with the
                ``densehashdata`` attribute.
            vsize: list of floats.
            grid_size: list of ints.
            grid_stride: list of ints.
            coors_range: list of floats.
            clear_voxels: boolean flag, defaults to ``True``.

        Returns:
            A 3-tuple of tensors. NOTE(review): presumably the voxels,
            indices and per-voxel counts - confirm.
        """
        ...

    @staticmethod
    def point_to_voxel_empty_mean_static(
        points: Tensor,
        voxels: Tensor,
        indices: Tensor,
        num_per_voxel: Tensor,
        densehashdata: Tensor,
        vsize: List[float],
        grid_size: List[int],
        grid_stride: List[int],
        coors_range: List[float],
        clear_voxels: bool = True,
    ) -> Tuple[Tensor, Tensor, Tensor]:
        """Declare the static "empty mean" conversion entry point.

        The signature is identical to ``point_to_voxel_static``.
        NOTE(review): how the "empty mean" variant differs is not visible
        from this stub - confirm against the implementation.

        Args:
            points: input tensor.
            voxels: tensor; shares its name with the ``voxels`` attribute.
            indices: tensor; shares its name with the ``indices`` attribute.
            num_per_voxel: tensor; shares its name with the
                ``num_per_voxel`` attribute.
            densehashdata: tensor; shares its name with the
                ``densehashdata`` attribute.
            vsize: list of floats.
            grid_size: list of ints.
            grid_stride: list of ints.
            coors_range: list of floats.
            clear_voxels: boolean flag, defaults to ``True``.

        Returns:
            A 3-tuple of tensors.
        """
        ...

    def point_to_voxel(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
        """Declare the instance-level conversion method.

        Args:
            points: input tensor.
            clear_voxels: boolean flag, defaults to ``True``.

        Returns:
            A 3-tuple of tensors.
        """
        ...

    def point_to_voxel_empty_mean(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
        """Declare the instance-level "empty mean" conversion method.

        Args:
            points: input tensor.
            clear_voxels: boolean flag, defaults to ``True``.

        Returns:
            A 3-tuple of tensors.
        """
        ...
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
class Point2VoxelCommon:
    """Stub declaring the helper shared by the point-to-voxel classes.

    Only the signature is declared here; the body is ``...``.
    """

    @staticmethod
    def calc_meta_data(
        vsize_xyz: List[float],
        coors_range_xyz: List[float],
    ) -> Tuple[List[float], List[int], List[int], List[float]]:
        """Declare the static meta-data helper.

        Args:
            vsize_xyz: list of floats; presumably the voxel size per axis
                (TODO confirm ordering).
            coors_range_xyz: list of floats; presumably the coordinate range
                bounds (TODO confirm layout).

        Returns:
            A 4-tuple of ``List[float]``, ``List[int]``, ``List[int]`` and
            ``List[float]``. NOTE(review): the meaning of each element is
            not visible from this stub - confirm against the implementation.
        """
        ...
...@@ -9,14 +9,11 @@ class Point2VoxelCPU: ...@@ -9,14 +9,11 @@ class Point2VoxelCPU:
@property @property
def grid_size(self) -> List[int]: ... def grid_size(self) -> List[int]: ...
@staticmethod @staticmethod
def calc_meta_data(vsize_xyz: List[float], coors_range_xyz: List[float], num_point_features: int, max_num_voxels: int, max_num_points_per_voxel: int) -> Tuple[List[float], List[int], List[int], List[float]]: def calc_meta_data(vsize_xyz: List[float], coors_range_xyz: List[float]) -> Tuple[List[float], List[int], List[int], List[float]]:
""" """
Args: Args:
vsize_xyz: vsize_xyz:
coors_range_xyz: coors_range_xyz:
num_point_features:
max_num_voxels:
max_num_points_per_voxel:
""" """
... ...
def __init__(self, vsize_xyz: List[float], coors_range_xyz: List[float], num_point_features: int, max_num_voxels: int, max_num_points_per_voxel: int) -> None: def __init__(self, vsize_xyz: List[float], coors_range_xyz: List[float], num_point_features: int, max_num_voxels: int, max_num_points_per_voxel: int) -> None:
...@@ -30,7 +27,7 @@ class Point2VoxelCPU: ...@@ -30,7 +27,7 @@ class Point2VoxelCPU:
""" """
... ...
@staticmethod @staticmethod
def point_to_voxel_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, mean_per_voxel: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: def point_to_voxel_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
""" """
Args: Args:
points: points:
...@@ -38,7 +35,6 @@ class Point2VoxelCPU: ...@@ -38,7 +35,6 @@ class Point2VoxelCPU:
indices: indices:
num_per_voxel: num_per_voxel:
densehashdata: densehashdata:
mean_per_voxel:
vsize: vsize:
grid_size: grid_size:
grid_stride: grid_stride:
...@@ -47,7 +43,7 @@ class Point2VoxelCPU: ...@@ -47,7 +43,7 @@ class Point2VoxelCPU:
""" """
... ...
@staticmethod @staticmethod
def point_to_voxel_empty_mean_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, mean_per_voxel: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: def point_to_voxel_empty_mean_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
""" """
Args: Args:
points: points:
...@@ -55,7 +51,6 @@ class Point2VoxelCPU: ...@@ -55,7 +51,6 @@ class Point2VoxelCPU:
indices: indices:
num_per_voxel: num_per_voxel:
densehashdata: densehashdata:
mean_per_voxel:
vsize: vsize:
grid_size: grid_size:
grid_stride: grid_stride:
......
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview import Tensor
class Point2VoxelCPU:
    """Stub declaring the interface of the CPU point-to-voxel converter.

    Every method body is ``...``: this block only carries signatures and
    type information, not an implementation.

    NOTE(review): the parameter descriptions in this class are inferred from
    names and annotations alone - confirm them against the implementation
    before relying on them.
    """

    # Tensor-typed attributes declared on the class.
    densehashdata: Tensor
    voxels: Tensor
    indices: Tensor
    num_per_voxel: Tensor

    @property
    def grid_size(self) -> List[int]: ...

    @staticmethod
    def calc_meta_data(
        vsize_xyz: List[float],
        coors_range_xyz: List[float],
    ) -> Tuple[List[float], List[int], List[int], List[float]]:
        """Declare the static meta-data helper.

        Args:
            vsize_xyz: list of floats; presumably the voxel size per axis
                (TODO confirm ordering).
            coors_range_xyz: list of floats; presumably the coordinate range
                bounds (TODO confirm layout).

        Returns:
            A 4-tuple of ``List[float]``, ``List[int]``, ``List[int]`` and
            ``List[float]``. NOTE(review): these look like they line up with
            the ``vsize``, ``grid_size``, ``grid_stride`` and ``coors_range``
            parameters of the static conversion methods - confirm.
        """
        ...

    def __init__(
        self,
        vsize_xyz: List[float],
        coors_range_xyz: List[float],
        num_point_features: int,
        max_num_voxels: int,
        max_num_points_per_voxel: int,
    ) -> None:
        """Declare the constructor.

        Args:
            vsize_xyz: list of floats; presumably the voxel size per axis
                (TODO confirm ordering).
            coors_range_xyz: list of floats; presumably the coordinate range
                bounds (TODO confirm layout).
            num_point_features: integer; presumably the number of features
                stored per point (TODO confirm).
            max_num_voxels: integer; presumably an upper bound on the number
                of voxels (TODO confirm).
            max_num_points_per_voxel: integer; presumably an upper bound on
                the points kept per voxel (TODO confirm).
        """
        ...

    @staticmethod
    def point_to_voxel_static(
        points: Tensor,
        voxels: Tensor,
        indices: Tensor,
        num_per_voxel: Tensor,
        densehashdata: Tensor,
        vsize: List[float],
        grid_size: List[int],
        grid_stride: List[int],
        coors_range: List[float],
        clear_voxels: bool = True,
    ) -> Tuple[Tensor, Tensor, Tensor]:
        """Declare the static conversion entry point taking explicit buffers.

        Args:
            points: input tensor.
            voxels: tensor; shares its name with the ``voxels`` attribute.
            indices: tensor; shares its name with the ``indices`` attribute.
            num_per_voxel: tensor; shares its name with the
                ``num_per_voxel`` attribute.
            densehashdata: tensor; shares its name with the
                ``densehashdata`` attribute.
            vsize: list of floats.
            grid_size: list of ints.
            grid_stride: list of ints.
            coors_range: list of floats.
            clear_voxels: boolean flag, defaults to ``True``.

        Returns:
            A 3-tuple of tensors. NOTE(review): presumably the voxels,
            indices and per-voxel counts - confirm.
        """
        ...

    @staticmethod
    def point_to_voxel_empty_mean_static(
        points: Tensor,
        voxels: Tensor,
        indices: Tensor,
        num_per_voxel: Tensor,
        densehashdata: Tensor,
        vsize: List[float],
        grid_size: List[int],
        grid_stride: List[int],
        coors_range: List[float],
        clear_voxels: bool = True,
    ) -> Tuple[Tensor, Tensor, Tensor]:
        """Declare the static "empty mean" conversion entry point.

        The signature is identical to ``point_to_voxel_static``.
        NOTE(review): how the "empty mean" variant differs is not visible
        from this stub - confirm against the implementation.

        Args:
            points: input tensor.
            voxels: tensor; shares its name with the ``voxels`` attribute.
            indices: tensor; shares its name with the ``indices`` attribute.
            num_per_voxel: tensor; shares its name with the
                ``num_per_voxel`` attribute.
            densehashdata: tensor; shares its name with the
                ``densehashdata`` attribute.
            vsize: list of floats.
            grid_size: list of ints.
            grid_stride: list of ints.
            coors_range: list of floats.
            clear_voxels: boolean flag, defaults to ``True``.

        Returns:
            A 3-tuple of tensors.
        """
        ...

    def point_to_voxel(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
        """Declare the instance-level conversion method.

        Args:
            points: input tensor.
            clear_voxels: boolean flag, defaults to ``True``.

        Returns:
            A 3-tuple of tensors.
        """
        ...

    def point_to_voxel_empty_mean(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
        """Declare the instance-level "empty mean" conversion method.

        Args:
            points: input tensor.
            clear_voxels: boolean flag, defaults to ``True``.

        Returns:
            A 3-tuple of tensors.
        """
        ...
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
class Point2VoxelCommon:
    """Stub declaring the helper shared by the point-to-voxel classes.

    Only the signature is declared here; the body is ``...``.
    """

    @staticmethod
    def calc_meta_data(
        vsize_xyz: List[float],
        coors_range_xyz: List[float],
    ) -> Tuple[List[float], List[int], List[int], List[float]]:
        """Declare the static meta-data helper.

        Args:
            vsize_xyz: list of floats; presumably the voxel size per axis
                (TODO confirm ordering).
            coors_range_xyz: list of floats; presumably the coordinate range
                bounds (TODO confirm layout).

        Returns:
            A 4-tuple of ``List[float]``, ``List[int]``, ``List[int]`` and
            ``List[float]``. NOTE(review): the meaning of each element is
            not visible from this stub - confirm against the implementation.
        """
        ...
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
...@@ -2,6 +2,7 @@ from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Ty ...@@ -2,6 +2,7 @@ from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Ty
from pccm.stubs import EnumValue, EnumClassValue from pccm.stubs import EnumValue, EnumClassValue
from ...cumm.gemm.main import GemmAlgoDesp from ...cumm.gemm.main import GemmAlgoDesp
from cumm.tensorview import Tensor from cumm.tensorview import Tensor
from cumm.tensorview import CUDAKernelTimer
class ConvAlgoDesp(GemmAlgoDesp): class ConvAlgoDesp(GemmAlgoDesp):
ndim: int ndim: int
op_type: int op_type: int
...@@ -86,17 +87,19 @@ class ConvParams: ...@@ -86,17 +87,19 @@ class ConvParams:
mask_filter: int mask_filter: int
reverse_mask: bool reverse_mask: bool
verbose: bool verbose: bool
timer: CUDAKernelTimer
workspace: Tensor = Tensor() workspace: Tensor = Tensor()
mask: Tensor = Tensor() mask: Tensor = Tensor()
mask_argsort: Tensor = Tensor() mask_argsort: Tensor = Tensor()
indices: Tensor = Tensor() indices: Tensor = Tensor()
mask_output: Tensor = Tensor() mask_output: Tensor = Tensor()
stream: int stream: int
def __init__(self, ndim: int, op_type: int) -> None: def __init__(self, ndim: int, op_type: int, timer: CUDAKernelTimer = CUDAKernelTimer(False)) -> None:
""" """
Args: Args:
ndim: ndim:
op_type: op_type:
timer:
""" """
... ...
class ConvMainUnitTest: class ConvMainUnitTest:
......
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview import Tensor from cumm.tensorview import Tensor
from cumm.tensorview import CUDAKernelTimer
class GemmAlgoDesp: class GemmAlgoDesp:
dtype_a: int dtype_a: int
dtype_b: int dtype_b: int
...@@ -102,7 +103,13 @@ class GemmParams: ...@@ -102,7 +103,13 @@ class GemmParams:
alpha: float alpha: float
beta: float beta: float
stream: int stream: int
def __init__(self) -> None: ... timer: CUDAKernelTimer
def __init__(self, timer: CUDAKernelTimer = CUDAKernelTimer(False)) -> None:
"""
Args:
timer:
"""
...
def check_valid(self) -> None: ... def check_valid(self) -> None: ...
@property @property
def a(self) -> Tensor: ... def a(self) -> Tensor: ...
......
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
class CUDAEvent:
    """Stub declaring the interface of a named CUDA event wrapper.

    Only signatures are declared; every body is ``...``.
    NOTE(review): behavioural notes below are inferred from names and
    annotations - confirm against the implementation.
    """

    def __init__(self, name: str) -> None:
        """Declare the constructor.

        Args:
            name: string label for the event.
        """
        ...

    def record(self, stream: int = 0) -> None:
        """Declare the record operation.

        Args:
            stream: integer, defaults to ``0``; presumably a CUDA stream
                handle passed as an integer (TODO confirm).
        """
        ...

    def sync(self) -> None: ...

    @staticmethod
    def duration(start: "CUDAEvent", stop: "CUDAEvent") -> float:
        """Declare the duration query between two events.

        Args:
            start: first event.
            stop: second event.

        Returns:
            A float. NOTE(review): presumably the elapsed time between the
            two events; the unit is not visible from this stub - confirm.
        """
        ...
class CUDAKernelTimer:
    """Stub declaring the interface of the kernel timing helper.

    Only signatures are declared; every body is ``...``.
    NOTE(review): behavioural notes below are inferred from names and
    annotations - confirm against the implementation.
    """

    # Boolean attribute sharing its name with the constructor argument.
    enable: bool

    def __init__(self, enable: bool = True) -> None:
        """Declare the constructor.

        Args:
            enable: boolean flag, defaults to ``True``.
        """
        ...

    def push(self, name: str) -> None:
        """Declare the push operation.

        Args:
            name: string; presumably a scope/namespace label that ``pop``
                later removes (TODO confirm).
        """
        ...

    def pop(self) -> None: ...

    def record(self, name: str, stream: int = 0) -> None:
        """Declare the record operation.

        Args:
            name: string label for the recorded point.
            stream: integer, defaults to ``0``; presumably a CUDA stream
                handle passed as an integer (TODO confirm).
        """
        ...

    def insert_pair(self, name: str, start: str, stop: str) -> None:
        """Declare the pair-registration operation.

        Args:
            name: string label for the pair.
            start: string; presumably the name of a recorded start point
                (TODO confirm).
            stop: string; presumably the name of a recorded stop point
                (TODO confirm).
        """
        ...

    def get_all_pair_duration(self) -> Dict[str, float]: ...

    def sync(self) -> None: ...
# Copyright 2021 Yan Yan # Copyright 2021 Yan Yan
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # You may obtain a copy of the License at
# #
# http://www.apache.org/licenses/LICENSE-2.0 # http://www.apache.org/licenses/LICENSE-2.0
# #
# Unless required by applicable law or agreed to in writing, software # Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, # distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...@@ -17,4 +17,4 @@ import spconv.core_cc as _ext ...@@ -17,4 +17,4 @@ import spconv.core_cc as _ext
if hasattr(_ext, "cumm"): if hasattr(_ext, "cumm"):
CPU_ONLY_BUILD = False CPU_ONLY_BUILD = False
else: else:
CPU_ONLY_BUILD = True CPU_ONLY_BUILD = True
# Copyright 2021 Yan Yan # Copyright 2021 Yan Yan
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # You may obtain a copy of the License at
# #
# http://www.apache.org/licenses/LICENSE-2.0 # http://www.apache.org/licenses/LICENSE-2.0
# #
# Unless required by applicable law or agreed to in writing, software # Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, # distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
# Copyright 2021 Yan Yan # Copyright 2021 Yan Yan
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # You may obtain a copy of the License at
# #
# http://www.apache.org/licenses/LICENSE-2.0 # http://www.apache.org/licenses/LICENSE-2.0
# #
# Unless required by applicable law or agreed to in writing, software # Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, # distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
# Copyright 2021 Yan Yan # Copyright 2021 Yan Yan
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # You may obtain a copy of the License at
# #
# http://www.apache.org/licenses/LICENSE-2.0 # http://www.apache.org/licenses/LICENSE-2.0
# #
# Unless required by applicable law or agreed to in writing, software # Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, # distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...@@ -17,13 +17,14 @@ from cumm.conv.bases import ConvOpType, NHWC ...@@ -17,13 +17,14 @@ from cumm.conv.bases import ConvOpType, NHWC
from cumm.conv.params import ConvProblem from cumm.conv.params import ConvProblem
from cumm import dtypes from cumm import dtypes
from cumm.constants import CUMM_CPU_ONLY_BUILD from cumm.constants import CUMM_CPU_ONLY_BUILD
import pccm import pccm
from ccimport import compat from ccimport import compat
from .pointops import Point2Voxel, Point2VoxelCPU from .pointops import Point2Voxel, Point2VoxelCPU
from .indices import SparseConvIndicesKernel, CudaCommonKernel, SparseConvIndicesCPU from .indices import SparseConvIndicesKernel, CudaCommonKernel, SparseConvIndicesCPU
from .maxpool import IndiceMaxPool, IndiceMaxPoolCPU from .maxpool import IndiceMaxPool, IndiceMaxPoolCPU
from .gather import GatherCPU from .gather import GatherCPU
class CustomThrustLib(pccm.Class): class CustomThrustLib(pccm.Class):
def __init__(self): def __init__(self):
super().__init__() super().__init__()
...@@ -32,12 +33,15 @@ class CustomThrustLib(pccm.Class): ...@@ -32,12 +33,15 @@ class CustomThrustLib(pccm.Class):
if compat.InLinux: if compat.InLinux:
self.build_meta.add_cflags("nvcc", "-Xcompiler", "-fno-gnu-unique") self.build_meta.add_cflags("nvcc", "-Xcompiler", "-fno-gnu-unique")
class ThrustCustomAllocatorV2(pccm.Class, pccm.pybind.PybindClassMixin): class ThrustCustomAllocatorV2(pccm.Class, pccm.pybind.PybindClassMixin):
def __init__(self): def __init__(self):
super().__init__() super().__init__()
self.add_dependency(TensorView) self.add_dependency(TensorView)
self.add_include("functional", "memory") self.add_include("functional", "memory")
self.add_pybind_member("alloc_func", "std::function<std::uintptr_t(std::size_t)>", pyanno="Callable[[int], int]") self.add_pybind_member("alloc_func",
"std::function<std::uintptr_t(std::size_t)>",
pyanno="Callable[[int], int]")
self.add_typedef("value_type", "char") self.add_typedef("value_type", "char")
@pccm.member_function @pccm.member_function
...@@ -54,14 +58,15 @@ class ThrustCustomAllocatorV2(pccm.Class, pccm.pybind.PybindClassMixin): ...@@ -54,14 +58,15 @@ class ThrustCustomAllocatorV2(pccm.Class, pccm.pybind.PybindClassMixin):
TV_THROW_RT_ERR("set alloc function first."); TV_THROW_RT_ERR("set alloc function first.");
}} }}
""") """)
return code return code
@pccm.member_function @pccm.member_function
def deallocate(self): def deallocate(self):
code = pccm.FunctionCode() code = pccm.FunctionCode()
code.arg("ptr", "char *") code.arg("ptr", "char *")
code.arg("num_bytes", "size_t") code.arg("num_bytes", "size_t")
return code return code
class SpconvOps(pccm.Class): class SpconvOps(pccm.Class):
def __init__(self): def __init__(self):
...@@ -69,28 +74,38 @@ class SpconvOps(pccm.Class): ...@@ -69,28 +74,38 @@ class SpconvOps(pccm.Class):
self.add_dependency(ThrustCustomAllocatorV2) self.add_dependency(ThrustCustomAllocatorV2)
self.ndims = [1, 2, 3, 4] self.ndims = [1, 2, 3, 4]
for ndim in self.ndims: for ndim in self.ndims:
p2v = Point2Voxel(dtypes.float32, ndim) p2v = Point2Voxel(dtypes.float32, ndim)
p2v_cpu = Point2VoxelCPU(dtypes.float32, ndim) p2v_cpu = Point2VoxelCPU(dtypes.float32, ndim)
self.add_param_class(f"ops_cpu{ndim}d", p2v_cpu, f"Point2Voxel{ndim}DCPU") self.add_param_class(f"ops_cpu{ndim}d", p2v_cpu,
f"Point2Voxel{ndim}DCPU")
problem = ConvProblem(ndim, ConvOpType.kForward, NHWC, NHWC, NHWC) problem = ConvProblem(ndim, ConvOpType.kForward, NHWC, NHWC, NHWC)
indices = SparseConvIndicesKernel(problem, dtypes.int32) indices = SparseConvIndicesKernel(problem, dtypes.int32)
indices_cpu = SparseConvIndicesCPU(problem, dtypes.int32) indices_cpu = SparseConvIndicesCPU(problem, dtypes.int32)
self.add_param_class(f"ops_cpu{ndim}d", indices_cpu, f"SpconvIndicesCPU{ndim}D") self.add_param_class(f"ops_cpu{ndim}d", indices_cpu,
f"SpconvIndicesCPU{ndim}D")
# self.add_param_class("ops", indices, "SpconvIndices") # self.add_param_class("ops", indices, "SpconvIndices")
if not CUMM_CPU_ONLY_BUILD: if not CUMM_CPU_ONLY_BUILD:
self.add_param_class(f"ops{ndim}d", p2v, f"Point2Voxel{ndim}D") self.add_param_class(f"ops{ndim}d", p2v, f"Point2Voxel{ndim}D")
cuda_funcs = [self.generate_subm_conv_inds, cuda_funcs = [
self.generate_conv_inds_stage1, self.generate_conv_inds_stage1_5, self.generate_conv_inds_stage2, self.sort_1d_by_key, self.generate_subm_conv_inds,
self.generate_conv_inds_mask_stage1, self.generate_conv_inds_mask_stage2] self.generate_conv_inds_stage1,
self.add_impl_only_param_class(cuda_funcs, f"ops{ndim}d", indices, f"SpconvIndices{ndim}D") self.generate_conv_inds_stage1_5,
self.generate_conv_inds_stage2, self.sort_1d_by_key,
self.generate_conv_inds_mask_stage1,
self.generate_conv_inds_mask_stage2
]
self.add_impl_only_param_class(cuda_funcs, f"ops{ndim}d",
indices,
f"SpconvIndices{ndim}D")
@pccm.pybind.mark @pccm.pybind.mark
@pccm.cuda.static_function @pccm.cuda.static_function
def generate_conv_inds_stage1(self): def generate_conv_inds_stage1(self):
code = pccm.FunctionCode() code = pccm.FunctionCode()
code.arg("indices", "tv::Tensor") code.arg("indices", "tv::Tensor")
code.arg("indice_pairs, indice_pairs_uniq, indice_num_per_loc", "tv::Tensor") code.arg("indice_pairs, indice_pairs_uniq, indice_num_per_loc",
"tv::Tensor")
code.arg("batch_size", "int") code.arg("batch_size", "int")
code.arg("output_dims, input_dims", f"std::vector<int>") code.arg("output_dims, input_dims", f"std::vector<int>")
code.arg("ksize, stride, padding, dilation", f"std::vector<int>") code.arg("ksize, stride, padding, dilation", f"std::vector<int>")
...@@ -127,7 +142,7 @@ class SpconvOps(pccm.Class): ...@@ -127,7 +142,7 @@ class SpconvOps(pccm.Class):
""") """)
code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""") code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""")
return code# .ret("int") return code # .ret("int")
@pccm.pybind.mark @pccm.pybind.mark
@pccm.cuda.static_function @pccm.cuda.static_function
...@@ -201,7 +216,8 @@ class SpconvOps(pccm.Class): ...@@ -201,7 +216,8 @@ class SpconvOps(pccm.Class):
return code.make_invalid() return code.make_invalid()
code.arg("indices", "tv::Tensor") code.arg("indices", "tv::Tensor")
code.arg("indice_pairs_bwd, indice_pairs_uniq, indice_num_per_loc", "tv::Tensor") code.arg("indice_pairs_bwd, indice_pairs_uniq, indice_num_per_loc",
"tv::Tensor")
code.arg("batch_size", "int") code.arg("batch_size", "int")
code.arg("output_dims, input_dims", f"std::vector<int>") code.arg("output_dims, input_dims", f"std::vector<int>")
code.arg("ksize, stride, padding, dilation", f"std::vector<int>") code.arg("ksize, stride, padding, dilation", f"std::vector<int>")
...@@ -236,7 +252,7 @@ class SpconvOps(pccm.Class): ...@@ -236,7 +252,7 @@ class SpconvOps(pccm.Class):
""") """)
code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""") code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""")
return code# .ret("int") return code # .ret("int")
@pccm.pybind.mark @pccm.pybind.mark
@pccm.cuda.static_function @pccm.cuda.static_function
...@@ -245,7 +261,9 @@ class SpconvOps(pccm.Class): ...@@ -245,7 +261,9 @@ class SpconvOps(pccm.Class):
if CUMM_CPU_ONLY_BUILD: if CUMM_CPU_ONLY_BUILD:
return code.make_invalid() return code.make_invalid()
code.arg("indices, hashdata", "tv::Tensor") code.arg("indices, hashdata", "tv::Tensor")
code.arg("indice_pairs_fwd, indice_pairs_bwd, indice_pairs_uniq, out_inds", "tv::Tensor") code.arg(
"indice_pairs_fwd, indice_pairs_bwd, indice_pairs_uniq, out_inds",
"tv::Tensor")
code.arg("mask_fwd, mask_bwd", "tv::Tensor") code.arg("mask_fwd, mask_bwd", "tv::Tensor")
code.arg("num_out_act", "int") code.arg("num_out_act", "int")
code.arg("batch_size", "int") code.arg("batch_size", "int")
...@@ -294,7 +312,8 @@ class SpconvOps(pccm.Class): ...@@ -294,7 +312,8 @@ class SpconvOps(pccm.Class):
code.arg("batch_size", "int") code.arg("batch_size", "int")
code.arg("input_dims", f"std::vector<int>") code.arg("input_dims", f"std::vector<int>")
code.arg("ksize, dilation", f"std::vector<int>") code.arg("ksize, dilation", f"std::vector<int>")
code.arg("indice_pair_mask", "tv::Tensor", "tv::Tensor()", "cumm.tensorview.Tensor = Tensor()") code.arg("indice_pair_mask", "tv::Tensor", "tv::Tensor()",
"cumm.tensorview.Tensor = Tensor()")
code.arg("backward", "bool", "false") code.arg("backward", "bool", "false")
code.arg("stream_int", f"std::uintptr_t", "0", pyanno="int = 0") code.arg("stream_int", f"std::uintptr_t", "0", pyanno="int = 0")
code.raw(f""" code.raw(f"""
...@@ -529,7 +548,10 @@ class SpconvOps(pccm.Class): ...@@ -529,7 +548,10 @@ class SpconvOps(pccm.Class):
if CUMM_CPU_ONLY_BUILD: if CUMM_CPU_ONLY_BUILD:
return code.make_invalid() return code.make_invalid()
code.arg("data", "tv::Tensor") code.arg("data", "tv::Tensor")
code.arg("indices", "tv::Tensor", "tv::Tensor()", pyanno="cumm.tensorview.Tensor = Tensor()") code.arg("indices",
"tv::Tensor",
"tv::Tensor()",
pyanno="cumm.tensorview.Tensor = Tensor()")
code.arg("stream", "std::uintptr_t", "0", pyanno="int") code.arg("stream", "std::uintptr_t", "0", pyanno="int")
code.code_after_include = f""" code.code_after_include = f"""
template <typename T> struct SmallOrEqualTo {{ template <typename T> struct SmallOrEqualTo {{
...@@ -575,7 +597,10 @@ class SpconvOps(pccm.Class): ...@@ -575,7 +597,10 @@ class SpconvOps(pccm.Class):
code.arg("data", "tv::Tensor") code.arg("data", "tv::Tensor")
code.arg("alloc_func", "std::function<std::uintptr_t(std::size_t)>") code.arg("alloc_func", "std::function<std::uintptr_t(std::size_t)>")
code.arg("indices", "tv::Tensor", "tv::Tensor()", pyanno="cumm.tensorview.Tensor = Tensor()") code.arg("indices",
"tv::Tensor",
"tv::Tensor()",
pyanno="cumm.tensorview.Tensor = Tensor()")
code.arg("stream", "std::uintptr_t", "0", pyanno="int") code.arg("stream", "std::uintptr_t", "0", pyanno="int")
code.code_after_include = f""" code.code_after_include = f"""
template <typename T> struct SmallOrEqualTo {{ template <typename T> struct SmallOrEqualTo {{
...@@ -613,7 +638,6 @@ class SpconvOps(pccm.Class): ...@@ -613,7 +638,6 @@ class SpconvOps(pccm.Class):
""") """)
return code.ret("tv::Tensor") return code.ret("tv::Tensor")
@pccm.pybind.mark @pccm.pybind.mark
@pccm.cuda.static_function @pccm.cuda.static_function
def sort_1d_by_key_split(self): def sort_1d_by_key_split(self):
...@@ -623,7 +647,10 @@ class SpconvOps(pccm.Class): ...@@ -623,7 +647,10 @@ class SpconvOps(pccm.Class):
code.arg("data", "tv::Tensor") code.arg("data", "tv::Tensor")
code.arg("mask", "tv::Tensor") code.arg("mask", "tv::Tensor")
code.arg("indices", "tv::Tensor", "tv::Tensor()", pyanno="cumm.tensorview.Tensor = Tensor()") code.arg("indices",
"tv::Tensor",
"tv::Tensor()",
pyanno="cumm.tensorview.Tensor = Tensor()")
code.arg("stream", "std::uintptr_t", "0", pyanno="int") code.arg("stream", "std::uintptr_t", "0", pyanno="int")
code.arg("mask_output", "bool", "false") code.arg("mask_output", "bool", "false")
...@@ -678,7 +705,10 @@ class SpconvOps(pccm.Class): ...@@ -678,7 +705,10 @@ class SpconvOps(pccm.Class):
code.arg("mask", "tv::Tensor") code.arg("mask", "tv::Tensor")
code.arg("indices", "tv::Tensor", "tv::Tensor()", pyanno="cumm.tensorview.Tensor = Tensor()") code.arg("indices",
"tv::Tensor",
"tv::Tensor()",
pyanno="cumm.tensorview.Tensor = Tensor()")
code.arg("stream", "std::uintptr_t", "0", pyanno="int") code.arg("stream", "std::uintptr_t", "0", pyanno="int")
code.arg("mask_output", "bool", "false") code.arg("mask_output", "bool", "false")
...@@ -821,9 +851,10 @@ class SpconvOps(pccm.Class): ...@@ -821,9 +851,10 @@ class SpconvOps(pccm.Class):
}} }}
""") """)
code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""") code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""")
return code.ret("std::tuple<std::vector<float>, std::vector<int>, std::vector<int>, std::vector<float>>") return code.ret(
"std::tuple<std::vector<float>, std::vector<int>, std::vector<int>, std::vector<float>>"
)
@pccm.pybind.mark @pccm.pybind.mark
@pccm.static_function @pccm.static_function
def point2voxel_cpu(self): def point2voxel_cpu(self):
...@@ -876,7 +907,8 @@ class SpconvOps(pccm.Class): ...@@ -876,7 +907,8 @@ class SpconvOps(pccm.Class):
def point2voxel_cuda(self): def point2voxel_cuda(self):
code = pccm.FunctionCode() code = pccm.FunctionCode()
code.arg("points", "tv::Tensor") code.arg("points", "tv::Tensor")
code.arg("voxels, indices, num_per_voxel, hashdata, point_indice_data", "tv::Tensor") code.arg("voxels, indices, num_per_voxel, hashdata, point_indice_data",
"tv::Tensor")
code.arg("vsize", f"std::vector<float>") code.arg("vsize", f"std::vector<float>")
code.arg("grid_size, grid_stride", f"std::vector<int>") code.arg("grid_size, grid_stride", f"std::vector<int>")
code.arg("coors_range", f"std::vector<float>") code.arg("coors_range", f"std::vector<float>")
...@@ -914,4 +946,4 @@ class SpconvOps(pccm.Class): ...@@ -914,4 +946,4 @@ class SpconvOps(pccm.Class):
}} }}
""") """)
code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""") code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""")
return code.ret("std::tuple<tv::Tensor, tv::Tensor, tv::Tensor>") return code.ret("std::tuple<tv::Tensor, tv::Tensor, tv::Tensor>")
\ No newline at end of file
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pccm
from ccimport import compat
from cumm.common import TensorView
class OMPLib(pccm.Class):
    """pccm class that pulls in the tensorview parallel header with OpenMP.

    The constructor registers ``TensorView`` as a dependency, adds the
    ``tensorview/parallel/all.h`` include, and adds the OpenMP compiler flag
    for the compilers named below.
    """

    def __init__(self):
        super().__init__()
        self.add_dependency(TensorView)
        self.add_include("tensorview/parallel/all.h")
        if compat.InWindows:
            # The "cl" compiler takes /openmp.
            self.build_meta.add_cflags("cl", "/openmp")
        else:
            # NOTE(review): the original nesting of the clang++ line could
            # not be seen; it is placed in the non-Windows branch next to
            # g++ - confirm.
            self.build_meta.add_cflags("g++", "-fopenmp")
            self.build_meta.add_cflags("clang++", "-fopenmp")
import torch import torch
import time import time
def main(): def main():
...@@ -34,4 +35,4 @@ def main(): ...@@ -34,4 +35,4 @@ def main():
if __name__ == "__main__": if __name__ == "__main__":
main() main()
\ No newline at end of file
# Copyright 2021 Yan Yan # Copyright 2021 Yan Yan
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # You may obtain a copy of the License at
# #
# http://www.apache.org/licenses/LICENSE-2.0 # http://www.apache.org/licenses/LICENSE-2.0
# #
# Unless required by applicable law or agreed to in writing, software # Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, # distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import pccm import pccm
from cumm.common import TensorView from cumm.common import TensorView
from typing import List from cumm.constants import CUMM_CPU_ONLY_BUILD
from spconv.csrc.sparse.cpu_core import OMPLib
from typing import List
class GatherCPU(pccm.Class): class GatherCPU(pccm.Class):
def __init__(self): def __init__(self):
super().__init__() super().__init__()
if CUMM_CPU_ONLY_BUILD:
self.add_dependency(OMPLib)
self.add_dependency(TensorView) self.add_dependency(TensorView)
self.add_include("tensorview/parallel/all.h")
@pccm.static_function @pccm.static_function
def gather(self): def gather(self):
code = pccm.FunctionCode() code = pccm.FunctionCode()
...@@ -35,15 +41,16 @@ class GatherCPU(pccm.Class): ...@@ -35,15 +41,16 @@ class GatherCPU(pccm.Class):
int channel = in.dim(1); int channel = in.dim(1);
tv::dispatch<float, double>(out.dtype(), [&](auto I){{ tv::dispatch<float, double>(out.dtype(), [&](auto I){{
auto indices_data = inds.data_ptr<const int>(); auto indices_data = inds.data_ptr<const int>();
using T = TV_DECLTYPE(I); using T = TV_DECLTYPE(I);
T *buffer_data = out.data_ptr<T>(); T *buffer_data = out.data_ptr<T>();
const T *features_data = in.data_ptr<const T>(); const T *features_data = in.data_ptr<const T>();
for (int i = 0; i < nhot; ++i) {{ tv::kernel_1d(out.device(), nhot, [&](int begin, int end, int step){{
std::memcpy(buffer_data + i * channel, for (int i = begin; i < end; i += step) {{
features_data + indices_data[i] * channel, std::memcpy(buffer_data + i * channel,
sizeof(T) * channel); features_data + indices_data[i] * channel,
}} sizeof(T) * channel);
}}
}});
}}); }});
""") """)
return code return code
...@@ -65,13 +72,15 @@ class GatherCPU(pccm.Class): ...@@ -65,13 +72,15 @@ class GatherCPU(pccm.Class):
T *features_data = out.data_ptr<T>(); T *features_data = out.data_ptr<T>();
const T *buf = in.data_ptr<const T>(); const T *buf = in.data_ptr<const T>();
T *out_ptr = out.data_ptr<T>(); T *out_ptr = out.data_ptr<T>();
for (int i = 0; i < nhot; ++i) {{ tv::kernel_1d(out.device(), nhot, [&](int begin, int end, int step){{
buf = buffer_data + i * channel; for (int i = begin; i < end; i += step) {{
out_ptr = features_data + indices_data[i] * channel; buf = buffer_data + i * channel;
for (int j = 0; j < channel; ++j) {{ out_ptr = features_data + indices_data[i] * channel;
out_ptr[j] = out_ptr[j] + buf[j]; for (int j = 0; j < channel; ++j) {{
out_ptr[j] = out_ptr[j] + buf[j];
}}
}} }}
}} }});
}}); }});
""") """)
return code return code
# Copyright 2021 Yan Yan # Copyright 2021 Yan Yan
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # You may obtain a copy of the License at
# #
# http://www.apache.org/licenses/LICENSE-2.0 # http://www.apache.org/licenses/LICENSE-2.0
# #
# Unless required by applicable law or agreed to in writing, software # Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, # distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...@@ -16,13 +16,14 @@ import contextlib ...@@ -16,13 +16,14 @@ import contextlib
from cumm.conv.bases import ConvEnum from cumm.conv.bases import ConvEnum
from cumm.gemm.core.metaarray import MetaArray, seq from cumm.gemm.core.metaarray import MetaArray, seq
from cumm import dtypes from cumm import dtypes
import pccm import pccm
from cumm.gemm.layout import TensorGeneric, to_stride from cumm.gemm.layout import TensorGeneric, to_stride
from cumm.common import TensorView, TensorViewHashKernel, TensorViewKernel, ThrustLib from cumm.common import TensorView, TensorViewHashKernel, TensorViewKernel, ThrustLib
from cumm.gemm import codeops from cumm.gemm import codeops
from typing import List from typing import List
from cumm.conv.params import ConvProblem from cumm.conv.params import ConvProblem
import numpy as np import numpy as np
class CudaCommonKernel(pccm.ParameterizedClass): class CudaCommonKernel(pccm.ParameterizedClass):
# we need to use PClass instead of Class # we need to use PClass instead of Class
...@@ -31,8 +32,8 @@ class CudaCommonKernel(pccm.ParameterizedClass): ...@@ -31,8 +32,8 @@ class CudaCommonKernel(pccm.ParameterizedClass):
def arange_kernel(self): def arange_kernel(self):
code = pccm.FunctionCode() code = pccm.FunctionCode()
code.targ("T") code.targ("T")
code.arg("data", f"T*") code.arg("data", f"T*")
code.arg("size", f"int") code.arg("size", f"int")
code.raw(f""" code.raw(f"""
for (int i : tv::KernelLoopX<int>(size)) {{ for (int i : tv::KernelLoopX<int>(size)) {{
data[i] = T(i); data[i] = T(i);
...@@ -44,9 +45,9 @@ class CudaCommonKernel(pccm.ParameterizedClass): ...@@ -44,9 +45,9 @@ class CudaCommonKernel(pccm.ParameterizedClass):
def fill_kernel(self): def fill_kernel(self):
code = pccm.FunctionCode() code = pccm.FunctionCode()
code.targ("T") code.targ("T")
code.arg("data", f"T*") code.arg("data", f"T*")
code.arg("val", f"T") code.arg("val", f"T")
code.arg("size", f"int") code.arg("size", f"int")
code.raw(f""" code.raw(f"""
for (int i : tv::KernelLoopX<int>(size)) {{ for (int i : tv::KernelLoopX<int>(size)) {{
data[i] = T(val); data[i] = T(val);
...@@ -66,7 +67,7 @@ class ConvOutLocIter(pccm.ParameterizedClass): ...@@ -66,7 +67,7 @@ class ConvOutLocIter(pccm.ParameterizedClass):
self.add_param_class("lociter", layout_npq, "LayoutNPQ") self.add_param_class("lociter", layout_npq, "LayoutNPQ")
self.add_param_class("lociter_rs", layout_rs, "LayoutRS") self.add_param_class("lociter_rs", layout_rs, "LayoutRS")
self.ndim = problem.ndim self.ndim = problem.ndim
self.add_member("problem_", f"ConvProblem") self.add_member("problem_", f"ConvProblem")
self.add_member("count_", f"tv::array<int, {self.ndim}>") self.add_member("count_", f"tv::array<int, {self.ndim}>")
self.add_member("layout_npq", f"LayoutNPQ") self.add_member("layout_npq", f"LayoutNPQ")
...@@ -82,13 +83,15 @@ class ConvOutLocIter(pccm.ParameterizedClass): ...@@ -82,13 +83,15 @@ class ConvOutLocIter(pccm.ParameterizedClass):
pqs = codeops.unpack("problem.output_dims", range(self.ndim)) pqs = codeops.unpack("problem.output_dims", range(self.ndim))
rss = codeops.unpack("problem.ksize", range(self.ndim)) rss = codeops.unpack("problem.ksize", range(self.ndim))
code.ctor_init("layout_npq", f"LayoutNPQ::from_shape({{problem.N, {pqs}}})") code.ctor_init("layout_npq",
f"LayoutNPQ::from_shape({{problem.N, {pqs}}})")
code.ctor_init("layout_rs", f"LayoutRS::from_shape({{{rss}}})") code.ctor_init("layout_rs", f"LayoutRS::from_shape({{{rss}}})")
return code
@pccm.member_function(header_only=True, attrs=["TV_HOST_DEVICE_INLINE"], return code
name="operator++")
@pccm.member_function(header_only=True,
attrs=["TV_HOST_DEVICE_INLINE"],
name="operator++")
def increment(self): def increment(self):
code = pccm.FunctionCode() code = pccm.FunctionCode()
for i in range(self.ndim - 1, -1, -1): for i in range(self.ndim - 1, -1, -1):
...@@ -110,8 +113,9 @@ class ConvOutLocIter(pccm.ParameterizedClass): ...@@ -110,8 +113,9 @@ class ConvOutLocIter(pccm.ParameterizedClass):
""") """)
return code return code
@pccm.member_function(header_only=True, attrs=["TV_HOST_DEVICE_INLINE"], @pccm.member_function(header_only=True,
const=True) attrs=["TV_HOST_DEVICE_INLINE"],
const=True)
def nhw_to_npq(self): def nhw_to_npq(self):
code = pccm.FunctionCode() code = pccm.FunctionCode()
code.arg("nhw_offset", "const int*") code.arg("nhw_offset", "const int*")
...@@ -128,8 +132,9 @@ class ConvOutLocIter(pccm.ParameterizedClass): ...@@ -128,8 +132,9 @@ class ConvOutLocIter(pccm.ParameterizedClass):
""") """)
return code.ret(f"tv::array<int, {self.ndim + 1}>") return code.ret(f"tv::array<int, {self.ndim + 1}>")
@pccm.member_function(header_only=True, attrs=["TV_HOST_DEVICE_INLINE"], @pccm.member_function(header_only=True,
const=True) attrs=["TV_HOST_DEVICE_INLINE"],
const=True)
def npq_to_nhw(self): def npq_to_nhw(self):
code = pccm.FunctionCode() code = pccm.FunctionCode()
code.arg("npq_offset", "const int*") code.arg("npq_offset", "const int*")
...@@ -144,9 +149,9 @@ class ConvOutLocIter(pccm.ParameterizedClass): ...@@ -144,9 +149,9 @@ class ConvOutLocIter(pccm.ParameterizedClass):
""") """)
return code.ret(f"tv::array<int, {self.ndim + 1}>") return code.ret(f"tv::array<int, {self.ndim + 1}>")
@pccm.member_function(header_only=True,
@pccm.member_function(header_only=True, attrs=["TV_HOST_DEVICE_INLINE"], attrs=["TV_HOST_DEVICE_INLINE"],
const=True) const=True)
def query_npq(self): def query_npq(self):
code = pccm.FunctionCode() code = pccm.FunctionCode()
code.arg("nhw_offset", "const int*") code.arg("nhw_offset", "const int*")
...@@ -156,22 +161,27 @@ class ConvOutLocIter(pccm.ParameterizedClass): ...@@ -156,22 +161,27 @@ class ConvOutLocIter(pccm.ParameterizedClass):
auto npq_no_stride = nhw_to_npq<true>(nhw_offset); auto npq_no_stride = nhw_to_npq<true>(nhw_offset);
npq_offset[0] = npq_no_stride[0]; npq_offset[0] = npq_no_stride[0];
""") """)
hw_valid = [] # type: List[str] hw_valid = [] # type: List[str]
stride_valid = [] # type: List[str] stride_valid = [] # type: List[str]
for i in range(self.ndim): for i in range(self.ndim):
code.raw(f"npq_offset[{i + 1}] = npq_no_stride[{i + 1}] / problem_.stride[{i}];") code.raw(
hw_valid.append((f"npq_offset[{i + 1}] >= 0 && " f"npq_offset[{i + 1}] = npq_no_stride[{i + 1}] / problem_.stride[{i}];"
f"npq_offset[{i + 1}] < problem_.output_dims[{i}]")) )
stride_valid.append(f"!(npq_no_stride[{i + 1}] % problem_.stride[{i}])") hw_valid.append(
(f"npq_offset[{i + 1}] >= 0 && "
f"npq_offset[{i + 1}] < problem_.output_dims[{i}]"))
stride_valid.append(
f"!(npq_no_stride[{i + 1}] % problem_.stride[{i}])")
code.raw(f""" code.raw(f"""
return npq_no_stride[0] < problem_.N && return npq_no_stride[0] < problem_.N &&
{' && '.join(hw_valid)} && {' && '.join(hw_valid)} &&
{' && '.join(stride_valid)}; {' && '.join(stride_valid)};
""") """)
return code return code
@pccm.member_function(header_only=True, attrs=["TV_HOST_DEVICE_INLINE"], @pccm.member_function(header_only=True,
const=True) attrs=["TV_HOST_DEVICE_INLINE"],
const=True)
def query_npq_no_stride(self): def query_npq_no_stride(self):
code = pccm.FunctionCode() code = pccm.FunctionCode()
code.arg("nhw_offset", "const int*") code.arg("nhw_offset", "const int*")
...@@ -180,18 +190,20 @@ class ConvOutLocIter(pccm.ParameterizedClass): ...@@ -180,18 +190,20 @@ class ConvOutLocIter(pccm.ParameterizedClass):
code.raw(f""" code.raw(f"""
npq_offset = nhw_to_npq<true>(nhw_offset); npq_offset = nhw_to_npq<true>(nhw_offset);
""") """)
hw_valid = [] # type: List[str] hw_valid = [] # type: List[str]
for i in range(self.ndim): for i in range(self.ndim):
hw_valid.append((f"npq_offset[{i + 1}] >= 0 && " hw_valid.append(
f"npq_offset[{i + 1}] < problem_.output_dims[{i}]")) (f"npq_offset[{i + 1}] >= 0 && "
f"npq_offset[{i + 1}] < problem_.output_dims[{i}]"))
code.raw(f""" code.raw(f"""
return npq_offset[0] < problem_.N && return npq_offset[0] < problem_.N &&
{' && '.join(hw_valid)}; {' && '.join(hw_valid)};
""") """)
return code return code
@pccm.member_function(header_only=True, attrs=["TV_HOST_DEVICE_INLINE"], @pccm.member_function(header_only=True,
const=True) attrs=["TV_HOST_DEVICE_INLINE"],
const=True)
def query_nhw(self): def query_nhw(self):
code = pccm.FunctionCode() code = pccm.FunctionCode()
code.arg("npq_offset", "const int*") code.arg("npq_offset", "const int*")
...@@ -200,18 +212,20 @@ class ConvOutLocIter(pccm.ParameterizedClass): ...@@ -200,18 +212,20 @@ class ConvOutLocIter(pccm.ParameterizedClass):
code.raw(f""" code.raw(f"""
nhw_offset = npq_to_nhw(npq_offset); nhw_offset = npq_to_nhw(npq_offset);
""") """)
hw_valid = [] # type: List[str] hw_valid = [] # type: List[str]
for i in range(self.ndim): for i in range(self.ndim):
hw_valid.append((f"nhw_offset[{i + 1}] >= 0 && " hw_valid.append(
f"nhw_offset[{i + 1}] < problem_.input_dims[{i}]")) (f"nhw_offset[{i + 1}] >= 0 && "
f"nhw_offset[{i + 1}] < problem_.input_dims[{i}]"))
code.raw(f""" code.raw(f"""
return nhw_offset[0] < problem_.N && return nhw_offset[0] < problem_.N &&
{' && '.join(hw_valid)}; {' && '.join(hw_valid)};
""") """)
return code return code
@pccm.member_function(header_only=True, attrs=["TV_HOST_DEVICE_INLINE"], @pccm.member_function(header_only=True,
const=True) attrs=["TV_HOST_DEVICE_INLINE"],
const=True)
def query_nhw_out(self): def query_nhw_out(self):
code = pccm.FunctionCode() code = pccm.FunctionCode()
code.arg("npq_offset", "const int*") code.arg("npq_offset", "const int*")
...@@ -220,41 +234,45 @@ class ConvOutLocIter(pccm.ParameterizedClass): ...@@ -220,41 +234,45 @@ class ConvOutLocIter(pccm.ParameterizedClass):
code.raw(f""" code.raw(f"""
nhw_offset = npq_to_nhw(npq_offset); nhw_offset = npq_to_nhw(npq_offset);
""") """)
hw_valid = [] # type: List[str] hw_valid = [] # type: List[str]
for i in range(self.ndim): for i in range(self.ndim):
hw_valid.append((f"nhw_offset[{i + 1}] >= 0 && " hw_valid.append(
f"nhw_offset[{i + 1}] < problem_.output_dims[{i}]")) (f"nhw_offset[{i + 1}] >= 0 && "
f"nhw_offset[{i + 1}] < problem_.output_dims[{i}]"))
code.raw(f""" code.raw(f"""
return nhw_offset[0] < problem_.N && return nhw_offset[0] < problem_.N &&
{' && '.join(hw_valid)}; {' && '.join(hw_valid)};
""") """)
return code return code
class SparseConvIndicesKernel(pccm.ParameterizedClass): class SparseConvIndicesKernel(pccm.ParameterizedClass):
def __init__(self, problem: ConvProblem, dtype_indices: dtypes.DType): def __init__(self, problem: ConvProblem, dtype_indices: dtypes.DType):
super().__init__() super().__init__()
self.add_dependency(TensorView, TensorViewKernel, TensorViewHashKernel, ThrustLib) self.add_dependency(TensorView, TensorViewKernel, TensorViewHashKernel,
ThrustLib)
self.loc_iter = ConvOutLocIter(problem) self.loc_iter = ConvOutLocIter(problem)
self.add_param_class("spinds", self.loc_iter, "ConvLocIter") self.add_param_class("spinds", self.loc_iter, "ConvLocIter")
self.add_param_class("spinds", problem, "ConvProblem") self.add_param_class("spinds", problem, "ConvProblem")
self.add_param_class("cudakers", CudaCommonKernel()) self.add_param_class("cudakers", CudaCommonKernel())
self.ndim = problem.ndim self.ndim = problem.ndim
self.dtype_indices = dtype_indices self.dtype_indices = dtype_indices
self.dtype_indices_uniq = dtype_indices self.dtype_indices_uniq = dtype_indices
assert dtype_indices == dtypes.int32 or dtype_indices == dtypes.int64 assert dtype_indices == dtypes.int32 or dtype_indices == dtypes.int64
@pccm.cuda.cuda_global_function @pccm.cuda.cuda_global_function
def calc_conv_indices_stage1(self): def calc_conv_indices_stage1(self):
code = pccm.FunctionCode() code = pccm.FunctionCode()
code.arg("loc_iter", f"ConvLocIter") # [N, ndim + 1] code.arg("loc_iter", f"ConvLocIter") # [N, ndim + 1]
code.arg("indices_in", f"const int*") # [N, ndim + 1] code.arg("indices_in", f"const int*") # [N, ndim + 1]
code.arg("indice_pairs", f"{self.dtype_indices}*") # [2, kernelProd, MaxSize] code.arg("indice_pairs",
code.arg("indice_pairs_for_uniq", f"{self.dtype_indices}*") # [2, kernelProd, MaxSize] f"{self.dtype_indices}*") # [2, kernelProd, MaxSize]
code.arg("indice_num_per_loc", f"int*") # [kernelProd] code.arg("indice_pairs_for_uniq",
f"{self.dtype_indices}*") # [2, kernelProd, MaxSize]
code.arg("indice_num_per_loc", f"int*") # [kernelProd]
code.arg("num_indices_in", "int") code.arg("num_indices_in", "int")
code.arg("indices_pair_size", "int") code.arg("indices_pair_size", "int")
...@@ -288,17 +306,18 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass): ...@@ -288,17 +306,18 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
""") """)
return code return code
@pccm.cuda.cuda_global_function @pccm.cuda.cuda_global_function
def build_conv_hash_table(self): def build_conv_hash_table(self):
code = pccm.FunctionCode() code = pccm.FunctionCode()
code.targ("TTable") code.targ("TTable")
code.arg("table", f"TTable") # [N, ndim + 1] code.arg("table", f"TTable") # [N, ndim + 1]
code.arg("indices_out", f"int*") # [N, ndim + 1] code.arg("indices_out", f"int*") # [N, ndim + 1]
code.arg("indice_pairs_for_uniq", f"const {self.dtype_indices}*") # [2, kernelProd, MaxSize] code.arg("indice_pairs_for_uniq",
f"const {self.dtype_indices}*") # [2, kernelProd, MaxSize]
code.arg("layout_npq", f"spinds::LayoutNPQ") # [2, kernelProd, MaxSize] code.arg("layout_npq",
f"spinds::LayoutNPQ") # [2, kernelProd, MaxSize]
code.arg("num_indices", "int") code.arg("num_indices", "int")
...@@ -315,8 +334,8 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass): ...@@ -315,8 +334,8 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
def calc_conv_indices_stage2(self): def calc_conv_indices_stage2(self):
code = pccm.FunctionCode() code = pccm.FunctionCode()
code.targ("TTable") code.targ("TTable")
code.arg("table", f"TTable") # [N, ndim + 1] code.arg("table", f"TTable") # [N, ndim + 1]
code.arg("indice_pairs_out_part", f"int*") # [2, kernelProd, MaxSize] code.arg("indice_pairs_out_part", f"int*") # [2, kernelProd, MaxSize]
code.arg("num_indices_in", "int") code.arg("num_indices_in", "int")
code.arg("indices_pair_size", "int") code.arg("indices_pair_size", "int")
# TODO use block instead of filter_offset? # TODO use block instead of filter_offset?
...@@ -338,12 +357,14 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass): ...@@ -338,12 +357,14 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
@pccm.cuda.cuda_global_function @pccm.cuda.cuda_global_function
def calc_conv_indices_stage1_mask(self): def calc_conv_indices_stage1_mask(self):
code = pccm.FunctionCode() code = pccm.FunctionCode()
code.arg("loc_iter", f"ConvLocIter") # [N, ndim + 1] code.arg("loc_iter", f"ConvLocIter") # [N, ndim + 1]
code.arg("indices_in", f"const int*") # [N, ndim + 1] code.arg("indices_in", f"const int*") # [N, ndim + 1]
code.arg("indice_pairs_bwd", f"{self.dtype_indices}*") # [2, kernelProd, MaxSize] code.arg("indice_pairs_bwd",
code.arg("indice_pairs_for_uniq", f"{self.dtype_indices}*") # [2, kernelProd, MaxSize] f"{self.dtype_indices}*") # [2, kernelProd, MaxSize]
code.arg("indice_num_per_loc", f"int*") # [kernelProd] code.arg("indice_pairs_for_uniq",
f"{self.dtype_indices}*") # [2, kernelProd, MaxSize]
code.arg("indice_num_per_loc", f"int*") # [kernelProd]
code.arg("num_indices_in", "int") code.arg("num_indices_in", "int")
...@@ -381,11 +402,13 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass): ...@@ -381,11 +402,13 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
def calc_conv_indices_stage2_mask(self): def calc_conv_indices_stage2_mask(self):
code = pccm.FunctionCode() code = pccm.FunctionCode()
code.targ("TTable") code.targ("TTable")
code.arg("table", f"TTable") # [N, ndim + 1] code.arg("table", f"TTable") # [N, ndim + 1]
code.arg("indice_pairs_fwd", f"int*") # [kernelProd, MaxSize], inp -> out code.arg("indice_pairs_fwd",
code.arg("indice_pairs_bwd", f"int*") # [kernelProd, MaxSize], out -> inp f"int*") # [kernelProd, MaxSize], inp -> out
code.arg("mask_fwd", f"uint32_t*") # [kernelProd] code.arg("indice_pairs_bwd",
code.arg("mask_bwd", f"uint32_t*") # [kernelProd] f"int*") # [kernelProd, MaxSize], out -> inp
code.arg("mask_fwd", f"uint32_t*") # [kernelProd]
code.arg("mask_bwd", f"uint32_t*") # [kernelProd]
code.arg("num_indices_in", "int") code.arg("num_indices_in", "int")
code.arg("num_indices_out", "int") code.arg("num_indices_out", "int")
...@@ -418,8 +441,9 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass): ...@@ -418,8 +441,9 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
@pccm.cuda.cuda_global_function @pccm.cuda.cuda_global_function
def calc_conv_indices_stage2_mask_output(self): def calc_conv_indices_stage2_mask_output(self):
code = pccm.FunctionCode() code = pccm.FunctionCode()
code.arg("indice_pairs_bwd", f"int*") # [kernelProd, MaxSize], out -> inp code.arg("indice_pairs_bwd",
code.arg("mask_bwd", f"uint32_t*") # [kernelProd] f"int*") # [kernelProd, MaxSize], out -> inp
code.arg("mask_bwd", f"uint32_t*") # [kernelProd]
code.arg("num_indices_in", "int") code.arg("num_indices_in", "int")
code.arg("kv", "int") code.arg("kv", "int")
...@@ -441,10 +465,12 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass): ...@@ -441,10 +465,12 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
def calc_conv_indices_stage2_inference_mask(self): def calc_conv_indices_stage2_inference_mask(self):
code = pccm.FunctionCode() code = pccm.FunctionCode()
code.targ("TTable") code.targ("TTable")
code.arg("table", f"TTable") # [N, ndim + 1] code.arg("table", f"TTable") # [N, ndim + 1]
code.arg("indice_pairs_fwd", f"int*") # [kernelProd, MaxSize], inp -> out code.arg("indice_pairs_fwd",
code.arg("indice_pairs_bwd", f"int*") # [kernelProd, MaxSize], out -> inp f"int*") # [kernelProd, MaxSize], inp -> out
code.arg("mask_fwd", f"uint32_t*") # [kernelProd] code.arg("indice_pairs_bwd",
f"int*") # [kernelProd, MaxSize], out -> inp
code.arg("mask_fwd", f"uint32_t*") # [kernelProd]
code.arg("num_indices_in", "int") code.arg("num_indices_in", "int")
code.arg("num_indices_out", "int") code.arg("num_indices_out", "int")
...@@ -469,16 +495,15 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass): ...@@ -469,16 +495,15 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
""") """)
return code return code
@pccm.cuda.cuda_global_function @pccm.cuda.cuda_global_function
def build_subm_conv_hash_table(self): def build_subm_conv_hash_table(self):
code = pccm.FunctionCode() code = pccm.FunctionCode()
code.targ("TTable") code.targ("TTable")
code.arg("table", f"TTable") # [N, ndim + 1] code.arg("table", f"TTable") # [N, ndim + 1]
code.arg("indices_in", f"const int*") # [N, ndim + 1] code.arg("indices_in", f"const int*") # [N, ndim + 1]
code.arg("layout_npq", f"spinds::LayoutNPQ") code.arg("layout_npq", f"spinds::LayoutNPQ")
code.arg("num_indices", "int") code.arg("num_indices", "int")
...@@ -493,8 +518,8 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass): ...@@ -493,8 +518,8 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
@pccm.cuda.cuda_global_function @pccm.cuda.cuda_global_function
def clean_indices_uniq(self): def clean_indices_uniq(self):
code = pccm.FunctionCode() code = pccm.FunctionCode()
code.arg("indice_pairs_for_uniq", f"{self.dtype_indices}*") code.arg("indice_pairs_for_uniq", f"{self.dtype_indices}*")
code.arg("size", f"{self.dtype_indices}") code.arg("size", f"{self.dtype_indices}")
code.raw(f""" code.raw(f"""
for ({self.dtype_indices} i : tv::KernelLoopX<{self.dtype_indices}>(size)) {{ for ({self.dtype_indices} i : tv::KernelLoopX<{self.dtype_indices}>(size)) {{
indice_pairs_for_uniq[i] = std::numeric_limits<{self.dtype_indices}>::max(); indice_pairs_for_uniq[i] = std::numeric_limits<{self.dtype_indices}>::max();
...@@ -506,12 +531,13 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass): ...@@ -506,12 +531,13 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
def calc_subm_conv_indices(self): def calc_subm_conv_indices(self):
code = pccm.FunctionCode() code = pccm.FunctionCode()
code.targ("TTable") code.targ("TTable")
code.arg("loc_iter", f"ConvLocIter") # [N, ndim + 1] code.arg("loc_iter", f"ConvLocIter") # [N, ndim + 1]
code.arg("table", f"TTable") # [N, ndim + 1] code.arg("table", f"TTable") # [N, ndim + 1]
code.arg("indices_in", f"const int*") # [N, ndim + 1] code.arg("indices_in", f"const int*") # [N, ndim + 1]
code.arg("indice_pairs", f"{self.dtype_indices}*") # [2, kernelProd, MaxSize] code.arg("indice_pairs",
code.arg("indice_num_per_loc", f"int*") # [kernelProd] f"{self.dtype_indices}*") # [2, kernelProd, MaxSize]
code.arg("indice_num_per_loc", f"int*") # [kernelProd]
code.arg("num_indices_in", "int") code.arg("num_indices_in", "int")
code.arg("indices_pair_size", "int") code.arg("indices_pair_size", "int")
...@@ -552,12 +578,13 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass): ...@@ -552,12 +578,13 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
def calc_subm_conv_indices_mask(self): def calc_subm_conv_indices_mask(self):
code = pccm.FunctionCode() code = pccm.FunctionCode()
code.targ("TTable") code.targ("TTable")
code.arg("loc_iter", f"ConvLocIter") # [N, ndim + 1] code.arg("loc_iter", f"ConvLocIter") # [N, ndim + 1]
code.arg("table", f"TTable") # [N, ndim + 1] code.arg("table", f"TTable") # [N, ndim + 1]
code.arg("indices_in", f"const int*") # [N, ndim + 1] code.arg("indices_in", f"const int*") # [N, ndim + 1]
code.arg("indice_pairs", f"{self.dtype_indices}*") # [2, kernelProd, MaxSize] code.arg("indice_pairs",
code.arg("mask", f"uint32_t*") # [kernelProd] f"{self.dtype_indices}*") # [2, kernelProd, MaxSize]
code.arg("mask", f"uint32_t*") # [kernelProd]
code.arg("num_indices", "int") code.arg("num_indices", "int")
code.arg("indices_pair_size", "int") code.arg("indices_pair_size", "int")
...@@ -609,13 +636,14 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass): ...@@ -609,13 +636,14 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
def calc_subm_conv_indices_split_mask(self): def calc_subm_conv_indices_split_mask(self):
code = pccm.FunctionCode() code = pccm.FunctionCode()
code.targ("TTable") code.targ("TTable")
code.arg("loc_iter", f"ConvLocIter") # [N, ndim + 1] code.arg("loc_iter", f"ConvLocIter") # [N, ndim + 1]
code.arg("table", f"TTable") # [N, ndim + 1] code.arg("table", f"TTable") # [N, ndim + 1]
code.arg("indices_in", f"const int*") # [N, ndim + 1] code.arg("indices_in", f"const int*") # [N, ndim + 1]
code.arg("indice_pairs", f"{self.dtype_indices}*") # [2, kernelProd, MaxSize] code.arg("indice_pairs",
code.arg("mask1", f"uint32_t*") # [kernelProd] f"{self.dtype_indices}*") # [2, kernelProd, MaxSize]
code.arg("mask2", f"uint32_t*") # [kernelProd] code.arg("mask1", f"uint32_t*") # [kernelProd]
code.arg("mask2", f"uint32_t*") # [kernelProd]
code.arg("num_indices", "int") code.arg("num_indices", "int")
code.arg("indices_pair_size", "int") code.arg("indices_pair_size", "int")
...@@ -665,10 +693,12 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass): ...@@ -665,10 +693,12 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
def generate_conv_inds_stage1(self): def generate_conv_inds_stage1(self):
code = pccm.FunctionCode() code = pccm.FunctionCode()
code.arg("indices", "tv::Tensor") code.arg("indices", "tv::Tensor")
code.arg("indice_pairs, indice_pairs_uniq, indice_num_per_loc", "tv::Tensor") code.arg("indice_pairs, indice_pairs_uniq, indice_num_per_loc",
"tv::Tensor")
code.arg("batch_size", "int") code.arg("batch_size", "int")
code.arg("output_dims, input_dims", f"tv::array<int, {self.ndim}>") code.arg("output_dims, input_dims", f"tv::array<int, {self.ndim}>")
code.arg("ksize, stride, padding, dilation", f"tv::array<int, {self.ndim}>") code.arg("ksize, stride, padding, dilation",
f"tv::array<int, {self.ndim}>")
code.arg("transposed", f"bool", "false") code.arg("transposed", f"bool", "false")
code.arg("stream_int", f"std::uintptr_t", "0") code.arg("stream_int", f"std::uintptr_t", "0")
...@@ -706,9 +736,7 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass): ...@@ -706,9 +736,7 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
// auto num_out_act = new_end - ptr_tr - 1; // auto num_out_act = new_end - ptr_tr - 1;
// return num_out_act; // return num_out_act;
""") """)
return code# .ret("int") return code # .ret("int")
@pccm.cuda.static_function @pccm.cuda.static_function
def generate_conv_inds_stage1_5(self): def generate_conv_inds_stage1_5(self):
...@@ -726,7 +754,6 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass): ...@@ -726,7 +754,6 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
""") """)
return code.ret("int") return code.ret("int")
@pccm.cuda.static_function @pccm.cuda.static_function
def generate_conv_inds_stage2(self): def generate_conv_inds_stage2(self):
code = pccm.FunctionCode() code = pccm.FunctionCode()
...@@ -735,7 +762,8 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass): ...@@ -735,7 +762,8 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
code.arg("num_out_act", "int") code.arg("num_out_act", "int")
code.arg("batch_size", "int") code.arg("batch_size", "int")
code.arg("output_dims, input_dims", f"tv::array<int, {self.ndim}>") code.arg("output_dims, input_dims", f"tv::array<int, {self.ndim}>")
code.arg("ksize, stride, padding, dilation", f"tv::array<int, {self.ndim}>") code.arg("ksize, stride, padding, dilation",
f"tv::array<int, {self.ndim}>")
code.arg("transposed", f"bool", "false") code.arg("transposed", f"bool", "false")
code.arg("stream_int", f"std::uintptr_t", "0") code.arg("stream_int", f"std::uintptr_t", "0")
code.raw(f""" code.raw(f"""
...@@ -783,10 +811,12 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass): ...@@ -783,10 +811,12 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
def generate_conv_inds_mask_stage1(self): def generate_conv_inds_mask_stage1(self):
code = pccm.FunctionCode() code = pccm.FunctionCode()
code.arg("indices", "tv::Tensor") code.arg("indices", "tv::Tensor")
code.arg("indice_pairs_bwd, indice_pairs_uniq, indice_num_per_loc", "tv::Tensor") code.arg("indice_pairs_bwd, indice_pairs_uniq, indice_num_per_loc",
"tv::Tensor")
code.arg("batch_size", "int") code.arg("batch_size", "int")
code.arg("output_dims, input_dims", f"tv::array<int, {self.ndim}>") code.arg("output_dims, input_dims", f"tv::array<int, {self.ndim}>")
code.arg("ksize, stride, padding, dilation", f"tv::array<int, {self.ndim}>") code.arg("ksize, stride, padding, dilation",
f"tv::array<int, {self.ndim}>")
code.arg("transposed", f"bool", "false") code.arg("transposed", f"bool", "false")
code.arg("stream_int", f"std::uintptr_t", "0") code.arg("stream_int", f"std::uintptr_t", "0")
...@@ -817,21 +847,23 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass): ...@@ -817,21 +847,23 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
indice_pairs_bwd.data_ptr<{self.dtype_indices}>(), indice_pairs_bwd.data_ptr<{self.dtype_indices}>(),
indice_pairs_uniq.data_ptr<{self.dtype_indices}>(), indice_num_per_loc.data_ptr<int>(), indices.dim(0), indice_pairs_uniq.data_ptr<{self.dtype_indices}>(), indice_num_per_loc.data_ptr<int>(), indices.dim(0),
kv, transposed); kv, transposed);
auto timer = tv::CudaContextTimer<>();
""") """)
return code# .ret("int") return code # .ret("int")
@pccm.cuda.static_function @pccm.cuda.static_function
def generate_conv_inds_stage2_mask(self): def generate_conv_inds_stage2_mask(self):
code = pccm.FunctionCode() code = pccm.FunctionCode()
code.arg("indices, hashdata", "tv::Tensor") code.arg("indices, hashdata", "tv::Tensor")
code.arg("indice_pairs_fwd, indice_pairs_bwd, indice_pairs_uniq, out_inds", "tv::Tensor") code.arg(
"indice_pairs_fwd, indice_pairs_bwd, indice_pairs_uniq, out_inds",
"tv::Tensor")
code.arg("mask_fwd, mask_bwd", "tv::Tensor") code.arg("mask_fwd, mask_bwd", "tv::Tensor")
code.arg("num_out_act", "int") code.arg("num_out_act", "int")
code.arg("batch_size", "int") code.arg("batch_size", "int")
code.arg("output_dims, input_dims", f"tv::array<int, {self.ndim}>") code.arg("output_dims, input_dims", f"tv::array<int, {self.ndim}>")
code.arg("ksize, stride, padding, dilation", f"tv::array<int, {self.ndim}>") code.arg("ksize, stride, padding, dilation",
f"tv::array<int, {self.ndim}>")
code.arg("transposed", f"bool", "false") code.arg("transposed", f"bool", "false")
code.arg("stream_int", f"std::uintptr_t", "0") code.arg("stream_int", f"std::uintptr_t", "0")
code.raw(f""" code.raw(f"""
...@@ -903,7 +935,6 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass): ...@@ -903,7 +935,6 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
""") """)
return code.ret("int") return code.ret("int")
@pccm.cuda.static_function @pccm.cuda.static_function
def generate_subm_conv_inds(self): def generate_subm_conv_inds(self):
code = pccm.FunctionCode() code = pccm.FunctionCode()
...@@ -912,7 +943,8 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass): ...@@ -912,7 +943,8 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
code.arg("batch_size", "int") code.arg("batch_size", "int")
code.arg("input_dims", f"tv::array<int, {self.ndim}>") code.arg("input_dims", f"tv::array<int, {self.ndim}>")
code.arg("ksize, dilation", f"tv::array<int, {self.ndim}>") code.arg("ksize, dilation", f"tv::array<int, {self.ndim}>")
code.arg("indice_pair_mask", "tv::Tensor", "tv::Tensor()", "cumm.tensorview.Tensor = Tensor()") code.arg("indice_pair_mask", "tv::Tensor", "tv::Tensor()",
"cumm.tensorview.Tensor = Tensor()")
code.arg("backward", "bool", "false") code.arg("backward", "bool", "false")
code.arg("stream_int", f"std::uintptr_t", "0") code.arg("stream_int", f"std::uintptr_t", "0")
...@@ -993,6 +1025,7 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass): ...@@ -993,6 +1025,7 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
return code.ret("int") return code.ret("int")
class SparseConvIndicesCPU(pccm.ParameterizedClass): class SparseConvIndicesCPU(pccm.ParameterizedClass):
def __init__(self, problem: ConvProblem, dtype_indices: dtypes.DType): def __init__(self, problem: ConvProblem, dtype_indices: dtypes.DType):
super().__init__() super().__init__()
...@@ -1000,9 +1033,9 @@ class SparseConvIndicesCPU(pccm.ParameterizedClass): ...@@ -1000,9 +1033,9 @@ class SparseConvIndicesCPU(pccm.ParameterizedClass):
self.add_include("unordered_map") self.add_include("unordered_map")
self.loc_iter = ConvOutLocIter(problem) self.loc_iter = ConvOutLocIter(problem)
self.add_param_class("spinds", self.loc_iter, "ConvLocIter") self.add_param_class("spinds", self.loc_iter, "ConvLocIter")
self.add_param_class("spinds", problem, "ConvProblem") self.add_param_class("spinds", problem, "ConvProblem")
self.ndim = problem.ndim self.ndim = problem.ndim
self.dtype_indices = dtype_indices self.dtype_indices = dtype_indices
self.dtype_indices_uniq = dtype_indices self.dtype_indices_uniq = dtype_indices
...@@ -1016,7 +1049,7 @@ class SparseConvIndicesCPU(pccm.ParameterizedClass): ...@@ -1016,7 +1049,7 @@ class SparseConvIndicesCPU(pccm.ParameterizedClass):
code.arg("batch_size", "int") code.arg("batch_size", "int")
code.arg("input_dims", f"tv::array<int, {self.ndim}>") code.arg("input_dims", f"tv::array<int, {self.ndim}>")
code.arg("ksize, dilation", f"tv::array<int, {self.ndim}>") code.arg("ksize, dilation", f"tv::array<int, {self.ndim}>")
code.raw(f""" code.raw(f"""
tv::array<int, {self.ndim}> stride, padding; tv::array<int, {self.ndim}> stride, padding;
for (int i = 0; i < {self.ndim}; ++i){{ for (int i = 0; i < {self.ndim}; ++i){{
...@@ -1079,7 +1112,8 @@ class SparseConvIndicesCPU(pccm.ParameterizedClass): ...@@ -1079,7 +1112,8 @@ class SparseConvIndicesCPU(pccm.ParameterizedClass):
code.arg("indice_pairs, out_inds, indice_num_per_loc", "tv::Tensor") code.arg("indice_pairs, out_inds, indice_num_per_loc", "tv::Tensor")
code.arg("batch_size", "int") code.arg("batch_size", "int")
code.arg("output_dims, input_dims", f"tv::array<int, {self.ndim}>") code.arg("output_dims, input_dims", f"tv::array<int, {self.ndim}>")
code.arg("ksize, stride, padding, dilation", f"tv::array<int, {self.ndim}>") code.arg("ksize, stride, padding, dilation",
f"tv::array<int, {self.ndim}>")
code.arg("transposed", f"bool", "false") code.arg("transposed", f"bool", "false")
code.raw(f""" code.raw(f"""
int kv = tv::arrayops::prod(ksize); int kv = tv::arrayops::prod(ksize);
......
# Copyright 2021 Yan Yan # Copyright 2021 Yan Yan
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # You may obtain a copy of the License at
# #
# http://www.apache.org/licenses/LICENSE-2.0 # http://www.apache.org/licenses/LICENSE-2.0
# #
# Unless required by applicable law or agreed to in writing, software # Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, # distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...@@ -16,15 +16,18 @@ import contextlib ...@@ -16,15 +16,18 @@ import contextlib
from cumm.conv.bases import ConvEnum from cumm.conv.bases import ConvEnum
from cumm.gemm.core.metaarray import MetaArray, seq from cumm.gemm.core.metaarray import MetaArray, seq
from cumm import dtypes from cumm import dtypes
import pccm import pccm
from cumm.gemm.layout import TensorGeneric, to_stride from cumm.gemm.layout import TensorGeneric, to_stride
from cumm.common import TensorView, TensorViewHashKernel, TensorViewKernel, ThrustLib, GemmBasic from cumm.common import TensorView, TensorViewHashKernel, TensorViewKernel, ThrustLib, GemmBasic
from cumm.gemm import codeops from cumm.gemm import codeops
from typing import List from typing import List
from cumm.conv.params import ConvProblem from cumm.conv.params import ConvProblem
from cumm.gemm.mask_iters import MaskTileIterator, MaskTileIteratorParams from cumm.gemm.mask_iters import MaskTileIterator, MaskTileIteratorParams
import numpy as np import numpy as np
from cumm.gemm import (thread_map) from cumm.gemm import (thread_map)
from spconv.csrc.sparse.cpu_core import OMPLib
from cumm.constants import CUMM_CPU_ONLY_BUILD
class IndiceMaxPool(pccm.Class): class IndiceMaxPool(pccm.Class):
# TODO optimize this function # TODO optimize this function
...@@ -32,13 +35,13 @@ class IndiceMaxPool(pccm.Class): ...@@ -32,13 +35,13 @@ class IndiceMaxPool(pccm.Class):
super().__init__() super().__init__()
self.add_include("limits") self.add_include("limits")
self.add_dependency(TensorViewKernel, TensorView, GemmBasic) self.add_dependency(TensorViewKernel, TensorView, GemmBasic)
@pccm.cuda.cuda_global_function @pccm.cuda.cuda_global_function
def forward_kernel(self): def forward_kernel(self):
code = pccm.FunctionCode() code = pccm.FunctionCode()
code.targ("T") code.targ("T")
code.arg("out_features", f"T*") code.arg("out_features", f"T*")
code.arg("in_features", f"const T*") code.arg("in_features", f"const T*")
code.arg("out_indices", "const int*") code.arg("out_indices", "const int*")
code.arg("in_indices", "const int*") code.arg("in_indices", "const int*")
...@@ -67,7 +70,7 @@ class IndiceMaxPool(pccm.Class): ...@@ -67,7 +70,7 @@ class IndiceMaxPool(pccm.Class):
code = pccm.FunctionCode() code = pccm.FunctionCode()
code.targ("T") code.targ("T")
code.arg("out_features", f"T*") code.arg("out_features", f"T*")
code.arg("in_features", f"const T*") code.arg("in_features", f"const T*")
code.arg("indices", "const int*") code.arg("indices", "const int*")
code.arg("num_features", "int") code.arg("num_features", "int")
...@@ -104,9 +107,9 @@ class IndiceMaxPool(pccm.Class): ...@@ -104,9 +107,9 @@ class IndiceMaxPool(pccm.Class):
def backward_kernel(self): def backward_kernel(self):
code = pccm.FunctionCode() code = pccm.FunctionCode()
code.targ("T") code.targ("T")
code.arg("out_features", f"const T*") code.arg("out_features", f"const T*")
code.arg("in_features", f"const T*") code.arg("in_features", f"const T*")
code.arg("dout_features", f"const T*") code.arg("dout_features", f"const T*")
code.arg("din_features", f"T*") code.arg("din_features", f"T*")
code.arg("out_indices", "const int*") code.arg("out_indices", "const int*")
code.arg("in_indices", "const int*") code.arg("in_indices", "const int*")
...@@ -137,9 +140,9 @@ class IndiceMaxPool(pccm.Class): ...@@ -137,9 +140,9 @@ class IndiceMaxPool(pccm.Class):
code = pccm.FunctionCode() code = pccm.FunctionCode()
code.targ("T") code.targ("T")
code.arg("out_features", f"const T*") code.arg("out_features", f"const T*")
code.arg("in_features", f"const T*") code.arg("in_features", f"const T*")
code.arg("dout_features", f"const T*") code.arg("dout_features", f"const T*")
code.arg("din_features", f"T*") code.arg("din_features", f"T*")
code.arg("indices_bwd", "const int*") code.arg("indices_bwd", "const int*")
code.arg("num_features", "int") code.arg("num_features", "int")
...@@ -351,6 +354,9 @@ class IndiceMaxPoolCPU(pccm.Class): ...@@ -351,6 +354,9 @@ class IndiceMaxPoolCPU(pccm.Class):
def __init__(self): def __init__(self):
super().__init__() super().__init__()
self.add_dependency(TensorView) self.add_dependency(TensorView)
if CUMM_CPU_ONLY_BUILD:
self.add_dependency(OMPLib)
self.add_include("tensorview/parallel/all.h")
@pccm.static_function @pccm.static_function
def forward(self): def forward(self):
...@@ -371,20 +377,21 @@ class IndiceMaxPoolCPU(pccm.Class): ...@@ -371,20 +377,21 @@ class IndiceMaxPoolCPU(pccm.Class):
auto in_indices = in_inds.data_ptr<const int>(); auto in_indices = in_inds.data_ptr<const int>();
auto out_indices = out_inds.data_ptr<const int>(); auto out_indices = out_inds.data_ptr<const int>();
tv::kernel_1d(out.device(), nhot, [&](int begin, int end, int step){{
for (int i = 0; i < nhot; ++i) {{ for (int i = begin; i < end; i += step) {{
int in_idx = in_indices[i]; int in_idx = in_indices[i];
int out_idx = out_indices[i]; int out_idx = out_indices[i];
auto in_ptr = in_features + in_idx * num_features; auto in_ptr = in_features + in_idx * num_features;
auto out_ptr = out_features + out_idx * num_features; auto out_ptr = out_features + out_idx * num_features;
for (int j = 0; j < num_features; ++j) {{ for (int j = 0; j < num_features; ++j) {{
auto in = in_ptr[j]; auto in = in_ptr[j];
auto out = out_ptr[j]; auto out = out_ptr[j];
if (in > out){{ if (in > out){{
out_ptr[j] = in; out_ptr[j] = in;
}}
}} }}
}} }}
}} }});
}}); }});
""") """)
return code return code
...@@ -412,22 +419,24 @@ class IndiceMaxPoolCPU(pccm.Class): ...@@ -412,22 +419,24 @@ class IndiceMaxPoolCPU(pccm.Class):
auto in_indices = in_inds.data_ptr<const int>(); auto in_indices = in_inds.data_ptr<const int>();
auto out_indices = out_inds.data_ptr<const int>(); auto out_indices = out_inds.data_ptr<const int>();
tv::kernel_1d(out.device(), nhot, [&](int begin, int end, int step){{
for (int i = 0; i < nhot; ++i) {{ for (int i = begin; i < end; i += step) {{
int in_idx_offset = in_indices[i] * num_features; int in_idx_offset = in_indices[i] * num_features;
int out_idx_offset = out_indices[i] * num_features; int out_idx_offset = out_indices[i] * num_features;
auto in_ptr = in_features + in_idx_offset; auto in_ptr = in_features + in_idx_offset;
auto out_ptr = out_features + out_idx_offset; auto out_ptr = out_features + out_idx_offset;
auto din_ptr = din_features + in_idx_offset; auto din_ptr = din_features + in_idx_offset;
auto dout_ptr = dout_features + out_idx_offset; auto dout_ptr = dout_features + out_idx_offset;
for (int j = 0; j < num_features; ++j) {{ for (int j = 0; j < num_features; ++j) {{
auto in = in_ptr[j]; auto in = in_ptr[j];
auto out = out_ptr[j]; auto out = out_ptr[j];
if (in == out){{ if (in == out){{
din_ptr[j] = din_ptr[j] + dout_ptr[j]; din_ptr[j] = din_ptr[j] + dout_ptr[j];
}}
}} }}
}} }}
}} }});
}}); }});
""") """)
return code return code
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment