Commit 82fd7a8b authored by yan.yan
Browse files

v2.1.5: add profile tool and python 3.6 for linux

parent f31eee3a
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview import Tensor
class Point2VoxelCPU:
    """Interface stub for ``Point2VoxelCPU``.

    Every method body is a placeholder (``...``); only the signatures carry
    information. The per-argument notes restate the declared types, since
    the meaning of each value is not visible from this stub.
    """

    # Tensor-typed attributes declared on the class.
    densehashdata: Tensor
    voxels: Tensor
    indices: Tensor
    num_per_voxel: Tensor

    @property
    def grid_size(self) -> List[int]:
        """Read-only property typed as a list of ints."""
        ...

    @staticmethod
    def calc_meta_data(
        vsize_xyz: List[float],
        coors_range_xyz: List[float],
    ) -> Tuple[List[float], List[int], List[int], List[float]]:
        """Static method typed to return a 4-tuple of lists.

        Args:
            vsize_xyz: list of floats.
            coors_range_xyz: list of floats.
        """
        ...

    def __init__(
        self,
        vsize_xyz: List[float],
        coors_range_xyz: List[float],
        num_point_features: int,
        max_num_voxels: int,
        max_num_points_per_voxel: int,
    ) -> None:
        """Constructor signature.

        Args:
            vsize_xyz: list of floats.
            coors_range_xyz: list of floats.
            num_point_features: int.
            max_num_voxels: int.
            max_num_points_per_voxel: int.
        """
        ...

    @staticmethod
    def point_to_voxel_static(
        points: Tensor,
        voxels: Tensor,
        indices: Tensor,
        num_per_voxel: Tensor,
        densehashdata: Tensor,
        vsize: List[float],
        grid_size: List[int],
        grid_stride: List[int],
        coors_range: List[float],
        clear_voxels: bool = True,
    ) -> Tuple[Tensor, Tensor, Tensor]:
        """Static variant taking every buffer explicitly; typed to return three Tensors.

        Args:
            points: Tensor.
            voxels: Tensor.
            indices: Tensor.
            num_per_voxel: Tensor.
            densehashdata: Tensor.
            vsize: list of floats.
            grid_size: list of ints.
            grid_stride: list of ints.
            coors_range: list of floats.
            clear_voxels: bool, defaults to True.
        """
        ...

    @staticmethod
    def point_to_voxel_empty_mean_static(
        points: Tensor,
        voxels: Tensor,
        indices: Tensor,
        num_per_voxel: Tensor,
        densehashdata: Tensor,
        vsize: List[float],
        grid_size: List[int],
        grid_stride: List[int],
        coors_range: List[float],
        clear_voxels: bool = True,
    ) -> Tuple[Tensor, Tensor, Tensor]:
        """Static "empty mean" variant with the same parameter list as ``point_to_voxel_static``.

        Args:
            points: Tensor.
            voxels: Tensor.
            indices: Tensor.
            num_per_voxel: Tensor.
            densehashdata: Tensor.
            vsize: list of floats.
            grid_size: list of ints.
            grid_stride: list of ints.
            coors_range: list of floats.
            clear_voxels: bool, defaults to True.
        """
        ...

    def point_to_voxel(
        self,
        points: Tensor,
        clear_voxels: bool = True,
    ) -> Tuple[Tensor, Tensor, Tensor]:
        """Instance method typed to return three Tensors.

        Args:
            points: Tensor.
            clear_voxels: bool, defaults to True.
        """
        ...

    def point_to_voxel_empty_mean(
        self,
        points: Tensor,
        clear_voxels: bool = True,
    ) -> Tuple[Tensor, Tensor, Tensor]:
        """Instance "empty mean" method typed to return three Tensors.

        Args:
            points: Tensor.
            clear_voxels: bool, defaults to True.
        """
        ...
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
class Point2VoxelCommon:
    """Interface stub for ``Point2VoxelCommon``; the method body is a placeholder."""

    @staticmethod
    def calc_meta_data(
        vsize_xyz: List[float],
        coors_range_xyz: List[float],
    ) -> Tuple[List[float], List[int], List[int], List[float]]:
        """Static method typed to return a 4-tuple of lists.

        Args:
            vsize_xyz: list of floats.
            coors_range_xyz: list of floats.
        """
        ...
......@@ -9,14 +9,11 @@ class Point2VoxelCPU:
@property
def grid_size(self) -> List[int]: ...
@staticmethod
def calc_meta_data(vsize_xyz: List[float], coors_range_xyz: List[float], num_point_features: int, max_num_voxels: int, max_num_points_per_voxel: int) -> Tuple[List[float], List[int], List[int], List[float]]:
def calc_meta_data(vsize_xyz: List[float], coors_range_xyz: List[float]) -> Tuple[List[float], List[int], List[int], List[float]]:
"""
Args:
vsize_xyz:
coors_range_xyz:
num_point_features:
max_num_voxels:
max_num_points_per_voxel:
"""
...
def __init__(self, vsize_xyz: List[float], coors_range_xyz: List[float], num_point_features: int, max_num_voxels: int, max_num_points_per_voxel: int) -> None:
......@@ -30,7 +27,7 @@ class Point2VoxelCPU:
"""
...
@staticmethod
def point_to_voxel_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, mean_per_voxel: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
def point_to_voxel_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
"""
Args:
points:
......@@ -38,7 +35,6 @@ class Point2VoxelCPU:
indices:
num_per_voxel:
densehashdata:
mean_per_voxel:
vsize:
grid_size:
grid_stride:
......@@ -47,7 +43,7 @@ class Point2VoxelCPU:
"""
...
@staticmethod
def point_to_voxel_empty_mean_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, mean_per_voxel: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
def point_to_voxel_empty_mean_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
"""
Args:
points:
......@@ -55,7 +51,6 @@ class Point2VoxelCPU:
indices:
num_per_voxel:
densehashdata:
mean_per_voxel:
vsize:
grid_size:
grid_stride:
......
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview import Tensor
class Point2VoxelCPU:
    """Interface stub for ``Point2VoxelCPU``.

    Every method body is a placeholder (``...``); only the signatures carry
    information. The per-argument notes restate the declared types, since
    the meaning of each value is not visible from this stub.
    """

    # Tensor-typed attributes declared on the class.
    densehashdata: Tensor
    voxels: Tensor
    indices: Tensor
    num_per_voxel: Tensor

    @property
    def grid_size(self) -> List[int]:
        """Read-only property typed as a list of ints."""
        ...

    @staticmethod
    def calc_meta_data(
        vsize_xyz: List[float],
        coors_range_xyz: List[float],
    ) -> Tuple[List[float], List[int], List[int], List[float]]:
        """Static method typed to return a 4-tuple of lists.

        Args:
            vsize_xyz: list of floats.
            coors_range_xyz: list of floats.
        """
        ...

    def __init__(
        self,
        vsize_xyz: List[float],
        coors_range_xyz: List[float],
        num_point_features: int,
        max_num_voxels: int,
        max_num_points_per_voxel: int,
    ) -> None:
        """Constructor signature.

        Args:
            vsize_xyz: list of floats.
            coors_range_xyz: list of floats.
            num_point_features: int.
            max_num_voxels: int.
            max_num_points_per_voxel: int.
        """
        ...

    @staticmethod
    def point_to_voxel_static(
        points: Tensor,
        voxels: Tensor,
        indices: Tensor,
        num_per_voxel: Tensor,
        densehashdata: Tensor,
        vsize: List[float],
        grid_size: List[int],
        grid_stride: List[int],
        coors_range: List[float],
        clear_voxels: bool = True,
    ) -> Tuple[Tensor, Tensor, Tensor]:
        """Static variant taking every buffer explicitly; typed to return three Tensors.

        Args:
            points: Tensor.
            voxels: Tensor.
            indices: Tensor.
            num_per_voxel: Tensor.
            densehashdata: Tensor.
            vsize: list of floats.
            grid_size: list of ints.
            grid_stride: list of ints.
            coors_range: list of floats.
            clear_voxels: bool, defaults to True.
        """
        ...

    @staticmethod
    def point_to_voxel_empty_mean_static(
        points: Tensor,
        voxels: Tensor,
        indices: Tensor,
        num_per_voxel: Tensor,
        densehashdata: Tensor,
        vsize: List[float],
        grid_size: List[int],
        grid_stride: List[int],
        coors_range: List[float],
        clear_voxels: bool = True,
    ) -> Tuple[Tensor, Tensor, Tensor]:
        """Static "empty mean" variant with the same parameter list as ``point_to_voxel_static``.

        Args:
            points: Tensor.
            voxels: Tensor.
            indices: Tensor.
            num_per_voxel: Tensor.
            densehashdata: Tensor.
            vsize: list of floats.
            grid_size: list of ints.
            grid_stride: list of ints.
            coors_range: list of floats.
            clear_voxels: bool, defaults to True.
        """
        ...

    def point_to_voxel(
        self,
        points: Tensor,
        clear_voxels: bool = True,
    ) -> Tuple[Tensor, Tensor, Tensor]:
        """Instance method typed to return three Tensors.

        Args:
            points: Tensor.
            clear_voxels: bool, defaults to True.
        """
        ...

    def point_to_voxel_empty_mean(
        self,
        points: Tensor,
        clear_voxels: bool = True,
    ) -> Tuple[Tensor, Tensor, Tensor]:
        """Instance "empty mean" method typed to return three Tensors.

        Args:
            points: Tensor.
            clear_voxels: bool, defaults to True.
        """
        ...
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
class Point2VoxelCommon:
    """Interface stub for ``Point2VoxelCommon``; the method body is a placeholder."""

    @staticmethod
    def calc_meta_data(
        vsize_xyz: List[float],
        coors_range_xyz: List[float],
    ) -> Tuple[List[float], List[int], List[int], List[float]]:
        """Static method typed to return a 4-tuple of lists.

        Args:
            vsize_xyz: list of floats.
            coors_range_xyz: list of floats.
        """
        ...
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -2,6 +2,7 @@ from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Ty
from pccm.stubs import EnumValue, EnumClassValue
from ...cumm.gemm.main import GemmAlgoDesp
from cumm.tensorview import Tensor
from cumm.tensorview import CUDAKernelTimer
class ConvAlgoDesp(GemmAlgoDesp):
ndim: int
op_type: int
......@@ -86,17 +87,19 @@ class ConvParams:
mask_filter: int
reverse_mask: bool
verbose: bool
timer: CUDAKernelTimer
workspace: Tensor = Tensor()
mask: Tensor = Tensor()
mask_argsort: Tensor = Tensor()
indices: Tensor = Tensor()
mask_output: Tensor = Tensor()
stream: int
def __init__(self, ndim: int, op_type: int) -> None:
def __init__(self, ndim: int, op_type: int, timer: CUDAKernelTimer = CUDAKernelTimer(False)) -> None:
"""
Args:
ndim:
op_type:
timer:
"""
...
class ConvMainUnitTest:
......
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview import Tensor
from cumm.tensorview import CUDAKernelTimer
class GemmAlgoDesp:
dtype_a: int
dtype_b: int
......@@ -102,7 +103,13 @@ class GemmParams:
alpha: float
beta: float
stream: int
def __init__(self) -> None: ...
timer: CUDAKernelTimer
def __init__(self, timer: CUDAKernelTimer = CUDAKernelTimer(False)) -> None:
"""
Args:
timer:
"""
...
def check_valid(self) -> None: ...
@property
def a(self) -> Tensor: ...
......
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
class CUDAEvent:
    """Interface stub for ``CUDAEvent``; all bodies are placeholders."""

    def __init__(self, name: str) -> None:
        """Constructor signature.

        Args:
            name: str.
        """
        ...

    def record(self, stream: int = 0) -> None:
        """Signature of ``record``.

        Args:
            stream: int, defaults to 0.
        """
        ...

    def sync(self) -> None:
        """Signature of ``sync``; takes no arguments."""
        ...

    @staticmethod
    def duration(start: "CUDAEvent", stop: "CUDAEvent") -> float:
        """Static method typed to return a float for a pair of events.

        Args:
            start: a ``CUDAEvent``.
            stop: a ``CUDAEvent``.
        """
        ...
class CUDAKernelTimer:
    """Interface stub for ``CUDAKernelTimer``; all bodies are placeholders."""

    # Boolean attribute declared on the class.
    enable: bool

    def __init__(self, enable: bool = True) -> None:
        """Constructor signature.

        Args:
            enable: bool, defaults to True.
        """
        ...

    def push(self, name: str) -> None:
        """Signature of ``push``.

        Args:
            name: str.
        """
        ...

    def pop(self) -> None:
        """Signature of ``pop``; takes no arguments."""
        ...

    def record(self, name: str, stream: int = 0) -> None:
        """Signature of ``record``.

        Args:
            name: str.
            stream: int, defaults to 0.
        """
        ...

    def insert_pair(self, name: str, start: str, stop: str) -> None:
        """Signature of ``insert_pair``.

        Args:
            name: str.
            start: str.
            stop: str.
        """
        ...

    def get_all_pair_duration(self) -> Dict[str, float]:
        """Typed to return a mapping from str to float."""
        ...

    def sync(self) -> None:
        """Signature of ``sync``; takes no arguments."""
        ...
......@@ -11,4 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -11,4 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -24,6 +24,7 @@ from .indices import SparseConvIndicesKernel, CudaCommonKernel, SparseConvIndice
from .maxpool import IndiceMaxPool, IndiceMaxPoolCPU
from .gather import GatherCPU
class CustomThrustLib(pccm.Class):
def __init__(self):
super().__init__()
......@@ -32,12 +33,15 @@ class CustomThrustLib(pccm.Class):
if compat.InLinux:
self.build_meta.add_cflags("nvcc", "-Xcompiler", "-fno-gnu-unique")
class ThrustCustomAllocatorV2(pccm.Class, pccm.pybind.PybindClassMixin):
def __init__(self):
super().__init__()
self.add_dependency(TensorView)
self.add_include("functional", "memory")
self.add_pybind_member("alloc_func", "std::function<std::uintptr_t(std::size_t)>", pyanno="Callable[[int], int]")
self.add_pybind_member("alloc_func",
"std::function<std::uintptr_t(std::size_t)>",
pyanno="Callable[[int], int]")
self.add_typedef("value_type", "char")
@pccm.member_function
......@@ -63,6 +67,7 @@ class ThrustCustomAllocatorV2(pccm.Class, pccm.pybind.PybindClassMixin):
code.arg("num_bytes", "size_t")
return code
class SpconvOps(pccm.Class):
def __init__(self):
super().__init__()
......@@ -71,26 +76,36 @@ class SpconvOps(pccm.Class):
for ndim in self.ndims:
p2v = Point2Voxel(dtypes.float32, ndim)
p2v_cpu = Point2VoxelCPU(dtypes.float32, ndim)
self.add_param_class(f"ops_cpu{ndim}d", p2v_cpu, f"Point2Voxel{ndim}DCPU")
self.add_param_class(f"ops_cpu{ndim}d", p2v_cpu,
f"Point2Voxel{ndim}DCPU")
problem = ConvProblem(ndim, ConvOpType.kForward, NHWC, NHWC, NHWC)
indices = SparseConvIndicesKernel(problem, dtypes.int32)
indices_cpu = SparseConvIndicesCPU(problem, dtypes.int32)
self.add_param_class(f"ops_cpu{ndim}d", indices_cpu, f"SpconvIndicesCPU{ndim}D")
self.add_param_class(f"ops_cpu{ndim}d", indices_cpu,
f"SpconvIndicesCPU{ndim}D")
# self.add_param_class("ops", indices, "SpconvIndices")
if not CUMM_CPU_ONLY_BUILD:
self.add_param_class(f"ops{ndim}d", p2v, f"Point2Voxel{ndim}D")
cuda_funcs = [self.generate_subm_conv_inds,
self.generate_conv_inds_stage1, self.generate_conv_inds_stage1_5, self.generate_conv_inds_stage2, self.sort_1d_by_key,
self.generate_conv_inds_mask_stage1, self.generate_conv_inds_mask_stage2]
self.add_impl_only_param_class(cuda_funcs, f"ops{ndim}d", indices, f"SpconvIndices{ndim}D")
cuda_funcs = [
self.generate_subm_conv_inds,
self.generate_conv_inds_stage1,
self.generate_conv_inds_stage1_5,
self.generate_conv_inds_stage2, self.sort_1d_by_key,
self.generate_conv_inds_mask_stage1,
self.generate_conv_inds_mask_stage2
]
self.add_impl_only_param_class(cuda_funcs, f"ops{ndim}d",
indices,
f"SpconvIndices{ndim}D")
@pccm.pybind.mark
@pccm.cuda.static_function
def generate_conv_inds_stage1(self):
code = pccm.FunctionCode()
code.arg("indices", "tv::Tensor")
code.arg("indice_pairs, indice_pairs_uniq, indice_num_per_loc", "tv::Tensor")
code.arg("indice_pairs, indice_pairs_uniq, indice_num_per_loc",
"tv::Tensor")
code.arg("batch_size", "int")
code.arg("output_dims, input_dims", f"std::vector<int>")
code.arg("ksize, stride, padding, dilation", f"std::vector<int>")
......@@ -127,7 +142,7 @@ class SpconvOps(pccm.Class):
""")
code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""")
return code# .ret("int")
return code # .ret("int")
@pccm.pybind.mark
@pccm.cuda.static_function
......@@ -201,7 +216,8 @@ class SpconvOps(pccm.Class):
return code.make_invalid()
code.arg("indices", "tv::Tensor")
code.arg("indice_pairs_bwd, indice_pairs_uniq, indice_num_per_loc", "tv::Tensor")
code.arg("indice_pairs_bwd, indice_pairs_uniq, indice_num_per_loc",
"tv::Tensor")
code.arg("batch_size", "int")
code.arg("output_dims, input_dims", f"std::vector<int>")
code.arg("ksize, stride, padding, dilation", f"std::vector<int>")
......@@ -236,7 +252,7 @@ class SpconvOps(pccm.Class):
""")
code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""")
return code# .ret("int")
return code # .ret("int")
@pccm.pybind.mark
@pccm.cuda.static_function
......@@ -245,7 +261,9 @@ class SpconvOps(pccm.Class):
if CUMM_CPU_ONLY_BUILD:
return code.make_invalid()
code.arg("indices, hashdata", "tv::Tensor")
code.arg("indice_pairs_fwd, indice_pairs_bwd, indice_pairs_uniq, out_inds", "tv::Tensor")
code.arg(
"indice_pairs_fwd, indice_pairs_bwd, indice_pairs_uniq, out_inds",
"tv::Tensor")
code.arg("mask_fwd, mask_bwd", "tv::Tensor")
code.arg("num_out_act", "int")
code.arg("batch_size", "int")
......@@ -294,7 +312,8 @@ class SpconvOps(pccm.Class):
code.arg("batch_size", "int")
code.arg("input_dims", f"std::vector<int>")
code.arg("ksize, dilation", f"std::vector<int>")
code.arg("indice_pair_mask", "tv::Tensor", "tv::Tensor()", "cumm.tensorview.Tensor = Tensor()")
code.arg("indice_pair_mask", "tv::Tensor", "tv::Tensor()",
"cumm.tensorview.Tensor = Tensor()")
code.arg("backward", "bool", "false")
code.arg("stream_int", f"std::uintptr_t", "0", pyanno="int = 0")
code.raw(f"""
......@@ -529,7 +548,10 @@ class SpconvOps(pccm.Class):
if CUMM_CPU_ONLY_BUILD:
return code.make_invalid()
code.arg("data", "tv::Tensor")
code.arg("indices", "tv::Tensor", "tv::Tensor()", pyanno="cumm.tensorview.Tensor = Tensor()")
code.arg("indices",
"tv::Tensor",
"tv::Tensor()",
pyanno="cumm.tensorview.Tensor = Tensor()")
code.arg("stream", "std::uintptr_t", "0", pyanno="int")
code.code_after_include = f"""
template <typename T> struct SmallOrEqualTo {{
......@@ -575,7 +597,10 @@ class SpconvOps(pccm.Class):
code.arg("data", "tv::Tensor")
code.arg("alloc_func", "std::function<std::uintptr_t(std::size_t)>")
code.arg("indices", "tv::Tensor", "tv::Tensor()", pyanno="cumm.tensorview.Tensor = Tensor()")
code.arg("indices",
"tv::Tensor",
"tv::Tensor()",
pyanno="cumm.tensorview.Tensor = Tensor()")
code.arg("stream", "std::uintptr_t", "0", pyanno="int")
code.code_after_include = f"""
template <typename T> struct SmallOrEqualTo {{
......@@ -613,7 +638,6 @@ class SpconvOps(pccm.Class):
""")
return code.ret("tv::Tensor")
@pccm.pybind.mark
@pccm.cuda.static_function
def sort_1d_by_key_split(self):
......@@ -623,7 +647,10 @@ class SpconvOps(pccm.Class):
code.arg("data", "tv::Tensor")
code.arg("mask", "tv::Tensor")
code.arg("indices", "tv::Tensor", "tv::Tensor()", pyanno="cumm.tensorview.Tensor = Tensor()")
code.arg("indices",
"tv::Tensor",
"tv::Tensor()",
pyanno="cumm.tensorview.Tensor = Tensor()")
code.arg("stream", "std::uintptr_t", "0", pyanno="int")
code.arg("mask_output", "bool", "false")
......@@ -678,7 +705,10 @@ class SpconvOps(pccm.Class):
code.arg("mask", "tv::Tensor")
code.arg("indices", "tv::Tensor", "tv::Tensor()", pyanno="cumm.tensorview.Tensor = Tensor()")
code.arg("indices",
"tv::Tensor",
"tv::Tensor()",
pyanno="cumm.tensorview.Tensor = Tensor()")
code.arg("stream", "std::uintptr_t", "0", pyanno="int")
code.arg("mask_output", "bool", "false")
......@@ -821,8 +851,9 @@ class SpconvOps(pccm.Class):
}}
""")
code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""")
return code.ret("std::tuple<std::vector<float>, std::vector<int>, std::vector<int>, std::vector<float>>")
return code.ret(
"std::tuple<std::vector<float>, std::vector<int>, std::vector<int>, std::vector<float>>"
)
@pccm.pybind.mark
@pccm.static_function
......@@ -876,7 +907,8 @@ class SpconvOps(pccm.Class):
def point2voxel_cuda(self):
code = pccm.FunctionCode()
code.arg("points", "tv::Tensor")
code.arg("voxels, indices, num_per_voxel, hashdata, point_indice_data", "tv::Tensor")
code.arg("voxels, indices, num_per_voxel, hashdata, point_indice_data",
"tv::Tensor")
code.arg("vsize", f"std::vector<float>")
code.arg("grid_size, grid_stride", f"std::vector<int>")
code.arg("coors_range", f"std::vector<float>")
......
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pccm
from ccimport import compat
from cumm.common import TensorView
# pccm class that depends on TensorView, includes the tensorview parallel
# header, and registers an OpenMP compiler flag per compiler name.
class OMPLib(pccm.Class):
def __init__(self):
super().__init__()
# Register the TensorView dependency and the parallel helpers header.
self.add_dependency(TensorView)
self.add_include("tensorview/parallel/all.h")
# The "cl" compiler takes "/openmp"; g++ and clang++ take "-fopenmp".
if compat.InWindows:
self.build_meta.add_cflags("cl", "/openmp")
else:
self.build_meta.add_cflags("g++", "-fopenmp")
# NOTE(review): indentation was lost in this view; presumably this line
# also belongs to the else branch - confirm against the real file.
self.build_meta.add_cflags("clang++", "-fopenmp")
import torch
import time
def main():
arr = torch.randint(0, 130000, size=[130000]).to(torch.int32).cuda()
......
......@@ -14,12 +14,18 @@
import pccm
from cumm.common import TensorView
from cumm.constants import CUMM_CPU_ONLY_BUILD
from spconv.csrc.sparse.cpu_core import OMPLib
from typing import List
class GatherCPU(pccm.Class):
def __init__(self):
super().__init__()
if CUMM_CPU_ONLY_BUILD:
self.add_dependency(OMPLib)
self.add_dependency(TensorView)
self.add_include("tensorview/parallel/all.h")
@pccm.static_function
def gather(self):
......@@ -35,16 +41,17 @@ class GatherCPU(pccm.Class):
int channel = in.dim(1);
tv::dispatch<float, double>(out.dtype(), [&](auto I){{
auto indices_data = inds.data_ptr<const int>();
using T = TV_DECLTYPE(I);
T *buffer_data = out.data_ptr<T>();
const T *features_data = in.data_ptr<const T>();
for (int i = 0; i < nhot; ++i) {{
tv::kernel_1d(out.device(), nhot, [&](int begin, int end, int step){{
for (int i = begin; i < end; i += step) {{
std::memcpy(buffer_data + i * channel,
features_data + indices_data[i] * channel,
sizeof(T) * channel);
}}
}});
}});
""")
return code
......@@ -65,7 +72,8 @@ class GatherCPU(pccm.Class):
T *features_data = out.data_ptr<T>();
const T *buf = in.data_ptr<const T>();
T *out_ptr = out.data_ptr<T>();
for (int i = 0; i < nhot; ++i) {{
tv::kernel_1d(out.device(), nhot, [&](int begin, int end, int step){{
for (int i = begin; i < end; i += step) {{
buf = buffer_data + i * channel;
out_ptr = features_data + indices_data[i] * channel;
for (int j = 0; j < channel; ++j) {{
......@@ -73,5 +81,6 @@ class GatherCPU(pccm.Class):
}}
}}
}});
}});
""")
return code
......@@ -24,6 +24,7 @@ from typing import List
from cumm.conv.params import ConvProblem
import numpy as np
class CudaCommonKernel(pccm.ParameterizedClass):
# we need to use PClass instead of Class
# because cuda global function can't be put in class body.
......@@ -82,12 +83,14 @@ class ConvOutLocIter(pccm.ParameterizedClass):
pqs = codeops.unpack("problem.output_dims", range(self.ndim))
rss = codeops.unpack("problem.ksize", range(self.ndim))
code.ctor_init("layout_npq", f"LayoutNPQ::from_shape({{problem.N, {pqs}}})")
code.ctor_init("layout_npq",
f"LayoutNPQ::from_shape({{problem.N, {pqs}}})")
code.ctor_init("layout_rs", f"LayoutRS::from_shape({{{rss}}})")
return code
@pccm.member_function(header_only=True, attrs=["TV_HOST_DEVICE_INLINE"],
@pccm.member_function(header_only=True,
attrs=["TV_HOST_DEVICE_INLINE"],
name="operator++")
def increment(self):
code = pccm.FunctionCode()
......@@ -110,7 +113,8 @@ class ConvOutLocIter(pccm.ParameterizedClass):
""")
return code
@pccm.member_function(header_only=True, attrs=["TV_HOST_DEVICE_INLINE"],
@pccm.member_function(header_only=True,
attrs=["TV_HOST_DEVICE_INLINE"],
const=True)
def nhw_to_npq(self):
code = pccm.FunctionCode()
......@@ -128,7 +132,8 @@ class ConvOutLocIter(pccm.ParameterizedClass):
""")
return code.ret(f"tv::array<int, {self.ndim + 1}>")
@pccm.member_function(header_only=True, attrs=["TV_HOST_DEVICE_INLINE"],
@pccm.member_function(header_only=True,
attrs=["TV_HOST_DEVICE_INLINE"],
const=True)
def npq_to_nhw(self):
code = pccm.FunctionCode()
......@@ -144,8 +149,8 @@ class ConvOutLocIter(pccm.ParameterizedClass):
""")
return code.ret(f"tv::array<int, {self.ndim + 1}>")
@pccm.member_function(header_only=True, attrs=["TV_HOST_DEVICE_INLINE"],
@pccm.member_function(header_only=True,
attrs=["TV_HOST_DEVICE_INLINE"],
const=True)
def query_npq(self):
code = pccm.FunctionCode()
......@@ -159,10 +164,14 @@ class ConvOutLocIter(pccm.ParameterizedClass):
hw_valid = [] # type: List[str]
stride_valid = [] # type: List[str]
for i in range(self.ndim):
code.raw(f"npq_offset[{i + 1}] = npq_no_stride[{i + 1}] / problem_.stride[{i}];")
hw_valid.append((f"npq_offset[{i + 1}] >= 0 && "
code.raw(
f"npq_offset[{i + 1}] = npq_no_stride[{i + 1}] / problem_.stride[{i}];"
)
hw_valid.append(
(f"npq_offset[{i + 1}] >= 0 && "
f"npq_offset[{i + 1}] < problem_.output_dims[{i}]"))
stride_valid.append(f"!(npq_no_stride[{i + 1}] % problem_.stride[{i}])")
stride_valid.append(
f"!(npq_no_stride[{i + 1}] % problem_.stride[{i}])")
code.raw(f"""
return npq_no_stride[0] < problem_.N &&
{' && '.join(hw_valid)} &&
......@@ -170,7 +179,8 @@ class ConvOutLocIter(pccm.ParameterizedClass):
""")
return code
@pccm.member_function(header_only=True, attrs=["TV_HOST_DEVICE_INLINE"],
@pccm.member_function(header_only=True,
attrs=["TV_HOST_DEVICE_INLINE"],
const=True)
def query_npq_no_stride(self):
code = pccm.FunctionCode()
......@@ -182,7 +192,8 @@ class ConvOutLocIter(pccm.ParameterizedClass):
""")
hw_valid = [] # type: List[str]
for i in range(self.ndim):
hw_valid.append((f"npq_offset[{i + 1}] >= 0 && "
hw_valid.append(
(f"npq_offset[{i + 1}] >= 0 && "
f"npq_offset[{i + 1}] < problem_.output_dims[{i}]"))
code.raw(f"""
return npq_offset[0] < problem_.N &&
......@@ -190,7 +201,8 @@ class ConvOutLocIter(pccm.ParameterizedClass):
""")
return code
@pccm.member_function(header_only=True, attrs=["TV_HOST_DEVICE_INLINE"],
@pccm.member_function(header_only=True,
attrs=["TV_HOST_DEVICE_INLINE"],
const=True)
def query_nhw(self):
code = pccm.FunctionCode()
......@@ -202,7 +214,8 @@ class ConvOutLocIter(pccm.ParameterizedClass):
""")
hw_valid = [] # type: List[str]
for i in range(self.ndim):
hw_valid.append((f"nhw_offset[{i + 1}] >= 0 && "
hw_valid.append(
(f"nhw_offset[{i + 1}] >= 0 && "
f"nhw_offset[{i + 1}] < problem_.input_dims[{i}]"))
code.raw(f"""
return nhw_offset[0] < problem_.N &&
......@@ -210,7 +223,8 @@ class ConvOutLocIter(pccm.ParameterizedClass):
""")
return code
@pccm.member_function(header_only=True, attrs=["TV_HOST_DEVICE_INLINE"],
@pccm.member_function(header_only=True,
attrs=["TV_HOST_DEVICE_INLINE"],
const=True)
def query_nhw_out(self):
code = pccm.FunctionCode()
......@@ -222,7 +236,8 @@ class ConvOutLocIter(pccm.ParameterizedClass):
""")
hw_valid = [] # type: List[str]
for i in range(self.ndim):
hw_valid.append((f"nhw_offset[{i + 1}] >= 0 && "
hw_valid.append(
(f"nhw_offset[{i + 1}] >= 0 && "
f"nhw_offset[{i + 1}] < problem_.output_dims[{i}]"))
code.raw(f"""
return nhw_offset[0] < problem_.N &&
......@@ -230,10 +245,12 @@ class ConvOutLocIter(pccm.ParameterizedClass):
""")
return code
class SparseConvIndicesKernel(pccm.ParameterizedClass):
def __init__(self, problem: ConvProblem, dtype_indices: dtypes.DType):
super().__init__()
self.add_dependency(TensorView, TensorViewKernel, TensorViewHashKernel, ThrustLib)
self.add_dependency(TensorView, TensorViewKernel, TensorViewHashKernel,
ThrustLib)
self.loc_iter = ConvOutLocIter(problem)
self.add_param_class("spinds", self.loc_iter, "ConvLocIter")
self.add_param_class("spinds", problem, "ConvProblem")
......@@ -245,15 +262,16 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
assert dtype_indices == dtypes.int32 or dtype_indices == dtypes.int64
@pccm.cuda.cuda_global_function
def calc_conv_indices_stage1(self):
code = pccm.FunctionCode()
code.arg("loc_iter", f"ConvLocIter") # [N, ndim + 1]
code.arg("indices_in", f"const int*") # [N, ndim + 1]
code.arg("indice_pairs", f"{self.dtype_indices}*") # [2, kernelProd, MaxSize]
code.arg("indice_pairs_for_uniq", f"{self.dtype_indices}*") # [2, kernelProd, MaxSize]
code.arg("indice_pairs",
f"{self.dtype_indices}*") # [2, kernelProd, MaxSize]
code.arg("indice_pairs_for_uniq",
f"{self.dtype_indices}*") # [2, kernelProd, MaxSize]
code.arg("indice_num_per_loc", f"int*") # [kernelProd]
code.arg("num_indices_in", "int")
......@@ -288,7 +306,6 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
""")
return code
@pccm.cuda.cuda_global_function
def build_conv_hash_table(self):
code = pccm.FunctionCode()
......@@ -296,9 +313,11 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
code.arg("table", f"TTable") # [N, ndim + 1]
code.arg("indices_out", f"int*") # [N, ndim + 1]
code.arg("indice_pairs_for_uniq", f"const {self.dtype_indices}*") # [2, kernelProd, MaxSize]
code.arg("indice_pairs_for_uniq",
f"const {self.dtype_indices}*") # [2, kernelProd, MaxSize]
code.arg("layout_npq", f"spinds::LayoutNPQ") # [2, kernelProd, MaxSize]
code.arg("layout_npq",
f"spinds::LayoutNPQ") # [2, kernelProd, MaxSize]
code.arg("num_indices", "int")
......@@ -341,8 +360,10 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
code.arg("loc_iter", f"ConvLocIter") # [N, ndim + 1]
code.arg("indices_in", f"const int*") # [N, ndim + 1]
code.arg("indice_pairs_bwd", f"{self.dtype_indices}*") # [2, kernelProd, MaxSize]
code.arg("indice_pairs_for_uniq", f"{self.dtype_indices}*") # [2, kernelProd, MaxSize]
code.arg("indice_pairs_bwd",
f"{self.dtype_indices}*") # [2, kernelProd, MaxSize]
code.arg("indice_pairs_for_uniq",
f"{self.dtype_indices}*") # [2, kernelProd, MaxSize]
code.arg("indice_num_per_loc", f"int*") # [kernelProd]
code.arg("num_indices_in", "int")
......@@ -382,8 +403,10 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
code = pccm.FunctionCode()
code.targ("TTable")
code.arg("table", f"TTable") # [N, ndim + 1]
code.arg("indice_pairs_fwd", f"int*") # [kernelProd, MaxSize], inp -> out
code.arg("indice_pairs_bwd", f"int*") # [kernelProd, MaxSize], out -> inp
code.arg("indice_pairs_fwd",
f"int*") # [kernelProd, MaxSize], inp -> out
code.arg("indice_pairs_bwd",
f"int*") # [kernelProd, MaxSize], out -> inp
code.arg("mask_fwd", f"uint32_t*") # [kernelProd]
code.arg("mask_bwd", f"uint32_t*") # [kernelProd]
......@@ -418,7 +441,8 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
@pccm.cuda.cuda_global_function
def calc_conv_indices_stage2_mask_output(self):
code = pccm.FunctionCode()
code.arg("indice_pairs_bwd", f"int*") # [kernelProd, MaxSize], out -> inp
code.arg("indice_pairs_bwd",
f"int*") # [kernelProd, MaxSize], out -> inp
code.arg("mask_bwd", f"uint32_t*") # [kernelProd]
code.arg("num_indices_in", "int")
......@@ -442,8 +466,10 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
code = pccm.FunctionCode()
code.targ("TTable")
code.arg("table", f"TTable") # [N, ndim + 1]
code.arg("indice_pairs_fwd", f"int*") # [kernelProd, MaxSize], inp -> out
code.arg("indice_pairs_bwd", f"int*") # [kernelProd, MaxSize], out -> inp
code.arg("indice_pairs_fwd",
f"int*") # [kernelProd, MaxSize], inp -> out
code.arg("indice_pairs_bwd",
f"int*") # [kernelProd, MaxSize], out -> inp
code.arg("mask_fwd", f"uint32_t*") # [kernelProd]
code.arg("num_indices_in", "int")
code.arg("num_indices_out", "int")
......@@ -469,7 +495,6 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
""")
return code
@pccm.cuda.cuda_global_function
def build_subm_conv_hash_table(self):
code = pccm.FunctionCode()
......@@ -510,7 +535,8 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
code.arg("table", f"TTable") # [N, ndim + 1]
code.arg("indices_in", f"const int*") # [N, ndim + 1]
code.arg("indice_pairs", f"{self.dtype_indices}*") # [2, kernelProd, MaxSize]
code.arg("indice_pairs",
f"{self.dtype_indices}*") # [2, kernelProd, MaxSize]
code.arg("indice_num_per_loc", f"int*") # [kernelProd]
code.arg("num_indices_in", "int")
......@@ -556,7 +582,8 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
code.arg("table", f"TTable") # [N, ndim + 1]
code.arg("indices_in", f"const int*") # [N, ndim + 1]
code.arg("indice_pairs", f"{self.dtype_indices}*") # [2, kernelProd, MaxSize]
code.arg("indice_pairs",
f"{self.dtype_indices}*") # [2, kernelProd, MaxSize]
code.arg("mask", f"uint32_t*") # [kernelProd]
code.arg("num_indices", "int")
......@@ -613,7 +640,8 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
code.arg("table", f"TTable") # [N, ndim + 1]
code.arg("indices_in", f"const int*") # [N, ndim + 1]
code.arg("indice_pairs", f"{self.dtype_indices}*") # [2, kernelProd, MaxSize]
code.arg("indice_pairs",
f"{self.dtype_indices}*") # [2, kernelProd, MaxSize]
code.arg("mask1", f"uint32_t*") # [kernelProd]
code.arg("mask2", f"uint32_t*") # [kernelProd]
......@@ -665,10 +693,12 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
def generate_conv_inds_stage1(self):
code = pccm.FunctionCode()
code.arg("indices", "tv::Tensor")
code.arg("indice_pairs, indice_pairs_uniq, indice_num_per_loc", "tv::Tensor")
code.arg("indice_pairs, indice_pairs_uniq, indice_num_per_loc",
"tv::Tensor")
code.arg("batch_size", "int")
code.arg("output_dims, input_dims", f"tv::array<int, {self.ndim}>")
code.arg("ksize, stride, padding, dilation", f"tv::array<int, {self.ndim}>")
code.arg("ksize, stride, padding, dilation",
f"tv::array<int, {self.ndim}>")
code.arg("transposed", f"bool", "false")
code.arg("stream_int", f"std::uintptr_t", "0")
......@@ -706,9 +736,7 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
// auto num_out_act = new_end - ptr_tr - 1;
// return num_out_act;
""")
return code# .ret("int")
return code # .ret("int")
@pccm.cuda.static_function
def generate_conv_inds_stage1_5(self):
......@@ -726,7 +754,6 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
""")
return code.ret("int")
@pccm.cuda.static_function
def generate_conv_inds_stage2(self):
code = pccm.FunctionCode()
......@@ -735,7 +762,8 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
code.arg("num_out_act", "int")
code.arg("batch_size", "int")
code.arg("output_dims, input_dims", f"tv::array<int, {self.ndim}>")
code.arg("ksize, stride, padding, dilation", f"tv::array<int, {self.ndim}>")
code.arg("ksize, stride, padding, dilation",
f"tv::array<int, {self.ndim}>")
code.arg("transposed", f"bool", "false")
code.arg("stream_int", f"std::uintptr_t", "0")
code.raw(f"""
......@@ -783,10 +811,12 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
def generate_conv_inds_mask_stage1(self):
code = pccm.FunctionCode()
code.arg("indices", "tv::Tensor")
code.arg("indice_pairs_bwd, indice_pairs_uniq, indice_num_per_loc", "tv::Tensor")
code.arg("indice_pairs_bwd, indice_pairs_uniq, indice_num_per_loc",
"tv::Tensor")
code.arg("batch_size", "int")
code.arg("output_dims, input_dims", f"tv::array<int, {self.ndim}>")
code.arg("ksize, stride, padding, dilation", f"tv::array<int, {self.ndim}>")
code.arg("ksize, stride, padding, dilation",
f"tv::array<int, {self.ndim}>")
code.arg("transposed", f"bool", "false")
code.arg("stream_int", f"std::uintptr_t", "0")
......@@ -817,21 +847,23 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
indice_pairs_bwd.data_ptr<{self.dtype_indices}>(),
indice_pairs_uniq.data_ptr<{self.dtype_indices}>(), indice_num_per_loc.data_ptr<int>(), indices.dim(0),
kv, transposed);
auto timer = tv::CudaContextTimer<>();
""")
return code# .ret("int")
return code # .ret("int")
@pccm.cuda.static_function
def generate_conv_inds_stage2_mask(self):
code = pccm.FunctionCode()
code.arg("indices, hashdata", "tv::Tensor")
code.arg("indice_pairs_fwd, indice_pairs_bwd, indice_pairs_uniq, out_inds", "tv::Tensor")
code.arg(
"indice_pairs_fwd, indice_pairs_bwd, indice_pairs_uniq, out_inds",
"tv::Tensor")
code.arg("mask_fwd, mask_bwd", "tv::Tensor")
code.arg("num_out_act", "int")
code.arg("batch_size", "int")
code.arg("output_dims, input_dims", f"tv::array<int, {self.ndim}>")
code.arg("ksize, stride, padding, dilation", f"tv::array<int, {self.ndim}>")
code.arg("ksize, stride, padding, dilation",
f"tv::array<int, {self.ndim}>")
code.arg("transposed", f"bool", "false")
code.arg("stream_int", f"std::uintptr_t", "0")
code.raw(f"""
......@@ -903,7 +935,6 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
""")
return code.ret("int")
@pccm.cuda.static_function
def generate_subm_conv_inds(self):
code = pccm.FunctionCode()
......@@ -912,7 +943,8 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
code.arg("batch_size", "int")
code.arg("input_dims", f"tv::array<int, {self.ndim}>")
code.arg("ksize, dilation", f"tv::array<int, {self.ndim}>")
code.arg("indice_pair_mask", "tv::Tensor", "tv::Tensor()", "cumm.tensorview.Tensor = Tensor()")
code.arg("indice_pair_mask", "tv::Tensor", "tv::Tensor()",
"cumm.tensorview.Tensor = Tensor()")
code.arg("backward", "bool", "false")
code.arg("stream_int", f"std::uintptr_t", "0")
......@@ -993,6 +1025,7 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
return code.ret("int")
class SparseConvIndicesCPU(pccm.ParameterizedClass):
def __init__(self, problem: ConvProblem, dtype_indices: dtypes.DType):
super().__init__()
......@@ -1079,7 +1112,8 @@ class SparseConvIndicesCPU(pccm.ParameterizedClass):
code.arg("indice_pairs, out_inds, indice_num_per_loc", "tv::Tensor")
code.arg("batch_size", "int")
code.arg("output_dims, input_dims", f"tv::array<int, {self.ndim}>")
code.arg("ksize, stride, padding, dilation", f"tv::array<int, {self.ndim}>")
code.arg("ksize, stride, padding, dilation",
f"tv::array<int, {self.ndim}>")
code.arg("transposed", f"bool", "false")
code.raw(f"""
int kv = tv::arrayops::prod(ksize);
......
......@@ -25,6 +25,9 @@ from cumm.conv.params import ConvProblem
from cumm.gemm.mask_iters import MaskTileIterator, MaskTileIteratorParams
import numpy as np
from cumm.gemm import (thread_map)
from spconv.csrc.sparse.cpu_core import OMPLib
from cumm.constants import CUMM_CPU_ONLY_BUILD
class IndiceMaxPool(pccm.Class):
# TODO optimize this function
......@@ -351,6 +354,9 @@ class IndiceMaxPoolCPU(pccm.Class):
def __init__(self):
# Constructor: registers the dependencies and the include used by this class.
super().__init__()
# Always depend on TensorView.
self.add_dependency(TensorView)
# Add the OMPLib dependency only when CUMM_CPU_ONLY_BUILD is truthy.
if CUMM_CPU_ONLY_BUILD:
self.add_dependency(OMPLib)
# NOTE(review): indentation is not preserved in this view, so it is unclear
# whether this include is registered unconditionally or only inside the
# CPU-only branch above. The kernels of this class call tv::kernel_1d, which
# presumably is declared by this header -- TODO confirm against the real file.
self.add_include("tensorview/parallel/all.h")
@pccm.static_function
def forward(self):
......@@ -371,8 +377,8 @@ class IndiceMaxPoolCPU(pccm.Class):
auto in_indices = in_inds.data_ptr<const int>();
auto out_indices = out_inds.data_ptr<const int>();
for (int i = 0; i < nhot; ++i) {{
tv::kernel_1d(out.device(), nhot, [&](int begin, int end, int step){{
for (int i = begin; i < end; i += step) {{
int in_idx = in_indices[i];
int out_idx = out_indices[i];
auto in_ptr = in_features + in_idx * num_features;
......@@ -386,6 +392,7 @@ class IndiceMaxPoolCPU(pccm.Class):
}}
}}
}});
}});
""")
return code
......@@ -412,8 +419,8 @@ class IndiceMaxPoolCPU(pccm.Class):
auto in_indices = in_inds.data_ptr<const int>();
auto out_indices = out_inds.data_ptr<const int>();
for (int i = 0; i < nhot; ++i) {{
tv::kernel_1d(out.device(), nhot, [&](int begin, int end, int step){{
for (int i = begin; i < end; i += step) {{
int in_idx_offset = in_indices[i] * num_features;
int out_idx_offset = out_indices[i] * num_features;
auto in_ptr = in_features + in_idx_offset;
......@@ -429,5 +436,7 @@ class IndiceMaxPoolCPU(pccm.Class):
}}
}}
}});
}});
""")
return code
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment