Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
one
spconv
Commits
82fd7a8b
Commit
82fd7a8b
authored
Nov 10, 2021
by
yan.yan
Browse files
v2.1.5: add profile tool and python 3.6 for linux
parent
f31eee3a
Changes
80
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
407 additions
and
432 deletions
+407
-432
spconv/core_cc/csrc/sparse/all/ops_cpu3d/__init__.pyi
spconv/core_cc/csrc/sparse/all/ops_cpu3d/__init__.pyi
+0
-74
spconv/core_cc/csrc/sparse/all/ops_cpu3d/p2v_c.pyi
spconv/core_cc/csrc/sparse/all/ops_cpu3d/p2v_c.pyi
+0
-11
spconv/core_cc/csrc/sparse/all/ops_cpu4d.pyi
spconv/core_cc/csrc/sparse/all/ops_cpu4d.pyi
+3
-8
spconv/core_cc/csrc/sparse/all/ops_cpu4d/__init__.pyi
spconv/core_cc/csrc/sparse/all/ops_cpu4d/__init__.pyi
+0
-74
spconv/core_cc/csrc/sparse/all/ops_cpu4d/p2v_c.pyi
spconv/core_cc/csrc/sparse/all/ops_cpu4d/p2v_c.pyi
+0
-11
spconv/core_cc/cumm/__init__.pyi
spconv/core_cc/cumm/__init__.pyi
+0
-14
spconv/core_cc/cumm/conv/main.pyi
spconv/core_cc/cumm/conv/main.pyi
+4
-1
spconv/core_cc/cumm/gemm/__init__.pyi
spconv/core_cc/cumm/gemm/__init__.pyi
+0
-14
spconv/core_cc/cumm/gemm/main.pyi
spconv/core_cc/cumm/gemm/main.pyi
+8
-1
spconv/core_cc/cumm/tools/__init__.pyi
spconv/core_cc/cumm/tools/__init__.pyi
+0
-0
spconv/core_cc/cumm/tools/cuda.pyi
spconv/core_cc/cumm/tools/cuda.pyi
+56
-0
spconv/cppconstants.py
spconv/cppconstants.py
+4
-4
spconv/csrc/__init__.py
spconv/csrc/__init__.py
+3
-4
spconv/csrc/sparse/__init__.py
spconv/csrc/sparse/__init__.py
+3
-4
spconv/csrc/sparse/all.py
spconv/csrc/sparse/all.py
+62
-30
spconv/csrc/sparse/cpu_core.py
spconv/csrc/sparse/cpu_core.py
+29
-0
spconv/csrc/sparse/devleop/sort_bench.py
spconv/csrc/sparse/devleop/sort_bench.py
+4
-3
spconv/csrc/sparse/gather.py
spconv/csrc/sparse/gather.py
+27
-18
spconv/csrc/sparse/indices.py
spconv/csrc/sparse/indices.py
+156
-122
spconv/csrc/sparse/maxpool.py
spconv/csrc/sparse/maxpool.py
+48
-39
No files found.
spconv/core_cc/csrc/sparse/all/ops_cpu3d/__init__.pyi
deleted
100644 → 0
View file @
f31eee3a
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview import Tensor
class Point2VoxelCPU:
    """Type stub for the ``Point2VoxelCPU`` extension class.

    Declarations only: every body is ``...``; the implementation is not
    visible in this file.
    NOTE(review): the name suggests a CPU point-cloud voxelizer - confirm
    against the code that generates this stub.
    """
    # Tensor attributes exposed on instances. The static methods below take
    # tensors with the same names as explicit arguments.
    densehashdata: Tensor
    voxels: Tensor
    indices: Tensor
    num_per_voxel: Tensor
    # Read-only property returning a list of ints.
    @property
    def grid_size(self) -> List[int]: ...
    @staticmethod
    def calc_meta_data(vsize_xyz: List[float], coors_range_xyz: List[float]) -> Tuple[List[float], List[int], List[int], List[float]]:
        """Declare the static ``calc_meta_data`` binding.

        Args:
            vsize_xyz: list of floats; presumably the voxel size per axis
                in x, y, z order - TODO confirm.
            coors_range_xyz: list of floats; presumably the coordinate
                range bounds in xyz order - TODO confirm.

        Returns:
            A 4-tuple ``(List[float], List[int], List[int], List[float])``.
            NOTE(review): presumably (vsize, grid_size, grid_stride,
            coors_range), mirroring the trailing parameters of
            ``point_to_voxel_static`` - confirm.
        """
        ...
    def __init__(self, vsize_xyz: List[float], coors_range_xyz: List[float], num_point_features: int, max_num_voxels: int, max_num_points_per_voxel: int) -> None:
        """Declare the constructor.

        Args:
            vsize_xyz: list of floats (same parameter name as in
                ``calc_meta_data``).
            coors_range_xyz: list of floats (same parameter name as in
                ``calc_meta_data``).
            num_point_features: int; presumably the number of features
                stored per point - TODO confirm.
            max_num_voxels: int; presumably an upper bound on produced
                voxels - TODO confirm.
            max_num_points_per_voxel: int; presumably an upper bound on
                points kept per voxel - TODO confirm.
        """
        ...
    @staticmethod
    def point_to_voxel_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
        """Declare the static variant that takes all buffers explicitly.

        Args:
            points: input tensor.
            voxels: tensor; same name as the instance attribute.
            indices: tensor; same name as the instance attribute.
            num_per_voxel: tensor; same name as the instance attribute.
            densehashdata: tensor; same name as the instance attribute.
            vsize: list of floats.
            grid_size: list of ints.
            grid_stride: list of ints.
            coors_range: list of floats.
            clear_voxels: bool flag, defaults to True.

        Returns:
            A 3-tuple of tensors. NOTE(review): presumably (voxels,
            indices, num_per_voxel) - confirm against the implementation.
        """
        ...
    @staticmethod
    def point_to_voxel_empty_mean_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
        """Declare the static "empty mean" variant.

        The signature is identical to ``point_to_voxel_static``.
        NOTE(review): how the "empty_mean" behavior differs is not visible
        in this stub - confirm against the implementation.

        Args:
            points: input tensor.
            voxels: tensor; same name as the instance attribute.
            indices: tensor; same name as the instance attribute.
            num_per_voxel: tensor; same name as the instance attribute.
            densehashdata: tensor; same name as the instance attribute.
            vsize: list of floats.
            grid_size: list of ints.
            grid_stride: list of ints.
            coors_range: list of floats.
            clear_voxels: bool flag, defaults to True.

        Returns:
            A 3-tuple of tensors.
        """
        ...
    def point_to_voxel(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
        """Declare the instance variant of ``point_to_voxel_static``.

        Args:
            points: input tensor.
            clear_voxels: bool flag, defaults to True.

        Returns:
            A 3-tuple of tensors.
        """
        ...
    def point_to_voxel_empty_mean(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
        """Declare the instance variant of ``point_to_voxel_empty_mean_static``.

        Args:
            points: input tensor.
            clear_voxels: bool flag, defaults to True.

        Returns:
            A 3-tuple of tensors.
        """
        ...
spconv/core_cc/csrc/sparse/all/ops_cpu3d/p2v_c.pyi
deleted
100644 → 0
View file @
f31eee3a
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
class Point2VoxelCommon:
    """Type stub for the ``Point2VoxelCommon`` extension class.

    Declares a single static method; the implementation is not visible in
    this file.
    """
    @staticmethod
    def calc_meta_data(vsize_xyz: List[float], coors_range_xyz: List[float]) -> Tuple[List[float], List[int], List[int], List[float]]:
        """Declare the static ``calc_meta_data`` binding.

        Args:
            vsize_xyz: list of floats; presumably the voxel size per axis
                in x, y, z order - TODO confirm.
            coors_range_xyz: list of floats; presumably the coordinate
                range bounds in xyz order - TODO confirm.

        Returns:
            A 4-tuple ``(List[float], List[int], List[int], List[float])``.
            NOTE(review): the meaning of each element is not visible
            here - confirm against the implementation.
        """
        ...
spconv/core_cc/csrc/sparse/all/ops_cpu4d.pyi
View file @
82fd7a8b
...
...
@@ -9,14 +9,11 @@ class Point2VoxelCPU:
@property
def grid_size(self) -> List[int]: ...
@staticmethod
def calc_meta_data(vsize_xyz: List[float], coors_range_xyz: List[float]
, num_point_features: int, max_num_voxels: int, max_num_points_per_voxel: int
) -> Tuple[List[float], List[int], List[int], List[float]]:
def calc_meta_data(vsize_xyz: List[float], coors_range_xyz: List[float]) -> Tuple[List[float], List[int], List[int], List[float]]:
"""
Args:
vsize_xyz:
coors_range_xyz:
num_point_features:
max_num_voxels:
max_num_points_per_voxel:
"""
...
def __init__(self, vsize_xyz: List[float], coors_range_xyz: List[float], num_point_features: int, max_num_voxels: int, max_num_points_per_voxel: int) -> None:
...
...
@@ -30,7 +27,7 @@ class Point2VoxelCPU:
"""
...
@staticmethod
def point_to_voxel_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor,
mean_per_voxel: Tensor,
vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
def point_to_voxel_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
"""
Args:
points:
...
...
@@ -38,7 +35,6 @@ class Point2VoxelCPU:
indices:
num_per_voxel:
densehashdata:
mean_per_voxel:
vsize:
grid_size:
grid_stride:
...
...
@@ -47,7 +43,7 @@ class Point2VoxelCPU:
"""
...
@staticmethod
def point_to_voxel_empty_mean_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor,
mean_per_voxel: Tensor,
vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
def point_to_voxel_empty_mean_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
"""
Args:
points:
...
...
@@ -55,7 +51,6 @@ class Point2VoxelCPU:
indices:
num_per_voxel:
densehashdata:
mean_per_voxel:
vsize:
grid_size:
grid_stride:
...
...
spconv/core_cc/csrc/sparse/all/ops_cpu4d/__init__.pyi
deleted
100644 → 0
View file @
f31eee3a
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview import Tensor
class Point2VoxelCPU:
    """Type stub for the ``Point2VoxelCPU`` extension class.

    Declarations only: every body is ``...``; the implementation is not
    visible in this file.
    NOTE(review): the name suggests a CPU point-cloud voxelizer - confirm
    against the code that generates this stub.
    """
    # Tensor attributes exposed on instances. The static methods below take
    # tensors with the same names as explicit arguments.
    densehashdata: Tensor
    voxels: Tensor
    indices: Tensor
    num_per_voxel: Tensor
    # Read-only property returning a list of ints.
    @property
    def grid_size(self) -> List[int]: ...
    @staticmethod
    def calc_meta_data(vsize_xyz: List[float], coors_range_xyz: List[float]) -> Tuple[List[float], List[int], List[int], List[float]]:
        """Declare the static ``calc_meta_data`` binding.

        Args:
            vsize_xyz: list of floats; presumably the voxel size per axis
                in x, y, z order - TODO confirm.
            coors_range_xyz: list of floats; presumably the coordinate
                range bounds in xyz order - TODO confirm.

        Returns:
            A 4-tuple ``(List[float], List[int], List[int], List[float])``.
            NOTE(review): presumably (vsize, grid_size, grid_stride,
            coors_range), mirroring the trailing parameters of
            ``point_to_voxel_static`` - confirm.
        """
        ...
    def __init__(self, vsize_xyz: List[float], coors_range_xyz: List[float], num_point_features: int, max_num_voxels: int, max_num_points_per_voxel: int) -> None:
        """Declare the constructor.

        Args:
            vsize_xyz: list of floats (same parameter name as in
                ``calc_meta_data``).
            coors_range_xyz: list of floats (same parameter name as in
                ``calc_meta_data``).
            num_point_features: int; presumably the number of features
                stored per point - TODO confirm.
            max_num_voxels: int; presumably an upper bound on produced
                voxels - TODO confirm.
            max_num_points_per_voxel: int; presumably an upper bound on
                points kept per voxel - TODO confirm.
        """
        ...
    @staticmethod
    def point_to_voxel_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
        """Declare the static variant that takes all buffers explicitly.

        Args:
            points: input tensor.
            voxels: tensor; same name as the instance attribute.
            indices: tensor; same name as the instance attribute.
            num_per_voxel: tensor; same name as the instance attribute.
            densehashdata: tensor; same name as the instance attribute.
            vsize: list of floats.
            grid_size: list of ints.
            grid_stride: list of ints.
            coors_range: list of floats.
            clear_voxels: bool flag, defaults to True.

        Returns:
            A 3-tuple of tensors. NOTE(review): presumably (voxels,
            indices, num_per_voxel) - confirm against the implementation.
        """
        ...
    @staticmethod
    def point_to_voxel_empty_mean_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
        """Declare the static "empty mean" variant.

        The signature is identical to ``point_to_voxel_static``.
        NOTE(review): how the "empty_mean" behavior differs is not visible
        in this stub - confirm against the implementation.

        Args:
            points: input tensor.
            voxels: tensor; same name as the instance attribute.
            indices: tensor; same name as the instance attribute.
            num_per_voxel: tensor; same name as the instance attribute.
            densehashdata: tensor; same name as the instance attribute.
            vsize: list of floats.
            grid_size: list of ints.
            grid_stride: list of ints.
            coors_range: list of floats.
            clear_voxels: bool flag, defaults to True.

        Returns:
            A 3-tuple of tensors.
        """
        ...
    def point_to_voxel(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
        """Declare the instance variant of ``point_to_voxel_static``.

        Args:
            points: input tensor.
            clear_voxels: bool flag, defaults to True.

        Returns:
            A 3-tuple of tensors.
        """
        ...
    def point_to_voxel_empty_mean(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
        """Declare the instance variant of ``point_to_voxel_empty_mean_static``.

        Args:
            points: input tensor.
            clear_voxels: bool flag, defaults to True.

        Returns:
            A 3-tuple of tensors.
        """
        ...
spconv/core_cc/csrc/sparse/all/ops_cpu4d/p2v_c.pyi
deleted
100644 → 0
View file @
f31eee3a
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
class Point2VoxelCommon:
    """Type stub for the ``Point2VoxelCommon`` extension class.

    Declares a single static method; the implementation is not visible in
    this file.
    """
    @staticmethod
    def calc_meta_data(vsize_xyz: List[float], coors_range_xyz: List[float]) -> Tuple[List[float], List[int], List[int], List[float]]:
        """Declare the static ``calc_meta_data`` binding.

        Args:
            vsize_xyz: list of floats; presumably the voxel size per axis
                in x, y, z order - TODO confirm.
            coors_range_xyz: list of floats; presumably the coordinate
                range bounds in xyz order - TODO confirm.

        Returns:
            A 4-tuple ``(List[float], List[int], List[int], List[float])``.
            NOTE(review): the meaning of each element is not visible
            here - confirm against the implementation.
        """
        ...
spconv/core_cc/cumm/__init__.pyi
View file @
82fd7a8b
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
spconv/core_cc/cumm/conv/main.pyi
View file @
82fd7a8b
...
...
@@ -2,6 +2,7 @@ from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Ty
from pccm.stubs import EnumValue, EnumClassValue
from ...cumm.gemm.main import GemmAlgoDesp
from cumm.tensorview import Tensor
from cumm.tensorview import CUDAKernelTimer
class ConvAlgoDesp(GemmAlgoDesp):
ndim: int
op_type: int
...
...
@@ -86,17 +87,19 @@ class ConvParams:
mask_filter: int
reverse_mask: bool
verbose: bool
timer: CUDAKernelTimer
workspace: Tensor = Tensor()
mask: Tensor = Tensor()
mask_argsort: Tensor = Tensor()
indices: Tensor = Tensor()
mask_output: Tensor = Tensor()
stream: int
def __init__(self, ndim: int, op_type: int) -> None:
def __init__(self, ndim: int, op_type: int
, timer: CUDAKernelTimer = CUDAKernelTimer(False)
) -> None:
"""
Args:
ndim:
op_type:
timer:
"""
...
class ConvMainUnitTest:
...
...
spconv/core_cc/cumm/gemm/__init__.pyi
View file @
82fd7a8b
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
spconv/core_cc/cumm/gemm/main.pyi
View file @
82fd7a8b
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview import Tensor
from cumm.tensorview import CUDAKernelTimer
class GemmAlgoDesp:
dtype_a: int
dtype_b: int
...
...
@@ -102,7 +103,13 @@ class GemmParams:
alpha: float
beta: float
stream: int
def __init__(self) -> None: ...
timer: CUDAKernelTimer
def __init__(self, timer: CUDAKernelTimer = CUDAKernelTimer(False)) -> None:
"""
Args:
timer:
"""
...
def check_valid(self) -> None: ...
@property
def a(self) -> Tensor: ...
...
...
spconv/core_cc/cumm/tools/__init__.pyi
0 → 100644
View file @
82fd7a8b
spconv/core_cc/cumm/tools/cuda.pyi
0 → 100644
View file @
82fd7a8b
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
class CUDAEvent:
    """Type stub for the ``CUDAEvent`` extension class.

    Declarations only; the implementation is not visible in this file.
    NOTE(review): presumably wraps a CUDA event used for timing - confirm.
    """
    def __init__(self, name: str) -> None:
        """Declare the constructor.

        Args:
            name: string label for the event.
        """
        ...
    def record(self, stream: int = 0) -> None:
        """Declare ``record``.

        Args:
            stream: int, defaults to 0. NOTE(review): presumably a raw
                CUDA stream handle passed as an integer - confirm.
        """
        ...
    def sync(self) -> None: ...
    @staticmethod
    def duration(start: "CUDAEvent", stop: "CUDAEvent") -> float:
        """Declare the static ``duration`` helper.

        Args:
            start: first event.
            stop: second event.

        Returns:
            A float. NOTE(review): presumably the elapsed time between the
            two events; the unit is not visible here - confirm.
        """
        ...
class CUDAKernelTimer:
    """Type stub for the ``CUDAKernelTimer`` extension class.

    Declarations only; the implementation is not visible in this file.
    NOTE(review): presumably a named-event profiler for CUDA kernels -
    confirm against the implementation.
    """
    # Boolean attribute; same name as the constructor argument.
    enable: bool
    def __init__(self, enable: bool = True) -> None:
        """Declare the constructor.

        Args:
            enable: bool, defaults to True. NOTE(review): presumably
                turns timing on or off - confirm.
        """
        ...
    def push(self, name: str) -> None:
        """Declare ``push``.

        Args:
            name: string. NOTE(review): presumably opens a name scope
                that ``pop`` closes - confirm.
        """
        ...
    def pop(self) -> None: ...
    def record(self, name: str, stream: int = 0) -> None:
        """Declare ``record``.

        Args:
            name: string label.
            stream: int, defaults to 0. NOTE(review): presumably a raw
                CUDA stream handle passed as an integer - confirm.
        """
        ...
    def insert_pair(self, name: str, start: str, stop: str) -> None:
        """Declare ``insert_pair``.

        Args:
            name: string label for the pair.
            start: string. NOTE(review): presumably the name of a
                previously recorded start event - confirm.
            stop: string. NOTE(review): presumably the name of a
                previously recorded stop event - confirm.
        """
        ...
    # Returns a mapping from string keys to float values.
    # NOTE(review): presumably pair name -> measured duration; unit not
    # visible here - confirm.
    def get_all_pair_duration(self) -> Dict[str, float]: ...
    def sync(self) -> None: ...
spconv/cppconstants.py
View file @
82fd7a8b
# Copyright 2021 Yan Yan
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
...
@@ -17,4 +17,4 @@ import spconv.core_cc as _ext
if
hasattr
(
_ext
,
"cumm"
):
CPU_ONLY_BUILD
=
False
else
:
CPU_ONLY_BUILD
=
True
CPU_ONLY_BUILD
=
True
spconv/csrc/__init__.py
View file @
82fd7a8b
# Copyright 2021 Yan Yan
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
spconv/csrc/sparse/__init__.py
View file @
82fd7a8b
# Copyright 2021 Yan Yan
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
spconv/csrc/sparse/all.py
View file @
82fd7a8b
# Copyright 2021 Yan Yan
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
...
@@ -17,13 +17,14 @@ from cumm.conv.bases import ConvOpType, NHWC
from
cumm.conv.params
import
ConvProblem
from
cumm
import
dtypes
from
cumm.constants
import
CUMM_CPU_ONLY_BUILD
import
pccm
import
pccm
from
ccimport
import
compat
from
.pointops
import
Point2Voxel
,
Point2VoxelCPU
from
.indices
import
SparseConvIndicesKernel
,
CudaCommonKernel
,
SparseConvIndicesCPU
from
.maxpool
import
IndiceMaxPool
,
IndiceMaxPoolCPU
from
.gather
import
GatherCPU
class
CustomThrustLib
(
pccm
.
Class
):
def
__init__
(
self
):
super
().
__init__
()
...
...
@@ -32,12 +33,15 @@ class CustomThrustLib(pccm.Class):
if
compat
.
InLinux
:
self
.
build_meta
.
add_cflags
(
"nvcc"
,
"-Xcompiler"
,
"-fno-gnu-unique"
)
class
ThrustCustomAllocatorV2
(
pccm
.
Class
,
pccm
.
pybind
.
PybindClassMixin
):
def
__init__
(
self
):
super
().
__init__
()
self
.
add_dependency
(
TensorView
)
self
.
add_include
(
"functional"
,
"memory"
)
self
.
add_pybind_member
(
"alloc_func"
,
"std::function<std::uintptr_t(std::size_t)>"
,
pyanno
=
"Callable[[int], int]"
)
self
.
add_pybind_member
(
"alloc_func"
,
"std::function<std::uintptr_t(std::size_t)>"
,
pyanno
=
"Callable[[int], int]"
)
self
.
add_typedef
(
"value_type"
,
"char"
)
@
pccm
.
member_function
...
...
@@ -54,14 +58,15 @@ class ThrustCustomAllocatorV2(pccm.Class, pccm.pybind.PybindClassMixin):
TV_THROW_RT_ERR("set alloc function first.");
}}
"""
)
return
code
return
code
@
pccm
.
member_function
def
deallocate
(
self
):
code
=
pccm
.
FunctionCode
()
code
.
arg
(
"ptr"
,
"char *"
)
code
.
arg
(
"num_bytes"
,
"size_t"
)
return
code
return
code
class
SpconvOps
(
pccm
.
Class
):
def
__init__
(
self
):
...
...
@@ -69,28 +74,38 @@ class SpconvOps(pccm.Class):
self
.
add_dependency
(
ThrustCustomAllocatorV2
)
self
.
ndims
=
[
1
,
2
,
3
,
4
]
for
ndim
in
self
.
ndims
:
p2v
=
Point2Voxel
(
dtypes
.
float32
,
ndim
)
p2v
=
Point2Voxel
(
dtypes
.
float32
,
ndim
)
p2v_cpu
=
Point2VoxelCPU
(
dtypes
.
float32
,
ndim
)
self
.
add_param_class
(
f
"ops_cpu
{
ndim
}
d"
,
p2v_cpu
,
f
"Point2Voxel
{
ndim
}
DCPU"
)
self
.
add_param_class
(
f
"ops_cpu
{
ndim
}
d"
,
p2v_cpu
,
f
"Point2Voxel
{
ndim
}
DCPU"
)
problem
=
ConvProblem
(
ndim
,
ConvOpType
.
kForward
,
NHWC
,
NHWC
,
NHWC
)
indices
=
SparseConvIndicesKernel
(
problem
,
dtypes
.
int32
)
indices_cpu
=
SparseConvIndicesCPU
(
problem
,
dtypes
.
int32
)
self
.
add_param_class
(
f
"ops_cpu
{
ndim
}
d"
,
indices_cpu
,
f
"SpconvIndicesCPU
{
ndim
}
D"
)
self
.
add_param_class
(
f
"ops_cpu
{
ndim
}
d"
,
indices_cpu
,
f
"SpconvIndicesCPU
{
ndim
}
D"
)
# self.add_param_class("ops", indices, "SpconvIndices")
if
not
CUMM_CPU_ONLY_BUILD
:
self
.
add_param_class
(
f
"ops
{
ndim
}
d"
,
p2v
,
f
"Point2Voxel
{
ndim
}
D"
)
cuda_funcs
=
[
self
.
generate_subm_conv_inds
,
self
.
generate_conv_inds_stage1
,
self
.
generate_conv_inds_stage1_5
,
self
.
generate_conv_inds_stage2
,
self
.
sort_1d_by_key
,
self
.
generate_conv_inds_mask_stage1
,
self
.
generate_conv_inds_mask_stage2
]
self
.
add_impl_only_param_class
(
cuda_funcs
,
f
"ops
{
ndim
}
d"
,
indices
,
f
"SpconvIndices
{
ndim
}
D"
)
cuda_funcs
=
[
self
.
generate_subm_conv_inds
,
self
.
generate_conv_inds_stage1
,
self
.
generate_conv_inds_stage1_5
,
self
.
generate_conv_inds_stage2
,
self
.
sort_1d_by_key
,
self
.
generate_conv_inds_mask_stage1
,
self
.
generate_conv_inds_mask_stage2
]
self
.
add_impl_only_param_class
(
cuda_funcs
,
f
"ops
{
ndim
}
d"
,
indices
,
f
"SpconvIndices
{
ndim
}
D"
)
@
pccm
.
pybind
.
mark
@
pccm
.
cuda
.
static_function
def
generate_conv_inds_stage1
(
self
):
code
=
pccm
.
FunctionCode
()
code
.
arg
(
"indices"
,
"tv::Tensor"
)
code
.
arg
(
"indice_pairs, indice_pairs_uniq, indice_num_per_loc"
,
"tv::Tensor"
)
code
.
arg
(
"indice_pairs, indice_pairs_uniq, indice_num_per_loc"
,
"tv::Tensor"
)
code
.
arg
(
"batch_size"
,
"int"
)
code
.
arg
(
"output_dims, input_dims"
,
f
"std::vector<int>"
)
code
.
arg
(
"ksize, stride, padding, dilation"
,
f
"std::vector<int>"
)
...
...
@@ -127,7 +142,7 @@ class SpconvOps(pccm.Class):
"""
)
code
.
raw
(
f
"""TV_THROW_RT_ERR("unknown ndim", ndim);"""
)
return
code
# .ret("int")
return
code
# .ret("int")
@
pccm
.
pybind
.
mark
@
pccm
.
cuda
.
static_function
...
...
@@ -201,7 +216,8 @@ class SpconvOps(pccm.Class):
return
code
.
make_invalid
()
code
.
arg
(
"indices"
,
"tv::Tensor"
)
code
.
arg
(
"indice_pairs_bwd, indice_pairs_uniq, indice_num_per_loc"
,
"tv::Tensor"
)
code
.
arg
(
"indice_pairs_bwd, indice_pairs_uniq, indice_num_per_loc"
,
"tv::Tensor"
)
code
.
arg
(
"batch_size"
,
"int"
)
code
.
arg
(
"output_dims, input_dims"
,
f
"std::vector<int>"
)
code
.
arg
(
"ksize, stride, padding, dilation"
,
f
"std::vector<int>"
)
...
...
@@ -236,7 +252,7 @@ class SpconvOps(pccm.Class):
"""
)
code
.
raw
(
f
"""TV_THROW_RT_ERR("unknown ndim", ndim);"""
)
return
code
# .ret("int")
return
code
# .ret("int")
@
pccm
.
pybind
.
mark
@
pccm
.
cuda
.
static_function
...
...
@@ -245,7 +261,9 @@ class SpconvOps(pccm.Class):
if
CUMM_CPU_ONLY_BUILD
:
return
code
.
make_invalid
()
code
.
arg
(
"indices, hashdata"
,
"tv::Tensor"
)
code
.
arg
(
"indice_pairs_fwd, indice_pairs_bwd, indice_pairs_uniq, out_inds"
,
"tv::Tensor"
)
code
.
arg
(
"indice_pairs_fwd, indice_pairs_bwd, indice_pairs_uniq, out_inds"
,
"tv::Tensor"
)
code
.
arg
(
"mask_fwd, mask_bwd"
,
"tv::Tensor"
)
code
.
arg
(
"num_out_act"
,
"int"
)
code
.
arg
(
"batch_size"
,
"int"
)
...
...
@@ -294,7 +312,8 @@ class SpconvOps(pccm.Class):
code
.
arg
(
"batch_size"
,
"int"
)
code
.
arg
(
"input_dims"
,
f
"std::vector<int>"
)
code
.
arg
(
"ksize, dilation"
,
f
"std::vector<int>"
)
code
.
arg
(
"indice_pair_mask"
,
"tv::Tensor"
,
"tv::Tensor()"
,
"cumm.tensorview.Tensor = Tensor()"
)
code
.
arg
(
"indice_pair_mask"
,
"tv::Tensor"
,
"tv::Tensor()"
,
"cumm.tensorview.Tensor = Tensor()"
)
code
.
arg
(
"backward"
,
"bool"
,
"false"
)
code
.
arg
(
"stream_int"
,
f
"std::uintptr_t"
,
"0"
,
pyanno
=
"int = 0"
)
code
.
raw
(
f
"""
...
...
@@ -529,7 +548,10 @@ class SpconvOps(pccm.Class):
if
CUMM_CPU_ONLY_BUILD
:
return
code
.
make_invalid
()
code
.
arg
(
"data"
,
"tv::Tensor"
)
code
.
arg
(
"indices"
,
"tv::Tensor"
,
"tv::Tensor()"
,
pyanno
=
"cumm.tensorview.Tensor = Tensor()"
)
code
.
arg
(
"indices"
,
"tv::Tensor"
,
"tv::Tensor()"
,
pyanno
=
"cumm.tensorview.Tensor = Tensor()"
)
code
.
arg
(
"stream"
,
"std::uintptr_t"
,
"0"
,
pyanno
=
"int"
)
code
.
code_after_include
=
f
"""
template <typename T> struct SmallOrEqualTo {{
...
...
@@ -575,7 +597,10 @@ class SpconvOps(pccm.Class):
code
.
arg
(
"data"
,
"tv::Tensor"
)
code
.
arg
(
"alloc_func"
,
"std::function<std::uintptr_t(std::size_t)>"
)
code
.
arg
(
"indices"
,
"tv::Tensor"
,
"tv::Tensor()"
,
pyanno
=
"cumm.tensorview.Tensor = Tensor()"
)
code
.
arg
(
"indices"
,
"tv::Tensor"
,
"tv::Tensor()"
,
pyanno
=
"cumm.tensorview.Tensor = Tensor()"
)
code
.
arg
(
"stream"
,
"std::uintptr_t"
,
"0"
,
pyanno
=
"int"
)
code
.
code_after_include
=
f
"""
template <typename T> struct SmallOrEqualTo {{
...
...
@@ -613,7 +638,6 @@ class SpconvOps(pccm.Class):
"""
)
return
code
.
ret
(
"tv::Tensor"
)
@
pccm
.
pybind
.
mark
@
pccm
.
cuda
.
static_function
def
sort_1d_by_key_split
(
self
):
...
...
@@ -623,7 +647,10 @@ class SpconvOps(pccm.Class):
code
.
arg
(
"data"
,
"tv::Tensor"
)
code
.
arg
(
"mask"
,
"tv::Tensor"
)
code
.
arg
(
"indices"
,
"tv::Tensor"
,
"tv::Tensor()"
,
pyanno
=
"cumm.tensorview.Tensor = Tensor()"
)
code
.
arg
(
"indices"
,
"tv::Tensor"
,
"tv::Tensor()"
,
pyanno
=
"cumm.tensorview.Tensor = Tensor()"
)
code
.
arg
(
"stream"
,
"std::uintptr_t"
,
"0"
,
pyanno
=
"int"
)
code
.
arg
(
"mask_output"
,
"bool"
,
"false"
)
...
...
@@ -678,7 +705,10 @@ class SpconvOps(pccm.Class):
code
.
arg
(
"mask"
,
"tv::Tensor"
)
code
.
arg
(
"indices"
,
"tv::Tensor"
,
"tv::Tensor()"
,
pyanno
=
"cumm.tensorview.Tensor = Tensor()"
)
code
.
arg
(
"indices"
,
"tv::Tensor"
,
"tv::Tensor()"
,
pyanno
=
"cumm.tensorview.Tensor = Tensor()"
)
code
.
arg
(
"stream"
,
"std::uintptr_t"
,
"0"
,
pyanno
=
"int"
)
code
.
arg
(
"mask_output"
,
"bool"
,
"false"
)
...
...
@@ -821,9 +851,10 @@ class SpconvOps(pccm.Class):
}}
"""
)
code
.
raw
(
f
"""TV_THROW_RT_ERR("unknown ndim", ndim);"""
)
return
code
.
ret
(
"std::tuple<std::vector<float>, std::vector<int>, std::vector<int>, std::vector<float>>"
)
return
code
.
ret
(
"std::tuple<std::vector<float>, std::vector<int>, std::vector<int>, std::vector<float>>"
)
@
pccm
.
pybind
.
mark
@
pccm
.
static_function
def
point2voxel_cpu
(
self
):
...
...
@@ -876,7 +907,8 @@ class SpconvOps(pccm.Class):
def
point2voxel_cuda
(
self
):
code
=
pccm
.
FunctionCode
()
code
.
arg
(
"points"
,
"tv::Tensor"
)
code
.
arg
(
"voxels, indices, num_per_voxel, hashdata, point_indice_data"
,
"tv::Tensor"
)
code
.
arg
(
"voxels, indices, num_per_voxel, hashdata, point_indice_data"
,
"tv::Tensor"
)
code
.
arg
(
"vsize"
,
f
"std::vector<float>"
)
code
.
arg
(
"grid_size, grid_stride"
,
f
"std::vector<int>"
)
code
.
arg
(
"coors_range"
,
f
"std::vector<float>"
)
...
...
@@ -914,4 +946,4 @@ class SpconvOps(pccm.Class):
}}
"""
)
code
.
raw
(
f
"""TV_THROW_RT_ERR("unknown ndim", ndim);"""
)
return
code
.
ret
(
"std::tuple<tv::Tensor, tv::Tensor, tv::Tensor>"
)
\ No newline at end of file
return
code
.
ret
(
"std::tuple<tv::Tensor, tv::Tensor, tv::Tensor>"
)
spconv/csrc/sparse/cpu_core.py
0 → 100644
View file @
82fd7a8b
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
pccm
from
ccimport
import
compat
from
cumm.common
import
TensorView
class OMPLib(pccm.Class):
    """pccm class that carries the OpenMP build configuration.

    On construction it registers ``TensorView`` as a dependency, includes
    ``tensorview/parallel/all.h`` and adds the OpenMP compiler flag for
    the compilers of the current platform.
    """

    def __init__(self):
        super().__init__()
        self.add_dependency(TensorView)
        self.add_include("tensorview/parallel/all.h")
        # The flag spelling differs per compiler: "cl" takes /openmp,
        # while g++ and clang++ take -fopenmp.
        if compat.InWindows:
            openmp_flags = [("cl", "/openmp")]
        else:
            openmp_flags = [("g++", "-fopenmp"), ("clang++", "-fopenmp")]
        for compiler, flag in openmp_flags:
            self.build_meta.add_cflags(compiler, flag)
spconv/csrc/sparse/devleop/sort_bench.py
View file @
82fd7a8b
import
torch
import
time
import
torch
import
time
def
main
():
...
...
@@ -34,4 +35,4 @@ def main():
if
__name__
==
"__main__"
:
main
()
\ No newline at end of file
main
()
spconv/csrc/sparse/gather.py
View file @
82fd7a8b
# Copyright 2021 Yan Yan
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
pccm
import
pccm
from
cumm.common
import
TensorView
from
typing
import
List
from
cumm.constants
import
CUMM_CPU_ONLY_BUILD
from
spconv.csrc.sparse.cpu_core
import
OMPLib
from
typing
import
List
class
GatherCPU
(
pccm
.
Class
):
def
__init__
(
self
):
super
().
__init__
()
if
CUMM_CPU_ONLY_BUILD
:
self
.
add_dependency
(
OMPLib
)
self
.
add_dependency
(
TensorView
)
self
.
add_include
(
"tensorview/parallel/all.h"
)
@
pccm
.
static_function
def
gather
(
self
):
code
=
pccm
.
FunctionCode
()
...
...
@@ -35,15 +41,16 @@ class GatherCPU(pccm.Class):
int channel = in.dim(1);
tv::dispatch<float, double>(out.dtype(), [&](auto I){{
auto indices_data = inds.data_ptr<const int>();
using T = TV_DECLTYPE(I);
T *buffer_data = out.data_ptr<T>();
const T *features_data = in.data_ptr<const T>();
for (int i = 0; i < nhot; ++i) {{
std::memcpy(buffer_data + i * channel,
features_data + indices_data[i] * channel,
sizeof(T) * channel);
}}
tv::kernel_1d(out.device(), nhot, [&](int begin, int end, int step){{
for (int i = begin; i < end; i += step) {{
std::memcpy(buffer_data + i * channel,
features_data + indices_data[i] * channel,
sizeof(T) * channel);
}}
}});
}});
"""
)
return
code
...
...
@@ -65,13 +72,15 @@ class GatherCPU(pccm.Class):
T *features_data = out.data_ptr<T>();
const T *buf = in.data_ptr<const T>();
T *out_ptr = out.data_ptr<T>();
for (int i = 0; i < nhot; ++i) {{
buf = buffer_data + i * channel;
out_ptr = features_data + indices_data[i] * channel;
for (int j = 0; j < channel; ++j) {{
out_ptr[j] = out_ptr[j] + buf[j];
tv::kernel_1d(out.device(), nhot, [&](int begin, int end, int step){{
for (int i = begin; i < end; i += step) {{
buf = buffer_data + i * channel;
out_ptr = features_data + indices_data[i] * channel;
for (int j = 0; j < channel; ++j) {{
out_ptr[j] = out_ptr[j] + buf[j];
}}
}}
}}
}}
);
}});
"""
)
return
code
spconv/csrc/sparse/indices.py
View file @
82fd7a8b
# Copyright 2021 Yan Yan
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
...
@@ -16,13 +16,14 @@ import contextlib
from
cumm.conv.bases
import
ConvEnum
from
cumm.gemm.core.metaarray
import
MetaArray
,
seq
from
cumm
import
dtypes
import
pccm
import
pccm
from
cumm.gemm.layout
import
TensorGeneric
,
to_stride
from
cumm.common
import
TensorView
,
TensorViewHashKernel
,
TensorViewKernel
,
ThrustLib
from
cumm.gemm
import
codeops
from
typing
import
List
from
typing
import
List
from
cumm.conv.params
import
ConvProblem
import
numpy
as
np
import
numpy
as
np
class
CudaCommonKernel
(
pccm
.
ParameterizedClass
):
# we need to use PClass instead of Class
...
...
@@ -31,8 +32,8 @@ class CudaCommonKernel(pccm.ParameterizedClass):
def
arange_kernel
(
self
):
code
=
pccm
.
FunctionCode
()
code
.
targ
(
"T"
)
code
.
arg
(
"data"
,
f
"T*"
)
code
.
arg
(
"size"
,
f
"int"
)
code
.
arg
(
"data"
,
f
"T*"
)
code
.
arg
(
"size"
,
f
"int"
)
code
.
raw
(
f
"""
for (int i : tv::KernelLoopX<int>(size)) {{
data[i] = T(i);
...
...
@@ -44,9 +45,9 @@ class CudaCommonKernel(pccm.ParameterizedClass):
def
fill_kernel
(
self
):
code
=
pccm
.
FunctionCode
()
code
.
targ
(
"T"
)
code
.
arg
(
"data"
,
f
"T*"
)
code
.
arg
(
"data"
,
f
"T*"
)
code
.
arg
(
"val"
,
f
"T"
)
code
.
arg
(
"size"
,
f
"int"
)
code
.
arg
(
"size"
,
f
"int"
)
code
.
raw
(
f
"""
for (int i : tv::KernelLoopX<int>(size)) {{
data[i] = T(val);
...
...
@@ -66,7 +67,7 @@ class ConvOutLocIter(pccm.ParameterizedClass):
self
.
add_param_class
(
"lociter"
,
layout_npq
,
"LayoutNPQ"
)
self
.
add_param_class
(
"lociter_rs"
,
layout_rs
,
"LayoutRS"
)
self
.
ndim
=
problem
.
ndim
self
.
ndim
=
problem
.
ndim
self
.
add_member
(
"problem_"
,
f
"ConvProblem"
)
self
.
add_member
(
"count_"
,
f
"tv::array<int,
{
self
.
ndim
}
>"
)
self
.
add_member
(
"layout_npq"
,
f
"LayoutNPQ"
)
...
...
@@ -82,13 +83,15 @@ class ConvOutLocIter(pccm.ParameterizedClass):
pqs
=
codeops
.
unpack
(
"problem.output_dims"
,
range
(
self
.
ndim
))
rss
=
codeops
.
unpack
(
"problem.ksize"
,
range
(
self
.
ndim
))
code
.
ctor_init
(
"layout_npq"
,
f
"LayoutNPQ::from_shape({{problem.N,
{
pqs
}
}})"
)
code
.
ctor_init
(
"layout_npq"
,
f
"LayoutNPQ::from_shape({{problem.N,
{
pqs
}
}})"
)
code
.
ctor_init
(
"layout_rs"
,
f
"LayoutRS::from_shape({{
{
rss
}
}})"
)
return
code
@
pccm
.
member_function
(
header_only
=
True
,
attrs
=
[
"TV_HOST_DEVICE_INLINE"
],
name
=
"operator++"
)
return
code
@
pccm
.
member_function
(
header_only
=
True
,
attrs
=
[
"TV_HOST_DEVICE_INLINE"
],
name
=
"operator++"
)
def
increment
(
self
):
code
=
pccm
.
FunctionCode
()
for
i
in
range
(
self
.
ndim
-
1
,
-
1
,
-
1
):
...
...
@@ -110,8 +113,9 @@ class ConvOutLocIter(pccm.ParameterizedClass):
"""
)
return
code
@
pccm
.
member_function
(
header_only
=
True
,
attrs
=
[
"TV_HOST_DEVICE_INLINE"
],
const
=
True
)
@
pccm
.
member_function
(
header_only
=
True
,
attrs
=
[
"TV_HOST_DEVICE_INLINE"
],
const
=
True
)
def
nhw_to_npq
(
self
):
code
=
pccm
.
FunctionCode
()
code
.
arg
(
"nhw_offset"
,
"const int*"
)
...
...
@@ -128,8 +132,9 @@ class ConvOutLocIter(pccm.ParameterizedClass):
"""
)
return
code
.
ret
(
f
"tv::array<int,
{
self
.
ndim
+
1
}
>"
)
@
pccm
.
member_function
(
header_only
=
True
,
attrs
=
[
"TV_HOST_DEVICE_INLINE"
],
const
=
True
)
@
pccm
.
member_function
(
header_only
=
True
,
attrs
=
[
"TV_HOST_DEVICE_INLINE"
],
const
=
True
)
def
npq_to_nhw
(
self
):
code
=
pccm
.
FunctionCode
()
code
.
arg
(
"npq_offset"
,
"const int*"
)
...
...
@@ -144,9 +149,9 @@ class ConvOutLocIter(pccm.ParameterizedClass):
"""
)
return
code
.
ret
(
f
"tv::array<int,
{
self
.
ndim
+
1
}
>"
)
@
pccm
.
member_function
(
header_only
=
True
,
attrs
=
[
"TV_HOST_DEVICE_INLINE"
],
const
=
True
)
@
pccm
.
member_function
(
header_only
=
True
,
attrs
=
[
"TV_HOST_DEVICE_INLINE"
],
const
=
True
)
def
query_npq
(
self
):
code
=
pccm
.
FunctionCode
()
code
.
arg
(
"nhw_offset"
,
"const int*"
)
...
...
@@ -156,22 +161,27 @@ class ConvOutLocIter(pccm.ParameterizedClass):
auto npq_no_stride = nhw_to_npq<true>(nhw_offset);
npq_offset[0] = npq_no_stride[0];
"""
)
hw_valid
=
[]
# type: List[str]
stride_valid
=
[]
# type: List[str]
hw_valid
=
[]
# type: List[str]
stride_valid
=
[]
# type: List[str]
for
i
in
range
(
self
.
ndim
):
code
.
raw
(
f
"npq_offset[
{
i
+
1
}
] = npq_no_stride[
{
i
+
1
}
] / problem_.stride[
{
i
}
];"
)
hw_valid
.
append
((
f
"npq_offset[
{
i
+
1
}
] >= 0 && "
f
"npq_offset[
{
i
+
1
}
] < problem_.output_dims[
{
i
}
]"
))
stride_valid
.
append
(
f
"!(npq_no_stride[
{
i
+
1
}
] % problem_.stride[
{
i
}
])"
)
code
.
raw
(
f
"npq_offset[
{
i
+
1
}
] = npq_no_stride[
{
i
+
1
}
] / problem_.stride[
{
i
}
];"
)
hw_valid
.
append
(
(
f
"npq_offset[
{
i
+
1
}
] >= 0 && "
f
"npq_offset[
{
i
+
1
}
] < problem_.output_dims[
{
i
}
]"
))
stride_valid
.
append
(
f
"!(npq_no_stride[
{
i
+
1
}
] % problem_.stride[
{
i
}
])"
)
code
.
raw
(
f
"""
return npq_no_stride[0] < problem_.N &&
{
' && '
.
join
(
hw_valid
)
}
&&
{
' && '
.
join
(
stride_valid
)
}
;
"""
)
return
code
return
code
@
pccm
.
member_function
(
header_only
=
True
,
attrs
=
[
"TV_HOST_DEVICE_INLINE"
],
const
=
True
)
@
pccm
.
member_function
(
header_only
=
True
,
attrs
=
[
"TV_HOST_DEVICE_INLINE"
],
const
=
True
)
def
query_npq_no_stride
(
self
):
code
=
pccm
.
FunctionCode
()
code
.
arg
(
"nhw_offset"
,
"const int*"
)
...
...
@@ -180,18 +190,20 @@ class ConvOutLocIter(pccm.ParameterizedClass):
code
.
raw
(
f
"""
npq_offset = nhw_to_npq<true>(nhw_offset);
"""
)
hw_valid
=
[]
# type: List[str]
hw_valid
=
[]
# type: List[str]
for
i
in
range
(
self
.
ndim
):
hw_valid
.
append
((
f
"npq_offset[
{
i
+
1
}
] >= 0 && "
f
"npq_offset[
{
i
+
1
}
] < problem_.output_dims[
{
i
}
]"
))
hw_valid
.
append
(
(
f
"npq_offset[
{
i
+
1
}
] >= 0 && "
f
"npq_offset[
{
i
+
1
}
] < problem_.output_dims[
{
i
}
]"
))
code
.
raw
(
f
"""
return npq_offset[0] < problem_.N &&
{
' && '
.
join
(
hw_valid
)
}
;
"""
)
return
code
return
code
@
pccm
.
member_function
(
header_only
=
True
,
attrs
=
[
"TV_HOST_DEVICE_INLINE"
],
const
=
True
)
@
pccm
.
member_function
(
header_only
=
True
,
attrs
=
[
"TV_HOST_DEVICE_INLINE"
],
const
=
True
)
def
query_nhw
(
self
):
code
=
pccm
.
FunctionCode
()
code
.
arg
(
"npq_offset"
,
"const int*"
)
...
...
@@ -200,18 +212,20 @@ class ConvOutLocIter(pccm.ParameterizedClass):
code
.
raw
(
f
"""
nhw_offset = npq_to_nhw(npq_offset);
"""
)
hw_valid
=
[]
# type: List[str]
hw_valid
=
[]
# type: List[str]
for
i
in
range
(
self
.
ndim
):
hw_valid
.
append
((
f
"nhw_offset[
{
i
+
1
}
] >= 0 && "
f
"nhw_offset[
{
i
+
1
}
] < problem_.input_dims[
{
i
}
]"
))
hw_valid
.
append
(
(
f
"nhw_offset[
{
i
+
1
}
] >= 0 && "
f
"nhw_offset[
{
i
+
1
}
] < problem_.input_dims[
{
i
}
]"
))
code
.
raw
(
f
"""
return nhw_offset[0] < problem_.N &&
{
' && '
.
join
(
hw_valid
)
}
;
"""
)
return
code
return
code
@
pccm
.
member_function
(
header_only
=
True
,
attrs
=
[
"TV_HOST_DEVICE_INLINE"
],
const
=
True
)
@
pccm
.
member_function
(
header_only
=
True
,
attrs
=
[
"TV_HOST_DEVICE_INLINE"
],
const
=
True
)
def
query_nhw_out
(
self
):
code
=
pccm
.
FunctionCode
()
code
.
arg
(
"npq_offset"
,
"const int*"
)
...
...
@@ -220,41 +234,45 @@ class ConvOutLocIter(pccm.ParameterizedClass):
code
.
raw
(
f
"""
nhw_offset = npq_to_nhw(npq_offset);
"""
)
hw_valid
=
[]
# type: List[str]
hw_valid
=
[]
# type: List[str]
for
i
in
range
(
self
.
ndim
):
hw_valid
.
append
((
f
"nhw_offset[
{
i
+
1
}
] >= 0 && "
f
"nhw_offset[
{
i
+
1
}
] < problem_.output_dims[
{
i
}
]"
))
hw_valid
.
append
(
(
f
"nhw_offset[
{
i
+
1
}
] >= 0 && "
f
"nhw_offset[
{
i
+
1
}
] < problem_.output_dims[
{
i
}
]"
))
code
.
raw
(
f
"""
return nhw_offset[0] < problem_.N &&
{
' && '
.
join
(
hw_valid
)
}
;
"""
)
return
code
return
code
class
SparseConvIndicesKernel
(
pccm
.
ParameterizedClass
):
def
__init__
(
self
,
problem
:
ConvProblem
,
dtype_indices
:
dtypes
.
DType
):
super
().
__init__
()
self
.
add_dependency
(
TensorView
,
TensorViewKernel
,
TensorViewHashKernel
,
ThrustLib
)
self
.
add_dependency
(
TensorView
,
TensorViewKernel
,
TensorViewHashKernel
,
ThrustLib
)
self
.
loc_iter
=
ConvOutLocIter
(
problem
)
self
.
add_param_class
(
"spinds"
,
self
.
loc_iter
,
"ConvLocIter"
)
self
.
add_param_class
(
"spinds"
,
problem
,
"ConvProblem"
)
self
.
add_param_class
(
"cudakers"
,
CudaCommonKernel
())
self
.
add_param_class
(
"spinds"
,
problem
,
"ConvProblem"
)
self
.
add_param_class
(
"cudakers"
,
CudaCommonKernel
())
self
.
ndim
=
problem
.
ndim
self
.
ndim
=
problem
.
ndim
self
.
dtype_indices
=
dtype_indices
self
.
dtype_indices_uniq
=
dtype_indices
assert
dtype_indices
==
dtypes
.
int32
or
dtype_indices
==
dtypes
.
int64
@
pccm
.
cuda
.
cuda_global_function
def
calc_conv_indices_stage1
(
self
):
code
=
pccm
.
FunctionCode
()
code
.
arg
(
"loc_iter"
,
f
"ConvLocIter"
)
# [N, ndim + 1]
code
.
arg
(
"loc_iter"
,
f
"ConvLocIter"
)
# [N, ndim + 1]
code
.
arg
(
"indices_in"
,
f
"const int*"
)
# [N, ndim + 1]
code
.
arg
(
"indice_pairs"
,
f
"
{
self
.
dtype_indices
}
*"
)
# [2, kernelProd, MaxSize]
code
.
arg
(
"indice_pairs_for_uniq"
,
f
"
{
self
.
dtype_indices
}
*"
)
# [2, kernelProd, MaxSize]
code
.
arg
(
"indice_num_per_loc"
,
f
"int*"
)
# [kernelProd]
code
.
arg
(
"indices_in"
,
f
"const int*"
)
# [N, ndim + 1]
code
.
arg
(
"indice_pairs"
,
f
"
{
self
.
dtype_indices
}
*"
)
# [2, kernelProd, MaxSize]
code
.
arg
(
"indice_pairs_for_uniq"
,
f
"
{
self
.
dtype_indices
}
*"
)
# [2, kernelProd, MaxSize]
code
.
arg
(
"indice_num_per_loc"
,
f
"int*"
)
# [kernelProd]
code
.
arg
(
"num_indices_in"
,
"int"
)
code
.
arg
(
"indices_pair_size"
,
"int"
)
...
...
@@ -288,17 +306,18 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
"""
)
return
code
@
pccm
.
cuda
.
cuda_global_function
def
build_conv_hash_table
(
self
):
code
=
pccm
.
FunctionCode
()
code
.
targ
(
"TTable"
)
code
.
arg
(
"table"
,
f
"TTable"
)
# [N, ndim + 1]
code
.
arg
(
"indices_out"
,
f
"int*"
)
# [N, ndim + 1]
code
.
arg
(
"indice_pairs_for_uniq"
,
f
"const
{
self
.
dtype_indices
}
*"
)
# [2, kernelProd, MaxSize]
code
.
arg
(
"table"
,
f
"TTable"
)
# [N, ndim + 1]
code
.
arg
(
"indices_out"
,
f
"int*"
)
# [N, ndim + 1]
code
.
arg
(
"indice_pairs_for_uniq"
,
f
"const
{
self
.
dtype_indices
}
*"
)
# [2, kernelProd, MaxSize]
code
.
arg
(
"layout_npq"
,
f
"spinds::LayoutNPQ"
)
# [2, kernelProd, MaxSize]
code
.
arg
(
"layout_npq"
,
f
"spinds::LayoutNPQ"
)
# [2, kernelProd, MaxSize]
code
.
arg
(
"num_indices"
,
"int"
)
...
...
@@ -315,8 +334,8 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
def
calc_conv_indices_stage2
(
self
):
code
=
pccm
.
FunctionCode
()
code
.
targ
(
"TTable"
)
code
.
arg
(
"table"
,
f
"TTable"
)
# [N, ndim + 1]
code
.
arg
(
"indice_pairs_out_part"
,
f
"int*"
)
# [2, kernelProd, MaxSize]
code
.
arg
(
"table"
,
f
"TTable"
)
# [N, ndim + 1]
code
.
arg
(
"indice_pairs_out_part"
,
f
"int*"
)
# [2, kernelProd, MaxSize]
code
.
arg
(
"num_indices_in"
,
"int"
)
code
.
arg
(
"indices_pair_size"
,
"int"
)
# TODO use block instead of filter_offset?
...
...
@@ -338,12 +357,14 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
@
pccm
.
cuda
.
cuda_global_function
def
calc_conv_indices_stage1_mask
(
self
):
code
=
pccm
.
FunctionCode
()
code
.
arg
(
"loc_iter"
,
f
"ConvLocIter"
)
# [N, ndim + 1]
code
.
arg
(
"loc_iter"
,
f
"ConvLocIter"
)
# [N, ndim + 1]
code
.
arg
(
"indices_in"
,
f
"const int*"
)
# [N, ndim + 1]
code
.
arg
(
"indice_pairs_bwd"
,
f
"
{
self
.
dtype_indices
}
*"
)
# [2, kernelProd, MaxSize]
code
.
arg
(
"indice_pairs_for_uniq"
,
f
"
{
self
.
dtype_indices
}
*"
)
# [2, kernelProd, MaxSize]
code
.
arg
(
"indice_num_per_loc"
,
f
"int*"
)
# [kernelProd]
code
.
arg
(
"indices_in"
,
f
"const int*"
)
# [N, ndim + 1]
code
.
arg
(
"indice_pairs_bwd"
,
f
"
{
self
.
dtype_indices
}
*"
)
# [2, kernelProd, MaxSize]
code
.
arg
(
"indice_pairs_for_uniq"
,
f
"
{
self
.
dtype_indices
}
*"
)
# [2, kernelProd, MaxSize]
code
.
arg
(
"indice_num_per_loc"
,
f
"int*"
)
# [kernelProd]
code
.
arg
(
"num_indices_in"
,
"int"
)
...
...
@@ -381,11 +402,13 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
def
calc_conv_indices_stage2_mask
(
self
):
code
=
pccm
.
FunctionCode
()
code
.
targ
(
"TTable"
)
code
.
arg
(
"table"
,
f
"TTable"
)
# [N, ndim + 1]
code
.
arg
(
"indice_pairs_fwd"
,
f
"int*"
)
# [kernelProd, MaxSize], inp -> out
code
.
arg
(
"indice_pairs_bwd"
,
f
"int*"
)
# [kernelProd, MaxSize], out -> inp
code
.
arg
(
"mask_fwd"
,
f
"uint32_t*"
)
# [kernelProd]
code
.
arg
(
"mask_bwd"
,
f
"uint32_t*"
)
# [kernelProd]
code
.
arg
(
"table"
,
f
"TTable"
)
# [N, ndim + 1]
code
.
arg
(
"indice_pairs_fwd"
,
f
"int*"
)
# [kernelProd, MaxSize], inp -> out
code
.
arg
(
"indice_pairs_bwd"
,
f
"int*"
)
# [kernelProd, MaxSize], out -> inp
code
.
arg
(
"mask_fwd"
,
f
"uint32_t*"
)
# [kernelProd]
code
.
arg
(
"mask_bwd"
,
f
"uint32_t*"
)
# [kernelProd]
code
.
arg
(
"num_indices_in"
,
"int"
)
code
.
arg
(
"num_indices_out"
,
"int"
)
...
...
@@ -418,8 +441,9 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
@
pccm
.
cuda
.
cuda_global_function
def
calc_conv_indices_stage2_mask_output
(
self
):
code
=
pccm
.
FunctionCode
()
code
.
arg
(
"indice_pairs_bwd"
,
f
"int*"
)
# [kernelProd, MaxSize], out -> inp
code
.
arg
(
"mask_bwd"
,
f
"uint32_t*"
)
# [kernelProd]
code
.
arg
(
"indice_pairs_bwd"
,
f
"int*"
)
# [kernelProd, MaxSize], out -> inp
code
.
arg
(
"mask_bwd"
,
f
"uint32_t*"
)
# [kernelProd]
code
.
arg
(
"num_indices_in"
,
"int"
)
code
.
arg
(
"kv"
,
"int"
)
...
...
@@ -441,10 +465,12 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
def
calc_conv_indices_stage2_inference_mask
(
self
):
code
=
pccm
.
FunctionCode
()
code
.
targ
(
"TTable"
)
code
.
arg
(
"table"
,
f
"TTable"
)
# [N, ndim + 1]
code
.
arg
(
"indice_pairs_fwd"
,
f
"int*"
)
# [kernelProd, MaxSize], inp -> out
code
.
arg
(
"indice_pairs_bwd"
,
f
"int*"
)
# [kernelProd, MaxSize], out -> inp
code
.
arg
(
"mask_fwd"
,
f
"uint32_t*"
)
# [kernelProd]
code
.
arg
(
"table"
,
f
"TTable"
)
# [N, ndim + 1]
code
.
arg
(
"indice_pairs_fwd"
,
f
"int*"
)
# [kernelProd, MaxSize], inp -> out
code
.
arg
(
"indice_pairs_bwd"
,
f
"int*"
)
# [kernelProd, MaxSize], out -> inp
code
.
arg
(
"mask_fwd"
,
f
"uint32_t*"
)
# [kernelProd]
code
.
arg
(
"num_indices_in"
,
"int"
)
code
.
arg
(
"num_indices_out"
,
"int"
)
...
...
@@ -469,16 +495,15 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
"""
)
return
code
@
pccm
.
cuda
.
cuda_global_function
def
build_subm_conv_hash_table
(
self
):
code
=
pccm
.
FunctionCode
()
code
.
targ
(
"TTable"
)
code
.
arg
(
"table"
,
f
"TTable"
)
# [N, ndim + 1]
code
.
arg
(
"indices_in"
,
f
"const int*"
)
# [N, ndim + 1]
code
.
arg
(
"table"
,
f
"TTable"
)
# [N, ndim + 1]
code
.
arg
(
"indices_in"
,
f
"const int*"
)
# [N, ndim + 1]
code
.
arg
(
"layout_npq"
,
f
"spinds::LayoutNPQ"
)
code
.
arg
(
"layout_npq"
,
f
"spinds::LayoutNPQ"
)
code
.
arg
(
"num_indices"
,
"int"
)
...
...
@@ -493,8 +518,8 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
@
pccm
.
cuda
.
cuda_global_function
def
clean_indices_uniq
(
self
):
code
=
pccm
.
FunctionCode
()
code
.
arg
(
"indice_pairs_for_uniq"
,
f
"
{
self
.
dtype_indices
}
*"
)
code
.
arg
(
"size"
,
f
"
{
self
.
dtype_indices
}
"
)
code
.
arg
(
"indice_pairs_for_uniq"
,
f
"
{
self
.
dtype_indices
}
*"
)
code
.
arg
(
"size"
,
f
"
{
self
.
dtype_indices
}
"
)
code
.
raw
(
f
"""
for (
{
self
.
dtype_indices
}
i : tv::KernelLoopX<
{
self
.
dtype_indices
}
>(size)) {{
indice_pairs_for_uniq[i] = std::numeric_limits<
{
self
.
dtype_indices
}
>::max();
...
...
@@ -506,12 +531,13 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
def
calc_subm_conv_indices
(
self
):
code
=
pccm
.
FunctionCode
()
code
.
targ
(
"TTable"
)
code
.
arg
(
"loc_iter"
,
f
"ConvLocIter"
)
# [N, ndim + 1]
code
.
arg
(
"table"
,
f
"TTable"
)
# [N, ndim + 1]
code
.
arg
(
"loc_iter"
,
f
"ConvLocIter"
)
# [N, ndim + 1]
code
.
arg
(
"table"
,
f
"TTable"
)
# [N, ndim + 1]
code
.
arg
(
"indices_in"
,
f
"const int*"
)
# [N, ndim + 1]
code
.
arg
(
"indice_pairs"
,
f
"
{
self
.
dtype_indices
}
*"
)
# [2, kernelProd, MaxSize]
code
.
arg
(
"indice_num_per_loc"
,
f
"int*"
)
# [kernelProd]
code
.
arg
(
"indices_in"
,
f
"const int*"
)
# [N, ndim + 1]
code
.
arg
(
"indice_pairs"
,
f
"
{
self
.
dtype_indices
}
*"
)
# [2, kernelProd, MaxSize]
code
.
arg
(
"indice_num_per_loc"
,
f
"int*"
)
# [kernelProd]
code
.
arg
(
"num_indices_in"
,
"int"
)
code
.
arg
(
"indices_pair_size"
,
"int"
)
...
...
@@ -552,12 +578,13 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
def
calc_subm_conv_indices_mask
(
self
):
code
=
pccm
.
FunctionCode
()
code
.
targ
(
"TTable"
)
code
.
arg
(
"loc_iter"
,
f
"ConvLocIter"
)
# [N, ndim + 1]
code
.
arg
(
"table"
,
f
"TTable"
)
# [N, ndim + 1]
code
.
arg
(
"loc_iter"
,
f
"ConvLocIter"
)
# [N, ndim + 1]
code
.
arg
(
"table"
,
f
"TTable"
)
# [N, ndim + 1]
code
.
arg
(
"indices_in"
,
f
"const int*"
)
# [N, ndim + 1]
code
.
arg
(
"indice_pairs"
,
f
"
{
self
.
dtype_indices
}
*"
)
# [2, kernelProd, MaxSize]
code
.
arg
(
"mask"
,
f
"uint32_t*"
)
# [kernelProd]
code
.
arg
(
"indices_in"
,
f
"const int*"
)
# [N, ndim + 1]
code
.
arg
(
"indice_pairs"
,
f
"
{
self
.
dtype_indices
}
*"
)
# [2, kernelProd, MaxSize]
code
.
arg
(
"mask"
,
f
"uint32_t*"
)
# [kernelProd]
code
.
arg
(
"num_indices"
,
"int"
)
code
.
arg
(
"indices_pair_size"
,
"int"
)
...
...
@@ -609,13 +636,14 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
def
calc_subm_conv_indices_split_mask
(
self
):
code
=
pccm
.
FunctionCode
()
code
.
targ
(
"TTable"
)
code
.
arg
(
"loc_iter"
,
f
"ConvLocIter"
)
# [N, ndim + 1]
code
.
arg
(
"table"
,
f
"TTable"
)
# [N, ndim + 1]
code
.
arg
(
"loc_iter"
,
f
"ConvLocIter"
)
# [N, ndim + 1]
code
.
arg
(
"table"
,
f
"TTable"
)
# [N, ndim + 1]
code
.
arg
(
"indices_in"
,
f
"const int*"
)
# [N, ndim + 1]
code
.
arg
(
"indice_pairs"
,
f
"
{
self
.
dtype_indices
}
*"
)
# [2, kernelProd, MaxSize]
code
.
arg
(
"mask1"
,
f
"uint32_t*"
)
# [kernelProd]
code
.
arg
(
"mask2"
,
f
"uint32_t*"
)
# [kernelProd]
code
.
arg
(
"indices_in"
,
f
"const int*"
)
# [N, ndim + 1]
code
.
arg
(
"indice_pairs"
,
f
"
{
self
.
dtype_indices
}
*"
)
# [2, kernelProd, MaxSize]
code
.
arg
(
"mask1"
,
f
"uint32_t*"
)
# [kernelProd]
code
.
arg
(
"mask2"
,
f
"uint32_t*"
)
# [kernelProd]
code
.
arg
(
"num_indices"
,
"int"
)
code
.
arg
(
"indices_pair_size"
,
"int"
)
...
...
@@ -665,10 +693,12 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
def
generate_conv_inds_stage1
(
self
):
code
=
pccm
.
FunctionCode
()
code
.
arg
(
"indices"
,
"tv::Tensor"
)
code
.
arg
(
"indice_pairs, indice_pairs_uniq, indice_num_per_loc"
,
"tv::Tensor"
)
code
.
arg
(
"indice_pairs, indice_pairs_uniq, indice_num_per_loc"
,
"tv::Tensor"
)
code
.
arg
(
"batch_size"
,
"int"
)
code
.
arg
(
"output_dims, input_dims"
,
f
"tv::array<int,
{
self
.
ndim
}
>"
)
code
.
arg
(
"ksize, stride, padding, dilation"
,
f
"tv::array<int,
{
self
.
ndim
}
>"
)
code
.
arg
(
"ksize, stride, padding, dilation"
,
f
"tv::array<int,
{
self
.
ndim
}
>"
)
code
.
arg
(
"transposed"
,
f
"bool"
,
"false"
)
code
.
arg
(
"stream_int"
,
f
"std::uintptr_t"
,
"0"
)
...
...
@@ -706,9 +736,7 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
// auto num_out_act = new_end - ptr_tr - 1;
// return num_out_act;
"""
)
return
code
# .ret("int")
return
code
# .ret("int")
@
pccm
.
cuda
.
static_function
def
generate_conv_inds_stage1_5
(
self
):
...
...
@@ -726,7 +754,6 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
"""
)
return
code
.
ret
(
"int"
)
@
pccm
.
cuda
.
static_function
def
generate_conv_inds_stage2
(
self
):
code
=
pccm
.
FunctionCode
()
...
...
@@ -735,7 +762,8 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
code
.
arg
(
"num_out_act"
,
"int"
)
code
.
arg
(
"batch_size"
,
"int"
)
code
.
arg
(
"output_dims, input_dims"
,
f
"tv::array<int,
{
self
.
ndim
}
>"
)
code
.
arg
(
"ksize, stride, padding, dilation"
,
f
"tv::array<int,
{
self
.
ndim
}
>"
)
code
.
arg
(
"ksize, stride, padding, dilation"
,
f
"tv::array<int,
{
self
.
ndim
}
>"
)
code
.
arg
(
"transposed"
,
f
"bool"
,
"false"
)
code
.
arg
(
"stream_int"
,
f
"std::uintptr_t"
,
"0"
)
code
.
raw
(
f
"""
...
...
@@ -783,10 +811,12 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
def
generate_conv_inds_mask_stage1
(
self
):
code
=
pccm
.
FunctionCode
()
code
.
arg
(
"indices"
,
"tv::Tensor"
)
code
.
arg
(
"indice_pairs_bwd, indice_pairs_uniq, indice_num_per_loc"
,
"tv::Tensor"
)
code
.
arg
(
"indice_pairs_bwd, indice_pairs_uniq, indice_num_per_loc"
,
"tv::Tensor"
)
code
.
arg
(
"batch_size"
,
"int"
)
code
.
arg
(
"output_dims, input_dims"
,
f
"tv::array<int,
{
self
.
ndim
}
>"
)
code
.
arg
(
"ksize, stride, padding, dilation"
,
f
"tv::array<int,
{
self
.
ndim
}
>"
)
code
.
arg
(
"ksize, stride, padding, dilation"
,
f
"tv::array<int,
{
self
.
ndim
}
>"
)
code
.
arg
(
"transposed"
,
f
"bool"
,
"false"
)
code
.
arg
(
"stream_int"
,
f
"std::uintptr_t"
,
"0"
)
...
...
@@ -817,21 +847,23 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
indice_pairs_bwd.data_ptr<
{
self
.
dtype_indices
}
>(),
indice_pairs_uniq.data_ptr<
{
self
.
dtype_indices
}
>(), indice_num_per_loc.data_ptr<int>(), indices.dim(0),
kv, transposed);
auto timer = tv::CudaContextTimer<>();
"""
)
return
code
# .ret("int")
return
code
# .ret("int")
@
pccm
.
cuda
.
static_function
def
generate_conv_inds_stage2_mask
(
self
):
code
=
pccm
.
FunctionCode
()
code
.
arg
(
"indices, hashdata"
,
"tv::Tensor"
)
code
.
arg
(
"indice_pairs_fwd, indice_pairs_bwd, indice_pairs_uniq, out_inds"
,
"tv::Tensor"
)
code
.
arg
(
"indice_pairs_fwd, indice_pairs_bwd, indice_pairs_uniq, out_inds"
,
"tv::Tensor"
)
code
.
arg
(
"mask_fwd, mask_bwd"
,
"tv::Tensor"
)
code
.
arg
(
"num_out_act"
,
"int"
)
code
.
arg
(
"batch_size"
,
"int"
)
code
.
arg
(
"output_dims, input_dims"
,
f
"tv::array<int,
{
self
.
ndim
}
>"
)
code
.
arg
(
"ksize, stride, padding, dilation"
,
f
"tv::array<int,
{
self
.
ndim
}
>"
)
code
.
arg
(
"ksize, stride, padding, dilation"
,
f
"tv::array<int,
{
self
.
ndim
}
>"
)
code
.
arg
(
"transposed"
,
f
"bool"
,
"false"
)
code
.
arg
(
"stream_int"
,
f
"std::uintptr_t"
,
"0"
)
code
.
raw
(
f
"""
...
...
@@ -903,7 +935,6 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
"""
)
return
code
.
ret
(
"int"
)
@
pccm
.
cuda
.
static_function
def
generate_subm_conv_inds
(
self
):
code
=
pccm
.
FunctionCode
()
...
...
@@ -912,7 +943,8 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
code
.
arg
(
"batch_size"
,
"int"
)
code
.
arg
(
"input_dims"
,
f
"tv::array<int,
{
self
.
ndim
}
>"
)
code
.
arg
(
"ksize, dilation"
,
f
"tv::array<int,
{
self
.
ndim
}
>"
)
code
.
arg
(
"indice_pair_mask"
,
"tv::Tensor"
,
"tv::Tensor()"
,
"cumm.tensorview.Tensor = Tensor()"
)
code
.
arg
(
"indice_pair_mask"
,
"tv::Tensor"
,
"tv::Tensor()"
,
"cumm.tensorview.Tensor = Tensor()"
)
code
.
arg
(
"backward"
,
"bool"
,
"false"
)
code
.
arg
(
"stream_int"
,
f
"std::uintptr_t"
,
"0"
)
...
...
@@ -993,6 +1025,7 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
return
code
.
ret
(
"int"
)
class
SparseConvIndicesCPU
(
pccm
.
ParameterizedClass
):
def
__init__
(
self
,
problem
:
ConvProblem
,
dtype_indices
:
dtypes
.
DType
):
super
().
__init__
()
...
...
@@ -1000,9 +1033,9 @@ class SparseConvIndicesCPU(pccm.ParameterizedClass):
self
.
add_include
(
"unordered_map"
)
self
.
loc_iter
=
ConvOutLocIter
(
problem
)
self
.
add_param_class
(
"spinds"
,
self
.
loc_iter
,
"ConvLocIter"
)
self
.
add_param_class
(
"spinds"
,
problem
,
"ConvProblem"
)
self
.
add_param_class
(
"spinds"
,
problem
,
"ConvProblem"
)
self
.
ndim
=
problem
.
ndim
self
.
ndim
=
problem
.
ndim
self
.
dtype_indices
=
dtype_indices
self
.
dtype_indices_uniq
=
dtype_indices
...
...
@@ -1016,7 +1049,7 @@ class SparseConvIndicesCPU(pccm.ParameterizedClass):
code
.
arg
(
"batch_size"
,
"int"
)
code
.
arg
(
"input_dims"
,
f
"tv::array<int,
{
self
.
ndim
}
>"
)
code
.
arg
(
"ksize, dilation"
,
f
"tv::array<int,
{
self
.
ndim
}
>"
)
code
.
raw
(
f
"""
tv::array<int,
{
self
.
ndim
}
> stride, padding;
for (int i = 0; i <
{
self
.
ndim
}
; ++i){{
...
...
@@ -1079,7 +1112,8 @@ class SparseConvIndicesCPU(pccm.ParameterizedClass):
code
.
arg
(
"indice_pairs, out_inds, indice_num_per_loc"
,
"tv::Tensor"
)
code
.
arg
(
"batch_size"
,
"int"
)
code
.
arg
(
"output_dims, input_dims"
,
f
"tv::array<int,
{
self
.
ndim
}
>"
)
code
.
arg
(
"ksize, stride, padding, dilation"
,
f
"tv::array<int,
{
self
.
ndim
}
>"
)
code
.
arg
(
"ksize, stride, padding, dilation"
,
f
"tv::array<int,
{
self
.
ndim
}
>"
)
code
.
arg
(
"transposed"
,
f
"bool"
,
"false"
)
code
.
raw
(
f
"""
int kv = tv::arrayops::prod(ksize);
...
...
spconv/csrc/sparse/maxpool.py
View file @
82fd7a8b
# Copyright 2021 Yan Yan
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
...
@@ -16,15 +16,18 @@ import contextlib
from
cumm.conv.bases
import
ConvEnum
from
cumm.gemm.core.metaarray
import
MetaArray
,
seq
from
cumm
import
dtypes
import
pccm
import
pccm
from
cumm.gemm.layout
import
TensorGeneric
,
to_stride
from
cumm.common
import
TensorView
,
TensorViewHashKernel
,
TensorViewKernel
,
ThrustLib
,
GemmBasic
from
cumm.gemm
import
codeops
from
typing
import
List
from
typing
import
List
from
cumm.conv.params
import
ConvProblem
from
cumm.gemm.mask_iters
import
MaskTileIterator
,
MaskTileIteratorParams
import
numpy
as
np
import
numpy
as
np
from
cumm.gemm
import
(
thread_map
)
from
spconv.csrc.sparse.cpu_core
import
OMPLib
from
cumm.constants
import
CUMM_CPU_ONLY_BUILD
class
IndiceMaxPool
(
pccm
.
Class
):
# TODO optimize this function
...
...
@@ -32,13 +35,13 @@ class IndiceMaxPool(pccm.Class):
super
().
__init__
()
self
.
add_include
(
"limits"
)
self
.
add_dependency
(
TensorViewKernel
,
TensorView
,
GemmBasic
)
@
pccm
.
cuda
.
cuda_global_function
def
forward_kernel
(
self
):
code
=
pccm
.
FunctionCode
()
code
.
targ
(
"T"
)
code
.
arg
(
"out_features"
,
f
"T*"
)
code
.
arg
(
"out_features"
,
f
"T*"
)
code
.
arg
(
"in_features"
,
f
"const T*"
)
code
.
arg
(
"out_indices"
,
"const int*"
)
code
.
arg
(
"in_indices"
,
"const int*"
)
...
...
@@ -67,7 +70,7 @@ class IndiceMaxPool(pccm.Class):
code
=
pccm
.
FunctionCode
()
code
.
targ
(
"T"
)
code
.
arg
(
"out_features"
,
f
"T*"
)
code
.
arg
(
"out_features"
,
f
"T*"
)
code
.
arg
(
"in_features"
,
f
"const T*"
)
code
.
arg
(
"indices"
,
"const int*"
)
code
.
arg
(
"num_features"
,
"int"
)
...
...
@@ -104,9 +107,9 @@ class IndiceMaxPool(pccm.Class):
def
backward_kernel
(
self
):
code
=
pccm
.
FunctionCode
()
code
.
targ
(
"T"
)
code
.
arg
(
"out_features"
,
f
"const T*"
)
code
.
arg
(
"out_features"
,
f
"const T*"
)
code
.
arg
(
"in_features"
,
f
"const T*"
)
code
.
arg
(
"dout_features"
,
f
"const T*"
)
code
.
arg
(
"dout_features"
,
f
"const T*"
)
code
.
arg
(
"din_features"
,
f
"T*"
)
code
.
arg
(
"out_indices"
,
"const int*"
)
code
.
arg
(
"in_indices"
,
"const int*"
)
...
...
@@ -137,9 +140,9 @@ class IndiceMaxPool(pccm.Class):
code
=
pccm
.
FunctionCode
()
code
.
targ
(
"T"
)
code
.
arg
(
"out_features"
,
f
"const T*"
)
code
.
arg
(
"out_features"
,
f
"const T*"
)
code
.
arg
(
"in_features"
,
f
"const T*"
)
code
.
arg
(
"dout_features"
,
f
"const T*"
)
code
.
arg
(
"dout_features"
,
f
"const T*"
)
code
.
arg
(
"din_features"
,
f
"T*"
)
code
.
arg
(
"indices_bwd"
,
"const int*"
)
code
.
arg
(
"num_features"
,
"int"
)
...
...
@@ -351,6 +354,9 @@ class IndiceMaxPoolCPU(pccm.Class):
def
__init__
(
self
):
super
().
__init__
()
self
.
add_dependency
(
TensorView
)
if
CUMM_CPU_ONLY_BUILD
:
self
.
add_dependency
(
OMPLib
)
self
.
add_include
(
"tensorview/parallel/all.h"
)
@
pccm
.
static_function
def
forward
(
self
):
...
...
@@ -371,20 +377,21 @@ class IndiceMaxPoolCPU(pccm.Class):
auto in_indices = in_inds.data_ptr<const int>();
auto out_indices = out_inds.data_ptr<const int>();
for (int i = 0; i < nhot; ++i) {{
int in_idx = in_indices[i];
int out_idx = out_indices[i];
auto in_ptr = in_features + in_idx * num_features;
auto out_ptr = out_features + out_idx * num_features;
for (int j = 0; j < num_features; ++j) {{
auto in = in_ptr[j];
auto out = out_ptr[j];
if (in > out){{
out_ptr[j] = in;
tv::kernel_1d(out.device(), nhot, [&](int begin, int end, int step){{
for (int i = begin; i < end; i += step) {{
int in_idx = in_indices[i];
int out_idx = out_indices[i];
auto in_ptr = in_features + in_idx * num_features;
auto out_ptr = out_features + out_idx * num_features;
for (int j = 0; j < num_features; ++j) {{
auto in = in_ptr[j];
auto out = out_ptr[j];
if (in > out){{
out_ptr[j] = in;
}}
}}
}}
}}
}}
);
}});
"""
)
return
code
...
...
@@ -412,22 +419,24 @@ class IndiceMaxPoolCPU(pccm.Class):
auto in_indices = in_inds.data_ptr<const int>();
auto out_indices = out_inds.data_ptr<const int>();
for (int i = 0; i < nhot; ++i) {{
int in_idx_offset = in_indices[i] * num_features;
int out_idx_offset = out_indices[i] * num_features;
auto in_ptr = in_features + in_idx_offset;
auto out_ptr = out_features + out_idx_offset;
auto din_ptr = din_features + in_idx_offset;
auto dout_ptr = dout_features + out_idx_offset;
for (int j = 0; j < num_features; ++j) {{
auto in = in_ptr[j];
auto out = out_ptr[j];
if (in == out){{
din_ptr[j] = din_ptr[j] + dout_ptr[j];
tv::kernel_1d(out.device(), nhot, [&](int begin, int end, int step){{
for (int i = begin; i < end; i += step) {{
int in_idx_offset = in_indices[i] * num_features;
int out_idx_offset = out_indices[i] * num_features;
auto in_ptr = in_features + in_idx_offset;
auto out_ptr = out_features + out_idx_offset;
auto din_ptr = din_features + in_idx_offset;
auto dout_ptr = dout_features + out_idx_offset;
for (int j = 0; j < num_features; ++j) {{
auto in = in_ptr[j];
auto out = out_ptr[j];
if (in == out){{
din_ptr[j] = din_ptr[j] + dout_ptr[j];
}}
}}
}}
}}
}});
}});
"""
)
return
code
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment