Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
one
spconv
Commits
82fd7a8b
Commit
82fd7a8b
authored
Nov 10, 2021
by
yan.yan
Browse files
v2.1.5: add profile tool and python 3.6 for linux
parent
f31eee3a
Changes
80
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
902 additions
and
736 deletions
+902
-736
spconv/csrc/sparse/pointops.py
spconv/csrc/sparse/pointops.py
+52
-34
spconv/pytorch/__init__.py
spconv/pytorch/__init__.py
+6
-5
spconv/pytorch/constants.py
spconv/pytorch/constants.py
+5
-5
spconv/pytorch/conv.py
spconv/pytorch/conv.py
+147
-138
spconv/pytorch/core.py
spconv/pytorch/core.py
+13
-7
spconv/pytorch/cppcore.py
spconv/pytorch/cppcore.py
+14
-8
spconv/pytorch/functional.py
spconv/pytorch/functional.py
+94
-44
spconv/pytorch/modules.py
spconv/pytorch/modules.py
+9
-4
spconv/pytorch/ops.py
spconv/pytorch/ops.py
+233
-213
spconv/pytorch/pool.py
spconv/pytorch/pool.py
+80
-71
spconv/pytorch/spatial.py
spconv/pytorch/spatial.py
+3
-3
spconv/pytorch/tables.py
spconv/pytorch/tables.py
+13
-13
spconv/pytorch/utils.py
spconv/pytorch/utils.py
+16
-14
spconv/test_utils.py
spconv/test_utils.py
+3
-3
spconv/tools.py
spconv/tools.py
+78
-0
spconv/utils/__init__.py
spconv/utils/__init__.py
+27
-8
test/aaa.py
test/aaa.py
+0
-112
test/benchmark.py
test/benchmark.py
+69
-29
test/test_conv.py
test/test_conv.py
+39
-24
version.txt
version.txt
+1
-1
No files found.
spconv/csrc/sparse/pointops.py
View file @
82fd7a8b
# Copyright 2021 Yan Yan
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
...
@@ -15,27 +15,27 @@
import
contextlib
from
cumm.gemm.core.metaarray
import
MetaArray
,
seq
from
cumm
import
dtypes
import
pccm
import
pccm
from
cumm.gemm.layout
import
TensorGeneric
,
to_stride
from
cumm.common
import
TensorView
,
TensorViewHashKernel
from
cumm.gemm
import
codeops
from
typing
import
List
from
typing
import
List
from
cumm.conv.params
import
ConvProblem
import
numpy
as
np
import
numpy
as
np
class
Point2VoxelCommon
(
pccm
.
ParameterizedClass
):
def
__init__
(
self
,
dtype
:
dtypes
.
DType
,
ndim
:
int
,
zyx
:
bool
=
True
):
super
().
__init__
()
self
.
add_dependency
(
TensorView
)
self
.
dtype
=
dtype
self
.
ndim
=
ndim
self
.
dtype
=
dtype
self
.
ndim
=
ndim
self
.
zyx
=
zyx
ret_str
=
f
"std::array<int,
{
self
.
ndim
}
>"
retf_str
=
f
"std::array<float,
{
self
.
ndim
}
>"
retf2_str
=
f
"std::array<float,
{
self
.
ndim
*
2
}
>"
self
.
calc_meta_ret
=
f
"std::tuple<
{
retf_str
}
,
{
ret_str
}
,
{
ret_str
}
,
{
retf2_str
}
>"
@
pccm
.
pybind
.
mark
@
pccm
.
static_function
def
calc_meta_data
(
self
):
code
=
pccm
.
FunctionCode
()
...
...
@@ -80,7 +80,8 @@ class Point2VoxelCommon(pccm.ParameterizedClass):
retf_str
=
f
"std::array<float,
{
self
.
ndim
}
>"
retf2_str
=
f
"std::array<float,
{
self
.
ndim
*
2
}
>"
return
code
.
ret
(
f
"std::tuple<
{
retf_str
}
,
{
ret_str
}
,
{
ret_str
}
,
{
retf2_str
}
>"
)
return
code
.
ret
(
f
"std::tuple<
{
retf_str
}
,
{
ret_str
}
,
{
ret_str
}
,
{
retf2_str
}
>"
)
@
pccm
.
static_function
def
array2tvarray
(
self
):
...
...
@@ -112,16 +113,21 @@ class Point2VoxelCommon(pccm.ParameterizedClass):
"""
)
return
code
.
ret
(
"std::array<T, N>"
)
class
Point2VoxelKernel
(
pccm
.
ParameterizedClass
,
pccm
.
pybind
.
PybindClassMixin
):
"""this class don't support multi-thread.
create p2v for every thread.
"""
def
__init__
(
self
,
dtype
:
dtypes
.
DType
,
ndim
:
int
,
layout
:
TensorGeneric
,
zyx
:
bool
=
True
):
def
__init__
(
self
,
dtype
:
dtypes
.
DType
,
ndim
:
int
,
layout
:
TensorGeneric
,
zyx
:
bool
=
True
):
super
().
__init__
()
self
.
add_dependency
(
TensorView
,
TensorViewHashKernel
)
self
.
add_param_class
(
"layout_ns"
,
layout
,
"Layout"
)
self
.
dtype
=
dtype
self
.
ndim
=
ndim
self
.
dtype
=
dtype
self
.
ndim
=
ndim
self
.
zyx
=
zyx
@
pccm
.
cuda
.
cuda_global_function
...
...
@@ -142,7 +148,7 @@ class Point2VoxelKernel(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
point_xyz
=
f
"
{
self
.
ndim
-
1
}
- j"
if
not
self
.
zyx
:
point_xyz
=
f
"j"
# if zyx, the coors_range and grid_bound is zyx too,
# if zyx, the coors_range and grid_bound is zyx too,
# generated indices is zyx.
code
.
raw
(
f
"""
for (int i : tv::KernelLoopX<int>(num_points)){{
...
...
@@ -166,7 +172,7 @@ class Point2VoxelKernel(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
}}
}}
"""
)
return
code
return
code
@
pccm
.
cuda
.
cuda_global_function
def
assign_table
(
self
):
...
...
@@ -190,7 +196,7 @@ class Point2VoxelKernel(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
}}
}}
"""
)
return
code
return
code
@
pccm
.
cuda
.
cuda_global_function
def
generate_voxel
(
self
):
...
...
@@ -231,7 +237,7 @@ class Point2VoxelKernel(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
}}
}}
"""
)
return
code
return
code
@
pccm
.
cuda
.
cuda_global_function
def
voxel_empty_fill_mean
(
self
):
...
...
@@ -263,7 +269,7 @@ class Point2VoxelKernel(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
}}
}}
"""
)
return
code
return
code
@
pccm
.
cuda
.
cuda_global_function
def
limit_num_per_voxel_value
(
self
):
...
...
@@ -276,7 +282,8 @@ class Point2VoxelKernel(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
num_per_voxel[i] = count;
}}
"""
)
return
code
return
code
class
Point2Voxel
(
pccm
.
ParameterizedClass
,
pccm
.
pybind
.
PybindClassMixin
):
def
__init__
(
self
,
dtype
:
dtypes
.
DType
,
ndim
:
int
,
zyx
:
bool
=
True
):
...
...
@@ -286,14 +293,23 @@ class Point2Voxel(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
self
.
add_param_class
(
"p2v_c"
,
self
.
p2v_c
,
"Point2VoxelCommon"
)
layout
=
TensorGeneric
(
ndim
,
True
)
self
.
add_param_class
(
"layout_ns"
,
layout
,
"Layout"
)
self
.
dtype
=
dtype
self
.
ndim
=
ndim
self
.
dtype
=
dtype
self
.
ndim
=
ndim
self
.
zyx
=
zyx
cuda_funcs
=
[
self
.
point_to_voxel_hash
,
self
.
point_to_voxel_hash_static
]
self
.
add_impl_only_param_class
(
cuda_funcs
,
"kernel"
,
Point2VoxelKernel
(
dtype
,
ndim
,
layout
,
zyx
))
self
.
add_pybind_member
(
"hashdata"
,
"tv::Tensor"
,
readwrite
=
False
,
pyanno
=
"cumm.tensorview.Tensor"
)
self
.
add_pybind_member
(
"point_indice_data"
,
"tv::Tensor"
,
readwrite
=
False
,
pyanno
=
"cumm.tensorview.Tensor"
)
cuda_funcs
=
[
self
.
point_to_voxel_hash
,
self
.
point_to_voxel_hash_static
]
self
.
add_impl_only_param_class
(
cuda_funcs
,
"kernel"
,
Point2VoxelKernel
(
dtype
,
ndim
,
layout
,
zyx
))
self
.
add_pybind_member
(
"hashdata"
,
"tv::Tensor"
,
readwrite
=
False
,
pyanno
=
"cumm.tensorview.Tensor"
)
self
.
add_pybind_member
(
"point_indice_data"
,
"tv::Tensor"
,
readwrite
=
False
,
pyanno
=
"cumm.tensorview.Tensor"
)
self
.
add_pybind_member
(
"voxels"
,
"tv::Tensor"
,
readwrite
=
False
)
self
.
add_pybind_member
(
"indices"
,
"tv::Tensor"
,
readwrite
=
False
)
...
...
@@ -357,7 +373,7 @@ class Point2Voxel(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
hashdata = tv::zeros({{1}}, tv::custom128, 0);
point_indice_data = tv::zeros({{1}}, tv::int64, 0);
"""
)
return
code
return
code
@
pccm
.
pybind
.
mark
@
pccm
.
cuda
.
member_function
...
...
@@ -439,13 +455,13 @@ class Point2Voxel(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
"""
)
return
code
.
ret
(
"std::tuple<tv::Tensor, tv::Tensor, tv::Tensor>"
)
@
pccm
.
pybind
.
mark
@
pccm
.
cuda
.
static_function
def
point_to_voxel_hash_static
(
self
):
code
=
pccm
.
FunctionCode
()
code
.
arg
(
"points"
,
"tv::Tensor"
)
code
.
arg
(
"voxels, indices, num_per_voxel, hashdata, point_indice_data"
,
"tv::Tensor"
)
code
.
arg
(
"voxels, indices, num_per_voxel, hashdata, point_indice_data"
,
"tv::Tensor"
)
code
.
arg
(
"vsize"
,
f
"std::array<float,
{
self
.
ndim
}
>"
)
code
.
arg
(
"grid_size, grid_stride"
,
f
"std::array<int,
{
self
.
ndim
}
>"
)
code
.
arg
(
"coors_range"
,
f
"std::array<float,
{
self
.
ndim
*
2
}
>"
)
...
...
@@ -527,13 +543,16 @@ class Point2VoxelCPU(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
self
.
add_dependency
(
TensorView
)
layout
=
TensorGeneric
(
ndim
,
True
)
self
.
add_param_class
(
"layout_ns"
,
layout
,
"Layout"
)
self
.
dtype
=
dtype
self
.
ndim
=
ndim
self
.
dtype
=
dtype
self
.
ndim
=
ndim
self
.
zyx
=
zyx
self
.
p2v_c
=
Point2VoxelCommon
(
dtype
,
ndim
,
zyx
)
self
.
add_param_class
(
"p2v_c"
,
self
.
p2v_c
,
"Point2VoxelCommon"
)
self
.
add_pybind_member
(
"densehashdata"
,
"tv::Tensor"
,
readwrite
=
False
,
pyanno
=
"cumm.tensorview.Tensor"
)
self
.
add_pybind_member
(
"densehashdata"
,
"tv::Tensor"
,
readwrite
=
False
,
pyanno
=
"cumm.tensorview.Tensor"
)
self
.
add_pybind_member
(
"voxels"
,
"tv::Tensor"
,
readwrite
=
False
)
self
.
add_pybind_member
(
"indices"
,
"tv::Tensor"
,
readwrite
=
False
)
...
...
@@ -568,7 +587,6 @@ class Point2VoxelCPU(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
"""
)
return
code
.
ret
(
self
.
p2v_c
.
calc_meta_ret
)
@
pccm
.
pybind
.
mark
@
pccm
.
constructor
def
ctor
(
self
):
...
...
@@ -613,7 +631,7 @@ class Point2VoxelCPU(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
densehashdata_ptr[i] = -1;
}}
"""
)
return
code
return
code
def
point_to_voxel_static_template
(
self
,
mean
:
bool
=
False
):
code
=
pccm
.
FunctionCode
()
...
...
spconv/pytorch/__init__.py
View file @
82fd7a8b
...
...
@@ -4,13 +4,14 @@ from pathlib import Path
import
numpy
as
np
import
torch
from
spconv.pytorch
import
ops
from
spconv.pytorch.conv
import
(
SparseConv2d
,
SparseConv3d
,
SparseConvTranspose2d
,
SparseConvTranspose3d
,
SparseInverseConv2d
,
SparseInverseConv3d
,
SubMConv2d
,
SubMConv3d
)
from
spconv.pytorch
import
ops
,
functional
from
spconv.pytorch.conv
import
(
SparseConv2d
,
SparseConv3d
,
SparseConvTranspose2d
,
SparseConvTranspose3d
,
SparseInverseConv2d
,
SparseInverseConv3d
,
SubMConv2d
,
SubMConv3d
)
from
spconv.pytorch.core
import
SparseConvTensor
from
spconv.pytorch.identity
import
Identity
from
spconv.pytorch.modules
import
SparseModule
,
SparseSequential
from
spconv.pytorch.modules
import
SparseModule
,
SparseSequential
,
assign_name_for_sparse_modules
from
spconv.pytorch.ops
import
ConvAlgo
from
spconv.pytorch.pool
import
SparseMaxPool2d
,
SparseMaxPool3d
from
spconv.pytorch.tables
import
AddTable
,
ConcatTable
,
JoinTable
...
...
spconv/pytorch/constants.py
View file @
82fd7a8b
# Copyright 2021 Yan Yan
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
torch
import
torch
try
:
remove_plus
=
torch
.
__version__
.
find
(
"+"
)
remove_dotdev
=
torch
.
__version__
.
find
(
".dev"
)
...
...
@@ -26,4 +26,4 @@ try:
PYTORCH_VERSION
=
list
(
map
(
int
,
PYTORCH_VERSION
.
split
(
"."
)))
except
:
# for unknown errors, just set a version
PYTORCH_VERSION
=
[
1
,
8
,
0
]
\ No newline at end of file
PYTORCH_VERSION
=
[
1
,
8
,
0
]
spconv/pytorch/conv.py
View file @
82fd7a8b
...
...
@@ -24,12 +24,13 @@ from torch.nn.parameter import Parameter
from
spconv
import
pytorch
as
spconv
from
spconv.core
import
ConvAlgo
import
spconv.pytorch
.
functional
as
Fsp
from
spconv.pytorch
import
functional
as
Fsp
from
spconv.pytorch
import
ops
from
spconv.cppconstants
import
CPU_ONLY_BUILD
from
spconv.pytorch.core
import
IndiceData
,
SparseConvTensor
,
ImplicitGemmIndiceData
from
spconv.pytorch.modules
import
SparseModule
from
spconv.constants
import
FILTER_HWIO
from
spconv.utils
import
nullcontext
def
_calculate_fan_in_and_fan_out_hwio
(
tensor
,
algo
:
ConvAlgo
):
...
...
@@ -205,6 +206,7 @@ class SparseConvolution(SparseModule):
self
.
dilation
)
else
:
out_spatial_shape
=
spatial_shape
# print(self._sparse_unique_name, spatial_shape, out_spatial_shape)
# input.update_grid(out_spatial_shape)
# t = time.time()
out_tensor
=
input
.
shadow_copy
()
...
...
@@ -247,158 +249,165 @@ class SparseConvolution(SparseModule):
out_tensor
=
out_tensor
.
replace_feature
(
features
)
return
out_tensor
indice_dict
=
input
.
indice_dict
.
copy
()
algo
=
self
.
algo
if
self
.
indice_key
is
not
None
:
if
self
.
indice_key
is
not
None
:
datas
=
input
.
find_indice_pair
(
self
.
indice_key
)
if
datas
is
not
None
:
msg
=
"due to limitation of pytorch, you must provide same algo to layers share same indice key."
assert
algo
==
datas
.
algo
,
msg
# algo = datas.algo
if
algo
==
ConvAlgo
.
Native
:
datas
=
input
.
find_indice_pair
(
self
.
indice_key
)
if
datas
is
not
None
:
assert
isinstance
(
datas
,
IndiceData
)
if
self
.
inverse
:
assert
datas
is
not
None
and
self
.
indice_key
is
not
None
assert
datas
.
is_subm
is
False
,
"inverse conv can only be used with standard conv and pool ops."
outids
=
datas
.
indices
indice_pairs
=
datas
.
indice_pairs
indice_pair_num
=
datas
.
indice_pair_num
out_spatial_shape
=
datas
.
out_spatial_shape
assert
indice_pair_num
.
shape
[
0
]
==
np
.
prod
(
self
.
kernel_size
),
"inverse conv must have same kernel size as its couple conv"
else
:
if
self
.
indice_key
is
not
None
and
datas
is
not
None
:
outids
=
datas
.
out_indices
profile_ctx
=
nullcontext
()
if
input
.
_timer
is
not
None
and
self
.
_sparse_unique_name
:
profile_ctx
=
input
.
_timer
.
namespace
(
self
.
_sparse_unique_name
)
with
profile_ctx
:
if
algo
==
ConvAlgo
.
Native
:
datas
=
input
.
find_indice_pair
(
self
.
indice_key
)
if
datas
is
not
None
:
assert
isinstance
(
datas
,
IndiceData
)
if
self
.
inverse
:
assert
datas
is
not
None
and
self
.
indice_key
is
not
None
assert
datas
.
is_subm
is
False
,
"inverse conv can only be used with standard conv and pool ops."
outids
=
datas
.
indices
indice_pairs
=
datas
.
indice_pairs
indice_pair_num
=
datas
.
indice_pair_num
out_spatial_shape
=
datas
.
out_spatial_shape
assert
indice_pair_num
.
shape
[
0
]
==
np
.
prod
(
self
.
kernel_size
),
"inverse conv must have same kernel size as its couple conv"
else
:
if
input
.
benchmark
:
torch
.
cuda
.
synchronize
()
t
=
time
.
time
()
outids
,
indice_pairs
,
indice_pair_num
=
ops
.
get_indice_pairs
(
indices
,
batch_size
,
spatial_shape
,
algo
,
self
.
kernel_size
,
self
.
stride
,
self
.
padding
,
self
.
dilation
,
self
.
output_padding
,
self
.
subm
,
self
.
transposed
)
if
input
.
benchmark
:
torch
.
cuda
.
synchronize
()
interval
=
time
.
time
()
-
t
out_tensor
.
benchmark_record
[
self
.
name
][
"indice_gen_time"
].
append
(
interval
)
indice_data
=
IndiceData
(
outids
,
indices
,
indice_pairs
,
indice_pair_num
,
spatial_shape
,
is_subm
=
self
.
subm
,
algo
=
algo
)
if
self
.
indice_key
is
not
None
:
msg
=
f
"your indice key
{
self
.
indice_key
}
already exists in this sparse tensor."
assert
self
.
indice_key
not
in
indice_dict
,
msg
indice_dict
[
self
.
indice_key
]
=
indice_data
if
input
.
benchmark
:
torch
.
cuda
.
synchronize
()
t
=
time
.
time
()
indice_pairs_calc
=
indice_pairs
if
indice_pairs
.
device
!=
features
.
device
:
indice_pairs_calc
=
indice_pairs
.
to
(
features
.
device
)
if
self
.
subm
:
out_features
=
Fsp
.
indice_subm_conv
(
features
,
self
.
weight
,
indice_pairs_calc
,
indice_pair_num
,
outids
.
shape
[
0
],
algo
)
else
:
if
self
.
inverse
:
out_features
=
Fsp
.
indice_inverse_conv
(
if
self
.
indice_key
is
not
None
and
datas
is
not
None
:
outids
=
datas
.
out_indices
indice_pairs
=
datas
.
indice_pairs
indice_pair_num
=
datas
.
indice_pair_num
else
:
if
input
.
benchmark
:
torch
.
cuda
.
synchronize
()
t
=
time
.
time
()
outids
,
indice_pairs
,
indice_pair_num
=
ops
.
get_indice_pairs
(
indices
,
batch_size
,
spatial_shape
,
algo
,
self
.
kernel_size
,
self
.
stride
,
self
.
padding
,
self
.
dilation
,
self
.
output_padding
,
self
.
subm
,
self
.
transposed
)
if
input
.
benchmark
:
torch
.
cuda
.
synchronize
()
interval
=
time
.
time
()
-
t
out_tensor
.
benchmark_record
[
self
.
name
][
"indice_gen_time"
].
append
(
interval
)
indice_data
=
IndiceData
(
outids
,
indices
,
indice_pairs
,
indice_pair_num
,
spatial_shape
,
is_subm
=
self
.
subm
,
algo
=
algo
)
if
self
.
indice_key
is
not
None
:
msg
=
f
"your indice key
{
self
.
indice_key
}
already exists in this sparse tensor."
assert
self
.
indice_key
not
in
indice_dict
,
msg
indice_dict
[
self
.
indice_key
]
=
indice_data
if
input
.
benchmark
:
torch
.
cuda
.
synchronize
()
t
=
time
.
time
()
indice_pairs_calc
=
indice_pairs
if
indice_pairs
.
device
!=
features
.
device
:
indice_pairs_calc
=
indice_pairs
.
to
(
features
.
device
)
if
self
.
subm
:
out_features
=
Fsp
.
indice_subm_conv
(
features
,
self
.
weight
,
indice_pairs_calc
,
indice_pair_num
,
outids
.
shape
[
0
],
algo
)
indice_pair_num
,
outids
.
shape
[
0
],
algo
,
input
.
_timer
)
else
:
out_features
=
Fsp
.
indice_conv
(
features
,
self
.
weight
,
indice_pairs_calc
,
indice_pair_num
,
outids
.
shape
[
0
],
algo
)
else
:
datas
=
input
.
find_indice_pair
(
self
.
indice_key
)
if
datas
is
not
None
:
assert
isinstance
(
datas
,
ImplicitGemmIndiceData
)
if
self
.
inverse
:
assert
datas
is
not
None
and
self
.
indice_key
is
not
None
assert
datas
.
is_subm
is
False
,
"inverse conv can only be used with standard conv and pool ops."
outids
=
datas
.
indices
pair_fwd
=
datas
.
pair_bwd
pair_bwd
=
datas
.
pair_fwd
pair_mask_fwd_splits
=
datas
.
pair_mask_bwd_splits
pair_mask_bwd_splits
=
datas
.
pair_mask_fwd_splits
mask_argsort_fwd_splits
=
datas
.
mask_argsort_bwd_splits
mask_argsort_bwd_splits
=
datas
.
mask_argsort_fwd_splits
masks
=
datas
.
masks
if
self
.
inverse
:
out_features
=
Fsp
.
indice_inverse_conv
(
features
,
self
.
weight
,
indice_pairs_calc
,
indice_pair_num
,
outids
.
shape
[
0
],
algo
)
else
:
out_features
=
Fsp
.
indice_conv
(
features
,
self
.
weight
,
indice_pairs_calc
,
indice_pair_num
,
outids
.
shape
[
0
],
algo
,
input
.
_timer
)
else
:
if
self
.
indice_key
is
not
None
and
datas
is
not
None
:
outids
=
datas
.
out_indices
pair_fwd
=
datas
.
pair_fwd
pair_bwd
=
datas
.
pair_bwd
pair_mask_fwd_splits
=
datas
.
pair_mask_fwd_splits
pair_mask_bwd_splits
=
datas
.
pair_mask_bwd_splits
mask_argsort_fwd_splits
=
datas
.
mask_argsort_fwd_splits
mask_argsort_bwd_splits
=
datas
.
mask_argsort_bwd_splits
datas
=
input
.
find_indice_pair
(
self
.
indice_key
)
if
datas
is
not
None
:
assert
isinstance
(
datas
,
ImplicitGemmIndiceData
)
if
self
.
inverse
:
assert
datas
is
not
None
and
self
.
indice_key
is
not
None
assert
datas
.
is_subm
is
False
,
"inverse conv can only be used with standard conv and pool ops."
outids
=
datas
.
indices
pair_fwd
=
datas
.
pair_bwd
pair_bwd
=
datas
.
pair_fwd
pair_mask_fwd_splits
=
datas
.
pair_mask_bwd_splits
pair_mask_bwd_splits
=
datas
.
pair_mask_fwd_splits
mask_argsort_fwd_splits
=
datas
.
mask_argsort_bwd_splits
mask_argsort_bwd_splits
=
datas
.
mask_argsort_fwd_splits
masks
=
datas
.
masks
else
:
res
=
ops
.
get_indice_pairs_implicit_gemm
(
indices
,
batch_size
,
spatial_shape
,
algo
,
ksize
=
self
.
kernel_size
,
stride
=
self
.
stride
,
padding
=
self
.
padding
,
dilation
=
self
.
dilation
,
out_padding
=
self
.
output_padding
,
subm
=
self
.
subm
,
transpose
=
self
.
transposed
,
is_train
=
self
.
training
,
alloc
=
input
.
thrust_allocator
)
outids
=
res
[
0
]
num_inds_per_loc
=
res
[
1
]
pair_fwd
=
res
[
2
]
pair_bwd
=
res
[
3
]
pair_mask_fwd_splits
=
res
[
4
]
pair_mask_bwd_splits
=
res
[
5
]
mask_argsort_fwd_splits
=
res
[
6
]
mask_argsort_bwd_splits
=
res
[
7
]
masks
=
res
[
8
]
if
self
.
indice_key
is
not
None
:
indice_data
=
ImplicitGemmIndiceData
(
outids
,
indices
,
pair_fwd
,
pair_bwd
,
pair_mask_fwd_splits
=
pair_mask_fwd_splits
,
pair_mask_bwd_splits
=
pair_mask_bwd_splits
,
mask_argsort_fwd_splits
=
mask_argsort_fwd_splits
,
mask_argsort_bwd_splits
=
mask_argsort_bwd_splits
,
masks
=
masks
,
is_subm
=
self
.
subm
,
out_spatial_shape
=
out_spatial_shape
,
algo
=
algo
)
msg
=
f
"your indice key
{
self
.
indice_key
}
already exists in this sparse tensor."
assert
self
.
indice_key
not
in
indice_dict
,
msg
indice_dict
[
self
.
indice_key
]
=
indice_data
if
input
.
benchmark
:
torch
.
cuda
.
synchronize
()
t
=
time
.
time
()
num_activate_out
=
outids
.
shape
[
0
]
out_features
=
Fsp
.
implicit_gemm
(
features
,
self
.
weight
,
pair_fwd
,
pair_bwd
,
pair_mask_fwd_splits
,
pair_mask_bwd_splits
,
mask_argsort_fwd_splits
,
mask_argsort_bwd_splits
,
num_activate_out
,
masks
,
self
.
training
,
self
.
subm
)
if
self
.
indice_key
is
not
None
and
datas
is
not
None
:
outids
=
datas
.
out_indices
pair_fwd
=
datas
.
pair_fwd
pair_bwd
=
datas
.
pair_bwd
pair_mask_fwd_splits
=
datas
.
pair_mask_fwd_splits
pair_mask_bwd_splits
=
datas
.
pair_mask_bwd_splits
mask_argsort_fwd_splits
=
datas
.
mask_argsort_fwd_splits
mask_argsort_bwd_splits
=
datas
.
mask_argsort_bwd_splits
masks
=
datas
.
masks
else
:
with
input
.
_timer
.
namespace
(
"gen_pairs"
):
res
=
ops
.
get_indice_pairs_implicit_gemm
(
indices
,
batch_size
,
spatial_shape
,
algo
,
ksize
=
self
.
kernel_size
,
stride
=
self
.
stride
,
padding
=
self
.
padding
,
dilation
=
self
.
dilation
,
out_padding
=
self
.
output_padding
,
subm
=
self
.
subm
,
transpose
=
self
.
transposed
,
is_train
=
self
.
training
,
alloc
=
input
.
thrust_allocator
,
timer
=
input
.
_timer
)
outids
=
res
[
0
]
num_inds_per_loc
=
res
[
1
]
pair_fwd
=
res
[
2
]
pair_bwd
=
res
[
3
]
pair_mask_fwd_splits
=
res
[
4
]
pair_mask_bwd_splits
=
res
[
5
]
mask_argsort_fwd_splits
=
res
[
6
]
mask_argsort_bwd_splits
=
res
[
7
]
masks
=
res
[
8
]
if
self
.
indice_key
is
not
None
:
indice_data
=
ImplicitGemmIndiceData
(
outids
,
indices
,
pair_fwd
,
pair_bwd
,
pair_mask_fwd_splits
=
pair_mask_fwd_splits
,
pair_mask_bwd_splits
=
pair_mask_bwd_splits
,
mask_argsort_fwd_splits
=
mask_argsort_fwd_splits
,
mask_argsort_bwd_splits
=
mask_argsort_bwd_splits
,
masks
=
masks
,
is_subm
=
self
.
subm
,
out_spatial_shape
=
out_spatial_shape
,
algo
=
algo
)
msg
=
f
"your indice key
{
self
.
indice_key
}
already exists in this sparse tensor."
assert
self
.
indice_key
not
in
indice_dict
,
msg
indice_dict
[
self
.
indice_key
]
=
indice_data
if
input
.
benchmark
:
torch
.
cuda
.
synchronize
()
t
=
time
.
time
()
num_activate_out
=
outids
.
shape
[
0
]
out_features
=
Fsp
.
implicit_gemm
(
features
,
self
.
weight
,
pair_fwd
,
pair_bwd
,
pair_mask_fwd_splits
,
pair_mask_bwd_splits
,
mask_argsort_fwd_splits
,
mask_argsort_bwd_splits
,
num_activate_out
,
masks
,
self
.
training
,
self
.
subm
,
input
.
_timer
)
if
self
.
bias
is
not
None
:
out_features
+=
self
.
bias
if
input
.
benchmark
:
...
...
spconv/pytorch/core.py
View file @
82fd7a8b
...
...
@@ -19,6 +19,7 @@ import torch
from
spconv.core
import
ConvAlgo
from
spconv.pytorch.constants
import
PYTORCH_VERSION
from
spconv.pytorch.ops
import
ThrustSortAllocator
from
spconv.tools
import
CUDAKernelTimer
if
PYTORCH_VERSION
>=
[
1
,
8
,
0
]:
try
:
...
...
@@ -51,13 +52,14 @@ class IndiceData(object):
class
ImplicitGemmIndiceData
(
object
):
def
__init__
(
self
,
out_indices
:
torch
.
Tensor
,
indices
:
torch
.
Tensor
,
pair_fwd
:
torch
.
Tensor
,
pair_bwd
:
torch
.
Tensor
,
def
__init__
(
self
,
out_indices
:
torch
.
Tensor
,
indices
:
torch
.
Tensor
,
pair_fwd
:
torch
.
Tensor
,
pair_bwd
:
torch
.
Tensor
,
pair_mask_fwd_splits
:
List
[
torch
.
Tensor
],
pair_mask_bwd_splits
:
List
[
torch
.
Tensor
],
mask_argsort_fwd_splits
:
List
[
torch
.
Tensor
],
mask_argsort_bwd_splits
:
List
[
torch
.
Tensor
],
masks
:
List
[
np
.
ndarray
],
out_spatial_shape
,
is_subm
:
bool
,
algo
:
ConvAlgo
):
masks
:
List
[
np
.
ndarray
],
out_spatial_shape
,
is_subm
:
bool
,
algo
:
ConvAlgo
):
self
.
out_indices
=
out_indices
self
.
indices
=
indices
self
.
pair_fwd
=
pair_fwd
...
...
@@ -99,7 +101,8 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):
voxel_num
:
Optional
[
torch
.
Tensor
]
=
None
,
indice_dict
:
Optional
[
dict
]
=
None
,
benchmark
:
bool
=
False
,
permanent_thrust_allocator
:
bool
=
False
):
permanent_thrust_allocator
:
bool
=
False
,
enable_timer
:
bool
=
False
):
"""
Args:
features: [num_points, num_features] feature tensor
...
...
@@ -130,9 +133,10 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):
self
.
voxel_num
=
voxel_num
# for tensorrt
self
.
benchmark
=
benchmark
self
.
benchmark_record
=
{}
self
.
thrust_allocator
:
Optional
[
ThrustSortAllocator
]
=
None
self
.
thrust_allocator
:
Optional
[
ThrustSortAllocator
]
=
None
if
permanent_thrust_allocator
:
self
.
thrust_allocator
=
ThrustSortAllocator
(
features
.
device
)
self
.
_timer
=
CUDAKernelTimer
(
enable_timer
)
def
replace_feature
(
self
,
feature
):
"""we need to replace x.features = F.relu(x.features) with x = x.replace_feature(F.relu(x.features))
...
...
@@ -144,7 +148,7 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):
new_spt
.
benchmark
=
self
.
benchmark
new_spt
.
benchmark_record
=
self
.
benchmark_record
new_spt
.
thrust_allocator
=
self
.
thrust_allocator
new_spt
.
_timer
=
self
.
_timer
return
new_spt
@
property
...
...
@@ -174,7 +178,8 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):
def
spatial_size
(
self
):
return
np
.
prod
(
self
.
spatial_shape
)
def
find_indice_pair
(
self
,
key
)
->
Optional
[
Union
[
IndiceData
,
ImplicitGemmIndiceData
]]:
def
find_indice_pair
(
self
,
key
)
->
Optional
[
Union
[
IndiceData
,
ImplicitGemmIndiceData
]]:
if
key
is
None
:
return
None
if
key
in
self
.
indice_dict
:
...
...
@@ -208,4 +213,5 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):
self
.
benchmark
)
tensor
.
benchmark_record
=
self
.
benchmark_record
tensor
.
thrust_allocator
=
self
.
thrust_allocator
tensor
.
_timer
=
self
.
_timer
return
tensor
spconv/pytorch/cppcore.py
View file @
82fd7a8b
# Copyright 2021 Yan Yan
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
cumm
import
tensorview
as
tv
import
torch
from
cumm
import
tensorview
as
tv
import
torch
from
typing
import
Optional
,
List
_TORCH_DTYPE_TO_TV
=
{
torch
.
float32
:
tv
.
float32
,
torch
.
float64
:
tv
.
float64
,
...
...
@@ -26,10 +27,13 @@ _TORCH_DTYPE_TO_TV = {
torch
.
uint8
:
tv
.
uint8
,
}
def
torch_tensor_to_tv
(
ten
:
torch
.
Tensor
,
dtype
:
Optional
[
int
]
=
None
,
shape
:
Optional
[
List
[
int
]]
=
None
):
def
torch_tensor_to_tv
(
ten
:
torch
.
Tensor
,
dtype
:
Optional
[
int
]
=
None
,
shape
:
Optional
[
List
[
int
]]
=
None
):
assert
ten
.
is_contiguous
(),
"must be contiguous tensor"
ptr
=
ten
.
data_ptr
()
device
=
ten
.
device
device
=
ten
.
device
if
device
.
type
==
"cpu"
:
tv_device
=
-
1
elif
device
.
type
==
"cuda"
:
...
...
@@ -42,10 +46,12 @@ def torch_tensor_to_tv(ten: torch.Tensor, dtype: Optional[int] = None, shape: Op
dtype
=
_TORCH_DTYPE_TO_TV
[
ten
.
dtype
]
return
tv
.
from_blob
(
ptr
,
shape
,
dtype
,
tv_device
)
def
get_current_stream
():
return
torch
.
cuda
.
current_stream
().
cuda_stream
if
__name__
==
"__main__"
:
a
=
torch
.
rand
(
2
,
2
)
atv
=
torch_tensor_to_tv
(
a
)
print
(
atv
.
numpy_view
())
\ No newline at end of file
print
(
atv
.
numpy_view
())
spconv/pytorch/functional.py
View file @
82fd7a8b
...
...
@@ -15,8 +15,9 @@
import
torch
from
torch
import
nn
from
torch.autograd
import
Function
import
spconv.pytorch.ops
as
ops
from
typing
import
Optional
from
spconv.tools
import
CUDAKernelTimer
from
spconv.pytorch
import
ops
import
torch.cuda.amp
as
amp
from
torch.autograd.function
import
once_differentiable
import
numpy
as
np
...
...
@@ -27,23 +28,32 @@ from typing import List
class
SparseConvFunction
(
Function
):
@
staticmethod
@
amp
.
custom_fwd
(
cast_inputs
=
torch
.
float16
)
def
forward
(
ctx
,
features
,
filters
,
indice_pairs
,
indice_pair_num
,
num_activate_out
,
algo
):
def
forward
(
ctx
,
features
,
filters
,
indice_pairs
,
indice_pair_num
,
num_activate_out
,
algo
,
timer
:
CUDAKernelTimer
=
CUDAKernelTimer
(
False
)):
ctx
.
save_for_backward
(
indice_pairs
,
indice_pair_num
,
features
,
filters
)
ctx
.
algo
=
algo
ctx
.
timer
=
timer
return
ops
.
indice_conv
(
features
,
filters
,
indice_pairs
,
indice_pair_num
,
num_activate_out
,
False
,
algo
=
algo
)
algo
=
algo
,
timer
=
timer
)
@
staticmethod
@
once_differentiable
@
amp
.
custom_bwd
def
backward
(
ctx
,
grad_output
):
indice_pairs
,
indice_pair_num
,
features
,
filters
=
ctx
.
saved_tensors
timer
=
ctx
.
timer
input_bp
,
filters_bp
=
ops
.
indice_conv_backward
(
features
,
filters
,
...
...
@@ -51,18 +61,27 @@ class SparseConvFunction(Function):
indice_pairs
,
indice_pair_num
,
False
,
algo
=
ctx
.
algo
)
algo
=
ctx
.
algo
,
timer
=
timer
)
return
input_bp
,
filters_bp
,
None
,
None
,
None
,
None
return
input_bp
,
filters_bp
,
None
,
None
,
None
,
None
,
None
class
SparseInverseConvFunction
(
Function
):
@
staticmethod
@
amp
.
custom_fwd
(
cast_inputs
=
torch
.
float16
)
def
forward
(
ctx
,
features
,
filters
,
indice_pairs
,
indice_pair_num
,
num_activate_out
,
algo
):
def
forward
(
ctx
,
features
,
filters
,
indice_pairs
,
indice_pair_num
,
num_activate_out
,
algo
,
timer
:
CUDAKernelTimer
=
CUDAKernelTimer
(
False
)):
ctx
.
save_for_backward
(
indice_pairs
,
indice_pair_num
,
features
,
filters
)
ctx
.
algo
=
algo
ctx
.
timer
=
timer
return
ops
.
indice_conv
(
features
,
filters
,
indice_pairs
,
...
...
@@ -70,13 +89,16 @@ class SparseInverseConvFunction(Function):
num_activate_out
,
True
,
False
,
algo
=
algo
)
algo
=
algo
,
timer
=
timer
)
@
staticmethod
@
once_differentiable
@
amp
.
custom_bwd
def
backward
(
ctx
,
grad_output
):
indice_pairs
,
indice_pair_num
,
features
,
filters
=
ctx
.
saved_tensors
timer
=
ctx
.
timer
input_bp
,
filters_bp
=
ops
.
indice_conv_backward
(
features
,
filters
,
grad_output
,
...
...
@@ -84,29 +106,40 @@ class SparseInverseConvFunction(Function):
indice_pair_num
,
True
,
False
,
algo
=
ctx
.
algo
)
algo
=
ctx
.
algo
,
timer
=
timer
)
return
input_bp
,
filters_bp
,
None
,
None
,
None
,
None
return
input_bp
,
filters_bp
,
None
,
None
,
None
,
None
,
None
class
SparseImplicitGemmFunction
(
Function
):
@
staticmethod
@
amp
.
custom_fwd
(
cast_inputs
=
torch
.
float16
)
def
forward
(
ctx
,
features
:
torch
.
Tensor
,
filters
:
torch
.
Tensor
,
pair_fwd
:
torch
.
Tensor
,
pair_bwd
:
torch
.
Tensor
,
def
forward
(
ctx
,
features
:
torch
.
Tensor
,
filters
:
torch
.
Tensor
,
pair_fwd
:
torch
.
Tensor
,
pair_bwd
:
torch
.
Tensor
,
pair_mask_fwd_splits
:
List
[
torch
.
Tensor
],
pair_mask_bwd_splits
:
List
[
torch
.
Tensor
],
mask_argsort_fwd_splits
:
List
[
torch
.
Tensor
],
mask_argsort_bwd_splits
:
List
[
torch
.
Tensor
],
num_activate_out
:
int
,
masks
:
List
[
np
.
ndarray
],
is_train
:
bool
,
is_subm
:
bool
):
num_activate_out
:
int
,
masks
:
List
[
np
.
ndarray
],
is_train
:
bool
,
is_subm
:
bool
,
timer
:
CUDAKernelTimer
=
CUDAKernelTimer
(
False
)):
out
,
mask_out
,
mask_width
=
ops
.
implicit_gemm
(
features
,
filters
,
pair_fwd
,
pair_mask_fwd_splits
,
mask_argsort_fwd_splits
,
num_activate_out
,
masks
,
is_train
,
is_subm
)
out
,
mask_out
,
mask_width
=
ops
.
implicit_gemm
(
features
,
filters
,
pair_fwd
,
pair_mask_fwd_splits
,
mask_argsort_fwd_splits
,
num_activate_out
,
masks
,
is_train
,
is_subm
,
timer
)
ctx
.
save_for_backward
(
features
,
filters
,
pair_fwd
,
pair_bwd
)
ctx
.
mask_width
=
mask_width
ctx
.
mask_out
=
mask_out
ctx
.
timer
=
timer
ctx
.
pair_mask_fwd_splits
=
pair_mask_fwd_splits
ctx
.
mask_argsort_fwd_splits
=
mask_argsort_fwd_splits
ctx
.
pair_mask_bwd_splits
=
pair_mask_bwd_splits
...
...
@@ -130,30 +163,40 @@ class SparseImplicitGemmFunction(Function):
# num_activate_out = ctx.num_activate_out
masks
=
ctx
.
masks
is_subm
=
ctx
.
is_subm
input_bp
,
filters_bp
=
ops
.
implicit_gemm_backward
(
features
,
filters
,
grad_output
,
pair_fwd
,
pair_bwd
,
pair_mask_fwd_splits
,
pair_mask_bwd_splits
,
mask_argsort_fwd_splits
,
mask_argsort_bwd_splits
,
mask_output_fwd
=
mask_out
,
masks
=
masks
,
mask_width
=
mask_width
,
is_subm
=
is_subm
)
None_9
=
[
None
]
*
10
timer
=
ctx
.
timer
input_bp
,
filters_bp
=
ops
.
implicit_gemm_backward
(
features
,
filters
,
grad_output
,
pair_fwd
,
pair_bwd
,
pair_mask_fwd_splits
,
pair_mask_bwd_splits
,
mask_argsort_fwd_splits
,
mask_argsort_bwd_splits
,
mask_output_fwd
=
mask_out
,
masks
=
masks
,
mask_width
=
mask_width
,
is_subm
=
is_subm
,
timer
=
timer
)
None_9
=
[
None
]
*
11
return
(
input_bp
,
filters_bp
,
*
None_9
)
class
SubMConvFunction
(
Function
):
@
staticmethod
@
amp
.
custom_fwd
(
cast_inputs
=
torch
.
float16
)
def
forward
(
ctx
,
features
,
filters
,
indice_pairs
,
indice_pair_num
,
num_activate_out
,
algo
):
def
forward
(
ctx
,
features
,
filters
,
indice_pairs
,
indice_pair_num
,
num_activate_out
,
algo
,
timer
:
CUDAKernelTimer
=
CUDAKernelTimer
(
False
)):
ctx
.
save_for_backward
(
indice_pairs
,
indice_pair_num
,
features
,
filters
)
ctx
.
algo
=
algo
ctx
.
timer
=
timer
return
ops
.
indice_conv
(
features
,
filters
,
indice_pairs
,
...
...
@@ -161,13 +204,16 @@ class SubMConvFunction(Function):
num_activate_out
,
False
,
True
,
algo
=
algo
)
algo
=
algo
,
timer
=
timer
)
@
staticmethod
@
once_differentiable
@
amp
.
custom_bwd
def
backward
(
ctx
,
grad_output
):
indice_pairs
,
indice_pair_num
,
features
,
filters
=
ctx
.
saved_tensors
timer
=
ctx
.
timer
input_bp
,
filters_bp
=
ops
.
indice_conv_backward
(
features
,
filters
,
grad_output
,
...
...
@@ -175,9 +221,10 @@ class SubMConvFunction(Function):
indice_pair_num
,
False
,
True
,
algo
=
ctx
.
algo
)
algo
=
ctx
.
algo
,
timer
=
timer
)
return
input_bp
,
filters_bp
,
None
,
None
,
None
,
None
return
input_bp
,
filters_bp
,
None
,
None
,
None
,
None
,
None
class
SparseMaxPoolFunction
(
Function
):
...
...
@@ -199,12 +246,14 @@ class SparseMaxPoolFunction(Function):
indice_pairs
,
indice_pair_num
)
return
input_bp
,
None
,
None
,
None
class
SparseMaxPoolImplicitGemmFunction
(
Function
):
@
staticmethod
@
amp
.
custom_fwd
(
cast_inputs
=
torch
.
float16
)
def
forward
(
ctx
,
features
:
torch
.
Tensor
,
indice_pairs_fwd
:
torch
.
Tensor
,
indice_pairs_bwd
:
torch
.
Tensor
,
num_activate_out
:
int
):
out
=
ops
.
indice_maxpool_implicit_gemm
(
features
,
indice_pairs_fwd
,
num_activate_out
)
def
forward
(
ctx
,
features
:
torch
.
Tensor
,
indice_pairs_fwd
:
torch
.
Tensor
,
indice_pairs_bwd
:
torch
.
Tensor
,
num_activate_out
:
int
):
out
=
ops
.
indice_maxpool_implicit_gemm
(
features
,
indice_pairs_fwd
,
num_activate_out
)
ctx
.
save_for_backward
(
indice_pairs_bwd
,
features
,
out
)
return
out
...
...
@@ -213,10 +262,11 @@ class SparseMaxPoolImplicitGemmFunction(Function):
@
amp
.
custom_bwd
def
backward
(
ctx
,
grad_output
):
indice_pairs_bwd
,
features
,
out
=
ctx
.
saved_tensors
input_bp
=
ops
.
indice_maxpool_implicit_gemm_backward
(
features
,
out
,
grad_output
,
indice_pairs_bwd
)
input_bp
=
ops
.
indice_maxpool_implicit_gemm_backward
(
features
,
out
,
grad_output
,
indice_pairs_bwd
)
return
input_bp
,
None
,
None
,
None
indice_conv
=
SparseConvFunction
.
apply
implicit_gemm
=
SparseImplicitGemmFunction
.
apply
indice_inverse_conv
=
SparseInverseConvFunction
.
apply
...
...
spconv/pytorch/modules.py
View file @
82fd7a8b
# Copyright 2021 Yan Yan
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
sys
import
time
from
collections
import
OrderedDict
...
...
@@ -53,6 +52,7 @@ class SparseModule(nn.Module):
def
__init__
(
self
,
name
=
None
):
super
().
__init__
()
self
.
name
=
name
self
.
_sparse_unique_name
=
""
class
SparseSequential
(
SparseModule
):
...
...
@@ -143,3 +143,8 @@ class SparseSequential(SparseModule):
input
=
module
(
input
)
return
input
def
assign_name_for_sparse_modules
(
module
:
nn
.
Module
):
for
k
,
n
in
module
.
named_modules
():
if
isinstance
(
n
,
SparseModule
):
n
.
_sparse_unique_name
=
k
spconv/pytorch/ops.py
View file @
82fd7a8b
...
...
@@ -26,14 +26,19 @@ from spconv.pytorch.cppcore import torch_tensor_to_tv, get_current_stream
from
spconv.core_cc.csrc.sparse.all
import
SpconvOps
import
spconv.core_cc
as
_ext
from
spconv.utils
import
nullcontext
if
hasattr
(
_ext
,
"cumm"
):
CPU_ONLY_BUILD
=
False
from
spconv.algo
import
GEMM
,
CONV
# , GATHER, SCATTER
else
:
GEMM
=
None
CONV
=
None
CPU_ONLY_BUILD
=
True
GEMM
=
None
CONV
=
None
import
time
from
spconv.constants
import
FILTER_HWIO
from
cumm.gemm
import
codeops
from
spconv.tools
import
CUDAKernelTimer
DEBUG
=
False
...
...
@@ -240,19 +245,21 @@ def get_indice_pairs(indices: torch.Tensor,
return
out_inds
,
pair
,
indice_num_per_loc
def
get_indice_pairs_implicit_gemm
(
indices
:
torch
.
Tensor
,
batch_size
:
int
,
spatial_shape
:
List
[
int
],
algo
:
ConvAlgo
,
ksize
:
List
[
int
],
stride
:
List
[
int
],
padding
:
List
[
int
],
dilation
:
List
[
int
],
out_padding
:
List
[
int
],
subm
:
bool
=
False
,
transpose
:
bool
=
False
,
is_train
:
bool
=
True
,
alloc
:
Optional
[
ThrustSortAllocator
]
=
None
):
def
get_indice_pairs_implicit_gemm
(
indices
:
torch
.
Tensor
,
batch_size
:
int
,
spatial_shape
:
List
[
int
],
algo
:
ConvAlgo
,
ksize
:
List
[
int
],
stride
:
List
[
int
],
padding
:
List
[
int
],
dilation
:
List
[
int
],
out_padding
:
List
[
int
],
subm
:
bool
=
False
,
transpose
:
bool
=
False
,
is_train
:
bool
=
True
,
alloc
:
Optional
[
ThrustSortAllocator
]
=
None
,
timer
:
CUDAKernelTimer
=
CUDAKernelTimer
(
False
)):
"""
Why return tuple? because pytorch seems don't support custom object in autograd.
return: (
...
...
@@ -336,18 +343,18 @@ def get_indice_pairs_implicit_gemm(indices: torch.Tensor,
out_inds_tv
=
torch_tensor_to_tv
(
out_inds
)
hashdata_tv
=
torch_tensor_to_tv
(
hashdata
,
dtype
=
tv
.
custom64
)
pair_mask_tv
=
torch_tensor_to_tv
(
pair_mask
,
dtype
=
tv
.
uint32
)
SpconvOps
.
generate_subm_conv_inds
(
inds_tv
,
hashdata_tv
,
pair_tv
,
out_inds_tv
,
indice_num_per_loc_tv
,
batch_size
=
batch_size
,
input_dims
=
spatial_shape
,
ksize
=
ksize
,
dilation
=
dilation
,
indice_pair_mask
=
pair_mask_tv
,
stream_int
=
stream
)
with
timer
.
record
(
"gen_subm_inds"
,
stream
):
SpconvOps
.
generate_subm_conv_inds
(
inds_tv
,
hashdata_tv
,
pair_tv
,
out_inds_tv
,
indice_num_per_loc_tv
,
batch_size
=
batch_size
,
input_dims
=
spatial_shape
,
ksize
=
ksize
,
dilation
=
dilation
,
indice_pair_mask
=
pair_mask_tv
,
stream_int
=
stream
)
# torch.cuda.synchronize()
# print("SUBM0", time.time() - t)
# CONV.stream_synchronize(stream)
...
...
@@ -358,13 +365,15 @@ def get_indice_pairs_implicit_gemm(indices: torch.Tensor,
mask_argsort_tv
=
torch_tensor_to_tv
(
mask_argsort
)
if
alloc
is
None
:
alloc
=
ThrustSortAllocator
(
indices
.
device
)
for
j
in
range
(
mask_split_count
):
# thrust don't provide two-step sort (first step return workspace size)
# so I use this stupid hack to use torch allocator without touch
# pytorch binary (c++).
# f**k thrust
SpconvOps
.
sort_1d_by_key_allocator
(
pair_mask_tv
[
j
],
alloc
.
alloc
,
mask_argsort_tv
[
j
],
stream
)
with
timer
.
record
(
"gen_subm_inds_sort"
,
stream
):
for
j
in
range
(
mask_split_count
):
# thrust don't provide two-step sort (first step return workspace size)
# so I use this stupid hack to use torch allocator without touch
# pytorch binary (c++).
# f**k thrust
SpconvOps
.
sort_1d_by_key_allocator
(
pair_mask_tv
[
j
],
alloc
.
alloc
,
mask_argsort_tv
[
j
],
stream
)
# CONV.stream_synchronize(stream)
pair_mask_in_splits
=
[
pair_mask
[
i
]
for
i
in
range
(
mask_split_count
)]
mask_argsort_in_splits
=
[
...
...
@@ -391,20 +400,20 @@ def get_indice_pairs_implicit_gemm(indices: torch.Tensor,
dtype
=
indices
.
dtype
,
device
=
indices
.
device
)
indice_pairs_uniq_tv
=
torch_tensor_to_tv
(
indice_pairs_uniq
)
SpconvOps
.
generate_conv_inds_mask_stage1
(
inds_tv
,
pair_bwd_tv
,
indice_pairs_uniq_tv
,
indice_num_per_loc_tv
,
batch_size
=
batch_size
,
output_dims
=
out_shape
,
input_dims
=
spatial_shape
,
ksize
=
ksize
,
stride
=
stride
,
padding
=
padding
,
dilation
=
dilation
,
transposed
=
transpose
,
stream_int
=
stream
)
with
timer
.
record
(
"gen_conv_inds_stage1"
,
stream
):
SpconvOps
.
generate_conv_inds_mask_stage1
(
inds_tv
,
pair_bwd_tv
,
indice_pairs_uniq_tv
,
indice_num_per_loc_tv
,
batch_size
=
batch_size
,
output_dims
=
out_shape
,
input_dims
=
spatial_shape
,
ksize
=
ksize
,
stride
=
stride
,
padding
=
padding
,
dilation
=
dilation
,
transposed
=
transpose
,
stream_int
=
stream
)
if
DEBUG
:
CONV
.
stream_synchronize
(
stream
)
...
...
@@ -452,25 +461,25 @@ def get_indice_pairs_implicit_gemm(indices: torch.Tensor,
CONV
.
stream_synchronize
(
stream
)
print
(
"REGU_S2_PREPARE"
,
time
.
time
()
-
t
)
t
=
time
.
time
()
SpconvOps
.
generate_conv_inds_mask_stage2
(
inds_tv
,
hashdata_tv
,
pair_fwd_tv
,
pair_bwd_tv
,
uniq_res_tv
,
out_inds_tv
,
pair_mask_fwd_tv
,
pair_mask_bwd_tv
,
num_out_act
=
num_act_out
,
batch_size
=
batch_size
,
output_dims
=
out_shape
,
input_dims
=
spatial_shape
,
ksize
=
ksize
,
stride
=
stride
,
padding
=
padding
,
dilation
=
dilation
,
transposed
=
transpose
,
stream_int
=
stream
)
with
timer
.
record
(
"gen_conv_inds_stage2"
,
stream
):
SpconvOps
.
generate_conv_inds_mask_stage2
(
inds_tv
,
hashdata_tv
,
pair_fwd_tv
,
pair_bwd_tv
,
uniq_res_tv
,
out_inds_tv
,
pair_mask_fwd_tv
,
pair_mask_bwd_tv
,
num_out_act
=
num_act_out
,
batch_size
=
batch_size
,
output_dims
=
out_shape
,
input_dims
=
spatial_shape
,
ksize
=
ksize
,
stride
=
stride
,
padding
=
padding
,
dilation
=
dilation
,
transposed
=
transpose
,
stream_int
=
stream
)
if
DEBUG
:
CONV
.
stream_synchronize
(
stream
)
...
...
@@ -492,62 +501,61 @@ def get_indice_pairs_implicit_gemm(indices: torch.Tensor,
mask_argsort_bwd_tv
=
torch_tensor_to_tv
(
mask_argsort_bwd
)
if
alloc
is
None
:
alloc
=
ThrustSortAllocator
(
indices
.
device
)
if
is_mask_split
:
for
j
in
range
(
mask_split_count
):
mask_tv
=
tv
.
from_numpy
(
masks
[
j
])
# here we try to ensure only call allocator once.
if
not
is_train
:
SpconvOps
.
sort_1d_by_key_split_allocator
(
pair_mask_fwd_tv
[
j
],
alloc
.
alloc
,
mask_tv
,
mask_argsort_fwd_tv
[
j
],
stream
)
else
:
if
pair_mask_bwd_tv
.
dim
(
1
)
>
pair_mask_fwd_tv
.
dim
(
1
):
SpconvOps
.
sort_1d_by_key_split_allocator
(
pair_mask_bwd_tv
[
j
],
alloc
.
alloc
,
mask_tv
,
mask_argsort_bwd_tv
[
j
],
stream
)
with
timer
.
record
(
"gen_conv_inds_sort"
,
stream
):
if
is_mask_split
:
for
j
in
range
(
mask_split_count
):
mask_tv
=
tv
.
from_numpy
(
masks
[
j
])
# here we try to ensure only call allocator once.
if
not
is_train
:
SpconvOps
.
sort_1d_by_key_split_allocator
(
pair_mask_fwd_tv
[
j
],
alloc
.
alloc
,
mask_tv
,
mask_argsort_fwd_tv
[
j
],
stream
)
else
:
SpconvOps
.
sort_1d_by_key_split_allocator
(
pair_mask_fwd_tv
[
j
],
alloc
.
alloc
,
mask_tv
,
mask_argsort_fwd_tv
[
j
],
stream
)
SpconvOps
.
sort_1d_by_key_split_allocator
(
pair_mask_bwd_tv
[
j
],
alloc
.
alloc
,
mask_tv
,
mask_argsort_bwd_tv
[
j
],
stream
)
# SpconvOps.sort_1d_by_key_split(pair_mask_fwd_tv[j], mask_tv,
# mask_argsort_fwd_tv[j], stream)
# if is_train:
# SpconvOps.sort_1d_by_key_split(pair_mask_bwd_tv[j],
# mask_tv,
# mask_argsort_bwd_tv[j],
# stream)
if
pair_mask_bwd_tv
.
dim
(
1
)
>
pair_mask_fwd_tv
.
dim
(
1
):
SpconvOps
.
sort_1d_by_key_split_allocator
(
pair_mask_bwd_tv
[
j
],
alloc
.
alloc
,
mask_tv
,
mask_argsort_bwd_tv
[
j
],
stream
)
SpconvOps
.
sort_1d_by_key_split_allocator
(
pair_mask_fwd_tv
[
j
],
alloc
.
alloc
,
mask_tv
,
mask_argsort_fwd_tv
[
j
],
stream
)
else
:
SpconvOps
.
sort_1d_by_key_split_allocator
(
pair_mask_fwd_tv
[
j
],
alloc
.
alloc
,
mask_tv
,
mask_argsort_fwd_tv
[
j
],
stream
)
SpconvOps
.
sort_1d_by_key_split_allocator
(
pair_mask_bwd_tv
[
j
],
alloc
.
alloc
,
mask_tv
,
mask_argsort_bwd_tv
[
j
],
stream
)
# SpconvOps.sort_1d_by_key_split(pair_mask_fwd_tv[j], mask_tv,
# mask_argsort_fwd_tv[j], stream)
# if is_train:
# SpconvOps.sort_1d_by_key_split(pair_mask_bwd_tv[j],
# mask_tv,
# mask_argsort_bwd_tv[j],
# stream)
else
:
# if pair_mask_bwd_tv.dim(1) > pair_mask_fwd_tv.dim(1):
if
not
is_train
:
SpconvOps
.
sort_1d_by_key_allocator
(
pair_mask_fwd_tv
[
0
],
alloc
.
alloc
,
mask_argsort_fwd_tv
[
0
],
stream
)
else
:
if
pair_mask_bwd_tv
.
dim
(
1
)
>
pair_mask_fwd_tv
.
dim
(
1
):
SpconvOps
.
sort_1d_by_key_allocator
(
pair_mask_bwd_tv
[
0
],
alloc
.
alloc
,
mask_argsort_bwd_tv
[
0
],
stream
)
# if pair_mask_bwd_tv.dim(1) > pair_mask_fwd_tv.dim(1):
if
not
is_train
:
SpconvOps
.
sort_1d_by_key_allocator
(
pair_mask_fwd_tv
[
0
],
alloc
.
alloc
,
mask_argsort_fwd_tv
[
0
],
stream
)
alloc
.
alloc
,
mask_argsort_fwd_tv
[
0
],
stream
)
else
:
SpconvOps
.
sort_1d_by_key_allocator
(
pair_mask_fwd_tv
[
0
],
alloc
.
alloc
,
mask_argsort_fwd_tv
[
0
],
stream
)
SpconvOps
.
sort_1d_by_key_allocator
(
pair_mask_bwd_tv
[
0
],
alloc
.
alloc
,
mask_argsort_bwd_tv
[
0
],
stream
)
if
pair_mask_bwd_tv
.
dim
(
1
)
>
pair_mask_fwd_tv
.
dim
(
1
):
SpconvOps
.
sort_1d_by_key_allocator
(
pair_mask_bwd_tv
[
0
],
alloc
.
alloc
,
mask_argsort_bwd_tv
[
0
],
stream
)
SpconvOps
.
sort_1d_by_key_allocator
(
pair_mask_fwd_tv
[
0
],
alloc
.
alloc
,
mask_argsort_fwd_tv
[
0
],
stream
)
else
:
SpconvOps
.
sort_1d_by_key_allocator
(
pair_mask_fwd_tv
[
0
],
alloc
.
alloc
,
mask_argsort_fwd_tv
[
0
],
stream
)
SpconvOps
.
sort_1d_by_key_allocator
(
pair_mask_bwd_tv
[
0
],
alloc
.
alloc
,
mask_argsort_bwd_tv
[
0
],
stream
)
if
DEBUG
:
CONV
.
stream_synchronize
(
stream
)
print
(
"REGU_S2_FINISH"
,
time
.
time
()
-
t
)
...
...
@@ -587,7 +595,8 @@ def indice_conv(features: torch.Tensor,
num_activate_out
:
int
,
inverse
:
bool
=
False
,
subm
:
bool
=
False
,
algo
:
ConvAlgo
=
ConvAlgo
.
Native
):
algo
:
ConvAlgo
=
ConvAlgo
.
Native
,
timer
:
CUDAKernelTimer
=
CUDAKernelTimer
(
False
)):
# filters: RSKC
# stream = get_current_stream()
# CONV.stream_synchronize(stream)
...
...
@@ -717,38 +726,38 @@ def indice_conv(features: torch.Tensor,
stream
=
stream
)
# CONV.stream_synchronize(stream)
# t = time.time()
for
i
,
nhot
in
enumerate
(
indice_pair_num_cpu
):
if
subm
and
i
==
kv_center
:
continue
if
subm
and
i
>
kv_center
:
nhot
=
indice_pair_num_cpu
[
kv
-
i
-
1
]
if
nhot
<=
0
:
continue
inp_indices
=
pair_in
[
i
].
slice_first_axis
(
0
,
nhot
)
out_indices
=
pair_out
[
i
].
slice_first_axis
(
0
,
nhot
)
b
=
filters_tv
[
i
]
# inp @ filter.T, NC @ KC
beta
=
1.0
if
inited
else
0.0
algo_desp
=
GEMM
.
run_with_tuned_result
(
tuned_res
,
a
,
b
,
c
,
False
,
False
if
FILTER_HWIO
else
True
,
False
,
arch
=
arch
,
stream
=
stream
,
shuffle_type
=
ShuffleStrideType
.
ShuffleAC
,
a_inds
=
inp_indices
,
c_inds
=
out_indices
,
hint
=
AlgoHint
.
Fowrard
.
value
,
alpha
=
1.0
,
beta
=
beta
)
# gather_times += gather_time
inited
=
True
with
timer
.
record
(
"forward"
,
stream
):
for
i
,
nhot
in
enumerate
(
indice_pair_num_cpu
):
if
subm
and
i
==
kv_center
:
continue
if
subm
and
i
>
kv_center
:
nhot
=
indice_pair_num_cpu
[
kv
-
i
-
1
]
if
nhot
<=
0
:
continue
inp_indices
=
pair_in
[
i
].
slice_first_axis
(
0
,
nhot
)
out_indices
=
pair_out
[
i
].
slice_first_axis
(
0
,
nhot
)
b
=
filters_tv
[
i
]
# inp @ filter.T, NC @ KC
beta
=
1.0
if
inited
else
0.0
algo_desp
=
GEMM
.
run_with_tuned_result
(
tuned_res
,
a
,
b
,
c
,
False
,
False
if
FILTER_HWIO
else
True
,
False
,
arch
=
arch
,
stream
=
stream
,
shuffle_type
=
ShuffleStrideType
.
ShuffleAC
,
a_inds
=
inp_indices
,
c_inds
=
out_indices
,
hint
=
AlgoHint
.
Fowrard
.
value
,
alpha
=
1.0
,
beta
=
beta
)
# gather_times += gather_time
inited
=
True
# CONV.stream_synchronize(stream)
# print(out_features.mean(), out_features.max(), out_features.min())
...
...
@@ -770,7 +779,8 @@ def indice_conv_backward(features: torch.Tensor,
indice_pair_num
:
torch
.
Tensor
,
inverse
:
bool
=
False
,
subm
:
bool
=
False
,
algo
:
ConvAlgo
=
ConvAlgo
.
Native
):
algo
:
ConvAlgo
=
ConvAlgo
.
Native
,
timer
:
CUDAKernelTimer
=
CUDAKernelTimer
(
False
)):
# print(out_bp.mean(), out_bp.max(), out_bp.min())
num_activate_out
=
out_bp
.
shape
[
0
]
...
...
@@ -1046,12 +1056,16 @@ def indice_conv_backward(features: torch.Tensor,
return
(
din
,
dfilters
.
reshape
(
filters_shape
))
def
implicit_gemm
(
features
:
torch
.
Tensor
,
filters
:
torch
.
Tensor
,
def
implicit_gemm
(
features
:
torch
.
Tensor
,
filters
:
torch
.
Tensor
,
pair_fwd
:
torch
.
Tensor
,
pair_mask_fwd_splits
:
List
[
torch
.
Tensor
],
mask_argsort_fwd_splits
:
List
[
torch
.
Tensor
],
num_activate_out
:
int
,
masks
:
List
[
np
.
ndarray
],
is_train
:
bool
,
is_subm
:
bool
):
num_activate_out
:
int
,
masks
:
List
[
np
.
ndarray
],
is_train
:
bool
,
is_subm
:
bool
,
timer
:
CUDAKernelTimer
=
CUDAKernelTimer
(
False
)):
stream
=
get_current_stream
()
# if DEBUG:
...
...
@@ -1136,24 +1150,25 @@ def implicit_gemm(features: torch.Tensor, filters: torch.Tensor,
# CONV.stream_synchronize(stream)
# t = time.time()
for
j
in
range
(
num_split
):
beta
=
0
if
j
==
0
else
1
CONV
.
run_with_tuned_result
(
tune_res
,
ConvOpType
.
kForward
,
features_tv
,
filters_tv
,
out_features_tv
,
mask
=
pair_mask_fwd_split_tvs
[
j
],
mask_argsort
=
mask_argsort_fwd_split_tvs
[
j
],
mask_output
=
mask_output_fwd_tvs
[
j
],
indices
=
pair_fwd_tv
,
reverse_mask
=
False
,
mask_filter
=
masks_ints
[
j
],
mask_width
=-
1
,
beta
=
beta
,
stream
=
stream
,
verbose
=
False
)
with
timer
.
record
(
"implicit_gemm"
,
stream
):
for
j
in
range
(
num_split
):
beta
=
0
if
j
==
0
else
1
CONV
.
run_with_tuned_result
(
tune_res
,
ConvOpType
.
kForward
,
features_tv
,
filters_tv
,
out_features_tv
,
mask
=
pair_mask_fwd_split_tvs
[
j
],
mask_argsort
=
mask_argsort_fwd_split_tvs
[
j
],
mask_output
=
mask_output_fwd_tvs
[
j
],
indices
=
pair_fwd_tv
,
reverse_mask
=
False
,
mask_filter
=
masks_ints
[
j
],
mask_width
=-
1
,
beta
=
beta
,
stream
=
stream
,
verbose
=
False
)
# torch.cuda.synchronize()
# if DEBUG:
...
...
@@ -1166,16 +1181,20 @@ def implicit_gemm(features: torch.Tensor, filters: torch.Tensor,
return
out_features
,
mask_output_fwd
,
mask_width
def
implicit_gemm_backward
(
features
:
torch
.
Tensor
,
filters
:
torch
.
Tensor
,
out_bp
:
torch
.
Tensor
,
pair_fwd
:
torch
.
Tensor
,
def
implicit_gemm_backward
(
features
:
torch
.
Tensor
,
filters
:
torch
.
Tensor
,
out_bp
:
torch
.
Tensor
,
pair_fwd
:
torch
.
Tensor
,
pair_bwd
:
torch
.
Tensor
,
pair_mask_fwd_splits
:
List
[
torch
.
Tensor
],
pair_mask_bwd_splits
:
List
[
torch
.
Tensor
],
mask_argsort_fwd_splits
:
List
[
torch
.
Tensor
],
mask_argsort_bwd_splits
:
List
[
torch
.
Tensor
],
mask_output_fwd
:
torch
.
Tensor
,
masks
:
List
[
np
.
ndarray
],
mask_width
:
int
,
is_subm
:
bool
):
masks
:
List
[
np
.
ndarray
],
mask_width
:
int
,
is_subm
:
bool
,
timer
:
CUDAKernelTimer
=
CUDAKernelTimer
(
False
)):
# print(out_bp.mean(), out_bp.max(), out_bp.min())
if
features
.
dtype
==
torch
.
int8
or
features
.
dtype
==
torch
.
qint8
:
raise
NotImplementedError
(
"work in progress"
)
...
...
@@ -1287,44 +1306,46 @@ def implicit_gemm_backward(features: torch.Tensor, filters: torch.Tensor,
dtype
=
torch
.
int8
,
device
=
features
.
device
)
workspace_tv
=
torch_tensor_to_tv
(
workspace
)
for
j
in
range
(
num_split
):
beta
=
0
if
j
==
0
else
1
if
is_subm
:
mask
=
pair_mask_fwd_split_tvs
[
j
]
mask_argsort
=
mask_argsort_fwd_split_tvs
[
j
]
else
:
mask
=
pair_mask_bwd_split_tvs
[
j
]
mask_argsort
=
mask_argsort_bwd_split_tvs
[
j
]
CONV
.
run_with_tuned_result
(
dgrad_tune_res
,
ConvOpType
.
kBackwardInput
,
din_tv
,
filters_tv
,
dout_tv
,
mask
=
mask
,
mask_argsort
=
mask_argsort
,
mask_output
=
tv
.
Tensor
(),
indices
=
pair_bwd_tv
,
reverse_mask
=
is_subm
,
mask_filter
=
masks
[
j
].
item
(),
mask_width
=-
1
,
beta
=
beta
,
stream
=
stream
)
CONV
.
run_with_tuned_result
(
wgrad_tune_res
,
ConvOpType
.
kBackwardWeight
,
features_tv
,
dfilters_tv
,
dout_tv
,
mask
=
mask_output_fwd_tv
[
j
],
mask_argsort
=
mask_argsort_fwd_split_tvs
[
j
],
mask_output
=
tv
.
Tensor
(),
indices
=
pair_fwd_tv
,
reverse_mask
=
False
,
mask_filter
=
masks
[
j
].
item
(),
mask_width
=
mask_width
,
beta
=
beta
,
workspace
=
workspace_tv
,
stream
=
stream
)
with
timer
.
record
(
"implicit_gemm_backward"
,
stream
):
for
j
in
range
(
num_split
):
beta
=
0
if
j
==
0
else
1
if
is_subm
:
mask
=
pair_mask_fwd_split_tvs
[
j
]
mask_argsort
=
mask_argsort_fwd_split_tvs
[
j
]
else
:
mask
=
pair_mask_bwd_split_tvs
[
j
]
mask_argsort
=
mask_argsort_bwd_split_tvs
[
j
]
CONV
.
run_with_tuned_result
(
dgrad_tune_res
,
ConvOpType
.
kBackwardInput
,
din_tv
,
filters_tv
,
dout_tv
,
mask
=
mask
,
mask_argsort
=
mask_argsort
,
mask_output
=
tv
.
Tensor
(),
indices
=
pair_bwd_tv
,
reverse_mask
=
is_subm
,
mask_filter
=
masks
[
j
].
item
(),
mask_width
=-
1
,
beta
=
beta
,
stream
=
stream
)
CONV
.
run_with_tuned_result
(
wgrad_tune_res
,
ConvOpType
.
kBackwardWeight
,
features_tv
,
dfilters_tv
,
dout_tv
,
mask
=
mask_output_fwd_tv
[
j
],
mask_argsort
=
mask_argsort_fwd_split_tvs
[
j
],
mask_output
=
tv
.
Tensor
(),
indices
=
pair_fwd_tv
,
reverse_mask
=
False
,
mask_filter
=
masks
[
j
].
item
(),
mask_width
=
mask_width
,
beta
=
beta
,
workspace
=
workspace_tv
,
stream
=
stream
)
return
(
din
,
dfilters
.
reshape
(
filters_shape
))
...
...
@@ -1445,4 +1466,3 @@ def indice_maxpool_implicit_gemm_backward(features, out_features, out_bp,
out_bp_tv
,
din_tv
,
indice_pairs_tv
,
stream
)
return
din
spconv/pytorch/pool.py
View file @
82fd7a8b
...
...
@@ -24,11 +24,12 @@ from typing import List, Optional, Tuple, Union
from
spconv
import
pytorch
as
spconv
from
spconv.core
import
ConvAlgo
import
spconv.pytorch
.
functional
as
Fsp
from
spconv.pytorch
import
functional
as
Fsp
from
spconv.pytorch
import
ops
from
spconv.pytorch.core
import
IndiceData
,
ImplicitGemmIndiceData
from
spconv.pytorch.modules
import
SparseModule
from
spconv.cppconstants
import
CPU_ONLY_BUILD
from
spconv.utils
import
nullcontext
class
SparseMaxPool
(
SparseModule
):
...
...
@@ -126,79 +127,87 @@ class SparseMaxPool(SparseModule):
if
input
.
benchmark
:
torch
.
cuda
.
synchronize
()
t
=
time
.
time
()
out_padding
=
[
0
]
*
self
.
ndim
out_padding
=
[
0
]
*
self
.
ndim
indice_dict
=
input
.
indice_dict
.
copy
()
if
self
.
algo
==
ConvAlgo
.
Native
:
outids
,
indice_pairs
,
indice_pairs_num
=
ops
.
get_indice_pairs
(
indices
,
batch_size
,
spatial_shape
,
ConvAlgo
.
Native
,
self
.
kernel_size
,
self
.
stride
,
self
.
padding
,
self
.
dilation
,
out_padding
,
False
)
if
input
.
benchmark
:
torch
.
cuda
.
synchronize
()
interval
=
time
.
time
()
-
t
out_tensor
.
benchmark_record
[
self
.
name
][
"indice_gen_time"
].
append
(
interval
)
t
=
time
.
time
()
profile_ctx
=
nullcontext
()
if
input
.
_timer
is
not
None
and
self
.
_sparse_unique_name
:
profile_ctx
=
input
.
_timer
.
namespace
(
self
.
_sparse_unique_name
)
with
profile_ctx
:
if
self
.
algo
==
ConvAlgo
.
Native
:
outids
,
indice_pairs
,
indice_pairs_num
=
ops
.
get_indice_pairs
(
indices
,
batch_size
,
spatial_shape
,
ConvAlgo
.
Native
,
self
.
kernel_size
,
self
.
stride
,
self
.
padding
,
self
.
dilation
,
out_padding
,
False
)
if
input
.
benchmark
:
torch
.
cuda
.
synchronize
()
interval
=
time
.
time
()
-
t
out_tensor
.
benchmark_record
[
self
.
name
][
"indice_gen_time"
].
append
(
interval
)
t
=
time
.
time
()
if
self
.
indice_key
is
not
None
:
datas
=
input
.
find_indice_pair
(
self
.
indice_key
)
if
datas
is
None
:
indice_data
=
IndiceData
(
outids
,
indices
,
indice_pairs
,
indice_pairs_num
,
spatial_shape
,
is_subm
=
False
,
algo
=
self
.
algo
)
indice_dict
[
self
.
indice_key
]
=
indice_data
else
:
raise
ValueError
(
f
"indice key
{
self
.
indice_key
}
exists"
)
if
self
.
indice_key
is
not
None
:
datas
=
input
.
find_indice_pair
(
self
.
indice_key
)
if
datas
is
None
:
indice_data
=
IndiceData
(
outids
,
indices
,
indice_pairs
,
indice_pairs_num
,
spatial_shape
,
is_subm
=
False
,
algo
=
self
.
algo
)
indice_dict
[
self
.
indice_key
]
=
indice_data
else
:
raise
ValueError
(
f
"indice key
{
self
.
indice_key
}
exists"
)
out_features
=
Fsp
.
indice_maxpool
(
features
,
indice_pairs
.
to
(
device
),
indice_pairs_num
.
to
(
device
),
outids
.
shape
[
0
])
else
:
res
=
ops
.
get_indice_pairs_implicit_gemm
(
indices
,
batch_size
,
spatial_shape
,
self
.
algo
,
ksize
=
self
.
kernel_size
,
stride
=
self
.
stride
,
padding
=
self
.
padding
,
dilation
=
self
.
dilation
,
out_padding
=
out_padding
,
subm
=
self
.
subm
,
is_train
=
self
.
training
,
alloc
=
input
.
thrust_allocator
)
outids
=
res
[
0
]
num_inds_per_loc
=
res
[
1
]
pair_fwd
=
res
[
2
]
pair_bwd
=
res
[
3
]
pair_mask_fwd_splits
=
res
[
4
]
pair_mask_bwd_splits
=
res
[
5
]
mask_argsort_fwd_splits
=
res
[
6
]
mask_argsort_bwd_splits
=
res
[
7
]
masks
=
res
[
8
]
if
self
.
indice_key
is
not
None
:
indice_data
=
ImplicitGemmIndiceData
(
outids
,
indices
,
pair_fwd
,
pair_bwd
,
pair_mask_fwd_splits
=
pair_mask_fwd_splits
,
pair_mask_bwd_splits
=
pair_mask_bwd_splits
,
mask_argsort_fwd_splits
=
mask_argsort_fwd_splits
,
mask_argsort_bwd_splits
=
mask_argsort_bwd_splits
,
masks
=
masks
,
is_subm
=
self
.
subm
,
out_spatial_shape
=
out_spatial_shape
,
algo
=
self
.
algo
)
msg
=
f
"your indice key
{
self
.
indice_key
}
already exists in this sparse tensor."
assert
self
.
indice_key
not
in
indice_dict
,
msg
indice_dict
[
self
.
indice_key
]
=
indice_data
out_features
=
Fsp
.
indice_maxpool_implicit_gemm
(
features
,
pair_fwd
,
pair_bwd
,
outids
.
shape
[
0
])
out_features
=
Fsp
.
indice_maxpool
(
features
,
indice_pairs
.
to
(
device
),
indice_pairs_num
.
to
(
device
),
outids
.
shape
[
0
])
else
:
with
input
.
_timer
.
namespace
(
"gen_pairs"
):
res
=
ops
.
get_indice_pairs_implicit_gemm
(
indices
,
batch_size
,
spatial_shape
,
self
.
algo
,
ksize
=
self
.
kernel_size
,
stride
=
self
.
stride
,
padding
=
self
.
padding
,
dilation
=
self
.
dilation
,
out_padding
=
out_padding
,
subm
=
self
.
subm
,
is_train
=
self
.
training
,
alloc
=
input
.
thrust_allocator
,
timer
=
input
.
_timer
)
outids
=
res
[
0
]
num_inds_per_loc
=
res
[
1
]
pair_fwd
=
res
[
2
]
pair_bwd
=
res
[
3
]
pair_mask_fwd_splits
=
res
[
4
]
pair_mask_bwd_splits
=
res
[
5
]
mask_argsort_fwd_splits
=
res
[
6
]
mask_argsort_bwd_splits
=
res
[
7
]
masks
=
res
[
8
]
if
self
.
indice_key
is
not
None
:
indice_data
=
ImplicitGemmIndiceData
(
outids
,
indices
,
pair_fwd
,
pair_bwd
,
pair_mask_fwd_splits
=
pair_mask_fwd_splits
,
pair_mask_bwd_splits
=
pair_mask_bwd_splits
,
mask_argsort_fwd_splits
=
mask_argsort_fwd_splits
,
mask_argsort_bwd_splits
=
mask_argsort_bwd_splits
,
masks
=
masks
,
is_subm
=
self
.
subm
,
out_spatial_shape
=
out_spatial_shape
,
algo
=
self
.
algo
)
msg
=
f
"your indice key
{
self
.
indice_key
}
already exists in this sparse tensor."
assert
self
.
indice_key
not
in
indice_dict
,
msg
indice_dict
[
self
.
indice_key
]
=
indice_data
out_features
=
Fsp
.
indice_maxpool_implicit_gemm
(
features
,
pair_fwd
,
pair_bwd
,
outids
.
shape
[
0
])
if
input
.
benchmark
:
torch
.
cuda
.
synchronize
()
...
...
spconv/pytorch/spatial.py
View file @
82fd7a8b
# Copyright 2021 Yan Yan
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
...
spconv/pytorch/tables.py
View file @
82fd7a8b
# Copyright 2021 Yan Yan
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
...
@@ -15,18 +15,18 @@
import
torch
from
torch.autograd
import
Function
import
spconv.pytorch
as
spconv
#from torch.nn import Module
from
spconv.pytorch.modules
import
SparseModule
from
spconv.pytorch.core
import
SparseConvTensor
from
typing
import
List
from
typing
import
List
class
JoinTable
(
SparseModule
):
# Module):
def
forward
(
self
,
input
:
List
[
SparseConvTensor
]):
output
=
spconv
.
SparseConvTensor
(
torch
.
cat
([
i
.
features
for
i
in
input
],
1
),
input
[
0
].
indices
,
input
[
0
].
spatial_shape
,
input
[
0
].
batch_size
,
input
[
0
].
grid
,
input
[
0
].
voxel_num
,
input
[
0
].
indice_dict
)
output
=
SparseConvTensor
(
torch
.
cat
([
i
.
features
for
i
in
input
],
1
),
input
[
0
].
indices
,
input
[
0
].
spatial_shape
,
input
[
0
].
batch_size
,
input
[
0
].
grid
,
input
[
0
].
voxel_num
,
input
[
0
].
indice_dict
)
output
.
benchmark_record
=
input
[
1
].
benchmark_record
output
.
thrust_allocator
=
input
[
1
].
thrust_allocator
return
output
...
...
@@ -37,10 +37,10 @@ class JoinTable(SparseModule): # Module):
class
AddTable
(
SparseModule
):
# Module):
def
forward
(
self
,
input
:
List
[
SparseConvTensor
]):
output
=
spconv
.
SparseConvTensor
(
sum
([
i
.
features
for
i
in
input
]),
input
[
0
].
indices
,
input
[
0
].
spatial_shape
,
input
[
0
].
batch_size
,
input
[
0
].
grid
,
input
[
0
].
voxel_num
,
input
[
0
].
indice_dict
)
output
=
SparseConvTensor
(
sum
([
i
.
features
for
i
in
input
]),
input
[
0
].
indices
,
input
[
0
].
spatial_shape
,
input
[
0
].
batch_size
,
input
[
0
].
grid
,
input
[
0
].
voxel_num
,
input
[
0
].
indice_dict
)
output
.
benchmark_record
=
input
[
1
].
benchmark_record
output
.
thrust_allocator
=
input
[
1
].
thrust_allocator
return
output
...
...
spconv/pytorch/utils.py
View file @
82fd7a8b
...
...
@@ -82,24 +82,25 @@ class PointToVoxel(object):
if
self
.
point_indice_data
.
shape
[
0
]
<
pc
.
shape
[
0
]:
self
.
point_indice_data
=
torch
.
empty
([
pc
.
shape
[
0
]],
dtype
=
torch
.
int64
,
device
=
self
.
device
)
dtype
=
torch
.
int64
,
device
=
self
.
device
)
pc_tv
=
torch_tensor_to_tv
(
pc
)
stream
=
get_current_stream
()
voxels_tv
=
torch_tensor_to_tv
(
self
.
voxels
)
indices_tv
=
torch_tensor_to_tv
(
self
.
indices
)
num_per_voxel_tv
=
torch_tensor_to_tv
(
self
.
num_per_voxel
)
hashdata_tv
=
torch_tensor_to_tv
(
self
.
hashdata
,
dtype
=
tv
.
custom128
,
shape
=
[
self
.
hashdata
.
shape
[
0
]])
point_indice_data_tv
=
torch_tensor_to_tv
(
self
.
point_indice_data
)
hashdata_tv
=
torch_tensor_to_tv
(
self
.
hashdata
,
dtype
=
tv
.
custom128
,
shape
=
[
self
.
hashdata
.
shape
[
0
]])
point_indice_data_tv
=
torch_tensor_to_tv
(
self
.
point_indice_data
)
res
=
SpconvOps
.
point2voxel_cuda
(
pc_tv
,
voxels_tv
,
indices_tv
,
num_per_voxel_tv
,
hashdata_tv
,
point_indice_data_tv
,
self
.
vsize
,
self
.
grid_size
,
self
.
grid_stride
,
self
.
coors_range
,
empty_mean
,
clear_voxels
,
stream
)
res
=
SpconvOps
.
point2voxel_cuda
(
pc_tv
,
voxels_tv
,
indices_tv
,
num_per_voxel_tv
,
hashdata_tv
,
point_indice_data_tv
,
self
.
vsize
,
self
.
grid_size
,
self
.
grid_stride
,
self
.
coors_range
,
empty_mean
,
clear_voxels
,
stream
)
num_voxels
=
res
[
0
].
shape
[
0
]
else
:
pc_tv
=
torch_tensor_to_tv
(
pc
)
...
...
@@ -111,8 +112,9 @@ class PointToVoxel(object):
res
=
SpconvOps
.
point2voxel_cpu
(
pc_tv
,
voxels_tv
,
indices_tv
,
num_per_voxel_tv
,
hashdata_tv
,
self
.
vsize
,
self
.
grid_size
,
self
.
grid_stride
,
self
.
coors_range
,
empty_mean
,
clear_voxels
)
self
.
grid_stride
,
self
.
coors_range
,
empty_mean
,
clear_voxels
)
num_voxels
=
res
[
0
].
shape
[
0
]
return
(
self
.
voxels
[:
num_voxels
],
self
.
indices
[:
num_voxels
],
...
...
spconv/test_utils.py
View file @
82fd7a8b
# Copyright 2021 Yan Yan
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
...
spconv/tools.py
0 → 100644
View file @
82fd7a8b
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
typing
import
Dict
from
spconv.cppconstants
import
CPU_ONLY_BUILD
import
contextlib
from
spconv.utils
import
nullcontext
if
not
CPU_ONLY_BUILD
:
from
cumm.tensorview
import
CUDAKernelTimer
as
_CUDAKernelTimer
class CUDAKernelTimer:
    """Optional front-end for cumm's ``CUDAKernelTimer``.

    The wrapper is only active when ``enable`` is truthy and the build is
    not CPU-only. When inactive, no underlying timer is created, the
    context-manager factories hand back a no-op context, and
    :meth:`get_all_pair_time` returns an empty dict.
    """

    def __init__(self, enable: bool = True) -> None:
        # Timing requires both the caller's request and a CUDA-capable build.
        self.enable = enable and not CPU_ONLY_BUILD
        self._timer = _CUDAKernelTimer(enable) if self.enable else None

    @contextlib.contextmanager
    def _namespace(self, name: str):
        """Push ``name`` on the underlying timer for the duration of the block.

        The matching ``pop`` always runs, even if the block raises.
        """
        assert self._timer is not None
        self._timer.push(name)
        try:
            yield
        finally:
            self._timer.pop()

    @contextlib.contextmanager
    def _record(self, name: str, stream: int = 0):
        """Time the enclosed block as a "start"/"stop" pair under ``name``.

        ``stream`` is forwarded unchanged to the underlying ``record`` calls
        (presumably a CUDA stream handle -- TODO confirm).
        """
        assert self._timer is not None
        self._timer.push(name)
        try:
            self._timer.insert_pair("", "start", "stop")
            self._timer.record("start", stream)
            yield
            # Only reached when the block finishes normally; an exception
            # skips the "stop" record but still pops the name below.
            self._timer.record("stop", stream)
        finally:
            self._timer.pop()

    def namespace(self, name: str):
        """Return a context manager scoping ``name``, or a no-op if disabled."""
        if not self.enable:
            return nullcontext()
        return self._namespace(name)

    def record(self, name: str, stream: int = 0):
        """Return a context manager timing its block, or a no-op if disabled."""
        if not self.enable:
            return nullcontext()
        return self._record(name, stream)

    def get_all_pair_time(self) -> Dict[str, float]:
        """Return the underlying timer's pair durations (``{}`` if disabled)."""
        if not self.enable:
            return {}
        assert self._timer is not None
        return self._timer.get_all_pair_duration()

    @staticmethod
    def collect_by_name(name: str, res: Dict[str, float]):
        """Keep the entries of ``res`` whose dotted key has ``name`` as a component.

        A key matches when ``name`` equals one of the pieces produced by
        splitting the key on ``"."``; substring matches do not count.
        """
        return {
            key: duration
            for key, duration in res.items()
            if name in key.split(".")
        }
spconv/utils/__init__.py
View file @
82fd7a8b
# Copyright 2021 Yan Yan
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
...
@@ -13,18 +13,37 @@
# limitations under the License.
import
numpy
as
np
from
cumm
import
tensorview
as
tv
from
cumm
import
tensorview
as
tv
from
contextlib
import
AbstractContextManager
from
spconv.cppconstants
import
CPU_ONLY_BUILD
from
spconv.core_cc.csrc.sparse.all.ops_cpu1d
import
Point2VoxelCPU
as
Point2VoxelCPU1d
from
spconv.core_cc.csrc.sparse.all.ops_cpu2d
import
Point2VoxelCPU
as
Point2VoxelCPU2d
from
spconv.core_cc.csrc.sparse.all.ops_cpu3d
import
Point2VoxelCPU
as
Point2VoxelCPU3d
from
spconv.core_cc.csrc.sparse.all.ops_cpu4d
import
Point2VoxelCPU
as
Point2VoxelCPU4d
import
spconv.core_cc.csrc.sparse.all
as
__all
IS_CPU_ONLY_BUILD
=
hasattr
(
__all
,
"ops1d"
)
if
IS_CPU_ONLY_BUILD
:
if
not
CPU_ONLY_BUILD
:
from
spconv.core_cc.csrc.sparse.all.ops1d
import
Point2Voxel
as
Point2VoxelGPU1d
from
spconv.core_cc.csrc.sparse.all.ops2d
import
Point2Voxel
as
Point2VoxelGPU2d
from
spconv.core_cc.csrc.sparse.all.ops3d
import
Point2Voxel
as
Point2VoxelGPU3d
from
spconv.core_cc.csrc.sparse.all.ops4d
import
Point2Voxel
as
Point2VoxelGPU4d
class nullcontext(AbstractContextManager):
    """No-op context manager.

    Lets code that only sometimes needs a real context manager use a single
    ``with`` statement:

        cm = optional_cm if condition else nullcontext()
        with cm:
            # Perform operation, using optional_cm if condition is True

    NOTE(review): appears to be a local copy of ``contextlib.nullcontext``
    for interpreters that lack it -- confirm before removing.
    """

    def __init__(self, enter_result=None):
        # Value handed to the ``as`` target of the ``with`` statement.
        self.enter_result = enter_result

    def __enter__(self):
        return self.enter_result

    def __exit__(self, *excinfo):
        # Returning None (falsy) lets any in-flight exception propagate.
        return None
test/aaa.py
deleted
100644 → 0
View file @
f31eee3a
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
STR
=
"""
BWG 0.0008761882781982422
BWG 0.0008311271667480469
BWG 0.002079486846923828
BWG 0.002329587936401367
BWG 0.0025458335876464844
BWG 0.0026700496673583984
BWG 0.002583742141723633
BWG 0.0025262832641601562
BWG 0.003481149673461914
BWG 0.003238201141357422
BWG 0.005095958709716797
BWG 0.0037899017333984375
BWG 0.003931283950805664
BWG 0.003300189971923828
"""
"""
0.003921985626220703
0.0049707889556884766
0.0052530765533447266
0.0060312747955322266
0.0036766529083251953
0.00421142578125
0.002129793167114258
0.0023038387298583984
0.0013151168823242188
0.0015285015106201172
0.0008392333984375
0.0008127689361572266
0.0002486705780029297
0.00030994415283203125
"""
STR1
=
"""
SUBM 0.0005137920379638672
F 0.0012662410736083984
F 0.0016875267028808594
REGU 0.0009055137634277344
M 0.0009114742279052734
SUBM 0.00037789344787597656
F 0.0020329952239990234
F 0.001947641372680664
REGU 0.0009374618530273438
M 0.00045609474182128906
SUBM 0.0009856224060058594
F 0.0009992122650146484
F 0.0010600090026855469
REGU 0.0006346702575683594
M 0.0004057884216308594
SUBM 0.0006394386291503906
F 0.0008478164672851562
F 0.0008838176727294922
REGU 0.0007183551788330078
M 0.00025177001953125
SUBM 0.0009539127349853516
F 0.0009481906890869141
F 0.0010502338409423828
REGU 0.0007147789001464844
M 0.000274658203125
SUBM 0.0007004737854003906
F 0.0009715557098388672
F 0.0012331008911132812
REGU 0.0008800029754638672
M 0.0002167224884033203
SUBM 0.00045108795166015625
F 0.0006735324859619141
F 0.0008375644683837891
"""
STR2
=
"""
F Turing_f16f16f16f16f16tnt_m32n64k32m32n32k16A0T1688_NS00_C3_01LLL_1 0.0007038116455078125
F Turing_f16f16f16f16f16tnt_m32n64k32m32n32k16A1T1688_NS00_C3_01LLL_1 0.0007627010345458984
F Turing_f16f16f16f16f16tnt_m64n128k32m32n64k32A1T1688_NS00_C3_01LLL_1 0.0007650852203369141
F Turing_f16f16f16f16f16tnt_m64n128k32m32n64k32A1T1688_NS00_C3_01LLL_1 0.0008864402770996094
F Turing_f16f16f16f16f16tnt_m64n128k32m32n64k32A1T1688_NS00_C3_01LLL_1 0.0004017353057861328
F Turing_f16f16f16f16f16tnt_m32n128k64m32n32k32A1T1688_NS00_C3_01LLL_1 0.0006165504455566406
F Turing_f16f16f16f16f16tnt_m64n64k32m32n32k32A1T1688_NS00_C3_01LLL_1 0.0005872249603271484
F Turing_f16f16f16f16f16tnt_m64n64k32m32n32k32A1T1688_NS00_C3_01LLL_1 0.0006289482116699219
F Turing_f16f16f16f16f16tnt_m32n64k32m32n32k16A1T1688_NS00_C3_01LLL_1 0.0002968311309814453
F Turing_f16f16f16f16f16tnt_m64n64k32m32n32k32A1T1688_NS00_C3_01LLL_1 0.0003299713134765625
F Turing_f16f16f16f16f16tnt_m64n128k64m32n64k32A1T1688_NS00_C3_01LLL_1 0.0002288818359375
F Turing_f16f16f16f16f16tnt_m32n64k32m32n32k16A1T1688_NS00_C3_01LLL_1 0.0002830028533935547
F Turing_f16f16f16f16f16tnt_m32n64k32m32n32k16A1T1688_NS00_C3_01LLL_1 0.0001780986785888672
F Turing_f16f16f16f16f16tnt_m32n64k32m32n32k16A1T1688_NS00_C3_01LLL_1 0.0003058910369873047
"""
def _handle_lines(s: str):
    """Parse one timing-log line into a ``(label, value)`` tuple.

    The line is split on single spaces. The first field is returned as the
    label and the last field is converted to ``float``; any fields in
    between are ignored. Raises ``ValueError`` if the last field is not a
    valid float.
    """
    fields = s.split(" ")
    label, value = fields[0], float(fields[-1])
    return (label, value)
from
cumm.gemm.codeops
import
group_by
def print_str(s: str):
    """Print the summed value per label for a multi-line timing log.

    ``s`` is stripped, split on newlines, and each line is parsed with
    ``_handle_lines`` into ``(label, value)``. The pairs are grouped by
    label and the values of each group are summed; the resulting dict is
    printed.
    """
    parsed = [_handle_lines(line) for line in s.strip().split("\n")]
    # presumably group_by returns a mapping label -> list of pairs; verify
    # against cumm.gemm.codeops.
    grouped = group_by(lambda x: x[0], parsed)
    totals = {}
    for label, entries in grouped.items():
        totals[label] = sum([entry[1] for entry in entries])
    print(totals)
print_str
(
STR1
)
print_str
(
STR2
)
\ No newline at end of file
test/benchmark.py
View file @
82fd7a8b
# Copyright 2021 Yan Yan
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
...
@@ -19,10 +19,12 @@ import numpy as np
import
torch
from
torch
import
nn
from
cumm
import
tensorview
as
tv
from
spconv.core
import
ConvAlgo
from
spconv.core
import
ConvAlgo
import
spconv.pytorch
as
spconv
from
spconv.utils
import
Point2VoxelCPU3d
def
waymo_data
(
batch_size
=
1
):
gen
=
Point2VoxelCPU3d
([
0.1
,
0.1
,
0.1
],
[
-
80
,
-
80
,
-
2
,
80
,
80
,
6
],
3
,
150000
,
1
)
...
...
@@ -42,7 +44,7 @@ def waymo_data(batch_size=1):
class
Net
(
nn
.
Module
):
def
__init__
(
self
,
shape
,
algo
):
super
().
__init__
()
pool_algo
=
algo
pool_algo
=
algo
# pool_algo = ConvAlgo.Native
self
.
net
=
spconv
.
SparseSequential
(
spconv
.
SubMConv3d
(
3
,
64
,
3
,
bias
=
False
,
indice_key
=
"c0"
,
...
...
@@ -68,7 +70,6 @@ class Net(nn.Module):
# nn.BatchNorm1d(32),
# nn.ReLU(),
# spconv.SparseConv3d(64, 64, 2, 2, bias=False, indice_key="m0"),
spconv
.
SparseMaxPool3d
(
2
,
2
,
algo
=
pool_algo
),
spconv
.
SubMConv3d
(
64
,
96
,
...
...
@@ -101,7 +102,6 @@ class Net(nn.Module):
# nn.BatchNorm1d(128),
# nn.ReLU(),
# spconv.SparseConv3d(128, 128, 2, 2, bias=False, indice_key="m2"),
spconv
.
SparseMaxPool3d
(
2
,
2
,
algo
=
pool_algo
),
spconv
.
SubMConv3d
(
128
,
160
,
...
...
@@ -118,7 +118,6 @@ class Net(nn.Module):
# nn.BatchNorm1d(128),
# nn.ReLU(),
# spconv.SparseConv3d(160, 160, 2, 2, bias=False, indice_key="m3"),
spconv
.
SparseMaxPool3d
(
2
,
2
,
algo
=
pool_algo
),
spconv
.
SubMConv3d
(
160
,
192
,
...
...
@@ -136,7 +135,6 @@ class Net(nn.Module):
# nn.ReLU(),
spconv
.
SparseMaxPool3d
(
2
,
2
,
indice_key
=
"m4"
,
algo
=
pool_algo
),
# spconv.SparseConv3d(192, 192, 2, 2, bias=False, indice_key="m4"),
spconv
.
SubMConv3d
(
192
,
224
,
3
,
...
...
@@ -174,7 +172,6 @@ class Net(nn.Module):
# # nn.ReLU(),
# spconv.SparseInverseConv3d(128, 64, 2, indice_key="m4", bias=False, algo=algo),
)
max_batch_size
=
1
# grid (dense map) is used for indice generation. Using a pre-allocated grid can run faster.
...
...
@@ -183,16 +180,25 @@ class Net(nn.Module):
# self.grid = None
self
.
shape
=
shape
def
forward
(
self
,
features
,
coors
,
batch_size
):
x
=
spconv
.
SparseConvTensor
(
features
,
coors
,
self
.
shape
,
batch_size
,
self
.
grid
)
def
forward
(
self
,
features
,
coors
,
batch_size
,
enable_timer
:
bool
=
False
):
x
=
spconv
.
SparseConvTensor
(
features
,
coors
,
self
.
shape
,
batch_size
,
self
.
grid
,
enable_timer
=
enable_timer
)
return
self
.
net
(
x
)
class
Net2
(
nn
.
Module
):
def
__init__
(
self
,
shape
,
algo
):
super
().
__init__
()
self
.
net
=
spconv
.
SparseSequential
(
spconv
.
SubMConv3d
(
3
,
128
,
3
,
bias
=
False
,
indice_key
=
"c0"
,
spconv
.
SubMConv3d
(
3
,
128
,
3
,
bias
=
False
,
indice_key
=
"c0"
,
algo
=
algo
),
# spconv.SubMConv3d(32,
# 32,
...
...
@@ -240,20 +246,22 @@ class Net2(nn.Module):
self
.
grid
)
return
self
.
net
(
x
)
import
numpy
as
np
from
cumm
import
tensorview
as
tv
import
numpy
as
np
from
cumm
import
tensorview
as
tv
from
spconv.core_cc.csrc.sparse.all
import
SpconvOps
import
pickle
import
pickle
import
torch
from
spconv.pytorch.cppcore
import
torch_tensor_to_tv
from
spconv.pytorch.cppcore
import
torch_tensor_to_tv
def
sort_bench
():
with
open
(
"/home/yy/asd.pkl"
,
"rb"
)
as
f
:
a_th
=
pickle
.
load
(
f
)
mask_argsort
=
torch
.
empty
((
1
,
a_th
.
shape
[
1
]),
dtype
=
torch
.
int32
,
device
=
a_th
.
device
)
dtype
=
torch
.
int32
,
device
=
a_th
.
device
)
a
=
a_th
.
cpu
().
numpy
()[
0
]
a_tv
=
torch_tensor_to_tv
(
a_th
)
...
...
@@ -262,8 +270,9 @@ def sort_bench():
a_tv_1
=
a_tv
.
clone
()
SpconvOps
.
sort_1d_by_key
(
a_tv_1
[
0
],
mask_argsort_tv
[
0
])
def
main
():
import
pickle
import
pickle
np
.
random
.
seed
(
50051
)
torch
.
manual_seed
(
50051
)
# voxels, coors, spatial_shape = waymo_data()
...
...
@@ -280,24 +289,55 @@ def main():
voxels_th
=
torch
.
from_numpy
(
voxels
).
to
(
device
).
to
(
dtype
)
coors_th
=
torch
.
from_numpy
(
coors
).
to
(
device
).
int
()
voxels_th
.
requires_grad
=
True
algo
=
spconv
.
ConvAlgo
.
MaskImplicitGemm
algo
=
spconv
.
ConvAlgo
.
Native
# 3080 Laptop
# MaskImpGemm: 11.2ms
# MaskSplitImpGemm: 12.2ms
# Native: 13.7ms
# F32
# MaskSplitImpGemm: 22ms
# MaskImplicitGemm: 23.5ms
# Native: 21.7ms
# Pure Gemm
# Native: 6.6ms
# MaskImpGemm: 4.3ms
# MaskSplitImpGemm: 4.0ms
# F16 Bwd
# MaskSplitImpGemm: 12.2ms
# MaskImpGemm: 13.8ms
# Native: 25.2ms
# F32 Bwd
# Native: 41.9ms
# MaskImpGemm: 51.0ms
# MaskSplitImpGemm: 41.1ms
# algo = None
net
=
Net
(
spatial_shape
,
algo
).
to
(
device
).
eval
().
to
(
dtype
).
train
()
spconv
.
assign_name_for_sparse_modules
(
net
)
print
(
coors_th
.
shape
)
out
=
net
(
voxels_th
,
coors_th
,
1
)
print
(
out
.
spatial_shape
)
print
(
voxels
.
mean
(),
voxels
.
max
(),
voxels
.
min
())
dout
=
np
.
random
.
uniform
(
-
0.2
,
0.2
,
out
.
features
.
shape
).
astype
(
np
.
float32
)
print
(
voxels
.
mean
(),
voxels
.
max
(),
voxels
.
min
())
dout
=
np
.
random
.
uniform
(
-
0.2
,
0.2
,
out
.
features
.
shape
).
astype
(
np
.
float32
)
dout_t
=
torch
.
from_numpy
(
dout
).
to
(
device
).
to
(
dtype
)
print
(
out
.
spatial_shape
,
out
.
features
.
mean
(),
out
.
features
.
max
(),
out
.
features
.
min
())
print
(
out
.
spatial_shape
,
out
.
features
.
mean
(),
out
.
features
.
max
(),
out
.
features
.
min
())
times
=
[]
with
torch
.
no_grad
():
for
i
in
range
(
20
):
print
(
"------------"
)
torch
.
cuda
.
synchronize
()
t
=
time
.
time
()
out_nograd
=
net
(
voxels_th
,
coors_th
,
1
)
out_nograd
=
net
(
voxels_th
,
coors_th
,
1
,
True
)
timer
=
out_nograd
.
_timer
res
=
timer
.
collect_by_name
(
"forward"
,
timer
.
get_all_pair_time
())
res2
=
timer
.
collect_by_name
(
"forward0"
,
timer
.
get_all_pair_time
())
print
(
sum
(
res
.
values
())
+
sum
(
res2
.
values
()))
# print(timer.get_all_pair_time())
# print(sum(timer.get_all_pair_time().values()))
torch
.
cuda
.
synchronize
()
# sort_bench()
times
.
append
(
time
.
time
()
-
t
)
...
...
@@ -313,8 +353,8 @@ def main():
# torch.cuda.synchronize()
# times.append(time.time() - t)
# print((net.grid == -1).float().sum(), net.grid.numel())
# print("spconv time", time.time() - t)
#
# #
print((net.grid == -1).float().sum(), net.grid.numel())
#
# #
print("spconv time", time.time() - t)
# print("spconv bw time", np.mean(times[5:]))
...
...
test/test_conv.py
View file @
82fd7a8b
# Copyright 2021 Yan Yan
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
...
@@ -30,6 +30,7 @@ from spconv.constants import FILTER_HWIO
torch
.
backends
.
cuda
.
matmul
.
allow_tf32
=
False
torch
.
backends
.
cudnn
.
allow_tf32
=
False
class
SparseConv3dTestTorch
(
nn
.
Module
):
def
__init__
(
self
,
num_layers
,
...
...
@@ -363,7 +364,10 @@ class TestSpConv(TestCase):
strides
=
[
1
,
2
,
3
]
paddings
=
[
0
,
1
,
2
]
dilations
=
[
1
,
2
,
3
]
algos
=
[
ConvAlgo
.
Native
,
ConvAlgo
.
MaskImplicitGemm
,
ConvAlgo
.
MaskSplitImplicitGemm
]
algos
=
[
ConvAlgo
.
Native
,
ConvAlgo
.
MaskImplicitGemm
,
ConvAlgo
.
MaskSplitImplicitGemm
]
algos
=
[
ConvAlgo
.
MaskSplitImplicitGemm
]
for
dev
,
shape
,
bs
,
IC
,
OC
,
k
,
s
,
p
,
d
,
al
in
params_grid
(
...
...
@@ -375,8 +379,16 @@ class TestSpConv(TestCase):
device
=
torch
.
device
(
dev
)
num_points
=
[
1000
]
*
bs
dtype
=
torch
.
float32
net
=
SparseConv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
d
,
algo
=
al
).
to
(
device
).
to
(
dtype
)
net
=
SparseConv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
d
,
algo
=
al
).
to
(
device
).
to
(
dtype
)
net_ref
=
Conv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
d
).
to
(
device
).
to
(
dtype
)
...
...
@@ -390,27 +402,32 @@ class TestSpConv(TestCase):
indices_t
=
torch
.
from_numpy
(
indices
).
int
().
to
(
device
)
features_t
=
torch
.
from_numpy
(
features
).
to
(
device
).
to
(
dtype
)
features_t
.
requires_grad
=
True
features_dense_t
=
torch
.
from_numpy
(
features_dense
).
to
(
device
).
to
(
dtype
)
features_dense_t
=
torch
.
from_numpy
(
features_dense
).
to
(
device
).
to
(
dtype
)
features_dense_t
.
requires_grad
=
True
if
net
.
algo
==
ConvAlgo
.
Native
:
if
FILTER_HWIO
:
filters
=
np
.
random
.
uniform
(
-
1
,
1
,
size
=
[
k
,
k
,
k
,
IC
,
OC
]).
astype
(
np
.
float32
)
filters
=
np
.
random
.
uniform
(
-
1
,
1
,
size
=
[
k
,
k
,
k
,
IC
,
OC
]).
astype
(
np
.
float32
)
else
:
filters
=
np
.
random
.
uniform
(
-
1
,
1
,
size
=
[
k
,
k
,
k
,
OC
,
IC
]).
astype
(
np
.
float32
)
filters
=
np
.
random
.
uniform
(
-
1
,
1
,
size
=
[
k
,
k
,
k
,
OC
,
IC
]).
astype
(
np
.
float32
)
filters_t
=
torch
.
from_numpy
(
filters
).
to
(
device
).
to
(
dtype
)
if
FILTER_HWIO
:
net_ref
.
net
[
0
].
weight
.
data
[:]
=
filters_t
.
permute
(
4
,
3
,
0
,
1
,
2
).
contiguous
()
net_ref
.
net
[
0
].
weight
.
data
[:]
=
filters_t
.
permute
(
4
,
3
,
0
,
1
,
2
).
contiguous
()
else
:
net_ref
.
net
[
0
].
weight
.
data
[:]
=
filters_t
.
permute
(
3
,
4
,
0
,
1
,
2
).
contiguous
()
net_ref
.
net
[
0
].
weight
.
data
[:]
=
filters_t
.
permute
(
3
,
4
,
0
,
1
,
2
).
contiguous
()
else
:
filters
=
np
.
random
.
uniform
(
-
1
,
1
,
size
=
[
OC
,
k
,
k
,
k
,
IC
]).
astype
(
np
.
float32
)
filters
=
np
.
random
.
uniform
(
-
1
,
1
,
size
=
[
OC
,
k
,
k
,
k
,
IC
]).
astype
(
np
.
float32
)
filters_t
=
torch
.
from_numpy
(
filters
).
to
(
device
).
to
(
dtype
)
net_ref
.
net
[
0
].
weight
.
data
[:]
=
filters_t
.
permute
(
0
,
4
,
1
,
2
,
3
).
contiguous
()
net_ref
.
net
[
0
].
weight
.
data
[:]
=
filters_t
.
permute
(
0
,
4
,
1
,
2
,
3
).
contiguous
()
net
.
net
[
0
].
weight
.
data
[:]
=
filters_t
out_ref
=
net_ref
(
features_dense_t
)
...
...
@@ -446,7 +463,6 @@ class TestSpConv(TestCase):
self
.
assertAllClose
(
dw
,
dw_ref
,
atol
=
1e-4
)
self
.
assertAllClose
(
din_np
,
din_sparse_np
,
atol
=
1e-4
)
def
testSpDeConv3d
(
self
):
np
.
random
.
seed
(
484
)
devices
=
[
"cuda:0"
]
...
...
@@ -499,11 +515,11 @@ class TestSpConv(TestCase):
filters_t
=
torch
.
from_numpy
(
filters
).
to
(
device
)
print
(
net_ref
.
net
[
0
].
weight
.
shape
)
if
FILTER_HWIO
:
net_ref
.
net
[
0
].
weight
.
data
[:]
=
filters_t
.
permute
(
3
,
4
,
0
,
1
,
2
).
contiguous
()
net_ref
.
net
[
0
].
weight
.
data
[:]
=
filters_t
.
permute
(
3
,
4
,
0
,
1
,
2
).
contiguous
()
else
:
net_ref
.
net
[
0
].
weight
.
data
[:]
=
filters_t
.
permute
(
4
,
3
,
0
,
1
,
2
).
contiguous
()
net_ref
.
net
[
0
].
weight
.
data
[:]
=
filters_t
.
permute
(
4
,
3
,
0
,
1
,
2
).
contiguous
()
net
.
net
[
0
].
weight
.
data
[:]
=
filters_t
out_ref
=
net_ref
(
features_dense_t
)
out
=
net
(
features_t
,
indices_t
,
bs
).
dense
()
...
...
@@ -532,7 +548,6 @@ class TestSpConv(TestCase):
dw
=
dw
.
transpose
(
4
,
3
,
0
,
1
,
2
)
self
.
assertAllClose
(
dw
,
dw_ref
,
atol
=
1e-4
)
def
testSpCpConv3d
(
self
):
np
.
random
.
seed
(
484
)
devices
=
[
"cuda:0"
,
"cpu:0"
]
...
...
version.txt
View file @
82fd7a8b
2.1.3
\ No newline at end of file
2.1.5
\ No newline at end of file
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment