Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
one
spconv
Commits
52594038
Commit
52594038
authored
Dec 09, 2021
by
yan.yan
Browse files
v2.1.21: add sm37, avoid fp16 nan
parent
b0f52b8a
Changes
13
Show whitespace changes
Inline
Side-by-side
Showing
13 changed files
with
176 additions
and
70 deletions
+176
-70
CHANGELOG.md
CHANGELOG.md
+5
-0
setup.py
setup.py
+5
-4
spconv/algo.py
spconv/algo.py
+38
-15
spconv/build.py
spconv/build.py
+2
-1
spconv/core.py
spconv/core.py
+15
-15
spconv/core_cc/cumm/common.pyi
spconv/core_cc/cumm/common.pyi
+5
-0
spconv/cppconstants.py
spconv/cppconstants.py
+3
-1
spconv/pytorch/conv.py
spconv/pytorch/conv.py
+68
-18
spconv/pytorch/cppcore.py
spconv/pytorch/cppcore.py
+10
-1
spconv/pytorch/functional.py
spconv/pytorch/functional.py
+10
-4
spconv/pytorch/ops.py
spconv/pytorch/ops.py
+13
-9
test/benchmark.py
test/benchmark.py
+1
-1
version.txt
version.txt
+1
-1
No files found.
CHANGELOG.md
View file @
52594038
# Changelog
## [2.1.21] - 2021-12-9
### Added
-
add sm_37
-
add fp16 kernels with fp32 accumulator (runs slower, but can avoid NaN if channel size is too large)
## [2.1.20] - 2021-12-6
### Added
-
Add fp16 conv simt kernels for mixed-precision training on Pascal or older GPUs. WARNING: not optimized for Tesla P100, which has 2x throughput in half precision.
...
...
setup.py
View file @
52594038
...
...
@@ -38,9 +38,9 @@ if cuda_ver:
cuda_ver
=
cuda_ver
.
replace
(
"."
,
""
)
# 10.2 to 102
RELEASE_NAME
+=
"-cu{}"
.
format
(
cuda_ver
)
deps
=
[
"cumm-cu{}>=0.2.
6
"
.
format
(
cuda_ver
)]
deps
=
[
"cumm-cu{}>=0.2.
8
"
.
format
(
cuda_ver
)]
else
:
deps
=
[
"cumm>=0.2.
6
"
]
deps
=
[
"cumm>=0.2.
8
"
]
...
...
@@ -158,6 +158,7 @@ if disable_jit is not None and disable_jit == "1":
from
spconv.csrc.sparse.all
import
SpconvOps
from
spconv.csrc.utils
import
BoxOps
from
spconv.csrc.hash.core
import
HashTable
from
cumm.common
import
CompileInfo
cu
=
GemmMainUnitTest
(
SHUFFLE_SIMT_PARAMS
+
SHUFFLE_VOLTA_PARAMS
+
SHUFFLE_TURING_PARAMS
)
convcu
=
ConvMainUnitTest
(
IMPLGEMM_SIMT_PARAMS
+
IMPLGEMM_VOLTA_PARAMS
+
IMPLGEMM_TURING_PARAMS
)
...
...
@@ -171,9 +172,9 @@ if disable_jit is not None and disable_jit == "1":
std
=
"c++14"
else
:
std
=
"c++17"
cus
=
[
cu
,
convcu
,
SpconvOps
(),
BoxOps
(),
HashTable
()]
cus
=
[
cu
,
convcu
,
SpconvOps
(),
BoxOps
(),
HashTable
()
,
CompileInfo
()
]
if
CUMM_CPU_ONLY_BUILD
:
cus
=
[
SpconvOps
(),
BoxOps
(),
HashTable
()]
cus
=
[
SpconvOps
(),
BoxOps
(),
HashTable
()
,
CompileInfo
()
]
ext_modules
:
List
[
Extension
]
=
[
PCCMExtension
(
cus
,
"spconv/core_cc"
,
...
...
spconv/algo.py
View file @
52594038
...
...
@@ -514,10 +514,23 @@ class SimpleConv:
out
:
tv
.
Tensor
,
layout_i
:
ConvLayout
,
layout_w
:
ConvLayout
,
layout_o
:
ConvLayout
,
arch
:
Tuple
[
int
,
int
],
op_type
:
ConvOpType
,
mask_width
:
int
):
mask_width
:
int
,
fp32_accum
:
Optional
[
bool
]
=
None
):
avail_algos
=
get_available_algo_str_from_arch
(
arch
)
finally_algos
:
List
[
ConvAlgoDesp
]
=
[]
is_fp16
=
inp
.
dtype
==
tv
.
float16
and
weight
.
dtype
==
tv
.
float16
and
out
.
dtype
==
tv
.
float16
use_f32_as_accum
=
False
kv
=
int
(
np
.
prod
(
weight
.
shape
[
1
:
-
1
]))
# for 3d conv, if reduce axis is too large, may cause nan during
# forward.
if
is_fp16
:
if
fp32_accum
is
None
:
if
op_type
==
ConvOpType
.
kForward
:
use_f32_as_accum
=
weight
.
dim
(
-
1
)
*
kv
>
128
*
27
elif
op_type
==
ConvOpType
.
kBackwardInput
:
use_f32_as_accum
=
weight
.
dim
(
0
)
*
kv
>
128
*
27
else
:
use_f32_as_accum
=
fp32_accum
for
algo
in
avail_algos
:
static_key
=
(
layout_i
.
layout_type
.
value
,
layout_w
.
layout_type
.
value
,
...
...
@@ -531,6 +544,14 @@ class SimpleConv:
# skip volta tensor op since it is very slow in architectures except volta.
if
arch
>=
(
7
,
5
)
and
desp
.
algo
==
GemmAlgo
.
Volta
.
value
:
continue
if
arch
>=
(
7
,
0
)
and
is_fp16
:
# skip simt fp16 kernels if we have tensor core
if
desp
.
algo
==
GemmAlgo
.
Simt
:
continue
if
use_f32_as_accum
:
if
desp
.
dacc
==
tv
.
float16
:
continue
ldi
=
inp
.
dim
(
-
1
)
ldw
=
weight
.
dim
(
-
1
)
ldo
=
out
.
dim
(
-
1
)
...
...
@@ -589,9 +610,11 @@ class SimpleConv:
mask_output
:
tv
.
Tensor
=
tv
.
Tensor
(),
alpha
:
float
=
1.0
,
beta
:
float
=
0.0
,
stream
:
int
=
0
):
stream
:
int
=
0
,
fp32_accum
:
Optional
[
bool
]
=
None
):
avail
=
self
.
get_all_available
(
inp
,
weight
,
output
,
layout_i
,
layout_w
,
layout_o
,
arch
,
op_type
,
mask_width
)
layout_o
,
arch
,
op_type
,
mask_width
,
fp32_accum
)
inp
=
inp
.
clone
()
weight
=
weight
.
clone
()
output
=
output
.
clone
()
...
...
spconv/build.py
View file @
52594038
...
...
@@ -26,6 +26,7 @@ if project_is_installed(PACKAGE_NAME) and project_is_editable(
from
cumm.gemm.main
import
GemmMainUnitTest
from
cumm.conv.main
import
ConvMainUnitTest
from
cumm.common
import
CompileInfo
from
spconv.csrc.sparse.all
import
SpconvOps
from
spconv.csrc.utils
import
BoxOps
...
...
@@ -41,7 +42,7 @@ if project_is_installed(PACKAGE_NAME) and project_is_editable(
if
InWindows
:
# windows have command line limit, so we use objects_folder to reduce command size.
objects_folder
=
"objects"
pccm
.
builder
.
build_pybind
([
cu
,
convcu
,
SpconvOps
(),
BoxOps
(),
HashTable
()],
pccm
.
builder
.
build_pybind
([
cu
,
convcu
,
SpconvOps
(),
BoxOps
(),
HashTable
()
,
CompileInfo
()
],
PACKAGE_ROOT
/
"core_cc"
,
namespace_root
=
PACKAGE_ROOT
,
objects_folder
=
objects_folder
,
...
...
spconv/core.py
View file @
52594038
...
...
@@ -403,7 +403,7 @@ IMPLGEMM_VOLTA_PARAMS = [
*
gen_conv_params
(
ConvFwdAndBwdInput
,
(
64
,
64
,
32
),
(
32
,
32
,
32
),
NDIM_DONT_CARE
,
ConvIterAlgo
.
Optimized
,
2
,
[
"f16,f16,f16,f16,f16"
],
2
,
[
"f16,f16,f16,f16,f16"
,
"f16,f16,f16,f32,f32"
],
NHWC
,
NHWC
,
NHWC
,
...
...
@@ -415,7 +415,7 @@ IMPLGEMM_VOLTA_PARAMS = [
*
gen_conv_params
(
ConvFwdAndBwdInput
,
(
64
,
64
,
32
),
(
32
,
32
,
32
),
NDIM_DONT_CARE
,
ConvIterAlgo
.
Optimized
,
2
,
[
"f16,f16,f16,f16,f16"
],
2
,
[
"f16,f16,f16,f16,f16"
,
"f16,f16,f16,f32,f32"
],
NHWC
,
NHWC
,
NHWC
,
...
...
@@ -427,7 +427,7 @@ IMPLGEMM_VOLTA_PARAMS = [
*
gen_conv_params
(
ConvFwdAndBwdInput
,
(
64
,
128
,
32
),
(
32
,
64
,
32
),
NDIM_DONT_CARE
,
ConvIterAlgo
.
Optimized
,
2
,
[
"f16,f16,f16,f16,f16"
],
2
,
[
"f16,f16,f16,f16,f16"
,
"f16,f16,f16,f32,f32"
],
NHWC
,
NHWC
,
NHWC
,
...
...
@@ -439,7 +439,7 @@ IMPLGEMM_VOLTA_PARAMS = [
*
gen_conv_params
(
ConvFwdAndBwdInput
,
(
32
,
256
,
32
),
(
32
,
64
,
32
),
NDIM_DONT_CARE
,
ConvIterAlgo
.
Optimized
,
2
,
[
"f16,f16,f16,f16,f16"
],
2
,
[
"f16,f16,f16,f16,f16"
,
"f16,f16,f16,f32,f32"
],
NHWC
,
NHWC
,
NHWC
,
...
...
@@ -490,7 +490,7 @@ IMPLGEMM_TURING_PARAMS = [
*
gen_conv_params
(
ConvFwdAndBwdInput
,
(
32
,
64
,
32
),
(
32
,
32
,
16
),
NDIM_DONT_CARE
,
ConvIterAlgo
.
Optimized
,
2
,
[
"f16,f16,f16,f16,f16"
],
2
,
[
"f16,f16,f16,f16,f16"
,
"f16,f16,f16,f32,f32"
],
NHWC
,
NHWC
,
NHWC
,
...
...
@@ -502,7 +502,7 @@ IMPLGEMM_TURING_PARAMS = [
*
gen_conv_params
(
ConvFwdAndBwdInput
,
(
32
,
64
,
32
),
(
32
,
32
,
16
),
NDIM_DONT_CARE
,
ConvIterAlgo
.
Optimized
,
2
,
[
"f16,f16,f16,f16,f16"
],
2
,
[
"f16,f16,f16,f16,f16"
,
"f16,f16,f16,f32,f32"
],
NHWC
,
NHWC
,
NHWC
,
...
...
@@ -514,7 +514,7 @@ IMPLGEMM_TURING_PARAMS = [
*
gen_conv_params
(
ConvFwdAndBwdInput
,
(
32
,
256
,
32
),
(
32
,
64
,
32
),
NDIM_DONT_CARE
,
ConvIterAlgo
.
Optimized
,
2
,
[
"f16,f16,f16,f16,f16"
],
2
,
[
"f16,f16,f16,f16,f16"
,
"f16,f16,f16,f32,f32"
],
NHWC
,
NHWC
,
NHWC
,
...
...
@@ -526,7 +526,7 @@ IMPLGEMM_TURING_PARAMS = [
*
gen_conv_params
(
ConvFwdAndBwdInput
,
(
32
,
128
,
32
),
(
32
,
32
,
32
),
NDIM_DONT_CARE
,
ConvIterAlgo
.
Optimized
,
2
,
[
"f16,f16,f16,f16,f16"
],
2
,
[
"f16,f16,f16,f16,f16"
,
"f16,f16,f16,f32,f32"
],
NHWC
,
NHWC
,
NHWC
,
...
...
@@ -538,7 +538,7 @@ IMPLGEMM_TURING_PARAMS = [
*
gen_conv_params
(
ConvFwdAndBwdInput
,
(
32
,
128
,
64
),
(
32
,
32
,
32
),
NDIM_DONT_CARE
,
ConvIterAlgo
.
Optimized
,
2
,
[
"f16,f16,f16,f16,f16"
],
2
,
[
"f16,f16,f16,f16,f16"
,
"f16,f16,f16,f32,f32"
],
NHWC
,
NHWC
,
NHWC
,
...
...
@@ -550,7 +550,7 @@ IMPLGEMM_TURING_PARAMS = [
*
gen_conv_params
(
ConvFwdAndBwdInput
,
(
32
,
128
,
64
),
(
32
,
64
,
32
),
NDIM_DONT_CARE
,
ConvIterAlgo
.
Optimized
,
2
,
[
"f16,f16,f16,f16,f16"
],
2
,
[
"f16,f16,f16,f16,f16"
,
"f16,f16,f16,f32,f32"
],
NHWC
,
NHWC
,
NHWC
,
...
...
@@ -562,7 +562,7 @@ IMPLGEMM_TURING_PARAMS = [
*
gen_conv_params
(
ConvFwdAndBwdInput
,
(
32
,
128
,
64
),
(
32
,
32
,
64
),
NDIM_DONT_CARE
,
ConvIterAlgo
.
Optimized
,
2
,
[
"f16,f16,f16,f16,f16"
],
2
,
[
"f16,f16,f16,f16,f16"
,
"f16,f16,f16,f32,f32"
],
NHWC
,
NHWC
,
NHWC
,
...
...
@@ -574,7 +574,7 @@ IMPLGEMM_TURING_PARAMS = [
*
gen_conv_params
(
ConvFwdAndBwdInput
,
(
32
,
128
,
64
),
(
32
,
64
,
64
),
NDIM_DONT_CARE
,
ConvIterAlgo
.
Optimized
,
2
,
[
"f16,f16,f16,f16,f16"
],
2
,
[
"f16,f16,f16,f16,f16"
,
"f16,f16,f16,f32,f32"
],
NHWC
,
NHWC
,
NHWC
,
...
...
@@ -586,7 +586,7 @@ IMPLGEMM_TURING_PARAMS = [
*
gen_conv_params
(
ConvFwdAndBwdInput
,
(
64
,
128
,
32
),
(
32
,
64
,
32
),
NDIM_DONT_CARE
,
ConvIterAlgo
.
Optimized
,
2
,
[
"f16,f16,f16,f16,f16"
],
2
,
[
"f16,f16,f16,f16,f16"
,
"f16,f16,f16,f32,f32"
],
NHWC
,
NHWC
,
NHWC
,
...
...
@@ -598,7 +598,7 @@ IMPLGEMM_TURING_PARAMS = [
*
gen_conv_params
(
ConvFwdAndBwdInput
,
(
64
,
128
,
64
),
(
32
,
64
,
32
),
NDIM_DONT_CARE
,
ConvIterAlgo
.
Optimized
,
2
,
[
"f16,f16,f16,f16,f16"
],
2
,
[
"f16,f16,f16,f16,f16"
,
"f16,f16,f16,f32,f32"
],
NHWC
,
NHWC
,
NHWC
,
...
...
@@ -610,7 +610,7 @@ IMPLGEMM_TURING_PARAMS = [
*
gen_conv_params
(
ConvFwdAndBwdInput
,
(
64
,
64
,
32
),
(
32
,
32
,
32
),
NDIM_DONT_CARE
,
ConvIterAlgo
.
Optimized
,
2
,
[
"f16,f16,f16,f16,f16"
],
2
,
[
"f16,f16,f16,f16,f16"
,
"f16,f16,f16,f32,f32"
],
NHWC
,
NHWC
,
NHWC
,
...
...
spconv/core_cc/cumm/common.pyi
0 → 100644
View file @
52594038
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
class CompileInfo:
@staticmethod
def get_compiled_cuda_arch() -> List[Tuple[int, int]]: ...
spconv/cppconstants.py
View file @
52594038
...
...
@@ -20,5 +20,7 @@ else:
CPU_ONLY_BUILD
=
True
from
spconv.core_cc.csrc.utils.boxops
import
BoxOps
from
spconv.core_cc.cumm.common
import
CompileInfo
HAS_BOOST
=
BoxOps
.
has_boost
()
COMPILED_CUDA_ARCHS
=
set
(
CompileInfo
.
get_compiled_cuda_arch
())
spconv/pytorch/conv.py
View file @
52594038
...
...
@@ -36,8 +36,6 @@ from spconv.utils import nullcontext
from
torch.nn.init
import
calculate_gain
class
SparseConvolution
(
SparseModule
):
__constants__
=
[
'stride'
,
'padding'
,
'dilation'
,
'groups'
,
'bias'
,
'subm'
,
'inverse'
,
...
...
@@ -60,6 +58,7 @@ class SparseConvolution(SparseModule):
inverse
:
bool
=
False
,
indice_key
:
Optional
[
str
]
=
None
,
algo
:
Optional
[
ConvAlgo
]
=
None
,
fp32_accum
:
Optional
[
bool
]
=
None
,
name
=
None
):
super
(
SparseConvolution
,
self
).
__init__
(
name
=
name
)
assert
groups
==
1
,
"don't support groups for now"
...
...
@@ -78,7 +77,9 @@ class SparseConvolution(SparseModule):
if
not
subm
:
self
.
conv1x1
&=
kv_stride
==
1
if
self
.
conv1x1
:
assert
self
.
padding
==
[
0
]
*
ndim
,
"padding must be zero for 1x1 conv (k=1,s=1)"
assert
self
.
padding
==
[
0
]
*
ndim
,
"padding must be zero for 1x1 conv (k=1,s=1)"
self
.
transposed
=
transposed
self
.
inverse
=
inverse
self
.
output_padding
=
expand_nd
(
ndim
,
output_padding
)
...
...
@@ -98,6 +99,7 @@ class SparseConvolution(SparseModule):
if
CPU_ONLY_BUILD
:
assert
algo
==
ConvAlgo
.
Native
,
"cpu only build only support native algorithm"
self
.
algo
=
algo
self
.
fp32_accum
=
fp32_accum
# self.algo = ConvAlgo.Native
if
self
.
algo
==
ConvAlgo
.
Native
:
if
FILTER_HWIO
:
...
...
@@ -150,18 +152,25 @@ class SparseConvolution(SparseModule):
mode
=
mode
.
lower
()
valid_modes
=
[
'fan_in'
,
'fan_out'
]
if
mode
not
in
valid_modes
:
raise
ValueError
(
"Mode {} not supported, please use one of {}"
.
format
(
mode
,
valid_modes
))
raise
ValueError
(
"Mode {} not supported, please use one of {}"
.
format
(
mode
,
valid_modes
))
fan_in
,
fan_out
=
self
.
_calculate_fan_in_and_fan_out
()
return
fan_in
if
mode
==
'fan_in'
else
fan_out
def
_custom_kaiming_uniform_
(
self
,
tensor
,
a
=
0
,
mode
=
'fan_in'
,
nonlinearity
=
'leaky_relu'
):
def
_custom_kaiming_uniform_
(
self
,
tensor
,
a
=
0
,
mode
=
'fan_in'
,
nonlinearity
=
'leaky_relu'
):
r
"""same as torch.init.kaiming_uniform_, with KRSC layout support
"""
fan
=
self
.
_calculate_correct_fan
(
mode
)
gain
=
calculate_gain
(
nonlinearity
,
a
)
std
=
gain
/
math
.
sqrt
(
fan
)
bound
=
math
.
sqrt
(
3.0
)
*
std
# Calculate uniform bounds from standard deviation
bound
=
math
.
sqrt
(
3.0
)
*
std
# Calculate uniform bounds from standard deviation
with
torch
.
no_grad
():
return
tensor
.
uniform_
(
-
bound
,
bound
)
...
...
@@ -268,7 +277,8 @@ class SparseConvolution(SparseModule):
indice_pairs
=
datas
.
indice_pairs
indice_pair_num
=
datas
.
indice_pair_num
assert
self
.
subm
,
"only support reuse subm indices"
self
.
_check_subm_reuse_valid
(
input
,
spatial_shape
,
datas
)
self
.
_check_subm_reuse_valid
(
input
,
spatial_shape
,
datas
)
else
:
if
input
.
benchmark
:
torch
.
cuda
.
synchronize
()
...
...
@@ -360,7 +370,8 @@ class SparseConvolution(SparseModule):
mask_argsort_bwd_splits
=
datas
.
mask_argsort_bwd_splits
masks
=
datas
.
masks
assert
self
.
subm
,
"only support reuse subm indices"
self
.
_check_subm_reuse_valid
(
input
,
spatial_shape
,
datas
)
self
.
_check_subm_reuse_valid
(
input
,
spatial_shape
,
datas
)
else
:
with
input
.
_timer
.
namespace
(
"gen_pairs"
):
...
...
@@ -432,7 +443,7 @@ class SparseConvolution(SparseModule):
pair_mask_fwd_splits
,
pair_mask_bwd_splits
,
mask_argsort_fwd_splits
,
mask_argsort_bwd_splits
,
num_activate_out
,
masks
,
self
.
training
,
self
.
subm
,
input
.
_timer
)
input
.
_timer
,
self
.
fp32_accum
)
if
self
.
bias
is
not
None
:
out_features
+=
self
.
bias
if
input
.
benchmark
:
...
...
@@ -449,21 +460,28 @@ class SparseConvolution(SparseModule):
out_tensor
.
spatial_shape
=
out_spatial_shape
return
out_tensor
def
_check_subm_reuse_valid
(
self
,
inp
:
SparseConvTensor
,
spatial_shape
:
List
[
int
],
datas
:
Union
[
ImplicitGemmIndiceData
,
IndiceData
]):
def
_check_subm_reuse_valid
(
self
,
inp
:
SparseConvTensor
,
spatial_shape
:
List
[
int
],
datas
:
Union
[
ImplicitGemmIndiceData
,
IndiceData
]):
assert
datas
.
is_subm
,
"only support reuse subm indices"
if
self
.
kernel_size
!=
datas
.
ksize
:
raise
ValueError
(
f
"subm with same indice_key must have same kernel"
raise
ValueError
(
f
"subm with same indice_key must have same kernel"
f
" size, expect
{
datas
.
ksize
}
, this layer
{
self
.
kernel_size
}
"
)
if
self
.
dilation
!=
datas
.
dilation
:
raise
ValueError
(
f
"subm with same indice_key must have same dilation"
raise
ValueError
(
f
"subm with same indice_key must have same dilation"
f
", expect
{
datas
.
dilation
}
, this layer
{
self
.
dilation
}
"
)
if
inp
.
spatial_shape
!=
datas
.
spatial_shape
:
raise
ValueError
(
f
"subm with same indice_key must have same spatial structure"
raise
ValueError
(
f
"subm with same indice_key must have same spatial structure"
f
", expect
{
datas
.
spatial_shape
}
, input
{
spatial_shape
}
"
)
if
inp
.
indices
.
shape
[
0
]
!=
datas
.
indices
.
shape
[
0
]:
raise
ValueError
(
f
"subm with same indice_key must have same num of indices"
f
", expect
{
datas
.
indices
.
shape
[
0
]
}
, input
{
inp
.
indices
.
shape
[
0
]
}
"
)
raise
ValueError
(
f
"subm with same indice_key must have same num of indices"
f
", expect
{
datas
.
indices
.
shape
[
0
]
}
, input
{
inp
.
indices
.
shape
[
0
]
}
"
)
class
SparseConv1d
(
SparseConvolution
):
...
...
@@ -478,6 +496,7 @@ class SparseConv1d(SparseConvolution):
bias
=
True
,
indice_key
=
None
,
algo
:
Optional
[
ConvAlgo
]
=
None
,
fp32_accum
:
Optional
[
bool
]
=
None
,
name
=
None
):
super
(
SparseConv1d
,
self
).
__init__
(
1
,
in_channels
,
...
...
@@ -490,6 +509,7 @@ class SparseConv1d(SparseConvolution):
bias
,
indice_key
=
indice_key
,
algo
=
algo
,
fp32_accum
=
fp32_accum
,
name
=
name
)
...
...
@@ -505,6 +525,7 @@ class SparseConv2d(SparseConvolution):
bias
=
True
,
indice_key
=
None
,
algo
:
Optional
[
ConvAlgo
]
=
None
,
fp32_accum
:
Optional
[
bool
]
=
None
,
name
=
None
):
super
(
SparseConv2d
,
self
).
__init__
(
2
,
in_channels
,
...
...
@@ -517,6 +538,7 @@ class SparseConv2d(SparseConvolution):
bias
,
indice_key
=
indice_key
,
algo
=
algo
,
fp32_accum
=
fp32_accum
,
name
=
name
)
...
...
@@ -532,6 +554,7 @@ class SparseConv3d(SparseConvolution):
bias
=
True
,
indice_key
=
None
,
algo
:
Optional
[
ConvAlgo
]
=
None
,
fp32_accum
:
Optional
[
bool
]
=
None
,
name
=
None
):
super
(
SparseConv3d
,
self
).
__init__
(
3
,
in_channels
,
...
...
@@ -544,6 +567,7 @@ class SparseConv3d(SparseConvolution):
bias
,
indice_key
=
indice_key
,
algo
=
algo
,
fp32_accum
=
fp32_accum
,
name
=
name
)
...
...
@@ -559,6 +583,7 @@ class SparseConv4d(SparseConvolution):
bias
=
True
,
indice_key
=
None
,
algo
:
Optional
[
ConvAlgo
]
=
None
,
fp32_accum
:
Optional
[
bool
]
=
None
,
name
=
None
):
super
(
SparseConv4d
,
self
).
__init__
(
4
,
in_channels
,
...
...
@@ -571,6 +596,7 @@ class SparseConv4d(SparseConvolution):
bias
,
indice_key
=
indice_key
,
algo
=
algo
,
fp32_accum
=
fp32_accum
,
name
=
name
)
...
...
@@ -586,6 +612,7 @@ class SparseConvTranspose1d(SparseConvolution):
bias
=
True
,
indice_key
=
None
,
algo
:
Optional
[
ConvAlgo
]
=
None
,
fp32_accum
:
Optional
[
bool
]
=
None
,
name
=
None
):
super
(
SparseConvTranspose1d
,
self
).
__init__
(
1
,
in_channels
,
...
...
@@ -599,6 +626,7 @@ class SparseConvTranspose1d(SparseConvolution):
transposed
=
True
,
indice_key
=
indice_key
,
algo
=
algo
,
fp32_accum
=
fp32_accum
,
name
=
name
)
...
...
@@ -614,6 +642,7 @@ class SparseConvTranspose2d(SparseConvolution):
bias
=
True
,
indice_key
=
None
,
algo
:
Optional
[
ConvAlgo
]
=
None
,
fp32_accum
:
Optional
[
bool
]
=
None
,
name
=
None
):
super
(
SparseConvTranspose2d
,
self
).
__init__
(
2
,
in_channels
,
...
...
@@ -627,6 +656,7 @@ class SparseConvTranspose2d(SparseConvolution):
transposed
=
True
,
indice_key
=
indice_key
,
algo
=
algo
,
fp32_accum
=
fp32_accum
,
name
=
name
)
...
...
@@ -642,6 +672,7 @@ class SparseConvTranspose3d(SparseConvolution):
bias
=
True
,
indice_key
=
None
,
algo
:
Optional
[
ConvAlgo
]
=
None
,
fp32_accum
:
Optional
[
bool
]
=
None
,
name
=
None
):
super
(
SparseConvTranspose3d
,
self
).
__init__
(
3
,
in_channels
,
...
...
@@ -655,6 +686,7 @@ class SparseConvTranspose3d(SparseConvolution):
transposed
=
True
,
indice_key
=
indice_key
,
algo
=
algo
,
fp32_accum
=
fp32_accum
,
name
=
name
)
...
...
@@ -670,6 +702,7 @@ class SparseConvTranspose4d(SparseConvolution):
bias
=
True
,
indice_key
=
None
,
algo
:
Optional
[
ConvAlgo
]
=
None
,
fp32_accum
:
Optional
[
bool
]
=
None
,
name
=
None
):
super
(
SparseConvTranspose4d
,
self
).
__init__
(
4
,
in_channels
,
...
...
@@ -683,6 +716,7 @@ class SparseConvTranspose4d(SparseConvolution):
transposed
=
True
,
indice_key
=
indice_key
,
algo
=
algo
,
fp32_accum
=
fp32_accum
,
name
=
name
)
...
...
@@ -694,6 +728,7 @@ class SparseInverseConv1d(SparseConvolution):
indice_key
,
bias
=
True
,
algo
:
Optional
[
ConvAlgo
]
=
None
,
fp32_accum
:
Optional
[
bool
]
=
None
,
name
=
None
):
super
(
SparseInverseConv1d
,
self
).
__init__
(
1
,
in_channels
,
...
...
@@ -703,6 +738,7 @@ class SparseInverseConv1d(SparseConvolution):
inverse
=
True
,
indice_key
=
indice_key
,
algo
=
algo
,
fp32_accum
=
fp32_accum
,
name
=
name
)
...
...
@@ -714,6 +750,7 @@ class SparseInverseConv2d(SparseConvolution):
indice_key
,
bias
=
True
,
algo
:
Optional
[
ConvAlgo
]
=
None
,
fp32_accum
:
Optional
[
bool
]
=
None
,
name
=
None
):
super
(
SparseInverseConv2d
,
self
).
__init__
(
2
,
in_channels
,
...
...
@@ -723,6 +760,7 @@ class SparseInverseConv2d(SparseConvolution):
inverse
=
True
,
indice_key
=
indice_key
,
algo
=
algo
,
fp32_accum
=
fp32_accum
,
name
=
name
)
...
...
@@ -734,6 +772,7 @@ class SparseInverseConv3d(SparseConvolution):
indice_key
,
bias
=
True
,
algo
:
Optional
[
ConvAlgo
]
=
None
,
fp32_accum
:
Optional
[
bool
]
=
None
,
name
=
None
):
super
(
SparseInverseConv3d
,
self
).
__init__
(
3
,
in_channels
,
...
...
@@ -743,6 +782,7 @@ class SparseInverseConv3d(SparseConvolution):
inverse
=
True
,
indice_key
=
indice_key
,
algo
=
algo
,
fp32_accum
=
fp32_accum
,
name
=
name
)
...
...
@@ -754,6 +794,7 @@ class SparseInverseConv4d(SparseConvolution):
indice_key
,
bias
=
True
,
algo
:
Optional
[
ConvAlgo
]
=
None
,
fp32_accum
:
Optional
[
bool
]
=
None
,
name
=
None
):
super
(
SparseInverseConv4d
,
self
).
__init__
(
4
,
in_channels
,
...
...
@@ -763,6 +804,7 @@ class SparseInverseConv4d(SparseConvolution):
inverse
=
True
,
indice_key
=
indice_key
,
algo
=
algo
,
fp32_accum
=
fp32_accum
,
name
=
name
)
...
...
@@ -778,6 +820,7 @@ class SubMConv1d(SparseConvolution):
bias
=
True
,
indice_key
=
None
,
algo
:
Optional
[
ConvAlgo
]
=
None
,
fp32_accum
:
Optional
[
bool
]
=
None
,
name
=
None
):
super
(
SubMConv1d
,
self
).
__init__
(
1
,
in_channels
,
...
...
@@ -791,6 +834,7 @@ class SubMConv1d(SparseConvolution):
True
,
indice_key
=
indice_key
,
algo
=
algo
,
fp32_accum
=
fp32_accum
,
name
=
name
)
...
...
@@ -806,6 +850,7 @@ class SubMConv2d(SparseConvolution):
bias
=
True
,
indice_key
=
None
,
algo
:
Optional
[
ConvAlgo
]
=
None
,
fp32_accum
:
Optional
[
bool
]
=
None
,
name
=
None
):
super
(
SubMConv2d
,
self
).
__init__
(
2
,
in_channels
,
...
...
@@ -819,6 +864,7 @@ class SubMConv2d(SparseConvolution):
True
,
indice_key
=
indice_key
,
algo
=
algo
,
fp32_accum
=
fp32_accum
,
name
=
name
)
...
...
@@ -834,6 +880,7 @@ class SubMConv3d(SparseConvolution):
bias
=
True
,
indice_key
=
None
,
algo
:
Optional
[
ConvAlgo
]
=
None
,
fp32_accum
:
Optional
[
bool
]
=
None
,
name
=
None
):
super
(
SubMConv3d
,
self
).
__init__
(
3
,
in_channels
,
...
...
@@ -847,6 +894,7 @@ class SubMConv3d(SparseConvolution):
True
,
indice_key
=
indice_key
,
algo
=
algo
,
fp32_accum
=
fp32_accum
,
name
=
name
)
...
...
@@ -862,6 +910,7 @@ class SubMConv4d(SparseConvolution):
bias
=
True
,
indice_key
=
None
,
algo
:
Optional
[
ConvAlgo
]
=
None
,
fp32_accum
:
Optional
[
bool
]
=
None
,
name
=
None
):
super
(
SubMConv4d
,
self
).
__init__
(
4
,
in_channels
,
...
...
@@ -875,4 +924,5 @@ class SubMConv4d(SparseConvolution):
True
,
indice_key
=
indice_key
,
algo
=
algo
,
fp32_accum
=
fp32_accum
,
name
=
name
)
spconv/pytorch/cppcore.py
View file @
52594038
...
...
@@ -15,6 +15,8 @@
from
cumm
import
tensorview
as
tv
import
torch
from
typing
import
Optional
,
List
from
spconv.cppconstants
import
COMPILED_CUDA_ARCHS
import
sys
_TORCH_DTYPE_TO_TV
=
{
torch
.
float32
:
tv
.
float32
,
...
...
@@ -53,6 +55,13 @@ def torch_tensors_to_tv(*tens: torch.Tensor):
def
get_current_stream
():
return
torch
.
cuda
.
current_stream
().
cuda_stream
def
get_arch
():
arch
=
torch
.
cuda
.
get_device_capability
()
if
arch
not
in
COMPILED_CUDA_ARCHS
:
print
(
f
"[WARNING]your gpu arch
{
arch
}
isn't compiled in prebuilt, "
f
"may cause invalid device function. "
f
"available:
{
COMPILED_CUDA_ARCHS
}
"
,
file
=
sys
.
stderr
)
return
arch
if
__name__
==
"__main__"
:
a
=
torch
.
rand
(
2
,
2
)
...
...
spconv/pytorch/functional.py
View file @
52594038
...
...
@@ -179,14 +179,16 @@ class SparseImplicitGemmFunction(Function):
masks
:
List
[
np
.
ndarray
],
is_train
:
bool
,
is_subm
:
bool
,
timer
:
CUDAKernelTimer
=
CUDAKernelTimer
(
False
)):
timer
:
CUDAKernelTimer
=
CUDAKernelTimer
(
False
),
fp32_accum
:
Optional
[
bool
]
=
None
):
try
:
out
,
mask_out
,
mask_width
=
ops
.
implicit_gemm
(
features
,
filters
,
pair_fwd
,
pair_mask_fwd_splits
,
mask_argsort_fwd_splits
,
num_activate_out
,
masks
,
is_train
,
is_subm
,
timer
)
is_train
,
is_subm
,
timer
,
fp32_accum
)
except
Exception
as
e
:
msg
=
"[Exception|implicit_gemm]"
msg
+=
f
"feat=
{
features
.
shape
}
,w=
{
filters
.
shape
}
,pair=
{
pair_fwd
.
shape
}
,"
...
...
@@ -208,6 +210,7 @@ class SparseImplicitGemmFunction(Function):
# ctx.num_activate_out = num_activate_out
ctx
.
masks
=
masks
ctx
.
is_subm
=
is_subm
ctx
.
fp32_accum
=
fp32_accum
return
out
@
staticmethod
...
...
@@ -225,6 +228,8 @@ class SparseImplicitGemmFunction(Function):
masks
=
ctx
.
masks
is_subm
=
ctx
.
is_subm
timer
=
ctx
.
timer
fp32_accum
=
ctx
.
fp32_accum
try
:
input_bp
,
filters_bp
=
ops
.
implicit_gemm_backward
(
features
,
...
...
@@ -240,7 +245,8 @@ class SparseImplicitGemmFunction(Function):
masks
=
masks
,
mask_width
=
mask_width
,
is_subm
=
is_subm
,
timer
=
timer
)
timer
=
timer
,
fp32_accum
=
fp32_accum
)
except
Exception
as
e
:
msg
=
"[Exception|implicit_gemm_backward]"
msg
+=
f
"feat=
{
features
.
shape
}
,w=
{
filters
.
shape
}
,pair=
{
pair_fwd
.
shape
}
,"
...
...
@@ -251,7 +257,7 @@ class SparseImplicitGemmFunction(Function):
masks
))
raise
e
None_9
=
[
None
]
*
1
1
None_9
=
[
None
]
*
1
2
return
(
input_bp
,
filters_bp
,
*
None_9
)
...
...
spconv/pytorch/ops.py
View file @
52594038
...
...
@@ -23,7 +23,7 @@ import spconv
from
spconv.core
import
AlgoHint
,
ConvAlgo
from
typing
import
List
,
Optional
,
Union
from
spconv.pytorch.core
import
ThrustSortAllocator
from
spconv.pytorch.cppcore
import
torch_tensor_to_tv
,
get_current_stream
from
spconv.pytorch.cppcore
import
torch_tensor_to_tv
,
get_current_stream
,
get_arch
from
spconv.core_cc.csrc.sparse.all
import
SpconvOps
import
spconv.core_cc
as
_ext
...
...
@@ -666,7 +666,7 @@ def indice_conv(features: torch.Tensor,
profile_idx
=
i
assert
nhot_profile
>
0
,
"this shouldn't happen"
# print(nhot_profile, indice_pair_num_cpu)
arch
=
torch
.
cuda
.
get_device_capability
()
arch
=
get_arch
()
tuned_res
=
GEMM
.
get_tuned_algo
(
a
.
dtype
,
filters_tv
.
dtype
,
...
...
@@ -809,7 +809,7 @@ def indice_conv_backward(features: torch.Tensor,
return
(
din
,
dfilters
.
reshape
(
filters_shape
))
maxnhot
=
max
(
indice_pair_num_cpu
)
arch
=
torch
.
cuda
.
get_device_capability
()
arch
=
get_arch
()
filters_tv
=
torch_tensor_to_tv
(
filters
)
dfilters_tv
=
torch_tensor_to_tv
(
dfilters
)
...
...
@@ -1051,7 +1051,8 @@ def implicit_gemm(features: torch.Tensor,
masks
:
List
[
np
.
ndarray
],
is_train
:
bool
,
is_subm
:
bool
,
timer
:
CUDAKernelTimer
=
CUDAKernelTimer
(
False
)):
timer
:
CUDAKernelTimer
=
CUDAKernelTimer
(
False
),
fp32_accum
:
Optional
[
bool
]
=
None
):
stream
=
get_current_stream
()
# if DEBUG:
...
...
@@ -1085,7 +1086,7 @@ def implicit_gemm(features: torch.Tensor,
features_tv
=
torch_tensor_to_tv
(
features
)
filters_tv
=
torch_tensor_to_tv
(
filters
)
out_features_tv
=
torch_tensor_to_tv
(
out_features
)
arch
=
torch
.
cuda
.
get_device_capability
()
arch
=
get_arch
()
pair_mask_fwd_split_tvs
=
[
torch_tensor_to_tv
(
x
,
dtype
=
tv
.
uint32
)
for
x
in
pair_mask_fwd_splits
]
...
...
@@ -1113,7 +1114,8 @@ def implicit_gemm(features: torch.Tensor,
indices
=
pair_fwd_tv
,
reverse_mask
=
False
,
mask_filter
=
masks
[
0
].
item
(),
stream
=
stream
)
stream
=
stream
,
fp32_accum
=
fp32_accum
)
mask_width
=
tune_res
.
algo_desp
.
tile_shape
[
0
]
if
is_train
:
mask_output_fwd
=
torch
.
empty
(
...
...
@@ -1180,7 +1182,8 @@ def implicit_gemm_backward(features: torch.Tensor,
masks
:
List
[
np
.
ndarray
],
mask_width
:
int
,
is_subm
:
bool
,
timer
:
CUDAKernelTimer
=
CUDAKernelTimer
(
False
)):
timer
:
CUDAKernelTimer
=
CUDAKernelTimer
(
False
),
fp32_accum
:
Optional
[
bool
]
=
None
):
# print(out_bp.mean(), out_bp.max(), out_bp.min())
if
features
.
dtype
==
torch
.
int8
or
features
.
dtype
==
torch
.
qint8
:
raise
NotImplementedError
(
"work in progress"
)
...
...
@@ -1217,7 +1220,7 @@ def implicit_gemm_backward(features: torch.Tensor,
dout_tv
=
torch_tensor_to_tv
(
out_bp
)
din_tv
=
torch_tensor_to_tv
(
din
)
mask_output_fwd_tv
=
torch_tensor_to_tv
(
mask_output_fwd
,
dtype
=
tv
.
uint32
)
arch
=
torch
.
cuda
.
get_device_capability
()
arch
=
get_arch
()
pair_mask_fwd_split_tvs
=
[
torch_tensor_to_tv
(
x
,
dtype
=
tv
.
uint32
)
for
x
in
pair_mask_fwd_splits
]
...
...
@@ -1263,7 +1266,8 @@ def implicit_gemm_backward(features: torch.Tensor,
indices
=
pair_bwd_tv
,
reverse_mask
=
is_subm
,
mask_filter
=
masks
[
0
].
item
(),
stream
=
stream
)
stream
=
stream
,
fp32_accum
=
fp32_accum
)
if
wgrad_tune_res
is
None
:
wgrad_tune_res
,
_
=
CONV
.
tune_and_cache
(
ConvOpType
.
kBackwardWeight
,
...
...
test/benchmark.py
View file @
52594038
...
...
@@ -289,7 +289,7 @@ def main():
voxels_th
=
torch
.
from_numpy
(
voxels
).
to
(
device
).
to
(
dtype
)
coors_th
=
torch
.
from_numpy
(
coors
).
to
(
device
).
int
()
voxels_th
.
requires_grad
=
True
algo
=
spconv
.
ConvAlgo
.
Native
algo
=
spconv
.
ConvAlgo
.
MaskImplicitGemm
# 3080 Laptop
# MaskImpGemm: 11.2ms
# MaskSplitImpGemm: 12.2ms
...
...
version.txt
View file @
52594038
2.1.20
\ No newline at end of file
2.1.21
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment