Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
one
spconv
Commits
bab09b63
Commit
bab09b63
authored
Jul 12, 2022
by
yan.yan
Browse files
Merge branch 'develop'
parents
7af751dc
66529500
Changes
35
Hide whitespace changes
Inline
Side-by-side
Showing
15 changed files
with
1453 additions
and
688 deletions
+1453
-688
spconv/pytorch/core.py
spconv/pytorch/core.py
+8
-1
spconv/pytorch/cppcore.py
spconv/pytorch/cppcore.py
+15
-6
spconv/pytorch/functional.py
spconv/pytorch/functional.py
+1
-0
spconv/pytorch/hash.py
spconv/pytorch/hash.py
+6
-6
spconv/pytorch/modules.py
spconv/pytorch/modules.py
+1
-0
spconv/pytorch/ops.py
spconv/pytorch/ops.py
+98
-49
test/benchmark.py
test/benchmark.py
+10
-9
test/test_all_algo.py
test/test_all_algo.py
+682
-0
test/test_conv.py
test/test_conv.py
+275
-582
test/test_implgemm.py
test/test_implgemm.py
+0
-15
test/test_multi_impl.py
test/test_multi_impl.py
+341
-0
test/test_native_kernels.py
test/test_native_kernels.py
+0
-14
test_before_push.sh
test_before_push.sh
+10
-0
tools/install_windows_cuda.ps1
tools/install_windows_cuda.ps1
+5
-5
version.txt
version.txt
+1
-1
No files found.
spconv/pytorch/core.py
View file @
bab09b63
...
...
@@ -131,7 +131,8 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):
indice_dict
:
Optional
[
dict
]
=
None
,
benchmark
:
bool
=
False
,
permanent_thrust_allocator
:
bool
=
False
,
enable_timer
:
bool
=
False
):
enable_timer
:
bool
=
False
,
force_algo
:
Optional
[
ConvAlgo
]
=
None
):
"""
Args:
features: [num_points, num_features] feature tensor
...
...
@@ -142,6 +143,8 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):
is very large.
benchmark: whether to enable benchmark. if enabled, all sparse operators will be record to
SparseConvTensor.
enable_timer: if exists, all spconv internal ops run time will be record in _timer.
force_algo: force conv/pool layers use this algo, should only used for debug.
"""
ndim
=
indices
.
shape
[
1
]
-
1
assert
features
.
ndim
==
2
...
...
@@ -166,6 +169,7 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):
if
permanent_thrust_allocator
:
self
.
thrust_allocator
=
ThrustSortAllocator
(
features
.
device
)
self
.
_timer
=
CUDAKernelTimer
(
enable_timer
)
self
.
force_algo
=
force_algo
def
replace_feature
(
self
,
feature
:
torch
.
Tensor
):
"""we need to replace x.features = F.relu(x.features) with x = x.replace_feature(F.relu(x.features))
...
...
@@ -179,6 +183,8 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):
new_spt
.
benchmark_record
=
self
.
benchmark_record
new_spt
.
thrust_allocator
=
self
.
thrust_allocator
new_spt
.
_timer
=
self
.
_timer
new_spt
.
force_algo
=
self
.
force_algo
return
new_spt
@
property
...
...
@@ -244,6 +250,7 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):
tensor
.
benchmark_record
=
self
.
benchmark_record
tensor
.
thrust_allocator
=
self
.
thrust_allocator
tensor
.
_timer
=
self
.
_timer
tensor
.
force_algo
=
self
.
force_algo
return
tensor
def
expand_nd
(
ndim
:
int
,
val
:
Union
[
int
,
List
[
int
],
Tuple
[
int
,
...],
np
.
ndarray
])
->
List
[
int
]:
...
...
spconv/pytorch/cppcore.py
View file @
bab09b63
...
...
@@ -36,8 +36,9 @@ _ALL_INTS = {tv.int32, tv.int16, tv.int8, tv.int64, tv.uint64, tv.uint8, tv.uint
def
torch_tensor_to_tv
(
ten
:
torch
.
Tensor
,
dtype
:
Optional
[
int
]
=
None
,
shape
:
Optional
[
List
[
int
]]
=
None
):
assert
ten
.
is_contiguous
(),
"must be contiguous tensor"
shape
:
Optional
[
List
[
int
]]
=
None
,
stride
:
Optional
[
List
[
int
]]
=
None
):
# assert ten.is_contiguous(), "must be contiguous tensor"
ptr
=
ten
.
data_ptr
()
device
=
ten
.
device
if
device
.
type
==
"cpu"
:
...
...
@@ -46,12 +47,20 @@ def torch_tensor_to_tv(ten: torch.Tensor,
tv_device
=
0
else
:
raise
NotImplementedError
if
shape
is
None
:
shape
=
list
(
ten
.
shape
)
if
dtype
is
None
:
dtype
=
_TORCH_DTYPE_TO_TV
[
ten
.
dtype
]
stride
=
ten
.
stride
()
return
tv
.
from_blob_strided
(
ptr
,
shape
,
list
(
stride
),
dtype
,
tv_device
)
if
stride
is
None
:
stride
=
list
(
ten
.
stride
())
if
shape
is
None
:
shape
=
list
(
ten
.
shape
)
else
:
if
not
ten
.
is_contiguous
():
msg
=
"if you provide custom shape for non-contig tensor, stride must not None"
assert
stride
is
not
None
,
msg
else
:
# custom shape, if tensor is contiguous, we use from_blob and calc strides
return
tv
.
from_blob
(
ptr
,
shape
,
dtype
,
tv_device
)
return
tv
.
from_blob_strided
(
ptr
,
shape
,
stride
,
dtype
,
tv_device
)
def
torch_tensors_to_tv
(
*
tens
:
torch
.
Tensor
):
return
(
torch_tensor_to_tv
(
t
)
for
t
in
tens
)
...
...
spconv/pytorch/functional.py
View file @
bab09b63
...
...
@@ -19,6 +19,7 @@ import torch
from
torch
import
nn
from
torch.autograd
import
Function
from
typing
import
Optional
,
TypeVar
from
spconv.pytorch.core
import
SparseConvTensor
from
spconv.tools
import
CUDAKernelTimer
from
spconv.pytorch
import
ops
,
SparseConvTensor
from
spconv.pytorch.constants
import
PYTORCH_VERSION
...
...
spconv/pytorch/hash.py
View file @
bab09b63
...
...
@@ -80,7 +80,7 @@ class HashTable:
def
query
(
self
,
keys
:
torch
.
Tensor
,
values
:
Optional
[
torch
.
Tensor
]
=
None
):
"""query value by keys, if values is not None, create a new one.
return values and a uint8 tensor that whether query
success
.
return values and a uint8 tensor that whether query
fail
.
"""
keys_tv
=
torch_tensor_to_tv
(
keys
)
if
values
is
None
:
...
...
@@ -96,17 +96,17 @@ class HashTable:
def
insert_exist_keys
(
self
,
keys
:
torch
.
Tensor
,
values
:
torch
.
Tensor
):
"""insert kv that k exists in table. return a uint8 tensor that
whether insert
success
.
whether insert
fail
.
"""
keys_tv
=
torch_tensor_to_tv
(
keys
)
values_tv
=
torch_tensor_to_tv
(
values
)
stream
=
0
if
not
self
.
is_cpu
:
stream
=
get_current_stream
()
is_
success
=
torch
.
empty
([
keys
.
shape
[
0
]],
dtype
=
torch
.
uint8
,
device
=
keys
.
device
)
is_
success
_tv
=
torch_tensor_to_tv
(
is_
success
)
self
.
_table
.
insert_exist_keys
(
keys_tv
,
values_tv
,
is_
success
_tv
,
stream
)
return
is_
success
>
0
is_
empty
=
torch
.
empty
([
keys
.
shape
[
0
]],
dtype
=
torch
.
uint8
,
device
=
keys
.
device
)
is_
empty
_tv
=
torch_tensor_to_tv
(
is_
empty
)
self
.
_table
.
insert_exist_keys
(
keys_tv
,
values_tv
,
is_
empty
_tv
,
stream
)
return
is_
empty
def
assign_arange_
(
self
):
"""iterate table, assign values with "arange" value.
...
...
spconv/pytorch/modules.py
View file @
bab09b63
...
...
@@ -137,6 +137,7 @@ class SparseSequential(SparseModule):
input
=
module
(
input
)
else
:
if
isinstance
(
input
,
spconv
.
SparseConvTensor
):
print
(
input
.
features
.
shape
)
if
input
.
indices
.
shape
[
0
]
!=
0
:
input
=
input
.
replace_feature
(
module
(
input
.
features
))
else
:
...
...
spconv/pytorch/ops.py
View file @
bab09b63
...
...
@@ -39,7 +39,7 @@ else:
GEMM
=
None
CONV
=
None
import
time
from
spconv.constants
import
FILTER_HWIO
from
spconv.constants
import
FILTER_HWIO
,
ALL_WEIGHT_IS_KRSC
from
cumm.gemm
import
codeops
from
spconv.tools
import
CUDAKernelTimer
...
...
@@ -630,21 +630,40 @@ def indice_conv(features: torch.Tensor,
if
features
.
dtype
==
torch
.
int8
or
features
.
dtype
==
torch
.
qint8
:
raise
NotImplementedError
(
"work in progress"
)
if
FILTER_HWIO
:
out_channel
=
filters
.
shape
[
-
1
]
if
not
ALL_WEIGHT_IS_KRSC
:
kv_dim
=
0
is_KC_not_CK
=
not
FILTER_HWIO
if
FILTER_HWIO
:
out_channel
=
filters
.
shape
[
-
1
]
filter_shape_per_kv
=
[
filters
.
shape
[
-
2
],
out_channel
]
else
:
out_channel
=
filters
.
shape
[
-
2
]
filter_shape_per_kv
=
[
out_channel
,
filters
.
shape
[
-
1
]]
filters
=
filters
.
reshape
(
-
1
,
*
filters
.
shape
[
-
2
:])
kv
=
filters
.
shape
[
0
]
else
:
out_channel
=
filters
.
shape
[
-
2
]
filters
=
filters
.
reshape
(
-
1
,
*
filters
.
shape
[
-
2
:])
kv
=
filters
.
shape
[
0
]
kv_dim
=
1
out_channel
=
filters
.
shape
[
0
]
filters
=
filters
.
reshape
(
out_channel
,
-
1
,
filters
.
shape
[
-
1
])
is_KC_not_CK
=
True
kv
=
filters
.
shape
[
1
]
filter_shape_per_kv
=
[
out_channel
,
filters
.
shape
[
-
1
]]
kv_center
=
kv
//
2
if
subm
:
# out_features = torch.zeros((num_activate_out, out_channel),
# dtype=features.dtype,
# device=features.device)
if
FILTER_HWIO
:
out_features
=
torch
.
mm
(
features
,
filters
[
kv_center
])
if
not
ALL_WEIGHT_IS_KRSC
:
if
not
is_KC_not_CK
:
out_features
=
torch
.
mm
(
features
,
filters
[
kv_center
])
else
:
out_features
=
torch
.
mm
(
features
,
filters
[
kv_center
].
T
)
else
:
out_features
=
torch
.
mm
(
features
,
filters
[
kv_center
].
T
)
out_features
=
torch
.
mm
(
features
,
filters
[
:,
kv_center
].
T
)
else
:
out_features
=
torch
.
zeros
((
num_activate_out
,
out_channel
),
dtype
=
features
.
dtype
,
...
...
@@ -664,7 +683,6 @@ def indice_conv(features: torch.Tensor,
pair_in
=
indice_pairs_tv
[
int
(
inverse
)]
pair_out
=
indice_pairs_tv
[
int
(
not
inverse
)]
filters_tv
=
torch_tensor_to_tv
(
filters
)
if
not
features
.
is_cuda
:
# perform gather-mm-scatter_add for cpu data
assert
not
filters
.
is_cuda
...
...
@@ -686,7 +704,8 @@ def indice_conv(features: torch.Tensor,
inp_indices
=
pair_in
[
i
].
slice_first_axis
(
0
,
nhot
)
out_indices
=
pair_out
[
i
].
slice_first_axis
(
0
,
nhot
)
SpconvOps
.
gather_cpu
(
inp_buffer_tv
,
a
,
inp_indices
)
filters_cur
=
filters
[
i
]
if
FILTER_HWIO
else
filters
[
i
].
T
filters_i
=
filters
.
select
(
kv_dim
,
i
)
filters_cur
=
filters_i
if
not
is_KC_not_CK
else
filters_i
.
T
torch
.
mm
(
inp_buffer
[:
nhot
],
filters_cur
,
out
=
out_buffer
[:
nhot
])
SpconvOps
.
scatter_add_cpu
(
c
,
out_buffer_tv
,
out_indices
)
...
...
@@ -713,10 +732,10 @@ def indice_conv(features: torch.Tensor,
filters_tv
.
dtype
,
c
.
dtype
,
a
.
shape
,
filter
s
.
shape
[
-
2
:]
,
filter
_
shape
_per_kv
,
c
.
shape
,
False
,
False
if
FILTER_HWIO
else
True
,
is_KC_not_CK
,
False
,
arch
=
arch
,
shuffle_type
=
ShuffleStrideType
.
ShuffleAC
,
...
...
@@ -732,13 +751,14 @@ def indice_conv(features: torch.Tensor,
inp_indices
=
torch_tensor_to_tv
(
inp_indices_th
)
out_indices
=
torch_tensor_to_tv
(
out_indices_th
)
filter_tv
=
torch_tensor_to_tv
(
filters
)[
profile_idx
]
filter_tv
=
torch_tensor_to_tv
(
filters
).
select
(
kv_dim
,
profile_idx
)
tuned_res
,
min_time
=
GEMM
.
tune_and_cache
(
a
,
filter_tv
,
c
,
False
,
False
if
FILTER_HWIO
else
True
,
is_KC_not_CK
,
False
,
arch
=
arch
,
shuffle_type
=
ShuffleStrideType
.
ShuffleAC
,
...
...
@@ -760,7 +780,7 @@ def indice_conv(features: torch.Tensor,
continue
inp_indices
=
pair_in
[
i
].
slice_first_axis
(
0
,
nhot
)
out_indices
=
pair_out
[
i
].
slice_first_axis
(
0
,
nhot
)
b
=
filters_tv
[
i
]
b
=
filters_tv
.
select
(
kv_dim
,
i
)
# inp @ filter.T, NC @ KC
beta
=
1.0
if
inited
else
0.0
algo_desp
=
GEMM
.
run_with_tuned_result
(
...
...
@@ -769,7 +789,7 @@ def indice_conv(features: torch.Tensor,
b
,
c
,
False
,
False
if
FILTER_HWIO
else
True
,
is_KC_not_CK
,
False
,
arch
=
arch
,
stream
=
stream
,
...
...
@@ -807,11 +827,27 @@ def indice_conv_backward(features: torch.Tensor,
timer
:
CUDAKernelTimer
=
CUDAKernelTimer
(
False
)):
# print(out_bp.mean(), out_bp.max(), out_bp.min())
num_activate_out
=
out_bp
.
shape
[
0
]
out_channel
=
out_bp
.
shape
[
-
1
]
filters_shape
=
filters
.
shape
filters
=
filters
.
reshape
(
-
1
,
*
filters
.
shape
[
-
2
:])
kv
=
filters
.
shape
[
0
]
if
not
ALL_WEIGHT_IS_KRSC
:
kv_dim
=
0
is_KC_not_CK
=
not
FILTER_HWIO
if
FILTER_HWIO
:
out_channel
=
filters
.
shape
[
-
1
]
filter_shape_per_kv
=
[
filters
.
shape
[
-
2
],
out_channel
]
else
:
out_channel
=
filters
.
shape
[
-
2
]
filter_shape_per_kv
=
[
out_channel
,
filters
.
shape
[
-
1
]]
filters
=
filters
.
reshape
(
-
1
,
*
filters
.
shape
[
-
2
:])
kv
=
filters
.
shape
[
0
]
else
:
kv_dim
=
1
out_channel
=
filters
.
shape
[
0
]
filters
=
filters
.
reshape
(
out_channel
,
-
1
,
filters
.
shape
[
-
1
])
is_KC_not_CK
=
True
kv
=
filters
.
shape
[
1
]
filter_shape_per_kv
=
[
out_channel
,
filters
.
shape
[
-
1
]]
kv_center
=
kv
//
2
# TODO handle this in nn.Module to make sure features in backward is contiguous
if
not
features
.
is_contiguous
():
...
...
@@ -824,20 +860,24 @@ def indice_conv_backward(features: torch.Tensor,
if
subm
:
dfilters
=
torch
.
zeros_like
(
filters
)
if
FILTER_HWIO
:
torch
.
mm
(
features
.
T
,
out_bp
,
out
=
dfilters
[
kv_center
])
# TODO can we use torch mm for f16 backward weight?
din
=
torch
.
mm
(
out_bp
,
filters
[
kv_center
].
T
)
if
not
ALL_WEIGHT_IS_KRSC
:
if
not
is_KC_not_CK
:
torch
.
mm
(
features
.
T
,
out_bp
,
out
=
dfilters
[
kv_center
])
din
=
torch
.
mm
(
out_bp
,
filters
[
kv_center
].
T
)
else
:
torch
.
mm
(
out_bp
.
T
,
features
,
out
=
dfilters
[
kv_center
])
din
=
torch
.
mm
(
out_bp
,
filters
[
kv_center
])
else
:
torch
.
mm
(
out_bp
.
T
,
features
,
out
=
dfilters
[
kv_center
])
# TODO can we use torch mm for f16 backward weight?
din
=
torch
.
mm
(
out_bp
,
filters
[
kv_center
])
# KN @ NC
torch
.
mm
(
out_bp
.
T
,
features
,
out
=
dfilters
[:,
kv_center
])
# NK @ KC
din
=
torch
.
mm
(
out_bp
,
filters
[:,
kv_center
])
else
:
dfilters
=
torch
.
zeros_like
(
filters
)
din
=
torch
.
zeros_like
(
features
)
if
kv
==
1
and
subm
:
return
(
din
,
dfilters
.
reshape
(
filters_shape
))
inited
:
bool
=
subm
indice_pairs_tv
=
torch_tensor_to_tv
(
indice_pairs
)
# torch slice (a_th[x]) is very slow, so we need to use tv.Tensor earlier.
...
...
@@ -881,12 +921,18 @@ def indice_conv_backward(features: torch.Tensor,
out_indices
=
pair_out
[
i
].
slice_first_axis
(
0
,
nhot
)
SpconvOps
.
gather_cpu
(
inp_buffer_tv
,
features_tv
,
inp_indices
)
SpconvOps
.
gather_cpu
(
out_buffer_tv
,
out_bp_tv
,
out_indices
)
filters_T_cur
=
filters
[
i
].
T
if
FILTER_HWIO
else
filters
[
i
]
dfilters_cur
=
dfilters
[
i
]
if
FILTER_HWIO
else
dfilters
[
i
].
T
torch
.
mm
(
inp_buffer
[:
nhot
].
T
,
out_buffer
[:
nhot
],
out
=
dfilters_cur
)
torch
.
mm
(
out_buffer
[:
nhot
],
filters_T_cur
,
out
=
inp_buffer
[:
nhot
])
filters_i
=
filters
.
select
(
kv_dim
,
i
)
dfilters_i
=
dfilters
.
select
(
kv_dim
,
i
)
filters_KC
=
filters_i
if
is_KC_not_CK
else
filters_i
.
T
if
is_KC_not_CK
:
# KN @ NC
torch
.
mm
(
out_buffer
[:
nhot
].
T
,
inp_buffer
[:
nhot
],
out
=
dfilters_i
)
else
:
# CN @ NK
torch
.
mm
(
inp_buffer
[:
nhot
].
T
,
out_buffer
[:
nhot
],
out
=
dfilters_i
)
# NK @ KC
torch
.
mm
(
out_buffer
[:
nhot
],
filters_KC
,
out
=
inp_buffer
[:
nhot
])
SpconvOps
.
scatter_add_cpu
(
din_tv
,
inp_buffer_tv
,
inp_indices
)
return
(
din
,
dfilters
.
reshape
(
filters_shape
))
...
...
@@ -910,10 +956,10 @@ def indice_conv_backward(features: torch.Tensor,
filters_tv
.
dtype
,
din_tv
.
dtype
,
out_bp_tv
.
shape
,
filter
s
.
shape
[
-
2
:]
,
filter
_
shape
_per_kv
,
din_tv
.
shape
,
False
,
True
if
FILTER_HWIO
else
False
,
not
is_KC_not_CK
,
False
,
arch
=
arch
,
shuffle_type
=
ShuffleStrideType
.
ShuffleAC
,
...
...
@@ -923,13 +969,13 @@ def indice_conv_backward(features: torch.Tensor,
if
tuned_res_dgrad
is
None
:
inp_indices
=
pair_in
[
profile_idx
].
slice_first_axis
(
0
,
nhot_profile
)
out_indices
=
pair_out
[
profile_idx
].
slice_first_axis
(
0
,
nhot_profile
)
filter_tv
=
filters_tv
[
profile_idx
]
filter_tv
=
filters_tv
.
select
(
kv_dim
,
profile_idx
)
tuned_res_dgrad
,
min_time
=
GEMM
.
tune_and_cache
(
out_bp_tv
,
filter_tv
,
din_tv
,
False
,
True
if
FILTER_HWIO
else
False
,
not
is_KC_not_CK
,
False
,
arch
=
arch
,
shuffle_type
=
ShuffleStrideType
.
ShuffleAC
,
...
...
@@ -939,7 +985,7 @@ def indice_conv_backward(features: torch.Tensor,
beta
=
0.0
,
hint
=
AlgoHint
.
BackwardInput
.
value
,
stream
=
stream
)
if
not
FILTER_HWIO
:
if
is_KC_not_CK
:
a_wgrad
=
out_bp_tv
b_wgrad
=
features_tv
else
:
...
...
@@ -951,7 +997,7 @@ def indice_conv_backward(features: torch.Tensor,
filters_tv
.
dtype
,
a_wgrad
.
shape
,
b_wgrad
.
shape
,
filter
s
.
shape
[
-
2
:]
,
filter
_
shape
_per_kv
,
True
,
False
,
False
,
...
...
@@ -964,8 +1010,8 @@ def indice_conv_backward(features: torch.Tensor,
if
tuned_res_wgrad
is
None
:
inp_indices
=
pair_in
[
profile_idx
].
slice_first_axis
(
0
,
nhot_profile
)
out_indices
=
pair_out
[
profile_idx
].
slice_first_axis
(
0
,
nhot_profile
)
dfilter_tv
=
dfilters_tv
[
profile_idx
]
if
not
FILTER_HWIO
:
dfilter_tv
=
dfilters_tv
.
select
(
kv_dim
,
profile_idx
)
if
is_KC_not_CK
:
a_inds_wgrad
=
out_indices
b_inds_wgrad
=
inp_indices
else
:
...
...
@@ -988,7 +1034,7 @@ def indice_conv_backward(features: torch.Tensor,
stream
=
stream
)
# print(tuned_res_wgrad.algo_desp, tuned_res_wgrad.splitk, min_time)
# get workspace size for wgrad
if
not
FILTER_HWIO
:
if
is_KC_not_CK
:
a_shape
=
[
maxnhot
,
out_bp_tv
.
dim
(
1
)]
b_shape
=
[
maxnhot
,
features_tv
.
dim
(
1
)]
else
:
...
...
@@ -1030,13 +1076,13 @@ def indice_conv_backward(features: torch.Tensor,
inp_indices
=
pair_in
[
i
].
slice_first_axis
(
0
,
nhot
)
out_indices
=
pair_out
[
i
].
slice_first_axis
(
0
,
nhot
)
# out.T @ inp, NK @ NC
# print(features_tv.shape, out_bp_tv.shape
)
filter_i_tv
=
filters_tv
.
select
(
kv_dim
,
i
)
GEMM
.
run_with_tuned_result
(
tuned_res_dgrad
,
out_bp_tv
,
filter
s
_tv
[
i
]
,
filter
_i
_tv
,
din_tv
,
False
,
True
if
FILTER_HWIO
else
False
,
not
is_KC_not_CK
,
False
,
arch
=
arch
,
stream
=
stream
,
...
...
@@ -1047,7 +1093,7 @@ def indice_conv_backward(features: torch.Tensor,
alpha
=
1.0
,
beta
=
beta
)
if
not
FILTER_HWIO
:
if
is_KC_not_CK
:
a
=
out_bp_tv
b
=
features_tv
a_inds
=
out_indices
...
...
@@ -1060,7 +1106,7 @@ def indice_conv_backward(features: torch.Tensor,
GEMM
.
run_with_tuned_result
(
tuned_res_wgrad
,
a
,
b
,
dfilters_tv
[
i
]
,
dfilters_tv
.
select
(
kv_dim
,
i
)
,
True
,
False
,
False
,
...
...
@@ -1365,6 +1411,9 @@ def implicit_gemm_backward(features: torch.Tensor,
mask_width
=-
1
,
beta
=
beta
,
stream
=
stream
)
# for backward weight, beta = 0 because each split
# handle different kernel locations.
# TODO remove D iterator in backward weight kernel
CONV
.
run_with_tuned_result
(
wgrad_tune_res
,
ConvOpType
.
kBackwardWeight
,
...
...
@@ -1378,7 +1427,7 @@ def implicit_gemm_backward(features: torch.Tensor,
reverse_mask
=
False
,
mask_filter
=
masks
[
j
].
item
(),
mask_width
=
mask_width
,
beta
=
beta
,
beta
=
0
,
workspace
=
workspace_tv
,
stream
=
stream
)
...
...
test/benchmark.py
View file @
bab09b63
...
...
@@ -24,7 +24,7 @@ from spconv.core import ConvAlgo
import
spconv.pytorch
as
spconv
from
spconv.utils
import
Point2VoxelCPU3d
# torch.backends.cudnn.enabled = False
def
waymo_data
(
batch_size
=
1
):
gen
=
Point2VoxelCPU3d
([
0.1
,
0.1
,
0.1
],
[
-
80
,
-
80
,
-
2
,
80
,
80
,
6
],
3
,
150000
,
1
)
...
...
@@ -168,8 +168,8 @@ class Net(nn.Module):
# nn.ReLU(),
# spconv.SparseInverseConv3d(256, 128, 2, indice_key="m5", bias=False, algo=algo),
# # nn.BatchNorm1d(128),
# # nn.ReLU(),
# #
#
nn.BatchNorm1d(128),
# #
#
nn.ReLU(),
# spconv.SparseInverseConv3d(128, 64, 2, indice_key="m4", bias=False, algo=algo),
)
...
...
@@ -312,7 +312,8 @@ def main():
# MaskImpGemm: 51.0ms
# MaskSplitImpGemm: 41.1ms
# algo = None
net
=
Net
(
spatial_shape
,
algo
).
to
(
device
).
eval
().
to
(
dtype
).
train
()
net
=
Net
(
spatial_shape
,
algo
).
to
(
device
).
eval
().
to
(
dtype
)
# .train()
# net.load_state_dict(net.state_dict())
spconv
.
assign_name_for_sparse_modules
(
net
)
print
(
coors_th
.
shape
)
out
=
net
(
voxels_th
,
coors_th
,
1
)
...
...
@@ -329,12 +330,12 @@ def main():
print
(
"------------"
)
torch
.
cuda
.
synchronize
()
t
=
time
.
time
()
out_nograd
=
net
(
voxels_th
,
coors_th
,
1
,
Tru
e
)
out_nograd
=
net
(
voxels_th
,
coors_th
,
1
,
Fals
e
)
timer
=
out_nograd
.
_timer
res
=
timer
.
collect_by_name
(
"forward"
,
timer
.
get_all_pair_time
())
res2
=
timer
.
collect_by_name
(
"forward0"
,
timer
.
get_all_pair_time
())
#
res = timer.collect_by_name("forward", timer.get_all_pair_time())
#
res2 = timer.collect_by_name("forward0", timer.get_all_pair_time())
print
(
sum
(
res
.
values
())
+
sum
(
res2
.
values
()))
#
print(sum(res.values()) + sum(res2.values()))
# print(timer.get_all_pair_time())
# print(sum(timer.get_all_pair_time().values()))
...
...
@@ -342,7 +343,7 @@ def main():
# sort_bench()
times
.
append
(
time
.
time
()
-
t
)
print
(
"spconv time"
,
np
.
mean
(
times
[
10
:]))
#
times = []
times
=
[]
# for i in range(10):
# out = net(voxels_th, coors_th, 1)
...
...
test/test_all_algo.py
0 → 100644
View file @
bab09b63
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Test all gemm/conv kernels.

We can't cover every kernel by running a network, because the auto-tuner
only picks one best kernel.
"""
import
sys
from
pathlib
import
Path
from
typing
import
Dict
,
List
,
Tuple
import
pickle
import
sys
import
time
from
pathlib
import
Path
from
cumm.gemm.algospec.core
import
GemmAlgo
,
ShuffleStrideType
import
numpy
as
np
import
pccm
import
torch
import
torch.nn.functional
as
F
from
spconv.test_utils
import
TestCase
from
cumm
import
tensorview
as
tv
from
cumm.conv.bases
import
NCHW
,
NHWC
,
ConvIterAlgo
,
ConvOpType
import
os
from
cumm.gemm.codeops
import
div_up
from
spconv.core
import
AlgoHint
,
ConvAlgo
from
spconv.pytorch.conv
import
expand_nd
from
spconv.pytorch
import
ops
from
spconv.algo
import
CONV
,
GEMM
,
BestAlgoByProfile
,
BestConvAlgoByProfile
from
spconv.pytorch.cppcore
import
get_current_stream
,
torch_tensor_to_tv
from
spconv.test_utils
import
generate_sparse_data
,
params_grid
import
tqdm
from
spconv.constants
import
ALL_WEIGHT_IS_KRSC
assert
ALL_WEIGHT_IS_KRSC
is
True
,
"we only support KRSC in spconv >= 2.2"
# TODO remove or release this when tf32 op is ready
# Turn TF32 off in both the cuDNN and the CUDA matmul backends of PyTorch.
torch.backends.cudnn.allow_tf32 = False
torch.backends.cuda.matmul.allow_tf32 = False

# Lookup table from a NumPy scalar type to the torch dtype of the same name.
NUMPY_DTYPE_TO_TORCH = {
    getattr(np, dtype_name): getattr(torch, dtype_name)
    for dtype_name in ("float32", "float16", "int8")
}
class
SparseConvTester
:
def
__init__
(
self
,
algo
:
ConvAlgo
,
subm
:
bool
,
shape
:
List
[
int
],
bs
:
int
,
dtype
:
np
.
dtype
,
N
:
int
,
K
:
int
,
C
:
int
,
ksize
:
int
,
stride
:
int
,
padding
:
int
,
dilation
:
int
)
->
None
:
ndim
=
3
self
.
shape
=
shape
self
.
bs
=
bs
self
.
dtype
=
dtype
self
.
dtype_th
=
NUMPY_DTYPE_TO_TORCH
[
dtype
]
self
.
K
=
K
self
.
C
=
C
self
.
ksize
=
expand_nd
(
ksize
,
ndim
)
self
.
stride
=
expand_nd
(
stride
,
ndim
)
self
.
padding
=
expand_nd
(
padding
,
ndim
)
self
.
dilation
=
expand_nd
(
dilation
,
ndim
)
self
.
N
=
N
self
.
device
=
torch
.
device
(
"cuda:0"
)
op
=
expand_nd
(
0
,
ndim
)
self
.
kv
:
int
=
np
.
prod
(
self
.
ksize
)
self
.
num_split
=
1
if
algo
==
ConvAlgo
.
MaskImplicitGemm
else
2
sparse_dict
=
generate_sparse_data
(
shape
,
[
N
]
*
bs
,
C
)
voxels_np
=
np
.
ascontiguousarray
(
sparse_dict
[
"features"
]).
astype
(
np
.
float32
)
indices_np
=
np
.
ascontiguousarray
(
sparse_dict
[
"indices"
][:,
[
3
,
0
,
1
,
2
]]).
astype
(
np
.
int32
)
indices_th
=
torch
.
from_numpy
(
indices_np
).
to
(
self
.
device
)
out_inds
,
pair_ref
,
indice_num_per_loc
=
ops
.
get_indice_pairs
(
indices_th
,
1
,
shape
,
ConvAlgo
.
Native
,
self
.
ksize
,
self
.
stride
,
self
.
padding
,
self
.
dilation
,
op
,
subm
)
self
.
indice_num_per_loc_np
=
indice_num_per_loc
.
cpu
().
numpy
()
self
.
indice_pairs_np
=
pair_ref
.
cpu
().
numpy
()
self
.
pair_native
=
pair_ref
self
.
indice_num_per_loc
=
indice_num_per_loc
if
algo
==
ConvAlgo
.
Native
:
self
.
out_inds
:
torch
.
Tensor
=
out_inds
self
.
num_inds_per_loc
:
torch
.
Tensor
=
indice_num_per_loc
self
.
pair_fwd
:
torch
.
Tensor
=
torch
.
Tensor
()
self
.
pair_bwd
:
torch
.
Tensor
=
torch
.
Tensor
()
self
.
pair_mask_fwd_splits
:
List
[
torch
.
Tensor
]
=
[]
self
.
pair_mask_bwd_splits
:
List
[
torch
.
Tensor
]
=
[]
self
.
mask_argsort_fwd_splits
:
List
[
torch
.
Tensor
]
=
[]
self
.
mask_argsort_bwd_splits
:
List
[
torch
.
Tensor
]
=
[]
self
.
masks
=
np
.
array
([])
else
:
res
=
ops
.
get_indice_pairs_implicit_gemm
(
indices_th
,
bs
,
shape
,
algo
,
self
.
ksize
,
self
.
stride
,
self
.
padding
,
self
.
dilation
,
op
,
subm
=
subm
)
self
.
out_inds
=
res
[
0
]
self
.
num_inds_per_loc
=
res
[
1
]
self
.
pair_fwd
=
res
[
2
]
self
.
pair_bwd
=
res
[
3
]
self
.
pair_mask_fwd_splits
=
res
[
4
]
self
.
pair_mask_bwd_splits
=
res
[
5
]
self
.
mask_argsort_fwd_splits
=
res
[
6
]
self
.
mask_argsort_bwd_splits
=
res
[
7
]
self
.
masks
=
res
[
8
]
self
.
voxels_np
=
voxels_np
self
.
indices_np
=
indices_np
self
.
subm
=
subm
if
dtype
==
np
.
int8
:
self
.
inp
=
np
.
random
.
randint
(
-
2
,
2
,
size
=
[
voxels_np
.
shape
[
0
],
C
]).
astype
(
np
.
int8
)
self
.
weight
=
np
.
random
.
randint
(
-
2
,
2
,
size
=
[
K
,
*
self
.
ksize
,
C
]).
astype
(
np
.
int8
)
self
.
output
=
np
.
random
.
randint
(
-
2
,
2
,
size
=
[
self
.
out_inds
.
shape
[
0
],
K
]).
astype
(
dtype
)
else
:
self
.
inp
=
np
.
random
.
uniform
(
-
1
,
1
,
size
=
[
voxels_np
.
shape
[
0
],
C
]).
astype
(
dtype
)
self
.
weight
=
np
.
random
.
uniform
(
-
1
,
1
,
size
=
[
K
,
*
self
.
ksize
,
C
]).
astype
(
dtype
)
self
.
output
=
np
.
random
.
uniform
(
-
1
,
1
,
size
=
[
self
.
out_inds
.
shape
[
0
],
K
]).
astype
(
dtype
)
self
.
weight_ref
=
self
.
weight
.
transpose
(
1
,
2
,
3
,
0
,
4
)
self
.
weight_ref
=
np
.
ascontiguousarray
(
self
.
weight_ref
).
reshape
(
-
1
,
K
,
C
)
self
.
out_ref
,
self
.
din_ref
,
self
.
dw_ref
=
self
.
_get_ref_output
()
self
.
dw_ref
=
np
.
ascontiguousarray
(
self
.
dw_ref
.
transpose
(
1
,
0
,
2
).
reshape
(
K
,
*
self
.
ksize
,
C
))
def
_get_ref_output
(
self
):
output_ref
=
np
.
zeros_like
(
self
.
output
,
dtype
=
np
.
float32
)
dinput_ref
=
np
.
zeros_like
(
self
.
inp
,
dtype
=
np
.
float32
)
dw_ref
=
np
.
zeros_like
(
self
.
weight_ref
,
dtype
=
np
.
float32
)
# KV, K, C
for
filter_offset
in
range
(
self
.
kv
):
if
self
.
subm
and
filter_offset
>
self
.
kv
//
2
:
nhot
=
self
.
indice_num_per_loc_np
[
self
.
kv
-
1
-
filter_offset
]
elif
self
.
subm
and
filter_offset
==
self
.
kv
//
2
:
nhot
=
self
.
voxels_np
.
shape
[
0
]
else
:
nhot
=
self
.
indice_num_per_loc_np
[
filter_offset
]
i_inds
=
self
.
indice_pairs_np
[
0
][
filter_offset
][:
nhot
]
o_inds
=
self
.
indice_pairs_np
[
1
][
filter_offset
][:
nhot
]
a
=
self
.
inp
[
i_inds
]
cc
=
a
.
astype
(
np
.
float32
)
@
self
.
weight_ref
[
filter_offset
].
T
.
astype
(
np
.
float32
)
output_ref
[
o_inds
]
+=
cc
a
=
self
.
output
[
o_inds
]
# NK @ KC
cc
=
a
.
astype
(
np
.
float32
)
@
self
.
weight_ref
[
filter_offset
].
astype
(
np
.
float32
)
dinput_ref
[
i_inds
]
+=
cc
out_gather
=
self
.
output
[
o_inds
]
# [N, K]
inp_gather
=
self
.
inp
[
i_inds
]
# [N, C]
# KN @ NC
dw_res
=
out_gather
.
astype
(
np
.
float32
).
T
@
inp_gather
.
astype
(
np
.
float32
)
dw_ref
[
filter_offset
]
=
dw_res
return
output_ref
,
dinput_ref
,
dw_ref
def get_operands(self, op_type: ConvOpType):
    """Build the ``tv`` (input, weight, output) operands for ``op_type``.

    The tensor that the requested op writes is freshly allocated with the
    host array's shape and ``self.dtype``; the other two are copied from
    the host arrays via ``tv.from_numpy(...).cuda()``.

    * ``kBackwardInput``  -> input is allocated
    * ``kBackwardWeight`` -> weight is allocated
    * ``kForward``        -> output is allocated

    Allocation uses ``tv.zeros`` for regular conv and ``tv.empty`` for
    submanifold conv (presumably because the subm path overwrites the
    whole result -- TODO confirm).

    Returns:
        Tuple ``(inp_tv, weight_tv, output_tv)``.
    """
    allocate = tv.empty if self.subm else tv.zeros

    def _make(host_array, written_by):
        # Result operand: allocate; source operand: upload host data.
        if op_type == written_by:
            # NOTE(review): trailing 0 is presumably the device index.
            return allocate(list(host_array.shape), self.dtype, 0)
        return tv.from_numpy(host_array).cuda()

    inp_tv = _make(self.inp, ConvOpType.kBackwardInput)
    weight_tv = _make(self.weight, ConvOpType.kBackwardWeight)
    output_tv = _make(self.output, ConvOpType.kForward)
    return inp_tv, weight_tv, output_tv
def get_operands_torch(self, op_type: ConvOpType):
    """Build the torch (input, weight, output) operands for ``op_type``.

    Mirrors ``get_operands`` but returns ``torch.Tensor`` objects: the
    tensor written by the requested op is allocated on ``self.device``
    with dtype ``self.dtype_th``; the remaining two are created from the
    host arrays with ``torch.from_numpy(...).cuda()``.

    Allocation uses ``torch.zeros`` for regular conv and ``torch.empty``
    for submanifold conv.

    Returns:
        Tuple ``(inp, weight, output)`` of torch tensors.
    """
    allocate = torch.empty if self.subm else torch.zeros

    def _make(host_array, written_by):
        # Result operand: allocate; source operand: upload host data.
        if op_type == written_by:
            return allocate(list(host_array.shape),
                            dtype=self.dtype_th,
                            device=self.device)
        return torch.from_numpy(host_array).cuda()

    inp_th = _make(self.inp, ConvOpType.kBackwardInput)
    weight_th = _make(self.weight, ConvOpType.kBackwardWeight)
    output_th = _make(self.output, ConvOpType.kForward)
    return inp_th, weight_th, output_th
def _test_impgemm_conv_cuda(subm: bool) -> None:
    """Check every available implicit-GEMM conv kernel against the NumPy reference.

    For each parameter combination a ``SparseConvTester`` is built, then:

    1. forward and backward-input are run with every descriptor returned by
       ``CONV.get_all_available`` and compared with ``tester.out_ref`` /
       ``tester.din_ref``;
    2. backward-weight is run for several ``spk`` values and every mask
       width seen in step 1, and compared with ``tester.dw_ref``.

    Non-float16 results are compared with ``assertAllClose``; float16 results
    are checked through the L2 norm of the error, scaled by the channel count.

    Args:
        subm: True to test submanifold convolution, False for regular
            sparse convolution (which also sweeps stride/padding/dilation).
    """
    ndim = 3
    # Fixed seed so the randomly drawn point counts are reproducible.
    np.random.seed(50005)
    # dtype -> (atol, rtol)
    dtype_to_tol = {
        np.float32: (1e-2, 1e-2),
        np.float16: (1e-2, 1e-2),
        np.int8: (1e-4, 1e-4),
    }
    device = torch.device("cuda:0")
    shapes = [[19, 18, 17]]
    batchsizes = [1]
    dtypes = [np.float32, np.float16]
    test_case = TestCase()
    in_channels = [512]
    out_channels = [512]
    # Divisor used to scale the float16 error-norm threshold with channels.
    multiple_base = 16
    if subm:
        ksizes = [3]
        strides = [1]
        paddings = [0]
        dilations = [1]
    else:
        ksizes = [2, 3]
        strides = [1, 2, 3]
        paddings = [0, 1]
        dilations = [1, 2]
    algos = [
        # ConvAlgo.MaskSplitImplicitGemm,
        ConvAlgo.MaskImplicitGemm,
    ]
    arch = torch.cuda.get_device_capability()
    for shape, bs, C, K, k, s, p, d, algo, dtype in tqdm.tqdm(
            params_grid(shapes, batchsizes, in_channels, out_channels, ksizes,
                        strides, paddings, dilations, algos, dtypes)):
        shape_prod = np.prod(shape)
        # Draw between 20% and 70% of the grid volume.
        num_batch = np.random.randint(int(0.2 * shape_prod),
                                      int(0.7 * shape_prod))
        # C = np.random.randint(int(0.3 * C), int(0.7 * C))
        # K = np.random.randint(int(0.3 * K), int(0.7 * K))
        # Threshold multiplier for the float16 checks, never below 1.
        multipler = max(C, K) / multiple_base
        multipler = max(multipler, 1.0)
        # print(num_batch)
        tester = SparseConvTester(algo, subm, shape, bs, dtype, num_batch, K,
                                  C, k, s, p, d)
        atol, rtol = dtype_to_tol[dtype]
        # One [2, ceil(n / mask_width)] int32 mask-output buffer per mask
        # width; the forward one is reused by the backward-weight pass below.
        mask_width_to_mask_out_fwd: Dict[int, torch.Tensor] = {}
        mask_width_to_mask_out_bwd: Dict[int, torch.Tensor] = {}
        op_types = [ConvOpType.kForward, ConvOpType.kBackwardInput]
        # NOTE(review): spk is presumably the split-K factor -- confirm.
        spk = 1
        for op_type in op_types:
            inp_tv, weight_tv, output_tv = tester.get_operands(op_type)
            avail_desps = CONV.get_all_available(inp_tv, weight_tv, output_tv,
                                                 NHWC, NHWC, NHWC, arch,
                                                 op_type, -1)
            print(avail_desps)
            for desp in avail_desps:
                # Regular conv: clear the result tensor before each kernel.
                if not subm:
                    if op_type == ConvOpType.kForward:
                        output_tv.zero_()
                    else:
                        inp_tv.zero_()
                # this algo must success
                mask_width = desp.tile_shape[0]
                # if mask_width != 32:
                #     continue
                if mask_width not in mask_width_to_mask_out_fwd:
                    mask_width_to_mask_out_fwd[mask_width] = torch.zeros(
                        [2,
                         div_up(tester.out_inds.shape[0], mask_width)],
                        dtype=torch.int32,
                        device=tester.device)
                mask_output_fwd = mask_width_to_mask_out_fwd[mask_width]
                if subm:
                    if desp.op_type == ConvOpType.kForward.value:
                        indice_pairs = tester.pair_fwd
                    elif desp.op_type == ConvOpType.kBackwardInput.value:
                        indice_pairs = tester.pair_bwd
                    else:
                        indice_pairs = tester.pair_fwd
                    mask_output = mask_output_fwd
                    # print([bin(x.item()) for x in masks])
                    for j in range(tester.num_split):
                        # Second split (j == 1) runs with beta=1, others
                        # with beta=0.
                        beta = 1 if j == 1 else 0
                        mask_filter = tester.masks[j].item()
                        reverse_mask = False
                        if desp.op_type == ConvOpType.kBackwardWeight.value:
                            mask_op = mask_output[j]
                        else:
                            mask_op = tester.pair_mask_fwd_splits[j]
                        if desp.op_type == ConvOpType.kBackwardInput.value:
                            reverse_mask = True
                        mask_output_run = torch_tensor_to_tv(mask_output[j],
                                                             dtype=tv.uint32)
                        # Backward-weight passes an empty mask-output tensor.
                        if desp.op_type == ConvOpType.kBackwardWeight.value:
                            mask_output_run = tv.Tensor()
                        CONV.run_with_tuned_result(
                            BestConvAlgoByProfile(desp, spk),
                            desp.op_type,
                            inp_tv,
                            weight_tv,
                            output_tv,
                            torch_tensor_to_tv(mask_op, dtype=tv.uint32),
                            torch_tensor_to_tv(
                                tester.mask_argsort_fwd_splits[j]),
                            mask_output_run,
                            torch_tensor_to_tv(indice_pairs),
                            reverse_mask,
                            mask_filter=mask_filter,
                            mask_width=mask_width,
                            beta=beta,
                            verbose=False,
                        )
                else:
                    if mask_width not in mask_width_to_mask_out_bwd:
                        mask_width_to_mask_out_bwd[mask_width] = torch.zeros(
                            [2,
                             div_up(tester.indices_np.shape[0], mask_width)],
                            dtype=torch.int32,
                            device=tester.device)
                    mask_output_bwd = mask_width_to_mask_out_bwd[mask_width]
                    # Regular conv uses separate pair/mask/argsort data for
                    # the forward and backward-input directions.
                    if desp.op_type == ConvOpType.kForward.value:
                        indice_pairs = tester.pair_fwd  # inp -> out
                        mask_ops = tester.pair_mask_fwd_splits
                        mask_argsorts = tester.mask_argsort_fwd_splits
                        mask_output = mask_output_fwd
                    elif desp.op_type == ConvOpType.kBackwardInput.value:
                        indice_pairs = tester.pair_bwd  # out -> inp
                        mask_ops = tester.pair_mask_bwd_splits
                        mask_argsorts = tester.mask_argsort_bwd_splits
                        mask_output = mask_output_bwd
                    else:
                        indice_pairs = tester.pair_fwd  # inp -> out
                        mask_ops = tester.pair_mask_fwd_splits
                        mask_argsorts = tester.mask_argsort_fwd_splits
                        mask_output = mask_output_fwd
                    for j in range(tester.num_split):
                        beta = 1 if j == 1 else 0
                        mask_filter = tester.masks[j].item()
                        reverse_mask = False
                        if desp.op_type == ConvOpType.kBackwardWeight.value:
                            mask_op = mask_output[j]
                        else:
                            mask_op = mask_ops[j]
                        CONV.run_with_tuned_result(
                            BestConvAlgoByProfile(desp, spk),
                            desp.op_type,
                            inp_tv,
                            weight_tv,
                            output_tv,
                            torch_tensor_to_tv(mask_op, dtype=tv.uint32),
                            torch_tensor_to_tv(mask_argsorts[j]),
                            torch_tensor_to_tv(mask_output[j],
                                               dtype=tv.uint32),
                            torch_tensor_to_tv(indice_pairs),
                            reverse_mask,
                            mask_filter=mask_filter,
                            mask_width=mask_width,
                            beta=beta,
                            verbose=False,
                        )
                out_ref = tester.out_ref
                din_ref = tester.din_ref
                dw_ref = tester.dw_ref
                if op_type == ConvOpType.kForward:
                    out_my = output_tv.cpu().numpy()
                    if dtype != np.float16:
                        test_case.assertAllClose(out_ref,
                                                 out_my,
                                                 atol=atol,
                                                 rtol=rtol)
                    else:
                        # float16: bound the L2 norm of the error instead of
                        # comparing element-wise.
                        error_norm = np.linalg.norm(
                            out_ref.reshape(-1) - out_my.reshape(-1))
                        # if (error_norm > 5):
                        print(f"{desp}, Error={error_norm}")
                        assert error_norm < 10 * multipler
                    # print(desp, )
                else:
                    din_my = inp_tv.cpu().numpy()
                    if dtype != np.float16:
                        test_case.assertAllClose(din_ref,
                                                 din_my,
                                                 atol=atol,
                                                 rtol=rtol)
                    else:
                        error_norm = np.linalg.norm(
                            din_ref.reshape(-1) - din_my.reshape(-1))
                        assert error_norm < 10 * multipler, f"{desp}, {error_norm}, {k}, {s}, {p}, {d}"
        # Backward-weight pass: reuse the mask outputs filled by the kernels
        # above, one kernel query per (spk, mask_width).
        inp_tv, weight_tv, output_tv = tester.get_operands(
            ConvOpType.kBackwardWeight)
        for spk in [1, 4, 16, 64]:
            for mask_width, mask_output in mask_width_to_mask_out_fwd.items():
                avail_desps = CONV.get_all_available(
                    inp_tv, weight_tv, output_tv, NHWC, NHWC, NHWC, arch,
                    ConvOpType.kBackwardWeight, mask_width)
                for desp in avail_desps:
                    # Clear the weight gradient before each kernel.
                    weight_tv.zero_()
                    if subm:
                        indice_pairs = tester.pair_fwd
                        for j in range(tester.num_split):
                            beta = 0
                            mask_filter = tester.masks[j].item()
                            mask_op = mask_output[j]
                            mask_op_tv = torch_tensor_to_tv(mask_op,
                                                            dtype=tv.uint32)
                            # mask_op_np = mask_op_tv.cpu().numpy()
                            # bit_ref = np.bitwise_or.reduce(mask_op_np, axis=0)
                            # bit_my = mask_filter
                            CONV.run_with_tuned_result(
                                BestConvAlgoByProfile(desp, spk),
                                desp.op_type,
                                inp_tv,
                                weight_tv,
                                output_tv,
                                mask_op_tv,
                                torch_tensor_to_tv(
                                    tester.mask_argsort_fwd_splits[j]),
                                tv.Tensor(),
                                torch_tensor_to_tv(indice_pairs),
                                reverse_mask=False,
                                mask_filter=mask_filter,
                                mask_width=mask_width,
                                beta=beta,
                                verbose=False,
                            )
                    else:
                        indice_pairs = tester.pair_fwd  # inp -> out
                        mask_ops = tester.pair_mask_fwd_splits
                        mask_argsorts = tester.mask_argsort_fwd_splits
                        for j in range(tester.num_split):
                            # beta = 1 if j == 1 else 0
                            beta = 0
                            mask_filter = tester.masks[j].item()
                            reverse_mask = False
                            mask_op = mask_output[j]
                            CONV.run_with_tuned_result(
                                BestConvAlgoByProfile(desp, spk),
                                desp.op_type,
                                inp_tv,
                                weight_tv,
                                output_tv,
                                torch_tensor_to_tv(mask_op, dtype=tv.uint32),
                                torch_tensor_to_tv(mask_argsorts[j]),
                                torch_tensor_to_tv(mask_output[j],
                                                   dtype=tv.uint32),
                                torch_tensor_to_tv(indice_pairs),
                                reverse_mask,
                                mask_filter=mask_filter,
                                mask_width=mask_width,
                                beta=beta,
                                verbose=False,
                            )
                    dw_ref = tester.dw_ref
                    dw_my = weight_tv.cpu().numpy()
                    if dtype != np.float16:
                        # print(desp, spk, K, C, mask_width, algo)
                        test_case.assertAllClose(dw_ref,
                                                 dw_my,
                                                 atol=atol,
                                                 rtol=rtol)
                    else:
                        error_norm = np.linalg.norm(
                            dw_ref.reshape(-1) - dw_my.reshape(-1))
                        # print(desp, error_norm)
                        # Report large errors before the assert fires.
                        if (error_norm > 5):
                            print(f"{desp}, Error={error_norm}, {spk}")
                        assert error_norm < 10 * multipler
def _test_native_conv_cuda(subm: bool) -> None:
    """Check every available gather-GEMM-scatter kernel against the NumPy reference.

    For each parameter combination a ``SparseConvTester`` using
    ``ConvAlgo.Native`` is built. Forward, backward-input and
    backward-weight are then computed one kernel offset at a time with
    ``GEMM.run_with_tuned_result`` for every descriptor returned by
    ``GEMM.get_all_available`` and compared with the tester's reference
    arrays.

    For submanifold conv the center kernel offset is computed with a dense
    ``torch.mm`` and skipped in the per-offset loop.

    Args:
        subm: True to test submanifold convolution, False for regular
            sparse convolution (which also sweeps stride/padding/dilation).
    """
    ndim = 3
    # dtype -> (atol, rtol)
    dtype_to_tol = {
        np.float32: (1e-4, 1e-4),
        np.float16: (1e-2, 1e-2),
        np.int8: (1e-4, 1e-4),
    }
    device = torch.device("cuda:0")
    shapes = [[19, 18, 17]]
    batchsizes = [1]
    dtypes = [np.float32, np.float16]
    test_case = TestCase()
    in_channels = [32, 47]
    out_channels = [32, 48, 62]
    if subm:
        ksizes = [3, 5]
        strides = [1]
        paddings = [0]
        dilations = [1]
    else:
        ksizes = [2, 3]
        strides = [1, 2, 3]
        paddings = [0, 1]
        dilations = [1, 2]
    # Divisor used to scale the error-norm thresholds with channel count.
    multiple_base = 128
    arch = torch.cuda.get_device_capability()
    stream = get_current_stream()
    for shape, bs, C, K, k, s, p, d, dtype in tqdm.tqdm(
            params_grid(shapes, batchsizes, in_channels, out_channels, ksizes,
                        strides, paddings, dilations, dtypes)):
        tester = SparseConvTester(ConvAlgo.Native, subm, shape, bs, dtype,
                                  1500, K, C, k, s, p, d)
        atol, rtol = dtype_to_tol[dtype]
        # Threshold multiplier for the error-norm checks, never below 1.
        multipler = max(C, K) / multiple_base
        multipler = max(multipler, 1.0)
        kv_center = tester.kv // 2
        kv = tester.kv
        # Index 0 of pair_native holds input indices, index 1 output indices.
        pair_in = torch_tensor_to_tv(tester.pair_native)[0]
        pair_out = torch_tensor_to_tv(tester.pair_native)[1]
        op_types = [
            ConvOpType.kForward, ConvOpType.kBackwardInput,
            ConvOpType.kBackwardWeight
        ]
        indice_pair_num_cpu = tester.indice_num_per_loc_np
        spk = 1
        out_ref = tester.out_ref
        din_ref = tester.din_ref
        # Flatten the spatial kernel dims: [K, *ksize, C] -> [K, KV, C].
        dw_ref = tester.dw_ref.reshape(K, -1, C)
        for op_type in op_types:
            inp_th, weight_th, output_th = tester.get_operands_torch(op_type)
            weight_th = weight_th.view(K, -1, C)
            inp_tv = torch_tensor_to_tv(inp_th)
            weight_tv = torch_tensor_to_tv(weight_th)
            output_tv = torch_tensor_to_tv(output_th)
            if op_type == ConvOpType.kForward:
                a = inp_tv
                c = output_tv
                # Center-offset filter, used only to query available kernels.
                b = weight_tv.select(1, tester.kv // 2)
                avail_desps = GEMM.get_all_available(
                    a, b, c, False, True, False, arch,
                    ShuffleStrideType.ShuffleAC)
                for desp in avail_desps:
                    if subm:
                        # Center offset: dense matmul written into output_th.
                        torch.mm(inp_th,
                                 weight_th[:, tester.kv // 2].T,
                                 out=output_th)
                    else:
                        output_tv.zero_()
                    # Once the output holds data, later GEMMs accumulate
                    # (beta=1).
                    inited = subm
                    for i, nhot in enumerate(indice_pair_num_cpu):
                        if subm and i == kv_center:
                            continue
                        if subm and i > kv_center:
                            # Offsets past the center reuse the mirrored
                            # offset's pair count.
                            nhot = indice_pair_num_cpu[kv - i - 1]
                        if nhot <= 0:
                            continue
                        inp_indices = pair_in[i].slice_first_axis(0, nhot)
                        out_indices = pair_out[i].slice_first_axis(0, nhot)
                        b = weight_tv.select(1, i)
                        # inp @ filter.T, NC @ KC
                        beta = 1.0 if inited else 0.0
                        GEMM.run_with_tuned_result(
                            BestAlgoByProfile(desp, 1),
                            a,
                            b,
                            c,
                            False,
                            True,
                            False,
                            arch=arch,
                            stream=stream,
                            shuffle_type=ShuffleStrideType.ShuffleAC,
                            a_inds=inp_indices,
                            c_inds=out_indices,
                            hint=AlgoHint.Fowrard.value,
                            alpha=1.0,
                            beta=beta)
                        inited = True
                    out_my = output_tv.cpu().numpy()
                    if dtype != np.float16:
                        # error_norm = np.linalg.norm(out_ref.reshape(-1) - out_my.reshape(-1))
                        # assert error_norm < 1
                        # print(desp, K, C, k, error_norm)
                        test_case.assertAllClose(out_ref,
                                                 out_my,
                                                 atol=atol,
                                                 rtol=rtol)
                    else:
                        # float16: bound the L2 norm of the error instead.
                        error_norm = np.linalg.norm(
                            out_ref.reshape(-1) - out_my.reshape(-1))
                        assert error_norm < 10 * multipler
            elif op_type == ConvOpType.kBackwardInput:
                a = output_tv
                b = weight_tv.select(1, tester.kv // 2)
                c = inp_tv
                avail_desps = GEMM.get_all_available(
                    a, b, c, False, False, False, arch,
                    ShuffleStrideType.ShuffleAC)
                for desp in avail_desps:
                    if subm:
                        # Center offset: dense matmul written into inp_th.
                        torch.mm(output_th,
                                 weight_th[:, tester.kv // 2],
                                 out=inp_th)
                    else:
                        inp_tv.zero_()
                    inited = subm
                    for i, nhot in enumerate(indice_pair_num_cpu):
                        if subm and i == kv_center:
                            continue
                        if subm and i > kv_center:
                            nhot = indice_pair_num_cpu[kv - i - 1]
                        if nhot <= 0:
                            continue
                        inp_indices = pair_in[i].slice_first_axis(0, nhot)
                        out_indices = pair_out[i].slice_first_axis(0, nhot)
                        b = weight_tv.select(1, i)
                        # out @ filter, NK @ KC (rows gathered by output
                        # indices, scattered to input indices)
                        beta = 1.0 if inited else 0.0
                        GEMM.run_with_tuned_result(
                            BestAlgoByProfile(desp, 1),
                            a,
                            b,
                            c,
                            False,
                            False,
                            False,
                            arch=arch,
                            stream=stream,
                            shuffle_type=ShuffleStrideType.ShuffleAC,
                            a_inds=out_indices,
                            c_inds=inp_indices,
                            hint=AlgoHint.Fowrard.value,
                            alpha=1.0,
                            beta=beta)
                        inited = True
                    din_my = inp_tv.cpu().numpy()
                    if dtype != np.float16:
                        # error_norm = np.linalg.norm(din_ref.reshape(-1) - din_my.reshape(-1))
                        # print(desp, K, C, k, error_norm)
                        test_case.assertAllClose(din_ref,
                                                 din_my,
                                                 atol=atol,
                                                 rtol=rtol)
                        # assert error_norm < 1
                    else:
                        error_norm = np.linalg.norm(
                            din_ref.reshape(-1) - din_my.reshape(-1))
                        assert error_norm < 10 * multipler
            else:
                # Backward weight: out.T @ inp, KN @ NC, one GEMM per offset.
                a = output_tv
                b = inp_tv
                c = weight_tv.select(1, tester.kv // 2)
                avail_desps = GEMM.get_all_available(
                    a, b, c, True, False, False, arch,
                    ShuffleStrideType.ShuffleAB)
                for desp in avail_desps:
                    # inited is not updated in the loop below, so beta stays
                    # 1.0 for subm and 0.0 otherwise.
                    inited = subm
                    weight_tv.zero_()
                    if subm:
                        # Center offset: dense matmul into the center slice.
                        torch.mm(output_th.T,
                                 inp_th,
                                 out=weight_th[:, kv_center])
                    for i, nhot in enumerate(indice_pair_num_cpu):
                        if subm and i == kv_center:
                            continue
                        if subm and i > kv_center:
                            nhot = indice_pair_num_cpu[kv - i - 1]
                        if nhot <= 0:
                            continue
                        beta = 1.0 if inited else 0.0
                        inp_indices = pair_in[i].slice_first_axis(0, nhot)
                        out_indices = pair_out[i].slice_first_axis(0, nhot)
                        a_inds = out_indices
                        b_inds = inp_indices
                        GEMM.run_with_tuned_result(
                            BestAlgoByProfile(desp, 32),
                            a,
                            b,
                            weight_tv.select(1, i),
                            True,
                            False,
                            False,
                            arch=arch,
                            stream=stream,
                            shuffle_type=ShuffleStrideType.ShuffleAB,
                            a_inds=a_inds,
                            b_inds=b_inds,
                            hint=AlgoHint.BackwardWeight.value,
                            alpha=1.0,
                            beta=beta)
                    dw_my = weight_tv.cpu().numpy()
                    if dtype != np.float16:
                        # Non-float16 weight grads use a norm bound rather
                        # than assertAllClose.
                        error_norm = np.linalg.norm(
                            dw_ref.reshape(-1) - dw_my.reshape(-1))
                        assert error_norm < 1 * multipler
                        # test_case.assertAllClose(dw_ref, dw_my, atol=atol, rtol=rtol)
                        # print(desp, error_norm)
                    else:
                        error_norm = np.linalg.norm(
                            dw_ref.reshape(-1) - dw_my.reshape(-1))
                        # print(desp, error_norm)
                        assert error_norm < 10 * multipler
def test_all_algo_unit():
    """Entry point for the kernel sweep tests.

    Only the submanifold implicit-GEMM sweep is currently enabled. The
    remaining variants are kept here, disabled, exactly as in the original:

        # for i in range(5):
        # _test_impgemm_conv_cuda(False)
        # _test_native_conv_cuda(True)
        # _test_native_conv_cuda(False)
    """
    use_subm = True
    _test_impgemm_conv_cuda(use_subm)


if __name__ == "__main__":
    test_all_algo_unit()
\ No newline at end of file
test/test_conv.py
View file @
bab09b63
...
...
@@ -12,6 +12,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Compare results between sparse and dense layers:
SparseConvXd
SparseConvTransposeXd
SparseMaxPoolXd
"""
import
time
import
unittest
from
pathlib
import
Path
...
...
@@ -23,14 +29,12 @@ from spconv.core import ConvAlgo
import
spconv.pytorch
as
spconv
from
spconv.test_utils
import
TestCase
,
generate_sparse_data
,
params_grid
from
spconv.constants
import
FILTER_HWIO
# import sparseconvnet as scn
from
spconv.constants
import
ALL_WEIGHT_IS_KRSC
,
FILTER_HWIO
# we must disable tf32 to increase reference precision.
torch
.
backends
.
cuda
.
matmul
.
allow_tf32
=
False
torch
.
backends
.
cudnn
.
allow_tf32
=
False
class
SparseConv3dTestTorch
(
nn
.
Module
):
def
__init__
(
self
,
num_layers
,
...
...
@@ -76,52 +80,6 @@ class SparseConv3dTestTorch(nn.Module):
self
.
grid
)
return
self
.
net
(
x
)
# .dense()
class SubMConv3dTestTorch(nn.Module):
    """Stack of ``num_layers`` bias-free ``spconv.SubMConv3d`` layers.

    The first layer maps ``in_channels -> out_channels``; every following
    layer maps ``out_channels -> out_channels``. All layers share the same
    kernel size, stride, padding, dilation and algorithm. ``ndim`` is
    accepted but not used.
    """
    def __init__(self,
                 num_layers,
                 ndim,
                 shape,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride,
                 padding,
                 dilation,
                 algo=spconv.ConvAlgo.Native):
        super().__init__()
        # Input width of each layer: in_channels first, then out_channels.
        # At least one layer is always built, as in the loop-based version.
        input_widths = [in_channels] + [out_channels] * (num_layers - 1)
        convs = [
            spconv.SubMConv3d(width,
                              out_channels,
                              kernel_size,
                              stride,
                              padding=padding,
                              dilation=dilation,
                              bias=False,
                              algo=algo) for width in input_widths
        ]
        self.net = spconv.SparseSequential(*convs)
        # self.grid = torch.full([3, *shape], -1, dtype=torch.int32).cuda()
        self.grid = None
        self.shape = shape

    def forward(self, features, coors, batch_size):
        """Wrap the inputs in a ``SparseConvTensor`` and run the stack."""
        sparse_input = spconv.SparseConvTensor(
            features,
            coors.int(),  # .cpu()
            self.shape,
            batch_size,
            self.grid)
        return self.net(sparse_input)  # .dense()
class
Conv3dTestTorch
(
nn
.
Module
):
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
in_channels
,
out_channels
,
kernel_size
,
stride
,
padding
,
dilation
):
...
...
@@ -150,11 +108,11 @@ class Conv3dTestTorch(nn.Module):
def
forward
(
self
,
x
):
return
self
.
net
(
x
)
# .dense()
class
SparseDeConv3dTestTorch
(
nn
.
Module
):
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
in_channels
,
out_channels
,
kernel_size
,
stride
,
padding
,
dilation
):
kernel_size
,
stride
,
padding
,
dilation
,
algo
):
super
().
__init__
()
self
.
algo
=
algo
layers
=
[
spconv
.
SparseConvTranspose3d
(
in_channels
,
out_channels
,
...
...
@@ -162,7 +120,8 @@ class SparseDeConv3dTestTorch(nn.Module):
stride
,
padding
=
padding
,
dilation
=
dilation
,
bias
=
False
)
bias
=
False
,
algo
=
algo
)
]
for
i
in
range
(
1
,
num_layers
):
layers
.
append
(
...
...
@@ -172,7 +131,8 @@ class SparseDeConv3dTestTorch(nn.Module):
stride
,
padding
=
padding
,
dilation
=
dilation
,
bias
=
False
))
bias
=
False
,
algo
=
algo
))
self
.
net
=
spconv
.
SparseSequential
(
*
layers
,
)
self
.
shape
=
shape
...
...
@@ -213,14 +173,15 @@ class DeConv3dTestTorch(nn.Module):
class
SparseMaxPoolTestTorch
(
nn
.
Module
):
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
kernel_size
,
stride
,
padding
,
dilation
):
dilation
,
algo
):
super
().
__init__
()
self
.
algo
=
algo
layers
=
[
spconv
.
SparseMaxPool3d
(
kernel_size
,
stride
,
padding
,
dilation
)
spconv
.
SparseMaxPool3d
(
kernel_size
,
stride
,
padding
,
dilation
,
algo
=
algo
)
]
for
i
in
range
(
1
,
num_layers
):
layers
.
append
(
spconv
.
SparseMaxPool3d
(
kernel_size
,
stride
,
padding
,
dilation
))
spconv
.
SparseMaxPool3d
(
kernel_size
,
stride
,
padding
,
dilation
,
algo
=
algo
))
self
.
net
=
spconv
.
SparseSequential
(
*
layers
,
)
self
.
shape
=
shape
...
...
@@ -243,86 +204,6 @@ class MaxPool3dTestTorch(nn.Module):
def
forward
(
self
,
x
):
return
self
.
net
(
x
)
# .dense()
class SubmanifoldConvTestTorch(nn.Module):
    """Stack of ``num_layers`` bias-free ``spconv.SubMConv3d`` layers.

    The first layer maps ``in_channels -> out_channels`` and is created
    with ``indice_key="subm0"``; the following layers map
    ``out_channels -> out_channels`` without an indice key. ``ndim`` and
    ``stride`` are accepted but not used.
    """
    def __init__(self, num_layers, ndim, shape, in_channels, out_channels,
                 kernel_size, stride):
        super().__init__()
        convs = [
            spconv.SubMConv3d(in_channels,
                              out_channels,
                              kernel_size,
                              bias=False,
                              indice_key="subm0")
        ]
        convs.extend(
            spconv.SubMConv3d(out_channels,
                              out_channels,
                              kernel_size,
                              bias=False) for _ in range(1, num_layers))
        self.net = nn.Sequential(*convs)
        self.shape = shape

    def forward(self, features, coors, batch_size):
        """Wrap the inputs in a ``SparseConvTensor`` and run the stack."""
        sparse_input = spconv.SparseConvTensor(features, coors.int(),
                                               self.shape, batch_size)
        return self.net(sparse_input)
class SCNCoupleDeConvTest(nn.Module):
    """Convolution followed by deconvolution built from ``scn`` layers.

    The network is ``scn.Convolution`` (``in_channels -> out_channels``),
    ``scn.Deconvolution`` (``out_channels -> in_channels``) and
    ``scn.SparseToDense``. ``num_layers`` is accepted but not used.

    NOTE(review): ``scn`` must be importable in this module; the only
    import visible nearby is a commented-out ``sparseconvnet`` one --
    confirm before using this class.
    """
    def __init__(self, num_layers, ndim, shape, in_channels, out_channels,
                 kernel_size, stride):
        super().__init__()
        self.scn_input = scn.InputLayer(ndim, shape, mode=0)
        conv = scn.Convolution(ndim,
                               in_channels,
                               out_channels,
                               kernel_size,
                               stride,
                               bias=False)
        deconv = scn.Deconvolution(ndim,
                                   out_channels,
                                   in_channels,
                                   kernel_size,
                                   stride,
                                   bias=False)
        to_dense = scn.SparseToDense(ndim, in_channels)
        self.net = nn.Sequential(conv, deconv, to_dense)

    def forward(self, features, coors, batch_size):
        """Feed ``(coors, features)`` through the input layer and network.

        ``coors`` is converted to a CPU int64 tensor first; ``batch_size``
        is not used.
        """
        cpu_coors = coors.long().cpu()
        sparse_input = self.scn_input((cpu_coors, features))
        return self.net(sparse_input)
class SparseCoupleDeConvTest(nn.Module):
    """``SparseConv3d`` followed by the matching ``SparseInverseConv3d``.

    Both layers are bias-free and share ``indice_key="cp0"``; the conv maps
    ``in_channels -> out_channels`` and the inverse conv maps back to
    ``in_channels``. The result is converted with ``spconv.ToDense``.
    ``num_layers`` and ``ndim`` are accepted but not used.
    """
    def __init__(self, num_layers, ndim, shape, in_channels, out_channels,
                 kernel_size, stride):
        super().__init__()
        conv = spconv.SparseConv3d(in_channels,
                                   out_channels,
                                   kernel_size,
                                   stride,
                                   indice_key="cp0",
                                   bias=False)
        inverse_conv = spconv.SparseInverseConv3d(out_channels,
                                                  in_channels,
                                                  kernel_size,
                                                  indice_key="cp0",
                                                  bias=False)
        self.net = spconv.SparseSequential(conv, inverse_conv)
        self.todense = spconv.ToDense()
        self.shape = shape

    def forward(self, features, coors, batch_size):
        """Run conv + inverse conv on the sparse input and densify it."""
        sparse_input = spconv.SparseConvTensor(features, coors.int(),
                                               self.shape, batch_size)
        sparse_output = self.net(sparse_input)
        return self.todense(sparse_output)  # .dense()
def
gather_nd
(
params
,
indices
):
# this function has a limit that MAX_ADVINDEX_CALC_DIMS=5
ndim
=
indices
.
shape
[
-
1
]
...
...
@@ -349,374 +230,147 @@ def scatter_nd(indices, updates, shape):
ret
[
slices
]
=
updates
.
view
(
*
output_shape
)
return
ret
def
test_spconv3d
():
test_case
=
TestCase
()
np
.
random
.
seed
(
484
)
torch
.
manual_seed
(
48848
)
devices
=
[
"cuda:0"
]
shapes
=
[[
19
,
18
,
17
]]
batchsizes
=
[
1
,
2
]
class
TestSpConv
(
TestCase
):
def
testSpConv3d
(
self
):
np
.
random
.
seed
(
71
)
torch
.
manual_seed
(
705
)
devices
=
[
"cuda:0"
]
shapes
=
[[
4
,
4
,
4
]]
batchsizes
=
[
1
,
2
]
in_channels
=
[
4
]
out_channels
=
[
32
,
48
,
64
]
ksizes
=
[
2
,
3
]
strides
=
[
1
,
2
,
3
]
paddings
=
[
0
,
1
,
2
]
dilations
=
[
1
,
2
,
3
]
ksizes
=
[
3
]
strides
=
[
1
]
paddings
=
[
0
]
dilations
=
[
1
]
algos
=
[
ConvAlgo
.
MaskImplicitGemm
,
# ConvAlgo.MaskSplitImplicitGemm
]
# algos = [ConvAlgo.MaskSplitImplicitGemm]
for
dev
,
shape
,
bs
,
IC
,
OC
,
k
,
s
,
p
,
d
,
al
in
params_grid
(
devices
,
shapes
,
batchsizes
,
in_channels
,
out_channels
,
ksizes
,
strides
,
paddings
,
dilations
,
algos
):
if
all
([
s
>
1
,
d
>
1
]):
continue
# don't support this.
device
=
torch
.
device
(
dev
)
num_points
=
[
10
]
*
bs
dtype
=
torch
.
float32
net
=
SparseConv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
d
,
algo
=
al
).
to
(
device
).
to
(
dtype
)
net_ref
=
Conv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
d
).
to
(
device
).
to
(
dtype
)
sparse_dict
=
generate_sparse_data
(
shape
,
num_points
,
IC
)
features
=
np
.
ascontiguousarray
(
sparse_dict
[
"features"
]).
astype
(
np
.
float32
)
indices
=
np
.
ascontiguousarray
(
sparse_dict
[
"indices"
][:,
[
3
,
0
,
1
,
2
]]).
astype
(
np
.
int32
)
# print(k, s, p, d, features.mean(), indices.mean())
# if k == 2 and s == 2 and p == 0 and d == 1:
# breakpoint()
features_dense
=
sparse_dict
[
"features_dense"
].
astype
(
np
.
float32
)
indices_t
=
torch
.
from_numpy
(
indices
).
int
().
to
(
device
)
features_t
=
torch
.
from_numpy
(
features
).
to
(
device
).
to
(
dtype
)
features_t
.
requires_grad
=
True
features_dense_t
=
torch
.
from_numpy
(
features_dense
).
to
(
device
).
to
(
dtype
)
features_dense_t
.
requires_grad
=
True
if
net
.
algo
==
ConvAlgo
.
Native
:
if
FILTER_HWIO
:
filters
=
np
.
random
.
uniform
(
-
1
,
1
,
size
=
[
k
,
k
,
k
,
IC
,
OC
]).
astype
(
np
.
float32
)
else
:
filters
=
np
.
random
.
uniform
(
-
1
,
1
,
size
=
[
k
,
k
,
k
,
OC
,
IC
]).
astype
(
np
.
float32
)
filters_t
=
torch
.
from_numpy
(
filters
).
to
(
device
).
to
(
dtype
)
if
FILTER_HWIO
:
net_ref
.
net
[
0
].
weight
.
data
[:]
=
filters_t
.
permute
(
4
,
3
,
0
,
1
,
2
).
contiguous
()
else
:
net_ref
.
net
[
0
].
weight
.
data
[:]
=
filters_t
.
permute
(
3
,
4
,
0
,
1
,
2
).
contiguous
()
else
:
filters
=
np
.
random
.
uniform
(
-
1
,
1
,
size
=
[
OC
,
k
,
k
,
k
,
IC
]).
astype
(
np
.
float32
)
filters_t
=
torch
.
from_numpy
(
filters
).
to
(
device
).
to
(
dtype
)
net_ref
.
net
[
0
].
weight
.
data
[:]
=
filters_t
.
permute
(
0
,
4
,
1
,
2
,
3
).
contiguous
()
net
.
net
[
0
].
weight
.
data
[:]
=
filters_t
out_ref
=
net_ref
(
features_dense_t
)
out
=
net
(
features_t
,
indices_t
,
bs
).
dense
()
out_np
=
out
.
detach
().
cpu
().
numpy
()
out_ref_np
=
out_ref
.
detach
().
cpu
().
numpy
()
self
.
assertAllClose
(
out_np
,
out_ref_np
,
atol
=
1e-4
)
dout
=
np
.
random
.
uniform
(
-
0.2
,
0.2
,
out_ref
.
shape
).
astype
(
features
.
dtype
)
dout_t
=
torch
.
from_numpy
(
dout
).
to
(
device
)
out
.
backward
(
dout_t
)
out_ref
.
backward
(
dout_t
)
din_dense
=
features_dense_t
.
grad
.
detach
().
permute
(
0
,
2
,
3
,
4
,
1
).
contiguous
()
din_sparse
=
gather_nd
(
din_dense
,
indices_t
.
long
())
din
=
features_t
.
grad
.
detach
()
din_np
=
din
.
cpu
().
numpy
()
din_sparse_np
=
din_sparse
.
cpu
().
numpy
()
for
layer
,
layer_ref
in
zip
(
net
.
net
,
net_ref
.
net
):
dw
=
layer
.
weight
.
grad
.
detach
().
cpu
().
numpy
()
dw_ref
=
layer_ref
.
weight
.
grad
.
detach
().
cpu
().
numpy
()
if
net
.
algo
==
ConvAlgo
.
Native
:
if
FILTER_HWIO
:
dw
=
dw
.
transpose
(
4
,
3
,
0
,
1
,
2
)
else
:
dw
=
dw
.
transpose
(
3
,
4
,
0
,
1
,
2
)
else
:
# OHWI -> OIHW
dw
=
dw
.
transpose
(
0
,
4
,
1
,
2
,
3
)
self
.
assertAllClose
(
dw
,
dw_ref
,
atol
=
1e-4
)
self
.
assertAllClose
(
din_np
,
din_sparse_np
,
atol
=
1e-4
)
def
testSpDeConv3d
(
self
):
np
.
random
.
seed
(
484
)
devices
=
[
"cuda:0"
]
shapes
=
[[
19
,
18
,
17
]]
batchsizes
=
[
1
,
2
]
in_channels
=
[
64
]
out_channels
=
[
32
,
48
,
64
]
ksizes
=
[
2
,
3
]
strides
=
[
2
,
3
]
paddings
=
[
0
,
1
,
2
]
dilations
=
[
1
,
2
,
3
]
ksizes
=
[
3
]
strides
=
[
1
]
paddings
=
[
0
]
dilations
=
[
1
]
for
dev
,
shape
,
bs
,
IC
,
OC
,
k
,
s
,
p
,
d
in
params_grid
(
devices
,
shapes
,
batchsizes
,
in_channels
,
out_channels
,
ksizes
,
strides
,
paddings
,
dilations
):
if
all
([
s
>
1
,
d
>
1
]):
continue
# don't support this.
device
=
torch
.
device
(
dev
)
num_points
=
[
1000
]
*
bs
sparse_dict
=
generate_sparse_data
(
shape
,
num_points
,
IC
)
features
=
np
.
ascontiguousarray
(
sparse_dict
[
"features"
]).
astype
(
np
.
float32
)
indices
=
np
.
ascontiguousarray
(
sparse_dict
[
"indices"
][:,
[
3
,
0
,
1
,
2
]]).
astype
(
np
.
int32
)
features_dense
=
sparse_dict
[
"features_dense"
].
astype
(
np
.
float32
)
in_channels
=
[
32
]
out_channels
=
[
32
,
48
,
64
]
ksizes
=
[
2
,
3
]
strides
=
[
1
,
2
,
3
]
paddings
=
[
0
,
1
,
2
]
dilations
=
[
1
,
2
,
3
]
algos
=
[
ConvAlgo
.
Native
,
ConvAlgo
.
MaskImplicitGemm
,
ConvAlgo
.
MaskSplitImplicitGemm
]
# algos = [ConvAlgo.Native]
for
dev
,
shape
,
bs
,
IC
,
OC
,
k
,
s
,
p
,
d
,
al
in
params_grid
(
devices
,
shapes
,
batchsizes
,
in_channels
,
out_channels
,
ksizes
,
strides
,
paddings
,
dilations
,
algos
):
if
all
([
s
>
1
,
d
>
1
]):
continue
# don't support this.
# print(dev, shape, bs, IC, OC, k, s, p, d)
device
=
torch
.
device
(
dev
)
num_points
=
[
1500
]
*
bs
dtype
=
torch
.
float32
net
=
SparseConv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
d
,
algo
=
al
).
to
(
device
).
to
(
dtype
)
net_ref
=
Conv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
d
).
to
(
device
).
to
(
dtype
)
sparse_dict
=
generate_sparse_data
(
shape
,
num_points
,
IC
)
features
=
np
.
ascontiguousarray
(
sparse_dict
[
"features"
]).
astype
(
np
.
float32
)
indices
=
np
.
ascontiguousarray
(
sparse_dict
[
"indices"
][:,
[
3
,
0
,
1
,
2
]]).
astype
(
np
.
int32
)
features_dense
=
sparse_dict
[
"features_dense"
].
astype
(
np
.
float32
)
indices_t
=
torch
.
from_numpy
(
indices
).
int
().
to
(
device
)
features_t
=
torch
.
from_numpy
(
features
).
to
(
device
).
to
(
dtype
)
features_t
.
requires_grad
=
True
features_dense_t
=
torch
.
from_numpy
(
features_dense
).
to
(
device
).
to
(
dtype
)
features_dense_t
.
requires_grad
=
True
if
net
.
algo
==
ConvAlgo
.
Native
and
not
ALL_WEIGHT_IS_KRSC
:
if
FILTER_HWIO
:
filters
=
np
.
random
.
uniform
(
0
,
1
,
size
=
[
k
,
k
,
k
,
IC
,
OC
]).
astype
(
np
.
float32
)
filters
=
np
.
random
.
uniform
(
-
1
,
1
,
size
=
[
k
,
k
,
k
,
IC
,
OC
]).
astype
(
np
.
float32
)
else
:
filters
=
np
.
random
.
uniform
(
0
,
1
,
size
=
[
k
,
k
,
k
,
OC
,
IC
]).
astype
(
np
.
float32
)
indices_t
=
torch
.
from_numpy
(
indices
).
int
().
to
(
device
)
features_t
=
torch
.
from_numpy
(
features
).
to
(
device
)
features_t
.
requires_grad
=
True
features_dense_t
=
torch
.
from_numpy
(
features_dense
).
to
(
device
)
features_dense_t
.
requires_grad
=
True
net
=
SparseDeConv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
d
).
to
(
device
)
net_ref
=
DeConv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
d
).
to
(
device
)
filters_t
=
torch
.
from_numpy
(
filters
).
to
(
device
)
print
(
net_ref
.
net
[
0
].
weight
.
shape
)
filters
=
np
.
random
.
uniform
(
-
1
,
1
,
size
=
[
k
,
k
,
k
,
OC
,
IC
]).
astype
(
np
.
float32
)
filters_t
=
torch
.
from_numpy
(
filters
).
to
(
device
).
to
(
dtype
)
if
FILTER_HWIO
:
net_ref
.
net
[
0
].
weight
.
data
[:]
=
filters_t
.
permute
(
3
,
4
,
0
,
1
,
2
).
contiguous
()
4
,
3
,
0
,
1
,
2
).
contiguous
()
else
:
net_ref
.
net
[
0
].
weight
.
data
[:]
=
filters_t
.
permute
(
4
,
3
,
0
,
1
,
2
).
contiguous
()
net
.
net
[
0
].
weight
.
data
[:]
=
filters_t
out_ref
=
net_ref
(
features_dense_t
)
out
=
net
(
features_t
,
indices_t
,
bs
).
dense
()
out_np
=
out
.
detach
().
cpu
().
numpy
()
out_ref_np
=
out_ref
.
detach
().
cpu
().
numpy
()
self
.
assertAllClose
(
out_np
,
out_ref_np
,
atol
=
1e-4
)
dout
=
np
.
random
.
uniform
(
-
0.2
,
0.2
,
out_ref
.
shape
).
astype
(
features
.
dtype
)
dout_t
=
torch
.
from_numpy
(
dout
).
to
(
device
)
out
.
backward
(
dout_t
)
out_ref
.
backward
(
dout_t
)
din_dense
=
features_dense_t
.
grad
.
detach
().
permute
(
0
,
2
,
3
,
4
,
1
).
contiguous
()
din_sparse
=
gather_nd
(
din_dense
,
indices_t
.
long
())
din
=
features_t
.
grad
.
detach
()
din_np
=
din
.
cpu
().
numpy
()
din_sparse_np
=
din_sparse
.
cpu
().
numpy
()
self
.
assertAllClose
(
din_np
,
din_sparse_np
,
atol
=
1e-4
)
for
layer
,
layer_ref
in
zip
(
net
.
net
,
net_ref
.
net
):
dw
=
layer
.
weight
.
grad
.
detach
().
cpu
().
numpy
()
dw_ref
=
layer_ref
.
weight
.
grad
.
detach
().
cpu
().
numpy
()
3
,
4
,
0
,
1
,
2
).
contiguous
()
else
:
filters
=
np
.
random
.
uniform
(
-
1
,
1
,
size
=
[
OC
,
k
,
k
,
k
,
IC
]).
astype
(
np
.
float32
)
filters_t
=
torch
.
from_numpy
(
filters
).
to
(
device
).
to
(
dtype
)
net_ref
.
net
[
0
].
weight
.
data
[:]
=
filters_t
.
permute
(
0
,
4
,
1
,
2
,
3
).
contiguous
()
net
.
net
[
0
].
weight
.
data
[:]
=
filters_t
out_ref
=
net_ref
(
features_dense_t
)
out
=
net
(
features_t
,
indices_t
,
bs
).
dense
()
out_np
=
out
.
detach
().
cpu
().
numpy
()
out_ref_np
=
out_ref
.
detach
().
cpu
().
numpy
()
test_case
.
assertAllClose
(
out_np
,
out_ref_np
,
atol
=
1e-4
)
dout
=
np
.
random
.
uniform
(
-
0.2
,
0.2
,
out_ref
.
shape
).
astype
(
features
.
dtype
)
dout_t
=
torch
.
from_numpy
(
dout
).
to
(
device
)
out
.
backward
(
dout_t
)
out_ref
.
backward
(
dout_t
)
din_dense
=
features_dense_t
.
grad
.
detach
().
permute
(
0
,
2
,
3
,
4
,
1
).
contiguous
()
din_sparse
=
gather_nd
(
din_dense
,
indices_t
.
long
())
din
=
features_t
.
grad
.
detach
()
din_np
=
din
.
cpu
().
numpy
()
din_sparse_np
=
din_sparse
.
cpu
().
numpy
()
for
layer
,
layer_ref
in
zip
(
net
.
net
,
net_ref
.
net
):
dw
=
layer
.
weight
.
grad
.
detach
().
cpu
().
numpy
()
dw_ref
=
layer_ref
.
weight
.
grad
.
detach
().
cpu
().
numpy
()
if
net
.
algo
==
ConvAlgo
.
Native
and
not
ALL_WEIGHT_IS_KRSC
:
if
FILTER_HWIO
:
dw
=
dw
.
transpose
(
3
,
4
,
0
,
1
,
2
)
else
:
dw
=
dw
.
transpose
(
4
,
3
,
0
,
1
,
2
)
self
.
assertAllClose
(
dw
,
dw_ref
,
atol
=
1e-4
)
def testSpCpConv3d(self):
    """Check SparseCoupleDeConvTest against the SCNCoupleDeConvTest reference.

    For every configuration in the parameter grid both networks receive the
    same weights and the same sparse input; their outputs, input gradients
    and per-layer weight gradients are then compared with ``atol=1e-4``.
    """
    np.random.seed(484)
    devices = ["cuda:0", "cpu:0"]
    shapes = [[20, 20, 20]]
    batchsizes = [1, 2]
    in_channels = [64]
    out_channels = [32, 48, 64]
    ksizes = [2]
    strides = [2]

    for dev, shape, bs, IC, OC, k, s in params_grid(
            devices, shapes, batchsizes, in_channels, out_channels, ksizes,
            strides):
        device = torch.device(dev)
        num_points = [1000] * bs

        sparse_dict = generate_sparse_data(shape, num_points, IC)
        features = np.ascontiguousarray(sparse_dict["features"]).astype(
            np.float32)
        indices = np.ascontiguousarray(
            sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
        # The result of this draw is not used. The call is kept only so the
        # seeded NumPy RNG is advanced exactly as before, which keeps the
        # values drawn for `dout` below unchanged.
        np.random.uniform(0, 1, size=[k, k, k, IC, OC])

        indices_t = torch.from_numpy(indices).int().to(device)
        # The reference network takes the first index column moved to the
        # end (presumably the batch index -- confirm against the reference).
        indices_scn_t = torch.from_numpy(
            indices[:, [1, 2, 3, 0]]).int().to(device)
        features_t = torch.from_numpy(features).to(device)
        features_t.requires_grad = True
        features_ref_t = torch.from_numpy(features).to(device)
        features_ref_t.requires_grad = True

        net_ref = SCNCoupleDeConvTest(1, 3, shape, IC, OC, k, s).to(device)
        net = SparseCoupleDeConvTest(1, 3, shape, IC, OC, k, s).to(device)
        # Copy the spconv weights into the reference net, reshaped to the
        # reference layer's own weight shape.
        for idx in (0, 1):
            ref_weight = net_ref.net[idx].weight
            ref_weight.data[:] = net.net[idx].weight.data[:].view(
                *ref_weight.shape)

        out_ref = net_ref(features_ref_t, indices_scn_t, bs)
        out = net(features_t, indices_t, bs)

        dout = np.random.uniform(-0.2, 0.2,
                                 out_ref.shape).astype(features.dtype)
        dout_t = torch.from_numpy(dout).to(device)
        out.backward(dout_t)
        out_ref.backward(dout_t)

        din_np = features_t.grad.detach().cpu().numpy()
        din_ref_np = features_ref_t.grad.detach().cpu().numpy()
        self.assertAllClose(din_ref_np, din_np, atol=1e-4)

        for layer, layer_ref in zip(net.net, net_ref.net):
            dw = layer.weight.grad.detach().cpu().numpy()
            dw_ref = layer_ref.weight.grad.detach().cpu().view(
                *dw.shape).numpy()
            self.assertAllClose(dw, dw_ref, atol=1e-4)

        out_np = out.detach().cpu().numpy()
        out_ref_np = out_ref.detach().cpu().numpy()
        self.assertAllClose(out_np, out_ref_np, atol=1e-4)
def testSpMaxPool3d(self):
    """Check SparseMaxPoolTestTorch against the dense MaxPool3dTestTorch.

    For every grid configuration (skipping stride > 1 combined with
    dilation > 1) the densified sparse output and the gradient w.r.t. the
    input features are compared with the dense reference at ``atol=1e-4``.
    """
    np.random.seed(485)
    devices = ["cuda:0"]
    shapes = [[19, 18, 17]]
    batchsizes = [1, 2]
    in_channels = [64]
    out_channels = [64]
    ksizes = [2, 3]
    strides = [1, 2, 3]
    paddings = [0, 1]
    dilations = [1, 2, 3]

    for dev, shape, bs, IC, OC, k, s, p, d in params_grid(
            devices, shapes, batchsizes, in_channels, out_channels, ksizes,
            strides, paddings, dilations):
        if s > 1 and d > 1:
            continue  # don't support this.
        device = torch.device(dev)
        num_points = [1000] * bs
        # when data contains negative, sparse maxpool is not equal to dense
        # maxpool, so only positive features are generated.
        sparse_dict = generate_sparse_data(shape,
                                           num_points,
                                           IC,
                                           data_range=[0.1, 1])

        features = np.ascontiguousarray(sparse_dict["features"]).astype(
            np.float32)
        indices = np.ascontiguousarray(
            sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
        features_dense = sparse_dict["features_dense"].astype(np.float32)
        # The result of this draw is not used. The call is kept only so the
        # seeded NumPy RNG is advanced exactly as before, which keeps the
        # values drawn for `dout_sparse` below unchanged.
        np.random.uniform(0, 1, size=[k, k, k, OC, IC])

        indices_t = torch.from_numpy(indices).int().to(device)
        features_t = torch.from_numpy(features).to(device)
        features_t.requires_grad = True
        features_dense_t = torch.from_numpy(features_dense).to(device)
        features_dense_t.requires_grad = True

        net = SparseMaxPoolTestTorch(1, 3, shape, k, s, p, d).to(device)
        net_ref = MaxPool3dTestTorch(1, 3, shape, k, s, p, d).to(device)

        out_ref = net_ref(features_dense_t)
        out = net(features_t, indices_t, bs)

        outids = out.indices
        outfeatures = out.features
        out_dense = out.dense(channels_first=False)
        # Move the channel axis from last to second position so the layout
        # matches the dense reference output.
        out = out_dense.permute(0, 4, 1, 2, 3).contiguous()

        out_np = out.detach().cpu().numpy()
        out_ref_np = out_ref.detach().cpu().numpy()
        self.assertAllClose(out_np, out_ref_np, atol=1e-4)

        # Draw the upstream gradient at the sparse output locations only,
        # then scatter it into a dense tensor so both networks receive the
        # same gradient.
        dout_sparse = np.random.uniform(
            -0.2, 0.2, outfeatures.shape).astype(features.dtype)
        dout_sparse_t = torch.from_numpy(dout_sparse).to(device)
        dout_t = scatter_nd(outids.long(), dout_sparse_t,
                            list(out_dense.shape))
        dout_t = dout_t.permute(0, 4, 1, 2, 3).contiguous()
        out.backward(dout_t)
        out_ref.backward(dout_t)

        din_dense = features_dense_t.grad.detach().permute(
            0, 2, 3, 4, 1).contiguous()
        din_sparse = gather_nd(din_dense, indices_t.long())
        din_np = features_t.grad.detach().cpu().numpy()
        din_sparse_np = din_sparse.cpu().numpy()
        self.assertAllClose(din_np, din_sparse_np, atol=1e-4)
def
main
(
algo
=
spconv
.
ConvAlgo
.
Native
,
dtype
=
torch
.
float32
):
# function for develop.
np
.
random
.
seed
(
484
)
# devices = ["cuda:0"]
devices
=
[
"cuda:0"
]
shapes
=
[[
400
,
400
,
15
]]
batchsizes
=
[
2
]
else
:
dw
=
dw
.
transpose
(
3
,
4
,
0
,
1
,
2
)
else
:
# OHWI -> OIHW
dw
=
dw
.
transpose
(
0
,
4
,
1
,
2
,
3
)
test_case
.
assertAllClose
(
dw
,
dw_ref
,
atol
=
1e-4
)
test_case
.
assertAllClose
(
din_np
,
din_sparse_np
,
atol
=
1e-4
)
in_channels
=
[
19
]
out_channels
=
[
17
]
ksizes
=
[(
3
,
3
,
3
)]
strides
=
[
1
]
paddings
=
[
0
]
dilations
=
[
1
]
def
test_spdeconv3d
():
test_case
=
TestCase
()
for
dev
,
shape
,
bs
,
IC
,
OC
,
k
,
s
,
p
,
d
in
params_grid
(
np
.
random
.
seed
(
484
)
devices
=
[
"cuda:0"
]
shapes
=
[[
19
,
18
,
17
]]
batchsizes
=
[
1
,
2
]
in_channels
=
[
64
]
out_channels
=
[
32
,
48
,
64
]
ksizes
=
[
2
,
3
]
strides
=
[
2
,
3
]
paddings
=
[
0
,
1
,
2
]
dilations
=
[
1
,
2
,
3
]
algos
=
[
ConvAlgo
.
Native
,
ConvAlgo
.
MaskImplicitGemm
,
ConvAlgo
.
MaskSplitImplicitGemm
]
for
dev
,
shape
,
bs
,
IC
,
OC
,
k
,
s
,
p
,
d
,
al
in
params_grid
(
devices
,
shapes
,
batchsizes
,
in_channels
,
out_channels
,
ksizes
,
strides
,
paddings
,
dilations
):
strides
,
paddings
,
dilations
,
algos
):
if
all
([
s
>
1
,
d
>
1
]):
continue
continue
# don't support this.
device
=
torch
.
device
(
dev
)
num_points
=
[
30000
]
*
bs
num_points
=
[
1000
]
*
bs
dtype
=
torch
.
float32
sparse_dict
=
generate_sparse_data
(
shape
,
num_points
,
IC
)
...
...
@@ -725,115 +379,154 @@ def main(algo=spconv.ConvAlgo.Native, dtype=torch.float32):
indices
=
np
.
ascontiguousarray
(
sparse_dict
[
"indices"
][:,
[
3
,
0
,
1
,
2
]]).
astype
(
np
.
int32
)
features_dense
=
sparse_dict
[
"features_dense"
].
astype
(
np
.
float32
)
indices_t
=
torch
.
from_numpy
(
indices
)
filters
=
np
.
random
.
uniform
(
0
,
1
,
size
=
[
k
[
0
],
1
,
1
,
IC
,
OC
]).
astype
(
np
.
float32
)
indices_t
=
torch
.
from_numpy
(
indices
).
int
().
to
(
device
).
to
(
dtype
)
features_t
=
torch
.
from_numpy
(
features
).
to
(
device
).
to
(
dtype
)
net
=
SparseDeConv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
d
,
al
).
to
(
device
)
net_ref
=
DeConv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
d
).
to
(
device
)
features_dense_t
=
torch
.
from_numpy
(
features_dense
).
to
(
device
).
to
(
dtype
)
net
=
SparseConv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
d
,
algo
=
algo
).
to
(
device
).
to
(
dtype
)
net_ref
=
Conv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
d
).
to
(
device
).
to
(
dtype
)
filters_t
=
torch
.
from_numpy
(
filters
).
to
(
device
).
to
(
dtype
)
net_ref
.
net
[
0
].
weight
[:]
=
filters_t
.
permute
(
4
,
3
,
0
,
1
,
2
).
contiguous
()
net
.
net
[
0
].
weight
[:]
=
filters_t
if
net
.
algo
==
ConvAlgo
.
Native
and
not
ALL_WEIGHT_IS_KRSC
:
if
FILTER_HWIO
:
filters
=
np
.
random
.
uniform
(
-
1
,
1
,
size
=
[
k
,
k
,
k
,
IC
,
OC
]).
astype
(
np
.
float32
)
else
:
filters
=
np
.
random
.
uniform
(
-
1
,
1
,
size
=
[
k
,
k
,
k
,
OC
,
IC
]).
astype
(
np
.
float32
)
filters_t
=
torch
.
from_numpy
(
filters
).
to
(
device
).
to
(
dtype
)
if
FILTER_HWIO
:
net_ref
.
net
[
0
].
weight
.
data
[:]
=
filters_t
.
permute
(
3
,
4
,
0
,
1
,
2
).
contiguous
()
else
:
net_ref
.
net
[
0
].
weight
.
data
[:]
=
filters_t
.
permute
(
4
,
3
,
0
,
1
,
2
).
contiguous
()
else
:
filters
=
np
.
random
.
uniform
(
-
1
,
1
,
size
=
[
OC
,
k
,
k
,
k
,
IC
]).
astype
(
np
.
float32
)
filters_t
=
torch
.
from_numpy
(
filters
).
to
(
device
).
to
(
dtype
)
net_ref
.
net
[
0
].
weight
.
data
[:]
=
filters_t
.
permute
(
4
,
0
,
1
,
2
,
3
).
contiguous
()
net
.
net
[
0
].
weight
.
data
[:]
=
filters_t
indices_t
=
torch
.
from_numpy
(
indices
).
int
().
to
(
device
)
features_t
=
torch
.
from_numpy
(
features
).
to
(
device
)
features_t
.
requires_grad
=
True
features_dense_t
=
torch
.
from_numpy
(
features_dense
).
to
(
device
)
features_dense_t
.
requires_grad
=
True
filters_t
=
torch
.
from_numpy
(
filters
).
to
(
device
)
out_ref
=
net_ref
(
features_dense_t
)
times
=
[]
for
i
in
range
(
10
):
t
=
time
.
time
()
out
=
net
(
features_t
,
indices_t
,
bs
)
torch
.
cuda
.
synchronize
()
times
.
append
(
time
.
time
()
-
t
)
# print((net.grid == -1).float().sum(), net.grid.numel())
# print("spconv time", time.time() - t)
print
(
"spconv time"
,
np
.
mean
(
times
[
2
:]))
out
=
net
(
features_t
,
indices_t
,
bs
)
# print(out.indices)
out
=
out
.
dense
()
out_numpy
=
out
.
detach
().
cpu
().
numpy
()
print
(
np
.
linalg
.
norm
(
out
.
detach
().
cpu
().
numpy
()
-
out_ref
.
detach
().
cpu
().
numpy
()))
print
(
out_numpy
.
min
(),
out_numpy
.
max
(),
out_numpy
.
mean
(),
out_numpy
.
sum
())
out
=
net
(
features_t
,
indices_t
,
bs
).
dense
()
out_np
=
out
.
detach
().
cpu
().
numpy
()
out_ref_np
=
out_ref
.
detach
().
cpu
().
numpy
()
test_case
.
assertAllClose
(
out_np
,
out_ref_np
,
atol
=
1e-4
)
dout
=
np
.
random
.
uniform
(
-
0.2
,
0.2
,
out_ref
.
shape
).
astype
(
features
.
dtype
)
dout_t
=
torch
.
from_numpy
(
dout
).
to
(
device
)
out
.
backward
(
dout_t
)
out_ref
.
backward
(
dout_t
)
din_dense
=
features_dense_t
.
grad
.
detach
().
permute
(
0
,
2
,
3
,
4
,
1
).
contiguous
()
din_sparse
=
gather_nd
(
din_dense
,
indices_t
.
long
())
din
=
features_t
.
grad
.
detach
()
din_np
=
din
.
cpu
().
numpy
()
din_sparse_np
=
din_sparse
.
cpu
().
numpy
()
test_case
.
assertAllClose
(
din_np
,
din_sparse_np
,
atol
=
1e-4
)
for
layer
,
layer_ref
in
zip
(
net
.
net
,
net_ref
.
net
):
dw
=
layer
.
weight
.
grad
.
detach
().
cpu
().
numpy
()
dw_ref
=
layer_ref
.
weight
.
grad
.
detach
().
cpu
().
numpy
()
if
net
.
algo
==
ConvAlgo
.
Native
and
not
ALL_WEIGHT_IS_KRSC
:
if
FILTER_HWIO
:
dw
=
dw
.
transpose
(
3
,
4
,
0
,
1
,
2
)
else
:
dw
=
dw
.
transpose
(
4
,
3
,
0
,
1
,
2
)
else
:
# OHWI -> OIHW
dw
=
dw
.
transpose
(
4
,
0
,
1
,
2
,
3
)
test_case
.
assertAllClose
(
dw
,
dw_ref
,
atol
=
1e-4
)
def
test_spmaxpool3d
():
test_case
=
TestCase
()
def
main_subm
(
algo
,
dtype
=
torch
.
float32
):
# function for develop.
np
.
random
.
seed
(
484
)
torch
.
manual_seed
(
50051
)
# devices = ["cuda:0"]
np
.
random
.
seed
(
485
)
devices
=
[
"cuda:0"
]
shapes
=
[[
400
,
400
,
1
5
]]
batchsizes
=
[
2
]
shapes
=
[[
19
,
18
,
1
7
]]
batchsizes
=
[
1
,
2
]
in_channels
=
[
32
]
in_channels
=
[
64
]
out_channels
=
[
64
]
ksizes
=
[(
3
,
3
,
3
)]
strides
=
[
1
]
paddings
=
[
1
]
dilations
=
[
1
]
for
dev
,
shape
,
bs
,
IC
,
OC
,
k
,
s
,
p
,
d
in
params_grid
(
ksizes
=
[
2
,
3
]
strides
=
[
1
,
2
,
3
]
paddings
=
[
0
,
1
]
dilations
=
[
1
,
2
,
3
]
# ksizes = [2]
# strides = [2]
# paddings = [0]
# dilations = [1]
algos
=
[
ConvAlgo
.
Native
,
ConvAlgo
.
MaskImplicitGemm
,
ConvAlgo
.
MaskSplitImplicitGemm
]
for
dev
,
shape
,
bs
,
IC
,
OC
,
k
,
s
,
p
,
d
,
al
in
params_grid
(
devices
,
shapes
,
batchsizes
,
in_channels
,
out_channels
,
ksizes
,
strides
,
paddings
,
dilations
):
strides
,
paddings
,
dilations
,
algos
):
if
all
([
s
>
1
,
d
>
1
]):
continue
continue
# don't support this.
device
=
torch
.
device
(
dev
)
num_points
=
[
1
20
000
]
*
bs
num_points
=
[
1000
]
*
bs
sparse_dict
=
generate_sparse_data
(
shape
,
num_points
,
IC
)
# when data contains negative, sparse maxpool is not equal to dense maxpool.
sparse_dict
=
generate_sparse_data
(
shape
,
num_points
,
IC
,
data_range
=
[
0.1
,
1
])
features
=
np
.
ascontiguousarray
(
sparse_dict
[
"features"
]).
astype
(
np
.
float32
)
indices
=
np
.
ascontiguousarray
(
sparse_dict
[
"indices"
][:,
[
3
,
0
,
1
,
2
]]).
astype
(
np
.
int32
)
features_dense
=
sparse_dict
[
"features_dense"
].
astype
(
np
.
float32
)
indices_t
=
torch
.
from_numpy
(
indices
)
filters
=
np
.
random
.
uniform
(
0
,
1
,
size
=
[
k
[
0
],
1
,
1
,
IC
,
OC
]).
astype
(
np
.
float32
)
indices_t
=
torch
.
from_numpy
(
indices
).
int
().
to
(
device
).
to
(
dtype
)
features_t
=
torch
.
from_numpy
(
features
).
to
(
device
).
to
(
dtype
)
indices_t
=
torch
.
from_numpy
(
indices
).
int
().
to
(
device
)
features_t
=
torch
.
from_numpy
(
features
).
to
(
device
)
features_t
.
requires_grad
=
True
features_dense_t
=
torch
.
from_numpy
(
features_dense
).
to
(
device
)
features_dense_t
.
requires_grad
=
True
net
=
SparseMaxPoolTestTorch
(
1
,
3
,
shape
,
k
,
s
,
p
,
d
,
al
).
to
(
device
)
net_ref
=
MaxPool3dTestTorch
(
1
,
3
,
shape
,
k
,
s
,
p
,
d
).
to
(
device
)
features_dense_t
=
torch
.
from_numpy
(
features_dense
).
to
(
device
).
to
(
dtype
)
net
=
SubMConv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
d
,
algo
=
algo
).
to
(
device
).
to
(
dtype
)
net_ref
=
Conv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
d
).
to
(
device
).
to
(
dtype
)
filters_t
=
torch
.
from_numpy
(
filters
).
to
(
device
).
to
(
dtype
)
net_ref
.
net
[
0
].
weight
[:]
=
filters_t
.
permute
(
4
,
3
,
0
,
1
,
2
).
contiguous
()
net
.
net
[
0
].
weight
[:]
=
filters_t
out_ref
=
net_ref
(
features_dense_t
)
times
=
[]
for
i
in
range
(
20
):
t
=
time
.
time
()
out
=
net
(
features_t
,
indices_t
,
bs
)
torch
.
cuda
.
synchronize
()
times
.
append
(
time
.
time
()
-
t
)
# print((net.grid == -1).float().sum(), net.grid.numel())
# print("spconv time", time.time() - t)
print
(
"spconv time"
,
np
.
mean
(
times
[
10
:]))
out
=
net
(
features_t
,
indices_t
,
bs
)
# print(out.indices)
out
=
out
.
dense
()
out_numpy
=
out
.
detach
().
cpu
().
numpy
()
# print(
# np.linalg.norm(out.detach().cpu().numpy() -
# out_ref.detach().cpu().numpy()))
print
(
out_numpy
.
min
(),
out_numpy
.
max
(),
out_numpy
.
mean
(),
out_numpy
.
sum
())
return
out_numpy
if
__name__
==
'__main__'
:
# main_subm(algo=spconv.ConvAlgo.SparseConvNet, dtype=torch.float32)
# main(algo=spconv.ConvAlgo.SparseConvNet, dtype=torch.float32)
# TestCase().assertAllClose(out_my, out_ref)
# unittest.main()
TestSpConv
().
testSpConv3d
()
outids
=
out
.
indices
outfeatures
=
out
.
features
outids_dev
=
outids
.
float
()
out_dense
=
out
.
dense
(
channels_first
=
False
)
out
=
out_dense
.
permute
(
0
,
4
,
1
,
2
,
3
).
contiguous
()
out_np
=
out
.
detach
().
cpu
().
numpy
()
out_ref_np
=
out_ref
.
detach
().
cpu
().
numpy
()
test_case
.
assertAllClose
(
out_np
,
out_ref_np
,
atol
=
1e-4
)
dout_sparse
=
np
.
random
.
uniform
(
-
0.2
,
0.2
,
outfeatures
.
shape
).
astype
(
features
.
dtype
)
dout_sparse_t
=
torch
.
from_numpy
(
dout_sparse
).
to
(
device
)
dout_t
=
scatter_nd
(
outids
.
long
(),
dout_sparse_t
,
list
(
out_dense
.
shape
))
dout_t
=
dout_t
.
permute
(
0
,
4
,
1
,
2
,
3
).
contiguous
()
out
.
backward
(
dout_t
)
out_ref
.
backward
(
dout_t
)
din_dense
=
features_dense_t
.
grad
.
detach
().
permute
(
0
,
2
,
3
,
4
,
1
).
contiguous
()
din_sparse
=
gather_nd
(
din_dense
,
indices_t
.
long
())
din
=
features_t
.
grad
.
detach
()
din_np
=
din
.
cpu
().
numpy
()
din_sparse_np
=
din_sparse
.
cpu
().
numpy
()
test_case
.
assertAllClose
(
din_np
,
din_sparse_np
,
atol
=
1e-4
)
if
__name__
==
"__main__"
:
test_spmaxpool3d
()
test/test_implgemm.py
deleted
100644 → 0
View file @
7af751dc
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
spconv.core_cc.csrc.sparse.all
import
SpconvOps
test/test_multi_impl.py
0 → 100644
View file @
bab09b63
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Compare results between different algos:
CPU: simple gather-mm-scatter
Native: Fused gather-mm-scatter
ImplicitGemm: implicit gemm
"""
import
time
from
pathlib
import
Path
import
numpy
as
np
import
torch
from
torch
import
nn
from
cumm
import
tensorview
as
tv
from
spconv.core
import
ConvAlgo
import
spconv.pytorch
as
spconv
import
pickle
from
spconv.test_utils
import
generate_sparse_data
,
params_grid
class Net(nn.Module):
    """Sparse 3D network used to compare conv algorithm implementations.

    The stack is built from bias-free submanifold convs, two strided sparse
    convs, three sparse max pools and two inverse convs. Every conv and
    pool layer receives the same ``algo``.
    """

    def __init__(self, shape, algo):
        """Build the layer stack.

        Args:
            shape: spatial shape handed to ``SparseConvTensor`` in
                :meth:`forward`.
            algo: conv algorithm passed to every conv and pool layer.
        """
        super().__init__()
        pool_algo = algo

        def subm(in_ch, out_ch, key):
            # Bias-free submanifold conv with kernel size 3.
            return spconv.SubMConv3d(in_ch,
                                     out_ch,
                                     3,
                                     bias=False,
                                     indice_key=key,
                                     algo=algo)

        # NOTE: layers are created in exactly this order. The test seeds
        # torch before constructing each network, so reordering would
        # change which random weights each layer receives.
        self.net = spconv.SparseSequential(
            subm(3, 32, "c0"),
            subm(32, 32, "c0"),
            subm(32, 64, "c0"),
            subm(64, 64, "c0"),
            spconv.SparseConv3d(64,
                                64,
                                3,
                                2,
                                1,
                                bias=False,
                                indice_key="m0",
                                algo=algo),
            subm(64, 96, "c1"),
            subm(96, 96, "c1"),
            spconv.SparseConv3d(96,
                                96,
                                2,
                                2,
                                bias=False,
                                indice_key="m1",
                                algo=algo),
            subm(96, 128, "c2"),
            subm(128, 128, "c2"),
            spconv.SparseMaxPool3d(2, 2, algo=pool_algo),
            subm(128, 160, "c3"),
            subm(160, 160, "c3"),
            spconv.SparseMaxPool3d(2, 2, algo=pool_algo, indice_key="m3"),
            subm(160, 192, "c4"),
            subm(192, 192, "c4"),
            spconv.SparseMaxPool3d(2, 2, indice_key="m4", algo=pool_algo),
            subm(192, 224, "c5"),
            subm(224, 224, "c5"),
            # The inverse convs use the indice keys given to the two max
            # pools above ("m4", then "m3").
            spconv.SparseInverseConv3d(224,
                                       128,
                                       2,
                                       indice_key="m4",
                                       bias=False,
                                       algo=algo),
            spconv.SparseInverseConv3d(128,
                                       64,
                                       2,
                                       indice_key="m3",
                                       bias=False,
                                       algo=algo),
        )
        self.shape = shape

    def forward(self, features, coors, batch_size):
        """Wrap the inputs in a ``SparseConvTensor`` and run the stack.

        Args:
            features: per-point feature tensor.
            coors: per-point integer coordinates.
            batch_size: batch size passed to ``SparseConvTensor``.

        Returns:
            Whatever ``self.net`` returns for the sparse tensor.
        """
        x = spconv.SparseConvTensor(features, coors, self.shape, batch_size)
        return self.net(x)
class NetLight(nn.Module):
    """Smaller sparse 3D network for the slow CPU fp16 comparison.

    The stack is built from bias-free submanifold convs, two strided sparse
    convs and two inverse convs that use the strided convs' indice keys.
    Every layer receives the same ``algo``.
    """

    def __init__(self, shape, algo):
        """Build the layer stack.

        Args:
            shape: spatial shape handed to ``SparseConvTensor`` in
                :meth:`forward`.
            algo: conv algorithm passed to every conv layer.
        """
        super().__init__()

        def subm(in_ch, out_ch, key):
            # Bias-free submanifold conv with kernel size 3.
            return spconv.SubMConv3d(in_ch,
                                     out_ch,
                                     3,
                                     bias=False,
                                     indice_key=key,
                                     algo=algo)

        # NOTE: layers are created in exactly this order. The test seeds
        # torch before constructing each network, so reordering would
        # change which random weights each layer receives.
        self.net = spconv.SparseSequential(
            subm(3, 32, "c0"),
            subm(32, 32, "c0"),
            subm(32, 64, "c0"),
            subm(64, 64, "c0"),
            spconv.SparseConv3d(64,
                                64,
                                3,
                                2,
                                1,
                                bias=False,
                                indice_key="m0",
                                algo=algo),
            subm(64, 96, "c1"),
            subm(96, 96, "c1"),
            spconv.SparseConv3d(96,
                                96,
                                2,
                                2,
                                bias=False,
                                indice_key="m1",
                                algo=algo),
            # The inverse convs use the indice keys given to the two
            # strided convs above ("m1", then "m0").
            spconv.SparseInverseConv3d(96,
                                       64,
                                       2,
                                       indice_key="m1",
                                       bias=False,
                                       algo=algo),
            spconv.SparseInverseConv3d(64,
                                       32,
                                       3,
                                       indice_key="m0",
                                       bias=False,
                                       algo=algo),
        )
        self.shape = shape

    def forward(self, features, coors, batch_size):
        """Wrap the inputs in a ``SparseConvTensor`` and run the stack.

        Args:
            features: per-point feature tensor.
            coors: per-point integer coordinates.
            batch_size: batch size passed to ``SparseConvTensor``.

        Returns:
            Whatever ``self.net`` returns for the sparse tensor.
        """
        x = spconv.SparseConvTensor(features, coors, self.shape, batch_size)
        return self.net(x)
def _test_multi_impl(dtype: torch.dtype):
    """Run one network with every conv implementation and compare results.

    The same architecture is built four times from the same torch seed
    (CPU Native, GPU Native, GPU MaskImplicitGemm, GPU
    MaskSplitImplicitGemm). Forward outputs of the three GPU networks are
    checked against the CPU output. Weight gradients of the two
    implicit-gemm networks are checked against the GPU Native gradients;
    the CPU-vs-GPU-Native gradient error is only printed.

    Args:
        dtype: dtype that inputs and networks are cast to. ``torch.float16``
            switches to a small generated input and ``NetLight``.
    """
    # TODO remove or release this when tf32 op is ready
    torch.backends.cuda.matmul.allow_tf32 = False
    torch.backends.cudnn.allow_tf32 = False

    np.random.seed(50051)
    use_half = dtype == torch.float16
    if use_half:
        # CPU fp16 is very slow, so we use a small data here.
        spatial_shape = [19, 18, 17]
        sparse_dict = generate_sparse_data(spatial_shape, [1500] * 1, 3)
        voxels = np.ascontiguousarray(sparse_dict["features"]).astype(
            np.float32)
        coors = np.ascontiguousarray(
            sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
    else:
        data_path = Path(__file__).parent / "data" / "test_spconv.pkl"
        with open(data_path, "rb") as f:
            voxels, coors, spatial_shape = pickle.load(f)

    gpu = torch.device("cuda:0")
    cpu = torch.device("cpu:0")

    voxels_cpu = torch.from_numpy(voxels).to(cpu).to(dtype)
    coors_cpu = torch.from_numpy(coors).to(cpu).int()
    voxels_gpu = torch.from_numpy(voxels).to(gpu).to(dtype)
    coors_gpu = torch.from_numpy(coors).to(gpu).int()

    # CPU fp16 is very slow, so we use a small network here.
    net_cls = NetLight if use_half else Net

    def build(algo, target):
        # The torch seed is reset before every construction, so each
        # network's weight init starts from the same RNG state.
        torch.manual_seed(50051)
        return net_cls(spatial_shape, algo).to(target).to(dtype)

    net_native_cpu = build(ConvAlgo.Native, cpu)
    net_native_gpu = build(ConvAlgo.Native, gpu)
    net_imp_gpu = build(ConvAlgo.MaskImplicitGemm, gpu)
    net_simp_gpu = build(ConvAlgo.MaskSplitImplicitGemm, gpu)
    all_nets = (net_native_cpu, net_native_gpu, net_imp_gpu, net_simp_gpu)
    for module in all_nets:
        spconv.assign_name_for_sparse_modules(module)

    # Gradient-free forward pass, used only to learn the dense output shape.
    with torch.no_grad():
        probe: torch.Tensor = net_native_cpu(voxels_cpu, coors_cpu, 1).dense()
    dout = np.random.uniform(-0.2, 0.2, probe.shape).astype(np.float32)
    dout_cpu = torch.from_numpy(dout).to(cpu).to(dtype)
    dout_gpu = torch.from_numpy(dout).to(gpu).to(dtype)

    out_cpu = net_native_cpu(voxels_cpu, coors_cpu, 1).dense()
    out_cpu.backward(dout_cpu)

    gpu_outputs = []
    for module in (net_native_gpu, net_imp_gpu, net_simp_gpu):
        dense_out = module(voxels_gpu, coors_gpu, 1).dense()
        dense_out.backward(dout_gpu)
        gpu_outputs.append(dense_out)

    def l2_error(lhs, rhs):
        return torch.linalg.norm(lhs - rhs).cpu().item()

    with torch.no_grad():
        dense_cpu = out_cpu.cuda()
        error_native, error_imp, error_simp = [
            l2_error(dense_cpu, dense_gpu) for dense_gpu in gpu_outputs
        ]

    print("error_native", error_native)
    print("error_imp", error_imp)
    print("error_simp", error_simp)
    # fp32 must match closely; every other dtype gets the loose bound.
    limit = 0.01 if dtype == torch.float32 else 10
    assert error_native < limit
    assert error_imp < limit
    assert error_simp < limit

    cpu_params, native_params, imp_params, simp_params = [
        dict(module.named_parameters()) for module in all_nets
    ]
    for k, cpu_w in cpu_params.items():
        native_w_grad = native_params[k].grad.detach()
        imp_w_grad = imp_params[k].grad.detach()
        simp_w_grad = simp_params[k].grad.detach()
        cpu_w_grad = cpu_w.grad.detach().cuda()

        error_native = l2_error(native_w_grad, cpu_w_grad)
        error_imp = l2_error(native_w_grad, imp_w_grad)
        error_simp = l2_error(native_w_grad, simp_w_grad)
        print(k, error_native, error_imp, error_simp)
        assert error_imp < 1
        assert error_simp < 1
def test_multi_impl():
    """Run the multi-implementation comparison for fp32, then fp16."""
    for dtype in (torch.float32, torch.float16):
        _test_multi_impl(dtype)
# Allow running the comparison directly as a script, without pytest.
if __name__ == "__main__":
    test_multi_impl()
test/test_native_kernels.py
deleted
100644 → 0
View file @
7af751dc
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
test_before_push.sh
0 → 100644
View file @
bab09b63
# Developers must run this script before pushing or opening a pull request.
# this script contains three parts:
# 1. unit tests for all gemm/conv kernels
# 2. comparison test: compare network fwd/bwd results between CPU, Native, ImplicitGemm
# 3. f32/f16 train/eval test based on mnist and some small datasets
echo
"-------------UNIT TEST START--------------"
pytest ./test
echo
"-------------UNIT TEST END--------------"
python ./example/mnist_sparse.py
--fp16
\ No newline at end of file
tools/install_windows_cuda.ps1
View file @
bab09b63
...
...
@@ -28,12 +28,12 @@ if (($CUDA_VERSION_FULL -eq "10.2") -or ($CUDA_VERSION_FULL -eq "11.0") -or ($CU
)
}
elseif
(
$CUDA_VERSION_FULL
-eq
"11.3"
){
$CUDA_PACKAGES_IN
=
@(
"
cuda_
nvcc"
;
"nvcc"
;
"visual_studio_integration"
;
"
cuda_
nvrtc"
;
"
cuda_
cudart"
;
"
cuda_
thrust"
;
"
lib
curand"
;
"nvrtc
_dev
"
;
"cudart"
;
"thrust"
;
"curand
_dev
"
;
)
}
else
{
# after cuda 11.4
...
...
version.txt
View file @
bab09b63
2.1.21
\ No newline at end of file
2.2.0
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment