Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
one
spconv
Commits
bf011c76
Commit
bf011c76
authored
Nov 23, 2021
by
yan.yan
Browse files
temp commit
parent
4791f582
Changes
34
Hide whitespace changes
Inline
Side-by-side
Showing
14 changed files
with
1397 additions
and
654 deletions
+1397
-654
spconv/pytorch/conv.py
spconv/pytorch/conv.py
+17
-1
spconv/pytorch/core.py
spconv/pytorch/core.py
+8
-1
spconv/pytorch/cppcore.py
spconv/pytorch/cppcore.py
+14
-4
spconv/pytorch/modules.py
spconv/pytorch/modules.py
+1
-0
spconv/pytorch/ops.py
spconv/pytorch/ops.py
+9
-2
spconv/pytorch/utils.py
spconv/pytorch/utils.py
+56
-19
test/benchmark.py
test/benchmark.py
+20
-20
test/test_all_algo.py
test/test_all_algo.py
+663
-0
test/test_conv.py
test/test_conv.py
+274
-574
test/test_implgemm.py
test/test_implgemm.py
+0
-15
test/test_multi_impl.py
test/test_multi_impl.py
+324
-3
test/test_native_kernels.py
test/test_native_kernels.py
+0
-14
test_before_push.sh
test_before_push.sh
+10
-0
version.txt
version.txt
+1
-1
No files found.
spconv/pytorch/conv.py
View file @
bf011c76
...
@@ -35,6 +35,20 @@ from spconv.utils import nullcontext
...
@@ -35,6 +35,20 @@ from spconv.utils import nullcontext
FILTER_HWIO
=
False
FILTER_HWIO
=
False
def expand_nd(val: Union[int, List[int], Tuple[int, ...]], ndim: int) -> List[int]:
    """Normalize a per-dimension parameter to a list of length ``ndim``.

    Args:
        val: either a single int (broadcast to every dimension) or a
            list/tuple that already holds one value per dimension.
        ndim: number of dimensions the result must cover.

    Returns:
        A newly created list with ``ndim`` entries. The result never aliases
        ``val``, so callers may mutate it without touching their input.

    Raises:
        AssertionError: if ``val`` is a list/tuple whose length is not ``ndim``.
        NotImplementedError: if ``val`` is neither an int, a list nor a tuple.
    """
    if isinstance(val, int):
        return [val] * ndim
    if isinstance(val, (list, tuple)):
        assert len(val) == ndim, f"expected {ndim} values, got {len(val)}"
        # always copy: previously a list input was handed back by reference
        # while a tuple input was copied.
        return list(val)
    raise NotImplementedError
def
_calculate_fan_in_and_fan_out_hwio
(
tensor
,
algo
:
ConvAlgo
):
def
_calculate_fan_in_and_fan_out_hwio
(
tensor
,
algo
:
ConvAlgo
):
dimensions
=
tensor
.
ndimension
()
dimensions
=
tensor
.
ndimension
()
if
dimensions
<
2
:
if
dimensions
<
2
:
...
@@ -110,7 +124,9 @@ class SparseConvolution(SparseModule):
...
@@ -110,7 +124,9 @@ class SparseConvolution(SparseModule):
self
.
out_channels
=
out_channels
self
.
out_channels
=
out_channels
self
.
kernel_size
=
kernel_size
self
.
kernel_size
=
kernel_size
kv
=
int
(
np
.
prod
(
kernel_size
))
kv
=
int
(
np
.
prod
(
kernel_size
))
self
.
conv1x1
=
kv
==
1
kv_stride
=
int
(
np
.
prod
(
kernel_size
))
self
.
conv1x1
=
kv
==
1
and
kv_stride
==
1
self
.
stride
=
stride
self
.
stride
=
stride
self
.
padding
=
padding
self
.
padding
=
padding
self
.
dilation
=
dilation
self
.
dilation
=
dilation
...
...
spconv/pytorch/core.py
View file @
bf011c76
...
@@ -104,7 +104,8 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):
...
@@ -104,7 +104,8 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):
indice_dict
:
Optional
[
dict
]
=
None
,
indice_dict
:
Optional
[
dict
]
=
None
,
benchmark
:
bool
=
False
,
benchmark
:
bool
=
False
,
permanent_thrust_allocator
:
bool
=
False
,
permanent_thrust_allocator
:
bool
=
False
,
enable_timer
:
bool
=
False
):
enable_timer
:
bool
=
False
,
force_algo
:
Optional
[
ConvAlgo
]
=
None
):
"""
"""
Args:
Args:
features: [num_points, num_features] feature tensor
features: [num_points, num_features] feature tensor
...
@@ -115,6 +116,8 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):
...
@@ -115,6 +116,8 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):
is very large.
is very large.
benchmark: whether to enable benchmark. if enabled, all sparse operators will be record to
benchmark: whether to enable benchmark. if enabled, all sparse operators will be record to
SparseConvTensor.
SparseConvTensor.
enable_timer: if exists, all spconv internal ops run time will be record in _timer.
force_algo: force conv/pool layers use this algo, should only used for debug.
"""
"""
ndim
=
indices
.
shape
[
1
]
-
1
ndim
=
indices
.
shape
[
1
]
-
1
assert
features
.
ndim
==
2
assert
features
.
ndim
==
2
...
@@ -139,6 +142,7 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):
...
@@ -139,6 +142,7 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):
if
permanent_thrust_allocator
:
if
permanent_thrust_allocator
:
self
.
thrust_allocator
=
ThrustSortAllocator
(
features
.
device
)
self
.
thrust_allocator
=
ThrustSortAllocator
(
features
.
device
)
self
.
_timer
=
CUDAKernelTimer
(
enable_timer
)
self
.
_timer
=
CUDAKernelTimer
(
enable_timer
)
self
.
force_algo
=
force_algo
def
replace_feature
(
self
,
feature
:
torch
.
Tensor
):
def
replace_feature
(
self
,
feature
:
torch
.
Tensor
):
"""we need to replace x.features = F.relu(x.features) with x = x.replace_feature(F.relu(x.features))
"""we need to replace x.features = F.relu(x.features) with x = x.replace_feature(F.relu(x.features))
...
@@ -152,6 +156,8 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):
...
@@ -152,6 +156,8 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):
new_spt
.
benchmark_record
=
self
.
benchmark_record
new_spt
.
benchmark_record
=
self
.
benchmark_record
new_spt
.
thrust_allocator
=
self
.
thrust_allocator
new_spt
.
thrust_allocator
=
self
.
thrust_allocator
new_spt
.
_timer
=
self
.
_timer
new_spt
.
_timer
=
self
.
_timer
new_spt
.
force_algo
=
self
.
force_algo
return
new_spt
return
new_spt
@
property
@
property
...
@@ -217,4 +223,5 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):
...
@@ -217,4 +223,5 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):
tensor
.
benchmark_record
=
self
.
benchmark_record
tensor
.
benchmark_record
=
self
.
benchmark_record
tensor
.
thrust_allocator
=
self
.
thrust_allocator
tensor
.
thrust_allocator
=
self
.
thrust_allocator
tensor
.
_timer
=
self
.
_timer
tensor
.
_timer
=
self
.
_timer
tensor
.
force_algo
=
self
.
force_algo
return
tensor
return
tensor
spconv/pytorch/cppcore.py
View file @
bf011c76
...
@@ -30,7 +30,8 @@ _TORCH_DTYPE_TO_TV = {
...
@@ -30,7 +30,8 @@ _TORCH_DTYPE_TO_TV = {
def
torch_tensor_to_tv
(
ten
:
torch
.
Tensor
,
def
torch_tensor_to_tv
(
ten
:
torch
.
Tensor
,
dtype
:
Optional
[
int
]
=
None
,
dtype
:
Optional
[
int
]
=
None
,
shape
:
Optional
[
List
[
int
]]
=
None
):
shape
:
Optional
[
List
[
int
]]
=
None
,
stride
:
Optional
[
List
[
int
]]
=
None
):
# assert ten.is_contiguous(), "must be contiguous tensor"
# assert ten.is_contiguous(), "must be contiguous tensor"
ptr
=
ten
.
data_ptr
()
ptr
=
ten
.
data_ptr
()
device
=
ten
.
device
device
=
ten
.
device
...
@@ -40,11 +41,20 @@ def torch_tensor_to_tv(ten: torch.Tensor,
...
@@ -40,11 +41,20 @@ def torch_tensor_to_tv(ten: torch.Tensor,
tv_device
=
0
tv_device
=
0
else
:
else
:
raise
NotImplementedError
raise
NotImplementedError
if
shape
is
None
:
shape
=
list
(
ten
.
shape
)
if
dtype
is
None
:
if
dtype
is
None
:
dtype
=
_TORCH_DTYPE_TO_TV
[
ten
.
dtype
]
dtype
=
_TORCH_DTYPE_TO_TV
[
ten
.
dtype
]
return
tv
.
from_blob
(
ptr
,
shape
,
list
(
ten
.
stride
()),
dtype
,
tv_device
)
if
stride
is
None
:
stride
=
list
(
ten
.
stride
())
if
shape
is
None
:
shape
=
list
(
ten
.
shape
)
else
:
if
not
ten
.
is_contiguous
():
msg
=
"if you provide custom shape for non-contig tensor, stride must not None"
assert
stride
is
not
None
,
msg
else
:
# custom shape, if tensor is contiguous, we use from_blob and calc strides
return
tv
.
from_blob
(
ptr
,
shape
,
dtype
,
tv_device
)
return
tv
.
from_blob_strided
(
ptr
,
shape
,
stride
,
dtype
,
tv_device
)
def
get_current_stream
():
def
get_current_stream
():
...
...
spconv/pytorch/modules.py
View file @
bf011c76
...
@@ -137,6 +137,7 @@ class SparseSequential(SparseModule):
...
@@ -137,6 +137,7 @@ class SparseSequential(SparseModule):
input
=
module
(
input
)
input
=
module
(
input
)
else
:
else
:
if
isinstance
(
input
,
spconv
.
SparseConvTensor
):
if
isinstance
(
input
,
spconv
.
SparseConvTensor
):
print
(
input
.
features
.
shape
)
if
input
.
indices
.
shape
[
0
]
!=
0
:
if
input
.
indices
.
shape
[
0
]
!=
0
:
input
=
input
.
replace_feature
(
module
(
input
.
features
))
input
=
input
.
replace_feature
(
module
(
input
.
features
))
else
:
else
:
...
...
spconv/pytorch/ops.py
View file @
bf011c76
...
@@ -1066,7 +1066,7 @@ def indice_conv_backward(features: torch.Tensor,
...
@@ -1066,7 +1066,7 @@ def indice_conv_backward(features: torch.Tensor,
alpha
=
1.0
,
alpha
=
1.0
,
beta
=
beta
)
beta
=
beta
)
if
not
FILTER_HWIO
:
if
is_KC_not_CK
:
a
=
out_bp_tv
a
=
out_bp_tv
b
=
features_tv
b
=
features_tv
a_inds
=
out_indices
a_inds
=
out_indices
...
@@ -1376,6 +1376,9 @@ def implicit_gemm_backward(features: torch.Tensor,
...
@@ -1376,6 +1376,9 @@ def implicit_gemm_backward(features: torch.Tensor,
mask_width
=-
1
,
mask_width
=-
1
,
beta
=
beta
,
beta
=
beta
,
stream
=
stream
)
stream
=
stream
)
# for backward weight, beta = 0 because each split
# handle different kernel locations.
# TODO remove D iterator in backward weight kernel
CONV
.
run_with_tuned_result
(
CONV
.
run_with_tuned_result
(
wgrad_tune_res
,
wgrad_tune_res
,
ConvOpType
.
kBackwardWeight
,
ConvOpType
.
kBackwardWeight
,
...
@@ -1389,7 +1392,7 @@ def implicit_gemm_backward(features: torch.Tensor,
...
@@ -1389,7 +1392,7 @@ def implicit_gemm_backward(features: torch.Tensor,
reverse_mask
=
False
,
reverse_mask
=
False
,
mask_filter
=
masks
[
j
].
item
(),
mask_filter
=
masks
[
j
].
item
(),
mask_width
=
mask_width
,
mask_width
=
mask_width
,
beta
=
beta
,
beta
=
0
,
workspace
=
workspace_tv
,
workspace
=
workspace_tv
,
stream
=
stream
)
stream
=
stream
)
...
@@ -1403,6 +1406,8 @@ def indice_maxpool(features: torch.Tensor, indice_pairs: torch.Tensor,
...
@@ -1403,6 +1406,8 @@ def indice_maxpool(features: torch.Tensor, indice_pairs: torch.Tensor,
# stream = get_current_stream()
# stream = get_current_stream()
# CONV.stream_synchronize(stream)
# CONV.stream_synchronize(stream)
# t = time.time()
# t = time.time()
if
not
features
.
is_contiguous
():
features
=
features
.
contiguous
()
out_channel
=
features
.
shape
[
-
1
]
out_channel
=
features
.
shape
[
-
1
]
out_features
=
torch
.
zeros
((
num_activate_out
,
out_channel
),
out_features
=
torch
.
zeros
((
num_activate_out
,
out_channel
),
...
@@ -1474,6 +1479,8 @@ def indice_maxpool_implicit_gemm(features: torch.Tensor,
...
@@ -1474,6 +1479,8 @@ def indice_maxpool_implicit_gemm(features: torch.Tensor,
stream
=
get_current_stream
()
stream
=
get_current_stream
()
# CONV.stream_synchronize(stream)
# CONV.stream_synchronize(stream)
# t = time.time()
# t = time.time()
if
not
features
.
is_contiguous
():
features
=
features
.
contiguous
()
out_channel
=
features
.
shape
[
-
1
]
out_channel
=
features
.
shape
[
-
1
]
out_features
=
torch
.
empty
((
num_activate_out
,
out_channel
),
out_features
=
torch
.
empty
((
num_activate_out
,
out_channel
),
...
...
spconv/pytorch/utils.py
View file @
bf011c76
...
@@ -71,36 +71,72 @@ class PointToVoxel(object):
...
@@ -71,36 +71,72 @@ class PointToVoxel(object):
pc
:
torch
.
Tensor
,
pc
:
torch
.
Tensor
,
clear_voxels
:
bool
=
True
,
clear_voxels
:
bool
=
True
,
empty_mean
:
bool
=
False
):
empty_mean
:
bool
=
False
):
"""generate voxels/indices/num_point_per_voxel/pc_voxel_ids from
point cloud.
This function don't return pc_voxel_id for backward compatility.
pc_voxel_id will be added in spconv 2.2.
Args:
pc: [N, 3+] point cloud.
clear_voxels: if True, call zero on voxels
empty_mean: if True, full empty location of voxels with mean.
Returns:
voxels: voxels
indices: quantized coords
num_per_voxel: number of points in a voxel
"""
res
=
self
.
generate_voxel_with_id
(
pc
,
clear_voxels
,
empty_mean
)
return
res
[
0
],
res
[
1
],
res
[
2
]
def
generate_voxel_with_id
(
self
,
pc
:
torch
.
Tensor
,
clear_voxels
:
bool
=
True
,
empty_mean
:
bool
=
False
):
"""generate voxels/indices/num_point_per_voxel/pc_voxel_ids from
point cloud.
Args:
pc: [N, 3+] point cloud.
clear_voxels: if True, call zero on voxels
empty_mean: if True, full empty location of voxels with mean.
Returns:
voxels: voxels
indices: quantized coords
num_per_voxel: number of points in a voxel
pc_voxel_id: voxel id for every point. if not exists, -1.
"""
assert
pc
.
device
.
type
==
self
.
device
.
type
,
"your pc device is wrong"
assert
pc
.
device
.
type
==
self
.
device
.
type
,
"your pc device is wrong"
expected_hash_data_num
=
pc
.
shape
[
0
]
*
2
expected_hash_data_num
=
pc
.
shape
[
0
]
*
2
with
torch
.
no_grad
():
with
torch
.
no_grad
():
pc_voxel_id
=
torch
.
empty
([
pc
.
shape
[
0
]],
dtype
=
torch
.
int64
,
device
=
self
.
device
)
pc_voxel_id_tv
=
torch_tensor_to_tv
(
pc_voxel_id
)
if
self
.
device
.
type
!=
"cpu"
:
if
self
.
device
.
type
!=
"cpu"
:
if
self
.
hashdata
.
shape
[
0
]
<
expected_hash_data_num
:
hashdata
=
torch
.
empty
([
expected_hash_data_num
,
2
],
self
.
hashdata
=
torch
.
empty
([
expected_hash_data_num
,
2
],
dtype
=
torch
.
int64
,
dtype
=
torch
.
int64
,
device
=
pc
.
device
)
device
=
self
.
device
)
point_indice_data
=
torch
.
empty
([
pc
.
shape
[
0
]],
dtype
=
torch
.
int64
,
device
=
pc
.
device
)
if
self
.
point_indice_data
.
shape
[
0
]
<
pc
.
shape
[
0
]:
self
.
point_indice_data
=
torch
.
empty
([
pc
.
shape
[
0
]],
dtype
=
torch
.
int64
,
device
=
self
.
device
)
pc_tv
=
torch_tensor_to_tv
(
pc
)
pc_tv
=
torch_tensor_to_tv
(
pc
)
stream
=
get_current_stream
()
stream
=
get_current_stream
()
voxels_tv
=
torch_tensor_to_tv
(
self
.
voxels
)
voxels_tv
=
torch_tensor_to_tv
(
self
.
voxels
)
indices_tv
=
torch_tensor_to_tv
(
self
.
indices
)
indices_tv
=
torch_tensor_to_tv
(
self
.
indices
)
num_per_voxel_tv
=
torch_tensor_to_tv
(
self
.
num_per_voxel
)
num_per_voxel_tv
=
torch_tensor_to_tv
(
self
.
num_per_voxel
)
hashdata_tv
=
torch_tensor_to_tv
(
hashdata_tv
=
torch_tensor_to_tv
(
self
.
hashdata
,
hashdata
,
dtype
=
tv
.
custom128
,
dtype
=
tv
.
custom128
,
shape
=
[
self
.
hashdata
.
shape
[
0
]])
shape
=
[
hashdata
.
shape
[
0
]])
point_indice_data_tv
=
torch_tensor_to_tv
(
point_indice_data_tv
=
torch_tensor_to_tv
(
point_indice_data
)
self
.
point_indice_data
)
with
torch
.
cuda
.
device
(
pc
.
device
):
res
=
SpconvOps
.
point2voxel_cuda
(
res
=
SpconvOps
.
point2voxel_cuda
(
pc_tv
,
voxels_tv
,
indices_tv
,
num_per_voxel_tv
,
pc_tv
,
voxels_tv
,
indices_tv
,
num_per_voxel_tv
,
hashdata_tv
,
point_indice_data_tv
,
pc_voxel_id_tv
,
self
.
vsize
,
hashdata_tv
,
point_indice_data_tv
,
self
.
vsize
,
self
.
grid_size
,
self
.
grid_stride
,
self
.
coors_range
,
self
.
grid_size
,
self
.
grid_stride
,
self
.
coors_range
,
empty_mean
,
clear_voxels
,
stream
)
empty_mean
,
clear_voxels
,
stream
)
num_voxels
=
res
[
0
].
shape
[
0
]
num_voxels
=
res
[
0
].
shape
[
0
]
else
:
else
:
pc_tv
=
torch_tensor_to_tv
(
pc
)
pc_tv
=
torch_tensor_to_tv
(
pc
)
...
@@ -111,6 +147,7 @@ class PointToVoxel(object):
...
@@ -111,6 +147,7 @@ class PointToVoxel(object):
hashdata_tv
=
torch_tensor_to_tv
(
self
.
hashdata
,
dtype
=
tv
.
int32
)
hashdata_tv
=
torch_tensor_to_tv
(
self
.
hashdata
,
dtype
=
tv
.
int32
)
res
=
SpconvOps
.
point2voxel_cpu
(
pc_tv
,
voxels_tv
,
indices_tv
,
res
=
SpconvOps
.
point2voxel_cpu
(
pc_tv
,
voxels_tv
,
indices_tv
,
num_per_voxel_tv
,
hashdata_tv
,
num_per_voxel_tv
,
hashdata_tv
,
pc_voxel_id_tv
,
self
.
vsize
,
self
.
grid_size
,
self
.
vsize
,
self
.
grid_size
,
self
.
grid_stride
,
self
.
grid_stride
,
self
.
coors_range
,
empty_mean
,
self
.
coors_range
,
empty_mean
,
...
@@ -118,4 +155,4 @@ class PointToVoxel(object):
...
@@ -118,4 +155,4 @@ class PointToVoxel(object):
num_voxels
=
res
[
0
].
shape
[
0
]
num_voxels
=
res
[
0
].
shape
[
0
]
return
(
self
.
voxels
[:
num_voxels
],
self
.
indices
[:
num_voxels
],
return
(
self
.
voxels
[:
num_voxels
],
self
.
indices
[:
num_voxels
],
self
.
num_per_voxel
[:
num_voxels
])
self
.
num_per_voxel
[:
num_voxels
]
,
pc_voxel_id
)
test/benchmark.py
View file @
bf011c76
...
@@ -24,7 +24,7 @@ from spconv.core import ConvAlgo
...
@@ -24,7 +24,7 @@ from spconv.core import ConvAlgo
import
spconv.pytorch
as
spconv
import
spconv.pytorch
as
spconv
from
spconv.utils
import
Point2VoxelCPU3d
from
spconv.utils
import
Point2VoxelCPU3d
# torch.backends.cudnn.enabled = False
def
waymo_data
(
batch_size
=
1
):
def
waymo_data
(
batch_size
=
1
):
gen
=
Point2VoxelCPU3d
([
0.1
,
0.1
,
0.1
],
[
-
80
,
-
80
,
-
2
,
80
,
80
,
6
],
3
,
gen
=
Point2VoxelCPU3d
([
0.1
,
0.1
,
0.1
],
[
-
80
,
-
80
,
-
2
,
80
,
80
,
6
],
3
,
150000
,
1
)
150000
,
1
)
...
@@ -289,7 +289,7 @@ def main():
...
@@ -289,7 +289,7 @@ def main():
voxels_th
=
torch
.
from_numpy
(
voxels
).
to
(
device
).
to
(
dtype
)
voxels_th
=
torch
.
from_numpy
(
voxels
).
to
(
device
).
to
(
dtype
)
coors_th
=
torch
.
from_numpy
(
coors
).
to
(
device
).
int
()
coors_th
=
torch
.
from_numpy
(
coors
).
to
(
device
).
int
()
voxels_th
.
requires_grad
=
True
voxels_th
.
requires_grad
=
True
algo
=
spconv
.
ConvAlgo
.
Native
algo
=
spconv
.
ConvAlgo
.
MaskImplicitGemm
# 3080 Laptop
# 3080 Laptop
# MaskImpGemm: 11.2ms
# MaskImpGemm: 11.2ms
# MaskSplitImpGemm: 12.2ms
# MaskSplitImpGemm: 12.2ms
...
@@ -324,26 +324,26 @@ def main():
...
@@ -324,26 +324,26 @@ def main():
print
(
out
.
spatial_shape
,
out
.
features
.
mean
(),
out
.
features
.
max
(),
print
(
out
.
spatial_shape
,
out
.
features
.
mean
(),
out
.
features
.
max
(),
out
.
features
.
min
())
out
.
features
.
min
())
#
times = []
times
=
[]
#
with torch.no_grad():
with
torch
.
no_grad
():
#
for i in range(20):
for
i
in
range
(
20
):
#
print("------------")
print
(
"------------"
)
#
torch.cuda.synchronize()
torch
.
cuda
.
synchronize
()
#
t = time.time()
t
=
time
.
time
()
#
out_nograd = net(voxels_th, coors_th, 1, False)
out_nograd
=
net
(
voxels_th
,
coors_th
,
1
,
False
)
#
timer = out_nograd._timer
timer
=
out_nograd
.
_timer
#
# res = timer.collect_by_name("forward", timer.get_all_pair_time())
# res = timer.collect_by_name("forward", timer.get_all_pair_time())
#
# res2 = timer.collect_by_name("forward0", timer.get_all_pair_time())
# res2 = timer.collect_by_name("forward0", timer.get_all_pair_time())
#
# print(sum(res.values()) + sum(res2.values()))
# print(sum(res.values()) + sum(res2.values()))
#
# print(timer.get_all_pair_time())
# print(timer.get_all_pair_time())
#
# print(sum(timer.get_all_pair_time().values()))
# print(sum(timer.get_all_pair_time().values()))
#
torch.cuda.synchronize()
torch
.
cuda
.
synchronize
()
#
# sort_bench()
# sort_bench()
#
times.append(time.time() - t)
times
.
append
(
time
.
time
()
-
t
)
#
print("spconv time", np.mean(times[10:]))
print
(
"spconv time"
,
np
.
mean
(
times
[
10
:]))
#
times = []
times
=
[]
# for i in range(10):
# for i in range(10):
# out = net(voxels_th, coors_th, 1)
# out = net(voxels_th, coors_th, 1)
...
...
test/test_all_algo.py
0 → 100644
View file @
bf011c76
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Test all gemm/conv kernels.
We can't test all kernels in network because auto-tuner will only find one best kernel.
"""
import
sys
from
pathlib
import
Path
from
typing
import
Dict
,
List
,
Tuple
import
pickle
import
sys
import
time
from
pathlib
import
Path
from
cumm.gemm.algospec.core
import
GemmAlgo
,
ShuffleStrideType
import
numpy
as
np
import
pccm
import
torch
import
torch.nn.functional
as
F
from
spconv.test_utils
import
TestCase
from
cumm
import
tensorview
as
tv
from
cumm.conv.bases
import
NCHW
,
NHWC
,
ConvIterAlgo
,
ConvOpType
import
os
from
cumm.gemm.codeops
import
div_up
from
spconv.core
import
AlgoHint
,
ConvAlgo
from
spconv.pytorch.conv
import
expand_nd
from
spconv.pytorch
import
ops
from
spconv.algo
import
CONV
,
GEMM
,
BestAlgoByProfile
,
BestConvAlgoByProfile
from
spconv.pytorch.cppcore
import
get_current_stream
,
torch_tensor_to_tv
from
spconv.test_utils
import
generate_sparse_data
,
params_grid
import
tqdm
from
spconv.constants
import
ALL_WEIGHT_IS_KRSC
# Fail at import time unless the KRSC weight-layout flag is set; the message
# states that only KRSC is supported in spconv >= 2.2.
assert ALL_WEIGHT_IS_KRSC is True, "we only support KRSC in spconv >= 2.2"
# TODO remove or release this when tf32 op is ready
# Turn TF32 off for both CUDA matmul and cuDNN.
# NOTE(review): presumably so torch results stay full-precision fp32 when
# compared against the float32 references below -- confirm.
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False

# Maps the numpy dtypes used by the tests to the matching torch dtype.
NUMPY_DTYPE_TO_TORCH = {
    np.float32: torch.float32,
    np.float16: torch.float16,
    np.int8: torch.int8,
}
class SparseConvTester:
    """Build random sparse-conv operands plus numpy reference results.

    The constructor generates random sparse coordinates, computes the
    input/output index pairs with the native algorithm (used for the numpy
    reference) and, for non-native algorithms, the implicit-gemm pair/mask
    data. It then draws random input, weight and output-gradient arrays and
    computes reference forward output, input gradient and weight gradient.
    """

    def __init__(self, algo: ConvAlgo, subm: bool, shape: List[int], bs: int,
                 dtype: np.dtype, N: int, K: int, C: int, ksize: int,
                 stride: int, padding: int, dilation: int) -> None:
        """Create operands and reference results for one configuration.

        Args:
            algo: convolution algorithm under test.
            subm: forwarded to the pair generators as the ``subm`` flag.
            shape: spatial shape passed to the data/pair generators.
            bs: batch size.
            dtype: numpy dtype of input/weight/output arrays; must be a key
                of ``NUMPY_DTYPE_TO_TORCH``.
            N: number of points requested per batch element.
            K: number of output channels (first weight axis).
            C: number of input channels (last weight axis).
            ksize: kernel size, broadcast to 3 dimensions.
            stride: stride, broadcast to 3 dimensions.
            padding: padding, broadcast to 3 dimensions.
            dilation: dilation, broadcast to 3 dimensions.
        """
        ndim = 3
        self.shape = shape
        self.bs = bs
        self.dtype = dtype
        self.dtype_th = NUMPY_DTYPE_TO_TORCH[dtype]
        self.K = K
        self.C = C
        self.ksize = expand_nd(ksize, ndim)
        self.stride = expand_nd(stride, ndim)
        self.padding = expand_nd(padding, ndim)
        self.dilation = expand_nd(dilation, ndim)
        self.N = N
        self.device = torch.device("cuda:0")
        # all-zero per-dimension argument handed to the pair generators
        # (NOTE(review): presumably the output padding -- confirm).
        op = expand_nd(0, ndim)
        # kernel volume; converted so the attribute really is a Python int.
        self.kv: int = int(np.prod(self.ksize))
        self.num_split = 1 if algo == ConvAlgo.MaskImplicitGemm else 2

        # honour N instead of a hard-coded 1500 points per batch element.
        sparse_dict = generate_sparse_data(shape, [N] * bs, C)

        voxels_np = np.ascontiguousarray(sparse_dict["features"]).astype(
            np.float32)
        # move column 3 of the generated indices to the front
        # (NOTE(review): presumably the batch index -- confirm).
        indices_np = np.ascontiguousarray(
            sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
        indices_th = torch.from_numpy(indices_np).to(self.device)
        # Native pairs are always computed: the numpy reference needs them.
        # Pass the real batch size (was hard-coded to 1) so it agrees with
        # the generated data and with the implicit-gemm call below.
        out_inds, pair_ref, indice_num_per_loc = ops.get_indice_pairs(
            indices_th, bs, shape, ConvAlgo.Native, self.ksize, self.stride,
            self.padding, self.dilation, op, subm)
        self.indice_num_per_loc_np = indice_num_per_loc.cpu().numpy()
        self.indice_pairs_np = pair_ref.cpu().numpy()
        self.pair_native = pair_ref
        self.indice_num_per_loc = indice_num_per_loc
        if algo == ConvAlgo.Native:
            # native algorithm has no implicit-gemm data; keep empty holders.
            self.out_inds: torch.Tensor = out_inds
            self.num_inds_per_loc: torch.Tensor = indice_num_per_loc
            self.pair_fwd: torch.Tensor = torch.Tensor()
            self.pair_bwd: torch.Tensor = torch.Tensor()
            self.pair_mask_fwd_splits: List[torch.Tensor] = []
            self.pair_mask_bwd_splits: List[torch.Tensor] = []
            self.mask_argsort_fwd_splits: List[torch.Tensor] = []
            self.mask_argsort_bwd_splits: List[torch.Tensor] = []
            self.masks = np.array([])
        else:
            res = ops.get_indice_pairs_implicit_gemm(
                indices_th, bs, shape, algo, self.ksize, self.stride,
                self.padding, self.dilation, op, subm=subm)
            self.out_inds = res[0]
            self.num_inds_per_loc = res[1]
            self.pair_fwd = res[2]
            self.pair_bwd = res[3]
            self.pair_mask_fwd_splits = res[4]
            self.pair_mask_bwd_splits = res[5]
            self.mask_argsort_fwd_splits = res[6]
            self.mask_argsort_bwd_splits = res[7]
            self.masks = res[8]
        self.voxels_np = voxels_np
        self.indices_np = indices_np
        self.subm = subm
        # random operands: input [num_in, C], weight [K, *ksize, C],
        # output (also used as output gradient) [num_out, K].
        if dtype == np.int8:
            self.inp = np.random.randint(-2, 2, size=[voxels_np.shape[0],
                                                      C]).astype(np.int8)
            self.weight = np.random.randint(-2, 2, size=[K, *self.ksize,
                                                         C]).astype(np.int8)
            self.output = np.random.randint(
                -2, 2, size=[self.out_inds.shape[0], K]).astype(dtype)
        else:
            self.inp = np.random.uniform(-1, 1, size=[voxels_np.shape[0],
                                                      C]).astype(dtype)
            self.weight = np.random.uniform(-1, 1, size=[K, *self.ksize,
                                                         C]).astype(dtype)
            self.output = np.random.uniform(
                -1, 1, size=[self.out_inds.shape[0], K]).astype(dtype)
        # reference weight layout: [kernel volume, K, C].
        self.weight_ref = self.weight.transpose(1, 2, 3, 0, 4)
        self.weight_ref = np.ascontiguousarray(self.weight_ref).reshape(
            -1, K, C)
        self.out_ref, self.din_ref, self.dw_ref = self._get_ref_output()
        # bring the weight gradient back to the [K, *ksize, C] layout.
        self.dw_ref = np.ascontiguousarray(
            self.dw_ref.transpose(1, 0, 2).reshape(K, *self.ksize, C))

    def _get_ref_output(self):
        """Compute float32 numpy references from the native index pairs.

        Returns:
            Tuple ``(output_ref, dinput_ref, dw_ref)``: forward output shaped
            like ``self.output``, input gradient shaped like ``self.inp`` and
            weight gradient shaped like ``self.weight_ref`` ([KV, K, C]).
        """
        output_ref = np.zeros_like(self.output, dtype=np.float32)
        dinput_ref = np.zeros_like(self.inp, dtype=np.float32)
        dw_ref = np.zeros_like(self.weight_ref, dtype=np.float32)  # KV, K, C

        kv_center = self.kv // 2
        for filter_offset in range(self.kv):
            # for subm, offsets past the center take the pair count of the
            # mirrored offset and the center offset covers every voxel.
            if self.subm and filter_offset > kv_center:
                nhot = self.indice_num_per_loc_np[self.kv - 1 - filter_offset]
            elif self.subm and filter_offset == kv_center:
                nhot = self.voxels_np.shape[0]
            else:
                nhot = self.indice_num_per_loc_np[filter_offset]

            i_inds = self.indice_pairs_np[0][filter_offset][:nhot]
            o_inds = self.indice_pairs_np[1][filter_offset][:nhot]
            w = self.weight_ref[filter_offset]
            # forward: gathered input @ weight.T, accumulated into outputs
            a = self.inp[i_inds]
            cc = a.astype(np.float32) @ w.T.astype(np.float32)
            output_ref[o_inds] += cc
            # backward input: NK @ KC, accumulated into inputs
            a = self.output[o_inds]
            cc = a.astype(np.float32) @ w.astype(np.float32)
            dinput_ref[i_inds] += cc
            # backward weight: KN @ NC for this kernel offset
            out_gather = self.output[o_inds]  # [N, K]
            inp_gather = self.inp[i_inds]  # [N, C]
            dw_res = out_gather.astype(np.float32).T @ inp_gather.astype(
                np.float32)
            dw_ref[filter_offset] = dw_res
        return output_ref, dinput_ref, dw_ref

    def get_operands(self, op_type: ConvOpType):
        """Return ``(inp, weight, output)`` as CUDA tensorview tensors.

        The operand that ``op_type`` produces (output for forward, input for
        backward-input, weight for backward-weight) is freshly allocated:
        zero-filled when ``subm`` is false, uninitialized otherwise. The
        other two are copies of the random numpy operands.
        """
        zeros_func = tv.zeros if not self.subm else tv.empty
        if op_type == ConvOpType.kBackwardInput:
            inp_tv = zeros_func(list(self.inp.shape), self.dtype, 0)
        else:
            inp_tv = tv.from_numpy(self.inp).cuda()
        if op_type == ConvOpType.kBackwardWeight:
            weight_tv = zeros_func(list(self.weight.shape), self.dtype, 0)
        else:
            weight_tv = tv.from_numpy(self.weight).cuda()
        if op_type == ConvOpType.kForward:
            output_tv = zeros_func(list(self.output.shape), self.dtype, 0)
        else:
            output_tv = tv.from_numpy(self.output).cuda()
        return inp_tv, weight_tv, output_tv

    def get_operands_torch(self, op_type: ConvOpType):
        """Return ``(inp, weight, output)`` as torch tensors.

        Same selection rule as ``get_operands``: the operand produced by
        ``op_type`` is allocated on ``self.device`` (zeros when ``subm`` is
        false, uninitialized otherwise); the others are CUDA copies of the
        random numpy operands.
        """
        zeros_func = torch.zeros if not self.subm else torch.empty
        if op_type == ConvOpType.kBackwardInput:
            inp_tv = zeros_func(list(self.inp.shape),
                                dtype=self.dtype_th,
                                device=self.device)
        else:
            inp_tv = torch.from_numpy(self.inp).cuda()
        if op_type == ConvOpType.kBackwardWeight:
            weight_tv = zeros_func(list(self.weight.shape),
                                   dtype=self.dtype_th,
                                   device=self.device)
        else:
            weight_tv = torch.from_numpy(self.weight).cuda()
        if op_type == ConvOpType.kForward:
            output_tv = zeros_func(list(self.output.shape),
                                   dtype=self.dtype_th,
                                   device=self.device)
        else:
            output_tv = torch.from_numpy(self.output).cuda()
        return inp_tv, weight_tv, output_tv
def _test_impgemm_conv_cuda(subm: bool):
    """Run every available implicit-gemm conv kernel and check it against numpy.

    For each parameter combination a ``SparseConvTester`` is built. Every
    kernel descriptor returned by ``CONV.get_all_available`` is then run for
    forward and backward-input, and afterwards for backward-weight with
    several split-k values. Results are compared with the tester's reference
    arrays (``assertAllClose`` for non-fp16, an error-norm bound for fp16).

    Args:
        subm: selects the parameter grid and the subm code paths below; it is
            also forwarded to ``SparseConvTester``.
    """
    ndim = 3
    # (atol, rtol) per dtype
    dtype_to_tol = {
        np.float32: (1e-4, 1e-4),
        np.float16: (1e-2, 1e-2),
        np.int8: (1e-4, 1e-4),
    }
    device = torch.device("cuda:0")
    shapes = [[19, 18, 17]]
    batchsizes = [1]
    dtypes = [np.float32, np.float16]
    test_case = TestCase()
    in_channels = [32, 47]
    out_channels = [32, 48, 62]
    if subm:
        ksizes = [3]
        strides = [1]
        paddings = [0]
        dilations = [1]
    else:
        ksizes = [2, 3]
        strides = [1, 2, 3]
        paddings = [0, 1]
        dilations = [1, 2]
    algos = [
        ConvAlgo.MaskSplitImplicitGemm,
        ConvAlgo.MaskImplicitGemm,
    ]
    arch = torch.cuda.get_device_capability()

    for shape, bs, C, K, k, s, p, d, algo, dtype in tqdm.tqdm(params_grid(shapes, batchsizes, in_channels, out_channels, ksizes, strides, paddings, dilations, algos, dtypes)):
        tester = SparseConvTester(algo, subm, shape, bs, dtype, 1500, K, C, k, s, p, d)
        atol, rtol = dtype_to_tol[dtype]
        # one [2, ceil(n / mask_width)] int32 mask-output buffer per mask width,
        # created lazily and shared by all kernels with that tile width.
        mask_width_to_mask_out_fwd: Dict[int, torch.Tensor] = {}
        mask_width_to_mask_out_bwd: Dict[int, torch.Tensor] = {}

        op_types = [ConvOpType.kForward, ConvOpType.kBackwardInput]
        spk = 1
        for op_type in op_types:
            inp_tv, weight_tv, output_tv = tester.get_operands(op_type)
            avail_desps = CONV.get_all_available(inp_tv, weight_tv, output_tv, NHWC, NHWC, NHWC, arch, op_type, -1)
            for desp in avail_desps:
                # non-subm: clear the destination tensor before every kernel,
                # since the operands are reused across descriptors.
                if not subm:
                    if op_type == ConvOpType.kForward:
                        output_tv.zero_()
                    else:
                        inp_tv.zero_()
                # this algo must success
                mask_width = desp.tile_shape[0]
                # if mask_width != 32:
                #     continue
                if mask_width not in mask_width_to_mask_out_fwd:
                    mask_width_to_mask_out_fwd[mask_width] = torch.zeros([2, div_up(tester.out_inds.shape[0], mask_width)],
                                    dtype=torch.int32,
                                    device=tester.device)
                mask_output_fwd = mask_width_to_mask_out_fwd[mask_width]

                if subm:
                    if desp.op_type == ConvOpType.kForward.value:
                        indice_pairs = tester.pair_fwd
                    elif desp.op_type == ConvOpType.kBackwardInput.value:
                        indice_pairs = tester.pair_bwd
                    else:
                        indice_pairs = tester.pair_fwd
                    mask_output = mask_output_fwd
                    # print([bin(x.item()) for x in masks])
                    for j in range(tester.num_split):
                        # first split overwrites (beta=0), second accumulates.
                        beta = 1 if j == 1 else 0
                        mask_filter = tester.masks[j].item()

                        reverse_mask = False
                        if desp.op_type == ConvOpType.kBackwardWeight.value:
                            mask_op = mask_output[j]
                        else:
                            mask_op = tester.pair_mask_fwd_splits[j]
                        if desp.op_type == ConvOpType.kBackwardInput.value:
                            reverse_mask = True
                        mask_output_run = torch_tensor_to_tv(mask_output[j], dtype=tv.uint32)
                        # backward weight gets an empty mask-output tensor
                        if desp.op_type == ConvOpType.kBackwardWeight.value:
                            mask_output_run = tv.Tensor()
                        CONV.run_with_tuned_result(
                            BestConvAlgoByProfile(desp, spk),
                            desp.op_type,
                            inp_tv,
                            weight_tv,
                            output_tv,
                            torch_tensor_to_tv(mask_op, dtype=tv.uint32),
                            torch_tensor_to_tv(tester.mask_argsort_fwd_splits[j]),
                            mask_output_run,
                            torch_tensor_to_tv(indice_pairs),
                            reverse_mask,
                            mask_filter=mask_filter,
                            mask_width=mask_width,
                            beta=beta,
                            verbose=False,
                        )
                else:
                    # non-subm keeps a separate mask-output buffer for the
                    # backward direction, sized by the number of input indices.
                    if mask_width not in mask_width_to_mask_out_bwd:
                        mask_width_to_mask_out_bwd[mask_width] = torch.zeros([2, div_up(tester.indices_np.shape[0], mask_width)],
                                        dtype=torch.int32,
                                        device=tester.device)
                    mask_output_bwd = mask_width_to_mask_out_bwd[mask_width]

                    if desp.op_type == ConvOpType.kForward.value:
                        indice_pairs = tester.pair_fwd  # inp -> out
                        mask_ops = tester.pair_mask_fwd_splits
                        mask_argsorts = tester.mask_argsort_fwd_splits
                        mask_output = mask_output_fwd
                    elif desp.op_type == ConvOpType.kBackwardInput.value:
                        indice_pairs = tester.pair_bwd  # out -> inp
                        mask_ops = tester.pair_mask_bwd_splits
                        mask_argsorts = tester.mask_argsort_bwd_splits
                        mask_output = mask_output_bwd
                    else:
                        indice_pairs = tester.pair_fwd  # inp -> out
                        mask_ops = tester.pair_mask_fwd_splits
                        mask_argsorts = tester.mask_argsort_fwd_splits
                        mask_output = mask_output_fwd

                    for j in range(tester.num_split):
                        # first split overwrites (beta=0), second accumulates.
                        beta = 1 if j == 1 else 0
                        mask_filter = tester.masks[j].item()
                        reverse_mask = False
                        if desp.op_type == ConvOpType.kBackwardWeight.value:
                            mask_op = mask_output[j]
                        else:
                            mask_op = mask_ops[j]

                        CONV.run_with_tuned_result(
                            BestConvAlgoByProfile(desp, spk),
                            desp.op_type,
                            inp_tv,
                            weight_tv,
                            output_tv,
                            torch_tensor_to_tv(mask_op, dtype=tv.uint32),
                            torch_tensor_to_tv(mask_argsorts[j]),
                            torch_tensor_to_tv(mask_output[j], dtype=tv.uint32),
                            torch_tensor_to_tv(indice_pairs),
                            reverse_mask,
                            mask_filter=mask_filter,
                            mask_width=mask_width,
                            beta=beta,
                            verbose=False,
                        )
                # compare this kernel's result with the numpy reference
                out_ref = tester.out_ref
                din_ref = tester.din_ref
                dw_ref = tester.dw_ref
                if op_type == ConvOpType.kForward:
                    out_my = output_tv.cpu().numpy()
                    if dtype != np.float16:
                        test_case.assertAllClose(out_ref, out_my, atol=atol, rtol=rtol)
                    else:
                        # fp16: only bound the L2 norm of the total error
                        error_norm = np.linalg.norm(out_ref.reshape(-1) - out_my.reshape(-1))
                        assert error_norm < 5
                        # print(desp, )
                else:
                    din_my = inp_tv.cpu().numpy()
                    if dtype != np.float16:
                        test_case.assertAllClose(din_ref, din_my, atol=atol, rtol=rtol)
                    else:
                        error_norm = np.linalg.norm(din_ref.reshape(-1) - din_my.reshape(-1))
                        assert error_norm < 10, f"{desp}, {error_norm}, {k}, {s}, {p}, {d}"
        # backward weight: uses the forward mask-output buffers collected above
        # as the mask operand (NOTE(review): presumably filled by the forward
        # kernels -- confirm), for several split-k values.
        inp_tv, weight_tv, output_tv = tester.get_operands(ConvOpType.kBackwardWeight)
        for spk in [1, 4, 16, 64]:
            for mask_width, mask_output in mask_width_to_mask_out_fwd.items():
                avail_desps = CONV.get_all_available(inp_tv, weight_tv, output_tv, NHWC, NHWC, NHWC, arch, ConvOpType.kBackwardWeight, mask_width)
                for desp in avail_desps:
                    weight_tv.zero_()
                    if subm:
                        indice_pairs = tester.pair_fwd
                        for j in range(tester.num_split):
                            # beta stays 0 for every split here
                            beta = 0
                            mask_filter = tester.masks[j].item()
                            mask_op = mask_output[j]
                            mask_op_tv = torch_tensor_to_tv(mask_op, dtype=tv.uint32)
                            # mask_op_np = mask_op_tv.cpu().numpy()
                            # bit_ref = np.bitwise_or.reduce(mask_op_np, axis=0)
                            # bit_my = mask_filter
                            CONV.run_with_tuned_result(
                                BestConvAlgoByProfile(desp, spk),
                                desp.op_type,
                                inp_tv,
                                weight_tv,
                                output_tv,
                                mask_op_tv,
                                torch_tensor_to_tv(tester.mask_argsort_fwd_splits[j]),
                                tv.Tensor(),
                                torch_tensor_to_tv(indice_pairs),
                                reverse_mask=False,
                                mask_filter=mask_filter,
                                mask_width=mask_width,
                                beta=beta,
                                verbose=False,
                            )
                    else:
                        indice_pairs = tester.pair_fwd  # inp -> out
                        mask_ops = tester.pair_mask_fwd_splits
                        mask_argsorts = tester.mask_argsort_fwd_splits
                        for j in range(tester.num_split):
                            # beta = 1 if j == 1 else 0
                            beta = 0
                            mask_filter = tester.masks[j].item()
                            reverse_mask = False
                            mask_op = mask_output[j]

                            CONV.run_with_tuned_result(
                                BestConvAlgoByProfile(desp, spk),
                                desp.op_type,
                                inp_tv,
                                weight_tv,
                                output_tv,
                                torch_tensor_to_tv(mask_op, dtype=tv.uint32),
                                torch_tensor_to_tv(mask_argsorts[j]),
                                torch_tensor_to_tv(mask_output[j], dtype=tv.uint32),
                                torch_tensor_to_tv(indice_pairs),
                                reverse_mask,
                                mask_filter=mask_filter,
                                mask_width=mask_width,
                                beta=beta,
                                verbose=False,
                            )
                    # compare the weight gradient with the numpy reference
                    dw_ref = tester.dw_ref
                    dw_my = weight_tv.cpu().numpy()
                    if dtype != np.float16:
                        # print(desp, spk, K, C, mask_width, algo)
                        test_case.assertAllClose(dw_ref, dw_my, atol=atol, rtol=rtol)
                    else:
                        error_norm = np.linalg.norm(dw_ref.reshape(-1) - dw_my.reshape(-1))
                        # print(desp, error_norm)
                        assert error_norm < 5
def
_test_native_conv_cuda
(
subm
:
bool
):
ndim
=
3
dtype_to_tol
=
{
np
.
float32
:
(
1e-4
,
1e-4
),
np
.
float16
:
(
1e-2
,
1e-2
),
np
.
int8
:
(
1e-4
,
1e-4
),
}
device
=
torch
.
device
(
"cuda:0"
)
shapes
=
[[
19
,
18
,
17
]]
batchsizes
=
[
1
]
dtypes
=
[
np
.
float32
,
np
.
float16
]
test_case
=
TestCase
()
in_channels
=
[
32
,
47
]
out_channels
=
[
32
,
48
,
62
]
if
subm
:
ksizes
=
[
3
,
5
]
strides
=
[
1
]
paddings
=
[
0
]
dilations
=
[
1
]
else
:
ksizes
=
[
2
,
3
]
strides
=
[
1
,
2
,
3
]
paddings
=
[
0
,
1
]
dilations
=
[
1
,
2
]
arch
=
torch
.
cuda
.
get_device_capability
()
stream
=
get_current_stream
()
for
shape
,
bs
,
C
,
K
,
k
,
s
,
p
,
d
,
dtype
in
tqdm
.
tqdm
(
params_grid
(
shapes
,
batchsizes
,
in_channels
,
out_channels
,
ksizes
,
strides
,
paddings
,
dilations
,
dtypes
)):
tester
=
SparseConvTester
(
ConvAlgo
.
Native
,
subm
,
shape
,
bs
,
dtype
,
1500
,
K
,
C
,
k
,
s
,
p
,
d
)
atol
,
rtol
=
dtype_to_tol
[
dtype
]
kv_center
=
tester
.
kv
//
2
kv
=
tester
.
kv
pair_in
=
torch_tensor_to_tv
(
tester
.
pair_native
)[
0
]
pair_out
=
torch_tensor_to_tv
(
tester
.
pair_native
)[
1
]
op_types
=
[
ConvOpType
.
kForward
,
ConvOpType
.
kBackwardInput
,
ConvOpType
.
kBackwardWeight
]
indice_pair_num_cpu
=
tester
.
indice_num_per_loc_np
spk
=
1
out_ref
=
tester
.
out_ref
din_ref
=
tester
.
din_ref
dw_ref
=
tester
.
dw_ref
.
reshape
(
K
,
-
1
,
C
)
for
op_type
in
op_types
:
inp_th
,
weight_th
,
output_th
=
tester
.
get_operands_torch
(
op_type
)
weight_th
=
weight_th
.
view
(
K
,
-
1
,
C
)
inp_tv
=
torch_tensor_to_tv
(
inp_th
)
weight_tv
=
torch_tensor_to_tv
(
weight_th
)
output_tv
=
torch_tensor_to_tv
(
output_th
)
if
op_type
==
ConvOpType
.
kForward
:
a
=
inp_tv
c
=
output_tv
b
=
weight_tv
.
select
(
1
,
tester
.
kv
//
2
)
avail_desps
=
GEMM
.
get_all_available
(
a
,
b
,
c
,
False
,
True
,
False
,
arch
,
ShuffleStrideType
.
ShuffleAC
)
for
desp
in
avail_desps
:
if
subm
:
torch
.
mm
(
inp_th
,
weight_th
[:,
tester
.
kv
//
2
].
T
,
out
=
output_th
)
else
:
output_tv
.
zero_
()
inited
=
subm
for
i
,
nhot
in
enumerate
(
indice_pair_num_cpu
):
if
subm
and
i
==
kv_center
:
continue
if
subm
and
i
>
kv_center
:
nhot
=
indice_pair_num_cpu
[
kv
-
i
-
1
]
if
nhot
<=
0
:
continue
inp_indices
=
pair_in
[
i
].
slice_first_axis
(
0
,
nhot
)
out_indices
=
pair_out
[
i
].
slice_first_axis
(
0
,
nhot
)
b
=
weight_tv
.
select
(
1
,
i
)
# inp @ filter.T, NC @ KC
beta
=
1.0
if
inited
else
0.0
GEMM
.
run_with_tuned_result
(
BestAlgoByProfile
(
desp
,
1
),
a
,
b
,
c
,
False
,
True
,
False
,
arch
=
arch
,
stream
=
stream
,
shuffle_type
=
ShuffleStrideType
.
ShuffleAC
,
a_inds
=
inp_indices
,
c_inds
=
out_indices
,
hint
=
AlgoHint
.
Fowrard
.
value
,
alpha
=
1.0
,
beta
=
beta
)
inited
=
True
out_my
=
output_tv
.
cpu
().
numpy
()
if
dtype
!=
np
.
float16
:
# error_norm = np.linalg.norm(out_ref.reshape(-1) - out_my.reshape(-1))
# assert error_norm < 1
# print(desp, K, C, k, error_norm)
test_case
.
assertAllClose
(
out_ref
,
out_my
,
atol
=
atol
,
rtol
=
rtol
)
else
:
error_norm
=
np
.
linalg
.
norm
(
out_ref
.
reshape
(
-
1
)
-
out_my
.
reshape
(
-
1
))
assert
error_norm
<
10
elif
op_type
==
ConvOpType
.
kBackwardInput
:
a
=
output_tv
b
=
weight_tv
.
select
(
1
,
tester
.
kv
//
2
)
c
=
inp_tv
avail_desps
=
GEMM
.
get_all_available
(
a
,
b
,
c
,
False
,
False
,
False
,
arch
,
ShuffleStrideType
.
ShuffleAC
)
for
desp
in
avail_desps
:
if
subm
:
torch
.
mm
(
output_th
,
weight_th
[:,
tester
.
kv
//
2
],
out
=
inp_th
)
else
:
inp_tv
.
zero_
()
inited
=
subm
for
i
,
nhot
in
enumerate
(
indice_pair_num_cpu
):
if
subm
and
i
==
kv_center
:
continue
if
subm
and
i
>
kv_center
:
nhot
=
indice_pair_num_cpu
[
kv
-
i
-
1
]
if
nhot
<=
0
:
continue
inp_indices
=
pair_in
[
i
].
slice_first_axis
(
0
,
nhot
)
out_indices
=
pair_out
[
i
].
slice_first_axis
(
0
,
nhot
)
b
=
weight_tv
.
select
(
1
,
i
)
# inp @ filter.T, NC @ KC
beta
=
1.0
if
inited
else
0.0
GEMM
.
run_with_tuned_result
(
BestAlgoByProfile
(
desp
,
1
),
a
,
b
,
c
,
False
,
False
,
False
,
arch
=
arch
,
stream
=
stream
,
shuffle_type
=
ShuffleStrideType
.
ShuffleAC
,
a_inds
=
out_indices
,
c_inds
=
inp_indices
,
hint
=
AlgoHint
.
Fowrard
.
value
,
alpha
=
1.0
,
beta
=
beta
)
inited
=
True
din_my
=
inp_tv
.
cpu
().
numpy
()
if
dtype
!=
np
.
float16
:
# error_norm = np.linalg.norm(din_ref.reshape(-1) - din_my.reshape(-1))
# print(desp, K, C, k, error_norm)
test_case
.
assertAllClose
(
din_ref
,
din_my
,
atol
=
atol
,
rtol
=
rtol
)
# assert error_norm < 1
else
:
error_norm
=
np
.
linalg
.
norm
(
din_ref
.
reshape
(
-
1
)
-
din_my
.
reshape
(
-
1
))
assert
error_norm
<
10
else
:
a
=
output_tv
b
=
inp_tv
c
=
weight_tv
.
select
(
1
,
tester
.
kv
//
2
)
avail_desps
=
GEMM
.
get_all_available
(
a
,
b
,
c
,
True
,
False
,
False
,
arch
,
ShuffleStrideType
.
ShuffleAB
)
for
desp
in
avail_desps
:
inited
=
subm
weight_tv
.
zero_
()
if
subm
:
torch
.
mm
(
output_th
.
T
,
inp_th
,
out
=
weight_th
[:,
kv_center
])
for
i
,
nhot
in
enumerate
(
indice_pair_num_cpu
):
if
subm
and
i
==
kv_center
:
continue
if
subm
and
i
>
kv_center
:
nhot
=
indice_pair_num_cpu
[
kv
-
i
-
1
]
if
nhot
<=
0
:
continue
beta
=
1.0
if
inited
else
0.0
inp_indices
=
pair_in
[
i
].
slice_first_axis
(
0
,
nhot
)
out_indices
=
pair_out
[
i
].
slice_first_axis
(
0
,
nhot
)
a_inds
=
out_indices
b_inds
=
inp_indices
GEMM
.
run_with_tuned_result
(
BestAlgoByProfile
(
desp
,
32
),
a
,
b
,
weight_tv
.
select
(
1
,
i
),
True
,
False
,
False
,
arch
=
arch
,
stream
=
stream
,
shuffle_type
=
ShuffleStrideType
.
ShuffleAB
,
a_inds
=
a_inds
,
b_inds
=
b_inds
,
hint
=
AlgoHint
.
BackwardWeight
.
value
,
alpha
=
1.0
,
beta
=
beta
)
dw_my
=
weight_tv
.
cpu
().
numpy
()
if
dtype
!=
np
.
float16
:
error_norm
=
np
.
linalg
.
norm
(
dw_ref
.
reshape
(
-
1
)
-
dw_my
.
reshape
(
-
1
))
assert
error_norm
<
1
# test_case.assertAllClose(dw_ref, dw_my, atol=atol, rtol=rtol)
# print(desp, error_norm)
else
:
error_norm
=
np
.
linalg
.
norm
(
dw_ref
.
reshape
(
-
1
)
-
dw_my
.
reshape
(
-
1
))
# print(desp, error_norm)
assert
error_norm
<
10
def
test_all_algo_unit
():
_test_impgemm_conv_cuda
(
True
)
_test_impgemm_conv_cuda
(
False
)
_test_native_conv_cuda
(
True
)
_test_native_conv_cuda
(
False
)
if
__name__
==
"__main__"
:
test_all_algo_unit
()
\ No newline at end of file
test/test_conv.py
View file @
bf011c76
...
@@ -12,6 +12,12 @@
...
@@ -12,6 +12,12 @@
# See the License for the specific language governing permissions and
# See the License for the specific language governing permissions and
# limitations under the License.
# limitations under the License.
"""Compare results between sparse and dense layers:
SparseConvXd
SparseConvTransposeXd
SparseMaxPoolXd
"""
import
time
import
time
import
unittest
import
unittest
from
pathlib
import
Path
from
pathlib
import
Path
...
@@ -24,13 +30,11 @@ from spconv.core import ConvAlgo
...
@@ -24,13 +30,11 @@ from spconv.core import ConvAlgo
import
spconv.pytorch
as
spconv
import
spconv.pytorch
as
spconv
from
spconv.test_utils
import
TestCase
,
generate_sparse_data
,
params_grid
from
spconv.test_utils
import
TestCase
,
generate_sparse_data
,
params_grid
from
spconv.constants
import
ALL_WEIGHT_IS_KRSC
,
FILTER_HWIO
from
spconv.constants
import
ALL_WEIGHT_IS_KRSC
,
FILTER_HWIO
# import sparseconvnet as scn
# we must disable tf32 to increase reference precision.
# we must disable tf32 to increase reference precision.
torch
.
backends
.
cuda
.
matmul
.
allow_tf32
=
False
torch
.
backends
.
cuda
.
matmul
.
allow_tf32
=
False
torch
.
backends
.
cudnn
.
allow_tf32
=
False
torch
.
backends
.
cudnn
.
allow_tf32
=
False
class
SparseConv3dTestTorch
(
nn
.
Module
):
class
SparseConv3dTestTorch
(
nn
.
Module
):
def
__init__
(
self
,
def
__init__
(
self
,
num_layers
,
num_layers
,
...
@@ -76,52 +80,6 @@ class SparseConv3dTestTorch(nn.Module):
...
@@ -76,52 +80,6 @@ class SparseConv3dTestTorch(nn.Module):
self
.
grid
)
self
.
grid
)
return
self
.
net
(
x
)
# .dense()
return
self
.
net
(
x
)
# .dense()
class
SubMConv3dTestTorch
(
nn
.
Module
):
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
in_channels
,
out_channels
,
kernel_size
,
stride
,
padding
,
dilation
,
algo
=
spconv
.
ConvAlgo
.
Native
):
super
().
__init__
()
layers
=
[
spconv
.
SubMConv3d
(
in_channels
,
out_channels
,
kernel_size
,
stride
,
padding
=
padding
,
dilation
=
dilation
,
bias
=
False
,
algo
=
algo
)
]
for
i
in
range
(
1
,
num_layers
):
layers
.
append
(
spconv
.
SubMConv3d
(
out_channels
,
out_channels
,
kernel_size
,
stride
,
padding
=
padding
,
dilation
=
dilation
,
bias
=
False
,
algo
=
algo
))
self
.
net
=
spconv
.
SparseSequential
(
*
layers
,
)
# self.grid = torch.full([3, *shape], -1, dtype=torch.int32).cuda()
self
.
grid
=
None
self
.
shape
=
shape
def
forward
(
self
,
features
,
coors
,
batch_size
):
coors
=
coors
.
int
()
# .cpu()
x
=
spconv
.
SparseConvTensor
(
features
,
coors
,
self
.
shape
,
batch_size
,
self
.
grid
)
return
self
.
net
(
x
)
# .dense()
class
Conv3dTestTorch
(
nn
.
Module
):
class
Conv3dTestTorch
(
nn
.
Module
):
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
in_channels
,
out_channels
,
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
in_channels
,
out_channels
,
kernel_size
,
stride
,
padding
,
dilation
):
kernel_size
,
stride
,
padding
,
dilation
):
...
@@ -150,11 +108,11 @@ class Conv3dTestTorch(nn.Module):
...
@@ -150,11 +108,11 @@ class Conv3dTestTorch(nn.Module):
def
forward
(
self
,
x
):
def
forward
(
self
,
x
):
return
self
.
net
(
x
)
# .dense()
return
self
.
net
(
x
)
# .dense()
class
SparseDeConv3dTestTorch
(
nn
.
Module
):
class
SparseDeConv3dTestTorch
(
nn
.
Module
):
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
in_channels
,
out_channels
,
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
in_channels
,
out_channels
,
kernel_size
,
stride
,
padding
,
dilation
):
kernel_size
,
stride
,
padding
,
dilation
,
algo
):
super
().
__init__
()
super
().
__init__
()
self
.
algo
=
algo
layers
=
[
layers
=
[
spconv
.
SparseConvTranspose3d
(
in_channels
,
spconv
.
SparseConvTranspose3d
(
in_channels
,
out_channels
,
out_channels
,
...
@@ -162,7 +120,8 @@ class SparseDeConv3dTestTorch(nn.Module):
...
@@ -162,7 +120,8 @@ class SparseDeConv3dTestTorch(nn.Module):
stride
,
stride
,
padding
=
padding
,
padding
=
padding
,
dilation
=
dilation
,
dilation
=
dilation
,
bias
=
False
)
bias
=
False
,
algo
=
algo
)
]
]
for
i
in
range
(
1
,
num_layers
):
for
i
in
range
(
1
,
num_layers
):
layers
.
append
(
layers
.
append
(
...
@@ -172,7 +131,8 @@ class SparseDeConv3dTestTorch(nn.Module):
...
@@ -172,7 +131,8 @@ class SparseDeConv3dTestTorch(nn.Module):
stride
,
stride
,
padding
=
padding
,
padding
=
padding
,
dilation
=
dilation
,
dilation
=
dilation
,
bias
=
False
))
bias
=
False
,
algo
=
algo
))
self
.
net
=
spconv
.
SparseSequential
(
*
layers
,
)
self
.
net
=
spconv
.
SparseSequential
(
*
layers
,
)
self
.
shape
=
shape
self
.
shape
=
shape
...
@@ -213,14 +173,15 @@ class DeConv3dTestTorch(nn.Module):
...
@@ -213,14 +173,15 @@ class DeConv3dTestTorch(nn.Module):
class
SparseMaxPoolTestTorch
(
nn
.
Module
):
class
SparseMaxPoolTestTorch
(
nn
.
Module
):
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
kernel_size
,
stride
,
padding
,
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
kernel_size
,
stride
,
padding
,
dilation
):
dilation
,
algo
):
super
().
__init__
()
super
().
__init__
()
self
.
algo
=
algo
layers
=
[
layers
=
[
spconv
.
SparseMaxPool3d
(
kernel_size
,
stride
,
padding
,
dilation
)
spconv
.
SparseMaxPool3d
(
kernel_size
,
stride
,
padding
,
dilation
,
algo
=
algo
)
]
]
for
i
in
range
(
1
,
num_layers
):
for
i
in
range
(
1
,
num_layers
):
layers
.
append
(
layers
.
append
(
spconv
.
SparseMaxPool3d
(
kernel_size
,
stride
,
padding
,
dilation
))
spconv
.
SparseMaxPool3d
(
kernel_size
,
stride
,
padding
,
dilation
,
algo
=
algo
))
self
.
net
=
spconv
.
SparseSequential
(
*
layers
,
)
self
.
net
=
spconv
.
SparseSequential
(
*
layers
,
)
self
.
shape
=
shape
self
.
shape
=
shape
...
@@ -243,86 +204,6 @@ class MaxPool3dTestTorch(nn.Module):
...
@@ -243,86 +204,6 @@ class MaxPool3dTestTorch(nn.Module):
def
forward
(
self
,
x
):
def
forward
(
self
,
x
):
return
self
.
net
(
x
)
# .dense()
return
self
.
net
(
x
)
# .dense()
class
SubmanifoldConvTestTorch
(
nn
.
Module
):
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
in_channels
,
out_channels
,
kernel_size
,
stride
):
super
().
__init__
()
layers
=
[
spconv
.
SubMConv3d
(
in_channels
,
out_channels
,
kernel_size
,
bias
=
False
,
indice_key
=
"subm0"
)
]
for
i
in
range
(
1
,
num_layers
):
layers
.
append
(
spconv
.
SubMConv3d
(
out_channels
,
out_channels
,
kernel_size
,
bias
=
False
))
self
.
net
=
nn
.
Sequential
(
*
layers
,
)
self
.
shape
=
shape
def
forward
(
self
,
features
,
coors
,
batch_size
):
coors
=
coors
.
int
()
x
=
spconv
.
SparseConvTensor
(
features
,
coors
,
self
.
shape
,
batch_size
)
return
self
.
net
(
x
)
class
SCNCoupleDeConvTest
(
nn
.
Module
):
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
in_channels
,
out_channels
,
kernel_size
,
stride
):
super
().
__init__
()
self
.
scn_input
=
scn
.
InputLayer
(
ndim
,
shape
,
mode
=
0
)
self
.
net
=
nn
.
Sequential
(
scn
.
Convolution
(
ndim
,
in_channels
,
out_channels
,
kernel_size
,
stride
,
bias
=
False
),
scn
.
Deconvolution
(
ndim
,
out_channels
,
in_channels
,
kernel_size
,
stride
,
bias
=
False
),
scn
.
SparseToDense
(
ndim
,
in_channels
),
)
def
forward
(
self
,
features
,
coors
,
batch_size
):
coors
=
coors
.
long
().
cpu
()
x
=
self
.
scn_input
((
coors
,
features
))
return
self
.
net
(
x
)
class
SparseCoupleDeConvTest
(
nn
.
Module
):
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
in_channels
,
out_channels
,
kernel_size
,
stride
):
super
().
__init__
()
self
.
net
=
spconv
.
SparseSequential
(
spconv
.
SparseConv3d
(
in_channels
,
out_channels
,
kernel_size
,
stride
,
indice_key
=
"cp0"
,
bias
=
False
),
spconv
.
SparseInverseConv3d
(
out_channels
,
in_channels
,
kernel_size
,
indice_key
=
"cp0"
,
bias
=
False
),
)
self
.
todense
=
spconv
.
ToDense
()
self
.
shape
=
shape
def
forward
(
self
,
features
,
coors
,
batch_size
):
coors
=
coors
.
int
()
x
=
spconv
.
SparseConvTensor
(
features
,
coors
,
self
.
shape
,
batch_size
)
return
self
.
todense
(
self
.
net
(
x
))
# .dense()
def
gather_nd
(
params
,
indices
):
def
gather_nd
(
params
,
indices
):
# this function has a limit that MAX_ADVINDEX_CALC_DIMS=5
# this function has a limit that MAX_ADVINDEX_CALC_DIMS=5
ndim
=
indices
.
shape
[
-
1
]
ndim
=
indices
.
shape
[
-
1
]
...
@@ -349,367 +230,147 @@ def scatter_nd(indices, updates, shape):
...
@@ -349,367 +230,147 @@ def scatter_nd(indices, updates, shape):
ret
[
slices
]
=
updates
.
view
(
*
output_shape
)
ret
[
slices
]
=
updates
.
view
(
*
output_shape
)
return
ret
return
ret
def
test_spconv3d
():
test_case
=
TestCase
()
np
.
random
.
seed
(
484
)
torch
.
manual_seed
(
48848
)
devices
=
[
"cuda:0"
]
shapes
=
[[
19
,
18
,
17
]]
batchsizes
=
[
1
,
2
]
class
TestSpConv
(
TestCase
):
in_channels
=
[
32
]
def
testSpConv3d
(
self
):
out_channels
=
[
32
,
48
,
64
]
np
.
random
.
seed
(
484
)
ksizes
=
[
2
,
3
]
torch
.
manual_seed
(
48848
)
strides
=
[
1
,
2
,
3
]
devices
=
[
"cuda:0"
]
paddings
=
[
0
,
1
,
2
]
shapes
=
[[
19
,
18
,
17
]]
dilations
=
[
1
,
2
,
3
]
batchsizes
=
[
1
,
2
]
algos
=
[
ConvAlgo
.
Native
,
ConvAlgo
.
MaskImplicitGemm
,
in_channels
=
[
32
]
ConvAlgo
.
MaskSplitImplicitGemm
out_channels
=
[
32
,
48
,
64
]
]
ksizes
=
[
2
,
3
]
# algos = [ConvAlgo.Native]
strides
=
[
1
,
2
,
3
]
paddings
=
[
0
,
1
,
2
]
for
dev
,
shape
,
bs
,
IC
,
OC
,
k
,
s
,
p
,
d
,
al
in
params_grid
(
dilations
=
[
1
,
2
,
3
]
devices
,
shapes
,
batchsizes
,
in_channels
,
out_channels
,
ksizes
,
algos
=
[
strides
,
paddings
,
dilations
,
algos
):
ConvAlgo
.
Native
,
ConvAlgo
.
MaskImplicitGemm
,
if
all
([
s
>
1
,
d
>
1
]):
ConvAlgo
.
MaskSplitImplicitGemm
continue
# don't support this.
]
# print(dev, shape, bs, IC, OC, k, s, p, d)
# algos = [ConvAlgo.Native]
device
=
torch
.
device
(
dev
)
num_points
=
[
1500
]
*
bs
for
dev
,
shape
,
bs
,
IC
,
OC
,
k
,
s
,
p
,
d
,
al
in
params_grid
(
dtype
=
torch
.
float32
devices
,
shapes
,
batchsizes
,
in_channels
,
out_channels
,
ksizes
,
net
=
SparseConv3dTestTorch
(
1
,
strides
,
paddings
,
dilations
,
algos
):
3
,
if
all
([
s
>
1
,
d
>
1
]):
shape
,
continue
# don't support this.
IC
,
# print(dev, shape, bs, IC, OC, k, s, p, d)
OC
,
device
=
torch
.
device
(
dev
)
k
,
num_points
=
[
1000
]
*
bs
s
,
dtype
=
torch
.
float32
p
,
net
=
SparseConv3dTestTorch
(
1
,
d
,
3
,
algo
=
al
).
to
(
device
).
to
(
dtype
)
shape
,
net_ref
=
Conv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
IC
,
d
).
to
(
device
).
to
(
dtype
)
OC
,
k
,
sparse_dict
=
generate_sparse_data
(
shape
,
num_points
,
IC
)
s
,
p
,
features
=
np
.
ascontiguousarray
(
sparse_dict
[
"features"
]).
astype
(
d
,
np
.
float32
)
algo
=
al
).
to
(
device
).
to
(
dtype
)
indices
=
np
.
ascontiguousarray
(
net_ref
=
Conv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
sparse_dict
[
"indices"
][:,
[
3
,
0
,
1
,
2
]]).
astype
(
np
.
int32
)
d
).
to
(
device
).
to
(
dtype
)
features_dense
=
sparse_dict
[
"features_dense"
].
astype
(
np
.
float32
)
indices_t
=
torch
.
from_numpy
(
indices
).
int
().
to
(
device
)
sparse_dict
=
generate_sparse_data
(
shape
,
num_points
,
IC
)
features_t
=
torch
.
from_numpy
(
features
).
to
(
device
).
to
(
dtype
)
features_t
.
requires_grad
=
True
features
=
np
.
ascontiguousarray
(
sparse_dict
[
"features"
]).
astype
(
features_dense_t
=
torch
.
from_numpy
(
features_dense
).
to
(
device
).
to
(
np
.
float32
)
dtype
)
indices
=
np
.
ascontiguousarray
(
features_dense_t
.
requires_grad
=
True
sparse_dict
[
"indices"
][:,
[
3
,
0
,
1
,
2
]]).
astype
(
np
.
int32
)
if
net
.
algo
==
ConvAlgo
.
Native
and
not
ALL_WEIGHT_IS_KRSC
:
features_dense
=
sparse_dict
[
"features_dense"
].
astype
(
np
.
float32
)
indices_t
=
torch
.
from_numpy
(
indices
).
int
().
to
(
device
)
features_t
=
torch
.
from_numpy
(
features
).
to
(
device
).
to
(
dtype
)
features_t
.
requires_grad
=
True
features_dense_t
=
torch
.
from_numpy
(
features_dense
).
to
(
device
).
to
(
dtype
)
features_dense_t
.
requires_grad
=
True
if
net
.
algo
==
ConvAlgo
.
Native
and
not
ALL_WEIGHT_IS_KRSC
:
if
FILTER_HWIO
:
filters
=
np
.
random
.
uniform
(
-
1
,
1
,
size
=
[
k
,
k
,
k
,
IC
,
OC
]).
astype
(
np
.
float32
)
else
:
filters
=
np
.
random
.
uniform
(
-
1
,
1
,
size
=
[
k
,
k
,
k
,
OC
,
IC
]).
astype
(
np
.
float32
)
filters_t
=
torch
.
from_numpy
(
filters
).
to
(
device
).
to
(
dtype
)
if
FILTER_HWIO
:
net_ref
.
net
[
0
].
weight
.
data
[:]
=
filters_t
.
permute
(
4
,
3
,
0
,
1
,
2
).
contiguous
()
else
:
net_ref
.
net
[
0
].
weight
.
data
[:]
=
filters_t
.
permute
(
3
,
4
,
0
,
1
,
2
).
contiguous
()
else
:
filters
=
np
.
random
.
uniform
(
-
1
,
1
,
size
=
[
OC
,
k
,
k
,
k
,
IC
]).
astype
(
np
.
float32
)
filters_t
=
torch
.
from_numpy
(
filters
).
to
(
device
).
to
(
dtype
)
net_ref
.
net
[
0
].
weight
.
data
[:]
=
filters_t
.
permute
(
0
,
4
,
1
,
2
,
3
).
contiguous
()
net
.
net
[
0
].
weight
.
data
[:]
=
filters_t
out_ref
=
net_ref
(
features_dense_t
)
out
=
net
(
features_t
,
indices_t
,
bs
).
dense
()
out_np
=
out
.
detach
().
cpu
().
numpy
()
out_ref_np
=
out_ref
.
detach
().
cpu
().
numpy
()
self
.
assertAllClose
(
out_np
,
out_ref_np
,
atol
=
1e-4
)
dout
=
np
.
random
.
uniform
(
-
0.2
,
0.2
,
out_ref
.
shape
).
astype
(
features
.
dtype
)
dout_t
=
torch
.
from_numpy
(
dout
).
to
(
device
)
out
.
backward
(
dout_t
)
out_ref
.
backward
(
dout_t
)
din_dense
=
features_dense_t
.
grad
.
detach
().
permute
(
0
,
2
,
3
,
4
,
1
).
contiguous
()
din_sparse
=
gather_nd
(
din_dense
,
indices_t
.
long
())
din
=
features_t
.
grad
.
detach
()
din_np
=
din
.
cpu
().
numpy
()
din_sparse_np
=
din_sparse
.
cpu
().
numpy
()
for
layer
,
layer_ref
in
zip
(
net
.
net
,
net_ref
.
net
):
dw
=
layer
.
weight
.
grad
.
detach
().
cpu
().
numpy
()
dw_ref
=
layer_ref
.
weight
.
grad
.
detach
().
cpu
().
numpy
()
if
net
.
algo
==
ConvAlgo
.
Native
and
not
ALL_WEIGHT_IS_KRSC
:
if
FILTER_HWIO
:
dw
=
dw
.
transpose
(
4
,
3
,
0
,
1
,
2
)
else
:
dw
=
dw
.
transpose
(
3
,
4
,
0
,
1
,
2
)
else
:
# OHWI -> OIHW
dw
=
dw
.
transpose
(
0
,
4
,
1
,
2
,
3
)
self
.
assertAllClose
(
dw
,
dw_ref
,
atol
=
1e-4
)
self
.
assertAllClose
(
din_np
,
din_sparse_np
,
atol
=
1e-4
)
def
testSpDeConv3d
(
self
):
np
.
random
.
seed
(
484
)
devices
=
[
"cuda:0"
]
shapes
=
[[
19
,
18
,
17
]]
batchsizes
=
[
1
,
2
]
in_channels
=
[
64
]
out_channels
=
[
32
,
48
,
64
]
ksizes
=
[
2
,
3
]
strides
=
[
2
,
3
]
paddings
=
[
0
,
1
,
2
]
dilations
=
[
1
,
2
,
3
]
ksizes
=
[
3
]
strides
=
[
1
]
paddings
=
[
0
]
dilations
=
[
1
]
for
dev
,
shape
,
bs
,
IC
,
OC
,
k
,
s
,
p
,
d
in
params_grid
(
devices
,
shapes
,
batchsizes
,
in_channels
,
out_channels
,
ksizes
,
strides
,
paddings
,
dilations
):
if
all
([
s
>
1
,
d
>
1
]):
continue
# don't support this.
device
=
torch
.
device
(
dev
)
num_points
=
[
1000
]
*
bs
sparse_dict
=
generate_sparse_data
(
shape
,
num_points
,
IC
)
features
=
np
.
ascontiguousarray
(
sparse_dict
[
"features"
]).
astype
(
np
.
float32
)
indices
=
np
.
ascontiguousarray
(
sparse_dict
[
"indices"
][:,
[
3
,
0
,
1
,
2
]]).
astype
(
np
.
int32
)
features_dense
=
sparse_dict
[
"features_dense"
].
astype
(
np
.
float32
)
if
FILTER_HWIO
:
if
FILTER_HWIO
:
filters
=
np
.
random
.
uniform
(
0
,
1
,
size
=
[
k
,
k
,
k
,
IC
,
filters
=
np
.
random
.
uniform
(
-
1
,
1
,
OC
]).
astype
(
np
.
float32
)
size
=
[
k
,
k
,
k
,
IC
,
OC
]).
astype
(
np
.
float32
)
else
:
else
:
filters
=
np
.
random
.
uniform
(
0
,
1
,
size
=
[
k
,
k
,
k
,
OC
,
filters
=
np
.
random
.
uniform
(
-
1
,
1
,
IC
]).
astype
(
np
.
float32
)
size
=
[
k
,
k
,
k
,
OC
,
IC
]).
astype
(
np
.
float32
)
indices_t
=
torch
.
from_numpy
(
indices
).
int
().
to
(
device
)
filters_t
=
torch
.
from_numpy
(
filters
).
to
(
device
).
to
(
dtype
)
features_t
=
torch
.
from_numpy
(
features
).
to
(
device
)
features_t
.
requires_grad
=
True
features_dense_t
=
torch
.
from_numpy
(
features_dense
).
to
(
device
)
features_dense_t
.
requires_grad
=
True
net
=
SparseDeConv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
d
).
to
(
device
)
net_ref
=
DeConv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
d
).
to
(
device
)
filters_t
=
torch
.
from_numpy
(
filters
).
to
(
device
)
print
(
net_ref
.
net
[
0
].
weight
.
shape
)
if
FILTER_HWIO
:
if
FILTER_HWIO
:
net_ref
.
net
[
0
].
weight
.
data
[:]
=
filters_t
.
permute
(
net_ref
.
net
[
0
].
weight
.
data
[:]
=
filters_t
.
permute
(
3
,
4
,
0
,
1
,
2
).
contiguous
()
4
,
3
,
0
,
1
,
2
).
contiguous
()
else
:
else
:
net_ref
.
net
[
0
].
weight
.
data
[:]
=
filters_t
.
permute
(
net_ref
.
net
[
0
].
weight
.
data
[:]
=
filters_t
.
permute
(
4
,
3
,
0
,
1
,
2
).
contiguous
()
3
,
4
,
0
,
1
,
2
).
contiguous
()
net
.
net
[
0
].
weight
.
data
[:]
=
filters_t
else
:
out_ref
=
net_ref
(
features_dense_t
)
filters
=
np
.
random
.
uniform
(
-
1
,
1
,
out
=
net
(
features_t
,
indices_t
,
bs
).
dense
()
size
=
[
OC
,
k
,
k
,
k
,
out_np
=
out
.
detach
().
cpu
().
numpy
()
IC
]).
astype
(
np
.
float32
)
out_ref_np
=
out_ref
.
detach
().
cpu
().
numpy
()
filters_t
=
torch
.
from_numpy
(
filters
).
to
(
device
).
to
(
dtype
)
self
.
assertAllClose
(
out_np
,
out_ref_np
,
atol
=
1e-4
)
net_ref
.
net
[
0
].
weight
.
data
[:]
=
filters_t
.
permute
(
0
,
4
,
1
,
2
,
3
).
contiguous
()
dout
=
np
.
random
.
uniform
(
-
0.2
,
0.2
,
out_ref
.
shape
).
astype
(
features
.
dtype
)
net
.
net
[
0
].
weight
.
data
[:]
=
filters_t
dout_t
=
torch
.
from_numpy
(
dout
).
to
(
device
)
out_ref
=
net_ref
(
features_dense_t
)
out
.
backward
(
dout_t
)
out
=
net
(
features_t
,
indices_t
,
bs
).
dense
()
out_ref
.
backward
(
dout_t
)
out_np
=
out
.
detach
().
cpu
().
numpy
()
din_dense
=
features_dense_t
.
grad
.
detach
().
permute
(
0
,
2
,
3
,
4
,
out_ref_np
=
out_ref
.
detach
().
cpu
().
numpy
()
1
).
contiguous
()
test_case
.
assertAllClose
(
out_np
,
out_ref_np
,
atol
=
1e-4
)
din_sparse
=
gather_nd
(
din_dense
,
indices_t
.
long
())
din
=
features_t
.
grad
.
detach
()
dout
=
np
.
random
.
uniform
(
-
0.2
,
0.2
,
din_np
=
din
.
cpu
().
numpy
()
out_ref
.
shape
).
astype
(
features
.
dtype
)
din_sparse_np
=
din_sparse
.
cpu
().
numpy
()
dout_t
=
torch
.
from_numpy
(
dout
).
to
(
device
)
self
.
assertAllClose
(
din_np
,
din_sparse_np
,
atol
=
1e-4
)
out
.
backward
(
dout_t
)
for
layer
,
layer_ref
in
zip
(
net
.
net
,
net_ref
.
net
):
out_ref
.
backward
(
dout_t
)
dw
=
layer
.
weight
.
grad
.
detach
().
cpu
().
numpy
()
din_dense
=
features_dense_t
.
grad
.
detach
().
permute
(
0
,
2
,
3
,
4
,
dw_ref
=
layer_ref
.
weight
.
grad
.
detach
().
cpu
().
numpy
()
1
).
contiguous
()
din_sparse
=
gather_nd
(
din_dense
,
indices_t
.
long
())
din
=
features_t
.
grad
.
detach
()
din_np
=
din
.
cpu
().
numpy
()
din_sparse_np
=
din_sparse
.
cpu
().
numpy
()
for
layer
,
layer_ref
in
zip
(
net
.
net
,
net_ref
.
net
):
dw
=
layer
.
weight
.
grad
.
detach
().
cpu
().
numpy
()
dw_ref
=
layer_ref
.
weight
.
grad
.
detach
().
cpu
().
numpy
()
if
net
.
algo
==
ConvAlgo
.
Native
and
not
ALL_WEIGHT_IS_KRSC
:
if
FILTER_HWIO
:
if
FILTER_HWIO
:
dw
=
dw
.
transpose
(
3
,
4
,
0
,
1
,
2
)
else
:
dw
=
dw
.
transpose
(
4
,
3
,
0
,
1
,
2
)
dw
=
dw
.
transpose
(
4
,
3
,
0
,
1
,
2
)
self
.
assertAllClose
(
dw
,
dw_ref
,
atol
=
1e-4
)
else
:
dw
=
dw
.
transpose
(
3
,
4
,
0
,
1
,
2
)
def
testSpCpConv3d
(
self
):
else
:
np
.
random
.
seed
(
484
)
# OHWI -> OIHW
devices
=
[
"cuda:0"
,
"cpu:0"
]
dw
=
dw
.
transpose
(
0
,
4
,
1
,
2
,
3
)
shapes
=
[[
20
,
20
,
20
]]
batchsizes
=
[
1
,
2
]
test_case
.
assertAllClose
(
dw
,
dw_ref
,
atol
=
1e-4
)
test_case
.
assertAllClose
(
din_np
,
din_sparse_np
,
atol
=
1e-4
)
in_channels
=
[
64
]
out_channels
=
[
32
,
48
,
64
]
ksizes
=
[
2
]
strides
=
[
2
]
paddings
=
[
0
,
1
,
2
]
dilations
=
[
1
,
2
,
3
]
for
dev
,
shape
,
bs
,
IC
,
OC
,
k
,
s
in
params_grid
(
devices
,
shapes
,
batchsizes
,
in_channels
,
out_channels
,
ksizes
,
strides
):
device
=
torch
.
device
(
dev
)
num_points
=
[
1000
]
*
bs
sparse_dict
=
generate_sparse_data
(
shape
,
num_points
,
IC
)
features
=
np
.
ascontiguousarray
(
sparse_dict
[
"features"
]).
astype
(
np
.
float32
)
indices
=
np
.
ascontiguousarray
(
sparse_dict
[
"indices"
][:,
[
3
,
0
,
1
,
2
]]).
astype
(
np
.
int32
)
features_dense
=
sparse_dict
[
"features_dense"
].
astype
(
np
.
float32
)
filters
=
np
.
random
.
uniform
(
0
,
1
,
size
=
[
k
,
k
,
k
,
IC
,
OC
]).
astype
(
np
.
float32
)
indices_t
=
torch
.
from_numpy
(
indices
).
int
().
to
(
device
)
indices_scn_t
=
torch
.
from_numpy
(
indices
[:,
[
1
,
2
,
3
,
0
]]).
int
().
to
(
device
)
features_t
=
torch
.
from_numpy
(
features
).
to
(
device
)
features_t
.
requires_grad
=
True
features_ref_t
=
torch
.
from_numpy
(
features
).
to
(
device
)
features_ref_t
.
requires_grad
=
True
net_ref
=
SCNCoupleDeConvTest
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
).
to
(
device
)
net
=
SparseCoupleDeConvTest
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
).
to
(
device
)
net_ref
.
net
[
0
].
weight
.
data
[:]
=
net
.
net
[
0
].
weight
.
data
[:].
view
(
*
net_ref
.
net
[
0
].
weight
.
shape
)
net_ref
.
net
[
1
].
weight
.
data
[:]
=
net
.
net
[
1
].
weight
.
data
[:].
view
(
*
net_ref
.
net
[
1
].
weight
.
shape
)
out_ref
=
net_ref
(
features_ref_t
,
indices_scn_t
,
bs
)
out
=
net
(
features_t
,
indices_t
,
bs
)
dout
=
np
.
random
.
uniform
(
-
0.2
,
0.2
,
out_ref
.
shape
).
astype
(
features
.
dtype
)
dout_t
=
torch
.
from_numpy
(
dout
).
to
(
device
)
out
.
backward
(
dout_t
)
out_ref
.
backward
(
dout_t
)
din
=
features_t
.
grad
.
detach
()
din_ref
=
features_ref_t
.
grad
.
detach
()
din_np
=
din
.
cpu
().
numpy
()
din_ref_np
=
din_ref
.
cpu
().
numpy
()
self
.
assertAllClose
(
din_ref_np
,
din_np
,
atol
=
1e-4
)
for
layer
,
layer_ref
in
zip
(
net
.
net
,
net_ref
.
net
):
dw
=
layer
.
weight
.
grad
.
detach
().
cpu
().
numpy
()
dw_ref
=
layer_ref
.
weight
.
grad
.
detach
().
cpu
().
view
(
*
dw
.
shape
).
numpy
()
self
.
assertAllClose
(
dw
,
dw_ref
,
atol
=
1e-4
)
out_np
=
out
.
detach
().
cpu
().
numpy
()
out_ref_np
=
out_ref
.
detach
().
cpu
().
numpy
()
self
.
assertAllClose
(
out_np
,
out_ref_np
,
atol
=
1e-4
)
def
testSpMaxPool3d
(
self
):
np
.
random
.
seed
(
485
)
devices
=
[
"cuda:0"
]
shapes
=
[[
19
,
18
,
17
]]
batchsizes
=
[
1
,
2
]
in_channels
=
[
64
]
out_channels
=
[
64
]
ksizes
=
[
2
,
3
]
strides
=
[
1
,
2
,
3
]
paddings
=
[
0
,
1
]
dilations
=
[
1
,
2
,
3
]
# ksizes = [2]
# strides = [2]
# paddings = [0]
# dilations = [1]
for
dev
,
shape
,
bs
,
IC
,
OC
,
k
,
s
,
p
,
d
in
params_grid
(
devices
,
shapes
,
batchsizes
,
in_channels
,
out_channels
,
ksizes
,
strides
,
paddings
,
dilations
):
if
all
([
s
>
1
,
d
>
1
]):
continue
# don't support this.
device
=
torch
.
device
(
dev
)
num_points
=
[
1000
]
*
bs
# when data contains negative, sparse maxpool is not equal to dense maxpool.
sparse_dict
=
generate_sparse_data
(
shape
,
num_points
,
IC
,
data_range
=
[
0.1
,
1
])
features
=
np
.
ascontiguousarray
(
sparse_dict
[
"features"
]).
astype
(
np
.
float32
)
indices
=
np
.
ascontiguousarray
(
sparse_dict
[
"indices"
][:,
[
3
,
0
,
1
,
2
]]).
astype
(
np
.
int32
)
features_dense
=
sparse_dict
[
"features_dense"
].
astype
(
np
.
float32
)
filters
=
np
.
random
.
uniform
(
0
,
1
,
size
=
[
k
,
k
,
k
,
OC
,
IC
]).
astype
(
np
.
float32
)
indices_t
=
torch
.
from_numpy
(
indices
).
int
().
to
(
device
)
features_t
=
torch
.
from_numpy
(
features
).
to
(
device
)
features_t
.
requires_grad
=
True
features_dense_t
=
torch
.
from_numpy
(
features_dense
).
to
(
device
)
features_dense_t
.
requires_grad
=
True
net
=
SparseMaxPoolTestTorch
(
1
,
3
,
shape
,
k
,
s
,
p
,
d
).
to
(
device
)
net_ref
=
MaxPool3dTestTorch
(
1
,
3
,
shape
,
k
,
s
,
p
,
d
).
to
(
device
)
out_ref
=
net_ref
(
features_dense_t
)
out
=
net
(
features_t
,
indices_t
,
bs
)
outids
=
out
.
indices
outfeatures
=
out
.
features
outids_dev
=
outids
.
float
()
out_dense
=
out
.
dense
(
channels_first
=
False
)
out
=
out_dense
.
permute
(
0
,
4
,
1
,
2
,
3
).
contiguous
()
out_np
=
out
.
detach
().
cpu
().
numpy
()
out_ref_np
=
out_ref
.
detach
().
cpu
().
numpy
()
self
.
assertAllClose
(
out_np
,
out_ref_np
,
atol
=
1e-4
)
dout_sparse
=
np
.
random
.
uniform
(
-
0.2
,
0.2
,
outfeatures
.
shape
).
astype
(
features
.
dtype
)
dout_sparse_t
=
torch
.
from_numpy
(
dout_sparse
).
to
(
device
)
dout_t
=
scatter_nd
(
outids
.
long
(),
dout_sparse_t
,
list
(
out_dense
.
shape
))
dout_t
=
dout_t
.
permute
(
0
,
4
,
1
,
2
,
3
).
contiguous
()
out
.
backward
(
dout_t
)
out_ref
.
backward
(
dout_t
)
din_dense
=
features_dense_t
.
grad
.
detach
().
permute
(
0
,
2
,
3
,
4
,
1
).
contiguous
()
din_sparse
=
gather_nd
(
din_dense
,
indices_t
.
long
())
din
=
features_t
.
grad
.
detach
()
din_np
=
din
.
cpu
().
numpy
()
din_sparse_np
=
din_sparse
.
cpu
().
numpy
()
self
.
assertAllClose
(
din_np
,
din_sparse_np
,
atol
=
1e-4
)
def
main
(
algo
=
spconv
.
ConvAlgo
.
Native
,
dtype
=
torch
.
float32
):
# function for develop.
np
.
random
.
seed
(
484
)
# devices = ["cuda:0"]
devices
=
[
"cuda:0"
]
shapes
=
[[
400
,
400
,
15
]]
batchsizes
=
[
2
]
in_channels
=
[
19
]
def
test_spdeconv3d
():
out_channels
=
[
17
]
test_case
=
TestCase
()
ksizes
=
[(
3
,
3
,
3
)]
strides
=
[
1
]
paddings
=
[
0
]
dilations
=
[
1
]
for
dev
,
shape
,
bs
,
IC
,
OC
,
k
,
s
,
p
,
d
in
params_grid
(
np
.
random
.
seed
(
484
)
devices
=
[
"cuda:0"
]
shapes
=
[[
19
,
18
,
17
]]
batchsizes
=
[
1
,
2
]
in_channels
=
[
64
]
out_channels
=
[
32
,
48
,
64
]
ksizes
=
[
2
,
3
]
strides
=
[
2
,
3
]
paddings
=
[
0
,
1
,
2
]
dilations
=
[
1
,
2
,
3
]
algos
=
[
ConvAlgo
.
Native
,
ConvAlgo
.
MaskImplicitGemm
,
ConvAlgo
.
MaskSplitImplicitGemm
]
for
dev
,
shape
,
bs
,
IC
,
OC
,
k
,
s
,
p
,
d
,
al
in
params_grid
(
devices
,
shapes
,
batchsizes
,
in_channels
,
out_channels
,
ksizes
,
devices
,
shapes
,
batchsizes
,
in_channels
,
out_channels
,
ksizes
,
strides
,
paddings
,
dilations
):
strides
,
paddings
,
dilations
,
algos
):
if
all
([
s
>
1
,
d
>
1
]):
if
all
([
s
>
1
,
d
>
1
]):
continue
continue
# don't support this.
device
=
torch
.
device
(
dev
)
device
=
torch
.
device
(
dev
)
num_points
=
[
30000
]
*
bs
num_points
=
[
1000
]
*
bs
dtype
=
torch
.
float32
sparse_dict
=
generate_sparse_data
(
shape
,
num_points
,
IC
)
sparse_dict
=
generate_sparse_data
(
shape
,
num_points
,
IC
)
...
@@ -718,115 +379,154 @@ def main(algo=spconv.ConvAlgo.Native, dtype=torch.float32):
...
@@ -718,115 +379,154 @@ def main(algo=spconv.ConvAlgo.Native, dtype=torch.float32):
indices
=
np
.
ascontiguousarray
(
indices
=
np
.
ascontiguousarray
(
sparse_dict
[
"indices"
][:,
[
3
,
0
,
1
,
2
]]).
astype
(
np
.
int32
)
sparse_dict
[
"indices"
][:,
[
3
,
0
,
1
,
2
]]).
astype
(
np
.
int32
)
features_dense
=
sparse_dict
[
"features_dense"
].
astype
(
np
.
float32
)
features_dense
=
sparse_dict
[
"features_dense"
].
astype
(
np
.
float32
)
indices_t
=
torch
.
from_numpy
(
indices
)
net
=
SparseDeConv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
filters
=
np
.
random
.
uniform
(
0
,
1
,
size
=
[
k
[
0
],
1
,
1
,
IC
,
d
,
al
).
to
(
device
)
OC
]).
astype
(
np
.
float32
)
net_ref
=
DeConv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
indices_t
=
torch
.
from_numpy
(
indices
).
int
().
to
(
device
).
to
(
dtype
)
d
).
to
(
device
)
features_t
=
torch
.
from_numpy
(
features
).
to
(
device
).
to
(
dtype
)
features_dense_t
=
torch
.
from_numpy
(
features_dense
).
to
(
device
).
to
(
if
net
.
algo
==
ConvAlgo
.
Native
and
not
ALL_WEIGHT_IS_KRSC
:
dtype
)
if
FILTER_HWIO
:
net
=
SparseConv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
d
,
filters
=
np
.
random
.
uniform
(
-
1
,
1
,
algo
=
algo
).
to
(
device
).
to
(
dtype
)
size
=
[
k
,
k
,
k
,
IC
,
net_ref
=
Conv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
OC
]).
astype
(
np
.
float32
)
d
).
to
(
device
).
to
(
dtype
)
else
:
filters_t
=
torch
.
from_numpy
(
filters
).
to
(
device
).
to
(
dtype
)
filters
=
np
.
random
.
uniform
(
-
1
,
1
,
net_ref
.
net
[
0
].
weight
[:]
=
filters_t
.
permute
(
4
,
3
,
0
,
1
,
size
=
[
k
,
k
,
k
,
OC
,
2
).
contiguous
()
IC
]).
astype
(
np
.
float32
)
net
.
net
[
0
].
weight
[:]
=
filters_t
filters_t
=
torch
.
from_numpy
(
filters
).
to
(
device
).
to
(
dtype
)
if
FILTER_HWIO
:
net_ref
.
net
[
0
].
weight
.
data
[:]
=
filters_t
.
permute
(
3
,
4
,
0
,
1
,
2
).
contiguous
()
else
:
net_ref
.
net
[
0
].
weight
.
data
[:]
=
filters_t
.
permute
(
4
,
3
,
0
,
1
,
2
).
contiguous
()
else
:
filters
=
np
.
random
.
uniform
(
-
1
,
1
,
size
=
[
OC
,
k
,
k
,
k
,
IC
]).
astype
(
np
.
float32
)
filters_t
=
torch
.
from_numpy
(
filters
).
to
(
device
).
to
(
dtype
)
net_ref
.
net
[
0
].
weight
.
data
[:]
=
filters_t
.
permute
(
4
,
0
,
1
,
2
,
3
).
contiguous
()
net
.
net
[
0
].
weight
.
data
[:]
=
filters_t
indices_t
=
torch
.
from_numpy
(
indices
).
int
().
to
(
device
)
features_t
=
torch
.
from_numpy
(
features
).
to
(
device
)
features_t
.
requires_grad
=
True
features_dense_t
=
torch
.
from_numpy
(
features_dense
).
to
(
device
)
features_dense_t
.
requires_grad
=
True
filters_t
=
torch
.
from_numpy
(
filters
).
to
(
device
)
out_ref
=
net_ref
(
features_dense_t
)
out_ref
=
net_ref
(
features_dense_t
)
times
=
[]
out
=
net
(
features_t
,
indices_t
,
bs
).
dense
()
for
i
in
range
(
10
):
out_np
=
out
.
detach
().
cpu
().
numpy
()
t
=
time
.
time
()
out_ref_np
=
out_ref
.
detach
().
cpu
().
numpy
()
out
=
net
(
features_t
,
indices_t
,
bs
)
test_case
.
assertAllClose
(
out_np
,
out_ref_np
,
atol
=
1e-4
)
torch
.
cuda
.
synchronize
()
times
.
append
(
time
.
time
()
-
t
)
dout
=
np
.
random
.
uniform
(
-
0.2
,
0.2
,
# print((net.grid == -1).float().sum(), net.grid.numel())
out_ref
.
shape
).
astype
(
features
.
dtype
)
# print("spconv time", time.time() - t)
dout_t
=
torch
.
from_numpy
(
dout
).
to
(
device
)
print
(
"spconv time"
,
np
.
mean
(
times
[
2
:]))
out
.
backward
(
dout_t
)
out
=
net
(
features_t
,
indices_t
,
bs
)
out_ref
.
backward
(
dout_t
)
# print(out.indices)
din_dense
=
features_dense_t
.
grad
.
detach
().
permute
(
0
,
2
,
3
,
4
,
out
=
out
.
dense
()
1
).
contiguous
()
out_numpy
=
out
.
detach
().
cpu
().
numpy
()
din_sparse
=
gather_nd
(
din_dense
,
indices_t
.
long
())
din
=
features_t
.
grad
.
detach
()
print
(
din_np
=
din
.
cpu
().
numpy
()
np
.
linalg
.
norm
(
out
.
detach
().
cpu
().
numpy
()
-
din_sparse_np
=
din_sparse
.
cpu
().
numpy
()
out_ref
.
detach
().
cpu
().
numpy
()))
test_case
.
assertAllClose
(
din_np
,
din_sparse_np
,
atol
=
1e-4
)
print
(
out_numpy
.
min
(),
out_numpy
.
max
(),
out_numpy
.
mean
(),
for
layer
,
layer_ref
in
zip
(
net
.
net
,
net_ref
.
net
):
out_numpy
.
sum
())
dw
=
layer
.
weight
.
grad
.
detach
().
cpu
().
numpy
()
dw_ref
=
layer_ref
.
weight
.
grad
.
detach
().
cpu
().
numpy
()
if
net
.
algo
==
ConvAlgo
.
Native
and
not
ALL_WEIGHT_IS_KRSC
:
if
FILTER_HWIO
:
dw
=
dw
.
transpose
(
3
,
4
,
0
,
1
,
2
)
else
:
dw
=
dw
.
transpose
(
4
,
3
,
0
,
1
,
2
)
else
:
# OHWI -> OIHW
dw
=
dw
.
transpose
(
4
,
0
,
1
,
2
,
3
)
test_case
.
assertAllClose
(
dw
,
dw_ref
,
atol
=
1e-4
)
def
test_spmaxpool3d
():
test_case
=
TestCase
()
def
main_subm
(
algo
,
dtype
=
torch
.
float32
):
np
.
random
.
seed
(
485
)
# function for develop.
np
.
random
.
seed
(
484
)
torch
.
manual_seed
(
50051
)
# devices = ["cuda:0"]
devices
=
[
"cuda:0"
]
devices
=
[
"cuda:0"
]
shapes
=
[[
400
,
400
,
1
5
]]
shapes
=
[[
19
,
18
,
1
7
]]
batchsizes
=
[
2
]
batchsizes
=
[
1
,
2
]
in_channels
=
[
32
]
in_channels
=
[
64
]
out_channels
=
[
64
]
out_channels
=
[
64
]
ksizes
=
[(
3
,
3
,
3
)]
ksizes
=
[
2
,
3
]
strides
=
[
1
]
strides
=
[
1
,
2
,
3
]
paddings
=
[
1
]
paddings
=
[
0
,
1
]
dilations
=
[
1
]
dilations
=
[
1
,
2
,
3
]
for
dev
,
shape
,
bs
,
IC
,
OC
,
k
,
s
,
p
,
d
in
params_grid
(
# ksizes = [2]
# strides = [2]
# paddings = [0]
# dilations = [1]
algos
=
[
ConvAlgo
.
Native
,
ConvAlgo
.
MaskImplicitGemm
,
ConvAlgo
.
MaskSplitImplicitGemm
]
for
dev
,
shape
,
bs
,
IC
,
OC
,
k
,
s
,
p
,
d
,
al
in
params_grid
(
devices
,
shapes
,
batchsizes
,
in_channels
,
out_channels
,
ksizes
,
devices
,
shapes
,
batchsizes
,
in_channels
,
out_channels
,
ksizes
,
strides
,
paddings
,
dilations
):
strides
,
paddings
,
dilations
,
algos
):
if
all
([
s
>
1
,
d
>
1
]):
if
all
([
s
>
1
,
d
>
1
]):
continue
continue
# don't support this.
device
=
torch
.
device
(
dev
)
device
=
torch
.
device
(
dev
)
num_points
=
[
1
20
000
]
*
bs
num_points
=
[
1000
]
*
bs
sparse_dict
=
generate_sparse_data
(
shape
,
num_points
,
IC
)
# when data contains negative, sparse maxpool is not equal to dense maxpool.
sparse_dict
=
generate_sparse_data
(
shape
,
num_points
,
IC
,
data_range
=
[
0.1
,
1
])
features
=
np
.
ascontiguousarray
(
sparse_dict
[
"features"
]).
astype
(
features
=
np
.
ascontiguousarray
(
sparse_dict
[
"features"
]).
astype
(
np
.
float32
)
np
.
float32
)
indices
=
np
.
ascontiguousarray
(
indices
=
np
.
ascontiguousarray
(
sparse_dict
[
"indices"
][:,
[
3
,
0
,
1
,
2
]]).
astype
(
np
.
int32
)
sparse_dict
[
"indices"
][:,
[
3
,
0
,
1
,
2
]]).
astype
(
np
.
int32
)
features_dense
=
sparse_dict
[
"features_dense"
].
astype
(
np
.
float32
)
features_dense
=
sparse_dict
[
"features_dense"
].
astype
(
np
.
float32
)
indices_t
=
torch
.
from_numpy
(
indices
)
indices_t
=
torch
.
from_numpy
(
indices
).
int
().
to
(
device
)
filters
=
np
.
random
.
uniform
(
0
,
1
,
size
=
[
k
[
0
],
1
,
1
,
IC
,
features_t
=
torch
.
from_numpy
(
features
).
to
(
device
)
OC
]).
astype
(
np
.
float32
)
features_t
.
requires_grad
=
True
indices_t
=
torch
.
from_numpy
(
indices
).
int
().
to
(
device
).
to
(
dtype
)
features_dense_t
=
torch
.
from_numpy
(
features_dense
).
to
(
device
)
features_t
=
torch
.
from_numpy
(
features
).
to
(
device
).
to
(
dtype
)
features_dense_t
.
requires_grad
=
True
net
=
SparseMaxPoolTestTorch
(
1
,
3
,
shape
,
k
,
s
,
p
,
d
,
al
).
to
(
device
)
net_ref
=
MaxPool3dTestTorch
(
1
,
3
,
shape
,
k
,
s
,
p
,
d
).
to
(
device
)
features_dense_t
=
torch
.
from_numpy
(
features_dense
).
to
(
device
).
to
(
dtype
)
net
=
SubMConv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
d
,
algo
=
algo
).
to
(
device
).
to
(
dtype
)
net_ref
=
Conv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
d
).
to
(
device
).
to
(
dtype
)
filters_t
=
torch
.
from_numpy
(
filters
).
to
(
device
).
to
(
dtype
)
net_ref
.
net
[
0
].
weight
[:]
=
filters_t
.
permute
(
4
,
3
,
0
,
1
,
2
).
contiguous
()
net
.
net
[
0
].
weight
[:]
=
filters_t
out_ref
=
net_ref
(
features_dense_t
)
out_ref
=
net_ref
(
features_dense_t
)
times
=
[]
for
i
in
range
(
20
):
t
=
time
.
time
()
out
=
net
(
features_t
,
indices_t
,
bs
)
torch
.
cuda
.
synchronize
()
times
.
append
(
time
.
time
()
-
t
)
# print((net.grid == -1).float().sum(), net.grid.numel())
# print("spconv time", time.time() - t)
print
(
"spconv time"
,
np
.
mean
(
times
[
10
:]))
out
=
net
(
features_t
,
indices_t
,
bs
)
out
=
net
(
features_t
,
indices_t
,
bs
)
# print(out.indices)
out
=
out
.
dense
()
outids
=
out
.
indices
out_numpy
=
out
.
detach
().
cpu
().
numpy
()
outfeatures
=
out
.
features
# print(
outids_dev
=
outids
.
float
()
# np.linalg.norm(out.detach().cpu().numpy() -
out_dense
=
out
.
dense
(
channels_first
=
False
)
# out_ref.detach().cpu().numpy()))
out
=
out_dense
.
permute
(
0
,
4
,
1
,
2
,
3
).
contiguous
()
print
(
out_numpy
.
min
(),
out_numpy
.
max
(),
out_numpy
.
mean
(),
out_np
=
out
.
detach
().
cpu
().
numpy
()
out_numpy
.
sum
())
out_ref_np
=
out_ref
.
detach
().
cpu
().
numpy
()
return
out_numpy
test_case
.
assertAllClose
(
out_np
,
out_ref_np
,
atol
=
1e-4
)
dout_sparse
=
np
.
random
.
uniform
(
if
__name__
==
'__main__'
:
-
0.2
,
0.2
,
outfeatures
.
shape
).
astype
(
features
.
dtype
)
# main_subm(algo=spconv.ConvAlgo.SparseConvNet, dtype=torch.float32)
dout_sparse_t
=
torch
.
from_numpy
(
dout_sparse
).
to
(
device
)
# main(algo=spconv.ConvAlgo.SparseConvNet, dtype=torch.float32)
dout_t
=
scatter_nd
(
outids
.
long
(),
dout_sparse_t
,
# TestCase().assertAllClose(out_my, out_ref)
list
(
out_dense
.
shape
))
# unittest.main()
dout_t
=
dout_t
.
permute
(
0
,
4
,
1
,
2
,
3
).
contiguous
()
TestSpConv
().
testSpConv3d
()
out
.
backward
(
dout_t
)
out_ref
.
backward
(
dout_t
)
din_dense
=
features_dense_t
.
grad
.
detach
().
permute
(
0
,
2
,
3
,
4
,
1
).
contiguous
()
din_sparse
=
gather_nd
(
din_dense
,
indices_t
.
long
())
din
=
features_t
.
grad
.
detach
()
din_np
=
din
.
cpu
().
numpy
()
din_sparse_np
=
din_sparse
.
cpu
().
numpy
()
test_case
.
assertAllClose
(
din_np
,
din_sparse_np
,
atol
=
1e-4
)
if
__name__
==
"__main__"
:
test_spmaxpool3d
()
\ No newline at end of file
test/test_implgemm.py
deleted
100644 → 0
View file @
4791f582
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
spconv.core_cc.csrc.sparse.all
import
SpconvOps
test/test_multi_impl.py
View file @
bf011c76
...
@@ -12,9 +12,330 @@
...
@@ -12,9 +12,330 @@
# See the License for the specific language governing permissions and
# See the License for the specific language governing permissions and
# limitations under the License.
# limitations under the License.
"""Compare results between different algo:
"""Compare results between different algo
s
:
CPU: gather-mm-scatter
CPU:
simple
gather-mm-scatter
Native: Fused gather-mm-scatter
Native: Fused gather-mm-scatter
ImplicitGemm
ImplicitGemm
: implicit gemm
"""
"""
import
time
from
pathlib
import
Path
import
numpy
as
np
import
torch
from
torch
import
nn
from
cumm
import
tensorview
as
tv
from
spconv.core
import
ConvAlgo
import
spconv.pytorch
as
spconv
import
pickle
from
spconv.test_utils
import
generate_sparse_data
,
params_grid
class
Net
(
nn
.
Module
):
def
__init__
(
self
,
shape
,
algo
):
super
().
__init__
()
pool_algo
=
algo
# pool_algo = ConvAlgo.Native
self
.
net
=
spconv
.
SparseSequential
(
spconv
.
SubMConv3d
(
3
,
32
,
3
,
bias
=
False
,
indice_key
=
"c0"
,
algo
=
algo
),
spconv
.
SubMConv3d
(
32
,
32
,
3
,
bias
=
False
,
indice_key
=
"c0"
,
algo
=
algo
),
# # nn.BatchNorm1d(32),
# # nn.ReLU(),
spconv
.
SubMConv3d
(
32
,
64
,
3
,
bias
=
False
,
indice_key
=
"c0"
,
algo
=
algo
),
spconv
.
SubMConv3d
(
64
,
64
,
3
,
bias
=
False
,
indice_key
=
"c0"
,
algo
=
algo
),
# nn.BatchNorm1d(32),
# # nn.ReLU(),
spconv
.
SparseConv3d
(
64
,
64
,
3
,
2
,
1
,
bias
=
False
,
indice_key
=
"m0"
,
algo
=
algo
),
# # spconv.SparseMaxPool3d(2, 2, algo=pool_algo),
spconv
.
SubMConv3d
(
64
,
96
,
3
,
bias
=
False
,
indice_key
=
"c1"
,
algo
=
algo
),
spconv
.
SubMConv3d
(
96
,
96
,
3
,
bias
=
False
,
indice_key
=
"c1"
,
algo
=
algo
),
# nn.BatchNorm1d(64),
# nn.ReLU(),
spconv
.
SparseConv3d
(
96
,
96
,
2
,
2
,
bias
=
False
,
indice_key
=
"m1"
,
algo
=
algo
),
# spconv.SparseMaxPool3d(2, 2, algo=pool_algo),
spconv
.
SubMConv3d
(
96
,
128
,
3
,
bias
=
False
,
indice_key
=
"c2"
,
algo
=
algo
),
spconv
.
SubMConv3d
(
128
,
128
,
3
,
bias
=
False
,
indice_key
=
"c2"
,
algo
=
algo
),
# nn.BatchNorm1d(128),
# nn.ReLU(),
# spconv.SparseConv3d(128, 128, 2, 2, bias=False, indice_key="m2"),
spconv
.
SparseMaxPool3d
(
2
,
2
,
algo
=
pool_algo
),
spconv
.
SubMConv3d
(
128
,
160
,
3
,
bias
=
False
,
indice_key
=
"c3"
,
algo
=
algo
),
spconv
.
SubMConv3d
(
160
,
160
,
3
,
bias
=
False
,
indice_key
=
"c3"
,
algo
=
algo
),
# nn.BatchNorm1d(128),
# nn.ReLU(),
# spconv.SparseConv3d(160, 160, 2, 2, bias=False, indice_key="m3"),
spconv
.
SparseMaxPool3d
(
2
,
2
,
algo
=
pool_algo
,
indice_key
=
"m3"
),
spconv
.
SubMConv3d
(
160
,
192
,
3
,
bias
=
False
,
indice_key
=
"c4"
,
algo
=
algo
),
spconv
.
SubMConv3d
(
192
,
192
,
3
,
bias
=
False
,
indice_key
=
"c4"
,
algo
=
algo
),
# nn.BatchNorm1d(128),
# nn.ReLU(),
spconv
.
SparseMaxPool3d
(
2
,
2
,
indice_key
=
"m4"
,
algo
=
pool_algo
),
# spconv.SparseConv3d(192, 192, 2, 2, bias=False, indice_key="m4"),
spconv
.
SubMConv3d
(
192
,
224
,
3
,
bias
=
False
,
indice_key
=
"c5"
,
algo
=
algo
),
spconv
.
SubMConv3d
(
224
,
224
,
3
,
bias
=
False
,
indice_key
=
"c5"
,
algo
=
algo
),
# nn.BatchNorm1d(256),
# nn.ReLU(),
spconv
.
SparseInverseConv3d
(
224
,
128
,
2
,
indice_key
=
"m4"
,
bias
=
False
,
algo
=
algo
),
# # nn.BatchNorm1d(128),
# nn.ReLU(),
spconv
.
SparseInverseConv3d
(
128
,
64
,
2
,
indice_key
=
"m3"
,
bias
=
False
,
algo
=
algo
),
)
max_batch_size
=
1
# grid (dense map) is used for indice generation. use pre-allocated grid can run faster.
# self.grid = None
self
.
shape
=
shape
def
forward
(
self
,
features
,
coors
,
batch_size
):
x
=
spconv
.
SparseConvTensor
(
features
,
coors
,
self
.
shape
,
batch_size
)
return
self
.
net
(
x
)
class
NetLight
(
nn
.
Module
):
def
__init__
(
self
,
shape
,
algo
):
super
().
__init__
()
pool_algo
=
algo
# pool_algo = ConvAlgo.Native
self
.
net
=
spconv
.
SparseSequential
(
spconv
.
SubMConv3d
(
3
,
32
,
3
,
bias
=
False
,
indice_key
=
"c0"
,
algo
=
algo
),
spconv
.
SubMConv3d
(
32
,
32
,
3
,
bias
=
False
,
indice_key
=
"c0"
,
algo
=
algo
),
# # nn.BatchNorm1d(32),
# # nn.ReLU(),
spconv
.
SubMConv3d
(
32
,
64
,
3
,
bias
=
False
,
indice_key
=
"c0"
,
algo
=
algo
),
spconv
.
SubMConv3d
(
64
,
64
,
3
,
bias
=
False
,
indice_key
=
"c0"
,
algo
=
algo
),
# nn.BatchNorm1d(32),
# # nn.ReLU(),
spconv
.
SparseConv3d
(
64
,
64
,
3
,
2
,
1
,
bias
=
False
,
indice_key
=
"m0"
,
algo
=
algo
),
# # spconv.SparseMaxPool3d(2, 2, algo=pool_algo),
spconv
.
SubMConv3d
(
64
,
96
,
3
,
bias
=
False
,
indice_key
=
"c1"
,
algo
=
algo
),
spconv
.
SubMConv3d
(
96
,
96
,
3
,
bias
=
False
,
indice_key
=
"c1"
,
algo
=
algo
),
# nn.BatchNorm1d(64),
# nn.ReLU(),
spconv
.
SparseConv3d
(
96
,
96
,
2
,
2
,
bias
=
False
,
indice_key
=
"m1"
,
algo
=
algo
),
# spconv.SparseMaxPool3d(2, 2, algo=pool_algo),
spconv
.
SparseInverseConv3d
(
96
,
64
,
2
,
indice_key
=
"m1"
,
bias
=
False
,
algo
=
algo
),
# # nn.BatchNorm1d(128),
# nn.ReLU(),
spconv
.
SparseInverseConv3d
(
64
,
32
,
3
,
indice_key
=
"m0"
,
bias
=
False
,
algo
=
algo
),
)
max_batch_size
=
1
# grid (dense map) is used for indice generation. use pre-allocated grid can run faster.
# self.grid = None
self
.
shape
=
shape
def
forward
(
self
,
features
,
coors
,
batch_size
):
x
=
spconv
.
SparseConvTensor
(
features
,
coors
,
self
.
shape
,
batch_size
)
return
self
.
net
(
x
)
def
_test_multi_impl
(
dtype
:
torch
.
dtype
):
# TODO remove or release this when tf32 op is ready
torch
.
backends
.
cuda
.
matmul
.
allow_tf32
=
False
torch
.
backends
.
cudnn
.
allow_tf32
=
False
np
.
random
.
seed
(
50051
)
if
dtype
!=
torch
.
float16
:
with
open
(
Path
(
__file__
).
parent
/
"data"
/
"test_spconv.pkl"
,
"rb"
)
as
f
:
(
voxels
,
coors
,
spatial_shape
)
=
pickle
.
load
(
f
)
else
:
# CPU fp16 is very slow, so we use a small data here.
spatial_shape
=
[
19
,
18
,
17
]
sparse_dict
=
generate_sparse_data
(
spatial_shape
,
[
1500
]
*
1
,
3
)
voxels
=
np
.
ascontiguousarray
(
sparse_dict
[
"features"
]).
astype
(
np
.
float32
)
coors
=
np
.
ascontiguousarray
(
sparse_dict
[
"indices"
][:,
[
3
,
0
,
1
,
2
]]).
astype
(
np
.
int32
)
device
=
torch
.
device
(
"cuda:0"
)
device_cpu
=
torch
.
device
(
"cpu:0"
)
voxels_th
=
torch
.
from_numpy
(
voxels
).
to
(
device_cpu
).
to
(
dtype
)
coors_th
=
torch
.
from_numpy
(
coors
).
to
(
device_cpu
).
int
()
voxels_th_cuda
=
torch
.
from_numpy
(
voxels
).
to
(
device
).
to
(
dtype
)
coors_th_cuda
=
torch
.
from_numpy
(
coors
).
to
(
device
).
int
()
net_cls
=
Net
if
dtype
==
torch
.
float16
:
# CPU fp16 is very slow, so we use a small network here.
net_cls
=
NetLight
# cpu
torch
.
manual_seed
(
50051
)
net_native_cpu
=
net_cls
(
spatial_shape
,
ConvAlgo
.
Native
).
to
(
device_cpu
).
to
(
dtype
)
# gpu_native
torch
.
manual_seed
(
50051
)
net_native_gpu
=
net_cls
(
spatial_shape
,
ConvAlgo
.
Native
).
to
(
device
).
to
(
dtype
)
torch
.
manual_seed
(
50051
)
net_imp_gpu
=
net_cls
(
spatial_shape
,
ConvAlgo
.
MaskImplicitGemm
).
to
(
device
).
to
(
dtype
)
torch
.
manual_seed
(
50051
)
net_simp_gpu
=
net_cls
(
spatial_shape
,
ConvAlgo
.
MaskSplitImplicitGemm
).
to
(
device
).
to
(
dtype
)
spconv
.
assign_name_for_sparse_modules
(
net_native_cpu
)
spconv
.
assign_name_for_sparse_modules
(
net_native_gpu
)
spconv
.
assign_name_for_sparse_modules
(
net_imp_gpu
)
spconv
.
assign_name_for_sparse_modules
(
net_simp_gpu
)
with
torch
.
no_grad
():
out
:
torch
.
Tensor
=
net_native_cpu
(
voxels_th
,
coors_th
,
1
).
dense
()
dout
=
np
.
random
.
uniform
(
-
0.2
,
0.2
,
out
.
shape
).
astype
(
np
.
float32
)
dout_t
=
torch
.
from_numpy
(
dout
).
to
(
device_cpu
).
to
(
dtype
)
dout_t_cu
=
torch
.
from_numpy
(
dout
).
to
(
device
).
to
(
dtype
)
out_cpu
=
net_native_cpu
(
voxels_th
,
coors_th
,
1
).
dense
()
out_cpu
.
backward
(
dout_t
)
out
=
net_native_gpu
(
voxels_th_cuda
,
coors_th_cuda
,
1
).
dense
()
out
.
backward
(
dout_t_cu
)
out_imp
=
net_imp_gpu
(
voxels_th_cuda
,
coors_th_cuda
,
1
).
dense
()
out_imp
.
backward
(
dout_t_cu
)
out_simp
=
net_simp_gpu
(
voxels_th_cuda
,
coors_th_cuda
,
1
).
dense
()
out_simp
.
backward
(
dout_t_cu
)
with
torch
.
no_grad
():
dense_cpu
=
out_cpu
.
cuda
()
dense_native
=
out
dense_imp
=
out_imp
dense_simp
=
out_simp
error_native
=
torch
.
linalg
.
norm
(
dense_cpu
-
dense_native
).
cpu
().
item
()
error_imp
=
torch
.
linalg
.
norm
(
dense_cpu
-
dense_imp
).
cpu
().
item
()
error_simp
=
torch
.
linalg
.
norm
(
dense_cpu
-
dense_simp
).
cpu
().
item
()
print
(
"error_native"
,
error_native
)
print
(
"error_imp"
,
error_imp
)
print
(
"error_simp"
,
error_simp
)
if
dtype
==
torch
.
float32
:
assert
error_native
<
0.01
assert
error_imp
<
0.01
assert
error_simp
<
0.01
else
:
assert
error_native
<
10
assert
error_imp
<
10
assert
error_simp
<
10
cpu_params
=
dict
(
net_native_cpu
.
named_parameters
())
native_params
=
dict
(
net_native_gpu
.
named_parameters
())
imp_params
=
dict
(
net_imp_gpu
.
named_parameters
())
simp_params
=
dict
(
net_simp_gpu
.
named_parameters
())
for
k
,
cpu_w
in
cpu_params
.
items
():
native_w
=
native_params
[
k
]
imp_w
=
imp_params
[
k
]
simp_w
=
simp_params
[
k
]
cpu_w_grad
=
cpu_w
.
grad
.
detach
().
cuda
()
native_w_grad
=
native_w
.
grad
.
detach
()
imp_w_grad
=
imp_w
.
grad
.
detach
()
simp_w_grad
=
simp_w
.
grad
.
detach
()
error_native
=
torch
.
linalg
.
norm
(
native_w_grad
-
cpu_w_grad
).
cpu
().
item
()
error_imp
=
torch
.
linalg
.
norm
(
native_w_grad
-
imp_w_grad
).
cpu
().
item
()
error_simp
=
torch
.
linalg
.
norm
(
native_w_grad
-
simp_w_grad
).
cpu
().
item
()
print
(
k
,
error_native
,
error_imp
,
error_simp
)
assert
error_imp
<
1
assert
error_simp
<
1
def
test_multi_impl
():
_test_multi_impl
(
torch
.
float32
)
_test_multi_impl
(
torch
.
float16
)
if
__name__
==
"__main__"
:
test_multi_impl
()
test/test_native_kernels.py
deleted
100644 → 0
View file @
4791f582
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
test_before_push.sh
0 → 100644
View file @
bf011c76
# developers must run this file before push or pull request.
# this script contains three parts:
# 1. unit tests for all gemm/conv kernels
# 2. comparison test: compare network fwd/bwd results between CPU, Native, ImplicitGemm
# 3. f32/f16 train/eval test based on mnist and some small datasets
echo
"-------------UNIT TEST START--------------"
pytest ./test
echo
"-------------UNIT TEST END--------------"
python ./example/mnist_sparse.py
--fp16
\ No newline at end of file
version.txt
View file @
bf011c76
2.
1.9
2.
2.0
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment