Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
one
spconv
Commits
01ed382c
Commit
01ed382c
authored
Oct 18, 2021
by
yan.yan
Browse files
working on tensor core test
parent
3517290c
Changes
159
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
720 additions
and
1234 deletions
+720
-1234
spconv/pytorch/core.py
spconv/pytorch/core.py
+14
-0
spconv/pytorch/cppcore.py
spconv/pytorch/cppcore.py
+51
-0
spconv/pytorch/functional.py
spconv/pytorch/functional.py
+5
-5
spconv/pytorch/identity.py
spconv/pytorch/identity.py
+0
-0
spconv/pytorch/modules.py
spconv/pytorch/modules.py
+8
-7
spconv/pytorch/ops.py
spconv/pytorch/ops.py
+590
-0
spconv/pytorch/pool.py
spconv/pytorch/pool.py
+12
-11
spconv/pytorch/spatial.py
spconv/pytorch/spatial.py
+6
-6
spconv/pytorch/tables.py
spconv/pytorch/tables.py
+16
-2
spconv/test_utils.py
spconv/test_utils.py
+4
-4
spconv/utils/__init__.py
spconv/utils/__init__.py
+14
-373
src/cuhash/CMakeLists.txt
src/cuhash/CMakeLists.txt
+0
-25
src/cuhash/debugging.cpp
src/cuhash/debugging.cpp
+0
-104
src/cuhash/debugging.cu
src/cuhash/debugging.cu
+0
-236
src/cuhash/hash_functions.cpp
src/cuhash/hash_functions.cpp
+0
-14
src/cuhash/hash_functions.cu
src/cuhash/hash_functions.cu
+0
-38
src/cuhash/hash_table.cpp
src/cuhash/hash_table.cpp
+0
-232
src/cuhash/hash_table.cu
src/cuhash/hash_table.cu
+0
-112
src/cuhash/main.cc
src/cuhash/main.cc
+0
-43
src/spconv/CMakeLists.txt
src/spconv/CMakeLists.txt
+0
-22
No files found.
spconv/core.py
→
spconv/
pytorch/
core.py
View file @
01ed382c
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
typing
import
Optional
import
numpy
as
np
...
...
spconv/pytorch/cppcore.py
0 → 100644
View file @
01ed382c
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
cumm
import
tensorview
as
tv
import
torch
from
typing
import
Optional
,
List
# Lookup table from torch dtypes to the tensorview (tv) dtype constants of the
# same name. It is consulted when a torch tensor is wrapped without an explicit
# tv dtype; a torch dtype missing from this table has no default mapping.
_TORCH_DTYPE_TO_TV = {
    torch.float32: tv.float32,
    torch.float64: tv.float64,
    torch.float16: tv.float16,
    torch.int32: tv.int32,
    torch.int64: tv.int64,
    torch.int8: tv.int8,
    torch.int16: tv.int16,
    torch.uint8: tv.uint8,
}
def torch_tensor_to_tv(ten: torch.Tensor,
                       dtype: Optional[int] = None,
                       shape: Optional[List[int]] = None):
    """Build a ``tv`` tensor from the data pointer of a torch tensor.

    The result is created with ``tv.from_blob`` from ``ten.data_ptr()``, so it
    presumably aliases the memory of ``ten`` rather than copying it; callers
    should keep ``ten`` alive while the result is in use (TODO confirm against
    the tensorview documentation).

    Args:
        ten: contiguous torch tensor located on a CPU or CUDA device.
        dtype: tv dtype to report for the blob. When ``None`` it is looked up
            from ``ten.dtype`` in ``_TORCH_DTYPE_TO_TV``.
        shape: shape to report for the blob. When ``None`` the shape of
            ``ten`` is used.

    Returns:
        Whatever ``tv.from_blob`` returns for the given pointer, shape, dtype
        and device id.

    Raises:
        AssertionError: if ``ten`` is not contiguous.
        NotImplementedError: if ``ten`` lives on a device that is neither
            "cpu" nor "cuda".
        KeyError: if ``dtype`` is ``None`` and ``ten.dtype`` has no entry in
            ``_TORCH_DTYPE_TO_TV``.
    """
    assert ten.is_contiguous(), "must be contiguous tensor"
    device_type = ten.device.type
    if device_type == "cpu":
        tv_device = -1
    elif device_type == "cuda":
        # NOTE(review): every CUDA tensor is reported as device 0 regardless
        # of its device index - confirm this is what tv.from_blob expects.
        tv_device = 0
    else:
        raise NotImplementedError(
            f"cannot convert tensor on device type {device_type!r}; "
            "only 'cpu' and 'cuda' are handled")
    if shape is None:
        shape = list(ten.shape)
    if dtype is None:
        dtype = _TORCH_DTYPE_TO_TV[ten.dtype]
    # The pointer is read only after the device check so that unsupported
    # devices fail with the descriptive error above.
    return tv.from_blob(ten.data_ptr(), shape, dtype, tv_device)
def get_current_stream():
    """Return the ``cuda_stream`` attribute of torch's current CUDA stream."""
    current = torch.cuda.current_stream()
    return current.cuda_stream
if __name__ == "__main__":
    # Smoke test: wrap a small random tensor and print its numpy view.
    sample = torch.rand(2, 2)
    sample_tv = torch_tensor_to_tv(sample)
    print(sample_tv.numpy_view())
\ No newline at end of file
spconv/functional.py
→
spconv/
pytorch/
functional.py
View file @
01ed382c
# Copyright 201
9-2020
Yan Yan
#
# Copyright 20
2
1 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
...
@@ -16,7 +16,7 @@ import torch
from
torch
import
nn
from
torch.autograd
import
Function
import
spconv.ops
as
ops
import
spconv.
pytorch.
ops
as
ops
class
SparseConvFunction
(
Function
):
...
...
spconv/identity.py
→
spconv/
pytorch/
identity.py
View file @
01ed382c
File moved
spconv/modules.py
→
spconv/
pytorch/
modules.py
View file @
01ed382c
# Copyright 201
9-2020
Yan Yan
#
# Copyright 20
2
1 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
sys
import
time
from
collections
import
OrderedDict
...
...
@@ -19,7 +20,7 @@ from collections import OrderedDict
import
torch
from
torch
import
nn
import
spconv
from
spconv
import
pytorch
as
spconv
def
is_spconv_module
(
module
):
...
...
@@ -28,7 +29,7 @@ def is_spconv_module(module):
def
is_sparse_conv
(
module
):
from
spconv.conv
import
SparseConvolution
from
spconv.
pytorch.
conv
import
SparseConvolution
return
isinstance
(
module
,
SparseConvolution
)
...
...
@@ -145,7 +146,7 @@ class SparseSequential(SparseModule):
def
fused
(
self
):
"""don't use this. no effect.
"""
from
spconv.conv
import
SparseConvolution
from
spconv.
pytorch.
conv
import
SparseConvolution
mods
=
[
v
for
k
,
v
in
self
.
_modules
.
items
()]
fused_mods
=
[]
idx
=
0
...
...
spconv/pytorch/ops.py
0 → 100644
View file @
01ed382c
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
enum
import
Enum
from
cumm
import
tensorview
as
tv
from
cumm.gemm.algospec.core
import
ShuffleStrideType
import
torch
import
numpy
as
np
import
spconv
from
spconv.algo
import
AlgoHint
,
ConvAlgo
from
typing
import
List
,
Union
from
spconv.pytorch.cppcore
import
torch_tensor_to_tv
,
get_current_stream
from
spconv.core_cc.csrc.sparse.all
import
SpconvOps
from
spconv.algo
import
GEMM
# , GATHER, SCATTER
import
time
from
spconv.constants
import
FILTER_HWIO
def get_conv_output_size(input_size, kernel_size, stride, padding, dilation):
    """Compute the per-dimension output size of a convolution.

    For every dimension ``i`` the size is
    ``(input + 2 * padding - dilation * (kernel - 1) - 1) // stride + 1``.
    A kernel size of ``-1`` is treated as a sentinel that yields an output
    size of 1 for that dimension.

    Args:
        input_size: sequence with one input extent per dimension.
        kernel_size: sequence of kernel extents (``-1`` for the sentinel).
        stride: sequence of strides.
        padding: sequence of paddings.
        dilation: sequence of dilations.

    Returns:
        A list with one output extent per entry of ``input_size``.
    """
    output_size = []
    for i, extent in enumerate(input_size):
        if kernel_size[i] == -1:
            # Sentinel kernel: the formula (and its division by stride) is
            # irrelevant here, so it is not evaluated at all.
            output_size.append(1)
            continue
        # Extent covered by the dilated kernel.
        kernel_span = dilation[i] * (kernel_size[i] - 1) + 1
        output_size.append(
            (extent + 2 * padding[i] - kernel_span) // stride[i] + 1)
    return output_size
def get_deconv_output_size(input_size, kernel_size, stride, padding, dilation,
                           output_padding):
    """Compute the per-dimension output size of a transposed convolution.

    For every dimension ``i`` the size is
    ``(input - 1) * stride - 2 * padding + kernel + output_padding``.

    Args:
        input_size: sequence with one input extent per dimension.
        kernel_size: sequence of kernel extents; must not be negative.
        stride: sequence of strides.
        padding: sequence of paddings.
        dilation: accepted for signature symmetry with
            ``get_conv_output_size`` but not used by the formula.
            NOTE(review): confirm whether dilated deconvolution is meant to
            be supported here.
        output_padding: sequence of extra sizes added to each output extent.

    Returns:
        A list with one output extent per entry of ``input_size``.

    Raises:
        ValueError: if any kernel size is negative.
    """
    output_size = []
    for i, extent in enumerate(input_size):
        # Reject every negative kernel size, as the message states, not only
        # the -1 sentinel.
        if kernel_size[i] < 0:
            raise ValueError("deconv don't support kernel_size < 0")
        size = ((extent - 1) * stride[i] - 2 * padding[i] + kernel_size[i] +
                output_padding[i])
        output_size.append(size)
    return output_size
def get_indice_pairs(indices: torch.Tensor,
                     batch_size: int,
                     spatial_shape: List[int],
                     algo: ConvAlgo,
                     ksize: Union[int, List[int]],
                     stride: Union[int, List[int]],
                     padding: Union[int, List[int]],
                     dilation: Union[int, List[int]],
                     out_padding: Union[int, List[int]],
                     subm: bool = False,
                     transpose: bool = False):
    """Generate output indices and input/output index pairs for a sparse conv.

    The number of spatial dimensions is ``indices.shape[1] - 1``. Scalar
    ``ksize``/``stride``/``padding``/``dilation``/``out_padding`` values are
    repeated once per spatial dimension. The buffers allocated here are handed
    to ``SpconvOps`` kernels, which presumably fill them in place on the
    current CUDA stream.

    Args:
        indices: 2-D index tensor; ``shape[0]`` is the number of active
            points, ``shape[1]`` is one more than the spatial dimension count.
        batch_size: forwarded to the index generation kernels.
        spatial_shape: input spatial shape; also the output shape when
            ``subm`` is true.
        algo: only ``ConvAlgo.Native`` is accepted.
        ksize, stride, padding, dilation, out_padding: int or per-dimension
            list/tuple.
        subm: use the submanifold path (output indices are ``indices``).
        transpose: not supported yet; triggers the "TODO" assertion.

    Returns:
        Tuple ``(out_inds, pair, indice_num_per_loc)`` where ``pair`` has
        shape ``(2, kv, indices.shape[0])`` and was initialised to -1, and
        ``indice_num_per_loc`` has shape ``(kv,)`` and was initialised to 0
        (``kv`` is the product of ``ksize``).
    """
    ndim = indices.shape[1] - 1

    def _per_dim(value):
        # Lists/tuples are used as given; scalars are repeated per dimension.
        if isinstance(value, (list, tuple)):
            return value
        return [value] * ndim

    ksize = _per_dim(ksize)
    stride = _per_dim(stride)
    padding = _per_dim(padding)
    dilation = _per_dim(dilation)
    out_padding = _per_dim(out_padding)

    kv: int = int(np.prod(ksize))
    if subm:
        out_shape = spatial_shape
    elif transpose:
        out_shape = get_deconv_output_size(spatial_shape, ksize, stride,
                                           padding, dilation, out_padding)
    else:
        out_shape = get_conv_output_size(spatial_shape, ksize, stride,
                                         padding, dilation)
    assert algo == ConvAlgo.Native and not transpose, "TODO"

    stream = get_current_stream()
    index_dtype = indices.dtype
    index_device = indices.device
    num_points = indices.shape[0]

    pair = torch.full((2, kv, num_points),
                      -1,
                      dtype=index_dtype,
                      device=index_device)
    indice_num_per_loc = torch.zeros((kv, ),
                                     dtype=index_dtype,
                                     device=index_device)
    inds_tv = torch_tensor_to_tv(indices)
    pair_tv = torch_tensor_to_tv(pair)
    indice_num_per_loc_tv = torch_tensor_to_tv(indice_num_per_loc)

    if subm:
        # Submanifold conv: the output locations are the input locations.
        out_inds = indices
        hashdata = torch.empty((out_inds.shape[0] * 2, ),
                               dtype=torch.int64,
                               device=index_device)
        out_inds_tv = torch_tensor_to_tv(out_inds)
        hashdata_tv = torch_tensor_to_tv(hashdata, dtype=tv.custom64)
        SpconvOps.generate_subm_conv_inds(inds_tv,
                                          hashdata_tv,
                                          pair_tv,
                                          out_inds_tv,
                                          indice_num_per_loc_tv,
                                          batch_size=batch_size,
                                          input_dims=spatial_shape,
                                          ksize=ksize,
                                          dilation=dilation,
                                          stream_int=stream)
        return out_inds, pair, indice_num_per_loc

    # Regular conv: stage 1 reports the number of active outputs, which sizes
    # the buffers needed by stage 2.
    indice_pairs_uniq = torch.empty((pair.numel() // 2 + 1, ),
                                    dtype=index_dtype,
                                    device=index_device)
    indice_pairs_uniq_tv = torch_tensor_to_tv(indice_pairs_uniq)
    num_act_out = SpconvOps.generate_conv_inds_stage1(inds_tv,
                                                      pair_tv,
                                                      indice_pairs_uniq_tv,
                                                      indice_num_per_loc_tv,
                                                      batch_size=batch_size,
                                                      output_dims=out_shape,
                                                      input_dims=spatial_shape,
                                                      ksize=ksize,
                                                      stride=stride,
                                                      padding=padding,
                                                      dilation=dilation,
                                                      stream_int=stream)
    out_inds = torch.empty((num_act_out, indices.shape[1]),
                           dtype=index_dtype,
                           device=index_device)
    hashdata = torch.empty((out_inds.shape[0] * 2, ),
                           dtype=torch.int64,
                           device=index_device)
    out_inds_tv = torch_tensor_to_tv(out_inds)
    hashdata_tv = torch_tensor_to_tv(hashdata, dtype=tv.custom64)
    SpconvOps.generate_conv_inds_stage2(inds_tv,
                                        hashdata_tv,
                                        pair_tv,
                                        indice_pairs_uniq_tv,
                                        out_inds_tv,
                                        num_out_act=num_act_out,
                                        batch_size=batch_size,
                                        output_dims=out_shape,
                                        input_dims=spatial_shape,
                                        ksize=ksize,
                                        stride=stride,
                                        padding=padding,
                                        dilation=dilation,
                                        stream_int=stream)
    return out_inds, pair, indice_num_per_loc
def indice_conv(features: torch.Tensor,
                filters: torch.Tensor,
                indice_pairs: torch.Tensor,
                indice_pair_num: torch.Tensor,
                num_activate_out: int,
                inverse: bool = False,
                subm: bool = False,
                algo: ConvAlgo = ConvAlgo.Native):
    """Forward pass of a sparse convolution driven by index pairs.

    ``filters`` is flattened to ``(kv, *filters.shape[-2:])`` and one
    gather-GEMM-scatter is issued per kernel offset through ``GEMM``. For
    submanifold convolution the centre offset is computed with a dense
    ``torch.mm`` and the remaining offsets are accumulated on top of it.

    Args:
        features: 2-D input feature matrix (it is used with ``torch.mm``).
        filters: weight tensor; the last two axes hold the channel
            dimensions (layout selected by ``FILTER_HWIO``).
        indice_pairs: index tensor; ``indice_pairs[int(inverse)]`` supplies
            input rows and ``indice_pairs[int(not inverse)]`` output rows,
            one row of indices per kernel offset. Presumably produced by
            ``get_indice_pairs``.
        indice_pair_num: number of valid pairs per kernel offset.
        num_activate_out: number of output rows (used when ``subm`` is false).
        inverse: swap the roles of the two halves of ``indice_pairs``.
        subm: submanifold convolution.
        algo: currently not consulted by this function.

    Returns:
        Output feature tensor; ``(num_activate_out, out_channel)`` when
        ``subm`` is false.

    Raises:
        NotImplementedError: for int8/qint8 features.
    """
    # filters: RSKC
    if features.dtype == torch.int8 or features.dtype == torch.qint8:
        raise NotImplementedError("work in progress")
    if FILTER_HWIO:
        out_channel = filters.shape[-1]
    else:
        out_channel = filters.shape[-2]
    filters = filters.reshape(-1, *filters.shape[-2:])
    kv = filters.shape[0]
    kv_center = kv // 2
    if subm:
        # The centre offset maps every point onto itself: plain dense matmul.
        if FILTER_HWIO:
            out_features = torch.mm(features, filters[kv_center])
        else:
            out_features = torch.mm(features, filters[kv_center].T)
    else:
        out_features = torch.zeros((num_activate_out, out_channel),
                                   dtype=features.dtype,
                                   device=features.device)
    if kv == 1 and subm:
        return out_features

    stream = get_current_stream()
    indice_pair_num_cpu = indice_pair_num.cpu().tolist()
    arch = torch.cuda.get_device_capability()
    # Whether the filter operand is passed as transposed to the GEMM.
    trans_b = not FILTER_HWIO

    a = torch_tensor_to_tv(features)
    c = torch_tensor_to_tv(out_features)
    filters_tv = torch_tensor_to_tv(filters)
    # Convert the pair tensor once and slice the tv view afterwards, the same
    # way indice_conv_backward does (torch slicing per offset is slow).
    indice_pairs_tv = torch_tensor_to_tv(indice_pairs)
    pair_in = indice_pairs_tv[int(inverse)]
    pair_out = indice_pairs_tv[int(not inverse)]

    # Offset whose pair count is used to look up / tune the GEMM kernel. The
    # subm centre is handled by torch.mm above, so its neighbour is used.
    profile_idx = kv_center - 1 if subm else kv_center
    nhot_profile = indice_pair_num_cpu[profile_idx]
    # NOTE: "Fowrard" is the member name as referenced from spconv.algo.
    profile_res = GEMM.get_profiled_algo(
        a.shape,
        filters.shape[-2:],
        c.shape,
        False,
        trans_b,
        False,
        arch=arch,
        shuffle_type=ShuffleStrideType.ShuffleAC,
        a_inds_shape=[nhot_profile],
        c_inds_shape=[nhot_profile],
        hint=AlgoHint.Fowrard.value)
    if profile_res is None:
        # No cached kernel choice for this problem: tune on profile_idx.
        profile_res, _ = GEMM.profile_and_cache(
            a,
            filters_tv[profile_idx],
            c,
            False,
            trans_b,
            False,
            arch=arch,
            shuffle_type=ShuffleStrideType.ShuffleAC,
            a_inds=pair_in[profile_idx].slice_first_axis(0, nhot_profile),
            c_inds=pair_out[profile_idx].slice_first_axis(0, nhot_profile),
            alpha=1.0,
            beta=0.0,
            hint=AlgoHint.Fowrard.value,
            stream=stream)

    # For subm the output already holds the centre contribution, so every
    # GEMM accumulates; otherwise the first GEMM runs with beta = 0.
    inited: bool = subm
    for i, nhot in enumerate(indice_pair_num_cpu):
        if subm and i == kv_center:
            continue
        if subm and i > kv_center:
            # Offsets past the centre take their count from the mirrored one.
            nhot = indice_pair_num_cpu[kv - i - 1]
        if nhot <= 0:
            continue
        inp_indices = pair_in[i].slice_first_axis(0, nhot)
        out_indices = pair_out[i].slice_first_axis(0, nhot)
        # inp @ filter.T, NC @ KC
        beta = 1.0 if inited else 0.0
        GEMM.run_profile(profile_res,
                         a,
                         filters_tv[i],
                         c,
                         False,
                         trans_b,
                         False,
                         arch=arch,
                         stream=stream,
                         shuffle_type=ShuffleStrideType.ShuffleAC,
                         a_inds=inp_indices,
                         c_inds=out_indices,
                         hint=AlgoHint.Fowrard.value,
                         alpha=1.0,
                         beta=beta)
        inited = True
    return out_features
def fused_indice_conv(features, filters, bias, indice_pairs, indice_pair_num,
                      num_activate_out, inverse, subm):
    """Placeholder for a fused (bias-including) sparse convolution.

    Raises:
        NotImplementedError: always; no implementation exists here yet.
    """
    raise NotImplementedError("fused_indice_conv is not implemented")
def indice_conv_backward(features: torch.Tensor,
                         filters: torch.Tensor,
                         out_bp: torch.Tensor,
                         indice_pairs: torch.Tensor,
                         indice_pair_num: torch.Tensor,
                         inverse: bool = False,
                         subm: bool = False,
                         algo: ConvAlgo = ConvAlgo.Native):
    """Backward pass of a sparse convolution driven by index pairs.

    Computes the gradient w.r.t. the input features ("dgrad") and w.r.t. the
    filters ("wgrad"), issuing two indexed GEMMs per kernel offset through
    ``GEMM``. For submanifold convolution the centre offset is handled with
    dense ``torch.mm`` calls and the other offsets are accumulated on top.

    Args:
        features: contiguous 2-D input features of the forward pass.
        filters: contiguous weight tensor; flattened internally to
            ``(kv, *filters.shape[-2:])``.
        out_bp: contiguous gradient w.r.t. the forward output.
        indice_pairs: index tensor; ``indice_pairs[int(inverse)]`` supplies
            input rows and ``indice_pairs[int(not inverse)]`` output rows.
        indice_pair_num: number of valid pairs per kernel offset.
        inverse: swap the roles of the two halves of ``indice_pairs``.
        subm: submanifold convolution.
        algo: currently not consulted by this function.

    Returns:
        Tuple ``(din, dfilters)``: ``din`` is shaped like ``features`` and
        ``dfilters`` is reshaped to the original shape of ``filters``.

    Raises:
        AssertionError: if ``out_bp``, the flattened ``filters`` or
            ``features`` is not contiguous.
    """
    filters_shape = filters.shape
    filters = filters.reshape(-1, *filters.shape[-2:])
    kv = filters.shape[0]
    kv_center = kv // 2
    assert out_bp.is_contiguous()
    assert filters.is_contiguous()
    assert features.is_contiguous()

    dfilters = torch.zeros_like(filters)
    if subm:
        # The centre offset maps every point onto itself: dense matmuls.
        # TODO can we use torch mm for f16 backward weight?
        if FILTER_HWIO:
            torch.mm(features.T, out_bp, out=dfilters[kv_center])
            din = torch.mm(out_bp, filters[kv_center].T)
        else:
            torch.mm(out_bp.T, features, out=dfilters[kv_center])
            din = torch.mm(out_bp, filters[kv_center])
    else:
        din = torch.zeros_like(features)
    if kv == 1 and subm:
        return (din, dfilters.reshape(filters_shape))

    indice_pairs_tv = torch_tensor_to_tv(indice_pairs)
    # torch slice (a_th[x]) is very slow, so we need to use tv.Tensor earlier.
    pair_in = indice_pairs_tv[int(inverse)]
    pair_out = indice_pairs_tv[int(not inverse)]
    stream = get_current_stream()
    indice_pair_num_cpu = indice_pair_num.cpu().tolist()
    arch = torch.cuda.get_device_capability()

    filters_tv = torch_tensor_to_tv(filters)
    dfilters_tv = torch_tensor_to_tv(dfilters)
    out_bp_tv = torch_tensor_to_tv(out_bp)
    features_tv = torch_tensor_to_tv(features)
    din_tv = torch_tensor_to_tv(din)

    # Whether the filter operand of the dgrad GEMM is passed as transposed.
    dgrad_trans_b = bool(FILTER_HWIO)
    # Operand order of the wgrad GEMM depends on the filter layout.
    if FILTER_HWIO:
        a_wgrad, b_wgrad = features_tv, out_bp_tv
    else:
        a_wgrad, b_wgrad = out_bp_tv, features_tv

    # Offset whose pair count is used to look up / tune the GEMM kernels. The
    # subm centre is handled by torch.mm above, so its neighbour is used.
    profile_idx = kv_center - 1 if subm else kv_center
    nhot_profile = indice_pair_num_cpu[profile_idx]

    profile_res_dgrad = GEMM.get_profiled_algo(
        out_bp_tv.shape,
        filters.shape[-2:],
        din_tv.shape,
        False,
        dgrad_trans_b,
        False,
        arch=arch,
        shuffle_type=ShuffleStrideType.ShuffleAC,
        a_inds_shape=[nhot_profile],
        c_inds_shape=[nhot_profile],
        hint=AlgoHint.BackwardInput.value)
    if profile_res_dgrad is None:
        inp_indices = pair_in[profile_idx].slice_first_axis(0, nhot_profile)
        out_indices = pair_out[profile_idx].slice_first_axis(0, nhot_profile)
        # Operand A is out_bp and operand C is din, so A must be gathered
        # with the output indices and C scattered with the input indices,
        # exactly as in the dgrad run_profile call in the loop below.
        profile_res_dgrad, _ = GEMM.profile_and_cache(
            out_bp_tv,
            filters_tv[profile_idx],
            din_tv,
            False,
            dgrad_trans_b,
            False,
            arch=arch,
            shuffle_type=ShuffleStrideType.ShuffleAC,
            a_inds=out_indices,
            c_inds=inp_indices,
            alpha=1.0,
            beta=0.0,
            hint=AlgoHint.BackwardInput.value,
            stream=stream)

    profile_res_wgrad = GEMM.get_profiled_algo(
        a_wgrad.shape,
        b_wgrad.shape,
        filters.shape[-2:],
        True,
        False,
        False,
        arch=arch,
        shuffle_type=ShuffleStrideType.ShuffleAB,
        a_inds_shape=[nhot_profile],
        b_inds_shape=[nhot_profile],
        hint=AlgoHint.BackwardWeight.value)
    if profile_res_wgrad is None:
        inp_indices = pair_in[profile_idx].slice_first_axis(0, nhot_profile)
        out_indices = pair_out[profile_idx].slice_first_axis(0, nhot_profile)
        if FILTER_HWIO:
            a_inds_wgrad, b_inds_wgrad = inp_indices, out_indices
        else:
            a_inds_wgrad, b_inds_wgrad = out_indices, inp_indices
        profile_res_wgrad, _ = GEMM.profile_and_cache(
            a_wgrad,
            b_wgrad,
            dfilters_tv[profile_idx],
            True,
            False,
            False,
            arch=arch,
            shuffle_type=ShuffleStrideType.ShuffleAB,
            a_inds=a_inds_wgrad,
            b_inds=b_inds_wgrad,
            alpha=1.0,
            beta=0.0,
            hint=AlgoHint.BackwardWeight.value,
            stream=stream)

    # get workspace size for wgrad, sized for the largest pair count
    maxnhot = max(indice_pair_num_cpu)
    a_shape = [maxnhot, a_wgrad.dim(1)]
    b_shape = [maxnhot, b_wgrad.dim(1)]
    wgrad_desp = profile_res_wgrad.algo_desp
    m, n, k = GEMM.extract_mnk(a_shape,
                               b_shape,
                               wgrad_desp.trans_a,
                               wgrad_desp.trans_b,
                               wgrad_desp.trans_c,
                               arch=arch,
                               shuffle_type=ShuffleStrideType.ShuffleAB,
                               a_inds_shape=[maxnhot],
                               b_inds_shape=[maxnhot],
                               hint=AlgoHint.BackwardWeight.value)
    workspace_size = wgrad_desp.query_workspace_size(
        m, n, k, profile_res_wgrad.splitk)
    workspace_tv = tv.Tensor()
    if workspace_size > 0:
        # The torch tensor stays referenced by this local for the rest of the
        # function; presumably the tv view does not own the memory.
        workspace = torch.empty((workspace_size, ),
                                dtype=torch.int8,
                                device=features.device)
        workspace_tv = torch_tensor_to_tv(workspace)

    # For subm the outputs already hold the centre contribution, so every
    # GEMM accumulates; otherwise the first pair of GEMMs runs with beta = 0.
    inited: bool = subm
    for i, nhot in enumerate(indice_pair_num_cpu):
        if subm and i == kv_center:
            continue
        if subm and i > kv_center:
            # Offsets past the centre take their count from the mirrored one.
            nhot = indice_pair_num_cpu[kv - i - 1]
        if nhot <= 0:
            continue
        beta = 1.0 if inited else 0.0
        inp_indices = pair_in[i].slice_first_axis(0, nhot)
        out_indices = pair_out[i].slice_first_axis(0, nhot)
        # dgrad: gather out_bp rows, scatter into din rows.
        GEMM.run_profile(profile_res_dgrad,
                         out_bp_tv,
                         filters_tv[i],
                         din_tv,
                         False,
                         dgrad_trans_b,
                         False,
                         arch=arch,
                         stream=stream,
                         shuffle_type=ShuffleStrideType.ShuffleAC,
                         a_inds=out_indices,
                         c_inds=inp_indices,
                         hint=AlgoHint.BackwardInput.value,
                         alpha=1.0,
                         beta=beta)
        # wgrad: out.T @ inp, NK @ NC (operand order follows filter layout)
        if FILTER_HWIO:
            a_inds, b_inds = inp_indices, out_indices
        else:
            a_inds, b_inds = out_indices, inp_indices
        GEMM.run_profile(profile_res_wgrad,
                         a_wgrad,
                         b_wgrad,
                         dfilters_tv[i],
                         True,
                         False,
                         False,
                         arch=arch,
                         stream=stream,
                         shuffle_type=ShuffleStrideType.ShuffleAB,
                         a_inds=a_inds,
                         b_inds=b_inds,
                         hint=AlgoHint.BackwardWeight.value,
                         alpha=1.0,
                         beta=beta,
                         workspace=workspace_tv)
        inited = True
    return (din, dfilters.reshape(filters_shape))
def indice_maxpool(features, indice_pairs, indice_pair_num, num_activate_out):
    """Forward pass of sparse max pooling driven by index pairs.

    Allocates a zero-filled ``(num_activate_out, features.shape[-1])`` output
    and calls ``SpconvOps.maxpool_forward`` once for every kernel offset that
    has at least one pair, on the current CUDA stream.

    Args:
        features: input feature tensor.
        indice_pairs: ``indice_pairs[0][i, :n]`` are the input rows and
            ``indice_pairs[1][i, :n]`` the output rows for offset ``i``.
        indice_pair_num: number of valid pairs ``n`` per kernel offset.
        num_activate_out: number of output rows.

    Returns:
        The output feature tensor.
    """
    num_channels = features.shape[-1]
    out_features = torch.zeros((num_activate_out, num_channels),
                               dtype=features.dtype,
                               device=features.device)
    stream = get_current_stream()
    pair_counts = indice_pair_num.cpu().tolist()
    out_features_tv = torch_tensor_to_tv(out_features)
    features_tv = torch_tensor_to_tv(features)
    # Offsets without any pair contribute nothing and are skipped.
    active_offsets = ((offset, count)
                      for offset, count in enumerate(pair_counts)
                      if count > 0)
    for offset, count in active_offsets:
        src_rows = torch_tensor_to_tv(indice_pairs[0][offset, :count])
        dst_rows = torch_tensor_to_tv(indice_pairs[1][offset, :count])
        SpconvOps.maxpool_forward(out_features_tv, features_tv, dst_rows,
                                  src_rows, stream)
    return out_features
def indice_maxpool_backward(features, out_features, out_bp, indice_pairs,
                            indice_pair_num):
    """Backward pass of sparse max pooling driven by index pairs.

    Allocates a zero-filled gradient shaped like ``features`` and calls
    ``SpconvOps.maxpool_backward`` once for every kernel offset that has at
    least one pair, on the current CUDA stream.

    Args:
        features: input features of the forward pass.
        out_features: output features of the forward pass.
        out_bp: gradient w.r.t. ``out_features``.
        indice_pairs: ``indice_pairs[0][i, :n]`` are the input rows and
            ``indice_pairs[1][i, :n]`` the output rows for offset ``i``.
        indice_pair_num: number of valid pairs ``n`` per kernel offset.

    Returns:
        The gradient tensor w.r.t. ``features``.
    """
    din = torch.zeros_like(features)
    stream = get_current_stream()
    indice_pair_num_cpu = indice_pair_num.cpu().tolist()
    out_features_tv = torch_tensor_to_tv(out_features)
    features_tv = torch_tensor_to_tv(features)
    out_bp_tv = torch_tensor_to_tv(out_bp)
    din_tv = torch_tensor_to_tv(din)
    for i, nhot in enumerate(indice_pair_num_cpu):
        # Offsets without any pair contribute nothing and are skipped.
        if nhot <= 0:
            continue
        inp_indices = torch_tensor_to_tv(indice_pairs[0][i, :nhot])
        out_indices = torch_tensor_to_tv(indice_pairs[1][i, :nhot])
        SpconvOps.maxpool_backward(out_features_tv, features_tv, out_bp_tv,
                                   din_tv, out_indices, inp_indices, stream)
    return din
def nms(boxes, scores, pre_max_size, post_max_size, thresh, eps):
    """Placeholder for non-maximum suppression.

    Raises:
        NotImplementedError: always; no implementation exists here yet.
    """
    raise NotImplementedError("nms is not implemented")
def pillar_scatter(features, coors, shape):
    """Placeholder for the pillar scatter operation.

    Raises:
        NotImplementedError: always; no implementation exists here yet.
    """
    raise NotImplementedError("pillar_scatter is not implemented")
spconv/pool.py
→
spconv/
pytorch/
pool.py
View file @
01ed382c
# Copyright 201
9-2020
Yan Yan
#
# Copyright 20
2
1 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
...
@@ -21,11 +21,12 @@ from torch import nn
from
torch.nn
import
init
from
torch.nn.parameter
import
Parameter
import
spconv
import
spconv.functional
as
Fsp
from
spconv
import
ops
from
spconv.core
import
IndiceData
from
spconv.modules
import
SparseModule
from
spconv
import
pytorch
as
spconv
from
spconv.algo
import
ConvAlgo
import
spconv.pytorch.functional
as
Fsp
from
spconv.pytorch
import
ops
from
spconv.pytorch.core
import
IndiceData
from
spconv.pytorch.modules
import
SparseModule
class
SparseMaxPool
(
SparseModule
):
...
...
@@ -100,13 +101,13 @@ class SparseMaxPool(SparseModule):
indices
,
batch_size
,
spatial_shape
,
ConvAlgo
.
Native
,
self
.
kernel_size
,
self
.
stride
,
self
.
padding
,
self
.
dilation
,
0
,
self
.
subm
,
grid
=
input
.
grid
)
False
)
if
input
.
benchmark
:
torch
.
cuda
.
synchronize
()
interval
=
time
.
time
()
-
t
...
...
spconv/spatial.py
→
spconv/
pytorch/
spatial.py
View file @
01ed382c
# Copyright 201
9-2020
Yan Yan
#
# Copyright 20
2
1 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
...
@@ -21,8 +21,8 @@ from torch import nn
from
torch.nn
import
init
from
torch.nn.parameter
import
Parameter
import
spconv
from
spconv.modules
import
SparseModule
from
spconv
import
pytorch
as
spconv
from
spconv.
pytorch.
modules
import
SparseModule
class
RemoveDuplicate
(
SparseModule
):
...
...
spconv/tables.py
→
spconv/
pytorch/
tables.py
View file @
01ed382c
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
torch
from
torch.autograd
import
Function
import
spconv
import
spconv
.pytorch
as
spconv
#from torch.nn import Module
from
spconv.modules
import
SparseModule
from
spconv.
pytorch.
modules
import
SparseModule
class
JoinTable
(
SparseModule
):
# Module):
...
...
spconv/test_utils.py
View file @
01ed382c
# Copyright 201
9-2020
Yan Yan
#
# Copyright 20
2
1 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
...
spconv/utils/__init__.py
View file @
01ed382c
# Copyright 201
9-2020
Yan Yan
#
# Copyright 20
2
1 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
...
@@ -13,372 +13,13 @@
# limitations under the License.
import
numpy
as
np
import
torch
from
spconv
import
spconv_utils
from
spconv.spconv_utils
import
(
non_max_suppression_cpu
,
points_to_voxel_3d_np
,
points_to_voxel_3d_np_mean
,
points_to_voxel_3d_with_filtering
,
rbbox_intersection
,
rbbox_iou
,
rotate_non_max_suppression_cpu
)
try
:
from
spconv.spconv_utils
import
non_max_suppression
except
ImportError
:
pass
def points_to_voxel(points,
                    voxel_size,
                    coors_range,
                    coor_to_voxelidx,
                    max_points=35,
                    max_voxels=20000,
                    full_mean=False,
                    block_filtering=True,
                    block_factor=1,
                    block_size=8,
                    height_threshold=0.2,
                    height_high_threshold=3.0,
                    pad_output=False):
    """Group 3d points (N, >=3) into voxels.

    Output buffers are allocated here and filled by one of three native
    routines: the mean variant (``full_mean``), the height-filtering variant
    (``block_filtering``) or the plain variant.

    Args:
        points: [N, ndim] float array. points[:, :3] hold xyz, the remaining
            columns hold extra per-point data such as reflectivity.
        voxel_size: [3] list/tuple or array, float, xyz voxel size.
        coors_range: [6] list/tuple or array, float, voxel range in
            xyzxyz (min, max) order.
        coor_to_voxelidx: int array used as a dense map.
        max_points: maximum number of points kept per voxel.
        max_voxels: maximum number of voxels created. Shuffle the points
            before calling, because this limit may drop points.
        full_mean: use the mean variant; requires ``block_filtering`` to be
            False.
        block_filtering: filter voxels by height (intended for lidar point
            clouds).
        block_factor, block_size, height_threshold, height_high_threshold:
            forwarded to the filtering routine.
        pad_output: with filtering, write the kept voxels to the front of
            the fixed-size buffers and zero the rest instead of returning
            compacted arrays.

    Returns:
        dict with keys "voxels", "coordinates", "num_points_per_voxel",
        "voxel_point_mask" (reshaped to (-1, max_points, 1)) and "voxel_num".
    """
    if full_mean:
        assert block_filtering is False
    if not isinstance(voxel_size, np.ndarray):
        voxel_size = np.array(voxel_size, dtype=points.dtype)
    if not isinstance(coors_range, np.ndarray):
        coors_range = np.array(coors_range, dtype=points.dtype)

    # Grid resolution along x, y, z, then reversed for the final shape.
    grid_extent = (coors_range[3:] - coors_range[:3]) / voxel_size
    grid_xyz = np.round(grid_extent).astype(np.int32).tolist()
    voxelmap_shape = tuple(grid_xyz)[::-1]

    point_dim = points.shape[-1]
    num_points_per_voxel = np.zeros(shape=(max_voxels, ), dtype=np.int32)
    voxels = np.zeros(shape=(max_voxels, max_points, point_dim),
                      dtype=points.dtype)
    voxel_point_mask = np.zeros(shape=(max_voxels, max_points),
                                dtype=points.dtype)
    coors = np.zeros(shape=(max_voxels, 3), dtype=np.int32)
    res = {
        "voxels": voxels,
        "coordinates": coors,
        "num_points_per_voxel": num_points_per_voxel,
        "voxel_point_mask": voxel_point_mask,
    }

    if full_mean:
        means = np.zeros(shape=(max_voxels, point_dim), dtype=points.dtype)
        voxel_num = points_to_voxel_3d_np_mean(
            points, voxels, voxel_point_mask, means, coors,
            num_points_per_voxel, coor_to_voxelidx, voxel_size.tolist(),
            coors_range.tolist(), max_points, max_voxels)
    elif block_filtering:
        block_shape = [dim // block_factor for dim in voxelmap_shape[1:]]
        mins = np.full(block_shape, 99999999, dtype=points.dtype)
        maxs = np.full(block_shape, -99999999, dtype=points.dtype)
        voxel_mask = np.zeros((max_voxels, ), dtype=np.int32)
        voxel_num = points_to_voxel_3d_with_filtering(
            points, voxels, voxel_point_mask, voxel_mask, mins, maxs, coors,
            num_points_per_voxel, coor_to_voxelidx, voxel_size.tolist(),
            coors_range.tolist(), max_points, max_voxels, block_factor,
            block_size, height_threshold, height_high_threshold)
        voxel_mask = voxel_mask.astype(np.bool_)
        coors_ = coors[voxel_mask]
        # Boolean indexing copies, so these are independent of the buffers.
        kept = {
            "coordinates": coors_,
            "voxels": voxels[voxel_mask],
            "voxel_point_mask": voxel_point_mask[voxel_mask],
            "num_points_per_voxel": num_points_per_voxel[voxel_mask],
        }
        if pad_output:
            for key, compact in kept.items():
                res[key][:voxel_num] = compact
                res[key][voxel_num:] = 0
        else:
            res.update(kept)
            voxel_num = coors_.shape[0]
    else:
        voxel_num = points_to_voxel_3d_np(
            points, voxels, voxel_point_mask, coors, num_points_per_voxel,
            coor_to_voxelidx, voxel_size.tolist(), coors_range.tolist(),
            max_points, max_voxels)

    res["voxel_num"] = voxel_num
    res["voxel_point_mask"] = res["voxel_point_mask"].reshape(
        -1, max_points, 1)
    return res
class VoxelGenerator:
    """CPU voxel generator that wraps the module-level ``points_to_voxel``.

    The constructor derives the voxel grid from the point-cloud range and the
    voxel size, and pre-allocates the coordinate-to-voxel lookup volume that
    ``points_to_voxel`` reuses between calls.

    Args:
        voxel_size: [3] sequence of float, voxel edge length per axis.
        point_cloud_range: [6] sequence of float, lower corner followed by
            upper corner (e.g. ``[0, -40, -3, 70.4, 40, 1]``).
        max_num_points: maximum number of points stored per voxel.
        max_voxels: default cap on the number of voxels produced per call.
        full_mean: forwarded unchanged to ``points_to_voxel``.
    """

    def __init__(self,
                 voxel_size,
                 point_cloud_range,
                 max_num_points,
                 max_voxels=20000,
                 full_mean=True):
        point_cloud_range = np.array(point_cloud_range, dtype=np.float32)
        voxel_size = np.array(voxel_size, dtype=np.float32)
        grid_size = (point_cloud_range[3:] - point_cloud_range[:3]) / voxel_size
        grid_size = np.round(grid_size).astype(np.int64)
        # grid_size is already integral, so no second rounding is needed.
        # The lookup volume uses the reversed axis order of grid_size.
        voxelmap_shape = tuple(grid_size.astype(np.int32).tolist())[::-1]

        self._coor_to_voxelidx = np.full(voxelmap_shape, -1, dtype=np.int32)
        self._voxel_size = voxel_size
        self._point_cloud_range = point_cloud_range
        self._max_num_points = max_num_points
        self._max_voxels = max_voxels
        self._grid_size = grid_size
        self._full_mean = full_mean

    def _voxelize(self, points, max_voxels):
        """Run ``points_to_voxel`` with this generator's configuration."""
        return points_to_voxel(points, self._voxel_size,
                               self._point_cloud_range,
                               self._coor_to_voxelidx, self._max_num_points,
                               max_voxels or self._max_voxels,
                               self._full_mean)

    def generate(self, points, max_voxels=None):
        """Voxelize ``points`` and trim every output to the voxels produced.

        Args:
            points: point array handed to ``points_to_voxel``.
            max_voxels: optional per-call cap; a falsy value (``None`` or 0)
                falls back to the cap given at construction.

        Returns:
            Tuple ``(voxels, coordinates, num_points_per_voxel)``, each sliced
            to the first ``voxel_num`` entries.
        """
        res = self._voxelize(points, max_voxels)
        voxel_num = res["voxel_num"]
        return (res["voxels"][:voxel_num],
                res["coordinates"][:voxel_num],
                res["num_points_per_voxel"][:voxel_num])

    def generate_multi_gpu(self, points, max_voxels=None):
        """Voxelize ``points`` and return the full, untrimmed buffers.

        Same arguments as :meth:`generate`; the outputs keep their allocated
        length instead of being sliced to the number of voxels produced.
        """
        res = self._voxelize(points, max_voxels)
        return (res["voxels"], res["coordinates"],
                res["num_points_per_voxel"])

    @property
    def voxel_size(self):
        """Voxel edge lengths as a float32 array."""
        return self._voxel_size

    @property
    def max_num_points_per_voxel(self):
        """Maximum number of points stored per voxel."""
        return self._max_num_points

    @property
    def point_cloud_range(self):
        """Point-cloud bounds as a float32 array (lower then upper corner)."""
        return self._point_cloud_range

    @property
    def grid_size(self):
        """Number of voxels per axis as an int64 array."""
        return self._grid_size
class VoxelGeneratorV2:
    """Voxel generator with optional block filtering.

    Wraps the module-level ``points_to_voxel`` and forwards the block
    filtering parameters to it. ``full_mean`` must stay ``False``; the
    constructor asserts this.
    """

    def __init__(self,
                 voxel_size,
                 point_cloud_range,
                 max_num_points,
                 max_voxels=20000,
                 full_mean=False,
                 block_filtering=False,
                 block_factor=8,
                 block_size=3,
                 height_threshold=0.1,
                 height_high_threshold=2.0):
        assert full_mean is False, "don't use this."
        cloud_range = np.array(point_cloud_range, dtype=np.float32)
        cell_size = np.array(voxel_size, dtype=np.float32)
        extent = cloud_range[3:] - cloud_range[:3]
        cells_per_axis = np.round(extent / cell_size).astype(np.int64)
        if block_filtering:
            # The first two grid axes must split evenly into blocks.
            assert block_size > 0
            assert cells_per_axis[0] % block_factor == 0
            assert cells_per_axis[1] % block_factor == 0
        # Lookup volume uses the reversed axis order of the grid.
        lookup_shape = tuple(
            reversed(cells_per_axis.astype(np.int32).tolist()))

        self._coor_to_voxelidx = np.full(lookup_shape, -1, dtype=np.int32)
        self._voxel_size = cell_size
        self._point_cloud_range = cloud_range
        self._max_num_points = max_num_points
        self._max_voxels = max_voxels
        self._grid_size = cells_per_axis
        self._full_mean = full_mean
        self._block_filtering = block_filtering
        self._block_factor = block_factor
        self._height_threshold = height_threshold
        self._block_size = block_size
        self._height_high_threshold = height_high_threshold

    def _positional_args(self, points, max_voxels):
        """Build the positional argument tuple shared by both generators."""
        return (points, self._voxel_size, self._point_cloud_range,
                self._coor_to_voxelidx, self._max_num_points,
                max_voxels or self._max_voxels, self._full_mean,
                self._block_filtering, self._block_factor, self._block_size,
                self._height_threshold, self._height_high_threshold)

    def generate(self, points, max_voxels=None):
        """Voxelize ``points``; every array in the result is cut to ``voxel_num``.

        Returns the dict produced by ``points_to_voxel`` with each entry other
        than ``"voxel_num"`` sliced to its first ``voxel_num`` items.
        """
        res = points_to_voxel(*self._positional_args(points, max_voxels))
        produced = res["voxel_num"]
        for key in list(res):
            if key == "voxel_num":
                continue
            res[key] = res[key][:produced]
        return res

    def generate_multi_gpu(self, points, max_voxels=None):
        """Voxelize ``points`` with ``pad_output=True`` and return the raw dict."""
        return points_to_voxel(*self._positional_args(points, max_voxels),
                               pad_output=True)

    @property
    def voxel_size(self):
        """Voxel edge lengths as a float32 array."""
        return self._voxel_size

    @property
    def max_num_points_per_voxel(self):
        """Maximum number of points stored per voxel."""
        return self._max_num_points

    @property
    def point_cloud_range(self):
        """Point-cloud bounds as a float32 array."""
        return self._point_cloud_range

    @property
    def grid_size(self):
        """Number of voxels per axis as an int64 array."""
        return self._grid_size
class VoxelGeneratorV3:
    """Torch-based voxel generator using ``torch.ops.spconv.points_to_voxel``.

    All working buffers are allocated once in the constructor, on ``device``,
    and reused by every call.

    Args:
        voxel_size: [3] tensor, voxel edge length per axis.
        point_cloud_range: [6] tensor, lower corner followed by upper corner.
        max_points: maximum number of input points accepted per call.
        num_features: number of columns per point.
        dtype: torch dtype of the feature buffers.
        device: torch device holding the buffers.
    """

    def __init__(self, voxel_size, point_cloud_range, max_points,
                 num_features, dtype, device):
        self._max_points = max_points
        self._point_cloud_range = point_cloud_range
        self._voxel_size = voxel_size
        grid_size = torch.round(
            (point_cloud_range[3:] - point_cloud_range[:3]) /
            voxel_size).to(torch.int32)
        # Plain Python int so it can be used as a size and as a fill value.
        grid_volume = int(grid_size.prod())
        self._grid_size = grid_size.tolist()
        self._ndim = len(self._grid_size)
        self._dtype = dtype
        self._device = device
        # Every slot starts at grid_volume, one past the last valid cell index.
        self._point_index = torch.full([max_points + 1],
                                       grid_volume,
                                       dtype=torch.int32,
                                       device=self._device)
        self._grids = torch.zeros([grid_volume, num_features],
                                  dtype=self._dtype,
                                  device=self._device)
        self._num_points_per_grid = torch.zeros([grid_volume],
                                                dtype=torch.int32,
                                                device=self._device)
        self._voxels = torch.zeros([max_points, num_features],
                                   dtype=self._dtype,
                                   device=self._device)
        self._coors = torch.zeros([max_points, self._ndim],
                                  dtype=torch.int32,
                                  device=self._device)

    def _prepare_points(self, points):
        """Check the point count and move ``points`` to the configured dtype/device."""
        assert points.shape[0] <= self._max_points, \
            'please enlarge max_points to not smaller than ' + str(
                points.shape[0])
        # Tensor.to returns a new tensor; the result must be kept.
        return points.to(self._dtype).to(self._device)

    def generate(self, points):
        """Voxelize ``points`` and return ``(voxels, coors)``.

        Raises:
            AssertionError: if ``points`` has more rows than ``max_points``.
        """
        return self.points_to_voxel(self._prepare_points(points))

    def generate_multi_gpu(self, points):
        """Same as :meth:`generate`."""
        return self.points_to_voxel(self._prepare_points(points))

    @property
    def voxel_size(self):
        """Voxel size exactly as passed to the constructor."""
        return self._voxel_size

    @property
    def point_cloud_range(self):
        """Point-cloud range exactly as passed to the constructor."""
        return self._point_cloud_range

    @property
    def grid_size(self):
        """Number of voxels per axis as a list of ints."""
        return self._grid_size

    def points_to_voxel(self, points):
        """Voxelize ``points`` with the pre-allocated buffers.

        Args:
            points: [N, ndim] float tensor. ``points[:, :3]`` contain the xyz
                position and ``points[:, 3:]`` other information such as
                reflectivity.

        Returns:
            ``(voxels, coors)``: the first ``num_voxel`` rows of the internal
            voxel and coordinate buffers, with the first three coordinate
            columns returned in reversed (zyx) order.
        """
        indexes = torch.floor(
            (points[:, :3] - self._point_cloud_range[:3]) /
            self._voxel_size).to(torch.int32)
        num_voxel = torch.ops.spconv.points_to_voxel(
            points, indexes, self._point_index, self._grids,
            self._num_points_per_grid, self._voxels, self._coors,
            self._grid_size, self._ndim)
        voxels = self._voxels[:num_voxel, :]
        # xyz --> zyx; indexing copies, so the internal buffer is not aliased.
        coors = self._coors[:num_voxel, :][:, [2, 1, 0]]
        return voxels, coors
from
cumm
import
tensorview
as
tv
from
spconv.core_cc.csrc.sparse.all.ops1d
import
Point2Voxel
as
Point2VoxelGPU1d
from
spconv.core_cc.csrc.sparse.all.ops2d
import
Point2Voxel
as
Point2VoxelGPU2d
from
spconv.core_cc.csrc.sparse.all.ops3d
import
Point2Voxel
as
Point2VoxelGPU3d
from
spconv.core_cc.csrc.sparse.all.ops4d
import
Point2Voxel
as
Point2VoxelGPU4d
from
spconv.core_cc.csrc.sparse.all.ops_cpu1d
import
Point2VoxelCPU
as
Point2VoxelCPU1d
from
spconv.core_cc.csrc.sparse.all.ops_cpu2d
import
Point2VoxelCPU
as
Point2VoxelCPU2d
from
spconv.core_cc.csrc.sparse.all.ops_cpu3d
import
Point2VoxelCPU
as
Point2VoxelCPU3d
from
spconv.core_cc.csrc.sparse.all.ops_cpu4d
import
Point2VoxelCPU
as
Point2VoxelCPU4d
\ No newline at end of file
src/cuhash/CMakeLists.txt
deleted
100644 → 0
View file @
3517290c
# Build the cuhash library: shared on Windows, static elsewhere.
set(CUHASH_SOURCES hash_functions.cu hash_table.cpp hash_table.cu hash_functions.cpp)
if(WIN32)
  set(CUHASH_LIBRARY_TYPE SHARED)
else()
  set(CUHASH_LIBRARY_TYPE STATIC)
endif()
add_library(cuhash ${CUHASH_LIBRARY_TYPE} ${CUHASH_SOURCES})

target_include_directories(cuhash PRIVATE ${ALL_INCLUDE})
set_target_properties(cuhash PROPERTIES
  CUDA_STANDARD 14
  CXX_STANDARD 14
  CUDA_SEPARABLE_COMPILATION ON
  CUDA_RESOLVE_DEVICE_SYMBOLS ON)
# Position-independent code is requested on every platform except Windows.
if(NOT WIN32)
  set_target_properties(cuhash PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
target_link_libraries(cuhash PRIVATE ${ALL_LIBS})
install(TARGETS cuhash DESTINATION lib)

# Optional test executable, enabled by SPCONV_BuildTests.
if(SPCONV_BuildTests)
  add_executable(cuhash_test main.cc)
  target_include_directories(cuhash_test PRIVATE ${ALL_INCLUDE})
  set_target_properties(cuhash_test PROPERTIES
    CUDA_STANDARD 14
    CXX_STANDARD 14
    CUDA_SEPARABLE_COMPILATION ON)
  target_link_libraries(cuhash_test PRIVATE ${ALL_LIBS} cuhash)
  install(TARGETS cuhash_test DESTINATION bin)
endif()
\ No newline at end of file
src/cuhash/debugging.cpp
deleted
100644 → 0
View file @
3517290c
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision:$
// $Date:$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt in
// the root directory of this source distribution.
// -------------------------------------------------------------
/**
* @file
* debugging.cpp
*
* @brief Debugging/statistics/performance utilities for hash tables.
*/
#include <cuhash/debugging.h>
#include <cuhash/definitions.h>
#include <algorithm>
#include <cstring>
#include <cuhash/cuda_util.h>
namespace
cuhash
{
// Prints a histogram of how many probes each query needed.
//   n_queries:          number of entries in d_retrieval_probes.
//   d_retrieval_probes: device array holding the probe count of every query.
//   n_functions:        number of hash functions; the histogram has
//                       n_functions + 2 buckets.
void OutputRetrievalStatistics(const unsigned n_queries,
                               const unsigned *d_retrieval_probes,
                               const unsigned n_functions) {
  // Bring the per-query probe counts back to the host.
  unsigned *host_probes = new unsigned[n_queries];
  CUDA_SAFE_CALL(cudaMemcpy(host_probes, d_retrieval_probes,
                            sizeof(unsigned) * n_queries,
                            cudaMemcpyDeviceToHost));

  // Bucket i counts the queries that needed exactly i probes.
  const unsigned possible_probes = n_functions + 2;
  unsigned *histogram = new unsigned[possible_probes];
  std::fill(histogram, histogram + possible_probes, 0u);
  for (unsigned query = 0; query < n_queries; ++query) {
    ++histogram[host_probes[query]];
  }

  // One message for the title, then one message per bucket.
  char buffer[10000];
  sprintf(buffer, "Probes for retrieval: ");
  PrintMessage(buffer);
  for (unsigned probes = 0; probes < possible_probes; ++probes) {
    sprintf(buffer, "\t(%u, %u)", probes, histogram[probes]);
    PrintMessage(buffer);
  }

  delete[] host_probes;
  delete[] histogram;
}
// Prints statistics on how many iterations each thread needed to place its
// item: a (value, count) list, the total, average/median/max, and the max again.
//   n:                  number of entries in d_iterations_taken.
//   d_iterations_taken: device array with one iteration count per item.
void OutputBuildStatistics(const unsigned n,
                           const unsigned *d_iterations_taken) {
  // Nothing to report; this also avoids reading iterations_taken[0] and
  // iterations_taken[n - 1] and dividing by zero below.
  if (n == 0) {
    return;
  }

  unsigned *iterations_taken = new unsigned[n];
  CUDA_SAFE_CALL(cudaMemcpy(iterations_taken, d_iterations_taken,
                            sizeof(unsigned) * n, cudaMemcpyDeviceToHost));
  std::sort(iterations_taken, iterations_taken + n);

  unsigned total_iterations = 0;
  for (unsigned i = 0; i < n; ++i) {
    total_iterations += iterations_taken[i];
  }
  // The array is sorted ascending, so the last element is the maximum.
  const unsigned max_iterations_taken = iterations_taken[n - 1];

  char buffer[10000];
  const size_t capacity = sizeof(buffer);
  snprintf(buffer, capacity, "Iterations taken:\n");
  size_t used = strlen(buffer);

  // Appends one formatted (value, count) pair at the end of the text already
  // in buffer. Writing at an offset avoids passing buffer as both destination
  // and "%s" source, and the size bound truncates instead of overflowing.
  auto append_pair = [&](const char *format, unsigned value,
                         unsigned occurrences) {
    if (used + 1 >= capacity) {
      return;
    }
    const int written =
        snprintf(buffer + used, capacity - used, format, value, occurrences);
    if (written > 0) {
      used = std::min(used + static_cast<size_t>(written), capacity - 1);
    }
  };

  // Run-length encode the sorted values.
  unsigned current_value = iterations_taken[0];
  unsigned count = 1;
  for (unsigned i = 1; i < n; ++i) {
    if (iterations_taken[i] != current_value) {
      append_pair("\t(%u, %u)\n", current_value, count);
      current_value = iterations_taken[i];
      count = 1;
    } else {
      count++;
    }
  }
  append_pair("\t(%u, %u)", current_value, count);
  PrintMessage(buffer);

  snprintf(buffer, capacity, "Total iterations: %u", total_iterations);
  PrintMessage(buffer);

  snprintf(buffer, capacity, "Avg/Med/Max iterations: (%f %u %u)",
           (float)total_iterations / n, iterations_taken[n / 2],
           iterations_taken[n - 1]);
  PrintMessage(buffer);
  delete[] iterations_taken;

  // Print the length of the longest eviction chain.
  snprintf(buffer, capacity, "Max iterations: %u", max_iterations_taken);
  PrintMessage(buffer);
}
};
// namespace cuhash
// Leave this at the end of the file
// Local Variables:
// mode:c++
// c-file-style: "NVIDIA"
// End:
src/cuhash/debugging.cu
deleted
100644 → 0
View file @
3517290c
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision:$
// $Date:$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt in
// the root directory of this source distribution.
// -------------------------------------------------------------
/**
* @file
* debugging.cu
*
* @brief Debugging/statistics/performance utilities for hash tables.
*/
#include <cuhash/debugging.h>
#include <cuhash/definitions.h>
#include <cuhash/hash_table.cuh>
#include <algorithm>
#include <cuhash/cuda_util.h>
namespace
cuhash
{
//! Debugging kernel: gathers statistics on the hash functions' distribution.
/*! One thread handles one key and determines:
 *  - how many distinct slots the key hashes to (num_slots_available),
 *  - how many keys hash into each slot (num_hashing_in),
 *  - whether any key got fewer distinct slots than hash functions (failed).
 *  Each of the three output pointers may be NULL to skip that statistic.
 */
__global__ void take_hash_function_statistics_kernel(
    const unsigned *keys, const unsigned n_entries, const unsigned table_size,
    const uint2 *constants, const unsigned num_functions,
    unsigned *num_slots_available, unsigned *num_hashing_in,
    unsigned *failed) {
  // Flatten the 2-D grid of 1-D blocks into a single key index.
  const unsigned thread_index = threadIdx.x + blockIdx.x * blockDim.x +
                                blockIdx.y * blockDim.x * gridDim.x;
  if (thread_index >= n_entries) {
    return;
  }
  const unsigned key = keys[thread_index];

  // Slot selected by every hash function; optionally bump that slot's counter.
  unsigned locations[kMaxHashFunctions];
  for (unsigned f = 0; f < num_functions; ++f) {
    const unsigned slot = hash_function_inner(constants[f], key) % table_size;
    locations[f] = slot;
    if (num_hashing_in != NULL) {
      atomicAdd(num_hashing_in + slot, 1);
    }
  }

  // Count distinct slots: a slot is new if no earlier function produced it.
  unsigned num_slots = 1;
  for (unsigned f = 1; f < num_functions; ++f) {
    unsigned earlier = 0;
    while (earlier < f && locations[earlier] != locations[f]) {
      ++earlier;
    }
    if (earlier == f) {
      num_slots++;
    }
  }

  if (num_slots_available != NULL) {
    num_slots_available[thread_index] = num_slots;
  }
  if (failed != NULL && num_slots != num_functions) {
    *failed = 1;
  }
}
// Prints the hash-function constants and, depending on compile-time flags,
// distribution statistics gathered by take_hash_function_statistics_kernel.
//   num_keys / d_keys:  device array of keys.
//   table_size:         number of slots in the table.
//   constants:          host array of kNumHashFunctions (a, b) pairs; copied
//                       to the device here.
//   kNumHashFunctions:  number of hash functions in use.
// COUNT_HOW_MANY_HASH_INTO_EACH_SLOT enables the per-slot histogram;
// COUNT_HOW_MANY_HAVE_CYCLES enables the distinct-slots-per-key histogram.
void TakeHashFunctionStatistics(const unsigned num_keys, const unsigned *d_keys,
                                const unsigned table_size,
                                const uint2 *constants,
                                const unsigned kNumHashFunctions) {
  char buffer[16000];
  PrintMessage("Hash function constants: ");

  for (unsigned i = 0; i < kNumHashFunctions; ++i) {
    snprintf(buffer, sizeof(buffer), "\t%10u, %10u", constants[i].x,
             constants[i].y);
    PrintMessage(buffer);
  }

  unsigned *d_num_hashing_in = NULL;
#ifdef COUNT_HOW_MANY_HASH_INTO_EACH_SLOT
  CUDA_SAFE_CALL(
      cudaMalloc((void **)&d_num_hashing_in, sizeof(unsigned) * table_size));
  CUDA_SAFE_CALL(
      cudaMemset(d_num_hashing_in, 0, sizeof(unsigned) * table_size));
#endif

  unsigned *d_num_slots_available = NULL;
#ifdef COUNT_HOW_MANY_HAVE_CYCLES
  CUDA_SAFE_CALL(
      cudaMalloc((void **)&d_num_slots_available, sizeof(unsigned) * num_keys));
#endif

  uint2 *d_constants = NULL;
  CUDA_SAFE_CALL(
      cudaMalloc((void **)&d_constants, sizeof(uint2) * kNumHashFunctions));
  CUDA_SAFE_CALL(cudaMemcpy(d_constants, constants,
                            sizeof(uint2) * kNumHashFunctions,
                            cudaMemcpyHostToDevice));

  take_hash_function_statistics_kernel<<<ComputeGridDim(num_keys),
                                         kBlockSize>>>(
      d_keys, num_keys, table_size, d_constants, kNumHashFunctions,
      d_num_slots_available, d_num_hashing_in, NULL);
  CUDA_SAFE_CALL(cudaFree(d_constants));

#ifdef COUNT_HOW_MANY_HASH_INTO_EACH_SLOT
  unsigned *num_hashing_in = new unsigned[table_size];
  CUDA_SAFE_CALL(cudaMemcpy(num_hashing_in, d_num_hashing_in,
                            sizeof(unsigned) * table_size,
                            cudaMemcpyDeviceToHost));

  // Print a histogram of how many items are hashed into each slot. Shows
  // if average number of items hashing into each slot is low.
  std::sort(num_hashing_in, num_hashing_in + table_size);
  unsigned count = 1;  // unsigned to match the %u conversion below
  unsigned previous = num_hashing_in[0];
  snprintf(buffer, sizeof(buffer), "Num items hashing into a slot:\t");
  PrintMessage(buffer);
  for (unsigned i = 1; i < table_size; ++i) {
    if (num_hashing_in[i] != previous) {
      snprintf(buffer, sizeof(buffer), "\t(%u, %u)", previous, count);
      PrintMessage(buffer);
      previous = num_hashing_in[i];
      count = 1;
    } else {
      count++;
    }
  }
  snprintf(buffer, sizeof(buffer), "\t(%u, %u)", previous, count);
  PrintMessage(buffer);

  delete[] num_hashing_in;
  CUDA_SAFE_CALL(cudaFree(d_num_hashing_in));
#endif

#ifdef COUNT_HOW_MANY_HAVE_CYCLES
  unsigned *num_slots_available = new unsigned[num_keys];
  CUDA_SAFE_CALL(cudaMemcpy(num_slots_available, d_num_slots_available,
                            sizeof(unsigned) * num_keys,
                            cudaMemcpyDeviceToHost));

  // Not static: a static local would keep the size computed on the first
  // call even when a later call passes a different kNumHashFunctions.
  const unsigned kHistogramSize = kNumHashFunctions + 1;
  unsigned *histogram = new unsigned[kHistogramSize];
  memset(histogram, 0, sizeof(unsigned) * kHistogramSize);
  for (unsigned i = 0; i < num_keys; ++i) {
    histogram[num_slots_available[i]]++;
  }

  // Build the line by appending at an offset. Passing buffer as both the
  // destination and a "%s" argument of sprintf is undefined behaviour.
  size_t used = 0;
  int written =
      snprintf(buffer, sizeof(buffer), "Slots assigned to each key: ");
  if (written > 0) {
    used = std::min(static_cast<size_t>(written), sizeof(buffer) - 1);
  }
  for (unsigned i = 1; i < kHistogramSize && used + 1 < sizeof(buffer); ++i) {
    written = snprintf(buffer + used, sizeof(buffer) - used, "(%u, %u) ", i,
                       histogram[i]);
    if (written < 0) {
      break;
    }
    used = std::min(used + static_cast<size_t>(written), sizeof(buffer) - 1);
  }
  PrintMessage(buffer);

  delete[] histogram;
  delete[] num_slots_available;
  CUDA_SAFE_CALL(cudaFree(d_num_slots_available));
#endif
}
bool
CheckAssignedSameSlot
(
const
unsigned
N
,
const
unsigned
num_keys
,
const
unsigned
*
d_keys
,
const
unsigned
table_size
,
uint2
*
constants
)
{
unsigned
*
d_cycle_exists
=
NULL
;
uint2
*
d_constants
=
NULL
;
CUDA_SAFE_CALL
(
cudaMalloc
((
void
**
)
&
d_cycle_exists
,
sizeof
(
unsigned
)));
CUDA_SAFE_CALL
(
cudaMalloc
((
void
**
)
&
d_constants
,
sizeof
(
uint2
)
*
N
));
CUDA_SAFE_CALL
(
cudaMemset
(
d_cycle_exists
,
0
,
sizeof
(
unsigned
)));
CUDA_SAFE_CALL
(
cudaMemcpy
(
d_constants
,
constants
,
sizeof
(
uint2
)
*
N
,
cudaMemcpyHostToDevice
));
// Check if all keys were given a full set of N slots by the functions.
take_hash_function_statistics_kernel
<<<
ComputeGridDim
(
num_keys
),
kBlockSize
>>>
(
d_keys
,
num_keys
,
table_size
,
d_constants
,
N
,
NULL
,
NULL
,
d_cycle_exists
);
unsigned
cycle_exists
;
CUDA_SAFE_CALL
(
cudaMemcpy
(
&
cycle_exists
,
d_cycle_exists
,
sizeof
(
unsigned
),
cudaMemcpyDeviceToHost
));
CUDA_SAFE_CALL
(
cudaFree
(
d_cycle_exists
));
CUDA_SAFE_CALL
(
cudaFree
(
d_constants
));
return
(
cycle_exists
!=
0
);
}
void
PrintStashContents
(
const
Entry
*
d_stash
)
{
Entry
*
stash
=
new
Entry
[
cuhash
::
kStashSize
];
CUDA_SAFE_CALL
(
cudaMemcpy
(
stash
,
d_stash
,
sizeof
(
Entry
)
*
cuhash
::
kStashSize
,
cudaMemcpyDeviceToHost
));
for
(
unsigned
i
=
0
;
i
<
cuhash
::
kStashSize
;
++
i
)
{
if
(
get_key
(
stash
[
i
])
!=
kKeyEmpty
)
{
char
buffer
[
256
];
sprintf
(
buffer
,
"Stash[%u]: %u = %u"
,
i
,
get_key
(
stash
[
i
]),
get_value
(
stash
[
i
]));
PrintMessage
(
buffer
,
true
);
}
}
delete
[]
stash
;
}
};
// namespace cuhash
// Leave this at the end of the file
// Local Variables:
// mode:c++
// c-file-style: "NVIDIA"
// End:
src/cuhash/hash_functions.cpp
deleted
100644 → 0
View file @
3517290c
// nvcc (cuda) 9.0 with gcc 5.5 don't support random, so compile it in host
#include <random>
namespace cuhash {

// Seed source for the engine below.
std::random_device random_dev;
// Mersenne Twister engine, seeded once from random_dev when this translation
// unit is initialised.
std::mt19937 random_engine{random_dev()};
// Default-constructed, so it spans the whole range of unsigned.
std::uniform_int_distribution<unsigned> uint_distribution;

// Returns the next pseudo-random unsigned value from the shared engine.
unsigned generate_random_uint32() {
  const unsigned draw = uint_distribution(random_engine);
  return draw;
}

} // namespace cuhash
\ No newline at end of file
src/cuhash/hash_functions.cu
deleted
100644 → 0
View file @
3517290c
#include <cassert>
#include <cuhash/debugging.h>
#include <cuhash/hash_functions.h>
#include <cuhash/hash_table.h>
namespace
cuhash
{
void
GenerateFunctions
(
const
unsigned
N
,
const
unsigned
num_keys
,
const
unsigned
*
d_keys
,
const
unsigned
table_size
,
uint2
*
constants
)
{
bool
regenerate
=
true
;
while
(
regenerate
)
{
regenerate
=
false
;
// Generate a set of hash function constants for this build attempt.
for
(
unsigned
i
=
0
;
i
<
N
;
++
i
)
{
// uint_distribution(random_engine) % kPrimeDivisor;
// genrand_int32() % kPrimeDivisor;
unsigned
new_a
=
generate_random_uint32
()
%
kPrimeDivisor
;
constants
[
i
].
x
=
(
1
>
new_a
?
1
:
new_a
);
constants
[
i
].
y
=
generate_random_uint32
()
%
kPrimeDivisor
;
}
#ifdef FORCEFULLY_GENERATE_NO_CYCLES
// Ensure that every key gets N different slots.
regenerate
=
CheckAssignedSameSlot
(
N
,
num_keys
,
d_keys
,
table_size
,
constants
);
#endif
}
#ifdef TAKE_HASH_FUNCTION_STATISTICS
// Examine how well distributed the items are.
TakeHashFunctionStatistics
(
num_keys
,
d_keys
,
table_size
,
constants
,
N
);
#endif
}
};
// namespace cuhash
src/cuhash/hash_table.cpp
deleted
100644 → 0
View file @
3517290c
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision:$
// $Date:$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt in
// the root directory of this source distribution.
// -------------------------------------------------------------
/**
* @file hash_table.cpp
*
* @brief Implements a basic hash table that stores one value per key.
*/
#include <cuhash/debugging.h>
#include <cuhash/hash_table.h>
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <cuda_runtime_api.h>
#include <cuhash/cuda_util.h>
#include <limits>
namespace
cuhash
{
char
buffer
[
256
];
//! @name Internal
/// @{
// Computes the launch grid for n items with kBlockSize threads per block.
// The block count is rounded up so every item gets a thread; when it exceeds
// kGridSize the blocks are spread over a second grid dimension.
dim3 ComputeGridDim(unsigned n) {
  // Ceiling division written so it cannot wrap: n + kBlockSize - 1 would
  // overflow for n close to the maximum unsigned value.
  const unsigned blocks_needed =
      n / kBlockSize + (n % kBlockSize != 0 ? 1u : 0u);
  dim3 grid(blocks_needed);
  if (grid.x > kGridSize) {
    // Same ceiling division for the number of rows of kGridSize blocks.
    grid.y = grid.x / kGridSize + (grid.x % kGridSize != 0 ? 1u : 0u);
    grid.x = kGridSize;
  }
  return grid;
}
// Returns the maximum number of insertion iterations allowed per item.
//   n:             number of items being inserted.
//   table_size:    number of slots in the table.
//   num_functions: not used by the computation.
// With CONSTANT_ITERATIONS defined the limit is 7 * lg(n); otherwise an
// empirical formula based on the load factor n / table_size is used.
unsigned ComputeMaxIterations(const unsigned n, const unsigned table_size,
                              const unsigned num_functions) {
  const float lg_input_size = (float)(log((double)n) / log(2.0));

// #define CONSTANT_ITERATIONS
#ifdef CONSTANT_ITERATIONS
  // Set the maximum number of iterations to 7lg(N).
  const unsigned MAX_ITERATION_CONSTANT = 7;
  return MAX_ITERATION_CONSTANT * lg_input_size;
#else
  // Use an empirical formula for determining what the maximum number of
  // iterations should be. Works OK in most situations.
  const float load_factor = float(n) / table_size;
  const float ln_load_factor = (float)(log(load_factor) / log(2.71828183));
  const double denominator = 0.028255 + 1.1594772 * ln_load_factor;
  const double scaled_bound = -1.0 / denominator * lg_input_size;
  return (unsigned)(4.0 * ceil(scaled_bound));
#endif
}
/// @}
// Creates an empty table: no storage is allocated until Initialize is called.
HashTable::HashTable()
    : table_size_(0),
      d_contents_(nullptr),
      stash_count_(0),
      d_failures_(nullptr) {
  CUDA_CHECK_ERROR("Failed in constructor.\n");
}
// Releases any previous storage and allocates a table for up to
// max_table_entries items.
//   space_usage:   table size as a multiple of max_table_entries; must be at
//                  least kMinimumSpaceUsages[num_functions].
//   num_functions: number of hash functions, from 2 to 5.
// Returns false, after printing a message, when a parameter is out of range
// or the allocation did not yield usable pointers.
bool HashTable::Initialize(const unsigned max_table_entries,
                           const float space_usage,
                           const unsigned num_functions) {
  Release();

  // Only 2 to 5 hash functions are implemented.
  const bool function_count_supported =
      !(num_functions < 2 || num_functions > 5);
  if (!function_count_supported) {
    char message[256] =
        "Number of hash functions must be from 2 to 5; "
        "others are unimplemented.";
    PrintMessage(message, true);
    return false;
  }

  // Determine the minimum amount of slots the table requires,
  // and whether the space_usage is within range.
  const float minimum_space_usage = kMinimumSpaceUsages[num_functions];
  if (space_usage < minimum_space_usage) {
    sprintf(buffer, "Minimum possible space usage for %u functions is %f.",
            num_functions, minimum_space_usage);
    PrintMessage(buffer);
    return false;
  }

  num_hash_functions_ = num_functions;
  table_size_ = unsigned(ceil(max_table_entries * space_usage));

  // Allocate memory: the table itself plus the stash, and the failure counter.
  const unsigned slots_to_allocate = table_size_ + kStashSize;
  CUDA_SAFE_CALL(
      cudaMalloc((void **)&d_contents_, sizeof(Entry) * slots_to_allocate));
  CUDA_SAFE_CALL(cudaMalloc((void **)&d_failures_, sizeof(unsigned)));

  const bool allocated = d_contents_ && d_failures_;
  if (!allocated) {
    fprintf(stderr, "Failed to allocate %u slots.\n", slots_to_allocate);
    return false;
  }

  CUDA_CHECK_ERROR("Failed to initialize.\n");
  return true;
}
// Frees the device storage owned by the table and resets it to empty.
void HashTable::Release() {
  table_size_ = 0;

  CUDA_SAFE_CALL(cudaFree(d_contents_));
  CUDA_SAFE_CALL(cudaFree(d_failures_));

  // Drop the stale pointers.
  d_failures_ = NULL;
  d_contents_ = NULL;

  CUDA_CHECK_ERROR("Failed during release.\n");
}
// Builds the table from n key/value pairs stored on the device.
// Each attempt draws new hash-function and stash constants, clears the table
// and runs the cuckoo-hash kernel; attempts repeat until one succeeds or
// kMaxRestartAttempts is reached. Returns true when every item was inserted.
bool HashTable::Build(const unsigned n, const unsigned *d_keys,
                      const unsigned *d_values) {
  unsigned max_iterations =
      ComputeMaxIterations(n, table_size_, num_hash_functions_);
  unsigned num_failures = 1;
  unsigned num_attempts = 0;

  // Storage for statistics collection.
  unsigned *d_iterations_taken = NULL;
#ifdef TRACK_ITERATIONS
  CUDA_SAFE_CALL(
      cudaMalloc((void **)&d_iterations_taken, sizeof(unsigned) * n));
#endif

  // Track how many items ended up in the stash.
  unsigned *d_stash_count = NULL;
  CUDA_SAFE_CALL(cudaMalloc((void **)&d_stash_count, sizeof(unsigned)));
  CUDA_CHECK_ERROR("Failed before main build loop.\n");

  // Main build loop.
  while (num_failures && ++num_attempts < kMaxRestartAttempts) {
    CUDA_SAFE_CALL(cudaMemset(d_stash_count, 0, sizeof(unsigned)));

    // Generate new hash functions.
    if (num_hash_functions_ == 2)
      constants_2_.Generate(n, d_keys, table_size_);
    else if (num_hash_functions_ == 3)
      constants_3_.Generate(n, d_keys, table_size_);
    else if (num_hash_functions_ == 4)
      constants_4_.Generate(n, d_keys, table_size_);
    else
      constants_5_.Generate(n, d_keys, table_size_);

    // Reduce modulo the prime first, then clamp to at least 1. Clamping
    // before the modulo could still leave the multiplier at 0.
    stash_constants_.x =
        std::max<unsigned>(1u, generate_random_uint32() % kPrimeDivisor);
    stash_constants_.y = generate_random_uint32() % kPrimeDivisor;
    stash_count_ = 0;

    // Initialize memory.
    unsigned slots_in_table = table_size_ + kStashSize;
    CUDAWrapper::ClearTable(slots_in_table, kEntryEmpty, d_contents_);

    num_failures = 0;

    CUDAWrapper::CallCuckooHash(
        n, num_hash_functions_, d_keys, d_values, table_size_, constants_2_,
        constants_3_, constants_4_, constants_5_, max_iterations, d_contents_,
        stash_constants_, d_stash_count, d_failures_, d_iterations_taken);

    // Check if successful.
    CUDA_SAFE_CALL(cudaMemcpy(&num_failures, d_failures_, sizeof(unsigned),
                              cudaMemcpyDeviceToHost));

#ifdef COUNT_UNINSERTED
    if (num_failures) {
      printf("Failed to insert %u items.\n", num_failures);
    }
#endif
  }

  // Copy out the stash size.
  CUDA_SAFE_CALL(cudaMemcpy(&stash_count_, d_stash_count, sizeof(unsigned),
                            cudaMemcpyDeviceToHost));
  if (stash_count_ && num_failures == 0) {
#ifdef _DEBUG
    // The stash is stored directly after the table_size_ regular slots.
    PrintStashContents(d_contents_ + table_size_);
#endif
  }
  CUDA_SAFE_CALL(cudaFree(d_stash_count));

#ifdef TRACK_ITERATIONS
  if (num_failures == 0) {
    OutputBuildStatistics(n, d_iterations_taken);
  }
  CUDA_SAFE_CALL(cudaFree(d_iterations_taken));
#endif

  // Dump some info if a restart was required.
  if (num_attempts >= kMaxRestartAttempts) {
    sprintf(buffer, "Completely failed to build");
    PrintMessage(buffer, true);
  } else if (num_attempts > 1) {
    sprintf(buffer,
            "Needed %u attempts to build, you can ignore this message.",
            num_attempts);
    PrintMessage(buffer, true);
  }

  CUDA_CHECK_ERROR("Error occurred during hash table build.\n");

  return num_failures == 0;
}
// Looks up n_queries keys and writes the results to d_values.
// d_keys is a const input array and d_values the output array; both are
// passed straight to CUDAWrapper::CallHashRetrieve together with the table
// contents, the constants for every supported function count, and the stash
// constants and count.
void HashTable::Retrieve(const unsigned n_queries, const unsigned *d_keys,
                         unsigned *d_values) {
  CUDAWrapper::CallHashRetrieve(n_queries, num_hash_functions_, d_keys,
                                table_size_, d_contents_, constants_2_,
                                constants_3_, constants_4_, constants_5_,
                                stash_constants_, stash_count_, d_values);
}
};
// namespace cuhash
// Leave this at the end of the file
// Local Variables:
// mode:c++
// c-file-style: "NVIDIA"
// End:
src/cuhash/hash_table.cu
deleted
100644 → 0
View file @
3517290c
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision:$
// $Date:$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt in
// the root directory of this source distribution.
// -------------------------------------------------------------
/**
* @file hash_table.cu
*
* @brief Hides all of the CUDA calls from the actual CPP file.
*/
#include <cuhash/cuda_util.h>
#include <cuhash/debugging.h>
#include <cuhash/definitions.h>
#include <cuhash/hash_table.cuh>
#include <cuda.h>
namespace
cuhash
{
namespace
CUDAWrapper
{
// Launches the clear_table<Entry> kernel with fill_value and d_contents for
// slots_in_table slots, then runs the CUDA error check.
//
//   slots_in_table  number of slots passed to the kernel; also used to size
//                   the launch grid via ComputeGridDim
//   fill_value      entry value handed to the kernel
//   d_contents      table storage handed to the kernel
void ClearTable(const unsigned slots_in_table, const Entry fill_value,
                Entry *d_contents) {
  // Name the launch configuration so the kernel call reads on one line.
  const auto grid_dim = ComputeGridDim(slots_in_table);

  clear_table<Entry><<<grid_dim, kBlockSize>>>(slots_in_table, fill_value,
                                               d_contents);

  TV_CHECK_CUDA_ERR_V2("Error occurred during hash table clear.\n");
}
// Resets the failure counter and launches the CuckooHash kernel with the set
// of hash constants that matches num_hash_functions.
//
//   n                   number of key/value pairs; also sizes the launch grid
//   num_hash_functions  selects constants_2 / _3 / _4; any other value
//                       selects constants_5 (same as the original else branch)
//   d_keys, d_values    input keys and values handed to the kernel
//   table_size          table size handed to the kernel
//   max_iterations      iteration limit handed to the kernel
//   d_contents          table storage handed to the kernel
//   stash_constants     stash hash constants handed to the kernel
//   d_stash_count       stash counter handed to the kernel
//   d_failures          single unsigned counter, zeroed here before launch
//   d_iterations_taken  per-build statistics buffer handed to the kernel
void CallCuckooHash(const unsigned n, const unsigned num_hash_functions,
                    const unsigned *d_keys, const unsigned *d_values,
                    const unsigned table_size, const Functions<2> constants_2,
                    const Functions<3> constants_3,
                    const Functions<4> constants_4,
                    const Functions<5> constants_5,
                    const unsigned max_iterations, Entry *d_contents,
                    uint2 stash_constants, unsigned *d_stash_count,
                    unsigned *d_failures, unsigned *d_iterations_taken) {
  // Build the table.
  // The failure counter must start at zero; check the call instead of
  // silently ignoring its status, otherwise a failed reset would leave a
  // stale count behind.
  CUDA_SAFE_CALL(cudaMemset(d_failures, 0, sizeof(unsigned)));

  switch (num_hash_functions) {
    case 2:
      CuckooHash<<<ComputeGridDim(n), kBlockSize>>>(
          n, d_keys, d_values, table_size, constants_2, max_iterations,
          d_contents, stash_constants, d_stash_count, d_failures,
          d_iterations_taken);
      break;
    case 3:
      CuckooHash<<<ComputeGridDim(n), kBlockSize>>>(
          n, d_keys, d_values, table_size, constants_3, max_iterations,
          d_contents, stash_constants, d_stash_count, d_failures,
          d_iterations_taken);
      break;
    case 4:
      CuckooHash<<<ComputeGridDim(n), kBlockSize>>>(
          n, d_keys, d_values, table_size, constants_4, max_iterations,
          d_contents, stash_constants, d_stash_count, d_failures,
          d_iterations_taken);
      break;
    default:
      // Every other count uses the five-function constants.
      CuckooHash<<<ComputeGridDim(n), kBlockSize>>>(
          n, d_keys, d_values, table_size, constants_5, max_iterations,
          d_contents, stash_constants, d_stash_count, d_failures,
          d_iterations_taken);
      break;
  }

  CUDA_CHECK_ERROR("Error occurred during hash table build.\n");
}
// Launches the hash_retrieve kernel with the set of hash constants that
// matches num_hash_functions, then runs the CUDA error check.
//
//   n_queries           number of queries; also sizes the launch grid
//   num_hash_functions  selects constants_2 / _3 / _4; any other value
//                       selects constants_5
//   d_keys              query keys handed to the kernel
//   table_size          table size handed to the kernel
//   d_contents          table storage handed to the kernel
//   stash_constants     stash hash constants handed to the kernel
//   stash_count         stash count handed to the kernel
//   d_values            output buffer handed to the kernel
//
// When TRACK_ITERATIONS is defined, a probe-count buffer of n_queries
// unsigned values is allocated, passed to the kernel, reported through
// OutputRetrievalStatistics and freed. Otherwise the kernel receives NULL.
void CallHashRetrieve(const unsigned n_queries,
                      const unsigned num_hash_functions,
                      const unsigned *d_keys, const unsigned table_size,
                      const Entry *d_contents,
                      const Functions<2> constants_2,
                      const Functions<3> constants_3,
                      const Functions<4> constants_4,
                      const Functions<5> constants_5,
                      const uint2 stash_constants, const unsigned stash_count,
                      unsigned *d_values) {
  unsigned *d_retrieval_probes = NULL;

#ifdef TRACK_ITERATIONS
  CUDA_SAFE_CALL(cudaMalloc((void **)&d_retrieval_probes,
                            sizeof(unsigned) * n_queries));
#endif

  switch (num_hash_functions) {
    case 2:
      hash_retrieve<<<ComputeGridDim(n_queries), kBlockSize>>>(
          n_queries, d_keys, table_size, d_contents, constants_2,
          stash_constants, stash_count, d_values, d_retrieval_probes);
      break;
    case 3:
      hash_retrieve<<<ComputeGridDim(n_queries), kBlockSize>>>(
          n_queries, d_keys, table_size, d_contents, constants_3,
          stash_constants, stash_count, d_values, d_retrieval_probes);
      break;
    case 4:
      hash_retrieve<<<ComputeGridDim(n_queries), kBlockSize>>>(
          n_queries, d_keys, table_size, d_contents, constants_4,
          stash_constants, stash_count, d_values, d_retrieval_probes);
      break;
    default:
      // Every other count uses the five-function constants.
      hash_retrieve<<<ComputeGridDim(n_queries), kBlockSize>>>(
          n_queries, d_keys, table_size, d_contents, constants_5,
          stash_constants, stash_count, d_values, d_retrieval_probes);
      break;
  }

  CUDA_CHECK_ERROR("Retrieval failed.\n");

#ifdef TRACK_ITERATIONS
  OutputRetrievalStatistics(n_queries, d_retrieval_probes,
                            num_hash_functions);
  CUDA_SAFE_CALL(cudaFree(d_retrieval_probes));
#endif
}
};
// namespace CUDAWrapper
};
// namespace cuhash
src/cuhash/main.cc
deleted
100644 → 0
View file @
3517290c
#include <cuda.h>
#include <cuhash/hash_table.h>
int
main
()
{
auto
table
=
cuhash
::
HashTable
();
table
.
Initialize
(
10
,
2.0
);
const
int
N
=
10
;
// ハッシュテーブルに格納するデータ
int
keys
[
N
]
=
{
1
,
6
,
4
,
9
,
0
,
3
,
7
,
2
,
5
,
8
};
int
vals
[
N
]
=
{
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
};
// デバイスメモリにコピー
int
*
d_keys
,
*
d_vals
;
cudaMalloc
((
void
**
)
&
d_keys
,
sizeof
(
int
)
*
N
);
cudaMemcpy
(
d_keys
,
keys
,
sizeof
(
int
)
*
N
,
cudaMemcpyHostToDevice
);
cudaMalloc
((
void
**
)
&
d_vals
,
sizeof
(
int
)
*
N
);
cudaMemcpy
(
d_vals
,
vals
,
sizeof
(
int
)
*
N
,
cudaMemcpyHostToDevice
);
// ハッシュテーブルにクエリするデータ
int
input
[
N
]
=
{
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
};
int
output
[
N
];
// デバイスメモリにコピー
int
*
d_input
,
*
d_output
;
cudaMalloc
((
void
**
)
&
d_input
,
sizeof
(
int
)
*
N
);
cudaMemcpy
(
d_input
,
input
,
sizeof
(
int
)
*
N
,
cudaMemcpyHostToDevice
);
cudaMalloc
((
void
**
)
&
d_output
,
sizeof
(
int
)
*
N
);
cudaMemset
(
d_output
,
0
,
sizeof
(
int
)
*
N
);
bool
s
=
table
.
Build
(
N
,
(
const
unsigned
int
*
)
d_keys
,
(
const
unsigned
int
*
)
d_vals
);
std
::
cout
<<
s
<<
std
::
endl
;
table
.
Retrieve
(
N
,
(
const
unsigned
int
*
)
d_input
,
(
unsigned
int
*
)
d_output
);
std
::
cout
<<
s
<<
std
::
endl
;
cudaMemcpy
(
output
,
d_output
,
sizeof
(
int
)
*
N
,
cudaMemcpyDeviceToHost
);
for
(
int
i
=
0
;
i
<
N
;
++
i
)
{
printf
(
"%d
\n
"
,
output
[
i
]);
}
return
0
;
}
\ No newline at end of file
src/spconv/CMakeLists.txt
deleted
100644 → 0
View file @
3517290c
# Host sources that are always part of the spconv shared library.
set(ALL_FILES
    all.cc
    indice.cc
    reordering.cc
    maxpool.cc
    nms.cc
    spconv_ops.cc
    pool_ops.cc
    point2voxel_ops.cc)

# Extra sources that are only compiled when CUDA support is requested.
if(SPCONV_BuildCUDA)
  list(APPEND ALL_FILES
       indice.cu
       reordering.cu
       maxpool.cu
       pillar_scatter.cu
       cublas_gemm.cc
       point2voxel.cu
       fused_conv.cu)
endif()

add_library(spconv SHARED ${ALL_FILES})

# OpenMP is optional: link it only when the C++ component was found.
find_package(OpenMP)
if(OpenMP_CXX_FOUND)
  target_link_libraries(spconv PUBLIC OpenMP::OpenMP_CXX)
endif()

target_include_directories(spconv PRIVATE ${ALL_INCLUDE} ${MP11_INCLUDE})

set_target_properties(spconv PROPERTIES
                      CUDA_STANDARD 14
                      CXX_STANDARD 14
                      CUDA_SEPARABLE_COMPILATION ON)

# CUDA builds additionally link the cuhash and spgemm libraries.
if(SPCONV_BuildCUDA)
  target_link_libraries(spconv PRIVATE ${ALL_LIBS} cuhash spgemm)
else()
  target_link_libraries(spconv PRIVATE ${ALL_LIBS})
endif()

install(TARGETS spconv DESTINATION lib)
Prev
1
2
3
4
5
6
7
8
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment