Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
one
spconv
Commits
238d6a83
Commit
238d6a83
authored
Sep 23, 2022
by
yan.yan
Browse files
add a simple example for c++ inference
parent
ce8a91e4
Changes
10
Show whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
311 additions
and
250 deletions
+311
-250
.github/workflows/stale.yaml
.github/workflows/stale.yaml
+1
-1
example/libspconv/CMakeLists.txt
example/libspconv/CMakeLists.txt
+1
-1
example/libspconv/benchmark-pc.jarr
example/libspconv/benchmark-pc.jarr
+0
-0
example/libspconv/main.cc
example/libspconv/main.cc
+0
-28
example/libspconv/main.cu
example/libspconv/main.cu
+168
-0
example/libspconv/run_build.sh
example/libspconv/run_build.sh
+1
-1
setup.py
setup.py
+2
-2
spconv/algo.py
spconv/algo.py
+64
-147
spconv/csrc/sparse/convops.py
spconv/csrc/sparse/convops.py
+61
-62
test/dev2.py
test/dev2.py
+13
-8
No files found.
.github/workflows/stale.yaml
View file @
238d6a83
...
...
@@ -2,7 +2,7 @@ name: 'Close stale issues and PRs'
on
:
schedule
:
-
cron
:
'
30
1
*
*
*'
-
cron
:
'
30
1
1
*
*'
workflow_dispatch
:
inputs
:
logLevel
:
...
...
example/libspconv/CMakeLists.txt
View file @
238d6a83
...
...
@@ -7,7 +7,7 @@ set(CUMM_DISABLE_CMAKE_INSTALL ON CACHE BOOL "enable X functionality" FORCE)
add_subdirectory
(
cumm
)
add_subdirectory
(
spconv
)
add_executable
(
main main.c
c
)
add_executable
(
main main.c
u
)
# SPCONV2_INCLUDE_PATH comes from spconv/CMakeLists.txt
target_include_directories
(
main PRIVATE
${
SPCONV2_INCLUDE_PATH
}
)
target_link_libraries
(
main spconv cumm::cumm
)
example/libspconv/benchmark-pc.jarr
0 → 100644
View file @
238d6a83
File added
example/libspconv/main.cc
deleted
100644 → 0
View file @
ce8a91e4
#include <spconvlib/cumm/gemm/main/GemmMainUnitTest.h>
#include <spconvlib/spconv/csrc/sparse/all/SpconvOps.h>
#include <spconvlib/spconv/csrc/sparse/alloc/StaticAllocator.h>
#include <spconvlib/spconv/csrc/sparse/convops/spops/ConvGemmOps.h>
#include <spconvlib/spconv/csrc/sparse/inference/InferenceOps.h>
#include <spconvlib/spconv/csrc/sparse/convops/SimpleExternalSpconvMatmul.h>
#include <spconvlib/spconv/csrc/sparse/convops/gemmops/GemmTunerSimple.h>
#include <spconvlib/spconv/csrc/sparse/convops/spops/ConvGemmOps.h>
// Short aliases for the generated spconvlib types referenced by this example.

// Allocator type from the spconv "alloc" module.
using StaticAllocator = spconvlib::spconv::csrc::sparse::alloc::StaticAllocator;

// Entry point for the sparse-convolution helper operations.
using SpconvOps = spconvlib::spconv::csrc::sparse::all::SpconvOps;

// cumm convolution main class.
using ConvMain = spconvlib::cumm::conv::main::ConvMainUnitTest;

// Convolution tuner and GEMM-based convolution operations.
using ConvTunerSimple = spconvlib::spconv::csrc::sparse::convops::spops::ConvTuner;
using ConvGemmOps = spconvlib::spconv::csrc::sparse::convops::spops::ConvGemmOps;

// External matmul helper from the convops module.
using SimpleExternalSpconvMatmul =
    spconvlib::spconv::csrc::sparse::convops::SimpleExternalSpconvMatmul;

// Inference-only operations.
using InferenceOps = spconvlib::spconv::csrc::sparse::inference::InferenceOps;
// Minimal smoke test: prints a greeting through tensorview's ssprint and exits.
int main() {
  tv::ssprint("Hello libspconv!!!");
  return 0;
}
\ No newline at end of file
example/libspconv/main.cu
0 → 100644
View file @
238d6a83
#include <spconvlib/cumm/gemm/main/GemmMainUnitTest.h>
#include <spconvlib/spconv/csrc/sparse/all/SpconvOps.h>
#include <spconvlib/spconv/csrc/sparse/alloc/StaticAllocator.h>
#include <spconvlib/spconv/csrc/sparse/convops/spops/ConvGemmOps.h>
#include <spconvlib/spconv/csrc/sparse/inference/InferenceOps.h>
#include <spconvlib/spconv/csrc/sparse/all/ops3d/Point2Voxel.h>
#include <spconvlib/spconv/csrc/sparse/convops/SimpleExternalSpconvMatmul.h>
#include <spconvlib/spconv/csrc/sparse/convops/gemmops/GemmTunerSimple.h>
#include <spconvlib/spconv/csrc/sparse/convops/spops/ConvGemmOps.h>
#include <tensorview/io/jsonarray.h>
#include <tensorview/parallel/map.h>
// Short aliases for the generated spconvlib types used by the example below.

// Allocator that is constructed from a map of tensors (see main()).
using StaticAllocator = spconvlib::spconv::csrc::sparse::alloc::StaticAllocator;

// Entry point for indice-pair generation and related helper operations.
using SpconvOps = spconvlib::spconv::csrc::sparse::all::SpconvOps;

// cumm convolution main class; main() uses it to enumerate conv algorithm descriptors.
using ConvMain = spconvlib::cumm::conv::main::ConvMainUnitTest;

// Convolution tuner and the implicit-GEMM convolution operations.
using ConvTunerSimple = spconvlib::spconv::csrc::sparse::convops::spops::ConvTuner;
using ConvGemmOps = spconvlib::spconv::csrc::sparse::convops::spops::ConvGemmOps;

// External matmul helper from the convops module (not used by main()).
using SimpleExternalSpconvMatmul =
    spconvlib::spconv::csrc::sparse::convops::SimpleExternalSpconvMatmul;

// Inference-only operations (not used by main()).
using InferenceOps = spconvlib::spconv::csrc::sparse::inference::InferenceOps;

// 3D point-cloud to voxel converter.
using Point2VoxelGPU3D = spconvlib::spconv::csrc::sparse::all::ops3d::Point2Voxel;
int
main
(
int
argc
,
char
**
argv
){
tv
::
ssprint
(
"Hello libspconv!!!"
);
TV_ASSERT_RT_ERR
(
argc
==
2
,
"usage: main /path/to/benchmark-pc.jarr, you can find it in example/libspconv."
)
std
::
string
path
=
argv
[
1
];
Point2VoxelGPU3D
p2v
{{
0.1
,
0.1
,
0.1
},
{
-
80
,
-
80
,
-
2
,
80
,
80
,
6
},
3
,
200000
,
1
};
auto
pc_jarr
=
tv
::
io
::
load_from_file
(
path
);
auto
pc
=
pc_jarr
.
tensors
.
at
(
0
).
cuda
();
// you should use point_to_voxel_hash_static in tensorrt and manage hash data in tensorrt workspace.
auto
p2v_res
=
p2v
.
point_to_voxel_hash
(
pc
);
tv
::
Tensor
voxels
=
std
::
get
<
0
>
(
p2v_res
).
cuda
().
view
(
-
1
,
3
);
auto
indices_without_bs
=
std
::
get
<
1
>
(
p2v_res
);
auto
indices
=
tv
::
zeros
({
indices_without_bs
.
dim
(
0
),
4
},
tv
::
int32
,
0
);
indices
.
slice
(
1
,
1
,
4
,
1
,
false
,
false
).
copy_2d_pitched_
(
indices_without_bs
);
auto
indices_cpu
=
indices
.
cpu
();
auto
indices_cpu_data_ptr
=
indices_cpu
.
data_ptr
<
int32_t
>
();
for
(
int
i
=
0
;
i
<
5
;
++
i
){
auto
cur_indices_cpu_data_ptr
=
indices_cpu_data_ptr
+
i
*
4
;
tv
::
ssprint
(
cur_indices_cpu_data_ptr
[
0
],
cur_indices_cpu_data_ptr
[
1
],
cur_indices_cpu_data_ptr
[
2
],
cur_indices_cpu_data_ptr
[
3
]);
}
auto
num_per_voxel
=
std
::
get
<
2
>
(
p2v_res
);
tv
::
ssprint
(
"num voxels"
,
voxels
.
shape
());
auto
voxels_f16
=
tv
::
zeros
(
voxels
.
shape
(),
tv
::
float16
,
0
);
auto
voxels_f16_ptr
=
voxels_f16
.
data_ptr
<
__half
>
();
auto
voxels_ptr
=
voxels
.
data_ptr
<
float
>
();
tv
::
kernel_1d_map
(
0
,
voxels_f16
.
size
(),
[
=
]
TV_GPU_LAMBDA
(
size_t
i
)
mutable
{
voxels_f16_ptr
[
i
]
=
__half
(
voxels_ptr
[
i
]);
});
// out channels, ksize, in channels
tv
::
Tensor
weights
=
tv
::
zeros
({
64
,
3
,
3
,
3
,
3
},
tv
::
float16
,
0
);
tv
::
Tensor
bias
=
tv
::
zeros
({
64
},
tv
::
float16
,
0
);
int
KV
=
27
;
int
out_inds_num_limit
=
100000
;
// upper bound of number of output indices.
std
::
vector
<
int32_t
>
ksize
{
3
,
3
,
3
};
std
::
vector
<
int32_t
>
padding
{
1
,
1
,
1
};
std
::
vector
<
int32_t
>
dilation
{
1
,
1
,
1
};
std
::
vector
<
int32_t
>
stride
{
1
,
1
,
1
};
int
ndim
=
3
;
auto
p2v_grid_size
=
p2v
.
get_grid_size
();
std
::
vector
<
int32_t
>
input_dims
(
p2v_grid_size
.
begin
(),
p2v_grid_size
.
end
());
auto
out_dims
=
SpconvOps
::
get_conv_output_size
(
input_dims
,
ksize
,
stride
,
padding
,
dilation
);
tv
::
ssprint
(
ksize
,
input_dims
,
out_dims
);
std
::
vector
<
int64_t
>
output_dims_i64
(
out_dims
.
begin
(),
out_dims
.
end
());
int64_t
out_spatial_volume
=
std
::
accumulate
(
output_dims_i64
.
begin
(),
output_dims_i64
.
end
(),
int64_t
(
1
),
std
::
multiplies
<
int64_t
>
());
bool
use_int64_hash_k
=
out_spatial_volume
>=
int64_t
(
std
::
numeric_limits
<
int
>::
max
());
int
num_act_in
=
voxels
.
dim
(
0
);
bool
is_subm
=
true
;
bool
direct_table
=
true
;
int
batch_size
=
1
;
int
transpose
=
false
;
bool
use_direct_table
=
direct_table
&&
!
is_subm
;
auto
conv_algo
=
tv
::
gemm
::
SparseConvAlgo
::
kMaskImplicitGemm
;
auto
max_act_out_theory
=
SpconvOps
::
get_handcrafted_max_act_out
(
num_act_in
,
ksize
,
stride
,
padding
,
dilation
);
int
workspace_size
=
SpconvOps
::
get_indice_gen_workspace_size
(
KV
,
num_act_in
,
out_inds_num_limit
,
max_act_out_theory
,
is_subm
,
use_int64_hash_k
,
use_direct_table
);
// you should return workspace size in tensorrt plugin method.
tv
::
Tensor
workspace
=
tv
::
empty
({
workspace_size
},
tv
::
uint8
,
0
);
// get tensor map required by pair gen from workspace
auto
ws_tensors
=
SpconvOps
::
get_indice_gen_tensors_from_workspace
(
workspace
.
raw_data
(),
KV
,
num_act_in
,
is_subm
?
num_act_in
:
out_inds_num_limit
,
max_act_out_theory
,
is_subm
,
use_int64_hash_k
,
use_direct_table
);
// create output tensors and insert them to static allocator
int
pair_size
=
is_subm
?
num_act_in
:
out_inds_num_limit
;
tv
::
Tensor
pair_fwd
=
tv
::
empty
({
KV
,
pair_size
},
tv
::
int32
,
0
);
bool
is_split_mask
=
conv_algo
==
tv
::
gemm
::
SparseConvAlgo
::
kMaskSplitImplicitGemm
;
int
mask_count
=
is_split_mask
?
2
:
1
;
tv
::
Tensor
pair_mask_fwd
=
tv
::
empty
({
mask_count
,
pair_size
},
tv
::
int32
,
0
);
tv
::
Tensor
mask_argsort_fwd
=
tv
::
empty
({
mask_count
,
pair_size
},
tv
::
int32
,
0
);
tv
::
Tensor
out_inds
=
tv
::
empty
({
out_inds_num_limit
,
ndim
+
1
},
tv
::
int32
,
0
);
tv
::
Tensor
indices_kernel_num
=
tv
::
zeros
({
KV
},
tv
::
int32
,
0
);
cudaStream_t
stream
=
0
;
ws_tensors
.
insert
({
SPCONV_ALLOC_PAIR_FWD
,
pair_fwd
});
ws_tensors
.
insert
({
SPCONV_ALLOC_PAIR_MASK
,
pair_mask_fwd
});
ws_tensors
.
insert
({
SPCONV_ALLOC_MASK_ARG_SORT
,
mask_argsort_fwd
});
ws_tensors
.
insert
({
SPCONV_ALLOC_OUT_INDICES
,
out_inds
});
ws_tensors
.
insert
({
SPCONV_ALLOC_INDICE_NUM_PER_LOC
,
indices_kernel_num
});
StaticAllocator
alloc
(
ws_tensors
);
auto
pair_res
=
SpconvOps
::
get_indice_pairs_implicit_gemm
(
alloc
,
indices
,
batch_size
,
input_dims
,
static_cast
<
int
>
(
conv_algo
),
ksize
,
stride
,
padding
,
dilation
,
{
0
,
0
,
0
},
is_subm
,
transpose
,
false
,
reinterpret_cast
<
std
::
uintptr_t
>
(
stream
),
out_inds_num_limit
,
tv
::
CUDAKernelTimer
(
false
),
use_direct_table
);
int
num_act_out
=
std
::
get
<
1
>
(
pair_res
);
tv
::
Tensor
out_features
=
tv
::
empty
({
num_act_out
,
64
},
tv
::
float16
,
0
);
// this function is very slow, don't forget to cache result.
auto
arch
=
ConvGemmOps
::
get_compute_capability
();
int
kv
=
pair_fwd
.
dim
(
0
);
bool
is_mask_split
=
pair_mask_fwd
.
dim
(
0
)
>
1
;
int
mask_split_cnt
=
pair_mask_fwd
.
dim
(
0
);
tv
::
Tensor
mask_tensor
=
tv
::
zeros
({
pair_mask_fwd
.
dim
(
0
)},
tv
::
uint32
,
-
1
);
auto
mask_tensor_ptr
=
mask_tensor
.
data_ptr
<
uint32_t
>
();
if
(
is_mask_split
)
{
auto
kv_div_2
=
kv
/
2
;
auto
remain
=
kv
-
kv_div_2
;
uint64_t
mask_np_1
=
1
;
uint64_t
first
=
((
mask_np_1
<<
remain
)
-
1
);
uint64_t
second
=
((
mask_np_1
<<
kv_div_2
)
-
1
)
<<
remain
;
mask_tensor_ptr
[
0
]
=
uint32_t
(
first
);
mask_tensor_ptr
[
1
]
=
uint32_t
(
second
);
}
else
{
mask_tensor_ptr
[
0
]
=
0xffffffff
;
}
std
::
vector
<
tv
::
Tensor
>
pair_mask_splits
;
std
::
vector
<
tv
::
Tensor
>
mask_argsort_splits
;
for
(
int
i
=
0
;
i
<
mask_split_cnt
;
++
i
)
{
pair_mask_splits
.
push_back
(
pair_mask_fwd
[
i
]);
mask_argsort_splits
.
push_back
(
mask_argsort_fwd
[
i
]);
}
std
::
unordered_map
<
std
::
string
,
tv
::
Tensor
>
tensor_dict
{
{
SPCONV_ALLOC_FEATURES
,
voxels_f16
},
{
SPCONV_ALLOC_FILTERS
,
weights
},
{
SPCONV_ALLOC_OUT_FEATURES
,
out_features
}};
StaticAllocator
alloc2
(
tensor_dict
);
ConvTunerSimple
tuner
(
ConvMain
::
get_all_conv_algo_desp
());
auto
conv_res
=
ConvGemmOps
::
implicit_gemm
(
alloc2
,
tuner
,
voxels_f16
,
weights
,
pair_fwd
,
pair_mask_splits
,
mask_argsort_splits
,
num_act_out
,
mask_tensor
,
arch
,
false
,
is_subm
,
reinterpret_cast
<
std
::
uintptr_t
>
(
stream
),
tv
::
CUDAKernelTimer
(
false
),
false
,
false
,
bias
,
1.0
,
0.0
,
tv
::
gemm
::
Activation
::
kReLU
);
// p2v.point_to_voxel_hash()
return
0
;
}
\ No newline at end of file
example/libspconv/run_build.sh
View file @
238d6a83
...
...
@@ -16,4 +16,4 @@ python -m spconv.gencode --include=$SCRIPT_DIR/spconv/include --src=$SCRIPT_DIR/
mkdir
-p
$SCRIPT_DIR
/build
cd
$SCRIPT_DIR
/build
cmake ..
cmake
--build
$SCRIPT_DIR
/build
--config
Release
-j
8
cmake
--build
$SCRIPT_DIR
/build
--config
Release
-j
8
--verbose
setup.py
View file @
238d6a83
...
...
@@ -48,11 +48,11 @@ DESCRIPTION = 'spatial sparse convolution'
URL
=
'https://github.com/traveller59/spconv'
EMAIL
=
'yanyan.sub@outlook.com'
AUTHOR
=
'Yan Yan'
REQUIRES_PYTHON
=
'>=3.
6
'
REQUIRES_PYTHON
=
'>=3.
7
'
VERSION
=
None
# What packages are required for this module to be executed?
REQUIRED
=
[
"pccm>=0.
3.5
"
,
"pybind11>=2.6.0"
,
"fire"
,
"numpy"
,
*
deps
]
REQUIRED
=
[
"pccm>=0.
4.0"
,
"ccimport>=0.4.0
"
,
"pybind11>=2.6.0"
,
"fire"
,
"numpy"
,
*
deps
]
# What packages are optional?
EXTRAS
=
{
...
...
spconv/algo.py
View file @
238d6a83
...
...
@@ -55,7 +55,6 @@ from spconv.core_cc.csrc.sparse.convops.convops import ConvTunerSimple as ConvTu
ALL_ALGO_DESPS
=
GemmMainUnitTest
.
get_all_algo_desp
()
ALL_CONV_ALGO_DESPS
=
ConvMainUnitTest
.
get_all_conv_algo_desp
()
_GEMM_STATIC_KEY
=
Tuple
[
bool
,
bool
,
bool
,
int
,
int
,
int
,
int
,
str
]
class
SimpleGemmAlgoMeta
:
...
...
@@ -205,6 +204,8 @@ class ConvTunerSimple(ConvTunerSimpleBase):
self
.
_nvrtc_caches
[
key
]
=
nvrtc_params
return
nvrtc_params
_GEMM_STATIC_KEY
=
Tuple
[
bool
,
bool
,
bool
,
int
,
int
,
int
,
int
]
class
SimpleGemm
:
def
__init__
(
self
,
prebuilt_desps
:
List
[
GemmAlgoDesp
])
->
None
:
...
...
@@ -256,7 +257,7 @@ class SimpleGemm:
@
staticmethod
def
get_static_key
(
d
:
GemmAlgoDesp
)
->
_GEMM_STATIC_KEY
:
return
(
d
.
trans_a
,
d
.
trans_b
,
d
.
trans_c
,
d
.
dtype_a
,
d
.
dtype_b
,
d
.
dtype_c
,
d
.
shuffle_type
.
value
,
d
.
algo
)
d
.
dtype_c
,
d
.
shuffle_type
.
value
)
def
device_synchronize
(
self
):
return
GemmMainUnitTest
.
device_synchronize
()
...
...
@@ -310,15 +311,19 @@ class SimpleGemm:
avail_algos
=
get_available_algo_str_from_arch
(
arch
)
finally_algos
:
List
[
GemmAlgoDesp
]
=
[]
# print(self.static_key_to_desps)
for
algo
in
avail_algos
:
static_key
=
(
trans_a
,
trans_b
,
trans_c
,
a
.
dtype
,
b
.
dtype
,
c
.
dtype
,
shuffle_type
.
value
,
algo
)
shuffle_type
.
value
)
# for algo in avail_algos:
# static_key = (trans_a, trans_b, trans_c, a.dtype, b.dtype, c.dtype,
# shuffle_type.value)
# print(static_key)
desps
=
self
.
static_key_to_desps
.
get
(
static_key
,
None
)
if
desps
is
None
or
len
(
desps
)
==
0
:
continue
return
finally_algos
# print(desps)
for
desp
in
desps
:
if
arch
<
desp
.
min_arch
:
continue
# skip volta tensor op since it is very slow in architectures except volta.
if
arch
>=
(
7
,
5
)
and
desp
.
algo
==
GemmAlgo
.
Volta
.
value
:
continue
...
...
@@ -334,95 +339,6 @@ class SimpleGemm:
finally_algos
.
append
(
desp
)
return
finally_algos
def
select
(
self
,
a
:
tv
.
Tensor
,
b
:
tv
.
Tensor
,
c
:
tv
.
Tensor
,
trans_a
:
bool
,
trans_b
:
bool
,
trans_c
:
bool
,
arch
:
Tuple
[
int
,
int
],
shuffle_type
:
ShuffleStrideType
=
ShuffleStrideType
.
NoShuffle
,
a_inds
:
tv
.
Tensor
=
tv
.
Tensor
(),
b_inds
:
tv
.
Tensor
=
tv
.
Tensor
(),
c_inds
:
tv
.
Tensor
=
tv
.
Tensor
(),
hint
:
int
=
AlgoHint
.
NoHint
.
value
):
m
,
n
,
k
=
GemmMainUnitTest
.
extract_mnk
(
a
.
shape
,
b
.
shape
,
trans_a
,
trans_b
,
trans_c
,
shuffle_type
.
value
,
a_inds
.
shape
,
b_inds
.
shape
,
c_inds
.
shape
)
if
trans_c
:
trans_a
=
not
trans_a
trans_b
=
not
trans_b
trans_a
,
trans_b
=
trans_b
,
trans_a
a
,
b
=
b
,
a
trans_c
=
False
avail_algos
=
get_available_algo_str_from_arch
(
arch
)
finally_algos
:
List
[
GemmAlgoDesp
]
=
[]
for
algo
in
avail_algos
:
static_key
=
(
trans_a
,
trans_b
,
trans_c
,
a
.
dtype
,
b
.
dtype
,
c
.
dtype
,
shuffle_type
.
value
,
algo
)
desps
=
self
.
static_key_to_desps
.
get
(
static_key
,
None
)
if
desps
is
None
or
len
(
desps
)
==
0
:
continue
meta
=
self
.
static_key_to_meta
[
static_key
]
# for shuffle stride algos, we need to make channel tile size as large as possible.
# so if ShuffleAC, we need to make k largest.
selected_algo_desps
=
GemmMainUnitTest
.
simple_select_tile_shape
(
m
,
n
,
k
,
meta
.
tile_ms
,
meta
.
tile_ns
,
meta
.
tile_ks
,
meta
.
tile_shape_to_algos
,
large_k_first
=
shuffle_type
==
shuffle_type
.
ShuffleAC
)
if
not
selected_algo_desps
:
candidate
=
desps
else
:
candidate
=
[
desps
[
i
]
for
i
in
selected_algo_desps
]
# select by hint
if
hint
==
0
:
return
candidate
[
0
]
if
hint
&
(
AlgoHint
.
Fowrard
.
value
|
AlgoHint
.
BackwardInput
.
value
):
# m may be huge, n and k are small
# don't need mixed precision
# don't need splitk
finally_algos
=
[]
if
a
.
dtype
==
tv
.
float16
:
dacc
=
tv
.
float16
dcomp
=
tv
.
float16
for
can
in
candidate
:
if
can
.
dacc
==
dacc
and
can
.
dcomp
==
dcomp
:
finally_algos
.
append
(
can
)
else
:
finally_algos
=
candidate
elif
hint
&
AlgoHint
.
BackwardWeight
.
value
:
# k is huge
# don't support i8
# if f16, acc and comp must be f32
finally_algos
=
[]
candidate_filtered
:
List
[
GemmAlgoDesp
]
=
list
(
filter
(
lambda
x
:
x
.
split_k_serial
,
candidate
))
if
not
candidate_filtered
:
candidate_filtered
=
candidate
if
a
.
dtype
==
tv
.
int8
:
continue
elif
a
.
dtype
==
tv
.
float16
:
dacc
=
tv
.
float32
dcomp
=
tv
.
float32
for
can
in
candidate_filtered
:
if
can
.
dacc
==
dacc
and
can
.
dcomp
==
dcomp
:
finally_algos
.
append
(
can
)
else
:
finally_algos
=
candidate_filtered
else
:
return
candidate
[
0
]
# print(finally_algos)
if
finally_algos
:
return
finally_algos
[
0
]
return
None
def
get_tuned_algo
(
self
,
...
...
@@ -672,7 +588,7 @@ class SimpleGemm:
return
algo_desp
_CONV_STATIC_KEY
=
Tuple
[
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
str
,
int
]
_CONV_STATIC_KEY
=
Tuple
[
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
]
class
SimpleConv
:
...
...
@@ -729,7 +645,7 @@ class SimpleConv:
def
get_static_key
(
d
:
ConvAlgoDesp
)
->
_CONV_STATIC_KEY
:
return
(
d
.
layout_i
.
value
,
d
.
layout_w
.
value
,
d
.
layout_o
.
value
,
d
.
interleave_i
,
d
.
interleave_w
,
d
.
interleave_o
,
d
.
dtype_input
,
d
.
dtype_weight
,
d
.
dtype_output
,
d
.
algo
,
d
.
op_type
.
value
)
d
.
dtype_weight
,
d
.
dtype_output
,
d
.
op_type
.
value
)
def
device_synchronize
(
self
):
return
GemmMainUnitTest
.
device_synchronize
()
...
...
@@ -762,16 +678,17 @@ class SimpleConv:
else
:
use_f32_as_accum
=
fp32_accum
# use_f32_as_accum = False
for
algo
in
avail_algos
:
static_key
=
(
layout_i
.
layout_type
.
value
,
layout_w
.
layout_type
.
value
,
layout_o
.
layout_type
.
value
,
layout_i
.
interleave
,
layout_w
.
interleave
,
layout_o
.
interleave
,
inp
.
dtype
,
weight
.
dtype
,
out
.
dtype
,
algo
,
op_type
.
value
)
weight
.
dtype
,
out
.
dtype
,
op_type
.
value
)
desps
=
self
.
static_key_to_desps
.
get
(
static_key
,
None
)
if
desps
is
None
or
len
(
desps
)
==
0
:
continue
return
finally_algos
for
desp
in
desps
:
if
arch
<
desp
.
min_arch
:
continue
# skip volta tensor op since it is very slow in architectures except volta.
if
arch
>=
(
7
,
5
)
and
desp
.
algo
==
GemmAlgo
.
Volta
.
value
:
continue
...
...
@@ -1058,5 +975,5 @@ CONV_CPP = ConvTunerSimple([
for
p
in
ALL_IMPGEMM_PARAMS
])
if
__name__
==
"__main__"
:
print
(
len
(
ALL_CONV_ALGO_DESPS
))
print
(
ALL_CONV_ALGO_DESPS
[
0
]
)
for
desp
in
ALL_CONV_ALGO_DESPS
:
print
(
desp
,
desp
.
min_arch
)
spconv/csrc/sparse/convops.py
View file @
238d6a83
...
...
@@ -4,9 +4,7 @@ from cumm.common import GemmBasicHost, NlohmannJson, TensorView
from
cumm.constants
import
CUMM_CPU_ONLY_BUILD
from
cumm.conv.main
import
ConvMainUnitTest
from
cumm.gemm.algospec.core
import
(
_GEMM_MIN_ARCH_TO_ALGO
,
GemmAlgo
,
ShuffleStrideType
,
get_available_algo_str_from_arch
,
get_min_arch_of_algo_str
)
ShuffleStrideType
)
from
cumm.gemm.main
import
GemmMainUnitTest
from
spconv.constants
import
NDIM_DONT_CARE
,
SPCONV_BWD_SPLITK
,
AllocKeys
from
spconv.core
import
AlgoHint
,
ConvAlgo
...
...
@@ -472,7 +470,7 @@ class GemmTunerSimple(pccm.ParameterizedClass):
self
.
add_typedef
(
"static_key_t"
,
"std::tuple<bool, bool, bool, int, "
"int, int, int
, std::string
>"
)
"int, int, int>"
)
self
.
add_typedef
(
"algo_cache_key_t"
,
"std::tuple<int, "
"int, int, int, int>"
)
...
...
@@ -501,7 +499,7 @@ class GemmTunerSimple(pccm.ParameterizedClass):
for (auto& d : desps){{
static_key_t static_key = std::make_tuple(d.trans_a(), d.trans_b(), d.trans_c(), d.dtype_a, d.dtype_b,
d.dtype_c, int(d.shuffle_type)
, d.algo
);
d.dtype_c, int(d.shuffle_type));
auto& vec = static_key_to_desps_[static_key];
vec.push_back(d);
}}
...
...
@@ -548,17 +546,19 @@ class GemmTunerSimple(pccm.ParameterizedClass):
std::swap(a, b);
trans_c = false;
}}
auto avail_algos = get_available_algo_str_from_arch(arch);
//
auto avail_algos = get_available_algo_str_from_arch(arch);
std::vector<tv::gemm::GemmAlgoDesp> finally_algos;
auto is_arch_compiled = CompileInfo::arch_is_compiled_gemm(arch);
for (auto algo : avail_algos){{
static_key_t static_key = std::make_tuple(trans_a, trans_b, trans_c, int(a.dtype()),
int(b.dtype()), int(c.dtype()), shuffle_type
, algo
);
int(b.dtype()), int(c.dtype()), shuffle_type);
if (static_key_to_desps_.find(static_key) == static_key_to_desps_.end()){{
continue
;
return finally_algos
;
}}
auto& desps = static_key_to_desps_.at(static_key);
for (auto& desp : desps){{
if (arch < desp.min_arch){{
continue;
}}
if (arch >= std::make_tuple(7, 5) && desp.algo ==
{
pccm
.
literal
(
GemmAlgo
.
Volta
.
value
)
}
){{
continue;
}}
...
...
@@ -575,7 +575,6 @@ class GemmTunerSimple(pccm.ParameterizedClass):
}}
}}
}}
}}
return finally_algos;
"""
)
return
code
.
ret
(
"std::vector<tv::gemm::GemmAlgoDesp>"
,
...
...
@@ -895,7 +894,7 @@ class ConvTunerSimple(pccm.ParameterizedClass):
self
.
add_typedef
(
"static_key_t"
,
(
"std::tuple<int, int, int, int, int, "
"int, int, int, int,
std::string,
int>"
))
"int, int, int, int, int>"
))
self
.
add_typedef
(
"algo_cache_key_t"
,
"std::tuple<int, int, int, int, "
"int, int, int, int>"
)
...
...
@@ -927,7 +926,7 @@ class ConvTunerSimple(pccm.ParameterizedClass):
static_key_t static_key = std::make_tuple(
int(d.layout_i), int(d.layout_w), int(d.layout_o),
d.interleave_i, d.interleave_w, d.interleave_o, d.dtype_input(),
d.dtype_weight(), d.dtype_output(),
d.algo,
int(d.op_type));
d.dtype_weight(), d.dtype_output(), int(d.op_type));
auto& vec = static_key_to_desps_[static_key];
vec.push_back(d);
}}
...
...
@@ -974,7 +973,6 @@ class ConvTunerSimple(pccm.ParameterizedClass):
code
.
raw
(
f
"""
tv::gemm::ConvOpType op_type_cpp = static_cast<tv::gemm::ConvOpType>(op_type);
auto avail_algos = get_available_algo_str_from_arch(arch);
bool is_fp16 = (inp.dtype() == tv::float16 &&
weight.dtype() == tv::float16 && out.dtype() == tv::float16);
bool use_f32_as_accum = false;
...
...
@@ -997,16 +995,18 @@ class ConvTunerSimple(pccm.ParameterizedClass):
std::vector<tv::gemm::ConvAlgoDesp> finally_algos;
auto is_arch_compiled = CompileInfo::arch_is_compiled_gemm(arch);
for (auto algo : avail_algos){{
static_key_t static_key = std::make_tuple(
layout_i, layout_w, layout_o,
interleave_i, interleave_w, interleave_o, inp.dtype(),
weight.dtype(), out.dtype(),
algo,
op_type);
weight.dtype(), out.dtype(), op_type);
if (static_key_to_desps_.find(static_key) == static_key_to_desps_.end()){{
continue
;
return finally_algos
;
}}
auto& desps = static_key_to_desps_.at(static_key);
for (auto& desp : desps){{
if (arch < desp.min_arch){{
continue;
}}
if (arch >= std::make_tuple(7, 5) && desp.algo ==
{
pccm
.
literal
(
GemmAlgo
.
Volta
.
value
)
}
){{
continue;
}}
...
...
@@ -1042,7 +1042,6 @@ class ConvTunerSimple(pccm.ParameterizedClass):
}}
}}
}}
}}
return finally_algos;
"""
)
return
code
.
ret
(
"std::vector<tv::gemm::ConvAlgoDesp>"
,
...
...
test/dev2.py
View file @
238d6a83
from
spconv.pytorch.cppcore
import
TorchAllocator
print
(
1
)
from
cumm
import
tensorview
as
tv
from
cumm.tensorview
import
tvio
import
numpy
as
np
from
pathlib
import
Path
from
spconv.core_cc.csrc.sparse.all
import
SpconvOps
import
torch
print
(
2
)
if
__name__
==
"__main__"
:
alloc
=
TorchAllocator
(
torch
.
device
(
"cuda:0"
))
SpconvOps
.
test_allocator
(
alloc
)
def
main
():
data
=
np
.
load
(
Path
(
__file__
).
parent
/
"data"
/
"benchmark-pc.npz"
)
with
open
(
Path
(
__file__
).
parent
/
"data"
/
"benchmark-pc.jarr"
,
"wb"
)
as
f
:
f
.
write
(
tvio
.
dumps_jsonarray
({
"pc"
:
data
}).
tobytes
())
if
__name__
==
"__main__"
:
main
()
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment