Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
one
spconv
Commits
23d9faaf
Commit
23d9faaf
authored
Jun 08, 2020
by
yanyan
Browse files
working on cutlass
parent
853302aa
Changes
13
Hide whitespace changes
Inline
Side-by-side
Showing
13 changed files
with
192 additions
and
63 deletions
+192
-63
CMakeLists.txt
CMakeLists.txt
+1
-0
codeai-devops.yaml
codeai-devops.yaml
+109
-0
cutlass
cutlass
+0
-1
include/tensorview/mp_helper.h
include/tensorview/mp_helper.h
+4
-0
include/tensorview/tensor.h
include/tensorview/tensor.h
+2
-1
setup.py
setup.py
+1
-1
src/spconv/CMakeLists.txt
src/spconv/CMakeLists.txt
+1
-1
src/spconv/indice.cu
src/spconv/indice.cu
+65
-50
src/spconv/reordering.cu
src/spconv/reordering.cu
+5
-5
src/spgemm/CMakeLists.txt
src/spgemm/CMakeLists.txt
+0
-0
src/spgemm/torchdev_cutlass.cu
src/spgemm/torchdev_cutlass.cu
+0
-0
test/benchmark.py
test/benchmark.py
+1
-1
test/test_conv.py
test/test_conv.py
+3
-3
No files found.
CMakeLists.txt
View file @
23d9faaf
...
@@ -43,6 +43,7 @@ add_subdirectory(third_party/pybind11)
...
@@ -43,6 +43,7 @@ add_subdirectory(third_party/pybind11)
set
(
ALL_LIBS
${
TORCH_LIBRARIES
}
)
set
(
ALL_LIBS
${
TORCH_LIBRARIES
}
)
set
(
ALL_INCLUDE
${
PROJECT_SOURCE_DIR
}
/include
)
set
(
ALL_INCLUDE
${
PROJECT_SOURCE_DIR
}
/include
)
set
(
MP11_INCLUDE
${
PROJECT_SOURCE_DIR
}
/third_party/mp11/include
)
if
(
SPCONV_BuildCUDA
)
if
(
SPCONV_BuildCUDA
)
set
(
ALL_LIBS
${
ALL_LIBS
}
${
CUDA_CUDART
}
${
CUDA_CUBLAS
}
)
set
(
ALL_LIBS
${
ALL_LIBS
}
${
CUDA_CUDART
}
${
CUDA_CUBLAS
}
)
...
...
codeai-devops.yaml
0 → 100644
View file @
23d9faaf
global
:
console_url
:
localhost:50091
envs
:
PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION
:
python
# the C++ libprotobuf and the Python implementation will conflict
analyzers
:
# only one analyzer is allowed for one type for now.
PythonAnalyzer
:
SimpleCPPAnalyzer
:
# $<astex> devops.devs = ["_ci_dev_xxx"] </astex> is allowed in raw sources.
includes
:
[
"
*.cpp"
,
"
*.cu"
,
"
*.cc"
,
"
*.h"
,
"
*.hpp"
,
"
*.hxx"
,
"
*.cxx"
]
observers
:
# run test functions when that function changes or a marked function changes.
test
:
type
:
TestObserver
# run dev functions when that function changes or a marked function changes.
dev
:
type
:
DevObserver
pattern
:
_ci_dev_.*
clangdev
:
type
:
CPPDevObserver
main_pattern
:
dev_.*\.(cc|cpp|cxx)
pattern
:
.*\.(cc|cpp|cxx|h|hpp|hxx)
compiler
:
clang++
executable
:
build/codeai_dev
includes
:
[
include
,
/usr/local/cuda/include
,
/home/yy/anaconda3/include
,
/home/yy/anaconda3/include/python3.7m
,
third_party/pybind11/include
,
third_party/include
,
/home/yy/library/boost_1_72_0
,
]
libpaths
:
[
/home/yy/anaconda3/lib
,
]
libraries
:
[
-lnvinfer
,
-lpython3.7m
,
-lcublas
,
-lcudart
,
-ljpeg
]
std
:
c++2a
options
:
[
-Wall
,
-Wextra
]
cudadev
:
type
:
CPPDevObserver
main_pattern
:
dev_.*\.cu
pattern
:
.*\.(cc|cpp|cxx|h|hpp|hxx|cu)
compiler
:
nvcc
executable
:
build/codeai_dev_cuda
run_cmd
:
[
$(executable)
]
sources
:
[]
includes
:
[
include
,
/usr/local/cuda/include
,
/home/yy/anaconda3/include
,
/home/yy/anaconda3/include/python3.7m
,
third_party/pybind11/include
,
third_party/cutlass/include
,
]
libpaths
:
[
/usr/local/cuda/lib64
,
/home/yy/anaconda3/lib
,
]
libraries
:
[
-lpython3.7m
,
-lcudart
,
-lcublas
,
-ljpeg
]
std
:
c++14
options
:
[
-Wno-deprecated-declarations
,
"
-gencode=arch=compute_52,code=sm_61"
,
"
-gencode=arch=compute_61,code=sm_61"
,
"
-gencode=arch=compute_60,code=sm_60"
,
"
-gencode=arch=compute_70,code=sm_70"
,
"
-gencode=arch=compute_75,code=sm_75"
,
]
torchdev
:
type
:
CPPDevObserver
main_pattern
:
torchdev_.*\.(cu|cpp|cc|cxx)
pattern
:
.*\.(cc|cpp|cxx|h|hpp|hxx|cu)
compiler
:
clang++
executable
:
build/codeai_dev_torch
run_cmd
:
[
$(executable)
]
fail_cmds
:
# run cmd when the previous run fails with this retcode
-6
:
[
gdb
,
-ex
,
run
,
-ex
,
bt
,
-ex
,
quit
,
$(executable)
]
# retcode -6 means killed by signal 6 (SIGABRT) on unix; a segfault (SIGSEGV) would be -11
includes
:
[
include
,
/home/yy/anaconda3/lib/python3.7/site-packages/torch/include
,
/home/yy/anaconda3/lib/python3.7/site-packages/torch/include/torch/csrc/api/include
,
/usr/local/cuda/include
,
/home/yy/anaconda3/include
,
/home/yy/anaconda3/include/python3.7m
,
third_party/pybind11/include
,
third_party/cutlass/include
,
]
libpaths
:
[
/home/yy/anaconda3/lib/python3.7/site-packages/torch/lib
,
/usr/local/cuda/lib64
,
/home/yy/anaconda3/lib
,
]
libraries
:
[
-lpython3.7m
,
-lcublas
,
-lcudart
,
-ljpeg
,
-lpthread
,
"
-Wl,--no-as-needed,-lc10"
,
"
-Wl,--no-as-needed,-ltorch"
,
"
-Wl,--no-as-needed,-ltorch_cpu"
,
"
-Wl,--no-as-needed,-lc10_cuda"
,
"
-Wl,--no-as-needed,-ltorch_cuda"
]
std
:
c++2a
options
:
[
--cuda-gpu-arch=sm_61
,
-Wno-deprecated-declarations
,
-D_GLIBCXX_USE_CXX11_ABI=0
]
cutlass
@
e33d90b3
Subproject commit e33d90b36109f67915a80c532ebbb978b72c7bd2
include/tensorview/mp_helper.h
View file @
23d9faaf
...
@@ -9,6 +9,10 @@ template <class... T> struct mp_list {};
...
@@ -9,6 +9,10 @@ template <class... T> struct mp_list {};
template
<
class
T
,
T
...
I
>
template
<
class
T
,
T
...
I
>
using
mp_list_c
=
mp_list
<
std
::
integral_constant
<
T
,
I
>
...
>
;
using
mp_list_c
=
mp_list
<
std
::
integral_constant
<
T
,
I
>
...
>
;
template
<
int
...
I
>
using
mp_list_int_c
=
mp_list
<
std
::
integral_constant
<
int
,
I
>
...
>
;
namespace
detail
{
namespace
detail
{
template
<
class
...
Ts
,
class
F
>
template
<
class
...
Ts
,
class
F
>
...
...
include/tensorview/tensor.h
View file @
23d9faaf
...
@@ -418,7 +418,8 @@ bool dispatch_container_noexcept(Iterator begin, Iterator end, F &&f) {
...
@@ -418,7 +418,8 @@ bool dispatch_container_noexcept(Iterator begin, Iterator end, F &&f) {
return
;
return
;
}
}
if
(
count
>=
val_lst_size
)
{
if
(
count
>=
val_lst_size
)
{
TV_THROW_INVALID_ARG
(
"iterator length invalid:"
,
val_lst_size
);
equal
=
false
;
return
;
}
}
constexpr
auto
c
=
decltype
(
E
)
::
value
;
constexpr
auto
c
=
decltype
(
E
)
::
value
;
if
(
c
!=
*
iter
)
{
if
(
c
!=
*
iter
)
{
...
...
setup.py
View file @
23d9faaf
...
@@ -54,7 +54,7 @@ class CMakeBuild(build_ext):
...
@@ -54,7 +54,7 @@ class CMakeBuild(build_ext):
'-DCMAKE_PREFIX_PATH={}'
.
format
(
LIBTORCH_ROOT
),
'-DCMAKE_PREFIX_PATH={}'
.
format
(
LIBTORCH_ROOT
),
'-DPYBIND11_PYTHON_VERSION={}'
.
format
(
PYTHON_VERSION
),
'-DPYBIND11_PYTHON_VERSION={}'
.
format
(
PYTHON_VERSION
),
'-DSPCONV_BuildTests=OFF'
,
'-DSPCONV_BuildTests=OFF'
,
'-DPYTORCH_VERSION={}'
.
format
(
PYTORCH_VERSION_NUMBER
)
'-DPYTORCH_VERSION={}'
.
format
(
PYTORCH_VERSION_NUMBER
)
,
]
# -arch=sm_61
]
# -arch=sm_61
if
not
torch
.
cuda
.
is_available
()
and
SPCONV_FORCE_BUILD_CUDA
is
None
:
if
not
torch
.
cuda
.
is_available
()
and
SPCONV_FORCE_BUILD_CUDA
is
None
:
cmake_args
+=
[
'-DSPCONV_BuildCUDA=OFF'
]
cmake_args
+=
[
'-DSPCONV_BuildCUDA=OFF'
]
...
...
src/spconv/CMakeLists.txt
View file @
23d9faaf
...
@@ -10,7 +10,7 @@ if(OpenMP_CXX_FOUND)
...
@@ -10,7 +10,7 @@ if(OpenMP_CXX_FOUND)
endif
()
endif
()
target_include_directories
(
spconv PRIVATE
${
ALL_INCLUDE
}
)
target_include_directories
(
spconv PRIVATE
${
ALL_INCLUDE
}
${
MP11_INCLUDE
}
)
set_property
(
TARGET spconv PROPERTY CUDA_STANDARD 14
)
set_property
(
TARGET spconv PROPERTY CUDA_STANDARD 14
)
set_property
(
TARGET spconv PROPERTY CXX_STANDARD 14
)
set_property
(
TARGET spconv PROPERTY CXX_STANDARD 14
)
set_target_properties
(
spconv PROPERTIES CUDA_SEPARABLE_COMPILATION ON
)
set_target_properties
(
spconv PROPERTIES CUDA_SEPARABLE_COMPILATION ON
)
...
...
src/spconv/indice.cu
View file @
23d9faaf
...
@@ -13,6 +13,7 @@
...
@@ -13,6 +13,7 @@
// limitations under the License.
// limitations under the License.
#include <ATen/ATen.h>
#include <ATen/ATen.h>
#include <boost/mp11.hpp>
#include <chrono>
#include <chrono>
#include <cuhash/hash_table.h>
#include <cuhash/hash_table.h>
#include <limits>
#include <limits>
...
@@ -79,7 +80,7 @@ int create_conv_indice_pair_p1_cuda(
...
@@ -79,7 +80,7 @@ int create_conv_indice_pair_p1_cuda(
pa
,
di
,
ou
);
pa
,
di
,
ou
);
TV_CHECK_CUDA_ERR_V2
(
"prepareIndicePairsKernel failed"
);
TV_CHECK_CUDA_ERR_V2
(
"prepareIndicePairsKernel failed"
);
}
}
// tv::ssprint("prepareIndicePairsKernel", timer.report() / 1000.0);
// tv::ssprint("prepareIndicePairsKernel", timer.report() / 1000.0);
#ifdef TV_LOG_KERNEL_INFO
#ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes
attr
;
cudaFuncAttributes
attr
;
checkCudaErrors
(
cudaFuncGetAttributes
(
checkCudaErrors
(
cudaFuncGetAttributes
(
...
@@ -226,6 +227,18 @@ int create_submconv_indice_pair_cuda(
...
@@ -226,6 +227,18 @@ int create_submconv_indice_pair_cuda(
spatialVolume
*=
outSpatialShape
[
i
];
spatialVolume
*=
outSpatialShape
[
i
];
}
}
auto
dispatcher
=
tv
::
DispatchIntNoexcept
<
tv
::
mp_list_c
<
int
,
1
,
3
,
5
>>
();
auto
dispatcher
=
tv
::
DispatchIntNoexcept
<
tv
::
mp_list_c
<
int
,
1
,
3
,
5
>>
();
namespace
mp11
=
boost
::
mp11
;
using
kernel2_candidates_t
=
mp11
::
mp_product
<
tv
::
mp_list
,
tv
::
mp_list_int_c
<
1
,
3
,
5
>
,
tv
::
mp_list_int_c
<
1
,
3
,
5
>>
;
using
kernel3_candidates_t
=
mp11
::
mp_product
<
tv
::
mp_list
,
tv
::
mp_list_int_c
<
1
,
3
,
5
>
,
tv
::
mp_list_int_c
<
1
,
3
,
5
>
,
tv
::
mp_list_int_c
<
1
,
3
,
5
>>
;
using
kernel3_candidates_final_t
=
mp11
::
mp_push_back
<
kernel3_candidates_t
>
;
auto
dispatcher2
=
tv
::
DispatchContainerNoexcept
<
kernel2_candidates_t
>
();
auto
dispatcher3
=
tv
::
DispatchContainerNoexcept
<
kernel3_candidates_final_t
>
();
if
(
useHash
)
{
if
(
useHash
)
{
auto
table
=
cuhash
::
HashTable
();
auto
table
=
cuhash
::
HashTable
();
...
@@ -263,43 +276,33 @@ int create_submconv_indice_pair_cuda(
...
@@ -263,43 +276,33 @@ int create_submconv_indice_pair_cuda(
if
(
NDim
==
2
)
{
if
(
NDim
==
2
)
{
tv
::
SimpleVector
<
Index
,
2
>
ou_
(
outSpatialShape
.
begin
(),
tv
::
SimpleVector
<
Index
,
2
>
ou_
(
outSpatialShape
.
begin
(),
outSpatialShape
.
end
());
outSpatialShape
.
end
());
dispatcher2
(
kernelSize
.
begin
(),
kernelSize
.
end
(),
[
&
](
auto
K
)
{
dispatcher
(
kernelSize
[
0
],
[
&
](
auto
K0C
)
{
constexpr
int
K0
=
mp11
::
mp_at_c
<
decltype
(
K
),
0
>::
value
;
dispatcher
(
kernelSize
[
1
],
[
&
](
auto
K1C
)
{
constexpr
int
K1
=
mp11
::
mp_at_c
<
decltype
(
K
),
1
>::
value
;
constexpr
int
K0
=
decltype
(
K0C
)
::
value
;
found
=
true
;
constexpr
int
K1
=
decltype
(
K1C
)
::
value
;
getSubMIndicePairsHashUnrollKernel2
<
Index
,
K0
,
K1
>
found
=
true
;
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
getSubMIndicePairsHashUnrollKernel2
<
Index
,
K0
,
K1
>
0
,
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicesIn
),
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
torch2tv
<
Index
>
(
indicePairs
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indiceNum
),
ou_
,
tv
::
torch2tv
<
Index
>
(
indicesIn
),
spatialVolume
,
tableSize
,
tableData
,
tv
::
torch2tv
<
Index
>
(
indicePairs
),
constants
,
stash_constants
,
stash_count
);
tv
::
torch2tv
<
Index
>
(
indiceNum
),
ou_
,
spatialVolume
,
tableSize
,
tableData
,
constants
,
stash_constants
,
stash_count
);
});
});
});
}
else
if
(
NDim
==
3
)
{
}
else
if
(
NDim
==
3
)
{
tv
::
SimpleVector
<
Index
,
3
>
ou_
(
outSpatialShape
.
begin
(),
tv
::
SimpleVector
<
Index
,
3
>
ou_
(
outSpatialShape
.
begin
(),
outSpatialShape
.
end
());
outSpatialShape
.
end
());
dispatcher3
(
kernelSize
.
begin
(),
kernelSize
.
end
(),
[
&
](
auto
K
)
{
dispatcher
(
kernelSize
[
0
],
[
&
](
auto
K0C
)
{
constexpr
int
K0
=
mp11
::
mp_at_c
<
decltype
(
K
),
0
>::
value
;
dispatcher
(
kernelSize
[
1
],
[
&
](
auto
K1C
)
{
constexpr
int
K1
=
mp11
::
mp_at_c
<
decltype
(
K
),
1
>::
value
;
dispatcher
(
kernelSize
[
2
],
[
&
](
auto
K2C
)
{
constexpr
int
K2
=
mp11
::
mp_at_c
<
decltype
(
K
),
2
>::
value
;
constexpr
int
K0
=
decltype
(
K0C
)
::
value
;
found
=
true
;
constexpr
int
K1
=
decltype
(
K1C
)
::
value
;
getSubMIndicePairsHashUnrollKernel3
<
Index
,
K0
,
K1
,
K2
>
constexpr
int
K2
=
decltype
(
K2C
)
::
value
;
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
found
=
true
;
0
,
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicesIn
),
getSubMIndicePairsHashUnrollKernel3
<
Index
,
K0
,
K1
,
K2
>
tv
::
torch2tv
<
Index
>
(
indicePairs
),
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
torch2tv
<
Index
>
(
indiceNum
),
ou_
,
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
spatialVolume
,
tableSize
,
tableData
,
tv
::
torch2tv
<
Index
>
(
indicesIn
),
constants
,
stash_constants
,
stash_count
);
tv
::
torch2tv
<
Index
>
(
indicePairs
),
tv
::
torch2tv
<
Index
>
(
indiceNum
),
ou_
,
spatialVolume
,
tableSize
,
tableData
,
constants
,
stash_constants
,
stash_count
);
});
});
});
});
}
}
}
}
...
@@ -338,24 +341,35 @@ int create_submconv_indice_pair_cuda(
...
@@ -338,24 +341,35 @@ int create_submconv_indice_pair_cuda(
if
(
NDim
==
2
)
{
if
(
NDim
==
2
)
{
tv
::
SimpleVector
<
Index
,
2
>
ou_
(
outSpatialShape
.
begin
(),
tv
::
SimpleVector
<
Index
,
2
>
ou_
(
outSpatialShape
.
begin
(),
outSpatialShape
.
end
());
outSpatialShape
.
end
());
dispatcher2
(
kernelSize
.
begin
(),
kernelSize
.
end
(),
[
&
](
auto
K
)
{
dispatcher
(
kernelSize
[
0
],
[
&
](
auto
K0C
)
{
constexpr
int
K0
=
mp11
::
mp_at_c
<
decltype
(
K
),
0
>::
value
;
dispatcher
(
kernelSize
[
1
],
[
&
](
auto
K1C
)
{
constexpr
int
K1
=
mp11
::
mp_at_c
<
decltype
(
K
),
1
>::
value
;
constexpr
int
K0
=
decltype
(
K0C
)
::
value
;
found
=
true
;
constexpr
int
K1
=
decltype
(
K1C
)
::
value
;
getSubMIndicePairsUnrollKernel2
<
Index
,
IndexGrid
,
K0
,
K1
>
found
=
true
;
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
getSubMIndicePairsUnrollKernel2
<
Index
,
IndexGrid
,
K0
,
K1
>
0
,
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicesIn
),
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
torch2tv
<
IndexGrid
>
(
gridsOut
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicePairs
),
tv
::
torch2tv
<
Index
>
(
indicesIn
),
tv
::
torch2tv
<
Index
>
(
indiceNum
),
ou_
,
tv
::
torch2tv
<
IndexGrid
>
(
gridsOut
),
spatialVolume
);
tv
::
torch2tv
<
Index
>
(
indicePairs
),
tv
::
torch2tv
<
Index
>
(
indiceNum
),
ou_
,
spatialVolume
);
});
});
});
}
else
if
(
NDim
==
3
)
{
}
else
if
(
NDim
==
3
)
{
tv
::
SimpleVector
<
Index
,
3
>
ou_
(
outSpatialShape
.
begin
(),
tv
::
SimpleVector
<
Index
,
3
>
ou_
(
outSpatialShape
.
begin
(),
outSpatialShape
.
end
());
outSpatialShape
.
end
());
dispatcher3
(
kernelSize
.
begin
(),
kernelSize
.
end
(),
[
&
](
auto
K
)
{
constexpr
int
K0
=
mp11
::
mp_at_c
<
decltype
(
K
),
0
>::
value
;
constexpr
int
K1
=
mp11
::
mp_at_c
<
decltype
(
K
),
1
>::
value
;
constexpr
int
K2
=
mp11
::
mp_at_c
<
decltype
(
K
),
2
>::
value
;
found
=
true
;
getSubMIndicePairsUnrollKernel3
<
Index
,
IndexGrid
,
K0
,
K1
,
K2
>
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicesIn
),
tv
::
torch2tv
<
IndexGrid
>
(
gridsOut
),
tv
::
torch2tv
<
Index
>
(
indicePairs
),
tv
::
torch2tv
<
Index
>
(
indiceNum
),
ou_
,
spatialVolume
);
});
/*
dispatcher(kernelSize[0], [&](auto K0C) {
dispatcher(kernelSize[0], [&](auto K0C) {
dispatcher(kernelSize[1], [&](auto K1C) {
dispatcher(kernelSize[1], [&](auto K1C) {
dispatcher(kernelSize[2], [&](auto K2C) {
dispatcher(kernelSize[2], [&](auto K2C) {
...
@@ -372,7 +386,7 @@ int create_submconv_indice_pair_cuda(
...
@@ -372,7 +386,7 @@ int create_submconv_indice_pair_cuda(
tv::torch2tv<Index>(indiceNum), ou_, spatialVolume);
tv::torch2tv<Index>(indiceNum), ou_, spatialVolume);
});
});
});
});
});
});
*/
}
}
}
}
if
(
!
found
)
{
if
(
!
found
)
{
...
@@ -396,7 +410,8 @@ int create_submconv_indice_pair_cuda(
...
@@ -396,7 +410,8 @@ int create_submconv_indice_pair_cuda(
resetGridSubMKernel
<
Index
,
IndexGrid
,
NDim
>
resetGridSubMKernel
<
Index
,
IndexGrid
,
NDim
>
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
indicesIn
.
data_ptr
<
Index
>
(),
stream
>>>
(
indicesIn
.
data_ptr
<
Index
>
(),
tv
::
torch2tv
<
IndexGrid
>
(
gridsOut
),
ou
,
numActIn
,
spatialVolume
);
tv
::
torch2tv
<
IndexGrid
>
(
gridsOut
),
ou
,
numActIn
,
spatialVolume
);
TV_CHECK_CUDA_ERR_V2
(
"resetGridKernel failed"
);
TV_CHECK_CUDA_ERR_V2
(
"resetGridKernel failed"
);
}
}
});
});
...
...
src/spconv/reordering.cu
View file @
23d9faaf
...
@@ -35,7 +35,7 @@ using half_vec_t =
...
@@ -35,7 +35,7 @@ using half_vec_t =
template
<
typename
T
>
template
<
typename
T
>
using
half_vec_sadd_t
=
using
half_vec_sadd_t
=
std
::
conditional_t
<
std
::
is_same
<
T
,
at
::
Half
>::
value
,
int4
,
int4
>
;
std
::
conditional_t
<
std
::
is_same
<
T
,
at
::
Half
>::
value
,
int4
,
int4
>
;
using
kernel_block_t
=
tv
::
mp_list_c
<
int
,
64
,
32
,
16
>
;
using
kernel_block_t
=
tv
::
mp_list_c
<
int
,
64
,
32
,
16
,
8
>
;
void
sparse_gather_cuda
(
torch
::
Tensor
buffer
,
torch
::
Tensor
features
,
void
sparse_gather_cuda
(
torch
::
Tensor
buffer
,
torch
::
Tensor
features
,
torch
::
Tensor
indices
,
int
size
)
{
torch
::
Tensor
indices
,
int
size
)
{
...
@@ -45,6 +45,7 @@ void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
...
@@ -45,6 +45,7 @@ void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
auto
stream
=
at
::
cuda
::
getCurrentCUDAStream
();
auto
stream
=
at
::
cuda
::
getCurrentCUDAStream
();
auto
dtype
=
features
.
scalar_type
();
auto
dtype
=
features
.
scalar_type
();
auto
inds_dtype
=
indices
.
scalar_type
();
auto
inds_dtype
=
indices
.
scalar_type
();
// auto timer = spconv::CudaContextTimer<>();
tv
::
DispatchTorch
<
float_types_t
>
()(
dtype
,
[
&
](
auto
TValue
)
{
tv
::
DispatchTorch
<
float_types_t
>
()(
dtype
,
[
&
](
auto
TValue
)
{
using
T
=
decltype
(
TValue
);
using
T
=
decltype
(
TValue
);
using
vecload_type_t
=
half_vec_t
<
T
>
;
using
vecload_type_t
=
half_vec_t
<
T
>
;
...
@@ -53,8 +54,7 @@ void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
...
@@ -53,8 +54,7 @@ void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
bool
notFound
=
true
;
bool
notFound
=
true
;
constexpr
int
vecloadFactor
=
sizeof
(
vecload_type_t
)
/
sizeof
(
T
);
constexpr
int
vecloadFactor
=
sizeof
(
vecload_type_t
)
/
sizeof
(
T
);
tv
::
mp_for_each
<
kernel_block_t
>
([
=
,
&
buffer
,
&
features
,
&
indices
,
tv
::
mp_for_each
<
kernel_block_t
>
([
&
](
auto
NumTLP
)
{
&
notFound
](
auto
NumTLP
)
{
constexpr
int
NumILP
=
NumTLP
/
4
;
constexpr
int
NumILP
=
NumTLP
/
4
;
// constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
// constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
int
nHotBlock
=
(
size
/
NumTLP
)
*
NumTLP
;
int
nHotBlock
=
(
size
/
NumTLP
)
*
NumTLP
;
...
@@ -87,6 +87,7 @@ void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
...
@@ -87,6 +87,7 @@ void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
features
.
data_ptr
<
T
>
(),
features
.
data_ptr
<
T
>
(),
indices
.
data_ptr
<
Index
>
()
+
nHotBlock
,
indices
.
data_ptr
<
Index
>
()
+
nHotBlock
,
size
-
nHotBlock
,
numPlanes
/
vecloadFactor
);
size
-
nHotBlock
,
numPlanes
/
vecloadFactor
);
#ifdef TV_LOG_KERNEL_INFO
#ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes
attr
;
cudaFuncAttributes
attr
;
checkCudaErrors
(
cudaFuncGetAttributes
(
checkCudaErrors
(
cudaFuncGetAttributes
(
...
@@ -143,8 +144,7 @@ void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,
...
@@ -143,8 +144,7 @@ void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,
constexpr
int
vecloadFactor
=
constexpr
int
vecloadFactor
=
sizeof
(
vecload_type_t
)
/
sizeof
(
T
);
// important for half.
sizeof
(
vecload_type_t
)
/
sizeof
(
T
);
// important for half.
tv
::
mp_for_each
<
kernel_block_t
>
([
=
,
&
outFeatures
,
&
buffer
,
&
indices
,
tv
::
mp_for_each
<
kernel_block_t
>
([
&
](
auto
NumTLP
)
{
&
notFound
](
auto
NumTLP
)
{
// constexpr int NumILP = NumTLP / (64 / (NumTLP /
// constexpr int NumILP = NumTLP / (64 / (NumTLP /
// vecloadFactor));
// vecloadFactor));
constexpr
int
NumILP
=
NumTLP
/
4
;
constexpr
int
NumILP
=
NumTLP
/
4
;
...
...
src/spgemm/CMakeLists.txt
0 → 100644
View file @
23d9faaf
src/spgemm/torchdev_cutlass.cu
0 → 100644
View file @
23d9faaf
test/benchmark.py
View file @
23d9faaf
...
@@ -70,7 +70,7 @@ class Net(nn.Module):
...
@@ -70,7 +70,7 @@ class Net(nn.Module):
def
main
():
def
main
():
voxels
,
coors
,
spatial_shape
=
waymo_data
()
voxels
,
coors
,
spatial_shape
=
waymo_data
()
voxels_th
=
torch
.
from_numpy
(
voxels
).
cuda
().
float
()
voxels_th
=
torch
.
from_numpy
(
voxels
).
cuda
().
float
()
coors_th
=
torch
.
from_numpy
(
coors
).
cuda
()
coors_th
=
torch
.
from_numpy
(
coors
).
cuda
()
.
int
()
net
=
Net
(
spatial_shape
[::
-
1
]).
cuda
().
eval
().
float
()
net
=
Net
(
spatial_shape
[::
-
1
]).
cuda
().
eval
().
float
()
print
(
coors_th
.
shape
)
print
(
coors_th
.
shape
)
out
=
net
(
voxels_th
,
coors_th
,
1
)
out
=
net
(
voxels_th
,
coors_th
,
1
)
...
...
test/test_conv.py
View file @
23d9faaf
...
@@ -349,7 +349,7 @@ def scatter_nd(indices, updates, shape):
...
@@ -349,7 +349,7 @@ def scatter_nd(indices, updates, shape):
class
TestSpConv
(
TestCase
):
class
TestSpConv
(
TestCase
):
def
testSpConv3d
(
self
):
def
testSpConv3d
(
self
):
np
.
random
.
seed
(
484
)
np
.
random
.
seed
(
484
)
devices
=
[
"c
p
u:0"
]
devices
=
[
"cu
da
:0"
]
shapes
=
[[
19
,
18
,
17
]]
shapes
=
[[
19
,
18
,
17
]]
batchsizes
=
[
1
,
2
]
batchsizes
=
[
1
,
2
]
...
@@ -752,8 +752,8 @@ def main_subm(algo, dtype=torch.float32):
...
@@ -752,8 +752,8 @@ def main_subm(algo, dtype=torch.float32):
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
main
(
algo
=
spconv
.
ConvAlgo
.
Native
,
dtype
=
torch
.
float32
)
main
_subm
(
algo
=
spconv
.
ConvAlgo
.
Native
,
dtype
=
torch
.
float32
)
main
(
algo
=
spconv
.
ConvAlgo
.
Native
,
dtype
=
torch
.
half
)
#
main(algo=spconv.ConvAlgo.Native, dtype=torch.half)
# TestCase().assertAllClose(out_my, out_ref)
# TestCase().assertAllClose(out_my, out_ref)
# unittest.main()
# unittest.main()
# TestSpConv().testSpConv3d()
# TestSpConv().testSpConv3d()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment