Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
one
spconv
Commits
60781119
Commit
60781119
authored
Jun 09, 2020
by
yanyan
Browse files
still working on cutlass
parent
23d9faaf
Changes
21
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
315 additions
and
127 deletions
+315
-127
CMakeLists.txt
CMakeLists.txt
+2
-0
codeai-devops.yaml
codeai-devops.yaml
+16
-9
include/spconv/indice.cu.h
include/spconv/indice.cu.h
+39
-52
include/spconv/reordering.h
include/spconv/reordering.h
+8
-1
include/spgemm/gemm.h
include/spgemm/gemm.h
+80
-0
include/spgemm/gemm_th.h
include/spgemm/gemm_th.h
+11
-0
include/sphash/hashmap.h
include/sphash/hashmap.h
+4
-12
include/tensorview/mp_helper.h
include/tensorview/mp_helper.h
+0
-1
include/tensorview/tensor.h
include/tensorview/tensor.h
+0
-1
spconv/__init__.py
spconv/__init__.py
+1
-1
spconv/ops.py
spconv/ops.py
+5
-5
spconv/pool.py
spconv/pool.py
+10
-2
src/spconv/CMakeLists.txt
src/spconv/CMakeLists.txt
+1
-1
src/spconv/indice.cu
src/spconv/indice.cu
+13
-20
src/spconv/reordering.cu
src/spconv/reordering.cu
+18
-6
src/spconv/spconv_ops.cc
src/spconv/spconv_ops.cc
+8
-5
src/spgemm/CMakeLists.txt
src/spgemm/CMakeLists.txt
+10
-0
src/spgemm/gemm.cu
src/spgemm/gemm.cu
+34
-0
src/spgemm/torchdev_cutlass.cu
src/spgemm/torchdev_cutlass.cu
+38
-0
test/benchmark.py
test/benchmark.py
+17
-11
No files found.
CMakeLists.txt
View file @
60781119
...
...
@@ -44,11 +44,13 @@ set(ALL_LIBS ${TORCH_LIBRARIES})
set
(
ALL_INCLUDE
${
PROJECT_SOURCE_DIR
}
/include
)
set
(
MP11_INCLUDE
${
PROJECT_SOURCE_DIR
}
/third_party/mp11/include
)
set
(
CUTLASS_INCLUDE
${
PROJECT_SOURCE_DIR
}
/third_party/cutlass/include
)
if
(
SPCONV_BuildCUDA
)
set
(
ALL_LIBS
${
ALL_LIBS
}
${
CUDA_CUDART
}
${
CUDA_CUBLAS
}
)
set
(
ALL_INCLUDE
${
ALL_INCLUDE
}
${
CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES
}
)
add_subdirectory
(
src/cuhash
)
add_subdirectory
(
src/spgemm
)
endif
()
add_subdirectory
(
src/spconv
)
add_subdirectory
(
src/utils
)
...
...
codeai-devops.yaml
View file @
60781119
...
...
@@ -65,7 +65,7 @@ observers:
std
:
c++14
options
:
[
-Wno-deprecated-declarations
,
"
-gencode=arch=compute_52,code=sm_
61
"
,
"
-gencode=arch=compute_52,code=sm_
52
"
,
"
-gencode=arch=compute_61,code=sm_61"
,
"
-gencode=arch=compute_60,code=sm_60"
,
"
-gencode=arch=compute_70,code=sm_70"
,
...
...
@@ -76,7 +76,7 @@ observers:
type
:
CPPDevObserver
main_pattern
:
torchdev_.*\.(cu|cpp|cc|cxx)
pattern
:
.*\.(cc|cpp|cxx|h|hpp|hxx|cu)
compiler
:
clang++
compiler
:
nvcc
executable
:
build/codeai_dev_torch
run_cmd
:
[
$(executable)
]
fail_cmds
:
# run cmd when pervious run fail with retcode
...
...
@@ -97,13 +97,20 @@ observers:
/home/yy/anaconda3/lib
,
]
libraries
:
[
-lpython3.7m
,
-lcublas
,
-lcudart
,
-ljpeg
,
-lpthread
,
"
-Wl,--no-as-needed,-lc10"
,
"
-Wl,--no-as-needed,-ltorch"
,
"
-Wl,--no-as-needed,-ltorch_cpu"
,
"
-Wl,--no-as-needed,-lc10_cuda"
,
"
-Wl,--no-as-needed,-ltorch_cuda"
]
std
:
c++2a
options
:
[
--cuda-gpu-arch=sm_61
,
-Wno-deprecated-declarations
,
-D_GLIBCXX_USE_CXX11_ABI=0
]
"
-Xcompiler=
\"
-Wl,--no-as-needed,-lc10
\"
"
,
"
-Xcompiler=
\"
-Wl,--no-as-needed,-ltorch
\"
"
,
"
-Xcompiler=
\"
-Wl,--no-as-needed,-ltorch_cpu
\"
"
,
"
-Xcompiler=
\"
-Wl,--no-as-needed,-lc10_cuda
\"
"
,
"
-Xcompiler=
\"
-Wl,--no-as-needed,-ltorch_cuda
\"
"
]
std
:
c++14
# options: [--cuda-gpu-arch=sm_61, -Wno-deprecated-declarations, -D_GLIBCXX_USE_CXX11_ABI=0]
options
:
[
-Wno-deprecated-declarations
,
--expt-relaxed-constexpr
,
"
-gencode=arch=compute_61,code=sm_61"
,
-D_GLIBCXX_USE_CXX11_ABI=0
,
]
include/spconv/indice.cu.h
View file @
60781119
...
...
@@ -20,8 +20,8 @@
#include <tensorview/tensorview.h>
namespace
spconv
{
template
<
typename
Index
,
unsigned
NDim
,
int
KernelMaxVolume
=
256
,
typename
Index1D
=
int
>
template
<
typename
Index
,
unsigned
NDim
,
bool
UseDeconv
,
int
KernelMaxVolume
=
256
,
typename
Index1D
=
int
>
__global__
void
prepareIndicePairsKernel
(
tv
::
TensorView
<
const
Index
>
indicesIn
,
tv
::
TensorView
<
Index
>
indicePairs
,
tv
::
TensorView
<
Index
>
indiceNum
,
tv
::
TensorView
<
Index1D
>
indicePairUnique
,
...
...
@@ -47,54 +47,19 @@ __global__ void prepareIndicePairsKernel(
auto
indicePairsDim2
=
indicePairs
.
dim
(
2
);
Index
index
;
for
(
int
ix
:
tv
::
KernelLoopX
<
int
>
(
numActIn
))
{
numValidPoints
=
getValidOutPos
<
Index
,
NDim
>
(
indicesIn
.
data
()
+
ix
*
(
NDim
+
1
)
+
1
,
kernelSize
.
data
(),
stride
.
data
(),
padding
.
data
(),
dilation
.
data
(),
outSpatialShape
.
data
(),
validPoints
);
for
(
Index
i
=
0
;
i
<
numValidPoints
;
++
i
)
{
pointPtr
=
validPoints
+
i
*
(
NDim
+
1
);
auto
offset
=
pointPtr
[
NDim
];
Index
oldNum
=
atomicAdd
(
indiceNum
.
data
()
+
offset
,
Index
(
1
));
indicePairs
(
0
,
offset
,
oldNum
)
=
ix
;
index
=
tv
::
ArrayIndexRowMajor
<
NDim
,
NDim
>::
runPtrs
(
pointPtr
,
outSpatialShape
.
data
(),
0
)
+
spatialVolume
*
indicesIn
(
ix
,
0
);
indicePairs
(
1
,
offset
,
oldNum
)
=
index
;
indicePairUnique
[
offset
*
indicePairsDim2
+
oldNum
]
=
index
;
if
(
UseDeconv
)
{
// nvcc will optimize this fake "if constexpr"
// after cuda 11 released, we will start to use real if constexpr.
numValidPoints
=
getValidOutPosTranspose
<
Index
,
NDim
>
(
indicesIn
.
data
()
+
ix
*
(
NDim
+
1
)
+
1
,
kernelSize
.
data
(),
stride
.
data
(),
padding
.
data
(),
dilation
.
data
(),
outSpatialShape
.
data
(),
validPoints
);
}
else
{
numValidPoints
=
getValidOutPos
<
Index
,
NDim
>
(
indicesIn
.
data
()
+
ix
*
(
NDim
+
1
)
+
1
,
kernelSize
.
data
(),
stride
.
data
(),
padding
.
data
(),
dilation
.
data
(),
outSpatialShape
.
data
(),
validPoints
);
}
}
}
template
<
typename
Index
,
unsigned
NDim
,
int
KernelMaxVolume
=
256
>
__global__
void
prepareDeConvIndicePairsKernel
(
tv
::
TensorView
<
const
Index
>
indicesIn
,
tv
::
TensorView
<
Index
>
indicePairs
,
tv
::
TensorView
<
Index
>
indiceNum
,
tv
::
TensorView
<
Index
>
indicePairUnique
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
kernelSize
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
stride
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
padding
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
dilation
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
outSpatialShape
)
{
auto
numActIn
=
indicesIn
.
dim
(
0
);
Index
spatialVolume
=
1
;
#pragma unroll
for
(
int
i
=
0
;
i
<
NDim
;
++
i
)
{
spatialVolume
*=
outSpatialShape
[
i
];
}
Index
kernelVolume
=
1
;
#pragma unroll
for
(
int
i
=
0
;
i
<
NDim
;
++
i
)
{
kernelVolume
*=
kernelSize
[
i
];
}
Index
numValidPoints
=
0
;
Index
validPoints
[
KernelMaxVolume
*
(
NDim
+
1
)];
Index
*
pointPtr
=
nullptr
;
auto
indicePairsDim2
=
indicePairs
.
dim
(
2
);
Index
index
;
for
(
int
ix
:
tv
::
KernelLoopX
<
int
>
(
numActIn
))
{
numValidPoints
=
getValidOutPosTranspose
<
Index
,
NDim
>
(
indicesIn
.
data
()
+
ix
*
(
NDim
+
1
)
+
1
,
kernelSize
.
data
(),
stride
.
data
(),
padding
.
data
(),
dilation
.
data
(),
outSpatialShape
.
data
(),
validPoints
);
for
(
Index
i
=
0
;
i
<
numValidPoints
;
++
i
)
{
pointPtr
=
validPoints
+
i
*
(
NDim
+
1
);
auto
offset
=
pointPtr
[
NDim
];
...
...
@@ -191,6 +156,29 @@ assignIndicePairsKernel(tv::TensorView<Index> indicesOut,
}
}
template
<
typename
Index
,
typename
IndexGrid
,
unsigned
NDim
>
__global__
void
assignIndicePairsLimitedKernel
(
tv
::
TensorView
<
IndexGrid
>
gridsOut
,
int
numActIn
,
tv
::
TensorView
<
Index
>
indicePairs
,
tv
::
TensorView
<
Index
>
indiceNum
)
{
Index
index
,
val
;
int
kernelVolume
=
indicePairs
.
dim
(
0
);
for
(
int
ix
:
tv
::
KernelLoopX
<
int
>
(
numActIn
))
{
for
(
int
i
=
0
;
i
<
kernelVolume
;
++
i
)
{
index
=
indicePairs
(
i
,
1
,
ix
);
if
(
index
!=
-
1
)
{
val
=
gridsOut
[
index
];
if
(
val
!=
-
1
)
{
auto
oldNum
=
atomicAdd
(
indiceNum
.
data
()
+
i
,
Index
(
1
));
indicePairs
(
i
,
0
,
oldNum
)
=
indicePairs
(
i
,
0
,
ix
);
indicePairs
(
i
,
1
,
oldNum
)
=
val
;
}
}
}
}
}
template
<
typename
Index
,
typename
IndexGrid
,
unsigned
NDim
>
__global__
void
prepareSubMGridKernel
(
tv
::
TensorView
<
const
Index
>
indicesIn
,
tv
::
TensorView
<
IndexGrid
>
gridsOut
,
...
...
@@ -542,7 +530,6 @@ __global__ void getSubMIndicePairsHashUnrollKernel2(
}
}
template
<
typename
Index
,
typename
IndexGrid
,
unsigned
NDim
>
__global__
void
resetGridKernel
(
const
Index
*
indicePairUnique
,
tv
::
TensorView
<
IndexGrid
>
gridsOut
,
...
...
@@ -567,8 +554,8 @@ resetGridSubMKernel(const Index *indices, tv::TensorView<IndexGrid> gridsOut,
Index
index
;
for
(
int
ix
:
tv
::
KernelLoopX
<
int
>
(
numAct
))
{
indsPtr
=
indices
+
ix
*
(
NDim
+
1
);
index
=
tv
::
ArrayIndexRowMajor
<
NDim
,
NDim
>::
runPtrs
(
indsPtr
+
1
,
outSpatialShape
.
data
(),
0
);
index
=
tv
::
ArrayIndexRowMajor
<
NDim
,
NDim
>::
runPtrs
(
indsPtr
+
1
,
outSpatialShape
.
data
(),
0
);
gridsOut
[
index
+
spatialVolume
*
indsPtr
[
0
]]
=
-
1
;
}
}
...
...
include/spconv/reordering.h
View file @
60781119
...
...
@@ -14,9 +14,9 @@
#ifndef SPARSE_REORDERING_FUNCTOR_H_
#define SPARSE_REORDERING_FUNCTOR_H_
#include <cuda_runtime_api.h>
#include <tensorview/tensorview.h>
#include <torch/script.h>
namespace
spconv
{
void
batch_sparse_gather_cuda
(
torch
::
Tensor
buffer
,
torch
::
Tensor
features
,
...
...
@@ -35,6 +35,13 @@ void sparse_gather_cpu(torch::Tensor buffer, torch::Tensor features,
void
sparse_scatter_add_cpu
(
torch
::
Tensor
buffer
,
torch
::
Tensor
outFeatures
,
torch
::
Tensor
indices
,
int
size
);
void
sparse_gather_cuda
(
cudaStream_t
s
,
torch
::
Tensor
buffer
,
torch
::
Tensor
features
,
torch
::
Tensor
indices
,
int
size
);
void
sparse_scatter_add_cuda
(
cudaStream_t
s
,
torch
::
Tensor
buffer
,
torch
::
Tensor
outFeatures
,
torch
::
Tensor
indices
,
int
size
);
}
// namespace spconv
#endif
\ No newline at end of file
include/spgemm/gemm.h
0 → 100644
View file @
60781119
#include <cutlass/gemm/device/gemm.h>
#include <type_traits>
namespace
spconv
{
template
<
typename
T
>
using
determine_acc_t
=
std
::
conditional_t
<
std
::
is_same
<
T
,
cutlass
::
half_t
>::
value
,
float
,
T
>
;
template
<
typename
T
,
bool
TransA
,
bool
TransB
,
bool
TransC
>
cudaError_t
cutlassGemm
(
cudaStream_t
s
,
int
M
,
int
N
,
int
K
,
T
alpha
,
T
const
*
A
,
int
lda
,
T
const
*
B
,
int
ldb
,
T
beta
,
T
*
C
,
int
ldc
)
{
// Define type definition for single-precision CUTLASS GEMM with column-major
// input matrices and 128x128x8 threadblock tile size (chosen by default).
//
// To keep the interface manageable, several helpers are defined for plausible
// compositions including the following example for single-precision GEMM.
// Typical values are used as default template arguments. See
// `cutlass/gemm/device/default_gemm_configuration.h` for more details.
//
// To view the full gemm device API interface, see
// `cutlass/gemm/device/gemm.h`
using
TAcc
=
determine_acc_t
<
T
>
;
using
ColumnMajor
=
cutlass
::
layout
::
ColumnMajor
;
using
RowMajor
=
cutlass
::
layout
::
RowMajor
;
using
LayoutA
=
std
::
conditional_t
<
TransA
,
ColumnMajor
,
RowMajor
>
;
using
LayoutB
=
std
::
conditional_t
<
TransB
,
ColumnMajor
,
RowMajor
>
;
using
LayoutC
=
std
::
conditional_t
<
TransC
,
ColumnMajor
,
RowMajor
>
;
using
CutlassGemm
=
cutlass
::
gemm
::
device
::
Gemm
<
T
,
// Data-type of A matrix
LayoutA
,
// Layout of A matrix
T
,
// Data-type of B matrix
LayoutB
,
// Layout of B matrix
T
,
// Data-type of C matrix
LayoutC
,
TAcc
>
;
// Layout of C matrix
// Define a CUTLASS GEMM type
CutlassGemm
gemm_operator
;
// Construct the CUTLASS GEMM arguments object.
//
// One of CUTLASS's design patterns is to define gemm argument objects that
// are constructible in host code and passed to kernels by value. These may
// include pointers, strides, scalars, and other arguments needed by Gemm and
// its components.
//
// The benefits of this pattern are (1.) a structured, composable strategy for
// passing host-constructible arguments to kernels and (2.) minimized
// initialization overhead on kernel entry.
//
typename
CutlassGemm
::
Arguments
args
(
{
M
,
N
,
K
},
// Gemm Problem dimensions
{
A
,
lda
},
// Tensor-ref for source matrix A
{
B
,
ldb
},
// Tensor-ref for source matrix B
{
C
,
ldc
},
// Tensor-ref for source matrix C
{
C
,
ldc
},
// Tensor-ref for destination matrix D (may be different memory
// than source C matrix)
{
alpha
,
beta
});
// Scalars used in the Epilogue
//
// Launch the CUTLASS GEMM kernel.
//
cutlass
::
Status
status
=
gemm_operator
(
args
,
nullptr
,
s
);
//
// Return a cudaError_t if the CUTLASS GEMM operator returned an error code.
//
if
(
status
!=
cutlass
::
Status
::
kSuccess
)
{
return
cudaErrorUnknown
;
}
// Return success, if no errors were encountered.
return
cudaSuccess
;
}
}
// namespace spconv
include/spgemm/gemm_th.h
0 → 100644
View file @
60781119
#include <cuda_runtime_api.h>
#include <tensorview/torch_utils.h>
#include <torch/script.h>
namespace
spconv
{
void
cutlass_mm_out
(
torch
::
Tensor
c
,
torch
::
Tensor
a
,
torch
::
Tensor
b
);
void
cutlass_mm_out
(
cudaStream_t
stream
,
torch
::
Tensor
c
,
torch
::
Tensor
a
,
torch
::
Tensor
b
);
}
// namespace spconv
\ No newline at end of file
include/sphash/hashmap.h
View file @
60781119
#include <tensorview/tensor.h>
namespace
spconv
{
enum
HashTypes
{
kDenseMap
=
0
,
kCUDPPHash
=
1
};
template
<
int
Impl
>
struct
HashMap
;
enum
HashTypes
{
kDenseMap
=
0
,
kCUDPPHash
=
1
};
template
<
>
struct
HashMap
<
kDenseMap
>
{
template
<
int
Impl
>
struct
HashMap
;
};
template
<
>
struct
HashMap
<
kDenseMap
>
{
};
}
\ No newline at end of file
}
// namespace spconv
\ No newline at end of file
include/tensorview/mp_helper.h
View file @
60781119
...
...
@@ -12,7 +12,6 @@ using mp_list_c = mp_list<std::integral_constant<T, I>...>;
template
<
int
...
I
>
using
mp_list_int_c
=
mp_list
<
std
::
integral_constant
<
int
,
I
>
...
>
;
namespace
detail
{
template
<
class
...
Ts
,
class
F
>
...
...
include/tensorview/tensor.h
View file @
60781119
...
...
@@ -524,7 +524,6 @@ struct DispatchIntNoexcept<T<Args...>> {
}
};
constexpr
size_t
kTensorMaxDim
=
10
;
using
TensorShape
=
ShapeBase
<
kTensorMaxDim
,
int64_t
>
;
...
...
spconv/__init__.py
View file @
60781119
...
...
@@ -69,7 +69,7 @@ class SparseConvTensor(object):
self
.
batch_size
=
batch_size
self
.
indice_dict
=
{}
if
grid
is
None
:
grid
=
torch
.
Tensor
()
# empty tensor
grid
=
torch
.
Tensor
()
# empty tensor
self
.
grid
=
grid
@
classmethod
...
...
spconv/ops.py
View file @
60781119
...
...
@@ -89,11 +89,11 @@ def get_indice_pairs(indices,
out_shape
=
spatial_shape
if
grid
is
None
:
grid
=
torch
.
Tensor
()
res
=
torch
.
ops
.
spconv
.
get_indice_pairs
(
indices
,
grid
,
batch_size
,
out_shape
,
spatial_shape
,
ksize
,
stride
,
padding
,
dilation
,
out_padding
,
int
(
subm
),
int
(
transpose
),
int
(
use_hash
))
res
=
torch
.
ops
.
spconv
.
get_indice_pairs
(
indices
,
grid
,
batch_size
,
out_shape
,
spatial_shape
,
ksize
,
stride
,
padding
,
dilation
,
out_padding
,
int
(
subm
),
int
(
transpose
),
int
(
use_hash
))
return
res
...
...
spconv/pool.py
View file @
60781119
...
...
@@ -67,8 +67,16 @@ class SparseMaxPool(SparseModule):
else
:
out_spatial_shape
=
spatial_shape
outids
,
indice_pairs
,
indice_pairs_num
=
ops
.
get_indice_pairs
(
indices
,
batch_size
,
spatial_shape
,
self
.
kernel_size
,
self
.
stride
,
self
.
padding
,
self
.
dilation
,
0
,
self
.
subm
,
grid
=
input
.
grid
)
indices
,
batch_size
,
spatial_shape
,
self
.
kernel_size
,
self
.
stride
,
self
.
padding
,
self
.
dilation
,
0
,
self
.
subm
,
grid
=
input
.
grid
)
out_features
=
Fsp
.
indice_maxpool
(
features
,
indice_pairs
.
to
(
device
),
indice_pairs_num
.
to
(
device
),
...
...
src/spconv/CMakeLists.txt
View file @
60781119
...
...
@@ -15,7 +15,7 @@ set_property(TARGET spconv PROPERTY CUDA_STANDARD 14)
set_property
(
TARGET spconv PROPERTY CXX_STANDARD 14
)
set_target_properties
(
spconv PROPERTIES CUDA_SEPARABLE_COMPILATION ON
)
if
(
SPCONV_BuildCUDA
)
target_link_libraries
(
spconv PRIVATE
${
ALL_LIBS
}
cuhash
)
target_link_libraries
(
spconv PRIVATE
${
ALL_LIBS
}
cuhash
spgemm
)
else
()
target_link_libraries
(
spconv PRIVATE
${
ALL_LIBS
}
)
endif
()
...
...
src/spconv/indice.cu
View file @
60781119
...
...
@@ -61,17 +61,9 @@ int create_conv_indice_pair_p1_cuda(
tv
::
DispatchInt
<
max_kernel_vol_t
>
()(
kernelVolume
,
std
::
less_equal
<
int
>
(),
[
&
](
auto
I2
)
{
constexpr
int
MaxKernelVolume
=
decltype
(
I2
)
::
value
;
if
(
transpose
)
{
prepareDeConvIndicePairsKernel
<
Index
,
NDim
,
MaxKernelVolume
>
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicesIn
),
tv
::
torch2tv
<
Index
>
(
indicePairs
),
tv
::
torch2tv
<
Index
>
(
indiceNum
),
tv
::
torch2tv
<
Index
>
(
indicePairUnique
),
ks
,
st
,
pa
,
di
,
ou
);
TV_CHECK_CUDA_ERR_V2
(
"prepareDeConvIndicePairsKernel failed"
);
}
else
{
prepareIndicePairsKernel
<
Index
,
NDim
,
MaxKernelVolume
>
tv
::
dispatch_int
<
0
,
1
>
(
int
(
transpose
),
[
&
](
auto
I
)
{
constexpr
bool
UseDeconv
=
decltype
(
I
)
::
value
;
prepareIndicePairsKernel
<
Index
,
NDim
,
UseDeconv
,
MaxKernelVolume
>
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicesIn
),
tv
::
torch2tv
<
Index
>
(
indicePairs
),
...
...
@@ -79,8 +71,7 @@ int create_conv_indice_pair_p1_cuda(
tv
::
torch2tv
<
Index
>
(
indicePairUnique
),
ks
,
st
,
pa
,
di
,
ou
);
TV_CHECK_CUDA_ERR_V2
(
"prepareIndicePairsKernel failed"
);
}
// tv::ssprint("prepareIndicePairsKernel", timer.report() / 1000.0);
});
#ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes
attr
;
checkCudaErrors
(
cudaFuncGetAttributes
(
...
...
@@ -230,15 +221,17 @@ int create_submconv_indice_pair_cuda(
namespace
mp11
=
boost
::
mp11
;
using
kernel2_candidates_t
=
mp11
::
mp_product
<
tv
::
mp_list
,
tv
::
mp_list_int
_c
<
1
,
3
,
5
>
,
tv
::
mp_list_int
_c
<
1
,
3
,
5
>>
;
mp11
::
mp_product
<
tv
::
mp_list
,
tv
::
mp_list_
c
<
int
,
1
,
3
,
5
>
,
tv
::
mp_list_
c
<
int
,
1
,
3
,
5
>>
;
using
kernel3_candidates_t
=
mp11
::
mp_product
<
tv
::
mp_list
,
tv
::
mp_list_int_c
<
1
,
3
,
5
>
,
tv
::
mp_list_int_c
<
1
,
3
,
5
>
,
tv
::
mp_list_int_c
<
1
,
3
,
5
>>
;
using
kernel3_candidates_final_t
=
mp11
::
mp_push_back
<
kernel3_candidates_t
>
;
mp11
::
mp_product
<
tv
::
mp_list
,
tv
::
mp_list_c
<
int
,
1
,
3
,
5
>
,
tv
::
mp_list_c
<
int
,
1
,
3
,
5
>
,
tv
::
mp_list_c
<
int
,
1
,
3
,
5
>>
;
using
kernel3_candidates_final_t
=
mp11
::
mp_push_back
<
kernel3_candidates_t
>
;
auto
dispatcher2
=
tv
::
DispatchContainerNoexcept
<
kernel2_candidates_t
>
();
auto
dispatcher3
=
tv
::
DispatchContainerNoexcept
<
kernel3_candidates_final_t
>
();
auto
dispatcher3
=
tv
::
DispatchContainerNoexcept
<
kernel3_candidates_final_t
>
();
if
(
useHash
)
{
auto
table
=
cuhash
::
HashTable
();
...
...
src/spconv/reordering.cu
View file @
60781119
...
...
@@ -37,12 +37,12 @@ using half_vec_sadd_t =
std
::
conditional_t
<
std
::
is_same
<
T
,
at
::
Half
>::
value
,
int4
,
int4
>
;
using
kernel_block_t
=
tv
::
mp_list_c
<
int
,
64
,
32
,
16
,
8
>
;
void
sparse_gather_cuda
(
torch
::
Tensor
buffer
,
torch
::
Tensor
features
,
torch
::
Tensor
indices
,
int
size
)
{
void
sparse_gather_cuda
(
cudaStream_t
stream
,
torch
::
Tensor
buffer
,
torch
::
Tensor
features
,
torch
::
Tensor
indices
,
int
size
)
{
if
(
size
<=
0
)
return
;
int
numPlanes
=
features
.
size
(
1
);
auto
stream
=
at
::
cuda
::
getCurrentCUDAStream
();
auto
dtype
=
features
.
scalar_type
();
auto
inds_dtype
=
indices
.
scalar_type
();
// auto timer = spconv::CudaContextTimer<>();
...
...
@@ -126,12 +126,18 @@ void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
});
}
void
sparse_scatter_add_cuda
(
torch
::
Tensor
buffer
,
torch
::
Tensor
outFeatures
,
torch
::
Tensor
indices
,
int
size
)
{
void
sparse_gather_cuda
(
torch
::
Tensor
buffer
,
torch
::
Tensor
features
,
torch
::
Tensor
indices
,
int
size
)
{
auto
stream
=
at
::
cuda
::
getCurrentCUDAStream
();
return
sparse_gather_cuda
(
stream
,
buffer
,
features
,
indices
,
size
);
}
void
sparse_scatter_add_cuda
(
cudaStream_t
stream
,
torch
::
Tensor
buffer
,
torch
::
Tensor
outFeatures
,
torch
::
Tensor
indices
,
int
size
)
{
if
(
size
<=
0
)
return
;
int
numPlanes
=
outFeatures
.
size
(
1
);
auto
stream
=
at
::
cuda
::
getCurrentCUDAStream
();
auto
dtype
=
outFeatures
.
scalar_type
();
auto
inds_dtype
=
indices
.
scalar_type
();
...
...
@@ -216,6 +222,12 @@ void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,
});
}
void
sparse_scatter_add_cuda
(
torch
::
Tensor
buffer
,
torch
::
Tensor
outFeatures
,
torch
::
Tensor
indices
,
int
size
)
{
auto
stream
=
at
::
cuda
::
getCurrentCUDAStream
();
return
sparse_scatter_add_cuda
(
stream
,
buffer
,
outFeatures
,
indices
,
size
);
}
void
batch_sparse_gather_cuda
(
torch
::
Tensor
buffer
,
torch
::
Tensor
features
,
torch
::
Tensor
indices
,
int
size
)
{
// indices: [volume, inds_stride]
...
...
src/spconv/spconv_ops.cc
View file @
60781119
#include <spconv/spconv_ops.h>
#include <spgemm/gemm_th.h>
namespace
spconv
{
std
::
vector
<
torch
::
Tensor
>
...
...
@@ -48,9 +50,9 @@ getIndicePairs(torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
gridSize
=
batchSize
;
}
bool
resetGrid
=
gridOut
.
numel
()
!=
0
;
if
(
!
resetGrid
){
gridOut
=
torch
::
full
(
{
gridSize
},
-
1
,
torch
::
dtype
(
torch
::
kInt32
).
device
(
indices
.
device
()));
if
(
!
resetGrid
)
{
gridOut
=
torch
::
full
(
{
gridSize
},
-
1
,
torch
::
dtype
(
torch
::
kInt32
).
device
(
indices
.
device
()));
}
gridOut
=
gridOut
.
view
({
batchSize
,
-
1
});
int64_t
numActOut
=
-
1
;
...
...
@@ -104,7 +106,7 @@ getIndicePairs(torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
padding
,
dilation
,
outSpatialShape
,
transpose
,
resetGrid
,
useHash
);
}
#ifdef TV_CUDA
else
if
(
indices
.
device
().
type
()
==
torch
::
kCUDA
)
{
else
if
(
indices
.
device
().
type
()
==
torch
::
kCUDA
)
{
numActOut
=
create_conv_indice_pair_p1_cuda
(
indices
,
indicePairs
,
indiceNum
,
indicePairUnique
,
kernelSize
,
stride
,
padding
,
dilation
,
outSpatialShape
,
transpose
);
...
...
@@ -191,7 +193,8 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
double
totalGatherTime
=
0
;
double
totalGEMMTime
=
0
;
double
totalSAddTime
=
0
;
// tv::ssprint("first subm gemm time", timer.report() / 1000.0, std::vector<int>(indicePairNumCpu.data_ptr<int>(),
// tv::ssprint("first subm gemm time", timer.report() / 1000.0,
// std::vector<int>(indicePairNumCpu.data_ptr<int>(),
// indicePairNumCpu.data_ptr<int>() + kernelVolume));
for
(
int
i
=
0
;
i
<
kernelVolume
;
++
i
)
{
...
...
src/spgemm/CMakeLists.txt
View file @
60781119
set
(
ALL_FILES
${
ALL_FILES
}
gemm.cu
)
add_library
(
spgemm SHARED
${
ALL_FILES
}
)
target_include_directories
(
spgemm PRIVATE
${
ALL_INCLUDE
}
${
MP11_INCLUDE
}
${
CUTLASS_INCLUDE
}
)
set_property
(
TARGET spgemm PROPERTY CUDA_STANDARD 14
)
set_property
(
TARGET spgemm PROPERTY CXX_STANDARD 14
)
set_target_properties
(
spgemm PROPERTIES CUDA_SEPARABLE_COMPILATION ON
)
target_link_libraries
(
spgemm PRIVATE
${
ALL_LIBS
}
)
install
(
TARGETS spgemm DESTINATION lib
)
src/spgemm/gemm.cu
0 → 100644
View file @
60781119
#include <spgemm/gemm.h>
#include <spgemm/gemm_th.h>
namespace
spconv
{
template
<
typename
T
>
using
determine_half_t
=
std
::
conditional_t
<
std
::
is_same
<
T
,
at
::
Half
>::
value
,
cutlass
::
half_t
,
T
>
;
void
cutlass_mm_out
(
cudaStream_t
stream
,
torch
::
Tensor
c
,
torch
::
Tensor
a
,
torch
::
Tensor
b
)
{
TV_ASSERT_RT_ERR
(
c
.
dtype
()
==
a
.
dtype
()
&&
c
.
dtype
()
==
b
.
dtype
(),
"dtype must be same"
);
TV_ASSERT_RT_ERR
(
c
.
is_contiguous
()
&&
b
.
is_contiguous
()
&&
a
.
is_contiguous
(),
"error"
);
auto
M
=
a
.
size
(
0
);
auto
K
=
a
.
size
(
1
);
auto
N
=
b
.
size
(
1
);
TV_ASSERT_RT_ERR
(
b
.
size
(
0
)
==
K
&&
c
.
size
(
0
)
==
M
&&
c
.
size
(
1
)
==
N
,
"error"
);
tv
::
dispatch_torch
<
float
,
at
::
Half
>
(
c
.
scalar_type
(),
[
&
](
auto
I
)
{
using
T
=
decltype
(
I
);
using
HalfT
=
determine_half_t
<
T
>
;
auto
status
=
cutlassGemm
<
HalfT
,
false
,
false
,
false
>
(
stream
,
M
,
N
,
K
,
HalfT
(
1.0
),
reinterpret_cast
<
HalfT
*>
(
a
.
data_ptr
<
T
>
()),
a
.
size
(
1
),
reinterpret_cast
<
HalfT
*>
(
b
.
data_ptr
<
T
>
()),
b
.
size
(
1
),
HalfT
(
0.0
),
reinterpret_cast
<
HalfT
*>
(
c
.
data_ptr
<
T
>
()),
c
.
size
(
1
));
TV_ASSERT_RT_ERR
(
status
==
cudaSuccess
,
"error"
);
});
}
void
cutlass_mm_out
(
torch
::
Tensor
c
,
torch
::
Tensor
a
,
torch
::
Tensor
b
)
{
auto
stream
=
at
::
cuda
::
getCurrentCUDAStream
();
return
cutlass_mm_out
(
stream
,
c
,
a
,
b
);
}
}
// namespace spconv
\ No newline at end of file
src/spgemm/torchdev_cutlass.cu
View file @
60781119
#define TV_CUDA
#include <cutlass/gemm/device/gemm.h>
#include <spgemm/gemm.h>
#include <tensorview/cuda_utils.h>
#include <tensorview/kernel_utils.h>
#include <tensorview/tensor.h>
#include <tensorview/torch_utils.h>
#include <torch/script.h>
#include <utility/timer.h>
int
main
()
{
auto
M
=
100000
;
auto
N
=
128
;
auto
K
=
128
;
auto
a
=
torch
::
rand
({
M
,
K
},
torch
::
dtype
(
torch
::
kFloat32
).
device
(
torch
::
kCUDA
));
auto
b
=
torch
::
rand
({
K
,
N
},
a
.
options
());
auto
c
=
torch
::
zeros
({
a
.
size
(
0
),
b
.
size
(
1
)},
a
.
options
());
auto
c2
=
torch
::
zeros
({
a
.
size
(
0
),
b
.
size
(
1
)},
a
.
options
());
torch
::
mm_out
(
c
,
a
,
b
);
auto
status
=
spconv
::
cutlassGemm
<
float
,
false
,
false
,
false
>
(
0
,
M
,
N
,
K
,
1.0
,
a
.
data_ptr
<
float
>
(),
a
.
size
(
1
),
b
.
data_ptr
<
float
>
(),
b
.
size
(
1
),
0.0
,
c2
.
data_ptr
<
float
>
(),
c2
.
size
(
1
));
auto
err
=
torch
::
norm
(
c2
-
c
);
tv
::
ssprint
(
status
,
"linalg norm"
,
err
);
tv
::
ssprint
((
c
.
view
({
-
1
})
==
0
).
sum
());
auto
timer
=
spconv
::
CudaContextTimer
<>
();
for
(
int
i
=
0
;
i
<
10
;
++
i
)
{
torch
::
mm_out
(
c
,
a
,
b
);
tv
::
ssprint
(
"mm"
,
timer
.
report
()
/
1000.0
);
spconv
::
cutlassGemm
<
float
,
false
,
false
,
false
>
(
0
,
M
,
N
,
K
,
1.0
,
a
.
data_ptr
<
float
>
(),
a
.
size
(
1
),
b
.
data_ptr
<
float
>
(),
b
.
size
(
1
),
0.0
,
c2
.
data_ptr
<
float
>
(),
c2
.
size
(
1
));
tv
::
ssprint
(
"cutlass_mm"
,
timer
.
report
()
/
1000.0
);
}
return
0
;
}
\ No newline at end of file
test/benchmark.py
View file @
60781119
import
t
orch
import
spconv
import
numpy
as
np
from
spconv.utils
import
VoxelGeneratorV2
from
pathlib
import
Path
import
t
ime
from
pathlib
import
Path
import
numpy
as
np
import
torch
from
torch
import
nn
import
time
import
spconv
from
spconv.utils
import
VoxelGeneratorV2
def
waymo_data
(
batch_size
=
1
):
gen
=
VoxelGeneratorV2
([
0.1
,
0.1
,
0.1
],
[
-
80
,
-
80
,
-
2
,
80
,
80
,
6
],
1
,
150000
)
gen
=
VoxelGeneratorV2
([
0.1
,
0.1
,
0.1
],
[
-
80
,
-
80
,
-
2
,
80
,
80
,
6
],
1
,
150000
)
data
=
np
.
load
(
Path
(
__file__
).
parent
/
"data"
/
"benchmark-pc.npz"
)
pc
=
data
[
"pc"
]
data
=
gen
.
generate
(
pc
)
...
...
@@ -17,9 +21,9 @@ def waymo_data(batch_size=1):
coors
=
np
.
concatenate
([
np
.
full
([
N
,
1
],
0
,
coors
.
dtype
),
coors
],
axis
=
1
)
return
voxels
,
coors
,
gen
.
grid_size
class
Net
(
nn
.
Module
):
def
__init__
(
self
,
shape
):
def
__init__
(
self
,
shape
):
super
().
__init__
()
self
.
net
=
spconv
.
SparseSequential
(
spconv
.
SubMConv3d
(
3
,
64
,
3
,
bias
=
False
,
indice_key
=
"c0"
),
...
...
@@ -57,7 +61,8 @@ class Net(nn.Module):
)
max_batch_size
=
1
# grid (dense map) is used for indice generation. use pre-allocated grid can run faster.
self
.
grid
=
torch
.
full
([
max_batch_size
,
*
shape
],
-
1
,
dtype
=
torch
.
int32
).
cuda
()
self
.
grid
=
torch
.
full
([
max_batch_size
,
*
shape
],
-
1
,
dtype
=
torch
.
int32
).
cuda
()
# self.grid = None
self
.
shape
=
shape
...
...
@@ -87,5 +92,6 @@ def main():
# print("spconv time", time.time() - t)
print
(
"spconv time"
,
np
.
mean
(
times
[
10
:]))
if
__name__
==
"__main__"
:
main
()
\ No newline at end of file
main
()
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment