Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
dgl
Commits
5cff2f1c
Unverified
Commit
5cff2f1c
authored
Aug 27, 2020
by
Zihao Ye
Committed by
GitHub
Aug 27, 2020
Browse files
[Feature] Use new cusparse API to support CUDA 11. (#1979)
* upd * upd * upd * upd * upd * upd * upd * upd
parent
2a107320
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
88 additions
and
4 deletions
+88
-4
cmake/modules/CUDA.cmake
cmake/modules/CUDA.cmake
+3
-3
src/array/cuda/csr_transpose.cc
src/array/cuda/csr_transpose.cc
+2
-1
src/array/cuda/spmm.cu
src/array/cuda/spmm.cu
+41
-0
src/kernel/cuda/binary_reduce_sum.cu
src/kernel/cuda/binary_reduce_sum.cu
+42
-0
No files found.
cmake/modules/CUDA.cmake
View file @
5cff2f1c
...
...
@@ -10,7 +10,7 @@ endif()
 include(CheckCXXCompilerFlag)
 check_cxx_compiler_flag("-std=c++11" SUPPORT_CXX11)
-set(dgl_known_gpu_archs "30 35 50 60 70")
+set(dgl_known_gpu_archs "35 50 60 70")
################################################################################################
# A function for automatic detection of GPUs installed (if autodetection is enabled)
...
...
@@ -43,7 +43,7 @@ set(CUDA_gpu_detect_output "")
 #find vcvarsall.bat and run it building msvc environment
 get_filename_component(MY_COMPILER_DIR ${CMAKE_CXX_COMPILER} DIRECTORY)
 find_file(MY_VCVARSALL_BAT vcvarsall.bat "${MY_COMPILER_DIR}/.." "${MY_COMPILER_DIR}/../..")
-execute_process(COMMAND ${MY_VCVARSALL_BAT} && ${CUDA_NVCC_EXECUTABLE} -arch sm_30 --run ${__cufile}
+execute_process(COMMAND ${MY_VCVARSALL_BAT} && ${CUDA_NVCC_EXECUTABLE} -arch sm_35 --run ${__cufile}
                 WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
                 RESULT_VARIABLE __nvcc_res OUTPUT_VARIABLE __nvcc_out
                 OUTPUT_STRIP_TRAILING_WHITESPACE)
...
...
@@ -51,7 +51,7 @@ set(CUDA_gpu_detect_output "")
 if(CUDA_LIBRARY_PATH)
   set(CUDA_LINK_LIBRARY_PATH "-L${CUDA_LIBRARY_PATH}")
 endif()
-execute_process(COMMAND ${CUDA_NVCC_EXECUTABLE} -arch sm_30 --run ${__cufile} ${CUDA_LINK_LIBRARY_PATH}
+execute_process(COMMAND ${CUDA_NVCC_EXECUTABLE} -arch sm_35 --run ${__cufile} ${CUDA_LINK_LIBRARY_PATH}
                 WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
                 RESULT_VARIABLE __nvcc_res OUTPUT_VARIABLE __nvcc_out
                 OUTPUT_STRIP_TRAILING_WHITESPACE)
...
...
src/array/cuda/csr_transpose.cc
View file @
5cff2f1c
...
...
@@ -47,7 +47,7 @@ CSRMatrix CSRTranspose<kDLGPU, int32_t>(CSRMatrix csr) {
   int32_t* t_indices_ptr = static_cast<int32_t*>(t_indices->data);
   void* t_data_ptr = t_data->data;
-#if __CUDA_API_VERSION >= 10010
+#if CUDART_VERSION >= 10010
   auto device = runtime::DeviceAPI::Get(csr.indptr->ctx);
   // workspace
   size_t workspace_size;
...
...
@@ -67,6 +67,7 @@ CSRMatrix CSRTranspose<kDLGPU, int32_t>(CSRMatrix csr) {
csr
.
num_rows
,
csr
.
num_cols
,
nnz
,
data_ptr
,
indptr_ptr
,
indices_ptr
,
t_data_ptr
,
t_indptr_ptr
,
t_indices_ptr
,
CUDA_R_32F
,
CUSPARSE_ACTION_NUMERIC
,
CUSPARSE_INDEX_BASE_ZERO
,
CUSPARSE_CSR2CSC_ALG1
,
// see cusparse doc for reference
...
...
src/array/cuda/spmm.cu
View file @
5cff2f1c
...
...
@@ -28,6 +28,7 @@ void _Fill(DType* ptr, size_t length, DType val) {
namespace
cusparse
{
#if CUDART_VERSION < 11000
template
<
typename
DType
>
cusparseStatus_t
Xcsrmm2
(
cusparseHandle_t
handle
,
cusparseOperation_t
transA
,
cusparseOperation_t
transB
,
int
m
,
int
n
,
int
k
,
int
nnz
,
...
...
@@ -59,6 +60,7 @@ cusparseStatus_t Xcsrmm2<double>(cusparseHandle_t handle, cusparseOperation_t tr
alpha
,
descrA
,
csrValA
,
csrRowPtrA
,
csrColIndA
,
B
,
ldb
,
beta
,
C
,
ldc
);
}
#endif
template
<
typename
DType
>
cublasStatus_t
Xgeam
(
cublasHandle_t
handle
,
cublasOperation_t
transa
,
...
...
@@ -127,6 +129,44 @@ void CusparseCsrmm2(
valptr
=
static_cast
<
DType
*>
(
device
->
AllocWorkspace
(
ctx
,
nnz
*
sizeof
(
DType
)));
_Fill
(
valptr
,
nnz
,
static_cast
<
DType
>
(
1.
));
}
#if CUDART_VERSION >= 11000
cusparseSpMatDescr_t
matA
;
cusparseDnMatDescr_t
matB
,
matC
;
constexpr
auto
cuda_dtype
=
std
::
is_same
<
DType
,
float
>::
value
?
CUDA_R_32F
:
CUDA_R_64F
;
CUSPARSE_CALL
(
cusparseCreateCsr
(
&
matA
,
m
,
k
,
nnz
,
static_cast
<
int32_t
*>
(
csr
.
indptr
->
data
),
static_cast
<
int32_t
*>
(
csr
.
indices
->
data
),
const_cast
<
DType
*>
(
valptr
?
valptr
:
A_data
),
CUSPARSE_INDEX_32I
,
CUSPARSE_INDEX_32I
,
CUSPARSE_INDEX_BASE_ZERO
,
cuda_dtype
));
CUSPARSE_CALL
(
cusparseCreateDnMat
(
&
matB
,
n
,
k
,
n
,
const_cast
<
DType
*>
(
B_data
),
cuda_dtype
,
CUSPARSE_ORDER_COL
));
CUSPARSE_CALL
(
cusparseCreateDnMat
(
&
matC
,
m
,
n
,
m
,
trans_out
,
cuda_dtype
,
CUSPARSE_ORDER_COL
));
auto
transA
=
CUSPARSE_OPERATION_NON_TRANSPOSE
;
auto
transB
=
CUSPARSE_OPERATION_TRANSPOSE
;
size_t
workspace_size
;
CUSPARSE_CALL
(
cusparseSpMM_bufferSize
(
thr_entry
->
cusparse_handle
,
transA
,
transB
,
&
alpha
,
matA
,
matB
,
&
beta
,
matC
,
cuda_dtype
,
CUSPARSE_CSRMM_ALG1
,
&
workspace_size
));
void
*
workspace
=
device
->
AllocWorkspace
(
ctx
,
workspace_size
);
CUSPARSE_CALL
(
cusparseSpMM
(
thr_entry
->
cusparse_handle
,
transA
,
transB
,
&
alpha
,
matA
,
matB
,
&
beta
,
matC
,
cuda_dtype
,
CUSPARSE_CSRMM_ALG1
,
workspace
));
device
->
FreeWorkspace
(
ctx
,
workspace
);
CUSPARSE_CALL
(
cusparseDestroySpMat
(
matA
));
CUSPARSE_CALL
(
cusparseDestroyDnMat
(
matB
));
CUSPARSE_CALL
(
cusparseDestroyDnMat
(
matC
));
#else
cusparseMatDescr_t
descr
;
CUSPARSE_CALL
(
cusparseCreateMatDescr
(
&
descr
));
CUSPARSE_CALL
(
cusparseSetMatType
(
descr
,
CUSPARSE_MATRIX_TYPE_GENERAL
));
...
...
@@ -141,6 +181,7 @@ void CusparseCsrmm2(
       static_cast<int32_t*>(csr.indices->data),
       B_data, n, &beta, trans_out, m));
   CUSPARSE_CALL(cusparseDestroyMatDescr(descr));
+#endif
   if (valptr)
     device->FreeWorkspace(ctx, valptr);
   // transpose the output matrix
...
...
src/kernel/cuda/binary_reduce_sum.cu
View file @
5cff2f1c
...
...
@@ -18,6 +18,7 @@ namespace kernel {
namespace
cuda
{
// specialization for cusparse
#if CUDART_VERSION < 11000
template
<
typename
DType
>
cusparseStatus_t
Xcsrmm2
(
cusparseHandle_t
handle
,
cusparseOperation_t
transA
,
cusparseOperation_t
transB
,
int
m
,
int
n
,
int
k
,
int
nnz
,
...
...
@@ -49,6 +50,7 @@ cusparseStatus_t Xcsrmm2<double>(cusparseHandle_t handle, cusparseOperation_t tr
alpha
,
descrA
,
csrValA
,
csrRowPtrA
,
csrColIndA
,
B
,
ldb
,
beta
,
C
,
ldc
);
}
#endif
template
<
typename
DType
>
cublasStatus_t
Xgeam
(
cublasHandle_t
handle
,
cublasOperation_t
transa
,
...
...
@@ -112,6 +114,44 @@ void CusparseCsrmm2(
// all one data array
DType
*
valptr
=
static_cast
<
DType
*>
(
device
->
AllocWorkspace
(
rtcfg
.
ctx
,
nnz
*
sizeof
(
DType
)));
utils
::
Fill
<
kDLGPU
>
(
rtcfg
.
ctx
,
valptr
,
nnz
,
static_cast
<
DType
>
(
1.
));
#if CUDART_VERSION >= 11000
cusparseSpMatDescr_t
matA
;
cusparseDnMatDescr_t
matB
,
matC
;
constexpr
auto
cuda_dtype
=
std
::
is_same
<
DType
,
float
>::
value
?
CUDA_R_32F
:
CUDA_R_64F
;
CUSPARSE_CALL
(
cusparseCreateCsr
(
&
matA
,
m
,
k
,
nnz
,
static_cast
<
int32_t
*>
(
csr
.
indptr
->
data
),
static_cast
<
int32_t
*>
(
csr
.
indices
->
data
),
const_cast
<
DType
*>
(
valptr
?
valptr
:
A_data
),
CUSPARSE_INDEX_32I
,
CUSPARSE_INDEX_32I
,
CUSPARSE_INDEX_BASE_ZERO
,
cuda_dtype
));
CUSPARSE_CALL
(
cusparseCreateDnMat
(
&
matB
,
n
,
k
,
n
,
const_cast
<
DType
*>
(
B_data
),
cuda_dtype
,
CUSPARSE_ORDER_COL
));
CUSPARSE_CALL
(
cusparseCreateDnMat
(
&
matC
,
m
,
n
,
m
,
trans_out
,
cuda_dtype
,
CUSPARSE_ORDER_COL
));
auto
transA
=
CUSPARSE_OPERATION_NON_TRANSPOSE
;
auto
transB
=
CUSPARSE_OPERATION_TRANSPOSE
;
size_t
workspace_size
;
CUSPARSE_CALL
(
cusparseSpMM_bufferSize
(
thr_entry
->
cusparse_handle
,
transA
,
transB
,
&
alpha
,
matA
,
matB
,
&
beta
,
matC
,
cuda_dtype
,
CUSPARSE_CSRMM_ALG1
,
&
workspace_size
));
void
*
workspace
=
device
->
AllocWorkspace
(
ctx
,
workspace_size
);
CUSPARSE_CALL
(
cusparseSpMM
(
thr_entry
->
cusparse_handle
,
transA
,
transB
,
&
alpha
,
matA
,
matB
,
&
beta
,
matC
,
cuda_dtype
,
CUSPARSE_CSRMM_ALG1
,
workspace
));
device
->
FreeWorkspace
(
ctx
,
workspace
);
CUSPARSE_CALL
(
cusparseDestroySpMat
(
matA
));
CUSPARSE_CALL
(
cusparseDestroyDnMat
(
matB
));
CUSPARSE_CALL
(
cusparseDestroyDnMat
(
matC
));
#else
cusparseMatDescr_t
descr
;
CUSPARSE_CALL
(
cusparseCreateMatDescr
(
&
descr
));
CUSPARSE_CALL
(
cusparseSetMatType
(
descr
,
CUSPARSE_MATRIX_TYPE_GENERAL
));
...
...
@@ -125,6 +165,8 @@ void CusparseCsrmm2(
static_cast
<
int32_t
*>
(
csr
.
indptr
->
data
),
static_cast
<
int32_t
*>
(
csr
.
indices
->
data
),
B_data
,
n
,
&
beta
,
trans_out
,
m
));
CUSPARSE_CALL
(
cusparseDestroyMatDescr
(
descr
));
#endif
device
->
FreeWorkspace
(
rtcfg
.
ctx
,
valptr
);
// transpose the output matrix
if
(
!
thr_entry
->
cublas_handle
)
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment