"tests/git@developer.sourcefind.cn:OpenDAS/dgl.git" did not exist on "c8ec9ce30557341dd42a7e09623221d2bb7405fd"
Unverified commit 5cff2f1c, authored by Zihao Ye and committed by GitHub
Browse files

[Feature] Use new cusparse API to support CUDA 11. (#1979)

* upd

* upd

* upd

* upd

* upd

* upd

* upd

* upd
parent 2a107320
...@@ -10,7 +10,7 @@ endif() ...@@ -10,7 +10,7 @@ endif()
include(CheckCXXCompilerFlag) include(CheckCXXCompilerFlag)
check_cxx_compiler_flag("-std=c++11" SUPPORT_CXX11) check_cxx_compiler_flag("-std=c++11" SUPPORT_CXX11)
set(dgl_known_gpu_archs "30 35 50 60 70") set(dgl_known_gpu_archs "35 50 60 70")
################################################################################################ ################################################################################################
# A function for automatic detection of GPUs installed (if autodetection is enabled) # A function for automatic detection of GPUs installed (if autodetection is enabled)
...@@ -43,7 +43,7 @@ set(CUDA_gpu_detect_output "") ...@@ -43,7 +43,7 @@ set(CUDA_gpu_detect_output "")
#find vcvarsall.bat and run it building msvc environment #find vcvarsall.bat and run it building msvc environment
get_filename_component(MY_COMPILER_DIR ${CMAKE_CXX_COMPILER} DIRECTORY) get_filename_component(MY_COMPILER_DIR ${CMAKE_CXX_COMPILER} DIRECTORY)
find_file(MY_VCVARSALL_BAT vcvarsall.bat "${MY_COMPILER_DIR}/.." "${MY_COMPILER_DIR}/../..") find_file(MY_VCVARSALL_BAT vcvarsall.bat "${MY_COMPILER_DIR}/.." "${MY_COMPILER_DIR}/../..")
execute_process(COMMAND ${MY_VCVARSALL_BAT} && ${CUDA_NVCC_EXECUTABLE} -arch sm_30 --run ${__cufile} execute_process(COMMAND ${MY_VCVARSALL_BAT} && ${CUDA_NVCC_EXECUTABLE} -arch sm_35 --run ${__cufile}
WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/" WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
RESULT_VARIABLE __nvcc_res OUTPUT_VARIABLE __nvcc_out RESULT_VARIABLE __nvcc_res OUTPUT_VARIABLE __nvcc_out
OUTPUT_STRIP_TRAILING_WHITESPACE) OUTPUT_STRIP_TRAILING_WHITESPACE)
...@@ -51,7 +51,7 @@ set(CUDA_gpu_detect_output "") ...@@ -51,7 +51,7 @@ set(CUDA_gpu_detect_output "")
if(CUDA_LIBRARY_PATH) if(CUDA_LIBRARY_PATH)
set(CUDA_LINK_LIBRARY_PATH "-L${CUDA_LIBRARY_PATH}") set(CUDA_LINK_LIBRARY_PATH "-L${CUDA_LIBRARY_PATH}")
endif() endif()
execute_process(COMMAND ${CUDA_NVCC_EXECUTABLE} -arch sm_30 --run ${__cufile} ${CUDA_LINK_LIBRARY_PATH} execute_process(COMMAND ${CUDA_NVCC_EXECUTABLE} -arch sm_35 --run ${__cufile} ${CUDA_LINK_LIBRARY_PATH}
WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/" WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
RESULT_VARIABLE __nvcc_res OUTPUT_VARIABLE __nvcc_out RESULT_VARIABLE __nvcc_res OUTPUT_VARIABLE __nvcc_out
OUTPUT_STRIP_TRAILING_WHITESPACE) OUTPUT_STRIP_TRAILING_WHITESPACE)
......
...@@ -47,7 +47,7 @@ CSRMatrix CSRTranspose<kDLGPU, int32_t>(CSRMatrix csr) { ...@@ -47,7 +47,7 @@ CSRMatrix CSRTranspose<kDLGPU, int32_t>(CSRMatrix csr) {
int32_t* t_indices_ptr = static_cast<int32_t*>(t_indices->data); int32_t* t_indices_ptr = static_cast<int32_t*>(t_indices->data);
void* t_data_ptr = t_data->data; void* t_data_ptr = t_data->data;
#if __CUDA_API_VERSION >= 10010 #if CUDART_VERSION >= 10010
auto device = runtime::DeviceAPI::Get(csr.indptr->ctx); auto device = runtime::DeviceAPI::Get(csr.indptr->ctx);
// workspace // workspace
size_t workspace_size; size_t workspace_size;
...@@ -67,6 +67,7 @@ CSRMatrix CSRTranspose<kDLGPU, int32_t>(CSRMatrix csr) { ...@@ -67,6 +67,7 @@ CSRMatrix CSRTranspose<kDLGPU, int32_t>(CSRMatrix csr) {
csr.num_rows, csr.num_cols, nnz, csr.num_rows, csr.num_cols, nnz,
data_ptr, indptr_ptr, indices_ptr, data_ptr, indptr_ptr, indices_ptr,
t_data_ptr, t_indptr_ptr, t_indices_ptr, t_data_ptr, t_indptr_ptr, t_indices_ptr,
CUDA_R_32F,
CUSPARSE_ACTION_NUMERIC, CUSPARSE_ACTION_NUMERIC,
CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_INDEX_BASE_ZERO,
CUSPARSE_CSR2CSC_ALG1, // see cusparse doc for reference CUSPARSE_CSR2CSC_ALG1, // see cusparse doc for reference
......
...@@ -28,6 +28,7 @@ void _Fill(DType* ptr, size_t length, DType val) { ...@@ -28,6 +28,7 @@ void _Fill(DType* ptr, size_t length, DType val) {
namespace cusparse { namespace cusparse {
#if CUDART_VERSION < 11000
template <typename DType> template <typename DType>
cusparseStatus_t Xcsrmm2(cusparseHandle_t handle, cusparseOperation_t transA, cusparseStatus_t Xcsrmm2(cusparseHandle_t handle, cusparseOperation_t transA,
cusparseOperation_t transB, int m, int n, int k, int nnz, cusparseOperation_t transB, int m, int n, int k, int nnz,
...@@ -59,6 +60,7 @@ cusparseStatus_t Xcsrmm2<double>(cusparseHandle_t handle, cusparseOperation_t tr ...@@ -59,6 +60,7 @@ cusparseStatus_t Xcsrmm2<double>(cusparseHandle_t handle, cusparseOperation_t tr
alpha, descrA, csrValA, csrRowPtrA, csrColIndA, alpha, descrA, csrValA, csrRowPtrA, csrColIndA,
B, ldb, beta, C, ldc); B, ldb, beta, C, ldc);
} }
#endif
template <typename DType> template <typename DType>
cublasStatus_t Xgeam(cublasHandle_t handle, cublasOperation_t transa, cublasStatus_t Xgeam(cublasHandle_t handle, cublasOperation_t transa,
...@@ -127,6 +129,44 @@ void CusparseCsrmm2( ...@@ -127,6 +129,44 @@ void CusparseCsrmm2(
valptr = static_cast<DType*>(device->AllocWorkspace(ctx, nnz * sizeof(DType))); valptr = static_cast<DType*>(device->AllocWorkspace(ctx, nnz * sizeof(DType)));
_Fill(valptr, nnz, static_cast<DType>(1.)); _Fill(valptr, nnz, static_cast<DType>(1.));
} }
#if CUDART_VERSION >= 11000
cusparseSpMatDescr_t matA;
cusparseDnMatDescr_t matB, matC;
constexpr auto cuda_dtype = std::is_same<DType, float>::value ? CUDA_R_32F: CUDA_R_64F;
CUSPARSE_CALL(cusparseCreateCsr(&matA,
m, k, nnz,
static_cast<int32_t*>(csr.indptr->data),
static_cast<int32_t*>(csr.indices->data),
const_cast<DType*>(valptr? valptr : A_data),
CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
CUSPARSE_INDEX_BASE_ZERO, cuda_dtype));
CUSPARSE_CALL(cusparseCreateDnMat(&matB,
n, k, n,
const_cast<DType*>(B_data), cuda_dtype, CUSPARSE_ORDER_COL));
CUSPARSE_CALL(cusparseCreateDnMat(&matC,
m, n, m,
trans_out, cuda_dtype, CUSPARSE_ORDER_COL));
auto transA = CUSPARSE_OPERATION_NON_TRANSPOSE;
auto transB = CUSPARSE_OPERATION_TRANSPOSE;
size_t workspace_size;
CUSPARSE_CALL(cusparseSpMM_bufferSize(
thr_entry->cusparse_handle, transA, transB,
&alpha, matA, matB, &beta, matC,
cuda_dtype, CUSPARSE_CSRMM_ALG1,
&workspace_size));
void* workspace = device->AllocWorkspace(ctx, workspace_size);
CUSPARSE_CALL(cusparseSpMM(
thr_entry->cusparse_handle, transA, transB,
&alpha, matA, matB, &beta, matC,
cuda_dtype, CUSPARSE_CSRMM_ALG1,
workspace));
device->FreeWorkspace(ctx, workspace);
CUSPARSE_CALL(cusparseDestroySpMat(matA));
CUSPARSE_CALL(cusparseDestroyDnMat(matB));
CUSPARSE_CALL(cusparseDestroyDnMat(matC));
#else
cusparseMatDescr_t descr; cusparseMatDescr_t descr;
CUSPARSE_CALL(cusparseCreateMatDescr(&descr)); CUSPARSE_CALL(cusparseCreateMatDescr(&descr));
CUSPARSE_CALL(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)); CUSPARSE_CALL(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL));
...@@ -141,6 +181,7 @@ void CusparseCsrmm2( ...@@ -141,6 +181,7 @@ void CusparseCsrmm2(
static_cast<int32_t*>(csr.indices->data), static_cast<int32_t*>(csr.indices->data),
B_data, n, &beta, trans_out, m)); B_data, n, &beta, trans_out, m));
CUSPARSE_CALL(cusparseDestroyMatDescr(descr)); CUSPARSE_CALL(cusparseDestroyMatDescr(descr));
#endif
if (valptr) if (valptr)
device->FreeWorkspace(ctx, valptr); device->FreeWorkspace(ctx, valptr);
// transpose the output matrix // transpose the output matrix
......
...@@ -18,6 +18,7 @@ namespace kernel { ...@@ -18,6 +18,7 @@ namespace kernel {
namespace cuda { namespace cuda {
// specialization for cusparse // specialization for cusparse
#if CUDART_VERSION < 11000
template <typename DType> template <typename DType>
cusparseStatus_t Xcsrmm2(cusparseHandle_t handle, cusparseOperation_t transA, cusparseStatus_t Xcsrmm2(cusparseHandle_t handle, cusparseOperation_t transA,
cusparseOperation_t transB, int m, int n, int k, int nnz, cusparseOperation_t transB, int m, int n, int k, int nnz,
...@@ -49,6 +50,7 @@ cusparseStatus_t Xcsrmm2<double>(cusparseHandle_t handle, cusparseOperation_t tr ...@@ -49,6 +50,7 @@ cusparseStatus_t Xcsrmm2<double>(cusparseHandle_t handle, cusparseOperation_t tr
alpha, descrA, csrValA, csrRowPtrA, csrColIndA, alpha, descrA, csrValA, csrRowPtrA, csrColIndA,
B, ldb, beta, C, ldc); B, ldb, beta, C, ldc);
} }
#endif
template <typename DType> template <typename DType>
cublasStatus_t Xgeam(cublasHandle_t handle, cublasOperation_t transa, cublasStatus_t Xgeam(cublasHandle_t handle, cublasOperation_t transa,
...@@ -112,6 +114,44 @@ void CusparseCsrmm2( ...@@ -112,6 +114,44 @@ void CusparseCsrmm2(
// all one data array // all one data array
DType* valptr = static_cast<DType*>(device->AllocWorkspace(rtcfg.ctx, nnz * sizeof(DType))); DType* valptr = static_cast<DType*>(device->AllocWorkspace(rtcfg.ctx, nnz * sizeof(DType)));
utils::Fill<kDLGPU>(rtcfg.ctx, valptr, nnz, static_cast<DType>(1.)); utils::Fill<kDLGPU>(rtcfg.ctx, valptr, nnz, static_cast<DType>(1.));
#if CUDART_VERSION >= 11000
cusparseSpMatDescr_t matA;
cusparseDnMatDescr_t matB, matC;
constexpr auto cuda_dtype = std::is_same<DType, float>::value ? CUDA_R_32F: CUDA_R_64F;
CUSPARSE_CALL(cusparseCreateCsr(&matA,
m, k, nnz,
static_cast<int32_t*>(csr.indptr->data),
static_cast<int32_t*>(csr.indices->data),
const_cast<DType*>(valptr? valptr : A_data),
CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
CUSPARSE_INDEX_BASE_ZERO, cuda_dtype));
CUSPARSE_CALL(cusparseCreateDnMat(&matB,
n, k, n,
const_cast<DType*>(B_data), cuda_dtype, CUSPARSE_ORDER_COL));
CUSPARSE_CALL(cusparseCreateDnMat(&matC,
m, n, m,
trans_out, cuda_dtype, CUSPARSE_ORDER_COL));
auto transA = CUSPARSE_OPERATION_NON_TRANSPOSE;
auto transB = CUSPARSE_OPERATION_TRANSPOSE;
size_t workspace_size;
CUSPARSE_CALL(cusparseSpMM_bufferSize(
thr_entry->cusparse_handle, transA, transB,
&alpha, matA, matB, &beta, matC,
cuda_dtype, CUSPARSE_CSRMM_ALG1,
&workspace_size));
void* workspace = device->AllocWorkspace(ctx, workspace_size);
CUSPARSE_CALL(cusparseSpMM(
thr_entry->cusparse_handle, transA, transB,
&alpha, matA, matB, &beta, matC,
cuda_dtype, CUSPARSE_CSRMM_ALG1,
workspace));
device->FreeWorkspace(ctx, workspace);
CUSPARSE_CALL(cusparseDestroySpMat(matA));
CUSPARSE_CALL(cusparseDestroyDnMat(matB));
CUSPARSE_CALL(cusparseDestroyDnMat(matC));
#else
cusparseMatDescr_t descr; cusparseMatDescr_t descr;
CUSPARSE_CALL(cusparseCreateMatDescr(&descr)); CUSPARSE_CALL(cusparseCreateMatDescr(&descr));
CUSPARSE_CALL(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)); CUSPARSE_CALL(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL));
...@@ -125,6 +165,8 @@ void CusparseCsrmm2( ...@@ -125,6 +165,8 @@ void CusparseCsrmm2(
static_cast<int32_t*>(csr.indptr->data), static_cast<int32_t*>(csr.indptr->data),
static_cast<int32_t*>(csr.indices->data), static_cast<int32_t*>(csr.indices->data),
B_data, n, &beta, trans_out, m)); B_data, n, &beta, trans_out, m));
CUSPARSE_CALL(cusparseDestroyMatDescr(descr));
#endif
device->FreeWorkspace(rtcfg.ctx, valptr); device->FreeWorkspace(rtcfg.ctx, valptr);
// transpose the output matrix // transpose the output matrix
if (!thr_entry->cublas_handle) { if (!thr_entry->cublas_handle) {
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment