[Feature] Use new cusparse API to support CUDA 11. (#1979)

* upd * upd * upd * upd * upd * upd * upd * upd

[Feature] Use new cusparse API to support CUDA 11. (#1979)
* upd * upd * upd * upd * upd * upd * upd * upd
5cff2f1c · Zihao Ye · GitHub · 2a107320 · 5cff2f1c · 5cff2f1c
Commit 5cff2f1c (signature unverified) — authored by Zihao Ye on Aug 27, 2020; committed by GitHub on Aug 27, 2020.
4 changed files
--- a/cmake/modules/CUDA.cmake
+++ b/cmake/modules/CUDA.cmake
@@ -10,7 +10,7 @@ endif()
 include(CheckCXXCompilerFlag)
 check_cxx_compiler_flag("-std=c++11"   SUPPORT_CXX11)

-set(dgl_known_gpu_archs "30 35 50 60 70")
+set(dgl_known_gpu_archs "35 50 60 70")

 ################################################################################################
 # A function for automatic detection of GPUs installed  (if autodetection is enabled)
@@ -43,7 +43,7 @@ set(CUDA_gpu_detect_output "")
      #find vcvarsall.bat and run it building msvc environment
      get_filename_component(MY_COMPILER_DIR ${CMAKE_CXX_COMPILER} DIRECTORY)
      find_file(MY_VCVARSALL_BAT vcvarsall.bat "${MY_COMPILER_DIR}/.." "${MY_COMPILER_DIR}/../..")
-      execute_process(COMMAND ${MY_VCVARSALL_BAT} && ${CUDA_NVCC_EXECUTABLE} -arch sm_30 --run  ${__cufile}
+      execute_process(COMMAND ${MY_VCVARSALL_BAT} && ${CUDA_NVCC_EXECUTABLE} -arch sm_35 --run  ${__cufile}
                      WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
                      RESULT_VARIABLE __nvcc_res OUTPUT_VARIABLE __nvcc_out
                      OUTPUT_STRIP_TRAILING_WHITESPACE)
@@ -51,7 +51,7 @@ set(CUDA_gpu_detect_output "")
      if(CUDA_LIBRARY_PATH)
        set(CUDA_LINK_LIBRARY_PATH "-L${CUDA_LIBRARY_PATH}")
      endif()
-      execute_process(COMMAND ${CUDA_NVCC_EXECUTABLE} -arch sm_30 --run ${__cufile} ${CUDA_LINK_LIBRARY_PATH}
+      execute_process(COMMAND ${CUDA_NVCC_EXECUTABLE} -arch sm_35 --run ${__cufile} ${CUDA_LINK_LIBRARY_PATH}
                      WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
                      RESULT_VARIABLE __nvcc_res OUTPUT_VARIABLE __nvcc_out
                      OUTPUT_STRIP_TRAILING_WHITESPACE)

--- a/src/array/cuda/csr_transpose.cc
+++ b/src/array/cuda/csr_transpose.cc
@@ -47,7 +47,7 @@ CSRMatrix CSRTranspose<kDLGPU, int32_t>(CSRMatrix csr) {
  int32_t* t_indices_ptr = static_cast<int32_t*>(t_indices->data);
  void* t_data_ptr = t_data->data;

-#if __CUDA_API_VERSION >= 10010
+#if CUDART_VERSION >= 10010
  auto device = runtime::DeviceAPI::Get(csr.indptr->ctx);
  // workspace
  size_t workspace_size;
@@ -67,6 +67,7 @@ CSRMatrix CSRTranspose<kDLGPU, int32_t>(CSRMatrix csr) {
      csr.num_rows, csr.num_cols, nnz,
      data_ptr, indptr_ptr, indices_ptr,
      t_data_ptr, t_indptr_ptr, t_indices_ptr,
+      CUDA_R_32F,
      CUSPARSE_ACTION_NUMERIC,
      CUSPARSE_INDEX_BASE_ZERO,
      CUSPARSE_CSR2CSC_ALG1,  // see cusparse doc for reference

--- a/src/array/cuda/spmm.cu
+++ b/src/array/cuda/spmm.cu
@@ -28,6 +28,7 @@ void _Fill(DType* ptr, size_t length, DType val) {

 namespace cusparse {

+#if CUDART_VERSION < 11000
 template <typename DType>
 cusparseStatus_t Xcsrmm2(cusparseHandle_t handle, cusparseOperation_t transA,
    cusparseOperation_t transB, int m, int n, int k, int nnz,
@@ -59,6 +60,7 @@ cusparseStatus_t Xcsrmm2<double>(cusparseHandle_t handle, cusparseOperation_t tr
      alpha, descrA, csrValA, csrRowPtrA, csrColIndA,
      B, ldb, beta, C, ldc);
 }
+#endif

 template <typename DType>
 cublasStatus_t Xgeam(cublasHandle_t handle, cublasOperation_t transa,
@@ -127,6 +129,44 @@ void CusparseCsrmm2(
    valptr = static_cast<DType*>(device->AllocWorkspace(ctx, nnz * sizeof(DType)));
    _Fill(valptr, nnz, static_cast<DType>(1.));
  }
+#if CUDART_VERSION >= 11000
+  cusparseSpMatDescr_t matA;
+  cusparseDnMatDescr_t matB, matC;
+  constexpr auto cuda_dtype = std::is_same<DType, float>::value ? CUDA_R_32F: CUDA_R_64F;
+  CUSPARSE_CALL(cusparseCreateCsr(&matA,
+      m, k, nnz,
+      static_cast<int32_t*>(csr.indptr->data),
+      static_cast<int32_t*>(csr.indices->data),
+      const_cast<DType*>(valptr? valptr : A_data),
+      CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
+      CUSPARSE_INDEX_BASE_ZERO, cuda_dtype));
+  CUSPARSE_CALL(cusparseCreateDnMat(&matB,
+      n, k, n,
+      const_cast<DType*>(B_data), cuda_dtype, CUSPARSE_ORDER_COL));
+  CUSPARSE_CALL(cusparseCreateDnMat(&matC,
+      m, n, m,
+      trans_out, cuda_dtype, CUSPARSE_ORDER_COL));
+
+  auto transA = CUSPARSE_OPERATION_NON_TRANSPOSE;
+  auto transB = CUSPARSE_OPERATION_TRANSPOSE;
+  size_t workspace_size;
+  CUSPARSE_CALL(cusparseSpMM_bufferSize(
+      thr_entry->cusparse_handle, transA, transB,
+      &alpha, matA, matB, &beta, matC,
+      cuda_dtype, CUSPARSE_CSRMM_ALG1,
+      &workspace_size));
+  void* workspace = device->AllocWorkspace(ctx, workspace_size);
+  CUSPARSE_CALL(cusparseSpMM(
+      thr_entry->cusparse_handle, transA, transB,
+      &alpha, matA, matB, &beta, matC,
+      cuda_dtype, CUSPARSE_CSRMM_ALG1,
+      workspace));
+  device->FreeWorkspace(ctx, workspace);
+
+  CUSPARSE_CALL(cusparseDestroySpMat(matA));
+  CUSPARSE_CALL(cusparseDestroyDnMat(matB));
+  CUSPARSE_CALL(cusparseDestroyDnMat(matC));
+#else
  cusparseMatDescr_t descr;
  CUSPARSE_CALL(cusparseCreateMatDescr(&descr));
  CUSPARSE_CALL(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL));
@@ -141,6 +181,7 @@ void CusparseCsrmm2(
      static_cast<int32_t*>(csr.indices->data),
      B_data, n, &beta, trans_out, m));
  CUSPARSE_CALL(cusparseDestroyMatDescr(descr));
+#endif
  if (valptr)
    device->FreeWorkspace(ctx, valptr);
  // transpose the output matrix

--- a/src/kernel/cuda/binary_reduce_sum.cu
+++ b/src/kernel/cuda/binary_reduce_sum.cu
@@ -18,6 +18,7 @@ namespace kernel {
 namespace cuda {
 // specialization for cusparse

+#if CUDART_VERSION < 11000
 template <typename DType>
 cusparseStatus_t Xcsrmm2(cusparseHandle_t handle, cusparseOperation_t transA,
    cusparseOperation_t transB, int m, int n, int k, int nnz,
@@ -49,6 +50,7 @@ cusparseStatus_t Xcsrmm2<double>(cusparseHandle_t handle, cusparseOperation_t tr
      alpha, descrA, csrValA, csrRowPtrA, csrColIndA,
      B, ldb, beta, C, ldc);
 }
+#endif

 template <typename DType>
 cublasStatus_t Xgeam(cublasHandle_t handle, cublasOperation_t transa,
@@ -112,6 +114,44 @@ void CusparseCsrmm2(
  // all one data array
  DType* valptr = static_cast<DType*>(device->AllocWorkspace(rtcfg.ctx, nnz * sizeof(DType)));
  utils::Fill<kDLGPU>(rtcfg.ctx, valptr, nnz, static_cast<DType>(1.));
+#if CUDART_VERSION >= 11000
+  cusparseSpMatDescr_t matA;
+  cusparseDnMatDescr_t matB, matC;
+  constexpr auto cuda_dtype = std::is_same<DType, float>::value ? CUDA_R_32F: CUDA_R_64F;
+  CUSPARSE_CALL(cusparseCreateCsr(&matA,
+      m, k, nnz,
+      static_cast<int32_t*>(csr.indptr->data),
+      static_cast<int32_t*>(csr.indices->data),
+      const_cast<DType*>(valptr? valptr : A_data),
+      CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
+      CUSPARSE_INDEX_BASE_ZERO, cuda_dtype));
+  CUSPARSE_CALL(cusparseCreateDnMat(&matB,
+      n, k, n,
+      const_cast<DType*>(B_data), cuda_dtype, CUSPARSE_ORDER_COL));
+  CUSPARSE_CALL(cusparseCreateDnMat(&matC,
+      m, n, m,
+      trans_out, cuda_dtype, CUSPARSE_ORDER_COL));
+
+  auto transA = CUSPARSE_OPERATION_NON_TRANSPOSE;
+  auto transB = CUSPARSE_OPERATION_TRANSPOSE;
+  size_t workspace_size;
+  CUSPARSE_CALL(cusparseSpMM_bufferSize(
+      thr_entry->cusparse_handle, transA, transB,
+      &alpha, matA, matB, &beta, matC,
+      cuda_dtype, CUSPARSE_CSRMM_ALG1,
+      &workspace_size));
+  void* workspace = device->AllocWorkspace(ctx, workspace_size);
+  CUSPARSE_CALL(cusparseSpMM(
+      thr_entry->cusparse_handle, transA, transB,
+      &alpha, matA, matB, &beta, matC,
+      cuda_dtype, CUSPARSE_CSRMM_ALG1,
+      workspace));
+  device->FreeWorkspace(ctx, workspace);
+
+  CUSPARSE_CALL(cusparseDestroySpMat(matA));
+  CUSPARSE_CALL(cusparseDestroyDnMat(matB));
+  CUSPARSE_CALL(cusparseDestroyDnMat(matC));
+#else
  cusparseMatDescr_t descr;
  CUSPARSE_CALL(cusparseCreateMatDescr(&descr));
  CUSPARSE_CALL(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL));
@@ -125,6 +165,8 @@ void CusparseCsrmm2(
      static_cast<int32_t*>(csr.indptr->data),
      static_cast<int32_t*>(csr.indices->data),
      B_data, n, &beta, trans_out, m));
+  CUSPARSE_CALL(cusparseDestroyMatDescr(descr));
+#endif
  device->FreeWorkspace(rtcfg.ctx, valptr);
  // transpose the output matrix
  if (!thr_entry->cublas_handle) {