Unverified Commit 7bab1365 authored by Zihao Ye, committed by GitHub

[feature] Support half-precision floating-point data type (fp16). (#2552)



* add tvm as submodule

* compilation is ok but calling fails

* can call now

* pack multiple modules, change names

* upd

* upd

* upd

* fix cmake

* upd

* upd

* upd

* upd

* fix

* relative path

* upd

* upd

* upd

* singleton

* upd

* trigger

* fix

* upd

* count reducible

* upd

* upd

* upd

* upd

* upd

* upd

* upd

* upd

* upd

* only keep related files

* upd

* upd

* upd

* upd

* lint

* lint

* lint

* lint

* pylint

* upd

* upd

* compilation

* fix

* upd

* upd

* upd

* upd

* upd

* upd

* upd doc

* refactor

* fix

* upd number
Co-authored-by: Zhi Lin <linzhilynn@gmail.com>
Co-authored-by: Ubuntu <ubuntu@ip-172-31-42-78.us-east-2.compute.internal>
Co-authored-by: Ubuntu <ubuntu@ip-172-31-21-156.us-east-2.compute.internal>
Co-authored-by: Jinjing Zhou <VoVAllen@users.noreply.github.com>
parent a7e941c3
@@ -6,6 +6,7 @@
 #include <dgl/array.h>
 #include "./segment_reduce.cuh"
 #include "./functor.cuh"
+#include "./utils.h"

 namespace dgl {
@@ -13,70 +14,97 @@ using namespace cuda;
 namespace aten {

-template <int XPU, typename IdType, typename DType>
+template <int XPU, typename IdType, int bits>
 void SegmentReduce(const std::string& op,
                    NDArray feat,
                    NDArray offsets,
                    NDArray out,
                    NDArray arg) {
-  if (op == "sum") {
-    cuda::SegmentReduce<IdType, DType, cuda::reduce::Sum<IdType, DType>>(
-        feat, offsets, out, arg);
-  } else if (op == "max") {
-    cuda::SegmentReduce<IdType, DType, cuda::reduce::Max<IdType, DType>>(
-        feat, offsets, out, arg);
-  } else if (op == "min") {
-    cuda::SegmentReduce<IdType, DType, cuda::reduce::Min<IdType, DType>>(
-        feat, offsets, out, arg);
-  } else {
-    LOG(FATAL) << "Not implemented";
-  }
+  SWITCH_BITS(bits, DType, {
+    if (op == "sum") {
+      cuda::SegmentReduce<IdType, DType, cuda::reduce::Sum<IdType, DType>>(
+          feat, offsets, out, arg);
+    } else if (op == "max") {
+      cuda::SegmentReduce<IdType, DType, cuda::reduce::Max<IdType, DType>>(
+          feat, offsets, out, arg);
+    } else if (op == "min") {
+      cuda::SegmentReduce<IdType, DType, cuda::reduce::Min<IdType, DType>>(
+          feat, offsets, out, arg);
+    } else {
+      LOG(FATAL) << "Not implemented";
+    }
+  });
 }

-template <int XPU, typename IdType, typename DType>
+template <int XPU, typename IdType, int bits>
 void BackwardSegmentCmp(NDArray feat,
                         NDArray arg,
                         NDArray out) {
-  cuda::BackwardSegmentCmp<IdType, DType>(feat, arg, out);
+  SWITCH_BITS(bits, DType, {
+    cuda::BackwardSegmentCmp<IdType, DType>(feat, arg, out);
+  });
 }

-template void SegmentReduce<kDLGPU, int32_t, float>(
+template void SegmentReduce<kDLGPU, int32_t, 16>(
    const std::string& op,
    NDArray feat,
    NDArray offsets,
    NDArray out,
    NDArray arg);
-template void SegmentReduce<kDLGPU, int64_t, float>(
+template void SegmentReduce<kDLGPU, int64_t, 16>(
    const std::string &op,
    NDArray feat,
    NDArray offsets,
    NDArray out,
    NDArray arg);
-template void SegmentReduce<kDLGPU, int32_t, double>(
+template void SegmentReduce<kDLGPU, int32_t, 32>(
+   const std::string& op,
+   NDArray feat,
+   NDArray offsets,
+   NDArray out,
+   NDArray arg);
+template void SegmentReduce<kDLGPU, int64_t, 32>(
    const std::string &op,
    NDArray feat,
    NDArray offsets,
    NDArray out,
    NDArray arg);
-template void SegmentReduce<kDLGPU, int64_t, double>(
+template void SegmentReduce<kDLGPU, int32_t, 64>(
    const std::string &op,
    NDArray feat,
    NDArray offsets,
    NDArray out,
    NDArray arg);
-template void BackwardSegmentCmp<kDLGPU, int32_t, float>(
+template void SegmentReduce<kDLGPU, int64_t, 64>(
+   const std::string &op,
+   NDArray feat,
+   NDArray offsets,
+   NDArray out,
+   NDArray arg);
+template void BackwardSegmentCmp<kDLGPU, int32_t, 16>(
+   NDArray feat,
+   NDArray arg,
+   NDArray out);
+template void BackwardSegmentCmp<kDLGPU, int64_t, 16>(
+   NDArray feat,
+   NDArray arg,
+   NDArray out);
+template void BackwardSegmentCmp<kDLGPU, int32_t, 32>(
    NDArray feat,
    NDArray arg,
    NDArray out);
-template void BackwardSegmentCmp<kDLGPU, int64_t, float>(
+template void BackwardSegmentCmp<kDLGPU, int64_t, 32>(
    NDArray feat,
    NDArray arg,
    NDArray out);
-template void BackwardSegmentCmp<kDLGPU, int32_t, double>(
+template void BackwardSegmentCmp<kDLGPU, int32_t, 64>(
    NDArray feat,
    NDArray arg,
    NDArray out);
-template void BackwardSegmentCmp<kDLGPU, int64_t, double>(
+template void BackwardSegmentCmp<kDLGPU, int64_t, 64>(
    NDArray feat,
    NDArray arg,
    NDArray out);
...
@@ -7,7 +7,6 @@
 #define DGL_ARRAY_SEGMENT_REDUCE_CUH_

 #include "../../runtime/cuda/cuda_common.h"
-#include "./atomic.cuh"
 #include "./utils.h"

 namespace dgl {
@@ -31,7 +30,7 @@ __global__ void SegmentReduceKernel(
   int row = blockIdx.x;
   int col = blockIdx.y * blockDim.x + threadIdx.x;
   if (col < dim) {
-    DType local_accum = ReduceOp::zero;
+    DType local_accum = ReduceOp::zero();
     IdType local_arg = -1;
     for (IdType i = offsets[row]; i < offsets[row + 1]; ++i) {
       ReduceOp::Call(&local_accum, &local_arg, feat[i * dim + col], i);
...
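Note on the `ReduceOp::zero` → `ReduceOp::zero()` change above: it is what lets the reducers hand out an identity element for half as well, since a plain static constexpr data member cannot easily be initialized when DType is __half, whereas a device function can construct the value on demand. A minimal sketch of a reducer written in that style (illustrative only, not DGL's actual cuda::reduce::Sum):

#include <cuda_fp16.h>

// Illustrative reducer; DGL's cuda::reduce::Sum differs in detail.
template <typename Idx, typename DType>
struct SumSketch {
  // The identity is returned by a function so the same code compiles for __half,
  // which has no constexpr constructor usable in a static data member initializer.
  static __device__ __forceinline__ DType zero() { return static_cast<DType>(0.0f); }
  static __device__ __forceinline__ void Call(DType* acc, Idx* arg, DType val, Idx idx) {
    *acc = *acc + val;  // half arithmetic needs the fp16 operators from cuda_fp16.h (sm_53+)
  }
};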
@@ -14,6 +14,140 @@ namespace dgl {
 using namespace cuda;

 namespace aten {
+namespace {
+
+/*! \brief Call cuBLAS geam API for transpose operation for float and double. */
+template <typename DType>
+cublasStatus_t Xgeam(cublasHandle_t handle, cublasOperation_t transa,
+                     cublasOperation_t transb, int m, int n,
+                     const DType* alpha, const DType* A, int lda,
+                     const DType* beta, const DType* B, int ldb,
+                     DType* C, int ldc) {
+  LOG(INFO) << "Not supported dtype";
+  return CUBLAS_STATUS_EXECUTION_FAILED;
+}
+
+template <>
+cublasStatus_t Xgeam<float>(cublasHandle_t handle, cublasOperation_t transa,
+                            cublasOperation_t transb, int m, int n,
+                            const float* alpha, const float* A, int lda,
+                            const float* beta, const float* B, int ldb,
+                            float* C, int ldc) {
+  return cublasSgeam(handle, transa, transb, m, n, alpha, A, lda,
+                     beta, B, ldb, C, ldc);
+}
+
+template <>
+cublasStatus_t Xgeam<double>(cublasHandle_t handle, cublasOperation_t transa,
+                             cublasOperation_t transb, int m, int n,
+                             const double* alpha, const double* A, int lda,
+                             const double* beta, const double* B, int ldb,
+                             double* C, int ldc) {
+  return cublasDgeam(handle, transa, transb, m, n, alpha, A, lda,
+                     beta, B, ldb, C, ldc);
+}
+
+/* \brief IndexSelect operator kernel implementation.
+ * \note duplicate of IndexSelectKernel defined in array_index_select.cu
+ */
+template <typename DType, typename IdType>
+__global__ void _IndexSelectKernel(
+    const DType* __restrict__ in,
+    const IdType* __restrict__ idx,
+    DType* __restrict__ out,
+    int n, int m) {
+  int i = blockIdx.x;
+  for (int j = threadIdx.x; j < m; j += blockDim.x)
+    out[i * m + j] = in[idx[i] * m + j];
+}
+
+/* \brief Transpose operator kernel implementation.
+ * \note not efficient but it's not a bottleneck, used for float16 dtype.
+ */
+template <typename DType>
+__global__ void _TransposeKernel(
+    const DType* __restrict__ in,
+    DType* __restrict__ out,
+    int n, int m) {
+  int i = blockIdx.x;
+  for (int j = threadIdx.x; j < m; j += blockDim.x)
+    out[i * m + j] = in[j * n + i];
+}
+
+/*
+ * \brief Tranpose the input matrix.
+ * \param row number of rows of input matrix.
+ * \param col number of columns of input matrix.
+ */
+template <typename DType>
+void _Transpose(const DType* in, DType* out,
+                int row, int col) {
+  DType alpha = 1., beta = 0.;
+  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
+  if (!thr_entry->cublas_handle)
+    CUBLAS_CALL(cublasCreate(&(thr_entry->cublas_handle)));
+  CUBLAS_CALL(cublasSetStream(thr_entry->cublas_handle, thr_entry->stream));
+  CUBLAS_CALL(Xgeam<DType>(
+      thr_entry->cublas_handle,
+      CUBLAS_OP_T,
+      CUBLAS_OP_N,
+      row, col,
+      &alpha, in, col,
+      &beta, nullptr, row,
+      out, row));
+}
+
+/*
+ * \brief Tranpose the input matrix for data type half.
+ * \note cuBLAS has no geam API for half data type, fallback to our kernel.
+ */
+template <>
+void _Transpose<half>(const half* in, half* out,
+                      int row, int col) {
+  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
+  int nt = FindNumThreads(row);
+  int nb = col;
+  CUDA_KERNEL_CALL(_TransposeKernel, nb, nt, 0, thr_entry->stream, in, out, col, row);
+}
+
+/*
+ * \brief
+ */
+template <typename DType, typename IdType>
+__global__ void _IndexSelectKernel(const DType* array, const IdType* index,
+                                   int64_t length, DType* out) {
+  int tx = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride_x = gridDim.x * blockDim.x;
+  while (tx < length) {
+    out[tx] = array[index[tx]];
+    tx += stride_x;
+  }
+}
+
+/* \brief IndexSelect operator.
+ * \note duplicate of IndexSelect defined in array_op.h but it can
+ *       not be applied to float16 dtype.
+ */
+template<typename DType, typename IdType>
+NDArray _IndexSelect(NDArray array, NDArray index) {
+  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
+  const DType* array_data = static_cast<DType*>(array->data);
+  const IdType* idx_data = static_cast<IdType*>(index->data);
+  const int64_t arr_len = array->shape[0];
+  const int64_t len = index->shape[0];
+  NDArray ret = NDArray::Empty({len}, array->dtype, array->ctx);
+  if (len == 0)
+    return ret;
+  DType* ret_data = static_cast<DType*>(ret->data);
+  const int nt = FindNumThreads(len);
+  const int nb = (len + nt - 1) / nt;
+  CUDA_KERNEL_CALL(_IndexSelectKernel, nb, nt, 0, thr_entry->stream,
+                   array_data, idx_data, len, ret_data);
+  return ret;
+}
+
+}  // namespace
+
 namespace cusparse {

 #if CUDART_VERSION < 11000
@@ -50,38 +184,8 @@ cusparseStatus_t Xcsrmm2<double>(cusparseHandle_t handle, cusparseOperation_t tr
 }
 #endif

-template <typename DType>
-cublasStatus_t Xgeam(cublasHandle_t handle, cublasOperation_t transa,
-                     cublasOperation_t transb, int m, int n,
-                     const DType* alpha, const DType* A, int lda,
-                     const DType* beta, const DType* B, int ldb,
-                     DType* C, int ldc) {
-  LOG(INFO) << "Not supported dtype";
-  return CUBLAS_STATUS_EXECUTION_FAILED;
-}
-
-template <>
-cublasStatus_t Xgeam<float>(cublasHandle_t handle, cublasOperation_t transa,
-                            cublasOperation_t transb, int m, int n,
-                            const float* alpha, const float* A, int lda,
-                            const float* beta, const float* B, int ldb,
-                            float* C, int ldc) {
-  return cublasSgeam(handle, transa, transb, m, n, alpha, A, lda,
-                     beta, B, ldb, C, ldc);
-}
-
-template <>
-cublasStatus_t Xgeam<double>(cublasHandle_t handle, cublasOperation_t transa,
-                             cublasOperation_t transb, int m, int n,
-                             const double* alpha, const double* A, int lda,
-                             const double* beta, const double* B, int ldb,
-                             double* C, int ldc) {
-  return cublasDgeam(handle, transa, transb, m, n, alpha, A, lda,
-                     beta, B, ldb, C, ldc);
-}
-
 /*! Cusparse implementation of SpMM on Csr format. */
-template <typename DType>
+template <typename DType, typename IdType>
 void CusparseCsrmm2(
     const DLContext& ctx,
     const CSRMatrix& csr,
@@ -118,20 +222,21 @@ void CusparseCsrmm2(
 #if CUDART_VERSION >= 11000
   cusparseSpMatDescr_t matA;
   cusparseDnMatDescr_t matB, matC;
-  constexpr auto cuda_dtype = std::is_same<DType, float>::value ? CUDA_R_32F: CUDA_R_64F;
+  constexpr auto dtype = cuda_dtype<DType>::value;
+  constexpr auto idtype = cusparse_idtype<IdType>::value;
   CUSPARSE_CALL(cusparseCreateCsr(&matA,
       m, k, nnz,
-      static_cast<int32_t*>(csr.indptr->data),
-      static_cast<int32_t*>(csr.indices->data),
+      static_cast<IdType*>(csr.indptr->data),
+      static_cast<IdType*>(csr.indices->data),
       const_cast<DType*>(valptr? valptr : A_data),
-      CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
-      CUSPARSE_INDEX_BASE_ZERO, cuda_dtype));
+      idtype, idtype,
+      CUSPARSE_INDEX_BASE_ZERO, dtype));
   CUSPARSE_CALL(cusparseCreateDnMat(&matB,
       k, n, n,
-      const_cast<DType*>(B_data), cuda_dtype, CUSPARSE_ORDER_ROW));
+      const_cast<DType*>(B_data), dtype, CUSPARSE_ORDER_ROW));
   CUSPARSE_CALL(cusparseCreateDnMat(&matC,
       m, n, n,
-      C_data, cuda_dtype, CUSPARSE_ORDER_ROW));
+      C_data, dtype, CUSPARSE_ORDER_ROW));
   auto transA = CUSPARSE_OPERATION_NON_TRANSPOSE;
   auto transB = CUSPARSE_OPERATION_NON_TRANSPOSE;
@@ -139,13 +244,13 @@ void CusparseCsrmm2(
   CUSPARSE_CALL(cusparseSpMM_bufferSize(
       thr_entry->cusparse_handle, transA, transB,
       &alpha, matA, matB, &beta, matC,
-      cuda_dtype, CUSPARSE_SPMM_CSR_ALG2,
+      dtype, CUSPARSE_SPMM_CSR_ALG2,
       &workspace_size));
   void* workspace = device->AllocWorkspace(ctx, workspace_size);
   CUSPARSE_CALL(cusparseSpMM(
       thr_entry->cusparse_handle, transA, transB,
       &alpha, matA, matB, &beta, matC,
-      cuda_dtype, CUSPARSE_SPMM_CSR_ALG2,
+      dtype, CUSPARSE_SPMM_CSR_ALG2,
       workspace));
   device->FreeWorkspace(ctx, workspace);
@@ -171,17 +276,7 @@ void CusparseCsrmm2(
       B_data, n, &beta, trans_out, m));
   CUSPARSE_CALL(cusparseDestroyMatDescr(descr));
   // transpose the output matrix
-  if (!thr_entry->cublas_handle)
-    CUBLAS_CALL(cublasCreate(&(thr_entry->cublas_handle)));
-  CUBLAS_CALL(cublasSetStream(thr_entry->cublas_handle, thr_entry->stream));
-  CUBLAS_CALL(Xgeam<DType>(
-      thr_entry->cublas_handle,
-      CUBLAS_OP_T,
-      CUBLAS_OP_N,
-      n, m,
-      &alpha, trans_out, m,
-      &beta, nullptr, n,
-      C_data, n));
+  _Transpose(trans_out, C_data, n, m);
   device->FreeWorkspace(ctx, trans_out);
 #endif
   if (valptr)
@@ -214,12 +309,29 @@ void CusparseCsrmm2(
   }                                                          \
 } while (0)

+/*!
+ * \brief Determine whether cusparse SpMM function is applicable.
+ */
+template <int bits, typename IdType>
+inline bool cusparse_available() {
+#if CUDART_VERSION < 11000
+  if (std::is_same<IdType, int>::value)
+    if (bits > 16)
+      return true;
+  return false;
+#else
+  if (bits == 16)
+    return false;  // cusparse's SpMM on fp16 is slow, temporally disabled.
+  return true;
+#endif
+}
+
 /*!
  * \brief CUDA implementation of g-SpMM on Csr format.
  * \note use cusparse if the reduce operator is `sum` and there is
  *       no broadcast, use dgl's kernel in other cases.
  */
-template <int XPU, typename IdType, typename DType>
+template <int XPU, typename IdType, int bits>
 void SpMMCsr(const std::string& op, const std::string& reduce,
              const BcastOff& bcast,
              const CSRMatrix& csr,
@@ -232,53 +344,67 @@ void SpMMCsr(const std::string& op, const std::string& reduce,
   bool use_efeat = op != "copy_lhs";
   if (reduce == "sum") {
-    if (sizeof(IdType) == 4 && op == "copy_lhs") {  // cusparse
+    if (op == "copy_lhs" && cusparse_available<bits, IdType>()) {  // cusparse
       int64_t x_length = 1;
       for (int i = 1; i < ufeat->ndim; ++i)
         x_length *= ufeat->shape[i];
-      cusparse::CusparseCsrmm2<DType>(
-          ufeat->ctx, csr,
-          static_cast<DType*>(ufeat->data),
-          nullptr,
-          static_cast<DType*>(out->data),
-          x_length);
-    } else if (sizeof(IdType) == 4 && op == "mul" && is_scalar_efeat) {  // cusparse
+      SWITCH_BITS(bits, DType, {
+        cusparse::CusparseCsrmm2<DType, IdType>(
+            ufeat->ctx, csr,
+            static_cast<DType*>(ufeat->data),
+            nullptr,
+            static_cast<DType*>(out->data),
+            x_length);
+      });
+    } else if (op == "mul" && is_scalar_efeat && cusparse_available<bits, IdType>()) {  // cusparse
       int64_t x_length = 1;
       for (int i = 1; i < ufeat->ndim; ++i)
         x_length *= ufeat->shape[i];
-      if (!IsNullArray(csr.data))
-        efeat = IndexSelect(efeat, csr.data);
-      cusparse::CusparseCsrmm2<DType>(
-          ufeat->ctx, csr,
-          static_cast<DType*>(ufeat->data),
-          static_cast<DType*>(efeat->data),
-          static_cast<DType*>(out->data),
-          x_length);
+      if (!IsNullArray(csr.data)) {
+        SWITCH_BITS(bits, DType, {
+          efeat = _IndexSelect<DType, IdType>(efeat, csr.data);
+        });
+      }
+      SWITCH_BITS(bits, DType, {
+        cusparse::CusparseCsrmm2<DType, IdType>(
+            ufeat->ctx, csr,
+            static_cast<DType*>(ufeat->data),
+            static_cast<DType*>(efeat->data),
+            static_cast<DType*>(out->data),
+            x_length);
+      });
     } else {  // general kernel
-      SWITCH_OP(op, Op, {
-        cuda::SpMMCsr<IdType, DType, Op, cuda::reduce::Sum<IdType, DType> >(
-            bcast, csr, ufeat, efeat, out, NullArray(), NullArray());
+      SWITCH_BITS(bits, DType, {
+        SWITCH_OP(op, Op, {
+          cuda::SpMMCsr<IdType, DType, Op, cuda::reduce::Sum<IdType, DType> >(
+              bcast, csr, ufeat, efeat, out, NullArray(), NullArray());
+        });
       });
     }
   } else if (reduce == "max") {
-    SWITCH_OP(op, Op, {
-      cuda::SpMMCsr<IdType, DType, Op, cuda::reduce::Max<IdType, DType> >(
-          bcast, csr, ufeat, efeat, out, out_aux[0], out_aux[1]);
+    SWITCH_BITS(bits, DType, {
+      SWITCH_OP(op, Op, {
+        cuda::SpMMCsr<IdType, DType, Op, cuda::reduce::Max<IdType, DType> >(
+            bcast, csr, ufeat, efeat, out, out_aux[0], out_aux[1]);
+      });
     });
   } else if (reduce == "min") {
-    SWITCH_OP(op, Op, {
-      cuda::SpMMCsr<IdType, DType, Op, cuda::reduce::Min<IdType, DType> >(
-          bcast, csr, ufeat, efeat, out, out_aux[0], out_aux[1]);
+    SWITCH_BITS(bits, DType, {
+      SWITCH_OP(op, Op, {
+        cuda::SpMMCsr<IdType, DType, Op, cuda::reduce::Min<IdType, DType> >(
+            bcast, csr, ufeat, efeat, out, out_aux[0], out_aux[1]);
+      });
     });
   } else {
     LOG(FATAL) << "Not implemented";
   }
 }

 /*!
  * \brief CUDA implementation of g-SpMM on Coo format.
  */
-template <int XPU, typename IdType, typename DType>
+template <int XPU, typename IdType, int bits>
 void SpMMCoo(const std::string& op, const std::string& reduce,
              const BcastOff& bcast,
              const COOMatrix& coo,
@@ -287,58 +413,81 @@ void SpMMCoo(const std::string& op, const std::string& reduce,
              NDArray out,
              std::vector<NDArray> out_aux) {
   if (reduce == "sum") {
-    SWITCH_OP(op, Op, {
-      cuda::SpMMCoo<IdType, DType, Op, cuda::reduce::Sum<IdType, DType, true> > (
-          bcast, coo, ufeat, efeat, out, NullArray(), NullArray());
+    SWITCH_BITS(bits, DType, {
+      SWITCH_OP(op, Op, {
+        cuda::SpMMCoo<IdType, DType, Op, cuda::reduce::Sum<IdType, DType, true> > (
+            bcast, coo, ufeat, efeat, out, NullArray(), NullArray());
+      });
     });
   } else if (reduce == "max") {
-    SWITCH_OP(op, Op, {
-      cuda::SpMMCoo<IdType, DType, Op, cuda::reduce::Max<IdType, DType, true> > (
-          bcast, coo, ufeat, efeat, out, out_aux[0], out_aux[1]);
+    SWITCH_BITS(bits, DType, {
+      SWITCH_OP(op, Op, {
+        cuda::SpMMCoo<IdType, DType, Op, cuda::reduce::Max<IdType, DType, true> > (
+            bcast, coo, ufeat, efeat, out, out_aux[0], out_aux[1]);
+      });
     });
   } else if (reduce == "min") {
-    SWITCH_OP(op, Op, {
-      cuda::SpMMCoo<IdType, DType, Op, cuda::reduce::Min<IdType, DType, true> > (
-          bcast, coo, ufeat, efeat, out, out_aux[0], out_aux[1]);
+    SWITCH_BITS(bits, DType, {
+      SWITCH_OP(op, Op, {
+        cuda::SpMMCoo<IdType, DType, Op, cuda::reduce::Min<IdType, DType, true> > (
+            bcast, coo, ufeat, efeat, out, out_aux[0], out_aux[1]);
+      });
     });
   } else {
     LOG(FATAL) << "Not implemented";
   }
 }

-template void SpMMCsr<kDLGPU, int32_t, float>(
+template void SpMMCsr<kDLGPU, int32_t, 16>(
     const std::string& op, const std::string& reduce,
     const BcastOff& bcast, const CSRMatrix& csr,
     NDArray ufeat, NDArray efeat, NDArray out, std::vector<NDArray> out_aux);
-template void SpMMCsr<kDLGPU, int64_t, float>(
+template void SpMMCsr<kDLGPU, int64_t, 16>(
     const std::string& op, const std::string& reduce,
     const BcastOff& bcast, const CSRMatrix& csr,
     NDArray ufeat, NDArray efeat, NDArray out, std::vector<NDArray> out_aux);
-template void SpMMCsr<kDLGPU, int32_t, double>(
+template void SpMMCsr<kDLGPU, int32_t, 32>(
     const std::string& op, const std::string& reduce,
     const BcastOff& bcast, const CSRMatrix& csr,
     NDArray ufeat, NDArray efeat, NDArray out, std::vector<NDArray> out_aux);
-template void SpMMCsr<kDLGPU, int64_t, double>(
+template void SpMMCsr<kDLGPU, int64_t, 32>(
+    const std::string& op, const std::string& reduce,
+    const BcastOff& bcast, const CSRMatrix& csr,
+    NDArray ufeat, NDArray efeat, NDArray out, std::vector<NDArray> out_aux);
+template void SpMMCsr<kDLGPU, int32_t, 64>(
+    const std::string& op, const std::string& reduce,
+    const BcastOff& bcast, const CSRMatrix& csr,
+    NDArray ufeat, NDArray efeat, NDArray out, std::vector<NDArray> out_aux);
+template void SpMMCsr<kDLGPU, int64_t, 64>(
     const std::string& op, const std::string& reduce,
     const BcastOff& bcast, const CSRMatrix& csr,
     NDArray ufeat, NDArray efeat, NDArray out, std::vector<NDArray> out_aux);
-template void SpMMCoo<kDLGPU, int32_t, float>(
+template void SpMMCoo<kDLGPU, int32_t, 16>(
+    const std::string& op, const std::string& reduce,
+    const BcastOff& bcast, const COOMatrix& coo,
+    NDArray ufeat, NDArray efeat, NDArray out, std::vector<NDArray> out_aux);
+template void SpMMCoo<kDLGPU, int64_t, 16>(
     const std::string& op, const std::string& reduce,
     const BcastOff& bcast, const COOMatrix& coo,
     NDArray ufeat, NDArray efeat, NDArray out, std::vector<NDArray> out_aux);
-template void SpMMCoo<kDLGPU, int64_t, float>(
+template void SpMMCoo<kDLGPU, int32_t, 32>(
     const std::string& op, const std::string& reduce,
     const BcastOff& bcast, const COOMatrix& coo,
     NDArray ufeat, NDArray efeat, NDArray out, std::vector<NDArray> out_aux);
-template void SpMMCoo<kDLGPU, int32_t, double>(
+template void SpMMCoo<kDLGPU, int64_t, 32>(
     const std::string& op, const std::string& reduce,
     const BcastOff& bcast, const COOMatrix& coo,
     NDArray ufeat, NDArray efeat, NDArray out, std::vector<NDArray> out_aux);
-template void SpMMCoo<kDLGPU, int64_t, double>(
+template void SpMMCoo<kDLGPU, int32_t, 64>(
     const std::string& op, const std::string& reduce,
     const BcastOff& bcast, const COOMatrix& coo,
     NDArray ufeat, NDArray efeat, NDArray out, std::vector<NDArray> out_aux);
+template void SpMMCoo<kDLGPU, int64_t, 64>(
+    const std::string& op, const std::string& reduce,
+    const BcastOff& bcast, const COOMatrix& coo,
+    NDArray ufeat, NDArray efeat, NDArray out, std::vector<NDArray> out_aux);

 }  // namespace aten
 }  // namespace dgl
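Note on the CUDA < 11 path above: cusparseXcsrmm2 leaves the result column-major, and the patch now transposes it with _Transpose; cuBLAS exposes Sgeam/Dgeam but no half-precision geam, hence the custom kernel specialization. A standalone sketch of that fallback, with the DGL runtime plumbing (CUDAThreadEntry, FindNumThreads, CUDA_KERNEL_CALL) replaced by plain CUDA and illustrative launch parameters:

#include <cuda_fp16.h>
#include <cuda_runtime.h>

// Naive transpose: block i writes output row i, threads stride over its columns.
// Mirrors the _TransposeKernel fallback used for half above.
template <typename DType>
__global__ void TransposeSketch(const DType* __restrict__ in,
                                DType* __restrict__ out, int n, int m) {
  int i = blockIdx.x;
  for (int j = threadIdx.x; j < m; j += blockDim.x)
    out[i * m + j] = in[j * n + i];
}

// Transpose a row-major (row x col) half matrix into a row-major (col x row) one.
void TransposeHalfSketch(const half* in, half* out, int row, int col,
                         cudaStream_t stream = 0) {
  const int nt = 256;  // stand-in for FindNumThreads(row)
  const int nb = col;  // one block per output row
  TransposeSketch<half><<<nb, nt, 0, stream>>>(in, out, col, row);
}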
@@ -8,6 +8,7 @@
 #include <dgl/bcast.h>
 #include "macro.cuh"
+#include "fp16.cuh"
 #include "atomic.cuh"
 #include "../../runtime/cuda/cuda_common.h"
 #include "./utils.h"
@@ -147,7 +148,7 @@ __global__ void SpMMCsrKernel(
   while (ty < num_rows) {
     int tx = blockIdx.x * blockDim.x + threadIdx.x;
     while (tx < out_len) {
-      DType local_accum = ReduceOp::zero;
+      DType local_accum = ReduceOp::zero();
       Idx local_argu = 0, local_arge = 0;
       const int lhs_add = UseBcast ? ubcast_off[tx] : tx;
       const int rhs_add = UseBcast ? ebcast_off[tx] : tx;
@@ -191,6 +192,12 @@ void SpMMCoo(
     const COOMatrix& coo,
     NDArray ufeat, NDArray efeat,
     NDArray out, NDArray argu, NDArray arge) {
+#if defined(CUDART_VERSION) && CUDART_VERSION <= 10000
+  if (std::is_same<DType, half>::value)
+    LOG(FATAL) << "SpMMCoo requires atomicCAS, which is not supported "
+               << "for float16 in CUDA 10.0. Please upgrade your CUDA "
+               << "to later versions.";
+#endif
   const Idx *row = coo.row.Ptr<Idx>(),
             *col = coo.col.Ptr<Idx>(),
             *edge_map = coo.data.Ptr<Idx>();
@@ -211,7 +218,7 @@ void SpMMCoo(
   const int nt = FindNumThreads(out_size);
   const int nb = (out_size + nt - 1) / nt;
   CUDA_KERNEL_CALL(_FillKernel, nb, nt, 0, thr_entry->stream,
-      out_data, out_size, ReduceOp::zero);
+      out_data, out_size, ReduceOp::zero());
   const int ntx = FindNumThreads(len);
   const int nty = CUDA_MAX_NUM_THREADS / ntx;
...
@@ -18,6 +18,37 @@ namespace cuda {
 #define CUDA_MAX_NUM_BLOCKS_Z 0xFFFF
 #define CUDA_MAX_NUM_THREADS 1024

+#ifdef USE_FP16
+#define SWITCH_BITS(bits, DType, ...)                               \
+  do {                                                              \
+    if ((bits) == 16) {                                             \
+      typedef half DType;                                           \
+      { __VA_ARGS__ }                                               \
+    } else if ((bits) == 32) {                                      \
+      typedef float DType;                                          \
+      { __VA_ARGS__ }                                               \
+    } else if ((bits) == 64) {                                      \
+      typedef double DType;                                         \
+      { __VA_ARGS__ }                                               \
+    } else {                                                        \
+      LOG(FATAL) << "Data type not renogized with bits " << bits;   \
+    }                                                               \
+  } while (0)
+#else  // USE_FP16
+#define SWITCH_BITS(bits, DType, ...)                               \
+  do {                                                              \
+    if ((bits) == 32) {                                             \
+      typedef float DType;                                          \
+      { __VA_ARGS__ }                                               \
+    } else if ((bits) == 64) {                                      \
+      typedef double DType;                                         \
+      { __VA_ARGS__ }                                               \
+    } else {                                                        \
+      LOG(FATAL) << "Data type not renogized with bits " << bits;   \
+    }                                                               \
+  } while (0)
+#endif  // USE_FP16
+
 /*! \brief Calculate the number of threads needed given the dimension length.
  *
  * It finds the biggest number that is smaller than min(dim, max_nthrs)
...
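A minimal sketch of how the SWITCH_BITS macro above is meant to be used by callers (the Fill kernel and dispatcher here are illustrative, not part of this patch, and assume the macro above and DGL's logging are in scope): the runtime bit width taken from an NDArray dtype is mapped to a concrete DType, and the body is compiled once per supported type.

#include <cuda_fp16.h>
#include <cuda_runtime.h>

// Illustrative kernel; any DType-templated kernel works the same way.
template <typename DType>
__global__ void Fill(DType* data, int n, DType val) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] = val;
}

// bits is 16, 32 or 64 (e.g. feat->dtype.bits); with USE_FP16 the macro expands
// the body for half/float/double, otherwise only for float/double.
void FillDispatchSketch(void* data, int n, int bits) {
  const int nt = 256;
  const int nb = (n + nt - 1) / nt;
  SWITCH_BITS(bits, DType, {
    Fill<DType><<<nb, nt>>>(static_cast<DType*>(data), n, static_cast<DType>(1.0f));
  });
}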
@@ -81,13 +81,13 @@ void SpMM(const std::string& op, const std::string& reduce,
   ATEN_XPU_SWITCH_CUDA(graph->Context().device_type, XPU, "SpMM", {
     ATEN_ID_TYPE_SWITCH(graph->DataType(), IdType, {
-      ATEN_FLOAT_TYPE_SWITCH(out->dtype, DType, "Feature data", {
+      ATEN_FLOAT_BITS_SWITCH(out->dtype, bits, "Feature data", {
         if (format == SparseFormat::kCSC) {
-          SpMMCsr<XPU, IdType, DType>(
+          SpMMCsr<XPU, IdType, bits>(
               op, reduce, bcast, graph->GetCSCMatrix(0),
               ufeat, efeat, out, out_aux);
         } else if (format == SparseFormat::kCOO) {
-          SpMMCoo<XPU, IdType, DType>(
+          SpMMCoo<XPU, IdType, bits>(
               op, reduce, bcast, graph->GetCOOMatrix(0),
               ufeat, efeat, out, out_aux);
         } else {
@@ -112,13 +112,13 @@ void SDDMM(const std::string& op,
   ATEN_XPU_SWITCH_CUDA(graph->Context().device_type, XPU, "SDDMM", {
     ATEN_ID_TYPE_SWITCH(graph->DataType(), IdType, {
-      ATEN_FLOAT_TYPE_SWITCH(out->dtype, DType, "Feature data", {
+      ATEN_FLOAT_BITS_SWITCH(out->dtype, bits, "Feature data", {
         if (format == SparseFormat::kCSR) {
-          SDDMMCsr<XPU, IdType, DType>(
+          SDDMMCsr<XPU, IdType, bits>(
               op, bcast, graph->GetCSRMatrix(0),
               lhs, rhs, out, lhs_target, rhs_target);
         } else if (format == SparseFormat::kCOO) {
-          SDDMMCoo<XPU, IdType, DType>(
+          SDDMMCoo<XPU, IdType, bits>(
               op, bcast, graph->GetCOOMatrix(0),
               lhs, rhs, out, lhs_target, rhs_target);
         } else {
@@ -129,6 +129,15 @@ void SDDMM(const std::string& op,
   });
 }

+NDArray GetEdgeMapping(HeteroGraphRef graph) {
+  SparseFormat format = graph->SelectFormat(0, csc_code);
+  if (format == SparseFormat::kCSC) {
+    return graph.sptr()->GetCSCMatrix(0).data;
+  } else {
+    return NullArray();
+  }
+}
+
 /*! \brief Segment reduce dispatch function. */
 void SegmentReduceDispatch(const std::string& op,
                            NDArray feat,
@@ -137,8 +146,8 @@ void SegmentReduceDispatch(const std::string& op,
                            NDArray arg) {
   ATEN_XPU_SWITCH_CUDA(feat->ctx.device_type, XPU, "SegmentReduce", {
     ATEN_ID_TYPE_SWITCH(offsets->dtype, IdType, {
-      ATEN_FLOAT_TYPE_SWITCH(feat->dtype, DType, "Feature data", {
-        SegmentReduce<XPU, IdType, DType>(op, feat, offsets, out, arg);
+      ATEN_FLOAT_BITS_SWITCH(feat->dtype, bits, "Feature data", {
+        SegmentReduce<XPU, IdType, bits>(op, feat, offsets, out, arg);
       });
     });
   });
@@ -148,8 +157,8 @@ void SegmentReduceDispatch(const std::string& op,
 void BackwardSegmentCmpDispatch(NDArray feat, NDArray arg, NDArray out) {
   ATEN_XPU_SWITCH_CUDA(feat->ctx.device_type, XPU, "BackwardSegmentCmp", {
     ATEN_ID_TYPE_SWITCH(arg->dtype, IdType, {
-      ATEN_FLOAT_TYPE_SWITCH(feat->dtype, DType, "Feature data", {
-        BackwardSegmentCmp<XPU, IdType, DType>(feat, arg, out);
+      ATEN_FLOAT_BITS_SWITCH(feat->dtype, bits, "Feature data", {
+        BackwardSegmentCmp<XPU, IdType, bits>(feat, arg, out);
       });
     });
   });
@@ -226,6 +235,12 @@ DGL_REGISTER_GLOBAL("sparse._CAPI_DGLKernelBwdSegmentCmp")
     BackwardSegmentCmpDispatch(feat, arg, out);
   });

+DGL_REGISTER_GLOBAL("sparse._CAPI_DGLKernelGetEdgeMapping")
+.set_body([](DGLArgs args, DGLRetValue *rv) {
+    HeteroGraphRef graph = args[0];
+    *rv = GetEdgeMapping(graph);
+  });
+
 #ifdef USE_TVM
 DGL_REGISTER_GLOBAL("sparse._CAPI_FG_LoadModule")
 .set_body([] (DGLArgs args, DGLRetValue* rv) {
...
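ATEN_FLOAT_BITS_SWITCH itself is not part of this diff; presumably it checks that the NDArray dtype is a floating-point type and exposes its bit width as a compile-time constant, since callers pass it straight into a template argument (SpMMCsr<XPU, IdType, bits>). A purely hypothetical sketch of such a macro, not the actual DGL definition:

// Hypothetical stand-in; the real ATEN_FLOAT_BITS_SWITCH may differ.
#define FLOAT_BITS_SWITCH_SKETCH(dtype, bits, val_name, ...)             \
  do {                                                                   \
    CHECK_EQ((dtype).code, kDLFloat) << (val_name) << " must be float";  \
    if ((dtype).bits == 16) {                                            \
      constexpr int bits = 16;                                           \
      { __VA_ARGS__ }                                                    \
    } else if ((dtype).bits == 32) {                                     \
      constexpr int bits = 32;                                           \
      { __VA_ARGS__ }                                                    \
    } else if ((dtype).bits == 64) {                                     \
      constexpr int bits = 64;                                           \
      { __VA_ARGS__ }                                                    \
    } else {                                                             \
      LOG(FATAL) << (val_name) << " can only be float16/32/64";          \
    }                                                                    \
  } while (0)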
@@ -19,7 +19,7 @@ namespace aten {
 /*!
  * \brief Generalized Sparse Matrix Dense Matrix Multiplication on Csr format.
  */
-template <int XPU, typename IdType, typename DType>
+template <int XPU, typename IdType, int bits>
 void SpMMCsr(const std::string& op, const std::string& reduce,
              const BcastOff& bcast,
              const aten::CSRMatrix& csr,
@@ -31,7 +31,7 @@ void SpMMCsr(const std::string& op, const std::string& reduce,
 /*!
  * \brief Generalized Sparse Matrix Dense Matrix Multiplication on Coo format.
  */
-template <int XPU, typename IdType, typename DType>
+template <int XPU, typename IdType, int bits>
 void SpMMCoo(const std::string& op, const std::string& reduce,
              const BcastOff& bcast,
              const aten::COOMatrix& coo,
@@ -43,7 +43,7 @@ void SpMMCoo(const std::string& op, const std::string& reduce,
 /*!
  * \brief Generalized Sampled Dense-Dense Matrix Multiplication on Csr format.
  */
-template <int XPU, typename IdType, typename DType>
+template <int XPU, typename IdType, int bits>
 void SDDMMCsr(const std::string& op,
               const BcastOff& bcast,
               const aten::CSRMatrix& csr,
@@ -56,7 +56,7 @@ void SDDMMCsr(const std::string& op,
 /*!
  * \brief Generalized Sampled Dense-Dense Matrix Multiplication on Coo format.
  */
-template <int XPU, typename IdType, typename DType>
+template <int XPU, typename IdType, int bits>
 void SDDMMCoo(const std::string& op,
               const BcastOff& bcast,
               const aten::COOMatrix& coo,
@@ -69,7 +69,7 @@ void SDDMMCoo(const std::string& op,
 /*!
  * \brief Segment reduce.
  */
-template <int XPU, typename IdType, typename DType>
+template <int XPU, typename IdType, int bits>
 void SegmentReduce(const std::string& op,
                    NDArray feat,
                    NDArray offsets,
@@ -79,7 +79,7 @@ void SegmentReduce(const std::string& op,
 /*!
  * \brief Backward function of segment cmp.
  */
-template <int XPU, typename IdType, typename DType>
+template <int XPU, typename IdType, int bits>
 void BackwardSegmentCmp(NDArray feat,
                         NDArray arg,
                         NDArray out);
...
@@ -70,6 +70,46 @@ inline bool is_zero<dim3>(dim3 size) {
     CHECK(e == CUBLAS_STATUS_SUCCESS) << "CUBLAS ERROR: " << e;  \
   }

+/*
+ * \brief Cast data type to cudaDataType_t.
+ */
+template <typename T>
+struct cuda_dtype {
+  static constexpr cudaDataType_t value = CUDA_R_32F;
+};
+
+template <>
+struct cuda_dtype<half> {
+  static constexpr cudaDataType_t value = CUDA_R_16F;
+};
+
+template <>
+struct cuda_dtype<float> {
+  static constexpr cudaDataType_t value = CUDA_R_32F;
+};
+
+template <>
+struct cuda_dtype<double> {
+  static constexpr cudaDataType_t value = CUDA_R_64F;
+};
+
+/*
+ * \brief Cast index data type to cusparseIndexType_t.
+ */
+template <typename T>
+struct cusparse_idtype {
+  static constexpr cusparseIndexType_t value = CUSPARSE_INDEX_32I;
+};
+
+template <>
+struct cusparse_idtype<int32_t> {
+  static constexpr cusparseIndexType_t value = CUSPARSE_INDEX_32I;
+};
+
+template <>
+struct cusparse_idtype<int64_t> {
+  static constexpr cusparseIndexType_t value = CUSPARSE_INDEX_64I;
+};
+
 /*! \brief Thread local workspace */
 class CUDAThreadEntry {
...
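The two trait structs above replace the old float-or-double ternary in CusparseCsrmm2: the value data type and the index type are both resolved at compile time. A few static_asserts illustrating the intended mappings (assuming <cuda_fp16.h>, <library_types.h> and <cusparse.h> are in scope, as they are via this header):

static_assert(cuda_dtype<half>::value   == CUDA_R_16F, "fp16 maps to CUDA_R_16F");
static_assert(cuda_dtype<float>::value  == CUDA_R_32F, "fp32 maps to CUDA_R_32F");
static_assert(cuda_dtype<double>::value == CUDA_R_64F, "fp64 maps to CUDA_R_64F");
static_assert(cusparse_idtype<int32_t>::value == CUSPARSE_INDEX_32I, "int32 indices");
static_assert(cusparse_idtype<int64_t>::value == CUSPARSE_INDEX_64I, "int64 indices");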