Commit 74d88bf8, authored Feb 20, 2025 by sangwz

Merge branch 'dtk25.04' of http://developer.sourcefind.cn/codes/OpenDAS/dgl into 2.2.1

Parents: 2a1ac588, 314cedc1

Changes: 179 files in the commit; showing 20 changed files with 433 additions and 369 deletions (+433 −369).
Changed files shown on this page:

- src/array/cuda/functor.cuh (+33 −32)
- src/array/cuda/gather_mm.hip (+59 −48)
- src/array/cuda/ge_spmm.cuh (+4 −2)
- src/array/cuda/labor_sampling.hip (+39 −27)
- src/array/cuda/macro.cuh (+5 −4)
- src/array/cuda/negative_sampling.hip (+20 −18)
- src/array/cuda/rowwise_sampling.hip (+22 −20)
- src/array/cuda/rowwise_sampling_prob.hip (+39 −34)
- src/array/cuda/sddmm.cuh (+7 −5)
- src/array/cuda/sddmm.hip (+7 −6)
- src/array/cuda/sddmm_hetero_coo.hip (+4 −3)
- src/array/cuda/sddmm_hetero_csr.hip (+4 −3)
- src/array/cuda/segment_reduce.cuh (+8 −6)
- src/array/cuda/segment_reduce.hip (+12 −11)
- src/array/cuda/spmat_op_impl_coo.hip (+6 −4)
- src/array/cuda/spmat_op_impl_csr.hip (+21 −18)
- src/array/cuda/spmm.cuh (+112 −102)
- src/array/cuda/spmm.hip (+8 −7)
- src/array/cuda/spmm_hetero.hip (+11 −9)
- src/array/cuda/utils.h (+12 −10)
src/array/cuda/functor.cuh

+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2020 by Contributors
  * @file array/cuda/functor.cuh
...
@@ -9,8 +10,8 @@
 #include <cmath>
 #include <limits>
-#include "./atomic.cuh"
-#include "./fp16.cuh"
+#include "atomic.cuh"
+#include "fp16.cuh"
 #include "bf16.cuh"
 namespace dgl {
...
@@ -208,29 +209,29 @@ struct Sum<Idx, __half, atomic> : _Sum<Idx, __half, atomic> {
 #if BF16_ENABLED
 template <typename Idx, bool atomic>
-struct Sum<Idx, __nv_bfloat16, atomic> : _Sum<Idx, __nv_bfloat16, atomic> {
-  static constexpr __host__ __device__ __forceinline__ __nv_bfloat16 zero() {
-    return __float2bfloat16_rn(0.);
+struct Sum<Idx, __hip_bfloat16, atomic> : _Sum<Idx, __hip_bfloat16, atomic> {
+  static constexpr __host__ __device__ __forceinline__ __hip_bfloat16 zero() {
+    return __float2bfloat16(0.);
   }
   static __device__ __forceinline__ void Call(
-      __nv_bfloat16* out_buf, Idx* arg_u_buf, Idx* arg_e_buf, __nv_bfloat16 val,
-      Idx uid, Idx eid) {
-    _Sum<Idx, __nv_bfloat16, atomic>::Call(
+      __hip_bfloat16* out_buf, Idx* arg_u_buf, Idx* arg_e_buf, __hip_bfloat16 val,
+      Idx uid, Idx eid) {
+    _Sum<Idx, __hip_bfloat16, atomic>::Call(
         out_buf, arg_u_buf, arg_e_buf, val, uid, eid);
   }
   static __device__ __forceinline__ void Call(
-      __nv_bfloat16* out_buf, Idx* arg_buf, __nv_bfloat16 val, Idx id) {
-    _Sum<Idx, __nv_bfloat16, atomic>::Call(out_buf, arg_buf, val, id);
+      __hip_bfloat16* out_buf, Idx* arg_buf, __hip_bfloat16 val, Idx id) {
+    _Sum<Idx, __hip_bfloat16, atomic>::Call(out_buf, arg_buf, val, id);
   }
   // sometimes we have to use float in reduction for better precision
   static __device__ __forceinline__ void Call(
       float* out_buf, Idx* arg_u_buf, Idx* arg_e_buf,
-      __nv_bfloat16 val, Idx uid, Idx eid) {
+      __hip_bfloat16 val, Idx uid, Idx eid) {
     _Sum<Idx, float, atomic>::Call(
         out_buf, arg_u_buf, arg_e_buf, static_cast<float>(val), uid, eid);
   }
   static __device__ __forceinline__ void Call(
-      float* out_buf, Idx* arg_buf, __nv_bfloat16 val, Idx id) {
+      float* out_buf, Idx* arg_buf, __hip_bfloat16 val, Idx id) {
     _Sum<Idx, float, atomic>::Call(
         out_buf, arg_buf, static_cast<float>(val), id);
   }
...
@@ -313,29 +314,29 @@ struct Max<Idx, __half, atomic> : _Max<Idx, __half, atomic> {
 #if BF16_ENABLED
 template <typename Idx, bool atomic>
-struct Max<Idx, __nv_bfloat16, atomic> : _Max<Idx, __nv_bfloat16, atomic> {
-  static constexpr __host__ __device__ __forceinline__ __nv_bfloat16 zero() {
-    return __float2bfloat16_rn(-std::numeric_limits<float>::infinity());
+struct Max<Idx, __hip_bfloat16, atomic> : _Max<Idx, __hip_bfloat16, atomic> {
+  static constexpr __host__ __device__ __forceinline__ __hip_bfloat16 zero() {
+    return __float2bfloat16(-std::numeric_limits<float>::infinity());
   }
   static __device__ __forceinline__ void Call(
-      __nv_bfloat16* out_buf, Idx* arg_u_buf, Idx* arg_e_buf, __nv_bfloat16 val,
-      Idx uid, Idx eid) {
-    _Max<Idx, __nv_bfloat16, atomic>::Call(
+      __hip_bfloat16* out_buf, Idx* arg_u_buf, Idx* arg_e_buf, __hip_bfloat16 val,
+      Idx uid, Idx eid) {
+    _Max<Idx, __hip_bfloat16, atomic>::Call(
         out_buf, arg_u_buf, arg_e_buf, val, uid, eid);
   }
   static __device__ __forceinline__ void Call(
-      __nv_bfloat16* out_buf, Idx* arg_buf, __nv_bfloat16 val, Idx id) {
-    _Max<Idx, __nv_bfloat16, atomic>::Call(out_buf, arg_buf, val, id);
+      __hip_bfloat16* out_buf, Idx* arg_buf, __hip_bfloat16 val, Idx id) {
+    _Max<Idx, __hip_bfloat16, atomic>::Call(out_buf, arg_buf, val, id);
   }
   // sometimes we have to use float in reduction for better precision
   static __device__ __forceinline__ void Call(
       float* out_buf, Idx* arg_u_buf, Idx* arg_e_buf,
-      __nv_bfloat16 val, Idx uid, Idx eid) {
+      __hip_bfloat16 val, Idx uid, Idx eid) {
     _Max<Idx, float, atomic>::Call(
         out_buf, arg_u_buf, arg_e_buf, static_cast<float>(val), uid, eid);
   }
   static __device__ __forceinline__ void Call(
-      float* out_buf, Idx* arg_buf, __nv_bfloat16 val, Idx id) {
+      float* out_buf, Idx* arg_buf, __hip_bfloat16 val, Idx id) {
     _Max<Idx, float, atomic>::Call(
         out_buf, arg_buf, static_cast<float>(val), id);
   }
...
@@ -418,29 +419,29 @@ struct Min<Idx, __half, atomic> : _Min<Idx, __half, atomic> {
 #if BF16_ENABLED
 template <typename Idx, bool atomic>
-struct Min<Idx, __nv_bfloat16, atomic> : _Min<Idx, __nv_bfloat16, atomic> {
-  static constexpr __host__ __device__ __forceinline__ __nv_bfloat16 zero() {
-    return __float2bfloat16_rn(std::numeric_limits<float>::infinity());
+struct Min<Idx, __hip_bfloat16, atomic> : _Min<Idx, __hip_bfloat16, atomic> {
+  static constexpr __host__ __device__ __forceinline__ __hip_bfloat16 zero() {
+    return __float2bfloat16(std::numeric_limits<float>::infinity());
   }
   static __device__ __forceinline__ void Call(
-      __nv_bfloat16* out_buf, Idx* arg_u_buf, Idx* arg_e_buf, __nv_bfloat16 val,
-      Idx uid, Idx eid) {
-    _Min<Idx, __nv_bfloat16, atomic>::Call(
+      __hip_bfloat16* out_buf, Idx* arg_u_buf, Idx* arg_e_buf, __hip_bfloat16 val,
+      Idx uid, Idx eid) {
+    _Min<Idx, __hip_bfloat16, atomic>::Call(
        out_buf, arg_u_buf, arg_e_buf, val, uid, eid);
   }
   static __device__ __forceinline__ void Call(
-      __nv_bfloat16* out_buf, Idx* arg_buf, __nv_bfloat16 val, Idx id) {
-    _Min<Idx, __nv_bfloat16, atomic>::Call(out_buf, arg_buf, val, id);
+      __hip_bfloat16* out_buf, Idx* arg_buf, __hip_bfloat16 val, Idx id) {
+    _Min<Idx, __hip_bfloat16, atomic>::Call(out_buf, arg_buf, val, id);
   }
   // sometimes we have to use float in reduction for better precision
   static __device__ __forceinline__ void Call(
       float* out_buf, Idx* arg_u_buf, Idx* arg_e_buf,
-      __nv_bfloat16 val, Idx uid, Idx eid) {
+      __hip_bfloat16 val, Idx uid, Idx eid) {
     _Min<Idx, float, atomic>::Call(
         out_buf, arg_u_buf, arg_e_buf, static_cast<float>(val), uid, eid);
   }
   static __device__ __forceinline__ void Call(
-      float* out_buf, Idx* arg_buf, __nv_bfloat16 val, Idx id) {
+      float* out_buf, Idx* arg_buf, __hip_bfloat16 val, Idx id) {
     _Min<Idx, float, atomic>::Call(
         out_buf, arg_buf, static_cast<float>(val), id);
   }
...
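Reviewer note (not part of the commit): the functor changes are a type-for-type swap of __nv_bfloat16/__float2bfloat16_rn for __hip_bfloat16/__float2bfloat16. A minimal, self-contained HIP sketch of those bf16 conversions — kernel and buffer names are invented for illustration only — looks like this:

// Illustrative sketch: exercising the __hip_bfloat16 conversions the hipified
// Sum/Max/Min functors rely on (accumulate in float, as the functors do).
#include <hip/hip_runtime.h>
#include <hip/hip_bf16.h>
#include <cstdio>

__global__ void AccumulateBF16(const __hip_bfloat16* in, float* out, int n) {
  float acc = 0.f;  // float analogue of the Sum identity __float2bfloat16(0.)
  for (int i = 0; i < n; ++i) acc += __bfloat162float(in[i]);
  *out = acc;
}

int main() {
  const int n = 4;
  __hip_bfloat16 h_in[n];
  for (int i = 0; i < n; ++i) h_in[i] = __float2bfloat16(1.5f * i);

  __hip_bfloat16* d_in;
  float* d_out;
  float h_out = 0.f;
  hipMalloc(&d_in, n * sizeof(__hip_bfloat16));
  hipMalloc(&d_out, sizeof(float));
  hipMemcpy(d_in, h_in, n * sizeof(__hip_bfloat16), hipMemcpyHostToDevice);
  hipLaunchKernelGGL(AccumulateBF16, dim3(1), dim3(1), 0, 0, d_in, d_out, n);
  hipMemcpy(&h_out, d_out, sizeof(float), hipMemcpyDeviceToHost);
  printf("sum = %f\n", h_out);  // expect 9.0
  hipFree(d_in);
  hipFree(d_out);
  return 0;
}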
src/array/cuda/gather_mm.cu → src/array/cuda/gather_mm.hip

+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
  * Copyright (c) 2020 by Contributors
  * @file array/cuda/gather_mm.cu
...
@@ -7,9 +9,9 @@
 #include <algorithm>  // std::swap
-#include "./atomic.cuh"
-#include "./functor.cuh"
-#include "./utils.h"
+#include "atomic.cuh"
+#include "functor.cuh"
+#include "utils.h"
 namespace dgl {
 using namespace cuda;
...
@@ -20,54 +22,63 @@ namespace {
 /** @brief Call cuBLAS GEMM API for dense matmul operation for float and double.
  */
 template <typename DType>
-cublasStatus_t cublasGemm(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
+hipblasStatus_t cublasGemm(
+    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
     int m, int n, int k, const DType* alpha, const DType* A, int lda,
     const DType* B, int ldb, const DType* beta, DType* C, int ldc) {
   LOG(INFO) << "Not supported dtype";
-  return CUBLAS_STATUS_EXECUTION_FAILED;
+  return HIPBLAS_STATUS_EXECUTION_FAILED;
 }
 template <>
-cublasStatus_t cublasGemm<__half>(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
+hipblasStatus_t cublasGemm<__half>(
+    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
     int m, int n, int k, const __half* alpha, const __half* A, int lda,
     const __half* B, int ldb, const __half* beta, __half* C, int ldc) {
-  return cublasHgemm(
-      handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+  return hipblasHgemm(
+      handle, transa, transb, m, n, k,
+      (hipblasHalf*)alpha, (hipblasHalf*)A, lda, (hipblasHalf*)B, ldb, (hipblasHalf*)beta, (hipblasHalf*)C, ldc);
 }
+// template <>
+// hipblasStatus_t cublasGemm<__half>(
+//     hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
+//     int m, int n, int k, const __half* alpha, const __half* A, int lda,
+//     const __half* B, int ldb, const __half* beta, __half* C, int ldc) {
+//   return hipblasHgemm(
+//       handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+// }
 #if BF16_ENABLED
 template <>
-cublasStatus_t cublasGemm<__nv_bfloat16>(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const __nv_bfloat16* alpha, const __nv_bfloat16* A,
-    int lda, const __nv_bfloat16* B, int ldb, const __nv_bfloat16* beta,
-    __nv_bfloat16* C, int ldc) {
+hipblasStatus_t cublasGemm<__hip_bfloat16>(
+    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
+    int m, int n, int k, const __hip_bfloat16* alpha, const __hip_bfloat16* A,
+    int lda, const __hip_bfloat16* B, int ldb, const __hip_bfloat16* beta,
+    __hip_bfloat16* C, int ldc) {
   float alpha_float = __bfloat162float(*alpha);
   float beta_float = __bfloat162float(*beta);
-  return cublasGemmEx(
-      handle, transa, transb, m, n, k, &alpha_float, A, CUDA_R_16BF, lda, B,
-      CUDA_R_16BF, ldb, &beta_float, C, CUDA_R_16BF, ldc, CUBLAS_COMPUTE_32F,
-      CUBLAS_GEMM_DEFAULT_TENSOR_OP);
+  return hipblasGemmEx(
+      handle, transa, transb, m, n, k, &alpha_float, A, HIPBLAS_R_16B, lda, B,
+      HIPBLAS_R_16B, ldb, &beta_float, C, HIPBLAS_R_16B, ldc, HIPBLAS_R_32F,
+      HIPBLAS_GEMM_DEFAULT);
 }
 #endif  // BF16_ENABLED
 template <>
-cublasStatus_t cublasGemm<float>(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
+hipblasStatus_t cublasGemm<float>(
+    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
     int m, int n, int k, const float* alpha, const float* A, int lda,
     const float* B, int ldb, const float* beta, float* C, int ldc) {
-  return cublasSgemm(
+  return hipblasSgemm(
       handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
 }
 template <>
-cublasStatus_t cublasGemm<double>(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
+hipblasStatus_t cublasGemm<double>(
+    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
    int m, int n, int k, const double* alpha, const double* A, int lda,
    const double* B, int ldb, const double* beta, double* C, int ldc) {
-  return cublasDgemm(
+  return hipblasDgemm(
      handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
 }
...
@@ -108,7 +119,7 @@ __global__ void GatherMMScatterKernel(
       // Load A in shared mem in a coalesced way
       for (unsigned int l = laneId; l < a_tile; l += 32)
         sh_A[local_row * sh_a_tile + l] = A[cur_rowA * in_len + (k_start + l)];
-      __syncwarp();
+      // __syncwarp();
       for (unsigned int outloop = 0; outloop < out_len; outloop += 32) {
         DType out_reg = static_cast<DType>(0.0f);  // thread private
...
@@ -165,7 +176,7 @@ __global__ void GatherMMScatterKernel2(
       /* Load A in shared mem in a coalesced way */
       for (unsigned int l = laneId; l < a_tile; l += 32)
         sh_A[local_row * sh_a_tile + l] = A[row_a * in_len + (k_start + l)];
-      __syncwarp();
+      // __syncwarp();
       for (unsigned int outloop = 0; outloop < out_len; outloop += 32) {
         DType out_reg = static_cast<DType>(0.0f);  // thread private
...
@@ -203,7 +214,7 @@ void SegmentMM(
     const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A,
     bool a_trans, bool b_trans) {
   auto device = runtime::DeviceAPI::Get(A->ctx);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   const DType* A_data = A.Ptr<DType>();
   const DType* B_data = B.Ptr<DType>();
   const IdType* seglen_A_data = seglen_A.Ptr<IdType>();
...
@@ -215,8 +226,8 @@ void SegmentMM(
   auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
   if (!thr_entry->cublas_handle)
-    CUBLAS_CALL(cublasCreate(&(thr_entry->cublas_handle)));
-  CUBLAS_CALL(cublasSetStream(thr_entry->cublas_handle, stream));
+    CUBLAS_CALL(hipblasCreate(&(thr_entry->cublas_handle)));
+  CUBLAS_CALL(hipblasSetStream(thr_entry->cublas_handle, stream));
   IdType m_offset = 0;
   for (IdType etype = 0; etype < num_rel; ++etype) {
...
@@ -226,10 +237,10 @@ void SegmentMM(
     n = B->shape[2];  // cols of B
     k = B->shape[1];  // cols of A == rows of B
     int ldb = n, lda = k, ldc = n;
-    cublasOperation_t transB = CUBLAS_OP_N;
-    cublasOperation_t transA = CUBLAS_OP_N;
+    hipblasOperation_t transB = HIPBLAS_OP_N;
+    hipblasOperation_t transA = HIPBLAS_OP_N;
     if (b_trans) {
-      transB = CUBLAS_OP_T;
+      transB = HIPBLAS_OP_T;
       ldb = n, lda = n, ldc = k;
       std::swap(n, k);
     }
...
@@ -248,7 +259,7 @@ template <int XPU, typename IdType, typename DType>
 void SegmentMMBackwardB(
     const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen) {
   auto device = runtime::DeviceAPI::Get(A->ctx);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   const DType* A_data = A.Ptr<DType>();
   const DType* dC_data = dC.Ptr<DType>();
   const IdType* seglen_data = seglen.Ptr<IdType>();
...
@@ -260,8 +271,8 @@ void SegmentMMBackwardB(
   auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
   if (!thr_entry->cublas_handle)
-    CUBLAS_CALL(cublasCreate(&(thr_entry->cublas_handle)));
-  CUBLAS_CALL(cublasSetStream(thr_entry->cublas_handle, stream));
+    CUBLAS_CALL(hipblasCreate(&(thr_entry->cublas_handle)));
+  CUBLAS_CALL(hipblasSetStream(thr_entry->cublas_handle, stream));
   IdType k_offset = 0;
   for (IdType etype = 0; etype < num_rel; ++etype) {
...
@@ -271,8 +282,8 @@ void SegmentMMBackwardB(
     CHECK_LE(k_offset + k, A->shape[0])
         << "Segement index out of bound of A->shape[0].";
     int lddC = m, ldA = n, lddB = m;
-    cublasOperation_t trans_dC = CUBLAS_OP_N;
-    cublasOperation_t trans_A = CUBLAS_OP_T;
+    hipblasOperation_t trans_dC = HIPBLAS_OP_N;
+    hipblasOperation_t trans_A = HIPBLAS_OP_T;
     CUBLAS_CALL(cublasGemm<DType>(
         thr_entry->cublas_handle, trans_dC, trans_A, m, n, k, &alpha,
         dC_data + dC_offset, lddC, A_data + A_offset, ldA, &beta,
...
@@ -299,7 +310,7 @@ void GatherMM(
     const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
     const NDArray idx_b) {
   auto device = runtime::DeviceAPI::Get(A->ctx);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   int64_t out_len = B->shape[2];  // cols of B
   int64_t in_len = A->shape[1];   // cols of A
   const int64_t tot_num_rows = A->shape[0];
...
@@ -332,7 +343,7 @@ void GatherMMScatter(
     const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
     const NDArray idx_b, const NDArray idx_c) {
   auto device = runtime::DeviceAPI::Get(A->ctx);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   const IdType* idx_c_data = idx_c.Ptr<IdType>();
   int64_t out_len = (B->ndim == 2) ? B->shape[1] : B->shape[2];  // cols of B
   int64_t in_len = A->shape[1];  // cols of A
...
@@ -367,10 +378,10 @@ template void GatherMM<kDGLCUDA, int64_t, __half>(
     const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
     const NDArray idx_b);
 #if BF16_ENABLED
-template void GatherMM<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void GatherMM<kDGLCUDA, int32_t, __hip_bfloat16>(
     const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
     const NDArray idx_b);
-template void GatherMM<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void GatherMM<kDGLCUDA, int64_t, __hip_bfloat16>(
     const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
     const NDArray idx_b);
 #endif  // BF16_ENABLED
...
@@ -394,10 +405,10 @@ template void GatherMMScatter<kDGLCUDA, int64_t, __half>(
     const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
     const NDArray idx_b, const NDArray idx_c);
 #if BF16_ENABLED
-template void GatherMMScatter<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void GatherMMScatter<kDGLCUDA, int32_t, __hip_bfloat16>(
     const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
     const NDArray idx_b, const NDArray idx_c);
-template void GatherMMScatter<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void GatherMMScatter<kDGLCUDA, int64_t, __hip_bfloat16>(
     const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
     const NDArray idx_b, const NDArray idx_c);
 #endif  // BF16_ENABLED
...
@@ -421,10 +432,10 @@ template void SegmentMM<kDGLCUDA, int64_t, __half>(
     const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A,
     bool a_trans, bool b_trans);
 #if BF16_ENABLED
-template void SegmentMM<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void SegmentMM<kDGLCUDA, int32_t, __hip_bfloat16>(
     const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A,
     bool a_trans, bool b_trans);
-template void SegmentMM<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void SegmentMM<kDGLCUDA, int64_t, __hip_bfloat16>(
     const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A,
     bool a_trans, bool b_trans);
 #endif  // BF16_ENABLED
...
@@ -446,9 +457,9 @@ template void SegmentMMBackwardB<kDGLCUDA, int32_t, __half>(
 template void SegmentMMBackwardB<kDGLCUDA, int64_t, __half>(
     const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen);
 #if BF16_ENABLED
-template void SegmentMMBackwardB<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void SegmentMMBackwardB<kDGLCUDA, int32_t, __hip_bfloat16>(
     const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen);
-template void SegmentMMBackwardB<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void SegmentMMBackwardB<kDGLCUDA, int64_t, __hip_bfloat16>(
     const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen);
 #endif  // BF16_ENABLED
 template void SegmentMMBackwardB<kDGLCUDA, int32_t, float>(
...
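Reviewer note (not part of the commit): the gather_mm/SegmentMM changes keep the cuBLAS-style flow — create a handle, bind it to the current stream, call the typed GEMM — but route it through hipBLAS. A self-contained hipBLAS SGEMM sketch under those assumptions (sizes, names, and the <hipblas.h> header path are illustrative; newer ROCm installs may use <hipblas/hipblas.h>):

// Illustrative sketch: hipblasCreate / hipblasSetStream / hipblasSgemm,
// the same sequence the hipified SegmentMM uses for its per-segment GEMMs.
#include <hip/hip_runtime.h>
#include <hipblas.h>
#include <vector>

int main() {
  const int n = 2;  // square n x n matrices, column-major as in cuBLAS
  std::vector<float> hA(n * n, 1.f), hB(n * n, 2.f), hC(n * n, 0.f);
  float *dA, *dB, *dC;
  hipMalloc(&dA, hA.size() * sizeof(float));
  hipMalloc(&dB, hB.size() * sizeof(float));
  hipMalloc(&dC, hC.size() * sizeof(float));
  hipMemcpy(dA, hA.data(), hA.size() * sizeof(float), hipMemcpyHostToDevice);
  hipMemcpy(dB, hB.data(), hB.size() * sizeof(float), hipMemcpyHostToDevice);

  hipStream_t stream;
  hipStreamCreate(&stream);
  hipblasHandle_t handle;
  hipblasCreate(&handle);
  hipblasSetStream(handle, stream);  // subsequent hipBLAS calls run on `stream`

  const float alpha = 1.f, beta = 0.f;
  // C = alpha * A * B + beta * C
  hipblasSgemm(handle, HIPBLAS_OP_N, HIPBLAS_OP_N, n, n, n,
               &alpha, dA, n, dB, n, &beta, dC, n);

  hipMemcpy(hC.data(), dC, hC.size() * sizeof(float), hipMemcpyDeviceToHost);
  hipblasDestroy(handle);
  hipStreamDestroy(stream);
  hipFree(dA); hipFree(dB); hipFree(dC);
  return 0;
}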
src/array/cuda/ge_spmm.cuh

+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
  * Copyright (c) 2020 by Contributors
  * @file array/cuda/ge_spmm.cuh
...
@@ -7,7 +9,7 @@
 #define DGL_ARRAY_CUDA_GE_SPMM_CUH_
 #include "../../runtime/cuda/cuda_common.h"
-#include "./utils.h"
+#include "utils.h"
 #include "atomic.cuh"
 #include "macro.cuh"
...
@@ -121,7 +123,7 @@ void GESpMMCsr(
   const DType* efeat_data = efeat.Ptr<DType>();
   DType* out_data = out.Ptr<DType>();
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   const int ntx = 32;
   const int nty = 32;
...
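Reviewer note (not part of the commit): the ge_spmm change is the recurring pattern — drop the "./" include prefix and fetch a hipStream_t instead of a cudaStream_t. Outside DGL's runtime wrappers, passing an explicit hipStream_t to a kernel launch looks like this sketch (kernel and sizes invented for the example):

// Illustrative sketch: launching a kernel on an explicit hipStream_t,
// analogous to how GESpMMCsr receives its stream from the runtime.
#include <hip/hip_runtime.h>
#include <cstdio>

__global__ void Scale(float* x, float a, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) x[i] *= a;
}

int main() {
  const int n = 1024;
  float* d;
  hipMalloc(&d, n * sizeof(float));
  hipMemset(d, 0, n * sizeof(float));

  hipStream_t stream;
  hipStreamCreate(&stream);
  const int ntx = 256, nbx = (n + ntx - 1) / ntx;
  hipLaunchKernelGGL(Scale, dim3(nbx), dim3(ntx), 0, stream, d, 2.f, n);
  hipStreamSynchronize(stream);  // wait only for work queued on this stream

  hipStreamDestroy(stream);
  hipFree(d);
  printf("done\n");
  return 0;
}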
src/array/cuda/labor_sampling.cu → src/array/cuda/labor_sampling.hip

+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /*!
  * Copyright (c) 2022, NVIDIA Corporation
  * Copyright (c) 2022, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
...
@@ -34,17 +36,21 @@
 #include <thrust/zip_function.h>
 #include <algorithm>
-#include <cub/cub.cuh>  // NOLINT
+#include <hipcub/hipcub.hpp>  // NOLINT
 #include <limits>
 #include <numeric>
 #include <type_traits>
 #include <utility>
 #include "../../array/cuda/utils.h"
+#include "atomic.cuh"
 #include "../../graph/transform/cuda/cuda_map_edges.cuh"
 #include "../../random/continuous_seed.h"
 #include "../../runtime/cuda/cuda_common.h"
-#include "./functor.cuh"
-#include "./spmm.cuh"
+#include "functor.cuh"
+#include "spmm.cuh"
 namespace dgl {
 namespace aten {
...
@@ -129,7 +135,7 @@ struct StencilOpFused {
   const IdType* indices;
   const IdType* nids;
   bool is_pinned;
-  __device__ auto operator()(IdType idx) {
+  __host__ __device__ auto operator()(IdType idx) {
     const auto in_row = idx_coo[idx];
     const auto ps = probs[idx];
     IdType rofs = idx - subindptr[in_row];
...
@@ -275,7 +281,7 @@ __global__ void _CSRRowWiseLayerSampleDegreeKernel(
     const FloatType* const ds, const FloatType* const d2s,
     const IdType* const indptr, const FloatType* const probs,
     const FloatType* const A, const IdType* const subindptr) {
-  typedef cub::BlockReduce<FloatType, BLOCK_SIZE> BlockReduce;
+  typedef hipcub::BlockReduce<FloatType, BLOCK_SIZE> BlockReduce;
   __shared__ typename BlockReduce::TempStorage temp_storage;
   __shared__ FloatType var_1_bcast[BLOCK_CTAS];
...
@@ -349,7 +355,7 @@ int log_size(const IdType size) {
 template <typename IdType, typename FloatType, typename exec_policy_t>
 void compute_importance_sampling_probabilities(
-    CSRMatrix mat, const IdType hop_size, cudaStream_t stream,
+    CSRMatrix mat, const IdType hop_size, hipStream_t stream,
     const continuous_seed seed, const IdType num_rows, const IdType* indptr,
     const IdType* subindptr, const IdType* indices, IdArray idx_coo_arr,
     const IdType* nids,
...
@@ -396,17 +402,17 @@ void compute_importance_sampling_probabilities(
       hop_1, 0, hop_2.get(), 0, sizeof(IdType) * hop_size, ctx, ctx,
       mat.indptr->dtype);
-  cub::DoubleBuffer<IdType> hop_b(hop_2.get(), hop_3.get());
+  hipcub::DoubleBuffer<IdType> hop_b(hop_2.get(), hop_3.get());
   {
     std::size_t temp_storage_bytes = 0;
-    CUDA_CALL(cub::DeviceRadixSort::SortKeys(
+    CUDA_CALL(hipcub::DeviceRadixSort::SortKeys(
         nullptr, temp_storage_bytes, hop_b, hop_size, 0, max_log_num_vertices,
         stream));
     auto temp = allocator.alloc_unique<char>(temp_storage_bytes);
-    CUDA_CALL(cub::DeviceRadixSort::SortKeys(
+    CUDA_CALL(hipcub::DeviceRadixSort::SortKeys(
        temp.get(), temp_storage_bytes, hop_b, hop_size, 0,
        max_log_num_vertices, stream));
   }
...
@@ -416,13 +422,13 @@ void compute_importance_sampling_probabilities(
   {
     std::size_t temp_storage_bytes = 0;
-    CUDA_CALL(cub::DeviceRunLengthEncode::Encode(
+    CUDA_CALL(hipcub::DeviceRunLengthEncode::Encode(
        nullptr, temp_storage_bytes, hop_b.Current(), hop_unique.get(),
        hop_counts.get(), hop_unique_size.get(), hop_size, stream));
     auto temp = allocator.alloc_unique<char>(temp_storage_bytes);
-    CUDA_CALL(cub::DeviceRunLengthEncode::Encode(
+    CUDA_CALL(hipcub::DeviceRunLengthEncode::Encode(
        temp.get(), temp_storage_bytes, hop_b.Current(), hop_unique.get(),
        hop_counts.get(), hop_unique_size.get(), hop_size, stream));
...
@@ -509,7 +515,7 @@ void compute_importance_sampling_probabilities(
 /////////////////////////////// CSR ///////////////////////////////
 template <DGLDeviceType XPU, typename IdType, typename FloatType>
-std::pair<COOMatrix, FloatArray> CSRLaborSampling(
+__host__ std::pair<COOMatrix, FloatArray> CSRLaborSampling(
     CSRMatrix mat, IdArray rows_arr, const int64_t num_picks,
     FloatArray prob_arr, const int importance_sampling, IdArray random_seed_arr,
     float seed2_contribution, IdArray NIDs) {
...
@@ -519,19 +525,25 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling(
   runtime::CUDAWorkspaceAllocator allocator(ctx);
-  const auto stream = runtime::getCurrentCUDAStream();
-  const auto exec_policy = thrust::cuda::par_nosync(allocator).on(stream);
+  const auto stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
+  const auto exec_policy = thrust::hip::par_nosync(allocator).on(stream);
   auto device = runtime::DeviceAPI::Get(ctx);
   const IdType num_rows = rows_arr->shape[0];
-  IdType* const rows = rows_arr.Ptr<IdType>();
-  IdType* const nids = IsNullArray(NIDs) ? nullptr : NIDs.Ptr<IdType>();
-  FloatType* const A = prob_arr.Ptr<FloatType>();
-  IdType* const indptr_ = mat.indptr.Ptr<IdType>();
-  IdType* const indices_ = mat.indices.Ptr<IdType>();
-  IdType* const data = CSRHasData(mat) ? mat.data.Ptr<IdType>() : nullptr;
+  // IdType* const rows = rows_arr.Ptr<IdType>();
+  IdType* const rows = static_cast<IdType*>(GetDevicePointer(rows_arr));
+  // IdType* const nids = IsNullArray(NIDs) ? nullptr : NIDs.Ptr<IdType>();
+  IdType* const nids = IsNullArray(NIDs) ? nullptr : static_cast<IdType*>(GetDevicePointer(NIDs));
+  // FloatType* const A = prob_arr.Ptr<FloatType>();
+  FloatType* const A = static_cast<FloatType*>(GetDevicePointer(prob_arr));;
+  // IdType* const indptr_ = mat.indptr.Ptr<IdType>();
+  IdType* const indptr_ = static_cast<IdType*>(GetDevicePointer(mat.indptr));
+  // IdType* const indices_ = mat.indices.Ptr<IdType>();
+  IdType* const indices_ = static_cast<IdType*>(GetDevicePointer(mat.indices));
+  // IdType* const data = CSRHasData(mat) ? mat.data.Ptr<IdType>() : nullptr;
+  IdType* const data = CSRHasData(mat) ? static_cast<IdType*>(GetDevicePointer(mat.data)) : nullptr;
   // Read indptr only once in case it is pinned and access is slow.
   auto indptr = allocator.alloc_unique<IdType>(num_rows);
...
@@ -567,11 +579,11 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling(
   auto ds_d2s = thrust::make_zip_iterator(ds, d2s);
   size_t prefix_temp_size = 0;
-  CUDA_CALL(cub::DeviceSegmentedReduce::Reduce(
+  CUDA_CALL(hipcub::DeviceSegmentedReduce::Reduce(
      nullptr, prefix_temp_size, A_A2, ds_d2s, num_rows, b_offsets, e_offsets,
      TupleSum{}, thrust::make_tuple((FloatType)0, (FloatType)0), stream));
   auto temp = allocator.alloc_unique<char>(prefix_temp_size);
-  CUDA_CALL(cub::DeviceSegmentedReduce::Reduce(
+  CUDA_CALL(hipcub::DeviceSegmentedReduce::Reduce(
      temp.get(), prefix_temp_size, A_A2, ds_d2s, num_rows, b_offsets,
      e_offsets, TupleSum{}, thrust::make_tuple((FloatType)0, (FloatType)0),
      stream));
...
@@ -584,11 +596,11 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling(
   IdType hop_size;
   {
     size_t prefix_temp_size = 0;
-    CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+    CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
        nullptr, prefix_temp_size, in_deg.get(), subindptr, num_rows + 1,
        stream));
     auto temp = allocator.alloc_unique<char>(prefix_temp_size);
-    CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+    CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
        temp.get(), prefix_temp_size, in_deg.get(), subindptr, num_rows + 1,
        stream));
...
@@ -617,11 +629,11 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling(
     auto modified_in_deg = thrust::make_transform_iterator(
         iota, AlignmentFunc<IdType>{in_deg.get(), perm, num_rows});
     size_t prefix_temp_size = 0;
-    CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+    CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
        nullptr, prefix_temp_size, modified_in_deg, subindptr_aligned.get(),
        num_rows + 1, stream));
     auto temp = allocator.alloc_unique<char>(prefix_temp_size);
-    CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+    CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
        temp.get(), prefix_temp_size, modified_in_deg,
        subindptr_aligned.get(), num_rows + 1, stream));
...
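Reviewer note (not part of the commit): the hipcub calls keep CUB's two-phase convention — a first call with a null temporary buffer only reports the workspace size, a second call does the actual scan. A self-contained sketch of that pattern (degrees padded with a trailing 0 so the exclusive scan's last slot holds the total, as with subindptr above; names are illustrative):

// Illustrative sketch: two-phase hipcub::DeviceScan::ExclusiveSum, the pattern
// used to build subindptr from per-row degrees.
#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>
#include <vector>
#include <cstdio>

int main() {
  const int num_rows = 5;
  // Per-row degrees plus a trailing 0 sentinel (num_rows + 1 entries).
  std::vector<int> h_deg = {3, 1, 4, 1, 5, 0};
  int *d_deg, *d_indptr;
  hipMalloc(&d_deg, (num_rows + 1) * sizeof(int));
  hipMalloc(&d_indptr, (num_rows + 1) * sizeof(int));
  hipMemcpy(d_deg, h_deg.data(), (num_rows + 1) * sizeof(int),
            hipMemcpyHostToDevice);

  hipStream_t stream;
  hipStreamCreate(&stream);

  // Phase 1: size query (temporary storage pointer is nullptr).
  void* d_temp = nullptr;
  size_t temp_bytes = 0;
  hipcub::DeviceScan::ExclusiveSum(
      d_temp, temp_bytes, d_deg, d_indptr, num_rows + 1, stream);
  hipMalloc(&d_temp, temp_bytes);
  // Phase 2: run the scan with the allocated workspace.
  hipcub::DeviceScan::ExclusiveSum(
      d_temp, temp_bytes, d_deg, d_indptr, num_rows + 1, stream);
  hipStreamSynchronize(stream);

  std::vector<int> h_indptr(num_rows + 1);
  hipMemcpy(h_indptr.data(), d_indptr, (num_rows + 1) * sizeof(int),
            hipMemcpyDeviceToHost);
  for (int v : h_indptr) printf("%d ", v);  // 0 3 4 8 9 14
  printf("\n");

  hipFree(d_temp); hipFree(d_deg); hipFree(d_indptr);
  hipStreamDestroy(stream);
  return 0;
}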
src/array/cuda/macro.cuh

+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2020 by Contributors
  * @file array/cuda/macro.cuh
...
@@ -30,14 +31,14 @@
     const auto device = runtime::DeviceAPI::Get(ctx);                  \
     (LHS_OFF) = static_cast<int64_t *>(device->AllocWorkspace(         \
         ctx, sizeof(int64_t) * info.lhs_offset.size()));               \
-    CUDA_CALL(cudaMemcpy(                                               \
+    CUDA_CALL(hipMemcpy(                                                \
         (LHS_OFF), &info.lhs_offset[0],                                 \
-        sizeof(int64_t) * info.lhs_offset.size(), cudaMemcpyHostToDevice)); \
+        sizeof(int64_t) * info.lhs_offset.size(), hipMemcpyHostToDevice)); \
     (RHS_OFF) = static_cast<int64_t *>(device->AllocWorkspace(         \
         ctx, sizeof(int64_t) * info.rhs_offset.size()));               \
-    CUDA_CALL(cudaMemcpy(                                               \
+    CUDA_CALL(hipMemcpy(                                                \
         (RHS_OFF), &info.rhs_offset[0],                                 \
-        sizeof(int64_t) * info.rhs_offset.size(), cudaMemcpyHostToDevice)); \
+        sizeof(int64_t) * info.rhs_offset.size(), hipMemcpyHostToDevice)); \
     if ((EDGE_MAP)) {                                                   \
       constexpr bool UseIdx = true;                                     \
       { __VA_ARGS__ }                                                   \
...
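Reviewer note (not part of the commit): the macro change is a one-for-one swap of cudaMemcpy/cudaMemcpyHostToDevice for hipMemcpy/hipMemcpyHostToDevice. Outside the macro, the same host-to-device copy of an offsets vector looks like this sketch (variable names invented):

// Illustrative sketch: copying a host std::vector of broadcast offsets to
// device memory with hipMemcpy, the call the hipified macro now issues.
#include <hip/hip_runtime.h>
#include <cstdint>
#include <vector>

int main() {
  std::vector<int64_t> lhs_offset = {0, 4, 9, 16};

  int64_t* d_lhs_off = nullptr;
  hipMalloc(&d_lhs_off, sizeof(int64_t) * lhs_offset.size());
  hipMemcpy(d_lhs_off, lhs_offset.data(),
            sizeof(int64_t) * lhs_offset.size(), hipMemcpyHostToDevice);

  // ... kernels that index through d_lhs_off would be launched here ...

  hipFree(d_lhs_off);
  return 0;
}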
src/array/cuda/negative_sampling.cu → src/array/cuda/negative_sampling.hip

+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
  * Copyright (c) 2021 by Contributors
  * @file array/cuda/negative_sampling.cu
  * @brief rowwise sampling
  */
-#include <curand_kernel.h>
+#include <hiprand/hiprand_kernel.h>
 #include <dgl/array.h>
 #include <dgl/array_iterator.h>
 #include <dgl/random.h>
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
 #include "../../runtime/cuda/cuda_common.h"
-#include "./utils.h"
+#include "utils.h"
 using namespace dgl::runtime;
...
@@ -31,13 +33,13 @@ __global__ void _GlobalUniformNegativeSamplingKernel(
   int64_t tx = blockIdx.x * blockDim.x + threadIdx.x;
   const int stride_x = gridDim.x * blockDim.x;
-  curandStatePhilox4_32_10_t
+  hiprandStatePhilox4_32_10_t
       rng;  // this allows generating 4 32-bit ints at a time
-  curand_init(random_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
+  hiprand_init(random_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
   while (tx < num_samples) {
     for (int i = 0; i < num_trials; ++i) {
-      uint4 result = curand4(&rng);
+      uint4 result = hiprand4(&rng);
       // Turns out that result.x is always 0 with the above RNG.
       uint64_t y_hi = result.y >> 16;
       uint64_t y_lo = result.y & 0xFFFF;
...
@@ -88,7 +90,7 @@ struct IsNotMinusOne {
 template <typename IdType>
 void SortOrderedPairs(
     runtime::DeviceAPI* device, DGLContext ctx, IdType* major, IdType* minor,
-    IdType* tmp_major, IdType* tmp_minor, int64_t n, cudaStream_t stream) {
+    IdType* tmp_major, IdType* tmp_minor, int64_t n, hipStream_t stream) {
   // Sort ordered pairs in lexicographical order by two radix sorts since
   // cub's radix sorts are stable.
   // We need a 2*n auxiliary storage to store the results form the first radix
...
@@ -98,21 +100,21 @@ void SortOrderedPairs(
   void* tmp2 = nullptr;
   // Radix sort by minor key first, reorder the major key in the progress.
-  CUDA_CALL(cub::DeviceRadixSort::SortPairs(
+  CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
      tmp1, s1, minor, tmp_minor, major, tmp_major, n, 0, sizeof(IdType) * 8,
      stream));
   tmp1 = device->AllocWorkspace(ctx, s1);
-  CUDA_CALL(cub::DeviceRadixSort::SortPairs(
+  CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
      tmp1, s1, minor, tmp_minor, major, tmp_major, n, 0, sizeof(IdType) * 8,
      stream));
   // Radix sort by major key next.
-  CUDA_CALL(cub::DeviceRadixSort::SortPairs(
+  CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
      tmp2, s2, tmp_major, major, tmp_minor, minor, n, 0, sizeof(IdType) * 8,
      stream));
   tmp2 = (s2 > s1) ? device->AllocWorkspace(ctx, s2)
                    : tmp1;  // reuse buffer if s2 <= s1
-  CUDA_CALL(cub::DeviceRadixSort::SortPairs(
+  CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
      tmp2, s2, tmp_major, major, tmp_minor, minor, n, 0, sizeof(IdType) * 8,
      stream));
...
@@ -141,7 +143,7 @@ std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling(
   IdType* out_row_data = out_row.Ptr<IdType>();
   IdType* out_col_data = out_col.Ptr<IdType>();
   auto device = runtime::DeviceAPI::Get(ctx);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   const int nt = cuda::FindNumThreads(num_actual_samples);
   const int nb = (num_actual_samples + nt - 1) / nt;
   std::pair<IdArray, IdArray> result;
...
@@ -159,11 +161,11 @@ std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling(
   IsNotMinusOne<IdType> op;
   PairIterator<IdType> begin(row_data, col_data);
   PairIterator<IdType> out_begin(out_row_data, out_col_data);
-  CUDA_CALL(cub::DeviceSelect::If(
+  CUDA_CALL(hipcub::DeviceSelect::If(
      nullptr, tmp_size, begin, out_begin, num_out_cuda, num_actual_samples, op,
      stream));
   void* tmp = device->AllocWorkspace(ctx, tmp_size);
-  CUDA_CALL(cub::DeviceSelect::If(
+  CUDA_CALL(hipcub::DeviceSelect::If(
      tmp, tmp_size, begin, out_begin, num_out_cuda, num_actual_samples, op,
      stream));
   num_out = cuda::GetCUDAScalar(device, ctx, num_out_cuda);
...
@@ -181,25 +183,25 @@ std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling(
     size_t tmp_size_unique = 0;
     void* tmp_unique = nullptr;
-    CUDA_CALL(cub::DeviceSelect::Unique(
+    CUDA_CALL(hipcub::DeviceSelect::Unique(
        nullptr, tmp_size_unique, out_begin, unique_begin, num_out_cuda,
        num_out, stream));
     tmp_unique = (tmp_size_unique > tmp_size)
                      ? device->AllocWorkspace(ctx, tmp_size_unique)
                      : tmp;  // reuse buffer
-    CUDA_CALL(cub::DeviceSelect::Unique(
+    CUDA_CALL(hipcub::DeviceSelect::Unique(
        tmp_unique, tmp_size_unique, out_begin, unique_begin, num_out_cuda,
        num_out, stream));
     num_out = cuda::GetCUDAScalar(device, ctx, num_out_cuda);
-    num_out = std::min(num_samples, num_out);
+    num_out = ::min(num_samples, num_out);
     result = {
         unique_row.CreateView({num_out}, dtype),
         unique_col.CreateView({num_out}, dtype)};
     if (tmp_unique != tmp) device->FreeWorkspace(ctx, tmp_unique);
   } else {
-    num_out = std::min(num_samples, num_out);
+    num_out = ::min(num_samples, num_out);
     result = {
         out_row.CreateView({num_out}, dtype),
         out_col.CreateView({num_out}, dtype)};
...
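Reviewer note (not part of the commit): the RNG changes are mechanical — curandStatePhilox4_32_10_t, curand_init, and curand4/curand_uniform become their hipRAND counterparts with identical seeding. A minimal per-thread Philox setup in HIP, outside DGL, could look like the sketch below (seed and sizes are illustrative):

// Illustrative sketch: per-thread Philox state with hipRAND, the same setup
// used by the hipified negative-sampling and row-wise sampling kernels.
#include <hip/hip_runtime.h>
#include <hiprand/hiprand_kernel.h>
#include <cstdio>

__global__ void DrawUniform(float* out, int n, uint64_t seed) {
  int tx = blockIdx.x * blockDim.x + threadIdx.x;
  // One Philox stream per (block, thread) pair, mirroring the kernels above.
  hiprandStatePhilox4_32_10_t rng;
  hiprand_init(seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
  if (tx < n) out[tx] = hiprand_uniform(&rng);  // uniform in (0, 1]
}

int main() {
  const int n = 8;
  float* d_out;
  hipMalloc(&d_out, n * sizeof(float));
  hipLaunchKernelGGL(DrawUniform, dim3(1), dim3(n), 0, 0, d_out, n, 2025ull);
  float h_out[n];
  hipMemcpy(h_out, d_out, n * sizeof(float), hipMemcpyDeviceToHost);
  for (int i = 0; i < n; ++i) printf("%f\n", h_out[i]);
  hipFree(d_out);
  return 0;
}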
src/array/cuda/rowwise_sampling.cu → src/array/cuda/rowwise_sampling.hip

+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
  * Copyright (c) 2021 by Contributors
  * @file array/cuda/rowwise_sampling.cu
  * @brief uniform rowwise sampling
  */
-#include <curand_kernel.h>
+#include <hiprand/hiprand_kernel.h>
 #include <dgl/random.h>
 #include <dgl/runtime/device_api.h>
 #include <dgl/runtime/tensordispatch.h>
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
 #include <numeric>
-#include "../../array/cuda/atomic.cuh"
+#include "atomic.cuh"
 #include "../../runtime/cuda/cuda_common.h"
-#include "./utils.h"
+#include "utils.h"
 using namespace dgl::cuda;
 using namespace dgl::aten::cuda;
...
@@ -126,8 +128,8 @@ __global__ void _CSRRowWiseSampleUniformKernel(
   const int64_t last_row =
       min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
-  curandStatePhilox4_32_10_t rng;
-  curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
+  hiprandStatePhilox4_32_10_t rng;
+  hiprand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
   while (out_row < last_row) {
     const int64_t row = in_rows[out_row];
...
@@ -151,7 +153,7 @@ __global__ void _CSRRowWiseSampleUniformKernel(
       __syncthreads();
       for (int idx = num_picks + threadIdx.x; idx < deg; idx += BLOCK_SIZE) {
-        const int num = curand(&rng) % (idx + 1);
+        const int num = hiprand(&rng) % (idx + 1);
         if (num < num_picks) {
           // use max so as to achieve the replacement order the serial
           // algorithm would have
...
@@ -204,8 +206,8 @@ __global__ void _CSRRowWiseSampleUniformReplaceKernel(
   const int64_t last_row =
       min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
-  curandStatePhilox4_32_10_t rng;
-  curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
+  hiprandStatePhilox4_32_10_t rng;
+  hiprand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
   while (out_row < last_row) {
     const int64_t row = in_rows[out_row];
...
@@ -216,7 +218,7 @@ __global__ void _CSRRowWiseSampleUniformReplaceKernel(
     if (deg > 0) {
       // each thread then blindly copies in rows only if deg > 0.
       for (int idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) {
-        const int64_t edge = curand(&rng) % deg;
+        const int64_t edge = hiprand(&rng) % deg;
         const int64_t out_idx = out_row_start + idx;
         out_rows[out_idx] = row;
         out_cols[out_idx] = in_index[in_row_start + edge];
...
@@ -237,7 +239,7 @@ COOMatrix _CSRRowWiseSamplingUniform(
     CSRMatrix mat, IdArray rows, const int64_t num_picks, const bool replace) {
   const auto& ctx = rows->ctx;
   auto device = runtime::DeviceAPI::Get(ctx);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   const int64_t num_rows = rows->shape[0];
   const IdType* const slice_rows = static_cast<const IdType*>(rows->data);
...
@@ -279,16 +281,16 @@ COOMatrix _CSRRowWiseSamplingUniform(
   IdType* out_ptr = static_cast<IdType*>(
       device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType)));
   size_t prefix_temp_size = 0;
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
      nullptr, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream));
   void* prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size);
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
      prefix_temp, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream));
   device->FreeWorkspace(ctx, prefix_temp);
   device->FreeWorkspace(ctx, out_deg);
-  cudaEvent_t copyEvent;
-  CUDA_CALL(cudaEventCreate(&copyEvent));
+  hipEvent_t copyEvent;
+  CUDA_CALL(hipEventCreate(&copyEvent));
   NDArray new_len_tensor;
   if (TensorDispatcher::Global()->IsAvailable()) {
...
@@ -301,10 +303,10 @@ COOMatrix _CSRRowWiseSamplingUniform(
   }
   // copy using the internal current stream
-  CUDA_CALL(cudaMemcpyAsync(
+  CUDA_CALL(hipMemcpyAsync(
      new_len_tensor->data, out_ptr + num_rows, sizeof(IdType),
-     cudaMemcpyDeviceToHost, stream));
-  CUDA_CALL(cudaEventRecord(copyEvent, stream));
+     hipMemcpyDeviceToHost, stream));
+  CUDA_CALL(hipEventRecord(copyEvent, stream));
   const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000);
...
@@ -329,8 +331,8 @@ COOMatrix _CSRRowWiseSamplingUniform(
   device->FreeWorkspace(ctx, out_ptr);
   // wait for copying `new_len` to finish
-  CUDA_CALL(cudaEventSynchronize(copyEvent));
-  CUDA_CALL(cudaEventDestroy(copyEvent));
+  CUDA_CALL(hipEventSynchronize(copyEvent));
+  CUDA_CALL(hipEventDestroy(copyEvent));
   const IdType new_len = static_cast<const IdType*>(new_len_tensor->data)[0];
   picked_row = picked_row.CreateView({new_len}, picked_row->dtype);
...
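Reviewer note (not part of the commit): the event-guarded copy pattern above — queue an async device-to-host copy of the scanned length, record an event, synchronize on it only when the value is actually needed — carries over to HIP unchanged. A standalone sketch of that sequence (names invented; pinned host memory would make the copy truly asynchronous, as the original TODO notes):

// Illustrative sketch: asynchronous D2H copy guarded by a hipEvent_t, the same
// copyEvent pattern _CSRRowWiseSamplingUniform uses for `new_len`.
#include <hip/hip_runtime.h>
#include <cstdio>

int main() {
  int64_t h_len = 0;
  int64_t* d_len;
  hipMalloc(&d_len, sizeof(int64_t));
  const int64_t v = 42;
  hipMemcpy(d_len, &v, sizeof(int64_t), hipMemcpyHostToDevice);

  hipStream_t stream;
  hipStreamCreate(&stream);
  hipEvent_t copyEvent;
  hipEventCreate(&copyEvent);

  // Queue the copy on the stream and mark its completion with an event ...
  hipMemcpyAsync(&h_len, d_len, sizeof(int64_t), hipMemcpyDeviceToHost, stream);
  hipEventRecord(copyEvent, stream);

  // ... overlap other host-side work here, then block only on that copy.
  hipEventSynchronize(copyEvent);
  hipEventDestroy(copyEvent);
  printf("len = %lld\n", static_cast<long long>(h_len));

  hipStreamDestroy(stream);
  hipFree(d_len);
  return 0;
}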
src/array/cuda/rowwise_sampling_prob.cu → src/array/cuda/rowwise_sampling_prob.hip

+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
  * Copyright (c) 2022 by Contributors
  * @file array/cuda/rowwise_sampling_prob.cu
...
@@ -6,20 +8,20 @@
  * sampling code rowwise_sampling.cu.
  * @author pengqirong (OPPO), dlasalle and Xin from Nvidia.
  */
-#include <curand_kernel.h>
+#include <hiprand/hiprand_kernel.h>
 #include <dgl/random.h>
 #include <dgl/runtime/device_api.h>
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
 #include <numeric>
-#include "../../array/cuda/atomic.cuh"
+#include "atomic.cuh"
 #include "../../runtime/cuda/cuda_common.h"
-#include "./utils.h"
+#include "utils.h"
 // require CUB 1.17 to use DeviceSegmentedSort
-static_assert(
-    CUB_VERSION >= 101700, "Require CUB >= 1.17 to use DeviceSegmentedSort");
+// static_assert(
+//     CUB_VERSION >= 101700, "Require CUB >= 1.17 to use DeviceSegmentedSort");
 namespace dgl {
 using namespace cuda;
...
@@ -159,8 +161,8 @@ __global__ void _CSRAResValueKernel(
   const int64_t last_row =
       min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
-  curandStatePhilox4_32_10_t rng;
-  curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
+  hiprandStatePhilox4_32_10_t rng;
+  hiprand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
   while (out_row < last_row) {
     const int64_t row = in_rows[out_row];
...
@@ -179,7 +181,7 @@ __global__ void _CSRAResValueKernel(
           prob, data, idx, in_row_start, &item_prob);
       // compute A-Res value
       ares[ares_idx] = static_cast<FloatType>(
-          __powf(curand_uniform(&rng), 1.0f / item_prob));
+          __powf(hiprand_uniform(&rng), 1.0f / item_prob));
       ares_idxs[ares_idx] = static_cast<IdType>(in_idx);
     }
   }
...
@@ -317,8 +319,8 @@ __global__ void _CSRRowWiseSampleReplaceKernel(
   const int64_t last_row =
       min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
-  curandStatePhilox4_32_10_t rng;
-  curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
+  hiprandStatePhilox4_32_10_t rng;
+  hiprand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
   while (out_row < last_row) {
     const int64_t row = in_rows[out_row];
...
@@ -330,7 +332,7 @@ __global__ void _CSRRowWiseSampleReplaceKernel(
     if (deg > 0) {
       // Specialize BlockScan for a 1D block of BLOCK_SIZE threads
-      typedef cub::BlockScan<FloatType, BLOCK_SIZE> BlockScan;
+      typedef hipcub::BlockScan<FloatType, BLOCK_SIZE> BlockScan;
       // Allocate shared memory for BlockScan
       __shared__ typename BlockScan::TempStorage temp_storage;
       // Initialize running total
...
@@ -362,10 +364,10 @@ __global__ void _CSRRowWiseSampleReplaceKernel(
       for (int64_t idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) {
         // get random value
         FloatType sum = cdf[cdf_row_start + deg - 1];
-        FloatType rand = static_cast<FloatType>(curand_uniform(&rng) * sum);
+        FloatType rand = static_cast<FloatType>(hiprand_uniform(&rng) * sum);
         // get the offset of the first value within cdf array which is greater
         // than random value.
-        int64_t item = cub::UpperBound<FloatType*, int64_t, FloatType>(
+        int64_t item = hipcub::UpperBound<FloatType*, int64_t, FloatType>(
            &cdf[cdf_row_start], deg, rand);
         item = min(item, deg - 1);
         // get in and out index
...
@@ -400,18 +402,20 @@ COOMatrix COOGeneralRemoveIf(const COOMatrix& coo, MaskGen maskgen) {
   const auto idtype = coo.row->dtype;
   const auto ctx = coo.row->ctx;
   const int64_t nnz = coo.row->shape[0];
-  const IdType* row = coo.row.Ptr<IdType>();
-  const IdType* col = coo.col.Ptr<IdType>();
+  // const IdType* row = coo.row.Ptr<IdType>();
+  const IdType* row = static_cast<IdType*>(GetDevicePointer(coo.row));
+  // const IdType* col = coo.col.Ptr<IdType>();
+  const IdType* col = static_cast<IdType*>(GetDevicePointer(coo.col));
   const IdArray& eid =
       COOHasData(coo) ? coo.data : Range(0, nnz, sizeof(IdType) * 8, ctx);
-  const IdType* data = coo.data.Ptr<IdType>();
+  const IdType* data = static_cast<IdType*>(GetDevicePointer(coo.data));
   IdArray new_row = IdArray::Empty({nnz}, idtype, ctx);
   IdArray new_col = IdArray::Empty({nnz}, idtype, ctx);
   IdArray new_eid = IdArray::Empty({nnz}, idtype, ctx);
   IdType* new_row_data = new_row.Ptr<IdType>();
   IdType* new_col_data = new_col.Ptr<IdType>();
   IdType* new_eid_data = new_eid.Ptr<IdType>();
-  auto stream = runtime::getCurrentCUDAStream();
+  auto stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   auto device = runtime::DeviceAPI::Get(ctx);
   int8_t* flags = static_cast<int8_t*>(device->AllocWorkspace(ctx, nnz));
...
@@ -439,9 +443,10 @@ COOMatrix COOGeneralRemoveIf(const COOMatrix& coo, MaskGen maskgen) {
 template <DGLDeviceType XPU, typename IdType, typename DType>
 COOMatrix _COORemoveIf(
     const COOMatrix& coo, const NDArray& values, DType criteria) {
-  const DType* val = values.Ptr<DType>();
+  // const DType* val = values.Ptr<DType>();
+  const DType* val = static_cast<DType*>(GetDevicePointer(values));
   auto maskgen = [val, criteria](
-                     int nb, int nt, cudaStream_t stream, int64_t nnz,
+                     int nb, int nt, hipStream_t stream, int64_t nnz,
                      const IdType* data, int8_t* flags) {
     CUDA_KERNEL_CALL(
         (_GenerateFlagsKernel<IdType, DType, int8_t>), nb, nt, 0, stream, nnz,
...
@@ -481,7 +486,7 @@ COOMatrix _CSRRowWiseSampling(
     const FloatArray& prob, bool replace) {
   const auto& ctx = rows->ctx;
   auto device = runtime::DeviceAPI::Get(ctx);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   const int64_t num_rows = rows->shape[0];
   const IdType* const slice_rows = static_cast<const IdType*>(rows->data);
...
@@ -530,10 +535,10 @@ COOMatrix _CSRRowWiseSampling(
   IdType* temp_ptr = static_cast<IdType*>(
       device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType)));
   size_t prefix_temp_size = 0;
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
      nullptr, prefix_temp_size, temp_deg, temp_ptr, num_rows + 1, stream));
   void* prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size);
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
      prefix_temp, prefix_temp_size, temp_deg, temp_ptr, num_rows + 1, stream));
   device->FreeWorkspace(ctx, prefix_temp);
   device->FreeWorkspace(ctx, temp_deg);
...
@@ -551,16 +556,16 @@ COOMatrix _CSRRowWiseSampling(
   IdType* out_ptr = static_cast<IdType*>(
       device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType)));
   prefix_temp_size = 0;
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
      nullptr, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream));
   prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size);
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
      prefix_temp, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream));
   device->FreeWorkspace(ctx, prefix_temp);
   device->FreeWorkspace(ctx, out_deg);
-  cudaEvent_t copyEvent;
-  CUDA_CALL(cudaEventCreate(&copyEvent));
+  hipEvent_t copyEvent;
+  CUDA_CALL(hipEventCreate(&copyEvent));
   // TODO(dlasalle): use pinned memory to overlap with the actual sampling, and
   // wait on a cudaevent
   IdType new_len;
...
@@ -568,7 +573,7 @@ COOMatrix _CSRRowWiseSampling(
   device->CopyDataFromTo(
       out_ptr, num_rows * sizeof(new_len), &new_len, 0, sizeof(new_len), ctx,
       DGLContext{kDGLCPU, 0}, mat.indptr->dtype);
-  CUDA_CALL(cudaEventRecord(copyEvent, stream));
+  CUDA_CALL(hipEventRecord(copyEvent, stream));
   // allocate workspace
   // 1) for w/ replacement, it's a global buffer to store cdf segments (one
...
@@ -612,16 +617,16 @@ COOMatrix _CSRRowWiseSampling(
     IdType* sort_temp_idxs = static_cast<IdType*>(
         device->AllocWorkspace(ctx, temp_len * sizeof(IdType)));
-    cub::DoubleBuffer<FloatType> sort_keys(temp, sort_temp);
-    cub::DoubleBuffer<IdType> sort_values(temp_idxs, sort_temp_idxs);
+    hipcub::DoubleBuffer<FloatType> sort_keys(temp, sort_temp);
+    hipcub::DoubleBuffer<IdType> sort_values(temp_idxs, sort_temp_idxs);
     void* d_temp_storage = nullptr;
     size_t temp_storage_bytes = 0;
-    CUDA_CALL(cub::DeviceSegmentedSort::SortPairsDescending(
+    CUDA_CALL(hipcub::DeviceSegmentedSort::SortPairsDescending(
        d_temp_storage, temp_storage_bytes, sort_keys, sort_values, temp_len,
        num_rows, temp_ptr, temp_ptr + 1, stream));
     d_temp_storage = device->AllocWorkspace(ctx, temp_storage_bytes);
-    CUDA_CALL(cub::DeviceSegmentedSort::SortPairsDescending(
+    CUDA_CALL(hipcub::DeviceSegmentedSort::SortPairsDescending(
        d_temp_storage, temp_storage_bytes, sort_keys, sort_values, temp_len,
        num_rows, temp_ptr, temp_ptr + 1, stream));
     device->FreeWorkspace(ctx, d_temp_storage);
...
@@ -641,8 +646,8 @@ COOMatrix _CSRRowWiseSampling(
   device->FreeWorkspace(ctx, out_ptr);
   // wait for copying `new_len` to finish
-  CUDA_CALL(cudaEventSynchronize(copyEvent));
-  CUDA_CALL(cudaEventDestroy(copyEvent));
+  CUDA_CALL(hipEventSynchronize(copyEvent));
+  CUDA_CALL(hipEventDestroy(copyEvent));
   picked_row = picked_row.CreateView({new_len}, picked_row->dtype);
   picked_col = picked_col.CreateView({new_len}, picked_col->dtype);
...
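Reviewer note (not part of the commit): hipCUB mirrors CUB's DoubleBuffer ping-pong interface used for the key/value sorts above. A compact, self-contained sketch with DeviceRadixSort::SortPairs — data and names are illustrative, and the commit itself uses the segmented descending variant rather than this plain sort:

// Illustrative sketch: hipcub::DoubleBuffer + DeviceRadixSort::SortPairs,
// following the same size-query-then-sort pattern as the diff above.
#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>
#include <cstdio>

int main() {
  const int n = 6;
  float h_keys[n] = {0.3f, 0.9f, 0.1f, 0.7f, 0.5f, 0.2f};
  int h_vals[n] = {0, 1, 2, 3, 4, 5};

  float *d_keys, *d_keys_alt;
  int *d_vals, *d_vals_alt;
  hipMalloc(&d_keys, n * sizeof(float));
  hipMalloc(&d_keys_alt, n * sizeof(float));
  hipMalloc(&d_vals, n * sizeof(int));
  hipMalloc(&d_vals_alt, n * sizeof(int));
  hipMemcpy(d_keys, h_keys, n * sizeof(float), hipMemcpyHostToDevice);
  hipMemcpy(d_vals, h_vals, n * sizeof(int), hipMemcpyHostToDevice);

  // DoubleBuffer lets the sort ping-pong between primary and alternate
  // buffers without an extra copy; Current() points at the sorted result.
  hipcub::DoubleBuffer<float> keys(d_keys, d_keys_alt);
  hipcub::DoubleBuffer<int> vals(d_vals, d_vals_alt);

  void* d_temp = nullptr;
  size_t temp_bytes = 0;
  hipcub::DeviceRadixSort::SortPairs(d_temp, temp_bytes, keys, vals, n);
  hipMalloc(&d_temp, temp_bytes);
  hipcub::DeviceRadixSort::SortPairs(d_temp, temp_bytes, keys, vals, n);
  hipDeviceSynchronize();

  hipMemcpy(h_keys, keys.Current(), n * sizeof(float), hipMemcpyDeviceToHost);
  hipMemcpy(h_vals, vals.Current(), n * sizeof(int), hipMemcpyDeviceToHost);
  for (int i = 0; i < n; ++i) printf("%.1f -> %d\n", h_keys[i], h_vals[i]);

  hipFree(d_temp); hipFree(d_keys); hipFree(d_keys_alt);
  hipFree(d_vals); hipFree(d_vals_alt);
  return 0;
}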
src/array/cuda/sddmm.cuh

+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
  * Copyright (c) 2020 by Contributors
  * @file array/cuda/sddmm.cuh
...
@@ -10,8 +12,8 @@
 #include "../../runtime/cuda/cuda_common.h"
 #include "../selector.h"
-#include "./functor.cuh"
-#include "./utils.h"
+#include "functor.cuh"
+#include "utils.h"
 #include "atomic.cuh"
 #include "bf16.cuh"
 #include "fp16.cuh"
...
@@ -178,7 +180,7 @@ __global__ void SDDMMCooTreeReduceKernel(
     }
 #pragma unroll
       for (int offset = 16; offset > 0; offset /= 2)
-        val += __shfl_down_sync(full_mask, val, offset);
+        val += __shfl_down(val, offset);
       if (tx == 0) outoff[i] = val;
     }
   }
...
@@ -275,7 +277,7 @@ void SDDMMCoo(
   const DType* lhs_data = lhs.Ptr<DType>();
   const DType* rhs_data = rhs.Ptr<DType>();
   DType* out_data = out.Ptr<DType>();
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   int64_t *lhs_off = nullptr, *rhs_off = nullptr;
   int64_t len = bcast.out_len, lhs_len = bcast.lhs_len, rhs_len = bcast.rhs_len;
...
@@ -337,7 +339,7 @@ void SDDMMCsr(
   const DType* lhs_data = lhs.Ptr<DType>();
   const DType* rhs_data = rhs.Ptr<DType>();
   DType* out_data = out.Ptr<DType>();
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   int64_t N = csr.num_rows, M = csr.num_cols, E = csr.indices->shape[0];
   int64_t *lhs_off = nullptr, *rhs_off = nullptr;
...
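Reviewer note (not part of the commit): the SDDMM tree-reduce kernel swaps __shfl_down_sync(full_mask, val, offset) for HIP's __shfl_down(val, offset), which operates across the active wavefront without an explicit mask. A standalone wavefront sum using that intrinsic could look like the sketch below; it assumes every lane participates, as in the kernel above, and the launch size is chosen for illustration only.

// Illustrative sketch: wavefront reduction with __shfl_down, the intrinsic the
// hipified SDDMMCooTreeReduceKernel now uses in place of __shfl_down_sync.
#include <hip/hip_runtime.h>
#include <cstdio>

__global__ void WaveSum(const float* in, float* out) {
  float val = in[threadIdx.x];
  // Halving tree reduction across lanes; lane 0 ends up with the partial sum
  // of its wavefront (warpSize is 64 on most AMD GPUs, 32 on NVIDIA).
  for (int offset = warpSize / 2; offset > 0; offset /= 2)
    val += __shfl_down(val, offset);
  if (threadIdx.x == 0) *out = val;
}

int main() {
  const int n = 64;  // launch one block of 64 threads
  float h_in[n];
  for (int i = 0; i < n; ++i) h_in[i] = 1.f;
  float *d_in, *d_out, h_out = 0.f;
  hipMalloc(&d_in, n * sizeof(float));
  hipMalloc(&d_out, sizeof(float));
  hipMemcpy(d_in, h_in, n * sizeof(float), hipMemcpyHostToDevice);
  hipLaunchKernelGGL(WaveSum, dim3(1), dim3(n), 0, 0, d_in, d_out);
  hipMemcpy(&h_out, d_out, sizeof(float), hipMemcpyDeviceToHost);
  printf("sum over first wavefront = %f\n", h_out);  // warpSize lanes of 1.0
  hipFree(d_in); hipFree(d_out);
  return 0;
}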
src/array/cuda/sddmm.cu → src/array/cuda/sddmm.hip

+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2020 by Contributors
  * @file array/cuda/sddmm.cu
...
@@ -5,8 +6,8 @@
  */
 #include <dgl/array.h>
-#include "./functor.cuh"
-#include "./sddmm.cuh"
+#include "functor.cuh"
+#include "sddmm.cuh"
 namespace dgl {
 namespace aten {
...
@@ -48,10 +49,10 @@ template void SDDMMCsr<kDGLCUDA, int64_t, __half>(
     const std::string& op, const BcastOff& bcast, const CSRMatrix& csr,
     NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target);
 #if BF16_ENABLED
-template void SDDMMCsr<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void SDDMMCsr<kDGLCUDA, int32_t, __hip_bfloat16>(
     const std::string& op, const BcastOff& bcast, const CSRMatrix& csr,
     NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target);
-template void SDDMMCsr<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void SDDMMCsr<kDGLCUDA, int64_t, __hip_bfloat16>(
     const std::string& op, const BcastOff& bcast, const CSRMatrix& csr,
     NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target);
 #endif  // BF16_ENABLED
...
@@ -75,10 +76,10 @@ template void SDDMMCoo<kDGLCUDA, int64_t, __half>(
     const std::string& op, const BcastOff& bcast, const COOMatrix& coo,
     NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target);
 #if BF16_ENABLED
-template void SDDMMCoo<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void SDDMMCoo<kDGLCUDA, int32_t, __hip_bfloat16>(
     const std::string& op, const BcastOff& bcast, const COOMatrix& coo,
     NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target);
-template void SDDMMCoo<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void SDDMMCoo<kDGLCUDA, int64_t, __hip_bfloat16>(
     const std::string& op, const BcastOff& bcast, const COOMatrix& coo,
     NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target);
 #endif  // BF16_ENABLED
...
src/array/cuda/sddmm_hetero_coo.cu → src/array/cuda/sddmm_hetero_coo.hip

+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2020 by Contributors
  * @file array/cuda/sddmm.cu
...
@@ -5,7 +6,7 @@
  */
 #include <dgl/array.h>
-#include "./sddmm.cuh"
+#include "sddmm.cuh"
 namespace dgl {
 namespace aten {
...
@@ -49,13 +50,13 @@ template void SDDMMCooHetero<kDGLCUDA, int64_t, __half>(
     int rhs_target, const std::vector<dgl_type_t>& in_eid,
     const std::vector<dgl_type_t>& out_eid);
 #if BF16_ENABLED
-template void SDDMMCooHetero<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void SDDMMCooHetero<kDGLCUDA, int32_t, __hip_bfloat16>(
     const std::string& op, const BcastOff& bcast,
     const std::vector<COOMatrix>& vec_coo, const std::vector<NDArray>& lhs,
     const std::vector<NDArray>& rhs, std::vector<NDArray> out, int lhs_target,
     int rhs_target, const std::vector<dgl_type_t>& in_eid,
     const std::vector<dgl_type_t>& out_eid);
-template void SDDMMCooHetero<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void SDDMMCooHetero<kDGLCUDA, int64_t, __hip_bfloat16>(
     const std::string& op, const BcastOff& bcast,
     const std::vector<COOMatrix>& vec_coo, const std::vector<NDArray>& lhs,
     const std::vector<NDArray>& rhs, std::vector<NDArray> out, int lhs_target,
...
src/array/cuda/sddmm_hetero_csr.cu → src/array/cuda/sddmm_hetero_csr.hip

+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2020 by Contributors
  * @file array/cuda/sddmm.cu
...
@@ -5,7 +6,7 @@
  */
 #include <dgl/array.h>
-#include "./sddmm.cuh"
+#include "sddmm.cuh"
 namespace dgl {
 namespace aten {
...
@@ -48,13 +49,13 @@ template void SDDMMCsrHetero<kDGLCUDA, int64_t, __half>(
     int rhs_target, const std::vector<dgl_type_t>& in_eid,
     const std::vector<dgl_type_t>& out_eid);
 #if BF16_ENABLED
-template void SDDMMCsrHetero<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void SDDMMCsrHetero<kDGLCUDA, int32_t, __hip_bfloat16>(
     const std::string& op, const BcastOff& bcast,
     const std::vector<CSRMatrix>& vec_csr, const std::vector<NDArray>& lhs,
     const std::vector<NDArray>& rhs, std::vector<NDArray> out, int lhs_target,
     int rhs_target, const std::vector<dgl_type_t>& in_eid,
     const std::vector<dgl_type_t>& out_eid);
-template void SDDMMCsrHetero<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void SDDMMCsrHetero<kDGLCUDA, int64_t, __hip_bfloat16>(
     const std::string& op, const BcastOff& bcast,
     const std::vector<CSRMatrix>& vec_csr, const std::vector<NDArray>& lhs,
     const std::vector<NDArray>& rhs, std::vector<NDArray> out, int lhs_target,
...
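Reviewer note (not part of the commit): the long runs of "template void ...<kDGLCUDA, int32_t, __hip_bfloat16>(...)" lines in these files are explicit template instantiations; only the bf16 type argument changes under hipify. As a generic, DGL-independent illustration of the mechanism, an explicit instantiation for __hip_bfloat16 in a .hip translation unit looks like this sketch (all names invented):

// Illustrative sketch: explicit instantiation of a function template for
// __hip_bfloat16, the pattern behind the "template void SDDMMCsr<...>" lines.
#include <hip/hip_runtime.h>
#include <hip/hip_bf16.h>

template <typename DType>
__global__ void FillKernel(DType* out, DType value, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = value;
}

template <typename DType>
void Fill(DType* out, DType value, int n) {
  const int nt = 256, nb = (n + nt - 1) / nt;
  hipLaunchKernelGGL(FillKernel<DType>, dim3(nb), dim3(nt), 0, 0, out, value, n);
}

// Explicit instantiations: definitions for exactly these types are emitted in
// this translation unit, so other files only need a declaration of Fill.
template void Fill<float>(float*, float, int);
template void Fill<__hip_bfloat16>(__hip_bfloat16*, __hip_bfloat16, int);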
src/array/cuda/segment_reduce.cuh
View file @
74d88bf8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2020 by Contributors
* @file array/cuda/segment_reduce.cuh
...
...
@@ -10,8 +12,8 @@
#include <vector>
#include "../../runtime/cuda/cuda_common.h"
#include "
./
atomic.cuh"
#include "
./
utils.h"
#include "atomic.cuh"
#include "utils.h"
namespace
dgl
{
...
...
@@ -125,7 +127,7 @@ void SegmentReduce(NDArray feat, NDArray offsets, NDArray out, NDArray arg) {
DType
*
out_data
=
out
.
Ptr
<
DType
>
();
IdType
*
arg_data
=
arg
.
Ptr
<
IdType
>
();
cuda
Stream_t
stream
=
runtime
::
getCurrent
CUDA
Stream
();
hip
Stream_t
stream
=
runtime
::
getCurrent
HIP
Stream
MasqueradingAsCUDA
();
int64_t
n
=
out
->
shape
[
0
];
int64_t
dim
=
1
;
for
(
int
i
=
1
;
i
<
out
->
ndim
;
++
i
)
dim
*=
out
->
shape
[
i
];
...
...
@@ -155,7 +157,7 @@ void ScatterAdd(NDArray feat, NDArray idx, NDArray out) {
const
IdType
*
idx_data
=
idx
.
Ptr
<
IdType
>
();
DType
*
out_data
=
out
.
Ptr
<
DType
>
();
cuda
Stream_t
stream
=
runtime
::
getCurrent
CUDA
Stream
();
hip
Stream_t
stream
=
runtime
::
getCurrent
HIP
Stream
MasqueradingAsCUDA
();
int64_t
n
=
feat
->
shape
[
0
];
int64_t
dim
=
1
;
for
(
int
i
=
1
;
i
<
out
->
ndim
;
++
i
)
dim
*=
out
->
shape
[
i
];
...
...
@@ -186,7 +188,7 @@ void UpdateGradMinMax_hetero(
    const std::vector<NDArray>& list_feat, const std::vector<NDArray>& list_idx,
    const std::vector<NDArray>& list_idx_types, std::vector<NDArray>* list_out) {
- cudaStream_t stream = runtime::getCurrentCUDAStream();
+ hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  if (op == "copy_lhs" || op == "copy_rhs") {
    std::vector<std::vector<dgl_id_t>> src_dst_ntypes(
        graph->NumVertexTypes(), std::vector<dgl_id_t>());
...
...
@@ -239,7 +241,7 @@ void BackwardSegmentCmp(NDArray feat, NDArray arg, NDArray out) {
  const IdType* arg_data = arg.Ptr<IdType>();
  DType* out_data = out.Ptr<DType>();
- cudaStream_t stream = runtime::getCurrentCUDAStream();
+ hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  int64_t n = feat->shape[0];
  int64_t dim = 1;
  for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i];
...
...
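Every hipified function body in this file swaps runtime::getCurrentCUDAStream() for runtime::getCurrentHIPStreamMasqueradingAsCUDA(), which still hands back a plain hipStream_t. A minimal standalone sketch of that stream usage pattern, assuming nothing beyond the HIP runtime (buffer size and names are illustrative, not DGL code):

#include <hip/hip_runtime.h>
#include <cstdio>

int main() {
  hipStream_t stream;
  if (hipStreamCreate(&stream) != hipSuccess) return 1;
  float* buf = nullptr;
  hipMalloc(&buf, 1024 * sizeof(float));
  // Work is queued asynchronously on the stream, just as the kernels and
  // memsets in segment_reduce.cuh are queued on the stream DGL hands back.
  hipMemsetAsync(buf, 0, 1024 * sizeof(float), stream);
  hipStreamSynchronize(stream);
  hipFree(buf);
  hipStreamDestroy(stream);
  std::printf("done\n");
  return 0;
}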
src/array/cuda/segment_reduce.cu → src/array/cuda/segment_reduce.hip
View file @
74d88bf8
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file array/cuda/segment_reduce.cu
...
...
@@ -6,9 +7,9 @@
#include <dgl/array.h>
#include <dgl/base_heterograph.h>
-#include "./functor.cuh"
-#include "./segment_reduce.cuh"
-#include "./utils.h"
+#include "functor.cuh"
+#include "segment_reduce.cuh"
+#include "utils.h"
namespace dgl {
...
...
@@ -60,10 +61,10 @@ template void SegmentReduce<kDGLCUDA, int64_t, __half>(
const std::string& op, NDArray feat, NDArray offsets, NDArray out,
NDArray arg);
#if BF16_ENABLED
-template void SegmentReduce<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void SegmentReduce<kDGLCUDA, int32_t, __hip_bfloat16>(
const std::string& op, NDArray feat, NDArray offsets, NDArray out,
NDArray arg);
-template void SegmentReduce<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void SegmentReduce<kDGLCUDA, int64_t, __hip_bfloat16>(
const std::string& op, NDArray feat, NDArray offsets, NDArray out,
NDArray arg);
#endif // BF16_ENABLED
...
...
@@ -85,9 +86,9 @@ template void ScatterAdd<kDGLCUDA, int32_t, __half>(
template void ScatterAdd<kDGLCUDA, int64_t, __half>(
NDArray feat, NDArray idx, NDArray out);
#if BF16_ENABLED
-template void ScatterAdd<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void ScatterAdd<kDGLCUDA, int32_t, __hip_bfloat16>(
NDArray feat, NDArray idx, NDArray out);
-template void ScatterAdd<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void ScatterAdd<kDGLCUDA, int64_t, __hip_bfloat16>(
NDArray feat, NDArray idx, NDArray out);
#endif // BF16_ENABLED
template void ScatterAdd<kDGLCUDA, int32_t, float>(
...
...
@@ -108,11 +109,11 @@ template void UpdateGradMinMax_hetero<kDGLCUDA, int64_t, __half>(
const std::vector<NDArray>& feat, const std::vector<NDArray>& idx,
const std::vector<NDArray>& idx_etype, std::vector<NDArray>* out);
#if BF16_ENABLED
-template void UpdateGradMinMax_hetero<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void UpdateGradMinMax_hetero<kDGLCUDA, int32_t, __hip_bfloat16>(
const HeteroGraphPtr& g, const std::string& op,
const std::vector<NDArray>& feat, const std::vector<NDArray>& idx,
const std::vector<NDArray>& idx_etype, std::vector<NDArray>* out);
-template void UpdateGradMinMax_hetero<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void UpdateGradMinMax_hetero<kDGLCUDA, int64_t, __hip_bfloat16>(
const HeteroGraphPtr& g, const std::string& op,
const std::vector<NDArray>& feat, const std::vector<NDArray>& idx,
const std::vector<NDArray>& idx_etype, std::vector<NDArray>* out);
...
...
@@ -139,9 +140,9 @@ template void BackwardSegmentCmp<kDGLCUDA, int32_t, __half>(
template void BackwardSegmentCmp<kDGLCUDA, int64_t, __half>(
NDArray feat, NDArray arg, NDArray out);
#if BF16_ENABLED
-template void BackwardSegmentCmp<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void BackwardSegmentCmp<kDGLCUDA, int32_t, __hip_bfloat16>(
NDArray feat, NDArray arg, NDArray out);
-template void BackwardSegmentCmp<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void BackwardSegmentCmp<kDGLCUDA, int64_t, __hip_bfloat16>(
NDArray feat, NDArray arg, NDArray out);
#endif // BF16_ENABLED
template void BackwardSegmentCmp<kDGLCUDA, int32_t, float>(
...
...
src/array/cuda/spmat_op_impl_coo.cu → src/array/cuda/spmat_op_impl_coo.hip
View file @
74d88bf8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2021 by contributors.
* @file array/cuda/spmat_op_impl_coo.cu
...
...
@@ -10,8 +12,8 @@
#include <vector>
#include "../../runtime/cuda/cuda_common.h"
-#include "./atomic.cuh"
-#include "./utils.h"
+#include "atomic.cuh"
+#include "utils.h"
namespace dgl {
...
...
@@ -72,7 +74,7 @@ __global__ void _COOGetRowNNZKernel(
template <DGLDeviceType XPU, typename IdType>
int64_t COOGetRowNNZ(COOMatrix coo, int64_t row) {
- cudaStream_t stream = runtime::getCurrentCUDAStream();
+ hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const auto& ctx = coo.row->ctx;
IdType nnz = coo.row->shape[0];
IdType nt = 1024;
...
...
@@ -103,7 +105,7 @@ __global__ void _COOGetAllRowNNZKernel(
template <DGLDeviceType XPU, typename IdType>
NDArray COOGetRowNNZ(COOMatrix coo, NDArray rows) {
- cudaStream_t stream = runtime::getCurrentCUDAStream();
+ hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const auto& ctx = coo.row->ctx;
IdType nnz = coo.row->shape[0];
IdType num_rows = coo.num_rows;
...
...
src/array/cuda/spmat_op_impl_csr.cu → src/array/cuda/spmat_op_impl_csr.hip
View file @
74d88bf8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2020 by Contributors
* @file array/cuda/spmat_op_impl_csr.cu
...
...
@@ -7,14 +9,14 @@
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
#include <numeric>
#include <unordered_set>
#include <vector>
#include "../../runtime/cuda/cuda_common.h"
-#include "./atomic.cuh"
-#include "./utils.h"
+#include "atomic.cuh"
+#include "utils.h"
namespace dgl {
...
...
@@ -28,7 +30,7 @@ namespace impl {
template <DGLDeviceType XPU, typename IdType>
bool CSRIsNonZero(CSRMatrix csr, int64_t row, int64_t col) {
- cudaStream_t stream = runtime::getCurrentCUDAStream();
+ hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const auto& ctx = csr.indptr->ctx;
IdArray rows = aten::VecToIdArray<int64_t>({row}, sizeof(IdType) * 8, ctx);
IdArray cols = aten::VecToIdArray<int64_t>({col}, sizeof(IdType) * 8, ctx);
...
...
@@ -53,12 +55,12 @@ template <DGLDeviceType XPU, typename IdType>
NDArray CSRIsNonZero(CSRMatrix csr, NDArray row, NDArray col) {
const auto rowlen = row->shape[0];
const auto collen = col->shape[0];
- const auto rstlen = std::max(rowlen, collen);
+ const auto rstlen = ::max(rowlen, collen);
NDArray rst = NDArray::Empty({rstlen}, row->dtype, row->ctx);
if (rstlen == 0) return rst;
const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1;
const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1;
- cudaStream_t stream = runtime::getCurrentCUDAStream();
+ hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const int nt = dgl::cuda::FindNumThreads(rstlen);
const int nb = (rstlen + nt - 1) / nt;
const IdType* data = nullptr;
...
...
@@ -104,7 +106,7 @@ template <DGLDeviceType XPU, typename IdType>
bool CSRHasDuplicate(CSRMatrix csr) {
if (!csr.sorted) csr = CSRSort(csr);
const auto& ctx = csr.indptr->ctx;
- cudaStream_t stream = runtime::getCurrentCUDAStream();
+ hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
auto device = runtime::DeviceAPI::Get(ctx);
// We allocate a workspace of num_rows bytes. It wastes a little bit memory
// but should be fine.
...
...
@@ -149,7 +151,7 @@ __global__ void _CSRGetRowNNZKernel(
template <DGLDeviceType XPU, typename IdType>
NDArray CSRGetRowNNZ(CSRMatrix csr, NDArray rows) {
- cudaStream_t stream = runtime::getCurrentCUDAStream();
+ hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const auto len = rows->shape[0];
const IdType* vid_data = rows.Ptr<IdType>();
const IdType* indptr_data =
...
...
@@ -250,7 +252,7 @@ __global__ void _SegmentCopyKernel(
template <DGLDeviceType XPU, typename IdType>
CSRMatrix CSRSliceRows(CSRMatrix csr, NDArray rows) {
- cudaStream_t stream = runtime::getCurrentCUDAStream();
+ hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const int64_t len = rows->shape[0];
IdArray ret_indptr = aten::CumSum(aten::CSRGetRowNNZ(csr, rows), true);
const int64_t nnz = aten::IndexSelect<IdType>(ret_indptr, len);
...
...
@@ -359,7 +361,7 @@ std::vector<NDArray> CSRGetDataAndIndices(
CSRMatrix csr, NDArray row, NDArray col) {
const auto rowlen = row->shape[0];
const auto collen = col->shape[0];
- const auto len = std::max(rowlen, collen);
+ const auto len = ::max(rowlen, collen);
if (len == 0) return {NullArray(), NullArray(), NullArray()};
const auto& ctx = row->ctx;
...
...
@@ -367,7 +369,7 @@ std::vector<NDArray> CSRGetDataAndIndices(
const int64_t nnz = csr.indices->shape[0];
const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1;
const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1;
- cudaStream_t stream = runtime::getCurrentCUDAStream();
+ hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const IdType* indptr_data =
static_cast<IdType*>(GetDevicePointer(csr.indptr));
...
...
@@ -532,7 +534,7 @@ __global__ void _SegmentMaskColKernel(
static_cast<IdType>(num_rows));
NodeQueryHashmap<IdType> hashmap(hashmap_buffer, buffer_size);
- typedef cub::WarpReduce<IdType> WarpReduce;
+ typedef hipcub::WarpReduce<IdType> WarpReduce;
__shared__ typename WarpReduce::TempStorage temp_storage[BLOCK_WARPS];
while (out_row < last_row) {
...
...
@@ -546,7 +548,7 @@ __global__ void _SegmentMaskColKernel(
mask[idx] = 1;
}
}
    IdType reduce_count = WarpReduce(temp_storage[warp_id]).Sum(local_count);
if (laneid == 0) {
count[out_row] = reduce_count;
}
...
...
@@ -557,7 +559,7 @@ __global__ void _SegmentMaskColKernel(
template <DGLDeviceType XPU, typename IdType>
CSRMatrix CSRSliceMatrix(
CSRMatrix csr, runtime::NDArray rows, runtime::NDArray cols) {
- cudaStream_t stream = runtime::getCurrentCUDAStream();
+ hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const auto& ctx = rows->ctx;
const auto& dtype = rows->dtype;
const auto nbits = dtype.bits;
...
...
@@ -582,7 +584,7 @@ CSRMatrix CSRSliceMatrix(
// A count for how many masked values per row.
IdArray count = NewIdArray(csr.num_rows, ctx, nbits);
CUDA_CALL(
-     cudaMemset(count.Ptr<IdType>(), 0, sizeof(IdType) * (csr.num_rows)));
+     hipMemset(count.Ptr<IdType>(), 0, sizeof(IdType) * (csr.num_rows)));
// Generate a NodeQueryHashmap buffer. The key of the hashmap is col.
// For performance, the load factor of the hashmap is in (0.25, 0.5);
...
...
@@ -593,7 +595,7 @@ CSRMatrix CSRSliceMatrix(
using it = thrust::counting_iterator<int64_t>;
runtime::CUDAWorkspaceAllocator allocator(ctx);
- const auto exec_policy = thrust::cuda::par_nosync(allocator).on(stream);
+ const auto exec_policy = thrust::hip::par_nosync(allocator).on(stream);
thrust::for_each(
exec_policy, it(0), it(new_ncols),
[key = cols.Ptr<IdType>(), buffer = hashmap_buffer.Ptr<IdType>(),
...
...
@@ -609,14 +611,15 @@ CSRMatrix CSRSliceMatrix(
// Execute SegmentMaskColKernel
const int64_t num_rows = csr.num_rows;
- constexpr int WARP_SIZE = 32;
+ constexpr int WARP_SIZE = 64;
  // With a simple fine-tuning, TILE_SIZE=16 gives a good performance.
- constexpr int TILE_SIZE = 16;
+ constexpr int TILE_SIZE = 32;
constexpr int BLOCK_WARPS = CUDA_MAX_NUM_THREADS / WARP_SIZE;
IdType nb =
dgl::cuda::FindNumBlocks<'x'>((num_rows + TILE_SIZE - 1) / TILE_SIZE);
const dim3 nthrs(WARP_SIZE, BLOCK_WARPS);
const dim3 nblks(nb);
CUDA_KERNEL_CALL(
(_SegmentMaskColKernel<IdType, WARP_SIZE, BLOCK_WARPS, TILE_SIZE>), nblks,
nthrs, 0, stream, indptr_data, indices_data, num_rows,
...
...
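The WARP_SIZE bump from 32 to 64 in _SegmentMaskColKernel reflects the 64-lane wavefront of AMD GPUs; warp-collective code has to be parameterized on the platform or the shared-memory layout and lane math break. A hedged sketch of a warp-wise sum written the same way (kernel and constant names are illustrative, not DGL's; callers are assumed to launch blockDim.x = WARP_SIZE * BLOCK_WARPS):

#include <hipcub/hipcub.hpp>

#ifdef __HIP_PLATFORM_AMD__
constexpr int WARP_SIZE = 64;  // one wavefront
#else
constexpr int WARP_SIZE = 32;  // one CUDA warp
#endif

template <int BLOCK_WARPS>
__global__ void WarpSumKernel(const int* in, int* per_warp_out, int n) {
  using WarpReduce = hipcub::WarpReduce<int>;
  __shared__ typename WarpReduce::TempStorage temp[BLOCK_WARPS];
  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int warp_id = threadIdx.x / WARP_SIZE;
  int val = (tid < n) ? in[tid] : 0;
  // One reduction per warp/wavefront, mirroring WarpReduce(...).Sum(...) above.
  int sum = WarpReduce(temp[warp_id]).Sum(val);
  if (threadIdx.x % WARP_SIZE == 0)
    per_warp_out[blockIdx.x * BLOCK_WARPS + warp_id] = sum;
}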
src/array/cuda/spmm.cuh
View file @
74d88bf8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2020 by Contributors
* @file array/cuda/spmm.cuh
...
...
@@ -11,7 +13,7 @@
#include <limits>
#include "../../runtime/cuda/cuda_common.h"
-#include "./utils.h"
+#include "utils.h"
#include "atomic.cuh"
#include "bf16.cuh"
#include "fp16.cuh"
...
...
@@ -28,14 +30,14 @@ namespace aten {
*/
template <typename DType, typename IdType>
inline bool cusparse_available(bool more_nnz_than_matrix_size) {
-#if CUDART_VERSION < 11000
+#if DTKRT_VERSION < 11000
  if (std::is_same<IdType, int>::value &&
      (std::is_same<DType, float>::value || std::is_same<DType, double>::value))
    return true;
  return false;
#else
  if (std::is_same<DType, __half>::value ||
-     std::is_same<DType, __nv_bfloat16>::value)
+     std::is_same<DType, __hip_bfloat16>::value)
    return false;
// cusparse's SpMM on fp16 is slow, temporally disabled.
// If the CSR matrix has more NNZ than matrix size, we should not use
// cuSPARSE 11.1.
...
...
@@ -47,54 +49,54 @@ namespace {
/** @brief Call cuBLAS geam API for transpose operation for float and double. */
template <typename DType>
-cublasStatus_t Xgeam(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
+hipblasStatus_t Xgeam(
+    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
    int m, int n, const DType* alpha, const DType* A, int lda,
    const DType* beta, const DType* B, int ldb, DType* C, int ldc) {
  LOG(FATAL) << "Not supported dtype";
- return CUBLAS_STATUS_EXECUTION_FAILED;
+ return HIPBLAS_STATUS_EXECUTION_FAILED;
}
template <>
-cublasStatus_t Xgeam<__half>(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
+hipblasStatus_t Xgeam<__half>(
+    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
    int m, int n, const __half* alpha, const __half* A, int lda,
    const __half* beta, const __half* B, int ldb, __half* C, int ldc) {
  // TODO(ndickson): There is no cublasHgeam, so a different
  // implementation would be required.
  LOG(FATAL) << "Xgeam does not support dtype half (FP16)";
- return CUBLAS_STATUS_EXECUTION_FAILED;
+ return HIPBLAS_STATUS_EXECUTION_FAILED;
}
#if BF16_ENABLED
template <>
-cublasStatus_t Xgeam<__nv_bfloat16>(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, const __nv_bfloat16* alpha, const __nv_bfloat16* A, int lda,
-    const __nv_bfloat16* beta, const __nv_bfloat16* B, int ldb,
-    __nv_bfloat16* C, int ldc) {
+hipblasStatus_t Xgeam<__hip_bfloat16>(
+    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
+    int m, int n, const __hip_bfloat16* alpha, const __hip_bfloat16* A, int lda,
+    const __hip_bfloat16* beta, const __hip_bfloat16* B, int ldb,
+    __hip_bfloat16* C, int ldc) {
  // TODO(ndickson): There is no cublasHgeam, so a different
  // implementation would be required.
  LOG(FATAL) << "Xgeam does not support dtype bfloat16 (BF16)";
- return CUBLAS_STATUS_EXECUTION_FAILED;
+ return HIPBLAS_STATUS_EXECUTION_FAILED;
}
#endif // BF16_ENABLED
template <>
-cublasStatus_t Xgeam<float>(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
+hipblasStatus_t Xgeam<float>(
+    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
    int m, int n, const float* alpha, const float* A, int lda,
    const float* beta, const float* B, int ldb, float* C, int ldc) {
- return cublasSgeam(
+ return hipblasSgeam(
      handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
}
template <>
-cublasStatus_t Xgeam<double>(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
+hipblasStatus_t Xgeam<double>(
+    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
    int m, int n, const double* alpha, const double* A, int lda,
    const double* beta, const double* B, int ldb, double* C, int ldc) {
- return cublasDgeam(
+ return hipblasDgeam(
      handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
}
...
...
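The Xgeam wrappers above bottom out in a single hipblasSgeam/hipblasDgeam call, and the transpose trick is out = alpha * A^T + beta * B with beta = 0. A hedged float-only sketch of that call; the function name TransposeRowMajor and the header path are assumptions, not DGL code:

#include <hipblas/hipblas.h>

// Writes the transpose of a row-major [row x col] matrix into d_out
// ([col x row], row-major), mirroring the Xgeam call made by _Transpose.
void TransposeRowMajor(hipblasHandle_t handle, const float* d_in, float* d_out,
                       int row, int col) {
  const float alpha = 1.f, beta = 0.f;
  hipblasSgeam(handle, HIPBLAS_OP_T, HIPBLAS_OP_N, row, col, &alpha, d_in, col,
               &beta, nullptr, row, d_out, row);
}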
@@ -119,12 +121,12 @@ template <typename DType>
void _Transpose(const DType* in, DType* out, int row, int col) {
  DType alpha = 1., beta = 0.;
  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
- cudaStream_t stream = runtime::getCurrentCUDAStream();
+ hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  if (!thr_entry->cublas_handle)
-   CUBLAS_CALL(cublasCreate(&(thr_entry->cublas_handle)));
- CUBLAS_CALL(cublasSetStream(thr_entry->cublas_handle, stream));
+   CUBLAS_CALL(hipblasCreate(&(thr_entry->cublas_handle)));
+ CUBLAS_CALL(hipblasSetStream(thr_entry->cublas_handle, stream));
  CUBLAS_CALL(Xgeam<DType>(
-     thr_entry->cublas_handle, CUBLAS_OP_T, CUBLAS_OP_N, row, col, &alpha, in,
+     thr_entry->cublas_handle, HIPBLAS_OP_T, HIPBLAS_OP_N, row, col, &alpha, in,
      col, &beta, nullptr, row, out, row));
}
...
...
@@ -134,7 +136,7 @@ void _Transpose(const DType* in, DType* out, int row, int col) {
*/
template <>
void _Transpose<__half>(const __half* in, __half* out, int row, int col) {
- cudaStream_t stream = runtime::getCurrentCUDAStream();
+ hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  int nt = FindNumThreads(row);
  int nb = col;
  CUDA_KERNEL_CALL(_TransposeKernel, nb, nt, 0, stream, in, out, col, row);
...
...
@@ -146,47 +148,47 @@ void _Transpose<__half>(const __half* in, __half* out, int row, int col) {
* @note cuBLAS has no geam API for bf16 data type, fallback to our kernel.
*/
template <>
-void _Transpose<__nv_bfloat16>(
-    const __nv_bfloat16* in, __nv_bfloat16* out, int row, int col) {
- cudaStream_t stream = runtime::getCurrentCUDAStream();
+void _Transpose<__hip_bfloat16>(
+    const __hip_bfloat16* in, __hip_bfloat16* out, int row, int col) {
+ hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  int nt = FindNumThreads(row);
  int nb = col;
  CUDA_KERNEL_CALL(_TransposeKernel, nb, nt, 0, stream, in, out, col, row);
}
#endif // BF16_ENABLED
-#if CUDART_VERSION < 11000
+#if DTKRT_VERSION < 11000
template <typename DType>
-cusparseStatus_t Xcsrmm2(
-    cusparseHandle_t handle, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int n, int k, int nnz,
-    const DType* alpha, const cusparseMatDescr_t descrA, const DType* csrValA,
+hipsparseStatus_t Xcsrmm2(
+    hipsparseHandle_t handle, hipsparseOperation_t transA,
+    hipsparseOperation_t transB, int m, int n, int k, int nnz,
+    const DType* alpha, const hipsparseMatDescr_t descrA, const DType* csrValA,
    const int* csrRowPtrA, const int* csrColIndA, const DType* B, int ldb,
    const DType* beta, DType* C, int ldc) {
  LOG(INFO) << "Not supported dtype";
- return CUSPARSE_STATUS_EXECUTION_FAILED;
+ return HIPSPARSE_STATUS_EXECUTION_FAILED;
}
template <>
-cusparseStatus_t Xcsrmm2<float>(
-    cusparseHandle_t handle, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int n, int k, int nnz,
-    const float* alpha, const cusparseMatDescr_t descrA, const float* csrValA,
+hipsparseStatus_t Xcsrmm2<float>(
+    hipsparseHandle_t handle, hipsparseOperation_t transA,
+    hipsparseOperation_t transB, int m, int n, int k, int nnz,
+    const float* alpha, const hipsparseMatDescr_t descrA, const float* csrValA,
    const int* csrRowPtrA, const int* csrColIndA, const float* B, int ldb,
    const float* beta, float* C, int ldc) {
- return cusparseScsrmm2(
+ return hipsparseScsrmm2(
      handle, transA, transB, m, n, k, nnz, alpha, descrA, csrValA, csrRowPtrA,
      csrColIndA, B, ldb, beta, C, ldc);
}
template <>
-cusparseStatus_t Xcsrmm2<double>(
-    cusparseHandle_t handle, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int n, int k, int nnz,
-    const double* alpha, const cusparseMatDescr_t descrA, const double* csrValA,
+hipsparseStatus_t Xcsrmm2<double>(
+    hipsparseHandle_t handle, hipsparseOperation_t transA,
+    hipsparseOperation_t transB, int m, int n, int k, int nnz,
+    const double* alpha, const hipsparseMatDescr_t descrA, const double* csrValA,
    const int* csrRowPtrA, const int* csrColIndA, const double* B, int ldb,
    const double* beta, double* C, int ldc) {
- return cusparseDcsrmm2(
+ return hipsparseDcsrmm2(
      handle, transA, transB, m, n, k, nnz, alpha, descrA, csrValA, csrRowPtrA,
      csrColIndA, B, ldb, beta, C, ldc);
}
...
...
@@ -214,12 +216,12 @@ void CusparseCsrmm2(
// device
  auto device = runtime::DeviceAPI::Get(ctx);
  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
- cudaStream_t stream = runtime::getCurrentCUDAStream();
+ hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  // allocate cusparse handle if needed
  if (!thr_entry->cusparse_handle) {
-   CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
+   CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
  }
- CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream));
+ CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream));
  // all one data array
  DType* valptr = nullptr;
  if (!A_data) {
...
...
@@ -227,55 +229,59 @@ void CusparseCsrmm2(
        static_cast<DType*>(device->AllocWorkspace(ctx, nnz * sizeof(DType)));
    _Fill(valptr, nnz, static_cast<DType>(1.));
  }
-#if CUDART_VERSION >= 11000
- cusparseSpMatDescr_t matA;
- cusparseDnMatDescr_t matB, matC;
+#if DTKRT_VERSION >= 11000
+ hipsparseSpMatDescr_t matA;
+ hipsparseDnMatDescr_t matB, matC;
  constexpr auto dtype = cuda_dtype<DType>::value;
  constexpr auto idtype = cusparse_idtype<IdType>::value;
- CUSPARSE_CALL(cusparseCreateCsr(
+ CUSPARSE_CALL(hipsparseCreateCsr(
      &matA, m, k, nnz, static_cast<IdType*>(csr.indptr->data),
      static_cast<IdType*>(csr.indices->data),
      const_cast<DType*>(valptr ? valptr : A_data), idtype, idtype,
-     CUSPARSE_INDEX_BASE_ZERO, dtype));
- CUSPARSE_CALL(cusparseCreateDnMat(
-     &matB, k, n, n, const_cast<DType*>(B_data), dtype, CUSPARSE_ORDER_ROW));
- CUSPARSE_CALL(cusparseCreateDnMat(
-     &matC, m, n, n, C_data, dtype, CUSPARSE_ORDER_ROW));
- auto transA = CUSPARSE_OPERATION_NON_TRANSPOSE;
- auto transB = CUSPARSE_OPERATION_NON_TRANSPOSE;
+     HIPSPARSE_INDEX_BASE_ZERO, dtype));
+ CUSPARSE_CALL(hipsparseCreateDnMat(
+     &matB, k, n, n, const_cast<DType*>(B_data), dtype, HIPSPARSE_ORDER_ROW));
+ CUSPARSE_CALL(hipsparseCreateDnMat(
+     &matC, m, n, n, C_data, dtype, HIPSPARSE_ORDER_ROW));
+ auto transA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
+ auto transB = HIPSPARSE_OPERATION_NON_TRANSPOSE;
  size_t workspace_size;
- cusparseSpMMAlg_t spmm_alg = use_deterministic_alg_only
-     ? CUSPARSE_SPMM_CSR_ALG3
-     : CUSPARSE_SPMM_CSR_ALG2;
- CUSPARSE_CALL(cusparseSpMM_bufferSize(
+ CUSPARSE_CALL(hipsparseSpMM_bufferSize(
      thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
-     matC, dtype, spmm_alg, &workspace_size));
+     matC, dtype, HIPSPARSE_SPMM_CSR_ALG2, &workspace_size));
  void* workspace = device->AllocWorkspace(ctx, workspace_size);
- CUSPARSE_CALL(cusparseSpMM(
+ CUSPARSE_CALL(hipsparseSpMM(
      thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
-     matC, dtype, spmm_alg, workspace));
+     matC, dtype, HIPSPARSE_SPMM_CSR_ALG2, workspace));
  device->FreeWorkspace(ctx, workspace);
- CUSPARSE_CALL(cusparseDestroySpMat(matA));
- CUSPARSE_CALL(cusparseDestroyDnMat(matB));
- CUSPARSE_CALL(cusparseDestroyDnMat(matC));
+ CUSPARSE_CALL(hipsparseDestroySpMat(matA));
+ CUSPARSE_CALL(hipsparseDestroyDnMat(matB));
+ CUSPARSE_CALL(hipsparseDestroyDnMat(matC));
#else
  // allocate matrix for temporary transposed output
  DType* trans_out =
      static_cast<DType*>(device->AllocWorkspace(ctx, m * n * sizeof(DType)));
- cusparseMatDescr_t descr;
- CUSPARSE_CALL(cusparseCreateMatDescr(&descr));
- CUSPARSE_CALL(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL));
- CUSPARSE_CALL(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO));
+ hipsparseMatDescr_t descr;
+ CUSPARSE_CALL(hipsparseCreateMatDescr(&descr));
+ CUSPARSE_CALL(hipsparseSetMatType(descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
+ CUSPARSE_CALL(hipsparseSetMatIndexBase(descr, HIPSPARSE_INDEX_BASE_ZERO));
  CUSPARSE_CALL(Xcsrmm2<DType>(
-     thr_entry->cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
-     CUSPARSE_OPERATION_TRANSPOSE, m, n, k, nnz, &alpha, descr,
+     thr_entry->cusparse_handle, HIPSPARSE_OPERATION_NON_TRANSPOSE,
+     HIPSPARSE_OPERATION_TRANSPOSE, m, n, k, nnz, &alpha, descr,
      (valptr) ? valptr : A_data, static_cast<int32_t*>(csr.indptr->data),
      static_cast<int32_t*>(csr.indices->data), B_data, n, &beta, trans_out, m));
- CUSPARSE_CALL(cusparseDestroyMatDescr(descr));
+ CUSPARSE_CALL(hipsparseDestroyMatDescr(descr));
  // transpose the output matrix
  _Transpose(trans_out, C_data, n, m);
  device->FreeWorkspace(ctx, trans_out);
...
...
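CusparseCsrmm2 follows the generic SpMM sequence: wrap the CSR matrix and the dense operands in descriptors, size a workspace, run SpMM, then destroy the descriptors. A hedged float/int32-only sketch of that sequence against hipSPARSE; the function name SpmmCsrRowMajor is illustrative and error checking is omitted:

#include <hip/hip_runtime.h>
#include <hipsparse/hipsparse.h>

// C[m x n] = A[m x k] (CSR) * B[k x n]; dense matrices are row-major.
void SpmmCsrRowMajor(hipsparseHandle_t handle, int64_t m, int64_t k, int64_t n,
                     int64_t nnz, int* indptr, int* indices, float* vals,
                     float* B, float* C) {
  const float alpha = 1.f, beta = 0.f;
  hipsparseSpMatDescr_t matA;
  hipsparseDnMatDescr_t matB, matC;
  hipsparseCreateCsr(&matA, m, k, nnz, indptr, indices, vals,
                     HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I,
                     HIPSPARSE_INDEX_BASE_ZERO, HIP_R_32F);
  hipsparseCreateDnMat(&matB, k, n, n, B, HIP_R_32F, HIPSPARSE_ORDER_ROW);
  hipsparseCreateDnMat(&matC, m, n, n, C, HIP_R_32F, HIPSPARSE_ORDER_ROW);
  size_t ws_size = 0;
  hipsparseSpMM_bufferSize(handle, HIPSPARSE_OPERATION_NON_TRANSPOSE,
                           HIPSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA,
                           matB, &beta, matC, HIP_R_32F,
                           HIPSPARSE_SPMM_CSR_ALG2, &ws_size);
  void* ws = nullptr;
  hipMalloc(&ws, ws_size);
  hipsparseSpMM(handle, HIPSPARSE_OPERATION_NON_TRANSPOSE,
                HIPSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, matB, &beta,
                matC, HIP_R_32F, HIPSPARSE_SPMM_CSR_ALG2, ws);
  hipFree(ws);
  hipsparseDestroySpMat(matA);
  hipsparseDestroyDnMat(matB);
  hipsparseDestroyDnMat(matC);
}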
@@ -287,8 +293,10 @@ void CusparseCsrmm2(
template <typename DType, typename IdType>
void CusparseCsrmm2Hetero(
    const DGLContext& ctx, const CSRMatrix& csr, const DType* B_data,
    const DType* A_data, DType* C_data, int64_t x_length,
    cudaStream_t strm_id, bool use_deterministic_alg_only = false) {
// We use csrmm2 to perform following operation:
// C = A x B, where A is a sparse matrix in csr format, B is the dense matrix
// for node feature tensor. However, since cusparse only supports
...
...
@@ -311,9 +319,9 @@ void CusparseCsrmm2Hetero(
  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
  // allocate cusparse handle if needed
  if (!thr_entry->cusparse_handle) {
-   CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
+   CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
  }
- CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, strm_id));
+ CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, strm_id));
  // all one data array
  DType* valptr = nullptr;
  if (!A_data) {
...
...
@@ -321,51 +329,53 @@ void CusparseCsrmm2Hetero(
        static_cast<DType*>(device->AllocWorkspace(ctx, nnz * sizeof(DType)));
    _Fill(valptr, nnz, static_cast<DType>(1.));
  }
-#if CUDART_VERSION >= 11000
- cusparseSpMatDescr_t matA;
- cusparseDnMatDescr_t matB, matC;
+#if DTKRT_VERSION >= 11000
+ hipsparseSpMatDescr_t matA;
+ hipsparseDnMatDescr_t matB, matC;
  constexpr auto dtype = cuda_dtype<DType>::value;
  constexpr auto idtype = cusparse_idtype<IdType>::value;
- CUSPARSE_CALL(cusparseCreateCsr(
+ CUSPARSE_CALL(hipsparseCreateCsr(
      &matA, m, k, nnz, static_cast<IdType*>(csr.indptr->data),
      static_cast<IdType*>(csr.indices->data),
      const_cast<DType*>(valptr ? valptr : A_data), idtype, idtype,
-     CUSPARSE_INDEX_BASE_ZERO, dtype));
- CUSPARSE_CALL(cusparseCreateDnMat(
-     &matB, k, n, n, const_cast<DType*>(B_data), dtype, CUSPARSE_ORDER_ROW));
- CUSPARSE_CALL(cusparseCreateDnMat(
-     &matC, m, n, n, C_data, dtype, CUSPARSE_ORDER_ROW));
- auto transA = CUSPARSE_OPERATION_NON_TRANSPOSE;
- auto transB = CUSPARSE_OPERATION_NON_TRANSPOSE;
+     HIPSPARSE_INDEX_BASE_ZERO, dtype));
+ CUSPARSE_CALL(hipsparseCreateDnMat(
+     &matB, k, n, n, const_cast<DType*>(B_data), dtype, HIPSPARSE_ORDER_ROW));
+ CUSPARSE_CALL(hipsparseCreateDnMat(
+     &matC, m, n, n, C_data, dtype, HIPSPARSE_ORDER_ROW));
+ auto transA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
+ auto transB = HIPSPARSE_OPERATION_NON_TRANSPOSE;
  size_t workspace_size;
- cusparseSpMMAlg_t spmm_alg = use_deterministic_alg_only
-     ? CUSPARSE_SPMM_CSR_ALG3
-     : CUSPARSE_SPMM_CSR_ALG2;
- CUSPARSE_CALL(cusparseSpMM_bufferSize(
+ CUSPARSE_CALL(hipsparseSpMM_bufferSize(
      thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
-     matC, dtype, spmm_alg, &workspace_size));
+     matC, dtype, HIPSPARSE_SPMM_CSR_ALG2, &workspace_size));
  void* workspace = device->AllocWorkspace(ctx, workspace_size);
- CUSPARSE_CALL(cusparseSpMM(
+ CUSPARSE_CALL(hipsparseSpMM(
      thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
-     matC, dtype, spmm_alg, workspace));
+     matC, dtype, HIPSPARSE_SPMM_CSR_ALG2, workspace));
  device->FreeWorkspace(ctx, workspace);
- CUSPARSE_CALL(cusparseDestroySpMat(matA));
- CUSPARSE_CALL(cusparseDestroyDnMat(matB));
- CUSPARSE_CALL(cusparseDestroyDnMat(matC));
+ CUSPARSE_CALL(hipsparseDestroySpMat(matA));
+ CUSPARSE_CALL(hipsparseDestroyDnMat(matB));
+ CUSPARSE_CALL(hipsparseDestroyDnMat(matC));
#else
- cusparseMatDescr_t descr;
- CUSPARSE_CALL(cusparseCreateMatDescr(&descr));
- CUSPARSE_CALL(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL));
- CUSPARSE_CALL(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO));
+ hipsparseMatDescr_t descr;
+ CUSPARSE_CALL(hipsparseCreateMatDescr(&descr));
+ CUSPARSE_CALL(hipsparseSetMatType(descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
+ CUSPARSE_CALL(hipsparseSetMatIndexBase(descr, HIPSPARSE_INDEX_BASE_ZERO));
  CHECK_EQ(sizeof(IdType), sizeof(int32_t));
  CUSPARSE_CALL(Xcsrmm2<DType>(
-     thr_entry->cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
-     CUSPARSE_OPERATION_TRANSPOSE, m, n, k, nnz, &alpha, descr,
+     thr_entry->cusparse_handle, HIPSPARSE_OPERATION_NON_TRANSPOSE,
+     HIPSPARSE_OPERATION_TRANSPOSE, m, n, k, nnz, &alpha, descr,
      (valptr) ? valptr : A_data, static_cast<int32_t*>(csr.indptr->data),
      static_cast<int32_t*>(csr.indices->data), B_data, n, &beta, C_data, m));
- CUSPARSE_CALL(cusparseDestroyMatDescr(descr));
+ CUSPARSE_CALL(hipsparseDestroyMatDescr(descr));
#endif
  if (valptr) device->FreeWorkspace(ctx, valptr);
}
...
...
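When A_data is null, both Csrmm2 paths fill a scratch value array with ones so an unweighted graph behaves like a CSR matrix whose nonzeros are all 1. A hedged sketch of that fill, mirroring the _Fill/_FillKernel helpers from utils.h; the names FillOnes/FillKernel here are illustrative:

#include <hip/hip_runtime.h>

template <typename DType>
__global__ void FillKernel(DType* ptr, size_t length, DType val) {
  size_t tx = blockIdx.x * blockDim.x + threadIdx.x;
  const size_t stride = gridDim.x * blockDim.x;
  for (; tx < length; tx += stride) ptr[tx] = val;  // grid-stride fill
}

template <typename DType>
void FillOnes(DType* d_ptr, size_t length, hipStream_t stream) {
  const int nt = 256;
  const int nb = static_cast<int>((length + nt - 1) / nt);
  hipLaunchKernelGGL(FillKernel<DType>, dim3(nb), dim3(nt), 0, stream, d_ptr,
                     length, static_cast<DType>(1));
}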
@@ -632,7 +642,7 @@ void SpMMCoo(
*/
#if BF16_ENABLED
  if (std::is_same<DType, __half>::value ||
-     std::is_same<DType, __nv_bfloat16>::value)
+     std::is_same<DType, __hip_bfloat16>::value)
#else
  if (std::is_same<DType, __half>::value)
#endif // BF16_ENABLED
...
...
@@ -645,7 +655,7 @@ void SpMMCoo(
      *efeat_data = efeat.Ptr<DType>();
  DType* out_data = out.Ptr<DType>();
  Idx *argu_data = argu.Ptr<Idx>(), *arge_data = arge.Ptr<Idx>();
- cudaStream_t stream = runtime::getCurrentCUDAStream();
+ hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  const int64_t N = coo.num_rows, M = coo.num_cols, E = coo.row->shape[0];
  int64_t *ubcast_off = nullptr, *ebcast_off = nullptr;
...
...
@@ -710,7 +720,7 @@ void SpMMCsr(
  Idx* argu_data = argu.Ptr<Idx>();
  Idx* arge_data = arge.Ptr<Idx>();
- cudaStream_t stream = runtime::getCurrentCUDAStream();
+ hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  int64_t *ubcast_off = nullptr, *ebcast_off = nullptr;
  int64_t len = bcast.out_len, lhs_len = bcast.lhs_len, rhs_len = bcast.rhs_len;
...
...
@@ -771,7 +781,7 @@ void SpMMCmpCsrHetero(
  Idx* argu_data = argu.Ptr<Idx>();
  Idx* arge_data = arge.Ptr<Idx>();
- cudaStream_t stream = runtime::getCurrentCUDAStream();
+ hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  int64_t *ubcast_off = nullptr, *ebcast_off = nullptr;
  int64_t len = bcast.out_len, lhs_len = bcast.lhs_len, rhs_len = bcast.rhs_len;
...
...
src/array/cuda/spmm.cu → src/array/cuda/spmm.hip
View file @
74d88bf8
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file array/cuda/spmm.cu
...
...
@@ -8,9 +9,9 @@
#include <cstdlib>
#include "../../runtime/cuda/cuda_common.h"
-#include "./functor.cuh"
-#include "./ge_spmm.cuh"
-#include "./spmm.cuh"
+#include "functor.cuh"
+#include "ge_spmm.cuh"
+#include "spmm.cuh"
namespace dgl {
...
...
@@ -114,11 +115,11 @@ template void SpMMCsr<kDGLCUDA, int64_t, __half>(
const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out,
std::vector<NDArray> out_aux);
#if BF16_ENABLED
-template void SpMMCsr<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void SpMMCsr<kDGLCUDA, int32_t, __hip_bfloat16>(
const std::string& op, const std::string& reduce, const BcastOff& bcast,
const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out,
std::vector<NDArray> out_aux);
-template void SpMMCsr<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void SpMMCsr<kDGLCUDA, int64_t, __hip_bfloat16>(
const std::string& op, const std::string& reduce, const BcastOff& bcast,
const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out,
std::vector<NDArray> out_aux);
...
...
@@ -149,11 +150,11 @@ template void SpMMCoo<kDGLCUDA, int64_t, __half>(
const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out,
std::vector<NDArray> out_aux);
#if BF16_ENABLED
-template void SpMMCoo<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void SpMMCoo<kDGLCUDA, int32_t, __hip_bfloat16>(
const std::string& op, const std::string& reduce, const BcastOff& bcast,
const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out,
std::vector<NDArray> out_aux);
-template void SpMMCoo<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void SpMMCoo<kDGLCUDA, int64_t, __hip_bfloat16>(
const std::string& op, const std::string& reduce, const BcastOff& bcast,
const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out,
std::vector<NDArray> out_aux);
...
...
src/array/cuda/spmm_hetero.cu → src/array/cuda/spmm_hetero.hip
View file @
74d88bf8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2020 by Contributors
* @file array/cuda/spmm.cu
...
...
@@ -8,9 +10,9 @@
#include <cstdlib>
#include "../../runtime/cuda/cuda_common.h"
-#include "./functor.cuh"
-#include "./ge_spmm.cuh"
-#include "./spmm.cuh"
+#include "functor.cuh"
+#include "ge_spmm.cuh"
+#include "spmm.cuh"
namespace dgl {
...
...
@@ -42,7 +44,7 @@ void SpMMCsrHetero(
use_deterministic_alg_only = true;
bool use_legacy_cusparsemm =
-     (CUDART_VERSION < 11000) && (reduce == "sum") &&
+     (DTKRT_VERSION < 11000) && (reduce == "sum") &&
// legacy cuSPARSE does not care about NNZ, hence the argument "false".
((op == "copy_lhs" && cusparse_available<DType, IdType>(false)) ||
(op == "mul" && is_scalar_efeat &&
...
...
@@ -55,7 +57,7 @@ void SpMMCsrHetero(
if (m == 0) continue;
DType* out = static_cast<DType*>(device->AllocWorkspace(
vec_csr[0].indptr->ctx, m * n * sizeof(DType)));
-     CUDA_CALL(cudaMemset(out, 0, m * n * sizeof(DType)));
+     CUDA_CALL(hipMemset(out, 0, m * n * sizeof(DType)));
trans_out[ntype] = out;
}
}
...
...
@@ -116,7 +118,7 @@ void SpMMCsrHetero(
}
}
- cudaStream_t stream = runtime::getCurrentCUDAStream();
+ hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
for (dgl_type_t etype = 0; etype < ufeat_ntids.size(); ++etype) {
const dgl_type_t src_id = ufeat_ntids[etype];
const dgl_type_t dst_id = out_ntids[etype];
...
...
@@ -128,7 +130,7 @@ void SpMMCsrHetero(
cusparse_available<DType, IdType>(more_nnz)) { // cusparse
/* If CUDA is less than 11.0, put the output in trans_out for later
* transposition */
-     DType* out = (CUDART_VERSION < 11000)
+     DType* out = (DTKRT_VERSION < 11000)
? trans_out[dst_id]
: static_cast<DType*>((*vec_out)[dst_id]->data);
CusparseCsrmm2Hetero<DType, IdType>(
...
...
@@ -214,14 +216,14 @@ template void SpMMCsrHetero<kDGLCUDA, int64_t, __half>(
const std::vector<dgl_type_t>& ufeat_ntids,
const std::vector<dgl_type_t>& out_ntids);
#if BF16_ENABLED
-template void SpMMCsrHetero<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void SpMMCsrHetero<kDGLCUDA, int32_t, __hip_bfloat16>(
const std::string& op, const std::string& reduce, const BcastOff& bcast,
const std::vector<CSRMatrix>& csr, const std::vector<NDArray>& ufeat,
const std::vector<NDArray>& efeat, std::vector<NDArray>* out,
std::vector<std::vector<NDArray>>* out_aux,
const std::vector<dgl_type_t>& ufeat_ntids,
const std::vector<dgl_type_t>& out_ntids);
-template void SpMMCsrHetero<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void SpMMCsrHetero<kDGLCUDA, int64_t, __hip_bfloat16>(
const std::string& op, const std::string& reduce, const BcastOff& bcast,
const std::vector<CSRMatrix>& csr, const std::vector<NDArray>& ufeat,
const std::vector<NDArray>& efeat, std::vector<NDArray>* out,
...
...
src/array/cuda/utils.h
View file @
74d88bf8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2020 by Contributors
* @file array/cuda/utils.h
...
...
@@ -11,7 +13,7 @@
#include <dgl/runtime/ndarray.h>
#include <dmlc/logging.h>
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
#include <type_traits>
#include "../../runtime/cuda/cuda_common.h"
...
...
@@ -90,7 +92,7 @@ inline int FindNumBlocks(int nblks, int max_nblks = -1) {
template <typename T>
__device__ __forceinline__ T _ldg(T* addr) {
-#if __CUDA_ARCH__ >= 350
+#if __HIP_DEVICE_COMPILE__
  return __ldg(addr);
#else
  return *addr;
...
...
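The guard around __ldg changes from a compute-capability check to __HIP_DEVICE_COMPILE__, which is defined only while device code is being compiled, so the intrinsic is used on-device and a plain dereference remains the host fallback. A hedged float-only sketch of the same pattern; the function name is illustrative:

__device__ __forceinline__ float ReadOnlyLoad(const float* addr) {
#if defined(__HIP_DEVICE_COMPILE__) || \
    (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350)
  return __ldg(addr);  // read-only cache load path
#else
  return *addr;        // host / old-architecture fallback
#endif
}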
@@ -126,7 +128,7 @@ __global__ void _FillKernel(DType* ptr, size_t length, DType val) {
/** @brief Fill the vector started from ptr of size length with val */
template <typename DType>
void _Fill(DType* ptr, size_t length, DType val) {
- cudaStream_t stream = runtime::getCurrentCUDAStream();
+ hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  int nt = FindNumThreads(length);
  int nb = (length + nt - 1) / nt;
// on x-axis, no need to worry about upperbound.
...
...
@@ -185,8 +187,8 @@ template <typename IdType>
__global__ void _LinearSearchKernel(
    const IdType* indptr, const IdType* indices, const IdType* data,
    const IdType* row, const IdType* col, int64_t row_stride,
-   int64_t col_stride, int64_t length, const __nv_bfloat16* weights,
-   __nv_bfloat16 filler, __nv_bfloat16* out) {
+   int64_t col_stride, int64_t length, const __hip_bfloat16* weights,
+   __hip_bfloat16 filler, __hip_bfloat16* out) {
  int tx = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride_x = gridDim.x * blockDim.x;
  while (tx < length) {
...
@@ -204,7 +206,7 @@ __global__ void _LinearSearchKernel(
}
else
{
// If the result is saved in bf16, it should be fine to convert it to
// float first
out
[
tx
]
=
weights
?
weights
[
v
]
:
__
nv
_bfloat16
(
static_cast
<
float
>
(
v
));
out
[
tx
]
=
weights
?
weights
[
v
]
:
__
hip
_bfloat16
(
static_cast
<
float
>
(
v
));
}
tx
+=
stride_x
;
}
...
...
@@ -277,12 +279,12 @@ template <typename DType, typename BoolType>
void MaskSelect(
    runtime::DeviceAPI* device, const DGLContext& ctx, const DType* input,
    const BoolType* mask, DType* output, int64_t n, int64_t* rst,
-   cudaStream_t stream) {
+   hipStream_t stream) {
  size_t workspace_size = 0;
- CUDA_CALL(cub::DeviceSelect::Flagged(
+ CUDA_CALL(hipcub::DeviceSelect::Flagged(
      nullptr, workspace_size, input, mask, output, rst, n, stream));
  void* workspace = device->AllocWorkspace(ctx, workspace_size);
- CUDA_CALL(cub::DeviceSelect::Flagged(
+ CUDA_CALL(hipcub::DeviceSelect::Flagged(
      workspace, workspace_size, input, mask, output, rst, n, stream));
  device->FreeWorkspace(ctx, workspace);
}
...
...
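MaskSelect uses the usual two-phase hipcub pattern: a first call with a null temporary buffer only reports the workspace size, and a second call performs the compaction. A hedged standalone sketch, assuming only hipcub and the HIP runtime (names are illustrative, error checks omitted):

#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>

void CompactByMask(const int* d_in, const char* d_flags, int* d_out,
                   int64_t* d_num_selected, int n, hipStream_t stream) {
  size_t temp_bytes = 0;
  // Pass 1: query workspace size only.
  hipcub::DeviceSelect::Flagged(nullptr, temp_bytes, d_in, d_flags, d_out,
                                d_num_selected, n, stream);
  void* d_temp = nullptr;
  hipMalloc(&d_temp, temp_bytes);
  // Pass 2: copy the flagged elements and write the selected count.
  hipcub::DeviceSelect::Flagged(d_temp, temp_bytes, d_in, d_flags, d_out,
                                d_num_selected, n, stream);
  hipFree(d_temp);
}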
@@ -290,7 +292,7 @@ void MaskSelect(
inline void* GetDevicePointer(runtime::NDArray array) {
  void* ptr = array->data;
  if (array.IsPinned()) {
-   CUDA_CALL(cudaHostGetDevicePointer(&ptr, ptr, 0));
+   CUDA_CALL(hipHostGetDevicePointer(&ptr, ptr, 0));
  }
  return ptr;
}
...
...
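GetDevicePointer resolves a device-visible alias for pinned host memory through hipHostGetDevicePointer. A hedged sketch of registering ordinary host memory and fetching that alias; the function name and flag choice are assumptions, not DGL code:

#include <hip/hip_runtime.h>
#include <vector>

float* DeviceAliasOfPinned(std::vector<float>& host_buf) {
  void* dev_ptr = nullptr;
  // Pin the host allocation so the GPU can address it directly.
  hipHostRegister(host_buf.data(), host_buf.size() * sizeof(float),
                  hipHostRegisterDefault);
  hipHostGetDevicePointer(&dev_ptr, host_buf.data(), 0);
  return static_cast<float*>(dev_ptr);
}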