OpenDAS / dgl · Commit 6ac701f8

Commit 6ac701f8, authored Sep 13, 2024 by sangwzh
Commit message: update src and graphbolt code
Parent: 1547bd93
Changes: 116 files in the commit; this page shows 20 changed files with 589 additions and 439 deletions (+589 −439)
Changed files shown on this page:

  src/array/cuda/array_sort.hip            +9   -5
  src/array/cuda/atomic.cuh                +72  -37
  src/array/cuda/bf16.cuh                  +78  -68
  src/array/cuda/coo2csr.hip               +11  -7
  src/array/cuda/coo_sort.hip              +7   -3
  src/array/cuda/csr2coo.hip               +71  -15
  src/array/cuda/csr_get_data.hip          +11  -7
  src/array/cuda/csr_mm.hip                +65  -61
  src/array/cuda/csr_sort.hip              +18  -16
  src/array/cuda/csr_sum.hip               +18  -16
  src/array/cuda/csr_transpose.cc          +14  -12
  src/array/cuda/cuda_filter.hip           +8   -6
  src/array/cuda/cusparse_dispatcher.cuh   +64  -63
  src/array/cuda/disjoint_union.hip        +5   -3
  src/array/cuda/fp16.cuh                  +13  -12
  src/array/cuda/functor.cuh               +33  -32
  src/array/cuda/gather_mm.hip             +59  -48
  src/array/cuda/ge_spmm.cuh               +4   -2
  src/array/cuda/labor_sampling.hip        +24  -22
  src/array/cuda/macro.cuh                 +5   -4
src/array/cuda/array_sort.cu → src/array/cuda/array_sort.hip  (view file @ 6ac701f8)

+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
  * Copyright (c) 2020 by Contributors
  * @file array/cpu/array_sort.cu
  * @brief Array sort GPU implementation
  */
 #include <dgl/array.h>
+#include "../../../include/dgl/array.h"
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
 #include "../../runtime/cuda/cuda_common.h"
-#include "./utils.h"
+#include "utils.h"
 namespace dgl {
 using runtime::NDArray;
...
@@ -29,20 +33,20 @@ std::pair<IdArray, IdArray> Sort(IdArray array, int num_bits) {
   IdType* keys_out = sorted_array.Ptr<IdType>();
   int64_t* values_out = sorted_idx.Ptr<int64_t>();
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   if (num_bits == 0) {
     num_bits = sizeof(IdType) * 8;
   }
   // Allocate workspace
   size_t workspace_size = 0;
-  CUDA_CALL(cub::DeviceRadixSort::SortPairs(
+  CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
       nullptr, workspace_size, keys_in, keys_out, values_in, values_out, nitems,
       0, num_bits, stream));
   void* workspace = device->AllocWorkspace(ctx, workspace_size);
   // Compute
-  CUDA_CALL(cub::DeviceRadixSort::SortPairs(
+  CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
       workspace, workspace_size, keys_in, keys_out, values_in, values_out,
       nitems, 0, num_bits, stream));
...
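
For reference, the two-pass pattern used above (query the workspace size with a null temporary buffer, allocate, then repeat the same call to do the work) is the standard hipCUB idiom. A minimal standalone sketch with illustrative buffer names, int64 keys and values as in the Sort code above, and error checking omitted; this is not the DGL wrapper itself:

#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>

void SortPairsSketch(const int64_t* keys_in, int64_t* keys_out,
                     const int64_t* vals_in, int64_t* vals_out,
                     int num_items, hipStream_t stream) {
  size_t temp_bytes = 0;
  // First call: a null d_temp_storage pointer only computes temp_bytes.
  hipcub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, keys_in, keys_out,
                                     vals_in, vals_out, num_items,
                                     0, sizeof(int64_t) * 8, stream);
  void* temp = nullptr;
  hipMalloc(&temp, temp_bytes);
  // Second call: performs the radix sort on `stream`.
  hipcub::DeviceRadixSort::SortPairs(temp, temp_bytes, keys_in, keys_out,
                                     vals_in, vals_out, num_items,
                                     0, sizeof(int64_t) * 8, stream);
  hipFree(temp);
}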
src/array/cuda/atomic.cuh  (view file @ 6ac701f8)

+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2019 by Contributors
  * @file array/cuda/atomic.cuh
...
@@ -6,7 +7,7 @@
 #ifndef DGL_ARRAY_CUDA_ATOMIC_CUH_
 #define DGL_ARRAY_CUDA_ATOMIC_CUH_
-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
 #include <cassert>
 #include <cstdint>
...
@@ -15,8 +16,8 @@
 #include "bf16.cuh"
 #include "fp16.cuh"
-#if __CUDA_ARCH__ >= 600
-#include <cuda_fp16.h>
+#if __HIPCC__
+#include <hip/hip_fp16.h>
 #endif
 namespace dgl {
...
@@ -56,39 +57,39 @@ struct Cast {
 template <>
 struct Cast<half> {
-  typedef Code<sizeof(half)>::Type Type;
-  static __device__ __forceinline__ Type Encode(half val) {
+  typedef half Type;
+  static __host__ __device__ __forceinline__ Type Encode(half val) {
     return __half_as_ushort(val);
   }
-  static __device__ __forceinline__ half Decode(Type code) {
+  static __host__ __device__ __forceinline__ half Decode(Type code) {
     return __ushort_as_half(code);
   }
 };
 #if BF16_ENABLED
 template <>
-struct Cast<__nv_bfloat16> {
-  typedef Code<sizeof(__nv_bfloat16)>::Type Type;
-  static __device__ __forceinline__ Type Encode(__nv_bfloat16 val) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+struct Cast<__hip_bfloat16> {
+  typedef __hip_bfloat16 Type;
+  static __host__ __device__ __forceinline__ Type Encode(__hip_bfloat16 val) {
+#if defined(__HIP_DEVICE_COMPILE__)
     return __bfloat16_as_ushort(val);
 #else
     printf(
         "Atomic operations are not supported for bfloat16 (BF16) "
         "on GPUs with compute capability less than 8.0.\n");
-    __trap();
+    //     __trap();
     return static_cast<Type>(0);
 #endif
   }
-  static __device__ __forceinline__ __nv_bfloat16 Decode(Type code) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+  static __host__ __device__ __forceinline__ __hip_bfloat16 Decode(Type code) {
+#if defined(__HIP_DEVICE_COMPILE__)
     return __ushort_as_bfloat16(code);
 #else
     printf(
         "Atomic operations are not supported for bfloat16 (BF16) "
        "on GPUs with compute capability less than 8.0.\n");
-    __trap();
-    return static_cast<__nv_bfloat16>(0.0f);
+    //     __trap();
+    return static_cast<__hip_bfloat16>(0.0f);
 #endif
   }
 };
...
@@ -116,12 +117,12 @@ struct Cast<double> {
   }
 };
-static __device__ __forceinline__ unsigned short int atomicCASshort(  // NOLINT
+static __host__ __device__ __forceinline__ unsigned short int atomicCASshort(  // NOLINT
     unsigned short int* address,  // NOLINT
     unsigned short int compare,   // NOLINT
     unsigned short int val) {     // NOLINT
-  static_assert(CUDART_VERSION >= 10000, "Requires at least CUDA 10");
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__) >= 700)
+  static_assert(DTKRT_VERSION >= 10000, "Requires at least CUDA 10");
+#if defined(__HIP_DEVICE_COMPILE__) && 0
   return atomicCAS(address, compare, val);
 #else
   (void)address;
...
@@ -130,9 +131,9 @@ static __device__ __forceinline__ unsigned short int atomicCASshort(  // NOLINT
   printf(
       "Atomic operations are not supported for half precision (FP16) "
       "on this GPU.\n");
-  __trap();
+  abort();
   return val;
-#endif  // (defined(__CUDA_ARCH__) && (__CUDA_ARCH__) >= 700)
+#endif  // (defined(__HIP_DEVICE_COMPILE__)
 }
 #define DEFINE_ATOMIC(NAME) \
...
@@ -168,19 +169,53 @@ static __device__ __forceinline__ unsigned short int atomicCASshort(  // NOLINT
     return Cast<dtype>::Decode(old);                                     \
   }
-#define OP(a, b) max(a, b)
+#define DEFINE_ATOMIC_16BIT_BF(NAME, dtype)                               \
+  template <>                                                             \
+  __device__ __forceinline__ dtype Atomic##NAME<dtype>(                   \
+      dtype * addr, dtype val) {                                          \
+    typedef uint16_t CT;                                                  \
+    CT* addr_as_ui = reinterpret_cast<CT*>(addr);                         \
+    CT old = *addr_as_ui;                                                 \
+    CT assumed = old;                                                     \
+    do {                                                                  \
+      assumed = old;                                                      \
+      old = atomicCASshort(                                               \
+          addr_as_ui, assumed,                                            \
+          Cast<dtype>::Encode(max((double)val, (double)dtype(old))));     \
+    } while (assumed != old);                                             \
+    return Cast<dtype>::Decode(old);                                      \
+  }
+#define DEFINE_ATOMIC_16BIT_Min(NAME, dtype)                              \
+  template <>                                                             \
+  __device__ __forceinline__ dtype Atomic##NAME<dtype>(                   \
+      dtype * addr, dtype val) {                                          \
+    typedef uint16_t CT;                                                  \
+    CT* addr_as_ui = reinterpret_cast<CT*>(addr);                         \
+    CT old = *addr_as_ui;                                                 \
+    CT assumed = old;                                                     \
+    do {                                                                  \
+      assumed = old;                                                      \
+      old = atomicCASshort(                                               \
+          addr_as_ui, assumed,                                            \
+          Cast<dtype>::Encode(min(val, dtype(old))));                     \
+    } while (assumed != old);                                             \
+    return Cast<dtype>::Decode(old);                                      \
+  }
+#define OP(a, b) max((double)a, (double)b)
 DEFINE_ATOMIC(Max)
 DEFINE_ATOMIC_16BIT(Max, half)
 #if BF16_ENABLED
-DEFINE_ATOMIC_16BIT(Max, __nv_bfloat16)
+DEFINE_ATOMIC_16BIT_BF(Max, __hip_bfloat16)
 #endif  // BF16_ENABLED
 #undef OP
-#define OP(a, b) min(a, b)
+#define OP(a, b) min((double)a, (double)b)
 DEFINE_ATOMIC(Min)
 DEFINE_ATOMIC_16BIT(Min, half)
 #if BF16_ENABLED
-DEFINE_ATOMIC_16BIT(Min, __nv_bfloat16)
+DEFINE_ATOMIC_16BIT_BF(Min, __hip_bfloat16)
 #endif  // BF16_ENABLED
 #undef OP
...
@@ -256,7 +291,7 @@ inline __device__ int32_t AtomicMax(int32_t* const address, const int32_t val) {
 template <>
 __device__ __forceinline__ float AtomicAdd<float>(float* addr, float val) {
-#if __CUDA_ARCH__ >= 200
+#if __HIP_DEVICE_COMPILE__
   return atomicAdd(addr, val);
 #else
   typedef float T;
...
@@ -270,12 +305,12 @@ __device__ __forceinline__ float AtomicAdd<float>(float* addr, float val) {
         addr_as_ui, assumed, Cast<T>::Encode(Cast<T>::Decode(old) + val));
   } while (assumed != old);
   return Cast<T>::Decode(old);
-#endif  // __CUDA_ARCH__
+#endif  // __HIP_DEVICE_COMPILE__
 }
 template <>
 __device__ __forceinline__ double AtomicAdd<double>(double* addr, double val) {
-#if __CUDA_ARCH__ >= 600
+#if __HIP_DEVICE_COMPILE__
   return atomicAdd(addr, val);
 #else
   typedef double T;
...
@@ -292,11 +327,11 @@ __device__ __forceinline__ double AtomicAdd<double>(double* addr, double val) {
 #endif
 }
-#if defined(CUDART_VERSION) && CUDART_VERSION >= 10000
+#if defined(DTKRT_VERSION) && DTKRT_VERSION >= 10000
 template <>
 __device__ __forceinline__ half AtomicAdd<half>(half* addr, half val) {
   // make sure we have half support
-#if __CUDA_ARCH__ >= 700
+#if __HIP_DEVICE_COMPILE__
   return atomicAdd(addr, val);
 #else
   (void)addr;
...
@@ -304,18 +339,18 @@ __device__ __forceinline__ half AtomicAdd<half>(half* addr, half val) {
   printf(
       "Atomic operations are not supported for half precision (FP16) "
       "on this GPU.\n");
-  __trap();
+  //     __trap();
   return val;
-#endif  // __CUDA_ARCH__ >= 700
+#endif  // __HIP_DEVICE_COMPILE__
 }
-#endif  // defined(CUDART_VERSION) && CUDART_VERSION >= 10000
+#endif  // defined(DTKRT_VERSION) && DTKRT_VERSION >= 10000
 #if BF16_ENABLED
 template <>
-__device__ __forceinline__ __nv_bfloat16
-AtomicAdd<__nv_bfloat16>(__nv_bfloat16* addr, __nv_bfloat16 val) {
+__device__ __forceinline__ __hip_bfloat16
+AtomicAdd<__hip_bfloat16>(__hip_bfloat16* addr, __hip_bfloat16 val) {
   // make sure we have bfloat16 support
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+#if defined(__HIP_DEVICE_COMPILE__)
   return atomicAdd(addr, val);
 #else
   (void)addr;
...
@@ -323,9 +358,9 @@ AtomicAdd<__nv_bfloat16>(__nv_bfloat16* addr, __nv_bfloat16 val) {
   printf(
       "Atomic operations are not supported for bfloat16 (BF16) "
       "on GPUs with compute capability less than 8.0.\n");
-  __trap();
+  //     __trap();
   return val;
-#endif  // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+#endif  // defined(__HIP_DEVICE_COMPILE__)
 }
 #endif  // BF16_ENABLED
...
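
For context, DEFINE_ATOMIC_16BIT_BF above emulates a 16-bit atomic with a compare-and-swap retry loop built on the atomicCASshort and Cast<> helpers defined earlier in this header. Roughly, DEFINE_ATOMIC_16BIT_BF(Max, __hip_bfloat16) expands to the following; this is a sketch of the macro expansion, not standalone code, and it relies on those helpers being in scope:

// Approximate expansion of DEFINE_ATOMIC_16BIT_BF(Max, __hip_bfloat16):
// retry atomicCASshort on the 16-bit slot until no other thread changed it
// between the read and the CAS, then decode the value seen before the update.
template <>
__device__ __forceinline__ __hip_bfloat16 AtomicMax<__hip_bfloat16>(
    __hip_bfloat16* addr, __hip_bfloat16 val) {
  typedef uint16_t CT;
  CT* addr_as_ui = reinterpret_cast<CT*>(addr);
  CT old = *addr_as_ui;
  CT assumed = old;
  do {
    assumed = old;
    old = atomicCASshort(
        addr_as_ui, assumed,
        Cast<__hip_bfloat16>::Encode(
            max((double)val, (double)__hip_bfloat16(old))));
  } while (assumed != old);
  return Cast<__hip_bfloat16>::Decode(old);
}

The loop re-reads the slot whenever another thread wins the CAS race, so Decode(old) at the end returns the value that was in place immediately before this thread's successful update.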
src/array/cuda/bf16.cuh  (view file @ 6ac701f8)

+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2022 by Contributors
  *
...
@@ -18,131 +19,140 @@
  */
 #ifndef DGL_ARRAY_CUDA_BF16_CUH_
 #define DGL_ARRAY_CUDA_BF16_CUH_
+#include <hip/hip_runtime.h>
 #if BF16_ENABLED
-#include <cuda_bf16.h>
+#include <hip/hip_bf16.h>
 #include <algorithm>
-static __device__ __forceinline__ __nv_bfloat16
-max(__nv_bfloat16 a, __nv_bfloat16 b) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+static __device__ __forceinline__ __hip_bfloat16
+max(__hip_bfloat16 a, __hip_bfloat16 b) {
+#if defined(__HIP_DEVICE_COMPILE__)
   return __hmax(a, b);
 #else
-  return __nv_bfloat16(max(float(a), float(b)));  // NOLINT
+  return __hip_bfloat16(max(float(a), float(b)));  // NOLINT
 #endif
 }
-static __device__ __forceinline__ __nv_bfloat16
-min(__nv_bfloat16 a, __nv_bfloat16 b) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+static __device__ __forceinline__ __hip_bfloat16
+min(__hip_bfloat16 a, __hip_bfloat16 b) {
+#if defined(__HIP_DEVICE_COMPILE__)
   return __hmin(a, b);
 #else
-  return __nv_bfloat16(min(float(a), float(b)));  // NOLINT
+  return __hip_bfloat16(min(float(a), float(b)));  // NOLINT
 #endif
 }
-#ifdef __CUDACC__
+#ifdef __HIPCC__
 // Arithmetic BF16 operations for architecture >= 8.0 are already defined in
-// cuda_bf16.h
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800)
-// CUDA 12.2 adds "emulated" support for older architectures.
-#if defined(CUDART_VERSION) && (CUDART_VERSION < 12020)
+// hip/__hip_bfloat16.h
+// #if defined(__DTK_ARCH__) && (__DTK_ARCH__ < 800)
+// // CUDA 12.2 adds "emulated" support for older architectures.
+// #if defined(DTKRT_VERSION) && (DTKRT_VERSION < 12020)
-__device__ __forceinline__ __nv_bfloat16
-operator+(const __nv_bfloat16& lh, const __nv_bfloat16& rh) {
-  return __nv_bfloat16(float(lh) + float(rh));  // NOLINT
-}
+__device__ __forceinline__ __hip_bfloat16
+operator+(const __hip_bfloat16& lh, const __hip_bfloat16& rh) {
+  return __hip_bfloat16(float(lh) + float(rh));  // NOLINT
+}
-__device__ __forceinline__ __nv_bfloat16& operator+=(
-    __nv_bfloat16& lh, const __nv_bfloat16& rh) {  // NOLINT
-  lh = __nv_bfloat16(float(lh) + float(rh));  // NOLINT
-  return lh;
-}
+__device__ __forceinline__ __hip_bfloat16& operator+=(
+    __hip_bfloat16& lh, const __hip_bfloat16& rh) {  // NOLINT
+  lh = __hip_bfloat16(float(lh) + float(rh));  // NOLINT
+  return lh;
+}
-__device__ __forceinline__ __nv_bfloat16& operator++(__nv_bfloat16& h) {  // NOLINT
-  h = __nv_bfloat16(float(h) + 1.0f);  // NOLINT
-  return h;
-}
+__device__ __forceinline__ __hip_bfloat16& operator++(__hip_bfloat16& h) {  // NOLINT
+  h = __hip_bfloat16(float(h) + 1.0f);  // NOLINT
+  return h;
+}
-__device__ __forceinline__ __nv_bfloat16
-operator++(__nv_bfloat16& h, int) {  // NOLINT
-  __nv_bfloat16 ret = h;
-  h = __nv_bfloat16(float(h) + 1.0f);  // NOLINT
-  return ret;
-}
+__device__ __forceinline__ __hip_bfloat16
+operator++(__hip_bfloat16& h, int) {  // NOLINT
+  __hip_bfloat16 ret = h;
+  h = __hip_bfloat16(float(h) + 1.0f);  // NOLINT
+  return ret;
+}
-__device__ __forceinline__ __nv_bfloat16 operator+(const __nv_bfloat16& h) {
-  return h;
-}
-__device__ __forceinline__ __nv_bfloat16 operator-(const __nv_bfloat16& h) {
-  return __nv_bfloat16(-float(h));  // NOLINT
-}
+__device__ __forceinline__ __hip_bfloat16 operator+(const __hip_bfloat16& h) {
+  return h;
+}
+__device__ __forceinline__ __hip_bfloat16 operator-(const __hip_bfloat16& h) {
+  return __hip_bfloat16(-float(h));  // NOLINT
+}
-__device__ __forceinline__ bool operator==(
-    const __nv_bfloat16& lh, const __nv_bfloat16& rh) {
-  return float(lh) == float(rh);  // NOLINT
-}
+__device__ __forceinline__ bool operator==(
-    const __hip_bfloat16& lh, const __hip_bfloat16& rh) {
+  return float(lh) == float(rh);  // NOLINT
+}
[The remaining overloads in this block, operator-, operator*, operator/, the compound forms -=, *=, /=, prefix and postfix operator--, and the comparisons !=, >, <, >=, <=, are changed in exactly the same way: the float-based bodies are untouched and every __nv_bfloat16 becomes __hip_bfloat16.]
-#endif  // defined(CUDART_VERSION) && (CUDART_VERSION < 12020)
-#endif  // defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800)
-#endif  // __CUDACC__
+// #endif  // defined(DTKRT_VERSION) && (DTKRT_VERSION < 12020)
+// #endif  // defined(__DTK_ARCH__) && (__DTK_ARCH__ < 800)
+__device__ inline __hip_bfloat16 __shfl_down(
+    __hip_bfloat16 var, unsigned int lane_delta, int width = warpSize) {
+  union {
+    unsigned short s;
+    __hip_bfloat16 us;
+  } tmp;
+  tmp.us = var;
+  tmp.s = __shfl_down(tmp.s, lane_delta, width);
+  return tmp.us;
+}
+#endif  // __HIPCC__
 #endif  // BF16_ENABLED
...
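
The __shfl_down overload added at the end of the #ifdef __HIPCC__ block moves the 16 payload bits of a bf16 value through the integer warp shuffle and reinterprets them on the receiving lane, so existing warp-reduction code can keep shuffling bf16 operands. A minimal usage sketch, assuming bf16.cuh above is included and a single fully active warp; the function name is illustrative, not part of the commit:

// Warp-level sum reduction over bf16 values; relies on the bf16 operator+=
// and the bf16 __shfl_down overload defined in bf16.cuh above.
__device__ __hip_bfloat16 WarpReduceSumBF16(__hip_bfloat16 v) {
  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
    v += __shfl_down(v, offset);  // lane i adds the value held by lane i+offset
  }
  return v;  // lane 0 ends up holding the sum over the warp
}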
src/array/cuda/coo2csr.cu → src/array/cuda/coo2csr.hip  (view file @ 6ac701f8)

+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
  * Copyright (c) 2020 by Contributors
  * @file array/cuda/coo2csr.cc
  * @brief COO2CSR
  */
 #include <dgl/array.h>
+#include "../../../include/dgl/array.h"
 #include "../../runtime/cuda/cuda_common.h"
-#include "./utils.h"
+#include "utils.h"
 namespace dgl {
...
@@ -24,12 +28,12 @@ CSRMatrix COOToCSR(COOMatrix coo) {
 template <>
 CSRMatrix COOToCSR<kDGLCUDA, int32_t>(COOMatrix coo) {
   auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   // allocate cusparse handle if needed
   if (!thr_entry->cusparse_handle) {
-    CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
+    CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
   }
-  CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream));
+  CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream));
   bool row_sorted = coo.row_sorted;
   bool col_sorted = coo.col_sorted;
...
@@ -50,9 +54,9 @@ CSRMatrix COOToCSR<kDGLCUDA, int32_t>(COOMatrix coo) {
   NDArray indptr =
       aten::NewIdArray(coo.num_rows + 1, coo.row->ctx, coo.row->dtype.bits);
   int32_t* indptr_ptr = static_cast<int32_t*>(indptr->data);
-  CUSPARSE_CALL(cusparseXcoo2csr(
+  CUSPARSE_CALL(hipsparseXcoo2csr(
       thr_entry->cusparse_handle, coo.row.Ptr<int32_t>(), nnz, coo.num_rows,
-      indptr_ptr, CUSPARSE_INDEX_BASE_ZERO));
+      indptr_ptr, HIPSPARSE_INDEX_BASE_ZERO));
   return CSRMatrix(
       coo.num_rows, coo.num_cols, indptr, coo.col, coo.data, col_sorted);
...
@@ -100,7 +104,7 @@ template <>
 CSRMatrix COOToCSR<kDGLCUDA, int64_t>(COOMatrix coo) {
   const auto& ctx = coo.row->ctx;
   const auto nbits = coo.row->dtype.bits;
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   bool row_sorted = coo.row_sorted;
   bool col_sorted = coo.col_sorted;
   if (!row_sorted) {
...
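
The int32 path above is a thin wrapper around hipsparseXcoo2csr, which turns sorted COO row indices into a CSR row-pointer array of length num_rows + 1. A minimal sketch of that single call with illustrative names; the hipSPARSE include path can differ between ROCm versions, and status checking is omitted:

#include <hipsparse/hipsparse.h>

// Build a CSR row-pointer array (length m + 1) from COO row indices.
// d_coo_rows must be sorted by row for the result to be a valid indptr.
void CooRowsToCsrIndptr(hipsparseHandle_t handle, const int* d_coo_rows,
                        int nnz, int m, int* d_csr_indptr) {
  hipsparseXcoo2csr(handle, d_coo_rows, nnz, m, d_csr_indptr,
                    HIPSPARSE_INDEX_BASE_ZERO);
}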
src/array/cuda/coo_sort.cu → src/array/cuda/coo_sort.hip  (view file @ 6ac701f8)

+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
  * Copyright (c) 2020 by Contributors
  * @file array/cuda/coo_sort.cc
  * @brief Sort COO index
  */
 #include <dgl/array.h>
+#include "../../../include/dgl/array.h"
 #include "../../c_api_common.h"
 #include "../../runtime/cuda/cuda_common.h"
-#include "./utils.h"
+#include "utils.h"
 namespace dgl {
...
@@ -65,7 +69,7 @@ __global__ void _COODecodeEdgesKernel(
 template <DGLDeviceType XPU, typename IdType>
 void COOSort_(COOMatrix* coo, bool sort_column) {
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   const int row_bits = cuda::_NumberOfBits(coo->num_rows);
   const int64_t nnz = coo->row->shape[0];
...
@@ -138,7 +142,7 @@ template <DGLDeviceType XPU, typename IdType>
 std::pair<bool, bool> COOIsSorted(COOMatrix coo) {
   const int64_t nnz = coo.row->shape[0];
   const auto& ctx = coo.row->ctx;
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   auto device = runtime::DeviceAPI::Get(ctx);
   // We allocate a workspace of 2*nnz bytes. It wastes a little bit memory but
   // should be fine.
...
src/array/cuda/csr2coo.cu → src/array/cuda/csr2coo.hip  (view file @ 6ac701f8)

+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
  * Copyright (c) 2020 by Contributors
  * @file array/cuda/csr2coo.cc
...
@@ -8,10 +10,10 @@
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
 #include "../../runtime/cuda/cuda_common.h"
-#include "./utils.h"
+#include "utils.h"
 namespace dgl {
...
@@ -29,12 +31,12 @@ COOMatrix CSRToCOO(CSRMatrix csr) {
 template <>
 COOMatrix CSRToCOO<kDGLCUDA, int32_t>(CSRMatrix csr) {
   auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   // allocate cusparse handle if needed
   if (!thr_entry->cusparse_handle) {
-    CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
+    CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
   }
-  CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream));
+  CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream));
   NDArray indptr = csr.indptr, indices = csr.indices, data = csr.data;
   const int32_t* indptr_ptr = static_cast<int32_t*>(indptr->data);
...
@@ -42,9 +44,9 @@ COOMatrix CSRToCOO<kDGLCUDA, int32_t>(CSRMatrix csr) {
       aten::NewIdArray(indices->shape[0], indptr->ctx, indptr->dtype.bits);
   int32_t* row_ptr = static_cast<int32_t*>(row->data);
-  CUSPARSE_CALL(cusparseXcsr2coo(
+  CUSPARSE_CALL(hipsparseXcsr2coo(
       thr_entry->cusparse_handle, indptr_ptr, indices->shape[0], csr.num_rows,
-      row_ptr, CUSPARSE_INDEX_BASE_ZERO));
+      row_ptr, HIPSPARSE_INDEX_BASE_ZERO));
   return COOMatrix(
       csr.num_rows, csr.num_cols, row, indices, data, true, csr.sorted);
...
@@ -72,10 +74,40 @@ struct AdjacentDifference {
   }
 };
+/*!
+ * \brief Repeat elements
+ * \param val Value to repeat
+ * \param repeats Number of repeats for each value
+ * \param pos The position of the output buffer to write the value.
+ * \param out Output buffer.
+ * \param length Number of values
+ *
+ * For example:
+ * val = [3, 0, 1]
+ * repeats = [1, 0, 2]
+ * pos = [0, 1, 1]  # write to output buffer position 0, 1, 1
+ * then,
+ * out = [3, 1, 1]
+ */
+template <typename DType, typename IdType>
+__global__ void _RepeatKernel(
+    const DType* val, const IdType* pos,
+    DType* out, int64_t n_row, int64_t length) {
+  IdType tx = static_cast<IdType>(blockIdx.x) * blockDim.x + threadIdx.x;
+  const int stride_x = gridDim.x * blockDim.x;
+  while (tx < length) {
+    IdType i = dgl::cuda::_UpperBound(pos, n_row, tx) - 1;
+    out[tx] = val[i];
+    tx += stride_x;
+  }
+}
+#if 0
 template <>
 COOMatrix CSRToCOO<kDGLCUDA, int64_t>(CSRMatrix csr) {
   const auto& ctx = csr.indptr->ctx;
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   const int64_t nnz = csr.indices->shape[0];
   const auto nbits = csr.indptr->dtype.bits;
...
@@ -96,14 +128,14 @@ COOMatrix CSRToCOO<kDGLCUDA, int64_t>(CSRMatrix csr) {
     std::size_t temp_storage_bytes = 0;
     CUDA_CALL(cub::DeviceCopy::Batched(
         nullptr, temp_storage_bytes, input_buffer + i, output_buffer + i,
-        buffer_sizes + i, std::min(csr.num_rows - i, max_copy_at_once),
+        buffer_sizes + i, ::min(csr.num_rows - i, max_copy_at_once),
         stream));
     auto temp = allocator.alloc_unique<char>(temp_storage_bytes);
     CUDA_CALL(cub::DeviceCopy::Batched(
         temp.get(), temp_storage_bytes, input_buffer + i, output_buffer + i,
-        buffer_sizes + i, std::min(csr.num_rows - i, max_copy_at_once),
+        buffer_sizes + i, ::min(csr.num_rows - i, max_copy_at_once),
         stream));
   }
...
@@ -111,6 +143,30 @@ COOMatrix CSRToCOO<kDGLCUDA, int64_t>(CSRMatrix csr) {
       csr.num_rows, csr.num_cols, ret_row, csr.indices, csr.data, true,
       csr.sorted);
 }
+#else
+template <>
+COOMatrix CSRToCOO<kDGLCUDA, int64_t>(CSRMatrix csr) {
+  const auto& ctx = csr.indptr->ctx;
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
+  const int64_t nnz = csr.indices->shape[0];
+  const auto nbits = csr.indptr->dtype.bits;
+  IdArray rowids = Range(0, csr.num_rows, nbits, ctx);
+  IdArray ret_row = NewIdArray(nnz, ctx, nbits);
+  const int nt = 256;
+  const int nb = (nnz + nt - 1) / nt;
+  CUDA_KERNEL_CALL(_RepeatKernel,
+      nb, nt, 0, stream,
+      rowids.Ptr<int64_t>(),
+      csr.indptr.Ptr<int64_t>(), ret_row.Ptr<int64_t>(),
+      csr.num_rows, nnz);
+  return COOMatrix(csr.num_rows, csr.num_cols,
+      ret_row, csr.indices, csr.data,
+      true, csr.sorted);
+}
+#endif
 template COOMatrix CSRToCOO<kDGLCUDA, int32_t>(CSRMatrix csr);
 template COOMatrix CSRToCOO<kDGLCUDA, int64_t>(CSRMatrix csr);
...
@@ -128,12 +184,12 @@ COOMatrix CSRToCOODataAsOrder<kDGLCUDA, int32_t>(CSRMatrix csr) {
   auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
   auto device = runtime::DeviceAPI::Get(coo.row->ctx);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   // allocate cusparse handle if needed
   if (!thr_entry->cusparse_handle) {
-    CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
+    CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
   }
-  CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream));
+  CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream));
   NDArray row = coo.row, col = coo.col, data = coo.data;
   int32_t* row_ptr = static_cast<int32_t*>(row->data);
...
@@ -141,11 +197,11 @@ COOMatrix CSRToCOODataAsOrder<kDGLCUDA, int32_t>(CSRMatrix csr) {
   int32_t* data_ptr = static_cast<int32_t*>(data->data);
   size_t workspace_size = 0;
-  CUSPARSE_CALL(cusparseXcoosort_bufferSizeExt(
+  CUSPARSE_CALL(hipsparseXcoosort_bufferSizeExt(
       thr_entry->cusparse_handle, coo.num_rows, coo.num_cols, row->shape[0],
       data_ptr, row_ptr, &workspace_size));
   void* workspace = device->AllocWorkspace(row->ctx, workspace_size);
-  CUSPARSE_CALL(cusparseXcoosortByRow(
+  CUSPARSE_CALL(hipsparseXcoosortByRow(
       thr_entry->cusparse_handle, coo.num_rows, coo.num_cols, row->shape[0],
       data_ptr, row_ptr, col_ptr, workspace));
   device->FreeWorkspace(row->ctx, workspace);
...
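
The new 64-bit CSRToCOO path expands each row id r into indptr[r + 1] - indptr[r] copies by having every output slot binary-search indptr for its owning row (the _UpperBound call inside _RepeatKernel). A small host-side reference in plain C++ that computes the same expansion, useful for reasoning about or sanity-checking the kernel's output; the helper name is illustrative:

#include <algorithm>
#include <cstdint>
#include <vector>

// CPU reference for the _RepeatKernel-based expansion: for every COO slot t,
// the owning row is the last r with indptr[r] <= t, i.e. upper_bound(t) - 1.
std::vector<int64_t> CsrRowIdsToCoo(const std::vector<int64_t>& indptr) {
  const int64_t nnz = indptr.back();
  std::vector<int64_t> row(nnz);
  for (int64_t t = 0; t < nnz; ++t) {
    auto it = std::upper_bound(indptr.begin(), indptr.end(), t);
    row[t] = static_cast<int64_t>(it - indptr.begin()) - 1;
  }
  return row;
}

// Example: indptr = {0, 2, 2, 5} yields row ids {0, 0, 2, 2, 2}.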
src/array/cuda/csr_get_data.cu → src/array/cuda/csr_get_data.hip  (view file @ 6ac701f8)

+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
  * Copyright (c) 2021 by Contributors
  * @file array/cuda/csr_get_data.cu
  * @brief Retrieve entries of a CSR matrix
  */
 #include <dgl/array.h>
+#include "../../../include/dgl/array.h"
 #include <numeric>
 #include <unordered_set>
 #include <vector>
 #include "../../runtime/cuda/cuda_common.h"
-#include "./utils.h"
+#include "utils.h"
 namespace dgl {
...
@@ -32,11 +36,11 @@ NDArray CSRGetData(
   const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1;
   const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1;
-  const int64_t rstlen = std::max(rowlen, collen);
+  const int64_t rstlen = ::max(rowlen, collen);
   IdArray rst = NDArray::Empty({rstlen}, weights->dtype, rows->ctx);
   if (rstlen == 0) return rst;
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   const int nt = cuda::FindNumThreads(rstlen);
   const int nb = (rstlen + nt - 1) / nt;
   if (return_eids)
...
@@ -67,12 +71,12 @@ template NDArray CSRGetData<kDGLCUDA, int64_t, __half>(
     CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids,
     NDArray weights, __half filler);
 #if BF16_ENABLED
-template NDArray CSRGetData<kDGLCUDA, int32_t, __nv_bfloat16>(
+template NDArray CSRGetData<kDGLCUDA, int32_t, __hip_bfloat16>(
     CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids,
-    NDArray weights, __nv_bfloat16 filler);
+    NDArray weights, __hip_bfloat16 filler);
-template NDArray CSRGetData<kDGLCUDA, int64_t, __nv_bfloat16>(
+template NDArray CSRGetData<kDGLCUDA, int64_t, __hip_bfloat16>(
     CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids,
-    NDArray weights, __nv_bfloat16 filler);
+    NDArray weights, __hip_bfloat16 filler);
 #endif  // BF16_ENABLED
 template NDArray CSRGetData<kDGLCUDA, int32_t, float>(
     CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids,
...
src/array/cuda/csr_mm.
cu
→
src/array/cuda/csr_mm.
hip
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
/**
* Copyright (c) 2020 by Contributors
* Copyright (c) 2020 by Contributors
* @file array/cuda/csr_mm.cu
* @file array/cuda/csr_mm.cu
* @brief SpSpMM/SpGEMM C APIs and definitions.
* @brief SpSpMM/SpGEMM C APIs and definitions.
*/
*/
#include <dgl/array.h>
#include <dgl/array.h>
#include "../../../include/dgl/array.h"
#include <dgl/runtime/device_api.h>
#include <dgl/runtime/device_api.h>
#include <limits>
#include <limits>
#include "../../runtime/cuda/cuda_common.h"
#include "../../runtime/cuda/cuda_common.h"
#include "
./
cusparse_dispatcher.cuh"
#include "cusparse_dispatcher.cuh"
#include "
./
functor.cuh"
#include "functor.cuh"
namespace dgl {
namespace dgl {
using namespace dgl::runtime;
using namespace dgl::runtime;
...
@@ -18,7 +22,7 @@ using namespace dgl::runtime;
...
@@ -18,7 +22,7 @@ using namespace dgl::runtime;
namespace aten {
namespace aten {
namespace cusparse {
namespace cusparse {
#if
CUDA
RT_VERSION >= 12000
#if
DTK
RT_VERSION >= 12000
/** @brief Cusparse implementation of SpGEMM on Csr format for CUDA 12.0+ */
/** @brief Cusparse implementation of SpGEMM on Csr format for CUDA 12.0+ */
template <typename DType, typename IdType>
template <typename DType, typename IdType>
...
@@ -31,74 +35,74 @@ std::pair<CSRMatrix, NDArray> CusparseSpgemm(
...
@@ -31,74 +35,74 @@ std::pair<CSRMatrix, NDArray> CusparseSpgemm(
const int nnzB = B.indices->shape[0];
const int nnzB = B.indices->shape[0];
const DType alpha = 1.0;
const DType alpha = 1.0;
const DType beta = 0.0;
const DType beta = 0.0;
auto
transA
=
CU
SPARSE_OPERATION_NON_TRANSPOSE
;
auto transA =
HIP
SPARSE_OPERATION_NON_TRANSPOSE;
auto
transB
=
CU
SPARSE_OPERATION_NON_TRANSPOSE
;
auto transB =
HIP
SPARSE_OPERATION_NON_TRANSPOSE;
// device
// device
auto ctx = A.indptr->ctx;
auto ctx = A.indptr->ctx;
auto device = runtime::DeviceAPI::Get(ctx);
auto device = runtime::DeviceAPI::Get(ctx);
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
cuda
Stream_t
stream
=
runtime
::
getCurrent
CUDA
Stream
();
hip
Stream_t stream = runtime::getCurrent
HIP
Stream
MasqueradingAsCUDA
();
const DType* A_weights = A_weights_array.Ptr<DType>();
const DType* A_weights = A_weights_array.Ptr<DType>();
const DType* B_weights = B_weights_array.Ptr<DType>();
const DType* B_weights = B_weights_array.Ptr<DType>();
// allocate cusparse handle if needed
// allocate cusparse handle if needed
if (!thr_entry->cusparse_handle) {
if (!thr_entry->cusparse_handle) {
CUSPARSE_CALL
(
cu
sparseCreate
(
&
(
thr_entry
->
cusparse_handle
)));
CUSPARSE_CALL(
hip
sparseCreate(&(thr_entry->cusparse_handle)));
}
}
CUSPARSE_CALL
(
cu
sparseSetStream
(
thr_entry
->
cusparse_handle
,
stream
));
CUSPARSE_CALL(
hip
sparseSetStream(thr_entry->cusparse_handle, stream));
// all one data array
// all one data array
cu
sparseSpMatDescr_t
matA
,
matB
,
matC
;
hip
sparseSpMatDescr_t matA, matB, matC;
IdArray dC_csrOffsets =
IdArray dC_csrOffsets =
IdArray::Empty({A.num_rows + 1}, A.indptr->dtype, A.indptr->ctx);
IdArray::Empty({A.num_rows + 1}, A.indptr->dtype, A.indptr->ctx);
IdType* dC_csrOffsets_data = dC_csrOffsets.Ptr<IdType>();
IdType* dC_csrOffsets_data = dC_csrOffsets.Ptr<IdType>();
constexpr auto idtype = cusparse_idtype<IdType>::value;
constexpr auto idtype = cusparse_idtype<IdType>::value;
constexpr auto dtype = cuda_dtype<DType>::value;
constexpr auto dtype = cuda_dtype<DType>::value;
// Create sparse matrix A, B and C in CSR format
// Create sparse matrix A, B and C in CSR format
CUSPARSE_CALL
(
cu
sparseCreateCsr
(
CUSPARSE_CALL(
hip
sparseCreateCsr(
&matA, A.num_rows, A.num_cols, nnzA, A.indptr.Ptr<IdType>(),
&matA, A.num_rows, A.num_cols, nnzA, A.indptr.Ptr<IdType>(),
A.indices.Ptr<IdType>(),
A.indices.Ptr<IdType>(),
//
cu
sparseCreateCsr only accepts non-const pointers.
//
hip
sparseCreateCsr only accepts non-const pointers.
const_cast
<
DType
*>
(
A_weights
),
idtype
,
idtype
,
CU
SPARSE_INDEX_BASE_ZERO
,
const_cast<DType*>(A_weights), idtype, idtype,
HIP
SPARSE_INDEX_BASE_ZERO,
dtype));
dtype));
CUSPARSE_CALL
(
cu
sparseCreateCsr
(
CUSPARSE_CALL(
hip
sparseCreateCsr(
&matB, B.num_rows, B.num_cols, nnzB, B.indptr.Ptr<IdType>(),
&matB, B.num_rows, B.num_cols, nnzB, B.indptr.Ptr<IdType>(),
B.indices.Ptr<IdType>(),
B.indices.Ptr<IdType>(),
//
cu
sparseCreateCsr only accepts non-const pointers.
//
hip
sparseCreateCsr only accepts non-const pointers.
const_cast
<
DType
*>
(
B_weights
),
idtype
,
idtype
,
CU
SPARSE_INDEX_BASE_ZERO
,
const_cast<DType*>(B_weights), idtype, idtype,
HIP
SPARSE_INDEX_BASE_ZERO,
dtype));
dtype));
CUSPARSE_CALL
(
cu
sparseCreateCsr
(
CUSPARSE_CALL(
hip
sparseCreateCsr(
&matC, A.num_rows, B.num_cols, 0, dC_csrOffsets_data, nullptr, nullptr,
&matC, A.num_rows, B.num_cols, 0, dC_csrOffsets_data, nullptr, nullptr,
idtype
,
idtype
,
CU
SPARSE_INDEX_BASE_ZERO
,
dtype
));
idtype, idtype,
HIP
SPARSE_INDEX_BASE_ZERO, dtype));
// SpGEMM Computation
// SpGEMM Computation
cu
sparseSpGEMMDescr_t
spgemmDesc
;
hip
sparseSpGEMMDescr_t spgemmDesc;
cusparseSpGEMMAlg_t
alg
=
CU
SPARSE_SPGEMM_DEFAULT
;
cusparseSpGEMMAlg_t alg =
HIP
SPARSE_SPGEMM_DEFAULT;
CUSPARSE_CALL
(
cu
sparseSpGEMM_createDescr
(
&
spgemmDesc
));
CUSPARSE_CALL(
hip
sparseSpGEMM_createDescr(&spgemmDesc));
size_t workspace_size1 = 0, workspace_size2 = 0, workspace_size3 = 0;
size_t workspace_size1 = 0, workspace_size2 = 0, workspace_size3 = 0;
// ask bufferSize1 bytes for external memory
// ask bufferSize1 bytes for external memory
CUSPARSE_CALL
(
cu
sparseSpGEMM_workEstimation
(
CUSPARSE_CALL(
hip
sparseSpGEMM_workEstimation(
thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
matC, dtype, alg, spgemmDesc, &workspace_size1, NULL));
matC, dtype, alg, spgemmDesc, &workspace_size1, NULL));
void* workspace1 = (device->AllocWorkspace(ctx, workspace_size1));
void* workspace1 = (device->AllocWorkspace(ctx, workspace_size1));
// inspect the matrices A and B to understand the memory requiremnent
// inspect the matrices A and B to understand the memory requiremnent
cu
sparseStatus_t
e
=
cu
sparseSpGEMM_workEstimation
(
hip
sparseStatus_t e =
hip
sparseSpGEMM_workEstimation(
thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
matC, dtype, alg, spgemmDesc, &workspace_size1, workspace1);
matC, dtype, alg, spgemmDesc, &workspace_size1, workspace1);
//
CU
SPARSE_SPGEMM_DEFAULT not support getting num_prods > 2^31 -1
//
HIP
SPARSE_SPGEMM_DEFAULT not support getting num_prods > 2^31 -1
// and throws insufficient memory error within workEstimation call
// and throws insufficient memory error within workEstimation call
if (e == CUSPARSE_STATUS_INSUFFICIENT_RESOURCES) {
if (e == CUSPARSE_STATUS_INSUFFICIENT_RESOURCES) {
// fall back to ALG2 to estimate num_prods
// fall back to ALG2 to estimate num_prods
alg = CUSPARSE_SPGEMM_ALG2;
alg = CUSPARSE_SPGEMM_ALG2;
device->FreeWorkspace(ctx, workspace1);
device->FreeWorkspace(ctx, workspace1);
// rerun
cu
sparseSpGEMM_workEstimation
// rerun
hip
sparseSpGEMM_workEstimation
CUSPARSE_CALL
(
cu
sparseSpGEMM_workEstimation
(
CUSPARSE_CALL(
hip
sparseSpGEMM_workEstimation(
thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
matC, dtype, alg, spgemmDesc, &workspace_size1, NULL));
matC, dtype, alg, spgemmDesc, &workspace_size1, NULL));
workspace1 = (device->AllocWorkspace(ctx, workspace_size1));
workspace1 = (device->AllocWorkspace(ctx, workspace_size1));
CUSPARSE_CALL
(
cu
sparseSpGEMM_workEstimation
(
CUSPARSE_CALL(
hip
sparseSpGEMM_workEstimation(
thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
matC, dtype, alg, spgemmDesc, &workspace_size1, workspace1));
matC, dtype, alg, spgemmDesc, &workspace_size1, workspace1));
} else {
} else {
CHECK
(
e
==
CU
SPARSE_STATUS_SUCCESS
)
<<
"CUSPARSE ERROR in SpGEMM: "
<<
e
;
CHECK(e ==
HIP
SPARSE_STATUS_SUCCESS) << "CUSPARSE ERROR in SpGEMM: " << e;
}
}
// get the number of intermediate products required for SpGEMM compute
// get the number of intermediate products required for SpGEMM compute
...
@@ -113,22 +117,22 @@ std::pair<CSRMatrix, NDArray> CusparseSpgemm(
...
@@ -113,22 +117,22 @@ std::pair<CSRMatrix, NDArray> CusparseSpgemm(
int64_t LARGE_NUM_PRODUCTS = 800000000; // 800*1000*1000;
int64_t LARGE_NUM_PRODUCTS = 800000000; // 800*1000*1000;
// switch to ALG2/ALG3 for medium & large problem size
// switch to ALG2/ALG3 for medium & large problem size
if
(
alg
==
CU
SPARSE_SPGEMM_DEFAULT
&&
num_prods
>
MEDIUM_NUM_PRODUCTS
)
{
if (alg ==
HIP
SPARSE_SPGEMM_DEFAULT && num_prods > MEDIUM_NUM_PRODUCTS) {
// use ALG3 for very large problem
// use ALG3 for very large problem
alg = num_prods > LARGE_NUM_PRODUCTS ? CUSPARSE_SPGEMM_ALG3
alg = num_prods > LARGE_NUM_PRODUCTS ? CUSPARSE_SPGEMM_ALG3
: CUSPARSE_SPGEMM_ALG2;
: CUSPARSE_SPGEMM_ALG2;
device->FreeWorkspace(ctx, workspace1);
device->FreeWorkspace(ctx, workspace1);
// rerun
cu
sparseSpGEMM_workEstimation
// rerun
hip
sparseSpGEMM_workEstimation
CUSPARSE_CALL
(
cu
sparseSpGEMM_workEstimation
(
CUSPARSE_CALL(
hip
sparseSpGEMM_workEstimation(
thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
matC, dtype, alg, spgemmDesc, &workspace_size1, NULL));
matC, dtype, alg, spgemmDesc, &workspace_size1, NULL));
workspace1 = (device->AllocWorkspace(ctx, workspace_size1));
workspace1 = (device->AllocWorkspace(ctx, workspace_size1));
CUSPARSE_CALL
(
cu
sparseSpGEMM_workEstimation
(
CUSPARSE_CALL(
hip
sparseSpGEMM_workEstimation(
thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
matC, dtype, alg, spgemmDesc, &workspace_size1, workspace1));
matC, dtype, alg, spgemmDesc, &workspace_size1, workspace1));
} else if (alg == CUSPARSE_SPGEMM_ALG2 && num_prods > LARGE_NUM_PRODUCTS) {
} else if (alg == CUSPARSE_SPGEMM_ALG2 && num_prods > LARGE_NUM_PRODUCTS) {
// no need to rerun
cu
sparseSpGEMM_workEstimation between ALG2 and ALG3
// no need to rerun
hip
sparseSpGEMM_workEstimation between ALG2 and ALG3
alg = CUSPARSE_SPGEMM_ALG3;
alg = CUSPARSE_SPGEMM_ALG3;
}
}
...
@@ -147,40 +151,40 @@ std::pair<CSRMatrix, NDArray> CusparseSpgemm(
...
@@ -147,40 +151,40 @@ std::pair<CSRMatrix, NDArray> CusparseSpgemm(
workspace3, &workspace_size2));
workspace3, &workspace_size2));
device->FreeWorkspace(ctx, workspace3);
device->FreeWorkspace(ctx, workspace3);
} else {
} else {
CUSPARSE_CALL
(
cu
sparseSpGEMM_compute
(
CUSPARSE_CALL(
hip
sparseSpGEMM_compute(
thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
matC, dtype, alg, spgemmDesc, &workspace_size2, NULL));
matC, dtype, alg, spgemmDesc, &workspace_size2, NULL));
}
}
// ask bufferSize2 bytes for external memory
// ask bufferSize2 bytes for external memory
void* workspace2 = device->AllocWorkspace(ctx, workspace_size2);
void* workspace2 = device->AllocWorkspace(ctx, workspace_size2);
// compute the intermediate product of A * B
// compute the intermediate product of A * B
  CUSPARSE_CALL(cusparseSpGEMM_compute(
  CUSPARSE_CALL(hipsparseSpGEMM_compute(
      thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
      matC, dtype, alg, spgemmDesc, &workspace_size2, workspace2));
  // get matrix C non-zero entries C_nnz1
  int64_t C_num_rows1, C_num_cols1, C_nnz1;
  CUSPARSE_CALL(
      cusparseSpMatGetSize(matC, &C_num_rows1, &C_num_cols1, &C_nnz1));
      hipsparseSpMatGetSize(matC, &C_num_rows1, &C_num_cols1, &C_nnz1));
  IdArray dC_columns = IdArray::Empty({C_nnz1}, A.indptr->dtype, A.indptr->ctx);
  NDArray dC_weights =
      NDArray::Empty({C_nnz1}, A_weights_array->dtype, A.indptr->ctx);
  IdType* dC_columns_data = dC_columns.Ptr<IdType>();
  DType* dC_weights_data = dC_weights.Ptr<DType>();
  // update matC with the new pointers
  CUSPARSE_CALL(cusparseCsrSetPointers(
  CUSPARSE_CALL(hipsparseCsrSetPointers(
      matC, dC_csrOffsets_data, dC_columns_data, dC_weights_data));
  // copy the final products to the matrix C
  CUSPARSE_CALL(cusparseSpGEMM_copy(
  CUSPARSE_CALL(hipsparseSpGEMM_copy(
      thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
      matC, dtype, alg, spgemmDesc));
  device->FreeWorkspace(ctx, workspace1);
  device->FreeWorkspace(ctx, workspace2);
  // destroy matrix/vector descriptors
  CUSPARSE_CALL(cusparseSpGEMM_destroyDescr(spgemmDesc));
  CUSPARSE_CALL(hipsparseSpGEMM_destroyDescr(spgemmDesc));
  CUSPARSE_CALL(cusparseDestroySpMat(matA));
  CUSPARSE_CALL(hipsparseDestroySpMat(matA));
  CUSPARSE_CALL(cusparseDestroySpMat(matB));
  CUSPARSE_CALL(hipsparseDestroySpMat(matB));
  CUSPARSE_CALL(cusparseDestroySpMat(matC));
  CUSPARSE_CALL(hipsparseDestroySpMat(matC));
  return {
      CSRMatrix(
          A.num_rows, B.num_cols, dC_csrOffsets, dC_columns,
...
@@ -188,7 +192,7 @@ std::pair<CSRMatrix, NDArray> CusparseSpgemm(
      dC_weights};
}
#else  // CUDART_VERSION < 12000
#else  // DTKRT_VERSION < 12000
/** @brief Cusparse implementation of SpGEMM on Csr format for older CUDA
 * versions */
...
@@ -208,25 +212,25 @@ std::pair<CSRMatrix, NDArray> CusparseSpgemm(
  auto ctx = A.indptr->ctx;
  auto device = runtime::DeviceAPI::Get(ctx);
  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
  cudaStream_t stream = runtime::getCurrentCUDAStream();
  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  auto idtype = A.indptr->dtype;
  auto dtype = A_weights_array->dtype;
  const DType* A_weights = A_weights_array.Ptr<DType>();
  const DType* B_weights = B_weights_array.Ptr<DType>();
  if (!thr_entry->cusparse_handle) {
    CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
    CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
  }
  CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream));
  CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream));
  CUSPARSE_CALL(cusparseSetPointerMode(
      thr_entry->cusparse_handle, CUSPARSE_POINTER_MODE_HOST));
  CUSPARSE_CALL(hipsparseSetPointerMode(
      thr_entry->cusparse_handle, HIPSPARSE_POINTER_MODE_HOST));
  CUSPARSE_CALL(cusparseCreateCsrgemm2Info(&info));
  CUSPARSE_CALL(hipsparseCreateCsrgemm2Info(&info));
  cusparseMatDescr_t matA, matB, matC, matD;
  hipsparseMatDescr_t matA, matB, matC, matD;
  CUSPARSE_CALL(cusparseCreateMatDescr(&matA));
  CUSPARSE_CALL(hipsparseCreateMatDescr(&matA));
  CUSPARSE_CALL(cusparseCreateMatDescr(&matB));
  CUSPARSE_CALL(hipsparseCreateMatDescr(&matB));
  CUSPARSE_CALL(cusparseCreateMatDescr(&matC));
  CUSPARSE_CALL(hipsparseCreateMatDescr(&matC));
  CUSPARSE_CALL(cusparseCreateMatDescr(&matD));  // needed even if D is null
  CUSPARSE_CALL(hipsparseCreateMatDescr(&matD));  // needed even if D is null
  CUSPARSE_CALL(CSRGEMM<DType>::bufferSizeExt(
      thr_entry->cusparse_handle, m, n, k, &alpha, matA, nnzA,
...
@@ -252,11 +256,11 @@ std::pair<CSRMatrix, NDArray> CusparseSpgemm(
      C_indptr.Ptr<IdType>(), C_indices.Ptr<IdType>(), info, workspace));
  device->FreeWorkspace(ctx, workspace);
  CUSPARSE_CALL(cusparseDestroyCsrgemm2Info(info));
  CUSPARSE_CALL(hipsparseDestroyCsrgemm2Info(info));
  CUSPARSE_CALL(cusparseDestroyMatDescr(matA));
  CUSPARSE_CALL(hipsparseDestroyMatDescr(matA));
  CUSPARSE_CALL(cusparseDestroyMatDescr(matB));
  CUSPARSE_CALL(hipsparseDestroyMatDescr(matB));
  CUSPARSE_CALL(cusparseDestroyMatDescr(matC));
  CUSPARSE_CALL(hipsparseDestroyMatDescr(matC));
  CUSPARSE_CALL(cusparseDestroyMatDescr(matD));
  CUSPARSE_CALL(hipsparseDestroyMatDescr(matD));
  return {
      CSRMatrix(
...
@@ -264,7 +268,7 @@ std::pair<CSRMatrix, NDArray> CusparseSpgemm(
      C_weights};
}
#endif  // CUDART_VERSION >= 12000
#endif  // DTKRT_VERSION >= 12000
}  // namespace cusparse
template <int XPU, typename IdType, typename DType>
...
@@ -314,9 +318,9 @@ template std::pair<CSRMatrix, NDArray> CSRMM<kDGLCUDA, int32_t, __half>(
template std::pair<CSRMatrix, NDArray> CSRMM<kDGLCUDA, int64_t, __half>(
    const CSRMatrix&, NDArray, const CSRMatrix&, NDArray);
#if BF16_ENABLED
template std::pair<CSRMatrix, NDArray> CSRMM<kDGLCUDA, int32_t, __nv_bfloat16>(
template std::pair<CSRMatrix, NDArray> CSRMM<kDGLCUDA, int32_t, __hip_bfloat16>(
    const CSRMatrix&, NDArray, const CSRMatrix&, NDArray);
template std::pair<CSRMatrix, NDArray> CSRMM<kDGLCUDA, int64_t, __nv_bfloat16>(
template std::pair<CSRMatrix, NDArray> CSRMM<kDGLCUDA, int64_t, __hip_bfloat16>(
    const CSRMatrix&, NDArray, const CSRMatrix&, NDArray);
#endif  // BF16_ENABLED
template std::pair<CSRMatrix, NDArray> CSRMM<kDGLCUDA, int32_t, float>(
...
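Throughout these hipified files the library calls are wrapped in CUSPARSE_CALL / CUDA_CALL / CUBLAS_CALL macros defined in the runtime headers (cuda_common.h), which are not part of this diff. A rough, hypothetical sketch of what such a status-checking wrapper does for hipSPARSE (the real macro's name, message format, and error handling differ) is:

#include <hipsparse/hipsparse.h>
#include <cstdio>
#include <cstdlib>

// Illustrative only: abort with a readable message when a hipSPARSE call fails.
#define CHECK_HIPSPARSE(expr)                                              \
  do {                                                                     \
    hipsparseStatus_t status = (expr);                                     \
    if (status != HIPSPARSE_STATUS_SUCCESS) {                              \
      std::fprintf(stderr, "hipSPARSE error %d at %s:%d\n",                \
                   static_cast<int>(status), __FILE__, __LINE__);          \
      std::abort();                                                        \
    }                                                                      \
  } while (0)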
src/array/cuda/csr_sort.cu → src/array/cuda/csr_sort.hip
View file @ 6ac701f8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
 * Copyright (c) 2020 by Contributors
 * @file array/cuda/csr_sort.cc
...
@@ -5,10 +7,10 @@
 */
#include <dgl/array.h>
#include <cub/cub.cuh>
#include <hipcub/hipcub.hpp>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
#include "utils.h"
namespace dgl {
...
@@ -39,7 +41,7 @@ __global__ void _SegmentIsSorted(
template <DGLDeviceType XPU, typename IdType>
bool CSRIsSorted(CSRMatrix csr) {
  const auto& ctx = csr.indptr->ctx;
  cudaStream_t stream = runtime::getCurrentCUDAStream();
  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  auto device = runtime::DeviceAPI::Get(ctx);
  // We allocate a workspace of num_rows bytes. It wastes a little bit memory
  // but should be fine.
...
@@ -67,12 +69,12 @@ template <>
void CSRSort_<kDGLCUDA, int32_t>(CSRMatrix* csr) {
  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
  auto device = runtime::DeviceAPI::Get(csr->indptr->ctx);
  cudaStream_t stream = runtime::getCurrentCUDAStream();
  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  // allocate cusparse handle if needed
  if (!thr_entry->cusparse_handle) {
    CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
    CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
  }
  CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream));
  CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream));
  NDArray indptr = csr->indptr;
  NDArray indices = csr->indices;
...
@@ -83,16 +85,16 @@ void CSRSort_<kDGLCUDA, int32_t>(CSRMatrix* csr) {
  NDArray data = csr->data;
  size_t workspace_size = 0;
  CUSPARSE_CALL(cusparseXcsrsort_bufferSizeExt(
  CUSPARSE_CALL(hipsparseXcsrsort_bufferSizeExt(
      thr_entry->cusparse_handle, csr->num_rows, csr->num_cols, nnz,
      indptr.Ptr<int32_t>(), indices.Ptr<int32_t>(), &workspace_size));
  void* workspace = device->AllocWorkspace(ctx, workspace_size);
  cusparseMatDescr_t descr;
  hipsparseMatDescr_t descr;
  CUSPARSE_CALL(cusparseCreateMatDescr(&descr));
  CUSPARSE_CALL(hipsparseCreateMatDescr(&descr));
  CUSPARSE_CALL(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  CUSPARSE_CALL(hipsparseSetMatType(descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
  CUSPARSE_CALL(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO));
  CUSPARSE_CALL(hipsparseSetMatIndexBase(descr, HIPSPARSE_INDEX_BASE_ZERO));
  CUSPARSE_CALL(cusparseXcsrsort(
  CUSPARSE_CALL(hipsparseXcsrsort(
      thr_entry->cusparse_handle, csr->num_rows, csr->num_cols, nnz, descr,
      indptr.Ptr<int32_t>(), indices.Ptr<int32_t>(), data.Ptr<int32_t>(),
      workspace));
...
@@ -100,13 +102,13 @@ void CSRSort_<kDGLCUDA, int32_t>(CSRMatrix* csr) {
  csr->sorted = true;
  // free resources
  CUSPARSE_CALL(cusparseDestroyMatDescr(descr));
  CUSPARSE_CALL(hipsparseDestroyMatDescr(descr));
  device->FreeWorkspace(ctx, workspace);
}
template <>
void CSRSort_<kDGLCUDA, int64_t>(CSRMatrix* csr) {
  cudaStream_t stream = runtime::getCurrentCUDAStream();
  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  auto device = runtime::DeviceAPI::Get(csr->indptr->ctx);
  const auto& ctx = csr->indptr->ctx;
...
@@ -125,13 +127,13 @@ void CSRSort_<kDGLCUDA, int64_t>(CSRMatrix* csr) {
  // Allocate workspace
  size_t workspace_size = 0;
  CUDA_CALL(cub::DeviceSegmentedRadixSort::SortPairs(
  CUDA_CALL(hipcub::DeviceSegmentedRadixSort::SortPairs(
      nullptr, workspace_size, key_in, key_out, value_in, value_out, nnz,
      csr->num_rows, offsets, offsets + 1, 0, sizeof(int64_t) * 8, stream));
  void* workspace = device->AllocWorkspace(ctx, workspace_size);
  // Compute
  CUDA_CALL(cub::DeviceSegmentedRadixSort::SortPairs(
  CUDA_CALL(hipcub::DeviceSegmentedRadixSort::SortPairs(
      workspace, workspace_size, key_in, key_out, value_in, value_out, nnz,
      csr->num_rows, offsets, offsets + 1, 0, sizeof(int64_t) * 8, stream));
...
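The int64_t path above relies on hipcub's two-phase convention: the first SortPairs call with a null workspace only reports the required temporary-storage size, the second performs the per-row sort between consecutive indptr offsets. A standalone sketch of that convention with toy data (not DGL's buffers):

#include <hipcub/hipcub.hpp>
#include <hip/hip_runtime.h>
#include <cstdio>

int main() {
  // Two rows: row 0 holds 3 entries, row 1 holds 2 entries (CSR-style offsets).
  int64_t h_keys[5] = {5, 1, 3, 9, 2};
  int64_t h_vals[5] = {0, 1, 2, 3, 4};
  int64_t h_offsets[3] = {0, 3, 5};

  int64_t *d_keys_in, *d_keys_out, *d_vals_in, *d_vals_out, *d_offsets;
  hipMalloc(&d_keys_in, sizeof(h_keys));
  hipMalloc(&d_keys_out, sizeof(h_keys));
  hipMalloc(&d_vals_in, sizeof(h_vals));
  hipMalloc(&d_vals_out, sizeof(h_vals));
  hipMalloc(&d_offsets, sizeof(h_offsets));
  hipMemcpy(d_keys_in, h_keys, sizeof(h_keys), hipMemcpyHostToDevice);
  hipMemcpy(d_vals_in, h_vals, sizeof(h_vals), hipMemcpyHostToDevice);
  hipMemcpy(d_offsets, h_offsets, sizeof(h_offsets), hipMemcpyHostToDevice);

  // Phase 1: query the temporary storage size with a null workspace pointer.
  void* d_temp = nullptr;
  size_t temp_bytes = 0;
  hipcub::DeviceSegmentedRadixSort::SortPairs(
      d_temp, temp_bytes, d_keys_in, d_keys_out, d_vals_in, d_vals_out,
      /*num_items=*/5, /*num_segments=*/2, d_offsets, d_offsets + 1);
  hipMalloc(&d_temp, temp_bytes);

  // Phase 2: run the sort for real; each row is sorted independently.
  hipcub::DeviceSegmentedRadixSort::SortPairs(
      d_temp, temp_bytes, d_keys_in, d_keys_out, d_vals_in, d_vals_out,
      5, 2, d_offsets, d_offsets + 1);

  hipMemcpy(h_keys, d_keys_out, sizeof(h_keys), hipMemcpyDeviceToHost);
  for (int i = 0; i < 5; ++i) std::printf("%lld ", (long long)h_keys[i]);
  std::printf("\n");  // expected: 1 3 5 2 9

  hipFree(d_keys_in); hipFree(d_keys_out); hipFree(d_vals_in);
  hipFree(d_vals_out); hipFree(d_offsets); hipFree(d_temp);
  return 0;
}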
src/array/cuda/csr_sum.cu → src/array/cuda/csr_sum.hip
View file @ 6ac701f8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
 * Copyright (c) 2020 by Contributors
 * @file array/cuda/spmm.cu
...
@@ -7,8 +9,8 @@
#include <dgl/runtime/device_api.h>
#include "../../runtime/cuda/cuda_common.h"
#include "./cusparse_dispatcher.cuh"
#include "cusparse_dispatcher.cuh"
#include "./functor.cuh"
#include "functor.cuh"
namespace dgl {
...
@@ -32,21 +34,21 @@ std::pair<CSRMatrix, NDArray> CusparseCsrgeam2(
  auto ctx = A.indptr->ctx;
  auto device = runtime::DeviceAPI::Get(ctx);
  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
  cudaStream_t stream = runtime::getCurrentCUDAStream();
  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  const DType* A_weights = A_weights_array.Ptr<DType>();
  const DType* B_weights = B_weights_array.Ptr<DType>();
  // allocate cusparse handle if needed
  if (!thr_entry->cusparse_handle)
    CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
    CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
  CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream));
  CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream));
  cusparseMatDescr_t matA, matB, matC;
  hipsparseMatDescr_t matA, matB, matC;
  CUSPARSE_CALL(cusparseCreateMatDescr(&matA));
  CUSPARSE_CALL(hipsparseCreateMatDescr(&matA));
  CUSPARSE_CALL(cusparseCreateMatDescr(&matB));
  CUSPARSE_CALL(hipsparseCreateMatDescr(&matB));
  CUSPARSE_CALL(cusparseCreateMatDescr(&matC));
  CUSPARSE_CALL(hipsparseCreateMatDescr(&matC));
  cusparseSetPointerMode(
      thr_entry->cusparse_handle, CUSPARSE_POINTER_MODE_HOST);
  hipsparseSetPointerMode(
      thr_entry->cusparse_handle, HIPSPARSE_POINTER_MODE_HOST);
  size_t workspace_size = 0;
  /* prepare output C */
  IdArray dC_csrOffsets = IdArray::Empty({m + 1}, A.indptr->dtype, ctx);
...
@@ -81,9 +83,9 @@ std::pair<CSRMatrix, NDArray> CusparseCsrgeam2(
  device->FreeWorkspace(ctx, workspace);
  // destroy matrix/vector descriptors
  CUSPARSE_CALL(cusparseDestroyMatDescr(matA));
  CUSPARSE_CALL(hipsparseDestroyMatDescr(matA));
  CUSPARSE_CALL(cusparseDestroyMatDescr(matB));
  CUSPARSE_CALL(hipsparseDestroyMatDescr(matB));
  CUSPARSE_CALL(cusparseDestroyMatDescr(matC));
  CUSPARSE_CALL(hipsparseDestroyMatDescr(matC));
  return {
      CSRMatrix(
          A.num_rows, A.num_cols, dC_csrOffsets, dC_columns,
...
@@ -159,9 +161,9 @@ template std::pair<CSRMatrix, NDArray> CSRSum<kDGLCUDA, int32_t, __half>(
template std::pair<CSRMatrix, NDArray> CSRSum<kDGLCUDA, int64_t, __half>(
    const std::vector<CSRMatrix>&, const std::vector<NDArray>&);
#if BF16_ENABLED
template std::pair<CSRMatrix, NDArray> CSRSum<kDGLCUDA, int32_t, __nv_bfloat16>(
template std::pair<CSRMatrix, NDArray> CSRSum<kDGLCUDA, int32_t, __hip_bfloat16>(
    const std::vector<CSRMatrix>&, const std::vector<NDArray>&);
template std::pair<CSRMatrix, NDArray> CSRSum<kDGLCUDA, int64_t, __nv_bfloat16>(
template std::pair<CSRMatrix, NDArray> CSRSum<kDGLCUDA, int64_t, __hip_bfloat16>(
    const std::vector<CSRMatrix>&, const std::vector<NDArray>&);
#endif  // BF16_ENABLED
template std::pair<CSRMatrix, NDArray> CSRSum<kDGLCUDA, int32_t, float>(
...
src/array/cuda/csr_transpose.cc
View file @ 6ac701f8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
 * Copyright (c) 2020 by Contributors
 * @file array/cuda/csr_transpose.cc
...
@@ -23,12 +25,12 @@ CSRMatrix CSRTranspose(CSRMatrix csr) {
template <>
CSRMatrix CSRTranspose<kDGLCUDA, int32_t>(CSRMatrix csr) {
  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
  cudaStream_t stream = runtime::getCurrentCUDAStream();
  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  // allocate cusparse handle if needed
  if (!thr_entry->cusparse_handle) {
    CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
    CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
  }
  CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream));
  CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream));
  NDArray indptr = csr.indptr, indices = csr.indices, data = csr.data;
  const int64_t nnz = indices->shape[0];
...
@@ -48,30 +50,30 @@ CSRMatrix CSRTranspose<kDGLCUDA, int32_t>(CSRMatrix csr) {
  int32_t* t_indices_ptr = static_cast<int32_t*>(t_indices->data);
  void* t_data_ptr = t_data->data;
#if CUDART_VERSION >= 10010
#if DTKRT_VERSION >= 10010
  auto device = runtime::DeviceAPI::Get(csr.indptr->ctx);
  // workspace
  size_t workspace_size;
  CUSPARSE_CALL(cusparseCsr2cscEx2_bufferSize(
  CUSPARSE_CALL(hipsparseCsr2cscEx2_bufferSize(
      thr_entry->cusparse_handle, csr.num_rows, csr.num_cols, nnz, data_ptr,
      indptr_ptr, indices_ptr, t_data_ptr, t_indptr_ptr, t_indices_ptr,
      CUDA_R_32F, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO,
      HIP_R_32F, HIPSPARSE_ACTION_NUMERIC, HIPSPARSE_INDEX_BASE_ZERO,
      CUSPARSE_CSR2CSC_ALG1,  // see cusparse doc for reference
      HIPSPARSE_CSR2CSC_ALG1,  // see cusparse doc for reference
      &workspace_size));
  void* workspace = device->AllocWorkspace(ctx, workspace_size);
  CUSPARSE_CALL(cusparseCsr2cscEx2(
  CUSPARSE_CALL(hipsparseCsr2cscEx2(
      thr_entry->cusparse_handle, csr.num_rows, csr.num_cols, nnz, data_ptr,
      indptr_ptr, indices_ptr, t_data_ptr, t_indptr_ptr, t_indices_ptr,
      CUDA_R_32F, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO,
      HIP_R_32F, HIPSPARSE_ACTION_NUMERIC, HIPSPARSE_INDEX_BASE_ZERO,
      CUSPARSE_CSR2CSC_ALG1,  // see cusparse doc for reference
      HIPSPARSE_CSR2CSC_ALG1,  // see cusparse doc for reference
      workspace));
  device->FreeWorkspace(ctx, workspace);
#else
  CUSPARSE_CALL(cusparseScsr2csc(
  CUSPARSE_CALL(hipsparseScsr2csc(
      thr_entry->cusparse_handle, csr.num_rows, csr.num_cols, nnz,
      static_cast<const float*>(data_ptr), indptr_ptr, indices_ptr,
      static_cast<float*>(t_data_ptr), t_indices_ptr, t_indptr_ptr,
      CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO));
      HIPSPARSE_ACTION_NUMERIC, HIPSPARSE_INDEX_BASE_ZERO));
#endif
  return CSRMatrix(
...
src/array/cuda/cuda_filter.cu → src/array/cuda/cuda_filter.hip
View file @ 6ac701f8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
 * Copyright (c) 2021 by Contributors
 * @file array/cuda/cuda_filter.cc
...
@@ -6,7 +8,7 @@
#include <dgl/runtime/device_api.h>
#include <cub/cub.cuh>
#include <hipcub/hipcub.hpp>
#include "../../runtime/cuda/cuda_common.h"
#include "../../runtime/cuda/cuda_hashtable.cuh"
...
@@ -45,7 +47,7 @@ IdArray _PerformFilter(const OrderedHashTable<IdType>& table, IdArray test) {
  const auto& ctx = test->ctx;
  auto device = runtime::DeviceAPI::Get(ctx);
  const int64_t size = test->shape[0];
  cudaStream_t cudaStream = runtime::getCurrentCUDAStream();
  hipStream_t cudaStream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  if (size == 0) {
    return test;
...
@@ -74,12 +76,12 @@ IdArray _PerformFilter(const OrderedHashTable<IdType>& table, IdArray test) {
  // generate prefix-sum
  {
    size_t workspace_bytes;
    CUDA_CALL(cub::DeviceScan::ExclusiveSum(
    CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
        nullptr, workspace_bytes, static_cast<IdType*>(nullptr),
        static_cast<IdType*>(nullptr), size + 1, cudaStream));
    void* workspace = device->AllocWorkspace(ctx, workspace_bytes);
    CUDA_CALL(cub::DeviceScan::ExclusiveSum(
    CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
        workspace, workspace_bytes, prefix, prefix, size + 1, cudaStream));
    device->FreeWorkspace(ctx, workspace);
  }
...
@@ -108,8 +110,8 @@ template <typename IdType>
class CudaFilterSet : public Filter {
 public:
  explicit CudaFilterSet(IdArray array)
      : table_(array->shape[0], array->ctx, runtime::getCurrentCUDAStream()) {
      : table_(array->shape[0], array->ctx, runtime::getCurrentHIPStreamMasqueradingAsCUDA()) {
    cudaStream_t cudaStream = runtime::getCurrentCUDAStream();
    hipStream_t cudaStream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
    table_.FillWithUnique(
        static_cast<const IdType*>(array->data), array->shape[0], cudaStream);
  }
...
src/array/cuda/cusparse_dispatcher.cuh
View file @ 6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
 * Copyright (c) 2020 by Contributors
 * @file array/cuda/dispatcher.cuh
...
@@ -7,7 +8,7 @@
#ifndef DGL_ARRAY_CUDA_CUSPARSE_DISPATCHER_CUH_
#define DGL_ARRAY_CUDA_CUSPARSE_DISPATCHER_CUH_
#include <cusparse.h>
#include <hipsparse/hipsparse.h>
#include <dgl/runtime/c_runtime_api.h>
#include "bf16.cuh"
...
@@ -20,70 +21,70 @@ namespace aten {
template <typename DType>
struct CSRGEMM {
  template <typename... Args>
  static inline cusparseStatus_t bufferSizeExt(Args... args) {
  static inline hipsparseStatus_t bufferSizeExt(Args... args) {
    BUG_IF_FAIL(false) << "This piece of code should not be reached.";
    return static_cast<cusparseStatus_t>(0);
    return static_cast<hipsparseStatus_t>(0);
  }
  template <typename... Args>
  static inline cusparseStatus_t nnz(Args... args) {
  static inline hipsparseStatus_t nnz(Args... args) {
    return cusparseXcsrgemm2Nnz(args...);
    return hipsparseXcsrgemm2Nnz(args...);
  }
  template <typename... Args>
  static inline cusparseStatus_t compute(Args... args) {
  static inline hipsparseStatus_t compute(Args... args) {
    BUG_IF_FAIL(false) << "This piece of code should not be reached.";
    return static_cast<cusparseStatus_t>(0);
    return static_cast<hipsparseStatus_t>(0);
  }
};
template <>
struct CSRGEMM<__half> {
  template <typename... Args>
  static inline cusparseStatus_t bufferSizeExt(Args... args) {
  static inline hipsparseStatus_t bufferSizeExt(Args... args) {
    // TODO(ndickson): There is no cusparseHcsrgemm2_bufferSizeExt, so a
    // different implementation would be required.
    LOG(FATAL) << "CSRGEMM::bufferSizeExt does not support dtype half (FP16).";
    return static_cast<cusparseStatus_t>(0);
    return static_cast<hipsparseStatus_t>(0);
  }
  template <typename... Args>
  static inline cusparseStatus_t nnz(Args... args) {
  static inline hipsparseStatus_t nnz(Args... args) {
    return cusparseXcsrgemm2Nnz(args...);
    return hipsparseXcsrgemm2Nnz(args...);
  }
  template <typename... Args>
  static inline cusparseStatus_t compute(Args... args) {
  static inline hipsparseStatus_t compute(Args... args) {
    // TODO(ndickson): There is no cusparseHcsrgemm2, so a different
    // implementation would be required.
    LOG(FATAL) << "CSRGEMM::compute does not support dtype half (FP16).";
    return static_cast<cusparseStatus_t>(0);
    return static_cast<hipsparseStatus_t>(0);
  }
};
#if BF16_ENABLED
template <>
struct CSRGEMM<__nv_bfloat16> {
struct CSRGEMM<__hip_bfloat16> {
  template <typename... Args>
  static inline cusparseStatus_t bufferSizeExt(Args... args) {
  static inline hipsparseStatus_t bufferSizeExt(Args... args) {
    // TODO(ndickson): There is no cusparseHcsrgemm2_bufferSizeExt, so a
    // different implementation would be required.
    LOG(FATAL)
        << "CSRGEMM::bufferSizeExt does not support dtype bfloat16 (BF16).";
    return static_cast<cusparseStatus_t>(0);
    return static_cast<hipsparseStatus_t>(0);
  }
  template <typename... Args>
  static inline cusparseStatus_t nnz(Args... args) {
  static inline hipsparseStatus_t nnz(Args... args) {
    return cusparseXcsrgemm2Nnz(args...);
    return hipsparseXcsrgemm2Nnz(args...);
  }
  template <typename... Args>
  static inline cusparseStatus_t compute(Args... args) {
  static inline hipsparseStatus_t compute(Args... args) {
    // TODO(ndickson): There is no cusparseHcsrgemm2, so a different
    // implementation would be required.
    LOG(FATAL) << "CSRGEMM::compute does not support dtype bfloat16 (BF16).";
    return static_cast<cusparseStatus_t>(0);
    return static_cast<hipsparseStatus_t>(0);
  }
};
#endif  // BF16_ENABLED
...
@@ -91,36 +92,36 @@ struct CSRGEMM<__nv_bfloat16> {
template <>
struct CSRGEMM<float> {
  template <typename... Args>
  static inline cusparseStatus_t bufferSizeExt(Args... args) {
  static inline hipsparseStatus_t bufferSizeExt(Args... args) {
    return cusparseScsrgemm2_bufferSizeExt(args...);
    return hipsparseScsrgemm2_bufferSizeExt(args...);
  }
  template <typename... Args>
  static inline cusparseStatus_t nnz(Args... args) {
  static inline hipsparseStatus_t nnz(Args... args) {
    return cusparseXcsrgemm2Nnz(args...);
    return hipsparseXcsrgemm2Nnz(args...);
  }
  template <typename... Args>
  static inline cusparseStatus_t compute(Args... args) {
  static inline hipsparseStatus_t compute(Args... args) {
    return cusparseScsrgemm2(args...);
    return hipsparseScsrgemm2(args...);
  }
};
template <>
struct CSRGEMM<double> {
  template <typename... Args>
  static inline cusparseStatus_t bufferSizeExt(Args... args) {
  static inline hipsparseStatus_t bufferSizeExt(Args... args) {
    return cusparseDcsrgemm2_bufferSizeExt(args...);
    return hipsparseDcsrgemm2_bufferSizeExt(args...);
  }
  template <typename... Args>
  static inline cusparseStatus_t nnz(Args... args) {
  static inline hipsparseStatus_t nnz(Args... args) {
    return cusparseXcsrgemm2Nnz(args...);
    return hipsparseXcsrgemm2Nnz(args...);
  }
  template <typename... Args>
  static inline cusparseStatus_t compute(Args... args) {
  static inline hipsparseStatus_t compute(Args... args) {
    return cusparseDcsrgemm2(args...);
    return hipsparseDcsrgemm2(args...);
  }
};
...
@@ -128,70 +129,70 @@ struct CSRGEMM<double> {
template <typename DType>
struct CSRGEAM {
  template <typename... Args>
  static inline cusparseStatus_t bufferSizeExt(Args... args) {
  static inline hipsparseStatus_t bufferSizeExt(Args... args) {
    BUG_IF_FAIL(false) << "This piece of code should not be reached.";
    return static_cast<cusparseStatus_t>(0);
    return static_cast<hipsparseStatus_t>(0);
  }
  template <typename... Args>
  static inline cusparseStatus_t nnz(Args... args) {
  static inline hipsparseStatus_t nnz(Args... args) {
    return cusparseXcsrgeam2Nnz(args...);
    return hipsparseXcsrgeam2Nnz(args...);
  }
  template <typename... Args>
  static inline cusparseStatus_t compute(Args... args) {
  static inline hipsparseStatus_t compute(Args... args) {
    BUG_IF_FAIL(false) << "This piece of code should not be reached.";
    return static_cast<cusparseStatus_t>(0);
    return static_cast<hipsparseStatus_t>(0);
  }
};
template <>
struct CSRGEAM<__half> {
  template <typename... Args>
  static inline cusparseStatus_t bufferSizeExt(Args... args) {
  static inline hipsparseStatus_t bufferSizeExt(Args... args) {
    // TODO(ndickson): There is no cusparseHcsrgeam2_bufferSizeExt, so a
    // different implementation would be required.
    LOG(FATAL) << "CSRGEAM::bufferSizeExt does not support dtype half (FP16).";
    return static_cast<cusparseStatus_t>(0);
    return static_cast<hipsparseStatus_t>(0);
  }
  template <typename... Args>
  static inline cusparseStatus_t nnz(Args... args) {
  static inline hipsparseStatus_t nnz(Args... args) {
    return cusparseXcsrgeam2Nnz(args...);
    return hipsparseXcsrgeam2Nnz(args...);
  }
  template <typename... Args>
  static inline cusparseStatus_t compute(Args... args) {
  static inline hipsparseStatus_t compute(Args... args) {
    // TODO(ndickson): There is no cusparseHcsrgeam2, so a different
    // implementation would be required.
    LOG(FATAL) << "CSRGEAM::compute does not support dtype half (FP16).";
    return static_cast<cusparseStatus_t>(0);
    return static_cast<hipsparseStatus_t>(0);
  }
};
#if BF16_ENABLED
template <>
struct CSRGEAM<__nv_bfloat16> {
struct CSRGEAM<__hip_bfloat16> {
  template <typename... Args>
  static inline cusparseStatus_t bufferSizeExt(Args... args) {
  static inline hipsparseStatus_t bufferSizeExt(Args... args) {
    // TODO(ndickson): There is no cusparseHcsrgeam2_bufferSizeExt, so a
    // different implementation would be required.
    LOG(FATAL)
        << "CSRGEAM::bufferSizeExt does not support dtype bfloat16 (BF16).";
    return static_cast<cusparseStatus_t>(0);
    return static_cast<hipsparseStatus_t>(0);
  }
  template <typename... Args>
  static inline cusparseStatus_t nnz(Args... args) {
  static inline hipsparseStatus_t nnz(Args... args) {
    return cusparseXcsrgeam2Nnz(args...);
    return hipsparseXcsrgeam2Nnz(args...);
  }
  template <typename... Args>
  static inline cusparseStatus_t compute(Args... args) {
  static inline hipsparseStatus_t compute(Args... args) {
    // TODO(ndickson): There is no cusparseHcsrgeam2, so a different
    // implementation would be required.
    LOG(FATAL) << "CSRGEAM::compute does not support dtype bfloat16 (BF16).";
    return static_cast<cusparseStatus_t>(0);
    return static_cast<hipsparseStatus_t>(0);
  }
};
#endif  // BF16_ENABLED
...
@@ -199,36 +200,36 @@ struct CSRGEAM<__nv_bfloat16> {
template <>
struct CSRGEAM<float> {
  template <typename... Args>
  static inline cusparseStatus_t bufferSizeExt(Args... args) {
  static inline hipsparseStatus_t bufferSizeExt(Args... args) {
    return cusparseScsrgeam2_bufferSizeExt(args...);
    return hipsparseScsrgeam2_bufferSizeExt(args...);
  }
  template <typename... Args>
  static inline cusparseStatus_t nnz(Args... args) {
  static inline hipsparseStatus_t nnz(Args... args) {
    return cusparseXcsrgeam2Nnz(args...);
    return hipsparseXcsrgeam2Nnz(args...);
  }
  template <typename... Args>
  static inline cusparseStatus_t compute(Args... args) {
  static inline hipsparseStatus_t compute(Args... args) {
    return cusparseScsrgeam2(args...);
    return hipsparseScsrgeam2(args...);
  }
};
template <>
struct CSRGEAM<double> {
  template <typename... Args>
  static inline cusparseStatus_t bufferSizeExt(Args... args) {
  static inline hipsparseStatus_t bufferSizeExt(Args... args) {
    return cusparseDcsrgeam2_bufferSizeExt(args...);
    return hipsparseDcsrgeam2_bufferSizeExt(args...);
  }
  template <typename... Args>
  static inline cusparseStatus_t nnz(Args... args) {
  static inline hipsparseStatus_t nnz(Args... args) {
    return cusparseXcsrgeam2Nnz(args...);
    return hipsparseXcsrgeam2Nnz(args...);
  }
  template <typename... Args>
  static inline cusparseStatus_t compute(Args... args) {
  static inline hipsparseStatus_t compute(Args... args) {
    return cusparseDcsrgeam2(args...);
    return hipsparseDcsrgeam2(args...);
  }
};
...
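CSRGEMM and CSRGEAM are thin compile-time dispatchers: call sites such as CSRGEMM<DType>::bufferSizeExt in csr_mm pick the right hipSPARSE entry point purely from the weight dtype, while unsupported dtypes fail loudly at runtime. A stripped-down, self-contained illustration of the same pattern, using made-up backend functions instead of the real hipSPARSE API:

#include <cstdio>

// Hypothetical backend entry points standing in for hipsparseS/Dcsrgemm2.
int BackendComputeFloat(int n) { std::printf("float path, n=%d\n", n); return 0; }
int BackendComputeDouble(int n) { std::printf("double path, n=%d\n", n); return 0; }

template <typename DType>
struct Gemm {  // primary template: unsupported dtypes fail loudly
  template <typename... Args>
  static int compute(Args...) { std::printf("unsupported dtype\n"); return -1; }
};

template <>
struct Gemm<float> {
  template <typename... Args>
  static int compute(Args... args) { return BackendComputeFloat(args...); }
};

template <>
struct Gemm<double> {
  template <typename... Args>
  static int compute(Args... args) { return BackendComputeDouble(args...); }
};

int main() {
  Gemm<float>::compute(8);   // resolves to the float backend at compile time
  Gemm<double>::compute(8);  // resolves to the double backend
  return 0;
}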
src/array/cuda/disjoint_union.cu → src/array/cuda/disjoint_union.hip
View file @ 6ac701f8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
 * Copyright (c) 2022, NVIDIA CORPORATION.
 *
...
@@ -24,7 +26,7 @@
#include <vector>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
#include "utils.h"
namespace dgl {
using runtime::NDArray;
...
@@ -78,7 +80,7 @@ std::tuple<IdArray, IdArray, IdArray> _ComputePrefixSums(
template <DGLDeviceType XPU, typename IdType>
void _Merge(
    IdType** arrs, IdType* prefix, IdType* offset, IdType* out, int64_t n_arrs,
    int n_elms, DGLContext ctx, DGLDataType dtype, cudaStream_t stream) {
    int n_elms, DGLContext ctx, DGLDataType dtype,
    hipStream_t stream) {
  auto device = runtime::DeviceAPI::Get(ctx);
  int nt = 256;
  int nb = (n_elms + nt - 1) / nt;
...
@@ -99,7 +101,7 @@ void _Merge(
template <DGLDeviceType XPU, typename IdType>
COOMatrix DisjointUnionCoo(const std::vector<COOMatrix>& coos) {
  cudaStream_t stream = runtime::getCurrentCUDAStream();
  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  auto device = runtime::DeviceAPI::Get(coos[0].row->ctx);
  uint64_t src_offset = 0, dst_offset = 0;
  bool has_data = false;
...
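DisjointUnionCoo concatenates several COO graphs by first computing exclusive prefix sums of the per-graph sizes (_ComputePrefixSums) and then letting each input graph write its relabelled edges at a fixed offset. A host-side C++ sketch of just that offset arithmetic, with illustrative values and no GPU code:

#include <cstdio>
#include <vector>

int main() {
  // Number of nodes in each input graph (illustrative values).
  std::vector<int> num_nodes = {3, 5, 2};

  // Exclusive prefix sum: node_offset[i] is added to every node id of graph i.
  std::vector<int> node_offset(num_nodes.size() + 1, 0);
  for (size_t i = 0; i < num_nodes.size(); ++i)
    node_offset[i + 1] = node_offset[i] + num_nodes[i];

  // Relabel one edge (u, v) of graph i into the disjoint-union id space.
  auto relabel = [&](size_t i, int u, int v) {
    std::printf("graph %zu edge (%d, %d) -> (%d, %d)\n", i, u, v,
                u + node_offset[i], v + node_offset[i]);
  };
  relabel(0, 0, 2);  // stays (0, 2)
  relabel(1, 0, 4);  // becomes (3, 7)
  relabel(2, 1, 0);  // becomes (9, 8)
  return 0;
}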
src/array/cuda/fp16.cuh
View file @ 6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
 * Copyright (c) 2020-2022 by Contributors
 *
...
@@ -21,12 +22,12 @@
#ifndef DGL_ARRAY_CUDA_FP16_CUH_
#define DGL_ARRAY_CUDA_FP16_CUH_
#include <cuda_fp16.h>
#include <hip/hip_fp16.h>
#include <algorithm>
static __device__ __forceinline__ half max(half a, half b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
#if defined(__HIP_DEVICE_COMPILE__)
  return __hgt(__half(a), __half(b)) ? a : b;
#else
  return __half(max(float(a), float(b)));  // NOLINT
...
@@ -34,19 +35,19 @@ static __device__ __forceinline__ half max(half a, half b) {
}
static __device__ __forceinline__ half min(half a, half b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
#if defined(__HIP_DEVICE_COMPILE__)
  return __hlt(__half(a), __half(b)) ? a : b;
#else
  return __half(min(float(a), float(b)));  // NOLINT
#endif
}
#if 0
#ifdef __CUDACC__
#ifdef __HIPCC__
// Arithmetic FP16 operations for architecture >= 5.3 are already defined in
// cuda_fp16.h
// hip/hip_fp16.h
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 530)
#if defined(__HIP_DEVICE_COMPILE__)
// CUDA 12.2 adds "emulated" support for older architectures.
#if defined(CUDART_VERSION) && (CUDART_VERSION < 12020)
#if defined(DTKRT_VERSION) && (DTKRT_VERSION < 12020)
__device__ __forceinline__ __half
operator+(const __half& lh, const __half& rh) {
  return __half(float(lh) + float(rh));  // NOLINT
...
@@ -127,8 +128,8 @@ __device__ __forceinline__ bool operator>=(const __half& lh, const __half& rh) {
__device__ __forceinline__ bool operator<=(const __half& lh, const __half& rh) {
  return float(lh) <= float(rh);  // NOLINT
}
#endif  // defined(CUDART_VERSION) && (CUDART_VERSION < 12020)
#endif  // defined(DTKRT_VERSION) && (DTKRT_VERSION < 12020)
#endif  // defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 530)
#endif  // defined(__HIP_DEVICE_COMPILE__)
#endif  // __CUDACC__
#endif  // __HIPCC__
#endif
#endif  // DGL_ARRAY_CUDA_FP16_CUH_
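The half max/min overloads above follow one pattern: use the native __hgt/__hlt intrinsics when compiling device code, otherwise widen to float and compare there. A minimal standalone sketch of the same idea (not DGL's header), assuming only <hip/hip_fp16.h>:

#include <hip/hip_fp16.h>

// Return the larger of two half values: intrinsic compare on the device pass,
// float fallback otherwise.
static __device__ __forceinline__ __half HalfMax(__half a, __half b) {
#if defined(__HIP_DEVICE_COMPILE__)
  return __hgt(a, b) ? a : b;
#else
  float fa = __half2float(a), fb = __half2float(b);
  return __float2half(fa > fb ? fa : fb);
#endif
}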
src/array/cuda/functor.cuh
View file @ 6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
 * Copyright (c) 2020 by Contributors
 * @file array/cuda/functor.cuh
...
@@ -9,8 +10,8 @@
#include <cmath>
#include <limits>
#include "./atomic.cuh"
#include "atomic.cuh"
#include "./fp16.cuh"
#include "fp16.cuh"
#include "bf16.cuh"
namespace dgl {
...
@@ -208,29 +209,29 @@ struct Sum<Idx, __half, atomic> : _Sum<Idx, __half, atomic> {
#if BF16_ENABLED
template <typename Idx, bool atomic>
struct Sum<Idx, __nv_bfloat16, atomic> : _Sum<Idx, __nv_bfloat16, atomic> {
struct Sum<Idx, __hip_bfloat16, atomic> : _Sum<Idx, __hip_bfloat16, atomic> {
  static constexpr __host__ __device__ __forceinline__ __nv_bfloat16 zero() {
  static constexpr __host__ __device__ __forceinline__ __hip_bfloat16 zero() {
    return __float2bfloat16_rn(0.);
    return __float2bfloat16(0.);
  }
  static __device__ __forceinline__ void Call(
      __nv_bfloat16* out_buf, Idx* arg_u_buf, Idx* arg_e_buf,
      __hip_bfloat16* out_buf, Idx* arg_u_buf, Idx* arg_e_buf,
      __nv_bfloat16 val, Idx uid, Idx eid) {
      __hip_bfloat16 val, Idx uid, Idx eid) {
    _Sum<Idx, __nv_bfloat16, atomic>::Call(
    _Sum<Idx, __hip_bfloat16, atomic>::Call(
        out_buf, arg_u_buf, arg_e_buf, val, uid, eid);
  }
  static __device__ __forceinline__ void Call(
      __nv_bfloat16* out_buf, Idx* arg_buf, __nv_bfloat16 val, Idx id) {
      __hip_bfloat16* out_buf, Idx* arg_buf, __hip_bfloat16 val, Idx id) {
    _Sum<Idx, __nv_bfloat16, atomic>::Call(out_buf, arg_buf, val, id);
    _Sum<Idx, __hip_bfloat16, atomic>::Call(out_buf, arg_buf, val, id);
  }
  // sometimes we have to use float in reduction for better precision
  static __device__ __forceinline__ void Call(
      float* out_buf, Idx* arg_u_buf, Idx* arg_e_buf,
      __nv_bfloat16 val, Idx uid, Idx eid) {
      __hip_bfloat16 val, Idx uid, Idx eid) {
    _Sum<Idx, float, atomic>::Call(out_buf, arg_u_buf, arg_e_buf,
        static_cast<float>(val), uid, eid);
  }
  static __device__ __forceinline__ void Call(
      float* out_buf, Idx* arg_buf, __nv_bfloat16 val, Idx id) {
      float* out_buf, Idx* arg_buf, __hip_bfloat16 val, Idx id) {
    _Sum<Idx, float, atomic>::Call(out_buf, arg_buf,
        static_cast<float>(val), id);
  }
...
@@ -313,29 +314,29 @@ struct Max<Idx, __half, atomic> : _Max<Idx, __half, atomic> {
#if BF16_ENABLED
template <typename Idx, bool atomic>
struct Max<Idx, __nv_bfloat16, atomic> : _Max<Idx, __nv_bfloat16, atomic> {
struct Max<Idx, __hip_bfloat16, atomic> : _Max<Idx, __hip_bfloat16, atomic> {
  static constexpr __host__ __device__ __forceinline__ __nv_bfloat16 zero() {
  static constexpr __host__ __device__ __forceinline__ __hip_bfloat16 zero() {
    return __float2bfloat16_rn(-std::numeric_limits<float>::infinity());
    return __float2bfloat16(-std::numeric_limits<float>::infinity());
  }
  static __device__ __forceinline__ void Call(
      __nv_bfloat16* out_buf, Idx* arg_u_buf, Idx* arg_e_buf,
      __hip_bfloat16* out_buf, Idx* arg_u_buf, Idx* arg_e_buf,
      __nv_bfloat16 val, Idx uid, Idx eid) {
      __hip_bfloat16 val, Idx uid, Idx eid) {
    _Max<Idx, __nv_bfloat16, atomic>::Call(
    _Max<Idx, __hip_bfloat16, atomic>::Call(
        out_buf, arg_u_buf, arg_e_buf, val, uid, eid);
  }
  static __device__ __forceinline__ void Call(
      __nv_bfloat16* out_buf, Idx* arg_buf, __nv_bfloat16 val, Idx id) {
      __hip_bfloat16* out_buf, Idx* arg_buf, __hip_bfloat16 val, Idx id) {
    _Max<Idx, __nv_bfloat16, atomic>::Call(out_buf, arg_buf, val, id);
    _Max<Idx, __hip_bfloat16, atomic>::Call(out_buf, arg_buf, val, id);
  }
  // sometimes we have to use float in reduction for better precision
  static __device__ __forceinline__ void Call(
      float* out_buf, Idx* arg_u_buf, Idx* arg_e_buf,
      __nv_bfloat16 val, Idx uid, Idx eid) {
      __hip_bfloat16 val, Idx uid, Idx eid) {
    _Max<Idx, float, atomic>::Call(out_buf, arg_u_buf, arg_e_buf,
        static_cast<float>(val), uid, eid);
  }
  static __device__ __forceinline__ void Call(
      float* out_buf, Idx* arg_buf, __nv_bfloat16 val, Idx id) {
      float* out_buf, Idx* arg_buf, __hip_bfloat16 val, Idx id) {
    _Max<Idx, float, atomic>::Call(out_buf, arg_buf,
        static_cast<float>(val), id);
  }
...
@@ -418,29 +419,29 @@ struct Min<Idx, __half, atomic> : _Min<Idx, __half, atomic> {
#if BF16_ENABLED
template <typename Idx, bool atomic>
struct Min<Idx, __nv_bfloat16, atomic> : _Min<Idx, __nv_bfloat16, atomic> {
struct Min<Idx, __hip_bfloat16, atomic> : _Min<Idx, __hip_bfloat16, atomic> {
  static constexpr __host__ __device__ __forceinline__ __nv_bfloat16 zero() {
  static constexpr __host__ __device__ __forceinline__ __hip_bfloat16 zero() {
    return __float2bfloat16_rn(std::numeric_limits<float>::infinity());
    return __float2bfloat16(std::numeric_limits<float>::infinity());
  }
  static __device__ __forceinline__ void Call(
      __nv_bfloat16* out_buf, Idx* arg_u_buf, Idx* arg_e_buf,
      __hip_bfloat16* out_buf, Idx* arg_u_buf, Idx* arg_e_buf,
      __nv_bfloat16 val, Idx uid, Idx eid) {
      __hip_bfloat16 val, Idx uid, Idx eid) {
    _Min<Idx, __nv_bfloat16, atomic>::Call(
    _Min<Idx, __hip_bfloat16, atomic>::Call(
        out_buf, arg_u_buf, arg_e_buf, val, uid, eid);
  }
  static __device__ __forceinline__ void Call(
      __nv_bfloat16* out_buf, Idx* arg_buf, __nv_bfloat16 val, Idx id) {
      __hip_bfloat16* out_buf, Idx* arg_buf, __hip_bfloat16 val, Idx id) {
    _Min<Idx, __nv_bfloat16, atomic>::Call(out_buf, arg_buf, val, id);
    _Min<Idx, __hip_bfloat16, atomic>::Call(out_buf, arg_buf, val, id);
  }
  // sometimes we have to use float in reduction for better precision
  static __device__ __forceinline__ void Call(
      float* out_buf, Idx* arg_u_buf, Idx* arg_e_buf,
      __nv_bfloat16 val, Idx uid, Idx eid) {
      __hip_bfloat16 val, Idx uid, Idx eid) {
    _Min<Idx, float, atomic>::Call(out_buf, arg_u_buf, arg_e_buf,
        static_cast<float>(val), uid, eid);
  }
  static __device__ __forceinline__ void Call(
      float* out_buf, Idx* arg_buf, __nv_bfloat16 val, Idx id) {
      float* out_buf, Idx* arg_buf, __hip_bfloat16 val, Idx id) {
    _Min<Idx, float, atomic>::Call(out_buf, arg_buf,
        static_cast<float>(val), id);
  }
...
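The float-output Call overloads above exist because bfloat16 keeps only 8 mantissa bits, so long reductions are accumulated in float and narrowed once at the end. A minimal device-side sketch of that pattern (not DGL's actual reducer), assuming the hipified <hip/hip_bf16.h> conversions:

#include <hip/hip_bf16.h>

// "Accumulate in float, convert once": widen each bf16 term, add in full
// precision, and narrow only the final result.
__device__ __forceinline__ __hip_bfloat16 SumRowBF16(
    const __hip_bfloat16* vals, int n) {
  float acc = 0.f;                       // full-precision accumulator
  for (int i = 0; i < n; ++i)
    acc += __bfloat162float(vals[i]);    // widen each term before adding
  return __float2bfloat16(acc);          // narrow only the final result
}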
src/array/cuda/gather_mm.cu → src/array/cuda/gather_mm.hip
View file @ 6ac701f8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
 * Copyright (c) 2020 by Contributors
 * @file array/cuda/gather_mm.cu
...
@@ -7,9 +9,9 @@
#include <algorithm>  // std::swap
#include "./atomic.cuh"
#include "atomic.cuh"
#include "./functor.cuh"
#include "functor.cuh"
#include "./utils.h"
#include "utils.h"
namespace dgl {
using namespace cuda;
...
@@ -20,54 +22,63 @@ namespace {
/** @brief Call cuBLAS GEMM API for dense matmul operation for float and double.
 */
template <typename DType>
cublasStatus_t cublasGemm(
hipblasStatus_t cublasGemm(
    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
    int m, int n, int k, const DType* alpha, const DType* A, int lda,
    const DType* B, int ldb, const DType* beta, DType* C, int ldc) {
  LOG(INFO) << "Not supported dtype";
  return CUBLAS_STATUS_EXECUTION_FAILED;
  return HIPBLAS_STATUS_EXECUTION_FAILED;
}
template <>
cublasStatus_t cublasGemm<__half>(
hipblasStatus_t cublasGemm<__half>(
    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
    int m, int n, int k, const __half* alpha, const __half* A, int lda,
    const __half* B, int ldb, const __half* beta, __half* C, int ldc) {
  return cublasHgemm(
  return hipblasHgemm(
      handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
      handle, transa, transb, m, n, k,
      (hipblasHalf*)alpha, (hipblasHalf*)A, lda, (hipblasHalf*)B, ldb, (hipblasHalf*)beta, (hipblasHalf*)C, ldc);
}
// template <>
// hipblasStatus_t cublasGemm<__half>(
//     hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
//     int m, int n, int k, const __half* alpha, const __half* A, int lda,
//     const __half* B, int ldb, const __half* beta, __half* C, int ldc) {
//   return hipblasHgemm(
//       handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
// }
#if BF16_ENABLED
template <>
cublasStatus_t cublasGemm<__nv_bfloat16>(
hipblasStatus_t cublasGemm<__hip_bfloat16>(
    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
    int m, int n, int k, const __nv_bfloat16* alpha, const __nv_bfloat16* A,
    int m, int n, int k, const __hip_bfloat16* alpha, const __hip_bfloat16* A,
    int lda, const __nv_bfloat16* B, int ldb, const __nv_bfloat16* beta,
    int lda, const __hip_bfloat16* B, int ldb, const __hip_bfloat16* beta,
    __nv_bfloat16* C, int ldc) {
    __hip_bfloat16* C, int ldc) {
  float alpha_float = __bfloat162float(*alpha);
  float beta_float = __bfloat162float(*beta);
  return cublasGemmEx(
  return hipblasGemmEx(
      handle, transa, transb, m, n, k, &alpha_float, A, CUDA_R_16BF, lda, B,
      handle, transa, transb, m, n, k, &alpha_float, A, HIPBLAS_R_16B, lda, B,
      CUDA_R_16BF, ldb, &beta_float, C, CUDA_R_16BF, ldc, CUBLAS_COMPUTE_32F,
      HIPBLAS_R_16B, ldb, &beta_float, C, HIPBLAS_R_16B, ldc, HIPBLAS_R_32F,
      CUBLAS_GEMM_DEFAULT_TENSOR_OP);
      HIPBLAS_GEMM_DEFAULT);
}
#endif  // BF16_ENABLED
template <>
cublasStatus_t cublasGemm<float>(
hipblasStatus_t cublasGemm<float>(
    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
    int m, int n, int k, const float* alpha, const float* A, int lda,
    const float* B, int ldb, const float* beta, float* C, int ldc) {
  return cublasSgemm(
  return hipblasSgemm(
      handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
}
template <>
cublasStatus_t cublasGemm<double>(
hipblasStatus_t cublasGemm<double>(
    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
    int m, int n, int k, const double* alpha, const double* A, int lda,
    const double* B, int ldb, const double* beta, double* C, int ldc) {
  return cublasDgemm(
  return hipblasDgemm(
      handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
}
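hipBLAS, like cuBLAS, assumes column-major storage, which is why SegmentMM further down juggles ldb = n, lda = k, ldc = n: a row-major C = A x B is typically obtained by asking the library for C^T = B^T x A^T. A standalone hipblasSgemm sketch of that convention (toy matrices, not DGL code):

#include <hipblas/hipblas.h>
#include <hip/hip_runtime.h>
#include <cstdio>

int main() {
  // Row-major A (2x3) and B (3x2); we want row-major C = A * B (2x2).
  const int M = 2, K = 3, N = 2;
  float hA[M * K] = {1, 2, 3, 4, 5, 6};
  float hB[K * N] = {1, 0, 0, 1, 1, 1};
  float hC[M * N] = {0};

  float *dA, *dB, *dC;
  hipMalloc(&dA, sizeof(hA));
  hipMalloc(&dB, sizeof(hB));
  hipMalloc(&dC, sizeof(hC));
  hipMemcpy(dA, hA, sizeof(hA), hipMemcpyHostToDevice);
  hipMemcpy(dB, hB, sizeof(hB), hipMemcpyHostToDevice);

  hipblasHandle_t handle;
  hipblasCreate(&handle);
  const float alpha = 1.f, beta = 0.f;
  // Column-major GEMM computing C^T = B^T * A^T: pass B first, A second.
  hipblasSgemm(handle, HIPBLAS_OP_N, HIPBLAS_OP_N, N, M, K,
               &alpha, dB, N, dA, K, &beta, dC, N);

  hipMemcpy(hC, dC, sizeof(hC), hipMemcpyDeviceToHost);
  std::printf("%g %g\n%g %g\n", hC[0], hC[1], hC[2], hC[3]);
  // expected: 4 5 / 10 11

  hipblasDestroy(handle);
  hipFree(dA); hipFree(dB); hipFree(dC);
  return 0;
}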
...
@@ -108,7 +119,7 @@ __global__ void GatherMMScatterKernel(
...
@@ -108,7 +119,7 @@ __global__ void GatherMMScatterKernel(
// Load A in shared mem in a coalesced way
// Load A in shared mem in a coalesced way
for (unsigned int l = laneId; l < a_tile; l += 32)
for (unsigned int l = laneId; l < a_tile; l += 32)
sh_A[local_row * sh_a_tile + l] = A[cur_rowA * in_len + (k_start + l)];
sh_A[local_row * sh_a_tile + l] = A[cur_rowA * in_len + (k_start + l)];
__syncwarp();
// __syncwarp();
for (unsigned int outloop = 0; outloop < out_len; outloop += 32) {
for (unsigned int outloop = 0; outloop < out_len; outloop += 32) {
DType out_reg = static_cast<DType>(0.0f); // thread private
DType out_reg = static_cast<DType>(0.0f); // thread private
...
@@ -165,7 +176,7 @@ __global__ void GatherMMScatterKernel2(
...
@@ -165,7 +176,7 @@ __global__ void GatherMMScatterKernel2(
/* Load A in shared mem in a coalesced way */
/* Load A in shared mem in a coalesced way */
for (unsigned int l = laneId; l < a_tile; l += 32)
for (unsigned int l = laneId; l < a_tile; l += 32)
sh_A[local_row * sh_a_tile + l] = A[row_a * in_len + (k_start + l)];
sh_A[local_row * sh_a_tile + l] = A[row_a * in_len + (k_start + l)];
__syncwarp();
// __syncwarp();
for (unsigned int outloop = 0; outloop < out_len; outloop += 32) {
for (unsigned int outloop = 0; outloop < out_len; outloop += 32) {
DType out_reg = static_cast<DType>(0.0f); // thread private
DType out_reg = static_cast<DType>(0.0f); // thread private
...
@@ -203,7 +214,7 @@ void SegmentMM(
...
@@ -203,7 +214,7 @@ void SegmentMM(
const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A,
const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A,
bool a_trans, bool b_trans) {
bool a_trans, bool b_trans) {
auto device = runtime::DeviceAPI::Get(A->ctx);
auto device = runtime::DeviceAPI::Get(A->ctx);
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const DType* A_data = A.Ptr<DType>();
const DType* A_data = A.Ptr<DType>();
const DType* B_data = B.Ptr<DType>();
const DType* B_data = B.Ptr<DType>();
const IdType* seglen_A_data = seglen_A.Ptr<IdType>();
const IdType* seglen_A_data = seglen_A.Ptr<IdType>();
...
@@ -215,8 +226,8 @@ void SegmentMM(
...
@@ -215,8 +226,8 @@ void SegmentMM(
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
if (!thr_entry->cublas_handle)
if (!thr_entry->cublas_handle)
CUBLAS_CALL(cublasCreate(&(thr_entry->cublas_handle)));
CUBLAS_CALL(hipblasCreate(&(thr_entry->cublas_handle)));
CUBLAS_CALL(cublasSetStream(thr_entry->cublas_handle, stream));
CUBLAS_CALL(hipblasSetStream(thr_entry->cublas_handle, stream));
IdType m_offset = 0;
IdType m_offset = 0;
for (IdType etype = 0; etype < num_rel; ++etype) {
for (IdType etype = 0; etype < num_rel; ++etype) {
...
@@ -226,10 +237,10 @@ void SegmentMM(
...
@@ -226,10 +237,10 @@ void SegmentMM(
n = B->shape[2]; // cols of B
n = B->shape[2]; // cols of B
k = B->shape[1]; // cols of A == rows of B
k = B->shape[1]; // cols of A == rows of B
int ldb = n, lda = k, ldc = n;
int ldb = n, lda = k, ldc = n;
cublasOperation_t transB = CUBLAS_OP_N;
hipblasOperation_t transB = HIPBLAS_OP_N;
cublasOperation_t transA = CUBLAS_OP_N;
hipblasOperation_t transA = HIPBLAS_OP_N;
if (b_trans) {
if (b_trans) {
transB = CUBLAS_OP_T;
transB = HIPBLAS_OP_T;
ldb = n, lda = n, ldc = k;
ldb = n, lda = n, ldc = k;
std::swap(n, k);
std::swap(n, k);
}
}
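For context, the ldb/lda/ldc shuffling above exists because hipBLAS, like cuBLAS, assumes column-major storage while DGL's arrays are row-major. A minimal standalone sketch of the trick follows (the helper name RowMajorSgemm is hypothetical, not a DGL API): the row-major product C = A·B is obtained by asking the column-major library for the product with the operands and the (m, n) dimensions swapped, so that the column-major result aliases row-major C.

// Hypothetical helper illustrating the operand-swap pattern used above.
#include <hipblas/hipblas.h>

hipblasStatus_t RowMajorSgemm(
    hipblasHandle_t handle, int m, int n, int k,
    const float* A,   // row-major m x k, leading dimension k
    const float* B,   // row-major k x n, leading dimension n
    float* C) {       // row-major m x n, leading dimension n
  const float alpha = 1.0f, beta = 0.0f;
  // Note the swapped order: B comes first, and (n, m, k) replaces (m, n, k).
  // Column-major views: B is n x k, A is k x m, so the result is the n x m
  // matrix C^T, which occupies exactly the row-major C buffer.
  return hipblasSgemm(
      handle, HIPBLAS_OP_N, HIPBLAS_OP_N, n, m, k,
      &alpha, B, /*ldb=*/n, A, /*lda=*/k, &beta, C, /*ldc=*/n);
}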
...
@@ -248,7 +259,7 @@ template <int XPU, typename IdType, typename DType>
...
@@ -248,7 +259,7 @@ template <int XPU, typename IdType, typename DType>
void SegmentMMBackwardB(
void SegmentMMBackwardB(
const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen) {
const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen) {
auto device = runtime::DeviceAPI::Get(A->ctx);
auto device = runtime::DeviceAPI::Get(A->ctx);
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const DType* A_data = A.Ptr<DType>();
const DType* A_data = A.Ptr<DType>();
const DType* dC_data = dC.Ptr<DType>();
const DType* dC_data = dC.Ptr<DType>();
const IdType* seglen_data = seglen.Ptr<IdType>();
const IdType* seglen_data = seglen.Ptr<IdType>();
...
@@ -260,8 +271,8 @@ void SegmentMMBackwardB(
...
@@ -260,8 +271,8 @@ void SegmentMMBackwardB(
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
if (!thr_entry->cublas_handle)
if (!thr_entry->cublas_handle)
CUBLAS_CALL(cublasCreate(&(thr_entry->cublas_handle)));
CUBLAS_CALL(hipblasCreate(&(thr_entry->cublas_handle)));
CUBLAS_CALL(cublasSetStream(thr_entry->cublas_handle, stream));
CUBLAS_CALL(hipblasSetStream(thr_entry->cublas_handle, stream));
IdType k_offset = 0;
IdType k_offset = 0;
for (IdType etype = 0; etype < num_rel; ++etype) {
for (IdType etype = 0; etype < num_rel; ++etype) {
...
@@ -271,8 +282,8 @@ void SegmentMMBackwardB(
...
@@ -271,8 +282,8 @@ void SegmentMMBackwardB(
CHECK_LE(k_offset + k, A->shape[0])
CHECK_LE(k_offset + k, A->shape[0])
<< "Segement index out of bound of A->shape[0].";
<< "Segement index out of bound of A->shape[0].";
int lddC = m, ldA = n, lddB = m;
int lddC = m, ldA = n, lddB = m;
cublasOperation_t trans_dC = CUBLAS_OP_N;
hipblasOperation_t trans_dC = HIPBLAS_OP_N;
cublasOperation_t trans_A = CUBLAS_OP_T;
hipblasOperation_t trans_A = HIPBLAS_OP_T;
CUBLAS_CALL(cublasGemm<DType>(
CUBLAS_CALL(cublasGemm<DType>(
thr_entry->cublas_handle, trans_dC, trans_A, m, n, k, &alpha,
thr_entry->cublas_handle, trans_dC, trans_A, m, n, k, &alpha,
dC_data + dC_offset, lddC, A_data + A_offset, ldA, &beta,
dC_data + dC_offset, lddC, A_data + A_offset, ldA, &beta,
...
@@ -299,7 +310,7 @@ void GatherMM(
...
@@ -299,7 +310,7 @@ void GatherMM(
const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
const NDArray idx_b) {
const NDArray idx_b) {
auto device = runtime::DeviceAPI::Get(A->ctx);
auto device = runtime::DeviceAPI::Get(A->ctx);
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int64_t out_len = B->shape[2]; // cols of B
int64_t out_len = B->shape[2]; // cols of B
int64_t in_len = A->shape[1]; // cols of A
int64_t in_len = A->shape[1]; // cols of A
const int64_t tot_num_rows = A->shape[0];
const int64_t tot_num_rows = A->shape[0];
...
@@ -332,7 +343,7 @@ void GatherMMScatter(
...
@@ -332,7 +343,7 @@ void GatherMMScatter(
const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
const NDArray idx_b, const NDArray idx_c) {
const NDArray idx_b, const NDArray idx_c) {
auto device = runtime::DeviceAPI::Get(A->ctx);
auto device = runtime::DeviceAPI::Get(A->ctx);
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const IdType* idx_c_data = idx_c.Ptr<IdType>();
const IdType* idx_c_data = idx_c.Ptr<IdType>();
int64_t out_len = (B->ndim == 2) ? B->shape[1] : B->shape[2]; // cols of B
int64_t out_len = (B->ndim == 2) ? B->shape[1] : B->shape[2]; // cols of B
int64_t in_len = A->shape[1]; // cols of A
int64_t in_len = A->shape[1]; // cols of A
...
@@ -367,10 +378,10 @@ template void GatherMM<kDGLCUDA, int64_t, __half>(
...
@@ -367,10 +378,10 @@ template void GatherMM<kDGLCUDA, int64_t, __half>(
const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
const NDArray idx_b);
const NDArray idx_b);
#if BF16_ENABLED
#if BF16_ENABLED
template void GatherMM<kDGLCUDA, int32_t, __nv_bfloat16>(
template void GatherMM<kDGLCUDA, int32_t, __hip_bfloat16>(
const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
const NDArray idx_b);
const NDArray idx_b);
template void GatherMM<kDGLCUDA, int64_t, __nv_bfloat16>(
template void GatherMM<kDGLCUDA, int64_t, __hip_bfloat16>(
const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
const NDArray idx_b);
const NDArray idx_b);
#endif // BF16_ENABLED
#endif // BF16_ENABLED
...
@@ -394,10 +405,10 @@ template void GatherMMScatter<kDGLCUDA, int64_t, __half>(
...
@@ -394,10 +405,10 @@ template void GatherMMScatter<kDGLCUDA, int64_t, __half>(
const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
const NDArray idx_b, const NDArray idx_c);
const NDArray idx_b, const NDArray idx_c);
#if BF16_ENABLED
#if BF16_ENABLED
template void GatherMMScatter<kDGLCUDA, int32_t, __nv_bfloat16>(
template void GatherMMScatter<kDGLCUDA, int32_t, __hip_bfloat16>(
const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
const NDArray idx_b, const NDArray idx_c);
const NDArray idx_b, const NDArray idx_c);
template void GatherMMScatter<kDGLCUDA, int64_t, __nv_bfloat16>(
template void GatherMMScatter<kDGLCUDA, int64_t, __hip_bfloat16>(
const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
const NDArray idx_b, const NDArray idx_c);
const NDArray idx_b, const NDArray idx_c);
#endif // BF16_ENABLED
#endif // BF16_ENABLED
...
@@ -421,10 +432,10 @@ template void SegmentMM<kDGLCUDA, int64_t, __half>(
...
@@ -421,10 +432,10 @@ template void SegmentMM<kDGLCUDA, int64_t, __half>(
const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A,
const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A,
bool a_trans, bool b_trans);
bool a_trans, bool b_trans);
#if BF16_ENABLED
#if BF16_ENABLED
template void SegmentMM<kDGLCUDA, int32_t, __nv_bfloat16>(
template void SegmentMM<kDGLCUDA, int32_t, __hip_bfloat16>(
const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A,
const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A,
bool a_trans, bool b_trans);
bool a_trans, bool b_trans);
template void SegmentMM<kDGLCUDA, int64_t, __nv_bfloat16>(
template void SegmentMM<kDGLCUDA, int64_t, __hip_bfloat16>(
const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A,
const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A,
bool a_trans, bool b_trans);
bool a_trans, bool b_trans);
#endif // BF16_ENABLED
#endif // BF16_ENABLED
...
@@ -446,9 +457,9 @@ template void SegmentMMBackwardB<kDGLCUDA, int32_t, __half>(
...
@@ -446,9 +457,9 @@ template void SegmentMMBackwardB<kDGLCUDA, int32_t, __half>(
template void SegmentMMBackwardB<kDGLCUDA, int64_t, __half>(
template void SegmentMMBackwardB<kDGLCUDA, int64_t, __half>(
const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen);
const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen);
#if BF16_ENABLED
#if BF16_ENABLED
template void SegmentMMBackwardB<kDGLCUDA, int32_t, __nv_bfloat16>(
template void SegmentMMBackwardB<kDGLCUDA, int32_t, __hip_bfloat16>(
const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen);
const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen);
template void SegmentMMBackwardB<kDGLCUDA, int64_t, __nv_bfloat16>(
template void SegmentMMBackwardB<kDGLCUDA, int64_t, __hip_bfloat16>(
const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen);
const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen);
#endif // BF16_ENABLED
#endif // BF16_ENABLED
template void SegmentMMBackwardB<kDGLCUDA, int32_t, float>(
template void SegmentMMBackwardB<kDGLCUDA, int32_t, float>(
...
...
src/array/cuda/ge_spmm.cuh
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
/**
* Copyright (c) 2020 by Contributors
* Copyright (c) 2020 by Contributors
* @file array/cuda/ge_spmm.cuh
* @file array/cuda/ge_spmm.cuh
...
@@ -7,7 +9,7 @@
...
@@ -7,7 +9,7 @@
#define DGL_ARRAY_CUDA_GE_SPMM_CUH_
#define DGL_ARRAY_CUDA_GE_SPMM_CUH_
#include "../../runtime/cuda/cuda_common.h"
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
#include "utils.h"
#include "atomic.cuh"
#include "atomic.cuh"
#include "macro.cuh"
#include "macro.cuh"
...
@@ -121,7 +123,7 @@ void GESpMMCsr(
...
@@ -121,7 +123,7 @@ void GESpMMCsr(
const DType* efeat_data = efeat.Ptr<DType>();
const DType* efeat_data = efeat.Ptr<DType>();
DType* out_data = out.Ptr<DType>();
DType* out_data = out.Ptr<DType>();
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const int ntx = 32;
const int ntx = 32;
const int nty = 32;
const int nty = 32;
...
...
src/array/cuda/labor_sampling.cu → src/array/cuda/labor_sampling.hip
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/*!
/*!
* Copyright (c) 2022, NVIDIA Corporation
* Copyright (c) 2022, NVIDIA Corporation
* Copyright (c) 2022, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
* Copyright (c) 2022, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
...
@@ -34,19 +36,19 @@
...
@@ -34,19 +36,19 @@
#include <thrust/zip_function.h>
#include <thrust/zip_function.h>
#include <algorithm>
#include <algorithm>
#include <cub/cub.cuh>  // NOLINT
#include <hipcub/hipcub.hpp>  // NOLINT
#include <limits>
#include <limits>
#include <numeric>
#include <numeric>
#include <type_traits>
#include <type_traits>
#include <utility>
#include <utility>
#include "../../array/cuda/atomic.cuh"
#include "atomic.cuh"
#include "../../array/cuda/utils.h"
#include "utils.h"
#include "../../graph/transform/cuda/cuda_map_edges.cuh"
#include "../../graph/transform/cuda/cuda_map_edges.cuh"
#include "../../random/continuous_seed.h"
#include "../../random/continuous_seed.h"
#include "../../runtime/cuda/cuda_common.h"
#include "../../runtime/cuda/cuda_common.h"
#include "./functor.cuh"
#include "functor.cuh"
#include "./spmm.cuh"
#include "spmm.cuh"
namespace dgl {
namespace dgl {
namespace aten {
namespace aten {
...
@@ -131,7 +133,7 @@ struct StencilOpFused {
...
@@ -131,7 +133,7 @@ struct StencilOpFused {
const IdType* indices;
const IdType* indices;
const IdType* nids;
const IdType* nids;
bool is_pinned;
bool is_pinned;
__device__ auto operator()(IdType idx) {
__host__ __device__ auto operator()(IdType idx) {
const auto in_row = idx_coo[idx];
const auto in_row = idx_coo[idx];
const auto ps = probs[idx];
const auto ps = probs[idx];
IdType rofs = idx - subindptr[in_row];
IdType rofs = idx - subindptr[in_row];
...
@@ -277,7 +279,7 @@ __global__ void _CSRRowWiseLayerSampleDegreeKernel(
...
@@ -277,7 +279,7 @@ __global__ void _CSRRowWiseLayerSampleDegreeKernel(
const FloatType* const ds, const FloatType* const d2s,
const FloatType* const ds, const FloatType* const d2s,
const IdType* const indptr, const FloatType* const probs,
const IdType* const indptr, const FloatType* const probs,
const FloatType* const A, const IdType* const subindptr) {
const FloatType* const A, const IdType* const subindptr) {
typedef cub::BlockReduce<FloatType, BLOCK_SIZE> BlockReduce;
typedef hipcub::BlockReduce<FloatType, BLOCK_SIZE> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
__shared__ typename BlockReduce::TempStorage temp_storage;
__shared__ FloatType var_1_bcast[BLOCK_CTAS];
__shared__ FloatType var_1_bcast[BLOCK_CTAS];
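For readers unfamiliar with the BlockReduce idiom used above, here is a minimal toy kernel (not part of DGL) showing the hipCUB pattern, assuming hipCUB from rocPRIM is available: every thread contributes one value, and only thread 0 of the block receives the reduced sum.

#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>
#include <cstdio>
#include <vector>

constexpr int kBlockSize = 256;

__global__ void BlockSumKernel(const float* in, float* out) {
  typedef hipcub::BlockReduce<float, kBlockSize> BlockReduce;
  __shared__ typename BlockReduce::TempStorage temp_storage;
  const float val = in[blockIdx.x * kBlockSize + threadIdx.x];
  const float sum = BlockReduce(temp_storage).Sum(val);  // valid in thread 0 only
  if (threadIdx.x == 0) out[blockIdx.x] = sum;
}

int main() {
  std::vector<float> h_in(kBlockSize, 1.0f);
  float *d_in, *d_out;
  hipMalloc(&d_in, kBlockSize * sizeof(float));
  hipMalloc(&d_out, sizeof(float));
  hipMemcpy(d_in, h_in.data(), kBlockSize * sizeof(float), hipMemcpyHostToDevice);
  hipLaunchKernelGGL(BlockSumKernel, dim3(1), dim3(kBlockSize), 0, 0, d_in, d_out);
  float h_out = 0.0f;
  hipMemcpy(&h_out, d_out, sizeof(float), hipMemcpyDeviceToHost);
  printf("block sum = %f\n", h_out);  // expect 256
  hipFree(d_in); hipFree(d_out);
  return 0;
}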
...
@@ -351,7 +353,7 @@ int log_size(const IdType size) {
...
@@ -351,7 +353,7 @@ int log_size(const IdType size) {
template <typename IdType, typename FloatType, typename exec_policy_t>
template <typename IdType, typename FloatType, typename exec_policy_t>
void compute_importance_sampling_probabilities(
void compute_importance_sampling_probabilities(
CSRMatrix mat, const IdType hop_size, cudaStream_t stream,
CSRMatrix mat, const IdType hop_size, hipStream_t stream,
const continuous_seed seed, const IdType num_rows, const IdType* indptr,
const continuous_seed seed, const IdType num_rows, const IdType* indptr,
const IdType* subindptr, const IdType* indices, IdArray idx_coo_arr,
const IdType* subindptr, const IdType* indices, IdArray idx_coo_arr,
const IdType* nids,
const IdType* nids,
...
@@ -398,17 +400,17 @@ void compute_importance_sampling_probabilities(
...
@@ -398,17 +400,17 @@ void compute_importance_sampling_probabilities(
hop_1, 0, hop_2.get(), 0, sizeof(IdType) * hop_size, ctx, ctx,
hop_1, 0, hop_2.get(), 0, sizeof(IdType) * hop_size, ctx, ctx,
mat.indptr->dtype);
mat.indptr->dtype);
cub::DoubleBuffer<IdType> hop_b(hop_2.get(), hop_3.get());
hipcub::DoubleBuffer<IdType> hop_b(hop_2.get(), hop_3.get());
{
{
std::size_t temp_storage_bytes = 0;
std::size_t temp_storage_bytes = 0;
CUDA_CALL(cub::DeviceRadixSort::SortKeys(
CUDA_CALL(hipcub::DeviceRadixSort::SortKeys(
nullptr, temp_storage_bytes, hop_b, hop_size, 0, max_log_num_vertices,
nullptr, temp_storage_bytes, hop_b, hop_size, 0, max_log_num_vertices,
stream));
stream));
auto temp = allocator.alloc_unique<char>(temp_storage_bytes);
auto temp = allocator.alloc_unique<char>(temp_storage_bytes);
CUDA_CALL(cub::DeviceRadixSort::SortKeys(
CUDA_CALL(hipcub::DeviceRadixSort::SortKeys(
temp.get(), temp_storage_bytes, hop_b, hop_size, 0,
temp.get(), temp_storage_bytes, hop_b, hop_size, 0,
max_log_num_vertices, stream));
max_log_num_vertices, stream));
}
}
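The SortKeys calls above follow CUB's two-phase protocol (a size query with a null workspace pointer, then the real call) together with a DoubleBuffer that ping-pongs between two key arrays. A standalone hipCUB sketch of the same shape, not taken from DGL and assuming hipCUB is installed:

#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>
#include <cstdio>

int main() {
  const int n = 8;
  int h_keys[n] = {42, 7, 19, 3, 27, 3, 58, 1};
  int *d_keys_a, *d_keys_b;
  hipMalloc(&d_keys_a, n * sizeof(int));
  hipMalloc(&d_keys_b, n * sizeof(int));
  hipMemcpy(d_keys_a, h_keys, n * sizeof(int), hipMemcpyHostToDevice);

  // The DoubleBuffer lets the radix sort ping-pong between the two arrays;
  // Current() points at whichever buffer holds the sorted keys afterwards.
  hipcub::DoubleBuffer<int> keys(d_keys_a, d_keys_b);

  void* d_temp = nullptr;
  size_t temp_bytes = 0;
  hipcub::DeviceRadixSort::SortKeys(d_temp, temp_bytes, keys, n);  // size query
  hipMalloc(&d_temp, temp_bytes);
  hipcub::DeviceRadixSort::SortKeys(d_temp, temp_bytes, keys, n);  // actual sort

  int h_sorted[n];
  hipMemcpy(h_sorted, keys.Current(), n * sizeof(int), hipMemcpyDeviceToHost);
  for (int i = 0; i < n; ++i) printf("%d ", h_sorted[i]);  // 1 3 3 7 19 27 42 58
  printf("\n");
  hipFree(d_temp); hipFree(d_keys_a); hipFree(d_keys_b);
  return 0;
}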
...
@@ -418,13 +420,13 @@ void compute_importance_sampling_probabilities(
...
@@ -418,13 +420,13 @@ void compute_importance_sampling_probabilities(
{
{
std::size_t temp_storage_bytes = 0;
std::size_t temp_storage_bytes = 0;
CUDA_CALL(cub::DeviceRunLengthEncode::Encode(
CUDA_CALL(hipcub::DeviceRunLengthEncode::Encode(
nullptr, temp_storage_bytes, hop_b.Current(), hop_unique.get(),
nullptr, temp_storage_bytes, hop_b.Current(), hop_unique.get(),
hop_counts.get(), hop_unique_size.get(), hop_size, stream));
hop_counts.get(), hop_unique_size.get(), hop_size, stream));
auto temp = allocator.alloc_unique<char>(temp_storage_bytes);
auto temp = allocator.alloc_unique<char>(temp_storage_bytes);
CUDA_CALL(cub::DeviceRunLengthEncode::Encode(
CUDA_CALL(hipcub::DeviceRunLengthEncode::Encode(
temp.get(), temp_storage_bytes, hop_b.Current(), hop_unique.get(),
temp.get(), temp_storage_bytes, hop_b.Current(), hop_unique.get(),
hop_counts.get(), hop_unique_size.get(), hop_size, stream));
hop_counts.get(), hop_unique_size.get(), hop_size, stream));
...
@@ -511,7 +513,7 @@ void compute_importance_sampling_probabilities(
...
@@ -511,7 +513,7 @@ void compute_importance_sampling_probabilities(
/////////////////////////////// CSR ///////////////////////////////
/////////////////////////////// CSR ///////////////////////////////
template <DGLDeviceType XPU, typename IdType, typename FloatType>
template <DGLDeviceType XPU, typename IdType, typename FloatType>
std::pair<COOMatrix, FloatArray> CSRLaborSampling(
__host__ std::pair<COOMatrix, FloatArray> CSRLaborSampling(
CSRMatrix mat, IdArray rows_arr, const int64_t num_picks,
CSRMatrix mat, IdArray rows_arr, const int64_t num_picks,
FloatArray prob_arr, const int importance_sampling, IdArray random_seed_arr,
FloatArray prob_arr, const int importance_sampling, IdArray random_seed_arr,
float seed2_contribution, IdArray NIDs) {
float seed2_contribution, IdArray NIDs) {
...
@@ -521,8 +523,8 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling(
...
@@ -521,8 +523,8 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling(
runtime::CUDAWorkspaceAllocator allocator(ctx);
runtime::CUDAWorkspaceAllocator allocator(ctx);
const auto stream = runtime::getCurrentCUDAStream();
const auto stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const auto exec_policy = thrust::cuda::par_nosync(allocator).on(stream);
const auto exec_policy = thrust::hip::par_nosync(allocator).on(stream);
auto device = runtime::DeviceAPI::Get(ctx);
auto device = runtime::DeviceAPI::Get(ctx);
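For context, the execution-policy line above runs the subsequent Thrust calls on an explicit HIP stream through a pooled-workspace allocator. A much-reduced standalone sketch of the same idea, assuming rocThrust exposes thrust::hip::par the way the par_nosync call in the diff suggests (no custom allocator here):

#include <hip/hip_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/reduce.h>
#include <cstdio>

int main() {
  hipStream_t stream;
  hipStreamCreate(&stream);
  thrust::device_vector<int> v(1000, 1);
  // thrust::hip::par plays the role thrust::cuda::par plays on NVIDIA GPUs.
  const int sum = thrust::reduce(thrust::hip::par.on(stream), v.begin(), v.end(), 0);
  hipStreamSynchronize(stream);
  printf("sum = %d\n", sum);  // expect 1000
  hipStreamDestroy(stream);
  return 0;
}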
...
@@ -569,11 +571,11 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling(
...
@@ -569,11 +571,11 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling(
auto ds_d2s = thrust::make_zip_iterator(ds, d2s);
auto ds_d2s = thrust::make_zip_iterator(ds, d2s);
size_t prefix_temp_size = 0;
size_t prefix_temp_size = 0;
CUDA_CALL(cub::DeviceSegmentedReduce::Reduce(
CUDA_CALL(hipcub::DeviceSegmentedReduce::Reduce(
nullptr, prefix_temp_size, A_A2, ds_d2s, num_rows, b_offsets, e_offsets,
nullptr, prefix_temp_size, A_A2, ds_d2s, num_rows, b_offsets, e_offsets,
TupleSum{}, thrust::make_tuple((FloatType)0, (FloatType)0), stream));
TupleSum{}, thrust::make_tuple((FloatType)0, (FloatType)0), stream));
auto temp = allocator.alloc_unique<char>(prefix_temp_size);
auto temp = allocator.alloc_unique<char>(prefix_temp_size);
CUDA_CALL(cub::DeviceSegmentedReduce::Reduce(
CUDA_CALL(hipcub::DeviceSegmentedReduce::Reduce(
temp.get(), prefix_temp_size, A_A2, ds_d2s, num_rows, b_offsets,
temp.get(), prefix_temp_size, A_A2, ds_d2s, num_rows, b_offsets,
e_offsets, TupleSum{}, thrust::make_tuple((FloatType)0, (FloatType)0),
e_offsets, TupleSum{}, thrust::make_tuple((FloatType)0, (FloatType)0),
stream));
stream));
...
@@ -586,11 +588,11 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling(
...
@@ -586,11 +588,11 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling(
IdType hop_size;
IdType hop_size;
{
{
size_t prefix_temp_size = 0;
size_t prefix_temp_size = 0;
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
nullptr, prefix_temp_size, in_deg.get(), subindptr, num_rows + 1,
nullptr, prefix_temp_size, in_deg.get(), subindptr, num_rows + 1,
stream));
stream));
auto temp = allocator.alloc_unique<char>(prefix_temp_size);
auto temp = allocator.alloc_unique<char>(prefix_temp_size);
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
temp.get(), prefix_temp_size, in_deg.get(), subindptr, num_rows + 1,
temp.get(), prefix_temp_size, in_deg.get(), subindptr, num_rows + 1,
stream));
stream));
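The exclusive scan above turns per-row sampled degrees into CSR-style row offsets (in_deg → subindptr). A small standalone hipCUB sketch of that degrees-to-offsets step, not part of DGL, with toy input so the expected output is easy to check:

#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>
#include <cstdio>

int main() {
  const int n = 8;
  int h_deg[n] = {3, 1, 4, 1, 5, 9, 2, 6};   // per-row degrees
  int *d_deg = nullptr, *d_off = nullptr;
  hipMalloc(&d_deg, n * sizeof(int));
  hipMalloc(&d_off, n * sizeof(int));
  hipMemcpy(d_deg, h_deg, n * sizeof(int), hipMemcpyHostToDevice);

  // Phase 1: size query (temp buffer is null), phase 2: the actual scan.
  void* d_temp = nullptr;
  size_t temp_bytes = 0;
  hipcub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, d_deg, d_off, n);
  hipMalloc(&d_temp, temp_bytes);
  hipcub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, d_deg, d_off, n);

  int h_off[n];
  hipMemcpy(h_off, d_off, n * sizeof(int), hipMemcpyDeviceToHost);
  for (int i = 0; i < n; ++i) printf("%d ", h_off[i]);  // 0 3 4 8 9 14 23 25
  printf("\n");
  hipFree(d_temp); hipFree(d_deg); hipFree(d_off);
  return 0;
}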
...
@@ -619,11 +621,11 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling(
...
@@ -619,11 +621,11 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling(
auto modified_in_deg = thrust::make_transform_iterator(
auto modified_in_deg = thrust::make_transform_iterator(
iota, AlignmentFunc<IdType>{in_deg.get(), perm, num_rows});
iota, AlignmentFunc<IdType>{in_deg.get(), perm, num_rows});
size_t prefix_temp_size = 0;
size_t prefix_temp_size = 0;
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
nullptr, prefix_temp_size, modified_in_deg, subindptr_aligned.get(),
nullptr, prefix_temp_size, modified_in_deg, subindptr_aligned.get(),
num_rows + 1, stream));
num_rows + 1, stream));
auto temp = allocator.alloc_unique<char>(prefix_temp_size);
auto temp = allocator.alloc_unique<char>(prefix_temp_size);
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
temp.get(), prefix_temp_size, modified_in_deg,
temp.get(), prefix_temp_size, modified_in_deg,
subindptr_aligned.get(), num_rows + 1, stream));
subindptr_aligned.get(), num_rows + 1, stream));
...
...
src/array/cuda/macro.cuh
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
/**
* Copyright (c) 2020 by Contributors
* Copyright (c) 2020 by Contributors
* @file array/cuda/macro.cuh
* @file array/cuda/macro.cuh
...
@@ -30,14 +31,14 @@
...
@@ -30,14 +31,14 @@
const auto device = runtime::DeviceAPI::Get(ctx); \
const auto device = runtime::DeviceAPI::Get(ctx); \
(LHS_OFF) = static_cast<int64_t *>(device->AllocWorkspace( \
(LHS_OFF) = static_cast<int64_t *>(device->AllocWorkspace( \
ctx, sizeof(int64_t) * info.lhs_offset.size())); \
ctx, sizeof(int64_t) * info.lhs_offset.size())); \
CUDA_CALL(cudaMemcpy( \
CUDA_CALL(hipMemcpy( \
(LHS_OFF), &info.lhs_offset[0], \
(LHS_OFF), &info.lhs_offset[0], \
sizeof(int64_t) * info.lhs_offset.size(), cudaMemcpyHostToDevice)); \
sizeof(int64_t) * info.lhs_offset.size(), hipMemcpyHostToDevice)); \
(RHS_OFF) = static_cast<int64_t *>(device->AllocWorkspace( \
(RHS_OFF) = static_cast<int64_t *>(device->AllocWorkspace( \
ctx, sizeof(int64_t) * info.rhs_offset.size())); \
ctx, sizeof(int64_t) * info.rhs_offset.size())); \
CUDA_CALL(cudaMemcpy( \
CUDA_CALL(hipMemcpy( \
(RHS_OFF), &info.rhs_offset[0], \
(RHS_OFF), &info.rhs_offset[0], \
sizeof(int64_t) * info.rhs_offset.size(), cudaMemcpyHostToDevice)); \
sizeof(int64_t) * info.rhs_offset.size(), hipMemcpyHostToDevice)); \
if ((EDGE_MAP)) { \
if ((EDGE_MAP)) { \
constexpr bool UseIdx = true; \
constexpr bool UseIdx = true; \
{ __VA_ARGS__ } \
{ __VA_ARGS__ } \
...
...
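The macro above copies small host-side offset vectors into freshly allocated device workspace before launching the broadcast kernels. A self-contained sketch of that copy-and-check pattern, assuming nothing beyond the HIP runtime; HIP_CHECK here is a local stand-in, not DGL's CUDA_CALL:

#include <hip/hip_runtime.h>
#include <cstdio>
#include <cstdlib>
#include <vector>

#define HIP_CHECK(expr)                                            \
  do {                                                             \
    hipError_t err = (expr);                                       \
    if (err != hipSuccess) {                                       \
      fprintf(stderr, "HIP error %s at %s:%d\n",                   \
              hipGetErrorString(err), __FILE__, __LINE__);         \
      std::exit(1);                                                \
    }                                                              \
  } while (0)

int main() {
  // Toy broadcast offsets, standing in for info.lhs_offset in the macro above.
  std::vector<int64_t> lhs_offset = {0, 4, 9, 15};
  int64_t* d_lhs_off = nullptr;
  HIP_CHECK(hipMalloc(&d_lhs_off, sizeof(int64_t) * lhs_offset.size()));
  HIP_CHECK(hipMemcpy(d_lhs_off, lhs_offset.data(),
                      sizeof(int64_t) * lhs_offset.size(),
                      hipMemcpyHostToDevice));
  HIP_CHECK(hipFree(d_lhs_off));
  printf("copied %zu offsets\n", lhs_offset.size());
  return 0;
}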