Commit 6ac701f8, authored Sep 13, 2024 by sangwzh
update src and graphbolt code
Parent: 1547bd93
Changes: 116
Showing 20 changed files with 589 additions and 439 deletions (+589, -439).
src/array/cuda/array_sort.hip           +9   -5
src/array/cuda/atomic.cuh               +72  -37
src/array/cuda/bf16.cuh                 +78  -68
src/array/cuda/coo2csr.hip              +11  -7
src/array/cuda/coo_sort.hip             +7   -3
src/array/cuda/csr2coo.hip              +71  -15
src/array/cuda/csr_get_data.hip         +11  -7
src/array/cuda/csr_mm.hip               +65  -61
src/array/cuda/csr_sort.hip             +18  -16
src/array/cuda/csr_sum.hip              +18  -16
src/array/cuda/csr_transpose.cc         +14  -12
src/array/cuda/cuda_filter.hip          +8   -6
src/array/cuda/cusparse_dispatcher.cuh  +64  -63
src/array/cuda/disjoint_union.hip       +5   -3
src/array/cuda/fp16.cuh                 +13  -12
src/array/cuda/functor.cuh              +33  -32
src/array/cuda/gather_mm.hip            +59  -48
src/array/cuda/ge_spmm.cuh              +4   -2
src/array/cuda/labor_sampling.hip       +24  -22
src/array/cuda/macro.cuh                +5   -4
src/array/cuda/array_sort.cu → src/array/cuda/array_sort.hip
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
  * Copyright (c) 2020 by Contributors
  * @file array/cpu/array_sort.cu
  * @brief Array sort GPU implementation
  */
-#include <dgl/array.h>
+#include "../../../include/dgl/array.h"
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
 #include "../../runtime/cuda/cuda_common.h"
-#include "./utils.h"
+#include "utils.h"
 namespace dgl {
 using runtime::NDArray;
...
@@ -29,20 +33,20 @@ std::pair<IdArray, IdArray> Sort(IdArray array, int num_bits) {
   IdType* keys_out = sorted_array.Ptr<IdType>();
   int64_t* values_out = sorted_idx.Ptr<int64_t>();
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   if (num_bits == 0) {
     num_bits = sizeof(IdType) * 8;
   }
   // Allocate workspace
   size_t workspace_size = 0;
-  CUDA_CALL(cub::DeviceRadixSort::SortPairs(
+  CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
       nullptr, workspace_size, keys_in, keys_out, values_in, values_out, nitems,
       0, num_bits, stream));
   void* workspace = device->AllocWorkspace(ctx, workspace_size);
   // Compute
-  CUDA_CALL(cub::DeviceRadixSort::SortPairs(
+  CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
       workspace, workspace_size, keys_in, keys_out, values_in, values_out,
       nitems, 0, num_bits, stream));
...
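The change above swaps CUB for hipCUB but keeps the usual two-phase radix-sort protocol: a first call with a null temporary buffer that only reports the required scratch size, then a second call that sorts. A minimal standalone sketch of that protocol, assuming plain hipMalloc instead of DGL's workspace allocator (the function name is illustrative):

// Two-phase hipcub::DeviceRadixSort::SortPairs pattern, as used in the hunk above.
#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>

void SortPairsSketch(const int64_t* keys_in, int64_t* keys_out,
                     const int64_t* vals_in, int64_t* vals_out,
                     int64_t nitems, hipStream_t stream) {
  size_t temp_bytes = 0;
  // First pass: d_temp_storage == nullptr, only computes temp_bytes.
  hipcub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, keys_in, keys_out,
                                     vals_in, vals_out, nitems,
                                     0, sizeof(int64_t) * 8, stream);
  void* temp = nullptr;
  hipMalloc(&temp, temp_bytes);
  // Second pass: performs the actual key/value sort on the given stream.
  hipcub::DeviceRadixSort::SortPairs(temp, temp_bytes, keys_in, keys_out,
                                     vals_in, vals_out, nitems,
                                     0, sizeof(int64_t) * 8, stream);
  hipFree(temp);
}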
src/array/cuda/atomic.cuh
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2019 by Contributors
  * @file array/cuda/atomic.cuh
...
@@ -6,7 +7,7 @@
 #ifndef DGL_ARRAY_CUDA_ATOMIC_CUH_
 #define DGL_ARRAY_CUDA_ATOMIC_CUH_
-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
 #include <cassert>
 #include <cstdint>
...
@@ -15,8 +16,8 @@
 #include "bf16.cuh"
 #include "fp16.cuh"
-#if __CUDA_ARCH__ >= 600
-#include <cuda_fp16.h>
+#if __HIPCC__
+#include <hip/hip_fp16.h>
 #endif
 namespace dgl {
...
@@ -56,39 +57,39 @@ struct Cast {
 template <>
 struct Cast<half> {
-  typedef Code<sizeof(half)>::Type Type;
-  static __device__ __forceinline__ Type Encode(half val) {
+  typedef half Type;
+  static __host__ __device__ __forceinline__ Type Encode(half val) {
     return __half_as_ushort(val);
   }
-  static __device__ __forceinline__ half Decode(Type code) {
+  static __host__ __device__ __forceinline__ half Decode(Type code) {
     return __ushort_as_half(code);
   }
 };
 #if BF16_ENABLED
 template <>
-struct Cast<__nv_bfloat16> {
-  typedef Code<sizeof(__nv_bfloat16)>::Type Type;
-  static __device__ __forceinline__ Type Encode(__nv_bfloat16 val) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+struct Cast<__hip_bfloat16> {
+  typedef __hip_bfloat16 Type;
+  static __host__ __device__ __forceinline__ Type Encode(__hip_bfloat16 val) {
+#if defined(__HIP_DEVICE_COMPILE__)
     return __bfloat16_as_ushort(val);
 #else
     printf(
         "Atomic operations are not supported for bfloat16 (BF16) "
         "on GPUs with compute capability less than 8.0.\n");
-    __trap();
+    // __trap();
     return static_cast<Type>(0);
 #endif
   }
-  static __device__ __forceinline__ __nv_bfloat16 Decode(Type code) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+  static __host__ __device__ __forceinline__ __hip_bfloat16 Decode(Type code) {
+#if defined(__HIP_DEVICE_COMPILE__)
     return __ushort_as_bfloat16(code);
 #else
     printf(
         "Atomic operations are not supported for bfloat16 (BF16) "
         "on GPUs with compute capability less than 8.0.\n");
-    __trap();
-    return static_cast<__nv_bfloat16>(0.0f);
+    // __trap();
+    return static_cast<__hip_bfloat16>(0.0f);
 #endif
   }
 };
...
@@ -116,12 +117,12 @@ struct Cast<double> {
   }
 };
-static __device__ __forceinline__ unsigned short int atomicCASshort(  // NOLINT
+static __host__ __device__ __forceinline__ unsigned short int atomicCASshort(  // NOLINT
     unsigned short int* address,  // NOLINT
     unsigned short int compare,   // NOLINT
     unsigned short int val) {     // NOLINT
-  static_assert(CUDART_VERSION >= 10000, "Requires at least CUDA 10");
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__) >= 700)
+  static_assert(DTKRT_VERSION >= 10000, "Requires at least CUDA 10");
+#if defined(__HIP_DEVICE_COMPILE__) && 0
   return atomicCAS(address, compare, val);
 #else
   (void)address;
...
@@ -130,9 +131,9 @@ static __device__ __forceinline__ unsigned short int atomicCASshort(  // NOLINT
   printf(
       "Atomic operations are not supported for half precision (FP16) "
      "on this GPU.\n");
-  __trap();
+  abort();
   return val;
-#endif  // (defined(__CUDA_ARCH__) && (__CUDA_ARCH__) >= 700)
+#endif  // (defined(__HIP_DEVICE_COMPILE__)
 }
 #define DEFINE_ATOMIC(NAME) \
...
@@ -168,19 +169,53 @@ static __device__ __forceinline__ unsigned short int atomicCASshort(  // NOLINT
     return Cast<dtype>::Decode(old);                               \
   }
-#define OP(a, b) max(a, b)
#define DEFINE_ATOMIC_16BIT_BF(NAME, dtype) \
template <> \
__device__ __forceinline__ dtype Atomic##NAME<dtype>( \
dtype * addr, dtype val) { \
typedef uint16_t CT; \
CT* addr_as_ui = reinterpret_cast<CT*>(addr); \
CT old = *addr_as_ui; \
CT assumed = old; \
do { \
assumed = old; \
old = atomicCASshort( \
addr_as_ui, assumed, \
Cast<dtype>::Encode(max((double)val, (double)dtype(old)))); \
} while (assumed != old); \
return Cast<dtype>::Decode(old); \
}
#define DEFINE_ATOMIC_16BIT_Min(NAME, dtype) \
template <> \
__device__ __forceinline__ dtype Atomic##NAME<dtype>( \
dtype * addr, dtype val) { \
typedef uint16_t CT; \
CT* addr_as_ui = reinterpret_cast<CT*>(addr); \
CT old = *addr_as_ui; \
CT assumed = old; \
do { \
assumed = old; \
old = atomicCASshort( \
addr_as_ui, assumed, \
Cast<dtype>::Encode(min(val, dtype(old)))); \
} while (assumed != old); \
return Cast<dtype>::Decode(old); \
}
+#define OP(a, b) max((double)a, (double)b)
 DEFINE_ATOMIC(Max)
 DEFINE_ATOMIC_16BIT(Max, half)
 #if BF16_ENABLED
-DEFINE_ATOMIC_16BIT(Max, __nv_bfloat16)
+DEFINE_ATOMIC_16BIT_BF(Max, __hip_bfloat16)
 #endif  // BF16_ENABLED
 #undef OP
-#define OP(a, b) min(a, b)
+#define OP(a, b) min((double)a, (double)b)
 DEFINE_ATOMIC(Min)
 DEFINE_ATOMIC_16BIT(Min, half)
 #if BF16_ENABLED
-DEFINE_ATOMIC_16BIT(Min, __nv_bfloat16)
+DEFINE_ATOMIC_16BIT_BF(Min, __hip_bfloat16)
 #endif  // BF16_ENABLED
 #undef OP
...
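Hand-expanding the 16-bit compare-and-swap loop that these DEFINE_ATOMIC_16BIT* macros generate makes the retry pattern easier to see. A sketch for a half-precision atomic max, assuming the atomicCASshort helper defined earlier in this header and the standard __half_as_ushort / __ushort_as_half intrinsics (the function name is illustrative, not part of the patch):

// Read the 16-bit word, compute the new maximum on the side, and retry the
// 16-bit CAS until no other thread modified the location in between.
__device__ __forceinline__ half AtomicMaxHalfSketch(half* addr, half val) {
  unsigned short int* addr_as_ui = reinterpret_cast<unsigned short int*>(addr);
  unsigned short int old = *addr_as_ui;
  unsigned short int assumed;
  do {
    assumed = old;
    half cur = __ushort_as_half(assumed);
    half next = (float(val) > float(cur)) ? val : cur;
    old = atomicCASshort(addr_as_ui, assumed, __half_as_ushort(next));
  } while (assumed != old);
  return __ushort_as_half(old);
}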
@@ -256,7 +291,7 @@ inline __device__ int32_t AtomicMax(int32_t* const address, const int32_t val) {
 template <>
 __device__ __forceinline__ float AtomicAdd<float>(float* addr, float val) {
-#if __CUDA_ARCH__ >= 200
+#if __HIP_DEVICE_COMPILE__
   return atomicAdd(addr, val);
 #else
   typedef float T;
...
@@ -270,12 +305,12 @@ __device__ __forceinline__ float AtomicAdd<float>(float* addr, float val) {
         addr_as_ui, assumed,
         Cast<T>::Encode(Cast<T>::Decode(old) + val));
   } while (assumed != old);
   return Cast<T>::Decode(old);
-#endif  // __CUDA_ARCH__
+#endif  // __HIP_DEVICE_COMPILE__
 }
 template <>
 __device__ __forceinline__ double AtomicAdd<double>(double* addr, double val) {
-#if __CUDA_ARCH__ >= 600
+#if __HIP_DEVICE_COMPILE__
   return atomicAdd(addr, val);
 #else
   typedef double T;
...
@@ -292,11 +327,11 @@ __device__ __forceinline__ double AtomicAdd<double>(double* addr, double val) {
 #endif
 }
-#if defined(CUDART_VERSION) && CUDART_VERSION >= 10000
+#if defined(DTKRT_VERSION) && DTKRT_VERSION >= 10000
 template <>
 __device__ __forceinline__ half AtomicAdd<half>(half* addr, half val) {
   // make sure we have half support
-#if __CUDA_ARCH__ >= 700
+#if __HIP_DEVICE_COMPILE__
   return atomicAdd(addr, val);
 #else
   (void)addr;
...
@@ -304,18 +339,18 @@ __device__ __forceinline__ half AtomicAdd<half>(half* addr, half val) {
   printf(
       "Atomic operations are not supported for half precision (FP16) "
      "on this GPU.\n");
-  __trap();
+  // __trap();
   return val;
-#endif  // __CUDA_ARCH__ >= 700
+#endif  // __HIP_DEVICE_COMPILE__
 }
-#endif  // defined(CUDART_VERSION) && CUDART_VERSION >= 10000
+#endif  // defined(DTKRT_VERSION) && DTKRT_VERSION >= 10000
 #if BF16_ENABLED
 template <>
-__device__ __forceinline__ __nv_bfloat16
-AtomicAdd<__nv_bfloat16>(__nv_bfloat16* addr, __nv_bfloat16 val) {
+__device__ __forceinline__ __hip_bfloat16
+AtomicAdd<__hip_bfloat16>(__hip_bfloat16* addr, __hip_bfloat16 val) {
   // make sure we have bfloat16 support
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+#if defined(__HIP_DEVICE_COMPILE__)
   return atomicAdd(addr, val);
 #else
   (void)addr;
...
@@ -323,9 +358,9 @@ AtomicAdd<__nv_bfloat16>(__nv_bfloat16* addr, __nv_bfloat16 val) {
   printf(
       "Atomic operations are not supported for bfloat16 (BF16) "
      "on GPUs with compute capability less than 8.0.\n");
-  __trap();
+  // __trap();
   return val;
-#endif  // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+#endif  // defined(__HIP_DEVICE_COMPILE__)
 }
 #endif  // BF16_ENABLED
...
src/array/cuda/bf16.cuh
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2022 by Contributors
  *
...
@@ -18,131 +19,140 @@
  */
 #ifndef DGL_ARRAY_CUDA_BF16_CUH_
 #define DGL_ARRAY_CUDA_BF16_CUH_
+#include <hip/hip_runtime.h>
 #if BF16_ENABLED
-#include <cuda_bf16.h>
+#include <hip/hip_bf16.h>
 #include <algorithm>
-static __device__ __forceinline__ __nv_bfloat16 max(
-    __nv_bfloat16 a, __nv_bfloat16 b) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+static __device__ __forceinline__ __hip_bfloat16 max(
+    __hip_bfloat16 a, __hip_bfloat16 b) {
+#if defined(__HIP_DEVICE_COMPILE__)
   return __hmax(a, b);
 #else
-  return __nv_bfloat16(max(float(a), float(b)));  // NOLINT
+  return __hip_bfloat16(max(float(a), float(b)));  // NOLINT
 #endif
 }
-static __device__ __forceinline__ __nv_bfloat16 min(
-    __nv_bfloat16 a, __nv_bfloat16 b) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+static __device__ __forceinline__ __hip_bfloat16 min(
+    __hip_bfloat16 a, __hip_bfloat16 b) {
+#if defined(__HIP_DEVICE_COMPILE__)
   return __hmin(a, b);
 #else
-  return __nv_bfloat16(min(float(a), float(b)));  // NOLINT
+  return __hip_bfloat16(min(float(a), float(b)));  // NOLINT
 #endif
 }
-#ifdef __CUDACC__
+#ifdef __HIPCC__
 // Arithmetic BF16 operations for architecture >= 8.0 are already defined in
-// cuda_bf16.h
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800)
-// CUDA 12.2 adds "emulated" support for older architectures.
-#if defined(CUDART_VERSION) && (CUDART_VERSION < 12020)
+// hip/__hip_bfloat16.h
+// #if defined(__DTK_ARCH__) && (__DTK_ARCH__ < 800)
+// // CUDA 12.2 adds "emulated" support for older architectures.
+// #if defined(DTKRT_VERSION) && (DTKRT_VERSION < 12020)
-__device__ __forceinline__ __nv_bfloat16 operator+(
-    const __nv_bfloat16& lh, const __nv_bfloat16& rh) {
-  return __nv_bfloat16(float(lh) + float(rh));  // NOLINT
-}
-__device__ __forceinline__ __nv_bfloat16 operator-(
-    const __nv_bfloat16& lh, const __nv_bfloat16& rh) {
-  return __nv_bfloat16(float(lh) - float(rh));  // NOLINT
-}
-__device__ __forceinline__ __nv_bfloat16 operator*(
-    const __nv_bfloat16& lh, const __nv_bfloat16& rh) {
-  return __nv_bfloat16(float(lh) * float(rh));  // NOLINT
-}
-__device__ __forceinline__ __nv_bfloat16 operator/(
-    const __nv_bfloat16& lh, const __nv_bfloat16& rh) {
-  return __nv_bfloat16(float(lh) / float(rh));  // NOLINT
-}
+__device__ __forceinline__ __hip_bfloat16 operator+(
+    const __hip_bfloat16& lh, const __hip_bfloat16& rh) {
+  return __hip_bfloat16(float(lh) + float(rh));  // NOLINT
+}
+__device__ __forceinline__ __hip_bfloat16 operator-(
+    const __hip_bfloat16& lh, const __hip_bfloat16& rh) {
+  return __hip_bfloat16(float(lh) - float(rh));  // NOLINT
+}
+__device__ __forceinline__ __hip_bfloat16 operator*(
+    const __hip_bfloat16& lh, const __hip_bfloat16& rh) {
+  return __hip_bfloat16(float(lh) * float(rh));  // NOLINT
+}
+__device__ __forceinline__ __hip_bfloat16 operator/(
+    const __hip_bfloat16& lh, const __hip_bfloat16& rh) {
+  return __hip_bfloat16(float(lh) / float(rh));  // NOLINT
+}
-__device__ __forceinline__ __nv_bfloat16& operator+=(
-    __nv_bfloat16& lh, const __nv_bfloat16& rh) {  // NOLINT
-  lh = __nv_bfloat16(float(lh) + float(rh));  // NOLINT
+__device__ __forceinline__ __hip_bfloat16& operator+=(
+    __hip_bfloat16& lh, const __hip_bfloat16& rh) {  // NOLINT
+  lh = __hip_bfloat16(float(lh) + float(rh));  // NOLINT
   return lh;
 }
-__device__ __forceinline__ __nv_bfloat16& operator-=(
-    __nv_bfloat16& lh, const __nv_bfloat16& rh) {  // NOLINT
-  lh = __nv_bfloat16(float(lh) - float(rh));  // NOLINT
+__device__ __forceinline__ __hip_bfloat16& operator-=(
+    __hip_bfloat16& lh, const __hip_bfloat16& rh) {  // NOLINT
+  lh = __hip_bfloat16(float(lh) - float(rh));  // NOLINT
   return lh;
 }
-__device__ __forceinline__ __nv_bfloat16& operator*=(
-    __nv_bfloat16& lh, const __nv_bfloat16& rh) {  // NOLINT
-  lh = __nv_bfloat16(float(lh) * float(rh));  // NOLINT
+__device__ __forceinline__ __hip_bfloat16& operator*=(
+    __hip_bfloat16& lh, const __hip_bfloat16& rh) {  // NOLINT
+  lh = __hip_bfloat16(float(lh) * float(rh));  // NOLINT
   return lh;
 }
-__device__ __forceinline__ __nv_bfloat16& operator/=(
-    __nv_bfloat16& lh, const __nv_bfloat16& rh) {  // NOLINT
-  lh = __nv_bfloat16(float(lh) / float(rh));  // NOLINT
+__device__ __forceinline__ __hip_bfloat16& operator/=(
+    __hip_bfloat16& lh, const __hip_bfloat16& rh) {  // NOLINT
+  lh = __hip_bfloat16(float(lh) / float(rh));  // NOLINT
   return lh;
 }
-__device__ __forceinline__ __nv_bfloat16& operator++(__nv_bfloat16& h) {  // NOLINT
-  h = __nv_bfloat16(float(h) + 1.0f);  // NOLINT
+__device__ __forceinline__ __hip_bfloat16& operator++(__hip_bfloat16& h) {  // NOLINT
+  h = __hip_bfloat16(float(h) + 1.0f);  // NOLINT
   return h;
 }
-__device__ __forceinline__ __nv_bfloat16& operator--(__nv_bfloat16& h) {  // NOLINT
-  h = __nv_bfloat16(float(h) - 1.0f);  // NOLINT
+__device__ __forceinline__ __hip_bfloat16& operator--(__hip_bfloat16& h) {  // NOLINT
+  h = __hip_bfloat16(float(h) - 1.0f);  // NOLINT
   return h;
 }
-__device__ __forceinline__ __nv_bfloat16 operator++(__nv_bfloat16& h, int) {  // NOLINT
-  __nv_bfloat16 ret = h;
-  h = __nv_bfloat16(float(h) + 1.0f);  // NOLINT
+__device__ __forceinline__ __hip_bfloat16 operator++(__hip_bfloat16& h, int) {  // NOLINT
+  __hip_bfloat16 ret = h;
+  h = __hip_bfloat16(float(h) + 1.0f);  // NOLINT
   return ret;
 }
-__device__ __forceinline__ __nv_bfloat16 operator--(__nv_bfloat16& h, int) {  // NOLINT
-  __nv_bfloat16 ret = h;
-  h = __nv_bfloat16(float(h) - 1.0f);  // NOLINT
+__device__ __forceinline__ __hip_bfloat16 operator--(__hip_bfloat16& h, int) {  // NOLINT
+  __hip_bfloat16 ret = h;
+  h = __hip_bfloat16(float(h) - 1.0f);  // NOLINT
   return ret;
 }
-__device__ __forceinline__ __nv_bfloat16 operator+(const __nv_bfloat16& h) {
+__device__ __forceinline__ __hip_bfloat16 operator+(const __hip_bfloat16& h) {
   return h;
 }
-__device__ __forceinline__ __nv_bfloat16 operator-(const __nv_bfloat16& h) {
-  return __nv_bfloat16(-float(h));  // NOLINT
+__device__ __forceinline__ __hip_bfloat16 operator-(const __hip_bfloat16& h) {
+  return __hip_bfloat16(-float(h));  // NOLINT
 }
 __device__ __forceinline__ bool operator==(
-    const __nv_bfloat16& lh, const __nv_bfloat16& rh) {
+    const __hip_bfloat16& lh, const __hip_bfloat16& rh) {
   return float(lh) == float(rh);  // NOLINT
 }
 __device__ __forceinline__ bool operator!=(
-    const __nv_bfloat16& lh, const __nv_bfloat16& rh) {
+    const __hip_bfloat16& lh, const __hip_bfloat16& rh) {
   return float(lh) != float(rh);  // NOLINT
 }
 __device__ __forceinline__ bool operator>(
-    const __nv_bfloat16& lh, const __nv_bfloat16& rh) {
+    const __hip_bfloat16& lh, const __hip_bfloat16& rh) {
   return float(lh) > float(rh);  // NOLINT
 }
 __device__ __forceinline__ bool operator<(
-    const __nv_bfloat16& lh, const __nv_bfloat16& rh) {
+    const __hip_bfloat16& lh, const __hip_bfloat16& rh) {
   return float(lh) < float(rh);  // NOLINT
 }
 __device__ __forceinline__ bool operator>=(
-    const __nv_bfloat16& lh, const __nv_bfloat16& rh) {
+    const __hip_bfloat16& lh, const __hip_bfloat16& rh) {
   return float(lh) >= float(rh);  // NOLINT
 }
 __device__ __forceinline__ bool operator<=(
-    const __nv_bfloat16& lh, const __nv_bfloat16& rh) {
+    const __hip_bfloat16& lh, const __hip_bfloat16& rh) {
   return float(lh) <= float(rh);  // NOLINT
 }
-#endif  // defined(CUDART_VERSION) && (CUDART_VERSION < 12020)
-#endif  // defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800)
-#endif  // __CUDACC__
+// #endif  // defined(DTKRT_VERSION) && (DTKRT_VERSION < 12020)
+// #endif  // defined(__DTK_ARCH__) && (__DTK_ARCH__ < 800)
+__device__ inline __hip_bfloat16 __shfl_down(
+    __hip_bfloat16 var, unsigned int lane_delta, int width = warpSize) {
+  union {
+    unsigned short s;
+    __hip_bfloat16 us;
+  } tmp;
+  tmp.us = var;
+  tmp.s = __shfl_down(tmp.s, lane_delta, width);
+  return tmp.us;
+}
+#endif  // __HIPCC__
 #endif  // BF16_ENABLED
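The new __shfl_down overload reinterprets the 16-bit payload through a union so that warp shuffles work for __hip_bfloat16; together with the operator overloads above it lets generic warp-level reductions handle bfloat16 like any other type. A small sketch of such a use (the function below is illustrative, not part of the patch):

// Warp-level sum reduction over __hip_bfloat16, relying on the overloads above.
__device__ __forceinline__ __hip_bfloat16 WarpReduceSumBF16(__hip_bfloat16 v) {
  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
    v += __shfl_down(v, offset);  // bit-reinterpreting shuffle defined above
  }
  return v;
}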
...
src/array/cuda/coo2csr.cu → src/array/cuda/coo2csr.hip
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
  * Copyright (c) 2020 by Contributors
  * @file array/cuda/coo2csr.cc
  * @brief COO2CSR
  */
-#include <dgl/array.h>
+#include "../../../include/dgl/array.h"
 #include "../../runtime/cuda/cuda_common.h"
-#include "./utils.h"
+#include "utils.h"
 namespace dgl {
...
@@ -24,12 +28,12 @@ CSRMatrix COOToCSR(COOMatrix coo) {
 template <>
 CSRMatrix COOToCSR<kDGLCUDA, int32_t>(COOMatrix coo) {
   auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   // allocate cusparse handle if needed
   if (!thr_entry->cusparse_handle) {
-    CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
+    CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
   }
-  CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream));
+  CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream));
   bool row_sorted = coo.row_sorted;
   bool col_sorted = coo.col_sorted;
...
@@ -50,9 +54,9 @@ CSRMatrix COOToCSR<kDGLCUDA, int32_t>(COOMatrix coo) {
   NDArray indptr =
       aten::NewIdArray(coo.num_rows + 1, coo.row->ctx, coo.row->dtype.bits);
   int32_t* indptr_ptr = static_cast<int32_t*>(indptr->data);
-  CUSPARSE_CALL(cusparseXcoo2csr(
+  CUSPARSE_CALL(hipsparseXcoo2csr(
       thr_entry->cusparse_handle, coo.row.Ptr<int32_t>(), nnz, coo.num_rows,
-      indptr_ptr, CUSPARSE_INDEX_BASE_ZERO));
+      indptr_ptr, HIPSPARSE_INDEX_BASE_ZERO));
   return CSRMatrix(
       coo.num_rows, coo.num_cols, indptr, coo.col, coo.data, col_sorted);
...
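hipsparseXcoo2csr only compresses row-sorted COO row indices into a CSR row-pointer array of length num_rows + 1. A host-side sketch of the same computation, handy as a sanity check (the helper name is made up for illustration):

// rows = [0, 0, 1, 3] with num_rows = 4 yields indptr = [0, 2, 3, 3, 4].
#include <cstdint>
#include <vector>

std::vector<int32_t> CooRowsToCsrIndptr(const std::vector<int32_t>& rows,
                                        int32_t num_rows) {
  std::vector<int32_t> indptr(num_rows + 1, 0);
  for (int32_t r : rows) indptr[r + 1] += 1;   // count entries per row
  for (int32_t i = 0; i < num_rows; ++i)       // prefix-sum counts into offsets
    indptr[i + 1] += indptr[i];
  return indptr;
}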
@@ -100,7 +104,7 @@ template <>
 CSRMatrix COOToCSR<kDGLCUDA, int64_t>(COOMatrix coo) {
   const auto& ctx = coo.row->ctx;
   const auto nbits = coo.row->dtype.bits;
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   bool row_sorted = coo.row_sorted;
   bool col_sorted = coo.col_sorted;
   if (!row_sorted) {
...
src/array/cuda/coo_sort.cu → src/array/cuda/coo_sort.hip
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
  * Copyright (c) 2020 by Contributors
  * @file array/cuda/coo_sort.cc
  * @brief Sort COO index
  */
-#include <dgl/array.h>
+#include "../../../include/dgl/array.h"
 #include "../../c_api_common.h"
 #include "../../runtime/cuda/cuda_common.h"
-#include "./utils.h"
+#include "utils.h"
 namespace dgl {
...
@@ -65,7 +69,7 @@ __global__ void _COODecodeEdgesKernel(
 template <DGLDeviceType XPU, typename IdType>
 void COOSort_(COOMatrix* coo, bool sort_column) {
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   const int row_bits = cuda::_NumberOfBits(coo->num_rows);
   const int64_t nnz = coo->row->shape[0];
...
@@ -138,7 +142,7 @@ template <DGLDeviceType XPU, typename IdType>
 std::pair<bool, bool> COOIsSorted(COOMatrix coo) {
   const int64_t nnz = coo.row->shape[0];
   const auto& ctx = coo.row->ctx;
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   auto device = runtime::DeviceAPI::Get(ctx);
   // We allocate a workspace of 2*nnz bytes. It wastes a little bit memory but
   // should be fine.
...
src/array/cuda/csr2coo.cu → src/array/cuda/csr2coo.hip
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
  * Copyright (c) 2020 by Contributors
  * @file array/cuda/csr2coo.cc
...
@@ -8,10 +10,10 @@
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
 #include "../../runtime/cuda/cuda_common.h"
-#include "./utils.h"
+#include "utils.h"
 namespace dgl {
...
@@ -29,12 +31,12 @@ COOMatrix CSRToCOO(CSRMatrix csr) {
 template <>
 COOMatrix CSRToCOO<kDGLCUDA, int32_t>(CSRMatrix csr) {
   auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   // allocate cusparse handle if needed
   if (!thr_entry->cusparse_handle) {
-    CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
+    CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
   }
-  CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream));
+  CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream));
   NDArray indptr = csr.indptr, indices = csr.indices, data = csr.data;
   const int32_t* indptr_ptr = static_cast<int32_t*>(indptr->data);
...
@@ -42,9 +44,9 @@ COOMatrix CSRToCOO<kDGLCUDA, int32_t>(CSRMatrix csr) {
       aten::NewIdArray(indices->shape[0], indptr->ctx, indptr->dtype.bits);
   int32_t* row_ptr = static_cast<int32_t*>(row->data);
-  CUSPARSE_CALL(cusparseXcsr2coo(
+  CUSPARSE_CALL(hipsparseXcsr2coo(
       thr_entry->cusparse_handle, indptr_ptr, indices->shape[0], csr.num_rows,
-      row_ptr, CUSPARSE_INDEX_BASE_ZERO));
+      row_ptr, HIPSPARSE_INDEX_BASE_ZERO));
   return COOMatrix(
       csr.num_rows, csr.num_cols, row, indices, data, true, csr.sorted);
...
@@ -72,10 +74,40 @@ struct AdjacentDifference {
}
};
/*!
* \brief Repeat elements
* \param val Value to repeat
* \param repeats Number of repeats for each value
* \param pos The position of the output buffer to write the value.
* \param out Output buffer.
* \param length Number of values
*
* For example:
* val = [3, 0, 1]
* repeats = [1, 0, 2]
* pos = [0, 1, 1] # write to output buffer position 0, 1, 1
* then,
* out = [3, 1, 1]
*/
template <typename DType, typename IdType>
__global__ void _RepeatKernel(
const DType* val, const IdType* pos,
DType* out, int64_t n_row, int64_t length) {
IdType tx = static_cast<IdType>(blockIdx.x) * blockDim.x + threadIdx.x;
const int stride_x = gridDim.x * blockDim.x;
while (tx < length) {
IdType i = dgl::cuda::_UpperBound(pos, n_row, tx) - 1;
out[tx] = val[i];
tx += stride_x;
}
}
#if 0
template <>
COOMatrix CSRToCOO<kDGLCUDA, int64_t>(CSRMatrix csr) {
const auto& ctx = csr.indptr->ctx;
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const int64_t nnz = csr.indices->shape[0];
const auto nbits = csr.indptr->dtype.bits;
...
@@ -96,14 +128,14 @@ COOMatrix CSRToCOO<kDGLCUDA, int64_t>(CSRMatrix csr) {
     std::size_t temp_storage_bytes = 0;
     CUDA_CALL(cub::DeviceCopy::Batched(
         nullptr, temp_storage_bytes, input_buffer + i, output_buffer + i,
-        buffer_sizes + i, std::min(csr.num_rows - i, max_copy_at_once),
+        buffer_sizes + i, ::min(csr.num_rows - i, max_copy_at_once),
         stream));
     auto temp = allocator.alloc_unique<char>(temp_storage_bytes);
     CUDA_CALL(cub::DeviceCopy::Batched(
         temp.get(), temp_storage_bytes, input_buffer + i, output_buffer + i,
-        buffer_sizes + i, std::min(csr.num_rows - i, max_copy_at_once),
+        buffer_sizes + i, ::min(csr.num_rows - i, max_copy_at_once),
         stream));
   }
...
@@ -111,6 +143,30 @@ COOMatrix CSRToCOO<kDGLCUDA, int64_t>(CSRMatrix csr) {
csr.num_rows, csr.num_cols, ret_row, csr.indices, csr.data, true,
csr.sorted);
}
#else
template <>
COOMatrix CSRToCOO<kDGLCUDA, int64_t>(CSRMatrix csr) {
const auto& ctx = csr.indptr->ctx;
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const int64_t nnz = csr.indices->shape[0];
const auto nbits = csr.indptr->dtype.bits;
IdArray rowids = Range(0, csr.num_rows, nbits, ctx);
IdArray ret_row = NewIdArray(nnz, ctx, nbits);
const int nt = 256;
const int nb = (nnz + nt - 1) / nt;
CUDA_KERNEL_CALL(_RepeatKernel,
nb, nt, 0, stream,
rowids.Ptr<int64_t>(),
csr.indptr.Ptr<int64_t>(), ret_row.Ptr<int64_t>(),
csr.num_rows, nnz);
return COOMatrix(csr.num_rows, csr.num_cols,
ret_row, csr.indices, csr.data,
true, csr.sorted);
}
#endif
template COOMatrix CSRToCOO<kDGLCUDA, int32_t>(CSRMatrix csr);
template COOMatrix CSRToCOO<kDGLCUDA, int64_t>(CSRMatrix csr);
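The #else branch above replaces the cub::DeviceCopy::Batched path with _RepeatKernel, which expands indptr into one row id per nonzero via an upper-bound search. A host-side sketch of the same mapping (illustrative name, std::upper_bound standing in for dgl::cuda::_UpperBound):

// With indptr = [0, 2, 2, 5], the result is rows = [0, 0, 2, 2, 2].
#include <algorithm>
#include <cstdint>
#include <vector>

std::vector<int64_t> CsrIndptrToCooRows(const std::vector<int64_t>& indptr) {
  const int64_t num_rows = static_cast<int64_t>(indptr.size()) - 1;
  const int64_t nnz = indptr.back();
  std::vector<int64_t> rows(nnz);
  for (int64_t e = 0; e < nnz; ++e) {
    // Mirror of the _UpperBound(pos, n_row, tx) - 1 step in _RepeatKernel:
    // find the row whose [indptr[i], indptr[i + 1]) range contains position e.
    auto it = std::upper_bound(indptr.begin(), indptr.begin() + num_rows, e);
    rows[e] = static_cast<int64_t>(it - indptr.begin()) - 1;
  }
  return rows;
}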
...
@@ -128,12 +184,12 @@ COOMatrix CSRToCOODataAsOrder<kDGLCUDA, int32_t>(CSRMatrix csr) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
auto device = runtime::DeviceAPI::Get(coo.row->ctx);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   // allocate cusparse handle if needed
   if (!thr_entry->cusparse_handle) {
-    CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
+    CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
   }
-  CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream));
+  CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream));
   NDArray row = coo.row, col = coo.col, data = coo.data;
   int32_t* row_ptr = static_cast<int32_t*>(row->data);
...
@@ -141,11 +197,11 @@ COOMatrix CSRToCOODataAsOrder<kDGLCUDA, int32_t>(CSRMatrix csr) {
int32_t* data_ptr = static_cast<int32_t*>(data->data);
size_t workspace_size = 0;
-  CUSPARSE_CALL(cusparseXcoosort_bufferSizeExt(
+  CUSPARSE_CALL(hipsparseXcoosort_bufferSizeExt(
      thr_entry->cusparse_handle, coo.num_rows, coo.num_cols, row->shape[0],
      data_ptr, row_ptr, &workspace_size));
   void* workspace = device->AllocWorkspace(row->ctx, workspace_size);
-  CUSPARSE_CALL(cusparseXcoosortByRow(
+  CUSPARSE_CALL(hipsparseXcoosortByRow(
      thr_entry->cusparse_handle, coo.num_rows, coo.num_cols, row->shape[0],
      data_ptr, row_ptr, col_ptr, workspace));
   device->FreeWorkspace(row->ctx, workspace);
...
src/array/cuda/csr_get_data.cu → src/array/cuda/csr_get_data.hip
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
  * Copyright (c) 2021 by Contributors
  * @file array/cuda/csr_get_data.cu
  * @brief Retrieve entries of a CSR matrix
  */
-#include <dgl/array.h>
+#include "../../../include/dgl/array.h"
 #include <numeric>
 #include <unordered_set>
 #include <vector>
 #include "../../runtime/cuda/cuda_common.h"
-#include "./utils.h"
+#include "utils.h"
 namespace dgl {
...
@@ -32,11 +36,11 @@ NDArray CSRGetData(
const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1;
const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1;
-  const int64_t rstlen = std::max(rowlen, collen);
+  const int64_t rstlen = ::max(rowlen, collen);
   IdArray rst = NDArray::Empty({rstlen}, weights->dtype, rows->ctx);
   if (rstlen == 0) return rst;
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   const int nt = cuda::FindNumThreads(rstlen);
   const int nb = (rstlen + nt - 1) / nt;
   if (return_eids)
...
@@ -67,12 +71,12 @@ template NDArray CSRGetData<kDGLCUDA, int64_t, __half>(
     CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids,
     NDArray weights, __half filler);
 #if BF16_ENABLED
-template NDArray CSRGetData<kDGLCUDA, int32_t, __nv_bfloat16>(
+template NDArray CSRGetData<kDGLCUDA, int32_t, __hip_bfloat16>(
     CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids,
-    NDArray weights, __nv_bfloat16 filler);
-template NDArray CSRGetData<kDGLCUDA, int64_t, __nv_bfloat16>(
+    NDArray weights, __hip_bfloat16 filler);
+template NDArray CSRGetData<kDGLCUDA, int64_t, __hip_bfloat16>(
     CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids,
-    NDArray weights, __nv_bfloat16 filler);
+    NDArray weights, __hip_bfloat16 filler);
 #endif  // BF16_ENABLED
 template NDArray CSRGetData<kDGLCUDA, int32_t, float>(
     CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids,
...
src/array/cuda/csr_mm.cu → src/array/cuda/csr_mm.hip
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
  * Copyright (c) 2020 by Contributors
  * @file array/cuda/csr_mm.cu
  * @brief SpSpMM/SpGEMM C APIs and definitions.
  */
-#include <dgl/array.h>
+#include "../../../include/dgl/array.h"
 #include <dgl/runtime/device_api.h>
 #include <limits>
 #include "../../runtime/cuda/cuda_common.h"
-#include "./cusparse_dispatcher.cuh"
-#include "./functor.cuh"
+#include "cusparse_dispatcher.cuh"
+#include "functor.cuh"
 namespace dgl {
 using namespace dgl::runtime;
...
@@ -18,7 +22,7 @@ using namespace dgl::runtime;
 namespace aten {
 namespace cusparse {
-#if CUDART_VERSION >= 12000
+#if DTKRT_VERSION >= 12000
 /** @brief Cusparse implementation of SpGEMM on Csr format for CUDA 12.0+ */
 template <typename DType, typename IdType>
...
@@ -31,74 +35,74 @@ std::pair<CSRMatrix, NDArray> CusparseSpgemm(
   const int nnzB = B.indices->shape[0];
   const DType alpha = 1.0;
   const DType beta = 0.0;
-  auto transA = CUSPARSE_OPERATION_NON_TRANSPOSE;
-  auto transB = CUSPARSE_OPERATION_NON_TRANSPOSE;
+  auto transA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
+  auto transB = HIPSPARSE_OPERATION_NON_TRANSPOSE;
   // device
   auto ctx = A.indptr->ctx;
   auto device = runtime::DeviceAPI::Get(ctx);
   auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   const DType* A_weights = A_weights_array.Ptr<DType>();
   const DType* B_weights = B_weights_array.Ptr<DType>();
   // allocate cusparse handle if needed
   if (!thr_entry->cusparse_handle) {
-    CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
+    CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
   }
-  CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream));
+  CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream));
   // all one data array
-  cusparseSpMatDescr_t matA, matB, matC;
+  hipsparseSpMatDescr_t matA, matB, matC;
   IdArray dC_csrOffsets =
       IdArray::Empty({A.num_rows + 1}, A.indptr->dtype, A.indptr->ctx);
   IdType* dC_csrOffsets_data = dC_csrOffsets.Ptr<IdType>();
   constexpr auto idtype = cusparse_idtype<IdType>::value;
   constexpr auto dtype = cuda_dtype<DType>::value;
   // Create sparse matrix A, B and C in CSR format
-  CUSPARSE_CALL(cusparseCreateCsr(
+  CUSPARSE_CALL(hipsparseCreateCsr(
       &matA, A.num_rows, A.num_cols, nnzA, A.indptr.Ptr<IdType>(),
       A.indices.Ptr<IdType>(),
-      // cusparseCreateCsr only accepts non-const pointers.
-      const_cast<DType*>(A_weights), idtype, idtype, CUSPARSE_INDEX_BASE_ZERO,
+      // hipsparseCreateCsr only accepts non-const pointers.
+      const_cast<DType*>(A_weights), idtype, idtype, HIPSPARSE_INDEX_BASE_ZERO,
       dtype));
-  CUSPARSE_CALL(cusparseCreateCsr(
+  CUSPARSE_CALL(hipsparseCreateCsr(
       &matB, B.num_rows, B.num_cols, nnzB, B.indptr.Ptr<IdType>(),
       B.indices.Ptr<IdType>(),
-      // cusparseCreateCsr only accepts non-const pointers.
-      const_cast<DType*>(B_weights), idtype, idtype, CUSPARSE_INDEX_BASE_ZERO,
+      // hipsparseCreateCsr only accepts non-const pointers.
+      const_cast<DType*>(B_weights), idtype, idtype, HIPSPARSE_INDEX_BASE_ZERO,
       dtype));
-  CUSPARSE_CALL(cusparseCreateCsr(
+  CUSPARSE_CALL(hipsparseCreateCsr(
       &matC, A.num_rows, B.num_cols, 0, dC_csrOffsets_data, nullptr, nullptr,
-      idtype, idtype, CUSPARSE_INDEX_BASE_ZERO, dtype));
+      idtype, idtype, HIPSPARSE_INDEX_BASE_ZERO, dtype));
   // SpGEMM Computation
-  cusparseSpGEMMDescr_t spgemmDesc;
-  cusparseSpGEMMAlg_t alg = CUSPARSE_SPGEMM_DEFAULT;
-  CUSPARSE_CALL(cusparseSpGEMM_createDescr(&spgemmDesc));
+  hipsparseSpGEMMDescr_t spgemmDesc;
+  cusparseSpGEMMAlg_t alg = HIPSPARSE_SPGEMM_DEFAULT;
+  CUSPARSE_CALL(hipsparseSpGEMM_createDescr(&spgemmDesc));
   size_t workspace_size1 = 0, workspace_size2 = 0, workspace_size3 = 0;
   // ask bufferSize1 bytes for external memory
-  CUSPARSE_CALL(cusparseSpGEMM_workEstimation(
+  CUSPARSE_CALL(hipsparseSpGEMM_workEstimation(
       thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
       matC, dtype, alg, spgemmDesc, &workspace_size1, NULL));
   void* workspace1 = (device->AllocWorkspace(ctx, workspace_size1));
   // inspect the matrices A and B to understand the memory requiremnent
-  cusparseStatus_t e = cusparseSpGEMM_workEstimation(
+  hipsparseStatus_t e = hipsparseSpGEMM_workEstimation(
       thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
       matC, dtype, alg, spgemmDesc, &workspace_size1, workspace1);
-  // CUSPARSE_SPGEMM_DEFAULT not support getting num_prods > 2^31 -1
+  // HIPSPARSE_SPGEMM_DEFAULT not support getting num_prods > 2^31 -1
   // and throws insufficient memory error within workEstimation call
   if (e == CUSPARSE_STATUS_INSUFFICIENT_RESOURCES) {
     // fall back to ALG2 to estimate num_prods
     alg = CUSPARSE_SPGEMM_ALG2;
     device->FreeWorkspace(ctx, workspace1);
-    // rerun cusparseSpGEMM_workEstimation
-    CUSPARSE_CALL(cusparseSpGEMM_workEstimation(
+    // rerun hipsparseSpGEMM_workEstimation
+    CUSPARSE_CALL(hipsparseSpGEMM_workEstimation(
        thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
        matC, dtype, alg, spgemmDesc, &workspace_size1, NULL));
     workspace1 = (device->AllocWorkspace(ctx, workspace_size1));
-    CUSPARSE_CALL(cusparseSpGEMM_workEstimation(
+    CUSPARSE_CALL(hipsparseSpGEMM_workEstimation(
        thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
        matC, dtype, alg, spgemmDesc, &workspace_size1, workspace1));
   } else {
-    CHECK(e == CUSPARSE_STATUS_SUCCESS) << "CUSPARSE ERROR in SpGEMM: " << e;
+    CHECK(e == HIPSPARSE_STATUS_SUCCESS) << "CUSPARSE ERROR in SpGEMM: " << e;
   }
   // get the number of intermediate products required for SpGEMM compute
...
@@ -113,22 +117,22 @@ std::pair<CSRMatrix, NDArray> CusparseSpgemm(
   int64_t LARGE_NUM_PRODUCTS = 800000000;  // 800*1000*1000;
   // switch to ALG2/ALG3 for medium & large problem size
-  if (alg == CUSPARSE_SPGEMM_DEFAULT && num_prods > MEDIUM_NUM_PRODUCTS) {
+  if (alg == HIPSPARSE_SPGEMM_DEFAULT && num_prods > MEDIUM_NUM_PRODUCTS) {
     // use ALG3 for very large problem
     alg = num_prods > LARGE_NUM_PRODUCTS ? CUSPARSE_SPGEMM_ALG3
                                          : CUSPARSE_SPGEMM_ALG2;
     device->FreeWorkspace(ctx, workspace1);
-    // rerun cusparseSpGEMM_workEstimation
-    CUSPARSE_CALL(cusparseSpGEMM_workEstimation(
+    // rerun hipsparseSpGEMM_workEstimation
+    CUSPARSE_CALL(hipsparseSpGEMM_workEstimation(
        thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
        matC, dtype, alg, spgemmDesc, &workspace_size1, NULL));
     workspace1 = (device->AllocWorkspace(ctx, workspace_size1));
-    CUSPARSE_CALL(cusparseSpGEMM_workEstimation(
+    CUSPARSE_CALL(hipsparseSpGEMM_workEstimation(
        thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
        matC, dtype, alg, spgemmDesc, &workspace_size1, workspace1));
   } else if (alg == CUSPARSE_SPGEMM_ALG2 && num_prods > LARGE_NUM_PRODUCTS) {
-    // no need to rerun cusparseSpGEMM_workEstimation between ALG2 and ALG3
+    // no need to rerun hipsparseSpGEMM_workEstimation between ALG2 and ALG3
     alg = CUSPARSE_SPGEMM_ALG3;
   }
...
@@ -147,40 +151,40 @@ std::pair<CSRMatrix, NDArray> CusparseSpgemm(
         workspace3, &workspace_size2));
     device->FreeWorkspace(ctx, workspace3);
   } else {
-    CUSPARSE_CALL(cusparseSpGEMM_compute(
+    CUSPARSE_CALL(hipsparseSpGEMM_compute(
        thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
        matC, dtype, alg, spgemmDesc, &workspace_size2, NULL));
   }
   // ask bufferSize2 bytes for external memory
   void* workspace2 = device->AllocWorkspace(ctx, workspace_size2);
   // compute the intermediate product of A * B
-  CUSPARSE_CALL(cusparseSpGEMM_compute(
+  CUSPARSE_CALL(hipsparseSpGEMM_compute(
      thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
      matC, dtype, alg, spgemmDesc, &workspace_size2, workspace2));
   // get matrix C non-zero entries C_nnz1
   int64_t C_num_rows1, C_num_cols1, C_nnz1;
   CUSPARSE_CALL(
-      cusparseSpMatGetSize(matC, &C_num_rows1, &C_num_cols1, &C_nnz1));
+      hipsparseSpMatGetSize(matC, &C_num_rows1, &C_num_cols1, &C_nnz1));
   IdArray dC_columns = IdArray::Empty({C_nnz1}, A.indptr->dtype, A.indptr->ctx);
   NDArray dC_weights =
       NDArray::Empty({C_nnz1}, A_weights_array->dtype, A.indptr->ctx);
   IdType* dC_columns_data = dC_columns.Ptr<IdType>();
   DType* dC_weights_data = dC_weights.Ptr<DType>();
   // update matC with the new pointers
-  CUSPARSE_CALL(cusparseCsrSetPointers(
+  CUSPARSE_CALL(hipsparseCsrSetPointers(
      matC, dC_csrOffsets_data, dC_columns_data, dC_weights_data));
   // copy the final products to the matrix C
-  CUSPARSE_CALL(cusparseSpGEMM_copy(
+  CUSPARSE_CALL(hipsparseSpGEMM_copy(
      thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
      matC, dtype, alg, spgemmDesc));
   device->FreeWorkspace(ctx, workspace1);
   device->FreeWorkspace(ctx, workspace2);
   // destroy matrix/vector descriptors
-  CUSPARSE_CALL(cusparseSpGEMM_destroyDescr(spgemmDesc));
-  CUSPARSE_CALL(cusparseDestroySpMat(matA));
-  CUSPARSE_CALL(cusparseDestroySpMat(matB));
-  CUSPARSE_CALL(cusparseDestroySpMat(matC));
+  CUSPARSE_CALL(hipsparseSpGEMM_destroyDescr(spgemmDesc));
+  CUSPARSE_CALL(hipsparseDestroySpMat(matA));
+  CUSPARSE_CALL(hipsparseDestroySpMat(matB));
+  CUSPARSE_CALL(hipsparseDestroySpMat(matC));
   return {
       CSRMatrix(
           A.num_rows, B.num_cols, dC_csrOffsets, dC_columns,
...
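For orientation, the hipSPARSE generic SpGEMM sequence this hunk follows is, once the ALG2/ALG3 fallbacks and DGL workspace handling are stripped out, roughly the outline below. Descriptor creation for A, B, C and error checking are assumed to be done by the caller, as in the patch, and buffer handling is reduced to plain hipMalloc; treat it as a sketch of the call ordering, not a drop-in routine.

#include <hip/hip_runtime.h>
#include <hipsparse/hipsparse.h>

// C = A * B in CSR: work estimation, compute, size query, pointer binding, copy.
void SpgemmOutline(hipsparseHandle_t handle, hipsparseSpMatDescr_t matA,
                   hipsparseSpMatDescr_t matB, hipsparseSpMatDescr_t matC,
                   int* dC_csrOffsets, int* dC_columns, float* dC_values) {
  const float alpha = 1.0f, beta = 0.0f;
  const hipsparseOperation_t op = HIPSPARSE_OPERATION_NON_TRANSPOSE;
  const hipsparseSpGEMMAlg_t alg = HIPSPARSE_SPGEMM_DEFAULT;
  const hipDataType dtype = HIP_R_32F;

  hipsparseSpGEMMDescr_t desc;
  hipsparseSpGEMM_createDescr(&desc);

  // 1. Work estimation: first call sizes the buffer, second inspects A and B.
  size_t buf1_size = 0;
  hipsparseSpGEMM_workEstimation(handle, op, op, &alpha, matA, matB, &beta,
                                 matC, dtype, alg, desc, &buf1_size, nullptr);
  void* buf1 = nullptr;
  hipMalloc(&buf1, buf1_size);
  hipsparseSpGEMM_workEstimation(handle, op, op, &alpha, matA, matB, &beta,
                                 matC, dtype, alg, desc, &buf1_size, buf1);

  // 2. Compute the intermediate products: size query, then the actual compute.
  size_t buf2_size = 0;
  hipsparseSpGEMM_compute(handle, op, op, &alpha, matA, matB, &beta,
                          matC, dtype, alg, desc, &buf2_size, nullptr);
  void* buf2 = nullptr;
  hipMalloc(&buf2, buf2_size);
  hipsparseSpGEMM_compute(handle, op, op, &alpha, matA, matB, &beta,
                          matC, dtype, alg, desc, &buf2_size, buf2);

  // 3. Query nnz(C) and bind caller-allocated output arrays to matC.
  int64_t rowsC = 0, colsC = 0, nnzC = 0;
  hipsparseSpMatGetSize(matC, &rowsC, &colsC, &nnzC);
  hipsparseCsrSetPointers(matC, dC_csrOffsets, dC_columns, dC_values);

  // 4. Copy the final product into C and release resources.
  hipsparseSpGEMM_copy(handle, op, op, &alpha, matA, matB, &beta,
                       matC, dtype, alg, desc);
  hipsparseSpGEMM_destroyDescr(desc);
  hipFree(buf1);
  hipFree(buf2);
}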
@@ -188,7 +192,7 @@ std::pair<CSRMatrix, NDArray> CusparseSpgemm(
       dC_weights};
 }
-#else  // CUDART_VERSION < 12000
+#else  // DTKRT_VERSION < 12000
 /** @brief Cusparse implementation of SpGEMM on Csr format for older CUDA
  * versions */
...
@@ -208,25 +212,25 @@ std::pair<CSRMatrix, NDArray> CusparseSpgemm(
   auto ctx = A.indptr->ctx;
   auto device = runtime::DeviceAPI::Get(ctx);
   auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   auto idtype = A.indptr->dtype;
   auto dtype = A_weights_array->dtype;
   const DType* A_weights = A_weights_array.Ptr<DType>();
   const DType* B_weights = B_weights_array.Ptr<DType>();
   if (!thr_entry->cusparse_handle) {
-    CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
+    CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
   }
-  CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream));
-  CUSPARSE_CALL(cusparseSetPointerMode(
-      thr_entry->cusparse_handle, CUSPARSE_POINTER_MODE_HOST));
-  CUSPARSE_CALL(cusparseCreateCsrgemm2Info(&info));
-  cusparseMatDescr_t matA, matB, matC, matD;
-  CUSPARSE_CALL(cusparseCreateMatDescr(&matA));
-  CUSPARSE_CALL(cusparseCreateMatDescr(&matB));
-  CUSPARSE_CALL(cusparseCreateMatDescr(&matC));
-  CUSPARSE_CALL(cusparseCreateMatDescr(&matD));  // needed even if D is null
+  CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream));
+  CUSPARSE_CALL(hipsparseSetPointerMode(
+      thr_entry->cusparse_handle, HIPSPARSE_POINTER_MODE_HOST));
+  CUSPARSE_CALL(hipsparseCreateCsrgemm2Info(&info));
+  hipsparseMatDescr_t matA, matB, matC, matD;
+  CUSPARSE_CALL(hipsparseCreateMatDescr(&matA));
+  CUSPARSE_CALL(hipsparseCreateMatDescr(&matB));
+  CUSPARSE_CALL(hipsparseCreateMatDescr(&matC));
+  CUSPARSE_CALL(hipsparseCreateMatDescr(&matD));  // needed even if D is null
   CUSPARSE_CALL(CSRGEMM<DType>::bufferSizeExt(
       thr_entry->cusparse_handle, m, n, k, &alpha, matA, nnzA,
...
@@ -252,11 +256,11 @@ std::pair<CSRMatrix, NDArray> CusparseSpgemm(
       C_indptr.Ptr<IdType>(), C_indices.Ptr<IdType>(), info, workspace));
   device->FreeWorkspace(ctx, workspace);
-  CUSPARSE_CALL(cusparseDestroyCsrgemm2Info(info));
-  CUSPARSE_CALL(cusparseDestroyMatDescr(matA));
-  CUSPARSE_CALL(cusparseDestroyMatDescr(matB));
-  CUSPARSE_CALL(cusparseDestroyMatDescr(matC));
-  CUSPARSE_CALL(cusparseDestroyMatDescr(matD));
+  CUSPARSE_CALL(hipsparseDestroyCsrgemm2Info(info));
+  CUSPARSE_CALL(hipsparseDestroyMatDescr(matA));
+  CUSPARSE_CALL(hipsparseDestroyMatDescr(matB));
+  CUSPARSE_CALL(hipsparseDestroyMatDescr(matC));
+  CUSPARSE_CALL(hipsparseDestroyMatDescr(matD));
   return {
       CSRMatrix(
...
@@ -264,7 +268,7 @@ std::pair<CSRMatrix, NDArray> CusparseSpgemm(
       C_weights};
 }
-#endif  // CUDART_VERSION >= 12000
+#endif  // DTKRT_VERSION >= 12000
 }  // namespace cusparse
 template <int XPU, typename IdType, typename DType>
...
@@ -314,9 +318,9 @@ template std::pair<CSRMatrix, NDArray> CSRMM<kDGLCUDA, int32_t, __half>(
 template std::pair<CSRMatrix, NDArray> CSRMM<kDGLCUDA, int64_t, __half>(
     const CSRMatrix&, NDArray, const CSRMatrix&, NDArray);
 #if BF16_ENABLED
-template std::pair<CSRMatrix, NDArray> CSRMM<kDGLCUDA, int32_t, __nv_bfloat16>(
+template std::pair<CSRMatrix, NDArray> CSRMM<kDGLCUDA, int32_t, __hip_bfloat16>(
     const CSRMatrix&, NDArray, const CSRMatrix&, NDArray);
-template std::pair<CSRMatrix, NDArray> CSRMM<kDGLCUDA, int64_t, __nv_bfloat16>(
+template std::pair<CSRMatrix, NDArray> CSRMM<kDGLCUDA, int64_t, __hip_bfloat16>(
     const CSRMatrix&, NDArray, const CSRMatrix&, NDArray);
 #endif  // BF16_ENABLED
 template std::pair<CSRMatrix, NDArray> CSRMM<kDGLCUDA, int32_t, float>(
...
src/array/cuda/csr_sort.cu → src/array/cuda/csr_sort.hip
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
  * Copyright (c) 2020 by Contributors
  * @file array/cuda/csr_sort.cc
...
@@ -5,10 +7,10 @@
  */
 #include <dgl/array.h>
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
 #include "../../runtime/cuda/cuda_common.h"
-#include "./utils.h"
+#include "utils.h"
 namespace dgl {
...
@@ -39,7 +41,7 @@ __global__ void _SegmentIsSorted(
 template <DGLDeviceType XPU, typename IdType>
 bool CSRIsSorted(CSRMatrix csr) {
   const auto& ctx = csr.indptr->ctx;
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   auto device = runtime::DeviceAPI::Get(ctx);
   // We allocate a workspace of num_rows bytes. It wastes a little bit memory
   // but should be fine.
...
@@ -67,12 +69,12 @@ template <>
 void CSRSort_<kDGLCUDA, int32_t>(CSRMatrix* csr) {
   auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
   auto device = runtime::DeviceAPI::Get(csr->indptr->ctx);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   // allocate cusparse handle if needed
   if (!thr_entry->cusparse_handle) {
-    CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
+    CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
   }
-  CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream));
+  CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream));
   NDArray indptr = csr->indptr;
   NDArray indices = csr->indices;
...
@@ -83,16 +85,16 @@ void CSRSort_<kDGLCUDA, int32_t>(CSRMatrix* csr) {
   NDArray data = csr->data;
   size_t workspace_size = 0;
-  CUSPARSE_CALL(cusparseXcsrsort_bufferSizeExt(
+  CUSPARSE_CALL(hipsparseXcsrsort_bufferSizeExt(
      thr_entry->cusparse_handle, csr->num_rows, csr->num_cols, nnz,
      indptr.Ptr<int32_t>(), indices.Ptr<int32_t>(), &workspace_size));
   void* workspace = device->AllocWorkspace(ctx, workspace_size);
-  cusparseMatDescr_t descr;
-  CUSPARSE_CALL(cusparseCreateMatDescr(&descr));
-  CUSPARSE_CALL(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL));
-  CUSPARSE_CALL(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO));
-  CUSPARSE_CALL(cusparseXcsrsort(
+  hipsparseMatDescr_t descr;
+  CUSPARSE_CALL(hipsparseCreateMatDescr(&descr));
+  CUSPARSE_CALL(hipsparseSetMatType(descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
+  CUSPARSE_CALL(hipsparseSetMatIndexBase(descr, HIPSPARSE_INDEX_BASE_ZERO));
+  CUSPARSE_CALL(hipsparseXcsrsort(
      thr_entry->cusparse_handle, csr->num_rows, csr->num_cols, nnz, descr,
      indptr.Ptr<int32_t>(), indices.Ptr<int32_t>(), data.Ptr<int32_t>(),
      workspace));
...
@@ -100,13 +102,13 @@ void CSRSort_<kDGLCUDA, int32_t>(CSRMatrix* csr) {
   csr->sorted = true;
   // free resources
-  CUSPARSE_CALL(cusparseDestroyMatDescr(descr));
+  CUSPARSE_CALL(hipsparseDestroyMatDescr(descr));
   device->FreeWorkspace(ctx, workspace);
 }
 template <>
 void CSRSort_<kDGLCUDA, int64_t>(CSRMatrix* csr) {
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   auto device = runtime::DeviceAPI::Get(csr->indptr->ctx);
   const auto& ctx = csr->indptr->ctx;
...
@@ -125,13 +127,13 @@ void CSRSort_<kDGLCUDA, int64_t>(CSRMatrix* csr) {
   // Allocate workspace
   size_t workspace_size = 0;
-  CUDA_CALL(cub::DeviceSegmentedRadixSort::SortPairs(
+  CUDA_CALL(hipcub::DeviceSegmentedRadixSort::SortPairs(
      nullptr, workspace_size, key_in, key_out, value_in, value_out, nnz,
      csr->num_rows, offsets, offsets + 1, 0, sizeof(int64_t) * 8, stream));
   void* workspace = device->AllocWorkspace(ctx, workspace_size);
   // Compute
-  CUDA_CALL(cub::DeviceSegmentedRadixSort::SortPairs(
+  CUDA_CALL(hipcub::DeviceSegmentedRadixSort::SortPairs(
      workspace, workspace_size, key_in, key_out, value_in, value_out, nnz,
      csr->num_rows, offsets, offsets + 1, 0, sizeof(int64_t) * 8, stream));
...
src/array/cuda/csr_sum.cu → src/array/cuda/csr_sum.hip
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
  * Copyright (c) 2020 by Contributors
  * @file array/cuda/spmm.cu
...
@@ -7,8 +9,8 @@
 #include <dgl/runtime/device_api.h>
 #include "../../runtime/cuda/cuda_common.h"
-#include "./cusparse_dispatcher.cuh"
-#include "./functor.cuh"
+#include "cusparse_dispatcher.cuh"
+#include "functor.cuh"
 namespace dgl {
...
@@ -32,21 +34,21 @@ std::pair<CSRMatrix, NDArray> CusparseCsrgeam2(
   auto ctx = A.indptr->ctx;
   auto device = runtime::DeviceAPI::Get(ctx);
   auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   const DType* A_weights = A_weights_array.Ptr<DType>();
   const DType* B_weights = B_weights_array.Ptr<DType>();
   // allocate cusparse handle if needed
   if (!thr_entry->cusparse_handle)
-    CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
-  CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream));
+    CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
+  CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream));
-  cusparseMatDescr_t matA, matB, matC;
-  CUSPARSE_CALL(cusparseCreateMatDescr(&matA));
-  CUSPARSE_CALL(cusparseCreateMatDescr(&matB));
-  CUSPARSE_CALL(cusparseCreateMatDescr(&matC));
+  hipsparseMatDescr_t matA, matB, matC;
+  CUSPARSE_CALL(hipsparseCreateMatDescr(&matA));
+  CUSPARSE_CALL(hipsparseCreateMatDescr(&matB));
+  CUSPARSE_CALL(hipsparseCreateMatDescr(&matC));
-  cusparseSetPointerMode(
-      thr_entry->cusparse_handle, CUSPARSE_POINTER_MODE_HOST);
+  hipsparseSetPointerMode(
+      thr_entry->cusparse_handle, HIPSPARSE_POINTER_MODE_HOST);
   size_t workspace_size = 0;
   /* prepare output C */
   IdArray dC_csrOffsets = IdArray::Empty({m + 1}, A.indptr->dtype, ctx);
...
@@ -81,9 +83,9 @@ std::pair<CSRMatrix, NDArray> CusparseCsrgeam2(
   device->FreeWorkspace(ctx, workspace);
   // destroy matrix/vector descriptors
-  CUSPARSE_CALL(cusparseDestroyMatDescr(matA));
-  CUSPARSE_CALL(cusparseDestroyMatDescr(matB));
-  CUSPARSE_CALL(cusparseDestroyMatDescr(matC));
+  CUSPARSE_CALL(hipsparseDestroyMatDescr(matA));
+  CUSPARSE_CALL(hipsparseDestroyMatDescr(matB));
+  CUSPARSE_CALL(hipsparseDestroyMatDescr(matC));
   return {
       CSRMatrix(
           A.num_rows, A.num_cols, dC_csrOffsets, dC_columns,
...
@@ -159,9 +161,9 @@ template std::pair<CSRMatrix, NDArray> CSRSum<kDGLCUDA, int32_t, __half>(
 template std::pair<CSRMatrix, NDArray> CSRSum<kDGLCUDA, int64_t, __half>(
     const std::vector<CSRMatrix>&, const std::vector<NDArray>&);
 #if BF16_ENABLED
-template std::pair<CSRMatrix, NDArray> CSRSum<kDGLCUDA, int32_t, __nv_bfloat16>(
+template std::pair<CSRMatrix, NDArray> CSRSum<kDGLCUDA, int32_t, __hip_bfloat16>(
     const std::vector<CSRMatrix>&, const std::vector<NDArray>&);
-template std::pair<CSRMatrix, NDArray> CSRSum<kDGLCUDA, int64_t, __nv_bfloat16>(
+template std::pair<CSRMatrix, NDArray> CSRSum<kDGLCUDA, int64_t, __hip_bfloat16>(
     const std::vector<CSRMatrix>&, const std::vector<NDArray>&);
 #endif  // BF16_ENABLED
 template std::pair<CSRMatrix, NDArray> CSRSum<kDGLCUDA, int32_t, float>(
...
src/array/cuda/csr_transpose.cc
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
  * Copyright (c) 2020 by Contributors
  * @file array/cuda/csr_transpose.cc
...
@@ -23,12 +25,12 @@ CSRMatrix CSRTranspose(CSRMatrix csr) {
 template <>
 CSRMatrix CSRTranspose<kDGLCUDA, int32_t>(CSRMatrix csr) {
   auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   // allocate cusparse handle if needed
   if (!thr_entry->cusparse_handle) {
-    CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
+    CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
   }
-  CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream));
+  CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream));
   NDArray indptr = csr.indptr, indices = csr.indices, data = csr.data;
   const int64_t nnz = indices->shape[0];
...
@@ -48,30 +50,30 @@ CSRMatrix CSRTranspose<kDGLCUDA, int32_t>(CSRMatrix csr) {
   int32_t* t_indices_ptr = static_cast<int32_t*>(t_indices->data);
   void* t_data_ptr = t_data->data;
-#if CUDART_VERSION >= 10010
+#if DTKRT_VERSION >= 10010
   auto device = runtime::DeviceAPI::Get(csr.indptr->ctx);
   // workspace
   size_t workspace_size;
-  CUSPARSE_CALL(cusparseCsr2cscEx2_bufferSize(
+  CUSPARSE_CALL(hipsparseCsr2cscEx2_bufferSize(
      thr_entry->cusparse_handle, csr.num_rows, csr.num_cols, nnz, data_ptr,
      indptr_ptr, indices_ptr, t_data_ptr, t_indptr_ptr, t_indices_ptr,
-      CUDA_R_32F, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO,
-      CUSPARSE_CSR2CSC_ALG1,  // see cusparse doc for reference
+      HIP_R_32F, HIPSPARSE_ACTION_NUMERIC, HIPSPARSE_INDEX_BASE_ZERO,
+      HIPSPARSE_CSR2CSC_ALG1,  // see cusparse doc for reference
      &workspace_size));
   void* workspace = device->AllocWorkspace(ctx, workspace_size);
-  CUSPARSE_CALL(cusparseCsr2cscEx2(
+  CUSPARSE_CALL(hipsparseCsr2cscEx2(
      thr_entry->cusparse_handle, csr.num_rows, csr.num_cols, nnz, data_ptr,
      indptr_ptr, indices_ptr, t_data_ptr, t_indptr_ptr, t_indices_ptr,
-      CUDA_R_32F, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO,
-      CUSPARSE_CSR2CSC_ALG1,  // see cusparse doc for reference
+      HIP_R_32F, HIPSPARSE_ACTION_NUMERIC, HIPSPARSE_INDEX_BASE_ZERO,
+      HIPSPARSE_CSR2CSC_ALG1,  // see cusparse doc for reference
      workspace));
   device->FreeWorkspace(ctx, workspace);
 #else
-  CUSPARSE_CALL(cusparseScsr2csc(
+  CUSPARSE_CALL(hipsparseScsr2csc(
      thr_entry->cusparse_handle, csr.num_rows, csr.num_cols, nnz,
      static_cast<const float*>(data_ptr), indptr_ptr, indices_ptr,
      static_cast<float*>(t_data_ptr), t_indices_ptr, t_indptr_ptr,
-      CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO));
+      HIPSPARSE_ACTION_NUMERIC, HIPSPARSE_INDEX_BASE_ZERO));
 #endif
   return CSRMatrix(
...
src/array/cuda/cuda_filter.cu → src/array/cuda/cuda_filter.hip
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
  * Copyright (c) 2021 by Contributors
  * @file array/cuda/cuda_filter.cc
...
@@ -6,7 +8,7 @@
 #include <dgl/runtime/device_api.h>
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
 #include "../../runtime/cuda/cuda_common.h"
 #include "../../runtime/cuda/cuda_hashtable.cuh"
...
@@ -45,7 +47,7 @@ IdArray _PerformFilter(const OrderedHashTable<IdType>& table, IdArray test) {
   const auto& ctx = test->ctx;
   auto device = runtime::DeviceAPI::Get(ctx);
   const int64_t size = test->shape[0];
-  cudaStream_t cudaStream = runtime::getCurrentCUDAStream();
+  hipStream_t cudaStream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   if (size == 0) {
     return test;
...
@@ -74,12 +76,12 @@ IdArray _PerformFilter(const OrderedHashTable<IdType>& table, IdArray test) {
   // generate prefix-sum
   {
     size_t workspace_bytes;
-    CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+    CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
        nullptr, workspace_bytes, static_cast<IdType*>(nullptr),
        static_cast<IdType*>(nullptr), size + 1, cudaStream));
     void* workspace = device->AllocWorkspace(ctx, workspace_bytes);
-    CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+    CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
        workspace, workspace_bytes, prefix, prefix, size + 1, cudaStream));
     device->FreeWorkspace(ctx, workspace);
   }
...
@@ -108,8 +110,8 @@ template <typename IdType>
 class CudaFilterSet : public Filter {
  public:
  explicit CudaFilterSet(IdArray array)
-      : table_(array->shape[0], array->ctx, runtime::getCurrentCUDAStream()) {
-    cudaStream_t cudaStream = runtime::getCurrentCUDAStream();
+      : table_(array->shape[0], array->ctx,
+               runtime::getCurrentHIPStreamMasqueradingAsCUDA()) {
+    hipStream_t cudaStream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
     table_.FillWithUnique(
         static_cast<const IdType*>(array->data), array->shape[0], cudaStream);
   }
...
src/array/cuda/cusparse_dispatcher.cuh
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2020 by Contributors
  * @file array/cuda/dispatcher.cuh
...
@@ -7,7 +8,7 @@
#ifndef DGL_ARRAY_CUDA_CUSPARSE_DISPATCHER_CUH_
#define DGL_ARRAY_CUDA_CUSPARSE_DISPATCHER_CUH_
#include <cusparse.h>
#include <hipsparse/hipsparse.h>
#include <dgl/runtime/c_runtime_api.h>
#include "bf16.cuh"
...
...
@@ -20,70 +21,70 @@ namespace aten {
template <typename DType>
struct CSRGEMM {
  template <typename... Args>
  static inline cusparseStatus_t bufferSizeExt(Args... args) {
  static inline hipsparseStatus_t bufferSizeExt(Args... args) {
    BUG_IF_FAIL(false) << "This piece of code should not be reached.";
    return static_cast<cusparseStatus_t>(0);
    return static_cast<hipsparseStatus_t>(0);
  }
  template <typename... Args>
  static inline cusparseStatus_t nnz(Args... args) {
    return cusparseXcsrgemm2Nnz(args...);
  static inline hipsparseStatus_t nnz(Args... args) {
    return hipsparseXcsrgemm2Nnz(args...);
  }
  template <typename... Args>
  static inline cusparseStatus_t compute(Args... args) {
  static inline hipsparseStatus_t compute(Args... args) {
    BUG_IF_FAIL(false) << "This piece of code should not be reached.";
    return static_cast<cusparseStatus_t>(0);
    return static_cast<hipsparseStatus_t>(0);
  }
};

template <>
struct CSRGEMM<__half> {
  template <typename... Args>
  static inline cusparseStatus_t bufferSizeExt(Args... args) {
  static inline hipsparseStatus_t bufferSizeExt(Args... args) {
    // TODO(ndickson): There is no cusparseHcsrgemm2_bufferSizeExt, so a
    // different implementation would be required.
    LOG(FATAL) << "CSRGEMM::bufferSizeExt does not support dtype half (FP16).";
    return static_cast<cusparseStatus_t>(0);
    return static_cast<hipsparseStatus_t>(0);
  }
  template <typename... Args>
  static inline cusparseStatus_t nnz(Args... args) {
    return cusparseXcsrgemm2Nnz(args...);
  static inline hipsparseStatus_t nnz(Args... args) {
    return hipsparseXcsrgemm2Nnz(args...);
  }
  template <typename... Args>
  static inline cusparseStatus_t compute(Args... args) {
  static inline hipsparseStatus_t compute(Args... args) {
    // TODO(ndickson): There is no cusparseHcsrgemm2, so a different
    // implementation would be required.
    LOG(FATAL) << "CSRGEMM::compute does not support dtype half (FP16).";
    return static_cast<cusparseStatus_t>(0);
    return static_cast<hipsparseStatus_t>(0);
  }
};

#if BF16_ENABLED
template <>
struct CSRGEMM<__nv_bfloat16> {
struct CSRGEMM<__hip_bfloat16> {
  template <typename... Args>
  static inline cusparseStatus_t bufferSizeExt(Args... args) {
  static inline hipsparseStatus_t bufferSizeExt(Args... args) {
    // TODO(ndickson): There is no cusparseHcsrgemm2_bufferSizeExt, so a
    // different implementation would be required.
    LOG(FATAL) << "CSRGEMM::bufferSizeExt does not support dtype bfloat16 (BF16).";
    return static_cast<cusparseStatus_t>(0);
    return static_cast<hipsparseStatus_t>(0);
  }
  template <typename... Args>
  static inline cusparseStatus_t nnz(Args... args) {
    return cusparseXcsrgemm2Nnz(args...);
  static inline hipsparseStatus_t nnz(Args... args) {
    return hipsparseXcsrgemm2Nnz(args...);
  }
  template <typename... Args>
  static inline cusparseStatus_t compute(Args... args) {
  static inline hipsparseStatus_t compute(Args... args) {
    // TODO(ndickson): There is no cusparseHcsrgemm2, so a different
    // implementation would be required.
    LOG(FATAL) << "CSRGEMM::compute does not support dtype bfloat16 (BF16).";
    return static_cast<cusparseStatus_t>(0);
    return static_cast<hipsparseStatus_t>(0);
  }
};
#endif  // BF16_ENABLED
...
...
@@ -91,36 +92,36 @@ struct CSRGEMM<__nv_bfloat16> {
template <>
struct CSRGEMM<float> {
  template <typename... Args>
  static inline cusparseStatus_t bufferSizeExt(Args... args) {
    return cusparseScsrgemm2_bufferSizeExt(args...);
  static inline hipsparseStatus_t bufferSizeExt(Args... args) {
    return hipsparseScsrgemm2_bufferSizeExt(args...);
  }
  template <typename... Args>
  static inline cusparseStatus_t nnz(Args... args) {
    return cusparseXcsrgemm2Nnz(args...);
  static inline hipsparseStatus_t nnz(Args... args) {
    return hipsparseXcsrgemm2Nnz(args...);
  }
  template <typename... Args>
  static inline cusparseStatus_t compute(Args... args) {
    return cusparseScsrgemm2(args...);
  static inline hipsparseStatus_t compute(Args... args) {
    return hipsparseScsrgemm2(args...);
  }
};

template <>
struct CSRGEMM<double> {
  template <typename... Args>
  static inline cusparseStatus_t bufferSizeExt(Args... args) {
    return cusparseDcsrgemm2_bufferSizeExt(args...);
  static inline hipsparseStatus_t bufferSizeExt(Args... args) {
    return hipsparseDcsrgemm2_bufferSizeExt(args...);
  }
  template <typename... Args>
  static inline cusparseStatus_t nnz(Args... args) {
    return cusparseXcsrgemm2Nnz(args...);
  static inline hipsparseStatus_t nnz(Args... args) {
    return hipsparseXcsrgemm2Nnz(args...);
  }
  template <typename... Args>
  static inline cusparseStatus_t compute(Args... args) {
    return cusparseDcsrgemm2(args...);
  static inline hipsparseStatus_t compute(Args... args) {
    return hipsparseDcsrgemm2(args...);
  }
};
...
...
@@ -128,70 +129,70 @@ struct CSRGEMM<double> {
template <typename DType>
struct CSRGEAM {
  template <typename... Args>
  static inline cusparseStatus_t bufferSizeExt(Args... args) {
  static inline hipsparseStatus_t bufferSizeExt(Args... args) {
    BUG_IF_FAIL(false) << "This piece of code should not be reached.";
    return static_cast<cusparseStatus_t>(0);
    return static_cast<hipsparseStatus_t>(0);
  }
  template <typename... Args>
  static inline cusparseStatus_t nnz(Args... args) {
    return cusparseXcsrgeam2Nnz(args...);
  static inline hipsparseStatus_t nnz(Args... args) {
    return hipsparseXcsrgeam2Nnz(args...);
  }
  template <typename... Args>
  static inline cusparseStatus_t compute(Args... args) {
  static inline hipsparseStatus_t compute(Args... args) {
    BUG_IF_FAIL(false) << "This piece of code should not be reached.";
    return static_cast<cusparseStatus_t>(0);
    return static_cast<hipsparseStatus_t>(0);
  }
};

template <>
struct CSRGEAM<__half> {
  template <typename... Args>
  static inline cusparseStatus_t bufferSizeExt(Args... args) {
  static inline hipsparseStatus_t bufferSizeExt(Args... args) {
    // TODO(ndickson): There is no cusparseHcsrgeam2_bufferSizeExt, so a
    // different implementation would be required.
    LOG(FATAL) << "CSRGEAM::bufferSizeExt does not support dtype half (FP16).";
    return static_cast<cusparseStatus_t>(0);
    return static_cast<hipsparseStatus_t>(0);
  }
  template <typename... Args>
  static inline cusparseStatus_t nnz(Args... args) {
    return cusparseXcsrgeam2Nnz(args...);
  static inline hipsparseStatus_t nnz(Args... args) {
    return hipsparseXcsrgeam2Nnz(args...);
  }
  template <typename... Args>
  static inline cusparseStatus_t compute(Args... args) {
  static inline hipsparseStatus_t compute(Args... args) {
    // TODO(ndickson): There is no cusparseHcsrgeam2, so a different
    // implementation would be required.
    LOG(FATAL) << "CSRGEAM::compute does not support dtype half (FP16).";
    return static_cast<cusparseStatus_t>(0);
    return static_cast<hipsparseStatus_t>(0);
  }
};

#if BF16_ENABLED
template <>
struct CSRGEAM<__nv_bfloat16> {
struct CSRGEAM<__hip_bfloat16> {
  template <typename... Args>
  static inline cusparseStatus_t bufferSizeExt(Args... args) {
  static inline hipsparseStatus_t bufferSizeExt(Args... args) {
    // TODO(ndickson): There is no cusparseHcsrgeam2_bufferSizeExt, so a
    // different implementation would be required.
    LOG(FATAL) << "CSRGEAM::bufferSizeExt does not support dtype bfloat16 (BF16).";
    return static_cast<cusparseStatus_t>(0);
    return static_cast<hipsparseStatus_t>(0);
  }
  template <typename... Args>
  static inline cusparseStatus_t nnz(Args... args) {
    return cusparseXcsrgeam2Nnz(args...);
  static inline hipsparseStatus_t nnz(Args... args) {
    return hipsparseXcsrgeam2Nnz(args...);
  }
  template <typename... Args>
  static inline cusparseStatus_t compute(Args... args) {
  static inline hipsparseStatus_t compute(Args... args) {
    // TODO(ndickson): There is no cusparseHcsrgeam2, so a different
    // implementation would be required.
    LOG(FATAL) << "CSRGEAM::compute does not support dtype bfloat16 (BF16).";
    return static_cast<cusparseStatus_t>(0);
    return static_cast<hipsparseStatus_t>(0);
  }
};
#endif  // BF16_ENABLED
...
...
@@ -199,36 +200,36 @@ struct CSRGEAM<__nv_bfloat16> {
template <>
struct CSRGEAM<float> {
  template <typename... Args>
  static inline cusparseStatus_t bufferSizeExt(Args... args) {
    return cusparseScsrgeam2_bufferSizeExt(args...);
  static inline hipsparseStatus_t bufferSizeExt(Args... args) {
    return hipsparseScsrgeam2_bufferSizeExt(args...);
  }
  template <typename... Args>
  static inline cusparseStatus_t nnz(Args... args) {
    return cusparseXcsrgeam2Nnz(args...);
  static inline hipsparseStatus_t nnz(Args... args) {
    return hipsparseXcsrgeam2Nnz(args...);
  }
  template <typename... Args>
  static inline cusparseStatus_t compute(Args... args) {
    return cusparseScsrgeam2(args...);
  static inline hipsparseStatus_t compute(Args... args) {
    return hipsparseScsrgeam2(args...);
  }
};

template <>
struct CSRGEAM<double> {
  template <typename... Args>
  static inline cusparseStatus_t bufferSizeExt(Args... args) {
    return cusparseDcsrgeam2_bufferSizeExt(args...);
  static inline hipsparseStatus_t bufferSizeExt(Args... args) {
    return hipsparseDcsrgeam2_bufferSizeExt(args...);
  }
  template <typename... Args>
  static inline cusparseStatus_t nnz(Args... args) {
    return cusparseXcsrgeam2Nnz(args...);
  static inline hipsparseStatus_t nnz(Args... args) {
    return hipsparseXcsrgeam2Nnz(args...);
  }
  template <typename... Args>
  static inline cusparseStatus_t compute(Args... args) {
    return cusparseDcsrgeam2(args...);
  static inline hipsparseStatus_t compute(Args... args) {
    return hipsparseDcsrgeam2(args...);
  }
};
...
...
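The CSRGEMM and CSRGEAM structs above are thin compile-time dispatchers: callers write one templated code path and the specialization picks the hipsparse entry point for that dtype (or fails loudly for unsupported precisions), so no runtime branching on the data type is needed. As an illustration of the pattern only, not code from this commit, a stripped-down version looks like this:

// Illustrative sketch of the dtype-dispatch pattern used above.
#include <iostream>

template <typename DType>
struct Backend {
  static void compute() { std::cerr << "dtype not supported\n"; }
};
template <>
struct Backend<float> {
  static void compute() { /* would forward to hipsparseScsrgemm2(...) */ }
};
template <>
struct Backend<double> {
  static void compute() { /* would forward to hipsparseDcsrgemm2(...) */ }
};

template <typename DType>
void SpGEMM() {
  Backend<DType>::compute();  // resolved at compile time, no runtime branch
}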
src/array/cuda/disjoint_union.cu → src/array/cuda/disjoint_union.hip
View file @ 6ac701f8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2022, NVIDIA CORPORATION.
*
...
...
@@ -24,7 +26,7 @@
#include <vector>
#include "../../runtime/cuda/cuda_common.h"
#include "
./
utils.h"
#include "utils.h"
namespace dgl {
using runtime::NDArray;
...
...
@@ -78,7 +80,7 @@ std::tuple<IdArray, IdArray, IdArray> _ComputePrefixSums(
template <DGLDeviceType XPU, typename IdType>
void _Merge(
IdType** arrs, IdType* prefix, IdType* offset, IdType* out, int64_t n_arrs,
    int n_elms, DGLContext ctx, DGLDataType dtype, cudaStream_t stream) {
    int n_elms, DGLContext ctx, DGLDataType dtype, hipStream_t stream) {
auto device = runtime::DeviceAPI::Get(ctx);
int nt = 256;
int nb = (n_elms + nt - 1) / nt;
...
...
@@ -99,7 +101,7 @@ void _Merge(
template <DGLDeviceType XPU, typename IdType>
COOMatrix DisjointUnionCoo(const std::vector<COOMatrix>& coos) {
  cudaStream_t stream = runtime::getCurrentCUDAStream();
  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
auto device = runtime::DeviceAPI::Get(coos[0].row->ctx);
uint64_t src_offset = 0, dst_offset = 0;
bool has_data = false;
...
...
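The _Merge launch above sizes its grid with the usual ceiling division, nb = (n_elms + nt - 1) / nt, so every element gets a thread even when n_elms is not a multiple of the block size, with a bounds check absorbing the padding in the last block. A stand-alone sketch of the same launch arithmetic (kernel and names are illustrative, not from the diff):

// Sketch: ceiling-division launch configuration as used by _Merge.
#include "hip/hip_runtime.h"

__global__ void FillKernel(int* out, int n_elms) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n_elms) out[i] = i;  // bounds check covers the padded last block
}

void LaunchFill(int* d_out, int n_elms, hipStream_t stream) {
  const int nt = 256;                     // threads per block
  const int nb = (n_elms + nt - 1) / nt;  // blocks, rounded up
  FillKernel<<<nb, nt, 0, stream>>>(d_out, n_elms);
}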
src/array/cuda/fp16.cuh
View file @ 6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020-2022 by Contributors
*
...
...
@@ -21,12 +22,12 @@
#ifndef DGL_ARRAY_CUDA_FP16_CUH_
#define DGL_ARRAY_CUDA_FP16_CUH_
#include <cuda_fp16.h>
#include <hip/hip_fp16.h>
#include <algorithm>
static __device__ __forceinline__ half max(half a, half b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
#if defined(__HIP_DEVICE_COMPILE__)
  return __hgt(__half(a), __half(b)) ? a : b;
#else
  return __half(max(float(a), float(b)));  // NOLINT
...
...
@@ -34,19 +35,19 @@ static __device__ __forceinline__ half max(half a, half b) {
}
static __device__ __forceinline__ half min(half a, half b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
#if defined(__HIP_DEVICE_COMPILE__)
  return __hlt(__half(a), __half(b)) ? a : b;
#else
  return __half(min(float(a), float(b)));  // NOLINT
#endif
}
#ifdef __CUDACC__
#if 0
#ifdef __HIPCC__
// Arithmetic FP16 operations for architecture >= 5.3 are already defined in
// cuda_fp16.h
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 530)
// hip/hip_fp16.h
#if defined(__HIP_DEVICE_COMPILE__)
// CUDA 12.2 adds "emulated" support for older architectures.
#if defined(CUDART_VERSION) && (CUDART_VERSION < 12020)
#if defined(DTKRT_VERSION) && (DTKRT_VERSION < 12020)
__device__ __forceinline__ __half
operator+(const __half& lh, const __half& rh) {
return __half(float(lh) + float(rh)); // NOLINT
...
...
@@ -127,8 +128,8 @@ __device__ __forceinline__ bool operator>=(const __half& lh, const __half& rh) {
__device__ __forceinline__ bool operator<=(const __half& lh, const __half& rh) {
return float(lh) <= float(rh); // NOLINT
}
#endif  // defined(CUDART_VERSION) && (CUDART_VERSION < 12020)
#endif  // defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 530)
#endif  // __CUDACC__
#endif  // defined(DTKRT_VERSION) && (DTKRT_VERSION < 12020)
#endif  // defined(__HIP_DEVICE_COMPILE__)
#endif  // __HIPCC__
#endif
#endif // DGL_ARRAY_CUDA_FP16_CUH_
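The fp16 header keeps fallback operators (gated out with #if 0 in the hipified version) that simply round-trip through float when native half arithmetic is unavailable. A small sketch of what that emulation amounts to, purely illustrative and assuming <hip/hip_fp16.h> (or <cuda_fp16.h>) is on the include path:

// Sketch: fp16 arithmetic emulated through float, as in the fallback above.
#include "hip/hip_runtime.h"
#include <hip/hip_fp16.h>

__device__ __forceinline__ __half emulated_add(__half lh, __half rh) {
  return __half(float(lh) + float(rh));  // convert, add in fp32, convert back
}

__global__ void HalfAxpy(const __half* x, __half* y, __half alpha, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) y[i] = emulated_add(__half(float(alpha) * float(x[i])), y[i]);
}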
src/array/cuda/functor.cuh
View file @ 6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file array/cuda/functor.cuh
...
...
@@ -9,8 +10,8 @@
#include <cmath>
#include <limits>
#include "
./
atomic.cuh"
#include "
./
fp16.cuh"
#include "atomic.cuh"
#include "fp16.cuh"
#include "bf16.cuh"
namespace dgl {
...
...
@@ -208,29 +209,29 @@ struct Sum<Idx, __half, atomic> : _Sum<Idx, __half, atomic> {
#if BF16_ENABLED
template <typename Idx, bool atomic>
struct Sum<Idx, __nv_bfloat16, atomic> : _Sum<Idx, __nv_bfloat16, atomic> {
  static constexpr __host__ __device__ __forceinline__ __nv_bfloat16 zero() {
    return __float2bfloat16_rn(0.);
struct Sum<Idx, __hip_bfloat16, atomic> : _Sum<Idx, __hip_bfloat16, atomic> {
  static constexpr __host__ __device__ __forceinline__ __hip_bfloat16 zero() {
    return __float2bfloat16(0.);
  }
  static __device__ __forceinline__ void Call(
      __nv_bfloat16* out_buf, Idx* arg_u_buf, Idx* arg_e_buf,
      __nv_bfloat16 val, Idx uid, Idx eid) {
    _Sum<Idx, __nv_bfloat16, atomic>::Call(
      __hip_bfloat16* out_buf, Idx* arg_u_buf, Idx* arg_e_buf,
      __hip_bfloat16 val, Idx uid, Idx eid) {
    _Sum<Idx, __hip_bfloat16, atomic>::Call(
        out_buf, arg_u_buf, arg_e_buf, val, uid, eid);
  }
  static __device__ __forceinline__ void Call(
      __nv_bfloat16* out_buf, Idx* arg_buf, __nv_bfloat16 val, Idx id) {
    _Sum<Idx, __nv_bfloat16, atomic>::Call(out_buf, arg_buf, val, id);
      __hip_bfloat16* out_buf, Idx* arg_buf, __hip_bfloat16 val, Idx id) {
    _Sum<Idx, __hip_bfloat16, atomic>::Call(out_buf, arg_buf, val, id);
  }
  // sometimes we have to use float in reduction for better precision
  static __device__ __forceinline__ void Call(
      float* out_buf, Idx* arg_u_buf, Idx* arg_e_buf, __nv_bfloat16 val,
      float* out_buf, Idx* arg_u_buf, Idx* arg_e_buf, __hip_bfloat16 val,
      Idx uid, Idx eid) {
    _Sum<Idx, float, atomic>::Call(
        out_buf, arg_u_buf, arg_e_buf, static_cast<float>(val), uid, eid);
  }
  static __device__ __forceinline__ void Call(
      float* out_buf, Idx* arg_buf, __nv_bfloat16 val, Idx id) {
      float* out_buf, Idx* arg_buf, __hip_bfloat16 val, Idx id) {
    _Sum<Idx, float, atomic>::Call(
        out_buf, arg_buf, static_cast<float>(val), id);
  }
...
...
@@ -313,29 +314,29 @@ struct Max<Idx, __half, atomic> : _Max<Idx, __half, atomic> {
#if BF16_ENABLED
template <typename Idx, bool atomic>
struct Max<Idx, __nv_bfloat16, atomic> : _Max<Idx, __nv_bfloat16, atomic> {
  static constexpr __host__ __device__ __forceinline__ __nv_bfloat16 zero() {
    return __float2bfloat16_rn(-std::numeric_limits<float>::infinity());
struct Max<Idx, __hip_bfloat16, atomic> : _Max<Idx, __hip_bfloat16, atomic> {
  static constexpr __host__ __device__ __forceinline__ __hip_bfloat16 zero() {
    return __float2bfloat16(-std::numeric_limits<float>::infinity());
  }
  static __device__ __forceinline__ void Call(
      __nv_bfloat16* out_buf, Idx* arg_u_buf, Idx* arg_e_buf,
      __nv_bfloat16 val, Idx uid, Idx eid) {
    _Max<Idx, __nv_bfloat16, atomic>::Call(
      __hip_bfloat16* out_buf, Idx* arg_u_buf, Idx* arg_e_buf,
      __hip_bfloat16 val, Idx uid, Idx eid) {
    _Max<Idx, __hip_bfloat16, atomic>::Call(
        out_buf, arg_u_buf, arg_e_buf, val, uid, eid);
  }
  static __device__ __forceinline__ void Call(
      __nv_bfloat16* out_buf, Idx* arg_buf, __nv_bfloat16 val, Idx id) {
    _Max<Idx, __nv_bfloat16, atomic>::Call(out_buf, arg_buf, val, id);
      __hip_bfloat16* out_buf, Idx* arg_buf, __hip_bfloat16 val, Idx id) {
    _Max<Idx, __hip_bfloat16, atomic>::Call(out_buf, arg_buf, val, id);
  }
  // sometimes we have to use float in reduction for better precision
  static __device__ __forceinline__ void Call(
      float* out_buf, Idx* arg_u_buf, Idx* arg_e_buf, __nv_bfloat16 val,
      float* out_buf, Idx* arg_u_buf, Idx* arg_e_buf, __hip_bfloat16 val,
      Idx uid, Idx eid) {
    _Max<Idx, float, atomic>::Call(
        out_buf, arg_u_buf, arg_e_buf, static_cast<float>(val), uid, eid);
  }
  static __device__ __forceinline__ void Call(
      float* out_buf, Idx* arg_buf, __nv_bfloat16 val, Idx id) {
      float* out_buf, Idx* arg_buf, __hip_bfloat16 val, Idx id) {
    _Max<Idx, float, atomic>::Call(
        out_buf, arg_buf, static_cast<float>(val), id);
  }
...
...
@@ -418,29 +419,29 @@ struct Min<Idx, __half, atomic> : _Min<Idx, __half, atomic> {
#if BF16_ENABLED
template <typename Idx, bool atomic>
struct Min<Idx, __nv_bfloat16, atomic> : _Min<Idx, __nv_bfloat16, atomic> {
  static constexpr __host__ __device__ __forceinline__ __nv_bfloat16 zero() {
    return __float2bfloat16_rn(std::numeric_limits<float>::infinity());
struct Min<Idx, __hip_bfloat16, atomic> : _Min<Idx, __hip_bfloat16, atomic> {
  static constexpr __host__ __device__ __forceinline__ __hip_bfloat16 zero() {
    return __float2bfloat16(std::numeric_limits<float>::infinity());
  }
  static __device__ __forceinline__ void Call(
      __nv_bfloat16* out_buf, Idx* arg_u_buf, Idx* arg_e_buf,
      __nv_bfloat16 val, Idx uid, Idx eid) {
    _Min<Idx, __nv_bfloat16, atomic>::Call(
      __hip_bfloat16* out_buf, Idx* arg_u_buf, Idx* arg_e_buf,
      __hip_bfloat16 val, Idx uid, Idx eid) {
    _Min<Idx, __hip_bfloat16, atomic>::Call(
        out_buf, arg_u_buf, arg_e_buf, val, uid, eid);
  }
  static __device__ __forceinline__ void Call(
      __nv_bfloat16* out_buf, Idx* arg_buf, __nv_bfloat16 val, Idx id) {
    _Min<Idx, __nv_bfloat16, atomic>::Call(out_buf, arg_buf, val, id);
      __hip_bfloat16* out_buf, Idx* arg_buf, __hip_bfloat16 val, Idx id) {
    _Min<Idx, __hip_bfloat16, atomic>::Call(out_buf, arg_buf, val, id);
  }
  // sometimes we have to use float in reduction for better precision
  static __device__ __forceinline__ void Call(
      float* out_buf, Idx* arg_u_buf, Idx* arg_e_buf, __nv_bfloat16 val,
      float* out_buf, Idx* arg_u_buf, Idx* arg_e_buf, __hip_bfloat16 val,
      Idx uid, Idx eid) {
    _Min<Idx, float, atomic>::Call(
        out_buf, arg_u_buf, arg_e_buf, static_cast<float>(val), uid, eid);
  }
  static __device__ __forceinline__ void Call(
      float* out_buf, Idx* arg_buf, __nv_bfloat16 val, Idx id) {
      float* out_buf, Idx* arg_buf, __hip_bfloat16 val, Idx id) {
    _Min<Idx, float, atomic>::Call(
        out_buf, arg_buf, static_cast<float>(val), id);
  }
...
...
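The extra float* overloads in the Sum/Max/Min specializations above exist because accumulating many bf16 terms directly in bf16 discards low-order bits; the functors therefore cast each bf16 value up to float, reduce in fp32, and convert back only at the end. A host-side sketch of the same idea (illustrative, not DGL code):

// Sketch: reduce low-precision values in a float accumulator.
#include <vector>

template <typename DType>
float SumInFloat(const std::vector<DType>& vals) {
  float acc = 0.f;  // fp32 accumulator keeps the low-order bits
  for (const DType& v : vals) acc += static_cast<float>(v);
  return acc;       // caller may cast back to DType if needed
}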
src/array/cuda/gather_mm.cu → src/array/cuda/gather_mm.hip
View file @ 6ac701f8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2020 by Contributors
* @file array/cuda/gather_mm.cu
...
...
@@ -7,9 +9,9 @@
#include <algorithm> // std::swap
#include "
./
atomic.cuh"
#include "
./
functor.cuh"
#include "
./
utils.h"
#include "atomic.cuh"
#include "functor.cuh"
#include "utils.h"
namespace dgl {
using namespace cuda;
...
...
@@ -20,54 +22,63 @@ namespace {
/** @brief Call cuBLAS GEMM API for dense matmul operation for float and double.
*/
template <typename DType>
cublasStatus_t cublasGemm(
    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
hipblasStatus_t cublasGemm(
    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
    int m, int n, int k, const DType* alpha, const DType* A, int lda,
    const DType* B, int ldb, const DType* beta, DType* C, int ldc) {
  LOG(INFO) << "Not supported dtype";
  return CUBLAS_STATUS_EXECUTION_FAILED;
  return HIPBLAS_STATUS_EXECUTION_FAILED;
}
template <>
cublasStatus_t cublasGemm<__half>(
    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
hipblasStatus_t cublasGemm<__half>(
    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
    int m, int n, int k, const __half* alpha, const __half* A, int lda,
    const __half* B, int ldb, const __half* beta, __half* C, int ldc) {
  return cublasHgemm(
      handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
  return hipblasHgemm(
      handle, transa, transb, m, n, k, (hipblasHalf*)alpha, (hipblasHalf*)A,
      lda, (hipblasHalf*)B, ldb, (hipblasHalf*)beta, (hipblasHalf*)C, ldc);
}
// template <>
// hipblasStatus_t cublasGemm<__half>(
// hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
// int m, int n, int k, const __half* alpha, const __half* A, int lda,
// const __half* B, int ldb, const __half* beta, __half* C, int ldc) {
// return hipblasHgemm(
// handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
// }
#if BF16_ENABLED
template <>
cublasStatus_t cublasGemm<__nv_bfloat16>(
    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
    int m, int n, int k, const __nv_bfloat16* alpha, const __nv_bfloat16* A,
    int lda, const __nv_bfloat16* B, int ldb, const __nv_bfloat16* beta,
    __nv_bfloat16* C, int ldc) {
hipblasStatus_t cublasGemm<__hip_bfloat16>(
    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
    int m, int n, int k, const __hip_bfloat16* alpha, const __hip_bfloat16* A,
    int lda, const __hip_bfloat16* B, int ldb, const __hip_bfloat16* beta,
    __hip_bfloat16* C, int ldc) {
  float alpha_float = __bfloat162float(*alpha);
  float beta_float = __bfloat162float(*beta);
  return cublasGemmEx(
      handle, transa, transb, m, n, k, &alpha_float, A, CUDA_R_16BF, lda, B,
      CUDA_R_16BF, ldb, &beta_float, C, CUDA_R_16BF, ldc, CUBLAS_COMPUTE_32F,
      CUBLAS_GEMM_DEFAULT_TENSOR_OP);
  return hipblasGemmEx(
      handle, transa, transb, m, n, k, &alpha_float, A, HIPBLAS_R_16B, lda, B,
      HIPBLAS_R_16B, ldb, &beta_float, C, HIPBLAS_R_16B, ldc, HIPBLAS_R_32F,
      HIPBLAS_GEMM_DEFAULT);
}
#endif // BF16_ENABLED
template <>
cublasStatus_t cublasGemm<float>(
    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
hipblasStatus_t cublasGemm<float>(
    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
    int m, int n, int k, const float* alpha, const float* A, int lda,
    const float* B, int ldb, const float* beta, float* C, int ldc) {
  return cublasSgemm(
  return hipblasSgemm(
      handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
}
template <>
cublasStatus_t cublasGemm<double>(
    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
hipblasStatus_t cublasGemm<double>(
    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
    int m, int n, int k, const double* alpha, const double* A, int lda,
    const double* B, int ldb, const double* beta, double* C, int ldc) {
  return cublasDgemm(
  return hipblasDgemm(
      handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
}
...
...
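With the specializations above, the rest of gather_mm.hip never names a specific BLAS routine; it calls the cublasGemm<DType> wrapper with whatever precision it was instantiated for, as SegmentMMBackwardB does further down. A sketch of such a call site, with handle, device pointers and leading-dimension choices assumed for illustration only:

// Sketch: dtype-generic GEMM through the cublasGemm<DType> wrapper above,
// assuming column-major, tightly packed operands (A: m x k, B: k x n).
template <typename DType>
void DenseMM(hipblasHandle_t handle, const DType* A, const DType* B, DType* C,
             int m, int n, int k) {
  const DType alpha = static_cast<DType>(1.0);
  const DType beta = static_cast<DType>(0.0);
  CUBLAS_CALL(cublasGemm<DType>(
      handle, HIPBLAS_OP_N, HIPBLAS_OP_N, m, n, k, &alpha, A, m, B, k, &beta,
      C, m));
}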
@@ -108,7 +119,7 @@ __global__ void GatherMMScatterKernel(
// Load A in shared mem in a coalesced way
for (unsigned int l = laneId; l < a_tile; l += 32)
sh_A[local_row * sh_a_tile + l] = A[cur_rowA * in_len + (k_start + l)];
    __syncwarp();
    // __syncwarp();
for (unsigned int outloop = 0; outloop < out_len; outloop += 32) {
DType out_reg = static_cast<DType>(0.0f); // thread private
...
...
@@ -165,7 +176,7 @@ __global__ void GatherMMScatterKernel2(
/* Load A in shared mem in a coalesced way */
for (unsigned int l = laneId; l < a_tile; l += 32)
sh_A[local_row * sh_a_tile + l] = A[row_a * in_len + (k_start + l)];
    __syncwarp();
    // __syncwarp();
for (unsigned int outloop = 0; outloop < out_len; outloop += 32) {
DType out_reg = static_cast<DType>(0.0f); // thread private
...
...
@@ -203,7 +214,7 @@ void SegmentMM(
const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A,
bool a_trans, bool b_trans) {
auto device = runtime::DeviceAPI::Get(A->ctx);
  cudaStream_t stream = runtime::getCurrentCUDAStream();
  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const DType* A_data = A.Ptr<DType>();
const DType* B_data = B.Ptr<DType>();
const IdType* seglen_A_data = seglen_A.Ptr<IdType>();
...
...
@@ -215,8 +226,8 @@ void SegmentMM(
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
if (!thr_entry->cublas_handle)
    CUBLAS_CALL(cublasCreate(&(thr_entry->cublas_handle)));
  CUBLAS_CALL(cublasSetStream(thr_entry->cublas_handle, stream));
    CUBLAS_CALL(hipblasCreate(&(thr_entry->cublas_handle)));
  CUBLAS_CALL(hipblasSetStream(thr_entry->cublas_handle, stream));
IdType m_offset = 0;
for (IdType etype = 0; etype < num_rel; ++etype) {
...
...
@@ -226,10 +237,10 @@ void SegmentMM(
n = B->shape[2]; // cols of B
k = B->shape[1]; // cols of A == rows of B
int ldb = n, lda = k, ldc = n;
    cublasOperation_t transB = CUBLAS_OP_N;
    cublasOperation_t transA = CUBLAS_OP_N;
    hipblasOperation_t transB = HIPBLAS_OP_N;
    hipblasOperation_t transA = HIPBLAS_OP_N;
if (b_trans) {
      transB = CUBLAS_OP_T;
      transB = HIPBLAS_OP_T;
ldb = n, lda = n, ldc = k;
std::swap(n, k);
}
...
...
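A note on the leading dimensions chosen in the SegmentMM hunk above: cuBLAS/hipBLAS are column-major while DGL's arrays are row-major, so the row-major product C = A * B is issued as the column-major product C^T = B^T * A^T. That is why the non-transposed case uses ldb = n, lda = k, ldc = n, and why the b_trans branch flips transB and swaps n and k. A sketch of the non-transposed call shape (argument names are illustrative, not copied from the file):

// Sketch: row-major C(m x n) = A(m x k) * B(k x n) expressed as the
// column-major GEMM C^T = B^T * A^T, matching the ldb/lda/ldc choice above.
CUBLAS_CALL(cublasGemm<DType>(
    handle, HIPBLAS_OP_N, HIPBLAS_OP_N,
    n, m, k,             // dimensions of the transposed problem
    &alpha, B_data, n,   // B first, ldb = n
    A_data, k,           // then A, lda = k
    &beta, C_data, n));  // ldc = n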
@@ -248,7 +259,7 @@ template <int XPU, typename IdType, typename DType>
void SegmentMMBackwardB(
const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen) {
auto device = runtime::DeviceAPI::Get(A->ctx);
  cudaStream_t stream = runtime::getCurrentCUDAStream();
  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const DType* A_data = A.Ptr<DType>();
const DType* dC_data = dC.Ptr<DType>();
const IdType* seglen_data = seglen.Ptr<IdType>();
...
...
@@ -260,8 +271,8 @@ void SegmentMMBackwardB(
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
if (!thr_entry->cublas_handle)
    CUBLAS_CALL(cublasCreate(&(thr_entry->cublas_handle)));
  CUBLAS_CALL(cublasSetStream(thr_entry->cublas_handle, stream));
    CUBLAS_CALL(hipblasCreate(&(thr_entry->cublas_handle)));
  CUBLAS_CALL(hipblasSetStream(thr_entry->cublas_handle, stream));
IdType k_offset = 0;
for (IdType etype = 0; etype < num_rel; ++etype) {
...
...
@@ -271,8 +282,8 @@ void SegmentMMBackwardB(
CHECK_LE(k_offset + k, A->shape[0])
<< "Segement index out of bound of A->shape[0].";
int lddC = m, ldA = n, lddB = m;
    cublasOperation_t trans_dC = CUBLAS_OP_N;
    cublasOperation_t trans_A = CUBLAS_OP_T;
    hipblasOperation_t trans_dC = HIPBLAS_OP_N;
    hipblasOperation_t trans_A = HIPBLAS_OP_T;
CUBLAS_CALL(cublasGemm<DType>(
thr_entry->cublas_handle, trans_dC, trans_A, m, n, k, &alpha,
dC_data + dC_offset, lddC, A_data + A_offset, ldA, &beta,
...
...
@@ -299,7 +310,7 @@ void GatherMM(
const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
const NDArray idx_b) {
auto device = runtime::DeviceAPI::Get(A->ctx);
  cudaStream_t stream = runtime::getCurrentCUDAStream();
  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int64_t out_len = B->shape[2]; // cols of B
int64_t in_len = A->shape[1]; // cols of A
const int64_t tot_num_rows = A->shape[0];
...
...
@@ -332,7 +343,7 @@ void GatherMMScatter(
const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
const NDArray idx_b, const NDArray idx_c) {
auto device = runtime::DeviceAPI::Get(A->ctx);
  cudaStream_t stream = runtime::getCurrentCUDAStream();
  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const IdType* idx_c_data = idx_c.Ptr<IdType>();
int64_t out_len = (B->ndim == 2) ? B->shape[1] : B->shape[2]; // cols of B
int64_t in_len = A->shape[1]; // cols of A
...
...
@@ -367,10 +378,10 @@ template void GatherMM<kDGLCUDA, int64_t, __half>(
const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
const NDArray idx_b);
#if BF16_ENABLED
template void GatherMM<kDGLCUDA, int32_t, __nv_bfloat16>(
template void GatherMM<kDGLCUDA, int32_t, __hip_bfloat16>(
    const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
    const NDArray idx_b);
template void GatherMM<kDGLCUDA, int64_t, __nv_bfloat16>(
template void GatherMM<kDGLCUDA, int64_t, __hip_bfloat16>(
const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
const NDArray idx_b);
#endif // BF16_ENABLED
...
...
@@ -394,10 +405,10 @@ template void GatherMMScatter<kDGLCUDA, int64_t, __half>(
const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
const NDArray idx_b, const NDArray idx_c);
#if BF16_ENABLED
template void GatherMMScatter<kDGLCUDA, int32_t, __nv_bfloat16>(
template void GatherMMScatter<kDGLCUDA, int32_t, __hip_bfloat16>(
    const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
    const NDArray idx_b, const NDArray idx_c);
template void GatherMMScatter<kDGLCUDA, int64_t, __nv_bfloat16>(
template void GatherMMScatter<kDGLCUDA, int64_t, __hip_bfloat16>(
const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
const NDArray idx_b, const NDArray idx_c);
#endif // BF16_ENABLED
...
...
@@ -421,10 +432,10 @@ template void SegmentMM<kDGLCUDA, int64_t, __half>(
const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A,
bool a_trans, bool b_trans);
#if BF16_ENABLED
template void SegmentMM<kDGLCUDA, int32_t, __nv_bfloat16>(
template void SegmentMM<kDGLCUDA, int32_t, __hip_bfloat16>(
    const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A,
    bool a_trans, bool b_trans);
template void SegmentMM<kDGLCUDA, int64_t, __nv_bfloat16>(
template void SegmentMM<kDGLCUDA, int64_t, __hip_bfloat16>(
const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A,
bool a_trans, bool b_trans);
#endif // BF16_ENABLED
...
...
@@ -446,9 +457,9 @@ template void SegmentMMBackwardB<kDGLCUDA, int32_t, __half>(
template void SegmentMMBackwardB<kDGLCUDA, int64_t, __half>(
const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen);
#if BF16_ENABLED
template void SegmentMMBackwardB<kDGLCUDA, int32_t, __nv_bfloat16>(
template void SegmentMMBackwardB<kDGLCUDA, int32_t, __hip_bfloat16>(
    const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen);
template void SegmentMMBackwardB<kDGLCUDA, int64_t, __nv_bfloat16>(
template void SegmentMMBackwardB<kDGLCUDA, int64_t, __hip_bfloat16>(
const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen);
#endif // BF16_ENABLED
template void SegmentMMBackwardB<kDGLCUDA, int32_t, float>(
...
...
src/array/cuda/ge_spmm.cuh
View file @ 6ac701f8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2020 by Contributors
* @file array/cuda/ge_spmm.cuh
...
...
@@ -7,7 +9,7 @@
#define DGL_ARRAY_CUDA_GE_SPMM_CUH_
#include "../../runtime/cuda/cuda_common.h"
#include "
./
utils.h"
#include "utils.h"
#include "atomic.cuh"
#include "macro.cuh"
...
...
@@ -121,7 +123,7 @@ void GESpMMCsr(
  const DType* efeat_data = efeat.Ptr<DType>();
  DType* out_data = out.Ptr<DType>();
  cudaStream_t stream = runtime::getCurrentCUDAStream();
  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  const int ntx = 32;
  const int nty = 32;
...
...
src/array/cuda/labor_sampling.cu → src/array/cuda/labor_sampling.hip
View file @ 6ac701f8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/*!
* Copyright (c) 2022, NVIDIA Corporation
* Copyright (c) 2022, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
...
...
@@ -34,19 +36,19 @@
#include <thrust/zip_function.h>
#include <algorithm>
#include <cub/cub.cuh>        // NOLINT
#include <hipcub/hipcub.hpp>  // NOLINT
#include <limits>
#include <numeric>
#include <type_traits>
#include <utility>
#include "
../../array/cuda/
atomic.cuh"
#include "
../../array/cuda/
utils.h"
#include "atomic.cuh"
#include "utils.h"
#include "../../graph/transform/cuda/cuda_map_edges.cuh"
#include "../../random/continuous_seed.h"
#include "../../runtime/cuda/cuda_common.h"
#include "
./
functor.cuh"
#include "
./
spmm.cuh"
#include "functor.cuh"
#include "spmm.cuh"
namespace dgl {
namespace aten {
...
...
@@ -131,7 +133,7 @@ struct StencilOpFused {
const IdType* indices;
const IdType* nids;
bool is_pinned;
  __device__ auto operator()(IdType idx) {
  __host__ __device__ auto operator()(IdType idx) {
const auto in_row = idx_coo[idx];
const auto ps = probs[idx];
IdType rofs = idx - subindptr[in_row];
...
...
@@ -277,7 +279,7 @@ __global__ void _CSRRowWiseLayerSampleDegreeKernel(
const FloatType* const ds, const FloatType* const d2s,
const IdType* const indptr, const FloatType* const probs,
const FloatType* const A, const IdType* const subindptr) {
  typedef cub::BlockReduce<FloatType, BLOCK_SIZE> BlockReduce;
  typedef hipcub::BlockReduce<FloatType, BLOCK_SIZE> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
__shared__ FloatType var_1_bcast[BLOCK_CTAS];
...
...
@@ -351,7 +353,7 @@ int log_size(const IdType size) {
template <typename IdType, typename FloatType, typename exec_policy_t>
void compute_importance_sampling_probabilities(
    CSRMatrix mat, const IdType hop_size, cudaStream_t stream,
    CSRMatrix mat, const IdType hop_size, hipStream_t stream,
const continuous_seed seed, const IdType num_rows, const IdType* indptr,
const IdType* subindptr, const IdType* indices, IdArray idx_coo_arr,
const IdType* nids,
...
...
@@ -398,17 +400,17 @@ void compute_importance_sampling_probabilities(
hop_1, 0, hop_2.get(), 0, sizeof(IdType) * hop_size, ctx, ctx,
mat.indptr->dtype);
  cub::DoubleBuffer<IdType> hop_b(hop_2.get(), hop_3.get());
  hipcub::DoubleBuffer<IdType> hop_b(hop_2.get(), hop_3.get());
{
std::size_t temp_storage_bytes = 0;
    CUDA_CALL(cub::DeviceRadixSort::SortKeys(
    CUDA_CALL(hipcub::DeviceRadixSort::SortKeys(
nullptr, temp_storage_bytes, hop_b, hop_size, 0, max_log_num_vertices,
stream));
auto temp = allocator.alloc_unique<char>(temp_storage_bytes);
    CUDA_CALL(cub::DeviceRadixSort::SortKeys(
    CUDA_CALL(hipcub::DeviceRadixSort::SortKeys(
temp.get(), temp_storage_bytes, hop_b, hop_size, 0,
max_log_num_vertices, stream));
}
...
...
@@ -418,13 +420,13 @@ void compute_importance_sampling_probabilities(
{
std::size_t temp_storage_bytes = 0;
    CUDA_CALL(cub::DeviceRunLengthEncode::Encode(
    CUDA_CALL(hipcub::DeviceRunLengthEncode::Encode(
nullptr, temp_storage_bytes, hop_b.Current(), hop_unique.get(),
hop_counts.get(), hop_unique_size.get(), hop_size, stream));
auto temp = allocator.alloc_unique<char>(temp_storage_bytes);
    CUDA_CALL(cub::DeviceRunLengthEncode::Encode(
    CUDA_CALL(hipcub::DeviceRunLengthEncode::Encode(
temp.get(), temp_storage_bytes, hop_b.Current(), hop_unique.get(),
hop_counts.get(), hop_unique_size.get(), hop_size, stream));
...
...
@@ -511,7 +513,7 @@ void compute_importance_sampling_probabilities(
/////////////////////////////// CSR ///////////////////////////////
template <DGLDeviceType XPU, typename IdType, typename FloatType>
std::pair<COOMatrix, FloatArray> CSRLaborSampling(
__host__ std::pair<COOMatrix, FloatArray> CSRLaborSampling(
CSRMatrix mat, IdArray rows_arr, const int64_t num_picks,
FloatArray prob_arr, const int importance_sampling, IdArray random_seed_arr,
float seed2_contribution, IdArray NIDs) {
...
...
@@ -521,8 +523,8 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling(
runtime::CUDAWorkspaceAllocator allocator(ctx);
  const auto stream = runtime::getCurrentCUDAStream();
  const auto exec_policy = thrust::cuda::par_nosync(allocator).on(stream);
  const auto stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  const auto exec_policy = thrust::hip::par_nosync(allocator).on(stream);
auto device = runtime::DeviceAPI::Get(ctx);
...
...
@@ -569,11 +571,11 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling(
auto ds_d2s = thrust::make_zip_iterator(ds, d2s);
size_t prefix_temp_size = 0;
  CUDA_CALL(cub::DeviceSegmentedReduce::Reduce(
  CUDA_CALL(hipcub::DeviceSegmentedReduce::Reduce(
nullptr, prefix_temp_size, A_A2, ds_d2s, num_rows, b_offsets, e_offsets,
TupleSum{}, thrust::make_tuple((FloatType)0, (FloatType)0), stream));
auto temp = allocator.alloc_unique<char>(prefix_temp_size);
  CUDA_CALL(cub::DeviceSegmentedReduce::Reduce(
  CUDA_CALL(hipcub::DeviceSegmentedReduce::Reduce(
temp.get(), prefix_temp_size, A_A2, ds_d2s, num_rows, b_offsets,
e_offsets, TupleSum{}, thrust::make_tuple((FloatType)0, (FloatType)0),
stream));
...
...
@@ -586,11 +588,11 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling(
IdType hop_size;
{
size_t prefix_temp_size = 0;
    CUDA_CALL(cub::DeviceScan::ExclusiveSum(
    CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
nullptr, prefix_temp_size, in_deg.get(), subindptr, num_rows + 1,
stream));
auto temp = allocator.alloc_unique<char>(prefix_temp_size);
    CUDA_CALL(cub::DeviceScan::ExclusiveSum(
    CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
temp.get(), prefix_temp_size, in_deg.get(), subindptr, num_rows + 1,
stream));
...
...
@@ -619,11 +621,11 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling(
auto modified_in_deg = thrust::make_transform_iterator(
iota, AlignmentFunc<IdType>{in_deg.get(), perm, num_rows});
size_t prefix_temp_size = 0;
    CUDA_CALL(cub::DeviceScan::ExclusiveSum(
    CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
nullptr, prefix_temp_size, modified_in_deg, subindptr_aligned.get(),
num_rows + 1, stream));
auto temp = allocator.alloc_unique<char>(prefix_temp_size);
    CUDA_CALL(cub::DeviceScan::ExclusiveSum(
    CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
temp.get(), prefix_temp_size, modified_in_deg,
subindptr_aligned.get(), num_rows + 1, stream));
...
...
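The hop deduplication above relies on hipcub's DoubleBuffer: SortKeys may ping-pong between the two backing arrays, and Current() afterwards names whichever one holds the sorted keys. A compact sketch of that usage (buffer names, sizes and the surrounding device/ctx/stream objects are assumed for illustration):

// Sketch: radix sort with hipcub::DoubleBuffer, then read keys.Current().
hipcub::DoubleBuffer<IdType> keys(d_keys_a, d_keys_b);
size_t temp_bytes = 0;
CUDA_CALL(hipcub::DeviceRadixSort::SortKeys(
    nullptr, temp_bytes, keys, num_items, 0, num_bits, stream));
void* temp = device->AllocWorkspace(ctx, temp_bytes);
CUDA_CALL(hipcub::DeviceRadixSort::SortKeys(
    temp, temp_bytes, keys, num_items, 0, num_bits, stream));
device->FreeWorkspace(ctx, temp);
const IdType* sorted = keys.Current();  // valid output buffer after the sort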
src/array/cuda/macro.cuh
View file @ 6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file array/cuda/macro.cuh
...
...
@@ -30,14 +31,14 @@
const auto device = runtime::DeviceAPI::Get(ctx); \
(LHS_OFF) = static_cast<int64_t *>(device->AllocWorkspace( \
ctx, sizeof(int64_t) * info.lhs_offset.size())); \
    CUDA_CALL(cudaMemcpy(                                                   \
    CUDA_CALL(hipMemcpy(                                                    \
        (LHS_OFF), &info.lhs_offset[0],                                     \
        sizeof(int64_t) * info.lhs_offset.size(), cudaMemcpyHostToDevice)); \
        sizeof(int64_t) * info.lhs_offset.size(), hipMemcpyHostToDevice));  \
(RHS_OFF) = static_cast<int64_t *>(device->AllocWorkspace( \
ctx, sizeof(int64_t) * info.rhs_offset.size())); \
    CUDA_CALL(cudaMemcpy(                                                   \
    CUDA_CALL(hipMemcpy(                                                    \
        (RHS_OFF), &info.rhs_offset[0],                                     \
        sizeof(int64_t) * info.rhs_offset.size(), cudaMemcpyHostToDevice)); \
        sizeof(int64_t) * info.rhs_offset.size(), hipMemcpyHostToDevice));  \
if ((EDGE_MAP)) { \
constexpr bool UseIdx = true; \
{ __VA_ARGS__ } \
...
...
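Outside the macro, the copy it performs boils down to allocating a device workspace and issuing a host-to-device hipMemcpy of the offset table. A stand-alone sketch of the same steps (the vector contents and surrounding device/ctx objects are illustrative):

// Sketch: copy a host offset table into a device workspace, as the macro does.
std::vector<int64_t> offsets = {0, 16, 32};  // example host-side offsets
int64_t* d_offsets = static_cast<int64_t*>(
    device->AllocWorkspace(ctx, sizeof(int64_t) * offsets.size()));
CUDA_CALL(hipMemcpy(
    d_offsets, offsets.data(), sizeof(int64_t) * offsets.size(),
    hipMemcpyHostToDevice));
// ... launch kernels that read d_offsets ...
device->FreeWorkspace(ctx, d_offsets);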