Commit 6ac701f8 authored by sangwzh's avatar sangwzh
Browse files

update src and graphbolt code

parent 1547bd93
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/** /**
* Copyright (c) 2020 by Contributors * Copyright (c) 2020 by Contributors
* @file array/cpu/array_sort.cu * @file array/cpu/array_sort.cu
* @brief Array sort GPU implementation * @brief Array sort GPU implementation
*/ */
#include <dgl/array.h> #include <dgl/array.h>
#include "../../../include/dgl/array.h"
#include <cub/cub.cuh>
#include <hipcub/hipcub.hpp>
#include "../../runtime/cuda/cuda_common.h" #include "../../runtime/cuda/cuda_common.h"
#include "./utils.h" #include "utils.h"
namespace dgl { namespace dgl {
using runtime::NDArray; using runtime::NDArray;
...@@ -29,20 +33,20 @@ std::pair<IdArray, IdArray> Sort(IdArray array, int num_bits) { ...@@ -29,20 +33,20 @@ std::pair<IdArray, IdArray> Sort(IdArray array, int num_bits) {
IdType* keys_out = sorted_array.Ptr<IdType>(); IdType* keys_out = sorted_array.Ptr<IdType>();
int64_t* values_out = sorted_idx.Ptr<int64_t>(); int64_t* values_out = sorted_idx.Ptr<int64_t>();
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
if (num_bits == 0) { if (num_bits == 0) {
num_bits = sizeof(IdType) * 8; num_bits = sizeof(IdType) * 8;
} }
// Allocate workspace // Allocate workspace
size_t workspace_size = 0; size_t workspace_size = 0;
CUDA_CALL(cub::DeviceRadixSort::SortPairs( CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
nullptr, workspace_size, keys_in, keys_out, values_in, values_out, nitems, nullptr, workspace_size, keys_in, keys_out, values_in, values_out, nitems,
0, num_bits, stream)); 0, num_bits, stream));
void* workspace = device->AllocWorkspace(ctx, workspace_size); void* workspace = device->AllocWorkspace(ctx, workspace_size);
// Compute // Compute
CUDA_CALL(cub::DeviceRadixSort::SortPairs( CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
workspace, workspace_size, keys_in, keys_out, values_in, values_out, workspace, workspace_size, keys_in, keys_out, values_in, values_out,
nitems, 0, num_bits, stream)); nitems, 0, num_bits, stream));
......
// !!! This is a file automatically generated by hipify!!!
/** /**
* Copyright (c) 2019 by Contributors * Copyright (c) 2019 by Contributors
* @file array/cuda/atomic.cuh * @file array/cuda/atomic.cuh
...@@ -6,7 +7,7 @@ ...@@ -6,7 +7,7 @@
#ifndef DGL_ARRAY_CUDA_ATOMIC_CUH_ #ifndef DGL_ARRAY_CUDA_ATOMIC_CUH_
#define DGL_ARRAY_CUDA_ATOMIC_CUH_ #define DGL_ARRAY_CUDA_ATOMIC_CUH_
#include <cuda_runtime.h> #include <hip/hip_runtime.h>
#include <cassert> #include <cassert>
#include <cstdint> #include <cstdint>
...@@ -15,8 +16,8 @@ ...@@ -15,8 +16,8 @@
#include "bf16.cuh" #include "bf16.cuh"
#include "fp16.cuh" #include "fp16.cuh"
#if __CUDA_ARCH__ >= 600 #if __HIPCC__
#include <cuda_fp16.h> #include <hip/hip_fp16.h>
#endif #endif
namespace dgl { namespace dgl {
...@@ -56,39 +57,39 @@ struct Cast { ...@@ -56,39 +57,39 @@ struct Cast {
template <> template <>
struct Cast<half> { struct Cast<half> {
typedef Code<sizeof(half)>::Type Type; typedef half Type;
static __device__ __forceinline__ Type Encode(half val) { static __host__ __device__ __forceinline__ Type Encode(half val) {
return __half_as_ushort(val); return __half_as_ushort(val);
} }
static __device__ __forceinline__ half Decode(Type code) { static __host__ __device__ __forceinline__ half Decode(Type code) {
return __ushort_as_half(code); return __ushort_as_half(code);
} }
}; };
#if BF16_ENABLED #if BF16_ENABLED
template <> template <>
struct Cast<__nv_bfloat16> { struct Cast<__hip_bfloat16> {
typedef Code<sizeof(__nv_bfloat16)>::Type Type; typedef __hip_bfloat16 Type;
static __device__ __forceinline__ Type Encode(__nv_bfloat16 val) { static __host__ __device__ __forceinline__ Type Encode(__hip_bfloat16 val) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 #if defined(__HIP_DEVICE_COMPILE__)
return __bfloat16_as_ushort(val); return __bfloat16_as_ushort(val);
#else #else
printf( printf(
"Atomic operations are not supported for bfloat16 (BF16) " "Atomic operations are not supported for bfloat16 (BF16) "
"on GPUs with compute capability less than 8.0.\n"); "on GPUs with compute capability less than 8.0.\n");
__trap(); // //__trap();
return static_cast<Type>(0); return static_cast<Type>(0);
#endif #endif
} }
static __device__ __forceinline__ __nv_bfloat16 Decode(Type code) { static __host__ __device__ __forceinline__ __hip_bfloat16 Decode(Type code) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 #if defined(__HIP_DEVICE_COMPILE__)
return __ushort_as_bfloat16(code); return __ushort_as_bfloat16(code);
#else #else
printf( printf(
"Atomic operations are not supported for bfloat16 (BF16) " "Atomic operations are not supported for bfloat16 (BF16) "
"on GPUs with compute capability less than 8.0.\n"); "on GPUs with compute capability less than 8.0.\n");
__trap(); //__trap();
return static_cast<__nv_bfloat16>(0.0f); return static_cast<__hip_bfloat16>(0.0f);
#endif #endif
} }
}; };
...@@ -116,12 +117,12 @@ struct Cast<double> { ...@@ -116,12 +117,12 @@ struct Cast<double> {
} }
}; };
static __device__ __forceinline__ unsigned short int atomicCASshort( // NOLINT static __host__ __device__ __forceinline__ unsigned short int atomicCASshort( // NOLINT
unsigned short int* address, // NOLINT unsigned short int* address, // NOLINT
unsigned short int compare, // NOLINT unsigned short int compare, // NOLINT
unsigned short int val) { // NOLINT unsigned short int val) { // NOLINT
static_assert(CUDART_VERSION >= 10000, "Requires at least CUDA 10"); static_assert(DTKRT_VERSION >= 10000, "Requires at least CUDA 10");
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__) >= 700) #if defined(__HIP_DEVICE_COMPILE__) && 0
return atomicCAS(address, compare, val); return atomicCAS(address, compare, val);
#else #else
(void)address; (void)address;
...@@ -130,9 +131,9 @@ static __device__ __forceinline__ unsigned short int atomicCASshort( // NOLINT ...@@ -130,9 +131,9 @@ static __device__ __forceinline__ unsigned short int atomicCASshort( // NOLINT
printf( printf(
"Atomic operations are not supported for half precision (FP16) " "Atomic operations are not supported for half precision (FP16) "
"on this GPU.\n"); "on this GPU.\n");
__trap(); abort();
return val; return val;
#endif // (defined(__CUDA_ARCH__) && (__CUDA_ARCH__) >= 700) #endif // (defined(__HIP_DEVICE_COMPILE__)
} }
#define DEFINE_ATOMIC(NAME) \ #define DEFINE_ATOMIC(NAME) \
...@@ -168,19 +169,53 @@ static __device__ __forceinline__ unsigned short int atomicCASshort( // NOLINT ...@@ -168,19 +169,53 @@ static __device__ __forceinline__ unsigned short int atomicCASshort( // NOLINT
return Cast<dtype>::Decode(old); \ return Cast<dtype>::Decode(old); \
} }
#define OP(a, b) max(a, b) #define DEFINE_ATOMIC_16BIT_BF(NAME, dtype) \
template <> \
__device__ __forceinline__ dtype Atomic##NAME<dtype>( \
dtype * addr, dtype val) { \
typedef uint16_t CT; \
CT* addr_as_ui = reinterpret_cast<CT*>(addr); \
CT old = *addr_as_ui; \
CT assumed = old; \
do { \
assumed = old; \
old = atomicCASshort( \
addr_as_ui, assumed, \
Cast<dtype>::Encode(max((double)val, (double)dtype(old)))); \
} while (assumed != old); \
return Cast<dtype>::Decode(old); \
}
#define DEFINE_ATOMIC_16BIT_Min(NAME, dtype) \
template <> \
__device__ __forceinline__ dtype Atomic##NAME<dtype>( \
dtype * addr, dtype val) { \
typedef uint16_t CT; \
CT* addr_as_ui = reinterpret_cast<CT*>(addr); \
CT old = *addr_as_ui; \
CT assumed = old; \
do { \
assumed = old; \
old = atomicCASshort( \
addr_as_ui, assumed, \
Cast<dtype>::Encode(min(val, dtype(old)))); \
} while (assumed != old); \
return Cast<dtype>::Decode(old); \
}
#define OP(a, b) max((double)a, (double)b)
DEFINE_ATOMIC(Max) DEFINE_ATOMIC(Max)
DEFINE_ATOMIC_16BIT(Max, half) DEFINE_ATOMIC_16BIT(Max, half)
#if BF16_ENABLED #if BF16_ENABLED
DEFINE_ATOMIC_16BIT(Max, __nv_bfloat16) DEFINE_ATOMIC_16BIT_BF(Max, __hip_bfloat16)
#endif // BF16_ENABLED #endif // BF16_ENABLED
#undef OP #undef OP
#define OP(a, b) min(a, b) #define OP(a, b) min((double)a, (double)b)
DEFINE_ATOMIC(Min) DEFINE_ATOMIC(Min)
DEFINE_ATOMIC_16BIT(Min, half) DEFINE_ATOMIC_16BIT(Min, half)
#if BF16_ENABLED #if BF16_ENABLED
DEFINE_ATOMIC_16BIT(Min, __nv_bfloat16) DEFINE_ATOMIC_16BIT_BF(Min, __hip_bfloat16)
#endif // BF16_ENABLED #endif // BF16_ENABLED
#undef OP #undef OP
...@@ -256,7 +291,7 @@ inline __device__ int32_t AtomicMax(int32_t* const address, const int32_t val) { ...@@ -256,7 +291,7 @@ inline __device__ int32_t AtomicMax(int32_t* const address, const int32_t val) {
template <> template <>
__device__ __forceinline__ float AtomicAdd<float>(float* addr, float val) { __device__ __forceinline__ float AtomicAdd<float>(float* addr, float val) {
#if __CUDA_ARCH__ >= 200 #if __HIP_DEVICE_COMPILE__
return atomicAdd(addr, val); return atomicAdd(addr, val);
#else #else
typedef float T; typedef float T;
...@@ -270,12 +305,12 @@ __device__ __forceinline__ float AtomicAdd<float>(float* addr, float val) { ...@@ -270,12 +305,12 @@ __device__ __forceinline__ float AtomicAdd<float>(float* addr, float val) {
addr_as_ui, assumed, Cast<T>::Encode(Cast<T>::Decode(old) + val)); addr_as_ui, assumed, Cast<T>::Encode(Cast<T>::Decode(old) + val));
} while (assumed != old); } while (assumed != old);
return Cast<T>::Decode(old); return Cast<T>::Decode(old);
#endif // __CUDA_ARCH__ #endif // __HIP_DEVICE_COMPILE__
} }
template <> template <>
__device__ __forceinline__ double AtomicAdd<double>(double* addr, double val) { __device__ __forceinline__ double AtomicAdd<double>(double* addr, double val) {
#if __CUDA_ARCH__ >= 600 #if __HIP_DEVICE_COMPILE__
return atomicAdd(addr, val); return atomicAdd(addr, val);
#else #else
typedef double T; typedef double T;
...@@ -292,11 +327,11 @@ __device__ __forceinline__ double AtomicAdd<double>(double* addr, double val) { ...@@ -292,11 +327,11 @@ __device__ __forceinline__ double AtomicAdd<double>(double* addr, double val) {
#endif #endif
} }
#if defined(CUDART_VERSION) && CUDART_VERSION >= 10000 #if defined(DTKRT_VERSION) && DTKRT_VERSION >= 10000
template <> template <>
__device__ __forceinline__ half AtomicAdd<half>(half* addr, half val) { __device__ __forceinline__ half AtomicAdd<half>(half* addr, half val) {
// make sure we have half support // make sure we have half support
#if __CUDA_ARCH__ >= 700 #if __HIP_DEVICE_COMPILE__
return atomicAdd(addr, val); return atomicAdd(addr, val);
#else #else
(void)addr; (void)addr;
...@@ -304,18 +339,18 @@ __device__ __forceinline__ half AtomicAdd<half>(half* addr, half val) { ...@@ -304,18 +339,18 @@ __device__ __forceinline__ half AtomicAdd<half>(half* addr, half val) {
printf( printf(
"Atomic operations are not supported for half precision (FP16) " "Atomic operations are not supported for half precision (FP16) "
"on this GPU.\n"); "on this GPU.\n");
__trap(); // //__trap();
return val; return val;
#endif // __CUDA_ARCH__ >= 700 #endif // __HIP_DEVICE_COMPILE__
} }
#endif // defined(CUDART_VERSION) && CUDART_VERSION >= 10000 #endif // defined(DTKRT_VERSION) && DTKRT_VERSION >= 10000
#if BF16_ENABLED #if BF16_ENABLED
template <> template <>
__device__ __forceinline__ __nv_bfloat16 __device__ __forceinline__ __hip_bfloat16
AtomicAdd<__nv_bfloat16>(__nv_bfloat16* addr, __nv_bfloat16 val) { AtomicAdd<__hip_bfloat16>(__hip_bfloat16* addr, __hip_bfloat16 val) {
// make sure we have bfloat16 support // make sure we have bfloat16 support
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 #if defined(__HIP_DEVICE_COMPILE__)
return atomicAdd(addr, val); return atomicAdd(addr, val);
#else #else
(void)addr; (void)addr;
...@@ -323,9 +358,9 @@ AtomicAdd<__nv_bfloat16>(__nv_bfloat16* addr, __nv_bfloat16 val) { ...@@ -323,9 +358,9 @@ AtomicAdd<__nv_bfloat16>(__nv_bfloat16* addr, __nv_bfloat16 val) {
printf( printf(
"Atomic operations are not supported for bfloat16 (BF16) " "Atomic operations are not supported for bfloat16 (BF16) "
"on GPUs with compute capability less than 8.0.\n"); "on GPUs with compute capability less than 8.0.\n");
__trap(); //__trap();
return val; return val;
#endif // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 #endif // defined(__HIP_DEVICE_COMPILE__)
} }
#endif // BF16_ENABLED #endif // BF16_ENABLED
......
// !!! This is a file automatically generated by hipify!!!
/** /**
* Copyright (c) 2022 by Contributors * Copyright (c) 2022 by Contributors
* *
...@@ -18,131 +19,140 @@ ...@@ -18,131 +19,140 @@
*/ */
#ifndef DGL_ARRAY_CUDA_BF16_CUH_ #ifndef DGL_ARRAY_CUDA_BF16_CUH_
#define DGL_ARRAY_CUDA_BF16_CUH_ #define DGL_ARRAY_CUDA_BF16_CUH_
#include <hip/hip_runtime.h>
#if BF16_ENABLED #if BF16_ENABLED
#include <cuda_bf16.h> #include <hip/hip_bf16.h>
#include <algorithm> #include <algorithm>
static __device__ __forceinline__ __nv_bfloat16 static __device__ __forceinline__ __hip_bfloat16
max(__nv_bfloat16 a, __nv_bfloat16 b) { max(__hip_bfloat16 a, __hip_bfloat16 b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 #if defined(__HIP_DEVICE_COMPILE__)
return __hmax(a, b); return __hmax(a, b);
#else #else
return __nv_bfloat16(max(float(a), float(b))); // NOLINT return __hip_bfloat16(max(float(a), float(b))); // NOLINT
#endif #endif
} }
static __device__ __forceinline__ __nv_bfloat16 static __device__ __forceinline__ __hip_bfloat16
min(__nv_bfloat16 a, __nv_bfloat16 b) { min(__hip_bfloat16 a, __hip_bfloat16 b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 #if defined(__HIP_DEVICE_COMPILE__)
return __hmin(a, b); return __hmin(a, b);
#else #else
return __nv_bfloat16(min(float(a), float(b))); // NOLINT return __hip_bfloat16(min(float(a), float(b))); // NOLINT
#endif #endif
} }
#ifdef __CUDACC__ #ifdef __HIPCC__
// Arithmetic BF16 operations for architecture >= 8.0 are already defined in // Arithmetic BF16 operations for architecture >= 8.0 are already defined in
// cuda_bf16.h // hip/__hip_bfloat16.h
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800) // #if defined(__DTK_ARCH__) && (__DTK_ARCH__ < 800)
// CUDA 12.2 adds "emulated" support for older architectures. // // CUDA 12.2 adds "emulated" support for older architectures.
#if defined(CUDART_VERSION) && (CUDART_VERSION < 12020) // #if defined(DTKRT_VERSION) && (DTKRT_VERSION < 12020)
__device__ __forceinline__ __nv_bfloat16 __device__ __forceinline__ __hip_bfloat16
operator+(const __nv_bfloat16& lh, const __nv_bfloat16& rh) { operator+(const __hip_bfloat16& lh, const __hip_bfloat16& rh) {
return __nv_bfloat16(float(lh) + float(rh)); // NOLINT return __hip_bfloat16(float(lh) + float(rh)); // NOLINT
} }
__device__ __forceinline__ __nv_bfloat16 __device__ __forceinline__ __hip_bfloat16
operator-(const __nv_bfloat16& lh, const __nv_bfloat16& rh) { operator-(const __hip_bfloat16& lh, const __hip_bfloat16& rh) {
return __nv_bfloat16(float(lh) - float(rh)); // NOLINT return __hip_bfloat16(float(lh) - float(rh)); // NOLINT
} }
__device__ __forceinline__ __nv_bfloat16 __device__ __forceinline__ __hip_bfloat16
operator*(const __nv_bfloat16& lh, const __nv_bfloat16& rh) { operator*(const __hip_bfloat16& lh, const __hip_bfloat16& rh) {
return __nv_bfloat16(float(lh) * float(rh)); // NOLINT return __hip_bfloat16(float(lh) * float(rh)); // NOLINT
} }
__device__ __forceinline__ __nv_bfloat16 __device__ __forceinline__ __hip_bfloat16
operator/(const __nv_bfloat16& lh, const __nv_bfloat16& rh) { operator/(const __hip_bfloat16& lh, const __hip_bfloat16& rh) {
return __nv_bfloat16(float(lh) / float(rh)); // NOLINT return __hip_bfloat16(float(lh) / float(rh)); // NOLINT
} }
__device__ __forceinline__ __nv_bfloat16& operator+=( __device__ __forceinline__ __hip_bfloat16& operator+=(
__nv_bfloat16& lh, const __nv_bfloat16& rh) { // NOLINT __hip_bfloat16& lh, const __hip_bfloat16& rh) { // NOLINT
lh = __nv_bfloat16(float(lh) + float(rh)); // NOLINT lh = __hip_bfloat16(float(lh) + float(rh)); // NOLINT
return lh; return lh;
} }
__device__ __forceinline__ __nv_bfloat16& operator-=( __device__ __forceinline__ __hip_bfloat16& operator-=(
__nv_bfloat16& lh, const __nv_bfloat16& rh) { // NOLINT __hip_bfloat16& lh, const __hip_bfloat16& rh) { // NOLINT
lh = __nv_bfloat16(float(lh) - float(rh)); // NOLINT lh = __hip_bfloat16(float(lh) - float(rh)); // NOLINT
return lh; return lh;
} }
__device__ __forceinline__ __nv_bfloat16& operator*=( __device__ __forceinline__ __hip_bfloat16& operator*=(
__nv_bfloat16& lh, const __nv_bfloat16& rh) { // NOLINT __hip_bfloat16& lh, const __hip_bfloat16& rh) { // NOLINT
lh = __nv_bfloat16(float(lh) * float(rh)); // NOLINT lh = __hip_bfloat16(float(lh) * float(rh)); // NOLINT
return lh; return lh;
} }
__device__ __forceinline__ __nv_bfloat16& operator/=( __device__ __forceinline__ __hip_bfloat16& operator/=(
__nv_bfloat16& lh, const __nv_bfloat16& rh) { // NOLINT __hip_bfloat16& lh, const __hip_bfloat16& rh) { // NOLINT
lh = __nv_bfloat16(float(lh) / float(rh)); // NOLINT lh = __hip_bfloat16(float(lh) / float(rh)); // NOLINT
return lh; return lh;
} }
__device__ __forceinline__ __nv_bfloat16& operator++( __device__ __forceinline__ __hip_bfloat16& operator++(
__nv_bfloat16& h) { // NOLINT __hip_bfloat16& h) { // NOLINT
h = __nv_bfloat16(float(h) + 1.0f); // NOLINT h = __hip_bfloat16(float(h) + 1.0f); // NOLINT
return h; return h;
} }
__device__ __forceinline__ __nv_bfloat16& operator--( __device__ __forceinline__ __hip_bfloat16& operator--(
__nv_bfloat16& h) { // NOLINT __hip_bfloat16& h) { // NOLINT
h = __nv_bfloat16(float(h) - 1.0f); // NOLINT h = __hip_bfloat16(float(h) - 1.0f); // NOLINT
return h; return h;
} }
__device__ __forceinline__ __nv_bfloat16 __device__ __forceinline__ __hip_bfloat16
operator++(__nv_bfloat16& h, int) { // NOLINT operator++(__hip_bfloat16& h, int) { // NOLINT
__nv_bfloat16 ret = h; __hip_bfloat16 ret = h;
h = __nv_bfloat16(float(h) + 1.0f); // NOLINT h = __hip_bfloat16(float(h) + 1.0f); // NOLINT
return ret; return ret;
} }
__device__ __forceinline__ __nv_bfloat16 __device__ __forceinline__ __hip_bfloat16
operator--(__nv_bfloat16& h, int) { // NOLINT operator--(__hip_bfloat16& h, int) { // NOLINT
__nv_bfloat16 ret = h; __hip_bfloat16 ret = h;
h = __nv_bfloat16(float(h) - 1.0f); // NOLINT h = __hip_bfloat16(float(h) - 1.0f); // NOLINT
return ret; return ret;
} }
__device__ __forceinline__ __nv_bfloat16 operator+(const __nv_bfloat16& h) { __device__ __forceinline__ __hip_bfloat16 operator+(const __hip_bfloat16& h) {
return h; return h;
} }
__device__ __forceinline__ __nv_bfloat16 operator-(const __nv_bfloat16& h) { __device__ __forceinline__ __hip_bfloat16 operator-(const __hip_bfloat16& h) {
return __nv_bfloat16(-float(h)); // NOLINT return __hip_bfloat16(-float(h)); // NOLINT
} }
__device__ __forceinline__ bool operator==( __device__ __forceinline__ bool operator==(
const __nv_bfloat16& lh, const __nv_bfloat16& rh) { const __hip_bfloat16& lh, const __hip_bfloat16& rh) {
return float(lh) == float(rh); // NOLINT return float(lh) == float(rh); // NOLINT
} }
__device__ __forceinline__ bool operator!=( __device__ __forceinline__ bool operator!=(
const __nv_bfloat16& lh, const __nv_bfloat16& rh) { const __hip_bfloat16& lh, const __hip_bfloat16& rh) {
return float(lh) != float(rh); // NOLINT return float(lh) != float(rh); // NOLINT
} }
__device__ __forceinline__ bool operator>( __device__ __forceinline__ bool operator>(
const __nv_bfloat16& lh, const __nv_bfloat16& rh) { const __hip_bfloat16& lh, const __hip_bfloat16& rh) {
return float(lh) > float(rh); // NOLINT return float(lh) > float(rh); // NOLINT
} }
__device__ __forceinline__ bool operator<( __device__ __forceinline__ bool operator<(
const __nv_bfloat16& lh, const __nv_bfloat16& rh) { const __hip_bfloat16& lh, const __hip_bfloat16& rh) {
return float(lh) < float(rh); // NOLINT return float(lh) < float(rh); // NOLINT
} }
__device__ __forceinline__ bool operator>=( __device__ __forceinline__ bool operator>=(
const __nv_bfloat16& lh, const __nv_bfloat16& rh) { const __hip_bfloat16& lh, const __hip_bfloat16& rh) {
return float(lh) >= float(rh); // NOLINT return float(lh) >= float(rh); // NOLINT
} }
__device__ __forceinline__ bool operator<=( __device__ __forceinline__ bool operator<=(
const __nv_bfloat16& lh, const __nv_bfloat16& rh) { const __hip_bfloat16& lh, const __hip_bfloat16& rh) {
return float(lh) <= float(rh); // NOLINT return float(lh) <= float(rh); // NOLINT
} }
#endif // defined(CUDART_VERSION) && (CUDART_VERSION < 12020) // #endif // defined(DTKRT_VERSION) && (DTKRT_VERSION < 12020)
#endif // defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800) // #endif // defined(__DTK_ARCH__) && (__DTK_ARCH__ < 800)
#endif // __CUDACC__ __device__
inline
__hip_bfloat16 __shfl_down(__hip_bfloat16 var, unsigned int lane_delta, int width = warpSize) {
union { unsigned short s; __hip_bfloat16 us; } tmp;
tmp.us = var;
tmp.s = __shfl_down(tmp.s, lane_delta, width);
return tmp.us;
}
#endif // __HIPCC__
#endif // BF16_ENABLED #endif // BF16_ENABLED
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/** /**
* Copyright (c) 2020 by Contributors * Copyright (c) 2020 by Contributors
* @file array/cuda/coo2csr.cc * @file array/cuda/coo2csr.cc
* @brief COO2CSR * @brief COO2CSR
*/ */
#include <dgl/array.h> #include <dgl/array.h>
#include "../../../include/dgl/array.h"
#include "../../runtime/cuda/cuda_common.h" #include "../../runtime/cuda/cuda_common.h"
#include "./utils.h" #include "utils.h"
namespace dgl { namespace dgl {
...@@ -24,12 +28,12 @@ CSRMatrix COOToCSR(COOMatrix coo) { ...@@ -24,12 +28,12 @@ CSRMatrix COOToCSR(COOMatrix coo) {
template <> template <>
CSRMatrix COOToCSR<kDGLCUDA, int32_t>(COOMatrix coo) { CSRMatrix COOToCSR<kDGLCUDA, int32_t>(COOMatrix coo) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
// allocate cusparse handle if needed // allocate cusparse handle if needed
if (!thr_entry->cusparse_handle) { if (!thr_entry->cusparse_handle) {
CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
} }
CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream)); CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream));
bool row_sorted = coo.row_sorted; bool row_sorted = coo.row_sorted;
bool col_sorted = coo.col_sorted; bool col_sorted = coo.col_sorted;
...@@ -50,9 +54,9 @@ CSRMatrix COOToCSR<kDGLCUDA, int32_t>(COOMatrix coo) { ...@@ -50,9 +54,9 @@ CSRMatrix COOToCSR<kDGLCUDA, int32_t>(COOMatrix coo) {
NDArray indptr = NDArray indptr =
aten::NewIdArray(coo.num_rows + 1, coo.row->ctx, coo.row->dtype.bits); aten::NewIdArray(coo.num_rows + 1, coo.row->ctx, coo.row->dtype.bits);
int32_t* indptr_ptr = static_cast<int32_t*>(indptr->data); int32_t* indptr_ptr = static_cast<int32_t*>(indptr->data);
CUSPARSE_CALL(cusparseXcoo2csr( CUSPARSE_CALL(hipsparseXcoo2csr(
thr_entry->cusparse_handle, coo.row.Ptr<int32_t>(), nnz, coo.num_rows, thr_entry->cusparse_handle, coo.row.Ptr<int32_t>(), nnz, coo.num_rows,
indptr_ptr, CUSPARSE_INDEX_BASE_ZERO)); indptr_ptr, HIPSPARSE_INDEX_BASE_ZERO));
return CSRMatrix( return CSRMatrix(
coo.num_rows, coo.num_cols, indptr, coo.col, coo.data, col_sorted); coo.num_rows, coo.num_cols, indptr, coo.col, coo.data, col_sorted);
...@@ -100,7 +104,7 @@ template <> ...@@ -100,7 +104,7 @@ template <>
CSRMatrix COOToCSR<kDGLCUDA, int64_t>(COOMatrix coo) { CSRMatrix COOToCSR<kDGLCUDA, int64_t>(COOMatrix coo) {
const auto& ctx = coo.row->ctx; const auto& ctx = coo.row->ctx;
const auto nbits = coo.row->dtype.bits; const auto nbits = coo.row->dtype.bits;
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
bool row_sorted = coo.row_sorted; bool row_sorted = coo.row_sorted;
bool col_sorted = coo.col_sorted; bool col_sorted = coo.col_sorted;
if (!row_sorted) { if (!row_sorted) {
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/** /**
* Copyright (c) 2020 by Contributors * Copyright (c) 2020 by Contributors
* @file array/cuda/coo_sort.cc * @file array/cuda/coo_sort.cc
* @brief Sort COO index * @brief Sort COO index
*/ */
#include <dgl/array.h> #include <dgl/array.h>
#include "../../../include/dgl/array.h"
#include "../../c_api_common.h" #include "../../c_api_common.h"
#include "../../runtime/cuda/cuda_common.h" #include "../../runtime/cuda/cuda_common.h"
#include "./utils.h" #include "utils.h"
namespace dgl { namespace dgl {
...@@ -65,7 +69,7 @@ __global__ void _COODecodeEdgesKernel( ...@@ -65,7 +69,7 @@ __global__ void _COODecodeEdgesKernel(
template <DGLDeviceType XPU, typename IdType> template <DGLDeviceType XPU, typename IdType>
void COOSort_(COOMatrix* coo, bool sort_column) { void COOSort_(COOMatrix* coo, bool sort_column) {
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const int row_bits = cuda::_NumberOfBits(coo->num_rows); const int row_bits = cuda::_NumberOfBits(coo->num_rows);
const int64_t nnz = coo->row->shape[0]; const int64_t nnz = coo->row->shape[0];
...@@ -138,7 +142,7 @@ template <DGLDeviceType XPU, typename IdType> ...@@ -138,7 +142,7 @@ template <DGLDeviceType XPU, typename IdType>
std::pair<bool, bool> COOIsSorted(COOMatrix coo) { std::pair<bool, bool> COOIsSorted(COOMatrix coo) {
const int64_t nnz = coo.row->shape[0]; const int64_t nnz = coo.row->shape[0];
const auto& ctx = coo.row->ctx; const auto& ctx = coo.row->ctx;
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
auto device = runtime::DeviceAPI::Get(ctx); auto device = runtime::DeviceAPI::Get(ctx);
// We allocate a workspace of 2*nnz bytes. It wastes a little bit memory but // We allocate a workspace of 2*nnz bytes. It wastes a little bit memory but
// should be fine. // should be fine.
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/** /**
* Copyright (c) 2020 by Contributors * Copyright (c) 2020 by Contributors
* @file array/cuda/csr2coo.cc * @file array/cuda/csr2coo.cc
...@@ -8,10 +10,10 @@ ...@@ -8,10 +10,10 @@
#include <thrust/iterator/counting_iterator.h> #include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h> #include <thrust/iterator/transform_iterator.h>
#include <cub/cub.cuh> #include <hipcub/hipcub.hpp>
#include "../../runtime/cuda/cuda_common.h" #include "../../runtime/cuda/cuda_common.h"
#include "./utils.h" #include "utils.h"
namespace dgl { namespace dgl {
...@@ -29,12 +31,12 @@ COOMatrix CSRToCOO(CSRMatrix csr) { ...@@ -29,12 +31,12 @@ COOMatrix CSRToCOO(CSRMatrix csr) {
template <> template <>
COOMatrix CSRToCOO<kDGLCUDA, int32_t>(CSRMatrix csr) { COOMatrix CSRToCOO<kDGLCUDA, int32_t>(CSRMatrix csr) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
// allocate cusparse handle if needed // allocate cusparse handle if needed
if (!thr_entry->cusparse_handle) { if (!thr_entry->cusparse_handle) {
CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
} }
CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream)); CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream));
NDArray indptr = csr.indptr, indices = csr.indices, data = csr.data; NDArray indptr = csr.indptr, indices = csr.indices, data = csr.data;
const int32_t* indptr_ptr = static_cast<int32_t*>(indptr->data); const int32_t* indptr_ptr = static_cast<int32_t*>(indptr->data);
...@@ -42,9 +44,9 @@ COOMatrix CSRToCOO<kDGLCUDA, int32_t>(CSRMatrix csr) { ...@@ -42,9 +44,9 @@ COOMatrix CSRToCOO<kDGLCUDA, int32_t>(CSRMatrix csr) {
aten::NewIdArray(indices->shape[0], indptr->ctx, indptr->dtype.bits); aten::NewIdArray(indices->shape[0], indptr->ctx, indptr->dtype.bits);
int32_t* row_ptr = static_cast<int32_t*>(row->data); int32_t* row_ptr = static_cast<int32_t*>(row->data);
CUSPARSE_CALL(cusparseXcsr2coo( CUSPARSE_CALL(hipsparseXcsr2coo(
thr_entry->cusparse_handle, indptr_ptr, indices->shape[0], csr.num_rows, thr_entry->cusparse_handle, indptr_ptr, indices->shape[0], csr.num_rows,
row_ptr, CUSPARSE_INDEX_BASE_ZERO)); row_ptr, HIPSPARSE_INDEX_BASE_ZERO));
return COOMatrix( return COOMatrix(
csr.num_rows, csr.num_cols, row, indices, data, true, csr.sorted); csr.num_rows, csr.num_cols, row, indices, data, true, csr.sorted);
...@@ -72,10 +74,40 @@ struct AdjacentDifference { ...@@ -72,10 +74,40 @@ struct AdjacentDifference {
} }
}; };
/*!
* \brief Repeat elements
* \param val Value to repeat
* \param repeats Number of repeats for each value
* \param pos The position of the output buffer to write the value.
* \param out Output buffer.
* \param length Number of values
*
* For example:
* val = [3, 0, 1]
* repeats = [1, 0, 2]
* pos = [0, 1, 1] # write to output buffer position 0, 1, 1
* then,
* out = [3, 1, 1]
*/
template <typename DType, typename IdType>
__global__ void _RepeatKernel(
const DType* val, const IdType* pos,
DType* out, int64_t n_row, int64_t length) {
IdType tx = static_cast<IdType>(blockIdx.x) * blockDim.x + threadIdx.x;
const int stride_x = gridDim.x * blockDim.x;
while (tx < length) {
IdType i = dgl::cuda::_UpperBound(pos, n_row, tx) - 1;
out[tx] = val[i];
tx += stride_x;
}
}
#if 0
template <> template <>
COOMatrix CSRToCOO<kDGLCUDA, int64_t>(CSRMatrix csr) { COOMatrix CSRToCOO<kDGLCUDA, int64_t>(CSRMatrix csr) {
const auto& ctx = csr.indptr->ctx; const auto& ctx = csr.indptr->ctx;
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const int64_t nnz = csr.indices->shape[0]; const int64_t nnz = csr.indices->shape[0];
const auto nbits = csr.indptr->dtype.bits; const auto nbits = csr.indptr->dtype.bits;
...@@ -96,14 +128,14 @@ COOMatrix CSRToCOO<kDGLCUDA, int64_t>(CSRMatrix csr) { ...@@ -96,14 +128,14 @@ COOMatrix CSRToCOO<kDGLCUDA, int64_t>(CSRMatrix csr) {
std::size_t temp_storage_bytes = 0; std::size_t temp_storage_bytes = 0;
CUDA_CALL(cub::DeviceCopy::Batched( CUDA_CALL(cub::DeviceCopy::Batched(
nullptr, temp_storage_bytes, input_buffer + i, output_buffer + i, nullptr, temp_storage_bytes, input_buffer + i, output_buffer + i,
buffer_sizes + i, std::min(csr.num_rows - i, max_copy_at_once), buffer_sizes + i, ::min(csr.num_rows - i, max_copy_at_once),
stream)); stream));
auto temp = allocator.alloc_unique<char>(temp_storage_bytes); auto temp = allocator.alloc_unique<char>(temp_storage_bytes);
CUDA_CALL(cub::DeviceCopy::Batched( CUDA_CALL(cub::DeviceCopy::Batched(
temp.get(), temp_storage_bytes, input_buffer + i, output_buffer + i, temp.get(), temp_storage_bytes, input_buffer + i, output_buffer + i,
buffer_sizes + i, std::min(csr.num_rows - i, max_copy_at_once), buffer_sizes + i, ::min(csr.num_rows - i, max_copy_at_once),
stream)); stream));
} }
...@@ -111,6 +143,30 @@ COOMatrix CSRToCOO<kDGLCUDA, int64_t>(CSRMatrix csr) { ...@@ -111,6 +143,30 @@ COOMatrix CSRToCOO<kDGLCUDA, int64_t>(CSRMatrix csr) {
csr.num_rows, csr.num_cols, ret_row, csr.indices, csr.data, true, csr.num_rows, csr.num_cols, ret_row, csr.indices, csr.data, true,
csr.sorted); csr.sorted);
} }
#else
template <>
COOMatrix CSRToCOO<kDGLCUDA, int64_t>(CSRMatrix csr) {
const auto& ctx = csr.indptr->ctx;
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const int64_t nnz = csr.indices->shape[0];
const auto nbits = csr.indptr->dtype.bits;
IdArray rowids = Range(0, csr.num_rows, nbits, ctx);
IdArray ret_row = NewIdArray(nnz, ctx, nbits);
const int nt = 256;
const int nb = (nnz + nt - 1) / nt;
CUDA_KERNEL_CALL(_RepeatKernel,
nb, nt, 0, stream,
rowids.Ptr<int64_t>(),
csr.indptr.Ptr<int64_t>(), ret_row.Ptr<int64_t>(),
csr.num_rows, nnz);
return COOMatrix(csr.num_rows, csr.num_cols,
ret_row, csr.indices, csr.data,
true, csr.sorted);
}
#endif
template COOMatrix CSRToCOO<kDGLCUDA, int32_t>(CSRMatrix csr); template COOMatrix CSRToCOO<kDGLCUDA, int32_t>(CSRMatrix csr);
template COOMatrix CSRToCOO<kDGLCUDA, int64_t>(CSRMatrix csr); template COOMatrix CSRToCOO<kDGLCUDA, int64_t>(CSRMatrix csr);
...@@ -128,12 +184,12 @@ COOMatrix CSRToCOODataAsOrder<kDGLCUDA, int32_t>(CSRMatrix csr) { ...@@ -128,12 +184,12 @@ COOMatrix CSRToCOODataAsOrder<kDGLCUDA, int32_t>(CSRMatrix csr) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
auto device = runtime::DeviceAPI::Get(coo.row->ctx); auto device = runtime::DeviceAPI::Get(coo.row->ctx);
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
// allocate cusparse handle if needed // allocate cusparse handle if needed
if (!thr_entry->cusparse_handle) { if (!thr_entry->cusparse_handle) {
CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
} }
CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream)); CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream));
NDArray row = coo.row, col = coo.col, data = coo.data; NDArray row = coo.row, col = coo.col, data = coo.data;
int32_t* row_ptr = static_cast<int32_t*>(row->data); int32_t* row_ptr = static_cast<int32_t*>(row->data);
...@@ -141,11 +197,11 @@ COOMatrix CSRToCOODataAsOrder<kDGLCUDA, int32_t>(CSRMatrix csr) { ...@@ -141,11 +197,11 @@ COOMatrix CSRToCOODataAsOrder<kDGLCUDA, int32_t>(CSRMatrix csr) {
int32_t* data_ptr = static_cast<int32_t*>(data->data); int32_t* data_ptr = static_cast<int32_t*>(data->data);
size_t workspace_size = 0; size_t workspace_size = 0;
CUSPARSE_CALL(cusparseXcoosort_bufferSizeExt( CUSPARSE_CALL(hipsparseXcoosort_bufferSizeExt(
thr_entry->cusparse_handle, coo.num_rows, coo.num_cols, row->shape[0], thr_entry->cusparse_handle, coo.num_rows, coo.num_cols, row->shape[0],
data_ptr, row_ptr, &workspace_size)); data_ptr, row_ptr, &workspace_size));
void* workspace = device->AllocWorkspace(row->ctx, workspace_size); void* workspace = device->AllocWorkspace(row->ctx, workspace_size);
CUSPARSE_CALL(cusparseXcoosortByRow( CUSPARSE_CALL(hipsparseXcoosortByRow(
thr_entry->cusparse_handle, coo.num_rows, coo.num_cols, row->shape[0], thr_entry->cusparse_handle, coo.num_rows, coo.num_cols, row->shape[0],
data_ptr, row_ptr, col_ptr, workspace)); data_ptr, row_ptr, col_ptr, workspace));
device->FreeWorkspace(row->ctx, workspace); device->FreeWorkspace(row->ctx, workspace);
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/** /**
* Copyright (c) 2021 by Contributors * Copyright (c) 2021 by Contributors
* @file array/cuda/csr_get_data.cu * @file array/cuda/csr_get_data.cu
* @brief Retrieve entries of a CSR matrix * @brief Retrieve entries of a CSR matrix
*/ */
#include <dgl/array.h> #include <dgl/array.h>
#include "../../../include/dgl/array.h"
#include <numeric> #include <numeric>
#include <unordered_set> #include <unordered_set>
#include <vector> #include <vector>
#include "../../runtime/cuda/cuda_common.h" #include "../../runtime/cuda/cuda_common.h"
#include "./utils.h" #include "utils.h"
namespace dgl { namespace dgl {
...@@ -32,11 +36,11 @@ NDArray CSRGetData( ...@@ -32,11 +36,11 @@ NDArray CSRGetData(
const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1; const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1;
const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1; const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1;
const int64_t rstlen = std::max(rowlen, collen); const int64_t rstlen = ::max(rowlen, collen);
IdArray rst = NDArray::Empty({rstlen}, weights->dtype, rows->ctx); IdArray rst = NDArray::Empty({rstlen}, weights->dtype, rows->ctx);
if (rstlen == 0) return rst; if (rstlen == 0) return rst;
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const int nt = cuda::FindNumThreads(rstlen); const int nt = cuda::FindNumThreads(rstlen);
const int nb = (rstlen + nt - 1) / nt; const int nb = (rstlen + nt - 1) / nt;
if (return_eids) if (return_eids)
...@@ -67,12 +71,12 @@ template NDArray CSRGetData<kDGLCUDA, int64_t, __half>( ...@@ -67,12 +71,12 @@ template NDArray CSRGetData<kDGLCUDA, int64_t, __half>(
CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids, CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids,
NDArray weights, __half filler); NDArray weights, __half filler);
#if BF16_ENABLED #if BF16_ENABLED
template NDArray CSRGetData<kDGLCUDA, int32_t, __nv_bfloat16>( template NDArray CSRGetData<kDGLCUDA, int32_t, __hip_bfloat16>(
CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids, CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids,
NDArray weights, __nv_bfloat16 filler); NDArray weights, __hip_bfloat16 filler);
template NDArray CSRGetData<kDGLCUDA, int64_t, __nv_bfloat16>( template NDArray CSRGetData<kDGLCUDA, int64_t, __hip_bfloat16>(
CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids, CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids,
NDArray weights, __nv_bfloat16 filler); NDArray weights, __hip_bfloat16 filler);
#endif // BF16_ENABLED #endif // BF16_ENABLED
template NDArray CSRGetData<kDGLCUDA, int32_t, float>( template NDArray CSRGetData<kDGLCUDA, int32_t, float>(
CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids, CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids,
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/** /**
* Copyright (c) 2020 by Contributors * Copyright (c) 2020 by Contributors
* @file array/cuda/csr_mm.cu * @file array/cuda/csr_mm.cu
* @brief SpSpMM/SpGEMM C APIs and definitions. * @brief SpSpMM/SpGEMM C APIs and definitions.
*/ */
#include <dgl/array.h> #include <dgl/array.h>
#include "../../../include/dgl/array.h"
#include <dgl/runtime/device_api.h> #include <dgl/runtime/device_api.h>
#include <limits> #include <limits>
#include "../../runtime/cuda/cuda_common.h" #include "../../runtime/cuda/cuda_common.h"
#include "./cusparse_dispatcher.cuh" #include "cusparse_dispatcher.cuh"
#include "./functor.cuh" #include "functor.cuh"
namespace dgl { namespace dgl {
using namespace dgl::runtime; using namespace dgl::runtime;
...@@ -18,7 +22,7 @@ using namespace dgl::runtime; ...@@ -18,7 +22,7 @@ using namespace dgl::runtime;
namespace aten { namespace aten {
namespace cusparse { namespace cusparse {
#if CUDART_VERSION >= 12000 #if DTKRT_VERSION >= 12000
/** @brief Cusparse implementation of SpGEMM on Csr format for CUDA 12.0+ */ /** @brief Cusparse implementation of SpGEMM on Csr format for CUDA 12.0+ */
template <typename DType, typename IdType> template <typename DType, typename IdType>
...@@ -31,74 +35,74 @@ std::pair<CSRMatrix, NDArray> CusparseSpgemm( ...@@ -31,74 +35,74 @@ std::pair<CSRMatrix, NDArray> CusparseSpgemm(
const int nnzB = B.indices->shape[0]; const int nnzB = B.indices->shape[0];
const DType alpha = 1.0; const DType alpha = 1.0;
const DType beta = 0.0; const DType beta = 0.0;
auto transA = CUSPARSE_OPERATION_NON_TRANSPOSE; auto transA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
auto transB = CUSPARSE_OPERATION_NON_TRANSPOSE; auto transB = HIPSPARSE_OPERATION_NON_TRANSPOSE;
// device // device
auto ctx = A.indptr->ctx; auto ctx = A.indptr->ctx;
auto device = runtime::DeviceAPI::Get(ctx); auto device = runtime::DeviceAPI::Get(ctx);
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const DType* A_weights = A_weights_array.Ptr<DType>(); const DType* A_weights = A_weights_array.Ptr<DType>();
const DType* B_weights = B_weights_array.Ptr<DType>(); const DType* B_weights = B_weights_array.Ptr<DType>();
// allocate cusparse handle if needed // allocate cusparse handle if needed
if (!thr_entry->cusparse_handle) { if (!thr_entry->cusparse_handle) {
CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
} }
CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream)); CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream));
// all one data array // all one data array
cusparseSpMatDescr_t matA, matB, matC; hipsparseSpMatDescr_t matA, matB, matC;
IdArray dC_csrOffsets = IdArray dC_csrOffsets =
IdArray::Empty({A.num_rows + 1}, A.indptr->dtype, A.indptr->ctx); IdArray::Empty({A.num_rows + 1}, A.indptr->dtype, A.indptr->ctx);
IdType* dC_csrOffsets_data = dC_csrOffsets.Ptr<IdType>(); IdType* dC_csrOffsets_data = dC_csrOffsets.Ptr<IdType>();
constexpr auto idtype = cusparse_idtype<IdType>::value; constexpr auto idtype = cusparse_idtype<IdType>::value;
constexpr auto dtype = cuda_dtype<DType>::value; constexpr auto dtype = cuda_dtype<DType>::value;
// Create sparse matrix A, B and C in CSR format // Create sparse matrix A, B and C in CSR format
CUSPARSE_CALL(cusparseCreateCsr( CUSPARSE_CALL(hipsparseCreateCsr(
&matA, A.num_rows, A.num_cols, nnzA, A.indptr.Ptr<IdType>(), &matA, A.num_rows, A.num_cols, nnzA, A.indptr.Ptr<IdType>(),
A.indices.Ptr<IdType>(), A.indices.Ptr<IdType>(),
// cusparseCreateCsr only accepts non-const pointers. // hipsparseCreateCsr only accepts non-const pointers.
const_cast<DType*>(A_weights), idtype, idtype, CUSPARSE_INDEX_BASE_ZERO, const_cast<DType*>(A_weights), idtype, idtype, HIPSPARSE_INDEX_BASE_ZERO,
dtype)); dtype));
CUSPARSE_CALL(cusparseCreateCsr( CUSPARSE_CALL(hipsparseCreateCsr(
&matB, B.num_rows, B.num_cols, nnzB, B.indptr.Ptr<IdType>(), &matB, B.num_rows, B.num_cols, nnzB, B.indptr.Ptr<IdType>(),
B.indices.Ptr<IdType>(), B.indices.Ptr<IdType>(),
// cusparseCreateCsr only accepts non-const pointers. // hipsparseCreateCsr only accepts non-const pointers.
const_cast<DType*>(B_weights), idtype, idtype, CUSPARSE_INDEX_BASE_ZERO, const_cast<DType*>(B_weights), idtype, idtype, HIPSPARSE_INDEX_BASE_ZERO,
dtype)); dtype));
CUSPARSE_CALL(cusparseCreateCsr( CUSPARSE_CALL(hipsparseCreateCsr(
&matC, A.num_rows, B.num_cols, 0, dC_csrOffsets_data, nullptr, nullptr, &matC, A.num_rows, B.num_cols, 0, dC_csrOffsets_data, nullptr, nullptr,
idtype, idtype, CUSPARSE_INDEX_BASE_ZERO, dtype)); idtype, idtype, HIPSPARSE_INDEX_BASE_ZERO, dtype));
// SpGEMM Computation // SpGEMM Computation
cusparseSpGEMMDescr_t spgemmDesc; hipsparseSpGEMMDescr_t spgemmDesc;
cusparseSpGEMMAlg_t alg = CUSPARSE_SPGEMM_DEFAULT; cusparseSpGEMMAlg_t alg = HIPSPARSE_SPGEMM_DEFAULT;
CUSPARSE_CALL(cusparseSpGEMM_createDescr(&spgemmDesc)); CUSPARSE_CALL(hipsparseSpGEMM_createDescr(&spgemmDesc));
size_t workspace_size1 = 0, workspace_size2 = 0, workspace_size3 = 0; size_t workspace_size1 = 0, workspace_size2 = 0, workspace_size3 = 0;
// ask bufferSize1 bytes for external memory // ask bufferSize1 bytes for external memory
CUSPARSE_CALL(cusparseSpGEMM_workEstimation( CUSPARSE_CALL(hipsparseSpGEMM_workEstimation(
thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
matC, dtype, alg, spgemmDesc, &workspace_size1, NULL)); matC, dtype, alg, spgemmDesc, &workspace_size1, NULL));
void* workspace1 = (device->AllocWorkspace(ctx, workspace_size1)); void* workspace1 = (device->AllocWorkspace(ctx, workspace_size1));
// inspect the matrices A and B to understand the memory requiremnent // inspect the matrices A and B to understand the memory requiremnent
cusparseStatus_t e = cusparseSpGEMM_workEstimation( hipsparseStatus_t e = hipsparseSpGEMM_workEstimation(
thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
matC, dtype, alg, spgemmDesc, &workspace_size1, workspace1); matC, dtype, alg, spgemmDesc, &workspace_size1, workspace1);
// CUSPARSE_SPGEMM_DEFAULT not support getting num_prods > 2^31 -1 // HIPSPARSE_SPGEMM_DEFAULT not support getting num_prods > 2^31 -1
// and throws insufficient memory error within workEstimation call // and throws insufficient memory error within workEstimation call
if (e == CUSPARSE_STATUS_INSUFFICIENT_RESOURCES) { if (e == CUSPARSE_STATUS_INSUFFICIENT_RESOURCES) {
// fall back to ALG2 to estimate num_prods // fall back to ALG2 to estimate num_prods
alg = CUSPARSE_SPGEMM_ALG2; alg = CUSPARSE_SPGEMM_ALG2;
device->FreeWorkspace(ctx, workspace1); device->FreeWorkspace(ctx, workspace1);
// rerun cusparseSpGEMM_workEstimation // rerun hipsparseSpGEMM_workEstimation
CUSPARSE_CALL(cusparseSpGEMM_workEstimation( CUSPARSE_CALL(hipsparseSpGEMM_workEstimation(
thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
matC, dtype, alg, spgemmDesc, &workspace_size1, NULL)); matC, dtype, alg, spgemmDesc, &workspace_size1, NULL));
workspace1 = (device->AllocWorkspace(ctx, workspace_size1)); workspace1 = (device->AllocWorkspace(ctx, workspace_size1));
CUSPARSE_CALL(cusparseSpGEMM_workEstimation( CUSPARSE_CALL(hipsparseSpGEMM_workEstimation(
thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
matC, dtype, alg, spgemmDesc, &workspace_size1, workspace1)); matC, dtype, alg, spgemmDesc, &workspace_size1, workspace1));
} else { } else {
CHECK(e == CUSPARSE_STATUS_SUCCESS) << "CUSPARSE ERROR in SpGEMM: " << e; CHECK(e == HIPSPARSE_STATUS_SUCCESS) << "CUSPARSE ERROR in SpGEMM: " << e;
} }
// get the number of intermediate products required for SpGEMM compute // get the number of intermediate products required for SpGEMM compute
...@@ -113,22 +117,22 @@ std::pair<CSRMatrix, NDArray> CusparseSpgemm( ...@@ -113,22 +117,22 @@ std::pair<CSRMatrix, NDArray> CusparseSpgemm(
int64_t LARGE_NUM_PRODUCTS = 800000000; // 800*1000*1000; int64_t LARGE_NUM_PRODUCTS = 800000000; // 800*1000*1000;
// switch to ALG2/ALG3 for medium & large problem size // switch to ALG2/ALG3 for medium & large problem size
if (alg == CUSPARSE_SPGEMM_DEFAULT && num_prods > MEDIUM_NUM_PRODUCTS) { if (alg == HIPSPARSE_SPGEMM_DEFAULT && num_prods > MEDIUM_NUM_PRODUCTS) {
// use ALG3 for very large problem // use ALG3 for very large problem
alg = num_prods > LARGE_NUM_PRODUCTS ? CUSPARSE_SPGEMM_ALG3 alg = num_prods > LARGE_NUM_PRODUCTS ? CUSPARSE_SPGEMM_ALG3
: CUSPARSE_SPGEMM_ALG2; : CUSPARSE_SPGEMM_ALG2;
device->FreeWorkspace(ctx, workspace1); device->FreeWorkspace(ctx, workspace1);
// rerun cusparseSpGEMM_workEstimation // rerun hipsparseSpGEMM_workEstimation
CUSPARSE_CALL(cusparseSpGEMM_workEstimation( CUSPARSE_CALL(hipsparseSpGEMM_workEstimation(
thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
matC, dtype, alg, spgemmDesc, &workspace_size1, NULL)); matC, dtype, alg, spgemmDesc, &workspace_size1, NULL));
workspace1 = (device->AllocWorkspace(ctx, workspace_size1)); workspace1 = (device->AllocWorkspace(ctx, workspace_size1));
CUSPARSE_CALL(cusparseSpGEMM_workEstimation( CUSPARSE_CALL(hipsparseSpGEMM_workEstimation(
thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
matC, dtype, alg, spgemmDesc, &workspace_size1, workspace1)); matC, dtype, alg, spgemmDesc, &workspace_size1, workspace1));
} else if (alg == CUSPARSE_SPGEMM_ALG2 && num_prods > LARGE_NUM_PRODUCTS) { } else if (alg == CUSPARSE_SPGEMM_ALG2 && num_prods > LARGE_NUM_PRODUCTS) {
// no need to rerun cusparseSpGEMM_workEstimation between ALG2 and ALG3 // no need to rerun hipsparseSpGEMM_workEstimation between ALG2 and ALG3
alg = CUSPARSE_SPGEMM_ALG3; alg = CUSPARSE_SPGEMM_ALG3;
} }
...@@ -147,40 +151,40 @@ std::pair<CSRMatrix, NDArray> CusparseSpgemm( ...@@ -147,40 +151,40 @@ std::pair<CSRMatrix, NDArray> CusparseSpgemm(
workspace3, &workspace_size2)); workspace3, &workspace_size2));
device->FreeWorkspace(ctx, workspace3); device->FreeWorkspace(ctx, workspace3);
} else { } else {
CUSPARSE_CALL(cusparseSpGEMM_compute( CUSPARSE_CALL(hipsparseSpGEMM_compute(
thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
matC, dtype, alg, spgemmDesc, &workspace_size2, NULL)); matC, dtype, alg, spgemmDesc, &workspace_size2, NULL));
} }
// ask bufferSize2 bytes for external memory // ask bufferSize2 bytes for external memory
void* workspace2 = device->AllocWorkspace(ctx, workspace_size2); void* workspace2 = device->AllocWorkspace(ctx, workspace_size2);
// compute the intermediate product of A * B // compute the intermediate product of A * B
CUSPARSE_CALL(cusparseSpGEMM_compute( CUSPARSE_CALL(hipsparseSpGEMM_compute(
thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
matC, dtype, alg, spgemmDesc, &workspace_size2, workspace2)); matC, dtype, alg, spgemmDesc, &workspace_size2, workspace2));
// get matrix C non-zero entries C_nnz1 // get matrix C non-zero entries C_nnz1
int64_t C_num_rows1, C_num_cols1, C_nnz1; int64_t C_num_rows1, C_num_cols1, C_nnz1;
CUSPARSE_CALL( CUSPARSE_CALL(
cusparseSpMatGetSize(matC, &C_num_rows1, &C_num_cols1, &C_nnz1)); hipsparseSpMatGetSize(matC, &C_num_rows1, &C_num_cols1, &C_nnz1));
IdArray dC_columns = IdArray::Empty({C_nnz1}, A.indptr->dtype, A.indptr->ctx); IdArray dC_columns = IdArray::Empty({C_nnz1}, A.indptr->dtype, A.indptr->ctx);
NDArray dC_weights = NDArray dC_weights =
NDArray::Empty({C_nnz1}, A_weights_array->dtype, A.indptr->ctx); NDArray::Empty({C_nnz1}, A_weights_array->dtype, A.indptr->ctx);
IdType* dC_columns_data = dC_columns.Ptr<IdType>(); IdType* dC_columns_data = dC_columns.Ptr<IdType>();
DType* dC_weights_data = dC_weights.Ptr<DType>(); DType* dC_weights_data = dC_weights.Ptr<DType>();
// update matC with the new pointers // update matC with the new pointers
CUSPARSE_CALL(cusparseCsrSetPointers( CUSPARSE_CALL(hipsparseCsrSetPointers(
matC, dC_csrOffsets_data, dC_columns_data, dC_weights_data)); matC, dC_csrOffsets_data, dC_columns_data, dC_weights_data));
// copy the final products to the matrix C // copy the final products to the matrix C
CUSPARSE_CALL(cusparseSpGEMM_copy( CUSPARSE_CALL(hipsparseSpGEMM_copy(
thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
matC, dtype, alg, spgemmDesc)); matC, dtype, alg, spgemmDesc));
device->FreeWorkspace(ctx, workspace1); device->FreeWorkspace(ctx, workspace1);
device->FreeWorkspace(ctx, workspace2); device->FreeWorkspace(ctx, workspace2);
// destroy matrix/vector descriptors // destroy matrix/vector descriptors
CUSPARSE_CALL(cusparseSpGEMM_destroyDescr(spgemmDesc)); CUSPARSE_CALL(hipsparseSpGEMM_destroyDescr(spgemmDesc));
CUSPARSE_CALL(cusparseDestroySpMat(matA)); CUSPARSE_CALL(hipsparseDestroySpMat(matA));
CUSPARSE_CALL(cusparseDestroySpMat(matB)); CUSPARSE_CALL(hipsparseDestroySpMat(matB));
CUSPARSE_CALL(cusparseDestroySpMat(matC)); CUSPARSE_CALL(hipsparseDestroySpMat(matC));
return { return {
CSRMatrix( CSRMatrix(
A.num_rows, B.num_cols, dC_csrOffsets, dC_columns, A.num_rows, B.num_cols, dC_csrOffsets, dC_columns,
...@@ -188,7 +192,7 @@ std::pair<CSRMatrix, NDArray> CusparseSpgemm( ...@@ -188,7 +192,7 @@ std::pair<CSRMatrix, NDArray> CusparseSpgemm(
dC_weights}; dC_weights};
} }
#else // CUDART_VERSION < 12000 #else // DTKRT_VERSION < 12000
/** @brief Cusparse implementation of SpGEMM on Csr format for older CUDA /** @brief Cusparse implementation of SpGEMM on Csr format for older CUDA
* versions */ * versions */
...@@ -208,25 +212,25 @@ std::pair<CSRMatrix, NDArray> CusparseSpgemm( ...@@ -208,25 +212,25 @@ std::pair<CSRMatrix, NDArray> CusparseSpgemm(
auto ctx = A.indptr->ctx; auto ctx = A.indptr->ctx;
auto device = runtime::DeviceAPI::Get(ctx); auto device = runtime::DeviceAPI::Get(ctx);
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
auto idtype = A.indptr->dtype; auto idtype = A.indptr->dtype;
auto dtype = A_weights_array->dtype; auto dtype = A_weights_array->dtype;
const DType* A_weights = A_weights_array.Ptr<DType>(); const DType* A_weights = A_weights_array.Ptr<DType>();
const DType* B_weights = B_weights_array.Ptr<DType>(); const DType* B_weights = B_weights_array.Ptr<DType>();
if (!thr_entry->cusparse_handle) { if (!thr_entry->cusparse_handle) {
CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
} }
CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream)); CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream));
CUSPARSE_CALL(cusparseSetPointerMode( CUSPARSE_CALL(hipsparseSetPointerMode(
thr_entry->cusparse_handle, CUSPARSE_POINTER_MODE_HOST)); thr_entry->cusparse_handle, HIPSPARSE_POINTER_MODE_HOST));
CUSPARSE_CALL(cusparseCreateCsrgemm2Info(&info)); CUSPARSE_CALL(hipsparseCreateCsrgemm2Info(&info));
cusparseMatDescr_t matA, matB, matC, matD; hipsparseMatDescr_t matA, matB, matC, matD;
CUSPARSE_CALL(cusparseCreateMatDescr(&matA)); CUSPARSE_CALL(hipsparseCreateMatDescr(&matA));
CUSPARSE_CALL(cusparseCreateMatDescr(&matB)); CUSPARSE_CALL(hipsparseCreateMatDescr(&matB));
CUSPARSE_CALL(cusparseCreateMatDescr(&matC)); CUSPARSE_CALL(hipsparseCreateMatDescr(&matC));
CUSPARSE_CALL(cusparseCreateMatDescr(&matD)); // needed even if D is null CUSPARSE_CALL(hipsparseCreateMatDescr(&matD)); // needed even if D is null
CUSPARSE_CALL(CSRGEMM<DType>::bufferSizeExt( CUSPARSE_CALL(CSRGEMM<DType>::bufferSizeExt(
thr_entry->cusparse_handle, m, n, k, &alpha, matA, nnzA, thr_entry->cusparse_handle, m, n, k, &alpha, matA, nnzA,
...@@ -252,11 +256,11 @@ std::pair<CSRMatrix, NDArray> CusparseSpgemm( ...@@ -252,11 +256,11 @@ std::pair<CSRMatrix, NDArray> CusparseSpgemm(
C_indptr.Ptr<IdType>(), C_indices.Ptr<IdType>(), info, workspace)); C_indptr.Ptr<IdType>(), C_indices.Ptr<IdType>(), info, workspace));
device->FreeWorkspace(ctx, workspace); device->FreeWorkspace(ctx, workspace);
CUSPARSE_CALL(cusparseDestroyCsrgemm2Info(info)); CUSPARSE_CALL(hipsparseDestroyCsrgemm2Info(info));
CUSPARSE_CALL(cusparseDestroyMatDescr(matA)); CUSPARSE_CALL(hipsparseDestroyMatDescr(matA));
CUSPARSE_CALL(cusparseDestroyMatDescr(matB)); CUSPARSE_CALL(hipsparseDestroyMatDescr(matB));
CUSPARSE_CALL(cusparseDestroyMatDescr(matC)); CUSPARSE_CALL(hipsparseDestroyMatDescr(matC));
CUSPARSE_CALL(cusparseDestroyMatDescr(matD)); CUSPARSE_CALL(hipsparseDestroyMatDescr(matD));
return { return {
CSRMatrix( CSRMatrix(
...@@ -264,7 +268,7 @@ std::pair<CSRMatrix, NDArray> CusparseSpgemm( ...@@ -264,7 +268,7 @@ std::pair<CSRMatrix, NDArray> CusparseSpgemm(
C_weights}; C_weights};
} }
#endif // CUDART_VERSION >= 12000 #endif // DTKRT_VERSION >= 12000
} // namespace cusparse } // namespace cusparse
template <int XPU, typename IdType, typename DType> template <int XPU, typename IdType, typename DType>
...@@ -314,9 +318,9 @@ template std::pair<CSRMatrix, NDArray> CSRMM<kDGLCUDA, int32_t, __half>( ...@@ -314,9 +318,9 @@ template std::pair<CSRMatrix, NDArray> CSRMM<kDGLCUDA, int32_t, __half>(
template std::pair<CSRMatrix, NDArray> CSRMM<kDGLCUDA, int64_t, __half>( template std::pair<CSRMatrix, NDArray> CSRMM<kDGLCUDA, int64_t, __half>(
const CSRMatrix&, NDArray, const CSRMatrix&, NDArray); const CSRMatrix&, NDArray, const CSRMatrix&, NDArray);
#if BF16_ENABLED #if BF16_ENABLED
template std::pair<CSRMatrix, NDArray> CSRMM<kDGLCUDA, int32_t, __nv_bfloat16>( template std::pair<CSRMatrix, NDArray> CSRMM<kDGLCUDA, int32_t, __hip_bfloat16>(
const CSRMatrix&, NDArray, const CSRMatrix&, NDArray); const CSRMatrix&, NDArray, const CSRMatrix&, NDArray);
template std::pair<CSRMatrix, NDArray> CSRMM<kDGLCUDA, int64_t, __nv_bfloat16>( template std::pair<CSRMatrix, NDArray> CSRMM<kDGLCUDA, int64_t, __hip_bfloat16>(
const CSRMatrix&, NDArray, const CSRMatrix&, NDArray); const CSRMatrix&, NDArray, const CSRMatrix&, NDArray);
#endif // BF16_ENABLED #endif // BF16_ENABLED
template std::pair<CSRMatrix, NDArray> CSRMM<kDGLCUDA, int32_t, float>( template std::pair<CSRMatrix, NDArray> CSRMM<kDGLCUDA, int32_t, float>(
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/** /**
* Copyright (c) 2020 by Contributors * Copyright (c) 2020 by Contributors
* @file array/cuda/csr_sort.cc * @file array/cuda/csr_sort.cc
...@@ -5,10 +7,10 @@ ...@@ -5,10 +7,10 @@
*/ */
#include <dgl/array.h> #include <dgl/array.h>
#include <cub/cub.cuh> #include <hipcub/hipcub.hpp>
#include "../../runtime/cuda/cuda_common.h" #include "../../runtime/cuda/cuda_common.h"
#include "./utils.h" #include "utils.h"
namespace dgl { namespace dgl {
...@@ -39,7 +41,7 @@ __global__ void _SegmentIsSorted( ...@@ -39,7 +41,7 @@ __global__ void _SegmentIsSorted(
template <DGLDeviceType XPU, typename IdType> template <DGLDeviceType XPU, typename IdType>
bool CSRIsSorted(CSRMatrix csr) { bool CSRIsSorted(CSRMatrix csr) {
const auto& ctx = csr.indptr->ctx; const auto& ctx = csr.indptr->ctx;
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
auto device = runtime::DeviceAPI::Get(ctx); auto device = runtime::DeviceAPI::Get(ctx);
// We allocate a workspace of num_rows bytes. It wastes a little bit of memory // We allocate a workspace of num_rows bytes. It wastes a little bit of memory
// but should be fine. // but should be fine.
...@@ -67,12 +69,12 @@ template <> ...@@ -67,12 +69,12 @@ template <>
void CSRSort_<kDGLCUDA, int32_t>(CSRMatrix* csr) { void CSRSort_<kDGLCUDA, int32_t>(CSRMatrix* csr) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
auto device = runtime::DeviceAPI::Get(csr->indptr->ctx); auto device = runtime::DeviceAPI::Get(csr->indptr->ctx);
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
// allocate cusparse handle if needed // allocate cusparse handle if needed
if (!thr_entry->cusparse_handle) { if (!thr_entry->cusparse_handle) {
CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
} }
CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream)); CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream));
NDArray indptr = csr->indptr; NDArray indptr = csr->indptr;
NDArray indices = csr->indices; NDArray indices = csr->indices;
...@@ -83,16 +85,16 @@ void CSRSort_<kDGLCUDA, int32_t>(CSRMatrix* csr) { ...@@ -83,16 +85,16 @@ void CSRSort_<kDGLCUDA, int32_t>(CSRMatrix* csr) {
NDArray data = csr->data; NDArray data = csr->data;
size_t workspace_size = 0; size_t workspace_size = 0;
CUSPARSE_CALL(cusparseXcsrsort_bufferSizeExt( CUSPARSE_CALL(hipsparseXcsrsort_bufferSizeExt(
thr_entry->cusparse_handle, csr->num_rows, csr->num_cols, nnz, thr_entry->cusparse_handle, csr->num_rows, csr->num_cols, nnz,
indptr.Ptr<int32_t>(), indices.Ptr<int32_t>(), &workspace_size)); indptr.Ptr<int32_t>(), indices.Ptr<int32_t>(), &workspace_size));
void* workspace = device->AllocWorkspace(ctx, workspace_size); void* workspace = device->AllocWorkspace(ctx, workspace_size);
cusparseMatDescr_t descr; hipsparseMatDescr_t descr;
CUSPARSE_CALL(cusparseCreateMatDescr(&descr)); CUSPARSE_CALL(hipsparseCreateMatDescr(&descr));
CUSPARSE_CALL(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)); CUSPARSE_CALL(hipsparseSetMatType(descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
CUSPARSE_CALL(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)); CUSPARSE_CALL(hipsparseSetMatIndexBase(descr, HIPSPARSE_INDEX_BASE_ZERO));
CUSPARSE_CALL(cusparseXcsrsort( CUSPARSE_CALL(hipsparseXcsrsort(
thr_entry->cusparse_handle, csr->num_rows, csr->num_cols, nnz, descr, thr_entry->cusparse_handle, csr->num_rows, csr->num_cols, nnz, descr,
indptr.Ptr<int32_t>(), indices.Ptr<int32_t>(), data.Ptr<int32_t>(), indptr.Ptr<int32_t>(), indices.Ptr<int32_t>(), data.Ptr<int32_t>(),
workspace)); workspace));
...@@ -100,13 +102,13 @@ void CSRSort_<kDGLCUDA, int32_t>(CSRMatrix* csr) { ...@@ -100,13 +102,13 @@ void CSRSort_<kDGLCUDA, int32_t>(CSRMatrix* csr) {
csr->sorted = true; csr->sorted = true;
// free resources // free resources
CUSPARSE_CALL(cusparseDestroyMatDescr(descr)); CUSPARSE_CALL(hipsparseDestroyMatDescr(descr));
device->FreeWorkspace(ctx, workspace); device->FreeWorkspace(ctx, workspace);
} }
template <> template <>
void CSRSort_<kDGLCUDA, int64_t>(CSRMatrix* csr) { void CSRSort_<kDGLCUDA, int64_t>(CSRMatrix* csr) {
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
auto device = runtime::DeviceAPI::Get(csr->indptr->ctx); auto device = runtime::DeviceAPI::Get(csr->indptr->ctx);
const auto& ctx = csr->indptr->ctx; const auto& ctx = csr->indptr->ctx;
...@@ -125,13 +127,13 @@ void CSRSort_<kDGLCUDA, int64_t>(CSRMatrix* csr) { ...@@ -125,13 +127,13 @@ void CSRSort_<kDGLCUDA, int64_t>(CSRMatrix* csr) {
// Allocate workspace // Allocate workspace
size_t workspace_size = 0; size_t workspace_size = 0;
CUDA_CALL(cub::DeviceSegmentedRadixSort::SortPairs( CUDA_CALL(hipcub::DeviceSegmentedRadixSort::SortPairs(
nullptr, workspace_size, key_in, key_out, value_in, value_out, nnz, nullptr, workspace_size, key_in, key_out, value_in, value_out, nnz,
csr->num_rows, offsets, offsets + 1, 0, sizeof(int64_t) * 8, stream)); csr->num_rows, offsets, offsets + 1, 0, sizeof(int64_t) * 8, stream));
void* workspace = device->AllocWorkspace(ctx, workspace_size); void* workspace = device->AllocWorkspace(ctx, workspace_size);
// Compute // Compute
CUDA_CALL(cub::DeviceSegmentedRadixSort::SortPairs( CUDA_CALL(hipcub::DeviceSegmentedRadixSort::SortPairs(
workspace, workspace_size, key_in, key_out, value_in, value_out, nnz, workspace, workspace_size, key_in, key_out, value_in, value_out, nnz,
csr->num_rows, offsets, offsets + 1, 0, sizeof(int64_t) * 8, stream)); csr->num_rows, offsets, offsets + 1, 0, sizeof(int64_t) * 8, stream));
......
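// --- Editor's sketch (not part of this commit) ---------------------------
// Standalone illustration of the two-phase hipCUB call used by
// CSRSort_<kDGLCUDA, int64_t> above: the first SortPairs call with a null
// workspace only reports the temporary-storage size, the second performs
// the segmented sort.  Plain hipMalloc/hipFree stand in for DGL's
// DeviceAPI workspace allocator here.
#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>

void SegmentedSortPairsSketch(
    const int64_t* keys_in, int64_t* keys_out,
    const int64_t* vals_in, int64_t* vals_out,
    int num_items, int num_segments,
    const int64_t* seg_offsets,  // length num_segments + 1, e.g. a CSR indptr
    hipStream_t stream) {
  size_t temp_bytes = 0;
  // Phase 1: size query only; nothing is launched.
  hipcub::DeviceSegmentedRadixSort::SortPairs(
      nullptr, temp_bytes, keys_in, keys_out, vals_in, vals_out,
      num_items, num_segments, seg_offsets, seg_offsets + 1,
      0, sizeof(int64_t) * 8, stream);
  void* temp = nullptr;
  hipMalloc(&temp, temp_bytes);
  // Phase 2: the actual per-segment radix sort.
  hipcub::DeviceSegmentedRadixSort::SortPairs(
      temp, temp_bytes, keys_in, keys_out, vals_in, vals_out,
      num_items, num_segments, seg_offsets, seg_offsets + 1,
      0, sizeof(int64_t) * 8, stream);
  hipFree(temp);
}
// --------------------------------------------------------------------------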
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/** /**
* Copyright (c) 2020 by Contributors * Copyright (c) 2020 by Contributors
* @file array/cuda/spmm.cu * @file array/cuda/spmm.cu
...@@ -7,8 +9,8 @@ ...@@ -7,8 +9,8 @@
#include <dgl/runtime/device_api.h> #include <dgl/runtime/device_api.h>
#include "../../runtime/cuda/cuda_common.h" #include "../../runtime/cuda/cuda_common.h"
#include "./cusparse_dispatcher.cuh" #include "cusparse_dispatcher.cuh"
#include "./functor.cuh" #include "functor.cuh"
namespace dgl { namespace dgl {
...@@ -32,21 +34,21 @@ std::pair<CSRMatrix, NDArray> CusparseCsrgeam2( ...@@ -32,21 +34,21 @@ std::pair<CSRMatrix, NDArray> CusparseCsrgeam2(
auto ctx = A.indptr->ctx; auto ctx = A.indptr->ctx;
auto device = runtime::DeviceAPI::Get(ctx); auto device = runtime::DeviceAPI::Get(ctx);
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const DType* A_weights = A_weights_array.Ptr<DType>(); const DType* A_weights = A_weights_array.Ptr<DType>();
const DType* B_weights = B_weights_array.Ptr<DType>(); const DType* B_weights = B_weights_array.Ptr<DType>();
// allocate cusparse handle if needed // allocate cusparse handle if needed
if (!thr_entry->cusparse_handle) if (!thr_entry->cusparse_handle)
CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream)); CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream));
cusparseMatDescr_t matA, matB, matC; hipsparseMatDescr_t matA, matB, matC;
CUSPARSE_CALL(cusparseCreateMatDescr(&matA)); CUSPARSE_CALL(hipsparseCreateMatDescr(&matA));
CUSPARSE_CALL(cusparseCreateMatDescr(&matB)); CUSPARSE_CALL(hipsparseCreateMatDescr(&matB));
CUSPARSE_CALL(cusparseCreateMatDescr(&matC)); CUSPARSE_CALL(hipsparseCreateMatDescr(&matC));
cusparseSetPointerMode( hipsparseSetPointerMode(
thr_entry->cusparse_handle, CUSPARSE_POINTER_MODE_HOST); thr_entry->cusparse_handle, HIPSPARSE_POINTER_MODE_HOST);
size_t workspace_size = 0; size_t workspace_size = 0;
/* prepare output C */ /* prepare output C */
IdArray dC_csrOffsets = IdArray::Empty({m + 1}, A.indptr->dtype, ctx); IdArray dC_csrOffsets = IdArray::Empty({m + 1}, A.indptr->dtype, ctx);
...@@ -81,9 +83,9 @@ std::pair<CSRMatrix, NDArray> CusparseCsrgeam2( ...@@ -81,9 +83,9 @@ std::pair<CSRMatrix, NDArray> CusparseCsrgeam2(
device->FreeWorkspace(ctx, workspace); device->FreeWorkspace(ctx, workspace);
// destroy matrix/vector descriptors // destroy matrix/vector descriptors
CUSPARSE_CALL(cusparseDestroyMatDescr(matA)); CUSPARSE_CALL(hipsparseDestroyMatDescr(matA));
CUSPARSE_CALL(cusparseDestroyMatDescr(matB)); CUSPARSE_CALL(hipsparseDestroyMatDescr(matB));
CUSPARSE_CALL(cusparseDestroyMatDescr(matC)); CUSPARSE_CALL(hipsparseDestroyMatDescr(matC));
return { return {
CSRMatrix( CSRMatrix(
A.num_rows, A.num_cols, dC_csrOffsets, dC_columns, A.num_rows, A.num_cols, dC_csrOffsets, dC_columns,
...@@ -159,9 +161,9 @@ template std::pair<CSRMatrix, NDArray> CSRSum<kDGLCUDA, int32_t, __half>( ...@@ -159,9 +161,9 @@ template std::pair<CSRMatrix, NDArray> CSRSum<kDGLCUDA, int32_t, __half>(
template std::pair<CSRMatrix, NDArray> CSRSum<kDGLCUDA, int64_t, __half>( template std::pair<CSRMatrix, NDArray> CSRSum<kDGLCUDA, int64_t, __half>(
const std::vector<CSRMatrix>&, const std::vector<NDArray>&); const std::vector<CSRMatrix>&, const std::vector<NDArray>&);
#if BF16_ENABLED #if BF16_ENABLED
template std::pair<CSRMatrix, NDArray> CSRSum<kDGLCUDA, int32_t, __nv_bfloat16>( template std::pair<CSRMatrix, NDArray> CSRSum<kDGLCUDA, int32_t, __hip_bfloat16>(
const std::vector<CSRMatrix>&, const std::vector<NDArray>&); const std::vector<CSRMatrix>&, const std::vector<NDArray>&);
template std::pair<CSRMatrix, NDArray> CSRSum<kDGLCUDA, int64_t, __nv_bfloat16>( template std::pair<CSRMatrix, NDArray> CSRSum<kDGLCUDA, int64_t, __hip_bfloat16>(
const std::vector<CSRMatrix>&, const std::vector<NDArray>&); const std::vector<CSRMatrix>&, const std::vector<NDArray>&);
#endif // BF16_ENABLED #endif // BF16_ENABLED
template std::pair<CSRMatrix, NDArray> CSRSum<kDGLCUDA, int32_t, float>( template std::pair<CSRMatrix, NDArray> CSRSum<kDGLCUDA, int32_t, float>(
......
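// --- Editor's sketch (not part of this commit) ---------------------------
// Host-side reference for what the csrgeam2 path above computes on the GPU:
// C = alpha*A + beta*B for two CSR matrices of the same shape, with the
// merged column indices kept sorted.  Illustration only; the real code stays
// entirely on the device and goes through the CSRGEAM<DType> dispatcher.
#include <map>
#include <vector>

struct CsrRef {
  int num_rows = 0, num_cols = 0;
  std::vector<int> indptr, indices;
  std::vector<float> data;
};

CsrRef CsrGeamReference(const CsrRef& A, const CsrRef& B,
                        float alpha, float beta) {
  CsrRef C;
  C.num_rows = A.num_rows;
  C.num_cols = A.num_cols;
  C.indptr.push_back(0);
  for (int r = 0; r < A.num_rows; ++r) {
    std::map<int, float> row;  // merges duplicates, keeps columns sorted
    for (int j = A.indptr[r]; j < A.indptr[r + 1]; ++j)
      row[A.indices[j]] += alpha * A.data[j];
    for (int j = B.indptr[r]; j < B.indptr[r + 1]; ++j)
      row[B.indices[j]] += beta * B.data[j];
    for (const auto& kv : row) {
      C.indices.push_back(kv.first);
      C.data.push_back(kv.second);
    }
    C.indptr.push_back(static_cast<int>(C.indices.size()));
  }
  return C;
}
// --------------------------------------------------------------------------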
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/** /**
* Copyright (c) 2020 by Contributors * Copyright (c) 2020 by Contributors
* @file array/cuda/csr_transpose.cc * @file array/cuda/csr_transpose.cc
...@@ -23,12 +25,12 @@ CSRMatrix CSRTranspose(CSRMatrix csr) { ...@@ -23,12 +25,12 @@ CSRMatrix CSRTranspose(CSRMatrix csr) {
template <> template <>
CSRMatrix CSRTranspose<kDGLCUDA, int32_t>(CSRMatrix csr) { CSRMatrix CSRTranspose<kDGLCUDA, int32_t>(CSRMatrix csr) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
// allocate cusparse handle if needed // allocate cusparse handle if needed
if (!thr_entry->cusparse_handle) { if (!thr_entry->cusparse_handle) {
CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
} }
CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream)); CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream));
NDArray indptr = csr.indptr, indices = csr.indices, data = csr.data; NDArray indptr = csr.indptr, indices = csr.indices, data = csr.data;
const int64_t nnz = indices->shape[0]; const int64_t nnz = indices->shape[0];
...@@ -48,30 +50,30 @@ CSRMatrix CSRTranspose<kDGLCUDA, int32_t>(CSRMatrix csr) { ...@@ -48,30 +50,30 @@ CSRMatrix CSRTranspose<kDGLCUDA, int32_t>(CSRMatrix csr) {
int32_t* t_indices_ptr = static_cast<int32_t*>(t_indices->data); int32_t* t_indices_ptr = static_cast<int32_t*>(t_indices->data);
void* t_data_ptr = t_data->data; void* t_data_ptr = t_data->data;
#if CUDART_VERSION >= 10010 #if DTKRT_VERSION >= 10010
auto device = runtime::DeviceAPI::Get(csr.indptr->ctx); auto device = runtime::DeviceAPI::Get(csr.indptr->ctx);
// workspace // workspace
size_t workspace_size; size_t workspace_size;
CUSPARSE_CALL(cusparseCsr2cscEx2_bufferSize( CUSPARSE_CALL(hipsparseCsr2cscEx2_bufferSize(
thr_entry->cusparse_handle, csr.num_rows, csr.num_cols, nnz, data_ptr, thr_entry->cusparse_handle, csr.num_rows, csr.num_cols, nnz, data_ptr,
indptr_ptr, indices_ptr, t_data_ptr, t_indptr_ptr, t_indices_ptr, indptr_ptr, indices_ptr, t_data_ptr, t_indptr_ptr, t_indices_ptr,
CUDA_R_32F, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO, HIP_R_32F, HIPSPARSE_ACTION_NUMERIC, HIPSPARSE_INDEX_BASE_ZERO,
CUSPARSE_CSR2CSC_ALG1, // see cusparse doc for reference HIPSPARSE_CSR2CSC_ALG1, // see cusparse doc for reference
&workspace_size)); &workspace_size));
void* workspace = device->AllocWorkspace(ctx, workspace_size); void* workspace = device->AllocWorkspace(ctx, workspace_size);
CUSPARSE_CALL(cusparseCsr2cscEx2( CUSPARSE_CALL(hipsparseCsr2cscEx2(
thr_entry->cusparse_handle, csr.num_rows, csr.num_cols, nnz, data_ptr, thr_entry->cusparse_handle, csr.num_rows, csr.num_cols, nnz, data_ptr,
indptr_ptr, indices_ptr, t_data_ptr, t_indptr_ptr, t_indices_ptr, indptr_ptr, indices_ptr, t_data_ptr, t_indptr_ptr, t_indices_ptr,
CUDA_R_32F, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO, HIP_R_32F, HIPSPARSE_ACTION_NUMERIC, HIPSPARSE_INDEX_BASE_ZERO,
CUSPARSE_CSR2CSC_ALG1, // see cusparse doc for reference HIPSPARSE_CSR2CSC_ALG1, // see cusparse doc for reference
workspace)); workspace));
device->FreeWorkspace(ctx, workspace); device->FreeWorkspace(ctx, workspace);
#else #else
CUSPARSE_CALL(cusparseScsr2csc( CUSPARSE_CALL(hipsparseScsr2csc(
thr_entry->cusparse_handle, csr.num_rows, csr.num_cols, nnz, thr_entry->cusparse_handle, csr.num_rows, csr.num_cols, nnz,
static_cast<const float*>(data_ptr), indptr_ptr, indices_ptr, static_cast<const float*>(data_ptr), indptr_ptr, indices_ptr,
static_cast<float*>(t_data_ptr), t_indices_ptr, t_indptr_ptr, static_cast<float*>(t_data_ptr), t_indices_ptr, t_indptr_ptr,
CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO)); HIPSPARSE_ACTION_NUMERIC, HIPSPARSE_INDEX_BASE_ZERO));
#endif #endif
return CSRMatrix( return CSRMatrix(
......
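// --- Editor's sketch (not part of this commit) ---------------------------
// hipsparseCsr2cscEx2 above converts CSR to CSC, which is the same thing as
// transposing the CSR matrix.  This host-side reference shows the underlying
// counting-sort idea: count entries per column to build the transposed
// indptr, then scatter every (row, col, val) triple into place.
#include <vector>

void CsrTransposeReference(
    int num_rows, int num_cols,
    const std::vector<int>& indptr, const std::vector<int>& indices,
    const std::vector<float>& data,
    std::vector<int>* t_indptr, std::vector<int>* t_indices,
    std::vector<float>* t_data) {
  const int nnz = static_cast<int>(indices.size());
  t_indptr->assign(num_cols + 1, 0);
  t_indices->assign(nnz, 0);
  t_data->assign(nnz, 0.f);
  // Column histogram -> row lengths of the transpose, then prefix-sum.
  for (int j = 0; j < nnz; ++j) (*t_indptr)[indices[j] + 1] += 1;
  for (int c = 0; c < num_cols; ++c) (*t_indptr)[c + 1] += (*t_indptr)[c];
  // Scatter each entry into its transposed position.
  std::vector<int> cursor(t_indptr->begin(), t_indptr->end() - 1);
  for (int r = 0; r < num_rows; ++r) {
    for (int j = indptr[r]; j < indptr[r + 1]; ++j) {
      const int pos = cursor[indices[j]]++;
      (*t_indices)[pos] = r;
      (*t_data)[pos] = data[j];
    }
  }
}
// --------------------------------------------------------------------------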
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/** /**
* Copyright (c) 2021 by Contributors * Copyright (c) 2021 by Contributors
* @file array/cuda/cuda_filter.cc * @file array/cuda/cuda_filter.cc
...@@ -6,7 +8,7 @@ ...@@ -6,7 +8,7 @@
#include <dgl/runtime/device_api.h> #include <dgl/runtime/device_api.h>
#include <cub/cub.cuh> #include <hipcub/hipcub.hpp>
#include "../../runtime/cuda/cuda_common.h" #include "../../runtime/cuda/cuda_common.h"
#include "../../runtime/cuda/cuda_hashtable.cuh" #include "../../runtime/cuda/cuda_hashtable.cuh"
...@@ -45,7 +47,7 @@ IdArray _PerformFilter(const OrderedHashTable<IdType>& table, IdArray test) { ...@@ -45,7 +47,7 @@ IdArray _PerformFilter(const OrderedHashTable<IdType>& table, IdArray test) {
const auto& ctx = test->ctx; const auto& ctx = test->ctx;
auto device = runtime::DeviceAPI::Get(ctx); auto device = runtime::DeviceAPI::Get(ctx);
const int64_t size = test->shape[0]; const int64_t size = test->shape[0];
cudaStream_t cudaStream = runtime::getCurrentCUDAStream(); hipStream_t cudaStream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
if (size == 0) { if (size == 0) {
return test; return test;
...@@ -74,12 +76,12 @@ IdArray _PerformFilter(const OrderedHashTable<IdType>& table, IdArray test) { ...@@ -74,12 +76,12 @@ IdArray _PerformFilter(const OrderedHashTable<IdType>& table, IdArray test) {
// generate prefix-sum // generate prefix-sum
{ {
size_t workspace_bytes; size_t workspace_bytes;
CUDA_CALL(cub::DeviceScan::ExclusiveSum( CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
nullptr, workspace_bytes, static_cast<IdType*>(nullptr), nullptr, workspace_bytes, static_cast<IdType*>(nullptr),
static_cast<IdType*>(nullptr), size + 1, cudaStream)); static_cast<IdType*>(nullptr), size + 1, cudaStream));
void* workspace = device->AllocWorkspace(ctx, workspace_bytes); void* workspace = device->AllocWorkspace(ctx, workspace_bytes);
CUDA_CALL(cub::DeviceScan::ExclusiveSum( CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
workspace, workspace_bytes, prefix, prefix, size + 1, cudaStream)); workspace, workspace_bytes, prefix, prefix, size + 1, cudaStream));
device->FreeWorkspace(ctx, workspace); device->FreeWorkspace(ctx, workspace);
} }
...@@ -108,8 +110,8 @@ template <typename IdType> ...@@ -108,8 +110,8 @@ template <typename IdType>
class CudaFilterSet : public Filter { class CudaFilterSet : public Filter {
public: public:
explicit CudaFilterSet(IdArray array) explicit CudaFilterSet(IdArray array)
: table_(array->shape[0], array->ctx, runtime::getCurrentCUDAStream()) { : table_(array->shape[0], array->ctx, runtime::getCurrentHIPStreamMasqueradingAsCUDA()) {
cudaStream_t cudaStream = runtime::getCurrentCUDAStream(); hipStream_t cudaStream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
table_.FillWithUnique( table_.FillWithUnique(
static_cast<const IdType*>(array->data), array->shape[0], cudaStream); static_cast<const IdType*>(array->data), array->shape[0], cudaStream);
} }
......
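// --- Editor's sketch (not part of this commit) ---------------------------
// _PerformFilter above is a stream compaction: a kernel writes a 0/1 flag
// per element, hipcub::DeviceScan::ExclusiveSum turns the flags into output
// positions, and a second kernel scatters the selected elements.  Minimal
// HIP illustration with an arbitrary stand-in predicate (keep even values);
// the hash-table membership test plays that role in the real code.
#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>

__global__ void MarkEven(const int64_t* in, int64_t* flags, int64_t n) {
  const int64_t i = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (i < n) flags[i] = (in[i] % 2 == 0) ? 1 : 0;
}

__global__ void ScatterFlagged(
    const int64_t* in, const int64_t* flags, const int64_t* pos,
    int64_t* out, int64_t n) {
  const int64_t i = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (i < n && flags[i]) out[pos[i]] = in[i];
}

void CompactEvenSketch(const int64_t* d_in, int64_t* d_out, int64_t n,
                       hipStream_t stream) {
  int64_t *d_flags = nullptr, *d_pos = nullptr;
  hipMalloc(&d_flags, n * sizeof(int64_t));
  hipMalloc(&d_pos, n * sizeof(int64_t));
  const int nt = 256;
  const int nb = static_cast<int>((n + nt - 1) / nt);
  hipLaunchKernelGGL(MarkEven, dim3(nb), dim3(nt), 0, stream, d_in, d_flags, n);
  // Exclusive prefix sum of the flags gives each kept element its slot.
  size_t tmp_bytes = 0;
  hipcub::DeviceScan::ExclusiveSum(nullptr, tmp_bytes, d_flags, d_pos, n, stream);
  void* tmp = nullptr;
  hipMalloc(&tmp, tmp_bytes);
  hipcub::DeviceScan::ExclusiveSum(tmp, tmp_bytes, d_flags, d_pos, n, stream);
  hipLaunchKernelGGL(ScatterFlagged, dim3(nb), dim3(nt), 0, stream,
                     d_in, d_flags, d_pos, d_out, n);
  hipFree(tmp);
  hipFree(d_flags);
  hipFree(d_pos);
}
// --------------------------------------------------------------------------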
// !!! This is a file automatically generated by hipify!!!
/** /**
* Copyright (c) 2020 by Contributors * Copyright (c) 2020 by Contributors
* @file array/cuda/dispatcher.cuh * @file array/cuda/dispatcher.cuh
...@@ -7,7 +8,7 @@ ...@@ -7,7 +8,7 @@
#ifndef DGL_ARRAY_CUDA_CUSPARSE_DISPATCHER_CUH_ #ifndef DGL_ARRAY_CUDA_CUSPARSE_DISPATCHER_CUH_
#define DGL_ARRAY_CUDA_CUSPARSE_DISPATCHER_CUH_ #define DGL_ARRAY_CUDA_CUSPARSE_DISPATCHER_CUH_
#include <cusparse.h> #include <hipsparse/hipsparse.h>
#include <dgl/runtime/c_runtime_api.h> #include <dgl/runtime/c_runtime_api.h>
#include "bf16.cuh" #include "bf16.cuh"
...@@ -20,70 +21,70 @@ namespace aten { ...@@ -20,70 +21,70 @@ namespace aten {
template <typename DType> template <typename DType>
struct CSRGEMM { struct CSRGEMM {
template <typename... Args> template <typename... Args>
static inline cusparseStatus_t bufferSizeExt(Args... args) { static inline hipsparseStatus_t bufferSizeExt(Args... args) {
BUG_IF_FAIL(false) << "This piece of code should not be reached."; BUG_IF_FAIL(false) << "This piece of code should not be reached.";
return static_cast<cusparseStatus_t>(0); return static_cast<hipsparseStatus_t>(0);
} }
template <typename... Args> template <typename... Args>
static inline cusparseStatus_t nnz(Args... args) { static inline hipsparseStatus_t nnz(Args... args) {
return cusparseXcsrgemm2Nnz(args...); return hipsparseXcsrgemm2Nnz(args...);
} }
template <typename... Args> template <typename... Args>
static inline cusparseStatus_t compute(Args... args) { static inline hipsparseStatus_t compute(Args... args) {
BUG_IF_FAIL(false) << "This piece of code should not be reached."; BUG_IF_FAIL(false) << "This piece of code should not be reached.";
return static_cast<cusparseStatus_t>(0); return static_cast<hipsparseStatus_t>(0);
} }
}; };
template <> template <>
struct CSRGEMM<__half> { struct CSRGEMM<__half> {
template <typename... Args> template <typename... Args>
static inline cusparseStatus_t bufferSizeExt(Args... args) { static inline hipsparseStatus_t bufferSizeExt(Args... args) {
// TODO(ndickson): There is no cusparseHcsrgemm2_bufferSizeExt, so a // TODO(ndickson): There is no cusparseHcsrgemm2_bufferSizeExt, so a
// different implementation would be required. // different implementation would be required.
LOG(FATAL) << "CSRGEMM::bufferSizeExt does not support dtype half (FP16)."; LOG(FATAL) << "CSRGEMM::bufferSizeExt does not support dtype half (FP16).";
return static_cast<cusparseStatus_t>(0); return static_cast<hipsparseStatus_t>(0);
} }
template <typename... Args> template <typename... Args>
static inline cusparseStatus_t nnz(Args... args) { static inline hipsparseStatus_t nnz(Args... args) {
return cusparseXcsrgemm2Nnz(args...); return hipsparseXcsrgemm2Nnz(args...);
} }
template <typename... Args> template <typename... Args>
static inline cusparseStatus_t compute(Args... args) { static inline hipsparseStatus_t compute(Args... args) {
// TODO(ndickson): There is no cusparseHcsrgemm2, so a different // TODO(ndickson): There is no cusparseHcsrgemm2, so a different
// implementation would be required. // implementation would be required.
LOG(FATAL) << "CSRGEMM::compute does not support dtype half (FP16)."; LOG(FATAL) << "CSRGEMM::compute does not support dtype half (FP16).";
return static_cast<cusparseStatus_t>(0); return static_cast<hipsparseStatus_t>(0);
} }
}; };
#if BF16_ENABLED #if BF16_ENABLED
template <> template <>
struct CSRGEMM<__nv_bfloat16> { struct CSRGEMM<__hip_bfloat16> {
template <typename... Args> template <typename... Args>
static inline cusparseStatus_t bufferSizeExt(Args... args) { static inline hipsparseStatus_t bufferSizeExt(Args... args) {
// TODO(ndickson): There is no cusparseHcsrgemm2_bufferSizeExt, so a // TODO(ndickson): There is no cusparseHcsrgemm2_bufferSizeExt, so a
// different implementation would be required. // different implementation would be required.
LOG(FATAL) LOG(FATAL)
<< "CSRGEMM::bufferSizeExt does not support dtype bfloat16 (BF16)."; << "CSRGEMM::bufferSizeExt does not support dtype bfloat16 (BF16).";
return static_cast<cusparseStatus_t>(0); return static_cast<hipsparseStatus_t>(0);
} }
template <typename... Args> template <typename... Args>
static inline cusparseStatus_t nnz(Args... args) { static inline hipsparseStatus_t nnz(Args... args) {
return cusparseXcsrgemm2Nnz(args...); return hipsparseXcsrgemm2Nnz(args...);
} }
template <typename... Args> template <typename... Args>
static inline cusparseStatus_t compute(Args... args) { static inline hipsparseStatus_t compute(Args... args) {
// TODO(ndickson): There is no cusparseHcsrgemm2, so a different // TODO(ndickson): There is no cusparseHcsrgemm2, so a different
// implementation would be required. // implementation would be required.
LOG(FATAL) << "CSRGEMM::compute does not support dtype bfloat16 (BF16)."; LOG(FATAL) << "CSRGEMM::compute does not support dtype bfloat16 (BF16).";
return static_cast<cusparseStatus_t>(0); return static_cast<hipsparseStatus_t>(0);
} }
}; };
#endif // BF16_ENABLED #endif // BF16_ENABLED
...@@ -91,36 +92,36 @@ struct CSRGEMM<__nv_bfloat16> { ...@@ -91,36 +92,36 @@ struct CSRGEMM<__nv_bfloat16> {
template <> template <>
struct CSRGEMM<float> { struct CSRGEMM<float> {
template <typename... Args> template <typename... Args>
static inline cusparseStatus_t bufferSizeExt(Args... args) { static inline hipsparseStatus_t bufferSizeExt(Args... args) {
return cusparseScsrgemm2_bufferSizeExt(args...); return hipsparseScsrgemm2_bufferSizeExt(args...);
} }
template <typename... Args> template <typename... Args>
static inline cusparseStatus_t nnz(Args... args) { static inline hipsparseStatus_t nnz(Args... args) {
return cusparseXcsrgemm2Nnz(args...); return hipsparseXcsrgemm2Nnz(args...);
} }
template <typename... Args> template <typename... Args>
static inline cusparseStatus_t compute(Args... args) { static inline hipsparseStatus_t compute(Args... args) {
return cusparseScsrgemm2(args...); return hipsparseScsrgemm2(args...);
} }
}; };
template <> template <>
struct CSRGEMM<double> { struct CSRGEMM<double> {
template <typename... Args> template <typename... Args>
static inline cusparseStatus_t bufferSizeExt(Args... args) { static inline hipsparseStatus_t bufferSizeExt(Args... args) {
return cusparseDcsrgemm2_bufferSizeExt(args...); return hipsparseDcsrgemm2_bufferSizeExt(args...);
} }
template <typename... Args> template <typename... Args>
static inline cusparseStatus_t nnz(Args... args) { static inline hipsparseStatus_t nnz(Args... args) {
return cusparseXcsrgemm2Nnz(args...); return hipsparseXcsrgemm2Nnz(args...);
} }
template <typename... Args> template <typename... Args>
static inline cusparseStatus_t compute(Args... args) { static inline hipsparseStatus_t compute(Args... args) {
return cusparseDcsrgemm2(args...); return hipsparseDcsrgemm2(args...);
} }
}; };
...@@ -128,70 +129,70 @@ struct CSRGEMM<double> { ...@@ -128,70 +129,70 @@ struct CSRGEMM<double> {
template <typename DType> template <typename DType>
struct CSRGEAM { struct CSRGEAM {
template <typename... Args> template <typename... Args>
static inline cusparseStatus_t bufferSizeExt(Args... args) { static inline hipsparseStatus_t bufferSizeExt(Args... args) {
BUG_IF_FAIL(false) << "This piece of code should not be reached."; BUG_IF_FAIL(false) << "This piece of code should not be reached.";
return static_cast<cusparseStatus_t>(0); return static_cast<hipsparseStatus_t>(0);
} }
template <typename... Args> template <typename... Args>
static inline cusparseStatus_t nnz(Args... args) { static inline hipsparseStatus_t nnz(Args... args) {
return cusparseXcsrgeam2Nnz(args...); return hipsparseXcsrgeam2Nnz(args...);
} }
template <typename... Args> template <typename... Args>
static inline cusparseStatus_t compute(Args... args) { static inline hipsparseStatus_t compute(Args... args) {
BUG_IF_FAIL(false) << "This piece of code should not be reached."; BUG_IF_FAIL(false) << "This piece of code should not be reached.";
return static_cast<cusparseStatus_t>(0); return static_cast<hipsparseStatus_t>(0);
} }
}; };
template <> template <>
struct CSRGEAM<__half> { struct CSRGEAM<__half> {
template <typename... Args> template <typename... Args>
static inline cusparseStatus_t bufferSizeExt(Args... args) { static inline hipsparseStatus_t bufferSizeExt(Args... args) {
// TODO(ndickson): There is no cusparseHcsrgeam2_bufferSizeExt, so a // TODO(ndickson): There is no cusparseHcsrgeam2_bufferSizeExt, so a
// different implementation would be required. // different implementation would be required.
LOG(FATAL) << "CSRGEAM::bufferSizeExt does not support dtype half (FP16)."; LOG(FATAL) << "CSRGEAM::bufferSizeExt does not support dtype half (FP16).";
return static_cast<cusparseStatus_t>(0); return static_cast<hipsparseStatus_t>(0);
} }
template <typename... Args> template <typename... Args>
static inline cusparseStatus_t nnz(Args... args) { static inline hipsparseStatus_t nnz(Args... args) {
return cusparseXcsrgeam2Nnz(args...); return hipsparseXcsrgeam2Nnz(args...);
} }
template <typename... Args> template <typename... Args>
static inline cusparseStatus_t compute(Args... args) { static inline hipsparseStatus_t compute(Args... args) {
// TODO(ndickson): There is no cusparseHcsrgeam2, so a different // TODO(ndickson): There is no cusparseHcsrgeam2, so a different
// implementation would be required. // implementation would be required.
LOG(FATAL) << "CSRGEAM::compute does not support dtype half (FP16)."; LOG(FATAL) << "CSRGEAM::compute does not support dtype half (FP16).";
return static_cast<cusparseStatus_t>(0); return static_cast<hipsparseStatus_t>(0);
} }
}; };
#if BF16_ENABLED #if BF16_ENABLED
template <> template <>
struct CSRGEAM<__nv_bfloat16> { struct CSRGEAM<__hip_bfloat16> {
template <typename... Args> template <typename... Args>
static inline cusparseStatus_t bufferSizeExt(Args... args) { static inline hipsparseStatus_t bufferSizeExt(Args... args) {
// TODO(ndickson): There is no cusparseHcsrgeam2_bufferSizeExt, so a // TODO(ndickson): There is no cusparseHcsrgeam2_bufferSizeExt, so a
// different implementation would be required. // different implementation would be required.
LOG(FATAL) LOG(FATAL)
<< "CSRGEAM::bufferSizeExt does not support dtype bfloat16 (BF16)."; << "CSRGEAM::bufferSizeExt does not support dtype bfloat16 (BF16).";
return static_cast<cusparseStatus_t>(0); return static_cast<hipsparseStatus_t>(0);
} }
template <typename... Args> template <typename... Args>
static inline cusparseStatus_t nnz(Args... args) { static inline hipsparseStatus_t nnz(Args... args) {
return cusparseXcsrgeam2Nnz(args...); return hipsparseXcsrgeam2Nnz(args...);
} }
template <typename... Args> template <typename... Args>
static inline cusparseStatus_t compute(Args... args) { static inline hipsparseStatus_t compute(Args... args) {
// TODO(ndickson): There is no cusparseHcsrgeam2, so a different // TODO(ndickson): There is no cusparseHcsrgeam2, so a different
// implementation would be required. // implementation would be required.
LOG(FATAL) << "CSRGEAM::compute does not support dtype bfloat16 (BF16)."; LOG(FATAL) << "CSRGEAM::compute does not support dtype bfloat16 (BF16).";
return static_cast<cusparseStatus_t>(0); return static_cast<hipsparseStatus_t>(0);
} }
}; };
#endif // BF16_ENABLED #endif // BF16_ENABLED
...@@ -199,36 +200,36 @@ struct CSRGEAM<__nv_bfloat16> { ...@@ -199,36 +200,36 @@ struct CSRGEAM<__nv_bfloat16> {
template <> template <>
struct CSRGEAM<float> { struct CSRGEAM<float> {
template <typename... Args> template <typename... Args>
static inline cusparseStatus_t bufferSizeExt(Args... args) { static inline hipsparseStatus_t bufferSizeExt(Args... args) {
return cusparseScsrgeam2_bufferSizeExt(args...); return hipsparseScsrgeam2_bufferSizeExt(args...);
} }
template <typename... Args> template <typename... Args>
static inline cusparseStatus_t nnz(Args... args) { static inline hipsparseStatus_t nnz(Args... args) {
return cusparseXcsrgeam2Nnz(args...); return hipsparseXcsrgeam2Nnz(args...);
} }
template <typename... Args> template <typename... Args>
static inline cusparseStatus_t compute(Args... args) { static inline hipsparseStatus_t compute(Args... args) {
return cusparseScsrgeam2(args...); return hipsparseScsrgeam2(args...);
} }
}; };
template <> template <>
struct CSRGEAM<double> { struct CSRGEAM<double> {
template <typename... Args> template <typename... Args>
static inline cusparseStatus_t bufferSizeExt(Args... args) { static inline hipsparseStatus_t bufferSizeExt(Args... args) {
return cusparseDcsrgeam2_bufferSizeExt(args...); return hipsparseDcsrgeam2_bufferSizeExt(args...);
} }
template <typename... Args> template <typename... Args>
static inline cusparseStatus_t nnz(Args... args) { static inline hipsparseStatus_t nnz(Args... args) {
return cusparseXcsrgeam2Nnz(args...); return hipsparseXcsrgeam2Nnz(args...);
} }
template <typename... Args> template <typename... Args>
static inline cusparseStatus_t compute(Args... args) { static inline hipsparseStatus_t compute(Args... args) {
return cusparseDcsrgeam2(args...); return hipsparseDcsrgeam2(args...);
} }
}; };
......
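// --- Editor's sketch (not part of this commit) ---------------------------
// The CSRGEMM/CSRGEAM structs above are a small compile-time dispatch idiom:
// the DType template argument selects the typed hipSPARSE entry point, and
// the variadic pack forwards whatever arguments that entry point takes.
// Stripped-down illustration with hypothetical backends fooS/fooD (not real
// hipSPARSE functions):
#include <cstdio>

static inline int fooS(float x) { return std::printf("float  %f\n", x); }
static inline int fooD(double x) { return std::printf("double %f\n", x); }

template <typename DType>
struct FooDispatch {
  template <typename... Args>
  static inline int compute(Args...) {
    return -1;  // primary template: reached only for unsupported dtypes
  }
};

template <>
struct FooDispatch<float> {
  template <typename... Args>
  static inline int compute(Args... args) { return fooS(args...); }
};

template <>
struct FooDispatch<double> {
  template <typename... Args>
  static inline int compute(Args... args) { return fooD(args...); }
};

// FooDispatch<DType>::compute(x) resolves at compile time to fooS or fooD,
// exactly how CSRGEAM<DType>::compute above picks hipsparseScsrgeam2 or
// hipsparseDcsrgeam2.
// --------------------------------------------------------------------------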
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/** /**
* Copyright (c) 2022, NVIDIA CORPORATION. * Copyright (c) 2022, NVIDIA CORPORATION.
* *
...@@ -24,7 +26,7 @@ ...@@ -24,7 +26,7 @@
#include <vector> #include <vector>
#include "../../runtime/cuda/cuda_common.h" #include "../../runtime/cuda/cuda_common.h"
#include "./utils.h" #include "utils.h"
namespace dgl { namespace dgl {
using runtime::NDArray; using runtime::NDArray;
...@@ -78,7 +80,7 @@ std::tuple<IdArray, IdArray, IdArray> _ComputePrefixSums( ...@@ -78,7 +80,7 @@ std::tuple<IdArray, IdArray, IdArray> _ComputePrefixSums(
template <DGLDeviceType XPU, typename IdType> template <DGLDeviceType XPU, typename IdType>
void _Merge( void _Merge(
IdType** arrs, IdType* prefix, IdType* offset, IdType* out, int64_t n_arrs, IdType** arrs, IdType* prefix, IdType* offset, IdType* out, int64_t n_arrs,
int n_elms, DGLContext ctx, DGLDataType dtype, cudaStream_t stream) { int n_elms, DGLContext ctx, DGLDataType dtype, hipStream_t stream) {
auto device = runtime::DeviceAPI::Get(ctx); auto device = runtime::DeviceAPI::Get(ctx);
int nt = 256; int nt = 256;
int nb = (n_elms + nt - 1) / nt; int nb = (n_elms + nt - 1) / nt;
...@@ -99,7 +101,7 @@ void _Merge( ...@@ -99,7 +101,7 @@ void _Merge(
template <DGLDeviceType XPU, typename IdType> template <DGLDeviceType XPU, typename IdType>
COOMatrix DisjointUnionCoo(const std::vector<COOMatrix>& coos) { COOMatrix DisjointUnionCoo(const std::vector<COOMatrix>& coos) {
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
auto device = runtime::DeviceAPI::Get(coos[0].row->ctx); auto device = runtime::DeviceAPI::Get(coos[0].row->ctx);
uint64_t src_offset = 0, dst_offset = 0; uint64_t src_offset = 0, dst_offset = 0;
bool has_data = false; bool has_data = false;
......
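// --- Editor's sketch (not part of this commit) ---------------------------
// Host-side reference for the DisjointUnionCoo logic above: each input COO
// block keeps its edges, but its row/col ids are shifted by the number of
// source/destination nodes accumulated so far, and everything is
// concatenated into one larger COO matrix.  The GPU version does the same
// with prefix sums plus the _Merge kernel.
#include <cstdint>
#include <vector>

struct CooRef {
  int64_t num_rows = 0, num_cols = 0;
  std::vector<int64_t> row, col;
};

CooRef DisjointUnionCooReference(const std::vector<CooRef>& coos) {
  CooRef out;
  int64_t row_offset = 0, col_offset = 0;
  for (const CooRef& c : coos) {
    for (size_t i = 0; i < c.row.size(); ++i) {
      out.row.push_back(c.row[i] + row_offset);
      out.col.push_back(c.col[i] + col_offset);
    }
    row_offset += c.num_rows;
    col_offset += c.num_cols;
  }
  out.num_rows = row_offset;
  out.num_cols = col_offset;
  return out;
}
// --------------------------------------------------------------------------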
// !!! This is a file automatically generated by hipify!!!
/** /**
* Copyright (c) 2020-2022 by Contributors * Copyright (c) 2020-2022 by Contributors
* *
...@@ -21,12 +22,12 @@ ...@@ -21,12 +22,12 @@
#ifndef DGL_ARRAY_CUDA_FP16_CUH_ #ifndef DGL_ARRAY_CUDA_FP16_CUH_
#define DGL_ARRAY_CUDA_FP16_CUH_ #define DGL_ARRAY_CUDA_FP16_CUH_
#include <cuda_fp16.h> #include <hip/hip_fp16.h>
#include <algorithm> #include <algorithm>
static __device__ __forceinline__ half max(half a, half b) { static __device__ __forceinline__ half max(half a, half b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if defined(__HIP_DEVICE_COMPILE__)
return __hgt(__half(a), __half(b)) ? a : b; return __hgt(__half(a), __half(b)) ? a : b;
#else #else
return __half(max(float(a), float(b))); // NOLINT return __half(max(float(a), float(b))); // NOLINT
...@@ -34,19 +35,19 @@ static __device__ __forceinline__ half max(half a, half b) { ...@@ -34,19 +35,19 @@ static __device__ __forceinline__ half max(half a, half b) {
} }
static __device__ __forceinline__ half min(half a, half b) { static __device__ __forceinline__ half min(half a, half b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if defined(__HIP_DEVICE_COMPILE__)
return __hlt(__half(a), __half(b)) ? a : b; return __hlt(__half(a), __half(b)) ? a : b;
#else #else
return __half(min(float(a), float(b))); // NOLINT return __half(min(float(a), float(b))); // NOLINT
#endif #endif
} }
#if 0
#ifdef __CUDACC__ #ifdef __HIPCC__
// Arithmetic FP16 operations for architecture >= 5.3 are already defined in // Arithmetic FP16 operations for architecture >= 5.3 are already defined in
// cuda_fp16.h // hip/hip_fp16.h
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 530) #if defined(__HIP_DEVICE_COMPILE__)
// CUDA 12.2 adds "emulated" support for older architectures. // CUDA 12.2 adds "emulated" support for older architectures.
#if defined(CUDART_VERSION) && (CUDART_VERSION < 12020) #if defined(DTKRT_VERSION) && (DTKRT_VERSION < 12020)
__device__ __forceinline__ __half __device__ __forceinline__ __half
operator+(const __half& lh, const __half& rh) { operator+(const __half& lh, const __half& rh) {
return __half(float(lh) + float(rh)); // NOLINT return __half(float(lh) + float(rh)); // NOLINT
...@@ -127,8 +128,8 @@ __device__ __forceinline__ bool operator>=(const __half& lh, const __half& rh) { ...@@ -127,8 +128,8 @@ __device__ __forceinline__ bool operator>=(const __half& lh, const __half& rh) {
__device__ __forceinline__ bool operator<=(const __half& lh, const __half& rh) { __device__ __forceinline__ bool operator<=(const __half& lh, const __half& rh) {
return float(lh) <= float(rh); // NOLINT return float(lh) <= float(rh); // NOLINT
} }
#endif // defined(CUDART_VERSION) && (CUDART_VERSION < 12020) #endif // defined(DTKRT_VERSION) && (DTKRT_VERSION < 12020)
#endif // defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 530) #endif // defined(__HIP_DEVICE_COMPILE__)
#endif // __CUDACC__ #endif // __HIPCC__
#endif
#endif // DGL_ARRAY_CUDA_FP16_CUH_ #endif // DGL_ARRAY_CUDA_FP16_CUH_
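// --- Editor's sketch (not part of this commit) ---------------------------
// The max/min helpers above keep half usable on targets without native FP16
// comparisons by round-tripping through float.  The same fallback applies to
// any other missing scalar op; a hypothetical habs_fallback helper (not part
// of the header) would look like this:
#include <hip/hip_fp16.h>

static __device__ __forceinline__ half habs_fallback(half a) {
  // Convert to float, take the absolute value, convert back; |x| is exactly
  // representable in half whenever x is, so no extra rounding error is added.
  const float f = __half2float(a);
  return __float2half(f < 0.f ? -f : f);
}
// --------------------------------------------------------------------------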
// !!! This is a file automatically generated by hipify!!!
/** /**
* Copyright (c) 2020 by Contributors * Copyright (c) 2020 by Contributors
* @file array/cuda/functor.cuh * @file array/cuda/functor.cuh
...@@ -9,8 +10,8 @@ ...@@ -9,8 +10,8 @@
#include <cmath> #include <cmath>
#include <limits> #include <limits>
#include "./atomic.cuh" #include "atomic.cuh"
#include "./fp16.cuh" #include "fp16.cuh"
#include "bf16.cuh" #include "bf16.cuh"
namespace dgl { namespace dgl {
...@@ -208,29 +209,29 @@ struct Sum<Idx, __half, atomic> : _Sum<Idx, __half, atomic> { ...@@ -208,29 +209,29 @@ struct Sum<Idx, __half, atomic> : _Sum<Idx, __half, atomic> {
#if BF16_ENABLED #if BF16_ENABLED
template <typename Idx, bool atomic> template <typename Idx, bool atomic>
struct Sum<Idx, __nv_bfloat16, atomic> : _Sum<Idx, __nv_bfloat16, atomic> { struct Sum<Idx, __hip_bfloat16, atomic> : _Sum<Idx, __hip_bfloat16, atomic> {
static constexpr __host__ __device__ __forceinline__ __nv_bfloat16 zero() { static constexpr __host__ __device__ __forceinline__ __hip_bfloat16 zero() {
return __float2bfloat16_rn(0.); return __float2bfloat16(0.);
} }
static __device__ __forceinline__ void Call( static __device__ __forceinline__ void Call(
__nv_bfloat16 *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, __hip_bfloat16 *out_buf, Idx *arg_u_buf, Idx *arg_e_buf,
__nv_bfloat16 val, Idx uid, Idx eid) { __hip_bfloat16 val, Idx uid, Idx eid) {
_Sum<Idx, __nv_bfloat16, atomic>::Call( _Sum<Idx, __hip_bfloat16, atomic>::Call(
out_buf, arg_u_buf, arg_e_buf, val, uid, eid); out_buf, arg_u_buf, arg_e_buf, val, uid, eid);
} }
static __device__ __forceinline__ void Call( static __device__ __forceinline__ void Call(
__nv_bfloat16 *out_buf, Idx *arg_buf, __nv_bfloat16 val, Idx id) { __hip_bfloat16 *out_buf, Idx *arg_buf, __hip_bfloat16 val, Idx id) {
_Sum<Idx, __nv_bfloat16, atomic>::Call(out_buf, arg_buf, val, id); _Sum<Idx, __hip_bfloat16, atomic>::Call(out_buf, arg_buf, val, id);
} }
// sometimes we have to use float in reduction for better precision // sometimes we have to use float in reduction for better precision
static __device__ __forceinline__ void Call( static __device__ __forceinline__ void Call(
float *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, float *out_buf, Idx *arg_u_buf, Idx *arg_e_buf,
__nv_bfloat16 val, Idx uid, Idx eid) { __hip_bfloat16 val, Idx uid, Idx eid) {
_Sum<Idx, float, atomic>::Call(out_buf, arg_u_buf, arg_e_buf, _Sum<Idx, float, atomic>::Call(out_buf, arg_u_buf, arg_e_buf,
static_cast<float>(val), uid, eid); static_cast<float>(val), uid, eid);
} }
static __device__ __forceinline__ void Call( static __device__ __forceinline__ void Call(
float *out_buf, Idx *arg_buf, __nv_bfloat16 val, Idx id) { float *out_buf, Idx *arg_buf, __hip_bfloat16 val, Idx id) {
_Sum<Idx, float, atomic>::Call(out_buf, arg_buf, _Sum<Idx, float, atomic>::Call(out_buf, arg_buf,
static_cast<float>(val), id); static_cast<float>(val), id);
} }
...@@ -313,29 +314,29 @@ struct Max<Idx, __half, atomic> : _Max<Idx, __half, atomic> { ...@@ -313,29 +314,29 @@ struct Max<Idx, __half, atomic> : _Max<Idx, __half, atomic> {
#if BF16_ENABLED #if BF16_ENABLED
template <typename Idx, bool atomic> template <typename Idx, bool atomic>
struct Max<Idx, __nv_bfloat16, atomic> : _Max<Idx, __nv_bfloat16, atomic> { struct Max<Idx, __hip_bfloat16, atomic> : _Max<Idx, __hip_bfloat16, atomic> {
static constexpr __host__ __device__ __forceinline__ __nv_bfloat16 zero() { static constexpr __host__ __device__ __forceinline__ __hip_bfloat16 zero() {
return __float2bfloat16_rn(-std::numeric_limits<float>::infinity()); return __float2bfloat16(-std::numeric_limits<float>::infinity());
} }
static __device__ __forceinline__ void Call( static __device__ __forceinline__ void Call(
__nv_bfloat16 *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, __hip_bfloat16 *out_buf, Idx *arg_u_buf, Idx *arg_e_buf,
__nv_bfloat16 val, Idx uid, Idx eid) { __hip_bfloat16 val, Idx uid, Idx eid) {
_Max<Idx, __nv_bfloat16, atomic>::Call( _Max<Idx, __hip_bfloat16, atomic>::Call(
out_buf, arg_u_buf, arg_e_buf, val, uid, eid); out_buf, arg_u_buf, arg_e_buf, val, uid, eid);
} }
static __device__ __forceinline__ void Call( static __device__ __forceinline__ void Call(
__nv_bfloat16 *out_buf, Idx *arg_buf, __nv_bfloat16 val, Idx id) { __hip_bfloat16 *out_buf, Idx *arg_buf, __hip_bfloat16 val, Idx id) {
_Max<Idx, __nv_bfloat16, atomic>::Call(out_buf, arg_buf, val, id); _Max<Idx, __hip_bfloat16, atomic>::Call(out_buf, arg_buf, val, id);
} }
// sometimes we have to use float in reduction for better precision // sometimes we have to use float in reduction for better precision
static __device__ __forceinline__ void Call( static __device__ __forceinline__ void Call(
float *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, float *out_buf, Idx *arg_u_buf, Idx *arg_e_buf,
__nv_bfloat16 val, Idx uid, Idx eid) { __hip_bfloat16 val, Idx uid, Idx eid) {
_Max<Idx, float, atomic>::Call(out_buf, arg_u_buf, arg_e_buf, _Max<Idx, float, atomic>::Call(out_buf, arg_u_buf, arg_e_buf,
static_cast<float>(val), uid, eid); static_cast<float>(val), uid, eid);
} }
static __device__ __forceinline__ void Call( static __device__ __forceinline__ void Call(
float *out_buf, Idx *arg_buf, __nv_bfloat16 val, Idx id) { float *out_buf, Idx *arg_buf, __hip_bfloat16 val, Idx id) {
_Max<Idx, float, atomic>::Call(out_buf, arg_buf, _Max<Idx, float, atomic>::Call(out_buf, arg_buf,
static_cast<float>(val), id); static_cast<float>(val), id);
} }
...@@ -418,29 +419,29 @@ struct Min<Idx, __half, atomic> : _Min<Idx, __half, atomic> { ...@@ -418,29 +419,29 @@ struct Min<Idx, __half, atomic> : _Min<Idx, __half, atomic> {
#if BF16_ENABLED #if BF16_ENABLED
template <typename Idx, bool atomic> template <typename Idx, bool atomic>
struct Min<Idx, __nv_bfloat16, atomic> : _Min<Idx, __nv_bfloat16, atomic> { struct Min<Idx, __hip_bfloat16, atomic> : _Min<Idx, __hip_bfloat16, atomic> {
static constexpr __host__ __device__ __forceinline__ __nv_bfloat16 zero() { static constexpr __host__ __device__ __forceinline__ __hip_bfloat16 zero() {
return __float2bfloat16_rn(std::numeric_limits<float>::infinity()); return __float2bfloat16(std::numeric_limits<float>::infinity());
} }
static __device__ __forceinline__ void Call( static __device__ __forceinline__ void Call(
__nv_bfloat16 *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, __hip_bfloat16 *out_buf, Idx *arg_u_buf, Idx *arg_e_buf,
__nv_bfloat16 val, Idx uid, Idx eid) { __hip_bfloat16 val, Idx uid, Idx eid) {
_Min<Idx, __nv_bfloat16, atomic>::Call( _Min<Idx, __hip_bfloat16, atomic>::Call(
out_buf, arg_u_buf, arg_e_buf, val, uid, eid); out_buf, arg_u_buf, arg_e_buf, val, uid, eid);
} }
static __device__ __forceinline__ void Call( static __device__ __forceinline__ void Call(
__nv_bfloat16 *out_buf, Idx *arg_buf, __nv_bfloat16 val, Idx id) { __hip_bfloat16 *out_buf, Idx *arg_buf, __hip_bfloat16 val, Idx id) {
_Min<Idx, __nv_bfloat16, atomic>::Call(out_buf, arg_buf, val, id); _Min<Idx, __hip_bfloat16, atomic>::Call(out_buf, arg_buf, val, id);
} }
// sometimes we have to use float in reduction for better precision // sometimes we have to use float in reduction for better precision
static __device__ __forceinline__ void Call( static __device__ __forceinline__ void Call(
float *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, float *out_buf, Idx *arg_u_buf, Idx *arg_e_buf,
__nv_bfloat16 val, Idx uid, Idx eid) { __hip_bfloat16 val, Idx uid, Idx eid) {
_Min<Idx, float, atomic>::Call(out_buf, arg_u_buf, arg_e_buf, _Min<Idx, float, atomic>::Call(out_buf, arg_u_buf, arg_e_buf,
static_cast<float>(val), uid, eid); static_cast<float>(val), uid, eid);
} }
static __device__ __forceinline__ void Call( static __device__ __forceinline__ void Call(
float *out_buf, Idx *arg_buf, __nv_bfloat16 val, Idx id) { float *out_buf, Idx *arg_buf, __hip_bfloat16 val, Idx id) {
_Min<Idx, float, atomic>::Call(out_buf, arg_buf, _Min<Idx, float, atomic>::Call(out_buf, arg_buf,
static_cast<float>(val), id); static_cast<float>(val), id);
} }
......
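// --- Editor's sketch (not part of this commit) ---------------------------
// The __hip_bfloat16 reducers above accept a float output buffer because
// bf16 keeps only 8 mantissa bits, so accumulating many small terms directly
// in bf16 loses precision.  Minimal illustration of the accumulate-in-float,
// round-once idea (assumes <hip/hip_bf16.h> for the conversion intrinsics):
#include <hip/hip_bf16.h>

__device__ __forceinline__ __hip_bfloat16 SumBf16Sketch(
    const __hip_bfloat16* vals, int n) {
  float acc = 0.f;  // wide accumulator
  for (int i = 0; i < n; ++i) acc += __bfloat162float(vals[i]);
  return __float2bfloat16(acc);  // round to bf16 only once, at the end
}
// --------------------------------------------------------------------------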
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/** /**
* Copyright (c) 2020 by Contributors * Copyright (c) 2020 by Contributors
* @file array/cuda/gather_mm.cu * @file array/cuda/gather_mm.cu
...@@ -7,9 +9,9 @@ ...@@ -7,9 +9,9 @@
#include <algorithm> // std::swap #include <algorithm> // std::swap
#include "./atomic.cuh" #include "atomic.cuh"
#include "./functor.cuh" #include "functor.cuh"
#include "./utils.h" #include "utils.h"
namespace dgl { namespace dgl {
using namespace cuda; using namespace cuda;
...@@ -20,54 +22,63 @@ namespace { ...@@ -20,54 +22,63 @@ namespace {
/** @brief Call cuBLAS GEMM API for dense matmul operation for float and double. /** @brief Call cuBLAS GEMM API for dense matmul operation for float and double.
*/ */
template <typename DType> template <typename DType>
cublasStatus_t cublasGemm( hipblasStatus_t cublasGemm(
cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
int m, int n, int k, const DType* alpha, const DType* A, int lda, int m, int n, int k, const DType* alpha, const DType* A, int lda,
const DType* B, int ldb, const DType* beta, DType* C, int ldc) { const DType* B, int ldb, const DType* beta, DType* C, int ldc) {
LOG(INFO) << "Not supported dtype"; LOG(INFO) << "Not supported dtype";
return CUBLAS_STATUS_EXECUTION_FAILED; return HIPBLAS_STATUS_EXECUTION_FAILED;
} }
template <> template <>
cublasStatus_t cublasGemm<__half>( hipblasStatus_t cublasGemm<__half>(
cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
int m, int n, int k, const __half* alpha, const __half* A, int lda, int m, int n, int k, const __half* alpha, const __half* A, int lda,
const __half* B, int ldb, const __half* beta, __half* C, int ldc) { const __half* B, int ldb, const __half* beta, __half* C, int ldc) {
return cublasHgemm( return hipblasHgemm(
handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); handle, transa, transb, m, n, k, (hipblasHalf*)alpha, (hipblasHalf*)A, lda, (hipblasHalf*)B, ldb, (hipblasHalf*)beta, (hipblasHalf*)C, ldc);
} }
// template <>
// hipblasStatus_t cublasGemm<__half>(
// hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
// int m, int n, int k, const __half* alpha, const __half* A, int lda,
// const __half* B, int ldb, const __half* beta, __half* C, int ldc) {
// return hipblasHgemm(
// handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
// }
#if BF16_ENABLED #if BF16_ENABLED
template <> template <>
cublasStatus_t cublasGemm<__nv_bfloat16>( hipblasStatus_t cublasGemm<__hip_bfloat16>(
cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
int m, int n, int k, const __nv_bfloat16* alpha, const __nv_bfloat16* A, int m, int n, int k, const __hip_bfloat16* alpha, const __hip_bfloat16* A,
int lda, const __nv_bfloat16* B, int ldb, const __nv_bfloat16* beta, int lda, const __hip_bfloat16* B, int ldb, const __hip_bfloat16* beta,
__nv_bfloat16* C, int ldc) { __hip_bfloat16* C, int ldc) {
float alpha_float = __bfloat162float(*alpha); float alpha_float = __bfloat162float(*alpha);
float beta_float = __bfloat162float(*beta); float beta_float = __bfloat162float(*beta);
return cublasGemmEx( return hipblasGemmEx(
handle, transa, transb, m, n, k, &alpha_float, A, CUDA_R_16BF, lda, B, handle, transa, transb, m, n, k, &alpha_float, A, HIPBLAS_R_16B, lda, B,
CUDA_R_16BF, ldb, &beta_float, C, CUDA_R_16BF, ldc, CUBLAS_COMPUTE_32F, HIPBLAS_R_16B, ldb, &beta_float, C, HIPBLAS_R_16B, ldc, HIPBLAS_R_32F,
CUBLAS_GEMM_DEFAULT_TENSOR_OP); HIPBLAS_GEMM_DEFAULT);
} }
#endif // BF16_ENABLED #endif // BF16_ENABLED
template <> template <>
cublasStatus_t cublasGemm<float>( hipblasStatus_t cublasGemm<float>(
cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
int m, int n, int k, const float* alpha, const float* A, int lda, int m, int n, int k, const float* alpha, const float* A, int lda,
const float* B, int ldb, const float* beta, float* C, int ldc) { const float* B, int ldb, const float* beta, float* C, int ldc) {
return cublasSgemm( return hipblasSgemm(
handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
} }
template <> template <>
cublasStatus_t cublasGemm<double>( hipblasStatus_t cublasGemm<double>(
cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
int m, int n, int k, const double* alpha, const double* A, int lda, int m, int n, int k, const double* alpha, const double* A, int lda,
const double* B, int ldb, const double* beta, double* C, int ldc) { const double* B, int ldb, const double* beta, double* C, int ldc) {
return cublasDgemm( return hipblasDgemm(
handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
} }
...@@ -108,7 +119,7 @@ __global__ void GatherMMScatterKernel( ...@@ -108,7 +119,7 @@ __global__ void GatherMMScatterKernel(
// Load A in shared mem in a coalesced way // Load A in shared mem in a coalesced way
for (unsigned int l = laneId; l < a_tile; l += 32) for (unsigned int l = laneId; l < a_tile; l += 32)
sh_A[local_row * sh_a_tile + l] = A[cur_rowA * in_len + (k_start + l)]; sh_A[local_row * sh_a_tile + l] = A[cur_rowA * in_len + (k_start + l)];
__syncwarp(); // __syncwarp();
for (unsigned int outloop = 0; outloop < out_len; outloop += 32) { for (unsigned int outloop = 0; outloop < out_len; outloop += 32) {
DType out_reg = static_cast<DType>(0.0f); // thread private DType out_reg = static_cast<DType>(0.0f); // thread private
...@@ -165,7 +176,7 @@ __global__ void GatherMMScatterKernel2( ...@@ -165,7 +176,7 @@ __global__ void GatherMMScatterKernel2(
/* Load A in shared mem in a coalesced way */ /* Load A in shared mem in a coalesced way */
for (unsigned int l = laneId; l < a_tile; l += 32) for (unsigned int l = laneId; l < a_tile; l += 32)
sh_A[local_row * sh_a_tile + l] = A[row_a * in_len + (k_start + l)]; sh_A[local_row * sh_a_tile + l] = A[row_a * in_len + (k_start + l)];
__syncwarp(); // __syncwarp();
for (unsigned int outloop = 0; outloop < out_len; outloop += 32) { for (unsigned int outloop = 0; outloop < out_len; outloop += 32) {
DType out_reg = static_cast<DType>(0.0f); // thread private DType out_reg = static_cast<DType>(0.0f); // thread private
...@@ -203,7 +214,7 @@ void SegmentMM( ...@@ -203,7 +214,7 @@ void SegmentMM(
const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A, const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A,
bool a_trans, bool b_trans) { bool a_trans, bool b_trans) {
auto device = runtime::DeviceAPI::Get(A->ctx); auto device = runtime::DeviceAPI::Get(A->ctx);
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const DType* A_data = A.Ptr<DType>(); const DType* A_data = A.Ptr<DType>();
const DType* B_data = B.Ptr<DType>(); const DType* B_data = B.Ptr<DType>();
const IdType* seglen_A_data = seglen_A.Ptr<IdType>(); const IdType* seglen_A_data = seglen_A.Ptr<IdType>();
...@@ -215,8 +226,8 @@ void SegmentMM( ...@@ -215,8 +226,8 @@ void SegmentMM(
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
if (!thr_entry->cublas_handle) if (!thr_entry->cublas_handle)
CUBLAS_CALL(cublasCreate(&(thr_entry->cublas_handle))); CUBLAS_CALL(hipblasCreate(&(thr_entry->cublas_handle)));
CUBLAS_CALL(cublasSetStream(thr_entry->cublas_handle, stream)); CUBLAS_CALL(hipblasSetStream(thr_entry->cublas_handle, stream));
IdType m_offset = 0; IdType m_offset = 0;
for (IdType etype = 0; etype < num_rel; ++etype) { for (IdType etype = 0; etype < num_rel; ++etype) {
...@@ -226,10 +237,10 @@ void SegmentMM( ...@@ -226,10 +237,10 @@ void SegmentMM(
n = B->shape[2]; // cols of B n = B->shape[2]; // cols of B
k = B->shape[1]; // cols of A == rows of B k = B->shape[1]; // cols of A == rows of B
int ldb = n, lda = k, ldc = n; int ldb = n, lda = k, ldc = n;
cublasOperation_t transB = CUBLAS_OP_N; hipblasOperation_t transB = HIPBLAS_OP_N;
cublasOperation_t transA = CUBLAS_OP_N; hipblasOperation_t transA = HIPBLAS_OP_N;
if (b_trans) { if (b_trans) {
transB = CUBLAS_OP_T; transB = HIPBLAS_OP_T;
ldb = n, lda = n, ldc = k; ldb = n, lda = n, ldc = k;
std::swap(n, k); std::swap(n, k);
} }
...@@ -248,7 +259,7 @@ template <int XPU, typename IdType, typename DType> ...@@ -248,7 +259,7 @@ template <int XPU, typename IdType, typename DType>
void SegmentMMBackwardB( void SegmentMMBackwardB(
const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen) { const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen) {
auto device = runtime::DeviceAPI::Get(A->ctx); auto device = runtime::DeviceAPI::Get(A->ctx);
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const DType* A_data = A.Ptr<DType>(); const DType* A_data = A.Ptr<DType>();
const DType* dC_data = dC.Ptr<DType>(); const DType* dC_data = dC.Ptr<DType>();
const IdType* seglen_data = seglen.Ptr<IdType>(); const IdType* seglen_data = seglen.Ptr<IdType>();
...@@ -260,8 +271,8 @@ void SegmentMMBackwardB( ...@@ -260,8 +271,8 @@ void SegmentMMBackwardB(
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
if (!thr_entry->cublas_handle) if (!thr_entry->cublas_handle)
CUBLAS_CALL(cublasCreate(&(thr_entry->cublas_handle))); CUBLAS_CALL(hipblasCreate(&(thr_entry->cublas_handle)));
CUBLAS_CALL(cublasSetStream(thr_entry->cublas_handle, stream)); CUBLAS_CALL(hipblasSetStream(thr_entry->cublas_handle, stream));
IdType k_offset = 0; IdType k_offset = 0;
for (IdType etype = 0; etype < num_rel; ++etype) { for (IdType etype = 0; etype < num_rel; ++etype) {
...@@ -271,8 +282,8 @@ void SegmentMMBackwardB( ...@@ -271,8 +282,8 @@ void SegmentMMBackwardB(
CHECK_LE(k_offset + k, A->shape[0]) CHECK_LE(k_offset + k, A->shape[0])
<< "Segement index out of bound of A->shape[0]."; << "Segement index out of bound of A->shape[0].";
int lddC = m, ldA = n, lddB = m; int lddC = m, ldA = n, lddB = m;
cublasOperation_t trans_dC = CUBLAS_OP_N; hipblasOperation_t trans_dC = HIPBLAS_OP_N;
cublasOperation_t trans_A = CUBLAS_OP_T; hipblasOperation_t trans_A = HIPBLAS_OP_T;
CUBLAS_CALL(cublasGemm<DType>( CUBLAS_CALL(cublasGemm<DType>(
thr_entry->cublas_handle, trans_dC, trans_A, m, n, k, &alpha, thr_entry->cublas_handle, trans_dC, trans_A, m, n, k, &alpha,
dC_data + dC_offset, lddC, A_data + A_offset, ldA, &beta, dC_data + dC_offset, lddC, A_data + A_offset, ldA, &beta,
...@@ -299,7 +310,7 @@ void GatherMM( ...@@ -299,7 +310,7 @@ void GatherMM(
const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
const NDArray idx_b) { const NDArray idx_b) {
auto device = runtime::DeviceAPI::Get(A->ctx); auto device = runtime::DeviceAPI::Get(A->ctx);
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int64_t out_len = B->shape[2]; // cols of B int64_t out_len = B->shape[2]; // cols of B
int64_t in_len = A->shape[1]; // cols of A int64_t in_len = A->shape[1]; // cols of A
const int64_t tot_num_rows = A->shape[0]; const int64_t tot_num_rows = A->shape[0];
...@@ -332,7 +343,7 @@ void GatherMMScatter( ...@@ -332,7 +343,7 @@ void GatherMMScatter(
const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
const NDArray idx_b, const NDArray idx_c) { const NDArray idx_b, const NDArray idx_c) {
auto device = runtime::DeviceAPI::Get(A->ctx); auto device = runtime::DeviceAPI::Get(A->ctx);
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const IdType* idx_c_data = idx_c.Ptr<IdType>(); const IdType* idx_c_data = idx_c.Ptr<IdType>();
int64_t out_len = (B->ndim == 2) ? B->shape[1] : B->shape[2]; // cols of B int64_t out_len = (B->ndim == 2) ? B->shape[1] : B->shape[2]; // cols of B
int64_t in_len = A->shape[1]; // cols of A int64_t in_len = A->shape[1]; // cols of A
...@@ -367,10 +378,10 @@ template void GatherMM<kDGLCUDA, int64_t, __half>( ...@@ -367,10 +378,10 @@ template void GatherMM<kDGLCUDA, int64_t, __half>(
const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
const NDArray idx_b); const NDArray idx_b);
#if BF16_ENABLED #if BF16_ENABLED
template void GatherMM<kDGLCUDA, int32_t, __nv_bfloat16>( template void GatherMM<kDGLCUDA, int32_t, __hip_bfloat16>(
const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
const NDArray idx_b); const NDArray idx_b);
template void GatherMM<kDGLCUDA, int64_t, __nv_bfloat16>( template void GatherMM<kDGLCUDA, int64_t, __hip_bfloat16>(
const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
const NDArray idx_b); const NDArray idx_b);
#endif // BF16_ENABLED #endif // BF16_ENABLED
...@@ -394,10 +405,10 @@ template void GatherMMScatter<kDGLCUDA, int64_t, __half>( ...@@ -394,10 +405,10 @@ template void GatherMMScatter<kDGLCUDA, int64_t, __half>(
const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
const NDArray idx_b, const NDArray idx_c); const NDArray idx_b, const NDArray idx_c);
#if BF16_ENABLED #if BF16_ENABLED
template void GatherMMScatter<kDGLCUDA, int32_t, __nv_bfloat16>( template void GatherMMScatter<kDGLCUDA, int32_t, __hip_bfloat16>(
const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
const NDArray idx_b, const NDArray idx_c); const NDArray idx_b, const NDArray idx_c);
template void GatherMMScatter<kDGLCUDA, int64_t, __nv_bfloat16>( template void GatherMMScatter<kDGLCUDA, int64_t, __hip_bfloat16>(
const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
const NDArray idx_b, const NDArray idx_c); const NDArray idx_b, const NDArray idx_c);
#endif // BF16_ENABLED #endif // BF16_ENABLED
...@@ -421,10 +432,10 @@ template void SegmentMM<kDGLCUDA, int64_t, __half>( ...@@ -421,10 +432,10 @@ template void SegmentMM<kDGLCUDA, int64_t, __half>(
const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A, const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A,
bool a_trans, bool b_trans); bool a_trans, bool b_trans);
#if BF16_ENABLED #if BF16_ENABLED
template void SegmentMM<kDGLCUDA, int32_t, __nv_bfloat16>( template void SegmentMM<kDGLCUDA, int32_t, __hip_bfloat16>(
const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A, const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A,
bool a_trans, bool b_trans); bool a_trans, bool b_trans);
template void SegmentMM<kDGLCUDA, int64_t, __nv_bfloat16>( template void SegmentMM<kDGLCUDA, int64_t, __hip_bfloat16>(
const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A, const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A,
bool a_trans, bool b_trans); bool a_trans, bool b_trans);
#endif // BF16_ENABLED #endif // BF16_ENABLED
...@@ -446,9 +457,9 @@ template void SegmentMMBackwardB<kDGLCUDA, int32_t, __half>( ...@@ -446,9 +457,9 @@ template void SegmentMMBackwardB<kDGLCUDA, int32_t, __half>(
template void SegmentMMBackwardB<kDGLCUDA, int64_t, __half>( template void SegmentMMBackwardB<kDGLCUDA, int64_t, __half>(
const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen); const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen);
#if BF16_ENABLED #if BF16_ENABLED
template void SegmentMMBackwardB<kDGLCUDA, int32_t, __nv_bfloat16>( template void SegmentMMBackwardB<kDGLCUDA, int32_t, __hip_bfloat16>(
const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen); const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen);
template void SegmentMMBackwardB<kDGLCUDA, int64_t, __nv_bfloat16>( template void SegmentMMBackwardB<kDGLCUDA, int64_t, __hip_bfloat16>(
const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen); const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen);
#endif // BF16_ENABLED #endif // BF16_ENABLED
template void SegmentMMBackwardB<kDGLCUDA, int32_t, float>( template void SegmentMMBackwardB<kDGLCUDA, int32_t, float>(
......
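For reference alongside the hipified GEMM hunks above: a minimal standalone sketch (not part of this commit) of the hipBLAS call pattern the converted code relies on, i.e. binding the stream to the handle and issuing a column-major GEMM with the same OP_N/OP_T flags. The sizes, buffer names, and use of single precision are illustrative assumptions; the commit itself dispatches through the cublasGemm<DType> wrapper rather than calling hipblasSgemm directly.

#include <hip/hip_runtime.h>
#include <hipblas/hipblas.h>  // older ROCm releases install this header as <hipblas.h>

int main() {
  const int m = 64, n = 32, k = 128;  // illustrative sizes only
  float *d_A, *d_B, *d_C;
  hipMalloc(&d_A, sizeof(float) * m * k);  // used untransposed (OP_N), lda = m
  hipMalloc(&d_B, sizeof(float) * n * k);  // used transposed (OP_T), ldb = n
  hipMalloc(&d_C, sizeof(float) * m * n);  // output, ldc = m

  hipStream_t stream;
  hipStreamCreate(&stream);

  hipblasHandle_t handle;
  hipblasCreate(&handle);
  hipblasSetStream(handle, stream);  // same handle/stream binding as the converted code

  const float alpha = 1.f, beta = 0.f;
  // Column-major GEMM with the OP_N/OP_T flags seen above: C = A * B^T.
  hipblasSgemm(handle, HIPBLAS_OP_N, HIPBLAS_OP_T, m, n, k,
               &alpha, d_A, m, d_B, n, &beta, d_C, m);

  hipStreamSynchronize(stream);
  hipblasDestroy(handle);
  hipStreamDestroy(stream);
  hipFree(d_A); hipFree(d_B); hipFree(d_C);
  return 0;
}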
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/** /**
* Copyright (c) 2020 by Contributors * Copyright (c) 2020 by Contributors
* @file array/cuda/ge_spmm.cuh * @file array/cuda/ge_spmm.cuh
...@@ -7,7 +9,7 @@ ...@@ -7,7 +9,7 @@
#define DGL_ARRAY_CUDA_GE_SPMM_CUH_ #define DGL_ARRAY_CUDA_GE_SPMM_CUH_
#include "../../runtime/cuda/cuda_common.h" #include "../../runtime/cuda/cuda_common.h"
#include "./utils.h" #include "utils.h"
#include "atomic.cuh" #include "atomic.cuh"
#include "macro.cuh" #include "macro.cuh"
...@@ -121,7 +123,7 @@ void GESpMMCsr( ...@@ -121,7 +123,7 @@ void GESpMMCsr(
const DType* efeat_data = efeat.Ptr<DType>(); const DType* efeat_data = efeat.Ptr<DType>();
DType* out_data = out.Ptr<DType>(); DType* out_data = out.Ptr<DType>();
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const int ntx = 32; const int ntx = 32;
const int nty = 32; const int nty = 32;
......
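For the stream and launch-geometry changes in the GESpMMCsr hunk above, here is a minimal standalone HIP launch sketch (not part of this commit) that uses the same 32x32 thread-block shape on an explicit stream. The kernel body is a placeholder dense copy, not the real GE-SpMM kernel, and all sizes are assumed.

#include <hip/hip_runtime.h>

// Placeholder kernel: one feature element per x-lane, one row per y-lane.
__global__ void CopyFeat(const float* in, float* out, int num_rows, int feat_len) {
  const int row = blockIdx.y * blockDim.y + threadIdx.y;
  const int col = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < num_rows && col < feat_len)
    out[row * feat_len + col] = in[row * feat_len + col];
}

int main() {
  const int num_rows = 1024, feat_len = 128;  // assumed sizes
  float *d_in, *d_out;
  hipMalloc(&d_in, sizeof(float) * num_rows * feat_len);
  hipMalloc(&d_out, sizeof(float) * num_rows * feat_len);
  hipMemset(d_in, 0, sizeof(float) * num_rows * feat_len);

  hipStream_t stream;
  hipStreamCreate(&stream);

  const int ntx = 32, nty = 32;  // same block shape as the GESpMMCsr wrapper
  const dim3 nthrs(ntx, nty);
  const dim3 nblks((feat_len + ntx - 1) / ntx, (num_rows + nty - 1) / nty);
  hipLaunchKernelGGL(CopyFeat, nblks, nthrs, 0, stream, d_in, d_out, num_rows, feat_len);

  hipStreamSynchronize(stream);
  hipStreamDestroy(stream);
  hipFree(d_in);
  hipFree(d_out);
  return 0;
}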
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/*! /*!
* Copyright (c) 2022, NVIDIA Corporation * Copyright (c) 2022, NVIDIA Corporation
* Copyright (c) 2022, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek) * Copyright (c) 2022, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
...@@ -34,19 +36,19 @@ ...@@ -34,19 +36,19 @@
#include <thrust/zip_function.h> #include <thrust/zip_function.h>
#include <algorithm> #include <algorithm>
#include <cub/cub.cuh> // NOLINT #include <hipcub/hipcub.hpp> // NOLINT
#include <limits> #include <limits>
#include <numeric> #include <numeric>
#include <type_traits> #include <type_traits>
#include <utility> #include <utility>
#include "../../array/cuda/atomic.cuh" #include "atomic.cuh"
#include "../../array/cuda/utils.h" #include "utils.h"
#include "../../graph/transform/cuda/cuda_map_edges.cuh" #include "../../graph/transform/cuda/cuda_map_edges.cuh"
#include "../../random/continuous_seed.h" #include "../../random/continuous_seed.h"
#include "../../runtime/cuda/cuda_common.h" #include "../../runtime/cuda/cuda_common.h"
#include "./functor.cuh" #include "functor.cuh"
#include "./spmm.cuh" #include "spmm.cuh"
namespace dgl { namespace dgl {
namespace aten { namespace aten {
...@@ -131,7 +133,7 @@ struct StencilOpFused { ...@@ -131,7 +133,7 @@ struct StencilOpFused {
const IdType* indices; const IdType* indices;
const IdType* nids; const IdType* nids;
bool is_pinned; bool is_pinned;
__device__ auto operator()(IdType idx) { __host__ __device__ auto operator()(IdType idx) {
const auto in_row = idx_coo[idx]; const auto in_row = idx_coo[idx];
const auto ps = probs[idx]; const auto ps = probs[idx];
IdType rofs = idx - subindptr[in_row]; IdType rofs = idx - subindptr[in_row];
...@@ -277,7 +279,7 @@ __global__ void _CSRRowWiseLayerSampleDegreeKernel( ...@@ -277,7 +279,7 @@ __global__ void _CSRRowWiseLayerSampleDegreeKernel(
const FloatType* const ds, const FloatType* const d2s, const FloatType* const ds, const FloatType* const d2s,
const IdType* const indptr, const FloatType* const probs, const IdType* const indptr, const FloatType* const probs,
const FloatType* const A, const IdType* const subindptr) { const FloatType* const A, const IdType* const subindptr) {
typedef cub::BlockReduce<FloatType, BLOCK_SIZE> BlockReduce; typedef hipcub::BlockReduce<FloatType, BLOCK_SIZE> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage; __shared__ typename BlockReduce::TempStorage temp_storage;
__shared__ FloatType var_1_bcast[BLOCK_CTAS]; __shared__ FloatType var_1_bcast[BLOCK_CTAS];
...@@ -351,7 +353,7 @@ int log_size(const IdType size) { ...@@ -351,7 +353,7 @@ int log_size(const IdType size) {
template <typename IdType, typename FloatType, typename exec_policy_t> template <typename IdType, typename FloatType, typename exec_policy_t>
void compute_importance_sampling_probabilities( void compute_importance_sampling_probabilities(
CSRMatrix mat, const IdType hop_size, cudaStream_t stream, CSRMatrix mat, const IdType hop_size, hipStream_t stream,
const continuous_seed seed, const IdType num_rows, const IdType* indptr, const continuous_seed seed, const IdType num_rows, const IdType* indptr,
const IdType* subindptr, const IdType* indices, IdArray idx_coo_arr, const IdType* subindptr, const IdType* indices, IdArray idx_coo_arr,
const IdType* nids, const IdType* nids,
...@@ -398,17 +400,17 @@ void compute_importance_sampling_probabilities( ...@@ -398,17 +400,17 @@ void compute_importance_sampling_probabilities(
hop_1, 0, hop_2.get(), 0, sizeof(IdType) * hop_size, ctx, ctx, hop_1, 0, hop_2.get(), 0, sizeof(IdType) * hop_size, ctx, ctx,
mat.indptr->dtype); mat.indptr->dtype);
cub::DoubleBuffer<IdType> hop_b(hop_2.get(), hop_3.get()); hipcub::DoubleBuffer<IdType> hop_b(hop_2.get(), hop_3.get());
{ {
std::size_t temp_storage_bytes = 0; std::size_t temp_storage_bytes = 0;
CUDA_CALL(cub::DeviceRadixSort::SortKeys( CUDA_CALL(hipcub::DeviceRadixSort::SortKeys(
nullptr, temp_storage_bytes, hop_b, hop_size, 0, max_log_num_vertices, nullptr, temp_storage_bytes, hop_b, hop_size, 0, max_log_num_vertices,
stream)); stream));
auto temp = allocator.alloc_unique<char>(temp_storage_bytes); auto temp = allocator.alloc_unique<char>(temp_storage_bytes);
CUDA_CALL(cub::DeviceRadixSort::SortKeys( CUDA_CALL(hipcub::DeviceRadixSort::SortKeys(
temp.get(), temp_storage_bytes, hop_b, hop_size, 0, temp.get(), temp_storage_bytes, hop_b, hop_size, 0,
max_log_num_vertices, stream)); max_log_num_vertices, stream));
} }
...@@ -418,13 +420,13 @@ void compute_importance_sampling_probabilities( ...@@ -418,13 +420,13 @@ void compute_importance_sampling_probabilities(
{ {
std::size_t temp_storage_bytes = 0; std::size_t temp_storage_bytes = 0;
CUDA_CALL(cub::DeviceRunLengthEncode::Encode( CUDA_CALL(hipcub::DeviceRunLengthEncode::Encode(
nullptr, temp_storage_bytes, hop_b.Current(), hop_unique.get(), nullptr, temp_storage_bytes, hop_b.Current(), hop_unique.get(),
hop_counts.get(), hop_unique_size.get(), hop_size, stream)); hop_counts.get(), hop_unique_size.get(), hop_size, stream));
auto temp = allocator.alloc_unique<char>(temp_storage_bytes); auto temp = allocator.alloc_unique<char>(temp_storage_bytes);
CUDA_CALL(cub::DeviceRunLengthEncode::Encode( CUDA_CALL(hipcub::DeviceRunLengthEncode::Encode(
temp.get(), temp_storage_bytes, hop_b.Current(), hop_unique.get(), temp.get(), temp_storage_bytes, hop_b.Current(), hop_unique.get(),
hop_counts.get(), hop_unique_size.get(), hop_size, stream)); hop_counts.get(), hop_unique_size.get(), hop_size, stream));
...@@ -511,7 +513,7 @@ void compute_importance_sampling_probabilities( ...@@ -511,7 +513,7 @@ void compute_importance_sampling_probabilities(
/////////////////////////////// CSR /////////////////////////////// /////////////////////////////// CSR ///////////////////////////////
template <DGLDeviceType XPU, typename IdType, typename FloatType> template <DGLDeviceType XPU, typename IdType, typename FloatType>
std::pair<COOMatrix, FloatArray> CSRLaborSampling( __host__ std::pair<COOMatrix, FloatArray> CSRLaborSampling(
CSRMatrix mat, IdArray rows_arr, const int64_t num_picks, CSRMatrix mat, IdArray rows_arr, const int64_t num_picks,
FloatArray prob_arr, const int importance_sampling, IdArray random_seed_arr, FloatArray prob_arr, const int importance_sampling, IdArray random_seed_arr,
float seed2_contribution, IdArray NIDs) { float seed2_contribution, IdArray NIDs) {
...@@ -521,8 +523,8 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling( ...@@ -521,8 +523,8 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling(
runtime::CUDAWorkspaceAllocator allocator(ctx); runtime::CUDAWorkspaceAllocator allocator(ctx);
const auto stream = runtime::getCurrentCUDAStream(); const auto stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const auto exec_policy = thrust::cuda::par_nosync(allocator).on(stream); const auto exec_policy = thrust::hip::par_nosync(allocator).on(stream);
auto device = runtime::DeviceAPI::Get(ctx); auto device = runtime::DeviceAPI::Get(ctx);
...@@ -569,11 +571,11 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling( ...@@ -569,11 +571,11 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling(
auto ds_d2s = thrust::make_zip_iterator(ds, d2s); auto ds_d2s = thrust::make_zip_iterator(ds, d2s);
size_t prefix_temp_size = 0; size_t prefix_temp_size = 0;
CUDA_CALL(cub::DeviceSegmentedReduce::Reduce( CUDA_CALL(hipcub::DeviceSegmentedReduce::Reduce(
nullptr, prefix_temp_size, A_A2, ds_d2s, num_rows, b_offsets, e_offsets, nullptr, prefix_temp_size, A_A2, ds_d2s, num_rows, b_offsets, e_offsets,
TupleSum{}, thrust::make_tuple((FloatType)0, (FloatType)0), stream)); TupleSum{}, thrust::make_tuple((FloatType)0, (FloatType)0), stream));
auto temp = allocator.alloc_unique<char>(prefix_temp_size); auto temp = allocator.alloc_unique<char>(prefix_temp_size);
CUDA_CALL(cub::DeviceSegmentedReduce::Reduce( CUDA_CALL(hipcub::DeviceSegmentedReduce::Reduce(
temp.get(), prefix_temp_size, A_A2, ds_d2s, num_rows, b_offsets, temp.get(), prefix_temp_size, A_A2, ds_d2s, num_rows, b_offsets,
e_offsets, TupleSum{}, thrust::make_tuple((FloatType)0, (FloatType)0), e_offsets, TupleSum{}, thrust::make_tuple((FloatType)0, (FloatType)0),
stream)); stream));
...@@ -586,11 +588,11 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling( ...@@ -586,11 +588,11 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling(
IdType hop_size; IdType hop_size;
{ {
size_t prefix_temp_size = 0; size_t prefix_temp_size = 0;
CUDA_CALL(cub::DeviceScan::ExclusiveSum( CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
nullptr, prefix_temp_size, in_deg.get(), subindptr, num_rows + 1, nullptr, prefix_temp_size, in_deg.get(), subindptr, num_rows + 1,
stream)); stream));
auto temp = allocator.alloc_unique<char>(prefix_temp_size); auto temp = allocator.alloc_unique<char>(prefix_temp_size);
CUDA_CALL(cub::DeviceScan::ExclusiveSum( CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
temp.get(), prefix_temp_size, in_deg.get(), subindptr, num_rows + 1, temp.get(), prefix_temp_size, in_deg.get(), subindptr, num_rows + 1,
stream)); stream));
...@@ -619,11 +621,11 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling( ...@@ -619,11 +621,11 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling(
auto modified_in_deg = thrust::make_transform_iterator( auto modified_in_deg = thrust::make_transform_iterator(
iota, AlignmentFunc<IdType>{in_deg.get(), perm, num_rows}); iota, AlignmentFunc<IdType>{in_deg.get(), perm, num_rows});
size_t prefix_temp_size = 0; size_t prefix_temp_size = 0;
CUDA_CALL(cub::DeviceScan::ExclusiveSum( CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
nullptr, prefix_temp_size, modified_in_deg, subindptr_aligned.get(), nullptr, prefix_temp_size, modified_in_deg, subindptr_aligned.get(),
num_rows + 1, stream)); num_rows + 1, stream));
auto temp = allocator.alloc_unique<char>(prefix_temp_size); auto temp = allocator.alloc_unique<char>(prefix_temp_size);
CUDA_CALL(cub::DeviceScan::ExclusiveSum( CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
temp.get(), prefix_temp_size, modified_in_deg, temp.get(), prefix_temp_size, modified_in_deg,
subindptr_aligned.get(), num_rows + 1, stream)); subindptr_aligned.get(), num_rows + 1, stream));
......
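The labor-sampling hunks above swap every cub:: device primitive for its hipcub:: counterpart but keep CUB's two-pass idiom: call once with a null workspace pointer to query the temporary-storage size, allocate, then call again on the same stream. A minimal standalone sketch of that idiom follows (not part of this commit; the element count and the plain hipMalloc workspace are assumptions, where the commit uses DGL's workspace allocator instead).

#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>

int main() {
  const int num_items = 1 << 20;  // assumed element count
  int *d_in, *d_out;
  hipMalloc(&d_in, sizeof(int) * num_items);
  hipMalloc(&d_out, sizeof(int) * num_items);
  hipMemset(d_in, 0, sizeof(int) * num_items);

  hipStream_t stream;
  hipStreamCreate(&stream);

  // Pass 1: null workspace pointer, so the call only reports the required byte count.
  void* d_temp = nullptr;
  size_t temp_bytes = 0;
  hipcub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, d_in, d_out, num_items, stream);

  // Pass 2: allocate the workspace and run the scan for real on the same stream.
  hipMalloc(&d_temp, temp_bytes);
  hipcub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, d_in, d_out, num_items, stream);

  hipStreamSynchronize(stream);
  hipFree(d_temp);
  hipFree(d_in);
  hipFree(d_out);
  hipStreamDestroy(stream);
  return 0;
}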
// !!! This is a file automatically generated by hipify!!!
/** /**
* Copyright (c) 2020 by Contributors * Copyright (c) 2020 by Contributors
* @file array/cuda/macro.cuh * @file array/cuda/macro.cuh
...@@ -30,14 +31,14 @@ ...@@ -30,14 +31,14 @@
const auto device = runtime::DeviceAPI::Get(ctx); \ const auto device = runtime::DeviceAPI::Get(ctx); \
(LHS_OFF) = static_cast<int64_t *>(device->AllocWorkspace( \ (LHS_OFF) = static_cast<int64_t *>(device->AllocWorkspace( \
ctx, sizeof(int64_t) * info.lhs_offset.size())); \ ctx, sizeof(int64_t) * info.lhs_offset.size())); \
CUDA_CALL(cudaMemcpy( \ CUDA_CALL(hipMemcpy( \
(LHS_OFF), &info.lhs_offset[0], \ (LHS_OFF), &info.lhs_offset[0], \
sizeof(int64_t) * info.lhs_offset.size(), cudaMemcpyHostToDevice)); \ sizeof(int64_t) * info.lhs_offset.size(), hipMemcpyHostToDevice)); \
(RHS_OFF) = static_cast<int64_t *>(device->AllocWorkspace( \ (RHS_OFF) = static_cast<int64_t *>(device->AllocWorkspace( \
ctx, sizeof(int64_t) * info.rhs_offset.size())); \ ctx, sizeof(int64_t) * info.rhs_offset.size())); \
CUDA_CALL(cudaMemcpy( \ CUDA_CALL(hipMemcpy( \
(RHS_OFF), &info.rhs_offset[0], \ (RHS_OFF), &info.rhs_offset[0], \
sizeof(int64_t) * info.rhs_offset.size(), cudaMemcpyHostToDevice)); \ sizeof(int64_t) * info.rhs_offset.size(), hipMemcpyHostToDevice)); \
if ((EDGE_MAP)) { \ if ((EDGE_MAP)) { \
constexpr bool UseIdx = true; \ constexpr bool UseIdx = true; \
{ __VA_ARGS__ } \ { __VA_ARGS__ } \
......
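The macro hunk above only swaps cudaMemcpy and cudaMemcpyHostToDevice for their HIP equivalents; the upload itself remains a plain synchronous host-to-device copy of the offset vectors. A minimal standalone sketch of that copy (not part of this commit; the offset values are made up):

#include <hip/hip_runtime.h>
#include <cstdint>
#include <vector>

int main() {
  const std::vector<int64_t> lhs_offset = {0, 4, 9, 15};  // made-up offsets

  int64_t* d_lhs_off = nullptr;
  hipMalloc(&d_lhs_off, sizeof(int64_t) * lhs_offset.size());

  // Synchronous host-to-device copy, same direction flag as in the macro.
  hipMemcpy(d_lhs_off, lhs_offset.data(),
            sizeof(int64_t) * lhs_offset.size(), hipMemcpyHostToDevice);

  hipFree(d_lhs_off);
  return 0;
}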