Commit 74d88bf8 authored by sangwz

Merge branch 'dtk25.04' of http://developer.sourcefind.cn/codes/OpenDAS/dgl into 2.2.1

parents 2a1ac588 314cedc1
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2020 by Contributors
  * @file array/cuda/functor.cuh
@@ -9,8 +10,8 @@
 #include <cmath>
 #include <limits>
-#include "./atomic.cuh"
-#include "./fp16.cuh"
+#include "atomic.cuh"
+#include "fp16.cuh"
 #include "bf16.cuh"
 namespace dgl {
@@ -208,29 +209,29 @@ struct Sum<Idx, __half, atomic> : _Sum<Idx, __half, atomic> {
 #if BF16_ENABLED
 template <typename Idx, bool atomic>
-struct Sum<Idx, __nv_bfloat16, atomic> : _Sum<Idx, __nv_bfloat16, atomic> {
-  static constexpr __host__ __device__ __forceinline__ __nv_bfloat16 zero() {
-    return __float2bfloat16_rn(0.);
+struct Sum<Idx, __hip_bfloat16, atomic> : _Sum<Idx, __hip_bfloat16, atomic> {
+  static constexpr __host__ __device__ __forceinline__ __hip_bfloat16 zero() {
+    return __float2bfloat16(0.);
   }
   static __device__ __forceinline__ void Call(
-      __nv_bfloat16 *out_buf, Idx *arg_u_buf, Idx *arg_e_buf,
-      __nv_bfloat16 val, Idx uid, Idx eid) {
-    _Sum<Idx, __nv_bfloat16, atomic>::Call(
+      __hip_bfloat16 *out_buf, Idx *arg_u_buf, Idx *arg_e_buf,
+      __hip_bfloat16 val, Idx uid, Idx eid) {
+    _Sum<Idx, __hip_bfloat16, atomic>::Call(
        out_buf, arg_u_buf, arg_e_buf, val, uid, eid);
   }
   static __device__ __forceinline__ void Call(
-      __nv_bfloat16 *out_buf, Idx *arg_buf, __nv_bfloat16 val, Idx id) {
-    _Sum<Idx, __nv_bfloat16, atomic>::Call(out_buf, arg_buf, val, id);
+      __hip_bfloat16 *out_buf, Idx *arg_buf, __hip_bfloat16 val, Idx id) {
+    _Sum<Idx, __hip_bfloat16, atomic>::Call(out_buf, arg_buf, val, id);
   }
   // sometimes we have to use float in reduction for better precision
   static __device__ __forceinline__ void Call(
      float *out_buf, Idx *arg_u_buf, Idx *arg_e_buf,
-      __nv_bfloat16 val, Idx uid, Idx eid) {
+      __hip_bfloat16 val, Idx uid, Idx eid) {
    _Sum<Idx, float, atomic>::Call(out_buf, arg_u_buf, arg_e_buf,
        static_cast<float>(val), uid, eid);
  }
  static __device__ __forceinline__ void Call(
-      float *out_buf, Idx *arg_buf, __nv_bfloat16 val, Idx id) {
+      float *out_buf, Idx *arg_buf, __hip_bfloat16 val, Idx id) {
    _Sum<Idx, float, atomic>::Call(out_buf, arg_buf,
        static_cast<float>(val), id);
  }
@@ -313,29 +314,29 @@ struct Max<Idx, __half, atomic> : _Max<Idx, __half, atomic> {
 #if BF16_ENABLED
 template <typename Idx, bool atomic>
-struct Max<Idx, __nv_bfloat16, atomic> : _Max<Idx, __nv_bfloat16, atomic> {
-  static constexpr __host__ __device__ __forceinline__ __nv_bfloat16 zero() {
-    return __float2bfloat16_rn(-std::numeric_limits<float>::infinity());
+struct Max<Idx, __hip_bfloat16, atomic> : _Max<Idx, __hip_bfloat16, atomic> {
+  static constexpr __host__ __device__ __forceinline__ __hip_bfloat16 zero() {
+    return __float2bfloat16(-std::numeric_limits<float>::infinity());
  }
  static __device__ __forceinline__ void Call(
-      __nv_bfloat16 *out_buf, Idx *arg_u_buf, Idx *arg_e_buf,
-      __nv_bfloat16 val, Idx uid, Idx eid) {
-    _Max<Idx, __nv_bfloat16, atomic>::Call(
+      __hip_bfloat16 *out_buf, Idx *arg_u_buf, Idx *arg_e_buf,
+      __hip_bfloat16 val, Idx uid, Idx eid) {
+    _Max<Idx, __hip_bfloat16, atomic>::Call(
        out_buf, arg_u_buf, arg_e_buf, val, uid, eid);
  }
  static __device__ __forceinline__ void Call(
-      __nv_bfloat16 *out_buf, Idx *arg_buf, __nv_bfloat16 val, Idx id) {
-    _Max<Idx, __nv_bfloat16, atomic>::Call(out_buf, arg_buf, val, id);
+      __hip_bfloat16 *out_buf, Idx *arg_buf, __hip_bfloat16 val, Idx id) {
+    _Max<Idx, __hip_bfloat16, atomic>::Call(out_buf, arg_buf, val, id);
  }
  // sometimes we have to use float in reduction for better precision
  static __device__ __forceinline__ void Call(
      float *out_buf, Idx *arg_u_buf, Idx *arg_e_buf,
-      __nv_bfloat16 val, Idx uid, Idx eid) {
+      __hip_bfloat16 val, Idx uid, Idx eid) {
    _Max<Idx, float, atomic>::Call(out_buf, arg_u_buf, arg_e_buf,
        static_cast<float>(val), uid, eid);
  }
  static __device__ __forceinline__ void Call(
-      float *out_buf, Idx *arg_buf, __nv_bfloat16 val, Idx id) {
+      float *out_buf, Idx *arg_buf, __hip_bfloat16 val, Idx id) {
    _Max<Idx, float, atomic>::Call(out_buf, arg_buf,
        static_cast<float>(val), id);
  }
@@ -418,29 +419,29 @@ struct Min<Idx, __half, atomic> : _Min<Idx, __half, atomic> {
 #if BF16_ENABLED
 template <typename Idx, bool atomic>
-struct Min<Idx, __nv_bfloat16, atomic> : _Min<Idx, __nv_bfloat16, atomic> {
-  static constexpr __host__ __device__ __forceinline__ __nv_bfloat16 zero() {
-    return __float2bfloat16_rn(std::numeric_limits<float>::infinity());
+struct Min<Idx, __hip_bfloat16, atomic> : _Min<Idx, __hip_bfloat16, atomic> {
+  static constexpr __host__ __device__ __forceinline__ __hip_bfloat16 zero() {
+    return __float2bfloat16(std::numeric_limits<float>::infinity());
  }
  static __device__ __forceinline__ void Call(
-      __nv_bfloat16 *out_buf, Idx *arg_u_buf, Idx *arg_e_buf,
-      __nv_bfloat16 val, Idx uid, Idx eid) {
-    _Min<Idx, __nv_bfloat16, atomic>::Call(
+      __hip_bfloat16 *out_buf, Idx *arg_u_buf, Idx *arg_e_buf,
+      __hip_bfloat16 val, Idx uid, Idx eid) {
+    _Min<Idx, __hip_bfloat16, atomic>::Call(
        out_buf, arg_u_buf, arg_e_buf, val, uid, eid);
  }
  static __device__ __forceinline__ void Call(
-      __nv_bfloat16 *out_buf, Idx *arg_buf, __nv_bfloat16 val, Idx id) {
-    _Min<Idx, __nv_bfloat16, atomic>::Call(out_buf, arg_buf, val, id);
+      __hip_bfloat16 *out_buf, Idx *arg_buf, __hip_bfloat16 val, Idx id) {
+    _Min<Idx, __hip_bfloat16, atomic>::Call(out_buf, arg_buf, val, id);
  }
  // sometimes we have to use float in reduction for better precision
  static __device__ __forceinline__ void Call(
      float *out_buf, Idx *arg_u_buf, Idx *arg_e_buf,
-      __nv_bfloat16 val, Idx uid, Idx eid) {
+      __hip_bfloat16 val, Idx uid, Idx eid) {
    _Min<Idx, float, atomic>::Call(out_buf, arg_u_buf, arg_e_buf,
        static_cast<float>(val), uid, eid);
  }
  static __device__ __forceinline__ void Call(
-      float *out_buf, Idx *arg_buf, __nv_bfloat16 val, Idx id) {
+      float *out_buf, Idx *arg_buf, __hip_bfloat16 val, Idx id) {
    _Min<Idx, float, atomic>::Call(out_buf, arg_buf,
        static_cast<float>(val), id);
  }
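Side note (not part of the commit): the change above is the mechanical mapping hipify applies to the bf16 reducers, with __nv_bfloat16 / __float2bfloat16_rn replaced by __hip_bfloat16 / __float2bfloat16. A minimal standalone sketch of the same idea, assuming a ROCm toolchain that ships <hip/hip_bf16.h>; the functor name MySum is illustrative, not DGL's:

#include <hip/hip_bf16.h>
#include <hip/hip_runtime.h>

template <typename Idx>
struct MySum {
  // hipify rewrites __float2bfloat16_rn(0.) to __float2bfloat16(0.).
  static __host__ __device__ __forceinline__ __hip_bfloat16 zero() {
    return __float2bfloat16(0.0f);
  }
  // Accumulate in float for precision, as the "use float in reduction"
  // overloads above do; the caller can cast back to bf16 if needed.
  static __device__ __forceinline__ void Call(
      float* out_buf, Idx* arg_buf, __hip_bfloat16 val, Idx id) {
    out_buf[id] += __bfloat162float(val);
    if (arg_buf) arg_buf[id] = id;
  }
};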
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
  * Copyright (c) 2020 by Contributors
  * @file array/cuda/gather_mm.cu
@@ -7,9 +9,9 @@
 #include <algorithm>  // std::swap
-#include "./atomic.cuh"
-#include "./functor.cuh"
-#include "./utils.h"
+#include "atomic.cuh"
+#include "functor.cuh"
+#include "utils.h"
 namespace dgl {
 using namespace cuda;
@@ -20,54 +22,63 @@ namespace {
 /** @brief Call cuBLAS GEMM API for dense matmul operation for float and double.
  */
 template <typename DType>
-cublasStatus_t cublasGemm(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
+hipblasStatus_t cublasGemm(
+    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
    int m, int n, int k, const DType* alpha, const DType* A, int lda,
    const DType* B, int ldb, const DType* beta, DType* C, int ldc) {
  LOG(INFO) << "Not supported dtype";
-  return CUBLAS_STATUS_EXECUTION_FAILED;
+  return HIPBLAS_STATUS_EXECUTION_FAILED;
 }
 template <>
-cublasStatus_t cublasGemm<__half>(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
+hipblasStatus_t cublasGemm<__half>(
+    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
    int m, int n, int k, const __half* alpha, const __half* A, int lda,
    const __half* B, int ldb, const __half* beta, __half* C, int ldc) {
-  return cublasHgemm(
-      handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+  return hipblasHgemm(
+      handle, transa, transb, m, n, k, (hipblasHalf*)alpha, (hipblasHalf*)A, lda, (hipblasHalf*)B, ldb, (hipblasHalf*)beta, (hipblasHalf*)C, ldc);
 }
+// template <>
+// hipblasStatus_t cublasGemm<__half>(
+//     hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
+//     int m, int n, int k, const __half* alpha, const __half* A, int lda,
+//     const __half* B, int ldb, const __half* beta, __half* C, int ldc) {
+//   return hipblasHgemm(
+//       handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+// }
 #if BF16_ENABLED
 template <>
-cublasStatus_t cublasGemm<__nv_bfloat16>(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const __nv_bfloat16* alpha, const __nv_bfloat16* A,
-    int lda, const __nv_bfloat16* B, int ldb, const __nv_bfloat16* beta,
-    __nv_bfloat16* C, int ldc) {
+hipblasStatus_t cublasGemm<__hip_bfloat16>(
+    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
+    int m, int n, int k, const __hip_bfloat16* alpha, const __hip_bfloat16* A,
+    int lda, const __hip_bfloat16* B, int ldb, const __hip_bfloat16* beta,
+    __hip_bfloat16* C, int ldc) {
  float alpha_float = __bfloat162float(*alpha);
  float beta_float = __bfloat162float(*beta);
-  return cublasGemmEx(
-      handle, transa, transb, m, n, k, &alpha_float, A, CUDA_R_16BF, lda, B,
-      CUDA_R_16BF, ldb, &beta_float, C, CUDA_R_16BF, ldc, CUBLAS_COMPUTE_32F,
-      CUBLAS_GEMM_DEFAULT_TENSOR_OP);
+  return hipblasGemmEx(
+      handle, transa, transb, m, n, k, &alpha_float, A, HIPBLAS_R_16B, lda, B,
+      HIPBLAS_R_16B, ldb, &beta_float, C, HIPBLAS_R_16B, ldc, HIPBLAS_R_32F,
+      HIPBLAS_GEMM_DEFAULT);
 }
 #endif  // BF16_ENABLED
 template <>
-cublasStatus_t cublasGemm<float>(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
+hipblasStatus_t cublasGemm<float>(
+    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
    int m, int n, int k, const float* alpha, const float* A, int lda,
    const float* B, int ldb, const float* beta, float* C, int ldc) {
-  return cublasSgemm(
+  return hipblasSgemm(
      handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
 }
 template <>
-cublasStatus_t cublasGemm<double>(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
+hipblasStatus_t cublasGemm<double>(
+    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
    int m, int n, int k, const double* alpha, const double* A, int lda,
    const double* B, int ldb, const double* beta, double* C, int ldc) {
-  return cublasDgemm(
+  return hipblasDgemm(
      handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
 }
@@ -108,7 +119,7 @@ __global__ void GatherMMScatterKernel(
     // Load A in shared mem in a coalesced way
     for (unsigned int l = laneId; l < a_tile; l += 32)
       sh_A[local_row * sh_a_tile + l] = A[cur_rowA * in_len + (k_start + l)];
-    __syncwarp();
+    // __syncwarp();
    for (unsigned int outloop = 0; outloop < out_len; outloop += 32) {
      DType out_reg = static_cast<DType>(0.0f);  // thread private
@@ -165,7 +176,7 @@ __global__ void GatherMMScatterKernel2(
     /* Load A in shared mem in a coalesced way */
     for (unsigned int l = laneId; l < a_tile; l += 32)
       sh_A[local_row * sh_a_tile + l] = A[row_a * in_len + (k_start + l)];
-    __syncwarp();
+    // __syncwarp();
    for (unsigned int outloop = 0; outloop < out_len; outloop += 32) {
      DType out_reg = static_cast<DType>(0.0f);  // thread private
@@ -203,7 +214,7 @@ void SegmentMM(
    const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A,
    bool a_trans, bool b_trans) {
  auto device = runtime::DeviceAPI::Get(A->ctx);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  const DType* A_data = A.Ptr<DType>();
  const DType* B_data = B.Ptr<DType>();
  const IdType* seglen_A_data = seglen_A.Ptr<IdType>();
@@ -215,8 +226,8 @@ void SegmentMM(
  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
  if (!thr_entry->cublas_handle)
-    CUBLAS_CALL(cublasCreate(&(thr_entry->cublas_handle)));
-  CUBLAS_CALL(cublasSetStream(thr_entry->cublas_handle, stream));
+    CUBLAS_CALL(hipblasCreate(&(thr_entry->cublas_handle)));
+  CUBLAS_CALL(hipblasSetStream(thr_entry->cublas_handle, stream));
  IdType m_offset = 0;
  for (IdType etype = 0; etype < num_rel; ++etype) {
@@ -226,10 +237,10 @@ void SegmentMM(
    n = B->shape[2];  // cols of B
    k = B->shape[1];  // cols of A == rows of B
    int ldb = n, lda = k, ldc = n;
-    cublasOperation_t transB = CUBLAS_OP_N;
-    cublasOperation_t transA = CUBLAS_OP_N;
+    hipblasOperation_t transB = HIPBLAS_OP_N;
+    hipblasOperation_t transA = HIPBLAS_OP_N;
    if (b_trans) {
-      transB = CUBLAS_OP_T;
+      transB = HIPBLAS_OP_T;
      ldb = n, lda = n, ldc = k;
      std::swap(n, k);
    }
@@ -248,7 +259,7 @@ template <int XPU, typename IdType, typename DType>
 void SegmentMMBackwardB(
    const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen) {
  auto device = runtime::DeviceAPI::Get(A->ctx);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  const DType* A_data = A.Ptr<DType>();
  const DType* dC_data = dC.Ptr<DType>();
  const IdType* seglen_data = seglen.Ptr<IdType>();
@@ -260,8 +271,8 @@ void SegmentMMBackwardB(
  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
  if (!thr_entry->cublas_handle)
-    CUBLAS_CALL(cublasCreate(&(thr_entry->cublas_handle)));
-  CUBLAS_CALL(cublasSetStream(thr_entry->cublas_handle, stream));
+    CUBLAS_CALL(hipblasCreate(&(thr_entry->cublas_handle)));
+  CUBLAS_CALL(hipblasSetStream(thr_entry->cublas_handle, stream));
  IdType k_offset = 0;
  for (IdType etype = 0; etype < num_rel; ++etype) {
@@ -271,8 +282,8 @@ void SegmentMMBackwardB(
    CHECK_LE(k_offset + k, A->shape[0])
        << "Segement index out of bound of A->shape[0].";
    int lddC = m, ldA = n, lddB = m;
-    cublasOperation_t trans_dC = CUBLAS_OP_N;
-    cublasOperation_t trans_A = CUBLAS_OP_T;
+    hipblasOperation_t trans_dC = HIPBLAS_OP_N;
+    hipblasOperation_t trans_A = HIPBLAS_OP_T;
    CUBLAS_CALL(cublasGemm<DType>(
        thr_entry->cublas_handle, trans_dC, trans_A, m, n, k, &alpha,
        dC_data + dC_offset, lddC, A_data + A_offset, ldA, &beta,
@@ -299,7 +310,7 @@ void GatherMM(
    const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
    const NDArray idx_b) {
  auto device = runtime::DeviceAPI::Get(A->ctx);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  int64_t out_len = B->shape[2];  // cols of B
  int64_t in_len = A->shape[1];  // cols of A
  const int64_t tot_num_rows = A->shape[0];
@@ -332,7 +343,7 @@ void GatherMMScatter(
    const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
    const NDArray idx_b, const NDArray idx_c) {
  auto device = runtime::DeviceAPI::Get(A->ctx);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  const IdType* idx_c_data = idx_c.Ptr<IdType>();
  int64_t out_len = (B->ndim == 2) ? B->shape[1] : B->shape[2];  // cols of B
  int64_t in_len = A->shape[1];  // cols of A
@@ -367,10 +378,10 @@ template void GatherMM<kDGLCUDA, int64_t, __half>(
    const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
    const NDArray idx_b);
 #if BF16_ENABLED
-template void GatherMM<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void GatherMM<kDGLCUDA, int32_t, __hip_bfloat16>(
    const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
    const NDArray idx_b);
-template void GatherMM<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void GatherMM<kDGLCUDA, int64_t, __hip_bfloat16>(
    const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
    const NDArray idx_b);
 #endif  // BF16_ENABLED
@@ -394,10 +405,10 @@ template void GatherMMScatter<kDGLCUDA, int64_t, __half>(
    const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
    const NDArray idx_b, const NDArray idx_c);
 #if BF16_ENABLED
-template void GatherMMScatter<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void GatherMMScatter<kDGLCUDA, int32_t, __hip_bfloat16>(
    const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
    const NDArray idx_b, const NDArray idx_c);
-template void GatherMMScatter<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void GatherMMScatter<kDGLCUDA, int64_t, __hip_bfloat16>(
    const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
    const NDArray idx_b, const NDArray idx_c);
 #endif  // BF16_ENABLED
@@ -421,10 +432,10 @@ template void SegmentMM<kDGLCUDA, int64_t, __half>(
    const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A,
    bool a_trans, bool b_trans);
 #if BF16_ENABLED
-template void SegmentMM<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void SegmentMM<kDGLCUDA, int32_t, __hip_bfloat16>(
    const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A,
    bool a_trans, bool b_trans);
-template void SegmentMM<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void SegmentMM<kDGLCUDA, int64_t, __hip_bfloat16>(
    const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A,
    bool a_trans, bool b_trans);
 #endif  // BF16_ENABLED
@@ -446,9 +457,9 @@ template void SegmentMMBackwardB<kDGLCUDA, int32_t, __half>(
 template void SegmentMMBackwardB<kDGLCUDA, int64_t, __half>(
    const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen);
 #if BF16_ENABLED
-template void SegmentMMBackwardB<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void SegmentMMBackwardB<kDGLCUDA, int32_t, __hip_bfloat16>(
    const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen);
-template void SegmentMMBackwardB<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void SegmentMMBackwardB<kDGLCUDA, int64_t, __hip_bfloat16>(
    const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen);
 #endif  // BF16_ENABLED
 template void SegmentMMBackwardB<kDGLCUDA, int32_t, float>(
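Side note (not part of the commit): the cublasGemm wrappers above keep their old names but dispatch to hipBLAS. A condensed sketch of the same dtype-dispatch idea, assuming the <hipblas/hipblas.h> header (older ROCm installs expose it as <hipblas.h>); the wrapper name GemmWrapper is illustrative and only the float path is shown:

#include <hipblas/hipblas.h>

// Generic fallback: unsupported element types report an error status,
// mirroring the LOG + error-return pattern in the diff above.
template <typename DType>
hipblasStatus_t GemmWrapper(
    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
    int m, int n, int k, const DType* alpha, const DType* A, int lda,
    const DType* B, int ldb, const DType* beta, DType* C, int ldc) {
  return HIPBLAS_STATUS_NOT_SUPPORTED;
}

// float specialization forwards straight to hipblasSgemm.
template <>
hipblasStatus_t GemmWrapper<float>(
    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
    int m, int n, int k, const float* alpha, const float* A, int lda,
    const float* B, int ldb, const float* beta, float* C, int ldc) {
  return hipblasSgemm(
      handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
}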
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
  * Copyright (c) 2020 by Contributors
  * @file array/cuda/ge_spmm.cuh
@@ -7,7 +9,7 @@
 #define DGL_ARRAY_CUDA_GE_SPMM_CUH_
 #include "../../runtime/cuda/cuda_common.h"
-#include "./utils.h"
+#include "utils.h"
 #include "atomic.cuh"
 #include "macro.cuh"
@@ -121,7 +123,7 @@ void GESpMMCsr(
  const DType* efeat_data = efeat.Ptr<DType>();
  DType* out_data = out.Ptr<DType>();
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  const int ntx = 32;
  const int nty = 32;
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /*!
  * Copyright (c) 2022, NVIDIA Corporation
  * Copyright (c) 2022, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -34,17 +36,21 @@
 #include <thrust/zip_function.h>
 #include <algorithm>
-#include <cub/cub.cuh>  // NOLINT
+#include <hipcub/hipcub.hpp>  // NOLINT
 #include <limits>
 #include <numeric>
 #include <type_traits>
 #include <utility>
 #include "../../array/cuda/utils.h"
+#include "atomic.cuh"
+#include "../../graph/transform/cuda/cuda_map_edges.cuh"
 #include "../../random/continuous_seed.h"
 #include "../../runtime/cuda/cuda_common.h"
-#include "./functor.cuh"
-#include "./spmm.cuh"
+#include "functor.cuh"
+#include "spmm.cuh"
 namespace dgl {
 namespace aten {
@@ -129,7 +135,7 @@ struct StencilOpFused {
  const IdType* indices;
  const IdType* nids;
  bool is_pinned;
-  __device__ auto operator()(IdType idx) {
+  __host__ __device__ auto operator()(IdType idx) {
    const auto in_row = idx_coo[idx];
    const auto ps = probs[idx];
    IdType rofs = idx - subindptr[in_row];
@@ -275,7 +281,7 @@ __global__ void _CSRRowWiseLayerSampleDegreeKernel(
    const FloatType* const ds, const FloatType* const d2s,
    const IdType* const indptr, const FloatType* const probs,
    const FloatType* const A, const IdType* const subindptr) {
-  typedef cub::BlockReduce<FloatType, BLOCK_SIZE> BlockReduce;
+  typedef hipcub::BlockReduce<FloatType, BLOCK_SIZE> BlockReduce;
  __shared__ typename BlockReduce::TempStorage temp_storage;
  __shared__ FloatType var_1_bcast[BLOCK_CTAS];
@@ -349,7 +355,7 @@ int log_size(const IdType size) {
 template <typename IdType, typename FloatType, typename exec_policy_t>
 void compute_importance_sampling_probabilities(
-    CSRMatrix mat, const IdType hop_size, cudaStream_t stream,
+    CSRMatrix mat, const IdType hop_size, hipStream_t stream,
    const continuous_seed seed, const IdType num_rows, const IdType* indptr,
    const IdType* subindptr, const IdType* indices, IdArray idx_coo_arr,
    const IdType* nids,
@@ -396,17 +402,17 @@ void compute_importance_sampling_probabilities(
      hop_1, 0, hop_2.get(), 0, sizeof(IdType) * hop_size, ctx, ctx,
      mat.indptr->dtype);
-  cub::DoubleBuffer<IdType> hop_b(hop_2.get(), hop_3.get());
+  hipcub::DoubleBuffer<IdType> hop_b(hop_2.get(), hop_3.get());
  {
    std::size_t temp_storage_bytes = 0;
-    CUDA_CALL(cub::DeviceRadixSort::SortKeys(
+    CUDA_CALL(hipcub::DeviceRadixSort::SortKeys(
        nullptr, temp_storage_bytes, hop_b, hop_size, 0, max_log_num_vertices,
        stream));
    auto temp = allocator.alloc_unique<char>(temp_storage_bytes);
-    CUDA_CALL(cub::DeviceRadixSort::SortKeys(
+    CUDA_CALL(hipcub::DeviceRadixSort::SortKeys(
        temp.get(), temp_storage_bytes, hop_b, hop_size, 0,
        max_log_num_vertices, stream));
  }
@@ -416,13 +422,13 @@ void compute_importance_sampling_probabilities(
  {
    std::size_t temp_storage_bytes = 0;
-    CUDA_CALL(cub::DeviceRunLengthEncode::Encode(
+    CUDA_CALL(hipcub::DeviceRunLengthEncode::Encode(
        nullptr, temp_storage_bytes, hop_b.Current(), hop_unique.get(),
        hop_counts.get(), hop_unique_size.get(), hop_size, stream));
    auto temp = allocator.alloc_unique<char>(temp_storage_bytes);
-    CUDA_CALL(cub::DeviceRunLengthEncode::Encode(
+    CUDA_CALL(hipcub::DeviceRunLengthEncode::Encode(
        temp.get(), temp_storage_bytes, hop_b.Current(), hop_unique.get(),
        hop_counts.get(), hop_unique_size.get(), hop_size, stream));
@@ -509,7 +515,7 @@ void compute_importance_sampling_probabilities(
 /////////////////////////////// CSR ///////////////////////////////
 template <DGLDeviceType XPU, typename IdType, typename FloatType>
-std::pair<COOMatrix, FloatArray> CSRLaborSampling(
+__host__ std::pair<COOMatrix, FloatArray> CSRLaborSampling(
    CSRMatrix mat, IdArray rows_arr, const int64_t num_picks,
    FloatArray prob_arr, const int importance_sampling, IdArray random_seed_arr,
    float seed2_contribution, IdArray NIDs) {
@@ -519,19 +525,25 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling(
  runtime::CUDAWorkspaceAllocator allocator(ctx);
-  const auto stream = runtime::getCurrentCUDAStream();
-  const auto exec_policy = thrust::cuda::par_nosync(allocator).on(stream);
+  const auto stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
+  const auto exec_policy = thrust::hip::par_nosync(allocator).on(stream);
  auto device = runtime::DeviceAPI::Get(ctx);
  const IdType num_rows = rows_arr->shape[0];
-  IdType* const rows = rows_arr.Ptr<IdType>();
-  IdType* const nids = IsNullArray(NIDs) ? nullptr : NIDs.Ptr<IdType>();
-  FloatType* const A = prob_arr.Ptr<FloatType>();
-  IdType* const indptr_ = mat.indptr.Ptr<IdType>();
-  IdType* const indices_ = mat.indices.Ptr<IdType>();
-  IdType* const data = CSRHasData(mat) ? mat.data.Ptr<IdType>() : nullptr;
+  // IdType* const rows = rows_arr.Ptr<IdType>();
+  IdType* const rows = static_cast<IdType*>(GetDevicePointer(rows_arr));
+  // IdType* const nids = IsNullArray(NIDs) ? nullptr : NIDs.Ptr<IdType>();
+  IdType* const nids = IsNullArray(NIDs) ? nullptr : static_cast<IdType*>(GetDevicePointer(NIDs));
+  // FloatType* const A = prob_arr.Ptr<FloatType>();
+  FloatType* const A = static_cast<FloatType*>(GetDevicePointer(prob_arr));;
+  // IdType* const indptr_ = mat.indptr.Ptr<IdType>();
+  IdType* const indptr_ = static_cast<IdType*>(GetDevicePointer(mat.indptr));
+  // IdType* const indices_ = mat.indices.Ptr<IdType>();
+  IdType* const indices_ = static_cast<IdType*>(GetDevicePointer(mat.indices));
+  // IdType* const data = CSRHasData(mat) ? mat.data.Ptr<IdType>() : nullptr;
+  IdType* const data = CSRHasData(mat) ? static_cast<IdType*>(GetDevicePointer(mat.data)) : nullptr;
  // Read indptr only once in case it is pinned and access is slow.
  auto indptr = allocator.alloc_unique<IdType>(num_rows);
@@ -567,11 +579,11 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling(
  auto ds_d2s = thrust::make_zip_iterator(ds, d2s);
  size_t prefix_temp_size = 0;
-  CUDA_CALL(cub::DeviceSegmentedReduce::Reduce(
+  CUDA_CALL(hipcub::DeviceSegmentedReduce::Reduce(
      nullptr, prefix_temp_size, A_A2, ds_d2s, num_rows, b_offsets, e_offsets,
      TupleSum{}, thrust::make_tuple((FloatType)0, (FloatType)0), stream));
  auto temp = allocator.alloc_unique<char>(prefix_temp_size);
-  CUDA_CALL(cub::DeviceSegmentedReduce::Reduce(
+  CUDA_CALL(hipcub::DeviceSegmentedReduce::Reduce(
      temp.get(), prefix_temp_size, A_A2, ds_d2s, num_rows, b_offsets,
      e_offsets, TupleSum{}, thrust::make_tuple((FloatType)0, (FloatType)0),
      stream));
@@ -584,11 +596,11 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling(
  IdType hop_size;
  {
    size_t prefix_temp_size = 0;
-    CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+    CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
        nullptr, prefix_temp_size, in_deg.get(), subindptr, num_rows + 1,
        stream));
    auto temp = allocator.alloc_unique<char>(prefix_temp_size);
-    CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+    CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
        temp.get(), prefix_temp_size, in_deg.get(), subindptr, num_rows + 1,
        stream));
@@ -617,11 +629,11 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling(
    auto modified_in_deg = thrust::make_transform_iterator(
        iota, AlignmentFunc<IdType>{in_deg.get(), perm, num_rows});
    size_t prefix_temp_size = 0;
-    CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+    CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
        nullptr, prefix_temp_size, modified_in_deg, subindptr_aligned.get(),
        num_rows + 1, stream));
    auto temp = allocator.alloc_unique<char>(prefix_temp_size);
-    CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+    CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
        temp.get(), prefix_temp_size, modified_in_deg,
        subindptr_aligned.get(), num_rows + 1, stream));
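Side note (not part of the commit): every cub:: call replaced by hipcub:: above keeps CUB's two-pass convention, where the first call only computes the temporary-storage size. A minimal self-contained sketch of that idiom, assuming hipCUB is available; buffer names are illustrative:

#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>

// Exclusive prefix sum of n device integers, the same pattern used above to
// turn per-row degrees into subindptr offsets.
void ExclusivePrefixSum(const int* d_in, int* d_out, int n, hipStream_t stream) {
  size_t temp_bytes = 0;
  void* d_temp = nullptr;
  // Pass 1: d_temp == nullptr, so only temp_bytes is written.
  hipcub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, d_in, d_out, n, stream);
  hipMalloc(&d_temp, temp_bytes);
  // Pass 2: the actual scan, enqueued on the given stream.
  hipcub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, d_in, d_out, n, stream);
  hipFree(d_temp);
}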
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2020 by Contributors
  * @file array/cuda/macro.cuh
@@ -30,14 +31,14 @@
    const auto device = runtime::DeviceAPI::Get(ctx); \
    (LHS_OFF) = static_cast<int64_t *>(device->AllocWorkspace( \
        ctx, sizeof(int64_t) * info.lhs_offset.size())); \
-    CUDA_CALL(cudaMemcpy( \
+    CUDA_CALL(hipMemcpy( \
        (LHS_OFF), &info.lhs_offset[0], \
-        sizeof(int64_t) * info.lhs_offset.size(), cudaMemcpyHostToDevice)); \
+        sizeof(int64_t) * info.lhs_offset.size(), hipMemcpyHostToDevice)); \
    (RHS_OFF) = static_cast<int64_t *>(device->AllocWorkspace( \
        ctx, sizeof(int64_t) * info.rhs_offset.size())); \
-    CUDA_CALL(cudaMemcpy( \
+    CUDA_CALL(hipMemcpy( \
        (RHS_OFF), &info.rhs_offset[0], \
-        sizeof(int64_t) * info.rhs_offset.size(), cudaMemcpyHostToDevice)); \
+        sizeof(int64_t) * info.rhs_offset.size(), hipMemcpyHostToDevice)); \
    if ((EDGE_MAP)) { \
      constexpr bool UseIdx = true; \
      { __VA_ARGS__ } \
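Side note (not part of the commit): the macro above only swaps cudaMemcpy for hipMemcpy when staging the broadcast offset arrays on the device. The equivalent standalone call, as a minimal sketch with illustrative names:

#include <hip/hip_runtime.h>
#include <cstdint>
#include <vector>

// Copy a host vector of offsets into freshly allocated device memory, the same
// hipMemcpy(..., hipMemcpyHostToDevice) pattern the macro expands to.
int64_t* CopyOffsetsToDevice(const std::vector<int64_t>& offsets) {
  int64_t* d_ptr = nullptr;
  hipMalloc(&d_ptr, sizeof(int64_t) * offsets.size());
  hipMemcpy(d_ptr, offsets.data(), sizeof(int64_t) * offsets.size(),
            hipMemcpyHostToDevice);
  return d_ptr;
}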
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
  * Copyright (c) 2021 by Contributors
  * @file array/cuda/negative_sampling.cu
  * @brief rowwise sampling
  */
-#include <curand_kernel.h>
+#include <hiprand/hiprand_kernel.h>
 #include <dgl/array.h>
 #include <dgl/array_iterator.h>
 #include <dgl/random.h>
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
 #include "../../runtime/cuda/cuda_common.h"
-#include "./utils.h"
+#include "utils.h"
 using namespace dgl::runtime;
@@ -31,13 +33,13 @@ __global__ void _GlobalUniformNegativeSamplingKernel(
  int64_t tx = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride_x = gridDim.x * blockDim.x;
-  curandStatePhilox4_32_10_t
+  hiprandStatePhilox4_32_10_t
      rng;  // this allows generating 4 32-bit ints at a time
-  curand_init(random_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
+  hiprand_init(random_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
  while (tx < num_samples) {
    for (int i = 0; i < num_trials; ++i) {
-      uint4 result = curand4(&rng);
+      uint4 result = hiprand4(&rng);
      // Turns out that result.x is always 0 with the above RNG.
      uint64_t y_hi = result.y >> 16;
      uint64_t y_lo = result.y & 0xFFFF;
@@ -88,7 +90,7 @@ struct IsNotMinusOne {
 template <typename IdType>
 void SortOrderedPairs(
    runtime::DeviceAPI* device, DGLContext ctx, IdType* major, IdType* minor,
-    IdType* tmp_major, IdType* tmp_minor, int64_t n, cudaStream_t stream) {
+    IdType* tmp_major, IdType* tmp_minor, int64_t n, hipStream_t stream) {
  // Sort ordered pairs in lexicographical order by two radix sorts since
  // cub's radix sorts are stable.
  // We need a 2*n auxiliary storage to store the results form the first radix
@@ -98,21 +100,21 @@ void SortOrderedPairs(
  void* tmp2 = nullptr;
  // Radix sort by minor key first, reorder the major key in the progress.
-  CUDA_CALL(cub::DeviceRadixSort::SortPairs(
+  CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
      tmp1, s1, minor, tmp_minor, major, tmp_major, n, 0, sizeof(IdType) * 8,
      stream));
  tmp1 = device->AllocWorkspace(ctx, s1);
-  CUDA_CALL(cub::DeviceRadixSort::SortPairs(
+  CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
      tmp1, s1, minor, tmp_minor, major, tmp_major, n, 0, sizeof(IdType) * 8,
      stream));
  // Radix sort by major key next.
-  CUDA_CALL(cub::DeviceRadixSort::SortPairs(
+  CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
      tmp2, s2, tmp_major, major, tmp_minor, minor, n, 0, sizeof(IdType) * 8,
      stream));
  tmp2 = (s2 > s1) ? device->AllocWorkspace(ctx, s2)
                   : tmp1;  // reuse buffer if s2 <= s1
-  CUDA_CALL(cub::DeviceRadixSort::SortPairs(
+  CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
      tmp2, s2, tmp_major, major, tmp_minor, minor, n, 0, sizeof(IdType) * 8,
      stream));
@@ -141,7 +143,7 @@ std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling(
  IdType* out_row_data = out_row.Ptr<IdType>();
  IdType* out_col_data = out_col.Ptr<IdType>();
  auto device = runtime::DeviceAPI::Get(ctx);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  const int nt = cuda::FindNumThreads(num_actual_samples);
  const int nb = (num_actual_samples + nt - 1) / nt;
  std::pair<IdArray, IdArray> result;
@@ -159,11 +161,11 @@ std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling(
  IsNotMinusOne<IdType> op;
  PairIterator<IdType> begin(row_data, col_data);
  PairIterator<IdType> out_begin(out_row_data, out_col_data);
-  CUDA_CALL(cub::DeviceSelect::If(
+  CUDA_CALL(hipcub::DeviceSelect::If(
      nullptr, tmp_size, begin, out_begin, num_out_cuda, num_actual_samples, op,
      stream));
  void* tmp = device->AllocWorkspace(ctx, tmp_size);
-  CUDA_CALL(cub::DeviceSelect::If(
+  CUDA_CALL(hipcub::DeviceSelect::If(
      tmp, tmp_size, begin, out_begin, num_out_cuda, num_actual_samples, op,
      stream));
  num_out = cuda::GetCUDAScalar(device, ctx, num_out_cuda);
@@ -181,25 +183,25 @@ std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling(
    size_t tmp_size_unique = 0;
    void* tmp_unique = nullptr;
-    CUDA_CALL(cub::DeviceSelect::Unique(
+    CUDA_CALL(hipcub::DeviceSelect::Unique(
        nullptr, tmp_size_unique, out_begin, unique_begin, num_out_cuda,
        num_out, stream));
    tmp_unique = (tmp_size_unique > tmp_size)
                    ? device->AllocWorkspace(ctx, tmp_size_unique)
                    : tmp;  // reuse buffer
-    CUDA_CALL(cub::DeviceSelect::Unique(
+    CUDA_CALL(hipcub::DeviceSelect::Unique(
        tmp_unique, tmp_size_unique, out_begin, unique_begin, num_out_cuda,
        num_out, stream));
    num_out = cuda::GetCUDAScalar(device, ctx, num_out_cuda);
-    num_out = std::min(num_samples, num_out);
+    num_out = ::min(num_samples, num_out);
    result = {
        unique_row.CreateView({num_out}, dtype),
        unique_col.CreateView({num_out}, dtype)};
    if (tmp_unique != tmp) device->FreeWorkspace(ctx, tmp_unique);
  } else {
-    num_out = std::min(num_samples, num_out);
+    num_out = ::min(num_samples, num_out);
    result = {
        out_row.CreateView({num_out}, dtype),
        out_col.CreateView({num_out}, dtype)};
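Side note (not part of the commit): the sampling kernels in this commit switch from the cuRAND device API to hipRAND with the same Philox state type and seeding scheme. A minimal sketch, assuming <hiprand/hiprand_kernel.h>; the kernel name is illustrative:

#include <hip/hip_runtime.h>
#include <hiprand/hiprand_kernel.h>

// One Philox state per thread, seeded the same way as the kernels above:
// seed mixed with the block index, thread index as the subsequence.
__global__ void UniformBitsKernel(
    unsigned int* out, int n, unsigned long long seed) {
  const int tx = blockIdx.x * blockDim.x + threadIdx.x;
  hiprandStatePhilox4_32_10_t rng;
  hiprand_init(seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
  if (tx < n) out[tx] = hiprand(&rng);  // one 32-bit draw per thread
}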
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
  * Copyright (c) 2021 by Contributors
  * @file array/cuda/rowwise_sampling.cu
  * @brief uniform rowwise sampling
  */
-#include <curand_kernel.h>
+#include <hiprand/hiprand_kernel.h>
 #include <dgl/random.h>
 #include <dgl/runtime/device_api.h>
 #include <dgl/runtime/tensordispatch.h>
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
 #include <numeric>
-#include "../../array/cuda/atomic.cuh"
+#include "atomic.cuh"
 #include "../../runtime/cuda/cuda_common.h"
-#include "./utils.h"
+#include "utils.h"
 using namespace dgl::cuda;
 using namespace dgl::aten::cuda;
@@ -126,8 +128,8 @@ __global__ void _CSRRowWiseSampleUniformKernel(
  const int64_t last_row =
      min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
-  curandStatePhilox4_32_10_t rng;
-  curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
+  hiprandStatePhilox4_32_10_t rng;
+  hiprand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
  while (out_row < last_row) {
    const int64_t row = in_rows[out_row];
@@ -151,7 +153,7 @@ __global__ void _CSRRowWiseSampleUniformKernel(
    __syncthreads();
    for (int idx = num_picks + threadIdx.x; idx < deg; idx += BLOCK_SIZE) {
-      const int num = curand(&rng) % (idx + 1);
+      const int num = hiprand(&rng) % (idx + 1);
      if (num < num_picks) {
        // use max so as to achieve the replacement order the serial
        // algorithm would have
@@ -204,8 +206,8 @@ __global__ void _CSRRowWiseSampleUniformReplaceKernel(
  const int64_t last_row =
      min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
-  curandStatePhilox4_32_10_t rng;
-  curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
+  hiprandStatePhilox4_32_10_t rng;
+  hiprand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
  while (out_row < last_row) {
    const int64_t row = in_rows[out_row];
@@ -216,7 +218,7 @@ __global__ void _CSRRowWiseSampleUniformReplaceKernel(
    if (deg > 0) {
      // each thread then blindly copies in rows only if deg > 0.
      for (int idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) {
-        const int64_t edge = curand(&rng) % deg;
+        const int64_t edge = hiprand(&rng) % deg;
        const int64_t out_idx = out_row_start + idx;
        out_rows[out_idx] = row;
        out_cols[out_idx] = in_index[in_row_start + edge];
@@ -237,7 +239,7 @@ COOMatrix _CSRRowWiseSamplingUniform(
    CSRMatrix mat, IdArray rows, const int64_t num_picks, const bool replace) {
  const auto& ctx = rows->ctx;
  auto device = runtime::DeviceAPI::Get(ctx);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  const int64_t num_rows = rows->shape[0];
  const IdType* const slice_rows = static_cast<const IdType*>(rows->data);
@@ -279,16 +281,16 @@ COOMatrix _CSRRowWiseSamplingUniform(
  IdType* out_ptr = static_cast<IdType*>(
      device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType)));
  size_t prefix_temp_size = 0;
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
      nullptr, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream));
  void* prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size);
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
      prefix_temp, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream));
  device->FreeWorkspace(ctx, prefix_temp);
  device->FreeWorkspace(ctx, out_deg);
-  cudaEvent_t copyEvent;
-  CUDA_CALL(cudaEventCreate(&copyEvent));
+  hipEvent_t copyEvent;
+  CUDA_CALL(hipEventCreate(&copyEvent));
  NDArray new_len_tensor;
  if (TensorDispatcher::Global()->IsAvailable()) {
@@ -301,10 +303,10 @@ COOMatrix _CSRRowWiseSamplingUniform(
  }
  // copy using the internal current stream
-  CUDA_CALL(cudaMemcpyAsync(
+  CUDA_CALL(hipMemcpyAsync(
      new_len_tensor->data, out_ptr + num_rows, sizeof(IdType),
-      cudaMemcpyDeviceToHost, stream));
-  CUDA_CALL(cudaEventRecord(copyEvent, stream));
+      hipMemcpyDeviceToHost, stream));
+  CUDA_CALL(hipEventRecord(copyEvent, stream));
  const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000);
@@ -329,8 +331,8 @@ COOMatrix _CSRRowWiseSamplingUniform(
  device->FreeWorkspace(ctx, out_ptr);
  // wait for copying `new_len` to finish
-  CUDA_CALL(cudaEventSynchronize(copyEvent));
-  CUDA_CALL(cudaEventDestroy(copyEvent));
+  CUDA_CALL(hipEventSynchronize(copyEvent));
+  CUDA_CALL(hipEventDestroy(copyEvent));
  const IdType new_len = static_cast<const IdType*>(new_len_tensor->data)[0];
  picked_row = picked_row.CreateView({new_len}, picked_row->dtype);
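Side note (not part of the commit): the event-based readback above (hipMemcpyAsync, hipEventRecord, then hipEventSynchronize) lets the host delay the wait until the sampled length is actually needed. A minimal sketch of that sequence with illustrative names; with pageable host memory the copy may still serialize, while pinned memory keeps it truly asynchronous:

#include <hip/hip_runtime.h>
#include <cstdint>

// Read one int64_t back from the device without blocking immediately.
int64_t ReadBackLength(const int64_t* d_len, hipStream_t stream) {
  int64_t h_len = 0;
  hipEvent_t copy_event;
  hipEventCreate(&copy_event);
  hipMemcpyAsync(&h_len, d_len, sizeof(int64_t), hipMemcpyDeviceToHost, stream);
  hipEventRecord(copy_event, stream);
  // ...other host-side work could overlap here...
  hipEventSynchronize(copy_event);  // wait only when the value is needed
  hipEventDestroy(copy_event);
  return h_len;
}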
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/** /**
* Copyright (c) 2022 by Contributors * Copyright (c) 2022 by Contributors
* @file array/cuda/rowwise_sampling_prob.cu * @file array/cuda/rowwise_sampling_prob.cu
...@@ -6,20 +8,20 @@ ...@@ -6,20 +8,20 @@
* sampling code rowwise_sampling.cu. * sampling code rowwise_sampling.cu.
* @author pengqirong (OPPO), dlasalle and Xin from Nvidia. * @author pengqirong (OPPO), dlasalle and Xin from Nvidia.
*/ */
#include <curand_kernel.h> #include <hiprand/hiprand_kernel.h>
#include <dgl/random.h> #include <dgl/random.h>
#include <dgl/runtime/device_api.h> #include <dgl/runtime/device_api.h>
#include <cub/cub.cuh> #include <hipcub/hipcub.hpp>
#include <numeric> #include <numeric>
#include "../../array/cuda/atomic.cuh" #include "atomic.cuh"
#include "../../runtime/cuda/cuda_common.h" #include "../../runtime/cuda/cuda_common.h"
#include "./utils.h" #include "utils.h"
// require CUB 1.17 to use DeviceSegmentedSort // require CUB 1.17 to use DeviceSegmentedSort
static_assert( // static_assert(
CUB_VERSION >= 101700, "Require CUB >= 1.17 to use DeviceSegmentedSort"); // CUB_VERSION >= 101700, "Require CUB >= 1.17 to use DeviceSegmentedSort");
namespace dgl { namespace dgl {
using namespace cuda; using namespace cuda;
...@@ -159,8 +161,8 @@ __global__ void _CSRAResValueKernel( ...@@ -159,8 +161,8 @@ __global__ void _CSRAResValueKernel(
const int64_t last_row = const int64_t last_row =
min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows); min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
curandStatePhilox4_32_10_t rng; hiprandStatePhilox4_32_10_t rng;
curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng); hiprand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
while (out_row < last_row) { while (out_row < last_row) {
const int64_t row = in_rows[out_row]; const int64_t row = in_rows[out_row];
...@@ -179,7 +181,7 @@ __global__ void _CSRAResValueKernel( ...@@ -179,7 +181,7 @@ __global__ void _CSRAResValueKernel(
prob, data, idx, in_row_start, &item_prob); prob, data, idx, in_row_start, &item_prob);
// compute A-Res value // compute A-Res value
ares[ares_idx] = static_cast<FloatType>( ares[ares_idx] = static_cast<FloatType>(
__powf(curand_uniform(&rng), 1.0f / item_prob)); __powf(hiprand_uniform(&rng), 1.0f / item_prob));
ares_idxs[ares_idx] = static_cast<IdType>(in_idx); ares_idxs[ares_idx] = static_cast<IdType>(in_idx);
} }
} }
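Editor's note: the kernel above computes weighted sampling-without-replacement keys via A-Res — each candidate edge gets the key u^(1/w) with u drawn uniformly from (0, 1], and the num_picks largest keys per row survive a later segmented sort. A self-contained HIP sketch of the key computation using the same hiprand Philox generator the hipified kernel uses (array names, sizes, and the seed are illustrative):

#include <hip/hip_runtime.h>
#include <hiprand/hiprand_kernel.h>
#include <cstdio>

// One thread per candidate: key = u^(1/weight); larger keys win.
__global__ void ares_keys(const float* weights, float* keys, int n,
                          unsigned long long seed) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i >= n) return;
  hiprandStatePhilox4_32_10_t rng;
  hiprand_init(seed, /*subsequence=*/i, /*offset=*/0, &rng);
  float u = hiprand_uniform(&rng);        // uniform in (0, 1]
  keys[i] = powf(u, 1.0f / weights[i]);   // A-Res key
}

int main() {
  const int n = 8;
  float h_w[n] = {1, 2, 3, 4, 5, 6, 7, 8};
  float h_k[n];
  float *d_w, *d_k;
  hipMalloc(&d_w, n * sizeof(float));
  hipMalloc(&d_k, n * sizeof(float));
  hipMemcpy(d_w, h_w, n * sizeof(float), hipMemcpyHostToDevice);
  ares_keys<<<1, 64>>>(d_w, d_k, n, 1234ULL);
  hipMemcpy(h_k, d_k, n * sizeof(float), hipMemcpyDeviceToHost);
  for (int i = 0; i < n; ++i) printf("w=%g key=%g\n", h_w[i], h_k[i]);
  hipFree(d_w); hipFree(d_k);
  return 0;
}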
...@@ -317,8 +319,8 @@ __global__ void _CSRRowWiseSampleReplaceKernel( ...@@ -317,8 +319,8 @@ __global__ void _CSRRowWiseSampleReplaceKernel(
const int64_t last_row = const int64_t last_row =
min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows); min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
curandStatePhilox4_32_10_t rng; hiprandStatePhilox4_32_10_t rng;
curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng); hiprand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
while (out_row < last_row) { while (out_row < last_row) {
const int64_t row = in_rows[out_row]; const int64_t row = in_rows[out_row];
...@@ -330,7 +332,7 @@ __global__ void _CSRRowWiseSampleReplaceKernel( ...@@ -330,7 +332,7 @@ __global__ void _CSRRowWiseSampleReplaceKernel(
if (deg > 0) { if (deg > 0) {
// Specialize BlockScan for a 1D block of BLOCK_SIZE threads // Specialize BlockScan for a 1D block of BLOCK_SIZE threads
typedef cub::BlockScan<FloatType, BLOCK_SIZE> BlockScan; typedef hipcub::BlockScan<FloatType, BLOCK_SIZE> BlockScan;
// Allocate shared memory for BlockScan // Allocate shared memory for BlockScan
__shared__ typename BlockScan::TempStorage temp_storage; __shared__ typename BlockScan::TempStorage temp_storage;
// Initialize running total // Initialize running total
...@@ -362,10 +364,10 @@ __global__ void _CSRRowWiseSampleReplaceKernel( ...@@ -362,10 +364,10 @@ __global__ void _CSRRowWiseSampleReplaceKernel(
for (int64_t idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) { for (int64_t idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) {
// get random value // get random value
FloatType sum = cdf[cdf_row_start + deg - 1]; FloatType sum = cdf[cdf_row_start + deg - 1];
FloatType rand = static_cast<FloatType>(curand_uniform(&rng) * sum); FloatType rand = static_cast<FloatType>(hiprand_uniform(&rng) * sum);
// get the offset of the first value within cdf array which is greater // get the offset of the first value within cdf array which is greater
// than random value. // than random value.
int64_t item = cub::UpperBound<FloatType*, int64_t, FloatType>( int64_t item = hipcub::UpperBound<FloatType*, int64_t, FloatType>(
&cdf[cdf_row_start], deg, rand); &cdf[cdf_row_start], deg, rand);
item = min(item, deg - 1); item = min(item, deg - 1);
// get in and out index // get in and out index
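Editor's note: for sampling with replacement, the hunk above builds a per-row CDF with BlockScan and then, for each pick, draws rand in [0, sum) and locates the first CDF entry strictly greater than rand with an UpperBound search. The same logic on the host, as a minimal C++ sketch (data is illustrative):

#include <algorithm>
#include <cstdio>
#include <numeric>
#include <random>
#include <vector>

int main() {
  // Unnormalized edge probabilities for one row.
  std::vector<double> prob = {0.1, 0.5, 0.2, 0.2};
  // Inclusive prefix sum -> CDF (what BlockScan computes per row on the GPU).
  std::vector<double> cdf(prob.size());
  std::partial_sum(prob.begin(), prob.end(), cdf.begin());
  const double sum = cdf.back();

  std::mt19937 rng(1234);
  std::uniform_real_distribution<double> uni(0.0, 1.0);
  const int num_picks = 5;
  for (int i = 0; i < num_picks; ++i) {
    double r = uni(rng) * sum;
    // Index of the first CDF entry greater than r (the UpperBound step above).
    auto it = std::upper_bound(cdf.begin(), cdf.end(), r);
    long item = std::min<long>(it - cdf.begin(), (long)cdf.size() - 1);
    printf("pick %d -> local column index %ld\n", i, item);
  }
  return 0;
}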
...@@ -400,18 +402,20 @@ COOMatrix COOGeneralRemoveIf(const COOMatrix& coo, MaskGen maskgen) { ...@@ -400,18 +402,20 @@ COOMatrix COOGeneralRemoveIf(const COOMatrix& coo, MaskGen maskgen) {
const auto idtype = coo.row->dtype; const auto idtype = coo.row->dtype;
const auto ctx = coo.row->ctx; const auto ctx = coo.row->ctx;
const int64_t nnz = coo.row->shape[0]; const int64_t nnz = coo.row->shape[0];
const IdType* row = coo.row.Ptr<IdType>(); // const IdType* row = coo.row.Ptr<IdType>();
const IdType* col = coo.col.Ptr<IdType>(); const IdType* row = static_cast<IdType*>(GetDevicePointer(coo.row));
// const IdType* col = coo.col.Ptr<IdType>();
const IdType* col = static_cast<IdType*>(GetDevicePointer(coo.col));
const IdArray& eid = const IdArray& eid =
COOHasData(coo) ? coo.data : Range(0, nnz, sizeof(IdType) * 8, ctx); COOHasData(coo) ? coo.data : Range(0, nnz, sizeof(IdType) * 8, ctx);
const IdType* data = coo.data.Ptr<IdType>(); const IdType* data = static_cast<IdType*>(GetDevicePointer(coo.data));
IdArray new_row = IdArray::Empty({nnz}, idtype, ctx); IdArray new_row = IdArray::Empty({nnz}, idtype, ctx);
IdArray new_col = IdArray::Empty({nnz}, idtype, ctx); IdArray new_col = IdArray::Empty({nnz}, idtype, ctx);
IdArray new_eid = IdArray::Empty({nnz}, idtype, ctx); IdArray new_eid = IdArray::Empty({nnz}, idtype, ctx);
IdType* new_row_data = new_row.Ptr<IdType>(); IdType* new_row_data = new_row.Ptr<IdType>();
IdType* new_col_data = new_col.Ptr<IdType>(); IdType* new_col_data = new_col.Ptr<IdType>();
IdType* new_eid_data = new_eid.Ptr<IdType>(); IdType* new_eid_data = new_eid.Ptr<IdType>();
auto stream = runtime::getCurrentCUDAStream(); auto stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
auto device = runtime::DeviceAPI::Get(ctx); auto device = runtime::DeviceAPI::Get(ctx);
int8_t* flags = static_cast<int8_t*>(device->AllocWorkspace(ctx, nnz)); int8_t* flags = static_cast<int8_t*>(device->AllocWorkspace(ctx, nnz));
...@@ -439,9 +443,10 @@ COOMatrix COOGeneralRemoveIf(const COOMatrix& coo, MaskGen maskgen) { ...@@ -439,9 +443,10 @@ COOMatrix COOGeneralRemoveIf(const COOMatrix& coo, MaskGen maskgen) {
template <DGLDeviceType XPU, typename IdType, typename DType> template <DGLDeviceType XPU, typename IdType, typename DType>
COOMatrix _COORemoveIf( COOMatrix _COORemoveIf(
const COOMatrix& coo, const NDArray& values, DType criteria) { const COOMatrix& coo, const NDArray& values, DType criteria) {
const DType* val = values.Ptr<DType>(); // const DType* val = values.Ptr<DType>();
const DType* val = static_cast<DType*>(GetDevicePointer(values));
auto maskgen = [val, criteria]( auto maskgen = [val, criteria](
int nb, int nt, cudaStream_t stream, int64_t nnz, int nb, int nt, hipStream_t stream, int64_t nnz,
const IdType* data, int8_t* flags) { const IdType* data, int8_t* flags) {
CUDA_KERNEL_CALL( CUDA_KERNEL_CALL(
(_GenerateFlagsKernel<IdType, DType, int8_t>), nb, nt, 0, stream, nnz, (_GenerateFlagsKernel<IdType, DType, int8_t>), nb, nt, 0, stream, nnz,
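Editor's note: COOGeneralRemoveIf materializes a byte mask (flags) on the device with the maskgen kernel and then compacts row/col/eid by that mask. A host-side C++ sketch of the same flag-and-compact idea (the GPU path does the compaction with a prefix sum and scatter; names and data here are illustrative):

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // A tiny COO matrix and per-edge values; remove edges whose value == criteria.
  std::vector<int64_t> row = {0, 0, 1, 2};
  std::vector<int64_t> col = {1, 2, 2, 0};
  std::vector<int64_t> eid = {0, 1, 2, 3};
  std::vector<float> values = {1.f, 0.f, 3.f, 0.f};
  const float criteria = 0.f;

  // Step 1: generate flags (1 = keep), the job of _GenerateFlagsKernel.
  std::vector<int8_t> flags(row.size());
  for (size_t i = 0; i < row.size(); ++i)
    flags[i] = (values[i] != criteria) ? 1 : 0;

  // Step 2: compact row/col/eid by the flags.
  std::vector<int64_t> new_row, new_col, new_eid;
  for (size_t i = 0; i < row.size(); ++i) {
    if (flags[i]) {
      new_row.push_back(row[i]);
      new_col.push_back(col[i]);
      new_eid.push_back(eid[i]);
    }
  }
  for (size_t i = 0; i < new_row.size(); ++i)
    printf("(%lld, %lld) eid=%lld\n", (long long)new_row[i],
           (long long)new_col[i], (long long)new_eid[i]);
  return 0;
}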
...@@ -481,7 +486,7 @@ COOMatrix _CSRRowWiseSampling( ...@@ -481,7 +486,7 @@ COOMatrix _CSRRowWiseSampling(
const FloatArray& prob, bool replace) { const FloatArray& prob, bool replace) {
const auto& ctx = rows->ctx; const auto& ctx = rows->ctx;
auto device = runtime::DeviceAPI::Get(ctx); auto device = runtime::DeviceAPI::Get(ctx);
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const int64_t num_rows = rows->shape[0]; const int64_t num_rows = rows->shape[0];
const IdType* const slice_rows = static_cast<const IdType*>(rows->data); const IdType* const slice_rows = static_cast<const IdType*>(rows->data);
...@@ -530,10 +535,10 @@ COOMatrix _CSRRowWiseSampling( ...@@ -530,10 +535,10 @@ COOMatrix _CSRRowWiseSampling(
IdType* temp_ptr = static_cast<IdType*>( IdType* temp_ptr = static_cast<IdType*>(
device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType))); device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType)));
size_t prefix_temp_size = 0; size_t prefix_temp_size = 0;
CUDA_CALL(cub::DeviceScan::ExclusiveSum( CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
nullptr, prefix_temp_size, temp_deg, temp_ptr, num_rows + 1, stream)); nullptr, prefix_temp_size, temp_deg, temp_ptr, num_rows + 1, stream));
void* prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size); void* prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size);
CUDA_CALL(cub::DeviceScan::ExclusiveSum( CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
prefix_temp, prefix_temp_size, temp_deg, temp_ptr, num_rows + 1, stream)); prefix_temp, prefix_temp_size, temp_deg, temp_ptr, num_rows + 1, stream));
device->FreeWorkspace(ctx, prefix_temp); device->FreeWorkspace(ctx, prefix_temp);
device->FreeWorkspace(ctx, temp_deg); device->FreeWorkspace(ctx, temp_deg);
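Editor's note: the degree-to-offset conversion above follows the standard two-call CUB/hipCUB pattern — the first DeviceScan::ExclusiveSum call with a null temp pointer only reports the workspace size, the second performs the scan. A minimal runnable HIP sketch of that pattern (assuming a recent hipCUB; buffer names are illustrative):

#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>
#include <cstdio>

int main() {
  const int n = 5;
  int h_deg[n + 1] = {3, 1, 0, 2, 4, 0};  // trailing pad so the scan tail is the total
  int h_ptr[n + 1];
  int *d_deg, *d_ptr;
  hipMalloc(&d_deg, (n + 1) * sizeof(int));
  hipMalloc(&d_ptr, (n + 1) * sizeof(int));
  hipMemcpy(d_deg, h_deg, (n + 1) * sizeof(int), hipMemcpyHostToDevice);

  // Call 1: query workspace size (d_temp_storage == nullptr).
  void* d_temp_storage = nullptr;
  size_t temp_storage_bytes = 0;
  hipcub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes,
                                   d_deg, d_ptr, n + 1);
  hipMalloc(&d_temp_storage, temp_storage_bytes);
  // Call 2: run the exclusive scan -> CSR-style row offsets.
  hipcub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes,
                                   d_deg, d_ptr, n + 1);
  hipMemcpy(h_ptr, d_ptr, (n + 1) * sizeof(int), hipMemcpyDeviceToHost);
  for (int i = 0; i <= n; ++i) printf("ptr[%d] = %d\n", i, h_ptr[i]);

  hipFree(d_temp_storage); hipFree(d_deg); hipFree(d_ptr);
  return 0;
}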
...@@ -551,16 +556,16 @@ COOMatrix _CSRRowWiseSampling( ...@@ -551,16 +556,16 @@ COOMatrix _CSRRowWiseSampling(
IdType* out_ptr = static_cast<IdType*>( IdType* out_ptr = static_cast<IdType*>(
device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType))); device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType)));
prefix_temp_size = 0; prefix_temp_size = 0;
CUDA_CALL(cub::DeviceScan::ExclusiveSum( CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
nullptr, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream)); nullptr, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream));
prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size); prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size);
CUDA_CALL(cub::DeviceScan::ExclusiveSum( CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
prefix_temp, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream)); prefix_temp, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream));
device->FreeWorkspace(ctx, prefix_temp); device->FreeWorkspace(ctx, prefix_temp);
device->FreeWorkspace(ctx, out_deg); device->FreeWorkspace(ctx, out_deg);
cudaEvent_t copyEvent; hipEvent_t copyEvent;
CUDA_CALL(cudaEventCreate(&copyEvent)); CUDA_CALL(hipEventCreate(&copyEvent));
// TODO(dlasalle): use pinned memory to overlap with the actual sampling, and // TODO(dlasalle): use pinned memory to overlap with the actual sampling, and
// wait on a cudaevent // wait on a cudaevent
IdType new_len; IdType new_len;
...@@ -568,7 +573,7 @@ COOMatrix _CSRRowWiseSampling( ...@@ -568,7 +573,7 @@ COOMatrix _CSRRowWiseSampling(
device->CopyDataFromTo( device->CopyDataFromTo(
out_ptr, num_rows * sizeof(new_len), &new_len, 0, sizeof(new_len), ctx, out_ptr, num_rows * sizeof(new_len), &new_len, 0, sizeof(new_len), ctx,
DGLContext{kDGLCPU, 0}, mat.indptr->dtype); DGLContext{kDGLCPU, 0}, mat.indptr->dtype);
CUDA_CALL(cudaEventRecord(copyEvent, stream)); CUDA_CALL(hipEventRecord(copyEvent, stream));
// allocate workspace // allocate workspace
// 1) for w/ replacement, it's a global buffer to store cdf segments (one // 1) for w/ replacement, it's a global buffer to store cdf segments (one
...@@ -612,16 +617,16 @@ COOMatrix _CSRRowWiseSampling( ...@@ -612,16 +617,16 @@ COOMatrix _CSRRowWiseSampling(
IdType* sort_temp_idxs = static_cast<IdType*>( IdType* sort_temp_idxs = static_cast<IdType*>(
device->AllocWorkspace(ctx, temp_len * sizeof(IdType))); device->AllocWorkspace(ctx, temp_len * sizeof(IdType)));
cub::DoubleBuffer<FloatType> sort_keys(temp, sort_temp); hipcub::DoubleBuffer<FloatType> sort_keys(temp, sort_temp);
cub::DoubleBuffer<IdType> sort_values(temp_idxs, sort_temp_idxs); hipcub::DoubleBuffer<IdType> sort_values(temp_idxs, sort_temp_idxs);
void* d_temp_storage = nullptr; void* d_temp_storage = nullptr;
size_t temp_storage_bytes = 0; size_t temp_storage_bytes = 0;
CUDA_CALL(cub::DeviceSegmentedSort::SortPairsDescending( CUDA_CALL(hipcub::DeviceSegmentedSort::SortPairsDescending(
d_temp_storage, temp_storage_bytes, sort_keys, sort_values, temp_len, d_temp_storage, temp_storage_bytes, sort_keys, sort_values, temp_len,
num_rows, temp_ptr, temp_ptr + 1, stream)); num_rows, temp_ptr, temp_ptr + 1, stream));
d_temp_storage = device->AllocWorkspace(ctx, temp_storage_bytes); d_temp_storage = device->AllocWorkspace(ctx, temp_storage_bytes);
CUDA_CALL(cub::DeviceSegmentedSort::SortPairsDescending( CUDA_CALL(hipcub::DeviceSegmentedSort::SortPairsDescending(
d_temp_storage, temp_storage_bytes, sort_keys, sort_values, temp_len, d_temp_storage, temp_storage_bytes, sort_keys, sort_values, temp_len,
num_rows, temp_ptr, temp_ptr + 1, stream)); num_rows, temp_ptr, temp_ptr + 1, stream));
device->FreeWorkspace(ctx, d_temp_storage); device->FreeWorkspace(ctx, d_temp_storage);
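Editor's note: DeviceSegmentedSort::SortPairsDescending above sorts each row's A-Res keys (paired with edge indices) in descending order, using the row offsets temp_ptr / temp_ptr + 1 as segment begin/end iterators; the top num_picks entries of each sorted segment are then gathered. The per-segment behaviour, sketched on the host (illustrative data):

#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  // Keys (A-Res values) and values (edge ids) for two segments/rows.
  std::vector<float> keys = {0.2f, 0.9f, 0.5f, /*row 1*/ 0.7f, 0.1f};
  std::vector<int> vals = {0, 1, 2, 3, 4};
  std::vector<int> offsets = {0, 3, 5};  // begin = offsets, end = offsets + 1

  for (size_t s = 0; s + 1 < offsets.size(); ++s) {
    int begin = offsets[s], end = offsets[s + 1];
    std::vector<int> order(end - begin);
    std::iota(order.begin(), order.end(), begin);
    // Descending by key within the segment, like SortPairsDescending.
    std::sort(order.begin(), order.end(),
              [&](int a, int b) { return keys[a] > keys[b]; });
    printf("segment %zu:", s);
    for (int i : order) printf(" (key=%.1f, eid=%d)", keys[i], vals[i]);
    printf("\n");
  }
  return 0;
}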
...@@ -641,8 +646,8 @@ COOMatrix _CSRRowWiseSampling( ...@@ -641,8 +646,8 @@ COOMatrix _CSRRowWiseSampling(
device->FreeWorkspace(ctx, out_ptr); device->FreeWorkspace(ctx, out_ptr);
// wait for copying `new_len` to finish // wait for copying `new_len` to finish
CUDA_CALL(cudaEventSynchronize(copyEvent)); CUDA_CALL(hipEventSynchronize(copyEvent));
CUDA_CALL(cudaEventDestroy(copyEvent)); CUDA_CALL(hipEventDestroy(copyEvent));
picked_row = picked_row.CreateView({new_len}, picked_row->dtype); picked_row = picked_row.CreateView({new_len}, picked_row->dtype);
picked_col = picked_col.CreateView({new_len}, picked_col->dtype); picked_col = picked_col.CreateView({new_len}, picked_col->dtype);
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/** /**
* Copyright (c) 2020 by Contributors * Copyright (c) 2020 by Contributors
* @file array/cuda/sddmm.cuh * @file array/cuda/sddmm.cuh
...@@ -10,8 +12,8 @@ ...@@ -10,8 +12,8 @@
#include "../../runtime/cuda/cuda_common.h" #include "../../runtime/cuda/cuda_common.h"
#include "../selector.h" #include "../selector.h"
#include "./functor.cuh" #include "functor.cuh"
#include "./utils.h" #include "utils.h"
#include "atomic.cuh" #include "atomic.cuh"
#include "bf16.cuh" #include "bf16.cuh"
#include "fp16.cuh" #include "fp16.cuh"
...@@ -178,7 +180,7 @@ __global__ void SDDMMCooTreeReduceKernel( ...@@ -178,7 +180,7 @@ __global__ void SDDMMCooTreeReduceKernel(
} }
#pragma unroll #pragma unroll
for (int offset = 16; offset > 0; offset /= 2) for (int offset = 16; offset > 0; offset /= 2)
val += __shfl_down_sync(full_mask, val, offset); val += __shfl_down(val, offset);
if (tx == 0) outoff[i] = val; if (tx == 0) outoff[i] = val;
} }
} }
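Editor's note: the SDDMM tree-reduce hunk above replaces __shfl_down_sync(full_mask, val, offset) with HIP's __shfl_down(val, offset), which takes no mask; on AMD GPUs the wavefront is 64 lanes wide, so warpSize-based loops are the portable form. A generic warp-sum kernel illustrating the HIP intrinsic (this is a sketch, not the DGL kernel; it assumes the block is exactly one wavefront wide):

#include <hip/hip_runtime.h>
#include <cstdio>

// Sums one wavefront's values with shuffle-down; lane 0 holds the result.
__global__ void warp_sum(const float* in, float* out) {
  float val = in[threadIdx.x];
  for (int offset = warpSize / 2; offset > 0; offset /= 2)
    val += __shfl_down(val, offset);
  if (threadIdx.x == 0) *out = val;
}

int main() {
  int wsz = 64;  // fallback if the attribute query fails
  hipDeviceGetAttribute(&wsz, hipDeviceAttributeWarpSize, 0);
  float* h_in = new float[wsz];
  for (int i = 0; i < wsz; ++i) h_in[i] = 1.0f;
  float *d_in, *d_out, h_out = 0.f;
  hipMalloc(&d_in, wsz * sizeof(float));
  hipMalloc(&d_out, sizeof(float));
  hipMemcpy(d_in, h_in, wsz * sizeof(float), hipMemcpyHostToDevice);
  warp_sum<<<1, wsz>>>(d_in, d_out);
  hipMemcpy(&h_out, d_out, sizeof(float), hipMemcpyDeviceToHost);
  printf("warpSize=%d sum=%g\n", wsz, h_out);  // expect sum == warpSize
  hipFree(d_in); hipFree(d_out); delete[] h_in;
  return 0;
}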
...@@ -275,7 +277,7 @@ void SDDMMCoo( ...@@ -275,7 +277,7 @@ void SDDMMCoo(
const DType* lhs_data = lhs.Ptr<DType>(); const DType* lhs_data = lhs.Ptr<DType>();
const DType* rhs_data = rhs.Ptr<DType>(); const DType* rhs_data = rhs.Ptr<DType>();
DType* out_data = out.Ptr<DType>(); DType* out_data = out.Ptr<DType>();
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int64_t *lhs_off = nullptr, *rhs_off = nullptr; int64_t *lhs_off = nullptr, *rhs_off = nullptr;
int64_t len = bcast.out_len, lhs_len = bcast.lhs_len, rhs_len = bcast.rhs_len; int64_t len = bcast.out_len, lhs_len = bcast.lhs_len, rhs_len = bcast.rhs_len;
...@@ -337,7 +339,7 @@ void SDDMMCsr( ...@@ -337,7 +339,7 @@ void SDDMMCsr(
const DType* lhs_data = lhs.Ptr<DType>(); const DType* lhs_data = lhs.Ptr<DType>();
const DType* rhs_data = rhs.Ptr<DType>(); const DType* rhs_data = rhs.Ptr<DType>();
DType* out_data = out.Ptr<DType>(); DType* out_data = out.Ptr<DType>();
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int64_t N = csr.num_rows, M = csr.num_cols, E = csr.indices->shape[0]; int64_t N = csr.num_rows, M = csr.num_cols, E = csr.indices->shape[0];
int64_t *lhs_off = nullptr, *rhs_off = nullptr; int64_t *lhs_off = nullptr, *rhs_off = nullptr;
......
// !!! This is a file automatically generated by hipify!!!
/** /**
* Copyright (c) 2020 by Contributors * Copyright (c) 2020 by Contributors
* @file array/cuda/sddmm.cu * @file array/cuda/sddmm.cu
...@@ -5,8 +6,8 @@ ...@@ -5,8 +6,8 @@
*/ */
#include <dgl/array.h> #include <dgl/array.h>
#include "./functor.cuh" #include "functor.cuh"
#include "./sddmm.cuh" #include "sddmm.cuh"
namespace dgl { namespace dgl {
namespace aten { namespace aten {
...@@ -48,10 +49,10 @@ template void SDDMMCsr<kDGLCUDA, int64_t, __half>( ...@@ -48,10 +49,10 @@ template void SDDMMCsr<kDGLCUDA, int64_t, __half>(
const std::string& op, const BcastOff& bcast, const CSRMatrix& csr, const std::string& op, const BcastOff& bcast, const CSRMatrix& csr,
NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target);
#if BF16_ENABLED #if BF16_ENABLED
template void SDDMMCsr<kDGLCUDA, int32_t, __nv_bfloat16>( template void SDDMMCsr<kDGLCUDA, int32_t, __hip_bfloat16>(
const std::string& op, const BcastOff& bcast, const CSRMatrix& csr, const std::string& op, const BcastOff& bcast, const CSRMatrix& csr,
NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target);
template void SDDMMCsr<kDGLCUDA, int64_t, __nv_bfloat16>( template void SDDMMCsr<kDGLCUDA, int64_t, __hip_bfloat16>(
const std::string& op, const BcastOff& bcast, const CSRMatrix& csr, const std::string& op, const BcastOff& bcast, const CSRMatrix& csr,
NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target);
#endif // BF16_ENABLED #endif // BF16_ENABLED
...@@ -75,10 +76,10 @@ template void SDDMMCoo<kDGLCUDA, int64_t, __half>( ...@@ -75,10 +76,10 @@ template void SDDMMCoo<kDGLCUDA, int64_t, __half>(
const std::string& op, const BcastOff& bcast, const COOMatrix& coo, const std::string& op, const BcastOff& bcast, const COOMatrix& coo,
NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target);
#if BF16_ENABLED #if BF16_ENABLED
template void SDDMMCoo<kDGLCUDA, int32_t, __nv_bfloat16>( template void SDDMMCoo<kDGLCUDA, int32_t, __hip_bfloat16>(
const std::string& op, const BcastOff& bcast, const COOMatrix& coo, const std::string& op, const BcastOff& bcast, const COOMatrix& coo,
NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target);
template void SDDMMCoo<kDGLCUDA, int64_t, __nv_bfloat16>( template void SDDMMCoo<kDGLCUDA, int64_t, __hip_bfloat16>(
const std::string& op, const BcastOff& bcast, const COOMatrix& coo, const std::string& op, const BcastOff& bcast, const COOMatrix& coo,
NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target);
#endif // BF16_ENABLED #endif // BF16_ENABLED
......
// !!! This is a file automatically generated by hipify!!!
/** /**
* Copyright (c) 2020 by Contributors * Copyright (c) 2020 by Contributors
* @file array/cuda/sddmm.cu * @file array/cuda/sddmm.cu
...@@ -5,7 +6,7 @@ ...@@ -5,7 +6,7 @@
*/ */
#include <dgl/array.h> #include <dgl/array.h>
#include "./sddmm.cuh" #include "sddmm.cuh"
namespace dgl { namespace dgl {
namespace aten { namespace aten {
...@@ -49,13 +50,13 @@ template void SDDMMCooHetero<kDGLCUDA, int64_t, __half>( ...@@ -49,13 +50,13 @@ template void SDDMMCooHetero<kDGLCUDA, int64_t, __half>(
int rhs_target, const std::vector<dgl_type_t>& in_eid, int rhs_target, const std::vector<dgl_type_t>& in_eid,
const std::vector<dgl_type_t>& out_eid); const std::vector<dgl_type_t>& out_eid);
#if BF16_ENABLED #if BF16_ENABLED
template void SDDMMCooHetero<kDGLCUDA, int32_t, __nv_bfloat16>( template void SDDMMCooHetero<kDGLCUDA, int32_t, __hip_bfloat16>(
const std::string& op, const BcastOff& bcast, const std::string& op, const BcastOff& bcast,
const std::vector<COOMatrix>& vec_coo, const std::vector<NDArray>& lhs, const std::vector<COOMatrix>& vec_coo, const std::vector<NDArray>& lhs,
const std::vector<NDArray>& rhs, std::vector<NDArray> out, int lhs_target, const std::vector<NDArray>& rhs, std::vector<NDArray> out, int lhs_target,
int rhs_target, const std::vector<dgl_type_t>& in_eid, int rhs_target, const std::vector<dgl_type_t>& in_eid,
const std::vector<dgl_type_t>& out_eid); const std::vector<dgl_type_t>& out_eid);
template void SDDMMCooHetero<kDGLCUDA, int64_t, __nv_bfloat16>( template void SDDMMCooHetero<kDGLCUDA, int64_t, __hip_bfloat16>(
const std::string& op, const BcastOff& bcast, const std::string& op, const BcastOff& bcast,
const std::vector<COOMatrix>& vec_coo, const std::vector<NDArray>& lhs, const std::vector<COOMatrix>& vec_coo, const std::vector<NDArray>& lhs,
const std::vector<NDArray>& rhs, std::vector<NDArray> out, int lhs_target, const std::vector<NDArray>& rhs, std::vector<NDArray> out, int lhs_target,
......
// !!! This is a file automatically generated by hipify!!!
/** /**
* Copyright (c) 2020 by Contributors * Copyright (c) 2020 by Contributors
* @file array/cuda/sddmm.cu * @file array/cuda/sddmm.cu
...@@ -5,7 +6,7 @@ ...@@ -5,7 +6,7 @@
*/ */
#include <dgl/array.h> #include <dgl/array.h>
#include "./sddmm.cuh" #include "sddmm.cuh"
namespace dgl { namespace dgl {
namespace aten { namespace aten {
...@@ -48,13 +49,13 @@ template void SDDMMCsrHetero<kDGLCUDA, int64_t, __half>( ...@@ -48,13 +49,13 @@ template void SDDMMCsrHetero<kDGLCUDA, int64_t, __half>(
int rhs_target, const std::vector<dgl_type_t>& in_eid, int rhs_target, const std::vector<dgl_type_t>& in_eid,
const std::vector<dgl_type_t>& out_eid); const std::vector<dgl_type_t>& out_eid);
#if BF16_ENABLED #if BF16_ENABLED
template void SDDMMCsrHetero<kDGLCUDA, int32_t, __nv_bfloat16>( template void SDDMMCsrHetero<kDGLCUDA, int32_t, __hip_bfloat16>(
const std::string& op, const BcastOff& bcast, const std::string& op, const BcastOff& bcast,
const std::vector<CSRMatrix>& vec_csr, const std::vector<NDArray>& lhs, const std::vector<CSRMatrix>& vec_csr, const std::vector<NDArray>& lhs,
const std::vector<NDArray>& rhs, std::vector<NDArray> out, int lhs_target, const std::vector<NDArray>& rhs, std::vector<NDArray> out, int lhs_target,
int rhs_target, const std::vector<dgl_type_t>& in_eid, int rhs_target, const std::vector<dgl_type_t>& in_eid,
const std::vector<dgl_type_t>& out_eid); const std::vector<dgl_type_t>& out_eid);
template void SDDMMCsrHetero<kDGLCUDA, int64_t, __nv_bfloat16>( template void SDDMMCsrHetero<kDGLCUDA, int64_t, __hip_bfloat16>(
const std::string& op, const BcastOff& bcast, const std::string& op, const BcastOff& bcast,
const std::vector<CSRMatrix>& vec_csr, const std::vector<NDArray>& lhs, const std::vector<CSRMatrix>& vec_csr, const std::vector<NDArray>& lhs,
const std::vector<NDArray>& rhs, std::vector<NDArray> out, int lhs_target, const std::vector<NDArray>& rhs, std::vector<NDArray> out, int lhs_target,
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/** /**
* Copyright (c) 2020 by Contributors * Copyright (c) 2020 by Contributors
* @file array/cuda/segment_reduce.cuh * @file array/cuda/segment_reduce.cuh
...@@ -10,8 +12,8 @@ ...@@ -10,8 +12,8 @@
#include <vector> #include <vector>
#include "../../runtime/cuda/cuda_common.h" #include "../../runtime/cuda/cuda_common.h"
#include "./atomic.cuh" #include "atomic.cuh"
#include "./utils.h" #include "utils.h"
namespace dgl { namespace dgl {
...@@ -125,7 +127,7 @@ void SegmentReduce(NDArray feat, NDArray offsets, NDArray out, NDArray arg) { ...@@ -125,7 +127,7 @@ void SegmentReduce(NDArray feat, NDArray offsets, NDArray out, NDArray arg) {
DType* out_data = out.Ptr<DType>(); DType* out_data = out.Ptr<DType>();
IdType* arg_data = arg.Ptr<IdType>(); IdType* arg_data = arg.Ptr<IdType>();
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int64_t n = out->shape[0]; int64_t n = out->shape[0];
int64_t dim = 1; int64_t dim = 1;
for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i]; for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i];
...@@ -155,7 +157,7 @@ void ScatterAdd(NDArray feat, NDArray idx, NDArray out) { ...@@ -155,7 +157,7 @@ void ScatterAdd(NDArray feat, NDArray idx, NDArray out) {
const IdType* idx_data = idx.Ptr<IdType>(); const IdType* idx_data = idx.Ptr<IdType>();
DType* out_data = out.Ptr<DType>(); DType* out_data = out.Ptr<DType>();
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int64_t n = feat->shape[0]; int64_t n = feat->shape[0];
int64_t dim = 1; int64_t dim = 1;
for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i]; for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i];
...@@ -186,7 +188,7 @@ void UpdateGradMinMax_hetero( ...@@ -186,7 +188,7 @@ void UpdateGradMinMax_hetero(
const std::vector<NDArray>& list_feat, const std::vector<NDArray>& list_idx, const std::vector<NDArray>& list_feat, const std::vector<NDArray>& list_idx,
const std::vector<NDArray>& list_idx_types, const std::vector<NDArray>& list_idx_types,
std::vector<NDArray>* list_out) { std::vector<NDArray>* list_out) {
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
if (op == "copy_lhs" || op == "copy_rhs") { if (op == "copy_lhs" || op == "copy_rhs") {
std::vector<std::vector<dgl_id_t>> src_dst_ntypes( std::vector<std::vector<dgl_id_t>> src_dst_ntypes(
graph->NumVertexTypes(), std::vector<dgl_id_t>()); graph->NumVertexTypes(), std::vector<dgl_id_t>());
...@@ -239,7 +241,7 @@ void BackwardSegmentCmp(NDArray feat, NDArray arg, NDArray out) { ...@@ -239,7 +241,7 @@ void BackwardSegmentCmp(NDArray feat, NDArray arg, NDArray out) {
const IdType* arg_data = arg.Ptr<IdType>(); const IdType* arg_data = arg.Ptr<IdType>();
DType* out_data = out.Ptr<DType>(); DType* out_data = out.Ptr<DType>();
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int64_t n = feat->shape[0]; int64_t n = feat->shape[0];
int64_t dim = 1; int64_t dim = 1;
for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i]; for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i];
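Editor's note: SegmentReduce reduces feat rows into out according to an offsets array — out[i] aggregates feat[offsets[i]:offsets[i+1]] across the feature dimension, with arg recording positions for the min/max reducers. A host-side sketch of the sum case (shapes and values are illustrative):

#include <cstdio>
#include <vector>

int main() {
  // feat: 5 rows x 2 features; offsets define 3 segments of rows.
  std::vector<float> feat = {1, 1, 2, 2, 3, 3, 4, 4, 5, 5};
  std::vector<int> offsets = {0, 2, 2, 5};  // segment 1 is empty
  const int dim = 2;
  std::vector<float> out((offsets.size() - 1) * dim, 0.f);

  for (size_t seg = 0; seg + 1 < offsets.size(); ++seg)
    for (int row = offsets[seg]; row < offsets[seg + 1]; ++row)
      for (int d = 0; d < dim; ++d)
        out[seg * dim + d] += feat[row * dim + d];

  for (size_t seg = 0; seg + 1 < offsets.size(); ++seg)
    printf("out[%zu] = (%g, %g)\n", seg, out[seg * dim], out[seg * dim + 1]);
  return 0;
}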
......
// !!! This is a file automatically generated by hipify!!!
/** /**
* Copyright (c) 2020 by Contributors * Copyright (c) 2020 by Contributors
* @file array/cuda/segment_reduce.cu * @file array/cuda/segment_reduce.cu
...@@ -6,9 +7,9 @@ ...@@ -6,9 +7,9 @@
#include <dgl/array.h> #include <dgl/array.h>
#include <dgl/base_heterograph.h> #include <dgl/base_heterograph.h>
#include "./functor.cuh" #include "functor.cuh"
#include "./segment_reduce.cuh" #include "segment_reduce.cuh"
#include "./utils.h" #include "utils.h"
namespace dgl { namespace dgl {
...@@ -60,10 +61,10 @@ template void SegmentReduce<kDGLCUDA, int64_t, __half>( ...@@ -60,10 +61,10 @@ template void SegmentReduce<kDGLCUDA, int64_t, __half>(
const std::string& op, NDArray feat, NDArray offsets, NDArray out, const std::string& op, NDArray feat, NDArray offsets, NDArray out,
NDArray arg); NDArray arg);
#if BF16_ENABLED #if BF16_ENABLED
template void SegmentReduce<kDGLCUDA, int32_t, __nv_bfloat16>( template void SegmentReduce<kDGLCUDA, int32_t, __hip_bfloat16>(
const std::string& op, NDArray feat, NDArray offsets, NDArray out, const std::string& op, NDArray feat, NDArray offsets, NDArray out,
NDArray arg); NDArray arg);
template void SegmentReduce<kDGLCUDA, int64_t, __nv_bfloat16>( template void SegmentReduce<kDGLCUDA, int64_t, __hip_bfloat16>(
const std::string& op, NDArray feat, NDArray offsets, NDArray out, const std::string& op, NDArray feat, NDArray offsets, NDArray out,
NDArray arg); NDArray arg);
#endif // BF16_ENABLED #endif // BF16_ENABLED
...@@ -85,9 +86,9 @@ template void ScatterAdd<kDGLCUDA, int32_t, __half>( ...@@ -85,9 +86,9 @@ template void ScatterAdd<kDGLCUDA, int32_t, __half>(
template void ScatterAdd<kDGLCUDA, int64_t, __half>( template void ScatterAdd<kDGLCUDA, int64_t, __half>(
NDArray feat, NDArray idx, NDArray out); NDArray feat, NDArray idx, NDArray out);
#if BF16_ENABLED #if BF16_ENABLED
template void ScatterAdd<kDGLCUDA, int32_t, __nv_bfloat16>( template void ScatterAdd<kDGLCUDA, int32_t, __hip_bfloat16>(
NDArray feat, NDArray idx, NDArray out); NDArray feat, NDArray idx, NDArray out);
template void ScatterAdd<kDGLCUDA, int64_t, __nv_bfloat16>( template void ScatterAdd<kDGLCUDA, int64_t, __hip_bfloat16>(
NDArray feat, NDArray idx, NDArray out); NDArray feat, NDArray idx, NDArray out);
#endif // BF16_ENABLED #endif // BF16_ENABLED
template void ScatterAdd<kDGLCUDA, int32_t, float>( template void ScatterAdd<kDGLCUDA, int32_t, float>(
...@@ -108,11 +109,11 @@ template void UpdateGradMinMax_hetero<kDGLCUDA, int64_t, __half>( ...@@ -108,11 +109,11 @@ template void UpdateGradMinMax_hetero<kDGLCUDA, int64_t, __half>(
const std::vector<NDArray>& feat, const std::vector<NDArray>& idx, const std::vector<NDArray>& feat, const std::vector<NDArray>& idx,
const std::vector<NDArray>& idx_etype, std::vector<NDArray>* out); const std::vector<NDArray>& idx_etype, std::vector<NDArray>* out);
#if BF16_ENABLED #if BF16_ENABLED
template void UpdateGradMinMax_hetero<kDGLCUDA, int32_t, __nv_bfloat16>( template void UpdateGradMinMax_hetero<kDGLCUDA, int32_t, __hip_bfloat16>(
const HeteroGraphPtr& g, const std::string& op, const HeteroGraphPtr& g, const std::string& op,
const std::vector<NDArray>& feat, const std::vector<NDArray>& idx, const std::vector<NDArray>& feat, const std::vector<NDArray>& idx,
const std::vector<NDArray>& idx_etype, std::vector<NDArray>* out); const std::vector<NDArray>& idx_etype, std::vector<NDArray>* out);
template void UpdateGradMinMax_hetero<kDGLCUDA, int64_t, __nv_bfloat16>( template void UpdateGradMinMax_hetero<kDGLCUDA, int64_t, __hip_bfloat16>(
const HeteroGraphPtr& g, const std::string& op, const HeteroGraphPtr& g, const std::string& op,
const std::vector<NDArray>& feat, const std::vector<NDArray>& idx, const std::vector<NDArray>& feat, const std::vector<NDArray>& idx,
const std::vector<NDArray>& idx_etype, std::vector<NDArray>* out); const std::vector<NDArray>& idx_etype, std::vector<NDArray>* out);
...@@ -139,9 +140,9 @@ template void BackwardSegmentCmp<kDGLCUDA, int32_t, __half>( ...@@ -139,9 +140,9 @@ template void BackwardSegmentCmp<kDGLCUDA, int32_t, __half>(
template void BackwardSegmentCmp<kDGLCUDA, int64_t, __half>( template void BackwardSegmentCmp<kDGLCUDA, int64_t, __half>(
NDArray feat, NDArray arg, NDArray out); NDArray feat, NDArray arg, NDArray out);
#if BF16_ENABLED #if BF16_ENABLED
template void BackwardSegmentCmp<kDGLCUDA, int32_t, __nv_bfloat16>( template void BackwardSegmentCmp<kDGLCUDA, int32_t, __hip_bfloat16>(
NDArray feat, NDArray arg, NDArray out); NDArray feat, NDArray arg, NDArray out);
template void BackwardSegmentCmp<kDGLCUDA, int64_t, __nv_bfloat16>( template void BackwardSegmentCmp<kDGLCUDA, int64_t, __hip_bfloat16>(
NDArray feat, NDArray arg, NDArray out); NDArray feat, NDArray arg, NDArray out);
#endif // BF16_ENABLED #endif // BF16_ENABLED
template void BackwardSegmentCmp<kDGLCUDA, int32_t, float>( template void BackwardSegmentCmp<kDGLCUDA, int32_t, float>(
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/** /**
* Copyright (c) 2021 by contributors. * Copyright (c) 2021 by contributors.
* @file array/cuda/spmat_op_impl_coo.cu * @file array/cuda/spmat_op_impl_coo.cu
...@@ -10,8 +12,8 @@ ...@@ -10,8 +12,8 @@
#include <vector> #include <vector>
#include "../../runtime/cuda/cuda_common.h" #include "../../runtime/cuda/cuda_common.h"
#include "./atomic.cuh" #include "atomic.cuh"
#include "./utils.h" #include "utils.h"
namespace dgl { namespace dgl {
...@@ -72,7 +74,7 @@ __global__ void _COOGetRowNNZKernel( ...@@ -72,7 +74,7 @@ __global__ void _COOGetRowNNZKernel(
template <DGLDeviceType XPU, typename IdType> template <DGLDeviceType XPU, typename IdType>
int64_t COOGetRowNNZ(COOMatrix coo, int64_t row) { int64_t COOGetRowNNZ(COOMatrix coo, int64_t row) {
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const auto& ctx = coo.row->ctx; const auto& ctx = coo.row->ctx;
IdType nnz = coo.row->shape[0]; IdType nnz = coo.row->shape[0];
IdType nt = 1024; IdType nt = 1024;
...@@ -103,7 +105,7 @@ __global__ void _COOGetAllRowNNZKernel( ...@@ -103,7 +105,7 @@ __global__ void _COOGetAllRowNNZKernel(
template <DGLDeviceType XPU, typename IdType> template <DGLDeviceType XPU, typename IdType>
NDArray COOGetRowNNZ(COOMatrix coo, NDArray rows) { NDArray COOGetRowNNZ(COOMatrix coo, NDArray rows) {
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const auto& ctx = coo.row->ctx; const auto& ctx = coo.row->ctx;
IdType nnz = coo.row->shape[0]; IdType nnz = coo.row->shape[0];
IdType num_rows = coo.num_rows; IdType num_rows = coo.num_rows;
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/** /**
* Copyright (c) 2020 by Contributors * Copyright (c) 2020 by Contributors
* @file array/cuda/spmat_op_impl_csr.cu * @file array/cuda/spmat_op_impl_csr.cu
...@@ -7,14 +9,14 @@ ...@@ -7,14 +9,14 @@
#include <thrust/execution_policy.h> #include <thrust/execution_policy.h>
#include <thrust/for_each.h> #include <thrust/for_each.h>
#include <cub/cub.cuh> #include <hipcub/hipcub.hpp>
#include <numeric> #include <numeric>
#include <unordered_set> #include <unordered_set>
#include <vector> #include <vector>
#include "../../runtime/cuda/cuda_common.h" #include "../../runtime/cuda/cuda_common.h"
#include "./atomic.cuh" #include "atomic.cuh"
#include "./utils.h" #include "utils.h"
namespace dgl { namespace dgl {
...@@ -28,7 +30,7 @@ namespace impl { ...@@ -28,7 +30,7 @@ namespace impl {
template <DGLDeviceType XPU, typename IdType> template <DGLDeviceType XPU, typename IdType>
bool CSRIsNonZero(CSRMatrix csr, int64_t row, int64_t col) { bool CSRIsNonZero(CSRMatrix csr, int64_t row, int64_t col) {
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const auto& ctx = csr.indptr->ctx; const auto& ctx = csr.indptr->ctx;
IdArray rows = aten::VecToIdArray<int64_t>({row}, sizeof(IdType) * 8, ctx); IdArray rows = aten::VecToIdArray<int64_t>({row}, sizeof(IdType) * 8, ctx);
IdArray cols = aten::VecToIdArray<int64_t>({col}, sizeof(IdType) * 8, ctx); IdArray cols = aten::VecToIdArray<int64_t>({col}, sizeof(IdType) * 8, ctx);
...@@ -53,12 +55,12 @@ template <DGLDeviceType XPU, typename IdType> ...@@ -53,12 +55,12 @@ template <DGLDeviceType XPU, typename IdType>
NDArray CSRIsNonZero(CSRMatrix csr, NDArray row, NDArray col) { NDArray CSRIsNonZero(CSRMatrix csr, NDArray row, NDArray col) {
const auto rowlen = row->shape[0]; const auto rowlen = row->shape[0];
const auto collen = col->shape[0]; const auto collen = col->shape[0];
const auto rstlen = std::max(rowlen, collen); const auto rstlen = ::max(rowlen, collen);
NDArray rst = NDArray::Empty({rstlen}, row->dtype, row->ctx); NDArray rst = NDArray::Empty({rstlen}, row->dtype, row->ctx);
if (rstlen == 0) return rst; if (rstlen == 0) return rst;
const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1; const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1;
const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1; const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1;
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const int nt = dgl::cuda::FindNumThreads(rstlen); const int nt = dgl::cuda::FindNumThreads(rstlen);
const int nb = (rstlen + nt - 1) / nt; const int nb = (rstlen + nt - 1) / nt;
const IdType* data = nullptr; const IdType* data = nullptr;
...@@ -104,7 +106,7 @@ template <DGLDeviceType XPU, typename IdType> ...@@ -104,7 +106,7 @@ template <DGLDeviceType XPU, typename IdType>
bool CSRHasDuplicate(CSRMatrix csr) { bool CSRHasDuplicate(CSRMatrix csr) {
if (!csr.sorted) csr = CSRSort(csr); if (!csr.sorted) csr = CSRSort(csr);
const auto& ctx = csr.indptr->ctx; const auto& ctx = csr.indptr->ctx;
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
auto device = runtime::DeviceAPI::Get(ctx); auto device = runtime::DeviceAPI::Get(ctx);
// We allocate a workspace of num_rows bytes. It wastes a little bit memory // We allocate a workspace of num_rows bytes. It wastes a little bit memory
// but should be fine. // but should be fine.
...@@ -149,7 +151,7 @@ __global__ void _CSRGetRowNNZKernel( ...@@ -149,7 +151,7 @@ __global__ void _CSRGetRowNNZKernel(
template <DGLDeviceType XPU, typename IdType> template <DGLDeviceType XPU, typename IdType>
NDArray CSRGetRowNNZ(CSRMatrix csr, NDArray rows) { NDArray CSRGetRowNNZ(CSRMatrix csr, NDArray rows) {
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const auto len = rows->shape[0]; const auto len = rows->shape[0];
const IdType* vid_data = rows.Ptr<IdType>(); const IdType* vid_data = rows.Ptr<IdType>();
const IdType* indptr_data = const IdType* indptr_data =
...@@ -250,7 +252,7 @@ __global__ void _SegmentCopyKernel( ...@@ -250,7 +252,7 @@ __global__ void _SegmentCopyKernel(
template <DGLDeviceType XPU, typename IdType> template <DGLDeviceType XPU, typename IdType>
CSRMatrix CSRSliceRows(CSRMatrix csr, NDArray rows) { CSRMatrix CSRSliceRows(CSRMatrix csr, NDArray rows) {
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const int64_t len = rows->shape[0]; const int64_t len = rows->shape[0];
IdArray ret_indptr = aten::CumSum(aten::CSRGetRowNNZ(csr, rows), true); IdArray ret_indptr = aten::CumSum(aten::CSRGetRowNNZ(csr, rows), true);
const int64_t nnz = aten::IndexSelect<IdType>(ret_indptr, len); const int64_t nnz = aten::IndexSelect<IdType>(ret_indptr, len);
...@@ -359,7 +361,7 @@ std::vector<NDArray> CSRGetDataAndIndices( ...@@ -359,7 +361,7 @@ std::vector<NDArray> CSRGetDataAndIndices(
CSRMatrix csr, NDArray row, NDArray col) { CSRMatrix csr, NDArray row, NDArray col) {
const auto rowlen = row->shape[0]; const auto rowlen = row->shape[0];
const auto collen = col->shape[0]; const auto collen = col->shape[0];
const auto len = std::max(rowlen, collen); const auto len = ::max(rowlen, collen);
if (len == 0) return {NullArray(), NullArray(), NullArray()}; if (len == 0) return {NullArray(), NullArray(), NullArray()};
const auto& ctx = row->ctx; const auto& ctx = row->ctx;
...@@ -367,7 +369,7 @@ std::vector<NDArray> CSRGetDataAndIndices( ...@@ -367,7 +369,7 @@ std::vector<NDArray> CSRGetDataAndIndices(
const int64_t nnz = csr.indices->shape[0]; const int64_t nnz = csr.indices->shape[0];
const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1; const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1;
const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1; const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1;
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const IdType* indptr_data = const IdType* indptr_data =
static_cast<IdType*>(GetDevicePointer(csr.indptr)); static_cast<IdType*>(GetDevicePointer(csr.indptr));
...@@ -532,7 +534,7 @@ __global__ void _SegmentMaskColKernel( ...@@ -532,7 +534,7 @@ __global__ void _SegmentMaskColKernel(
static_cast<IdType>(num_rows)); static_cast<IdType>(num_rows));
NodeQueryHashmap<IdType> hashmap(hashmap_buffer, buffer_size); NodeQueryHashmap<IdType> hashmap(hashmap_buffer, buffer_size);
typedef cub::WarpReduce<IdType> WarpReduce; typedef hipcub::WarpReduce<IdType> WarpReduce;
__shared__ typename WarpReduce::TempStorage temp_storage[BLOCK_WARPS]; __shared__ typename WarpReduce::TempStorage temp_storage[BLOCK_WARPS];
while (out_row < last_row) { while (out_row < last_row) {
...@@ -546,7 +548,7 @@ __global__ void _SegmentMaskColKernel( ...@@ -546,7 +548,7 @@ __global__ void _SegmentMaskColKernel(
mask[idx] = 1; mask[idx] = 1;
} }
} }
IdType reduce_count = WarpReduce(temp_storage[warp_id]).Sum(local_count); IdType reduce_count = WarpReduce(temp_storage[warp_id]).Sum(local_count);
if (laneid == 0) { if (laneid == 0) {
count[out_row] = reduce_count; count[out_row] = reduce_count;
} }
...@@ -557,7 +559,7 @@ __global__ void _SegmentMaskColKernel( ...@@ -557,7 +559,7 @@ __global__ void _SegmentMaskColKernel(
template <DGLDeviceType XPU, typename IdType> template <DGLDeviceType XPU, typename IdType>
CSRMatrix CSRSliceMatrix( CSRMatrix CSRSliceMatrix(
CSRMatrix csr, runtime::NDArray rows, runtime::NDArray cols) { CSRMatrix csr, runtime::NDArray rows, runtime::NDArray cols) {
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const auto& ctx = rows->ctx; const auto& ctx = rows->ctx;
const auto& dtype = rows->dtype; const auto& dtype = rows->dtype;
const auto nbits = dtype.bits; const auto nbits = dtype.bits;
...@@ -582,7 +584,7 @@ CSRMatrix CSRSliceMatrix( ...@@ -582,7 +584,7 @@ CSRMatrix CSRSliceMatrix(
// A count for how many masked values per row. // A count for how many masked values per row.
IdArray count = NewIdArray(csr.num_rows, ctx, nbits); IdArray count = NewIdArray(csr.num_rows, ctx, nbits);
CUDA_CALL( CUDA_CALL(
cudaMemset(count.Ptr<IdType>(), 0, sizeof(IdType) * (csr.num_rows))); hipMemset(count.Ptr<IdType>(), 0, sizeof(IdType) * (csr.num_rows)));
// Generate a NodeQueryHashmap buffer. The key of the hashmap is col. // Generate a NodeQueryHashmap buffer. The key of the hashmap is col.
// For performance, the load factor of the hashmap is in (0.25, 0.5); // For performance, the load factor of the hashmap is in (0.25, 0.5);
...@@ -593,7 +595,7 @@ CSRMatrix CSRSliceMatrix( ...@@ -593,7 +595,7 @@ CSRMatrix CSRSliceMatrix(
using it = thrust::counting_iterator<int64_t>; using it = thrust::counting_iterator<int64_t>;
runtime::CUDAWorkspaceAllocator allocator(ctx); runtime::CUDAWorkspaceAllocator allocator(ctx);
const auto exec_policy = thrust::cuda::par_nosync(allocator).on(stream); const auto exec_policy = thrust::hip::par_nosync(allocator).on(stream);
thrust::for_each( thrust::for_each(
exec_policy, it(0), it(new_ncols), exec_policy, it(0), it(new_ncols),
[key = cols.Ptr<IdType>(), buffer = hashmap_buffer.Ptr<IdType>(), [key = cols.Ptr<IdType>(), buffer = hashmap_buffer.Ptr<IdType>(),
...@@ -609,14 +611,15 @@ CSRMatrix CSRSliceMatrix( ...@@ -609,14 +611,15 @@ CSRMatrix CSRSliceMatrix(
// Execute SegmentMaskColKernel // Execute SegmentMaskColKernel
const int64_t num_rows = csr.num_rows; const int64_t num_rows = csr.num_rows;
constexpr int WARP_SIZE = 32; constexpr int WARP_SIZE = 64;
// With a simple fine-tuning, TILE_SIZE=16 gives a good performance. // With a simple fine-tuning, TILE_SIZE=16 gives a good performance.
constexpr int TILE_SIZE = 16; constexpr int TILE_SIZE = 32;
constexpr int BLOCK_WARPS = CUDA_MAX_NUM_THREADS / WARP_SIZE; constexpr int BLOCK_WARPS = CUDA_MAX_NUM_THREADS / WARP_SIZE;
IdType nb = IdType nb =
dgl::cuda::FindNumBlocks<'x'>((num_rows + TILE_SIZE - 1) / TILE_SIZE); dgl::cuda::FindNumBlocks<'x'>((num_rows + TILE_SIZE - 1) / TILE_SIZE);
const dim3 nthrs(WARP_SIZE, BLOCK_WARPS); const dim3 nthrs(WARP_SIZE, BLOCK_WARPS);
const dim3 nblks(nb); const dim3 nblks(nb);
CUDA_KERNEL_CALL( CUDA_KERNEL_CALL(
(_SegmentMaskColKernel<IdType, WARP_SIZE, BLOCK_WARPS, TILE_SIZE>), nblks, (_SegmentMaskColKernel<IdType, WARP_SIZE, BLOCK_WARPS, TILE_SIZE>), nblks,
nthrs, 0, stream, indptr_data, indices_data, num_rows, nthrs, 0, stream, indptr_data, indices_data, num_rows,
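Editor's note: the slicing kernel above first inserts the requested columns into a NodeQueryHashmap, then walks each CSR row, marks matching nonzeros in mask, and warp-reduces a per-row count; the hipified launch geometry uses WARP_SIZE = 64 to match the AMD wavefront. The row-walking logic, sketched on the host with std::unordered_set standing in for the hashmap (illustrative data):

#include <cstdio>
#include <unordered_set>
#include <vector>

int main() {
  // 3 x 4 CSR matrix, slice columns {1, 3}.
  std::vector<int> indptr = {0, 2, 3, 5};
  std::vector<int> indices = {0, 1, 3, 1, 2};
  std::unordered_set<int> wanted = {1, 3};

  std::vector<signed char> mask(indices.size(), 0);
  std::vector<int> count(indptr.size() - 1, 0);
  for (size_t row = 0; row + 1 < indptr.size(); ++row) {
    for (int idx = indptr[row]; idx < indptr[row + 1]; ++idx) {
      if (wanted.count(indices[idx])) {  // hashmap query in the kernel
        mask[idx] = 1;
        ++count[row];                    // warp-reduced per row on the GPU
      }
    }
  }
  for (size_t row = 0; row < count.size(); ++row)
    printf("row %zu keeps %d nonzeros\n", row, count[row]);
  return 0;
}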
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/** /**
* Copyright (c) 2020 by Contributors * Copyright (c) 2020 by Contributors
* @file array/cuda/spmm.cuh * @file array/cuda/spmm.cuh
...@@ -11,7 +13,7 @@ ...@@ -11,7 +13,7 @@
#include <limits> #include <limits>
#include "../../runtime/cuda/cuda_common.h" #include "../../runtime/cuda/cuda_common.h"
#include "./utils.h" #include "utils.h"
#include "atomic.cuh" #include "atomic.cuh"
#include "bf16.cuh" #include "bf16.cuh"
#include "fp16.cuh" #include "fp16.cuh"
...@@ -28,14 +30,14 @@ namespace aten { ...@@ -28,14 +30,14 @@ namespace aten {
*/ */
template <typename DType, typename IdType> template <typename DType, typename IdType>
inline bool cusparse_available(bool more_nnz_than_matrix_size) { inline bool cusparse_available(bool more_nnz_than_matrix_size) {
#if CUDART_VERSION < 11000 #if DTKRT_VERSION < 11000
if (std::is_same<IdType, int>::value && if (std::is_same<IdType, int>::value &&
(std::is_same<DType, float>::value || std::is_same<DType, double>::value)) (std::is_same<DType, float>::value || std::is_same<DType, double>::value))
return true; return true;
return false; return false;
#else #else
if (std::is_same<DType, __half>::value || if (std::is_same<DType, __half>::value ||
std::is_same<DType, __nv_bfloat16>::value) std::is_same<DType, __hip_bfloat16>::value)
return false; // cusparse's SpMM on fp16 is slow, temporally disabled. return false; // cusparse's SpMM on fp16 is slow, temporally disabled.
// If the CSR matrix has more NNZ than matrix size, we should not use // If the CSR matrix has more NNZ than matrix size, we should not use
// cuSPARSE 11.1. // cuSPARSE 11.1.
...@@ -47,54 +49,54 @@ namespace { ...@@ -47,54 +49,54 @@ namespace {
/** @brief Call cuBLAS geam API for transpose operation for float and double. */ /** @brief Call cuBLAS geam API for transpose operation for float and double. */
template <typename DType> template <typename DType>
cublasStatus_t Xgeam( hipblasStatus_t Xgeam(
cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
int m, int n, const DType* alpha, const DType* A, int lda, int m, int n, const DType* alpha, const DType* A, int lda,
const DType* beta, const DType* B, int ldb, DType* C, int ldc) { const DType* beta, const DType* B, int ldb, DType* C, int ldc) {
LOG(FATAL) << "Not supported dtype"; LOG(FATAL) << "Not supported dtype";
return CUBLAS_STATUS_EXECUTION_FAILED; return HIPBLAS_STATUS_EXECUTION_FAILED;
} }
template <> template <>
cublasStatus_t Xgeam<__half>( hipblasStatus_t Xgeam<__half>(
cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
int m, int n, const __half* alpha, const __half* A, int lda, int m, int n, const __half* alpha, const __half* A, int lda,
const __half* beta, const __half* B, int ldb, __half* C, int ldc) { const __half* beta, const __half* B, int ldb, __half* C, int ldc) {
// TODO(ndickson): There is no cublasHgeam, so a different // TODO(ndickson): There is no cublasHgeam, so a different
// implementation would be required. // implementation would be required.
LOG(FATAL) << "Xgeam does not support dtype half (FP16)"; LOG(FATAL) << "Xgeam does not support dtype half (FP16)";
return CUBLAS_STATUS_EXECUTION_FAILED; return HIPBLAS_STATUS_EXECUTION_FAILED;
} }
#if BF16_ENABLED #if BF16_ENABLED
template <> template <>
cublasStatus_t Xgeam<__nv_bfloat16>( hipblasStatus_t Xgeam<__hip_bfloat16>(
cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
int m, int n, const __nv_bfloat16* alpha, const __nv_bfloat16* A, int lda, int m, int n, const __hip_bfloat16* alpha, const __hip_bfloat16* A, int lda,
const __nv_bfloat16* beta, const __nv_bfloat16* B, int ldb, const __hip_bfloat16* beta, const __hip_bfloat16* B, int ldb,
__nv_bfloat16* C, int ldc) { __hip_bfloat16* C, int ldc) {
// TODO(ndickson): There is no cublasHgeam, so a different // TODO(ndickson): There is no cublasHgeam, so a different
// implementation would be required. // implementation would be required.
LOG(FATAL) << "Xgeam does not support dtype bfloat16 (BF16)"; LOG(FATAL) << "Xgeam does not support dtype bfloat16 (BF16)";
return CUBLAS_STATUS_EXECUTION_FAILED; return HIPBLAS_STATUS_EXECUTION_FAILED;
} }
#endif // BF16_ENABLED #endif // BF16_ENABLED
template <> template <>
cublasStatus_t Xgeam<float>( hipblasStatus_t Xgeam<float>(
cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
int m, int n, const float* alpha, const float* A, int lda, int m, int n, const float* alpha, const float* A, int lda,
const float* beta, const float* B, int ldb, float* C, int ldc) { const float* beta, const float* B, int ldb, float* C, int ldc) {
return cublasSgeam( return hipblasSgeam(
handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc); handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
} }
template <> template <>
cublasStatus_t Xgeam<double>( hipblasStatus_t Xgeam<double>(
cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
int m, int n, const double* alpha, const double* A, int lda, int m, int n, const double* alpha, const double* A, int lda,
const double* beta, const double* B, int ldb, double* C, int ldc) { const double* beta, const double* B, int ldb, double* C, int ldc) {
return cublasDgeam( return hipblasDgeam(
handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc); handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
} }
...@@ -119,12 +121,12 @@ template <typename DType> ...@@ -119,12 +121,12 @@ template <typename DType>
void _Transpose(const DType* in, DType* out, int row, int col) { void _Transpose(const DType* in, DType* out, int row, int col) {
DType alpha = 1., beta = 0.; DType alpha = 1., beta = 0.;
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
if (!thr_entry->cublas_handle) if (!thr_entry->cublas_handle)
CUBLAS_CALL(cublasCreate(&(thr_entry->cublas_handle))); CUBLAS_CALL(hipblasCreate(&(thr_entry->cublas_handle)));
CUBLAS_CALL(cublasSetStream(thr_entry->cublas_handle, stream)); CUBLAS_CALL(hipblasSetStream(thr_entry->cublas_handle, stream));
CUBLAS_CALL(Xgeam<DType>( CUBLAS_CALL(Xgeam<DType>(
thr_entry->cublas_handle, CUBLAS_OP_T, CUBLAS_OP_N, row, col, &alpha, in, thr_entry->cublas_handle, HIPBLAS_OP_T, HIPBLAS_OP_N, row, col, &alpha, in,
col, &beta, nullptr, row, out, row)); col, &beta, nullptr, row, out, row));
} }
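Editor's note: _Transpose above performs an out-of-place transpose of a row-major matrix by calling geam with op(A) = A^T and beta = 0. A small standalone sketch mirroring that call with hipBLAS (it assumes hipblasSgeam matches cublasSgeam's column-major semantics and that the header lives at hipblas/hipblas.h on recent ROCm; data and names are illustrative):

#include <hip/hip_runtime.h>
#include <hipblas/hipblas.h>
#include <cstdio>

int main() {
  const int row = 2, col = 3;
  float h_in[row * col] = {1, 2, 3, 4, 5, 6};   // row-major [[1,2,3],[4,5,6]]
  float h_out[row * col];
  float *d_in, *d_out;
  hipMalloc(&d_in, row * col * sizeof(float));
  hipMalloc(&d_out, row * col * sizeof(float));
  hipMemcpy(d_in, h_in, row * col * sizeof(float), hipMemcpyHostToDevice);

  hipblasHandle_t handle;
  hipblasCreate(&handle);
  const float alpha = 1.f, beta = 0.f;
  // C (row x col, column-major, ldc = row) = op(A) with op = transpose; reading
  // C back as a row-major col x row matrix yields the transpose of the input.
  // B is unused because beta == 0; d_in is passed only as a valid pointer.
  hipblasSgeam(handle, HIPBLAS_OP_T, HIPBLAS_OP_N, row, col, &alpha, d_in, col,
               &beta, d_in, row, d_out, row);
  hipMemcpy(h_out, d_out, row * col * sizeof(float), hipMemcpyDeviceToHost);

  for (int i = 0; i < col; ++i)   // h_out is now col x row, row-major
    printf("%g %g\n", h_out[i * row + 0], h_out[i * row + 1]);
  // expected: 1 4 / 2 5 / 3 6
  hipblasDestroy(handle);
  hipFree(d_in); hipFree(d_out);
  return 0;
}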
...@@ -134,7 +136,7 @@ void _Transpose(const DType* in, DType* out, int row, int col) { ...@@ -134,7 +136,7 @@ void _Transpose(const DType* in, DType* out, int row, int col) {
*/ */
template <> template <>
void _Transpose<__half>(const __half* in, __half* out, int row, int col) { void _Transpose<__half>(const __half* in, __half* out, int row, int col) {
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int nt = FindNumThreads(row); int nt = FindNumThreads(row);
int nb = col; int nb = col;
CUDA_KERNEL_CALL(_TransposeKernel, nb, nt, 0, stream, in, out, col, row); CUDA_KERNEL_CALL(_TransposeKernel, nb, nt, 0, stream, in, out, col, row);
...@@ -146,47 +148,47 @@ void _Transpose<__half>(const __half* in, __half* out, int row, int col) { ...@@ -146,47 +148,47 @@ void _Transpose<__half>(const __half* in, __half* out, int row, int col) {
* @note cuBLAS has no geam API for bf16 data type, fallback to our kernel. * @note cuBLAS has no geam API for bf16 data type, fallback to our kernel.
*/ */
template <> template <>
void _Transpose<__nv_bfloat16>( void _Transpose<__hip_bfloat16>(
const __nv_bfloat16* in, __nv_bfloat16* out, int row, int col) { const __hip_bfloat16* in, __hip_bfloat16* out, int row, int col) {
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int nt = FindNumThreads(row); int nt = FindNumThreads(row);
int nb = col; int nb = col;
CUDA_KERNEL_CALL(_TransposeKernel, nb, nt, 0, stream, in, out, col, row); CUDA_KERNEL_CALL(_TransposeKernel, nb, nt, 0, stream, in, out, col, row);
} }
#endif // BF16_ENABLED #endif // BF16_ENABLED
#if CUDART_VERSION < 11000 #if DTKRT_VERSION < 11000
template <typename DType> template <typename DType>
cusparseStatus_t Xcsrmm2( hipsparseStatus_t Xcsrmm2(
cusparseHandle_t handle, cusparseOperation_t transA, hipsparseHandle_t handle, hipsparseOperation_t transA,
cusparseOperation_t transB, int m, int n, int k, int nnz, hipsparseOperation_t transB, int m, int n, int k, int nnz,
const DType* alpha, const cusparseMatDescr_t descrA, const DType* csrValA, const DType* alpha, const hipsparseMatDescr_t descrA, const DType* csrValA,
const int* csrRowPtrA, const int* csrColIndA, const DType* B, int ldb, const int* csrRowPtrA, const int* csrColIndA, const DType* B, int ldb,
const DType* beta, DType* C, int ldc) { const DType* beta, DType* C, int ldc) {
LOG(INFO) << "Not supported dtype"; LOG(INFO) << "Not supported dtype";
return CUSPARSE_STATUS_EXECUTION_FAILED; return HIPSPARSE_STATUS_EXECUTION_FAILED;
} }
template <> template <>
cusparseStatus_t Xcsrmm2<float>( hipsparseStatus_t Xcsrmm2<float>(
cusparseHandle_t handle, cusparseOperation_t transA, hipsparseHandle_t handle, hipsparseOperation_t transA,
cusparseOperation_t transB, int m, int n, int k, int nnz, hipsparseOperation_t transB, int m, int n, int k, int nnz,
const float* alpha, const cusparseMatDescr_t descrA, const float* csrValA, const float* alpha, const hipsparseMatDescr_t descrA, const float* csrValA,
const int* csrRowPtrA, const int* csrColIndA, const float* B, int ldb, const int* csrRowPtrA, const int* csrColIndA, const float* B, int ldb,
const float* beta, float* C, int ldc) { const float* beta, float* C, int ldc) {
return cusparseScsrmm2( return hipsparseScsrmm2(
handle, transA, transB, m, n, k, nnz, alpha, descrA, csrValA, csrRowPtrA, handle, transA, transB, m, n, k, nnz, alpha, descrA, csrValA, csrRowPtrA,
csrColIndA, B, ldb, beta, C, ldc); csrColIndA, B, ldb, beta, C, ldc);
} }
template <> template <>
cusparseStatus_t Xcsrmm2<double>( hipsparseStatus_t Xcsrmm2<double>(
cusparseHandle_t handle, cusparseOperation_t transA, hipsparseHandle_t handle, hipsparseOperation_t transA,
cusparseOperation_t transB, int m, int n, int k, int nnz, hipsparseOperation_t transB, int m, int n, int k, int nnz,
const double* alpha, const cusparseMatDescr_t descrA, const double* csrValA, const double* alpha, const hipsparseMatDescr_t descrA, const double* csrValA,
const int* csrRowPtrA, const int* csrColIndA, const double* B, int ldb, const int* csrRowPtrA, const int* csrColIndA, const double* B, int ldb,
const double* beta, double* C, int ldc) { const double* beta, double* C, int ldc) {
return cusparseDcsrmm2( return hipsparseDcsrmm2(
handle, transA, transB, m, n, k, nnz, alpha, descrA, csrValA, csrRowPtrA, handle, transA, transB, m, n, k, nnz, alpha, descrA, csrValA, csrRowPtrA,
csrColIndA, B, ldb, beta, C, ldc); csrColIndA, B, ldb, beta, C, ldc);
} }
...@@ -214,12 +216,12 @@ void CusparseCsrmm2( ...@@ -214,12 +216,12 @@ void CusparseCsrmm2(
// device // device
auto device = runtime::DeviceAPI::Get(ctx); auto device = runtime::DeviceAPI::Get(ctx);
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
// allocate cusparse handle if needed // allocate cusparse handle if needed
if (!thr_entry->cusparse_handle) { if (!thr_entry->cusparse_handle) {
CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
} }
CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream)); CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream));
// all one data array // all one data array
DType* valptr = nullptr; DType* valptr = nullptr;
if (!A_data) { if (!A_data) {
...@@ -227,55 +229,59 @@ void CusparseCsrmm2( ...@@ -227,55 +229,59 @@ void CusparseCsrmm2(
static_cast<DType*>(device->AllocWorkspace(ctx, nnz * sizeof(DType))); static_cast<DType*>(device->AllocWorkspace(ctx, nnz * sizeof(DType)));
_Fill(valptr, nnz, static_cast<DType>(1.)); _Fill(valptr, nnz, static_cast<DType>(1.));
} }
#if CUDART_VERSION >= 11000 #if DTKRT_VERSION >= 11000
cusparseSpMatDescr_t matA; hipsparseSpMatDescr_t matA;
cusparseDnMatDescr_t matB, matC; hipsparseDnMatDescr_t matB, matC;
constexpr auto dtype = cuda_dtype<DType>::value; constexpr auto dtype = cuda_dtype<DType>::value;
constexpr auto idtype = cusparse_idtype<IdType>::value; constexpr auto idtype = cusparse_idtype<IdType>::value;
CUSPARSE_CALL(cusparseCreateCsr( CUSPARSE_CALL(hipsparseCreateCsr(
&matA, m, k, nnz, static_cast<IdType*>(csr.indptr->data), &matA, m, k, nnz, static_cast<IdType*>(csr.indptr->data),
static_cast<IdType*>(csr.indices->data), static_cast<IdType*>(csr.indices->data),
const_cast<DType*>(valptr ? valptr : A_data), idtype, idtype, const_cast<DType*>(valptr ? valptr : A_data), idtype, idtype,
CUSPARSE_INDEX_BASE_ZERO, dtype)); HIPSPARSE_INDEX_BASE_ZERO, dtype));
CUSPARSE_CALL(cusparseCreateDnMat( CUSPARSE_CALL(hipsparseCreateDnMat(
&matB, k, n, n, const_cast<DType*>(B_data), dtype, CUSPARSE_ORDER_ROW)); &matB, k, n, n, const_cast<DType*>(B_data), dtype, HIPSPARSE_ORDER_ROW));
CUSPARSE_CALL( CUSPARSE_CALL(
cusparseCreateDnMat(&matC, m, n, n, C_data, dtype, CUSPARSE_ORDER_ROW)); hipsparseCreateDnMat(&matC, m, n, n, C_data, dtype, HIPSPARSE_ORDER_ROW));
auto transA = CUSPARSE_OPERATION_NON_TRANSPOSE; auto transA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
auto transB = CUSPARSE_OPERATION_NON_TRANSPOSE; auto transB = HIPSPARSE_OPERATION_NON_TRANSPOSE;
size_t workspace_size; size_t workspace_size;
cusparseSpMMAlg_t spmm_alg = use_deterministic_alg_only cusparseSpMMAlg_t spmm_alg = use_deterministic_alg_only
? CUSPARSE_SPMM_CSR_ALG3 ? CUSPARSE_SPMM_CSR_ALG3
: CUSPARSE_SPMM_CSR_ALG2; : CUSPARSE_SPMM_CSR_ALG2;
CUSPARSE_CALL(cusparseSpMM_bufferSize( CUSPARSE_CALL(hipsparseSpMM_bufferSize(
thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
matC, dtype, spmm_alg, &workspace_size)); matC, dtype, HIPSPARSE_SPMM_CSR_ALG2, &workspace_size));
void* workspace = device->AllocWorkspace(ctx, workspace_size); void* workspace = device->AllocWorkspace(ctx, workspace_size);
CUSPARSE_CALL(cusparseSpMM( CUSPARSE_CALL(hipsparseSpMM(
thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
matC, dtype, spmm_alg, workspace)); matC, dtype, HIPSPARSE_SPMM_CSR_ALG2, workspace));
device->FreeWorkspace(ctx, workspace); device->FreeWorkspace(ctx, workspace);
CUSPARSE_CALL(cusparseDestroySpMat(matA)); CUSPARSE_CALL(hipsparseDestroySpMat(matA));
CUSPARSE_CALL(cusparseDestroyDnMat(matB)); CUSPARSE_CALL(hipsparseDestroyDnMat(matB));
CUSPARSE_CALL(cusparseDestroyDnMat(matC)); CUSPARSE_CALL(hipsparseDestroyDnMat(matC));
#else #else
// allocate matrix for temporary transposed output // allocate matrix for temporary transposed output
DType* trans_out = DType* trans_out =
static_cast<DType*>(device->AllocWorkspace(ctx, m * n * sizeof(DType))); static_cast<DType*>(device->AllocWorkspace(ctx, m * n * sizeof(DType)));
cusparseMatDescr_t descr; hipsparseMatDescr_t descr;
CUSPARSE_CALL(cusparseCreateMatDescr(&descr)); CUSPARSE_CALL(hipsparseCreateMatDescr(&descr));
CUSPARSE_CALL(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)); CUSPARSE_CALL(hipsparseSetMatType(descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
CUSPARSE_CALL(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)); CUSPARSE_CALL(hipsparseSetMatIndexBase(descr, HIPSPARSE_INDEX_BASE_ZERO));
CUSPARSE_CALL(Xcsrmm2<DType>( CUSPARSE_CALL(Xcsrmm2<DType>(
thr_entry->cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, thr_entry->cusparse_handle, HIPSPARSE_OPERATION_NON_TRANSPOSE,
CUSPARSE_OPERATION_TRANSPOSE, m, n, k, nnz, &alpha, descr, HIPSPARSE_OPERATION_TRANSPOSE, m, n, k, nnz, &alpha, descr,
(valptr) ? valptr : A_data, static_cast<int32_t*>(csr.indptr->data), (valptr) ? valptr : A_data, static_cast<int32_t*>(csr.indptr->data),
static_cast<int32_t*>(csr.indices->data), B_data, n, &beta, trans_out, static_cast<int32_t*>(csr.indices->data), B_data, n, &beta, trans_out,
m)); m));
CUSPARSE_CALL(cusparseDestroyMatDescr(descr)); CUSPARSE_CALL(hipsparseDestroyMatDescr(descr));
// transpose the output matrix // transpose the output matrix
_Transpose(trans_out, C_data, n, m); _Transpose(trans_out, C_data, n, m);
device->FreeWorkspace(ctx, trans_out); device->FreeWorkspace(ctx, trans_out);
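The legacy branch above exists because the pre-11.0 csrmm2 path writes its result in column-major order (note ldc = m), so the extra _Transpose step re-packs trans_out into the row-major layout the rest of DGL expects before the temporary is freed. A small host-side illustration of that re-packing, shown only for the index arithmetic (the real _Transpose is a GPU kernel):

#include <vector>

// Re-pack an m-by-n matrix from column-major storage (element (i, j) at
// j * m + i) into row-major storage (element (i, j) at i * n + j).
void ColMajorToRowMajor(const std::vector<double>& col_major,
                        std::vector<double>& row_major, int m, int n) {
  row_major.assign(static_cast<size_t>(m) * n, 0.0);
  for (int i = 0; i < m; ++i)
    for (int j = 0; j < n; ++j)
      row_major[static_cast<size_t>(i) * n + j] =
          col_major[static_cast<size_t>(j) * m + i];
}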
...@@ -287,8 +293,10 @@ void CusparseCsrmm2( ...@@ -287,8 +293,10 @@ void CusparseCsrmm2(
template <typename DType, typename IdType> template <typename DType, typename IdType>
void CusparseCsrmm2Hetero( void CusparseCsrmm2Hetero(
const DGLContext& ctx, const CSRMatrix& csr, const DType* B_data, const DGLContext& ctx, const CSRMatrix& csr, const DType* B_data,
const DType* A_data, DType* C_data, int64_t x_length, cudaStream_t strm_id, const DType* A_data, DType* C_data, int64_t x_length, cudaStream_t strm_id,
bool use_deterministic_alg_only = false) { bool use_deterministic_alg_only = false) {
// We use csrmm2 to perform following operation: // We use csrmm2 to perform following operation:
// C = A x B, where A is a sparse matrix in csr format, B is the dense matrix // C = A x B, where A is a sparse matrix in csr format, B is the dense matrix
// for node feature tensor. However, since cusparse only supports // for node feature tensor. However, since cusparse only supports
...@@ -311,9 +319,9 @@ void CusparseCsrmm2Hetero( ...@@ -311,9 +319,9 @@ void CusparseCsrmm2Hetero(
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
// allocate cusparse handle if needed // allocate cusparse handle if needed
if (!thr_entry->cusparse_handle) { if (!thr_entry->cusparse_handle) {
CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
} }
CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, strm_id)); CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, strm_id));
// all one data array // all one data array
DType* valptr = nullptr; DType* valptr = nullptr;
if (!A_data) { if (!A_data) {
...@@ -321,51 +329,53 @@ void CusparseCsrmm2Hetero( ...@@ -321,51 +329,53 @@ void CusparseCsrmm2Hetero(
static_cast<DType*>(device->AllocWorkspace(ctx, nnz * sizeof(DType))); static_cast<DType*>(device->AllocWorkspace(ctx, nnz * sizeof(DType)));
_Fill(valptr, nnz, static_cast<DType>(1.)); _Fill(valptr, nnz, static_cast<DType>(1.));
} }
#if CUDART_VERSION >= 11000 #if DTKRT_VERSION >= 11000
cusparseSpMatDescr_t matA; hipsparseSpMatDescr_t matA;
cusparseDnMatDescr_t matB, matC; hipsparseDnMatDescr_t matB, matC;
constexpr auto dtype = cuda_dtype<DType>::value; constexpr auto dtype = cuda_dtype<DType>::value;
constexpr auto idtype = cusparse_idtype<IdType>::value; constexpr auto idtype = cusparse_idtype<IdType>::value;
CUSPARSE_CALL(cusparseCreateCsr( CUSPARSE_CALL(hipsparseCreateCsr(
&matA, m, k, nnz, static_cast<IdType*>(csr.indptr->data), &matA, m, k, nnz, static_cast<IdType*>(csr.indptr->data),
static_cast<IdType*>(csr.indices->data), static_cast<IdType*>(csr.indices->data),
const_cast<DType*>(valptr ? valptr : A_data), idtype, idtype, const_cast<DType*>(valptr ? valptr : A_data), idtype, idtype,
CUSPARSE_INDEX_BASE_ZERO, dtype)); HIPSPARSE_INDEX_BASE_ZERO, dtype));
CUSPARSE_CALL(cusparseCreateDnMat( CUSPARSE_CALL(hipsparseCreateDnMat(
&matB, k, n, n, const_cast<DType*>(B_data), dtype, CUSPARSE_ORDER_ROW)); &matB, k, n, n, const_cast<DType*>(B_data), dtype, HIPSPARSE_ORDER_ROW));
CUSPARSE_CALL( CUSPARSE_CALL(
cusparseCreateDnMat(&matC, m, n, n, C_data, dtype, CUSPARSE_ORDER_ROW)); hipsparseCreateDnMat(&matC, m, n, n, C_data, dtype, HIPSPARSE_ORDER_ROW));
auto transA = CUSPARSE_OPERATION_NON_TRANSPOSE; auto transA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
auto transB = CUSPARSE_OPERATION_NON_TRANSPOSE; auto transB = HIPSPARSE_OPERATION_NON_TRANSPOSE;
size_t workspace_size; size_t workspace_size;
cusparseSpMMAlg_t spmm_alg = use_deterministic_alg_only cusparseSpMMAlg_t spmm_alg = use_deterministic_alg_only
? CUSPARSE_SPMM_CSR_ALG3 ? CUSPARSE_SPMM_CSR_ALG3
: CUSPARSE_SPMM_CSR_ALG2; : CUSPARSE_SPMM_CSR_ALG2;
CUSPARSE_CALL(cusparseSpMM_bufferSize( CUSPARSE_CALL(hipsparseSpMM_bufferSize(
thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
matC, dtype, spmm_alg, &workspace_size)); matC, dtype, HIPSPARSE_SPMM_CSR_ALG2, &workspace_size));
void* workspace = device->AllocWorkspace(ctx, workspace_size); void* workspace = device->AllocWorkspace(ctx, workspace_size);
CUSPARSE_CALL(cusparseSpMM( CUSPARSE_CALL(hipsparseSpMM(
thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
matC, dtype, spmm_alg, workspace)); matC, dtype, HIPSPARSE_SPMM_CSR_ALG2, workspace));
device->FreeWorkspace(ctx, workspace); device->FreeWorkspace(ctx, workspace);
CUSPARSE_CALL(cusparseDestroySpMat(matA)); CUSPARSE_CALL(hipsparseDestroySpMat(matA));
CUSPARSE_CALL(cusparseDestroyDnMat(matB)); CUSPARSE_CALL(hipsparseDestroyDnMat(matB));
CUSPARSE_CALL(cusparseDestroyDnMat(matC)); CUSPARSE_CALL(hipsparseDestroyDnMat(matC));
#else #else
cusparseMatDescr_t descr; hipsparseMatDescr_t descr;
CUSPARSE_CALL(cusparseCreateMatDescr(&descr)); CUSPARSE_CALL(hipsparseCreateMatDescr(&descr));
CUSPARSE_CALL(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)); CUSPARSE_CALL(hipsparseSetMatType(descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
CUSPARSE_CALL(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)); CUSPARSE_CALL(hipsparseSetMatIndexBase(descr, HIPSPARSE_INDEX_BASE_ZERO));
CHECK_EQ(sizeof(IdType), sizeof(int32_t)); CHECK_EQ(sizeof(IdType), sizeof(int32_t));
CUSPARSE_CALL(Xcsrmm2<DType>( CUSPARSE_CALL(Xcsrmm2<DType>(
thr_entry->cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, thr_entry->cusparse_handle, HIPSPARSE_OPERATION_NON_TRANSPOSE,
CUSPARSE_OPERATION_TRANSPOSE, m, n, k, nnz, &alpha, descr, HIPSPARSE_OPERATION_TRANSPOSE, m, n, k, nnz, &alpha, descr,
(valptr) ? valptr : A_data, static_cast<int32_t*>(csr.indptr->data), (valptr) ? valptr : A_data, static_cast<int32_t*>(csr.indptr->data),
static_cast<int32_t*>(csr.indices->data), B_data, n, &beta, C_data, m)); static_cast<int32_t*>(csr.indices->data), B_data, n, &beta, C_data, m));
CUSPARSE_CALL(cusparseDestroyMatDescr(descr)); CUSPARSE_CALL(hipsparseDestroyMatDescr(descr));
#endif #endif
if (valptr) device->FreeWorkspace(ctx, valptr); if (valptr) device->FreeWorkspace(ctx, valptr);
} }
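On the DTKRT_VERSION >= 11000 branch, both CusparseCsrmm2 and CusparseCsrmm2Hetero follow the same generic-API sequence: create a CSR descriptor for A and row-major dense descriptors for B and C, query the workspace with hipsparseSpMM_bufferSize, run hipsparseSpMM, then destroy the descriptors and free the workspace. Below is a self-contained single-precision sketch of that sequence with a hard-coded 2x3 CSR matrix; the buffer names are illustrative, error handling is reduced to two bare macros, and HIPSPARSE_SPMM_ALG_DEFAULT stands in for the algorithm selection made in the code above.

#include <hip/hip_runtime.h>
#include <hipsparse/hipsparse.h>  // older ROCm installs expose <hipsparse.h> instead
#include <cstdio>
#include <vector>

#define CHECK_HIP(x) do { hipError_t e = (x); if (e != hipSuccess) { printf("HIP error %d\n", e); return 1; } } while (0)
#define CHECK_SP(x)  do { hipsparseStatus_t s = (x); if (s != HIPSPARSE_STATUS_SUCCESS) { printf("hipSPARSE error %d\n", s); return 1; } } while (0)

int main() {
  // A is 2x3 in CSR, B is 3x2 row-major dense, C = A * B is 2x2 row-major.
  const int m = 2, k = 3, n = 2, nnz = 4;
  std::vector<int>   h_indptr  = {0, 2, 4};
  std::vector<int>   h_indices = {0, 2, 1, 2};
  std::vector<float> h_val     = {1.f, 2.f, 3.f, 4.f};
  std::vector<float> h_B       = {1.f, 0.f, 0.f, 1.f, 1.f, 1.f};
  std::vector<float> h_C(m * n, 0.f);

  int *d_indptr, *d_indices; float *d_val, *d_B, *d_C;
  CHECK_HIP(hipMalloc(&d_indptr, h_indptr.size() * sizeof(int)));
  CHECK_HIP(hipMalloc(&d_indices, h_indices.size() * sizeof(int)));
  CHECK_HIP(hipMalloc(&d_val, h_val.size() * sizeof(float)));
  CHECK_HIP(hipMalloc(&d_B, h_B.size() * sizeof(float)));
  CHECK_HIP(hipMalloc(&d_C, h_C.size() * sizeof(float)));
  CHECK_HIP(hipMemcpy(d_indptr, h_indptr.data(), h_indptr.size() * sizeof(int), hipMemcpyHostToDevice));
  CHECK_HIP(hipMemcpy(d_indices, h_indices.data(), h_indices.size() * sizeof(int), hipMemcpyHostToDevice));
  CHECK_HIP(hipMemcpy(d_val, h_val.data(), h_val.size() * sizeof(float), hipMemcpyHostToDevice));
  CHECK_HIP(hipMemcpy(d_B, h_B.data(), h_B.size() * sizeof(float), hipMemcpyHostToDevice));
  CHECK_HIP(hipMemcpy(d_C, h_C.data(), h_C.size() * sizeof(float), hipMemcpyHostToDevice));

  hipsparseHandle_t handle;
  CHECK_SP(hipsparseCreate(&handle));

  // 1. Descriptors: CSR A plus row-major dense B and C (leading dimension = column count).
  hipsparseSpMatDescr_t matA;
  hipsparseDnMatDescr_t matB, matC;
  CHECK_SP(hipsparseCreateCsr(&matA, m, k, nnz, d_indptr, d_indices, d_val,
                              HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I,
                              HIPSPARSE_INDEX_BASE_ZERO, HIP_R_32F));
  CHECK_SP(hipsparseCreateDnMat(&matB, k, n, n, d_B, HIP_R_32F, HIPSPARSE_ORDER_ROW));
  CHECK_SP(hipsparseCreateDnMat(&matC, m, n, n, d_C, HIP_R_32F, HIPSPARSE_ORDER_ROW));

  // 2. Workspace query, 3. SpMM, 4. copy back and clean up.
  const float alpha = 1.f, beta = 0.f;
  size_t ws_size = 0;
  CHECK_SP(hipsparseSpMM_bufferSize(handle, HIPSPARSE_OPERATION_NON_TRANSPOSE,
                                    HIPSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, matB,
                                    &beta, matC, HIP_R_32F, HIPSPARSE_SPMM_ALG_DEFAULT, &ws_size));
  void* workspace;
  CHECK_HIP(hipMalloc(&workspace, ws_size));
  CHECK_SP(hipsparseSpMM(handle, HIPSPARSE_OPERATION_NON_TRANSPOSE,
                         HIPSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, matB,
                         &beta, matC, HIP_R_32F, HIPSPARSE_SPMM_ALG_DEFAULT, workspace));

  CHECK_HIP(hipMemcpy(h_C.data(), d_C, h_C.size() * sizeof(float), hipMemcpyDeviceToHost));
  printf("C = [[%g %g], [%g %g]]\n", h_C[0], h_C[1], h_C[2], h_C[3]);  // expected: [[3 2], [4 7]]

  CHECK_SP(hipsparseDestroySpMat(matA));
  CHECK_SP(hipsparseDestroyDnMat(matB));
  CHECK_SP(hipsparseDestroyDnMat(matC));
  CHECK_SP(hipsparseDestroy(handle));
  CHECK_HIP(hipFree(workspace));
  CHECK_HIP(hipFree(d_indptr)); CHECK_HIP(hipFree(d_indices)); CHECK_HIP(hipFree(d_val));
  CHECK_HIP(hipFree(d_B)); CHECK_HIP(hipFree(d_C));
  return 0;
}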
...@@ -632,7 +642,7 @@ void SpMMCoo( ...@@ -632,7 +642,7 @@ void SpMMCoo(
*/ */
#if BF16_ENABLED #if BF16_ENABLED
if (std::is_same<DType, __half>::value || if (std::is_same<DType, __half>::value ||
std::is_same<DType, __nv_bfloat16>::value) std::is_same<DType, __hip_bfloat16>::value)
#else #else
if (std::is_same<DType, __half>::value) if (std::is_same<DType, __half>::value)
#endif // BF16_ENABLED #endif // BF16_ENABLED
...@@ -645,7 +655,7 @@ void SpMMCoo( ...@@ -645,7 +655,7 @@ void SpMMCoo(
*efeat_data = efeat.Ptr<DType>(); *efeat_data = efeat.Ptr<DType>();
DType* out_data = out.Ptr<DType>(); DType* out_data = out.Ptr<DType>();
Idx *argu_data = argu.Ptr<Idx>(), *arge_data = arge.Ptr<Idx>(); Idx *argu_data = argu.Ptr<Idx>(), *arge_data = arge.Ptr<Idx>();
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const int64_t N = coo.num_rows, M = coo.num_cols, E = coo.row->shape[0]; const int64_t N = coo.num_rows, M = coo.num_cols, E = coo.row->shape[0];
int64_t *ubcast_off = nullptr, *ebcast_off = nullptr; int64_t *ubcast_off = nullptr, *ebcast_off = nullptr;
...@@ -710,7 +720,7 @@ void SpMMCsr( ...@@ -710,7 +720,7 @@ void SpMMCsr(
Idx* argu_data = argu.Ptr<Idx>(); Idx* argu_data = argu.Ptr<Idx>();
Idx* arge_data = arge.Ptr<Idx>(); Idx* arge_data = arge.Ptr<Idx>();
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int64_t *ubcast_off = nullptr, *ebcast_off = nullptr; int64_t *ubcast_off = nullptr, *ebcast_off = nullptr;
int64_t len = bcast.out_len, lhs_len = bcast.lhs_len, rhs_len = bcast.rhs_len; int64_t len = bcast.out_len, lhs_len = bcast.lhs_len, rhs_len = bcast.rhs_len;
...@@ -771,7 +781,7 @@ void SpMMCmpCsrHetero( ...@@ -771,7 +781,7 @@ void SpMMCmpCsrHetero(
Idx* argu_data = argu.Ptr<Idx>(); Idx* argu_data = argu.Ptr<Idx>();
Idx* arge_data = arge.Ptr<Idx>(); Idx* arge_data = arge.Ptr<Idx>();
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int64_t *ubcast_off = nullptr, *ebcast_off = nullptr; int64_t *ubcast_off = nullptr, *ebcast_off = nullptr;
int64_t len = bcast.out_len, lhs_len = bcast.lhs_len, rhs_len = bcast.rhs_len; int64_t len = bcast.out_len, lhs_len = bcast.lhs_len, rhs_len = bcast.rhs_len;
......
// !!! This is a file automatically generated by hipify!!!
/** /**
* Copyright (c) 2020 by Contributors * Copyright (c) 2020 by Contributors
* @file array/cuda/spmm.cu * @file array/cuda/spmm.cu
...@@ -8,9 +9,9 @@ ...@@ -8,9 +9,9 @@
#include <cstdlib> #include <cstdlib>
#include "../../runtime/cuda/cuda_common.h" #include "../../runtime/cuda/cuda_common.h"
#include "./functor.cuh" #include "functor.cuh"
#include "./ge_spmm.cuh" #include "ge_spmm.cuh"
#include "./spmm.cuh" #include "spmm.cuh"
namespace dgl { namespace dgl {
...@@ -114,11 +115,11 @@ template void SpMMCsr<kDGLCUDA, int64_t, __half>( ...@@ -114,11 +115,11 @@ template void SpMMCsr<kDGLCUDA, int64_t, __half>(
const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out, const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out,
std::vector<NDArray> out_aux); std::vector<NDArray> out_aux);
#if BF16_ENABLED #if BF16_ENABLED
template void SpMMCsr<kDGLCUDA, int32_t, __nv_bfloat16>( template void SpMMCsr<kDGLCUDA, int32_t, __hip_bfloat16>(
const std::string& op, const std::string& reduce, const BcastOff& bcast, const std::string& op, const std::string& reduce, const BcastOff& bcast,
const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out, const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out,
std::vector<NDArray> out_aux); std::vector<NDArray> out_aux);
template void SpMMCsr<kDGLCUDA, int64_t, __nv_bfloat16>( template void SpMMCsr<kDGLCUDA, int64_t, __hip_bfloat16>(
const std::string& op, const std::string& reduce, const BcastOff& bcast, const std::string& op, const std::string& reduce, const BcastOff& bcast,
const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out, const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out,
std::vector<NDArray> out_aux); std::vector<NDArray> out_aux);
...@@ -149,11 +150,11 @@ template void SpMMCoo<kDGLCUDA, int64_t, __half>( ...@@ -149,11 +150,11 @@ template void SpMMCoo<kDGLCUDA, int64_t, __half>(
const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out, const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out,
std::vector<NDArray> out_aux); std::vector<NDArray> out_aux);
#if BF16_ENABLED #if BF16_ENABLED
template void SpMMCoo<kDGLCUDA, int32_t, __nv_bfloat16>( template void SpMMCoo<kDGLCUDA, int32_t, __hip_bfloat16>(
const std::string& op, const std::string& reduce, const BcastOff& bcast, const std::string& op, const std::string& reduce, const BcastOff& bcast,
const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out, const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out,
std::vector<NDArray> out_aux); std::vector<NDArray> out_aux);
template void SpMMCoo<kDGLCUDA, int64_t, __nv_bfloat16>( template void SpMMCoo<kDGLCUDA, int64_t, __hip_bfloat16>(
const std::string& op, const std::string& reduce, const BcastOff& bcast, const std::string& op, const std::string& reduce, const BcastOff& bcast,
const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out, const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out,
std::vector<NDArray> out_aux); std::vector<NDArray> out_aux);
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/** /**
* Copyright (c) 2020 by Contributors * Copyright (c) 2020 by Contributors
* @file array/cuda/spmm.cu * @file array/cuda/spmm.cu
...@@ -8,9 +10,9 @@ ...@@ -8,9 +10,9 @@
#include <cstdlib> #include <cstdlib>
#include "../../runtime/cuda/cuda_common.h" #include "../../runtime/cuda/cuda_common.h"
#include "./functor.cuh" #include "functor.cuh"
#include "./ge_spmm.cuh" #include "ge_spmm.cuh"
#include "./spmm.cuh" #include "spmm.cuh"
namespace dgl { namespace dgl {
...@@ -42,7 +44,7 @@ void SpMMCsrHetero( ...@@ -42,7 +44,7 @@ void SpMMCsrHetero(
use_deterministic_alg_only = true; use_deterministic_alg_only = true;
bool use_legacy_cusparsemm = bool use_legacy_cusparsemm =
(CUDART_VERSION < 11000) && (reduce == "sum") && (DTKRT_VERSION < 11000) && (reduce == "sum") &&
// legacy cuSPARSE does not care about NNZ, hence the argument "false". // legacy cuSPARSE does not care about NNZ, hence the argument "false".
((op == "copy_lhs" && cusparse_available<DType, IdType>(false)) || ((op == "copy_lhs" && cusparse_available<DType, IdType>(false)) ||
(op == "mul" && is_scalar_efeat && (op == "mul" && is_scalar_efeat &&
...@@ -55,7 +57,7 @@ void SpMMCsrHetero( ...@@ -55,7 +57,7 @@ void SpMMCsrHetero(
if (m == 0) continue; if (m == 0) continue;
DType* out = static_cast<DType*>(device->AllocWorkspace( DType* out = static_cast<DType*>(device->AllocWorkspace(
vec_csr[0].indptr->ctx, m * n * sizeof(DType))); vec_csr[0].indptr->ctx, m * n * sizeof(DType)));
CUDA_CALL(cudaMemset(out, 0, m * n * sizeof(DType))); CUDA_CALL(hipMemset(out, 0, m * n * sizeof(DType)));
trans_out[ntype] = out; trans_out[ntype] = out;
} }
} }
...@@ -116,7 +118,7 @@ void SpMMCsrHetero( ...@@ -116,7 +118,7 @@ void SpMMCsrHetero(
} }
} }
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
for (dgl_type_t etype = 0; etype < ufeat_ntids.size(); ++etype) { for (dgl_type_t etype = 0; etype < ufeat_ntids.size(); ++etype) {
const dgl_type_t src_id = ufeat_ntids[etype]; const dgl_type_t src_id = ufeat_ntids[etype];
const dgl_type_t dst_id = out_ntids[etype]; const dgl_type_t dst_id = out_ntids[etype];
...@@ -128,7 +130,7 @@ void SpMMCsrHetero( ...@@ -128,7 +130,7 @@ void SpMMCsrHetero(
cusparse_available<DType, IdType>(more_nnz)) { // cusparse cusparse_available<DType, IdType>(more_nnz)) { // cusparse
/* If CUDA is less than 11.0, put the output in trans_out for later /* If CUDA is less than 11.0, put the output in trans_out for later
* transposition */ * transposition */
DType* out = (CUDART_VERSION < 11000) DType* out = (DTKRT_VERSION < 11000)
? trans_out[dst_id] ? trans_out[dst_id]
: static_cast<DType*>((*vec_out)[dst_id]->data); : static_cast<DType*>((*vec_out)[dst_id]->data);
CusparseCsrmm2Hetero<DType, IdType>( CusparseCsrmm2Hetero<DType, IdType>(
...@@ -214,14 +216,14 @@ template void SpMMCsrHetero<kDGLCUDA, int64_t, __half>( ...@@ -214,14 +216,14 @@ template void SpMMCsrHetero<kDGLCUDA, int64_t, __half>(
const std::vector<dgl_type_t>& ufeat_ntids, const std::vector<dgl_type_t>& ufeat_ntids,
const std::vector<dgl_type_t>& out_ntids); const std::vector<dgl_type_t>& out_ntids);
#if BF16_ENABLED #if BF16_ENABLED
template void SpMMCsrHetero<kDGLCUDA, int32_t, __nv_bfloat16>( template void SpMMCsrHetero<kDGLCUDA, int32_t, __hip_bfloat16>(
const std::string& op, const std::string& reduce, const BcastOff& bcast, const std::string& op, const std::string& reduce, const BcastOff& bcast,
const std::vector<CSRMatrix>& csr, const std::vector<NDArray>& ufeat, const std::vector<CSRMatrix>& csr, const std::vector<NDArray>& ufeat,
const std::vector<NDArray>& efeat, std::vector<NDArray>* out, const std::vector<NDArray>& efeat, std::vector<NDArray>* out,
std::vector<std::vector<NDArray>>* out_aux, std::vector<std::vector<NDArray>>* out_aux,
const std::vector<dgl_type_t>& ufeat_ntids, const std::vector<dgl_type_t>& ufeat_ntids,
const std::vector<dgl_type_t>& out_ntids); const std::vector<dgl_type_t>& out_ntids);
template void SpMMCsrHetero<kDGLCUDA, int64_t, __nv_bfloat16>( template void SpMMCsrHetero<kDGLCUDA, int64_t, __hip_bfloat16>(
const std::string& op, const std::string& reduce, const BcastOff& bcast, const std::string& op, const std::string& reduce, const BcastOff& bcast,
const std::vector<CSRMatrix>& csr, const std::vector<NDArray>& ufeat, const std::vector<CSRMatrix>& csr, const std::vector<NDArray>& ufeat,
const std::vector<NDArray>& efeat, std::vector<NDArray>* out, const std::vector<NDArray>& efeat, std::vector<NDArray>* out,
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/** /**
* Copyright (c) 2020 by Contributors * Copyright (c) 2020 by Contributors
* @file array/cuda/utils.h * @file array/cuda/utils.h
...@@ -11,7 +13,7 @@ ...@@ -11,7 +13,7 @@
#include <dgl/runtime/ndarray.h> #include <dgl/runtime/ndarray.h>
#include <dmlc/logging.h> #include <dmlc/logging.h>
#include <cub/cub.cuh> #include <hipcub/hipcub.hpp>
#include <type_traits> #include <type_traits>
#include "../../runtime/cuda/cuda_common.h" #include "../../runtime/cuda/cuda_common.h"
...@@ -90,7 +92,7 @@ inline int FindNumBlocks(int nblks, int max_nblks = -1) { ...@@ -90,7 +92,7 @@ inline int FindNumBlocks(int nblks, int max_nblks = -1) {
template <typename T> template <typename T>
__device__ __forceinline__ T _ldg(T* addr) { __device__ __forceinline__ T _ldg(T* addr) {
#if __CUDA_ARCH__ >= 350 #if __HIP_DEVICE_COMPILE__
return __ldg(addr); return __ldg(addr);
#else #else
return *addr; return *addr;
...@@ -126,7 +128,7 @@ __global__ void _FillKernel(DType* ptr, size_t length, DType val) { ...@@ -126,7 +128,7 @@ __global__ void _FillKernel(DType* ptr, size_t length, DType val) {
/** @brief Fill the vector started from ptr of size length with val */ /** @brief Fill the vector started from ptr of size length with val */
template <typename DType> template <typename DType>
void _Fill(DType* ptr, size_t length, DType val) { void _Fill(DType* ptr, size_t length, DType val) {
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int nt = FindNumThreads(length); int nt = FindNumThreads(length);
int nb = int nb =
(length + nt - 1) / nt; // on x-axis, no need to worry about upperbound. (length + nt - 1) / nt; // on x-axis, no need to worry about upperbound.
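_Fill above sizes the launch as nt threads per block (from FindNumThreads) and nb = ceil(length / nt) blocks on the current stream, with the kernel launch itself outside this hunk. A standalone sketch of the same launch arithmetic with a hypothetical kernel; it assumes a grid-stride loop body and uses a plain 1024-thread cap as a stand-in for FindNumThreads:

#include <hip/hip_runtime.h>

template <typename DType>
__global__ void FillKernelSketch(DType* ptr, size_t length, DType val) {
  size_t tx = blockIdx.x * blockDim.x + threadIdx.x;
  const size_t stride_x = gridDim.x * blockDim.x;
  while (tx < length) {  // grid-stride loop: each thread handles tx, tx + stride, ...
    ptr[tx] = val;
    tx += stride_x;
  }
}

template <typename DType>
void FillSketch(DType* ptr, size_t length, DType val, hipStream_t stream) {
  if (length == 0) return;
  const int nt = length < 1024 ? static_cast<int>(length) : 1024;  // stand-in for FindNumThreads
  const int nb = static_cast<int>((length + nt - 1) / nt);         // ceil(length / nt)
  hipLaunchKernelGGL(FillKernelSketch<DType>, dim3(nb), dim3(nt), 0, stream,
                     ptr, length, val);
}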
...@@ -185,8 +187,8 @@ template <typename IdType> ...@@ -185,8 +187,8 @@ template <typename IdType>
__global__ void _LinearSearchKernel( __global__ void _LinearSearchKernel(
const IdType* indptr, const IdType* indices, const IdType* data, const IdType* indptr, const IdType* indices, const IdType* data,
const IdType* row, const IdType* col, int64_t row_stride, const IdType* row, const IdType* col, int64_t row_stride,
int64_t col_stride, int64_t length, const __nv_bfloat16* weights, int64_t col_stride, int64_t length, const __hip_bfloat16* weights,
__nv_bfloat16 filler, __nv_bfloat16* out) { __hip_bfloat16 filler, __hip_bfloat16* out) {
int tx = blockIdx.x * blockDim.x + threadIdx.x; int tx = blockIdx.x * blockDim.x + threadIdx.x;
const int stride_x = gridDim.x * blockDim.x; const int stride_x = gridDim.x * blockDim.x;
while (tx < length) { while (tx < length) {
...@@ -204,7 +206,7 @@ __global__ void _LinearSearchKernel( ...@@ -204,7 +206,7 @@ __global__ void _LinearSearchKernel(
} else { } else {
// If the result is saved in bf16, it should be fine to convert it to // If the result is saved in bf16, it should be fine to convert it to
// float first // float first
out[tx] = weights ? weights[v] : __nv_bfloat16(static_cast<float>(v)); out[tx] = weights ? weights[v] : __hip_bfloat16(static_cast<float>(v));
} }
tx += stride_x; tx += stride_x;
} }
...@@ -277,12 +279,12 @@ template <typename DType, typename BoolType> ...@@ -277,12 +279,12 @@ template <typename DType, typename BoolType>
void MaskSelect( void MaskSelect(
runtime::DeviceAPI* device, const DGLContext& ctx, const DType* input, runtime::DeviceAPI* device, const DGLContext& ctx, const DType* input,
const BoolType* mask, DType* output, int64_t n, int64_t* rst, const BoolType* mask, DType* output, int64_t n, int64_t* rst,
cudaStream_t stream) { hipStream_t stream) {
size_t workspace_size = 0; size_t workspace_size = 0;
CUDA_CALL(cub::DeviceSelect::Flagged( CUDA_CALL(hipcub::DeviceSelect::Flagged(
nullptr, workspace_size, input, mask, output, rst, n, stream)); nullptr, workspace_size, input, mask, output, rst, n, stream));
void* workspace = device->AllocWorkspace(ctx, workspace_size); void* workspace = device->AllocWorkspace(ctx, workspace_size);
CUDA_CALL(cub::DeviceSelect::Flagged( CUDA_CALL(hipcub::DeviceSelect::Flagged(
workspace, workspace_size, input, mask, output, rst, n, stream)); workspace, workspace_size, input, mask, output, rst, n, stream));
device->FreeWorkspace(ctx, workspace); device->FreeWorkspace(ctx, workspace);
} }
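MaskSelect follows the standard two-phase hipCUB idiom: the first DeviceSelect::Flagged call receives a null temporary buffer and only reports the required workspace size, and the second call performs the compaction into output while writing the kept count into rst. A self-contained integer sketch of the same idiom (buffer names are illustrative, error checking omitted):

#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>
#include <cstdio>
#include <vector>

int main() {
  // Keep the elements of `in` whose flag is non-zero.
  std::vector<int>  h_in    = {10, 20, 30, 40, 50};
  std::vector<char> h_flags = {1, 0, 1, 0, 1};
  const int n = static_cast<int>(h_in.size());

  int *d_in, *d_out;
  char* d_flags;
  int64_t* d_num_selected;
  hipMalloc(&d_in, n * sizeof(int));
  hipMalloc(&d_out, n * sizeof(int));
  hipMalloc(&d_flags, n * sizeof(char));
  hipMalloc(&d_num_selected, sizeof(int64_t));
  hipMemcpy(d_in, h_in.data(), n * sizeof(int), hipMemcpyHostToDevice);
  hipMemcpy(d_flags, h_flags.data(), n * sizeof(char), hipMemcpyHostToDevice);

  // Phase 1: null workspace pointer -> only compute the temporary storage size.
  size_t temp_bytes = 0;
  hipcub::DeviceSelect::Flagged(nullptr, temp_bytes, d_in, d_flags, d_out,
                                d_num_selected, n);
  void* d_temp;
  hipMalloc(&d_temp, temp_bytes);

  // Phase 2: the real compaction pass.
  hipcub::DeviceSelect::Flagged(d_temp, temp_bytes, d_in, d_flags, d_out,
                                d_num_selected, n);

  int64_t h_num = 0;
  hipMemcpy(&h_num, d_num_selected, sizeof(int64_t), hipMemcpyDeviceToHost);
  std::vector<int> h_out(h_num);
  hipMemcpy(h_out.data(), d_out, h_num * sizeof(int), hipMemcpyDeviceToHost);
  // Expected: kept 3 values: 10 30 50
  printf("kept %lld values: %d %d %d\n", (long long)h_num, h_out[0], h_out[1], h_out[2]);

  hipFree(d_in); hipFree(d_out); hipFree(d_flags);
  hipFree(d_num_selected); hipFree(d_temp);
  return 0;
}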
...@@ -290,7 +292,7 @@ void MaskSelect( ...@@ -290,7 +292,7 @@ void MaskSelect(
inline void* GetDevicePointer(runtime::NDArray array) { inline void* GetDevicePointer(runtime::NDArray array) {
void* ptr = array->data; void* ptr = array->data;
if (array.IsPinned()) { if (array.IsPinned()) {
CUDA_CALL(cudaHostGetDevicePointer(&ptr, ptr, 0)); CUDA_CALL(hipHostGetDevicePointer(&ptr, ptr, 0));
} }
return ptr; return ptr;
} }
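GetDevicePointer resolves a device-visible address only for pinned arrays, which is what hipHostGetDevicePointer provides for mapped pinned host memory. A minimal sketch, assuming the buffer is allocated as mapped pinned memory with hipHostMalloc:

#include <hip/hip_runtime.h>
#include <cstdio>

int main() {
  float* h_buf = nullptr;
  // Pinned, device-mapped host allocation.
  if (hipHostMalloc(&h_buf, 4 * sizeof(float), hipHostMallocMapped) != hipSuccess) return 1;
  h_buf[0] = 42.f;

  // Device-visible alias of the same pinned buffer; kernels can read/write it directly.
  float* d_alias = nullptr;
  if (hipHostGetDevicePointer(reinterpret_cast<void**>(&d_alias), h_buf, 0) != hipSuccess) return 1;
  printf("host %p maps to device %p\n", (void*)h_buf, (void*)d_alias);

  hipHostFree(h_buf);
  return 0;
}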
......