Commit 6ac701f8 authored by sangwzh

update src and graphbolt code

parent 1547bd93
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/** /**
* Copyright (c) 2021 by Contributors * Copyright (c) 2021 by Contributors
* @file array/cuda/negative_sampling.cu * @file array/cuda/negative_sampling.cu
* @brief rowwise sampling * @brief rowwise sampling
*/ */
#include <curand_kernel.h> #include <hiprand/hiprand_kernel.h>
#include <dgl/array.h> #include <dgl/array.h>
#include <dgl/array_iterator.h> #include <dgl/array_iterator.h>
#include <dgl/random.h> #include <dgl/random.h>
#include <cub/cub.cuh> #include <hipcub/hipcub.hpp>
#include "../../runtime/cuda/cuda_common.h" #include "../../runtime/cuda/cuda_common.h"
#include "./utils.h" #include "utils.h"
using namespace dgl::runtime; using namespace dgl::runtime;
...@@ -31,13 +33,13 @@ __global__ void _GlobalUniformNegativeSamplingKernel( ...@@ -31,13 +33,13 @@ __global__ void _GlobalUniformNegativeSamplingKernel(
int64_t tx = blockIdx.x * blockDim.x + threadIdx.x; int64_t tx = blockIdx.x * blockDim.x + threadIdx.x;
const int stride_x = gridDim.x * blockDim.x; const int stride_x = gridDim.x * blockDim.x;
curandStatePhilox4_32_10_t hiprandStatePhilox4_32_10_t
rng; // this allows generating 4 32-bit ints at a time rng; // this allows generating 4 32-bit ints at a time
curand_init(random_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng); hiprand_init(random_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
while (tx < num_samples) { while (tx < num_samples) {
for (int i = 0; i < num_trials; ++i) { for (int i = 0; i < num_trials; ++i) {
uint4 result = curand4(&rng); uint4 result = hiprand4(&rng);
// Turns out that result.x is always 0 with the above RNG. // Turns out that result.x is always 0 with the above RNG.
uint64_t y_hi = result.y >> 16; uint64_t y_hi = result.y >> 16;
uint64_t y_lo = result.y & 0xFFFF; uint64_t y_lo = result.y & 0xFFFF;
...@@ -88,7 +90,7 @@ struct IsNotMinusOne { ...@@ -88,7 +90,7 @@ struct IsNotMinusOne {
template <typename IdType> template <typename IdType>
void SortOrderedPairs( void SortOrderedPairs(
runtime::DeviceAPI* device, DGLContext ctx, IdType* major, IdType* minor, runtime::DeviceAPI* device, DGLContext ctx, IdType* major, IdType* minor,
IdType* tmp_major, IdType* tmp_minor, int64_t n, cudaStream_t stream) { IdType* tmp_major, IdType* tmp_minor, int64_t n, hipStream_t stream) {
// Sort ordered pairs in lexicographical order by two radix sorts since // Sort ordered pairs in lexicographical order by two radix sorts since
// cub's radix sorts are stable. // cub's radix sorts are stable.
// We need a 2*n auxiliary storage to store the results form the first radix // We need a 2*n auxiliary storage to store the results form the first radix
...@@ -98,21 +100,21 @@ void SortOrderedPairs( ...@@ -98,21 +100,21 @@ void SortOrderedPairs(
void* tmp2 = nullptr; void* tmp2 = nullptr;
// Radix sort by minor key first, reorder the major key in the progress. // Radix sort by minor key first, reorder the major key in the progress.
CUDA_CALL(cub::DeviceRadixSort::SortPairs( CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
tmp1, s1, minor, tmp_minor, major, tmp_major, n, 0, sizeof(IdType) * 8, tmp1, s1, minor, tmp_minor, major, tmp_major, n, 0, sizeof(IdType) * 8,
stream)); stream));
tmp1 = device->AllocWorkspace(ctx, s1); tmp1 = device->AllocWorkspace(ctx, s1);
CUDA_CALL(cub::DeviceRadixSort::SortPairs( CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
tmp1, s1, minor, tmp_minor, major, tmp_major, n, 0, sizeof(IdType) * 8, tmp1, s1, minor, tmp_minor, major, tmp_major, n, 0, sizeof(IdType) * 8,
stream)); stream));
// Radix sort by major key next. // Radix sort by major key next.
CUDA_CALL(cub::DeviceRadixSort::SortPairs( CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
tmp2, s2, tmp_major, major, tmp_minor, minor, n, 0, sizeof(IdType) * 8, tmp2, s2, tmp_major, major, tmp_minor, minor, n, 0, sizeof(IdType) * 8,
stream)); stream));
tmp2 = (s2 > s1) ? device->AllocWorkspace(ctx, s2) tmp2 = (s2 > s1) ? device->AllocWorkspace(ctx, s2)
: tmp1; // reuse buffer if s2 <= s1 : tmp1; // reuse buffer if s2 <= s1
CUDA_CALL(cub::DeviceRadixSort::SortPairs( CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
tmp2, s2, tmp_major, major, tmp_minor, minor, n, 0, sizeof(IdType) * 8, tmp2, s2, tmp_major, major, tmp_minor, minor, n, 0, sizeof(IdType) * 8,
stream)); stream));
...@@ -141,7 +143,7 @@ std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling( ...@@ -141,7 +143,7 @@ std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling(
IdType* out_row_data = out_row.Ptr<IdType>(); IdType* out_row_data = out_row.Ptr<IdType>();
IdType* out_col_data = out_col.Ptr<IdType>(); IdType* out_col_data = out_col.Ptr<IdType>();
auto device = runtime::DeviceAPI::Get(ctx); auto device = runtime::DeviceAPI::Get(ctx);
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const int nt = cuda::FindNumThreads(num_actual_samples); const int nt = cuda::FindNumThreads(num_actual_samples);
const int nb = (num_actual_samples + nt - 1) / nt; const int nb = (num_actual_samples + nt - 1) / nt;
std::pair<IdArray, IdArray> result; std::pair<IdArray, IdArray> result;
...@@ -159,11 +161,11 @@ std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling( ...@@ -159,11 +161,11 @@ std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling(
IsNotMinusOne<IdType> op; IsNotMinusOne<IdType> op;
PairIterator<IdType> begin(row_data, col_data); PairIterator<IdType> begin(row_data, col_data);
PairIterator<IdType> out_begin(out_row_data, out_col_data); PairIterator<IdType> out_begin(out_row_data, out_col_data);
CUDA_CALL(cub::DeviceSelect::If( CUDA_CALL(hipcub::DeviceSelect::If(
nullptr, tmp_size, begin, out_begin, num_out_cuda, num_actual_samples, op, nullptr, tmp_size, begin, out_begin, num_out_cuda, num_actual_samples, op,
stream)); stream));
void* tmp = device->AllocWorkspace(ctx, tmp_size); void* tmp = device->AllocWorkspace(ctx, tmp_size);
CUDA_CALL(cub::DeviceSelect::If( CUDA_CALL(hipcub::DeviceSelect::If(
tmp, tmp_size, begin, out_begin, num_out_cuda, num_actual_samples, op, tmp, tmp_size, begin, out_begin, num_out_cuda, num_actual_samples, op,
stream)); stream));
num_out = cuda::GetCUDAScalar(device, ctx, num_out_cuda); num_out = cuda::GetCUDAScalar(device, ctx, num_out_cuda);
...@@ -181,25 +183,25 @@ std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling( ...@@ -181,25 +183,25 @@ std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling(
size_t tmp_size_unique = 0; size_t tmp_size_unique = 0;
void* tmp_unique = nullptr; void* tmp_unique = nullptr;
CUDA_CALL(cub::DeviceSelect::Unique( CUDA_CALL(hipcub::DeviceSelect::Unique(
nullptr, tmp_size_unique, out_begin, unique_begin, num_out_cuda, nullptr, tmp_size_unique, out_begin, unique_begin, num_out_cuda,
num_out, stream)); num_out, stream));
tmp_unique = (tmp_size_unique > tmp_size) tmp_unique = (tmp_size_unique > tmp_size)
? device->AllocWorkspace(ctx, tmp_size_unique) ? device->AllocWorkspace(ctx, tmp_size_unique)
: tmp; // reuse buffer : tmp; // reuse buffer
CUDA_CALL(cub::DeviceSelect::Unique( CUDA_CALL(hipcub::DeviceSelect::Unique(
tmp_unique, tmp_size_unique, out_begin, unique_begin, num_out_cuda, tmp_unique, tmp_size_unique, out_begin, unique_begin, num_out_cuda,
num_out, stream)); num_out, stream));
num_out = cuda::GetCUDAScalar(device, ctx, num_out_cuda); num_out = cuda::GetCUDAScalar(device, ctx, num_out_cuda);
num_out = std::min(num_samples, num_out); num_out = ::min(num_samples, num_out);
result = { result = {
unique_row.CreateView({num_out}, dtype), unique_row.CreateView({num_out}, dtype),
unique_col.CreateView({num_out}, dtype)}; unique_col.CreateView({num_out}, dtype)};
if (tmp_unique != tmp) device->FreeWorkspace(ctx, tmp_unique); if (tmp_unique != tmp) device->FreeWorkspace(ctx, tmp_unique);
} else { } else {
num_out = std::min(num_samples, num_out); num_out = ::min(num_samples, num_out);
result = { result = {
out_row.CreateView({num_out}, dtype), out_row.CreateView({num_out}, dtype),
out_col.CreateView({num_out}, dtype)}; out_col.CreateView({num_out}, dtype)};
......
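Note: in the hipified negative-sampling kernel above, curandStatePhilox4_32_10_t / curand_init / curand4 are replaced by their hiprand counterparts with the same call shape. Below is a minimal standalone sketch of that pattern; the kernel and buffer names are illustrative, not DGL's.

#include <hip/hip_runtime.h>
#include <hiprand/hiprand_kernel.h>
#include <cstdint>

// Each thread seeds a Philox generator the same way the sampling kernel above
// does, then draws four 32-bit values per call.
__global__ void philox_demo_kernel(uint64_t seed, unsigned int* out, int64_t n) {
  int64_t tx = blockIdx.x * blockDim.x + threadIdx.x;
  hiprandStatePhilox4_32_10_t rng;
  hiprand_init(seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
  if (tx < n) {
    uint4 r = hiprand4(&rng);  // four 32-bit draws at once
    out[tx] = r.y;             // the kernel above likewise uses .y/.z/.w rather than .x
  }
}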
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/** /**
* Copyright (c) 2021 by Contributors * Copyright (c) 2021 by Contributors
* @file array/cuda/rowwise_sampling.cu * @file array/cuda/rowwise_sampling.cu
* @brief uniform rowwise sampling * @brief uniform rowwise sampling
*/ */
#include <curand_kernel.h> #include <hiprand/hiprand_kernel.h>
#include <dgl/random.h> #include <dgl/random.h>
#include <dgl/runtime/device_api.h> #include <dgl/runtime/device_api.h>
#include <dgl/runtime/tensordispatch.h> #include <dgl/runtime/tensordispatch.h>
#include <cub/cub.cuh> #include <hipcub/hipcub.hpp>
#include <numeric> #include <numeric>
#include "../../array/cuda/atomic.cuh" #include "atomic.cuh"
#include "../../runtime/cuda/cuda_common.h" #include "../../runtime/cuda/cuda_common.h"
#include "./utils.h" #include "utils.h"
using namespace dgl::cuda; using namespace dgl::cuda;
using namespace dgl::aten::cuda; using namespace dgl::aten::cuda;
...@@ -126,8 +128,8 @@ __global__ void _CSRRowWiseSampleUniformKernel( ...@@ -126,8 +128,8 @@ __global__ void _CSRRowWiseSampleUniformKernel(
const int64_t last_row = const int64_t last_row =
min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows); min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
curandStatePhilox4_32_10_t rng; hiprandStatePhilox4_32_10_t rng;
curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng); hiprand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
while (out_row < last_row) { while (out_row < last_row) {
const int64_t row = in_rows[out_row]; const int64_t row = in_rows[out_row];
...@@ -151,7 +153,7 @@ __global__ void _CSRRowWiseSampleUniformKernel( ...@@ -151,7 +153,7 @@ __global__ void _CSRRowWiseSampleUniformKernel(
__syncthreads(); __syncthreads();
for (int idx = num_picks + threadIdx.x; idx < deg; idx += BLOCK_SIZE) { for (int idx = num_picks + threadIdx.x; idx < deg; idx += BLOCK_SIZE) {
const int num = curand(&rng) % (idx + 1); const int num = hiprand(&rng) % (idx + 1);
if (num < num_picks) { if (num < num_picks) {
// use max so as to achieve the replacement order the serial // use max so as to achieve the replacement order the serial
// algorithm would have // algorithm would have
...@@ -204,8 +206,8 @@ __global__ void _CSRRowWiseSampleUniformReplaceKernel( ...@@ -204,8 +206,8 @@ __global__ void _CSRRowWiseSampleUniformReplaceKernel(
const int64_t last_row = const int64_t last_row =
min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows); min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
curandStatePhilox4_32_10_t rng; hiprandStatePhilox4_32_10_t rng;
curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng); hiprand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
while (out_row < last_row) { while (out_row < last_row) {
const int64_t row = in_rows[out_row]; const int64_t row = in_rows[out_row];
...@@ -216,7 +218,7 @@ __global__ void _CSRRowWiseSampleUniformReplaceKernel( ...@@ -216,7 +218,7 @@ __global__ void _CSRRowWiseSampleUniformReplaceKernel(
if (deg > 0) { if (deg > 0) {
// each thread then blindly copies in rows only if deg > 0. // each thread then blindly copies in rows only if deg > 0.
for (int idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) { for (int idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) {
const int64_t edge = curand(&rng) % deg; const int64_t edge = hiprand(&rng) % deg;
const int64_t out_idx = out_row_start + idx; const int64_t out_idx = out_row_start + idx;
out_rows[out_idx] = row; out_rows[out_idx] = row;
out_cols[out_idx] = in_index[in_row_start + edge]; out_cols[out_idx] = in_index[in_row_start + edge];
...@@ -237,7 +239,7 @@ COOMatrix _CSRRowWiseSamplingUniform( ...@@ -237,7 +239,7 @@ COOMatrix _CSRRowWiseSamplingUniform(
CSRMatrix mat, IdArray rows, const int64_t num_picks, const bool replace) { CSRMatrix mat, IdArray rows, const int64_t num_picks, const bool replace) {
const auto& ctx = rows->ctx; const auto& ctx = rows->ctx;
auto device = runtime::DeviceAPI::Get(ctx); auto device = runtime::DeviceAPI::Get(ctx);
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const int64_t num_rows = rows->shape[0]; const int64_t num_rows = rows->shape[0];
const IdType* const slice_rows = static_cast<const IdType*>(rows->data); const IdType* const slice_rows = static_cast<const IdType*>(rows->data);
...@@ -279,16 +281,16 @@ COOMatrix _CSRRowWiseSamplingUniform( ...@@ -279,16 +281,16 @@ COOMatrix _CSRRowWiseSamplingUniform(
IdType* out_ptr = static_cast<IdType*>( IdType* out_ptr = static_cast<IdType*>(
device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType))); device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType)));
size_t prefix_temp_size = 0; size_t prefix_temp_size = 0;
CUDA_CALL(cub::DeviceScan::ExclusiveSum( CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
nullptr, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream)); nullptr, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream));
void* prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size); void* prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size);
CUDA_CALL(cub::DeviceScan::ExclusiveSum( CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
prefix_temp, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream)); prefix_temp, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream));
device->FreeWorkspace(ctx, prefix_temp); device->FreeWorkspace(ctx, prefix_temp);
device->FreeWorkspace(ctx, out_deg); device->FreeWorkspace(ctx, out_deg);
cudaEvent_t copyEvent; hipEvent_t copyEvent;
CUDA_CALL(cudaEventCreate(&copyEvent)); CUDA_CALL(hipEventCreate(&copyEvent));
NDArray new_len_tensor; NDArray new_len_tensor;
if (TensorDispatcher::Global()->IsAvailable()) { if (TensorDispatcher::Global()->IsAvailable()) {
...@@ -301,10 +303,10 @@ COOMatrix _CSRRowWiseSamplingUniform( ...@@ -301,10 +303,10 @@ COOMatrix _CSRRowWiseSamplingUniform(
} }
// copy using the internal current stream // copy using the internal current stream
CUDA_CALL(cudaMemcpyAsync( CUDA_CALL(hipMemcpyAsync(
new_len_tensor->data, out_ptr + num_rows, sizeof(IdType), new_len_tensor->data, out_ptr + num_rows, sizeof(IdType),
cudaMemcpyDeviceToHost, stream)); hipMemcpyDeviceToHost, stream));
CUDA_CALL(cudaEventRecord(copyEvent, stream)); CUDA_CALL(hipEventRecord(copyEvent, stream));
const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000); const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000);
...@@ -329,8 +331,8 @@ COOMatrix _CSRRowWiseSamplingUniform( ...@@ -329,8 +331,8 @@ COOMatrix _CSRRowWiseSamplingUniform(
device->FreeWorkspace(ctx, out_ptr); device->FreeWorkspace(ctx, out_ptr);
// wait for copying `new_len` to finish // wait for copying `new_len` to finish
CUDA_CALL(cudaEventSynchronize(copyEvent)); CUDA_CALL(hipEventSynchronize(copyEvent));
CUDA_CALL(cudaEventDestroy(copyEvent)); CUDA_CALL(hipEventDestroy(copyEvent));
const IdType new_len = static_cast<const IdType*>(new_len_tensor->data)[0]; const IdType new_len = static_cast<const IdType*>(new_len_tensor->data)[0];
picked_row = picked_row.CreateView({new_len}, picked_row->dtype); picked_row = picked_row.CreateView({new_len}, picked_row->dtype);
......
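Note: the file above keeps DGL's two-phase hipCUB prefix sum (size query with a null workspace, then the real call) and synchronizes the device-to-host copy of the total edge count with a hipEvent_t. A condensed sketch of that pattern outside DGL's allocator machinery follows; the function and buffer names are illustrative.

#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>

// d_deg and d_ptr each hold num_rows + 1 entries; after the exclusive scan,
// d_ptr[num_rows] is the grand total, which is copied back asynchronously and
// waited on through an event, as in _CSRRowWiseSamplingUniform above.
inline void exclusive_sum_with_total(const int* d_deg, int* d_ptr, int num_rows,
                                     hipStream_t stream, int* host_total) {
  size_t temp_bytes = 0;
  hipcub::DeviceScan::ExclusiveSum(nullptr, temp_bytes, d_deg, d_ptr,
                                   num_rows + 1, stream);  // size query only
  void* d_temp = nullptr;
  hipMalloc(&d_temp, temp_bytes);
  hipcub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, d_deg, d_ptr,
                                   num_rows + 1, stream);

  hipEvent_t copy_done;
  hipEventCreate(&copy_done);
  hipMemcpyAsync(host_total, d_ptr + num_rows, sizeof(int),
                 hipMemcpyDeviceToHost, stream);
  hipEventRecord(copy_done, stream);
  hipEventSynchronize(copy_done);  // wait for the copy before reading host_total
  hipEventDestroy(copy_done);
  hipFree(d_temp);
}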
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/** /**
* Copyright (c) 2022 by Contributors * Copyright (c) 2022 by Contributors
* @file array/cuda/rowwise_sampling_prob.cu * @file array/cuda/rowwise_sampling_prob.cu
...@@ -6,20 +8,20 @@ ...@@ -6,20 +8,20 @@
* sampling code rowwise_sampling.cu. * sampling code rowwise_sampling.cu.
* @author pengqirong (OPPO), dlasalle and Xin from Nvidia. * @author pengqirong (OPPO), dlasalle and Xin from Nvidia.
*/ */
#include <curand_kernel.h> #include <hiprand/hiprand_kernel.h>
#include <dgl/random.h> #include <dgl/random.h>
#include <dgl/runtime/device_api.h> #include <dgl/runtime/device_api.h>
#include <cub/cub.cuh> #include <hipcub/hipcub.hpp>
#include <numeric> #include <numeric>
#include "../../array/cuda/atomic.cuh" #include "atomic.cuh"
#include "../../runtime/cuda/cuda_common.h" #include "../../runtime/cuda/cuda_common.h"
#include "./utils.h" #include "utils.h"
// require CUB 1.17 to use DeviceSegmentedSort // require CUB 1.17 to use DeviceSegmentedSort
static_assert( // static_assert(
CUB_VERSION >= 101700, "Require CUB >= 1.17 to use DeviceSegmentedSort"); // CUB_VERSION >= 101700, "Require CUB >= 1.17 to use DeviceSegmentedSort");
namespace dgl { namespace dgl {
using namespace cuda; using namespace cuda;
...@@ -159,8 +161,8 @@ __global__ void _CSRAResValueKernel( ...@@ -159,8 +161,8 @@ __global__ void _CSRAResValueKernel(
const int64_t last_row = const int64_t last_row =
min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows); min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
curandStatePhilox4_32_10_t rng; hiprandStatePhilox4_32_10_t rng;
curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng); hiprand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
while (out_row < last_row) { while (out_row < last_row) {
const int64_t row = in_rows[out_row]; const int64_t row = in_rows[out_row];
...@@ -179,7 +181,7 @@ __global__ void _CSRAResValueKernel( ...@@ -179,7 +181,7 @@ __global__ void _CSRAResValueKernel(
prob, data, idx, in_row_start, &item_prob); prob, data, idx, in_row_start, &item_prob);
// compute A-Res value // compute A-Res value
ares[ares_idx] = static_cast<FloatType>( ares[ares_idx] = static_cast<FloatType>(
__powf(curand_uniform(&rng), 1.0f / item_prob)); __powf(hiprand_uniform(&rng), 1.0f / item_prob));
ares_idxs[ares_idx] = static_cast<IdType>(in_idx); ares_idxs[ares_idx] = static_cast<IdType>(in_idx);
} }
} }
...@@ -317,8 +319,8 @@ __global__ void _CSRRowWiseSampleReplaceKernel( ...@@ -317,8 +319,8 @@ __global__ void _CSRRowWiseSampleReplaceKernel(
const int64_t last_row = const int64_t last_row =
min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows); min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
curandStatePhilox4_32_10_t rng; hiprandStatePhilox4_32_10_t rng;
curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng); hiprand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
while (out_row < last_row) { while (out_row < last_row) {
const int64_t row = in_rows[out_row]; const int64_t row = in_rows[out_row];
...@@ -330,7 +332,7 @@ __global__ void _CSRRowWiseSampleReplaceKernel( ...@@ -330,7 +332,7 @@ __global__ void _CSRRowWiseSampleReplaceKernel(
if (deg > 0) { if (deg > 0) {
// Specialize BlockScan for a 1D block of BLOCK_SIZE threads // Specialize BlockScan for a 1D block of BLOCK_SIZE threads
typedef cub::BlockScan<FloatType, BLOCK_SIZE> BlockScan; typedef hipcub::BlockScan<FloatType, BLOCK_SIZE> BlockScan;
// Allocate shared memory for BlockScan // Allocate shared memory for BlockScan
__shared__ typename BlockScan::TempStorage temp_storage; __shared__ typename BlockScan::TempStorage temp_storage;
// Initialize running total // Initialize running total
...@@ -362,10 +364,10 @@ __global__ void _CSRRowWiseSampleReplaceKernel( ...@@ -362,10 +364,10 @@ __global__ void _CSRRowWiseSampleReplaceKernel(
for (int64_t idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) { for (int64_t idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) {
// get random value // get random value
FloatType sum = cdf[cdf_row_start + deg - 1]; FloatType sum = cdf[cdf_row_start + deg - 1];
FloatType rand = static_cast<FloatType>(curand_uniform(&rng) * sum); FloatType rand = static_cast<FloatType>(hiprand_uniform(&rng) * sum);
// get the offset of the first value within cdf array which is greater // get the offset of the first value within cdf array which is greater
// than random value. // than random value.
int64_t item = cub::UpperBound<FloatType*, int64_t, FloatType>( int64_t item = hipcub::UpperBound<FloatType*, int64_t, FloatType>(
&cdf[cdf_row_start], deg, rand); &cdf[cdf_row_start], deg, rand);
item = min(item, deg - 1); item = min(item, deg - 1);
// get in and out index // get in and out index
...@@ -411,7 +413,7 @@ COOMatrix COOGeneralRemoveIf(const COOMatrix& coo, MaskGen maskgen) { ...@@ -411,7 +413,7 @@ COOMatrix COOGeneralRemoveIf(const COOMatrix& coo, MaskGen maskgen) {
IdType* new_row_data = new_row.Ptr<IdType>(); IdType* new_row_data = new_row.Ptr<IdType>();
IdType* new_col_data = new_col.Ptr<IdType>(); IdType* new_col_data = new_col.Ptr<IdType>();
IdType* new_eid_data = new_eid.Ptr<IdType>(); IdType* new_eid_data = new_eid.Ptr<IdType>();
auto stream = runtime::getCurrentCUDAStream(); auto stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
auto device = runtime::DeviceAPI::Get(ctx); auto device = runtime::DeviceAPI::Get(ctx);
int8_t* flags = static_cast<int8_t*>(device->AllocWorkspace(ctx, nnz)); int8_t* flags = static_cast<int8_t*>(device->AllocWorkspace(ctx, nnz));
...@@ -441,7 +443,7 @@ COOMatrix _COORemoveIf( ...@@ -441,7 +443,7 @@ COOMatrix _COORemoveIf(
const COOMatrix& coo, const NDArray& values, DType criteria) { const COOMatrix& coo, const NDArray& values, DType criteria) {
const DType* val = values.Ptr<DType>(); const DType* val = values.Ptr<DType>();
auto maskgen = [val, criteria]( auto maskgen = [val, criteria](
int nb, int nt, cudaStream_t stream, int64_t nnz, int nb, int nt, hipStream_t stream, int64_t nnz,
const IdType* data, int8_t* flags) { const IdType* data, int8_t* flags) {
CUDA_KERNEL_CALL( CUDA_KERNEL_CALL(
(_GenerateFlagsKernel<IdType, DType, int8_t>), nb, nt, 0, stream, nnz, (_GenerateFlagsKernel<IdType, DType, int8_t>), nb, nt, 0, stream, nnz,
...@@ -481,7 +483,7 @@ COOMatrix _CSRRowWiseSampling( ...@@ -481,7 +483,7 @@ COOMatrix _CSRRowWiseSampling(
const FloatArray& prob, bool replace) { const FloatArray& prob, bool replace) {
const auto& ctx = rows->ctx; const auto& ctx = rows->ctx;
auto device = runtime::DeviceAPI::Get(ctx); auto device = runtime::DeviceAPI::Get(ctx);
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const int64_t num_rows = rows->shape[0]; const int64_t num_rows = rows->shape[0];
const IdType* const slice_rows = static_cast<const IdType*>(rows->data); const IdType* const slice_rows = static_cast<const IdType*>(rows->data);
...@@ -530,10 +532,10 @@ COOMatrix _CSRRowWiseSampling( ...@@ -530,10 +532,10 @@ COOMatrix _CSRRowWiseSampling(
IdType* temp_ptr = static_cast<IdType*>( IdType* temp_ptr = static_cast<IdType*>(
device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType))); device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType)));
size_t prefix_temp_size = 0; size_t prefix_temp_size = 0;
CUDA_CALL(cub::DeviceScan::ExclusiveSum( CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
nullptr, prefix_temp_size, temp_deg, temp_ptr, num_rows + 1, stream)); nullptr, prefix_temp_size, temp_deg, temp_ptr, num_rows + 1, stream));
void* prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size); void* prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size);
CUDA_CALL(cub::DeviceScan::ExclusiveSum( CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
prefix_temp, prefix_temp_size, temp_deg, temp_ptr, num_rows + 1, stream)); prefix_temp, prefix_temp_size, temp_deg, temp_ptr, num_rows + 1, stream));
device->FreeWorkspace(ctx, prefix_temp); device->FreeWorkspace(ctx, prefix_temp);
device->FreeWorkspace(ctx, temp_deg); device->FreeWorkspace(ctx, temp_deg);
...@@ -551,16 +553,16 @@ COOMatrix _CSRRowWiseSampling( ...@@ -551,16 +553,16 @@ COOMatrix _CSRRowWiseSampling(
IdType* out_ptr = static_cast<IdType*>( IdType* out_ptr = static_cast<IdType*>(
device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType))); device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType)));
prefix_temp_size = 0; prefix_temp_size = 0;
CUDA_CALL(cub::DeviceScan::ExclusiveSum( CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
nullptr, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream)); nullptr, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream));
prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size); prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size);
CUDA_CALL(cub::DeviceScan::ExclusiveSum( CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
prefix_temp, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream)); prefix_temp, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream));
device->FreeWorkspace(ctx, prefix_temp); device->FreeWorkspace(ctx, prefix_temp);
device->FreeWorkspace(ctx, out_deg); device->FreeWorkspace(ctx, out_deg);
cudaEvent_t copyEvent; hipEvent_t copyEvent;
CUDA_CALL(cudaEventCreate(&copyEvent)); CUDA_CALL(hipEventCreate(&copyEvent));
// TODO(dlasalle): use pinned memory to overlap with the actual sampling, and // TODO(dlasalle): use pinned memory to overlap with the actual sampling, and
// wait on a cudaevent // wait on a cudaevent
IdType new_len; IdType new_len;
...@@ -568,7 +570,7 @@ COOMatrix _CSRRowWiseSampling( ...@@ -568,7 +570,7 @@ COOMatrix _CSRRowWiseSampling(
device->CopyDataFromTo( device->CopyDataFromTo(
out_ptr, num_rows * sizeof(new_len), &new_len, 0, sizeof(new_len), ctx, out_ptr, num_rows * sizeof(new_len), &new_len, 0, sizeof(new_len), ctx,
DGLContext{kDGLCPU, 0}, mat.indptr->dtype); DGLContext{kDGLCPU, 0}, mat.indptr->dtype);
CUDA_CALL(cudaEventRecord(copyEvent, stream)); CUDA_CALL(hipEventRecord(copyEvent, stream));
// allocate workspace // allocate workspace
// 1) for w/ replacement, it's a global buffer to store cdf segments (one // 1) for w/ replacement, it's a global buffer to store cdf segments (one
...@@ -612,16 +614,16 @@ COOMatrix _CSRRowWiseSampling( ...@@ -612,16 +614,16 @@ COOMatrix _CSRRowWiseSampling(
IdType* sort_temp_idxs = static_cast<IdType*>( IdType* sort_temp_idxs = static_cast<IdType*>(
device->AllocWorkspace(ctx, temp_len * sizeof(IdType))); device->AllocWorkspace(ctx, temp_len * sizeof(IdType)));
cub::DoubleBuffer<FloatType> sort_keys(temp, sort_temp); hipcub::DoubleBuffer<FloatType> sort_keys(temp, sort_temp);
cub::DoubleBuffer<IdType> sort_values(temp_idxs, sort_temp_idxs); hipcub::DoubleBuffer<IdType> sort_values(temp_idxs, sort_temp_idxs);
void* d_temp_storage = nullptr; void* d_temp_storage = nullptr;
size_t temp_storage_bytes = 0; size_t temp_storage_bytes = 0;
CUDA_CALL(cub::DeviceSegmentedSort::SortPairsDescending( CUDA_CALL(hipcub::DeviceSegmentedSort::SortPairsDescending(
d_temp_storage, temp_storage_bytes, sort_keys, sort_values, temp_len, d_temp_storage, temp_storage_bytes, sort_keys, sort_values, temp_len,
num_rows, temp_ptr, temp_ptr + 1, stream)); num_rows, temp_ptr, temp_ptr + 1, stream));
d_temp_storage = device->AllocWorkspace(ctx, temp_storage_bytes); d_temp_storage = device->AllocWorkspace(ctx, temp_storage_bytes);
CUDA_CALL(cub::DeviceSegmentedSort::SortPairsDescending( CUDA_CALL(hipcub::DeviceSegmentedSort::SortPairsDescending(
d_temp_storage, temp_storage_bytes, sort_keys, sort_values, temp_len, d_temp_storage, temp_storage_bytes, sort_keys, sort_values, temp_len,
num_rows, temp_ptr, temp_ptr + 1, stream)); num_rows, temp_ptr, temp_ptr + 1, stream));
device->FreeWorkspace(ctx, d_temp_storage); device->FreeWorkspace(ctx, d_temp_storage);
...@@ -641,8 +643,8 @@ COOMatrix _CSRRowWiseSampling( ...@@ -641,8 +643,8 @@ COOMatrix _CSRRowWiseSampling(
device->FreeWorkspace(ctx, out_ptr); device->FreeWorkspace(ctx, out_ptr);
// wait for copying `new_len` to finish // wait for copying `new_len` to finish
CUDA_CALL(cudaEventSynchronize(copyEvent)); CUDA_CALL(hipEventSynchronize(copyEvent));
CUDA_CALL(cudaEventDestroy(copyEvent)); CUDA_CALL(hipEventDestroy(copyEvent));
picked_row = picked_row.CreateView({new_len}, picked_row->dtype); picked_row = picked_row.CreateView({new_len}, picked_row->dtype);
picked_col = picked_col.CreateView({new_len}, picked_col->dtype); picked_col = picked_col.CreateView({new_len}, picked_col->dtype);
......
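Note: this file comments out the CUB_VERSION >= 1.17 static_assert and relies on hipcub::DeviceSegmentedSort being available, which is a version-dependent assumption on the installed hipCUB/rocPRIM. The sketch below only restates the call shape used above (DoubleBuffer keys/values, size query then real call) with illustrative names; it is valid only on a hipCUB that mirrors CUB's DeviceSegmentedSort.

#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>

// Descending per-segment sort of (key, value) pairs; seg_offsets has
// num_segments + 1 entries, playing the role of temp_ptr in the file above.
template <typename FloatType, typename IdType>
void segmented_sort_descending(FloatType* keys, FloatType* keys_alt,
                               IdType* vals, IdType* vals_alt,
                               int num_items, int num_segments,
                               const IdType* seg_offsets, hipStream_t stream) {
  hipcub::DoubleBuffer<FloatType> d_keys(keys, keys_alt);
  hipcub::DoubleBuffer<IdType> d_vals(vals, vals_alt);
  void* d_temp = nullptr;
  size_t temp_bytes = 0;
  hipcub::DeviceSegmentedSort::SortPairsDescending(
      d_temp, temp_bytes, d_keys, d_vals, num_items, num_segments,
      seg_offsets, seg_offsets + 1, stream);  // size query
  hipMalloc(&d_temp, temp_bytes);
  hipcub::DeviceSegmentedSort::SortPairsDescending(
      d_temp, temp_bytes, d_keys, d_vals, num_items, num_segments,
      seg_offsets, seg_offsets + 1, stream);
  hipFree(d_temp);
  // The sorted data ends up in d_keys.Current() / d_vals.Current().
}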
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/** /**
* Copyright (c) 2020 by Contributors * Copyright (c) 2020 by Contributors
* @file array/cuda/sddmm.cuh * @file array/cuda/sddmm.cuh
...@@ -10,8 +12,8 @@ ...@@ -10,8 +12,8 @@
#include "../../runtime/cuda/cuda_common.h" #include "../../runtime/cuda/cuda_common.h"
#include "../selector.h" #include "../selector.h"
#include "./functor.cuh" #include "functor.cuh"
#include "./utils.h" #include "utils.h"
#include "atomic.cuh" #include "atomic.cuh"
#include "bf16.cuh" #include "bf16.cuh"
#include "fp16.cuh" #include "fp16.cuh"
...@@ -178,7 +180,7 @@ __global__ void SDDMMCooTreeReduceKernel( ...@@ -178,7 +180,7 @@ __global__ void SDDMMCooTreeReduceKernel(
} }
#pragma unroll #pragma unroll
for (int offset = 16; offset > 0; offset /= 2) for (int offset = 16; offset > 0; offset /= 2)
val += __shfl_down_sync(full_mask, val, offset); val += __shfl_down(val, offset);
if (tx == 0) outoff[i] = val; if (tx == 0) outoff[i] = val;
} }
} }
...@@ -275,7 +277,7 @@ void SDDMMCoo( ...@@ -275,7 +277,7 @@ void SDDMMCoo(
const DType* lhs_data = lhs.Ptr<DType>(); const DType* lhs_data = lhs.Ptr<DType>();
const DType* rhs_data = rhs.Ptr<DType>(); const DType* rhs_data = rhs.Ptr<DType>();
DType* out_data = out.Ptr<DType>(); DType* out_data = out.Ptr<DType>();
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int64_t *lhs_off = nullptr, *rhs_off = nullptr; int64_t *lhs_off = nullptr, *rhs_off = nullptr;
int64_t len = bcast.out_len, lhs_len = bcast.lhs_len, rhs_len = bcast.rhs_len; int64_t len = bcast.out_len, lhs_len = bcast.lhs_len, rhs_len = bcast.rhs_len;
...@@ -337,7 +339,7 @@ void SDDMMCsr( ...@@ -337,7 +339,7 @@ void SDDMMCsr(
const DType* lhs_data = lhs.Ptr<DType>(); const DType* lhs_data = lhs.Ptr<DType>();
const DType* rhs_data = rhs.Ptr<DType>(); const DType* rhs_data = rhs.Ptr<DType>();
DType* out_data = out.Ptr<DType>(); DType* out_data = out.Ptr<DType>();
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int64_t N = csr.num_rows, M = csr.num_cols, E = csr.indices->shape[0]; int64_t N = csr.num_rows, M = csr.num_cols, E = csr.indices->shape[0];
int64_t *lhs_off = nullptr, *rhs_off = nullptr; int64_t *lhs_off = nullptr, *rhs_off = nullptr;
......
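Note: in SDDMMCooTreeReduceKernel, CUDA's __shfl_down_sync(full_mask, val, offset) becomes HIP's __shfl_down(val, offset), which shuffles across the wavefront without a mask. A minimal sketch of that reduction is below; keep in mind that warpSize is 64 on most AMD GPUs, so a loop that starts at offset 16 only folds 32 lanes, which is worth double-checking against the kernel's launch geometry.

#include <hip/hip_runtime.h>

// Wavefront-wide sum; after the loop, lane 0 holds the total.
__device__ inline float wavefront_reduce_sum(float val) {
  for (int offset = warpSize / 2; offset > 0; offset /= 2)
    val += __shfl_down(val, offset);
  return val;
}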
// !!! This is a file automatically generated by hipify!!!
/** /**
* Copyright (c) 2020 by Contributors * Copyright (c) 2020 by Contributors
* @file array/cuda/sddmm.cu * @file array/cuda/sddmm.cu
...@@ -5,8 +6,8 @@ ...@@ -5,8 +6,8 @@
*/ */
#include <dgl/array.h> #include <dgl/array.h>
#include "./functor.cuh" #include "functor.cuh"
#include "./sddmm.cuh" #include "sddmm.cuh"
namespace dgl { namespace dgl {
namespace aten { namespace aten {
...@@ -48,10 +49,10 @@ template void SDDMMCsr<kDGLCUDA, int64_t, __half>( ...@@ -48,10 +49,10 @@ template void SDDMMCsr<kDGLCUDA, int64_t, __half>(
const std::string& op, const BcastOff& bcast, const CSRMatrix& csr, const std::string& op, const BcastOff& bcast, const CSRMatrix& csr,
NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target);
#if BF16_ENABLED #if BF16_ENABLED
template void SDDMMCsr<kDGLCUDA, int32_t, __nv_bfloat16>( template void SDDMMCsr<kDGLCUDA, int32_t, __hip_bfloat16>(
const std::string& op, const BcastOff& bcast, const CSRMatrix& csr, const std::string& op, const BcastOff& bcast, const CSRMatrix& csr,
NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target);
template void SDDMMCsr<kDGLCUDA, int64_t, __nv_bfloat16>( template void SDDMMCsr<kDGLCUDA, int64_t, __hip_bfloat16>(
const std::string& op, const BcastOff& bcast, const CSRMatrix& csr, const std::string& op, const BcastOff& bcast, const CSRMatrix& csr,
NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target);
#endif // BF16_ENABLED #endif // BF16_ENABLED
...@@ -75,10 +76,10 @@ template void SDDMMCoo<kDGLCUDA, int64_t, __half>( ...@@ -75,10 +76,10 @@ template void SDDMMCoo<kDGLCUDA, int64_t, __half>(
const std::string& op, const BcastOff& bcast, const COOMatrix& coo, const std::string& op, const BcastOff& bcast, const COOMatrix& coo,
NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target);
#if BF16_ENABLED #if BF16_ENABLED
template void SDDMMCoo<kDGLCUDA, int32_t, __nv_bfloat16>( template void SDDMMCoo<kDGLCUDA, int32_t, __hip_bfloat16>(
const std::string& op, const BcastOff& bcast, const COOMatrix& coo, const std::string& op, const BcastOff& bcast, const COOMatrix& coo,
NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target);
template void SDDMMCoo<kDGLCUDA, int64_t, __nv_bfloat16>( template void SDDMMCoo<kDGLCUDA, int64_t, __hip_bfloat16>(
const std::string& op, const BcastOff& bcast, const COOMatrix& coo, const std::string& op, const BcastOff& bcast, const COOMatrix& coo,
NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target);
#endif // BF16_ENABLED #endif // BF16_ENABLED
......
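Note: the explicit template instantiations above swap __nv_bfloat16 for __hip_bfloat16. A minimal host-side round trip, assuming ROCm's <hip/hip_bf16.h> provides the type and its float conversion helpers (that header is not part of this commit):

#include <hip/hip_bf16.h>
#include <cstdio>

int main() {
  __hip_bfloat16 h = __float2bfloat16(1.5f);  // round float -> bfloat16
  float back = __bfloat162float(h);           // widen back to float
  std::printf("%f\n", back);                  // 1.5 is exactly representable
  return 0;
}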
// !!! This is a file automatically generated by hipify!!!
/** /**
* Copyright (c) 2020 by Contributors * Copyright (c) 2020 by Contributors
* @file array/cuda/sddmm.cu * @file array/cuda/sddmm.cu
...@@ -5,7 +6,7 @@ ...@@ -5,7 +6,7 @@
*/ */
#include <dgl/array.h> #include <dgl/array.h>
#include "./sddmm.cuh" #include "sddmm.cuh"
namespace dgl { namespace dgl {
namespace aten { namespace aten {
...@@ -49,13 +50,13 @@ template void SDDMMCooHetero<kDGLCUDA, int64_t, __half>( ...@@ -49,13 +50,13 @@ template void SDDMMCooHetero<kDGLCUDA, int64_t, __half>(
int rhs_target, const std::vector<dgl_type_t>& in_eid, int rhs_target, const std::vector<dgl_type_t>& in_eid,
const std::vector<dgl_type_t>& out_eid); const std::vector<dgl_type_t>& out_eid);
#if BF16_ENABLED #if BF16_ENABLED
template void SDDMMCooHetero<kDGLCUDA, int32_t, __nv_bfloat16>( template void SDDMMCooHetero<kDGLCUDA, int32_t, __hip_bfloat16>(
const std::string& op, const BcastOff& bcast, const std::string& op, const BcastOff& bcast,
const std::vector<COOMatrix>& vec_coo, const std::vector<NDArray>& lhs, const std::vector<COOMatrix>& vec_coo, const std::vector<NDArray>& lhs,
const std::vector<NDArray>& rhs, std::vector<NDArray> out, int lhs_target, const std::vector<NDArray>& rhs, std::vector<NDArray> out, int lhs_target,
int rhs_target, const std::vector<dgl_type_t>& in_eid, int rhs_target, const std::vector<dgl_type_t>& in_eid,
const std::vector<dgl_type_t>& out_eid); const std::vector<dgl_type_t>& out_eid);
template void SDDMMCooHetero<kDGLCUDA, int64_t, __nv_bfloat16>( template void SDDMMCooHetero<kDGLCUDA, int64_t, __hip_bfloat16>(
const std::string& op, const BcastOff& bcast, const std::string& op, const BcastOff& bcast,
const std::vector<COOMatrix>& vec_coo, const std::vector<NDArray>& lhs, const std::vector<COOMatrix>& vec_coo, const std::vector<NDArray>& lhs,
const std::vector<NDArray>& rhs, std::vector<NDArray> out, int lhs_target, const std::vector<NDArray>& rhs, std::vector<NDArray> out, int lhs_target,
......
// !!! This is a file automatically generated by hipify!!!
/** /**
* Copyright (c) 2020 by Contributors * Copyright (c) 2020 by Contributors
* @file array/cuda/sddmm.cu * @file array/cuda/sddmm.cu
...@@ -5,7 +6,7 @@ ...@@ -5,7 +6,7 @@
*/ */
#include <dgl/array.h> #include <dgl/array.h>
#include "./sddmm.cuh" #include "sddmm.cuh"
namespace dgl { namespace dgl {
namespace aten { namespace aten {
...@@ -48,13 +49,13 @@ template void SDDMMCsrHetero<kDGLCUDA, int64_t, __half>( ...@@ -48,13 +49,13 @@ template void SDDMMCsrHetero<kDGLCUDA, int64_t, __half>(
int rhs_target, const std::vector<dgl_type_t>& in_eid, int rhs_target, const std::vector<dgl_type_t>& in_eid,
const std::vector<dgl_type_t>& out_eid); const std::vector<dgl_type_t>& out_eid);
#if BF16_ENABLED #if BF16_ENABLED
template void SDDMMCsrHetero<kDGLCUDA, int32_t, __nv_bfloat16>( template void SDDMMCsrHetero<kDGLCUDA, int32_t, __hip_bfloat16>(
const std::string& op, const BcastOff& bcast, const std::string& op, const BcastOff& bcast,
const std::vector<CSRMatrix>& vec_csr, const std::vector<NDArray>& lhs, const std::vector<CSRMatrix>& vec_csr, const std::vector<NDArray>& lhs,
const std::vector<NDArray>& rhs, std::vector<NDArray> out, int lhs_target, const std::vector<NDArray>& rhs, std::vector<NDArray> out, int lhs_target,
int rhs_target, const std::vector<dgl_type_t>& in_eid, int rhs_target, const std::vector<dgl_type_t>& in_eid,
const std::vector<dgl_type_t>& out_eid); const std::vector<dgl_type_t>& out_eid);
template void SDDMMCsrHetero<kDGLCUDA, int64_t, __nv_bfloat16>( template void SDDMMCsrHetero<kDGLCUDA, int64_t, __hip_bfloat16>(
const std::string& op, const BcastOff& bcast, const std::string& op, const BcastOff& bcast,
const std::vector<CSRMatrix>& vec_csr, const std::vector<NDArray>& lhs, const std::vector<CSRMatrix>& vec_csr, const std::vector<NDArray>& lhs,
const std::vector<NDArray>& rhs, std::vector<NDArray> out, int lhs_target, const std::vector<NDArray>& rhs, std::vector<NDArray> out, int lhs_target,
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/** /**
* Copyright (c) 2020 by Contributors * Copyright (c) 2020 by Contributors
* @file array/cuda/segment_reduce.cuh * @file array/cuda/segment_reduce.cuh
...@@ -10,8 +12,8 @@ ...@@ -10,8 +12,8 @@
#include <vector> #include <vector>
#include "../../runtime/cuda/cuda_common.h" #include "../../runtime/cuda/cuda_common.h"
#include "./atomic.cuh" #include "atomic.cuh"
#include "./utils.h" #include "utils.h"
namespace dgl { namespace dgl {
...@@ -125,7 +127,7 @@ void SegmentReduce(NDArray feat, NDArray offsets, NDArray out, NDArray arg) { ...@@ -125,7 +127,7 @@ void SegmentReduce(NDArray feat, NDArray offsets, NDArray out, NDArray arg) {
DType* out_data = out.Ptr<DType>(); DType* out_data = out.Ptr<DType>();
IdType* arg_data = arg.Ptr<IdType>(); IdType* arg_data = arg.Ptr<IdType>();
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int64_t n = out->shape[0]; int64_t n = out->shape[0];
int64_t dim = 1; int64_t dim = 1;
for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i]; for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i];
...@@ -155,7 +157,7 @@ void ScatterAdd(NDArray feat, NDArray idx, NDArray out) { ...@@ -155,7 +157,7 @@ void ScatterAdd(NDArray feat, NDArray idx, NDArray out) {
const IdType* idx_data = idx.Ptr<IdType>(); const IdType* idx_data = idx.Ptr<IdType>();
DType* out_data = out.Ptr<DType>(); DType* out_data = out.Ptr<DType>();
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int64_t n = feat->shape[0]; int64_t n = feat->shape[0];
int64_t dim = 1; int64_t dim = 1;
for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i]; for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i];
...@@ -186,7 +188,7 @@ void UpdateGradMinMax_hetero( ...@@ -186,7 +188,7 @@ void UpdateGradMinMax_hetero(
const std::vector<NDArray>& list_feat, const std::vector<NDArray>& list_idx, const std::vector<NDArray>& list_feat, const std::vector<NDArray>& list_idx,
const std::vector<NDArray>& list_idx_types, const std::vector<NDArray>& list_idx_types,
std::vector<NDArray>* list_out) { std::vector<NDArray>* list_out) {
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
if (op == "copy_lhs" || op == "copy_rhs") { if (op == "copy_lhs" || op == "copy_rhs") {
std::vector<std::vector<dgl_id_t>> src_dst_ntypes( std::vector<std::vector<dgl_id_t>> src_dst_ntypes(
graph->NumVertexTypes(), std::vector<dgl_id_t>()); graph->NumVertexTypes(), std::vector<dgl_id_t>());
...@@ -239,7 +241,7 @@ void BackwardSegmentCmp(NDArray feat, NDArray arg, NDArray out) { ...@@ -239,7 +241,7 @@ void BackwardSegmentCmp(NDArray feat, NDArray arg, NDArray out) {
const IdType* arg_data = arg.Ptr<IdType>(); const IdType* arg_data = arg.Ptr<IdType>();
DType* out_data = out.Ptr<DType>(); DType* out_data = out.Ptr<DType>();
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int64_t n = feat->shape[0]; int64_t n = feat->shape[0];
int64_t dim = 1; int64_t dim = 1;
for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i]; for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i];
......
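Note: beyond the stream rename (getCurrentCUDAStream becomes getCurrentHIPStreamMasqueradingAsCUDA, the hipified name for the same helper), the ScatterAdd path in this header boils down to one atomic accumulate per (row index, feature) element. A standalone sketch of that pattern with illustrative names, not DGL's kernel:

#include <hip/hip_runtime.h>
#include <cstdint>

// Grid-stride scatter-add: out[idx[row], :] += feat[row, :] for each row.
__global__ void scatter_add_kernel(const float* feat, const int64_t* idx,
                                   float* out, int64_t n, int64_t dim) {
  int64_t tid = blockIdx.x * (int64_t)blockDim.x + threadIdx.x;
  int64_t stride = (int64_t)gridDim.x * blockDim.x;
  for (int64_t i = tid; i < n * dim; i += stride) {
    int64_t row = i / dim, col = i - row * dim;
    atomicAdd(&out[idx[row] * dim + col], feat[i]);
  }
}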
// !!! This is a file automatically generated by hipify!!!
/** /**
* Copyright (c) 2020 by Contributors * Copyright (c) 2020 by Contributors
* @file array/cuda/segment_reduce.cu * @file array/cuda/segment_reduce.cu
...@@ -6,9 +7,9 @@ ...@@ -6,9 +7,9 @@
#include <dgl/array.h> #include <dgl/array.h>
#include <dgl/base_heterograph.h> #include <dgl/base_heterograph.h>
#include "./functor.cuh" #include "functor.cuh"
#include "./segment_reduce.cuh" #include "segment_reduce.cuh"
#include "./utils.h" #include "utils.h"
namespace dgl { namespace dgl {
...@@ -60,10 +61,10 @@ template void SegmentReduce<kDGLCUDA, int64_t, __half>( ...@@ -60,10 +61,10 @@ template void SegmentReduce<kDGLCUDA, int64_t, __half>(
const std::string& op, NDArray feat, NDArray offsets, NDArray out, const std::string& op, NDArray feat, NDArray offsets, NDArray out,
NDArray arg); NDArray arg);
#if BF16_ENABLED #if BF16_ENABLED
template void SegmentReduce<kDGLCUDA, int32_t, __nv_bfloat16>( template void SegmentReduce<kDGLCUDA, int32_t, __hip_bfloat16>(
const std::string& op, NDArray feat, NDArray offsets, NDArray out, const std::string& op, NDArray feat, NDArray offsets, NDArray out,
NDArray arg); NDArray arg);
template void SegmentReduce<kDGLCUDA, int64_t, __nv_bfloat16>( template void SegmentReduce<kDGLCUDA, int64_t, __hip_bfloat16>(
const std::string& op, NDArray feat, NDArray offsets, NDArray out, const std::string& op, NDArray feat, NDArray offsets, NDArray out,
NDArray arg); NDArray arg);
#endif // BF16_ENABLED #endif // BF16_ENABLED
...@@ -85,9 +86,9 @@ template void ScatterAdd<kDGLCUDA, int32_t, __half>( ...@@ -85,9 +86,9 @@ template void ScatterAdd<kDGLCUDA, int32_t, __half>(
template void ScatterAdd<kDGLCUDA, int64_t, __half>( template void ScatterAdd<kDGLCUDA, int64_t, __half>(
NDArray feat, NDArray idx, NDArray out); NDArray feat, NDArray idx, NDArray out);
#if BF16_ENABLED #if BF16_ENABLED
template void ScatterAdd<kDGLCUDA, int32_t, __nv_bfloat16>( template void ScatterAdd<kDGLCUDA, int32_t, __hip_bfloat16>(
NDArray feat, NDArray idx, NDArray out); NDArray feat, NDArray idx, NDArray out);
template void ScatterAdd<kDGLCUDA, int64_t, __nv_bfloat16>( template void ScatterAdd<kDGLCUDA, int64_t, __hip_bfloat16>(
NDArray feat, NDArray idx, NDArray out); NDArray feat, NDArray idx, NDArray out);
#endif // BF16_ENABLED #endif // BF16_ENABLED
template void ScatterAdd<kDGLCUDA, int32_t, float>( template void ScatterAdd<kDGLCUDA, int32_t, float>(
...@@ -108,11 +109,11 @@ template void UpdateGradMinMax_hetero<kDGLCUDA, int64_t, __half>( ...@@ -108,11 +109,11 @@ template void UpdateGradMinMax_hetero<kDGLCUDA, int64_t, __half>(
const std::vector<NDArray>& feat, const std::vector<NDArray>& idx, const std::vector<NDArray>& feat, const std::vector<NDArray>& idx,
const std::vector<NDArray>& idx_etype, std::vector<NDArray>* out); const std::vector<NDArray>& idx_etype, std::vector<NDArray>* out);
#if BF16_ENABLED #if BF16_ENABLED
template void UpdateGradMinMax_hetero<kDGLCUDA, int32_t, __nv_bfloat16>( template void UpdateGradMinMax_hetero<kDGLCUDA, int32_t, __hip_bfloat16>(
const HeteroGraphPtr& g, const std::string& op, const HeteroGraphPtr& g, const std::string& op,
const std::vector<NDArray>& feat, const std::vector<NDArray>& idx, const std::vector<NDArray>& feat, const std::vector<NDArray>& idx,
const std::vector<NDArray>& idx_etype, std::vector<NDArray>* out); const std::vector<NDArray>& idx_etype, std::vector<NDArray>* out);
template void UpdateGradMinMax_hetero<kDGLCUDA, int64_t, __nv_bfloat16>( template void UpdateGradMinMax_hetero<kDGLCUDA, int64_t, __hip_bfloat16>(
const HeteroGraphPtr& g, const std::string& op, const HeteroGraphPtr& g, const std::string& op,
const std::vector<NDArray>& feat, const std::vector<NDArray>& idx, const std::vector<NDArray>& feat, const std::vector<NDArray>& idx,
const std::vector<NDArray>& idx_etype, std::vector<NDArray>* out); const std::vector<NDArray>& idx_etype, std::vector<NDArray>* out);
...@@ -139,9 +140,9 @@ template void BackwardSegmentCmp<kDGLCUDA, int32_t, __half>( ...@@ -139,9 +140,9 @@ template void BackwardSegmentCmp<kDGLCUDA, int32_t, __half>(
template void BackwardSegmentCmp<kDGLCUDA, int64_t, __half>( template void BackwardSegmentCmp<kDGLCUDA, int64_t, __half>(
NDArray feat, NDArray arg, NDArray out); NDArray feat, NDArray arg, NDArray out);
#if BF16_ENABLED #if BF16_ENABLED
template void BackwardSegmentCmp<kDGLCUDA, int32_t, __nv_bfloat16>( template void BackwardSegmentCmp<kDGLCUDA, int32_t, __hip_bfloat16>(
NDArray feat, NDArray arg, NDArray out); NDArray feat, NDArray arg, NDArray out);
template void BackwardSegmentCmp<kDGLCUDA, int64_t, __nv_bfloat16>( template void BackwardSegmentCmp<kDGLCUDA, int64_t, __hip_bfloat16>(
NDArray feat, NDArray arg, NDArray out); NDArray feat, NDArray arg, NDArray out);
#endif // BF16_ENABLED #endif // BF16_ENABLED
template void BackwardSegmentCmp<kDGLCUDA, int32_t, float>( template void BackwardSegmentCmp<kDGLCUDA, int32_t, float>(
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/** /**
* Copyright (c) 2021 by contributors. * Copyright (c) 2021 by contributors.
* @file array/cuda/spmat_op_impl_coo.cu * @file array/cuda/spmat_op_impl_coo.cu
...@@ -10,8 +12,8 @@ ...@@ -10,8 +12,8 @@
#include <vector> #include <vector>
#include "../../runtime/cuda/cuda_common.h" #include "../../runtime/cuda/cuda_common.h"
#include "./atomic.cuh" #include "atomic.cuh"
#include "./utils.h" #include "utils.h"
namespace dgl { namespace dgl {
...@@ -72,7 +74,7 @@ __global__ void _COOGetRowNNZKernel( ...@@ -72,7 +74,7 @@ __global__ void _COOGetRowNNZKernel(
template <DGLDeviceType XPU, typename IdType> template <DGLDeviceType XPU, typename IdType>
int64_t COOGetRowNNZ(COOMatrix coo, int64_t row) { int64_t COOGetRowNNZ(COOMatrix coo, int64_t row) {
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const auto& ctx = coo.row->ctx; const auto& ctx = coo.row->ctx;
IdType nnz = coo.row->shape[0]; IdType nnz = coo.row->shape[0];
IdType nt = 1024; IdType nt = 1024;
...@@ -103,7 +105,7 @@ __global__ void _COOGetAllRowNNZKernel( ...@@ -103,7 +105,7 @@ __global__ void _COOGetAllRowNNZKernel(
template <DGLDeviceType XPU, typename IdType> template <DGLDeviceType XPU, typename IdType>
NDArray COOGetRowNNZ(COOMatrix coo, NDArray rows) { NDArray COOGetRowNNZ(COOMatrix coo, NDArray rows) {
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const auto& ctx = coo.row->ctx; const auto& ctx = coo.row->ctx;
IdType nnz = coo.row->shape[0]; IdType nnz = coo.row->shape[0];
IdType num_rows = coo.num_rows; IdType num_rows = coo.num_rows;
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/** /**
* Copyright (c) 2020 by Contributors * Copyright (c) 2020 by Contributors
* @file array/cuda/spmat_op_impl_csr.cu * @file array/cuda/spmat_op_impl_csr.cu
...@@ -7,14 +9,14 @@ ...@@ -7,14 +9,14 @@
#include <thrust/execution_policy.h> #include <thrust/execution_policy.h>
#include <thrust/for_each.h> #include <thrust/for_each.h>
#include <cub/cub.cuh> #include <hipcub/hipcub.hpp>
#include <numeric> #include <numeric>
#include <unordered_set> #include <unordered_set>
#include <vector> #include <vector>
#include "../../runtime/cuda/cuda_common.h" #include "../../runtime/cuda/cuda_common.h"
#include "./atomic.cuh" #include "atomic.cuh"
#include "./utils.h" #include "utils.h"
namespace dgl { namespace dgl {
...@@ -28,7 +30,7 @@ namespace impl { ...@@ -28,7 +30,7 @@ namespace impl {
template <DGLDeviceType XPU, typename IdType> template <DGLDeviceType XPU, typename IdType>
bool CSRIsNonZero(CSRMatrix csr, int64_t row, int64_t col) { bool CSRIsNonZero(CSRMatrix csr, int64_t row, int64_t col) {
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const auto& ctx = csr.indptr->ctx; const auto& ctx = csr.indptr->ctx;
IdArray rows = aten::VecToIdArray<int64_t>({row}, sizeof(IdType) * 8, ctx); IdArray rows = aten::VecToIdArray<int64_t>({row}, sizeof(IdType) * 8, ctx);
IdArray cols = aten::VecToIdArray<int64_t>({col}, sizeof(IdType) * 8, ctx); IdArray cols = aten::VecToIdArray<int64_t>({col}, sizeof(IdType) * 8, ctx);
...@@ -53,12 +55,12 @@ template <DGLDeviceType XPU, typename IdType> ...@@ -53,12 +55,12 @@ template <DGLDeviceType XPU, typename IdType>
NDArray CSRIsNonZero(CSRMatrix csr, NDArray row, NDArray col) { NDArray CSRIsNonZero(CSRMatrix csr, NDArray row, NDArray col) {
const auto rowlen = row->shape[0]; const auto rowlen = row->shape[0];
const auto collen = col->shape[0]; const auto collen = col->shape[0];
const auto rstlen = std::max(rowlen, collen); const auto rstlen = ::max(rowlen, collen);
NDArray rst = NDArray::Empty({rstlen}, row->dtype, row->ctx); NDArray rst = NDArray::Empty({rstlen}, row->dtype, row->ctx);
if (rstlen == 0) return rst; if (rstlen == 0) return rst;
const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1; const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1;
const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1; const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1;
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const int nt = dgl::cuda::FindNumThreads(rstlen); const int nt = dgl::cuda::FindNumThreads(rstlen);
const int nb = (rstlen + nt - 1) / nt; const int nb = (rstlen + nt - 1) / nt;
const IdType* data = nullptr; const IdType* data = nullptr;
...@@ -104,7 +106,7 @@ template <DGLDeviceType XPU, typename IdType> ...@@ -104,7 +106,7 @@ template <DGLDeviceType XPU, typename IdType>
bool CSRHasDuplicate(CSRMatrix csr) { bool CSRHasDuplicate(CSRMatrix csr) {
if (!csr.sorted) csr = CSRSort(csr); if (!csr.sorted) csr = CSRSort(csr);
const auto& ctx = csr.indptr->ctx; const auto& ctx = csr.indptr->ctx;
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
auto device = runtime::DeviceAPI::Get(ctx); auto device = runtime::DeviceAPI::Get(ctx);
// We allocate a workspace of num_rows bytes. It wastes a little bit memory // We allocate a workspace of num_rows bytes. It wastes a little bit memory
// but should be fine. // but should be fine.
...@@ -149,7 +151,7 @@ __global__ void _CSRGetRowNNZKernel( ...@@ -149,7 +151,7 @@ __global__ void _CSRGetRowNNZKernel(
template <DGLDeviceType XPU, typename IdType> template <DGLDeviceType XPU, typename IdType>
NDArray CSRGetRowNNZ(CSRMatrix csr, NDArray rows) { NDArray CSRGetRowNNZ(CSRMatrix csr, NDArray rows) {
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const auto len = rows->shape[0]; const auto len = rows->shape[0];
const IdType* vid_data = rows.Ptr<IdType>(); const IdType* vid_data = rows.Ptr<IdType>();
const IdType* indptr_data = const IdType* indptr_data =
...@@ -250,7 +252,7 @@ __global__ void _SegmentCopyKernel( ...@@ -250,7 +252,7 @@ __global__ void _SegmentCopyKernel(
template <DGLDeviceType XPU, typename IdType> template <DGLDeviceType XPU, typename IdType>
CSRMatrix CSRSliceRows(CSRMatrix csr, NDArray rows) { CSRMatrix CSRSliceRows(CSRMatrix csr, NDArray rows) {
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const int64_t len = rows->shape[0]; const int64_t len = rows->shape[0];
IdArray ret_indptr = aten::CumSum(aten::CSRGetRowNNZ(csr, rows), true); IdArray ret_indptr = aten::CumSum(aten::CSRGetRowNNZ(csr, rows), true);
const int64_t nnz = aten::IndexSelect<IdType>(ret_indptr, len); const int64_t nnz = aten::IndexSelect<IdType>(ret_indptr, len);
...@@ -359,7 +361,7 @@ std::vector<NDArray> CSRGetDataAndIndices( ...@@ -359,7 +361,7 @@ std::vector<NDArray> CSRGetDataAndIndices(
CSRMatrix csr, NDArray row, NDArray col) { CSRMatrix csr, NDArray row, NDArray col) {
const auto rowlen = row->shape[0]; const auto rowlen = row->shape[0];
const auto collen = col->shape[0]; const auto collen = col->shape[0];
const auto len = std::max(rowlen, collen); const auto len = ::max(rowlen, collen);
if (len == 0) return {NullArray(), NullArray(), NullArray()}; if (len == 0) return {NullArray(), NullArray(), NullArray()};
const auto& ctx = row->ctx; const auto& ctx = row->ctx;
...@@ -367,7 +369,7 @@ std::vector<NDArray> CSRGetDataAndIndices( ...@@ -367,7 +369,7 @@ std::vector<NDArray> CSRGetDataAndIndices(
const int64_t nnz = csr.indices->shape[0]; const int64_t nnz = csr.indices->shape[0];
const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1; const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1;
const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1; const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1;
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const IdType* indptr_data = const IdType* indptr_data =
static_cast<IdType*>(GetDevicePointer(csr.indptr)); static_cast<IdType*>(GetDevicePointer(csr.indptr));
...@@ -532,7 +534,7 @@ __global__ void _SegmentMaskColKernel( ...@@ -532,7 +534,7 @@ __global__ void _SegmentMaskColKernel(
static_cast<IdType>(num_rows)); static_cast<IdType>(num_rows));
NodeQueryHashmap<IdType> hashmap(hashmap_buffer, buffer_size); NodeQueryHashmap<IdType> hashmap(hashmap_buffer, buffer_size);
typedef cub::WarpReduce<IdType> WarpReduce; typedef hipcub::WarpReduce<IdType> WarpReduce;
__shared__ typename WarpReduce::TempStorage temp_storage[BLOCK_WARPS]; __shared__ typename WarpReduce::TempStorage temp_storage[BLOCK_WARPS];
while (out_row < last_row) { while (out_row < last_row) {
...@@ -547,6 +549,7 @@ __global__ void _SegmentMaskColKernel( ...@@ -547,6 +549,7 @@ __global__ void _SegmentMaskColKernel(
} }
} }
IdType reduce_count = WarpReduce(temp_storage[warp_id]).Sum(local_count); IdType reduce_count = WarpReduce(temp_storage[warp_id]).Sum(local_count);
printf("out_row = %d , reduce_count = %d \n", out_row, reduce_count);
if (laneid == 0) { if (laneid == 0) {
count[out_row] = reduce_count; count[out_row] = reduce_count;
} }
...@@ -557,13 +560,16 @@ __global__ void _SegmentMaskColKernel( ...@@ -557,13 +560,16 @@ __global__ void _SegmentMaskColKernel(
template <DGLDeviceType XPU, typename IdType> template <DGLDeviceType XPU, typename IdType>
CSRMatrix CSRSliceMatrix( CSRMatrix CSRSliceMatrix(
CSRMatrix csr, runtime::NDArray rows, runtime::NDArray cols) { CSRMatrix csr, runtime::NDArray rows, runtime::NDArray cols) {
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const auto& ctx = rows->ctx; const auto& ctx = rows->ctx;
const auto& dtype = rows->dtype; const auto& dtype = rows->dtype;
const auto nbits = dtype.bits; const auto nbits = dtype.bits;
const int64_t new_nrows = rows->shape[0]; const int64_t new_nrows = rows->shape[0];
const int64_t new_ncols = cols->shape[0]; const int64_t new_ncols = cols->shape[0];
std::cout << "new_nrows : " << new_nrows << std::endl;
std::cout << "new_ncols : " << new_ncols << std::endl;
if (new_nrows == 0 || new_ncols == 0) if (new_nrows == 0 || new_ncols == 0)
return CSRMatrix( return CSRMatrix(
new_nrows, new_ncols, Full(0, new_nrows + 1, nbits, ctx), new_nrows, new_ncols, Full(0, new_nrows + 1, nbits, ctx),
...@@ -572,6 +578,7 @@ CSRMatrix CSRSliceMatrix( ...@@ -572,6 +578,7 @@ CSRMatrix CSRSliceMatrix(
// First slice rows // First slice rows
csr = CSRSliceRows(csr, rows); csr = CSRSliceRows(csr, rows);
std::cout << "csr.indices->shape[0] : " << csr.indices->shape[0] << std::endl;
if (csr.indices->shape[0] == 0) if (csr.indices->shape[0] == 0)
return CSRMatrix( return CSRMatrix(
new_nrows, new_ncols, Full(0, new_nrows + 1, nbits, ctx), new_nrows, new_ncols, Full(0, new_nrows + 1, nbits, ctx),
...@@ -581,9 +588,11 @@ CSRMatrix CSRSliceMatrix( ...@@ -581,9 +588,11 @@ CSRMatrix CSRSliceMatrix(
IdArray mask = Full(0, csr.indices->shape[0], nbits, ctx); IdArray mask = Full(0, csr.indices->shape[0], nbits, ctx);
// A count for how many masked values per row. // A count for how many masked values per row.
IdArray count = NewIdArray(csr.num_rows, ctx, nbits); IdArray count = NewIdArray(csr.num_rows, ctx, nbits);
std::cout << "1 IdArray count : " << count << std::endl;
CUDA_CALL( CUDA_CALL(
cudaMemset(count.Ptr<IdType>(), 0, sizeof(IdType) * (csr.num_rows))); hipMemset(count.Ptr<IdType>(), 0, sizeof(IdType) * (csr.num_rows)));
std::cout << "2 IdArray count : " << count << std::endl;
// Generate a NodeQueryHashmap buffer. The key of the hashmap is col. // Generate a NodeQueryHashmap buffer. The key of the hashmap is col.
// For performance, the load factor of the hashmap is in (0.25, 0.5); // For performance, the load factor of the hashmap is in (0.25, 0.5);
// Because num_cols is usually less than 1 Million (on GPU), the // Because num_cols is usually less than 1 Million (on GPU), the
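The sizing comment is cut off by this hunk, but the arithmetic it describes is simple: picking the smallest power of two at or above 2 * num_cols keeps the load factor num_cols / buffer_size inside (0.25, 0.5]. A small host-side sketch of that rule (an assumed illustration, not the helper this file actually calls):

#include <cstdint>
#include <cstdio>

// Smallest power of two >= 2 * num_keys; the load factor stays in (0.25, 0.5].
int64_t HashmapBufferSize(int64_t num_keys) {
  int64_t size = 1;
  while (size < 2 * num_keys) size <<= 1;
  return size;
}

int main() {
  for (int64_t n : {3LL, 100LL, 1000LL, 1LL << 20}) {
    const int64_t buf = HashmapBufferSize(n);
    std::printf("num_keys=%lld buffer=%lld load=%.3f\n", (long long)n,
                (long long)buf, double(n) / buf);
  }
  return 0;
}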
...@@ -593,7 +602,7 @@ CSRMatrix CSRSliceMatrix( ...@@ -593,7 +602,7 @@ CSRMatrix CSRSliceMatrix(
using it = thrust::counting_iterator<int64_t>; using it = thrust::counting_iterator<int64_t>;
runtime::CUDAWorkspaceAllocator allocator(ctx); runtime::CUDAWorkspaceAllocator allocator(ctx);
const auto exec_policy = thrust::cuda::par_nosync(allocator).on(stream); const auto exec_policy = thrust::hip::par_nosync(allocator).on(stream);
thrust::for_each( thrust::for_each(
exec_policy, it(0), it(new_ncols), exec_policy, it(0), it(new_ncols),
[key = cols.Ptr<IdType>(), buffer = hashmap_buffer.Ptr<IdType>(), [key = cols.Ptr<IdType>(), buffer = hashmap_buffer.Ptr<IdType>(),
...@@ -617,20 +626,37 @@ CSRMatrix CSRSliceMatrix( ...@@ -617,20 +626,37 @@ CSRMatrix CSRSliceMatrix(
dgl::cuda::FindNumBlocks<'x'>((num_rows + TILE_SIZE - 1) / TILE_SIZE); dgl::cuda::FindNumBlocks<'x'>((num_rows + TILE_SIZE - 1) / TILE_SIZE);
const dim3 nthrs(WARP_SIZE, BLOCK_WARPS); const dim3 nthrs(WARP_SIZE, BLOCK_WARPS);
const dim3 nblks(nb); const dim3 nblks(nb);
std::cout << "nthrs.x : " << nthrs.x << " nthrs.y : " << nthrs.y << " nthrs.z : " << nthrs.z << std::endl;
std::cout << "nblks.x : " << nblks.x << " nblks.y : " << nblks.y << " nblks.z : " << nblks.z << std::endl;
std::cout << "WARP_SIZE : " << WARP_SIZE << " BLOCK_WARPS : " << BLOCK_WARPS << "TILE_SIZE : " << std::endl;
std::cout << "indptr_data : " << indptr_data << std::endl;
std::cout << "indices_data : " << indices_data << std::endl;
std::cout << "num_rows : " << num_rows << std::endl;
std::cout << "buffer_size : " << buffer_size << std::endl;
std::cout << "mask : " << mask << std::endl;
std::cout << "count : " << count << std::endl;
std::cout << "hashmap_buffer : " << hashmap_buffer << std::endl;
CUDA_KERNEL_CALL( CUDA_KERNEL_CALL(
(_SegmentMaskColKernel<IdType, WARP_SIZE, BLOCK_WARPS, TILE_SIZE>), nblks, (_SegmentMaskColKernel<IdType, WARP_SIZE, BLOCK_WARPS, TILE_SIZE>), nblks,
nthrs, 0, stream, indptr_data, indices_data, num_rows, nthrs, 0, stream, indptr_data, indices_data, num_rows,
hashmap_buffer.Ptr<IdType>(), buffer_size, mask.Ptr<IdType>(), hashmap_buffer.Ptr<IdType>(), buffer_size, mask.Ptr<IdType>(),
count.Ptr<IdType>()); count.Ptr<IdType>());
std::cout << "3 IdArray count : " << count << std::endl;
IdArray idx = AsNumBits(NonZero(mask), nbits); IdArray idx = AsNumBits(NonZero(mask), nbits);
std::cout << "idx->shape[0] : " << idx->shape[0] << std::endl;
if (idx->shape[0] == 0) if (idx->shape[0] == 0)
return CSRMatrix( return CSRMatrix(
new_nrows, new_ncols, Full(0, new_nrows + 1, nbits, ctx), new_nrows, new_ncols, Full(0, new_nrows + 1, nbits, ctx),
NullArray(dtype, ctx), NullArray(dtype, ctx)); NullArray(dtype, ctx), NullArray(dtype, ctx));
// Indptr needs to be adjusted according to the new nnz per row. // Indptr needs to be adjusted according to the new nnz per row.
std::cout << " count : " << count << std::endl;
IdArray ret_indptr = CumSum(count, true); IdArray ret_indptr = CumSum(count, true);
std::cout << " IdArray ret_indptr : " << ret_indptr << std::endl;
// Column & data can be obtained by index select. // Column & data can be obtained by index select.
IdArray ret_col = IndexSelect(csr.indices, idx); IdArray ret_col = IndexSelect(csr.indices, idx);
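CumSum(count, true) turns the per-row kept-entry counts into the new indptr; assuming the boolean asks for a leading zero to be prepended (the result must have num_rows + 1 entries), the adjustment is an ordinary prefix sum. A worked host-side example:

#include <cstdio>
#include <vector>

// counts of surviving nonzeros per row -> CSR indptr of the sliced matrix.
std::vector<int64_t> CumSumPrependZero(const std::vector<int64_t>& count) {
  std::vector<int64_t> indptr(count.size() + 1, 0);
  for (size_t i = 0; i < count.size(); ++i) indptr[i + 1] = indptr[i] + count[i];
  return indptr;
}

int main() {
  const std::vector<int64_t> count = {2, 0, 3, 1};
  for (int64_t v : CumSumPrependZero(count)) std::printf("%lld ", (long long)v);
  std::printf("\n");  // 0 2 2 5 6
  return 0;
}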
...@@ -641,6 +667,8 @@ CSRMatrix CSRSliceMatrix( ...@@ -641,6 +667,8 @@ CSRMatrix CSRSliceMatrix(
Scatter_(cols, Range(0, cols->shape[0], nbits, ctx), col_hash); Scatter_(cols, Range(0, cols->shape[0], nbits, ctx), col_hash);
ret_col = IndexSelect(col_hash, ret_col); ret_col = IndexSelect(col_hash, ret_col);
// std::cout << "new_nrows : " << new_nrows << " new_ncols : " << new_ncols << " ret_indptr : " << ret_indptr << " ret_col : " << ret_col << " ret_data : " << std::endl;
return CSRMatrix(new_nrows, new_ncols, ret_indptr, ret_col, ret_data); return CSRMatrix(new_nrows, new_ncols, ret_indptr, ret_col, ret_data);
} }
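The Scatter_/IndexSelect pair above relabels the surviving global column ids into the sliced matrix's local numbering: col_hash[cols[i]] = i, then every kept column index is looked up in col_hash. A CPU sketch with made-up ids (the dense lookup table here is only an illustration; the real col_hash is an IdArray sized to the old number of columns):

#include <cstdio>
#include <vector>

int main() {
  // Requested columns (global ids); their positions become the new local ids.
  const std::vector<int64_t> cols = {5, 2, 9};       // local ids 0, 1, 2
  std::vector<int64_t> ret_col = {2, 9, 5, 2};       // kept global column ids

  // Scatter_(cols, Range(...), col_hash): col_hash[cols[i]] = i.
  std::vector<int64_t> col_hash(10, -1);
  for (size_t i = 0; i < cols.size(); ++i) col_hash[cols[i]] = i;

  // IndexSelect(col_hash, ret_col): map each kept global id to its local id.
  for (auto& c : ret_col) c = col_hash[c];
  for (int64_t c : ret_col) std::printf("%lld ", (long long)c);  // 1 2 0 1
  std::printf("\n");
  return 0;
}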
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/** /**
* Copyright (c) 2020 by Contributors * Copyright (c) 2020 by Contributors
* @file array/cuda/spmm.cuh * @file array/cuda/spmm.cuh
...@@ -11,7 +13,7 @@ ...@@ -11,7 +13,7 @@
#include <limits> #include <limits>
#include "../../runtime/cuda/cuda_common.h" #include "../../runtime/cuda/cuda_common.h"
#include "./utils.h" #include "utils.h"
#include "atomic.cuh" #include "atomic.cuh"
#include "bf16.cuh" #include "bf16.cuh"
#include "fp16.cuh" #include "fp16.cuh"
...@@ -28,14 +30,14 @@ namespace aten { ...@@ -28,14 +30,14 @@ namespace aten {
*/ */
template <typename DType, typename IdType> template <typename DType, typename IdType>
inline bool cusparse_available(bool more_nnz_than_matrix_size) { inline bool cusparse_available(bool more_nnz_than_matrix_size) {
#if CUDART_VERSION < 11000 #if DTKRT_VERSION < 11000
if (std::is_same<IdType, int>::value && if (std::is_same<IdType, int>::value &&
(std::is_same<DType, float>::value || std::is_same<DType, double>::value)) (std::is_same<DType, float>::value || std::is_same<DType, double>::value))
return true; return true;
return false; return false;
#else #else
if (std::is_same<DType, __half>::value || if (std::is_same<DType, __half>::value ||
std::is_same<DType, __nv_bfloat16>::value) std::is_same<DType, __hip_bfloat16>::value)
return false; // cusparse's SpMM on fp16 is slow, temporally disabled. return false; // cusparse's SpMM on fp16 is slow, temporally disabled.
// If the CSR matrix has more NNZ than matrix size, we should not use // If the CSR matrix has more NNZ than matrix size, we should not use
// cuSPARSE 11.1. // cuSPARSE 11.1.
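How callers form the boolean argument lies outside this hunk; the sketch below assumes it compares nnz against num_rows * num_cols, which matches the wording of the comment. FakeCsr is a stand-in struct for illustration, not the real dgl::aten::CSRMatrix.

#include <cstdint>
#include <cstdio>

struct FakeCsr { int64_t num_rows, num_cols, nnz; };  // illustration only

// Assumed meaning of the flag passed to cusparse_available().
bool MoreNnzThanMatrixSize(const FakeCsr& csr) {
  return csr.nnz > csr.num_rows * csr.num_cols;
}

int main() {
  const FakeCsr sparse{1000, 1000, 5000};   // prefer the hipSPARSE path
  const FakeCsr dense_ish{100, 10, 5000};   // custom kernels may win here
  std::printf("%d %d\n", MoreNnzThanMatrixSize(sparse), MoreNnzThanMatrixSize(dense_ish));
  return 0;
}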
...@@ -47,54 +49,54 @@ namespace { ...@@ -47,54 +49,54 @@ namespace {
/** @brief Call cuBLAS geam API for transpose operation for float and double. */ /** @brief Call cuBLAS geam API for transpose operation for float and double. */
template <typename DType> template <typename DType>
cublasStatus_t Xgeam( hipblasStatus_t Xgeam(
cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
int m, int n, const DType* alpha, const DType* A, int lda, int m, int n, const DType* alpha, const DType* A, int lda,
const DType* beta, const DType* B, int ldb, DType* C, int ldc) { const DType* beta, const DType* B, int ldb, DType* C, int ldc) {
LOG(FATAL) << "Not supported dtype"; LOG(FATAL) << "Not supported dtype";
return CUBLAS_STATUS_EXECUTION_FAILED; return HIPBLAS_STATUS_EXECUTION_FAILED;
} }
template <> template <>
cublasStatus_t Xgeam<__half>( hipblasStatus_t Xgeam<__half>(
cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
int m, int n, const __half* alpha, const __half* A, int lda, int m, int n, const __half* alpha, const __half* A, int lda,
const __half* beta, const __half* B, int ldb, __half* C, int ldc) { const __half* beta, const __half* B, int ldb, __half* C, int ldc) {
// TODO(ndickson): There is no cublasHgeam, so a different // TODO(ndickson): There is no cublasHgeam, so a different
// implementation would be required. // implementation would be required.
LOG(FATAL) << "Xgeam does not support dtype half (FP16)"; LOG(FATAL) << "Xgeam does not support dtype half (FP16)";
return CUBLAS_STATUS_EXECUTION_FAILED; return HIPBLAS_STATUS_EXECUTION_FAILED;
} }
#if BF16_ENABLED #if BF16_ENABLED
template <> template <>
cublasStatus_t Xgeam<__nv_bfloat16>( hipblasStatus_t Xgeam<__hip_bfloat16>(
cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
int m, int n, const __nv_bfloat16* alpha, const __nv_bfloat16* A, int lda, int m, int n, const __hip_bfloat16* alpha, const __hip_bfloat16* A, int lda,
const __nv_bfloat16* beta, const __nv_bfloat16* B, int ldb, const __hip_bfloat16* beta, const __hip_bfloat16* B, int ldb,
__nv_bfloat16* C, int ldc) { __hip_bfloat16* C, int ldc) {
// TODO(ndickson): There is no cublasHgeam, so a different // TODO(ndickson): There is no cublasHgeam, so a different
// implementation would be required. // implementation would be required.
LOG(FATAL) << "Xgeam does not support dtype bfloat16 (BF16)"; LOG(FATAL) << "Xgeam does not support dtype bfloat16 (BF16)";
return CUBLAS_STATUS_EXECUTION_FAILED; return HIPBLAS_STATUS_EXECUTION_FAILED;
} }
#endif // BF16_ENABLED #endif // BF16_ENABLED
template <> template <>
cublasStatus_t Xgeam<float>( hipblasStatus_t Xgeam<float>(
cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
int m, int n, const float* alpha, const float* A, int lda, int m, int n, const float* alpha, const float* A, int lda,
const float* beta, const float* B, int ldb, float* C, int ldc) { const float* beta, const float* B, int ldb, float* C, int ldc) {
return cublasSgeam( return hipblasSgeam(
handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc); handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
} }
template <> template <>
cublasStatus_t Xgeam<double>( hipblasStatus_t Xgeam<double>(
cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
int m, int n, const double* alpha, const double* A, int lda, int m, int n, const double* alpha, const double* A, int lda,
const double* beta, const double* B, int ldb, double* C, int ldc) { const double* beta, const double* B, int ldb, double* C, int ldc) {
return cublasDgeam( return hipblasDgeam(
handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc); handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
} }
...@@ -119,12 +121,12 @@ template <typename DType> ...@@ -119,12 +121,12 @@ template <typename DType>
void _Transpose(const DType* in, DType* out, int row, int col) { void _Transpose(const DType* in, DType* out, int row, int col) {
DType alpha = 1., beta = 0.; DType alpha = 1., beta = 0.;
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
if (!thr_entry->cublas_handle) if (!thr_entry->cublas_handle)
CUBLAS_CALL(cublasCreate(&(thr_entry->cublas_handle))); CUBLAS_CALL(hipblasCreate(&(thr_entry->cublas_handle)));
CUBLAS_CALL(cublasSetStream(thr_entry->cublas_handle, stream)); CUBLAS_CALL(hipblasSetStream(thr_entry->cublas_handle, stream));
CUBLAS_CALL(Xgeam<DType>( CUBLAS_CALL(Xgeam<DType>(
thr_entry->cublas_handle, CUBLAS_OP_T, CUBLAS_OP_N, row, col, &alpha, in, thr_entry->cublas_handle, HIPBLAS_OP_T, HIPBLAS_OP_N, row, col, &alpha, in,
col, &beta, nullptr, row, out, row)); col, &beta, nullptr, row, out, row));
} }
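Xgeam maps straight onto hipblasSgeam/hipblasDgeam, which compute C = alpha*op(A) + beta*op(B); with alpha = 1, beta = 0 and transa = HIPBLAS_OP_T the call degenerates into an out-of-place transpose, which is all _Transpose needs. A standalone sketch of that call (B is passed as nullptr only because beta is zero, mirroring the call above; the header path assumes a current ROCm layout):

#include <hip/hip_runtime.h>
#include <hipblas/hipblas.h>
#include <cstdio>
#include <vector>

int main() {
  // Transpose a 2x3 row-major matrix into a 3x2 row-major matrix via geam.
  const int row = 2, col = 3;
  std::vector<float> h_in = {1, 2, 3,
                             4, 5, 6};
  std::vector<float> h_out(row * col, 0);

  float *d_in, *d_out;
  hipMalloc((void**)&d_in, h_in.size() * sizeof(float));
  hipMalloc((void**)&d_out, h_out.size() * sizeof(float));
  hipMemcpy(d_in, h_in.data(), h_in.size() * sizeof(float), hipMemcpyHostToDevice);

  hipblasHandle_t handle;
  hipblasCreate(&handle);
  const float alpha = 1.f, beta = 0.f;
  // Same argument pattern as Xgeam<float> above; leading dimensions follow the
  // column-major convention, so the row-major transpose passes (row, col, col, row).
  hipblasSgeam(handle, HIPBLAS_OP_T, HIPBLAS_OP_N, row, col, &alpha, d_in, col,
               &beta, nullptr, row, d_out, row);

  hipMemcpy(h_out.data(), d_out, h_out.size() * sizeof(float), hipMemcpyDeviceToHost);
  for (int i = 0; i < col; ++i) {              // expect 1 4 / 2 5 / 3 6
    for (int j = 0; j < row; ++j) std::printf("%.0f ", h_out[i * row + j]);
    std::printf("\n");
  }
  hipblasDestroy(handle);
  hipFree(d_in); hipFree(d_out);
  return 0;
}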
...@@ -134,7 +136,7 @@ void _Transpose(const DType* in, DType* out, int row, int col) { ...@@ -134,7 +136,7 @@ void _Transpose(const DType* in, DType* out, int row, int col) {
*/ */
template <> template <>
void _Transpose<__half>(const __half* in, __half* out, int row, int col) { void _Transpose<__half>(const __half* in, __half* out, int row, int col) {
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int nt = FindNumThreads(row); int nt = FindNumThreads(row);
int nb = col; int nb = col;
CUDA_KERNEL_CALL(_TransposeKernel, nb, nt, 0, stream, in, out, col, row); CUDA_KERNEL_CALL(_TransposeKernel, nb, nt, 0, stream, in, out, col, row);
...@@ -146,47 +148,47 @@ void _Transpose<__half>(const __half* in, __half* out, int row, int col) { ...@@ -146,47 +148,47 @@ void _Transpose<__half>(const __half* in, __half* out, int row, int col) {
* @note cuBLAS has no geam API for bf16 data type, fallback to our kernel. * @note cuBLAS has no geam API for bf16 data type, fallback to our kernel.
*/ */
template <> template <>
void _Transpose<__nv_bfloat16>( void _Transpose<__hip_bfloat16>(
const __nv_bfloat16* in, __nv_bfloat16* out, int row, int col) { const __hip_bfloat16* in, __hip_bfloat16* out, int row, int col) {
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int nt = FindNumThreads(row); int nt = FindNumThreads(row);
int nb = col; int nb = col;
CUDA_KERNEL_CALL(_TransposeKernel, nb, nt, 0, stream, in, out, col, row); CUDA_KERNEL_CALL(_TransposeKernel, nb, nt, 0, stream, in, out, col, row);
} }
#endif // BF16_ENABLED #endif // BF16_ENABLED
#if CUDART_VERSION < 11000 #if DTKRT_VERSION < 11000
template <typename DType> template <typename DType>
cusparseStatus_t Xcsrmm2( hipsparseStatus_t Xcsrmm2(
cusparseHandle_t handle, cusparseOperation_t transA, hipsparseHandle_t handle, hipsparseOperation_t transA,
cusparseOperation_t transB, int m, int n, int k, int nnz, hipsparseOperation_t transB, int m, int n, int k, int nnz,
const DType* alpha, const cusparseMatDescr_t descrA, const DType* csrValA, const DType* alpha, const hipsparseMatDescr_t descrA, const DType* csrValA,
const int* csrRowPtrA, const int* csrColIndA, const DType* B, int ldb, const int* csrRowPtrA, const int* csrColIndA, const DType* B, int ldb,
const DType* beta, DType* C, int ldc) { const DType* beta, DType* C, int ldc) {
LOG(INFO) << "Not supported dtype"; LOG(INFO) << "Not supported dtype";
return CUSPARSE_STATUS_EXECUTION_FAILED; return HIPSPARSE_STATUS_EXECUTION_FAILED;
} }
template <> template <>
cusparseStatus_t Xcsrmm2<float>( hipsparseStatus_t Xcsrmm2<float>(
cusparseHandle_t handle, cusparseOperation_t transA, hipsparseHandle_t handle, hipsparseOperation_t transA,
cusparseOperation_t transB, int m, int n, int k, int nnz, hipsparseOperation_t transB, int m, int n, int k, int nnz,
const float* alpha, const cusparseMatDescr_t descrA, const float* csrValA, const float* alpha, const hipsparseMatDescr_t descrA, const float* csrValA,
const int* csrRowPtrA, const int* csrColIndA, const float* B, int ldb, const int* csrRowPtrA, const int* csrColIndA, const float* B, int ldb,
const float* beta, float* C, int ldc) { const float* beta, float* C, int ldc) {
return cusparseScsrmm2( return hipsparseScsrmm2(
handle, transA, transB, m, n, k, nnz, alpha, descrA, csrValA, csrRowPtrA, handle, transA, transB, m, n, k, nnz, alpha, descrA, csrValA, csrRowPtrA,
csrColIndA, B, ldb, beta, C, ldc); csrColIndA, B, ldb, beta, C, ldc);
} }
template <> template <>
cusparseStatus_t Xcsrmm2<double>( hipsparseStatus_t Xcsrmm2<double>(
cusparseHandle_t handle, cusparseOperation_t transA, hipsparseHandle_t handle, hipsparseOperation_t transA,
cusparseOperation_t transB, int m, int n, int k, int nnz, hipsparseOperation_t transB, int m, int n, int k, int nnz,
const double* alpha, const cusparseMatDescr_t descrA, const double* csrValA, const double* alpha, const hipsparseMatDescr_t descrA, const double* csrValA,
const int* csrRowPtrA, const int* csrColIndA, const double* B, int ldb, const int* csrRowPtrA, const int* csrColIndA, const double* B, int ldb,
const double* beta, double* C, int ldc) { const double* beta, double* C, int ldc) {
return cusparseDcsrmm2( return hipsparseDcsrmm2(
handle, transA, transB, m, n, k, nnz, alpha, descrA, csrValA, csrRowPtrA, handle, transA, transB, m, n, k, nnz, alpha, descrA, csrValA, csrRowPtrA,
csrColIndA, B, ldb, beta, C, ldc); csrColIndA, B, ldb, beta, C, ldc);
} }
...@@ -213,12 +215,12 @@ void CusparseCsrmm2( ...@@ -213,12 +215,12 @@ void CusparseCsrmm2(
// device // device
auto device = runtime::DeviceAPI::Get(ctx); auto device = runtime::DeviceAPI::Get(ctx);
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
// allocate cusparse handle if needed // allocate cusparse handle if needed
if (!thr_entry->cusparse_handle) { if (!thr_entry->cusparse_handle) {
CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
} }
CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream)); CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream));
// all one data array // all one data array
DType* valptr = nullptr; DType* valptr = nullptr;
if (!A_data) { if (!A_data) {
...@@ -226,52 +228,52 @@ void CusparseCsrmm2( ...@@ -226,52 +228,52 @@ void CusparseCsrmm2(
static_cast<DType*>(device->AllocWorkspace(ctx, nnz * sizeof(DType))); static_cast<DType*>(device->AllocWorkspace(ctx, nnz * sizeof(DType)));
_Fill(valptr, nnz, static_cast<DType>(1.)); _Fill(valptr, nnz, static_cast<DType>(1.));
} }
#if CUDART_VERSION >= 11000 #if DTKRT_VERSION >= 11000
cusparseSpMatDescr_t matA; hipsparseSpMatDescr_t matA;
cusparseDnMatDescr_t matB, matC; hipsparseDnMatDescr_t matB, matC;
constexpr auto dtype = cuda_dtype<DType>::value; constexpr auto dtype = cuda_dtype<DType>::value;
constexpr auto idtype = cusparse_idtype<IdType>::value; constexpr auto idtype = cusparse_idtype<IdType>::value;
CUSPARSE_CALL(cusparseCreateCsr( CUSPARSE_CALL(hipsparseCreateCsr(
&matA, m, k, nnz, static_cast<IdType*>(csr.indptr->data), &matA, m, k, nnz, static_cast<IdType*>(csr.indptr->data),
static_cast<IdType*>(csr.indices->data), static_cast<IdType*>(csr.indices->data),
const_cast<DType*>(valptr ? valptr : A_data), idtype, idtype, const_cast<DType*>(valptr ? valptr : A_data), idtype, idtype,
CUSPARSE_INDEX_BASE_ZERO, dtype)); HIPSPARSE_INDEX_BASE_ZERO, dtype));
CUSPARSE_CALL(cusparseCreateDnMat( CUSPARSE_CALL(hipsparseCreateDnMat(
&matB, k, n, n, const_cast<DType*>(B_data), dtype, CUSPARSE_ORDER_ROW)); &matB, k, n, n, const_cast<DType*>(B_data), dtype, HIPSPARSE_ORDER_ROW));
CUSPARSE_CALL( CUSPARSE_CALL(
cusparseCreateDnMat(&matC, m, n, n, C_data, dtype, CUSPARSE_ORDER_ROW)); hipsparseCreateDnMat(&matC, m, n, n, C_data, dtype, HIPSPARSE_ORDER_ROW));
auto transA = CUSPARSE_OPERATION_NON_TRANSPOSE; auto transA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
auto transB = CUSPARSE_OPERATION_NON_TRANSPOSE; auto transB = HIPSPARSE_OPERATION_NON_TRANSPOSE;
size_t workspace_size; size_t workspace_size;
CUSPARSE_CALL(cusparseSpMM_bufferSize( CUSPARSE_CALL(hipsparseSpMM_bufferSize(
thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
matC, dtype, CUSPARSE_SPMM_CSR_ALG2, &workspace_size)); matC, dtype, HIPSPARSE_SPMM_CSR_ALG2, &workspace_size));
void* workspace = device->AllocWorkspace(ctx, workspace_size); void* workspace = device->AllocWorkspace(ctx, workspace_size);
CUSPARSE_CALL(cusparseSpMM( CUSPARSE_CALL(hipsparseSpMM(
thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
matC, dtype, CUSPARSE_SPMM_CSR_ALG2, workspace)); matC, dtype, HIPSPARSE_SPMM_CSR_ALG2, workspace));
device->FreeWorkspace(ctx, workspace); device->FreeWorkspace(ctx, workspace);
CUSPARSE_CALL(cusparseDestroySpMat(matA)); CUSPARSE_CALL(hipsparseDestroySpMat(matA));
CUSPARSE_CALL(cusparseDestroyDnMat(matB)); CUSPARSE_CALL(hipsparseDestroyDnMat(matB));
CUSPARSE_CALL(cusparseDestroyDnMat(matC)); CUSPARSE_CALL(hipsparseDestroyDnMat(matC));
#else #else
// allocate matrix for temporary transposed output // allocate matrix for temporary transposed output
DType* trans_out = DType* trans_out =
static_cast<DType*>(device->AllocWorkspace(ctx, m * n * sizeof(DType))); static_cast<DType*>(device->AllocWorkspace(ctx, m * n * sizeof(DType)));
cusparseMatDescr_t descr; hipsparseMatDescr_t descr;
CUSPARSE_CALL(cusparseCreateMatDescr(&descr)); CUSPARSE_CALL(hipsparseCreateMatDescr(&descr));
CUSPARSE_CALL(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)); CUSPARSE_CALL(hipsparseSetMatType(descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
CUSPARSE_CALL(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)); CUSPARSE_CALL(hipsparseSetMatIndexBase(descr, HIPSPARSE_INDEX_BASE_ZERO));
CUSPARSE_CALL(Xcsrmm2<DType>( CUSPARSE_CALL(Xcsrmm2<DType>(
thr_entry->cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, thr_entry->cusparse_handle, HIPSPARSE_OPERATION_NON_TRANSPOSE,
CUSPARSE_OPERATION_TRANSPOSE, m, n, k, nnz, &alpha, descr, HIPSPARSE_OPERATION_TRANSPOSE, m, n, k, nnz, &alpha, descr,
(valptr) ? valptr : A_data, static_cast<int32_t*>(csr.indptr->data), (valptr) ? valptr : A_data, static_cast<int32_t*>(csr.indptr->data),
static_cast<int32_t*>(csr.indices->data), B_data, n, &beta, trans_out, static_cast<int32_t*>(csr.indices->data), B_data, n, &beta, trans_out,
m)); m));
CUSPARSE_CALL(cusparseDestroyMatDescr(descr)); CUSPARSE_CALL(hipsparseDestroyMatDescr(descr));
// transpose the output matrix // transpose the output matrix
_Transpose(trans_out, C_data, n, m); _Transpose(trans_out, C_data, n, m);
device->FreeWorkspace(ctx, trans_out); device->FreeWorkspace(ctx, trans_out);
...@@ -284,7 +286,7 @@ template <typename DType, typename IdType> ...@@ -284,7 +286,7 @@ template <typename DType, typename IdType>
void CusparseCsrmm2Hetero( void CusparseCsrmm2Hetero(
const DGLContext& ctx, const CSRMatrix& csr, const DType* B_data, const DGLContext& ctx, const CSRMatrix& csr, const DType* B_data,
const DType* A_data, DType* C_data, int64_t x_length, const DType* A_data, DType* C_data, int64_t x_length,
cudaStream_t strm_id) { hipStream_t strm_id) {
// We use csrmm2 to perform following operation: // We use csrmm2 to perform following operation:
// C = A x B, where A is a sparse matrix in csr format, B is the dense matrix // C = A x B, where A is a sparse matrix in csr format, B is the dense matrix
// for node feature tensor. However, since cusparse only supports // for node feature tensor. However, since cusparse only supports
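The comment is truncated by the hunk, but this is the usual legacy-csrmm2 caveat: the old API emits C in column-major order while DGL feature tensors are row-major, so on that path the result is staged in a scratch buffer and transposed back afterwards (see trans_out/_Transpose above). The layout identity behind that extra transpose, as a tiny CPU check:

#include <cstdio>

int main() {
  // Pretend the legacy csrmm2 wrote C = [[1,2,3],[4,5,6]] (m=2, n=3) in
  // column-major order, as the old API does.
  const int m = 2, n = 3;
  const float buf[] = {1, 4,   // column 0
                       2, 5,   // column 1
                       3, 6};  // column 2

  // Reading the same buffer with row-major strides of shape (n, m) yields C^T,
  // which is why the code above materializes a temporary and transposes it back.
  for (int i = 0; i < n; ++i) {
    for (int j = 0; j < m; ++j) std::printf("%.0f ", buf[i * m + j]);
    std::printf("\n");  // prints 1 4 / 2 5 / 3 6 == C^T
  }
  return 0;
}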
...@@ -307,9 +309,9 @@ void CusparseCsrmm2Hetero( ...@@ -307,9 +309,9 @@ void CusparseCsrmm2Hetero(
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
// allocate cusparse handle if needed // allocate cusparse handle if needed
if (!thr_entry->cusparse_handle) { if (!thr_entry->cusparse_handle) {
CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
} }
CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, strm_id)); CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, strm_id));
// all one data array // all one data array
DType* valptr = nullptr; DType* valptr = nullptr;
if (!A_data) { if (!A_data) {
...@@ -317,48 +319,48 @@ void CusparseCsrmm2Hetero( ...@@ -317,48 +319,48 @@ void CusparseCsrmm2Hetero(
static_cast<DType*>(device->AllocWorkspace(ctx, nnz * sizeof(DType))); static_cast<DType*>(device->AllocWorkspace(ctx, nnz * sizeof(DType)));
_Fill(valptr, nnz, static_cast<DType>(1.)); _Fill(valptr, nnz, static_cast<DType>(1.));
} }
#if CUDART_VERSION >= 11000 #if DTKRT_VERSION >= 11000
cusparseSpMatDescr_t matA; hipsparseSpMatDescr_t matA;
cusparseDnMatDescr_t matB, matC; hipsparseDnMatDescr_t matB, matC;
constexpr auto dtype = cuda_dtype<DType>::value; constexpr auto dtype = cuda_dtype<DType>::value;
constexpr auto idtype = cusparse_idtype<IdType>::value; constexpr auto idtype = cusparse_idtype<IdType>::value;
CUSPARSE_CALL(cusparseCreateCsr( CUSPARSE_CALL(hipsparseCreateCsr(
&matA, m, k, nnz, static_cast<IdType*>(csr.indptr->data), &matA, m, k, nnz, static_cast<IdType*>(csr.indptr->data),
static_cast<IdType*>(csr.indices->data), static_cast<IdType*>(csr.indices->data),
const_cast<DType*>(valptr ? valptr : A_data), idtype, idtype, const_cast<DType*>(valptr ? valptr : A_data), idtype, idtype,
CUSPARSE_INDEX_BASE_ZERO, dtype)); HIPSPARSE_INDEX_BASE_ZERO, dtype));
CUSPARSE_CALL(cusparseCreateDnMat( CUSPARSE_CALL(hipsparseCreateDnMat(
&matB, k, n, n, const_cast<DType*>(B_data), dtype, CUSPARSE_ORDER_ROW)); &matB, k, n, n, const_cast<DType*>(B_data), dtype, HIPSPARSE_ORDER_ROW));
CUSPARSE_CALL( CUSPARSE_CALL(
cusparseCreateDnMat(&matC, m, n, n, C_data, dtype, CUSPARSE_ORDER_ROW)); hipsparseCreateDnMat(&matC, m, n, n, C_data, dtype, HIPSPARSE_ORDER_ROW));
auto transA = CUSPARSE_OPERATION_NON_TRANSPOSE; auto transA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
auto transB = CUSPARSE_OPERATION_NON_TRANSPOSE; auto transB = HIPSPARSE_OPERATION_NON_TRANSPOSE;
size_t workspace_size; size_t workspace_size;
CUSPARSE_CALL(cusparseSpMM_bufferSize( CUSPARSE_CALL(hipsparseSpMM_bufferSize(
thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
matC, dtype, CUSPARSE_SPMM_CSR_ALG2, &workspace_size)); matC, dtype, HIPSPARSE_SPMM_CSR_ALG2, &workspace_size));
void* workspace = device->AllocWorkspace(ctx, workspace_size); void* workspace = device->AllocWorkspace(ctx, workspace_size);
CUSPARSE_CALL(cusparseSpMM( CUSPARSE_CALL(hipsparseSpMM(
thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
matC, dtype, CUSPARSE_SPMM_CSR_ALG2, workspace)); matC, dtype, HIPSPARSE_SPMM_CSR_ALG2, workspace));
device->FreeWorkspace(ctx, workspace); device->FreeWorkspace(ctx, workspace);
CUSPARSE_CALL(cusparseDestroySpMat(matA)); CUSPARSE_CALL(hipsparseDestroySpMat(matA));
CUSPARSE_CALL(cusparseDestroyDnMat(matB)); CUSPARSE_CALL(hipsparseDestroyDnMat(matB));
CUSPARSE_CALL(cusparseDestroyDnMat(matC)); CUSPARSE_CALL(hipsparseDestroyDnMat(matC));
#else #else
cusparseMatDescr_t descr; hipsparseMatDescr_t descr;
CUSPARSE_CALL(cusparseCreateMatDescr(&descr)); CUSPARSE_CALL(hipsparseCreateMatDescr(&descr));
CUSPARSE_CALL(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)); CUSPARSE_CALL(hipsparseSetMatType(descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
CUSPARSE_CALL(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)); CUSPARSE_CALL(hipsparseSetMatIndexBase(descr, HIPSPARSE_INDEX_BASE_ZERO));
CHECK_EQ(sizeof(IdType), sizeof(int32_t)); CHECK_EQ(sizeof(IdType), sizeof(int32_t));
CUSPARSE_CALL(Xcsrmm2<DType>( CUSPARSE_CALL(Xcsrmm2<DType>(
thr_entry->cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, thr_entry->cusparse_handle, HIPSPARSE_OPERATION_NON_TRANSPOSE,
CUSPARSE_OPERATION_TRANSPOSE, m, n, k, nnz, &alpha, descr, HIPSPARSE_OPERATION_TRANSPOSE, m, n, k, nnz, &alpha, descr,
(valptr) ? valptr : A_data, static_cast<int32_t*>(csr.indptr->data), (valptr) ? valptr : A_data, static_cast<int32_t*>(csr.indptr->data),
static_cast<int32_t*>(csr.indices->data), B_data, n, &beta, C_data, m)); static_cast<int32_t*>(csr.indices->data), B_data, n, &beta, C_data, m));
CUSPARSE_CALL(cusparseDestroyMatDescr(descr)); CUSPARSE_CALL(hipsparseDestroyMatDescr(descr));
#endif #endif
if (valptr) device->FreeWorkspace(ctx, valptr); if (valptr) device->FreeWorkspace(ctx, valptr);
} }
...@@ -625,7 +627,7 @@ void SpMMCoo( ...@@ -625,7 +627,7 @@ void SpMMCoo(
*/ */
#if BF16_ENABLED #if BF16_ENABLED
if (std::is_same<DType, __half>::value || if (std::is_same<DType, __half>::value ||
std::is_same<DType, __nv_bfloat16>::value) std::is_same<DType, __hip_bfloat16>::value)
#else #else
if (std::is_same<DType, __half>::value) if (std::is_same<DType, __half>::value)
#endif // BF16_ENABLED #endif // BF16_ENABLED
...@@ -638,7 +640,7 @@ void SpMMCoo( ...@@ -638,7 +640,7 @@ void SpMMCoo(
*efeat_data = efeat.Ptr<DType>(); *efeat_data = efeat.Ptr<DType>();
DType* out_data = out.Ptr<DType>(); DType* out_data = out.Ptr<DType>();
Idx *argu_data = argu.Ptr<Idx>(), *arge_data = arge.Ptr<Idx>(); Idx *argu_data = argu.Ptr<Idx>(), *arge_data = arge.Ptr<Idx>();
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const int64_t N = coo.num_rows, M = coo.num_cols, E = coo.row->shape[0]; const int64_t N = coo.num_rows, M = coo.num_cols, E = coo.row->shape[0];
int64_t *ubcast_off = nullptr, *ebcast_off = nullptr; int64_t *ubcast_off = nullptr, *ebcast_off = nullptr;
...@@ -703,7 +705,7 @@ void SpMMCsr( ...@@ -703,7 +705,7 @@ void SpMMCsr(
Idx* argu_data = argu.Ptr<Idx>(); Idx* argu_data = argu.Ptr<Idx>();
Idx* arge_data = arge.Ptr<Idx>(); Idx* arge_data = arge.Ptr<Idx>();
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int64_t *ubcast_off = nullptr, *ebcast_off = nullptr; int64_t *ubcast_off = nullptr, *ebcast_off = nullptr;
int64_t len = bcast.out_len, lhs_len = bcast.lhs_len, rhs_len = bcast.rhs_len; int64_t len = bcast.out_len, lhs_len = bcast.lhs_len, rhs_len = bcast.rhs_len;
...@@ -764,7 +766,7 @@ void SpMMCmpCsrHetero( ...@@ -764,7 +766,7 @@ void SpMMCmpCsrHetero(
Idx* argu_data = argu.Ptr<Idx>(); Idx* argu_data = argu.Ptr<Idx>();
Idx* arge_data = arge.Ptr<Idx>(); Idx* arge_data = arge.Ptr<Idx>();
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int64_t *ubcast_off = nullptr, *ebcast_off = nullptr; int64_t *ubcast_off = nullptr, *ebcast_off = nullptr;
int64_t len = bcast.out_len, lhs_len = bcast.lhs_len, rhs_len = bcast.rhs_len; int64_t len = bcast.out_len, lhs_len = bcast.lhs_len, rhs_len = bcast.rhs_len;
......
// !!! This is a file automatically generated by hipify!!!
/** /**
* Copyright (c) 2020 by Contributors * Copyright (c) 2020 by Contributors
* @file array/cuda/spmm.cu * @file array/cuda/spmm.cu
...@@ -6,9 +7,9 @@ ...@@ -6,9 +7,9 @@
#include <dgl/array.h> #include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h" #include "../../runtime/cuda/cuda_common.h"
#include "./functor.cuh" #include "functor.cuh"
#include "./ge_spmm.cuh" #include "ge_spmm.cuh"
#include "./spmm.cuh" #include "spmm.cuh"
namespace dgl { namespace dgl {
...@@ -109,11 +110,11 @@ template void SpMMCsr<kDGLCUDA, int64_t, __half>( ...@@ -109,11 +110,11 @@ template void SpMMCsr<kDGLCUDA, int64_t, __half>(
const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out, const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out,
std::vector<NDArray> out_aux); std::vector<NDArray> out_aux);
#if BF16_ENABLED #if BF16_ENABLED
template void SpMMCsr<kDGLCUDA, int32_t, __nv_bfloat16>( template void SpMMCsr<kDGLCUDA, int32_t, __hip_bfloat16>(
const std::string& op, const std::string& reduce, const BcastOff& bcast, const std::string& op, const std::string& reduce, const BcastOff& bcast,
const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out, const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out,
std::vector<NDArray> out_aux); std::vector<NDArray> out_aux);
template void SpMMCsr<kDGLCUDA, int64_t, __nv_bfloat16>( template void SpMMCsr<kDGLCUDA, int64_t, __hip_bfloat16>(
const std::string& op, const std::string& reduce, const BcastOff& bcast, const std::string& op, const std::string& reduce, const BcastOff& bcast,
const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out, const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out,
std::vector<NDArray> out_aux); std::vector<NDArray> out_aux);
...@@ -144,11 +145,11 @@ template void SpMMCoo<kDGLCUDA, int64_t, __half>( ...@@ -144,11 +145,11 @@ template void SpMMCoo<kDGLCUDA, int64_t, __half>(
const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out, const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out,
std::vector<NDArray> out_aux); std::vector<NDArray> out_aux);
#if BF16_ENABLED #if BF16_ENABLED
template void SpMMCoo<kDGLCUDA, int32_t, __nv_bfloat16>( template void SpMMCoo<kDGLCUDA, int32_t, __hip_bfloat16>(
const std::string& op, const std::string& reduce, const BcastOff& bcast, const std::string& op, const std::string& reduce, const BcastOff& bcast,
const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out, const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out,
std::vector<NDArray> out_aux); std::vector<NDArray> out_aux);
template void SpMMCoo<kDGLCUDA, int64_t, __nv_bfloat16>( template void SpMMCoo<kDGLCUDA, int64_t, __hip_bfloat16>(
const std::string& op, const std::string& reduce, const BcastOff& bcast, const std::string& op, const std::string& reduce, const BcastOff& bcast,
const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out, const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out,
std::vector<NDArray> out_aux); std::vector<NDArray> out_aux);
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/** /**
* Copyright (c) 2020 by Contributors * Copyright (c) 2020 by Contributors
* @file array/cuda/spmm.cu * @file array/cuda/spmm.cu
...@@ -6,9 +8,9 @@ ...@@ -6,9 +8,9 @@
#include <dgl/array.h> #include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h" #include "../../runtime/cuda/cuda_common.h"
#include "./functor.cuh" #include "functor.cuh"
#include "./ge_spmm.cuh" #include "ge_spmm.cuh"
#include "./spmm.cuh" #include "spmm.cuh"
namespace dgl { namespace dgl {
...@@ -37,7 +39,7 @@ void SpMMCsrHetero( ...@@ -37,7 +39,7 @@ void SpMMCsrHetero(
std::vector<DType*> trans_out((*vec_out).size(), NULL); std::vector<DType*> trans_out((*vec_out).size(), NULL);
bool use_legacy_cusparsemm = bool use_legacy_cusparsemm =
(CUDART_VERSION < 11000) && (reduce == "sum") && (DTKRT_VERSION < 11000) && (reduce == "sum") &&
// legacy cuSPARSE does not care about NNZ, hence the argument "false". // legacy cuSPARSE does not care about NNZ, hence the argument "false".
((op == "copy_lhs" && cusparse_available<DType, IdType>(false)) || ((op == "copy_lhs" && cusparse_available<DType, IdType>(false)) ||
(op == "mul" && is_scalar_efeat && (op == "mul" && is_scalar_efeat &&
...@@ -50,7 +52,7 @@ void SpMMCsrHetero( ...@@ -50,7 +52,7 @@ void SpMMCsrHetero(
if (m == 0) continue; if (m == 0) continue;
DType* out = static_cast<DType*>(device->AllocWorkspace( DType* out = static_cast<DType*>(device->AllocWorkspace(
vec_csr[0].indptr->ctx, m * n * sizeof(DType))); vec_csr[0].indptr->ctx, m * n * sizeof(DType)));
CUDA_CALL(cudaMemset(out, 0, m * n * sizeof(DType))); CUDA_CALL(hipMemset(out, 0, m * n * sizeof(DType)));
trans_out[ntype] = out; trans_out[ntype] = out;
} }
} }
...@@ -111,7 +113,7 @@ void SpMMCsrHetero( ...@@ -111,7 +113,7 @@ void SpMMCsrHetero(
} }
} }
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
for (dgl_type_t etype = 0; etype < ufeat_ntids.size(); ++etype) { for (dgl_type_t etype = 0; etype < ufeat_ntids.size(); ++etype) {
const dgl_type_t src_id = ufeat_ntids[etype]; const dgl_type_t src_id = ufeat_ntids[etype];
const dgl_type_t dst_id = out_ntids[etype]; const dgl_type_t dst_id = out_ntids[etype];
...@@ -123,7 +125,7 @@ void SpMMCsrHetero( ...@@ -123,7 +125,7 @@ void SpMMCsrHetero(
cusparse_available<DType, IdType>(more_nnz)) { // cusparse cusparse_available<DType, IdType>(more_nnz)) { // cusparse
/* If CUDA is less than 11.0, put the output in trans_out for later /* If CUDA is less than 11.0, put the output in trans_out for later
* transposition */ * transposition */
DType* out = (CUDART_VERSION < 11000) DType* out = (DTKRT_VERSION < 11000)
? trans_out[dst_id] ? trans_out[dst_id]
: static_cast<DType*>((*vec_out)[dst_id]->data); : static_cast<DType*>((*vec_out)[dst_id]->data);
CusparseCsrmm2Hetero<DType, IdType>( CusparseCsrmm2Hetero<DType, IdType>(
...@@ -209,14 +211,14 @@ template void SpMMCsrHetero<kDGLCUDA, int64_t, __half>( ...@@ -209,14 +211,14 @@ template void SpMMCsrHetero<kDGLCUDA, int64_t, __half>(
const std::vector<dgl_type_t>& ufeat_ntids, const std::vector<dgl_type_t>& ufeat_ntids,
const std::vector<dgl_type_t>& out_ntids); const std::vector<dgl_type_t>& out_ntids);
#if BF16_ENABLED #if BF16_ENABLED
template void SpMMCsrHetero<kDGLCUDA, int32_t, __nv_bfloat16>( template void SpMMCsrHetero<kDGLCUDA, int32_t, __hip_bfloat16>(
const std::string& op, const std::string& reduce, const BcastOff& bcast, const std::string& op, const std::string& reduce, const BcastOff& bcast,
const std::vector<CSRMatrix>& csr, const std::vector<NDArray>& ufeat, const std::vector<CSRMatrix>& csr, const std::vector<NDArray>& ufeat,
const std::vector<NDArray>& efeat, std::vector<NDArray>* out, const std::vector<NDArray>& efeat, std::vector<NDArray>* out,
std::vector<std::vector<NDArray>>* out_aux, std::vector<std::vector<NDArray>>* out_aux,
const std::vector<dgl_type_t>& ufeat_ntids, const std::vector<dgl_type_t>& ufeat_ntids,
const std::vector<dgl_type_t>& out_ntids); const std::vector<dgl_type_t>& out_ntids);
template void SpMMCsrHetero<kDGLCUDA, int64_t, __nv_bfloat16>( template void SpMMCsrHetero<kDGLCUDA, int64_t, __hip_bfloat16>(
const std::string& op, const std::string& reduce, const BcastOff& bcast, const std::string& op, const std::string& reduce, const BcastOff& bcast,
const std::vector<CSRMatrix>& csr, const std::vector<NDArray>& ufeat, const std::vector<CSRMatrix>& csr, const std::vector<NDArray>& ufeat,
const std::vector<NDArray>& efeat, std::vector<NDArray>* out, const std::vector<NDArray>& efeat, std::vector<NDArray>* out,
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/** /**
* Copyright (c) 2020 by Contributors * Copyright (c) 2020 by Contributors
* @file array/cuda/utils.h * @file array/cuda/utils.h
...@@ -11,7 +13,7 @@ ...@@ -11,7 +13,7 @@
#include <dgl/runtime/ndarray.h> #include <dgl/runtime/ndarray.h>
#include <dmlc/logging.h> #include <dmlc/logging.h>
#include <cub/cub.cuh> #include <hipcub/hipcub.hpp>
#include <type_traits> #include <type_traits>
#include "../../runtime/cuda/cuda_common.h" #include "../../runtime/cuda/cuda_common.h"
...@@ -90,7 +92,7 @@ inline int FindNumBlocks(int nblks, int max_nblks = -1) { ...@@ -90,7 +92,7 @@ inline int FindNumBlocks(int nblks, int max_nblks = -1) {
template <typename T> template <typename T>
__device__ __forceinline__ T _ldg(T* addr) { __device__ __forceinline__ T _ldg(T* addr) {
#if __CUDA_ARCH__ >= 350 #if __HIP_DEVICE_COMPILE__
return __ldg(addr); return __ldg(addr);
#else #else
return *addr; return *addr;
...@@ -126,7 +128,7 @@ __global__ void _FillKernel(DType* ptr, size_t length, DType val) { ...@@ -126,7 +128,7 @@ __global__ void _FillKernel(DType* ptr, size_t length, DType val) {
/** @brief Fill the vector started from ptr of size length with val */ /** @brief Fill the vector started from ptr of size length with val */
template <typename DType> template <typename DType>
void _Fill(DType* ptr, size_t length, DType val) { void _Fill(DType* ptr, size_t length, DType val) {
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int nt = FindNumThreads(length); int nt = FindNumThreads(length);
int nb = int nb =
(length + nt - 1) / nt; // on x-axis, no need to worry about upperbound. (length + nt - 1) / nt; // on x-axis, no need to worry about upperbound.
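_FillKernel's body falls outside this hunk; the sketch below assumes it is the usual grid-stride loop and shows the whole launch end to end with the same nt/nb arithmetic:

#include <hip/hip_runtime.h>
#include <cstdio>
#include <vector>

// Assumed shape of the fill kernel: a grid-stride loop writing `val` everywhere.
template <typename DType>
__global__ void FillKernel(DType* ptr, size_t length, DType val) {
  size_t tx = blockIdx.x * blockDim.x + threadIdx.x;
  const size_t stride_x = gridDim.x * blockDim.x;
  while (tx < length) {
    ptr[tx] = val;
    tx += stride_x;
  }
}

int main() {
  const size_t length = 1 << 16;
  float* d_ptr;
  hipMalloc((void**)&d_ptr, length * sizeof(float));
  const int nt = 256;                         // threads per block
  const int nb = (length + nt - 1) / nt;      // blocks on the x-axis
  FillKernel<float><<<nb, nt, 0, 0>>>(d_ptr, length, 1.f);
  std::vector<float> h(length);
  hipMemcpy(h.data(), d_ptr, length * sizeof(float), hipMemcpyDeviceToHost);
  std::printf("first=%f last=%f\n", h.front(), h.back());  // both 1.0
  hipFree(d_ptr);
  return 0;
}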
...@@ -185,8 +187,8 @@ template <typename IdType> ...@@ -185,8 +187,8 @@ template <typename IdType>
__global__ void _LinearSearchKernel( __global__ void _LinearSearchKernel(
const IdType* indptr, const IdType* indices, const IdType* data, const IdType* indptr, const IdType* indices, const IdType* data,
const IdType* row, const IdType* col, int64_t row_stride, const IdType* row, const IdType* col, int64_t row_stride,
int64_t col_stride, int64_t length, const __nv_bfloat16* weights, int64_t col_stride, int64_t length, const __hip_bfloat16* weights,
__nv_bfloat16 filler, __nv_bfloat16* out) { __hip_bfloat16 filler, __hip_bfloat16* out) {
int tx = blockIdx.x * blockDim.x + threadIdx.x; int tx = blockIdx.x * blockDim.x + threadIdx.x;
const int stride_x = gridDim.x * blockDim.x; const int stride_x = gridDim.x * blockDim.x;
while (tx < length) { while (tx < length) {
...@@ -204,7 +206,7 @@ __global__ void _LinearSearchKernel( ...@@ -204,7 +206,7 @@ __global__ void _LinearSearchKernel(
} else { } else {
// If the result is saved in bf16, it should be fine to convert it to // If the result is saved in bf16, it should be fine to convert it to
// float first // float first
out[tx] = weights ? weights[v] : __nv_bfloat16(static_cast<float>(v)); out[tx] = weights ? weights[v] : __hip_bfloat16(static_cast<float>(v));
} }
tx += stride_x; tx += stride_x;
} }
...@@ -277,12 +279,12 @@ template <typename DType, typename BoolType> ...@@ -277,12 +279,12 @@ template <typename DType, typename BoolType>
void MaskSelect( void MaskSelect(
runtime::DeviceAPI* device, const DGLContext& ctx, const DType* input, runtime::DeviceAPI* device, const DGLContext& ctx, const DType* input,
const BoolType* mask, DType* output, int64_t n, int64_t* rst, const BoolType* mask, DType* output, int64_t n, int64_t* rst,
cudaStream_t stream) { hipStream_t stream) {
size_t workspace_size = 0; size_t workspace_size = 0;
CUDA_CALL(cub::DeviceSelect::Flagged( CUDA_CALL(hipcub::DeviceSelect::Flagged(
nullptr, workspace_size, input, mask, output, rst, n, stream)); nullptr, workspace_size, input, mask, output, rst, n, stream));
void* workspace = device->AllocWorkspace(ctx, workspace_size); void* workspace = device->AllocWorkspace(ctx, workspace_size);
CUDA_CALL(cub::DeviceSelect::Flagged( CUDA_CALL(hipcub::DeviceSelect::Flagged(
workspace, workspace_size, input, mask, output, rst, n, stream)); workspace, workspace_size, input, mask, output, rst, n, stream));
device->FreeWorkspace(ctx, workspace); device->FreeWorkspace(ctx, workspace);
} }
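MaskSelect uses hipcub's standard two-phase convention: the first Flagged call, given a null workspace pointer, only reports the temporary-storage size; the second call does the compaction and writes the number of selected items. A standalone sketch of the same pair of calls:

#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>
#include <cstdio>
#include <vector>

int main() {
  const int64_t n = 8;
  std::vector<int> h_in = {10, 11, 12, 13, 14, 15, 16, 17};
  std::vector<char> h_mask = {1, 0, 0, 1, 1, 0, 0, 1};   // keep 4 items

  int *d_in, *d_out;
  char* d_mask;
  int64_t* d_num;
  hipMalloc((void**)&d_in, n * sizeof(int));
  hipMalloc((void**)&d_out, n * sizeof(int));
  hipMalloc((void**)&d_mask, n * sizeof(char));
  hipMalloc((void**)&d_num, sizeof(int64_t));
  hipMemcpy(d_in, h_in.data(), n * sizeof(int), hipMemcpyHostToDevice);
  hipMemcpy(d_mask, h_mask.data(), n * sizeof(char), hipMemcpyHostToDevice);

  // Phase 1: workspace query (null temp storage only reports the size needed).
  size_t workspace_size = 0;
  hipcub::DeviceSelect::Flagged(nullptr, workspace_size, d_in, d_mask, d_out, d_num, n);
  void* workspace;
  hipMalloc((void**)&workspace, workspace_size);
  // Phase 2: the actual stream compaction.
  hipcub::DeviceSelect::Flagged(workspace, workspace_size, d_in, d_mask, d_out, d_num, n);

  int64_t num = 0;
  hipMemcpy(&num, d_num, sizeof(int64_t), hipMemcpyDeviceToHost);
  std::vector<int> h_out(num);
  hipMemcpy(h_out.data(), d_out, num * sizeof(int), hipMemcpyDeviceToHost);
  for (int v : h_out) std::printf("%d ", v);   // 10 13 14 17
  std::printf("\n");
  hipFree(d_in); hipFree(d_out); hipFree(d_mask); hipFree(d_num); hipFree(workspace);
  return 0;
}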
...@@ -290,7 +292,7 @@ void MaskSelect( ...@@ -290,7 +292,7 @@ void MaskSelect(
inline void* GetDevicePointer(runtime::NDArray array) { inline void* GetDevicePointer(runtime::NDArray array) {
void* ptr = array->data; void* ptr = array->data;
if (array.IsPinned()) { if (array.IsPinned()) {
CUDA_CALL(cudaHostGetDevicePointer(&ptr, ptr, 0)); CUDA_CALL(hipHostGetDevicePointer(&ptr, ptr, 0));
} }
return ptr; return ptr;
} }
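GetDevicePointer only matters for pinned (page-locked) arrays: when host memory is allocated as mapped pinned memory, hipHostGetDevicePointer returns the device-side alias that kernels can dereference directly. A minimal sketch of that round trip:

#include <hip/hip_runtime.h>
#include <cstdio>

__global__ void Increment(float* p, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) p[i] += 1.f;
}

int main() {
  const int n = 1024;
  float* h_ptr = nullptr;
  // Pinned, mapped allocation: the host buffer is visible to the device.
  hipHostMalloc((void**)&h_ptr, n * sizeof(float), hipHostMallocMapped);
  for (int i = 0; i < n; ++i) h_ptr[i] = float(i);

  // Same step as GetDevicePointer above: fetch the device-side alias.
  float* d_ptr = nullptr;
  hipHostGetDevicePointer((void**)&d_ptr, h_ptr, 0);

  Increment<<<dim3((n + 255) / 256), dim3(256), 0, 0>>>(d_ptr, n);
  hipDeviceSynchronize();                          // kernel writes land in host memory
  std::printf("%f %f\n", h_ptr[0], h_ptr[n - 1]);  // 1.0 1024.0
  hipHostFree(h_ptr);
  return 0;
}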
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/** /**
* Copyright (c) 2020 by Contributors * Copyright (c) 2020 by Contributors
* @file array/cuda/utils.cu * @file array/cuda/utils.cu
* @brief Utilities for CUDA kernels. * @brief Utilities for CUDA kernels.
*/ */
#include <cub/cub.cuh> #include <hipcub/hipcub.hpp>
#include "../../runtime/cuda/cuda_common.h" #include "../../runtime/cuda/cuda_common.h"
#include "./utils.h" #include "utils.h"
namespace dgl { namespace dgl {
namespace cuda { namespace cuda {
...@@ -17,11 +19,11 @@ bool AllTrue(int8_t* flags, int64_t length, const DGLContext& ctx) { ...@@ -17,11 +19,11 @@ bool AllTrue(int8_t* flags, int64_t length, const DGLContext& ctx) {
int8_t* rst = static_cast<int8_t*>(device->AllocWorkspace(ctx, 1)); int8_t* rst = static_cast<int8_t*>(device->AllocWorkspace(ctx, 1));
// Call CUB's reduction // Call CUB's reduction
size_t workspace_size = 0; size_t workspace_size = 0;
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
CUDA_CALL(cub::DeviceReduce::Min( CUDA_CALL(hipcub::DeviceReduce::Min(
nullptr, workspace_size, flags, rst, length, stream)); nullptr, workspace_size, flags, rst, length, stream));
void* workspace = device->AllocWorkspace(ctx, workspace_size); void* workspace = device->AllocWorkspace(ctx, workspace_size);
CUDA_CALL(cub::DeviceReduce::Min( CUDA_CALL(hipcub::DeviceReduce::Min(
workspace, workspace_size, flags, rst, length, stream)); workspace, workspace_size, flags, rst, length, stream));
int8_t cpu_rst = GetCUDAScalar(device, ctx, rst); int8_t cpu_rst = GetCUDAScalar(device, ctx, rst);
device->FreeWorkspace(ctx, workspace); device->FreeWorkspace(ctx, workspace);
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/** /**
* Copyright (c) 2021 by Contributors * Copyright (c) 2021 by Contributors
* @file array/cpu/array_index_select_uvm.cuh * @file array/cpu/array_index_select_uvm.cuh
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/** /**
* Copyright (c) 2019-2022 by Contributors * Copyright (c) 2019-2022 by Contributors
* @file array/cuda/uvm/array_index_select_uvm.cu * @file array/cuda/uvm/array_index_select_uvm.cu
...@@ -8,7 +10,7 @@ ...@@ -8,7 +10,7 @@
#include "../../../runtime/cuda/cuda_common.h" #include "../../../runtime/cuda/cuda_common.h"
#include "../array_index_select.cuh" #include "../array_index_select.cuh"
#include "../utils.h" #include "../utils.h"
#include "./array_index_select_uvm.cuh" #include "array_index_select_uvm.cuh"
namespace dgl { namespace dgl {
using runtime::NDArray; using runtime::NDArray;
...@@ -17,7 +19,7 @@ namespace impl { ...@@ -17,7 +19,7 @@ namespace impl {
template <typename DType, typename IdType> template <typename DType, typename IdType>
NDArray IndexSelectCPUFromGPU(NDArray array, IdArray index) { NDArray IndexSelectCPUFromGPU(NDArray array, IdArray index) {
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const int64_t arr_len = array->shape[0]; const int64_t arr_len = array->shape[0];
const int64_t len = index->shape[0]; const int64_t len = index->shape[0];
int64_t num_feat = 1; int64_t num_feat = 1;
...@@ -78,7 +80,7 @@ template NDArray IndexSelectCPUFromGPU<int64_t, int64_t>(NDArray, IdArray); ...@@ -78,7 +80,7 @@ template NDArray IndexSelectCPUFromGPU<int64_t, int64_t>(NDArray, IdArray);
template <typename DType, typename IdType> template <typename DType, typename IdType>
void IndexScatterGPUToCPU(NDArray dest, IdArray index, NDArray source) { void IndexScatterGPUToCPU(NDArray dest, IdArray index, NDArray source) {
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const DType* source_data = static_cast<DType*>(source->data); const DType* source_data = static_cast<DType*>(source->data);
const IdType* idx_data = static_cast<IdType*>(index->data); const IdType* idx_data = static_cast<IdType*>(index->data);
const int64_t arr_len = dest->shape[0]; const int64_t arr_len = dest->shape[0];
......
// !!! This is a file automatically generated by hipify!!!
/** /**
* Copyright (c) 2021 by Contributors * Copyright (c) 2021 by Contributors
* @file array/filter.cc * @file array/filter.cc
* @brief Object for selecting items in a set, or selecting items not in a set. * @brief Object for selecting items in a set, or selecting items not in a set.
*/ */
#include "./filter.h" #include "filter.h"
#include <dgl/packed_func_ext.h> #include <dgl/packed_func_ext.h>
#include <dgl/runtime/packed_func.h> #include <dgl/runtime/packed_func.h>
...@@ -23,7 +24,7 @@ DGL_REGISTER_GLOBAL("utils.filter._CAPI_DGLFilterCreateFromSet") ...@@ -23,7 +24,7 @@ DGL_REGISTER_GLOBAL("utils.filter._CAPI_DGLFilterCreateFromSet")
IdArray array = args[0]; IdArray array = args[0];
auto ctx = array->ctx; auto ctx = array->ctx;
// TODO(nv-dlasalle): Implement CPU version. // TODO(nv-dlasalle): Implement CPU version.
if (ctx.device_type == kDGLCUDA) { if (ctx.device_type == kDGLCUDA || ctx.device_type == kDGLROCM) {
#ifdef DGL_USE_CUDA #ifdef DGL_USE_CUDA
ATEN_ID_TYPE_SWITCH(array->dtype, IdType, { ATEN_ID_TYPE_SWITCH(array->dtype, IdType, {
*rv = CreateSetFilter<kDGLCUDA, IdType>(array); *rv = CreateSetFilter<kDGLCUDA, IdType>(array);
......
// !!! This is a file automatically generated by hipify!!!
/** /**
* Copyright (c) 2020 by Contributors * Copyright (c) 2020 by Contributors
* @file array/kernel.cc * @file array/kernel.cc
...@@ -7,7 +8,7 @@ ...@@ -7,7 +8,7 @@
#include <dgl/packed_func_ext.h> #include <dgl/packed_func_ext.h>
#include "../c_api_common.h" #include "../c_api_common.h"
#include "./check.h" #include "check.h"
#include "kernel_decl.h" #include "kernel_decl.h"
using namespace dgl::runtime; using namespace dgl::runtime;
......