Commit 6ac701f8 authored by sangwzh

update src and graphbolt code

parent 1547bd93
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2021 by Contributors
* @file array/cuda/negative_sampling.cu
* @brief negative sampling
*/
#include <curand_kernel.h>
#include <hiprand/hiprand_kernel.h>
#include <dgl/array.h>
#include <dgl/array_iterator.h>
#include <dgl/random.h>
#include <cub/cub.cuh>
#include <hipcub/hipcub.hpp>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
#include "utils.h"
using namespace dgl::runtime;
......@@ -31,13 +33,13 @@ __global__ void _GlobalUniformNegativeSamplingKernel(
int64_t tx = blockIdx.x * blockDim.x + threadIdx.x;
const int stride_x = gridDim.x * blockDim.x;
curandStatePhilox4_32_10_t
hiprandStatePhilox4_32_10_t
rng; // this allows generating 4 32-bit ints at a time
curand_init(random_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
hiprand_init(random_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
while (tx < num_samples) {
for (int i = 0; i < num_trials; ++i) {
uint4 result = curand4(&rng);
uint4 result = hiprand4(&rng);
// Turns out that result.x is always 0 with the above RNG.
uint64_t y_hi = result.y >> 16;
uint64_t y_lo = result.y & 0xFFFF;
......@@ -88,7 +90,7 @@ struct IsNotMinusOne {
template <typename IdType>
void SortOrderedPairs(
runtime::DeviceAPI* device, DGLContext ctx, IdType* major, IdType* minor,
IdType* tmp_major, IdType* tmp_minor, int64_t n, cudaStream_t stream) {
IdType* tmp_major, IdType* tmp_minor, int64_t n, hipStream_t stream) {
// Sort ordered pairs in lexicographical order by two radix sorts since
// cub's radix sorts are stable.
// We need a 2*n auxiliary storage to store the results from the first radix
......@@ -98,21 +100,21 @@ void SortOrderedPairs(
void* tmp2 = nullptr;
// Radix sort by minor key first, reorder the major key in the progress.
CUDA_CALL(cub::DeviceRadixSort::SortPairs(
CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
tmp1, s1, minor, tmp_minor, major, tmp_major, n, 0, sizeof(IdType) * 8,
stream));
tmp1 = device->AllocWorkspace(ctx, s1);
CUDA_CALL(cub::DeviceRadixSort::SortPairs(
CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
tmp1, s1, minor, tmp_minor, major, tmp_major, n, 0, sizeof(IdType) * 8,
stream));
// Radix sort by major key next.
CUDA_CALL(cub::DeviceRadixSort::SortPairs(
CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
tmp2, s2, tmp_major, major, tmp_minor, minor, n, 0, sizeof(IdType) * 8,
stream));
tmp2 = (s2 > s1) ? device->AllocWorkspace(ctx, s2)
: tmp1; // reuse buffer if s2 <= s1
CUDA_CALL(cub::DeviceRadixSort::SortPairs(
CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
tmp2, s2, tmp_major, major, tmp_minor, minor, n, 0, sizeof(IdType) * 8,
stream));
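// A minimal host-side sketch of the idea behind the two radix sorts above,
// assuming the goal stated in the comment: because both passes are stable,
// sorting by the minor key first and then by the major key leaves the pairs
// in lexicographic (major, minor) order. Names here are illustrative only.
#include <algorithm>
#include <cstdint>
#include <vector>

inline void SortOrderedPairsHostSketch(
    std::vector<int64_t>* major, std::vector<int64_t>* minor) {
  const size_t n = major->size();
  std::vector<size_t> perm(n);
  for (size_t i = 0; i < n; ++i) perm[i] = i;
  // Pass 1: stable sort by the minor key.
  std::stable_sort(perm.begin(), perm.end(),
                   [&](size_t a, size_t b) { return (*minor)[a] < (*minor)[b]; });
  // Pass 2: stable sort by the major key; ties keep the minor-key order.
  std::stable_sort(perm.begin(), perm.end(),
                   [&](size_t a, size_t b) { return (*major)[a] < (*major)[b]; });
  std::vector<int64_t> new_major(n), new_minor(n);
  for (size_t i = 0; i < n; ++i) {
    new_major[i] = (*major)[perm[i]];
    new_minor[i] = (*minor)[perm[i]];
  }
  major->swap(new_major);
  minor->swap(new_minor);
}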
......@@ -141,7 +143,7 @@ std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling(
IdType* out_row_data = out_row.Ptr<IdType>();
IdType* out_col_data = out_col.Ptr<IdType>();
auto device = runtime::DeviceAPI::Get(ctx);
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const int nt = cuda::FindNumThreads(num_actual_samples);
const int nb = (num_actual_samples + nt - 1) / nt;
std::pair<IdArray, IdArray> result;
......@@ -159,11 +161,11 @@ std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling(
IsNotMinusOne<IdType> op;
PairIterator<IdType> begin(row_data, col_data);
PairIterator<IdType> out_begin(out_row_data, out_col_data);
CUDA_CALL(cub::DeviceSelect::If(
CUDA_CALL(hipcub::DeviceSelect::If(
nullptr, tmp_size, begin, out_begin, num_out_cuda, num_actual_samples, op,
stream));
void* tmp = device->AllocWorkspace(ctx, tmp_size);
CUDA_CALL(cub::DeviceSelect::If(
CUDA_CALL(hipcub::DeviceSelect::If(
tmp, tmp_size, begin, out_begin, num_out_cuda, num_actual_samples, op,
stream));
num_out = cuda::GetCUDAScalar(device, ctx, num_out_cuda);
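// Host-side sketch of the compaction the two DeviceSelect::If calls above
// perform, assuming IsNotMinusOne keeps pairs whose entries are not -1
// (its definition is elided above). Illustrative only; the device path
// streams PairIterator elements through hipcub and reads the count back
// with GetCUDAScalar.
#include <cstdint>
#include <utility>
#include <vector>

inline std::vector<std::pair<int64_t, int64_t>> CompactPairsHostSketch(
    const std::vector<int64_t>& row, const std::vector<int64_t>& col) {
  std::vector<std::pair<int64_t, int64_t>> kept;
  kept.reserve(row.size());
  for (size_t i = 0; i < row.size(); ++i) {
    if (row[i] != -1 && col[i] != -1)  // analogous to the IsNotMinusOne predicate
      kept.emplace_back(row[i], col[i]);
  }
  return kept;  // kept.size() plays the role of num_out
}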
......@@ -181,25 +183,25 @@ std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling(
size_t tmp_size_unique = 0;
void* tmp_unique = nullptr;
CUDA_CALL(cub::DeviceSelect::Unique(
CUDA_CALL(hipcub::DeviceSelect::Unique(
nullptr, tmp_size_unique, out_begin, unique_begin, num_out_cuda,
num_out, stream));
tmp_unique = (tmp_size_unique > tmp_size)
? device->AllocWorkspace(ctx, tmp_size_unique)
: tmp; // reuse buffer
CUDA_CALL(cub::DeviceSelect::Unique(
CUDA_CALL(hipcub::DeviceSelect::Unique(
tmp_unique, tmp_size_unique, out_begin, unique_begin, num_out_cuda,
num_out, stream));
num_out = cuda::GetCUDAScalar(device, ctx, num_out_cuda);
num_out = std::min(num_samples, num_out);
num_out = ::min(num_samples, num_out);
result = {
unique_row.CreateView({num_out}, dtype),
unique_col.CreateView({num_out}, dtype)};
if (tmp_unique != tmp) device->FreeWorkspace(ctx, tmp_unique);
} else {
num_out = std::min(num_samples, num_out);
num_out = ::min(num_samples, num_out);
result = {
out_row.CreateView({num_out}, dtype),
out_col.CreateView({num_out}, dtype)};
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2021 by Contributors
* @file array/cuda/rowwise_sampling.cu
* @brief uniform rowwise sampling
*/
#include <curand_kernel.h>
#include <hiprand/hiprand_kernel.h>
#include <dgl/random.h>
#include <dgl/runtime/device_api.h>
#include <dgl/runtime/tensordispatch.h>
#include <cub/cub.cuh>
#include <hipcub/hipcub.hpp>
#include <numeric>
#include "../../array/cuda/atomic.cuh"
#include "atomic.cuh"
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
#include "utils.h"
using namespace dgl::cuda;
using namespace dgl::aten::cuda;
......@@ -126,8 +128,8 @@ __global__ void _CSRRowWiseSampleUniformKernel(
const int64_t last_row =
min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
curandStatePhilox4_32_10_t rng;
curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
hiprandStatePhilox4_32_10_t rng;
hiprand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
while (out_row < last_row) {
const int64_t row = in_rows[out_row];
......@@ -151,7 +153,7 @@ __global__ void _CSRRowWiseSampleUniformKernel(
__syncthreads();
for (int idx = num_picks + threadIdx.x; idx < deg; idx += BLOCK_SIZE) {
const int num = curand(&rng) % (idx + 1);
const int num = hiprand(&rng) % (idx + 1);
if (num < num_picks) {
// use max so as to achieve the replacement order the serial
// algorithm would have
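// The loop above is a parallel form of reservoir sampling (Algorithm R):
// the first num_picks neighbors fill the reservoir, and neighbor idx then
// replaces a uniformly chosen slot with probability num_picks / (idx + 1);
// the device code resolves concurrent replacements with an atomic max so the
// result matches the serial replacement order. A serial host sketch, with an
// illustrative name:
#include <cstdint>
#include <random>
#include <vector>

inline std::vector<int64_t> ReservoirSampleHostSketch(
    int64_t deg, int64_t num_picks, std::mt19937_64* rng) {
  std::vector<int64_t> picks;
  picks.reserve(static_cast<size_t>(num_picks));
  for (int64_t idx = 0; idx < deg; ++idx) {
    if (idx < num_picks) {
      picks.push_back(idx);  // fill the reservoir first
    } else {
      std::uniform_int_distribution<int64_t> dist(0, idx);
      const int64_t num = dist(*rng);          // uniform in [0, idx]
      if (num < num_picks) picks[num] = idx;   // replace a random slot
    }
  }
  return picks;  // min(deg, num_picks) neighbor positions, sampled uniformly
}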
......@@ -204,8 +206,8 @@ __global__ void _CSRRowWiseSampleUniformReplaceKernel(
const int64_t last_row =
min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
curandStatePhilox4_32_10_t rng;
curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
hiprandStatePhilox4_32_10_t rng;
hiprand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
while (out_row < last_row) {
const int64_t row = in_rows[out_row];
......@@ -216,7 +218,7 @@ __global__ void _CSRRowWiseSampleUniformReplaceKernel(
if (deg > 0) {
// each thread then blindly copies in rows only if deg > 0.
for (int idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) {
const int64_t edge = curand(&rng) % deg;
const int64_t edge = hiprand(&rng) % deg;
const int64_t out_idx = out_row_start + idx;
out_rows[out_idx] = row;
out_cols[out_idx] = in_index[in_row_start + edge];
......@@ -237,7 +239,7 @@ COOMatrix _CSRRowWiseSamplingUniform(
CSRMatrix mat, IdArray rows, const int64_t num_picks, const bool replace) {
const auto& ctx = rows->ctx;
auto device = runtime::DeviceAPI::Get(ctx);
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const int64_t num_rows = rows->shape[0];
const IdType* const slice_rows = static_cast<const IdType*>(rows->data);
......@@ -279,16 +281,16 @@ COOMatrix _CSRRowWiseSamplingUniform(
IdType* out_ptr = static_cast<IdType*>(
device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType)));
size_t prefix_temp_size = 0;
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
nullptr, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream));
void* prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size);
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
prefix_temp, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream));
device->FreeWorkspace(ctx, prefix_temp);
device->FreeWorkspace(ctx, out_deg);
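// Every device-wide hipcub call in this file follows the same two-phase
// idiom seen just above: a first call with a null temp-storage pointer only
// reports the workspace size, and a second call with the allocated buffer
// does the actual work. A minimal self-contained sketch of that idiom
// (the helper name and the use of hipMalloc instead of the DGL workspace
// allocator are illustrative):
#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>

inline hipError_t ExclusiveSumTwoPhaseSketch(
    const int* d_in, int* d_out, int num_items, hipStream_t stream) {
  void* d_temp = nullptr;
  size_t temp_bytes = 0;
  // Phase 1: size query only, no scan is performed.
  hipcub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, d_in, d_out, num_items, stream);
  hipError_t err = hipMalloc(&d_temp, temp_bytes);
  if (err != hipSuccess) return err;
  // Phase 2: the actual exclusive prefix sum.
  hipcub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, d_in, d_out, num_items, stream);
  return hipFree(d_temp);
}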
cudaEvent_t copyEvent;
CUDA_CALL(cudaEventCreate(&copyEvent));
hipEvent_t copyEvent;
CUDA_CALL(hipEventCreate(&copyEvent));
NDArray new_len_tensor;
if (TensorDispatcher::Global()->IsAvailable()) {
......@@ -301,10 +303,10 @@ COOMatrix _CSRRowWiseSamplingUniform(
}
// copy using the internal current stream
CUDA_CALL(cudaMemcpyAsync(
CUDA_CALL(hipMemcpyAsync(
new_len_tensor->data, out_ptr + num_rows, sizeof(IdType),
cudaMemcpyDeviceToHost, stream));
CUDA_CALL(cudaEventRecord(copyEvent, stream));
hipMemcpyDeviceToHost, stream));
CUDA_CALL(hipEventRecord(copyEvent, stream));
const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000);
......@@ -329,8 +331,8 @@ COOMatrix _CSRRowWiseSamplingUniform(
device->FreeWorkspace(ctx, out_ptr);
// wait for copying `new_len` to finish
CUDA_CALL(cudaEventSynchronize(copyEvent));
CUDA_CALL(cudaEventDestroy(copyEvent));
CUDA_CALL(hipEventSynchronize(copyEvent));
CUDA_CALL(hipEventDestroy(copyEvent));
const IdType new_len = static_cast<const IdType*>(new_len_tensor->data)[0];
picked_row = picked_row.CreateView({new_len}, picked_row->dtype);
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2022 by Contributors
* @file array/cuda/rowwise_sampling_prob.cu
......@@ -6,20 +8,20 @@
* sampling code rowwise_sampling.cu.
* @author pengqirong (OPPO), dlasalle and Xin from Nvidia.
*/
#include <curand_kernel.h>
#include <hiprand/hiprand_kernel.h>
#include <dgl/random.h>
#include <dgl/runtime/device_api.h>
#include <cub/cub.cuh>
#include <hipcub/hipcub.hpp>
#include <numeric>
#include "../../array/cuda/atomic.cuh"
#include "atomic.cuh"
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
#include "utils.h"
// require CUB 1.17 to use DeviceSegmentedSort
static_assert(
CUB_VERSION >= 101700, "Require CUB >= 1.17 to use DeviceSegmentedSort");
// static_assert(
// CUB_VERSION >= 101700, "Require CUB >= 1.17 to use DeviceSegmentedSort");
namespace dgl {
using namespace cuda;
......@@ -159,8 +161,8 @@ __global__ void _CSRAResValueKernel(
const int64_t last_row =
min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
curandStatePhilox4_32_10_t rng;
curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
hiprandStatePhilox4_32_10_t rng;
hiprand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
while (out_row < last_row) {
const int64_t row = in_rows[out_row];
......@@ -179,7 +181,7 @@ __global__ void _CSRAResValueKernel(
prob, data, idx, in_row_start, &item_prob);
// compute A-Res value
ares[ares_idx] = static_cast<FloatType>(
__powf(curand_uniform(&rng), 1.0f / item_prob));
__powf(hiprand_uniform(&rng), 1.0f / item_prob));
ares_idxs[ares_idx] = static_cast<IdType>(in_idx);
}
}
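// The key written above implements A-Res weighted sampling without
// replacement (Efraimidis & Spirakis): each candidate draws key = u^(1/w)
// with u ~ U(0, 1), and the num_picks largest keys win; the device path
// later selects them with a descending segmented sort. A serial host sketch
// with illustrative names:
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <numeric>
#include <random>
#include <vector>

inline std::vector<int64_t> AResSampleHostSketch(
    const std::vector<float>& weights, int64_t num_picks, std::mt19937_64* rng) {
  std::uniform_real_distribution<float> uni(0.0f, 1.0f);
  std::vector<float> keys(weights.size());
  for (size_t i = 0; i < weights.size(); ++i)
    keys[i] = std::pow(uni(*rng), 1.0f / weights[i]);  // the A-Res key
  std::vector<int64_t> idx(weights.size());
  std::iota(idx.begin(), idx.end(), 0);
  const int64_t k = std::min<int64_t>(num_picks, static_cast<int64_t>(idx.size()));
  // Keep the indices of the k largest keys (descending by key).
  std::partial_sort(idx.begin(), idx.begin() + k, idx.end(),
                    [&](int64_t a, int64_t b) { return keys[a] > keys[b]; });
  idx.resize(static_cast<size_t>(k));
  return idx;
}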
......@@ -317,8 +319,8 @@ __global__ void _CSRRowWiseSampleReplaceKernel(
const int64_t last_row =
min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
curandStatePhilox4_32_10_t rng;
curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
hiprandStatePhilox4_32_10_t rng;
hiprand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
while (out_row < last_row) {
const int64_t row = in_rows[out_row];
......@@ -330,7 +332,7 @@ __global__ void _CSRRowWiseSampleReplaceKernel(
if (deg > 0) {
// Specialize BlockScan for a 1D block of BLOCK_SIZE threads
typedef cub::BlockScan<FloatType, BLOCK_SIZE> BlockScan;
typedef hipcub::BlockScan<FloatType, BLOCK_SIZE> BlockScan;
// Allocate shared memory for BlockScan
__shared__ typename BlockScan::TempStorage temp_storage;
// Initialize running total
......@@ -362,10 +364,10 @@ __global__ void _CSRRowWiseSampleReplaceKernel(
for (int64_t idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) {
// get random value
FloatType sum = cdf[cdf_row_start + deg - 1];
FloatType rand = static_cast<FloatType>(curand_uniform(&rng) * sum);
FloatType rand = static_cast<FloatType>(hiprand_uniform(&rng) * sum);
// get the offset of the first value within cdf array which is greater
// than random value.
int64_t item = cub::UpperBound<FloatType*, int64_t, FloatType>(
int64_t item = hipcub::UpperBound<FloatType*, int64_t, FloatType>(
&cdf[cdf_row_start], deg, rand);
item = min(item, deg - 1);
// get in and out index
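// Host sketch of the draw above: build an inclusive prefix sum (CDF) of the
// per-edge probabilities, draw r uniformly in [0, total), and take the first
// position whose CDF value exceeds r; the device code does the scan with
// BlockScan and the search with the UpperBound call above. Assumes a
// non-empty probability vector with a positive total; the name is illustrative.
#include <algorithm>
#include <cstdint>
#include <numeric>
#include <random>
#include <vector>

inline int64_t SampleFromCdfHostSketch(
    const std::vector<float>& prob, std::mt19937_64* rng) {
  std::vector<float> cdf(prob.size());
  std::partial_sum(prob.begin(), prob.end(), cdf.begin());  // inclusive scan
  std::uniform_real_distribution<float> uni(0.0f, cdf.back());
  const float r = uni(*rng);
  auto it = std::upper_bound(cdf.begin(), cdf.end(), r);
  const int64_t item = static_cast<int64_t>(it - cdf.begin());
  return std::min<int64_t>(item, static_cast<int64_t>(prob.size()) - 1);
}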
......@@ -411,7 +413,7 @@ COOMatrix COOGeneralRemoveIf(const COOMatrix& coo, MaskGen maskgen) {
IdType* new_row_data = new_row.Ptr<IdType>();
IdType* new_col_data = new_col.Ptr<IdType>();
IdType* new_eid_data = new_eid.Ptr<IdType>();
auto stream = runtime::getCurrentCUDAStream();
auto stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
auto device = runtime::DeviceAPI::Get(ctx);
int8_t* flags = static_cast<int8_t*>(device->AllocWorkspace(ctx, nnz));
......@@ -441,7 +443,7 @@ COOMatrix _COORemoveIf(
const COOMatrix& coo, const NDArray& values, DType criteria) {
const DType* val = values.Ptr<DType>();
auto maskgen = [val, criteria](
int nb, int nt, cudaStream_t stream, int64_t nnz,
int nb, int nt, hipStream_t stream, int64_t nnz,
const IdType* data, int8_t* flags) {
CUDA_KERNEL_CALL(
(_GenerateFlagsKernel<IdType, DType, int8_t>), nb, nt, 0, stream, nnz,
......@@ -481,7 +483,7 @@ COOMatrix _CSRRowWiseSampling(
const FloatArray& prob, bool replace) {
const auto& ctx = rows->ctx;
auto device = runtime::DeviceAPI::Get(ctx);
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const int64_t num_rows = rows->shape[0];
const IdType* const slice_rows = static_cast<const IdType*>(rows->data);
......@@ -530,10 +532,10 @@ COOMatrix _CSRRowWiseSampling(
IdType* temp_ptr = static_cast<IdType*>(
device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType)));
size_t prefix_temp_size = 0;
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
nullptr, prefix_temp_size, temp_deg, temp_ptr, num_rows + 1, stream));
void* prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size);
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
prefix_temp, prefix_temp_size, temp_deg, temp_ptr, num_rows + 1, stream));
device->FreeWorkspace(ctx, prefix_temp);
device->FreeWorkspace(ctx, temp_deg);
......@@ -551,16 +553,16 @@ COOMatrix _CSRRowWiseSampling(
IdType* out_ptr = static_cast<IdType*>(
device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType)));
prefix_temp_size = 0;
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
nullptr, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream));
prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size);
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
prefix_temp, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream));
device->FreeWorkspace(ctx, prefix_temp);
device->FreeWorkspace(ctx, out_deg);
cudaEvent_t copyEvent;
CUDA_CALL(cudaEventCreate(&copyEvent));
hipEvent_t copyEvent;
CUDA_CALL(hipEventCreate(&copyEvent));
// TODO(dlasalle): use pinned memory to overlap with the actual sampling, and
// wait on a cudaevent
IdType new_len;
......@@ -568,7 +570,7 @@ COOMatrix _CSRRowWiseSampling(
device->CopyDataFromTo(
out_ptr, num_rows * sizeof(new_len), &new_len, 0, sizeof(new_len), ctx,
DGLContext{kDGLCPU, 0}, mat.indptr->dtype);
CUDA_CALL(cudaEventRecord(copyEvent, stream));
CUDA_CALL(hipEventRecord(copyEvent, stream));
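// Sketch of the pinned-memory variant hinted at by the TODO above: an async
// device-to-host copy overlaps with other work only when the host buffer is
// pinned (hipHostMalloc), and the recorded event marks when the value is safe
// to read. The helper name and the plain error handling are illustrative,
// not part of this file.
#include <hip/hip_runtime.h>

inline hipError_t CopyScalarAsyncPinnedSketch(
    const void* d_src, size_t nbytes, void** h_pinned_out, hipStream_t stream) {
  hipError_t err = hipHostMalloc(h_pinned_out, nbytes, hipHostMallocDefault);
  if (err != hipSuccess) return err;
  hipEvent_t done;
  (void)hipEventCreate(&done);
  (void)hipMemcpyAsync(*h_pinned_out, d_src, nbytes, hipMemcpyDeviceToHost, stream);
  (void)hipEventRecord(done, stream);
  // ... other host or device work may run here ...
  (void)hipEventSynchronize(done);  // wait before reading *h_pinned_out
  return hipEventDestroy(done);
}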
// allocate workspace
// 1) for w/ replacement, it's a global buffer to store cdf segments (one
......@@ -612,16 +614,16 @@ COOMatrix _CSRRowWiseSampling(
IdType* sort_temp_idxs = static_cast<IdType*>(
device->AllocWorkspace(ctx, temp_len * sizeof(IdType)));
cub::DoubleBuffer<FloatType> sort_keys(temp, sort_temp);
cub::DoubleBuffer<IdType> sort_values(temp_idxs, sort_temp_idxs);
hipcub::DoubleBuffer<FloatType> sort_keys(temp, sort_temp);
hipcub::DoubleBuffer<IdType> sort_values(temp_idxs, sort_temp_idxs);
void* d_temp_storage = nullptr;
size_t temp_storage_bytes = 0;
CUDA_CALL(cub::DeviceSegmentedSort::SortPairsDescending(
CUDA_CALL(hipcub::DeviceSegmentedSort::SortPairsDescending(
d_temp_storage, temp_storage_bytes, sort_keys, sort_values, temp_len,
num_rows, temp_ptr, temp_ptr + 1, stream));
d_temp_storage = device->AllocWorkspace(ctx, temp_storage_bytes);
CUDA_CALL(cub::DeviceSegmentedSort::SortPairsDescending(
CUDA_CALL(hipcub::DeviceSegmentedSort::SortPairsDescending(
d_temp_storage, temp_storage_bytes, sort_keys, sort_values, temp_len,
num_rows, temp_ptr, temp_ptr + 1, stream));
device->FreeWorkspace(ctx, d_temp_storage);
......@@ -641,8 +643,8 @@ COOMatrix _CSRRowWiseSampling(
device->FreeWorkspace(ctx, out_ptr);
// wait for copying `new_len` to finish
CUDA_CALL(cudaEventSynchronize(copyEvent));
CUDA_CALL(cudaEventDestroy(copyEvent));
CUDA_CALL(hipEventSynchronize(copyEvent));
CUDA_CALL(hipEventDestroy(copyEvent));
picked_row = picked_row.CreateView({new_len}, picked_row->dtype);
picked_col = picked_col.CreateView({new_len}, picked_col->dtype);
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2020 by Contributors
* @file array/cuda/sddmm.cuh
......@@ -10,8 +12,8 @@
#include "../../runtime/cuda/cuda_common.h"
#include "../selector.h"
#include "./functor.cuh"
#include "./utils.h"
#include "functor.cuh"
#include "utils.h"
#include "atomic.cuh"
#include "bf16.cuh"
#include "fp16.cuh"
......@@ -178,7 +180,7 @@ __global__ void SDDMMCooTreeReduceKernel(
}
#pragma unroll
for (int offset = 16; offset > 0; offset /= 2)
val += __shfl_down_sync(full_mask, val, offset);
val += __shfl_down(val, offset);
if (tx == 0) outoff[i] = val;
}
}
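// The shuffle loop above is a tree reduction: with offsets 16, 8, 4, 2, 1 the
// partial sums converge onto lane 0 (tx == 0), which writes the result out.
// A host-side statement of what lane 0 ends up holding, assuming 32 active
// lanes per reduction group (name is illustrative):
#include <cstddef>

template <typename T>
T WarpSumHostSketch(const T* lane_vals /* one value per lane, 32 lanes */) {
  T sum = T(0);
  for (std::size_t lane = 0; lane < 32; ++lane) sum += lane_vals[lane];
  return sum;
}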
......@@ -275,7 +277,7 @@ void SDDMMCoo(
const DType* lhs_data = lhs.Ptr<DType>();
const DType* rhs_data = rhs.Ptr<DType>();
DType* out_data = out.Ptr<DType>();
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int64_t *lhs_off = nullptr, *rhs_off = nullptr;
int64_t len = bcast.out_len, lhs_len = bcast.lhs_len, rhs_len = bcast.rhs_len;
......@@ -337,7 +339,7 @@ void SDDMMCsr(
const DType* lhs_data = lhs.Ptr<DType>();
const DType* rhs_data = rhs.Ptr<DType>();
DType* out_data = out.Ptr<DType>();
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int64_t N = csr.num_rows, M = csr.num_cols, E = csr.indices->shape[0];
int64_t *lhs_off = nullptr, *rhs_off = nullptr;
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file array/cuda/sddmm.cu
......@@ -5,8 +6,8 @@
*/
#include <dgl/array.h>
#include "./functor.cuh"
#include "./sddmm.cuh"
#include "functor.cuh"
#include "sddmm.cuh"
namespace dgl {
namespace aten {
......@@ -48,10 +49,10 @@ template void SDDMMCsr<kDGLCUDA, int64_t, __half>(
const std::string& op, const BcastOff& bcast, const CSRMatrix& csr,
NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target);
#if BF16_ENABLED
template void SDDMMCsr<kDGLCUDA, int32_t, __nv_bfloat16>(
template void SDDMMCsr<kDGLCUDA, int32_t, __hip_bfloat16>(
const std::string& op, const BcastOff& bcast, const CSRMatrix& csr,
NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target);
template void SDDMMCsr<kDGLCUDA, int64_t, __nv_bfloat16>(
template void SDDMMCsr<kDGLCUDA, int64_t, __hip_bfloat16>(
const std::string& op, const BcastOff& bcast, const CSRMatrix& csr,
NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target);
#endif // BF16_ENABLED
......@@ -75,10 +76,10 @@ template void SDDMMCoo<kDGLCUDA, int64_t, __half>(
const std::string& op, const BcastOff& bcast, const COOMatrix& coo,
NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target);
#if BF16_ENABLED
template void SDDMMCoo<kDGLCUDA, int32_t, __nv_bfloat16>(
template void SDDMMCoo<kDGLCUDA, int32_t, __hip_bfloat16>(
const std::string& op, const BcastOff& bcast, const COOMatrix& coo,
NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target);
template void SDDMMCoo<kDGLCUDA, int64_t, __nv_bfloat16>(
template void SDDMMCoo<kDGLCUDA, int64_t, __hip_bfloat16>(
const std::string& op, const BcastOff& bcast, const COOMatrix& coo,
NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target);
#endif // BF16_ENABLED
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file array/cuda/sddmm.cu
......@@ -5,7 +6,7 @@
*/
#include <dgl/array.h>
#include "./sddmm.cuh"
#include "sddmm.cuh"
namespace dgl {
namespace aten {
......@@ -49,13 +50,13 @@ template void SDDMMCooHetero<kDGLCUDA, int64_t, __half>(
int rhs_target, const std::vector<dgl_type_t>& in_eid,
const std::vector<dgl_type_t>& out_eid);
#if BF16_ENABLED
template void SDDMMCooHetero<kDGLCUDA, int32_t, __nv_bfloat16>(
template void SDDMMCooHetero<kDGLCUDA, int32_t, __hip_bfloat16>(
const std::string& op, const BcastOff& bcast,
const std::vector<COOMatrix>& vec_coo, const std::vector<NDArray>& lhs,
const std::vector<NDArray>& rhs, std::vector<NDArray> out, int lhs_target,
int rhs_target, const std::vector<dgl_type_t>& in_eid,
const std::vector<dgl_type_t>& out_eid);
template void SDDMMCooHetero<kDGLCUDA, int64_t, __nv_bfloat16>(
template void SDDMMCooHetero<kDGLCUDA, int64_t, __hip_bfloat16>(
const std::string& op, const BcastOff& bcast,
const std::vector<COOMatrix>& vec_coo, const std::vector<NDArray>& lhs,
const std::vector<NDArray>& rhs, std::vector<NDArray> out, int lhs_target,
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file array/cuda/sddmm.cu
......@@ -5,7 +6,7 @@
*/
#include <dgl/array.h>
#include "./sddmm.cuh"
#include "sddmm.cuh"
namespace dgl {
namespace aten {
......@@ -48,13 +49,13 @@ template void SDDMMCsrHetero<kDGLCUDA, int64_t, __half>(
int rhs_target, const std::vector<dgl_type_t>& in_eid,
const std::vector<dgl_type_t>& out_eid);
#if BF16_ENABLED
template void SDDMMCsrHetero<kDGLCUDA, int32_t, __nv_bfloat16>(
template void SDDMMCsrHetero<kDGLCUDA, int32_t, __hip_bfloat16>(
const std::string& op, const BcastOff& bcast,
const std::vector<CSRMatrix>& vec_csr, const std::vector<NDArray>& lhs,
const std::vector<NDArray>& rhs, std::vector<NDArray> out, int lhs_target,
int rhs_target, const std::vector<dgl_type_t>& in_eid,
const std::vector<dgl_type_t>& out_eid);
template void SDDMMCsrHetero<kDGLCUDA, int64_t, __nv_bfloat16>(
template void SDDMMCsrHetero<kDGLCUDA, int64_t, __hip_bfloat16>(
const std::string& op, const BcastOff& bcast,
const std::vector<CSRMatrix>& vec_csr, const std::vector<NDArray>& lhs,
const std::vector<NDArray>& rhs, std::vector<NDArray> out, int lhs_target,
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2020 by Contributors
* @file array/cuda/segment_reduce.cuh
......@@ -10,8 +12,8 @@
#include <vector>
#include "../../runtime/cuda/cuda_common.h"
#include "./atomic.cuh"
#include "./utils.h"
#include "atomic.cuh"
#include "utils.h"
namespace dgl {
......@@ -125,7 +127,7 @@ void SegmentReduce(NDArray feat, NDArray offsets, NDArray out, NDArray arg) {
DType* out_data = out.Ptr<DType>();
IdType* arg_data = arg.Ptr<IdType>();
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int64_t n = out->shape[0];
int64_t dim = 1;
for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i];
......@@ -155,7 +157,7 @@ void ScatterAdd(NDArray feat, NDArray idx, NDArray out) {
const IdType* idx_data = idx.Ptr<IdType>();
DType* out_data = out.Ptr<DType>();
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int64_t n = feat->shape[0];
int64_t dim = 1;
for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i];
......@@ -186,7 +188,7 @@ void UpdateGradMinMax_hetero(
const std::vector<NDArray>& list_feat, const std::vector<NDArray>& list_idx,
const std::vector<NDArray>& list_idx_types,
std::vector<NDArray>* list_out) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
if (op == "copy_lhs" || op == "copy_rhs") {
std::vector<std::vector<dgl_id_t>> src_dst_ntypes(
graph->NumVertexTypes(), std::vector<dgl_id_t>());
......@@ -239,7 +241,7 @@ void BackwardSegmentCmp(NDArray feat, NDArray arg, NDArray out) {
const IdType* arg_data = arg.Ptr<IdType>();
DType* out_data = out.Ptr<DType>();
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int64_t n = feat->shape[0];
int64_t dim = 1;
for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i];
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file array/cuda/segment_reduce.cu
......@@ -6,9 +7,9 @@
#include <dgl/array.h>
#include <dgl/base_heterograph.h>
#include "./functor.cuh"
#include "./segment_reduce.cuh"
#include "./utils.h"
#include "functor.cuh"
#include "segment_reduce.cuh"
#include "utils.h"
namespace dgl {
......@@ -60,10 +61,10 @@ template void SegmentReduce<kDGLCUDA, int64_t, __half>(
const std::string& op, NDArray feat, NDArray offsets, NDArray out,
NDArray arg);
#if BF16_ENABLED
template void SegmentReduce<kDGLCUDA, int32_t, __nv_bfloat16>(
template void SegmentReduce<kDGLCUDA, int32_t, __hip_bfloat16>(
const std::string& op, NDArray feat, NDArray offsets, NDArray out,
NDArray arg);
template void SegmentReduce<kDGLCUDA, int64_t, __nv_bfloat16>(
template void SegmentReduce<kDGLCUDA, int64_t, __hip_bfloat16>(
const std::string& op, NDArray feat, NDArray offsets, NDArray out,
NDArray arg);
#endif // BF16_ENABLED
......@@ -85,9 +86,9 @@ template void ScatterAdd<kDGLCUDA, int32_t, __half>(
template void ScatterAdd<kDGLCUDA, int64_t, __half>(
NDArray feat, NDArray idx, NDArray out);
#if BF16_ENABLED
template void ScatterAdd<kDGLCUDA, int32_t, __nv_bfloat16>(
template void ScatterAdd<kDGLCUDA, int32_t, __hip_bfloat16>(
NDArray feat, NDArray idx, NDArray out);
template void ScatterAdd<kDGLCUDA, int64_t, __nv_bfloat16>(
template void ScatterAdd<kDGLCUDA, int64_t, __hip_bfloat16>(
NDArray feat, NDArray idx, NDArray out);
#endif // BF16_ENABLED
template void ScatterAdd<kDGLCUDA, int32_t, float>(
......@@ -108,11 +109,11 @@ template void UpdateGradMinMax_hetero<kDGLCUDA, int64_t, __half>(
const std::vector<NDArray>& feat, const std::vector<NDArray>& idx,
const std::vector<NDArray>& idx_etype, std::vector<NDArray>* out);
#if BF16_ENABLED
template void UpdateGradMinMax_hetero<kDGLCUDA, int32_t, __nv_bfloat16>(
template void UpdateGradMinMax_hetero<kDGLCUDA, int32_t, __hip_bfloat16>(
const HeteroGraphPtr& g, const std::string& op,
const std::vector<NDArray>& feat, const std::vector<NDArray>& idx,
const std::vector<NDArray>& idx_etype, std::vector<NDArray>* out);
template void UpdateGradMinMax_hetero<kDGLCUDA, int64_t, __nv_bfloat16>(
template void UpdateGradMinMax_hetero<kDGLCUDA, int64_t, __hip_bfloat16>(
const HeteroGraphPtr& g, const std::string& op,
const std::vector<NDArray>& feat, const std::vector<NDArray>& idx,
const std::vector<NDArray>& idx_etype, std::vector<NDArray>* out);
......@@ -139,9 +140,9 @@ template void BackwardSegmentCmp<kDGLCUDA, int32_t, __half>(
template void BackwardSegmentCmp<kDGLCUDA, int64_t, __half>(
NDArray feat, NDArray arg, NDArray out);
#if BF16_ENABLED
template void BackwardSegmentCmp<kDGLCUDA, int32_t, __nv_bfloat16>(
template void BackwardSegmentCmp<kDGLCUDA, int32_t, __hip_bfloat16>(
NDArray feat, NDArray arg, NDArray out);
template void BackwardSegmentCmp<kDGLCUDA, int64_t, __nv_bfloat16>(
template void BackwardSegmentCmp<kDGLCUDA, int64_t, __hip_bfloat16>(
NDArray feat, NDArray arg, NDArray out);
#endif // BF16_ENABLED
template void BackwardSegmentCmp<kDGLCUDA, int32_t, float>(
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2021 by contributors.
* @file array/cuda/spmat_op_impl_coo.cu
......@@ -10,8 +12,8 @@
#include <vector>
#include "../../runtime/cuda/cuda_common.h"
#include "./atomic.cuh"
#include "./utils.h"
#include "atomic.cuh"
#include "utils.h"
namespace dgl {
......@@ -72,7 +74,7 @@ __global__ void _COOGetRowNNZKernel(
template <DGLDeviceType XPU, typename IdType>
int64_t COOGetRowNNZ(COOMatrix coo, int64_t row) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const auto& ctx = coo.row->ctx;
IdType nnz = coo.row->shape[0];
IdType nt = 1024;
......@@ -103,7 +105,7 @@ __global__ void _COOGetAllRowNNZKernel(
template <DGLDeviceType XPU, typename IdType>
NDArray COOGetRowNNZ(COOMatrix coo, NDArray rows) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const auto& ctx = coo.row->ctx;
IdType nnz = coo.row->shape[0];
IdType num_rows = coo.num_rows;
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2020 by Contributors
* @file array/cuda/spmat_op_impl_csr.cu
......@@ -7,14 +9,14 @@
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
#include <cub/cub.cuh>
#include <hipcub/hipcub.hpp>
#include <numeric>
#include <unordered_set>
#include <vector>
#include "../../runtime/cuda/cuda_common.h"
#include "./atomic.cuh"
#include "./utils.h"
#include "atomic.cuh"
#include "utils.h"
namespace dgl {
......@@ -28,7 +30,7 @@ namespace impl {
template <DGLDeviceType XPU, typename IdType>
bool CSRIsNonZero(CSRMatrix csr, int64_t row, int64_t col) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const auto& ctx = csr.indptr->ctx;
IdArray rows = aten::VecToIdArray<int64_t>({row}, sizeof(IdType) * 8, ctx);
IdArray cols = aten::VecToIdArray<int64_t>({col}, sizeof(IdType) * 8, ctx);
......@@ -53,12 +55,12 @@ template <DGLDeviceType XPU, typename IdType>
NDArray CSRIsNonZero(CSRMatrix csr, NDArray row, NDArray col) {
const auto rowlen = row->shape[0];
const auto collen = col->shape[0];
const auto rstlen = std::max(rowlen, collen);
const auto rstlen = ::max(rowlen, collen);
NDArray rst = NDArray::Empty({rstlen}, row->dtype, row->ctx);
if (rstlen == 0) return rst;
const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1;
const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1;
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const int nt = dgl::cuda::FindNumThreads(rstlen);
const int nb = (rstlen + nt - 1) / nt;
const IdType* data = nullptr;
......@@ -104,7 +106,7 @@ template <DGLDeviceType XPU, typename IdType>
bool CSRHasDuplicate(CSRMatrix csr) {
if (!csr.sorted) csr = CSRSort(csr);
const auto& ctx = csr.indptr->ctx;
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
auto device = runtime::DeviceAPI::Get(ctx);
// We allocate a workspace of num_rows bytes. It wastes a little bit memory
// but should be fine.
......@@ -149,7 +151,7 @@ __global__ void _CSRGetRowNNZKernel(
template <DGLDeviceType XPU, typename IdType>
NDArray CSRGetRowNNZ(CSRMatrix csr, NDArray rows) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const auto len = rows->shape[0];
const IdType* vid_data = rows.Ptr<IdType>();
const IdType* indptr_data =
......@@ -250,7 +252,7 @@ __global__ void _SegmentCopyKernel(
template <DGLDeviceType XPU, typename IdType>
CSRMatrix CSRSliceRows(CSRMatrix csr, NDArray rows) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const int64_t len = rows->shape[0];
IdArray ret_indptr = aten::CumSum(aten::CSRGetRowNNZ(csr, rows), true);
const int64_t nnz = aten::IndexSelect<IdType>(ret_indptr, len);
......@@ -359,7 +361,7 @@ std::vector<NDArray> CSRGetDataAndIndices(
CSRMatrix csr, NDArray row, NDArray col) {
const auto rowlen = row->shape[0];
const auto collen = col->shape[0];
const auto len = std::max(rowlen, collen);
const auto len = ::max(rowlen, collen);
if (len == 0) return {NullArray(), NullArray(), NullArray()};
const auto& ctx = row->ctx;
......@@ -367,7 +369,7 @@ std::vector<NDArray> CSRGetDataAndIndices(
const int64_t nnz = csr.indices->shape[0];
const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1;
const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1;
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const IdType* indptr_data =
static_cast<IdType*>(GetDevicePointer(csr.indptr));
......@@ -532,7 +534,7 @@ __global__ void _SegmentMaskColKernel(
static_cast<IdType>(num_rows));
NodeQueryHashmap<IdType> hashmap(hashmap_buffer, buffer_size);
typedef cub::WarpReduce<IdType> WarpReduce;
typedef hipcub::WarpReduce<IdType> WarpReduce;
__shared__ typename WarpReduce::TempStorage temp_storage[BLOCK_WARPS];
while (out_row < last_row) {
......@@ -547,6 +549,7 @@ __global__ void _SegmentMaskColKernel(
}
}
IdType reduce_count = WarpReduce(temp_storage[warp_id]).Sum(local_count);
printf("out_row = %d , reduce_count = %d \n", out_row, reduce_count);
if (laneid == 0) {
count[out_row] = reduce_count;
}
......@@ -557,13 +560,16 @@ __global__ void _SegmentMaskColKernel(
template <DGLDeviceType XPU, typename IdType>
CSRMatrix CSRSliceMatrix(
CSRMatrix csr, runtime::NDArray rows, runtime::NDArray cols) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const auto& ctx = rows->ctx;
const auto& dtype = rows->dtype;
const auto nbits = dtype.bits;
const int64_t new_nrows = rows->shape[0];
const int64_t new_ncols = cols->shape[0];
std::cout << "new_nrows : " << new_nrows << std::endl;
std::cout << "new_ncols : " << new_ncols << std::endl;
if (new_nrows == 0 || new_ncols == 0)
return CSRMatrix(
new_nrows, new_ncols, Full(0, new_nrows + 1, nbits, ctx),
......@@ -572,6 +578,7 @@ CSRMatrix CSRSliceMatrix(
// First slice rows
csr = CSRSliceRows(csr, rows);
std::cout << "csr.indices->shape[0] : " << csr.indices->shape[0] << std::endl;
if (csr.indices->shape[0] == 0)
return CSRMatrix(
new_nrows, new_ncols, Full(0, new_nrows + 1, nbits, ctx),
......@@ -581,9 +588,11 @@ CSRMatrix CSRSliceMatrix(
IdArray mask = Full(0, csr.indices->shape[0], nbits, ctx);
// A count for how many masked values per row.
IdArray count = NewIdArray(csr.num_rows, ctx, nbits);
std::cout << "1 IdArray count : " << count << std::endl;
CUDA_CALL(
cudaMemset(count.Ptr<IdType>(), 0, sizeof(IdType) * (csr.num_rows)));
hipMemset(count.Ptr<IdType>(), 0, sizeof(IdType) * (csr.num_rows)));
std::cout << "2 IdArray count : " << count << std::endl;
// Generate a NodeQueryHashmap buffer. The key of the hashmap is col.
// For performance, the load factor of the hashmap is in (0.25, 0.5);
// Because num_cols is usually less than 1 Million (on GPU), the
......@@ -593,7 +602,7 @@ CSRMatrix CSRSliceMatrix(
using it = thrust::counting_iterator<int64_t>;
runtime::CUDAWorkspaceAllocator allocator(ctx);
const auto exec_policy = thrust::cuda::par_nosync(allocator).on(stream);
const auto exec_policy = thrust::hip::par_nosync(allocator).on(stream);
thrust::for_each(
exec_policy, it(0), it(new_ncols),
[key = cols.Ptr<IdType>(), buffer = hashmap_buffer.Ptr<IdType>(),
......@@ -617,20 +626,37 @@ CSRMatrix CSRSliceMatrix(
dgl::cuda::FindNumBlocks<'x'>((num_rows + TILE_SIZE - 1) / TILE_SIZE);
const dim3 nthrs(WARP_SIZE, BLOCK_WARPS);
const dim3 nblks(nb);
std::cout << "nthrs.x : " << nthrs.x << " nthrs.y : " << nthrs.y << " nthrs.z : " << nthrs.z << std::endl;
std::cout << "nblks.x : " << nblks.x << " nblks.y : " << nblks.y << " nblks.z : " << nblks.z << std::endl;
std::cout << "WARP_SIZE : " << WARP_SIZE << " BLOCK_WARPS : " << BLOCK_WARPS << "TILE_SIZE : " << std::endl;
std::cout << "indptr_data : " << indptr_data << std::endl;
std::cout << "indices_data : " << indices_data << std::endl;
std::cout << "num_rows : " << num_rows << std::endl;
std::cout << "buffer_size : " << buffer_size << std::endl;
std::cout << "mask : " << mask << std::endl;
std::cout << "count : " << count << std::endl;
std::cout << "hashmap_buffer : " << hashmap_buffer << std::endl;
CUDA_KERNEL_CALL(
(_SegmentMaskColKernel<IdType, WARP_SIZE, BLOCK_WARPS, TILE_SIZE>), nblks,
nthrs, 0, stream, indptr_data, indices_data, num_rows,
hashmap_buffer.Ptr<IdType>(), buffer_size, mask.Ptr<IdType>(),
count.Ptr<IdType>());
std::cout << "3 IdArray count : " << count << std::endl;
IdArray idx = AsNumBits(NonZero(mask), nbits);
std::cout << "idx->shape[0] : " << idx->shape[0] << std::endl;
if (idx->shape[0] == 0)
return CSRMatrix(
new_nrows, new_ncols, Full(0, new_nrows + 1, nbits, ctx),
NullArray(dtype, ctx), NullArray(dtype, ctx));
// Indptr needs to be adjusted according to the new nnz per row.
std::cout << " count : " << count << std::endl;
IdArray ret_indptr = CumSum(count, true);
std::cout << " IdArray ret_indptr : " << ret_indptr << std::endl;
// Column & data can be obtained by index select.
IdArray ret_col = IndexSelect(csr.indices, idx);
......@@ -641,6 +667,8 @@ CSRMatrix CSRSliceMatrix(
Scatter_(cols, Range(0, cols->shape[0], nbits, ctx), col_hash);
ret_col = IndexSelect(col_hash, ret_col);
// std::cout << "new_nrows : " << new_nrows << " new_ncols : " << new_ncols << " ret_indptr : " << ret_indptr << " ret_col : " << ret_col << " ret_data : " << std::endl;
return CSRMatrix(new_nrows, new_ncols, ret_indptr, ret_col, ret_data);
}
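// Host sketch of the column-slicing step implemented above: keep only the
// entries whose column id is among `cols`, rebuild indptr from the per-row
// counts, and remap kept columns to their new ids. The device code does this
// with a NodeQueryHashmap, a mask plus CumSum, and IndexSelect/Scatter_; the
// std::unordered_map here and the omission of the data array are illustrative.
#include <cstdint>
#include <unordered_map>
#include <vector>

inline void CSRSliceColsHostSketch(
    const std::vector<int64_t>& indptr, const std::vector<int64_t>& indices,
    const std::vector<int64_t>& cols,  // requested columns; new id = position
    std::vector<int64_t>* out_indptr, std::vector<int64_t>* out_indices) {
  std::unordered_map<int64_t, int64_t> new_id;
  for (size_t i = 0; i < cols.size(); ++i)
    new_id[cols[i]] = static_cast<int64_t>(i);
  const size_t num_rows = indptr.size() - 1;
  out_indptr->assign(num_rows + 1, 0);
  out_indices->clear();
  for (size_t r = 0; r < num_rows; ++r) {
    for (int64_t e = indptr[r]; e < indptr[r + 1]; ++e) {
      auto it = new_id.find(indices[e]);
      if (it != new_id.end()) out_indices->push_back(it->second);
    }
    (*out_indptr)[r + 1] = static_cast<int64_t>(out_indices->size());
  }
}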
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2020 by Contributors
* @file array/cuda/spmm.cuh
......@@ -11,7 +13,7 @@
#include <limits>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
#include "utils.h"
#include "atomic.cuh"
#include "bf16.cuh"
#include "fp16.cuh"
......@@ -28,14 +30,14 @@ namespace aten {
*/
template <typename DType, typename IdType>
inline bool cusparse_available(bool more_nnz_than_matrix_size) {
#if CUDART_VERSION < 11000
#if DTKRT_VERSION < 11000
if (std::is_same<IdType, int>::value &&
(std::is_same<DType, float>::value || std::is_same<DType, double>::value))
return true;
return false;
#else
if (std::is_same<DType, __half>::value ||
std::is_same<DType, __nv_bfloat16>::value)
std::is_same<DType, __hip_bfloat16>::value)
return false; // cusparse's SpMM on fp16 is slow, temporarily disabled.
// If the CSR matrix has more NNZ than matrix size, we should not use
// cuSPARSE 11.1.
......@@ -47,54 +49,54 @@ namespace {
/** @brief Call cuBLAS geam API for transpose operation for float and double. */
template <typename DType>
cublasStatus_t Xgeam(
cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
hipblasStatus_t Xgeam(
hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
int m, int n, const DType* alpha, const DType* A, int lda,
const DType* beta, const DType* B, int ldb, DType* C, int ldc) {
LOG(FATAL) << "Not supported dtype";
return CUBLAS_STATUS_EXECUTION_FAILED;
return HIPBLAS_STATUS_EXECUTION_FAILED;
}
template <>
cublasStatus_t Xgeam<__half>(
cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
hipblasStatus_t Xgeam<__half>(
hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
int m, int n, const __half* alpha, const __half* A, int lda,
const __half* beta, const __half* B, int ldb, __half* C, int ldc) {
// TODO(ndickson): There is no cublasHgeam, so a different
// implementation would be required.
LOG(FATAL) << "Xgeam does not support dtype half (FP16)";
return CUBLAS_STATUS_EXECUTION_FAILED;
return HIPBLAS_STATUS_EXECUTION_FAILED;
}
#if BF16_ENABLED
template <>
cublasStatus_t Xgeam<__nv_bfloat16>(
cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
int m, int n, const __nv_bfloat16* alpha, const __nv_bfloat16* A, int lda,
const __nv_bfloat16* beta, const __nv_bfloat16* B, int ldb,
__nv_bfloat16* C, int ldc) {
hipblasStatus_t Xgeam<__hip_bfloat16>(
hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
int m, int n, const __hip_bfloat16* alpha, const __hip_bfloat16* A, int lda,
const __hip_bfloat16* beta, const __hip_bfloat16* B, int ldb,
__hip_bfloat16* C, int ldc) {
// TODO(ndickson): There is no cublasHgeam, so a different
// implementation would be required.
LOG(FATAL) << "Xgeam does not support dtype bfloat16 (BF16)";
return CUBLAS_STATUS_EXECUTION_FAILED;
return HIPBLAS_STATUS_EXECUTION_FAILED;
}
#endif // BF16_ENABLED
template <>
cublasStatus_t Xgeam<float>(
cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
hipblasStatus_t Xgeam<float>(
hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
int m, int n, const float* alpha, const float* A, int lda,
const float* beta, const float* B, int ldb, float* C, int ldc) {
return cublasSgeam(
return hipblasSgeam(
handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
}
template <>
cublasStatus_t Xgeam<double>(
cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
hipblasStatus_t Xgeam<double>(
hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
int m, int n, const double* alpha, const double* A, int lda,
const double* beta, const double* B, int ldb, double* C, int ldc) {
return cublasDgeam(
return hipblasDgeam(
handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
}
......@@ -119,12 +121,12 @@ template <typename DType>
void _Transpose(const DType* in, DType* out, int row, int col) {
DType alpha = 1., beta = 0.;
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
if (!thr_entry->cublas_handle)
CUBLAS_CALL(cublasCreate(&(thr_entry->cublas_handle)));
CUBLAS_CALL(cublasSetStream(thr_entry->cublas_handle, stream));
CUBLAS_CALL(hipblasCreate(&(thr_entry->cublas_handle)));
CUBLAS_CALL(hipblasSetStream(thr_entry->cublas_handle, stream));
CUBLAS_CALL(Xgeam<DType>(
thr_entry->cublas_handle, CUBLAS_OP_T, CUBLAS_OP_N, row, col, &alpha, in,
thr_entry->cublas_handle, HIPBLAS_OP_T, HIPBLAS_OP_N, row, col, &alpha, in,
col, &beta, nullptr, row, out, row));
}
......@@ -134,7 +136,7 @@ void _Transpose(const DType* in, DType* out, int row, int col) {
*/
template <>
void _Transpose<__half>(const __half* in, __half* out, int row, int col) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int nt = FindNumThreads(row);
int nb = col;
CUDA_KERNEL_CALL(_TransposeKernel, nb, nt, 0, stream, in, out, col, row);
......@@ -146,47 +148,47 @@ void _Transpose<__half>(const __half* in, __half* out, int row, int col) {
* @note cuBLAS has no geam API for bf16 data type, fallback to our kernel.
*/
template <>
void _Transpose<__nv_bfloat16>(
const __nv_bfloat16* in, __nv_bfloat16* out, int row, int col) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
void _Transpose<__hip_bfloat16>(
const __hip_bfloat16* in, __hip_bfloat16* out, int row, int col) {
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int nt = FindNumThreads(row);
int nb = col;
CUDA_KERNEL_CALL(_TransposeKernel, nb, nt, 0, stream, in, out, col, row);
}
#endif // BF16_ENABLED
#if CUDART_VERSION < 11000
#if DTKRT_VERSION < 11000
template <typename DType>
cusparseStatus_t Xcsrmm2(
cusparseHandle_t handle, cusparseOperation_t transA,
cusparseOperation_t transB, int m, int n, int k, int nnz,
const DType* alpha, const cusparseMatDescr_t descrA, const DType* csrValA,
hipsparseStatus_t Xcsrmm2(
hipsparseHandle_t handle, hipsparseOperation_t transA,
hipsparseOperation_t transB, int m, int n, int k, int nnz,
const DType* alpha, const hipsparseMatDescr_t descrA, const DType* csrValA,
const int* csrRowPtrA, const int* csrColIndA, const DType* B, int ldb,
const DType* beta, DType* C, int ldc) {
LOG(INFO) << "Not supported dtype";
return CUSPARSE_STATUS_EXECUTION_FAILED;
return HIPSPARSE_STATUS_EXECUTION_FAILED;
}
template <>
cusparseStatus_t Xcsrmm2<float>(
cusparseHandle_t handle, cusparseOperation_t transA,
cusparseOperation_t transB, int m, int n, int k, int nnz,
const float* alpha, const cusparseMatDescr_t descrA, const float* csrValA,
hipsparseStatus_t Xcsrmm2<float>(
hipsparseHandle_t handle, hipsparseOperation_t transA,
hipsparseOperation_t transB, int m, int n, int k, int nnz,
const float* alpha, const hipsparseMatDescr_t descrA, const float* csrValA,
const int* csrRowPtrA, const int* csrColIndA, const float* B, int ldb,
const float* beta, float* C, int ldc) {
return cusparseScsrmm2(
return hipsparseScsrmm2(
handle, transA, transB, m, n, k, nnz, alpha, descrA, csrValA, csrRowPtrA,
csrColIndA, B, ldb, beta, C, ldc);
}
template <>
cusparseStatus_t Xcsrmm2<double>(
cusparseHandle_t handle, cusparseOperation_t transA,
cusparseOperation_t transB, int m, int n, int k, int nnz,
const double* alpha, const cusparseMatDescr_t descrA, const double* csrValA,
hipsparseStatus_t Xcsrmm2<double>(
hipsparseHandle_t handle, hipsparseOperation_t transA,
hipsparseOperation_t transB, int m, int n, int k, int nnz,
const double* alpha, const hipsparseMatDescr_t descrA, const double* csrValA,
const int* csrRowPtrA, const int* csrColIndA, const double* B, int ldb,
const double* beta, double* C, int ldc) {
return cusparseDcsrmm2(
return hipsparseDcsrmm2(
handle, transA, transB, m, n, k, nnz, alpha, descrA, csrValA, csrRowPtrA,
csrColIndA, B, ldb, beta, C, ldc);
}
......@@ -213,12 +215,12 @@ void CusparseCsrmm2(
// device
auto device = runtime::DeviceAPI::Get(ctx);
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
// allocate cusparse handle if needed
if (!thr_entry->cusparse_handle) {
CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
}
CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream));
CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream));
// all one data array
DType* valptr = nullptr;
if (!A_data) {
......@@ -226,52 +228,52 @@ void CusparseCsrmm2(
static_cast<DType*>(device->AllocWorkspace(ctx, nnz * sizeof(DType)));
_Fill(valptr, nnz, static_cast<DType>(1.));
}
#if CUDART_VERSION >= 11000
cusparseSpMatDescr_t matA;
cusparseDnMatDescr_t matB, matC;
#if DTKRT_VERSION >= 11000
hipsparseSpMatDescr_t matA;
hipsparseDnMatDescr_t matB, matC;
constexpr auto dtype = cuda_dtype<DType>::value;
constexpr auto idtype = cusparse_idtype<IdType>::value;
CUSPARSE_CALL(cusparseCreateCsr(
CUSPARSE_CALL(hipsparseCreateCsr(
&matA, m, k, nnz, static_cast<IdType*>(csr.indptr->data),
static_cast<IdType*>(csr.indices->data),
const_cast<DType*>(valptr ? valptr : A_data), idtype, idtype,
CUSPARSE_INDEX_BASE_ZERO, dtype));
CUSPARSE_CALL(cusparseCreateDnMat(
&matB, k, n, n, const_cast<DType*>(B_data), dtype, CUSPARSE_ORDER_ROW));
HIPSPARSE_INDEX_BASE_ZERO, dtype));
CUSPARSE_CALL(hipsparseCreateDnMat(
&matB, k, n, n, const_cast<DType*>(B_data), dtype, HIPSPARSE_ORDER_ROW));
CUSPARSE_CALL(
cusparseCreateDnMat(&matC, m, n, n, C_data, dtype, CUSPARSE_ORDER_ROW));
hipsparseCreateDnMat(&matC, m, n, n, C_data, dtype, HIPSPARSE_ORDER_ROW));
auto transA = CUSPARSE_OPERATION_NON_TRANSPOSE;
auto transB = CUSPARSE_OPERATION_NON_TRANSPOSE;
auto transA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
auto transB = HIPSPARSE_OPERATION_NON_TRANSPOSE;
size_t workspace_size;
CUSPARSE_CALL(cusparseSpMM_bufferSize(
CUSPARSE_CALL(hipsparseSpMM_bufferSize(
thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
matC, dtype, CUSPARSE_SPMM_CSR_ALG2, &workspace_size));
matC, dtype, HIPSPARSE_SPMM_CSR_ALG2, &workspace_size));
void* workspace = device->AllocWorkspace(ctx, workspace_size);
CUSPARSE_CALL(cusparseSpMM(
CUSPARSE_CALL(hipsparseSpMM(
thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
matC, dtype, CUSPARSE_SPMM_CSR_ALG2, workspace));
matC, dtype, HIPSPARSE_SPMM_CSR_ALG2, workspace));
device->FreeWorkspace(ctx, workspace);
CUSPARSE_CALL(cusparseDestroySpMat(matA));
CUSPARSE_CALL(cusparseDestroyDnMat(matB));
CUSPARSE_CALL(cusparseDestroyDnMat(matC));
CUSPARSE_CALL(hipsparseDestroySpMat(matA));
CUSPARSE_CALL(hipsparseDestroyDnMat(matB));
CUSPARSE_CALL(hipsparseDestroyDnMat(matC));
#else
// allocate matrix for temporary transposed output
DType* trans_out =
static_cast<DType*>(device->AllocWorkspace(ctx, m * n * sizeof(DType)));
cusparseMatDescr_t descr;
CUSPARSE_CALL(cusparseCreateMatDescr(&descr));
CUSPARSE_CALL(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL));
CUSPARSE_CALL(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO));
hipsparseMatDescr_t descr;
CUSPARSE_CALL(hipsparseCreateMatDescr(&descr));
CUSPARSE_CALL(hipsparseSetMatType(descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
CUSPARSE_CALL(hipsparseSetMatIndexBase(descr, HIPSPARSE_INDEX_BASE_ZERO));
CUSPARSE_CALL(Xcsrmm2<DType>(
thr_entry->cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
CUSPARSE_OPERATION_TRANSPOSE, m, n, k, nnz, &alpha, descr,
thr_entry->cusparse_handle, HIPSPARSE_OPERATION_NON_TRANSPOSE,
HIPSPARSE_OPERATION_TRANSPOSE, m, n, k, nnz, &alpha, descr,
(valptr) ? valptr : A_data, static_cast<int32_t*>(csr.indptr->data),
static_cast<int32_t*>(csr.indices->data), B_data, n, &beta, trans_out,
m));
CUSPARSE_CALL(cusparseDestroyMatDescr(descr));
CUSPARSE_CALL(hipsparseDestroyMatDescr(descr));
// transpose the output matrix
_Transpose(trans_out, C_data, n, m);
device->FreeWorkspace(ctx, trans_out);
......@@ -284,7 +286,7 @@ template <typename DType, typename IdType>
void CusparseCsrmm2Hetero(
const DGLContext& ctx, const CSRMatrix& csr, const DType* B_data,
const DType* A_data, DType* C_data, int64_t x_length,
cudaStream_t strm_id) {
hipStream_t strm_id) {
// We use csrmm2 to perform following operation:
// C = A x B, where A is a sparse matrix in csr format, B is the dense matrix
// for node feature tensor. However, since cusparse only supports
......@@ -307,9 +309,9 @@ void CusparseCsrmm2Hetero(
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
// allocate cusparse handle if needed
if (!thr_entry->cusparse_handle) {
CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
}
CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, strm_id));
CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, strm_id));
// all one data array
DType* valptr = nullptr;
if (!A_data) {
......@@ -317,48 +319,48 @@ void CusparseCsrmm2Hetero(
static_cast<DType*>(device->AllocWorkspace(ctx, nnz * sizeof(DType)));
_Fill(valptr, nnz, static_cast<DType>(1.));
}
#if CUDART_VERSION >= 11000
cusparseSpMatDescr_t matA;
cusparseDnMatDescr_t matB, matC;
#if DTKRT_VERSION >= 11000
hipsparseSpMatDescr_t matA;
hipsparseDnMatDescr_t matB, matC;
constexpr auto dtype = cuda_dtype<DType>::value;
constexpr auto idtype = cusparse_idtype<IdType>::value;
CUSPARSE_CALL(cusparseCreateCsr(
CUSPARSE_CALL(hipsparseCreateCsr(
&matA, m, k, nnz, static_cast<IdType*>(csr.indptr->data),
static_cast<IdType*>(csr.indices->data),
const_cast<DType*>(valptr ? valptr : A_data), idtype, idtype,
CUSPARSE_INDEX_BASE_ZERO, dtype));
CUSPARSE_CALL(cusparseCreateDnMat(
&matB, k, n, n, const_cast<DType*>(B_data), dtype, CUSPARSE_ORDER_ROW));
HIPSPARSE_INDEX_BASE_ZERO, dtype));
CUSPARSE_CALL(hipsparseCreateDnMat(
&matB, k, n, n, const_cast<DType*>(B_data), dtype, HIPSPARSE_ORDER_ROW));
CUSPARSE_CALL(
cusparseCreateDnMat(&matC, m, n, n, C_data, dtype, CUSPARSE_ORDER_ROW));
hipsparseCreateDnMat(&matC, m, n, n, C_data, dtype, HIPSPARSE_ORDER_ROW));
auto transA = CUSPARSE_OPERATION_NON_TRANSPOSE;
auto transB = CUSPARSE_OPERATION_NON_TRANSPOSE;
auto transA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
auto transB = HIPSPARSE_OPERATION_NON_TRANSPOSE;
size_t workspace_size;
CUSPARSE_CALL(cusparseSpMM_bufferSize(
CUSPARSE_CALL(hipsparseSpMM_bufferSize(
thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
matC, dtype, CUSPARSE_SPMM_CSR_ALG2, &workspace_size));
matC, dtype, HIPSPARSE_SPMM_CSR_ALG2, &workspace_size));
void* workspace = device->AllocWorkspace(ctx, workspace_size);
CUSPARSE_CALL(cusparseSpMM(
CUSPARSE_CALL(hipsparseSpMM(
thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
matC, dtype, CUSPARSE_SPMM_CSR_ALG2, workspace));
matC, dtype, HIPSPARSE_SPMM_CSR_ALG2, workspace));
device->FreeWorkspace(ctx, workspace);
CUSPARSE_CALL(cusparseDestroySpMat(matA));
CUSPARSE_CALL(cusparseDestroyDnMat(matB));
CUSPARSE_CALL(cusparseDestroyDnMat(matC));
CUSPARSE_CALL(hipsparseDestroySpMat(matA));
CUSPARSE_CALL(hipsparseDestroyDnMat(matB));
CUSPARSE_CALL(hipsparseDestroyDnMat(matC));
#else
cusparseMatDescr_t descr;
CUSPARSE_CALL(cusparseCreateMatDescr(&descr));
CUSPARSE_CALL(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL));
CUSPARSE_CALL(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO));
hipsparseMatDescr_t descr;
CUSPARSE_CALL(hipsparseCreateMatDescr(&descr));
CUSPARSE_CALL(hipsparseSetMatType(descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
CUSPARSE_CALL(hipsparseSetMatIndexBase(descr, HIPSPARSE_INDEX_BASE_ZERO));
CHECK_EQ(sizeof(IdType), sizeof(int32_t));
CUSPARSE_CALL(Xcsrmm2<DType>(
thr_entry->cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
CUSPARSE_OPERATION_TRANSPOSE, m, n, k, nnz, &alpha, descr,
thr_entry->cusparse_handle, HIPSPARSE_OPERATION_NON_TRANSPOSE,
HIPSPARSE_OPERATION_TRANSPOSE, m, n, k, nnz, &alpha, descr,
(valptr) ? valptr : A_data, static_cast<int32_t*>(csr.indptr->data),
static_cast<int32_t*>(csr.indices->data), B_data, n, &beta, C_data, m));
CUSPARSE_CALL(cusparseDestroyMatDescr(descr));
CUSPARSE_CALL(hipsparseDestroyMatDescr(descr));
#endif
if (valptr) device->FreeWorkspace(ctx, valptr);
}
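// Host reference for the operation the CusparseCsrmm2 routines above perform:
// C = A x B with A an m x k CSR matrix and B a row-major k x n dense matrix;
// a null A_data means every nonzero is treated as 1, matching the valptr fill
// above. The name and the std::vector parameters are illustrative.
#include <cstdint>
#include <vector>

template <typename DType, typename IdType>
void CsrmmHostSketch(
    int64_t m, int64_t n, const std::vector<IdType>& indptr,
    const std::vector<IdType>& indices, const DType* A_data, const DType* B,
    DType* C) {
  for (int64_t i = 0; i < m; ++i) {
    for (int64_t t = 0; t < n; ++t) C[i * n + t] = static_cast<DType>(0);
    for (IdType e = indptr[i]; e < indptr[i + 1]; ++e) {
      const IdType j = indices[e];
      const DType v = A_data ? A_data[e] : static_cast<DType>(1);
      for (int64_t t = 0; t < n; ++t) C[i * n + t] += v * B[j * n + t];
    }
  }
}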
......@@ -625,7 +627,7 @@ void SpMMCoo(
*/
#if BF16_ENABLED
if (std::is_same<DType, __half>::value ||
std::is_same<DType, __nv_bfloat16>::value)
std::is_same<DType, __hip_bfloat16>::value)
#else
if (std::is_same<DType, __half>::value)
#endif // BF16_ENABLED
......@@ -638,7 +640,7 @@ void SpMMCoo(
*efeat_data = efeat.Ptr<DType>();
DType* out_data = out.Ptr<DType>();
Idx *argu_data = argu.Ptr<Idx>(), *arge_data = arge.Ptr<Idx>();
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const int64_t N = coo.num_rows, M = coo.num_cols, E = coo.row->shape[0];
int64_t *ubcast_off = nullptr, *ebcast_off = nullptr;
......@@ -703,7 +705,7 @@ void SpMMCsr(
Idx* argu_data = argu.Ptr<Idx>();
Idx* arge_data = arge.Ptr<Idx>();
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int64_t *ubcast_off = nullptr, *ebcast_off = nullptr;
int64_t len = bcast.out_len, lhs_len = bcast.lhs_len, rhs_len = bcast.rhs_len;
......@@ -764,7 +766,7 @@ void SpMMCmpCsrHetero(
Idx* argu_data = argu.Ptr<Idx>();
Idx* arge_data = arge.Ptr<Idx>();
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int64_t *ubcast_off = nullptr, *ebcast_off = nullptr;
int64_t len = bcast.out_len, lhs_len = bcast.lhs_len, rhs_len = bcast.rhs_len;
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file array/cuda/spmm.cu
......@@ -6,9 +7,9 @@
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
#include "./functor.cuh"
#include "./ge_spmm.cuh"
#include "./spmm.cuh"
#include "functor.cuh"
#include "ge_spmm.cuh"
#include "spmm.cuh"
namespace dgl {
......@@ -109,11 +110,11 @@ template void SpMMCsr<kDGLCUDA, int64_t, __half>(
const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out,
std::vector<NDArray> out_aux);
#if BF16_ENABLED
template void SpMMCsr<kDGLCUDA, int32_t, __nv_bfloat16>(
template void SpMMCsr<kDGLCUDA, int32_t, __hip_bfloat16>(
const std::string& op, const std::string& reduce, const BcastOff& bcast,
const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out,
std::vector<NDArray> out_aux);
template void SpMMCsr<kDGLCUDA, int64_t, __nv_bfloat16>(
template void SpMMCsr<kDGLCUDA, int64_t, __hip_bfloat16>(
const std::string& op, const std::string& reduce, const BcastOff& bcast,
const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out,
std::vector<NDArray> out_aux);
......@@ -144,11 +145,11 @@ template void SpMMCoo<kDGLCUDA, int64_t, __half>(
const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out,
std::vector<NDArray> out_aux);
#if BF16_ENABLED
template void SpMMCoo<kDGLCUDA, int32_t, __nv_bfloat16>(
template void SpMMCoo<kDGLCUDA, int32_t, __hip_bfloat16>(
const std::string& op, const std::string& reduce, const BcastOff& bcast,
const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out,
std::vector<NDArray> out_aux);
template void SpMMCoo<kDGLCUDA, int64_t, __nv_bfloat16>(
template void SpMMCoo<kDGLCUDA, int64_t, __hip_bfloat16>(
const std::string& op, const std::string& reduce, const BcastOff& bcast,
const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out,
std::vector<NDArray> out_aux);
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2020 by Contributors
* @file array/cuda/spmm.cu
......@@ -6,9 +8,9 @@
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
#include "./functor.cuh"
#include "./ge_spmm.cuh"
#include "./spmm.cuh"
#include "functor.cuh"
#include "ge_spmm.cuh"
#include "spmm.cuh"
namespace dgl {
......@@ -37,7 +39,7 @@ void SpMMCsrHetero(
std::vector<DType*> trans_out((*vec_out).size(), NULL);
bool use_legacy_cusparsemm =
(CUDART_VERSION < 11000) && (reduce == "sum") &&
(DTKRT_VERSION < 11000) && (reduce == "sum") &&
// legacy cuSPARSE does not care about NNZ, hence the argument "false".
((op == "copy_lhs" && cusparse_available<DType, IdType>(false)) ||
(op == "mul" && is_scalar_efeat &&
......@@ -50,7 +52,7 @@ void SpMMCsrHetero(
if (m == 0) continue;
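// Allocate and zero a per-node-type scratch buffer; on the legacy (column-major)
// cuSPARSE path the result is written here and transposed into the final output later.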
DType* out = static_cast<DType*>(device->AllocWorkspace(
vec_csr[0].indptr->ctx, m * n * sizeof(DType)));
CUDA_CALL(cudaMemset(out, 0, m * n * sizeof(DType)));
CUDA_CALL(hipMemset(out, 0, m * n * sizeof(DType)));
trans_out[ntype] = out;
}
}
......@@ -111,7 +113,7 @@ void SpMMCsrHetero(
}
}
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
for (dgl_type_t etype = 0; etype < ufeat_ntids.size(); ++etype) {
const dgl_type_t src_id = ufeat_ntids[etype];
const dgl_type_t dst_id = out_ntids[etype];
......@@ -123,7 +125,7 @@ void SpMMCsrHetero(
cusparse_available<DType, IdType>(more_nnz)) { // cusparse
/* If the toolkit runtime version is below 11.0, put the output in trans_out for
 * later transposition */
DType* out = (CUDART_VERSION < 11000)
DType* out = (DTKRT_VERSION < 11000)
? trans_out[dst_id]
: static_cast<DType*>((*vec_out)[dst_id]->data);
CusparseCsrmm2Hetero<DType, IdType>(
......@@ -209,14 +211,14 @@ template void SpMMCsrHetero<kDGLCUDA, int64_t, __half>(
const std::vector<dgl_type_t>& ufeat_ntids,
const std::vector<dgl_type_t>& out_ntids);
#if BF16_ENABLED
template void SpMMCsrHetero<kDGLCUDA, int32_t, __nv_bfloat16>(
template void SpMMCsrHetero<kDGLCUDA, int32_t, __hip_bfloat16>(
const std::string& op, const std::string& reduce, const BcastOff& bcast,
const std::vector<CSRMatrix>& csr, const std::vector<NDArray>& ufeat,
const std::vector<NDArray>& efeat, std::vector<NDArray>* out,
std::vector<std::vector<NDArray>>* out_aux,
const std::vector<dgl_type_t>& ufeat_ntids,
const std::vector<dgl_type_t>& out_ntids);
template void SpMMCsrHetero<kDGLCUDA, int64_t, __nv_bfloat16>(
template void SpMMCsrHetero<kDGLCUDA, int64_t, __hip_bfloat16>(
const std::string& op, const std::string& reduce, const BcastOff& bcast,
const std::vector<CSRMatrix>& csr, const std::vector<NDArray>& ufeat,
const std::vector<NDArray>& efeat, std::vector<NDArray>* out,
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2020 by Contributors
* @file array/cuda/utils.h
......@@ -11,7 +13,7 @@
#include <dgl/runtime/ndarray.h>
#include <dmlc/logging.h>
#include <cub/cub.cuh>
#include <hipcub/hipcub.hpp>
#include <type_traits>
#include "../../runtime/cuda/cuda_common.h"
......@@ -90,7 +92,7 @@ inline int FindNumBlocks(int nblks, int max_nblks = -1) {
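// Read-only cached load: use __ldg when compiling device code, otherwise fall back
// to a plain dereference.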
template <typename T>
__device__ __forceinline__ T _ldg(T* addr) {
#if __CUDA_ARCH__ >= 350
#if __HIP_DEVICE_COMPILE__
return __ldg(addr);
#else
return *addr;
......@@ -126,7 +128,7 @@ __global__ void _FillKernel(DType* ptr, size_t length, DType val) {
/** @brief Fill the buffer of size `length` starting at `ptr` with `val`. */
template <typename DType>
void _Fill(DType* ptr, size_t length, DType val) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int nt = FindNumThreads(length);
int nb =
(length + nt - 1) / nt; // on x-axis, no need to worry about upperbound.
......@@ -185,8 +187,8 @@ template <typename IdType>
__global__ void _LinearSearchKernel(
const IdType* indptr, const IdType* indices, const IdType* data,
const IdType* row, const IdType* col, int64_t row_stride,
int64_t col_stride, int64_t length, const __nv_bfloat16* weights,
__nv_bfloat16 filler, __nv_bfloat16* out) {
int64_t col_stride, int64_t length, const __hip_bfloat16* weights,
__hip_bfloat16 filler, __hip_bfloat16* out) {
int tx = blockIdx.x * blockDim.x + threadIdx.x;
const int stride_x = gridDim.x * blockDim.x;
while (tx < length) {
......@@ -204,7 +206,7 @@ __global__ void _LinearSearchKernel(
} else {
// If the result is saved in bf16, it should be fine to convert it to
// float first
out[tx] = weights ? weights[v] : __nv_bfloat16(static_cast<float>(v));
out[tx] = weights ? weights[v] : __hip_bfloat16(static_cast<float>(v));
}
tx += stride_x;
}
......@@ -277,12 +279,12 @@ template <typename DType, typename BoolType>
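// Compacts `input` into `output`, keeping the elements whose mask entry is set.
// hipCUB's DeviceSelect::Flagged is called twice: the first call (null workspace)
// only queries the required temporary-storage size, the second performs the
// selection and writes the number of kept elements to `rst`.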
void MaskSelect(
runtime::DeviceAPI* device, const DGLContext& ctx, const DType* input,
const BoolType* mask, DType* output, int64_t n, int64_t* rst,
cudaStream_t stream) {
hipStream_t stream) {
size_t workspace_size = 0;
CUDA_CALL(cub::DeviceSelect::Flagged(
CUDA_CALL(hipcub::DeviceSelect::Flagged(
nullptr, workspace_size, input, mask, output, rst, n, stream));
void* workspace = device->AllocWorkspace(ctx, workspace_size);
CUDA_CALL(cub::DeviceSelect::Flagged(
CUDA_CALL(hipcub::DeviceSelect::Flagged(
workspace, workspace_size, input, mask, output, rst, n, stream));
device->FreeWorkspace(ctx, workspace);
}
......@@ -290,7 +292,7 @@ void MaskSelect(
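// Returns a pointer that device code can dereference; for pinned host arrays the
// device-visible alias is obtained via hipHostGetDevicePointer.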
inline void* GetDevicePointer(runtime::NDArray array) {
void* ptr = array->data;
if (array.IsPinned()) {
CUDA_CALL(cudaHostGetDevicePointer(&ptr, ptr, 0));
CUDA_CALL(hipHostGetDevicePointer(&ptr, ptr, 0));
}
return ptr;
}
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2020 by Contributors
* @file array/cuda/utils.cu
* @brief Utilities for CUDA kernels.
*/
#include <cub/cub.cuh>
#include <hipcub/hipcub.hpp>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
#include "utils.h"
namespace dgl {
namespace cuda {
......@@ -17,11 +19,11 @@ bool AllTrue(int8_t* flags, int64_t length, const DGLContext& ctx) {
int8_t* rst = static_cast<int8_t*>(device->AllocWorkspace(ctx, 1));
// Call hipCUB's min-reduction: the minimum is zero whenever any flag is unset.
size_t workspace_size = 0;
cudaStream_t stream = runtime::getCurrentCUDAStream();
CUDA_CALL(cub::DeviceReduce::Min(
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
CUDA_CALL(hipcub::DeviceReduce::Min(
nullptr, workspace_size, flags, rst, length, stream));
void* workspace = device->AllocWorkspace(ctx, workspace_size);
CUDA_CALL(cub::DeviceReduce::Min(
CUDA_CALL(hipcub::DeviceReduce::Min(
workspace, workspace_size, flags, rst, length, stream));
int8_t cpu_rst = GetCUDAScalar(device, ctx, rst);
device->FreeWorkspace(ctx, workspace);
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2021 by Contributors
* @file array/cpu/array_index_select_uvm.cuh
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2019-2022 by Contributors
* @file array/cuda/uvm/array_index_select_uvm.cu
......@@ -8,7 +10,7 @@
#include "../../../runtime/cuda/cuda_common.h"
#include "../array_index_select.cuh"
#include "../utils.h"
#include "./array_index_select_uvm.cuh"
#include "array_index_select_uvm.cuh"
namespace dgl {
using runtime::NDArray;
......@@ -17,7 +19,7 @@ namespace impl {
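// Gathers the rows selected by a GPU index array from a host-resident (UVM/pinned)
// feature array.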
template <typename DType, typename IdType>
NDArray IndexSelectCPUFromGPU(NDArray array, IdArray index) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const int64_t arr_len = array->shape[0];
const int64_t len = index->shape[0];
int64_t num_feat = 1;
......@@ -78,7 +80,7 @@ template NDArray IndexSelectCPUFromGPU<int64_t, int64_t>(NDArray, IdArray);
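// Scatters GPU-resident rows into a host-resident destination according to `index`;
// the counterpart of IndexSelectCPUFromGPU in the UVM path.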
template <typename DType, typename IdType>
void IndexScatterGPUToCPU(NDArray dest, IdArray index, NDArray source) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const DType* source_data = static_cast<DType*>(source->data);
const IdType* idx_data = static_cast<IdType*>(index->data);
const int64_t arr_len = dest->shape[0];
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2021 by Contributors
* @file array/filter.cc
* @brief Object for selecting items in a set, or selecting items not in a set.
*/
#include "./filter.h"
#include "filter.h"
#include <dgl/packed_func_ext.h>
#include <dgl/runtime/packed_func.h>
......@@ -23,7 +24,7 @@ DGL_REGISTER_GLOBAL("utils.filter._CAPI_DGLFilterCreateFromSet")
IdArray array = args[0];
auto ctx = array->ctx;
// TODO(nv-dlasalle): Implement CPU version.
if (ctx.device_type == kDGLCUDA) {
if (ctx.device_type == kDGLCUDA || ctx.device_type == kDGLROCM) {
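// ROCm devices take the same HIP-compiled code path as CUDA here.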
#ifdef DGL_USE_CUDA
ATEN_ID_TYPE_SWITCH(array->dtype, IdType, {
*rv = CreateSetFilter<kDGLCUDA, IdType>(array);
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file array/kernel.cc
......@@ -7,7 +8,7 @@
#include <dgl/packed_func_ext.h>
#include "../c_api_common.h"
#include "./check.h"
#include "check.h"
#include "kernel_decl.h"
using namespace dgl::runtime;
......