Commit 6ac701f8 authored by sangwzh

update src and graphbolt code

parent 1547bd93
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
  * Copyright (c) 2020 by Contributors
  * @file graph/transform/cuda/knn.cu
  * @brief k-nearest-neighbor (KNN) implementation (cuda)
  */
-#include <curand_kernel.h>
+#include <hiprand/hiprand_kernel.h>
 #include <dgl/array.h>
 #include <dgl/random.h>
 #include <dgl/runtime/device_api.h>
 #include <algorithm>
-#include <cub/cub.cuh>  // NOLINT
+#include <hipcub/hipcub.hpp>  // NOLINT
 #include <limits>
 #include <string>
 #include <type_traits>
@@ -467,7 +469,7 @@ void BruteForceKNNCuda(
     const NDArray& data_points, const IdArray& data_offsets,
     const NDArray& query_points, const IdArray& query_offsets, const int k,
     IdArray result) {
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   const auto& ctx = data_points->ctx;
   auto device = runtime::DeviceAPI::Get(ctx);
   const int64_t batch_size = data_offsets->shape[0] - 1;
@@ -512,7 +514,7 @@ void BruteForceKNNSharedCuda(
     const NDArray& data_points, const IdArray& data_offsets,
     const NDArray& query_points, const IdArray& query_offsets, const int k,
     IdArray result) {
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   const auto& ctx = data_points->ctx;
   auto device = runtime::DeviceAPI::Get(ctx);
   const int64_t batch_size = data_offsets->shape[0] - 1;
@@ -528,8 +530,8 @@ void BruteForceKNNSharedCuda(
   // get max shared memory per block in bytes
   // determine block size according to this value
   int max_sharedmem_per_block = 0;
-  CUDA_CALL(cudaDeviceGetAttribute(
-      &max_sharedmem_per_block, cudaDevAttrMaxSharedMemoryPerBlock,
+  CUDA_CALL(hipDeviceGetAttribute(
+      &max_sharedmem_per_block, hipDeviceAttributeMaxSharedMemoryPerBlock,
       ctx.device_id));
   const int64_t single_shared_mem = static_cast<int64_t>(Pow2Align<size_t>(
       (k + 2 * feature_size) * sizeof(FloatType) + k * sizeof(IdType),
@@ -552,17 +554,17 @@ void BruteForceKNNSharedCuda(
       GetNumBlockPerSegment, temp_num_blocks, temp_block_size, 0, stream,
       query_offsets_data, num_block_per_segment, batch_size, block_size);
   size_t prefix_temp_size = 0;
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
       nullptr, prefix_temp_size, num_block_per_segment, num_block_prefixsum,
       batch_size, stream));
   void* prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size);
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
       prefix_temp, prefix_temp_size, num_block_per_segment, num_block_prefixsum,
       batch_size, stream));
   device->FreeWorkspace(ctx, prefix_temp);
   // wait for results
-  CUDA_CALL(cudaStreamSynchronize(stream));
+  CUDA_CALL(hipStreamSynchronize(stream));
   int64_t num_blocks = 0, final_elem = 0,
           copyoffset = (batch_size - 1) * sizeof(IdType);
@@ -603,10 +605,10 @@ void BruteForceKNNSharedCuda(
 /** @brief Setup rng state for nn-descent */
 __global__ void SetupRngKernel(
-    curandState* states, const uint64_t seed, const size_t n) {
+    hiprandState_t* states, const uint64_t seed, const size_t n) {
   size_t id = blockIdx.x * blockDim.x + threadIdx.x;
   if (id < n) {
-    curand_init(seed, id, 0, states + id);
+    hiprand_init(seed, id, 0, states + id);
   }
 }
@@ -622,8 +624,8 @@ __global__ void RandomInitNeighborsKernel(
   const IdType point_idx = blockIdx.x * blockDim.x + threadIdx.x;
   IdType batch_idx = 0;
   if (point_idx >= offsets[batch_size]) return;
-  curandState state;
-  curand_init(seed, point_idx, 0, &state);
+  hiprandState_t state;
+  hiprand_init(seed, point_idx, 0, &state);
   // find the segment location in the input batch
   for (IdType b = 0; b < batch_size + 1; ++b) {
@@ -646,7 +648,7 @@ __global__ void RandomInitNeighborsKernel(
     current_central_nodes[i] = point_idx;
   }
   for (IdType i = k; i < segment_size; ++i) {
-    const IdType j = static_cast<IdType>(curand(&state) % (i + 1));
+    const IdType j = static_cast<IdType>(hiprand(&state) % (i + 1));
     if (j < k) current_neighbors[j] = i + segment_start;
   }
@@ -674,8 +676,8 @@ __global__ void FindCandidatesKernel(
   const IdType point_idx = blockIdx.x * blockDim.x + threadIdx.x;
   IdType batch_idx = 0;
   if (point_idx >= offsets[batch_size]) return;
-  curandState state;
-  curand_init(seed, point_idx, 0, &state);
+  hiprandState_t state;
+  hiprand_init(seed, point_idx, 0, &state);
   // find the segment location in the input batch
   for (IdType b = 0; b < batch_size + 1; ++b) {
@@ -711,7 +713,7 @@ __global__ void FindCandidatesKernel(
       if (curr_num < num_candidates) {
         candidate_data[curr_num] = candidate;
       } else {
-        IdType pos = static_cast<IdType>(curand(&state) % (curr_num + 1));
+        IdType pos = static_cast<IdType>(hiprand(&state) % (curr_num + 1));
         if (pos < num_candidates) candidate_data[pos] = candidate;
       }
       ++candidate_array[0];
@@ -732,7 +734,7 @@ __global__ void FindCandidatesKernel(
       if (curr_num < num_candidates) {
         candidate_data[curr_num] = reverse_candidate;
       } else {
-        IdType pos = static_cast<IdType>(curand(&state) % (curr_num + 1));
+        IdType pos = static_cast<IdType>(hiprand(&state) % (curr_num + 1));
         if (pos < num_candidates) candidate_data[pos] = reverse_candidate;
       }
       ++candidate_array[0];
@@ -873,7 +875,7 @@ template <DGLDeviceType XPU, typename FloatType, typename IdType>
 void NNDescent(
     const NDArray& points, const IdArray& offsets, IdArray result, const int k,
     const int num_iters, const int num_candidates, const double delta) {
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   const auto& ctx = points->ctx;
   auto device = runtime::DeviceAPI::Get(ctx);
   const int64_t num_nodes = points->shape[0];
@@ -887,7 +889,7 @@ void NNDescent(
   uint64_t seed;
   int warp_size = 0;
   CUDA_CALL(
-      cudaDeviceGetAttribute(&warp_size, cudaDevAttrWarpSize, ctx.device_id));
+      hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, ctx.device_id));
   // We don't need large block sizes, since there's not much inter-thread
   // communication
   int64_t block_size = warp_size;
@@ -911,7 +913,7 @@ void NNDescent(
   IdType* total_num_updates_d =
       static_cast<IdType*>(device->AllocWorkspace(ctx, sizeof(IdType)));
-  CUDA_CALL(cub::DeviceReduce::Sum(
+  CUDA_CALL(hipcub::DeviceReduce::Sum(
       nullptr, sum_temp_size, num_updates, total_num_updates_d, num_nodes,
       stream));
   IdType* sum_temp_storage =
@@ -942,7 +944,7 @@ void NNDescent(
         feature_size);
     total_num_updates = 0;
-    CUDA_CALL(cub::DeviceReduce::Sum(
+    CUDA_CALL(hipcub::DeviceReduce::Sum(
         sum_temp_storage, sum_temp_size, num_updates, total_num_updates_d,
         num_nodes, stream));
     device->CopyDataFromTo(
...
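The hunks above swap the cuRAND device API for hipRAND (curandState -> hiprandState_t, curand_init -> hiprand_init, curand -> hiprand). A minimal standalone sketch of the same per-thread seeding and reservoir-sampling pattern that RandomInitNeighborsKernel uses, with made-up file and kernel names (not part of this commit):

// Hypothetical illustration only: per-thread hipRAND state plus reservoir
// sampling, mirroring the pattern in RandomInitNeighborsKernel after hipify.
#include <hip/hip_runtime.h>
#include <hiprand/hiprand_kernel.h>
#include <cstdio>

__global__ void ReservoirSampleKernel(
    int* out, const int k, const int n, const uint64_t seed) {
  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid > 0) return;  // one thread is enough for the illustration
  hiprandState_t state;
  hiprand_init(seed, tid, 0, &state);  // same signature as curand_init
  for (int i = 0; i < k; ++i) out[i] = i;
  for (int i = k; i < n; ++i) {
    const int j = static_cast<int>(hiprand(&state) % (i + 1));
    if (j < k) out[j] = i;  // keep each element with probability k/(i+1)
  }
}

int main() {
  const int k = 4, n = 100;
  int* d_out = nullptr;
  hipMalloc(&d_out, k * sizeof(int));
  hipLaunchKernelGGL(ReservoirSampleKernel, dim3(1), dim3(32), 0, 0,
                     d_out, k, n, 42ULL);
  int h_out[k];
  hipMemcpy(h_out, d_out, k * sizeof(int), hipMemcpyDeviceToHost);
  for (int i = 0; i < k; ++i) printf("%d ", h_out[i]);
  printf("\n");
  hipFree(d_out);
  return 0;
}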
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2018 by Contributors
  * @file graph/traversal.cc
  * @brief Graph traversal implementation
  */
-#include "./traversal.h"
+#include "traversal.h"
 #include <dgl/packed_func_ext.h>
...
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2019 by Contributors
  * @file graph/unit_graph.cc
  * @brief UnitGraph graph implementation
  */
-#include "./unit_graph.h"
+#include "unit_graph.h"
 #include <dgl/array.h>
 #include <dgl/base_heterograph.h>
@@ -11,7 +12,7 @@
 #include <dgl/lazy.h>
 #include "../c_api_common.h"
-#include "./serialize/dglstream.h"
+#include "serialize/dglstream.h"
 namespace dgl {
...
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
  * Copyright (c) 2021 by Contributors
  * @file ndarray_partition.h
@@ -6,7 +8,7 @@
 #include <dgl/runtime/device_api.h>
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
 #include "../../runtime/cuda/cuda_common.h"
 #include "../../runtime/workspace.h"
@@ -239,7 +241,7 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRemainder(
   const auto& ctx = in_idx->ctx;
   auto device = DeviceAPI::Get(ctx);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   const int64_t num_in = in_idx->shape[0];
@@ -263,7 +265,7 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRemainder(
   }
   const int64_t part_bits =
-      static_cast<int64_t>(std::ceil(std::log2(num_parts)));
+      static_cast<int64_t>(::ceil(std::log2(num_parts)));
   // First, generate a mapping of indexes to processors
   Workspace<IdType> proc_id_in(device, ctx, num_in);
@@ -295,13 +297,13 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRemainder(
   IdArray perm_in = aten::Range(0, num_in, sizeof(IdType) * 8, ctx);
   size_t sort_workspace_size;
-  CUDA_CALL(cub::DeviceRadixSort::SortPairs(
+  CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
       nullptr, sort_workspace_size, proc_id_in.get(), proc_id_out.get(),
       static_cast<IdType*>(perm_in->data), perm_out, num_in, 0, part_bits,
      stream));
   Workspace<void> sort_workspace(device, ctx, sort_workspace_size);
-  CUDA_CALL(cub::DeviceRadixSort::SortPairs(
+  CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
      sort_workspace.get(), sort_workspace_size, proc_id_in.get(),
      proc_id_out.get(), static_cast<IdType*>(perm_in->data), perm_out,
      num_in, 0, part_bits, stream));
@@ -317,7 +319,7 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRemainder(
   static_assert(
       sizeof(AtomicCount) == sizeof(*out_counts),
       "AtomicCount must be the same width as int64_t for atomicAdd "
-      "in cub::DeviceHistogram::HistogramEven() to work");
+      "in hipcub::DeviceHistogram::HistogramEven() to work");
   // TODO(dlasalle): Once https://github.com/NVIDIA/cub/pull/287 is merged,
   // add a compile time check against the cub version to allow
@@ -327,14 +329,14 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRemainder(
       "value of int.";
   size_t hist_workspace_size;
-  CUDA_CALL(cub::DeviceHistogram::HistogramEven(
+  CUDA_CALL(hipcub::DeviceHistogram::HistogramEven(
      nullptr, hist_workspace_size, proc_id_out.get(),
      reinterpret_cast<AtomicCount*>(out_counts), num_parts + 1,
      static_cast<IdType>(0), static_cast<IdType>(num_parts),
      static_cast<int>(num_in), stream));
   Workspace<void> hist_workspace(device, ctx, hist_workspace_size);
-  CUDA_CALL(cub::DeviceHistogram::HistogramEven(
+  CUDA_CALL(hipcub::DeviceHistogram::HistogramEven(
      hist_workspace.get(), hist_workspace_size, proc_id_out.get(),
      reinterpret_cast<AtomicCount*>(out_counts), num_parts + 1,
      static_cast<IdType>(0), static_cast<IdType>(num_parts),
@@ -352,7 +354,7 @@ template std::pair<IdArray, IdArray> GeneratePermutationFromRemainder<
 template <DGLDeviceType XPU, typename IdType>
 IdArray MapToLocalFromRemainder(const int num_parts, IdArray global_idx) {
   const auto& ctx = global_idx->ctx;
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   if (num_parts > 1) {
     IdArray local_idx =
@@ -387,7 +389,7 @@ IdArray MapToGlobalFromRemainder(
         << num_parts;
   const auto& ctx = local_idx->ctx;
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   if (num_parts > 1) {
     IdArray global_idx =
@@ -423,7 +425,7 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRange(
   const auto& ctx = in_idx->ctx;
   auto device = DeviceAPI::Get(ctx);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   const int64_t num_in = in_idx->shape[0];
@@ -447,7 +449,7 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRange(
   }
   const int64_t part_bits =
-      static_cast<int64_t>(std::ceil(std::log2(num_parts)));
+      static_cast<int64_t>(::ceil(std::log2(num_parts)));
   // First, generate a mapping of indexes to processors
   Workspace<IdType> proc_id_in(device, ctx, num_in);
@@ -470,13 +472,13 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRange(
   IdArray perm_in = aten::Range(0, num_in, sizeof(IdType) * 8, ctx);
   size_t sort_workspace_size;
-  CUDA_CALL(cub::DeviceRadixSort::SortPairs(
+  CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
      nullptr, sort_workspace_size, proc_id_in.get(), proc_id_out.get(),
      static_cast<IdType*>(perm_in->data), perm_out, num_in, 0, part_bits,
      stream));
   Workspace<void> sort_workspace(device, ctx, sort_workspace_size);
-  CUDA_CALL(cub::DeviceRadixSort::SortPairs(
+  CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
      sort_workspace.get(), sort_workspace_size, proc_id_in.get(),
      proc_id_out.get(), static_cast<IdType*>(perm_in->data), perm_out,
      num_in, 0, part_bits, stream));
@@ -492,7 +494,7 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRange(
   static_assert(
      sizeof(AtomicCount) == sizeof(*out_counts),
      "AtomicCount must be the same width as int64_t for atomicAdd "
-      "in cub::DeviceHistogram::HistogramEven() to work");
+      "in hipcub::DeviceHistogram::HistogramEven() to work");
   // TODO(dlasalle): Once https://github.com/NVIDIA/cub/pull/287 is merged,
   // add a compile time check against the cub version to allow
@@ -502,14 +504,14 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRange(
      "value of int.";
   size_t hist_workspace_size;
-  CUDA_CALL(cub::DeviceHistogram::HistogramEven(
+  CUDA_CALL(hipcub::DeviceHistogram::HistogramEven(
      nullptr, hist_workspace_size, proc_id_out.get(),
      reinterpret_cast<AtomicCount*>(out_counts), num_parts + 1,
      static_cast<IdType>(0), static_cast<IdType>(num_parts),
      static_cast<int>(num_in), stream));
   Workspace<void> hist_workspace(device, ctx, hist_workspace_size);
-  CUDA_CALL(cub::DeviceHistogram::HistogramEven(
+  CUDA_CALL(hipcub::DeviceHistogram::HistogramEven(
      hist_workspace.get(), hist_workspace_size, proc_id_out.get(),
      reinterpret_cast<AtomicCount*>(out_counts), num_parts + 1,
      static_cast<IdType>(0), static_cast<IdType>(num_parts),
@@ -536,7 +538,7 @@ template <DGLDeviceType XPU, typename IdType, typename RangeType>
 IdArray MapToLocalFromRange(
     const int num_parts, IdArray range, IdArray global_idx) {
   const auto& ctx = global_idx->ctx;
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   if (num_parts > 1 && global_idx->shape[0] > 0) {
     IdArray local_idx =
@@ -576,7 +578,7 @@ IdArray MapToGlobalFromRange(
         << num_parts;
   const auto& ctx = local_idx->ctx;
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   if (num_parts > 1 && local_idx->shape[0] > 0) {
     IdArray global_idx =
...
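The ExclusiveSum, SortPairs, and HistogramEven calls above all follow the same hipcub convention: invoke the routine once with a null workspace pointer to obtain the temporary-storage size, allocate that buffer, then invoke it again to do the real work. A minimal standalone sketch of that two-phase pattern (hypothetical example, not taken from the commit):

// Hypothetical illustration of the hipcub two-phase temp-storage pattern.
#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>
#include <cstdio>

int main() {
  const int n = 8;
  int h_in[n] = {1, 2, 3, 4, 5, 6, 7, 8};
  int *d_in = nullptr, *d_out = nullptr;
  hipMalloc(&d_in, n * sizeof(int));
  hipMalloc(&d_out, n * sizeof(int));
  hipMemcpy(d_in, h_in, n * sizeof(int), hipMemcpyHostToDevice);

  // Phase 1: size query -- no work is done while d_temp is nullptr.
  void* d_temp = nullptr;
  size_t temp_bytes = 0;
  hipcub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, d_in, d_out, n);

  // Phase 2: allocate the workspace and perform the actual scan.
  hipMalloc(&d_temp, temp_bytes);
  hipcub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, d_in, d_out, n);

  int h_out[n];
  hipMemcpy(h_out, d_out, n * sizeof(int), hipMemcpyDeviceToHost);
  for (int i = 0; i < n; ++i) printf("%d ", h_out[i]);  // 0 1 3 6 10 15 21 28
  printf("\n");
  hipFree(d_temp);
  hipFree(d_in);
  hipFree(d_out);
  return 0;
}

In the DGL sources the Phase 2 buffer comes from device->AllocWorkspace / Workspace<void> rather than a raw hipMalloc, but the call sequence is the same.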
@@ -39,7 +39,7 @@ class RemainderPartition : public NDArrayPartition {
       IdArray in_idx) const override {
 #ifdef DGL_USE_CUDA
     auto ctx = in_idx->ctx;
-    if (ctx.device_type == kDGLCUDA) {
+    if (ctx.device_type == kDGLCUDA || ctx.device_type == kDGLROCM) {
       ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, {
         return impl::GeneratePermutationFromRemainder<kDGLCUDA, IdType>(
             ArraySize(), NumParts(), in_idx);
@@ -56,7 +56,7 @@ class RemainderPartition : public NDArrayPartition {
   IdArray MapToLocal(IdArray in_idx) const override {
 #ifdef DGL_USE_CUDA
     auto ctx = in_idx->ctx;
-    if (ctx.device_type == kDGLCUDA) {
+    if (ctx.device_type == kDGLCUDA||ctx.device_type == kDGLROCM) {
       ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, {
         return impl::MapToLocalFromRemainder<kDGLCUDA, IdType>(
             NumParts(), in_idx);
@@ -73,7 +73,7 @@ class RemainderPartition : public NDArrayPartition {
   IdArray MapToGlobal(IdArray in_idx, const int part_id) const override {
 #ifdef DGL_USE_CUDA
     auto ctx = in_idx->ctx;
-    if (ctx.device_type == kDGLCUDA) {
+    if (ctx.device_type == kDGLCUDA||ctx.device_type == kDGLROCM) {
       ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, {
         return impl::MapToGlobalFromRemainder<kDGLCUDA, IdType>(
             NumParts(), in_idx, part_id);
@@ -118,7 +118,7 @@ class RangePartition : public NDArrayPartition {
       IdArray in_idx) const override {
 #ifdef DGL_USE_CUDA
     auto ctx = in_idx->ctx;
-    if (ctx.device_type == kDGLCUDA) {
+    if (ctx.device_type == kDGLCUDA||ctx.device_type == kDGLROCM) {
       if (ctx.device_type != range_->ctx.device_type ||
           ctx.device_id != range_->ctx.device_id) {
         LOG(FATAL) << "The range for the NDArrayPartition and the input "
@@ -144,7 +144,7 @@ class RangePartition : public NDArrayPartition {
   IdArray MapToLocal(IdArray in_idx) const override {
 #ifdef DGL_USE_CUDA
     auto ctx = in_idx->ctx;
-    if (ctx.device_type == kDGLCUDA) {
+    if (ctx.device_type == kDGLCUDA||ctx.device_type == kDGLROCM) {
       ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, {
         ATEN_ID_TYPE_SWITCH(range_->dtype, RangeType, {
           return impl::MapToLocalFromRange<kDGLCUDA, IdType, RangeType>(
@@ -163,7 +163,7 @@ class RangePartition : public NDArrayPartition {
   IdArray MapToGlobal(IdArray in_idx, const int part_id) const override {
 #ifdef DGL_USE_CUDA
     auto ctx = in_idx->ctx;
-    if (ctx.device_type == kDGLCUDA) {
+    if (ctx.device_type == kDGLCUDA||ctx.device_type == kDGLROCM) {
       ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, {
         ATEN_ID_TYPE_SWITCH(range_->dtype, RangeType, {
           return impl::MapToGlobalFromRange<kDGLCUDA, IdType, RangeType>(
...
+// !!! This is a file automatically generated by hipify!!!
 /*!
  * Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
  * All rights reserved.
@@ -24,13 +25,13 @@
 #include <cmath>
-#ifdef __NVCC__
-#include <curand_kernel.h>
+#ifdef __HIPCC__
+#include <hiprand/hiprand_kernel.h>
 #else
 #include <random>
 #include "pcg_random.hpp"
-#endif  // __CUDA_ARCH__
+#endif  // __HIP_DEVICE_COMPILE__
 #ifndef M_SQRT1_2
 #define M_SQRT1_2 0.707106781186547524401
@@ -58,24 +59,24 @@ class continuous_seed {
       c[1] = std::sin(pi * r / 2);
     }
-#ifdef __CUDA_ARCH__
+#ifdef __HIP_DEVICE_COMPILE__
   __device__ inline float uniform(const uint64_t t) const {
     const uint64_t kCurandSeed = 999961;  // Could be any random number.
-    curandStatePhilox4_32_10_t rng;
-    curand_init(kCurandSeed, s[0], t, &rng);
+    hiprandStatePhilox4_32_10_t rng;
+    hiprand_init(kCurandSeed, s[0], t, &rng);
     float rnd;
     if (s[0] != s[1]) {
-      rnd = c[0] * curand_normal(&rng);
-      curand_init(kCurandSeed, s[1], t, &rng);
-      rnd += c[1] * curand_normal(&rng);
+      rnd = c[0] * hiprand_normal(&rng);
+      hiprand_init(kCurandSeed, s[1], t, &rng);
+      rnd += c[1] * hiprand_normal(&rng);
       rnd = normcdff(rnd);
     } else {
-      rnd = curand_uniform(&rng);
+      rnd = hiprand_uniform(&rng);
     }
     return rnd;
   }
 #else
-  inline float uniform(const uint64_t t) const {
+  __host__ inline float uniform(const uint64_t t) const {
     pcg32 ng0(s[0], t);
     float rnd;
     if (s[0] != s[1]) {
@@ -91,7 +92,7 @@ class continuous_seed {
     }
     return rnd;
   }
-#endif  // __CUDA_ARCH__
+#endif  // __HIP_DEVICE_COMPILE__
 };
 }  // namespace random
...
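The continuous_seed hunk above keys its two uniform() bodies on __HIP_DEVICE_COMPILE__, which hipcc defines only during the device compilation pass, while __HIPCC__ is defined for the whole translation unit. A rough sketch of that split, with hypothetical function and kernel names and assuming a ROCm toolchain (not code from the commit):

// Hypothetical illustration: one function, device path uses hipRAND,
// host path uses <random>; the preprocessor picks per compilation pass.
#include <hip/hip_runtime.h>
#ifdef __HIPCC__
#include <hiprand/hiprand_kernel.h>
#endif
#include <random>
#include <cstdio>

__host__ __device__ inline float uniform_once(uint64_t seed, uint64_t t) {
#ifdef __HIP_DEVICE_COMPILE__
  hiprandStatePhilox4_32_10_t rng;
  hiprand_init(seed, 0, t, &rng);
  return hiprand_uniform(&rng);  // device draw in (0, 1]
#else
  std::mt19937_64 rng(seed + t);
  return std::uniform_real_distribution<float>(0.f, 1.f)(rng);
#endif
}

__global__ void SampleKernel(float* out, uint64_t seed) {
  out[threadIdx.x] = uniform_once(seed, threadIdx.x);
}

int main() {
  printf("host draw: %f\n", uniform_once(999961, 0));
  float* d_out = nullptr;
  hipMalloc(&d_out, 4 * sizeof(float));
  hipLaunchKernelGGL(SampleKernel, dim3(1), dim3(4), 0, 0, d_out, 999961ULL);
  float h_out[4];
  hipMemcpy(h_out, d_out, 4 * sizeof(float), hipMemcpyDeviceToHost);
  for (int i = 0; i < 4; ++i) printf("device draw %d: %f\n", i, h_out[i]);
  hipFree(d_out);
  return 0;
}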
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2020 by Contributors
  * @file rpc/rpc.cc
  * @brief Implementation of RPC utilities used by both server and client sides.
  */
 #if defined(__linux__)
-#include "./rpc.h"
+#include "rpc.h"
 #include <dgl/array.h>
 #include <dgl/packed_func_ext.h>
...
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2020 by Contributors
  * @file rpc/rpc.h
@@ -19,9 +20,9 @@
 #include <unordered_map>
 #include <vector>
-#include "./network/common.h"
-#include "./rpc_msg.h"
-#include "./server_state.h"
+#include "network/common.h"
+#include "rpc_msg.h"
+#include "server_state.h"
 #include "network/socket_communicator.h"
 namespace dgl {
...
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2016-2022 by Contributors
  * @file c_runtime_api.cc
@@ -32,6 +33,8 @@ inline std::string DeviceName(int type) {
       return "cpu";
     case kDGLCUDA:
       return "cuda";
+    case kDGLROCM:
+      return "cuda";
     // add more device here once supported
     default:
       LOG(FATAL) << "unknown type =" << type;
@@ -122,13 +125,13 @@ void DeviceAPI::SyncStreamFromTo(
 }
 bool DeviceAPI::PinData(void* ptr, size_t nbytes) {
-  LOG(FATAL) << "Device does not support cudaHostRegister api.";
+  LOG(FATAL) << "Device does not support hipHostRegister api.";
   return false;
 }
 void* DeviceAPI::AllocPinnedDataSpace(
     size_t nbytes, void** ctx, void** deleter) {
-  LOG(FATAL) << "Device does not support cudaHostAlloc api.";
+  LOG(FATAL) << "Device does not support hipHostMalloc api.";
   return nullptr;
 }
@@ -137,7 +140,7 @@ void DeviceAPI::FreePinnedDataSpace(void** deleter) {
 }
 void DeviceAPI::UnpinData(void* ptr) {
-  LOG(FATAL) << "Device does not support cudaHostUnregister api.";
+  LOG(FATAL) << "Device does not support hipHostUnregister api.";
 }
 }  // namespace runtime
 }  // namespace dgl
...
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2017 by Contributors
  * @file cuda_common.h
@@ -6,10 +7,10 @@
 #ifndef DGL_RUNTIME_CUDA_CUDA_COMMON_H_
 #define DGL_RUNTIME_CUDA_CUDA_COMMON_H_
-#include <cublas_v2.h>
-#include <cuda_runtime.h>
-#include <curand.h>
-#include <cusparse.h>
+#include <hipblas/hipblas.h>
+#include <hip/hip_runtime.h>
+#include <hiprand/hiprand.h>
+#include <hipsparse/hipsparse.h>
 #include <dgl/runtime/packed_func.h>
 #include <memory>
@@ -25,8 +26,8 @@ namespace runtime {
   DGL's memory pool and the current cuda stream
     runtime::CUDAWorkspaceAllocator allocator(ctx);
-    const auto stream = runtime::getCurrentCUDAStream();
-    const auto exec_policy = thrust::cuda::par_nosync(allocator).on(stream);
+    const auto stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
+    const auto exec_policy = thrust::hip::par_nosync(allocator).on(stream);
   now, one can pass exec_policy to thrust functions
@@ -79,112 +80,112 @@ inline bool is_zero<dim3>(dim3 size) {
 #define CUDA_DRIVER_CALL(x) \
   { \
-    CUresult result = x; \
-    if (result != CUDA_SUCCESS && result != CUDA_ERROR_DEINITIALIZED) { \
+    hipError_t result = x; \
+    if (result != hipSuccess && result != hipErrorDeinitialized) { \
       const char* msg; \
-      cuGetErrorName(result, &msg); \
+      hipGetErrorName(result, &msg); \
       LOG(FATAL) << "CUDAError: " #x " failed with error: " << msg; \
     } \
   }
 #define CUDA_CALL(func) \
   { \
-    cudaError_t e = (func); \
-    CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \
-        << "CUDA: " << cudaGetErrorString(e); \
+    hipError_t e = (func); \
+    CHECK(e == hipSuccess || e == hipErrorDeinitialized) \
+        << "CUDA: " << hipGetErrorString(e); \
  }
 #define CUDA_KERNEL_CALL(kernel, nblks, nthrs, shmem, stream, ...) \
   { \
     if (!dgl::runtime::is_zero((nblks)) && !dgl::runtime::is_zero((nthrs))) { \
-      (kernel)<<<(nblks), (nthrs), (shmem), (stream)>>>(__VA_ARGS__); \
-      cudaError_t e = cudaGetLastError(); \
-      CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \
-          << "CUDA kernel launch error: " << cudaGetErrorString(e); \
+      hipLaunchKernelGGL(( (kernel)), dim3((nblks)), dim3((nthrs)), (shmem), (stream), __VA_ARGS__); \
+      hipError_t e = hipGetLastError(); \
+      CHECK(e == hipSuccess || e == hipErrorDeinitialized) \
+          << "CUDA kernel launch error: " << hipGetErrorString(e); \
     } \
   }
 #define CUSPARSE_CALL(func) \
   { \
-    cusparseStatus_t e = (func); \
-    CHECK(e == CUSPARSE_STATUS_SUCCESS) << "CUSPARSE ERROR: " << e; \
+    hipsparseStatus_t e = (func); \
+    CHECK(e == HIPSPARSE_STATUS_SUCCESS) << "CUSPARSE ERROR: " << e; \
  }
 #define CUBLAS_CALL(func) \
   { \
-    cublasStatus_t e = (func); \
-    CHECK(e == CUBLAS_STATUS_SUCCESS) << "CUBLAS ERROR: " << e; \
+    hipblasStatus_t e = (func); \
+    CHECK(e == HIPBLAS_STATUS_SUCCESS) << "CUBLAS ERROR: " << e; \
  }
 #define CURAND_CALL(func) \
   { \
-    curandStatus_t e = (func); \
-    CHECK(e == CURAND_STATUS_SUCCESS) \
+    hiprandStatus_t e = (func); \
+    CHECK(e == HIPRAND_STATUS_SUCCESS) \
        << "CURAND Error: " << dgl::runtime::curandGetErrorString(e) << " at " \
        << __FILE__ << ":" << __LINE__; \
  }
-inline const char* curandGetErrorString(curandStatus_t error) {
+inline const char* curandGetErrorString(hiprandStatus_t error) {
   switch (error) {
-    case CURAND_STATUS_SUCCESS:
-      return "CURAND_STATUS_SUCCESS";
-    case CURAND_STATUS_VERSION_MISMATCH:
-      return "CURAND_STATUS_VERSION_MISMATCH";
-    case CURAND_STATUS_NOT_INITIALIZED:
-      return "CURAND_STATUS_NOT_INITIALIZED";
-    case CURAND_STATUS_ALLOCATION_FAILED:
-      return "CURAND_STATUS_ALLOCATION_FAILED";
-    case CURAND_STATUS_TYPE_ERROR:
-      return "CURAND_STATUS_TYPE_ERROR";
-    case CURAND_STATUS_OUT_OF_RANGE:
-      return "CURAND_STATUS_OUT_OF_RANGE";
-    case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
-      return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
-    case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
-      return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
-    case CURAND_STATUS_LAUNCH_FAILURE:
-      return "CURAND_STATUS_LAUNCH_FAILURE";
-    case CURAND_STATUS_PREEXISTING_FAILURE:
-      return "CURAND_STATUS_PREEXISTING_FAILURE";
-    case CURAND_STATUS_INITIALIZATION_FAILED:
-      return "CURAND_STATUS_INITIALIZATION_FAILED";
-    case CURAND_STATUS_ARCH_MISMATCH:
-      return "CURAND_STATUS_ARCH_MISMATCH";
-    case CURAND_STATUS_INTERNAL_ERROR:
-      return "CURAND_STATUS_INTERNAL_ERROR";
+    case HIPRAND_STATUS_SUCCESS:
+      return "HIPRAND_STATUS_SUCCESS";
+    case HIPRAND_STATUS_VERSION_MISMATCH:
+      return "HIPRAND_STATUS_VERSION_MISMATCH";
+    case HIPRAND_STATUS_NOT_INITIALIZED:
+      return "HIPRAND_STATUS_NOT_INITIALIZED";
+    case HIPRAND_STATUS_ALLOCATION_FAILED:
+      return "HIPRAND_STATUS_ALLOCATION_FAILED";
+    case HIPRAND_STATUS_TYPE_ERROR:
+      return "HIPRAND_STATUS_TYPE_ERROR";
+    case HIPRAND_STATUS_OUT_OF_RANGE:
+      return "HIPRAND_STATUS_OUT_OF_RANGE";
+    case HIPRAND_STATUS_LENGTH_NOT_MULTIPLE:
+      return "HIPRAND_STATUS_LENGTH_NOT_MULTIPLE";
+    case HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED:
+      return "HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED";
+    case HIPRAND_STATUS_LAUNCH_FAILURE:
+      return "HIPRAND_STATUS_LAUNCH_FAILURE";
+    case HIPRAND_STATUS_PREEXISTING_FAILURE:
+      return "HIPRAND_STATUS_PREEXISTING_FAILURE";
+    case HIPRAND_STATUS_INITIALIZATION_FAILED:
+      return "HIPRAND_STATUS_INITIALIZATION_FAILED";
+    case HIPRAND_STATUS_ARCH_MISMATCH:
+      return "HIPRAND_STATUS_ARCH_MISMATCH";
+    case HIPRAND_STATUS_INTERNAL_ERROR:
+      return "HIPRAND_STATUS_INTERNAL_ERROR";
   }
   // To suppress compiler warning.
-  return "Unrecognized curand error string";
+  return "Unrecognized hiprand error string";
 }
 /**
- * @brief Cast data type to cudaDataType_t.
+ * @brief Cast data type to hipDataType.
  */
 template <typename T>
 struct cuda_dtype {
-  static constexpr cudaDataType_t value = CUDA_R_32F;
+  static constexpr hipDataType value = HIP_R_32F;
 };
 template <>
 struct cuda_dtype<__half> {
-  static constexpr cudaDataType_t value = CUDA_R_16F;
+  static constexpr hipDataType value = HIP_R_16F;
 };
 #if BF16_ENABLED
 template <>
-struct cuda_dtype<__nv_bfloat16> {
-  static constexpr cudaDataType_t value = CUDA_R_16BF;
+struct cuda_dtype<__hip_bfloat16> {
+  static constexpr hipDataType value = HIP_R_16BF;
 };
 #endif  // BF16_ENABLED
 template <>
 struct cuda_dtype<float> {
-  static constexpr cudaDataType_t value = CUDA_R_32F;
+  static constexpr hipDataType value = HIP_R_32F;
 };
 template <>
 struct cuda_dtype<double> {
-  static constexpr cudaDataType_t value = CUDA_R_64F;
+  static constexpr hipDataType value = HIP_R_64F;
 };
 /*
@@ -202,7 +203,7 @@ struct accum_dtype<__half> {
 #if BF16_ENABLED
 template <>
-struct accum_dtype<__nv_bfloat16> {
+struct accum_dtype<__hip_bfloat16> {
   typedef float type;
 };
 #endif  // BF16_ENABLED
@@ -217,23 +218,23 @@ struct accum_dtype<double> {
   typedef double type;
 };
-#if CUDART_VERSION >= 11000
+#if DTKRT_VERSION >= 11000
 /**
- * @brief Cast index data type to cusparseIndexType_t.
+ * @brief Cast index data type to hipsparseIndexType_t.
  */
 template <typename T>
 struct cusparse_idtype {
-  static constexpr cusparseIndexType_t value = CUSPARSE_INDEX_32I;
+  static constexpr hipsparseIndexType_t value = HIPSPARSE_INDEX_32I;
 };
 template <>
 struct cusparse_idtype<int32_t> {
-  static constexpr cusparseIndexType_t value = CUSPARSE_INDEX_32I;
+  static constexpr hipsparseIndexType_t value = HIPSPARSE_INDEX_32I;
 };
 template <>
 struct cusparse_idtype<int64_t> {
-  static constexpr cusparseIndexType_t value = CUSPARSE_INDEX_64I;
+  static constexpr hipsparseIndexType_t value = HIPSPARSE_INDEX_64I;
 };
 #endif
@@ -241,9 +242,9 @@ struct cusparse_idtype<int64_t> {
 class CUDAThreadEntry {
  public:
   /** @brief The cusparse handler */
-  cusparseHandle_t cusparse_handle{nullptr};
+  hipsparseHandle_t cusparse_handle{nullptr};
   /** @brief The cublas handler */
-  cublasHandle_t cublas_handle{nullptr};
+  hipblasHandle_t cublas_handle{nullptr};
   /** @brief thread local pool*/
   WorkspacePool pool;
   /** @brief constructor */
@@ -253,7 +254,7 @@ class CUDAThreadEntry {
 };
 /** @brief Get the current CUDA stream */
-cudaStream_t getCurrentCUDAStream();
+hipStream_t getCurrentHIPStreamMasqueradingAsCUDA();
 }  // namespace runtime
 }  // namespace dgl
 #endif  // DGL_RUNTIME_CUDA_CUDA_COMMON_H_
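The CUDA_CALL and CUDA_KERNEL_CALL macros above keep their CUDA names but now expand to hip* calls and hipLaunchKernelGGL. A self-contained sketch of the same check-every-call and check-after-launch discipline, using a hypothetical macro and kernel (not the DGL macros themselves):

// Hypothetical illustration of the error-checking pattern the hipified
// macros implement: wrap every hip* call, and follow each launch with
// hipGetLastError() plus an explicit synchronize while debugging.
#include <hip/hip_runtime.h>
#include <cstdio>
#include <cstdlib>

#define CHECK_HIP(expr)                                        \
  do {                                                         \
    hipError_t e = (expr);                                     \
    if (e != hipSuccess) {                                     \
      fprintf(stderr, "HIP error %s at %s:%d\n",               \
              hipGetErrorString(e), __FILE__, __LINE__);       \
      exit(1);                                                 \
    }                                                          \
  } while (0)

__global__ void FillKernel(int* data, int value, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] = value;
}

int main() {
  const int n = 256;
  int* d_data = nullptr;
  CHECK_HIP(hipMalloc(&d_data, n * sizeof(int)));
  // Rough equivalent of CUDA_KERNEL_CALL(FillKernel, nblks, nthrs, 0, stream, ...):
  hipLaunchKernelGGL(FillKernel, dim3((n + 127) / 128), dim3(128), 0, 0,
                     d_data, 7, n);
  CHECK_HIP(hipGetLastError());       // catch launch configuration errors
  CHECK_HIP(hipDeviceSynchronize());  // catch asynchronous execution errors
  int h_first = 0;
  CHECK_HIP(hipMemcpy(&h_first, d_data, sizeof(int), hipMemcpyDeviceToHost));
  printf("first element: %d\n", h_first);
  CHECK_HIP(hipFree(d_data));
  return 0;
}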
// !!! This is a file automatically generated by hipify!!!
/** /**
* Copyright (c) 2017-2022 by Contributors * Copyright (c) 2017-2022 by Contributors
* @file cuda_device_api.cc * @file cuda_device_api.cc
* @brief GPU specific API * @brief GPU specific API
*/ */
#include <cuda_runtime.h> #include <hip/hip_runtime.h>
#include <dgl/runtime/device_api.h> #include <dgl/runtime/device_api.h>
#include <dgl/runtime/registry.h> #include <dgl/runtime/registry.h>
#include <dgl/runtime/tensordispatch.h> #include <dgl/runtime/tensordispatch.h>
...@@ -18,13 +19,13 @@ class CUDADeviceAPI final : public DeviceAPI { ...@@ -18,13 +19,13 @@ class CUDADeviceAPI final : public DeviceAPI {
public: public:
CUDADeviceAPI() { CUDADeviceAPI() {
int count; int count;
auto err = cudaGetDeviceCount(&count); auto err = hipGetDeviceCount(&count);
switch (err) { switch (err) {
case cudaSuccess: case hipSuccess:
break; break;
default: default:
count = 0; count = 0;
cudaGetLastError(); hipGetLastError();
} }
is_available_ = count > 0; is_available_ = count > 0;
} }
...@@ -32,67 +33,68 @@ class CUDADeviceAPI final : public DeviceAPI { ...@@ -32,67 +33,68 @@ class CUDADeviceAPI final : public DeviceAPI {
bool IsAvailable() final { return is_available_; } bool IsAvailable() final { return is_available_; }
void SetDevice(DGLContext ctx) final { void SetDevice(DGLContext ctx) final {
CUDA_CALL(cudaSetDevice(ctx.device_id)); CUDA_CALL(hipSetDevice(ctx.device_id));
} }
void GetAttr(DGLContext ctx, DeviceAttrKind kind, DGLRetValue* rv) final { void GetAttr(DGLContext ctx, DeviceAttrKind kind, DGLRetValue* rv) final {
int value = 0; int value = 0;
switch (kind) { switch (kind) {
case kExist: case kExist:
value = value =
(cudaDeviceGetAttribute( (hipDeviceGetAttribute(
&value, cudaDevAttrMaxThreadsPerBlock, ctx.device_id) == &value, hipDeviceAttributeMaxThreadsPerBlock, ctx.device_id) ==
cudaSuccess); hipSuccess);
break; break;
case kMaxThreadsPerBlock: { case kMaxThreadsPerBlock: {
CUDA_CALL(cudaDeviceGetAttribute( CUDA_CALL(hipDeviceGetAttribute(
&value, cudaDevAttrMaxThreadsPerBlock, ctx.device_id)); &value, hipDeviceAttributeMaxThreadsPerBlock, ctx.device_id));
break; break;
} }
case kWarpSize: { case kWarpSize: {
CUDA_CALL( CUDA_CALL(
cudaDeviceGetAttribute(&value, cudaDevAttrWarpSize, ctx.device_id)); hipDeviceGetAttribute(&value, hipDeviceAttributeWarpSize, ctx.device_id));
break; break;
} }
case kMaxSharedMemoryPerBlock: { case kMaxSharedMemoryPerBlock: {
CUDA_CALL(cudaDeviceGetAttribute( CUDA_CALL(hipDeviceGetAttribute(
&value, cudaDevAttrMaxSharedMemoryPerBlock, ctx.device_id)); &value, hipDeviceAttributeMaxSharedMemoryPerBlock, ctx.device_id));
break; break;
} }
case kComputeVersion: { case kComputeVersion: {
std::ostringstream os; std::ostringstream os;
CUDA_CALL(cudaDeviceGetAttribute( CUDA_CALL(hipDeviceGetAttribute(
&value, cudaDevAttrComputeCapabilityMajor, ctx.device_id)); &value, hipDeviceAttributeComputeCapabilityMajor, ctx.device_id));
os << value << "."; os << value << ".";
CUDA_CALL(cudaDeviceGetAttribute( CUDA_CALL(hipDeviceGetAttribute(
&value, cudaDevAttrComputeCapabilityMinor, ctx.device_id)); &value, hipDeviceAttributeComputeCapabilityMinor, ctx.device_id));
os << value; os << value;
*rv = os.str(); *rv = os.str();
return; return;
} }
case kDeviceName: { case kDeviceName: {
cudaDeviceProp props; hipDeviceProp_t props;
CUDA_CALL(cudaGetDeviceProperties(&props, ctx.device_id)); CUDA_CALL(hipGetDeviceProperties(&props, ctx.device_id));
*rv = std::string(props.name); *rv = std::string(props.name);
// printf("******* debug: device.name:%s\n ",std::string(props.name).c_str());
return; return;
} }
case kMaxClockRate: { case kMaxClockRate: {
CUDA_CALL(cudaDeviceGetAttribute( CUDA_CALL(hipDeviceGetAttribute(
&value, cudaDevAttrClockRate, ctx.device_id)); &value, hipDeviceAttributeClockRate, ctx.device_id));
break; break;
} }
case kMultiProcessorCount: { case kMultiProcessorCount: {
CUDA_CALL(cudaDeviceGetAttribute( CUDA_CALL(hipDeviceGetAttribute(
&value, cudaDevAttrMultiProcessorCount, ctx.device_id)); &value, hipDeviceAttributeMultiprocessorCount, ctx.device_id));
break; break;
} }
case kMaxThreadDimensions: { case kMaxThreadDimensions: {
int dims[3]; int dims[3];
CUDA_CALL(cudaDeviceGetAttribute( CUDA_CALL(hipDeviceGetAttribute(
&dims[0], cudaDevAttrMaxBlockDimX, ctx.device_id)); &dims[0], hipDeviceAttributeMaxBlockDimX, ctx.device_id));
CUDA_CALL(cudaDeviceGetAttribute( CUDA_CALL(hipDeviceGetAttribute(
&dims[1], cudaDevAttrMaxBlockDimY, ctx.device_id)); &dims[1], hipDeviceAttributeMaxBlockDimY, ctx.device_id));
CUDA_CALL(cudaDeviceGetAttribute( CUDA_CALL(hipDeviceGetAttribute(
&dims[2], cudaDevAttrMaxBlockDimZ, ctx.device_id)); &dims[2], hipDeviceAttributeMaxBlockDimZ, ctx.device_id));
std::stringstream ss; // use json string to return multiple int values; std::stringstream ss; // use json string to return multiple int values;
ss << "[" << dims[0] << ", " << dims[1] << ", " << dims[2] << "]"; ss << "[" << dims[0] << ", " << dims[1] << ", " << dims[2] << "]";
...@@ -110,11 +112,11 @@ class CUDADeviceAPI final : public DeviceAPI { ...@@ -110,11 +112,11 @@ class CUDADeviceAPI final : public DeviceAPI {
TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global(); TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global();
if (tensor_dispatcher->IsAvailable()) { if (tensor_dispatcher->IsAvailable()) {
return tensor_dispatcher->CUDAAllocWorkspace( return tensor_dispatcher->CUDAAllocWorkspace(
nbytes, getCurrentCUDAStream()); nbytes, getCurrentHIPStreamMasqueradingAsCUDA());
} }
CHECK_EQ(256 % alignment, 0U) << "CUDA space is aligned at 256 bytes"; CHECK_EQ(256 % alignment, 0U) << "CUDA space is aligned at 256 bytes";
void* ret; void* ret;
CUDA_CALL(cudaMalloc(&ret, nbytes)); CUDA_CALL(hipMalloc(&ret, nbytes));
return ret; return ret;
} }
...@@ -124,32 +126,32 @@ class CUDADeviceAPI final : public DeviceAPI { ...@@ -124,32 +126,32 @@ class CUDADeviceAPI final : public DeviceAPI {
if (tensor_dispatcher->IsAvailable()) { if (tensor_dispatcher->IsAvailable()) {
return tensor_dispatcher->CUDAFreeWorkspace(ptr); return tensor_dispatcher->CUDAFreeWorkspace(ptr);
} }
CUDA_CALL(cudaFree(ptr)); CUDA_CALL(hipFree(ptr));
} }
void CopyDataFromTo( void CopyDataFromTo(
const void* from, size_t from_offset, void* to, size_t to_offset, const void* from, size_t from_offset, void* to, size_t to_offset,
size_t size, DGLContext ctx_from, DGLContext ctx_to, size_t size, DGLContext ctx_from, DGLContext ctx_to,
DGLDataType type_hint, DGLStreamHandle stream) { DGLDataType type_hint, DGLStreamHandle stream) {
cudaStream_t cu_stream = static_cast<cudaStream_t>(stream); hipStream_t cu_stream = static_cast<hipStream_t>(stream);
from = static_cast<const char*>(from) + from_offset; from = static_cast<const char*>(from) + from_offset;
to = static_cast<char*>(to) + to_offset; to = static_cast<char*>(to) + to_offset;
if (ctx_from.device_type == kDGLCUDA && ctx_to.device_type == kDGLCUDA) { if (ctx_from.device_type == kDGLCUDA && ctx_to.device_type == kDGLCUDA || ctx_from.device_type == kDGLROCM && ctx_to.device_type == kDGLROCM) {
CUDA_CALL(cudaSetDevice(ctx_from.device_id)); CUDA_CALL(hipSetDevice(ctx_from.device_id));
if (ctx_from.device_id == ctx_to.device_id) { if (ctx_from.device_id == ctx_to.device_id) {
GPUCopy(from, to, size, cudaMemcpyDeviceToDevice, cu_stream); GPUCopy(from, to, size, hipMemcpyDeviceToDevice, cu_stream);
} else { } else {
CUDA_CALL(cudaMemcpyPeerAsync( CUDA_CALL(hipMemcpyPeerAsync(
to, ctx_to.device_id, from, ctx_from.device_id, size, cu_stream)); to, ctx_to.device_id, from, ctx_from.device_id, size, cu_stream));
} }
} else if ( } else if (
ctx_from.device_type == kDGLCUDA && ctx_to.device_type == kDGLCPU) { (ctx_from.device_type == kDGLCUDA || ctx_to.device_type == kDGLROCM)&& ctx_to.device_type == kDGLCPU) {
CUDA_CALL(cudaSetDevice(ctx_from.device_id)); CUDA_CALL(hipSetDevice(ctx_from.device_id));
GPUCopy(from, to, size, cudaMemcpyDeviceToHost, cu_stream); GPUCopy(from, to, size, hipMemcpyDeviceToHost, cu_stream);
} else if ( } else if (
ctx_from.device_type == kDGLCPU && ctx_to.device_type == kDGLCUDA) { ctx_from.device_type == kDGLCPU && (ctx_to.device_type == kDGLCUDA||ctx_to.device_type == kDGLROCM)) {
CUDA_CALL(cudaSetDevice(ctx_to.device_id)); CUDA_CALL(hipSetDevice(ctx_to.device_id));
GPUCopy(from, to, size, cudaMemcpyHostToDevice, cu_stream); GPUCopy(from, to, size, hipMemcpyHostToDevice, cu_stream);
} else { } else {
LOG(FATAL) << "expect copy from/to GPU or between GPU"; LOG(FATAL) << "expect copy from/to GPU or between GPU";
} }
...@@ -166,9 +168,9 @@ class CUDADeviceAPI final : public DeviceAPI { ...@@ -166,9 +168,9 @@ class CUDADeviceAPI final : public DeviceAPI {
} }
// To ensure correct behavior, `record_event` must be invoked anytime a // To ensure correct behavior, `record_event` must be invoked anytime a
// pointer from PyTorch CachingHostAllocator is used in a cudaMemcpyAsync // pointer from PyTorch CachingHostAllocator is used in a hipMemcpyAsync
// call. It provides a way to re-use freed pinned (page-locked) memory // call. It provides a way to re-use freed pinned (page-locked) memory
// allocations and avoid device sync due to cudaFreeHost calls. // allocations and avoid device sync due to hipHostFree calls.
void RecordedCopyDataFromTo( void RecordedCopyDataFromTo(
void* from, size_t from_offset, void* to, size_t to_offset, size_t size, void* from, size_t from_offset, void* to, size_t to_offset, size_t size,
DGLContext ctx_from, DGLContext ctx_to, DGLDataType type_hint, DGLContext ctx_from, DGLContext ctx_to, DGLDataType type_hint,
...@@ -179,7 +181,7 @@ class CUDADeviceAPI final : public DeviceAPI { ...@@ -179,7 +181,7 @@ class CUDADeviceAPI final : public DeviceAPI {
stream); stream);
auto tensor_dispatcher = TensorDispatcher::Global(); auto tensor_dispatcher = TensorDispatcher::Global();
if (tensor_dispatcher->IsAvailable()) { if (tensor_dispatcher->IsAvailable()) {
auto custream = static_cast<cudaStream_t>(stream); auto custream = static_cast<hipStream_t>(stream);
void* ptr = ctx_to.device_type == kDGLCPU ? to : from; void* ptr = ctx_to.device_type == kDGLCPU ? to : from;
int id = int id =
ctx_to.device_type == kDGLCPU ? ctx_from.device_id : ctx_to.device_id; ctx_to.device_type == kDGLCPU ? ctx_from.device_id : ctx_to.device_id;
...@@ -188,34 +190,34 @@ class CUDADeviceAPI final : public DeviceAPI { ...@@ -188,34 +190,34 @@ class CUDADeviceAPI final : public DeviceAPI {
} }
DGLStreamHandle CreateStream(DGLContext ctx) { DGLStreamHandle CreateStream(DGLContext ctx) {
CUDA_CALL(cudaSetDevice(ctx.device_id)); CUDA_CALL(hipSetDevice(ctx.device_id));
cudaStream_t retval; hipStream_t retval;
// make sure the legacy default stream won't block on this stream // make sure the legacy default stream won't block on this stream
CUDA_CALL(cudaStreamCreateWithFlags(&retval, cudaStreamNonBlocking)); CUDA_CALL(hipStreamCreateWithFlags(&retval, hipStreamNonBlocking));
return static_cast<DGLStreamHandle>(retval); return static_cast<DGLStreamHandle>(retval);
} }
void FreeStream(DGLContext ctx, DGLStreamHandle stream) { void FreeStream(DGLContext ctx, DGLStreamHandle stream) {
CUDA_CALL(cudaSetDevice(ctx.device_id)); CUDA_CALL(hipSetDevice(ctx.device_id));
cudaStream_t cu_stream = static_cast<cudaStream_t>(stream); hipStream_t cu_stream = static_cast<hipStream_t>(stream);
CUDA_CALL(cudaStreamDestroy(cu_stream)); CUDA_CALL(hipStreamDestroy(cu_stream));
} }
void SyncStreamFromTo( void SyncStreamFromTo(
DGLContext ctx, DGLStreamHandle event_src, DGLStreamHandle event_dst) { DGLContext ctx, DGLStreamHandle event_src, DGLStreamHandle event_dst) {
CUDA_CALL(cudaSetDevice(ctx.device_id)); CUDA_CALL(hipSetDevice(ctx.device_id));
cudaStream_t src_stream = static_cast<cudaStream_t>(event_src); hipStream_t src_stream = static_cast<hipStream_t>(event_src);
cudaStream_t dst_stream = static_cast<cudaStream_t>(event_dst); hipStream_t dst_stream = static_cast<hipStream_t>(event_dst);
cudaEvent_t evt; hipEvent_t evt;
CUDA_CALL(cudaEventCreate(&evt)); CUDA_CALL(hipEventCreate(&evt));
CUDA_CALL(cudaEventRecord(evt, src_stream)); CUDA_CALL(hipEventRecord(evt, src_stream));
CUDA_CALL(cudaStreamWaitEvent(dst_stream, evt, 0)); CUDA_CALL(hipStreamWaitEvent(dst_stream, evt, 0));
CUDA_CALL(cudaEventDestroy(evt)); CUDA_CALL(hipEventDestroy(evt));
} }
void StreamSync(DGLContext ctx, DGLStreamHandle stream) final { void StreamSync(DGLContext ctx, DGLStreamHandle stream) final {
CUDA_CALL(cudaSetDevice(ctx.device_id)); CUDA_CALL(hipSetDevice(ctx.device_id));
CUDA_CALL(cudaStreamSynchronize(static_cast<cudaStream_t>(stream))); CUDA_CALL(hipStreamSynchronize(static_cast<hipStream_t>(stream)));
} }
/** NOTE: If the backend is PyTorch, we will use PyTorch's stream management, /** NOTE: If the backend is PyTorch, we will use PyTorch's stream management,
...@@ -227,10 +229,10 @@ class CUDADeviceAPI final : public DeviceAPI { ...@@ -227,10 +229,10 @@ class CUDADeviceAPI final : public DeviceAPI {
void SetStream(DGLContext ctx, DGLStreamHandle stream) final {} void SetStream(DGLContext ctx, DGLStreamHandle stream) final {}
DGLStreamHandle GetStream() const final { DGLStreamHandle GetStream() const final {
return static_cast<DGLStreamHandle>(getCurrentCUDAStream()); return static_cast<DGLStreamHandle>(getCurrentHIPStreamMasqueradingAsCUDA());
} }
/** NOTE: cudaHostRegister can be called from an arbitrary GPU device, /** NOTE: hipHostRegister can be called from an arbitrary GPU device,
* so we don't need to specify a ctx. * so we don't need to specify a ctx.
* The pinned memory can be seen by all CUDA contexts, * The pinned memory can be seen by all CUDA contexts,
* not just the one that performed the allocation * not just the one that performed the allocation
...@@ -244,13 +246,13 @@ class CUDADeviceAPI final : public DeviceAPI { ...@@ -244,13 +246,13 @@ class CUDADeviceAPI final : public DeviceAPI {
if (tensor_dispatcher->IsAvailable()) { if (tensor_dispatcher->IsAvailable()) {
tensor_dispatcher->CUDAHostAllocatorEmptyCache(); tensor_dispatcher->CUDAHostAllocatorEmptyCache();
} }
CUDA_CALL(cudaHostRegister(ptr, nbytes, cudaHostRegisterDefault)); CUDA_CALL(hipHostRegister(ptr, nbytes, hipHostRegisterDefault));
return true; return true;
} }
void UnpinData(void* ptr) { void UnpinData(void* ptr) {
if (ptr == nullptr) return; if (ptr == nullptr) return;
CUDA_CALL(cudaHostUnregister(ptr)); CUDA_CALL(hipHostUnregister(ptr));
} }
void* AllocPinnedDataSpace( void* AllocPinnedDataSpace(
...@@ -276,33 +278,33 @@ class CUDADeviceAPI final : public DeviceAPI { ...@@ -276,33 +278,33 @@ class CUDADeviceAPI final : public DeviceAPI {
// can't be a pinned tensor if CUDA context is unavailable. // can't be a pinned tensor if CUDA context is unavailable.
if (!is_available_) return false; if (!is_available_) return false;
cudaPointerAttributes attr; hipPointerAttribute_t attr;
cudaError_t status = cudaPointerGetAttributes(&attr, ptr); hipError_t status = hipPointerGetAttributes(&attr, ptr);
bool result = false; bool result = false;
switch (status) { switch (status) {
case cudaErrorInvalidValue: case hipErrorInvalidValue:
// might be a normal CPU tensor on CUDA 10.2 or earlier // might be a normal CPU tensor on CUDA 10.2 or earlier
cudaGetLastError(); // clear error hipGetLastError(); // clear error
break; break;
case cudaSuccess: case hipSuccess:
result = (attr.type == cudaMemoryTypeHost); result = (attr.type == hipMemoryTypeHost);
break; break;
case cudaErrorInitializationError: case hipErrorInitializationError:
case cudaErrorNoDevice: case hipErrorNoDevice:
case cudaErrorInsufficientDriver: case hipErrorInsufficientDriver:
case cudaErrorInvalidDevice: case hipErrorInvalidDevice:
// We don't want to fail in these particular cases since this function // We don't want to fail in these particular cases since this function
// can be called when users only want to run on CPU even if CUDA API is // can be called when users only want to run on CPU even if CUDA API is
// enabled, or in a forked subprocess where CUDA context cannot be // enabled, or in a forked subprocess where CUDA context cannot be
// initialized. So we just mark the CUDA context as unavailable and // initialized. So we just mark the CUDA context as unavailable and
// return. // return.
is_available_ = false; is_available_ = false;
cudaGetLastError(); // clear error hipGetLastError(); // clear error
break; break;
default: default:
LOG(FATAL) << "error while determining memory status: " LOG(FATAL) << "error while determining memory status: "
<< cudaGetErrorString(status); << hipGetErrorString(status);
break; break;
} }
...@@ -316,7 +318,7 @@ class CUDADeviceAPI final : public DeviceAPI { ...@@ -316,7 +318,7 @@ class CUDADeviceAPI final : public DeviceAPI {
TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global(); TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global();
if (tensor_dispatcher->IsAvailable()) if (tensor_dispatcher->IsAvailable())
return tensor_dispatcher->CUDAAllocWorkspace( return tensor_dispatcher->CUDAAllocWorkspace(
size, getCurrentCUDAStream()); size, getCurrentHIPStreamMasqueradingAsCUDA());
return CUDAThreadEntry::ThreadLocal()->pool.AllocWorkspace(ctx, size); return CUDAThreadEntry::ThreadLocal()->pool.AllocWorkspace(ctx, size);
} }
...@@ -338,13 +340,13 @@ class CUDADeviceAPI final : public DeviceAPI { ...@@ -338,13 +340,13 @@ class CUDADeviceAPI final : public DeviceAPI {
private: private:
static void GPUCopy( static void GPUCopy(
const void* from, void* to, size_t size, cudaMemcpyKind kind, const void* from, void* to, size_t size, hipMemcpyKind kind,
cudaStream_t stream) { hipStream_t stream) {
CUDA_CALL(cudaMemcpyAsync(to, from, size, kind, stream)); CUDA_CALL(hipMemcpyAsync(to, from, size, kind, stream));
if (stream == 0 && kind == cudaMemcpyDeviceToHost) { if (stream == 0 && kind == hipMemcpyDeviceToHost) {
// only wait for the copy when it's on the default stream and it's to // only wait for the copy when it's on the default stream and it's to
// host memory // host memory
CUDA_CALL(cudaStreamSynchronize(stream)); CUDA_CALL(hipStreamSynchronize(stream));
} }
} }
...@@ -359,7 +361,7 @@ CUDAThreadEntry* CUDAThreadEntry::ThreadLocal() { ...@@ -359,7 +361,7 @@ CUDAThreadEntry* CUDAThreadEntry::ThreadLocal() {
return CUDAThreadStore::Get(); return CUDAThreadStore::Get();
} }
cudaStream_t getCurrentCUDAStream() { hipStream_t getCurrentHIPStreamMasqueradingAsCUDA() {
TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global(); TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global();
if (tensor_dispatcher->IsAvailable()) if (tensor_dispatcher->IsAvailable())
return tensor_dispatcher->CUDAGetCurrentStream(); return tensor_dispatcher->CUDAGetCurrentStream();
......
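For reference, a minimal standalone sketch (not part of this commit) of the event-based cross-stream ordering that SyncStreamFromTo implements after hipification: work enqueued on dst after the wait cannot start until everything already enqueued on src has finished. The HIP_CHECK macro and the src/dst names are illustrative, not DGL code.

// Event-based ordering between two HIP streams, mirroring SyncStreamFromTo.
#include <hip/hip_runtime.h>

#include <cstdio>
#include <cstdlib>

#define HIP_CHECK(expr)                                                  \
  do {                                                                   \
    hipError_t e_ = (expr);                                              \
    if (e_ != hipSuccess) {                                              \
      std::fprintf(stderr, "HIP error: %s\n", hipGetErrorString(e_));    \
      std::exit(1);                                                      \
    }                                                                    \
  } while (0)

int main() {
  hipStream_t src = nullptr, dst = nullptr;
  // Non-blocking streams, as in CreateStream, so the legacy default stream
  // does not serialize with them.
  HIP_CHECK(hipStreamCreateWithFlags(&src, hipStreamNonBlocking));
  HIP_CHECK(hipStreamCreateWithFlags(&dst, hipStreamNonBlocking));

  // ... producer kernels / async copies would be enqueued on src here ...

  hipEvent_t evt = nullptr;
  HIP_CHECK(hipEventCreate(&evt));
  HIP_CHECK(hipEventRecord(evt, src));         // mark the current tail of src
  HIP_CHECK(hipStreamWaitEvent(dst, evt, 0));  // dst waits for that point
  HIP_CHECK(hipEventDestroy(evt));  // safe: the wait is already enqueued

  // ... consumer kernels would be enqueued on dst here ...

  HIP_CHECK(hipStreamSynchronize(dst));
  HIP_CHECK(hipStreamDestroy(src));
  HIP_CHECK(hipStreamDestroy(dst));
  return 0;
}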
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/** /**
* Copyright (c) 2021 by Contributors * Copyright (c) 2021 by Contributors
* @file runtime/cuda/cuda_device_common.cuh * @file runtime/cuda/cuda_device_common.cuh
...@@ -10,7 +12,7 @@ ...@@ -10,7 +12,7 @@
#include <dgl/runtime/c_runtime_api.h> #include <dgl/runtime/c_runtime_api.h>
#include "cuda_common.h" #include "cuda_common.h"
#include "cuda_runtime.h" #include <hip/hip_runtime.h>
namespace dgl { namespace dgl {
namespace runtime { namespace runtime {
...@@ -228,7 +230,7 @@ class OrderedHashTable { ...@@ -228,7 +230,7 @@ class OrderedHashTable {
* @param stream The stream to use for initializing the hashtable. * @param stream The stream to use for initializing the hashtable.
*/ */
OrderedHashTable( OrderedHashTable(
const size_t size, DGLContext ctx, cudaStream_t stream, const size_t size, DGLContext ctx, hipStream_t stream,
const int scale = kDefaultScale); const int scale = kDefaultScale);
/** /**
...@@ -252,7 +254,7 @@ class OrderedHashTable { ...@@ -252,7 +254,7 @@ class OrderedHashTable {
*/ */
void FillWithDuplicates( void FillWithDuplicates(
const IdType* const input, const size_t num_input, IdType* const unique, const IdType* const input, const size_t num_input, IdType* const unique,
int64_t* const num_unique, cudaStream_t stream); int64_t* const num_unique, hipStream_t stream);
/** /**
* @brief Fill the hashtable with an array of unique keys. * @brief Fill the hashtable with an array of unique keys.
...@@ -262,7 +264,7 @@ class OrderedHashTable { ...@@ -262,7 +264,7 @@ class OrderedHashTable {
* @param stream The stream to perform operations on. * @param stream The stream to perform operations on.
*/ */
void FillWithUnique( void FillWithUnique(
const IdType* const input, const size_t num_input, cudaStream_t stream); const IdType* const input, const size_t num_input, hipStream_t stream);
/** /**
* @brief Get a version of the hashtable usable from device functions. * @brief Get a version of the hashtable usable from device functions.
......
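The header above only declares the hash-table API. As a rough illustration of the contract behind FillWithDuplicates (insert possibly-duplicated ids, report each unique id exactly once), here is a small host-side open-addressing analogue. It is a hedged sketch assuming non-negative ids; HostHashTable and FillWithDuplicatesHost are invented names, and the real GPU implementation in the next file differs (it probes with an increasing delta and compacts results with a block scan).

// Host-side analogue (illustration only) of the OrderedHashTable contract.
#include <cstdint>
#include <cstdio>
#include <vector>

using IdType = int64_t;
constexpr IdType kEmptyKey = -1;  // sentinel for an unused bucket

struct HostHashTable {
  std::vector<IdType> buckets;
  explicit HostHashTable(size_t capacity) : buckets(capacity, kEmptyKey) {}

  size_t Hash(IdType id) const {
    return static_cast<size_t>(id) % buckets.size();
  }

  // Returns true if id was newly inserted, false if it was already present.
  bool Insert(IdType id) {
    size_t pos = Hash(id);
    // Simple linear probing; the device code uses a growing probe step.
    while (buckets[pos] != kEmptyKey) {
      if (buckets[pos] == id) return false;
      pos = (pos + 1) % buckets.size();
    }
    buckets[pos] = id;
    return true;
  }
};

// De-duplicate input, keeping the first-seen order of the unique ids.
std::vector<IdType> FillWithDuplicatesHost(const std::vector<IdType>& input) {
  HostHashTable table(2 * input.size() + 1);  // more buckets than items
  std::vector<IdType> unique;
  for (IdType id : input)
    if (table.Insert(id)) unique.push_back(id);
  return unique;
}

int main() {
  const std::vector<IdType> input = {4, 7, 4, 1, 7, 9};
  for (IdType id : FillWithDuplicatesHost(input))
    std::printf("%lld ", static_cast<long long>(id));
  std::printf("\n");  // prints: 4 7 1 9
  return 0;
}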
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/** /**
* Copyright (c) 2021 by Contributors * Copyright (c) 2021 by Contributors
* @file runtime/cuda/cuda_device_common.cuh * @file runtime/cuda/cuda_device_common.cuh
...@@ -5,7 +7,7 @@ ...@@ -5,7 +7,7 @@
*/ */
#include <cassert> #include <cassert>
#include <cub/cub.cuh> // NOLINT #include <hipcub/hipcub.hpp> // NOLINT
#include "../../array/cuda/atomic.cuh" #include "../../array/cuda/atomic.cuh"
#include "cuda_common.h" #include "cuda_common.h"
...@@ -54,7 +56,9 @@ class MutableDeviceOrderedHashTable : public DeviceOrderedHashTable<IdType> { ...@@ -54,7 +56,9 @@ class MutableDeviceOrderedHashTable : public DeviceOrderedHashTable<IdType> {
* @return The mapping. * @return The mapping.
*/ */
inline __device__ Iterator Search(const IdType id) { inline __device__ Iterator Search(const IdType id) {
const IdType pos = SearchForPosition(id); // const IdType pos = SearchForPosition(id);
const IdType pos = DeviceOrderedHashTable<IdType>::SearchForPosition(id);
return GetMutable(pos); return GetMutable(pos);
} }
...@@ -95,12 +99,16 @@ class MutableDeviceOrderedHashTable : public DeviceOrderedHashTable<IdType> { ...@@ -95,12 +99,16 @@ class MutableDeviceOrderedHashTable : public DeviceOrderedHashTable<IdType> {
* @return An iterator to inserted mapping. * @return An iterator to inserted mapping.
*/ */
inline __device__ Iterator Insert(const IdType id, const size_t index) { inline __device__ Iterator Insert(const IdType id, const size_t index) {
size_t pos = Hash(id); // size_t pos = Hash(id);
size_t pos = DeviceOrderedHashTable<IdType>::Hash(id);
// linearly scan for an empty slot or matching entry // linearly scan for an empty slot or matching entry
IdType delta = 1; IdType delta = 1;
while (!AttemptInsertAt(pos, id, index)) { while (!AttemptInsertAt(pos, id, index)) {
pos = Hash(pos + delta); // pos = Hash(pos + delta);
pos = DeviceOrderedHashTable<IdType>::Hash(pos + delta);
delta += 1; delta += 1;
} }
...@@ -246,7 +254,7 @@ __global__ void count_hashmap( ...@@ -246,7 +254,7 @@ __global__ void count_hashmap(
DeviceOrderedHashTable<IdType> table, IdType* const num_unique) { DeviceOrderedHashTable<IdType> table, IdType* const num_unique) {
assert(BLOCK_SIZE == blockDim.x); assert(BLOCK_SIZE == blockDim.x);
using BlockReduce = typename cub::BlockReduce<IdType, BLOCK_SIZE>; using BlockReduce = typename hipcub::BlockReduce<IdType, BLOCK_SIZE>;
using Mapping = typename DeviceOrderedHashTable<IdType>::Mapping; using Mapping = typename DeviceOrderedHashTable<IdType>::Mapping;
const size_t block_start = TILE_SIZE * blockIdx.x; const size_t block_start = TILE_SIZE * blockIdx.x;
...@@ -300,7 +308,7 @@ __global__ void compact_hashmap( ...@@ -300,7 +308,7 @@ __global__ void compact_hashmap(
assert(BLOCK_SIZE == blockDim.x); assert(BLOCK_SIZE == blockDim.x);
using FlagType = uint16_t; using FlagType = uint16_t;
using BlockScan = typename cub::BlockScan<FlagType, BLOCK_SIZE>; using BlockScan = typename hipcub::BlockScan<FlagType, BLOCK_SIZE>;
using Mapping = typename DeviceOrderedHashTable<IdType>::Mapping; using Mapping = typename DeviceOrderedHashTable<IdType>::Mapping;
constexpr const int32_t VALS_PER_THREAD = TILE_SIZE / BLOCK_SIZE; constexpr const int32_t VALS_PER_THREAD = TILE_SIZE / BLOCK_SIZE;
...@@ -359,7 +367,7 @@ DeviceOrderedHashTable<IdType> OrderedHashTable<IdType>::DeviceHandle() const { ...@@ -359,7 +367,7 @@ DeviceOrderedHashTable<IdType> OrderedHashTable<IdType>::DeviceHandle() const {
template <typename IdType> template <typename IdType>
OrderedHashTable<IdType>::OrderedHashTable( OrderedHashTable<IdType>::OrderedHashTable(
const size_t size, DGLContext ctx, cudaStream_t stream, const int scale) const size_t size, DGLContext ctx, hipStream_t stream, const int scale)
: table_(nullptr), size_(TableSize(size, scale)), ctx_(ctx) { : table_(nullptr), size_(TableSize(size, scale)), ctx_(ctx) {
// make sure we will have at least as many buckets as items. // make sure we will have at least as many buckets as items.
CHECK_GT(scale, 0); CHECK_GT(scale, 0);
...@@ -368,7 +376,7 @@ OrderedHashTable<IdType>::OrderedHashTable( ...@@ -368,7 +376,7 @@ OrderedHashTable<IdType>::OrderedHashTable(
table_ = static_cast<Mapping*>( table_ = static_cast<Mapping*>(
device->AllocWorkspace(ctx_, sizeof(Mapping) * size_)); device->AllocWorkspace(ctx_, sizeof(Mapping) * size_));
CUDA_CALL(cudaMemsetAsync( CUDA_CALL(hipMemsetAsync(
table_, DeviceOrderedHashTable<IdType>::kEmptyKey, table_, DeviceOrderedHashTable<IdType>::kEmptyKey,
sizeof(Mapping) * size_, stream)); sizeof(Mapping) * size_, stream));
} }
...@@ -382,7 +390,7 @@ OrderedHashTable<IdType>::~OrderedHashTable() { ...@@ -382,7 +390,7 @@ OrderedHashTable<IdType>::~OrderedHashTable() {
template <typename IdType> template <typename IdType>
void OrderedHashTable<IdType>::FillWithDuplicates( void OrderedHashTable<IdType>::FillWithDuplicates(
const IdType* const input, const size_t num_input, IdType* const unique, const IdType* const input, const size_t num_input, IdType* const unique,
int64_t* const num_unique, cudaStream_t stream) { int64_t* const num_unique, hipStream_t stream) {
auto device = runtime::DeviceAPI::Get(ctx_); auto device = runtime::DeviceAPI::Get(ctx_);
const int64_t num_tiles = (num_input + TILE_SIZE - 1) / TILE_SIZE; const int64_t num_tiles = (num_input + TILE_SIZE - 1) / TILE_SIZE;
...@@ -404,12 +412,12 @@ void OrderedHashTable<IdType>::FillWithDuplicates( ...@@ -404,12 +412,12 @@ void OrderedHashTable<IdType>::FillWithDuplicates(
input, num_input, device_table, item_prefix); input, num_input, device_table, item_prefix);
size_t workspace_bytes; size_t workspace_bytes;
CUDA_CALL(cub::DeviceScan::ExclusiveSum( CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
nullptr, workspace_bytes, static_cast<IdType*>(nullptr), nullptr, workspace_bytes, static_cast<IdType*>(nullptr),
static_cast<IdType*>(nullptr), grid.x + 1, stream)); static_cast<IdType*>(nullptr), grid.x + 1, stream));
void* workspace = device->AllocWorkspace(ctx_, workspace_bytes); void* workspace = device->AllocWorkspace(ctx_, workspace_bytes);
CUDA_CALL(cub::DeviceScan::ExclusiveSum( CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
workspace, workspace_bytes, item_prefix, item_prefix, grid.x + 1, workspace, workspace_bytes, item_prefix, item_prefix, grid.x + 1,
stream)); stream));
device->FreeWorkspace(ctx_, workspace); device->FreeWorkspace(ctx_, workspace);
...@@ -422,7 +430,7 @@ void OrderedHashTable<IdType>::FillWithDuplicates( ...@@ -422,7 +430,7 @@ void OrderedHashTable<IdType>::FillWithDuplicates(
template <typename IdType> template <typename IdType>
void OrderedHashTable<IdType>::FillWithUnique( void OrderedHashTable<IdType>::FillWithUnique(
const IdType* const input, const size_t num_input, cudaStream_t stream) { const IdType* const input, const size_t num_input, hipStream_t stream) {
const int64_t num_tiles = (num_input + TILE_SIZE - 1) / TILE_SIZE; const int64_t num_tiles = (num_input + TILE_SIZE - 1) / TILE_SIZE;
const dim3 grid(num_tiles); const dim3 grid(num_tiles);
......
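FillWithDuplicates above uses the standard two-phase hipCUB idiom: call hipcub::DeviceScan::ExclusiveSum once with a null temporary buffer so it only reports the workspace size, allocate that many bytes, then call it again to run the scan. A self-contained sketch of the same idiom, using plain hipMalloc instead of DGL's workspace allocator, is:

// Two-phase hipCUB exclusive prefix sum (illustration, not DGL code).
#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>

#include <cstdio>
#include <cstdlib>
#include <vector>

#define HIP_CHECK(expr)                                                  \
  do {                                                                   \
    hipError_t e_ = (expr);                                              \
    if (e_ != hipSuccess) {                                              \
      std::fprintf(stderr, "HIP error: %s\n", hipGetErrorString(e_));    \
      std::exit(1);                                                      \
    }                                                                    \
  } while (0)

int main() {
  const int n = 8;
  const std::vector<int> h_in = {1, 2, 3, 4, 5, 6, 7, 8};

  int *d_in = nullptr, *d_out = nullptr;
  HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&d_in), n * sizeof(int)));
  HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&d_out), n * sizeof(int)));
  HIP_CHECK(hipMemcpy(d_in, h_in.data(), n * sizeof(int), hipMemcpyHostToDevice));

  // Phase 1: null workspace pointer -> hipCUB only reports the bytes needed.
  void* d_temp = nullptr;
  size_t temp_bytes = 0;
  HIP_CHECK(hipcub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, d_in, d_out, n));

  // Phase 2: allocate the workspace and run the scan for real.
  HIP_CHECK(hipMalloc(&d_temp, temp_bytes));
  HIP_CHECK(hipcub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, d_in, d_out, n));

  std::vector<int> h_out(n);
  HIP_CHECK(hipMemcpy(h_out.data(), d_out, n * sizeof(int), hipMemcpyDeviceToHost));
  for (int v : h_out) std::printf("%d ", v);  // prints: 0 1 3 6 10 15 21 28
  std::printf("\n");

  HIP_CHECK(hipFree(d_temp));
  HIP_CHECK(hipFree(d_in));
  HIP_CHECK(hipFree(d_out));
  return 0;
}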
// !!! This is a file automatically generated by hipify!!!
/*! /*!
* Copyright (c) 2022 by Contributors * Copyright (c) 2022 by Contributors
* *
...@@ -20,7 +21,7 @@ ...@@ -20,7 +21,7 @@
#ifndef DGL_RUNTIME_CUDA_GPU_CACHE_H_ #ifndef DGL_RUNTIME_CUDA_GPU_CACHE_H_
#define DGL_RUNTIME_CUDA_GPU_CACHE_H_ #define DGL_RUNTIME_CUDA_GPU_CACHE_H_
#include <cuda_runtime.h> #include <hip/hip_runtime.h>
#include <dgl/array.h> #include <dgl/array.h>
#include <dgl/aten/array_ops.h> #include <dgl/aten/array_ops.h>
#include <dgl/packed_func_ext.h> #include <dgl/packed_func_ext.h>
...@@ -31,7 +32,7 @@ ...@@ -31,7 +32,7 @@
#include <nv_gpu_cache.hpp> #include <nv_gpu_cache.hpp>
#include "../../runtime/cuda/cuda_common.h" #include "cuda_common.h"
namespace dgl { namespace dgl {
namespace runtime { namespace runtime {
...@@ -55,12 +56,12 @@ class GpuCache : public runtime::Object { ...@@ -55,12 +56,12 @@ class GpuCache : public runtime::Object {
: num_feats(num_feats), : num_feats(num_feats),
cache(std::make_unique<gpu_cache_t>( cache(std::make_unique<gpu_cache_t>(
(num_items + bucket_size - 1) / bucket_size, num_feats)) { (num_items + bucket_size - 1) / bucket_size, num_feats)) {
CUDA_CALL(cudaGetDevice(&cuda_device)); CUDA_CALL(hipGetDevice(&cuda_device));
} }
std::tuple<NDArray, IdArray, IdArray> Query(IdArray keys) { std::tuple<NDArray, IdArray, IdArray> Query(IdArray keys) {
const auto &ctx = keys->ctx; const auto &ctx = keys->ctx;
cudaStream_t stream = dgl::runtime::getCurrentCUDAStream(); hipStream_t stream = dgl::runtime::getCurrentHIPStreamMasqueradingAsCUDA();
auto device = dgl::runtime::DeviceAPI::Get(ctx); auto device = dgl::runtime::DeviceAPI::Get(ctx);
CHECK_EQ(ctx.device_type, kDGLCUDA) CHECK_EQ(ctx.device_type, kDGLCUDA)
<< "The keys should be on a CUDA device"; << "The keys should be on a CUDA device";
...@@ -94,7 +95,7 @@ class GpuCache : public runtime::Object { ...@@ -94,7 +95,7 @@ class GpuCache : public runtime::Object {
} }
void Replace(IdArray keys, NDArray values) { void Replace(IdArray keys, NDArray values) {
cudaStream_t stream = dgl::runtime::getCurrentCUDAStream(); hipStream_t stream = dgl::runtime::getCurrentHIPStreamMasqueradingAsCUDA();
CHECK_EQ(keys->ctx.device_type, kDGLCUDA) CHECK_EQ(keys->ctx.device_type, kDGLCUDA)
<< "The keys should be on a CUDA device"; << "The keys should be on a CUDA device";
CHECK_EQ(keys->ctx.device_id, cuda_device) CHECK_EQ(keys->ctx.device_id, cuda_device)
......
...@@ -121,7 +121,7 @@ bool RuntimeEnabled(const std::string& target) { ...@@ -121,7 +121,7 @@ bool RuntimeEnabled(const std::string& target) {
} else if (target.length() >= 5 && target.substr(0, 5) == "nvptx") { } else if (target.length() >= 5 && target.substr(0, 5) == "nvptx") {
f_name = "device_api.cuda"; f_name = "device_api.cuda";
} else if (target.length() >= 4 && target.substr(0, 4) == "rocm") { } else if (target.length() >= 4 && target.substr(0, 4) == "rocm") {
f_name = "device_api.rocm"; f_name = "device_api.cuda";
} else if (target.length() >= 4 && target.substr(0, 4) == "llvm") { } else if (target.length() >= 4 && target.substr(0, 4) == "llvm") {
const PackedFunc* pf = const PackedFunc* pf =
runtime::Registry::Get("codegen.llvm_target_enabled"); runtime::Registry::Get("codegen.llvm_target_enabled");
......
// !!! This is a file automatically generated by hipify!!!
/** /**
* Copyright (c) 2017-2022 by Contributors * Copyright (c) 2017-2022 by Contributors
* @file ndarray.cc * @file ndarray.cc
...@@ -26,7 +27,7 @@ constexpr DGLDataType DGLDataTypeTraits<uint64_t>::dtype; ...@@ -26,7 +27,7 @@ constexpr DGLDataType DGLDataTypeTraits<uint64_t>::dtype;
#ifdef DGL_USE_CUDA #ifdef DGL_USE_CUDA
constexpr DGLDataType DGLDataTypeTraits<__half>::dtype; constexpr DGLDataType DGLDataTypeTraits<__half>::dtype;
#if BF16_ENABLED #if BF16_ENABLED
constexpr DGLDataType DGLDataTypeTraits<__nv_bfloat16>::dtype; constexpr DGLDataType DGLDataTypeTraits<__hip_bfloat16>::dtype;
#endif // BF16_ENABLED #endif // BF16_ENABLED
#endif // DGL_USE_CUDA #endif // DGL_USE_CUDA
constexpr DGLDataType DGLDataTypeTraits<float>::dtype; constexpr DGLDataType DGLDataTypeTraits<float>::dtype;
...@@ -222,7 +223,7 @@ void NDArray::RecordedCopyFromTo( ...@@ -222,7 +223,7 @@ void NDArray::RecordedCopyFromTo(
CHECK(from->ctx.device_type != to->ctx.device_type) CHECK(from->ctx.device_type != to->ctx.device_type)
<< "Recoding event is only called for the copy between CPU and GPU."; << "Recoding event is only called for the copy between CPU and GPU.";
CHECK(from->ctx.device_type == kDGLCUDA || to->ctx.device_type == kDGLCUDA) CHECK(from->ctx.device_type == kDGLCUDA || to->ctx.device_type == kDGLROCM)
<< "At least one CUDA ctx needs to be involved."; << "At least one CUDA ctx needs to be involved.";
DeviceAPI::Get(kDGLCUDA)->RecordedCopyDataFromTo( DeviceAPI::Get(kDGLCUDA)->RecordedCopyDataFromTo(
...@@ -262,7 +263,7 @@ void NDArray::PinContainer(NDArray::Container* ptr) { ...@@ -262,7 +263,7 @@ void NDArray::PinContainer(NDArray::Container* ptr) {
void NDArray::UnpinContainer(NDArray::Container* ptr) { void NDArray::UnpinContainer(NDArray::Container* ptr) {
auto container_is_pinned = IsContainerPinned(ptr); auto container_is_pinned = IsContainerPinned(ptr);
// The tensor may be pinned outside of DGL via a different CUDA API, // The tensor may be pinned outside of DGL via a different CUDA API,
// so we cannot unpin it with cudaHostUnregister. // so we cannot unpin it with hipHostUnregister.
CHECK(ptr->pinned_by_dgl_ || !container_is_pinned) CHECK(ptr->pinned_by_dgl_ || !container_is_pinned)
<< "Cannot unpin a tensor that is pinned outside of DGL."; << "Cannot unpin a tensor that is pinned outside of DGL.";
// 1. not pinned, do nothing // 1. not pinned, do nothing
......
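The UnpinContainer comment above distinguishes tensors pinned by DGL through hipHostRegister from tensors pinned elsewhere. For context, a minimal sketch (not DGL code) of what that register/unregister pair does: it page-locks an existing host allocation so hipMemcpyAsync can copy from it without going through a pageable staging buffer, mirroring the PinData/UnpinData pair shown earlier in this commit.

// Pinning an ordinary host allocation with hipHostRegister (illustration).
#include <hip/hip_runtime.h>

#include <cstdio>
#include <cstdlib>
#include <vector>

#define HIP_CHECK(expr)                                                  \
  do {                                                                   \
    hipError_t e_ = (expr);                                              \
    if (e_ != hipSuccess) {                                              \
      std::fprintf(stderr, "HIP error: %s\n", hipGetErrorString(e_));    \
      std::exit(1);                                                      \
    }                                                                    \
  } while (0)

int main() {
  const size_t nbytes = 1 << 20;
  std::vector<char> host(nbytes, 0);

  // Page-lock (pin) memory that was allocated with an ordinary allocator.
  HIP_CHECK(hipHostRegister(host.data(), nbytes, hipHostRegisterDefault));

  char* dev = nullptr;
  HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&dev), nbytes));

  hipStream_t stream = nullptr;
  HIP_CHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));

  // A truly asynchronous host-to-device copy requires pinned host memory.
  HIP_CHECK(hipMemcpyAsync(dev, host.data(), nbytes, hipMemcpyHostToDevice, stream));
  HIP_CHECK(hipStreamSynchronize(stream));

  // Unpin before the host buffer is freed.
  HIP_CHECK(hipHostUnregister(host.data()));
  HIP_CHECK(hipStreamDestroy(stream));
  HIP_CHECK(hipFree(dev));
  std::printf("copied %zu pinned bytes\n", nbytes);
  return 0;
}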