Commit 6ac701f8 authored by sangwzh

update src and graphbolt code

parent 1547bd93
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2020 by Contributors
* @file graph/transform/cuda/knn.cu
* @brief k-nearest-neighbor (KNN) implementation (cuda)
*/
#include <curand_kernel.h>
#include <hiprand/hiprand_kernel.h>
#include <dgl/array.h>
#include <dgl/random.h>
#include <dgl/runtime/device_api.h>
#include <algorithm>
#include <cub/cub.cuh> // NOLINT
#include <hipcub/hipcub.hpp> // NOLINT
#include <limits>
#include <string>
#include <type_traits>
......@@ -467,7 +469,7 @@ void BruteForceKNNCuda(
const NDArray& data_points, const IdArray& data_offsets,
const NDArray& query_points, const IdArray& query_offsets, const int k,
IdArray result) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const auto& ctx = data_points->ctx;
auto device = runtime::DeviceAPI::Get(ctx);
const int64_t batch_size = data_offsets->shape[0] - 1;
......@@ -512,7 +514,7 @@ void BruteForceKNNSharedCuda(
const NDArray& data_points, const IdArray& data_offsets,
const NDArray& query_points, const IdArray& query_offsets, const int k,
IdArray result) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const auto& ctx = data_points->ctx;
auto device = runtime::DeviceAPI::Get(ctx);
const int64_t batch_size = data_offsets->shape[0] - 1;
......@@ -528,8 +530,8 @@ void BruteForceKNNSharedCuda(
// get max shared memory per block in bytes
// determine block size according to this value
int max_sharedmem_per_block = 0;
CUDA_CALL(cudaDeviceGetAttribute(
&max_sharedmem_per_block, cudaDevAttrMaxSharedMemoryPerBlock,
CUDA_CALL(hipDeviceGetAttribute(
&max_sharedmem_per_block, hipDeviceAttributeMaxSharedMemoryPerBlock,
ctx.device_id));
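// (On AMD GPUs this attribute reports the LDS available to a single workgroup,
// typically 64 KB on GCN/CDNA parts, so the block-size computation below adapts
// to the HIP device automatically. The hardware figure is a general expectation,
// not something asserted by this file.)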
const int64_t single_shared_mem = static_cast<int64_t>(Pow2Align<size_t>(
(k + 2 * feature_size) * sizeof(FloatType) + k * sizeof(IdType),
......@@ -552,17 +554,17 @@ void BruteForceKNNSharedCuda(
GetNumBlockPerSegment, temp_num_blocks, temp_block_size, 0, stream,
query_offsets_data, num_block_per_segment, batch_size, block_size);
size_t prefix_temp_size = 0;
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
nullptr, prefix_temp_size, num_block_per_segment, num_block_prefixsum,
batch_size, stream));
void* prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size);
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
prefix_temp, prefix_temp_size, num_block_per_segment, num_block_prefixsum,
batch_size, stream));
device->FreeWorkspace(ctx, prefix_temp);
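// (hipcub keeps CUB's two-phase convention: the first ExclusiveSum call, with a
// null workspace pointer, only writes the required temporary-storage size into
// prefix_temp_size; the second call performs the actual exclusive scan.)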
// wait for results
CUDA_CALL(cudaStreamSynchronize(stream));
CUDA_CALL(hipStreamSynchronize(stream));
int64_t num_blocks = 0, final_elem = 0,
copyoffset = (batch_size - 1) * sizeof(IdType);
......@@ -603,10 +605,10 @@ void BruteForceKNNSharedCuda(
/** @brief Setup rng state for nn-descent */
__global__ void SetupRngKernel(
curandState* states, const uint64_t seed, const size_t n) {
hiprandState_t* states, const uint64_t seed, const size_t n) {
size_t id = blockIdx.x * blockDim.x + threadIdx.x;
if (id < n) {
curand_init(seed, id, 0, states + id);
hiprand_init(seed, id, 0, states + id);
}
}
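// Launch sketch (hedged): `states` is a hypothetical device array holding one
// hiprandState_t per thread, `grid`/`block` are hypothetical launch dimensions,
// and CUDA_KERNEL_CALL is assumed to be in scope from cuda_common.h:
//   CUDA_KERNEL_CALL(SetupRngKernel, grid, block, 0, stream, states, seed, n);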
......@@ -622,8 +624,8 @@ __global__ void RandomInitNeighborsKernel(
const IdType point_idx = blockIdx.x * blockDim.x + threadIdx.x;
IdType batch_idx = 0;
if (point_idx >= offsets[batch_size]) return;
curandState state;
curand_init(seed, point_idx, 0, &state);
hiprandState_t state;
hiprand_init(seed, point_idx, 0, &state);
// find the segment location in the input batch
for (IdType b = 0; b < batch_size + 1; ++b) {
......@@ -646,7 +648,7 @@ __global__ void RandomInitNeighborsKernel(
current_central_nodes[i] = point_idx;
}
for (IdType i = k; i < segment_size; ++i) {
const IdType j = static_cast<IdType>(curand(&state) % (i + 1));
const IdType j = static_cast<IdType>(hiprand(&state) % (i + 1));
if (j < k) current_neighbors[j] = i + segment_start;
}
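// (This loop is the classic reservoir-sampling step: after item i is visited,
// every candidate seen so far occupies one of the k neighbor slots with equal
// probability, so the initial neighbor lists are uniform random subsets of the
// segment.)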
......@@ -674,8 +676,8 @@ __global__ void FindCandidatesKernel(
const IdType point_idx = blockIdx.x * blockDim.x + threadIdx.x;
IdType batch_idx = 0;
if (point_idx >= offsets[batch_size]) return;
curandState state;
curand_init(seed, point_idx, 0, &state);
hiprandState_t state;
hiprand_init(seed, point_idx, 0, &state);
// find the segment location in the input batch
for (IdType b = 0; b < batch_size + 1; ++b) {
......@@ -711,7 +713,7 @@ __global__ void FindCandidatesKernel(
if (curr_num < num_candidates) {
candidate_data[curr_num] = candidate;
} else {
IdType pos = static_cast<IdType>(curand(&state) % (curr_num + 1));
IdType pos = static_cast<IdType>(hiprand(&state) % (curr_num + 1));
if (pos < num_candidates) candidate_data[pos] = candidate;
}
++candidate_array[0];
......@@ -732,7 +734,7 @@ __global__ void FindCandidatesKernel(
if (curr_num < num_candidates) {
candidate_data[curr_num] = reverse_candidate;
} else {
IdType pos = static_cast<IdType>(curand(&state) % (curr_num + 1));
IdType pos = static_cast<IdType>(hiprand(&state) % (curr_num + 1));
if (pos < num_candidates) candidate_data[pos] = reverse_candidate;
}
++candidate_array[0];
......@@ -873,7 +875,7 @@ template <DGLDeviceType XPU, typename FloatType, typename IdType>
void NNDescent(
const NDArray& points, const IdArray& offsets, IdArray result, const int k,
const int num_iters, const int num_candidates, const double delta) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const auto& ctx = points->ctx;
auto device = runtime::DeviceAPI::Get(ctx);
const int64_t num_nodes = points->shape[0];
......@@ -887,7 +889,7 @@ void NNDescent(
uint64_t seed;
int warp_size = 0;
CUDA_CALL(
cudaDeviceGetAttribute(&warp_size, cudaDevAttrWarpSize, ctx.device_id));
hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, ctx.device_id));
// We don't need large block sizes, since there's not much inter-thread
// communication
int64_t block_size = warp_size;
......@@ -911,7 +913,7 @@ void NNDescent(
IdType* total_num_updates_d =
static_cast<IdType*>(device->AllocWorkspace(ctx, sizeof(IdType)));
CUDA_CALL(cub::DeviceReduce::Sum(
CUDA_CALL(hipcub::DeviceReduce::Sum(
nullptr, sum_temp_size, num_updates, total_num_updates_d, num_nodes,
stream));
IdType* sum_temp_storage =
......@@ -942,7 +944,7 @@ void NNDescent(
feature_size);
total_num_updates = 0;
CUDA_CALL(cub::DeviceReduce::Sum(
CUDA_CALL(hipcub::DeviceReduce::Sum(
sum_temp_storage, sum_temp_size, num_updates, total_num_updates_d,
num_nodes, stream));
device->CopyDataFromTo(
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2018 by Contributors
* @file graph/traversal.cc
* @brief Graph traversal implementation
*/
#include "./traversal.h"
#include "traversal.h"
#include <dgl/packed_func_ext.h>
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2019 by Contributors
* @file graph/unit_graph.cc
* @brief UnitGraph graph implementation
*/
#include "./unit_graph.h"
#include "unit_graph.h"
#include <dgl/array.h>
#include <dgl/base_heterograph.h>
......@@ -11,7 +12,7 @@
#include <dgl/lazy.h>
#include "../c_api_common.h"
#include "./serialize/dglstream.h"
#include "serialize/dglstream.h"
namespace dgl {
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2021 by Contributors
* @file ndarray_partition.h
......@@ -6,7 +8,7 @@
#include <dgl/runtime/device_api.h>
#include <cub/cub.cuh>
#include <hipcub/hipcub.hpp>
#include "../../runtime/cuda/cuda_common.h"
#include "../../runtime/workspace.h"
......@@ -239,7 +241,7 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRemainder(
const auto& ctx = in_idx->ctx;
auto device = DeviceAPI::Get(ctx);
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const int64_t num_in = in_idx->shape[0];
......@@ -263,7 +265,7 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRemainder(
}
const int64_t part_bits =
static_cast<int64_t>(std::ceil(std::log2(num_parts)));
static_cast<int64_t>(::ceil(std::log2(num_parts)));
// First, generate a mapping of indexes to processors
Workspace<IdType> proc_id_in(device, ctx, num_in);
......@@ -295,13 +297,13 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRemainder(
IdArray perm_in = aten::Range(0, num_in, sizeof(IdType) * 8, ctx);
size_t sort_workspace_size;
CUDA_CALL(cub::DeviceRadixSort::SortPairs(
CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
nullptr, sort_workspace_size, proc_id_in.get(), proc_id_out.get(),
static_cast<IdType*>(perm_in->data), perm_out, num_in, 0, part_bits,
stream));
Workspace<void> sort_workspace(device, ctx, sort_workspace_size);
CUDA_CALL(cub::DeviceRadixSort::SortPairs(
CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
sort_workspace.get(), sort_workspace_size, proc_id_in.get(),
proc_id_out.get(), static_cast<IdType*>(perm_in->data), perm_out,
num_in, 0, part_bits, stream));
......@@ -317,7 +319,7 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRemainder(
static_assert(
sizeof(AtomicCount) == sizeof(*out_counts),
"AtomicCount must be the same width as int64_t for atomicAdd "
"in cub::DeviceHistogram::HistogramEven() to work");
"in hipcub::DeviceHistogram::HistogramEven() to work");
// TODO(dlasalle): Once https://github.com/NVIDIA/cub/pull/287 is merged,
// add a compile time check against the cub version to allow
......@@ -327,14 +329,14 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRemainder(
"value of int.";
size_t hist_workspace_size;
CUDA_CALL(cub::DeviceHistogram::HistogramEven(
CUDA_CALL(hipcub::DeviceHistogram::HistogramEven(
nullptr, hist_workspace_size, proc_id_out.get(),
reinterpret_cast<AtomicCount*>(out_counts), num_parts + 1,
static_cast<IdType>(0), static_cast<IdType>(num_parts),
static_cast<int>(num_in), stream));
Workspace<void> hist_workspace(device, ctx, hist_workspace_size);
CUDA_CALL(cub::DeviceHistogram::HistogramEven(
CUDA_CALL(hipcub::DeviceHistogram::HistogramEven(
hist_workspace.get(), hist_workspace_size, proc_id_out.get(),
reinterpret_cast<AtomicCount*>(out_counts), num_parts + 1,
static_cast<IdType>(0), static_cast<IdType>(num_parts),
......@@ -352,7 +354,7 @@ template std::pair<IdArray, IdArray> GeneratePermutationFromRemainder<
template <DGLDeviceType XPU, typename IdType>
IdArray MapToLocalFromRemainder(const int num_parts, IdArray global_idx) {
const auto& ctx = global_idx->ctx;
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
if (num_parts > 1) {
IdArray local_idx =
......@@ -387,7 +389,7 @@ IdArray MapToGlobalFromRemainder(
<< num_parts;
const auto& ctx = local_idx->ctx;
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
if (num_parts > 1) {
IdArray global_idx =
......@@ -423,7 +425,7 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRange(
const auto& ctx = in_idx->ctx;
auto device = DeviceAPI::Get(ctx);
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const int64_t num_in = in_idx->shape[0];
......@@ -447,7 +449,7 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRange(
}
const int64_t part_bits =
static_cast<int64_t>(std::ceil(std::log2(num_parts)));
static_cast<int64_t>(::ceil(std::log2(num_parts)));
// First, generate a mapping of indexes to processors
Workspace<IdType> proc_id_in(device, ctx, num_in);
......@@ -470,13 +472,13 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRange(
IdArray perm_in = aten::Range(0, num_in, sizeof(IdType) * 8, ctx);
size_t sort_workspace_size;
CUDA_CALL(cub::DeviceRadixSort::SortPairs(
CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
nullptr, sort_workspace_size, proc_id_in.get(), proc_id_out.get(),
static_cast<IdType*>(perm_in->data), perm_out, num_in, 0, part_bits,
stream));
Workspace<void> sort_workspace(device, ctx, sort_workspace_size);
CUDA_CALL(cub::DeviceRadixSort::SortPairs(
CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
sort_workspace.get(), sort_workspace_size, proc_id_in.get(),
proc_id_out.get(), static_cast<IdType*>(perm_in->data), perm_out,
num_in, 0, part_bits, stream));
......@@ -492,7 +494,7 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRange(
static_assert(
sizeof(AtomicCount) == sizeof(*out_counts),
"AtomicCount must be the same width as int64_t for atomicAdd "
"in cub::DeviceHistogram::HistogramEven() to work");
"in hipcub::DeviceHistogram::HistogramEven() to work");
// TODO(dlasalle): Once https://github.com/NVIDIA/cub/pull/287 is merged,
// add a compile time check against the cub version to allow
......@@ -502,14 +504,14 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRange(
"value of int.";
size_t hist_workspace_size;
CUDA_CALL(cub::DeviceHistogram::HistogramEven(
CUDA_CALL(hipcub::DeviceHistogram::HistogramEven(
nullptr, hist_workspace_size, proc_id_out.get(),
reinterpret_cast<AtomicCount*>(out_counts), num_parts + 1,
static_cast<IdType>(0), static_cast<IdType>(num_parts),
static_cast<int>(num_in), stream));
Workspace<void> hist_workspace(device, ctx, hist_workspace_size);
CUDA_CALL(cub::DeviceHistogram::HistogramEven(
CUDA_CALL(hipcub::DeviceHistogram::HistogramEven(
hist_workspace.get(), hist_workspace_size, proc_id_out.get(),
reinterpret_cast<AtomicCount*>(out_counts), num_parts + 1,
static_cast<IdType>(0), static_cast<IdType>(num_parts),
......@@ -536,7 +538,7 @@ template <DGLDeviceType XPU, typename IdType, typename RangeType>
IdArray MapToLocalFromRange(
const int num_parts, IdArray range, IdArray global_idx) {
const auto& ctx = global_idx->ctx;
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
if (num_parts > 1 && global_idx->shape[0] > 0) {
IdArray local_idx =
......@@ -576,7 +578,7 @@ IdArray MapToGlobalFromRange(
<< num_parts;
const auto& ctx = local_idx->ctx;
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
if (num_parts > 1 && local_idx->shape[0] > 0) {
IdArray global_idx =
......
......@@ -39,7 +39,7 @@ class RemainderPartition : public NDArrayPartition {
IdArray in_idx) const override {
#ifdef DGL_USE_CUDA
auto ctx = in_idx->ctx;
if (ctx.device_type == kDGLCUDA) {
if (ctx.device_type == kDGLCUDA || ctx.device_type == kDGLROCM) {
ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, {
return impl::GeneratePermutationFromRemainder<kDGLCUDA, IdType>(
ArraySize(), NumParts(), in_idx);
......@@ -56,7 +56,7 @@ class RemainderPartition : public NDArrayPartition {
IdArray MapToLocal(IdArray in_idx) const override {
#ifdef DGL_USE_CUDA
auto ctx = in_idx->ctx;
if (ctx.device_type == kDGLCUDA) {
if (ctx.device_type == kDGLCUDA || ctx.device_type == kDGLROCM) {
ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, {
return impl::MapToLocalFromRemainder<kDGLCUDA, IdType>(
NumParts(), in_idx);
......@@ -73,7 +73,7 @@ class RemainderPartition : public NDArrayPartition {
IdArray MapToGlobal(IdArray in_idx, const int part_id) const override {
#ifdef DGL_USE_CUDA
auto ctx = in_idx->ctx;
if (ctx.device_type == kDGLCUDA) {
if (ctx.device_type == kDGLCUDA || ctx.device_type == kDGLROCM) {
ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, {
return impl::MapToGlobalFromRemainder<kDGLCUDA, IdType>(
NumParts(), in_idx, part_id);
......@@ -118,7 +118,7 @@ class RangePartition : public NDArrayPartition {
IdArray in_idx) const override {
#ifdef DGL_USE_CUDA
auto ctx = in_idx->ctx;
if (ctx.device_type == kDGLCUDA) {
if (ctx.device_type == kDGLCUDA || ctx.device_type == kDGLROCM) {
if (ctx.device_type != range_->ctx.device_type ||
ctx.device_id != range_->ctx.device_id) {
LOG(FATAL) << "The range for the NDArrayPartition and the input "
......@@ -144,7 +144,7 @@ class RangePartition : public NDArrayPartition {
IdArray MapToLocal(IdArray in_idx) const override {
#ifdef DGL_USE_CUDA
auto ctx = in_idx->ctx;
if (ctx.device_type == kDGLCUDA) {
if (ctx.device_type == kDGLCUDA || ctx.device_type == kDGLROCM) {
ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, {
ATEN_ID_TYPE_SWITCH(range_->dtype, RangeType, {
return impl::MapToLocalFromRange<kDGLCUDA, IdType, RangeType>(
......@@ -163,7 +163,7 @@ class RangePartition : public NDArrayPartition {
IdArray MapToGlobal(IdArray in_idx, const int part_id) const override {
#ifdef DGL_USE_CUDA
auto ctx = in_idx->ctx;
if (ctx.device_type == kDGLCUDA) {
if (ctx.device_type == kDGLCUDA || ctx.device_type == kDGLROCM) {
ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, {
ATEN_ID_TYPE_SWITCH(range_->dtype, RangeType, {
return impl::MapToGlobalFromRange<kDGLCUDA, IdType, RangeType>(
......
// !!! This is a file automatically generated by hipify!!!
/*!
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
* All rights reserved.
......@@ -24,13 +25,13 @@
#include <cmath>
#ifdef __NVCC__
#include <curand_kernel.h>
#ifdef __HIPCC__
#include <hiprand/hiprand_kernel.h>
#else
#include <random>
#include "pcg_random.hpp"
#endif // __CUDA_ARCH__
#endif // __HIPCC__
#ifndef M_SQRT1_2
#define M_SQRT1_2 0.707106781186547524401
......@@ -58,24 +59,24 @@ class continuous_seed {
c[1] = std::sin(pi * r / 2);
}
#ifdef __CUDA_ARCH__
#ifdef __HIP_DEVICE_COMPILE__
__device__ inline float uniform(const uint64_t t) const {
const uint64_t kCurandSeed = 999961; // Could be any random number.
curandStatePhilox4_32_10_t rng;
curand_init(kCurandSeed, s[0], t, &rng);
hiprandStatePhilox4_32_10_t rng;
hiprand_init(kCurandSeed, s[0], t, &rng);
float rnd;
if (s[0] != s[1]) {
rnd = c[0] * curand_normal(&rng);
curand_init(kCurandSeed, s[1], t, &rng);
rnd += c[1] * curand_normal(&rng);
rnd = c[0] * hiprand_normal(&rng);
hiprand_init(kCurandSeed, s[1], t, &rng);
rnd += c[1] * hiprand_normal(&rng);
rnd = normcdff(rnd);
} else {
rnd = curand_uniform(&rng);
rnd = hiprand_uniform(&rng);
}
return rnd;
}
#else
inline float uniform(const uint64_t t) const {
__host__ inline float uniform(const uint64_t t) const {
pcg32 ng0(s[0], t);
float rnd;
if (s[0] != s[1]) {
......@@ -91,7 +92,7 @@ class continuous_seed {
}
return rnd;
}
#endif // __CUDA_ARCH__
#endif // __HIP_DEVICE_COMPILE__
};
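// (Both branches implement the same continuous-seed scheme: the device path
// draws two Philox streams via hiprand and blends them with the weights c[0],
// c[1] before applying normcdff, while the host path performs the analogous
// blend with pcg32 generators. This is a reading of the code above, not a
// guarantee of bit-wise identical output across the two paths.)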
} // namespace random
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file rpc/rpc.cc
* @brief Implementation of RPC utilities used by both server and client sides.
*/
#if defined(__linux__)
#include "./rpc.h"
#include "rpc.h"
#include <dgl/array.h>
#include <dgl/packed_func_ext.h>
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file rpc/rpc.h
......@@ -19,9 +20,9 @@
#include <unordered_map>
#include <vector>
#include "./network/common.h"
#include "./rpc_msg.h"
#include "./server_state.h"
#include "network/common.h"
#include "rpc_msg.h"
#include "server_state.h"
#include "network/socket_communicator.h"
namespace dgl {
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2016-2022 by Contributors
* @file c_runtime_api.cc
......@@ -32,6 +33,8 @@ inline std::string DeviceName(int type) {
return "cpu";
case kDGLCUDA:
return "cuda";
case kDGLROCM:
return "cuda";
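// ROCm devices intentionally report the name "cuda" here so that existing
// "cuda"-keyed dispatch (streams, the device_api.cuda registration, PyTorch's
// HIP-masquerading-as-CUDA convention) keeps working; the port presumably
// relies on this rather than introducing a separate "rocm" name.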
// add more device here once supported
default:
LOG(FATAL) << "unknown type =" << type;
......@@ -122,13 +125,13 @@ void DeviceAPI::SyncStreamFromTo(
}
bool DeviceAPI::PinData(void* ptr, size_t nbytes) {
LOG(FATAL) << "Device does not support cudaHostRegister api.";
LOG(FATAL) << "Device does not support hipHostRegister api.";
return false;
}
void* DeviceAPI::AllocPinnedDataSpace(
size_t nbytes, void** ctx, void** deleter) {
LOG(FATAL) << "Device does not support cudaHostAlloc api.";
LOG(FATAL) << "Device does not support hipHostMalloc api.";
return nullptr;
}
......@@ -137,7 +140,7 @@ void DeviceAPI::FreePinnedDataSpace(void** deleter) {
}
void DeviceAPI::UnpinData(void* ptr) {
LOG(FATAL) << "Device does not support cudaHostUnregister api.";
LOG(FATAL) << "Device does not support hipHostUnregister api.";
}
} // namespace runtime
} // namespace dgl
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2017 by Contributors
* @file cuda_common.h
......@@ -6,10 +7,10 @@
#ifndef DGL_RUNTIME_CUDA_CUDA_COMMON_H_
#define DGL_RUNTIME_CUDA_CUDA_COMMON_H_
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <curand.h>
#include <cusparse.h>
#include <hipblas/hipblas.h>
#include <hip/hip_runtime.h>
#include <hiprand/hiprand.h>
#include <hipsparse/hipsparse.h>
#include <dgl/runtime/packed_func.h>
#include <memory>
......@@ -25,8 +26,8 @@ namespace runtime {
DGL's memory pool and the current cuda stream
runtime::CUDAWorkspaceAllocator allocator(ctx);
const auto stream = runtime::getCurrentCUDAStream();
const auto exec_policy = thrust::cuda::par_nosync(allocator).on(stream);
const auto stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const auto exec_policy = thrust::hip::par_nosync(allocator).on(stream);
now, one can pass exec_policy to thrust functions
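for example (a hedged sketch; d_keys and num_keys are hypothetical device data):
thrust::sort(exec_policy, d_keys, d_keys + num_keys);
the allocator then serves thrust's temporary buffers from DGL's workspace pool
instead of issuing a separate hipMalloc/hipFree for every call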
......@@ -79,112 +80,112 @@ inline bool is_zero<dim3>(dim3 size) {
#define CUDA_DRIVER_CALL(x) \
{ \
CUresult result = x; \
if (result != CUDA_SUCCESS && result != CUDA_ERROR_DEINITIALIZED) { \
hipError_t result = x; \
if (result != hipSuccess && result != hipErrorDeinitialized) { \
const char* msg; \
cuGetErrorName(result, &msg); \
msg = hipGetErrorName(result); \
LOG(FATAL) << "CUDAError: " #x " failed with error: " << msg; \
} \
}
#define CUDA_CALL(func) \
{ \
cudaError_t e = (func); \
CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \
<< "CUDA: " << cudaGetErrorString(e); \
hipError_t e = (func); \
CHECK(e == hipSuccess || e == hipErrorDeinitialized) \
<< "CUDA: " << hipGetErrorString(e); \
}
#define CUDA_KERNEL_CALL(kernel, nblks, nthrs, shmem, stream, ...) \
{ \
if (!dgl::runtime::is_zero((nblks)) && !dgl::runtime::is_zero((nthrs))) { \
(kernel)<<<(nblks), (nthrs), (shmem), (stream)>>>(__VA_ARGS__); \
cudaError_t e = cudaGetLastError(); \
CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \
<< "CUDA kernel launch error: " << cudaGetErrorString(e); \
hipLaunchKernelGGL((kernel), dim3((nblks)), dim3((nthrs)), (shmem), (stream), __VA_ARGS__); \
hipError_t e = hipGetLastError(); \
CHECK(e == hipSuccess || e == hipErrorDeinitialized) \
<< "CUDA kernel launch error: " << hipGetErrorString(e); \
} \
}
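// Usage sketch (hedged; MyKernel, grid, block and the args are hypothetical):
//   CUDA_KERNEL_CALL(MyKernel, grid, block, 0, stream, arg0, arg1);
// which expands to a guarded hipLaunchKernelGGL followed by a hipGetLastError
// check, and silently skips the launch when the grid or block is empty.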
#define CUSPARSE_CALL(func) \
{ \
cusparseStatus_t e = (func); \
CHECK(e == CUSPARSE_STATUS_SUCCESS) << "CUSPARSE ERROR: " << e; \
hipsparseStatus_t e = (func); \
CHECK(e == HIPSPARSE_STATUS_SUCCESS) << "CUSPARSE ERROR: " << e; \
}
#define CUBLAS_CALL(func) \
{ \
cublasStatus_t e = (func); \
CHECK(e == CUBLAS_STATUS_SUCCESS) << "CUBLAS ERROR: " << e; \
hipblasStatus_t e = (func); \
CHECK(e == HIPBLAS_STATUS_SUCCESS) << "CUBLAS ERROR: " << e; \
}
#define CURAND_CALL(func) \
{ \
curandStatus_t e = (func); \
CHECK(e == CURAND_STATUS_SUCCESS) \
hiprandStatus_t e = (func); \
CHECK(e == HIPRAND_STATUS_SUCCESS) \
<< "CURAND Error: " << dgl::runtime::curandGetErrorString(e) << " at " \
<< __FILE__ << ":" << __LINE__; \
}
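// Usage sketch (hedged; `gen`, `out` and `n` are hypothetical host-side
// variables, with `out` pointing to device memory):
//   CURAND_CALL(hiprandGenerateUniform(gen, out, n));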
inline const char* curandGetErrorString(curandStatus_t error) {
inline const char* curandGetErrorString(hiprandStatus_t error) {
switch (error) {
case CURAND_STATUS_SUCCESS:
return "CURAND_STATUS_SUCCESS";
case CURAND_STATUS_VERSION_MISMATCH:
return "CURAND_STATUS_VERSION_MISMATCH";
case CURAND_STATUS_NOT_INITIALIZED:
return "CURAND_STATUS_NOT_INITIALIZED";
case CURAND_STATUS_ALLOCATION_FAILED:
return "CURAND_STATUS_ALLOCATION_FAILED";
case CURAND_STATUS_TYPE_ERROR:
return "CURAND_STATUS_TYPE_ERROR";
case CURAND_STATUS_OUT_OF_RANGE:
return "CURAND_STATUS_OUT_OF_RANGE";
case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
case CURAND_STATUS_LAUNCH_FAILURE:
return "CURAND_STATUS_LAUNCH_FAILURE";
case CURAND_STATUS_PREEXISTING_FAILURE:
return "CURAND_STATUS_PREEXISTING_FAILURE";
case CURAND_STATUS_INITIALIZATION_FAILED:
return "CURAND_STATUS_INITIALIZATION_FAILED";
case CURAND_STATUS_ARCH_MISMATCH:
return "CURAND_STATUS_ARCH_MISMATCH";
case CURAND_STATUS_INTERNAL_ERROR:
return "CURAND_STATUS_INTERNAL_ERROR";
case HIPRAND_STATUS_SUCCESS:
return "HIPRAND_STATUS_SUCCESS";
case HIPRAND_STATUS_VERSION_MISMATCH:
return "HIPRAND_STATUS_VERSION_MISMATCH";
case HIPRAND_STATUS_NOT_INITIALIZED:
return "HIPRAND_STATUS_NOT_INITIALIZED";
case HIPRAND_STATUS_ALLOCATION_FAILED:
return "HIPRAND_STATUS_ALLOCATION_FAILED";
case HIPRAND_STATUS_TYPE_ERROR:
return "HIPRAND_STATUS_TYPE_ERROR";
case HIPRAND_STATUS_OUT_OF_RANGE:
return "HIPRAND_STATUS_OUT_OF_RANGE";
case HIPRAND_STATUS_LENGTH_NOT_MULTIPLE:
return "HIPRAND_STATUS_LENGTH_NOT_MULTIPLE";
case HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED:
return "HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED";
case HIPRAND_STATUS_LAUNCH_FAILURE:
return "HIPRAND_STATUS_LAUNCH_FAILURE";
case HIPRAND_STATUS_PREEXISTING_FAILURE:
return "HIPRAND_STATUS_PREEXISTING_FAILURE";
case HIPRAND_STATUS_INITIALIZATION_FAILED:
return "HIPRAND_STATUS_INITIALIZATION_FAILED";
case HIPRAND_STATUS_ARCH_MISMATCH:
return "HIPRAND_STATUS_ARCH_MISMATCH";
case HIPRAND_STATUS_INTERNAL_ERROR:
return "HIPRAND_STATUS_INTERNAL_ERROR";
}
// To suppress compiler warning.
return "Unrecognized curand error string";
return "Unrecognized hiprand error string";
}
/**
* @brief Cast data type to cudaDataType_t.
* @brief Cast data type to hipDataType.
*/
template <typename T>
struct cuda_dtype {
static constexpr cudaDataType_t value = CUDA_R_32F;
static constexpr hipDataType value = HIP_R_32F;
};
template <>
struct cuda_dtype<__half> {
static constexpr cudaDataType_t value = CUDA_R_16F;
static constexpr hipDataType value = HIP_R_16F;
};
#if BF16_ENABLED
template <>
struct cuda_dtype<__nv_bfloat16> {
static constexpr cudaDataType_t value = CUDA_R_16BF;
struct cuda_dtype<__hip_bfloat16> {
static constexpr hipDataType value = HIP_R_16BF;
};
#endif // BF16_ENABLED
template <>
struct cuda_dtype<float> {
static constexpr cudaDataType_t value = CUDA_R_32F;
static constexpr hipDataType value = HIP_R_32F;
};
template <>
struct cuda_dtype<double> {
static constexpr cudaDataType_t value = CUDA_R_64F;
static constexpr hipDataType value = HIP_R_64F;
};
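// Usage sketch (hedged): the trait maps a compile-time element type to the
// runtime hipDataType tag expected by datatype-tagged APIs such as
// hipsparseSpMM, e.g.
//   const hipDataType dt = cuda_dtype<DType>::value;  // DType: template param
// so a single templated wrapper can cover float, double, __half and bfloat16.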
/*
......@@ -202,7 +203,7 @@ struct accum_dtype<__half> {
#if BF16_ENABLED
template <>
struct accum_dtype<__nv_bfloat16> {
struct accum_dtype<__hip_bfloat16> {
typedef float type;
};
#endif // BF16_ENABLED
......@@ -217,23 +218,23 @@ struct accum_dtype<double> {
typedef double type;
};
#if CUDART_VERSION >= 11000
#if DTKRT_VERSION >= 11000
/**
* @brief Cast index data type to cusparseIndexType_t.
* @brief Cast index data type to hipsparseIndexType_t.
*/
template <typename T>
struct cusparse_idtype {
static constexpr cusparseIndexType_t value = CUSPARSE_INDEX_32I;
static constexpr hipsparseIndexType_t value = HIPSPARSE_INDEX_32I;
};
template <>
struct cusparse_idtype<int32_t> {
static constexpr cusparseIndexType_t value = CUSPARSE_INDEX_32I;
static constexpr hipsparseIndexType_t value = HIPSPARSE_INDEX_32I;
};
template <>
struct cusparse_idtype<int64_t> {
static constexpr cusparseIndexType_t value = CUSPARSE_INDEX_64I;
static constexpr hipsparseIndexType_t value = HIPSPARSE_INDEX_64I;
};
#endif
......@@ -241,9 +242,9 @@ struct cusparse_idtype<int64_t> {
class CUDAThreadEntry {
public:
/** @brief The cusparse handler */
cusparseHandle_t cusparse_handle{nullptr};
hipsparseHandle_t cusparse_handle{nullptr};
/** @brief The cublas handler */
cublasHandle_t cublas_handle{nullptr};
hipblasHandle_t cublas_handle{nullptr};
/** @brief thread local pool*/
WorkspacePool pool;
/** @brief constructor */
......@@ -253,7 +254,7 @@ class CUDAThreadEntry {
};
/** @brief Get the current CUDA stream */
cudaStream_t getCurrentCUDAStream();
hipStream_t getCurrentHIPStreamMasqueradingAsCUDA();
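// Note: the "MasqueradingAsCUDA" suffix follows PyTorch's ROCm build, where HIP
// streams and devices are exposed under the "cuda" device type; the function
// still returns the current HIP stream for the calling thread.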
} // namespace runtime
} // namespace dgl
#endif // DGL_RUNTIME_CUDA_CUDA_COMMON_H_
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2017-2022 by Contributors
* @file cuda_device_api.cc
* @brief GPU specific API
*/
#include <cuda_runtime.h>
#include <hip/hip_runtime.h>
#include <dgl/runtime/device_api.h>
#include <dgl/runtime/registry.h>
#include <dgl/runtime/tensordispatch.h>
......@@ -18,13 +19,13 @@ class CUDADeviceAPI final : public DeviceAPI {
public:
CUDADeviceAPI() {
int count;
auto err = cudaGetDeviceCount(&count);
auto err = hipGetDeviceCount(&count);
switch (err) {
case cudaSuccess:
case hipSuccess:
break;
default:
count = 0;
cudaGetLastError();
hipGetLastError();
}
is_available_ = count > 0;
}
......@@ -32,67 +33,68 @@ class CUDADeviceAPI final : public DeviceAPI {
bool IsAvailable() final { return is_available_; }
void SetDevice(DGLContext ctx) final {
CUDA_CALL(cudaSetDevice(ctx.device_id));
CUDA_CALL(hipSetDevice(ctx.device_id));
}
void GetAttr(DGLContext ctx, DeviceAttrKind kind, DGLRetValue* rv) final {
int value = 0;
switch (kind) {
case kExist:
value =
(cudaDeviceGetAttribute(
&value, cudaDevAttrMaxThreadsPerBlock, ctx.device_id) ==
cudaSuccess);
(hipDeviceGetAttribute(
&value, hipDeviceAttributeMaxThreadsPerBlock, ctx.device_id) ==
hipSuccess);
break;
case kMaxThreadsPerBlock: {
CUDA_CALL(cudaDeviceGetAttribute(
&value, cudaDevAttrMaxThreadsPerBlock, ctx.device_id));
CUDA_CALL(hipDeviceGetAttribute(
&value, hipDeviceAttributeMaxThreadsPerBlock, ctx.device_id));
break;
}
case kWarpSize: {
CUDA_CALL(
cudaDeviceGetAttribute(&value, cudaDevAttrWarpSize, ctx.device_id));
hipDeviceGetAttribute(&value, hipDeviceAttributeWarpSize, ctx.device_id));
break;
}
case kMaxSharedMemoryPerBlock: {
CUDA_CALL(cudaDeviceGetAttribute(
&value, cudaDevAttrMaxSharedMemoryPerBlock, ctx.device_id));
CUDA_CALL(hipDeviceGetAttribute(
&value, hipDeviceAttributeMaxSharedMemoryPerBlock, ctx.device_id));
break;
}
case kComputeVersion: {
std::ostringstream os;
CUDA_CALL(cudaDeviceGetAttribute(
&value, cudaDevAttrComputeCapabilityMajor, ctx.device_id));
CUDA_CALL(hipDeviceGetAttribute(
&value, hipDeviceAttributeComputeCapabilityMajor, ctx.device_id));
os << value << ".";
CUDA_CALL(cudaDeviceGetAttribute(
&value, cudaDevAttrComputeCapabilityMinor, ctx.device_id));
CUDA_CALL(hipDeviceGetAttribute(
&value, hipDeviceAttributeComputeCapabilityMinor, ctx.device_id));
os << value;
*rv = os.str();
return;
}
case kDeviceName: {
cudaDeviceProp props;
CUDA_CALL(cudaGetDeviceProperties(&props, ctx.device_id));
hipDeviceProp_t props;
CUDA_CALL(hipGetDeviceProperties(&props, ctx.device_id));
*rv = std::string(props.name);
// printf("******* debug: device.name:%s\n ",std::string(props.name).c_str());
return;
}
case kMaxClockRate: {
CUDA_CALL(cudaDeviceGetAttribute(
&value, cudaDevAttrClockRate, ctx.device_id));
CUDA_CALL(hipDeviceGetAttribute(
&value, hipDeviceAttributeClockRate, ctx.device_id));
break;
}
case kMultiProcessorCount: {
CUDA_CALL(cudaDeviceGetAttribute(
&value, cudaDevAttrMultiProcessorCount, ctx.device_id));
CUDA_CALL(hipDeviceGetAttribute(
&value, hipDeviceAttributeMultiprocessorCount, ctx.device_id));
break;
}
case kMaxThreadDimensions: {
int dims[3];
CUDA_CALL(cudaDeviceGetAttribute(
&dims[0], cudaDevAttrMaxBlockDimX, ctx.device_id));
CUDA_CALL(cudaDeviceGetAttribute(
&dims[1], cudaDevAttrMaxBlockDimY, ctx.device_id));
CUDA_CALL(cudaDeviceGetAttribute(
&dims[2], cudaDevAttrMaxBlockDimZ, ctx.device_id));
CUDA_CALL(hipDeviceGetAttribute(
&dims[0], hipDeviceAttributeMaxBlockDimX, ctx.device_id));
CUDA_CALL(hipDeviceGetAttribute(
&dims[1], hipDeviceAttributeMaxBlockDimY, ctx.device_id));
CUDA_CALL(hipDeviceGetAttribute(
&dims[2], hipDeviceAttributeMaxBlockDimZ, ctx.device_id));
std::stringstream ss; // use json string to return multiple int values;
ss << "[" << dims[0] << ", " << dims[1] << ", " << dims[2] << "]";
......@@ -110,11 +112,11 @@ class CUDADeviceAPI final : public DeviceAPI {
TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global();
if (tensor_dispatcher->IsAvailable()) {
return tensor_dispatcher->CUDAAllocWorkspace(
nbytes, getCurrentCUDAStream());
nbytes, getCurrentHIPStreamMasqueradingAsCUDA());
}
CHECK_EQ(256 % alignment, 0U) << "CUDA space is aligned at 256 bytes";
void* ret;
CUDA_CALL(cudaMalloc(&ret, nbytes));
CUDA_CALL(hipMalloc(&ret, nbytes));
return ret;
}
......@@ -124,32 +126,32 @@ class CUDADeviceAPI final : public DeviceAPI {
if (tensor_dispatcher->IsAvailable()) {
return tensor_dispatcher->CUDAFreeWorkspace(ptr);
}
CUDA_CALL(cudaFree(ptr));
CUDA_CALL(hipFree(ptr));
}
void CopyDataFromTo(
const void* from, size_t from_offset, void* to, size_t to_offset,
size_t size, DGLContext ctx_from, DGLContext ctx_to,
DGLDataType type_hint, DGLStreamHandle stream) {
cudaStream_t cu_stream = static_cast<cudaStream_t>(stream);
hipStream_t cu_stream = static_cast<hipStream_t>(stream);
from = static_cast<const char*>(from) + from_offset;
to = static_cast<char*>(to) + to_offset;
if (ctx_from.device_type == kDGLCUDA && ctx_to.device_type == kDGLCUDA) {
CUDA_CALL(cudaSetDevice(ctx_from.device_id));
if ((ctx_from.device_type == kDGLCUDA && ctx_to.device_type == kDGLCUDA) || (ctx_from.device_type == kDGLROCM && ctx_to.device_type == kDGLROCM)) {
CUDA_CALL(hipSetDevice(ctx_from.device_id));
if (ctx_from.device_id == ctx_to.device_id) {
GPUCopy(from, to, size, cudaMemcpyDeviceToDevice, cu_stream);
GPUCopy(from, to, size, hipMemcpyDeviceToDevice, cu_stream);
} else {
CUDA_CALL(cudaMemcpyPeerAsync(
CUDA_CALL(hipMemcpyPeerAsync(
to, ctx_to.device_id, from, ctx_from.device_id, size, cu_stream));
}
} else if (
ctx_from.device_type == kDGLCUDA && ctx_to.device_type == kDGLCPU) {
CUDA_CALL(cudaSetDevice(ctx_from.device_id));
GPUCopy(from, to, size, cudaMemcpyDeviceToHost, cu_stream);
(ctx_from.device_type == kDGLCUDA || ctx_from.device_type == kDGLROCM) && ctx_to.device_type == kDGLCPU) {
CUDA_CALL(hipSetDevice(ctx_from.device_id));
GPUCopy(from, to, size, hipMemcpyDeviceToHost, cu_stream);
} else if (
ctx_from.device_type == kDGLCPU && ctx_to.device_type == kDGLCUDA) {
CUDA_CALL(cudaSetDevice(ctx_to.device_id));
GPUCopy(from, to, size, cudaMemcpyHostToDevice, cu_stream);
ctx_from.device_type == kDGLCPU && (ctx_to.device_type == kDGLCUDA || ctx_to.device_type == kDGLROCM)) {
CUDA_CALL(hipSetDevice(ctx_to.device_id));
GPUCopy(from, to, size, hipMemcpyHostToDevice, cu_stream);
} else {
LOG(FATAL) << "expect copy from/to GPU or between GPU";
}
......@@ -166,9 +168,9 @@ class CUDADeviceAPI final : public DeviceAPI {
}
// To ensure correct behavior, `record_event` must be invoked anytime a
// pointer from PyTorch CachingHostAllocator is used in a cudaMemcpyAsync
// pointer from PyTorch CachingHostAllocator is used in a hipMemcpyAsync
// call. It provides a way to re-use freed pinned (page-locked) memory
// allocations and avoid device sync due to cudaFreeHost calls.
// allocations and avoid device sync due to hipHostFree calls.
void RecordedCopyDataFromTo(
void* from, size_t from_offset, void* to, size_t to_offset, size_t size,
DGLContext ctx_from, DGLContext ctx_to, DGLDataType type_hint,
......@@ -179,7 +181,7 @@ class CUDADeviceAPI final : public DeviceAPI {
stream);
auto tensor_dispatcher = TensorDispatcher::Global();
if (tensor_dispatcher->IsAvailable()) {
auto custream = static_cast<cudaStream_t>(stream);
auto custream = static_cast<hipStream_t>(stream);
void* ptr = ctx_to.device_type == kDGLCPU ? to : from;
int id =
ctx_to.device_type == kDGLCPU ? ctx_from.device_id : ctx_to.device_id;
......@@ -188,34 +190,34 @@ class CUDADeviceAPI final : public DeviceAPI {
}
DGLStreamHandle CreateStream(DGLContext ctx) {
CUDA_CALL(cudaSetDevice(ctx.device_id));
cudaStream_t retval;
CUDA_CALL(hipSetDevice(ctx.device_id));
hipStream_t retval;
// make sure the legacy default stream won't block on this stream
CUDA_CALL(cudaStreamCreateWithFlags(&retval, cudaStreamNonBlocking));
CUDA_CALL(hipStreamCreateWithFlags(&retval, hipStreamNonBlocking));
return static_cast<DGLStreamHandle>(retval);
}
void FreeStream(DGLContext ctx, DGLStreamHandle stream) {
CUDA_CALL(cudaSetDevice(ctx.device_id));
cudaStream_t cu_stream = static_cast<cudaStream_t>(stream);
CUDA_CALL(cudaStreamDestroy(cu_stream));
CUDA_CALL(hipSetDevice(ctx.device_id));
hipStream_t cu_stream = static_cast<hipStream_t>(stream);
CUDA_CALL(hipStreamDestroy(cu_stream));
}
void SyncStreamFromTo(
DGLContext ctx, DGLStreamHandle event_src, DGLStreamHandle event_dst) {
CUDA_CALL(cudaSetDevice(ctx.device_id));
cudaStream_t src_stream = static_cast<cudaStream_t>(event_src);
cudaStream_t dst_stream = static_cast<cudaStream_t>(event_dst);
cudaEvent_t evt;
CUDA_CALL(cudaEventCreate(&evt));
CUDA_CALL(cudaEventRecord(evt, src_stream));
CUDA_CALL(cudaStreamWaitEvent(dst_stream, evt, 0));
CUDA_CALL(cudaEventDestroy(evt));
CUDA_CALL(hipSetDevice(ctx.device_id));
hipStream_t src_stream = static_cast<hipStream_t>(event_src);
hipStream_t dst_stream = static_cast<hipStream_t>(event_dst);
hipEvent_t evt;
CUDA_CALL(hipEventCreate(&evt));
CUDA_CALL(hipEventRecord(evt, src_stream));
CUDA_CALL(hipStreamWaitEvent(dst_stream, evt, 0));
CUDA_CALL(hipEventDestroy(evt));
}
void StreamSync(DGLContext ctx, DGLStreamHandle stream) final {
CUDA_CALL(cudaSetDevice(ctx.device_id));
CUDA_CALL(cudaStreamSynchronize(static_cast<cudaStream_t>(stream)));
CUDA_CALL(hipSetDevice(ctx.device_id));
CUDA_CALL(hipStreamSynchronize(static_cast<hipStream_t>(stream)));
}
/** NOTE: If the backend is PyTorch, we will use PyTorch's stream management,
......@@ -227,10 +229,10 @@ class CUDADeviceAPI final : public DeviceAPI {
void SetStream(DGLContext ctx, DGLStreamHandle stream) final {}
DGLStreamHandle GetStream() const final {
return static_cast<DGLStreamHandle>(getCurrentCUDAStream());
return static_cast<DGLStreamHandle>(getCurrentHIPStreamMasqueradingAsCUDA());
}
/** NOTE: cudaHostRegister can be called from an arbitrary GPU device,
/** NOTE: hipHostRegister can be called from an arbitrary GPU device,
* so we don't need to specify a ctx.
* The pinned memory can be seen by all CUDA contexts,
* not just the one that performed the allocation
......@@ -244,13 +246,13 @@ class CUDADeviceAPI final : public DeviceAPI {
if (tensor_dispatcher->IsAvailable()) {
tensor_dispatcher->CUDAHostAllocatorEmptyCache();
}
CUDA_CALL(cudaHostRegister(ptr, nbytes, cudaHostRegisterDefault));
CUDA_CALL(hipHostRegister(ptr, nbytes, hipHostRegisterDefault));
return true;
}
void UnpinData(void* ptr) {
if (ptr == nullptr) return;
CUDA_CALL(cudaHostUnregister(ptr));
CUDA_CALL(hipHostUnregister(ptr));
}
void* AllocPinnedDataSpace(
......@@ -276,33 +278,33 @@ class CUDADeviceAPI final : public DeviceAPI {
// can't be a pinned tensor if CUDA context is unavailable.
if (!is_available_) return false;
cudaPointerAttributes attr;
cudaError_t status = cudaPointerGetAttributes(&attr, ptr);
hipPointerAttribute_t attr;
hipError_t status = hipPointerGetAttributes(&attr, ptr);
bool result = false;
switch (status) {
case cudaErrorInvalidValue:
case hipErrorInvalidValue:
// might be a normal CPU tensor in CUDA 10.2-
cudaGetLastError(); // clear error
hipGetLastError(); // clear error
break;
case cudaSuccess:
result = (attr.type == cudaMemoryTypeHost);
case hipSuccess:
result = (attr.type == hipMemoryTypeHost);
break;
case cudaErrorInitializationError:
case cudaErrorNoDevice:
case cudaErrorInsufficientDriver:
case cudaErrorInvalidDevice:
case hipErrorInitializationError:
case hipErrorNoDevice:
case hipErrorInsufficientDriver:
case hipErrorInvalidDevice:
// We don't want to fail in these particular cases since this function
// can be called when users only want to run on CPU even if CUDA API is
// enabled, or in a forked subprocess where CUDA context cannot be
// initialized. So we just mark the CUDA context to unavailable and
// return.
is_available_ = false;
cudaGetLastError(); // clear error
hipGetLastError(); // clear error
break;
default:
LOG(FATAL) << "error while determining memory status: "
<< cudaGetErrorString(status);
<< hipGetErrorString(status);
break;
}
......@@ -316,7 +318,7 @@ class CUDADeviceAPI final : public DeviceAPI {
TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global();
if (tensor_dispatcher->IsAvailable())
return tensor_dispatcher->CUDAAllocWorkspace(
size, getCurrentCUDAStream());
size, getCurrentHIPStreamMasqueradingAsCUDA());
return CUDAThreadEntry::ThreadLocal()->pool.AllocWorkspace(ctx, size);
}
......@@ -338,13 +340,13 @@ class CUDADeviceAPI final : public DeviceAPI {
private:
static void GPUCopy(
const void* from, void* to, size_t size, cudaMemcpyKind kind,
cudaStream_t stream) {
CUDA_CALL(cudaMemcpyAsync(to, from, size, kind, stream));
if (stream == 0 && kind == cudaMemcpyDeviceToHost) {
const void* from, void* to, size_t size, hipMemcpyKind kind,
hipStream_t stream) {
CUDA_CALL(hipMemcpyAsync(to, from, size, kind, stream));
if (stream == 0 && kind == hipMemcpyDeviceToHost) {
// only wait for the copy, when it's on the default stream, and it's to
// host memory
CUDA_CALL(cudaStreamSynchronize(stream));
CUDA_CALL(hipStreamSynchronize(stream));
}
}
......@@ -359,7 +361,7 @@ CUDAThreadEntry* CUDAThreadEntry::ThreadLocal() {
return CUDAThreadStore::Get();
}
cudaStream_t getCurrentCUDAStream() {
hipStream_t getCurrentHIPStreamMasqueradingAsCUDA() {
TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global();
if (tensor_dispatcher->IsAvailable())
return tensor_dispatcher->CUDAGetCurrentStream();
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2021 by Contributors
* @file runtime/cuda/cuda_device_common.cuh
......@@ -10,7 +12,7 @@
#include <dgl/runtime/c_runtime_api.h>
#include "cuda_common.h"
#include "cuda_runtime.h"
#include <hip/hip_runtime.h>
namespace dgl {
namespace runtime {
......@@ -228,7 +230,7 @@ class OrderedHashTable {
* @param stream The stream to use for initializing the hashtable.
*/
OrderedHashTable(
const size_t size, DGLContext ctx, cudaStream_t stream,
const size_t size, DGLContext ctx, hipStream_t stream,
const int scale = kDefaultScale);
/**
......@@ -252,7 +254,7 @@ class OrderedHashTable {
*/
void FillWithDuplicates(
const IdType* const input, const size_t num_input, IdType* const unique,
int64_t* const num_unique, cudaStream_t stream);
int64_t* const num_unique, hipStream_t stream);
/**
* @brief Fill the hashtable with an array of unique keys.
......@@ -262,7 +264,7 @@ class OrderedHashTable {
* @param stream The stream to perform operations on.
*/
void FillWithUnique(
const IdType* const input, const size_t num_input, cudaStream_t stream);
const IdType* const input, const size_t num_input, hipStream_t stream);
/**
* @brief Get a verison of the hashtable usable from device functions.
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2021 by Contributors
* @file runtime/cuda/cuda_device_common.cuh
......@@ -5,7 +7,7 @@
*/
#include <cassert>
#include <cub/cub.cuh> // NOLINT
#include <hipcub/hipcub.hpp> // NOLINT
#include "../../array/cuda/atomic.cuh"
#include "cuda_common.h"
......@@ -54,7 +56,9 @@ class MutableDeviceOrderedHashTable : public DeviceOrderedHashTable<IdType> {
* @return The mapping.
*/
inline __device__ Iterator Search(const IdType id) {
const IdType pos = SearchForPosition(id);
// const IdType pos = SearchForPosition(id);
const IdType pos = DeviceOrderedHashTable<IdType>::SearchForPosition(id);
return GetMutable(pos);
}
......@@ -95,12 +99,16 @@ class MutableDeviceOrderedHashTable : public DeviceOrderedHashTable<IdType> {
* @return An iterator to inserted mapping.
*/
inline __device__ Iterator Insert(const IdType id, const size_t index) {
size_t pos = Hash(id);
// size_t pos = Hash(id);
size_t pos = DeviceOrderedHashTable<IdType>::Hash(id);
// linearly scan for an empty slot or matching entry
IdType delta = 1;
while (!AttemptInsertAt(pos, id, index)) {
pos = Hash(pos + delta);
// pos = Hash(pos + delta);
pos = DeviceOrderedHashTable<IdType>::Hash(pos + delta);
delta += 1;
}
......@@ -246,7 +254,7 @@ __global__ void count_hashmap(
DeviceOrderedHashTable<IdType> table, IdType* const num_unique) {
assert(BLOCK_SIZE == blockDim.x);
using BlockReduce = typename cub::BlockReduce<IdType, BLOCK_SIZE>;
using BlockReduce = typename hipcub::BlockReduce<IdType, BLOCK_SIZE>;
using Mapping = typename DeviceOrderedHashTable<IdType>::Mapping;
const size_t block_start = TILE_SIZE * blockIdx.x;
......@@ -300,7 +308,7 @@ __global__ void compact_hashmap(
assert(BLOCK_SIZE == blockDim.x);
using FlagType = uint16_t;
using BlockScan = typename cub::BlockScan<FlagType, BLOCK_SIZE>;
using BlockScan = typename hipcub::BlockScan<FlagType, BLOCK_SIZE>;
using Mapping = typename DeviceOrderedHashTable<IdType>::Mapping;
constexpr const int32_t VALS_PER_THREAD = TILE_SIZE / BLOCK_SIZE;
......@@ -359,7 +367,7 @@ DeviceOrderedHashTable<IdType> OrderedHashTable<IdType>::DeviceHandle() const {
template <typename IdType>
OrderedHashTable<IdType>::OrderedHashTable(
const size_t size, DGLContext ctx, cudaStream_t stream, const int scale)
const size_t size, DGLContext ctx, hipStream_t stream, const int scale)
: table_(nullptr), size_(TableSize(size, scale)), ctx_(ctx) {
// make sure we will at least as many buckets as items.
CHECK_GT(scale, 0);
......@@ -368,7 +376,7 @@ OrderedHashTable<IdType>::OrderedHashTable(
table_ = static_cast<Mapping*>(
device->AllocWorkspace(ctx_, sizeof(Mapping) * size_));
CUDA_CALL(cudaMemsetAsync(
CUDA_CALL(hipMemsetAsync(
table_, DeviceOrderedHashTable<IdType>::kEmptyKey,
sizeof(Mapping) * size_, stream));
}
......@@ -382,7 +390,7 @@ OrderedHashTable<IdType>::~OrderedHashTable() {
template <typename IdType>
void OrderedHashTable<IdType>::FillWithDuplicates(
const IdType* const input, const size_t num_input, IdType* const unique,
int64_t* const num_unique, cudaStream_t stream) {
int64_t* const num_unique, hipStream_t stream) {
auto device = runtime::DeviceAPI::Get(ctx_);
const int64_t num_tiles = (num_input + TILE_SIZE - 1) / TILE_SIZE;
......@@ -404,12 +412,12 @@ void OrderedHashTable<IdType>::FillWithDuplicates(
input, num_input, device_table, item_prefix);
size_t workspace_bytes;
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
nullptr, workspace_bytes, static_cast<IdType*>(nullptr),
static_cast<IdType*>(nullptr), grid.x + 1, stream));
void* workspace = device->AllocWorkspace(ctx_, workspace_bytes);
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
workspace, workspace_bytes, item_prefix, item_prefix, grid.x + 1,
stream));
device->FreeWorkspace(ctx_, workspace);
......@@ -422,7 +430,7 @@ void OrderedHashTable<IdType>::FillWithDuplicates(
template <typename IdType>
void OrderedHashTable<IdType>::FillWithUnique(
const IdType* const input, const size_t num_input, cudaStream_t stream) {
const IdType* const input, const size_t num_input, hipStream_t stream) {
const int64_t num_tiles = (num_input + TILE_SIZE - 1) / TILE_SIZE;
const dim3 grid(num_tiles);
......
// !!! This is a file automatically generated by hipify!!!
/*!
* Copyright (c) 2022 by Contributors
*
......@@ -20,7 +21,7 @@
#ifndef DGL_RUNTIME_CUDA_GPU_CACHE_H_
#define DGL_RUNTIME_CUDA_GPU_CACHE_H_
#include <cuda_runtime.h>
#include <hip/hip_runtime.h>
#include <dgl/array.h>
#include <dgl/aten/array_ops.h>
#include <dgl/packed_func_ext.h>
......@@ -31,7 +32,7 @@
#include <nv_gpu_cache.hpp>
#include "../../runtime/cuda/cuda_common.h"
#include "cuda_common.h"
namespace dgl {
namespace runtime {
......@@ -55,12 +56,12 @@ class GpuCache : public runtime::Object {
: num_feats(num_feats),
cache(std::make_unique<gpu_cache_t>(
(num_items + bucket_size - 1) / bucket_size, num_feats)) {
CUDA_CALL(cudaGetDevice(&cuda_device));
CUDA_CALL(hipGetDevice(&cuda_device));
}
std::tuple<NDArray, IdArray, IdArray> Query(IdArray keys) {
const auto &ctx = keys->ctx;
cudaStream_t stream = dgl::runtime::getCurrentCUDAStream();
hipStream_t stream = dgl::runtime::getCurrentHIPStreamMasqueradingAsCUDA();
auto device = dgl::runtime::DeviceAPI::Get(ctx);
CHECK_EQ(ctx.device_type, kDGLCUDA)
<< "The keys should be on a CUDA device";
......@@ -94,7 +95,7 @@ class GpuCache : public runtime::Object {
}
void Replace(IdArray keys, NDArray values) {
cudaStream_t stream = dgl::runtime::getCurrentCUDAStream();
hipStream_t stream = dgl::runtime::getCurrentHIPStreamMasqueradingAsCUDA();
CHECK_EQ(keys->ctx.device_type, kDGLCUDA)
<< "The keys should be on a CUDA device";
CHECK_EQ(keys->ctx.device_id, cuda_device)
......
......@@ -121,7 +121,7 @@ bool RuntimeEnabled(const std::string& target) {
} else if (target.length() >= 5 && target.substr(0, 5) == "nvptx") {
f_name = "device_api.cuda";
} else if (target.length() >= 4 && target.substr(0, 4) == "rocm") {
f_name = "device_api.rocm";
f_name = "device_api.cuda";
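// In this port the ROCm device API appears to be registered under the CUDA
// name, so the "rocm" target resolves to the same "device_api.cuda" packed
// function (consistent with kDGLROCM reporting "cuda" elsewhere in this
// commit).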
} else if (target.length() >= 4 && target.substr(0, 4) == "llvm") {
const PackedFunc* pf =
runtime::Registry::Get("codegen.llvm_target_enabled");
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2017-2022 by Contributors
* @file ndarray.cc
......@@ -26,7 +27,7 @@ constexpr DGLDataType DGLDataTypeTraits<uint64_t>::dtype;
#ifdef DGL_USE_CUDA
constexpr DGLDataType DGLDataTypeTraits<__half>::dtype;
#if BF16_ENABLED
constexpr DGLDataType DGLDataTypeTraits<__nv_bfloat16>::dtype;
constexpr DGLDataType DGLDataTypeTraits<__hip_bfloat16>::dtype;
#endif // BF16_ENABLED
#endif // DGL_USE_CUDA
constexpr DGLDataType DGLDataTypeTraits<float>::dtype;
......@@ -222,7 +223,7 @@ void NDArray::RecordedCopyFromTo(
CHECK(from->ctx.device_type != to->ctx.device_type)
<< "Recording event is only called for the copy between CPU and GPU.";
CHECK(from->ctx.device_type == kDGLCUDA || to->ctx.device_type == kDGLCUDA)
CHECK(from->ctx.device_type == kDGLCUDA || from->ctx.device_type == kDGLROCM ||
to->ctx.device_type == kDGLCUDA || to->ctx.device_type == kDGLROCM)
<< "At least one CUDA ctx needs to be involved.";
DeviceAPI::Get(kDGLCUDA)->RecordedCopyDataFromTo(
......@@ -262,7 +263,7 @@ void NDArray::PinContainer(NDArray::Container* ptr) {
void NDArray::UnpinContainer(NDArray::Container* ptr) {
auto container_is_pinned = IsContainerPinned(ptr);
// The tensor may be pinned outside of DGL via a different CUDA API,
// so we cannot unpin it with cudaHostUnregister.
// so we cannot unpin it with hipHostUnregister.
CHECK(ptr->pinned_by_dgl_ || !container_is_pinned)
<< "Cannot unpin a tensor that is pinned outside of DGL.";
// 1. not pinned, do nothing
......