Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
dgl
Commits
6ac701f8
Commit
6ac701f8
authored
Sep 13, 2024
by
sangwzh
Browse files
update src and graphbolt code
parent
1547bd93
Changes
116
Hide whitespace changes
Inline
Side-by-side
Showing
16 changed files
with
272 additions
and
245 deletions
+272
-245
src/graph/transform/cuda/knn.hip
src/graph/transform/cuda/knn.hip
+24
-22
src/graph/traversal.cc
src/graph/traversal.cc
+2
-1
src/graph/unit_graph.cc
src/graph/unit_graph.cc
+3
-2
src/partition/cuda/partition_op.hip
src/partition/cuda/partition_op.hip
+21
-19
src/partition/ndarray_partition.cc
src/partition/ndarray_partition.cc
+6
-6
src/random/continuous_seed.h
src/random/continuous_seed.h
+13
-12
src/rpc/rpc.cc
src/rpc/rpc.cc
+2
-1
src/rpc/rpc.h
src/rpc/rpc.h
+4
-3
src/runtime/c_runtime_api.cc
src/runtime/c_runtime_api.cc
+6
-3
src/runtime/cuda/cuda_common.h
src/runtime/cuda/cuda_common.h
+67
-66
src/runtime/cuda/cuda_device_api.cc
src/runtime/cuda/cuda_device_api.cc
+87
-85
src/runtime/cuda/cuda_hashtable.cuh
src/runtime/cuda/cuda_hashtable.cuh
+6
-4
src/runtime/cuda/cuda_hashtable.hip
src/runtime/cuda/cuda_hashtable.hip
+20
-12
src/runtime/cuda/gpu_cache.hip
src/runtime/cuda/gpu_cache.hip
+6
-5
src/runtime/module.cc
src/runtime/module.cc
+1
-1
src/runtime/ndarray.cc
src/runtime/ndarray.cc
+4
-3
No files found.
src/graph/transform/cuda/knn.
cu
→
src/graph/transform/cuda/knn.
hip
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
/**
* Copyright (c) 2020 by Contributors
* Copyright (c) 2020 by Contributors
* @file graph/transform/cuda/knn.cu
* @file graph/transform/cuda/knn.cu
* @brief k-nearest-neighbor (KNN) implementation (cuda)
* @brief k-nearest-neighbor (KNN) implementation (cuda)
*/
*/
#include <
cu
rand_kernel.h>
#include <
hiprand/hip
rand_kernel.h>
#include <dgl/array.h>
#include <dgl/array.h>
#include <dgl/random.h>
#include <dgl/random.h>
#include <dgl/runtime/device_api.h>
#include <dgl/runtime/device_api.h>
#include <algorithm>
#include <algorithm>
#include <cub/cub.
cuh
> // NOLINT
#include <
hip
cub/
hip
cub.
hpp
> // NOLINT
#include <limits>
#include <limits>
#include <string>
#include <string>
#include <type_traits>
#include <type_traits>
...
@@ -467,7 +469,7 @@ void BruteForceKNNCuda(
...
@@ -467,7 +469,7 @@ void BruteForceKNNCuda(
const NDArray& data_points, const IdArray& data_offsets,
const NDArray& data_points, const IdArray& data_offsets,
const NDArray& query_points, const IdArray& query_offsets, const int k,
const NDArray& query_points, const IdArray& query_offsets, const int k,
IdArray result) {
IdArray result) {
cuda
Stream_t
stream
=
runtime
::
getCurrent
CUDA
Stream
();
hip
Stream_t stream = runtime::getCurrent
HIP
Stream
MasqueradingAsCUDA
();
const auto& ctx = data_points->ctx;
const auto& ctx = data_points->ctx;
auto device = runtime::DeviceAPI::Get(ctx);
auto device = runtime::DeviceAPI::Get(ctx);
const int64_t batch_size = data_offsets->shape[0] - 1;
const int64_t batch_size = data_offsets->shape[0] - 1;
...
@@ -512,7 +514,7 @@ void BruteForceKNNSharedCuda(
...
@@ -512,7 +514,7 @@ void BruteForceKNNSharedCuda(
const NDArray& data_points, const IdArray& data_offsets,
const NDArray& data_points, const IdArray& data_offsets,
const NDArray& query_points, const IdArray& query_offsets, const int k,
const NDArray& query_points, const IdArray& query_offsets, const int k,
IdArray result) {
IdArray result) {
cuda
Stream_t
stream
=
runtime
::
getCurrent
CUDA
Stream
();
hip
Stream_t stream = runtime::getCurrent
HIP
Stream
MasqueradingAsCUDA
();
const auto& ctx = data_points->ctx;
const auto& ctx = data_points->ctx;
auto device = runtime::DeviceAPI::Get(ctx);
auto device = runtime::DeviceAPI::Get(ctx);
const int64_t batch_size = data_offsets->shape[0] - 1;
const int64_t batch_size = data_offsets->shape[0] - 1;
...
@@ -528,8 +530,8 @@ void BruteForceKNNSharedCuda(
...
@@ -528,8 +530,8 @@ void BruteForceKNNSharedCuda(
// get max shared memory per block in bytes
// get max shared memory per block in bytes
// determine block size according to this value
// determine block size according to this value
int max_sharedmem_per_block = 0;
int max_sharedmem_per_block = 0;
CUDA_CALL
(
cuda
DeviceGetAttribute
(
CUDA_CALL(
hip
DeviceGetAttribute(
&
max_sharedmem_per_block
,
cudaDevAttr
MaxSharedMemoryPerBlock
,
&max_sharedmem_per_block,
hipDeviceAttribute
MaxSharedMemoryPerBlock,
ctx.device_id));
ctx.device_id));
const int64_t single_shared_mem = static_cast<int64_t>(Pow2Align<size_t>(
const int64_t single_shared_mem = static_cast<int64_t>(Pow2Align<size_t>(
(k + 2 * feature_size) * sizeof(FloatType) + k * sizeof(IdType),
(k + 2 * feature_size) * sizeof(FloatType) + k * sizeof(IdType),
...
@@ -552,17 +554,17 @@ void BruteForceKNNSharedCuda(
...
@@ -552,17 +554,17 @@ void BruteForceKNNSharedCuda(
GetNumBlockPerSegment, temp_num_blocks, temp_block_size, 0, stream,
GetNumBlockPerSegment, temp_num_blocks, temp_block_size, 0, stream,
query_offsets_data, num_block_per_segment, batch_size, block_size);
query_offsets_data, num_block_per_segment, batch_size, block_size);
size_t prefix_temp_size = 0;
size_t prefix_temp_size = 0;
CUDA_CALL
(
cub
::
DeviceScan
::
ExclusiveSum
(
CUDA_CALL(
hip
cub::DeviceScan::ExclusiveSum(
nullptr, prefix_temp_size, num_block_per_segment, num_block_prefixsum,
nullptr, prefix_temp_size, num_block_per_segment, num_block_prefixsum,
batch_size, stream));
batch_size, stream));
void* prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size);
void* prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size);
CUDA_CALL
(
cub
::
DeviceScan
::
ExclusiveSum
(
CUDA_CALL(
hip
cub::DeviceScan::ExclusiveSum(
prefix_temp, prefix_temp_size, num_block_per_segment, num_block_prefixsum,
prefix_temp, prefix_temp_size, num_block_per_segment, num_block_prefixsum,
batch_size, stream));
batch_size, stream));
device->FreeWorkspace(ctx, prefix_temp);
device->FreeWorkspace(ctx, prefix_temp);
// wait for results
// wait for results
CUDA_CALL
(
cuda
StreamSynchronize
(
stream
));
CUDA_CALL(
hip
StreamSynchronize(stream));
int64_t num_blocks = 0, final_elem = 0,
int64_t num_blocks = 0, final_elem = 0,
copyoffset = (batch_size - 1) * sizeof(IdType);
copyoffset = (batch_size - 1) * sizeof(IdType);
...
@@ -603,10 +605,10 @@ void BruteForceKNNSharedCuda(
...
@@ -603,10 +605,10 @@ void BruteForceKNNSharedCuda(
/** @brief Setup rng state for nn-descent */
/** @brief Setup rng state for nn-descent */
__global__ void SetupRngKernel(
__global__ void SetupRngKernel(
cu
randState
*
states
,
const
uint64_t
seed
,
const
size_t
n
)
{
hip
randState
_t
* states, const uint64_t seed, const size_t n) {
size_t id = blockIdx.x * blockDim.x + threadIdx.x;
size_t id = blockIdx.x * blockDim.x + threadIdx.x;
if (id < n) {
if (id < n) {
cu
rand_init
(
seed
,
id
,
0
,
states
+
id
);
hip
rand_init(seed, id, 0, states + id);
}
}
}
}
...
@@ -622,8 +624,8 @@ __global__ void RandomInitNeighborsKernel(
...
@@ -622,8 +624,8 @@ __global__ void RandomInitNeighborsKernel(
const IdType point_idx = blockIdx.x * blockDim.x + threadIdx.x;
const IdType point_idx = blockIdx.x * blockDim.x + threadIdx.x;
IdType batch_idx = 0;
IdType batch_idx = 0;
if (point_idx >= offsets[batch_size]) return;
if (point_idx >= offsets[batch_size]) return;
cu
randState
state
;
hip
randState
_t
state;
cu
rand_init
(
seed
,
point_idx
,
0
,
&
state
);
hip
rand_init(seed, point_idx, 0, &state);
// find the segment location in the input batch
// find the segment location in the input batch
for (IdType b = 0; b < batch_size + 1; ++b) {
for (IdType b = 0; b < batch_size + 1; ++b) {
...
@@ -646,7 +648,7 @@ __global__ void RandomInitNeighborsKernel(
...
@@ -646,7 +648,7 @@ __global__ void RandomInitNeighborsKernel(
current_central_nodes[i] = point_idx;
current_central_nodes[i] = point_idx;
}
}
for (IdType i = k; i < segment_size; ++i) {
for (IdType i = k; i < segment_size; ++i) {
const
IdType
j
=
static_cast
<
IdType
>
(
cu
rand
(
&
state
)
%
(
i
+
1
));
const IdType j = static_cast<IdType>(
hip
rand(&state) % (i + 1));
if (j < k) current_neighbors[j] = i + segment_start;
if (j < k) current_neighbors[j] = i + segment_start;
}
}
...
@@ -674,8 +676,8 @@ __global__ void FindCandidatesKernel(
...
@@ -674,8 +676,8 @@ __global__ void FindCandidatesKernel(
const IdType point_idx = blockIdx.x * blockDim.x + threadIdx.x;
const IdType point_idx = blockIdx.x * blockDim.x + threadIdx.x;
IdType batch_idx = 0;
IdType batch_idx = 0;
if (point_idx >= offsets[batch_size]) return;
if (point_idx >= offsets[batch_size]) return;
cu
randState
state
;
hip
randState
_t
state;
cu
rand_init
(
seed
,
point_idx
,
0
,
&
state
);
hip
rand_init(seed, point_idx, 0, &state);
// find the segment location in the input batch
// find the segment location in the input batch
for (IdType b = 0; b < batch_size + 1; ++b) {
for (IdType b = 0; b < batch_size + 1; ++b) {
...
@@ -711,7 +713,7 @@ __global__ void FindCandidatesKernel(
...
@@ -711,7 +713,7 @@ __global__ void FindCandidatesKernel(
if (curr_num < num_candidates) {
if (curr_num < num_candidates) {
candidate_data[curr_num] = candidate;
candidate_data[curr_num] = candidate;
} else {
} else {
IdType
pos
=
static_cast
<
IdType
>
(
cu
rand
(
&
state
)
%
(
curr_num
+
1
));
IdType pos = static_cast<IdType>(
hip
rand(&state) % (curr_num + 1));
if (pos < num_candidates) candidate_data[pos] = candidate;
if (pos < num_candidates) candidate_data[pos] = candidate;
}
}
++candidate_array[0];
++candidate_array[0];
...
@@ -732,7 +734,7 @@ __global__ void FindCandidatesKernel(
...
@@ -732,7 +734,7 @@ __global__ void FindCandidatesKernel(
if (curr_num < num_candidates) {
if (curr_num < num_candidates) {
candidate_data[curr_num] = reverse_candidate;
candidate_data[curr_num] = reverse_candidate;
} else {
} else {
IdType
pos
=
static_cast
<
IdType
>
(
cu
rand
(
&
state
)
%
(
curr_num
+
1
));
IdType pos = static_cast<IdType>(
hip
rand(&state) % (curr_num + 1));
if (pos < num_candidates) candidate_data[pos] = reverse_candidate;
if (pos < num_candidates) candidate_data[pos] = reverse_candidate;
}
}
++candidate_array[0];
++candidate_array[0];
...
@@ -873,7 +875,7 @@ template <DGLDeviceType XPU, typename FloatType, typename IdType>
...
@@ -873,7 +875,7 @@ template <DGLDeviceType XPU, typename FloatType, typename IdType>
void NNDescent(
void NNDescent(
const NDArray& points, const IdArray& offsets, IdArray result, const int k,
const NDArray& points, const IdArray& offsets, IdArray result, const int k,
const int num_iters, const int num_candidates, const double delta) {
const int num_iters, const int num_candidates, const double delta) {
cuda
Stream_t
stream
=
runtime
::
getCurrent
CUDA
Stream
();
hip
Stream_t stream = runtime::getCurrent
HIP
Stream
MasqueradingAsCUDA
();
const auto& ctx = points->ctx;
const auto& ctx = points->ctx;
auto device = runtime::DeviceAPI::Get(ctx);
auto device = runtime::DeviceAPI::Get(ctx);
const int64_t num_nodes = points->shape[0];
const int64_t num_nodes = points->shape[0];
...
@@ -887,7 +889,7 @@ void NNDescent(
...
@@ -887,7 +889,7 @@ void NNDescent(
uint64_t seed;
uint64_t seed;
int warp_size = 0;
int warp_size = 0;
CUDA_CALL(
CUDA_CALL(
cuda
DeviceGetAttribute
(
&
warp_size
,
cudaDevAttr
WarpSize
,
ctx
.
device_id
));
hip
DeviceGetAttribute(&warp_size,
hipDeviceAttribute
WarpSize, ctx.device_id));
// We don't need large block sizes, since there's not much inter-thread
// We don't need large block sizes, since there's not much inter-thread
// communication
// communication
int64_t block_size = warp_size;
int64_t block_size = warp_size;
...
@@ -911,7 +913,7 @@ void NNDescent(
...
@@ -911,7 +913,7 @@ void NNDescent(
IdType* total_num_updates_d =
IdType* total_num_updates_d =
static_cast<IdType*>(device->AllocWorkspace(ctx, sizeof(IdType)));
static_cast<IdType*>(device->AllocWorkspace(ctx, sizeof(IdType)));
CUDA_CALL
(
cub
::
DeviceReduce
::
Sum
(
CUDA_CALL(
hip
cub::DeviceReduce::Sum(
nullptr, sum_temp_size, num_updates, total_num_updates_d, num_nodes,
nullptr, sum_temp_size, num_updates, total_num_updates_d, num_nodes,
stream));
stream));
IdType* sum_temp_storage =
IdType* sum_temp_storage =
...
@@ -942,7 +944,7 @@ void NNDescent(
...
@@ -942,7 +944,7 @@ void NNDescent(
feature_size);
feature_size);
total_num_updates = 0;
total_num_updates = 0;
CUDA_CALL
(
cub
::
DeviceReduce
::
Sum
(
CUDA_CALL(
hip
cub::DeviceReduce::Sum(
sum_temp_storage, sum_temp_size, num_updates, total_num_updates_d,
sum_temp_storage, sum_temp_size, num_updates, total_num_updates_d,
num_nodes, stream));
num_nodes, stream));
device->CopyDataFromTo(
device->CopyDataFromTo(
...
...
src/graph/traversal.cc
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
/**
* Copyright (c) 2018 by Contributors
* Copyright (c) 2018 by Contributors
* @file graph/traversal.cc
* @file graph/traversal.cc
* @brief Graph traversal implementation
* @brief Graph traversal implementation
*/
*/
#include "
./
traversal.h"
#include "traversal.h"
#include <dgl/packed_func_ext.h>
#include <dgl/packed_func_ext.h>
...
...
src/graph/unit_graph.cc
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
/**
* Copyright (c) 2019 by Contributors
* Copyright (c) 2019 by Contributors
* @file graph/unit_graph.cc
* @file graph/unit_graph.cc
* @brief UnitGraph graph implementation
* @brief UnitGraph graph implementation
*/
*/
#include "
./
unit_graph.h"
#include "unit_graph.h"
#include <dgl/array.h>
#include <dgl/array.h>
#include <dgl/base_heterograph.h>
#include <dgl/base_heterograph.h>
...
@@ -11,7 +12,7 @@
...
@@ -11,7 +12,7 @@
#include <dgl/lazy.h>
#include <dgl/lazy.h>
#include "../c_api_common.h"
#include "../c_api_common.h"
#include "
./
serialize/dglstream.h"
#include "serialize/dglstream.h"
namespace
dgl
{
namespace
dgl
{
...
...
src/partition/cuda/partition_op.
cu
→
src/partition/cuda/partition_op.
hip
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
/**
* Copyright (c) 2021 by Contributors
* Copyright (c) 2021 by Contributors
* @file ndarray_partition.h
* @file ndarray_partition.h
...
@@ -6,7 +8,7 @@
...
@@ -6,7 +8,7 @@
#include <dgl/runtime/device_api.h>
#include <dgl/runtime/device_api.h>
#include <cub/cub.
cuh
>
#include <
hip
cub/
hip
cub.
hpp
>
#include "../../runtime/cuda/cuda_common.h"
#include "../../runtime/cuda/cuda_common.h"
#include "../../runtime/workspace.h"
#include "../../runtime/workspace.h"
...
@@ -239,7 +241,7 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRemainder(
...
@@ -239,7 +241,7 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRemainder(
const auto& ctx = in_idx->ctx;
const auto& ctx = in_idx->ctx;
auto device = DeviceAPI::Get(ctx);
auto device = DeviceAPI::Get(ctx);
cuda
Stream_t
stream
=
runtime
::
getCurrent
CUDA
Stream
();
hip
Stream_t stream = runtime::getCurrent
HIP
Stream
MasqueradingAsCUDA
();
const int64_t num_in = in_idx->shape[0];
const int64_t num_in = in_idx->shape[0];
...
@@ -263,7 +265,7 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRemainder(
...
@@ -263,7 +265,7 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRemainder(
}
}
const int64_t part_bits =
const int64_t part_bits =
static_cast
<
int64_t
>
(
std
::
ceil
(
std
::
log2
(
num_parts
)));
static_cast<int64_t>(::ceil(std::log2(num_parts)));
// First, generate a mapping of indexes to processors
// First, generate a mapping of indexes to processors
Workspace<IdType> proc_id_in(device, ctx, num_in);
Workspace<IdType> proc_id_in(device, ctx, num_in);
...
@@ -295,13 +297,13 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRemainder(
...
@@ -295,13 +297,13 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRemainder(
IdArray perm_in = aten::Range(0, num_in, sizeof(IdType) * 8, ctx);
IdArray perm_in = aten::Range(0, num_in, sizeof(IdType) * 8, ctx);
size_t sort_workspace_size;
size_t sort_workspace_size;
CUDA_CALL
(
cub
::
DeviceRadixSort
::
SortPairs
(
CUDA_CALL(
hip
cub::DeviceRadixSort::SortPairs(
nullptr, sort_workspace_size, proc_id_in.get(), proc_id_out.get(),
nullptr, sort_workspace_size, proc_id_in.get(), proc_id_out.get(),
static_cast<IdType*>(perm_in->data), perm_out, num_in, 0, part_bits,
static_cast<IdType*>(perm_in->data), perm_out, num_in, 0, part_bits,
stream));
stream));
Workspace<void> sort_workspace(device, ctx, sort_workspace_size);
Workspace<void> sort_workspace(device, ctx, sort_workspace_size);
CUDA_CALL
(
cub
::
DeviceRadixSort
::
SortPairs
(
CUDA_CALL(
hip
cub::DeviceRadixSort::SortPairs(
sort_workspace.get(), sort_workspace_size, proc_id_in.get(),
sort_workspace.get(), sort_workspace_size, proc_id_in.get(),
proc_id_out.get(), static_cast<IdType*>(perm_in->data), perm_out,
proc_id_out.get(), static_cast<IdType*>(perm_in->data), perm_out,
num_in, 0, part_bits, stream));
num_in, 0, part_bits, stream));
...
@@ -317,7 +319,7 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRemainder(
...
@@ -317,7 +319,7 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRemainder(
static_assert(
static_assert(
sizeof(AtomicCount) == sizeof(*out_counts),
sizeof(AtomicCount) == sizeof(*out_counts),
"AtomicCount must be the same width as int64_t for atomicAdd "
"AtomicCount must be the same width as int64_t for atomicAdd "
"in cub::DeviceHistogram::HistogramEven() to work"
);
"in
hip
cub::DeviceHistogram::HistogramEven() to work");
// TODO(dlasalle): Once https://github.com/NVIDIA/cub/pull/287 is merged,
// TODO(dlasalle): Once https://github.com/NVIDIA/cub/pull/287 is merged,
// add a compile time check against the cub version to allow
// add a compile time check against the cub version to allow
...
@@ -327,14 +329,14 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRemainder(
...
@@ -327,14 +329,14 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRemainder(
"value of int.";
"value of int.";
size_t hist_workspace_size;
size_t hist_workspace_size;
CUDA_CALL
(
cub
::
DeviceHistogram
::
HistogramEven
(
CUDA_CALL(
hip
cub::DeviceHistogram::HistogramEven(
nullptr, hist_workspace_size, proc_id_out.get(),
nullptr, hist_workspace_size, proc_id_out.get(),
reinterpret_cast<AtomicCount*>(out_counts), num_parts + 1,
reinterpret_cast<AtomicCount*>(out_counts), num_parts + 1,
static_cast<IdType>(0), static_cast<IdType>(num_parts),
static_cast<IdType>(0), static_cast<IdType>(num_parts),
static_cast<int>(num_in), stream));
static_cast<int>(num_in), stream));
Workspace<void> hist_workspace(device, ctx, hist_workspace_size);
Workspace<void> hist_workspace(device, ctx, hist_workspace_size);
CUDA_CALL
(
cub
::
DeviceHistogram
::
HistogramEven
(
CUDA_CALL(
hip
cub::DeviceHistogram::HistogramEven(
hist_workspace.get(), hist_workspace_size, proc_id_out.get(),
hist_workspace.get(), hist_workspace_size, proc_id_out.get(),
reinterpret_cast<AtomicCount*>(out_counts), num_parts + 1,
reinterpret_cast<AtomicCount*>(out_counts), num_parts + 1,
static_cast<IdType>(0), static_cast<IdType>(num_parts),
static_cast<IdType>(0), static_cast<IdType>(num_parts),
...
@@ -352,7 +354,7 @@ template std::pair<IdArray, IdArray> GeneratePermutationFromRemainder<
...
@@ -352,7 +354,7 @@ template std::pair<IdArray, IdArray> GeneratePermutationFromRemainder<
template <DGLDeviceType XPU, typename IdType>
template <DGLDeviceType XPU, typename IdType>
IdArray MapToLocalFromRemainder(const int num_parts, IdArray global_idx) {
IdArray MapToLocalFromRemainder(const int num_parts, IdArray global_idx) {
const auto& ctx = global_idx->ctx;
const auto& ctx = global_idx->ctx;
cuda
Stream_t
stream
=
runtime
::
getCurrent
CUDA
Stream
();
hip
Stream_t stream = runtime::getCurrent
HIP
Stream
MasqueradingAsCUDA
();
if (num_parts > 1) {
if (num_parts > 1) {
IdArray local_idx =
IdArray local_idx =
...
@@ -387,7 +389,7 @@ IdArray MapToGlobalFromRemainder(
...
@@ -387,7 +389,7 @@ IdArray MapToGlobalFromRemainder(
<< num_parts;
<< num_parts;
const auto& ctx = local_idx->ctx;
const auto& ctx = local_idx->ctx;
cuda
Stream_t
stream
=
runtime
::
getCurrent
CUDA
Stream
();
hip
Stream_t stream = runtime::getCurrent
HIP
Stream
MasqueradingAsCUDA
();
if (num_parts > 1) {
if (num_parts > 1) {
IdArray global_idx =
IdArray global_idx =
...
@@ -423,7 +425,7 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRange(
...
@@ -423,7 +425,7 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRange(
const auto& ctx = in_idx->ctx;
const auto& ctx = in_idx->ctx;
auto device = DeviceAPI::Get(ctx);
auto device = DeviceAPI::Get(ctx);
cuda
Stream_t
stream
=
runtime
::
getCurrent
CUDA
Stream
();
hip
Stream_t stream = runtime::getCurrent
HIP
Stream
MasqueradingAsCUDA
();
const int64_t num_in = in_idx->shape[0];
const int64_t num_in = in_idx->shape[0];
...
@@ -447,7 +449,7 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRange(
...
@@ -447,7 +449,7 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRange(
}
}
const int64_t part_bits =
const int64_t part_bits =
static_cast
<
int64_t
>
(
std
::
ceil
(
std
::
log2
(
num_parts
)));
static_cast<int64_t>(::ceil(std::log2(num_parts)));
// First, generate a mapping of indexes to processors
// First, generate a mapping of indexes to processors
Workspace<IdType> proc_id_in(device, ctx, num_in);
Workspace<IdType> proc_id_in(device, ctx, num_in);
...
@@ -470,13 +472,13 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRange(
...
@@ -470,13 +472,13 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRange(
IdArray perm_in = aten::Range(0, num_in, sizeof(IdType) * 8, ctx);
IdArray perm_in = aten::Range(0, num_in, sizeof(IdType) * 8, ctx);
size_t sort_workspace_size;
size_t sort_workspace_size;
CUDA_CALL
(
cub
::
DeviceRadixSort
::
SortPairs
(
CUDA_CALL(
hip
cub::DeviceRadixSort::SortPairs(
nullptr, sort_workspace_size, proc_id_in.get(), proc_id_out.get(),
nullptr, sort_workspace_size, proc_id_in.get(), proc_id_out.get(),
static_cast<IdType*>(perm_in->data), perm_out, num_in, 0, part_bits,
static_cast<IdType*>(perm_in->data), perm_out, num_in, 0, part_bits,
stream));
stream));
Workspace<void> sort_workspace(device, ctx, sort_workspace_size);
Workspace<void> sort_workspace(device, ctx, sort_workspace_size);
CUDA_CALL
(
cub
::
DeviceRadixSort
::
SortPairs
(
CUDA_CALL(
hip
cub::DeviceRadixSort::SortPairs(
sort_workspace.get(), sort_workspace_size, proc_id_in.get(),
sort_workspace.get(), sort_workspace_size, proc_id_in.get(),
proc_id_out.get(), static_cast<IdType*>(perm_in->data), perm_out,
proc_id_out.get(), static_cast<IdType*>(perm_in->data), perm_out,
num_in, 0, part_bits, stream));
num_in, 0, part_bits, stream));
...
@@ -492,7 +494,7 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRange(
...
@@ -492,7 +494,7 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRange(
static_assert(
static_assert(
sizeof(AtomicCount) == sizeof(*out_counts),
sizeof(AtomicCount) == sizeof(*out_counts),
"AtomicCount must be the same width as int64_t for atomicAdd "
"AtomicCount must be the same width as int64_t for atomicAdd "
"in cub::DeviceHistogram::HistogramEven() to work"
);
"in
hip
cub::DeviceHistogram::HistogramEven() to work");
// TODO(dlasalle): Once https://github.com/NVIDIA/cub/pull/287 is merged,
// TODO(dlasalle): Once https://github.com/NVIDIA/cub/pull/287 is merged,
// add a compile time check against the cub version to allow
// add a compile time check against the cub version to allow
...
@@ -502,14 +504,14 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRange(
...
@@ -502,14 +504,14 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRange(
"value of int.";
"value of int.";
size_t hist_workspace_size;
size_t hist_workspace_size;
CUDA_CALL
(
cub
::
DeviceHistogram
::
HistogramEven
(
CUDA_CALL(
hip
cub::DeviceHistogram::HistogramEven(
nullptr, hist_workspace_size, proc_id_out.get(),
nullptr, hist_workspace_size, proc_id_out.get(),
reinterpret_cast<AtomicCount*>(out_counts), num_parts + 1,
reinterpret_cast<AtomicCount*>(out_counts), num_parts + 1,
static_cast<IdType>(0), static_cast<IdType>(num_parts),
static_cast<IdType>(0), static_cast<IdType>(num_parts),
static_cast<int>(num_in), stream));
static_cast<int>(num_in), stream));
Workspace<void> hist_workspace(device, ctx, hist_workspace_size);
Workspace<void> hist_workspace(device, ctx, hist_workspace_size);
CUDA_CALL
(
cub
::
DeviceHistogram
::
HistogramEven
(
CUDA_CALL(
hip
cub::DeviceHistogram::HistogramEven(
hist_workspace.get(), hist_workspace_size, proc_id_out.get(),
hist_workspace.get(), hist_workspace_size, proc_id_out.get(),
reinterpret_cast<AtomicCount*>(out_counts), num_parts + 1,
reinterpret_cast<AtomicCount*>(out_counts), num_parts + 1,
static_cast<IdType>(0), static_cast<IdType>(num_parts),
static_cast<IdType>(0), static_cast<IdType>(num_parts),
...
@@ -536,7 +538,7 @@ template <DGLDeviceType XPU, typename IdType, typename RangeType>
...
@@ -536,7 +538,7 @@ template <DGLDeviceType XPU, typename IdType, typename RangeType>
IdArray MapToLocalFromRange(
IdArray MapToLocalFromRange(
const int num_parts, IdArray range, IdArray global_idx) {
const int num_parts, IdArray range, IdArray global_idx) {
const auto& ctx = global_idx->ctx;
const auto& ctx = global_idx->ctx;
cuda
Stream_t
stream
=
runtime
::
getCurrent
CUDA
Stream
();
hip
Stream_t stream = runtime::getCurrent
HIP
Stream
MasqueradingAsCUDA
();
if (num_parts > 1 && global_idx->shape[0] > 0) {
if (num_parts > 1 && global_idx->shape[0] > 0) {
IdArray local_idx =
IdArray local_idx =
...
@@ -576,7 +578,7 @@ IdArray MapToGlobalFromRange(
...
@@ -576,7 +578,7 @@ IdArray MapToGlobalFromRange(
<< num_parts;
<< num_parts;
const auto& ctx = local_idx->ctx;
const auto& ctx = local_idx->ctx;
cuda
Stream_t
stream
=
runtime
::
getCurrent
CUDA
Stream
();
hip
Stream_t stream = runtime::getCurrent
HIP
Stream
MasqueradingAsCUDA
();
if (num_parts > 1 && local_idx->shape[0] > 0) {
if (num_parts > 1 && local_idx->shape[0] > 0) {
IdArray global_idx =
IdArray global_idx =
...
...
src/partition/ndarray_partition.cc
View file @
6ac701f8
...
@@ -39,7 +39,7 @@ class RemainderPartition : public NDArrayPartition {
...
@@ -39,7 +39,7 @@ class RemainderPartition : public NDArrayPartition {
IdArray
in_idx
)
const
override
{
IdArray
in_idx
)
const
override
{
#ifdef DGL_USE_CUDA
#ifdef DGL_USE_CUDA
auto
ctx
=
in_idx
->
ctx
;
auto
ctx
=
in_idx
->
ctx
;
if
(
ctx
.
device_type
==
kDGLCUDA
)
{
if
(
ctx
.
device_type
==
kDGLCUDA
||
ctx
.
device_type
==
kDGLROCM
)
{
ATEN_ID_TYPE_SWITCH
(
in_idx
->
dtype
,
IdType
,
{
ATEN_ID_TYPE_SWITCH
(
in_idx
->
dtype
,
IdType
,
{
return
impl
::
GeneratePermutationFromRemainder
<
kDGLCUDA
,
IdType
>
(
return
impl
::
GeneratePermutationFromRemainder
<
kDGLCUDA
,
IdType
>
(
ArraySize
(),
NumParts
(),
in_idx
);
ArraySize
(),
NumParts
(),
in_idx
);
...
@@ -56,7 +56,7 @@ class RemainderPartition : public NDArrayPartition {
...
@@ -56,7 +56,7 @@ class RemainderPartition : public NDArrayPartition {
IdArray
MapToLocal
(
IdArray
in_idx
)
const
override
{
IdArray
MapToLocal
(
IdArray
in_idx
)
const
override
{
#ifdef DGL_USE_CUDA
#ifdef DGL_USE_CUDA
auto
ctx
=
in_idx
->
ctx
;
auto
ctx
=
in_idx
->
ctx
;
if
(
ctx
.
device_type
==
kDGLCUDA
)
{
if
(
ctx
.
device_type
==
kDGLCUDA
||
ctx
.
device_type
==
kDGLROCM
)
{
ATEN_ID_TYPE_SWITCH
(
in_idx
->
dtype
,
IdType
,
{
ATEN_ID_TYPE_SWITCH
(
in_idx
->
dtype
,
IdType
,
{
return
impl
::
MapToLocalFromRemainder
<
kDGLCUDA
,
IdType
>
(
return
impl
::
MapToLocalFromRemainder
<
kDGLCUDA
,
IdType
>
(
NumParts
(),
in_idx
);
NumParts
(),
in_idx
);
...
@@ -73,7 +73,7 @@ class RemainderPartition : public NDArrayPartition {
...
@@ -73,7 +73,7 @@ class RemainderPartition : public NDArrayPartition {
IdArray
MapToGlobal
(
IdArray
in_idx
,
const
int
part_id
)
const
override
{
IdArray
MapToGlobal
(
IdArray
in_idx
,
const
int
part_id
)
const
override
{
#ifdef DGL_USE_CUDA
#ifdef DGL_USE_CUDA
auto
ctx
=
in_idx
->
ctx
;
auto
ctx
=
in_idx
->
ctx
;
if
(
ctx
.
device_type
==
kDGLCUDA
)
{
if
(
ctx
.
device_type
==
kDGLCUDA
||
ctx
.
device_type
==
kDGLROCM
)
{
ATEN_ID_TYPE_SWITCH
(
in_idx
->
dtype
,
IdType
,
{
ATEN_ID_TYPE_SWITCH
(
in_idx
->
dtype
,
IdType
,
{
return
impl
::
MapToGlobalFromRemainder
<
kDGLCUDA
,
IdType
>
(
return
impl
::
MapToGlobalFromRemainder
<
kDGLCUDA
,
IdType
>
(
NumParts
(),
in_idx
,
part_id
);
NumParts
(),
in_idx
,
part_id
);
...
@@ -118,7 +118,7 @@ class RangePartition : public NDArrayPartition {
...
@@ -118,7 +118,7 @@ class RangePartition : public NDArrayPartition {
IdArray
in_idx
)
const
override
{
IdArray
in_idx
)
const
override
{
#ifdef DGL_USE_CUDA
#ifdef DGL_USE_CUDA
auto
ctx
=
in_idx
->
ctx
;
auto
ctx
=
in_idx
->
ctx
;
if
(
ctx
.
device_type
==
kDGLCUDA
)
{
if
(
ctx
.
device_type
==
kDGLCUDA
||
ctx
.
device_type
==
kDGLROCM
)
{
if
(
ctx
.
device_type
!=
range_
->
ctx
.
device_type
||
if
(
ctx
.
device_type
!=
range_
->
ctx
.
device_type
||
ctx
.
device_id
!=
range_
->
ctx
.
device_id
)
{
ctx
.
device_id
!=
range_
->
ctx
.
device_id
)
{
LOG
(
FATAL
)
<<
"The range for the NDArrayPartition and the input "
LOG
(
FATAL
)
<<
"The range for the NDArrayPartition and the input "
...
@@ -144,7 +144,7 @@ class RangePartition : public NDArrayPartition {
...
@@ -144,7 +144,7 @@ class RangePartition : public NDArrayPartition {
IdArray
MapToLocal
(
IdArray
in_idx
)
const
override
{
IdArray
MapToLocal
(
IdArray
in_idx
)
const
override
{
#ifdef DGL_USE_CUDA
#ifdef DGL_USE_CUDA
auto
ctx
=
in_idx
->
ctx
;
auto
ctx
=
in_idx
->
ctx
;
if
(
ctx
.
device_type
==
kDGLCUDA
)
{
if
(
ctx
.
device_type
==
kDGLCUDA
||
ctx
.
device_type
==
kDGLROCM
)
{
ATEN_ID_TYPE_SWITCH
(
in_idx
->
dtype
,
IdType
,
{
ATEN_ID_TYPE_SWITCH
(
in_idx
->
dtype
,
IdType
,
{
ATEN_ID_TYPE_SWITCH
(
range_
->
dtype
,
RangeType
,
{
ATEN_ID_TYPE_SWITCH
(
range_
->
dtype
,
RangeType
,
{
return
impl
::
MapToLocalFromRange
<
kDGLCUDA
,
IdType
,
RangeType
>
(
return
impl
::
MapToLocalFromRange
<
kDGLCUDA
,
IdType
,
RangeType
>
(
...
@@ -163,7 +163,7 @@ class RangePartition : public NDArrayPartition {
...
@@ -163,7 +163,7 @@ class RangePartition : public NDArrayPartition {
IdArray
MapToGlobal
(
IdArray
in_idx
,
const
int
part_id
)
const
override
{
IdArray
MapToGlobal
(
IdArray
in_idx
,
const
int
part_id
)
const
override
{
#ifdef DGL_USE_CUDA
#ifdef DGL_USE_CUDA
auto
ctx
=
in_idx
->
ctx
;
auto
ctx
=
in_idx
->
ctx
;
if
(
ctx
.
device_type
==
kDGLCUDA
)
{
if
(
ctx
.
device_type
==
kDGLCUDA
||
ctx
.
device_type
==
kDGLROCM
)
{
ATEN_ID_TYPE_SWITCH
(
in_idx
->
dtype
,
IdType
,
{
ATEN_ID_TYPE_SWITCH
(
in_idx
->
dtype
,
IdType
,
{
ATEN_ID_TYPE_SWITCH
(
range_
->
dtype
,
RangeType
,
{
ATEN_ID_TYPE_SWITCH
(
range_
->
dtype
,
RangeType
,
{
return
impl
::
MapToGlobalFromRange
<
kDGLCUDA
,
IdType
,
RangeType
>
(
return
impl
::
MapToGlobalFromRange
<
kDGLCUDA
,
IdType
,
RangeType
>
(
...
...
src/random/continuous_seed.h
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/*!
/*!
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
* All rights reserved.
* All rights reserved.
...
@@ -24,13 +25,13 @@
...
@@ -24,13 +25,13 @@
#include <cmath>
#include <cmath>
#ifdef __
NV
CC__
#ifdef __
HIP
CC__
#include <
cu
rand_kernel.h>
#include <
hiprand/hip
rand_kernel.h>
#else
#else
#include <random>
#include <random>
#include "pcg_random.hpp"
#include "pcg_random.hpp"
#endif // __
CUDA_ARCH
__
#endif // __
HIP_DEVICE_COMPILE
__
#ifndef M_SQRT1_2
#ifndef M_SQRT1_2
#define M_SQRT1_2 0.707106781186547524401
#define M_SQRT1_2 0.707106781186547524401
...
@@ -58,24 +59,24 @@ class continuous_seed {
...
@@ -58,24 +59,24 @@ class continuous_seed {
c
[
1
]
=
std
::
sin
(
pi
*
r
/
2
);
c
[
1
]
=
std
::
sin
(
pi
*
r
/
2
);
}
}
#ifdef __
CUDA_ARCH
__
#ifdef __
HIP_DEVICE_COMPILE
__
__device__
inline
float
uniform
(
const
uint64_t
t
)
const
{
__device__
inline
float
uniform
(
const
uint64_t
t
)
const
{
const
uint64_t
kCurandSeed
=
999961
;
// Could be any random number.
const
uint64_t
kCurandSeed
=
999961
;
// Could be any random number.
cu
randStatePhilox4_32_10_t
rng
;
hip
randStatePhilox4_32_10_t
rng
;
cu
rand_init
(
kCurandSeed
,
s
[
0
],
t
,
&
rng
);
hip
rand_init
(
kCurandSeed
,
s
[
0
],
t
,
&
rng
);
float
rnd
;
float
rnd
;
if
(
s
[
0
]
!=
s
[
1
])
{
if
(
s
[
0
]
!=
s
[
1
])
{
rnd
=
c
[
0
]
*
cu
rand_normal
(
&
rng
);
rnd
=
c
[
0
]
*
hip
rand_normal
(
&
rng
);
cu
rand_init
(
kCurandSeed
,
s
[
1
],
t
,
&
rng
);
hip
rand_init
(
kCurandSeed
,
s
[
1
],
t
,
&
rng
);
rnd
+=
c
[
1
]
*
cu
rand_normal
(
&
rng
);
rnd
+=
c
[
1
]
*
hip
rand_normal
(
&
rng
);
rnd
=
normcdff
(
rnd
);
rnd
=
normcdff
(
rnd
);
}
else
{
}
else
{
rnd
=
cu
rand_uniform
(
&
rng
);
rnd
=
hip
rand_uniform
(
&
rng
);
}
}
return
rnd
;
return
rnd
;
}
}
#else
#else
inline
float
uniform
(
const
uint64_t
t
)
const
{
__host__
inline
float
uniform
(
const
uint64_t
t
)
const
{
pcg32
ng0
(
s
[
0
],
t
);
pcg32
ng0
(
s
[
0
],
t
);
float
rnd
;
float
rnd
;
if
(
s
[
0
]
!=
s
[
1
])
{
if
(
s
[
0
]
!=
s
[
1
])
{
...
@@ -91,7 +92,7 @@ class continuous_seed {
...
@@ -91,7 +92,7 @@ class continuous_seed {
}
}
return
rnd
;
return
rnd
;
}
}
#endif // __
CUDA_ARCH
__
#endif // __
HIP_DEVICE_COMPILE
__
};
};
}
// namespace random
}
// namespace random
...
...
src/rpc/rpc.cc
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
/**
* Copyright (c) 2020 by Contributors
* Copyright (c) 2020 by Contributors
* @file rpc/rpc.cc
* @file rpc/rpc.cc
* @brief Implementation of RPC utilities used by both server and client sides.
* @brief Implementation of RPC utilities used by both server and client sides.
*/
*/
#if defined(__linux__)
#if defined(__linux__)
#include "
./
rpc.h"
#include "rpc.h"
#include <dgl/array.h>
#include <dgl/array.h>
#include <dgl/packed_func_ext.h>
#include <dgl/packed_func_ext.h>
...
...
src/rpc/rpc.h
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
/**
* Copyright (c) 2020 by Contributors
* Copyright (c) 2020 by Contributors
* @file rpc/rpc.h
* @file rpc/rpc.h
...
@@ -19,9 +20,9 @@
...
@@ -19,9 +20,9 @@
#include <unordered_map>
#include <unordered_map>
#include <vector>
#include <vector>
#include "
./
network/common.h"
#include "network/common.h"
#include "
./
rpc_msg.h"
#include "rpc_msg.h"
#include "
./
server_state.h"
#include "server_state.h"
#include "network/socket_communicator.h"
#include "network/socket_communicator.h"
namespace
dgl
{
namespace
dgl
{
...
...
src/runtime/c_runtime_api.cc
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
/**
* Copyright (c) 2016-2022 by Contributors
* Copyright (c) 2016-2022 by Contributors
* @file c_runtime_api.cc
* @file c_runtime_api.cc
...
@@ -32,6 +33,8 @@ inline std::string DeviceName(int type) {
...
@@ -32,6 +33,8 @@ inline std::string DeviceName(int type) {
return
"cpu"
;
return
"cpu"
;
case
kDGLCUDA
:
case
kDGLCUDA
:
return
"cuda"
;
return
"cuda"
;
case
kDGLROCM
:
return
"cuda"
;
// add more device here once supported
// add more device here once supported
default:
default:
LOG
(
FATAL
)
<<
"unknown type ="
<<
type
;
LOG
(
FATAL
)
<<
"unknown type ="
<<
type
;
...
@@ -122,13 +125,13 @@ void DeviceAPI::SyncStreamFromTo(
...
@@ -122,13 +125,13 @@ void DeviceAPI::SyncStreamFromTo(
}
}
bool
DeviceAPI
::
PinData
(
void
*
ptr
,
size_t
nbytes
)
{
bool
DeviceAPI
::
PinData
(
void
*
ptr
,
size_t
nbytes
)
{
LOG
(
FATAL
)
<<
"Device does not support
cuda
HostRegister api."
;
LOG
(
FATAL
)
<<
"Device does not support
hip
HostRegister api."
;
return
false
;
return
false
;
}
}
void
*
DeviceAPI
::
AllocPinnedDataSpace
(
void
*
DeviceAPI
::
AllocPinnedDataSpace
(
size_t
nbytes
,
void
**
ctx
,
void
**
deleter
)
{
size_t
nbytes
,
void
**
ctx
,
void
**
deleter
)
{
LOG
(
FATAL
)
<<
"Device does not support
cuda
Host
A
lloc api."
;
LOG
(
FATAL
)
<<
"Device does not support
hip
Host
Ma
lloc api."
;
return
nullptr
;
return
nullptr
;
}
}
...
@@ -137,7 +140,7 @@ void DeviceAPI::FreePinnedDataSpace(void** deleter) {
...
@@ -137,7 +140,7 @@ void DeviceAPI::FreePinnedDataSpace(void** deleter) {
}
}
void
DeviceAPI
::
UnpinData
(
void
*
ptr
)
{
void
DeviceAPI
::
UnpinData
(
void
*
ptr
)
{
LOG
(
FATAL
)
<<
"Device does not support
cuda
HostUnregister api."
;
LOG
(
FATAL
)
<<
"Device does not support
hip
HostUnregister api."
;
}
}
}
// namespace runtime
}
// namespace runtime
}
// namespace dgl
}
// namespace dgl
...
...
src/runtime/cuda/cuda_common.h
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
/**
* Copyright (c) 2017 by Contributors
* Copyright (c) 2017 by Contributors
* @file cuda_common.h
* @file cuda_common.h
...
@@ -6,10 +7,10 @@
...
@@ -6,10 +7,10 @@
#ifndef DGL_RUNTIME_CUDA_CUDA_COMMON_H_
#ifndef DGL_RUNTIME_CUDA_CUDA_COMMON_H_
#define DGL_RUNTIME_CUDA_CUDA_COMMON_H_
#define DGL_RUNTIME_CUDA_CUDA_COMMON_H_
#include <
cu
blas
_v2
.h>
#include <
hip
blas
/hipblas
.h>
#include <
cuda
_runtime.h>
#include <
hip/hip
_runtime.h>
#include <
cu
rand.h>
#include <
hiprand/hip
rand.h>
#include <
cu
sparse.h>
#include <
hipsparse/hip
sparse.h>
#include <dgl/runtime/packed_func.h>
#include <dgl/runtime/packed_func.h>
#include <memory>
#include <memory>
...
@@ -25,8 +26,8 @@ namespace runtime {
...
@@ -25,8 +26,8 @@ namespace runtime {
DGL's memory pool and the current cuda stream
DGL's memory pool and the current cuda stream
runtime::CUDAWorkspaceAllocator allocator(ctx);
runtime::CUDAWorkspaceAllocator allocator(ctx);
const auto stream = runtime::getCurrent
CUDA
Stream();
const auto stream = runtime::getCurrent
HIP
Stream
MasqueradingAsCUDA
();
const auto exec_policy = thrust::
cuda
::par_nosync(allocator).on(stream);
const auto exec_policy = thrust::
hip
::par_nosync(allocator).on(stream);
now, one can pass exec_policy to thrust functions
now, one can pass exec_policy to thrust functions
...
@@ -79,112 +80,112 @@ inline bool is_zero<dim3>(dim3 size) {
...
@@ -79,112 +80,112 @@ inline bool is_zero<dim3>(dim3 size) {
#define CUDA_DRIVER_CALL(x) \
#define CUDA_DRIVER_CALL(x) \
{ \
{ \
CUresul
t result = x; \
hipError_
t result = x; \
if (result !=
CUDA_SUCCESS
&& result !=
CUDA_ERROR_DEINITIALIZED
) { \
if (result !=
hipSuccess
&& result !=
hipErrorDeinitialized
) { \
const char* msg; \
const char* msg; \
cu
GetErrorName(result, &msg); \
hip
GetErrorName(result, &msg); \
LOG(FATAL) << "CUDAError: " #x " failed with error: " << msg; \
LOG(FATAL) << "CUDAError: " #x " failed with error: " << msg; \
} \
} \
}
}
#define CUDA_CALL(func) \
#define CUDA_CALL(func) \
{ \
{ \
cuda
Error_t e = (func); \
hip
Error_t e = (func); \
CHECK(e ==
cuda
Success || e ==
cuda
Error
CudartUnloading
) \
CHECK(e ==
hip
Success || e ==
hip
Error
Deinitialized
) \
<< "CUDA: " <<
cuda
GetErrorString(e); \
<< "CUDA: " <<
hip
GetErrorString(e); \
}
}
#define CUDA_KERNEL_CALL(kernel, nblks, nthrs, shmem, stream, ...) \
#define CUDA_KERNEL_CALL(kernel, nblks, nthrs, shmem, stream, ...) \
{ \
{ \
if (!dgl::runtime::is_zero((nblks)) && !dgl::runtime::is_zero((nthrs))) { \
if (!dgl::runtime::is_zero((nblks)) && !dgl::runtime::is_zero((nthrs))) { \
(kernel)
<<<
(nblks), (nthrs), (shmem), (stream)
>>>(
__VA_ARGS__); \
hipLaunchKernelGGL((
(kernel)
), dim3(
(nblks)
)
,
dim3(
(nthrs)
)
, (shmem), (stream)
,
__VA_ARGS__); \
cuda
Error_t e =
cuda
GetLastError(); \
hip
Error_t e =
hip
GetLastError(); \
CHECK(e ==
cuda
Success || e ==
cuda
Error
CudartUnloading
) \
CHECK(e ==
hip
Success || e ==
hip
Error
Deinitialized
) \
<< "CUDA kernel launch error: " <<
cuda
GetErrorString(e); \
<< "CUDA kernel launch error: " <<
hip
GetErrorString(e); \
} \
} \
}
}
#define CUSPARSE_CALL(func) \
#define CUSPARSE_CALL(func) \
{ \
{ \
cu
sparseStatus_t e = (func); \
hip
sparseStatus_t e = (func); \
CHECK(e ==
CU
SPARSE_STATUS_SUCCESS) << "CUSPARSE ERROR: " << e; \
CHECK(e ==
HIP
SPARSE_STATUS_SUCCESS) << "CUSPARSE ERROR: " << e; \
}
}
#define CUBLAS_CALL(func) \
#define CUBLAS_CALL(func) \
{ \
{ \
cu
blasStatus_t e = (func); \
hip
blasStatus_t e = (func); \
CHECK(e ==
CU
BLAS_STATUS_SUCCESS) << "CUBLAS ERROR: " << e; \
CHECK(e ==
HIP
BLAS_STATUS_SUCCESS) << "CUBLAS ERROR: " << e; \
}
}
#define CURAND_CALL(func) \
#define CURAND_CALL(func) \
{ \
{ \
cu
randStatus_t e = (func); \
hip
randStatus_t e = (func); \
CHECK(e ==
CU
RAND_STATUS_SUCCESS) \
CHECK(e ==
HIP
RAND_STATUS_SUCCESS) \
<< "CURAND Error: " << dgl::runtime::curandGetErrorString(e) << " at " \
<< "CURAND Error: " << dgl::runtime::curandGetErrorString(e) << " at " \
<< __FILE__ << ":" << __LINE__; \
<< __FILE__ << ":" << __LINE__; \
}
}
inline
const
char
*
curandGetErrorString
(
cu
randStatus_t
error
)
{
inline
const
char
*
curandGetErrorString
(
hip
randStatus_t
error
)
{
switch
(
error
)
{
switch
(
error
)
{
case
CU
RAND_STATUS_SUCCESS
:
case
HIP
RAND_STATUS_SUCCESS
:
return
"
CU
RAND_STATUS_SUCCESS"
;
return
"
HIP
RAND_STATUS_SUCCESS"
;
case
CU
RAND_STATUS_VERSION_MISMATCH
:
case
HIP
RAND_STATUS_VERSION_MISMATCH
:
return
"
CU
RAND_STATUS_VERSION_MISMATCH"
;
return
"
HIP
RAND_STATUS_VERSION_MISMATCH"
;
case
CU
RAND_STATUS_NOT_INITIALIZED
:
case
HIP
RAND_STATUS_NOT_INITIALIZED
:
return
"
CU
RAND_STATUS_NOT_INITIALIZED"
;
return
"
HIP
RAND_STATUS_NOT_INITIALIZED"
;
case
CU
RAND_STATUS_ALLOCATION_FAILED
:
case
HIP
RAND_STATUS_ALLOCATION_FAILED
:
return
"
CU
RAND_STATUS_ALLOCATION_FAILED"
;
return
"
HIP
RAND_STATUS_ALLOCATION_FAILED"
;
case
CU
RAND_STATUS_TYPE_ERROR
:
case
HIP
RAND_STATUS_TYPE_ERROR
:
return
"
CU
RAND_STATUS_TYPE_ERROR"
;
return
"
HIP
RAND_STATUS_TYPE_ERROR"
;
case
CU
RAND_STATUS_OUT_OF_RANGE
:
case
HIP
RAND_STATUS_OUT_OF_RANGE
:
return
"
CU
RAND_STATUS_OUT_OF_RANGE"
;
return
"
HIP
RAND_STATUS_OUT_OF_RANGE"
;
case
CU
RAND_STATUS_LENGTH_NOT_MULTIPLE
:
case
HIP
RAND_STATUS_LENGTH_NOT_MULTIPLE
:
return
"
CU
RAND_STATUS_LENGTH_NOT_MULTIPLE"
;
return
"
HIP
RAND_STATUS_LENGTH_NOT_MULTIPLE"
;
case
CU
RAND_STATUS_DOUBLE_PRECISION_REQUIRED
:
case
HIP
RAND_STATUS_DOUBLE_PRECISION_REQUIRED
:
return
"
CU
RAND_STATUS_DOUBLE_PRECISION_REQUIRED"
;
return
"
HIP
RAND_STATUS_DOUBLE_PRECISION_REQUIRED"
;
case
CU
RAND_STATUS_LAUNCH_FAILURE
:
case
HIP
RAND_STATUS_LAUNCH_FAILURE
:
return
"
CU
RAND_STATUS_LAUNCH_FAILURE"
;
return
"
HIP
RAND_STATUS_LAUNCH_FAILURE"
;
case
CU
RAND_STATUS_PREEXISTING_FAILURE
:
case
HIP
RAND_STATUS_PREEXISTING_FAILURE
:
return
"
CU
RAND_STATUS_PREEXISTING_FAILURE"
;
return
"
HIP
RAND_STATUS_PREEXISTING_FAILURE"
;
case
CU
RAND_STATUS_INITIALIZATION_FAILED
:
case
HIP
RAND_STATUS_INITIALIZATION_FAILED
:
return
"
CU
RAND_STATUS_INITIALIZATION_FAILED"
;
return
"
HIP
RAND_STATUS_INITIALIZATION_FAILED"
;
case
CU
RAND_STATUS_ARCH_MISMATCH
:
case
HIP
RAND_STATUS_ARCH_MISMATCH
:
return
"
CU
RAND_STATUS_ARCH_MISMATCH"
;
return
"
HIP
RAND_STATUS_ARCH_MISMATCH"
;
case
CU
RAND_STATUS_INTERNAL_ERROR
:
case
HIP
RAND_STATUS_INTERNAL_ERROR
:
return
"
CU
RAND_STATUS_INTERNAL_ERROR"
;
return
"
HIP
RAND_STATUS_INTERNAL_ERROR"
;
}
}
// To suppress compiler warning.
// To suppress compiler warning.
return
"Unrecognized
cu
rand error string"
;
return
"Unrecognized
hip
rand error string"
;
}
}
/**
/**
* @brief Cast data type to
cuda
DataType
_t
.
* @brief Cast data type to
hip
DataType.
*/
*/
template
<
typename
T
>
template
<
typename
T
>
struct
cuda_dtype
{
struct
cuda_dtype
{
static
constexpr
cuda
DataType
_t
value
=
CUDA
_R_32F
;
static
constexpr
hip
DataType
value
=
HIP
_R_32F
;
};
};
template
<
>
template
<
>
struct
cuda_dtype
<
__half
>
{
struct
cuda_dtype
<
__half
>
{
static
constexpr
cuda
DataType
_t
value
=
CUDA
_R_16F
;
static
constexpr
hip
DataType
value
=
HIP
_R_16F
;
};
};
#if BF16_ENABLED
#if BF16_ENABLED
template
<
>
template
<
>
struct
cuda_dtype
<
__
nv
_bfloat16
>
{
struct
cuda_dtype
<
__
hip
_bfloat16
>
{
static
constexpr
cuda
DataType
_t
value
=
CUDA
_R_16BF
;
static
constexpr
hip
DataType
value
=
HIP
_R_16BF
;
};
};
#endif // BF16_ENABLED
#endif // BF16_ENABLED
template
<
>
template
<
>
struct
cuda_dtype
<
float
>
{
struct
cuda_dtype
<
float
>
{
static
constexpr
cuda
DataType
_t
value
=
CUDA
_R_32F
;
static
constexpr
hip
DataType
value
=
HIP
_R_32F
;
};
};
template
<
>
template
<
>
struct
cuda_dtype
<
double
>
{
struct
cuda_dtype
<
double
>
{
static
constexpr
cuda
DataType
_t
value
=
CUDA
_R_64F
;
static
constexpr
hip
DataType
value
=
HIP
_R_64F
;
};
};
/*
/*
...
@@ -202,7 +203,7 @@ struct accum_dtype<__half> {
...
@@ -202,7 +203,7 @@ struct accum_dtype<__half> {
#if BF16_ENABLED
#if BF16_ENABLED
template
<
>
template
<
>
struct
accum_dtype
<
__
nv
_bfloat16
>
{
struct
accum_dtype
<
__
hip
_bfloat16
>
{
typedef
float
type
;
typedef
float
type
;
};
};
#endif // BF16_ENABLED
#endif // BF16_ENABLED
...
@@ -217,23 +218,23 @@ struct accum_dtype<double> {
...
@@ -217,23 +218,23 @@ struct accum_dtype<double> {
typedef
double
type
;
typedef
double
type
;
};
};
#if
CUDA
RT_VERSION >= 11000
#if
DTK
RT_VERSION >= 11000
/**
/**
* @brief Cast index data type to
cu
sparseIndexType_t.
* @brief Cast index data type to
hip
sparseIndexType_t.
*/
*/
template
<
typename
T
>
template
<
typename
T
>
struct
cusparse_idtype
{
struct
cusparse_idtype
{
static
constexpr
cu
sparseIndexType_t
value
=
CU
SPARSE_INDEX_32I
;
static
constexpr
hip
sparseIndexType_t
value
=
HIP
SPARSE_INDEX_32I
;
};
};
template
<
>
template
<
>
struct
cusparse_idtype
<
int32_t
>
{
struct
cusparse_idtype
<
int32_t
>
{
static
constexpr
cu
sparseIndexType_t
value
=
CU
SPARSE_INDEX_32I
;
static
constexpr
hip
sparseIndexType_t
value
=
HIP
SPARSE_INDEX_32I
;
};
};
template
<
>
template
<
>
struct
cusparse_idtype
<
int64_t
>
{
struct
cusparse_idtype
<
int64_t
>
{
static
constexpr
cu
sparseIndexType_t
value
=
CU
SPARSE_INDEX_64I
;
static
constexpr
hip
sparseIndexType_t
value
=
HIP
SPARSE_INDEX_64I
;
};
};
#endif
#endif
...
@@ -241,9 +242,9 @@ struct cusparse_idtype<int64_t> {
...
@@ -241,9 +242,9 @@ struct cusparse_idtype<int64_t> {
class
CUDAThreadEntry
{
class
CUDAThreadEntry
{
public:
public:
/** @brief The cusparse handler */
/** @brief The cusparse handler */
cu
sparseHandle_t
cusparse_handle
{
nullptr
};
hip
sparseHandle_t
cusparse_handle
{
nullptr
};
/** @brief The cublas handler */
/** @brief The cublas handler */
cu
blasHandle_t
cublas_handle
{
nullptr
};
hip
blasHandle_t
cublas_handle
{
nullptr
};
/** @brief thread local pool*/
/** @brief thread local pool*/
WorkspacePool
pool
;
WorkspacePool
pool
;
/** @brief constructor */
/** @brief constructor */
...
@@ -253,7 +254,7 @@ class CUDAThreadEntry {
...
@@ -253,7 +254,7 @@ class CUDAThreadEntry {
};
};
/** @brief Get the current CUDA stream */
/** @brief Get the current CUDA stream */
cuda
Stream_t
getCurrent
CUDA
Stream
();
hip
Stream_t
getCurrent
HIP
Stream
MasqueradingAsCUDA
();
}
// namespace runtime
}
// namespace runtime
}
// namespace dgl
}
// namespace dgl
#endif // DGL_RUNTIME_CUDA_CUDA_COMMON_H_
#endif // DGL_RUNTIME_CUDA_CUDA_COMMON_H_
src/runtime/cuda/cuda_device_api.cc
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
/**
* Copyright (c) 2017-2022 by Contributors
* Copyright (c) 2017-2022 by Contributors
* @file cuda_device_api.cc
* @file cuda_device_api.cc
* @brief GPU specific API
* @brief GPU specific API
*/
*/
#include <
cuda
_runtime.h>
#include <
hip/hip
_runtime.h>
#include <dgl/runtime/device_api.h>
#include <dgl/runtime/device_api.h>
#include <dgl/runtime/registry.h>
#include <dgl/runtime/registry.h>
#include <dgl/runtime/tensordispatch.h>
#include <dgl/runtime/tensordispatch.h>
...
@@ -18,13 +19,13 @@ class CUDADeviceAPI final : public DeviceAPI {
...
@@ -18,13 +19,13 @@ class CUDADeviceAPI final : public DeviceAPI {
public:
public:
CUDADeviceAPI
()
{
CUDADeviceAPI
()
{
int
count
;
int
count
;
auto
err
=
cuda
GetDeviceCount
(
&
count
);
auto
err
=
hip
GetDeviceCount
(
&
count
);
switch
(
err
)
{
switch
(
err
)
{
case
cuda
Success
:
case
hip
Success
:
break
;
break
;
default:
default:
count
=
0
;
count
=
0
;
cuda
GetLastError
();
hip
GetLastError
();
}
}
is_available_
=
count
>
0
;
is_available_
=
count
>
0
;
}
}
...
@@ -32,67 +33,68 @@ class CUDADeviceAPI final : public DeviceAPI {
...
@@ -32,67 +33,68 @@ class CUDADeviceAPI final : public DeviceAPI {
bool
IsAvailable
()
final
{
return
is_available_
;
}
bool
IsAvailable
()
final
{
return
is_available_
;
}
void
SetDevice
(
DGLContext
ctx
)
final
{
void
SetDevice
(
DGLContext
ctx
)
final
{
CUDA_CALL
(
cuda
SetDevice
(
ctx
.
device_id
));
CUDA_CALL
(
hip
SetDevice
(
ctx
.
device_id
));
}
}
void
GetAttr
(
DGLContext
ctx
,
DeviceAttrKind
kind
,
DGLRetValue
*
rv
)
final
{
void
GetAttr
(
DGLContext
ctx
,
DeviceAttrKind
kind
,
DGLRetValue
*
rv
)
final
{
int
value
=
0
;
int
value
=
0
;
switch
(
kind
)
{
switch
(
kind
)
{
case
kExist
:
case
kExist
:
value
=
value
=
(
cuda
DeviceGetAttribute
(
(
hip
DeviceGetAttribute
(
&
value
,
cudaDevAttr
MaxThreadsPerBlock
,
ctx
.
device_id
)
==
&
value
,
hipDeviceAttribute
MaxThreadsPerBlock
,
ctx
.
device_id
)
==
cuda
Success
);
hip
Success
);
break
;
break
;
case
kMaxThreadsPerBlock
:
{
case
kMaxThreadsPerBlock
:
{
CUDA_CALL
(
cuda
DeviceGetAttribute
(
CUDA_CALL
(
hip
DeviceGetAttribute
(
&
value
,
cudaDevAttr
MaxThreadsPerBlock
,
ctx
.
device_id
));
&
value
,
hipDeviceAttribute
MaxThreadsPerBlock
,
ctx
.
device_id
));
break
;
break
;
}
}
case
kWarpSize
:
{
case
kWarpSize
:
{
CUDA_CALL
(
CUDA_CALL
(
cuda
DeviceGetAttribute
(
&
value
,
cudaDevAttr
WarpSize
,
ctx
.
device_id
));
hip
DeviceGetAttribute
(
&
value
,
hipDeviceAttribute
WarpSize
,
ctx
.
device_id
));
break
;
break
;
}
}
case
kMaxSharedMemoryPerBlock
:
{
case
kMaxSharedMemoryPerBlock
:
{
CUDA_CALL
(
cuda
DeviceGetAttribute
(
CUDA_CALL
(
hip
DeviceGetAttribute
(
&
value
,
cudaDevAttr
MaxSharedMemoryPerBlock
,
ctx
.
device_id
));
&
value
,
hipDeviceAttribute
MaxSharedMemoryPerBlock
,
ctx
.
device_id
));
break
;
break
;
}
}
case
kComputeVersion
:
{
case
kComputeVersion
:
{
std
::
ostringstream
os
;
std
::
ostringstream
os
;
CUDA_CALL
(
cuda
DeviceGetAttribute
(
CUDA_CALL
(
hip
DeviceGetAttribute
(
&
value
,
cudaDevAttr
ComputeCapabilityMajor
,
ctx
.
device_id
));
&
value
,
hipDeviceAttribute
ComputeCapabilityMajor
,
ctx
.
device_id
));
os
<<
value
<<
"."
;
os
<<
value
<<
"."
;
CUDA_CALL
(
cuda
DeviceGetAttribute
(
CUDA_CALL
(
hip
DeviceGetAttribute
(
&
value
,
cudaDevAttr
ComputeCapabilityMinor
,
ctx
.
device_id
));
&
value
,
hipDeviceAttribute
ComputeCapabilityMinor
,
ctx
.
device_id
));
os
<<
value
;
os
<<
value
;
*
rv
=
os
.
str
();
*
rv
=
os
.
str
();
return
;
return
;
}
}
case
kDeviceName
:
{
case
kDeviceName
:
{
cuda
DeviceProp
props
;
hip
DeviceProp
_t
props
;
CUDA_CALL
(
cuda
GetDeviceProperties
(
&
props
,
ctx
.
device_id
));
CUDA_CALL
(
hip
GetDeviceProperties
(
&
props
,
ctx
.
device_id
));
*
rv
=
std
::
string
(
props
.
name
);
*
rv
=
std
::
string
(
props
.
name
);
// printf("******* debug: device.name:%s\n ",std::string(props.name).c_str());
return
;
return
;
}
}
case
kMaxClockRate
:
{
case
kMaxClockRate
:
{
CUDA_CALL
(
cuda
DeviceGetAttribute
(
CUDA_CALL
(
hip
DeviceGetAttribute
(
&
value
,
cudaDevAttr
ClockRate
,
ctx
.
device_id
));
&
value
,
hipDeviceAttribute
ClockRate
,
ctx
.
device_id
));
break
;
break
;
}
}
case
kMultiProcessorCount
:
{
case
kMultiProcessorCount
:
{
CUDA_CALL
(
cuda
DeviceGetAttribute
(
CUDA_CALL
(
hip
DeviceGetAttribute
(
&
value
,
cudaDevAttr
Multi
P
rocessorCount
,
ctx
.
device_id
));
&
value
,
hipDeviceAttribute
Multi
p
rocessorCount
,
ctx
.
device_id
));
break
;
break
;
}
}
case
kMaxThreadDimensions
:
{
case
kMaxThreadDimensions
:
{
int
dims
[
3
];
int
dims
[
3
];
CUDA_CALL
(
cuda
DeviceGetAttribute
(
CUDA_CALL
(
hip
DeviceGetAttribute
(
&
dims
[
0
],
cudaDevAttr
MaxBlockDimX
,
ctx
.
device_id
));
&
dims
[
0
],
hipDeviceAttribute
MaxBlockDimX
,
ctx
.
device_id
));
CUDA_CALL
(
cuda
DeviceGetAttribute
(
CUDA_CALL
(
hip
DeviceGetAttribute
(
&
dims
[
1
],
cudaDevAttr
MaxBlockDimY
,
ctx
.
device_id
));
&
dims
[
1
],
hipDeviceAttribute
MaxBlockDimY
,
ctx
.
device_id
));
CUDA_CALL
(
cuda
DeviceGetAttribute
(
CUDA_CALL
(
hip
DeviceGetAttribute
(
&
dims
[
2
],
cudaDevAttr
MaxBlockDimZ
,
ctx
.
device_id
));
&
dims
[
2
],
hipDeviceAttribute
MaxBlockDimZ
,
ctx
.
device_id
));
std
::
stringstream
ss
;
// use json string to return multiple int values;
std
::
stringstream
ss
;
// use json string to return multiple int values;
ss
<<
"["
<<
dims
[
0
]
<<
", "
<<
dims
[
1
]
<<
", "
<<
dims
[
2
]
<<
"]"
;
ss
<<
"["
<<
dims
[
0
]
<<
", "
<<
dims
[
1
]
<<
", "
<<
dims
[
2
]
<<
"]"
;
...
@@ -110,11 +112,11 @@ class CUDADeviceAPI final : public DeviceAPI {
...
@@ -110,11 +112,11 @@ class CUDADeviceAPI final : public DeviceAPI {
TensorDispatcher
*
tensor_dispatcher
=
TensorDispatcher
::
Global
();
TensorDispatcher
*
tensor_dispatcher
=
TensorDispatcher
::
Global
();
if
(
tensor_dispatcher
->
IsAvailable
())
{
if
(
tensor_dispatcher
->
IsAvailable
())
{
return
tensor_dispatcher
->
CUDAAllocWorkspace
(
return
tensor_dispatcher
->
CUDAAllocWorkspace
(
nbytes
,
getCurrent
CUDA
Stream
());
nbytes
,
getCurrent
HIP
Stream
MasqueradingAsCUDA
());
}
}
CHECK_EQ
(
256
%
alignment
,
0U
)
<<
"CUDA space is aligned at 256 bytes"
;
CHECK_EQ
(
256
%
alignment
,
0U
)
<<
"CUDA space is aligned at 256 bytes"
;
void
*
ret
;
void
*
ret
;
CUDA_CALL
(
cuda
Malloc
(
&
ret
,
nbytes
));
CUDA_CALL
(
hip
Malloc
(
&
ret
,
nbytes
));
return
ret
;
return
ret
;
}
}
...
@@ -124,32 +126,32 @@ class CUDADeviceAPI final : public DeviceAPI {
...
@@ -124,32 +126,32 @@ class CUDADeviceAPI final : public DeviceAPI {
if
(
tensor_dispatcher
->
IsAvailable
())
{
if
(
tensor_dispatcher
->
IsAvailable
())
{
return
tensor_dispatcher
->
CUDAFreeWorkspace
(
ptr
);
return
tensor_dispatcher
->
CUDAFreeWorkspace
(
ptr
);
}
}
CUDA_CALL
(
cuda
Free
(
ptr
));
CUDA_CALL
(
hip
Free
(
ptr
));
}
}
void
CopyDataFromTo
(
void
CopyDataFromTo
(
const
void
*
from
,
size_t
from_offset
,
void
*
to
,
size_t
to_offset
,
const
void
*
from
,
size_t
from_offset
,
void
*
to
,
size_t
to_offset
,
size_t
size
,
DGLContext
ctx_from
,
DGLContext
ctx_to
,
size_t
size
,
DGLContext
ctx_from
,
DGLContext
ctx_to
,
DGLDataType
type_hint
,
DGLStreamHandle
stream
)
{
DGLDataType
type_hint
,
DGLStreamHandle
stream
)
{
cuda
Stream_t
cu_stream
=
static_cast
<
cuda
Stream_t
>
(
stream
);
hip
Stream_t
cu_stream
=
static_cast
<
hip
Stream_t
>
(
stream
);
from
=
static_cast
<
const
char
*>
(
from
)
+
from_offset
;
from
=
static_cast
<
const
char
*>
(
from
)
+
from_offset
;
to
=
static_cast
<
char
*>
(
to
)
+
to_offset
;
to
=
static_cast
<
char
*>
(
to
)
+
to_offset
;
if
(
ctx_from
.
device_type
==
kDGLCUDA
&&
ctx_to
.
device_type
==
kDGLCUDA
)
{
if
(
ctx_from
.
device_type
==
kDGLCUDA
&&
ctx_to
.
device_type
==
kDGLCUDA
||
ctx_from
.
device_type
==
kDGLROCM
&&
ctx_to
.
device_type
==
kDGLROCM
)
{
CUDA_CALL
(
cuda
SetDevice
(
ctx_from
.
device_id
));
CUDA_CALL
(
hip
SetDevice
(
ctx_from
.
device_id
));
if
(
ctx_from
.
device_id
==
ctx_to
.
device_id
)
{
if
(
ctx_from
.
device_id
==
ctx_to
.
device_id
)
{
GPUCopy
(
from
,
to
,
size
,
cuda
MemcpyDeviceToDevice
,
cu_stream
);
GPUCopy
(
from
,
to
,
size
,
hip
MemcpyDeviceToDevice
,
cu_stream
);
}
else
{
}
else
{
CUDA_CALL
(
cuda
MemcpyPeerAsync
(
CUDA_CALL
(
hip
MemcpyPeerAsync
(
to
,
ctx_to
.
device_id
,
from
,
ctx_from
.
device_id
,
size
,
cu_stream
));
to
,
ctx_to
.
device_id
,
from
,
ctx_from
.
device_id
,
size
,
cu_stream
));
}
}
}
else
if
(
}
else
if
(
ctx_from
.
device_type
==
kDGLCUDA
&&
ctx_to
.
device_type
==
kDGLCPU
)
{
(
ctx_from
.
device_type
==
kDGLCUDA
||
ctx_to
.
device_type
==
kDGLROCM
)
&&
ctx_to
.
device_type
==
kDGLCPU
)
{
CUDA_CALL
(
cuda
SetDevice
(
ctx_from
.
device_id
));
CUDA_CALL
(
hip
SetDevice
(
ctx_from
.
device_id
));
GPUCopy
(
from
,
to
,
size
,
cuda
MemcpyDeviceToHost
,
cu_stream
);
GPUCopy
(
from
,
to
,
size
,
hip
MemcpyDeviceToHost
,
cu_stream
);
}
else
if
(
}
else
if
(
ctx_from
.
device_type
==
kDGLCPU
&&
ctx_to
.
device_type
==
kDGLCUDA
)
{
ctx_from
.
device_type
==
kDGLCPU
&&
(
ctx_to
.
device_type
==
kDGLCUDA
||
ctx_to
.
device_type
==
kDGLROCM
)
)
{
CUDA_CALL
(
cuda
SetDevice
(
ctx_to
.
device_id
));
CUDA_CALL
(
hip
SetDevice
(
ctx_to
.
device_id
));
GPUCopy
(
from
,
to
,
size
,
cuda
MemcpyHostToDevice
,
cu_stream
);
GPUCopy
(
from
,
to
,
size
,
hip
MemcpyHostToDevice
,
cu_stream
);
}
else
{
}
else
{
LOG
(
FATAL
)
<<
"expect copy from/to GPU or between GPU"
;
LOG
(
FATAL
)
<<
"expect copy from/to GPU or between GPU"
;
}
}
...
@@ -166,9 +168,9 @@ class CUDADeviceAPI final : public DeviceAPI {
   }

   // To ensure correct behavior, `record_event` must be invoked anytime a
-  // pointer from PyTorch CachingHostAllocator is used in a cudaMemcpyAsync
+  // pointer from PyTorch CachingHostAllocator is used in a hipMemcpyAsync
   // call. It provides a way to re-use freed pinned (page-locked) memory
-  // allocations and avoid device sync due to cudaFreeHost calls.
+  // allocations and avoid device sync due to hipHostFree calls.
   void RecordedCopyDataFromTo(
       void* from, size_t from_offset, void* to, size_t to_offset, size_t size,
       DGLContext ctx_from, DGLContext ctx_to, DGLDataType type_hint,
...
@@ -179,7 +181,7 @@ class CUDADeviceAPI final : public DeviceAPI {
         stream);
     auto tensor_dispatcher = TensorDispatcher::Global();
     if (tensor_dispatcher->IsAvailable()) {
-      auto custream = static_cast<cudaStream_t>(stream);
+      auto custream = static_cast<hipStream_t>(stream);
       void* ptr = ctx_to.device_type == kDGLCPU ? to : from;
       int id =
           ctx_to.device_type == kDGLCPU ? ctx_from.device_id : ctx_to.device_id;
...
@@ -188,34 +190,34 @@ class CUDADeviceAPI final : public DeviceAPI {
   }

   DGLStreamHandle CreateStream(DGLContext ctx) {
-    CUDA_CALL(cudaSetDevice(ctx.device_id));
-    cudaStream_t retval;
+    CUDA_CALL(hipSetDevice(ctx.device_id));
+    hipStream_t retval;
     // make sure the legacy default stream won't block on this stream
-    CUDA_CALL(cudaStreamCreateWithFlags(&retval, cudaStreamNonBlocking));
+    CUDA_CALL(hipStreamCreateWithFlags(&retval, hipStreamNonBlocking));
     return static_cast<DGLStreamHandle>(retval);
   }

   void FreeStream(DGLContext ctx, DGLStreamHandle stream) {
-    CUDA_CALL(cudaSetDevice(ctx.device_id));
-    cudaStream_t cu_stream = static_cast<cudaStream_t>(stream);
-    CUDA_CALL(cudaStreamDestroy(cu_stream));
+    CUDA_CALL(hipSetDevice(ctx.device_id));
+    hipStream_t cu_stream = static_cast<hipStream_t>(stream);
+    CUDA_CALL(hipStreamDestroy(cu_stream));
   }

   void SyncStreamFromTo(
       DGLContext ctx, DGLStreamHandle event_src, DGLStreamHandle event_dst) {
-    CUDA_CALL(cudaSetDevice(ctx.device_id));
-    cudaStream_t src_stream = static_cast<cudaStream_t>(event_src);
-    cudaStream_t dst_stream = static_cast<cudaStream_t>(event_dst);
-    cudaEvent_t evt;
-    CUDA_CALL(cudaEventCreate(&evt));
-    CUDA_CALL(cudaEventRecord(evt, src_stream));
-    CUDA_CALL(cudaStreamWaitEvent(dst_stream, evt, 0));
-    CUDA_CALL(cudaEventDestroy(evt));
+    CUDA_CALL(hipSetDevice(ctx.device_id));
+    hipStream_t src_stream = static_cast<hipStream_t>(event_src);
+    hipStream_t dst_stream = static_cast<hipStream_t>(event_dst);
+    hipEvent_t evt;
+    CUDA_CALL(hipEventCreate(&evt));
+    CUDA_CALL(hipEventRecord(evt, src_stream));
+    CUDA_CALL(hipStreamWaitEvent(dst_stream, evt, 0));
+    CUDA_CALL(hipEventDestroy(evt));
   }

   void StreamSync(DGLContext ctx, DGLStreamHandle stream) final {
-    CUDA_CALL(cudaSetDevice(ctx.device_id));
-    CUDA_CALL(cudaStreamSynchronize(static_cast<cudaStream_t>(stream)));
+    CUDA_CALL(hipSetDevice(ctx.device_id));
+    CUDA_CALL(hipStreamSynchronize(static_cast<hipStream_t>(stream)));
   }

   /** NOTE: If the backend is PyTorch, we will use PyTorch's stream management,
...
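The SyncStreamFromTo port above is the usual HIP event-ordering pattern: record an event on the producer stream and make the consumer stream wait on it, without blocking the host. A minimal standalone sketch (the helper name WaitOnOtherStream is made up; the diff uses hipEventCreate, while hipEventDisableTiming here is just a common lightweight variant when only ordering matters):

#include <hip/hip_runtime.h>

// Make work submitted later to `consumer` wait for all work already submitted
// to `producer`. Returns the first failing HIP status.
hipError_t WaitOnOtherStream(hipStream_t producer, hipStream_t consumer) {
  hipEvent_t evt;
  hipError_t err = hipEventCreateWithFlags(&evt, hipEventDisableTiming);
  if (err != hipSuccess) return err;
  err = hipEventRecord(evt, producer);
  if (err == hipSuccess) err = hipStreamWaitEvent(consumer, evt, /*flags=*/0);
  // The event can be destroyed right away; the queued dependency remains valid.
  hipError_t destroy_err = hipEventDestroy(evt);
  return err != hipSuccess ? err : destroy_err;
}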
@@ -227,10 +229,10 @@ class CUDADeviceAPI final : public DeviceAPI {
   void SetStream(DGLContext ctx, DGLStreamHandle stream) final {}

   DGLStreamHandle GetStream() const final {
-    return static_cast<DGLStreamHandle>(getCurrentCUDAStream());
+    return static_cast<DGLStreamHandle>(getCurrentHIPStreamMasqueradingAsCUDA());
   }

-  /** NOTE: cudaHostRegister can be called from an arbitrary GPU device,
+  /** NOTE: hipHostRegister can be called from an arbitrary GPU device,
    * so we don't need to specify a ctx.
    * The pinned memory can be seen by all CUDA contexts,
    * not just the one that performed the allocation
...
@@ -244,13 +246,13 @@ class CUDADeviceAPI final : public DeviceAPI {
     if (tensor_dispatcher->IsAvailable()) {
       tensor_dispatcher->CUDAHostAllocatorEmptyCache();
     }
-    CUDA_CALL(cudaHostRegister(ptr, nbytes, cudaHostRegisterDefault));
+    CUDA_CALL(hipHostRegister(ptr, nbytes, hipHostRegisterDefault));
     return true;
   }

   void UnpinData(void* ptr) {
     if (ptr == nullptr) return;
-    CUDA_CALL(cudaHostUnregister(ptr));
+    CUDA_CALL(hipHostUnregister(ptr));
   }

   void* AllocPinnedDataSpace(
...
...
@@ -276,33 +278,33 @@ class CUDADeviceAPI final : public DeviceAPI {
// can't be a pinned tensor if CUDA context is unavailable.
// can't be a pinned tensor if CUDA context is unavailable.
if
(
!
is_available_
)
return
false
;
if
(
!
is_available_
)
return
false
;
cuda
PointerAttribute
s
attr
;
hip
PointerAttribute
_t
attr
;
cuda
Error_t
status
=
cuda
PointerGetAttributes
(
&
attr
,
ptr
);
hip
Error_t
status
=
hip
PointerGetAttributes
(
&
attr
,
ptr
);
bool
result
=
false
;
bool
result
=
false
;
switch
(
status
)
{
switch
(
status
)
{
case
cuda
ErrorInvalidValue
:
case
hip
ErrorInvalidValue
:
// might be a normal CPU tensor in CUDA 10.2-
// might be a normal CPU tensor in CUDA 10.2-
cuda
GetLastError
();
// clear error
hip
GetLastError
();
// clear error
break
;
break
;
case
cuda
Success
:
case
hip
Success
:
result
=
(
attr
.
type
==
cuda
MemoryTypeHost
);
result
=
(
attr
.
type
==
hip
MemoryTypeHost
);
break
;
break
;
case
cuda
ErrorInitializationError
:
case
hip
ErrorInitializationError
:
case
cuda
ErrorNoDevice
:
case
hip
ErrorNoDevice
:
case
cuda
ErrorInsufficientDriver
:
case
hip
ErrorInsufficientDriver
:
case
cuda
ErrorInvalidDevice
:
case
hip
ErrorInvalidDevice
:
// We don't want to fail in these particular cases since this function
// We don't want to fail in these particular cases since this function
// can be called when users only want to run on CPU even if CUDA API is
// can be called when users only want to run on CPU even if CUDA API is
// enabled, or in a forked subprocess where CUDA context cannot be
// enabled, or in a forked subprocess where CUDA context cannot be
// initialized. So we just mark the CUDA context to unavailable and
// initialized. So we just mark the CUDA context to unavailable and
// return.
// return.
is_available_
=
false
;
is_available_
=
false
;
cuda
GetLastError
();
// clear error
hip
GetLastError
();
// clear error
break
;
break
;
default:
default:
LOG
(
FATAL
)
<<
"error while determining memory status: "
LOG
(
FATAL
)
<<
"error while determining memory status: "
<<
cuda
GetErrorString
(
status
);
<<
hip
GetErrorString
(
status
);
break
;
break
;
}
}
...
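The pinned-memory probe above leans on hipPointerGetAttributes classifying the pointer. A trimmed-down sketch of the same check (the helper name is hypothetical, and the `type` field name follows the code above; the exact hipPointerAttribute_t layout varies across ROCm versions):

#include <hip/hip_runtime.h>

// Returns true only when `ptr` is host memory that HIP knows about
// (registered/pinned); plain malloc'ed memory typically reports an error.
bool IsPinnedHostPointer(const void* ptr) {
  hipPointerAttribute_t attr;
  hipError_t status = hipPointerGetAttributes(&attr, ptr);
  if (status != hipSuccess) {
    (void)hipGetLastError();  // clear the sticky error, mirroring the code above
    return false;
  }
  return attr.type == hipMemoryTypeHost;  // field name as used in the diff
}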
@@ -316,7 +318,7 @@ class CUDADeviceAPI final : public DeviceAPI {
     TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global();
     if (tensor_dispatcher->IsAvailable())
       return tensor_dispatcher->CUDAAllocWorkspace(
-          size, getCurrentCUDAStream());
+          size, getCurrentHIPStreamMasqueradingAsCUDA());

     return CUDAThreadEntry::ThreadLocal()->pool.AllocWorkspace(ctx, size);
   }
...
@@ -338,13 +340,13 @@ class CUDADeviceAPI final : public DeviceAPI {
  private:
   static void GPUCopy(
-      const void* from, void* to, size_t size, cudaMemcpyKind kind,
-      cudaStream_t stream) {
-    CUDA_CALL(cudaMemcpyAsync(to, from, size, kind, stream));
-    if (stream == 0 && kind == cudaMemcpyDeviceToHost) {
+      const void* from, void* to, size_t size, hipMemcpyKind kind,
+      hipStream_t stream) {
+    CUDA_CALL(hipMemcpyAsync(to, from, size, kind, stream));
+    if (stream == 0 && kind == hipMemcpyDeviceToHost) {
       // only wait for the copy, when it's on the default stream, and it's to
       // host memory
-      CUDA_CALL(cudaStreamSynchronize(stream));
+      CUDA_CALL(hipStreamSynchronize(stream));
     }
   }
...
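The extra hipStreamSynchronize in GPUCopy guards the case where the caller reads host memory immediately after a default-stream device-to-host copy. A standalone version of that pattern (function name and sizes are placeholders, not DGL's helper):

#include <hip/hip_runtime.h>

// Copy device data back to host memory on a given stream. When the copy runs
// on the legacy default stream (0) we block until it finishes, because the
// caller is expected to read `host` right afterwards; on explicit streams the
// caller owns the synchronization.
hipError_t CopyDeviceToHost(void* host, const void* dev, size_t nbytes,
                            hipStream_t stream) {
  hipError_t err =
      hipMemcpyAsync(host, dev, nbytes, hipMemcpyDeviceToHost, stream);
  if (err != hipSuccess) return err;
  if (stream == 0) err = hipStreamSynchronize(stream);
  return err;
}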
@@ -359,7 +361,7 @@ CUDAThreadEntry* CUDAThreadEntry::ThreadLocal() {
   return CUDAThreadStore::Get();
 }

-cudaStream_t getCurrentCUDAStream() {
+hipStream_t getCurrentHIPStreamMasqueradingAsCUDA() {
   TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global();
   if (tensor_dispatcher->IsAvailable())
     return tensor_dispatcher->CUDAGetCurrentStream();
...
src/runtime/cuda/cuda_hashtable.cuh
View file @ 6ac701f8
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
  * Copyright (c) 2021 by Contributors
  * @file runtime/cuda/cuda_device_common.cuh
...
@@ -10,7 +12,7 @@
 #include <dgl/runtime/c_runtime_api.h>

 #include "cuda_common.h"
-#include "cuda_runtime.h"
+#include <hip/hip_runtime.h>

 namespace dgl {
 namespace runtime {
...
@@ -228,7 +230,7 @@ class OrderedHashTable {
    * @param stream The stream to use for initializing the hashtable.
    */
   OrderedHashTable(
-      const size_t size, DGLContext ctx, cudaStream_t stream,
+      const size_t size, DGLContext ctx, hipStream_t stream,
       const int scale = kDefaultScale);

   /**
...
@@ -252,7 +254,7 @@ class OrderedHashTable {
    */
   void FillWithDuplicates(
       const IdType* const input, const size_t num_input, IdType* const unique,
-      int64_t* const num_unique, cudaStream_t stream);
+      int64_t* const num_unique, hipStream_t stream);

   /**
    * @brief Fill the hashtable with an array of unique keys.
...
@@ -262,7 +264,7 @@ class OrderedHashTable {
    * @param stream The stream to perform operations on.
    */
   void FillWithUnique(
-      const IdType* const input, const size_t num_input, cudaStream_t stream);
+      const IdType* const input, const size_t num_input, hipStream_t stream);

   /**
    * @brief Get a verison of the hashtable usable from device functions.
...
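Functionally, FillWithDuplicates deduplicates an id array on the GPU, writing out the unique ids and their count. A host-side analogue (names and ordering details are illustrative only; this is not how the CUDA/HIP table is implemented) can help read the signatures in the header above:

#include <cstdint>
#include <unordered_map>
#include <vector>

// Host-side analogue of FillWithDuplicates: deduplicate `input`, return the
// unique ids, and record for each id its position in the unique list.
template <typename IdType>
std::vector<IdType> FillWithDuplicatesHost(
    const std::vector<IdType>& input,
    std::unordered_map<IdType, int64_t>* mapping) {
  std::vector<IdType> unique;
  for (IdType id : input) {
    // emplace() returns {iterator, inserted}; only first occurrences are kept.
    if (mapping->emplace(id, static_cast<int64_t>(unique.size())).second) {
      unique.push_back(id);
    }
  }
  return unique;
}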
src/runtime/cuda/cuda_hashtable.cu → src/runtime/cuda/cuda_hashtable.hip
View file @ 6ac701f8
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
  * Copyright (c) 2021 by Contributors
  * @file runtime/cuda/cuda_device_common.cuh
...
@@ -5,7 +7,7 @@
  */

 #include <cassert>
-#include <cub/cub.cuh>  // NOLINT
+#include <hipcub/hipcub.hpp>  // NOLINT

 #include "../../array/cuda/atomic.cuh"
 #include "cuda_common.h"
...
@@ -54,7 +56,9 @@ class MutableDeviceOrderedHashTable : public DeviceOrderedHashTable<IdType> {
    * @return The mapping.
    */
   inline __device__ Iterator Search(const IdType id) {
-    const IdType pos = SearchForPosition(id);
+    // const IdType pos = SearchForPosition(id);
+    const IdType pos = DeviceOrderedHashTable<IdType>::SearchForPosition(id);

     return GetMutable(pos);
   }
...
@@ -95,12 +99,16 @@ class MutableDeviceOrderedHashTable : public DeviceOrderedHashTable<IdType> {
    * @return An iterator to inserted mapping.
    */
   inline __device__ Iterator Insert(const IdType id, const size_t index) {
-    size_t pos = Hash(id);
+    // size_t pos = Hash(id);
+    size_t pos = DeviceOrderedHashTable<IdType>::Hash(id);

     // linearly scan for an empty slot or matching entry
     IdType delta = 1;
     while (!AttemptInsertAt(pos, id, index)) {
-      pos = Hash(pos + delta);
+      // pos = Hash(pos + delta);
+      pos = DeviceOrderedHashTable<IdType>::Hash(pos+delta);
       delta += 1;
     }
...
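The only functional change in the two hunks above is qualifying calls into the dependent base class. That is standard C++ two-phase lookup: unqualified names are not looked up in a base class that depends on a template parameter, and clang-based hipcc enforces this strictly, which is presumably why the port adds the qualification. A tiny reproduction in plain C++:

// Dependent-base name lookup: `Hash` lives in Base<T>, which depends on T.
template <typename T>
struct Base {
  static T Hash(T x) { return x % 97; }
};

template <typename T>
struct Derived : Base<T> {
  T Probe(T x) {
    // return Hash(x);         // error: 'Hash' was not declared in this scope
    return Base<T>::Hash(x);   // OK: qualified lookup into the dependent base
    // (this->Hash(x) would also work)
  }
};

int main() {
  Derived<long> d;
  return static_cast<int>(d.Probe(123));
}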
@@ -246,7 +254,7 @@ __global__ void count_hashmap(
     DeviceOrderedHashTable<IdType> table, IdType* const num_unique) {
   assert(BLOCK_SIZE == blockDim.x);

-  using BlockReduce = typename cub::BlockReduce<IdType, BLOCK_SIZE>;
+  using BlockReduce = typename hipcub::BlockReduce<IdType, BLOCK_SIZE>;
   using Mapping = typename DeviceOrderedHashTable<IdType>::Mapping;

   const size_t block_start = TILE_SIZE * blockIdx.x;
...
@@ -300,7 +308,7 @@ __global__ void compact_hashmap(
   assert(BLOCK_SIZE == blockDim.x);

   using FlagType = uint16_t;
-  using BlockScan = typename cub::BlockScan<FlagType, BLOCK_SIZE>;
+  using BlockScan = typename hipcub::BlockScan<FlagType, BLOCK_SIZE>;
   using Mapping = typename DeviceOrderedHashTable<IdType>::Mapping;

   constexpr const int32_t VALS_PER_THREAD = TILE_SIZE / BLOCK_SIZE;
...
@@ -359,7 +367,7 @@ DeviceOrderedHashTable<IdType> OrderedHashTable<IdType>::DeviceHandle() const {

 template <typename IdType>
 OrderedHashTable<IdType>::OrderedHashTable(
-    const size_t size, DGLContext ctx, cudaStream_t stream, const int scale)
+    const size_t size, DGLContext ctx, hipStream_t stream, const int scale)
     : table_(nullptr), size_(TableSize(size, scale)), ctx_(ctx) {
   // make sure we will at least as many buckets as items.
   CHECK_GT(scale, 0);
...
@@ -368,7 +376,7 @@ OrderedHashTable<IdType>::OrderedHashTable(
   table_ = static_cast<Mapping*>(
       device->AllocWorkspace(ctx_, sizeof(Mapping) * size_));

-  CUDA_CALL(cudaMemsetAsync(
+  CUDA_CALL(hipMemsetAsync(
       table_, DeviceOrderedHashTable<IdType>::kEmptyKey,
       sizeof(Mapping) * size_, stream));
 }
...
@@ -382,7 +390,7 @@ OrderedHashTable<IdType>::~OrderedHashTable() {
 template <typename IdType>
 void OrderedHashTable<IdType>::FillWithDuplicates(
     const IdType* const input, const size_t num_input, IdType* const unique,
-    int64_t* const num_unique, cudaStream_t stream) {
+    int64_t* const num_unique, hipStream_t stream) {
   auto device = runtime::DeviceAPI::Get(ctx_);

   const int64_t num_tiles = (num_input + TILE_SIZE - 1) / TILE_SIZE;
...
@@ -404,12 +412,12 @@ void OrderedHashTable<IdType>::FillWithDuplicates(
       input, num_input, device_table, item_prefix);

   size_t workspace_bytes;
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
       nullptr, workspace_bytes, static_cast<IdType*>(nullptr),
       static_cast<IdType*>(nullptr), grid.x + 1, stream));
   void* workspace = device->AllocWorkspace(ctx_, workspace_bytes);

-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
       workspace, workspace_bytes, item_prefix, item_prefix, grid.x + 1,
       stream));
   device->FreeWorkspace(ctx_, workspace);
...
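The two ExclusiveSum calls above follow hipcub's (and CUB's) two-phase convention: the first call with a null temporary buffer only reports the required workspace size, the second call performs the scan. A self-contained sketch of that pattern (array contents are arbitrary; error codes are ignored for brevity):

#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>
#include <cstdio>
#include <vector>

int main() {
  const int n = 8;
  std::vector<int> h_in(n, 1), h_out(n, 0);
  int *d_in = nullptr, *d_out = nullptr;
  hipMalloc(reinterpret_cast<void**>(&d_in), n * sizeof(int));
  hipMalloc(reinterpret_cast<void**>(&d_out), n * sizeof(int));
  hipMemcpy(d_in, h_in.data(), n * sizeof(int), hipMemcpyHostToDevice);

  // Pass 1: query how much temporary storage the scan needs.
  void* d_temp = nullptr;
  size_t temp_bytes = 0;
  hipcub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, d_in, d_out, n);

  // Pass 2: run the scan with the allocated workspace.
  hipMalloc(&d_temp, temp_bytes);
  hipcub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, d_in, d_out, n);

  hipMemcpy(h_out.data(), d_out, n * sizeof(int), hipMemcpyDeviceToHost);
  std::printf("out[n-1] = %d\n", h_out[n - 1]);  // expect n-1 for all-ones input

  hipFree(d_temp);
  hipFree(d_in);
  hipFree(d_out);
  return 0;
}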
@@ -422,7 +430,7 @@ void OrderedHashTable<IdType>::FillWithDuplicates(
 template <typename IdType>
 void OrderedHashTable<IdType>::FillWithUnique(
-    const IdType* const input, const size_t num_input, cudaStream_t stream) {
+    const IdType* const input, const size_t num_input, hipStream_t stream) {
   const int64_t num_tiles = (num_input + TILE_SIZE - 1) / TILE_SIZE;

   const dim3 grid(num_tiles);
...
src/runtime/cuda/gpu_cache.cu → src/runtime/cuda/gpu_cache.hip
View file @ 6ac701f8
+// !!! This is a file automatically generated by hipify!!!
 /*!
  * Copyright (c) 2022 by Contributors
  *
...
@@ -20,7 +21,7 @@
 #ifndef DGL_RUNTIME_CUDA_GPU_CACHE_H_
 #define DGL_RUNTIME_CUDA_GPU_CACHE_H_

-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
 #include <dgl/array.h>
 #include <dgl/aten/array_ops.h>
 #include <dgl/packed_func_ext.h>
...
@@ -31,7 +32,7 @@
 #include <nv_gpu_cache.hpp>

-#include "../../runtime/cuda/cuda_common.h"
+#include "cuda_common.h"

 namespace dgl {
 namespace runtime {
...
@@ -55,12 +56,12 @@ class GpuCache : public runtime::Object {
       : num_feats(num_feats),
         cache(std::make_unique<gpu_cache_t>(
             (num_items + bucket_size - 1) / bucket_size, num_feats)) {
-    CUDA_CALL(cudaGetDevice(&cuda_device));
+    CUDA_CALL(hipGetDevice(&cuda_device));
   }

   std::tuple<NDArray, IdArray, IdArray> Query(IdArray keys) {
     const auto &ctx = keys->ctx;
-    cudaStream_t stream = dgl::runtime::getCurrentCUDAStream();
+    hipStream_t stream = dgl::runtime::getCurrentHIPStreamMasqueradingAsCUDA();
     auto device = dgl::runtime::DeviceAPI::Get(ctx);
     CHECK_EQ(ctx.device_type, kDGLCUDA)
         << "The keys should be on a CUDA device";
...
@@ -94,7 +95,7 @@ class GpuCache : public runtime::Object {
   }

   void Replace(IdArray keys, NDArray values) {
-    cudaStream_t stream = dgl::runtime::getCurrentCUDAStream();
+    hipStream_t stream = dgl::runtime::getCurrentHIPStreamMasqueradingAsCUDA();
     CHECK_EQ(keys->ctx.device_type, kDGLCUDA)
         << "The keys should be on a CUDA device";
     CHECK_EQ(keys->ctx.device_id, cuda_device)
...
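One detail worth noting in GpuCache is that the constructor records the device active at construction time (hipGetDevice) and Query/Replace later check that the keys live on that same device. A generic sketch of that device-affinity pattern, with an entirely hypothetical class name (not DGL's GpuCache):

#include <hip/hip_runtime.h>
#include <stdexcept>

// Hypothetical helper: remember which device an object was created on and
// refuse to operate on data that lives on a different GPU.
class DeviceBound {
 public:
  DeviceBound() {
    if (hipGetDevice(&device_id_) != hipSuccess) device_id_ = -1;
  }
  void CheckSameDevice(int other_device_id) const {
    if (other_device_id != device_id_) {
      throw std::runtime_error("input is on a different GPU than this object");
    }
  }
 private:
  int device_id_ = -1;
};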
src/runtime/module.cc
View file @ 6ac701f8
...
@@ -121,7 +121,7 @@ bool RuntimeEnabled(const std::string& target) {
   } else if (target.length() >= 5 && target.substr(0, 5) == "nvptx") {
     f_name = "device_api.cuda";
   } else if (target.length() >= 4 && target.substr(0, 4) == "rocm") {
-    f_name = "device_api.rocm";
+    f_name = "device_api.cuda";
   } else if (target.length() >= 4 && target.substr(0, 4) == "llvm") {
     const PackedFunc* pf =
         runtime::Registry::Get("codegen.llvm_target_enabled");
...
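RuntimeEnabled boils down to mapping a target prefix to the name under which a device API was registered and then checking the registry. The notable part of this hunk is that after the port the "rocm" target also resolves to "device_api.cuda", i.e. the hipified build registers its device API under the CUDA name. A rough sketch of just the prefix mapping (the function name is hypothetical and the other targets handled elsewhere in RuntimeEnabled are omitted):

#include <string>

// Prefix-to-key mapping mirroring the hunk above: after this commit both the
// "nvptx" and "rocm" targets resolve to the same registered device API name.
std::string DeviceApiKeyForTarget(const std::string& target) {
  if (target.rfind("nvptx", 0) == 0) return "device_api.cuda";
  if (target.rfind("rocm", 0) == 0) return "device_api.cuda";
  return "";  // other targets are not covered by this sketch
}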
src/runtime/ndarray.cc
View file @ 6ac701f8
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2017-2022 by Contributors
  * @file ndarray.cc
...
@@ -26,7 +27,7 @@ constexpr DGLDataType DGLDataTypeTraits<uint64_t>::dtype;
 #ifdef DGL_USE_CUDA
 constexpr DGLDataType DGLDataTypeTraits<__half>::dtype;
 #if BF16_ENABLED
-constexpr DGLDataType DGLDataTypeTraits<__nv_bfloat16>::dtype;
+constexpr DGLDataType DGLDataTypeTraits<__hip_bfloat16>::dtype;
 #endif  // BF16_ENABLED
 #endif  // DGL_USE_CUDA
 constexpr DGLDataType DGLDataTypeTraits<float>::dtype;
...
@@ -222,7 +223,7 @@ void NDArray::RecordedCopyFromTo(
   CHECK(from->ctx.device_type != to->ctx.device_type)
       << "Recoding event is only called for the copy between CPU and GPU.";
-  CHECK(from->ctx.device_type == kDGLCUDA || to->ctx.device_type == kDGLCUDA)
+  CHECK(from->ctx.device_type == kDGLCUDA || to->ctx.device_type == kDGLROCM)
       << "At least one CUDA ctx needs to be involved.";
   DeviceAPI::Get(kDGLCUDA)->RecordedCopyDataFromTo(
...
@@ -262,7 +263,7 @@ void NDArray::PinContainer(NDArray::Container* ptr) {

 void NDArray::UnpinContainer(NDArray::Container* ptr) {
   auto container_is_pinned = IsContainerPinned(ptr);
   // The tensor may be pinned outside of DGL via a different CUDA API,
-  // so we cannot unpin it with cudaHostUnregister.
+  // so we cannot unpin it with hipHostUnregister.
   CHECK(ptr->pinned_by_dgl_ || !container_is_pinned)
       << "Cannot unpin a tensor that is pinned outside of DGL.";

   // 1. not pinned, do nothing
...