OpenDAS / dgl · Commit 6ac701f8
Authored Sep 13, 2024 by sangwzh
Parent: 1547bd93

    update src and graphbolt code

This page shows 16 of the commit's 116 changed files, with 272 additions and 245 deletions. In the reconstructed hunks below, removed lines are prefixed with "-", added lines with "+", and unchanged context lines carry no prefix.
Changed files on this page:

  src/graph/transform/cuda/knn.hip        +24  -22
  src/graph/traversal.cc                   +2   -1
  src/graph/unit_graph.cc                  +3   -2
  src/partition/cuda/partition_op.hip     +21  -19
  src/partition/ndarray_partition.cc       +6   -6
  src/random/continuous_seed.h            +13  -12
  src/rpc/rpc.cc                           +2   -1
  src/rpc/rpc.h                            +4   -3
  src/runtime/c_runtime_api.cc             +6   -3
  src/runtime/cuda/cuda_common.h          +67  -66
  src/runtime/cuda/cuda_device_api.cc     +87  -85
  src/runtime/cuda/cuda_hashtable.cuh      +6   -4
  src/runtime/cuda/cuda_hashtable.hip     +20  -12
  src/runtime/cuda/gpu_cache.hip           +6   -5
  src/runtime/module.cc                    +1   -1
  src/runtime/ndarray.cc                   +4   -3
src/graph/transform/cuda/knn.cu → src/graph/transform/cuda/knn.hip
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2020 by Contributors
* @file graph/transform/cuda/knn.cu
* @brief k-nearest-neighbor (KNN) implementation (cuda)
*/
-#include <curand_kernel.h>
+#include <hiprand/hiprand_kernel.h>
#include <dgl/array.h>
#include <dgl/random.h>
#include <dgl/runtime/device_api.h>
#include <algorithm>
-#include <cub/cub.cuh>        // NOLINT
+#include <hipcub/hipcub.hpp>  // NOLINT
#include <limits>
#include <string>
#include <type_traits>
...
...
@@ -467,7 +469,7 @@ void BruteForceKNNCuda(
const NDArray& data_points, const IdArray& data_offsets,
const NDArray& query_points, const IdArray& query_offsets, const int k,
IdArray result) {
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const auto& ctx = data_points->ctx;
auto device = runtime::DeviceAPI::Get(ctx);
const int64_t batch_size = data_offsets->shape[0] - 1;
...
...
@@ -512,7 +514,7 @@ void BruteForceKNNSharedCuda(
const NDArray& data_points, const IdArray& data_offsets,
const NDArray& query_points, const IdArray& query_offsets, const int k,
IdArray result) {
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const auto& ctx = data_points->ctx;
auto device = runtime::DeviceAPI::Get(ctx);
const int64_t batch_size = data_offsets->shape[0] - 1;
...
...
@@ -528,8 +530,8 @@ void BruteForceKNNSharedCuda(
// get max shared memory per block in bytes
// determine block size according to this value
int max_sharedmem_per_block = 0;
-  CUDA_CALL(cudaDeviceGetAttribute(
-      &max_sharedmem_per_block, cudaDevAttrMaxSharedMemoryPerBlock,
+  CUDA_CALL(hipDeviceGetAttribute(
+      &max_sharedmem_per_block, hipDeviceAttributeMaxSharedMemoryPerBlock,
ctx.device_id));
const int64_t single_shared_mem = static_cast<int64_t>(Pow2Align<size_t>(
(k + 2 * feature_size) * sizeof(FloatType) + k * sizeof(IdType),
...
...
@@ -552,17 +554,17 @@ void BruteForceKNNSharedCuda(
GetNumBlockPerSegment, temp_num_blocks, temp_block_size, 0, stream,
query_offsets_data, num_block_per_segment, batch_size, block_size);
size_t prefix_temp_size = 0;
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
nullptr, prefix_temp_size, num_block_per_segment, num_block_prefixsum,
batch_size, stream));
void* prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size);
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
prefix_temp, prefix_temp_size, num_block_per_segment, num_block_prefixsum,
batch_size, stream));
device->FreeWorkspace(ctx, prefix_temp);
// wait for results
-  CUDA_CALL(cudaStreamSynchronize(stream));
+  CUDA_CALL(hipStreamSynchronize(stream));
int64_t num_blocks = 0, final_elem = 0,
copyoffset = (batch_size - 1) * sizeof(IdType);
...
...
@@ -603,10 +605,10 @@ void BruteForceKNNSharedCuda(
/** @brief Setup rng state for nn-descent */
__global__ void SetupRngKernel(
-    curandState* states, const uint64_t seed, const size_t n) {
+    hiprandState_t* states, const uint64_t seed, const size_t n) {
size_t id = blockIdx.x * blockDim.x + threadIdx.x;
if (id < n) {
-    curand_init(seed, id, 0, states + id);
+    hiprand_init(seed, id, 0, states + id);
}
}
...
...
@@ -622,8 +624,8 @@ __global__ void RandomInitNeighborsKernel(
const IdType point_idx = blockIdx.x * blockDim.x + threadIdx.x;
IdType batch_idx = 0;
if (point_idx >= offsets[batch_size]) return;
-  curandState state;
-  curand_init(seed, point_idx, 0, &state);
+  hiprandState_t state;
+  hiprand_init(seed, point_idx, 0, &state);
// find the segment location in the input batch
for (IdType b = 0; b < batch_size + 1; ++b) {
...
...
@@ -646,7 +648,7 @@ __global__ void RandomInitNeighborsKernel(
current_central_nodes[i] = point_idx;
}
for (IdType i = k; i < segment_size; ++i) {
-    const IdType j = static_cast<IdType>(curand(&state) % (i + 1));
+    const IdType j = static_cast<IdType>(hiprand(&state) % (i + 1));
if (j < k) current_neighbors[j] = i + segment_start;
}
...
...
@@ -674,8 +676,8 @@ __global__ void FindCandidatesKernel(
const IdType point_idx = blockIdx.x * blockDim.x + threadIdx.x;
IdType batch_idx = 0;
if (point_idx >= offsets[batch_size]) return;
-  curandState state;
-  curand_init(seed, point_idx, 0, &state);
+  hiprandState_t state;
+  hiprand_init(seed, point_idx, 0, &state);
// find the segment location in the input batch
for (IdType b = 0; b < batch_size + 1; ++b) {
...
...
@@ -711,7 +713,7 @@ __global__ void FindCandidatesKernel(
if (curr_num < num_candidates) {
candidate_data[curr_num] = candidate;
} else {
-        IdType pos = static_cast<IdType>(curand(&state) % (curr_num + 1));
+        IdType pos = static_cast<IdType>(hiprand(&state) % (curr_num + 1));
if (pos < num_candidates) candidate_data[pos] = candidate;
}
++candidate_array[0];
...
...
@@ -732,7 +734,7 @@ __global__ void FindCandidatesKernel(
if (curr_num < num_candidates) {
candidate_data[curr_num] = reverse_candidate;
} else {
-        IdType pos = static_cast<IdType>(curand(&state) % (curr_num + 1));
+        IdType pos = static_cast<IdType>(hiprand(&state) % (curr_num + 1));
if (pos < num_candidates) candidate_data[pos] = reverse_candidate;
}
++candidate_array[0];
...
...
@@ -873,7 +875,7 @@ template <DGLDeviceType XPU, typename FloatType, typename IdType>
void NNDescent(
const NDArray& points, const IdArray& offsets, IdArray result, const int k,
const int num_iters, const int num_candidates, const double delta) {
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const auto& ctx = points->ctx;
auto device = runtime::DeviceAPI::Get(ctx);
const int64_t num_nodes = points->shape[0];
...
...
@@ -887,7 +889,7 @@ void NNDescent(
uint64_t seed;
int warp_size = 0;
CUDA_CALL(
-      cudaDeviceGetAttribute(&warp_size, cudaDevAttrWarpSize, ctx.device_id));
+      hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, ctx.device_id));
// We don't need large block sizes, since there's not much inter-thread
// communication
int64_t block_size = warp_size;
...
...
@@ -911,7 +913,7 @@ void NNDescent(
IdType* total_num_updates_d =
static_cast<IdType*>(device->AllocWorkspace(ctx, sizeof(IdType)));
-  CUDA_CALL(cub::DeviceReduce::Sum(
+  CUDA_CALL(hipcub::DeviceReduce::Sum(
nullptr, sum_temp_size, num_updates, total_num_updates_d, num_nodes,
stream));
IdType* sum_temp_storage =
...
...
@@ -942,7 +944,7 @@ void NNDescent(
feature_size);
total_num_updates = 0;
-  CUDA_CALL(cub::DeviceReduce::Sum(
+  CUDA_CALL(hipcub::DeviceReduce::Sum(
sum_temp_storage, sum_temp_size, num_updates, total_num_updates_d,
num_nodes, stream));
device->CopyDataFromTo(
...
...
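The recurring edit in this file swaps the cuRAND device API for hipRAND: curandState becomes hiprandState_t and the curand_* calls become hiprand_*. A minimal standalone kernel illustrating that pattern (an illustrative sketch, not DGL code):

// Illustrative sketch of the hipRAND device API as used in knn.hip above.
#include <hip/hip_runtime.h>
#include <hiprand/hiprand_kernel.h>

__global__ void FillUniform(float* out, uint64_t seed, size_t n) {
  const size_t id = blockIdx.x * blockDim.x + threadIdx.x;
  if (id >= n) return;
  hiprandState_t state;                           // was: curandState
  hiprand_init(seed, id, /*offset=*/0, &state);   // was: curand_init
  out[id] = hiprand_uniform(&state);              // was: curand_uniform
}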
src/graph/traversal.cc
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2018 by Contributors
* @file graph/traversal.cc
* @brief Graph traversal implementation
*/
#include "
./
traversal.h"
#include "traversal.h"
#include <dgl/packed_func_ext.h>
...
...
src/graph/unit_graph.cc
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2019 by Contributors
* @file graph/unit_graph.cc
* @brief UnitGraph graph implementation
*/
#include "
./
unit_graph.h"
#include "unit_graph.h"
#include <dgl/array.h>
#include <dgl/base_heterograph.h>
...
...
@@ -11,7 +12,7 @@
#include <dgl/lazy.h>
#include "../c_api_common.h"
#include "
./
serialize/dglstream.h"
#include "serialize/dglstream.h"
namespace
dgl
{
...
...
src/partition/cuda/partition_op.cu → src/partition/cuda/partition_op.hip
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2021 by Contributors
* @file ndarray_partition.h
...
...
@@ -6,7 +8,7 @@
#include <dgl/runtime/device_api.h>
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
#include "../../runtime/cuda/cuda_common.h"
#include "../../runtime/workspace.h"
...
...
@@ -239,7 +241,7 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRemainder(
const auto& ctx = in_idx->ctx;
auto device = DeviceAPI::Get(ctx);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const int64_t num_in = in_idx->shape[0];
...
...
@@ -263,7 +265,7 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRemainder(
}
const int64_t part_bits =
-      static_cast<int64_t>(std::ceil(std::log2(num_parts)));
+      static_cast<int64_t>(::ceil(std::log2(num_parts)));
// First, generate a mapping of indexes to processors
Workspace<IdType> proc_id_in(device, ctx, num_in);
...
...
@@ -295,13 +297,13 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRemainder(
IdArray perm_in = aten::Range(0, num_in, sizeof(IdType) * 8, ctx);
size_t sort_workspace_size;
-  CUDA_CALL(cub::DeviceRadixSort::SortPairs(
+  CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
nullptr, sort_workspace_size, proc_id_in.get(), proc_id_out.get(),
static_cast<IdType*>(perm_in->data), perm_out, num_in, 0, part_bits,
stream));
Workspace<void> sort_workspace(device, ctx, sort_workspace_size);
-  CUDA_CALL(cub::DeviceRadixSort::SortPairs(
+  CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
sort_workspace.get(), sort_workspace_size, proc_id_in.get(),
proc_id_out.get(), static_cast<IdType*>(perm_in->data), perm_out,
num_in, 0, part_bits, stream));
...
...
@@ -317,7 +319,7 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRemainder(
static_assert(
sizeof(AtomicCount) == sizeof(*out_counts),
"AtomicCount must be the same width as int64_t for atomicAdd "
"in cub::DeviceHistogram::HistogramEven() to work"
);
"in
hip
cub::DeviceHistogram::HistogramEven() to work");
// TODO(dlasalle): Once https://github.com/NVIDIA/cub/pull/287 is merged,
// add a compile time check against the cub version to allow
...
...
@@ -327,14 +329,14 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRemainder(
"value of int.";
size_t hist_workspace_size;
-  CUDA_CALL(cub::DeviceHistogram::HistogramEven(
+  CUDA_CALL(hipcub::DeviceHistogram::HistogramEven(
nullptr, hist_workspace_size, proc_id_out.get(),
reinterpret_cast<AtomicCount*>(out_counts), num_parts + 1,
static_cast<IdType>(0), static_cast<IdType>(num_parts),
static_cast<int>(num_in), stream));
Workspace<void> hist_workspace(device, ctx, hist_workspace_size);
-  CUDA_CALL(cub::DeviceHistogram::HistogramEven(
+  CUDA_CALL(hipcub::DeviceHistogram::HistogramEven(
hist_workspace.get(), hist_workspace_size, proc_id_out.get(),
reinterpret_cast<AtomicCount*>(out_counts), num_parts + 1,
static_cast<IdType>(0), static_cast<IdType>(num_parts),
...
...
@@ -352,7 +354,7 @@ template std::pair<IdArray, IdArray> GeneratePermutationFromRemainder<
template <DGLDeviceType XPU, typename IdType>
IdArray MapToLocalFromRemainder(const int num_parts, IdArray global_idx) {
const auto& ctx = global_idx->ctx;
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
if (num_parts > 1) {
IdArray local_idx =
...
...
@@ -387,7 +389,7 @@ IdArray MapToGlobalFromRemainder(
<< num_parts;
const auto& ctx = local_idx->ctx;
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
if (num_parts > 1) {
IdArray global_idx =
...
...
@@ -423,7 +425,7 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRange(
const auto& ctx = in_idx->ctx;
auto device = DeviceAPI::Get(ctx);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const int64_t num_in = in_idx->shape[0];
...
...
@@ -447,7 +449,7 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRange(
}
const int64_t part_bits =
-      static_cast<int64_t>(std::ceil(std::log2(num_parts)));
+      static_cast<int64_t>(::ceil(std::log2(num_parts)));
// First, generate a mapping of indexes to processors
Workspace<IdType> proc_id_in(device, ctx, num_in);
...
...
@@ -470,13 +472,13 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRange(
IdArray perm_in = aten::Range(0, num_in, sizeof(IdType) * 8, ctx);
size_t sort_workspace_size;
-  CUDA_CALL(cub::DeviceRadixSort::SortPairs(
+  CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
nullptr, sort_workspace_size, proc_id_in.get(), proc_id_out.get(),
static_cast<IdType*>(perm_in->data), perm_out, num_in, 0, part_bits,
stream));
Workspace<void> sort_workspace(device, ctx, sort_workspace_size);
-  CUDA_CALL(cub::DeviceRadixSort::SortPairs(
+  CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
sort_workspace.get(), sort_workspace_size, proc_id_in.get(),
proc_id_out.get(), static_cast<IdType*>(perm_in->data), perm_out,
num_in, 0, part_bits, stream));
...
...
@@ -492,7 +494,7 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRange(
static_assert(
sizeof(AtomicCount) == sizeof(*out_counts),
"AtomicCount must be the same width as int64_t for atomicAdd "
"in cub::DeviceHistogram::HistogramEven() to work"
);
"in
hip
cub::DeviceHistogram::HistogramEven() to work");
// TODO(dlasalle): Once https://github.com/NVIDIA/cub/pull/287 is merged,
// add a compile time check against the cub version to allow
...
...
@@ -502,14 +504,14 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRange(
"value of int.";
size_t hist_workspace_size;
-  CUDA_CALL(cub::DeviceHistogram::HistogramEven(
+  CUDA_CALL(hipcub::DeviceHistogram::HistogramEven(
nullptr, hist_workspace_size, proc_id_out.get(),
reinterpret_cast<AtomicCount*>(out_counts), num_parts + 1,
static_cast<IdType>(0), static_cast<IdType>(num_parts),
static_cast<int>(num_in), stream));
Workspace<void> hist_workspace(device, ctx, hist_workspace_size);
-  CUDA_CALL(cub::DeviceHistogram::HistogramEven(
+  CUDA_CALL(hipcub::DeviceHistogram::HistogramEven(
hist_workspace.get(), hist_workspace_size, proc_id_out.get(),
reinterpret_cast<AtomicCount*>(out_counts), num_parts + 1,
static_cast<IdType>(0), static_cast<IdType>(num_parts),
...
...
@@ -536,7 +538,7 @@ template <DGLDeviceType XPU, typename IdType, typename RangeType>
IdArray MapToLocalFromRange(
const int num_parts, IdArray range, IdArray global_idx) {
const auto& ctx = global_idx->ctx;
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
if (num_parts > 1 && global_idx->shape[0] > 0) {
IdArray local_idx =
...
...
@@ -576,7 +578,7 @@ IdArray MapToGlobalFromRange(
<< num_parts;
const auto& ctx = local_idx->ctx;
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
if (num_parts > 1 && local_idx->shape[0] > 0) {
IdArray global_idx =
...
...
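The hipcub::DeviceRadixSort::SortPairs calls in this file follow CUB's usual two-phase protocol: a first call with a null temporary buffer only reports the required workspace size, and a second call with the allocated buffer does the sort. A condensed sketch of that protocol outside of DGL (plain hipMalloc stands in for DGL's Workspace helper, and error codes are not checked):

// Two-phase hipcub call pattern, sketched with raw HIP allocation.
#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>

void SortPairsExample(const int* keys_in, int* keys_out,
                      const int* vals_in, int* vals_out,
                      int num_items, hipStream_t stream) {
  size_t temp_bytes = 0;
  // Pass 1: nullptr workspace -> only computes the required size.
  hipcub::DeviceRadixSort::SortPairs(
      nullptr, temp_bytes, keys_in, keys_out, vals_in, vals_out,
      num_items, 0, sizeof(int) * 8, stream);
  void* temp = nullptr;
  hipMalloc(&temp, temp_bytes);
  // Pass 2: same arguments, now with the real workspace.
  hipcub::DeviceRadixSort::SortPairs(
      temp, temp_bytes, keys_in, keys_out, vals_in, vals_out,
      num_items, 0, sizeof(int) * 8, stream);
  hipFree(temp);
}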
src/partition/ndarray_partition.cc
...
...
@@ -39,7 +39,7 @@ class RemainderPartition : public NDArrayPartition {
      IdArray in_idx) const override {
#ifdef DGL_USE_CUDA
    auto ctx = in_idx->ctx;
-    if (ctx.device_type == kDGLCUDA) {
+    if (ctx.device_type == kDGLCUDA || ctx.device_type == kDGLROCM) {
      ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, {
        return impl::GeneratePermutationFromRemainder<kDGLCUDA, IdType>(
            ArraySize(), NumParts(), in_idx);
...
...
@@ -56,7 +56,7 @@ class RemainderPartition : public NDArrayPartition {
  IdArray MapToLocal(IdArray in_idx) const override {
#ifdef DGL_USE_CUDA
    auto ctx = in_idx->ctx;
-    if (ctx.device_type == kDGLCUDA) {
+    if (ctx.device_type == kDGLCUDA || ctx.device_type == kDGLROCM) {
      ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, {
        return impl::MapToLocalFromRemainder<kDGLCUDA, IdType>(
            NumParts(), in_idx);
...
...
@@ -73,7 +73,7 @@ class RemainderPartition : public NDArrayPartition {
  IdArray MapToGlobal(IdArray in_idx, const int part_id) const override {
#ifdef DGL_USE_CUDA
    auto ctx = in_idx->ctx;
-    if (ctx.device_type == kDGLCUDA) {
+    if (ctx.device_type == kDGLCUDA || ctx.device_type == kDGLROCM) {
      ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, {
        return impl::MapToGlobalFromRemainder<kDGLCUDA, IdType>(
            NumParts(), in_idx, part_id);
...
...
@@ -118,7 +118,7 @@ class RangePartition : public NDArrayPartition {
      IdArray in_idx) const override {
#ifdef DGL_USE_CUDA
    auto ctx = in_idx->ctx;
-    if (ctx.device_type == kDGLCUDA) {
+    if (ctx.device_type == kDGLCUDA || ctx.device_type == kDGLROCM) {
      if (ctx.device_type != range_->ctx.device_type ||
          ctx.device_id != range_->ctx.device_id) {
        LOG(FATAL) << "The range for the NDArrayPartition and the input "
...
...
@@ -144,7 +144,7 @@ class RangePartition : public NDArrayPartition {
  IdArray MapToLocal(IdArray in_idx) const override {
#ifdef DGL_USE_CUDA
    auto ctx = in_idx->ctx;
-    if (ctx.device_type == kDGLCUDA) {
+    if (ctx.device_type == kDGLCUDA || ctx.device_type == kDGLROCM) {
      ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, {
        ATEN_ID_TYPE_SWITCH(range_->dtype, RangeType, {
          return impl::MapToLocalFromRange<kDGLCUDA, IdType, RangeType>(
...
...
@@ -163,7 +163,7 @@ class RangePartition : public NDArrayPartition {
  IdArray MapToGlobal(IdArray in_idx, const int part_id) const override {
#ifdef DGL_USE_CUDA
    auto ctx = in_idx->ctx;
-    if (ctx.device_type == kDGLCUDA) {
+    if (ctx.device_type == kDGLCUDA || ctx.device_type == kDGLROCM) {
      ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, {
        ATEN_ID_TYPE_SWITCH(range_->dtype, RangeType, {
          return impl::MapToGlobalFromRange<kDGLCUDA, IdType, RangeType>(
...
...
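Every dispatch method in ndarray_partition.cc makes the same one-line change: the device guard that used to accept only kDGLCUDA now also accepts kDGLROCM, while the body still instantiates the kDGLCUDA (now hipified) implementation. A self-contained toy sketch of that pattern follows; the enum values and names here are made up for illustration, not DGL's:

// Toy sketch of widening a device-type guard so ROCm reuses the GPU path.
#include <iostream>
#include <stdexcept>

enum DeviceType { kCPU = 1, kCUDA = 2, kROCM = 10 };  // illustrative values

int MapToLocalSketch(DeviceType dev, int global_id, int num_parts) {
  // Before this commit the guard was: if (dev == kCUDA) { ... }
  if (dev == kCUDA || dev == kROCM) {
    // Both device types route to the same (hipified) GPU implementation.
    return global_id / num_parts;
  }
  throw std::runtime_error("expected a GPU context");
}

int main() { std::cout << MapToLocalSketch(kROCM, 42, 4) << "\n"; }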
src/random/continuous_seed.h
// !!! This is a file automatically generated by hipify!!!
/*!
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
* All rights reserved.
...
...
@@ -24,13 +25,13 @@
#include <cmath>
-#ifdef __NVCC__
-#include <curand_kernel.h>
+#ifdef __HIPCC__
+#include <hiprand/hiprand_kernel.h>
#else
#include <random>
#include "pcg_random.hpp"
-#endif  // __CUDA_ARCH__
+#endif  // __HIP_DEVICE_COMPILE__
#ifndef M_SQRT1_2
#define M_SQRT1_2 0.707106781186547524401
...
...
@@ -58,24 +59,24 @@ class continuous_seed {
    c[1] = std::sin(pi * r / 2);
  }

-#ifdef __CUDA_ARCH__
+#ifdef __HIP_DEVICE_COMPILE__
  __device__ inline float uniform(const uint64_t t) const {
    const uint64_t kCurandSeed = 999961;  // Could be any random number.
-    curandStatePhilox4_32_10_t rng;
-    curand_init(kCurandSeed, s[0], t, &rng);
+    hiprandStatePhilox4_32_10_t rng;
+    hiprand_init(kCurandSeed, s[0], t, &rng);
    float rnd;
    if (s[0] != s[1]) {
-      rnd = c[0] * curand_normal(&rng);
-      curand_init(kCurandSeed, s[1], t, &rng);
-      rnd += c[1] * curand_normal(&rng);
+      rnd = c[0] * hiprand_normal(&rng);
+      hiprand_init(kCurandSeed, s[1], t, &rng);
+      rnd += c[1] * hiprand_normal(&rng);
      rnd = normcdff(rnd);
    } else {
-      rnd = curand_uniform(&rng);
+      rnd = hiprand_uniform(&rng);
    }
    return rnd;
  }
#else
-  inline float uniform(const uint64_t t) const {
+  __host__ inline float uniform(const uint64_t t) const {
    pcg32 ng0(s[0], t);
    float rnd;
    if (s[0] != s[1]) {
...
...
@@ -91,7 +92,7 @@ class continuous_seed {
    }
    return rnd;
  }
-#endif  // __CUDA_ARCH__
+#endif  // __HIP_DEVICE_COMPILE__
};
}  // namespace random
...
...
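continuous_seed.h swaps the __NVCC__/__CUDA_ARCH__ guards for __HIPCC__/__HIP_DEVICE_COMPILE__, so the hipRAND path is only built during device compilation while the host path keeps a CPU generator. A stripped-down sketch of that guard structure (std::mt19937_64 is only a stand-in here for the pcg32 generator the real header uses):

// Sketch of the host/device compile-time split used in continuous_seed.h.
#include <cstdint>
#include <random>                    // host-side fallback generator
#ifdef __HIPCC__
#include <hiprand/hiprand_kernel.h>  // device-side generator
#endif

struct uniform_sketch {
#ifdef __HIP_DEVICE_COMPILE__
  // Device pass: hipRAND Philox generator, as in the hipified header.
  __device__ float uniform(uint64_t seed, uint64_t t) const {
    hiprandStatePhilox4_32_10_t rng;
    hiprand_init(seed, /*subsequence=*/0, /*offset=*/t, &rng);
    return hiprand_uniform(&rng);
  }
#else
  // Host pass: stand-in generator instead of pcg32.
  float uniform(uint64_t seed, uint64_t t) const {
    std::mt19937_64 gen(seed ^ t);
    return std::uniform_real_distribution<float>(0.f, 1.f)(gen);
  }
#endif
};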
src/rpc/rpc.cc
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file rpc/rpc.cc
* @brief Implementation of RPC utilities used by both server and client sides.
*/
#if defined(__linux__)
#include "
./
rpc.h"
#include "rpc.h"
#include <dgl/array.h>
#include <dgl/packed_func_ext.h>
...
...
src/rpc/rpc.h
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file rpc/rpc.h
...
...
@@ -19,9 +20,9 @@
#include <unordered_map>
#include <vector>
#include "
./
network/common.h"
#include "
./
rpc_msg.h"
#include "
./
server_state.h"
#include "network/common.h"
#include "rpc_msg.h"
#include "server_state.h"
#include "network/socket_communicator.h"
namespace
dgl
{
...
...
src/runtime/c_runtime_api.cc
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2016-2022 by Contributors
* @file c_runtime_api.cc
...
...
@@ -32,6 +33,8 @@ inline std::string DeviceName(int type) {
      return "cpu";
    case kDGLCUDA:
      return "cuda";
+    case kDGLROCM:
+      return "cuda";
    // add more device here once supported
    default:
      LOG(FATAL) << "unknown type =" << type;
...
...
@@ -122,13 +125,13 @@ void DeviceAPI::SyncStreamFromTo(
}

bool DeviceAPI::PinData(void* ptr, size_t nbytes) {
-  LOG(FATAL) << "Device does not support cudaHostRegister api.";
+  LOG(FATAL) << "Device does not support hipHostRegister api.";
  return false;
}

void* DeviceAPI::AllocPinnedDataSpace(size_t nbytes, void** ctx, void** deleter) {
-  LOG(FATAL) << "Device does not support cudaHostAlloc api.";
+  LOG(FATAL) << "Device does not support hipHostMalloc api.";
  return nullptr;
}
...
...
@@ -137,7 +140,7 @@ void DeviceAPI::FreePinnedDataSpace(void** deleter) {
}

void DeviceAPI::UnpinData(void* ptr) {
-  LOG(FATAL) << "Device does not support cudaHostUnregister api.";
+  LOG(FATAL) << "Device does not support hipHostUnregister api.";
}

}  // namespace runtime
}  // namespace dgl
...
...
src/runtime/cuda/cuda_common.h
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2017 by Contributors
* @file cuda_common.h
...
...
@@ -6,10 +7,10 @@
#ifndef DGL_RUNTIME_CUDA_CUDA_COMMON_H_
#define DGL_RUNTIME_CUDA_CUDA_COMMON_H_
-#include <cublas_v2.h>
-#include <cuda_runtime.h>
-#include <curand.h>
-#include <cusparse.h>
+#include <hipblas/hipblas.h>
+#include <hip/hip_runtime.h>
+#include <hiprand/hiprand.h>
+#include <hipsparse/hipsparse.h>
#include <dgl/runtime/packed_func.h>
#include <memory>
...
...
@@ -25,8 +26,8 @@ namespace runtime {
DGL's memory pool and the current cuda stream
runtime::CUDAWorkspaceAllocator allocator(ctx);
-  const auto stream = runtime::getCurrentCUDAStream();
-  const auto exec_policy = thrust::cuda::par_nosync(allocator).on(stream);
+  const auto stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
+  const auto exec_policy = thrust::hip::par_nosync(allocator).on(stream);
now, one can pass exec_policy to thrust functions
...
...
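The comment block above describes building a Thrust execution policy from DGL's workspace allocator and the current stream; the hipified version simply switches to the thrust::hip policy namespace. A rough sketch of handing such a policy to a Thrust algorithm (rocThrust assumed; the allocator type is left generic because the real one is DGL's CUDAWorkspaceAllocator, and SortOnStream is a made-up name):

// Rough sketch: par_nosync + a custom allocator + an explicit stream (rocThrust).
#include <hip/hip_runtime.h>
#include <thrust/execution_policy.h>
#include <thrust/sort.h>

template <typename Allocator>
void SortOnStream(int* keys, int n, Allocator& alloc, hipStream_t stream) {
  // Temporary storage comes from `alloc`; no implicit sync after the call.
  auto policy = thrust::hip::par_nosync(alloc).on(stream);
  thrust::sort(policy, keys, keys + n);
}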
@@ -79,112 +80,112 @@ inline bool is_zero<dim3>(dim3 size) {
#define CUDA_DRIVER_CALL(x)                                              \
  {                                                                      \
-    CUresult result = x;                                                \
-    if (result != CUDA_SUCCESS && result != CUDA_ERROR_DEINITIALIZED) { \
+    hipError_t result = x;                                              \
+    if (result != hipSuccess && result != hipErrorDeinitialized) {      \
      const char* msg;                                                   \
-      cuGetErrorName(result, &msg);                                     \
+      hipGetErrorName(result, &msg);                                    \
      LOG(FATAL) << "CUDAError: " #x " failed with error: " << msg;      \
    }                                                                    \
  }
#define CUDA_CALL(func)                                       \
  {                                                           \
-    cudaError_t e = (func);                                  \
-    CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \
-        << "CUDA: " << cudaGetErrorString(e);                \
+    hipError_t e = (func);                                   \
+    CHECK(e == hipSuccess || e == hipErrorDeinitialized)     \
+        << "CUDA: " << hipGetErrorString(e);                 \
  }
#define CUDA_KERNEL_CALL(kernel, nblks, nthrs, shmem, stream, ...)            \
  {                                                                           \
    if (!dgl::runtime::is_zero((nblks)) && !dgl::runtime::is_zero((nthrs))) { \
-      (kernel)<<<(nblks), (nthrs), (shmem), (stream)>>>(__VA_ARGS__);         \
-      cudaError_t e = cudaGetLastError();                                     \
-      CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading)                \
-          << "CUDA kernel launch error: " << cudaGetErrorString(e);           \
+      hipLaunchKernelGGL(((kernel)), dim3((nblks)), dim3((nthrs)), (shmem), (stream), __VA_ARGS__); \
+      hipError_t e = hipGetLastError();                                       \
+      CHECK(e == hipSuccess || e == hipErrorDeinitialized)                    \
+          << "CUDA kernel launch error: " << hipGetErrorString(e);            \
    }                                                                          \
  }
#define CUSPARSE_CALL(func)                                           \
  {                                                                   \
-    cusparseStatus_t e = (func);                                     \
-    CHECK(e == CUSPARSE_STATUS_SUCCESS) << "CUSPARSE ERROR: " << e;  \
+    hipsparseStatus_t e = (func);                                    \
+    CHECK(e == HIPSPARSE_STATUS_SUCCESS) << "CUSPARSE ERROR: " << e; \
  }
#define CUBLAS_CALL(func)                                         \
  {                                                               \
-    cublasStatus_t e = (func);                                   \
-    CHECK(e == CUBLAS_STATUS_SUCCESS) << "CUBLAS ERROR: " << e;  \
+    hipblasStatus_t e = (func);                                  \
+    CHECK(e == HIPBLAS_STATUS_SUCCESS) << "CUBLAS ERROR: " << e; \
  }
#define CURAND_CALL(func)                                                      \
  {                                                                            \
-    curandStatus_t e = (func);                                                \
-    CHECK(e == CURAND_STATUS_SUCCESS)                                         \
+    hiprandStatus_t e = (func);                                               \
+    CHECK(e == HIPRAND_STATUS_SUCCESS)                                        \
        << "CURAND Error: " << dgl::runtime::curandGetErrorString(e) << " at " \
        << __FILE__ << ":" << __LINE__;                                        \
  }
-inline const char* curandGetErrorString(curandStatus_t error) {
+inline const char* curandGetErrorString(hiprandStatus_t error) {
  switch (error) {
-    case CURAND_STATUS_SUCCESS:
-      return "CURAND_STATUS_SUCCESS";
-    case CURAND_STATUS_VERSION_MISMATCH:
-      return "CURAND_STATUS_VERSION_MISMATCH";
-    case CURAND_STATUS_NOT_INITIALIZED:
-      return "CURAND_STATUS_NOT_INITIALIZED";
-    case CURAND_STATUS_ALLOCATION_FAILED:
-      return "CURAND_STATUS_ALLOCATION_FAILED";
-    case CURAND_STATUS_TYPE_ERROR:
-      return "CURAND_STATUS_TYPE_ERROR";
-    case CURAND_STATUS_OUT_OF_RANGE:
-      return "CURAND_STATUS_OUT_OF_RANGE";
-    case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
-      return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
-    case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
-      return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
-    case CURAND_STATUS_LAUNCH_FAILURE:
-      return "CURAND_STATUS_LAUNCH_FAILURE";
-    case CURAND_STATUS_PREEXISTING_FAILURE:
-      return "CURAND_STATUS_PREEXISTING_FAILURE";
-    case CURAND_STATUS_INITIALIZATION_FAILED:
-      return "CURAND_STATUS_INITIALIZATION_FAILED";
-    case CURAND_STATUS_ARCH_MISMATCH:
-      return "CURAND_STATUS_ARCH_MISMATCH";
-    case CURAND_STATUS_INTERNAL_ERROR:
-      return "CURAND_STATUS_INTERNAL_ERROR";
+    case HIPRAND_STATUS_SUCCESS:
+      return "HIPRAND_STATUS_SUCCESS";
+    case HIPRAND_STATUS_VERSION_MISMATCH:
+      return "HIPRAND_STATUS_VERSION_MISMATCH";
+    case HIPRAND_STATUS_NOT_INITIALIZED:
+      return "HIPRAND_STATUS_NOT_INITIALIZED";
+    case HIPRAND_STATUS_ALLOCATION_FAILED:
+      return "HIPRAND_STATUS_ALLOCATION_FAILED";
+    case HIPRAND_STATUS_TYPE_ERROR:
+      return "HIPRAND_STATUS_TYPE_ERROR";
+    case HIPRAND_STATUS_OUT_OF_RANGE:
+      return "HIPRAND_STATUS_OUT_OF_RANGE";
+    case HIPRAND_STATUS_LENGTH_NOT_MULTIPLE:
+      return "HIPRAND_STATUS_LENGTH_NOT_MULTIPLE";
+    case HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED:
+      return "HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED";
+    case HIPRAND_STATUS_LAUNCH_FAILURE:
+      return "HIPRAND_STATUS_LAUNCH_FAILURE";
+    case HIPRAND_STATUS_PREEXISTING_FAILURE:
+      return "HIPRAND_STATUS_PREEXISTING_FAILURE";
+    case HIPRAND_STATUS_INITIALIZATION_FAILED:
+      return "HIPRAND_STATUS_INITIALIZATION_FAILED";
+    case HIPRAND_STATUS_ARCH_MISMATCH:
+      return "HIPRAND_STATUS_ARCH_MISMATCH";
+    case HIPRAND_STATUS_INTERNAL_ERROR:
+      return "HIPRAND_STATUS_INTERNAL_ERROR";
  }
  // To suppress compiler warning.
-  return "Unrecognized curand error string";
+  return "Unrecognized hiprand error string";
}
/**
- * @brief Cast data type to cudaDataType_t.
+ * @brief Cast data type to hipDataType.
 */
template <typename T>
struct cuda_dtype {
-  static constexpr cudaDataType_t value = CUDA_R_32F;
+  static constexpr hipDataType value = HIP_R_32F;
};

template <>
struct cuda_dtype<__half> {
-  static constexpr cudaDataType_t value = CUDA_R_16F;
+  static constexpr hipDataType value = HIP_R_16F;
};

#if BF16_ENABLED
template <>
-struct cuda_dtype<__nv_bfloat16> {
-  static constexpr cudaDataType_t value = CUDA_R_16BF;
+struct cuda_dtype<__hip_bfloat16> {
+  static constexpr hipDataType value = HIP_R_16BF;
};
#endif  // BF16_ENABLED

template <>
struct cuda_dtype<float> {
-  static constexpr cudaDataType_t value = CUDA_R_32F;
+  static constexpr hipDataType value = HIP_R_32F;
};

template <>
struct cuda_dtype<double> {
-  static constexpr cudaDataType_t value = CUDA_R_64F;
+  static constexpr hipDataType value = HIP_R_64F;
};
/*
...
...
@@ -202,7 +203,7 @@ struct accum_dtype<__half> {
#if BF16_ENABLED
template <>
-struct accum_dtype<__nv_bfloat16> {
+struct accum_dtype<__hip_bfloat16> {
  typedef float type;
};
#endif  // BF16_ENABLED
...
...
@@ -217,23 +218,23 @@ struct accum_dtype<double> {
  typedef double type;
};

-#if CUDART_VERSION >= 11000
+#if DTKRT_VERSION >= 11000
/**
- * @brief Cast index data type to cusparseIndexType_t.
+ * @brief Cast index data type to hipsparseIndexType_t.
 */
template <typename T>
struct cusparse_idtype {
-  static constexpr cusparseIndexType_t value = CUSPARSE_INDEX_32I;
+  static constexpr hipsparseIndexType_t value = HIPSPARSE_INDEX_32I;
};

template <>
struct cusparse_idtype<int32_t> {
-  static constexpr cusparseIndexType_t value = CUSPARSE_INDEX_32I;
+  static constexpr hipsparseIndexType_t value = HIPSPARSE_INDEX_32I;
};

template <>
struct cusparse_idtype<int64_t> {
-  static constexpr cusparseIndexType_t value = CUSPARSE_INDEX_64I;
+  static constexpr hipsparseIndexType_t value = HIPSPARSE_INDEX_64I;
};
#endif
...
...
@@ -241,9 +242,9 @@ struct cusparse_idtype<int64_t> {
class CUDAThreadEntry {
 public:
  /** @brief The cusparse handler */
-  cusparseHandle_t cusparse_handle{nullptr};
+  hipsparseHandle_t cusparse_handle{nullptr};
  /** @brief The cublas handler */
-  cublasHandle_t cublas_handle{nullptr};
+  hipblasHandle_t cublas_handle{nullptr};
  /** @brief thread local pool*/
  WorkspacePool pool;
  /** @brief constructor */
...
...
@@ -253,7 +254,7 @@ class CUDAThreadEntry {
};

/** @brief Get the current CUDA stream */
-cudaStream_t getCurrentCUDAStream();
+hipStream_t getCurrentHIPStreamMasqueradingAsCUDA();

}  // namespace runtime
}  // namespace dgl
#endif // DGL_RUNTIME_CUDA_CUDA_COMMON_H_
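These macros keep their CUDA_* names while now wrapping HIP calls, so existing call sites throughout the tree compile unchanged against the ROCm backend. A hypothetical call site, assuming this header (and therefore the CUDA_CALL macro above) is included:

// Hypothetical call site for the hipified CUDA_CALL macro defined above.
// Assumes src/runtime/cuda/cuda_common.h is included for the macro and types.
void* AllocZeroedDeviceBuffer(size_t nbytes, int device_id) {
  void* ptr = nullptr;
  CUDA_CALL(hipSetDevice(device_id));  // CHECK-fails with hipGetErrorString on error
  CUDA_CALL(hipMalloc(&ptr, nbytes));
  CUDA_CALL(hipMemset(ptr, 0, nbytes));
  return ptr;
}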
src/runtime/cuda/cuda_device_api.cc
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2017-2022 by Contributors
* @file cuda_device_api.cc
* @brief GPU specific API
*/
-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
#include <dgl/runtime/device_api.h>
#include <dgl/runtime/registry.h>
#include <dgl/runtime/tensordispatch.h>
...
...
@@ -18,13 +19,13 @@ class CUDADeviceAPI final : public DeviceAPI {
 public:
  CUDADeviceAPI() {
    int count;
-    auto err = cudaGetDeviceCount(&count);
+    auto err = hipGetDeviceCount(&count);
    switch (err) {
-      case cudaSuccess:
+      case hipSuccess:
        break;
      default:
        count = 0;
-        cudaGetLastError();
+        hipGetLastError();
    }
    is_available_ = count > 0;
  }
...
...
@@ -32,67 +33,68 @@ class CUDADeviceAPI final : public DeviceAPI {
  bool IsAvailable() final { return is_available_; }

  void SetDevice(DGLContext ctx) final {
-    CUDA_CALL(cudaSetDevice(ctx.device_id));
+    CUDA_CALL(hipSetDevice(ctx.device_id));
  }

  void GetAttr(DGLContext ctx, DeviceAttrKind kind, DGLRetValue* rv) final {
    int value = 0;
    switch (kind) {
      case kExist:
        value =
-            (cudaDeviceGetAttribute(
-                &value, cudaDevAttrMaxThreadsPerBlock, ctx.device_id) ==
-             cudaSuccess);
+            (hipDeviceGetAttribute(
+                &value, hipDeviceAttributeMaxThreadsPerBlock, ctx.device_id) ==
+             hipSuccess);
        break;
      case kMaxThreadsPerBlock: {
-        CUDA_CALL(cudaDeviceGetAttribute(
-            &value, cudaDevAttrMaxThreadsPerBlock, ctx.device_id));
+        CUDA_CALL(hipDeviceGetAttribute(
+            &value, hipDeviceAttributeMaxThreadsPerBlock, ctx.device_id));
        break;
      }
      case kWarpSize: {
        CUDA_CALL(
-            cudaDeviceGetAttribute(&value, cudaDevAttrWarpSize, ctx.device_id));
+            hipDeviceGetAttribute(&value, hipDeviceAttributeWarpSize, ctx.device_id));
        break;
      }
      case kMaxSharedMemoryPerBlock: {
-        CUDA_CALL(cudaDeviceGetAttribute(
-            &value, cudaDevAttrMaxSharedMemoryPerBlock, ctx.device_id));
+        CUDA_CALL(hipDeviceGetAttribute(
+            &value, hipDeviceAttributeMaxSharedMemoryPerBlock, ctx.device_id));
        break;
      }
      case kComputeVersion: {
        std::ostringstream os;
-        CUDA_CALL(cudaDeviceGetAttribute(
-            &value, cudaDevAttrComputeCapabilityMajor, ctx.device_id));
+        CUDA_CALL(hipDeviceGetAttribute(
+            &value, hipDeviceAttributeComputeCapabilityMajor, ctx.device_id));
        os << value << ".";
-        CUDA_CALL(cudaDeviceGetAttribute(
-            &value, cudaDevAttrComputeCapabilityMinor, ctx.device_id));
+        CUDA_CALL(hipDeviceGetAttribute(
+            &value, hipDeviceAttributeComputeCapabilityMinor, ctx.device_id));
        os << value;
        *rv = os.str();
        return;
      }
      case kDeviceName: {
-        cudaDeviceProp props;
-        CUDA_CALL(cudaGetDeviceProperties(&props, ctx.device_id));
+        hipDeviceProp_t props;
+        CUDA_CALL(hipGetDeviceProperties(&props, ctx.device_id));
        *rv = std::string(props.name);
        // printf("******* debug: device.name:%s\n ", std::string(props.name).c_str());
        return;
      }
      case kMaxClockRate: {
-        CUDA_CALL(cudaDeviceGetAttribute(
-            &value, cudaDevAttrClockRate, ctx.device_id));
+        CUDA_CALL(hipDeviceGetAttribute(
+            &value, hipDeviceAttributeClockRate, ctx.device_id));
        break;
      }
      case kMultiProcessorCount: {
-        CUDA_CALL(cudaDeviceGetAttribute(
-            &value, cudaDevAttrMultiProcessorCount, ctx.device_id));
+        CUDA_CALL(hipDeviceGetAttribute(
+            &value, hipDeviceAttributeMultiprocessorCount, ctx.device_id));
        break;
      }
      case kMaxThreadDimensions: {
        int dims[3];
-        CUDA_CALL(cudaDeviceGetAttribute(
-            &dims[0], cudaDevAttrMaxBlockDimX, ctx.device_id));
-        CUDA_CALL(cudaDeviceGetAttribute(
-            &dims[1], cudaDevAttrMaxBlockDimY, ctx.device_id));
-        CUDA_CALL(cudaDeviceGetAttribute(
-            &dims[2], cudaDevAttrMaxBlockDimZ, ctx.device_id));
+        CUDA_CALL(hipDeviceGetAttribute(
+            &dims[0], hipDeviceAttributeMaxBlockDimX, ctx.device_id));
+        CUDA_CALL(hipDeviceGetAttribute(
+            &dims[1], hipDeviceAttributeMaxBlockDimY, ctx.device_id));
+        CUDA_CALL(hipDeviceGetAttribute(
+            &dims[2], hipDeviceAttributeMaxBlockDimZ, ctx.device_id));
        std::stringstream ss;
        // use json string to return multiple int values;
        ss << "[" << dims[0] << ", " << dims[1] << ", " << dims[2] << "]";
...
...
@@ -110,11 +112,11 @@ class CUDADeviceAPI final : public DeviceAPI {
    TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global();
    if (tensor_dispatcher->IsAvailable()) {
      return tensor_dispatcher->CUDAAllocWorkspace(
-          nbytes, getCurrentCUDAStream());
+          nbytes, getCurrentHIPStreamMasqueradingAsCUDA());
    }
    CHECK_EQ(256 % alignment, 0U) << "CUDA space is aligned at 256 bytes";
    void* ret;
-    CUDA_CALL(cudaMalloc(&ret, nbytes));
+    CUDA_CALL(hipMalloc(&ret, nbytes));
    return ret;
  }
...
...
@@ -124,32 +126,32 @@ class CUDADeviceAPI final : public DeviceAPI {
    if (tensor_dispatcher->IsAvailable()) {
      return tensor_dispatcher->CUDAFreeWorkspace(ptr);
    }
-    CUDA_CALL(cudaFree(ptr));
+    CUDA_CALL(hipFree(ptr));
  }

  void CopyDataFromTo(
      const void* from, size_t from_offset, void* to, size_t to_offset,
      size_t size, DGLContext ctx_from, DGLContext ctx_to,
      DGLDataType type_hint, DGLStreamHandle stream) {
-    cudaStream_t cu_stream = static_cast<cudaStream_t>(stream);
+    hipStream_t cu_stream = static_cast<hipStream_t>(stream);
    from = static_cast<const char*>(from) + from_offset;
    to = static_cast<char*>(to) + to_offset;
-    if (ctx_from.device_type == kDGLCUDA && ctx_to.device_type == kDGLCUDA) {
-      CUDA_CALL(cudaSetDevice(ctx_from.device_id));
+    if (ctx_from.device_type == kDGLCUDA && ctx_to.device_type == kDGLCUDA ||
+        ctx_from.device_type == kDGLROCM && ctx_to.device_type == kDGLROCM) {
+      CUDA_CALL(hipSetDevice(ctx_from.device_id));
      if (ctx_from.device_id == ctx_to.device_id) {
-        GPUCopy(from, to, size, cudaMemcpyDeviceToDevice, cu_stream);
+        GPUCopy(from, to, size, hipMemcpyDeviceToDevice, cu_stream);
      } else {
-        CUDA_CALL(cudaMemcpyPeerAsync(
+        CUDA_CALL(hipMemcpyPeerAsync(
            to, ctx_to.device_id, from, ctx_from.device_id, size, cu_stream));
      }
    } else if (
-        ctx_from.device_type == kDGLCUDA && ctx_to.device_type == kDGLCPU) {
-      CUDA_CALL(cudaSetDevice(ctx_from.device_id));
-      GPUCopy(from, to, size, cudaMemcpyDeviceToHost, cu_stream);
+        (ctx_from.device_type == kDGLCUDA || ctx_to.device_type == kDGLROCM) &&
+        ctx_to.device_type == kDGLCPU) {
+      CUDA_CALL(hipSetDevice(ctx_from.device_id));
+      GPUCopy(from, to, size, hipMemcpyDeviceToHost, cu_stream);
    } else if (
-        ctx_from.device_type == kDGLCPU && ctx_to.device_type == kDGLCUDA) {
-      CUDA_CALL(cudaSetDevice(ctx_to.device_id));
-      GPUCopy(from, to, size, cudaMemcpyHostToDevice, cu_stream);
+        ctx_from.device_type == kDGLCPU &&
+        (ctx_to.device_type == kDGLCUDA || ctx_to.device_type == kDGLROCM)) {
+      CUDA_CALL(hipSetDevice(ctx_to.device_id));
+      GPUCopy(from, to, size, hipMemcpyHostToDevice, cu_stream);
    } else {
      LOG(FATAL) << "expect copy from/to GPU or between GPU";
    }
...
...
@@ -166,9 +168,9 @@ class CUDADeviceAPI final : public DeviceAPI {
  }

  // To ensure correct behavior, `record_event` must be invoked anytime a
-  // pointer from PyTorch CachingHostAllocator is used in a cudaMemcpyAsync
+  // pointer from PyTorch CachingHostAllocator is used in a hipMemcpyAsync
  // call. It provides a way to re-use freed pinned (page-locked) memory
-  // allocations and avoid device sync due to cudaFreeHost calls.
+  // allocations and avoid device sync due to hipHostFree calls.
  void RecordedCopyDataFromTo(
      void* from, size_t from_offset, void* to, size_t to_offset, size_t size,
      DGLContext ctx_from, DGLContext ctx_to, DGLDataType type_hint,
...
...
@@ -179,7 +181,7 @@ class CUDADeviceAPI final : public DeviceAPI {
        stream);
    auto tensor_dispatcher = TensorDispatcher::Global();
    if (tensor_dispatcher->IsAvailable()) {
-      auto custream = static_cast<cudaStream_t>(stream);
+      auto custream = static_cast<hipStream_t>(stream);
      void* ptr = ctx_to.device_type == kDGLCPU ? to : from;
      int id = ctx_to.device_type == kDGLCPU ? ctx_from.device_id
                                             : ctx_to.device_id;
...
...
@@ -188,34 +190,34 @@ class CUDADeviceAPI final : public DeviceAPI {
  }

  DGLStreamHandle CreateStream(DGLContext ctx) {
-    CUDA_CALL(cudaSetDevice(ctx.device_id));
-    cudaStream_t retval;
+    CUDA_CALL(hipSetDevice(ctx.device_id));
+    hipStream_t retval;
    // make sure the legacy default stream won't block on this stream
-    CUDA_CALL(cudaStreamCreateWithFlags(&retval, cudaStreamNonBlocking));
+    CUDA_CALL(hipStreamCreateWithFlags(&retval, hipStreamNonBlocking));
    return static_cast<DGLStreamHandle>(retval);
  }

  void FreeStream(DGLContext ctx, DGLStreamHandle stream) {
-    CUDA_CALL(cudaSetDevice(ctx.device_id));
-    cudaStream_t cu_stream = static_cast<cudaStream_t>(stream);
-    CUDA_CALL(cudaStreamDestroy(cu_stream));
+    CUDA_CALL(hipSetDevice(ctx.device_id));
+    hipStream_t cu_stream = static_cast<hipStream_t>(stream);
+    CUDA_CALL(hipStreamDestroy(cu_stream));
  }

  void SyncStreamFromTo(
      DGLContext ctx, DGLStreamHandle event_src, DGLStreamHandle event_dst) {
-    CUDA_CALL(cudaSetDevice(ctx.device_id));
-    cudaStream_t src_stream = static_cast<cudaStream_t>(event_src);
-    cudaStream_t dst_stream = static_cast<cudaStream_t>(event_dst);
-    cudaEvent_t evt;
-    CUDA_CALL(cudaEventCreate(&evt));
-    CUDA_CALL(cudaEventRecord(evt, src_stream));
-    CUDA_CALL(cudaStreamWaitEvent(dst_stream, evt, 0));
-    CUDA_CALL(cudaEventDestroy(evt));
+    CUDA_CALL(hipSetDevice(ctx.device_id));
+    hipStream_t src_stream = static_cast<hipStream_t>(event_src);
+    hipStream_t dst_stream = static_cast<hipStream_t>(event_dst);
+    hipEvent_t evt;
+    CUDA_CALL(hipEventCreate(&evt));
+    CUDA_CALL(hipEventRecord(evt, src_stream));
+    CUDA_CALL(hipStreamWaitEvent(dst_stream, evt, 0));
+    CUDA_CALL(hipEventDestroy(evt));
  }

  void StreamSync(DGLContext ctx, DGLStreamHandle stream) final {
-    CUDA_CALL(cudaSetDevice(ctx.device_id));
-    CUDA_CALL(cudaStreamSynchronize(static_cast<cudaStream_t>(stream)));
+    CUDA_CALL(hipSetDevice(ctx.device_id));
+    CUDA_CALL(hipStreamSynchronize(static_cast<hipStream_t>(stream)));
  }
/** NOTE: If the backend is PyTorch, we will use PyTorch's stream management,
...
...
@@ -227,10 +229,10 @@ class CUDADeviceAPI final : public DeviceAPI {
  void SetStream(DGLContext ctx, DGLStreamHandle stream) final {}

  DGLStreamHandle GetStream() const final {
-    return static_cast<DGLStreamHandle>(getCurrentCUDAStream());
+    return static_cast<DGLStreamHandle>(getCurrentHIPStreamMasqueradingAsCUDA());
  }

-  /** NOTE: cudaHostRegister can be called from an arbitrary GPU device,
+  /** NOTE: hipHostRegister can be called from an arbitrary GPU device,
   * so we don't need to specify a ctx.
   * The pinned memory can be seen by all CUDA contexts,
   * not just the one that performed the allocation
...
...
@@ -244,13 +246,13 @@ class CUDADeviceAPI final : public DeviceAPI {
    if (tensor_dispatcher->IsAvailable()) {
      tensor_dispatcher->CUDAHostAllocatorEmptyCache();
    }
-    CUDA_CALL(cudaHostRegister(ptr, nbytes, cudaHostRegisterDefault));
+    CUDA_CALL(hipHostRegister(ptr, nbytes, hipHostRegisterDefault));
    return true;
  }

  void UnpinData(void* ptr) {
    if (ptr == nullptr) return;
-    CUDA_CALL(cudaHostUnregister(ptr));
+    CUDA_CALL(hipHostUnregister(ptr));
  }

  void* AllocPinnedDataSpace(
...
@@ -276,33 +278,33 @@ class CUDADeviceAPI final : public DeviceAPI {
// can't be a pinned tensor if CUDA context is unavailable.
if
(
!
is_available_
)
return
false
;
cuda
PointerAttribute
s
attr
;
cuda
Error_t
status
=
cuda
PointerGetAttributes
(
&
attr
,
ptr
);
hip
PointerAttribute
_t
attr
;
hip
Error_t
status
=
hip
PointerGetAttributes
(
&
attr
,
ptr
);
bool
result
=
false
;
switch
(
status
)
{
case
cuda
ErrorInvalidValue
:
case
hip
ErrorInvalidValue
:
// might be a normal CPU tensor in CUDA 10.2-
cuda
GetLastError
();
// clear error
hip
GetLastError
();
// clear error
break
;
case
cuda
Success
:
result
=
(
attr
.
type
==
cuda
MemoryTypeHost
);
case
hip
Success
:
result
=
(
attr
.
type
==
hip
MemoryTypeHost
);
break
;
case
cuda
ErrorInitializationError
:
case
cuda
ErrorNoDevice
:
case
cuda
ErrorInsufficientDriver
:
case
cuda
ErrorInvalidDevice
:
case
hip
ErrorInitializationError
:
case
hip
ErrorNoDevice
:
case
hip
ErrorInsufficientDriver
:
case
hip
ErrorInvalidDevice
:
// We don't want to fail in these particular cases since this function
// can be called when users only want to run on CPU even if CUDA API is
// enabled, or in a forked subprocess where CUDA context cannot be
// initialized. So we just mark the CUDA context to unavailable and
// return.
is_available_
=
false
;
cuda
GetLastError
();
// clear error
hip
GetLastError
();
// clear error
break
;
default:
LOG
(
FATAL
)
<<
"error while determining memory status: "
<<
cuda
GetErrorString
(
status
);
<<
hip
GetErrorString
(
status
);
break
;
}
...
...
@@ -316,7 +318,7 @@ class CUDADeviceAPI final : public DeviceAPI {
    TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global();
    if (tensor_dispatcher->IsAvailable())
      return tensor_dispatcher->CUDAAllocWorkspace(
-          size, getCurrentCUDAStream());
+          size, getCurrentHIPStreamMasqueradingAsCUDA());

    return CUDAThreadEntry::ThreadLocal()->pool.AllocWorkspace(ctx, size);
  }
...
...
@@ -338,13 +340,13 @@ class CUDADeviceAPI final : public DeviceAPI {
 private:
  static void GPUCopy(
-      const void* from, void* to, size_t size, cudaMemcpyKind kind,
-      cudaStream_t stream) {
-    CUDA_CALL(cudaMemcpyAsync(to, from, size, kind, stream));
-    if (stream == 0 && kind == cudaMemcpyDeviceToHost) {
+      const void* from, void* to, size_t size, hipMemcpyKind kind,
+      hipStream_t stream) {
+    CUDA_CALL(hipMemcpyAsync(to, from, size, kind, stream));
+    if (stream == 0 && kind == hipMemcpyDeviceToHost) {
      // only wait for the copy, when it's on the default stream, and it's to
      // host memory
-      CUDA_CALL(cudaStreamSynchronize(stream));
+      CUDA_CALL(hipStreamSynchronize(stream));
    }
  }
...
...
@@ -359,7 +361,7 @@ CUDAThreadEntry* CUDAThreadEntry::ThreadLocal() {
  return CUDAThreadStore::Get();
}

-cudaStream_t getCurrentCUDAStream() {
+hipStream_t getCurrentHIPStreamMasqueradingAsCUDA() {
  TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global();
  if (tensor_dispatcher->IsAvailable())
    return tensor_dispatcher->CUDAGetCurrentStream();
...
...
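cuda_device_api.cc is essentially a one-for-one renaming: cudaDeviceGetAttribute/cudaDevAttr* become hipDeviceGetAttribute/hipDeviceAttribute*, and the memcpy/stream/event calls get hip prefixes. A standalone sketch exercising a few of those HIP runtime calls directly (not DGL code; return codes are left unchecked for brevity):

// Standalone sketch of the HIP attribute queries and async copy used above.
#include <hip/hip_runtime.h>
#include <cstdio>

int main() {
  int dev = 0, warp = 0, cu_count = 0;
  hipSetDevice(dev);
  hipDeviceGetAttribute(&warp, hipDeviceAttributeWarpSize, dev);
  hipDeviceGetAttribute(&cu_count, hipDeviceAttributeMultiprocessorCount, dev);
  std::printf("warp=%d, CUs=%d\n", warp, cu_count);

  // Async device-to-host copy on a non-blocking stream, then synchronize.
  hipStream_t stream;
  hipStreamCreateWithFlags(&stream, hipStreamNonBlocking);
  float *d = nullptr, h[4] = {0};
  hipMalloc(&d, sizeof(h));
  hipMemsetAsync(d, 0, sizeof(h), stream);
  hipMemcpyAsync(h, d, sizeof(h), hipMemcpyDeviceToHost, stream);
  hipStreamSynchronize(stream);
  hipStreamDestroy(stream);
  hipFree(d);
}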
src/runtime/cuda/cuda_hashtable.cuh
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2021 by Contributors
* @file runtime/cuda/cuda_device_common.cuh
...
...
@@ -10,7 +12,7 @@
#include <dgl/runtime/c_runtime_api.h>
#include "cuda_common.h"
-#include "cuda_runtime.h"
+#include <hip/hip_runtime.h>

namespace dgl {
namespace runtime {
...
...
@@ -228,7 +230,7 @@ class OrderedHashTable {
* @param stream The stream to use for initializing the hashtable.
*/
  OrderedHashTable(
-      const size_t size, DGLContext ctx, cudaStream_t stream,
+      const size_t size, DGLContext ctx, hipStream_t stream,
      const int scale = kDefaultScale);
/**
...
...
@@ -252,7 +254,7 @@ class OrderedHashTable {
*/
  void FillWithDuplicates(
      const IdType* const input, const size_t num_input, IdType* const unique,
-      int64_t* const num_unique, cudaStream_t stream);
+      int64_t* const num_unique, hipStream_t stream);

  /**
   * @brief Fill the hashtable with an array of unique keys.
...
...
@@ -262,7 +264,7 @@ class OrderedHashTable {
* @param stream The stream to perform operations on.
*/
  void FillWithUnique(
-      const IdType* const input, const size_t num_input, cudaStream_t stream);
+      const IdType* const input, const size_t num_input, hipStream_t stream);
/**
* @brief Get a verison of the hashtable usable from device functions.
...
...
src/runtime/cuda/cuda_hashtable.cu → src/runtime/cuda/cuda_hashtable.hip
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2021 by Contributors
* @file runtime/cuda/cuda_device_common.cuh
...
...
@@ -5,7 +7,7 @@
*/
#include <cassert>
-#include <cub/cub.cuh>        // NOLINT
+#include <hipcub/hipcub.hpp>  // NOLINT
#include "../../array/cuda/atomic.cuh"
#include "cuda_common.h"
...
...
@@ -54,7 +56,9 @@ class MutableDeviceOrderedHashTable : public DeviceOrderedHashTable<IdType> {
* @return The mapping.
*/
inline __device__ Iterator Search(const IdType id) {
-    const IdType pos = SearchForPosition(id);
+    // const IdType pos = SearchForPosition(id);
+    const IdType pos = DeviceOrderedHashTable<IdType>::SearchForPosition(id);
return GetMutable(pos);
}
...
...
@@ -95,12 +99,16 @@ class MutableDeviceOrderedHashTable : public DeviceOrderedHashTable<IdType> {
* @return An iterator to inserted mapping.
*/
inline __device__ Iterator Insert(const IdType id, const size_t index) {
-    size_t pos = Hash(id);
+    // size_t pos = Hash(id);
+    size_t pos = DeviceOrderedHashTable<IdType>::Hash(id);
// linearly scan for an empty slot or matching entry
IdType delta = 1;
while (!AttemptInsertAt(pos, id, index)) {
-      pos = Hash(pos + delta);
+      // pos = Hash(pos + delta);
+      pos = DeviceOrderedHashTable<IdType>::Hash(pos + delta);
delta += 1;
}
...
...
@@ -246,7 +254,7 @@ __global__ void count_hashmap(
DeviceOrderedHashTable<IdType> table, IdType* const num_unique) {
assert(BLOCK_SIZE == blockDim.x);
-  using BlockReduce = typename cub::BlockReduce<IdType, BLOCK_SIZE>;
+  using BlockReduce = typename hipcub::BlockReduce<IdType, BLOCK_SIZE>;
using Mapping = typename DeviceOrderedHashTable<IdType>::Mapping;
const size_t block_start = TILE_SIZE * blockIdx.x;
...
...
@@ -300,7 +308,7 @@ __global__ void compact_hashmap(
assert(BLOCK_SIZE == blockDim.x);
using FlagType = uint16_t;
-  using BlockScan = typename cub::BlockScan<FlagType, BLOCK_SIZE>;
+  using BlockScan = typename hipcub::BlockScan<FlagType, BLOCK_SIZE>;
using Mapping = typename DeviceOrderedHashTable<IdType>::Mapping;
constexpr const int32_t VALS_PER_THREAD = TILE_SIZE / BLOCK_SIZE;
...
...
@@ -359,7 +367,7 @@ DeviceOrderedHashTable<IdType> OrderedHashTable<IdType>::DeviceHandle() const {
template <typename IdType>
OrderedHashTable<IdType>::OrderedHashTable(
-    const size_t size, DGLContext ctx, cudaStream_t stream, const int scale)
+    const size_t size, DGLContext ctx, hipStream_t stream, const int scale)
: table_(nullptr), size_(TableSize(size, scale)), ctx_(ctx) {
// make sure we will at least as many buckets as items.
CHECK_GT(scale, 0);
...
...
@@ -368,7 +376,7 @@ OrderedHashTable<IdType>::OrderedHashTable(
table_ = static_cast<Mapping*>(
device->AllocWorkspace(ctx_, sizeof(Mapping) * size_));
-  CUDA_CALL(cudaMemsetAsync(
+  CUDA_CALL(hipMemsetAsync(
table_, DeviceOrderedHashTable<IdType>::kEmptyKey,
sizeof(Mapping) * size_, stream));
}
...
...
@@ -382,7 +390,7 @@ OrderedHashTable<IdType>::~OrderedHashTable() {
template <typename IdType>
void OrderedHashTable<IdType>::FillWithDuplicates(
const IdType* const input, const size_t num_input, IdType* const unique,
-    int64_t* const num_unique, cudaStream_t stream) {
+    int64_t* const num_unique, hipStream_t stream) {
auto device = runtime::DeviceAPI::Get(ctx_);
const int64_t num_tiles = (num_input + TILE_SIZE - 1) / TILE_SIZE;
...
...
@@ -404,12 +412,12 @@ void OrderedHashTable<IdType>::FillWithDuplicates(
input, num_input, device_table, item_prefix);
size_t workspace_bytes;
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
nullptr, workspace_bytes, static_cast<IdType*>(nullptr),
static_cast<IdType*>(nullptr), grid.x + 1, stream));
void* workspace = device->AllocWorkspace(ctx_, workspace_bytes);
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
workspace, workspace_bytes, item_prefix, item_prefix, grid.x + 1,
stream));
device->FreeWorkspace(ctx_, workspace);
...
...
@@ -422,7 +430,7 @@ void OrderedHashTable<IdType>::FillWithDuplicates(
template <typename IdType>
void OrderedHashTable<IdType>::FillWithUnique(
const
IdType
*
const
input
,
const
size_t
num_input
,
cuda
Stream_t
stream
)
{
const IdType* const input, const size_t num_input,
hip
Stream_t stream) {
const int64_t num_tiles = (num_input + TILE_SIZE - 1) / TILE_SIZE;
const dim3 grid(num_tiles);
...
...
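Besides the hipcub renames, this file replaces the unqualified calls SearchForPosition(id) and Hash(id) with calls qualified by the DeviceOrderedHashTable<IdType> base. That is the usual fix for two-phase name lookup: members of a dependent base class are not found by unqualified lookup inside a template, and the clang-based HIP compiler enforces this where nvcc has historically been lenient. A generic, self-contained illustration (names are not DGL's):

// Generic illustration of the dependent-base lookup fix (not DGL code).
#include <iostream>

template <typename T>
struct Base {
  T Hash(T x) const { return x * 2654435761u % 1024; }
};

template <typename T>
struct Derived : Base<T> {
  T Probe(T x) const {
    // return Hash(x);         // rejected by clang/hipcc: Hash lives in a
    //                         // dependent base, unqualified lookup skips it
    return Base<T>::Hash(x);   // qualified call, as done in the diff
    // (this->Hash(x) would work as well)
  }
};

int main() { std::cout << Derived<unsigned>().Probe(7u) << "\n"; }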
src/runtime/cuda/gpu_cache.cu → src/runtime/cuda/gpu_cache.hip
// !!! This is a file automatically generated by hipify!!!
/*!
* Copyright (c) 2022 by Contributors
*
...
...
@@ -20,7 +21,7 @@
#ifndef DGL_RUNTIME_CUDA_GPU_CACHE_H_
#define DGL_RUNTIME_CUDA_GPU_CACHE_H_
-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
#include <dgl/array.h>
#include <dgl/aten/array_ops.h>
#include <dgl/packed_func_ext.h>
...
...
@@ -31,7 +32,7 @@
#include <nv_gpu_cache.hpp>
#include "
../../runtime/cuda/
cuda_common.h"
#include "cuda_common.h"
namespace dgl {
namespace runtime {
...
...
@@ -55,12 +56,12 @@ class GpuCache : public runtime::Object {
: num_feats(num_feats),
cache(std::make_unique<gpu_cache_t>(
(num_items + bucket_size - 1) / bucket_size, num_feats)) {
-    CUDA_CALL(cudaGetDevice(&cuda_device));
+    CUDA_CALL(hipGetDevice(&cuda_device));
}
std::tuple<NDArray, IdArray, IdArray> Query(IdArray keys) {
const auto &ctx = keys->ctx;
-    cudaStream_t stream = dgl::runtime::getCurrentCUDAStream();
+    hipStream_t stream = dgl::runtime::getCurrentHIPStreamMasqueradingAsCUDA();
auto device = dgl::runtime::DeviceAPI::Get(ctx);
CHECK_EQ(ctx.device_type, kDGLCUDA)
<< "The keys should be on a CUDA device";
...
...
@@ -94,7 +95,7 @@ class GpuCache : public runtime::Object {
}
void Replace(IdArray keys, NDArray values) {
-    cudaStream_t stream = dgl::runtime::getCurrentCUDAStream();
+    hipStream_t stream = dgl::runtime::getCurrentHIPStreamMasqueradingAsCUDA();
CHECK_EQ(keys->ctx.device_type, kDGLCUDA)
<< "The keys should be on a CUDA device";
CHECK_EQ(keys->ctx.device_id, cuda_device)
...
...
src/runtime/module.cc
...
...
@@ -121,7 +121,7 @@ bool RuntimeEnabled(const std::string& target) {
  } else if (target.length() >= 5 && target.substr(0, 5) == "nvptx") {
    f_name = "device_api.cuda";
  } else if (target.length() >= 4 && target.substr(0, 4) == "rocm") {
-    f_name = "device_api.rocm";
+    f_name = "device_api.cuda";
  } else if (target.length() >= 4 && target.substr(0, 4) == "llvm") {
    const PackedFunc* pf = runtime::Registry::Get("codegen.llvm_target_enabled");
...
...
src/runtime/ndarray.cc
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2017-2022 by Contributors
* @file ndarray.cc
...
...
@@ -26,7 +27,7 @@ constexpr DGLDataType DGLDataTypeTraits<uint64_t>::dtype;
#ifdef DGL_USE_CUDA
constexpr DGLDataType DGLDataTypeTraits<__half>::dtype;
#if BF16_ENABLED
-constexpr DGLDataType DGLDataTypeTraits<__nv_bfloat16>::dtype;
+constexpr DGLDataType DGLDataTypeTraits<__hip_bfloat16>::dtype;
#endif  // BF16_ENABLED
#endif  // DGL_USE_CUDA
constexpr DGLDataType DGLDataTypeTraits<float>::dtype;
...
...
@@ -222,7 +223,7 @@ void NDArray::RecordedCopyFromTo(
  CHECK(from->ctx.device_type != to->ctx.device_type)
      << "Recoding event is only called for the copy between CPU and GPU.";
-  CHECK(from->ctx.device_type == kDGLCUDA || to->ctx.device_type == kDGLCUDA)
+  CHECK(from->ctx.device_type == kDGLCUDA || to->ctx.device_type == kDGLROCM)
      << "At least one CUDA ctx needs to be involved.";
  DeviceAPI::Get(kDGLCUDA)->RecordedCopyDataFromTo(
...
...
@@ -262,7 +263,7 @@ void NDArray::PinContainer(NDArray::Container* ptr) {
void NDArray::UnpinContainer(NDArray::Container* ptr) {
  auto container_is_pinned = IsContainerPinned(ptr);
  // The tensor may be pinned outside of DGL via a different CUDA API,
-  // so we cannot unpin it with cudaHostUnregister.
+  // so we cannot unpin it with hipHostUnregister.
  CHECK(ptr->pinned_by_dgl_ || !container_is_pinned)
      << "Cannot unpin a tensor that is pinned outside of DGL.";
  // 1. not pinned, do nothing
...
...
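Several of the files above (DeviceAPI::PinData/UnpinData, NDArray::UnpinContainer) now refer to hipHostRegister/hipHostUnregister, HIP's equivalents for pinning an existing host allocation. A minimal standalone sketch of that pinning flow (not DGL's wrapper):

// Standalone sketch of page-locking an existing host allocation with HIP.
#include <hip/hip_runtime.h>
#include <vector>

int main() {
  std::vector<float> host(1 << 20);
  // Pin the buffer so hipMemcpyAsync can use true asynchronous DMA.
  hipHostRegister(host.data(), host.size() * sizeof(float),
                  hipHostRegisterDefault);

  float* dev = nullptr;
  hipMalloc(&dev, host.size() * sizeof(float));
  hipMemcpyAsync(dev, host.data(), host.size() * sizeof(float),
                 hipMemcpyHostToDevice, /*stream=*/0);
  hipDeviceSynchronize();

  hipFree(dev);
  hipHostUnregister(host.data());  // unpin before the vector is destroyed
}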