OpenDAS / dgl / Commits / 74d88bf8

Commit 74d88bf8, authored Feb 20, 2025 by sangwz
Merge branch 'dtk25.04' of http://developer.sourcefind.cn/codes/OpenDAS/dgl into 2.2.1
Parents: 2a1ac588, 314cedc1
Changes: 179 files in total. This page shows 20 changed files, with 298 additions and 267 deletions (+298 −267).
src/graph/serialize/heterograph_serialize.cc      +3  −2
src/graph/subgraph.cc                             +2  −1
src/graph/transform/cuda/cuda_compact_graph.hip   +5  −4
src/graph/transform/cuda/cuda_map_edges.cuh       +4  −3
src/graph/transform/cuda/cuda_to_block.hip        +13 −12
src/graph/transform/cuda/knn.hip                  +24 −22
src/graph/traversal.cc                            +2  −1
src/graph/unit_graph.cc                           +3  −2
src/partition/cuda/partition_op.hip               +21 −19
src/partition/ndarray_partition.cc                +7  −7
src/random/continuous_seed.h                      +13 −12
src/rpc/rpc.cc                                    +2  −1
src/rpc/rpc.h                                     +4  −3
src/runtime/c_runtime_api.cc                      +6  −3
src/runtime/cuda/cuda_common.h                    +67 −66
src/runtime/cuda/cuda_device_api.cc               +88 −86
src/runtime/cuda/cuda_hashtable.cuh               +6  −4
src/runtime/cuda/cuda_hashtable.hip               +20 −12
src/runtime/cuda/gpu_cache.hip                    +7  −6
src/runtime/module.cc                             +1  −1
src/graph/serialize/heterograph_serialize.cc

// !!! This is a file automatically generated by hipify!!!
/**
 * Copyright (c) 2019 by Contributors
 * @file graph/serialize/heterograph_serialize.cc
...
@@ -48,8 +49,8 @@
 #include <vector>
 #include "../heterograph.h"
-#include "./dglstream.h"
-#include "./graph_serialize.h"
+#include "dglstream.h"
+#include "graph_serialize.h"
 #include "dmlc/memory_io.h"
 namespace dgl {
...
src/graph/subgraph.cc

// !!! This is a file automatically generated by hipify!!!
/**
 * Copyright (c) 2020 by Contributors
 * @file graph/subgraph.cc
 * @brief Functions for extracting subgraphs.
 */
-#include "./heterograph.h"
+#include "heterograph.h"
 using namespace dgl::runtime;
 namespace dgl {
...
src/graph/transform/cuda/cuda_compact_graph.cu → src/graph/transform/cuda/cuda_compact_graph.hip

// !!! This is a file automatically generated by hipify!!!
/**
 * Copyright 2021 Contributors
 *
...
@@ -18,7 +19,7 @@
 * all given graphs with the same set of nodes.
 */
-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
 #include <dgl/immutable_graph.h>
 #include <dgl/runtime/device_api.h>
...
@@ -55,10 +56,10 @@ template <typename IdType>
 void BuildNodeMaps(
     const std::vector<IdArray> &input_nodes,
     DeviceNodeMap<IdType> *const node_maps, int64_t *const count_unique_device,
-    std::vector<IdArray> *const unique_nodes_device, cudaStream_t stream) {
+    std::vector<IdArray> *const unique_nodes_device, hipStream_t stream) {
   const int64_t num_ntypes = static_cast<int64_t>(input_nodes.size());
-  CUDA_CALL(cudaMemsetAsync(
+  CUDA_CALL(hipMemsetAsync(
       count_unique_device, 0, num_ntypes * sizeof(*count_unique_device),
       stream));
...
@@ -81,7 +82,7 @@ std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>> CompactGraphsGPU(
     const std::vector<IdArray> &always_preserve) {
   const auto &ctx = graphs[0]->Context();
   auto device = runtime::DeviceAPI::Get(ctx);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   CHECK_EQ(ctx.device_type, kDGLCUDA);
...
src/graph/transform/cuda/cuda_map_edges.cuh

// !!! This is a file automatically generated by hipify!!!
/**
 * Copyright 2020-2022 Contributors
 *
...
@@ -22,7 +23,7 @@
 #include <dgl/runtime/c_runtime_api.h>
 #include <dgl/base_heterograph.h>
-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
 #include <dgl/runtime/c_runtime_api.h>
 #include <algorithm>
...
@@ -113,7 +114,7 @@ class DeviceNodeMap {
   DeviceNodeMap(
       const std::vector<int64_t>& num_nodes, const int64_t offset,
-      DGLContext ctx, cudaStream_t stream)
+      DGLContext ctx, hipStream_t stream)
       : num_types_(num_nodes.size()),
         rhs_offset_(offset),
         hash_tables_(),
...
@@ -185,7 +186,7 @@ inline IdType RoundUp(const IdType num, const size_t unit) {
 template <typename IdType>
 std::tuple<std::vector<IdArray>, std::vector<IdArray>> MapEdges(
     HeteroGraphPtr graph, const std::vector<EdgeArray>& edge_sets,
-    const DeviceNodeMap<IdType>& node_map, cudaStream_t stream) {
+    const DeviceNodeMap<IdType>& node_map, hipStream_t stream) {
   constexpr const int BLOCK_SIZE = 128;
   constexpr const size_t TILE_SIZE = 1024;
...
src/graph/transform/cuda/cuda_to_block.cu → src/graph/transform/cuda/cuda_to_block.hip

// !!! This is a file automatically generated by hipify!!!
/**
 * Copyright 2020-2021 Contributors
 *
...
@@ -20,7 +21,7 @@
 * Tested via python wrapper: python/dgl/path/to/to_block.py
 */
-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
 #include <dgl/immutable_graph.h>
 #include <dgl/runtime/device_api.h>
 #include <dgl/runtime/tensordispatch.h>
...
@@ -69,10 +70,10 @@ class DeviceNodeMapMaker {
       const std::vector<IdArray>& lhs_nodes,
       const std::vector<IdArray>& rhs_nodes,
       DeviceNodeMap<IdType>* const node_maps, int64_t* const count_lhs_device,
-      std::vector<IdArray>* const lhs_device, cudaStream_t stream) {
+      std::vector<IdArray>* const lhs_device, hipStream_t stream) {
     const int64_t num_ntypes = lhs_nodes.size() + rhs_nodes.size();
-    CUDA_CALL(cudaMemsetAsync(
+    CUDA_CALL(hipMemsetAsync(
         count_lhs_device, 0, num_ntypes * sizeof(*count_lhs_device), stream));
     // possibly dublicate lhs nodes
...
@@ -112,7 +113,7 @@ class DeviceNodeMapMaker {
   void Make(
       const std::vector<IdArray>& lhs_nodes,
       const std::vector<IdArray>& rhs_nodes,
-      DeviceNodeMap<IdType>* const node_maps, cudaStream_t stream) {
+      DeviceNodeMap<IdType>* const node_maps, hipStream_t stream) {
     const int64_t num_ntypes = lhs_nodes.size() + rhs_nodes.size();
     // unique lhs nodes
...
@@ -155,7 +156,7 @@ struct CUDAIdsMapper {
     std::vector<int64_t>& num_nodes_per_type = *num_nodes_per_type_ptr;
     const bool generate_lhs_nodes = lhs_nodes.empty();
     auto device = runtime::DeviceAPI::Get(ctx);
-    cudaStream_t stream = runtime::getCurrentCUDAStream();
+    hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
     // Allocate space for map creation process.
     DeviceNodeMapMaker<IdType> maker(maxNodesPerType);
...
@@ -168,7 +169,7 @@ struct CUDAIdsMapper {
       }
     }
-    cudaEvent_t copyEvent;
+    hipEvent_t copyEvent;
     NDArray new_len_tensor;
     // Populate the mappings.
     if (generate_lhs_nodes) {
...
@@ -179,7 +180,7 @@ struct CUDAIdsMapper {
           src_nodes, rhs_nodes, &node_maps, count_lhs_device, &lhs_nodes,
           stream);
-      CUDA_CALL(cudaEventCreate(&copyEvent));
+      CUDA_CALL(hipEventCreate(&copyEvent));
       if (TensorDispatcher::Global()->IsAvailable()) {
         new_len_tensor = NDArray::PinnedEmpty(
             {num_ntypes}, DGLDataTypeTraits<int64_t>::dtype,
...
@@ -190,11 +191,11 @@ struct CUDAIdsMapper {
             {num_ntypes}, DGLDataTypeTraits<int64_t>::dtype,
             DGLContext{kDGLCPU, 0});
       }
-      CUDA_CALL(cudaMemcpyAsync(
+      CUDA_CALL(hipMemcpyAsync(
           new_len_tensor->data, count_lhs_device,
           sizeof(*num_nodes_per_type.data()) * num_ntypes,
-          cudaMemcpyDeviceToHost, stream));
-      CUDA_CALL(cudaEventRecord(copyEvent, stream));
+          hipMemcpyDeviceToHost, stream));
+      CUDA_CALL(hipEventRecord(copyEvent, stream));
       device->FreeWorkspace(ctx, count_lhs_device);
     } else {
...
@@ -209,8 +210,8 @@ struct CUDAIdsMapper {
     if (generate_lhs_nodes) {
       // wait for the previous copy
-      CUDA_CALL(cudaEventSynchronize(copyEvent));
-      CUDA_CALL(cudaEventDestroy(copyEvent));
+      CUDA_CALL(hipEventSynchronize(copyEvent));
+      CUDA_CALL(hipEventDestroy(copyEvent));
       // Resize lhs nodes.
       for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) {
...
src/graph/transform/cuda/knn.cu → src/graph/transform/cuda/knn.hip

// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
/**
 * Copyright (c) 2020 by Contributors
 * @file graph/transform/cuda/knn.cu
 * @brief k-nearest-neighbor (KNN) implementation (cuda)
 */
-#include <curand_kernel.h>
+#include <hiprand/hiprand_kernel.h>
 #include <dgl/array.h>
 #include <dgl/random.h>
 #include <dgl/runtime/device_api.h>
 #include <algorithm>
-#include <cub/cub.cuh>  // NOLINT
+#include <hipcub/hipcub.hpp>  // NOLINT
 #include <limits>
 #include <string>
 #include <type_traits>
...
@@ -467,7 +469,7 @@ void BruteForceKNNCuda(
     const NDArray& data_points, const IdArray& data_offsets,
     const NDArray& query_points, const IdArray& query_offsets, const int k,
     IdArray result) {
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   const auto& ctx = data_points->ctx;
   auto device = runtime::DeviceAPI::Get(ctx);
   const int64_t batch_size = data_offsets->shape[0] - 1;
...
@@ -512,7 +514,7 @@ void BruteForceKNNSharedCuda(
     const NDArray& data_points, const IdArray& data_offsets,
     const NDArray& query_points, const IdArray& query_offsets, const int k,
     IdArray result) {
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   const auto& ctx = data_points->ctx;
   auto device = runtime::DeviceAPI::Get(ctx);
   const int64_t batch_size = data_offsets->shape[0] - 1;
...
@@ -528,8 +530,8 @@ void BruteForceKNNSharedCuda(
   // get max shared memory per block in bytes
   // determine block size according to this value
   int max_sharedmem_per_block = 0;
-  CUDA_CALL(cudaDeviceGetAttribute(
-      &max_sharedmem_per_block, cudaDevAttrMaxSharedMemoryPerBlock,
+  CUDA_CALL(hipDeviceGetAttribute(
+      &max_sharedmem_per_block, hipDeviceAttributeMaxSharedMemoryPerBlock,
       ctx.device_id));
   const int64_t single_shared_mem = static_cast<int64_t>(Pow2Align<size_t>(
       (k + 2 * feature_size) * sizeof(FloatType) + k * sizeof(IdType),
...
@@ -552,17 +554,17 @@ void BruteForceKNNSharedCuda(
       GetNumBlockPerSegment, temp_num_blocks, temp_block_size, 0, stream,
       query_offsets_data, num_block_per_segment, batch_size, block_size);
   size_t prefix_temp_size = 0;
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
       nullptr, prefix_temp_size, num_block_per_segment, num_block_prefixsum,
       batch_size, stream));
   void* prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size);
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
       prefix_temp, prefix_temp_size, num_block_per_segment, num_block_prefixsum,
       batch_size, stream));
   device->FreeWorkspace(ctx, prefix_temp);
   // wait for results
-  CUDA_CALL(cudaStreamSynchronize(stream));
+  CUDA_CALL(hipStreamSynchronize(stream));
   int64_t num_blocks = 0, final_elem = 0,
           copyoffset = (batch_size - 1) * sizeof(IdType);
...
@@ -603,10 +605,10 @@ void BruteForceKNNSharedCuda(
 /** @brief Setup rng state for nn-descent */
 __global__ void SetupRngKernel(
-    curandState* states, const uint64_t seed, const size_t n) {
+    hiprandState_t* states, const uint64_t seed, const size_t n) {
   size_t id = blockIdx.x * blockDim.x + threadIdx.x;
   if (id < n) {
-    curand_init(seed, id, 0, states + id);
+    hiprand_init(seed, id, 0, states + id);
   }
 }
...
@@ -622,8 +624,8 @@ __global__ void RandomInitNeighborsKernel(
   const IdType point_idx = blockIdx.x * blockDim.x + threadIdx.x;
   IdType batch_idx = 0;
   if (point_idx >= offsets[batch_size]) return;
-  curandState state;
-  curand_init(seed, point_idx, 0, &state);
+  hiprandState_t state;
+  hiprand_init(seed, point_idx, 0, &state);
   // find the segment location in the input batch
   for (IdType b = 0; b < batch_size + 1; ++b) {
...
@@ -646,7 +648,7 @@ __global__ void RandomInitNeighborsKernel(
     current_central_nodes[i] = point_idx;
   }
   for (IdType i = k; i < segment_size; ++i) {
-    const IdType j = static_cast<IdType>(curand(&state) % (i + 1));
+    const IdType j = static_cast<IdType>(hiprand(&state) % (i + 1));
     if (j < k) current_neighbors[j] = i + segment_start;
   }
...
@@ -674,8 +676,8 @@ __global__ void FindCandidatesKernel(
   const IdType point_idx = blockIdx.x * blockDim.x + threadIdx.x;
   IdType batch_idx = 0;
   if (point_idx >= offsets[batch_size]) return;
-  curandState state;
-  curand_init(seed, point_idx, 0, &state);
+  hiprandState_t state;
+  hiprand_init(seed, point_idx, 0, &state);
   // find the segment location in the input batch
   for (IdType b = 0; b < batch_size + 1; ++b) {
...
@@ -711,7 +713,7 @@ __global__ void FindCandidatesKernel(
       if (curr_num < num_candidates) {
         candidate_data[curr_num] = candidate;
       } else {
-        IdType pos = static_cast<IdType>(curand(&state) % (curr_num + 1));
+        IdType pos = static_cast<IdType>(hiprand(&state) % (curr_num + 1));
         if (pos < num_candidates) candidate_data[pos] = candidate;
       }
       ++candidate_array[0];
...
@@ -732,7 +734,7 @@ __global__ void FindCandidatesKernel(
       if (curr_num < num_candidates) {
         candidate_data[curr_num] = reverse_candidate;
       } else {
-        IdType pos = static_cast<IdType>(curand(&state) % (curr_num + 1));
+        IdType pos = static_cast<IdType>(hiprand(&state) % (curr_num + 1));
         if (pos < num_candidates) candidate_data[pos] = reverse_candidate;
       }
       ++candidate_array[0];
...
@@ -873,7 +875,7 @@ template <DGLDeviceType XPU, typename FloatType, typename IdType>
 void NNDescent(
     const NDArray& points, const IdArray& offsets, IdArray result, const int k,
     const int num_iters, const int num_candidates, const double delta) {
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   const auto& ctx = points->ctx;
   auto device = runtime::DeviceAPI::Get(ctx);
   const int64_t num_nodes = points->shape[0];
...
@@ -887,7 +889,7 @@ void NNDescent(
   uint64_t seed;
   int warp_size = 0;
   CUDA_CALL(
-      cudaDeviceGetAttribute(&warp_size, cudaDevAttrWarpSize, ctx.device_id));
+      hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, ctx.device_id));
   // We don't need large block sizes, since there's not much inter-thread
   // communication
   int64_t block_size = warp_size;
...
@@ -911,7 +913,7 @@ void NNDescent(
   IdType* total_num_updates_d =
       static_cast<IdType*>(device->AllocWorkspace(ctx, sizeof(IdType)));
-  CUDA_CALL(cub::DeviceReduce::Sum(
+  CUDA_CALL(hipcub::DeviceReduce::Sum(
       nullptr, sum_temp_size, num_updates, total_num_updates_d, num_nodes,
       stream));
   IdType* sum_temp_storage =
...
@@ -942,7 +944,7 @@ void NNDescent(
         feature_size);
     total_num_updates = 0;
-    CUDA_CALL(cub::DeviceReduce::Sum(
+    CUDA_CALL(hipcub::DeviceReduce::Sum(
         sum_temp_storage, sum_temp_size, num_updates, total_num_updates_d,
         num_nodes, stream));
     device->CopyDataFromTo(
...
src/graph/traversal.cc

// !!! This is a file automatically generated by hipify!!!
/**
 * Copyright (c) 2018 by Contributors
 * @file graph/traversal.cc
 * @brief Graph traversal implementation
 */
-#include "./traversal.h"
+#include "traversal.h"
 #include <dgl/packed_func_ext.h>
...
src/graph/unit_graph.cc

// !!! This is a file automatically generated by hipify!!!
/**
 * Copyright (c) 2019 by Contributors
 * @file graph/unit_graph.cc
 * @brief UnitGraph graph implementation
 */
-#include "./unit_graph.h"
+#include "unit_graph.h"
 #include <dgl/array.h>
 #include <dgl/base_heterograph.h>
...
@@ -11,7 +12,7 @@
 #include <dgl/lazy.h>
 #include "../c_api_common.h"
-#include "./serialize/dglstream.h"
+#include "serialize/dglstream.h"
 namespace dgl {
...
src/partition/cuda/partition_op.cu → src/partition/cuda/partition_op.hip

// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
/**
 * Copyright (c) 2021 by Contributors
 * @file ndarray_partition.h
...
@@ -6,7 +8,7 @@
 #include <dgl/runtime/device_api.h>
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
 #include "../../runtime/cuda/cuda_common.h"
 #include "../../runtime/workspace.h"
...
@@ -239,7 +241,7 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRemainder(
   const auto& ctx = in_idx->ctx;
   auto device = DeviceAPI::Get(ctx);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   const int64_t num_in = in_idx->shape[0];
...
@@ -263,7 +265,7 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRemainder(
   }
   const int64_t part_bits =
-      static_cast<int64_t>(std::ceil(std::log2(num_parts)));
+      static_cast<int64_t>(::ceil(std::log2(num_parts)));
   // First, generate a mapping of indexes to processors
   Workspace<IdType> proc_id_in(device, ctx, num_in);
...
@@ -295,13 +297,13 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRemainder(
   IdArray perm_in = aten::Range(0, num_in, sizeof(IdType) * 8, ctx);
   size_t sort_workspace_size;
-  CUDA_CALL(cub::DeviceRadixSort::SortPairs(
+  CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
       nullptr, sort_workspace_size, proc_id_in.get(), proc_id_out.get(),
       static_cast<IdType*>(perm_in->data), perm_out, num_in, 0, part_bits,
       stream));
   Workspace<void> sort_workspace(device, ctx, sort_workspace_size);
-  CUDA_CALL(cub::DeviceRadixSort::SortPairs(
+  CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
       sort_workspace.get(), sort_workspace_size, proc_id_in.get(),
       proc_id_out.get(), static_cast<IdType*>(perm_in->data), perm_out,
       num_in, 0, part_bits, stream));
...
@@ -317,7 +319,7 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRemainder(
   static_assert(
       sizeof(AtomicCount) == sizeof(*out_counts),
       "AtomicCount must be the same width as int64_t for atomicAdd "
-      "in cub::DeviceHistogram::HistogramEven() to work");
+      "in hipcub::DeviceHistogram::HistogramEven() to work");
   // TODO(dlasalle): Once https://github.com/NVIDIA/cub/pull/287 is merged,
   // add a compile time check against the cub version to allow
...
@@ -327,14 +329,14 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRemainder(
       "value of int.";
   size_t hist_workspace_size;
-  CUDA_CALL(cub::DeviceHistogram::HistogramEven(
+  CUDA_CALL(hipcub::DeviceHistogram::HistogramEven(
       nullptr, hist_workspace_size, proc_id_out.get(),
       reinterpret_cast<AtomicCount*>(out_counts), num_parts + 1,
       static_cast<IdType>(0), static_cast<IdType>(num_parts),
       static_cast<int>(num_in), stream));
   Workspace<void> hist_workspace(device, ctx, hist_workspace_size);
-  CUDA_CALL(cub::DeviceHistogram::HistogramEven(
+  CUDA_CALL(hipcub::DeviceHistogram::HistogramEven(
       hist_workspace.get(), hist_workspace_size, proc_id_out.get(),
       reinterpret_cast<AtomicCount*>(out_counts), num_parts + 1,
       static_cast<IdType>(0), static_cast<IdType>(num_parts),
...
@@ -352,7 +354,7 @@ template std::pair<IdArray, IdArray> GeneratePermutationFromRemainder<
 template <DGLDeviceType XPU, typename IdType>
 IdArray MapToLocalFromRemainder(const int num_parts, IdArray global_idx) {
   const auto& ctx = global_idx->ctx;
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   if (num_parts > 1) {
     IdArray local_idx =
...
@@ -387,7 +389,7 @@ IdArray MapToGlobalFromRemainder(
       << num_parts;
   const auto& ctx = local_idx->ctx;
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   if (num_parts > 1) {
     IdArray global_idx =
...
@@ -423,7 +425,7 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRange(
   const auto& ctx = in_idx->ctx;
   auto device = DeviceAPI::Get(ctx);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   const int64_t num_in = in_idx->shape[0];
...
@@ -447,7 +449,7 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRange(
   }
   const int64_t part_bits =
-      static_cast<int64_t>(std::ceil(std::log2(num_parts)));
+      static_cast<int64_t>(::ceil(std::log2(num_parts)));
   // First, generate a mapping of indexes to processors
   Workspace<IdType> proc_id_in(device, ctx, num_in);
...
@@ -470,13 +472,13 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRange(
   IdArray perm_in = aten::Range(0, num_in, sizeof(IdType) * 8, ctx);
   size_t sort_workspace_size;
-  CUDA_CALL(cub::DeviceRadixSort::SortPairs(
+  CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
       nullptr, sort_workspace_size, proc_id_in.get(), proc_id_out.get(),
       static_cast<IdType*>(perm_in->data), perm_out, num_in, 0, part_bits,
       stream));
   Workspace<void> sort_workspace(device, ctx, sort_workspace_size);
-  CUDA_CALL(cub::DeviceRadixSort::SortPairs(
+  CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
       sort_workspace.get(), sort_workspace_size, proc_id_in.get(),
       proc_id_out.get(), static_cast<IdType*>(perm_in->data), perm_out,
       num_in, 0, part_bits, stream));
...
@@ -492,7 +494,7 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRange(
   static_assert(
       sizeof(AtomicCount) == sizeof(*out_counts),
       "AtomicCount must be the same width as int64_t for atomicAdd "
-      "in cub::DeviceHistogram::HistogramEven() to work");
+      "in hipcub::DeviceHistogram::HistogramEven() to work");
   // TODO(dlasalle): Once https://github.com/NVIDIA/cub/pull/287 is merged,
   // add a compile time check against the cub version to allow
...
@@ -502,14 +504,14 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRange(
       "value of int.";
   size_t hist_workspace_size;
-  CUDA_CALL(cub::DeviceHistogram::HistogramEven(
+  CUDA_CALL(hipcub::DeviceHistogram::HistogramEven(
       nullptr, hist_workspace_size, proc_id_out.get(),
       reinterpret_cast<AtomicCount*>(out_counts), num_parts + 1,
       static_cast<IdType>(0), static_cast<IdType>(num_parts),
       static_cast<int>(num_in), stream));
   Workspace<void> hist_workspace(device, ctx, hist_workspace_size);
-  CUDA_CALL(cub::DeviceHistogram::HistogramEven(
+  CUDA_CALL(hipcub::DeviceHistogram::HistogramEven(
       hist_workspace.get(), hist_workspace_size, proc_id_out.get(),
       reinterpret_cast<AtomicCount*>(out_counts), num_parts + 1,
       static_cast<IdType>(0), static_cast<IdType>(num_parts),
...
@@ -536,7 +538,7 @@ template <DGLDeviceType XPU, typename IdType, typename RangeType>
 IdArray MapToLocalFromRange(
     const int num_parts, IdArray range, IdArray global_idx) {
   const auto& ctx = global_idx->ctx;
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   if (num_parts > 1 && global_idx->shape[0] > 0) {
     IdArray local_idx =
...
@@ -576,7 +578,7 @@ IdArray MapToGlobalFromRange(
       << num_parts;
   const auto& ctx = local_idx->ctx;
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   if (num_parts > 1 && local_idx->shape[0] > 0) {
     IdArray global_idx =
...
src/partition/ndarray_partition.cc

...
@@ -39,7 +39,7 @@ class RemainderPartition : public NDArrayPartition {
       IdArray in_idx) const override {
 #ifdef DGL_USE_CUDA
     auto ctx = in_idx->ctx;
-    if (ctx.device_type == kDGLCUDA) {
+    if (ctx.device_type == kDGLCUDA || ctx.device_type == kDGLROCM) {
       ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, {
         return impl::GeneratePermutationFromRemainder<kDGLCUDA, IdType>(
             ArraySize(), NumParts(), in_idx);
...
@@ -56,7 +56,7 @@ class RemainderPartition : public NDArrayPartition {
   IdArray MapToLocal(IdArray in_idx) const override {
 #ifdef DGL_USE_CUDA
     auto ctx = in_idx->ctx;
-    if (ctx.device_type == kDGLCUDA) {
+    if (ctx.device_type == kDGLCUDA || ctx.device_type == kDGLROCM) {
       ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, {
         return impl::MapToLocalFromRemainder<kDGLCUDA, IdType>(
             NumParts(), in_idx);
...
@@ -73,7 +73,7 @@ class RemainderPartition : public NDArrayPartition {
   IdArray MapToGlobal(IdArray in_idx, const int part_id) const override {
 #ifdef DGL_USE_CUDA
     auto ctx = in_idx->ctx;
-    if (ctx.device_type == kDGLCUDA) {
+    if (ctx.device_type == kDGLCUDA || ctx.device_type == kDGLROCM) {
       ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, {
         return impl::MapToGlobalFromRemainder<kDGLCUDA, IdType>(
             NumParts(), in_idx, part_id);
...
@@ -107,7 +107,7 @@ class RangePartition : public NDArrayPartition {
         // we have only one CPU context, and can safely copy the array to that.
         range_cpu_(range.CopyTo(DGLContext{kDGLCPU, 0})) {
     auto ctx = range->ctx;
-    if (ctx.device_type != kDGLCUDA) {
+    if (ctx.device_type != kDGLCUDA && ctx.device_type != kDGLROCM) {
       LOG(FATAL) << "The range for an NDArrayPartition is only supported "
                     " on GPUs. Transfer the range to the target device before "
                     "creating the partition.";
...
@@ -118,7 +118,7 @@ class RangePartition : public NDArrayPartition {
       IdArray in_idx) const override {
 #ifdef DGL_USE_CUDA
     auto ctx = in_idx->ctx;
-    if (ctx.device_type == kDGLCUDA) {
+    if (ctx.device_type == kDGLCUDA || ctx.device_type == kDGLROCM) {
       if (ctx.device_type != range_->ctx.device_type ||
           ctx.device_id != range_->ctx.device_id) {
         LOG(FATAL) << "The range for the NDArrayPartition and the input "
...
@@ -144,7 +144,7 @@ class RangePartition : public NDArrayPartition {
   IdArray MapToLocal(IdArray in_idx) const override {
 #ifdef DGL_USE_CUDA
     auto ctx = in_idx->ctx;
-    if (ctx.device_type == kDGLCUDA) {
+    if (ctx.device_type == kDGLCUDA || ctx.device_type == kDGLROCM) {
       ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, {
         ATEN_ID_TYPE_SWITCH(range_->dtype, RangeType, {
           return impl::MapToLocalFromRange<kDGLCUDA, IdType, RangeType>(
...
@@ -163,7 +163,7 @@ class RangePartition : public NDArrayPartition {
   IdArray MapToGlobal(IdArray in_idx, const int part_id) const override {
 #ifdef DGL_USE_CUDA
     auto ctx = in_idx->ctx;
-    if (ctx.device_type == kDGLCUDA) {
+    if (ctx.device_type == kDGLCUDA || ctx.device_type == kDGLROCM) {
       ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, {
         ATEN_ID_TYPE_SWITCH(range_->dtype, RangeType, {
          return impl::MapToGlobalFromRange<kDGLCUDA, IdType, RangeType>(
...
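Note: every hunk in this file repeats the same two-branch device check (kDGLCUDA or kDGLROCM). As a minimal sketch only, the check could be factored into a helper; the name IsGPUContext is hypothetical and is not part of this commit.

    // Hypothetical helper (not in this commit): centralizes the device check
    // repeated in each NDArrayPartition method above.
    inline bool IsGPUContext(const DGLContext& ctx) {
      return ctx.device_type == kDGLCUDA || ctx.device_type == kDGLROCM;
    }

    // Usage sketch inside a method body:
    //   if (IsGPUContext(in_idx->ctx)) { /* GPU path */ }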
src/random/continuous_seed.h

// !!! This is a file automatically generated by hipify!!!
/*!
 * Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
 * All rights reserved.
...
@@ -24,13 +25,13 @@
 #include <cmath>
-#ifdef __NVCC__
-#include <curand_kernel.h>
+#ifdef __HIPCC__
+#include <hiprand/hiprand_kernel.h>
 #else
 #include <random>
 #include "pcg_random.hpp"
-#endif  // __CUDA_ARCH__
+#endif  // __HIP_DEVICE_COMPILE__
 #ifndef M_SQRT1_2
 #define M_SQRT1_2 0.707106781186547524401
...
@@ -58,24 +59,24 @@ class continuous_seed {
     c[1] = std::sin(pi * r / 2);
   }
-#ifdef __CUDA_ARCH__
+#ifdef __HIP_DEVICE_COMPILE__
   __device__ inline float uniform(const uint64_t t) const {
     const uint64_t kCurandSeed = 999961;  // Could be any random number.
-    curandStatePhilox4_32_10_t rng;
-    curand_init(kCurandSeed, s[0], t, &rng);
+    hiprandStatePhilox4_32_10_t rng;
+    hiprand_init(kCurandSeed, s[0], t, &rng);
     float rnd;
     if (s[0] != s[1]) {
-      rnd = c[0] * curand_normal(&rng);
-      curand_init(kCurandSeed, s[1], t, &rng);
-      rnd += c[1] * curand_normal(&rng);
+      rnd = c[0] * hiprand_normal(&rng);
+      hiprand_init(kCurandSeed, s[1], t, &rng);
+      rnd += c[1] * hiprand_normal(&rng);
       rnd = normcdff(rnd);
     } else {
-      rnd = curand_uniform(&rng);
+      rnd = hiprand_uniform(&rng);
     }
     return rnd;
   }
 #else
-  inline float uniform(const uint64_t t) const {
+  __host__ inline float uniform(const uint64_t t) const {
     pcg32 ng0(s[0], t);
     float rnd;
     if (s[0] != s[1]) {
...
@@ -91,7 +92,7 @@ class continuous_seed {
     }
     return rnd;
   }
-#endif  // __CUDA_ARCH__
+#endif  // __HIP_DEVICE_COMPILE__
 };
 }  // namespace random
...
src/rpc/rpc.cc

// !!! This is a file automatically generated by hipify!!!
/**
 * Copyright (c) 2020 by Contributors
 * @file rpc/rpc.cc
 * @brief Implementation of RPC utilities used by both server and client sides.
 */
 #if defined(__linux__)
-#include "./rpc.h"
+#include "rpc.h"
 #include <dgl/array.h>
 #include <dgl/packed_func_ext.h>
...
src/rpc/rpc.h

// !!! This is a file automatically generated by hipify!!!
/**
 * Copyright (c) 2020 by Contributors
 * @file rpc/rpc.h
...
@@ -19,9 +20,9 @@
 #include <unordered_map>
 #include <vector>
-#include "./network/common.h"
-#include "./rpc_msg.h"
-#include "./server_state.h"
+#include "network/common.h"
+#include "rpc_msg.h"
+#include "server_state.h"
 #include "network/socket_communicator.h"
 namespace dgl {
...
src/runtime/c_runtime_api.cc

// !!! This is a file automatically generated by hipify!!!
/**
 * Copyright (c) 2016-2022 by Contributors
 * @file c_runtime_api.cc
...
@@ -32,6 +33,8 @@ inline std::string DeviceName(int type) {
       return "cpu";
     case kDGLCUDA:
       return "cuda";
+    case kDGLROCM:
+      return "cuda";
     // add more device here once supported
     default:
       LOG(FATAL) << "unknown type =" << type;
...
@@ -122,13 +125,13 @@ void DeviceAPI::SyncStreamFromTo(
 }
 bool DeviceAPI::PinData(void* ptr, size_t nbytes) {
-  LOG(FATAL) << "Device does not support cudaHostRegister api.";
+  LOG(FATAL) << "Device does not support hipHostRegister api.";
   return false;
 }
 void* DeviceAPI::AllocPinnedDataSpace(
     size_t nbytes, void** ctx, void** deleter) {
-  LOG(FATAL) << "Device does not support cudaHostAlloc api.";
+  LOG(FATAL) << "Device does not support hipHostMalloc api.";
   return nullptr;
 }
...
@@ -137,7 +140,7 @@ void DeviceAPI::FreePinnedDataSpace(void** deleter) {
 }
 void DeviceAPI::UnpinData(void* ptr) {
-  LOG(FATAL) << "Device does not support cudaHostUnregister api.";
+  LOG(FATAL) << "Device does not support hipHostUnregister api.";
 }
 }  // namespace runtime
 }  // namespace dgl
...
src/runtime/cuda/cuda_common.h

// !!! This is a file automatically generated by hipify!!!
/**
 * Copyright (c) 2017 by Contributors
 * @file cuda_common.h
...
@@ -6,10 +7,10 @@
 #ifndef DGL_RUNTIME_CUDA_CUDA_COMMON_H_
 #define DGL_RUNTIME_CUDA_CUDA_COMMON_H_
-#include <cublas_v2.h>
-#include <cuda_runtime.h>
-#include <curand.h>
-#include <cusparse.h>
+#include <hipblas/hipblas.h>
+#include <hip/hip_runtime.h>
+#include <hiprand/hiprand.h>
+#include <hipsparse/hipsparse.h>
 #include <dgl/runtime/packed_func.h>
 #include <memory>
...
@@ -25,8 +26,8 @@ namespace runtime {
    DGL's memory pool and the current cuda stream
    runtime::CUDAWorkspaceAllocator allocator(ctx);
-   const auto stream = runtime::getCurrentCUDAStream();
-   const auto exec_policy = thrust::cuda::par_nosync(allocator).on(stream);
+   const auto stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
+   const auto exec_policy = thrust::hip::par_nosync(allocator).on(stream);
    now, one can pass exec_policy to thrust functions
...
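As a usage sketch of the pattern documented in the header comment above: this is illustrative only and not part of the commit; `ptr` and `length` stand for some existing device buffer, and the rocThrust headers are assumed to be available.

    // Illustrative sketch: allocate temporaries from DGL's pool and run a
    // thrust algorithm asynchronously on the current stream.
    #include <thrust/fill.h>
    runtime::CUDAWorkspaceAllocator allocator(ctx);
    const auto stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
    const auto exec_policy = thrust::hip::par_nosync(allocator).on(stream);
    thrust::fill(exec_policy, ptr, ptr + length, 0.0f);  // any thrust algorithm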
@@ -79,112 +80,112 @@ inline bool is_zero<dim3>(dim3 size) {
 #define CUDA_DRIVER_CALL(x)                                             \
   {                                                                     \
-    CUresult result = x;                                                \
-    if (result != CUDA_SUCCESS && result != CUDA_ERROR_DEINITIALIZED) { \
+    hipError_t result = x;                                              \
+    if (result != hipSuccess && result != hipErrorDeinitialized) {      \
       const char* msg;                                                  \
-      cuGetErrorName(result, &msg);                                     \
+      hipGetErrorName(result, &msg);                                    \
       LOG(FATAL) << "CUDAError: " #x " failed with error: " << msg;     \
     }                                                                   \
   }

 #define CUDA_CALL(func)                                      \
   {                                                          \
-    cudaError_t e = (func);                                  \
-    CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \
-        << "CUDA: " << cudaGetErrorString(e);                \
+    hipError_t e = (func);                                   \
+    CHECK(e == hipSuccess || e == hipErrorDeinitialized)     \
+        << "CUDA: " << hipGetErrorString(e);                 \
   }

 #define CUDA_KERNEL_CALL(kernel, nblks, nthrs, shmem, stream, ...)            \
   {                                                                           \
     if (!dgl::runtime::is_zero((nblks)) && !dgl::runtime::is_zero((nthrs))) { \
-      (kernel)<<<(nblks), (nthrs), (shmem), (stream)>>>(__VA_ARGS__);         \
-      cudaError_t e = cudaGetLastError();                                     \
-      CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading)                \
-          << "CUDA kernel launch error: " << cudaGetErrorString(e);           \
+      hipLaunchKernelGGL(((kernel)), dim3((nblks)), dim3((nthrs)), (shmem),   \
+                         (stream), __VA_ARGS__);                              \
+      hipError_t e = hipGetLastError();                                       \
+      CHECK(e == hipSuccess || e == hipErrorDeinitialized)                    \
+          << "CUDA kernel launch error: " << hipGetErrorString(e);            \
     }                                                                         \
   }

 #define CUSPARSE_CALL(func)                                          \
   {                                                                  \
-    cusparseStatus_t e = (func);                                     \
-    CHECK(e == CUSPARSE_STATUS_SUCCESS) << "CUSPARSE ERROR: " << e;  \
+    hipsparseStatus_t e = (func);                                    \
+    CHECK(e == HIPSPARSE_STATUS_SUCCESS) << "CUSPARSE ERROR: " << e; \
   }

 #define CUBLAS_CALL(func)                                        \
   {                                                              \
-    cublasStatus_t e = (func);                                   \
-    CHECK(e == CUBLAS_STATUS_SUCCESS) << "CUBLAS ERROR: " << e;  \
+    hipblasStatus_t e = (func);                                  \
+    CHECK(e == HIPBLAS_STATUS_SUCCESS) << "CUBLAS ERROR: " << e; \
   }

 #define CURAND_CALL(func)                                                      \
   {                                                                            \
-    curandStatus_t e = (func);                                                 \
-    CHECK(e == CURAND_STATUS_SUCCESS)                                          \
+    hiprandStatus_t e = (func);                                                \
+    CHECK(e == HIPRAND_STATUS_SUCCESS)                                         \
        << "CURAND Error: " << dgl::runtime::curandGetErrorString(e) << " at "  \
        << __FILE__ << ":" << __LINE__;                                         \
   }

-inline const char* curandGetErrorString(curandStatus_t error) {
+inline const char* curandGetErrorString(hiprandStatus_t error) {
   switch (error) {
-    case CURAND_STATUS_SUCCESS:
-      return "CURAND_STATUS_SUCCESS";
-    case CURAND_STATUS_VERSION_MISMATCH:
-      return "CURAND_STATUS_VERSION_MISMATCH";
-    case CURAND_STATUS_NOT_INITIALIZED:
-      return "CURAND_STATUS_NOT_INITIALIZED";
-    case CURAND_STATUS_ALLOCATION_FAILED:
-      return "CURAND_STATUS_ALLOCATION_FAILED";
-    case CURAND_STATUS_TYPE_ERROR:
-      return "CURAND_STATUS_TYPE_ERROR";
-    case CURAND_STATUS_OUT_OF_RANGE:
-      return "CURAND_STATUS_OUT_OF_RANGE";
-    case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
-      return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
-    case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
-      return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
-    case CURAND_STATUS_LAUNCH_FAILURE:
-      return "CURAND_STATUS_LAUNCH_FAILURE";
-    case CURAND_STATUS_PREEXISTING_FAILURE:
-      return "CURAND_STATUS_PREEXISTING_FAILURE";
-    case CURAND_STATUS_INITIALIZATION_FAILED:
-      return "CURAND_STATUS_INITIALIZATION_FAILED";
-    case CURAND_STATUS_ARCH_MISMATCH:
-      return "CURAND_STATUS_ARCH_MISMATCH";
-    case CURAND_STATUS_INTERNAL_ERROR:
-      return "CURAND_STATUS_INTERNAL_ERROR";
+    case HIPRAND_STATUS_SUCCESS:
+      return "HIPRAND_STATUS_SUCCESS";
+    case HIPRAND_STATUS_VERSION_MISMATCH:
+      return "HIPRAND_STATUS_VERSION_MISMATCH";
+    case HIPRAND_STATUS_NOT_INITIALIZED:
+      return "HIPRAND_STATUS_NOT_INITIALIZED";
+    case HIPRAND_STATUS_ALLOCATION_FAILED:
+      return "HIPRAND_STATUS_ALLOCATION_FAILED";
+    case HIPRAND_STATUS_TYPE_ERROR:
+      return "HIPRAND_STATUS_TYPE_ERROR";
+    case HIPRAND_STATUS_OUT_OF_RANGE:
+      return "HIPRAND_STATUS_OUT_OF_RANGE";
+    case HIPRAND_STATUS_LENGTH_NOT_MULTIPLE:
+      return "HIPRAND_STATUS_LENGTH_NOT_MULTIPLE";
+    case HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED:
+      return "HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED";
+    case HIPRAND_STATUS_LAUNCH_FAILURE:
+      return "HIPRAND_STATUS_LAUNCH_FAILURE";
+    case HIPRAND_STATUS_PREEXISTING_FAILURE:
+      return "HIPRAND_STATUS_PREEXISTING_FAILURE";
+    case HIPRAND_STATUS_INITIALIZATION_FAILED:
+      return "HIPRAND_STATUS_INITIALIZATION_FAILED";
+    case HIPRAND_STATUS_ARCH_MISMATCH:
+      return "HIPRAND_STATUS_ARCH_MISMATCH";
+    case HIPRAND_STATUS_INTERNAL_ERROR:
+      return "HIPRAND_STATUS_INTERNAL_ERROR";
   }
   // To suppress compiler warning.
-  return "Unrecognized curand error string";
+  return "Unrecognized hiprand error string";
 }

 /**
- * @brief Cast data type to cudaDataType_t.
+ * @brief Cast data type to hipDataType.
  */
 template <typename T>
 struct cuda_dtype {
-  static constexpr cudaDataType_t value = CUDA_R_32F;
+  static constexpr hipDataType value = HIP_R_32F;
 };

 template <>
 struct cuda_dtype<__half> {
-  static constexpr cudaDataType_t value = CUDA_R_16F;
+  static constexpr hipDataType value = HIP_R_16F;
 };

 #if BF16_ENABLED
 template <>
-struct cuda_dtype<__nv_bfloat16> {
-  static constexpr cudaDataType_t value = CUDA_R_16BF;
+struct cuda_dtype<__hip_bfloat16> {
+  static constexpr hipDataType value = HIP_R_16BF;
 };
 #endif  // BF16_ENABLED

 template <>
 struct cuda_dtype<float> {
-  static constexpr cudaDataType_t value = CUDA_R_32F;
+  static constexpr hipDataType value = HIP_R_32F;
 };

 template <>
 struct cuda_dtype<double> {
-  static constexpr cudaDataType_t value = CUDA_R_64F;
+  static constexpr hipDataType value = HIP_R_64F;
 };
...
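A minimal usage sketch of the CUDA_KERNEL_CALL macro in the hunk above: the kernel `Fill` and the wrapper function are made up for illustration and are not part of this commit. The point is that the same call site compiles whether the macro expands to the CUDA `<<<...>>>` launch or to hipLaunchKernelGGL.

    // Illustrative only: a trivial kernel launched through CUDA_KERNEL_CALL.
    __global__ void Fill(float* out, int n, float v) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) out[i] = v;
    }

    void FillOnStream(float* out, int n, hipStream_t stream) {
      const int nthrs = 256;
      const int nblks = (n + nthrs - 1) / nthrs;
      // shared memory = 0; error checking happens inside the macro.
      CUDA_KERNEL_CALL(Fill, nblks, nthrs, 0, stream, out, n, 1.0f);
    }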
@@ -202,7 +203,7 @@ struct accum_dtype<__half> {
 #if BF16_ENABLED
 template <>
-struct accum_dtype<__nv_bfloat16> {
+struct accum_dtype<__hip_bfloat16> {
   typedef float type;
 };
 #endif  // BF16_ENABLED
...
@@ -217,23 +218,23 @@ struct accum_dtype<double> {
   typedef double type;
 };

-#if CUDART_VERSION >= 11000
+#if DTKRT_VERSION >= 11000
 /**
- * @brief Cast index data type to cusparseIndexType_t.
+ * @brief Cast index data type to hipsparseIndexType_t.
  */
 template <typename T>
 struct cusparse_idtype {
-  static constexpr cusparseIndexType_t value = CUSPARSE_INDEX_32I;
+  static constexpr hipsparseIndexType_t value = HIPSPARSE_INDEX_32I;
 };

 template <>
 struct cusparse_idtype<int32_t> {
-  static constexpr cusparseIndexType_t value = CUSPARSE_INDEX_32I;
+  static constexpr hipsparseIndexType_t value = HIPSPARSE_INDEX_32I;
 };

 template <>
 struct cusparse_idtype<int64_t> {
-  static constexpr cusparseIndexType_t value = CUSPARSE_INDEX_64I;
+  static constexpr hipsparseIndexType_t value = HIPSPARSE_INDEX_64I;
 };
 #endif
...
@@ -241,9 +242,9 @@ struct cusparse_idtype<int64_t> {
 class CUDAThreadEntry {
  public:
   /** @brief The cusparse handler */
-  cusparseHandle_t cusparse_handle{nullptr};
+  hipsparseHandle_t cusparse_handle{nullptr};
   /** @brief The cublas handler */
-  cublasHandle_t cublas_handle{nullptr};
+  hipblasHandle_t cublas_handle{nullptr};
   /** @brief thread local pool*/
   WorkspacePool pool;
   /** @brief constructor */
...
@@ -253,7 +254,7 @@ class CUDAThreadEntry {
 };
 /** @brief Get the current CUDA stream */
-cudaStream_t getCurrentCUDAStream();
+hipStream_t getCurrentHIPStreamMasqueradingAsCUDA();
 }  // namespace runtime
 }  // namespace dgl
 #endif  // DGL_RUNTIME_CUDA_CUDA_COMMON_H_
src/runtime/cuda/cuda_device_api.cc

// !!! This is a file automatically generated by hipify!!!
/**
 * Copyright (c) 2017-2022 by Contributors
 * @file cuda_device_api.cc
 * @brief GPU specific API
 */
-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
 #include <dgl/runtime/device_api.h>
 #include <dgl/runtime/registry.h>
 #include <dgl/runtime/tensordispatch.h>
...
@@ -18,13 +19,13 @@ class CUDADeviceAPI final : public DeviceAPI {
  public:
   CUDADeviceAPI() {
     int count;
-    auto err = cudaGetDeviceCount(&count);
+    auto err = hipGetDeviceCount(&count);
     switch (err) {
-      case cudaSuccess:
+      case hipSuccess:
         break;
       default:
         count = 0;
-        cudaGetLastError();
+        hipGetLastError();
     }
     is_available_ = count > 0;
   }
...
@@ -32,67 +33,67 @@ class CUDADeviceAPI final : public DeviceAPI {
   bool IsAvailable() final { return is_available_; }
   void SetDevice(DGLContext ctx) final {
-    CUDA_CALL(cudaSetDevice(ctx.device_id));
+    CUDA_CALL(hipSetDevice(ctx.device_id));
   }
   void GetAttr(DGLContext ctx, DeviceAttrKind kind, DGLRetValue* rv) final {
     int value = 0;
     switch (kind) {
       case kExist:
         value =
-            (cudaDeviceGetAttribute(
-                 &value, cudaDevAttrMaxThreadsPerBlock, ctx.device_id) ==
-             cudaSuccess);
+            (hipDeviceGetAttribute(
+                 &value, hipDeviceAttributeMaxThreadsPerBlock, ctx.device_id) ==
+             hipSuccess);
         break;
       case kMaxThreadsPerBlock: {
-        CUDA_CALL(cudaDeviceGetAttribute(
-            &value, cudaDevAttrMaxThreadsPerBlock, ctx.device_id));
+        CUDA_CALL(hipDeviceGetAttribute(
+            &value, hipDeviceAttributeMaxThreadsPerBlock, ctx.device_id));
         break;
       }
       case kWarpSize: {
         CUDA_CALL(
-            cudaDeviceGetAttribute(&value, cudaDevAttrWarpSize, ctx.device_id));
+            hipDeviceGetAttribute(&value, hipDeviceAttributeWarpSize, ctx.device_id));
         break;
       }
       case kMaxSharedMemoryPerBlock: {
-        CUDA_CALL(cudaDeviceGetAttribute(
-            &value, cudaDevAttrMaxSharedMemoryPerBlock, ctx.device_id));
+        CUDA_CALL(hipDeviceGetAttribute(
+            &value, hipDeviceAttributeMaxSharedMemoryPerBlock, ctx.device_id));
         break;
       }
       case kComputeVersion: {
         std::ostringstream os;
-        CUDA_CALL(cudaDeviceGetAttribute(
-            &value, cudaDevAttrComputeCapabilityMajor, ctx.device_id));
+        CUDA_CALL(hipDeviceGetAttribute(
+            &value, hipDeviceAttributeComputeCapabilityMajor, ctx.device_id));
         os << value << ".";
-        CUDA_CALL(cudaDeviceGetAttribute(
-            &value, cudaDevAttrComputeCapabilityMinor, ctx.device_id));
+        CUDA_CALL(hipDeviceGetAttribute(
+            &value, hipDeviceAttributeComputeCapabilityMinor, ctx.device_id));
         os << value;
         *rv = os.str();
         return;
       }
       case kDeviceName: {
-        cudaDeviceProp props;
-        CUDA_CALL(cudaGetDeviceProperties(&props, ctx.device_id));
+        hipDeviceProp_t props;
+        CUDA_CALL(hipGetDeviceProperties(&props, ctx.device_id));
         *rv = std::string(props.name);
         return;
       }
       case kMaxClockRate: {
-        CUDA_CALL(cudaDeviceGetAttribute(
-            &value, cudaDevAttrClockRate, ctx.device_id));
+        CUDA_CALL(hipDeviceGetAttribute(
+            &value, hipDeviceAttributeClockRate, ctx.device_id));
         break;
       }
       case kMultiProcessorCount: {
-        CUDA_CALL(cudaDeviceGetAttribute(
-            &value, cudaDevAttrMultiProcessorCount, ctx.device_id));
+        CUDA_CALL(hipDeviceGetAttribute(
+            &value, hipDeviceAttributeMultiprocessorCount, ctx.device_id));
         break;
       }
       case kMaxThreadDimensions: {
         int dims[3];
-        CUDA_CALL(cudaDeviceGetAttribute(
-            &dims[0], cudaDevAttrMaxBlockDimX, ctx.device_id));
-        CUDA_CALL(cudaDeviceGetAttribute(
-            &dims[1], cudaDevAttrMaxBlockDimY, ctx.device_id));
-        CUDA_CALL(cudaDeviceGetAttribute(
-            &dims[2], cudaDevAttrMaxBlockDimZ, ctx.device_id));
+        CUDA_CALL(hipDeviceGetAttribute(
+            &dims[0], hipDeviceAttributeMaxBlockDimX, ctx.device_id));
+        CUDA_CALL(hipDeviceGetAttribute(
+            &dims[1], hipDeviceAttributeMaxBlockDimY, ctx.device_id));
+        CUDA_CALL(hipDeviceGetAttribute(
+            &dims[2], hipDeviceAttributeMaxBlockDimZ, ctx.device_id));
         std::stringstream ss;  // use json string to return multiple int values;
         ss << "[" << dims[0] << ", " << dims[1] << ", " << dims[2] << "]";
...
@@ -110,11 +111,11 @@ class CUDADeviceAPI final : public DeviceAPI {
     TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global();
     if (tensor_dispatcher->IsAvailable()) {
       return tensor_dispatcher->CUDAAllocWorkspace(
-          nbytes, getCurrentCUDAStream());
+          nbytes, getCurrentHIPStreamMasqueradingAsCUDA());
     }
     CHECK_EQ(256 % alignment, 0U) << "CUDA space is aligned at 256 bytes";
     void* ret;
-    CUDA_CALL(cudaMalloc(&ret, nbytes));
+    CUDA_CALL(hipMalloc(&ret, nbytes));
     return ret;
   }
...
@@ -124,34 +125,35 @@ class CUDADeviceAPI final : public DeviceAPI {
     if (tensor_dispatcher->IsAvailable()) {
       return tensor_dispatcher->CUDAFreeWorkspace(ptr);
     }
-    CUDA_CALL(cudaFree(ptr));
+    CUDA_CALL(hipFree(ptr));
   }
   void CopyDataFromTo(
       const void* from, size_t from_offset, void* to, size_t to_offset,
       size_t size, DGLContext ctx_from, DGLContext ctx_to,
       DGLDataType type_hint, DGLStreamHandle stream) {
-    cudaStream_t cu_stream = static_cast<cudaStream_t>(stream);
+    hipStream_t cu_stream = static_cast<hipStream_t>(stream);
     from = static_cast<const char*>(from) + from_offset;
     to = static_cast<char*>(to) + to_offset;
-    if (ctx_from.device_type == kDGLCUDA && ctx_to.device_type == kDGLCUDA) {
-      CUDA_CALL(cudaSetDevice(ctx_from.device_id));
+    // if (ctx_from.device_type == kDGLCUDA && ctx_to.device_type == kDGLCUDA || ctx_from.device_type == kDGLROCM && ctx_to.device_type == kDGLROCM) {
+    if ((ctx_from.device_type == kDGLCUDA || ctx_from.device_type == kDGLROCM) &&
+        (ctx_to.device_type == kDGLCUDA || ctx_to.device_type == kDGLROCM)) {
+      CUDA_CALL(hipSetDevice(ctx_from.device_id));
       if (ctx_from.device_id == ctx_to.device_id) {
-        GPUCopy(from, to, size, cudaMemcpyDeviceToDevice, cu_stream);
+        GPUCopy(from, to, size, hipMemcpyDeviceToDevice, cu_stream);
       } else {
-        CUDA_CALL(cudaMemcpyPeerAsync(
+        CUDA_CALL(hipMemcpyPeerAsync(
             to, ctx_to.device_id, from, ctx_from.device_id, size, cu_stream));
       }
     } else if (
-        ctx_from.device_type == kDGLCUDA && ctx_to.device_type == kDGLCPU) {
-      CUDA_CALL(cudaSetDevice(ctx_from.device_id));
-      GPUCopy(from, to, size, cudaMemcpyDeviceToHost, cu_stream);
+        (ctx_from.device_type == kDGLCUDA || ctx_from.device_type == kDGLROCM) &&
+        ctx_to.device_type == kDGLCPU) {
+      CUDA_CALL(hipSetDevice(ctx_from.device_id));
+      GPUCopy(from, to, size, hipMemcpyDeviceToHost, cu_stream);
     } else if (
-        ctx_from.device_type == kDGLCPU && ctx_to.device_type == kDGLCUDA) {
-      CUDA_CALL(cudaSetDevice(ctx_to.device_id));
-      GPUCopy(from, to, size, cudaMemcpyHostToDevice, cu_stream);
+        ctx_from.device_type == kDGLCPU &&
+        (ctx_to.device_type == kDGLCUDA || ctx_to.device_type == kDGLROCM)) {
+      CUDA_CALL(hipSetDevice(ctx_to.device_id));
+      GPUCopy(from, to, size, hipMemcpyHostToDevice, cu_stream);
     } else {
-      LOG(FATAL) << "expect copy from/to GPU or between GPU";
+      LOG(FATAL) << "expect copy from/to GPU or between GPU. ctx_from.device_type: "
+                 << ctx_from.device_type << ", ctx_to.device_type: " << ctx_to.device_type;
     }
   }
...
@@ -166,9 +168,9 @@ class CUDADeviceAPI final : public DeviceAPI {
   }
   // To ensure correct behavior, `record_event` must be invoked anytime a
-  // pointer from PyTorch CachingHostAllocator is used in a cudaMemcpyAsync
+  // pointer from PyTorch CachingHostAllocator is used in a hipMemcpyAsync
   // call. It provides a way to re-use freed pinned (page-locked) memory
-  // allocations and avoid device sync due to cudaFreeHost calls.
+  // allocations and avoid device sync due to hipHostFree calls.
   void RecordedCopyDataFromTo(
       void* from, size_t from_offset, void* to, size_t to_offset, size_t size,
       DGLContext ctx_from, DGLContext ctx_to, DGLDataType type_hint,
...
@@ -179,7 +181,7 @@ class CUDADeviceAPI final : public DeviceAPI {
         stream);
     auto tensor_dispatcher = TensorDispatcher::Global();
     if (tensor_dispatcher->IsAvailable()) {
-      auto custream = static_cast<cudaStream_t>(stream);
+      auto custream = static_cast<hipStream_t>(stream);
       void* ptr = ctx_to.device_type == kDGLCPU ? to : from;
       int id = ctx_to.device_type == kDGLCPU ? ctx_from.device_id : ctx_to.device_id;
...
@@ -188,34 +190,34 @@ class CUDADeviceAPI final : public DeviceAPI {
   }
   DGLStreamHandle CreateStream(DGLContext ctx) {
-    CUDA_CALL(cudaSetDevice(ctx.device_id));
-    cudaStream_t retval;
+    CUDA_CALL(hipSetDevice(ctx.device_id));
+    hipStream_t retval;
     // make sure the legacy default stream won't block on this stream
-    CUDA_CALL(cudaStreamCreateWithFlags(&retval, cudaStreamNonBlocking));
+    CUDA_CALL(hipStreamCreateWithFlags(&retval, hipStreamNonBlocking));
     return static_cast<DGLStreamHandle>(retval);
   }
   void FreeStream(DGLContext ctx, DGLStreamHandle stream) {
-    CUDA_CALL(cudaSetDevice(ctx.device_id));
-    cudaStream_t cu_stream = static_cast<cudaStream_t>(stream);
-    CUDA_CALL(cudaStreamDestroy(cu_stream));
+    CUDA_CALL(hipSetDevice(ctx.device_id));
+    hipStream_t cu_stream = static_cast<hipStream_t>(stream);
+    CUDA_CALL(hipStreamDestroy(cu_stream));
   }
   void SyncStreamFromTo(
       DGLContext ctx, DGLStreamHandle event_src, DGLStreamHandle event_dst) {
-    CUDA_CALL(cudaSetDevice(ctx.device_id));
-    cudaStream_t src_stream = static_cast<cudaStream_t>(event_src);
-    cudaStream_t dst_stream = static_cast<cudaStream_t>(event_dst);
-    cudaEvent_t evt;
-    CUDA_CALL(cudaEventCreate(&evt));
-    CUDA_CALL(cudaEventRecord(evt, src_stream));
-    CUDA_CALL(cudaStreamWaitEvent(dst_stream, evt, 0));
-    CUDA_CALL(cudaEventDestroy(evt));
+    CUDA_CALL(hipSetDevice(ctx.device_id));
+    hipStream_t src_stream = static_cast<hipStream_t>(event_src);
+    hipStream_t dst_stream = static_cast<hipStream_t>(event_dst);
+    hipEvent_t evt;
+    CUDA_CALL(hipEventCreate(&evt));
+    CUDA_CALL(hipEventRecord(evt, src_stream));
+    CUDA_CALL(hipStreamWaitEvent(dst_stream, evt, 0));
+    CUDA_CALL(hipEventDestroy(evt));
   }
   void StreamSync(DGLContext ctx, DGLStreamHandle stream) final {
-    CUDA_CALL(cudaSetDevice(ctx.device_id));
-    CUDA_CALL(cudaStreamSynchronize(static_cast<cudaStream_t>(stream)));
+    CUDA_CALL(hipSetDevice(ctx.device_id));
+    CUDA_CALL(hipStreamSynchronize(static_cast<hipStream_t>(stream)));
   }
   /** NOTE: If the backend is PyTorch, we will use PyTorch's stream management,
...
@@ -227,10 +229,10 @@ class CUDADeviceAPI final : public DeviceAPI {
   void SetStream(DGLContext ctx, DGLStreamHandle stream) final {}
   DGLStreamHandle GetStream() const final {
-    return static_cast<DGLStreamHandle>(getCurrentCUDAStream());
+    return static_cast<DGLStreamHandle>(getCurrentHIPStreamMasqueradingAsCUDA());
   }
-  /** NOTE: cudaHostRegister can be called from an arbitrary GPU device,
+  /** NOTE: hipHostRegister can be called from an arbitrary GPU device,
   * so we don't need to specify a ctx.
   * The pinned memory can be seen by all CUDA contexts,
   * not just the one that performed the allocation
...
@@ -244,13 +246,13 @@ class CUDADeviceAPI final : public DeviceAPI {
     if (tensor_dispatcher->IsAvailable()) {
       tensor_dispatcher->CUDAHostAllocatorEmptyCache();
     }
-    CUDA_CALL(cudaHostRegister(ptr, nbytes, cudaHostRegisterDefault));
+    CUDA_CALL(hipHostRegister(ptr, nbytes, hipHostRegisterDefault));
     return true;
   }
   void UnpinData(void* ptr) {
     if (ptr == nullptr) return;
-    CUDA_CALL(cudaHostUnregister(ptr));
+    CUDA_CALL(hipHostUnregister(ptr));
   }
   void* AllocPinnedDataSpace(
...
@@ -276,33 +278,33 @@ class CUDADeviceAPI final : public DeviceAPI {
     // can't be a pinned tensor if CUDA context is unavailable.
     if (!is_available_) return false;
-    cudaPointerAttributes attr;
-    cudaError_t status = cudaPointerGetAttributes(&attr, ptr);
+    hipPointerAttribute_t attr;
+    hipError_t status = hipPointerGetAttributes(&attr, ptr);
     bool result = false;
     switch (status) {
-      case cudaErrorInvalidValue:
+      case hipErrorInvalidValue:
         // might be a normal CPU tensor in CUDA 10.2-
-        cudaGetLastError();  // clear error
+        hipGetLastError();  // clear error
         break;
-      case cudaSuccess:
-        result = (attr.type == cudaMemoryTypeHost);
+      case hipSuccess:
+        result = (attr.type == hipMemoryTypeHost);
         break;
-      case cudaErrorInitializationError:
-      case cudaErrorNoDevice:
-      case cudaErrorInsufficientDriver:
-      case cudaErrorInvalidDevice:
+      case hipErrorInitializationError:
+      case hipErrorNoDevice:
+      case hipErrorInsufficientDriver:
+      case hipErrorInvalidDevice:
         // We don't want to fail in these particular cases since this function
         // can be called when users only want to run on CPU even if CUDA API is
         // enabled, or in a forked subprocess where CUDA context cannot be
         // initialized. So we just mark the CUDA context to unavailable and
         // return.
         is_available_ = false;
-        cudaGetLastError();  // clear error
+        hipGetLastError();  // clear error
         break;
       default:
         LOG(FATAL) << "error while determining memory status: "
-                   << cudaGetErrorString(status);
+                   << hipGetErrorString(status);
         break;
     }
...
@@ -316,7 +318,7 @@ class CUDADeviceAPI final : public DeviceAPI {
     TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global();
     if (tensor_dispatcher->IsAvailable())
       return tensor_dispatcher->CUDAAllocWorkspace(
-          size, getCurrentCUDAStream());
+          size, getCurrentHIPStreamMasqueradingAsCUDA());
     return CUDAThreadEntry::ThreadLocal()->pool.AllocWorkspace(ctx, size);
   }
...
@@ -338,13 +340,13 @@ class CUDADeviceAPI final : public DeviceAPI {
  private:
   static void GPUCopy(
-      const void* from, void* to, size_t size, cudaMemcpyKind kind,
-      cudaStream_t stream) {
-    CUDA_CALL(cudaMemcpyAsync(to, from, size, kind, stream));
-    if (stream == 0 && kind == cudaMemcpyDeviceToHost) {
+      const void* from, void* to, size_t size, hipMemcpyKind kind,
+      hipStream_t stream) {
+    CUDA_CALL(hipMemcpyAsync(to, from, size, kind, stream));
+    if (stream == 0 && kind == hipMemcpyDeviceToHost) {
       // only wait for the copy, when it's on the default stream, and it's to
       // host memory
-      CUDA_CALL(cudaStreamSynchronize(stream));
+      CUDA_CALL(hipStreamSynchronize(stream));
     }
   }
...
@@ -359,7 +361,7 @@ CUDAThreadEntry* CUDAThreadEntry::ThreadLocal() {
   return CUDAThreadStore::Get();
 }
-cudaStream_t getCurrentCUDAStream() {
+hipStream_t getCurrentHIPStreamMasqueradingAsCUDA() {
   TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global();
   if (tensor_dispatcher->IsAvailable())
     return tensor_dispatcher->CUDAGetCurrentStream();
...
src/runtime/cuda/cuda_hashtable.cuh

// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
/**
 * Copyright (c) 2021 by Contributors
 * @file runtime/cuda/cuda_device_common.cuh
...
@@ -10,7 +12,7 @@
 #include <dgl/runtime/c_runtime_api.h>
 #include "cuda_common.h"
-#include "cuda_runtime.h"
+#include <hip/hip_runtime.h>
 namespace dgl {
 namespace runtime {
...
@@ -228,7 +230,7 @@ class OrderedHashTable {
   * @param stream The stream to use for initializing the hashtable.
   */
  OrderedHashTable(
-      const size_t size, DGLContext ctx, cudaStream_t stream,
+      const size_t size, DGLContext ctx, hipStream_t stream,
      const int scale = kDefaultScale);
 /**
...
@@ -252,7 +254,7 @@ class OrderedHashTable {
  */
  void FillWithDuplicates(
      const IdType* const input, const size_t num_input, IdType* const unique,
-      int64_t* const num_unique, cudaStream_t stream);
+      int64_t* const num_unique, hipStream_t stream);
 /**
  * @brief Fill the hashtable with an array of unique keys.
...
@@ -262,7 +264,7 @@ class OrderedHashTable {
  * @param stream The stream to perform operations on.
  */
  void FillWithUnique(
-      const IdType* const input, const size_t num_input, cudaStream_t stream);
+      const IdType* const input, const size_t num_input, hipStream_t stream);
 /**
  * @brief Get a verison of the hashtable usable from device functions.
...
src/runtime/cuda/cuda_hashtable.cu → src/runtime/cuda/cuda_hashtable.hip

// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
/**
 * Copyright (c) 2021 by Contributors
 * @file runtime/cuda/cuda_device_common.cuh
...
@@ -5,7 +7,7 @@
 */
 #include <cassert>
-#include <cub/cub.cuh>  // NOLINT
+#include <hipcub/hipcub.hpp>  // NOLINT
 #include "../../array/cuda/atomic.cuh"
 #include "cuda_common.h"
...
@@ -54,7 +56,9 @@ class MutableDeviceOrderedHashTable : public DeviceOrderedHashTable<IdType> {
   * @return The mapping.
   */
  inline __device__ Iterator Search(const IdType id) {
-    const IdType pos = SearchForPosition(id);
+    // const IdType pos = SearchForPosition(id);
+    const IdType pos = DeviceOrderedHashTable<IdType>::SearchForPosition(id);
    return GetMutable(pos);
  }
...
@@ -95,12 +99,16 @@ class MutableDeviceOrderedHashTable : public DeviceOrderedHashTable<IdType> {
   * @return An iterator to inserted mapping.
   */
  inline __device__ Iterator Insert(const IdType id, const size_t index) {
-    size_t pos = Hash(id);
+    // size_t pos = Hash(id);
+    size_t pos = DeviceOrderedHashTable<IdType>::Hash(id);
    // linearly scan for an empty slot or matching entry
    IdType delta = 1;
    while (!AttemptInsertAt(pos, id, index)) {
-      pos = Hash(pos + delta);
+      // pos = Hash(pos + delta);
+      pos = DeviceOrderedHashTable<IdType>::Hash(pos + delta);
      delta += 1;
    }
...
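The Search/Insert hunks above qualify calls to base-class members (SearchForPosition, Hash) with DeviceOrderedHashTable<IdType>::. A minimal sketch of why that is needed: under two-phase name lookup, which clang-based hipcc enforces strictly, an unqualified call to a member of a dependent base class is not found at template-definition time. The class names below are made up purely for illustration.

    // Illustrative only: unqualified lookup into a dependent base fails under
    // strict two-phase lookup; qualifying the call (or using this->) fixes it.
    template <typename T>
    struct Base {
      int Hash(T x) const { return static_cast<int>(x) % 97; }
    };

    template <typename T>
    struct Derived : Base<T> {
      int Probe(T x) const {
        // return Hash(x);         // error with clang/hipcc: 'Hash' not found
        return Base<T>::Hash(x);   // qualified lookup, as in the diff above
        // return this->Hash(x);   // equivalent alternative
      }
    };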
@@ -246,7 +254,7 @@ __global__ void count_hashmap(
    DeviceOrderedHashTable<IdType> table, IdType* const num_unique) {
  assert(BLOCK_SIZE == blockDim.x);
-  using BlockReduce = typename cub::BlockReduce<IdType, BLOCK_SIZE>;
+  using BlockReduce = typename hipcub::BlockReduce<IdType, BLOCK_SIZE>;
  using Mapping = typename DeviceOrderedHashTable<IdType>::Mapping;
  const size_t block_start = TILE_SIZE * blockIdx.x;
...
@@ -300,7 +308,7 @@ __global__ void compact_hashmap(
  assert(BLOCK_SIZE == blockDim.x);
  using FlagType = uint16_t;
-  using BlockScan = typename cub::BlockScan<FlagType, BLOCK_SIZE>;
+  using BlockScan = typename hipcub::BlockScan<FlagType, BLOCK_SIZE>;
  using Mapping = typename DeviceOrderedHashTable<IdType>::Mapping;
  constexpr const int32_t VALS_PER_THREAD = TILE_SIZE / BLOCK_SIZE;
...
@@ -359,7 +367,7 @@ DeviceOrderedHashTable<IdType> OrderedHashTable<IdType>::DeviceHandle() const {
 template <typename IdType>
 OrderedHashTable<IdType>::OrderedHashTable(
-    const size_t size, DGLContext ctx, cudaStream_t stream, const int scale)
+    const size_t size, DGLContext ctx, hipStream_t stream, const int scale)
    : table_(nullptr), size_(TableSize(size, scale)), ctx_(ctx) {
  // make sure we will at least as many buckets as items.
  CHECK_GT(scale, 0);
...
@@ -368,7 +376,7 @@ OrderedHashTable<IdType>::OrderedHashTable(
  table_ = static_cast<Mapping*>(
      device->AllocWorkspace(ctx_, sizeof(Mapping) * size_));
-  CUDA_CALL(cudaMemsetAsync(
+  CUDA_CALL(hipMemsetAsync(
      table_, DeviceOrderedHashTable<IdType>::kEmptyKey,
      sizeof(Mapping) * size_, stream));
 }
...
@@ -382,7 +390,7 @@ OrderedHashTable<IdType>::~OrderedHashTable() {
 template <typename IdType>
 void OrderedHashTable<IdType>::FillWithDuplicates(
    const IdType* const input, const size_t num_input, IdType* const unique,
-    int64_t* const num_unique, cudaStream_t stream) {
+    int64_t* const num_unique, hipStream_t stream) {
  auto device = runtime::DeviceAPI::Get(ctx_);
  const int64_t num_tiles = (num_input + TILE_SIZE - 1) / TILE_SIZE;
...
@@ -404,12 +412,12 @@ void OrderedHashTable<IdType>::FillWithDuplicates(
      input, num_input, device_table, item_prefix);
  size_t workspace_bytes;
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
      nullptr, workspace_bytes, static_cast<IdType*>(nullptr),
      static_cast<IdType*>(nullptr), grid.x + 1, stream));
  void* workspace = device->AllocWorkspace(ctx_, workspace_bytes);
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
      workspace, workspace_bytes, item_prefix, item_prefix, grid.x + 1,
      stream));
  device->FreeWorkspace(ctx_, workspace);
...
@@ -422,7 +430,7 @@ void OrderedHashTable<IdType>::FillWithDuplicates(
 template <typename IdType>
 void OrderedHashTable<IdType>::FillWithUnique(
-    const IdType* const input, const size_t num_input, cudaStream_t stream) {
+    const IdType* const input, const size_t num_input, hipStream_t stream) {
  const int64_t num_tiles = (num_input + TILE_SIZE - 1) / TILE_SIZE;
  const dim3 grid(num_tiles);
...
src/runtime/cuda/gpu_cache.cu → src/runtime/cuda/gpu_cache.hip

// !!! This is a file automatically generated by hipify!!!
/*!
 * Copyright (c) 2022 by Contributors
 *
...
@@ -20,7 +21,7 @@
 #ifndef DGL_RUNTIME_CUDA_GPU_CACHE_H_
 #define DGL_RUNTIME_CUDA_GPU_CACHE_H_
-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
 #include <dgl/array.h>
 #include <dgl/aten/array_ops.h>
 #include <dgl/packed_func_ext.h>
...
@@ -31,7 +32,7 @@
 #include <nv_gpu_cache.hpp>
-#include "../../runtime/cuda/cuda_common.h"
+#include "cuda_common.h"
 namespace dgl {
 namespace runtime {
...
@@ -40,7 +41,7 @@ namespace cuda {
 template <typename key_t>
 class GpuCache : public runtime::Object {
  constexpr static int set_associativity = 2;
-  constexpr static int WARP_SIZE = 32;
+  constexpr static int WARP_SIZE = 64;
  constexpr static int bucket_size = WARP_SIZE * set_associativity;
  using gpu_cache_t = gpu_cache::gpu_cache<
      key_t, uint64_t, std::numeric_limits<key_t>::max(), set_associativity,
...
@@ -55,12 +56,12 @@ class GpuCache : public runtime::Object {
      : num_feats(num_feats),
        cache(std::make_unique<gpu_cache_t>(
            (num_items + bucket_size - 1) / bucket_size, num_feats)) {
-    CUDA_CALL(cudaGetDevice(&cuda_device));
+    CUDA_CALL(hipGetDevice(&cuda_device));
  }
  std::tuple<NDArray, IdArray, IdArray> Query(IdArray keys) {
    const auto &ctx = keys->ctx;
-    cudaStream_t stream = dgl::runtime::getCurrentCUDAStream();
+    hipStream_t stream = dgl::runtime::getCurrentHIPStreamMasqueradingAsCUDA();
    auto device = dgl::runtime::DeviceAPI::Get(ctx);
    CHECK_EQ(ctx.device_type, kDGLCUDA)
        << "The keys should be on a CUDA device";
...
@@ -94,7 +95,7 @@ class GpuCache : public runtime::Object {
  }
  void Replace(IdArray keys, NDArray values) {
-    cudaStream_t stream = dgl::runtime::getCurrentCUDAStream();
+    hipStream_t stream = dgl::runtime::getCurrentHIPStreamMasqueradingAsCUDA();
    CHECK_EQ(keys->ctx.device_type, kDGLCUDA)
        << "The keys should be on a CUDA device";
    CHECK_EQ(keys->ctx.device_id, cuda_device)
...
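The hunk above hard-codes WARP_SIZE = 64 to match AMD's wavefront width. Where a runtime value is acceptable (the constexpr template parameter here still needs a compile-time constant), the width could instead be queried with the same attribute API used elsewhere in this commit. A hypothetical sketch only, not part of the change:

    // Hypothetical sketch (not in this commit): query the warp/wavefront size
    // of a device instead of hard-coding 32 or 64.
    int GetWarpSize(int device_id) {
      int warp_size = 0;
      CUDA_CALL(hipDeviceGetAttribute(
          &warp_size, hipDeviceAttributeWarpSize, device_id));
      return warp_size;  // typically 32 on NVIDIA GPUs, 64 on most AMD GPUs
    }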
src/runtime/module.cc

...
@@ -121,7 +121,7 @@ bool RuntimeEnabled(const std::string& target) {
  } else if (target.length() >= 5 && target.substr(0, 5) == "nvptx") {
    f_name = "device_api.cuda";
  } else if (target.length() >= 4 && target.substr(0, 4) == "rocm") {
-    f_name = "device_api.rocm";
+    f_name = "device_api.cuda";
  } else if (target.length() >= 4 && target.substr(0, 4) == "llvm") {
    const PackedFunc* pf = runtime::Registry::Get("codegen.llvm_target_enabled");
...