Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
dgl
Commits
6ac701f8
Commit
6ac701f8
authored
Sep 13, 2024
by
sangwzh
Browse files
update src and graphbolt code
parent
1547bd93
Changes
116
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
114 additions
and
89 deletions
+114
-89
src/array/libra_partition.cc
src/array/libra_partition.cc
+2
-1
src/array/selector.h
src/array/selector.h
+4
-3
src/array/uvm_array.cc
src/array/uvm_array.cc
+2
-1
src/geometry/cuda/edge_coarsening_impl.hip
src/geometry/cuda/edge_coarsening_impl.hip
+11
-9
src/geometry/cuda/geometry_op_impl.hip
src/geometry/cuda/geometry_op_impl.hip
+4
-2
src/geometry/geometry.cc
src/geometry/geometry.cc
+2
-1
src/graph/creators.cc
src/graph/creators.cc
+2
-1
src/graph/heterograph.cc
src/graph/heterograph.cc
+2
-1
src/graph/heterograph.h
src/graph/heterograph.h
+2
-1
src/graph/heterograph_capi.cc
src/graph/heterograph_capi.cc
+2
-1
src/graph/pickle.cc
src/graph/pickle.cc
+2
-1
src/graph/sampling/randomwalks/frequency_hashmap.cuh
src/graph/sampling/randomwalks/frequency_hashmap.cuh
+4
-2
src/graph/sampling/randomwalks/frequency_hashmap.hip
src/graph/sampling/randomwalks/frequency_hashmap.hip
+20
-18
src/graph/sampling/randomwalks/get_node_types_gpu.hip
src/graph/sampling/randomwalks/get_node_types_gpu.hip
+2
-1
src/graph/sampling/randomwalks/randomwalk_gpu.hip
src/graph/sampling/randomwalks/randomwalk_gpu.hip
+26
-24
src/graph/serialize/heterograph_serialize.cc
src/graph/serialize/heterograph_serialize.cc
+3
-2
src/graph/subgraph.cc
src/graph/subgraph.cc
+2
-1
src/graph/transform/cuda/cuda_compact_graph.hip
src/graph/transform/cuda/cuda_compact_graph.hip
+5
-4
src/graph/transform/cuda/cuda_map_edges.cuh
src/graph/transform/cuda/cuda_map_edges.cuh
+4
-3
src/graph/transform/cuda/cuda_to_block.hip
src/graph/transform/cuda/cuda_to_block.hip
+13
-12
No files found.
src/array/libra_partition.cc
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2021 Intel Corporation
*
...
...
@@ -21,7 +22,7 @@
#include <vector>
#include "../c_api_common.h"
#include "
./
check.h"
#include "check.h"
#include "kernel_decl.h"
using
namespace
dgl
::
runtime
;
...
...
src/array/selector.h
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file array/selector.h
...
...
@@ -12,13 +13,13 @@ namespace dgl {
namespace
{
#ifdef __
CUDA
CC__
#define DGLDEVICE __device__
#ifdef __
HIP
CC__
#define DGLDEVICE __device__
__host__
#define DGLINLINE __forceinline__
#else
#define DGLDEVICE
#define DGLINLINE inline
#endif // __
CUDA
CC__
#endif // __
HIP
CC__
}
// namespace
...
...
src/array/uvm_array.cc
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2019-2022 by Contributors
* @file array/uvm_array.cc
...
...
@@ -8,7 +9,7 @@
#include <sstream>
#include "../c_api_common.h"
#include "
./
uvm_array_op.h"
#include "uvm_array_op.h"
using
namespace
dgl
::
runtime
;
...
...
src/geometry/cuda/edge_coarsening_impl.
cu
→
src/geometry/cuda/edge_coarsening_impl.
hip
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2019 by Contributors
* @file geometry/cuda/edge_coarsening_impl.cu
* @brief Edge coarsening CUDA implementation
*/
#include <
cu
rand_kernel.h>
#include <
hiprand/hip
rand_kernel.h>
#include <dgl/array.h>
#include <dgl/random.h>
#include <dmlc/thread_local.h>
...
...
@@ -32,9 +34,9 @@ __global__ void generate_uniform_kernel(
float *ret_values, size_t num, uint64_t seed) {
size_t id = blockIdx.x * blockDim.x + threadIdx.x;
if (id < num) {
cu
randState
state
;
cu
rand_init
(
seed
,
id
,
0
,
&
state
);
ret_values
[
id
]
=
cu
rand_uniform
(
&
state
);
hip
randState
_t
state;
hip
rand_init(seed, id, 0, &state);
ret_values[id] =
hip
rand_uniform(&state);
}
}
...
...
@@ -116,7 +118,7 @@ __global__ void weighted_respond_kernel(
template <typename IdType>
bool Colorize(IdType *result_data, int64_t num_nodes, float *const prop) {
// initial done signal
cuda
Stream_t
stream
=
runtime
::
getCurrent
CUDA
Stream
();
hip
Stream_t stream = runtime::getCurrent
HIP
Stream
MasqueradingAsCUDA
();
CUDA_KERNEL_CALL(init_done_kernel, 1, 1, 0, stream);
// generate color prop for each node
...
...
@@ -132,8 +134,8 @@ bool Colorize(IdType *result_data, int64_t num_nodes, float *const prop) {
colorize_kernel, num_blocks, num_threads, 0, stream, prop, num_nodes,
result_data);
bool done_h = false;
CUDA_CALL
(
cuda
MemcpyFromSymbol
(
&
done_h
,
done_d
,
sizeof
(
done_h
),
0
,
cuda
MemcpyDeviceToHost
));
CUDA_CALL(
hip
MemcpyFromSymbol(
&done_h, done_d, sizeof(done_h), 0,
hip
MemcpyDeviceToHost));
return done_h;
}
...
...
@@ -155,7 +157,7 @@ bool Colorize(IdType *result_data, int64_t num_nodes, float *const prop) {
template <DGLDeviceType XPU, typename FloatType, typename IdType>
void WeightedNeighborMatching(
const aten::CSRMatrix &csr, const NDArray weight, IdArray result) {
cuda
Stream_t
stream
=
runtime
::
getCurrent
CUDA
Stream
();
hip
Stream_t stream = runtime::getCurrent
HIP
Stream
MasqueradingAsCUDA
();
const auto &ctx = result->ctx;
auto device = runtime::DeviceAPI::Get(ctx);
device->SetDevice(ctx);
...
...
@@ -216,7 +218,7 @@ void NeighborMatching(const aten::CSRMatrix &csr, IdArray result) {
device->SetDevice(ctx);
// generate random weights
cuda
Stream_t
stream
=
runtime
::
getCurrent
CUDA
Stream
();
hip
Stream_t stream = runtime::getCurrent
HIP
Stream
MasqueradingAsCUDA
();
NDArray weight = NDArray::Empty(
{num_edges}, DGLDataType{kDGLFloat, sizeof(float) * 8, 1}, ctx);
float *weight_data = static_cast<float *>(weight->data);
...
...
src/geometry/cuda/geometry_op_impl.
cu
→
src/geometry/cuda/geometry_op_impl.
hip
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2019 by Contributors
* @file geometry/cuda/geometry_op_impl.cc
...
...
@@ -95,7 +97,7 @@ template <DGLDeviceType XPU, typename FloatType, typename IdType>
void FarthestPointSampler(
NDArray array, int64_t batch_size, int64_t sample_points, NDArray dist,
IdArray start_idx, IdArray result) {
cuda
Stream_t
stream
=
runtime
::
getCurrent
CUDA
Stream
();
hip
Stream_t stream = runtime::getCurrent
HIP
Stream
MasqueradingAsCUDA
();
const FloatType* array_data = static_cast<FloatType*>(array->data);
...
...
@@ -110,7 +112,7 @@ void FarthestPointSampler(
// sample for each cloud in the batch
IdType* start_idx_data = static_cast<IdType*>(start_idx->data);
CUDA_CALL
(
cuda
SetDevice
(
array
->
ctx
.
device_id
));
CUDA_CALL(
hip
SetDevice(array->ctx.device_id));
CUDA_KERNEL_CALL(
fps_kernel, batch_size, THREADS, 0, stream, array_data, batch_size,
...
...
src/geometry/geometry.cc
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2019 by Contributors
* @file geometry/geometry.cc
...
...
@@ -10,7 +11,7 @@
#include "../array/check.h"
#include "../c_api_common.h"
#include "
./
geometry_op.h"
#include "geometry_op.h"
using
namespace
dgl
::
runtime
;
...
...
src/graph/creators.cc
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file graph/creators.cc
* @brief Functions for constructing graphs.
*/
#include "
./
heterograph.h"
#include "heterograph.h"
using
namespace
dgl
::
runtime
;
namespace
dgl
{
...
...
src/graph/heterograph.cc
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2019 by Contributors
* @file graph/heterograph.cc
* @brief Heterograph implementation
*/
#include "
./
heterograph.h"
#include "heterograph.h"
#include <dgl/array.h>
#include <dgl/graph_serializer.h>
...
...
src/graph/heterograph.h
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2019 by Contributors
* @file graph/heterograph.h
...
...
@@ -18,7 +19,7 @@
#include <utility>
#include <vector>
#include "
./
unit_graph.h"
#include "unit_graph.h"
#include "shared_mem_manager.h"
namespace
dgl
{
...
...
src/graph/heterograph_capi.cc
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file graph/heterograph_capi.cc
...
...
@@ -14,7 +15,7 @@
#include <set>
#include "../c_api_common.h"
#include "
./
heterograph.h"
#include "heterograph.h"
#include "unit_graph.h"
using
namespace
dgl
::
runtime
;
...
...
src/graph/pickle.cc
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file graph/pickle.cc
...
...
@@ -10,7 +11,7 @@
#include <dmlc/memory_io.h>
#include "../c_api_common.h"
#include "
./
heterograph.h"
#include "heterograph.h"
#include "unit_graph.h"
using
namespace
dgl
::
runtime
;
...
...
src/graph/sampling/randomwalks/frequency_hashmap.cuh
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2021 by Contributors
* @file graph/sampling/frequency_hashmap.cuh
...
...
@@ -56,7 +58,7 @@ class FrequencyHashmap {
FrequencyHashmap
()
=
delete
;
FrequencyHashmap
(
int64_t
num_dst
,
int64_t
num_items_each_dst
,
DGLContext
ctx
,
cuda
Stream_t
stream
,
int64_t
edge_table_scale
=
kDefaultEdgeTableScale
);
hip
Stream_t
stream
,
int64_t
edge_table_scale
=
kDefaultEdgeTableScale
);
~
FrequencyHashmap
();
using
EdgeItem
=
typename
DeviceEdgeHashmap
<
IdxType
>::
EdgeItem
;
std
::
tuple
<
IdArray
,
IdArray
,
IdArray
>
Topk
(
...
...
@@ -66,7 +68,7 @@ class FrequencyHashmap {
private:
DGLContext
_ctx
;
cuda
Stream_t
_stream
;
hip
Stream_t
_stream
;
DeviceEdgeHashmap
<
IdxType
>
*
_device_edge_hashmap
;
IdxType
*
_dst_unique_edges
;
EdgeItem
*
_edge_hashmap
;
...
...
src/graph/sampling/randomwalks/frequency_hashmap.
cu
→
src/graph/sampling/randomwalks/frequency_hashmap.
hip
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2021 by Contributors
* @file graph/sampling/frequency_hashmap.cu
...
...
@@ -5,7 +7,7 @@
*/
#include <algorithm>
#include <cub/cub.
cuh
> // NOLINT
#include <
hip
cub/
hip
cub.
hpp
> // NOLINT
#include <tuple>
#include <utility>
...
...
@@ -71,7 +73,7 @@ __global__ void _count_frequency(
}
}
using
BlockReduce
=
typename
cub
::
BlockReduce
<
IdxType
,
BLOCK_SIZE
>
;
using BlockReduce = typename
hip
cub::BlockReduce<IdxType, BLOCK_SIZE>;
__shared__ typename BlockReduce::TempStorage temp_space;
count = BlockReduce(temp_space).Sum(count);
...
...
@@ -112,7 +114,7 @@ __global__ void _compact_frequency(
int64_t last_idx = start_idx + TILE_SIZE;
const IdxType block_offset = edge_blocks_prefix[blockIdx.x];
using
BlockScan
=
typename
cub
::
BlockScan
<
IdxType
,
BLOCK_SIZE
>
;
using BlockScan = typename
hip
cub::BlockScan<IdxType, BLOCK_SIZE>;
__shared__ typename BlockScan::TempStorage temp_space;
BlockPrefixCallbackOp<IdxType> prefix_op(0);
...
...
@@ -246,7 +248,7 @@ inline __device__ IdxType DeviceEdgeHashmap<IdxType>::GetEdgeCount(
template <typename IdxType>
FrequencyHashmap<IdxType>::FrequencyHashmap(
int64_t num_dst, int64_t num_items_each_dst, DGLContext ctx,
cuda
Stream_t
stream
,
int64_t
edge_table_scale
)
{
hip
Stream_t stream, int64_t edge_table_scale) {
_ctx = ctx;
_stream = stream;
num_items_each_dst = _table_size(num_items_each_dst, edge_table_scale);
...
...
@@ -259,7 +261,7 @@ FrequencyHashmap<IdxType>::FrequencyHashmap(
constexpr int TILE_SIZE = BLOCK_SIZE * 8;
dim3 block(BLOCK_SIZE);
dim3 grid((num_dst * num_items_each_dst + TILE_SIZE - 1) / TILE_SIZE);
CUDA_CALL
(
cuda
Memset
(
dst_unique_edges
,
0
,
(
num_dst
)
*
sizeof
(
IdxType
)));
CUDA_CALL(
hip
Memset(dst_unique_edges, 0, (num_dst) * sizeof(IdxType)));
CUDA_KERNEL_CALL(
(_init_edge_table<IdxType, BLOCK_SIZE, TILE_SIZE>), grid, block, 0,
_stream, edge_hashmap, (num_dst * num_items_each_dst));
...
...
@@ -300,7 +302,7 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
// _edge_hashmap
bool *is_first_position = static_cast<bool *>(
device->AllocWorkspace(_ctx, sizeof(bool) * (num_edges)));
CUDA_CALL
(
cuda
Memset
(
is_first_position
,
0
,
sizeof
(
bool
)
*
(
num_edges
)));
CUDA_CALL(
hip
Memset(is_first_position, 0, sizeof(bool) * (num_edges)));
// double space to use ExclusiveSum
auto edge_blocks_prefix_data = static_cast<IdxType *>(device->AllocWorkspace(
_ctx, 2 * sizeof(IdxType) * (num_edge_blocks + 1)));
...
...
@@ -327,11 +329,11 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
// 2.1 ExclusiveSum the edge_blocks_prefix
void *d_temp_storage = nullptr;
size_t temp_storage_bytes = 0;
CUDA_CALL
(
cub
::
DeviceScan
::
ExclusiveSum
(
CUDA_CALL(
hip
cub::DeviceScan::ExclusiveSum(
d_temp_storage, temp_storage_bytes, edge_blocks_prefix,
edge_blocks_prefix_alternate, num_edge_blocks + 1, _stream));
d_temp_storage = device->AllocWorkspace(_ctx, temp_storage_bytes);
CUDA_CALL
(
cub
::
DeviceScan
::
ExclusiveSum
(
CUDA_CALL(
hip
cub::DeviceScan::ExclusiveSum(
d_temp_storage, temp_storage_bytes, edge_blocks_prefix,
edge_blocks_prefix_alternate, num_edge_blocks + 1, _stream));
device->FreeWorkspace(_ctx, d_temp_storage);
...
...
@@ -365,19 +367,19 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
// 3.1 ExclusiveSum the num_unique_each_node
d_temp_storage = nullptr;
temp_storage_bytes = 0;
CUDA_CALL
(
cub
::
DeviceScan
::
ExclusiveSum
(
CUDA_CALL(
hip
cub::DeviceScan::ExclusiveSum(
d_temp_storage, temp_storage_bytes, num_unique_each_node,
num_unique_each_node_alternate, num_dst_nodes + 1, _stream));
d_temp_storage = device->AllocWorkspace(_ctx, temp_storage_bytes);
CUDA_CALL
(
cub
::
DeviceScan
::
ExclusiveSum
(
CUDA_CALL(
hip
cub::DeviceScan::ExclusiveSum(
d_temp_storage, temp_storage_bytes, num_unique_each_node,
num_unique_each_node_alternate, num_dst_nodes + 1, _stream));
device->FreeWorkspace(_ctx, d_temp_storage);
// 3.2 SegmentedRadixSort the unique_src_edges and unique_frequency
// Create a set of DoubleBuffers to wrap pairs of device pointers
cub
::
DoubleBuffer
<
Idx64Type
>
d_unique_frequency
(
hip
cub::DoubleBuffer<Idx64Type> d_unique_frequency(
unique_frequency, unique_frequency_alternate);
cub
::
DoubleBuffer
<
IdxType
>
d_unique_src_edges
(
hip
cub::DoubleBuffer<IdxType> d_unique_src_edges(
unique_src_edges, unique_src_edges_alternate);
// Determine temporary device storage requirements
d_temp_storage = nullptr;
...
...
@@ -385,12 +387,12 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
// the DeviceRadixSort is faster than DeviceSegmentedRadixSort,
// especially when num_dst_nodes is large (about ~10000)
if (dtype.bits == 32) {
CUDA_CALL
(
cub
::
DeviceRadixSort
::
SortPairsDescending
(
CUDA_CALL(
hip
cub::DeviceRadixSort::SortPairsDescending(
d_temp_storage, temp_storage_bytes, d_unique_frequency,
d_unique_src_edges, num_unique_edges, 0, sizeof(Idx64Type) * 8,
_stream));
} else {
CUDA_CALL
(
cub
::
DeviceSegmentedRadixSort
::
SortPairsDescending
(
CUDA_CALL(
hip
cub::DeviceSegmentedRadixSort::SortPairsDescending(
d_temp_storage, temp_storage_bytes, d_unique_frequency,
d_unique_src_edges, num_unique_edges, num_dst_nodes,
num_unique_each_node_alternate, num_unique_each_node_alternate + 1, 0,
...
...
@@ -398,12 +400,12 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
}
d_temp_storage = device->AllocWorkspace(_ctx, temp_storage_bytes);
if (dtype.bits == 32) {
CUDA_CALL
(
cub
::
DeviceRadixSort
::
SortPairsDescending
(
CUDA_CALL(
hip
cub::DeviceRadixSort::SortPairsDescending(
d_temp_storage, temp_storage_bytes, d_unique_frequency,
d_unique_src_edges, num_unique_edges, 0, sizeof(Idx64Type) * 8,
_stream));
} else {
CUDA_CALL
(
cub
::
DeviceSegmentedRadixSort
::
SortPairsDescending
(
CUDA_CALL(
hip
cub::DeviceSegmentedRadixSort::SortPairsDescending(
d_temp_storage, temp_storage_bytes, d_unique_frequency,
d_unique_src_edges, num_unique_edges, num_dst_nodes,
num_unique_each_node_alternate, num_unique_each_node_alternate + 1, 0,
...
...
@@ -422,11 +424,11 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
// use unique_output_offsets;
d_temp_storage = nullptr;
temp_storage_bytes = 0;
CUDA_CALL
(
cub
::
DeviceScan
::
ExclusiveSum
(
CUDA_CALL(
hip
cub::DeviceScan::ExclusiveSum(
d_temp_storage, temp_storage_bytes, num_unique_each_node,
unique_output_offsets, num_dst_nodes + 1, _stream));
d_temp_storage = device->AllocWorkspace(_ctx, temp_storage_bytes);
CUDA_CALL
(
cub
::
DeviceScan
::
ExclusiveSum
(
CUDA_CALL(
hip
cub::DeviceScan::ExclusiveSum(
d_temp_storage, temp_storage_bytes, num_unique_each_node,
unique_output_offsets, num_dst_nodes + 1, _stream));
device->FreeWorkspace(_ctx, d_temp_storage);
...
...
src/graph/sampling/randomwalks/get_node_types_gpu.
cu
→
src/graph/sampling/randomwalks/get_node_types_gpu.
hip
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2021 by Contributors
* @file graph/sampling/get_node_types_gpu.cu
* @brief DGL sampler
*/
#include <
cuda
_runtime.h>
#include <
hip/hip
_runtime.h>
#include <dgl/array.h>
#include <dgl/base_heterograph.h>
#include <dgl/runtime/device_api.h>
...
...
src/graph/sampling/randomwalks/randomwalk_gpu.
cu
→
src/graph/sampling/randomwalks/randomwalk_gpu.
hip
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2021-2022 by Contributors
* @file graph/sampling/randomwalk_gpu.cu
* @brief CUDA random walk sampling
*/
#include <
cu
rand_kernel.h>
#include <
hiprand/hip
rand_kernel.h>
#include <dgl/array.h>
#include <dgl/base_heterograph.h>
#include <dgl/random.h>
#include <dgl/runtime/device_api.h>
#include <cub/cub.
cuh
>
#include <
hip
cub/
hip
cub.
hpp
>
#include <tuple>
#include <utility>
#include <vector>
...
...
@@ -48,10 +50,10 @@ __global__ void _RandomWalkKernel(
int64_t last_idx =
min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_seeds);
int64_t trace_length = (max_num_steps + 1);
cu
randState
rng
;
hip
randState
_t
rng;
// reference:
// https://docs.nvidia.com/cuda/
cu
rand/device-api-overview.html#performance-notes
cu
rand_init
(
rand_seed
+
idx
,
0
,
0
,
&
rng
);
// https://docs.nvidia.com/cuda/
hip
rand/device-api-overview.html#performance-notes
hip
rand_init(rand_seed + idx, 0, 0, &rng);
while (idx < last_idx) {
IdType curr = seed_data[idx];
...
...
@@ -68,18 +70,18 @@ __global__ void _RandomWalkKernel(
if (deg == 0) { // the degree is zero
break;
}
const
int64_t
num
=
cu
rand
(
&
rng
)
%
deg
;
const int64_t num =
hip
rand(&rng) % deg;
IdType pick = graph.in_cols[in_row_start + num];
IdType eid =
(graph.data ? graph.data[in_row_start + num] : in_row_start + num);
*traces_data_ptr = pick;
*eids_data_ptr = eid;
if ((restart_prob_size > 1) &&
(
cu
rand_uniform
(
&
rng
)
<
restart_prob_data
[
step_idx
]))
{
(
hip
rand_uniform(&rng) < restart_prob_data[step_idx])) {
break;
} else if (
(restart_prob_size == 1) &&
(
cu
rand_uniform
(
&
rng
)
<
restart_prob_data
[
0
]))
{
(
hip
rand_uniform(&rng) < restart_prob_data[0])) {
break;
}
++traces_data_ptr;
...
...
@@ -107,10 +109,10 @@ __global__ void _RandomWalkBiasedKernel(
int64_t last_idx =
min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_seeds);
int64_t trace_length = (max_num_steps + 1);
cu
randState
rng
;
hip
randState
_t
rng;
// reference:
// https://docs.nvidia.com/cuda/
cu
rand/device-api-overview.html#performance-notes
cu
rand_init
(
rand_seed
+
idx
,
0
,
0
,
&
rng
);
// https://docs.nvidia.com/cuda/
hip
rand/device-api-overview.html#performance-notes
hip
rand_init(rand_seed + idx, 0, 0, &rng);
while (idx < last_idx) {
IdType curr = seed_data[idx];
...
...
@@ -133,9 +135,9 @@ __global__ void _RandomWalkBiasedKernel(
const FloatType *prob = probs[metapath_id];
int64_t num;
if (prob == nullptr) {
num
=
cu
rand
(
&
rng
)
%
deg
;
num =
hip
rand(&rng) % deg;
} else {
auto
rnd_sum_w
=
prob_sum
[
curr
]
*
cu
rand_uniform
(
&
rng
);
auto rnd_sum_w = prob_sum[curr] *
hip
rand_uniform(&rng);
FloatType sum_w{0.};
for (num = 0; num < deg; ++num) {
sum_w += prob[in_row_start + num];
...
...
@@ -149,11 +151,11 @@ __global__ void _RandomWalkBiasedKernel(
*traces_data_ptr = pick;
*eids_data_ptr = eid;
if ((restart_prob_size > 1) &&
(
cu
rand_uniform
(
&
rng
)
<
restart_prob_data
[
step_idx
]))
{
(
hip
rand_uniform(&rng) < restart_prob_data[step_idx])) {
break;
} else if (
(restart_prob_size == 1) &&
(
cu
rand_uniform
(
&
rng
)
<
restart_prob_data
[
0
]))
{
(
hip
rand_uniform(&rng) < restart_prob_data[0])) {
break;
}
++traces_data_ptr;
...
...
@@ -202,7 +204,7 @@ std::pair<IdArray, IdArray> RandomWalkUniform(
: nullptr);
}
// use cuda stream from local thread
cuda
Stream_t
stream
=
runtime
::
getCurrent
CUDA
Stream
();
hip
Stream_t stream = runtime::getCurrent
HIP
Stream
MasqueradingAsCUDA
();
auto device = DeviceAPI::Get(ctx);
auto d_graphs = static_cast<GraphKernelData<IdType> *>(device->AllocWorkspace(
ctx, (num_etypes) * sizeof(GraphKernelData<IdType>)));
...
...
@@ -222,7 +224,7 @@ std::pair<IdArray, IdArray> RandomWalkUniform(
const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000);
ATEN_FLOAT_TYPE_SWITCH(
restart_prob->dtype, FloatType, "random walk GPU kernel", {
CHECK
(
restart_prob
->
ctx
.
device_type
==
kDGLCUDA
)
CHECK(restart_prob->ctx.device_type == kDGLCUDA
||restart_prob->ctx.device_type == kDGLROCM
)
<< "restart prob should be in GPU.";
CHECK(restart_prob->ndim == 1) << "restart prob dimension should be 1.";
const FloatType *restart_prob_data = restart_prob.Ptr<FloatType>();
...
...
@@ -263,7 +265,7 @@ std::pair<IdArray, IdArray> RandomWalkBiased(
IdType *traces_data = traces.Ptr<IdType>();
IdType *eids_data = eids.Ptr<IdType>();
cuda
Stream_t
stream
=
runtime
::
getCurrent
CUDA
Stream
();
hip
Stream_t stream = runtime::getCurrent
HIP
Stream
MasqueradingAsCUDA
();
auto device = DeviceAPI::Get(ctx);
// new probs and prob sums pointers
assert(num_etypes == static_cast<int64_t>(prob.size()));
...
...
@@ -297,11 +299,11 @@ std::pair<IdArray, IdArray> RandomWalkBiased(
// calculate the sum of the neighbor weights
const IdType *d_offsets = static_cast<const IdType *>(csr.indptr->data);
size_t temp_storage_size = 0;
CUDA_CALL
(
cub
::
DeviceSegmentedReduce
::
Sum
(
CUDA_CALL(
hip
cub::DeviceSegmentedReduce::Sum(
nullptr, temp_storage_size, probs[etype], prob_sums[etype],
num_segments, d_offsets, d_offsets + 1, stream));
void *temp_storage = device->AllocWorkspace(ctx, temp_storage_size);
CUDA_CALL
(
cub
::
DeviceSegmentedReduce
::
Sum
(
CUDA_CALL(
hip
cub::DeviceSegmentedReduce::Sum(
temp_storage, temp_storage_size, probs[etype], prob_sums[etype],
num_segments, d_offsets, d_offsets + 1, stream));
device->FreeWorkspace(ctx, temp_storage);
...
...
@@ -335,7 +337,7 @@ std::pair<IdArray, IdArray> RandomWalkBiased(
dim3 block(256);
dim3 grid((num_seeds + TILE_SIZE - 1) / TILE_SIZE);
const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000);
CHECK
(
restart_prob
->
ctx
.
device_type
==
kDGLCUDA
)
CHECK(restart_prob->ctx.device_type == kDGLCUDA
||restart_prob->ctx.device_type == kDGLROCM
)
<< "restart prob should be in GPU.";
CHECK(restart_prob->ndim == 1) << "restart prob dimension should be 1.";
const FloatType *restart_prob_data = restart_prob.Ptr<FloatType>();
...
...
@@ -396,7 +398,7 @@ std::pair<IdArray, IdArray> RandomWalkWithRestart(
auto device = dgl::runtime::DeviceAPI::Get(device_ctx);
// use cuda stream from local thread
cuda
Stream_t
stream
=
runtime
::
getCurrent
CUDA
Stream
();
hip
Stream_t stream = runtime::getCurrent
HIP
Stream
MasqueradingAsCUDA
();
device->CopyDataFromTo(
&restart_prob, 0, restart_prob_array.Ptr<double>(), 0, sizeof(double),
DGLContext{kDGLCPU, 0}, device_ctx, restart_prob_array->dtype);
...
...
@@ -443,13 +445,13 @@ template <DGLDeviceType XPU, typename IdxType>
std::tuple<IdArray, IdArray, IdArray> SelectPinSageNeighbors(
const IdArray src, const IdArray dst, const int64_t num_samples_per_node,
const int64_t k) {
CHECK
(
src
->
ctx
.
device_type
==
kDGLCUDA
)
<<
"IdArray needs be on GPU!"
;
CHECK(src->ctx.device_type == kDGLCUDA
|| src->ctx.device_type == kDGLROCM
) << "IdArray needs be on GPU!";
const IdxType *src_data = src.Ptr<IdxType>();
const IdxType *dst_data = dst.Ptr<IdxType>();
const int64_t num_dst_nodes = (dst->shape[0] / num_samples_per_node);
auto ctx = src->ctx;
// use cuda stream from local thread
cuda
Stream_t
stream
=
runtime
::
getCurrent
CUDA
Stream
();
hip
Stream_t stream = runtime::getCurrent
HIP
Stream
MasqueradingAsCUDA
();
auto frequency_hashmap = FrequencyHashmap<IdxType>(
num_dst_nodes, num_samples_per_node, ctx, stream);
auto ret = frequency_hashmap.Topk(
...
...
src/graph/serialize/heterograph_serialize.cc
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2019 by Contributors
* @file graph/serialize/heterograph_serialize.cc
...
...
@@ -48,8 +49,8 @@
#include <vector>
#include "../heterograph.h"
#include "
./
dglstream.h"
#include "
./
graph_serialize.h"
#include "dglstream.h"
#include "graph_serialize.h"
#include "dmlc/memory_io.h"
namespace
dgl
{
...
...
src/graph/subgraph.cc
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file graph/subgraph.cc
* @brief Functions for extracting subgraphs.
*/
#include "
./
heterograph.h"
#include "heterograph.h"
using
namespace
dgl
::
runtime
;
namespace
dgl
{
...
...
src/graph/transform/cuda/cuda_compact_graph.
cu
→
src/graph/transform/cuda/cuda_compact_graph.
hip
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright 2021 Contributors
*
...
...
@@ -18,7 +19,7 @@
* all given graphs with the same set of nodes.
*/
#include <
cuda
_runtime.h>
#include <
hip/hip
_runtime.h>
#include <dgl/immutable_graph.h>
#include <dgl/runtime/device_api.h>
...
...
@@ -55,10 +56,10 @@ template <typename IdType>
void BuildNodeMaps(
const std::vector<IdArray> &input_nodes,
DeviceNodeMap<IdType> *const node_maps, int64_t *const count_unique_device,
std
::
vector
<
IdArray
>
*
const
unique_nodes_device
,
cuda
Stream_t
stream
)
{
std::vector<IdArray> *const unique_nodes_device,
hip
Stream_t stream) {
const int64_t num_ntypes = static_cast<int64_t>(input_nodes.size());
CUDA_CALL
(
cuda
MemsetAsync
(
CUDA_CALL(
hip
MemsetAsync(
count_unique_device, 0, num_ntypes * sizeof(*count_unique_device),
stream));
...
...
@@ -81,7 +82,7 @@ std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>> CompactGraphsGPU(
const std::vector<IdArray> &always_preserve) {
const auto &ctx = graphs[0]->Context();
auto device = runtime::DeviceAPI::Get(ctx);
cuda
Stream_t
stream
=
runtime
::
getCurrent
CUDA
Stream
();
hip
Stream_t stream = runtime::getCurrent
HIP
Stream
MasqueradingAsCUDA
();
CHECK_EQ(ctx.device_type, kDGLCUDA);
...
...
src/graph/transform/cuda/cuda_map_edges.cuh
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright 2020-2022 Contributors
*
...
...
@@ -22,7 +23,7 @@
#include <dgl/runtime/c_runtime_api.h>
#include <dgl/base_heterograph.h>
#include <
cuda
_runtime.h>
#include <
hip/hip
_runtime.h>
#include <dgl/runtime/c_runtime_api.h>
#include <algorithm>
...
...
@@ -113,7 +114,7 @@ class DeviceNodeMap {
DeviceNodeMap
(
const
std
::
vector
<
int64_t
>&
num_nodes
,
const
int64_t
offset
,
DGLContext
ctx
,
cuda
Stream_t
stream
)
DGLContext
ctx
,
hip
Stream_t
stream
)
:
num_types_
(
num_nodes
.
size
()),
rhs_offset_
(
offset
),
hash_tables_
(),
...
...
@@ -185,7 +186,7 @@ inline IdType RoundUp(const IdType num, const size_t unit) {
template
<
typename
IdType
>
std
::
tuple
<
std
::
vector
<
IdArray
>
,
std
::
vector
<
IdArray
>>
MapEdges
(
HeteroGraphPtr
graph
,
const
std
::
vector
<
EdgeArray
>&
edge_sets
,
const
DeviceNodeMap
<
IdType
>&
node_map
,
cuda
Stream_t
stream
)
{
const
DeviceNodeMap
<
IdType
>&
node_map
,
hip
Stream_t
stream
)
{
constexpr
const
int
BLOCK_SIZE
=
128
;
constexpr
const
size_t
TILE_SIZE
=
1024
;
...
...
src/graph/transform/cuda/cuda_to_block.
cu
→
src/graph/transform/cuda/cuda_to_block.
hip
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright 2020-2021 Contributors
*
...
...
@@ -20,7 +21,7 @@
* Tested via python wrapper: python/dgl/path/to/to_block.py
*/
#include <
cuda
_runtime.h>
#include <
hip/hip
_runtime.h>
#include <dgl/immutable_graph.h>
#include <dgl/runtime/device_api.h>
#include <dgl/runtime/tensordispatch.h>
...
...
@@ -69,10 +70,10 @@ class DeviceNodeMapMaker {
const std::vector<IdArray>& lhs_nodes,
const std::vector<IdArray>& rhs_nodes,
DeviceNodeMap<IdType>* const node_maps, int64_t* const count_lhs_device,
std
::
vector
<
IdArray
>*
const
lhs_device
,
cuda
Stream_t
stream
)
{
std::vector<IdArray>* const lhs_device,
hip
Stream_t stream) {
const int64_t num_ntypes = lhs_nodes.size() + rhs_nodes.size();
CUDA_CALL
(
cuda
MemsetAsync
(
CUDA_CALL(
hip
MemsetAsync(
count_lhs_device, 0, num_ntypes * sizeof(*count_lhs_device), stream));
// possibly duplicate lhs nodes
...
...
@@ -112,7 +113,7 @@ class DeviceNodeMapMaker {
void Make(
const std::vector<IdArray>& lhs_nodes,
const std::vector<IdArray>& rhs_nodes,
DeviceNodeMap
<
IdType
>*
const
node_maps
,
cuda
Stream_t
stream
)
{
DeviceNodeMap<IdType>* const node_maps,
hip
Stream_t stream) {
const int64_t num_ntypes = lhs_nodes.size() + rhs_nodes.size();
// unique lhs nodes
...
...
@@ -155,7 +156,7 @@ struct CUDAIdsMapper {
std::vector<int64_t>& num_nodes_per_type = *num_nodes_per_type_ptr;
const bool generate_lhs_nodes = lhs_nodes.empty();
auto device = runtime::DeviceAPI::Get(ctx);
cuda
Stream_t
stream
=
runtime
::
getCurrent
CUDA
Stream
();
hip
Stream_t stream = runtime::getCurrent
HIP
Stream
MasqueradingAsCUDA
();
// Allocate space for map creation process.
DeviceNodeMapMaker<IdType> maker(maxNodesPerType);
...
...
@@ -168,7 +169,7 @@ struct CUDAIdsMapper {
}
}
cuda
Event_t
copyEvent
;
hip
Event_t copyEvent;
NDArray new_len_tensor;
// Populate the mappings.
if (generate_lhs_nodes) {
...
...
@@ -179,7 +180,7 @@ struct CUDAIdsMapper {
src_nodes, rhs_nodes, &node_maps, count_lhs_device, &lhs_nodes,
stream);
CUDA_CALL
(
cuda
EventCreate
(
&
copyEvent
));
CUDA_CALL(
hip
EventCreate(&copyEvent));
if (TensorDispatcher::Global()->IsAvailable()) {
new_len_tensor = NDArray::PinnedEmpty(
{num_ntypes}, DGLDataTypeTraits<int64_t>::dtype,
...
...
@@ -190,11 +191,11 @@ struct CUDAIdsMapper {
{num_ntypes}, DGLDataTypeTraits<int64_t>::dtype,
DGLContext{kDGLCPU, 0});
}
CUDA_CALL
(
cuda
MemcpyAsync
(
CUDA_CALL(
hip
MemcpyAsync(
new_len_tensor->data, count_lhs_device,
sizeof(*num_nodes_per_type.data()) * num_ntypes,
cuda
MemcpyDeviceToHost
,
stream
));
CUDA_CALL
(
cuda
EventRecord
(
copyEvent
,
stream
));
hip
MemcpyDeviceToHost, stream));
CUDA_CALL(
hip
EventRecord(copyEvent, stream));
device->FreeWorkspace(ctx, count_lhs_device);
} else {
...
...
@@ -209,8 +210,8 @@ struct CUDAIdsMapper {
if (generate_lhs_nodes) {
// wait for the previous copy
CUDA_CALL
(
cuda
EventSynchronize
(
copyEvent
));
CUDA_CALL
(
cuda
EventDestroy
(
copyEvent
));
CUDA_CALL(
hip
EventSynchronize(copyEvent));
CUDA_CALL(
hip
EventDestroy(copyEvent));
// Resize lhs nodes.
for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) {
...
...
Prev
1
2
3
4
5
6
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment