OpenDAS / dgl / Commits / 6ac701f8
"docs/vscode:/vscode.git/clone" did not exist on "7ac6e286ee994270e737b70c904ea50049d53567"
Commit 6ac701f8, authored Sep 13, 2024 by sangwzh
update src and graphbolt code
parent 1547bd93
Changes 116
Showing 20 changed files with 114 additions and 89 deletions (+114 -89)
src/array/libra_partition.cc  +2 -1
src/array/selector.h  +4 -3
src/array/uvm_array.cc  +2 -1
src/geometry/cuda/edge_coarsening_impl.hip  +11 -9
src/geometry/cuda/geometry_op_impl.hip  +4 -2
src/geometry/geometry.cc  +2 -1
src/graph/creators.cc  +2 -1
src/graph/heterograph.cc  +2 -1
src/graph/heterograph.h  +2 -1
src/graph/heterograph_capi.cc  +2 -1
src/graph/pickle.cc  +2 -1
src/graph/sampling/randomwalks/frequency_hashmap.cuh  +4 -2
src/graph/sampling/randomwalks/frequency_hashmap.hip  +20 -18
src/graph/sampling/randomwalks/get_node_types_gpu.hip  +2 -1
src/graph/sampling/randomwalks/randomwalk_gpu.hip  +26 -24
src/graph/serialize/heterograph_serialize.cc  +3 -2
src/graph/subgraph.cc  +2 -1
src/graph/transform/cuda/cuda_compact_graph.hip  +5 -4
src/graph/transform/cuda/cuda_map_edges.cuh  +4 -3
src/graph/transform/cuda/cuda_to_block.hip  +13 -12
src/array/libra_partition.cc
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2021 Intel Corporation
  *
...
@@ -21,7 +22,7 @@
 #include <vector>
 #include "../c_api_common.h"
-#include "./check.h"
+#include "check.h"
 #include "kernel_decl.h"
 using namespace dgl::runtime;
...
src/array/selector.h
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2020 by Contributors
  * @file array/selector.h
...
@@ -12,13 +13,13 @@ namespace dgl {
 namespace {
-#ifdef __CUDACC__
-#define DGLDEVICE __device__
+#ifdef __HIPCC__
+#define DGLDEVICE __device__ __host__
 #define DGLINLINE __forceinline__
 #else
 #define DGLDEVICE
 #define DGLINLINE inline
-#endif  // __CUDACC__
+#endif  // __HIPCC__
 }  // namespace
...
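Note: the selector.h hunk swaps the nvcc guard __CUDACC__ for the hipcc guard __HIPCC__ and additionally marks DGLDEVICE as __device__ __host__ so the selectors stay callable from host code. As a hedged illustration of how such a guard degrades gracefully under either GPU compiler or a plain host build (the DEMO_* names below are made up and are not DGL's):

// guard_demo.h -- illustrative only; mirrors the DGLDEVICE/DGLINLINE pattern.
#if defined(__HIPCC__) || defined(__CUDACC__)
#define DEMO_DEVICE __device__ __host__   // callable from device and host code
#define DEMO_INLINE __forceinline__
#else
#define DEMO_DEVICE                       // host-only build: decorations vanish
#define DEMO_INLINE inline
#endif

// Usable from kernels and from ordinary host code alike.
DEMO_DEVICE DEMO_INLINE int DemoMin(int a, int b) { return a < b ? a : b; }

Checking both macros keeps one header compilable with hipcc, nvcc, and a host-only compiler.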
src/array/uvm_array.cc
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2019-2022 by Contributors
  * @file array/uvm_array.cc
...
@@ -8,7 +9,7 @@
 #include <sstream>
 #include "../c_api_common.h"
-#include "./uvm_array_op.h"
+#include "uvm_array_op.h"
 using namespace dgl::runtime;
...
src/geometry/cuda/edge_coarsening_impl.cu → src/geometry/cuda/edge_coarsening_impl.hip
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
  * Copyright (c) 2019 by Contributors
  * @file geometry/cuda/edge_coarsening_impl.cu
  * @brief Edge coarsening CUDA implementation
  */
-#include <curand_kernel.h>
+#include <hiprand/hiprand_kernel.h>
 #include <dgl/array.h>
 #include <dgl/random.h>
 #include <dmlc/thread_local.h>
...
@@ -32,9 +34,9 @@ __global__ void generate_uniform_kernel(
     float *ret_values, size_t num, uint64_t seed) {
   size_t id = blockIdx.x * blockDim.x + threadIdx.x;
   if (id < num) {
-    curandState state;
-    curand_init(seed, id, 0, &state);
-    ret_values[id] = curand_uniform(&state);
+    hiprandState_t state;
+    hiprand_init(seed, id, 0, &state);
+    ret_values[id] = hiprand_uniform(&state);
   }
 }
...
@@ -116,7 +118,7 @@ __global__ void weighted_respond_kernel(
 template <typename IdType>
 bool Colorize(IdType *result_data, int64_t num_nodes, float *const prop) {
   // initial done signal
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   CUDA_KERNEL_CALL(init_done_kernel, 1, 1, 0, stream);
   // generate color prop for each node
...
@@ -132,8 +134,8 @@ bool Colorize(IdType *result_data, int64_t num_nodes, float *const prop) {
       colorize_kernel, num_blocks, num_threads, 0, stream, prop, num_nodes,
       result_data);
   bool done_h = false;
-  CUDA_CALL(cudaMemcpyFromSymbol(
-      &done_h, done_d, sizeof(done_h), 0, cudaMemcpyDeviceToHost));
+  CUDA_CALL(hipMemcpyFromSymbol(
+      &done_h, done_d, sizeof(done_h), 0, hipMemcpyDeviceToHost));
   return done_h;
 }
...
@@ -155,7 +157,7 @@ bool Colorize(IdType *result_data, int64_t num_nodes, float *const prop) {
 template <DGLDeviceType XPU, typename FloatType, typename IdType>
 void WeightedNeighborMatching(
     const aten::CSRMatrix &csr, const NDArray weight, IdArray result) {
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   const auto &ctx = result->ctx;
   auto device = runtime::DeviceAPI::Get(ctx);
   device->SetDevice(ctx);
...
@@ -216,7 +218,7 @@ void NeighborMatching(const aten::CSRMatrix &csr, IdArray result) {
   device->SetDevice(ctx);
   // generate random weights
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   NDArray weight = NDArray::Empty(
       {num_edges}, DGLDataType{kDGLFloat, sizeof(float) * 8, 1}, ctx);
   float *weight_data = static_cast<float *>(weight->data);
...
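Note: the kernel change above is the standard hipify mapping for device-side RNG: curandState/curand_init/curand_uniform become hiprandState_t/hiprand_init/hiprand_uniform with the same argument order. A minimal, self-contained sketch of the same per-thread RNG idiom (file and kernel names are illustrative only; error checking is omitted):

// hiprand_demo.hip -- compile with: hipcc hiprand_demo.hip
#include <hip/hip_runtime.h>
#include <hiprand/hiprand_kernel.h>
#include <cstdint>
#include <cstdio>

__global__ void fill_uniform(float* out, size_t n, uint64_t seed) {
  size_t id = blockIdx.x * blockDim.x + threadIdx.x;
  if (id < n) {
    hiprandState_t state;                  // one RNG state per thread
    hiprand_init(seed, id, 0, &state);     // seed, sequence, offset
    out[id] = hiprand_uniform(&state);     // uniform float in (0, 1]
  }
}

int main() {
  const size_t n = 256;
  float* d_out = nullptr;
  hipMalloc((void**)&d_out, n * sizeof(float));
  fill_uniform<<<1, 256>>>(d_out, n, /*seed=*/42);
  float h_out[256];
  hipMemcpy(h_out, d_out, n * sizeof(float), hipMemcpyDeviceToHost);
  printf("first value: %f\n", h_out[0]);
  hipFree(d_out);
  return 0;
}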
src/geometry/cuda/geometry_op_impl.cu → src/geometry/cuda/geometry_op_impl.hip
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
  * Copyright (c) 2019 by Contributors
  * @file geometry/cuda/geometry_op_impl.cc
...
@@ -95,7 +97,7 @@ template <DGLDeviceType XPU, typename FloatType, typename IdType>
 void FarthestPointSampler(
     NDArray array, int64_t batch_size, int64_t sample_points, NDArray dist,
     IdArray start_idx, IdArray result) {
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   const FloatType* array_data = static_cast<FloatType*>(array->data);
...
@@ -110,7 +112,7 @@ void FarthestPointSampler(
   // sample for each cloud in the batch
   IdType* start_idx_data = static_cast<IdType*>(start_idx->data);
-  CUDA_CALL(cudaSetDevice(array->ctx.device_id));
+  CUDA_CALL(hipSetDevice(array->ctx.device_id));
   CUDA_KERNEL_CALL(
       fps_kernel, batch_size, THREADS, 0, stream, array_data, batch_size,
...
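Note: across these files the thread-local stream lookup becomes runtime::getCurrentHIPStreamMasqueradingAsCUDA(); that name is what the PyTorch-style hipify pass emits, and the surrounding call sites are untouched. Outside DGL's runtime, the equivalent plumbing with an explicit hipStream_t looks roughly like this (hedged sketch; the kernel and variable names are illustrative):

// stream_demo.hip -- compile with: hipcc stream_demo.hip
#include <hip/hip_runtime.h>
#include <cstdio>

__global__ void scale(float* data, float factor, size_t n) {
  size_t id = blockIdx.x * blockDim.x + threadIdx.x;
  if (id < n) data[id] *= factor;
}

int main() {
  const size_t n = 1024;
  float* d_data = nullptr;
  hipStream_t stream;
  hipStreamCreate(&stream);        // DGL instead asks its runtime for the thread-local stream
  hipMalloc((void**)&d_data, n * sizeof(float));
  hipMemsetAsync(d_data, 0, n * sizeof(float), stream);           // async work on the stream
  hipLaunchKernelGGL(scale, dim3(4), dim3(256), 0, stream, d_data, 2.0f, n);
  hipStreamSynchronize(stream);    // wait for all async work queued on this stream
  hipFree(d_data);
  hipStreamDestroy(stream);
  printf("done\n");
  return 0;
}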
src/geometry/geometry.cc
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2019 by Contributors
  * @file geometry/geometry.cc
...
@@ -10,7 +11,7 @@
 #include "../array/check.h"
 #include "../c_api_common.h"
-#include "./geometry_op.h"
+#include "geometry_op.h"
 using namespace dgl::runtime;
...
src/graph/creators.cc
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2020 by Contributors
  * @file graph/creators.cc
  * @brief Functions for constructing graphs.
  */
-#include "./heterograph.h"
+#include "heterograph.h"
 using namespace dgl::runtime;
 namespace dgl {
...
src/graph/heterograph.cc
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2019 by Contributors
  * @file graph/heterograph.cc
  * @brief Heterograph implementation
  */
-#include "./heterograph.h"
+#include "heterograph.h"
 #include <dgl/array.h>
 #include <dgl/graph_serializer.h>
...
src/graph/heterograph.h
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2019 by Contributors
  * @file graph/heterograph.h
...
@@ -18,7 +19,7 @@
 #include <utility>
 #include <vector>
-#include "./unit_graph.h"
+#include "unit_graph.h"
 #include "shared_mem_manager.h"
 namespace dgl {
...
src/graph/heterograph_capi.cc
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2020 by Contributors
  * @file graph/heterograph_capi.cc
...
@@ -14,7 +15,7 @@
 #include <set>
 #include "../c_api_common.h"
-#include "./heterograph.h"
+#include "heterograph.h"
 #include "unit_graph.h"
 using namespace dgl::runtime;
...
src/graph/pickle.cc
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2020 by Contributors
  * @file graph/pickle.cc
...
@@ -10,7 +11,7 @@
 #include <dmlc/memory_io.h>
 #include "../c_api_common.h"
-#include "./heterograph.h"
+#include "heterograph.h"
 #include "unit_graph.h"
 using namespace dgl::runtime;
...
src/graph/sampling/randomwalks/frequency_hashmap.cuh
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
  * Copyright (c) 2021 by Contributors
  * @file graph/sampling/frequency_hashmap.cuh
...
@@ -56,7 +58,7 @@ class FrequencyHashmap {
   FrequencyHashmap() = delete;
   FrequencyHashmap(
       int64_t num_dst, int64_t num_items_each_dst, DGLContext ctx,
-      cudaStream_t stream, int64_t edge_table_scale = kDefaultEdgeTableScale);
+      hipStream_t stream, int64_t edge_table_scale = kDefaultEdgeTableScale);
   ~FrequencyHashmap();
   using EdgeItem = typename DeviceEdgeHashmap<IdxType>::EdgeItem;
   std::tuple<IdArray, IdArray, IdArray> Topk(
...
@@ -66,7 +68,7 @@ class FrequencyHashmap {
  private:
   DGLContext _ctx;
-  cudaStream_t _stream;
+  hipStream_t _stream;
   DeviceEdgeHashmap<IdxType> *_device_edge_hashmap;
   IdxType *_dst_unique_edges;
   EdgeItem *_edge_hashmap;
...
src/graph/sampling/randomwalks/frequency_hashmap.cu → src/graph/sampling/randomwalks/frequency_hashmap.hip
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
  * Copyright (c) 2021 by Contributors
  * @file graph/sampling/frequency_hashmap.cu
...
@@ -5,7 +7,7 @@
  */
 #include <algorithm>
-#include <cub/cub.cuh>  // NOLINT
+#include <hipcub/hipcub.hpp>  // NOLINT
 #include <tuple>
 #include <utility>
...
@@ -71,7 +73,7 @@ __global__ void _count_frequency(
     }
   }
-  using BlockReduce = typename cub::BlockReduce<IdxType, BLOCK_SIZE>;
+  using BlockReduce = typename hipcub::BlockReduce<IdxType, BLOCK_SIZE>;
   __shared__ typename BlockReduce::TempStorage temp_space;
   count = BlockReduce(temp_space).Sum(count);
...
@@ -112,7 +114,7 @@ __global__ void _compact_frequency(
   int64_t last_idx = start_idx + TILE_SIZE;
   const IdxType block_offset = edge_blocks_prefix[blockIdx.x];
-  using BlockScan = typename cub::BlockScan<IdxType, BLOCK_SIZE>;
+  using BlockScan = typename hipcub::BlockScan<IdxType, BLOCK_SIZE>;
   __shared__ typename BlockScan::TempStorage temp_space;
   BlockPrefixCallbackOp<IdxType> prefix_op(0);
...
@@ -246,7 +248,7 @@ inline __device__ IdxType DeviceEdgeHashmap<IdxType>::GetEdgeCount(
 template <typename IdxType>
 FrequencyHashmap<IdxType>::FrequencyHashmap(
     int64_t num_dst, int64_t num_items_each_dst, DGLContext ctx,
-    cudaStream_t stream, int64_t edge_table_scale) {
+    hipStream_t stream, int64_t edge_table_scale) {
   _ctx = ctx;
   _stream = stream;
   num_items_each_dst = _table_size(num_items_each_dst, edge_table_scale);
...
@@ -259,7 +261,7 @@ FrequencyHashmap<IdxType>::FrequencyHashmap(
   constexpr int TILE_SIZE = BLOCK_SIZE * 8;
   dim3 block(BLOCK_SIZE);
   dim3 grid((num_dst * num_items_each_dst + TILE_SIZE - 1) / TILE_SIZE);
-  CUDA_CALL(cudaMemset(dst_unique_edges, 0, (num_dst) * sizeof(IdxType)));
+  CUDA_CALL(hipMemset(dst_unique_edges, 0, (num_dst) * sizeof(IdxType)));
   CUDA_KERNEL_CALL(
       (_init_edge_table<IdxType, BLOCK_SIZE, TILE_SIZE>), grid, block, 0,
       _stream, edge_hashmap, (num_dst * num_items_each_dst));
...
@@ -300,7 +302,7 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
   // _edge_hashmap
   bool *is_first_position = static_cast<bool *>(
       device->AllocWorkspace(_ctx, sizeof(bool) * (num_edges)));
-  CUDA_CALL(cudaMemset(is_first_position, 0, sizeof(bool) * (num_edges)));
+  CUDA_CALL(hipMemset(is_first_position, 0, sizeof(bool) * (num_edges)));
   // double space to use ExclusiveSum
   auto edge_blocks_prefix_data = static_cast<IdxType *>(device->AllocWorkspace(
       _ctx, 2 * sizeof(IdxType) * (num_edge_blocks + 1)));
...
@@ -327,11 +329,11 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
   // 2.1 ExclusiveSum the edge_blocks_prefix
   void *d_temp_storage = nullptr;
   size_t temp_storage_bytes = 0;
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
       d_temp_storage, temp_storage_bytes, edge_blocks_prefix,
       edge_blocks_prefix_alternate, num_edge_blocks + 1, _stream));
   d_temp_storage = device->AllocWorkspace(_ctx, temp_storage_bytes);
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
       d_temp_storage, temp_storage_bytes, edge_blocks_prefix,
       edge_blocks_prefix_alternate, num_edge_blocks + 1, _stream));
   device->FreeWorkspace(_ctx, d_temp_storage);
...
@@ -365,19 +367,19 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
   // 3.1 ExclusiveSum the num_unique_each_node
   d_temp_storage = nullptr;
   temp_storage_bytes = 0;
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
       d_temp_storage, temp_storage_bytes, num_unique_each_node,
       num_unique_each_node_alternate, num_dst_nodes + 1, _stream));
   d_temp_storage = device->AllocWorkspace(_ctx, temp_storage_bytes);
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
       d_temp_storage, temp_storage_bytes, num_unique_each_node,
       num_unique_each_node_alternate, num_dst_nodes + 1, _stream));
   device->FreeWorkspace(_ctx, d_temp_storage);
   // 3.2 SegmentedRadixSort the unique_src_edges and unique_frequency
   // Create a set of DoubleBuffers to wrap pairs of device pointers
-  cub::DoubleBuffer<Idx64Type> d_unique_frequency(
+  hipcub::DoubleBuffer<Idx64Type> d_unique_frequency(
       unique_frequency, unique_frequency_alternate);
-  cub::DoubleBuffer<IdxType> d_unique_src_edges(
+  hipcub::DoubleBuffer<IdxType> d_unique_src_edges(
       unique_src_edges, unique_src_edges_alternate);
   // Determine temporary device storage requirements
   d_temp_storage = nullptr;
...
@@ -385,12 +387,12 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
   // the DeviceRadixSort is faster than DeviceSegmentedRadixSort,
   // especially when num_dst_nodes is large (about ~10000)
   if (dtype.bits == 32) {
-    CUDA_CALL(cub::DeviceRadixSort::SortPairsDescending(
+    CUDA_CALL(hipcub::DeviceRadixSort::SortPairsDescending(
         d_temp_storage, temp_storage_bytes, d_unique_frequency,
         d_unique_src_edges, num_unique_edges, 0, sizeof(Idx64Type) * 8,
         _stream));
   } else {
-    CUDA_CALL(cub::DeviceSegmentedRadixSort::SortPairsDescending(
+    CUDA_CALL(hipcub::DeviceSegmentedRadixSort::SortPairsDescending(
         d_temp_storage, temp_storage_bytes, d_unique_frequency,
         d_unique_src_edges, num_unique_edges, num_dst_nodes,
         num_unique_each_node_alternate, num_unique_each_node_alternate + 1, 0,
...
@@ -398,12 +400,12 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
   }
   d_temp_storage = device->AllocWorkspace(_ctx, temp_storage_bytes);
   if (dtype.bits == 32) {
-    CUDA_CALL(cub::DeviceRadixSort::SortPairsDescending(
+    CUDA_CALL(hipcub::DeviceRadixSort::SortPairsDescending(
        d_temp_storage, temp_storage_bytes, d_unique_frequency,
        d_unique_src_edges, num_unique_edges, 0, sizeof(Idx64Type) * 8,
        _stream));
   } else {
-    CUDA_CALL(cub::DeviceSegmentedRadixSort::SortPairsDescending(
+    CUDA_CALL(hipcub::DeviceSegmentedRadixSort::SortPairsDescending(
        d_temp_storage, temp_storage_bytes, d_unique_frequency,
        d_unique_src_edges, num_unique_edges, num_dst_nodes,
        num_unique_each_node_alternate, num_unique_each_node_alternate + 1, 0,
...
@@ -422,11 +424,11 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
   // use unique_output_offsets;
   d_temp_storage = nullptr;
   temp_storage_bytes = 0;
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
      d_temp_storage, temp_storage_bytes, num_unique_each_node,
      unique_output_offsets, num_dst_nodes + 1, _stream));
   d_temp_storage = device->AllocWorkspace(_ctx, temp_storage_bytes);
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
      d_temp_storage, temp_storage_bytes, num_unique_each_node,
      unique_output_offsets, num_dst_nodes + 1, _stream));
   device->FreeWorkspace(_ctx, d_temp_storage);
...
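Note: every cub:: call above keeps its signature under hipcub::, including the two-phase convention where the first call with a null d_temp_storage only reports the required scratch size and the second call does the actual work. A small hedged sketch of that idiom with hipcub::DeviceScan::ExclusiveSum (buffer names are illustrative; requires the hipCUB package):

// hipcub_scan_demo.hip -- compile with: hipcc hipcub_scan_demo.hip
#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>
#include <cstdio>

int main() {
  const int n = 8;
  int h_in[n] = {1, 2, 3, 4, 5, 6, 7, 8};
  int *d_in = nullptr, *d_out = nullptr;
  hipMalloc((void**)&d_in, n * sizeof(int));
  hipMalloc((void**)&d_out, n * sizeof(int));
  hipMemcpy(d_in, h_in, n * sizeof(int), hipMemcpyHostToDevice);

  // Phase 1: query how much scratch space the scan needs.
  void* d_temp_storage = nullptr;
  size_t temp_storage_bytes = 0;
  hipcub::DeviceScan::ExclusiveSum(
      d_temp_storage, temp_storage_bytes, d_in, d_out, n);

  // Phase 2: allocate the scratch space and run the actual scan.
  hipMalloc(&d_temp_storage, temp_storage_bytes);
  hipcub::DeviceScan::ExclusiveSum(
      d_temp_storage, temp_storage_bytes, d_in, d_out, n);

  int h_out[n];
  hipMemcpy(h_out, d_out, n * sizeof(int), hipMemcpyDeviceToHost);
  printf("exclusive sum at last slot: %d\n", h_out[n - 1]);  // 1+2+...+7 = 28

  hipFree(d_temp_storage);
  hipFree(d_in);
  hipFree(d_out);
  return 0;
}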
src/graph/sampling/randomwalks/get_node_types_gpu.cu → src/graph/sampling/randomwalks/get_node_types_gpu.hip
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2021 by Contributors
  * @file graph/sampling/get_node_types_gpu.cu
  * @brief DGL sampler
  */
-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
 #include <dgl/array.h>
 #include <dgl/base_heterograph.h>
 #include <dgl/runtime/device_api.h>
...
src/graph/sampling/randomwalks/randomwalk_gpu.cu → src/graph/sampling/randomwalks/randomwalk_gpu.hip
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
  * Copyright (c) 2021-2022 by Contributors
  * @file graph/sampling/randomwalk_gpu.cu
  * @brief CUDA random walk sampleing
  */
-#include <curand_kernel.h>
+#include <hiprand/hiprand_kernel.h>
 #include <dgl/array.h>
 #include <dgl/base_heterograph.h>
 #include <dgl/random.h>
 #include <dgl/runtime/device_api.h>
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
 #include <tuple>
 #include <utility>
 #include <vector>
...
@@ -48,10 +50,10 @@ __global__ void _RandomWalkKernel(
   int64_t last_idx =
       min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_seeds);
   int64_t trace_length = (max_num_steps + 1);
-  curandState rng;
+  hiprandState_t rng;
   // reference:
-  // https://docs.nvidia.com/cuda/curand/device-api-overview.html#performance-notes
-  curand_init(rand_seed + idx, 0, 0, &rng);
+  // https://docs.nvidia.com/cuda/hiprand/device-api-overview.html#performance-notes
+  hiprand_init(rand_seed + idx, 0, 0, &rng);
   while (idx < last_idx) {
     IdType curr = seed_data[idx];
...
@@ -68,18 +70,18 @@ __global__ void _RandomWalkKernel(
       if (deg == 0) {  // the degree is zero
         break;
       }
-      const int64_t num = curand(&rng) % deg;
+      const int64_t num = hiprand(&rng) % deg;
       IdType pick = graph.in_cols[in_row_start + num];
       IdType eid =
           (graph.data ? graph.data[in_row_start + num] : in_row_start + num);
       *traces_data_ptr = pick;
       *eids_data_ptr = eid;
       if ((restart_prob_size > 1) &&
-          (curand_uniform(&rng) < restart_prob_data[step_idx])) {
+          (hiprand_uniform(&rng) < restart_prob_data[step_idx])) {
         break;
       } else if (
           (restart_prob_size == 1) &&
-          (curand_uniform(&rng) < restart_prob_data[0])) {
+          (hiprand_uniform(&rng) < restart_prob_data[0])) {
         break;
       }
       ++traces_data_ptr;
...
@@ -107,10 +109,10 @@ __global__ void _RandomWalkBiasedKernel(
   int64_t last_idx =
       min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_seeds);
   int64_t trace_length = (max_num_steps + 1);
-  curandState rng;
+  hiprandState_t rng;
   // reference:
-  // https://docs.nvidia.com/cuda/curand/device-api-overview.html#performance-notes
-  curand_init(rand_seed + idx, 0, 0, &rng);
+  // https://docs.nvidia.com/cuda/hiprand/device-api-overview.html#performance-notes
+  hiprand_init(rand_seed + idx, 0, 0, &rng);
   while (idx < last_idx) {
     IdType curr = seed_data[idx];
...
@@ -133,9 +135,9 @@ __global__ void _RandomWalkBiasedKernel(
       const FloatType *prob = probs[metapath_id];
       int64_t num;
       if (prob == nullptr) {
-        num = curand(&rng) % deg;
+        num = hiprand(&rng) % deg;
       } else {
-        auto rnd_sum_w = prob_sum[curr] * curand_uniform(&rng);
+        auto rnd_sum_w = prob_sum[curr] * hiprand_uniform(&rng);
         FloatType sum_w{0.};
         for (num = 0; num < deg; ++num) {
           sum_w += prob[in_row_start + num];
...
@@ -149,11 +151,11 @@ __global__ void _RandomWalkBiasedKernel(
       *traces_data_ptr = pick;
       *eids_data_ptr = eid;
       if ((restart_prob_size > 1) &&
-          (curand_uniform(&rng) < restart_prob_data[step_idx])) {
+          (hiprand_uniform(&rng) < restart_prob_data[step_idx])) {
         break;
       } else if (
           (restart_prob_size == 1) &&
-          (curand_uniform(&rng) < restart_prob_data[0])) {
+          (hiprand_uniform(&rng) < restart_prob_data[0])) {
         break;
       }
       ++traces_data_ptr;
...
@@ -202,7 +204,7 @@ std::pair<IdArray, IdArray> RandomWalkUniform(
             : nullptr);
   }
   // use cuda stream from local thread
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   auto device = DeviceAPI::Get(ctx);
   auto d_graphs = static_cast<GraphKernelData<IdType> *>(device->AllocWorkspace(
       ctx, (num_etypes) * sizeof(GraphKernelData<IdType>)));
...
@@ -222,7 +224,7 @@ std::pair<IdArray, IdArray> RandomWalkUniform(
   const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000);
   ATEN_FLOAT_TYPE_SWITCH(
       restart_prob->dtype, FloatType, "random walk GPU kernel", {
-        CHECK(restart_prob->ctx.device_type == kDGLCUDA)
+        CHECK(restart_prob->ctx.device_type == kDGLCUDA || restart_prob->ctx.device_type == kDGLROCM)
            << "restart prob should be in GPU.";
        CHECK(restart_prob->ndim == 1) << "restart prob dimension should be 1.";
        const FloatType *restart_prob_data = restart_prob.Ptr<FloatType>();
...
@@ -263,7 +265,7 @@ std::pair<IdArray, IdArray> RandomWalkBiased(
   IdType *traces_data = traces.Ptr<IdType>();
   IdType *eids_data = eids.Ptr<IdType>();
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   auto device = DeviceAPI::Get(ctx);
   // new probs and prob sums pointers
   assert(num_etypes == static_cast<int64_t>(prob.size()));
...
@@ -297,11 +299,11 @@ std::pair<IdArray, IdArray> RandomWalkBiased(
     // calculate the sum of the neighbor weights
     const IdType *d_offsets = static_cast<const IdType *>(csr.indptr->data);
     size_t temp_storage_size = 0;
-    CUDA_CALL(cub::DeviceSegmentedReduce::Sum(
+    CUDA_CALL(hipcub::DeviceSegmentedReduce::Sum(
        nullptr, temp_storage_size, probs[etype], prob_sums[etype],
        num_segments, d_offsets, d_offsets + 1, stream));
     void *temp_storage = device->AllocWorkspace(ctx, temp_storage_size);
-    CUDA_CALL(cub::DeviceSegmentedReduce::Sum(
+    CUDA_CALL(hipcub::DeviceSegmentedReduce::Sum(
        temp_storage, temp_storage_size, probs[etype], prob_sums[etype],
        num_segments, d_offsets, d_offsets + 1, stream));
     device->FreeWorkspace(ctx, temp_storage);
...
@@ -335,7 +337,7 @@ std::pair<IdArray, IdArray> RandomWalkBiased(
   dim3 block(256);
   dim3 grid((num_seeds + TILE_SIZE - 1) / TILE_SIZE);
   const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000);
-  CHECK(restart_prob->ctx.device_type == kDGLCUDA)
+  CHECK(restart_prob->ctx.device_type == kDGLCUDA || restart_prob->ctx.device_type == kDGLROCM)
       << "restart prob should be in GPU.";
   CHECK(restart_prob->ndim == 1) << "restart prob dimension should be 1.";
   const FloatType *restart_prob_data = restart_prob.Ptr<FloatType>();
...
@@ -396,7 +398,7 @@ std::pair<IdArray, IdArray> RandomWalkWithRestart(
   auto device = dgl::runtime::DeviceAPI::Get(device_ctx);
   // use cuda stream from local thread
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   device->CopyDataFromTo(
       &restart_prob, 0, restart_prob_array.Ptr<double>(), 0, sizeof(double),
       DGLContext{kDGLCPU, 0}, device_ctx, restart_prob_array->dtype);
...
@@ -443,13 +445,13 @@ template <DGLDeviceType XPU, typename IdxType>
 std::tuple<IdArray, IdArray, IdArray> SelectPinSageNeighbors(
     const IdArray src, const IdArray dst, const int64_t num_samples_per_node,
     const int64_t k) {
-  CHECK(src->ctx.device_type == kDGLCUDA) << "IdArray needs be on GPU!";
+  CHECK(src->ctx.device_type == kDGLCUDA || src->ctx.device_type == kDGLROCM) << "IdArray needs be on GPU!";
   const IdxType *src_data = src.Ptr<IdxType>();
   const IdxType *dst_data = dst.Ptr<IdxType>();
   const int64_t num_dst_nodes = (dst->shape[0] / num_samples_per_node);
   auto ctx = src->ctx;
   // use cuda stream from local thread
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   auto frequency_hashmap = FrequencyHashmap<IdxType>(
       num_dst_nodes, num_samples_per_node, ctx, stream);
   auto ret = frequency_hashmap.Topk(
...
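Note: besides the mechanical hiprand/hipcub renames, the CHECK guards in this file were widened by hand to also accept kDGLROCM, so tensors placed on a ROCm device pass the "should be in GPU" assertions. The sampling idiom itself is unchanged: pick a uniform neighbor with hiprand() % deg and cut the walk short when hiprand_uniform() falls below the restart probability. A hedged, self-contained sketch of one such step on a toy CSR graph (all names below are illustrative, not DGL's):

// walk_step_demo.hip -- compile with: hipcc walk_step_demo.hip
#include <hip/hip_runtime.h>
#include <hiprand/hiprand_kernel.h>
#include <cstdint>
#include <cstdio>

__global__ void one_walk_step(
    const int64_t* indptr, const int64_t* indices, const int64_t* seeds,
    int64_t* picks, int64_t num_seeds, float restart_prob, uint64_t seed) {
  int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= num_seeds) return;
  hiprandState_t rng;
  hiprand_init(seed + idx, 0, 0, &rng);
  int64_t curr = seeds[idx];
  int64_t deg = indptr[curr + 1] - indptr[curr];
  if (deg == 0 || hiprand_uniform(&rng) < restart_prob) {
    picks[idx] = -1;                     // dead end or restart: no neighbor taken
  } else {
    int64_t num = hiprand(&rng) % deg;   // uniform neighbor slot
    picks[idx] = indices[indptr[curr] + num];
  }
}

int main() {
  // Tiny CSR graph: node 0 -> {1, 2}, node 1 -> {2}, node 2 -> {}.
  const int64_t h_indptr[] = {0, 2, 3, 3};
  const int64_t h_indices[] = {1, 2, 2};
  const int64_t h_seeds[] = {0, 1, 2};
  const int64_t num_seeds = 3;

  int64_t *d_indptr, *d_indices, *d_seeds, *d_picks;
  hipMalloc((void**)&d_indptr, sizeof(h_indptr));
  hipMalloc((void**)&d_indices, sizeof(h_indices));
  hipMalloc((void**)&d_seeds, sizeof(h_seeds));
  hipMalloc((void**)&d_picks, num_seeds * sizeof(int64_t));
  hipMemcpy(d_indptr, h_indptr, sizeof(h_indptr), hipMemcpyHostToDevice);
  hipMemcpy(d_indices, h_indices, sizeof(h_indices), hipMemcpyHostToDevice);
  hipMemcpy(d_seeds, h_seeds, sizeof(h_seeds), hipMemcpyHostToDevice);

  one_walk_step<<<1, 64>>>(
      d_indptr, d_indices, d_seeds, d_picks, num_seeds, 0.1f, 42);

  int64_t h_picks[3];
  hipMemcpy(h_picks, d_picks, sizeof(h_picks), hipMemcpyDeviceToHost);
  for (int i = 0; i < num_seeds; ++i)
    printf("seed %d -> %lld\n", i, static_cast<long long>(h_picks[i]));

  hipFree(d_indptr); hipFree(d_indices); hipFree(d_seeds); hipFree(d_picks);
  return 0;
}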
src/graph/serialize/heterograph_serialize.cc
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2019 by Contributors
  * @file graph/serialize/heterograph_serialize.cc
...
@@ -48,8 +49,8 @@
 #include <vector>
 #include "../heterograph.h"
-#include "./dglstream.h"
-#include "./graph_serialize.h"
+#include "dglstream.h"
+#include "graph_serialize.h"
 #include "dmlc/memory_io.h"
 namespace dgl {
...
src/graph/subgraph.cc
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2020 by Contributors
  * @file graph/subgraph.cc
  * @brief Functions for extracting subgraphs.
  */
-#include "./heterograph.h"
+#include "heterograph.h"
 using namespace dgl::runtime;
 namespace dgl {
...
src/graph/transform/cuda/cuda_compact_graph.cu → src/graph/transform/cuda/cuda_compact_graph.hip
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright 2021 Contributors
  *
...
@@ -18,7 +19,7 @@
  * all given graphs with the same set of nodes.
  */
-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
 #include <dgl/immutable_graph.h>
 #include <dgl/runtime/device_api.h>
...
@@ -55,10 +56,10 @@ template <typename IdType>
 void BuildNodeMaps(
     const std::vector<IdArray> &input_nodes,
     DeviceNodeMap<IdType> *const node_maps, int64_t *const count_unique_device,
-    std::vector<IdArray> *const unique_nodes_device, cudaStream_t stream) {
+    std::vector<IdArray> *const unique_nodes_device, hipStream_t stream) {
   const int64_t num_ntypes = static_cast<int64_t>(input_nodes.size());
-  CUDA_CALL(cudaMemsetAsync(
+  CUDA_CALL(hipMemsetAsync(
       count_unique_device, 0, num_ntypes * sizeof(*count_unique_device),
       stream));
...
@@ -81,7 +82,7 @@ std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>> CompactGraphsGPU(
     const std::vector<IdArray> &always_preserve) {
   const auto &ctx = graphs[0]->Context();
   auto device = runtime::DeviceAPI::Get(ctx);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   CHECK_EQ(ctx.device_type, kDGLCUDA);
...
src/graph/transform/cuda/cuda_map_edges.cuh
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright 2020-2022 Contributors
  *
...
@@ -22,7 +23,7 @@
 #include <dgl/runtime/c_runtime_api.h>
 #include <dgl/base_heterograph.h>
-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
 #include <dgl/runtime/c_runtime_api.h>
 #include <algorithm>
...
@@ -113,7 +114,7 @@ class DeviceNodeMap {
   DeviceNodeMap(
       const std::vector<int64_t>& num_nodes, const int64_t offset,
-      DGLContext ctx, cudaStream_t stream)
+      DGLContext ctx, hipStream_t stream)
       : num_types_(num_nodes.size()),
         rhs_offset_(offset),
         hash_tables_(),
...
@@ -185,7 +186,7 @@ inline IdType RoundUp(const IdType num, const size_t unit) {
 template <typename IdType>
 std::tuple<std::vector<IdArray>, std::vector<IdArray>> MapEdges(
     HeteroGraphPtr graph, const std::vector<EdgeArray>& edge_sets,
-    const DeviceNodeMap<IdType>& node_map, cudaStream_t stream) {
+    const DeviceNodeMap<IdType>& node_map, hipStream_t stream) {
   constexpr const int BLOCK_SIZE = 128;
   constexpr const size_t TILE_SIZE = 1024;
...
src/graph/transform/cuda/cuda_to_block.cu → src/graph/transform/cuda/cuda_to_block.hip
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright 2020-2021 Contributors
  *
...
@@ -20,7 +21,7 @@
  * Tested via python wrapper: python/dgl/path/to/to_block.py
  */
-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
 #include <dgl/immutable_graph.h>
 #include <dgl/runtime/device_api.h>
 #include <dgl/runtime/tensordispatch.h>
...
@@ -69,10 +70,10 @@ class DeviceNodeMapMaker {
       const std::vector<IdArray>& lhs_nodes,
       const std::vector<IdArray>& rhs_nodes,
       DeviceNodeMap<IdType>* const node_maps, int64_t* const count_lhs_device,
-      std::vector<IdArray>* const lhs_device, cudaStream_t stream) {
+      std::vector<IdArray>* const lhs_device, hipStream_t stream) {
     const int64_t num_ntypes = lhs_nodes.size() + rhs_nodes.size();
-    CUDA_CALL(cudaMemsetAsync(
+    CUDA_CALL(hipMemsetAsync(
         count_lhs_device, 0, num_ntypes * sizeof(*count_lhs_device), stream));
     // possibly dublicate lhs nodes
...
@@ -112,7 +113,7 @@ class DeviceNodeMapMaker {
   void Make(
       const std::vector<IdArray>& lhs_nodes,
       const std::vector<IdArray>& rhs_nodes,
-      DeviceNodeMap<IdType>* const node_maps, cudaStream_t stream) {
+      DeviceNodeMap<IdType>* const node_maps, hipStream_t stream) {
     const int64_t num_ntypes = lhs_nodes.size() + rhs_nodes.size();
     // unique lhs nodes
...
@@ -155,7 +156,7 @@ struct CUDAIdsMapper {
     std::vector<int64_t>& num_nodes_per_type = *num_nodes_per_type_ptr;
     const bool generate_lhs_nodes = lhs_nodes.empty();
     auto device = runtime::DeviceAPI::Get(ctx);
-    cudaStream_t stream = runtime::getCurrentCUDAStream();
+    hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
    // Allocate space for map creation process.
    DeviceNodeMapMaker<IdType> maker(maxNodesPerType);
...
@@ -168,7 +169,7 @@ struct CUDAIdsMapper {
       }
     }
-    cudaEvent_t copyEvent;
+    hipEvent_t copyEvent;
     NDArray new_len_tensor;
     // Populate the mappings.
     if (generate_lhs_nodes) {
...
@@ -179,7 +180,7 @@ struct CUDAIdsMapper {
           src_nodes, rhs_nodes, &node_maps, count_lhs_device, &lhs_nodes,
           stream);
-      CUDA_CALL(cudaEventCreate(&copyEvent));
+      CUDA_CALL(hipEventCreate(&copyEvent));
       if (TensorDispatcher::Global()->IsAvailable()) {
         new_len_tensor = NDArray::PinnedEmpty(
             {num_ntypes}, DGLDataTypeTraits<int64_t>::dtype,
...
@@ -190,11 +191,11 @@ struct CUDAIdsMapper {
             {num_ntypes}, DGLDataTypeTraits<int64_t>::dtype,
             DGLContext{kDGLCPU, 0});
       }
-      CUDA_CALL(cudaMemcpyAsync(
+      CUDA_CALL(hipMemcpyAsync(
          new_len_tensor->data, count_lhs_device,
          sizeof(*num_nodes_per_type.data()) * num_ntypes,
-          cudaMemcpyDeviceToHost, stream));
-      CUDA_CALL(cudaEventRecord(copyEvent, stream));
+          hipMemcpyDeviceToHost, stream));
+      CUDA_CALL(hipEventRecord(copyEvent, stream));
       device->FreeWorkspace(ctx, count_lhs_device);
     } else {
...
@@ -209,8 +210,8 @@ struct CUDAIdsMapper {
     if (generate_lhs_nodes) {
       // wait for the previous copy
-      CUDA_CALL(cudaEventSynchronize(copyEvent));
-      CUDA_CALL(cudaEventDestroy(copyEvent));
+      CUDA_CALL(hipEventSynchronize(copyEvent));
+      CUDA_CALL(hipEventDestroy(copyEvent));
       // Resize lhs nodes.
       for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) {
...
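Note: cuda_to_block.hip keeps the original event choreography, just spelled with the HIP API: create an event, record it on the stream after the async copy, synchronize on it before the copied lengths are read, then destroy it. A minimal hedged sketch of that lifecycle, independent of DGL (buffer names are illustrative):

// event_demo.hip -- compile with: hipcc event_demo.hip
#include <hip/hip_runtime.h>
#include <cstdio>
#include <vector>

int main() {
  const size_t n = 1 << 20;
  std::vector<int> h_buf(n, -1);
  int* d_buf = nullptr;
  hipStream_t stream;
  hipEvent_t copyEvent;

  hipStreamCreate(&stream);
  hipEventCreate(&copyEvent);
  hipMalloc((void**)&d_buf, n * sizeof(int));

  hipMemsetAsync(d_buf, 0, n * sizeof(int), stream);      // async work queued on the stream
  hipMemcpyAsync(h_buf.data(), d_buf, n * sizeof(int),
                 hipMemcpyDeviceToHost, stream);           // async device-to-host copy
  hipEventRecord(copyEvent, stream);                       // mark the point right after the copy

  // ...unrelated host work could overlap here...

  hipEventSynchronize(copyEvent);                          // block only until the recorded point
  printf("first element after copy: %d\n", h_buf[0]);      // expected 0

  hipEventDestroy(copyEvent);
  hipStreamDestroy(stream);
  hipFree(d_buf);
  return 0;
}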