Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
dgl
Commits
74d88bf8
Commit
74d88bf8
authored
Feb 20, 2025
by
sangwz
Browse files
Merge branch 'dtk25.04' of
http://developer.sourcefind.cn/codes/OpenDAS/dgl
into 2.2.1
parents
2a1ac588
314cedc1
Changes
179
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
167 additions
and
98 deletions
+167
-98
src/array/cuda/utils.hip
src/array/cuda/utils.hip
+7
-5
src/array/cuda/uvm/array_index_select_uvm.cuh
src/array/cuda/uvm/array_index_select_uvm.cuh
+2
-0
src/array/cuda/uvm/array_index_select_uvm.hip
src/array/cuda/uvm/array_index_select_uvm.hip
+5
-3
src/array/filter.cc
src/array/filter.cc
+3
-2
src/array/kernel.cc
src/array/kernel.cc
+2
-1
src/array/libra_partition.cc
src/array/libra_partition.cc
+2
-1
src/array/selector.h
src/array/selector.h
+4
-3
src/array/uvm_array.cc
src/array/uvm_array.cc
+2
-1
src/geometry/cuda/edge_coarsening_impl.hip
src/geometry/cuda/edge_coarsening_impl.hip
+11
-9
src/geometry/cuda/geometry_op_impl.hip
src/geometry/cuda/geometry_op_impl.hip
+4
-2
src/geometry/geometry.cc
src/geometry/geometry.cc
+2
-1
src/graph/creators.cc
src/graph/creators.cc
+2
-1
src/graph/heterograph.cc
src/graph/heterograph.cc
+2
-1
src/graph/heterograph.h
src/graph/heterograph.h
+2
-1
src/graph/heterograph_capi.cc
src/graph/heterograph_capi.cc
+2
-1
src/graph/pickle.cc
src/graph/pickle.cc
+2
-1
src/graph/sampling/randomwalks/frequency_hashmap.cuh
src/graph/sampling/randomwalks/frequency_hashmap.cuh
+4
-2
src/graph/sampling/randomwalks/frequency_hashmap.hip
src/graph/sampling/randomwalks/frequency_hashmap.hip
+20
-18
src/graph/sampling/randomwalks/get_node_types_gpu.hip
src/graph/sampling/randomwalks/get_node_types_gpu.hip
+2
-1
src/graph/sampling/randomwalks/randomwalk_gpu.hip
src/graph/sampling/randomwalks/randomwalk_gpu.hip
+87
-44
No files found.
src/array/cuda/utils.
cu
→
src/array/cuda/utils.
hip
View file @
74d88bf8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2020 by Contributors
* @file array/cuda/utils.cu
* @brief Utilities for CUDA kernels.
*/
#include <cub/cub.
cuh
>
#include <
hip
cub/
hip
cub.
hpp
>
#include "../../runtime/cuda/cuda_common.h"
#include "
./
utils.h"
#include "utils.h"
namespace dgl {
namespace cuda {
...
...
@@ -17,11 +19,11 @@ bool AllTrue(int8_t* flags, int64_t length, const DGLContext& ctx) {
int8_t* rst = static_cast<int8_t*>(device->AllocWorkspace(ctx, 1));
// Call CUB's reduction
size_t workspace_size = 0;
cuda
Stream_t
stream
=
runtime
::
getCurrent
CUDA
Stream
();
CUDA_CALL
(
cub
::
DeviceReduce
::
Min
(
hip
Stream_t stream = runtime::getCurrent
HIP
Stream
MasqueradingAsCUDA
();
CUDA_CALL(
hip
cub::DeviceReduce::Min(
nullptr, workspace_size, flags, rst, length, stream));
void* workspace = device->AllocWorkspace(ctx, workspace_size);
CUDA_CALL
(
cub
::
DeviceReduce
::
Min
(
CUDA_CALL(
hip
cub::DeviceReduce::Min(
workspace, workspace_size, flags, rst, length, stream));
int8_t cpu_rst = GetCUDAScalar(device, ctx, rst);
device->FreeWorkspace(ctx, workspace);
...
...
src/array/cuda/uvm/array_index_select_uvm.cuh
View file @
74d88bf8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2021 by Contributors
* @file array/cuda/uvm/array_index_select_uvm.cuh
...
...
src/array/cuda/uvm/array_index_select_uvm.
cu
→
src/array/cuda/uvm/array_index_select_uvm.
hip
View file @
74d88bf8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2019-2022 by Contributors
* @file array/cuda/uvm/array_index_select_uvm.cu
...
...
@@ -8,7 +10,7 @@
#include "../../../runtime/cuda/cuda_common.h"
#include "../array_index_select.cuh"
#include "../utils.h"
#include "
./
array_index_select_uvm.cuh"
#include "array_index_select_uvm.cuh"
namespace dgl {
using runtime::NDArray;
...
...
@@ -17,7 +19,7 @@ namespace impl {
template <typename DType, typename IdType>
NDArray IndexSelectCPUFromGPU(NDArray array, IdArray index) {
cuda
Stream_t
stream
=
runtime
::
getCurrent
CUDA
Stream
();
hip
Stream_t stream = runtime::getCurrent
HIP
Stream
MasqueradingAsCUDA
();
const int64_t arr_len = array->shape[0];
const int64_t len = index->shape[0];
int64_t num_feat = 1;
...
...
@@ -78,7 +80,7 @@ template NDArray IndexSelectCPUFromGPU<int64_t, int64_t>(NDArray, IdArray);
template <typename DType, typename IdType>
void IndexScatterGPUToCPU(NDArray dest, IdArray index, NDArray source) {
cuda
Stream_t
stream
=
runtime
::
getCurrent
CUDA
Stream
();
hip
Stream_t stream = runtime::getCurrent
HIP
Stream
MasqueradingAsCUDA
();
const DType* source_data = static_cast<DType*>(source->data);
const IdType* idx_data = static_cast<IdType*>(index->data);
const int64_t arr_len = dest->shape[0];
...
...
src/array/filter.cc
View file @
74d88bf8
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2021 by Contributors
* @file array/filter.cc
* @brief Object for selecting items in a set, or selecting items not in a set.
*/
#include "
./
filter.h"
#include "filter.h"
#include <dgl/packed_func_ext.h>
#include <dgl/runtime/packed_func.h>
...
...
@@ -23,7 +24,7 @@ DGL_REGISTER_GLOBAL("utils.filter._CAPI_DGLFilterCreateFromSet")
IdArray
array
=
args
[
0
];
auto
ctx
=
array
->
ctx
;
// TODO(nv-dlasalle): Implement CPU version.
if
(
ctx
.
device_type
==
kDGLCUDA
)
{
if
(
ctx
.
device_type
==
kDGLCUDA
||
ctx
.
device_type
==
kDGLROCM
)
{
#ifdef DGL_USE_CUDA
ATEN_ID_TYPE_SWITCH
(
array
->
dtype
,
IdType
,
{
*
rv
=
CreateSetFilter
<
kDGLCUDA
,
IdType
>
(
array
);
...
...
src/array/kernel.cc
View file @
74d88bf8
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file array/kernel.cc
...
...
@@ -7,7 +8,7 @@
#include <dgl/packed_func_ext.h>
#include "../c_api_common.h"
#include "
./
check.h"
#include "check.h"
#include "kernel_decl.h"
using
namespace
dgl
::
runtime
;
...
...
src/array/libra_partition.cc
View file @
74d88bf8
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2021 Intel Corporation
*
...
...
@@ -21,7 +22,7 @@
#include <vector>
#include "../c_api_common.h"
#include "
./
check.h"
#include "check.h"
#include "kernel_decl.h"
using
namespace
dgl
::
runtime
;
...
...
src/array/selector.h
View file @
74d88bf8
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file array/selector.h
...
...
@@ -12,13 +13,13 @@ namespace dgl {
namespace
{
#ifdef __
CUDA
CC__
#define DGLDEVICE __device__
#ifdef __
HIP
CC__
#define DGLDEVICE __device__
__host__
#define DGLINLINE __forceinline__
#else
#define DGLDEVICE
#define DGLINLINE inline
#endif // __
CUDA
CC__
#endif // __
HIP
CC__
}
// namespace
...
...
src/array/uvm_array.cc
View file @
74d88bf8
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2019-2022 by Contributors
* @file array/uvm_array.cc
...
...
@@ -8,7 +9,7 @@
#include <sstream>
#include "../c_api_common.h"
#include "
./
uvm_array_op.h"
#include "uvm_array_op.h"
using
namespace
dgl
::
runtime
;
...
...
src/geometry/cuda/edge_coarsening_impl.
cu
→
src/geometry/cuda/edge_coarsening_impl.
hip
View file @
74d88bf8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2019 by Contributors
* @file geometry/cuda/edge_coarsening_impl.cu
* @brief Edge coarsening CUDA implementation
*/
#include <
cu
rand_kernel.h>
#include <
hiprand/hip
rand_kernel.h>
#include <dgl/array.h>
#include <dgl/random.h>
#include <dmlc/thread_local.h>
...
...
@@ -32,9 +34,9 @@ __global__ void generate_uniform_kernel(
float *ret_values, size_t num, uint64_t seed) {
size_t id = blockIdx.x * blockDim.x + threadIdx.x;
if (id < num) {
cu
randState
state
;
cu
rand_init
(
seed
,
id
,
0
,
&
state
);
ret_values
[
id
]
=
cu
rand_uniform
(
&
state
);
hip
randState
_t
state;
hip
rand_init(seed, id, 0, &state);
ret_values[id] =
hip
rand_uniform(&state);
}
}
...
...
@@ -116,7 +118,7 @@ __global__ void weighted_respond_kernel(
template <typename IdType>
bool Colorize(IdType *result_data, int64_t num_nodes, float *const prop) {
// initial done signal
cuda
Stream_t
stream
=
runtime
::
getCurrent
CUDA
Stream
();
hip
Stream_t stream = runtime::getCurrent
HIP
Stream
MasqueradingAsCUDA
();
CUDA_KERNEL_CALL(init_done_kernel, 1, 1, 0, stream);
// generate color prop for each node
...
...
@@ -132,8 +134,8 @@ bool Colorize(IdType *result_data, int64_t num_nodes, float *const prop) {
colorize_kernel, num_blocks, num_threads, 0, stream, prop, num_nodes,
result_data);
bool done_h = false;
CUDA_CALL
(
cuda
MemcpyFromSymbol
(
&
done_h
,
done_d
,
sizeof
(
done_h
),
0
,
cuda
MemcpyDeviceToHost
));
CUDA_CALL(
hip
MemcpyFromSymbol(
&done_h, done_d, sizeof(done_h), 0,
hip
MemcpyDeviceToHost));
return done_h;
}
...
...
@@ -155,7 +157,7 @@ bool Colorize(IdType *result_data, int64_t num_nodes, float *const prop) {
template <DGLDeviceType XPU, typename FloatType, typename IdType>
void WeightedNeighborMatching(
const aten::CSRMatrix &csr, const NDArray weight, IdArray result) {
cuda
Stream_t
stream
=
runtime
::
getCurrent
CUDA
Stream
();
hip
Stream_t stream = runtime::getCurrent
HIP
Stream
MasqueradingAsCUDA
();
const auto &ctx = result->ctx;
auto device = runtime::DeviceAPI::Get(ctx);
device->SetDevice(ctx);
...
...
@@ -216,7 +218,7 @@ void NeighborMatching(const aten::CSRMatrix &csr, IdArray result) {
device->SetDevice(ctx);
// generate random weights
cuda
Stream_t
stream
=
runtime
::
getCurrent
CUDA
Stream
();
hip
Stream_t stream = runtime::getCurrent
HIP
Stream
MasqueradingAsCUDA
();
NDArray weight = NDArray::Empty(
{num_edges}, DGLDataType{kDGLFloat, sizeof(float) * 8, 1}, ctx);
float *weight_data = static_cast<float *>(weight->data);
...
...
src/geometry/cuda/geometry_op_impl.
cu
→
src/geometry/cuda/geometry_op_impl.
hip
View file @
74d88bf8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2019 by Contributors
* @file geometry/cuda/geometry_op_impl.cc
...
...
@@ -95,7 +97,7 @@ template <DGLDeviceType XPU, typename FloatType, typename IdType>
void FarthestPointSampler(
NDArray array, int64_t batch_size, int64_t sample_points, NDArray dist,
IdArray start_idx, IdArray result) {
cuda
Stream_t
stream
=
runtime
::
getCurrent
CUDA
Stream
();
hip
Stream_t stream = runtime::getCurrent
HIP
Stream
MasqueradingAsCUDA
();
const FloatType* array_data = static_cast<FloatType*>(array->data);
...
...
@@ -110,7 +112,7 @@ void FarthestPointSampler(
// sample for each cloud in the batch
IdType* start_idx_data = static_cast<IdType*>(start_idx->data);
CUDA_CALL
(
cuda
SetDevice
(
array
->
ctx
.
device_id
));
CUDA_CALL(
hip
SetDevice(array->ctx.device_id));
CUDA_KERNEL_CALL(
fps_kernel, batch_size, THREADS, 0, stream, array_data, batch_size,
...
...
src/geometry/geometry.cc
View file @
74d88bf8
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2019 by Contributors
* @file geometry/geometry.cc
...
...
@@ -10,7 +11,7 @@
#include "../array/check.h"
#include "../c_api_common.h"
#include "
./
geometry_op.h"
#include "geometry_op.h"
using
namespace
dgl
::
runtime
;
...
...
src/graph/creators.cc
View file @
74d88bf8
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file graph/creators.cc
* @brief Functions for constructing graphs.
*/
#include "
./
heterograph.h"
#include "heterograph.h"
using
namespace
dgl
::
runtime
;
namespace
dgl
{
...
...
src/graph/heterograph.cc
View file @
74d88bf8
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2019 by Contributors
* @file graph/heterograph.cc
* @brief Heterograph implementation
*/
#include "
./
heterograph.h"
#include "heterograph.h"
#include <dgl/array.h>
#include <dgl/graph_serializer.h>
...
...
src/graph/heterograph.h
View file @
74d88bf8
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2019 by Contributors
* @file graph/heterograph.h
...
...
@@ -18,7 +19,7 @@
#include <utility>
#include <vector>
#include "
./
unit_graph.h"
#include "unit_graph.h"
#include "shared_mem_manager.h"
namespace
dgl
{
...
...
src/graph/heterograph_capi.cc
View file @
74d88bf8
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file graph/heterograph_capi.cc
...
...
@@ -14,7 +15,7 @@
#include <set>
#include "../c_api_common.h"
#include "
./
heterograph.h"
#include "heterograph.h"
#include "unit_graph.h"
using
namespace
dgl
::
runtime
;
...
...
src/graph/pickle.cc
View file @
74d88bf8
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file graph/pickle.cc
...
...
@@ -10,7 +11,7 @@
#include <dmlc/memory_io.h>
#include "../c_api_common.h"
#include "
./
heterograph.h"
#include "heterograph.h"
#include "unit_graph.h"
using
namespace
dgl
::
runtime
;
...
...
src/graph/sampling/randomwalks/frequency_hashmap.cuh
View file @
74d88bf8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2021 by Contributors
* @file graph/sampling/frequency_hashmap.cuh
...
...
@@ -56,7 +58,7 @@ class FrequencyHashmap {
FrequencyHashmap
()
=
delete
;
FrequencyHashmap
(
int64_t
num_dst
,
int64_t
num_items_each_dst
,
DGLContext
ctx
,
cuda
Stream_t
stream
,
int64_t
edge_table_scale
=
kDefaultEdgeTableScale
);
hip
Stream_t
stream
,
int64_t
edge_table_scale
=
kDefaultEdgeTableScale
);
~
FrequencyHashmap
();
using
EdgeItem
=
typename
DeviceEdgeHashmap
<
IdxType
>::
EdgeItem
;
std
::
tuple
<
IdArray
,
IdArray
,
IdArray
>
Topk
(
...
...
@@ -66,7 +68,7 @@ class FrequencyHashmap {
private:
DGLContext
_ctx
;
cuda
Stream_t
_stream
;
hip
Stream_t
_stream
;
DeviceEdgeHashmap
<
IdxType
>
*
_device_edge_hashmap
;
IdxType
*
_dst_unique_edges
;
EdgeItem
*
_edge_hashmap
;
...
...
src/graph/sampling/randomwalks/frequency_hashmap.
cu
→
src/graph/sampling/randomwalks/frequency_hashmap.
hip
View file @
74d88bf8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2021 by Contributors
* @file graph/sampling/frequency_hashmap.cu
...
...
@@ -5,7 +7,7 @@
*/
#include <algorithm>
#include <cub/cub.
cuh
> // NOLINT
#include <
hip
cub/
hip
cub.
hpp
> // NOLINT
#include <tuple>
#include <utility>
...
...
@@ -71,7 +73,7 @@ __global__ void _count_frequency(
}
}
using
BlockReduce
=
typename
cub
::
BlockReduce
<
IdxType
,
BLOCK_SIZE
>
;
using BlockReduce = typename
hip
cub::BlockReduce<IdxType, BLOCK_SIZE>;
__shared__ typename BlockReduce::TempStorage temp_space;
count = BlockReduce(temp_space).Sum(count);
...
...
@@ -112,7 +114,7 @@ __global__ void _compact_frequency(
int64_t last_idx = start_idx + TILE_SIZE;
const IdxType block_offset = edge_blocks_prefix[blockIdx.x];
using
BlockScan
=
typename
cub
::
BlockScan
<
IdxType
,
BLOCK_SIZE
>
;
using BlockScan = typename
hip
cub::BlockScan<IdxType, BLOCK_SIZE>;
__shared__ typename BlockScan::TempStorage temp_space;
BlockPrefixCallbackOp<IdxType> prefix_op(0);
...
...
@@ -246,7 +248,7 @@ inline __device__ IdxType DeviceEdgeHashmap<IdxType>::GetEdgeCount(
template <typename IdxType>
FrequencyHashmap<IdxType>::FrequencyHashmap(
int64_t num_dst, int64_t num_items_each_dst, DGLContext ctx,
cuda
Stream_t
stream
,
int64_t
edge_table_scale
)
{
hip
Stream_t stream, int64_t edge_table_scale) {
_ctx = ctx;
_stream = stream;
num_items_each_dst = _table_size(num_items_each_dst, edge_table_scale);
...
...
@@ -259,7 +261,7 @@ FrequencyHashmap<IdxType>::FrequencyHashmap(
constexpr int TILE_SIZE = BLOCK_SIZE * 8;
dim3 block(BLOCK_SIZE);
dim3 grid((num_dst * num_items_each_dst + TILE_SIZE - 1) / TILE_SIZE);
CUDA_CALL
(
cuda
Memset
(
dst_unique_edges
,
0
,
(
num_dst
)
*
sizeof
(
IdxType
)));
CUDA_CALL(
hip
Memset(dst_unique_edges, 0, (num_dst) * sizeof(IdxType)));
CUDA_KERNEL_CALL(
(_init_edge_table<IdxType, BLOCK_SIZE, TILE_SIZE>), grid, block, 0,
_stream, edge_hashmap, (num_dst * num_items_each_dst));
...
...
@@ -300,7 +302,7 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
// _edge_hashmap
bool *is_first_position = static_cast<bool *>(
device->AllocWorkspace(_ctx, sizeof(bool) * (num_edges)));
CUDA_CALL
(
cuda
Memset
(
is_first_position
,
0
,
sizeof
(
bool
)
*
(
num_edges
)));
CUDA_CALL(
hip
Memset(is_first_position, 0, sizeof(bool) * (num_edges)));
// double space to use ExclusiveSum
auto edge_blocks_prefix_data = static_cast<IdxType *>(device->AllocWorkspace(
_ctx, 2 * sizeof(IdxType) * (num_edge_blocks + 1)));
...
...
@@ -327,11 +329,11 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
// 2.1 ExclusiveSum the edge_blocks_prefix
void *d_temp_storage = nullptr;
size_t temp_storage_bytes = 0;
CUDA_CALL
(
cub
::
DeviceScan
::
ExclusiveSum
(
CUDA_CALL(
hip
cub::DeviceScan::ExclusiveSum(
d_temp_storage, temp_storage_bytes, edge_blocks_prefix,
edge_blocks_prefix_alternate, num_edge_blocks + 1, _stream));
d_temp_storage = device->AllocWorkspace(_ctx, temp_storage_bytes);
CUDA_CALL
(
cub
::
DeviceScan
::
ExclusiveSum
(
CUDA_CALL(
hip
cub::DeviceScan::ExclusiveSum(
d_temp_storage, temp_storage_bytes, edge_blocks_prefix,
edge_blocks_prefix_alternate, num_edge_blocks + 1, _stream));
device->FreeWorkspace(_ctx, d_temp_storage);
...
...
@@ -365,19 +367,19 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
// 3.1 ExclusiveSum the num_unique_each_node
d_temp_storage = nullptr;
temp_storage_bytes = 0;
CUDA_CALL
(
cub
::
DeviceScan
::
ExclusiveSum
(
CUDA_CALL(
hip
cub::DeviceScan::ExclusiveSum(
d_temp_storage, temp_storage_bytes, num_unique_each_node,
num_unique_each_node_alternate, num_dst_nodes + 1, _stream));
d_temp_storage = device->AllocWorkspace(_ctx, temp_storage_bytes);
CUDA_CALL
(
cub
::
DeviceScan
::
ExclusiveSum
(
CUDA_CALL(
hip
cub::DeviceScan::ExclusiveSum(
d_temp_storage, temp_storage_bytes, num_unique_each_node,
num_unique_each_node_alternate, num_dst_nodes + 1, _stream));
device->FreeWorkspace(_ctx, d_temp_storage);
// 3.2 SegmentedRadixSort the unique_src_edges and unique_frequency
// Create a set of DoubleBuffers to wrap pairs of device pointers
cub
::
DoubleBuffer
<
Idx64Type
>
d_unique_frequency
(
hip
cub::DoubleBuffer<Idx64Type> d_unique_frequency(
unique_frequency, unique_frequency_alternate);
cub
::
DoubleBuffer
<
IdxType
>
d_unique_src_edges
(
hip
cub::DoubleBuffer<IdxType> d_unique_src_edges(
unique_src_edges, unique_src_edges_alternate);
// Determine temporary device storage requirements
d_temp_storage = nullptr;
...
...
@@ -385,12 +387,12 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
// the DeviceRadixSort is faster than DeviceSegmentedRadixSort,
// especially when num_dst_nodes is large (about ~10000)
if (dtype.bits == 32) {
CUDA_CALL
(
cub
::
DeviceRadixSort
::
SortPairsDescending
(
CUDA_CALL(
hip
cub::DeviceRadixSort::SortPairsDescending(
d_temp_storage, temp_storage_bytes, d_unique_frequency,
d_unique_src_edges, num_unique_edges, 0, sizeof(Idx64Type) * 8,
_stream));
} else {
CUDA_CALL
(
cub
::
DeviceSegmentedRadixSort
::
SortPairsDescending
(
CUDA_CALL(
hip
cub::DeviceSegmentedRadixSort::SortPairsDescending(
d_temp_storage, temp_storage_bytes, d_unique_frequency,
d_unique_src_edges, num_unique_edges, num_dst_nodes,
num_unique_each_node_alternate, num_unique_each_node_alternate + 1, 0,
...
...
@@ -398,12 +400,12 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
}
d_temp_storage = device->AllocWorkspace(_ctx, temp_storage_bytes);
if (dtype.bits == 32) {
CUDA_CALL
(
cub
::
DeviceRadixSort
::
SortPairsDescending
(
CUDA_CALL(
hip
cub::DeviceRadixSort::SortPairsDescending(
d_temp_storage, temp_storage_bytes, d_unique_frequency,
d_unique_src_edges, num_unique_edges, 0, sizeof(Idx64Type) * 8,
_stream));
} else {
CUDA_CALL
(
cub
::
DeviceSegmentedRadixSort
::
SortPairsDescending
(
CUDA_CALL(
hip
cub::DeviceSegmentedRadixSort::SortPairsDescending(
d_temp_storage, temp_storage_bytes, d_unique_frequency,
d_unique_src_edges, num_unique_edges, num_dst_nodes,
num_unique_each_node_alternate, num_unique_each_node_alternate + 1, 0,
...
...
@@ -422,11 +424,11 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
// use unique_output_offsets;
d_temp_storage = nullptr;
temp_storage_bytes = 0;
CUDA_CALL
(
cub
::
DeviceScan
::
ExclusiveSum
(
CUDA_CALL(
hip
cub::DeviceScan::ExclusiveSum(
d_temp_storage, temp_storage_bytes, num_unique_each_node,
unique_output_offsets, num_dst_nodes + 1, _stream));
d_temp_storage = device->AllocWorkspace(_ctx, temp_storage_bytes);
CUDA_CALL
(
cub
::
DeviceScan
::
ExclusiveSum
(
CUDA_CALL(
hip
cub::DeviceScan::ExclusiveSum(
d_temp_storage, temp_storage_bytes, num_unique_each_node,
unique_output_offsets, num_dst_nodes + 1, _stream));
device->FreeWorkspace(_ctx, d_temp_storage);
...
...
src/graph/sampling/randomwalks/get_node_types_gpu.
cu
→
src/graph/sampling/randomwalks/get_node_types_gpu.
hip
View file @
74d88bf8
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2021 by Contributors
* @file graph/sampling/get_node_types_gpu.cu
* @brief DGL sampler
*/
#include <
cuda
_runtime.h>
#include <
hip/hip
_runtime.h>
#include <dgl/array.h>
#include <dgl/base_heterograph.h>
#include <dgl/runtime/device_api.h>
...
...
src/graph/sampling/randomwalks/randomwalk_gpu.
cu
→
src/graph/sampling/randomwalks/randomwalk_gpu.
hip
View file @
74d88bf8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2021-2022 by Contributors
* @file graph/sampling/randomwalk_gpu.cu
* @brief CUDA random walk sampling
*/
#include <
cu
rand_kernel.h>
#include <
hiprand/hip
rand_kernel.h>
#include <dgl/array.h>
#include <dgl/base_heterograph.h>
#include <dgl/random.h>
#include <dgl/runtime/device_api.h>
#include <cub/cub.
cuh
>
#include <
hip
cub/
hip
cub.
hpp
>
#include <tuple>
#include <utility>
#include <vector>
...
...
@@ -35,6 +37,22 @@ struct GraphKernelData {
const IdType *in_cols;
const IdType *data;
};
/**
 * @brief Return a typed data pointer for an NDArray, translated through
 *        hipHostGetDevicePointer when the array is pinned.
 *
 * Starts from array.Ptr<IdType>(). When array.IsPinned() is true, that pointer
 * is handed to hipHostGetDevicePointer (flags = 0) and the pointer it reports
 * is returned instead; otherwise the pointer is returned unchanged.
 *
 * @tparam IdType Element type of the returned pointer.
 * @param array The array whose data pointer is requested.
 * @return The pointer from Ptr<IdType>(), or the pointer reported by
 *         hipHostGetDevicePointer for pinned arrays.
 *
 * @note IdType cannot be deduced from the argument, so this overload is only
 *       selected when the caller spells the template argument explicitly.
 */
template <typename IdType>
inline IdType* __GetDevicePointer(runtime::NDArray array) {
  IdType* ptr = array.Ptr<IdType>();
  if (array.IsPinned()) {
    // The HIP API takes a void** output parameter and IdType** does not
    // convert to void** implicitly, so receive the result in a void* first.
    void* device_ptr = nullptr;
    CUDA_CALL(hipHostGetDevicePointer(&device_ptr, ptr, 0));
    ptr = static_cast<IdType*>(device_ptr);
  }
  return ptr;
}
/**
 * @brief Untyped variant: return the raw data pointer of an NDArray,
 *        translated through hipHostGetDevicePointer when the array is pinned.
 *
 * @param array The array whose data pointer is requested.
 * @return array->data for non-pinned arrays; for pinned arrays, the pointer
 *         reported by hipHostGetDevicePointer (flags = 0).
 */
inline void* __GetDevicePointer(runtime::NDArray array) {
  void* host_ptr = array->data;
  // Non-pinned arrays need no translation.
  if (!array.IsPinned()) {
    return host_ptr;
  }
  void* device_ptr = host_ptr;
  CUDA_CALL(hipHostGetDevicePointer(&device_ptr, host_ptr, 0));
  return device_ptr;
}
template <typename IdType, typename FloatType, int BLOCK_SIZE, int TILE_SIZE>
__global__ void _RandomWalkKernel(
...
...
@@ -48,10 +66,10 @@ __global__ void _RandomWalkKernel(
int64_t last_idx =
min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_seeds);
int64_t trace_length = (max_num_steps + 1);
cu
randState
rng
;
hip
randState
_t
rng;
// reference:
// https://docs.nvidia.com/cuda/
cu
rand/device-api-overview.html#performance-notes
cu
rand_init
(
rand_seed
+
idx
,
0
,
0
,
&
rng
);
// https://docs.nvidia.com/cuda/curand/device-api-overview.html#performance-notes
hip
rand_init(rand_seed + idx, 0, 0, &rng);
while (idx < last_idx) {
IdType curr = seed_data[idx];
...
...
@@ -68,18 +86,18 @@ __global__ void _RandomWalkKernel(
if (deg == 0) { // the degree is zero
break;
}
const
int64_t
num
=
cu
rand
(
&
rng
)
%
deg
;
const int64_t num =
hip
rand(&rng) % deg;
IdType pick = graph.in_cols[in_row_start + num];
IdType eid =
(graph.data ? graph.data[in_row_start + num] : in_row_start + num);
*traces_data_ptr = pick;
*eids_data_ptr = eid;
if ((restart_prob_size > 1) &&
(
cu
rand_uniform
(
&
rng
)
<
restart_prob_data
[
step_idx
]))
{
(
hip
rand_uniform(&rng) < restart_prob_data[step_idx])) {
break;
} else if (
(restart_prob_size == 1) &&
(
cu
rand_uniform
(
&
rng
)
<
restart_prob_data
[
0
]))
{
(
hip
rand_uniform(&rng) < restart_prob_data[0])) {
break;
}
++traces_data_ptr;
...
...
@@ -107,10 +125,10 @@ __global__ void _RandomWalkBiasedKernel(
int64_t last_idx =
min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_seeds);
int64_t trace_length = (max_num_steps + 1);
cu
randState
rng
;
hip
randState
_t
rng;
// reference:
// https://docs.nvidia.com/cuda/
cu
rand/device-api-overview.html#performance-notes
cu
rand_init
(
rand_seed
+
idx
,
0
,
0
,
&
rng
);
// https://docs.nvidia.com/cuda/curand/device-api-overview.html#performance-notes
hip
rand_init(rand_seed + idx, 0, 0, &rng);
while (idx < last_idx) {
IdType curr = seed_data[idx];
...
...
@@ -133,9 +151,9 @@ __global__ void _RandomWalkBiasedKernel(
const FloatType *prob = probs[metapath_id];
int64_t num;
if (prob == nullptr) {
num
=
cu
rand
(
&
rng
)
%
deg
;
num =
hip
rand(&rng) % deg;
} else {
auto
rnd_sum_w
=
prob_sum
[
curr
]
*
cu
rand_uniform
(
&
rng
);
auto rnd_sum_w = prob_sum[curr] *
hip
rand_uniform(&rng);
FloatType sum_w{0.};
for (num = 0; num < deg; ++num) {
sum_w += prob[in_row_start + num];
...
...
@@ -149,11 +167,11 @@ __global__ void _RandomWalkBiasedKernel(
*traces_data_ptr = pick;
*eids_data_ptr = eid;
if ((restart_prob_size > 1) &&
(
cu
rand_uniform
(
&
rng
)
<
restart_prob_data
[
step_idx
]))
{
(
hip
rand_uniform(&rng) < restart_prob_data[step_idx])) {
break;
} else if (
(restart_prob_size == 1) &&
(
cu
rand_uniform
(
&
rng
)
<
restart_prob_data
[
0
]))
{
(
hip
rand_uniform(&rng) < restart_prob_data[0])) {
break;
}
++traces_data_ptr;
...
...
@@ -176,14 +194,17 @@ std::pair<IdArray, IdArray> RandomWalkUniform(
const HeteroGraphPtr hg, const IdArray seeds, const TypeArray metapath,
FloatArray restart_prob) {
const int64_t max_num_steps = metapath->shape[0];
const
IdType
*
metapath_data
=
static_cast
<
IdType
*>
(
metapath
->
data
);
// const IdType *metapath_data = static_cast<IdType *>(metapath->data);
const IdType *metapath_data = static_cast<const IdType *>(__GetDevicePointer(metapath));
const int64_t begin_ntype =
hg->meta_graph()->FindEdge(metapath_data[0]).first;
const int64_t max_nodes = hg->NumVertices(begin_ntype);
int64_t num_etypes = hg->NumEdgeTypes();
auto ctx = seeds->ctx;
const
IdType
*
seed_data
=
static_cast
<
const
IdType
*>
(
seeds
->
data
);
// const IdType *seed_data = static_cast<const IdType *>(seeds->data);
const IdType *seed_data = static_cast<const IdType *>(__GetDevicePointer(seeds));
// const IdType *seed_data = static_cast<const IdType *>(__GetDevicePointer(seeds));
CHECK(seeds->ndim == 1) << "seeds shape is not one dimension.";
const int64_t num_seeds = seeds->shape[0];
int64_t trace_length = max_num_steps + 1;
...
...
@@ -195,14 +216,19 @@ std::pair<IdArray, IdArray> RandomWalkUniform(
std::vector<GraphKernelData<IdType>> h_graphs(num_etypes);
for (int64_t etype = 0; etype < num_etypes; ++etype) {
const CSRMatrix &csr = hg->GetCSRMatrix(etype);
h_graphs
[
etype
].
in_ptr
=
static_cast
<
const
IdType
*>
(
csr
.
indptr
->
data
);
h_graphs
[
etype
].
in_cols
=
static_cast
<
const
IdType
*>
(
csr
.
indices
->
data
);
// h_graphs[etype].in_ptr = static_cast<const IdType *>(csr.indptr->data);
// h_graphs[etype].in_cols = static_cast<const IdType *>(csr.indices->data);
// h_graphs[etype].data =
// (CSRHasData(csr) ? static_cast<const IdType *>(csr.data->data)
// : nullptr);
h_graphs[etype].in_ptr = static_cast<const IdType *>(__GetDevicePointer(csr.indptr));
h_graphs[etype].in_cols = static_cast<const IdType *>(__GetDevicePointer(csr.indices));
h_graphs[etype].data =
(
CSRHasData
(
csr
)
?
static_cast
<
const
IdType
*>
(
csr
.
data
->
data
)
(CSRHasData(csr) ? static_cast<const IdType *>(
__GetDevicePointer(
csr.data
)
)
: nullptr);
}
// use cuda stream from local thread
cuda
Stream_t
stream
=
runtime
::
getCurrent
CUDA
Stream
();
hip
Stream_t stream = runtime::getCurrent
HIP
Stream
MasqueradingAsCUDA
();
auto device = DeviceAPI::Get(ctx);
auto d_graphs = static_cast<GraphKernelData<IdType> *>(device->AllocWorkspace(
ctx, (num_etypes) * sizeof(GraphKernelData<IdType>)));
...
...
@@ -222,10 +248,11 @@ std::pair<IdArray, IdArray> RandomWalkUniform(
const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000);
ATEN_FLOAT_TYPE_SWITCH(
restart_prob->dtype, FloatType, "random walk GPU kernel", {
CHECK
(
restart_prob
->
ctx
.
device_type
==
kDGLCUDA
)
CHECK(restart_prob->ctx.device_type == kDGLCUDA
||restart_prob->ctx.device_type == kDGLROCM
)
<< "restart prob should be in GPU.";
CHECK(restart_prob->ndim == 1) << "restart prob dimension should be 1.";
const
FloatType
*
restart_prob_data
=
restart_prob
.
Ptr
<
FloatType
>
();
// const FloatType *restart_prob_data = restart_prob.Ptr<FloatType>();
const FloatType *restart_prob_data = static_cast<const FloatType *>(__GetDevicePointer(restart_prob));
const int64_t restart_prob_size = restart_prob->shape[0];
CUDA_KERNEL_CALL(
(_RandomWalkKernel<IdType, FloatType, BLOCK_SIZE, TILE_SIZE>), grid,
...
...
@@ -247,23 +274,27 @@ std::pair<IdArray, IdArray> RandomWalkBiased(
const HeteroGraphPtr hg, const IdArray seeds, const TypeArray metapath,
const std::vector<FloatArray> &prob, FloatArray restart_prob) {
const int64_t max_num_steps = metapath->shape[0];
const
IdType
*
metapath_data
=
static_cast
<
IdType
*>
(
metapath
->
data
);
// const IdType *metapath_data = static_cast<IdType *>(metapath->data);
const IdType *metapath_data = static_cast<IdType *>(__GetDevicePointer(metapath));
const int64_t begin_ntype =
hg->meta_graph()->FindEdge(metapath_data[0]).first;
const int64_t max_nodes = hg->NumVertices(begin_ntype);
int64_t num_etypes = hg->NumEdgeTypes();
auto ctx = seeds->ctx;
const
IdType
*
seed_data
=
static_cast
<
const
IdType
*>
(
seeds
->
data
);
// const IdType *seed_data = static_cast<const IdType *>(seeds->data);
const IdType *seed_data = static_cast<const IdType *>(__GetDevicePointer(seeds));
CHECK(seeds->ndim == 1) << "seeds shape is not one dimension.";
const int64_t num_seeds = seeds->shape[0];
int64_t trace_length = max_num_steps + 1;
IdArray traces = IdArray::Empty({num_seeds, trace_length}, seeds->dtype, ctx);
IdArray eids = IdArray::Empty({num_seeds, max_num_steps}, seeds->dtype, ctx);
IdType *traces_data = traces.Ptr<IdType>();
IdType
*
eids_data
=
eids
.
Ptr
<
IdType
>
();
// IdType *traces_data = static_cast<IdType *>(__GetDevicePointer(traces));
// IdType *eids_data = eids.Ptr<IdType>();
IdType *eids_data = static_cast<IdType *>(__GetDevicePointer(eids));
cuda
Stream_t
stream
=
runtime
::
getCurrent
CUDA
Stream
();
hip
Stream_t stream = runtime::getCurrent
HIP
Stream
MasqueradingAsCUDA
();
auto device = DeviceAPI::Get(ctx);
// new probs and prob sums pointers
assert(num_etypes == static_cast<int64_t>(prob.size()));
...
...
@@ -276,10 +307,15 @@ std::pair<IdArray, IdArray> RandomWalkBiased(
std::vector<GraphKernelData<IdType>> h_graphs(num_etypes);
for (int64_t etype = 0; etype < num_etypes; ++etype) {
const CSRMatrix &csr = hg->GetCSRMatrix(etype);
h_graphs
[
etype
].
in_ptr
=
static_cast
<
const
IdType
*>
(
csr
.
indptr
->
data
);
h_graphs
[
etype
].
in_cols
=
static_cast
<
const
IdType
*>
(
csr
.
indices
->
data
);
// h_graphs[etype].in_ptr = static_cast<const IdType *>(csr.indptr->data);
// h_graphs[etype].in_cols = static_cast<const IdType *>(csr.indices->data);
// h_graphs[etype].data =
// (CSRHasData(csr) ? static_cast<const IdType *>(csr.data->data)
// : nullptr);
h_graphs[etype].in_ptr = static_cast<const IdType *>(__GetDevicePointer(csr.indptr));
h_graphs[etype].in_cols = static_cast<const IdType *>(__GetDevicePointer(csr.indices));
h_graphs[etype].data =
(
CSRHasData
(
csr
)
?
static_cast
<
const
IdType
*>
(
csr
.
data
->
data
)
(CSRHasData(csr) ? static_cast<const IdType *>(
__GetDevicePointer(
csr.data
)
)
: nullptr);
int64_t num_segments = csr.indptr->shape[0] - 1;
...
...
@@ -289,19 +325,22 @@ std::pair<IdArray, IdArray> RandomWalkBiased(
prob_sums[etype] = nullptr;
continue;
}
probs
[
etype
]
=
prob
[
etype
].
Ptr
<
FloatType
>
();
// probs[etype] = prob[etype].Ptr<FloatType>();
probs[etype] = static_cast<FloatType *>(__GetDevicePointer(prob[etype]));
prob_sums_arr.push_back(
FloatArray::Empty({num_segments}, prob[etype]->dtype, ctx));
prob_sums
[
etype
]
=
prob_sums_arr
[
etype
].
Ptr
<
FloatType
>
();
// prob_sums[etype] = prob_sums_arr[etype].Ptr<FloatType>();
prob_sums[etype] = static_cast<FloatType *>(__GetDevicePointer(prob_sums_arr[etype]));
// calculate the sum of the neighbor weights
const
IdType
*
d_offsets
=
static_cast
<
const
IdType
*>
(
csr
.
indptr
->
data
);
// const IdType *d_offsets = static_cast<const IdType *>(csr.indptr->data);
const IdType *d_offsets = static_cast<const IdType *>(__GetDevicePointer(csr.indptr));
size_t temp_storage_size = 0;
CUDA_CALL
(
cub
::
DeviceSegmentedReduce
::
Sum
(
CUDA_CALL(
hip
cub::DeviceSegmentedReduce::Sum(
nullptr, temp_storage_size, probs[etype], prob_sums[etype],
num_segments, d_offsets, d_offsets + 1, stream));
void *temp_storage = device->AllocWorkspace(ctx, temp_storage_size);
CUDA_CALL
(
cub
::
DeviceSegmentedReduce
::
Sum
(
CUDA_CALL(
hip
cub::DeviceSegmentedReduce::Sum(
temp_storage, temp_storage_size, probs[etype], prob_sums[etype],
num_segments, d_offsets, d_offsets + 1, stream));
device->FreeWorkspace(ctx, temp_storage);
...
...
@@ -328,18 +367,20 @@ std::pair<IdArray, IdArray> RandomWalkBiased(
DGLContext{kDGLCPU, 0}, ctx, prob[0]->dtype);
// copy metapath to GPU
auto d_metapath = metapath.CopyTo(ctx);
const
IdType
*
d_metapath_data
=
static_cast
<
IdType
*>
(
d_metapath
->
data
);
// const IdType *d_metapath_data = static_cast<IdType *>(d_metapath->data);
const IdType *d_metapath_data = static_cast<IdType *>(__GetDevicePointer(d_metapath));
constexpr int BLOCK_SIZE = 256;
constexpr int TILE_SIZE = BLOCK_SIZE * 4;
dim3 block(256);
dim3 grid((num_seeds + TILE_SIZE - 1) / TILE_SIZE);
const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000);
CHECK
(
restart_prob
->
ctx
.
device_type
==
kDGLCUDA
)
CHECK(restart_prob->ctx.device_type == kDGLCUDA
||restart_prob->ctx.device_type == kDGLROCM
)
<< "restart prob should be in GPU.";
CHECK(restart_prob->ndim == 1) << "restart prob dimension should be 1.";
const
FloatType
*
restart_prob_data
=
restart_prob
.
Ptr
<
FloatType
>
();
const
int64_t
restart_prob_size
=
restart_prob
->
shape
[
0
];
// const FloatType *restart_prob_data = restart_prob.Ptr<FloatType>();
const FloatType *restart_prob_data = static_cast<const FloatType *>(__GetDevicePointer(restart_prob));
const int64_t restart_prob_size = restart_prob->shape[0];
CUDA_KERNEL_CALL(
(_RandomWalkBiasedKernel<IdType, FloatType, BLOCK_SIZE, TILE_SIZE>), grid,
block, 0, stream, random_seed, seed_data, num_seeds, d_metapath_data,
...
...
@@ -396,7 +437,7 @@ std::pair<IdArray, IdArray> RandomWalkWithRestart(
auto device = dgl::runtime::DeviceAPI::Get(device_ctx);
// use cuda stream from local thread
cuda
Stream_t
stream
=
runtime
::
getCurrent
CUDA
Stream
();
hip
Stream_t stream = runtime::getCurrent
HIP
Stream
MasqueradingAsCUDA
();
device->CopyDataFromTo(
&restart_prob, 0, restart_prob_array.Ptr<double>(), 0, sizeof(double),
DGLContext{kDGLCPU, 0}, device_ctx, restart_prob_array->dtype);
...
...
@@ -443,13 +484,15 @@ template <DGLDeviceType XPU, typename IdxType>
std::tuple<IdArray, IdArray, IdArray> SelectPinSageNeighbors(
const IdArray src, const IdArray dst, const int64_t num_samples_per_node,
const int64_t k) {
CHECK
(
src
->
ctx
.
device_type
==
kDGLCUDA
)
<<
"IdArray needs be on GPU!"
;
const
IdxType
*
src_data
=
src
.
Ptr
<
IdxType
>
();
const
IdxType
*
dst_data
=
dst
.
Ptr
<
IdxType
>
();
CHECK(src->ctx.device_type == kDGLCUDA || src->ctx.device_type == kDGLROCM) << "IdArray needs be on GPU!";
// const IdxType *src_data = src.Ptr<IdxType>();
const IdxType *src_data = static_cast<IdxType*>(__GetDevicePointer(src));
// const IdxType *dst_data = dst.Ptr<IdxType>();
const IdxType *dst_data = static_cast<IdxType*>(__GetDevicePointer(dst));
const int64_t num_dst_nodes = (dst->shape[0] / num_samples_per_node);
auto ctx = src->ctx;
// use cuda stream from local thread
cuda
Stream_t
stream
=
runtime
::
getCurrent
CUDA
Stream
();
hip
Stream_t stream = runtime::getCurrent
HIP
Stream
MasqueradingAsCUDA
();
auto frequency_hashmap = FrequencyHashmap<IdxType>(
num_dst_nodes, num_samples_per_node, ctx, stream);
auto ret = frequency_hashmap.Topk(
...
...
Prev
1
…
3
4
5
6
7
8
9
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment