Commit 74d88bf8 authored by sangwz

Merge branch 'dtk25.04' of http://developer.sourcefind.cn/codes/OpenDAS/dgl into 2.2.1

parents 2a1ac588 314cedc1
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2020 by Contributors
* @file array/cuda/utils.cu
* @brief Utilities for CUDA kernels.
*/
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
 #include "../../runtime/cuda/cuda_common.h"
-#include "./utils.h"
+#include "utils.h"
 namespace dgl {
 namespace cuda {
@@ -17,11 +19,11 @@ bool AllTrue(int8_t* flags, int64_t length, const DGLContext& ctx) {
   int8_t* rst = static_cast<int8_t*>(device->AllocWorkspace(ctx, 1));
   // Call CUB's reduction
   size_t workspace_size = 0;
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
-  CUDA_CALL(cub::DeviceReduce::Min(
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
+  CUDA_CALL(hipcub::DeviceReduce::Min(
       nullptr, workspace_size, flags, rst, length, stream));
   void* workspace = device->AllocWorkspace(ctx, workspace_size);
-  CUDA_CALL(cub::DeviceReduce::Min(
+  CUDA_CALL(hipcub::DeviceReduce::Min(
       workspace, workspace_size, flags, rst, length, stream));
   int8_t cpu_rst = GetCUDAScalar(device, ctx, rst);
   device->FreeWorkspace(ctx, workspace);
......
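The AllTrue hunk above is the standard CUB two-phase workspace idiom, unchanged under hipcub: call the reduction once with a null workspace pointer to query the temporary-storage size, allocate, then call it again to do the work. A minimal standalone sketch of the same idiom (illustrative only; raw hip* allocation calls stand in for DGL's DeviceAPI workspace helpers, and error checking is omitted):

```cpp
#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>
#include <cstdio>

int main() {
  const int n = 4;
  int8_t h_flags[n] = {1, 1, 0, 1};
  int8_t *d_flags, *d_min;
  hipMalloc(&d_flags, n);
  hipMalloc(&d_min, 1);
  hipMemcpy(d_flags, h_flags, n, hipMemcpyHostToDevice);

  void* workspace = nullptr;
  size_t workspace_size = 0;
  // Pass 1: size query only (workspace == nullptr).
  hipcub::DeviceReduce::Min(workspace, workspace_size, d_flags, d_min, n);
  hipMalloc(&workspace, workspace_size);
  // Pass 2: the actual reduction.
  hipcub::DeviceReduce::Min(workspace, workspace_size, d_flags, d_min, n);

  int8_t h_min;
  hipMemcpy(&h_min, d_min, 1, hipMemcpyDeviceToHost);
  printf("min flag = %d\n", h_min);  // 0 => not all flags were true
  hipFree(workspace);
  hipFree(d_min);
  hipFree(d_flags);
  return 0;
}
```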
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2021 by Contributors
* @file array/cpu/array_index_select_uvm.cuh
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2019-2022 by Contributors
* @file array/cuda/uvm/array_index_select_uvm.cu
@@ -8,7 +10,7 @@
 #include "../../../runtime/cuda/cuda_common.h"
 #include "../array_index_select.cuh"
 #include "../utils.h"
-#include "./array_index_select_uvm.cuh"
+#include "array_index_select_uvm.cuh"
 namespace dgl {
 using runtime::NDArray;
@@ -17,7 +19,7 @@ namespace impl {
 template <typename DType, typename IdType>
 NDArray IndexSelectCPUFromGPU(NDArray array, IdArray index) {
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   const int64_t arr_len = array->shape[0];
   const int64_t len = index->shape[0];
   int64_t num_feat = 1;
@@ -78,7 +80,7 @@ template NDArray IndexSelectCPUFromGPU<int64_t, int64_t>(NDArray, IdArray);
 template <typename DType, typename IdType>
 void IndexScatterGPUToCPU(NDArray dest, IdArray index, NDArray source) {
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   const DType* source_data = static_cast<DType*>(source->data);
   const IdType* idx_data = static_cast<IdType*>(index->data);
   const int64_t arr_len = dest->shape[0];
......
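The UVM hunks above keep DGL's pattern of launching a gather kernel on the current stream while the feature array itself stays in pinned host memory. A minimal zero-copy sketch of that idea, with a hypothetical `gather` kernel standing in for DGL's IndexSelect kernels (error checks omitted):

```cpp
#include <hip/hip_runtime.h>
#include <cstdint>
#include <cstdio>

// Gather rows from a pinned, host-resident array directly inside a GPU
// kernel (zero-copy), the core idea behind IndexSelectCPUFromGPU.
__global__ void gather(const float* src, const int64_t* idx, float* out,
                       int64_t len) {
  int64_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < len) out[i] = src[idx[i]];
}

int main() {
  float* h_src;  // pinned host buffer, mapped into the device address space
  hipHostMalloc(&h_src, 8 * sizeof(float), hipHostMallocMapped);
  for (int i = 0; i < 8; ++i) h_src[i] = 10.0f * i;
  float* d_src;  // device alias of the pinned buffer
  hipHostGetDevicePointer(reinterpret_cast<void**>(&d_src), h_src, 0);

  const int64_t len = 3;
  int64_t h_idx[len] = {7, 0, 3};
  int64_t* d_idx;
  float* d_out;
  hipMalloc(&d_idx, len * sizeof(int64_t));
  hipMalloc(&d_out, len * sizeof(float));
  hipMemcpy(d_idx, h_idx, len * sizeof(int64_t), hipMemcpyHostToDevice);

  hipLaunchKernelGGL(gather, dim3(1), dim3(64), 0, 0, d_src, d_idx, d_out, len);

  float h_out[len];
  hipMemcpy(h_out, d_out, len * sizeof(float), hipMemcpyDeviceToHost);
  printf("%g %g %g\n", h_out[0], h_out[1], h_out[2]);  // 70 0 30
  hipFree(d_out);
  hipFree(d_idx);
  hipHostFree(h_src);
  return 0;
}
```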
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2021 by Contributors
* @file array/filter.cc
* @brief Object for selecting items in a set, or selecting items not in a set.
*/
#include "./filter.h"
#include "filter.h"
#include <dgl/packed_func_ext.h>
#include <dgl/runtime/packed_func.h>
......@@ -23,7 +24,7 @@ DGL_REGISTER_GLOBAL("utils.filter._CAPI_DGLFilterCreateFromSet")
IdArray array = args[0];
auto ctx = array->ctx;
// TODO(nv-dlasalle): Implement CPU version.
if (ctx.device_type == kDGLCUDA) {
if (ctx.device_type == kDGLCUDA|| ctx.device_type == kDGLROCM) {
#ifdef DGL_USE_CUDA
ATEN_ID_TYPE_SWITCH(array->dtype, IdType, {
*rv = CreateSetFilter<kDGLCUDA, IdType>(array);
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file array/kernel.cc
@@ -7,7 +8,7 @@
 #include <dgl/packed_func_ext.h>
 #include "../c_api_common.h"
-#include "./check.h"
+#include "check.h"
 #include "kernel_decl.h"
using namespace dgl::runtime;
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2021 Intel Corporation
*
@@ -21,7 +22,7 @@
 #include <vector>
 #include "../c_api_common.h"
-#include "./check.h"
+#include "check.h"
 #include "kernel_decl.h"
using namespace dgl::runtime;
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file array/selector.h
@@ -12,13 +13,13 @@ namespace dgl {
 namespace {
-#ifdef __CUDACC__
-#define DGLDEVICE __device__
+#ifdef __HIPCC__
+#define DGLDEVICE __device__ __host__
 #define DGLINLINE __forceinline__
 #else
 #define DGLDEVICE
 #define DGLINLINE inline
-#endif  // __CUDACC__
+#endif  // __HIPCC__
} // namespace
......
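With the hipified guard, DGLDEVICE expands to `__device__ __host__` whenever the translation unit is compiled by hipcc (`__HIPCC__` defined), so the same helper becomes callable both from kernels and from host code; in host-only builds both macros collapse to plain inline functions. A hedged sketch of how a Selector-style helper uses these macros (`SelectLeft` is a made-up stand-in, not DGL's actual Selector):

```cpp
#ifdef __HIPCC__
#define DGLDEVICE __device__ __host__
#define DGLINLINE __forceinline__
#else
#define DGLDEVICE
#define DGLINLINE inline
#endif  // __HIPCC__

// Usable inside a HIP kernel and in plain host code alike.
template <typename T>
DGLDEVICE DGLINLINE T SelectLeft(T lhs, T /*rhs*/) {
  return lhs;
}
```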
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2019-2022 by Contributors
* @file array/uvm_array.cc
@@ -8,7 +9,7 @@
 #include <sstream>
 #include "../c_api_common.h"
-#include "./uvm_array_op.h"
+#include "uvm_array_op.h"
using namespace dgl::runtime;
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2019 by Contributors
* @file geometry/cuda/edge_coarsening_impl.cu
* @brief Edge coarsening CUDA implementation
*/
-#include <curand_kernel.h>
+#include <hiprand/hiprand_kernel.h>
 #include <dgl/array.h>
 #include <dgl/random.h>
 #include <dmlc/thread_local.h>
@@ -32,9 +34,9 @@ __global__ void generate_uniform_kernel(
     float *ret_values, size_t num, uint64_t seed) {
   size_t id = blockIdx.x * blockDim.x + threadIdx.x;
   if (id < num) {
-    curandState state;
-    curand_init(seed, id, 0, &state);
-    ret_values[id] = curand_uniform(&state);
+    hiprandState_t state;
+    hiprand_init(seed, id, 0, &state);
+    ret_values[id] = hiprand_uniform(&state);
   }
 }
@@ -116,7 +118,7 @@ __global__ void weighted_respond_kernel(
 template <typename IdType>
 bool Colorize(IdType *result_data, int64_t num_nodes, float *const prop) {
   // initial done signal
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   CUDA_KERNEL_CALL(init_done_kernel, 1, 1, 0, stream);
   // generate color prop for each node
@@ -132,8 +134,8 @@ bool Colorize(IdType *result_data, int64_t num_nodes, float *const prop) {
       colorize_kernel, num_blocks, num_threads, 0, stream, prop, num_nodes,
       result_data);
   bool done_h = false;
-  CUDA_CALL(cudaMemcpyFromSymbol(
-      &done_h, done_d, sizeof(done_h), 0, cudaMemcpyDeviceToHost));
+  CUDA_CALL(hipMemcpyFromSymbol(
+      &done_h, done_d, sizeof(done_h), 0, hipMemcpyDeviceToHost));
   return done_h;
 }
@@ -155,7 +157,7 @@ bool Colorize(IdType *result_data, int64_t num_nodes, float *const prop) {
 template <DGLDeviceType XPU, typename FloatType, typename IdType>
 void WeightedNeighborMatching(
     const aten::CSRMatrix &csr, const NDArray weight, IdArray result) {
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   const auto &ctx = result->ctx;
   auto device = runtime::DeviceAPI::Get(ctx);
   device->SetDevice(ctx);
@@ -216,7 +218,7 @@ void NeighborMatching(const aten::CSRMatrix &csr, IdArray result) {
   device->SetDevice(ctx);
   // generate random weights
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   NDArray weight = NDArray::Empty(
       {num_edges}, DGLDataType{kDGLFloat, sizeof(float) * 8, 1}, ctx);
   float *weight_data = static_cast<float *>(weight->data);
......
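The coarsening kernels keep cuRAND's per-thread pattern under hipRAND: each thread seeds its own generator state with a shared seed and its global thread id as the subsequence, then draws from it. A self-contained sketch of the same pattern (launch parameters are illustrative):

```cpp
#include <hip/hip_runtime.h>
#include <hiprand/hiprand_kernel.h>

// One RNG state per thread: identical seed, thread id as the subsequence,
// the same recipe as generate_uniform_kernel above.
__global__ void generate_uniform(float* out, size_t num, uint64_t seed) {
  size_t id = blockIdx.x * blockDim.x + threadIdx.x;
  if (id < num) {
    hiprandState_t state;
    hiprand_init(seed, id, 0, &state);  // (seed, subsequence, offset, state)
    out[id] = hiprand_uniform(&state);  // uniform float in (0, 1]
  }
}
```

A launch on the current stream would look like `hipLaunchKernelGGL(generate_uniform, dim3((num + 255) / 256), dim3(256), 0, stream, d_out, num, seed);`.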
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2019 by Contributors
* @file geometry/cuda/geometry_op_impl.cc
@@ -95,7 +97,7 @@ template <DGLDeviceType XPU, typename FloatType, typename IdType>
 void FarthestPointSampler(
     NDArray array, int64_t batch_size, int64_t sample_points, NDArray dist,
     IdArray start_idx, IdArray result) {
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   const FloatType* array_data = static_cast<FloatType*>(array->data);
@@ -110,7 +112,7 @@ void FarthestPointSampler(
   // sample for each cloud in the batch
   IdType* start_idx_data = static_cast<IdType*>(start_idx->data);
-  CUDA_CALL(cudaSetDevice(array->ctx.device_id));
+  CUDA_CALL(hipSetDevice(array->ctx.device_id));
   CUDA_KERNEL_CALL(
       fps_kernel, batch_size, THREADS, 0, stream, array_data, batch_size,
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2019 by Contributors
* @file geometry/geometry.cc
@@ -10,7 +11,7 @@
 #include "../array/check.h"
 #include "../c_api_common.h"
-#include "./geometry_op.h"
+#include "geometry_op.h"
using namespace dgl::runtime;
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file graph/creators.cc
* @brief Functions for constructing graphs.
*/
#include "./heterograph.h"
#include "heterograph.h"
using namespace dgl::runtime;
namespace dgl {
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2019 by Contributors
* @file graph/heterograph.cc
* @brief Heterograph implementation
*/
#include "./heterograph.h"
#include "heterograph.h"
#include <dgl/array.h>
#include <dgl/graph_serializer.h>
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2019 by Contributors
* @file graph/heterograph.h
@@ -18,7 +19,7 @@
 #include <utility>
 #include <vector>
-#include "./unit_graph.h"
+#include "unit_graph.h"
 #include "shared_mem_manager.h"
namespace dgl {
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file graph/heterograph_capi.cc
@@ -14,7 +15,7 @@
 #include <set>
 #include "../c_api_common.h"
-#include "./heterograph.h"
+#include "heterograph.h"
 #include "unit_graph.h"
using namespace dgl::runtime;
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file graph/pickle.cc
@@ -10,7 +11,7 @@
 #include <dmlc/memory_io.h>
 #include "../c_api_common.h"
-#include "./heterograph.h"
+#include "heterograph.h"
 #include "unit_graph.h"
using namespace dgl::runtime;
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2021 by Contributors
* @file graph/sampling/frequency_hashmap.cuh
@@ -56,7 +58,7 @@ class FrequencyHashmap {
   FrequencyHashmap() = delete;
   FrequencyHashmap(
       int64_t num_dst, int64_t num_items_each_dst, DGLContext ctx,
-      cudaStream_t stream, int64_t edge_table_scale = kDefaultEdgeTableScale);
+      hipStream_t stream, int64_t edge_table_scale = kDefaultEdgeTableScale);
   ~FrequencyHashmap();
   using EdgeItem = typename DeviceEdgeHashmap<IdxType>::EdgeItem;
   std::tuple<IdArray, IdArray, IdArray> Topk(
@@ -66,7 +68,7 @@ class FrequencyHashmap {
  private:
   DGLContext _ctx;
-  cudaStream_t _stream;
+  hipStream_t _stream;
   DeviceEdgeHashmap<IdxType> *_device_edge_hashmap;
   IdxType *_dst_unique_edges;
   EdgeItem *_edge_hashmap;
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2021 by Contributors
* @file graph/sampling/frequency_hashmap.cu
@@ -5,7 +7,7 @@
  */
 #include <algorithm>
-#include <cub/cub.cuh>  // NOLINT
+#include <hipcub/hipcub.hpp>  // NOLINT
 #include <tuple>
 #include <utility>
@@ -71,7 +73,7 @@ __global__ void _count_frequency(
     }
   }
-  using BlockReduce = typename cub::BlockReduce<IdxType, BLOCK_SIZE>;
+  using BlockReduce = typename hipcub::BlockReduce<IdxType, BLOCK_SIZE>;
   __shared__ typename BlockReduce::TempStorage temp_space;
   count = BlockReduce(temp_space).Sum(count);
@@ -112,7 +114,7 @@ __global__ void _compact_frequency(
   int64_t last_idx = start_idx + TILE_SIZE;
   const IdxType block_offset = edge_blocks_prefix[blockIdx.x];
-  using BlockScan = typename cub::BlockScan<IdxType, BLOCK_SIZE>;
+  using BlockScan = typename hipcub::BlockScan<IdxType, BLOCK_SIZE>;
   __shared__ typename BlockScan::TempStorage temp_space;
   BlockPrefixCallbackOp<IdxType> prefix_op(0);
@@ -246,7 +248,7 @@ inline __device__ IdxType DeviceEdgeHashmap<IdxType>::GetEdgeCount(
 template <typename IdxType>
 FrequencyHashmap<IdxType>::FrequencyHashmap(
     int64_t num_dst, int64_t num_items_each_dst, DGLContext ctx,
-    cudaStream_t stream, int64_t edge_table_scale) {
+    hipStream_t stream, int64_t edge_table_scale) {
   _ctx = ctx;
   _stream = stream;
   num_items_each_dst = _table_size(num_items_each_dst, edge_table_scale);
@@ -259,7 +261,7 @@ FrequencyHashmap<IdxType>::FrequencyHashmap(
   constexpr int TILE_SIZE = BLOCK_SIZE * 8;
   dim3 block(BLOCK_SIZE);
   dim3 grid((num_dst * num_items_each_dst + TILE_SIZE - 1) / TILE_SIZE);
-  CUDA_CALL(cudaMemset(dst_unique_edges, 0, (num_dst) * sizeof(IdxType)));
+  CUDA_CALL(hipMemset(dst_unique_edges, 0, (num_dst) * sizeof(IdxType)));
   CUDA_KERNEL_CALL(
       (_init_edge_table<IdxType, BLOCK_SIZE, TILE_SIZE>), grid, block, 0,
       _stream, edge_hashmap, (num_dst * num_items_each_dst));
@@ -300,7 +302,7 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
   // _edge_hashmap
   bool *is_first_position = static_cast<bool *>(
       device->AllocWorkspace(_ctx, sizeof(bool) * (num_edges)));
-  CUDA_CALL(cudaMemset(is_first_position, 0, sizeof(bool) * (num_edges)));
+  CUDA_CALL(hipMemset(is_first_position, 0, sizeof(bool) * (num_edges)));
   // double space to use ExclusiveSum
   auto edge_blocks_prefix_data = static_cast<IdxType *>(device->AllocWorkspace(
       _ctx, 2 * sizeof(IdxType) * (num_edge_blocks + 1)));
@@ -327,11 +329,11 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
   // 2.1 ExclusiveSum the edge_blocks_prefix
   void *d_temp_storage = nullptr;
   size_t temp_storage_bytes = 0;
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
       d_temp_storage, temp_storage_bytes, edge_blocks_prefix,
       edge_blocks_prefix_alternate, num_edge_blocks + 1, _stream));
   d_temp_storage = device->AllocWorkspace(_ctx, temp_storage_bytes);
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
       d_temp_storage, temp_storage_bytes, edge_blocks_prefix,
       edge_blocks_prefix_alternate, num_edge_blocks + 1, _stream));
   device->FreeWorkspace(_ctx, d_temp_storage);
@@ -365,19 +367,19 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
   // 3.1 ExclusiveSum the num_unique_each_node
   d_temp_storage = nullptr;
   temp_storage_bytes = 0;
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
       d_temp_storage, temp_storage_bytes, num_unique_each_node,
      num_unique_each_node_alternate, num_dst_nodes + 1, _stream));
   d_temp_storage = device->AllocWorkspace(_ctx, temp_storage_bytes);
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
       d_temp_storage, temp_storage_bytes, num_unique_each_node,
      num_unique_each_node_alternate, num_dst_nodes + 1, _stream));
   device->FreeWorkspace(_ctx, d_temp_storage);
   // 3.2 SegmentedRadixSort the unique_src_edges and unique_frequency
   // Create a set of DoubleBuffers to wrap pairs of device pointers
-  cub::DoubleBuffer<Idx64Type> d_unique_frequency(
+  hipcub::DoubleBuffer<Idx64Type> d_unique_frequency(
       unique_frequency, unique_frequency_alternate);
-  cub::DoubleBuffer<IdxType> d_unique_src_edges(
+  hipcub::DoubleBuffer<IdxType> d_unique_src_edges(
       unique_src_edges, unique_src_edges_alternate);
   // Determine temporary device storage requirements
   d_temp_storage = nullptr;
@@ -385,12 +387,12 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
   // the DeviceRadixSort is faster than DeviceSegmentedRadixSort,
   // especially when num_dst_nodes is large (about ~10000)
   if (dtype.bits == 32) {
-    CUDA_CALL(cub::DeviceRadixSort::SortPairsDescending(
+    CUDA_CALL(hipcub::DeviceRadixSort::SortPairsDescending(
         d_temp_storage, temp_storage_bytes, d_unique_frequency,
        d_unique_src_edges, num_unique_edges, 0, sizeof(Idx64Type) * 8,
        _stream));
   } else {
-    CUDA_CALL(cub::DeviceSegmentedRadixSort::SortPairsDescending(
+    CUDA_CALL(hipcub::DeviceSegmentedRadixSort::SortPairsDescending(
         d_temp_storage, temp_storage_bytes, d_unique_frequency,
        d_unique_src_edges, num_unique_edges, num_dst_nodes,
        num_unique_each_node_alternate, num_unique_each_node_alternate + 1, 0,
@@ -398,12 +400,12 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
   }
   d_temp_storage = device->AllocWorkspace(_ctx, temp_storage_bytes);
   if (dtype.bits == 32) {
-    CUDA_CALL(cub::DeviceRadixSort::SortPairsDescending(
+    CUDA_CALL(hipcub::DeviceRadixSort::SortPairsDescending(
         d_temp_storage, temp_storage_bytes, d_unique_frequency,
        d_unique_src_edges, num_unique_edges, 0, sizeof(Idx64Type) * 8,
        _stream));
   } else {
-    CUDA_CALL(cub::DeviceSegmentedRadixSort::SortPairsDescending(
+    CUDA_CALL(hipcub::DeviceSegmentedRadixSort::SortPairsDescending(
         d_temp_storage, temp_storage_bytes, d_unique_frequency,
        d_unique_src_edges, num_unique_edges, num_dst_nodes,
        num_unique_each_node_alternate, num_unique_each_node_alternate + 1, 0,
@@ -422,11 +424,11 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
   // use unique_output_offsets;
   d_temp_storage = nullptr;
   temp_storage_bytes = 0;
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
       d_temp_storage, temp_storage_bytes, num_unique_each_node,
      unique_output_offsets, num_dst_nodes + 1, _stream));
   d_temp_storage = device->AllocWorkspace(_ctx, temp_storage_bytes);
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
       d_temp_storage, temp_storage_bytes, num_unique_each_node,
      unique_output_offsets, num_dst_nodes + 1, _stream));
   device->FreeWorkspace(_ctx, d_temp_storage);
......
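The Topk path above leans on hipcub's DoubleBuffer so DeviceRadixSort can ping-pong keys and values between two device arrays without extra copies, again using the two-phase size query. A minimal sketch of that pattern outside DGL (raw hipMalloc in place of AllocWorkspace, error checks omitted):

```cpp
#include <hipcub/hipcub.hpp>
#include <cstdio>

int main() {
  const int n = 4;
  uint64_t h_keys[n] = {3, 9, 1, 7};
  int h_vals[n] = {30, 90, 10, 70};
  uint64_t *d_k0, *d_k1;
  int *d_v0, *d_v1;
  hipMalloc(&d_k0, n * sizeof(uint64_t));
  hipMalloc(&d_k1, n * sizeof(uint64_t));
  hipMalloc(&d_v0, n * sizeof(int));
  hipMalloc(&d_v1, n * sizeof(int));
  hipMemcpy(d_k0, h_keys, n * sizeof(uint64_t), hipMemcpyHostToDevice);
  hipMemcpy(d_v0, h_vals, n * sizeof(int), hipMemcpyHostToDevice);

  // DoubleBuffer tracks which of the two arrays currently holds valid data.
  hipcub::DoubleBuffer<uint64_t> keys(d_k0, d_k1);
  hipcub::DoubleBuffer<int> vals(d_v0, d_v1);

  void* tmp = nullptr;
  size_t tmp_bytes = 0;
  // Pass 1: size query; pass 2: descending key/value radix sort.
  hipcub::DeviceRadixSort::SortPairsDescending(tmp, tmp_bytes, keys, vals, n);
  hipMalloc(&tmp, tmp_bytes);
  hipcub::DeviceRadixSort::SortPairsDescending(tmp, tmp_bytes, keys, vals, n);

  uint64_t out_k[n];
  int out_v[n];
  hipMemcpy(out_k, keys.Current(), n * sizeof(uint64_t), hipMemcpyDeviceToHost);
  hipMemcpy(out_v, vals.Current(), n * sizeof(int), hipMemcpyDeviceToHost);
  for (int i = 0; i < n; ++i)
    printf("%llu:%d ", (unsigned long long)out_k[i], out_v[i]);
  // prints 9:90 7:70 3:30 1:10
  hipFree(tmp);
  hipFree(d_v1);
  hipFree(d_v0);
  hipFree(d_k1);
  hipFree(d_k0);
  return 0;
}
```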
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2021 by Contributors
* @file graph/sampling/get_node_types_gpu.cu
* @brief DGL sampler
*/
-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
#include <dgl/array.h>
#include <dgl/base_heterograph.h>
#include <dgl/runtime/device_api.h>
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2021-2022 by Contributors
* @file graph/sampling/randomwalk_gpu.cu
* @brief CUDA random walk sampling
*/
-#include <curand_kernel.h>
+#include <hiprand/hiprand_kernel.h>
 #include <dgl/array.h>
 #include <dgl/base_heterograph.h>
 #include <dgl/random.h>
 #include <dgl/runtime/device_api.h>
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
#include <tuple>
#include <utility>
#include <vector>
@@ -35,6 +37,22 @@ struct GraphKernelData {
   const IdType *in_cols;
   const IdType *data;
 };
+template <typename IdType>
+inline IdType* __GetDevicePointer(runtime::NDArray array) {
+  IdType* ptr = array.Ptr<IdType>();
+  if (array.IsPinned()) {
+    CUDA_CALL(hipHostGetDevicePointer(&ptr, ptr, 0));
+  }
+  return ptr;
+}
+inline void* __GetDevicePointer(runtime::NDArray array) {
+  void* ptr = array->data;
+  if (array.IsPinned()) {
+    CUDA_CALL(hipHostGetDevicePointer(&ptr, ptr, 0));
+  }
+  return ptr;
+}
template <typename IdType, typename FloatType, int BLOCK_SIZE, int TILE_SIZE>
__global__ void _RandomWalkKernel(
@@ -48,10 +66,10 @@ __global__ void _RandomWalkKernel(
   int64_t last_idx =
       min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_seeds);
   int64_t trace_length = (max_num_steps + 1);
-  curandState rng;
+  hiprandState_t rng;
   // reference:
-  // https://docs.nvidia.com/cuda/curand/device-api-overview.html#performance-notes
-  curand_init(rand_seed + idx, 0, 0, &rng);
+  // https://docs.nvidia.com/cuda/hiprand/device-api-overview.html#performance-notes
+  hiprand_init(rand_seed + idx, 0, 0, &rng);
   while (idx < last_idx) {
     IdType curr = seed_data[idx];
@@ -68,18 +86,18 @@ __global__ void _RandomWalkKernel(
       if (deg == 0) {  // the degree is zero
         break;
       }
-      const int64_t num = curand(&rng) % deg;
+      const int64_t num = hiprand(&rng) % deg;
       IdType pick = graph.in_cols[in_row_start + num];
       IdType eid =
           (graph.data ? graph.data[in_row_start + num] : in_row_start + num);
       *traces_data_ptr = pick;
       *eids_data_ptr = eid;
       if ((restart_prob_size > 1) &&
-          (curand_uniform(&rng) < restart_prob_data[step_idx])) {
+          (hiprand_uniform(&rng) < restart_prob_data[step_idx])) {
         break;
       } else if (
           (restart_prob_size == 1) &&
-          (curand_uniform(&rng) < restart_prob_data[0])) {
+          (hiprand_uniform(&rng) < restart_prob_data[0])) {
         break;
       }
       ++traces_data_ptr;
@@ -107,10 +125,10 @@ __global__ void _RandomWalkBiasedKernel(
   int64_t last_idx =
       min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_seeds);
   int64_t trace_length = (max_num_steps + 1);
-  curandState rng;
+  hiprandState_t rng;
   // reference:
-  // https://docs.nvidia.com/cuda/curand/device-api-overview.html#performance-notes
-  curand_init(rand_seed + idx, 0, 0, &rng);
+  // https://docs.nvidia.com/cuda/hiprand/device-api-overview.html#performance-notes
+  hiprand_init(rand_seed + idx, 0, 0, &rng);
   while (idx < last_idx) {
     IdType curr = seed_data[idx];
@@ -133,9 +151,9 @@ __global__ void _RandomWalkBiasedKernel(
       const FloatType *prob = probs[metapath_id];
       int64_t num;
       if (prob == nullptr) {
-        num = curand(&rng) % deg;
+        num = hiprand(&rng) % deg;
       } else {
-        auto rnd_sum_w = prob_sum[curr] * curand_uniform(&rng);
+        auto rnd_sum_w = prob_sum[curr] * hiprand_uniform(&rng);
         FloatType sum_w{0.};
         for (num = 0; num < deg; ++num) {
           sum_w += prob[in_row_start + num];
@@ -149,11 +167,11 @@ __global__ void _RandomWalkBiasedKernel(
       *traces_data_ptr = pick;
       *eids_data_ptr = eid;
       if ((restart_prob_size > 1) &&
-          (curand_uniform(&rng) < restart_prob_data[step_idx])) {
+          (hiprand_uniform(&rng) < restart_prob_data[step_idx])) {
        break;
      } else if (
          (restart_prob_size == 1) &&
-          (curand_uniform(&rng) < restart_prob_data[0])) {
+          (hiprand_uniform(&rng) < restart_prob_data[0])) {
        break;
      }
      ++traces_data_ptr;
@@ -176,14 +194,17 @@ std::pair<IdArray, IdArray> RandomWalkUniform(
     const HeteroGraphPtr hg, const IdArray seeds, const TypeArray metapath,
     FloatArray restart_prob) {
   const int64_t max_num_steps = metapath->shape[0];
-  const IdType *metapath_data = static_cast<IdType *>(metapath->data);
+  // const IdType *metapath_data = static_cast<IdType *>(metapath->data);
+  const IdType *metapath_data =
+      static_cast<const IdType *>(__GetDevicePointer(metapath));
   const int64_t begin_ntype =
       hg->meta_graph()->FindEdge(metapath_data[0]).first;
   const int64_t max_nodes = hg->NumVertices(begin_ntype);
   int64_t num_etypes = hg->NumEdgeTypes();
   auto ctx = seeds->ctx;
-  const IdType *seed_data = static_cast<const IdType *>(seeds->data);
+  // const IdType *seed_data = static_cast<const IdType *>(seeds->data);
+  const IdType *seed_data =
+      static_cast<const IdType *>(__GetDevicePointer(seeds));
   CHECK(seeds->ndim == 1) << "seeds shape is not one dimension.";
   const int64_t num_seeds = seeds->shape[0];
   int64_t trace_length = max_num_steps + 1;
@@ -195,14 +216,19 @@ std::pair<IdArray, IdArray> RandomWalkUniform(
   std::vector<GraphKernelData<IdType>> h_graphs(num_etypes);
   for (int64_t etype = 0; etype < num_etypes; ++etype) {
     const CSRMatrix &csr = hg->GetCSRMatrix(etype);
-    h_graphs[etype].in_ptr = static_cast<const IdType *>(csr.indptr->data);
-    h_graphs[etype].in_cols = static_cast<const IdType *>(csr.indices->data);
+    // h_graphs[etype].in_ptr = static_cast<const IdType *>(csr.indptr->data);
+    // h_graphs[etype].in_cols = static_cast<const IdType *>(csr.indices->data);
+    // h_graphs[etype].data =
+    //     (CSRHasData(csr) ? static_cast<const IdType *>(csr.data->data)
+    //                      : nullptr);
+    h_graphs[etype].in_ptr =
+        static_cast<const IdType *>(__GetDevicePointer(csr.indptr));
+    h_graphs[etype].in_cols =
+        static_cast<const IdType *>(__GetDevicePointer(csr.indices));
     h_graphs[etype].data =
-        (CSRHasData(csr) ? static_cast<const IdType *>(csr.data->data)
+        (CSRHasData(csr) ? static_cast<const IdType *>(__GetDevicePointer(csr.data))
                          : nullptr);
   }
   // use cuda stream from local thread
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   auto device = DeviceAPI::Get(ctx);
   auto d_graphs = static_cast<GraphKernelData<IdType> *>(device->AllocWorkspace(
       ctx, (num_etypes) * sizeof(GraphKernelData<IdType>)));
@@ -222,10 +248,11 @@ std::pair<IdArray, IdArray> RandomWalkUniform(
   const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000);
   ATEN_FLOAT_TYPE_SWITCH(
       restart_prob->dtype, FloatType, "random walk GPU kernel", {
-        CHECK(restart_prob->ctx.device_type == kDGLCUDA)
+        CHECK(restart_prob->ctx.device_type == kDGLCUDA || restart_prob->ctx.device_type == kDGLROCM)
            << "restart prob should be in GPU.";
        CHECK(restart_prob->ndim == 1) << "restart prob dimension should be 1.";
-        const FloatType *restart_prob_data = restart_prob.Ptr<FloatType>();
+        // const FloatType *restart_prob_data = restart_prob.Ptr<FloatType>();
+        const FloatType *restart_prob_data =
+            static_cast<const FloatType *>(__GetDevicePointer(restart_prob));
        const int64_t restart_prob_size = restart_prob->shape[0];
        CUDA_KERNEL_CALL(
            (_RandomWalkKernel<IdType, FloatType, BLOCK_SIZE, TILE_SIZE>), grid,
@@ -247,23 +274,27 @@ std::pair<IdArray, IdArray> RandomWalkBiased(
     const HeteroGraphPtr hg, const IdArray seeds, const TypeArray metapath,
     const std::vector<FloatArray> &prob, FloatArray restart_prob) {
   const int64_t max_num_steps = metapath->shape[0];
-  const IdType *metapath_data = static_cast<IdType *>(metapath->data);
+  // const IdType *metapath_data = static_cast<IdType *>(metapath->data);
+  const IdType *metapath_data =
+      static_cast<IdType *>(__GetDevicePointer(metapath));
   const int64_t begin_ntype =
       hg->meta_graph()->FindEdge(metapath_data[0]).first;
   const int64_t max_nodes = hg->NumVertices(begin_ntype);
   int64_t num_etypes = hg->NumEdgeTypes();
   auto ctx = seeds->ctx;
-  const IdType *seed_data = static_cast<const IdType *>(seeds->data);
+  // const IdType *seed_data = static_cast<const IdType *>(seeds->data);
+  const IdType *seed_data =
+      static_cast<const IdType *>(__GetDevicePointer(seeds));
   CHECK(seeds->ndim == 1) << "seeds shape is not one dimension.";
   const int64_t num_seeds = seeds->shape[0];
   int64_t trace_length = max_num_steps + 1;
   IdArray traces = IdArray::Empty({num_seeds, trace_length}, seeds->dtype, ctx);
   IdArray eids = IdArray::Empty({num_seeds, max_num_steps}, seeds->dtype, ctx);
   IdType *traces_data = traces.Ptr<IdType>();
-  IdType *eids_data = eids.Ptr<IdType>();
+  // IdType *traces_data = static_cast<IdType *>(__GetDevicePointer(traces));
+  // IdType *eids_data = eids.Ptr<IdType>();
+  IdType *eids_data = static_cast<IdType *>(__GetDevicePointer(eids));
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   auto device = DeviceAPI::Get(ctx);
   // new probs and prob sums pointers
   assert(num_etypes == static_cast<int64_t>(prob.size()));
@@ -276,10 +307,15 @@ std::pair<IdArray, IdArray> RandomWalkBiased(
   std::vector<GraphKernelData<IdType>> h_graphs(num_etypes);
   for (int64_t etype = 0; etype < num_etypes; ++etype) {
     const CSRMatrix &csr = hg->GetCSRMatrix(etype);
-    h_graphs[etype].in_ptr = static_cast<const IdType *>(csr.indptr->data);
-    h_graphs[etype].in_cols = static_cast<const IdType *>(csr.indices->data);
+    // h_graphs[etype].in_ptr = static_cast<const IdType *>(csr.indptr->data);
+    // h_graphs[etype].in_cols = static_cast<const IdType *>(csr.indices->data);
+    // h_graphs[etype].data =
+    //     (CSRHasData(csr) ? static_cast<const IdType *>(csr.data->data)
+    //                      : nullptr);
+    h_graphs[etype].in_ptr =
+        static_cast<const IdType *>(__GetDevicePointer(csr.indptr));
+    h_graphs[etype].in_cols =
+        static_cast<const IdType *>(__GetDevicePointer(csr.indices));
     h_graphs[etype].data =
-        (CSRHasData(csr) ? static_cast<const IdType *>(csr.data->data)
+        (CSRHasData(csr) ? static_cast<const IdType *>(__GetDevicePointer(csr.data))
                          : nullptr);
     int64_t num_segments = csr.indptr->shape[0] - 1;
@@ -289,19 +325,22 @@ std::pair<IdArray, IdArray> RandomWalkBiased(
       prob_sums[etype] = nullptr;
       continue;
     }
-    probs[etype] = prob[etype].Ptr<FloatType>();
+    // probs[etype] = prob[etype].Ptr<FloatType>();
+    probs[etype] = static_cast<FloatType *>(__GetDevicePointer(prob[etype]));
     prob_sums_arr.push_back(
        FloatArray::Empty({num_segments}, prob[etype]->dtype, ctx));
-    prob_sums[etype] = prob_sums_arr[etype].Ptr<FloatType>();
+    // prob_sums[etype] = prob_sums_arr[etype].Ptr<FloatType>();
+    prob_sums[etype] =
+        static_cast<FloatType *>(__GetDevicePointer(prob_sums_arr[etype]));
     // calculate the sum of the neighbor weights
-    const IdType *d_offsets = static_cast<const IdType *>(csr.indptr->data);
+    // const IdType *d_offsets = static_cast<const IdType *>(csr.indptr->data);
+    const IdType *d_offsets =
+        static_cast<const IdType *>(__GetDevicePointer(csr.indptr));
     size_t temp_storage_size = 0;
-    CUDA_CALL(cub::DeviceSegmentedReduce::Sum(
+    CUDA_CALL(hipcub::DeviceSegmentedReduce::Sum(
        nullptr, temp_storage_size, probs[etype], prob_sums[etype],
        num_segments, d_offsets, d_offsets + 1, stream));
    void *temp_storage = device->AllocWorkspace(ctx, temp_storage_size);
-    CUDA_CALL(cub::DeviceSegmentedReduce::Sum(
+    CUDA_CALL(hipcub::DeviceSegmentedReduce::Sum(
        temp_storage, temp_storage_size, probs[etype], prob_sums[etype],
        num_segments, d_offsets, d_offsets + 1, stream));
    device->FreeWorkspace(ctx, temp_storage);
@@ -328,18 +367,20 @@
       DGLContext{kDGLCPU, 0}, ctx, prob[0]->dtype);
   // copy metapath to GPU
   auto d_metapath = metapath.CopyTo(ctx);
-  const IdType *d_metapath_data = static_cast<IdType *>(d_metapath->data);
+  // const IdType *d_metapath_data = static_cast<IdType *>(d_metapath->data);
+  const IdType *d_metapath_data =
+      static_cast<IdType *>(__GetDevicePointer(d_metapath));
   constexpr int BLOCK_SIZE = 256;
   constexpr int TILE_SIZE = BLOCK_SIZE * 4;
   dim3 block(256);
   dim3 grid((num_seeds + TILE_SIZE - 1) / TILE_SIZE);
   const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000);
-  CHECK(restart_prob->ctx.device_type == kDGLCUDA)
+  CHECK(restart_prob->ctx.device_type == kDGLCUDA || restart_prob->ctx.device_type == kDGLROCM)
      << "restart prob should be in GPU.";
   CHECK(restart_prob->ndim == 1) << "restart prob dimension should be 1.";
-  const FloatType *restart_prob_data = restart_prob.Ptr<FloatType>();
-  const int64_t restart_prob_size = restart_prob->shape[0];
+  // const FloatType *restart_prob_data = restart_prob.Ptr<FloatType>();
+  const FloatType *restart_prob_data =
+      static_cast<const FloatType *>(__GetDevicePointer(restart_prob));
+  const int64_t restart_prob_size = restart_prob->shape[0];
   CUDA_KERNEL_CALL(
       (_RandomWalkBiasedKernel<IdType, FloatType, BLOCK_SIZE, TILE_SIZE>), grid,
       block, 0, stream, random_seed, seed_data, num_seeds, d_metapath_data,
@@ -396,7 +437,7 @@ std::pair<IdArray, IdArray> RandomWalkWithRestart(
   auto device = dgl::runtime::DeviceAPI::Get(device_ctx);
   // use cuda stream from local thread
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   device->CopyDataFromTo(
       &restart_prob, 0, restart_prob_array.Ptr<double>(), 0, sizeof(double),
       DGLContext{kDGLCPU, 0}, device_ctx, restart_prob_array->dtype);
@@ -443,13 +484,15 @@ template <DGLDeviceType XPU, typename IdxType>
 std::tuple<IdArray, IdArray, IdArray> SelectPinSageNeighbors(
     const IdArray src, const IdArray dst, const int64_t num_samples_per_node,
     const int64_t k) {
-  CHECK(src->ctx.device_type == kDGLCUDA) << "IdArray needs be on GPU!";
-  const IdxType *src_data = src.Ptr<IdxType>();
-  const IdxType *dst_data = dst.Ptr<IdxType>();
+  CHECK(src->ctx.device_type == kDGLCUDA || src->ctx.device_type == kDGLROCM)
+      << "IdArray needs be on GPU!";
+  // const IdxType *src_data = src.Ptr<IdxType>();
+  const IdxType *src_data = static_cast<IdxType *>(__GetDevicePointer(src));
+  // const IdxType *dst_data = dst.Ptr<IdxType>();
+  const IdxType *dst_data = static_cast<IdxType *>(__GetDevicePointer(dst));
   const int64_t num_dst_nodes = (dst->shape[0] / num_samples_per_node);
   auto ctx = src->ctx;
   // use cuda stream from local thread
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   auto frequency_hashmap = FrequencyHashmap<IdxType>(
       num_dst_nodes, num_samples_per_node, ctx, stream);
   auto ret = frequency_hashmap.Topk(
......
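The new `__GetDevicePointer` helper is the one substantive addition in this file: every NDArray consumed by a kernel is first resolved to a device-visible address, so pinned host arrays (e.g. metapath, seeds, and the CSR buffers) work transparently on ROCm. A hedged sketch of the same pattern with DGL's types stripped out (`is_pinned` stands in for NDArray::IsPinned(); error checking omitted):

```cpp
#include <hip/hip_runtime.h>

// If a buffer lives in pinned host memory, translate it to its device alias
// with hipHostGetDevicePointer before a kernel launch; device-resident
// buffers pass through unchanged.
template <typename T>
T* GetDevicePointer(T* ptr, bool is_pinned) {
  if (is_pinned) {
    void* dev = nullptr;
    hipHostGetDevicePointer(&dev, ptr, 0);
    return static_cast<T*>(dev);
  }
  return ptr;
}
```

Note that the commit keeps two overloads of `__GetDevicePointer`; since the non-template `void*` overload wins unqualified overload resolution, all the call sites above go through it and cast the result back to the typed pointer.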