"docs/vscode:/vscode.git/clone" did not exist on "7ac6e286ee994270e737b70c904ea50049d53567"
Commit 6ac701f8 authored by sangwzh's avatar sangwzh
Browse files

update src and graphbolt code

parent 1547bd93
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2021 Intel Corporation
*
......@@ -21,7 +22,7 @@
#include <vector>
#include "../c_api_common.h"
#include "./check.h"
#include "check.h"
#include "kernel_decl.h"
using namespace dgl::runtime;
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file array/selector.h
......@@ -12,13 +13,13 @@ namespace dgl {
namespace {
#ifdef __CUDACC__
#define DGLDEVICE __device__
#ifdef __HIPCC__
#define DGLDEVICE __device__ __host__
#define DGLINLINE __forceinline__
#else
#define DGLDEVICE
#define DGLINLINE inline
#endif // __CUDACC__
#endif // __HIPCC__
} // namespace
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2019-2022 by Contributors
* @file array/uvm_array.cc
......@@ -8,7 +9,7 @@
#include <sstream>
#include "../c_api_common.h"
#include "./uvm_array_op.h"
#include "uvm_array_op.h"
using namespace dgl::runtime;
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2019 by Contributors
* @file geometry/cuda/edge_coarsening_impl.cu
* @brief Edge coarsening CUDA implementation
*/
#include <curand_kernel.h>
#include <hiprand/hiprand_kernel.h>
#include <dgl/array.h>
#include <dgl/random.h>
#include <dmlc/thread_local.h>
......@@ -32,9 +34,9 @@ __global__ void generate_uniform_kernel(
float *ret_values, size_t num, uint64_t seed) {
size_t id = blockIdx.x * blockDim.x + threadIdx.x;
if (id < num) {
curandState state;
curand_init(seed, id, 0, &state);
ret_values[id] = curand_uniform(&state);
hiprandState_t state;
hiprand_init(seed, id, 0, &state);
ret_values[id] = hiprand_uniform(&state);
}
}
......@@ -116,7 +118,7 @@ __global__ void weighted_respond_kernel(
template <typename IdType>
bool Colorize(IdType *result_data, int64_t num_nodes, float *const prop) {
// initial done signal
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
CUDA_KERNEL_CALL(init_done_kernel, 1, 1, 0, stream);
// generate color prop for each node
......@@ -132,8 +134,8 @@ bool Colorize(IdType *result_data, int64_t num_nodes, float *const prop) {
colorize_kernel, num_blocks, num_threads, 0, stream, prop, num_nodes,
result_data);
bool done_h = false;
CUDA_CALL(cudaMemcpyFromSymbol(
&done_h, done_d, sizeof(done_h), 0, cudaMemcpyDeviceToHost));
CUDA_CALL(hipMemcpyFromSymbol(
&done_h, done_d, sizeof(done_h), 0, hipMemcpyDeviceToHost));
return done_h;
}
......@@ -155,7 +157,7 @@ bool Colorize(IdType *result_data, int64_t num_nodes, float *const prop) {
template <DGLDeviceType XPU, typename FloatType, typename IdType>
void WeightedNeighborMatching(
const aten::CSRMatrix &csr, const NDArray weight, IdArray result) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const auto &ctx = result->ctx;
auto device = runtime::DeviceAPI::Get(ctx);
device->SetDevice(ctx);
......@@ -216,7 +218,7 @@ void NeighborMatching(const aten::CSRMatrix &csr, IdArray result) {
device->SetDevice(ctx);
// generate random weights
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
NDArray weight = NDArray::Empty(
{num_edges}, DGLDataType{kDGLFloat, sizeof(float) * 8, 1}, ctx);
float *weight_data = static_cast<float *>(weight->data);
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2019 by Contributors
* @file geometry/cuda/geometry_op_impl.cc
......@@ -95,7 +97,7 @@ template <DGLDeviceType XPU, typename FloatType, typename IdType>
void FarthestPointSampler(
NDArray array, int64_t batch_size, int64_t sample_points, NDArray dist,
IdArray start_idx, IdArray result) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const FloatType* array_data = static_cast<FloatType*>(array->data);
......@@ -110,7 +112,7 @@ void FarthestPointSampler(
// sample for each cloud in the batch
IdType* start_idx_data = static_cast<IdType*>(start_idx->data);
CUDA_CALL(cudaSetDevice(array->ctx.device_id));
CUDA_CALL(hipSetDevice(array->ctx.device_id));
CUDA_KERNEL_CALL(
fps_kernel, batch_size, THREADS, 0, stream, array_data, batch_size,
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2019 by Contributors
* @file geometry/geometry.cc
......@@ -10,7 +11,7 @@
#include "../array/check.h"
#include "../c_api_common.h"
#include "./geometry_op.h"
#include "geometry_op.h"
using namespace dgl::runtime;
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file graph/creators.cc
* @brief Functions for constructing graphs.
*/
#include "./heterograph.h"
#include "heterograph.h"
using namespace dgl::runtime;
namespace dgl {
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2019 by Contributors
* @file graph/heterograph.cc
* @brief Heterograph implementation
*/
#include "./heterograph.h"
#include "heterograph.h"
#include <dgl/array.h>
#include <dgl/graph_serializer.h>
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2019 by Contributors
* @file graph/heterograph.h
......@@ -18,7 +19,7 @@
#include <utility>
#include <vector>
#include "./unit_graph.h"
#include "unit_graph.h"
#include "shared_mem_manager.h"
namespace dgl {
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file graph/heterograph_capi.cc
......@@ -14,7 +15,7 @@
#include <set>
#include "../c_api_common.h"
#include "./heterograph.h"
#include "heterograph.h"
#include "unit_graph.h"
using namespace dgl::runtime;
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file graph/pickle.cc
......@@ -10,7 +11,7 @@
#include <dmlc/memory_io.h>
#include "../c_api_common.h"
#include "./heterograph.h"
#include "heterograph.h"
#include "unit_graph.h"
using namespace dgl::runtime;
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2021 by Contributors
* @file graph/sampling/frequency_hashmap.cuh
......@@ -56,7 +58,7 @@ class FrequencyHashmap {
FrequencyHashmap() = delete;
FrequencyHashmap(
int64_t num_dst, int64_t num_items_each_dst, DGLContext ctx,
cudaStream_t stream, int64_t edge_table_scale = kDefaultEdgeTableScale);
hipStream_t stream, int64_t edge_table_scale = kDefaultEdgeTableScale);
~FrequencyHashmap();
using EdgeItem = typename DeviceEdgeHashmap<IdxType>::EdgeItem;
std::tuple<IdArray, IdArray, IdArray> Topk(
......@@ -66,7 +68,7 @@ class FrequencyHashmap {
private:
DGLContext _ctx;
cudaStream_t _stream;
hipStream_t _stream;
DeviceEdgeHashmap<IdxType> *_device_edge_hashmap;
IdxType *_dst_unique_edges;
EdgeItem *_edge_hashmap;
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2021 by Contributors
* @file graph/sampling/frequency_hashmap.cu
......@@ -5,7 +7,7 @@
*/
#include <algorithm>
#include <cub/cub.cuh> // NOLINT
#include <hipcub/hipcub.hpp> // NOLINT
#include <tuple>
#include <utility>
......@@ -71,7 +73,7 @@ __global__ void _count_frequency(
}
}
using BlockReduce = typename cub::BlockReduce<IdxType, BLOCK_SIZE>;
using BlockReduce = typename hipcub::BlockReduce<IdxType, BLOCK_SIZE>;
__shared__ typename BlockReduce::TempStorage temp_space;
count = BlockReduce(temp_space).Sum(count);
......@@ -112,7 +114,7 @@ __global__ void _compact_frequency(
int64_t last_idx = start_idx + TILE_SIZE;
const IdxType block_offset = edge_blocks_prefix[blockIdx.x];
using BlockScan = typename cub::BlockScan<IdxType, BLOCK_SIZE>;
using BlockScan = typename hipcub::BlockScan<IdxType, BLOCK_SIZE>;
__shared__ typename BlockScan::TempStorage temp_space;
BlockPrefixCallbackOp<IdxType> prefix_op(0);
......@@ -246,7 +248,7 @@ inline __device__ IdxType DeviceEdgeHashmap<IdxType>::GetEdgeCount(
template <typename IdxType>
FrequencyHashmap<IdxType>::FrequencyHashmap(
int64_t num_dst, int64_t num_items_each_dst, DGLContext ctx,
cudaStream_t stream, int64_t edge_table_scale) {
hipStream_t stream, int64_t edge_table_scale) {
_ctx = ctx;
_stream = stream;
num_items_each_dst = _table_size(num_items_each_dst, edge_table_scale);
......@@ -259,7 +261,7 @@ FrequencyHashmap<IdxType>::FrequencyHashmap(
constexpr int TILE_SIZE = BLOCK_SIZE * 8;
dim3 block(BLOCK_SIZE);
dim3 grid((num_dst * num_items_each_dst + TILE_SIZE - 1) / TILE_SIZE);
CUDA_CALL(cudaMemset(dst_unique_edges, 0, (num_dst) * sizeof(IdxType)));
CUDA_CALL(hipMemset(dst_unique_edges, 0, (num_dst) * sizeof(IdxType)));
CUDA_KERNEL_CALL(
(_init_edge_table<IdxType, BLOCK_SIZE, TILE_SIZE>), grid, block, 0,
_stream, edge_hashmap, (num_dst * num_items_each_dst));
......@@ -300,7 +302,7 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
// _edge_hashmap
bool *is_first_position = static_cast<bool *>(
device->AllocWorkspace(_ctx, sizeof(bool) * (num_edges)));
CUDA_CALL(cudaMemset(is_first_position, 0, sizeof(bool) * (num_edges)));
CUDA_CALL(hipMemset(is_first_position, 0, sizeof(bool) * (num_edges)));
// double space to use ExclusiveSum
auto edge_blocks_prefix_data = static_cast<IdxType *>(device->AllocWorkspace(
_ctx, 2 * sizeof(IdxType) * (num_edge_blocks + 1)));
......@@ -327,11 +329,11 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
// 2.1 ExclusiveSum the edge_blocks_prefix
void *d_temp_storage = nullptr;
size_t temp_storage_bytes = 0;
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
d_temp_storage, temp_storage_bytes, edge_blocks_prefix,
edge_blocks_prefix_alternate, num_edge_blocks + 1, _stream));
d_temp_storage = device->AllocWorkspace(_ctx, temp_storage_bytes);
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
d_temp_storage, temp_storage_bytes, edge_blocks_prefix,
edge_blocks_prefix_alternate, num_edge_blocks + 1, _stream));
device->FreeWorkspace(_ctx, d_temp_storage);
......@@ -365,19 +367,19 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
// 3.1 ExclusiveSum the num_unique_each_node
d_temp_storage = nullptr;
temp_storage_bytes = 0;
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
d_temp_storage, temp_storage_bytes, num_unique_each_node,
num_unique_each_node_alternate, num_dst_nodes + 1, _stream));
d_temp_storage = device->AllocWorkspace(_ctx, temp_storage_bytes);
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
d_temp_storage, temp_storage_bytes, num_unique_each_node,
num_unique_each_node_alternate, num_dst_nodes + 1, _stream));
device->FreeWorkspace(_ctx, d_temp_storage);
// 3.2 SegmentedRadixSort the unique_src_edges and unique_frequency
// Create a set of DoubleBuffers to wrap pairs of device pointers
cub::DoubleBuffer<Idx64Type> d_unique_frequency(
hipcub::DoubleBuffer<Idx64Type> d_unique_frequency(
unique_frequency, unique_frequency_alternate);
cub::DoubleBuffer<IdxType> d_unique_src_edges(
hipcub::DoubleBuffer<IdxType> d_unique_src_edges(
unique_src_edges, unique_src_edges_alternate);
// Determine temporary device storage requirements
d_temp_storage = nullptr;
......@@ -385,12 +387,12 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
// the DeviceRadixSort is faster than DeviceSegmentedRadixSort,
// especially when num_dst_nodes is large (about ~10000)
if (dtype.bits == 32) {
CUDA_CALL(cub::DeviceRadixSort::SortPairsDescending(
CUDA_CALL(hipcub::DeviceRadixSort::SortPairsDescending(
d_temp_storage, temp_storage_bytes, d_unique_frequency,
d_unique_src_edges, num_unique_edges, 0, sizeof(Idx64Type) * 8,
_stream));
} else {
CUDA_CALL(cub::DeviceSegmentedRadixSort::SortPairsDescending(
CUDA_CALL(hipcub::DeviceSegmentedRadixSort::SortPairsDescending(
d_temp_storage, temp_storage_bytes, d_unique_frequency,
d_unique_src_edges, num_unique_edges, num_dst_nodes,
num_unique_each_node_alternate, num_unique_each_node_alternate + 1, 0,
......@@ -398,12 +400,12 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
}
d_temp_storage = device->AllocWorkspace(_ctx, temp_storage_bytes);
if (dtype.bits == 32) {
CUDA_CALL(cub::DeviceRadixSort::SortPairsDescending(
CUDA_CALL(hipcub::DeviceRadixSort::SortPairsDescending(
d_temp_storage, temp_storage_bytes, d_unique_frequency,
d_unique_src_edges, num_unique_edges, 0, sizeof(Idx64Type) * 8,
_stream));
} else {
CUDA_CALL(cub::DeviceSegmentedRadixSort::SortPairsDescending(
CUDA_CALL(hipcub::DeviceSegmentedRadixSort::SortPairsDescending(
d_temp_storage, temp_storage_bytes, d_unique_frequency,
d_unique_src_edges, num_unique_edges, num_dst_nodes,
num_unique_each_node_alternate, num_unique_each_node_alternate + 1, 0,
......@@ -422,11 +424,11 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
// use unique_output_offsets;
d_temp_storage = nullptr;
temp_storage_bytes = 0;
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
d_temp_storage, temp_storage_bytes, num_unique_each_node,
unique_output_offsets, num_dst_nodes + 1, _stream));
d_temp_storage = device->AllocWorkspace(_ctx, temp_storage_bytes);
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
d_temp_storage, temp_storage_bytes, num_unique_each_node,
unique_output_offsets, num_dst_nodes + 1, _stream));
device->FreeWorkspace(_ctx, d_temp_storage);
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2021 by Contributors
* @file graph/sampling/get_node_types_gpu.cu
* @brief DGL sampler
*/
#include <cuda_runtime.h>
#include <hip/hip_runtime.h>
#include <dgl/array.h>
#include <dgl/base_heterograph.h>
#include <dgl/runtime/device_api.h>
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2021-2022 by Contributors
* @file graph/sampling/randomwalk_gpu.cu
* @brief CUDA random walk sampling
*/
#include <curand_kernel.h>
#include <hiprand/hiprand_kernel.h>
#include <dgl/array.h>
#include <dgl/base_heterograph.h>
#include <dgl/random.h>
#include <dgl/runtime/device_api.h>
#include <cub/cub.cuh>
#include <hipcub/hipcub.hpp>
#include <tuple>
#include <utility>
#include <vector>
......@@ -48,10 +50,10 @@ __global__ void _RandomWalkKernel(
int64_t last_idx =
min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_seeds);
int64_t trace_length = (max_num_steps + 1);
curandState rng;
hiprandState_t rng;
// reference:
// https://docs.nvidia.com/cuda/curand/device-api-overview.html#performance-notes
curand_init(rand_seed + idx, 0, 0, &rng);
// https://docs.nvidia.com/cuda/curand/device-api-overview.html#performance-notes
hiprand_init(rand_seed + idx, 0, 0, &rng);
while (idx < last_idx) {
IdType curr = seed_data[idx];
......@@ -68,18 +70,18 @@ __global__ void _RandomWalkKernel(
if (deg == 0) { // the degree is zero
break;
}
const int64_t num = curand(&rng) % deg;
const int64_t num = hiprand(&rng) % deg;
IdType pick = graph.in_cols[in_row_start + num];
IdType eid =
(graph.data ? graph.data[in_row_start + num] : in_row_start + num);
*traces_data_ptr = pick;
*eids_data_ptr = eid;
if ((restart_prob_size > 1) &&
(curand_uniform(&rng) < restart_prob_data[step_idx])) {
(hiprand_uniform(&rng) < restart_prob_data[step_idx])) {
break;
} else if (
(restart_prob_size == 1) &&
(curand_uniform(&rng) < restart_prob_data[0])) {
(hiprand_uniform(&rng) < restart_prob_data[0])) {
break;
}
++traces_data_ptr;
......@@ -107,10 +109,10 @@ __global__ void _RandomWalkBiasedKernel(
int64_t last_idx =
min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_seeds);
int64_t trace_length = (max_num_steps + 1);
curandState rng;
hiprandState_t rng;
// reference:
// https://docs.nvidia.com/cuda/curand/device-api-overview.html#performance-notes
curand_init(rand_seed + idx, 0, 0, &rng);
// https://docs.nvidia.com/cuda/curand/device-api-overview.html#performance-notes
hiprand_init(rand_seed + idx, 0, 0, &rng);
while (idx < last_idx) {
IdType curr = seed_data[idx];
......@@ -133,9 +135,9 @@ __global__ void _RandomWalkBiasedKernel(
const FloatType *prob = probs[metapath_id];
int64_t num;
if (prob == nullptr) {
num = curand(&rng) % deg;
num = hiprand(&rng) % deg;
} else {
auto rnd_sum_w = prob_sum[curr] * curand_uniform(&rng);
auto rnd_sum_w = prob_sum[curr] * hiprand_uniform(&rng);
FloatType sum_w{0.};
for (num = 0; num < deg; ++num) {
sum_w += prob[in_row_start + num];
......@@ -149,11 +151,11 @@ __global__ void _RandomWalkBiasedKernel(
*traces_data_ptr = pick;
*eids_data_ptr = eid;
if ((restart_prob_size > 1) &&
(curand_uniform(&rng) < restart_prob_data[step_idx])) {
(hiprand_uniform(&rng) < restart_prob_data[step_idx])) {
break;
} else if (
(restart_prob_size == 1) &&
(curand_uniform(&rng) < restart_prob_data[0])) {
(hiprand_uniform(&rng) < restart_prob_data[0])) {
break;
}
++traces_data_ptr;
......@@ -202,7 +204,7 @@ std::pair<IdArray, IdArray> RandomWalkUniform(
: nullptr);
}
// use cuda stream from local thread
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
auto device = DeviceAPI::Get(ctx);
auto d_graphs = static_cast<GraphKernelData<IdType> *>(device->AllocWorkspace(
ctx, (num_etypes) * sizeof(GraphKernelData<IdType>)));
......@@ -222,7 +224,7 @@ std::pair<IdArray, IdArray> RandomWalkUniform(
const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000);
ATEN_FLOAT_TYPE_SWITCH(
restart_prob->dtype, FloatType, "random walk GPU kernel", {
CHECK(restart_prob->ctx.device_type == kDGLCUDA)
CHECK(restart_prob->ctx.device_type == kDGLCUDA||restart_prob->ctx.device_type == kDGLROCM)
<< "restart prob should be in GPU.";
CHECK(restart_prob->ndim == 1) << "restart prob dimension should be 1.";
const FloatType *restart_prob_data = restart_prob.Ptr<FloatType>();
......@@ -263,7 +265,7 @@ std::pair<IdArray, IdArray> RandomWalkBiased(
IdType *traces_data = traces.Ptr<IdType>();
IdType *eids_data = eids.Ptr<IdType>();
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
auto device = DeviceAPI::Get(ctx);
// new probs and prob sums pointers
assert(num_etypes == static_cast<int64_t>(prob.size()));
......@@ -297,11 +299,11 @@ std::pair<IdArray, IdArray> RandomWalkBiased(
// calculate the sum of the neighbor weights
const IdType *d_offsets = static_cast<const IdType *>(csr.indptr->data);
size_t temp_storage_size = 0;
CUDA_CALL(cub::DeviceSegmentedReduce::Sum(
CUDA_CALL(hipcub::DeviceSegmentedReduce::Sum(
nullptr, temp_storage_size, probs[etype], prob_sums[etype],
num_segments, d_offsets, d_offsets + 1, stream));
void *temp_storage = device->AllocWorkspace(ctx, temp_storage_size);
CUDA_CALL(cub::DeviceSegmentedReduce::Sum(
CUDA_CALL(hipcub::DeviceSegmentedReduce::Sum(
temp_storage, temp_storage_size, probs[etype], prob_sums[etype],
num_segments, d_offsets, d_offsets + 1, stream));
device->FreeWorkspace(ctx, temp_storage);
......@@ -335,7 +337,7 @@ std::pair<IdArray, IdArray> RandomWalkBiased(
dim3 block(256);
dim3 grid((num_seeds + TILE_SIZE - 1) / TILE_SIZE);
const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000);
CHECK(restart_prob->ctx.device_type == kDGLCUDA)
CHECK(restart_prob->ctx.device_type == kDGLCUDA ||restart_prob->ctx.device_type == kDGLROCM)
<< "restart prob should be in GPU.";
CHECK(restart_prob->ndim == 1) << "restart prob dimension should be 1.";
const FloatType *restart_prob_data = restart_prob.Ptr<FloatType>();
......@@ -396,7 +398,7 @@ std::pair<IdArray, IdArray> RandomWalkWithRestart(
auto device = dgl::runtime::DeviceAPI::Get(device_ctx);
// use cuda stream from local thread
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
device->CopyDataFromTo(
&restart_prob, 0, restart_prob_array.Ptr<double>(), 0, sizeof(double),
DGLContext{kDGLCPU, 0}, device_ctx, restart_prob_array->dtype);
......@@ -443,13 +445,13 @@ template <DGLDeviceType XPU, typename IdxType>
std::tuple<IdArray, IdArray, IdArray> SelectPinSageNeighbors(
const IdArray src, const IdArray dst, const int64_t num_samples_per_node,
const int64_t k) {
CHECK(src->ctx.device_type == kDGLCUDA) << "IdArray needs be on GPU!";
CHECK(src->ctx.device_type == kDGLCUDA || src->ctx.device_type == kDGLROCM) << "IdArray needs be on GPU!";
const IdxType *src_data = src.Ptr<IdxType>();
const IdxType *dst_data = dst.Ptr<IdxType>();
const int64_t num_dst_nodes = (dst->shape[0] / num_samples_per_node);
auto ctx = src->ctx;
// use cuda stream from local thread
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
auto frequency_hashmap = FrequencyHashmap<IdxType>(
num_dst_nodes, num_samples_per_node, ctx, stream);
auto ret = frequency_hashmap.Topk(
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2019 by Contributors
* @file graph/serialize/heterograph_serialize.cc
......@@ -48,8 +49,8 @@
#include <vector>
#include "../heterograph.h"
#include "./dglstream.h"
#include "./graph_serialize.h"
#include "dglstream.h"
#include "graph_serialize.h"
#include "dmlc/memory_io.h"
namespace dgl {
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file graph/subgraph.cc
* @brief Functions for extracting subgraphs.
*/
#include "./heterograph.h"
#include "heterograph.h"
using namespace dgl::runtime;
namespace dgl {
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright 2021 Contributors
*
......@@ -18,7 +19,7 @@
* all given graphs with the same set of nodes.
*/
#include <cuda_runtime.h>
#include <hip/hip_runtime.h>
#include <dgl/immutable_graph.h>
#include <dgl/runtime/device_api.h>
......@@ -55,10 +56,10 @@ template <typename IdType>
void BuildNodeMaps(
const std::vector<IdArray> &input_nodes,
DeviceNodeMap<IdType> *const node_maps, int64_t *const count_unique_device,
std::vector<IdArray> *const unique_nodes_device, cudaStream_t stream) {
std::vector<IdArray> *const unique_nodes_device, hipStream_t stream) {
const int64_t num_ntypes = static_cast<int64_t>(input_nodes.size());
CUDA_CALL(cudaMemsetAsync(
CUDA_CALL(hipMemsetAsync(
count_unique_device, 0, num_ntypes * sizeof(*count_unique_device),
stream));
......@@ -81,7 +82,7 @@ std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>> CompactGraphsGPU(
const std::vector<IdArray> &always_preserve) {
const auto &ctx = graphs[0]->Context();
auto device = runtime::DeviceAPI::Get(ctx);
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
CHECK_EQ(ctx.device_type, kDGLCUDA);
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright 2020-2022 Contributors
*
......@@ -22,7 +23,7 @@
#include <dgl/runtime/c_runtime_api.h>
#include <dgl/base_heterograph.h>
#include <cuda_runtime.h>
#include <hip/hip_runtime.h>
#include <dgl/runtime/c_runtime_api.h>
#include <algorithm>
......@@ -113,7 +114,7 @@ class DeviceNodeMap {
DeviceNodeMap(
const std::vector<int64_t>& num_nodes, const int64_t offset,
DGLContext ctx, cudaStream_t stream)
DGLContext ctx, hipStream_t stream)
: num_types_(num_nodes.size()),
rhs_offset_(offset),
hash_tables_(),
......@@ -185,7 +186,7 @@ inline IdType RoundUp(const IdType num, const size_t unit) {
template <typename IdType>
std::tuple<std::vector<IdArray>, std::vector<IdArray>> MapEdges(
HeteroGraphPtr graph, const std::vector<EdgeArray>& edge_sets,
const DeviceNodeMap<IdType>& node_map, cudaStream_t stream) {
const DeviceNodeMap<IdType>& node_map, hipStream_t stream) {
constexpr const int BLOCK_SIZE = 128;
constexpr const size_t TILE_SIZE = 1024;
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright 2020-2021 Contributors
*
......@@ -20,7 +21,7 @@
* Tested via python wrapper: python/dgl/path/to/to_block.py
*/
#include <cuda_runtime.h>
#include <hip/hip_runtime.h>
#include <dgl/immutable_graph.h>
#include <dgl/runtime/device_api.h>
#include <dgl/runtime/tensordispatch.h>
......@@ -69,10 +70,10 @@ class DeviceNodeMapMaker {
const std::vector<IdArray>& lhs_nodes,
const std::vector<IdArray>& rhs_nodes,
DeviceNodeMap<IdType>* const node_maps, int64_t* const count_lhs_device,
std::vector<IdArray>* const lhs_device, cudaStream_t stream) {
std::vector<IdArray>* const lhs_device, hipStream_t stream) {
const int64_t num_ntypes = lhs_nodes.size() + rhs_nodes.size();
CUDA_CALL(cudaMemsetAsync(
CUDA_CALL(hipMemsetAsync(
count_lhs_device, 0, num_ntypes * sizeof(*count_lhs_device), stream));
// possibly duplicate lhs nodes
......@@ -112,7 +113,7 @@ class DeviceNodeMapMaker {
void Make(
const std::vector<IdArray>& lhs_nodes,
const std::vector<IdArray>& rhs_nodes,
DeviceNodeMap<IdType>* const node_maps, cudaStream_t stream) {
DeviceNodeMap<IdType>* const node_maps, hipStream_t stream) {
const int64_t num_ntypes = lhs_nodes.size() + rhs_nodes.size();
// unique lhs nodes
......@@ -155,7 +156,7 @@ struct CUDAIdsMapper {
std::vector<int64_t>& num_nodes_per_type = *num_nodes_per_type_ptr;
const bool generate_lhs_nodes = lhs_nodes.empty();
auto device = runtime::DeviceAPI::Get(ctx);
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
// Allocate space for map creation process.
DeviceNodeMapMaker<IdType> maker(maxNodesPerType);
......@@ -168,7 +169,7 @@ struct CUDAIdsMapper {
}
}
cudaEvent_t copyEvent;
hipEvent_t copyEvent;
NDArray new_len_tensor;
// Populate the mappings.
if (generate_lhs_nodes) {
......@@ -179,7 +180,7 @@ struct CUDAIdsMapper {
src_nodes, rhs_nodes, &node_maps, count_lhs_device, &lhs_nodes,
stream);
CUDA_CALL(cudaEventCreate(&copyEvent));
CUDA_CALL(hipEventCreate(&copyEvent));
if (TensorDispatcher::Global()->IsAvailable()) {
new_len_tensor = NDArray::PinnedEmpty(
{num_ntypes}, DGLDataTypeTraits<int64_t>::dtype,
......@@ -190,11 +191,11 @@ struct CUDAIdsMapper {
{num_ntypes}, DGLDataTypeTraits<int64_t>::dtype,
DGLContext{kDGLCPU, 0});
}
CUDA_CALL(cudaMemcpyAsync(
CUDA_CALL(hipMemcpyAsync(
new_len_tensor->data, count_lhs_device,
sizeof(*num_nodes_per_type.data()) * num_ntypes,
cudaMemcpyDeviceToHost, stream));
CUDA_CALL(cudaEventRecord(copyEvent, stream));
hipMemcpyDeviceToHost, stream));
CUDA_CALL(hipEventRecord(copyEvent, stream));
device->FreeWorkspace(ctx, count_lhs_device);
} else {
......@@ -209,8 +210,8 @@ struct CUDAIdsMapper {
if (generate_lhs_nodes) {
// wait for the previous copy
CUDA_CALL(cudaEventSynchronize(copyEvent));
CUDA_CALL(cudaEventDestroy(copyEvent));
CUDA_CALL(hipEventSynchronize(copyEvent));
CUDA_CALL(hipEventDestroy(copyEvent));
// Resize lhs nodes.
for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment