Commit 6ac701f8 authored by sangwzh's avatar sangwzh
Browse files

update src and graphbolt code

parent 1547bd93
// !!! This is a file automatically generated by hipify!!!
/** /**
* Copyright (c) 2021 Intel Corporation * Copyright (c) 2021 Intel Corporation
* *
...@@ -21,7 +22,7 @@ ...@@ -21,7 +22,7 @@
#include <vector> #include <vector>
#include "../c_api_common.h" #include "../c_api_common.h"
#include "./check.h" #include "check.h"
#include "kernel_decl.h" #include "kernel_decl.h"
using namespace dgl::runtime; using namespace dgl::runtime;
......
// !!! This is a file automatically generated by hipify!!!
/** /**
* Copyright (c) 2020 by Contributors * Copyright (c) 2020 by Contributors
* @file array/selector.h * @file array/selector.h
...@@ -12,13 +13,13 @@ namespace dgl { ...@@ -12,13 +13,13 @@ namespace dgl {
namespace { namespace {
#ifdef __CUDACC__ #ifdef __HIPCC__
#define DGLDEVICE __device__ #define DGLDEVICE __device__ __host__
#define DGLINLINE __forceinline__ #define DGLINLINE __forceinline__
#else #else
#define DGLDEVICE #define DGLDEVICE
#define DGLINLINE inline #define DGLINLINE inline
#endif // __CUDACC__ #endif // __HIPCC__
} // namespace } // namespace
......
// !!! This is a file automatically generated by hipify!!!
/** /**
* Copyright (c) 2019-2022 by Contributors * Copyright (c) 2019-2022 by Contributors
* @file array/uvm_array.cc * @file array/uvm_array.cc
...@@ -8,7 +9,7 @@ ...@@ -8,7 +9,7 @@
#include <sstream> #include <sstream>
#include "../c_api_common.h" #include "../c_api_common.h"
#include "./uvm_array_op.h" #include "uvm_array_op.h"
using namespace dgl::runtime; using namespace dgl::runtime;
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/** /**
* Copyright (c) 2019 by Contributors * Copyright (c) 2019 by Contributors
* @file geometry/cuda/edge_coarsening_impl.cu * @file geometry/cuda/edge_coarsening_impl.cu
* @brief Edge coarsening CUDA implementation * @brief Edge coarsening CUDA implementation
*/ */
#include <curand_kernel.h> #include <hiprand/hiprand_kernel.h>
#include <dgl/array.h> #include <dgl/array.h>
#include <dgl/random.h> #include <dgl/random.h>
#include <dmlc/thread_local.h> #include <dmlc/thread_local.h>
...@@ -32,9 +34,9 @@ __global__ void generate_uniform_kernel( ...@@ -32,9 +34,9 @@ __global__ void generate_uniform_kernel(
float *ret_values, size_t num, uint64_t seed) { float *ret_values, size_t num, uint64_t seed) {
size_t id = blockIdx.x * blockDim.x + threadIdx.x; size_t id = blockIdx.x * blockDim.x + threadIdx.x;
if (id < num) { if (id < num) {
curandState state; hiprandState_t state;
curand_init(seed, id, 0, &state); hiprand_init(seed, id, 0, &state);
ret_values[id] = curand_uniform(&state); ret_values[id] = hiprand_uniform(&state);
} }
} }
...@@ -116,7 +118,7 @@ __global__ void weighted_respond_kernel( ...@@ -116,7 +118,7 @@ __global__ void weighted_respond_kernel(
template <typename IdType> template <typename IdType>
bool Colorize(IdType *result_data, int64_t num_nodes, float *const prop) { bool Colorize(IdType *result_data, int64_t num_nodes, float *const prop) {
// initial done signal // initial done signal
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
CUDA_KERNEL_CALL(init_done_kernel, 1, 1, 0, stream); CUDA_KERNEL_CALL(init_done_kernel, 1, 1, 0, stream);
// generate color prop for each node // generate color prop for each node
...@@ -132,8 +134,8 @@ bool Colorize(IdType *result_data, int64_t num_nodes, float *const prop) { ...@@ -132,8 +134,8 @@ bool Colorize(IdType *result_data, int64_t num_nodes, float *const prop) {
colorize_kernel, num_blocks, num_threads, 0, stream, prop, num_nodes, colorize_kernel, num_blocks, num_threads, 0, stream, prop, num_nodes,
result_data); result_data);
bool done_h = false; bool done_h = false;
CUDA_CALL(cudaMemcpyFromSymbol( CUDA_CALL(hipMemcpyFromSymbol(
&done_h, done_d, sizeof(done_h), 0, cudaMemcpyDeviceToHost)); &done_h, done_d, sizeof(done_h), 0, hipMemcpyDeviceToHost));
return done_h; return done_h;
} }
...@@ -155,7 +157,7 @@ bool Colorize(IdType *result_data, int64_t num_nodes, float *const prop) { ...@@ -155,7 +157,7 @@ bool Colorize(IdType *result_data, int64_t num_nodes, float *const prop) {
template <DGLDeviceType XPU, typename FloatType, typename IdType> template <DGLDeviceType XPU, typename FloatType, typename IdType>
void WeightedNeighborMatching( void WeightedNeighborMatching(
const aten::CSRMatrix &csr, const NDArray weight, IdArray result) { const aten::CSRMatrix &csr, const NDArray weight, IdArray result) {
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const auto &ctx = result->ctx; const auto &ctx = result->ctx;
auto device = runtime::DeviceAPI::Get(ctx); auto device = runtime::DeviceAPI::Get(ctx);
device->SetDevice(ctx); device->SetDevice(ctx);
...@@ -216,7 +218,7 @@ void NeighborMatching(const aten::CSRMatrix &csr, IdArray result) { ...@@ -216,7 +218,7 @@ void NeighborMatching(const aten::CSRMatrix &csr, IdArray result) {
device->SetDevice(ctx); device->SetDevice(ctx);
// generate random weights // generate random weights
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
NDArray weight = NDArray::Empty( NDArray weight = NDArray::Empty(
{num_edges}, DGLDataType{kDGLFloat, sizeof(float) * 8, 1}, ctx); {num_edges}, DGLDataType{kDGLFloat, sizeof(float) * 8, 1}, ctx);
float *weight_data = static_cast<float *>(weight->data); float *weight_data = static_cast<float *>(weight->data);
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/** /**
* Copyright (c) 2019 by Contributors * Copyright (c) 2019 by Contributors
* @file geometry/cuda/geometry_op_impl.cc * @file geometry/cuda/geometry_op_impl.cc
...@@ -95,7 +97,7 @@ template <DGLDeviceType XPU, typename FloatType, typename IdType> ...@@ -95,7 +97,7 @@ template <DGLDeviceType XPU, typename FloatType, typename IdType>
void FarthestPointSampler( void FarthestPointSampler(
NDArray array, int64_t batch_size, int64_t sample_points, NDArray dist, NDArray array, int64_t batch_size, int64_t sample_points, NDArray dist,
IdArray start_idx, IdArray result) { IdArray start_idx, IdArray result) {
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const FloatType* array_data = static_cast<FloatType*>(array->data); const FloatType* array_data = static_cast<FloatType*>(array->data);
...@@ -110,7 +112,7 @@ void FarthestPointSampler( ...@@ -110,7 +112,7 @@ void FarthestPointSampler(
// sample for each cloud in the batch // sample for each cloud in the batch
IdType* start_idx_data = static_cast<IdType*>(start_idx->data); IdType* start_idx_data = static_cast<IdType*>(start_idx->data);
CUDA_CALL(cudaSetDevice(array->ctx.device_id)); CUDA_CALL(hipSetDevice(array->ctx.device_id));
CUDA_KERNEL_CALL( CUDA_KERNEL_CALL(
fps_kernel, batch_size, THREADS, 0, stream, array_data, batch_size, fps_kernel, batch_size, THREADS, 0, stream, array_data, batch_size,
......
// !!! This is a file automatically generated by hipify!!!
/** /**
* Copyright (c) 2019 by Contributors * Copyright (c) 2019 by Contributors
* @file geometry/geometry.cc * @file geometry/geometry.cc
...@@ -10,7 +11,7 @@ ...@@ -10,7 +11,7 @@
#include "../array/check.h" #include "../array/check.h"
#include "../c_api_common.h" #include "../c_api_common.h"
#include "./geometry_op.h" #include "geometry_op.h"
using namespace dgl::runtime; using namespace dgl::runtime;
......
// !!! This is a file automatically generated by hipify!!!
/** /**
* Copyright (c) 2020 by Contributors * Copyright (c) 2020 by Contributors
* @file graph/creators.cc * @file graph/creators.cc
* @brief Functions for constructing graphs. * @brief Functions for constructing graphs.
*/ */
#include "./heterograph.h" #include "heterograph.h"
using namespace dgl::runtime; using namespace dgl::runtime;
namespace dgl { namespace dgl {
......
// !!! This is a file automatically generated by hipify!!!
/** /**
* Copyright (c) 2019 by Contributors * Copyright (c) 2019 by Contributors
* @file graph/heterograph.cc * @file graph/heterograph.cc
* @brief Heterograph implementation * @brief Heterograph implementation
*/ */
#include "./heterograph.h" #include "heterograph.h"
#include <dgl/array.h> #include <dgl/array.h>
#include <dgl/graph_serializer.h> #include <dgl/graph_serializer.h>
......
// !!! This is a file automatically generated by hipify!!!
/** /**
* Copyright (c) 2019 by Contributors * Copyright (c) 2019 by Contributors
* @file graph/heterograph.h * @file graph/heterograph.h
...@@ -18,7 +19,7 @@ ...@@ -18,7 +19,7 @@
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "./unit_graph.h" #include "unit_graph.h"
#include "shared_mem_manager.h" #include "shared_mem_manager.h"
namespace dgl { namespace dgl {
......
// !!! This is a file automatically generated by hipify!!!
/** /**
* Copyright (c) 2020 by Contributors * Copyright (c) 2020 by Contributors
* @file graph/heterograph_capi.cc * @file graph/heterograph_capi.cc
...@@ -14,7 +15,7 @@ ...@@ -14,7 +15,7 @@
#include <set> #include <set>
#include "../c_api_common.h" #include "../c_api_common.h"
#include "./heterograph.h" #include "heterograph.h"
#include "unit_graph.h" #include "unit_graph.h"
using namespace dgl::runtime; using namespace dgl::runtime;
......
// !!! This is a file automatically generated by hipify!!!
/** /**
* Copyright (c) 2020 by Contributors * Copyright (c) 2020 by Contributors
* @file graph/pickle.cc * @file graph/pickle.cc
...@@ -10,7 +11,7 @@ ...@@ -10,7 +11,7 @@
#include <dmlc/memory_io.h> #include <dmlc/memory_io.h>
#include "../c_api_common.h" #include "../c_api_common.h"
#include "./heterograph.h" #include "heterograph.h"
#include "unit_graph.h" #include "unit_graph.h"
using namespace dgl::runtime; using namespace dgl::runtime;
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/** /**
* Copyright (c) 2021 by Contributors * Copyright (c) 2021 by Contributors
* @file graph/sampling/frequency_hashmap.cuh * @file graph/sampling/frequency_hashmap.cuh
...@@ -56,7 +58,7 @@ class FrequencyHashmap { ...@@ -56,7 +58,7 @@ class FrequencyHashmap {
FrequencyHashmap() = delete; FrequencyHashmap() = delete;
FrequencyHashmap( FrequencyHashmap(
int64_t num_dst, int64_t num_items_each_dst, DGLContext ctx, int64_t num_dst, int64_t num_items_each_dst, DGLContext ctx,
cudaStream_t stream, int64_t edge_table_scale = kDefaultEdgeTableScale); hipStream_t stream, int64_t edge_table_scale = kDefaultEdgeTableScale);
~FrequencyHashmap(); ~FrequencyHashmap();
using EdgeItem = typename DeviceEdgeHashmap<IdxType>::EdgeItem; using EdgeItem = typename DeviceEdgeHashmap<IdxType>::EdgeItem;
std::tuple<IdArray, IdArray, IdArray> Topk( std::tuple<IdArray, IdArray, IdArray> Topk(
...@@ -66,7 +68,7 @@ class FrequencyHashmap { ...@@ -66,7 +68,7 @@ class FrequencyHashmap {
private: private:
DGLContext _ctx; DGLContext _ctx;
cudaStream_t _stream; hipStream_t _stream;
DeviceEdgeHashmap<IdxType> *_device_edge_hashmap; DeviceEdgeHashmap<IdxType> *_device_edge_hashmap;
IdxType *_dst_unique_edges; IdxType *_dst_unique_edges;
EdgeItem *_edge_hashmap; EdgeItem *_edge_hashmap;
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/** /**
* Copyright (c) 2021 by Contributors * Copyright (c) 2021 by Contributors
* @file graph/sampling/frequency_hashmap.cu * @file graph/sampling/frequency_hashmap.cu
...@@ -5,7 +7,7 @@ ...@@ -5,7 +7,7 @@
*/ */
#include <algorithm> #include <algorithm>
#include <cub/cub.cuh> // NOLINT #include <hipcub/hipcub.hpp> // NOLINT
#include <tuple> #include <tuple>
#include <utility> #include <utility>
...@@ -71,7 +73,7 @@ __global__ void _count_frequency( ...@@ -71,7 +73,7 @@ __global__ void _count_frequency(
} }
} }
using BlockReduce = typename cub::BlockReduce<IdxType, BLOCK_SIZE>; using BlockReduce = typename hipcub::BlockReduce<IdxType, BLOCK_SIZE>;
__shared__ typename BlockReduce::TempStorage temp_space; __shared__ typename BlockReduce::TempStorage temp_space;
count = BlockReduce(temp_space).Sum(count); count = BlockReduce(temp_space).Sum(count);
...@@ -112,7 +114,7 @@ __global__ void _compact_frequency( ...@@ -112,7 +114,7 @@ __global__ void _compact_frequency(
int64_t last_idx = start_idx + TILE_SIZE; int64_t last_idx = start_idx + TILE_SIZE;
const IdxType block_offset = edge_blocks_prefix[blockIdx.x]; const IdxType block_offset = edge_blocks_prefix[blockIdx.x];
using BlockScan = typename cub::BlockScan<IdxType, BLOCK_SIZE>; using BlockScan = typename hipcub::BlockScan<IdxType, BLOCK_SIZE>;
__shared__ typename BlockScan::TempStorage temp_space; __shared__ typename BlockScan::TempStorage temp_space;
BlockPrefixCallbackOp<IdxType> prefix_op(0); BlockPrefixCallbackOp<IdxType> prefix_op(0);
...@@ -246,7 +248,7 @@ inline __device__ IdxType DeviceEdgeHashmap<IdxType>::GetEdgeCount( ...@@ -246,7 +248,7 @@ inline __device__ IdxType DeviceEdgeHashmap<IdxType>::GetEdgeCount(
template <typename IdxType> template <typename IdxType>
FrequencyHashmap<IdxType>::FrequencyHashmap( FrequencyHashmap<IdxType>::FrequencyHashmap(
int64_t num_dst, int64_t num_items_each_dst, DGLContext ctx, int64_t num_dst, int64_t num_items_each_dst, DGLContext ctx,
cudaStream_t stream, int64_t edge_table_scale) { hipStream_t stream, int64_t edge_table_scale) {
_ctx = ctx; _ctx = ctx;
_stream = stream; _stream = stream;
num_items_each_dst = _table_size(num_items_each_dst, edge_table_scale); num_items_each_dst = _table_size(num_items_each_dst, edge_table_scale);
...@@ -259,7 +261,7 @@ FrequencyHashmap<IdxType>::FrequencyHashmap( ...@@ -259,7 +261,7 @@ FrequencyHashmap<IdxType>::FrequencyHashmap(
constexpr int TILE_SIZE = BLOCK_SIZE * 8; constexpr int TILE_SIZE = BLOCK_SIZE * 8;
dim3 block(BLOCK_SIZE); dim3 block(BLOCK_SIZE);
dim3 grid((num_dst * num_items_each_dst + TILE_SIZE - 1) / TILE_SIZE); dim3 grid((num_dst * num_items_each_dst + TILE_SIZE - 1) / TILE_SIZE);
CUDA_CALL(cudaMemset(dst_unique_edges, 0, (num_dst) * sizeof(IdxType))); CUDA_CALL(hipMemset(dst_unique_edges, 0, (num_dst) * sizeof(IdxType)));
CUDA_KERNEL_CALL( CUDA_KERNEL_CALL(
(_init_edge_table<IdxType, BLOCK_SIZE, TILE_SIZE>), grid, block, 0, (_init_edge_table<IdxType, BLOCK_SIZE, TILE_SIZE>), grid, block, 0,
_stream, edge_hashmap, (num_dst * num_items_each_dst)); _stream, edge_hashmap, (num_dst * num_items_each_dst));
...@@ -300,7 +302,7 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk( ...@@ -300,7 +302,7 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
// _edge_hashmap // _edge_hashmap
bool *is_first_position = static_cast<bool *>( bool *is_first_position = static_cast<bool *>(
device->AllocWorkspace(_ctx, sizeof(bool) * (num_edges))); device->AllocWorkspace(_ctx, sizeof(bool) * (num_edges)));
CUDA_CALL(cudaMemset(is_first_position, 0, sizeof(bool) * (num_edges))); CUDA_CALL(hipMemset(is_first_position, 0, sizeof(bool) * (num_edges)));
// double space to use ExclusiveSum // double space to use ExclusiveSum
auto edge_blocks_prefix_data = static_cast<IdxType *>(device->AllocWorkspace( auto edge_blocks_prefix_data = static_cast<IdxType *>(device->AllocWorkspace(
_ctx, 2 * sizeof(IdxType) * (num_edge_blocks + 1))); _ctx, 2 * sizeof(IdxType) * (num_edge_blocks + 1)));
...@@ -327,11 +329,11 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk( ...@@ -327,11 +329,11 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
// 2.1 ExclusiveSum the edge_blocks_prefix // 2.1 ExclusiveSum the edge_blocks_prefix
void *d_temp_storage = nullptr; void *d_temp_storage = nullptr;
size_t temp_storage_bytes = 0; size_t temp_storage_bytes = 0;
CUDA_CALL(cub::DeviceScan::ExclusiveSum( CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
d_temp_storage, temp_storage_bytes, edge_blocks_prefix, d_temp_storage, temp_storage_bytes, edge_blocks_prefix,
edge_blocks_prefix_alternate, num_edge_blocks + 1, _stream)); edge_blocks_prefix_alternate, num_edge_blocks + 1, _stream));
d_temp_storage = device->AllocWorkspace(_ctx, temp_storage_bytes); d_temp_storage = device->AllocWorkspace(_ctx, temp_storage_bytes);
CUDA_CALL(cub::DeviceScan::ExclusiveSum( CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
d_temp_storage, temp_storage_bytes, edge_blocks_prefix, d_temp_storage, temp_storage_bytes, edge_blocks_prefix,
edge_blocks_prefix_alternate, num_edge_blocks + 1, _stream)); edge_blocks_prefix_alternate, num_edge_blocks + 1, _stream));
device->FreeWorkspace(_ctx, d_temp_storage); device->FreeWorkspace(_ctx, d_temp_storage);
...@@ -365,19 +367,19 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk( ...@@ -365,19 +367,19 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
// 3.1 ExclusiveSum the num_unique_each_node // 3.1 ExclusiveSum the num_unique_each_node
d_temp_storage = nullptr; d_temp_storage = nullptr;
temp_storage_bytes = 0; temp_storage_bytes = 0;
CUDA_CALL(cub::DeviceScan::ExclusiveSum( CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
d_temp_storage, temp_storage_bytes, num_unique_each_node, d_temp_storage, temp_storage_bytes, num_unique_each_node,
num_unique_each_node_alternate, num_dst_nodes + 1, _stream)); num_unique_each_node_alternate, num_dst_nodes + 1, _stream));
d_temp_storage = device->AllocWorkspace(_ctx, temp_storage_bytes); d_temp_storage = device->AllocWorkspace(_ctx, temp_storage_bytes);
CUDA_CALL(cub::DeviceScan::ExclusiveSum( CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
d_temp_storage, temp_storage_bytes, num_unique_each_node, d_temp_storage, temp_storage_bytes, num_unique_each_node,
num_unique_each_node_alternate, num_dst_nodes + 1, _stream)); num_unique_each_node_alternate, num_dst_nodes + 1, _stream));
device->FreeWorkspace(_ctx, d_temp_storage); device->FreeWorkspace(_ctx, d_temp_storage);
// 3.2 SegmentedRadixSort the unique_src_edges and unique_frequency // 3.2 SegmentedRadixSort the unique_src_edges and unique_frequency
// Create a set of DoubleBuffers to wrap pairs of device pointers // Create a set of DoubleBuffers to wrap pairs of device pointers
cub::DoubleBuffer<Idx64Type> d_unique_frequency( hipcub::DoubleBuffer<Idx64Type> d_unique_frequency(
unique_frequency, unique_frequency_alternate); unique_frequency, unique_frequency_alternate);
cub::DoubleBuffer<IdxType> d_unique_src_edges( hipcub::DoubleBuffer<IdxType> d_unique_src_edges(
unique_src_edges, unique_src_edges_alternate); unique_src_edges, unique_src_edges_alternate);
// Determine temporary device storage requirements // Determine temporary device storage requirements
d_temp_storage = nullptr; d_temp_storage = nullptr;
...@@ -385,12 +387,12 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk( ...@@ -385,12 +387,12 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
// the DeviceRadixSort is faster than DeviceSegmentedRadixSort, // the DeviceRadixSort is faster than DeviceSegmentedRadixSort,
// especially when num_dst_nodes is large (about ~10000) // especially when num_dst_nodes is large (about ~10000)
if (dtype.bits == 32) { if (dtype.bits == 32) {
CUDA_CALL(cub::DeviceRadixSort::SortPairsDescending( CUDA_CALL(hipcub::DeviceRadixSort::SortPairsDescending(
d_temp_storage, temp_storage_bytes, d_unique_frequency, d_temp_storage, temp_storage_bytes, d_unique_frequency,
d_unique_src_edges, num_unique_edges, 0, sizeof(Idx64Type) * 8, d_unique_src_edges, num_unique_edges, 0, sizeof(Idx64Type) * 8,
_stream)); _stream));
} else { } else {
CUDA_CALL(cub::DeviceSegmentedRadixSort::SortPairsDescending( CUDA_CALL(hipcub::DeviceSegmentedRadixSort::SortPairsDescending(
d_temp_storage, temp_storage_bytes, d_unique_frequency, d_temp_storage, temp_storage_bytes, d_unique_frequency,
d_unique_src_edges, num_unique_edges, num_dst_nodes, d_unique_src_edges, num_unique_edges, num_dst_nodes,
num_unique_each_node_alternate, num_unique_each_node_alternate + 1, 0, num_unique_each_node_alternate, num_unique_each_node_alternate + 1, 0,
...@@ -398,12 +400,12 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk( ...@@ -398,12 +400,12 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
} }
d_temp_storage = device->AllocWorkspace(_ctx, temp_storage_bytes); d_temp_storage = device->AllocWorkspace(_ctx, temp_storage_bytes);
if (dtype.bits == 32) { if (dtype.bits == 32) {
CUDA_CALL(cub::DeviceRadixSort::SortPairsDescending( CUDA_CALL(hipcub::DeviceRadixSort::SortPairsDescending(
d_temp_storage, temp_storage_bytes, d_unique_frequency, d_temp_storage, temp_storage_bytes, d_unique_frequency,
d_unique_src_edges, num_unique_edges, 0, sizeof(Idx64Type) * 8, d_unique_src_edges, num_unique_edges, 0, sizeof(Idx64Type) * 8,
_stream)); _stream));
} else { } else {
CUDA_CALL(cub::DeviceSegmentedRadixSort::SortPairsDescending( CUDA_CALL(hipcub::DeviceSegmentedRadixSort::SortPairsDescending(
d_temp_storage, temp_storage_bytes, d_unique_frequency, d_temp_storage, temp_storage_bytes, d_unique_frequency,
d_unique_src_edges, num_unique_edges, num_dst_nodes, d_unique_src_edges, num_unique_edges, num_dst_nodes,
num_unique_each_node_alternate, num_unique_each_node_alternate + 1, 0, num_unique_each_node_alternate, num_unique_each_node_alternate + 1, 0,
...@@ -422,11 +424,11 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk( ...@@ -422,11 +424,11 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
// use unique_output_offsets; // use unique_output_offsets;
d_temp_storage = nullptr; d_temp_storage = nullptr;
temp_storage_bytes = 0; temp_storage_bytes = 0;
CUDA_CALL(cub::DeviceScan::ExclusiveSum( CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
d_temp_storage, temp_storage_bytes, num_unique_each_node, d_temp_storage, temp_storage_bytes, num_unique_each_node,
unique_output_offsets, num_dst_nodes + 1, _stream)); unique_output_offsets, num_dst_nodes + 1, _stream));
d_temp_storage = device->AllocWorkspace(_ctx, temp_storage_bytes); d_temp_storage = device->AllocWorkspace(_ctx, temp_storage_bytes);
CUDA_CALL(cub::DeviceScan::ExclusiveSum( CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
d_temp_storage, temp_storage_bytes, num_unique_each_node, d_temp_storage, temp_storage_bytes, num_unique_each_node,
unique_output_offsets, num_dst_nodes + 1, _stream)); unique_output_offsets, num_dst_nodes + 1, _stream));
device->FreeWorkspace(_ctx, d_temp_storage); device->FreeWorkspace(_ctx, d_temp_storage);
......
// !!! This is a file automatically generated by hipify!!!
/** /**
* Copyright (c) 2021 by Contributors * Copyright (c) 2021 by Contributors
* @file graph/sampling/get_node_types_gpu.cu * @file graph/sampling/get_node_types_gpu.cu
* @brief DGL sampler * @brief DGL sampler
*/ */
#include <cuda_runtime.h> #include <hip/hip_runtime.h>
#include <dgl/array.h> #include <dgl/array.h>
#include <dgl/base_heterograph.h> #include <dgl/base_heterograph.h>
#include <dgl/runtime/device_api.h> #include <dgl/runtime/device_api.h>
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/** /**
* Copyright (c) 2021-2022 by Contributors * Copyright (c) 2021-2022 by Contributors
* @file graph/sampling/randomwalk_gpu.cu * @file graph/sampling/randomwalk_gpu.cu
* @brief CUDA random walk sampling * @brief CUDA random walk sampling
*/ */
#include <curand_kernel.h> #include <hiprand/hiprand_kernel.h>
#include <dgl/array.h> #include <dgl/array.h>
#include <dgl/base_heterograph.h> #include <dgl/base_heterograph.h>
#include <dgl/random.h> #include <dgl/random.h>
#include <dgl/runtime/device_api.h> #include <dgl/runtime/device_api.h>
#include <cub/cub.cuh> #include <hipcub/hipcub.hpp>
#include <tuple> #include <tuple>
#include <utility> #include <utility>
#include <vector> #include <vector>
...@@ -48,10 +50,10 @@ __global__ void _RandomWalkKernel( ...@@ -48,10 +50,10 @@ __global__ void _RandomWalkKernel(
int64_t last_idx = int64_t last_idx =
min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_seeds); min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_seeds);
int64_t trace_length = (max_num_steps + 1); int64_t trace_length = (max_num_steps + 1);
curandState rng; hiprandState_t rng;
// reference: // reference:
// https://docs.nvidia.com/cuda/curand/device-api-overview.html#performance-notes // https://docs.nvidia.com/cuda/curand/device-api-overview.html#performance-notes
curand_init(rand_seed + idx, 0, 0, &rng); hiprand_init(rand_seed + idx, 0, 0, &rng);
while (idx < last_idx) { while (idx < last_idx) {
IdType curr = seed_data[idx]; IdType curr = seed_data[idx];
...@@ -68,18 +70,18 @@ __global__ void _RandomWalkKernel( ...@@ -68,18 +70,18 @@ __global__ void _RandomWalkKernel(
if (deg == 0) { // the degree is zero if (deg == 0) { // the degree is zero
break; break;
} }
const int64_t num = curand(&rng) % deg; const int64_t num = hiprand(&rng) % deg;
IdType pick = graph.in_cols[in_row_start + num]; IdType pick = graph.in_cols[in_row_start + num];
IdType eid = IdType eid =
(graph.data ? graph.data[in_row_start + num] : in_row_start + num); (graph.data ? graph.data[in_row_start + num] : in_row_start + num);
*traces_data_ptr = pick; *traces_data_ptr = pick;
*eids_data_ptr = eid; *eids_data_ptr = eid;
if ((restart_prob_size > 1) && if ((restart_prob_size > 1) &&
(curand_uniform(&rng) < restart_prob_data[step_idx])) { (hiprand_uniform(&rng) < restart_prob_data[step_idx])) {
break; break;
} else if ( } else if (
(restart_prob_size == 1) && (restart_prob_size == 1) &&
(curand_uniform(&rng) < restart_prob_data[0])) { (hiprand_uniform(&rng) < restart_prob_data[0])) {
break; break;
} }
++traces_data_ptr; ++traces_data_ptr;
...@@ -107,10 +109,10 @@ __global__ void _RandomWalkBiasedKernel( ...@@ -107,10 +109,10 @@ __global__ void _RandomWalkBiasedKernel(
int64_t last_idx = int64_t last_idx =
min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_seeds); min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_seeds);
int64_t trace_length = (max_num_steps + 1); int64_t trace_length = (max_num_steps + 1);
curandState rng; hiprandState_t rng;
// reference: // reference:
// https://docs.nvidia.com/cuda/curand/device-api-overview.html#performance-notes // https://docs.nvidia.com/cuda/curand/device-api-overview.html#performance-notes
curand_init(rand_seed + idx, 0, 0, &rng); hiprand_init(rand_seed + idx, 0, 0, &rng);
while (idx < last_idx) { while (idx < last_idx) {
IdType curr = seed_data[idx]; IdType curr = seed_data[idx];
...@@ -133,9 +135,9 @@ __global__ void _RandomWalkBiasedKernel( ...@@ -133,9 +135,9 @@ __global__ void _RandomWalkBiasedKernel(
const FloatType *prob = probs[metapath_id]; const FloatType *prob = probs[metapath_id];
int64_t num; int64_t num;
if (prob == nullptr) { if (prob == nullptr) {
num = curand(&rng) % deg; num = hiprand(&rng) % deg;
} else { } else {
auto rnd_sum_w = prob_sum[curr] * curand_uniform(&rng); auto rnd_sum_w = prob_sum[curr] * hiprand_uniform(&rng);
FloatType sum_w{0.}; FloatType sum_w{0.};
for (num = 0; num < deg; ++num) { for (num = 0; num < deg; ++num) {
sum_w += prob[in_row_start + num]; sum_w += prob[in_row_start + num];
...@@ -149,11 +151,11 @@ __global__ void _RandomWalkBiasedKernel( ...@@ -149,11 +151,11 @@ __global__ void _RandomWalkBiasedKernel(
*traces_data_ptr = pick; *traces_data_ptr = pick;
*eids_data_ptr = eid; *eids_data_ptr = eid;
if ((restart_prob_size > 1) && if ((restart_prob_size > 1) &&
(curand_uniform(&rng) < restart_prob_data[step_idx])) { (hiprand_uniform(&rng) < restart_prob_data[step_idx])) {
break; break;
} else if ( } else if (
(restart_prob_size == 1) && (restart_prob_size == 1) &&
(curand_uniform(&rng) < restart_prob_data[0])) { (hiprand_uniform(&rng) < restart_prob_data[0])) {
break; break;
} }
++traces_data_ptr; ++traces_data_ptr;
...@@ -202,7 +204,7 @@ std::pair<IdArray, IdArray> RandomWalkUniform( ...@@ -202,7 +204,7 @@ std::pair<IdArray, IdArray> RandomWalkUniform(
: nullptr); : nullptr);
} }
// use cuda stream from local thread // use cuda stream from local thread
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
auto device = DeviceAPI::Get(ctx); auto device = DeviceAPI::Get(ctx);
auto d_graphs = static_cast<GraphKernelData<IdType> *>(device->AllocWorkspace( auto d_graphs = static_cast<GraphKernelData<IdType> *>(device->AllocWorkspace(
ctx, (num_etypes) * sizeof(GraphKernelData<IdType>))); ctx, (num_etypes) * sizeof(GraphKernelData<IdType>)));
...@@ -222,7 +224,7 @@ std::pair<IdArray, IdArray> RandomWalkUniform( ...@@ -222,7 +224,7 @@ std::pair<IdArray, IdArray> RandomWalkUniform(
const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000); const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000);
ATEN_FLOAT_TYPE_SWITCH( ATEN_FLOAT_TYPE_SWITCH(
restart_prob->dtype, FloatType, "random walk GPU kernel", { restart_prob->dtype, FloatType, "random walk GPU kernel", {
CHECK(restart_prob->ctx.device_type == kDGLCUDA) CHECK(restart_prob->ctx.device_type == kDGLCUDA||restart_prob->ctx.device_type == kDGLROCM)
<< "restart prob should be in GPU."; << "restart prob should be in GPU.";
CHECK(restart_prob->ndim == 1) << "restart prob dimension should be 1."; CHECK(restart_prob->ndim == 1) << "restart prob dimension should be 1.";
const FloatType *restart_prob_data = restart_prob.Ptr<FloatType>(); const FloatType *restart_prob_data = restart_prob.Ptr<FloatType>();
...@@ -263,7 +265,7 @@ std::pair<IdArray, IdArray> RandomWalkBiased( ...@@ -263,7 +265,7 @@ std::pair<IdArray, IdArray> RandomWalkBiased(
IdType *traces_data = traces.Ptr<IdType>(); IdType *traces_data = traces.Ptr<IdType>();
IdType *eids_data = eids.Ptr<IdType>(); IdType *eids_data = eids.Ptr<IdType>();
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
auto device = DeviceAPI::Get(ctx); auto device = DeviceAPI::Get(ctx);
// new probs and prob sums pointers // new probs and prob sums pointers
assert(num_etypes == static_cast<int64_t>(prob.size())); assert(num_etypes == static_cast<int64_t>(prob.size()));
...@@ -297,11 +299,11 @@ std::pair<IdArray, IdArray> RandomWalkBiased( ...@@ -297,11 +299,11 @@ std::pair<IdArray, IdArray> RandomWalkBiased(
// calculate the sum of the neighbor weights // calculate the sum of the neighbor weights
const IdType *d_offsets = static_cast<const IdType *>(csr.indptr->data); const IdType *d_offsets = static_cast<const IdType *>(csr.indptr->data);
size_t temp_storage_size = 0; size_t temp_storage_size = 0;
CUDA_CALL(cub::DeviceSegmentedReduce::Sum( CUDA_CALL(hipcub::DeviceSegmentedReduce::Sum(
nullptr, temp_storage_size, probs[etype], prob_sums[etype], nullptr, temp_storage_size, probs[etype], prob_sums[etype],
num_segments, d_offsets, d_offsets + 1, stream)); num_segments, d_offsets, d_offsets + 1, stream));
void *temp_storage = device->AllocWorkspace(ctx, temp_storage_size); void *temp_storage = device->AllocWorkspace(ctx, temp_storage_size);
CUDA_CALL(cub::DeviceSegmentedReduce::Sum( CUDA_CALL(hipcub::DeviceSegmentedReduce::Sum(
temp_storage, temp_storage_size, probs[etype], prob_sums[etype], temp_storage, temp_storage_size, probs[etype], prob_sums[etype],
num_segments, d_offsets, d_offsets + 1, stream)); num_segments, d_offsets, d_offsets + 1, stream));
device->FreeWorkspace(ctx, temp_storage); device->FreeWorkspace(ctx, temp_storage);
...@@ -335,7 +337,7 @@ std::pair<IdArray, IdArray> RandomWalkBiased( ...@@ -335,7 +337,7 @@ std::pair<IdArray, IdArray> RandomWalkBiased(
dim3 block(256); dim3 block(256);
dim3 grid((num_seeds + TILE_SIZE - 1) / TILE_SIZE); dim3 grid((num_seeds + TILE_SIZE - 1) / TILE_SIZE);
const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000); const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000);
CHECK(restart_prob->ctx.device_type == kDGLCUDA) CHECK(restart_prob->ctx.device_type == kDGLCUDA ||restart_prob->ctx.device_type == kDGLROCM)
<< "restart prob should be in GPU."; << "restart prob should be in GPU.";
CHECK(restart_prob->ndim == 1) << "restart prob dimension should be 1."; CHECK(restart_prob->ndim == 1) << "restart prob dimension should be 1.";
const FloatType *restart_prob_data = restart_prob.Ptr<FloatType>(); const FloatType *restart_prob_data = restart_prob.Ptr<FloatType>();
...@@ -396,7 +398,7 @@ std::pair<IdArray, IdArray> RandomWalkWithRestart( ...@@ -396,7 +398,7 @@ std::pair<IdArray, IdArray> RandomWalkWithRestart(
auto device = dgl::runtime::DeviceAPI::Get(device_ctx); auto device = dgl::runtime::DeviceAPI::Get(device_ctx);
// use cuda stream from local thread // use cuda stream from local thread
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
device->CopyDataFromTo( device->CopyDataFromTo(
&restart_prob, 0, restart_prob_array.Ptr<double>(), 0, sizeof(double), &restart_prob, 0, restart_prob_array.Ptr<double>(), 0, sizeof(double),
DGLContext{kDGLCPU, 0}, device_ctx, restart_prob_array->dtype); DGLContext{kDGLCPU, 0}, device_ctx, restart_prob_array->dtype);
...@@ -443,13 +445,13 @@ template <DGLDeviceType XPU, typename IdxType> ...@@ -443,13 +445,13 @@ template <DGLDeviceType XPU, typename IdxType>
std::tuple<IdArray, IdArray, IdArray> SelectPinSageNeighbors( std::tuple<IdArray, IdArray, IdArray> SelectPinSageNeighbors(
const IdArray src, const IdArray dst, const int64_t num_samples_per_node, const IdArray src, const IdArray dst, const int64_t num_samples_per_node,
const int64_t k) { const int64_t k) {
CHECK(src->ctx.device_type == kDGLCUDA) << "IdArray needs be on GPU!"; CHECK(src->ctx.device_type == kDGLCUDA || src->ctx.device_type == kDGLROCM) << "IdArray needs be on GPU!";
const IdxType *src_data = src.Ptr<IdxType>(); const IdxType *src_data = src.Ptr<IdxType>();
const IdxType *dst_data = dst.Ptr<IdxType>(); const IdxType *dst_data = dst.Ptr<IdxType>();
const int64_t num_dst_nodes = (dst->shape[0] / num_samples_per_node); const int64_t num_dst_nodes = (dst->shape[0] / num_samples_per_node);
auto ctx = src->ctx; auto ctx = src->ctx;
// use cuda stream from local thread // use cuda stream from local thread
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
auto frequency_hashmap = FrequencyHashmap<IdxType>( auto frequency_hashmap = FrequencyHashmap<IdxType>(
num_dst_nodes, num_samples_per_node, ctx, stream); num_dst_nodes, num_samples_per_node, ctx, stream);
auto ret = frequency_hashmap.Topk( auto ret = frequency_hashmap.Topk(
......
// !!! This is a file automatically generated by hipify!!!
/** /**
* Copyright (c) 2019 by Contributors * Copyright (c) 2019 by Contributors
* @file graph/serialize/heterograph_serialize.cc * @file graph/serialize/heterograph_serialize.cc
...@@ -48,8 +49,8 @@ ...@@ -48,8 +49,8 @@
#include <vector> #include <vector>
#include "../heterograph.h" #include "../heterograph.h"
#include "./dglstream.h" #include "dglstream.h"
#include "./graph_serialize.h" #include "graph_serialize.h"
#include "dmlc/memory_io.h" #include "dmlc/memory_io.h"
namespace dgl { namespace dgl {
......
// !!! This is a file automatically generated by hipify!!!
/** /**
* Copyright (c) 2020 by Contributors * Copyright (c) 2020 by Contributors
* @file graph/subgraph.cc * @file graph/subgraph.cc
* @brief Functions for extracting subgraphs. * @brief Functions for extracting subgraphs.
*/ */
#include "./heterograph.h" #include "heterograph.h"
using namespace dgl::runtime; using namespace dgl::runtime;
namespace dgl { namespace dgl {
......
// !!! This is a file automatically generated by hipify!!!
/** /**
* Copyright 2021 Contributors * Copyright 2021 Contributors
* *
...@@ -18,7 +19,7 @@ ...@@ -18,7 +19,7 @@
* all given graphs with the same set of nodes. * all given graphs with the same set of nodes.
*/ */
#include <cuda_runtime.h> #include <hip/hip_runtime.h>
#include <dgl/immutable_graph.h> #include <dgl/immutable_graph.h>
#include <dgl/runtime/device_api.h> #include <dgl/runtime/device_api.h>
...@@ -55,10 +56,10 @@ template <typename IdType> ...@@ -55,10 +56,10 @@ template <typename IdType>
void BuildNodeMaps( void BuildNodeMaps(
const std::vector<IdArray> &input_nodes, const std::vector<IdArray> &input_nodes,
DeviceNodeMap<IdType> *const node_maps, int64_t *const count_unique_device, DeviceNodeMap<IdType> *const node_maps, int64_t *const count_unique_device,
std::vector<IdArray> *const unique_nodes_device, cudaStream_t stream) { std::vector<IdArray> *const unique_nodes_device, hipStream_t stream) {
const int64_t num_ntypes = static_cast<int64_t>(input_nodes.size()); const int64_t num_ntypes = static_cast<int64_t>(input_nodes.size());
CUDA_CALL(cudaMemsetAsync( CUDA_CALL(hipMemsetAsync(
count_unique_device, 0, num_ntypes * sizeof(*count_unique_device), count_unique_device, 0, num_ntypes * sizeof(*count_unique_device),
stream)); stream));
...@@ -81,7 +82,7 @@ std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>> CompactGraphsGPU( ...@@ -81,7 +82,7 @@ std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>> CompactGraphsGPU(
const std::vector<IdArray> &always_preserve) { const std::vector<IdArray> &always_preserve) {
const auto &ctx = graphs[0]->Context(); const auto &ctx = graphs[0]->Context();
auto device = runtime::DeviceAPI::Get(ctx); auto device = runtime::DeviceAPI::Get(ctx);
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
CHECK_EQ(ctx.device_type, kDGLCUDA); CHECK_EQ(ctx.device_type, kDGLCUDA);
......
// !!! This is a file automatically generated by hipify!!!
/** /**
* Copyright 2020-2022 Contributors * Copyright 2020-2022 Contributors
* *
...@@ -22,7 +23,7 @@ ...@@ -22,7 +23,7 @@
#include <dgl/runtime/c_runtime_api.h> #include <dgl/runtime/c_runtime_api.h>
#include <dgl/base_heterograph.h> #include <dgl/base_heterograph.h>
#include <cuda_runtime.h> #include <hip/hip_runtime.h>
#include <dgl/runtime/c_runtime_api.h> #include <dgl/runtime/c_runtime_api.h>
#include <algorithm> #include <algorithm>
...@@ -113,7 +114,7 @@ class DeviceNodeMap { ...@@ -113,7 +114,7 @@ class DeviceNodeMap {
DeviceNodeMap( DeviceNodeMap(
const std::vector<int64_t>& num_nodes, const int64_t offset, const std::vector<int64_t>& num_nodes, const int64_t offset,
DGLContext ctx, cudaStream_t stream) DGLContext ctx, hipStream_t stream)
: num_types_(num_nodes.size()), : num_types_(num_nodes.size()),
rhs_offset_(offset), rhs_offset_(offset),
hash_tables_(), hash_tables_(),
...@@ -185,7 +186,7 @@ inline IdType RoundUp(const IdType num, const size_t unit) { ...@@ -185,7 +186,7 @@ inline IdType RoundUp(const IdType num, const size_t unit) {
template <typename IdType> template <typename IdType>
std::tuple<std::vector<IdArray>, std::vector<IdArray>> MapEdges( std::tuple<std::vector<IdArray>, std::vector<IdArray>> MapEdges(
HeteroGraphPtr graph, const std::vector<EdgeArray>& edge_sets, HeteroGraphPtr graph, const std::vector<EdgeArray>& edge_sets,
const DeviceNodeMap<IdType>& node_map, cudaStream_t stream) { const DeviceNodeMap<IdType>& node_map, hipStream_t stream) {
constexpr const int BLOCK_SIZE = 128; constexpr const int BLOCK_SIZE = 128;
constexpr const size_t TILE_SIZE = 1024; constexpr const size_t TILE_SIZE = 1024;
......
// !!! This is a file automatically generated by hipify!!!
/** /**
* Copyright 2020-2021 Contributors * Copyright 2020-2021 Contributors
* *
...@@ -20,7 +21,7 @@ ...@@ -20,7 +21,7 @@
* Tested via python wrapper: python/dgl/path/to/to_block.py * Tested via python wrapper: python/dgl/path/to/to_block.py
*/ */
#include <cuda_runtime.h> #include <hip/hip_runtime.h>
#include <dgl/immutable_graph.h> #include <dgl/immutable_graph.h>
#include <dgl/runtime/device_api.h> #include <dgl/runtime/device_api.h>
#include <dgl/runtime/tensordispatch.h> #include <dgl/runtime/tensordispatch.h>
...@@ -69,10 +70,10 @@ class DeviceNodeMapMaker { ...@@ -69,10 +70,10 @@ class DeviceNodeMapMaker {
const std::vector<IdArray>& lhs_nodes, const std::vector<IdArray>& lhs_nodes,
const std::vector<IdArray>& rhs_nodes, const std::vector<IdArray>& rhs_nodes,
DeviceNodeMap<IdType>* const node_maps, int64_t* const count_lhs_device, DeviceNodeMap<IdType>* const node_maps, int64_t* const count_lhs_device,
std::vector<IdArray>* const lhs_device, cudaStream_t stream) { std::vector<IdArray>* const lhs_device, hipStream_t stream) {
const int64_t num_ntypes = lhs_nodes.size() + rhs_nodes.size(); const int64_t num_ntypes = lhs_nodes.size() + rhs_nodes.size();
CUDA_CALL(cudaMemsetAsync( CUDA_CALL(hipMemsetAsync(
count_lhs_device, 0, num_ntypes * sizeof(*count_lhs_device), stream)); count_lhs_device, 0, num_ntypes * sizeof(*count_lhs_device), stream));
// possibly duplicate lhs nodes // possibly duplicate lhs nodes
...@@ -112,7 +113,7 @@ class DeviceNodeMapMaker { ...@@ -112,7 +113,7 @@ class DeviceNodeMapMaker {
void Make( void Make(
const std::vector<IdArray>& lhs_nodes, const std::vector<IdArray>& lhs_nodes,
const std::vector<IdArray>& rhs_nodes, const std::vector<IdArray>& rhs_nodes,
DeviceNodeMap<IdType>* const node_maps, cudaStream_t stream) { DeviceNodeMap<IdType>* const node_maps, hipStream_t stream) {
const int64_t num_ntypes = lhs_nodes.size() + rhs_nodes.size(); const int64_t num_ntypes = lhs_nodes.size() + rhs_nodes.size();
// unique lhs nodes // unique lhs nodes
...@@ -155,7 +156,7 @@ struct CUDAIdsMapper { ...@@ -155,7 +156,7 @@ struct CUDAIdsMapper {
std::vector<int64_t>& num_nodes_per_type = *num_nodes_per_type_ptr; std::vector<int64_t>& num_nodes_per_type = *num_nodes_per_type_ptr;
const bool generate_lhs_nodes = lhs_nodes.empty(); const bool generate_lhs_nodes = lhs_nodes.empty();
auto device = runtime::DeviceAPI::Get(ctx); auto device = runtime::DeviceAPI::Get(ctx);
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
// Allocate space for map creation process. // Allocate space for map creation process.
DeviceNodeMapMaker<IdType> maker(maxNodesPerType); DeviceNodeMapMaker<IdType> maker(maxNodesPerType);
...@@ -168,7 +169,7 @@ struct CUDAIdsMapper { ...@@ -168,7 +169,7 @@ struct CUDAIdsMapper {
} }
} }
cudaEvent_t copyEvent; hipEvent_t copyEvent;
NDArray new_len_tensor; NDArray new_len_tensor;
// Populate the mappings. // Populate the mappings.
if (generate_lhs_nodes) { if (generate_lhs_nodes) {
...@@ -179,7 +180,7 @@ struct CUDAIdsMapper { ...@@ -179,7 +180,7 @@ struct CUDAIdsMapper {
src_nodes, rhs_nodes, &node_maps, count_lhs_device, &lhs_nodes, src_nodes, rhs_nodes, &node_maps, count_lhs_device, &lhs_nodes,
stream); stream);
CUDA_CALL(cudaEventCreate(&copyEvent)); CUDA_CALL(hipEventCreate(&copyEvent));
if (TensorDispatcher::Global()->IsAvailable()) { if (TensorDispatcher::Global()->IsAvailable()) {
new_len_tensor = NDArray::PinnedEmpty( new_len_tensor = NDArray::PinnedEmpty(
{num_ntypes}, DGLDataTypeTraits<int64_t>::dtype, {num_ntypes}, DGLDataTypeTraits<int64_t>::dtype,
...@@ -190,11 +191,11 @@ struct CUDAIdsMapper { ...@@ -190,11 +191,11 @@ struct CUDAIdsMapper {
{num_ntypes}, DGLDataTypeTraits<int64_t>::dtype, {num_ntypes}, DGLDataTypeTraits<int64_t>::dtype,
DGLContext{kDGLCPU, 0}); DGLContext{kDGLCPU, 0});
} }
CUDA_CALL(cudaMemcpyAsync( CUDA_CALL(hipMemcpyAsync(
new_len_tensor->data, count_lhs_device, new_len_tensor->data, count_lhs_device,
sizeof(*num_nodes_per_type.data()) * num_ntypes, sizeof(*num_nodes_per_type.data()) * num_ntypes,
cudaMemcpyDeviceToHost, stream)); hipMemcpyDeviceToHost, stream));
CUDA_CALL(cudaEventRecord(copyEvent, stream)); CUDA_CALL(hipEventRecord(copyEvent, stream));
device->FreeWorkspace(ctx, count_lhs_device); device->FreeWorkspace(ctx, count_lhs_device);
} else { } else {
...@@ -209,8 +210,8 @@ struct CUDAIdsMapper { ...@@ -209,8 +210,8 @@ struct CUDAIdsMapper {
if (generate_lhs_nodes) { if (generate_lhs_nodes) {
// wait for the previous copy // wait for the previous copy
CUDA_CALL(cudaEventSynchronize(copyEvent)); CUDA_CALL(hipEventSynchronize(copyEvent));
CUDA_CALL(cudaEventDestroy(copyEvent)); CUDA_CALL(hipEventDestroy(copyEvent));
// Resize lhs nodes. // Resize lhs nodes.
for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) { for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) {
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment