update src and graphbolt code

6ac701f8 · sangwzh · 1547bd93 · 6ac701f8 · 6ac701f8 · 6ac701f8
Commit 6ac701f8 authored Sep 13, 2024 by sangwzh
20 changed files
--- a/src/array/libra_partition.cc
+++ b/src/array/libra_partition.cc
+// !!! This is a file automatically generated by hipify!!!
 /**
 *  Copyright (c) 2021 Intel Corporation
 *
@@ -21,7 +22,7 @@
 #include <vector>

 #include "../c_api_common.h"
-#include "./check.h"
+#include "check.h"
 #include "kernel_decl.h"

 using namespace dgl::runtime;

--- a/src/array/selector.h
+++ b/src/array/selector.h
+// !!! This is a file automatically generated by hipify!!!
 /**
 *  Copyright (c) 2020 by Contributors
 * @file array/selector.h
@@ -12,13 +13,13 @@ namespace dgl {

 namespace {

-#ifdef __CUDACC__
-#define DGLDEVICE __device__
+#ifdef __HIPCC__
+#define DGLDEVICE __device__ __host__
 #define DGLINLINE __forceinline__
 #else
 #define DGLDEVICE
 #define DGLINLINE inline
-#endif  // __CUDACC__
+#endif  // __HIPCC__

 }  // namespace


--- a/src/array/uvm_array.cc
+++ b/src/array/uvm_array.cc
+// !!! This is a file automatically generated by hipify!!!
 /**
 *  Copyright (c) 2019-2022 by Contributors
 * @file array/uvm_array.cc
@@ -8,7 +9,7 @@
 #include <sstream>

 #include "../c_api_common.h"
-#include "./uvm_array_op.h"
+#include "uvm_array_op.h"

 using namespace dgl::runtime;


--- a/src/geometry/cuda/edge_coarsening_impl.cu
+++ b/src/geometry/cuda/edge_coarsening_impl.cu
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
 *  Copyright (c) 2019 by Contributors
 * @file geometry/cuda/edge_coarsening_impl.cu
 * @brief Edge coarsening CUDA implementation
 */
-#include <curand_kernel.h>
+#include <hiprand/hiprand_kernel.h>
 #include <dgl/array.h>
 #include <dgl/random.h>
 #include <dmlc/thread_local.h>
@@ -32,9 +34,9 @@ __global__ void generate_uniform_kernel(
    float *ret_values, size_t num, uint64_t seed) {
  size_t id = blockIdx.x * blockDim.x + threadIdx.x;
  if (id < num) {
-    curandState state;
-    curand_init(seed, id, 0, &state);
-    ret_values[id] = curand_uniform(&state);
+    hiprandState_t state;
+    hiprand_init(seed, id, 0, &state);
+    ret_values[id] = hiprand_uniform(&state);
  }
 }

@@ -116,7 +118,7 @@ __global__ void weighted_respond_kernel(
 template <typename IdType>
 bool Colorize(IdType *result_data, int64_t num_nodes, float *const prop) {
  // initial done signal
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  CUDA_KERNEL_CALL(init_done_kernel, 1, 1, 0, stream);

  // generate color prop for each node
@@ -132,8 +134,8 @@ bool Colorize(IdType *result_data, int64_t num_nodes, float *const prop) {
      colorize_kernel, num_blocks, num_threads, 0, stream, prop, num_nodes,
      result_data);
  bool done_h = false;
-  CUDA_CALL(cudaMemcpyFromSymbol(
-      &done_h, done_d, sizeof(done_h), 0, cudaMemcpyDeviceToHost));
+  CUDA_CALL(hipMemcpyFromSymbol(
+      &done_h, done_d, sizeof(done_h), 0, hipMemcpyDeviceToHost));
  return done_h;
 }

@@ -155,7 +157,7 @@ bool Colorize(IdType *result_data, int64_t num_nodes, float *const prop) {
 template <DGLDeviceType XPU, typename FloatType, typename IdType>
 void WeightedNeighborMatching(
    const aten::CSRMatrix &csr, const NDArray weight, IdArray result) {
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  const auto &ctx = result->ctx;
  auto device = runtime::DeviceAPI::Get(ctx);
  device->SetDevice(ctx);
@@ -216,7 +218,7 @@ void NeighborMatching(const aten::CSRMatrix &csr, IdArray result) {
  device->SetDevice(ctx);

  // generate random weights
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  NDArray weight = NDArray::Empty(
      {num_edges}, DGLDataType{kDGLFloat, sizeof(float) * 8, 1}, ctx);
  float *weight_data = static_cast<float *>(weight->data);

--- a/src/geometry/cuda/geometry_op_impl.cu
+++ b/src/geometry/cuda/geometry_op_impl.cu
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
 *  Copyright (c) 2019 by Contributors
 * @file geometry/cuda/geometry_op_impl.cc
@@ -95,7 +97,7 @@ template <DGLDeviceType XPU, typename FloatType, typename IdType>
 void FarthestPointSampler(
    NDArray array, int64_t batch_size, int64_t sample_points, NDArray dist,
    IdArray start_idx, IdArray result) {
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();

  const FloatType* array_data = static_cast<FloatType*>(array->data);

@@ -110,7 +112,7 @@ void FarthestPointSampler(

  // sample for each cloud in the batch
  IdType* start_idx_data = static_cast<IdType*>(start_idx->data);
-  CUDA_CALL(cudaSetDevice(array->ctx.device_id));
+  CUDA_CALL(hipSetDevice(array->ctx.device_id));

  CUDA_KERNEL_CALL(
      fps_kernel, batch_size, THREADS, 0, stream, array_data, batch_size,

--- a/src/geometry/geometry.cc
+++ b/src/geometry/geometry.cc
+// !!! This is a file automatically generated by hipify!!!
 /**
 *  Copyright (c) 2019 by Contributors
 * @file geometry/geometry.cc
@@ -10,7 +11,7 @@

 #include "../array/check.h"
 #include "../c_api_common.h"
-#include "./geometry_op.h"
+#include "geometry_op.h"

 using namespace dgl::runtime;


--- a/src/graph/creators.cc
+++ b/src/graph/creators.cc
+// !!! This is a file automatically generated by hipify!!!
 /**
 *  Copyright (c) 2020 by Contributors
 * @file graph/creators.cc
 * @brief Functions for constructing graphs.
 */
-#include "./heterograph.h"
+#include "heterograph.h"
 using namespace dgl::runtime;

 namespace dgl {

--- a/src/graph/heterograph.cc
+++ b/src/graph/heterograph.cc
+// !!! This is a file automatically generated by hipify!!!
 /**
 *  Copyright (c) 2019 by Contributors
 * @file graph/heterograph.cc
 * @brief Heterograph implementation
 */
-#include "./heterograph.h"
+#include "heterograph.h"

 #include <dgl/array.h>
 #include <dgl/graph_serializer.h>

--- a/src/graph/heterograph.h
+++ b/src/graph/heterograph.h
+// !!! This is a file automatically generated by hipify!!!
 /**
 *  Copyright (c) 2019 by Contributors
 * @file graph/heterograph.h
@@ -18,7 +19,7 @@
 #include <utility>
 #include <vector>

-#include "./unit_graph.h"
+#include "unit_graph.h"
 #include "shared_mem_manager.h"

 namespace dgl {

--- a/src/graph/heterograph_capi.cc
+++ b/src/graph/heterograph_capi.cc
+// !!! This is a file automatically generated by hipify!!!
 /**
 *  Copyright (c) 2020 by Contributors
 * @file graph/heterograph_capi.cc
@@ -14,7 +15,7 @@
 #include <set>

 #include "../c_api_common.h"
-#include "./heterograph.h"
+#include "heterograph.h"
 #include "unit_graph.h"

 using namespace dgl::runtime;

--- a/src/graph/pickle.cc
+++ b/src/graph/pickle.cc
+// !!! This is a file automatically generated by hipify!!!
 /**
 *  Copyright (c) 2020 by Contributors
 * @file graph/pickle.cc
@@ -10,7 +11,7 @@
 #include <dmlc/memory_io.h>

 #include "../c_api_common.h"
-#include "./heterograph.h"
+#include "heterograph.h"
 #include "unit_graph.h"

 using namespace dgl::runtime;

--- a/src/graph/sampling/randomwalks/frequency_hashmap.cuh
+++ b/src/graph/sampling/randomwalks/frequency_hashmap.cuh
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
 *  Copyright (c) 2021 by Contributors
 * @file graph/sampling/frequency_hashmap.cuh
@@ -56,7 +58,7 @@ class FrequencyHashmap {
  FrequencyHashmap() = delete;
  FrequencyHashmap(
      int64_t num_dst, int64_t num_items_each_dst, DGLContext ctx,
-      cudaStream_t stream, int64_t edge_table_scale = kDefaultEdgeTableScale);
+      hipStream_t stream, int64_t edge_table_scale = kDefaultEdgeTableScale);
  ~FrequencyHashmap();
  using EdgeItem = typename DeviceEdgeHashmap<IdxType>::EdgeItem;
  std::tuple<IdArray, IdArray, IdArray> Topk(
@@ -66,7 +68,7 @@ class FrequencyHashmap {

 private:
  DGLContext _ctx;
-  cudaStream_t _stream;
+  hipStream_t _stream;
  DeviceEdgeHashmap<IdxType> *_device_edge_hashmap;
  IdxType *_dst_unique_edges;
  EdgeItem *_edge_hashmap;

--- a/src/graph/sampling/randomwalks/frequency_hashmap.cu
+++ b/src/graph/sampling/randomwalks/frequency_hashmap.cu
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
 *  Copyright (c) 2021 by Contributors
 * @file graph/sampling/frequency_hashmap.cu
@@ -5,7 +7,7 @@
 */

 #include <algorithm>
-#include <cub/cub.cuh>  // NOLINT
+#include <hipcub/hipcub.hpp>  // NOLINT
 #include <tuple>
 #include <utility>

@@ -71,7 +73,7 @@ __global__ void _count_frequency(
    }
  }

-  using BlockReduce = typename cub::BlockReduce<IdxType, BLOCK_SIZE>;
+  using BlockReduce = typename hipcub::BlockReduce<IdxType, BLOCK_SIZE>;
  __shared__ typename BlockReduce::TempStorage temp_space;

  count = BlockReduce(temp_space).Sum(count);
@@ -112,7 +114,7 @@ __global__ void _compact_frequency(
  int64_t last_idx = start_idx + TILE_SIZE;
  const IdxType block_offset = edge_blocks_prefix[blockIdx.x];

-  using BlockScan = typename cub::BlockScan<IdxType, BLOCK_SIZE>;
+  using BlockScan = typename hipcub::BlockScan<IdxType, BLOCK_SIZE>;
  __shared__ typename BlockScan::TempStorage temp_space;
  BlockPrefixCallbackOp<IdxType> prefix_op(0);

@@ -246,7 +248,7 @@ inline __device__ IdxType DeviceEdgeHashmap<IdxType>::GetEdgeCount(
 template <typename IdxType>
 FrequencyHashmap<IdxType>::FrequencyHashmap(
    int64_t num_dst, int64_t num_items_each_dst, DGLContext ctx,
-    cudaStream_t stream, int64_t edge_table_scale) {
+    hipStream_t stream, int64_t edge_table_scale) {
  _ctx = ctx;
  _stream = stream;
  num_items_each_dst = _table_size(num_items_each_dst, edge_table_scale);
@@ -259,7 +261,7 @@ FrequencyHashmap<IdxType>::FrequencyHashmap(
  constexpr int TILE_SIZE = BLOCK_SIZE * 8;
  dim3 block(BLOCK_SIZE);
  dim3 grid((num_dst * num_items_each_dst + TILE_SIZE - 1) / TILE_SIZE);
-  CUDA_CALL(cudaMemset(dst_unique_edges, 0, (num_dst) * sizeof(IdxType)));
+  CUDA_CALL(hipMemset(dst_unique_edges, 0, (num_dst) * sizeof(IdxType)));
  CUDA_KERNEL_CALL(
      (_init_edge_table<IdxType, BLOCK_SIZE, TILE_SIZE>), grid, block, 0,
      _stream, edge_hashmap, (num_dst * num_items_each_dst));
@@ -300,7 +302,7 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
  // _edge_hashmap
  bool *is_first_position = static_cast<bool *>(
      device->AllocWorkspace(_ctx, sizeof(bool) * (num_edges)));
-  CUDA_CALL(cudaMemset(is_first_position, 0, sizeof(bool) * (num_edges)));
+  CUDA_CALL(hipMemset(is_first_position, 0, sizeof(bool) * (num_edges)));
  // double space to use ExclusiveSum
  auto edge_blocks_prefix_data = static_cast<IdxType *>(device->AllocWorkspace(
      _ctx, 2 * sizeof(IdxType) * (num_edge_blocks + 1)));
@@ -327,11 +329,11 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
  // 2.1 ExclusiveSum the edge_blocks_prefix
  void *d_temp_storage = nullptr;
  size_t temp_storage_bytes = 0;
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
      d_temp_storage, temp_storage_bytes, edge_blocks_prefix,
      edge_blocks_prefix_alternate, num_edge_blocks + 1, _stream));
  d_temp_storage = device->AllocWorkspace(_ctx, temp_storage_bytes);
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
      d_temp_storage, temp_storage_bytes, edge_blocks_prefix,
      edge_blocks_prefix_alternate, num_edge_blocks + 1, _stream));
  device->FreeWorkspace(_ctx, d_temp_storage);
@@ -365,19 +367,19 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
  // 3.1 ExclusiveSum the num_unique_each_node
  d_temp_storage = nullptr;
  temp_storage_bytes = 0;
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
      d_temp_storage, temp_storage_bytes, num_unique_each_node,
      num_unique_each_node_alternate, num_dst_nodes + 1, _stream));
  d_temp_storage = device->AllocWorkspace(_ctx, temp_storage_bytes);
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
      d_temp_storage, temp_storage_bytes, num_unique_each_node,
      num_unique_each_node_alternate, num_dst_nodes + 1, _stream));
  device->FreeWorkspace(_ctx, d_temp_storage);
  // 3.2 SegmentedRadixSort the unique_src_edges and unique_frequency
  // Create a set of DoubleBuffers to wrap pairs of device pointers
-  cub::DoubleBuffer<Idx64Type> d_unique_frequency(
+  hipcub::DoubleBuffer<Idx64Type> d_unique_frequency(
      unique_frequency, unique_frequency_alternate);
-  cub::DoubleBuffer<IdxType> d_unique_src_edges(
+  hipcub::DoubleBuffer<IdxType> d_unique_src_edges(
      unique_src_edges, unique_src_edges_alternate);
  // Determine temporary device storage requirements
  d_temp_storage = nullptr;
@@ -385,12 +387,12 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
  // the DeviceRadixSort is faster than DeviceSegmentedRadixSort,
  // especially when num_dst_nodes is large (about ~10000)
  if (dtype.bits == 32) {
-    CUDA_CALL(cub::DeviceRadixSort::SortPairsDescending(
+    CUDA_CALL(hipcub::DeviceRadixSort::SortPairsDescending(
        d_temp_storage, temp_storage_bytes, d_unique_frequency,
        d_unique_src_edges, num_unique_edges, 0, sizeof(Idx64Type) * 8,
        _stream));
  } else {
-    CUDA_CALL(cub::DeviceSegmentedRadixSort::SortPairsDescending(
+    CUDA_CALL(hipcub::DeviceSegmentedRadixSort::SortPairsDescending(
        d_temp_storage, temp_storage_bytes, d_unique_frequency,
        d_unique_src_edges, num_unique_edges, num_dst_nodes,
        num_unique_each_node_alternate, num_unique_each_node_alternate + 1, 0,
@@ -398,12 +400,12 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
  }
  d_temp_storage = device->AllocWorkspace(_ctx, temp_storage_bytes);
  if (dtype.bits == 32) {
-    CUDA_CALL(cub::DeviceRadixSort::SortPairsDescending(
+    CUDA_CALL(hipcub::DeviceRadixSort::SortPairsDescending(
        d_temp_storage, temp_storage_bytes, d_unique_frequency,
        d_unique_src_edges, num_unique_edges, 0, sizeof(Idx64Type) * 8,
        _stream));
  } else {
-    CUDA_CALL(cub::DeviceSegmentedRadixSort::SortPairsDescending(
+    CUDA_CALL(hipcub::DeviceSegmentedRadixSort::SortPairsDescending(
        d_temp_storage, temp_storage_bytes, d_unique_frequency,
        d_unique_src_edges, num_unique_edges, num_dst_nodes,
        num_unique_each_node_alternate, num_unique_each_node_alternate + 1, 0,
@@ -422,11 +424,11 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
  // use unique_output_offsets;
  d_temp_storage = nullptr;
  temp_storage_bytes = 0;
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
      d_temp_storage, temp_storage_bytes, num_unique_each_node,
      unique_output_offsets, num_dst_nodes + 1, _stream));
  d_temp_storage = device->AllocWorkspace(_ctx, temp_storage_bytes);
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
      d_temp_storage, temp_storage_bytes, num_unique_each_node,
      unique_output_offsets, num_dst_nodes + 1, _stream));
  device->FreeWorkspace(_ctx, d_temp_storage);

--- a/src/graph/sampling/randomwalks/get_node_types_gpu.cu
+++ b/src/graph/sampling/randomwalks/get_node_types_gpu.cu
+// !!! This is a file automatically generated by hipify!!!
 /**
 *  Copyright (c) 2021 by Contributors
 * @file graph/sampling/get_node_types_gpu.cu
 * @brief DGL sampler
 */

-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
 #include <dgl/array.h>
 #include <dgl/base_heterograph.h>
 #include <dgl/runtime/device_api.h>

--- a/src/graph/sampling/randomwalks/randomwalk_gpu.cu
+++ b/src/graph/sampling/randomwalks/randomwalk_gpu.cu
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
 *  Copyright (c) 2021-2022 by Contributors
 * @file graph/sampling/randomwalk_gpu.cu
 * @brief CUDA random walk sampleing
 */

-#include <curand_kernel.h>
+#include <hiprand/hiprand_kernel.h>
 #include <dgl/array.h>
 #include <dgl/base_heterograph.h>
 #include <dgl/random.h>
 #include <dgl/runtime/device_api.h>

-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
 #include <tuple>
 #include <utility>
 #include <vector>
@@ -48,10 +50,10 @@ __global__ void _RandomWalkKernel(
  int64_t last_idx =
      min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_seeds);
  int64_t trace_length = (max_num_steps + 1);
-  curandState rng;
+  hiprandState_t rng;
  // reference:
-  //     https://docs.nvidia.com/cuda/curand/device-api-overview.html#performance-notes
-  curand_init(rand_seed + idx, 0, 0, &rng);
+  //     https://docs.nvidia.com/cuda/curand/device-api-overview.html#performance-notes
+  hiprand_init(rand_seed + idx, 0, 0, &rng);

  while (idx < last_idx) {
    IdType curr = seed_data[idx];
@@ -68,18 +70,18 @@ __global__ void _RandomWalkKernel(
      if (deg == 0) {  // the degree is zero
        break;
      }
-      const int64_t num = curand(&rng) % deg;
+      const int64_t num = hiprand(&rng) % deg;
      IdType pick = graph.in_cols[in_row_start + num];
      IdType eid =
          (graph.data ? graph.data[in_row_start + num] : in_row_start + num);
      *traces_data_ptr = pick;
      *eids_data_ptr = eid;
      if ((restart_prob_size > 1) &&
-          (curand_uniform(&rng) < restart_prob_data[step_idx])) {
+          (hiprand_uniform(&rng) < restart_prob_data[step_idx])) {
        break;
      } else if (
          (restart_prob_size == 1) &&
-          (curand_uniform(&rng) < restart_prob_data[0])) {
+          (hiprand_uniform(&rng) < restart_prob_data[0])) {
        break;
      }
      ++traces_data_ptr;
@@ -107,10 +109,10 @@ __global__ void _RandomWalkBiasedKernel(
  int64_t last_idx =
      min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_seeds);
  int64_t trace_length = (max_num_steps + 1);
-  curandState rng;
+  hiprandState_t rng;
  // reference:
-  //     https://docs.nvidia.com/cuda/curand/device-api-overview.html#performance-notes
-  curand_init(rand_seed + idx, 0, 0, &rng);
+  //     https://docs.nvidia.com/cuda/curand/device-api-overview.html#performance-notes
+  hiprand_init(rand_seed + idx, 0, 0, &rng);

  while (idx < last_idx) {
    IdType curr = seed_data[idx];
@@ -133,9 +135,9 @@ __global__ void _RandomWalkBiasedKernel(
      const FloatType *prob = probs[metapath_id];
      int64_t num;
      if (prob == nullptr) {
-        num = curand(&rng) % deg;
+        num = hiprand(&rng) % deg;
      } else {
-        auto rnd_sum_w = prob_sum[curr] * curand_uniform(&rng);
+        auto rnd_sum_w = prob_sum[curr] * hiprand_uniform(&rng);
        FloatType sum_w{0.};
        for (num = 0; num < deg; ++num) {
          sum_w += prob[in_row_start + num];
@@ -149,11 +151,11 @@ __global__ void _RandomWalkBiasedKernel(
      *traces_data_ptr = pick;
      *eids_data_ptr = eid;
      if ((restart_prob_size > 1) &&
-          (curand_uniform(&rng) < restart_prob_data[step_idx])) {
+          (hiprand_uniform(&rng) < restart_prob_data[step_idx])) {
        break;
      } else if (
          (restart_prob_size == 1) &&
-          (curand_uniform(&rng) < restart_prob_data[0])) {
+          (hiprand_uniform(&rng) < restart_prob_data[0])) {
        break;
      }
      ++traces_data_ptr;
@@ -202,7 +204,7 @@ std::pair<IdArray, IdArray> RandomWalkUniform(
                         : nullptr);
  }
  // use cuda stream from local thread
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  auto device = DeviceAPI::Get(ctx);
  auto d_graphs = static_cast<GraphKernelData<IdType> *>(device->AllocWorkspace(
      ctx, (num_etypes) * sizeof(GraphKernelData<IdType>)));
@@ -222,7 +224,7 @@ std::pair<IdArray, IdArray> RandomWalkUniform(
  const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000);
  ATEN_FLOAT_TYPE_SWITCH(
      restart_prob->dtype, FloatType, "random walk GPU kernel", {
-        CHECK(restart_prob->ctx.device_type == kDGLCUDA)
+        CHECK(restart_prob->ctx.device_type == kDGLCUDA||restart_prob->ctx.device_type == kDGLROCM)
            << "restart prob should be in GPU.";
        CHECK(restart_prob->ndim == 1) << "restart prob dimension should be 1.";
        const FloatType *restart_prob_data = restart_prob.Ptr<FloatType>();
@@ -263,7 +265,7 @@ std::pair<IdArray, IdArray> RandomWalkBiased(
  IdType *traces_data = traces.Ptr<IdType>();
  IdType *eids_data = eids.Ptr<IdType>();

-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  auto device = DeviceAPI::Get(ctx);
  // new probs and prob sums pointers
  assert(num_etypes == static_cast<int64_t>(prob.size()));
@@ -297,11 +299,11 @@ std::pair<IdArray, IdArray> RandomWalkBiased(
    // calculate the sum of the neighbor weights
    const IdType *d_offsets = static_cast<const IdType *>(csr.indptr->data);
    size_t temp_storage_size = 0;
-    CUDA_CALL(cub::DeviceSegmentedReduce::Sum(
+    CUDA_CALL(hipcub::DeviceSegmentedReduce::Sum(
        nullptr, temp_storage_size, probs[etype], prob_sums[etype],
        num_segments, d_offsets, d_offsets + 1, stream));
    void *temp_storage = device->AllocWorkspace(ctx, temp_storage_size);
-    CUDA_CALL(cub::DeviceSegmentedReduce::Sum(
+    CUDA_CALL(hipcub::DeviceSegmentedReduce::Sum(
        temp_storage, temp_storage_size, probs[etype], prob_sums[etype],
        num_segments, d_offsets, d_offsets + 1, stream));
    device->FreeWorkspace(ctx, temp_storage);
@@ -335,7 +337,7 @@ std::pair<IdArray, IdArray> RandomWalkBiased(
  dim3 block(256);
  dim3 grid((num_seeds + TILE_SIZE - 1) / TILE_SIZE);
  const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000);
-  CHECK(restart_prob->ctx.device_type == kDGLCUDA)
+  CHECK(restart_prob->ctx.device_type == kDGLCUDA ||restart_prob->ctx.device_type == kDGLROCM)
      << "restart prob should be in GPU.";
  CHECK(restart_prob->ndim == 1) << "restart prob dimension should be 1.";
  const FloatType *restart_prob_data = restart_prob.Ptr<FloatType>();
@@ -396,7 +398,7 @@ std::pair<IdArray, IdArray> RandomWalkWithRestart(
  auto device = dgl::runtime::DeviceAPI::Get(device_ctx);

  // use cuda stream from local thread
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  device->CopyDataFromTo(
      &restart_prob, 0, restart_prob_array.Ptr<double>(), 0, sizeof(double),
      DGLContext{kDGLCPU, 0}, device_ctx, restart_prob_array->dtype);
@@ -443,13 +445,13 @@ template <DGLDeviceType XPU, typename IdxType>
 std::tuple<IdArray, IdArray, IdArray> SelectPinSageNeighbors(
    const IdArray src, const IdArray dst, const int64_t num_samples_per_node,
    const int64_t k) {
-  CHECK(src->ctx.device_type == kDGLCUDA) << "IdArray needs be on GPU!";
+  CHECK(src->ctx.device_type == kDGLCUDA || src->ctx.device_type == kDGLROCM) << "IdArray needs be on GPU!";
  const IdxType *src_data = src.Ptr<IdxType>();
  const IdxType *dst_data = dst.Ptr<IdxType>();
  const int64_t num_dst_nodes = (dst->shape[0] / num_samples_per_node);
  auto ctx = src->ctx;
  // use cuda stream from local thread
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  auto frequency_hashmap = FrequencyHashmap<IdxType>(
      num_dst_nodes, num_samples_per_node, ctx, stream);
  auto ret = frequency_hashmap.Topk(

--- a/src/graph/serialize/heterograph_serialize.cc
+++ b/src/graph/serialize/heterograph_serialize.cc
+// !!! This is a file automatically generated by hipify!!!
 /**
 *  Copyright (c) 2019 by Contributors
 * @file graph/serialize/heterograph_serialize.cc
@@ -48,8 +49,8 @@
 #include <vector>

 #include "../heterograph.h"
-#include "./dglstream.h"
-#include "./graph_serialize.h"
+#include "dglstream.h"
+#include "graph_serialize.h"
 #include "dmlc/memory_io.h"

 namespace dgl {

--- a/src/graph/subgraph.cc
+++ b/src/graph/subgraph.cc
+// !!! This is a file automatically generated by hipify!!!
 /**
 *  Copyright (c) 2020 by Contributors
 * @file graph/subgraph.cc
 * @brief Functions for extracting subgraphs.
 */
-#include "./heterograph.h"
+#include "heterograph.h"
 using namespace dgl::runtime;

 namespace dgl {

--- a/src/graph/transform/cuda/cuda_compact_graph.cu
+++ b/src/graph/transform/cuda/cuda_compact_graph.cu
+// !!! This is a file automatically generated by hipify!!!
 /**
 *  Copyright 2021 Contributors
 *
@@ -18,7 +19,7 @@
 * all given graphs with the same set of nodes.
 */

-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
 #include <dgl/immutable_graph.h>
 #include <dgl/runtime/device_api.h>

@@ -55,10 +56,10 @@ template <typename IdType>
 void BuildNodeMaps(
    const std::vector<IdArray> &input_nodes,
    DeviceNodeMap<IdType> *const node_maps, int64_t *const count_unique_device,
-    std::vector<IdArray> *const unique_nodes_device, cudaStream_t stream) {
+    std::vector<IdArray> *const unique_nodes_device, hipStream_t stream) {
  const int64_t num_ntypes = static_cast<int64_t>(input_nodes.size());

-  CUDA_CALL(cudaMemsetAsync(
+  CUDA_CALL(hipMemsetAsync(
      count_unique_device, 0, num_ntypes * sizeof(*count_unique_device),
      stream));

@@ -81,7 +82,7 @@ std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>> CompactGraphsGPU(
    const std::vector<IdArray> &always_preserve) {
  const auto &ctx = graphs[0]->Context();
  auto device = runtime::DeviceAPI::Get(ctx);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();

  CHECK_EQ(ctx.device_type, kDGLCUDA);


--- a/src/graph/transform/cuda/cuda_map_edges.cuh
+++ b/src/graph/transform/cuda/cuda_map_edges.cuh
+// !!! This is a file automatically generated by hipify!!!
 /**
 *  Copyright 2020-2022 Contributors
 *
@@ -22,7 +23,7 @@

 #include <dgl/runtime/c_runtime_api.h>
 #include <dgl/base_heterograph.h>
-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
 #include <dgl/runtime/c_runtime_api.h>

 #include <algorithm>
@@ -113,7 +114,7 @@ class DeviceNodeMap {

  DeviceNodeMap(
      const std::vector<int64_t>& num_nodes, const int64_t offset,
-      DGLContext ctx, cudaStream_t stream)
+      DGLContext ctx, hipStream_t stream)
      : num_types_(num_nodes.size()),
        rhs_offset_(offset),
        hash_tables_(),
@@ -185,7 +186,7 @@ inline IdType RoundUp(const IdType num, const size_t unit) {
 template <typename IdType>
 std::tuple<std::vector<IdArray>, std::vector<IdArray>> MapEdges(
    HeteroGraphPtr graph, const std::vector<EdgeArray>& edge_sets,
-    const DeviceNodeMap<IdType>& node_map, cudaStream_t stream) {
+    const DeviceNodeMap<IdType>& node_map, hipStream_t stream) {
  constexpr const int BLOCK_SIZE = 128;
  constexpr const size_t TILE_SIZE = 1024;


--- a/src/graph/transform/cuda/cuda_to_block.cu
+++ b/src/graph/transform/cuda/cuda_to_block.cu
+// !!! This is a file automatically generated by hipify!!!
 /**
 *  Copyright 2020-2021 Contributors
 *
@@ -20,7 +21,7 @@
 * Tested via python wrapper: python/dgl/path/to/to_block.py
 */

-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
 #include <dgl/immutable_graph.h>
 #include <dgl/runtime/device_api.h>
 #include <dgl/runtime/tensordispatch.h>
@@ -69,10 +70,10 @@ class DeviceNodeMapMaker {
      const std::vector<IdArray>& lhs_nodes,
      const std::vector<IdArray>& rhs_nodes,
      DeviceNodeMap<IdType>* const node_maps, int64_t* const count_lhs_device,
-      std::vector<IdArray>* const lhs_device, cudaStream_t stream) {
+      std::vector<IdArray>* const lhs_device, hipStream_t stream) {
    const int64_t num_ntypes = lhs_nodes.size() + rhs_nodes.size();

-    CUDA_CALL(cudaMemsetAsync(
+    CUDA_CALL(hipMemsetAsync(
        count_lhs_device, 0, num_ntypes * sizeof(*count_lhs_device), stream));

    // possibly dublicate lhs nodes
@@ -112,7 +113,7 @@ class DeviceNodeMapMaker {
  void Make(
      const std::vector<IdArray>& lhs_nodes,
      const std::vector<IdArray>& rhs_nodes,
-      DeviceNodeMap<IdType>* const node_maps, cudaStream_t stream) {
+      DeviceNodeMap<IdType>* const node_maps, hipStream_t stream) {
    const int64_t num_ntypes = lhs_nodes.size() + rhs_nodes.size();

    // unique lhs nodes
@@ -155,7 +156,7 @@ struct CUDAIdsMapper {
    std::vector<int64_t>& num_nodes_per_type = *num_nodes_per_type_ptr;
    const bool generate_lhs_nodes = lhs_nodes.empty();
    auto device = runtime::DeviceAPI::Get(ctx);
-    cudaStream_t stream = runtime::getCurrentCUDAStream();
+    hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();

    // Allocate space for map creation process.
    DeviceNodeMapMaker<IdType> maker(maxNodesPerType);
@@ -168,7 +169,7 @@ struct CUDAIdsMapper {
      }
    }

-    cudaEvent_t copyEvent;
+    hipEvent_t copyEvent;
    NDArray new_len_tensor;
    // Populate the mappings.
    if (generate_lhs_nodes) {
@@ -179,7 +180,7 @@ struct CUDAIdsMapper {
          src_nodes, rhs_nodes, &node_maps, count_lhs_device, &lhs_nodes,
          stream);

-      CUDA_CALL(cudaEventCreate(&copyEvent));
+      CUDA_CALL(hipEventCreate(&copyEvent));
      if (TensorDispatcher::Global()->IsAvailable()) {
        new_len_tensor = NDArray::PinnedEmpty(
            {num_ntypes}, DGLDataTypeTraits<int64_t>::dtype,
@@ -190,11 +191,11 @@ struct CUDAIdsMapper {
            {num_ntypes}, DGLDataTypeTraits<int64_t>::dtype,
            DGLContext{kDGLCPU, 0});
      }
-      CUDA_CALL(cudaMemcpyAsync(
+      CUDA_CALL(hipMemcpyAsync(
          new_len_tensor->data, count_lhs_device,
          sizeof(*num_nodes_per_type.data()) * num_ntypes,
-          cudaMemcpyDeviceToHost, stream));
-      CUDA_CALL(cudaEventRecord(copyEvent, stream));
+          hipMemcpyDeviceToHost, stream));
+      CUDA_CALL(hipEventRecord(copyEvent, stream));

      device->FreeWorkspace(ctx, count_lhs_device);
    } else {
@@ -209,8 +210,8 @@ struct CUDAIdsMapper {

    if (generate_lhs_nodes) {
      // wait for the previous copy
-      CUDA_CALL(cudaEventSynchronize(copyEvent));
-      CUDA_CALL(cudaEventDestroy(copyEvent));
+      CUDA_CALL(hipEventSynchronize(copyEvent));
+      CUDA_CALL(hipEventDestroy(copyEvent));

      // Resize lhs nodes.
      for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) {