Commit 74d88bf8 authored by sangwz

Merge branch 'dtk25.04' of http://developer.sourcefind.cn/codes/OpenDAS/dgl into 2.2.1

parents 2a1ac588 314cedc1
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2020 by Contributors
* @file array/cuda/utils.cu
* @brief Utilities for CUDA kernels.
*/
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
 #include "../../runtime/cuda/cuda_common.h"
-#include "./utils.h"
+#include "utils.h"
 namespace dgl {
 namespace cuda {
@@ -17,11 +19,11 @@ bool AllTrue(int8_t* flags, int64_t length, const DGLContext& ctx) {
   int8_t* rst = static_cast<int8_t*>(device->AllocWorkspace(ctx, 1));
   // Call CUB's reduction
   size_t workspace_size = 0;
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
-  CUDA_CALL(cub::DeviceReduce::Min(
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
+  CUDA_CALL(hipcub::DeviceReduce::Min(
       nullptr, workspace_size, flags, rst, length, stream));
   void* workspace = device->AllocWorkspace(ctx, workspace_size);
-  CUDA_CALL(cub::DeviceReduce::Min(
+  CUDA_CALL(hipcub::DeviceReduce::Min(
       workspace, workspace_size, flags, rst, length, stream));
   int8_t cpu_rst = GetCUDAScalar(device, ctx, rst);
   device->FreeWorkspace(ctx, workspace);
......
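The AllTrue hunk above is the standard CUB two-phase workspace idiom, unchanged under hipcub: call the reduction once with a null workspace pointer to query the temporary-storage size, allocate, then call it again to do the work. A minimal standalone sketch of the same idiom (illustrative only; raw hip* allocation calls stand in for DGL's DeviceAPI workspace helpers, and error checking is omitted):

```cpp
#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>
#include <cstdio>

int main() {
  const int n = 4;
  int8_t h_flags[n] = {1, 1, 0, 1};
  int8_t *d_flags, *d_min;
  hipMalloc(&d_flags, n);
  hipMalloc(&d_min, 1);
  hipMemcpy(d_flags, h_flags, n, hipMemcpyHostToDevice);

  void* workspace = nullptr;
  size_t workspace_size = 0;
  // Pass 1: size query only (workspace == nullptr).
  hipcub::DeviceReduce::Min(workspace, workspace_size, d_flags, d_min, n);
  hipMalloc(&workspace, workspace_size);
  // Pass 2: the actual reduction.
  hipcub::DeviceReduce::Min(workspace, workspace_size, d_flags, d_min, n);

  int8_t h_min;
  hipMemcpy(&h_min, d_min, 1, hipMemcpyDeviceToHost);
  printf("min flag = %d\n", h_min);  // 0 => not all flags were true
  hipFree(workspace);
  hipFree(d_min);
  hipFree(d_flags);
  return 0;
}
```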
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2021 by Contributors
* @file array/cpu/array_index_select_uvm.cuh
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2019-2022 by Contributors
* @file array/cuda/uvm/array_index_select_uvm.cu
@@ -8,7 +10,7 @@
 #include "../../../runtime/cuda/cuda_common.h"
 #include "../array_index_select.cuh"
 #include "../utils.h"
-#include "./array_index_select_uvm.cuh"
+#include "array_index_select_uvm.cuh"
 namespace dgl {
 using runtime::NDArray;
@@ -17,7 +19,7 @@ namespace impl {
 template <typename DType, typename IdType>
 NDArray IndexSelectCPUFromGPU(NDArray array, IdArray index) {
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   const int64_t arr_len = array->shape[0];
   const int64_t len = index->shape[0];
   int64_t num_feat = 1;
@@ -78,7 +80,7 @@ template NDArray IndexSelectCPUFromGPU<int64_t, int64_t>(NDArray, IdArray);
 template <typename DType, typename IdType>
 void IndexScatterGPUToCPU(NDArray dest, IdArray index, NDArray source) {
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   const DType* source_data = static_cast<DType*>(source->data);
   const IdType* idx_data = static_cast<IdType*>(index->data);
   const int64_t arr_len = dest->shape[0];
......
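The UVM hunks above keep DGL's pattern of launching a gather kernel on the current stream while the feature array itself stays in pinned host memory. A minimal zero-copy sketch of that idea, with a hypothetical `gather` kernel standing in for DGL's IndexSelect kernels (error checks omitted):

```cpp
#include <hip/hip_runtime.h>
#include <cstdint>
#include <cstdio>

// Gather rows from a pinned, host-resident array directly inside a GPU
// kernel (zero-copy), the core idea behind IndexSelectCPUFromGPU.
__global__ void gather(const float* src, const int64_t* idx, float* out,
                       int64_t len) {
  int64_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < len) out[i] = src[idx[i]];
}

int main() {
  float* h_src;  // pinned host buffer, mapped into the device address space
  hipHostMalloc(&h_src, 8 * sizeof(float), hipHostMallocMapped);
  for (int i = 0; i < 8; ++i) h_src[i] = 10.0f * i;
  float* d_src;  // device alias of the pinned buffer
  hipHostGetDevicePointer(reinterpret_cast<void**>(&d_src), h_src, 0);

  const int64_t len = 3;
  int64_t h_idx[len] = {7, 0, 3};
  int64_t* d_idx;
  float* d_out;
  hipMalloc(&d_idx, len * sizeof(int64_t));
  hipMalloc(&d_out, len * sizeof(float));
  hipMemcpy(d_idx, h_idx, len * sizeof(int64_t), hipMemcpyHostToDevice);

  hipLaunchKernelGGL(gather, dim3(1), dim3(64), 0, 0, d_src, d_idx, d_out, len);

  float h_out[len];
  hipMemcpy(h_out, d_out, len * sizeof(float), hipMemcpyDeviceToHost);
  printf("%g %g %g\n", h_out[0], h_out[1], h_out[2]);  // 70 0 30
  hipFree(d_out);
  hipFree(d_idx);
  hipHostFree(h_src);
  return 0;
}
```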
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2021 by Contributors
* @file array/filter.cc
* @brief Object for selecting items in a set, or selecting items not in a set.
*/
#include "./filter.h"
#include "filter.h"
#include <dgl/packed_func_ext.h>
#include <dgl/runtime/packed_func.h>
......@@ -23,7 +24,7 @@ DGL_REGISTER_GLOBAL("utils.filter._CAPI_DGLFilterCreateFromSet")
IdArray array = args[0];
auto ctx = array->ctx;
// TODO(nv-dlasalle): Implement CPU version.
if (ctx.device_type == kDGLCUDA) {
if (ctx.device_type == kDGLCUDA|| ctx.device_type == kDGLROCM) {
#ifdef DGL_USE_CUDA
ATEN_ID_TYPE_SWITCH(array->dtype, IdType, {
*rv = CreateSetFilter<kDGLCUDA, IdType>(array);
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file array/kernel.cc
@@ -7,7 +8,7 @@
 #include <dgl/packed_func_ext.h>
 #include "../c_api_common.h"
-#include "./check.h"
+#include "check.h"
 #include "kernel_decl.h"
using namespace dgl::runtime;
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2021 Intel Corporation
*
@@ -21,7 +22,7 @@
 #include <vector>
 #include "../c_api_common.h"
-#include "./check.h"
+#include "check.h"
 #include "kernel_decl.h"
using namespace dgl::runtime;
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file array/selector.h
@@ -12,13 +13,13 @@ namespace dgl {
 namespace {
-#ifdef __CUDACC__
-#define DGLDEVICE __device__
+#ifdef __HIPCC__
+#define DGLDEVICE __device__ __host__
 #define DGLINLINE __forceinline__
 #else
 #define DGLDEVICE
 #define DGLINLINE inline
-#endif  // __CUDACC__
+#endif  // __HIPCC__
} // namespace
......
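With the hipified guard, DGLDEVICE expands to `__device__ __host__` whenever the translation unit is compiled by hipcc (`__HIPCC__` defined), so the same helper becomes callable both from kernels and from host code; in host-only builds both macros collapse to plain inline functions. A hedged sketch of how a Selector-style helper uses these macros (`SelectLeft` is a made-up stand-in, not DGL's actual Selector):

```cpp
#ifdef __HIPCC__
#define DGLDEVICE __device__ __host__
#define DGLINLINE __forceinline__
#else
#define DGLDEVICE
#define DGLINLINE inline
#endif  // __HIPCC__

// Usable inside a HIP kernel and in plain host code alike.
template <typename T>
DGLDEVICE DGLINLINE T SelectLeft(T lhs, T /*rhs*/) {
  return lhs;
}
```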
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2019-2022 by Contributors
* @file array/uvm_array.cc
@@ -8,7 +9,7 @@
 #include <sstream>
 #include "../c_api_common.h"
-#include "./uvm_array_op.h"
+#include "uvm_array_op.h"
using namespace dgl::runtime;
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2019 by Contributors
* @file geometry/cuda/edge_coarsening_impl.cu
* @brief Edge coarsening CUDA implementation
*/
-#include <curand_kernel.h>
+#include <hiprand/hiprand_kernel.h>
 #include <dgl/array.h>
 #include <dgl/random.h>
 #include <dmlc/thread_local.h>
@@ -32,9 +34,9 @@ __global__ void generate_uniform_kernel(
     float *ret_values, size_t num, uint64_t seed) {
   size_t id = blockIdx.x * blockDim.x + threadIdx.x;
   if (id < num) {
-    curandState state;
-    curand_init(seed, id, 0, &state);
-    ret_values[id] = curand_uniform(&state);
+    hiprandState_t state;
+    hiprand_init(seed, id, 0, &state);
+    ret_values[id] = hiprand_uniform(&state);
   }
 }
@@ -116,7 +118,7 @@ __global__ void weighted_respond_kernel(
 template <typename IdType>
 bool Colorize(IdType *result_data, int64_t num_nodes, float *const prop) {
   // initial done signal
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   CUDA_KERNEL_CALL(init_done_kernel, 1, 1, 0, stream);
   // generate color prop for each node
@@ -132,8 +134,8 @@ bool Colorize(IdType *result_data, int64_t num_nodes, float *const prop) {
       colorize_kernel, num_blocks, num_threads, 0, stream, prop, num_nodes,
       result_data);
   bool done_h = false;
-  CUDA_CALL(cudaMemcpyFromSymbol(
-      &done_h, done_d, sizeof(done_h), 0, cudaMemcpyDeviceToHost));
+  CUDA_CALL(hipMemcpyFromSymbol(
+      &done_h, done_d, sizeof(done_h), 0, hipMemcpyDeviceToHost));
   return done_h;
 }
@@ -155,7 +157,7 @@ bool Colorize(IdType *result_data, int64_t num_nodes, float *const prop) {
 template <DGLDeviceType XPU, typename FloatType, typename IdType>
 void WeightedNeighborMatching(
     const aten::CSRMatrix &csr, const NDArray weight, IdArray result) {
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   const auto &ctx = result->ctx;
   auto device = runtime::DeviceAPI::Get(ctx);
   device->SetDevice(ctx);
@@ -216,7 +218,7 @@ void NeighborMatching(const aten::CSRMatrix &csr, IdArray result) {
   device->SetDevice(ctx);
   // generate random weights
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   NDArray weight = NDArray::Empty(
       {num_edges}, DGLDataType{kDGLFloat, sizeof(float) * 8, 1}, ctx);
   float *weight_data = static_cast<float *>(weight->data);
......
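The coarsening kernels keep cuRAND's per-thread pattern under hipRAND: each thread seeds its own generator state with a shared seed and its global thread id as the subsequence, then draws from it. A self-contained sketch of the same pattern (launch parameters are illustrative):

```cpp
#include <hip/hip_runtime.h>
#include <hiprand/hiprand_kernel.h>

// One RNG state per thread: identical seed, thread id as the subsequence,
// the same recipe as generate_uniform_kernel above.
__global__ void generate_uniform(float* out, size_t num, uint64_t seed) {
  size_t id = blockIdx.x * blockDim.x + threadIdx.x;
  if (id < num) {
    hiprandState_t state;
    hiprand_init(seed, id, 0, &state);  // (seed, subsequence, offset, state)
    out[id] = hiprand_uniform(&state);  // uniform float in (0, 1]
  }
}
```

A launch on the current stream would look like `hipLaunchKernelGGL(generate_uniform, dim3((num + 255) / 256), dim3(256), 0, stream, d_out, num, seed);`.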
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2019 by Contributors
* @file geometry/cuda/geometry_op_impl.cc
@@ -95,7 +97,7 @@ template <DGLDeviceType XPU, typename FloatType, typename IdType>
 void FarthestPointSampler(
     NDArray array, int64_t batch_size, int64_t sample_points, NDArray dist,
     IdArray start_idx, IdArray result) {
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   const FloatType* array_data = static_cast<FloatType*>(array->data);
@@ -110,7 +112,7 @@ void FarthestPointSampler(
   // sample for each cloud in the batch
   IdType* start_idx_data = static_cast<IdType*>(start_idx->data);
-  CUDA_CALL(cudaSetDevice(array->ctx.device_id));
+  CUDA_CALL(hipSetDevice(array->ctx.device_id));
   CUDA_KERNEL_CALL(
       fps_kernel, batch_size, THREADS, 0, stream, array_data, batch_size,
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2019 by Contributors
* @file geometry/geometry.cc
@@ -10,7 +11,7 @@
 #include "../array/check.h"
 #include "../c_api_common.h"
-#include "./geometry_op.h"
+#include "geometry_op.h"
using namespace dgl::runtime;
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file graph/creators.cc
* @brief Functions for constructing graphs.
*/
#include "./heterograph.h"
#include "heterograph.h"
using namespace dgl::runtime;
namespace dgl {
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2019 by Contributors
* @file graph/heterograph.cc
* @brief Heterograph implementation
*/
#include "./heterograph.h"
#include "heterograph.h"
#include <dgl/array.h>
#include <dgl/graph_serializer.h>
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2019 by Contributors
* @file graph/heterograph.h
@@ -18,7 +19,7 @@
 #include <utility>
 #include <vector>
-#include "./unit_graph.h"
+#include "unit_graph.h"
 #include "shared_mem_manager.h"
namespace dgl {
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file graph/heterograph_capi.cc
@@ -14,7 +15,7 @@
 #include <set>
 #include "../c_api_common.h"
-#include "./heterograph.h"
+#include "heterograph.h"
 #include "unit_graph.h"
using namespace dgl::runtime;
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file graph/pickle.cc
@@ -10,7 +11,7 @@
 #include <dmlc/memory_io.h>
 #include "../c_api_common.h"
-#include "./heterograph.h"
+#include "heterograph.h"
 #include "unit_graph.h"
using namespace dgl::runtime;
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2021 by Contributors
* @file graph/sampling/frequency_hashmap.cuh
@@ -56,7 +58,7 @@ class FrequencyHashmap {
   FrequencyHashmap() = delete;
   FrequencyHashmap(
       int64_t num_dst, int64_t num_items_each_dst, DGLContext ctx,
-      cudaStream_t stream, int64_t edge_table_scale = kDefaultEdgeTableScale);
+      hipStream_t stream, int64_t edge_table_scale = kDefaultEdgeTableScale);
   ~FrequencyHashmap();
   using EdgeItem = typename DeviceEdgeHashmap<IdxType>::EdgeItem;
   std::tuple<IdArray, IdArray, IdArray> Topk(
@@ -66,7 +68,7 @@ class FrequencyHashmap {
  private:
   DGLContext _ctx;
-  cudaStream_t _stream;
+  hipStream_t _stream;
   DeviceEdgeHashmap<IdxType> *_device_edge_hashmap;
   IdxType *_dst_unique_edges;
   EdgeItem *_edge_hashmap;
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2021 by Contributors
* @file graph/sampling/frequency_hashmap.cu
@@ -5,7 +7,7 @@
  */
 #include <algorithm>
-#include <cub/cub.cuh>  // NOLINT
+#include <hipcub/hipcub.hpp>  // NOLINT
 #include <tuple>
 #include <utility>
@@ -71,7 +73,7 @@ __global__ void _count_frequency(
     }
   }
-  using BlockReduce = typename cub::BlockReduce<IdxType, BLOCK_SIZE>;
+  using BlockReduce = typename hipcub::BlockReduce<IdxType, BLOCK_SIZE>;
   __shared__ typename BlockReduce::TempStorage temp_space;
   count = BlockReduce(temp_space).Sum(count);
@@ -112,7 +114,7 @@ __global__ void _compact_frequency(
   int64_t last_idx = start_idx + TILE_SIZE;
   const IdxType block_offset = edge_blocks_prefix[blockIdx.x];
-  using BlockScan = typename cub::BlockScan<IdxType, BLOCK_SIZE>;
+  using BlockScan = typename hipcub::BlockScan<IdxType, BLOCK_SIZE>;
   __shared__ typename BlockScan::TempStorage temp_space;
   BlockPrefixCallbackOp<IdxType> prefix_op(0);
@@ -246,7 +248,7 @@ inline __device__ IdxType DeviceEdgeHashmap<IdxType>::GetEdgeCount(
 template <typename IdxType>
 FrequencyHashmap<IdxType>::FrequencyHashmap(
     int64_t num_dst, int64_t num_items_each_dst, DGLContext ctx,
-    cudaStream_t stream, int64_t edge_table_scale) {
+    hipStream_t stream, int64_t edge_table_scale) {
   _ctx = ctx;
   _stream = stream;
   num_items_each_dst = _table_size(num_items_each_dst, edge_table_scale);
@@ -259,7 +261,7 @@ FrequencyHashmap<IdxType>::FrequencyHashmap(
   constexpr int TILE_SIZE = BLOCK_SIZE * 8;
   dim3 block(BLOCK_SIZE);
   dim3 grid((num_dst * num_items_each_dst + TILE_SIZE - 1) / TILE_SIZE);
-  CUDA_CALL(cudaMemset(dst_unique_edges, 0, (num_dst) * sizeof(IdxType)));
+  CUDA_CALL(hipMemset(dst_unique_edges, 0, (num_dst) * sizeof(IdxType)));
   CUDA_KERNEL_CALL(
       (_init_edge_table<IdxType, BLOCK_SIZE, TILE_SIZE>), grid, block, 0,
       _stream, edge_hashmap, (num_dst * num_items_each_dst));
@@ -300,7 +302,7 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
   // _edge_hashmap
   bool *is_first_position = static_cast<bool *>(
       device->AllocWorkspace(_ctx, sizeof(bool) * (num_edges)));
-  CUDA_CALL(cudaMemset(is_first_position, 0, sizeof(bool) * (num_edges)));
+  CUDA_CALL(hipMemset(is_first_position, 0, sizeof(bool) * (num_edges)));
   // double space to use ExclusiveSum
   auto edge_blocks_prefix_data = static_cast<IdxType *>(device->AllocWorkspace(
       _ctx, 2 * sizeof(IdxType) * (num_edge_blocks + 1)));
@@ -327,11 +329,11 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
   // 2.1 ExclusiveSum the edge_blocks_prefix
   void *d_temp_storage = nullptr;
   size_t temp_storage_bytes = 0;
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
       d_temp_storage, temp_storage_bytes, edge_blocks_prefix,
       edge_blocks_prefix_alternate, num_edge_blocks + 1, _stream));
   d_temp_storage = device->AllocWorkspace(_ctx, temp_storage_bytes);
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
       d_temp_storage, temp_storage_bytes, edge_blocks_prefix,
       edge_blocks_prefix_alternate, num_edge_blocks + 1, _stream));
   device->FreeWorkspace(_ctx, d_temp_storage);
@@ -365,19 +367,19 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
   // 3.1 ExclusiveSum the num_unique_each_node
   d_temp_storage = nullptr;
   temp_storage_bytes = 0;
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
       d_temp_storage, temp_storage_bytes, num_unique_each_node,
      num_unique_each_node_alternate, num_dst_nodes + 1, _stream));
   d_temp_storage = device->AllocWorkspace(_ctx, temp_storage_bytes);
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
       d_temp_storage, temp_storage_bytes, num_unique_each_node,
      num_unique_each_node_alternate, num_dst_nodes + 1, _stream));
   device->FreeWorkspace(_ctx, d_temp_storage);
   // 3.2 SegmentedRadixSort the unique_src_edges and unique_frequency
   // Create a set of DoubleBuffers to wrap pairs of device pointers
-  cub::DoubleBuffer<Idx64Type> d_unique_frequency(
+  hipcub::DoubleBuffer<Idx64Type> d_unique_frequency(
       unique_frequency, unique_frequency_alternate);
-  cub::DoubleBuffer<IdxType> d_unique_src_edges(
+  hipcub::DoubleBuffer<IdxType> d_unique_src_edges(
       unique_src_edges, unique_src_edges_alternate);
   // Determine temporary device storage requirements
   d_temp_storage = nullptr;
@@ -385,12 +387,12 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
   // the DeviceRadixSort is faster than DeviceSegmentedRadixSort,
   // especially when num_dst_nodes is large (about ~10000)
   if (dtype.bits == 32) {
-    CUDA_CALL(cub::DeviceRadixSort::SortPairsDescending(
+    CUDA_CALL(hipcub::DeviceRadixSort::SortPairsDescending(
         d_temp_storage, temp_storage_bytes, d_unique_frequency,
        d_unique_src_edges, num_unique_edges, 0, sizeof(Idx64Type) * 8,
        _stream));
   } else {
-    CUDA_CALL(cub::DeviceSegmentedRadixSort::SortPairsDescending(
+    CUDA_CALL(hipcub::DeviceSegmentedRadixSort::SortPairsDescending(
         d_temp_storage, temp_storage_bytes, d_unique_frequency,
        d_unique_src_edges, num_unique_edges, num_dst_nodes,
        num_unique_each_node_alternate, num_unique_each_node_alternate + 1, 0,
@@ -398,12 +400,12 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
   }
   d_temp_storage = device->AllocWorkspace(_ctx, temp_storage_bytes);
   if (dtype.bits == 32) {
-    CUDA_CALL(cub::DeviceRadixSort::SortPairsDescending(
+    CUDA_CALL(hipcub::DeviceRadixSort::SortPairsDescending(
         d_temp_storage, temp_storage_bytes, d_unique_frequency,
        d_unique_src_edges, num_unique_edges, 0, sizeof(Idx64Type) * 8,
        _stream));
   } else {
-    CUDA_CALL(cub::DeviceSegmentedRadixSort::SortPairsDescending(
+    CUDA_CALL(hipcub::DeviceSegmentedRadixSort::SortPairsDescending(
         d_temp_storage, temp_storage_bytes, d_unique_frequency,
        d_unique_src_edges, num_unique_edges, num_dst_nodes,
        num_unique_each_node_alternate, num_unique_each_node_alternate + 1, 0,
@@ -422,11 +424,11 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
   // use unique_output_offsets;
   d_temp_storage = nullptr;
   temp_storage_bytes = 0;
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
       d_temp_storage, temp_storage_bytes, num_unique_each_node,
      unique_output_offsets, num_dst_nodes + 1, _stream));
   d_temp_storage = device->AllocWorkspace(_ctx, temp_storage_bytes);
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
       d_temp_storage, temp_storage_bytes, num_unique_each_node,
      unique_output_offsets, num_dst_nodes + 1, _stream));
   device->FreeWorkspace(_ctx, d_temp_storage);
......
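The Topk path above leans on hipcub's DoubleBuffer so DeviceRadixSort can ping-pong keys and values between two device arrays without extra copies, again using the two-phase size query. A minimal sketch of that pattern outside DGL (raw hipMalloc in place of AllocWorkspace, error checks omitted):

```cpp
#include <hipcub/hipcub.hpp>
#include <cstdio>

int main() {
  const int n = 4;
  uint64_t h_keys[n] = {3, 9, 1, 7};
  int h_vals[n] = {30, 90, 10, 70};
  uint64_t *d_k0, *d_k1;
  int *d_v0, *d_v1;
  hipMalloc(&d_k0, n * sizeof(uint64_t));
  hipMalloc(&d_k1, n * sizeof(uint64_t));
  hipMalloc(&d_v0, n * sizeof(int));
  hipMalloc(&d_v1, n * sizeof(int));
  hipMemcpy(d_k0, h_keys, n * sizeof(uint64_t), hipMemcpyHostToDevice);
  hipMemcpy(d_v0, h_vals, n * sizeof(int), hipMemcpyHostToDevice);

  // DoubleBuffer tracks which of the two arrays currently holds valid data.
  hipcub::DoubleBuffer<uint64_t> keys(d_k0, d_k1);
  hipcub::DoubleBuffer<int> vals(d_v0, d_v1);

  void* tmp = nullptr;
  size_t tmp_bytes = 0;
  // Pass 1: size query; pass 2: descending key/value radix sort.
  hipcub::DeviceRadixSort::SortPairsDescending(tmp, tmp_bytes, keys, vals, n);
  hipMalloc(&tmp, tmp_bytes);
  hipcub::DeviceRadixSort::SortPairsDescending(tmp, tmp_bytes, keys, vals, n);

  uint64_t out_k[n];
  int out_v[n];
  hipMemcpy(out_k, keys.Current(), n * sizeof(uint64_t), hipMemcpyDeviceToHost);
  hipMemcpy(out_v, vals.Current(), n * sizeof(int), hipMemcpyDeviceToHost);
  for (int i = 0; i < n; ++i)
    printf("%llu:%d ", (unsigned long long)out_k[i], out_v[i]);
  // prints 9:90 7:70 3:30 1:10
  hipFree(tmp);
  hipFree(d_v1);
  hipFree(d_v0);
  hipFree(d_k1);
  hipFree(d_k0);
  return 0;
}
```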
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2021 by Contributors
* @file graph/sampling/get_node_types_gpu.cu
* @brief DGL sampler
*/
-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
#include <dgl/array.h>
#include <dgl/base_heterograph.h>
#include <dgl/runtime/device_api.h>
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2021-2022 by Contributors
* @file graph/sampling/randomwalk_gpu.cu
* @brief CUDA random walk sampling
*/
-#include <curand_kernel.h>
+#include <hiprand/hiprand_kernel.h>
 #include <dgl/array.h>
 #include <dgl/base_heterograph.h>
 #include <dgl/random.h>
 #include <dgl/runtime/device_api.h>
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
#include <tuple>
#include <utility>
#include <vector>
@@ -35,6 +37,22 @@ struct GraphKernelData {
   const IdType *in_cols;
   const IdType *data;
 };
+template <typename IdType>
+inline IdType* __GetDevicePointer(runtime::NDArray array) {
+  IdType* ptr = array.Ptr<IdType>();
+  if (array.IsPinned()) {
+    CUDA_CALL(hipHostGetDevicePointer(&ptr, ptr, 0));
+  }
+  return ptr;
+}
+inline void* __GetDevicePointer(runtime::NDArray array) {
+  void* ptr = array->data;
+  if (array.IsPinned()) {
+    CUDA_CALL(hipHostGetDevicePointer(&ptr, ptr, 0));
+  }
+  return ptr;
+}
template <typename IdType, typename FloatType, int BLOCK_SIZE, int TILE_SIZE>
__global__ void _RandomWalkKernel(
@@ -48,10 +66,10 @@ __global__ void _RandomWalkKernel(
   int64_t last_idx =
       min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_seeds);
   int64_t trace_length = (max_num_steps + 1);
-  curandState rng;
+  hiprandState_t rng;
   // reference:
-  // https://docs.nvidia.com/cuda/curand/device-api-overview.html#performance-notes
-  curand_init(rand_seed + idx, 0, 0, &rng);
+  // https://docs.nvidia.com/cuda/hiprand/device-api-overview.html#performance-notes
+  hiprand_init(rand_seed + idx, 0, 0, &rng);
   while (idx < last_idx) {
     IdType curr = seed_data[idx];
@@ -68,18 +86,18 @@ __global__ void _RandomWalkKernel(
       if (deg == 0) {  // the degree is zero
         break;
       }
-      const int64_t num = curand(&rng) % deg;
+      const int64_t num = hiprand(&rng) % deg;
       IdType pick = graph.in_cols[in_row_start + num];
       IdType eid =
           (graph.data ? graph.data[in_row_start + num] : in_row_start + num);
       *traces_data_ptr = pick;
       *eids_data_ptr = eid;
       if ((restart_prob_size > 1) &&
-          (curand_uniform(&rng) < restart_prob_data[step_idx])) {
+          (hiprand_uniform(&rng) < restart_prob_data[step_idx])) {
         break;
       } else if (
           (restart_prob_size == 1) &&
-          (curand_uniform(&rng) < restart_prob_data[0])) {
+          (hiprand_uniform(&rng) < restart_prob_data[0])) {
         break;
       }
       ++traces_data_ptr;
@@ -107,10 +125,10 @@ __global__ void _RandomWalkBiasedKernel(
   int64_t last_idx =
       min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_seeds);
   int64_t trace_length = (max_num_steps + 1);
-  curandState rng;
+  hiprandState_t rng;
   // reference:
-  // https://docs.nvidia.com/cuda/curand/device-api-overview.html#performance-notes
-  curand_init(rand_seed + idx, 0, 0, &rng);
+  // https://docs.nvidia.com/cuda/hiprand/device-api-overview.html#performance-notes
+  hiprand_init(rand_seed + idx, 0, 0, &rng);
   while (idx < last_idx) {
     IdType curr = seed_data[idx];
@@ -133,9 +151,9 @@ __global__ void _RandomWalkBiasedKernel(
       const FloatType *prob = probs[metapath_id];
       int64_t num;
       if (prob == nullptr) {
-        num = curand(&rng) % deg;
+        num = hiprand(&rng) % deg;
       } else {
-        auto rnd_sum_w = prob_sum[curr] * curand_uniform(&rng);
+        auto rnd_sum_w = prob_sum[curr] * hiprand_uniform(&rng);
         FloatType sum_w{0.};
         for (num = 0; num < deg; ++num) {
           sum_w += prob[in_row_start + num];
@@ -149,11 +167,11 @@ __global__ void _RandomWalkBiasedKernel(
       *traces_data_ptr = pick;
       *eids_data_ptr = eid;
       if ((restart_prob_size > 1) &&
-          (curand_uniform(&rng) < restart_prob_data[step_idx])) {
+          (hiprand_uniform(&rng) < restart_prob_data[step_idx])) {
        break;
      } else if (
          (restart_prob_size == 1) &&
-          (curand_uniform(&rng) < restart_prob_data[0])) {
+          (hiprand_uniform(&rng) < restart_prob_data[0])) {
        break;
      }
      ++traces_data_ptr;
@@ -176,14 +194,17 @@ std::pair<IdArray, IdArray> RandomWalkUniform(
     const HeteroGraphPtr hg, const IdArray seeds, const TypeArray metapath,
     FloatArray restart_prob) {
   const int64_t max_num_steps = metapath->shape[0];
-  const IdType *metapath_data = static_cast<IdType *>(metapath->data);
+  // const IdType *metapath_data = static_cast<IdType *>(metapath->data);
+  const IdType *metapath_data =
+      static_cast<const IdType *>(__GetDevicePointer(metapath));
   const int64_t begin_ntype =
       hg->meta_graph()->FindEdge(metapath_data[0]).first;
   const int64_t max_nodes = hg->NumVertices(begin_ntype);
   int64_t num_etypes = hg->NumEdgeTypes();
   auto ctx = seeds->ctx;
-  const IdType *seed_data = static_cast<const IdType *>(seeds->data);
+  // const IdType *seed_data = static_cast<const IdType *>(seeds->data);
+  const IdType *seed_data =
+      static_cast<const IdType *>(__GetDevicePointer(seeds));
   CHECK(seeds->ndim == 1) << "seeds shape is not one dimension.";
   const int64_t num_seeds = seeds->shape[0];
   int64_t trace_length = max_num_steps + 1;
@@ -195,14 +216,19 @@ std::pair<IdArray, IdArray> RandomWalkUniform(
   std::vector<GraphKernelData<IdType>> h_graphs(num_etypes);
   for (int64_t etype = 0; etype < num_etypes; ++etype) {
     const CSRMatrix &csr = hg->GetCSRMatrix(etype);
-    h_graphs[etype].in_ptr = static_cast<const IdType *>(csr.indptr->data);
-    h_graphs[etype].in_cols = static_cast<const IdType *>(csr.indices->data);
+    // h_graphs[etype].in_ptr = static_cast<const IdType *>(csr.indptr->data);
+    // h_graphs[etype].in_cols = static_cast<const IdType *>(csr.indices->data);
+    // h_graphs[etype].data =
+    //     (CSRHasData(csr) ? static_cast<const IdType *>(csr.data->data)
+    //                      : nullptr);
+    h_graphs[etype].in_ptr =
+        static_cast<const IdType *>(__GetDevicePointer(csr.indptr));
+    h_graphs[etype].in_cols =
+        static_cast<const IdType *>(__GetDevicePointer(csr.indices));
     h_graphs[etype].data =
-        (CSRHasData(csr) ? static_cast<const IdType *>(csr.data->data)
+        (CSRHasData(csr) ? static_cast<const IdType *>(__GetDevicePointer(csr.data))
                          : nullptr);
   }
   // use cuda stream from local thread
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   auto device = DeviceAPI::Get(ctx);
   auto d_graphs = static_cast<GraphKernelData<IdType> *>(device->AllocWorkspace(
       ctx, (num_etypes) * sizeof(GraphKernelData<IdType>)));
@@ -222,10 +248,11 @@ std::pair<IdArray, IdArray> RandomWalkUniform(
   const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000);
   ATEN_FLOAT_TYPE_SWITCH(
       restart_prob->dtype, FloatType, "random walk GPU kernel", {
-        CHECK(restart_prob->ctx.device_type == kDGLCUDA)
+        CHECK(restart_prob->ctx.device_type == kDGLCUDA || restart_prob->ctx.device_type == kDGLROCM)
            << "restart prob should be in GPU.";
        CHECK(restart_prob->ndim == 1) << "restart prob dimension should be 1.";
-        const FloatType *restart_prob_data = restart_prob.Ptr<FloatType>();
+        // const FloatType *restart_prob_data = restart_prob.Ptr<FloatType>();
+        const FloatType *restart_prob_data =
+            static_cast<const FloatType *>(__GetDevicePointer(restart_prob));
        const int64_t restart_prob_size = restart_prob->shape[0];
        CUDA_KERNEL_CALL(
            (_RandomWalkKernel<IdType, FloatType, BLOCK_SIZE, TILE_SIZE>), grid,
@@ -247,23 +274,27 @@ std::pair<IdArray, IdArray> RandomWalkBiased(
     const HeteroGraphPtr hg, const IdArray seeds, const TypeArray metapath,
     const std::vector<FloatArray> &prob, FloatArray restart_prob) {
   const int64_t max_num_steps = metapath->shape[0];
-  const IdType *metapath_data = static_cast<IdType *>(metapath->data);
+  // const IdType *metapath_data = static_cast<IdType *>(metapath->data);
+  const IdType *metapath_data =
+      static_cast<IdType *>(__GetDevicePointer(metapath));
   const int64_t begin_ntype =
       hg->meta_graph()->FindEdge(metapath_data[0]).first;
   const int64_t max_nodes = hg->NumVertices(begin_ntype);
   int64_t num_etypes = hg->NumEdgeTypes();
   auto ctx = seeds->ctx;
-  const IdType *seed_data = static_cast<const IdType *>(seeds->data);
+  // const IdType *seed_data = static_cast<const IdType *>(seeds->data);
+  const IdType *seed_data =
+      static_cast<const IdType *>(__GetDevicePointer(seeds));
   CHECK(seeds->ndim == 1) << "seeds shape is not one dimension.";
   const int64_t num_seeds = seeds->shape[0];
   int64_t trace_length = max_num_steps + 1;
   IdArray traces = IdArray::Empty({num_seeds, trace_length}, seeds->dtype, ctx);
   IdArray eids = IdArray::Empty({num_seeds, max_num_steps}, seeds->dtype, ctx);
   IdType *traces_data = traces.Ptr<IdType>();
-  IdType *eids_data = eids.Ptr<IdType>();
+  // IdType *traces_data = static_cast<IdType *>(__GetDevicePointer(traces));
+  // IdType *eids_data = eids.Ptr<IdType>();
+  IdType *eids_data = static_cast<IdType *>(__GetDevicePointer(eids));
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   auto device = DeviceAPI::Get(ctx);
   // new probs and prob sums pointers
   assert(num_etypes == static_cast<int64_t>(prob.size()));
@@ -276,10 +307,15 @@ std::pair<IdArray, IdArray> RandomWalkBiased(
   std::vector<GraphKernelData<IdType>> h_graphs(num_etypes);
   for (int64_t etype = 0; etype < num_etypes; ++etype) {
     const CSRMatrix &csr = hg->GetCSRMatrix(etype);
-    h_graphs[etype].in_ptr = static_cast<const IdType *>(csr.indptr->data);
-    h_graphs[etype].in_cols = static_cast<const IdType *>(csr.indices->data);
+    // h_graphs[etype].in_ptr = static_cast<const IdType *>(csr.indptr->data);
+    // h_graphs[etype].in_cols = static_cast<const IdType *>(csr.indices->data);
+    // h_graphs[etype].data =
+    //     (CSRHasData(csr) ? static_cast<const IdType *>(csr.data->data)
+    //                      : nullptr);
+    h_graphs[etype].in_ptr =
+        static_cast<const IdType *>(__GetDevicePointer(csr.indptr));
+    h_graphs[etype].in_cols =
+        static_cast<const IdType *>(__GetDevicePointer(csr.indices));
     h_graphs[etype].data =
-        (CSRHasData(csr) ? static_cast<const IdType *>(csr.data->data)
+        (CSRHasData(csr) ? static_cast<const IdType *>(__GetDevicePointer(csr.data))
                          : nullptr);
     int64_t num_segments = csr.indptr->shape[0] - 1;
@@ -289,19 +325,22 @@ std::pair<IdArray, IdArray> RandomWalkBiased(
       prob_sums[etype] = nullptr;
       continue;
     }
-    probs[etype] = prob[etype].Ptr<FloatType>();
+    // probs[etype] = prob[etype].Ptr<FloatType>();
+    probs[etype] = static_cast<FloatType *>(__GetDevicePointer(prob[etype]));
     prob_sums_arr.push_back(
        FloatArray::Empty({num_segments}, prob[etype]->dtype, ctx));
-    prob_sums[etype] = prob_sums_arr[etype].Ptr<FloatType>();
+    // prob_sums[etype] = prob_sums_arr[etype].Ptr<FloatType>();
+    prob_sums[etype] =
+        static_cast<FloatType *>(__GetDevicePointer(prob_sums_arr[etype]));
     // calculate the sum of the neighbor weights
-    const IdType *d_offsets = static_cast<const IdType *>(csr.indptr->data);
+    // const IdType *d_offsets = static_cast<const IdType *>(csr.indptr->data);
+    const IdType *d_offsets =
+        static_cast<const IdType *>(__GetDevicePointer(csr.indptr));
     size_t temp_storage_size = 0;
-    CUDA_CALL(cub::DeviceSegmentedReduce::Sum(
+    CUDA_CALL(hipcub::DeviceSegmentedReduce::Sum(
        nullptr, temp_storage_size, probs[etype], prob_sums[etype],
        num_segments, d_offsets, d_offsets + 1, stream));
    void *temp_storage = device->AllocWorkspace(ctx, temp_storage_size);
-    CUDA_CALL(cub::DeviceSegmentedReduce::Sum(
+    CUDA_CALL(hipcub::DeviceSegmentedReduce::Sum(
        temp_storage, temp_storage_size, probs[etype], prob_sums[etype],
        num_segments, d_offsets, d_offsets + 1, stream));
    device->FreeWorkspace(ctx, temp_storage);
@@ -328,18 +367,20 @@
       DGLContext{kDGLCPU, 0}, ctx, prob[0]->dtype);
   // copy metapath to GPU
   auto d_metapath = metapath.CopyTo(ctx);
-  const IdType *d_metapath_data = static_cast<IdType *>(d_metapath->data);
+  // const IdType *d_metapath_data = static_cast<IdType *>(d_metapath->data);
+  const IdType *d_metapath_data =
+      static_cast<IdType *>(__GetDevicePointer(d_metapath));
   constexpr int BLOCK_SIZE = 256;
   constexpr int TILE_SIZE = BLOCK_SIZE * 4;
   dim3 block(256);
   dim3 grid((num_seeds + TILE_SIZE - 1) / TILE_SIZE);
   const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000);
-  CHECK(restart_prob->ctx.device_type == kDGLCUDA)
+  CHECK(restart_prob->ctx.device_type == kDGLCUDA || restart_prob->ctx.device_type == kDGLROCM)
      << "restart prob should be in GPU.";
   CHECK(restart_prob->ndim == 1) << "restart prob dimension should be 1.";
-  const FloatType *restart_prob_data = restart_prob.Ptr<FloatType>();
-  const int64_t restart_prob_size = restart_prob->shape[0];
+  // const FloatType *restart_prob_data = restart_prob.Ptr<FloatType>();
+  const FloatType *restart_prob_data =
+      static_cast<const FloatType *>(__GetDevicePointer(restart_prob));
+  const int64_t restart_prob_size = restart_prob->shape[0];
   CUDA_KERNEL_CALL(
       (_RandomWalkBiasedKernel<IdType, FloatType, BLOCK_SIZE, TILE_SIZE>), grid,
       block, 0, stream, random_seed, seed_data, num_seeds, d_metapath_data,
@@ -396,7 +437,7 @@ std::pair<IdArray, IdArray> RandomWalkWithRestart(
   auto device = dgl::runtime::DeviceAPI::Get(device_ctx);
   // use cuda stream from local thread
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   device->CopyDataFromTo(
       &restart_prob, 0, restart_prob_array.Ptr<double>(), 0, sizeof(double),
       DGLContext{kDGLCPU, 0}, device_ctx, restart_prob_array->dtype);
@@ -443,13 +484,15 @@ template <DGLDeviceType XPU, typename IdxType>
 std::tuple<IdArray, IdArray, IdArray> SelectPinSageNeighbors(
     const IdArray src, const IdArray dst, const int64_t num_samples_per_node,
     const int64_t k) {
-  CHECK(src->ctx.device_type == kDGLCUDA) << "IdArray needs be on GPU!";
-  const IdxType *src_data = src.Ptr<IdxType>();
-  const IdxType *dst_data = dst.Ptr<IdxType>();
+  CHECK(src->ctx.device_type == kDGLCUDA || src->ctx.device_type == kDGLROCM)
+      << "IdArray needs be on GPU!";
+  // const IdxType *src_data = src.Ptr<IdxType>();
+  const IdxType *src_data = static_cast<IdxType *>(__GetDevicePointer(src));
+  // const IdxType *dst_data = dst.Ptr<IdxType>();
+  const IdxType *dst_data = static_cast<IdxType *>(__GetDevicePointer(dst));
   const int64_t num_dst_nodes = (dst->shape[0] / num_samples_per_node);
   auto ctx = src->ctx;
   // use cuda stream from local thread
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   auto frequency_hashmap = FrequencyHashmap<IdxType>(
       num_dst_nodes, num_samples_per_node, ctx, stream);
   auto ret = frequency_hashmap.Topk(
......
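The new `__GetDevicePointer` helper is the one substantive addition in this file: every NDArray consumed by a kernel is first resolved to a device-visible address, so pinned host arrays (e.g. metapath, seeds, and the CSR buffers) work transparently on ROCm. A hedged sketch of the same pattern with DGL's types stripped out (`is_pinned` stands in for NDArray::IsPinned(); error checking omitted):

```cpp
#include <hip/hip_runtime.h>

// If a buffer lives in pinned host memory, translate it to its device alias
// with hipHostGetDevicePointer before a kernel launch; device-resident
// buffers pass through unchanged.
template <typename T>
T* GetDevicePointer(T* ptr, bool is_pinned) {
  if (is_pinned) {
    void* dev = nullptr;
    hipHostGetDevicePointer(&dev, ptr, 0);
    return static_cast<T*>(dev);
  }
  return ptr;
}
```

Note that the commit keeps two overloads of `__GetDevicePointer`; since the non-template `void*` overload wins unqualified overload resolution, all the call sites above go through it and cast the result back to the typed pointer.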