Commit 6ac701f8 authored by sangwzh

update src and graphbolt code

parent 1547bd93
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2020 by Contributors
* @file graph/transform/cuda/knn.cu
* @brief k-nearest-neighbor (KNN) implementation (cuda)
*/
#include <curand_kernel.h>
#include <hiprand/hiprand_kernel.h>
#include <dgl/array.h>
#include <dgl/random.h>
#include <dgl/runtime/device_api.h>
#include <algorithm>
#include <cub/cub.cuh> // NOLINT
#include <hipcub/hipcub.hpp> // NOLINT
#include <limits>
#include <string>
#include <type_traits>
......@@ -467,7 +469,7 @@ void BruteForceKNNCuda(
const NDArray& data_points, const IdArray& data_offsets,
const NDArray& query_points, const IdArray& query_offsets, const int k,
IdArray result) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const auto& ctx = data_points->ctx;
auto device = runtime::DeviceAPI::Get(ctx);
const int64_t batch_size = data_offsets->shape[0] - 1;
......@@ -512,7 +514,7 @@ void BruteForceKNNSharedCuda(
const NDArray& data_points, const IdArray& data_offsets,
const NDArray& query_points, const IdArray& query_offsets, const int k,
IdArray result) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const auto& ctx = data_points->ctx;
auto device = runtime::DeviceAPI::Get(ctx);
const int64_t batch_size = data_offsets->shape[0] - 1;
......@@ -528,8 +530,8 @@ void BruteForceKNNSharedCuda(
// get max shared memory per block in bytes
// determine block size according to this value
int max_sharedmem_per_block = 0;
CUDA_CALL(cudaDeviceGetAttribute(
&max_sharedmem_per_block, cudaDevAttrMaxSharedMemoryPerBlock,
CUDA_CALL(hipDeviceGetAttribute(
&max_sharedmem_per_block, hipDeviceAttributeMaxSharedMemoryPerBlock,
ctx.device_id));
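// (On AMD GPUs this attribute reports the LDS available to a single workgroup,
// typically 64 KB on GCN/CDNA parts, so the block-size computation below adapts
// to the HIP device automatically. The hardware figure is a general expectation,
// not something asserted by this file.)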
const int64_t single_shared_mem = static_cast<int64_t>(Pow2Align<size_t>(
(k + 2 * feature_size) * sizeof(FloatType) + k * sizeof(IdType),
......@@ -552,17 +554,17 @@ void BruteForceKNNSharedCuda(
GetNumBlockPerSegment, temp_num_blocks, temp_block_size, 0, stream,
query_offsets_data, num_block_per_segment, batch_size, block_size);
size_t prefix_temp_size = 0;
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
nullptr, prefix_temp_size, num_block_per_segment, num_block_prefixsum,
batch_size, stream));
void* prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size);
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
prefix_temp, prefix_temp_size, num_block_per_segment, num_block_prefixsum,
batch_size, stream));
device->FreeWorkspace(ctx, prefix_temp);
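// (hipcub keeps CUB's two-phase convention: the first ExclusiveSum call, with a
// null workspace pointer, only writes the required temporary-storage size into
// prefix_temp_size; the second call performs the actual exclusive scan.)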
// wait for results
CUDA_CALL(cudaStreamSynchronize(stream));
CUDA_CALL(hipStreamSynchronize(stream));
int64_t num_blocks = 0, final_elem = 0,
copyoffset = (batch_size - 1) * sizeof(IdType);
......@@ -603,10 +605,10 @@ void BruteForceKNNSharedCuda(
/** @brief Setup rng state for nn-descent */
__global__ void SetupRngKernel(
curandState* states, const uint64_t seed, const size_t n) {
hiprandState_t* states, const uint64_t seed, const size_t n) {
size_t id = blockIdx.x * blockDim.x + threadIdx.x;
if (id < n) {
curand_init(seed, id, 0, states + id);
hiprand_init(seed, id, 0, states + id);
}
}
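// Launch sketch (hedged): `states` is a hypothetical device array holding one
// hiprandState_t per thread, `grid`/`block` are hypothetical launch dimensions,
// and CUDA_KERNEL_CALL is assumed to be in scope from cuda_common.h:
//   CUDA_KERNEL_CALL(SetupRngKernel, grid, block, 0, stream, states, seed, n);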
......@@ -622,8 +624,8 @@ __global__ void RandomInitNeighborsKernel(
const IdType point_idx = blockIdx.x * blockDim.x + threadIdx.x;
IdType batch_idx = 0;
if (point_idx >= offsets[batch_size]) return;
curandState state;
curand_init(seed, point_idx, 0, &state);
hiprandState_t state;
hiprand_init(seed, point_idx, 0, &state);
// find the segment location in the input batch
for (IdType b = 0; b < batch_size + 1; ++b) {
......@@ -646,7 +648,7 @@ __global__ void RandomInitNeighborsKernel(
current_central_nodes[i] = point_idx;
}
for (IdType i = k; i < segment_size; ++i) {
const IdType j = static_cast<IdType>(curand(&state) % (i + 1));
const IdType j = static_cast<IdType>(hiprand(&state) % (i + 1));
if (j < k) current_neighbors[j] = i + segment_start;
}
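// (This loop is the classic reservoir-sampling step: after item i is visited,
// every candidate seen so far occupies one of the k neighbor slots with equal
// probability, so the initial neighbor lists are uniform random subsets of the
// segment.)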
......@@ -674,8 +676,8 @@ __global__ void FindCandidatesKernel(
const IdType point_idx = blockIdx.x * blockDim.x + threadIdx.x;
IdType batch_idx = 0;
if (point_idx >= offsets[batch_size]) return;
curandState state;
curand_init(seed, point_idx, 0, &state);
hiprandState_t state;
hiprand_init(seed, point_idx, 0, &state);
// find the segment location in the input batch
for (IdType b = 0; b < batch_size + 1; ++b) {
......@@ -711,7 +713,7 @@ __global__ void FindCandidatesKernel(
if (curr_num < num_candidates) {
candidate_data[curr_num] = candidate;
} else {
IdType pos = static_cast<IdType>(curand(&state) % (curr_num + 1));
IdType pos = static_cast<IdType>(hiprand(&state) % (curr_num + 1));
if (pos < num_candidates) candidate_data[pos] = candidate;
}
++candidate_array[0];
......@@ -732,7 +734,7 @@ __global__ void FindCandidatesKernel(
if (curr_num < num_candidates) {
candidate_data[curr_num] = reverse_candidate;
} else {
IdType pos = static_cast<IdType>(curand(&state) % (curr_num + 1));
IdType pos = static_cast<IdType>(hiprand(&state) % (curr_num + 1));
if (pos < num_candidates) candidate_data[pos] = reverse_candidate;
}
++candidate_array[0];
......@@ -873,7 +875,7 @@ template <DGLDeviceType XPU, typename FloatType, typename IdType>
void NNDescent(
const NDArray& points, const IdArray& offsets, IdArray result, const int k,
const int num_iters, const int num_candidates, const double delta) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const auto& ctx = points->ctx;
auto device = runtime::DeviceAPI::Get(ctx);
const int64_t num_nodes = points->shape[0];
......@@ -887,7 +889,7 @@ void NNDescent(
uint64_t seed;
int warp_size = 0;
CUDA_CALL(
cudaDeviceGetAttribute(&warp_size, cudaDevAttrWarpSize, ctx.device_id));
hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, ctx.device_id));
// We don't need large block sizes, since there's not much inter-thread
// communication
int64_t block_size = warp_size;
......@@ -911,7 +913,7 @@ void NNDescent(
IdType* total_num_updates_d =
static_cast<IdType*>(device->AllocWorkspace(ctx, sizeof(IdType)));
CUDA_CALL(cub::DeviceReduce::Sum(
CUDA_CALL(hipcub::DeviceReduce::Sum(
nullptr, sum_temp_size, num_updates, total_num_updates_d, num_nodes,
stream));
IdType* sum_temp_storage =
......@@ -942,7 +944,7 @@ void NNDescent(
feature_size);
total_num_updates = 0;
CUDA_CALL(cub::DeviceReduce::Sum(
CUDA_CALL(hipcub::DeviceReduce::Sum(
sum_temp_storage, sum_temp_size, num_updates, total_num_updates_d,
num_nodes, stream));
device->CopyDataFromTo(
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2018 by Contributors
* @file graph/traversal.cc
* @brief Graph traversal implementation
*/
#include "./traversal.h"
#include "traversal.h"
#include <dgl/packed_func_ext.h>
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2019 by Contributors
* @file graph/unit_graph.cc
* @brief UnitGraph graph implementation
*/
#include "./unit_graph.h"
#include "unit_graph.h"
#include <dgl/array.h>
#include <dgl/base_heterograph.h>
......@@ -11,7 +12,7 @@
#include <dgl/lazy.h>
#include "../c_api_common.h"
#include "./serialize/dglstream.h"
#include "serialize/dglstream.h"
namespace dgl {
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2021 by Contributors
* @file ndarray_partition.h
......@@ -6,7 +8,7 @@
#include <dgl/runtime/device_api.h>
#include <cub/cub.cuh>
#include <hipcub/hipcub.hpp>
#include "../../runtime/cuda/cuda_common.h"
#include "../../runtime/workspace.h"
......@@ -239,7 +241,7 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRemainder(
const auto& ctx = in_idx->ctx;
auto device = DeviceAPI::Get(ctx);
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const int64_t num_in = in_idx->shape[0];
......@@ -263,7 +265,7 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRemainder(
}
const int64_t part_bits =
static_cast<int64_t>(std::ceil(std::log2(num_parts)));
static_cast<int64_t>(::ceil(std::log2(num_parts)));
// First, generate a mapping of indexes to processors
Workspace<IdType> proc_id_in(device, ctx, num_in);
......@@ -295,13 +297,13 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRemainder(
IdArray perm_in = aten::Range(0, num_in, sizeof(IdType) * 8, ctx);
size_t sort_workspace_size;
CUDA_CALL(cub::DeviceRadixSort::SortPairs(
CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
nullptr, sort_workspace_size, proc_id_in.get(), proc_id_out.get(),
static_cast<IdType*>(perm_in->data), perm_out, num_in, 0, part_bits,
stream));
Workspace<void> sort_workspace(device, ctx, sort_workspace_size);
CUDA_CALL(cub::DeviceRadixSort::SortPairs(
CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
sort_workspace.get(), sort_workspace_size, proc_id_in.get(),
proc_id_out.get(), static_cast<IdType*>(perm_in->data), perm_out,
num_in, 0, part_bits, stream));
......@@ -317,7 +319,7 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRemainder(
static_assert(
sizeof(AtomicCount) == sizeof(*out_counts),
"AtomicCount must be the same width as int64_t for atomicAdd "
"in cub::DeviceHistogram::HistogramEven() to work");
"in hipcub::DeviceHistogram::HistogramEven() to work");
// TODO(dlasalle): Once https://github.com/NVIDIA/cub/pull/287 is merged,
// add a compile time check against the cub version to allow
......@@ -327,14 +329,14 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRemainder(
"value of int.";
size_t hist_workspace_size;
CUDA_CALL(cub::DeviceHistogram::HistogramEven(
CUDA_CALL(hipcub::DeviceHistogram::HistogramEven(
nullptr, hist_workspace_size, proc_id_out.get(),
reinterpret_cast<AtomicCount*>(out_counts), num_parts + 1,
static_cast<IdType>(0), static_cast<IdType>(num_parts),
static_cast<int>(num_in), stream));
Workspace<void> hist_workspace(device, ctx, hist_workspace_size);
CUDA_CALL(cub::DeviceHistogram::HistogramEven(
CUDA_CALL(hipcub::DeviceHistogram::HistogramEven(
hist_workspace.get(), hist_workspace_size, proc_id_out.get(),
reinterpret_cast<AtomicCount*>(out_counts), num_parts + 1,
static_cast<IdType>(0), static_cast<IdType>(num_parts),
......@@ -352,7 +354,7 @@ template std::pair<IdArray, IdArray> GeneratePermutationFromRemainder<
template <DGLDeviceType XPU, typename IdType>
IdArray MapToLocalFromRemainder(const int num_parts, IdArray global_idx) {
const auto& ctx = global_idx->ctx;
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
if (num_parts > 1) {
IdArray local_idx =
......@@ -387,7 +389,7 @@ IdArray MapToGlobalFromRemainder(
<< num_parts;
const auto& ctx = local_idx->ctx;
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
if (num_parts > 1) {
IdArray global_idx =
......@@ -423,7 +425,7 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRange(
const auto& ctx = in_idx->ctx;
auto device = DeviceAPI::Get(ctx);
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const int64_t num_in = in_idx->shape[0];
......@@ -447,7 +449,7 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRange(
}
const int64_t part_bits =
static_cast<int64_t>(std::ceil(std::log2(num_parts)));
static_cast<int64_t>(::ceil(std::log2(num_parts)));
// First, generate a mapping of indexes to processors
Workspace<IdType> proc_id_in(device, ctx, num_in);
......@@ -470,13 +472,13 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRange(
IdArray perm_in = aten::Range(0, num_in, sizeof(IdType) * 8, ctx);
size_t sort_workspace_size;
CUDA_CALL(cub::DeviceRadixSort::SortPairs(
CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
nullptr, sort_workspace_size, proc_id_in.get(), proc_id_out.get(),
static_cast<IdType*>(perm_in->data), perm_out, num_in, 0, part_bits,
stream));
Workspace<void> sort_workspace(device, ctx, sort_workspace_size);
CUDA_CALL(cub::DeviceRadixSort::SortPairs(
CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
sort_workspace.get(), sort_workspace_size, proc_id_in.get(),
proc_id_out.get(), static_cast<IdType*>(perm_in->data), perm_out,
num_in, 0, part_bits, stream));
......@@ -492,7 +494,7 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRange(
static_assert(
sizeof(AtomicCount) == sizeof(*out_counts),
"AtomicCount must be the same width as int64_t for atomicAdd "
"in cub::DeviceHistogram::HistogramEven() to work");
"in hipcub::DeviceHistogram::HistogramEven() to work");
// TODO(dlasalle): Once https://github.com/NVIDIA/cub/pull/287 is merged,
// add a compile time check against the cub version to allow
......@@ -502,14 +504,14 @@ std::pair<IdArray, NDArray> GeneratePermutationFromRange(
"value of int.";
size_t hist_workspace_size;
CUDA_CALL(cub::DeviceHistogram::HistogramEven(
CUDA_CALL(hipcub::DeviceHistogram::HistogramEven(
nullptr, hist_workspace_size, proc_id_out.get(),
reinterpret_cast<AtomicCount*>(out_counts), num_parts + 1,
static_cast<IdType>(0), static_cast<IdType>(num_parts),
static_cast<int>(num_in), stream));
Workspace<void> hist_workspace(device, ctx, hist_workspace_size);
CUDA_CALL(cub::DeviceHistogram::HistogramEven(
CUDA_CALL(hipcub::DeviceHistogram::HistogramEven(
hist_workspace.get(), hist_workspace_size, proc_id_out.get(),
reinterpret_cast<AtomicCount*>(out_counts), num_parts + 1,
static_cast<IdType>(0), static_cast<IdType>(num_parts),
......@@ -536,7 +538,7 @@ template <DGLDeviceType XPU, typename IdType, typename RangeType>
IdArray MapToLocalFromRange(
const int num_parts, IdArray range, IdArray global_idx) {
const auto& ctx = global_idx->ctx;
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
if (num_parts > 1 && global_idx->shape[0] > 0) {
IdArray local_idx =
......@@ -576,7 +578,7 @@ IdArray MapToGlobalFromRange(
<< num_parts;
const auto& ctx = local_idx->ctx;
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
if (num_parts > 1 && local_idx->shape[0] > 0) {
IdArray global_idx =
......
......@@ -39,7 +39,7 @@ class RemainderPartition : public NDArrayPartition {
IdArray in_idx) const override {
#ifdef DGL_USE_CUDA
auto ctx = in_idx->ctx;
if (ctx.device_type == kDGLCUDA) {
if (ctx.device_type == kDGLCUDA || ctx.device_type == kDGLROCM) {
ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, {
return impl::GeneratePermutationFromRemainder<kDGLCUDA, IdType>(
ArraySize(), NumParts(), in_idx);
......@@ -56,7 +56,7 @@ class RemainderPartition : public NDArrayPartition {
IdArray MapToLocal(IdArray in_idx) const override {
#ifdef DGL_USE_CUDA
auto ctx = in_idx->ctx;
if (ctx.device_type == kDGLCUDA) {
if (ctx.device_type == kDGLCUDA || ctx.device_type == kDGLROCM) {
ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, {
return impl::MapToLocalFromRemainder<kDGLCUDA, IdType>(
NumParts(), in_idx);
......@@ -73,7 +73,7 @@ class RemainderPartition : public NDArrayPartition {
IdArray MapToGlobal(IdArray in_idx, const int part_id) const override {
#ifdef DGL_USE_CUDA
auto ctx = in_idx->ctx;
if (ctx.device_type == kDGLCUDA) {
if (ctx.device_type == kDGLCUDA || ctx.device_type == kDGLROCM) {
ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, {
return impl::MapToGlobalFromRemainder<kDGLCUDA, IdType>(
NumParts(), in_idx, part_id);
......@@ -118,7 +118,7 @@ class RangePartition : public NDArrayPartition {
IdArray in_idx) const override {
#ifdef DGL_USE_CUDA
auto ctx = in_idx->ctx;
if (ctx.device_type == kDGLCUDA) {
if (ctx.device_type == kDGLCUDA || ctx.device_type == kDGLROCM) {
if (ctx.device_type != range_->ctx.device_type ||
ctx.device_id != range_->ctx.device_id) {
LOG(FATAL) << "The range for the NDArrayPartition and the input "
......@@ -144,7 +144,7 @@ class RangePartition : public NDArrayPartition {
IdArray MapToLocal(IdArray in_idx) const override {
#ifdef DGL_USE_CUDA
auto ctx = in_idx->ctx;
if (ctx.device_type == kDGLCUDA) {
if (ctx.device_type == kDGLCUDA || ctx.device_type == kDGLROCM) {
ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, {
ATEN_ID_TYPE_SWITCH(range_->dtype, RangeType, {
return impl::MapToLocalFromRange<kDGLCUDA, IdType, RangeType>(
......@@ -163,7 +163,7 @@ class RangePartition : public NDArrayPartition {
IdArray MapToGlobal(IdArray in_idx, const int part_id) const override {
#ifdef DGL_USE_CUDA
auto ctx = in_idx->ctx;
if (ctx.device_type == kDGLCUDA) {
if (ctx.device_type == kDGLCUDA || ctx.device_type == kDGLROCM) {
ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, {
ATEN_ID_TYPE_SWITCH(range_->dtype, RangeType, {
return impl::MapToGlobalFromRange<kDGLCUDA, IdType, RangeType>(
......
// !!! This is a file automatically generated by hipify!!!
/*!
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
* All rights reserved.
......@@ -24,13 +25,13 @@
#include <cmath>
#ifdef __NVCC__
#include <curand_kernel.h>
#ifdef __HIPCC__
#include <hiprand/hiprand_kernel.h>
#else
#include <random>
#include "pcg_random.hpp"
#endif // __CUDA_ARCH__
#endif // __HIPCC__
#ifndef M_SQRT1_2
#define M_SQRT1_2 0.707106781186547524401
......@@ -58,24 +59,24 @@ class continuous_seed {
c[1] = std::sin(pi * r / 2);
}
#ifdef __CUDA_ARCH__
#ifdef __HIP_DEVICE_COMPILE__
__device__ inline float uniform(const uint64_t t) const {
const uint64_t kCurandSeed = 999961; // Could be any random number.
curandStatePhilox4_32_10_t rng;
curand_init(kCurandSeed, s[0], t, &rng);
hiprandStatePhilox4_32_10_t rng;
hiprand_init(kCurandSeed, s[0], t, &rng);
float rnd;
if (s[0] != s[1]) {
rnd = c[0] * curand_normal(&rng);
curand_init(kCurandSeed, s[1], t, &rng);
rnd += c[1] * curand_normal(&rng);
rnd = c[0] * hiprand_normal(&rng);
hiprand_init(kCurandSeed, s[1], t, &rng);
rnd += c[1] * hiprand_normal(&rng);
rnd = normcdff(rnd);
} else {
rnd = curand_uniform(&rng);
rnd = hiprand_uniform(&rng);
}
return rnd;
}
#else
inline float uniform(const uint64_t t) const {
__host__ inline float uniform(const uint64_t t) const {
pcg32 ng0(s[0], t);
float rnd;
if (s[0] != s[1]) {
......@@ -91,7 +92,7 @@ class continuous_seed {
}
return rnd;
}
#endif // __CUDA_ARCH__
#endif // __HIP_DEVICE_COMPILE__
};
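// (Both branches implement the same continuous-seed scheme: the device path
// draws two Philox streams via hiprand and blends them with the weights c[0],
// c[1] before applying normcdff, while the host path performs the analogous
// blend with pcg32 generators. This is a reading of the code above, not a
// guarantee of bit-wise identical output across the two paths.)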
} // namespace random
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file rpc/rpc.cc
* @brief Implementation of RPC utilities used by both server and client sides.
*/
#if defined(__linux__)
#include "./rpc.h"
#include "rpc.h"
#include <dgl/array.h>
#include <dgl/packed_func_ext.h>
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file rpc/rpc.h
......@@ -19,9 +20,9 @@
#include <unordered_map>
#include <vector>
#include "./network/common.h"
#include "./rpc_msg.h"
#include "./server_state.h"
#include "network/common.h"
#include "rpc_msg.h"
#include "server_state.h"
#include "network/socket_communicator.h"
namespace dgl {
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2016-2022 by Contributors
* @file c_runtime_api.cc
......@@ -32,6 +33,8 @@ inline std::string DeviceName(int type) {
return "cpu";
case kDGLCUDA:
return "cuda";
case kDGLROCM:
return "cuda";
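// ROCm devices intentionally report the name "cuda" here so that existing
// "cuda"-keyed dispatch (streams, the device_api.cuda registration, PyTorch's
// HIP-masquerading-as-CUDA convention) keeps working; the port presumably
// relies on this rather than introducing a separate "rocm" name.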
// add more device here once supported
default:
LOG(FATAL) << "unknown type =" << type;
......@@ -122,13 +125,13 @@ void DeviceAPI::SyncStreamFromTo(
}
bool DeviceAPI::PinData(void* ptr, size_t nbytes) {
LOG(FATAL) << "Device does not support cudaHostRegister api.";
LOG(FATAL) << "Device does not support hipHostRegister api.";
return false;
}
void* DeviceAPI::AllocPinnedDataSpace(
size_t nbytes, void** ctx, void** deleter) {
LOG(FATAL) << "Device does not support cudaHostAlloc api.";
LOG(FATAL) << "Device does not support hipHostMalloc api.";
return nullptr;
}
......@@ -137,7 +140,7 @@ void DeviceAPI::FreePinnedDataSpace(void** deleter) {
}
void DeviceAPI::UnpinData(void* ptr) {
LOG(FATAL) << "Device does not support cudaHostUnregister api.";
LOG(FATAL) << "Device does not support hipHostUnregister api.";
}
} // namespace runtime
} // namespace dgl
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2017 by Contributors
* @file cuda_common.h
......@@ -6,10 +7,10 @@
#ifndef DGL_RUNTIME_CUDA_CUDA_COMMON_H_
#define DGL_RUNTIME_CUDA_CUDA_COMMON_H_
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <curand.h>
#include <cusparse.h>
#include <hipblas/hipblas.h>
#include <hip/hip_runtime.h>
#include <hiprand/hiprand.h>
#include <hipsparse/hipsparse.h>
#include <dgl/runtime/packed_func.h>
#include <memory>
......@@ -25,8 +26,8 @@ namespace runtime {
DGL's memory pool and the current cuda stream
runtime::CUDAWorkspaceAllocator allocator(ctx);
const auto stream = runtime::getCurrentCUDAStream();
const auto exec_policy = thrust::cuda::par_nosync(allocator).on(stream);
const auto stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const auto exec_policy = thrust::hip::par_nosync(allocator).on(stream);
now, one can pass exec_policy to thrust functions
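for example (a hedged sketch; d_keys and num_keys are hypothetical device data):
thrust::sort(exec_policy, d_keys, d_keys + num_keys);
the allocator then serves thrust's temporary buffers from DGL's workspace pool
instead of issuing a separate hipMalloc/hipFree for every call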
......@@ -79,112 +80,112 @@ inline bool is_zero<dim3>(dim3 size) {
#define CUDA_DRIVER_CALL(x) \
{ \
CUresult result = x; \
if (result != CUDA_SUCCESS && result != CUDA_ERROR_DEINITIALIZED) { \
hipError_t result = x; \
if (result != hipSuccess && result != hipErrorDeinitialized) { \
const char* msg; \
cuGetErrorName(result, &msg); \
msg = hipGetErrorName(result); \
LOG(FATAL) << "CUDAError: " #x " failed with error: " << msg; \
} \
}
#define CUDA_CALL(func) \
{ \
cudaError_t e = (func); \
CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \
<< "CUDA: " << cudaGetErrorString(e); \
hipError_t e = (func); \
CHECK(e == hipSuccess || e == hipErrorDeinitialized) \
<< "CUDA: " << hipGetErrorString(e); \
}
#define CUDA_KERNEL_CALL(kernel, nblks, nthrs, shmem, stream, ...) \
{ \
if (!dgl::runtime::is_zero((nblks)) && !dgl::runtime::is_zero((nthrs))) { \
(kernel)<<<(nblks), (nthrs), (shmem), (stream)>>>(__VA_ARGS__); \
cudaError_t e = cudaGetLastError(); \
CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \
<< "CUDA kernel launch error: " << cudaGetErrorString(e); \
hipLaunchKernelGGL((kernel), dim3((nblks)), dim3((nthrs)), (shmem), (stream), __VA_ARGS__); \
hipError_t e = hipGetLastError(); \
CHECK(e == hipSuccess || e == hipErrorDeinitialized) \
<< "CUDA kernel launch error: " << hipGetErrorString(e); \
} \
}
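// Usage sketch (hedged; MyKernel, grid, block and the args are hypothetical):
//   CUDA_KERNEL_CALL(MyKernel, grid, block, 0, stream, arg0, arg1);
// which expands to a guarded hipLaunchKernelGGL followed by a hipGetLastError
// check, and silently skips the launch when the grid or block is empty.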
#define CUSPARSE_CALL(func) \
{ \
cusparseStatus_t e = (func); \
CHECK(e == CUSPARSE_STATUS_SUCCESS) << "CUSPARSE ERROR: " << e; \
hipsparseStatus_t e = (func); \
CHECK(e == HIPSPARSE_STATUS_SUCCESS) << "CUSPARSE ERROR: " << e; \
}
#define CUBLAS_CALL(func) \
{ \
cublasStatus_t e = (func); \
CHECK(e == CUBLAS_STATUS_SUCCESS) << "CUBLAS ERROR: " << e; \
hipblasStatus_t e = (func); \
CHECK(e == HIPBLAS_STATUS_SUCCESS) << "CUBLAS ERROR: " << e; \
}
#define CURAND_CALL(func) \
{ \
curandStatus_t e = (func); \
CHECK(e == CURAND_STATUS_SUCCESS) \
hiprandStatus_t e = (func); \
CHECK(e == HIPRAND_STATUS_SUCCESS) \
<< "CURAND Error: " << dgl::runtime::curandGetErrorString(e) << " at " \
<< __FILE__ << ":" << __LINE__; \
}
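// Usage sketch (hedged; `gen`, `out` and `n` are hypothetical host-side
// variables, with `out` pointing to device memory):
//   CURAND_CALL(hiprandGenerateUniform(gen, out, n));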
inline const char* curandGetErrorString(curandStatus_t error) {
inline const char* curandGetErrorString(hiprandStatus_t error) {
switch (error) {
case CURAND_STATUS_SUCCESS:
return "CURAND_STATUS_SUCCESS";
case CURAND_STATUS_VERSION_MISMATCH:
return "CURAND_STATUS_VERSION_MISMATCH";
case CURAND_STATUS_NOT_INITIALIZED:
return "CURAND_STATUS_NOT_INITIALIZED";
case CURAND_STATUS_ALLOCATION_FAILED:
return "CURAND_STATUS_ALLOCATION_FAILED";
case CURAND_STATUS_TYPE_ERROR:
return "CURAND_STATUS_TYPE_ERROR";
case CURAND_STATUS_OUT_OF_RANGE:
return "CURAND_STATUS_OUT_OF_RANGE";
case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
case CURAND_STATUS_LAUNCH_FAILURE:
return "CURAND_STATUS_LAUNCH_FAILURE";
case CURAND_STATUS_PREEXISTING_FAILURE:
return "CURAND_STATUS_PREEXISTING_FAILURE";
case CURAND_STATUS_INITIALIZATION_FAILED:
return "CURAND_STATUS_INITIALIZATION_FAILED";
case CURAND_STATUS_ARCH_MISMATCH:
return "CURAND_STATUS_ARCH_MISMATCH";
case CURAND_STATUS_INTERNAL_ERROR:
return "CURAND_STATUS_INTERNAL_ERROR";
case HIPRAND_STATUS_SUCCESS:
return "HIPRAND_STATUS_SUCCESS";
case HIPRAND_STATUS_VERSION_MISMATCH:
return "HIPRAND_STATUS_VERSION_MISMATCH";
case HIPRAND_STATUS_NOT_INITIALIZED:
return "HIPRAND_STATUS_NOT_INITIALIZED";
case HIPRAND_STATUS_ALLOCATION_FAILED:
return "HIPRAND_STATUS_ALLOCATION_FAILED";
case HIPRAND_STATUS_TYPE_ERROR:
return "HIPRAND_STATUS_TYPE_ERROR";
case HIPRAND_STATUS_OUT_OF_RANGE:
return "HIPRAND_STATUS_OUT_OF_RANGE";
case HIPRAND_STATUS_LENGTH_NOT_MULTIPLE:
return "HIPRAND_STATUS_LENGTH_NOT_MULTIPLE";
case HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED:
return "HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED";
case HIPRAND_STATUS_LAUNCH_FAILURE:
return "HIPRAND_STATUS_LAUNCH_FAILURE";
case HIPRAND_STATUS_PREEXISTING_FAILURE:
return "HIPRAND_STATUS_PREEXISTING_FAILURE";
case HIPRAND_STATUS_INITIALIZATION_FAILED:
return "HIPRAND_STATUS_INITIALIZATION_FAILED";
case HIPRAND_STATUS_ARCH_MISMATCH:
return "HIPRAND_STATUS_ARCH_MISMATCH";
case HIPRAND_STATUS_INTERNAL_ERROR:
return "HIPRAND_STATUS_INTERNAL_ERROR";
}
// To suppress compiler warning.
return "Unrecognized curand error string";
return "Unrecognized hiprand error string";
}
/**
* @brief Cast data type to cudaDataType_t.
* @brief Cast data type to hipDataType.
*/
template <typename T>
struct cuda_dtype {
static constexpr cudaDataType_t value = CUDA_R_32F;
static constexpr hipDataType value = HIP_R_32F;
};
template <>
struct cuda_dtype<__half> {
static constexpr cudaDataType_t value = CUDA_R_16F;
static constexpr hipDataType value = HIP_R_16F;
};
#if BF16_ENABLED
template <>
struct cuda_dtype<__nv_bfloat16> {
static constexpr cudaDataType_t value = CUDA_R_16BF;
struct cuda_dtype<__hip_bfloat16> {
static constexpr hipDataType value = HIP_R_16BF;
};
#endif // BF16_ENABLED
template <>
struct cuda_dtype<float> {
static constexpr cudaDataType_t value = CUDA_R_32F;
static constexpr hipDataType value = HIP_R_32F;
};
template <>
struct cuda_dtype<double> {
static constexpr cudaDataType_t value = CUDA_R_64F;
static constexpr hipDataType value = HIP_R_64F;
};
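// Usage sketch (hedged): the trait maps a compile-time element type to the
// runtime hipDataType tag expected by datatype-tagged APIs such as
// hipsparseSpMM, e.g.
//   const hipDataType dt = cuda_dtype<DType>::value;  // DType: template param
// so a single templated wrapper can cover float, double, __half and bfloat16.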
/*
......@@ -202,7 +203,7 @@ struct accum_dtype<__half> {
#if BF16_ENABLED
template <>
struct accum_dtype<__nv_bfloat16> {
struct accum_dtype<__hip_bfloat16> {
typedef float type;
};
#endif // BF16_ENABLED
......@@ -217,23 +218,23 @@ struct accum_dtype<double> {
typedef double type;
};
#if CUDART_VERSION >= 11000
#if DTKRT_VERSION >= 11000
/**
* @brief Cast index data type to cusparseIndexType_t.
* @brief Cast index data type to hipsparseIndexType_t.
*/
template <typename T>
struct cusparse_idtype {
static constexpr cusparseIndexType_t value = CUSPARSE_INDEX_32I;
static constexpr hipsparseIndexType_t value = HIPSPARSE_INDEX_32I;
};
template <>
struct cusparse_idtype<int32_t> {
static constexpr cusparseIndexType_t value = CUSPARSE_INDEX_32I;
static constexpr hipsparseIndexType_t value = HIPSPARSE_INDEX_32I;
};
template <>
struct cusparse_idtype<int64_t> {
static constexpr cusparseIndexType_t value = CUSPARSE_INDEX_64I;
static constexpr hipsparseIndexType_t value = HIPSPARSE_INDEX_64I;
};
#endif
......@@ -241,9 +242,9 @@ struct cusparse_idtype<int64_t> {
class CUDAThreadEntry {
public:
/** @brief The cusparse handler */
cusparseHandle_t cusparse_handle{nullptr};
hipsparseHandle_t cusparse_handle{nullptr};
/** @brief The cublas handler */
cublasHandle_t cublas_handle{nullptr};
hipblasHandle_t cublas_handle{nullptr};
/** @brief thread local pool*/
WorkspacePool pool;
/** @brief constructor */
......@@ -253,7 +254,7 @@ class CUDAThreadEntry {
};
/** @brief Get the current CUDA stream */
cudaStream_t getCurrentCUDAStream();
hipStream_t getCurrentHIPStreamMasqueradingAsCUDA();
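// Note: the "MasqueradingAsCUDA" suffix follows PyTorch's ROCm build, where HIP
// streams and devices are exposed under the "cuda" device type; the function
// still returns the current HIP stream for the calling thread.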
} // namespace runtime
} // namespace dgl
#endif // DGL_RUNTIME_CUDA_CUDA_COMMON_H_
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2017-2022 by Contributors
* @file cuda_device_api.cc
* @brief GPU specific API
*/
#include <cuda_runtime.h>
#include <hip/hip_runtime.h>
#include <dgl/runtime/device_api.h>
#include <dgl/runtime/registry.h>
#include <dgl/runtime/tensordispatch.h>
......@@ -18,13 +19,13 @@ class CUDADeviceAPI final : public DeviceAPI {
public:
CUDADeviceAPI() {
int count;
auto err = cudaGetDeviceCount(&count);
auto err = hipGetDeviceCount(&count);
switch (err) {
case cudaSuccess:
case hipSuccess:
break;
default:
count = 0;
cudaGetLastError();
hipGetLastError();
}
is_available_ = count > 0;
}
......@@ -32,67 +33,68 @@ class CUDADeviceAPI final : public DeviceAPI {
bool IsAvailable() final { return is_available_; }
void SetDevice(DGLContext ctx) final {
CUDA_CALL(cudaSetDevice(ctx.device_id));
CUDA_CALL(hipSetDevice(ctx.device_id));
}
void GetAttr(DGLContext ctx, DeviceAttrKind kind, DGLRetValue* rv) final {
int value = 0;
switch (kind) {
case kExist:
value =
(cudaDeviceGetAttribute(
&value, cudaDevAttrMaxThreadsPerBlock, ctx.device_id) ==
cudaSuccess);
(hipDeviceGetAttribute(
&value, hipDeviceAttributeMaxThreadsPerBlock, ctx.device_id) ==
hipSuccess);
break;
case kMaxThreadsPerBlock: {
CUDA_CALL(cudaDeviceGetAttribute(
&value, cudaDevAttrMaxThreadsPerBlock, ctx.device_id));
CUDA_CALL(hipDeviceGetAttribute(
&value, hipDeviceAttributeMaxThreadsPerBlock, ctx.device_id));
break;
}
case kWarpSize: {
CUDA_CALL(
cudaDeviceGetAttribute(&value, cudaDevAttrWarpSize, ctx.device_id));
hipDeviceGetAttribute(&value, hipDeviceAttributeWarpSize, ctx.device_id));
break;
}
case kMaxSharedMemoryPerBlock: {
CUDA_CALL(cudaDeviceGetAttribute(
&value, cudaDevAttrMaxSharedMemoryPerBlock, ctx.device_id));
CUDA_CALL(hipDeviceGetAttribute(
&value, hipDeviceAttributeMaxSharedMemoryPerBlock, ctx.device_id));
break;
}
case kComputeVersion: {
std::ostringstream os;
CUDA_CALL(cudaDeviceGetAttribute(
&value, cudaDevAttrComputeCapabilityMajor, ctx.device_id));
CUDA_CALL(hipDeviceGetAttribute(
&value, hipDeviceAttributeComputeCapabilityMajor, ctx.device_id));
os << value << ".";
CUDA_CALL(cudaDeviceGetAttribute(
&value, cudaDevAttrComputeCapabilityMinor, ctx.device_id));
CUDA_CALL(hipDeviceGetAttribute(
&value, hipDeviceAttributeComputeCapabilityMinor, ctx.device_id));
os << value;
*rv = os.str();
return;
}
case kDeviceName: {
cudaDeviceProp props;
CUDA_CALL(cudaGetDeviceProperties(&props, ctx.device_id));
hipDeviceProp_t props;
CUDA_CALL(hipGetDeviceProperties(&props, ctx.device_id));
*rv = std::string(props.name);
// printf("******* debug: device.name:%s\n ",std::string(props.name).c_str());
return;
}
case kMaxClockRate: {
CUDA_CALL(cudaDeviceGetAttribute(
&value, cudaDevAttrClockRate, ctx.device_id));
CUDA_CALL(hipDeviceGetAttribute(
&value, hipDeviceAttributeClockRate, ctx.device_id));
break;
}
case kMultiProcessorCount: {
CUDA_CALL(cudaDeviceGetAttribute(
&value, cudaDevAttrMultiProcessorCount, ctx.device_id));
CUDA_CALL(hipDeviceGetAttribute(
&value, hipDeviceAttributeMultiprocessorCount, ctx.device_id));
break;
}
case kMaxThreadDimensions: {
int dims[3];
CUDA_CALL(cudaDeviceGetAttribute(
&dims[0], cudaDevAttrMaxBlockDimX, ctx.device_id));
CUDA_CALL(cudaDeviceGetAttribute(
&dims[1], cudaDevAttrMaxBlockDimY, ctx.device_id));
CUDA_CALL(cudaDeviceGetAttribute(
&dims[2], cudaDevAttrMaxBlockDimZ, ctx.device_id));
CUDA_CALL(hipDeviceGetAttribute(
&dims[0], hipDeviceAttributeMaxBlockDimX, ctx.device_id));
CUDA_CALL(hipDeviceGetAttribute(
&dims[1], hipDeviceAttributeMaxBlockDimY, ctx.device_id));
CUDA_CALL(hipDeviceGetAttribute(
&dims[2], hipDeviceAttributeMaxBlockDimZ, ctx.device_id));
std::stringstream ss; // use json string to return multiple int values;
ss << "[" << dims[0] << ", " << dims[1] << ", " << dims[2] << "]";
......@@ -110,11 +112,11 @@ class CUDADeviceAPI final : public DeviceAPI {
TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global();
if (tensor_dispatcher->IsAvailable()) {
return tensor_dispatcher->CUDAAllocWorkspace(
nbytes, getCurrentCUDAStream());
nbytes, getCurrentHIPStreamMasqueradingAsCUDA());
}
CHECK_EQ(256 % alignment, 0U) << "CUDA space is aligned at 256 bytes";
void* ret;
CUDA_CALL(cudaMalloc(&ret, nbytes));
CUDA_CALL(hipMalloc(&ret, nbytes));
return ret;
}
......@@ -124,32 +126,32 @@ class CUDADeviceAPI final : public DeviceAPI {
if (tensor_dispatcher->IsAvailable()) {
return tensor_dispatcher->CUDAFreeWorkspace(ptr);
}
CUDA_CALL(cudaFree(ptr));
CUDA_CALL(hipFree(ptr));
}
void CopyDataFromTo(
const void* from, size_t from_offset, void* to, size_t to_offset,
size_t size, DGLContext ctx_from, DGLContext ctx_to,
DGLDataType type_hint, DGLStreamHandle stream) {
cudaStream_t cu_stream = static_cast<cudaStream_t>(stream);
hipStream_t cu_stream = static_cast<hipStream_t>(stream);
from = static_cast<const char*>(from) + from_offset;
to = static_cast<char*>(to) + to_offset;
if (ctx_from.device_type == kDGLCUDA && ctx_to.device_type == kDGLCUDA) {
CUDA_CALL(cudaSetDevice(ctx_from.device_id));
if ((ctx_from.device_type == kDGLCUDA && ctx_to.device_type == kDGLCUDA) || (ctx_from.device_type == kDGLROCM && ctx_to.device_type == kDGLROCM)) {
CUDA_CALL(hipSetDevice(ctx_from.device_id));
if (ctx_from.device_id == ctx_to.device_id) {
GPUCopy(from, to, size, cudaMemcpyDeviceToDevice, cu_stream);
GPUCopy(from, to, size, hipMemcpyDeviceToDevice, cu_stream);
} else {
CUDA_CALL(cudaMemcpyPeerAsync(
CUDA_CALL(hipMemcpyPeerAsync(
to, ctx_to.device_id, from, ctx_from.device_id, size, cu_stream));
}
} else if (
ctx_from.device_type == kDGLCUDA && ctx_to.device_type == kDGLCPU) {
CUDA_CALL(cudaSetDevice(ctx_from.device_id));
GPUCopy(from, to, size, cudaMemcpyDeviceToHost, cu_stream);
(ctx_from.device_type == kDGLCUDA || ctx_from.device_type == kDGLROCM) && ctx_to.device_type == kDGLCPU) {
CUDA_CALL(hipSetDevice(ctx_from.device_id));
GPUCopy(from, to, size, hipMemcpyDeviceToHost, cu_stream);
} else if (
ctx_from.device_type == kDGLCPU && ctx_to.device_type == kDGLCUDA) {
CUDA_CALL(cudaSetDevice(ctx_to.device_id));
GPUCopy(from, to, size, cudaMemcpyHostToDevice, cu_stream);
ctx_from.device_type == kDGLCPU && (ctx_to.device_type == kDGLCUDA || ctx_to.device_type == kDGLROCM)) {
CUDA_CALL(hipSetDevice(ctx_to.device_id));
GPUCopy(from, to, size, hipMemcpyHostToDevice, cu_stream);
} else {
LOG(FATAL) << "expect copy from/to GPU or between GPU";
}
......@@ -166,9 +168,9 @@ class CUDADeviceAPI final : public DeviceAPI {
}
// To ensure correct behavior, `record_event` must be invoked anytime a
// pointer from PyTorch CachingHostAllocator is used in a cudaMemcpyAsync
// pointer from PyTorch CachingHostAllocator is used in a hipMemcpyAsync
// call. It provides a way to re-use freed pinned (page-locked) memory
// allocations and avoid device sync due to cudaFreeHost calls.
// allocations and avoid device sync due to hipHostFree calls.
void RecordedCopyDataFromTo(
void* from, size_t from_offset, void* to, size_t to_offset, size_t size,
DGLContext ctx_from, DGLContext ctx_to, DGLDataType type_hint,
......@@ -179,7 +181,7 @@ class CUDADeviceAPI final : public DeviceAPI {
stream);
auto tensor_dispatcher = TensorDispatcher::Global();
if (tensor_dispatcher->IsAvailable()) {
auto custream = static_cast<cudaStream_t>(stream);
auto custream = static_cast<hipStream_t>(stream);
void* ptr = ctx_to.device_type == kDGLCPU ? to : from;
int id =
ctx_to.device_type == kDGLCPU ? ctx_from.device_id : ctx_to.device_id;
......@@ -188,34 +190,34 @@ class CUDADeviceAPI final : public DeviceAPI {
}
DGLStreamHandle CreateStream(DGLContext ctx) {
CUDA_CALL(cudaSetDevice(ctx.device_id));
cudaStream_t retval;
CUDA_CALL(hipSetDevice(ctx.device_id));
hipStream_t retval;
// make sure the legacy default stream won't block on this stream
CUDA_CALL(cudaStreamCreateWithFlags(&retval, cudaStreamNonBlocking));
CUDA_CALL(hipStreamCreateWithFlags(&retval, hipStreamNonBlocking));
return static_cast<DGLStreamHandle>(retval);
}
void FreeStream(DGLContext ctx, DGLStreamHandle stream) {
CUDA_CALL(cudaSetDevice(ctx.device_id));
cudaStream_t cu_stream = static_cast<cudaStream_t>(stream);
CUDA_CALL(cudaStreamDestroy(cu_stream));
CUDA_CALL(hipSetDevice(ctx.device_id));
hipStream_t cu_stream = static_cast<hipStream_t>(stream);
CUDA_CALL(hipStreamDestroy(cu_stream));
}
void SyncStreamFromTo(
DGLContext ctx, DGLStreamHandle event_src, DGLStreamHandle event_dst) {
CUDA_CALL(cudaSetDevice(ctx.device_id));
cudaStream_t src_stream = static_cast<cudaStream_t>(event_src);
cudaStream_t dst_stream = static_cast<cudaStream_t>(event_dst);
cudaEvent_t evt;
CUDA_CALL(cudaEventCreate(&evt));
CUDA_CALL(cudaEventRecord(evt, src_stream));
CUDA_CALL(cudaStreamWaitEvent(dst_stream, evt, 0));
CUDA_CALL(cudaEventDestroy(evt));
CUDA_CALL(hipSetDevice(ctx.device_id));
hipStream_t src_stream = static_cast<hipStream_t>(event_src);
hipStream_t dst_stream = static_cast<hipStream_t>(event_dst);
hipEvent_t evt;
CUDA_CALL(hipEventCreate(&evt));
CUDA_CALL(hipEventRecord(evt, src_stream));
CUDA_CALL(hipStreamWaitEvent(dst_stream, evt, 0));
CUDA_CALL(hipEventDestroy(evt));
}
void StreamSync(DGLContext ctx, DGLStreamHandle stream) final {
CUDA_CALL(cudaSetDevice(ctx.device_id));
CUDA_CALL(cudaStreamSynchronize(static_cast<cudaStream_t>(stream)));
CUDA_CALL(hipSetDevice(ctx.device_id));
CUDA_CALL(hipStreamSynchronize(static_cast<hipStream_t>(stream)));
}
/** NOTE: If the backend is PyTorch, we will use PyTorch's stream management,
......@@ -227,10 +229,10 @@ class CUDADeviceAPI final : public DeviceAPI {
void SetStream(DGLContext ctx, DGLStreamHandle stream) final {}
DGLStreamHandle GetStream() const final {
return static_cast<DGLStreamHandle>(getCurrentCUDAStream());
return static_cast<DGLStreamHandle>(getCurrentHIPStreamMasqueradingAsCUDA());
}
/** NOTE: cudaHostRegister can be called from an arbitrary GPU device,
/** NOTE: hipHostRegister can be called from an arbitrary GPU device,
* so we don't need to specify a ctx.
* The pinned memory can be seen by all CUDA contexts,
* not just the one that performed the allocation
......@@ -244,13 +246,13 @@ class CUDADeviceAPI final : public DeviceAPI {
if (tensor_dispatcher->IsAvailable()) {
tensor_dispatcher->CUDAHostAllocatorEmptyCache();
}
CUDA_CALL(cudaHostRegister(ptr, nbytes, cudaHostRegisterDefault));
CUDA_CALL(hipHostRegister(ptr, nbytes, hipHostRegisterDefault));
return true;
}
void UnpinData(void* ptr) {
if (ptr == nullptr) return;
CUDA_CALL(cudaHostUnregister(ptr));
CUDA_CALL(hipHostUnregister(ptr));
}
void* AllocPinnedDataSpace(
......@@ -276,33 +278,33 @@ class CUDADeviceAPI final : public DeviceAPI {
// can't be a pinned tensor if CUDA context is unavailable.
if (!is_available_) return false;
cudaPointerAttributes attr;
cudaError_t status = cudaPointerGetAttributes(&attr, ptr);
hipPointerAttribute_t attr;
hipError_t status = hipPointerGetAttributes(&attr, ptr);
bool result = false;
switch (status) {
case cudaErrorInvalidValue:
case hipErrorInvalidValue:
// might be a normal CPU tensor in CUDA 10.2-
cudaGetLastError(); // clear error
hipGetLastError(); // clear error
break;
case cudaSuccess:
result = (attr.type == cudaMemoryTypeHost);
case hipSuccess:
result = (attr.type == hipMemoryTypeHost);
break;
case cudaErrorInitializationError:
case cudaErrorNoDevice:
case cudaErrorInsufficientDriver:
case cudaErrorInvalidDevice:
case hipErrorInitializationError:
case hipErrorNoDevice:
case hipErrorInsufficientDriver:
case hipErrorInvalidDevice:
// We don't want to fail in these particular cases since this function
// can be called when users only want to run on CPU even if CUDA API is
// enabled, or in a forked subprocess where CUDA context cannot be
// initialized. So we just mark the CUDA context to unavailable and
// return.
is_available_ = false;
cudaGetLastError(); // clear error
hipGetLastError(); // clear error
break;
default:
LOG(FATAL) << "error while determining memory status: "
<< cudaGetErrorString(status);
<< hipGetErrorString(status);
break;
}
......@@ -316,7 +318,7 @@ class CUDADeviceAPI final : public DeviceAPI {
TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global();
if (tensor_dispatcher->IsAvailable())
return tensor_dispatcher->CUDAAllocWorkspace(
size, getCurrentCUDAStream());
size, getCurrentHIPStreamMasqueradingAsCUDA());
return CUDAThreadEntry::ThreadLocal()->pool.AllocWorkspace(ctx, size);
}
......@@ -338,13 +340,13 @@ class CUDADeviceAPI final : public DeviceAPI {
private:
static void GPUCopy(
const void* from, void* to, size_t size, cudaMemcpyKind kind,
cudaStream_t stream) {
CUDA_CALL(cudaMemcpyAsync(to, from, size, kind, stream));
if (stream == 0 && kind == cudaMemcpyDeviceToHost) {
const void* from, void* to, size_t size, hipMemcpyKind kind,
hipStream_t stream) {
CUDA_CALL(hipMemcpyAsync(to, from, size, kind, stream));
if (stream == 0 && kind == hipMemcpyDeviceToHost) {
// only wait for the copy, when it's on the default stream, and it's to
// host memory
CUDA_CALL(cudaStreamSynchronize(stream));
CUDA_CALL(hipStreamSynchronize(stream));
}
}
......@@ -359,7 +361,7 @@ CUDAThreadEntry* CUDAThreadEntry::ThreadLocal() {
return CUDAThreadStore::Get();
}
cudaStream_t getCurrentCUDAStream() {
hipStream_t getCurrentHIPStreamMasqueradingAsCUDA() {
TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global();
if (tensor_dispatcher->IsAvailable())
return tensor_dispatcher->CUDAGetCurrentStream();
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2021 by Contributors
* @file runtime/cuda/cuda_device_common.cuh
......@@ -10,7 +12,7 @@
#include <dgl/runtime/c_runtime_api.h>
#include "cuda_common.h"
#include "cuda_runtime.h"
#include <hip/hip_runtime.h>
namespace dgl {
namespace runtime {
......@@ -228,7 +230,7 @@ class OrderedHashTable {
* @param stream The stream to use for initializing the hashtable.
*/
OrderedHashTable(
const size_t size, DGLContext ctx, cudaStream_t stream,
const size_t size, DGLContext ctx, hipStream_t stream,
const int scale = kDefaultScale);
/**
......@@ -252,7 +254,7 @@ class OrderedHashTable {
*/
void FillWithDuplicates(
const IdType* const input, const size_t num_input, IdType* const unique,
int64_t* const num_unique, cudaStream_t stream);
int64_t* const num_unique, hipStream_t stream);
/**
* @brief Fill the hashtable with an array of unique keys.
......@@ -262,7 +264,7 @@ class OrderedHashTable {
* @param stream The stream to perform operations on.
*/
void FillWithUnique(
const IdType* const input, const size_t num_input, cudaStream_t stream);
const IdType* const input, const size_t num_input, hipStream_t stream);
/**
* @brief Get a verison of the hashtable usable from device functions.
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2021 by Contributors
* @file runtime/cuda/cuda_device_common.cuh
......@@ -5,7 +7,7 @@
*/
#include <cassert>
#include <cub/cub.cuh> // NOLINT
#include <hipcub/hipcub.hpp> // NOLINT
#include "../../array/cuda/atomic.cuh"
#include "cuda_common.h"
......@@ -54,7 +56,9 @@ class MutableDeviceOrderedHashTable : public DeviceOrderedHashTable<IdType> {
* @return The mapping.
*/
inline __device__ Iterator Search(const IdType id) {
const IdType pos = SearchForPosition(id);
// const IdType pos = SearchForPosition(id);
const IdType pos = DeviceOrderedHashTable<IdType>::SearchForPosition(id);
return GetMutable(pos);
}
......@@ -95,12 +99,16 @@ class MutableDeviceOrderedHashTable : public DeviceOrderedHashTable<IdType> {
* @return An iterator to inserted mapping.
*/
inline __device__ Iterator Insert(const IdType id, const size_t index) {
size_t pos = Hash(id);
// size_t pos = Hash(id);
size_t pos = DeviceOrderedHashTable<IdType>::Hash(id);
// linearly scan for an empty slot or matching entry
IdType delta = 1;
while (!AttemptInsertAt(pos, id, index)) {
pos = Hash(pos + delta);
// pos = Hash(pos + delta);
pos = DeviceOrderedHashTable<IdType>::Hash(pos + delta);
delta += 1;
}
......@@ -246,7 +254,7 @@ __global__ void count_hashmap(
DeviceOrderedHashTable<IdType> table, IdType* const num_unique) {
assert(BLOCK_SIZE == blockDim.x);
using BlockReduce = typename cub::BlockReduce<IdType, BLOCK_SIZE>;
using BlockReduce = typename hipcub::BlockReduce<IdType, BLOCK_SIZE>;
using Mapping = typename DeviceOrderedHashTable<IdType>::Mapping;
const size_t block_start = TILE_SIZE * blockIdx.x;
......@@ -300,7 +308,7 @@ __global__ void compact_hashmap(
assert(BLOCK_SIZE == blockDim.x);
using FlagType = uint16_t;
using BlockScan = typename cub::BlockScan<FlagType, BLOCK_SIZE>;
using BlockScan = typename hipcub::BlockScan<FlagType, BLOCK_SIZE>;
using Mapping = typename DeviceOrderedHashTable<IdType>::Mapping;
constexpr const int32_t VALS_PER_THREAD = TILE_SIZE / BLOCK_SIZE;
......@@ -359,7 +367,7 @@ DeviceOrderedHashTable<IdType> OrderedHashTable<IdType>::DeviceHandle() const {
template <typename IdType>
OrderedHashTable<IdType>::OrderedHashTable(
const size_t size, DGLContext ctx, cudaStream_t stream, const int scale)
const size_t size, DGLContext ctx, hipStream_t stream, const int scale)
: table_(nullptr), size_(TableSize(size, scale)), ctx_(ctx) {
// make sure we will at least as many buckets as items.
CHECK_GT(scale, 0);
......@@ -368,7 +376,7 @@ OrderedHashTable<IdType>::OrderedHashTable(
table_ = static_cast<Mapping*>(
device->AllocWorkspace(ctx_, sizeof(Mapping) * size_));
CUDA_CALL(cudaMemsetAsync(
CUDA_CALL(hipMemsetAsync(
table_, DeviceOrderedHashTable<IdType>::kEmptyKey,
sizeof(Mapping) * size_, stream));
}
......@@ -382,7 +390,7 @@ OrderedHashTable<IdType>::~OrderedHashTable() {
template <typename IdType>
void OrderedHashTable<IdType>::FillWithDuplicates(
const IdType* const input, const size_t num_input, IdType* const unique,
int64_t* const num_unique, cudaStream_t stream) {
int64_t* const num_unique, hipStream_t stream) {
auto device = runtime::DeviceAPI::Get(ctx_);
const int64_t num_tiles = (num_input + TILE_SIZE - 1) / TILE_SIZE;
......@@ -404,12 +412,12 @@ void OrderedHashTable<IdType>::FillWithDuplicates(
input, num_input, device_table, item_prefix);
size_t workspace_bytes;
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
nullptr, workspace_bytes, static_cast<IdType*>(nullptr),
static_cast<IdType*>(nullptr), grid.x + 1, stream));
void* workspace = device->AllocWorkspace(ctx_, workspace_bytes);
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
workspace, workspace_bytes, item_prefix, item_prefix, grid.x + 1,
stream));
device->FreeWorkspace(ctx_, workspace);
......@@ -422,7 +430,7 @@ void OrderedHashTable<IdType>::FillWithDuplicates(
template <typename IdType>
void OrderedHashTable<IdType>::FillWithUnique(
const IdType* const input, const size_t num_input, cudaStream_t stream) {
const IdType* const input, const size_t num_input, hipStream_t stream) {
const int64_t num_tiles = (num_input + TILE_SIZE - 1) / TILE_SIZE;
const dim3 grid(num_tiles);
......
// !!! This is a file automatically generated by hipify!!!
/*!
* Copyright (c) 2022 by Contributors
*
......@@ -20,7 +21,7 @@
#ifndef DGL_RUNTIME_CUDA_GPU_CACHE_H_
#define DGL_RUNTIME_CUDA_GPU_CACHE_H_
#include <cuda_runtime.h>
#include <hip/hip_runtime.h>
#include <dgl/array.h>
#include <dgl/aten/array_ops.h>
#include <dgl/packed_func_ext.h>
......@@ -31,7 +32,7 @@
#include <nv_gpu_cache.hpp>
#include "../../runtime/cuda/cuda_common.h"
#include "cuda_common.h"
namespace dgl {
namespace runtime {
......@@ -55,12 +56,12 @@ class GpuCache : public runtime::Object {
: num_feats(num_feats),
cache(std::make_unique<gpu_cache_t>(
(num_items + bucket_size - 1) / bucket_size, num_feats)) {
CUDA_CALL(cudaGetDevice(&cuda_device));
CUDA_CALL(hipGetDevice(&cuda_device));
}
std::tuple<NDArray, IdArray, IdArray> Query(IdArray keys) {
const auto &ctx = keys->ctx;
cudaStream_t stream = dgl::runtime::getCurrentCUDAStream();
hipStream_t stream = dgl::runtime::getCurrentHIPStreamMasqueradingAsCUDA();
auto device = dgl::runtime::DeviceAPI::Get(ctx);
CHECK_EQ(ctx.device_type, kDGLCUDA)
<< "The keys should be on a CUDA device";
......@@ -94,7 +95,7 @@ class GpuCache : public runtime::Object {
}
void Replace(IdArray keys, NDArray values) {
cudaStream_t stream = dgl::runtime::getCurrentCUDAStream();
hipStream_t stream = dgl::runtime::getCurrentHIPStreamMasqueradingAsCUDA();
CHECK_EQ(keys->ctx.device_type, kDGLCUDA)
<< "The keys should be on a CUDA device";
CHECK_EQ(keys->ctx.device_id, cuda_device)
......
......@@ -121,7 +121,7 @@ bool RuntimeEnabled(const std::string& target) {
} else if (target.length() >= 5 && target.substr(0, 5) == "nvptx") {
f_name = "device_api.cuda";
} else if (target.length() >= 4 && target.substr(0, 4) == "rocm") {
f_name = "device_api.rocm";
f_name = "device_api.cuda";
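// In this port the ROCm device API appears to be registered under the CUDA
// name, so the "rocm" target resolves to the same "device_api.cuda" packed
// function (consistent with kDGLROCM reporting "cuda" elsewhere in this
// commit).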
} else if (target.length() >= 4 && target.substr(0, 4) == "llvm") {
const PackedFunc* pf =
runtime::Registry::Get("codegen.llvm_target_enabled");
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2017-2022 by Contributors
* @file ndarray.cc
......@@ -26,7 +27,7 @@ constexpr DGLDataType DGLDataTypeTraits<uint64_t>::dtype;
#ifdef DGL_USE_CUDA
constexpr DGLDataType DGLDataTypeTraits<__half>::dtype;
#if BF16_ENABLED
constexpr DGLDataType DGLDataTypeTraits<__nv_bfloat16>::dtype;
constexpr DGLDataType DGLDataTypeTraits<__hip_bfloat16>::dtype;
#endif // BF16_ENABLED
#endif // DGL_USE_CUDA
constexpr DGLDataType DGLDataTypeTraits<float>::dtype;
......@@ -222,7 +223,7 @@ void NDArray::RecordedCopyFromTo(
CHECK(from->ctx.device_type != to->ctx.device_type)
<< "Recording event is only called for the copy between CPU and GPU.";
CHECK(from->ctx.device_type == kDGLCUDA || to->ctx.device_type == kDGLCUDA)
CHECK(from->ctx.device_type == kDGLCUDA || from->ctx.device_type == kDGLROCM ||
to->ctx.device_type == kDGLCUDA || to->ctx.device_type == kDGLROCM)
<< "At least one CUDA ctx needs to be involved.";
DeviceAPI::Get(kDGLCUDA)->RecordedCopyDataFromTo(
......@@ -262,7 +263,7 @@ void NDArray::PinContainer(NDArray::Container* ptr) {
void NDArray::UnpinContainer(NDArray::Container* ptr) {
auto container_is_pinned = IsContainerPinned(ptr);
// The tensor may be pinned outside of DGL via a different CUDA API,
// so we cannot unpin it with cudaHostUnregister.
// so we cannot unpin it with hipHostUnregister.
CHECK(ptr->pinned_by_dgl_ || !container_is_pinned)
<< "Cannot unpin a tensor that is pinned outside of DGL.";
// 1. not pinned, do nothing
......