update src and graphbolt code

6ac701f8 · sangwzh · 1547bd93 · 6ac701f8 · 6ac701f8 · 6ac701f8
Commit 6ac701f8 authored Sep 13, 2024 by sangwzh
20 changed files
--- a/src/array/cuda/array_sort.cu
+++ b/src/array/cuda/array_sort.cu
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
 *  Copyright (c) 2020 by Contributors
 * @file array/cpu/array_sort.cu
 * @brief Array sort GPU implementation
 */
 #include <dgl/array.h>
+#include "../../../include/dgl/array.h"

-#include <cub/cub.cuh>
+
+#include <hipcub/hipcub.hpp>

 #include "../../runtime/cuda/cuda_common.h"
-#include "./utils.h"
+#include "utils.h"

 namespace dgl {
 using runtime::NDArray;
@@ -29,20 +33,20 @@ std::pair<IdArray, IdArray> Sort(IdArray array, int num_bits) {
  IdType* keys_out = sorted_array.Ptr<IdType>();
  int64_t* values_out = sorted_idx.Ptr<int64_t>();

-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  if (num_bits == 0) {
    num_bits = sizeof(IdType) * 8;
  }

  // Allocate workspace
  size_t workspace_size = 0;
-  CUDA_CALL(cub::DeviceRadixSort::SortPairs(
+  CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
      nullptr, workspace_size, keys_in, keys_out, values_in, values_out, nitems,
      0, num_bits, stream));
  void* workspace = device->AllocWorkspace(ctx, workspace_size);

  // Compute
-  CUDA_CALL(cub::DeviceRadixSort::SortPairs(
+  CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
      workspace, workspace_size, keys_in, keys_out, values_in, values_out,
      nitems, 0, num_bits, stream));


--- a/src/array/cuda/atomic.cuh
+++ b/src/array/cuda/atomic.cuh
+// !!! This is a file automatically generated by hipify!!!
 /**
 *  Copyright (c) 2019 by Contributors
 * @file array/cuda/atomic.cuh
@@ -6,7 +7,7 @@
 #ifndef DGL_ARRAY_CUDA_ATOMIC_CUH_
 #define DGL_ARRAY_CUDA_ATOMIC_CUH_

-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>

 #include <cassert>
 #include <cstdint>
@@ -15,8 +16,8 @@
 #include "bf16.cuh"
 #include "fp16.cuh"

-#if __CUDA_ARCH__ >= 600
-#include <cuda_fp16.h>
+#if __HIPCC__ 
+#include <hip/hip_fp16.h>
 #endif

 namespace dgl {
@@ -56,39 +57,39 @@ struct Cast {

 template <>
 struct Cast<half> {
-  typedef Code<sizeof(half)>::Type Type;
-  static __device__ __forceinline__ Type Encode(half val) {
+  typedef half Type;
+  static __host__ __device__ __forceinline__ Type Encode(half val) {
    return __half_as_ushort(val);
  }
-  static __device__ __forceinline__ half Decode(Type code) {
+  static __host__ __device__ __forceinline__ half Decode(Type code) {
    return __ushort_as_half(code);
  }
 };

 #if BF16_ENABLED
 template <>
-struct Cast<__nv_bfloat16> {
-  typedef Code<sizeof(__nv_bfloat16)>::Type Type;
-  static __device__ __forceinline__ Type Encode(__nv_bfloat16 val) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+struct Cast<__hip_bfloat16> {
+  typedef __hip_bfloat16 Type;
+  static __host__ __device__ __forceinline__ Type Encode(__hip_bfloat16 val) {
+#if defined(__HIP_DEVICE_COMPILE__)
    return __bfloat16_as_ushort(val);
 #else
    printf(
        "Atomic operations are not supported for bfloat16 (BF16) "
        "on GPUs with compute capability less than 8.0.\n");
-    __trap();
+    // //__trap();
    return static_cast<Type>(0);
 #endif
  }
-  static __device__ __forceinline__ __nv_bfloat16 Decode(Type code) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+  static __host__ __device__ __forceinline__ __hip_bfloat16 Decode(Type code) {
+#if defined(__HIP_DEVICE_COMPILE__)
    return __ushort_as_bfloat16(code);
 #else
    printf(
        "Atomic operations are not supported for bfloat16 (BF16) "
        "on GPUs with compute capability less than 8.0.\n");
-    __trap();
-    return static_cast<__nv_bfloat16>(0.0f);
+    //__trap();
+    return static_cast<__hip_bfloat16>(0.0f);
 #endif
  }
 };
@@ -116,12 +117,12 @@ struct Cast<double> {
  }
 };

-static __device__ __forceinline__ unsigned short int atomicCASshort(  // NOLINT
+static __host__ __device__ __forceinline__ unsigned short int atomicCASshort(  // NOLINT
    unsigned short int* address,                                      // NOLINT
    unsigned short int compare,                                       // NOLINT
    unsigned short int val) {                                         // NOLINT
-  static_assert(CUDART_VERSION >= 10000, "Requires at least CUDA 10");
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__) >= 700)
+  static_assert(DTKRT_VERSION >= 10000, "Requires at least CUDA 10");
+#if defined(__HIP_DEVICE_COMPILE__) && 0
  return atomicCAS(address, compare, val);
 #else
  (void)address;
@@ -130,9 +131,9 @@ static __device__ __forceinline__ unsigned short int atomicCASshort(  // NOLINT
  printf(
      "Atomic operations are not supported for half precision (FP16) "
      "on this GPU.\n");
-  __trap();
+  abort();
  return val;
-#endif  // (defined(__CUDA_ARCH__) && (__CUDA_ARCH__) >= 700)
+#endif  // (defined(__HIP_DEVICE_COMPILE__) 
 }

 #define DEFINE_ATOMIC(NAME)                                   \
@@ -168,19 +169,53 @@ static __device__ __forceinline__ unsigned short int atomicCASshort(  // NOLINT
    return Cast<dtype>::Decode(old);                               \
  }

-#define OP(a, b) max(a, b)
+#define DEFINE_ATOMIC_16BIT_BF(NAME, dtype)                           \
+  template <>                                                      \
+  __device__ __forceinline__ dtype Atomic##NAME<dtype>(            \
+      dtype * addr, dtype val) {                                   \
+    typedef uint16_t CT;                                           \
+    CT* addr_as_ui = reinterpret_cast<CT*>(addr);                  \
+    CT old = *addr_as_ui;                                          \
+    CT assumed = old;                                              \
+    do {                                                           \
+      assumed = old;                                               \
+      old = atomicCASshort(                                        \
+          addr_as_ui, assumed,                                     \
+          Cast<dtype>::Encode(max((double)val, (double)dtype(old)))); \
+    } while (assumed != old);                                      \
+    return Cast<dtype>::Decode(old);                               \
+  }
+
+#define DEFINE_ATOMIC_16BIT_Min(NAME, dtype)                           \
+  template <>                                                      \
+  __device__ __forceinline__ dtype Atomic##NAME<dtype>(            \
+      dtype * addr, dtype val) {                                   \
+    typedef uint16_t CT;                                           \
+    CT* addr_as_ui = reinterpret_cast<CT*>(addr);                  \
+    CT old = *addr_as_ui;                                          \
+    CT assumed = old;                                              \
+    do {                                                           \
+      assumed = old;                                               \
+      old = atomicCASshort(                                        \
+          addr_as_ui, assumed,                                     \
+          Cast<dtype>::Encode(min(val, dtype(old)))); \
+    } while (assumed != old);                                      \
+    return Cast<dtype>::Decode(old);                               \
+  }
+
+#define OP(a, b) max((double)a, (double)b)
 DEFINE_ATOMIC(Max)
 DEFINE_ATOMIC_16BIT(Max, half)
 #if BF16_ENABLED
-DEFINE_ATOMIC_16BIT(Max, __nv_bfloat16)
+DEFINE_ATOMIC_16BIT_BF(Max, __hip_bfloat16)
 #endif  // BF16_ENABLED
 #undef OP

-#define OP(a, b) min(a, b)
+#define OP(a, b) min((double)a, (double)b)
 DEFINE_ATOMIC(Min)
 DEFINE_ATOMIC_16BIT(Min, half)
 #if BF16_ENABLED
-DEFINE_ATOMIC_16BIT(Min, __nv_bfloat16)
+DEFINE_ATOMIC_16BIT_BF(Min, __hip_bfloat16)
 #endif  // BF16_ENABLED
 #undef OP

@@ -256,7 +291,7 @@ inline __device__ int32_t AtomicMax(int32_t* const address, const int32_t val) {

 template <>
 __device__ __forceinline__ float AtomicAdd<float>(float* addr, float val) {
-#if __CUDA_ARCH__ >= 200
+#if __HIP_DEVICE_COMPILE__ 
  return atomicAdd(addr, val);
 #else
  typedef float T;
@@ -270,12 +305,12 @@ __device__ __forceinline__ float AtomicAdd<float>(float* addr, float val) {
        addr_as_ui, assumed, Cast<T>::Encode(Cast<T>::Decode(old) + val));
  } while (assumed != old);
  return Cast<T>::Decode(old);
-#endif  // __CUDA_ARCH__
+#endif  // __HIP_DEVICE_COMPILE__
 }

 template <>
 __device__ __forceinline__ double AtomicAdd<double>(double* addr, double val) {
-#if __CUDA_ARCH__ >= 600
+#if __HIP_DEVICE_COMPILE__ 
  return atomicAdd(addr, val);
 #else
  typedef double T;
@@ -292,11 +327,11 @@ __device__ __forceinline__ double AtomicAdd<double>(double* addr, double val) {
 #endif
 }

-#if defined(CUDART_VERSION) && CUDART_VERSION >= 10000
+#if defined(DTKRT_VERSION) && DTKRT_VERSION >= 10000
 template <>
 __device__ __forceinline__ half AtomicAdd<half>(half* addr, half val) {
 // make sure we have half support
-#if __CUDA_ARCH__ >= 700
+#if __HIP_DEVICE_COMPILE__
  return atomicAdd(addr, val);
 #else
  (void)addr;
@@ -304,18 +339,18 @@ __device__ __forceinline__ half AtomicAdd<half>(half* addr, half val) {
  printf(
      "Atomic operations are not supported for half precision (FP16) "
      "on this GPU.\n");
-  __trap();
+  // //__trap();
  return val;
-#endif  // __CUDA_ARCH__ >= 700
+#endif  // __HIP_DEVICE_COMPILE__ 
 }
-#endif  // defined(CUDART_VERSION) && CUDART_VERSION >= 10000
+#endif  // defined(DTKRT_VERSION) && DTKRT_VERSION >= 10000

 #if BF16_ENABLED
 template <>
-__device__ __forceinline__ __nv_bfloat16
-AtomicAdd<__nv_bfloat16>(__nv_bfloat16* addr, __nv_bfloat16 val) {
+__device__ __forceinline__ __hip_bfloat16
+AtomicAdd<__hip_bfloat16>(__hip_bfloat16* addr, __hip_bfloat16 val) {
 // make sure we have bfloat16 support
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+#if defined(__HIP_DEVICE_COMPILE__) 
  return atomicAdd(addr, val);
 #else
  (void)addr;
@@ -323,9 +358,9 @@ AtomicAdd<__nv_bfloat16>(__nv_bfloat16* addr, __nv_bfloat16 val) {
  printf(
      "Atomic operations are not supported for bfloat16 (BF16) "
      "on GPUs with compute capability less than 8.0.\n");
-  __trap();
+  //__trap();
  return val;
-#endif  // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+#endif  // defined(__HIP_DEVICE_COMPILE__) 
 }
 #endif  // BF16_ENABLED


--- a/src/array/cuda/bf16.cuh
+++ b/src/array/cuda/bf16.cuh
+// !!! This is a file automatically generated by hipify!!!
 /**
 *  Copyright (c) 2022 by Contributors
 *
@@ -18,131 +19,140 @@
 */
 #ifndef DGL_ARRAY_CUDA_BF16_CUH_
 #define DGL_ARRAY_CUDA_BF16_CUH_
-
+#include <hip/hip_runtime.h>
 #if BF16_ENABLED
-#include <cuda_bf16.h>
+#include <hip/hip_bf16.h>

 #include <algorithm>

-static __device__ __forceinline__ __nv_bfloat16
-max(__nv_bfloat16 a, __nv_bfloat16 b) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+static __device__ __forceinline__ __hip_bfloat16
+max(__hip_bfloat16 a, __hip_bfloat16 b) {
+#if defined(__HIP_DEVICE_COMPILE__)
  return __hmax(a, b);
 #else
-  return __nv_bfloat16(max(float(a), float(b)));  // NOLINT
+  return __hip_bfloat16(max(float(a), float(b)));  // NOLINT
 #endif
 }

-static __device__ __forceinline__ __nv_bfloat16
-min(__nv_bfloat16 a, __nv_bfloat16 b) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+static __device__ __forceinline__ __hip_bfloat16
+min(__hip_bfloat16 a, __hip_bfloat16 b) {
+#if defined(__HIP_DEVICE_COMPILE__) 
  return __hmin(a, b);
 #else
-  return __nv_bfloat16(min(float(a), float(b)));  // NOLINT
+  return __hip_bfloat16(min(float(a), float(b)));  // NOLINT
 #endif
 }

-#ifdef __CUDACC__
+#ifdef __HIPCC__ 
 // Arithmetic BF16 operations for architecture >= 8.0 are already defined in
-// cuda_bf16.h
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800)
-// CUDA 12.2 adds "emulated" support for older architectures.
-#if defined(CUDART_VERSION) && (CUDART_VERSION < 12020)
-__device__ __forceinline__ __nv_bfloat16
-operator+(const __nv_bfloat16& lh, const __nv_bfloat16& rh) {
-  return __nv_bfloat16(float(lh) + float(rh));  // NOLINT
-}
-__device__ __forceinline__ __nv_bfloat16
-operator-(const __nv_bfloat16& lh, const __nv_bfloat16& rh) {
-  return __nv_bfloat16(float(lh) - float(rh));  // NOLINT
-}
-__device__ __forceinline__ __nv_bfloat16
-operator*(const __nv_bfloat16& lh, const __nv_bfloat16& rh) {
-  return __nv_bfloat16(float(lh) * float(rh));  // NOLINT
-}
-__device__ __forceinline__ __nv_bfloat16
-operator/(const __nv_bfloat16& lh, const __nv_bfloat16& rh) {
-  return __nv_bfloat16(float(lh) / float(rh));  // NOLINT
+// hip/__hip_bfloat16.h
+// #if defined(__DTK_ARCH__) && (__DTK_ARCH__ < 800)
+// // CUDA 12.2 adds "emulated" support for older architectures.
+// #if defined(DTKRT_VERSION) && (DTKRT_VERSION < 12020)
+__device__ __forceinline__ __hip_bfloat16
+operator+(const __hip_bfloat16& lh, const __hip_bfloat16& rh) {
+  return __hip_bfloat16(float(lh) + float(rh));  // NOLINT
+}
+__device__ __forceinline__ __hip_bfloat16
+operator-(const __hip_bfloat16& lh, const __hip_bfloat16& rh) {
+  return __hip_bfloat16(float(lh) - float(rh));  // NOLINT
+}
+__device__ __forceinline__ __hip_bfloat16
+operator*(const __hip_bfloat16& lh, const __hip_bfloat16& rh) {
+  return __hip_bfloat16(float(lh) * float(rh));  // NOLINT
+}
+__device__ __forceinline__ __hip_bfloat16
+operator/(const __hip_bfloat16& lh, const __hip_bfloat16& rh) {
+  return __hip_bfloat16(float(lh) / float(rh));  // NOLINT
 }

-__device__ __forceinline__ __nv_bfloat16& operator+=(
-    __nv_bfloat16& lh, const __nv_bfloat16& rh) {  // NOLINT
-  lh = __nv_bfloat16(float(lh) + float(rh));       // NOLINT
+__device__ __forceinline__ __hip_bfloat16& operator+=(
+    __hip_bfloat16& lh, const __hip_bfloat16& rh) {  // NOLINT
+  lh = __hip_bfloat16(float(lh) + float(rh));       // NOLINT
  return lh;
 }
-__device__ __forceinline__ __nv_bfloat16& operator-=(
-    __nv_bfloat16& lh, const __nv_bfloat16& rh) {  // NOLINT
-  lh = __nv_bfloat16(float(lh) - float(rh));       // NOLINT
+__device__ __forceinline__ __hip_bfloat16& operator-=(
+    __hip_bfloat16& lh, const __hip_bfloat16& rh) {  // NOLINT
+  lh = __hip_bfloat16(float(lh) - float(rh));       // NOLINT
  return lh;
 }
-__device__ __forceinline__ __nv_bfloat16& operator*=(
-    __nv_bfloat16& lh, const __nv_bfloat16& rh) {  // NOLINT
-  lh = __nv_bfloat16(float(lh) * float(rh));       // NOLINT
+__device__ __forceinline__ __hip_bfloat16& operator*=(
+    __hip_bfloat16& lh, const __hip_bfloat16& rh) {  // NOLINT
+  lh = __hip_bfloat16(float(lh) * float(rh));       // NOLINT
  return lh;
 }
-__device__ __forceinline__ __nv_bfloat16& operator/=(
-    __nv_bfloat16& lh, const __nv_bfloat16& rh) {  // NOLINT
-  lh = __nv_bfloat16(float(lh) / float(rh));       // NOLINT
+__device__ __forceinline__ __hip_bfloat16& operator/=(
+    __hip_bfloat16& lh, const __hip_bfloat16& rh) {  // NOLINT
+  lh = __hip_bfloat16(float(lh) / float(rh));       // NOLINT
  return lh;
 }

-__device__ __forceinline__ __nv_bfloat16& operator++(
-    __nv_bfloat16& h) {                // NOLINT
-  h = __nv_bfloat16(float(h) + 1.0f);  // NOLINT
+__device__ __forceinline__ __hip_bfloat16& operator++(
+    __hip_bfloat16& h) {                // NOLINT
+  h = __hip_bfloat16(float(h) + 1.0f);  // NOLINT
  return h;
 }
-__device__ __forceinline__ __nv_bfloat16& operator--(
-    __nv_bfloat16& h) {                // NOLINT
-  h = __nv_bfloat16(float(h) - 1.0f);  // NOLINT
+__device__ __forceinline__ __hip_bfloat16& operator--(
+    __hip_bfloat16& h) {                // NOLINT
+  h = __hip_bfloat16(float(h) - 1.0f);  // NOLINT
  return h;
 }
-__device__ __forceinline__ __nv_bfloat16
-operator++(__nv_bfloat16& h, int) {  // NOLINT
-  __nv_bfloat16 ret = h;
-  h = __nv_bfloat16(float(h) + 1.0f);  // NOLINT
+__device__ __forceinline__ __hip_bfloat16
+operator++(__hip_bfloat16& h, int) {  // NOLINT
+  __hip_bfloat16 ret = h;
+  h = __hip_bfloat16(float(h) + 1.0f);  // NOLINT
  return ret;
 }
-__device__ __forceinline__ __nv_bfloat16
-operator--(__nv_bfloat16& h, int) {  // NOLINT
-  __nv_bfloat16 ret = h;
-  h = __nv_bfloat16(float(h) - 1.0f);  // NOLINT
+__device__ __forceinline__ __hip_bfloat16
+operator--(__hip_bfloat16& h, int) {  // NOLINT
+  __hip_bfloat16 ret = h;
+  h = __hip_bfloat16(float(h) - 1.0f);  // NOLINT
  return ret;
 }

-__device__ __forceinline__ __nv_bfloat16 operator+(const __nv_bfloat16& h) {
+__device__ __forceinline__ __hip_bfloat16 operator+(const __hip_bfloat16& h) {
  return h;
 }
-__device__ __forceinline__ __nv_bfloat16 operator-(const __nv_bfloat16& h) {
-  return __nv_bfloat16(-float(h));  // NOLINT
+__device__ __forceinline__ __hip_bfloat16 operator-(const __hip_bfloat16& h) {
+  return __hip_bfloat16(-float(h));  // NOLINT
 }

 __device__ __forceinline__ bool operator==(
-    const __nv_bfloat16& lh, const __nv_bfloat16& rh) {
+    const __hip_bfloat16& lh, const __hip_bfloat16& rh) {
  return float(lh) == float(rh);  // NOLINT
 }
 __device__ __forceinline__ bool operator!=(
-    const __nv_bfloat16& lh, const __nv_bfloat16& rh) {
+    const __hip_bfloat16& lh, const __hip_bfloat16& rh) {
  return float(lh) != float(rh);  // NOLINT
 }
 __device__ __forceinline__ bool operator>(
-    const __nv_bfloat16& lh, const __nv_bfloat16& rh) {
+    const __hip_bfloat16& lh, const __hip_bfloat16& rh) {
  return float(lh) > float(rh);  // NOLINT
 }
 __device__ __forceinline__ bool operator<(
-    const __nv_bfloat16& lh, const __nv_bfloat16& rh) {
+    const __hip_bfloat16& lh, const __hip_bfloat16& rh) {
  return float(lh) < float(rh);  // NOLINT
 }
 __device__ __forceinline__ bool operator>=(
-    const __nv_bfloat16& lh, const __nv_bfloat16& rh) {
+    const __hip_bfloat16& lh, const __hip_bfloat16& rh) {
  return float(lh) >= float(rh);  // NOLINT
 }
 __device__ __forceinline__ bool operator<=(
-    const __nv_bfloat16& lh, const __nv_bfloat16& rh) {
+    const __hip_bfloat16& lh, const __hip_bfloat16& rh) {
  return float(lh) <= float(rh);  // NOLINT
 }
-#endif  // defined(CUDART_VERSION) && (CUDART_VERSION < 12020)
-#endif  // defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800)
-#endif  // __CUDACC__
+// #endif  // defined(DTKRT_VERSION) && (DTKRT_VERSION < 12020)
+// #endif  // defined(__DTK_ARCH__) && (__DTK_ARCH__ < 800)
+__device__
+inline
+__hip_bfloat16 __shfl_down(__hip_bfloat16 var, unsigned int lane_delta, int width = warpSize) {
+    union { unsigned short s; __hip_bfloat16 us; } tmp;
+    tmp.us = var;
+    tmp.s = __shfl_down(tmp.s, lane_delta, width);
+    return tmp.us;
+}
+#endif  // __HIPCC__
+

 #endif  // BF16_ENABLED


--- a/src/array/cuda/coo2csr.cu
+++ b/src/array/cuda/coo2csr.cu
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
 *  Copyright (c) 2020 by Contributors
 * @file array/cuda/coo2csr.cc
 * @brief COO2CSR
 */
 #include <dgl/array.h>
+#include "../../../include/dgl/array.h"
+

 #include "../../runtime/cuda/cuda_common.h"
-#include "./utils.h"
+#include "utils.h"

 namespace dgl {

@@ -24,12 +28,12 @@ CSRMatrix COOToCSR(COOMatrix coo) {
 template <>
 CSRMatrix COOToCSR<kDGLCUDA, int32_t>(COOMatrix coo) {
  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  // allocate cusparse handle if needed
  if (!thr_entry->cusparse_handle) {
-    CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
+    CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
  }
-  CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream));
+  CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream));

  bool row_sorted = coo.row_sorted;
  bool col_sorted = coo.col_sorted;
@@ -50,9 +54,9 @@ CSRMatrix COOToCSR<kDGLCUDA, int32_t>(COOMatrix coo) {
  NDArray indptr =
      aten::NewIdArray(coo.num_rows + 1, coo.row->ctx, coo.row->dtype.bits);
  int32_t* indptr_ptr = static_cast<int32_t*>(indptr->data);
-  CUSPARSE_CALL(cusparseXcoo2csr(
+  CUSPARSE_CALL(hipsparseXcoo2csr(
      thr_entry->cusparse_handle, coo.row.Ptr<int32_t>(), nnz, coo.num_rows,
-      indptr_ptr, CUSPARSE_INDEX_BASE_ZERO));
+      indptr_ptr, HIPSPARSE_INDEX_BASE_ZERO));

  return CSRMatrix(
      coo.num_rows, coo.num_cols, indptr, coo.col, coo.data, col_sorted);
@@ -100,7 +104,7 @@ template <>
 CSRMatrix COOToCSR<kDGLCUDA, int64_t>(COOMatrix coo) {
  const auto& ctx = coo.row->ctx;
  const auto nbits = coo.row->dtype.bits;
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  bool row_sorted = coo.row_sorted;
  bool col_sorted = coo.col_sorted;
  if (!row_sorted) {

--- a/src/array/cuda/coo_sort.cu
+++ b/src/array/cuda/coo_sort.cu
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
 *  Copyright (c) 2020 by Contributors
 * @file array/cuda/coo_sort.cc
 * @brief Sort COO index
 */
 #include <dgl/array.h>
+#include "../../../include/dgl/array.h"
+

 #include "../../c_api_common.h"
 #include "../../runtime/cuda/cuda_common.h"
-#include "./utils.h"
+#include "utils.h"

 namespace dgl {

@@ -65,7 +69,7 @@ __global__ void _COODecodeEdgesKernel(

 template <DGLDeviceType XPU, typename IdType>
 void COOSort_(COOMatrix* coo, bool sort_column) {
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  const int row_bits = cuda::_NumberOfBits(coo->num_rows);

  const int64_t nnz = coo->row->shape[0];
@@ -138,7 +142,7 @@ template <DGLDeviceType XPU, typename IdType>
 std::pair<bool, bool> COOIsSorted(COOMatrix coo) {
  const int64_t nnz = coo.row->shape[0];
  const auto& ctx = coo.row->ctx;
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  auto device = runtime::DeviceAPI::Get(ctx);
  // We allocate a workspace of 2*nnz bytes. It wastes a little bit memory but
  // should be fine.

--- a/src/array/cuda/csr2coo.cu
+++ b/src/array/cuda/csr2coo.cu
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
 *  Copyright (c) 2020 by Contributors
 * @file array/cuda/csr2coo.cc
@@ -8,10 +10,10 @@
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/transform_iterator.h>

-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>

 #include "../../runtime/cuda/cuda_common.h"
-#include "./utils.h"
+#include "utils.h"

 namespace dgl {

@@ -29,12 +31,12 @@ COOMatrix CSRToCOO(CSRMatrix csr) {
 template <>
 COOMatrix CSRToCOO<kDGLCUDA, int32_t>(CSRMatrix csr) {
  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  // allocate cusparse handle if needed
  if (!thr_entry->cusparse_handle) {
-    CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
+    CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
  }
-  CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream));
+  CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream));

  NDArray indptr = csr.indptr, indices = csr.indices, data = csr.data;
  const int32_t* indptr_ptr = static_cast<int32_t*>(indptr->data);
@@ -42,9 +44,9 @@ COOMatrix CSRToCOO<kDGLCUDA, int32_t>(CSRMatrix csr) {
      aten::NewIdArray(indices->shape[0], indptr->ctx, indptr->dtype.bits);
  int32_t* row_ptr = static_cast<int32_t*>(row->data);

-  CUSPARSE_CALL(cusparseXcsr2coo(
+  CUSPARSE_CALL(hipsparseXcsr2coo(
      thr_entry->cusparse_handle, indptr_ptr, indices->shape[0], csr.num_rows,
-      row_ptr, CUSPARSE_INDEX_BASE_ZERO));
+      row_ptr, HIPSPARSE_INDEX_BASE_ZERO));

  return COOMatrix(
      csr.num_rows, csr.num_cols, row, indices, data, true, csr.sorted);
@@ -72,10 +74,40 @@ struct AdjacentDifference {
  }
 };

+/*!
+ * \brief Repeat elements
+ * \param val Value to repeat
+ * \param repeats Number of repeats for each value
+ * \param pos The position of the output buffer to write the value.
+ * \param out Output buffer.
+ * \param length Number of values
+ *
+ * For example:
+ * val = [3, 0, 1]
+ * repeats = [1, 0, 2]
+ * pos = [0, 1, 1]  # write to output buffer position 0, 1, 1
+ * then,
+ * out = [3, 1, 1]
+ */
+template <typename DType, typename IdType>
+__global__ void _RepeatKernel(
+    const DType* val, const IdType* pos,
+    DType* out, int64_t n_row, int64_t length) {
+  IdType tx = static_cast<IdType>(blockIdx.x) * blockDim.x + threadIdx.x;
+  const int stride_x = gridDim.x * blockDim.x;
+  while (tx < length) {
+    IdType i = dgl::cuda::_UpperBound(pos, n_row, tx) - 1;
+    out[tx] = val[i];
+    tx += stride_x;
+  }
+}
+
+
+#if 0
 template <>
 COOMatrix CSRToCOO<kDGLCUDA, int64_t>(CSRMatrix csr) {
  const auto& ctx = csr.indptr->ctx;
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();

  const int64_t nnz = csr.indices->shape[0];
  const auto nbits = csr.indptr->dtype.bits;
@@ -96,14 +128,14 @@ COOMatrix CSRToCOO<kDGLCUDA, int64_t>(CSRMatrix csr) {
    std::size_t temp_storage_bytes = 0;
    CUDA_CALL(cub::DeviceCopy::Batched(
        nullptr, temp_storage_bytes, input_buffer + i, output_buffer + i,
-        buffer_sizes + i, std::min(csr.num_rows - i, max_copy_at_once),
+        buffer_sizes + i, ::min(csr.num_rows - i, max_copy_at_once),
        stream));

    auto temp = allocator.alloc_unique<char>(temp_storage_bytes);

    CUDA_CALL(cub::DeviceCopy::Batched(
        temp.get(), temp_storage_bytes, input_buffer + i, output_buffer + i,
-        buffer_sizes + i, std::min(csr.num_rows - i, max_copy_at_once),
+        buffer_sizes + i, ::min(csr.num_rows - i, max_copy_at_once),
        stream));
  }

@@ -111,6 +143,30 @@ COOMatrix CSRToCOO<kDGLCUDA, int64_t>(CSRMatrix csr) {
      csr.num_rows, csr.num_cols, ret_row, csr.indices, csr.data, true,
      csr.sorted);
 }
+#else
+template <>
+COOMatrix CSRToCOO<kDGLCUDA, int64_t>(CSRMatrix csr) {
+  const auto& ctx = csr.indptr->ctx;
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
+
+  const int64_t nnz = csr.indices->shape[0];
+  const auto nbits = csr.indptr->dtype.bits;
+  IdArray rowids = Range(0, csr.num_rows, nbits, ctx);
+  IdArray ret_row = NewIdArray(nnz, ctx, nbits);
+
+  const int nt = 256;
+  const int nb = (nnz + nt - 1) / nt;
+  CUDA_KERNEL_CALL(_RepeatKernel,
+      nb, nt, 0, stream,
+      rowids.Ptr<int64_t>(),
+      csr.indptr.Ptr<int64_t>(), ret_row.Ptr<int64_t>(),
+      csr.num_rows, nnz);
+
+  return COOMatrix(csr.num_rows, csr.num_cols,
+                   ret_row, csr.indices, csr.data,
+                   true, csr.sorted);
+}
+#endif

 template COOMatrix CSRToCOO<kDGLCUDA, int32_t>(CSRMatrix csr);
 template COOMatrix CSRToCOO<kDGLCUDA, int64_t>(CSRMatrix csr);
@@ -128,12 +184,12 @@ COOMatrix CSRToCOODataAsOrder<kDGLCUDA, int32_t>(CSRMatrix csr) {

  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
  auto device = runtime::DeviceAPI::Get(coo.row->ctx);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  // allocate cusparse handle if needed
  if (!thr_entry->cusparse_handle) {
-    CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
+    CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
  }
-  CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream));
+  CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream));

  NDArray row = coo.row, col = coo.col, data = coo.data;
  int32_t* row_ptr = static_cast<int32_t*>(row->data);
@@ -141,11 +197,11 @@ COOMatrix CSRToCOODataAsOrder<kDGLCUDA, int32_t>(CSRMatrix csr) {
  int32_t* data_ptr = static_cast<int32_t*>(data->data);

  size_t workspace_size = 0;
-  CUSPARSE_CALL(cusparseXcoosort_bufferSizeExt(
+  CUSPARSE_CALL(hipsparseXcoosort_bufferSizeExt(
      thr_entry->cusparse_handle, coo.num_rows, coo.num_cols, row->shape[0],
      data_ptr, row_ptr, &workspace_size));
  void* workspace = device->AllocWorkspace(row->ctx, workspace_size);
-  CUSPARSE_CALL(cusparseXcoosortByRow(
+  CUSPARSE_CALL(hipsparseXcoosortByRow(
      thr_entry->cusparse_handle, coo.num_rows, coo.num_cols, row->shape[0],
      data_ptr, row_ptr, col_ptr, workspace));
  device->FreeWorkspace(row->ctx, workspace);

--- a/src/array/cuda/csr_get_data.cu
+++ b/src/array/cuda/csr_get_data.cu
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
 *  Copyright (c) 2021 by Contributors
 * @file array/cuda/csr_get_data.cu
 * @brief Retrieve entries of a CSR matrix
 */
 #include <dgl/array.h>
+#include "../../../include/dgl/array.h"
+

 #include <numeric>
 #include <unordered_set>
 #include <vector>

 #include "../../runtime/cuda/cuda_common.h"
-#include "./utils.h"
+#include "utils.h"

 namespace dgl {

@@ -32,11 +36,11 @@ NDArray CSRGetData(
  const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1;
  const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1;

-  const int64_t rstlen = std::max(rowlen, collen);
+  const int64_t rstlen = ::max(rowlen, collen);
  IdArray rst = NDArray::Empty({rstlen}, weights->dtype, rows->ctx);
  if (rstlen == 0) return rst;

-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  const int nt = cuda::FindNumThreads(rstlen);
  const int nb = (rstlen + nt - 1) / nt;
  if (return_eids)
@@ -67,12 +71,12 @@ template NDArray CSRGetData<kDGLCUDA, int64_t, __half>(
    CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids,
    NDArray weights, __half filler);
 #if BF16_ENABLED
-template NDArray CSRGetData<kDGLCUDA, int32_t, __nv_bfloat16>(
+template NDArray CSRGetData<kDGLCUDA, int32_t, __hip_bfloat16>(
    CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids,
-    NDArray weights, __nv_bfloat16 filler);
-template NDArray CSRGetData<kDGLCUDA, int64_t, __nv_bfloat16>(
+    NDArray weights, __hip_bfloat16 filler);
+template NDArray CSRGetData<kDGLCUDA, int64_t, __hip_bfloat16>(
    CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids,
-    NDArray weights, __nv_bfloat16 filler);
+    NDArray weights, __hip_bfloat16 filler);
 #endif  // BF16_ENABLED
 template NDArray CSRGetData<kDGLCUDA, int32_t, float>(
    CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids,

--- a/src/array/cuda/csr_mm.cu
+++ b/src/array/cuda/csr_mm.cu
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
 *  Copyright (c) 2020 by Contributors
 * @file array/cuda/csr_mm.cu
 * @brief SpSpMM/SpGEMM C APIs and definitions.
 */
 #include <dgl/array.h>
+#include "../../../include/dgl/array.h"
+
 #include <dgl/runtime/device_api.h>

 #include <limits>

 #include "../../runtime/cuda/cuda_common.h"
-#include "./cusparse_dispatcher.cuh"
-#include "./functor.cuh"
+#include "cusparse_dispatcher.cuh"
+#include "functor.cuh"
 namespace dgl {

 using namespace dgl::runtime;
@@ -18,7 +22,7 @@ using namespace dgl::runtime;
 namespace aten {
 namespace cusparse {

-#if CUDART_VERSION >= 12000
+#if DTKRT_VERSION >= 12000

 /** @brief Cusparse implementation of SpGEMM on Csr format for CUDA 12.0+ */
 template <typename DType, typename IdType>
@@ -31,74 +35,74 @@ std::pair<CSRMatrix, NDArray> CusparseSpgemm(
  const int nnzB = B.indices->shape[0];
  const DType alpha = 1.0;
  const DType beta = 0.0;
-  auto transA = CUSPARSE_OPERATION_NON_TRANSPOSE;
-  auto transB = CUSPARSE_OPERATION_NON_TRANSPOSE;
+  auto transA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
+  auto transB = HIPSPARSE_OPERATION_NON_TRANSPOSE;
  // device
  auto ctx = A.indptr->ctx;
  auto device = runtime::DeviceAPI::Get(ctx);
  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  const DType* A_weights = A_weights_array.Ptr<DType>();
  const DType* B_weights = B_weights_array.Ptr<DType>();
  // allocate cusparse handle if needed
  if (!thr_entry->cusparse_handle) {
-    CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
+    CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
  }
-  CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream));
+  CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream));
  // all one data array
-  cusparseSpMatDescr_t matA, matB, matC;
+  hipsparseSpMatDescr_t matA, matB, matC;
  IdArray dC_csrOffsets =
      IdArray::Empty({A.num_rows + 1}, A.indptr->dtype, A.indptr->ctx);
  IdType* dC_csrOffsets_data = dC_csrOffsets.Ptr<IdType>();
  constexpr auto idtype = cusparse_idtype<IdType>::value;
  constexpr auto dtype = cuda_dtype<DType>::value;
  // Create sparse matrix A, B and C in CSR format
-  CUSPARSE_CALL(cusparseCreateCsr(
+  CUSPARSE_CALL(hipsparseCreateCsr(
      &matA, A.num_rows, A.num_cols, nnzA, A.indptr.Ptr<IdType>(),
      A.indices.Ptr<IdType>(),
-      // cusparseCreateCsr only accepts non-const pointers.
-      const_cast<DType*>(A_weights), idtype, idtype, CUSPARSE_INDEX_BASE_ZERO,
+      // hipsparseCreateCsr only accepts non-const pointers.
+      const_cast<DType*>(A_weights), idtype, idtype, HIPSPARSE_INDEX_BASE_ZERO,
      dtype));
-  CUSPARSE_CALL(cusparseCreateCsr(
+  CUSPARSE_CALL(hipsparseCreateCsr(
      &matB, B.num_rows, B.num_cols, nnzB, B.indptr.Ptr<IdType>(),
      B.indices.Ptr<IdType>(),
-      // cusparseCreateCsr only accepts non-const pointers.
-      const_cast<DType*>(B_weights), idtype, idtype, CUSPARSE_INDEX_BASE_ZERO,
+      // hipsparseCreateCsr only accepts non-const pointers.
+      const_cast<DType*>(B_weights), idtype, idtype, HIPSPARSE_INDEX_BASE_ZERO,
      dtype));
-  CUSPARSE_CALL(cusparseCreateCsr(
+  CUSPARSE_CALL(hipsparseCreateCsr(
      &matC, A.num_rows, B.num_cols, 0, dC_csrOffsets_data, nullptr, nullptr,
-      idtype, idtype, CUSPARSE_INDEX_BASE_ZERO, dtype));
+      idtype, idtype, HIPSPARSE_INDEX_BASE_ZERO, dtype));
  // SpGEMM Computation
-  cusparseSpGEMMDescr_t spgemmDesc;
-  cusparseSpGEMMAlg_t alg = CUSPARSE_SPGEMM_DEFAULT;
+  hipsparseSpGEMMDescr_t spgemmDesc;
+  cusparseSpGEMMAlg_t alg = HIPSPARSE_SPGEMM_DEFAULT;

-  CUSPARSE_CALL(cusparseSpGEMM_createDescr(&spgemmDesc));
+  CUSPARSE_CALL(hipsparseSpGEMM_createDescr(&spgemmDesc));
  size_t workspace_size1 = 0, workspace_size2 = 0, workspace_size3 = 0;
  // ask bufferSize1 bytes for external memory
-  CUSPARSE_CALL(cusparseSpGEMM_workEstimation(
+  CUSPARSE_CALL(hipsparseSpGEMM_workEstimation(
      thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
      matC, dtype, alg, spgemmDesc, &workspace_size1, NULL));
  void* workspace1 = (device->AllocWorkspace(ctx, workspace_size1));
  // inspect the matrices A and B to understand the memory requiremnent
-  cusparseStatus_t e = cusparseSpGEMM_workEstimation(
+  hipsparseStatus_t e = hipsparseSpGEMM_workEstimation(
      thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
      matC, dtype, alg, spgemmDesc, &workspace_size1, workspace1);
-  // CUSPARSE_SPGEMM_DEFAULT not support getting num_prods > 2^31 -1
+  // HIPSPARSE_SPGEMM_DEFAULT not support getting num_prods > 2^31 -1
  // and throws insufficient memory error within workEstimation call
  if (e == CUSPARSE_STATUS_INSUFFICIENT_RESOURCES) {
    // fall back to ALG2 to estimate num_prods
    alg = CUSPARSE_SPGEMM_ALG2;
    device->FreeWorkspace(ctx, workspace1);
-    // rerun cusparseSpGEMM_workEstimation
-    CUSPARSE_CALL(cusparseSpGEMM_workEstimation(
+    // rerun hipsparseSpGEMM_workEstimation
+    CUSPARSE_CALL(hipsparseSpGEMM_workEstimation(
        thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
        matC, dtype, alg, spgemmDesc, &workspace_size1, NULL));
    workspace1 = (device->AllocWorkspace(ctx, workspace_size1));
-    CUSPARSE_CALL(cusparseSpGEMM_workEstimation(
+    CUSPARSE_CALL(hipsparseSpGEMM_workEstimation(
        thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
        matC, dtype, alg, spgemmDesc, &workspace_size1, workspace1));
  } else {
-    CHECK(e == CUSPARSE_STATUS_SUCCESS) << "CUSPARSE ERROR in SpGEMM: " << e;
+    CHECK(e == HIPSPARSE_STATUS_SUCCESS) << "CUSPARSE ERROR in SpGEMM: " << e;
  }

  // get the number of intermediate products required for SpGEMM compute
@@ -113,22 +117,22 @@ std::pair<CSRMatrix, NDArray> CusparseSpgemm(
  int64_t LARGE_NUM_PRODUCTS = 800000000;  // 800*1000*1000;

  // switch to ALG2/ALG3 for medium & large problem size
-  if (alg == CUSPARSE_SPGEMM_DEFAULT && num_prods > MEDIUM_NUM_PRODUCTS) {
+  if (alg == HIPSPARSE_SPGEMM_DEFAULT && num_prods > MEDIUM_NUM_PRODUCTS) {
    // use ALG3 for very large problem
    alg = num_prods > LARGE_NUM_PRODUCTS ? CUSPARSE_SPGEMM_ALG3
                                         : CUSPARSE_SPGEMM_ALG2;

    device->FreeWorkspace(ctx, workspace1);
-    // rerun cusparseSpGEMM_workEstimation
-    CUSPARSE_CALL(cusparseSpGEMM_workEstimation(
+    // rerun hipsparseSpGEMM_workEstimation
+    CUSPARSE_CALL(hipsparseSpGEMM_workEstimation(
        thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
        matC, dtype, alg, spgemmDesc, &workspace_size1, NULL));
    workspace1 = (device->AllocWorkspace(ctx, workspace_size1));
-    CUSPARSE_CALL(cusparseSpGEMM_workEstimation(
+    CUSPARSE_CALL(hipsparseSpGEMM_workEstimation(
        thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
        matC, dtype, alg, spgemmDesc, &workspace_size1, workspace1));
  } else if (alg == CUSPARSE_SPGEMM_ALG2 && num_prods > LARGE_NUM_PRODUCTS) {
-    // no need to rerun cusparseSpGEMM_workEstimation between ALG2 and ALG3
+    // no need to rerun hipsparseSpGEMM_workEstimation between ALG2 and ALG3
    alg = CUSPARSE_SPGEMM_ALG3;
  }

@@ -147,40 +151,40 @@ std::pair<CSRMatrix, NDArray> CusparseSpgemm(
        workspace3, &workspace_size2));
    device->FreeWorkspace(ctx, workspace3);
  } else {
-    CUSPARSE_CALL(cusparseSpGEMM_compute(
+    CUSPARSE_CALL(hipsparseSpGEMM_compute(
        thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
        matC, dtype, alg, spgemmDesc, &workspace_size2, NULL));
  }
  // ask bufferSize2 bytes for external memory
  void* workspace2 = device->AllocWorkspace(ctx, workspace_size2);
  // compute the intermediate product of A * B
-  CUSPARSE_CALL(cusparseSpGEMM_compute(
+  CUSPARSE_CALL(hipsparseSpGEMM_compute(
      thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
      matC, dtype, alg, spgemmDesc, &workspace_size2, workspace2));
  // get matrix C non-zero entries C_nnz1
  int64_t C_num_rows1, C_num_cols1, C_nnz1;
  CUSPARSE_CALL(
-      cusparseSpMatGetSize(matC, &C_num_rows1, &C_num_cols1, &C_nnz1));
+      hipsparseSpMatGetSize(matC, &C_num_rows1, &C_num_cols1, &C_nnz1));
  IdArray dC_columns = IdArray::Empty({C_nnz1}, A.indptr->dtype, A.indptr->ctx);
  NDArray dC_weights =
      NDArray::Empty({C_nnz1}, A_weights_array->dtype, A.indptr->ctx);
  IdType* dC_columns_data = dC_columns.Ptr<IdType>();
  DType* dC_weights_data = dC_weights.Ptr<DType>();
  // update matC with the new pointers
-  CUSPARSE_CALL(cusparseCsrSetPointers(
+  CUSPARSE_CALL(hipsparseCsrSetPointers(
      matC, dC_csrOffsets_data, dC_columns_data, dC_weights_data));
  // copy the final products to the matrix C
-  CUSPARSE_CALL(cusparseSpGEMM_copy(
+  CUSPARSE_CALL(hipsparseSpGEMM_copy(
      thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
      matC, dtype, alg, spgemmDesc));

  device->FreeWorkspace(ctx, workspace1);
  device->FreeWorkspace(ctx, workspace2);
  // destroy matrix/vector descriptors
-  CUSPARSE_CALL(cusparseSpGEMM_destroyDescr(spgemmDesc));
-  CUSPARSE_CALL(cusparseDestroySpMat(matA));
-  CUSPARSE_CALL(cusparseDestroySpMat(matB));
-  CUSPARSE_CALL(cusparseDestroySpMat(matC));
+  CUSPARSE_CALL(hipsparseSpGEMM_destroyDescr(spgemmDesc));
+  CUSPARSE_CALL(hipsparseDestroySpMat(matA));
+  CUSPARSE_CALL(hipsparseDestroySpMat(matB));
+  CUSPARSE_CALL(hipsparseDestroySpMat(matC));
  return {
      CSRMatrix(
          A.num_rows, B.num_cols, dC_csrOffsets, dC_columns,
@@ -188,7 +192,7 @@ std::pair<CSRMatrix, NDArray> CusparseSpgemm(
      dC_weights};
 }

-#else  // CUDART_VERSION < 12000
+#else  // DTKRT_VERSION < 12000

 /** @brief Cusparse implementation of SpGEMM on Csr format for older CUDA
 * versions */
@@ -208,25 +212,25 @@ std::pair<CSRMatrix, NDArray> CusparseSpgemm(
  auto ctx = A.indptr->ctx;
  auto device = runtime::DeviceAPI::Get(ctx);
  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  auto idtype = A.indptr->dtype;
  auto dtype = A_weights_array->dtype;
  const DType* A_weights = A_weights_array.Ptr<DType>();
  const DType* B_weights = B_weights_array.Ptr<DType>();
  if (!thr_entry->cusparse_handle) {
-    CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
+    CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
  }
-  CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream));
-  CUSPARSE_CALL(cusparseSetPointerMode(
-      thr_entry->cusparse_handle, CUSPARSE_POINTER_MODE_HOST));
+  CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream));
+  CUSPARSE_CALL(hipsparseSetPointerMode(
+      thr_entry->cusparse_handle, HIPSPARSE_POINTER_MODE_HOST));

-  CUSPARSE_CALL(cusparseCreateCsrgemm2Info(&info));
+  CUSPARSE_CALL(hipsparseCreateCsrgemm2Info(&info));

-  cusparseMatDescr_t matA, matB, matC, matD;
-  CUSPARSE_CALL(cusparseCreateMatDescr(&matA));
-  CUSPARSE_CALL(cusparseCreateMatDescr(&matB));
-  CUSPARSE_CALL(cusparseCreateMatDescr(&matC));
-  CUSPARSE_CALL(cusparseCreateMatDescr(&matD));  // needed even if D is null
+  hipsparseMatDescr_t matA, matB, matC, matD;
+  CUSPARSE_CALL(hipsparseCreateMatDescr(&matA));
+  CUSPARSE_CALL(hipsparseCreateMatDescr(&matB));
+  CUSPARSE_CALL(hipsparseCreateMatDescr(&matC));
+  CUSPARSE_CALL(hipsparseCreateMatDescr(&matD));  // needed even if D is null

  CUSPARSE_CALL(CSRGEMM<DType>::bufferSizeExt(
      thr_entry->cusparse_handle, m, n, k, &alpha, matA, nnzA,
@@ -252,11 +256,11 @@ std::pair<CSRMatrix, NDArray> CusparseSpgemm(
      C_indptr.Ptr<IdType>(), C_indices.Ptr<IdType>(), info, workspace));

  device->FreeWorkspace(ctx, workspace);
-  CUSPARSE_CALL(cusparseDestroyCsrgemm2Info(info));
-  CUSPARSE_CALL(cusparseDestroyMatDescr(matA));
-  CUSPARSE_CALL(cusparseDestroyMatDescr(matB));
-  CUSPARSE_CALL(cusparseDestroyMatDescr(matC));
-  CUSPARSE_CALL(cusparseDestroyMatDescr(matD));
+  CUSPARSE_CALL(hipsparseDestroyCsrgemm2Info(info));
+  CUSPARSE_CALL(hipsparseDestroyMatDescr(matA));
+  CUSPARSE_CALL(hipsparseDestroyMatDescr(matB));
+  CUSPARSE_CALL(hipsparseDestroyMatDescr(matC));
+  CUSPARSE_CALL(hipsparseDestroyMatDescr(matD));

  return {
      CSRMatrix(
@@ -264,7 +268,7 @@ std::pair<CSRMatrix, NDArray> CusparseSpgemm(
      C_weights};
 }

-#endif  // CUDART_VERSION >= 12000
+#endif  // DTKRT_VERSION >= 12000
 }  // namespace cusparse

 template <int XPU, typename IdType, typename DType>
@@ -314,9 +318,9 @@ template std::pair<CSRMatrix, NDArray> CSRMM<kDGLCUDA, int32_t, __half>(
 template std::pair<CSRMatrix, NDArray> CSRMM<kDGLCUDA, int64_t, __half>(
    const CSRMatrix&, NDArray, const CSRMatrix&, NDArray);
 #if BF16_ENABLED
-template std::pair<CSRMatrix, NDArray> CSRMM<kDGLCUDA, int32_t, __nv_bfloat16>(
+template std::pair<CSRMatrix, NDArray> CSRMM<kDGLCUDA, int32_t, __hip_bfloat16>(
    const CSRMatrix&, NDArray, const CSRMatrix&, NDArray);
-template std::pair<CSRMatrix, NDArray> CSRMM<kDGLCUDA, int64_t, __nv_bfloat16>(
+template std::pair<CSRMatrix, NDArray> CSRMM<kDGLCUDA, int64_t, __hip_bfloat16>(
    const CSRMatrix&, NDArray, const CSRMatrix&, NDArray);
 #endif  // BF16_ENABLED
 template std::pair<CSRMatrix, NDArray> CSRMM<kDGLCUDA, int32_t, float>(

--- a/src/array/cuda/csr_sort.cu
+++ b/src/array/cuda/csr_sort.cu
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
 *  Copyright (c) 2020 by Contributors
 * @file array/cuda/csr_sort.cc
@@ -5,10 +7,10 @@
 */
 #include <dgl/array.h>

-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>

 #include "../../runtime/cuda/cuda_common.h"
-#include "./utils.h"
+#include "utils.h"

 namespace dgl {

@@ -39,7 +41,7 @@ __global__ void _SegmentIsSorted(
 template <DGLDeviceType XPU, typename IdType>
 bool CSRIsSorted(CSRMatrix csr) {
  const auto& ctx = csr.indptr->ctx;
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  auto device = runtime::DeviceAPI::Get(ctx);
  // We allocate a workspace of num_rows bytes. It wastes a little bit memory
  // but should be fine.
@@ -67,12 +69,12 @@ template <>
 void CSRSort_<kDGLCUDA, int32_t>(CSRMatrix* csr) {
  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
  auto device = runtime::DeviceAPI::Get(csr->indptr->ctx);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  // allocate cusparse handle if needed
  if (!thr_entry->cusparse_handle) {
-    CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
+    CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
  }
-  CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream));
+  CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream));

  NDArray indptr = csr->indptr;
  NDArray indices = csr->indices;
@@ -83,16 +85,16 @@ void CSRSort_<kDGLCUDA, int32_t>(CSRMatrix* csr) {
  NDArray data = csr->data;

  size_t workspace_size = 0;
-  CUSPARSE_CALL(cusparseXcsrsort_bufferSizeExt(
+  CUSPARSE_CALL(hipsparseXcsrsort_bufferSizeExt(
      thr_entry->cusparse_handle, csr->num_rows, csr->num_cols, nnz,
      indptr.Ptr<int32_t>(), indices.Ptr<int32_t>(), &workspace_size));
  void* workspace = device->AllocWorkspace(ctx, workspace_size);

-  cusparseMatDescr_t descr;
-  CUSPARSE_CALL(cusparseCreateMatDescr(&descr));
-  CUSPARSE_CALL(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL));
-  CUSPARSE_CALL(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO));
-  CUSPARSE_CALL(cusparseXcsrsort(
+  hipsparseMatDescr_t descr;
+  CUSPARSE_CALL(hipsparseCreateMatDescr(&descr));
+  CUSPARSE_CALL(hipsparseSetMatType(descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
+  CUSPARSE_CALL(hipsparseSetMatIndexBase(descr, HIPSPARSE_INDEX_BASE_ZERO));
+  CUSPARSE_CALL(hipsparseXcsrsort(
      thr_entry->cusparse_handle, csr->num_rows, csr->num_cols, nnz, descr,
      indptr.Ptr<int32_t>(), indices.Ptr<int32_t>(), data.Ptr<int32_t>(),
      workspace));
@@ -100,13 +102,13 @@ void CSRSort_<kDGLCUDA, int32_t>(CSRMatrix* csr) {
  csr->sorted = true;

  // free resources
-  CUSPARSE_CALL(cusparseDestroyMatDescr(descr));
+  CUSPARSE_CALL(hipsparseDestroyMatDescr(descr));
  device->FreeWorkspace(ctx, workspace);
 }

 template <>
 void CSRSort_<kDGLCUDA, int64_t>(CSRMatrix* csr) {
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  auto device = runtime::DeviceAPI::Get(csr->indptr->ctx);

  const auto& ctx = csr->indptr->ctx;
@@ -125,13 +127,13 @@ void CSRSort_<kDGLCUDA, int64_t>(CSRMatrix* csr) {

  // Allocate workspace
  size_t workspace_size = 0;
-  CUDA_CALL(cub::DeviceSegmentedRadixSort::SortPairs(
+  CUDA_CALL(hipcub::DeviceSegmentedRadixSort::SortPairs(
      nullptr, workspace_size, key_in, key_out, value_in, value_out, nnz,
      csr->num_rows, offsets, offsets + 1, 0, sizeof(int64_t) * 8, stream));
  void* workspace = device->AllocWorkspace(ctx, workspace_size);

  // Compute
-  CUDA_CALL(cub::DeviceSegmentedRadixSort::SortPairs(
+  CUDA_CALL(hipcub::DeviceSegmentedRadixSort::SortPairs(
      workspace, workspace_size, key_in, key_out, value_in, value_out, nnz,
      csr->num_rows, offsets, offsets + 1, 0, sizeof(int64_t) * 8, stream));


--- a/src/array/cuda/csr_sum.cu
+++ b/src/array/cuda/csr_sum.cu
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
 *  Copyright (c) 2020 by Contributors
 * @file array/cuda/spmm.cu
@@ -7,8 +9,8 @@
 #include <dgl/runtime/device_api.h>

 #include "../../runtime/cuda/cuda_common.h"
-#include "./cusparse_dispatcher.cuh"
-#include "./functor.cuh"
+#include "cusparse_dispatcher.cuh"
+#include "functor.cuh"

 namespace dgl {

@@ -32,21 +34,21 @@ std::pair<CSRMatrix, NDArray> CusparseCsrgeam2(
  auto ctx = A.indptr->ctx;
  auto device = runtime::DeviceAPI::Get(ctx);
  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  const DType* A_weights = A_weights_array.Ptr<DType>();
  const DType* B_weights = B_weights_array.Ptr<DType>();
  // allocate cusparse handle if needed
  if (!thr_entry->cusparse_handle)
-    CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
-  CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream));
+    CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
+  CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream));

-  cusparseMatDescr_t matA, matB, matC;
-  CUSPARSE_CALL(cusparseCreateMatDescr(&matA));
-  CUSPARSE_CALL(cusparseCreateMatDescr(&matB));
-  CUSPARSE_CALL(cusparseCreateMatDescr(&matC));
+  hipsparseMatDescr_t matA, matB, matC;
+  CUSPARSE_CALL(hipsparseCreateMatDescr(&matA));
+  CUSPARSE_CALL(hipsparseCreateMatDescr(&matB));
+  CUSPARSE_CALL(hipsparseCreateMatDescr(&matC));

-  cusparseSetPointerMode(
-      thr_entry->cusparse_handle, CUSPARSE_POINTER_MODE_HOST);
+  hipsparseSetPointerMode(
+      thr_entry->cusparse_handle, HIPSPARSE_POINTER_MODE_HOST);
  size_t workspace_size = 0;
  /* prepare output C */
  IdArray dC_csrOffsets = IdArray::Empty({m + 1}, A.indptr->dtype, ctx);
@@ -81,9 +83,9 @@ std::pair<CSRMatrix, NDArray> CusparseCsrgeam2(

  device->FreeWorkspace(ctx, workspace);
  // destroy matrix/vector descriptors
-  CUSPARSE_CALL(cusparseDestroyMatDescr(matA));
-  CUSPARSE_CALL(cusparseDestroyMatDescr(matB));
-  CUSPARSE_CALL(cusparseDestroyMatDescr(matC));
+  CUSPARSE_CALL(hipsparseDestroyMatDescr(matA));
+  CUSPARSE_CALL(hipsparseDestroyMatDescr(matB));
+  CUSPARSE_CALL(hipsparseDestroyMatDescr(matC));
  return {
      CSRMatrix(
          A.num_rows, A.num_cols, dC_csrOffsets, dC_columns,
@@ -159,9 +161,9 @@ template std::pair<CSRMatrix, NDArray> CSRSum<kDGLCUDA, int32_t, __half>(
 template std::pair<CSRMatrix, NDArray> CSRSum<kDGLCUDA, int64_t, __half>(
    const std::vector<CSRMatrix>&, const std::vector<NDArray>&);
 #if BF16_ENABLED
-template std::pair<CSRMatrix, NDArray> CSRSum<kDGLCUDA, int32_t, __nv_bfloat16>(
+template std::pair<CSRMatrix, NDArray> CSRSum<kDGLCUDA, int32_t, __hip_bfloat16>(
    const std::vector<CSRMatrix>&, const std::vector<NDArray>&);
-template std::pair<CSRMatrix, NDArray> CSRSum<kDGLCUDA, int64_t, __nv_bfloat16>(
+template std::pair<CSRMatrix, NDArray> CSRSum<kDGLCUDA, int64_t, __hip_bfloat16>(
    const std::vector<CSRMatrix>&, const std::vector<NDArray>&);
 #endif  // BF16_ENABLED
 template std::pair<CSRMatrix, NDArray> CSRSum<kDGLCUDA, int32_t, float>(

--- a/src/array/cuda/csr_transpose.cc
+++ b/src/array/cuda/csr_transpose.cc
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
 *  Copyright (c) 2020 by Contributors
 * @file array/cuda/csr_transpose.cc
@@ -23,12 +25,12 @@ CSRMatrix CSRTranspose(CSRMatrix csr) {
 template <>
 CSRMatrix CSRTranspose<kDGLCUDA, int32_t>(CSRMatrix csr) {
  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  // allocate cusparse handle if needed
  if (!thr_entry->cusparse_handle) {
-    CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
+    CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
  }
-  CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream));
+  CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream));

  NDArray indptr = csr.indptr, indices = csr.indices, data = csr.data;
  const int64_t nnz = indices->shape[0];
@@ -48,30 +50,30 @@ CSRMatrix CSRTranspose<kDGLCUDA, int32_t>(CSRMatrix csr) {
  int32_t* t_indices_ptr = static_cast<int32_t*>(t_indices->data);
  void* t_data_ptr = t_data->data;

-#if CUDART_VERSION >= 10010
+#if DTKRT_VERSION >= 10010
  auto device = runtime::DeviceAPI::Get(csr.indptr->ctx);
  // workspace
  size_t workspace_size;
-  CUSPARSE_CALL(cusparseCsr2cscEx2_bufferSize(
+  CUSPARSE_CALL(hipsparseCsr2cscEx2_bufferSize(
      thr_entry->cusparse_handle, csr.num_rows, csr.num_cols, nnz, data_ptr,
      indptr_ptr, indices_ptr, t_data_ptr, t_indptr_ptr, t_indices_ptr,
-      CUDA_R_32F, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO,
-      CUSPARSE_CSR2CSC_ALG1,  // see cusparse doc for reference
+      HIP_R_32F, HIPSPARSE_ACTION_NUMERIC, HIPSPARSE_INDEX_BASE_ZERO,
+      HIPSPARSE_CSR2CSC_ALG1,  // see cusparse doc for reference
      &workspace_size));
  void* workspace = device->AllocWorkspace(ctx, workspace_size);
-  CUSPARSE_CALL(cusparseCsr2cscEx2(
+  CUSPARSE_CALL(hipsparseCsr2cscEx2(
      thr_entry->cusparse_handle, csr.num_rows, csr.num_cols, nnz, data_ptr,
      indptr_ptr, indices_ptr, t_data_ptr, t_indptr_ptr, t_indices_ptr,
-      CUDA_R_32F, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO,
-      CUSPARSE_CSR2CSC_ALG1,  // see cusparse doc for reference
+      HIP_R_32F, HIPSPARSE_ACTION_NUMERIC, HIPSPARSE_INDEX_BASE_ZERO,
+      HIPSPARSE_CSR2CSC_ALG1,  // see cusparse doc for reference
      workspace));
  device->FreeWorkspace(ctx, workspace);
 #else
-  CUSPARSE_CALL(cusparseScsr2csc(
+  CUSPARSE_CALL(hipsparseScsr2csc(
      thr_entry->cusparse_handle, csr.num_rows, csr.num_cols, nnz,
      static_cast<const float*>(data_ptr), indptr_ptr, indices_ptr,
      static_cast<float*>(t_data_ptr), t_indices_ptr, t_indptr_ptr,
-      CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO));
+      HIPSPARSE_ACTION_NUMERIC, HIPSPARSE_INDEX_BASE_ZERO));
 #endif

  return CSRMatrix(

--- a/src/array/cuda/cuda_filter.cu
+++ b/src/array/cuda/cuda_filter.cu
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
 *  Copyright (c) 2021 by Contributors
 * @file array/cuda/cuda_filter.cc
@@ -6,7 +8,7 @@

 #include <dgl/runtime/device_api.h>

-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>

 #include "../../runtime/cuda/cuda_common.h"
 #include "../../runtime/cuda/cuda_hashtable.cuh"
@@ -45,7 +47,7 @@ IdArray _PerformFilter(const OrderedHashTable<IdType>& table, IdArray test) {
  const auto& ctx = test->ctx;
  auto device = runtime::DeviceAPI::Get(ctx);
  const int64_t size = test->shape[0];
-  cudaStream_t cudaStream = runtime::getCurrentCUDAStream();
+  hipStream_t cudaStream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();

  if (size == 0) {
    return test;
@@ -74,12 +76,12 @@ IdArray _PerformFilter(const OrderedHashTable<IdType>& table, IdArray test) {
  // generate prefix-sum
  {
    size_t workspace_bytes;
-    CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+    CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
        nullptr, workspace_bytes, static_cast<IdType*>(nullptr),
        static_cast<IdType*>(nullptr), size + 1, cudaStream));
    void* workspace = device->AllocWorkspace(ctx, workspace_bytes);

-    CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+    CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
        workspace, workspace_bytes, prefix, prefix, size + 1, cudaStream));
    device->FreeWorkspace(ctx, workspace);
  }
@@ -108,8 +110,8 @@ template <typename IdType>
 class CudaFilterSet : public Filter {
 public:
  explicit CudaFilterSet(IdArray array)
-      : table_(array->shape[0], array->ctx, runtime::getCurrentCUDAStream()) {
-    cudaStream_t cudaStream = runtime::getCurrentCUDAStream();
+      : table_(array->shape[0], array->ctx, runtime::getCurrentHIPStreamMasqueradingAsCUDA()) {
+    hipStream_t cudaStream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
    table_.FillWithUnique(
        static_cast<const IdType*>(array->data), array->shape[0], cudaStream);
  }

--- a/src/array/cuda/cusparse_dispatcher.cuh
+++ b/src/array/cuda/cusparse_dispatcher.cuh
+// !!! This is a file automatically generated by hipify!!!
 /**
 *  Copyright (c) 2020 by Contributors
 * @file array/cuda/dispatcher.cuh
@@ -7,7 +8,7 @@
 #ifndef DGL_ARRAY_CUDA_CUSPARSE_DISPATCHER_CUH_
 #define DGL_ARRAY_CUDA_CUSPARSE_DISPATCHER_CUH_

-#include <cusparse.h>
+#include <hipsparse/hipsparse.h>
 #include <dgl/runtime/c_runtime_api.h>

 #include "bf16.cuh"
@@ -20,70 +21,70 @@ namespace aten {
 template <typename DType>
 struct CSRGEMM {
  template <typename... Args>
-  static inline cusparseStatus_t bufferSizeExt(Args... args) {
+  static inline hipsparseStatus_t bufferSizeExt(Args... args) {
    BUG_IF_FAIL(false) << "This piece of code should not be reached.";
-    return static_cast<cusparseStatus_t>(0);
+    return static_cast<hipsparseStatus_t>(0);
  }

  template <typename... Args>
-  static inline cusparseStatus_t nnz(Args... args) {
-    return cusparseXcsrgemm2Nnz(args...);
+  static inline hipsparseStatus_t nnz(Args... args) {
+    return hipsparseXcsrgemm2Nnz(args...);
  }

  template <typename... Args>
-  static inline cusparseStatus_t compute(Args... args) {
+  static inline hipsparseStatus_t compute(Args... args) {
    BUG_IF_FAIL(false) << "This piece of code should not be reached.";
-    return static_cast<cusparseStatus_t>(0);
+    return static_cast<hipsparseStatus_t>(0);
  }
 };

 template <>
 struct CSRGEMM<__half> {
  template <typename... Args>
-  static inline cusparseStatus_t bufferSizeExt(Args... args) {
+  static inline hipsparseStatus_t bufferSizeExt(Args... args) {
    // TODO(ndickson): There is no cusparseHcsrgemm2_bufferSizeExt, so a
    // different implementation would be required.
    LOG(FATAL) << "CSRGEMM::bufferSizeExt does not support dtype half (FP16).";
-    return static_cast<cusparseStatus_t>(0);
+    return static_cast<hipsparseStatus_t>(0);
  }

  template <typename... Args>
-  static inline cusparseStatus_t nnz(Args... args) {
-    return cusparseXcsrgemm2Nnz(args...);
+  static inline hipsparseStatus_t nnz(Args... args) {
+    return hipsparseXcsrgemm2Nnz(args...);
  }

  template <typename... Args>
-  static inline cusparseStatus_t compute(Args... args) {
+  static inline hipsparseStatus_t compute(Args... args) {
    // TODO(ndickson): There is no cusparseHcsrgemm2, so a different
    // implementation would be required.
    LOG(FATAL) << "CSRGEMM::compute does not support dtype half (FP16).";
-    return static_cast<cusparseStatus_t>(0);
+    return static_cast<hipsparseStatus_t>(0);
  }
 };

 #if BF16_ENABLED
 template <>
-struct CSRGEMM<__nv_bfloat16> {
+struct CSRGEMM<__hip_bfloat16> {
  template <typename... Args>
-  static inline cusparseStatus_t bufferSizeExt(Args... args) {
+  static inline hipsparseStatus_t bufferSizeExt(Args... args) {
    // TODO(ndickson): There is no cusparseHcsrgemm2_bufferSizeExt, so a
    // different implementation would be required.
    LOG(FATAL)
        << "CSRGEMM::bufferSizeExt does not support dtype bfloat16 (BF16).";
-    return static_cast<cusparseStatus_t>(0);
+    return static_cast<hipsparseStatus_t>(0);
  }

  template <typename... Args>
-  static inline cusparseStatus_t nnz(Args... args) {
-    return cusparseXcsrgemm2Nnz(args...);
+  static inline hipsparseStatus_t nnz(Args... args) {
+    return hipsparseXcsrgemm2Nnz(args...);
  }

  template <typename... Args>
-  static inline cusparseStatus_t compute(Args... args) {
+  static inline hipsparseStatus_t compute(Args... args) {
    // TODO(ndickson): There is no cusparseHcsrgemm2, so a different
    // implementation would be required.
    LOG(FATAL) << "CSRGEMM::compute does not support dtype bfloat16 (BF16).";
-    return static_cast<cusparseStatus_t>(0);
+    return static_cast<hipsparseStatus_t>(0);
  }
 };
 #endif  // BF16_ENABLED
@@ -91,36 +92,36 @@ struct CSRGEMM<__nv_bfloat16> {
 template <>
 struct CSRGEMM<float> {
  template <typename... Args>
-  static inline cusparseStatus_t bufferSizeExt(Args... args) {
-    return cusparseScsrgemm2_bufferSizeExt(args...);
+  static inline hipsparseStatus_t bufferSizeExt(Args... args) {
+    return hipsparseScsrgemm2_bufferSizeExt(args...);
  }

  template <typename... Args>
-  static inline cusparseStatus_t nnz(Args... args) {
-    return cusparseXcsrgemm2Nnz(args...);
+  static inline hipsparseStatus_t nnz(Args... args) {
+    return hipsparseXcsrgemm2Nnz(args...);
  }

  template <typename... Args>
-  static inline cusparseStatus_t compute(Args... args) {
-    return cusparseScsrgemm2(args...);
+  static inline hipsparseStatus_t compute(Args... args) {
+    return hipsparseScsrgemm2(args...);
  }
 };

 template <>
 struct CSRGEMM<double> {
  template <typename... Args>
-  static inline cusparseStatus_t bufferSizeExt(Args... args) {
-    return cusparseDcsrgemm2_bufferSizeExt(args...);
+  static inline hipsparseStatus_t bufferSizeExt(Args... args) {
+    return hipsparseDcsrgemm2_bufferSizeExt(args...);
  }

  template <typename... Args>
-  static inline cusparseStatus_t nnz(Args... args) {
-    return cusparseXcsrgemm2Nnz(args...);
+  static inline hipsparseStatus_t nnz(Args... args) {
+    return hipsparseXcsrgemm2Nnz(args...);
  }

  template <typename... Args>
-  static inline cusparseStatus_t compute(Args... args) {
-    return cusparseDcsrgemm2(args...);
+  static inline hipsparseStatus_t compute(Args... args) {
+    return hipsparseDcsrgemm2(args...);
  }
 };

@@ -128,70 +129,70 @@ struct CSRGEMM<double> {
 template <typename DType>
 struct CSRGEAM {
  template <typename... Args>
-  static inline cusparseStatus_t bufferSizeExt(Args... args) {
+  static inline hipsparseStatus_t bufferSizeExt(Args... args) {
    BUG_IF_FAIL(false) << "This piece of code should not be reached.";
-    return static_cast<cusparseStatus_t>(0);
+    return static_cast<hipsparseStatus_t>(0);
  }

  template <typename... Args>
-  static inline cusparseStatus_t nnz(Args... args) {
-    return cusparseXcsrgeam2Nnz(args...);
+  static inline hipsparseStatus_t nnz(Args... args) {
+    return hipsparseXcsrgeam2Nnz(args...);
  }

  template <typename... Args>
-  static inline cusparseStatus_t compute(Args... args) {
+  static inline hipsparseStatus_t compute(Args... args) {
    BUG_IF_FAIL(false) << "This piece of code should not be reached.";
-    return static_cast<cusparseStatus_t>(0);
+    return static_cast<hipsparseStatus_t>(0);
  }
 };

 template <>
 struct CSRGEAM<__half> {
  template <typename... Args>
-  static inline cusparseStatus_t bufferSizeExt(Args... args) {
+  static inline hipsparseStatus_t bufferSizeExt(Args... args) {
    // TODO(ndickson): There is no cusparseHcsrgeam2_bufferSizeExt, so a
    // different implementation would be required.
    LOG(FATAL) << "CSRGEAM::bufferSizeExt does not support dtype half (FP16).";
-    return static_cast<cusparseStatus_t>(0);
+    return static_cast<hipsparseStatus_t>(0);
  }

  template <typename... Args>
-  static inline cusparseStatus_t nnz(Args... args) {
-    return cusparseXcsrgeam2Nnz(args...);
+  static inline hipsparseStatus_t nnz(Args... args) {
+    return hipsparseXcsrgeam2Nnz(args...);
  }

  template <typename... Args>
-  static inline cusparseStatus_t compute(Args... args) {
+  static inline hipsparseStatus_t compute(Args... args) {
    // TODO(ndickson): There is no cusparseHcsrgeam2, so a different
    // implementation would be required.
    LOG(FATAL) << "CSRGEAM::compute does not support dtype half (FP16).";
-    return static_cast<cusparseStatus_t>(0);
+    return static_cast<hipsparseStatus_t>(0);
  }
 };

 #if BF16_ENABLED
 template <>
-struct CSRGEAM<__nv_bfloat16> {
+struct CSRGEAM<__hip_bfloat16> {
  template <typename... Args>
-  static inline cusparseStatus_t bufferSizeExt(Args... args) {
+  static inline hipsparseStatus_t bufferSizeExt(Args... args) {
    // TODO(ndickson): There is no cusparseHcsrgeam2_bufferSizeExt, so a
    // different implementation would be required.
    LOG(FATAL)
        << "CSRGEAM::bufferSizeExt does not support dtype bfloat16 (BF16).";
-    return static_cast<cusparseStatus_t>(0);
+    return static_cast<hipsparseStatus_t>(0);
  }

  template <typename... Args>
-  static inline cusparseStatus_t nnz(Args... args) {
-    return cusparseXcsrgeam2Nnz(args...);
+  static inline hipsparseStatus_t nnz(Args... args) {
+    return hipsparseXcsrgeam2Nnz(args...);
  }

  template <typename... Args>
-  static inline cusparseStatus_t compute(Args... args) {
+  static inline hipsparseStatus_t compute(Args... args) {
    // TODO(ndickson): There is no cusparseHcsrgeam2, so a different
    // implementation would be required.
    LOG(FATAL) << "CSRGEAM::compute does not support dtype bfloat16 (BF16).";
-    return static_cast<cusparseStatus_t>(0);
+    return static_cast<hipsparseStatus_t>(0);
  }
 };
 #endif  // BF16_ENABLED
@@ -199,36 +200,36 @@ struct CSRGEAM<__nv_bfloat16> {
 template <>
 struct CSRGEAM<float> {
  template <typename... Args>
-  static inline cusparseStatus_t bufferSizeExt(Args... args) {
-    return cusparseScsrgeam2_bufferSizeExt(args...);
+  static inline hipsparseStatus_t bufferSizeExt(Args... args) {
+    return hipsparseScsrgeam2_bufferSizeExt(args...);
  }

  template <typename... Args>
-  static inline cusparseStatus_t nnz(Args... args) {
-    return cusparseXcsrgeam2Nnz(args...);
+  static inline hipsparseStatus_t nnz(Args... args) {
+    return hipsparseXcsrgeam2Nnz(args...);
  }

  template <typename... Args>
-  static inline cusparseStatus_t compute(Args... args) {
-    return cusparseScsrgeam2(args...);
+  static inline hipsparseStatus_t compute(Args... args) {
+    return hipsparseScsrgeam2(args...);
  }
 };

 template <>
 struct CSRGEAM<double> {
  template <typename... Args>
-  static inline cusparseStatus_t bufferSizeExt(Args... args) {
-    return cusparseDcsrgeam2_bufferSizeExt(args...);
+  static inline hipsparseStatus_t bufferSizeExt(Args... args) {
+    return hipsparseDcsrgeam2_bufferSizeExt(args...);
  }

  template <typename... Args>
-  static inline cusparseStatus_t nnz(Args... args) {
-    return cusparseXcsrgeam2Nnz(args...);
+  static inline hipsparseStatus_t nnz(Args... args) {
+    return hipsparseXcsrgeam2Nnz(args...);
  }

  template <typename... Args>
-  static inline cusparseStatus_t compute(Args... args) {
-    return cusparseDcsrgeam2(args...);
+  static inline hipsparseStatus_t compute(Args... args) {
+    return hipsparseDcsrgeam2(args...);
  }
 };


--- a/src/array/cuda/disjoint_union.cu
+++ b/src/array/cuda/disjoint_union.cu
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
 *   Copyright (c) 2022, NVIDIA CORPORATION.
 *
@@ -24,7 +26,7 @@
 #include <vector>

 #include "../../runtime/cuda/cuda_common.h"
-#include "./utils.h"
+#include "utils.h"

 namespace dgl {
 using runtime::NDArray;
@@ -78,7 +80,7 @@ std::tuple<IdArray, IdArray, IdArray> _ComputePrefixSums(
 template <DGLDeviceType XPU, typename IdType>
 void _Merge(
    IdType** arrs, IdType* prefix, IdType* offset, IdType* out, int64_t n_arrs,
-    int n_elms, DGLContext ctx, DGLDataType dtype, cudaStream_t stream) {
+    int n_elms, DGLContext ctx, DGLDataType dtype, hipStream_t stream) {
  auto device = runtime::DeviceAPI::Get(ctx);
  int nt = 256;
  int nb = (n_elms + nt - 1) / nt;
@@ -99,7 +101,7 @@ void _Merge(

 template <DGLDeviceType XPU, typename IdType>
 COOMatrix DisjointUnionCoo(const std::vector<COOMatrix>& coos) {
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  auto device = runtime::DeviceAPI::Get(coos[0].row->ctx);
  uint64_t src_offset = 0, dst_offset = 0;
  bool has_data = false;

--- a/src/array/cuda/fp16.cuh
+++ b/src/array/cuda/fp16.cuh
+// !!! This is a file automatically generated by hipify!!!
 /**
 *  Copyright (c) 2020-2022 by Contributors
 *
@@ -21,12 +22,12 @@
 #ifndef DGL_ARRAY_CUDA_FP16_CUH_
 #define DGL_ARRAY_CUDA_FP16_CUH_

-#include <cuda_fp16.h>
+#include <hip/hip_fp16.h>

 #include <algorithm>

 static __device__ __forceinline__ half max(half a, half b) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if defined(__HIP_DEVICE_COMPILE__) 
  return __hgt(__half(a), __half(b)) ? a : b;
 #else
  return __half(max(float(a), float(b)));  // NOLINT
@@ -34,19 +35,19 @@ static __device__ __forceinline__ half max(half a, half b) {
 }

 static __device__ __forceinline__ half min(half a, half b) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if defined(__HIP_DEVICE_COMPILE__) 
  return __hlt(__half(a), __half(b)) ? a : b;
 #else
  return __half(min(float(a), float(b)));  // NOLINT
 #endif
 }
-
-#ifdef __CUDACC__
+#if 0
+#ifdef __HIPCC__
 // Arithmetic FP16 operations for architecture >= 5.3 are already defined in
-// cuda_fp16.h
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 530)
+// hip/hip_fp16.h
+#if defined(__HIP_DEVICE_COMPILE__) 
 // CUDA 12.2 adds "emulated" support for older architectures.
-#if defined(CUDART_VERSION) && (CUDART_VERSION < 12020)
+#if defined(DTKRT_VERSION) && (DTKRT_VERSION < 12020)
 __device__ __forceinline__ __half
 operator+(const __half& lh, const __half& rh) {
  return __half(float(lh) + float(rh));  // NOLINT
@@ -127,8 +128,8 @@ __device__ __forceinline__ bool operator>=(const __half& lh, const __half& rh) {
 __device__ __forceinline__ bool operator<=(const __half& lh, const __half& rh) {
  return float(lh) <= float(rh);  // NOLINT
 }
-#endif  // defined(CUDART_VERSION) && (CUDART_VERSION < 12020)
-#endif  // defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 530)
-#endif  // __CUDACC__
-
+#endif  // defined(DTKRT_VERSION) && (DTKRT_VERSION < 12020)
+#endif  // defined(__HIP_DEVICE_COMPILE__) 
+#endif  // __HIPCC__
+#endif
 #endif  // DGL_ARRAY_CUDA_FP16_CUH_
--- a/src/array/cuda/functor.cuh
+++ b/src/array/cuda/functor.cuh
+// !!! This is a file automatically generated by hipify!!!
 /**
 *  Copyright (c) 2020 by Contributors
 * @file array/cuda/functor.cuh
@@ -9,8 +10,8 @@
 #include <cmath>
 #include <limits>

-#include "./atomic.cuh"
-#include "./fp16.cuh"
+#include "atomic.cuh"
+#include "fp16.cuh"
 #include "bf16.cuh"

 namespace dgl {
@@ -208,29 +209,29 @@ struct Sum<Idx, __half, atomic> : _Sum<Idx, __half, atomic> {

 #if BF16_ENABLED
 template <typename Idx, bool atomic>
-struct Sum<Idx, __nv_bfloat16, atomic> : _Sum<Idx, __nv_bfloat16, atomic> {
-  static constexpr __host__ __device__ __forceinline__ __nv_bfloat16 zero() {
-    return __float2bfloat16_rn(0.);
+struct Sum<Idx, __hip_bfloat16, atomic> : _Sum<Idx, __hip_bfloat16, atomic> {
+  static constexpr __host__ __device__ __forceinline__ __hip_bfloat16 zero() {
+    return __float2bfloat16(0.);
  }
  static __device__ __forceinline__ void Call(
-      __nv_bfloat16 *out_buf, Idx *arg_u_buf, Idx *arg_e_buf,
-      __nv_bfloat16 val, Idx uid, Idx eid) {
-    _Sum<Idx, __nv_bfloat16, atomic>::Call(
+      __hip_bfloat16 *out_buf, Idx *arg_u_buf, Idx *arg_e_buf,
+      __hip_bfloat16 val, Idx uid, Idx eid) {
+    _Sum<Idx, __hip_bfloat16, atomic>::Call(
        out_buf, arg_u_buf, arg_e_buf, val, uid, eid);
  }
  static __device__ __forceinline__ void Call(
-      __nv_bfloat16 *out_buf, Idx *arg_buf, __nv_bfloat16 val, Idx id) {
-    _Sum<Idx, __nv_bfloat16, atomic>::Call(out_buf, arg_buf, val, id);
+      __hip_bfloat16 *out_buf, Idx *arg_buf, __hip_bfloat16 val, Idx id) {
+    _Sum<Idx, __hip_bfloat16, atomic>::Call(out_buf, arg_buf, val, id);
  }
  // sometimes we have to use float in reduction for better precision
  static __device__ __forceinline__ void Call(
      float *out_buf, Idx *arg_u_buf, Idx *arg_e_buf,
-      __nv_bfloat16 val, Idx uid, Idx eid) {
+      __hip_bfloat16 val, Idx uid, Idx eid) {
    _Sum<Idx, float, atomic>::Call(out_buf, arg_u_buf, arg_e_buf,
        static_cast<float>(val), uid, eid);
  }
  static __device__ __forceinline__ void Call(
-      float *out_buf, Idx *arg_buf, __nv_bfloat16 val, Idx id) {
+      float *out_buf, Idx *arg_buf, __hip_bfloat16 val, Idx id) {
    _Sum<Idx, float, atomic>::Call(out_buf, arg_buf,
        static_cast<float>(val), id);
  }
@@ -313,29 +314,29 @@ struct Max<Idx, __half, atomic> : _Max<Idx, __half, atomic> {

 #if BF16_ENABLED
 template <typename Idx, bool atomic>
-struct Max<Idx, __nv_bfloat16, atomic> : _Max<Idx, __nv_bfloat16, atomic> {
-  static constexpr __host__ __device__ __forceinline__ __nv_bfloat16 zero() {
-    return __float2bfloat16_rn(-std::numeric_limits<float>::infinity());
+struct Max<Idx, __hip_bfloat16, atomic> : _Max<Idx, __hip_bfloat16, atomic> {
+  static constexpr __host__ __device__ __forceinline__ __hip_bfloat16 zero() {
+    return __float2bfloat16(-std::numeric_limits<float>::infinity());
  }
  static __device__ __forceinline__ void Call(
-      __nv_bfloat16 *out_buf, Idx *arg_u_buf, Idx *arg_e_buf,
-      __nv_bfloat16 val, Idx uid, Idx eid) {
-    _Max<Idx, __nv_bfloat16, atomic>::Call(
+      __hip_bfloat16 *out_buf, Idx *arg_u_buf, Idx *arg_e_buf,
+      __hip_bfloat16 val, Idx uid, Idx eid) {
+    _Max<Idx, __hip_bfloat16, atomic>::Call(
        out_buf, arg_u_buf, arg_e_buf, val, uid, eid);
  }
  static __device__ __forceinline__ void Call(
-      __nv_bfloat16 *out_buf, Idx *arg_buf, __nv_bfloat16 val, Idx id) {
-    _Max<Idx, __nv_bfloat16, atomic>::Call(out_buf, arg_buf, val, id);
+      __hip_bfloat16 *out_buf, Idx *arg_buf, __hip_bfloat16 val, Idx id) {
+    _Max<Idx, __hip_bfloat16, atomic>::Call(out_buf, arg_buf, val, id);
  }
  // sometimes we have to use float in reduction for better precision
  static __device__ __forceinline__ void Call(
      float *out_buf, Idx *arg_u_buf, Idx *arg_e_buf,
-      __nv_bfloat16 val, Idx uid, Idx eid) {
+      __hip_bfloat16 val, Idx uid, Idx eid) {
    _Max<Idx, float, atomic>::Call(out_buf, arg_u_buf, arg_e_buf,
        static_cast<float>(val), uid, eid);
  }
  static __device__ __forceinline__ void Call(
-      float *out_buf, Idx *arg_buf, __nv_bfloat16 val, Idx id) {
+      float *out_buf, Idx *arg_buf, __hip_bfloat16 val, Idx id) {
    _Max<Idx, float, atomic>::Call(out_buf, arg_buf,
        static_cast<float>(val), id);
  }
@@ -418,29 +419,29 @@ struct Min<Idx, __half, atomic> : _Min<Idx, __half, atomic> {

 #if BF16_ENABLED
 template <typename Idx, bool atomic>
-struct Min<Idx, __nv_bfloat16, atomic> : _Min<Idx, __nv_bfloat16, atomic> {
-  static constexpr __host__ __device__ __forceinline__ __nv_bfloat16 zero() {
-    return __float2bfloat16_rn(std::numeric_limits<float>::infinity());
+struct Min<Idx, __hip_bfloat16, atomic> : _Min<Idx, __hip_bfloat16, atomic> {
+  static constexpr __host__ __device__ __forceinline__ __hip_bfloat16 zero() {
+    return __float2bfloat16(std::numeric_limits<float>::infinity());
  }
  static __device__ __forceinline__ void Call(
-      __nv_bfloat16 *out_buf, Idx *arg_u_buf, Idx *arg_e_buf,
-      __nv_bfloat16 val, Idx uid, Idx eid) {
-    _Min<Idx, __nv_bfloat16, atomic>::Call(
+      __hip_bfloat16 *out_buf, Idx *arg_u_buf, Idx *arg_e_buf,
+      __hip_bfloat16 val, Idx uid, Idx eid) {
+    _Min<Idx, __hip_bfloat16, atomic>::Call(
        out_buf, arg_u_buf, arg_e_buf, val, uid, eid);
  }
  static __device__ __forceinline__ void Call(
-      __nv_bfloat16 *out_buf, Idx *arg_buf, __nv_bfloat16 val, Idx id) {
-    _Min<Idx, __nv_bfloat16, atomic>::Call(out_buf, arg_buf, val, id);
+      __hip_bfloat16 *out_buf, Idx *arg_buf, __hip_bfloat16 val, Idx id) {
+    _Min<Idx, __hip_bfloat16, atomic>::Call(out_buf, arg_buf, val, id);
  }
  // sometimes we have to use float in reduction for better precision
  static __device__ __forceinline__ void Call(
      float *out_buf, Idx *arg_u_buf, Idx *arg_e_buf,
-      __nv_bfloat16 val, Idx uid, Idx eid) {
+      __hip_bfloat16 val, Idx uid, Idx eid) {
    _Min<Idx, float, atomic>::Call(out_buf, arg_u_buf, arg_e_buf,
        static_cast<float>(val), uid, eid);
  }
  static __device__ __forceinline__ void Call(
-      float *out_buf, Idx *arg_buf, __nv_bfloat16 val, Idx id) {
+      float *out_buf, Idx *arg_buf, __hip_bfloat16 val, Idx id) {
    _Min<Idx, float, atomic>::Call(out_buf, arg_buf,
        static_cast<float>(val), id);
  }

--- a/src/array/cuda/gather_mm.cu
+++ b/src/array/cuda/gather_mm.cu
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
 *  Copyright (c) 2020 by Contributors
 * @file array/cuda/gather_mm.cu
@@ -7,9 +9,9 @@

 #include <algorithm>  // std::swap

-#include "./atomic.cuh"
-#include "./functor.cuh"
-#include "./utils.h"
+#include "atomic.cuh"
+#include "functor.cuh"
+#include "utils.h"

 namespace dgl {
 using namespace cuda;
@@ -20,54 +22,63 @@ namespace {
 /** @brief Call cuBLAS GEMM API for dense matmul operation for float and double.
 */
 template <typename DType>
-cublasStatus_t cublasGemm(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
+hipblasStatus_t cublasGemm(
+    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
    int m, int n, int k, const DType* alpha, const DType* A, int lda,
    const DType* B, int ldb, const DType* beta, DType* C, int ldc) {
  LOG(INFO) << "Not supported dtype";
-  return CUBLAS_STATUS_EXECUTION_FAILED;
+  return HIPBLAS_STATUS_EXECUTION_FAILED;
 }

 template <>
-cublasStatus_t cublasGemm<__half>(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
+hipblasStatus_t cublasGemm<__half>(
+    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
    int m, int n, int k, const __half* alpha, const __half* A, int lda,
    const __half* B, int ldb, const __half* beta, __half* C, int ldc) {
-  return cublasHgemm(
-      handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+  return hipblasHgemm(
+      handle, transa, transb, m, n, k, (hipblasHalf*)alpha, (hipblasHalf*)A, lda, (hipblasHalf*)B, ldb, (hipblasHalf*)beta, (hipblasHalf*)C, ldc);
 }

+// template <>
+// hipblasStatus_t cublasGemm<__half>(
+//     hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
+//     int m, int n, int k, const __half* alpha, const __half* A, int lda,
+//     const __half* B, int ldb, const __half* beta, __half* C, int ldc) {
+//   return hipblasHgemm(
+//       handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+// }
+
 #if BF16_ENABLED
 template <>
-cublasStatus_t cublasGemm<__nv_bfloat16>(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const __nv_bfloat16* alpha, const __nv_bfloat16* A,
-    int lda, const __nv_bfloat16* B, int ldb, const __nv_bfloat16* beta,
-    __nv_bfloat16* C, int ldc) {
+hipblasStatus_t cublasGemm<__hip_bfloat16>(
+    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
+    int m, int n, int k, const __hip_bfloat16* alpha, const __hip_bfloat16* A,
+    int lda, const __hip_bfloat16* B, int ldb, const __hip_bfloat16* beta,
+    __hip_bfloat16* C, int ldc) {
  float alpha_float = __bfloat162float(*alpha);
  float beta_float = __bfloat162float(*beta);
-  return cublasGemmEx(
-      handle, transa, transb, m, n, k, &alpha_float, A, CUDA_R_16BF, lda, B,
-      CUDA_R_16BF, ldb, &beta_float, C, CUDA_R_16BF, ldc, CUBLAS_COMPUTE_32F,
-      CUBLAS_GEMM_DEFAULT_TENSOR_OP);
+  return hipblasGemmEx(
+      handle, transa, transb, m, n, k, &alpha_float, A, HIPBLAS_R_16B, lda, B,
+      HIPBLAS_R_16B, ldb, &beta_float, C, HIPBLAS_R_16B, ldc, HIPBLAS_R_32F,
+      HIPBLAS_GEMM_DEFAULT);
 }
 #endif  // BF16_ENABLED

 template <>
-cublasStatus_t cublasGemm<float>(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
+hipblasStatus_t cublasGemm<float>(
+    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
    int m, int n, int k, const float* alpha, const float* A, int lda,
    const float* B, int ldb, const float* beta, float* C, int ldc) {
-  return cublasSgemm(
+  return hipblasSgemm(
      handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
 }

 template <>
-cublasStatus_t cublasGemm<double>(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
+hipblasStatus_t cublasGemm<double>(
+    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
    int m, int n, int k, const double* alpha, const double* A, int lda,
    const double* B, int ldb, const double* beta, double* C, int ldc) {
-  return cublasDgemm(
+  return hipblasDgemm(
      handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
 }

@@ -108,7 +119,7 @@ __global__ void GatherMMScatterKernel(
      // Load A in shared mem in a coalesced way
      for (unsigned int l = laneId; l < a_tile; l += 32)
        sh_A[local_row * sh_a_tile + l] = A[cur_rowA * in_len + (k_start + l)];
-      __syncwarp();
+      // __syncwarp();

      for (unsigned int outloop = 0; outloop < out_len; outloop += 32) {
        DType out_reg = static_cast<DType>(0.0f);  // thread private
@@ -165,7 +176,7 @@ __global__ void GatherMMScatterKernel2(
      /* Load A in shared mem in a coalesced way */
      for (unsigned int l = laneId; l < a_tile; l += 32)
        sh_A[local_row * sh_a_tile + l] = A[row_a * in_len + (k_start + l)];
-      __syncwarp();
+      // __syncwarp();

      for (unsigned int outloop = 0; outloop < out_len; outloop += 32) {
        DType out_reg = static_cast<DType>(0.0f);  // thread private
@@ -203,7 +214,7 @@ void SegmentMM(
    const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A,
    bool a_trans, bool b_trans) {
  auto device = runtime::DeviceAPI::Get(A->ctx);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  const DType* A_data = A.Ptr<DType>();
  const DType* B_data = B.Ptr<DType>();
  const IdType* seglen_A_data = seglen_A.Ptr<IdType>();
@@ -215,8 +226,8 @@ void SegmentMM(

  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
  if (!thr_entry->cublas_handle)
-    CUBLAS_CALL(cublasCreate(&(thr_entry->cublas_handle)));
-  CUBLAS_CALL(cublasSetStream(thr_entry->cublas_handle, stream));
+    CUBLAS_CALL(hipblasCreate(&(thr_entry->cublas_handle)));
+  CUBLAS_CALL(hipblasSetStream(thr_entry->cublas_handle, stream));

  IdType m_offset = 0;
  for (IdType etype = 0; etype < num_rel; ++etype) {
@@ -226,10 +237,10 @@ void SegmentMM(
    n = B->shape[2];  // cols of B
    k = B->shape[1];  // cols of A == rows of B
    int ldb = n, lda = k, ldc = n;
-    cublasOperation_t transB = CUBLAS_OP_N;
-    cublasOperation_t transA = CUBLAS_OP_N;
+    hipblasOperation_t transB = HIPBLAS_OP_N;
+    hipblasOperation_t transA = HIPBLAS_OP_N;
    if (b_trans) {
-      transB = CUBLAS_OP_T;
+      transB = HIPBLAS_OP_T;
      ldb = n, lda = n, ldc = k;
      std::swap(n, k);
    }
@@ -248,7 +259,7 @@ template <int XPU, typename IdType, typename DType>
 void SegmentMMBackwardB(
    const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen) {
  auto device = runtime::DeviceAPI::Get(A->ctx);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  const DType* A_data = A.Ptr<DType>();
  const DType* dC_data = dC.Ptr<DType>();
  const IdType* seglen_data = seglen.Ptr<IdType>();
@@ -260,8 +271,8 @@ void SegmentMMBackwardB(

  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
  if (!thr_entry->cublas_handle)
-    CUBLAS_CALL(cublasCreate(&(thr_entry->cublas_handle)));
-  CUBLAS_CALL(cublasSetStream(thr_entry->cublas_handle, stream));
+    CUBLAS_CALL(hipblasCreate(&(thr_entry->cublas_handle)));
+  CUBLAS_CALL(hipblasSetStream(thr_entry->cublas_handle, stream));

  IdType k_offset = 0;
  for (IdType etype = 0; etype < num_rel; ++etype) {
@@ -271,8 +282,8 @@ void SegmentMMBackwardB(
    CHECK_LE(k_offset + k, A->shape[0])
        << "Segement index out of bound of A->shape[0].";
    int lddC = m, ldA = n, lddB = m;
-    cublasOperation_t trans_dC = CUBLAS_OP_N;
-    cublasOperation_t trans_A = CUBLAS_OP_T;
+    hipblasOperation_t trans_dC = HIPBLAS_OP_N;
+    hipblasOperation_t trans_A = HIPBLAS_OP_T;
    CUBLAS_CALL(cublasGemm<DType>(
        thr_entry->cublas_handle, trans_dC, trans_A, m, n, k, &alpha,
        dC_data + dC_offset, lddC, A_data + A_offset, ldA, &beta,
@@ -299,7 +310,7 @@ void GatherMM(
    const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
    const NDArray idx_b) {
  auto device = runtime::DeviceAPI::Get(A->ctx);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  int64_t out_len = B->shape[2];  // cols of B
  int64_t in_len = A->shape[1];   // cols of A
  const int64_t tot_num_rows = A->shape[0];
@@ -332,7 +343,7 @@ void GatherMMScatter(
    const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
    const NDArray idx_b, const NDArray idx_c) {
  auto device = runtime::DeviceAPI::Get(A->ctx);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  const IdType* idx_c_data = idx_c.Ptr<IdType>();
  int64_t out_len = (B->ndim == 2) ? B->shape[1] : B->shape[2];  // cols of B
  int64_t in_len = A->shape[1];                                  // cols of A
@@ -367,10 +378,10 @@ template void GatherMM<kDGLCUDA, int64_t, __half>(
    const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
    const NDArray idx_b);
 #if BF16_ENABLED
-template void GatherMM<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void GatherMM<kDGLCUDA, int32_t, __hip_bfloat16>(
    const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
    const NDArray idx_b);
-template void GatherMM<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void GatherMM<kDGLCUDA, int64_t, __hip_bfloat16>(
    const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
    const NDArray idx_b);
 #endif  // BF16_ENABLED
@@ -394,10 +405,10 @@ template void GatherMMScatter<kDGLCUDA, int64_t, __half>(
    const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
    const NDArray idx_b, const NDArray idx_c);
 #if BF16_ENABLED
-template void GatherMMScatter<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void GatherMMScatter<kDGLCUDA, int32_t, __hip_bfloat16>(
    const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
    const NDArray idx_b, const NDArray idx_c);
-template void GatherMMScatter<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void GatherMMScatter<kDGLCUDA, int64_t, __hip_bfloat16>(
    const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
    const NDArray idx_b, const NDArray idx_c);
 #endif  // BF16_ENABLED
@@ -421,10 +432,10 @@ template void SegmentMM<kDGLCUDA, int64_t, __half>(
    const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A,
    bool a_trans, bool b_trans);
 #if BF16_ENABLED
-template void SegmentMM<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void SegmentMM<kDGLCUDA, int32_t, __hip_bfloat16>(
    const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A,
    bool a_trans, bool b_trans);
-template void SegmentMM<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void SegmentMM<kDGLCUDA, int64_t, __hip_bfloat16>(
    const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A,
    bool a_trans, bool b_trans);
 #endif  // BF16_ENABLED
@@ -446,9 +457,9 @@ template void SegmentMMBackwardB<kDGLCUDA, int32_t, __half>(
 template void SegmentMMBackwardB<kDGLCUDA, int64_t, __half>(
    const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen);
 #if BF16_ENABLED
-template void SegmentMMBackwardB<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void SegmentMMBackwardB<kDGLCUDA, int32_t, __hip_bfloat16>(
    const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen);
-template void SegmentMMBackwardB<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void SegmentMMBackwardB<kDGLCUDA, int64_t, __hip_bfloat16>(
    const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen);
 #endif  // BF16_ENABLED
 template void SegmentMMBackwardB<kDGLCUDA, int32_t, float>(

--- a/src/array/cuda/ge_spmm.cuh
+++ b/src/array/cuda/ge_spmm.cuh
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
 * Copyright (c) 2020 by Contributors
 * @file array/cuda/ge_spmm.cuh
@@ -7,7 +9,7 @@
 #define DGL_ARRAY_CUDA_GE_SPMM_CUH_

 #include "../../runtime/cuda/cuda_common.h"
-#include "./utils.h"
+#include "utils.h"
 #include "atomic.cuh"
 #include "macro.cuh"

@@ -121,7 +123,7 @@ void GESpMMCsr(
  const DType* efeat_data = efeat.Ptr<DType>();
  DType* out_data = out.Ptr<DType>();

-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();

  const int ntx = 32;
  const int nty = 32;

--- a/src/array/cuda/labor_sampling.cu
+++ b/src/array/cuda/labor_sampling.cu
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /*!
 *   Copyright (c) 2022, NVIDIA Corporation
 *   Copyright (c) 2022, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -34,19 +36,19 @@
 #include <thrust/zip_function.h>

 #include <algorithm>
-#include <cub/cub.cuh>  // NOLINT
+#include <hipcub/hipcub.hpp>  // NOLINT
 #include <limits>
 #include <numeric>
 #include <type_traits>
 #include <utility>

-#include "../../array/cuda/atomic.cuh"
-#include "../../array/cuda/utils.h"
+#include "atomic.cuh"
+#include "utils.h"
 #include "../../graph/transform/cuda/cuda_map_edges.cuh"
 #include "../../random/continuous_seed.h"
 #include "../../runtime/cuda/cuda_common.h"
-#include "./functor.cuh"
-#include "./spmm.cuh"
+#include "functor.cuh"
+#include "spmm.cuh"

 namespace dgl {
 namespace aten {
@@ -131,7 +133,7 @@ struct StencilOpFused {
  const IdType* indices;
  const IdType* nids;
  bool is_pinned;
-  __device__ auto operator()(IdType idx) {
+  __host__ __device__ auto operator()(IdType idx) {
    const auto in_row = idx_coo[idx];
    const auto ps = probs[idx];
    IdType rofs = idx - subindptr[in_row];
@@ -277,7 +279,7 @@ __global__ void _CSRRowWiseLayerSampleDegreeKernel(
    const FloatType* const ds, const FloatType* const d2s,
    const IdType* const indptr, const FloatType* const probs,
    const FloatType* const A, const IdType* const subindptr) {
-  typedef cub::BlockReduce<FloatType, BLOCK_SIZE> BlockReduce;
+  typedef hipcub::BlockReduce<FloatType, BLOCK_SIZE> BlockReduce;
  __shared__ typename BlockReduce::TempStorage temp_storage;
  __shared__ FloatType var_1_bcast[BLOCK_CTAS];

@@ -351,7 +353,7 @@ int log_size(const IdType size) {

 template <typename IdType, typename FloatType, typename exec_policy_t>
 void compute_importance_sampling_probabilities(
-    CSRMatrix mat, const IdType hop_size, cudaStream_t stream,
+    CSRMatrix mat, const IdType hop_size, hipStream_t stream,
    const continuous_seed seed, const IdType num_rows, const IdType* indptr,
    const IdType* subindptr, const IdType* indices, IdArray idx_coo_arr,
    const IdType* nids,
@@ -398,17 +400,17 @@ void compute_importance_sampling_probabilities(
        hop_1, 0, hop_2.get(), 0, sizeof(IdType) * hop_size, ctx, ctx,
        mat.indptr->dtype);

-    cub::DoubleBuffer<IdType> hop_b(hop_2.get(), hop_3.get());
+    hipcub::DoubleBuffer<IdType> hop_b(hop_2.get(), hop_3.get());

    {
      std::size_t temp_storage_bytes = 0;
-      CUDA_CALL(cub::DeviceRadixSort::SortKeys(
+      CUDA_CALL(hipcub::DeviceRadixSort::SortKeys(
          nullptr, temp_storage_bytes, hop_b, hop_size, 0, max_log_num_vertices,
          stream));

      auto temp = allocator.alloc_unique<char>(temp_storage_bytes);

-      CUDA_CALL(cub::DeviceRadixSort::SortKeys(
+      CUDA_CALL(hipcub::DeviceRadixSort::SortKeys(
          temp.get(), temp_storage_bytes, hop_b, hop_size, 0,
          max_log_num_vertices, stream));
    }
@@ -418,13 +420,13 @@ void compute_importance_sampling_probabilities(

    {
      std::size_t temp_storage_bytes = 0;
-      CUDA_CALL(cub::DeviceRunLengthEncode::Encode(
+      CUDA_CALL(hipcub::DeviceRunLengthEncode::Encode(
          nullptr, temp_storage_bytes, hop_b.Current(), hop_unique.get(),
          hop_counts.get(), hop_unique_size.get(), hop_size, stream));

      auto temp = allocator.alloc_unique<char>(temp_storage_bytes);

-      CUDA_CALL(cub::DeviceRunLengthEncode::Encode(
+      CUDA_CALL(hipcub::DeviceRunLengthEncode::Encode(
          temp.get(), temp_storage_bytes, hop_b.Current(), hop_unique.get(),
          hop_counts.get(), hop_unique_size.get(), hop_size, stream));

@@ -511,7 +513,7 @@ void compute_importance_sampling_probabilities(
 /////////////////////////////// CSR ///////////////////////////////

 template <DGLDeviceType XPU, typename IdType, typename FloatType>
-std::pair<COOMatrix, FloatArray> CSRLaborSampling(
+__host__ std::pair<COOMatrix, FloatArray> CSRLaborSampling(
    CSRMatrix mat, IdArray rows_arr, const int64_t num_picks,
    FloatArray prob_arr, const int importance_sampling, IdArray random_seed_arr,
    float seed2_contribution, IdArray NIDs) {
@@ -521,8 +523,8 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling(

  runtime::CUDAWorkspaceAllocator allocator(ctx);

-  const auto stream = runtime::getCurrentCUDAStream();
-  const auto exec_policy = thrust::cuda::par_nosync(allocator).on(stream);
+  const auto stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
+  const auto exec_policy = thrust::hip::par_nosync(allocator).on(stream);

  auto device = runtime::DeviceAPI::Get(ctx);

@@ -569,11 +571,11 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling(
    auto ds_d2s = thrust::make_zip_iterator(ds, d2s);

    size_t prefix_temp_size = 0;
-    CUDA_CALL(cub::DeviceSegmentedReduce::Reduce(
+    CUDA_CALL(hipcub::DeviceSegmentedReduce::Reduce(
        nullptr, prefix_temp_size, A_A2, ds_d2s, num_rows, b_offsets, e_offsets,
        TupleSum{}, thrust::make_tuple((FloatType)0, (FloatType)0), stream));
    auto temp = allocator.alloc_unique<char>(prefix_temp_size);
-    CUDA_CALL(cub::DeviceSegmentedReduce::Reduce(
+    CUDA_CALL(hipcub::DeviceSegmentedReduce::Reduce(
        temp.get(), prefix_temp_size, A_A2, ds_d2s, num_rows, b_offsets,
        e_offsets, TupleSum{}, thrust::make_tuple((FloatType)0, (FloatType)0),
        stream));
@@ -586,11 +588,11 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling(
  IdType hop_size;
  {
    size_t prefix_temp_size = 0;
-    CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+    CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
        nullptr, prefix_temp_size, in_deg.get(), subindptr, num_rows + 1,
        stream));
    auto temp = allocator.alloc_unique<char>(prefix_temp_size);
-    CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+    CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
        temp.get(), prefix_temp_size, in_deg.get(), subindptr, num_rows + 1,
        stream));

@@ -619,11 +621,11 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling(
      auto modified_in_deg = thrust::make_transform_iterator(
          iota, AlignmentFunc<IdType>{in_deg.get(), perm, num_rows});
      size_t prefix_temp_size = 0;
-      CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+      CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
          nullptr, prefix_temp_size, modified_in_deg, subindptr_aligned.get(),
          num_rows + 1, stream));
      auto temp = allocator.alloc_unique<char>(prefix_temp_size);
-      CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+      CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
          temp.get(), prefix_temp_size, modified_in_deg,
          subindptr_aligned.get(), num_rows + 1, stream));


--- a/src/array/cuda/macro.cuh
+++ b/src/array/cuda/macro.cuh
+// !!! This is a file automatically generated by hipify!!!
 /**
 *  Copyright (c) 2020 by Contributors
 * @file array/cuda/macro.cuh
@@ -30,14 +31,14 @@
      const auto device = runtime::DeviceAPI::Get(ctx);                       \
      (LHS_OFF) = static_cast<int64_t *>(device->AllocWorkspace(              \
          ctx, sizeof(int64_t) * info.lhs_offset.size()));                    \
-      CUDA_CALL(cudaMemcpy(                                                   \
+      CUDA_CALL(hipMemcpy(                                                   \
          (LHS_OFF), &info.lhs_offset[0],                                     \
-          sizeof(int64_t) * info.lhs_offset.size(), cudaMemcpyHostToDevice)); \
+          sizeof(int64_t) * info.lhs_offset.size(), hipMemcpyHostToDevice)); \
      (RHS_OFF) = static_cast<int64_t *>(device->AllocWorkspace(              \
          ctx, sizeof(int64_t) * info.rhs_offset.size()));                    \
-      CUDA_CALL(cudaMemcpy(                                                   \
+      CUDA_CALL(hipMemcpy(                                                   \
          (RHS_OFF), &info.rhs_offset[0],                                     \
-          sizeof(int64_t) * info.rhs_offset.size(), cudaMemcpyHostToDevice)); \
+          sizeof(int64_t) * info.rhs_offset.size(), hipMemcpyHostToDevice)); \
      if ((EDGE_MAP)) {                                                       \
        constexpr bool UseIdx = true;                                         \
        { __VA_ARGS__ }                                                       \