Merge branch 'dtk25.04' of http://developer.sourcefind.cn/codes/OpenDAS/dgl into 2.2.1

74d88bf8 · sangwz · 2a1ac588 · 314cedc1 · 74d88bf8 · 74d88bf8
Commit 74d88bf8 authored Feb 20, 2025 by sangwz
20 changed files
--- a/src/array/cuda/functor.cuh
+++ b/src/array/cuda/functor.cuh
+// !!! This is a file automatically generated by hipify!!!
 /**
 *  Copyright (c) 2020 by Contributors
 * @file array/cuda/functor.cuh
@@ -9,8 +10,8 @@
 #include <cmath>
 #include <limits>

-#include "./atomic.cuh"
-#include "./fp16.cuh"
+#include "atomic.cuh"
+#include "fp16.cuh"
 #include "bf16.cuh"

 namespace dgl {
@@ -208,29 +209,29 @@ struct Sum<Idx, __half, atomic> : _Sum<Idx, __half, atomic> {

 #if BF16_ENABLED
 template <typename Idx, bool atomic>
-struct Sum<Idx, __nv_bfloat16, atomic> : _Sum<Idx, __nv_bfloat16, atomic> {
-  static constexpr __host__ __device__ __forceinline__ __nv_bfloat16 zero() {
-    return __float2bfloat16_rn(0.);
+struct Sum<Idx, __hip_bfloat16, atomic> : _Sum<Idx, __hip_bfloat16, atomic> {
+  static constexpr __host__ __device__ __forceinline__ __hip_bfloat16 zero() {
+    return __float2bfloat16(0.);
  }
  static __device__ __forceinline__ void Call(
-      __nv_bfloat16 *out_buf, Idx *arg_u_buf, Idx *arg_e_buf,
-      __nv_bfloat16 val, Idx uid, Idx eid) {
-    _Sum<Idx, __nv_bfloat16, atomic>::Call(
+      __hip_bfloat16 *out_buf, Idx *arg_u_buf, Idx *arg_e_buf,
+      __hip_bfloat16 val, Idx uid, Idx eid) {
+    _Sum<Idx, __hip_bfloat16, atomic>::Call(
        out_buf, arg_u_buf, arg_e_buf, val, uid, eid);
  }
  static __device__ __forceinline__ void Call(
-      __nv_bfloat16 *out_buf, Idx *arg_buf, __nv_bfloat16 val, Idx id) {
-    _Sum<Idx, __nv_bfloat16, atomic>::Call(out_buf, arg_buf, val, id);
+      __hip_bfloat16 *out_buf, Idx *arg_buf, __hip_bfloat16 val, Idx id) {
+    _Sum<Idx, __hip_bfloat16, atomic>::Call(out_buf, arg_buf, val, id);
  }
  // sometimes we have to use float in reduction for better precision
  static __device__ __forceinline__ void Call(
      float *out_buf, Idx *arg_u_buf, Idx *arg_e_buf,
-      __nv_bfloat16 val, Idx uid, Idx eid) {
+      __hip_bfloat16 val, Idx uid, Idx eid) {
    _Sum<Idx, float, atomic>::Call(out_buf, arg_u_buf, arg_e_buf,
        static_cast<float>(val), uid, eid);
  }
  static __device__ __forceinline__ void Call(
-      float *out_buf, Idx *arg_buf, __nv_bfloat16 val, Idx id) {
+      float *out_buf, Idx *arg_buf, __hip_bfloat16 val, Idx id) {
    _Sum<Idx, float, atomic>::Call(out_buf, arg_buf,
        static_cast<float>(val), id);
  }
@@ -313,29 +314,29 @@ struct Max<Idx, __half, atomic> : _Max<Idx, __half, atomic> {

 #if BF16_ENABLED
 template <typename Idx, bool atomic>
-struct Max<Idx, __nv_bfloat16, atomic> : _Max<Idx, __nv_bfloat16, atomic> {
-  static constexpr __host__ __device__ __forceinline__ __nv_bfloat16 zero() {
-    return __float2bfloat16_rn(-std::numeric_limits<float>::infinity());
+struct Max<Idx, __hip_bfloat16, atomic> : _Max<Idx, __hip_bfloat16, atomic> {
+  static constexpr __host__ __device__ __forceinline__ __hip_bfloat16 zero() {
+    return __float2bfloat16(-std::numeric_limits<float>::infinity());
  }
  static __device__ __forceinline__ void Call(
-      __nv_bfloat16 *out_buf, Idx *arg_u_buf, Idx *arg_e_buf,
-      __nv_bfloat16 val, Idx uid, Idx eid) {
-    _Max<Idx, __nv_bfloat16, atomic>::Call(
+      __hip_bfloat16 *out_buf, Idx *arg_u_buf, Idx *arg_e_buf,
+      __hip_bfloat16 val, Idx uid, Idx eid) {
+    _Max<Idx, __hip_bfloat16, atomic>::Call(
        out_buf, arg_u_buf, arg_e_buf, val, uid, eid);
  }
  static __device__ __forceinline__ void Call(
-      __nv_bfloat16 *out_buf, Idx *arg_buf, __nv_bfloat16 val, Idx id) {
-    _Max<Idx, __nv_bfloat16, atomic>::Call(out_buf, arg_buf, val, id);
+      __hip_bfloat16 *out_buf, Idx *arg_buf, __hip_bfloat16 val, Idx id) {
+    _Max<Idx, __hip_bfloat16, atomic>::Call(out_buf, arg_buf, val, id);
  }
  // sometimes we have to use float in reduction for better precision
  static __device__ __forceinline__ void Call(
      float *out_buf, Idx *arg_u_buf, Idx *arg_e_buf,
-      __nv_bfloat16 val, Idx uid, Idx eid) {
+      __hip_bfloat16 val, Idx uid, Idx eid) {
    _Max<Idx, float, atomic>::Call(out_buf, arg_u_buf, arg_e_buf,
        static_cast<float>(val), uid, eid);
  }
  static __device__ __forceinline__ void Call(
-      float *out_buf, Idx *arg_buf, __nv_bfloat16 val, Idx id) {
+      float *out_buf, Idx *arg_buf, __hip_bfloat16 val, Idx id) {
    _Max<Idx, float, atomic>::Call(out_buf, arg_buf,
        static_cast<float>(val), id);
  }
@@ -418,29 +419,29 @@ struct Min<Idx, __half, atomic> : _Min<Idx, __half, atomic> {

 #if BF16_ENABLED
 template <typename Idx, bool atomic>
-struct Min<Idx, __nv_bfloat16, atomic> : _Min<Idx, __nv_bfloat16, atomic> {
-  static constexpr __host__ __device__ __forceinline__ __nv_bfloat16 zero() {
-    return __float2bfloat16_rn(std::numeric_limits<float>::infinity());
+struct Min<Idx, __hip_bfloat16, atomic> : _Min<Idx, __hip_bfloat16, atomic> {
+  static constexpr __host__ __device__ __forceinline__ __hip_bfloat16 zero() {
+    return __float2bfloat16(std::numeric_limits<float>::infinity());
  }
  static __device__ __forceinline__ void Call(
-      __nv_bfloat16 *out_buf, Idx *arg_u_buf, Idx *arg_e_buf,
-      __nv_bfloat16 val, Idx uid, Idx eid) {
-    _Min<Idx, __nv_bfloat16, atomic>::Call(
+      __hip_bfloat16 *out_buf, Idx *arg_u_buf, Idx *arg_e_buf,
+      __hip_bfloat16 val, Idx uid, Idx eid) {
+    _Min<Idx, __hip_bfloat16, atomic>::Call(
        out_buf, arg_u_buf, arg_e_buf, val, uid, eid);
  }
  static __device__ __forceinline__ void Call(
-      __nv_bfloat16 *out_buf, Idx *arg_buf, __nv_bfloat16 val, Idx id) {
-    _Min<Idx, __nv_bfloat16, atomic>::Call(out_buf, arg_buf, val, id);
+      __hip_bfloat16 *out_buf, Idx *arg_buf, __hip_bfloat16 val, Idx id) {
+    _Min<Idx, __hip_bfloat16, atomic>::Call(out_buf, arg_buf, val, id);
  }
  // sometimes we have to use float in reduction for better precision
  static __device__ __forceinline__ void Call(
      float *out_buf, Idx *arg_u_buf, Idx *arg_e_buf,
-      __nv_bfloat16 val, Idx uid, Idx eid) {
+      __hip_bfloat16 val, Idx uid, Idx eid) {
    _Min<Idx, float, atomic>::Call(out_buf, arg_u_buf, arg_e_buf,
        static_cast<float>(val), uid, eid);
  }
  static __device__ __forceinline__ void Call(
-      float *out_buf, Idx *arg_buf, __nv_bfloat16 val, Idx id) {
+      float *out_buf, Idx *arg_buf, __hip_bfloat16 val, Idx id) {
    _Min<Idx, float, atomic>::Call(out_buf, arg_buf,
        static_cast<float>(val), id);
  }

--- a/src/array/cuda/gather_mm.cu
+++ b/src/array/cuda/gather_mm.cu
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
 *  Copyright (c) 2020 by Contributors
 * @file array/cuda/gather_mm.cu
@@ -7,9 +9,9 @@

 #include <algorithm>  // std::swap

-#include "./atomic.cuh"
-#include "./functor.cuh"
-#include "./utils.h"
+#include "atomic.cuh"
+#include "functor.cuh"
+#include "utils.h"

 namespace dgl {
 using namespace cuda;
@@ -20,54 +22,63 @@ namespace {
 /** @brief Call cuBLAS GEMM API for dense matmul operation for float and double.
 */
 template <typename DType>
-cublasStatus_t cublasGemm(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
+hipblasStatus_t cublasGemm(
+    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
    int m, int n, int k, const DType* alpha, const DType* A, int lda,
    const DType* B, int ldb, const DType* beta, DType* C, int ldc) {
  LOG(INFO) << "Not supported dtype";
-  return CUBLAS_STATUS_EXECUTION_FAILED;
+  return HIPBLAS_STATUS_EXECUTION_FAILED;
 }

 template <>
-cublasStatus_t cublasGemm<__half>(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
+hipblasStatus_t cublasGemm<__half>(
+    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
    int m, int n, int k, const __half* alpha, const __half* A, int lda,
    const __half* B, int ldb, const __half* beta, __half* C, int ldc) {
-  return cublasHgemm(
-      handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+  return hipblasHgemm(
+      handle, transa, transb, m, n, k, (hipblasHalf*)alpha, (hipblasHalf*)A, lda, (hipblasHalf*)B, ldb, (hipblasHalf*)beta, (hipblasHalf*)C, ldc);
 }

+// template <>
+// hipblasStatus_t cublasGemm<__half>(
+//     hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
+//     int m, int n, int k, const __half* alpha, const __half* A, int lda,
+//     const __half* B, int ldb, const __half* beta, __half* C, int ldc) {
+//   return hipblasHgemm(
+//       handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+// }
+
 #if BF16_ENABLED
 template <>
-cublasStatus_t cublasGemm<__nv_bfloat16>(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const __nv_bfloat16* alpha, const __nv_bfloat16* A,
-    int lda, const __nv_bfloat16* B, int ldb, const __nv_bfloat16* beta,
-    __nv_bfloat16* C, int ldc) {
+hipblasStatus_t cublasGemm<__hip_bfloat16>(
+    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
+    int m, int n, int k, const __hip_bfloat16* alpha, const __hip_bfloat16* A,
+    int lda, const __hip_bfloat16* B, int ldb, const __hip_bfloat16* beta,
+    __hip_bfloat16* C, int ldc) {
  float alpha_float = __bfloat162float(*alpha);
  float beta_float = __bfloat162float(*beta);
-  return cublasGemmEx(
-      handle, transa, transb, m, n, k, &alpha_float, A, CUDA_R_16BF, lda, B,
-      CUDA_R_16BF, ldb, &beta_float, C, CUDA_R_16BF, ldc, CUBLAS_COMPUTE_32F,
-      CUBLAS_GEMM_DEFAULT_TENSOR_OP);
+  return hipblasGemmEx(
+      handle, transa, transb, m, n, k, &alpha_float, A, HIPBLAS_R_16B, lda, B,
+      HIPBLAS_R_16B, ldb, &beta_float, C, HIPBLAS_R_16B, ldc, HIPBLAS_R_32F,
+      HIPBLAS_GEMM_DEFAULT);
 }
 #endif  // BF16_ENABLED

 template <>
-cublasStatus_t cublasGemm<float>(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
+hipblasStatus_t cublasGemm<float>(
+    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
    int m, int n, int k, const float* alpha, const float* A, int lda,
    const float* B, int ldb, const float* beta, float* C, int ldc) {
-  return cublasSgemm(
+  return hipblasSgemm(
      handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
 }

 template <>
-cublasStatus_t cublasGemm<double>(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
+hipblasStatus_t cublasGemm<double>(
+    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
    int m, int n, int k, const double* alpha, const double* A, int lda,
    const double* B, int ldb, const double* beta, double* C, int ldc) {
-  return cublasDgemm(
+  return hipblasDgemm(
      handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
 }

@@ -108,7 +119,7 @@ __global__ void GatherMMScatterKernel(
      // Load A in shared mem in a coalesced way
      for (unsigned int l = laneId; l < a_tile; l += 32)
        sh_A[local_row * sh_a_tile + l] = A[cur_rowA * in_len + (k_start + l)];
-      __syncwarp();
+      // __syncwarp();

      for (unsigned int outloop = 0; outloop < out_len; outloop += 32) {
        DType out_reg = static_cast<DType>(0.0f);  // thread private
@@ -165,7 +176,7 @@ __global__ void GatherMMScatterKernel2(
      /* Load A in shared mem in a coalesced way */
      for (unsigned int l = laneId; l < a_tile; l += 32)
        sh_A[local_row * sh_a_tile + l] = A[row_a * in_len + (k_start + l)];
-      __syncwarp();
+      // __syncwarp();

      for (unsigned int outloop = 0; outloop < out_len; outloop += 32) {
        DType out_reg = static_cast<DType>(0.0f);  // thread private
@@ -203,7 +214,7 @@ void SegmentMM(
    const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A,
    bool a_trans, bool b_trans) {
  auto device = runtime::DeviceAPI::Get(A->ctx);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  const DType* A_data = A.Ptr<DType>();
  const DType* B_data = B.Ptr<DType>();
  const IdType* seglen_A_data = seglen_A.Ptr<IdType>();
@@ -215,8 +226,8 @@ void SegmentMM(

  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
  if (!thr_entry->cublas_handle)
-    CUBLAS_CALL(cublasCreate(&(thr_entry->cublas_handle)));
-  CUBLAS_CALL(cublasSetStream(thr_entry->cublas_handle, stream));
+    CUBLAS_CALL(hipblasCreate(&(thr_entry->cublas_handle)));
+  CUBLAS_CALL(hipblasSetStream(thr_entry->cublas_handle, stream));

  IdType m_offset = 0;
  for (IdType etype = 0; etype < num_rel; ++etype) {
@@ -226,10 +237,10 @@ void SegmentMM(
    n = B->shape[2];  // cols of B
    k = B->shape[1];  // cols of A == rows of B
    int ldb = n, lda = k, ldc = n;
-    cublasOperation_t transB = CUBLAS_OP_N;
-    cublasOperation_t transA = CUBLAS_OP_N;
+    hipblasOperation_t transB = HIPBLAS_OP_N;
+    hipblasOperation_t transA = HIPBLAS_OP_N;
    if (b_trans) {
-      transB = CUBLAS_OP_T;
+      transB = HIPBLAS_OP_T;
      ldb = n, lda = n, ldc = k;
      std::swap(n, k);
    }
@@ -248,7 +259,7 @@ template <int XPU, typename IdType, typename DType>
 void SegmentMMBackwardB(
    const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen) {
  auto device = runtime::DeviceAPI::Get(A->ctx);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  const DType* A_data = A.Ptr<DType>();
  const DType* dC_data = dC.Ptr<DType>();
  const IdType* seglen_data = seglen.Ptr<IdType>();
@@ -260,8 +271,8 @@ void SegmentMMBackwardB(

  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
  if (!thr_entry->cublas_handle)
-    CUBLAS_CALL(cublasCreate(&(thr_entry->cublas_handle)));
-  CUBLAS_CALL(cublasSetStream(thr_entry->cublas_handle, stream));
+    CUBLAS_CALL(hipblasCreate(&(thr_entry->cublas_handle)));
+  CUBLAS_CALL(hipblasSetStream(thr_entry->cublas_handle, stream));

  IdType k_offset = 0;
  for (IdType etype = 0; etype < num_rel; ++etype) {
@@ -271,8 +282,8 @@ void SegmentMMBackwardB(
    CHECK_LE(k_offset + k, A->shape[0])
        << "Segement index out of bound of A->shape[0].";
    int lddC = m, ldA = n, lddB = m;
-    cublasOperation_t trans_dC = CUBLAS_OP_N;
-    cublasOperation_t trans_A = CUBLAS_OP_T;
+    hipblasOperation_t trans_dC = HIPBLAS_OP_N;
+    hipblasOperation_t trans_A = HIPBLAS_OP_T;
    CUBLAS_CALL(cublasGemm<DType>(
        thr_entry->cublas_handle, trans_dC, trans_A, m, n, k, &alpha,
        dC_data + dC_offset, lddC, A_data + A_offset, ldA, &beta,
@@ -299,7 +310,7 @@ void GatherMM(
    const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
    const NDArray idx_b) {
  auto device = runtime::DeviceAPI::Get(A->ctx);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  int64_t out_len = B->shape[2];  // cols of B
  int64_t in_len = A->shape[1];   // cols of A
  const int64_t tot_num_rows = A->shape[0];
@@ -332,7 +343,7 @@ void GatherMMScatter(
    const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
    const NDArray idx_b, const NDArray idx_c) {
  auto device = runtime::DeviceAPI::Get(A->ctx);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  const IdType* idx_c_data = idx_c.Ptr<IdType>();
  int64_t out_len = (B->ndim == 2) ? B->shape[1] : B->shape[2];  // cols of B
  int64_t in_len = A->shape[1];                                  // cols of A
@@ -367,10 +378,10 @@ template void GatherMM<kDGLCUDA, int64_t, __half>(
    const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
    const NDArray idx_b);
 #if BF16_ENABLED
-template void GatherMM<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void GatherMM<kDGLCUDA, int32_t, __hip_bfloat16>(
    const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
    const NDArray idx_b);
-template void GatherMM<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void GatherMM<kDGLCUDA, int64_t, __hip_bfloat16>(
    const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
    const NDArray idx_b);
 #endif  // BF16_ENABLED
@@ -394,10 +405,10 @@ template void GatherMMScatter<kDGLCUDA, int64_t, __half>(
    const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
    const NDArray idx_b, const NDArray idx_c);
 #if BF16_ENABLED
-template void GatherMMScatter<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void GatherMMScatter<kDGLCUDA, int32_t, __hip_bfloat16>(
    const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
    const NDArray idx_b, const NDArray idx_c);
-template void GatherMMScatter<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void GatherMMScatter<kDGLCUDA, int64_t, __hip_bfloat16>(
    const NDArray A, const NDArray B, NDArray C, const NDArray idx_a,
    const NDArray idx_b, const NDArray idx_c);
 #endif  // BF16_ENABLED
@@ -421,10 +432,10 @@ template void SegmentMM<kDGLCUDA, int64_t, __half>(
    const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A,
    bool a_trans, bool b_trans);
 #if BF16_ENABLED
-template void SegmentMM<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void SegmentMM<kDGLCUDA, int32_t, __hip_bfloat16>(
    const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A,
    bool a_trans, bool b_trans);
-template void SegmentMM<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void SegmentMM<kDGLCUDA, int64_t, __hip_bfloat16>(
    const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A,
    bool a_trans, bool b_trans);
 #endif  // BF16_ENABLED
@@ -446,9 +457,9 @@ template void SegmentMMBackwardB<kDGLCUDA, int32_t, __half>(
 template void SegmentMMBackwardB<kDGLCUDA, int64_t, __half>(
    const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen);
 #if BF16_ENABLED
-template void SegmentMMBackwardB<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void SegmentMMBackwardB<kDGLCUDA, int32_t, __hip_bfloat16>(
    const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen);
-template void SegmentMMBackwardB<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void SegmentMMBackwardB<kDGLCUDA, int64_t, __hip_bfloat16>(
    const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen);
 #endif  // BF16_ENABLED
 template void SegmentMMBackwardB<kDGLCUDA, int32_t, float>(

--- a/src/array/cuda/ge_spmm.cuh
+++ b/src/array/cuda/ge_spmm.cuh
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
 * Copyright (c) 2020 by Contributors
 * @file array/cuda/ge_spmm.cuh
@@ -7,7 +9,7 @@
 #define DGL_ARRAY_CUDA_GE_SPMM_CUH_

 #include "../../runtime/cuda/cuda_common.h"
-#include "./utils.h"
+#include "utils.h"
 #include "atomic.cuh"
 #include "macro.cuh"

@@ -121,7 +123,7 @@ void GESpMMCsr(
  const DType* efeat_data = efeat.Ptr<DType>();
  DType* out_data = out.Ptr<DType>();

-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();

  const int ntx = 32;
  const int nty = 32;

--- a/src/array/cuda/labor_sampling.cu
+++ b/src/array/cuda/labor_sampling.cu
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /*!
 *   Copyright (c) 2022, NVIDIA Corporation
 *   Copyright (c) 2022, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -34,17 +36,21 @@
 #include <thrust/zip_function.h>

 #include <algorithm>
-#include <cub/cub.cuh>  // NOLINT
+#include <hipcub/hipcub.hpp>  // NOLINT
 #include <limits>
 #include <numeric>
 #include <type_traits>
 #include <utility>

+
 #include "../../array/cuda/utils.h"
+#include "atomic.cuh"
+#include "../../graph/transform/cuda/cuda_map_edges.cuh"
+
 #include "../../random/continuous_seed.h"
 #include "../../runtime/cuda/cuda_common.h"
-#include "./functor.cuh"
-#include "./spmm.cuh"
+#include "functor.cuh"
+#include "spmm.cuh"

 namespace dgl {
 namespace aten {
@@ -129,7 +135,7 @@ struct StencilOpFused {
  const IdType* indices;
  const IdType* nids;
  bool is_pinned;
-  __device__ auto operator()(IdType idx) {
+  __host__ __device__ auto operator()(IdType idx) {
    const auto in_row = idx_coo[idx];
    const auto ps = probs[idx];
    IdType rofs = idx - subindptr[in_row];
@@ -275,7 +281,7 @@ __global__ void _CSRRowWiseLayerSampleDegreeKernel(
    const FloatType* const ds, const FloatType* const d2s,
    const IdType* const indptr, const FloatType* const probs,
    const FloatType* const A, const IdType* const subindptr) {
-  typedef cub::BlockReduce<FloatType, BLOCK_SIZE> BlockReduce;
+  typedef hipcub::BlockReduce<FloatType, BLOCK_SIZE> BlockReduce;
  __shared__ typename BlockReduce::TempStorage temp_storage;
  __shared__ FloatType var_1_bcast[BLOCK_CTAS];

@@ -349,7 +355,7 @@ int log_size(const IdType size) {

 template <typename IdType, typename FloatType, typename exec_policy_t>
 void compute_importance_sampling_probabilities(
-    CSRMatrix mat, const IdType hop_size, cudaStream_t stream,
+    CSRMatrix mat, const IdType hop_size, hipStream_t stream,
    const continuous_seed seed, const IdType num_rows, const IdType* indptr,
    const IdType* subindptr, const IdType* indices, IdArray idx_coo_arr,
    const IdType* nids,
@@ -396,17 +402,17 @@ void compute_importance_sampling_probabilities(
        hop_1, 0, hop_2.get(), 0, sizeof(IdType) * hop_size, ctx, ctx,
        mat.indptr->dtype);

-    cub::DoubleBuffer<IdType> hop_b(hop_2.get(), hop_3.get());
+    hipcub::DoubleBuffer<IdType> hop_b(hop_2.get(), hop_3.get());

    {
      std::size_t temp_storage_bytes = 0;
-      CUDA_CALL(cub::DeviceRadixSort::SortKeys(
+      CUDA_CALL(hipcub::DeviceRadixSort::SortKeys(
          nullptr, temp_storage_bytes, hop_b, hop_size, 0, max_log_num_vertices,
          stream));

      auto temp = allocator.alloc_unique<char>(temp_storage_bytes);

-      CUDA_CALL(cub::DeviceRadixSort::SortKeys(
+      CUDA_CALL(hipcub::DeviceRadixSort::SortKeys(
          temp.get(), temp_storage_bytes, hop_b, hop_size, 0,
          max_log_num_vertices, stream));
    }
@@ -416,13 +422,13 @@ void compute_importance_sampling_probabilities(

    {
      std::size_t temp_storage_bytes = 0;
-      CUDA_CALL(cub::DeviceRunLengthEncode::Encode(
+      CUDA_CALL(hipcub::DeviceRunLengthEncode::Encode(
          nullptr, temp_storage_bytes, hop_b.Current(), hop_unique.get(),
          hop_counts.get(), hop_unique_size.get(), hop_size, stream));

      auto temp = allocator.alloc_unique<char>(temp_storage_bytes);

-      CUDA_CALL(cub::DeviceRunLengthEncode::Encode(
+      CUDA_CALL(hipcub::DeviceRunLengthEncode::Encode(
          temp.get(), temp_storage_bytes, hop_b.Current(), hop_unique.get(),
          hop_counts.get(), hop_unique_size.get(), hop_size, stream));

@@ -509,7 +515,7 @@ void compute_importance_sampling_probabilities(
 /////////////////////////////// CSR ///////////////////////////////

 template <DGLDeviceType XPU, typename IdType, typename FloatType>
-std::pair<COOMatrix, FloatArray> CSRLaborSampling(
+__host__ std::pair<COOMatrix, FloatArray> CSRLaborSampling(
    CSRMatrix mat, IdArray rows_arr, const int64_t num_picks,
    FloatArray prob_arr, const int importance_sampling, IdArray random_seed_arr,
    float seed2_contribution, IdArray NIDs) {
@@ -519,19 +525,25 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling(

  runtime::CUDAWorkspaceAllocator allocator(ctx);

-  const auto stream = runtime::getCurrentCUDAStream();
-  const auto exec_policy = thrust::cuda::par_nosync(allocator).on(stream);
+  const auto stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
+  const auto exec_policy = thrust::hip::par_nosync(allocator).on(stream);

  auto device = runtime::DeviceAPI::Get(ctx);

  const IdType num_rows = rows_arr->shape[0];
-  IdType* const rows = rows_arr.Ptr<IdType>();
-  IdType* const nids = IsNullArray(NIDs) ? nullptr : NIDs.Ptr<IdType>();
-  FloatType* const A = prob_arr.Ptr<FloatType>();
-
-  IdType* const indptr_ = mat.indptr.Ptr<IdType>();
-  IdType* const indices_ = mat.indices.Ptr<IdType>();
-  IdType* const data = CSRHasData(mat) ? mat.data.Ptr<IdType>() : nullptr;
+  // IdType* const rows = rows_arr.Ptr<IdType>();
+  IdType* const rows = static_cast<IdType*>(GetDevicePointer(rows_arr));
+  // IdType* const nids = IsNullArray(NIDs) ? nullptr : NIDs.Ptr<IdType>();
+  IdType* const nids = IsNullArray(NIDs) ? nullptr : static_cast<IdType*>(GetDevicePointer(NIDs));
+  // FloatType* const A = prob_arr.Ptr<FloatType>();
+  FloatType* const A = static_cast<FloatType*>(GetDevicePointer(prob_arr));;
+
+  // IdType* const indptr_ = mat.indptr.Ptr<IdType>();
+  IdType* const indptr_ = static_cast<IdType*>(GetDevicePointer(mat.indptr));
+  // IdType* const indices_ = mat.indices.Ptr<IdType>();
+  IdType* const indices_ = static_cast<IdType*>(GetDevicePointer(mat.indices));
+  // IdType* const data = CSRHasData(mat) ? mat.data.Ptr<IdType>() : nullptr;
+  IdType* const data = CSRHasData(mat) ? static_cast<IdType*>(GetDevicePointer(mat.data)) : nullptr;

  // Read indptr only once in case it is pinned and access is slow.
  auto indptr = allocator.alloc_unique<IdType>(num_rows);
@@ -567,11 +579,11 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling(
    auto ds_d2s = thrust::make_zip_iterator(ds, d2s);

    size_t prefix_temp_size = 0;
-    CUDA_CALL(cub::DeviceSegmentedReduce::Reduce(
+    CUDA_CALL(hipcub::DeviceSegmentedReduce::Reduce(
        nullptr, prefix_temp_size, A_A2, ds_d2s, num_rows, b_offsets, e_offsets,
        TupleSum{}, thrust::make_tuple((FloatType)0, (FloatType)0), stream));
    auto temp = allocator.alloc_unique<char>(prefix_temp_size);
-    CUDA_CALL(cub::DeviceSegmentedReduce::Reduce(
+    CUDA_CALL(hipcub::DeviceSegmentedReduce::Reduce(
        temp.get(), prefix_temp_size, A_A2, ds_d2s, num_rows, b_offsets,
        e_offsets, TupleSum{}, thrust::make_tuple((FloatType)0, (FloatType)0),
        stream));
@@ -584,11 +596,11 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling(
  IdType hop_size;
  {
    size_t prefix_temp_size = 0;
-    CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+    CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
        nullptr, prefix_temp_size, in_deg.get(), subindptr, num_rows + 1,
        stream));
    auto temp = allocator.alloc_unique<char>(prefix_temp_size);
-    CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+    CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
        temp.get(), prefix_temp_size, in_deg.get(), subindptr, num_rows + 1,
        stream));

@@ -617,11 +629,11 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling(
      auto modified_in_deg = thrust::make_transform_iterator(
          iota, AlignmentFunc<IdType>{in_deg.get(), perm, num_rows});
      size_t prefix_temp_size = 0;
-      CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+      CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
          nullptr, prefix_temp_size, modified_in_deg, subindptr_aligned.get(),
          num_rows + 1, stream));
      auto temp = allocator.alloc_unique<char>(prefix_temp_size);
-      CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+      CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
          temp.get(), prefix_temp_size, modified_in_deg,
          subindptr_aligned.get(), num_rows + 1, stream));


--- a/src/array/cuda/macro.cuh
+++ b/src/array/cuda/macro.cuh
+// !!! This is a file automatically generated by hipify!!!
 /**
 *  Copyright (c) 2020 by Contributors
 * @file array/cuda/macro.cuh
@@ -30,14 +31,14 @@
      const auto device = runtime::DeviceAPI::Get(ctx);                       \
      (LHS_OFF) = static_cast<int64_t *>(device->AllocWorkspace(              \
          ctx, sizeof(int64_t) * info.lhs_offset.size()));                    \
-      CUDA_CALL(cudaMemcpy(                                                   \
+      CUDA_CALL(hipMemcpy(                                                   \
          (LHS_OFF), &info.lhs_offset[0],                                     \
-          sizeof(int64_t) * info.lhs_offset.size(), cudaMemcpyHostToDevice)); \
+          sizeof(int64_t) * info.lhs_offset.size(), hipMemcpyHostToDevice)); \
      (RHS_OFF) = static_cast<int64_t *>(device->AllocWorkspace(              \
          ctx, sizeof(int64_t) * info.rhs_offset.size()));                    \
-      CUDA_CALL(cudaMemcpy(                                                   \
+      CUDA_CALL(hipMemcpy(                                                   \
          (RHS_OFF), &info.rhs_offset[0],                                     \
-          sizeof(int64_t) * info.rhs_offset.size(), cudaMemcpyHostToDevice)); \
+          sizeof(int64_t) * info.rhs_offset.size(), hipMemcpyHostToDevice)); \
      if ((EDGE_MAP)) {                                                       \
        constexpr bool UseIdx = true;                                         \
        { __VA_ARGS__ }                                                       \

--- a/src/array/cuda/negative_sampling.cu
+++ b/src/array/cuda/negative_sampling.cu
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
 *  Copyright (c) 2021 by Contributors
 * @file array/cuda/negative_sampling.cu
 * @brief rowwise sampling
 */

-#include <curand_kernel.h>
+#include <hiprand/hiprand_kernel.h>
 #include <dgl/array.h>
 #include <dgl/array_iterator.h>
 #include <dgl/random.h>

-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>

 #include "../../runtime/cuda/cuda_common.h"
-#include "./utils.h"
+#include "utils.h"

 using namespace dgl::runtime;

@@ -31,13 +33,13 @@ __global__ void _GlobalUniformNegativeSamplingKernel(
  int64_t tx = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride_x = gridDim.x * blockDim.x;

-  curandStatePhilox4_32_10_t
+  hiprandStatePhilox4_32_10_t
      rng;  // this allows generating 4 32-bit ints at a time
-  curand_init(random_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
+  hiprand_init(random_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);

  while (tx < num_samples) {
    for (int i = 0; i < num_trials; ++i) {
-      uint4 result = curand4(&rng);
+      uint4 result = hiprand4(&rng);
      // Turns out that result.x is always 0 with the above RNG.
      uint64_t y_hi = result.y >> 16;
      uint64_t y_lo = result.y & 0xFFFF;
@@ -88,7 +90,7 @@ struct IsNotMinusOne {
 template <typename IdType>
 void SortOrderedPairs(
    runtime::DeviceAPI* device, DGLContext ctx, IdType* major, IdType* minor,
-    IdType* tmp_major, IdType* tmp_minor, int64_t n, cudaStream_t stream) {
+    IdType* tmp_major, IdType* tmp_minor, int64_t n, hipStream_t stream) {
  // Sort ordered pairs in lexicographical order by two radix sorts since
  // cub's radix sorts are stable.
  // We need a 2*n auxiliary storage to store the results form the first radix
@@ -98,21 +100,21 @@ void SortOrderedPairs(
  void* tmp2 = nullptr;

  // Radix sort by minor key first, reorder the major key in the progress.
-  CUDA_CALL(cub::DeviceRadixSort::SortPairs(
+  CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
      tmp1, s1, minor, tmp_minor, major, tmp_major, n, 0, sizeof(IdType) * 8,
      stream));
  tmp1 = device->AllocWorkspace(ctx, s1);
-  CUDA_CALL(cub::DeviceRadixSort::SortPairs(
+  CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
      tmp1, s1, minor, tmp_minor, major, tmp_major, n, 0, sizeof(IdType) * 8,
      stream));

  // Radix sort by major key next.
-  CUDA_CALL(cub::DeviceRadixSort::SortPairs(
+  CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
      tmp2, s2, tmp_major, major, tmp_minor, minor, n, 0, sizeof(IdType) * 8,
      stream));
  tmp2 = (s2 > s1) ? device->AllocWorkspace(ctx, s2)
                   : tmp1;  // reuse buffer if s2 <= s1
-  CUDA_CALL(cub::DeviceRadixSort::SortPairs(
+  CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
      tmp2, s2, tmp_major, major, tmp_minor, minor, n, 0, sizeof(IdType) * 8,
      stream));

@@ -141,7 +143,7 @@ std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling(
  IdType* out_row_data = out_row.Ptr<IdType>();
  IdType* out_col_data = out_col.Ptr<IdType>();
  auto device = runtime::DeviceAPI::Get(ctx);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  const int nt = cuda::FindNumThreads(num_actual_samples);
  const int nb = (num_actual_samples + nt - 1) / nt;
  std::pair<IdArray, IdArray> result;
@@ -159,11 +161,11 @@ std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling(
  IsNotMinusOne<IdType> op;
  PairIterator<IdType> begin(row_data, col_data);
  PairIterator<IdType> out_begin(out_row_data, out_col_data);
-  CUDA_CALL(cub::DeviceSelect::If(
+  CUDA_CALL(hipcub::DeviceSelect::If(
      nullptr, tmp_size, begin, out_begin, num_out_cuda, num_actual_samples, op,
      stream));
  void* tmp = device->AllocWorkspace(ctx, tmp_size);
-  CUDA_CALL(cub::DeviceSelect::If(
+  CUDA_CALL(hipcub::DeviceSelect::If(
      tmp, tmp_size, begin, out_begin, num_out_cuda, num_actual_samples, op,
      stream));
  num_out = cuda::GetCUDAScalar(device, ctx, num_out_cuda);
@@ -181,25 +183,25 @@ std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling(

    size_t tmp_size_unique = 0;
    void* tmp_unique = nullptr;
-    CUDA_CALL(cub::DeviceSelect::Unique(
+    CUDA_CALL(hipcub::DeviceSelect::Unique(
        nullptr, tmp_size_unique, out_begin, unique_begin, num_out_cuda,
        num_out, stream));
    tmp_unique = (tmp_size_unique > tmp_size)
                     ? device->AllocWorkspace(ctx, tmp_size_unique)
                     : tmp;  // reuse buffer
-    CUDA_CALL(cub::DeviceSelect::Unique(
+    CUDA_CALL(hipcub::DeviceSelect::Unique(
        tmp_unique, tmp_size_unique, out_begin, unique_begin, num_out_cuda,
        num_out, stream));
    num_out = cuda::GetCUDAScalar(device, ctx, num_out_cuda);

-    num_out = std::min(num_samples, num_out);
+    num_out = ::min(num_samples, num_out);
    result = {
        unique_row.CreateView({num_out}, dtype),
        unique_col.CreateView({num_out}, dtype)};

    if (tmp_unique != tmp) device->FreeWorkspace(ctx, tmp_unique);
  } else {
-    num_out = std::min(num_samples, num_out);
+    num_out = ::min(num_samples, num_out);
    result = {
        out_row.CreateView({num_out}, dtype),
        out_col.CreateView({num_out}, dtype)};

--- a/src/array/cuda/rowwise_sampling.cu
+++ b/src/array/cuda/rowwise_sampling.cu
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
 *  Copyright (c) 2021 by Contributors
 * @file array/cuda/rowwise_sampling.cu
 * @brief uniform rowwise sampling
 */

-#include <curand_kernel.h>
+#include <hiprand/hiprand_kernel.h>
 #include <dgl/random.h>
 #include <dgl/runtime/device_api.h>
 #include <dgl/runtime/tensordispatch.h>

-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
 #include <numeric>

-#include "../../array/cuda/atomic.cuh"
+#include "atomic.cuh"
 #include "../../runtime/cuda/cuda_common.h"
-#include "./utils.h"
+#include "utils.h"

 using namespace dgl::cuda;
 using namespace dgl::aten::cuda;
@@ -126,8 +128,8 @@ __global__ void _CSRRowWiseSampleUniformKernel(
  const int64_t last_row =
      min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);

-  curandStatePhilox4_32_10_t rng;
-  curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
+  hiprandStatePhilox4_32_10_t rng;
+  hiprand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);

  while (out_row < last_row) {
    const int64_t row = in_rows[out_row];
@@ -151,7 +153,7 @@ __global__ void _CSRRowWiseSampleUniformKernel(
      __syncthreads();

      for (int idx = num_picks + threadIdx.x; idx < deg; idx += BLOCK_SIZE) {
-        const int num = curand(&rng) % (idx + 1);
+        const int num = hiprand(&rng) % (idx + 1);
        if (num < num_picks) {
          // use max so as to achieve the replacement order the serial
          // algorithm would have
@@ -204,8 +206,8 @@ __global__ void _CSRRowWiseSampleUniformReplaceKernel(
  const int64_t last_row =
      min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);

-  curandStatePhilox4_32_10_t rng;
-  curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
+  hiprandStatePhilox4_32_10_t rng;
+  hiprand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);

  while (out_row < last_row) {
    const int64_t row = in_rows[out_row];
@@ -216,7 +218,7 @@ __global__ void _CSRRowWiseSampleUniformReplaceKernel(
    if (deg > 0) {
      // each thread then blindly copies in rows only if deg > 0.
      for (int idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) {
-        const int64_t edge = curand(&rng) % deg;
+        const int64_t edge = hiprand(&rng) % deg;
        const int64_t out_idx = out_row_start + idx;
        out_rows[out_idx] = row;
        out_cols[out_idx] = in_index[in_row_start + edge];
@@ -237,7 +239,7 @@ COOMatrix _CSRRowWiseSamplingUniform(
    CSRMatrix mat, IdArray rows, const int64_t num_picks, const bool replace) {
  const auto& ctx = rows->ctx;
  auto device = runtime::DeviceAPI::Get(ctx);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();

  const int64_t num_rows = rows->shape[0];
  const IdType* const slice_rows = static_cast<const IdType*>(rows->data);
@@ -279,16 +281,16 @@ COOMatrix _CSRRowWiseSamplingUniform(
  IdType* out_ptr = static_cast<IdType*>(
      device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType)));
  size_t prefix_temp_size = 0;
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
      nullptr, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream));
  void* prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size);
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
      prefix_temp, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream));
  device->FreeWorkspace(ctx, prefix_temp);
  device->FreeWorkspace(ctx, out_deg);

-  cudaEvent_t copyEvent;
-  CUDA_CALL(cudaEventCreate(&copyEvent));
+  hipEvent_t copyEvent;
+  CUDA_CALL(hipEventCreate(&copyEvent));

  NDArray new_len_tensor;
  if (TensorDispatcher::Global()->IsAvailable()) {
@@ -301,10 +303,10 @@ COOMatrix _CSRRowWiseSamplingUniform(
  }

  // copy using the internal current stream
-  CUDA_CALL(cudaMemcpyAsync(
+  CUDA_CALL(hipMemcpyAsync(
      new_len_tensor->data, out_ptr + num_rows, sizeof(IdType),
-      cudaMemcpyDeviceToHost, stream));
-  CUDA_CALL(cudaEventRecord(copyEvent, stream));
+      hipMemcpyDeviceToHost, stream));
+  CUDA_CALL(hipEventRecord(copyEvent, stream));

  const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000);

@@ -329,8 +331,8 @@ COOMatrix _CSRRowWiseSamplingUniform(
  device->FreeWorkspace(ctx, out_ptr);

  // wait for copying `new_len` to finish
-  CUDA_CALL(cudaEventSynchronize(copyEvent));
-  CUDA_CALL(cudaEventDestroy(copyEvent));
+  CUDA_CALL(hipEventSynchronize(copyEvent));
+  CUDA_CALL(hipEventDestroy(copyEvent));

  const IdType new_len = static_cast<const IdType*>(new_len_tensor->data)[0];
  picked_row = picked_row.CreateView({new_len}, picked_row->dtype);

--- a/src/array/cuda/rowwise_sampling_prob.cu
+++ b/src/array/cuda/rowwise_sampling_prob.cu
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
 *  Copyright (c) 2022 by Contributors
 * @file array/cuda/rowwise_sampling_prob.cu
@@ -6,20 +8,20 @@
 * sampling code rowwise_sampling.cu.
 * @author pengqirong (OPPO), dlasalle and Xin from Nvidia.
 */
-#include <curand_kernel.h>
+#include <hiprand/hiprand_kernel.h>
 #include <dgl/random.h>
 #include <dgl/runtime/device_api.h>

-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
 #include <numeric>

-#include "../../array/cuda/atomic.cuh"
+#include "atomic.cuh"
 #include "../../runtime/cuda/cuda_common.h"
-#include "./utils.h"
+#include "utils.h"

 // require CUB 1.17 to use DeviceSegmentedSort
-static_assert(
-    CUB_VERSION >= 101700, "Require CUB >= 1.17 to use DeviceSegmentedSort");
+// static_assert(
+//     CUB_VERSION >= 101700, "Require CUB >= 1.17 to use DeviceSegmentedSort");

 namespace dgl {
 using namespace cuda;
@@ -159,8 +161,8 @@ __global__ void _CSRAResValueKernel(
  const int64_t last_row =
      min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);

-  curandStatePhilox4_32_10_t rng;
-  curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
+  hiprandStatePhilox4_32_10_t rng;
+  hiprand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);

  while (out_row < last_row) {
    const int64_t row = in_rows[out_row];
@@ -179,7 +181,7 @@ __global__ void _CSRAResValueKernel(
            prob, data, idx, in_row_start, &item_prob);
        // compute A-Res value
        ares[ares_idx] = static_cast<FloatType>(
-            __powf(curand_uniform(&rng), 1.0f / item_prob));
+            __powf(hiprand_uniform(&rng), 1.0f / item_prob));
        ares_idxs[ares_idx] = static_cast<IdType>(in_idx);
      }
    }
@@ -317,8 +319,8 @@ __global__ void _CSRRowWiseSampleReplaceKernel(
  const int64_t last_row =
      min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);

-  curandStatePhilox4_32_10_t rng;
-  curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
+  hiprandStatePhilox4_32_10_t rng;
+  hiprand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);

  while (out_row < last_row) {
    const int64_t row = in_rows[out_row];
@@ -330,7 +332,7 @@ __global__ void _CSRRowWiseSampleReplaceKernel(

    if (deg > 0) {
      // Specialize BlockScan for a 1D block of BLOCK_SIZE threads
-      typedef cub::BlockScan<FloatType, BLOCK_SIZE> BlockScan;
+      typedef hipcub::BlockScan<FloatType, BLOCK_SIZE> BlockScan;
      // Allocate shared memory for BlockScan
      __shared__ typename BlockScan::TempStorage temp_storage;
      // Initialize running total
@@ -362,10 +364,10 @@ __global__ void _CSRRowWiseSampleReplaceKernel(
      for (int64_t idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) {
        // get random value
        FloatType sum = cdf[cdf_row_start + deg - 1];
-        FloatType rand = static_cast<FloatType>(curand_uniform(&rng) * sum);
+        FloatType rand = static_cast<FloatType>(hiprand_uniform(&rng) * sum);
        // get the offset of the first value within cdf array which is greater
        // than random value.
-        int64_t item = cub::UpperBound<FloatType*, int64_t, FloatType>(
+        int64_t item = hipcub::UpperBound<FloatType*, int64_t, FloatType>(
            &cdf[cdf_row_start], deg, rand);
        item = min(item, deg - 1);
        // get in and out index
@@ -400,18 +402,20 @@ COOMatrix COOGeneralRemoveIf(const COOMatrix& coo, MaskGen maskgen) {
  const auto idtype = coo.row->dtype;
  const auto ctx = coo.row->ctx;
  const int64_t nnz = coo.row->shape[0];
-  const IdType* row = coo.row.Ptr<IdType>();
-  const IdType* col = coo.col.Ptr<IdType>();
+  // const IdType* row = coo.row.Ptr<IdType>();
+  const IdType* row = static_cast<IdType*>(GetDevicePointer(coo.row));
+  // const IdType* col = coo.col.Ptr<IdType>();
+  const IdType* col = static_cast<IdType*>(GetDevicePointer(coo.col));
  const IdArray& eid =
      COOHasData(coo) ? coo.data : Range(0, nnz, sizeof(IdType) * 8, ctx);
-  const IdType* data = coo.data.Ptr<IdType>();
+  const IdType* data = static_cast<IdType*>(GetDevicePointer(coo.data));
  IdArray new_row = IdArray::Empty({nnz}, idtype, ctx);
  IdArray new_col = IdArray::Empty({nnz}, idtype, ctx);
  IdArray new_eid = IdArray::Empty({nnz}, idtype, ctx);
  IdType* new_row_data = new_row.Ptr<IdType>();
  IdType* new_col_data = new_col.Ptr<IdType>();
  IdType* new_eid_data = new_eid.Ptr<IdType>();
-  auto stream = runtime::getCurrentCUDAStream();
+  auto stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  auto device = runtime::DeviceAPI::Get(ctx);

  int8_t* flags = static_cast<int8_t*>(device->AllocWorkspace(ctx, nnz));
@@ -439,9 +443,10 @@ COOMatrix COOGeneralRemoveIf(const COOMatrix& coo, MaskGen maskgen) {
 template <DGLDeviceType XPU, typename IdType, typename DType>
 COOMatrix _COORemoveIf(
    const COOMatrix& coo, const NDArray& values, DType criteria) {
-  const DType* val = values.Ptr<DType>();
+  // const DType* val = values.Ptr<DType>();
+  const DType* val = static_cast<DType*>(GetDevicePointer(values));
  auto maskgen = [val, criteria](
-                     int nb, int nt, cudaStream_t stream, int64_t nnz,
+                     int nb, int nt, hipStream_t stream, int64_t nnz,
                     const IdType* data, int8_t* flags) {
    CUDA_KERNEL_CALL(
        (_GenerateFlagsKernel<IdType, DType, int8_t>), nb, nt, 0, stream, nnz,
@@ -481,7 +486,7 @@ COOMatrix _CSRRowWiseSampling(
    const FloatArray& prob, bool replace) {
  const auto& ctx = rows->ctx;
  auto device = runtime::DeviceAPI::Get(ctx);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();

  const int64_t num_rows = rows->shape[0];
  const IdType* const slice_rows = static_cast<const IdType*>(rows->data);
@@ -530,10 +535,10 @@ COOMatrix _CSRRowWiseSampling(
  IdType* temp_ptr = static_cast<IdType*>(
      device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType)));
  size_t prefix_temp_size = 0;
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
      nullptr, prefix_temp_size, temp_deg, temp_ptr, num_rows + 1, stream));
  void* prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size);
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
      prefix_temp, prefix_temp_size, temp_deg, temp_ptr, num_rows + 1, stream));
  device->FreeWorkspace(ctx, prefix_temp);
  device->FreeWorkspace(ctx, temp_deg);
@@ -551,16 +556,16 @@ COOMatrix _CSRRowWiseSampling(
  IdType* out_ptr = static_cast<IdType*>(
      device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType)));
  prefix_temp_size = 0;
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
      nullptr, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream));
  prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size);
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
      prefix_temp, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream));
  device->FreeWorkspace(ctx, prefix_temp);
  device->FreeWorkspace(ctx, out_deg);

-  cudaEvent_t copyEvent;
-  CUDA_CALL(cudaEventCreate(&copyEvent));
+  hipEvent_t copyEvent;
+  CUDA_CALL(hipEventCreate(&copyEvent));
  // TODO(dlasalle): use pinned memory to overlap with the actual sampling, and
  // wait on a cudaevent
  IdType new_len;
@@ -568,7 +573,7 @@ COOMatrix _CSRRowWiseSampling(
  device->CopyDataFromTo(
      out_ptr, num_rows * sizeof(new_len), &new_len, 0, sizeof(new_len), ctx,
      DGLContext{kDGLCPU, 0}, mat.indptr->dtype);
-  CUDA_CALL(cudaEventRecord(copyEvent, stream));
+  CUDA_CALL(hipEventRecord(copyEvent, stream));

  // allocate workspace
  // 1) for w/ replacement, it's a global buffer to store cdf segments (one
@@ -612,16 +617,16 @@ COOMatrix _CSRRowWiseSampling(
    IdType* sort_temp_idxs = static_cast<IdType*>(
        device->AllocWorkspace(ctx, temp_len * sizeof(IdType)));

-    cub::DoubleBuffer<FloatType> sort_keys(temp, sort_temp);
-    cub::DoubleBuffer<IdType> sort_values(temp_idxs, sort_temp_idxs);
+    hipcub::DoubleBuffer<FloatType> sort_keys(temp, sort_temp);
+    hipcub::DoubleBuffer<IdType> sort_values(temp_idxs, sort_temp_idxs);

    void* d_temp_storage = nullptr;
    size_t temp_storage_bytes = 0;
-    CUDA_CALL(cub::DeviceSegmentedSort::SortPairsDescending(
+    CUDA_CALL(hipcub::DeviceSegmentedSort::SortPairsDescending(
        d_temp_storage, temp_storage_bytes, sort_keys, sort_values, temp_len,
        num_rows, temp_ptr, temp_ptr + 1, stream));
    d_temp_storage = device->AllocWorkspace(ctx, temp_storage_bytes);
-    CUDA_CALL(cub::DeviceSegmentedSort::SortPairsDescending(
+    CUDA_CALL(hipcub::DeviceSegmentedSort::SortPairsDescending(
        d_temp_storage, temp_storage_bytes, sort_keys, sort_values, temp_len,
        num_rows, temp_ptr, temp_ptr + 1, stream));
    device->FreeWorkspace(ctx, d_temp_storage);
@@ -641,8 +646,8 @@ COOMatrix _CSRRowWiseSampling(
  device->FreeWorkspace(ctx, out_ptr);

  // wait for copying `new_len` to finish
-  CUDA_CALL(cudaEventSynchronize(copyEvent));
-  CUDA_CALL(cudaEventDestroy(copyEvent));
+  CUDA_CALL(hipEventSynchronize(copyEvent));
+  CUDA_CALL(hipEventDestroy(copyEvent));

  picked_row = picked_row.CreateView({new_len}, picked_row->dtype);
  picked_col = picked_col.CreateView({new_len}, picked_col->dtype);

--- a/src/array/cuda/sddmm.cuh
+++ b/src/array/cuda/sddmm.cuh
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
 *  Copyright (c) 2020 by Contributors
 * @file array/cuda/sddmm.cuh
@@ -10,8 +12,8 @@

 #include "../../runtime/cuda/cuda_common.h"
 #include "../selector.h"
-#include "./functor.cuh"
-#include "./utils.h"
+#include "functor.cuh"
+#include "utils.h"
 #include "atomic.cuh"
 #include "bf16.cuh"
 #include "fp16.cuh"
@@ -178,7 +180,7 @@ __global__ void SDDMMCooTreeReduceKernel(
      }
 #pragma unroll
      for (int offset = 16; offset > 0; offset /= 2)
-        val += __shfl_down_sync(full_mask, val, offset);
+        val += __shfl_down(val, offset);
      if (tx == 0) outoff[i] = val;
    }
  }
@@ -275,7 +277,7 @@ void SDDMMCoo(
  const DType* lhs_data = lhs.Ptr<DType>();
  const DType* rhs_data = rhs.Ptr<DType>();
  DType* out_data = out.Ptr<DType>();
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();

  int64_t *lhs_off = nullptr, *rhs_off = nullptr;
  int64_t len = bcast.out_len, lhs_len = bcast.lhs_len, rhs_len = bcast.rhs_len;
@@ -337,7 +339,7 @@ void SDDMMCsr(
  const DType* lhs_data = lhs.Ptr<DType>();
  const DType* rhs_data = rhs.Ptr<DType>();
  DType* out_data = out.Ptr<DType>();
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  int64_t N = csr.num_rows, M = csr.num_cols, E = csr.indices->shape[0];

  int64_t *lhs_off = nullptr, *rhs_off = nullptr;

--- a/src/array/cuda/sddmm.cu
+++ b/src/array/cuda/sddmm.cu
+// !!! This is a file automatically generated by hipify!!!
 /**
 *  Copyright (c) 2020 by Contributors
 * @file array/cuda/sddmm.cu
@@ -5,8 +6,8 @@
 */
 #include <dgl/array.h>

-#include "./functor.cuh"
-#include "./sddmm.cuh"
+#include "functor.cuh"
+#include "sddmm.cuh"

 namespace dgl {
 namespace aten {
@@ -48,10 +49,10 @@ template void SDDMMCsr<kDGLCUDA, int64_t, __half>(
    const std::string& op, const BcastOff& bcast, const CSRMatrix& csr,
    NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target);
 #if BF16_ENABLED
-template void SDDMMCsr<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void SDDMMCsr<kDGLCUDA, int32_t, __hip_bfloat16>(
    const std::string& op, const BcastOff& bcast, const CSRMatrix& csr,
    NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target);
-template void SDDMMCsr<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void SDDMMCsr<kDGLCUDA, int64_t, __hip_bfloat16>(
    const std::string& op, const BcastOff& bcast, const CSRMatrix& csr,
    NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target);
 #endif  // BF16_ENABLED
@@ -75,10 +76,10 @@ template void SDDMMCoo<kDGLCUDA, int64_t, __half>(
    const std::string& op, const BcastOff& bcast, const COOMatrix& coo,
    NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target);
 #if BF16_ENABLED
-template void SDDMMCoo<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void SDDMMCoo<kDGLCUDA, int32_t, __hip_bfloat16>(
    const std::string& op, const BcastOff& bcast, const COOMatrix& coo,
    NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target);
-template void SDDMMCoo<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void SDDMMCoo<kDGLCUDA, int64_t, __hip_bfloat16>(
    const std::string& op, const BcastOff& bcast, const COOMatrix& coo,
    NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target);
 #endif  // BF16_ENABLED

--- a/src/array/cuda/sddmm_hetero_coo.cu
+++ b/src/array/cuda/sddmm_hetero_coo.cu
+// !!! This is a file automatically generated by hipify!!!
 /**
 *  Copyright (c) 2020 by Contributors
 * @file array/cuda/sddmm.cu
@@ -5,7 +6,7 @@
 */
 #include <dgl/array.h>

-#include "./sddmm.cuh"
+#include "sddmm.cuh"

 namespace dgl {
 namespace aten {
@@ -49,13 +50,13 @@ template void SDDMMCooHetero<kDGLCUDA, int64_t, __half>(
    int rhs_target, const std::vector<dgl_type_t>& in_eid,
    const std::vector<dgl_type_t>& out_eid);
 #if BF16_ENABLED
-template void SDDMMCooHetero<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void SDDMMCooHetero<kDGLCUDA, int32_t, __hip_bfloat16>(
    const std::string& op, const BcastOff& bcast,
    const std::vector<COOMatrix>& vec_coo, const std::vector<NDArray>& lhs,
    const std::vector<NDArray>& rhs, std::vector<NDArray> out, int lhs_target,
    int rhs_target, const std::vector<dgl_type_t>& in_eid,
    const std::vector<dgl_type_t>& out_eid);
-template void SDDMMCooHetero<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void SDDMMCooHetero<kDGLCUDA, int64_t, __hip_bfloat16>(
    const std::string& op, const BcastOff& bcast,
    const std::vector<COOMatrix>& vec_coo, const std::vector<NDArray>& lhs,
    const std::vector<NDArray>& rhs, std::vector<NDArray> out, int lhs_target,

--- a/src/array/cuda/sddmm_hetero_csr.cu
+++ b/src/array/cuda/sddmm_hetero_csr.cu
+// !!! This is a file automatically generated by hipify!!!
 /**
 *  Copyright (c) 2020 by Contributors
 * @file array/cuda/sddmm.cu
@@ -5,7 +6,7 @@
 */
 #include <dgl/array.h>

-#include "./sddmm.cuh"
+#include "sddmm.cuh"

 namespace dgl {
 namespace aten {
@@ -48,13 +49,13 @@ template void SDDMMCsrHetero<kDGLCUDA, int64_t, __half>(
    int rhs_target, const std::vector<dgl_type_t>& in_eid,
    const std::vector<dgl_type_t>& out_eid);
 #if BF16_ENABLED
-template void SDDMMCsrHetero<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void SDDMMCsrHetero<kDGLCUDA, int32_t, __hip_bfloat16>(
    const std::string& op, const BcastOff& bcast,
    const std::vector<CSRMatrix>& vec_csr, const std::vector<NDArray>& lhs,
    const std::vector<NDArray>& rhs, std::vector<NDArray> out, int lhs_target,
    int rhs_target, const std::vector<dgl_type_t>& in_eid,
    const std::vector<dgl_type_t>& out_eid);
-template void SDDMMCsrHetero<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void SDDMMCsrHetero<kDGLCUDA, int64_t, __hip_bfloat16>(
    const std::string& op, const BcastOff& bcast,
    const std::vector<CSRMatrix>& vec_csr, const std::vector<NDArray>& lhs,
    const std::vector<NDArray>& rhs, std::vector<NDArray> out, int lhs_target,

--- a/src/array/cuda/segment_reduce.cuh
+++ b/src/array/cuda/segment_reduce.cuh
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
 *  Copyright (c) 2020 by Contributors
 * @file array/cuda/segment_reduce.cuh
@@ -10,8 +12,8 @@
 #include <vector>

 #include "../../runtime/cuda/cuda_common.h"
-#include "./atomic.cuh"
-#include "./utils.h"
+#include "atomic.cuh"
+#include "utils.h"

 namespace dgl {

@@ -125,7 +127,7 @@ void SegmentReduce(NDArray feat, NDArray offsets, NDArray out, NDArray arg) {
  DType* out_data = out.Ptr<DType>();
  IdType* arg_data = arg.Ptr<IdType>();

-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  int64_t n = out->shape[0];
  int64_t dim = 1;
  for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i];
@@ -155,7 +157,7 @@ void ScatterAdd(NDArray feat, NDArray idx, NDArray out) {
  const IdType* idx_data = idx.Ptr<IdType>();
  DType* out_data = out.Ptr<DType>();

-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  int64_t n = feat->shape[0];
  int64_t dim = 1;
  for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i];
@@ -186,7 +188,7 @@ void UpdateGradMinMax_hetero(
    const std::vector<NDArray>& list_feat, const std::vector<NDArray>& list_idx,
    const std::vector<NDArray>& list_idx_types,
    std::vector<NDArray>* list_out) {
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  if (op == "copy_lhs" || op == "copy_rhs") {
    std::vector<std::vector<dgl_id_t>> src_dst_ntypes(
        graph->NumVertexTypes(), std::vector<dgl_id_t>());
@@ -239,7 +241,7 @@ void BackwardSegmentCmp(NDArray feat, NDArray arg, NDArray out) {
  const IdType* arg_data = arg.Ptr<IdType>();
  DType* out_data = out.Ptr<DType>();

-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  int64_t n = feat->shape[0];
  int64_t dim = 1;
  for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i];

--- a/src/array/cuda/segment_reduce.cu
+++ b/src/array/cuda/segment_reduce.cu
+// !!! This is a file automatically generated by hipify!!!
 /**
 *  Copyright (c) 2020 by Contributors
 * @file array/cuda/segment_reduce.cu
@@ -6,9 +7,9 @@
 #include <dgl/array.h>
 #include <dgl/base_heterograph.h>

-#include "./functor.cuh"
-#include "./segment_reduce.cuh"
-#include "./utils.h"
+#include "functor.cuh"
+#include "segment_reduce.cuh"
+#include "utils.h"

 namespace dgl {

@@ -60,10 +61,10 @@ template void SegmentReduce<kDGLCUDA, int64_t, __half>(
    const std::string& op, NDArray feat, NDArray offsets, NDArray out,
    NDArray arg);
 #if BF16_ENABLED
-template void SegmentReduce<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void SegmentReduce<kDGLCUDA, int32_t, __hip_bfloat16>(
    const std::string& op, NDArray feat, NDArray offsets, NDArray out,
    NDArray arg);
-template void SegmentReduce<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void SegmentReduce<kDGLCUDA, int64_t, __hip_bfloat16>(
    const std::string& op, NDArray feat, NDArray offsets, NDArray out,
    NDArray arg);
 #endif  // BF16_ENABLED
@@ -85,9 +86,9 @@ template void ScatterAdd<kDGLCUDA, int32_t, __half>(
 template void ScatterAdd<kDGLCUDA, int64_t, __half>(
    NDArray feat, NDArray idx, NDArray out);
 #if BF16_ENABLED
-template void ScatterAdd<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void ScatterAdd<kDGLCUDA, int32_t, __hip_bfloat16>(
    NDArray feat, NDArray idx, NDArray out);
-template void ScatterAdd<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void ScatterAdd<kDGLCUDA, int64_t, __hip_bfloat16>(
    NDArray feat, NDArray idx, NDArray out);
 #endif  // BF16_ENABLED
 template void ScatterAdd<kDGLCUDA, int32_t, float>(
@@ -108,11 +109,11 @@ template void UpdateGradMinMax_hetero<kDGLCUDA, int64_t, __half>(
    const std::vector<NDArray>& feat, const std::vector<NDArray>& idx,
    const std::vector<NDArray>& idx_etype, std::vector<NDArray>* out);
 #if BF16_ENABLED
-template void UpdateGradMinMax_hetero<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void UpdateGradMinMax_hetero<kDGLCUDA, int32_t, __hip_bfloat16>(
    const HeteroGraphPtr& g, const std::string& op,
    const std::vector<NDArray>& feat, const std::vector<NDArray>& idx,
    const std::vector<NDArray>& idx_etype, std::vector<NDArray>* out);
-template void UpdateGradMinMax_hetero<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void UpdateGradMinMax_hetero<kDGLCUDA, int64_t, __hip_bfloat16>(
    const HeteroGraphPtr& g, const std::string& op,
    const std::vector<NDArray>& feat, const std::vector<NDArray>& idx,
    const std::vector<NDArray>& idx_etype, std::vector<NDArray>* out);
@@ -139,9 +140,9 @@ template void BackwardSegmentCmp<kDGLCUDA, int32_t, __half>(
 template void BackwardSegmentCmp<kDGLCUDA, int64_t, __half>(
    NDArray feat, NDArray arg, NDArray out);
 #if BF16_ENABLED
-template void BackwardSegmentCmp<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void BackwardSegmentCmp<kDGLCUDA, int32_t, __hip_bfloat16>(
    NDArray feat, NDArray arg, NDArray out);
-template void BackwardSegmentCmp<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void BackwardSegmentCmp<kDGLCUDA, int64_t, __hip_bfloat16>(
    NDArray feat, NDArray arg, NDArray out);
 #endif  // BF16_ENABLED
 template void BackwardSegmentCmp<kDGLCUDA, int32_t, float>(

--- a/src/array/cuda/spmat_op_impl_coo.cu
+++ b/src/array/cuda/spmat_op_impl_coo.cu
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
 *  Copyright (c) 2021 by contributors.
 * @file array/cuda/spmat_op_impl_coo.cu
@@ -10,8 +12,8 @@
 #include <vector>

 #include "../../runtime/cuda/cuda_common.h"
-#include "./atomic.cuh"
-#include "./utils.h"
+#include "atomic.cuh"
+#include "utils.h"

 namespace dgl {

@@ -72,7 +74,7 @@ __global__ void _COOGetRowNNZKernel(

 template <DGLDeviceType XPU, typename IdType>
 int64_t COOGetRowNNZ(COOMatrix coo, int64_t row) {
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  const auto& ctx = coo.row->ctx;
  IdType nnz = coo.row->shape[0];
  IdType nt = 1024;
@@ -103,7 +105,7 @@ __global__ void _COOGetAllRowNNZKernel(

 template <DGLDeviceType XPU, typename IdType>
 NDArray COOGetRowNNZ(COOMatrix coo, NDArray rows) {
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  const auto& ctx = coo.row->ctx;
  IdType nnz = coo.row->shape[0];
  IdType num_rows = coo.num_rows;

--- a/src/array/cuda/spmat_op_impl_csr.cu
+++ b/src/array/cuda/spmat_op_impl_csr.cu
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
 *  Copyright (c) 2020 by Contributors
 * @file array/cuda/spmat_op_impl_csr.cu
@@ -7,14 +9,14 @@
 #include <thrust/execution_policy.h>
 #include <thrust/for_each.h>

-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
 #include <numeric>
 #include <unordered_set>
 #include <vector>

 #include "../../runtime/cuda/cuda_common.h"
-#include "./atomic.cuh"
-#include "./utils.h"
+#include "atomic.cuh"
+#include "utils.h"

 namespace dgl {

@@ -28,7 +30,7 @@ namespace impl {

 template <DGLDeviceType XPU, typename IdType>
 bool CSRIsNonZero(CSRMatrix csr, int64_t row, int64_t col) {
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  const auto& ctx = csr.indptr->ctx;
  IdArray rows = aten::VecToIdArray<int64_t>({row}, sizeof(IdType) * 8, ctx);
  IdArray cols = aten::VecToIdArray<int64_t>({col}, sizeof(IdType) * 8, ctx);
@@ -53,12 +55,12 @@ template <DGLDeviceType XPU, typename IdType>
 NDArray CSRIsNonZero(CSRMatrix csr, NDArray row, NDArray col) {
  const auto rowlen = row->shape[0];
  const auto collen = col->shape[0];
-  const auto rstlen = std::max(rowlen, collen);
+  const auto rstlen = ::max(rowlen, collen);
  NDArray rst = NDArray::Empty({rstlen}, row->dtype, row->ctx);
  if (rstlen == 0) return rst;
  const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1;
  const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1;
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  const int nt = dgl::cuda::FindNumThreads(rstlen);
  const int nb = (rstlen + nt - 1) / nt;
  const IdType* data = nullptr;
@@ -104,7 +106,7 @@ template <DGLDeviceType XPU, typename IdType>
 bool CSRHasDuplicate(CSRMatrix csr) {
  if (!csr.sorted) csr = CSRSort(csr);
  const auto& ctx = csr.indptr->ctx;
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  auto device = runtime::DeviceAPI::Get(ctx);
  // We allocate a workspace of num_rows bytes. It wastes a little bit memory
  // but should be fine.
@@ -149,7 +151,7 @@ __global__ void _CSRGetRowNNZKernel(

 template <DGLDeviceType XPU, typename IdType>
 NDArray CSRGetRowNNZ(CSRMatrix csr, NDArray rows) {
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  const auto len = rows->shape[0];
  const IdType* vid_data = rows.Ptr<IdType>();
  const IdType* indptr_data =
@@ -250,7 +252,7 @@ __global__ void _SegmentCopyKernel(

 template <DGLDeviceType XPU, typename IdType>
 CSRMatrix CSRSliceRows(CSRMatrix csr, NDArray rows) {
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  const int64_t len = rows->shape[0];
  IdArray ret_indptr = aten::CumSum(aten::CSRGetRowNNZ(csr, rows), true);
  const int64_t nnz = aten::IndexSelect<IdType>(ret_indptr, len);
@@ -359,7 +361,7 @@ std::vector<NDArray> CSRGetDataAndIndices(
    CSRMatrix csr, NDArray row, NDArray col) {
  const auto rowlen = row->shape[0];
  const auto collen = col->shape[0];
-  const auto len = std::max(rowlen, collen);
+  const auto len = ::max(rowlen, collen);
  if (len == 0) return {NullArray(), NullArray(), NullArray()};

  const auto& ctx = row->ctx;
@@ -367,7 +369,7 @@ std::vector<NDArray> CSRGetDataAndIndices(
  const int64_t nnz = csr.indices->shape[0];
  const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1;
  const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1;
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();

  const IdType* indptr_data =
      static_cast<IdType*>(GetDevicePointer(csr.indptr));
@@ -532,7 +534,7 @@ __global__ void _SegmentMaskColKernel(
          static_cast<IdType>(num_rows));

  NodeQueryHashmap<IdType> hashmap(hashmap_buffer, buffer_size);
-  typedef cub::WarpReduce<IdType> WarpReduce;
+  typedef hipcub::WarpReduce<IdType> WarpReduce;
  __shared__ typename WarpReduce::TempStorage temp_storage[BLOCK_WARPS];

  while (out_row < last_row) {
@@ -546,7 +548,7 @@ __global__ void _SegmentMaskColKernel(
        mask[idx] = 1;
      }
    }
-    IdType reduce_count = WarpReduce(temp_storage[warp_id]).Sum(local_count);
+    IdType reduce_count = WarpReduce(temp_storage[warp_id]).Sum(local_count);    
    if (laneid == 0) {
      count[out_row] = reduce_count;
    }
@@ -557,7 +559,7 @@ __global__ void _SegmentMaskColKernel(
 template <DGLDeviceType XPU, typename IdType>
 CSRMatrix CSRSliceMatrix(
    CSRMatrix csr, runtime::NDArray rows, runtime::NDArray cols) {
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  const auto& ctx = rows->ctx;
  const auto& dtype = rows->dtype;
  const auto nbits = dtype.bits;
@@ -582,7 +584,7 @@ CSRMatrix CSRSliceMatrix(
  // A count for how many masked values per row.
  IdArray count = NewIdArray(csr.num_rows, ctx, nbits);
  CUDA_CALL(
-      cudaMemset(count.Ptr<IdType>(), 0, sizeof(IdType) * (csr.num_rows)));
+      hipMemset(count.Ptr<IdType>(), 0, sizeof(IdType) * (csr.num_rows)));

  // Generate a NodeQueryHashmap buffer. The key of the hashmap is col.
  // For performance, the load factor of the hashmap is in (0.25, 0.5);
@@ -593,7 +595,7 @@ CSRMatrix CSRSliceMatrix(

  using it = thrust::counting_iterator<int64_t>;
  runtime::CUDAWorkspaceAllocator allocator(ctx);
-  const auto exec_policy = thrust::cuda::par_nosync(allocator).on(stream);
+  const auto exec_policy = thrust::hip::par_nosync(allocator).on(stream);
  thrust::for_each(
      exec_policy, it(0), it(new_ncols),
      [key = cols.Ptr<IdType>(), buffer = hashmap_buffer.Ptr<IdType>(),
@@ -609,14 +611,15 @@ CSRMatrix CSRSliceMatrix(

  // Execute SegmentMaskColKernel
  const int64_t num_rows = csr.num_rows;
-  constexpr int WARP_SIZE = 32;
+  constexpr int WARP_SIZE = 64;
  // With a simple fine-tuning, TILE_SIZE=16 gives a good performance.
-  constexpr int TILE_SIZE = 16;
+  constexpr int TILE_SIZE = 32;
  constexpr int BLOCK_WARPS = CUDA_MAX_NUM_THREADS / WARP_SIZE;
  IdType nb =
      dgl::cuda::FindNumBlocks<'x'>((num_rows + TILE_SIZE - 1) / TILE_SIZE);
  const dim3 nthrs(WARP_SIZE, BLOCK_WARPS);
  const dim3 nblks(nb);
+
  CUDA_KERNEL_CALL(
      (_SegmentMaskColKernel<IdType, WARP_SIZE, BLOCK_WARPS, TILE_SIZE>), nblks,
      nthrs, 0, stream, indptr_data, indices_data, num_rows,

--- a/src/array/cuda/spmm.cuh
+++ b/src/array/cuda/spmm.cuh
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
 *  Copyright (c) 2020 by Contributors
 * @file array/cuda/spmm.cuh
@@ -11,7 +13,7 @@
 #include <limits>

 #include "../../runtime/cuda/cuda_common.h"
-#include "./utils.h"
+#include "utils.h"
 #include "atomic.cuh"
 #include "bf16.cuh"
 #include "fp16.cuh"
@@ -28,14 +30,14 @@ namespace aten {
 */
 template <typename DType, typename IdType>
 inline bool cusparse_available(bool more_nnz_than_matrix_size) {
-#if CUDART_VERSION < 11000
+#if DTKRT_VERSION < 11000
  if (std::is_same<IdType, int>::value &&
      (std::is_same<DType, float>::value || std::is_same<DType, double>::value))
    return true;
  return false;
 #else
  if (std::is_same<DType, __half>::value ||
-      std::is_same<DType, __nv_bfloat16>::value)
+      std::is_same<DType, __hip_bfloat16>::value)
    return false;  // cusparse's SpMM on fp16 is slow, temporally disabled.
  // If the CSR matrix has more NNZ than matrix size, we should not use
  // cuSPARSE 11.1.
@@ -47,54 +49,54 @@ namespace {

 /** @brief Call cuBLAS geam API for transpose operation for float and double. */
 template <typename DType>
-cublasStatus_t Xgeam(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
+hipblasStatus_t Xgeam(
+    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
    int m, int n, const DType* alpha, const DType* A, int lda,
    const DType* beta, const DType* B, int ldb, DType* C, int ldc) {
  LOG(FATAL) << "Not supported dtype";
-  return CUBLAS_STATUS_EXECUTION_FAILED;
+  return HIPBLAS_STATUS_EXECUTION_FAILED;
 }

 template <>
-cublasStatus_t Xgeam<__half>(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
+hipblasStatus_t Xgeam<__half>(
+    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
    int m, int n, const __half* alpha, const __half* A, int lda,
    const __half* beta, const __half* B, int ldb, __half* C, int ldc) {
  // TODO(ndickson): There is no cublasHgeam, so a different
  // implementation would be required.
  LOG(FATAL) << "Xgeam does not support dtype half (FP16)";
-  return CUBLAS_STATUS_EXECUTION_FAILED;
+  return HIPBLAS_STATUS_EXECUTION_FAILED;
 }

 #if BF16_ENABLED
 template <>
-cublasStatus_t Xgeam<__nv_bfloat16>(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, const __nv_bfloat16* alpha, const __nv_bfloat16* A, int lda,
-    const __nv_bfloat16* beta, const __nv_bfloat16* B, int ldb,
-    __nv_bfloat16* C, int ldc) {
+hipblasStatus_t Xgeam<__hip_bfloat16>(
+    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
+    int m, int n, const __hip_bfloat16* alpha, const __hip_bfloat16* A, int lda,
+    const __hip_bfloat16* beta, const __hip_bfloat16* B, int ldb,
+    __hip_bfloat16* C, int ldc) {
  // TODO(ndickson): There is no cublasHgeam, so a different
  // implementation would be required.
  LOG(FATAL) << "Xgeam does not support dtype bfloat16 (BF16)";
-  return CUBLAS_STATUS_EXECUTION_FAILED;
+  return HIPBLAS_STATUS_EXECUTION_FAILED;
 }
 #endif  // BF16_ENABLED

 template <>
-cublasStatus_t Xgeam<float>(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
+hipblasStatus_t Xgeam<float>(
+    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
    int m, int n, const float* alpha, const float* A, int lda,
    const float* beta, const float* B, int ldb, float* C, int ldc) {
-  return cublasSgeam(
+  return hipblasSgeam(
      handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
 }

 template <>
-cublasStatus_t Xgeam<double>(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
+hipblasStatus_t Xgeam<double>(
+    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
    int m, int n, const double* alpha, const double* A, int lda,
    const double* beta, const double* B, int ldb, double* C, int ldc) {
-  return cublasDgeam(
+  return hipblasDgeam(
      handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
 }

@@ -119,12 +121,12 @@ template <typename DType>
 void _Transpose(const DType* in, DType* out, int row, int col) {
  DType alpha = 1., beta = 0.;
  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  if (!thr_entry->cublas_handle)
-    CUBLAS_CALL(cublasCreate(&(thr_entry->cublas_handle)));
-  CUBLAS_CALL(cublasSetStream(thr_entry->cublas_handle, stream));
+    CUBLAS_CALL(hipblasCreate(&(thr_entry->cublas_handle)));
+  CUBLAS_CALL(hipblasSetStream(thr_entry->cublas_handle, stream));
  CUBLAS_CALL(Xgeam<DType>(
-      thr_entry->cublas_handle, CUBLAS_OP_T, CUBLAS_OP_N, row, col, &alpha, in,
+      thr_entry->cublas_handle, HIPBLAS_OP_T, HIPBLAS_OP_N, row, col, &alpha, in,
      col, &beta, nullptr, row, out, row));
 }

@@ -134,7 +136,7 @@ void _Transpose(const DType* in, DType* out, int row, int col) {
 */
 template <>
 void _Transpose<__half>(const __half* in, __half* out, int row, int col) {
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  int nt = FindNumThreads(row);
  int nb = col;
  CUDA_KERNEL_CALL(_TransposeKernel, nb, nt, 0, stream, in, out, col, row);
@@ -146,47 +148,47 @@ void _Transpose<__half>(const __half* in, __half* out, int row, int col) {
 * @note cuBLAS has no geam API for bf16 data type, fallback to our kernel.
 */
 template <>
-void _Transpose<__nv_bfloat16>(
-    const __nv_bfloat16* in, __nv_bfloat16* out, int row, int col) {
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+void _Transpose<__hip_bfloat16>(
+    const __hip_bfloat16* in, __hip_bfloat16* out, int row, int col) {
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  int nt = FindNumThreads(row);
  int nb = col;
  CUDA_KERNEL_CALL(_TransposeKernel, nb, nt, 0, stream, in, out, col, row);
 }
 #endif  // BF16_ENABLED

-#if CUDART_VERSION < 11000
+#if DTKRT_VERSION < 11000
 template <typename DType>
-cusparseStatus_t Xcsrmm2(
-    cusparseHandle_t handle, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int n, int k, int nnz,
-    const DType* alpha, const cusparseMatDescr_t descrA, const DType* csrValA,
+hipsparseStatus_t Xcsrmm2(
+    hipsparseHandle_t handle, hipsparseOperation_t transA,
+    hipsparseOperation_t transB, int m, int n, int k, int nnz,
+    const DType* alpha, const hipsparseMatDescr_t descrA, const DType* csrValA,
    const int* csrRowPtrA, const int* csrColIndA, const DType* B, int ldb,
    const DType* beta, DType* C, int ldc) {
  LOG(INFO) << "Not supported dtype";
-  return CUSPARSE_STATUS_EXECUTION_FAILED;
+  return HIPSPARSE_STATUS_EXECUTION_FAILED;
 }

 template <>
-cusparseStatus_t Xcsrmm2<float>(
-    cusparseHandle_t handle, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int n, int k, int nnz,
-    const float* alpha, const cusparseMatDescr_t descrA, const float* csrValA,
+hipsparseStatus_t Xcsrmm2<float>(
+    hipsparseHandle_t handle, hipsparseOperation_t transA,
+    hipsparseOperation_t transB, int m, int n, int k, int nnz,
+    const float* alpha, const hipsparseMatDescr_t descrA, const float* csrValA,
    const int* csrRowPtrA, const int* csrColIndA, const float* B, int ldb,
    const float* beta, float* C, int ldc) {
-  return cusparseScsrmm2(
+  return hipsparseScsrmm2(
      handle, transA, transB, m, n, k, nnz, alpha, descrA, csrValA, csrRowPtrA,
      csrColIndA, B, ldb, beta, C, ldc);
 }

 template <>
-cusparseStatus_t Xcsrmm2<double>(
-    cusparseHandle_t handle, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int n, int k, int nnz,
-    const double* alpha, const cusparseMatDescr_t descrA, const double* csrValA,
+hipsparseStatus_t Xcsrmm2<double>(
+    hipsparseHandle_t handle, hipsparseOperation_t transA,
+    hipsparseOperation_t transB, int m, int n, int k, int nnz,
+    const double* alpha, const hipsparseMatDescr_t descrA, const double* csrValA,
    const int* csrRowPtrA, const int* csrColIndA, const double* B, int ldb,
    const double* beta, double* C, int ldc) {
-  return cusparseDcsrmm2(
+  return hipsparseDcsrmm2(
      handle, transA, transB, m, n, k, nnz, alpha, descrA, csrValA, csrRowPtrA,
      csrColIndA, B, ldb, beta, C, ldc);
 }
@@ -214,12 +216,12 @@ void CusparseCsrmm2(
  // device
  auto device = runtime::DeviceAPI::Get(ctx);
  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  // allocate cusparse handle if needed
  if (!thr_entry->cusparse_handle) {
-    CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
+    CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
  }
-  CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream));
+  CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream));
  // all one data array
  DType* valptr = nullptr;
  if (!A_data) {
@@ -227,55 +229,59 @@ void CusparseCsrmm2(
        static_cast<DType*>(device->AllocWorkspace(ctx, nnz * sizeof(DType)));
    _Fill(valptr, nnz, static_cast<DType>(1.));
  }
-#if CUDART_VERSION >= 11000
-  cusparseSpMatDescr_t matA;
-  cusparseDnMatDescr_t matB, matC;
+#if DTKRT_VERSION >= 11000
+  hipsparseSpMatDescr_t matA;
+  hipsparseDnMatDescr_t matB, matC;
  constexpr auto dtype = cuda_dtype<DType>::value;
  constexpr auto idtype = cusparse_idtype<IdType>::value;
-  CUSPARSE_CALL(cusparseCreateCsr(
+  CUSPARSE_CALL(hipsparseCreateCsr(
      &matA, m, k, nnz, static_cast<IdType*>(csr.indptr->data),
      static_cast<IdType*>(csr.indices->data),
      const_cast<DType*>(valptr ? valptr : A_data), idtype, idtype,
-      CUSPARSE_INDEX_BASE_ZERO, dtype));
-  CUSPARSE_CALL(cusparseCreateDnMat(
-      &matB, k, n, n, const_cast<DType*>(B_data), dtype, CUSPARSE_ORDER_ROW));
+      HIPSPARSE_INDEX_BASE_ZERO, dtype));
+  CUSPARSE_CALL(hipsparseCreateDnMat(
+      &matB, k, n, n, const_cast<DType*>(B_data), dtype, HIPSPARSE_ORDER_ROW));
  CUSPARSE_CALL(
-      cusparseCreateDnMat(&matC, m, n, n, C_data, dtype, CUSPARSE_ORDER_ROW));
+      hipsparseCreateDnMat(&matC, m, n, n, C_data, dtype, HIPSPARSE_ORDER_ROW));

-  auto transA = CUSPARSE_OPERATION_NON_TRANSPOSE;
-  auto transB = CUSPARSE_OPERATION_NON_TRANSPOSE;
+  auto transA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
+  auto transB = HIPSPARSE_OPERATION_NON_TRANSPOSE;
  size_t workspace_size;
+
  cusparseSpMMAlg_t spmm_alg = use_deterministic_alg_only
                                   ? CUSPARSE_SPMM_CSR_ALG3
                                   : CUSPARSE_SPMM_CSR_ALG2;
-  CUSPARSE_CALL(cusparseSpMM_bufferSize(
+  CUSPARSE_CALL(hipsparseSpMM_bufferSize(
      thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
-      matC, dtype, spmm_alg, &workspace_size));
+      matC, dtype, HIPSPARSE_SPMM_CSR_ALG2, &workspace_size));
+
  void* workspace = device->AllocWorkspace(ctx, workspace_size);
-  CUSPARSE_CALL(cusparseSpMM(
+  CUSPARSE_CALL(hipsparseSpMM(
      thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
-      matC, dtype, spmm_alg, workspace));
+
+      matC, dtype, HIPSPARSE_SPMM_CSR_ALG2, workspace));
+
  device->FreeWorkspace(ctx, workspace);

-  CUSPARSE_CALL(cusparseDestroySpMat(matA));
-  CUSPARSE_CALL(cusparseDestroyDnMat(matB));
-  CUSPARSE_CALL(cusparseDestroyDnMat(matC));
+  CUSPARSE_CALL(hipsparseDestroySpMat(matA));
+  CUSPARSE_CALL(hipsparseDestroyDnMat(matB));
+  CUSPARSE_CALL(hipsparseDestroyDnMat(matC));
 #else
  // allocate matrix for temporary transposed output
  DType* trans_out =
      static_cast<DType*>(device->AllocWorkspace(ctx, m * n * sizeof(DType)));

-  cusparseMatDescr_t descr;
-  CUSPARSE_CALL(cusparseCreateMatDescr(&descr));
-  CUSPARSE_CALL(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL));
-  CUSPARSE_CALL(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO));
+  hipsparseMatDescr_t descr;
+  CUSPARSE_CALL(hipsparseCreateMatDescr(&descr));
+  CUSPARSE_CALL(hipsparseSetMatType(descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
+  CUSPARSE_CALL(hipsparseSetMatIndexBase(descr, HIPSPARSE_INDEX_BASE_ZERO));
  CUSPARSE_CALL(Xcsrmm2<DType>(
-      thr_entry->cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
-      CUSPARSE_OPERATION_TRANSPOSE, m, n, k, nnz, &alpha, descr,
+      thr_entry->cusparse_handle, HIPSPARSE_OPERATION_NON_TRANSPOSE,
+      HIPSPARSE_OPERATION_TRANSPOSE, m, n, k, nnz, &alpha, descr,
      (valptr) ? valptr : A_data, static_cast<int32_t*>(csr.indptr->data),
      static_cast<int32_t*>(csr.indices->data), B_data, n, &beta, trans_out,
      m));
-  CUSPARSE_CALL(cusparseDestroyMatDescr(descr));
+  CUSPARSE_CALL(hipsparseDestroyMatDescr(descr));
  // transpose the output matrix
  _Transpose(trans_out, C_data, n, m);
  device->FreeWorkspace(ctx, trans_out);
@@ -287,8 +293,10 @@ void CusparseCsrmm2(
 template <typename DType, typename IdType>
 void CusparseCsrmm2Hetero(
    const DGLContext& ctx, const CSRMatrix& csr, const DType* B_data,
+
    const DType* A_data, DType* C_data, int64_t x_length, cudaStream_t strm_id,
    bool use_deterministic_alg_only = false) {
+
  // We use csrmm2 to perform following operation:
  // C = A x B, where A is a sparse matrix in csr format, B is the dense matrix
  // for node feature tensor. However, since cusparse only supports
@@ -311,9 +319,9 @@ void CusparseCsrmm2Hetero(
  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
  // allocate cusparse handle if needed
  if (!thr_entry->cusparse_handle) {
-    CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
+    CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
  }
-  CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, strm_id));
+  CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, strm_id));
  // all one data array
  DType* valptr = nullptr;
  if (!A_data) {
@@ -321,51 +329,53 @@ void CusparseCsrmm2Hetero(
        static_cast<DType*>(device->AllocWorkspace(ctx, nnz * sizeof(DType)));
    _Fill(valptr, nnz, static_cast<DType>(1.));
  }
-#if CUDART_VERSION >= 11000
-  cusparseSpMatDescr_t matA;
-  cusparseDnMatDescr_t matB, matC;
+#if DTKRT_VERSION >= 11000
+  hipsparseSpMatDescr_t matA;
+  hipsparseDnMatDescr_t matB, matC;
  constexpr auto dtype = cuda_dtype<DType>::value;
  constexpr auto idtype = cusparse_idtype<IdType>::value;
-  CUSPARSE_CALL(cusparseCreateCsr(
+  CUSPARSE_CALL(hipsparseCreateCsr(
      &matA, m, k, nnz, static_cast<IdType*>(csr.indptr->data),
      static_cast<IdType*>(csr.indices->data),
      const_cast<DType*>(valptr ? valptr : A_data), idtype, idtype,
-      CUSPARSE_INDEX_BASE_ZERO, dtype));
-  CUSPARSE_CALL(cusparseCreateDnMat(
-      &matB, k, n, n, const_cast<DType*>(B_data), dtype, CUSPARSE_ORDER_ROW));
+      HIPSPARSE_INDEX_BASE_ZERO, dtype));
+  CUSPARSE_CALL(hipsparseCreateDnMat(
+      &matB, k, n, n, const_cast<DType*>(B_data), dtype, HIPSPARSE_ORDER_ROW));
  CUSPARSE_CALL(
-      cusparseCreateDnMat(&matC, m, n, n, C_data, dtype, CUSPARSE_ORDER_ROW));
+      hipsparseCreateDnMat(&matC, m, n, n, C_data, dtype, HIPSPARSE_ORDER_ROW));

-  auto transA = CUSPARSE_OPERATION_NON_TRANSPOSE;
-  auto transB = CUSPARSE_OPERATION_NON_TRANSPOSE;
+  auto transA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
+  auto transB = HIPSPARSE_OPERATION_NON_TRANSPOSE;
  size_t workspace_size;
+
  cusparseSpMMAlg_t spmm_alg = use_deterministic_alg_only
                                   ? CUSPARSE_SPMM_CSR_ALG3
                                   : CUSPARSE_SPMM_CSR_ALG2;
-  CUSPARSE_CALL(cusparseSpMM_bufferSize(
+  CUSPARSE_CALL(hipsparseSpMM_bufferSize(
      thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
-      matC, dtype, spmm_alg, &workspace_size));
+      matC, dtype, HIPSPARSE_SPMM_CSR_ALG2, &workspace_size));
  void* workspace = device->AllocWorkspace(ctx, workspace_size);
-  CUSPARSE_CALL(cusparseSpMM(
+  CUSPARSE_CALL(hipsparseSpMM(
      thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
-      matC, dtype, spmm_alg, workspace));
+      matC, dtype, HIPSPARSE_SPMM_CSR_ALG2, workspace));
+
  device->FreeWorkspace(ctx, workspace);

-  CUSPARSE_CALL(cusparseDestroySpMat(matA));
-  CUSPARSE_CALL(cusparseDestroyDnMat(matB));
-  CUSPARSE_CALL(cusparseDestroyDnMat(matC));
+  CUSPARSE_CALL(hipsparseDestroySpMat(matA));
+  CUSPARSE_CALL(hipsparseDestroyDnMat(matB));
+  CUSPARSE_CALL(hipsparseDestroyDnMat(matC));
 #else
-  cusparseMatDescr_t descr;
-  CUSPARSE_CALL(cusparseCreateMatDescr(&descr));
-  CUSPARSE_CALL(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL));
-  CUSPARSE_CALL(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO));
+  hipsparseMatDescr_t descr;
+  CUSPARSE_CALL(hipsparseCreateMatDescr(&descr));
+  CUSPARSE_CALL(hipsparseSetMatType(descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
+  CUSPARSE_CALL(hipsparseSetMatIndexBase(descr, HIPSPARSE_INDEX_BASE_ZERO));
  CHECK_EQ(sizeof(IdType), sizeof(int32_t));
  CUSPARSE_CALL(Xcsrmm2<DType>(
-      thr_entry->cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
-      CUSPARSE_OPERATION_TRANSPOSE, m, n, k, nnz, &alpha, descr,
+      thr_entry->cusparse_handle, HIPSPARSE_OPERATION_NON_TRANSPOSE,
+      HIPSPARSE_OPERATION_TRANSPOSE, m, n, k, nnz, &alpha, descr,
      (valptr) ? valptr : A_data, static_cast<int32_t*>(csr.indptr->data),
      static_cast<int32_t*>(csr.indices->data), B_data, n, &beta, C_data, m));
-  CUSPARSE_CALL(cusparseDestroyMatDescr(descr));
+  CUSPARSE_CALL(hipsparseDestroyMatDescr(descr));
 #endif
  if (valptr) device->FreeWorkspace(ctx, valptr);
 }
@@ -632,7 +642,7 @@ void SpMMCoo(
   */
 #if BF16_ENABLED
  if (std::is_same<DType, __half>::value ||
-      std::is_same<DType, __nv_bfloat16>::value)
+      std::is_same<DType, __hip_bfloat16>::value)
 #else
  if (std::is_same<DType, __half>::value)
 #endif  // BF16_ENABLED
@@ -645,7 +655,7 @@ void SpMMCoo(
              *efeat_data = efeat.Ptr<DType>();
  DType* out_data = out.Ptr<DType>();
  Idx *argu_data = argu.Ptr<Idx>(), *arge_data = arge.Ptr<Idx>();
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  const int64_t N = coo.num_rows, M = coo.num_cols, E = coo.row->shape[0];

  int64_t *ubcast_off = nullptr, *ebcast_off = nullptr;
@@ -710,7 +720,7 @@ void SpMMCsr(
  Idx* argu_data = argu.Ptr<Idx>();
  Idx* arge_data = arge.Ptr<Idx>();

-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();

  int64_t *ubcast_off = nullptr, *ebcast_off = nullptr;
  int64_t len = bcast.out_len, lhs_len = bcast.lhs_len, rhs_len = bcast.rhs_len;
@@ -771,7 +781,7 @@ void SpMMCmpCsrHetero(
  Idx* argu_data = argu.Ptr<Idx>();
  Idx* arge_data = arge.Ptr<Idx>();

-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();

  int64_t *ubcast_off = nullptr, *ebcast_off = nullptr;
  int64_t len = bcast.out_len, lhs_len = bcast.lhs_len, rhs_len = bcast.rhs_len;

--- a/src/array/cuda/spmm.cu
+++ b/src/array/cuda/spmm.cu
+// !!! This is a file automatically generated by hipify!!!
 /**
 *  Copyright (c) 2020 by Contributors
 * @file array/cuda/spmm.cu
@@ -8,9 +9,9 @@
 #include <cstdlib>

 #include "../../runtime/cuda/cuda_common.h"
-#include "./functor.cuh"
-#include "./ge_spmm.cuh"
-#include "./spmm.cuh"
+#include "functor.cuh"
+#include "ge_spmm.cuh"
+#include "spmm.cuh"

 namespace dgl {

@@ -114,11 +115,11 @@ template void SpMMCsr<kDGLCUDA, int64_t, __half>(
    const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out,
    std::vector<NDArray> out_aux);
 #if BF16_ENABLED
-template void SpMMCsr<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void SpMMCsr<kDGLCUDA, int32_t, __hip_bfloat16>(
    const std::string& op, const std::string& reduce, const BcastOff& bcast,
    const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out,
    std::vector<NDArray> out_aux);
-template void SpMMCsr<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void SpMMCsr<kDGLCUDA, int64_t, __hip_bfloat16>(
    const std::string& op, const std::string& reduce, const BcastOff& bcast,
    const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out,
    std::vector<NDArray> out_aux);
@@ -149,11 +150,11 @@ template void SpMMCoo<kDGLCUDA, int64_t, __half>(
    const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out,
    std::vector<NDArray> out_aux);
 #if BF16_ENABLED
-template void SpMMCoo<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void SpMMCoo<kDGLCUDA, int32_t, __hip_bfloat16>(
    const std::string& op, const std::string& reduce, const BcastOff& bcast,
    const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out,
    std::vector<NDArray> out_aux);
-template void SpMMCoo<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void SpMMCoo<kDGLCUDA, int64_t, __hip_bfloat16>(
    const std::string& op, const std::string& reduce, const BcastOff& bcast,
    const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out,
    std::vector<NDArray> out_aux);

--- a/src/array/cuda/spmm_hetero.cu
+++ b/src/array/cuda/spmm_hetero.cu
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
 *  Copyright (c) 2020 by Contributors
 * @file array/cuda/spmm.cu
@@ -8,9 +10,9 @@
 #include <cstdlib>

 #include "../../runtime/cuda/cuda_common.h"
-#include "./functor.cuh"
-#include "./ge_spmm.cuh"
-#include "./spmm.cuh"
+#include "functor.cuh"
+#include "ge_spmm.cuh"
+#include "spmm.cuh"

 namespace dgl {

@@ -42,7 +44,7 @@ void SpMMCsrHetero(
    use_deterministic_alg_only = true;

  bool use_legacy_cusparsemm =
-      (CUDART_VERSION < 11000) && (reduce == "sum") &&
+      (DTKRT_VERSION < 11000) && (reduce == "sum") &&
      // legacy cuSPARSE does not care about NNZ, hence the argument "false".
      ((op == "copy_lhs" && cusparse_available<DType, IdType>(false)) ||
       (op == "mul" && is_scalar_efeat &&
@@ -55,7 +57,7 @@ void SpMMCsrHetero(
      if (m == 0) continue;
      DType* out = static_cast<DType*>(device->AllocWorkspace(
          vec_csr[0].indptr->ctx, m * n * sizeof(DType)));
-      CUDA_CALL(cudaMemset(out, 0, m * n * sizeof(DType)));
+      CUDA_CALL(hipMemset(out, 0, m * n * sizeof(DType)));
      trans_out[ntype] = out;
    }
  }
@@ -116,7 +118,7 @@ void SpMMCsrHetero(
    }
  }

-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  for (dgl_type_t etype = 0; etype < ufeat_ntids.size(); ++etype) {
    const dgl_type_t src_id = ufeat_ntids[etype];
    const dgl_type_t dst_id = out_ntids[etype];
@@ -128,7 +130,7 @@ void SpMMCsrHetero(
          cusparse_available<DType, IdType>(more_nnz)) {  // cusparse
        /* If CUDA is less than 11.0, put the output in trans_out for later
         * transposition */
-        DType* out = (CUDART_VERSION < 11000)
+        DType* out = (DTKRT_VERSION < 11000)
                         ? trans_out[dst_id]
                         : static_cast<DType*>((*vec_out)[dst_id]->data);
        CusparseCsrmm2Hetero<DType, IdType>(
@@ -214,14 +216,14 @@ template void SpMMCsrHetero<kDGLCUDA, int64_t, __half>(
    const std::vector<dgl_type_t>& ufeat_ntids,
    const std::vector<dgl_type_t>& out_ntids);
 #if BF16_ENABLED
-template void SpMMCsrHetero<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void SpMMCsrHetero<kDGLCUDA, int32_t, __hip_bfloat16>(
    const std::string& op, const std::string& reduce, const BcastOff& bcast,
    const std::vector<CSRMatrix>& csr, const std::vector<NDArray>& ufeat,
    const std::vector<NDArray>& efeat, std::vector<NDArray>* out,
    std::vector<std::vector<NDArray>>* out_aux,
    const std::vector<dgl_type_t>& ufeat_ntids,
    const std::vector<dgl_type_t>& out_ntids);
-template void SpMMCsrHetero<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void SpMMCsrHetero<kDGLCUDA, int64_t, __hip_bfloat16>(
    const std::string& op, const std::string& reduce, const BcastOff& bcast,
    const std::vector<CSRMatrix>& csr, const std::vector<NDArray>& ufeat,
    const std::vector<NDArray>& efeat, std::vector<NDArray>* out,

--- a/src/array/cuda/utils.h
+++ b/src/array/cuda/utils.h
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
 *  Copyright (c) 2020 by Contributors
 * @file array/cuda/utils.h
@@ -11,7 +13,7 @@
 #include <dgl/runtime/ndarray.h>
 #include <dmlc/logging.h>

-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
 #include <type_traits>

 #include "../../runtime/cuda/cuda_common.h"
@@ -90,7 +92,7 @@ inline int FindNumBlocks(int nblks, int max_nblks = -1) {

 template <typename T>
 __device__ __forceinline__ T _ldg(T* addr) {
-#if __CUDA_ARCH__ >= 350
+#if __HIP_DEVICE_COMPILE__ 
  return __ldg(addr);
 #else
  return *addr;
@@ -126,7 +128,7 @@ __global__ void _FillKernel(DType* ptr, size_t length, DType val) {
 /** @brief Fill the vector started from ptr of size length with val */
 template <typename DType>
 void _Fill(DType* ptr, size_t length, DType val) {
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  int nt = FindNumThreads(length);
  int nb =
      (length + nt - 1) / nt;  // on x-axis, no need to worry about upperbound.
@@ -185,8 +187,8 @@ template <typename IdType>
 __global__ void _LinearSearchKernel(
    const IdType* indptr, const IdType* indices, const IdType* data,
    const IdType* row, const IdType* col, int64_t row_stride,
-    int64_t col_stride, int64_t length, const __nv_bfloat16* weights,
-    __nv_bfloat16 filler, __nv_bfloat16* out) {
+    int64_t col_stride, int64_t length, const __hip_bfloat16* weights,
+    __hip_bfloat16 filler, __hip_bfloat16* out) {
  int tx = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride_x = gridDim.x * blockDim.x;
  while (tx < length) {
@@ -204,7 +206,7 @@ __global__ void _LinearSearchKernel(
    } else {
      // If the result is saved in bf16, it should be fine to convert it to
      // float first
-      out[tx] = weights ? weights[v] : __nv_bfloat16(static_cast<float>(v));
+      out[tx] = weights ? weights[v] : __hip_bfloat16(static_cast<float>(v));
    }
    tx += stride_x;
  }
@@ -277,12 +279,12 @@ template <typename DType, typename BoolType>
 void MaskSelect(
    runtime::DeviceAPI* device, const DGLContext& ctx, const DType* input,
    const BoolType* mask, DType* output, int64_t n, int64_t* rst,
-    cudaStream_t stream) {
+    hipStream_t stream) {
  size_t workspace_size = 0;
-  CUDA_CALL(cub::DeviceSelect::Flagged(
+  CUDA_CALL(hipcub::DeviceSelect::Flagged(
      nullptr, workspace_size, input, mask, output, rst, n, stream));
  void* workspace = device->AllocWorkspace(ctx, workspace_size);
-  CUDA_CALL(cub::DeviceSelect::Flagged(
+  CUDA_CALL(hipcub::DeviceSelect::Flagged(
      workspace, workspace_size, input, mask, output, rst, n, stream));
  device->FreeWorkspace(ctx, workspace);
 }
@@ -290,7 +292,7 @@ void MaskSelect(
 inline void* GetDevicePointer(runtime::NDArray array) {
  void* ptr = array->data;
  if (array.IsPinned()) {
-    CUDA_CALL(cudaHostGetDevicePointer(&ptr, ptr, 0));
+    CUDA_CALL(hipHostGetDevicePointer(&ptr, ptr, 0));
  }
  return ptr;
 }