Commit 6ac701f8 authored by sangwzh

update src and graphbolt code

parent 1547bd93
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2021 by Contributors
* @file array/cuda/negative_sampling.cu
* @brief negative sampling
*/
#include <curand_kernel.h>
#include <hiprand/hiprand_kernel.h>
#include <dgl/array.h>
#include <dgl/array_iterator.h>
#include <dgl/random.h>
#include <cub/cub.cuh>
#include <hipcub/hipcub.hpp>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
#include "utils.h"
using namespace dgl::runtime;
......@@ -31,13 +33,13 @@ __global__ void _GlobalUniformNegativeSamplingKernel(
int64_t tx = blockIdx.x * blockDim.x + threadIdx.x;
const int stride_x = gridDim.x * blockDim.x;
curandStatePhilox4_32_10_t
hiprandStatePhilox4_32_10_t
rng; // this allows generating 4 32-bit ints at a time
curand_init(random_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
hiprand_init(random_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
while (tx < num_samples) {
for (int i = 0; i < num_trials; ++i) {
uint4 result = curand4(&rng);
uint4 result = hiprand4(&rng);
// Turns out that result.x is always 0 with the above RNG.
uint64_t y_hi = result.y >> 16;
uint64_t y_lo = result.y & 0xFFFF;
......@@ -88,7 +90,7 @@ struct IsNotMinusOne {
template <typename IdType>
void SortOrderedPairs(
runtime::DeviceAPI* device, DGLContext ctx, IdType* major, IdType* minor,
IdType* tmp_major, IdType* tmp_minor, int64_t n, cudaStream_t stream) {
IdType* tmp_major, IdType* tmp_minor, int64_t n, hipStream_t stream) {
// Sort ordered pairs in lexicographical order by two radix sorts since
// cub's radix sorts are stable.
// We need a 2*n auxiliary storage to store the results from the first radix
......@@ -98,21 +100,21 @@ void SortOrderedPairs(
void* tmp2 = nullptr;
// Radix sort by minor key first, reorder the major key in the progress.
CUDA_CALL(cub::DeviceRadixSort::SortPairs(
CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
tmp1, s1, minor, tmp_minor, major, tmp_major, n, 0, sizeof(IdType) * 8,
stream));
tmp1 = device->AllocWorkspace(ctx, s1);
CUDA_CALL(cub::DeviceRadixSort::SortPairs(
CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
tmp1, s1, minor, tmp_minor, major, tmp_major, n, 0, sizeof(IdType) * 8,
stream));
// Radix sort by major key next.
CUDA_CALL(cub::DeviceRadixSort::SortPairs(
CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
tmp2, s2, tmp_major, major, tmp_minor, minor, n, 0, sizeof(IdType) * 8,
stream));
tmp2 = (s2 > s1) ? device->AllocWorkspace(ctx, s2)
: tmp1; // reuse buffer if s2 <= s1
CUDA_CALL(cub::DeviceRadixSort::SortPairs(
CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
tmp2, s2, tmp_major, major, tmp_minor, minor, n, 0, sizeof(IdType) * 8,
stream));
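// A minimal host-side sketch of the idea behind the two radix sorts above,
// assuming the goal stated in the comment: because both passes are stable,
// sorting by the minor key first and then by the major key leaves the pairs
// in lexicographic (major, minor) order. Names here are illustrative only.
#include <algorithm>
#include <cstdint>
#include <vector>

inline void SortOrderedPairsHostSketch(
    std::vector<int64_t>* major, std::vector<int64_t>* minor) {
  const size_t n = major->size();
  std::vector<size_t> perm(n);
  for (size_t i = 0; i < n; ++i) perm[i] = i;
  // Pass 1: stable sort by the minor key.
  std::stable_sort(perm.begin(), perm.end(),
                   [&](size_t a, size_t b) { return (*minor)[a] < (*minor)[b]; });
  // Pass 2: stable sort by the major key; ties keep the minor-key order.
  std::stable_sort(perm.begin(), perm.end(),
                   [&](size_t a, size_t b) { return (*major)[a] < (*major)[b]; });
  std::vector<int64_t> new_major(n), new_minor(n);
  for (size_t i = 0; i < n; ++i) {
    new_major[i] = (*major)[perm[i]];
    new_minor[i] = (*minor)[perm[i]];
  }
  major->swap(new_major);
  minor->swap(new_minor);
}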
......@@ -141,7 +143,7 @@ std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling(
IdType* out_row_data = out_row.Ptr<IdType>();
IdType* out_col_data = out_col.Ptr<IdType>();
auto device = runtime::DeviceAPI::Get(ctx);
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const int nt = cuda::FindNumThreads(num_actual_samples);
const int nb = (num_actual_samples + nt - 1) / nt;
std::pair<IdArray, IdArray> result;
......@@ -159,11 +161,11 @@ std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling(
IsNotMinusOne<IdType> op;
PairIterator<IdType> begin(row_data, col_data);
PairIterator<IdType> out_begin(out_row_data, out_col_data);
CUDA_CALL(cub::DeviceSelect::If(
CUDA_CALL(hipcub::DeviceSelect::If(
nullptr, tmp_size, begin, out_begin, num_out_cuda, num_actual_samples, op,
stream));
void* tmp = device->AllocWorkspace(ctx, tmp_size);
CUDA_CALL(cub::DeviceSelect::If(
CUDA_CALL(hipcub::DeviceSelect::If(
tmp, tmp_size, begin, out_begin, num_out_cuda, num_actual_samples, op,
stream));
num_out = cuda::GetCUDAScalar(device, ctx, num_out_cuda);
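// Host-side sketch of the compaction the two DeviceSelect::If calls above
// perform, assuming IsNotMinusOne keeps pairs whose entries are not -1
// (its definition is elided above). Illustrative only; the device path
// streams PairIterator elements through hipcub and reads the count back
// with GetCUDAScalar.
#include <cstdint>
#include <utility>
#include <vector>

inline std::vector<std::pair<int64_t, int64_t>> CompactPairsHostSketch(
    const std::vector<int64_t>& row, const std::vector<int64_t>& col) {
  std::vector<std::pair<int64_t, int64_t>> kept;
  kept.reserve(row.size());
  for (size_t i = 0; i < row.size(); ++i) {
    if (row[i] != -1 && col[i] != -1)  // analogous to the IsNotMinusOne predicate
      kept.emplace_back(row[i], col[i]);
  }
  return kept;  // kept.size() plays the role of num_out
}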
......@@ -181,25 +183,25 @@ std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling(
size_t tmp_size_unique = 0;
void* tmp_unique = nullptr;
CUDA_CALL(cub::DeviceSelect::Unique(
CUDA_CALL(hipcub::DeviceSelect::Unique(
nullptr, tmp_size_unique, out_begin, unique_begin, num_out_cuda,
num_out, stream));
tmp_unique = (tmp_size_unique > tmp_size)
? device->AllocWorkspace(ctx, tmp_size_unique)
: tmp; // reuse buffer
CUDA_CALL(cub::DeviceSelect::Unique(
CUDA_CALL(hipcub::DeviceSelect::Unique(
tmp_unique, tmp_size_unique, out_begin, unique_begin, num_out_cuda,
num_out, stream));
num_out = cuda::GetCUDAScalar(device, ctx, num_out_cuda);
num_out = std::min(num_samples, num_out);
num_out = ::min(num_samples, num_out);
result = {
unique_row.CreateView({num_out}, dtype),
unique_col.CreateView({num_out}, dtype)};
if (tmp_unique != tmp) device->FreeWorkspace(ctx, tmp_unique);
} else {
num_out = std::min(num_samples, num_out);
num_out = ::min(num_samples, num_out);
result = {
out_row.CreateView({num_out}, dtype),
out_col.CreateView({num_out}, dtype)};
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2021 by Contributors
* @file array/cuda/rowwise_sampling.cu
* @brief uniform rowwise sampling
*/
#include <curand_kernel.h>
#include <hiprand/hiprand_kernel.h>
#include <dgl/random.h>
#include <dgl/runtime/device_api.h>
#include <dgl/runtime/tensordispatch.h>
#include <cub/cub.cuh>
#include <hipcub/hipcub.hpp>
#include <numeric>
#include "../../array/cuda/atomic.cuh"
#include "atomic.cuh"
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
#include "utils.h"
using namespace dgl::cuda;
using namespace dgl::aten::cuda;
......@@ -126,8 +128,8 @@ __global__ void _CSRRowWiseSampleUniformKernel(
const int64_t last_row =
min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
curandStatePhilox4_32_10_t rng;
curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
hiprandStatePhilox4_32_10_t rng;
hiprand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
while (out_row < last_row) {
const int64_t row = in_rows[out_row];
......@@ -151,7 +153,7 @@ __global__ void _CSRRowWiseSampleUniformKernel(
__syncthreads();
for (int idx = num_picks + threadIdx.x; idx < deg; idx += BLOCK_SIZE) {
const int num = curand(&rng) % (idx + 1);
const int num = hiprand(&rng) % (idx + 1);
if (num < num_picks) {
// use max so as to achieve the replacement order the serial
// algorithm would have
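// The loop above is a parallel form of reservoir sampling (Algorithm R):
// the first num_picks neighbors fill the reservoir, and neighbor idx then
// replaces a uniformly chosen slot with probability num_picks / (idx + 1);
// the device code resolves concurrent replacements with an atomic max so the
// result matches the serial replacement order. A serial host sketch, with an
// illustrative name:
#include <cstdint>
#include <random>
#include <vector>

inline std::vector<int64_t> ReservoirSampleHostSketch(
    int64_t deg, int64_t num_picks, std::mt19937_64* rng) {
  std::vector<int64_t> picks;
  picks.reserve(static_cast<size_t>(num_picks));
  for (int64_t idx = 0; idx < deg; ++idx) {
    if (idx < num_picks) {
      picks.push_back(idx);  // fill the reservoir first
    } else {
      std::uniform_int_distribution<int64_t> dist(0, idx);
      const int64_t num = dist(*rng);          // uniform in [0, idx]
      if (num < num_picks) picks[num] = idx;   // replace a random slot
    }
  }
  return picks;  // min(deg, num_picks) neighbor positions, sampled uniformly
}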
......@@ -204,8 +206,8 @@ __global__ void _CSRRowWiseSampleUniformReplaceKernel(
const int64_t last_row =
min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
curandStatePhilox4_32_10_t rng;
curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
hiprandStatePhilox4_32_10_t rng;
hiprand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
while (out_row < last_row) {
const int64_t row = in_rows[out_row];
......@@ -216,7 +218,7 @@ __global__ void _CSRRowWiseSampleUniformReplaceKernel(
if (deg > 0) {
// each thread then blindly copies in rows only if deg > 0.
for (int idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) {
const int64_t edge = curand(&rng) % deg;
const int64_t edge = hiprand(&rng) % deg;
const int64_t out_idx = out_row_start + idx;
out_rows[out_idx] = row;
out_cols[out_idx] = in_index[in_row_start + edge];
......@@ -237,7 +239,7 @@ COOMatrix _CSRRowWiseSamplingUniform(
CSRMatrix mat, IdArray rows, const int64_t num_picks, const bool replace) {
const auto& ctx = rows->ctx;
auto device = runtime::DeviceAPI::Get(ctx);
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const int64_t num_rows = rows->shape[0];
const IdType* const slice_rows = static_cast<const IdType*>(rows->data);
......@@ -279,16 +281,16 @@ COOMatrix _CSRRowWiseSamplingUniform(
IdType* out_ptr = static_cast<IdType*>(
device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType)));
size_t prefix_temp_size = 0;
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
nullptr, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream));
void* prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size);
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
prefix_temp, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream));
device->FreeWorkspace(ctx, prefix_temp);
device->FreeWorkspace(ctx, out_deg);
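// Every device-wide hipcub call in this file follows the same two-phase
// idiom seen just above: a first call with a null temp-storage pointer only
// reports the workspace size, and a second call with the allocated buffer
// does the actual work. A minimal self-contained sketch of that idiom
// (the helper name and the use of hipMalloc instead of the DGL workspace
// allocator are illustrative):
#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>

inline hipError_t ExclusiveSumTwoPhaseSketch(
    const int* d_in, int* d_out, int num_items, hipStream_t stream) {
  void* d_temp = nullptr;
  size_t temp_bytes = 0;
  // Phase 1: size query only, no scan is performed.
  hipcub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, d_in, d_out, num_items, stream);
  hipError_t err = hipMalloc(&d_temp, temp_bytes);
  if (err != hipSuccess) return err;
  // Phase 2: the actual exclusive prefix sum.
  hipcub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, d_in, d_out, num_items, stream);
  return hipFree(d_temp);
}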
cudaEvent_t copyEvent;
CUDA_CALL(cudaEventCreate(&copyEvent));
hipEvent_t copyEvent;
CUDA_CALL(hipEventCreate(&copyEvent));
NDArray new_len_tensor;
if (TensorDispatcher::Global()->IsAvailable()) {
......@@ -301,10 +303,10 @@ COOMatrix _CSRRowWiseSamplingUniform(
}
// copy using the internal current stream
CUDA_CALL(cudaMemcpyAsync(
CUDA_CALL(hipMemcpyAsync(
new_len_tensor->data, out_ptr + num_rows, sizeof(IdType),
cudaMemcpyDeviceToHost, stream));
CUDA_CALL(cudaEventRecord(copyEvent, stream));
hipMemcpyDeviceToHost, stream));
CUDA_CALL(hipEventRecord(copyEvent, stream));
const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000);
......@@ -329,8 +331,8 @@ COOMatrix _CSRRowWiseSamplingUniform(
device->FreeWorkspace(ctx, out_ptr);
// wait for copying `new_len` to finish
CUDA_CALL(cudaEventSynchronize(copyEvent));
CUDA_CALL(cudaEventDestroy(copyEvent));
CUDA_CALL(hipEventSynchronize(copyEvent));
CUDA_CALL(hipEventDestroy(copyEvent));
const IdType new_len = static_cast<const IdType*>(new_len_tensor->data)[0];
picked_row = picked_row.CreateView({new_len}, picked_row->dtype);
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2022 by Contributors
* @file array/cuda/rowwise_sampling_prob.cu
......@@ -6,20 +8,20 @@
* sampling code rowwise_sampling.cu.
* @author pengqirong (OPPO), dlasalle and Xin from Nvidia.
*/
#include <curand_kernel.h>
#include <hiprand/hiprand_kernel.h>
#include <dgl/random.h>
#include <dgl/runtime/device_api.h>
#include <cub/cub.cuh>
#include <hipcub/hipcub.hpp>
#include <numeric>
#include "../../array/cuda/atomic.cuh"
#include "atomic.cuh"
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
#include "utils.h"
// require CUB 1.17 to use DeviceSegmentedSort
static_assert(
CUB_VERSION >= 101700, "Require CUB >= 1.17 to use DeviceSegmentedSort");
// static_assert(
// CUB_VERSION >= 101700, "Require CUB >= 1.17 to use DeviceSegmentedSort");
namespace dgl {
using namespace cuda;
......@@ -159,8 +161,8 @@ __global__ void _CSRAResValueKernel(
const int64_t last_row =
min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
curandStatePhilox4_32_10_t rng;
curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
hiprandStatePhilox4_32_10_t rng;
hiprand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
while (out_row < last_row) {
const int64_t row = in_rows[out_row];
......@@ -179,7 +181,7 @@ __global__ void _CSRAResValueKernel(
prob, data, idx, in_row_start, &item_prob);
// compute A-Res value
ares[ares_idx] = static_cast<FloatType>(
__powf(curand_uniform(&rng), 1.0f / item_prob));
__powf(hiprand_uniform(&rng), 1.0f / item_prob));
ares_idxs[ares_idx] = static_cast<IdType>(in_idx);
}
}
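// The key written above implements A-Res weighted sampling without
// replacement (Efraimidis & Spirakis): each candidate draws key = u^(1/w)
// with u ~ U(0, 1), and the num_picks largest keys win; the device path
// later selects them with a descending segmented sort. A serial host sketch
// with illustrative names:
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <numeric>
#include <random>
#include <vector>

inline std::vector<int64_t> AResSampleHostSketch(
    const std::vector<float>& weights, int64_t num_picks, std::mt19937_64* rng) {
  std::uniform_real_distribution<float> uni(0.0f, 1.0f);
  std::vector<float> keys(weights.size());
  for (size_t i = 0; i < weights.size(); ++i)
    keys[i] = std::pow(uni(*rng), 1.0f / weights[i]);  // the A-Res key
  std::vector<int64_t> idx(weights.size());
  std::iota(idx.begin(), idx.end(), 0);
  const int64_t k = std::min<int64_t>(num_picks, static_cast<int64_t>(idx.size()));
  // Keep the indices of the k largest keys (descending by key).
  std::partial_sort(idx.begin(), idx.begin() + k, idx.end(),
                    [&](int64_t a, int64_t b) { return keys[a] > keys[b]; });
  idx.resize(static_cast<size_t>(k));
  return idx;
}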
......@@ -317,8 +319,8 @@ __global__ void _CSRRowWiseSampleReplaceKernel(
const int64_t last_row =
min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
curandStatePhilox4_32_10_t rng;
curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
hiprandStatePhilox4_32_10_t rng;
hiprand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
while (out_row < last_row) {
const int64_t row = in_rows[out_row];
......@@ -330,7 +332,7 @@ __global__ void _CSRRowWiseSampleReplaceKernel(
if (deg > 0) {
// Specialize BlockScan for a 1D block of BLOCK_SIZE threads
typedef cub::BlockScan<FloatType, BLOCK_SIZE> BlockScan;
typedef hipcub::BlockScan<FloatType, BLOCK_SIZE> BlockScan;
// Allocate shared memory for BlockScan
__shared__ typename BlockScan::TempStorage temp_storage;
// Initialize running total
......@@ -362,10 +364,10 @@ __global__ void _CSRRowWiseSampleReplaceKernel(
for (int64_t idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) {
// get random value
FloatType sum = cdf[cdf_row_start + deg - 1];
FloatType rand = static_cast<FloatType>(curand_uniform(&rng) * sum);
FloatType rand = static_cast<FloatType>(hiprand_uniform(&rng) * sum);
// get the offset of the first value within cdf array which is greater
// than random value.
int64_t item = cub::UpperBound<FloatType*, int64_t, FloatType>(
int64_t item = hipcub::UpperBound<FloatType*, int64_t, FloatType>(
&cdf[cdf_row_start], deg, rand);
item = min(item, deg - 1);
// get in and out index
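// Host sketch of the draw above: build an inclusive prefix sum (CDF) of the
// per-edge probabilities, draw r uniformly in [0, total), and take the first
// position whose CDF value exceeds r; the device code does the scan with
// BlockScan and the search with the UpperBound call above. Assumes a
// non-empty probability vector with a positive total; the name is illustrative.
#include <algorithm>
#include <cstdint>
#include <numeric>
#include <random>
#include <vector>

inline int64_t SampleFromCdfHostSketch(
    const std::vector<float>& prob, std::mt19937_64* rng) {
  std::vector<float> cdf(prob.size());
  std::partial_sum(prob.begin(), prob.end(), cdf.begin());  // inclusive scan
  std::uniform_real_distribution<float> uni(0.0f, cdf.back());
  const float r = uni(*rng);
  auto it = std::upper_bound(cdf.begin(), cdf.end(), r);
  const int64_t item = static_cast<int64_t>(it - cdf.begin());
  return std::min<int64_t>(item, static_cast<int64_t>(prob.size()) - 1);
}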
......@@ -411,7 +413,7 @@ COOMatrix COOGeneralRemoveIf(const COOMatrix& coo, MaskGen maskgen) {
IdType* new_row_data = new_row.Ptr<IdType>();
IdType* new_col_data = new_col.Ptr<IdType>();
IdType* new_eid_data = new_eid.Ptr<IdType>();
auto stream = runtime::getCurrentCUDAStream();
auto stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
auto device = runtime::DeviceAPI::Get(ctx);
int8_t* flags = static_cast<int8_t*>(device->AllocWorkspace(ctx, nnz));
......@@ -441,7 +443,7 @@ COOMatrix _COORemoveIf(
const COOMatrix& coo, const NDArray& values, DType criteria) {
const DType* val = values.Ptr<DType>();
auto maskgen = [val, criteria](
int nb, int nt, cudaStream_t stream, int64_t nnz,
int nb, int nt, hipStream_t stream, int64_t nnz,
const IdType* data, int8_t* flags) {
CUDA_KERNEL_CALL(
(_GenerateFlagsKernel<IdType, DType, int8_t>), nb, nt, 0, stream, nnz,
......@@ -481,7 +483,7 @@ COOMatrix _CSRRowWiseSampling(
const FloatArray& prob, bool replace) {
const auto& ctx = rows->ctx;
auto device = runtime::DeviceAPI::Get(ctx);
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const int64_t num_rows = rows->shape[0];
const IdType* const slice_rows = static_cast<const IdType*>(rows->data);
......@@ -530,10 +532,10 @@ COOMatrix _CSRRowWiseSampling(
IdType* temp_ptr = static_cast<IdType*>(
device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType)));
size_t prefix_temp_size = 0;
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
nullptr, prefix_temp_size, temp_deg, temp_ptr, num_rows + 1, stream));
void* prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size);
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
prefix_temp, prefix_temp_size, temp_deg, temp_ptr, num_rows + 1, stream));
device->FreeWorkspace(ctx, prefix_temp);
device->FreeWorkspace(ctx, temp_deg);
......@@ -551,16 +553,16 @@ COOMatrix _CSRRowWiseSampling(
IdType* out_ptr = static_cast<IdType*>(
device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType)));
prefix_temp_size = 0;
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
nullptr, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream));
prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size);
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
prefix_temp, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream));
device->FreeWorkspace(ctx, prefix_temp);
device->FreeWorkspace(ctx, out_deg);
cudaEvent_t copyEvent;
CUDA_CALL(cudaEventCreate(&copyEvent));
hipEvent_t copyEvent;
CUDA_CALL(hipEventCreate(&copyEvent));
// TODO(dlasalle): use pinned memory to overlap with the actual sampling, and
// wait on a cudaevent
IdType new_len;
......@@ -568,7 +570,7 @@ COOMatrix _CSRRowWiseSampling(
device->CopyDataFromTo(
out_ptr, num_rows * sizeof(new_len), &new_len, 0, sizeof(new_len), ctx,
DGLContext{kDGLCPU, 0}, mat.indptr->dtype);
CUDA_CALL(cudaEventRecord(copyEvent, stream));
CUDA_CALL(hipEventRecord(copyEvent, stream));
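// Sketch of the pinned-memory variant hinted at by the TODO above: an async
// device-to-host copy overlaps with other work only when the host buffer is
// pinned (hipHostMalloc), and the recorded event marks when the value is safe
// to read. The helper name and the plain error handling are illustrative,
// not part of this file.
#include <hip/hip_runtime.h>

inline hipError_t CopyScalarAsyncPinnedSketch(
    const void* d_src, size_t nbytes, void** h_pinned_out, hipStream_t stream) {
  hipError_t err = hipHostMalloc(h_pinned_out, nbytes, hipHostMallocDefault);
  if (err != hipSuccess) return err;
  hipEvent_t done;
  (void)hipEventCreate(&done);
  (void)hipMemcpyAsync(*h_pinned_out, d_src, nbytes, hipMemcpyDeviceToHost, stream);
  (void)hipEventRecord(done, stream);
  // ... other host or device work may run here ...
  (void)hipEventSynchronize(done);  // wait before reading *h_pinned_out
  return hipEventDestroy(done);
}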
// allocate workspace
// 1) for w/ replacement, it's a global buffer to store cdf segments (one
......@@ -612,16 +614,16 @@ COOMatrix _CSRRowWiseSampling(
IdType* sort_temp_idxs = static_cast<IdType*>(
device->AllocWorkspace(ctx, temp_len * sizeof(IdType)));
cub::DoubleBuffer<FloatType> sort_keys(temp, sort_temp);
cub::DoubleBuffer<IdType> sort_values(temp_idxs, sort_temp_idxs);
hipcub::DoubleBuffer<FloatType> sort_keys(temp, sort_temp);
hipcub::DoubleBuffer<IdType> sort_values(temp_idxs, sort_temp_idxs);
void* d_temp_storage = nullptr;
size_t temp_storage_bytes = 0;
CUDA_CALL(cub::DeviceSegmentedSort::SortPairsDescending(
CUDA_CALL(hipcub::DeviceSegmentedSort::SortPairsDescending(
d_temp_storage, temp_storage_bytes, sort_keys, sort_values, temp_len,
num_rows, temp_ptr, temp_ptr + 1, stream));
d_temp_storage = device->AllocWorkspace(ctx, temp_storage_bytes);
CUDA_CALL(cub::DeviceSegmentedSort::SortPairsDescending(
CUDA_CALL(hipcub::DeviceSegmentedSort::SortPairsDescending(
d_temp_storage, temp_storage_bytes, sort_keys, sort_values, temp_len,
num_rows, temp_ptr, temp_ptr + 1, stream));
device->FreeWorkspace(ctx, d_temp_storage);
......@@ -641,8 +643,8 @@ COOMatrix _CSRRowWiseSampling(
device->FreeWorkspace(ctx, out_ptr);
// wait for copying `new_len` to finish
CUDA_CALL(cudaEventSynchronize(copyEvent));
CUDA_CALL(cudaEventDestroy(copyEvent));
CUDA_CALL(hipEventSynchronize(copyEvent));
CUDA_CALL(hipEventDestroy(copyEvent));
picked_row = picked_row.CreateView({new_len}, picked_row->dtype);
picked_col = picked_col.CreateView({new_len}, picked_col->dtype);
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2020 by Contributors
* @file array/cuda/sddmm.cuh
......@@ -10,8 +12,8 @@
#include "../../runtime/cuda/cuda_common.h"
#include "../selector.h"
#include "./functor.cuh"
#include "./utils.h"
#include "functor.cuh"
#include "utils.h"
#include "atomic.cuh"
#include "bf16.cuh"
#include "fp16.cuh"
......@@ -178,7 +180,7 @@ __global__ void SDDMMCooTreeReduceKernel(
}
#pragma unroll
for (int offset = 16; offset > 0; offset /= 2)
val += __shfl_down_sync(full_mask, val, offset);
val += __shfl_down(val, offset);
if (tx == 0) outoff[i] = val;
}
}
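// The shuffle loop above is a tree reduction: with offsets 16, 8, 4, 2, 1 the
// partial sums converge onto lane 0 (tx == 0), which writes the result out.
// A host-side statement of what lane 0 ends up holding, assuming 32 active
// lanes per reduction group (name is illustrative):
#include <cstddef>

template <typename T>
T WarpSumHostSketch(const T* lane_vals /* one value per lane, 32 lanes */) {
  T sum = T(0);
  for (std::size_t lane = 0; lane < 32; ++lane) sum += lane_vals[lane];
  return sum;
}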
......@@ -275,7 +277,7 @@ void SDDMMCoo(
const DType* lhs_data = lhs.Ptr<DType>();
const DType* rhs_data = rhs.Ptr<DType>();
DType* out_data = out.Ptr<DType>();
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int64_t *lhs_off = nullptr, *rhs_off = nullptr;
int64_t len = bcast.out_len, lhs_len = bcast.lhs_len, rhs_len = bcast.rhs_len;
......@@ -337,7 +339,7 @@ void SDDMMCsr(
const DType* lhs_data = lhs.Ptr<DType>();
const DType* rhs_data = rhs.Ptr<DType>();
DType* out_data = out.Ptr<DType>();
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int64_t N = csr.num_rows, M = csr.num_cols, E = csr.indices->shape[0];
int64_t *lhs_off = nullptr, *rhs_off = nullptr;
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file array/cuda/sddmm.cu
......@@ -5,8 +6,8 @@
*/
#include <dgl/array.h>
#include "./functor.cuh"
#include "./sddmm.cuh"
#include "functor.cuh"
#include "sddmm.cuh"
namespace dgl {
namespace aten {
......@@ -48,10 +49,10 @@ template void SDDMMCsr<kDGLCUDA, int64_t, __half>(
const std::string& op, const BcastOff& bcast, const CSRMatrix& csr,
NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target);
#if BF16_ENABLED
template void SDDMMCsr<kDGLCUDA, int32_t, __nv_bfloat16>(
template void SDDMMCsr<kDGLCUDA, int32_t, __hip_bfloat16>(
const std::string& op, const BcastOff& bcast, const CSRMatrix& csr,
NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target);
template void SDDMMCsr<kDGLCUDA, int64_t, __nv_bfloat16>(
template void SDDMMCsr<kDGLCUDA, int64_t, __hip_bfloat16>(
const std::string& op, const BcastOff& bcast, const CSRMatrix& csr,
NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target);
#endif // BF16_ENABLED
......@@ -75,10 +76,10 @@ template void SDDMMCoo<kDGLCUDA, int64_t, __half>(
const std::string& op, const BcastOff& bcast, const COOMatrix& coo,
NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target);
#if BF16_ENABLED
template void SDDMMCoo<kDGLCUDA, int32_t, __nv_bfloat16>(
template void SDDMMCoo<kDGLCUDA, int32_t, __hip_bfloat16>(
const std::string& op, const BcastOff& bcast, const COOMatrix& coo,
NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target);
template void SDDMMCoo<kDGLCUDA, int64_t, __nv_bfloat16>(
template void SDDMMCoo<kDGLCUDA, int64_t, __hip_bfloat16>(
const std::string& op, const BcastOff& bcast, const COOMatrix& coo,
NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target);
#endif // BF16_ENABLED
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file array/cuda/sddmm.cu
......@@ -5,7 +6,7 @@
*/
#include <dgl/array.h>
#include "./sddmm.cuh"
#include "sddmm.cuh"
namespace dgl {
namespace aten {
......@@ -49,13 +50,13 @@ template void SDDMMCooHetero<kDGLCUDA, int64_t, __half>(
int rhs_target, const std::vector<dgl_type_t>& in_eid,
const std::vector<dgl_type_t>& out_eid);
#if BF16_ENABLED
template void SDDMMCooHetero<kDGLCUDA, int32_t, __nv_bfloat16>(
template void SDDMMCooHetero<kDGLCUDA, int32_t, __hip_bfloat16>(
const std::string& op, const BcastOff& bcast,
const std::vector<COOMatrix>& vec_coo, const std::vector<NDArray>& lhs,
const std::vector<NDArray>& rhs, std::vector<NDArray> out, int lhs_target,
int rhs_target, const std::vector<dgl_type_t>& in_eid,
const std::vector<dgl_type_t>& out_eid);
template void SDDMMCooHetero<kDGLCUDA, int64_t, __nv_bfloat16>(
template void SDDMMCooHetero<kDGLCUDA, int64_t, __hip_bfloat16>(
const std::string& op, const BcastOff& bcast,
const std::vector<COOMatrix>& vec_coo, const std::vector<NDArray>& lhs,
const std::vector<NDArray>& rhs, std::vector<NDArray> out, int lhs_target,
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file array/cuda/sddmm.cu
......@@ -5,7 +6,7 @@
*/
#include <dgl/array.h>
#include "./sddmm.cuh"
#include "sddmm.cuh"
namespace dgl {
namespace aten {
......@@ -48,13 +49,13 @@ template void SDDMMCsrHetero<kDGLCUDA, int64_t, __half>(
int rhs_target, const std::vector<dgl_type_t>& in_eid,
const std::vector<dgl_type_t>& out_eid);
#if BF16_ENABLED
template void SDDMMCsrHetero<kDGLCUDA, int32_t, __nv_bfloat16>(
template void SDDMMCsrHetero<kDGLCUDA, int32_t, __hip_bfloat16>(
const std::string& op, const BcastOff& bcast,
const std::vector<CSRMatrix>& vec_csr, const std::vector<NDArray>& lhs,
const std::vector<NDArray>& rhs, std::vector<NDArray> out, int lhs_target,
int rhs_target, const std::vector<dgl_type_t>& in_eid,
const std::vector<dgl_type_t>& out_eid);
template void SDDMMCsrHetero<kDGLCUDA, int64_t, __nv_bfloat16>(
template void SDDMMCsrHetero<kDGLCUDA, int64_t, __hip_bfloat16>(
const std::string& op, const BcastOff& bcast,
const std::vector<CSRMatrix>& vec_csr, const std::vector<NDArray>& lhs,
const std::vector<NDArray>& rhs, std::vector<NDArray> out, int lhs_target,
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2020 by Contributors
* @file array/cuda/segment_reduce.cuh
......@@ -10,8 +12,8 @@
#include <vector>
#include "../../runtime/cuda/cuda_common.h"
#include "./atomic.cuh"
#include "./utils.h"
#include "atomic.cuh"
#include "utils.h"
namespace dgl {
......@@ -125,7 +127,7 @@ void SegmentReduce(NDArray feat, NDArray offsets, NDArray out, NDArray arg) {
DType* out_data = out.Ptr<DType>();
IdType* arg_data = arg.Ptr<IdType>();
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int64_t n = out->shape[0];
int64_t dim = 1;
for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i];
......@@ -155,7 +157,7 @@ void ScatterAdd(NDArray feat, NDArray idx, NDArray out) {
const IdType* idx_data = idx.Ptr<IdType>();
DType* out_data = out.Ptr<DType>();
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int64_t n = feat->shape[0];
int64_t dim = 1;
for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i];
......@@ -186,7 +188,7 @@ void UpdateGradMinMax_hetero(
const std::vector<NDArray>& list_feat, const std::vector<NDArray>& list_idx,
const std::vector<NDArray>& list_idx_types,
std::vector<NDArray>* list_out) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
if (op == "copy_lhs" || op == "copy_rhs") {
std::vector<std::vector<dgl_id_t>> src_dst_ntypes(
graph->NumVertexTypes(), std::vector<dgl_id_t>());
......@@ -239,7 +241,7 @@ void BackwardSegmentCmp(NDArray feat, NDArray arg, NDArray out) {
const IdType* arg_data = arg.Ptr<IdType>();
DType* out_data = out.Ptr<DType>();
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int64_t n = feat->shape[0];
int64_t dim = 1;
for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i];
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file array/cuda/segment_reduce.cu
......@@ -6,9 +7,9 @@
#include <dgl/array.h>
#include <dgl/base_heterograph.h>
#include "./functor.cuh"
#include "./segment_reduce.cuh"
#include "./utils.h"
#include "functor.cuh"
#include "segment_reduce.cuh"
#include "utils.h"
namespace dgl {
......@@ -60,10 +61,10 @@ template void SegmentReduce<kDGLCUDA, int64_t, __half>(
const std::string& op, NDArray feat, NDArray offsets, NDArray out,
NDArray arg);
#if BF16_ENABLED
template void SegmentReduce<kDGLCUDA, int32_t, __nv_bfloat16>(
template void SegmentReduce<kDGLCUDA, int32_t, __hip_bfloat16>(
const std::string& op, NDArray feat, NDArray offsets, NDArray out,
NDArray arg);
template void SegmentReduce<kDGLCUDA, int64_t, __nv_bfloat16>(
template void SegmentReduce<kDGLCUDA, int64_t, __hip_bfloat16>(
const std::string& op, NDArray feat, NDArray offsets, NDArray out,
NDArray arg);
#endif // BF16_ENABLED
......@@ -85,9 +86,9 @@ template void ScatterAdd<kDGLCUDA, int32_t, __half>(
template void ScatterAdd<kDGLCUDA, int64_t, __half>(
NDArray feat, NDArray idx, NDArray out);
#if BF16_ENABLED
template void ScatterAdd<kDGLCUDA, int32_t, __nv_bfloat16>(
template void ScatterAdd<kDGLCUDA, int32_t, __hip_bfloat16>(
NDArray feat, NDArray idx, NDArray out);
template void ScatterAdd<kDGLCUDA, int64_t, __nv_bfloat16>(
template void ScatterAdd<kDGLCUDA, int64_t, __hip_bfloat16>(
NDArray feat, NDArray idx, NDArray out);
#endif // BF16_ENABLED
template void ScatterAdd<kDGLCUDA, int32_t, float>(
......@@ -108,11 +109,11 @@ template void UpdateGradMinMax_hetero<kDGLCUDA, int64_t, __half>(
const std::vector<NDArray>& feat, const std::vector<NDArray>& idx,
const std::vector<NDArray>& idx_etype, std::vector<NDArray>* out);
#if BF16_ENABLED
template void UpdateGradMinMax_hetero<kDGLCUDA, int32_t, __nv_bfloat16>(
template void UpdateGradMinMax_hetero<kDGLCUDA, int32_t, __hip_bfloat16>(
const HeteroGraphPtr& g, const std::string& op,
const std::vector<NDArray>& feat, const std::vector<NDArray>& idx,
const std::vector<NDArray>& idx_etype, std::vector<NDArray>* out);
template void UpdateGradMinMax_hetero<kDGLCUDA, int64_t, __nv_bfloat16>(
template void UpdateGradMinMax_hetero<kDGLCUDA, int64_t, __hip_bfloat16>(
const HeteroGraphPtr& g, const std::string& op,
const std::vector<NDArray>& feat, const std::vector<NDArray>& idx,
const std::vector<NDArray>& idx_etype, std::vector<NDArray>* out);
......@@ -139,9 +140,9 @@ template void BackwardSegmentCmp<kDGLCUDA, int32_t, __half>(
template void BackwardSegmentCmp<kDGLCUDA, int64_t, __half>(
NDArray feat, NDArray arg, NDArray out);
#if BF16_ENABLED
template void BackwardSegmentCmp<kDGLCUDA, int32_t, __nv_bfloat16>(
template void BackwardSegmentCmp<kDGLCUDA, int32_t, __hip_bfloat16>(
NDArray feat, NDArray arg, NDArray out);
template void BackwardSegmentCmp<kDGLCUDA, int64_t, __nv_bfloat16>(
template void BackwardSegmentCmp<kDGLCUDA, int64_t, __hip_bfloat16>(
NDArray feat, NDArray arg, NDArray out);
#endif // BF16_ENABLED
template void BackwardSegmentCmp<kDGLCUDA, int32_t, float>(
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2021 by contributors.
* @file array/cuda/spmat_op_impl_coo.cu
......@@ -10,8 +12,8 @@
#include <vector>
#include "../../runtime/cuda/cuda_common.h"
#include "./atomic.cuh"
#include "./utils.h"
#include "atomic.cuh"
#include "utils.h"
namespace dgl {
......@@ -72,7 +74,7 @@ __global__ void _COOGetRowNNZKernel(
template <DGLDeviceType XPU, typename IdType>
int64_t COOGetRowNNZ(COOMatrix coo, int64_t row) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const auto& ctx = coo.row->ctx;
IdType nnz = coo.row->shape[0];
IdType nt = 1024;
......@@ -103,7 +105,7 @@ __global__ void _COOGetAllRowNNZKernel(
template <DGLDeviceType XPU, typename IdType>
NDArray COOGetRowNNZ(COOMatrix coo, NDArray rows) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const auto& ctx = coo.row->ctx;
IdType nnz = coo.row->shape[0];
IdType num_rows = coo.num_rows;
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2020 by Contributors
* @file array/cuda/spmat_op_impl_csr.cu
......@@ -7,14 +9,14 @@
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
#include <cub/cub.cuh>
#include <hipcub/hipcub.hpp>
#include <numeric>
#include <unordered_set>
#include <vector>
#include "../../runtime/cuda/cuda_common.h"
#include "./atomic.cuh"
#include "./utils.h"
#include "atomic.cuh"
#include "utils.h"
namespace dgl {
......@@ -28,7 +30,7 @@ namespace impl {
template <DGLDeviceType XPU, typename IdType>
bool CSRIsNonZero(CSRMatrix csr, int64_t row, int64_t col) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const auto& ctx = csr.indptr->ctx;
IdArray rows = aten::VecToIdArray<int64_t>({row}, sizeof(IdType) * 8, ctx);
IdArray cols = aten::VecToIdArray<int64_t>({col}, sizeof(IdType) * 8, ctx);
......@@ -53,12 +55,12 @@ template <DGLDeviceType XPU, typename IdType>
NDArray CSRIsNonZero(CSRMatrix csr, NDArray row, NDArray col) {
const auto rowlen = row->shape[0];
const auto collen = col->shape[0];
const auto rstlen = std::max(rowlen, collen);
const auto rstlen = ::max(rowlen, collen);
NDArray rst = NDArray::Empty({rstlen}, row->dtype, row->ctx);
if (rstlen == 0) return rst;
const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1;
const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1;
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const int nt = dgl::cuda::FindNumThreads(rstlen);
const int nb = (rstlen + nt - 1) / nt;
const IdType* data = nullptr;
......@@ -104,7 +106,7 @@ template <DGLDeviceType XPU, typename IdType>
bool CSRHasDuplicate(CSRMatrix csr) {
if (!csr.sorted) csr = CSRSort(csr);
const auto& ctx = csr.indptr->ctx;
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
auto device = runtime::DeviceAPI::Get(ctx);
// We allocate a workspace of num_rows bytes. It wastes a little bit memory
// but should be fine.
......@@ -149,7 +151,7 @@ __global__ void _CSRGetRowNNZKernel(
template <DGLDeviceType XPU, typename IdType>
NDArray CSRGetRowNNZ(CSRMatrix csr, NDArray rows) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const auto len = rows->shape[0];
const IdType* vid_data = rows.Ptr<IdType>();
const IdType* indptr_data =
......@@ -250,7 +252,7 @@ __global__ void _SegmentCopyKernel(
template <DGLDeviceType XPU, typename IdType>
CSRMatrix CSRSliceRows(CSRMatrix csr, NDArray rows) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const int64_t len = rows->shape[0];
IdArray ret_indptr = aten::CumSum(aten::CSRGetRowNNZ(csr, rows), true);
const int64_t nnz = aten::IndexSelect<IdType>(ret_indptr, len);
......@@ -359,7 +361,7 @@ std::vector<NDArray> CSRGetDataAndIndices(
CSRMatrix csr, NDArray row, NDArray col) {
const auto rowlen = row->shape[0];
const auto collen = col->shape[0];
const auto len = std::max(rowlen, collen);
const auto len = ::max(rowlen, collen);
if (len == 0) return {NullArray(), NullArray(), NullArray()};
const auto& ctx = row->ctx;
......@@ -367,7 +369,7 @@ std::vector<NDArray> CSRGetDataAndIndices(
const int64_t nnz = csr.indices->shape[0];
const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1;
const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1;
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const IdType* indptr_data =
static_cast<IdType*>(GetDevicePointer(csr.indptr));
......@@ -532,7 +534,7 @@ __global__ void _SegmentMaskColKernel(
static_cast<IdType>(num_rows));
NodeQueryHashmap<IdType> hashmap(hashmap_buffer, buffer_size);
typedef cub::WarpReduce<IdType> WarpReduce;
typedef hipcub::WarpReduce<IdType> WarpReduce;
__shared__ typename WarpReduce::TempStorage temp_storage[BLOCK_WARPS];
while (out_row < last_row) {
......@@ -547,6 +549,7 @@ __global__ void _SegmentMaskColKernel(
}
}
IdType reduce_count = WarpReduce(temp_storage[warp_id]).Sum(local_count);
printf("out_row = %d , reduce_count = %d \n", out_row, reduce_count);
if (laneid == 0) {
count[out_row] = reduce_count;
}
......@@ -557,13 +560,16 @@ __global__ void _SegmentMaskColKernel(
template <DGLDeviceType XPU, typename IdType>
CSRMatrix CSRSliceMatrix(
CSRMatrix csr, runtime::NDArray rows, runtime::NDArray cols) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const auto& ctx = rows->ctx;
const auto& dtype = rows->dtype;
const auto nbits = dtype.bits;
const int64_t new_nrows = rows->shape[0];
const int64_t new_ncols = cols->shape[0];
std::cout << "new_nrows : " << new_nrows << std::endl;
std::cout << "new_ncols : " << new_ncols << std::endl;
if (new_nrows == 0 || new_ncols == 0)
return CSRMatrix(
new_nrows, new_ncols, Full(0, new_nrows + 1, nbits, ctx),
......@@ -572,6 +578,7 @@ CSRMatrix CSRSliceMatrix(
// First slice rows
csr = CSRSliceRows(csr, rows);
std::cout << "csr.indices->shape[0] : " << csr.indices->shape[0] << std::endl;
if (csr.indices->shape[0] == 0)
return CSRMatrix(
new_nrows, new_ncols, Full(0, new_nrows + 1, nbits, ctx),
......@@ -581,9 +588,11 @@ CSRMatrix CSRSliceMatrix(
IdArray mask = Full(0, csr.indices->shape[0], nbits, ctx);
// A count for how many masked values per row.
IdArray count = NewIdArray(csr.num_rows, ctx, nbits);
std::cout << "1 IdArray count : " << count << std::endl;
CUDA_CALL(
cudaMemset(count.Ptr<IdType>(), 0, sizeof(IdType) * (csr.num_rows)));
hipMemset(count.Ptr<IdType>(), 0, sizeof(IdType) * (csr.num_rows)));
std::cout << "2 IdArray count : " << count << std::endl;
// Generate a NodeQueryHashmap buffer. The key of the hashmap is col.
// For performance, the load factor of the hashmap is in (0.25, 0.5);
// Because num_cols is usually less than 1 Million (on GPU), the
......@@ -593,7 +602,7 @@ CSRMatrix CSRSliceMatrix(
using it = thrust::counting_iterator<int64_t>;
runtime::CUDAWorkspaceAllocator allocator(ctx);
const auto exec_policy = thrust::cuda::par_nosync(allocator).on(stream);
const auto exec_policy = thrust::hip::par_nosync(allocator).on(stream);
thrust::for_each(
exec_policy, it(0), it(new_ncols),
[key = cols.Ptr<IdType>(), buffer = hashmap_buffer.Ptr<IdType>(),
......@@ -617,20 +626,37 @@ CSRMatrix CSRSliceMatrix(
dgl::cuda::FindNumBlocks<'x'>((num_rows + TILE_SIZE - 1) / TILE_SIZE);
const dim3 nthrs(WARP_SIZE, BLOCK_WARPS);
const dim3 nblks(nb);
std::cout << "nthrs.x : " << nthrs.x << " nthrs.y : " << nthrs.y << " nthrs.z : " << nthrs.z << std::endl;
std::cout << "nblks.x : " << nblks.x << " nblks.y : " << nblks.y << " nblks.z : " << nblks.z << std::endl;
std::cout << "WARP_SIZE : " << WARP_SIZE << " BLOCK_WARPS : " << BLOCK_WARPS << "TILE_SIZE : " << std::endl;
std::cout << "indptr_data : " << indptr_data << std::endl;
std::cout << "indices_data : " << indices_data << std::endl;
std::cout << "num_rows : " << num_rows << std::endl;
std::cout << "buffer_size : " << buffer_size << std::endl;
std::cout << "mask : " << mask << std::endl;
std::cout << "count : " << count << std::endl;
std::cout << "hashmap_buffer : " << hashmap_buffer << std::endl;
CUDA_KERNEL_CALL(
(_SegmentMaskColKernel<IdType, WARP_SIZE, BLOCK_WARPS, TILE_SIZE>), nblks,
nthrs, 0, stream, indptr_data, indices_data, num_rows,
hashmap_buffer.Ptr<IdType>(), buffer_size, mask.Ptr<IdType>(),
count.Ptr<IdType>());
std::cout << "3 IdArray count : " << count << std::endl;
IdArray idx = AsNumBits(NonZero(mask), nbits);
std::cout << "idx->shape[0] : " << idx->shape[0] << std::endl;
if (idx->shape[0] == 0)
return CSRMatrix(
new_nrows, new_ncols, Full(0, new_nrows + 1, nbits, ctx),
NullArray(dtype, ctx), NullArray(dtype, ctx));
// Indptr needs to be adjusted according to the new nnz per row.
std::cout << " count : " << count << std::endl;
IdArray ret_indptr = CumSum(count, true);
std::cout << " IdArray ret_indptr : " << ret_indptr << std::endl;
// Column & data can be obtained by index select.
IdArray ret_col = IndexSelect(csr.indices, idx);
......@@ -641,6 +667,8 @@ CSRMatrix CSRSliceMatrix(
Scatter_(cols, Range(0, cols->shape[0], nbits, ctx), col_hash);
ret_col = IndexSelect(col_hash, ret_col);
// std::cout << "new_nrows : " << new_nrows << " new_ncols : " << new_ncols << " ret_indptr : " << ret_indptr << " ret_col : " << ret_col << " ret_data : " << std::endl;
return CSRMatrix(new_nrows, new_ncols, ret_indptr, ret_col, ret_data);
}
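// Host sketch of the column-slicing step implemented above: keep only the
// entries whose column id is among `cols`, rebuild indptr from the per-row
// counts, and remap kept columns to their new ids. The device code does this
// with a NodeQueryHashmap, a mask plus CumSum, and IndexSelect/Scatter_; the
// std::unordered_map here and the omission of the data array are illustrative.
#include <cstdint>
#include <unordered_map>
#include <vector>

inline void CSRSliceColsHostSketch(
    const std::vector<int64_t>& indptr, const std::vector<int64_t>& indices,
    const std::vector<int64_t>& cols,  // requested columns; new id = position
    std::vector<int64_t>* out_indptr, std::vector<int64_t>* out_indices) {
  std::unordered_map<int64_t, int64_t> new_id;
  for (size_t i = 0; i < cols.size(); ++i)
    new_id[cols[i]] = static_cast<int64_t>(i);
  const size_t num_rows = indptr.size() - 1;
  out_indptr->assign(num_rows + 1, 0);
  out_indices->clear();
  for (size_t r = 0; r < num_rows; ++r) {
    for (int64_t e = indptr[r]; e < indptr[r + 1]; ++e) {
      auto it = new_id.find(indices[e]);
      if (it != new_id.end()) out_indices->push_back(it->second);
    }
    (*out_indptr)[r + 1] = static_cast<int64_t>(out_indices->size());
  }
}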
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2020 by Contributors
* @file array/cuda/spmm.cuh
......@@ -11,7 +13,7 @@
#include <limits>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
#include "utils.h"
#include "atomic.cuh"
#include "bf16.cuh"
#include "fp16.cuh"
......@@ -28,14 +30,14 @@ namespace aten {
*/
template <typename DType, typename IdType>
inline bool cusparse_available(bool more_nnz_than_matrix_size) {
#if CUDART_VERSION < 11000
#if DTKRT_VERSION < 11000
if (std::is_same<IdType, int>::value &&
(std::is_same<DType, float>::value || std::is_same<DType, double>::value))
return true;
return false;
#else
if (std::is_same<DType, __half>::value ||
std::is_same<DType, __nv_bfloat16>::value)
std::is_same<DType, __hip_bfloat16>::value)
return false; // cusparse's SpMM on fp16 is slow, temporarily disabled.
// If the CSR matrix has more NNZ than matrix size, we should not use
// cuSPARSE 11.1.
......@@ -47,54 +49,54 @@ namespace {
/** @brief Call cuBLAS geam API for transpose operation for float and double. */
template <typename DType>
cublasStatus_t Xgeam(
cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
hipblasStatus_t Xgeam(
hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
int m, int n, const DType* alpha, const DType* A, int lda,
const DType* beta, const DType* B, int ldb, DType* C, int ldc) {
LOG(FATAL) << "Not supported dtype";
return CUBLAS_STATUS_EXECUTION_FAILED;
return HIPBLAS_STATUS_EXECUTION_FAILED;
}
template <>
cublasStatus_t Xgeam<__half>(
cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
hipblasStatus_t Xgeam<__half>(
hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
int m, int n, const __half* alpha, const __half* A, int lda,
const __half* beta, const __half* B, int ldb, __half* C, int ldc) {
// TODO(ndickson): There is no cublasHgeam, so a different
// implementation would be required.
LOG(FATAL) << "Xgeam does not support dtype half (FP16)";
return CUBLAS_STATUS_EXECUTION_FAILED;
return HIPBLAS_STATUS_EXECUTION_FAILED;
}
#if BF16_ENABLED
template <>
cublasStatus_t Xgeam<__nv_bfloat16>(
cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
int m, int n, const __nv_bfloat16* alpha, const __nv_bfloat16* A, int lda,
const __nv_bfloat16* beta, const __nv_bfloat16* B, int ldb,
__nv_bfloat16* C, int ldc) {
hipblasStatus_t Xgeam<__hip_bfloat16>(
hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
int m, int n, const __hip_bfloat16* alpha, const __hip_bfloat16* A, int lda,
const __hip_bfloat16* beta, const __hip_bfloat16* B, int ldb,
__hip_bfloat16* C, int ldc) {
// TODO(ndickson): There is no cublasHgeam, so a different
// implementation would be required.
LOG(FATAL) << "Xgeam does not support dtype bfloat16 (BF16)";
return CUBLAS_STATUS_EXECUTION_FAILED;
return HIPBLAS_STATUS_EXECUTION_FAILED;
}
#endif // BF16_ENABLED
template <>
cublasStatus_t Xgeam<float>(
cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
hipblasStatus_t Xgeam<float>(
hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
int m, int n, const float* alpha, const float* A, int lda,
const float* beta, const float* B, int ldb, float* C, int ldc) {
return cublasSgeam(
return hipblasSgeam(
handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
}
template <>
cublasStatus_t Xgeam<double>(
cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
hipblasStatus_t Xgeam<double>(
hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
int m, int n, const double* alpha, const double* A, int lda,
const double* beta, const double* B, int ldb, double* C, int ldc) {
return cublasDgeam(
return hipblasDgeam(
handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
}
......@@ -119,12 +121,12 @@ template <typename DType>
void _Transpose(const DType* in, DType* out, int row, int col) {
DType alpha = 1., beta = 0.;
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
if (!thr_entry->cublas_handle)
CUBLAS_CALL(cublasCreate(&(thr_entry->cublas_handle)));
CUBLAS_CALL(cublasSetStream(thr_entry->cublas_handle, stream));
CUBLAS_CALL(hipblasCreate(&(thr_entry->cublas_handle)));
CUBLAS_CALL(hipblasSetStream(thr_entry->cublas_handle, stream));
CUBLAS_CALL(Xgeam<DType>(
thr_entry->cublas_handle, CUBLAS_OP_T, CUBLAS_OP_N, row, col, &alpha, in,
thr_entry->cublas_handle, HIPBLAS_OP_T, HIPBLAS_OP_N, row, col, &alpha, in,
col, &beta, nullptr, row, out, row));
}
......@@ -134,7 +136,7 @@ void _Transpose(const DType* in, DType* out, int row, int col) {
*/
template <>
void _Transpose<__half>(const __half* in, __half* out, int row, int col) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int nt = FindNumThreads(row);
int nb = col;
CUDA_KERNEL_CALL(_TransposeKernel, nb, nt, 0, stream, in, out, col, row);
......@@ -146,47 +148,47 @@ void _Transpose<__half>(const __half* in, __half* out, int row, int col) {
* @note cuBLAS has no geam API for bf16 data type, fallback to our kernel.
*/
template <>
void _Transpose<__nv_bfloat16>(
const __nv_bfloat16* in, __nv_bfloat16* out, int row, int col) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
void _Transpose<__hip_bfloat16>(
const __hip_bfloat16* in, __hip_bfloat16* out, int row, int col) {
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int nt = FindNumThreads(row);
int nb = col;
CUDA_KERNEL_CALL(_TransposeKernel, nb, nt, 0, stream, in, out, col, row);
}
#endif // BF16_ENABLED
#if CUDART_VERSION < 11000
#if DTKRT_VERSION < 11000
template <typename DType>
cusparseStatus_t Xcsrmm2(
cusparseHandle_t handle, cusparseOperation_t transA,
cusparseOperation_t transB, int m, int n, int k, int nnz,
const DType* alpha, const cusparseMatDescr_t descrA, const DType* csrValA,
hipsparseStatus_t Xcsrmm2(
hipsparseHandle_t handle, hipsparseOperation_t transA,
hipsparseOperation_t transB, int m, int n, int k, int nnz,
const DType* alpha, const hipsparseMatDescr_t descrA, const DType* csrValA,
const int* csrRowPtrA, const int* csrColIndA, const DType* B, int ldb,
const DType* beta, DType* C, int ldc) {
LOG(INFO) << "Not supported dtype";
return CUSPARSE_STATUS_EXECUTION_FAILED;
return HIPSPARSE_STATUS_EXECUTION_FAILED;
}
template <>
cusparseStatus_t Xcsrmm2<float>(
cusparseHandle_t handle, cusparseOperation_t transA,
cusparseOperation_t transB, int m, int n, int k, int nnz,
const float* alpha, const cusparseMatDescr_t descrA, const float* csrValA,
hipsparseStatus_t Xcsrmm2<float>(
hipsparseHandle_t handle, hipsparseOperation_t transA,
hipsparseOperation_t transB, int m, int n, int k, int nnz,
const float* alpha, const hipsparseMatDescr_t descrA, const float* csrValA,
const int* csrRowPtrA, const int* csrColIndA, const float* B, int ldb,
const float* beta, float* C, int ldc) {
return cusparseScsrmm2(
return hipsparseScsrmm2(
handle, transA, transB, m, n, k, nnz, alpha, descrA, csrValA, csrRowPtrA,
csrColIndA, B, ldb, beta, C, ldc);
}
template <>
cusparseStatus_t Xcsrmm2<double>(
cusparseHandle_t handle, cusparseOperation_t transA,
cusparseOperation_t transB, int m, int n, int k, int nnz,
const double* alpha, const cusparseMatDescr_t descrA, const double* csrValA,
hipsparseStatus_t Xcsrmm2<double>(
hipsparseHandle_t handle, hipsparseOperation_t transA,
hipsparseOperation_t transB, int m, int n, int k, int nnz,
const double* alpha, const hipsparseMatDescr_t descrA, const double* csrValA,
const int* csrRowPtrA, const int* csrColIndA, const double* B, int ldb,
const double* beta, double* C, int ldc) {
return cusparseDcsrmm2(
return hipsparseDcsrmm2(
handle, transA, transB, m, n, k, nnz, alpha, descrA, csrValA, csrRowPtrA,
csrColIndA, B, ldb, beta, C, ldc);
}
......@@ -213,12 +215,12 @@ void CusparseCsrmm2(
// device
auto device = runtime::DeviceAPI::Get(ctx);
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
// allocate cusparse handle if needed
if (!thr_entry->cusparse_handle) {
CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
}
CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream));
CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream));
// all one data array
DType* valptr = nullptr;
if (!A_data) {
......@@ -226,52 +228,52 @@ void CusparseCsrmm2(
static_cast<DType*>(device->AllocWorkspace(ctx, nnz * sizeof(DType)));
_Fill(valptr, nnz, static_cast<DType>(1.));
}
#if CUDART_VERSION >= 11000
cusparseSpMatDescr_t matA;
cusparseDnMatDescr_t matB, matC;
#if DTKRT_VERSION >= 11000
hipsparseSpMatDescr_t matA;
hipsparseDnMatDescr_t matB, matC;
constexpr auto dtype = cuda_dtype<DType>::value;
constexpr auto idtype = cusparse_idtype<IdType>::value;
CUSPARSE_CALL(cusparseCreateCsr(
CUSPARSE_CALL(hipsparseCreateCsr(
&matA, m, k, nnz, static_cast<IdType*>(csr.indptr->data),
static_cast<IdType*>(csr.indices->data),
const_cast<DType*>(valptr ? valptr : A_data), idtype, idtype,
CUSPARSE_INDEX_BASE_ZERO, dtype));
CUSPARSE_CALL(cusparseCreateDnMat(
&matB, k, n, n, const_cast<DType*>(B_data), dtype, CUSPARSE_ORDER_ROW));
HIPSPARSE_INDEX_BASE_ZERO, dtype));
CUSPARSE_CALL(hipsparseCreateDnMat(
&matB, k, n, n, const_cast<DType*>(B_data), dtype, HIPSPARSE_ORDER_ROW));
CUSPARSE_CALL(
cusparseCreateDnMat(&matC, m, n, n, C_data, dtype, CUSPARSE_ORDER_ROW));
hipsparseCreateDnMat(&matC, m, n, n, C_data, dtype, HIPSPARSE_ORDER_ROW));
auto transA = CUSPARSE_OPERATION_NON_TRANSPOSE;
auto transB = CUSPARSE_OPERATION_NON_TRANSPOSE;
auto transA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
auto transB = HIPSPARSE_OPERATION_NON_TRANSPOSE;
size_t workspace_size;
CUSPARSE_CALL(cusparseSpMM_bufferSize(
CUSPARSE_CALL(hipsparseSpMM_bufferSize(
thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
matC, dtype, CUSPARSE_SPMM_CSR_ALG2, &workspace_size));
matC, dtype, HIPSPARSE_SPMM_CSR_ALG2, &workspace_size));
void* workspace = device->AllocWorkspace(ctx, workspace_size);
CUSPARSE_CALL(cusparseSpMM(
CUSPARSE_CALL(hipsparseSpMM(
thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
matC, dtype, CUSPARSE_SPMM_CSR_ALG2, workspace));
matC, dtype, HIPSPARSE_SPMM_CSR_ALG2, workspace));
device->FreeWorkspace(ctx, workspace);
CUSPARSE_CALL(cusparseDestroySpMat(matA));
CUSPARSE_CALL(cusparseDestroyDnMat(matB));
CUSPARSE_CALL(cusparseDestroyDnMat(matC));
CUSPARSE_CALL(hipsparseDestroySpMat(matA));
CUSPARSE_CALL(hipsparseDestroyDnMat(matB));
CUSPARSE_CALL(hipsparseDestroyDnMat(matC));
#else
// allocate matrix for temporary transposed output
DType* trans_out =
static_cast<DType*>(device->AllocWorkspace(ctx, m * n * sizeof(DType)));
cusparseMatDescr_t descr;
CUSPARSE_CALL(cusparseCreateMatDescr(&descr));
CUSPARSE_CALL(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL));
CUSPARSE_CALL(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO));
hipsparseMatDescr_t descr;
CUSPARSE_CALL(hipsparseCreateMatDescr(&descr));
CUSPARSE_CALL(hipsparseSetMatType(descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
CUSPARSE_CALL(hipsparseSetMatIndexBase(descr, HIPSPARSE_INDEX_BASE_ZERO));
CUSPARSE_CALL(Xcsrmm2<DType>(
thr_entry->cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
CUSPARSE_OPERATION_TRANSPOSE, m, n, k, nnz, &alpha, descr,
thr_entry->cusparse_handle, HIPSPARSE_OPERATION_NON_TRANSPOSE,
HIPSPARSE_OPERATION_TRANSPOSE, m, n, k, nnz, &alpha, descr,
(valptr) ? valptr : A_data, static_cast<int32_t*>(csr.indptr->data),
static_cast<int32_t*>(csr.indices->data), B_data, n, &beta, trans_out,
m));
CUSPARSE_CALL(cusparseDestroyMatDescr(descr));
CUSPARSE_CALL(hipsparseDestroyMatDescr(descr));
// transpose the output matrix
_Transpose(trans_out, C_data, n, m);
device->FreeWorkspace(ctx, trans_out);
......@@ -284,7 +286,7 @@ template <typename DType, typename IdType>
void CusparseCsrmm2Hetero(
const DGLContext& ctx, const CSRMatrix& csr, const DType* B_data,
const DType* A_data, DType* C_data, int64_t x_length,
cudaStream_t strm_id) {
hipStream_t strm_id) {
// We use csrmm2 to perform following operation:
// C = A x B, where A is a sparse matrix in csr format, B is the dense matrix
// for node feature tensor. However, since cusparse only supports
......@@ -307,9 +309,9 @@ void CusparseCsrmm2Hetero(
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
// allocate cusparse handle if needed
if (!thr_entry->cusparse_handle) {
CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
}
CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, strm_id));
CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, strm_id));
// all one data array
DType* valptr = nullptr;
if (!A_data) {
......@@ -317,48 +319,48 @@ void CusparseCsrmm2Hetero(
static_cast<DType*>(device->AllocWorkspace(ctx, nnz * sizeof(DType)));
_Fill(valptr, nnz, static_cast<DType>(1.));
}
#if CUDART_VERSION >= 11000
cusparseSpMatDescr_t matA;
cusparseDnMatDescr_t matB, matC;
#if DTKRT_VERSION >= 11000
hipsparseSpMatDescr_t matA;
hipsparseDnMatDescr_t matB, matC;
constexpr auto dtype = cuda_dtype<DType>::value;
constexpr auto idtype = cusparse_idtype<IdType>::value;
CUSPARSE_CALL(cusparseCreateCsr(
CUSPARSE_CALL(hipsparseCreateCsr(
&matA, m, k, nnz, static_cast<IdType*>(csr.indptr->data),
static_cast<IdType*>(csr.indices->data),
const_cast<DType*>(valptr ? valptr : A_data), idtype, idtype,
CUSPARSE_INDEX_BASE_ZERO, dtype));
CUSPARSE_CALL(cusparseCreateDnMat(
&matB, k, n, n, const_cast<DType*>(B_data), dtype, CUSPARSE_ORDER_ROW));
HIPSPARSE_INDEX_BASE_ZERO, dtype));
CUSPARSE_CALL(hipsparseCreateDnMat(
&matB, k, n, n, const_cast<DType*>(B_data), dtype, HIPSPARSE_ORDER_ROW));
CUSPARSE_CALL(
cusparseCreateDnMat(&matC, m, n, n, C_data, dtype, CUSPARSE_ORDER_ROW));
hipsparseCreateDnMat(&matC, m, n, n, C_data, dtype, HIPSPARSE_ORDER_ROW));
auto transA = CUSPARSE_OPERATION_NON_TRANSPOSE;
auto transB = CUSPARSE_OPERATION_NON_TRANSPOSE;
auto transA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
auto transB = HIPSPARSE_OPERATION_NON_TRANSPOSE;
size_t workspace_size;
CUSPARSE_CALL(cusparseSpMM_bufferSize(
CUSPARSE_CALL(hipsparseSpMM_bufferSize(
thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
matC, dtype, CUSPARSE_SPMM_CSR_ALG2, &workspace_size));
matC, dtype, HIPSPARSE_SPMM_CSR_ALG2, &workspace_size));
void* workspace = device->AllocWorkspace(ctx, workspace_size);
CUSPARSE_CALL(cusparseSpMM(
CUSPARSE_CALL(hipsparseSpMM(
thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
matC, dtype, CUSPARSE_SPMM_CSR_ALG2, workspace));
matC, dtype, HIPSPARSE_SPMM_CSR_ALG2, workspace));
device->FreeWorkspace(ctx, workspace);
CUSPARSE_CALL(cusparseDestroySpMat(matA));
CUSPARSE_CALL(cusparseDestroyDnMat(matB));
CUSPARSE_CALL(cusparseDestroyDnMat(matC));
CUSPARSE_CALL(hipsparseDestroySpMat(matA));
CUSPARSE_CALL(hipsparseDestroyDnMat(matB));
CUSPARSE_CALL(hipsparseDestroyDnMat(matC));
#else
cusparseMatDescr_t descr;
CUSPARSE_CALL(cusparseCreateMatDescr(&descr));
CUSPARSE_CALL(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL));
CUSPARSE_CALL(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO));
hipsparseMatDescr_t descr;
CUSPARSE_CALL(hipsparseCreateMatDescr(&descr));
CUSPARSE_CALL(hipsparseSetMatType(descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
CUSPARSE_CALL(hipsparseSetMatIndexBase(descr, HIPSPARSE_INDEX_BASE_ZERO));
CHECK_EQ(sizeof(IdType), sizeof(int32_t));
CUSPARSE_CALL(Xcsrmm2<DType>(
thr_entry->cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
CUSPARSE_OPERATION_TRANSPOSE, m, n, k, nnz, &alpha, descr,
thr_entry->cusparse_handle, HIPSPARSE_OPERATION_NON_TRANSPOSE,
HIPSPARSE_OPERATION_TRANSPOSE, m, n, k, nnz, &alpha, descr,
(valptr) ? valptr : A_data, static_cast<int32_t*>(csr.indptr->data),
static_cast<int32_t*>(csr.indices->data), B_data, n, &beta, C_data, m));
CUSPARSE_CALL(cusparseDestroyMatDescr(descr));
CUSPARSE_CALL(hipsparseDestroyMatDescr(descr));
#endif
if (valptr) device->FreeWorkspace(ctx, valptr);
}
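// Host reference for the operation the CusparseCsrmm2 routines above perform:
// C = A x B with A an m x k CSR matrix and B a row-major k x n dense matrix;
// a null A_data means every nonzero is treated as 1, matching the valptr fill
// above. The name and the std::vector parameters are illustrative.
#include <cstdint>
#include <vector>

template <typename DType, typename IdType>
void CsrmmHostSketch(
    int64_t m, int64_t n, const std::vector<IdType>& indptr,
    const std::vector<IdType>& indices, const DType* A_data, const DType* B,
    DType* C) {
  for (int64_t i = 0; i < m; ++i) {
    for (int64_t t = 0; t < n; ++t) C[i * n + t] = static_cast<DType>(0);
    for (IdType e = indptr[i]; e < indptr[i + 1]; ++e) {
      const IdType j = indices[e];
      const DType v = A_data ? A_data[e] : static_cast<DType>(1);
      for (int64_t t = 0; t < n; ++t) C[i * n + t] += v * B[j * n + t];
    }
  }
}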
......@@ -625,7 +627,7 @@ void SpMMCoo(
*/
#if BF16_ENABLED
if (std::is_same<DType, __half>::value ||
std::is_same<DType, __nv_bfloat16>::value)
std::is_same<DType, __hip_bfloat16>::value)
#else
if (std::is_same<DType, __half>::value)
#endif // BF16_ENABLED
......@@ -638,7 +640,7 @@ void SpMMCoo(
*efeat_data = efeat.Ptr<DType>();
DType* out_data = out.Ptr<DType>();
Idx *argu_data = argu.Ptr<Idx>(), *arge_data = arge.Ptr<Idx>();
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const int64_t N = coo.num_rows, M = coo.num_cols, E = coo.row->shape[0];
int64_t *ubcast_off = nullptr, *ebcast_off = nullptr;
......@@ -703,7 +705,7 @@ void SpMMCsr(
Idx* argu_data = argu.Ptr<Idx>();
Idx* arge_data = arge.Ptr<Idx>();
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int64_t *ubcast_off = nullptr, *ebcast_off = nullptr;
int64_t len = bcast.out_len, lhs_len = bcast.lhs_len, rhs_len = bcast.rhs_len;
......@@ -764,7 +766,7 @@ void SpMMCmpCsrHetero(
Idx* argu_data = argu.Ptr<Idx>();
Idx* arge_data = arge.Ptr<Idx>();
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int64_t *ubcast_off = nullptr, *ebcast_off = nullptr;
int64_t len = bcast.out_len, lhs_len = bcast.lhs_len, rhs_len = bcast.rhs_len;
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file array/cuda/spmm.cu
......@@ -6,9 +7,9 @@
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
#include "./functor.cuh"
#include "./ge_spmm.cuh"
#include "./spmm.cuh"
#include "functor.cuh"
#include "ge_spmm.cuh"
#include "spmm.cuh"
namespace dgl {
......@@ -109,11 +110,11 @@ template void SpMMCsr<kDGLCUDA, int64_t, __half>(
const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out,
std::vector<NDArray> out_aux);
#if BF16_ENABLED
template void SpMMCsr<kDGLCUDA, int32_t, __nv_bfloat16>(
template void SpMMCsr<kDGLCUDA, int32_t, __hip_bfloat16>(
const std::string& op, const std::string& reduce, const BcastOff& bcast,
const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out,
std::vector<NDArray> out_aux);
template void SpMMCsr<kDGLCUDA, int64_t, __nv_bfloat16>(
template void SpMMCsr<kDGLCUDA, int64_t, __hip_bfloat16>(
const std::string& op, const std::string& reduce, const BcastOff& bcast,
const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out,
std::vector<NDArray> out_aux);
......@@ -144,11 +145,11 @@ template void SpMMCoo<kDGLCUDA, int64_t, __half>(
const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out,
std::vector<NDArray> out_aux);
#if BF16_ENABLED
template void SpMMCoo<kDGLCUDA, int32_t, __nv_bfloat16>(
template void SpMMCoo<kDGLCUDA, int32_t, __hip_bfloat16>(
const std::string& op, const std::string& reduce, const BcastOff& bcast,
const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out,
std::vector<NDArray> out_aux);
template void SpMMCoo<kDGLCUDA, int64_t, __nv_bfloat16>(
template void SpMMCoo<kDGLCUDA, int64_t, __hip_bfloat16>(
const std::string& op, const std::string& reduce, const BcastOff& bcast,
const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out,
std::vector<NDArray> out_aux);
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2020 by Contributors
* @file array/cuda/spmm.cu
......@@ -6,9 +8,9 @@
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
#include "./functor.cuh"
#include "./ge_spmm.cuh"
#include "./spmm.cuh"
#include "functor.cuh"
#include "ge_spmm.cuh"
#include "spmm.cuh"
namespace dgl {
......@@ -37,7 +39,7 @@ void SpMMCsrHetero(
std::vector<DType*> trans_out((*vec_out).size(), NULL);
bool use_legacy_cusparsemm =
(CUDART_VERSION < 11000) && (reduce == "sum") &&
(DTKRT_VERSION < 11000) && (reduce == "sum") &&
// legacy cuSPARSE does not care about NNZ, hence the argument "false".
((op == "copy_lhs" && cusparse_available<DType, IdType>(false)) ||
(op == "mul" && is_scalar_efeat &&
......@@ -50,7 +52,7 @@ void SpMMCsrHetero(
if (m == 0) continue;
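// Allocate and zero a per-node-type scratch buffer; on the legacy (column-major)
// cuSPARSE path the result is written here and transposed into the final output later.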
DType* out = static_cast<DType*>(device->AllocWorkspace(
vec_csr[0].indptr->ctx, m * n * sizeof(DType)));
CUDA_CALL(cudaMemset(out, 0, m * n * sizeof(DType)));
CUDA_CALL(hipMemset(out, 0, m * n * sizeof(DType)));
trans_out[ntype] = out;
}
}
......@@ -111,7 +113,7 @@ void SpMMCsrHetero(
}
}
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
for (dgl_type_t etype = 0; etype < ufeat_ntids.size(); ++etype) {
const dgl_type_t src_id = ufeat_ntids[etype];
const dgl_type_t dst_id = out_ntids[etype];
......@@ -123,7 +125,7 @@ void SpMMCsrHetero(
cusparse_available<DType, IdType>(more_nnz)) { // cusparse
/* If the toolkit runtime version is below 11.0, put the output in trans_out for
 * later transposition */
DType* out = (CUDART_VERSION < 11000)
DType* out = (DTKRT_VERSION < 11000)
? trans_out[dst_id]
: static_cast<DType*>((*vec_out)[dst_id]->data);
CusparseCsrmm2Hetero<DType, IdType>(
......@@ -209,14 +211,14 @@ template void SpMMCsrHetero<kDGLCUDA, int64_t, __half>(
const std::vector<dgl_type_t>& ufeat_ntids,
const std::vector<dgl_type_t>& out_ntids);
#if BF16_ENABLED
template void SpMMCsrHetero<kDGLCUDA, int32_t, __nv_bfloat16>(
template void SpMMCsrHetero<kDGLCUDA, int32_t, __hip_bfloat16>(
const std::string& op, const std::string& reduce, const BcastOff& bcast,
const std::vector<CSRMatrix>& csr, const std::vector<NDArray>& ufeat,
const std::vector<NDArray>& efeat, std::vector<NDArray>* out,
std::vector<std::vector<NDArray>>* out_aux,
const std::vector<dgl_type_t>& ufeat_ntids,
const std::vector<dgl_type_t>& out_ntids);
template void SpMMCsrHetero<kDGLCUDA, int64_t, __nv_bfloat16>(
template void SpMMCsrHetero<kDGLCUDA, int64_t, __hip_bfloat16>(
const std::string& op, const std::string& reduce, const BcastOff& bcast,
const std::vector<CSRMatrix>& csr, const std::vector<NDArray>& ufeat,
const std::vector<NDArray>& efeat, std::vector<NDArray>* out,
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2020 by Contributors
* @file array/cuda/utils.h
......@@ -11,7 +13,7 @@
#include <dgl/runtime/ndarray.h>
#include <dmlc/logging.h>
#include <cub/cub.cuh>
#include <hipcub/hipcub.hpp>
#include <type_traits>
#include "../../runtime/cuda/cuda_common.h"
......@@ -90,7 +92,7 @@ inline int FindNumBlocks(int nblks, int max_nblks = -1) {
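// Read-only cached load: use __ldg when compiling device code, otherwise fall back
// to a plain dereference.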
template <typename T>
__device__ __forceinline__ T _ldg(T* addr) {
#if __CUDA_ARCH__ >= 350
#if __HIP_DEVICE_COMPILE__
return __ldg(addr);
#else
return *addr;
......@@ -126,7 +128,7 @@ __global__ void _FillKernel(DType* ptr, size_t length, DType val) {
/** @brief Fill the buffer of size `length` starting at `ptr` with `val`. */
template <typename DType>
void _Fill(DType* ptr, size_t length, DType val) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int nt = FindNumThreads(length);
int nb =
(length + nt - 1) / nt; // on x-axis, no need to worry about upperbound.
......@@ -185,8 +187,8 @@ template <typename IdType>
__global__ void _LinearSearchKernel(
const IdType* indptr, const IdType* indices, const IdType* data,
const IdType* row, const IdType* col, int64_t row_stride,
int64_t col_stride, int64_t length, const __nv_bfloat16* weights,
__nv_bfloat16 filler, __nv_bfloat16* out) {
int64_t col_stride, int64_t length, const __hip_bfloat16* weights,
__hip_bfloat16 filler, __hip_bfloat16* out) {
int tx = blockIdx.x * blockDim.x + threadIdx.x;
const int stride_x = gridDim.x * blockDim.x;
while (tx < length) {
......@@ -204,7 +206,7 @@ __global__ void _LinearSearchKernel(
} else {
// If the result is saved in bf16, it should be fine to convert it to
// float first
out[tx] = weights ? weights[v] : __nv_bfloat16(static_cast<float>(v));
out[tx] = weights ? weights[v] : __hip_bfloat16(static_cast<float>(v));
}
tx += stride_x;
}
......@@ -277,12 +279,12 @@ template <typename DType, typename BoolType>
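// Compacts `input` into `output`, keeping the elements whose mask entry is set.
// hipCUB's DeviceSelect::Flagged is called twice: the first call (null workspace)
// only queries the required temporary-storage size, the second performs the
// selection and writes the number of kept elements to `rst`.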
void MaskSelect(
runtime::DeviceAPI* device, const DGLContext& ctx, const DType* input,
const BoolType* mask, DType* output, int64_t n, int64_t* rst,
cudaStream_t stream) {
hipStream_t stream) {
size_t workspace_size = 0;
CUDA_CALL(cub::DeviceSelect::Flagged(
CUDA_CALL(hipcub::DeviceSelect::Flagged(
nullptr, workspace_size, input, mask, output, rst, n, stream));
void* workspace = device->AllocWorkspace(ctx, workspace_size);
CUDA_CALL(cub::DeviceSelect::Flagged(
CUDA_CALL(hipcub::DeviceSelect::Flagged(
workspace, workspace_size, input, mask, output, rst, n, stream));
device->FreeWorkspace(ctx, workspace);
}
......@@ -290,7 +292,7 @@ void MaskSelect(
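// Returns a pointer that device code can dereference; for pinned host arrays the
// device-visible alias is obtained via hipHostGetDevicePointer.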
inline void* GetDevicePointer(runtime::NDArray array) {
void* ptr = array->data;
if (array.IsPinned()) {
CUDA_CALL(cudaHostGetDevicePointer(&ptr, ptr, 0));
CUDA_CALL(hipHostGetDevicePointer(&ptr, ptr, 0));
}
return ptr;
}
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2020 by Contributors
* @file array/cuda/utils.cu
* @brief Utilities for CUDA kernels.
*/
#include <cub/cub.cuh>
#include <hipcub/hipcub.hpp>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
#include "utils.h"
namespace dgl {
namespace cuda {
......@@ -17,11 +19,11 @@ bool AllTrue(int8_t* flags, int64_t length, const DGLContext& ctx) {
int8_t* rst = static_cast<int8_t*>(device->AllocWorkspace(ctx, 1));
// Call hipCUB's min-reduction: the minimum is zero whenever any flag is unset.
size_t workspace_size = 0;
cudaStream_t stream = runtime::getCurrentCUDAStream();
CUDA_CALL(cub::DeviceReduce::Min(
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
CUDA_CALL(hipcub::DeviceReduce::Min(
nullptr, workspace_size, flags, rst, length, stream));
void* workspace = device->AllocWorkspace(ctx, workspace_size);
CUDA_CALL(cub::DeviceReduce::Min(
CUDA_CALL(hipcub::DeviceReduce::Min(
workspace, workspace_size, flags, rst, length, stream));
int8_t cpu_rst = GetCUDAScalar(device, ctx, rst);
device->FreeWorkspace(ctx, workspace);
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2021 by Contributors
* @file array/cpu/array_index_select_uvm.cuh
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2019-2022 by Contributors
* @file array/cuda/uvm/array_index_select_uvm.cu
......@@ -8,7 +10,7 @@
#include "../../../runtime/cuda/cuda_common.h"
#include "../array_index_select.cuh"
#include "../utils.h"
#include "./array_index_select_uvm.cuh"
#include "array_index_select_uvm.cuh"
namespace dgl {
using runtime::NDArray;
......@@ -17,7 +19,7 @@ namespace impl {
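// Gathers the rows selected by a GPU index array from a host-resident (UVM/pinned)
// feature array.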
template <typename DType, typename IdType>
NDArray IndexSelectCPUFromGPU(NDArray array, IdArray index) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const int64_t arr_len = array->shape[0];
const int64_t len = index->shape[0];
int64_t num_feat = 1;
......@@ -78,7 +80,7 @@ template NDArray IndexSelectCPUFromGPU<int64_t, int64_t>(NDArray, IdArray);
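// Scatters GPU-resident rows into a host-resident destination according to `index`;
// the counterpart of IndexSelectCPUFromGPU in the UVM path.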
template <typename DType, typename IdType>
void IndexScatterGPUToCPU(NDArray dest, IdArray index, NDArray source) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const DType* source_data = static_cast<DType*>(source->data);
const IdType* idx_data = static_cast<IdType*>(index->data);
const int64_t arr_len = dest->shape[0];
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2021 by Contributors
* @file array/filter.cc
* @brief Object for selecting items in a set, or selecting items not in a set.
*/
#include "./filter.h"
#include "filter.h"
#include <dgl/packed_func_ext.h>
#include <dgl/runtime/packed_func.h>
......@@ -23,7 +24,7 @@ DGL_REGISTER_GLOBAL("utils.filter._CAPI_DGLFilterCreateFromSet")
IdArray array = args[0];
auto ctx = array->ctx;
// TODO(nv-dlasalle): Implement CPU version.
if (ctx.device_type == kDGLCUDA) {
if (ctx.device_type == kDGLCUDA || ctx.device_type == kDGLROCM) {
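// ROCm devices take the same HIP-compiled code path as CUDA here.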
#ifdef DGL_USE_CUDA
ATEN_ID_TYPE_SWITCH(array->dtype, IdType, {
*rv = CreateSetFilter<kDGLCUDA, IdType>(array);
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file array/kernel.cc
......@@ -7,7 +8,7 @@
#include <dgl/packed_func_ext.h>
#include "../c_api_common.h"
#include "./check.h"
#include "check.h"
#include "kernel_decl.h"
using namespace dgl::runtime;
......