update src and graphbolt code

6ac701f8 · sangwzh · 1547bd93 · 6ac701f8 · 6ac701f8 · 6ac701f8
Commit 6ac701f8 authored Sep 13, 2024 by sangwzh
20 changed files
--- a/graphbolt/src/unique_and_compact.cc
+++ b/graphbolt/src/unique_and_compact.cc
+// !!! This is a file automatically generated by hipify!!!
 /**
 *  Copyright (c) 2023 by Contributors
 *
@@ -10,9 +11,9 @@
 #include <unordered_map>
-#include "./concurrent_id_hash_map.h"
+#include "concurrent_id_hash_map.h"
-#include "./macro.h"
+#include "macro.h"
-#include "./utils.h"
+#include "utils.h"
 namespace graphbolt {
 namespace sampling {

--- a/src/array/arith.h
+++ b/src/array/arith.h
+// !!! This is a file automatically generated by hipify!!!
 /**
 *  Copyright (c) 2019 by Contributors
 * @file array/arith.h
@@ -6,13 +7,13 @@
 #ifndef DGL_ARRAY_ARITH_H_
 #define DGL_ARRAY_ARITH_H_
-#ifdef __CUDACC__
+#ifdef __HIPCC__
-#define DGLDEVICE __device__
+#define DGLDEVICE __device__ __host__
 #define DGLINLINE __forceinline__
 #else
 #define DGLDEVICE
 #define DGLINLINE inline
-#endif  // __CUDACC__
+#endif  // __HIPCC__
 namespace dgl {
 namespace aten {

--- a/src/array/array.cc
+++ b/src/array/array.cc
+// !!! This is a file automatically generated by hipify!!!
 /**
 *  Copyright (c) 2019-2022 by Contributors
 * @file array/array.cc
@@ -14,9 +15,9 @@
 #include <sstream>
 #include "../c_api_common.h"
-#include "./arith.h"
+#include "arith.h"
-#include "./array_op.h"
+#include "array_op.h"
-#include "./kernel_decl.h"
+#include "kernel_decl.h"
 using namespace dgl::runtime;
@@ -585,7 +586,7 @@ COOMatrix CSRRowWiseSampling(
    // prob_or_mask is pinned and rows on GPU is valid
    CHECK_VALID_CONTEXT(prob_or_mask, rows);
    ATEN_CSR_SWITCH_CUDA_UVA(mat, rows, XPU, IdType, "CSRRowWiseSampling", {
-      CHECK(!(prob_or_mask->dtype.bits == 8 && XPU == kDGLCUDA))
+      CHECK(!(prob_or_mask->dtype.bits == 8 && (XPU == kDGLCUDA || XPU == kDGLROCM)))
          << "GPU sampling with masks is currently not supported yet.";
      ATEN_FLOAT_INT8_UINT8_TYPE_SWITCH(
          prob_or_mask->dtype, FloatType, "probability or mask", {

--- a/src/array/array_arith.cc
+++ b/src/array/array_arith.cc
+// !!! This is a file automatically generated by hipify!!!
 /**
 *  Copyright (c) 2019 by Contributors
 * @file array/array_aritch.cc
@@ -8,8 +9,8 @@
 #include <dgl/runtime/ndarray.h>
 #include "../c_api_common.h"
-#include "./arith.h"
+#include "arith.h"
-#include "./array_op.h"
+#include "array_op.h"
 using namespace dgl::runtime;

--- a/src/array/cpu/array_cumsum.cc
+++ b/src/array/cpu/array_cumsum.cc
@@ -29,6 +29,7 @@ IdArray CumSum(IdArray array, bool prepend_zero) {
    IdType* out_d = ret.Ptr<IdType>();
    out_d[0] = in_d[0];
    for (int64_t i = 1; i < len; ++i) out_d[i] = out_d[i - 1] + in_d[i];
+    std::cout << "limm cpu ret : " << ret << std::endl;
    return ret;
  }
 }

--- a/src/array/cpu/array_sort.cc
+++ b/src/array/cpu/array_sort.cc
@@ -48,7 +48,7 @@ void swap(const PairRef<V1, V2>& r1, const PairRef<V1, V2>& r2) {
 }
 template <typename V1, typename V2>
-struct PairIterator
+__host__ struct PairIterator
    : public std::iterator<
          std::random_access_iterator_tag, std::pair<V1, V2>, std::ptrdiff_t,
          std::pair<V1*, V2*>, PairRef<V1, V2>> {

--- a/src/array/cpu/gather_mm.cc
+++ b/src/array/cpu/gather_mm.cc
+// !!! This is a file automatically generated by hipify!!!
 /**
 *  Copyright (c) 2020 by Contributors
 * @file kernel/cpu/gaher_mm.cc
 * @brief GatherMM C APIs and definitions.
 */
-#include "./gather_mm.h"
+#include "gather_mm.h"
 #include <dgl/array.h>

--- a/src/array/cpu/labor_sampling.cc
+++ b/src/array/cpu/labor_sampling.cc
+// !!! This is a file automatically generated by hipify!!!
 /*!
 *   Copyright (c) 2022, NVIDIA Corporation
 *   Copyright (c) 2022, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -18,7 +19,7 @@
 * \file array/cuda/labor_sampling.cc
 * \brief labor sampling
 */
-#include "./labor_pick.h"
+#include "labor_pick.h"
 namespace dgl {
 namespace aten {

--- a/src/array/cpu/rowwise_sampling.cc
+++ b/src/array/cpu/rowwise_sampling.cc
+// !!! This is a file automatically generated by hipify!!!
 /**
 *  Copyright (c) 2020 by Contributors
 * @file array/cpu/rowwise_sampling.cc
@@ -7,7 +8,7 @@
 #include <numeric>
-#include "./rowwise_pick.h"
+#include "rowwise_pick.h"
 namespace dgl {
 namespace aten {

--- a/src/array/cpu/rowwise_topk.cc
+++ b/src/array/cpu/rowwise_topk.cc
+// !!! This is a file automatically generated by hipify!!!
 /**
 *  Copyright (c) 2020 by Contributors
 * @file array/cpu/rowwise_topk.cc
@@ -6,7 +7,7 @@
 #include <algorithm>
 #include <numeric>
-#include "./rowwise_pick.h"
+#include "rowwise_pick.h"
 namespace dgl {
 namespace aten {

--- a/src/array/cpu/sddmm.cc
+++ b/src/array/cpu/sddmm.cc
+// !!! This is a file automatically generated by hipify!!!
 /**
 *  Copyright (c) 2020 by Contributors
 * @file aten/cpu/sddmm.cc
 * @brief SDDMM C APIs and definitions.
 */
-#include "./sddmm.h"
+#include "sddmm.h"
 #include <dgl/array.h>

--- a/src/array/cpu/segment_reduce.cc
+++ b/src/array/cpu/segment_reduce.cc
+// !!! This is a file automatically generated by hipify!!!
 /**
 *  Copyright (c) 2020 by Contributors
 * @file kernel/cpu/segment_reduce.cc
 * @brief Segment reduce C APIs and definitions.
 */
-#include "./segment_reduce.h"
+#include "segment_reduce.h"
 #include <dgl/array.h>
 #include <string>
-#include "./spmm_binary_ops.h"
+#include "spmm_binary_ops.h"
 namespace dgl {
 namespace aten {

--- a/src/array/cpu/spmm.cc
+++ b/src/array/cpu/spmm.cc
+// !!! This is a file automatically generated by hipify!!!
 /**
 *  Copyright (c) 2020 by Contributors
 * @file kernel/cpu/spmm.cc
 * @brief SPMM C APIs and definitions.
 */
-#include "./spmm.h"
+#include "spmm.h"
 #include <dgl/array.h>

--- a/src/array/cpu/traversal.cc
+++ b/src/array/cpu/traversal.cc
+// !!! This is a file automatically generated by hipify!!!
 /**
 *  Copyright (c) 2020 by Contributors
 * @file array/cpu/traversal.cc
 * @brief Graph traversal implementation
 */
-#include "./traversal.h"
+#include "traversal.h"
 #include <dgl/graph_traversal.h>

--- a/src/array/cuda/array_cumsum.cu
+++ b/src/array/cuda/array_cumsum.cu
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
 *  Copyright (c) 2020 by Contributors
 * @file array/cpu/array_cumsum.cu
 * @brief Array cumsum GPU implementation
 */
 #include <dgl/array.h>
+#include "../../../include/dgl/array.h"
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
 #include "../../runtime/cuda/cuda_common.h"
-#include "./utils.h"
+#include "utils.h"
 namespace dgl {
 using runtime::NDArray;
@@ -23,7 +26,7 @@ IdArray CumSum(IdArray array, bool prepend_zero) {
                         : aten::Full(0, 1, array->dtype.bits, array->ctx);
  auto device = runtime::DeviceAPI::Get(array->ctx);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  const IdType* in_d = array.Ptr<IdType>();
  IdArray ret;
  IdType* out_d = nullptr;
@@ -36,16 +39,16 @@ IdArray CumSum(IdArray array, bool prepend_zero) {
  }
  // Allocate workspace
  size_t workspace_size = 0;
-  CUDA_CALL(cub::DeviceScan::InclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::InclusiveSum(
      nullptr, workspace_size, in_d, out_d, len, stream));
  void* workspace = device->AllocWorkspace(array->ctx, workspace_size);
  // Compute cumsum
-  CUDA_CALL(cub::DeviceScan::InclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::InclusiveSum(
      workspace, workspace_size, in_d, out_d, len, stream));
  device->FreeWorkspace(array->ctx, workspace);
+  std::cout << "cuda ret : " << ret << std::endl;
  return ret;
 }

--- a/src/array/cuda/array_index_select.cuh
+++ b/src/array/cuda/array_index_select.cuh
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
 *  Copyright (c) 2021-2022 by Contributors
 * @file array/cuda/array_index_select.cuh

--- a/src/array/cuda/array_index_select.cu
+++ b/src/array/cuda/array_index_select.cu
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
 *  Copyright (c) 2019 by Contributors
 * @file array/cpu/array_index_select.cu
 * @brief Array index select GPU implementation
 */
 #include <dgl/array.h>
+#include "../../../include/dgl/array.h"
 #include "../../runtime/cuda/cuda_common.h"
-#include "./array_index_select.cuh"
+#include "array_index_select.cuh"
-#include "./utils.h"
+#include "utils.h"
 namespace dgl {
 using runtime::NDArray;
@@ -33,7 +36,7 @@ NDArray IndexSelect(NDArray array, IdArray index) {
  const DType* array_data = static_cast<DType*>(cuda::GetDevicePointer(array));
  const IdType* idx_data = static_cast<IdType*>(index->data);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  if (num_feat == 1) {
    const int nt = cuda::FindNumThreads(len);
    const int nb = (len + nt - 1) / nt;
@@ -61,9 +64,9 @@ template NDArray IndexSelect<kDGLCUDA, int64_t, int64_t>(NDArray, IdArray);
 template NDArray IndexSelect<kDGLCUDA, __half, int32_t>(NDArray, IdArray);
 template NDArray IndexSelect<kDGLCUDA, __half, int64_t>(NDArray, IdArray);
 #if BF16_ENABLED
-template NDArray IndexSelect<kDGLCUDA, __nv_bfloat16, int32_t>(
+template NDArray IndexSelect<kDGLCUDA, __hip_bfloat16, int32_t>(
    NDArray, IdArray);
-template NDArray IndexSelect<kDGLCUDA, __nv_bfloat16, int64_t>(
+template NDArray IndexSelect<kDGLCUDA, __hip_bfloat16, int64_t>(
    NDArray, IdArray);
 #endif  // BF16_ENABLED
 template NDArray IndexSelect<kDGLCUDA, float, int32_t>(NDArray, IdArray);
@@ -87,7 +90,7 @@ template uint32_t IndexSelect<kDGLCUDA, uint32_t>(NDArray array, int64_t index);
 template uint64_t IndexSelect<kDGLCUDA, uint64_t>(NDArray array, int64_t index);
 template __half IndexSelect<kDGLCUDA, __half>(NDArray array, int64_t index);
 #if BF16_ENABLED
-template __nv_bfloat16 IndexSelect<kDGLCUDA, __nv_bfloat16>(
+template __hip_bfloat16 IndexSelect<kDGLCUDA, __hip_bfloat16>(
    NDArray array, int64_t index);
 #endif  // BF16_ENABLED
 template float IndexSelect<kDGLCUDA, float>(NDArray array, int64_t index);

--- a/src/array/cuda/array_nonzero.cu
+++ b/src/array/cuda/array_nonzero.cu
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
 *  Copyright (c) 2020 by Contributors
 * @file array/cpu/array_nonzero.cc
@@ -5,11 +7,13 @@
 */
 #include <dgl/array.h>
+#include "../../../include/dgl/array.h"
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
 #include "../../runtime/cuda/cuda_common.h"
-#include "./utils.h"
+#include "utils.h"
 namespace dgl {
 using runtime::NDArray;
@@ -33,24 +37,24 @@ IdArray NonZero(IdArray array) {
  const int64_t len = array->shape[0];
  IdArray ret = NewIdArray(len, ctx, 64);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  const IdType* const in_data = static_cast<const IdType*>(array->data);
  int64_t* const out_data = static_cast<int64_t*>(ret->data);
  IsNonZeroIndex<IdType> comp(in_data);
-  cub::CountingInputIterator<int64_t> counter(0);
+  hipcub::CountingInputIterator<int64_t> counter(0);
  // room for cub to output on GPU
  int64_t* d_num_nonzeros =
      static_cast<int64_t*>(device->AllocWorkspace(ctx, sizeof(int64_t)));
  size_t temp_size = 0;
-  CUDA_CALL(cub::DeviceSelect::If(
+  CUDA_CALL(hipcub::DeviceSelect::If(
      nullptr, temp_size, counter, out_data, d_num_nonzeros, len, comp,
      stream));
  void* temp = device->AllocWorkspace(ctx, temp_size);
-  CUDA_CALL(cub::DeviceSelect::If(
+  CUDA_CALL(hipcub::DeviceSelect::If(
      temp, temp_size, counter, out_data, d_num_nonzeros, len, comp, stream));
  device->FreeWorkspace(ctx, temp);

--- a/src/array/cuda/array_op_impl.cu
+++ b/src/array/cuda/array_op_impl.cu
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
 *  Copyright (c) 2020-2021 by Contributors
 * @file array/cuda/array_op_impl.cu
 * @brief Array operator GPU implementation
 */
 #include <dgl/array.h>
+#include "../../../include/dgl/array.h"
 #include "../../runtime/cuda/cuda_common.h"
 #include "../../runtime/cuda/cuda_hashtable.cuh"
 #include "../arith.h"
-#include "./utils.h"
+#include "utils.h"
 namespace dgl {
 using runtime::NDArray;
@@ -36,7 +40,7 @@ IdArray BinaryElewise(IdArray lhs, IdArray rhs) {
  const IdType* lhs_data = static_cast<IdType*>(lhs->data);
  const IdType* rhs_data = static_cast<IdType*>(rhs->data);
  IdType* ret_data = static_cast<IdType*>(ret->data);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  int nt = cuda::FindNumThreads(len);
  int nb = (len + nt - 1) / nt;
  CUDA_KERNEL_CALL(
@@ -107,7 +111,7 @@ IdArray BinaryElewise(IdArray lhs, IdType rhs) {
  IdArray ret = NewIdArray(lhs->shape[0], lhs->ctx, lhs->dtype.bits);
  const IdType* lhs_data = static_cast<IdType*>(lhs->data);
  IdType* ret_data = static_cast<IdType*>(ret->data);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  int nt = cuda::FindNumThreads(len);
  int nb = (len + nt - 1) / nt;
  CUDA_KERNEL_CALL(
@@ -178,7 +182,7 @@ IdArray BinaryElewise(IdType lhs, IdArray rhs) {
  IdArray ret = NewIdArray(rhs->shape[0], rhs->ctx, rhs->dtype.bits);
  const IdType* rhs_data = static_cast<IdType*>(rhs->data);
  IdType* ret_data = static_cast<IdType*>(ret->data);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  int nt = cuda::FindNumThreads(len);
  int nb = (len + nt - 1) / nt;
  CUDA_KERNEL_CALL(
@@ -249,7 +253,7 @@ IdArray UnaryElewise(IdArray lhs) {
  IdArray ret = NewIdArray(lhs->shape[0], lhs->ctx, lhs->dtype.bits);
  const IdType* lhs_data = static_cast<IdType*>(lhs->data);
  IdType* ret_data = static_cast<IdType*>(ret->data);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  int nt = cuda::FindNumThreads(len);
  int nb = (len + nt - 1) / nt;
  CUDA_KERNEL_CALL(
@@ -277,7 +281,7 @@ template <DGLDeviceType XPU, typename DType>
 NDArray Full(DType val, int64_t length, DGLContext ctx) {
  NDArray ret = NDArray::Empty({length}, DGLDataTypeTraits<DType>::dtype, ctx);
  DType* ret_data = static_cast<DType*>(ret->data);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  int nt = cuda::FindNumThreads(length);
  int nb = (length + nt - 1) / nt;
  CUDA_KERNEL_CALL(
@@ -292,8 +296,8 @@ template IdArray Full<kDGLCUDA, int64_t>(
 template IdArray Full<kDGLCUDA, __half>(
    __half val, int64_t length, DGLContext ctx);
 #if BF16_ENABLED
-template IdArray Full<kDGLCUDA, __nv_bfloat16>(
+template IdArray Full<kDGLCUDA, __hip_bfloat16>(
-    __nv_bfloat16 val, int64_t length, DGLContext ctx);
+    __hip_bfloat16 val, int64_t length, DGLContext ctx);
 #endif  // BF16_ENABLED
 template IdArray Full<kDGLCUDA, float>(
    float val, int64_t length, DGLContext ctx);
@@ -319,7 +323,7 @@ IdArray Range(IdType low, IdType high, DGLContext ctx) {
  IdArray ret = NewIdArray(length, ctx, sizeof(IdType) * 8);
  if (length == 0) return ret;
  IdType* ret_data = static_cast<IdType*>(ret->data);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  int nt = cuda::FindNumThreads(length);
  int nb = (length + nt - 1) / nt;
  CUDA_KERNEL_CALL(
@@ -355,7 +359,7 @@ IdArray Relabel_(const std::vector<IdArray>& arrays) {
  const auto& ctx = arrays[0]->ctx;
  auto device = runtime::DeviceAPI::Get(ctx);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  // build node maps and get the induced nodes
  OrderedHashTable<IdType> node_map(total_length, ctx, stream);
@@ -364,7 +368,7 @@ IdArray Relabel_(const std::vector<IdArray>& arrays) {
      static_cast<int64_t*>(device->AllocWorkspace(ctx, sizeof(int64_t)));
  IdArray induced_nodes = NewIdArray(total_length, ctx, sizeof(IdType) * 8);
-  CUDA_CALL(cudaMemsetAsync(
+  CUDA_CALL(hipMemsetAsync(
      num_induced_device, 0, sizeof(*num_induced_device), stream));
  node_map.FillWithDuplicates(
@@ -416,7 +420,7 @@ IdArray AsNumBits(IdArray arr, uint8_t bits) {
  const std::vector<int64_t> shape(arr->shape, arr->shape + arr->ndim);
  IdArray ret = IdArray::Empty(shape, DGLDataType{kDGLInt, bits, 1}, arr->ctx);
  const int64_t length = ret.NumElements();
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  int nt = cuda::FindNumThreads(length);
  int nb = (length + nt - 1) / nt;
  if (bits == 32) {

--- a/src/array/cuda/array_scatter.cu
+++ b/src/array/cuda/array_scatter.cu
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
 *  Copyright (c) 2019 by Contributors
 * @file array/cuda/array_scatter.cu
 * @brief Array scatter GPU implementation
 */
 #include <dgl/array.h>
+#include "../../../include/dgl/array.h"
 #include "../../runtime/cuda/cuda_common.h"
-#include "./utils.h"
+#include "utils.h"
 namespace dgl {
 using runtime::NDArray;
@@ -31,7 +35,7 @@ void Scatter_(IdArray index, NDArray value, NDArray out) {
  const DType* val = value.Ptr<DType>();
  DType* outd = out.Ptr<DType>();
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  const int nt = cuda::FindNumThreads(len);
  const int nb = (len + nt - 1) / nt;
  CUDA_KERNEL_CALL(_ScatterKernel, nb, nt, 0, stream, idx, val, len, outd);
@@ -41,7 +45,7 @@ template void Scatter_<kDGLCUDA, int32_t, int32_t>(IdArray, NDArray, NDArray);
 template void Scatter_<kDGLCUDA, int64_t, int32_t>(IdArray, NDArray, NDArray);
 template void Scatter_<kDGLCUDA, __half, int32_t>(IdArray, NDArray, NDArray);
 #if BF16_ENABLED
-template void Scatter_<kDGLCUDA, __nv_bfloat16, int32_t>(
+template void Scatter_<kDGLCUDA, __hip_bfloat16, int32_t>(
    IdArray, NDArray, NDArray);
 #endif  // BF16_ENABLED
 template void Scatter_<kDGLCUDA, float, int32_t>(IdArray, NDArray, NDArray);
@@ -50,7 +54,7 @@ template void Scatter_<kDGLCUDA, int32_t, int64_t>(IdArray, NDArray, NDArray);
 template void Scatter_<kDGLCUDA, int64_t, int64_t>(IdArray, NDArray, NDArray);
 template void Scatter_<kDGLCUDA, __half, int64_t>(IdArray, NDArray, NDArray);
 #if BF16_ENABLED
-template void Scatter_<kDGLCUDA, __nv_bfloat16, int64_t>(
+template void Scatter_<kDGLCUDA, __hip_bfloat16, int64_t>(
    IdArray, NDArray, NDArray);
 #endif  // BF16_ENABLED
 template void Scatter_<kDGLCUDA, float, int64_t>(IdArray, NDArray, NDArray);