"tests/python/vscode:/vscode.git/clone" did not exist on "3200b88b555f77e2a800b22f644cf62b827a57bc"
Commit 6ac701f8 authored by sangwzh
Browse files

update src and graphbolt code

parent 1547bd93
// !!! This is a file automatically generated by hipify!!!
/** /**
* Copyright (c) 2023 by Contributors * Copyright (c) 2023 by Contributors
* *
...@@ -10,9 +11,9 @@ ...@@ -10,9 +11,9 @@
#include <unordered_map> #include <unordered_map>
#include "./concurrent_id_hash_map.h" #include "concurrent_id_hash_map.h"
#include "./macro.h" #include "macro.h"
#include "./utils.h" #include "utils.h"
namespace graphbolt { namespace graphbolt {
namespace sampling { namespace sampling {
......
// !!! This is a file automatically generated by hipify!!!
/** /**
* Copyright (c) 2019 by Contributors * Copyright (c) 2019 by Contributors
* @file array/arith.h * @file array/arith.h
...@@ -6,13 +7,13 @@ ...@@ -6,13 +7,13 @@
#ifndef DGL_ARRAY_ARITH_H_ #ifndef DGL_ARRAY_ARITH_H_
#define DGL_ARRAY_ARITH_H_ #define DGL_ARRAY_ARITH_H_
#ifdef __CUDACC__ #ifdef __HIPCC__
#define DGLDEVICE __device__ #define DGLDEVICE __device__ __host__
#define DGLINLINE __forceinline__ #define DGLINLINE __forceinline__
#else #else
#define DGLDEVICE #define DGLDEVICE
#define DGLINLINE inline #define DGLINLINE inline
#endif // __CUDACC__ #endif // __HIPCC__
namespace dgl { namespace dgl {
namespace aten { namespace aten {
......
// !!! This is a file automatically generated by hipify!!!
/** /**
* Copyright (c) 2019-2022 by Contributors * Copyright (c) 2019-2022 by Contributors
* @file array/array.cc * @file array/array.cc
...@@ -14,9 +15,9 @@ ...@@ -14,9 +15,9 @@
#include <sstream> #include <sstream>
#include "../c_api_common.h" #include "../c_api_common.h"
#include "./arith.h" #include "arith.h"
#include "./array_op.h" #include "array_op.h"
#include "./kernel_decl.h" #include "kernel_decl.h"
using namespace dgl::runtime; using namespace dgl::runtime;
...@@ -585,7 +586,7 @@ COOMatrix CSRRowWiseSampling( ...@@ -585,7 +586,7 @@ COOMatrix CSRRowWiseSampling(
// prob_or_mask is pinned and rows on GPU is valid // prob_or_mask is pinned and rows on GPU is valid
CHECK_VALID_CONTEXT(prob_or_mask, rows); CHECK_VALID_CONTEXT(prob_or_mask, rows);
ATEN_CSR_SWITCH_CUDA_UVA(mat, rows, XPU, IdType, "CSRRowWiseSampling", { ATEN_CSR_SWITCH_CUDA_UVA(mat, rows, XPU, IdType, "CSRRowWiseSampling", {
CHECK(!(prob_or_mask->dtype.bits == 8 && XPU == kDGLCUDA)) CHECK(!(prob_or_mask->dtype.bits == 8 && (XPU == kDGLCUDA || XPU == kDGLROCM)))
<< "GPU sampling with masks is currently not supported yet."; << "GPU sampling with masks is currently not supported yet.";
ATEN_FLOAT_INT8_UINT8_TYPE_SWITCH( ATEN_FLOAT_INT8_UINT8_TYPE_SWITCH(
prob_or_mask->dtype, FloatType, "probability or mask", { prob_or_mask->dtype, FloatType, "probability or mask", {
......
// !!! This is a file automatically generated by hipify!!!
/** /**
* Copyright (c) 2019 by Contributors * Copyright (c) 2019 by Contributors
* @file array/array_aritch.cc * @file array/array_aritch.cc
...@@ -8,8 +9,8 @@ ...@@ -8,8 +9,8 @@
#include <dgl/runtime/ndarray.h> #include <dgl/runtime/ndarray.h>
#include "../c_api_common.h" #include "../c_api_common.h"
#include "./arith.h" #include "arith.h"
#include "./array_op.h" #include "array_op.h"
using namespace dgl::runtime; using namespace dgl::runtime;
......
...@@ -29,6 +29,7 @@ IdArray CumSum(IdArray array, bool prepend_zero) { ...@@ -29,6 +29,7 @@ IdArray CumSum(IdArray array, bool prepend_zero) {
IdType* out_d = ret.Ptr<IdType>(); IdType* out_d = ret.Ptr<IdType>();
out_d[0] = in_d[0]; out_d[0] = in_d[0];
for (int64_t i = 1; i < len; ++i) out_d[i] = out_d[i - 1] + in_d[i]; for (int64_t i = 1; i < len; ++i) out_d[i] = out_d[i - 1] + in_d[i];
std::cout << "limm cpu ret : " << ret << std::endl;
return ret; return ret;
} }
} }
......
...@@ -48,7 +48,7 @@ void swap(const PairRef<V1, V2>& r1, const PairRef<V1, V2>& r2) { ...@@ -48,7 +48,7 @@ void swap(const PairRef<V1, V2>& r1, const PairRef<V1, V2>& r2) {
} }
template <typename V1, typename V2> template <typename V1, typename V2>
struct PairIterator __host__ struct PairIterator
: public std::iterator< : public std::iterator<
std::random_access_iterator_tag, std::pair<V1, V2>, std::ptrdiff_t, std::random_access_iterator_tag, std::pair<V1, V2>, std::ptrdiff_t,
std::pair<V1*, V2*>, PairRef<V1, V2>> { std::pair<V1*, V2*>, PairRef<V1, V2>> {
......
// !!! This is a file automatically generated by hipify!!!
/** /**
* Copyright (c) 2020 by Contributors * Copyright (c) 2020 by Contributors
* @file kernel/cpu/gaher_mm.cc * @file kernel/cpu/gaher_mm.cc
* @brief GatherMM C APIs and definitions. * @brief GatherMM C APIs and definitions.
*/ */
#include "./gather_mm.h" #include "gather_mm.h"
#include <dgl/array.h> #include <dgl/array.h>
......
// !!! This is a file automatically generated by hipify!!!
/*! /*!
* Copyright (c) 2022, NVIDIA Corporation * Copyright (c) 2022, NVIDIA Corporation
* Copyright (c) 2022, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek) * Copyright (c) 2022, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
...@@ -18,7 +19,7 @@ ...@@ -18,7 +19,7 @@
* \file array/cuda/labor_sampling.cc * \file array/cuda/labor_sampling.cc
* \brief labor sampling * \brief labor sampling
*/ */
#include "./labor_pick.h" #include "labor_pick.h"
namespace dgl { namespace dgl {
namespace aten { namespace aten {
......
// !!! This is a file automatically generated by hipify!!!
/** /**
* Copyright (c) 2020 by Contributors * Copyright (c) 2020 by Contributors
* @file array/cpu/rowwise_sampling.cc * @file array/cpu/rowwise_sampling.cc
...@@ -7,7 +8,7 @@ ...@@ -7,7 +8,7 @@
#include <numeric> #include <numeric>
#include "./rowwise_pick.h" #include "rowwise_pick.h"
namespace dgl { namespace dgl {
namespace aten { namespace aten {
......
// !!! This is a file automatically generated by hipify!!!
/** /**
* Copyright (c) 2020 by Contributors * Copyright (c) 2020 by Contributors
* @file array/cpu/rowwise_topk.cc * @file array/cpu/rowwise_topk.cc
...@@ -6,7 +7,7 @@ ...@@ -6,7 +7,7 @@
#include <algorithm> #include <algorithm>
#include <numeric> #include <numeric>
#include "./rowwise_pick.h" #include "rowwise_pick.h"
namespace dgl { namespace dgl {
namespace aten { namespace aten {
......
// !!! This is a file automatically generated by hipify!!!
/** /**
* Copyright (c) 2020 by Contributors * Copyright (c) 2020 by Contributors
* @file aten/cpu/sddmm.cc * @file aten/cpu/sddmm.cc
* @brief SDDMM C APIs and definitions. * @brief SDDMM C APIs and definitions.
*/ */
#include "./sddmm.h" #include "sddmm.h"
#include <dgl/array.h> #include <dgl/array.h>
......
// !!! This is a file automatically generated by hipify!!!
/** /**
* Copyright (c) 2020 by Contributors * Copyright (c) 2020 by Contributors
* @file kernel/cpu/segment_reduce.cc * @file kernel/cpu/segment_reduce.cc
* @brief Segment reduce C APIs and definitions. * @brief Segment reduce C APIs and definitions.
*/ */
#include "./segment_reduce.h" #include "segment_reduce.h"
#include <dgl/array.h> #include <dgl/array.h>
#include <string> #include <string>
#include "./spmm_binary_ops.h" #include "spmm_binary_ops.h"
namespace dgl { namespace dgl {
namespace aten { namespace aten {
......
// !!! This is a file automatically generated by hipify!!!
/** /**
* Copyright (c) 2020 by Contributors * Copyright (c) 2020 by Contributors
* @file kernel/cpu/spmm.cc * @file kernel/cpu/spmm.cc
* @brief SPMM C APIs and definitions. * @brief SPMM C APIs and definitions.
*/ */
#include "./spmm.h" #include "spmm.h"
#include <dgl/array.h> #include <dgl/array.h>
......
// !!! This is a file automatically generated by hipify!!!
/** /**
* Copyright (c) 2020 by Contributors * Copyright (c) 2020 by Contributors
* @file array/cpu/traversal.cc * @file array/cpu/traversal.cc
* @brief Graph traversal implementation * @brief Graph traversal implementation
*/ */
#include "./traversal.h" #include "traversal.h"
#include <dgl/graph_traversal.h> #include <dgl/graph_traversal.h>
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/** /**
* Copyright (c) 2020 by Contributors * Copyright (c) 2020 by Contributors
* @file array/cpu/array_cumsum.cu * @file array/cpu/array_cumsum.cu
* @brief Array cumsum GPU implementation * @brief Array cumsum GPU implementation
*/ */
#include <dgl/array.h> #include <dgl/array.h>
#include "../../../include/dgl/array.h"
#include <cub/cub.cuh> #include <hipcub/hipcub.hpp>
#include "../../runtime/cuda/cuda_common.h" #include "../../runtime/cuda/cuda_common.h"
#include "./utils.h" #include "utils.h"
namespace dgl { namespace dgl {
using runtime::NDArray; using runtime::NDArray;
...@@ -23,7 +26,7 @@ IdArray CumSum(IdArray array, bool prepend_zero) { ...@@ -23,7 +26,7 @@ IdArray CumSum(IdArray array, bool prepend_zero) {
: aten::Full(0, 1, array->dtype.bits, array->ctx); : aten::Full(0, 1, array->dtype.bits, array->ctx);
auto device = runtime::DeviceAPI::Get(array->ctx); auto device = runtime::DeviceAPI::Get(array->ctx);
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const IdType* in_d = array.Ptr<IdType>(); const IdType* in_d = array.Ptr<IdType>();
IdArray ret; IdArray ret;
IdType* out_d = nullptr; IdType* out_d = nullptr;
...@@ -36,16 +39,16 @@ IdArray CumSum(IdArray array, bool prepend_zero) { ...@@ -36,16 +39,16 @@ IdArray CumSum(IdArray array, bool prepend_zero) {
} }
// Allocate workspace // Allocate workspace
size_t workspace_size = 0; size_t workspace_size = 0;
CUDA_CALL(cub::DeviceScan::InclusiveSum( CUDA_CALL(hipcub::DeviceScan::InclusiveSum(
nullptr, workspace_size, in_d, out_d, len, stream)); nullptr, workspace_size, in_d, out_d, len, stream));
void* workspace = device->AllocWorkspace(array->ctx, workspace_size); void* workspace = device->AllocWorkspace(array->ctx, workspace_size);
// Compute cumsum // Compute cumsum
CUDA_CALL(cub::DeviceScan::InclusiveSum( CUDA_CALL(hipcub::DeviceScan::InclusiveSum(
workspace, workspace_size, in_d, out_d, len, stream)); workspace, workspace_size, in_d, out_d, len, stream));
device->FreeWorkspace(array->ctx, workspace); device->FreeWorkspace(array->ctx, workspace);
std::cout << "cuda ret : " << ret << std::endl;
return ret; return ret;
} }
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/** /**
* Copyright (c) 2021-2022 by Contributors * Copyright (c) 2021-2022 by Contributors
* @file array/cuda/array_index_select.cuh * @file array/cuda/array_index_select.cuh
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/** /**
* Copyright (c) 2019 by Contributors * Copyright (c) 2019 by Contributors
* @file array/cpu/array_index_select.cu * @file array/cpu/array_index_select.cu
* @brief Array index select GPU implementation * @brief Array index select GPU implementation
*/ */
#include <dgl/array.h> #include <dgl/array.h>
#include "../../../include/dgl/array.h"
#include "../../runtime/cuda/cuda_common.h" #include "../../runtime/cuda/cuda_common.h"
#include "./array_index_select.cuh" #include "array_index_select.cuh"
#include "./utils.h" #include "utils.h"
namespace dgl { namespace dgl {
using runtime::NDArray; using runtime::NDArray;
...@@ -33,7 +36,7 @@ NDArray IndexSelect(NDArray array, IdArray index) { ...@@ -33,7 +36,7 @@ NDArray IndexSelect(NDArray array, IdArray index) {
const DType* array_data = static_cast<DType*>(cuda::GetDevicePointer(array)); const DType* array_data = static_cast<DType*>(cuda::GetDevicePointer(array));
const IdType* idx_data = static_cast<IdType*>(index->data); const IdType* idx_data = static_cast<IdType*>(index->data);
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
if (num_feat == 1) { if (num_feat == 1) {
const int nt = cuda::FindNumThreads(len); const int nt = cuda::FindNumThreads(len);
const int nb = (len + nt - 1) / nt; const int nb = (len + nt - 1) / nt;
...@@ -61,9 +64,9 @@ template NDArray IndexSelect<kDGLCUDA, int64_t, int64_t>(NDArray, IdArray); ...@@ -61,9 +64,9 @@ template NDArray IndexSelect<kDGLCUDA, int64_t, int64_t>(NDArray, IdArray);
template NDArray IndexSelect<kDGLCUDA, __half, int32_t>(NDArray, IdArray); template NDArray IndexSelect<kDGLCUDA, __half, int32_t>(NDArray, IdArray);
template NDArray IndexSelect<kDGLCUDA, __half, int64_t>(NDArray, IdArray); template NDArray IndexSelect<kDGLCUDA, __half, int64_t>(NDArray, IdArray);
#if BF16_ENABLED #if BF16_ENABLED
template NDArray IndexSelect<kDGLCUDA, __nv_bfloat16, int32_t>( template NDArray IndexSelect<kDGLCUDA, __hip_bfloat16, int32_t>(
NDArray, IdArray); NDArray, IdArray);
template NDArray IndexSelect<kDGLCUDA, __nv_bfloat16, int64_t>( template NDArray IndexSelect<kDGLCUDA, __hip_bfloat16, int64_t>(
NDArray, IdArray); NDArray, IdArray);
#endif // BF16_ENABLED #endif // BF16_ENABLED
template NDArray IndexSelect<kDGLCUDA, float, int32_t>(NDArray, IdArray); template NDArray IndexSelect<kDGLCUDA, float, int32_t>(NDArray, IdArray);
...@@ -87,7 +90,7 @@ template uint32_t IndexSelect<kDGLCUDA, uint32_t>(NDArray array, int64_t index); ...@@ -87,7 +90,7 @@ template uint32_t IndexSelect<kDGLCUDA, uint32_t>(NDArray array, int64_t index);
template uint64_t IndexSelect<kDGLCUDA, uint64_t>(NDArray array, int64_t index); template uint64_t IndexSelect<kDGLCUDA, uint64_t>(NDArray array, int64_t index);
template __half IndexSelect<kDGLCUDA, __half>(NDArray array, int64_t index); template __half IndexSelect<kDGLCUDA, __half>(NDArray array, int64_t index);
#if BF16_ENABLED #if BF16_ENABLED
template __nv_bfloat16 IndexSelect<kDGLCUDA, __nv_bfloat16>( template __hip_bfloat16 IndexSelect<kDGLCUDA, __hip_bfloat16>(
NDArray array, int64_t index); NDArray array, int64_t index);
#endif // BF16_ENABLED #endif // BF16_ENABLED
template float IndexSelect<kDGLCUDA, float>(NDArray array, int64_t index); template float IndexSelect<kDGLCUDA, float>(NDArray array, int64_t index);
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/** /**
* Copyright (c) 2020 by Contributors * Copyright (c) 2020 by Contributors
* @file array/cpu/array_nonzero.cc * @file array/cpu/array_nonzero.cc
...@@ -5,11 +7,13 @@ ...@@ -5,11 +7,13 @@
*/ */
#include <dgl/array.h> #include <dgl/array.h>
#include "../../../include/dgl/array.h"
#include <cub/cub.cuh>
#include <hipcub/hipcub.hpp>
#include "../../runtime/cuda/cuda_common.h" #include "../../runtime/cuda/cuda_common.h"
#include "./utils.h" #include "utils.h"
namespace dgl { namespace dgl {
using runtime::NDArray; using runtime::NDArray;
...@@ -33,24 +37,24 @@ IdArray NonZero(IdArray array) { ...@@ -33,24 +37,24 @@ IdArray NonZero(IdArray array) {
const int64_t len = array->shape[0]; const int64_t len = array->shape[0];
IdArray ret = NewIdArray(len, ctx, 64); IdArray ret = NewIdArray(len, ctx, 64);
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const IdType* const in_data = static_cast<const IdType*>(array->data); const IdType* const in_data = static_cast<const IdType*>(array->data);
int64_t* const out_data = static_cast<int64_t*>(ret->data); int64_t* const out_data = static_cast<int64_t*>(ret->data);
IsNonZeroIndex<IdType> comp(in_data); IsNonZeroIndex<IdType> comp(in_data);
cub::CountingInputIterator<int64_t> counter(0); hipcub::CountingInputIterator<int64_t> counter(0);
// room for cub to output on GPU // room for cub to output on GPU
int64_t* d_num_nonzeros = int64_t* d_num_nonzeros =
static_cast<int64_t*>(device->AllocWorkspace(ctx, sizeof(int64_t))); static_cast<int64_t*>(device->AllocWorkspace(ctx, sizeof(int64_t)));
size_t temp_size = 0; size_t temp_size = 0;
CUDA_CALL(cub::DeviceSelect::If( CUDA_CALL(hipcub::DeviceSelect::If(
nullptr, temp_size, counter, out_data, d_num_nonzeros, len, comp, nullptr, temp_size, counter, out_data, d_num_nonzeros, len, comp,
stream)); stream));
void* temp = device->AllocWorkspace(ctx, temp_size); void* temp = device->AllocWorkspace(ctx, temp_size);
CUDA_CALL(cub::DeviceSelect::If( CUDA_CALL(hipcub::DeviceSelect::If(
temp, temp_size, counter, out_data, d_num_nonzeros, len, comp, stream)); temp, temp_size, counter, out_data, d_num_nonzeros, len, comp, stream));
device->FreeWorkspace(ctx, temp); device->FreeWorkspace(ctx, temp);
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/** /**
* Copyright (c) 2020-2021 by Contributors * Copyright (c) 2020-2021 by Contributors
* @file array/cuda/array_op_impl.cu * @file array/cuda/array_op_impl.cu
* @brief Array operator GPU implementation * @brief Array operator GPU implementation
*/ */
#include <dgl/array.h> #include <dgl/array.h>
#include "../../../include/dgl/array.h"
#include "../../runtime/cuda/cuda_common.h" #include "../../runtime/cuda/cuda_common.h"
#include "../../runtime/cuda/cuda_hashtable.cuh" #include "../../runtime/cuda/cuda_hashtable.cuh"
#include "../arith.h" #include "../arith.h"
#include "./utils.h" #include "utils.h"
namespace dgl { namespace dgl {
using runtime::NDArray; using runtime::NDArray;
...@@ -36,7 +40,7 @@ IdArray BinaryElewise(IdArray lhs, IdArray rhs) { ...@@ -36,7 +40,7 @@ IdArray BinaryElewise(IdArray lhs, IdArray rhs) {
const IdType* lhs_data = static_cast<IdType*>(lhs->data); const IdType* lhs_data = static_cast<IdType*>(lhs->data);
const IdType* rhs_data = static_cast<IdType*>(rhs->data); const IdType* rhs_data = static_cast<IdType*>(rhs->data);
IdType* ret_data = static_cast<IdType*>(ret->data); IdType* ret_data = static_cast<IdType*>(ret->data);
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int nt = cuda::FindNumThreads(len); int nt = cuda::FindNumThreads(len);
int nb = (len + nt - 1) / nt; int nb = (len + nt - 1) / nt;
CUDA_KERNEL_CALL( CUDA_KERNEL_CALL(
...@@ -107,7 +111,7 @@ IdArray BinaryElewise(IdArray lhs, IdType rhs) { ...@@ -107,7 +111,7 @@ IdArray BinaryElewise(IdArray lhs, IdType rhs) {
IdArray ret = NewIdArray(lhs->shape[0], lhs->ctx, lhs->dtype.bits); IdArray ret = NewIdArray(lhs->shape[0], lhs->ctx, lhs->dtype.bits);
const IdType* lhs_data = static_cast<IdType*>(lhs->data); const IdType* lhs_data = static_cast<IdType*>(lhs->data);
IdType* ret_data = static_cast<IdType*>(ret->data); IdType* ret_data = static_cast<IdType*>(ret->data);
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int nt = cuda::FindNumThreads(len); int nt = cuda::FindNumThreads(len);
int nb = (len + nt - 1) / nt; int nb = (len + nt - 1) / nt;
CUDA_KERNEL_CALL( CUDA_KERNEL_CALL(
...@@ -178,7 +182,7 @@ IdArray BinaryElewise(IdType lhs, IdArray rhs) { ...@@ -178,7 +182,7 @@ IdArray BinaryElewise(IdType lhs, IdArray rhs) {
IdArray ret = NewIdArray(rhs->shape[0], rhs->ctx, rhs->dtype.bits); IdArray ret = NewIdArray(rhs->shape[0], rhs->ctx, rhs->dtype.bits);
const IdType* rhs_data = static_cast<IdType*>(rhs->data); const IdType* rhs_data = static_cast<IdType*>(rhs->data);
IdType* ret_data = static_cast<IdType*>(ret->data); IdType* ret_data = static_cast<IdType*>(ret->data);
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int nt = cuda::FindNumThreads(len); int nt = cuda::FindNumThreads(len);
int nb = (len + nt - 1) / nt; int nb = (len + nt - 1) / nt;
CUDA_KERNEL_CALL( CUDA_KERNEL_CALL(
...@@ -249,7 +253,7 @@ IdArray UnaryElewise(IdArray lhs) { ...@@ -249,7 +253,7 @@ IdArray UnaryElewise(IdArray lhs) {
IdArray ret = NewIdArray(lhs->shape[0], lhs->ctx, lhs->dtype.bits); IdArray ret = NewIdArray(lhs->shape[0], lhs->ctx, lhs->dtype.bits);
const IdType* lhs_data = static_cast<IdType*>(lhs->data); const IdType* lhs_data = static_cast<IdType*>(lhs->data);
IdType* ret_data = static_cast<IdType*>(ret->data); IdType* ret_data = static_cast<IdType*>(ret->data);
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int nt = cuda::FindNumThreads(len); int nt = cuda::FindNumThreads(len);
int nb = (len + nt - 1) / nt; int nb = (len + nt - 1) / nt;
CUDA_KERNEL_CALL( CUDA_KERNEL_CALL(
...@@ -277,7 +281,7 @@ template <DGLDeviceType XPU, typename DType> ...@@ -277,7 +281,7 @@ template <DGLDeviceType XPU, typename DType>
NDArray Full(DType val, int64_t length, DGLContext ctx) { NDArray Full(DType val, int64_t length, DGLContext ctx) {
NDArray ret = NDArray::Empty({length}, DGLDataTypeTraits<DType>::dtype, ctx); NDArray ret = NDArray::Empty({length}, DGLDataTypeTraits<DType>::dtype, ctx);
DType* ret_data = static_cast<DType*>(ret->data); DType* ret_data = static_cast<DType*>(ret->data);
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int nt = cuda::FindNumThreads(length); int nt = cuda::FindNumThreads(length);
int nb = (length + nt - 1) / nt; int nb = (length + nt - 1) / nt;
CUDA_KERNEL_CALL( CUDA_KERNEL_CALL(
...@@ -292,8 +296,8 @@ template IdArray Full<kDGLCUDA, int64_t>( ...@@ -292,8 +296,8 @@ template IdArray Full<kDGLCUDA, int64_t>(
template IdArray Full<kDGLCUDA, __half>( template IdArray Full<kDGLCUDA, __half>(
__half val, int64_t length, DGLContext ctx); __half val, int64_t length, DGLContext ctx);
#if BF16_ENABLED #if BF16_ENABLED
template IdArray Full<kDGLCUDA, __nv_bfloat16>( template IdArray Full<kDGLCUDA, __hip_bfloat16>(
__nv_bfloat16 val, int64_t length, DGLContext ctx); __hip_bfloat16 val, int64_t length, DGLContext ctx);
#endif // BF16_ENABLED #endif // BF16_ENABLED
template IdArray Full<kDGLCUDA, float>( template IdArray Full<kDGLCUDA, float>(
float val, int64_t length, DGLContext ctx); float val, int64_t length, DGLContext ctx);
...@@ -319,7 +323,7 @@ IdArray Range(IdType low, IdType high, DGLContext ctx) { ...@@ -319,7 +323,7 @@ IdArray Range(IdType low, IdType high, DGLContext ctx) {
IdArray ret = NewIdArray(length, ctx, sizeof(IdType) * 8); IdArray ret = NewIdArray(length, ctx, sizeof(IdType) * 8);
if (length == 0) return ret; if (length == 0) return ret;
IdType* ret_data = static_cast<IdType*>(ret->data); IdType* ret_data = static_cast<IdType*>(ret->data);
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int nt = cuda::FindNumThreads(length); int nt = cuda::FindNumThreads(length);
int nb = (length + nt - 1) / nt; int nb = (length + nt - 1) / nt;
CUDA_KERNEL_CALL( CUDA_KERNEL_CALL(
...@@ -355,7 +359,7 @@ IdArray Relabel_(const std::vector<IdArray>& arrays) { ...@@ -355,7 +359,7 @@ IdArray Relabel_(const std::vector<IdArray>& arrays) {
const auto& ctx = arrays[0]->ctx; const auto& ctx = arrays[0]->ctx;
auto device = runtime::DeviceAPI::Get(ctx); auto device = runtime::DeviceAPI::Get(ctx);
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
// build node maps and get the induced nodes // build node maps and get the induced nodes
OrderedHashTable<IdType> node_map(total_length, ctx, stream); OrderedHashTable<IdType> node_map(total_length, ctx, stream);
...@@ -364,7 +368,7 @@ IdArray Relabel_(const std::vector<IdArray>& arrays) { ...@@ -364,7 +368,7 @@ IdArray Relabel_(const std::vector<IdArray>& arrays) {
static_cast<int64_t*>(device->AllocWorkspace(ctx, sizeof(int64_t))); static_cast<int64_t*>(device->AllocWorkspace(ctx, sizeof(int64_t)));
IdArray induced_nodes = NewIdArray(total_length, ctx, sizeof(IdType) * 8); IdArray induced_nodes = NewIdArray(total_length, ctx, sizeof(IdType) * 8);
CUDA_CALL(cudaMemsetAsync( CUDA_CALL(hipMemsetAsync(
num_induced_device, 0, sizeof(*num_induced_device), stream)); num_induced_device, 0, sizeof(*num_induced_device), stream));
node_map.FillWithDuplicates( node_map.FillWithDuplicates(
...@@ -416,7 +420,7 @@ IdArray AsNumBits(IdArray arr, uint8_t bits) { ...@@ -416,7 +420,7 @@ IdArray AsNumBits(IdArray arr, uint8_t bits) {
const std::vector<int64_t> shape(arr->shape, arr->shape + arr->ndim); const std::vector<int64_t> shape(arr->shape, arr->shape + arr->ndim);
IdArray ret = IdArray::Empty(shape, DGLDataType{kDGLInt, bits, 1}, arr->ctx); IdArray ret = IdArray::Empty(shape, DGLDataType{kDGLInt, bits, 1}, arr->ctx);
const int64_t length = ret.NumElements(); const int64_t length = ret.NumElements();
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int nt = cuda::FindNumThreads(length); int nt = cuda::FindNumThreads(length);
int nb = (length + nt - 1) / nt; int nb = (length + nt - 1) / nt;
if (bits == 32) { if (bits == 32) {
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/** /**
* Copyright (c) 2019 by Contributors * Copyright (c) 2019 by Contributors
* @file array/cuda/array_scatter.cu * @file array/cuda/array_scatter.cu
* @brief Array scatter GPU implementation * @brief Array scatter GPU implementation
*/ */
#include <dgl/array.h> #include <dgl/array.h>
#include "../../../include/dgl/array.h"
#include "../../runtime/cuda/cuda_common.h" #include "../../runtime/cuda/cuda_common.h"
#include "./utils.h" #include "utils.h"
namespace dgl { namespace dgl {
using runtime::NDArray; using runtime::NDArray;
...@@ -31,7 +35,7 @@ void Scatter_(IdArray index, NDArray value, NDArray out) { ...@@ -31,7 +35,7 @@ void Scatter_(IdArray index, NDArray value, NDArray out) {
const DType* val = value.Ptr<DType>(); const DType* val = value.Ptr<DType>();
DType* outd = out.Ptr<DType>(); DType* outd = out.Ptr<DType>();
cudaStream_t stream = runtime::getCurrentCUDAStream(); hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const int nt = cuda::FindNumThreads(len); const int nt = cuda::FindNumThreads(len);
const int nb = (len + nt - 1) / nt; const int nb = (len + nt - 1) / nt;
CUDA_KERNEL_CALL(_ScatterKernel, nb, nt, 0, stream, idx, val, len, outd); CUDA_KERNEL_CALL(_ScatterKernel, nb, nt, 0, stream, idx, val, len, outd);
...@@ -41,7 +45,7 @@ template void Scatter_<kDGLCUDA, int32_t, int32_t>(IdArray, NDArray, NDArray); ...@@ -41,7 +45,7 @@ template void Scatter_<kDGLCUDA, int32_t, int32_t>(IdArray, NDArray, NDArray);
template void Scatter_<kDGLCUDA, int64_t, int32_t>(IdArray, NDArray, NDArray); template void Scatter_<kDGLCUDA, int64_t, int32_t>(IdArray, NDArray, NDArray);
template void Scatter_<kDGLCUDA, __half, int32_t>(IdArray, NDArray, NDArray); template void Scatter_<kDGLCUDA, __half, int32_t>(IdArray, NDArray, NDArray);
#if BF16_ENABLED #if BF16_ENABLED
template void Scatter_<kDGLCUDA, __nv_bfloat16, int32_t>( template void Scatter_<kDGLCUDA, __hip_bfloat16, int32_t>(
IdArray, NDArray, NDArray); IdArray, NDArray, NDArray);
#endif // BF16_ENABLED #endif // BF16_ENABLED
template void Scatter_<kDGLCUDA, float, int32_t>(IdArray, NDArray, NDArray); template void Scatter_<kDGLCUDA, float, int32_t>(IdArray, NDArray, NDArray);
...@@ -50,7 +54,7 @@ template void Scatter_<kDGLCUDA, int32_t, int64_t>(IdArray, NDArray, NDArray); ...@@ -50,7 +54,7 @@ template void Scatter_<kDGLCUDA, int32_t, int64_t>(IdArray, NDArray, NDArray);
template void Scatter_<kDGLCUDA, int64_t, int64_t>(IdArray, NDArray, NDArray); template void Scatter_<kDGLCUDA, int64_t, int64_t>(IdArray, NDArray, NDArray);
template void Scatter_<kDGLCUDA, __half, int64_t>(IdArray, NDArray, NDArray); template void Scatter_<kDGLCUDA, __half, int64_t>(IdArray, NDArray, NDArray);
#if BF16_ENABLED #if BF16_ENABLED
template void Scatter_<kDGLCUDA, __nv_bfloat16, int64_t>( template void Scatter_<kDGLCUDA, __hip_bfloat16, int64_t>(
IdArray, NDArray, NDArray); IdArray, NDArray, NDArray);
#endif // BF16_ENABLED #endif // BF16_ENABLED
template void Scatter_<kDGLCUDA, float, int64_t>(IdArray, NDArray, NDArray); template void Scatter_<kDGLCUDA, float, int64_t>(IdArray, NDArray, NDArray);
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment