Commit 6ac701f8 authored by sangwzh

update src and graphbolt code

parent 1547bd93
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
*
@@ -10,9 +11,9 @@
#include <unordered_map>
#include "./concurrent_id_hash_map.h"
#include "./macro.h"
#include "./utils.h"
#include "concurrent_id_hash_map.h"
#include "macro.h"
#include "utils.h"
namespace graphbolt {
namespace sampling {
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2019 by Contributors
* @file array/arith.h
@@ -6,13 +7,13 @@
#ifndef DGL_ARRAY_ARITH_H_
#define DGL_ARRAY_ARITH_H_
#ifdef __CUDACC__
#define DGLDEVICE __device__
#ifdef __HIPCC__
#define DGLDEVICE __device__ __host__
#define DGLINLINE __forceinline__
#else
#define DGLDEVICE
#define DGLINLINE inline
#endif // __CUDACC__
#endif // __HIPCC__
namespace dgl {
namespace aten {
......
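For context, a minimal sketch of how the DGLDEVICE / DGLINLINE pair defined above is meant to be used: under hipcc (__HIPCC__ defined) it expands to __device__ __host__ __forceinline__, so one functor body serves both host code and HIP kernels, while a plain host build gets an ordinary inline function. The Add functor below is illustrative only, not part of this commit.

// Illustrative use of the macros from arith.h above.
namespace dgl {
namespace aten {
struct Add {
  // Expands to __device__ __host__ __forceinline__ when compiled with
  // hipcc, and to a plain inline function in a host-only build.
  template <typename DType>
  static DGLDEVICE DGLINLINE DType Call(DType a, DType b) {
    return a + b;
  }
};
}  // namespace aten
}  // namespace dgl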
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2019-2022 by Contributors
* @file array/array.cc
@@ -14,9 +15,9 @@
#include <sstream>
#include "../c_api_common.h"
#include "./arith.h"
#include "./array_op.h"
#include "./kernel_decl.h"
#include "arith.h"
#include "array_op.h"
#include "kernel_decl.h"
using namespace dgl::runtime;
@@ -585,7 +586,7 @@ COOMatrix CSRRowWiseSampling(
// prob_or_mask is pinned and rows on GPU is valid
CHECK_VALID_CONTEXT(prob_or_mask, rows);
ATEN_CSR_SWITCH_CUDA_UVA(mat, rows, XPU, IdType, "CSRRowWiseSampling", {
CHECK(!(prob_or_mask->dtype.bits == 8 && XPU == kDGLCUDA))
CHECK(!(prob_or_mask->dtype.bits == 8 && (XPU == kDGLCUDA || XPU == kDGLROCM)))
<< "GPU sampling with masks is currently not supported yet.";
ATEN_FLOAT_INT8_UINT8_TYPE_SWITCH(
prob_or_mask->dtype, FloatType, "probability or mask", {
......
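The hunk above widens the 8-bit mask guard from CUDA-only to any GPU backend. Below is a minimal sketch of the same condition factored into a helper predicate; the helper name is illustrative, and the actual code inlines the comparison as shown above.

// Illustrative helper: true when the dispatched device type is a GPU
// backend (kDGLCUDA for NVIDIA, kDGLROCM for AMD, as used above).
inline bool IsGPUDeviceType(DGLDeviceType xpu) {
  return xpu == kDGLCUDA || xpu == kDGLROCM;
}

// Equivalent to the updated CHECK: 8-bit (mask) probabilities remain
// CPU-only, so they are rejected on any GPU device type.
//   CHECK(!(prob_or_mask->dtype.bits == 8 && IsGPUDeviceType(XPU)))
//       << "GPU sampling with masks is currently not supported yet.";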
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2019 by Contributors
* @file array/array_arith.cc
@@ -8,8 +9,8 @@
#include <dgl/runtime/ndarray.h>
#include "../c_api_common.h"
#include "./arith.h"
#include "./array_op.h"
#include "arith.h"
#include "array_op.h"
using namespace dgl::runtime;
......
@@ -29,6 +29,7 @@ IdArray CumSum(IdArray array, bool prepend_zero) {
IdType* out_d = ret.Ptr<IdType>();
out_d[0] = in_d[0];
for (int64_t i = 1; i < len; ++i) out_d[i] = out_d[i - 1] + in_d[i];
std::cout << "limm cpu ret : " << ret << std::endl;
return ret;
}
}
......
@@ -48,7 +48,7 @@ void swap(const PairRef<V1, V2>& r1, const PairRef<V1, V2>& r2) {
}
template <typename V1, typename V2>
struct PairIterator
__host__ struct PairIterator
: public std::iterator<
std::random_access_iterator_tag, std::pair<V1, V2>, std::ptrdiff_t,
std::pair<V1*, V2*>, PairRef<V1, V2>> {
......
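A note on the __host__ qualifier added above: in both CUDA and HIP, __host__ / __device__ are function execution-space qualifiers and do not attach to a class definition (compilers warn about or reject the attribute there). If the intent is to make the iterator usable from device code, the usual pattern is to qualify its member functions instead. A minimal illustrative sketch of that pattern, not the DGL implementation:

template <typename V1, typename V2>
struct PairIterator
    : public std::iterator<
          std::random_access_iterator_tag, std::pair<V1, V2>, std::ptrdiff_t,
          std::pair<V1*, V2*>, PairRef<V1, V2>> {
  // Qualify the members, not the type.
  __host__ __device__ PairIterator(V1* a, V2* b) : a_(a), b_(b) {}
  __host__ __device__ PairRef<V1, V2> operator*() const {
    return PairRef<V1, V2>(a_, b_);
  }
  __host__ __device__ PairIterator& operator++() {
    ++a_;
    ++b_;
    return *this;
  }
  V1* a_;
  V2* b_;
};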
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file kernel/cpu/gather_mm.cc
* @brief GatherMM C APIs and definitions.
*/
#include "./gather_mm.h"
#include "gather_mm.h"
#include <dgl/array.h>
......
// !!! This is a file automatically generated by hipify!!!
/*!
* Copyright (c) 2022, NVIDIA Corporation
* Copyright (c) 2022, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -18,7 +19,7 @@
* \file array/cuda/labor_sampling.cc
* \brief labor sampling
*/
#include "./labor_pick.h"
#include "labor_pick.h"
namespace dgl {
namespace aten {
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file array/cpu/rowwise_sampling.cc
@@ -7,7 +8,7 @@
#include <numeric>
#include "./rowwise_pick.h"
#include "rowwise_pick.h"
namespace dgl {
namespace aten {
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file array/cpu/rowwise_topk.cc
@@ -6,7 +7,7 @@
#include <algorithm>
#include <numeric>
#include "./rowwise_pick.h"
#include "rowwise_pick.h"
namespace dgl {
namespace aten {
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file aten/cpu/sddmm.cc
* @brief SDDMM C APIs and definitions.
*/
#include "./sddmm.h"
#include "sddmm.h"
#include <dgl/array.h>
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file kernel/cpu/segment_reduce.cc
* @brief Segment reduce C APIs and definitions.
*/
#include "./segment_reduce.h"
#include "segment_reduce.h"
#include <dgl/array.h>
#include <string>
#include "./spmm_binary_ops.h"
#include "spmm_binary_ops.h"
namespace dgl {
namespace aten {
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file kernel/cpu/spmm.cc
* @brief SPMM C APIs and definitions.
*/
#include "./spmm.h"
#include "spmm.h"
#include <dgl/array.h>
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file array/cpu/traversal.cc
* @brief Graph traversal implementation
*/
#include "./traversal.h"
#include "traversal.h"
#include <dgl/graph_traversal.h>
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2020 by Contributors
* @file array/cpu/array_cumsum.cu
* @brief Array cumsum GPU implementation
*/
#include <dgl/array.h>
#include "../../../include/dgl/array.h"
#include <cub/cub.cuh>
#include <hipcub/hipcub.hpp>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
#include "utils.h"
namespace dgl {
using runtime::NDArray;
@@ -23,7 +26,7 @@ IdArray CumSum(IdArray array, bool prepend_zero) {
: aten::Full(0, 1, array->dtype.bits, array->ctx);
auto device = runtime::DeviceAPI::Get(array->ctx);
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const IdType* in_d = array.Ptr<IdType>();
IdArray ret;
IdType* out_d = nullptr;
@@ -36,16 +39,16 @@ IdArray CumSum(IdArray array, bool prepend_zero) {
}
// Allocate workspace
size_t workspace_size = 0;
CUDA_CALL(cub::DeviceScan::InclusiveSum(
CUDA_CALL(hipcub::DeviceScan::InclusiveSum(
nullptr, workspace_size, in_d, out_d, len, stream));
void* workspace = device->AllocWorkspace(array->ctx, workspace_size);
// Compute cumsum
CUDA_CALL(cub::DeviceScan::InclusiveSum(
CUDA_CALL(hipcub::DeviceScan::InclusiveSum(
workspace, workspace_size, in_d, out_d, len, stream));
device->FreeWorkspace(array->ctx, workspace);
std::cout << "cuda ret : " << ret << std::endl;
return ret;
}
......
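For reference, a self-contained sketch of the two-phase hipcub::DeviceScan::InclusiveSum pattern used above: the first call with a null workspace pointer only reports the required temporary-storage size, the second call performs the scan. Buffer handling and the default stream here are illustrative; DGL obtains its stream from runtime::getCurrentHIPStreamMasqueradingAsCUDA() and its workspace from DeviceAPI::AllocWorkspace, as the hunk shows.

#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>
#include <cstdint>
#include <cstdio>

int main() {
  const int len = 8;
  const int64_t h_in[len] = {1, 2, 3, 4, 5, 6, 7, 8};
  int64_t *d_in = nullptr, *d_out = nullptr;
  hipMalloc(reinterpret_cast<void**>(&d_in), len * sizeof(int64_t));
  hipMalloc(reinterpret_cast<void**>(&d_out), len * sizeof(int64_t));
  hipMemcpy(d_in, h_in, len * sizeof(int64_t), hipMemcpyHostToDevice);
  hipStream_t stream = 0;  // illustrative; DGL uses its current stream

  // Phase 1: null workspace pointer -> only the required size is written.
  size_t workspace_size = 0;
  hipcub::DeviceScan::InclusiveSum(
      nullptr, workspace_size, d_in, d_out, len, stream);

  // Phase 2: run the scan with an allocated workspace (error checks omitted).
  void* workspace = nullptr;
  hipMalloc(&workspace, workspace_size);
  hipcub::DeviceScan::InclusiveSum(
      workspace, workspace_size, d_in, d_out, len, stream);

  int64_t h_out[len];
  hipMemcpy(h_out, d_out, len * sizeof(int64_t), hipMemcpyDeviceToHost);
  for (int i = 0; i < len; ++i) printf("%lld ", (long long)h_out[i]);
  printf("\n");  // expected: 1 3 6 10 15 21 28 36

  hipFree(workspace);
  hipFree(d_in);
  hipFree(d_out);
  return 0;
}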
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2021-2022 by Contributors
* @file array/cuda/array_index_select.cuh
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2019 by Contributors
* @file array/cpu/array_index_select.cu
* @brief Array index select GPU implementation
*/
#include <dgl/array.h>
#include "../../../include/dgl/array.h"
#include "../../runtime/cuda/cuda_common.h"
#include "./array_index_select.cuh"
#include "./utils.h"
#include "array_index_select.cuh"
#include "utils.h"
namespace dgl {
using runtime::NDArray;
@@ -33,7 +36,7 @@ NDArray IndexSelect(NDArray array, IdArray index) {
const DType* array_data = static_cast<DType*>(cuda::GetDevicePointer(array));
const IdType* idx_data = static_cast<IdType*>(index->data);
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
if (num_feat == 1) {
const int nt = cuda::FindNumThreads(len);
const int nb = (len + nt - 1) / nt;
@@ -61,9 +64,9 @@ template NDArray IndexSelect<kDGLCUDA, int64_t, int64_t>(NDArray, IdArray);
template NDArray IndexSelect<kDGLCUDA, __half, int32_t>(NDArray, IdArray);
template NDArray IndexSelect<kDGLCUDA, __half, int64_t>(NDArray, IdArray);
#if BF16_ENABLED
template NDArray IndexSelect<kDGLCUDA, __nv_bfloat16, int32_t>(
template NDArray IndexSelect<kDGLCUDA, __hip_bfloat16, int32_t>(
NDArray, IdArray);
template NDArray IndexSelect<kDGLCUDA, __nv_bfloat16, int64_t>(
template NDArray IndexSelect<kDGLCUDA, __hip_bfloat16, int64_t>(
NDArray, IdArray);
#endif // BF16_ENABLED
template NDArray IndexSelect<kDGLCUDA, float, int32_t>(NDArray, IdArray);
@@ -87,7 +90,7 @@ template uint32_t IndexSelect<kDGLCUDA, uint32_t>(NDArray array, int64_t index);
template uint64_t IndexSelect<kDGLCUDA, uint64_t>(NDArray array, int64_t index);
template __half IndexSelect<kDGLCUDA, __half>(NDArray array, int64_t index);
#if BF16_ENABLED
template __nv_bfloat16 IndexSelect<kDGLCUDA, __nv_bfloat16>(
template __hip_bfloat16 IndexSelect<kDGLCUDA, __hip_bfloat16>(
NDArray array, int64_t index);
#endif // BF16_ENABLED
template float IndexSelect<kDGLCUDA, float>(NDArray array, int64_t index);
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2020 by Contributors
* @file array/cpu/array_nonzero.cc
@@ -5,11 +7,13 @@
*/
#include <dgl/array.h>
#include "../../../include/dgl/array.h"
#include <cub/cub.cuh>
#include <hipcub/hipcub.hpp>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
#include "utils.h"
namespace dgl {
using runtime::NDArray;
@@ -33,24 +37,24 @@ IdArray NonZero(IdArray array) {
const int64_t len = array->shape[0];
IdArray ret = NewIdArray(len, ctx, 64);
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const IdType* const in_data = static_cast<const IdType*>(array->data);
int64_t* const out_data = static_cast<int64_t*>(ret->data);
IsNonZeroIndex<IdType> comp(in_data);
cub::CountingInputIterator<int64_t> counter(0);
hipcub::CountingInputIterator<int64_t> counter(0);
// room for cub to output on GPU
int64_t* d_num_nonzeros =
static_cast<int64_t*>(device->AllocWorkspace(ctx, sizeof(int64_t)));
size_t temp_size = 0;
CUDA_CALL(cub::DeviceSelect::If(
CUDA_CALL(hipcub::DeviceSelect::If(
nullptr, temp_size, counter, out_data, d_num_nonzeros, len, comp,
stream));
void* temp = device->AllocWorkspace(ctx, temp_size);
CUDA_CALL(cub::DeviceSelect::If(
CUDA_CALL(hipcub::DeviceSelect::If(
temp, temp_size, counter, out_data, d_num_nonzeros, len, comp, stream));
device->FreeWorkspace(ctx, temp);
......
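Likewise, a self-contained sketch of the hipcub::DeviceSelect::If pattern NonZero relies on above: a counting iterator enumerates candidate indices, a device-callable predicate keeps indices whose input element is nonzero, and the selected count is written to device memory. Names and buffer handling are illustrative; DGL goes through DeviceAPI::AllocWorkspace as shown in the hunk.

#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>
#include <cstdint>
#include <cstdio>

// Mirrors IsNonZeroIndex above: keep index i when in[i] != 0.
struct IsNonZero {
  const int* in;
  explicit IsNonZero(const int* p) : in(p) {}
  __host__ __device__ bool operator()(int64_t i) const { return in[i] != 0; }
};

int main() {
  const int len = 8;
  const int h_in[len] = {0, 3, 0, 0, 7, 1, 0, 2};
  int* d_in = nullptr;
  int64_t* d_out = nullptr;
  int64_t* d_num = nullptr;
  hipMalloc(reinterpret_cast<void**>(&d_in), len * sizeof(int));
  hipMalloc(reinterpret_cast<void**>(&d_out), len * sizeof(int64_t));
  hipMalloc(reinterpret_cast<void**>(&d_num), sizeof(int64_t));
  hipMemcpy(d_in, h_in, len * sizeof(int), hipMemcpyHostToDevice);

  hipcub::CountingInputIterator<int64_t> counter(0);
  IsNonZero comp(d_in);

  // Two-phase call: size query first, then the actual selection.
  size_t temp_size = 0;
  hipcub::DeviceSelect::If(
      nullptr, temp_size, counter, d_out, d_num, len, comp);
  void* temp = nullptr;
  hipMalloc(&temp, temp_size);
  hipcub::DeviceSelect::If(
      temp, temp_size, counter, d_out, d_num, len, comp);

  int64_t num = 0;
  hipMemcpy(&num, d_num, sizeof(int64_t), hipMemcpyDeviceToHost);
  printf("nonzero count: %lld\n", (long long)num);  // expected: 4

  hipFree(temp);
  hipFree(d_in);
  hipFree(d_out);
  hipFree(d_num);
  return 0;
}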
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2020-2021 by Contributors
* @file array/cuda/array_op_impl.cu
* @brief Array operator GPU implementation
*/
#include <dgl/array.h>
#include "../../../include/dgl/array.h"
#include "../../runtime/cuda/cuda_common.h"
#include "../../runtime/cuda/cuda_hashtable.cuh"
#include "../arith.h"
#include "./utils.h"
#include "utils.h"
namespace dgl {
using runtime::NDArray;
@@ -36,7 +40,7 @@ IdArray BinaryElewise(IdArray lhs, IdArray rhs) {
const IdType* lhs_data = static_cast<IdType*>(lhs->data);
const IdType* rhs_data = static_cast<IdType*>(rhs->data);
IdType* ret_data = static_cast<IdType*>(ret->data);
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int nt = cuda::FindNumThreads(len);
int nb = (len + nt - 1) / nt;
CUDA_KERNEL_CALL(
@@ -107,7 +111,7 @@ IdArray BinaryElewise(IdArray lhs, IdType rhs) {
IdArray ret = NewIdArray(lhs->shape[0], lhs->ctx, lhs->dtype.bits);
const IdType* lhs_data = static_cast<IdType*>(lhs->data);
IdType* ret_data = static_cast<IdType*>(ret->data);
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int nt = cuda::FindNumThreads(len);
int nb = (len + nt - 1) / nt;
CUDA_KERNEL_CALL(
@@ -178,7 +182,7 @@ IdArray BinaryElewise(IdType lhs, IdArray rhs) {
IdArray ret = NewIdArray(rhs->shape[0], rhs->ctx, rhs->dtype.bits);
const IdType* rhs_data = static_cast<IdType*>(rhs->data);
IdType* ret_data = static_cast<IdType*>(ret->data);
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int nt = cuda::FindNumThreads(len);
int nb = (len + nt - 1) / nt;
CUDA_KERNEL_CALL(
@@ -249,7 +253,7 @@ IdArray UnaryElewise(IdArray lhs) {
IdArray ret = NewIdArray(lhs->shape[0], lhs->ctx, lhs->dtype.bits);
const IdType* lhs_data = static_cast<IdType*>(lhs->data);
IdType* ret_data = static_cast<IdType*>(ret->data);
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int nt = cuda::FindNumThreads(len);
int nb = (len + nt - 1) / nt;
CUDA_KERNEL_CALL(
@@ -277,7 +281,7 @@ template <DGLDeviceType XPU, typename DType>
NDArray Full(DType val, int64_t length, DGLContext ctx) {
NDArray ret = NDArray::Empty({length}, DGLDataTypeTraits<DType>::dtype, ctx);
DType* ret_data = static_cast<DType*>(ret->data);
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int nt = cuda::FindNumThreads(length);
int nb = (length + nt - 1) / nt;
CUDA_KERNEL_CALL(
@@ -292,8 +296,8 @@ template IdArray Full<kDGLCUDA, int64_t>(
template IdArray Full<kDGLCUDA, __half>(
__half val, int64_t length, DGLContext ctx);
#if BF16_ENABLED
template IdArray Full<kDGLCUDA, __nv_bfloat16>(
__nv_bfloat16 val, int64_t length, DGLContext ctx);
template IdArray Full<kDGLCUDA, __hip_bfloat16>(
__hip_bfloat16 val, int64_t length, DGLContext ctx);
#endif // BF16_ENABLED
template IdArray Full<kDGLCUDA, float>(
float val, int64_t length, DGLContext ctx);
@@ -319,7 +323,7 @@ IdArray Range(IdType low, IdType high, DGLContext ctx) {
IdArray ret = NewIdArray(length, ctx, sizeof(IdType) * 8);
if (length == 0) return ret;
IdType* ret_data = static_cast<IdType*>(ret->data);
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int nt = cuda::FindNumThreads(length);
int nb = (length + nt - 1) / nt;
CUDA_KERNEL_CALL(
@@ -355,7 +359,7 @@ IdArray Relabel_(const std::vector<IdArray>& arrays) {
const auto& ctx = arrays[0]->ctx;
auto device = runtime::DeviceAPI::Get(ctx);
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
// build node maps and get the induced nodes
OrderedHashTable<IdType> node_map(total_length, ctx, stream);
@@ -364,7 +368,7 @@ IdArray Relabel_(const std::vector<IdArray>& arrays) {
static_cast<int64_t*>(device->AllocWorkspace(ctx, sizeof(int64_t)));
IdArray induced_nodes = NewIdArray(total_length, ctx, sizeof(IdType) * 8);
CUDA_CALL(cudaMemsetAsync(
CUDA_CALL(hipMemsetAsync(
num_induced_device, 0, sizeof(*num_induced_device), stream));
node_map.FillWithDuplicates(
@@ -416,7 +420,7 @@ IdArray AsNumBits(IdArray arr, uint8_t bits) {
const std::vector<int64_t> shape(arr->shape, arr->shape + arr->ndim);
IdArray ret = IdArray::Empty(shape, DGLDataType{kDGLInt, bits, 1}, arr->ctx);
const int64_t length = ret.NumElements();
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
int nt = cuda::FindNumThreads(length);
int nb = (length + nt - 1) / nt;
if (bits == 32) {
......
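As a side note on the recurring nt / nb lines in these hunks: nt is a block size picked by cuda::FindNumThreads and nb is the ceil-divided block count, so every element is covered by exactly one thread. A minimal illustrative stand-in for the helper (the real one lives in DGL's runtime/cuda utilities and clamps to the hardware limit):

// Illustrative stand-in for cuda::FindNumThreads: double the block size
// until it covers len or reaches max_threads.
inline int FindNumThreadsSketch(int64_t len, int max_threads = 1024) {
  int nt = 1;
  while (nt < len && nt < max_threads) nt <<= 1;
  return nt;
}

// Launch geometry as used above: nb blocks of nt threads, rounded up.
//   const int nt = FindNumThreadsSketch(len);
//   const int nb = (len + nt - 1) / nt;
//   CUDA_KERNEL_CALL(SomeKernel, nb, nt, 0, stream, /* kernel args */);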
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2019 by Contributors
* @file array/cuda/array_scatter.cu
* @brief Array scatter GPU implementation
*/
#include <dgl/array.h>
#include "../../../include/dgl/array.h"
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
#include "utils.h"
namespace dgl {
using runtime::NDArray;
@@ -31,7 +35,7 @@ void Scatter_(IdArray index, NDArray value, NDArray out) {
const DType* val = value.Ptr<DType>();
DType* outd = out.Ptr<DType>();
cudaStream_t stream = runtime::getCurrentCUDAStream();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const int nt = cuda::FindNumThreads(len);
const int nb = (len + nt - 1) / nt;
CUDA_KERNEL_CALL(_ScatterKernel, nb, nt, 0, stream, idx, val, len, outd);
@@ -41,7 +45,7 @@ template void Scatter_<kDGLCUDA, int32_t, int32_t>(IdArray, NDArray, NDArray);
template void Scatter_<kDGLCUDA, int64_t, int32_t>(IdArray, NDArray, NDArray);
template void Scatter_<kDGLCUDA, __half, int32_t>(IdArray, NDArray, NDArray);
#if BF16_ENABLED
template void Scatter_<kDGLCUDA, __nv_bfloat16, int32_t>(
template void Scatter_<kDGLCUDA, __hip_bfloat16, int32_t>(
IdArray, NDArray, NDArray);
#endif // BF16_ENABLED
template void Scatter_<kDGLCUDA, float, int32_t>(IdArray, NDArray, NDArray);
@@ -50,7 +54,7 @@ template void Scatter_<kDGLCUDA, int32_t, int64_t>(IdArray, NDArray, NDArray);
template void Scatter_<kDGLCUDA, int64_t, int64_t>(IdArray, NDArray, NDArray);
template void Scatter_<kDGLCUDA, __half, int64_t>(IdArray, NDArray, NDArray);
#if BF16_ENABLED
template void Scatter_<kDGLCUDA, __nv_bfloat16, int64_t>(
template void Scatter_<kDGLCUDA, __hip_bfloat16, int64_t>(
IdArray, NDArray, NDArray);
#endif // BF16_ENABLED
template void Scatter_<kDGLCUDA, float, int64_t>(IdArray, NDArray, NDArray);
......
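For reference, a minimal sketch of what a scatter kernel such as _ScatterKernel above does: one thread per input element writes value[i] into out[index[i]]. The standalone kernel below is illustrative, not the DGL implementation; with repeated indices the surviving write is unspecified.

#include <hip/hip_runtime.h>
#include <cstdint>

// Illustrative scatter: out[index[i]] = value[i] for every i in [0, len).
template <typename IdType, typename DType>
__global__ void ScatterSketchKernel(
    const IdType* index, const DType* value, int64_t len, DType* out) {
  const int64_t i =
      static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (i < len) out[index[i]] = value[i];
}

// Launch mirroring the hunk above (nt / nb as in the other GPU files):
//   hipLaunchKernelGGL((ScatterSketchKernel<int32_t, float>),
//                      dim3(nb), dim3(nt), 0, stream, idx, val, len, outd);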