OpenDAS / dgl

Commit 6ac701f8, authored Sep 13, 2024 by sangwzh
Parent: 1547bd93

    update src and graphbolt code
Changes: 116 files in the commit. Showing 20 changed files with 320 additions and 261 deletions (+320 / -261).
src/array/cuda/negative_sampling.hip            +20   -18
src/array/cuda/rowwise_sampling.hip             +22   -20
src/array/cuda/rowwise_sampling_prob.hip        +32   -30
src/array/cuda/sddmm.cuh                        +7    -5
src/array/cuda/sddmm.hip                        +7    -6
src/array/cuda/sddmm_hetero_coo.hip             +4    -3
src/array/cuda/sddmm_hetero_csr.hip             +4    -3
src/array/cuda/segment_reduce.cuh               +8    -6
src/array/cuda/segment_reduce.hip               +12   -11
src/array/cuda/spmat_op_impl_coo.hip            +6    -4
src/array/cuda/spmat_op_impl_csr.hip            +43   -15
src/array/cuda/spmm.cuh                         +105  -103
src/array/cuda/spmm.hip                         +8    -7
src/array/cuda/spmm_hetero.hip                  +11   -9
src/array/cuda/utils.h                          +12   -10
src/array/cuda/utils.hip                        +7    -5
src/array/cuda/uvm/array_index_select_uvm.cuh   +2    -0
src/array/cuda/uvm/array_index_select_uvm.hip   +5    -3
src/array/filter.cc                             +3    -2
src/array/kernel.cc                             +2    -1
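Every file below carries the banner "This is a file automatically generated by hipify", and the edits follow the hipify token mapping that is visible throughout the diff: curand becomes hiprand, cub:: becomes hipcub::, cuBLAS and cuSPARSE names become hipBLAS and hipSPARSE names, cudaStream_t / cudaEvent_t become hipStream_t / hipEvent_t, and runtime::getCurrentCUDAStream() becomes runtime::getCurrentHIPStreamMasqueradingAsCUDA(). As a minimal sketch of the stream-and-event half of that mapping, here is a standalone HIP program (an assumption-laden example that needs a ROCm toolchain with hipcc and uses none of DGL's own helpers) mirroring the copyEvent pattern used in the sampling files:

// Sketch only: hipEvent_t / hipMemcpyAsync pattern used by the hipified
// sampling code to copy a device-side length back to the host.
#include <hip/hip_runtime.h>
#include <cstdio>

#define HIP_CHECK(expr)                                        \
  do {                                                         \
    hipError_t err = (expr);                                   \
    if (err != hipSuccess) {                                   \
      std::printf("HIP error: %s\n", hipGetErrorString(err));  \
      return 1;                                                \
    }                                                          \
  } while (0)

int main() {
  hipStream_t stream;
  HIP_CHECK(hipStreamCreate(&stream));

  int host_len = 0;
  int* dev_len = nullptr;
  HIP_CHECK(hipMalloc(&dev_len, sizeof(int)));
  HIP_CHECK(hipMemsetAsync(dev_len, 0, sizeof(int), stream));

  // Asynchronous device-to-host copy, then an event to wait on it,
  // mirroring the copyEvent usage in rowwise_sampling.hip.
  hipEvent_t copyEvent;
  HIP_CHECK(hipEventCreate(&copyEvent));
  HIP_CHECK(hipMemcpyAsync(&host_len, dev_len, sizeof(int),
                           hipMemcpyDeviceToHost, stream));
  HIP_CHECK(hipEventRecord(copyEvent, stream));
  HIP_CHECK(hipEventSynchronize(copyEvent));
  HIP_CHECK(hipEventDestroy(copyEvent));

  std::printf("copied length = %d\n", host_len);
  HIP_CHECK(hipFree(dev_len));
  HIP_CHECK(hipStreamDestroy(stream));
  return 0;
}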
src/array/cuda/negative_sampling.cu → src/array/cuda/negative_sampling.hip
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
  * Copyright (c) 2021 by Contributors
  * @file array/cuda/negative_sampling.cu
  * @brief rowwise sampling
  */
-#include <curand_kernel.h>
+#include <hiprand/hiprand_kernel.h>
 #include <dgl/array.h>
 #include <dgl/array_iterator.h>
 #include <dgl/random.h>
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
 #include "../../runtime/cuda/cuda_common.h"
-#include "./utils.h"
+#include "utils.h"
 using namespace dgl::runtime;
...
...
@@ -31,13 +33,13 @@ __global__ void _GlobalUniformNegativeSamplingKernel(
   int64_t tx = blockIdx.x * blockDim.x + threadIdx.x;
   const int stride_x = gridDim.x * blockDim.x;
-  curandStatePhilox4_32_10_t rng;  // this allows generating 4 32-bit ints at a time
-  curand_init(random_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
+  hiprandStatePhilox4_32_10_t rng;  // this allows generating 4 32-bit ints at a time
+  hiprand_init(random_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
   while (tx < num_samples) {
     for (int i = 0; i < num_trials; ++i) {
-      uint4 result = curand4(&rng);
+      uint4 result = hiprand4(&rng);
       // Turns out that result.x is always 0 with the above RNG.
       uint64_t y_hi = result.y >> 16;
       uint64_t y_lo = result.y & 0xFFFF;
...
...
@@ -88,7 +90,7 @@ struct IsNotMinusOne {
 template <typename IdType>
 void SortOrderedPairs(
     runtime::DeviceAPI* device, DGLContext ctx, IdType* major, IdType* minor,
-    IdType* tmp_major, IdType* tmp_minor, int64_t n, cudaStream_t stream) {
+    IdType* tmp_major, IdType* tmp_minor, int64_t n, hipStream_t stream) {
   // Sort ordered pairs in lexicographical order by two radix sorts since
   // cub's radix sorts are stable.
   // We need a 2*n auxiliary storage to store the results form the first radix
...
...
@@ -98,21 +100,21 @@ void SortOrderedPairs(
   void* tmp2 = nullptr;
   // Radix sort by minor key first, reorder the major key in the progress.
-  CUDA_CALL(cub::DeviceRadixSort::SortPairs(
+  CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
       tmp1, s1, minor, tmp_minor, major, tmp_major, n, 0, sizeof(IdType) * 8,
       stream));
   tmp1 = device->AllocWorkspace(ctx, s1);
-  CUDA_CALL(cub::DeviceRadixSort::SortPairs(
+  CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
       tmp1, s1, minor, tmp_minor, major, tmp_major, n, 0, sizeof(IdType) * 8,
       stream));
   // Radix sort by major key next.
-  CUDA_CALL(cub::DeviceRadixSort::SortPairs(
+  CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
       tmp2, s2, tmp_major, major, tmp_minor, minor, n, 0, sizeof(IdType) * 8,
       stream));
   tmp2 = (s2 > s1) ? device->AllocWorkspace(ctx, s2)
                    : tmp1;  // reuse buffer if s2 <= s1
-  CUDA_CALL(cub::DeviceRadixSort::SortPairs(
+  CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
       tmp2, s2, tmp_major, major, tmp_minor, minor, n, 0, sizeof(IdType) * 8,
       stream));
...
...
@@ -141,7 +143,7 @@ std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling(
   IdType* out_row_data = out_row.Ptr<IdType>();
   IdType* out_col_data = out_col.Ptr<IdType>();
   auto device = runtime::DeviceAPI::Get(ctx);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   const int nt = cuda::FindNumThreads(num_actual_samples);
   const int nb = (num_actual_samples + nt - 1) / nt;
   std::pair<IdArray, IdArray> result;
...
...
@@ -159,11 +161,11 @@ std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling(
   IsNotMinusOne<IdType> op;
   PairIterator<IdType> begin(row_data, col_data);
   PairIterator<IdType> out_begin(out_row_data, out_col_data);
-  CUDA_CALL(cub::DeviceSelect::If(
+  CUDA_CALL(hipcub::DeviceSelect::If(
       nullptr, tmp_size, begin, out_begin, num_out_cuda, num_actual_samples, op,
       stream));
   void* tmp = device->AllocWorkspace(ctx, tmp_size);
-  CUDA_CALL(cub::DeviceSelect::If(
+  CUDA_CALL(hipcub::DeviceSelect::If(
       tmp, tmp_size, begin, out_begin, num_out_cuda, num_actual_samples, op,
       stream));
   num_out = cuda::GetCUDAScalar(device, ctx, num_out_cuda);
...
...
@@ -181,25 +183,25 @@ std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling(
     size_t tmp_size_unique = 0;
     void* tmp_unique = nullptr;
-    CUDA_CALL(cub::DeviceSelect::Unique(
+    CUDA_CALL(hipcub::DeviceSelect::Unique(
        nullptr, tmp_size_unique, out_begin, unique_begin, num_out_cuda,
        num_out, stream));
     tmp_unique = (tmp_size_unique > tmp_size)
                      ? device->AllocWorkspace(ctx, tmp_size_unique)
                      : tmp;  // reuse buffer
-    CUDA_CALL(cub::DeviceSelect::Unique(
+    CUDA_CALL(hipcub::DeviceSelect::Unique(
        tmp_unique, tmp_size_unique, out_begin, unique_begin, num_out_cuda,
        num_out, stream));
     num_out = cuda::GetCUDAScalar(device, ctx, num_out_cuda);
-    num_out = std::min(num_samples, num_out);
+    num_out = ::min(num_samples, num_out);
     result = {
        unique_row.CreateView({num_out}, dtype),
        unique_col.CreateView({num_out}, dtype)};
     if (tmp_unique != tmp) device->FreeWorkspace(ctx, tmp_unique);
   } else {
-    num_out = std::min(num_samples, num_out);
+    num_out = ::min(num_samples, num_out);
     result = {
        out_row.CreateView({num_out}, dtype),
        out_col.CreateView({num_out}, dtype)};
...
...
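For context on the RNG calls rewritten above, here is a hedged standalone sketch of the hiprand Philox pattern (hiprand_init seeded per block, hiprand4 returning four 32-bit values per call). The kernel and buffer names are invented for the example; only the hiprand API usage mirrors the hipified kernels:

// Sketch: per-thread Philox RNG under HIP, mirroring the
// hiprand_init / hiprand4 usage in negative_sampling.hip.
#include <hip/hip_runtime.h>
#include <hiprand/hiprand_kernel.h>
#include <cstdint>
#include <cstdio>

__global__ void draw_random(unsigned int* out, int n, uint64_t seed) {
  int tx = blockIdx.x * blockDim.x + threadIdx.x;
  hiprandStatePhilox4_32_10_t rng;
  // Same seeding scheme as the kernel above: seed mixed with the block id,
  // thread id as the subsequence.
  hiprand_init(seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
  if (tx < n) {
    uint4 r = hiprand4(&rng);  // four 32-bit draws at once
    out[tx] = r.y;             // result.x is not used, as the diff notes
  }
}

int main() {
  const int n = 8;
  unsigned int host[n];
  unsigned int* dev = nullptr;
  hipMalloc(&dev, n * sizeof(unsigned int));
  hipLaunchKernelGGL(draw_random, dim3(1), dim3(n), 0, 0, dev, n, uint64_t(12345));
  hipMemcpy(host, dev, n * sizeof(unsigned int), hipMemcpyDeviceToHost);
  for (int i = 0; i < n; ++i) std::printf("%u\n", host[i]);
  hipFree(dev);
  return 0;
}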
src/array/cuda/rowwise_sampling.cu → src/array/cuda/rowwise_sampling.hip
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
  * Copyright (c) 2021 by Contributors
  * @file array/cuda/rowwise_sampling.cu
  * @brief uniform rowwise sampling
  */
-#include <curand_kernel.h>
+#include <hiprand/hiprand_kernel.h>
 #include <dgl/random.h>
 #include <dgl/runtime/device_api.h>
 #include <dgl/runtime/tensordispatch.h>
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
 #include <numeric>
-#include "../../array/cuda/atomic.cuh"
+#include "atomic.cuh"
 #include "../../runtime/cuda/cuda_common.h"
-#include "./utils.h"
+#include "utils.h"
 using namespace dgl::cuda;
 using namespace dgl::aten::cuda;
...
...
@@ -126,8 +128,8 @@ __global__ void _CSRRowWiseSampleUniformKernel(
   const int64_t last_row =
       min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
-  curandStatePhilox4_32_10_t rng;
-  curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
+  hiprandStatePhilox4_32_10_t rng;
+  hiprand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
   while (out_row < last_row) {
     const int64_t row = in_rows[out_row];
...
...
@@ -151,7 +153,7 @@ __global__ void _CSRRowWiseSampleUniformKernel(
     __syncthreads();
     for (int idx = num_picks + threadIdx.x; idx < deg; idx += BLOCK_SIZE) {
-      const int num = curand(&rng) % (idx + 1);
+      const int num = hiprand(&rng) % (idx + 1);
       if (num < num_picks) {
         // use max so as to achieve the replacement order the serial
         // algorithm would have
...
...
@@ -204,8 +206,8 @@ __global__ void _CSRRowWiseSampleUniformReplaceKernel(
   const int64_t last_row =
       min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
-  curandStatePhilox4_32_10_t rng;
-  curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
+  hiprandStatePhilox4_32_10_t rng;
+  hiprand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
   while (out_row < last_row) {
     const int64_t row = in_rows[out_row];
...
...
@@ -216,7 +218,7 @@ __global__ void _CSRRowWiseSampleUniformReplaceKernel(
     if (deg > 0) {
       // each thread then blindly copies in rows only if deg > 0.
       for (int idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) {
-        const int64_t edge = curand(&rng) % deg;
+        const int64_t edge = hiprand(&rng) % deg;
         const int64_t out_idx = out_row_start + idx;
         out_rows[out_idx] = row;
         out_cols[out_idx] = in_index[in_row_start + edge];
...
...
@@ -237,7 +239,7 @@ COOMatrix _CSRRowWiseSamplingUniform(
     CSRMatrix mat, IdArray rows, const int64_t num_picks, const bool replace) {
   const auto& ctx = rows->ctx;
   auto device = runtime::DeviceAPI::Get(ctx);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   const int64_t num_rows = rows->shape[0];
   const IdType* const slice_rows = static_cast<const IdType*>(rows->data);
...
...
@@ -279,16 +281,16 @@ COOMatrix _CSRRowWiseSamplingUniform(
   IdType* out_ptr = static_cast<IdType*>(
       device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType)));
   size_t prefix_temp_size = 0;
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
       nullptr, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream));
   void* prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size);
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
       prefix_temp, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream));
   device->FreeWorkspace(ctx, prefix_temp);
   device->FreeWorkspace(ctx, out_deg);
-  cudaEvent_t copyEvent;
-  CUDA_CALL(cudaEventCreate(&copyEvent));
+  hipEvent_t copyEvent;
+  CUDA_CALL(hipEventCreate(&copyEvent));
   NDArray new_len_tensor;
   if (TensorDispatcher::Global()->IsAvailable()) {
...
...
@@ -301,10 +303,10 @@ COOMatrix _CSRRowWiseSamplingUniform(
   }
   // copy using the internal current stream
-  CUDA_CALL(cudaMemcpyAsync(
+  CUDA_CALL(hipMemcpyAsync(
       new_len_tensor->data, out_ptr + num_rows, sizeof(IdType),
-      cudaMemcpyDeviceToHost, stream));
-  CUDA_CALL(cudaEventRecord(copyEvent, stream));
+      hipMemcpyDeviceToHost, stream));
+  CUDA_CALL(hipEventRecord(copyEvent, stream));
   const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000);
...
...
@@ -329,8 +331,8 @@ COOMatrix _CSRRowWiseSamplingUniform(
   device->FreeWorkspace(ctx, out_ptr);
   // wait for copying `new_len` to finish
-  CUDA_CALL(cudaEventSynchronize(copyEvent));
-  CUDA_CALL(cudaEventDestroy(copyEvent));
+  CUDA_CALL(hipEventSynchronize(copyEvent));
+  CUDA_CALL(hipEventDestroy(copyEvent));
   const IdType new_len = static_cast<const IdType*>(new_len_tensor->data)[0];
   picked_row = picked_row.CreateView({new_len}, picked_row->dtype);
...
...
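The ExclusiveSum calls above follow CUB's and hipCUB's two-phase convention: the first call, made with a null temporary buffer, only reports the required workspace size, and the second call performs the scan that turns per-row degrees into a CSR indptr. A small standalone sketch of that convention (raw hipMalloc in place of DGL's AllocWorkspace; array contents are hypothetical):

// Sketch: hipcub::DeviceScan::ExclusiveSum, size query then actual scan.
#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>
#include <cstdio>

int main() {
  const int num_rows = 4;
  int host_deg[num_rows + 1] = {2, 0, 3, 1, 0};  // last slot pads the scan
  int *deg = nullptr, *ptr = nullptr;
  hipMalloc(&deg, (num_rows + 1) * sizeof(int));
  hipMalloc(&ptr, (num_rows + 1) * sizeof(int));
  hipMemcpy(deg, host_deg, (num_rows + 1) * sizeof(int), hipMemcpyHostToDevice);

  void* temp = nullptr;
  size_t temp_bytes = 0;
  // First call: temp == nullptr, only temp_bytes is written.
  hipcub::DeviceScan::ExclusiveSum(temp, temp_bytes, deg, ptr, num_rows + 1);
  hipMalloc(&temp, temp_bytes);
  // Second call: the real scan; expected indptr is 0, 2, 2, 5, 6.
  hipcub::DeviceScan::ExclusiveSum(temp, temp_bytes, deg, ptr, num_rows + 1);

  int host_ptr[num_rows + 1];
  hipMemcpy(host_ptr, ptr, (num_rows + 1) * sizeof(int), hipMemcpyDeviceToHost);
  for (int i = 0; i <= num_rows; ++i) std::printf("indptr[%d] = %d\n", i, host_ptr[i]);
  hipFree(temp);
  hipFree(ptr);
  hipFree(deg);
  return 0;
}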
src/array/cuda/rowwise_sampling_prob.cu → src/array/cuda/rowwise_sampling_prob.hip
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2022 by Contributors
* @file array/cuda/rowwise_sampling_prob.cu
...
...
@@ -6,20 +8,20 @@
  * sampling code rowwise_sampling.cu.
  * @author pengqirong (OPPO), dlasalle and Xin from Nvidia.
  */
-#include <curand_kernel.h>
+#include <hiprand/hiprand_kernel.h>
 #include <dgl/random.h>
 #include <dgl/runtime/device_api.h>
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
 #include <numeric>
-#include "../../array/cuda/atomic.cuh"
+#include "atomic.cuh"
 #include "../../runtime/cuda/cuda_common.h"
-#include "./utils.h"
+#include "utils.h"
 // require CUB 1.17 to use DeviceSegmentedSort
-static_assert(
-    CUB_VERSION >= 101700, "Require CUB >= 1.17 to use DeviceSegmentedSort");
+// static_assert(
+//     CUB_VERSION >= 101700, "Require CUB >= 1.17 to use DeviceSegmentedSort");
 namespace dgl {
 using namespace cuda;
...
...
@@ -159,8 +161,8 @@ __global__ void _CSRAResValueKernel(
   const int64_t last_row =
       min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
-  curandStatePhilox4_32_10_t rng;
-  curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
+  hiprandStatePhilox4_32_10_t rng;
+  hiprand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
   while (out_row < last_row) {
     const int64_t row = in_rows[out_row];
...
...
@@ -179,7 +181,7 @@ __global__ void _CSRAResValueKernel(
           prob, data, idx, in_row_start, &item_prob);
       // compute A-Res value
       ares[ares_idx] = static_cast<FloatType>(
-          __powf(curand_uniform(&rng), 1.0f / item_prob));
+          __powf(hiprand_uniform(&rng), 1.0f / item_prob));
       ares_idxs[ares_idx] = static_cast<IdType>(in_idx);
     }
   }
...
...
@@ -317,8 +319,8 @@ __global__ void _CSRRowWiseSampleReplaceKernel(
   const int64_t last_row =
       min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
-  curandStatePhilox4_32_10_t rng;
-  curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
+  hiprandStatePhilox4_32_10_t rng;
+  hiprand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
   while (out_row < last_row) {
     const int64_t row = in_rows[out_row];
...
...
@@ -330,7 +332,7 @@ __global__ void _CSRRowWiseSampleReplaceKernel(
     if (deg > 0) {
       // Specialize BlockScan for a 1D block of BLOCK_SIZE threads
-      typedef cub::BlockScan<FloatType, BLOCK_SIZE> BlockScan;
+      typedef hipcub::BlockScan<FloatType, BLOCK_SIZE> BlockScan;
       // Allocate shared memory for BlockScan
       __shared__ typename BlockScan::TempStorage temp_storage;
       // Initialize running total
...
...
@@ -362,10 +364,10 @@ __global__ void _CSRRowWiseSampleReplaceKernel(
       for (int64_t idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) {
         // get random value
         FloatType sum = cdf[cdf_row_start + deg - 1];
-        FloatType rand = static_cast<FloatType>(curand_uniform(&rng) * sum);
+        FloatType rand = static_cast<FloatType>(hiprand_uniform(&rng) * sum);
         // get the offset of the first value within cdf array which is greater
         // than random value.
-        int64_t item = cub::UpperBound<FloatType*, int64_t, FloatType>(
+        int64_t item = hipcub::UpperBound<FloatType*, int64_t, FloatType>(
            &cdf[cdf_row_start], deg, rand);
         item = min(item, deg - 1);
         // get in and out index
...
...
@@ -411,7 +413,7 @@ COOMatrix COOGeneralRemoveIf(const COOMatrix& coo, MaskGen maskgen) {
   IdType* new_row_data = new_row.Ptr<IdType>();
   IdType* new_col_data = new_col.Ptr<IdType>();
   IdType* new_eid_data = new_eid.Ptr<IdType>();
-  auto stream = runtime::getCurrentCUDAStream();
+  auto stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   auto device = runtime::DeviceAPI::Get(ctx);
   int8_t* flags = static_cast<int8_t*>(device->AllocWorkspace(ctx, nnz));
...
...
@@ -441,7 +443,7 @@ COOMatrix _COORemoveIf(
     const COOMatrix& coo, const NDArray& values, DType criteria) {
   const DType* val = values.Ptr<DType>();
   auto maskgen = [val, criteria](
-                     int nb, int nt, cudaStream_t stream, int64_t nnz,
+                     int nb, int nt, hipStream_t stream, int64_t nnz,
                      const IdType* data, int8_t* flags) {
     CUDA_KERNEL_CALL(
         (_GenerateFlagsKernel<IdType, DType, int8_t>), nb, nt, 0, stream, nnz,
...
...
@@ -481,7 +483,7 @@ COOMatrix _CSRRowWiseSampling(
     const FloatArray& prob, bool replace) {
   const auto& ctx = rows->ctx;
   auto device = runtime::DeviceAPI::Get(ctx);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   const int64_t num_rows = rows->shape[0];
   const IdType* const slice_rows = static_cast<const IdType*>(rows->data);
...
...
@@ -530,10 +532,10 @@ COOMatrix _CSRRowWiseSampling(
   IdType* temp_ptr = static_cast<IdType*>(
       device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType)));
   size_t prefix_temp_size = 0;
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
       nullptr, prefix_temp_size, temp_deg, temp_ptr, num_rows + 1, stream));
   void* prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size);
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
       prefix_temp, prefix_temp_size, temp_deg, temp_ptr, num_rows + 1, stream));
   device->FreeWorkspace(ctx, prefix_temp);
   device->FreeWorkspace(ctx, temp_deg);
...
...
@@ -551,16 +553,16 @@ COOMatrix _CSRRowWiseSampling(
   IdType* out_ptr = static_cast<IdType*>(
       device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType)));
   prefix_temp_size = 0;
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
       nullptr, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream));
   prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size);
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
       prefix_temp, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream));
   device->FreeWorkspace(ctx, prefix_temp);
   device->FreeWorkspace(ctx, out_deg);
-  cudaEvent_t copyEvent;
-  CUDA_CALL(cudaEventCreate(&copyEvent));
+  hipEvent_t copyEvent;
+  CUDA_CALL(hipEventCreate(&copyEvent));
   // TODO(dlasalle): use pinned memory to overlap with the actual sampling, and
   // wait on a cudaevent
   IdType new_len;
...
...
@@ -568,7 +570,7 @@ COOMatrix _CSRRowWiseSampling(
   device->CopyDataFromTo(
       out_ptr, num_rows * sizeof(new_len), &new_len, 0, sizeof(new_len), ctx,
       DGLContext{kDGLCPU, 0}, mat.indptr->dtype);
-  CUDA_CALL(cudaEventRecord(copyEvent, stream));
+  CUDA_CALL(hipEventRecord(copyEvent, stream));
   // allocate workspace
   // 1) for w/ replacement, it's a global buffer to store cdf segments (one
...
...
@@ -612,16 +614,16 @@ COOMatrix _CSRRowWiseSampling(
     IdType* sort_temp_idxs = static_cast<IdType*>(
         device->AllocWorkspace(ctx, temp_len * sizeof(IdType)));
-    cub::DoubleBuffer<FloatType> sort_keys(temp, sort_temp);
-    cub::DoubleBuffer<IdType> sort_values(temp_idxs, sort_temp_idxs);
+    hipcub::DoubleBuffer<FloatType> sort_keys(temp, sort_temp);
+    hipcub::DoubleBuffer<IdType> sort_values(temp_idxs, sort_temp_idxs);
     void* d_temp_storage = nullptr;
     size_t temp_storage_bytes = 0;
-    CUDA_CALL(cub::DeviceSegmentedSort::SortPairsDescending(
+    CUDA_CALL(hipcub::DeviceSegmentedSort::SortPairsDescending(
        d_temp_storage, temp_storage_bytes, sort_keys, sort_values, temp_len,
        num_rows, temp_ptr, temp_ptr + 1, stream));
     d_temp_storage = device->AllocWorkspace(ctx, temp_storage_bytes);
-    CUDA_CALL(cub::DeviceSegmentedSort::SortPairsDescending(
+    CUDA_CALL(hipcub::DeviceSegmentedSort::SortPairsDescending(
        d_temp_storage, temp_storage_bytes, sort_keys, sort_values, temp_len,
        num_rows, temp_ptr, temp_ptr + 1, stream));
     device->FreeWorkspace(ctx, d_temp_storage);
...
...
@@ -641,8 +643,8 @@ COOMatrix _CSRRowWiseSampling(
   device->FreeWorkspace(ctx, out_ptr);
   // wait for copying `new_len` to finish
-  CUDA_CALL(cudaEventSynchronize(copyEvent));
-  CUDA_CALL(cudaEventDestroy(copyEvent));
+  CUDA_CALL(hipEventSynchronize(copyEvent));
+  CUDA_CALL(hipEventDestroy(copyEvent));
   picked_row = picked_row.CreateView({new_len}, picked_row->dtype);
   picked_col = picked_col.CreateView({new_len}, picked_col->dtype);
...
...
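The ares[] value computed in _CSRAResValueKernel, __powf(hiprand_uniform(&rng), 1.0f / item_prob), is the A-Res key of Efraimidis and Spirakis: draw u uniformly in (0, 1) and keep the num_picks items with the largest u^(1/w); this samples without replacement with probability proportional to the weights. A host-side sketch of the same idea in plain C++ (weights and seed are hypothetical, no GPU involved):

// Sketch: A-Res weighted sampling without replacement on the host.
// Each item gets key = u^(1/w); the k largest keys form the sample.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <random>
#include <vector>

int main() {
  std::vector<double> weights = {0.1, 2.0, 0.5, 3.0, 1.0};  // hypothetical
  const size_t k = 2;
  std::mt19937 gen(42);
  std::uniform_real_distribution<double> uni(0.0, 1.0);

  std::vector<std::pair<double, size_t>> keyed;  // (key, index)
  for (size_t i = 0; i < weights.size(); ++i) {
    double u = uni(gen);
    keyed.push_back({std::pow(u, 1.0 / weights[i]), i});
  }
  // Keep the k items with the largest keys.
  std::partial_sort(keyed.begin(), keyed.begin() + k, keyed.end(),
                    [](const auto& a, const auto& b) { return a.first > b.first; });
  for (size_t j = 0; j < k; ++j)
    std::printf("picked index %zu (key %.4f)\n", keyed[j].second, keyed[j].first);
  return 0;
}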
src/array/cuda/sddmm.cuh
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2020 by Contributors
* @file array/cuda/sddmm.cuh
...
...
@@ -10,8 +12,8 @@
 #include "../../runtime/cuda/cuda_common.h"
 #include "../selector.h"
-#include "./functor.cuh"
-#include "./utils.h"
+#include "functor.cuh"
+#include "utils.h"
 #include "atomic.cuh"
 #include "bf16.cuh"
 #include "fp16.cuh"
...
...
@@ -178,7 +180,7 @@ __global__ void SDDMMCooTreeReduceKernel(
       }
 #pragma unroll
       for (int offset = 16; offset > 0; offset /= 2)
-        val += __shfl_down_sync(full_mask, val, offset);
+        val += __shfl_down(val, offset);
       if (tx == 0) outoff[i] = val;
     }
   }
...
...
@@ -275,7 +277,7 @@ void SDDMMCoo(
   const DType* lhs_data = lhs.Ptr<DType>();
   const DType* rhs_data = rhs.Ptr<DType>();
   DType* out_data = out.Ptr<DType>();
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   int64_t *lhs_off = nullptr, *rhs_off = nullptr;
   int64_t len = bcast.out_len, lhs_len = bcast.lhs_len, rhs_len = bcast.rhs_len;
...
...
@@ -337,7 +339,7 @@ void SDDMMCsr(
   const DType* lhs_data = lhs.Ptr<DType>();
   const DType* rhs_data = rhs.Ptr<DType>();
   DType* out_data = out.Ptr<DType>();
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   int64_t N = csr.num_rows, M = csr.num_cols, E = csr.indices->shape[0];
   int64_t *lhs_off = nullptr, *rhs_off = nullptr;
...
...
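The only behavioural change in SDDMMCooTreeReduceKernel above is the warp shuffle: CUDA's __shfl_down_sync(full_mask, val, offset) becomes HIP's __shfl_down(val, offset), with the same halving-tree reduction around it. A hedged standalone sketch of that reduction under HIP follows; it starts the loop at warpSize / 2 instead of the fixed 16 used in the kernel, so it sums one full warp or wavefront of ones:

// Sketch: warp-level tree reduction with __shfl_down under HIP.
#include <hip/hip_runtime.h>
#include <cstdio>
#include <vector>

__global__ void warp_sum(const float* in, float* out) {
  float val = in[threadIdx.x];
  // Halving tree: after the loop, lane 0 holds the warp-wide sum.
  for (int offset = warpSize / 2; offset > 0; offset /= 2)
    val += __shfl_down(val, offset);
  if (threadIdx.x == 0) *out = val;
}

int main() {
  hipDeviceProp_t prop;
  hipGetDeviceProperties(&prop, 0);
  const int n = prop.warpSize;  // 64 on most AMD GPUs, 32 on NVIDIA
  std::vector<float> host_in(n, 1.0f);
  float *in = nullptr, *out = nullptr;
  hipMalloc(&in, n * sizeof(float));
  hipMalloc(&out, sizeof(float));
  hipMemcpy(in, host_in.data(), n * sizeof(float), hipMemcpyHostToDevice);
  hipLaunchKernelGGL(warp_sum, dim3(1), dim3(n), 0, 0, in, out);
  float result = 0.0f;
  hipMemcpy(&result, out, sizeof(float), hipMemcpyDeviceToHost);
  std::printf("warp sum = %.0f (expected %d)\n", result, n);
  hipFree(in);
  hipFree(out);
  return 0;
}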
src/array/cuda/sddmm.cu → src/array/cuda/sddmm.hip
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file array/cuda/sddmm.cu
...
...
@@ -5,8 +6,8 @@
  */
 #include <dgl/array.h>
-#include "./functor.cuh"
-#include "./sddmm.cuh"
+#include "functor.cuh"
+#include "sddmm.cuh"
 namespace dgl {
 namespace aten {
...
...
@@ -48,10 +49,10 @@ template void SDDMMCsr<kDGLCUDA, int64_t, __half>(
     const std::string& op, const BcastOff& bcast, const CSRMatrix& csr,
     NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target);
 #if BF16_ENABLED
-template void SDDMMCsr<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void SDDMMCsr<kDGLCUDA, int32_t, __hip_bfloat16>(
     const std::string& op, const BcastOff& bcast, const CSRMatrix& csr,
     NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target);
-template void SDDMMCsr<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void SDDMMCsr<kDGLCUDA, int64_t, __hip_bfloat16>(
     const std::string& op, const BcastOff& bcast, const CSRMatrix& csr,
     NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target);
 #endif  // BF16_ENABLED
...
...
@@ -75,10 +76,10 @@ template void SDDMMCoo<kDGLCUDA, int64_t, __half>(
     const std::string& op, const BcastOff& bcast, const COOMatrix& coo,
     NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target);
 #if BF16_ENABLED
-template void SDDMMCoo<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void SDDMMCoo<kDGLCUDA, int32_t, __hip_bfloat16>(
     const std::string& op, const BcastOff& bcast, const COOMatrix& coo,
     NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target);
-template void SDDMMCoo<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void SDDMMCoo<kDGLCUDA, int64_t, __hip_bfloat16>(
     const std::string& op, const BcastOff& bcast, const COOMatrix& coo,
     NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target);
 #endif  // BF16_ENABLED
...
...
src/array/cuda/sddmm_hetero_coo.cu → src/array/cuda/sddmm_hetero_coo.hip
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file array/cuda/sddmm.cu
...
...
@@ -5,7 +6,7 @@
  */
 #include <dgl/array.h>
-#include "./sddmm.cuh"
+#include "sddmm.cuh"
 namespace dgl {
 namespace aten {
...
...
@@ -49,13 +50,13 @@ template void SDDMMCooHetero<kDGLCUDA, int64_t, __half>(
     int rhs_target, const std::vector<dgl_type_t>& in_eid,
     const std::vector<dgl_type_t>& out_eid);
 #if BF16_ENABLED
-template void SDDMMCooHetero<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void SDDMMCooHetero<kDGLCUDA, int32_t, __hip_bfloat16>(
     const std::string& op, const BcastOff& bcast,
     const std::vector<COOMatrix>& vec_coo, const std::vector<NDArray>& lhs,
     const std::vector<NDArray>& rhs, std::vector<NDArray> out, int lhs_target,
     int rhs_target, const std::vector<dgl_type_t>& in_eid,
     const std::vector<dgl_type_t>& out_eid);
-template void SDDMMCooHetero<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void SDDMMCooHetero<kDGLCUDA, int64_t, __hip_bfloat16>(
     const std::string& op, const BcastOff& bcast,
     const std::vector<COOMatrix>& vec_coo, const std::vector<NDArray>& lhs,
     const std::vector<NDArray>& rhs, std::vector<NDArray> out, int lhs_target,
...
...
src/array/cuda/sddmm_hetero_csr.cu → src/array/cuda/sddmm_hetero_csr.hip
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file array/cuda/sddmm.cu
...
...
@@ -5,7 +6,7 @@
  */
 #include <dgl/array.h>
-#include "./sddmm.cuh"
+#include "sddmm.cuh"
 namespace dgl {
 namespace aten {
...
...
@@ -48,13 +49,13 @@ template void SDDMMCsrHetero<kDGLCUDA, int64_t, __half>(
     int rhs_target, const std::vector<dgl_type_t>& in_eid,
     const std::vector<dgl_type_t>& out_eid);
 #if BF16_ENABLED
-template void SDDMMCsrHetero<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void SDDMMCsrHetero<kDGLCUDA, int32_t, __hip_bfloat16>(
     const std::string& op, const BcastOff& bcast,
     const std::vector<CSRMatrix>& vec_csr, const std::vector<NDArray>& lhs,
     const std::vector<NDArray>& rhs, std::vector<NDArray> out, int lhs_target,
     int rhs_target, const std::vector<dgl_type_t>& in_eid,
     const std::vector<dgl_type_t>& out_eid);
-template void SDDMMCsrHetero<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void SDDMMCsrHetero<kDGLCUDA, int64_t, __hip_bfloat16>(
     const std::string& op, const BcastOff& bcast,
     const std::vector<CSRMatrix>& vec_csr, const std::vector<NDArray>& lhs,
     const std::vector<NDArray>& rhs, std::vector<NDArray> out, int lhs_target,
...
...
src/array/cuda/segment_reduce.cuh
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2020 by Contributors
* @file array/cuda/segment_reduce.cuh
...
...
@@ -10,8 +12,8 @@
 #include <vector>
 #include "../../runtime/cuda/cuda_common.h"
-#include "./atomic.cuh"
-#include "./utils.h"
+#include "atomic.cuh"
+#include "utils.h"
 namespace dgl {
...
...
@@ -125,7 +127,7 @@ void SegmentReduce(NDArray feat, NDArray offsets, NDArray out, NDArray arg) {
   DType* out_data = out.Ptr<DType>();
   IdType* arg_data = arg.Ptr<IdType>();
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   int64_t n = out->shape[0];
   int64_t dim = 1;
   for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i];
...
...
@@ -155,7 +157,7 @@ void ScatterAdd(NDArray feat, NDArray idx, NDArray out) {
   const IdType* idx_data = idx.Ptr<IdType>();
   DType* out_data = out.Ptr<DType>();
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   int64_t n = feat->shape[0];
   int64_t dim = 1;
   for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i];
...
...
@@ -186,7 +188,7 @@ void UpdateGradMinMax_hetero(
     const std::vector<NDArray>& list_feat, const std::vector<NDArray>& list_idx,
     const std::vector<NDArray>& list_idx_types, std::vector<NDArray>* list_out) {
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   if (op == "copy_lhs" || op == "copy_rhs") {
     std::vector<std::vector<dgl_id_t>> src_dst_ntypes(
         graph->NumVertexTypes(), std::vector<dgl_id_t>());
...
@@ -239,7 +241,7 @@ void BackwardSegmentCmp(NDArray feat, NDArray arg, NDArray out) {
const
IdType
*
arg_data
=
arg
.
Ptr
<
IdType
>
();
DType
*
out_data
=
out
.
Ptr
<
DType
>
();
cuda
Stream_t
stream
=
runtime
::
getCurrent
CUDA
Stream
();
hip
Stream_t
stream
=
runtime
::
getCurrent
HIP
Stream
MasqueradingAsCUDA
();
int64_t
n
=
feat
->
shape
[
0
];
int64_t
dim
=
1
;
for
(
int
i
=
1
;
i
<
out
->
ndim
;
++
i
)
dim
*=
out
->
shape
[
i
];
...
...
src/array/cuda/segment_reduce.cu → src/array/cuda/segment_reduce.hip
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file array/cuda/segment_reduce.cu
...
...
@@ -6,9 +7,9 @@
 #include <dgl/array.h>
 #include <dgl/base_heterograph.h>
-#include "./functor.cuh"
-#include "./segment_reduce.cuh"
-#include "./utils.h"
+#include "functor.cuh"
+#include "segment_reduce.cuh"
+#include "utils.h"
 namespace dgl {
...
...
@@ -60,10 +61,10 @@ template void SegmentReduce<kDGLCUDA, int64_t, __half>(
     const std::string& op, NDArray feat, NDArray offsets, NDArray out,
     NDArray arg);
 #if BF16_ENABLED
-template void SegmentReduce<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void SegmentReduce<kDGLCUDA, int32_t, __hip_bfloat16>(
     const std::string& op, NDArray feat, NDArray offsets, NDArray out,
     NDArray arg);
-template void SegmentReduce<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void SegmentReduce<kDGLCUDA, int64_t, __hip_bfloat16>(
     const std::string& op, NDArray feat, NDArray offsets, NDArray out,
     NDArray arg);
 #endif  // BF16_ENABLED
...
...
@@ -85,9 +86,9 @@ template void ScatterAdd<kDGLCUDA, int32_t, __half>(
 template void ScatterAdd<kDGLCUDA, int64_t, __half>(
     NDArray feat, NDArray idx, NDArray out);
 #if BF16_ENABLED
-template void ScatterAdd<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void ScatterAdd<kDGLCUDA, int32_t, __hip_bfloat16>(
     NDArray feat, NDArray idx, NDArray out);
-template void ScatterAdd<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void ScatterAdd<kDGLCUDA, int64_t, __hip_bfloat16>(
     NDArray feat, NDArray idx, NDArray out);
 #endif  // BF16_ENABLED
 template void ScatterAdd<kDGLCUDA, int32_t, float>(
...
...
@@ -108,11 +109,11 @@ template void UpdateGradMinMax_hetero<kDGLCUDA, int64_t, __half>(
     const std::vector<NDArray>& feat, const std::vector<NDArray>& idx,
     const std::vector<NDArray>& idx_etype, std::vector<NDArray>* out);
 #if BF16_ENABLED
-template void UpdateGradMinMax_hetero<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void UpdateGradMinMax_hetero<kDGLCUDA, int32_t, __hip_bfloat16>(
     const HeteroGraphPtr& g, const std::string& op,
     const std::vector<NDArray>& feat, const std::vector<NDArray>& idx,
     const std::vector<NDArray>& idx_etype, std::vector<NDArray>* out);
-template void UpdateGradMinMax_hetero<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void UpdateGradMinMax_hetero<kDGLCUDA, int64_t, __hip_bfloat16>(
     const HeteroGraphPtr& g, const std::string& op,
     const std::vector<NDArray>& feat, const std::vector<NDArray>& idx,
     const std::vector<NDArray>& idx_etype, std::vector<NDArray>* out);
...
...
@@ -139,9 +140,9 @@ template void BackwardSegmentCmp<kDGLCUDA, int32_t, __half>(
 template void BackwardSegmentCmp<kDGLCUDA, int64_t, __half>(
     NDArray feat, NDArray arg, NDArray out);
 #if BF16_ENABLED
-template void BackwardSegmentCmp<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void BackwardSegmentCmp<kDGLCUDA, int32_t, __hip_bfloat16>(
     NDArray feat, NDArray arg, NDArray out);
-template void BackwardSegmentCmp<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void BackwardSegmentCmp<kDGLCUDA, int64_t, __hip_bfloat16>(
     NDArray feat, NDArray arg, NDArray out);
 #endif  // BF16_ENABLED
 template void BackwardSegmentCmp<kDGLCUDA, int32_t, float>(
...
...
src/array/cuda/spmat_op_impl_coo.cu → src/array/cuda/spmat_op_impl_coo.hip
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2021 by contributors.
* @file array/cuda/spmat_op_impl_coo.cu
...
...
@@ -10,8 +12,8 @@
 #include <vector>
 #include "../../runtime/cuda/cuda_common.h"
-#include "./atomic.cuh"
-#include "./utils.h"
+#include "atomic.cuh"
+#include "utils.h"
 namespace dgl {
...
...
@@ -72,7 +74,7 @@ __global__ void _COOGetRowNNZKernel(
 template <DGLDeviceType XPU, typename IdType>
 int64_t COOGetRowNNZ(COOMatrix coo, int64_t row) {
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   const auto& ctx = coo.row->ctx;
   IdType nnz = coo.row->shape[0];
   IdType nt = 1024;
...
...
@@ -103,7 +105,7 @@ __global__ void _COOGetAllRowNNZKernel(
 template <DGLDeviceType XPU, typename IdType>
 NDArray COOGetRowNNZ(COOMatrix coo, NDArray rows) {
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   const auto& ctx = coo.row->ctx;
   IdType nnz = coo.row->shape[0];
   IdType num_rows = coo.num_rows;
...
...
src/array/cuda/spmat_op_impl_csr.cu → src/array/cuda/spmat_op_impl_csr.hip
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2020 by Contributors
* @file array/cuda/spmat_op_impl_csr.cu
...
...
@@ -7,14 +9,14 @@
 #include <thrust/execution_policy.h>
 #include <thrust/for_each.h>
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
 #include <numeric>
 #include <unordered_set>
 #include <vector>
 #include "../../runtime/cuda/cuda_common.h"
-#include "./atomic.cuh"
-#include "./utils.h"
+#include "atomic.cuh"
+#include "utils.h"
 namespace dgl {
...
...
@@ -28,7 +30,7 @@ namespace impl {
 template <DGLDeviceType XPU, typename IdType>
 bool CSRIsNonZero(CSRMatrix csr, int64_t row, int64_t col) {
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   const auto& ctx = csr.indptr->ctx;
   IdArray rows = aten::VecToIdArray<int64_t>({row}, sizeof(IdType) * 8, ctx);
   IdArray cols = aten::VecToIdArray<int64_t>({col}, sizeof(IdType) * 8, ctx);
...
...
@@ -53,12 +55,12 @@ template <DGLDeviceType XPU, typename IdType>
 NDArray CSRIsNonZero(CSRMatrix csr, NDArray row, NDArray col) {
   const auto rowlen = row->shape[0];
   const auto collen = col->shape[0];
-  const auto rstlen = std::max(rowlen, collen);
+  const auto rstlen = ::max(rowlen, collen);
   NDArray rst = NDArray::Empty({rstlen}, row->dtype, row->ctx);
   if (rstlen == 0) return rst;
   const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1;
   const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1;
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   const int nt = dgl::cuda::FindNumThreads(rstlen);
   const int nb = (rstlen + nt - 1) / nt;
   const IdType* data = nullptr;
...
...
@@ -104,7 +106,7 @@ template <DGLDeviceType XPU, typename IdType>
 bool CSRHasDuplicate(CSRMatrix csr) {
   if (!csr.sorted) csr = CSRSort(csr);
   const auto& ctx = csr.indptr->ctx;
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   auto device = runtime::DeviceAPI::Get(ctx);
   // We allocate a workspace of num_rows bytes. It wastes a little bit memory
   // but should be fine.
...
...
@@ -149,7 +151,7 @@ __global__ void _CSRGetRowNNZKernel(
 template <DGLDeviceType XPU, typename IdType>
 NDArray CSRGetRowNNZ(CSRMatrix csr, NDArray rows) {
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   const auto len = rows->shape[0];
   const IdType* vid_data = rows.Ptr<IdType>();
   const IdType* indptr_data =
...
...
@@ -250,7 +252,7 @@ __global__ void _SegmentCopyKernel(
 template <DGLDeviceType XPU, typename IdType>
 CSRMatrix CSRSliceRows(CSRMatrix csr, NDArray rows) {
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   const int64_t len = rows->shape[0];
   IdArray ret_indptr = aten::CumSum(aten::CSRGetRowNNZ(csr, rows), true);
   const int64_t nnz = aten::IndexSelect<IdType>(ret_indptr, len);
...
...
@@ -359,7 +361,7 @@ std::vector<NDArray> CSRGetDataAndIndices(
     CSRMatrix csr, NDArray row, NDArray col) {
   const auto rowlen = row->shape[0];
   const auto collen = col->shape[0];
-  const auto len = std::max(rowlen, collen);
+  const auto len = ::max(rowlen, collen);
   if (len == 0) return {NullArray(), NullArray(), NullArray()};
   const auto& ctx = row->ctx;
...
...
@@ -367,7 +369,7 @@ std::vector<NDArray> CSRGetDataAndIndices(
   const int64_t nnz = csr.indices->shape[0];
   const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1;
   const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1;
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   const IdType* indptr_data =
       static_cast<IdType*>(GetDevicePointer(csr.indptr));
...
...
@@ -532,7 +534,7 @@ __global__ void _SegmentMaskColKernel(
       static_cast<IdType>(num_rows));
   NodeQueryHashmap<IdType> hashmap(hashmap_buffer, buffer_size);
-  typedef cub::WarpReduce<IdType> WarpReduce;
+  typedef hipcub::WarpReduce<IdType> WarpReduce;
   __shared__ typename WarpReduce::TempStorage temp_storage[BLOCK_WARPS];
   while (out_row < last_row) {
...
...
@@ -547,6 +549,7 @@ __global__ void _SegmentMaskColKernel(
}
}
IdType reduce_count = WarpReduce(temp_storage[warp_id]).Sum(local_count);
printf("out_row = %d , reduce_count = %d \n", out_row, reduce_count);
if (laneid == 0) {
count[out_row] = reduce_count;
}
...
...
@@ -557,13 +560,16 @@ __global__ void _SegmentMaskColKernel(
 template <DGLDeviceType XPU, typename IdType>
 CSRMatrix CSRSliceMatrix(
     CSRMatrix csr, runtime::NDArray rows, runtime::NDArray cols) {
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   const auto& ctx = rows->ctx;
   const auto& dtype = rows->dtype;
   const auto nbits = dtype.bits;
   const int64_t new_nrows = rows->shape[0];
   const int64_t new_ncols = cols->shape[0];
+  std::cout << "new_nrows : " << new_nrows << std::endl;
+  std::cout << "new_ncols : " << new_ncols << std::endl;
   if (new_nrows == 0 || new_ncols == 0)
     return CSRMatrix(
         new_nrows, new_ncols, Full(0, new_nrows + 1, nbits, ctx),
...
...
@@ -572,6 +578,7 @@ CSRMatrix CSRSliceMatrix(
// First slice rows
csr = CSRSliceRows(csr, rows);
std::cout << "csr.indices->shape[0] : " << csr.indices->shape[0] << std::endl;
if (csr.indices->shape[0] == 0)
return CSRMatrix(
new_nrows, new_ncols, Full(0, new_nrows + 1, nbits, ctx),
...
...
@@ -581,9 +588,11 @@ CSRMatrix CSRSliceMatrix(
   IdArray mask = Full(0, csr.indices->shape[0], nbits, ctx);
   // A count for how many masked values per row.
   IdArray count = NewIdArray(csr.num_rows, ctx, nbits);
+  std::cout << "1 IdArray count : " << count << std::endl;
   CUDA_CALL(
-      cudaMemset(count.Ptr<IdType>(), 0, sizeof(IdType) * (csr.num_rows)));
+      hipMemset(count.Ptr<IdType>(), 0, sizeof(IdType) * (csr.num_rows)));
+  std::cout << "2 IdArray count : " << count << std::endl;
   // Generate a NodeQueryHashmap buffer. The key of the hashmap is col.
   // For performance, the load factor of the hashmap is in (0.25, 0.5);
   // Because num_cols is usually less than 1 Million (on GPU), the
...
...
@@ -593,7 +602,7 @@ CSRMatrix CSRSliceMatrix(
   using it = thrust::counting_iterator<int64_t>;
   runtime::CUDAWorkspaceAllocator allocator(ctx);
-  const auto exec_policy = thrust::cuda::par_nosync(allocator).on(stream);
+  const auto exec_policy = thrust::hip::par_nosync(allocator).on(stream);
   thrust::for_each(
       exec_policy, it(0), it(new_ncols),
       [key = cols.Ptr<IdType>(), buffer = hashmap_buffer.Ptr<IdType>(),
...
...
@@ -617,20 +626,37 @@ CSRMatrix CSRSliceMatrix(
dgl::cuda::FindNumBlocks<'x'>((num_rows + TILE_SIZE - 1) / TILE_SIZE);
const dim3 nthrs(WARP_SIZE, BLOCK_WARPS);
const dim3 nblks(nb);
std::cout << "nthrs.x : " << nthrs.x << " nthrs.y : " << nthrs.y << " nthrs.z : " << nthrs.z << std::endl;
std::cout << "nblks.x : " << nblks.x << " nblks.y : " << nblks.y << " nblks.z : " << nblks.z << std::endl;
std::cout << "WARP_SIZE : " << WARP_SIZE << " BLOCK_WARPS : " << BLOCK_WARPS << "TILE_SIZE : " << std::endl;
std::cout << "indptr_data : " << indptr_data << std::endl;
std::cout << "indices_data : " << indices_data << std::endl;
std::cout << "num_rows : " << num_rows << std::endl;
std::cout << "buffer_size : " << buffer_size << std::endl;
std::cout << "mask : " << mask << std::endl;
std::cout << "count : " << count << std::endl;
std::cout << "hashmap_buffer : " << hashmap_buffer << std::endl;
CUDA_KERNEL_CALL(
(_SegmentMaskColKernel<IdType, WARP_SIZE, BLOCK_WARPS, TILE_SIZE>), nblks,
nthrs, 0, stream, indptr_data, indices_data, num_rows,
hashmap_buffer.Ptr<IdType>(), buffer_size, mask.Ptr<IdType>(),
count.Ptr<IdType>());
std::cout << "3 IdArray count : " << count << std::endl;
IdArray idx = AsNumBits(NonZero(mask), nbits);
std::cout << "idx->shape[0] : " << idx->shape[0] << std::endl;
if (idx->shape[0] == 0)
return CSRMatrix(
new_nrows, new_ncols, Full(0, new_nrows + 1, nbits, ctx),
NullArray(dtype, ctx), NullArray(dtype, ctx));
// Indptr needs to be adjusted according to the new nnz per row.
std::cout << " count : " << count << std::endl;
IdArray ret_indptr = CumSum(count, true);
std::cout << " IdArray ret_indptr : " << ret_indptr << std::endl;
// Column & data can be obtained by index select.
IdArray ret_col = IndexSelect(csr.indices, idx);
...
...
@@ -641,6 +667,8 @@ CSRMatrix CSRSliceMatrix(
Scatter_(cols, Range(0, cols->shape[0], nbits, ctx), col_hash);
ret_col = IndexSelect(col_hash, ret_col);
// std::cout << "new_nrows : " << new_nrows << " new_ncols : " << new_ncols << " ret_indptr : " << ret_indptr << " ret_col : " << ret_col << " ret_data : " << std::endl;
return CSRMatrix(new_nrows, new_ncols, ret_indptr, ret_col, ret_data);
}
...
...
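Apart from the cub → hipcub rename, the CSRSliceMatrix changes above mainly add std::cout/printf debug output around a per-warp count produced by WarpReduce. For reference, a hedged standalone sketch of the hipcub::WarpReduce primitive that count relies on; the 32-lane logical warp width is pinned via the template parameter and the flag values are hypothetical:

// Sketch: hipcub::WarpReduce summing one value per lane, as used to
// accumulate per-row match counts in _SegmentMaskColKernel.
#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>
#include <cstdio>

constexpr int kWarpSize = 32;  // logical warp width for the reduction

__global__ void warp_count(const int* flags, int* out) {
  typedef hipcub::WarpReduce<int, kWarpSize> WarpReduce;
  __shared__ typename WarpReduce::TempStorage temp_storage;
  int local = flags[threadIdx.x];
  int total = WarpReduce(temp_storage).Sum(local);
  if (threadIdx.x == 0) *out = total;  // only lane 0 holds the full sum
}

int main() {
  int host_flags[kWarpSize];
  for (int i = 0; i < kWarpSize; ++i) host_flags[i] = (i % 2 == 0) ? 1 : 0;
  int *flags = nullptr, *out = nullptr;
  hipMalloc(&flags, kWarpSize * sizeof(int));
  hipMalloc(&out, sizeof(int));
  hipMemcpy(flags, host_flags, kWarpSize * sizeof(int), hipMemcpyHostToDevice);
  hipLaunchKernelGGL(warp_count, dim3(1), dim3(kWarpSize), 0, 0, flags, out);
  int total = 0;
  hipMemcpy(&total, out, sizeof(int), hipMemcpyDeviceToHost);
  std::printf("matched = %d\n", total);  // expect 16
  hipFree(flags);
  hipFree(out);
  return 0;
}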
src/array/cuda/spmm.cuh
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2020 by Contributors
* @file array/cuda/spmm.cuh
...
...
@@ -11,7 +13,7 @@
 #include <limits>
 #include "../../runtime/cuda/cuda_common.h"
-#include "./utils.h"
+#include "utils.h"
 #include "atomic.cuh"
 #include "bf16.cuh"
 #include "fp16.cuh"
...
...
@@ -28,14 +30,14 @@ namespace aten {
  */
 template <typename DType, typename IdType>
 inline bool cusparse_available(bool more_nnz_than_matrix_size) {
-#if CUDART_VERSION < 11000
+#if DTKRT_VERSION < 11000
   if (std::is_same<IdType, int>::value &&
       (std::is_same<DType, float>::value ||
        std::is_same<DType, double>::value))
     return true;
   return false;
 #else
   if (std::is_same<DType, __half>::value ||
-      std::is_same<DType, __nv_bfloat16>::value)
+      std::is_same<DType, __hip_bfloat16>::value)
     return false;
   // cusparse's SpMM on fp16 is slow, temporally disabled.
   // If the CSR matrix has more NNZ than matrix size, we should not use
   // cuSPARSE 11.1.
...
...
@@ -47,54 +49,54 @@ namespace {
 /** @brief Call cuBLAS geam API for transpose operation for float and double. */
 template <typename DType>
-cublasStatus_t Xgeam(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
+hipblasStatus_t Xgeam(
+    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
     int m, int n, const DType* alpha, const DType* A, int lda,
     const DType* beta, const DType* B, int ldb, DType* C, int ldc) {
   LOG(FATAL) << "Not supported dtype";
-  return CUBLAS_STATUS_EXECUTION_FAILED;
+  return HIPBLAS_STATUS_EXECUTION_FAILED;
 }
 template <>
-cublasStatus_t Xgeam<__half>(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
+hipblasStatus_t Xgeam<__half>(
+    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
     int m, int n, const __half* alpha, const __half* A, int lda,
     const __half* beta, const __half* B, int ldb, __half* C, int ldc) {
   // TODO(ndickson): There is no cublasHgeam, so a different
   // implementation would be required.
   LOG(FATAL) << "Xgeam does not support dtype half (FP16)";
-  return CUBLAS_STATUS_EXECUTION_FAILED;
+  return HIPBLAS_STATUS_EXECUTION_FAILED;
 }
 #if BF16_ENABLED
 template <>
-cublasStatus_t Xgeam<__nv_bfloat16>(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, const __nv_bfloat16* alpha, const __nv_bfloat16* A, int lda,
-    const __nv_bfloat16* beta, const __nv_bfloat16* B, int ldb,
-    __nv_bfloat16* C, int ldc) {
+hipblasStatus_t Xgeam<__hip_bfloat16>(
+    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
+    int m, int n, const __hip_bfloat16* alpha, const __hip_bfloat16* A, int lda,
+    const __hip_bfloat16* beta, const __hip_bfloat16* B, int ldb,
+    __hip_bfloat16* C, int ldc) {
   // TODO(ndickson): There is no cublasHgeam, so a different
   // implementation would be required.
   LOG(FATAL) << "Xgeam does not support dtype bfloat16 (BF16)";
-  return CUBLAS_STATUS_EXECUTION_FAILED;
+  return HIPBLAS_STATUS_EXECUTION_FAILED;
 }
 #endif  // BF16_ENABLED
 template <>
-cublasStatus_t Xgeam<float>(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
+hipblasStatus_t Xgeam<float>(
+    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
     int m, int n, const float* alpha, const float* A, int lda,
     const float* beta, const float* B, int ldb, float* C, int ldc) {
-  return cublasSgeam(
+  return hipblasSgeam(
       handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
 }
 template <>
-cublasStatus_t Xgeam<double>(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
+hipblasStatus_t Xgeam<double>(
+    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
     int m, int n, const double* alpha, const double* A, int lda,
     const double* beta, const double* B, int ldb, double* C, int ldc) {
-  return cublasDgeam(
+  return hipblasDgeam(
       handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
 }
...
...
@@ -119,12 +121,12 @@ template <typename DType>
 void _Transpose(const DType* in, DType* out, int row, int col) {
   DType alpha = 1., beta = 0.;
   auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   if (!thr_entry->cublas_handle)
-    CUBLAS_CALL(cublasCreate(&(thr_entry->cublas_handle)));
-  CUBLAS_CALL(cublasSetStream(thr_entry->cublas_handle, stream));
+    CUBLAS_CALL(hipblasCreate(&(thr_entry->cublas_handle)));
+  CUBLAS_CALL(hipblasSetStream(thr_entry->cublas_handle, stream));
   CUBLAS_CALL(Xgeam<DType>(
-      thr_entry->cublas_handle, CUBLAS_OP_T, CUBLAS_OP_N, row, col, &alpha, in,
+      thr_entry->cublas_handle, HIPBLAS_OP_T, HIPBLAS_OP_N, row, col, &alpha, in,
       col, &beta, nullptr, row, out, row));
 }
...
...
@@ -134,7 +136,7 @@ void _Transpose(const DType* in, DType* out, int row, int col) {
  */
 template <>
 void _Transpose<__half>(const __half* in, __half* out, int row, int col) {
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   int nt = FindNumThreads(row);
   int nb = col;
   CUDA_KERNEL_CALL(_TransposeKernel, nb, nt, 0, stream, in, out, col, row);
...
...
@@ -146,47 +148,47 @@ void _Transpose<__half>(const __half* in, __half* out, int row, int col) {
  * @note cuBLAS has no geam API for bf16 data type, fallback to our kernel.
  */
 template <>
-void _Transpose<__nv_bfloat16>(
-    const __nv_bfloat16* in, __nv_bfloat16* out, int row, int col) {
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+void _Transpose<__hip_bfloat16>(
+    const __hip_bfloat16* in, __hip_bfloat16* out, int row, int col) {
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   int nt = FindNumThreads(row);
   int nb = col;
   CUDA_KERNEL_CALL(_TransposeKernel, nb, nt, 0, stream, in, out, col, row);
 }
 #endif  // BF16_ENABLED
-#if CUDART_VERSION < 11000
+#if DTKRT_VERSION < 11000
 template <typename DType>
-cusparseStatus_t Xcsrmm2(
-    cusparseHandle_t handle, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int n, int k, int nnz,
-    const DType* alpha, const cusparseMatDescr_t descrA, const DType* csrValA,
+hipsparseStatus_t Xcsrmm2(
+    hipsparseHandle_t handle, hipsparseOperation_t transA,
+    hipsparseOperation_t transB, int m, int n, int k, int nnz,
+    const DType* alpha, const hipsparseMatDescr_t descrA, const DType* csrValA,
     const int* csrRowPtrA, const int* csrColIndA, const DType* B, int ldb,
     const DType* beta, DType* C, int ldc) {
   LOG(INFO) << "Not supported dtype";
-  return CUSPARSE_STATUS_EXECUTION_FAILED;
+  return HIPSPARSE_STATUS_EXECUTION_FAILED;
 }
 template <>
-cusparseStatus_t Xcsrmm2<float>(
-    cusparseHandle_t handle, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int n, int k, int nnz,
-    const float* alpha, const cusparseMatDescr_t descrA, const float* csrValA,
+hipsparseStatus_t Xcsrmm2<float>(
+    hipsparseHandle_t handle, hipsparseOperation_t transA,
+    hipsparseOperation_t transB, int m, int n, int k, int nnz,
+    const float* alpha, const hipsparseMatDescr_t descrA, const float* csrValA,
     const int* csrRowPtrA, const int* csrColIndA, const float* B, int ldb,
     const float* beta, float* C, int ldc) {
-  return cusparseScsrmm2(
+  return hipsparseScsrmm2(
       handle, transA, transB, m, n, k, nnz, alpha, descrA, csrValA, csrRowPtrA,
       csrColIndA, B, ldb, beta, C, ldc);
 }
 template <>
-cusparseStatus_t Xcsrmm2<double>(
-    cusparseHandle_t handle, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int n, int k, int nnz,
-    const double* alpha, const cusparseMatDescr_t descrA, const double* csrValA,
+hipsparseStatus_t Xcsrmm2<double>(
+    hipsparseHandle_t handle, hipsparseOperation_t transA,
+    hipsparseOperation_t transB, int m, int n, int k, int nnz,
+    const double* alpha, const hipsparseMatDescr_t descrA, const double* csrValA,
     const int* csrRowPtrA, const int* csrColIndA, const double* B, int ldb,
     const double* beta, double* C, int ldc) {
-  return cusparseDcsrmm2(
+  return hipsparseDcsrmm2(
       handle, transA, transB, m, n, k, nnz, alpha, descrA, csrValA, csrRowPtrA,
       csrColIndA, B, ldb, beta, C, ldc);
 }
...
...
@@ -213,12 +215,12 @@ void CusparseCsrmm2(
   // device
   auto device = runtime::DeviceAPI::Get(ctx);
   auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   // allocate cusparse handle if needed
   if (!thr_entry->cusparse_handle) {
-    CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
+    CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
   }
-  CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream));
+  CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream));
   // all one data array
   DType* valptr = nullptr;
   if (!A_data) {
...
@@ -226,52 +228,52 @@ void CusparseCsrmm2(
         static_cast<DType*>(device->AllocWorkspace(ctx, nnz * sizeof(DType)));
     _Fill(valptr, nnz, static_cast<DType>(1.));
   }
-#if CUDART_VERSION >= 11000
-  cusparseSpMatDescr_t matA;
-  cusparseDnMatDescr_t matB, matC;
+#if DTKRT_VERSION >= 11000
+  hipsparseSpMatDescr_t matA;
+  hipsparseDnMatDescr_t matB, matC;
   constexpr auto dtype = cuda_dtype<DType>::value;
   constexpr auto idtype = cusparse_idtype<IdType>::value;
-  CUSPARSE_CALL(cusparseCreateCsr(
+  CUSPARSE_CALL(hipsparseCreateCsr(
       &matA, m, k, nnz, static_cast<IdType*>(csr.indptr->data),
       static_cast<IdType*>(csr.indices->data),
       const_cast<DType*>(valptr ? valptr : A_data), idtype, idtype,
-      CUSPARSE_INDEX_BASE_ZERO, dtype));
-  CUSPARSE_CALL(cusparseCreateDnMat(
-      &matB, k, n, n, const_cast<DType*>(B_data), dtype, CUSPARSE_ORDER_ROW));
+      HIPSPARSE_INDEX_BASE_ZERO, dtype));
+  CUSPARSE_CALL(hipsparseCreateDnMat(
+      &matB, k, n, n, const_cast<DType*>(B_data), dtype, HIPSPARSE_ORDER_ROW));
   CUSPARSE_CALL(
-      cusparseCreateDnMat(&matC, m, n, n, C_data, dtype, CUSPARSE_ORDER_ROW));
-  auto transA = CUSPARSE_OPERATION_NON_TRANSPOSE;
-  auto transB = CUSPARSE_OPERATION_NON_TRANSPOSE;
+      hipsparseCreateDnMat(&matC, m, n, n, C_data, dtype, HIPSPARSE_ORDER_ROW));
+  auto transA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
+  auto transB = HIPSPARSE_OPERATION_NON_TRANSPOSE;
   size_t workspace_size;
-  CUSPARSE_CALL(cusparseSpMM_bufferSize(
+  CUSPARSE_CALL(hipsparseSpMM_bufferSize(
       thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
-      matC, dtype, CUSPARSE_SPMM_CSR_ALG2, &workspace_size));
+      matC, dtype, HIPSPARSE_SPMM_CSR_ALG2, &workspace_size));
   void* workspace = device->AllocWorkspace(ctx, workspace_size);
-  CUSPARSE_CALL(cusparseSpMM(
+  CUSPARSE_CALL(hipsparseSpMM(
      thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
-      matC, dtype, CUSPARSE_SPMM_CSR_ALG2, workspace));
+      matC, dtype, HIPSPARSE_SPMM_CSR_ALG2, workspace));
   device->FreeWorkspace(ctx, workspace);
-  CUSPARSE_CALL(cusparseDestroySpMat(matA));
-  CUSPARSE_CALL(cusparseDestroyDnMat(matB));
-  CUSPARSE_CALL(cusparseDestroyDnMat(matC));
+  CUSPARSE_CALL(hipsparseDestroySpMat(matA));
+  CUSPARSE_CALL(hipsparseDestroyDnMat(matB));
+  CUSPARSE_CALL(hipsparseDestroyDnMat(matC));
 #else
   // allocate matrix for temporary transposed output
   DType* trans_out = static_cast<DType*>(
      device->AllocWorkspace(ctx, m * n * sizeof(DType)));
-  cusparseMatDescr_t descr;
-  CUSPARSE_CALL(cusparseCreateMatDescr(&descr));
-  CUSPARSE_CALL(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL));
-  CUSPARSE_CALL(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO));
+  hipsparseMatDescr_t descr;
+  CUSPARSE_CALL(hipsparseCreateMatDescr(&descr));
+  CUSPARSE_CALL(hipsparseSetMatType(descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
+  CUSPARSE_CALL(hipsparseSetMatIndexBase(descr, HIPSPARSE_INDEX_BASE_ZERO));
   CUSPARSE_CALL(Xcsrmm2<DType>(
-      thr_entry->cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
-      CUSPARSE_OPERATION_TRANSPOSE, m, n, k, nnz, &alpha, descr,
+      thr_entry->cusparse_handle, HIPSPARSE_OPERATION_NON_TRANSPOSE,
+      HIPSPARSE_OPERATION_TRANSPOSE, m, n, k, nnz, &alpha, descr,
      (valptr) ? valptr : A_data, static_cast<int32_t*>(csr.indptr->data),
      static_cast<int32_t*>(csr.indices->data), B_data, n, &beta, trans_out,
      m));
-  CUSPARSE_CALL(cusparseDestroyMatDescr(descr));
+  CUSPARSE_CALL(hipsparseDestroyMatDescr(descr));
   // transpose the output matrix
   _Transpose(trans_out, C_data, n, m);
   device->FreeWorkspace(ctx, trans_out);
...
...
@@ -284,7 +286,7 @@ template <typename DType, typename IdType>
 void CusparseCsrmm2Hetero(
     const DGLContext& ctx, const CSRMatrix& csr, const DType* B_data,
     const DType* A_data, DType* C_data, int64_t x_length,
-    cudaStream_t strm_id) {
+    hipStream_t strm_id) {
   // We use csrmm2 to perform following operation:
   // C = A x B, where A is a sparse matrix in csr format, B is the dense matrix
   // for node feature tensor. However, since cusparse only supports
...
...
@@ -307,9 +309,9 @@ void CusparseCsrmm2Hetero(
   auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
   // allocate cusparse handle if needed
   if (!thr_entry->cusparse_handle) {
-    CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
+    CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
   }
-  CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, strm_id));
+  CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, strm_id));
   // all one data array
   DType* valptr = nullptr;
   if (!A_data) {
...
...
@@ -317,48 +319,48 @@ void CusparseCsrmm2Hetero(
         static_cast<DType*>(device->AllocWorkspace(ctx, nnz * sizeof(DType)));
     _Fill(valptr, nnz, static_cast<DType>(1.));
   }
-#if CUDART_VERSION >= 11000
-  cusparseSpMatDescr_t matA;
-  cusparseDnMatDescr_t matB, matC;
+#if DTKRT_VERSION >= 11000
+  hipsparseSpMatDescr_t matA;
+  hipsparseDnMatDescr_t matB, matC;
   constexpr auto dtype = cuda_dtype<DType>::value;
   constexpr auto idtype = cusparse_idtype<IdType>::value;
-  CUSPARSE_CALL(cusparseCreateCsr(
+  CUSPARSE_CALL(hipsparseCreateCsr(
      &matA, m, k, nnz, static_cast<IdType*>(csr.indptr->data),
      static_cast<IdType*>(csr.indices->data),
      const_cast<DType*>(valptr ? valptr : A_data), idtype, idtype,
-      CUSPARSE_INDEX_BASE_ZERO, dtype));
-  CUSPARSE_CALL(cusparseCreateDnMat(
-      &matB, k, n, n, const_cast<DType*>(B_data), dtype, CUSPARSE_ORDER_ROW));
+      HIPSPARSE_INDEX_BASE_ZERO, dtype));
+  CUSPARSE_CALL(hipsparseCreateDnMat(
+      &matB, k, n, n, const_cast<DType*>(B_data), dtype, HIPSPARSE_ORDER_ROW));
   CUSPARSE_CALL(
-      cusparseCreateDnMat(&matC, m, n, n, C_data, dtype, CUSPARSE_ORDER_ROW));
-  auto transA = CUSPARSE_OPERATION_NON_TRANSPOSE;
-  auto transB = CUSPARSE_OPERATION_NON_TRANSPOSE;
+      hipsparseCreateDnMat(&matC, m, n, n, C_data, dtype, HIPSPARSE_ORDER_ROW));
+  auto transA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
+  auto transB = HIPSPARSE_OPERATION_NON_TRANSPOSE;
   size_t workspace_size;
-  CUSPARSE_CALL(cusparseSpMM_bufferSize(
+  CUSPARSE_CALL(hipsparseSpMM_bufferSize(
      thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
-      matC, dtype, CUSPARSE_SPMM_CSR_ALG2, &workspace_size));
+      matC, dtype, HIPSPARSE_SPMM_CSR_ALG2, &workspace_size));
   void* workspace = device->AllocWorkspace(ctx, workspace_size);
-  CUSPARSE_CALL(cusparseSpMM(
+  CUSPARSE_CALL(hipsparseSpMM(
      thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
-      matC, dtype, CUSPARSE_SPMM_CSR_ALG2, workspace));
+      matC, dtype, HIPSPARSE_SPMM_CSR_ALG2, workspace));
   device->FreeWorkspace(ctx, workspace);
-  CUSPARSE_CALL(cusparseDestroySpMat(matA));
-  CUSPARSE_CALL(cusparseDestroyDnMat(matB));
-  CUSPARSE_CALL(cusparseDestroyDnMat(matC));
+  CUSPARSE_CALL(hipsparseDestroySpMat(matA));
+  CUSPARSE_CALL(hipsparseDestroyDnMat(matB));
+  CUSPARSE_CALL(hipsparseDestroyDnMat(matC));
 #else
-  cusparseMatDescr_t descr;
-  CUSPARSE_CALL(cusparseCreateMatDescr(&descr));
-  CUSPARSE_CALL(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL));
-  CUSPARSE_CALL(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO));
+  hipsparseMatDescr_t descr;
+  CUSPARSE_CALL(hipsparseCreateMatDescr(&descr));
+  CUSPARSE_CALL(hipsparseSetMatType(descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
+  CUSPARSE_CALL(hipsparseSetMatIndexBase(descr, HIPSPARSE_INDEX_BASE_ZERO));
   CHECK_EQ(sizeof(IdType), sizeof(int32_t));
   CUSPARSE_CALL(Xcsrmm2<DType>(
-      thr_entry->cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
-      CUSPARSE_OPERATION_TRANSPOSE, m, n, k, nnz, &alpha, descr,
+      thr_entry->cusparse_handle, HIPSPARSE_OPERATION_NON_TRANSPOSE,
+      HIPSPARSE_OPERATION_TRANSPOSE, m, n, k, nnz, &alpha, descr,
      (valptr) ? valptr : A_data, static_cast<int32_t*>(csr.indptr->data),
      static_cast<int32_t*>(csr.indices->data), B_data, n, &beta, C_data, m));
-  CUSPARSE_CALL(cusparseDestroyMatDescr(descr));
+  CUSPARSE_CALL(hipsparseDestroyMatDescr(descr));
 #endif
   if (valptr) device->FreeWorkspace(ctx, valptr);
 }
...
...
@@ -625,7 +627,7 @@ void SpMMCoo(
*/
#if BF16_ENABLED
  if (std::is_same<DType, __half>::value ||
      std::is_same<DType, __nv_bfloat16>::value)
      std::is_same<DType, __hip_bfloat16>::value)
#else
  if (std::is_same<DType, __half>::value)
#endif // BF16_ENABLED
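The branch above exists because the vendor SpMM path is only taken for full-precision types; for __half and bfloat16 the code falls back to DGL's own kernels. A hedged sketch of the same compile-time dispatch idea, using stand-in types rather than the real __half / __hip_bfloat16, and with illustrative names (NeedsFallback, RunSpMM) that are not DGL APIs:

// Compile-time type dispatch sketch (plain C++17, no GPU required).
#include <type_traits>
#include <cstdio>

struct half_t {};       // stand-in for __half
struct bfloat16_t {};   // stand-in for __hip_bfloat16

template <typename DType>
constexpr bool NeedsFallback() {
  return std::is_same<DType, half_t>::value ||
         std::is_same<DType, bfloat16_t>::value;
}

template <typename DType>
void RunSpMM() {
  if constexpr (NeedsFallback<DType>()) {
    std::puts("reduced precision: use the hand-written kernel");
  } else {
    std::puts("fp32/fp64: use the vendor sparse library");
  }
}

int main() {
  RunSpMM<float>();       // library path
  RunSpMM<bfloat16_t>();  // fallback path
  return 0;
}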
...
...
@@ -638,7 +640,7 @@ void SpMMCoo(
      *efeat_data = efeat.Ptr<DType>();
  DType* out_data = out.Ptr<DType>();
  Idx *argu_data = argu.Ptr<Idx>(), *arge_data = arge.Ptr<Idx>();
  cudaStream_t stream = runtime::getCurrentCUDAStream();
  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  const int64_t N = coo.num_rows, M = coo.num_cols, E = coo.row->shape[0];
  int64_t *ubcast_off = nullptr, *ebcast_off = nullptr;
...
...
@@ -703,7 +705,7 @@ void SpMMCsr(
  Idx* argu_data = argu.Ptr<Idx>();
  Idx* arge_data = arge.Ptr<Idx>();
  cudaStream_t stream = runtime::getCurrentCUDAStream();
  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  int64_t *ubcast_off = nullptr, *ebcast_off = nullptr;
  int64_t len = bcast.out_len, lhs_len = bcast.lhs_len, rhs_len = bcast.rhs_len;
...
...
@@ -764,7 +766,7 @@ void SpMMCmpCsrHetero(
  Idx* argu_data = argu.Ptr<Idx>();
  Idx* arge_data = arge.Ptr<Idx>();
  cudaStream_t stream = runtime::getCurrentCUDAStream();
  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  int64_t *ubcast_off = nullptr, *ebcast_off = nullptr;
  int64_t len = bcast.out_len, lhs_len = bcast.lhs_len, rhs_len = bcast.rhs_len;
...
...
src/array/cuda/spmm.cu → src/array/cuda/spmm.hip
View file @ 6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file array/cuda/spmm.cu
...
...
@@ -6,9 +7,9 @@
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
#include "./functor.cuh"
#include "./ge_spmm.cuh"
#include "./spmm.cuh"
#include "functor.cuh"
#include "ge_spmm.cuh"
#include "spmm.cuh"
namespace dgl {
...
...
@@ -109,11 +110,11 @@ template void SpMMCsr<kDGLCUDA, int64_t, __half>(
const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out,
std::vector<NDArray> out_aux);
#if BF16_ENABLED
template void SpMMCsr<kDGLCUDA, int32_t, __nv_bfloat16>(
template void SpMMCsr<kDGLCUDA, int32_t, __hip_bfloat16>(
const std::string& op, const std::string& reduce, const BcastOff& bcast,
const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out,
std::vector<NDArray> out_aux);
template void SpMMCsr<kDGLCUDA, int64_t, __nv_bfloat16>(
template void SpMMCsr<kDGLCUDA, int64_t, __hip_bfloat16>(
const std::string& op, const std::string& reduce, const BcastOff& bcast,
const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out,
std::vector<NDArray> out_aux);
...
...
@@ -144,11 +145,11 @@ template void SpMMCoo<kDGLCUDA, int64_t, __half>(
const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out,
std::vector<NDArray> out_aux);
#if BF16_ENABLED
template void SpMMCoo<kDGLCUDA, int32_t, __nv_bfloat16>(
template void SpMMCoo<kDGLCUDA, int32_t, __hip_bfloat16>(
const std::string& op, const std::string& reduce, const BcastOff& bcast,
const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out,
std::vector<NDArray> out_aux);
template void SpMMCoo<kDGLCUDA, int64_t, __nv_bfloat16>(
template void SpMMCoo<kDGLCUDA, int64_t, __hip_bfloat16>(
const std::string& op, const std::string& reduce, const BcastOff& bcast,
const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out,
std::vector<NDArray> out_aux);
...
...
src/array/cuda/spmm_hetero.cu → src/array/cuda/spmm_hetero.hip
View file @ 6ac701f8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2020 by Contributors
* @file array/cuda/spmm.cu
...
...
@@ -6,9 +8,9 @@
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
#include "./functor.cuh"
#include "./ge_spmm.cuh"
#include "./spmm.cuh"
#include "functor.cuh"
#include "ge_spmm.cuh"
#include "spmm.cuh"
namespace dgl {
...
...
@@ -37,7 +39,7 @@ void SpMMCsrHetero(
std::vector<DType*> trans_out((*vec_out).size(), NULL);
bool use_legacy_cusparsemm =
      (CUDART_VERSION < 11000) && (reduce == "sum") &&
      (DTKRT_VERSION < 11000) && (reduce == "sum") &&
// legacy cuSPARSE does not care about NNZ, hence the argument "false".
((op == "copy_lhs" && cusparse_available<DType, IdType>(false)) ||
(op == "mul" && is_scalar_efeat &&
...
...
@@ -50,7 +52,7 @@ void SpMMCsrHetero(
if (m == 0) continue;
DType* out = static_cast<DType*>(device->AllocWorkspace(
vec_csr[0].indptr->ctx, m * n * sizeof(DType)));
      CUDA_CALL(cudaMemset(out, 0, m * n * sizeof(DType)));
      CUDA_CALL(hipMemset(out, 0, m * n * sizeof(DType)));
trans_out[ntype] = out;
}
}
...
...
@@ -111,7 +113,7 @@ void SpMMCsrHetero(
}
}
  cudaStream_t stream = runtime::getCurrentCUDAStream();
  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
for (dgl_type_t etype = 0; etype < ufeat_ntids.size(); ++etype) {
const dgl_type_t src_id = ufeat_ntids[etype];
const dgl_type_t dst_id = out_ntids[etype];
...
...
@@ -123,7 +125,7 @@ void SpMMCsrHetero(
cusparse_available<DType, IdType>(more_nnz)) { // cusparse
/* If CUDA is less than 11.0, put the output in trans_out for later
* transposition */
      DType* out = (CUDART_VERSION < 11000)
      DType* out = (DTKRT_VERSION < 11000)
? trans_out[dst_id]
: static_cast<DType*>((*vec_out)[dst_id]->data);
CusparseCsrmm2Hetero<DType, IdType>(
...
...
@@ -209,14 +211,14 @@ template void SpMMCsrHetero<kDGLCUDA, int64_t, __half>(
const std::vector<dgl_type_t>& ufeat_ntids,
const std::vector<dgl_type_t>& out_ntids);
#if BF16_ENABLED
template void SpMMCsrHetero<kDGLCUDA, int32_t, __nv_bfloat16>(
template void SpMMCsrHetero<kDGLCUDA, int32_t, __hip_bfloat16>(
const std::string& op, const std::string& reduce, const BcastOff& bcast,
const std::vector<CSRMatrix>& csr, const std::vector<NDArray>& ufeat,
const std::vector<NDArray>& efeat, std::vector<NDArray>* out,
std::vector<std::vector<NDArray>>* out_aux,
const std::vector<dgl_type_t>& ufeat_ntids,
const std::vector<dgl_type_t>& out_ntids);
template void SpMMCsrHetero<kDGLCUDA, int64_t, __nv_bfloat16>(
template void SpMMCsrHetero<kDGLCUDA, int64_t, __hip_bfloat16>(
const std::string& op, const std::string& reduce, const BcastOff& bcast,
const std::vector<CSRMatrix>& csr, const std::vector<NDArray>& ufeat,
const std::vector<NDArray>& efeat, std::vector<NDArray>* out,
...
...
src/array/cuda/utils.h
View file @ 6ac701f8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2020 by Contributors
* @file array/cuda/utils.h
...
...
@@ -11,7 +13,7 @@
#include <dgl/runtime/ndarray.h>
#include <dmlc/logging.h>
#include <cub/cub.cuh>
#include <hipcub/hipcub.hpp>
#include <type_traits>
#include "../../runtime/cuda/cuda_common.h"
...
...
@@ -90,7 +92,7 @@ inline int FindNumBlocks(int nblks, int max_nblks = -1) {
template <typename T>
__device__ __forceinline__ T _ldg(T* addr) {
#if __CUDA_ARCH__ >= 350
#if __HIP_DEVICE_COMPILE__
  return __ldg(addr);
#else
  return *addr;
...
...
@@ -126,7 +128,7 @@ __global__ void _FillKernel(DType* ptr, size_t length, DType val) {
/** @brief Fill the vector started from ptr of size length with val */
template <typename DType>
void _Fill(DType* ptr, size_t length, DType val) {
  cudaStream_t stream = runtime::getCurrentCUDAStream();
  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  int nt = FindNumThreads(length);
  int nb = (length + nt - 1) / nt;
// on x-axis, no need to worry about upperbound.
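_Fill computes a 1-D launch configuration and runs _FillKernel on the current stream. Below is a self-contained HIP sketch of the same grid-stride fill pattern; the fixed block size of 256 and the FillKernel / Fill names are illustrative, not the DGL implementation (which sizes blocks with FindNumThreads).

// Standalone sketch of the fill pattern, assuming a HIP toolchain.
#include <hip/hip_runtime.h>
#include <cstdio>

template <typename DType>
__global__ void FillKernel(DType* ptr, size_t length, DType val) {
  size_t tx = blockIdx.x * blockDim.x + threadIdx.x;
  const size_t stride_x = gridDim.x * blockDim.x;
  while (tx < length) {  // grid-stride loop covers any length
    ptr[tx] = val;
    tx += stride_x;
  }
}

template <typename DType>
void Fill(DType* ptr, size_t length, DType val, hipStream_t stream) {
  const int nt = 256;                                        // threads per block
  const int nb = static_cast<int>((length + nt - 1) / nt);   // blocks to cover length
  FillKernel<<<nb, nt, 0, stream>>>(ptr, length, val);
}

int main() {
  float* d = nullptr;
  hipMalloc(&d, 1024 * sizeof(float));
  Fill(d, static_cast<size_t>(1024), 3.5f, nullptr);  // nullptr = default stream
  float h = 0.f;
  hipMemcpy(&h, d, sizeof(float), hipMemcpyDeviceToHost);
  std::printf("d[0] = %f\n", h);  // expected 3.500000
  hipFree(d);
  return 0;
}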
...
...
@@ -185,8 +187,8 @@ template <typename IdType>
__global__ void _LinearSearchKernel(
    const IdType* indptr, const IdType* indices, const IdType* data,
    const IdType* row, const IdType* col, int64_t row_stride,
    int64_t col_stride, int64_t length, const __nv_bfloat16* weights,
    __nv_bfloat16 filler, __nv_bfloat16* out) {
    int64_t col_stride, int64_t length, const __hip_bfloat16* weights,
    __hip_bfloat16 filler, __hip_bfloat16* out) {
  int tx = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride_x = gridDim.x * blockDim.x;
  while (tx < length) {
...
...
@@ -204,7 +206,7 @@ __global__ void _LinearSearchKernel(
    } else {
      // If the result is saved in bf16, it should be fine to convert it to
      // float first
      out[tx] = weights ? weights[v] : __nv_bfloat16(static_cast<float>(v));
      out[tx] = weights ? weights[v] : __hip_bfloat16(static_cast<float>(v));
    }
    tx += stride_x;
  }
...
...
@@ -277,12 +279,12 @@ template <typename DType, typename BoolType>
void MaskSelect(
    runtime::DeviceAPI* device, const DGLContext& ctx, const DType* input,
    const BoolType* mask, DType* output, int64_t n, int64_t* rst,
    cudaStream_t stream) {
    hipStream_t stream) {
  size_t workspace_size = 0;
  CUDA_CALL(cub::DeviceSelect::Flagged(
  CUDA_CALL(hipcub::DeviceSelect::Flagged(
      nullptr, workspace_size, input, mask, output, rst, n, stream));
  void* workspace = device->AllocWorkspace(ctx, workspace_size);
  CUDA_CALL(cub::DeviceSelect::Flagged(
  CUDA_CALL(hipcub::DeviceSelect::Flagged(
      workspace, workspace_size, input, mask, output, rst, n, stream));
  device->FreeWorkspace(ctx, workspace);
}
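hipCUB keeps CUB's two-pass calling convention: the first call with a null temporary-storage pointer only reports the bytes required, and the second call with an allocated buffer does the work. A small stand-alone sketch of DeviceSelect::Flagged under that convention; the array contents and sizes are illustrative only.

// Two-pass hipCUB sketch, assuming ROCm with hipCUB available.
#include <hipcub/hipcub.hpp>
#include <hip/hip_runtime.h>
#include <cstdio>

int main() {
  const int n = 8;
  int h_in[n] = {10, 11, 12, 13, 14, 15, 16, 17};
  char h_flags[n] = {1, 0, 1, 0, 0, 1, 0, 1};  // keep items whose flag is 1

  int *d_in, *d_out, *d_num_selected;
  char* d_flags;
  hipMalloc(&d_in, n * sizeof(int));
  hipMalloc(&d_out, n * sizeof(int));
  hipMalloc(&d_flags, n * sizeof(char));
  hipMalloc(&d_num_selected, sizeof(int));
  hipMemcpy(d_in, h_in, n * sizeof(int), hipMemcpyHostToDevice);
  hipMemcpy(d_flags, h_flags, n * sizeof(char), hipMemcpyHostToDevice);

  // Pass 1: null temp storage -> hipCUB only reports the bytes it needs.
  size_t temp_bytes = 0;
  hipcub::DeviceSelect::Flagged(
      nullptr, temp_bytes, d_in, d_flags, d_out, d_num_selected, n);
  void* d_temp = nullptr;
  hipMalloc(&d_temp, temp_bytes);

  // Pass 2: the actual stream compaction.
  hipcub::DeviceSelect::Flagged(
      d_temp, temp_bytes, d_in, d_flags, d_out, d_num_selected, n);

  int num = 0;
  hipMemcpy(&num, d_num_selected, sizeof(int), hipMemcpyDeviceToHost);
  std::printf("selected %d items\n", num);  // expected 4

  hipFree(d_temp); hipFree(d_in); hipFree(d_out);
  hipFree(d_flags); hipFree(d_num_selected);
  return 0;
}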
...
...
@@ -290,7 +292,7 @@ void MaskSelect(
inline void* GetDevicePointer(runtime::NDArray array) {
  void* ptr = array->data;
  if (array.IsPinned()) {
    CUDA_CALL(cudaHostGetDevicePointer(&ptr, ptr, 0));
    CUDA_CALL(hipHostGetDevicePointer(&ptr, ptr, 0));
  }
  return ptr;
}
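GetDevicePointer returns an address the GPU can dereference directly when the NDArray lives in pinned host memory (zero-copy access). A hedged sketch of the underlying call on a buffer pinned with hipHostMalloc; the mapped-allocation flag, the buffer size, and the variable names are assumptions for illustration.

// Zero-copy sketch, assuming a HIP runtime.
#include <hip/hip_runtime.h>
#include <cstdio>

int main() {
  float* host_buf = nullptr;
  // Page-locked allocation; the Mapped flag requests a device-visible mapping.
  hipHostMalloc(reinterpret_cast<void**>(&host_buf), 16 * sizeof(float),
                hipHostMallocMapped);
  host_buf[0] = 42.f;

  float* dev_alias = nullptr;
  // The device-usable pointer may differ from the host pointer.
  hipHostGetDevicePointer(reinterpret_cast<void**>(&dev_alias), host_buf, 0);
  std::printf("host %p -> device %p\n", (void*)host_buf, (void*)dev_alias);

  // dev_alias can now be handed to kernels; accesses go over the interconnect.
  hipHostFree(host_buf);
  return 0;
}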
...
...
src/array/cuda/utils.cu → src/array/cuda/utils.hip
View file @ 6ac701f8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2020 by Contributors
* @file array/cuda/utils.cu
* @brief Utilities for CUDA kernels.
*/
#include <cub/cub.cuh>
#include <hipcub/hipcub.hpp>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
#include "utils.h"
namespace dgl {
namespace cuda {
...
...
@@ -17,11 +19,11 @@ bool AllTrue(int8_t* flags, int64_t length, const DGLContext& ctx) {
int8_t* rst = static_cast<int8_t*>(device->AllocWorkspace(ctx, 1));
// Call CUB's reduction
size_t workspace_size = 0;
  cudaStream_t stream = runtime::getCurrentCUDAStream();
  CUDA_CALL(cub::DeviceReduce::Min(
  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  CUDA_CALL(hipcub::DeviceReduce::Min(
      nullptr, workspace_size, flags, rst, length, stream));
  void* workspace = device->AllocWorkspace(ctx, workspace_size);
  CUDA_CALL(cub::DeviceReduce::Min(
  CUDA_CALL(hipcub::DeviceReduce::Min(
workspace, workspace_size, flags, rst, length, stream));
int8_t cpu_rst = GetCUDAScalar(device, ctx, rst);
device->FreeWorkspace(ctx, workspace);
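AllTrue works because the flags are 0/1 bytes: the minimum over the array is 1 exactly when every flag is set, so a single DeviceReduce::Min answers the question. A short hedged sketch of that trick with hipCUB; the input values and sizes are illustrative.

// All-true-via-Min sketch, assuming ROCm with hipCUB.
#include <hipcub/hipcub.hpp>
#include <hip/hip_runtime.h>
#include <cstdint>
#include <cstdio>

int main() {
  const int n = 6;
  int8_t h_flags[n] = {1, 1, 1, 0, 1, 1};  // one false entry
  int8_t *d_flags, *d_min;
  hipMalloc(&d_flags, n * sizeof(int8_t));
  hipMalloc(&d_min, sizeof(int8_t));
  hipMemcpy(d_flags, h_flags, n * sizeof(int8_t), hipMemcpyHostToDevice);

  // Usual two-pass convention: size query, then the reduction itself.
  size_t temp_bytes = 0;
  hipcub::DeviceReduce::Min(nullptr, temp_bytes, d_flags, d_min, n);
  void* d_temp = nullptr;
  hipMalloc(&d_temp, temp_bytes);
  hipcub::DeviceReduce::Min(d_temp, temp_bytes, d_flags, d_min, n);

  int8_t all_true = 0;
  hipMemcpy(&all_true, d_min, sizeof(int8_t), hipMemcpyDeviceToHost);
  std::printf("all true? %s\n", all_true ? "yes" : "no");  // expected "no"

  hipFree(d_temp); hipFree(d_flags); hipFree(d_min);
  return 0;
}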
...
...
src/array/cuda/uvm/array_index_select_uvm.cuh
View file @ 6ac701f8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2021 by Contributors
* @file array/cpu/array_index_select_uvm.cuh
...
...
src/array/cuda/uvm/array_index_select_uvm.cu → src/array/cuda/uvm/array_index_select_uvm.hip
View file @ 6ac701f8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2019-2022 by Contributors
* @file array/cuda/uvm/array_index_select_uvm.cu
...
...
@@ -8,7 +10,7 @@
#include "../../../runtime/cuda/cuda_common.h"
#include "../array_index_select.cuh"
#include "../utils.h"
#include "./array_index_select_uvm.cuh"
#include "array_index_select_uvm.cuh"
namespace dgl {
using runtime::NDArray;
...
...
@@ -17,7 +19,7 @@ namespace impl {
template <typename DType, typename IdType>
NDArray IndexSelectCPUFromGPU(NDArray array, IdArray index) {
  cudaStream_t stream = runtime::getCurrentCUDAStream();
  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const int64_t arr_len = array->shape[0];
const int64_t len = index->shape[0];
int64_t num_feat = 1;
...
...
@@ -78,7 +80,7 @@ template NDArray IndexSelectCPUFromGPU<int64_t, int64_t>(NDArray, IdArray);
template <typename DType, typename IdType>
void IndexScatterGPUToCPU(NDArray dest, IdArray index, NDArray source) {
  cudaStream_t stream = runtime::getCurrentCUDAStream();
  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const DType* source_data = static_cast<DType*>(source->data);
const IdType* idx_data = static_cast<IdType*>(index->data);
const int64_t arr_len = dest->shape[0];
...
...
src/array/filter.cc
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2021 by Contributors
* @file array/filter.cc
* @brief Object for selecting items in a set, or selecting items not in a set.
*/
#include "./filter.h"
#include "filter.h"
#include <dgl/packed_func_ext.h>
#include <dgl/runtime/packed_func.h>
...
...
@@ -23,7 +24,7 @@ DGL_REGISTER_GLOBAL("utils.filter._CAPI_DGLFilterCreateFromSet")
      IdArray array = args[0];
      auto ctx = array->ctx;
      // TODO(nv-dlasalle): Implement CPU version.
      if (ctx.device_type == kDGLCUDA) {
      if (ctx.device_type == kDGLCUDA || ctx.device_type == kDGLROCM) {
#ifdef DGL_USE_CUDA
        ATEN_ID_TYPE_SWITCH(array->dtype, IdType, {
          *rv = CreateSetFilter<kDGLCUDA, IdType>(array);
...
...
src/array/kernel.cc
View file @ 6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2020 by Contributors
* @file array/kernel.cc
...
...
@@ -7,7 +8,7 @@
#include <dgl/packed_func_ext.h>
#include "../c_api_common.h"
#include "./check.h"
#include "check.h"
#include "kernel_decl.h"
using namespace dgl::runtime;
...
...