OpenDAS / dgl / Commits / 6ac701f8

Commit 6ac701f8, authored Sep 13, 2024 by sangwzh
update src and graphbolt code

Parent: 1547bd93
Changes: 116
Showing 20 changed files with 320 additions and 261 deletions (+320, -261)
Changed files shown on this page:

src/array/cuda/negative_sampling.hip            +20   -18
src/array/cuda/rowwise_sampling.hip             +22   -20
src/array/cuda/rowwise_sampling_prob.hip        +32   -30
src/array/cuda/sddmm.cuh                        +7    -5
src/array/cuda/sddmm.hip                        +7    -6
src/array/cuda/sddmm_hetero_coo.hip             +4    -3
src/array/cuda/sddmm_hetero_csr.hip             +4    -3
src/array/cuda/segment_reduce.cuh               +8    -6
src/array/cuda/segment_reduce.hip               +12   -11
src/array/cuda/spmat_op_impl_coo.hip            +6    -4
src/array/cuda/spmat_op_impl_csr.hip            +43   -15
src/array/cuda/spmm.cuh                         +105  -103
src/array/cuda/spmm.hip                         +8    -7
src/array/cuda/spmm_hetero.hip                  +11   -9
src/array/cuda/utils.h                          +12   -10
src/array/cuda/utils.hip                        +7    -5
src/array/cuda/uvm/array_index_select_uvm.cuh   +2    -0
src/array/cuda/uvm/array_index_select_uvm.hip   +5    -3
src/array/filter.cc                             +3    -2
src/array/kernel.cc                             +2    -1
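The renamed *.hip files below are mechanical hipify output: the DGL logic is unchanged and only the platform-specific identifiers are rewritten. Summarizing the recurring substitutions visible in the diffs on this page (not an exhaustive hipify table):

- <curand_kernel.h> becomes <hiprand/hiprand_kernel.h>; curandStatePhilox4_32_10_t, curand_init, curand, curand4, and curand_uniform become their hiprand* counterparts.
- <cub/cub.cuh> becomes <hipcub/hipcub.hpp>; cub::DeviceRadixSort, DeviceScan, DeviceSelect, DeviceSegmentedSort, BlockScan, WarpReduce, and DoubleBuffer are referenced through the hipcub:: namespace.
- cudaStream_t, cudaEvent_t, and the cudaEvent*/cudaMemcpyAsync runtime calls become hipStream_t, hipEvent_t, hipEvent*/hipMemcpyAsync; runtime::getCurrentCUDAStream() becomes runtime::getCurrentHIPStreamMasqueradingAsCUDA().
- __nv_bfloat16 becomes __hip_bfloat16 in the template instantiations, and __shfl_down_sync(full_mask, val, offset) becomes HIP's __shfl_down(val, offset).
- "./" prefixes on local includes are dropped, every converted file gains the "// !!! This is a file automatically generated by hipify!!!" banner, and files containing kernels also gain #include "hip/hip_runtime.h".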
src/array/cuda/negative_sampling.cu → src/array/cuda/negative_sampling.hip  (+20, -18)

+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
  * Copyright (c) 2021 by Contributors
  * @file array/cuda/negative_sampling.cu
  * @brief rowwise sampling
  */
-#include <curand_kernel.h>
+#include <hiprand/hiprand_kernel.h>
 #include <dgl/array.h>
 #include <dgl/array_iterator.h>
 #include <dgl/random.h>
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
 #include "../../runtime/cuda/cuda_common.h"
-#include "./utils.h"
+#include "utils.h"

 using namespace dgl::runtime;

@@ -31,13 +33,13 @@ __global__ void _GlobalUniformNegativeSamplingKernel(
   int64_t tx = blockIdx.x * blockDim.x + threadIdx.x;
   const int stride_x = gridDim.x * blockDim.x;
-  curandStatePhilox4_32_10_t
-      rng;  // this allows generating 4 32-bit ints at a time
-  curand_init(random_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
+  hiprandStatePhilox4_32_10_t
+      rng;  // this allows generating 4 32-bit ints at a time
+  hiprand_init(random_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
   while (tx < num_samples) {
     for (int i = 0; i < num_trials; ++i) {
-      uint4 result = curand4(&rng);
+      uint4 result = hiprand4(&rng);
       // Turns out that result.x is always 0 with the above RNG.
       uint64_t y_hi = result.y >> 16;
       uint64_t y_lo = result.y & 0xFFFF;

@@ -88,7 +90,7 @@ struct IsNotMinusOne {
 template <typename IdType>
 void SortOrderedPairs(
     runtime::DeviceAPI* device, DGLContext ctx, IdType* major, IdType* minor,
-    IdType* tmp_major, IdType* tmp_minor, int64_t n, cudaStream_t stream) {
+    IdType* tmp_major, IdType* tmp_minor, int64_t n, hipStream_t stream) {
   // Sort ordered pairs in lexicographical order by two radix sorts since
   // cub's radix sorts are stable.
   // We need a 2*n auxiliary storage to store the results form the first radix

@@ -98,21 +100,21 @@ void SortOrderedPairs(
   void* tmp2 = nullptr;

   // Radix sort by minor key first, reorder the major key in the progress.
-  CUDA_CALL(cub::DeviceRadixSort::SortPairs(
+  CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
       tmp1, s1, minor, tmp_minor, major, tmp_major, n, 0, sizeof(IdType) * 8,
       stream));
   tmp1 = device->AllocWorkspace(ctx, s1);
-  CUDA_CALL(cub::DeviceRadixSort::SortPairs(
+  CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
       tmp1, s1, minor, tmp_minor, major, tmp_major, n, 0, sizeof(IdType) * 8,
       stream));

   // Radix sort by major key next.
-  CUDA_CALL(cub::DeviceRadixSort::SortPairs(
+  CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
       tmp2, s2, tmp_major, major, tmp_minor, minor, n, 0, sizeof(IdType) * 8,
       stream));
   tmp2 = (s2 > s1) ? device->AllocWorkspace(ctx, s2)
                    : tmp1;  // reuse buffer if s2 <= s1
-  CUDA_CALL(cub::DeviceRadixSort::SortPairs(
+  CUDA_CALL(hipcub::DeviceRadixSort::SortPairs(
       tmp2, s2, tmp_major, major, tmp_minor, minor, n, 0, sizeof(IdType) * 8,
       stream));

@@ -141,7 +143,7 @@ std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling(
   IdType* out_row_data = out_row.Ptr<IdType>();
   IdType* out_col_data = out_col.Ptr<IdType>();
   auto device = runtime::DeviceAPI::Get(ctx);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   const int nt = cuda::FindNumThreads(num_actual_samples);
   const int nb = (num_actual_samples + nt - 1) / nt;
   std::pair<IdArray, IdArray> result;

@@ -159,11 +161,11 @@ std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling(
   IsNotMinusOne<IdType> op;
   PairIterator<IdType> begin(row_data, col_data);
   PairIterator<IdType> out_begin(out_row_data, out_col_data);
-  CUDA_CALL(cub::DeviceSelect::If(
+  CUDA_CALL(hipcub::DeviceSelect::If(
       nullptr, tmp_size, begin, out_begin, num_out_cuda, num_actual_samples, op,
       stream));
   void* tmp = device->AllocWorkspace(ctx, tmp_size);
-  CUDA_CALL(cub::DeviceSelect::If(
+  CUDA_CALL(hipcub::DeviceSelect::If(
       tmp, tmp_size, begin, out_begin, num_out_cuda, num_actual_samples, op,
       stream));
   num_out = cuda::GetCUDAScalar(device, ctx, num_out_cuda);

@@ -181,25 +183,25 @@ std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling(
     size_t tmp_size_unique = 0;
     void* tmp_unique = nullptr;
-    CUDA_CALL(cub::DeviceSelect::Unique(
+    CUDA_CALL(hipcub::DeviceSelect::Unique(
         nullptr, tmp_size_unique, out_begin, unique_begin, num_out_cuda,
         num_out, stream));
     tmp_unique = (tmp_size_unique > tmp_size)
                      ? device->AllocWorkspace(ctx, tmp_size_unique)
                      : tmp;  // reuse buffer
-    CUDA_CALL(cub::DeviceSelect::Unique(
+    CUDA_CALL(hipcub::DeviceSelect::Unique(
         tmp_unique, tmp_size_unique, out_begin, unique_begin, num_out_cuda,
         num_out, stream));
     num_out = cuda::GetCUDAScalar(device, ctx, num_out_cuda);
-    num_out = std::min(num_samples, num_out);
+    num_out = ::min(num_samples, num_out);
     result = {
         unique_row.CreateView({num_out}, dtype),
         unique_col.CreateView({num_out}, dtype)};
     if (tmp_unique != tmp) device->FreeWorkspace(ctx, tmp_unique);
   } else {
-    num_out = std::min(num_samples, num_out);
+    num_out = ::min(num_samples, num_out);
     result = {
         out_row.CreateView({num_out}, dtype),
         out_col.CreateView({num_out}, dtype)};
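For readers unfamiliar with hiprand, here is a minimal standalone HIP program using the same Philox calls the hipified kernel above relies on: hiprand_init seeded with one subsequence per (block, thread) pair, and hiprand4 producing four 32-bit values per call. The kernel name, launch shape, and output handling are invented for illustration and are not part of DGL.

#include <hip/hip_runtime.h>
#include <hiprand/hiprand_kernel.h>
#include <cstdint>
#include <cstdio>

__global__ void philox_demo(uint64_t seed, unsigned int* out, int n) {
  int tx = blockIdx.x * blockDim.x + threadIdx.x;
  // Same seeding scheme as _GlobalUniformNegativeSamplingKernel:
  // seed mixed with the block index, thread index as the subsequence.
  hiprandStatePhilox4_32_10_t rng;
  hiprand_init(seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
  if (tx < n) {
    uint4 r = hiprand4(&rng);  // four 32-bit random integers per call
    out[tx] = r.y;             // the DGL comment notes result.x is always 0
  }
}

int main() {
  const int n = 8;
  unsigned int* d_out = nullptr;
  hipMalloc(&d_out, n * sizeof(unsigned int));
  hipLaunchKernelGGL(philox_demo, dim3(1), dim3(n), 0, 0, 123ULL, d_out, n);
  unsigned int h_out[n];
  hipMemcpy(h_out, d_out, sizeof(h_out), hipMemcpyDeviceToHost);
  for (int i = 0; i < n; ++i) printf("%u\n", h_out[i]);
  hipFree(d_out);
  return 0;
}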
src/array/cuda/rowwise_sampling.cu → src/array/cuda/rowwise_sampling.hip  (+22, -20)

+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
  * Copyright (c) 2021 by Contributors
  * @file array/cuda/rowwise_sampling.cu
  * @brief uniform rowwise sampling
  */
-#include <curand_kernel.h>
+#include <hiprand/hiprand_kernel.h>
 #include <dgl/random.h>
 #include <dgl/runtime/device_api.h>
 #include <dgl/runtime/tensordispatch.h>
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
 #include <numeric>
-#include "../../array/cuda/atomic.cuh"
+#include "atomic.cuh"
 #include "../../runtime/cuda/cuda_common.h"
-#include "./utils.h"
+#include "utils.h"

 using namespace dgl::cuda;
 using namespace dgl::aten::cuda;

@@ -126,8 +128,8 @@ __global__ void _CSRRowWiseSampleUniformKernel(
   const int64_t last_row =
       min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
-  curandStatePhilox4_32_10_t rng;
-  curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
+  hiprandStatePhilox4_32_10_t rng;
+  hiprand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
   while (out_row < last_row) {
     const int64_t row = in_rows[out_row];

@@ -151,7 +153,7 @@ __global__ void _CSRRowWiseSampleUniformKernel(
     __syncthreads();
     for (int idx = num_picks + threadIdx.x; idx < deg; idx += BLOCK_SIZE) {
-      const int num = curand(&rng) % (idx + 1);
+      const int num = hiprand(&rng) % (idx + 1);
       if (num < num_picks) {
         // use max so as to achieve the replacement order the serial
         // algorithm would have

@@ -204,8 +206,8 @@ __global__ void _CSRRowWiseSampleUniformReplaceKernel(
   const int64_t last_row =
       min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
-  curandStatePhilox4_32_10_t rng;
-  curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
+  hiprandStatePhilox4_32_10_t rng;
+  hiprand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
   while (out_row < last_row) {
     const int64_t row = in_rows[out_row];

@@ -216,7 +218,7 @@ __global__ void _CSRRowWiseSampleUniformReplaceKernel(
     if (deg > 0) {
       // each thread then blindly copies in rows only if deg > 0.
       for (int idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) {
-        const int64_t edge = curand(&rng) % deg;
+        const int64_t edge = hiprand(&rng) % deg;
         const int64_t out_idx = out_row_start + idx;
         out_rows[out_idx] = row;
         out_cols[out_idx] = in_index[in_row_start + edge];

@@ -237,7 +239,7 @@ COOMatrix _CSRRowWiseSamplingUniform(
     CSRMatrix mat, IdArray rows, const int64_t num_picks, const bool replace) {
   const auto& ctx = rows->ctx;
   auto device = runtime::DeviceAPI::Get(ctx);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   const int64_t num_rows = rows->shape[0];
   const IdType* const slice_rows = static_cast<const IdType*>(rows->data);

@@ -279,16 +281,16 @@ COOMatrix _CSRRowWiseSamplingUniform(
   IdType* out_ptr = static_cast<IdType*>(
       device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType)));
   size_t prefix_temp_size = 0;
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
       nullptr, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream));
   void* prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size);
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
       prefix_temp, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream));
   device->FreeWorkspace(ctx, prefix_temp);
   device->FreeWorkspace(ctx, out_deg);

-  cudaEvent_t copyEvent;
-  CUDA_CALL(cudaEventCreate(&copyEvent));
+  hipEvent_t copyEvent;
+  CUDA_CALL(hipEventCreate(&copyEvent));

   NDArray new_len_tensor;
   if (TensorDispatcher::Global()->IsAvailable()) {

@@ -301,10 +303,10 @@ COOMatrix _CSRRowWiseSamplingUniform(
   }
   // copy using the internal current stream
-  CUDA_CALL(cudaMemcpyAsync(
+  CUDA_CALL(hipMemcpyAsync(
       new_len_tensor->data, out_ptr + num_rows, sizeof(IdType),
-      cudaMemcpyDeviceToHost, stream));
-  CUDA_CALL(cudaEventRecord(copyEvent, stream));
+      hipMemcpyDeviceToHost, stream));
+  CUDA_CALL(hipEventRecord(copyEvent, stream));

   const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000);

@@ -329,8 +331,8 @@ COOMatrix _CSRRowWiseSamplingUniform(
   device->FreeWorkspace(ctx, out_ptr);

   // wait for copying `new_len` to finish
-  CUDA_CALL(cudaEventSynchronize(copyEvent));
-  CUDA_CALL(cudaEventDestroy(copyEvent));
+  CUDA_CALL(hipEventSynchronize(copyEvent));
+  CUDA_CALL(hipEventDestroy(copyEvent));

   const IdType new_len = static_cast<const IdType*>(new_len_tensor->data)[0];
   picked_row = picked_row.CreateView({new_len}, picked_row->dtype);
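The ExclusiveSum pairs above follow the standard CUB/hipCUB two-phase convention: the first call receives a null workspace pointer and only reports the required temporary-storage size, and the second call performs the scan. A self-contained hipCUB sketch of that pattern, using plain hipMalloc instead of DGL's AllocWorkspace (that substitution is an assumption of this example, not DGL code):

#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>
#include <cstdio>

int main() {
  const int n = 5;
  int h_deg[n] = {3, 1, 4, 1, 5};
  int *d_deg = nullptr, *d_ptr = nullptr;
  hipMalloc(&d_deg, (n + 1) * sizeof(int));
  hipMalloc(&d_ptr, (n + 1) * sizeof(int));
  hipMemset(d_deg, 0, (n + 1) * sizeof(int));
  hipMemcpy(d_deg, h_deg, sizeof(h_deg), hipMemcpyHostToDevice);

  // Phase 1: null workspace pointer, hipCUB only reports the bytes needed.
  void* d_temp = nullptr;
  size_t temp_bytes = 0;
  hipcub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, d_deg, d_ptr, n + 1);
  hipMalloc(&d_temp, temp_bytes);

  // Phase 2: the real scan. Scanning n + 1 items turns per-row degrees into
  // a CSR-style row-pointer array whose last entry is the total (14 here).
  hipcub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, d_deg, d_ptr, n + 1);

  int h_ptr[n + 1];
  hipMemcpy(h_ptr, d_ptr, sizeof(h_ptr), hipMemcpyDeviceToHost);
  for (int i = 0; i <= n; ++i) printf("%d ", h_ptr[i]);  // 0 3 4 8 9 14
  printf("\n");

  hipFree(d_temp);
  hipFree(d_deg);
  hipFree(d_ptr);
  return 0;
}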
src/array/cuda/rowwise_sampling_prob.cu → src/array/cuda/rowwise_sampling_prob.hip  (+32, -30)

+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
  * Copyright (c) 2022 by Contributors
  * @file array/cuda/rowwise_sampling_prob.cu

@@ -6,20 +8,20 @@
  * sampling code rowwise_sampling.cu.
  * @author pengqirong (OPPO), dlasalle and Xin from Nvidia.
  */
-#include <curand_kernel.h>
+#include <hiprand/hiprand_kernel.h>
 #include <dgl/random.h>
 #include <dgl/runtime/device_api.h>
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
 #include <numeric>
-#include "../../array/cuda/atomic.cuh"
+#include "atomic.cuh"
 #include "../../runtime/cuda/cuda_common.h"
-#include "./utils.h"
+#include "utils.h"

 // require CUB 1.17 to use DeviceSegmentedSort
-static_assert(
-    CUB_VERSION >= 101700, "Require CUB >= 1.17 to use DeviceSegmentedSort");
+// static_assert(
+//     CUB_VERSION >= 101700, "Require CUB >= 1.17 to use DeviceSegmentedSort");

 namespace dgl {
 using namespace cuda;

@@ -159,8 +161,8 @@ __global__ void _CSRAResValueKernel(
   const int64_t last_row =
       min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
-  curandStatePhilox4_32_10_t rng;
-  curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
+  hiprandStatePhilox4_32_10_t rng;
+  hiprand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
   while (out_row < last_row) {
     const int64_t row = in_rows[out_row];

@@ -179,7 +181,7 @@ __global__ void _CSRAResValueKernel(
           prob, data, idx, in_row_start, &item_prob);
       // compute A-Res value
       ares[ares_idx] = static_cast<FloatType>(
-          __powf(curand_uniform(&rng), 1.0f / item_prob));
+          __powf(hiprand_uniform(&rng), 1.0f / item_prob));
       ares_idxs[ares_idx] = static_cast<IdType>(in_idx);
     }
   }

@@ -317,8 +319,8 @@ __global__ void _CSRRowWiseSampleReplaceKernel(
   const int64_t last_row =
       min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
-  curandStatePhilox4_32_10_t rng;
-  curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
+  hiprandStatePhilox4_32_10_t rng;
+  hiprand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
   while (out_row < last_row) {
     const int64_t row = in_rows[out_row];

@@ -330,7 +332,7 @@ __global__ void _CSRRowWiseSampleReplaceKernel(
     if (deg > 0) {
       // Specialize BlockScan for a 1D block of BLOCK_SIZE threads
-      typedef cub::BlockScan<FloatType, BLOCK_SIZE> BlockScan;
+      typedef hipcub::BlockScan<FloatType, BLOCK_SIZE> BlockScan;
       // Allocate shared memory for BlockScan
       __shared__ typename BlockScan::TempStorage temp_storage;
       // Initialize running total

@@ -362,10 +364,10 @@ __global__ void _CSRRowWiseSampleReplaceKernel(
       for (int64_t idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) {
         // get random value
         FloatType sum = cdf[cdf_row_start + deg - 1];
-        FloatType rand = static_cast<FloatType>(curand_uniform(&rng) * sum);
+        FloatType rand = static_cast<FloatType>(hiprand_uniform(&rng) * sum);
         // get the offset of the first value within cdf array which is greater
         // than random value.
-        int64_t item = cub::UpperBound<FloatType*, int64_t, FloatType>(
+        int64_t item = hipcub::UpperBound<FloatType*, int64_t, FloatType>(
             &cdf[cdf_row_start], deg, rand);
         item = min(item, deg - 1);
         // get in and out index

@@ -411,7 +413,7 @@ COOMatrix COOGeneralRemoveIf(const COOMatrix& coo, MaskGen maskgen) {
   IdType* new_row_data = new_row.Ptr<IdType>();
   IdType* new_col_data = new_col.Ptr<IdType>();
   IdType* new_eid_data = new_eid.Ptr<IdType>();
-  auto stream = runtime::getCurrentCUDAStream();
+  auto stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   auto device = runtime::DeviceAPI::Get(ctx);
   int8_t* flags = static_cast<int8_t*>(device->AllocWorkspace(ctx, nnz));

@@ -441,7 +443,7 @@ COOMatrix _COORemoveIf(
     const COOMatrix& coo, const NDArray& values, DType criteria) {
   const DType* val = values.Ptr<DType>();
   auto maskgen = [val, criteria](
-                     int nb, int nt, cudaStream_t stream, int64_t nnz,
+                     int nb, int nt, hipStream_t stream, int64_t nnz,
                      const IdType* data, int8_t* flags) {
     CUDA_KERNEL_CALL(
         (_GenerateFlagsKernel<IdType, DType, int8_t>), nb, nt, 0, stream, nnz,

@@ -481,7 +483,7 @@ COOMatrix _CSRRowWiseSampling(
     const FloatArray& prob, bool replace) {
   const auto& ctx = rows->ctx;
   auto device = runtime::DeviceAPI::Get(ctx);
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   const int64_t num_rows = rows->shape[0];
   const IdType* const slice_rows = static_cast<const IdType*>(rows->data);

@@ -530,10 +532,10 @@ COOMatrix _CSRRowWiseSampling(
   IdType* temp_ptr = static_cast<IdType*>(
       device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType)));
   size_t prefix_temp_size = 0;
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
       nullptr, prefix_temp_size, temp_deg, temp_ptr, num_rows + 1, stream));
   void* prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size);
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
       prefix_temp, prefix_temp_size, temp_deg, temp_ptr, num_rows + 1, stream));
   device->FreeWorkspace(ctx, prefix_temp);
   device->FreeWorkspace(ctx, temp_deg);

@@ -551,16 +553,16 @@ COOMatrix _CSRRowWiseSampling(
   IdType* out_ptr = static_cast<IdType*>(
       device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType)));
   prefix_temp_size = 0;
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
       nullptr, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream));
   prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size);
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+  CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(
      prefix_temp, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream));
   device->FreeWorkspace(ctx, prefix_temp);
   device->FreeWorkspace(ctx, out_deg);

-  cudaEvent_t copyEvent;
-  CUDA_CALL(cudaEventCreate(&copyEvent));
+  hipEvent_t copyEvent;
+  CUDA_CALL(hipEventCreate(&copyEvent));
   // TODO(dlasalle): use pinned memory to overlap with the actual sampling, and
   // wait on a cudaevent
   IdType new_len;

@@ -568,7 +570,7 @@ COOMatrix _CSRRowWiseSampling(
   device->CopyDataFromTo(
       out_ptr, num_rows * sizeof(new_len), &new_len, 0, sizeof(new_len), ctx,
       DGLContext{kDGLCPU, 0}, mat.indptr->dtype);
-  CUDA_CALL(cudaEventRecord(copyEvent, stream));
+  CUDA_CALL(hipEventRecord(copyEvent, stream));

   // allocate workspace
   // 1) for w/ replacement, it's a global buffer to store cdf segments (one

@@ -612,16 +614,16 @@ COOMatrix _CSRRowWiseSampling(
   IdType* sort_temp_idxs = static_cast<IdType*>(
       device->AllocWorkspace(ctx, temp_len * sizeof(IdType)));

-  cub::DoubleBuffer<FloatType> sort_keys(temp, sort_temp);
-  cub::DoubleBuffer<IdType> sort_values(temp_idxs, sort_temp_idxs);
+  hipcub::DoubleBuffer<FloatType> sort_keys(temp, sort_temp);
+  hipcub::DoubleBuffer<IdType> sort_values(temp_idxs, sort_temp_idxs);

   void* d_temp_storage = nullptr;
   size_t temp_storage_bytes = 0;
-  CUDA_CALL(cub::DeviceSegmentedSort::SortPairsDescending(
+  CUDA_CALL(hipcub::DeviceSegmentedSort::SortPairsDescending(
       d_temp_storage, temp_storage_bytes, sort_keys, sort_values, temp_len,
       num_rows, temp_ptr, temp_ptr + 1, stream));
   d_temp_storage = device->AllocWorkspace(ctx, temp_storage_bytes);
-  CUDA_CALL(cub::DeviceSegmentedSort::SortPairsDescending(
+  CUDA_CALL(hipcub::DeviceSegmentedSort::SortPairsDescending(
       d_temp_storage, temp_storage_bytes, sort_keys, sort_values, temp_len,
       num_rows, temp_ptr, temp_ptr + 1, stream));
   device->FreeWorkspace(ctx, d_temp_storage);

@@ -641,8 +643,8 @@ COOMatrix _CSRRowWiseSampling(
   device->FreeWorkspace(ctx, out_ptr);

   // wait for copying `new_len` to finish
-  CUDA_CALL(cudaEventSynchronize(copyEvent));
-  CUDA_CALL(cudaEventDestroy(copyEvent));
+  CUDA_CALL(hipEventSynchronize(copyEvent));
+  CUDA_CALL(hipEventDestroy(copyEvent));

   picked_row = picked_row.CreateView({new_len}, picked_row->dtype);
   picked_col = picked_col.CreateView({new_len}, picked_col->dtype);
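Both sampling files use the same event trick for bringing the device-computed length back to the host: record a hipEvent_t right after the asynchronous copy and later synchronize on that event only, rather than on the whole stream or device. A minimal standalone sketch of the pattern; the variable names are made up for illustration and the error handling of DGL's CUDA_CALL macro is omitted:

#include <hip/hip_runtime.h>
#include <cstdio>

int main() {
  int* d_val = nullptr;
  hipMalloc(&d_val, sizeof(int));
  int h_init = 42;
  hipMemcpy(d_val, &h_init, sizeof(int), hipMemcpyHostToDevice);

  hipStream_t stream;
  hipStreamCreate(&stream);

  // Record an event immediately after the async copy; waiting on the event
  // is enough to make the host-side value safe to read. This mirrors the
  // copyEvent usage above. For real overlap the host buffer would need to be
  // pinned (hipHostMalloc), which is the TODO noted in the DGL source.
  int h_val = 0;
  hipEvent_t copy_event;
  hipEventCreate(&copy_event);
  hipMemcpyAsync(&h_val, d_val, sizeof(int), hipMemcpyDeviceToHost, stream);
  hipEventRecord(copy_event, stream);

  // ... other independent work could be enqueued here ...

  hipEventSynchronize(copy_event);
  hipEventDestroy(copy_event);
  printf("value copied back: %d\n", h_val);  // expected 42

  hipStreamDestroy(stream);
  hipFree(d_val);
  return 0;
}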
src/array/cuda/sddmm.cuh  (+7, -5)

+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
  * Copyright (c) 2020 by Contributors
  * @file array/cuda/sddmm.cuh

@@ -10,8 +12,8 @@
 #include "../../runtime/cuda/cuda_common.h"
 #include "../selector.h"
-#include "./functor.cuh"
-#include "./utils.h"
+#include "functor.cuh"
+#include "utils.h"
 #include "atomic.cuh"
 #include "bf16.cuh"
 #include "fp16.cuh"

@@ -178,7 +180,7 @@ __global__ void SDDMMCooTreeReduceKernel(
       }
 #pragma unroll
       for (int offset = 16; offset > 0; offset /= 2)
-        val += __shfl_down_sync(full_mask, val, offset);
+        val += __shfl_down(val, offset);
       if (tx == 0) outoff[i] = val;
     }
   }

@@ -275,7 +277,7 @@ void SDDMMCoo(
   const DType* lhs_data = lhs.Ptr<DType>();
   const DType* rhs_data = rhs.Ptr<DType>();
   DType* out_data = out.Ptr<DType>();
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   int64_t *lhs_off = nullptr, *rhs_off = nullptr;
   int64_t len = bcast.out_len, lhs_len = bcast.lhs_len, rhs_len = bcast.rhs_len;

@@ -337,7 +339,7 @@ void SDDMMCsr(
   const DType* lhs_data = lhs.Ptr<DType>();
   const DType* rhs_data = rhs.Ptr<DType>();
   DType* out_data = out.Ptr<DType>();
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   int64_t N = csr.num_rows, M = csr.num_cols, E = csr.indices->shape[0];
   int64_t *lhs_off = nullptr, *rhs_off = nullptr;
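The one change in sddmm.cuh that is not a header or stream rename is the warp reduction: CUDA's __shfl_down_sync(full_mask, val, offset) becomes HIP's unsynchronized __shfl_down(val, offset). Below is a standalone sketch of the same 32-lane tree reduction; note that the fixed offsets (16, 8, 4, 2, 1) cover 32 lanes, matching the kernel above, even though AMD wavefronts are typically 64 lanes wide. The kernel name and launch shape are illustrative only.

#include <hip/hip_runtime.h>
#include <cstdio>

__global__ void warp_sum(const float* in, float* out) {
  int tx = threadIdx.x;
  float val = in[tx];
  // Tree reduction using HIP's __shfl_down: each step folds the upper half
  // of the remaining range onto the lower half; lane 0 ends with the sum of
  // lanes 0..31.
  for (int offset = 16; offset > 0; offset /= 2)
    val += __shfl_down(val, offset);
  if (tx == 0) *out = val;
}

int main() {
  const int n = 32;
  float h_in[n];
  for (int i = 0; i < n; ++i) h_in[i] = 1.0f;
  float *d_in = nullptr, *d_out = nullptr;
  hipMalloc(&d_in, n * sizeof(float));
  hipMalloc(&d_out, sizeof(float));
  hipMemcpy(d_in, h_in, sizeof(h_in), hipMemcpyHostToDevice);
  hipLaunchKernelGGL(warp_sum, dim3(1), dim3(n), 0, 0, d_in, d_out);
  float h_out = 0.0f;
  hipMemcpy(&h_out, d_out, sizeof(float), hipMemcpyDeviceToHost);
  printf("sum = %f\n", h_out);  // expected 32.0
  hipFree(d_in);
  hipFree(d_out);
  return 0;
}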
src/array/cuda/sddmm.cu → src/array/cuda/sddmm.hip  (+7, -6)

+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2020 by Contributors
  * @file array/cuda/sddmm.cu

@@ -5,8 +6,8 @@
  */
 #include <dgl/array.h>

-#include "./functor.cuh"
-#include "./sddmm.cuh"
+#include "functor.cuh"
+#include "sddmm.cuh"

 namespace dgl {
 namespace aten {

@@ -48,10 +49,10 @@ template void SDDMMCsr<kDGLCUDA, int64_t, __half>(
     const std::string& op, const BcastOff& bcast, const CSRMatrix& csr,
     NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target);
 #if BF16_ENABLED
-template void SDDMMCsr<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void SDDMMCsr<kDGLCUDA, int32_t, __hip_bfloat16>(
     const std::string& op, const BcastOff& bcast, const CSRMatrix& csr,
     NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target);
-template void SDDMMCsr<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void SDDMMCsr<kDGLCUDA, int64_t, __hip_bfloat16>(
     const std::string& op, const BcastOff& bcast, const CSRMatrix& csr,
     NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target);
 #endif  // BF16_ENABLED

@@ -75,10 +76,10 @@ template void SDDMMCoo<kDGLCUDA, int64_t, __half>(
     const std::string& op, const BcastOff& bcast, const COOMatrix& coo,
     NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target);
 #if BF16_ENABLED
-template void SDDMMCoo<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void SDDMMCoo<kDGLCUDA, int32_t, __hip_bfloat16>(
     const std::string& op, const BcastOff& bcast, const COOMatrix& coo,
     NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target);
-template void SDDMMCoo<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void SDDMMCoo<kDGLCUDA, int64_t, __hip_bfloat16>(
     const std::string& op, const BcastOff& bcast, const COOMatrix& coo,
     NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target);
 #endif  // BF16_ENABLED
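The bfloat16 instantiations switch from CUDA's __nv_bfloat16 to __hip_bfloat16. A small round-trip sketch of that type, assuming a ROCm version that ships <hip/hip_bf16.h> with the __float2bfloat16 / __bfloat162float conversions (that header and those helpers are an assumption of this example, not something the diff itself shows):

#include <hip/hip_runtime.h>
#include <hip/hip_bf16.h>
#include <cstdio>

__global__ void bf16_roundtrip(const float* in, float* out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    __hip_bfloat16 b = __float2bfloat16(in[i]);  // narrow to bfloat16
    out[i] = __bfloat162float(b);                // widen back to float
  }
}

int main() {
  const int n = 4;
  float h_in[n] = {1.0f, 3.14159f, -2.5f, 1e-3f};
  float *d_in = nullptr, *d_out = nullptr;
  hipMalloc(&d_in, n * sizeof(float));
  hipMalloc(&d_out, n * sizeof(float));
  hipMemcpy(d_in, h_in, sizeof(h_in), hipMemcpyHostToDevice);
  hipLaunchKernelGGL(bf16_roundtrip, dim3(1), dim3(n), 0, 0, d_in, d_out, n);
  float h_out[n];
  hipMemcpy(h_out, d_out, sizeof(h_out), hipMemcpyDeviceToHost);
  for (int i = 0; i < n; ++i) printf("%f -> %f\n", h_in[i], h_out[i]);
  hipFree(d_in);
  hipFree(d_out);
  return 0;
}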
src/array/cuda/sddmm_hetero_coo.cu → src/array/cuda/sddmm_hetero_coo.hip  (+4, -3)

+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2020 by Contributors
  * @file array/cuda/sddmm.cu

@@ -5,7 +6,7 @@
  */
 #include <dgl/array.h>

-#include "./sddmm.cuh"
+#include "sddmm.cuh"

 namespace dgl {
 namespace aten {

@@ -49,13 +50,13 @@ template void SDDMMCooHetero<kDGLCUDA, int64_t, __half>(
     int rhs_target, const std::vector<dgl_type_t>& in_eid,
     const std::vector<dgl_type_t>& out_eid);
 #if BF16_ENABLED
-template void SDDMMCooHetero<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void SDDMMCooHetero<kDGLCUDA, int32_t, __hip_bfloat16>(
     const std::string& op, const BcastOff& bcast,
     const std::vector<COOMatrix>& vec_coo, const std::vector<NDArray>& lhs,
     const std::vector<NDArray>& rhs, std::vector<NDArray> out, int lhs_target,
     int rhs_target, const std::vector<dgl_type_t>& in_eid,
     const std::vector<dgl_type_t>& out_eid);
-template void SDDMMCooHetero<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void SDDMMCooHetero<kDGLCUDA, int64_t, __hip_bfloat16>(
     const std::string& op, const BcastOff& bcast,
     const std::vector<COOMatrix>& vec_coo, const std::vector<NDArray>& lhs,
     const std::vector<NDArray>& rhs, std::vector<NDArray> out, int lhs_target,
src/array/cuda/sddmm_hetero_csr.cu → src/array/cuda/sddmm_hetero_csr.hip  (+4, -3)

+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2020 by Contributors
  * @file array/cuda/sddmm.cu

@@ -5,7 +6,7 @@
  */
 #include <dgl/array.h>

-#include "./sddmm.cuh"
+#include "sddmm.cuh"

 namespace dgl {
 namespace aten {

@@ -48,13 +49,13 @@ template void SDDMMCsrHetero<kDGLCUDA, int64_t, __half>(
     int rhs_target, const std::vector<dgl_type_t>& in_eid,
     const std::vector<dgl_type_t>& out_eid);
 #if BF16_ENABLED
-template void SDDMMCsrHetero<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void SDDMMCsrHetero<kDGLCUDA, int32_t, __hip_bfloat16>(
     const std::string& op, const BcastOff& bcast,
     const std::vector<CSRMatrix>& vec_csr, const std::vector<NDArray>& lhs,
     const std::vector<NDArray>& rhs, std::vector<NDArray> out, int lhs_target,
     int rhs_target, const std::vector<dgl_type_t>& in_eid,
     const std::vector<dgl_type_t>& out_eid);
-template void SDDMMCsrHetero<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void SDDMMCsrHetero<kDGLCUDA, int64_t, __hip_bfloat16>(
     const std::string& op, const BcastOff& bcast,
     const std::vector<CSRMatrix>& vec_csr, const std::vector<NDArray>& lhs,
     const std::vector<NDArray>& rhs, std::vector<NDArray> out, int lhs_target,
src/array/cuda/segment_reduce.cuh  (+8, -6)

+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
  * Copyright (c) 2020 by Contributors
  * @file array/cuda/segment_reduce.cuh

@@ -10,8 +12,8 @@
 #include <vector>

 #include "../../runtime/cuda/cuda_common.h"
-#include "./atomic.cuh"
-#include "./utils.h"
+#include "atomic.cuh"
+#include "utils.h"

 namespace dgl {

@@ -125,7 +127,7 @@ void SegmentReduce(NDArray feat, NDArray offsets, NDArray out, NDArray arg) {
   DType* out_data = out.Ptr<DType>();
   IdType* arg_data = arg.Ptr<IdType>();
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   int64_t n = out->shape[0];
   int64_t dim = 1;
   for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i];

@@ -155,7 +157,7 @@ void ScatterAdd(NDArray feat, NDArray idx, NDArray out) {
   const IdType* idx_data = idx.Ptr<IdType>();
   DType* out_data = out.Ptr<DType>();
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   int64_t n = feat->shape[0];
   int64_t dim = 1;
   for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i];

@@ -186,7 +188,7 @@ void UpdateGradMinMax_hetero(
     const std::vector<NDArray>& list_feat, const std::vector<NDArray>& list_idx,
     const std::vector<NDArray>& list_idx_types,
     std::vector<NDArray>* list_out) {
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   if (op == "copy_lhs" || op == "copy_rhs") {
     std::vector<std::vector<dgl_id_t>> src_dst_ntypes(
         graph->NumVertexTypes(), std::vector<dgl_id_t>());

@@ -239,7 +241,7 @@ void BackwardSegmentCmp(NDArray feat, NDArray arg, NDArray out) {
   const IdType* arg_data = arg.Ptr<IdType>();
   DType* out_data = out.Ptr<DType>();
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   int64_t n = feat->shape[0];
   int64_t dim = 1;
   for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i];
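Every host-side function in this header now obtains its stream via runtime::getCurrentHIPStreamMasqueradingAsCUDA(); the "MasqueradingAsCUDA" suffix appears to follow the PyTorch ROCm convention of exposing HIP streams behind CUDA-named wrappers, so callers keep passing an explicit stream into every kernel launch. A minimal HIP sketch of launching work on an explicit, non-default stream, independent of DGL's runtime wrappers and CUDA_KERNEL_CALL macro:

#include <hip/hip_runtime.h>
#include <cstdio>

__global__ void scale(float* data, float factor, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= factor;
}

int main() {
  const int n = 256;
  float h[n];
  for (int i = 0; i < n; ++i) h[i] = 1.0f;
  float* d = nullptr;
  hipMalloc(&d, n * sizeof(float));
  hipMemcpy(d, h, sizeof(h), hipMemcpyHostToDevice);

  // Every launch takes the stream explicitly, the same way the DGL code
  // threads its `stream` variable into each CUDA_KERNEL_CALL.
  hipStream_t stream;
  hipStreamCreate(&stream);
  const int nt = 128;
  const int nb = (n + nt - 1) / nt;
  hipLaunchKernelGGL(scale, dim3(nb), dim3(nt), 0, stream, d, 2.0f, n);
  hipStreamSynchronize(stream);

  hipMemcpy(h, d, sizeof(h), hipMemcpyDeviceToHost);
  printf("h[0] = %f\n", h[0]);  // expected 2.0
  hipStreamDestroy(stream);
  hipFree(d);
  return 0;
}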
src/array/cuda/segment_reduce.cu → src/array/cuda/segment_reduce.hip  (+12, -11)

+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2020 by Contributors
  * @file array/cuda/segment_reduce.cu

@@ -6,9 +7,9 @@
 #include <dgl/array.h>
 #include <dgl/base_heterograph.h>

-#include "./functor.cuh"
-#include "./segment_reduce.cuh"
-#include "./utils.h"
+#include "functor.cuh"
+#include "segment_reduce.cuh"
+#include "utils.h"

 namespace dgl {

@@ -60,10 +61,10 @@ template void SegmentReduce<kDGLCUDA, int64_t, __half>(
     const std::string& op, NDArray feat, NDArray offsets, NDArray out,
     NDArray arg);
 #if BF16_ENABLED
-template void SegmentReduce<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void SegmentReduce<kDGLCUDA, int32_t, __hip_bfloat16>(
     const std::string& op, NDArray feat, NDArray offsets, NDArray out,
     NDArray arg);
-template void SegmentReduce<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void SegmentReduce<kDGLCUDA, int64_t, __hip_bfloat16>(
     const std::string& op, NDArray feat, NDArray offsets, NDArray out,
     NDArray arg);
 #endif  // BF16_ENABLED

@@ -85,9 +86,9 @@ template void ScatterAdd<kDGLCUDA, int32_t, __half>(
 template void ScatterAdd<kDGLCUDA, int64_t, __half>(
     NDArray feat, NDArray idx, NDArray out);
 #if BF16_ENABLED
-template void ScatterAdd<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void ScatterAdd<kDGLCUDA, int32_t, __hip_bfloat16>(
     NDArray feat, NDArray idx, NDArray out);
-template void ScatterAdd<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void ScatterAdd<kDGLCUDA, int64_t, __hip_bfloat16>(
     NDArray feat, NDArray idx, NDArray out);
 #endif  // BF16_ENABLED
 template void ScatterAdd<kDGLCUDA, int32_t, float>(

@@ -108,11 +109,11 @@ template void UpdateGradMinMax_hetero<kDGLCUDA, int64_t, __half>(
     const std::vector<NDArray>& feat, const std::vector<NDArray>& idx,
     const std::vector<NDArray>& idx_etype, std::vector<NDArray>* out);
 #if BF16_ENABLED
-template void UpdateGradMinMax_hetero<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void UpdateGradMinMax_hetero<kDGLCUDA, int32_t, __hip_bfloat16>(
     const HeteroGraphPtr& g, const std::string& op,
     const std::vector<NDArray>& feat, const std::vector<NDArray>& idx,
     const std::vector<NDArray>& idx_etype, std::vector<NDArray>* out);
-template void UpdateGradMinMax_hetero<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void UpdateGradMinMax_hetero<kDGLCUDA, int64_t, __hip_bfloat16>(
     const HeteroGraphPtr& g, const std::string& op,
     const std::vector<NDArray>& feat, const std::vector<NDArray>& idx,
     const std::vector<NDArray>& idx_etype, std::vector<NDArray>* out);

@@ -139,9 +140,9 @@ template void BackwardSegmentCmp<kDGLCUDA, int32_t, __half>(
 template void BackwardSegmentCmp<kDGLCUDA, int64_t, __half>(
     NDArray feat, NDArray arg, NDArray out);
 #if BF16_ENABLED
-template void BackwardSegmentCmp<kDGLCUDA, int32_t, __nv_bfloat16>(
+template void BackwardSegmentCmp<kDGLCUDA, int32_t, __hip_bfloat16>(
     NDArray feat, NDArray arg, NDArray out);
-template void BackwardSegmentCmp<kDGLCUDA, int64_t, __nv_bfloat16>(
+template void BackwardSegmentCmp<kDGLCUDA, int64_t, __hip_bfloat16>(
     NDArray feat, NDArray arg, NDArray out);
 #endif  // BF16_ENABLED
 template void BackwardSegmentCmp<kDGLCUDA, int32_t, float>(
src/array/cuda/spmat_op_impl_coo.cu → src/array/cuda/spmat_op_impl_coo.hip  (+6, -4)

+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
  * Copyright (c) 2021 by contributors.
  * @file array/cuda/spmat_op_impl_coo.cu

@@ -10,8 +12,8 @@
 #include <vector>

 #include "../../runtime/cuda/cuda_common.h"
-#include "./atomic.cuh"
-#include "./utils.h"
+#include "atomic.cuh"
+#include "utils.h"

 namespace dgl {

@@ -72,7 +74,7 @@ __global__ void _COOGetRowNNZKernel(
 template <DGLDeviceType XPU, typename IdType>
 int64_t COOGetRowNNZ(COOMatrix coo, int64_t row) {
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   const auto& ctx = coo.row->ctx;
   IdType nnz = coo.row->shape[0];
   IdType nt = 1024;

@@ -103,7 +105,7 @@ __global__ void _COOGetAllRowNNZKernel(
 template <DGLDeviceType XPU, typename IdType>
 NDArray COOGetRowNNZ(COOMatrix coo, NDArray rows) {
-  cudaStream_t stream = runtime::getCurrentCUDAStream();
+  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
   const auto& ctx = coo.row->ctx;
   IdType nnz = coo.row->shape[0];
   IdType num_rows = coo.num_rows;
src/array/cuda/spmat_op_impl_csr.
cu
→
src/array/cuda/spmat_op_impl_csr.
hip
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
/**
* Copyright (c) 2020 by Contributors
* Copyright (c) 2020 by Contributors
* @file array/cuda/spmat_op_impl_csr.cu
* @file array/cuda/spmat_op_impl_csr.cu
...
@@ -7,14 +9,14 @@
...
@@ -7,14 +9,14 @@
#include <thrust/execution_policy.h>
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
#include <thrust/for_each.h>
#include <cub/cub.
cuh
>
#include <
hip
cub/
hip
cub.
hpp
>
#include <numeric>
#include <numeric>
#include <unordered_set>
#include <unordered_set>
#include <vector>
#include <vector>
#include "../../runtime/cuda/cuda_common.h"
#include "../../runtime/cuda/cuda_common.h"
#include "
./
atomic.cuh"
#include "atomic.cuh"
#include "
./
utils.h"
#include "utils.h"
namespace dgl {
namespace dgl {
...
@@ -28,7 +30,7 @@ namespace impl {
...
@@ -28,7 +30,7 @@ namespace impl {
template <DGLDeviceType XPU, typename IdType>
template <DGLDeviceType XPU, typename IdType>
bool CSRIsNonZero(CSRMatrix csr, int64_t row, int64_t col) {
bool CSRIsNonZero(CSRMatrix csr, int64_t row, int64_t col) {
cuda
Stream_t
stream
=
runtime
::
getCurrent
CUDA
Stream
();
hip
Stream_t stream = runtime::getCurrent
HIP
Stream
MasqueradingAsCUDA
();
const auto& ctx = csr.indptr->ctx;
const auto& ctx = csr.indptr->ctx;
IdArray rows = aten::VecToIdArray<int64_t>({row}, sizeof(IdType) * 8, ctx);
IdArray rows = aten::VecToIdArray<int64_t>({row}, sizeof(IdType) * 8, ctx);
IdArray cols = aten::VecToIdArray<int64_t>({col}, sizeof(IdType) * 8, ctx);
IdArray cols = aten::VecToIdArray<int64_t>({col}, sizeof(IdType) * 8, ctx);
...
@@ -53,12 +55,12 @@ template <DGLDeviceType XPU, typename IdType>
...
@@ -53,12 +55,12 @@ template <DGLDeviceType XPU, typename IdType>
NDArray CSRIsNonZero(CSRMatrix csr, NDArray row, NDArray col) {
NDArray CSRIsNonZero(CSRMatrix csr, NDArray row, NDArray col) {
const auto rowlen = row->shape[0];
const auto rowlen = row->shape[0];
const auto collen = col->shape[0];
const auto collen = col->shape[0];
const
auto
rstlen
=
std
::
max
(
rowlen
,
collen
);
const auto rstlen = ::max(rowlen, collen);
NDArray rst = NDArray::Empty({rstlen}, row->dtype, row->ctx);
NDArray rst = NDArray::Empty({rstlen}, row->dtype, row->ctx);
if (rstlen == 0) return rst;
if (rstlen == 0) return rst;
const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1;
const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1;
const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1;
const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1;
cuda
Stream_t
stream
=
runtime
::
getCurrent
CUDA
Stream
();
hip
Stream_t stream = runtime::getCurrent
HIP
Stream
MasqueradingAsCUDA
();
const int nt = dgl::cuda::FindNumThreads(rstlen);
const int nt = dgl::cuda::FindNumThreads(rstlen);
const int nb = (rstlen + nt - 1) / nt;
const int nb = (rstlen + nt - 1) / nt;
const IdType* data = nullptr;
const IdType* data = nullptr;
...
@@ -104,7 +106,7 @@ template <DGLDeviceType XPU, typename IdType>
...
@@ -104,7 +106,7 @@ template <DGLDeviceType XPU, typename IdType>
bool CSRHasDuplicate(CSRMatrix csr) {
bool CSRHasDuplicate(CSRMatrix csr) {
if (!csr.sorted) csr = CSRSort(csr);
if (!csr.sorted) csr = CSRSort(csr);
const auto& ctx = csr.indptr->ctx;
const auto& ctx = csr.indptr->ctx;
cuda
Stream_t
stream
=
runtime
::
getCurrent
CUDA
Stream
();
hip
Stream_t stream = runtime::getCurrent
HIP
Stream
MasqueradingAsCUDA
();
auto device = runtime::DeviceAPI::Get(ctx);
auto device = runtime::DeviceAPI::Get(ctx);
// We allocate a workspace of num_rows bytes. It wastes a little bit memory
// We allocate a workspace of num_rows bytes. It wastes a little bit memory
// but should be fine.
// but should be fine.
...
@@ -149,7 +151,7 @@ __global__ void _CSRGetRowNNZKernel(
...
@@ -149,7 +151,7 @@ __global__ void _CSRGetRowNNZKernel(
template <DGLDeviceType XPU, typename IdType>
template <DGLDeviceType XPU, typename IdType>
NDArray CSRGetRowNNZ(CSRMatrix csr, NDArray rows) {
NDArray CSRGetRowNNZ(CSRMatrix csr, NDArray rows) {
cuda
Stream_t
stream
=
runtime
::
getCurrent
CUDA
Stream
();
hip
Stream_t stream = runtime::getCurrent
HIP
Stream
MasqueradingAsCUDA
();
const auto len = rows->shape[0];
const auto len = rows->shape[0];
const IdType* vid_data = rows.Ptr<IdType>();
const IdType* vid_data = rows.Ptr<IdType>();
const IdType* indptr_data =
const IdType* indptr_data =
...
@@ -250,7 +252,7 @@ __global__ void _SegmentCopyKernel(
...
@@ -250,7 +252,7 @@ __global__ void _SegmentCopyKernel(
template <DGLDeviceType XPU, typename IdType>
template <DGLDeviceType XPU, typename IdType>
CSRMatrix CSRSliceRows(CSRMatrix csr, NDArray rows) {
CSRMatrix CSRSliceRows(CSRMatrix csr, NDArray rows) {
cuda
Stream_t
stream
=
runtime
::
getCurrent
CUDA
Stream
();
hip
Stream_t stream = runtime::getCurrent
HIP
Stream
MasqueradingAsCUDA
();
const int64_t len = rows->shape[0];
const int64_t len = rows->shape[0];
IdArray ret_indptr = aten::CumSum(aten::CSRGetRowNNZ(csr, rows), true);
IdArray ret_indptr = aten::CumSum(aten::CSRGetRowNNZ(csr, rows), true);
const int64_t nnz = aten::IndexSelect<IdType>(ret_indptr, len);
const int64_t nnz = aten::IndexSelect<IdType>(ret_indptr, len);
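For readers tracing the slicing logic, the two lines above amount to a prefix sum over per-row nonzero counts. A minimal host-side sketch of the same computation (the helper name is illustrative, not part of this commit):

#include <cstdint>
#include <vector>

// Sketch of CSRGetRowNNZ followed by an exclusive CumSum for selected rows.
std::vector<int64_t> SliceRowsIndptr(
    const std::vector<int64_t>& indptr, const std::vector<int64_t>& rows) {
  std::vector<int64_t> ret_indptr(rows.size() + 1, 0);
  for (size_t i = 0; i < rows.size(); ++i) {
    const int64_t nnz_row = indptr[rows[i] + 1] - indptr[rows[i]];
    ret_indptr[i + 1] = ret_indptr[i] + nnz_row;  // running prefix sum
  }
  return ret_indptr;  // back() equals the nnz of the sliced matrix
}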
...
@@ -359,7 +361,7 @@ std::vector<NDArray> CSRGetDataAndIndices(
...
@@ -359,7 +361,7 @@ std::vector<NDArray> CSRGetDataAndIndices(
CSRMatrix csr, NDArray row, NDArray col) {
CSRMatrix csr, NDArray row, NDArray col) {
const auto rowlen = row->shape[0];
const auto rowlen = row->shape[0];
const auto collen = col->shape[0];
const auto collen = col->shape[0];
  const auto len = std::max(rowlen, collen);
const auto len = ::max(rowlen, collen);
if (len == 0) return {NullArray(), NullArray(), NullArray()};
if (len == 0) return {NullArray(), NullArray(), NullArray()};
const auto& ctx = row->ctx;
const auto& ctx = row->ctx;
...
@@ -367,7 +369,7 @@ std::vector<NDArray> CSRGetDataAndIndices(
...
@@ -367,7 +369,7 @@ std::vector<NDArray> CSRGetDataAndIndices(
const int64_t nnz = csr.indices->shape[0];
const int64_t nnz = csr.indices->shape[0];
const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1;
const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1;
const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1;
const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1;
  cudaStream_t stream = runtime::getCurrentCUDAStream();
  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const IdType* indptr_data =
const IdType* indptr_data =
static_cast<IdType*>(GetDevicePointer(csr.indptr));
static_cast<IdType*>(GetDevicePointer(csr.indptr));
...
@@ -532,7 +534,7 @@ __global__ void _SegmentMaskColKernel(
...
@@ -532,7 +534,7 @@ __global__ void _SegmentMaskColKernel(
static_cast<IdType>(num_rows));
static_cast<IdType>(num_rows));
NodeQueryHashmap<IdType> hashmap(hashmap_buffer, buffer_size);
NodeQueryHashmap<IdType> hashmap(hashmap_buffer, buffer_size);
  typedef cub::WarpReduce<IdType> WarpReduce;
  typedef hipcub::WarpReduce<IdType> WarpReduce;
__shared__ typename WarpReduce::TempStorage temp_storage[BLOCK_WARPS];
__shared__ typename WarpReduce::TempStorage temp_storage[BLOCK_WARPS];
while (out_row < last_row) {
while (out_row < last_row) {
...
@@ -547,6 +549,7 @@ __global__ void _SegmentMaskColKernel(
...
@@ -547,6 +549,7 @@ __global__ void _SegmentMaskColKernel(
}
}
}
}
IdType reduce_count = WarpReduce(temp_storage[warp_id]).Sum(local_count);
IdType reduce_count = WarpReduce(temp_storage[warp_id]).Sum(local_count);
printf("out_row = %d , reduce_count = %d \n", out_row, reduce_count);
if (laneid == 0) {
if (laneid == 0) {
count[out_row] = reduce_count;
count[out_row] = reduce_count;
}
}
...
@@ -557,13 +560,16 @@ __global__ void _SegmentMaskColKernel(
...
@@ -557,13 +560,16 @@ __global__ void _SegmentMaskColKernel(
template <DGLDeviceType XPU, typename IdType>
template <DGLDeviceType XPU, typename IdType>
CSRMatrix CSRSliceMatrix(
CSRMatrix CSRSliceMatrix(
CSRMatrix csr, runtime::NDArray rows, runtime::NDArray cols) {
CSRMatrix csr, runtime::NDArray rows, runtime::NDArray cols) {
  cudaStream_t stream = runtime::getCurrentCUDAStream();
  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const auto& ctx = rows->ctx;
const auto& ctx = rows->ctx;
const auto& dtype = rows->dtype;
const auto& dtype = rows->dtype;
const auto nbits = dtype.bits;
const auto nbits = dtype.bits;
const int64_t new_nrows = rows->shape[0];
const int64_t new_nrows = rows->shape[0];
const int64_t new_ncols = cols->shape[0];
const int64_t new_ncols = cols->shape[0];
std::cout << "new_nrows : " << new_nrows << std::endl;
std::cout << "new_ncols : " << new_ncols << std::endl;
if (new_nrows == 0 || new_ncols == 0)
if (new_nrows == 0 || new_ncols == 0)
return CSRMatrix(
return CSRMatrix(
new_nrows, new_ncols, Full(0, new_nrows + 1, nbits, ctx),
new_nrows, new_ncols, Full(0, new_nrows + 1, nbits, ctx),
...
@@ -572,6 +578,7 @@ CSRMatrix CSRSliceMatrix(
...
@@ -572,6 +578,7 @@ CSRMatrix CSRSliceMatrix(
// First slice rows
// First slice rows
csr = CSRSliceRows(csr, rows);
csr = CSRSliceRows(csr, rows);
std::cout << "csr.indices->shape[0] : " << csr.indices->shape[0] << std::endl;
if (csr.indices->shape[0] == 0)
if (csr.indices->shape[0] == 0)
return CSRMatrix(
return CSRMatrix(
new_nrows, new_ncols, Full(0, new_nrows + 1, nbits, ctx),
new_nrows, new_ncols, Full(0, new_nrows + 1, nbits, ctx),
...
@@ -581,9 +588,11 @@ CSRMatrix CSRSliceMatrix(
...
@@ -581,9 +588,11 @@ CSRMatrix CSRSliceMatrix(
IdArray mask = Full(0, csr.indices->shape[0], nbits, ctx);
IdArray mask = Full(0, csr.indices->shape[0], nbits, ctx);
// A count for how many masked values per row.
// A count for how many masked values per row.
IdArray count = NewIdArray(csr.num_rows, ctx, nbits);
IdArray count = NewIdArray(csr.num_rows, ctx, nbits);
std::cout << "1 IdArray count : " << count << std::endl;
CUDA_CALL(
CUDA_CALL(
      cudaMemset(count.Ptr<IdType>(), 0, sizeof(IdType) * (csr.num_rows)));
      hipMemset(count.Ptr<IdType>(), 0, sizeof(IdType) * (csr.num_rows)));
std::cout << "2 IdArray count : " << count << std::endl;
// Generate a NodeQueryHashmap buffer. The key of the hashmap is col.
// Generate a NodeQueryHashmap buffer. The key of the hashmap is col.
// For performance, the load factor of the hashmap is in (0.25, 0.5);
// For performance, the load factor of the hashmap is in (0.25, 0.5);
// Because num_cols is usually less than 1 Million (on GPU), the
// Because num_cols is usually less than 1 Million (on GPU), the
...
@@ -593,7 +602,7 @@ CSRMatrix CSRSliceMatrix(
...
@@ -593,7 +602,7 @@ CSRMatrix CSRSliceMatrix(
using it = thrust::counting_iterator<int64_t>;
using it = thrust::counting_iterator<int64_t>;
runtime::CUDAWorkspaceAllocator allocator(ctx);
runtime::CUDAWorkspaceAllocator allocator(ctx);
  const auto exec_policy = thrust::cuda::par_nosync(allocator).on(stream);
  const auto exec_policy = thrust::hip::par_nosync(allocator).on(stream);
thrust::for_each(
thrust::for_each(
exec_policy, it(0), it(new_ncols),
exec_policy, it(0), it(new_ncols),
[key = cols.Ptr<IdType>(), buffer = hashmap_buffer.Ptr<IdType>(),
[key = cols.Ptr<IdType>(), buffer = hashmap_buffer.Ptr<IdType>(),
...
@@ -617,20 +626,37 @@ CSRMatrix CSRSliceMatrix(
...
@@ -617,20 +626,37 @@ CSRMatrix CSRSliceMatrix(
dgl::cuda::FindNumBlocks<'x'>((num_rows + TILE_SIZE - 1) / TILE_SIZE);
dgl::cuda::FindNumBlocks<'x'>((num_rows + TILE_SIZE - 1) / TILE_SIZE);
const dim3 nthrs(WARP_SIZE, BLOCK_WARPS);
const dim3 nthrs(WARP_SIZE, BLOCK_WARPS);
const dim3 nblks(nb);
const dim3 nblks(nb);
std::cout << "nthrs.x : " << nthrs.x << " nthrs.y : " << nthrs.y << " nthrs.z : " << nthrs.z << std::endl;
std::cout << "nblks.x : " << nblks.x << " nblks.y : " << nblks.y << " nblks.z : " << nblks.z << std::endl;
std::cout << "WARP_SIZE : " << WARP_SIZE << " BLOCK_WARPS : " << BLOCK_WARPS << " TILE_SIZE : " << TILE_SIZE << std::endl;
std::cout << "indptr_data : " << indptr_data << std::endl;
std::cout << "indices_data : " << indices_data << std::endl;
std::cout << "num_rows : " << num_rows << std::endl;
std::cout << "buffer_size : " << buffer_size << std::endl;
std::cout << "mask : " << mask << std::endl;
std::cout << "count : " << count << std::endl;
std::cout << "hashmap_buffer : " << hashmap_buffer << std::endl;
CUDA_KERNEL_CALL(
CUDA_KERNEL_CALL(
(_SegmentMaskColKernel<IdType, WARP_SIZE, BLOCK_WARPS, TILE_SIZE>), nblks,
(_SegmentMaskColKernel<IdType, WARP_SIZE, BLOCK_WARPS, TILE_SIZE>), nblks,
nthrs, 0, stream, indptr_data, indices_data, num_rows,
nthrs, 0, stream, indptr_data, indices_data, num_rows,
hashmap_buffer.Ptr<IdType>(), buffer_size, mask.Ptr<IdType>(),
hashmap_buffer.Ptr<IdType>(), buffer_size, mask.Ptr<IdType>(),
count.Ptr<IdType>());
count.Ptr<IdType>());
std::cout << "3 IdArray count : " << count << std::endl;
IdArray idx = AsNumBits(NonZero(mask), nbits);
IdArray idx = AsNumBits(NonZero(mask), nbits);
std::cout << "idx->shape[0] : " << idx->shape[0] << std::endl;
if (idx->shape[0] == 0)
if (idx->shape[0] == 0)
return CSRMatrix(
return CSRMatrix(
new_nrows, new_ncols, Full(0, new_nrows + 1, nbits, ctx),
new_nrows, new_ncols, Full(0, new_nrows + 1, nbits, ctx),
NullArray(dtype, ctx), NullArray(dtype, ctx));
NullArray(dtype, ctx), NullArray(dtype, ctx));
// Indptr needs to be adjusted according to the new nnz per row.
// Indptr needs to be adjusted according to the new nnz per row.
std::cout << " count : " << count << std::endl;
IdArray ret_indptr = CumSum(count, true);
IdArray ret_indptr = CumSum(count, true);
std::cout << " IdArray ret_indptr : " << ret_indptr << std::endl;
// Column & data can be obtained by index select.
// Column & data can be obtained by index select.
IdArray ret_col = IndexSelect(csr.indices, idx);
IdArray ret_col = IndexSelect(csr.indices, idx);
...
@@ -641,6 +667,8 @@ CSRMatrix CSRSliceMatrix(
...
@@ -641,6 +667,8 @@ CSRMatrix CSRSliceMatrix(
Scatter_(cols, Range(0, cols->shape[0], nbits, ctx), col_hash);
Scatter_(cols, Range(0, cols->shape[0], nbits, ctx), col_hash);
ret_col = IndexSelect(col_hash, ret_col);
ret_col = IndexSelect(col_hash, ret_col);
// std::cout << "new_nrows : " << new_nrows << " new_ncols : " << new_ncols << " ret_indptr : " << ret_indptr << " ret_col : " << ret_col << " ret_data : " << std::endl;
return CSRMatrix(new_nrows, new_ncols, ret_indptr, ret_col, ret_data);
return CSRMatrix(new_nrows, new_ncols, ret_indptr, ret_col, ret_data);
}
}
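The kernel-side bookkeeping above (per-edge mask, per-row count, CumSum, NonZero, IndexSelect) mirrors the following serial column-slicing routine; a std::unordered_map stands in for the GPU-side NodeQueryHashmap and all names are illustrative only.

#include <cstdint>
#include <unordered_map>
#include <vector>

struct CSRSketch {
  std::vector<int64_t> indptr, indices;
};

// Keep only the requested columns and relabel them to 0..cols.size()-1.
CSRSketch SliceColsSketch(const CSRSketch& csr, const std::vector<int64_t>& cols) {
  std::unordered_map<int64_t, int64_t> new_id;  // stand-in for the hashmap
  for (size_t i = 0; i < cols.size(); ++i) new_id[cols[i]] = static_cast<int64_t>(i);
  CSRSketch out;
  out.indptr.push_back(0);
  const int64_t num_rows = static_cast<int64_t>(csr.indptr.size()) - 1;
  for (int64_t r = 0; r < num_rows; ++r) {
    int64_t count = 0;  // plays the role of count[out_row] in the kernel
    for (int64_t e = csr.indptr[r]; e < csr.indptr[r + 1]; ++e) {
      auto it = new_id.find(csr.indices[e]);
      if (it != new_id.end()) {  // mask[e] = 1
        out.indices.push_back(it->second);
        ++count;
      }
    }
    out.indptr.push_back(out.indptr.back() + count);  // CumSum(count, true)
  }
  return out;
}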
...
...
src/array/cuda/spmm.cuh
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
/**
* Copyright (c) 2020 by Contributors
* Copyright (c) 2020 by Contributors
* @file array/cuda/spmm.cuh
* @file array/cuda/spmm.cuh
...
@@ -11,7 +13,7 @@
...
@@ -11,7 +13,7 @@
#include <limits>
#include <limits>
#include "../../runtime/cuda/cuda_common.h"
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
#include "utils.h"
#include "atomic.cuh"
#include "atomic.cuh"
#include "bf16.cuh"
#include "bf16.cuh"
#include "fp16.cuh"
#include "fp16.cuh"
...
@@ -28,14 +30,14 @@ namespace aten {
...
@@ -28,14 +30,14 @@ namespace aten {
*/
*/
template <typename DType, typename IdType>
template <typename DType, typename IdType>
inline bool cusparse_available(bool more_nnz_than_matrix_size) {
inline bool cusparse_available(bool more_nnz_than_matrix_size) {
#if CUDART_VERSION < 11000
#if DTKRT_VERSION < 11000
  if (std::is_same<IdType, int>::value &&
  if (std::is_same<IdType, int>::value &&
      (std::is_same<DType, float>::value || std::is_same<DType, double>::value))
      (std::is_same<DType, float>::value || std::is_same<DType, double>::value))
    return true;
    return true;
  return false;
  return false;
#else
#else
  if (std::is_same<DType, __half>::value ||
  if (std::is_same<DType, __half>::value ||
      std::is_same<DType, __nv_bfloat16>::value)
      std::is_same<DType, __hip_bfloat16>::value)
    return false;  // cusparse's SpMM on fp16 is slow, temporally disabled.
    return false;  // cusparse's SpMM on fp16 is slow, temporally disabled.
// If the CSR matrix has more NNZ than matrix size, we should not use
// If the CSR matrix has more NNZ than matrix size, we should not use
// cuSPARSE 11.1.
// cuSPARSE 11.1.
...
@@ -47,54 +49,54 @@ namespace {
...
@@ -47,54 +49,54 @@ namespace {
/** @brief Call cuBLAS geam API for transpose operation for float and double. */
/** @brief Call cuBLAS geam API for transpose operation for float and double. */
template <typename DType>
template <typename DType>
cublasStatus_t Xgeam(
hipblasStatus_t Xgeam(
    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
    int m, int n, const DType* alpha, const DType* A, int lda,
    int m, int n, const DType* alpha, const DType* A, int lda,
    const DType* beta, const DType* B, int ldb, DType* C, int ldc) {
    const DType* beta, const DType* B, int ldb, DType* C, int ldc) {
  LOG(FATAL) << "Not supported dtype";
  LOG(FATAL) << "Not supported dtype";
  return CUBLAS_STATUS_EXECUTION_FAILED;
  return HIPBLAS_STATUS_EXECUTION_FAILED;
}
}
template <>
template <>
cublasStatus_t Xgeam<__half>(
hipblasStatus_t Xgeam<__half>(
    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
    int m, int n, const __half* alpha, const __half* A, int lda,
    int m, int n, const __half* alpha, const __half* A, int lda,
    const __half* beta, const __half* B, int ldb, __half* C, int ldc) {
    const __half* beta, const __half* B, int ldb, __half* C, int ldc) {
  // TODO(ndickson): There is no cublasHgeam, so a different
  // TODO(ndickson): There is no cublasHgeam, so a different
  // implementation would be required.
  // implementation would be required.
  LOG(FATAL) << "Xgeam does not support dtype half (FP16)";
  LOG(FATAL) << "Xgeam does not support dtype half (FP16)";
  return CUBLAS_STATUS_EXECUTION_FAILED;
  return HIPBLAS_STATUS_EXECUTION_FAILED;
}
}
#if BF16_ENABLED
#if BF16_ENABLED
template <>
template <>
cublasStatus_t Xgeam<__nv_bfloat16>(
hipblasStatus_t Xgeam<__hip_bfloat16>(
    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
    int m, int n, const __nv_bfloat16* alpha, const __nv_bfloat16* A, int lda,
    int m, int n, const __hip_bfloat16* alpha, const __hip_bfloat16* A, int lda,
    const __nv_bfloat16* beta, const __nv_bfloat16* B, int ldb,
    const __hip_bfloat16* beta, const __hip_bfloat16* B, int ldb,
    __nv_bfloat16* C, int ldc) {
    __hip_bfloat16* C, int ldc) {
  // TODO(ndickson): There is no cublasHgeam, so a different
  // TODO(ndickson): There is no cublasHgeam, so a different
  // implementation would be required.
  // implementation would be required.
  LOG(FATAL) << "Xgeam does not support dtype bfloat16 (BF16)";
  LOG(FATAL) << "Xgeam does not support dtype bfloat16 (BF16)";
  return CUBLAS_STATUS_EXECUTION_FAILED;
  return HIPBLAS_STATUS_EXECUTION_FAILED;
}
}
#endif  // BF16_ENABLED
#endif  // BF16_ENABLED
template <>
template <>
cublasStatus_t Xgeam<float>(
hipblasStatus_t Xgeam<float>(
    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
    int m, int n, const float* alpha, const float* A, int lda,
    int m, int n, const float* alpha, const float* A, int lda,
    const float* beta, const float* B, int ldb, float* C, int ldc) {
    const float* beta, const float* B, int ldb, float* C, int ldc) {
  return cublasSgeam(
  return hipblasSgeam(
      handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
      handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
}
}
template <>
template <>
cublasStatus_t Xgeam<double>(
hipblasStatus_t Xgeam<double>(
    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
    hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb,
    int m, int n, const double* alpha, const double* A, int lda,
    int m, int n, const double* alpha, const double* A, int lda,
    const double* beta, const double* B, int ldb, double* C, int ldc) {
    const double* beta, const double* B, int ldb, double* C, int ldc) {
  return cublasDgeam(
  return hipblasDgeam(
      handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
      handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
}
}
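For context, the float specialization above forwards to hipblasSgeam, whose geam semantics (C = alpha*op(A) + beta*op(B)) realize a transpose when beta is zero. A standalone usage sketch mirroring the call in _Transpose; the header path and the wrapper name are assumptions, and error checking is omitted.

#include <hip/hip_runtime.h>
#include <hipblas.h>

// Transpose a row-major (row x col) matrix d_in into d_out on `stream`:
// C = alpha * A^T with beta = 0, so B is never read (nullptr is tolerated).
void GeamTransposeSketch(const float* d_in, float* d_out, int row, int col,
                         hipStream_t stream) {
  hipblasHandle_t handle;
  hipblasCreate(&handle);
  hipblasSetStream(handle, stream);
  const float alpha = 1.f, beta = 0.f;
  hipblasSgeam(handle, HIPBLAS_OP_T, HIPBLAS_OP_N, row, col, &alpha, d_in, col,
               &beta, nullptr, row, d_out, row);
  hipblasDestroy(handle);
}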
...
@@ -119,12 +121,12 @@ template <typename DType>
...
@@ -119,12 +121,12 @@ template <typename DType>
void _Transpose(const DType* in, DType* out, int row, int col) {
void _Transpose(const DType* in, DType* out, int row, int col) {
  DType alpha = 1., beta = 0.;
  DType alpha = 1., beta = 0.;
  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
  cudaStream_t stream = runtime::getCurrentCUDAStream();
  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  if (!thr_entry->cublas_handle)
  if (!thr_entry->cublas_handle)
    CUBLAS_CALL(cublasCreate(&(thr_entry->cublas_handle)));
    CUBLAS_CALL(hipblasCreate(&(thr_entry->cublas_handle)));
  CUBLAS_CALL(cublasSetStream(thr_entry->cublas_handle, stream));
  CUBLAS_CALL(hipblasSetStream(thr_entry->cublas_handle, stream));
  CUBLAS_CALL(Xgeam<DType>(
  CUBLAS_CALL(Xgeam<DType>(
      thr_entry->cublas_handle, CUBLAS_OP_T, CUBLAS_OP_N, row, col, &alpha, in,
      thr_entry->cublas_handle, HIPBLAS_OP_T, HIPBLAS_OP_N, row, col, &alpha, in,
      col, &beta, nullptr, row, out, row));
      col, &beta, nullptr, row, out, row));
}
}
...
@@ -134,7 +136,7 @@ void _Transpose(const DType* in, DType* out, int row, int col) {
...
@@ -134,7 +136,7 @@ void _Transpose(const DType* in, DType* out, int row, int col) {
*/
*/
template <>
template <>
void _Transpose<__half>(const __half* in, __half* out, int row, int col) {
void _Transpose<__half>(const __half* in, __half* out, int row, int col) {
  cudaStream_t stream = runtime::getCurrentCUDAStream();
  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  int nt = FindNumThreads(row);
  int nt = FindNumThreads(row);
  int nb = col;
  int nb = col;
  CUDA_KERNEL_CALL(_TransposeKernel, nb, nt, 0, stream, in, out, col, row);
  CUDA_KERNEL_CALL(_TransposeKernel, nb, nt, 0, stream, in, out, col, row);
...
@@ -146,47 +148,47 @@ void _Transpose<__half>(const __half* in, __half* out, int row, int col) {
...
@@ -146,47 +148,47 @@ void _Transpose<__half>(const __half* in, __half* out, int row, int col) {
* @note cuBLAS has no geam API for bf16 data type, fallback to our kernel.
* @note cuBLAS has no geam API for bf16 data type, fallback to our kernel.
*/
*/
template <>
template <>
void _Transpose<__nv_bfloat16>(
void _Transpose<__hip_bfloat16>(
    const __nv_bfloat16* in, __nv_bfloat16* out, int row, int col) {
    const __hip_bfloat16* in, __hip_bfloat16* out, int row, int col) {
  cudaStream_t stream = runtime::getCurrentCUDAStream();
  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  int nt = FindNumThreads(row);
  int nt = FindNumThreads(row);
  int nb = col;
  int nb = col;
  CUDA_KERNEL_CALL(_TransposeKernel, nb, nt, 0, stream, in, out, col, row);
  CUDA_KERNEL_CALL(_TransposeKernel, nb, nt, 0, stream, in, out, col, row);
}
}
#endif  // BF16_ENABLED
#endif  // BF16_ENABLED
#if CUDART_VERSION < 11000
#if DTKRT_VERSION < 11000
template <typename DType>
template <typename DType>
cusparseStatus_t Xcsrmm2(
hipsparseStatus_t Xcsrmm2(
    cusparseHandle_t handle, cusparseOperation_t transA,
    hipsparseHandle_t handle, hipsparseOperation_t transA,
    cusparseOperation_t transB, int m, int n, int k, int nnz,
    hipsparseOperation_t transB, int m, int n, int k, int nnz,
    const DType* alpha, const cusparseMatDescr_t descrA, const DType* csrValA,
    const DType* alpha, const hipsparseMatDescr_t descrA, const DType* csrValA,
    const int* csrRowPtrA, const int* csrColIndA, const DType* B, int ldb,
    const int* csrRowPtrA, const int* csrColIndA, const DType* B, int ldb,
    const DType* beta, DType* C, int ldc) {
    const DType* beta, DType* C, int ldc) {
  LOG(INFO) << "Not supported dtype";
  LOG(INFO) << "Not supported dtype";
  return CUSPARSE_STATUS_EXECUTION_FAILED;
  return HIPSPARSE_STATUS_EXECUTION_FAILED;
}
}
template <>
template <>
cusparseStatus_t Xcsrmm2<float>(
hipsparseStatus_t Xcsrmm2<float>(
    cusparseHandle_t handle, cusparseOperation_t transA,
    hipsparseHandle_t handle, hipsparseOperation_t transA,
    cusparseOperation_t transB, int m, int n, int k, int nnz,
    hipsparseOperation_t transB, int m, int n, int k, int nnz,
    const float* alpha, const cusparseMatDescr_t descrA, const float* csrValA,
    const float* alpha, const hipsparseMatDescr_t descrA, const float* csrValA,
    const int* csrRowPtrA, const int* csrColIndA, const float* B, int ldb,
    const int* csrRowPtrA, const int* csrColIndA, const float* B, int ldb,
    const float* beta, float* C, int ldc) {
    const float* beta, float* C, int ldc) {
  return cusparseScsrmm2(
  return hipsparseScsrmm2(
      handle, transA, transB, m, n, k, nnz, alpha, descrA, csrValA, csrRowPtrA,
      handle, transA, transB, m, n, k, nnz, alpha, descrA, csrValA, csrRowPtrA,
      csrColIndA, B, ldb, beta, C, ldc);
      csrColIndA, B, ldb, beta, C, ldc);
}
}
template <>
template <>
cusparseStatus_t Xcsrmm2<double>(
hipsparseStatus_t Xcsrmm2<double>(
    cusparseHandle_t handle, cusparseOperation_t transA,
    hipsparseHandle_t handle, hipsparseOperation_t transA,
    cusparseOperation_t transB, int m, int n, int k, int nnz,
    hipsparseOperation_t transB, int m, int n, int k, int nnz,
    const double* alpha, const cusparseMatDescr_t descrA, const double* csrValA,
    const double* alpha, const hipsparseMatDescr_t descrA, const double* csrValA,
    const int* csrRowPtrA, const int* csrColIndA, const double* B, int ldb,
    const int* csrRowPtrA, const int* csrColIndA, const double* B, int ldb,
    const double* beta, double* C, int ldc) {
    const double* beta, double* C, int ldc) {
  return cusparseDcsrmm2(
  return hipsparseDcsrmm2(
      handle, transA, transB, m, n, k, nnz, alpha, descrA, csrValA, csrRowPtrA,
      handle, transA, transB, m, n, k, nnz, alpha, descrA, csrValA, csrRowPtrA,
      csrColIndA, B, ldb, beta, C, ldc);
      csrColIndA, B, ldb, beta, C, ldc);
}
}
...
@@ -213,12 +215,12 @@ void CusparseCsrmm2(
...
@@ -213,12 +215,12 @@ void CusparseCsrmm2(
// device
// device
  auto device = runtime::DeviceAPI::Get(ctx);
  auto device = runtime::DeviceAPI::Get(ctx);
  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
  cudaStream_t stream = runtime::getCurrentCUDAStream();
  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  // allocate cusparse handle if needed
  // allocate cusparse handle if needed
  if (!thr_entry->cusparse_handle) {
  if (!thr_entry->cusparse_handle) {
    CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
    CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
  }
  }
  CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream));
  CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream));
  // all one data array
  // all one data array
  DType* valptr = nullptr;
  DType* valptr = nullptr;
  if (!A_data) {
  if (!A_data) {
...
@@ -226,52 +228,52 @@ void CusparseCsrmm2(
...
@@ -226,52 +228,52 @@ void CusparseCsrmm2(
        static_cast<DType*>(device->AllocWorkspace(ctx, nnz * sizeof(DType)));
        static_cast<DType*>(device->AllocWorkspace(ctx, nnz * sizeof(DType)));
    _Fill(valptr, nnz, static_cast<DType>(1.));
    _Fill(valptr, nnz, static_cast<DType>(1.));
  }
  }
#if CUDART_VERSION >= 11000
#if DTKRT_VERSION >= 11000
  cusparseSpMatDescr_t matA;
  hipsparseSpMatDescr_t matA;
  cusparseDnMatDescr_t matB, matC;
  hipsparseDnMatDescr_t matB, matC;
  constexpr auto dtype = cuda_dtype<DType>::value;
  constexpr auto dtype = cuda_dtype<DType>::value;
  constexpr auto idtype = cusparse_idtype<IdType>::value;
  constexpr auto idtype = cusparse_idtype<IdType>::value;
  CUSPARSE_CALL(cusparseCreateCsr(
  CUSPARSE_CALL(hipsparseCreateCsr(
      &matA, m, k, nnz, static_cast<IdType*>(csr.indptr->data),
      &matA, m, k, nnz, static_cast<IdType*>(csr.indptr->data),
      static_cast<IdType*>(csr.indices->data),
      static_cast<IdType*>(csr.indices->data),
      const_cast<DType*>(valptr ? valptr : A_data), idtype, idtype,
      const_cast<DType*>(valptr ? valptr : A_data), idtype, idtype,
      CUSPARSE_INDEX_BASE_ZERO, dtype));
      HIPSPARSE_INDEX_BASE_ZERO, dtype));
  CUSPARSE_CALL(cusparseCreateDnMat(
  CUSPARSE_CALL(hipsparseCreateDnMat(
      &matB, k, n, n, const_cast<DType*>(B_data), dtype, CUSPARSE_ORDER_ROW));
      &matB, k, n, n, const_cast<DType*>(B_data), dtype, HIPSPARSE_ORDER_ROW));
  CUSPARSE_CALL(
  CUSPARSE_CALL(
      cusparseCreateDnMat(&matC, m, n, n, C_data, dtype, CUSPARSE_ORDER_ROW));
      hipsparseCreateDnMat(&matC, m, n, n, C_data, dtype, HIPSPARSE_ORDER_ROW));
  auto transA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  auto transA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
  auto transB = CUSPARSE_OPERATION_NON_TRANSPOSE;
  auto transB = HIPSPARSE_OPERATION_NON_TRANSPOSE;
  size_t workspace_size;
  size_t workspace_size;
  CUSPARSE_CALL(cusparseSpMM_bufferSize(
  CUSPARSE_CALL(hipsparseSpMM_bufferSize(
      thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
      thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
      matC, dtype, CUSPARSE_SPMM_CSR_ALG2, &workspace_size));
      matC, dtype, HIPSPARSE_SPMM_CSR_ALG2, &workspace_size));
  void* workspace = device->AllocWorkspace(ctx, workspace_size);
  void* workspace = device->AllocWorkspace(ctx, workspace_size);
  CUSPARSE_CALL(cusparseSpMM(
  CUSPARSE_CALL(hipsparseSpMM(
      thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
      thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
      matC, dtype, CUSPARSE_SPMM_CSR_ALG2, workspace));
      matC, dtype, HIPSPARSE_SPMM_CSR_ALG2, workspace));
  device->FreeWorkspace(ctx, workspace);
  device->FreeWorkspace(ctx, workspace);
  CUSPARSE_CALL(cusparseDestroySpMat(matA));
  CUSPARSE_CALL(hipsparseDestroySpMat(matA));
  CUSPARSE_CALL(cusparseDestroyDnMat(matB));
  CUSPARSE_CALL(hipsparseDestroyDnMat(matB));
  CUSPARSE_CALL(cusparseDestroyDnMat(matC));
  CUSPARSE_CALL(hipsparseDestroyDnMat(matC));
#else
#else
  // allocate matrix for temporary transposed output
  // allocate matrix for temporary transposed output
  DType* trans_out =
  DType* trans_out =
      static_cast<DType*>(device->AllocWorkspace(ctx, m * n * sizeof(DType)));
      static_cast<DType*>(device->AllocWorkspace(ctx, m * n * sizeof(DType)));
  cusparseMatDescr_t descr;
  hipsparseMatDescr_t descr;
  CUSPARSE_CALL(cusparseCreateMatDescr(&descr));
  CUSPARSE_CALL(hipsparseCreateMatDescr(&descr));
  CUSPARSE_CALL(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  CUSPARSE_CALL(hipsparseSetMatType(descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
  CUSPARSE_CALL(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO));
  CUSPARSE_CALL(hipsparseSetMatIndexBase(descr, HIPSPARSE_INDEX_BASE_ZERO));
  CUSPARSE_CALL(Xcsrmm2<DType>(
  CUSPARSE_CALL(Xcsrmm2<DType>(
      thr_entry->cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
      thr_entry->cusparse_handle, HIPSPARSE_OPERATION_NON_TRANSPOSE,
      CUSPARSE_OPERATION_TRANSPOSE, m, n, k, nnz, &alpha, descr,
      HIPSPARSE_OPERATION_TRANSPOSE, m, n, k, nnz, &alpha, descr,
      (valptr) ? valptr : A_data, static_cast<int32_t*>(csr.indptr->data),
      (valptr) ? valptr : A_data, static_cast<int32_t*>(csr.indptr->data),
      static_cast<int32_t*>(csr.indices->data), B_data, n, &beta, trans_out,
      static_cast<int32_t*>(csr.indices->data), B_data, n, &beta, trans_out,
      m));
      m));
  CUSPARSE_CALL(cusparseDestroyMatDescr(descr));
  CUSPARSE_CALL(hipsparseDestroyMatDescr(descr));
  // transpose the output matrix
  // transpose the output matrix
  _Transpose(trans_out, C_data, n, m);
  _Transpose(trans_out, C_data, n, m);
  device->FreeWorkspace(ctx, trans_out);
  device->FreeWorkspace(ctx, trans_out);
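Both branches above compute the same product; as a reference, here is a serial CSR-times-dense sketch of C = A * B with row-major B and C (illustrative names only, unrelated to the cuSPARSE/hipSPARSE descriptors).

#include <cstdint>
#include <vector>

// C (m x n, row-major) = A (m x k CSR, values optional) * B (k x n, row-major)
void CsrSpmmReference(
    int64_t m, int64_t n, const std::vector<int64_t>& indptr,
    const std::vector<int64_t>& indices, const std::vector<float>& vals,
    const std::vector<float>& B, std::vector<float>& C) {
  C.assign(m * n, 0.f);
  for (int64_t i = 0; i < m; ++i) {
    for (int64_t e = indptr[i]; e < indptr[i + 1]; ++e) {
      const int64_t j = indices[e];
      const float w = vals.empty() ? 1.f : vals[e];  // the "all one" array case
      for (int64_t c = 0; c < n; ++c) C[i * n + c] += w * B[j * n + c];
    }
  }
}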
...
@@ -284,7 +286,7 @@ template <typename DType, typename IdType>
...
@@ -284,7 +286,7 @@ template <typename DType, typename IdType>
void CusparseCsrmm2Hetero(
void CusparseCsrmm2Hetero(
    const DGLContext& ctx, const CSRMatrix& csr, const DType* B_data,
    const DGLContext& ctx, const CSRMatrix& csr, const DType* B_data,
    const DType* A_data, DType* C_data, int64_t x_length,
    const DType* A_data, DType* C_data, int64_t x_length,
    cudaStream_t strm_id) {
    hipStream_t strm_id) {
// We use csrmm2 to perform following operation:
// We use csrmm2 to perform following operation:
// C = A x B, where A is a sparse matrix in csr format, B is the dense matrix
// C = A x B, where A is a sparse matrix in csr format, B is the dense matrix
// for node feature tensor. However, since cusparse only supports
// for node feature tensor. However, since cusparse only supports
...
@@ -307,9 +309,9 @@ void CusparseCsrmm2Hetero(
...
@@ -307,9 +309,9 @@ void CusparseCsrmm2Hetero(
  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
  // allocate cusparse handle if needed
  // allocate cusparse handle if needed
  if (!thr_entry->cusparse_handle) {
  if (!thr_entry->cusparse_handle) {
    CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
    CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
  }
  }
  CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, strm_id));
  CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, strm_id));
  // all one data array
  // all one data array
  DType* valptr = nullptr;
  DType* valptr = nullptr;
  if (!A_data) {
  if (!A_data) {
...
@@ -317,48 +319,48 @@ void CusparseCsrmm2Hetero(
...
@@ -317,48 +319,48 @@ void CusparseCsrmm2Hetero(
        static_cast<DType*>(device->AllocWorkspace(ctx, nnz * sizeof(DType)));
        static_cast<DType*>(device->AllocWorkspace(ctx, nnz * sizeof(DType)));
    _Fill(valptr, nnz, static_cast<DType>(1.));
    _Fill(valptr, nnz, static_cast<DType>(1.));
  }
  }
#if CUDART_VERSION >= 11000
#if DTKRT_VERSION >= 11000
  cusparseSpMatDescr_t matA;
  hipsparseSpMatDescr_t matA;
  cusparseDnMatDescr_t matB, matC;
  hipsparseDnMatDescr_t matB, matC;
  constexpr auto dtype = cuda_dtype<DType>::value;
  constexpr auto dtype = cuda_dtype<DType>::value;
  constexpr auto idtype = cusparse_idtype<IdType>::value;
  constexpr auto idtype = cusparse_idtype<IdType>::value;
  CUSPARSE_CALL(cusparseCreateCsr(
  CUSPARSE_CALL(hipsparseCreateCsr(
      &matA, m, k, nnz, static_cast<IdType*>(csr.indptr->data),
      &matA, m, k, nnz, static_cast<IdType*>(csr.indptr->data),
      static_cast<IdType*>(csr.indices->data),
      static_cast<IdType*>(csr.indices->data),
      const_cast<DType*>(valptr ? valptr : A_data), idtype, idtype,
      const_cast<DType*>(valptr ? valptr : A_data), idtype, idtype,
      CUSPARSE_INDEX_BASE_ZERO, dtype));
      HIPSPARSE_INDEX_BASE_ZERO, dtype));
  CUSPARSE_CALL(cusparseCreateDnMat(
  CUSPARSE_CALL(hipsparseCreateDnMat(
      &matB, k, n, n, const_cast<DType*>(B_data), dtype, CUSPARSE_ORDER_ROW));
      &matB, k, n, n, const_cast<DType*>(B_data), dtype, HIPSPARSE_ORDER_ROW));
  CUSPARSE_CALL(
  CUSPARSE_CALL(
      cusparseCreateDnMat(&matC, m, n, n, C_data, dtype, CUSPARSE_ORDER_ROW));
      hipsparseCreateDnMat(&matC, m, n, n, C_data, dtype, HIPSPARSE_ORDER_ROW));
  auto transA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  auto transA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
  auto transB = CUSPARSE_OPERATION_NON_TRANSPOSE;
  auto transB = HIPSPARSE_OPERATION_NON_TRANSPOSE;
  size_t workspace_size;
  size_t workspace_size;
  CUSPARSE_CALL(cusparseSpMM_bufferSize(
  CUSPARSE_CALL(hipsparseSpMM_bufferSize(
      thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
      thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
      matC, dtype, CUSPARSE_SPMM_CSR_ALG2, &workspace_size));
      matC, dtype, HIPSPARSE_SPMM_CSR_ALG2, &workspace_size));
  void* workspace = device->AllocWorkspace(ctx, workspace_size);
  void* workspace = device->AllocWorkspace(ctx, workspace_size);
  CUSPARSE_CALL(cusparseSpMM(
  CUSPARSE_CALL(hipsparseSpMM(
      thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
      thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta,
      matC, dtype, CUSPARSE_SPMM_CSR_ALG2, workspace));
      matC, dtype, HIPSPARSE_SPMM_CSR_ALG2, workspace));
  device->FreeWorkspace(ctx, workspace);
  device->FreeWorkspace(ctx, workspace);
  CUSPARSE_CALL(cusparseDestroySpMat(matA));
  CUSPARSE_CALL(hipsparseDestroySpMat(matA));
  CUSPARSE_CALL(cusparseDestroyDnMat(matB));
  CUSPARSE_CALL(hipsparseDestroyDnMat(matB));
  CUSPARSE_CALL(cusparseDestroyDnMat(matC));
  CUSPARSE_CALL(hipsparseDestroyDnMat(matC));
#else
#else
  cusparseMatDescr_t descr;
  hipsparseMatDescr_t descr;
  CUSPARSE_CALL(cusparseCreateMatDescr(&descr));
  CUSPARSE_CALL(hipsparseCreateMatDescr(&descr));
  CUSPARSE_CALL(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  CUSPARSE_CALL(hipsparseSetMatType(descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
  CUSPARSE_CALL(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO));
  CUSPARSE_CALL(hipsparseSetMatIndexBase(descr, HIPSPARSE_INDEX_BASE_ZERO));
  CHECK_EQ(sizeof(IdType), sizeof(int32_t));
  CHECK_EQ(sizeof(IdType), sizeof(int32_t));
  CUSPARSE_CALL(Xcsrmm2<DType>(
  CUSPARSE_CALL(Xcsrmm2<DType>(
      thr_entry->cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
      thr_entry->cusparse_handle, HIPSPARSE_OPERATION_NON_TRANSPOSE,
      CUSPARSE_OPERATION_TRANSPOSE, m, n, k, nnz, &alpha, descr,
      HIPSPARSE_OPERATION_TRANSPOSE, m, n, k, nnz, &alpha, descr,
      (valptr) ? valptr : A_data, static_cast<int32_t*>(csr.indptr->data),
      (valptr) ? valptr : A_data, static_cast<int32_t*>(csr.indptr->data),
      static_cast<int32_t*>(csr.indices->data), B_data, n, &beta, C_data, m));
      static_cast<int32_t*>(csr.indices->data), B_data, n, &beta, C_data, m));
  CUSPARSE_CALL(cusparseDestroyMatDescr(descr));
  CUSPARSE_CALL(hipsparseDestroyMatDescr(descr));
#endif
#endif
  if (valptr) device->FreeWorkspace(ctx, valptr);
  if (valptr) device->FreeWorkspace(ctx, valptr);
}
}
...
@@ -625,7 +627,7 @@ void SpMMCoo(
...
@@ -625,7 +627,7 @@ void SpMMCoo(
*/
*/
#if BF16_ENABLED
#if BF16_ENABLED
  if (std::is_same<DType, __half>::value ||
  if (std::is_same<DType, __half>::value ||
      std::is_same<DType, __nv_bfloat16>::value)
      std::is_same<DType, __hip_bfloat16>::value)
#else
#else
  if (std::is_same<DType, __half>::value)
  if (std::is_same<DType, __half>::value)
#endif // BF16_ENABLED
#endif // BF16_ENABLED
...
@@ -638,7 +640,7 @@ void SpMMCoo(
...
@@ -638,7 +640,7 @@ void SpMMCoo(
              *efeat_data = efeat.Ptr<DType>();
              *efeat_data = efeat.Ptr<DType>();
  DType* out_data = out.Ptr<DType>();
  DType* out_data = out.Ptr<DType>();
  Idx *argu_data = argu.Ptr<Idx>(), *arge_data = arge.Ptr<Idx>();
  Idx *argu_data = argu.Ptr<Idx>(), *arge_data = arge.Ptr<Idx>();
  cudaStream_t stream = runtime::getCurrentCUDAStream();
  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  const int64_t N = coo.num_rows, M = coo.num_cols, E = coo.row->shape[0];
  const int64_t N = coo.num_rows, M = coo.num_cols, E = coo.row->shape[0];
  int64_t *ubcast_off = nullptr, *ebcast_off = nullptr;
  int64_t *ubcast_off = nullptr, *ebcast_off = nullptr;
...
@@ -703,7 +705,7 @@ void SpMMCsr(
...
@@ -703,7 +705,7 @@ void SpMMCsr(
  Idx* argu_data = argu.Ptr<Idx>();
  Idx* argu_data = argu.Ptr<Idx>();
  Idx* arge_data = arge.Ptr<Idx>();
  Idx* arge_data = arge.Ptr<Idx>();
  cudaStream_t stream = runtime::getCurrentCUDAStream();
  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  int64_t *ubcast_off = nullptr, *ebcast_off = nullptr;
  int64_t *ubcast_off = nullptr, *ebcast_off = nullptr;
  int64_t len = bcast.out_len, lhs_len = bcast.lhs_len, rhs_len = bcast.rhs_len;
  int64_t len = bcast.out_len, lhs_len = bcast.lhs_len, rhs_len = bcast.rhs_len;
...
@@ -764,7 +766,7 @@ void SpMMCmpCsrHetero(
...
@@ -764,7 +766,7 @@ void SpMMCmpCsrHetero(
  Idx* argu_data = argu.Ptr<Idx>();
  Idx* argu_data = argu.Ptr<Idx>();
  Idx* arge_data = arge.Ptr<Idx>();
  Idx* arge_data = arge.Ptr<Idx>();
  cudaStream_t stream = runtime::getCurrentCUDAStream();
  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  int64_t *ubcast_off = nullptr, *ebcast_off = nullptr;
  int64_t *ubcast_off = nullptr, *ebcast_off = nullptr;
  int64_t len = bcast.out_len, lhs_len = bcast.lhs_len, rhs_len = bcast.rhs_len;
  int64_t len = bcast.out_len, lhs_len = bcast.lhs_len, rhs_len = bcast.rhs_len;
...
...
src/array/cuda/spmm.cu → src/array/cuda/spmm.hip
View file @ 6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
/**
* Copyright (c) 2020 by Contributors
* Copyright (c) 2020 by Contributors
* @file array/cuda/spmm.cu
* @file array/cuda/spmm.cu
...
@@ -6,9 +7,9 @@
...
@@ -6,9 +7,9 @@
#include <dgl/array.h>
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
#include "../../runtime/cuda/cuda_common.h"
#include "./functor.cuh"
#include "functor.cuh"
#include "./ge_spmm.cuh"
#include "ge_spmm.cuh"
#include "./spmm.cuh"
#include "spmm.cuh"
namespace dgl {
namespace dgl {
...
@@ -109,11 +110,11 @@ template void SpMMCsr<kDGLCUDA, int64_t, __half>(
...
@@ -109,11 +110,11 @@ template void SpMMCsr<kDGLCUDA, int64_t, __half>(
const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out,
const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out,
std::vector<NDArray> out_aux);
std::vector<NDArray> out_aux);
#if BF16_ENABLED
#if BF16_ENABLED
template void SpMMCsr<kDGLCUDA, int32_t, __nv_bfloat16>(
template void SpMMCsr<kDGLCUDA, int32_t, __hip_bfloat16>(
const std::string& op, const std::string& reduce, const BcastOff& bcast,
const std::string& op, const std::string& reduce, const BcastOff& bcast,
const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out,
const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out,
std::vector<NDArray> out_aux);
std::vector<NDArray> out_aux);
template void SpMMCsr<kDGLCUDA, int64_t, __nv_bfloat16>(
template void SpMMCsr<kDGLCUDA, int64_t, __hip_bfloat16>(
const std::string& op, const std::string& reduce, const BcastOff& bcast,
const std::string& op, const std::string& reduce, const BcastOff& bcast,
const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out,
const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out,
std::vector<NDArray> out_aux);
std::vector<NDArray> out_aux);
...
@@ -144,11 +145,11 @@ template void SpMMCoo<kDGLCUDA, int64_t, __half>(
...
@@ -144,11 +145,11 @@ template void SpMMCoo<kDGLCUDA, int64_t, __half>(
const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out,
const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out,
std::vector<NDArray> out_aux);
std::vector<NDArray> out_aux);
#if BF16_ENABLED
#if BF16_ENABLED
template void SpMMCoo<kDGLCUDA, int32_t, __nv_bfloat16>(
template void SpMMCoo<kDGLCUDA, int32_t, __hip_bfloat16>(
const std::string& op, const std::string& reduce, const BcastOff& bcast,
const std::string& op, const std::string& reduce, const BcastOff& bcast,
const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out,
const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out,
std::vector<NDArray> out_aux);
std::vector<NDArray> out_aux);
template void SpMMCoo<kDGLCUDA, int64_t, __nv_bfloat16>(
template void SpMMCoo<kDGLCUDA, int64_t, __hip_bfloat16>(
const std::string& op, const std::string& reduce, const BcastOff& bcast,
const std::string& op, const std::string& reduce, const BcastOff& bcast,
const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out,
const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out,
std::vector<NDArray> out_aux);
std::vector<NDArray> out_aux);
...
...
src/array/cuda/spmm_hetero.cu → src/array/cuda/spmm_hetero.hip
View file @ 6ac701f8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
/**
* Copyright (c) 2020 by Contributors
* Copyright (c) 2020 by Contributors
* @file array/cuda/spmm.cu
* @file array/cuda/spmm.cu
...
@@ -6,9 +8,9 @@
...
@@ -6,9 +8,9 @@
#include <dgl/array.h>
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
#include "../../runtime/cuda/cuda_common.h"
#include "./functor.cuh"
#include "functor.cuh"
#include "./ge_spmm.cuh"
#include "ge_spmm.cuh"
#include "./spmm.cuh"
#include "spmm.cuh"
namespace dgl {
namespace dgl {
...
@@ -37,7 +39,7 @@ void SpMMCsrHetero(
...
@@ -37,7 +39,7 @@ void SpMMCsrHetero(
std::vector<DType*> trans_out((*vec_out).size(), NULL);
std::vector<DType*> trans_out((*vec_out).size(), NULL);
bool use_legacy_cusparsemm =
bool use_legacy_cusparsemm =
      (CUDART_VERSION < 11000) && (reduce == "sum") &&
      (DTKRT_VERSION < 11000) && (reduce == "sum") &&
// legacy cuSPARSE does not care about NNZ, hence the argument "false".
// legacy cuSPARSE does not care about NNZ, hence the argument "false".
((op == "copy_lhs" && cusparse_available<DType, IdType>(false)) ||
((op == "copy_lhs" && cusparse_available<DType, IdType>(false)) ||
(op == "mul" && is_scalar_efeat &&
(op == "mul" && is_scalar_efeat &&
...
@@ -50,7 +52,7 @@ void SpMMCsrHetero(
...
@@ -50,7 +52,7 @@ void SpMMCsrHetero(
if (m == 0) continue;
if (m == 0) continue;
DType* out = static_cast<DType*>(device->AllocWorkspace(
DType* out = static_cast<DType*>(device->AllocWorkspace(
vec_csr[0].indptr->ctx, m * n * sizeof(DType)));
vec_csr[0].indptr->ctx, m * n * sizeof(DType)));
      CUDA_CALL(cudaMemset(out, 0, m * n * sizeof(DType)));
      CUDA_CALL(hipMemset(out, 0, m * n * sizeof(DType)));
trans_out[ntype] = out;
trans_out[ntype] = out;
}
}
}
}
...
@@ -111,7 +113,7 @@ void SpMMCsrHetero(
...
@@ -111,7 +113,7 @@ void SpMMCsrHetero(
}
}
}
}
  cudaStream_t stream = runtime::getCurrentCUDAStream();
  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
for (dgl_type_t etype = 0; etype < ufeat_ntids.size(); ++etype) {
for (dgl_type_t etype = 0; etype < ufeat_ntids.size(); ++etype) {
const dgl_type_t src_id = ufeat_ntids[etype];
const dgl_type_t src_id = ufeat_ntids[etype];
const dgl_type_t dst_id = out_ntids[etype];
const dgl_type_t dst_id = out_ntids[etype];
...
@@ -123,7 +125,7 @@ void SpMMCsrHetero(
...
@@ -123,7 +125,7 @@ void SpMMCsrHetero(
cusparse_available<DType, IdType>(more_nnz)) { // cusparse
cusparse_available<DType, IdType>(more_nnz)) { // cusparse
/* If CUDA is less than 11.0, put the output in trans_out for later
/* If CUDA is less than 11.0, put the output in trans_out for later
* transposition */
* transposition */
      DType* out = (CUDART_VERSION < 11000)
      DType* out = (DTKRT_VERSION < 11000)
? trans_out[dst_id]
? trans_out[dst_id]
: static_cast<DType*>((*vec_out)[dst_id]->data);
: static_cast<DType*>((*vec_out)[dst_id]->data);
CusparseCsrmm2Hetero<DType, IdType>(
CusparseCsrmm2Hetero<DType, IdType>(
...
@@ -209,14 +211,14 @@ template void SpMMCsrHetero<kDGLCUDA, int64_t, __half>(
...
@@ -209,14 +211,14 @@ template void SpMMCsrHetero<kDGLCUDA, int64_t, __half>(
const std::vector<dgl_type_t>& ufeat_ntids,
const std::vector<dgl_type_t>& ufeat_ntids,
const std::vector<dgl_type_t>& out_ntids);
const std::vector<dgl_type_t>& out_ntids);
#if BF16_ENABLED
#if BF16_ENABLED
template
void
SpMMCsrHetero
<
kDGLCUDA
,
int32_t
,
__
nv
_bfloat16
>(
template void SpMMCsrHetero<kDGLCUDA, int32_t, __
hip
_bfloat16>(
const std::string& op, const std::string& reduce, const BcastOff& bcast,
const std::string& op, const std::string& reduce, const BcastOff& bcast,
const std::vector<CSRMatrix>& csr, const std::vector<NDArray>& ufeat,
const std::vector<CSRMatrix>& csr, const std::vector<NDArray>& ufeat,
const std::vector<NDArray>& efeat, std::vector<NDArray>* out,
const std::vector<NDArray>& efeat, std::vector<NDArray>* out,
std::vector<std::vector<NDArray>>* out_aux,
std::vector<std::vector<NDArray>>* out_aux,
const std::vector<dgl_type_t>& ufeat_ntids,
const std::vector<dgl_type_t>& ufeat_ntids,
const std::vector<dgl_type_t>& out_ntids);
const std::vector<dgl_type_t>& out_ntids);
template void SpMMCsrHetero<kDGLCUDA, int64_t, __nv_bfloat16>(
template void SpMMCsrHetero<kDGLCUDA, int64_t, __hip_bfloat16>(
const std::string& op, const std::string& reduce, const BcastOff& bcast,
const std::string& op, const std::string& reduce, const BcastOff& bcast,
const std::vector<CSRMatrix>& csr, const std::vector<NDArray>& ufeat,
const std::vector<CSRMatrix>& csr, const std::vector<NDArray>& ufeat,
const std::vector<NDArray>& efeat, std::vector<NDArray>* out,
const std::vector<NDArray>& efeat, std::vector<NDArray>* out,
...
...
src/array/cuda/utils.h
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
/**
* Copyright (c) 2020 by Contributors
* Copyright (c) 2020 by Contributors
* @file array/cuda/utils.h
* @file array/cuda/utils.h
...
@@ -11,7 +13,7 @@
...
@@ -11,7 +13,7 @@
#include <dgl/runtime/ndarray.h>
#include <dgl/runtime/ndarray.h>
#include <dmlc/logging.h>
#include <dmlc/logging.h>
#include <cub/cub.cuh>
#include <hipcub/hipcub.hpp>
#include <type_traits>
#include <type_traits>
#include "../../runtime/cuda/cuda_common.h"
#include "../../runtime/cuda/cuda_common.h"
...
@@ -90,7 +92,7 @@ inline int FindNumBlocks(int nblks, int max_nblks = -1) {
...
@@ -90,7 +92,7 @@ inline int FindNumBlocks(int nblks, int max_nblks = -1) {
template <typename T>
template <typename T>
__device__ __forceinline__ T _ldg(T* addr) {
__device__ __forceinline__ T _ldg(T* addr) {
#if __CUDA_ARCH__ >= 350
#if __HIP_DEVICE_COMPILE__
  return __ldg(addr);
  return __ldg(addr);
#else
#else
  return *addr;
  return *addr;
...
@@ -126,7 +128,7 @@ __global__ void _FillKernel(DType* ptr, size_t length, DType val) {
...
@@ -126,7 +128,7 @@ __global__ void _FillKernel(DType* ptr, size_t length, DType val) {
/** @brief Fill the vector started from ptr of size length with val */
/** @brief Fill the vector started from ptr of size length with val */
template <typename DType>
template <typename DType>
void _Fill(DType* ptr, size_t length, DType val) {
void _Fill(DType* ptr, size_t length, DType val) {
  cudaStream_t stream = runtime::getCurrentCUDAStream();
  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  int nt = FindNumThreads(length);
  int nt = FindNumThreads(length);
  int nb =
  int nb =
      (length + nt - 1) / nt;  // on x-axis, no need to worry about upperbound.
      (length + nt - 1) / nt;  // on x-axis, no need to worry about upperbound.
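The launch this sets up feeds a grid-stride kernel; below is a minimal sketch of that pattern (the real _FillKernel is defined elsewhere in this header, so this version is illustrative only).

#include <hip/hip_runtime.h>

template <typename DType>
__global__ void FillKernelSketch(DType* ptr, size_t length, DType val) {
  size_t tx = blockIdx.x * blockDim.x + threadIdx.x;
  const size_t stride_x = gridDim.x * blockDim.x;
  while (tx < length) {  // grid-stride loop, so any grid size is safe
    ptr[tx] = val;
    tx += stride_x;
  }
}

// Typical launch on the current stream:
//   FillKernelSketch<<<nb, nt, 0, stream>>>(ptr, length, val);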
...
@@ -185,8 +187,8 @@ template <typename IdType>
...
@@ -185,8 +187,8 @@ template <typename IdType>
__global__ void _LinearSearchKernel(
__global__ void _LinearSearchKernel(
    const IdType* indptr, const IdType* indices, const IdType* data,
    const IdType* indptr, const IdType* indices, const IdType* data,
    const IdType* row, const IdType* col, int64_t row_stride,
    const IdType* row, const IdType* col, int64_t row_stride,
    int64_t col_stride, int64_t length, const __nv_bfloat16* weights,
    int64_t col_stride, int64_t length, const __hip_bfloat16* weights,
    __nv_bfloat16 filler, __nv_bfloat16* out) {
    __hip_bfloat16 filler, __hip_bfloat16* out) {
  int tx = blockIdx.x * blockDim.x + threadIdx.x;
  int tx = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride_x = gridDim.x * blockDim.x;
  const int stride_x = gridDim.x * blockDim.x;
  while (tx < length) {
  while (tx < length) {
...
@@ -204,7 +206,7 @@ __global__ void _LinearSearchKernel(
...
@@ -204,7 +206,7 @@ __global__ void _LinearSearchKernel(
    } else {
    } else {
      // If the result is saved in bf16, it should be fine to convert it to
      // If the result is saved in bf16, it should be fine to convert it to
      // float first
      // float first
      out[tx] = weights ? weights[v] : __nv_bfloat16(static_cast<float>(v));
      out[tx] = weights ? weights[v] : __hip_bfloat16(static_cast<float>(v));
    }
    }
    tx += stride_x;
    tx += stride_x;
}
}
...
@@ -277,12 +279,12 @@ template <typename DType, typename BoolType>
...
@@ -277,12 +279,12 @@ template <typename DType, typename BoolType>
void MaskSelect(
void MaskSelect(
    runtime::DeviceAPI* device, const DGLContext& ctx, const DType* input,
    runtime::DeviceAPI* device, const DGLContext& ctx, const DType* input,
    const BoolType* mask, DType* output, int64_t n, int64_t* rst,
    const BoolType* mask, DType* output, int64_t n, int64_t* rst,
    cudaStream_t stream) {
    hipStream_t stream) {
  size_t workspace_size = 0;
  size_t workspace_size = 0;
  CUDA_CALL(cub::DeviceSelect::Flagged(
  CUDA_CALL(hipcub::DeviceSelect::Flagged(
      nullptr, workspace_size, input, mask, output, rst, n, stream));
      nullptr, workspace_size, input, mask, output, rst, n, stream));
  void* workspace = device->AllocWorkspace(ctx, workspace_size);
  void* workspace = device->AllocWorkspace(ctx, workspace_size);
  CUDA_CALL(cub::DeviceSelect::Flagged(
  CUDA_CALL(hipcub::DeviceSelect::Flagged(
      workspace, workspace_size, input, mask, output, rst, n, stream));
      workspace, workspace_size, input, mask, output, rst, n, stream));
  device->FreeWorkspace(ctx, workspace);
  device->FreeWorkspace(ctx, workspace);
}
}
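The paired calls above follow the usual CUB/hipCUB protocol: the first call with a null temporary buffer only reports the workspace size, the second performs the compaction. A standalone sketch with concrete types (names are illustrative, error handling omitted):

#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>

// Copy d_in[i] to the front of d_out for every i with d_mask[i] != 0;
// the number of selected items is written to d_num_selected.
void FlaggedSelectSketch(const int* d_in, const char* d_mask, int* d_out,
                         int64_t* d_num_selected, int n, hipStream_t stream) {
  size_t temp_bytes = 0;
  hipcub::DeviceSelect::Flagged(
      nullptr, temp_bytes, d_in, d_mask, d_out, d_num_selected, n, stream);
  void* d_temp = nullptr;
  hipMalloc(&d_temp, temp_bytes);
  hipcub::DeviceSelect::Flagged(
      d_temp, temp_bytes, d_in, d_mask, d_out, d_num_selected, n, stream);
  hipFree(d_temp);
}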
...
@@ -290,7 +292,7 @@ void MaskSelect(
...
@@ -290,7 +292,7 @@ void MaskSelect(
inline void* GetDevicePointer(runtime::NDArray array) {
inline void* GetDevicePointer(runtime::NDArray array) {
  void* ptr = array->data;
  void* ptr = array->data;
  if (array.IsPinned()) {
  if (array.IsPinned()) {
    CUDA_CALL(cudaHostGetDevicePointer(&ptr, ptr, 0));
    CUDA_CALL(hipHostGetDevicePointer(&ptr, ptr, 0));
  }
  }
  return ptr;
  return ptr;
}
}
...
...
src/array/cuda/utils.cu → src/array/cuda/utils.hip
View file @ 6ac701f8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
/**
* Copyright (c) 2020 by Contributors
* Copyright (c) 2020 by Contributors
* @file array/cuda/utils.cu
* @file array/cuda/utils.cu
* @brief Utilities for CUDA kernels.
* @brief Utilities for CUDA kernels.
*/
*/
#include <cub/cub.cuh>
#include <hipcub/hipcub.hpp>
#include "../../runtime/cuda/cuda_common.h"
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
#include "utils.h"
namespace dgl {
namespace dgl {
namespace cuda {
namespace cuda {
...
@@ -17,11 +19,11 @@ bool AllTrue(int8_t* flags, int64_t length, const DGLContext& ctx) {
...
@@ -17,11 +19,11 @@ bool AllTrue(int8_t* flags, int64_t length, const DGLContext& ctx) {
int8_t* rst = static_cast<int8_t*>(device->AllocWorkspace(ctx, 1));
int8_t* rst = static_cast<int8_t*>(device->AllocWorkspace(ctx, 1));
// Call CUB's reduction
// Call CUB's reduction
size_t workspace_size = 0;
size_t workspace_size = 0;
  cudaStream_t stream = runtime::getCurrentCUDAStream();
  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  CUDA_CALL(cub::DeviceReduce::Min(
  CUDA_CALL(hipcub::DeviceReduce::Min(
nullptr, workspace_size, flags, rst, length, stream));
nullptr, workspace_size, flags, rst, length, stream));
void* workspace = device->AllocWorkspace(ctx, workspace_size);
void* workspace = device->AllocWorkspace(ctx, workspace_size);
  CUDA_CALL(cub::DeviceReduce::Min(
  CUDA_CALL(hipcub::DeviceReduce::Min(
workspace, workspace_size, flags, rst, length, stream));
workspace, workspace_size, flags, rst, length, stream));
int8_t cpu_rst = GetCUDAScalar(device, ctx, rst);
int8_t cpu_rst = GetCUDAScalar(device, ctx, rst);
device->FreeWorkspace(ctx, workspace);
device->FreeWorkspace(ctx, workspace);
...
...
src/array/cuda/uvm/array_index_select_uvm.cuh
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
/**
* Copyright (c) 2021 by Contributors
* Copyright (c) 2021 by Contributors
* @file array/cpu/array_index_select_uvm.cuh
* @file array/cpu/array_index_select_uvm.cuh
...
...
src/array/cuda/uvm/array_index_select_uvm.cu → src/array/cuda/uvm/array_index_select_uvm.hip
View file @ 6ac701f8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
/**
* Copyright (c) 2019-2022 by Contributors
* Copyright (c) 2019-2022 by Contributors
* @file array/cuda/uvm/array_index_select_uvm.cu
* @file array/cuda/uvm/array_index_select_uvm.cu
...
@@ -8,7 +10,7 @@
...
@@ -8,7 +10,7 @@
#include "../../../runtime/cuda/cuda_common.h"
#include "../../../runtime/cuda/cuda_common.h"
#include "../array_index_select.cuh"
#include "../array_index_select.cuh"
#include "../utils.h"
#include "../utils.h"
#include "./array_index_select_uvm.cuh"
#include "array_index_select_uvm.cuh"
namespace dgl {
namespace dgl {
using runtime::NDArray;
using runtime::NDArray;
...
@@ -17,7 +19,7 @@ namespace impl {
...
@@ -17,7 +19,7 @@ namespace impl {
template <typename DType, typename IdType>
template <typename DType, typename IdType>
NDArray IndexSelectCPUFromGPU(NDArray array, IdArray index) {
NDArray IndexSelectCPUFromGPU(NDArray array, IdArray index) {
  cudaStream_t stream = runtime::getCurrentCUDAStream();
  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const int64_t arr_len = array->shape[0];
const int64_t arr_len = array->shape[0];
const int64_t len = index->shape[0];
const int64_t len = index->shape[0];
int64_t num_feat = 1;
int64_t num_feat = 1;
...
@@ -78,7 +80,7 @@ template NDArray IndexSelectCPUFromGPU<int64_t, int64_t>(NDArray, IdArray);
...
@@ -78,7 +80,7 @@ template NDArray IndexSelectCPUFromGPU<int64_t, int64_t>(NDArray, IdArray);
template <typename DType, typename IdType>
template <typename DType, typename IdType>
void IndexScatterGPUToCPU(NDArray dest, IdArray index, NDArray source) {
void IndexScatterGPUToCPU(NDArray dest, IdArray index, NDArray source) {
  cudaStream_t stream = runtime::getCurrentCUDAStream();
  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
const DType* source_data = static_cast<DType*>(source->data);
const DType* source_data = static_cast<DType*>(source->data);
const IdType* idx_data = static_cast<IdType*>(index->data);
const IdType* idx_data = static_cast<IdType*>(index->data);
const int64_t arr_len = dest->shape[0];
const int64_t arr_len = dest->shape[0];
...
...
src/array/filter.cc
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
/**
* Copyright (c) 2021 by Contributors
* Copyright (c) 2021 by Contributors
* @file array/filter.cc
* @file array/filter.cc
* @brief Object for selecting items in a set, or selecting items not in a set.
* @brief Object for selecting items in a set, or selecting items not in a set.
*/
*/
#include "./filter.h"
#include "filter.h"
#include <dgl/packed_func_ext.h>
#include <dgl/packed_func_ext.h>
#include <dgl/runtime/packed_func.h>
#include <dgl/runtime/packed_func.h>
...
@@ -23,7 +24,7 @@ DGL_REGISTER_GLOBAL("utils.filter._CAPI_DGLFilterCreateFromSet")
...
@@ -23,7 +24,7 @@ DGL_REGISTER_GLOBAL("utils.filter._CAPI_DGLFilterCreateFromSet")
      IdArray array = args[0];
      IdArray array = args[0];
      auto ctx = array->ctx;
      auto ctx = array->ctx;
      // TODO(nv-dlasalle): Implement CPU version.
      // TODO(nv-dlasalle): Implement CPU version.
      if (ctx.device_type == kDGLCUDA) {
      if (ctx.device_type == kDGLCUDA || ctx.device_type == kDGLROCM) {
#ifdef DGL_USE_CUDA
#ifdef DGL_USE_CUDA
        ATEN_ID_TYPE_SWITCH(array->dtype, IdType, {
        ATEN_ID_TYPE_SWITCH(array->dtype, IdType, {
          *rv = CreateSetFilter<kDGLCUDA, IdType>(array);
          *rv = CreateSetFilter<kDGLCUDA, IdType>(array);
...
...
src/array/kernel.cc
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
/**
* Copyright (c) 2020 by Contributors
* Copyright (c) 2020 by Contributors
* @file array/kernel.cc
* @file array/kernel.cc
...
@@ -7,7 +8,7 @@
...
@@ -7,7 +8,7 @@
#include <dgl/packed_func_ext.h>
#include <dgl/packed_func_ext.h>
#include "../c_api_common.h"
#include "../c_api_common.h"
#include "./check.h"
#include "check.h"
#include "kernel_decl.h"
#include "kernel_decl.h"
using namespace dgl::runtime;
using namespace dgl::runtime;
...
...