Unverified Commit 8ae50c42 authored by Hongzhi (Steve), Chen, committed by GitHub

[Misc] clang-format auto fix. (#4804)



* [Misc] clang-format auto fix.

* manual

* manual

* manual

* manual

* todo

* fix
Co-authored-by: Steve <ubuntu@ip-172-31-34-29.ap-northeast-1.compute.internal>
parent 81831111
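
The diffs below are purely mechanical: clang-format sorts includes within each block, packs template-parameter and argument lists up to the column limit, and joins short single-statement bodies onto one line. As a rough, hypothetical illustration of the style the files converge to (this function is not from the commit; it only mimics the formatting conventions):

#include <cstdint>
#include <vector>

// Template parameters packed onto one line instead of one per line;
// function arguments wrapped with a 4-space continuation indent.
template <typename Idx, typename DType>
DType SumRow(
    const std::vector<DType>& feat, const std::vector<Idx>& indptr,
    int64_t row) {
  DType accum = 0;
  // Single-statement loop bodies are joined onto one line.
  for (Idx i = indptr[row]; i < indptr[row + 1]; ++i) accum += feat[i];
  return accum;
}

int main() {
  std::vector<int64_t> indptr = {0, 2, 5};
  std::vector<float> feat = {1, 2, 3, 4, 5};
  return SumRow<int64_t, float>(feat, indptr, 1) == 12.f ? 0 : 1;
}
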
@@ -6,10 +6,10 @@
 #ifndef DGL_ARRAY_CUDA_GE_SPMM_CUH_
 #define DGL_ARRAY_CUDA_GE_SPMM_CUH_

-#include "macro.cuh"
-#include "atomic.cuh"
 #include "../../runtime/cuda/cuda_common.h"
 #include "./utils.h"
+#include "atomic.cuh"
+#include "macro.cuh"

 namespace dgl {
@@ -23,23 +23,19 @@ namespace cuda {
  * \note GE-SpMM: https://arxiv.org/pdf/2007.03179.pdf
  * The grid dimension x and y are reordered for better performance.
  */
-template <typename Idx, typename DType,
-          typename BinaryOp>
+template <typename Idx, typename DType, typename BinaryOp>
 __global__ void GESpMMKernel(
-    const DType* __restrict__ ufeat,
-    const DType* __restrict__ efeat,
-    DType* __restrict__ out,
-    const Idx* __restrict__ indptr,
-    const Idx* __restrict__ indices,
-    const int64_t num_rows, const int64_t num_cols,
-    const int64_t feat_len) {
-  const Idx rid = blockIdx.x * blockDim.y + threadIdx.y;  // over vertices dimension
+    const DType* __restrict__ ufeat, const DType* __restrict__ efeat,
+    DType* __restrict__ out, const Idx* __restrict__ indptr,
+    const Idx* __restrict__ indices, const int64_t num_rows,
+    const int64_t num_cols, const int64_t feat_len) {
+  const Idx rid =
+      blockIdx.x * blockDim.y + threadIdx.y;  // over vertices dimension
   const Idx fid = (blockIdx.y * 64) + threadIdx.x;  // over feature dimension
   if (rid < num_rows && fid < feat_len) {
     const Idx low = __ldg(indptr + rid), high = __ldg(indptr + rid + 1);
-    DType accum_0 = 0.,
-          accum_1 = 0.;
+    DType accum_0 = 0., accum_1 = 0.;
     if (blockIdx.y != gridDim.y - 1) {  // fid + 32 < feat_len
       for (Idx left = low; left < high; left += 32) {
@@ -109,24 +105,21 @@ __global__ void GESpMMKernel(
       }
       out[feat_len * rid + fid] = accum_0;
-      if (fid + 32 < feat_len)
-        out[feat_len * rid + fid + 32] = accum_1;
+      if (fid + 32 < feat_len) out[feat_len * rid + fid + 32] = accum_1;
     }
   }
 }
 }

-template <typename Idx, typename DType,
-          typename BinaryOp>
+template <typename Idx, typename DType, typename BinaryOp>
 void GESpMMCsr(
-    const CSRMatrix& csr,
-    NDArray ufeat, NDArray efeat,
-    NDArray out, int64_t feat_len) {
-  const Idx *indptr = csr.indptr.Ptr<Idx>();
-  const Idx *indices = csr.indices.Ptr<Idx>();
-  const DType *ufeat_data = ufeat.Ptr<DType>();
-  const DType *efeat_data = efeat.Ptr<DType>();
-  DType *out_data = out.Ptr<DType>();
+    const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out,
+    int64_t feat_len) {
+  const Idx* indptr = csr.indptr.Ptr<Idx>();
+  const Idx* indices = csr.indices.Ptr<Idx>();
+  const DType* ufeat_data = ufeat.Ptr<DType>();
+  const DType* efeat_data = efeat.Ptr<DType>();
+  DType* out_data = out.Ptr<DType>();
   cudaStream_t stream = runtime::getCurrentCUDAStream();
@@ -138,12 +131,10 @@ void GESpMMCsr(
   const dim3 nthrs(ntx, nty);
   const int sh_mem_size = 0;
-  CUDA_KERNEL_CALL((GESpMMKernel<Idx, DType, BinaryOp>),
-                   nblks, nthrs, sh_mem_size, stream,
-                   ufeat_data, efeat_data, out_data,
-                   indptr, indices,
-                   csr.num_rows, csr.num_cols,
-                   feat_len);
+  CUDA_KERNEL_CALL(
+      (GESpMMKernel<Idx, DType, BinaryOp>), nblks, nthrs, sh_mem_size, stream,
+      ufeat_data, efeat_data, out_data, indptr, indices, csr.num_rows,
+      csr.num_cols, feat_len);
 }

 }  // namespace cuda

@@ -8,12 +8,12 @@
 ///////////////////////// Dispatchers //////////////////////////

 /* Macro used for switching between broadcasting and non-broadcasting kernels.
  * It also copies the auxiliary information for calculating broadcasting offsets
  * to GPU.
  */
-#define BCAST_IDX_CTX_SWITCH(BCAST, EDGE_MAP, CTX, LHS_OFF, RHS_OFF, ...) do { \
+#define BCAST_IDX_CTX_SWITCH(BCAST, EDGE_MAP, CTX, LHS_OFF, RHS_OFF, ...) \
+  do { \
   const BcastOff &info = (BCAST); \
   if (!info.use_bcast) { \
     constexpr bool UseBcast = false; \
@@ -28,13 +28,15 @@
     constexpr bool UseBcast = true; \
     const DGLContext ctx = (CTX); \
     const auto device = runtime::DeviceAPI::Get(ctx); \
-    (LHS_OFF) = static_cast<int64_t*>( \
-      device->AllocWorkspace(ctx, sizeof(int64_t) * info.lhs_offset.size())); \
-    CUDA_CALL(cudaMemcpy((LHS_OFF), &info.lhs_offset[0], \
+    (LHS_OFF) = static_cast<int64_t *>(device->AllocWorkspace( \
+        ctx, sizeof(int64_t) * info.lhs_offset.size())); \
+    CUDA_CALL(cudaMemcpy( \
+        (LHS_OFF), &info.lhs_offset[0], \
       sizeof(int64_t) * info.lhs_offset.size(), cudaMemcpyHostToDevice)); \
-    (RHS_OFF) = static_cast<int64_t*>( \
-      device->AllocWorkspace(ctx, sizeof(int64_t) * info.rhs_offset.size())); \
-    CUDA_CALL(cudaMemcpy((RHS_OFF), &info.rhs_offset[0], \
+    (RHS_OFF) = static_cast<int64_t *>(device->AllocWorkspace( \
+        ctx, sizeof(int64_t) * info.rhs_offset.size())); \
+    CUDA_CALL(cudaMemcpy( \
+        (RHS_OFF), &info.rhs_offset[0], \
       sizeof(int64_t) * info.rhs_offset.size(), cudaMemcpyHostToDevice)); \
     if ((EDGE_MAP)) { \
       constexpr bool UseIdx = true; \
@@ -46,6 +48,6 @@
     device->FreeWorkspace(ctx, (LHS_OFF)); \
     device->FreeWorkspace(ctx, (RHS_OFF)); \
   } \
  } while (0)

 #endif  // DGL_ARRAY_CUDA_MACRO_CUH_
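
The macro body stays wrapped in do { ... } while (0); the reformat above only moves `do {` onto its own continuation line. For readers unfamiliar with the idiom, a small stand-alone sketch with a hypothetical macro shows why the wrapper matters: it turns a multi-statement macro into a single statement that behaves correctly under an un-braced if/else.

#include <cstdio>

// Without the do/while wrapper, the two statements would break apart
// when the macro is used as the body of an un-braced if/else.
#define LOG_AND_COUNT(msg, counter) \
  do {                              \
    std::printf("%s\n", (msg));     \
    ++(counter);                    \
  } while (0)

int main() {
  int errors = 0;
  bool failed = true;
  if (failed)
    LOG_AND_COUNT("failure", errors);  // expands to a single statement
  else
    std::printf("ok\n");
  return errors;
}
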
@@ -4,14 +4,14 @@
  * \brief rowwise sampling
  */

-#include <dgl/random.h>
+#include <curand_kernel.h>
 #include <dgl/array.h>
 #include <dgl/array_iterator.h>
-#include <curand_kernel.h>
+#include <dgl/random.h>

+#include "../../runtime/cuda/cuda_common.h"
 #include "./dgl_cub.cuh"
 #include "./utils.h"
-#include "../../runtime/cuda/cuda_common.h"

 using namespace dgl::runtime;
@@ -23,20 +23,15 @@ namespace {
 template <typename IdType>
 __global__ void _GlobalUniformNegativeSamplingKernel(
-    const IdType* __restrict__ indptr,
-    const IdType* __restrict__ indices,
-    IdType* __restrict__ row,
-    IdType* __restrict__ col,
-    int64_t num_row,
-    int64_t num_col,
-    int64_t num_samples,
-    int num_trials,
-    bool exclude_self_loops,
-    int32_t random_seed) {
+    const IdType* __restrict__ indptr, const IdType* __restrict__ indices,
+    IdType* __restrict__ row, IdType* __restrict__ col, int64_t num_row,
+    int64_t num_col, int64_t num_samples, int num_trials,
+    bool exclude_self_loops, int32_t random_seed) {
   int64_t tx = blockIdx.x * blockDim.x + threadIdx.x;
   const int stride_x = gridDim.x * blockDim.x;
-  curandStatePhilox4_32_10_t rng;  // this allows generating 4 32-bit ints at a time
+  curandStatePhilox4_32_10_t
+      rng;  // this allows generating 4 32-bit ints at a time
   curand_init(random_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
   while (tx < num_samples) {
@@ -50,8 +45,7 @@ __global__ void _GlobalUniformNegativeSamplingKernel(
       int64_t u = static_cast<int64_t>(((y_lo << 32L) | z) % num_row);
       int64_t v = static_cast<int64_t>(((y_hi << 32L) | w) % num_col);
-      if (exclude_self_loops && (u == v))
-        continue;
+      if (exclude_self_loops && (u == v)) continue;
       // binary search of v among indptr[u:u+1]
       int64_t b = indptr[u], e = indptr[u + 1] - 1;
@@ -81,48 +75,47 @@ __global__ void _GlobalUniformNegativeSamplingKernel(
 template <typename DType>
 struct IsNotMinusOne {
-  __device__ __forceinline__ bool operator() (const std::pair<DType, DType>& a) {
+  __device__ __forceinline__ bool operator()(const std::pair<DType, DType>& a) {
     return a.first != -1;
   }
 };

 /*!
- * \brief Sort ordered pairs in ascending order, using \a tmp_major and \a tmp_minor as
- * temporary buffers, each with \a n elements.
+ * \brief Sort ordered pairs in ascending order, using \a tmp_major and \a
+ * tmp_minor as temporary buffers, each with \a n elements.
  */
 template <typename IdType>
 void SortOrderedPairs(
-    runtime::DeviceAPI* device,
-    DGLContext ctx,
-    IdType* major,
-    IdType* minor,
-    IdType* tmp_major,
-    IdType* tmp_minor,
-    int64_t n,
-    cudaStream_t stream) {
+    runtime::DeviceAPI* device, DGLContext ctx, IdType* major, IdType* minor,
+    IdType* tmp_major, IdType* tmp_minor, int64_t n, cudaStream_t stream) {
   // Sort ordered pairs in lexicographical order by two radix sorts since
   // cub's radix sorts are stable.
-  // We need a 2*n auxiliary storage to store the results form the first radix sort.
+  // We need a 2*n auxiliary storage to store the results form the first radix
+  // sort.
   size_t s1 = 0, s2 = 0;
   void* tmp1 = nullptr;
   void* tmp2 = nullptr;
   // Radix sort by minor key first, reorder the major key in the progress.
   CUDA_CALL(cub::DeviceRadixSort::SortPairs(
-      tmp1, s1, minor, tmp_minor, major, tmp_major, n, 0, sizeof(IdType) * 8, stream));
+      tmp1, s1, minor, tmp_minor, major, tmp_major, n, 0, sizeof(IdType) * 8,
+      stream));
   tmp1 = device->AllocWorkspace(ctx, s1);
   CUDA_CALL(cub::DeviceRadixSort::SortPairs(
-      tmp1, s1, minor, tmp_minor, major, tmp_major, n, 0, sizeof(IdType) * 8, stream));
+      tmp1, s1, minor, tmp_minor, major, tmp_major, n, 0, sizeof(IdType) * 8,
+      stream));
   // Radix sort by major key next.
   CUDA_CALL(cub::DeviceRadixSort::SortPairs(
-      tmp2, s2, tmp_major, major, tmp_minor, minor, n, 0, sizeof(IdType) * 8, stream));
-  tmp2 = (s2 > s1) ? device->AllocWorkspace(ctx, s2) : tmp1;  // reuse buffer if s2 <= s1
+      tmp2, s2, tmp_major, major, tmp_minor, minor, n, 0, sizeof(IdType) * 8,
+      stream));
+  tmp2 = (s2 > s1) ? device->AllocWorkspace(ctx, s2)
+                   : tmp1;  // reuse buffer if s2 <= s1
   CUDA_CALL(cub::DeviceRadixSort::SortPairs(
-      tmp2, s2, tmp_major, major, tmp_minor, minor, n, 0, sizeof(IdType) * 8, stream));
+      tmp2, s2, tmp_major, major, tmp_minor, minor, n, 0, sizeof(IdType) * 8,
+      stream));
-  if (tmp1 != tmp2)
-    device->FreeWorkspace(ctx, tmp2);
+  if (tmp1 != tmp2) device->FreeWorkspace(ctx, tmp2);
   device->FreeWorkspace(ctx, tmp1);
 }
@@ -130,17 +123,14 @@ void SortOrderedPairs(
 template <DGLDeviceType XPU, typename IdType>
 std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling(
-    const CSRMatrix& csr,
-    int64_t num_samples,
-    int num_trials,
-    bool exclude_self_loops,
-    bool replace,
-    double redundancy) {
+    const CSRMatrix& csr, int64_t num_samples, int num_trials,
+    bool exclude_self_loops, bool replace, double redundancy) {
   auto ctx = csr.indptr->ctx;
   auto dtype = csr.indptr->dtype;
   const int64_t num_row = csr.num_rows;
   const int64_t num_col = csr.num_cols;
-  const int64_t num_actual_samples = static_cast<int64_t>(num_samples * (1 + redundancy));
+  const int64_t num_actual_samples =
+      static_cast<int64_t>(num_samples * (1 + redundancy));
   IdArray row = Full<IdType>(-1, num_actual_samples, ctx);
   IdArray col = Full<IdType>(-1, num_actual_samples, ctx);
   IdArray out_row = IdArray::Empty({num_actual_samples}, dtype, ctx);
@@ -156,22 +146,25 @@ std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling(
   std::pair<IdArray, IdArray> result;
   int64_t num_out;
-  CUDA_KERNEL_CALL(_GlobalUniformNegativeSamplingKernel,
-      nb, nt, 0, stream,
-      csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(),
-      row_data, col_data, num_row, num_col, num_actual_samples, num_trials,
-      exclude_self_loops, RandomEngine::ThreadLocal()->RandInt32());
+  CUDA_KERNEL_CALL(
+      _GlobalUniformNegativeSamplingKernel, nb, nt, 0, stream,
+      csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(), row_data, col_data,
+      num_row, num_col, num_actual_samples, num_trials, exclude_self_loops,
+      RandomEngine::ThreadLocal()->RandInt32());
   size_t tmp_size = 0;
-  int64_t* num_out_cuda = static_cast<int64_t*>(device->AllocWorkspace(ctx, sizeof(int64_t)));
+  int64_t* num_out_cuda =
+      static_cast<int64_t*>(device->AllocWorkspace(ctx, sizeof(int64_t)));
   IsNotMinusOne<IdType> op;
   PairIterator<IdType> begin(row_data, col_data);
   PairIterator<IdType> out_begin(out_row_data, out_col_data);
   CUDA_CALL(cub::DeviceSelect::If(
-      nullptr, tmp_size, begin, out_begin, num_out_cuda, num_actual_samples, op, stream));
+      nullptr, tmp_size, begin, out_begin, num_out_cuda, num_actual_samples, op,
+      stream));
   void* tmp = device->AllocWorkspace(ctx, tmp_size);
   CUDA_CALL(cub::DeviceSelect::If(
-      tmp, tmp_size, begin, out_begin, num_out_cuda, num_actual_samples, op, stream));
+      tmp, tmp_size, begin, out_begin, num_out_cuda, num_actual_samples, op,
+      stream));
   num_out = cuda::GetCUDAScalar(device, ctx, num_out_cuda);
   if (!replace) {
@@ -182,28 +175,33 @@ std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling(
     PairIterator<IdType> unique_begin(unique_row_data, unique_col_data);
     SortOrderedPairs(
-        device, ctx, out_row_data, out_col_data, unique_row_data, unique_col_data,
-        num_out, stream);
+        device, ctx, out_row_data, out_col_data, unique_row_data,
+        unique_col_data, num_out, stream);
     size_t tmp_size_unique = 0;
     void* tmp_unique = nullptr;
     CUDA_CALL(cub::DeviceSelect::Unique(
-        nullptr, tmp_size_unique, out_begin, unique_begin, num_out_cuda, num_out, stream));
-    tmp_unique = (tmp_size_unique > tmp_size) ?
-        device->AllocWorkspace(ctx, tmp_size_unique) :
-        tmp;  // reuse buffer
+        nullptr, tmp_size_unique, out_begin, unique_begin, num_out_cuda,
+        num_out, stream));
+    tmp_unique = (tmp_size_unique > tmp_size)
+                     ? device->AllocWorkspace(ctx, tmp_size_unique)
+                     : tmp;  // reuse buffer
     CUDA_CALL(cub::DeviceSelect::Unique(
-        tmp_unique, tmp_size_unique, out_begin, unique_begin, num_out_cuda, num_out, stream));
+        tmp_unique, tmp_size_unique, out_begin, unique_begin, num_out_cuda,
+        num_out, stream));
     num_out = cuda::GetCUDAScalar(device, ctx, num_out_cuda);
     num_out = std::min(num_samples, num_out);
-    result = {unique_row.CreateView({num_out}, dtype), unique_col.CreateView({num_out}, dtype)};
+    result = {
+        unique_row.CreateView({num_out}, dtype),
+        unique_col.CreateView({num_out}, dtype)};
-    if (tmp_unique != tmp)
-      device->FreeWorkspace(ctx, tmp_unique);
+    if (tmp_unique != tmp) device->FreeWorkspace(ctx, tmp_unique);
   } else {
     num_out = std::min(num_samples, num_out);
-    result = {out_row.CreateView({num_out}, dtype), out_col.CreateView({num_out}, dtype)};
+    result = {
+        out_row.CreateView({num_out}, dtype),
+        out_col.CreateView({num_out}, dtype)};
   }
   device->FreeWorkspace(ctx, tmp);
@@ -211,10 +209,10 @@ std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling(
   return result;
 }

-template std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling<kDGLCUDA, int32_t>(
-    const CSRMatrix&, int64_t, int, bool, bool, double);
-template std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling<kDGLCUDA, int64_t>(
-    const CSRMatrix&, int64_t, int, bool, bool, double);
+template std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling<
+    kDGLCUDA, int32_t>(const CSRMatrix&, int64_t, int, bool, bool, double);
+template std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling<
+    kDGLCUDA, int64_t>(const CSRMatrix&, int64_t, int, bool, bool, double);

 };  // namespace impl
 };  // namespace aten

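SortOrderedPairs in the hunks above leans on the stability of cub::DeviceRadixSort::SortPairs: sorting by the minor key first and then sorting stably by the major key yields lexicographic order on the (major, minor) pairs. A host-side sketch of the same idea using std::stable_sort (illustration only, not the CUDA path):

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  std::vector<int64_t> major = {2, 1, 2, 1};
  std::vector<int64_t> minor = {3, 9, 1, 4};
  std::vector<size_t> perm = {0, 1, 2, 3};
  // Pass 1: sort by the minor key.
  std::stable_sort(perm.begin(), perm.end(),
                   [&](size_t a, size_t b) { return minor[a] < minor[b]; });
  // Pass 2: stable sort by the major key; ties keep the minor-key order.
  std::stable_sort(perm.begin(), perm.end(),
                   [&](size_t a, size_t b) { return major[a] < major[b]; });
  for (size_t p : perm)
    std::printf("(%lld, %lld)\n", static_cast<long long>(major[p]),
                static_cast<long long>(minor[p]));
  // Prints (1, 4) (1, 9) (2, 1) (2, 3): lexicographic order.
}
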
@@ -4,15 +4,15 @@
  * \brief uniform rowwise sampling
  */

+#include <curand_kernel.h>
 #include <dgl/random.h>
 #include <dgl/runtime/device_api.h>
-#include <curand_kernel.h>

 #include <numeric>

-#include "./dgl_cub.cuh"
 #include "../../array/cuda/atomic.cuh"
 #include "../../runtime/cuda/cuda_common.h"
+#include "./dgl_cub.cuh"

 using namespace dgl::aten::cuda;
@@ -25,29 +25,28 @@ namespace {
 constexpr int BLOCK_SIZE = 128;

 /**
  * @brief Compute the size of each row in the sampled CSR, without replacement.
  *
  * @tparam IdType The type of node and edge indexes.
  * @param num_picks The number of non-zero entries to pick per row.
  * @param num_rows The number of rows to pick.
  * @param in_rows The set of rows to pick.
  * @param in_ptr The index where each row's edges start.
  * @param out_deg The size of each row in the sampled matrix, as indexed by
  * `in_rows` (output).
  */
-template<typename IdType>
+template <typename IdType>
 __global__ void _CSRRowWiseSampleDegreeKernel(
-    const int64_t num_picks,
-    const int64_t num_rows,
-    const IdType * const in_rows,
-    const IdType * const in_ptr,
-    IdType * const out_deg) {
+    const int64_t num_picks, const int64_t num_rows,
+    const IdType* const in_rows, const IdType* const in_ptr,
+    IdType* const out_deg) {
   const int tIdx = threadIdx.x + blockIdx.x * blockDim.x;
   if (tIdx < num_rows) {
     const int in_row = in_rows[tIdx];
     const int out_row = tIdx;
-    out_deg[out_row] = min(static_cast<IdType>(num_picks), in_ptr[in_row + 1] - in_ptr[in_row]);
+    out_deg[out_row] = min(
+        static_cast<IdType>(num_picks), in_ptr[in_row + 1] - in_ptr[in_row]);
     if (out_row == num_rows - 1) {
       // make the prefixsum work
@@ -57,23 +56,21 @@ __global__ void _CSRRowWiseSampleDegreeKernel(
 }

 /**
  * @brief Compute the size of each row in the sampled CSR, with replacement.
  *
  * @tparam IdType The type of node and edge indexes.
  * @param num_picks The number of non-zero entries to pick per row.
  * @param num_rows The number of rows to pick.
  * @param in_rows The set of rows to pick.
  * @param in_ptr The index where each row's edges start.
  * @param out_deg The size of each row in the sampled matrix, as indexed by
  * `in_rows` (output).
  */
-template<typename IdType>
+template <typename IdType>
 __global__ void _CSRRowWiseSampleDegreeReplaceKernel(
-    const int64_t num_picks,
-    const int64_t num_rows,
-    const IdType * const in_rows,
-    const IdType * const in_ptr,
-    IdType * const out_deg) {
+    const int64_t num_picks, const int64_t num_rows,
+    const IdType* const in_rows, const IdType* const in_ptr,
+    IdType* const out_deg) {
   const int tIdx = threadIdx.x + blockIdx.x * blockDim.x;
   if (tIdx < num_rows) {
@@ -94,41 +91,36 @@ __global__ void _CSRRowWiseSampleDegreeReplaceKernel(
 }

 /**
  * @brief Perform row-wise uniform sampling on a CSR matrix,
  * and generate a COO matrix, without replacement.
  *
  * @tparam IdType The ID type used for matrices.
  * @tparam TILE_SIZE The number of rows covered by each threadblock.
  * @param rand_seed The random seed to use.
  * @param num_picks The number of non-zeros to pick per row.
  * @param num_rows The number of rows to pick.
  * @param in_rows The set of rows to pick.
  * @param in_ptr The indptr array of the input CSR.
  * @param in_index The indices array of the input CSR.
  * @param data The data array of the input CSR.
  * @param out_ptr The offset to write each row to in the output COO.
  * @param out_rows The rows of the output COO (output).
  * @param out_cols The columns of the output COO (output).
  * @param out_idxs The data array of the output COO (output).
  */
-template<typename IdType, int TILE_SIZE>
+template <typename IdType, int TILE_SIZE>
 __global__ void _CSRRowWiseSampleUniformKernel(
-    const uint64_t rand_seed,
-    const int64_t num_picks,
-    const int64_t num_rows,
-    const IdType * const in_rows,
-    const IdType * const in_ptr,
-    const IdType * const in_index,
-    const IdType * const data,
-    const IdType * const out_ptr,
-    IdType * const out_rows,
-    IdType * const out_cols,
-    IdType * const out_idxs) {
+    const uint64_t rand_seed, const int64_t num_picks, const int64_t num_rows,
+    const IdType* const in_rows, const IdType* const in_ptr,
+    const IdType* const in_index, const IdType* const data,
+    const IdType* const out_ptr, IdType* const out_rows, IdType* const out_cols,
+    IdType* const out_idxs) {
   // we assign one warp per row
   assert(blockDim.x == BLOCK_SIZE);
   int64_t out_row = blockIdx.x * TILE_SIZE;
-  const int64_t last_row = min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
+  const int64_t last_row =
+      min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
   curandStatePhilox4_32_10_t rng;
   curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
@@ -177,41 +169,36 @@ __global__ void _CSRRowWiseSampleUniformKernel(
 }

 /**
  * @brief Perform row-wise uniform sampling on a CSR matrix,
  * and generate a COO matrix, with replacement.
  *
  * @tparam IdType The ID type used for matrices.
  * @tparam TILE_SIZE The number of rows covered by each threadblock.
  * @param rand_seed The random seed to use.
  * @param num_picks The number of non-zeros to pick per row.
  * @param num_rows The number of rows to pick.
  * @param in_rows The set of rows to pick.
  * @param in_ptr The indptr array of the input CSR.
  * @param in_index The indices array of the input CSR.
  * @param data The data array of the input CSR.
  * @param out_ptr The offset to write each row to in the output COO.
  * @param out_rows The rows of the output COO (output).
  * @param out_cols The columns of the output COO (output).
  * @param out_idxs The data array of the output COO (output).
  */
-template<typename IdType, int TILE_SIZE>
+template <typename IdType, int TILE_SIZE>
 __global__ void _CSRRowWiseSampleUniformReplaceKernel(
-    const uint64_t rand_seed,
-    const int64_t num_picks,
-    const int64_t num_rows,
-    const IdType * const in_rows,
-    const IdType * const in_ptr,
-    const IdType * const in_index,
-    const IdType * const data,
-    const IdType * const out_ptr,
-    IdType * const out_rows,
-    IdType * const out_cols,
-    IdType * const out_idxs) {
+    const uint64_t rand_seed, const int64_t num_picks, const int64_t num_rows,
+    const IdType* const in_rows, const IdType* const in_ptr,
+    const IdType* const in_index, const IdType* const data,
+    const IdType* const out_ptr, IdType* const out_rows, IdType* const out_cols,
+    IdType* const out_idxs) {
   // we assign one warp per row
   assert(blockDim.x == BLOCK_SIZE);
   int64_t out_row = blockIdx.x * TILE_SIZE;
-  const int64_t last_row = min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
+  const int64_t last_row =
+      min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
   curandStatePhilox4_32_10_t rng;
   curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
@@ -229,7 +216,8 @@ __global__ void _CSRRowWiseSampleUniformReplaceKernel(
         const int64_t out_idx = out_row_start + idx;
         out_rows[out_idx] = row;
         out_cols[out_idx] = in_index[in_row_start + edge];
-        out_idxs[out_idx] = data ? data[in_row_start + edge] : in_row_start + edge;
+        out_idxs[out_idx] =
+            data ? data[in_row_start + edge] : in_row_start + edge;
       }
     }
     out_row += 1;
@@ -248,11 +236,14 @@ COOMatrix _CSRRowWiseSamplingUniform(
   cudaStream_t stream = runtime::getCurrentCUDAStream();
   const int64_t num_rows = rows->shape[0];
-  const IdType * const slice_rows = static_cast<const IdType*>(rows->data);
+  const IdType* const slice_rows = static_cast<const IdType*>(rows->data);
-  IdArray picked_row = NewIdArray(num_rows * num_picks, ctx, sizeof(IdType) * 8);
-  IdArray picked_col = NewIdArray(num_rows * num_picks, ctx, sizeof(IdType) * 8);
-  IdArray picked_idx = NewIdArray(num_rows * num_picks, ctx, sizeof(IdType) * 8);
+  IdArray picked_row =
+      NewIdArray(num_rows * num_picks, ctx, sizeof(IdType) * 8);
+  IdArray picked_col =
+      NewIdArray(num_rows * num_picks, ctx, sizeof(IdType) * 8);
+  IdArray picked_idx =
+      NewIdArray(num_rows * num_picks, ctx, sizeof(IdType) * 8);
   IdType* const out_rows = static_cast<IdType*>(picked_row->data);
   IdType* const out_cols = static_cast<IdType*>(picked_col->data);
   IdType* const out_idxs = static_cast<IdType*>(picked_idx->data);
@@ -261,65 +252,52 @@ COOMatrix _CSRRowWiseSamplingUniform(
   const IdType* in_cols = mat.indices.Ptr<IdType>();
   const IdType* data = CSRHasData(mat) ? mat.data.Ptr<IdType>() : nullptr;
   if (mat.is_pinned) {
-    CUDA_CALL(cudaHostGetDevicePointer(
-        &in_ptr, mat.indptr.Ptr<IdType>(), 0));
-    CUDA_CALL(cudaHostGetDevicePointer(
-        &in_cols, mat.indices.Ptr<IdType>(), 0));
+    CUDA_CALL(cudaHostGetDevicePointer(&in_ptr, mat.indptr.Ptr<IdType>(), 0));
+    CUDA_CALL(cudaHostGetDevicePointer(&in_cols, mat.indices.Ptr<IdType>(), 0));
     if (CSRHasData(mat)) {
-      CUDA_CALL(cudaHostGetDevicePointer(
-          &data, mat.data.Ptr<IdType>(), 0));
+      CUDA_CALL(cudaHostGetDevicePointer(&data, mat.data.Ptr<IdType>(), 0));
     }
   }

   // compute degree
-  IdType * out_deg = static_cast<IdType*>(
+  IdType* out_deg = static_cast<IdType*>(
       device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType)));
   if (replace) {
     const dim3 block(512);
     const dim3 grid((num_rows + block.x - 1) / block.x);
     CUDA_KERNEL_CALL(
-        _CSRRowWiseSampleDegreeReplaceKernel,
-        grid, block, 0, stream,
-        num_picks, num_rows, slice_rows, in_ptr, out_deg);
+        _CSRRowWiseSampleDegreeReplaceKernel, grid, block, 0, stream, num_picks,
+        num_rows, slice_rows, in_ptr, out_deg);
   } else {
     const dim3 block(512);
     const dim3 grid((num_rows + block.x - 1) / block.x);
     CUDA_KERNEL_CALL(
-        _CSRRowWiseSampleDegreeKernel,
-        grid, block, 0, stream,
-        num_picks, num_rows, slice_rows, in_ptr, out_deg);
+        _CSRRowWiseSampleDegreeKernel, grid, block, 0, stream, num_picks,
+        num_rows, slice_rows, in_ptr, out_deg);
   }

   // fill out_ptr
-  IdType * out_ptr = static_cast<IdType*>(
+  IdType* out_ptr = static_cast<IdType*>(
       device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType)));
   size_t prefix_temp_size = 0;
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(nullptr, prefix_temp_size,
-      out_deg,
-      out_ptr,
-      num_rows+1,
-      stream));
-  void * prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size);
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(prefix_temp, prefix_temp_size,
-      out_deg,
-      out_ptr,
-      num_rows+1,
-      stream));
+  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+      nullptr, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream));
+  void* prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size);
+  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+      prefix_temp, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream));
   device->FreeWorkspace(ctx, prefix_temp);
   device->FreeWorkspace(ctx, out_deg);

   cudaEvent_t copyEvent;
   CUDA_CALL(cudaEventCreate(&copyEvent));
-  // TODO(dlasalle): use pinned memory to overlap with the actual sampling, and wait on
-  // a cudaevent
+  // TODO(dlasalle): use pinned memory to overlap with the actual sampling, and
+  // wait on a cudaevent
   IdType new_len;
   // copy using the internal current stream
-  device->CopyDataFromTo(out_ptr, num_rows * sizeof(new_len), &new_len, 0,
-      sizeof(new_len),
-      ctx,
-      DGLContext{kDGLCPU, 0},
-      mat.indptr->dtype);
+  device->CopyDataFromTo(
+      out_ptr, num_rows * sizeof(new_len), &new_len, 0, sizeof(new_len), ctx,
+      DGLContext{kDGLCPU, 0}, mat.indptr->dtype);
   CUDA_CALL(cudaEventRecord(copyEvent, stream));

   const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000);
@@ -331,36 +309,16 @@ COOMatrix _CSRRowWiseSamplingUniform(
     const dim3 block(BLOCK_SIZE);
     const dim3 grid((num_rows + TILE_SIZE - 1) / TILE_SIZE);
     CUDA_KERNEL_CALL(
-        (_CSRRowWiseSampleUniformReplaceKernel<IdType, TILE_SIZE>),
-        grid, block, 0, stream,
-        random_seed,
-        num_picks,
-        num_rows,
-        slice_rows,
-        in_ptr,
-        in_cols,
-        data,
-        out_ptr,
-        out_rows,
-        out_cols,
-        out_idxs);
+        (_CSRRowWiseSampleUniformReplaceKernel<IdType, TILE_SIZE>), grid, block,
+        0, stream, random_seed, num_picks, num_rows, slice_rows, in_ptr,
+        in_cols, data, out_ptr, out_rows, out_cols, out_idxs);
   } else {  // without replacement
     const dim3 block(BLOCK_SIZE);
     const dim3 grid((num_rows + TILE_SIZE - 1) / TILE_SIZE);
     CUDA_KERNEL_CALL(
-        (_CSRRowWiseSampleUniformKernel<IdType, TILE_SIZE>),
-        grid, block, 0, stream,
-        random_seed,
-        num_picks,
-        num_rows,
-        slice_rows,
-        in_ptr,
-        in_cols,
-        data,
-        out_ptr,
-        out_rows,
-        out_cols,
-        out_idxs);
+        (_CSRRowWiseSampleUniformKernel<IdType, TILE_SIZE>), grid, block, 0,
+        stream, random_seed, num_picks, num_rows, slice_rows, in_ptr, in_cols,
+        data, out_ptr, out_rows, out_cols, out_idxs);
   }
   device->FreeWorkspace(ctx, out_ptr);
@@ -372,8 +330,8 @@ COOMatrix _CSRRowWiseSamplingUniform(
   picked_col = picked_col.CreateView({new_len}, picked_col->dtype);
   picked_idx = picked_idx.CreateView({new_len}, picked_idx->dtype);
-  return COOMatrix(mat.num_rows, mat.num_cols, picked_row,
-      picked_col, picked_idx);
+  return COOMatrix(
+      mat.num_rows, mat.num_cols, picked_row, picked_col, picked_idx);
 }

 template <DGLDeviceType XPU, typename IdType>
@@ -383,9 +341,11 @@ COOMatrix CSRRowWiseSamplingUniform(
     // Basically this is UnitGraph::InEdges().
     COOMatrix coo = CSRToCOO(CSRSliceRows(mat, rows), false);
     IdArray sliced_rows = IndexSelect(rows, coo.row);
-    return COOMatrix(mat.num_rows, mat.num_cols, sliced_rows, coo.col, coo.data);
+    return COOMatrix(
+        mat.num_rows, mat.num_cols, sliced_rows, coo.col, coo.data);
   } else {
-    return _CSRRowWiseSamplingUniform<XPU, IdType>(mat, rows, num_picks, replace);
+    return _CSRRowWiseSamplingUniform<XPU, IdType>(
+        mat, rows, num_picks, replace);
   }
 }

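The sampling path above first computes each picked row's degree capped at num_picks and then turns the degree array into output offsets with cub::DeviceScan::ExclusiveSum; out_ptr[i] is where row i's picks start and the final entry is the total number of picks. A small CPU sketch of that offsets step (hypothetical degrees, not the device code):

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const int64_t num_picks = 2;
  // Hypothetical in-degrees of the rows being sampled.
  std::vector<int64_t> row_degree = {5, 1, 3, 0};
  // One extra slot so the prefix sum yields the total count, as in the kernel.
  std::vector<int64_t> out_deg(row_degree.size() + 1, 0);
  for (size_t i = 0; i < row_degree.size(); ++i)
    out_deg[i] = std::min(num_picks, row_degree[i]);
  // Exclusive prefix sum: out_ptr[i] = sum of out_deg[0..i-1].
  std::vector<int64_t> out_ptr(out_deg.size(), 0);
  for (size_t i = 1; i < out_ptr.size(); ++i)
    out_ptr[i] = out_ptr[i - 1] + out_deg[i - 1];
  // out_ptr = {0, 2, 3, 5, 5}; the last entry is the total sample count.
  for (int64_t v : out_ptr) std::printf("%lld ", static_cast<long long>(v));
  std::printf("\n");
}
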
@@ -8,9 +8,10 @@
 #include <string>
 #include <vector>

 #include "../../runtime/cuda/cuda_common.h"
-#include "./utils.h"
 #include "./atomic.cuh"
+#include "./utils.h"

 namespace dgl {
@@ -24,11 +25,9 @@ namespace cuda {
  * \note each blockthread is responsible for aggregation on a row
  * in the result tensor.
  */
-template <typename IdType, typename DType,
-          typename ReduceOp>
+template <typename IdType, typename DType, typename ReduceOp>
 __global__ void SegmentReduceKernel(
-    const DType* feat, const IdType* offsets,
-    DType* out, IdType* arg,
+    const DType* feat, const IdType* offsets, DType* out, IdType* arg,
     int64_t n, int64_t dim) {
   for (int row = blockIdx.x; row < n; row += gridDim.x) {
     int col = blockIdx.y * blockDim.x + threadIdx.x;
@@ -39,8 +38,7 @@ __global__ void SegmentReduceKernel(
         ReduceOp::Call(&local_accum, &local_arg, feat[i * dim + col], i);
       }
       out[row * dim + col] = local_accum;
-      if (ReduceOp::require_arg)
-        arg[row * dim + col] = local_arg;
+      if (ReduceOp::require_arg) arg[row * dim + col] = local_arg;
       col += gridDim.y * blockDim.x;
     }
   }
@@ -53,8 +51,7 @@ __global__ void SegmentReduceKernel(
  */
 template <typename IdType, typename DType>
 __global__ void ScatterAddKernel(
-    const DType *feat, const IdType *idx, DType *out,
-    int64_t n, int64_t dim) {
+    const DType* feat, const IdType* idx, DType* out, int64_t n, int64_t dim) {
   for (int row = blockIdx.x; row < n; row += gridDim.x) {
     const int write_row = idx[row];
     int col = blockIdx.y * blockDim.x + threadIdx.x;
@@ -73,7 +70,7 @@ __global__ void ScatterAddKernel(
 template <typename IdType, typename DType>
 __global__ void UpdateGradMinMaxHeteroKernel(
-    const DType *feat, const IdType *idx, const IdType *idx_type, DType *out,
+    const DType* feat, const IdType* idx, const IdType* idx_type, DType* out,
     int64_t n, int64_t dim, int type) {
   unsigned int tId = threadIdx.x;
   unsigned int laneId = tId & 31;
@@ -100,8 +97,7 @@ __global__ void UpdateGradMinMaxHeteroKernel(
  */
 template <typename IdType, typename DType>
 __global__ void BackwardSegmentCmpKernel(
-    const DType *feat, const IdType *arg, DType *out,
-    int64_t n, int64_t dim) {
+    const DType* feat, const IdType* arg, DType* out, int64_t n, int64_t dim) {
   for (int row = blockIdx.x; row < n; row += gridDim.x) {
     int col = blockIdx.y * blockDim.x + threadIdx.x;
     while (col < dim) {
@@ -122,11 +118,7 @@ __global__ void BackwardSegmentCmpKernel(
  * \param arg An auxiliary tensor storing ArgMax/Min information,
  */
 template <typename IdType, typename DType, typename ReduceOp>
-void SegmentReduce(
-    NDArray feat,
-    NDArray offsets,
-    NDArray out,
-    NDArray arg) {
+void SegmentReduce(NDArray feat, NDArray offsets, NDArray out, NDArray arg) {
   const DType* feat_data = feat.Ptr<DType>();
   const IdType* offsets_data = offsets.Ptr<IdType>();
   DType* out_data = out.Ptr<DType>();
@@ -135,8 +127,7 @@ void SegmentReduce(
   cudaStream_t stream = runtime::getCurrentCUDAStream();
   int64_t n = out->shape[0];
   int64_t dim = 1;
-  for (int i = 1; i < out->ndim; ++i)
-    dim *= out->shape[i];
+  for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i];

   const int nbx = FindNumBlocks<'x'>(n);
   const int ntx = FindNumThreads(dim);
@@ -145,10 +136,9 @@ void SegmentReduce(
   const dim3 nblks(nbx, nby);
   const dim3 nthrs(ntx, nty);
   // TODO(zihao): try cub's DeviceSegmentedReduce and compare the performance.
-  CUDA_KERNEL_CALL((SegmentReduceKernel<IdType, DType, ReduceOp>),
-      nblks, nthrs, 0, stream,
-      feat_data, offsets_data, out_data, arg_data,
-      n, dim);
+  CUDA_KERNEL_CALL(
+      (SegmentReduceKernel<IdType, DType, ReduceOp>), nblks, nthrs, 0, stream,
+      feat_data, offsets_data, out_data, arg_data, n, dim);
 }

 /*!
@@ -159,19 +149,15 @@ void SegmentReduce(
  * \param out The output tensor.
  */
 template <typename IdType, typename DType>
-void ScatterAdd(
-    NDArray feat,
-    NDArray idx,
-    NDArray out) {
+void ScatterAdd(NDArray feat, NDArray idx, NDArray out) {
   const DType* feat_data = feat.Ptr<DType>();
   const IdType* idx_data = idx.Ptr<IdType>();
-  DType *out_data = out.Ptr<DType>();
+  DType* out_data = out.Ptr<DType>();

   cudaStream_t stream = runtime::getCurrentCUDAStream();
   int64_t n = feat->shape[0];
   int64_t dim = 1;
-  for (int i = 1; i < out->ndim; ++i)
-    dim *= out->shape[i];
+  for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i];

   const int nbx = FindNumBlocks<'x'>(n);
   const int ntx = FindNumThreads(dim);
@@ -179,10 +165,9 @@ void ScatterAdd(
   const int nty = 1;
   const dim3 nblks(nbx, nby);
   const dim3 nthrs(ntx, nty);
-  CUDA_KERNEL_CALL((ScatterAddKernel<IdType, DType>),
-      nblks, nthrs, 0, stream,
-      feat_data, idx_data, out_data,
-      n, dim);
+  CUDA_KERNEL_CALL(
+      (ScatterAddKernel<IdType, DType>), nblks, nthrs, 0, stream, feat_data,
+      idx_data, out_data, n, dim);
 }

 /*!
@@ -195,24 +180,26 @@ void ScatterAdd(
  * \param list_out List of the output tensors.
  */
 template <typename IdType, typename DType>
-void UpdateGradMinMax_hetero(const HeteroGraphPtr& graph,
-    const std::string& op,
-    const std::vector<NDArray>& list_feat,
-    const std::vector<NDArray>& list_idx,
+void UpdateGradMinMax_hetero(
+    const HeteroGraphPtr& graph, const std::string& op,
+    const std::vector<NDArray>& list_feat, const std::vector<NDArray>& list_idx,
     const std::vector<NDArray>& list_idx_types,
     std::vector<NDArray>* list_out) {
   cudaStream_t stream = runtime::getCurrentCUDAStream();
   if (op == "copy_lhs" || op == "copy_rhs") {
-    std::vector<std::vector<dgl_id_t>> src_dst_ntypes(graph->NumVertexTypes(),
-        std::vector<dgl_id_t>());
+    std::vector<std::vector<dgl_id_t>> src_dst_ntypes(
+        graph->NumVertexTypes(), std::vector<dgl_id_t>());
     for (dgl_type_t etype = 0; etype < graph->NumEdgeTypes(); ++etype) {
       auto pair = graph->meta_graph()->FindEdge(etype);
       const dgl_id_t dst_ntype = pair.first;  // graph is reversed
       const dgl_id_t src_ntype = pair.second;
-      auto same_src_dst_ntype = std::find(std::begin(src_dst_ntypes[dst_ntype]),
+      auto same_src_dst_ntype = std::find(
+          std::begin(src_dst_ntypes[dst_ntype]),
           std::end(src_dst_ntypes[dst_ntype]), src_ntype);
-      // if op is "copy_lhs", relation type with same src and dst node type will be updated once
-      if (op == "copy_lhs" && same_src_dst_ntype != std::end(src_dst_ntypes[dst_ntype]))
+      // if op is "copy_lhs", relation type with same src and dst node type will
+      // be updated once
+      if (op == "copy_lhs" &&
+          same_src_dst_ntype != std::end(src_dst_ntypes[dst_ntype]))
         continue;
       src_dst_ntypes[dst_ntype].push_back(src_ntype);
       const DType* feat_data = list_feat[dst_ntype].Ptr<DType>();
@@ -229,35 +216,31 @@ void UpdateGradMinMax_hetero(const HeteroGraphPtr& graph,
       const int nbx = FindNumBlocks<'x'>((n * th_per_row + ntx - 1) / ntx);
       const dim3 nblks(nbx);
       const dim3 nthrs(ntx);
-      CUDA_KERNEL_CALL((UpdateGradMinMaxHeteroKernel<IdType, DType>),
-          nblks, nthrs, 0, stream,
-          feat_data, idx_data, idx_type_data,
-          out_data, n, dim, type);
+      CUDA_KERNEL_CALL(
+          (UpdateGradMinMaxHeteroKernel<IdType, DType>), nblks, nthrs, 0,
+          stream, feat_data, idx_data, idx_type_data, out_data, n, dim, type);
     }
   }
 }

 /*!
- * \brief CUDA implementation of backward phase of Segment Reduce with Min/Max reducer.
- * \note math equation: out[arg[i, k], k] = feat[i, k]
- * \param feat The input tensor.
+ * \brief CUDA implementation of backward phase of Segment Reduce with Min/Max
+ * reducer.
+ * \note math equation: out[arg[i, k], k] = feat[i, k] \param feat The input
+ * tensor.
  * \param arg The ArgMin/Max information, used for indexing.
  * \param out The output tensor.
  */
 template <typename IdType, typename DType>
-void BackwardSegmentCmp(
-    NDArray feat,
-    NDArray arg,
-    NDArray out) {
+void BackwardSegmentCmp(NDArray feat, NDArray arg, NDArray out) {
   const DType* feat_data = feat.Ptr<DType>();
   const IdType* arg_data = arg.Ptr<IdType>();
-  DType *out_data = out.Ptr<DType>();
+  DType* out_data = out.Ptr<DType>();

   cudaStream_t stream = runtime::getCurrentCUDAStream();
   int64_t n = feat->shape[0];
   int64_t dim = 1;
-  for (int i = 1; i < out->ndim; ++i)
-    dim *= out->shape[i];
+  for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i];

   const int nbx = FindNumBlocks<'x'>(n);
   const int ntx = FindNumThreads(dim);
@@ -265,10 +248,9 @@ void BackwardSegmentCmp(
   const int nty = 1;
   const dim3 nblks(nbx, nby);
   const dim3 nthrs(ntx, nty);
-  CUDA_KERNEL_CALL((BackwardSegmentCmpKernel<IdType, DType>),
-      nblks, nthrs, 0, stream,
-      feat_data, arg_data, out_data,
-      n, dim);
+  CUDA_KERNEL_CALL(
+      (BackwardSegmentCmpKernel<IdType, DType>), nblks, nthrs, 0, stream,
+      feat_data, arg_data, out_data, n, dim);
 }

 }  // namespace cuda

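SegmentReduceKernel in the hunks above reduces feat[j, k] over offsets[row] <= j < offsets[row + 1] for every output row and feature column. A CPU sketch of a segment sum under the same row-major layout (hypothetical data; the templated reducer and the argmin/argmax bookkeeping are omitted):

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const int64_t dim = 2;
  // Three segments over five input rows: [0,2), [2,2) (empty), and [2,5).
  std::vector<int64_t> offsets = {0, 2, 2, 5};
  std::vector<float> feat = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};  // 5 x 2, row-major
  std::vector<float> out((offsets.size() - 1) * dim, 0.f);
  for (size_t row = 0; row + 1 < offsets.size(); ++row)
    for (int64_t col = 0; col < dim; ++col)
      for (int64_t i = offsets[row]; i < offsets[row + 1]; ++i)
        out[row * dim + col] += feat[i * dim + col];
  // out = {4, 6, 0, 0, 21, 24}
  for (float v : out) std::printf("%g ", v);
  std::printf("\n");
}
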
@@ -4,12 +4,14 @@
  * \brief COO operator GPU implementation
  */
 #include <dgl/array.h>

-#include <vector>
-#include <unordered_set>
 #include <numeric>
+#include <unordered_set>
+#include <vector>

 #include "../../runtime/cuda/cuda_common.h"
-#include "./utils.h"
 #include "./atomic.cuh"
+#include "./utils.h"

 namespace dgl {
@@ -19,9 +21,8 @@ using namespace cuda;
 namespace aten {
 namespace impl {

 template <typename IdType>
-__device__ void _warpReduce(volatile IdType *sdata, IdType tid) {
+__device__ void _warpReduce(volatile IdType* sdata, IdType tid) {
   sdata[tid] += sdata[tid + 32];
   sdata[tid] += sdata[tid + 16];
   sdata[tid] += sdata[tid + 8];
@@ -32,10 +33,8 @@ __device__ void _warpReduce(volatile IdType *sdata, IdType tid) {
 template <typename IdType>
 __global__ void _COOGetRowNNZKernel(
-    const IdType* __restrict__ row_indices,
-    IdType* __restrict__ glb_cnt,
-    const int64_t row_query,
-    IdType nnz) {
+    const IdType* __restrict__ row_indices, IdType* __restrict__ glb_cnt,
+    const int64_t row_query, IdType nnz) {
   __shared__ IdType local_cnt[1024];
   IdType tx = threadIdx.x;
   IdType bx = blockIdx.x;
@@ -80,10 +79,9 @@ int64_t COOGetRowNNZ(COOMatrix coo, int64_t row) {
   IdType nb = dgl::cuda::FindNumBlocks<'x'>((nnz + nt - 1) / nt);
   NDArray rst = NDArray::Empty({1}, coo.row->dtype, coo.row->ctx);
   _Fill(rst.Ptr<IdType>(), 1, IdType(0));
-  CUDA_KERNEL_CALL(_COOGetRowNNZKernel,
-      nb, nt, 0, stream,
-      coo.row.Ptr<IdType>(), rst.Ptr<IdType>(),
-      row, nnz);
+  CUDA_KERNEL_CALL(
+      _COOGetRowNNZKernel, nb, nt, 0, stream, coo.row.Ptr<IdType>(),
+      rst.Ptr<IdType>(), row, nnz);
   rst = rst.CopyTo(DGLContext{kDGLCPU, 0});
   return *rst.Ptr<IdType>();
 }
@@ -93,8 +91,7 @@ template int64_t COOGetRowNNZ<kDGLCUDA, int64_t>(COOMatrix, int64_t);
 template <typename IdType>
 __global__ void _COOGetAllRowNNZKernel(
-    const IdType* __restrict__ row_indices,
-    IdType* __restrict__ glb_cnts,
+    const IdType* __restrict__ row_indices, IdType* __restrict__ glb_cnts,
     IdType nnz) {
   IdType eid = blockIdx.x * blockDim.x + threadIdx.x;
   while (eid < nnz) {
@@ -118,20 +115,18 @@ NDArray COOGetRowNNZ(COOMatrix coo, NDArray rows) {
     IdType nb = dgl::cuda::FindNumBlocks<'x'>((nnz + nt - 1) / nt);
     NDArray rst = NDArray::Empty({1}, coo.row->dtype, coo.row->ctx);
     _Fill(rst.Ptr<IdType>(), 1, IdType(0));
-    CUDA_KERNEL_CALL(_COOGetRowNNZKernel,
-        nb, nt, 0, stream,
-        coo.row.Ptr<IdType>(), rst.Ptr<IdType>(),
-        row, nnz);
+    CUDA_KERNEL_CALL(
+        _COOGetRowNNZKernel, nb, nt, 0, stream, coo.row.Ptr<IdType>(),
+        rst.Ptr<IdType>(), row, nnz);
     return rst;
   } else {
     IdType nt = 1024;
     IdType nb = dgl::cuda::FindNumBlocks<'x'>((nnz + nt - 1) / nt);
     NDArray in_degrees = NDArray::Empty({num_rows}, rows->dtype, rows->ctx);
     _Fill(in_degrees.Ptr<IdType>(), num_rows, IdType(0));
-    CUDA_KERNEL_CALL(_COOGetAllRowNNZKernel,
-        nb, nt, 0, stream,
-        coo.row.Ptr<IdType>(), in_degrees.Ptr<IdType>(),
-        nnz);
+    CUDA_KERNEL_CALL(
+        _COOGetAllRowNNZKernel, nb, nt, 0, stream, coo.row.Ptr<IdType>(),
+        in_degrees.Ptr<IdType>(), nnz);
     return IndexSelect(in_degrees, rows);
   }
 }

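COOGetRowNNZ above counts how many entries of coo.row equal the queried row, using a shared-memory block reduction followed by an atomic add into a single global counter. Functionally it matches a plain count over the row array, as in this host-side illustration (hypothetical data):

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  std::vector<int64_t> coo_row = {0, 2, 2, 1, 2, 0};
  const int64_t row_query = 2;
  // Number of nonzeros in the queried row of the COO matrix.
  const int64_t nnz = std::count(coo_row.begin(), coo_row.end(), row_query);
  std::printf("%lld\n", static_cast<long long>(nnz));  // prints 3
}
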
...@@ -4,17 +4,18 @@ ...@@ -4,17 +4,18 @@
* \brief Array index select GPU implementation * \brief Array index select GPU implementation
*/ */
#include <dgl/array.h> #include <dgl/array.h>
#include "../../../runtime/cuda/cuda_common.h" #include "../../../runtime/cuda/cuda_common.h"
#include "../array_index_select.cuh" #include "../array_index_select.cuh"
#include "./array_index_select_uvm.cuh"
#include "../utils.h" #include "../utils.h"
#include "./array_index_select_uvm.cuh"
namespace dgl { namespace dgl {
using runtime::NDArray; using runtime::NDArray;
namespace aten { namespace aten {
namespace impl { namespace impl {
template<typename DType, typename IdType> template <typename DType, typename IdType>
NDArray IndexSelectCPUFromGPU(NDArray array, IdArray index) { NDArray IndexSelectCPUFromGPU(NDArray array, IdArray index) {
cudaStream_t stream = runtime::getCurrentCUDAStream(); cudaStream_t stream = runtime::getCurrentCUDAStream();
const IdType* idx_data = static_cast<IdType*>(index->data); const IdType* idx_data = static_cast<IdType*>(index->data);
...@@ -34,30 +35,30 @@ NDArray IndexSelectCPUFromGPU(NDArray array, IdArray index) { ...@@ -34,30 +35,30 @@ NDArray IndexSelectCPUFromGPU(NDArray array, IdArray index) {
} }
NDArray ret = NDArray::Empty(shape, array->dtype, index->ctx); NDArray ret = NDArray::Empty(shape, array->dtype, index->ctx);
if (len == 0) if (len == 0) return ret;
return ret;
DType* ret_data = static_cast<DType*>(ret->data); DType* ret_data = static_cast<DType*>(ret->data);
if (num_feat == 1) { if (num_feat == 1) {
const int nt = cuda::FindNumThreads(len); const int nt = cuda::FindNumThreads(len);
const int nb = (len + nt - 1) / nt; const int nb = (len + nt - 1) / nt;
CUDA_KERNEL_CALL(IndexSelectSingleKernel, nb, nt, 0, CUDA_KERNEL_CALL(
stream, array_data, idx_data, len, arr_len, ret_data); IndexSelectSingleKernel, nb, nt, 0, stream, array_data, idx_data, len,
arr_len, ret_data);
} else { } else {
dim3 block(256, 1); dim3 block(256, 1);
while (static_cast<int64_t>(block.x) >= 2*num_feat) { while (static_cast<int64_t>(block.x) >= 2 * num_feat) {
block.x /= 2; block.x /= 2;
block.y *= 2; block.y *= 2;
} }
const dim3 grid((len+block.y-1)/block.y); const dim3 grid((len + block.y - 1) / block.y);
if (num_feat * sizeof(DType) < 2 * CACHE_LINE_SIZE) { if (num_feat * sizeof(DType) < 2 * CACHE_LINE_SIZE) {
CUDA_KERNEL_CALL(IndexSelectMultiKernel, grid, block, 0, CUDA_KERNEL_CALL(
stream, array_data, num_feat, idx_data, IndexSelectMultiKernel, grid, block, 0, stream, array_data, num_feat,
len, arr_len, ret_data); idx_data, len, arr_len, ret_data);
} else { } else {
CUDA_KERNEL_CALL(IndexSelectMultiKernelAligned, grid, block, 0, CUDA_KERNEL_CALL(
stream, array_data, num_feat, idx_data, IndexSelectMultiKernelAligned, grid, block, 0, stream, array_data,
len, arr_len, ret_data); num_feat, idx_data, len, arr_len, ret_data);
} }
} }
return ret; return ret;
...@@ -73,8 +74,7 @@ template NDArray IndexSelectCPUFromGPU<int32_t, int64_t>(NDArray, IdArray); ...@@ -73,8 +74,7 @@ template NDArray IndexSelectCPUFromGPU<int32_t, int64_t>(NDArray, IdArray);
template NDArray IndexSelectCPUFromGPU<int64_t, int32_t>(NDArray, IdArray); template NDArray IndexSelectCPUFromGPU<int64_t, int32_t>(NDArray, IdArray);
template NDArray IndexSelectCPUFromGPU<int64_t, int64_t>(NDArray, IdArray); template NDArray IndexSelectCPUFromGPU<int64_t, int64_t>(NDArray, IdArray);
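The 2-D launch geometry used here (and again in IndexScatterGPUToCPU below) keeps a block at 256 threads while shrinking block.x toward num_feat and growing block.y, so the x lanes stride over one row's features and each y lane handles a different output row. A standalone sketch of that loop (the helper name is hypothetical, not DGL code):

#include <cstdint>
#include <cstdio>
#include <utility>

// Mirrors the block-shape loop above: halve x and double y while 2 * num_feat
// still fits in x, so x ends up close to (but not below) num_feat.
std::pair<int, int> PickBlockShape(int64_t num_feat) {
  int x = 256, y = 1;
  while (static_cast<int64_t>(x) >= 2 * num_feat) {
    x /= 2;
    y *= 2;
  }
  return {x, y};
}

int main() {
  auto b = PickBlockShape(16);   // -> (16, 16): 16 feature lanes, 16 rows per block
  std::printf("block = (%d, %d)\n", b.first, b.second);
  b = PickBlockShape(300);       // -> (256, 1): wide rows keep the full x dimension
  std::printf("block = (%d, %d)\n", b.first, b.second);
  return 0;
}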
template <typename DType, typename IdType>
template<typename DType, typename IdType>
void IndexScatterGPUToCPU(NDArray dest, IdArray index, NDArray source) { void IndexScatterGPUToCPU(NDArray dest, IdArray index, NDArray source) {
cudaStream_t stream = runtime::getCurrentCUDAStream(); cudaStream_t stream = runtime::getCurrentCUDAStream();
const DType* source_data = static_cast<DType*>(source->data); const DType* source_data = static_cast<DType*>(source->data);
...@@ -94,24 +94,24 @@ void IndexScatterGPUToCPU(NDArray dest, IdArray index, NDArray source) { ...@@ -94,24 +94,24 @@ void IndexScatterGPUToCPU(NDArray dest, IdArray index, NDArray source) {
num_feat *= source->shape[d]; num_feat *= source->shape[d];
} }
if (len == 0) if (len == 0) return;
return;
if (num_feat == 1) { if (num_feat == 1) {
const int nt = cuda::FindNumThreads(len); const int nt = cuda::FindNumThreads(len);
const int nb = (len + nt - 1) / nt; const int nb = (len + nt - 1) / nt;
CUDA_KERNEL_CALL(IndexScatterSingleKernel, nb, nt, 0, CUDA_KERNEL_CALL(
stream, source_data, idx_data, len, arr_len, dest_data); IndexScatterSingleKernel, nb, nt, 0, stream, source_data, idx_data, len,
arr_len, dest_data);
} else { } else {
dim3 block(256, 1); dim3 block(256, 1);
while (static_cast<int64_t>(block.x) >= 2*num_feat) { while (static_cast<int64_t>(block.x) >= 2 * num_feat) {
block.x /= 2; block.x /= 2;
block.y *= 2; block.y *= 2;
} }
const dim3 grid((len+block.y-1)/block.y); const dim3 grid((len + block.y - 1) / block.y);
CUDA_KERNEL_CALL(IndexScatterMultiKernel, grid, block, 0, CUDA_KERNEL_CALL(
stream, source_data, num_feat, idx_data, IndexScatterMultiKernel, grid, block, 0, stream, source_data, num_feat,
len, arr_len, dest_data); idx_data, len, arr_len, dest_data);
} }
} }
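IndexScatterGPUToCPU is the mirror of the gather above: row i of the GPU-resident source is written to row index[i] of the pinned dest. A host-side reference of that semantics (sketch only, not DGL code):

#include <cassert>
#include <cstdint>
#include <vector>

// dest holds arr_len rows of num_feat values; source holds index.size() rows.
void IndexScatterReference(
    std::vector<float>* dest, const std::vector<int64_t>& index,
    const std::vector<float>& source, int64_t num_feat) {
  const int64_t arr_len = static_cast<int64_t>(dest->size()) / num_feat;
  for (std::size_t i = 0; i < index.size(); ++i) {
    assert(index[i] >= 0 && index[i] < arr_len);
    for (int64_t f = 0; f < num_feat; ++f)
      (*dest)[index[i] * num_feat + f] = source[i * num_feat + f];
  }
}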
......
...@@ -14,31 +14,28 @@ namespace aten { ...@@ -14,31 +14,28 @@ namespace aten {
namespace impl { namespace impl {
/* This is a cross-device access version of IndexSelectMultiKernel. /* This is a cross-device access version of IndexSelectMultiKernel.
* Since the memory access over PCIe is more sensitive to the * Since the memory access over PCIe is more sensitive to the
* data access alignment (cache line), we need a separate version here. * data access alignment (cache line), we need a separate version here.
*/ */
template <typename DType, typename IdType> template <typename DType, typename IdType>
__global__ void IndexSelectMultiKernelAligned( __global__ void IndexSelectMultiKernelAligned(
const DType* const array, const DType* const array, const int64_t num_feat, const IdType* const index,
const int64_t num_feat, const int64_t length, const int64_t arr_len, DType* const out) {
const IdType* const index, int64_t out_row = blockIdx.x * blockDim.y + threadIdx.y;
const int64_t length,
const int64_t arr_len,
DType* const out) {
int64_t out_row = blockIdx.x*blockDim.y+threadIdx.y;
const int64_t stride = blockDim.y*gridDim.x; const int64_t stride = blockDim.y * gridDim.x;
while (out_row < length) { while (out_row < length) {
int64_t col = threadIdx.x; int64_t col = threadIdx.x;
const int64_t in_row = index[out_row]; const int64_t in_row = index[out_row];
assert(in_row >= 0 && in_row < arr_len); assert(in_row >= 0 && in_row < arr_len);
const int64_t idx_offset = const int64_t idx_offset =
((uint64_t)(&array[in_row*num_feat]) % CACHE_LINE_SIZE) / sizeof(DType); ((uint64_t)(&array[in_row * num_feat]) % CACHE_LINE_SIZE) /
sizeof(DType);
col = col - idx_offset; col = col - idx_offset;
while (col < num_feat) { while (col < num_feat) {
if (col >= 0) if (col >= 0)
out[out_row*num_feat+col] = array[in_row*num_feat+col]; out[out_row * num_feat + col] = array[in_row * num_feat + col];
col += blockDim.x; col += blockDim.x;
} }
out_row += stride; out_row += stride;
......
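The idx_offset computation is what makes this kernel "aligned": every thread's starting column is shifted back by the row start's offset within its cache line, so each strided pass of the warp begins on a cache-line boundary of array, and the col >= 0 guard drops the positions the shift pushes in front of the row. The shift itself, extracted as a standalone sketch (the CACHE_LINE_SIZE value here is an assumption, not taken from DGL):

#include <cstdint>

constexpr std::uint64_t kCacheLineSize = 128;  // assumed stand-in for CACHE_LINE_SIZE

// Number of DType elements between the enclosing cache-line boundary and the
// start of row `in_row`; the kernel subtracts this from each thread's column.
template <typename DType>
std::int64_t AlignedStartShift(
    const DType* array, std::int64_t in_row, std::int64_t num_feat) {
  const auto addr = reinterpret_cast<std::uint64_t>(&array[in_row * num_feat]);
  return static_cast<std::int64_t>((addr % kCacheLineSize) / sizeof(DType));
}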
...@@ -6,49 +6,49 @@ ...@@ -6,49 +6,49 @@
#include "./filter.h" #include "./filter.h"
#include <dgl/runtime/registry.h>
#include <dgl/runtime/packed_func.h>
#include <dgl/packed_func_ext.h> #include <dgl/packed_func_ext.h>
#include <dgl/runtime/packed_func.h>
#include <dgl/runtime/registry.h>
namespace dgl { namespace dgl {
namespace array { namespace array {
using namespace dgl::runtime; using namespace dgl::runtime;
template<DGLDeviceType XPU, typename IdType> template <DGLDeviceType XPU, typename IdType>
FilterRef CreateSetFilter(IdArray set); FilterRef CreateSetFilter(IdArray set);
DGL_REGISTER_GLOBAL("utils.filter._CAPI_DGLFilterCreateFromSet") DGL_REGISTER_GLOBAL("utils.filter._CAPI_DGLFilterCreateFromSet")
.set_body([] (DGLArgs args, DGLRetValue* rv) { .set_body([](DGLArgs args, DGLRetValue* rv) {
IdArray array = args[0]; IdArray array = args[0];
auto ctx = array->ctx; auto ctx = array->ctx;
// TODO(nv-dlasalle): Implement CPU version. // TODO(nv-dlasalle): Implement CPU version.
if (ctx.device_type == kDGLCUDA) { if (ctx.device_type == kDGLCUDA) {
#ifdef DGL_USE_CUDA #ifdef DGL_USE_CUDA
ATEN_ID_TYPE_SWITCH(array->dtype, IdType, { ATEN_ID_TYPE_SWITCH(array->dtype, IdType, {
*rv = CreateSetFilter<kDGLCUDA, IdType>(array); *rv = CreateSetFilter<kDGLCUDA, IdType>(array);
}); });
#else #else
LOG(FATAL) << "GPU support not compiled."; LOG(FATAL) << "GPU support not compiled.";
#endif #endif
} else { } else {
LOG(FATAL) << "CPU support not yet implemented."; LOG(FATAL) << "CPU support not yet implemented.";
} }
}); });
DGL_REGISTER_GLOBAL("utils.filter._CAPI_DGLFilterFindIncludedIndices") DGL_REGISTER_GLOBAL("utils.filter._CAPI_DGLFilterFindIncludedIndices")
.set_body([] (DGLArgs args, DGLRetValue* rv) { .set_body([](DGLArgs args, DGLRetValue* rv) {
FilterRef filter = args[0]; FilterRef filter = args[0];
IdArray array = args[1]; IdArray array = args[1];
*rv = filter->find_included_indices(array); *rv = filter->find_included_indices(array);
}); });
DGL_REGISTER_GLOBAL("utils.filter._CAPI_DGLFilterFindExcludedIndices") DGL_REGISTER_GLOBAL("utils.filter._CAPI_DGLFilterFindExcludedIndices")
.set_body([] (DGLArgs args, DGLRetValue* rv) { .set_body([](DGLArgs args, DGLRetValue* rv) {
FilterRef filter = args[0]; FilterRef filter = args[0];
IdArray array = args[1]; IdArray array = args[1];
*rv = filter->find_excluded_indices(array); *rv = filter->find_excluded_indices(array);
}); });
} // namespace array } // namespace array
} // namespace dgl } // namespace dgl
...@@ -4,12 +4,11 @@ ...@@ -4,12 +4,11 @@
* \brief Object for selecting items in a set, or selecting items not in a set. * \brief Object for selecting items in a set, or selecting items not in a set.
*/ */
#ifndef DGL_ARRAY_FILTER_H_ #ifndef DGL_ARRAY_FILTER_H_
#define DGL_ARRAY_FILTER_H_ #define DGL_ARRAY_FILTER_H_
#include <dgl/runtime/object.h>
#include <dgl/array.h> #include <dgl/array.h>
#include <dgl/runtime/object.h>
namespace dgl { namespace dgl {
namespace array { namespace array {
...@@ -28,8 +27,7 @@ class Filter : public runtime::Object { ...@@ -28,8 +27,7 @@ class Filter : public runtime::Object {
* @return The indices of the items from `test` that are selected by * @return The indices of the items from `test` that are selected by
* this filter. * this filter.
*/ */
virtual IdArray find_included_indices( virtual IdArray find_included_indices(IdArray test) = 0;
IdArray test) = 0;
/** /**
* @brief From the test set of items, get the indices of those which are * @brief From the test set of items, get the indices of those which are
...@@ -40,8 +38,7 @@ class Filter : public runtime::Object { ...@@ -40,8 +38,7 @@ class Filter : public runtime::Object {
* @return The indices of the items from `test` that are not selected by this * @return The indices of the items from `test` that are not selected by this
* filter. * filter.
*/ */
virtual IdArray find_excluded_indices( virtual IdArray find_excluded_indices(IdArray test) = 0;
IdArray test) = 0;
}; };
DGL_DEFINE_OBJECT_REF(FilterRef, Filter); DGL_DEFINE_OBJECT_REF(FilterRef, Filter);
...@@ -50,4 +47,3 @@ DGL_DEFINE_OBJECT_REF(FilterRef, Filter); ...@@ -50,4 +47,3 @@ DGL_DEFINE_OBJECT_REF(FilterRef, Filter);
} // namespace dgl } // namespace dgl
#endif // DGL_ARRAY_FILTER_H_ #endif // DGL_ARRAY_FILTER_H_
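To make the two pure virtuals concrete: for a set filter built from {2, 5, 7}, find_included_indices on the test array [5, 1, 2, 9] yields [0, 2] and find_excluded_indices yields [1, 3]; the results are indices into test, not the values themselves. A host-side STL sketch of that semantics (illustrative only; the real implementations return IdArray and the CUDA filter runs on the device):

#include <cstdint>
#include <unordered_set>
#include <vector>

std::vector<int64_t> FindIncludedReference(
    const std::unordered_set<int64_t>& set, const std::vector<int64_t>& test) {
  std::vector<int64_t> idx;
  for (std::size_t i = 0; i < test.size(); ++i)
    if (set.count(test[i])) idx.push_back(static_cast<int64_t>(i));
  return idx;
}
// find_excluded_indices is the complement: keep i when set.count(test[i]) == 0.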
...@@ -4,7 +4,9 @@ ...@@ -4,7 +4,9 @@
* \brief DGL array utilities implementation * \brief DGL array utilities implementation
*/ */
#include <dgl/array.h> #include <dgl/array.h>
#include <sstream> #include <sstream>
#include "../c_api_common.h" #include "../c_api_common.h"
#include "./uvm_array_op.h" #include "./uvm_array_op.h"
...@@ -35,10 +37,12 @@ void IndexScatterGPUToCPU(NDArray dest, IdArray index, NDArray source) { ...@@ -35,10 +37,12 @@ void IndexScatterGPUToCPU(NDArray dest, IdArray index, NDArray source) {
#ifdef DGL_USE_CUDA #ifdef DGL_USE_CUDA
CHECK(dest.IsPinned()) << "Destination array must be in pinned memory."; CHECK(dest.IsPinned()) << "Destination array must be in pinned memory.";
CHECK_EQ(index->ctx.device_type, kDGLCUDA) << "Index must be on the GPU."; CHECK_EQ(index->ctx.device_type, kDGLCUDA) << "Index must be on the GPU.";
CHECK_EQ(source->ctx.device_type, kDGLCUDA) << "Source array must be on the GPU."; CHECK_EQ(source->ctx.device_type, kDGLCUDA)
<< "Source array must be on the GPU.";
CHECK_EQ(dest->dtype, source->dtype) << "Destination array and source " CHECK_EQ(dest->dtype, source->dtype) << "Destination array and source "
"array must have the same dtype."; "array must have the same dtype.";
CHECK_GE(dest->ndim, 1) << "Destination array must have at least 1 dimension."; CHECK_GE(dest->ndim, 1)
<< "Destination array must have at least 1 dimension.";
CHECK_EQ(index->ndim, 1) << "Index must be a 1D array."; CHECK_EQ(index->ndim, 1) << "Index must be a 1D array.";
ATEN_DTYPE_BITS_ONLY_SWITCH(source->dtype, DType, "values", { ATEN_DTYPE_BITS_ONLY_SWITCH(source->dtype, DType, "values", {
...@@ -52,21 +56,19 @@ void IndexScatterGPUToCPU(NDArray dest, IdArray index, NDArray source) { ...@@ -52,21 +56,19 @@ void IndexScatterGPUToCPU(NDArray dest, IdArray index, NDArray source) {
} }
DGL_REGISTER_GLOBAL("ndarray.uvm._CAPI_DGLIndexSelectCPUFromGPU") DGL_REGISTER_GLOBAL("ndarray.uvm._CAPI_DGLIndexSelectCPUFromGPU")
.set_body([] (DGLArgs args, DGLRetValue* rv) { .set_body([](DGLArgs args, DGLRetValue* rv) {
NDArray array = args[0]; NDArray array = args[0];
IdArray index = args[1]; IdArray index = args[1];
*rv = IndexSelectCPUFromGPU(array, index); *rv = IndexSelectCPUFromGPU(array, index);
}); });
DGL_REGISTER_GLOBAL("ndarray.uvm._CAPI_DGLIndexScatterGPUToCPU") DGL_REGISTER_GLOBAL("ndarray.uvm._CAPI_DGLIndexScatterGPUToCPU")
.set_body([] (DGLArgs args, DGLRetValue* rv) { .set_body([](DGLArgs args, DGLRetValue* rv) {
NDArray dest = args[0]; NDArray dest = args[0];
IdArray index = args[1]; IdArray index = args[1];
NDArray source = args[2]; NDArray source = args[2];
IndexScatterGPUToCPU(dest, index, source); IndexScatterGPUToCPU(dest, index, source);
}); });
} // namespace aten } // namespace aten
} // namespace dgl } // namespace dgl
...@@ -7,6 +7,7 @@ ...@@ -7,6 +7,7 @@
#define DGL_ARRAY_UVM_ARRAY_OP_H_ #define DGL_ARRAY_UVM_ARRAY_OP_H_
#include <dgl/array.h> #include <dgl/array.h>
#include <utility> #include <utility>
namespace dgl { namespace dgl {
......
...@@ -3,14 +3,15 @@ ...@@ -3,14 +3,15 @@
* \file c_runtime_api.cc * \file c_runtime_api.cc
* \brief DGL C API common implementations * \brief DGL C API common implementations
*/ */
#include <dgl/graph_interface.h>
#include "c_api_common.h" #include "c_api_common.h"
#include <dgl/graph_interface.h>
using dgl::runtime::DGLArgs; using dgl::runtime::DGLArgs;
using dgl::runtime::DGLArgValue; using dgl::runtime::DGLArgValue;
using dgl::runtime::DGLRetValue; using dgl::runtime::DGLRetValue;
using dgl::runtime::PackedFunc;
using dgl::runtime::NDArray; using dgl::runtime::NDArray;
using dgl::runtime::PackedFunc;
namespace dgl { namespace dgl {
...@@ -27,7 +28,7 @@ PackedFunc ConvertNDArrayVectorToPackedFunc(const std::vector<NDArray>& vec) { ...@@ -27,7 +28,7 @@ PackedFunc ConvertNDArrayVectorToPackedFunc(const std::vector<NDArray>& vec) {
} }
PackedFunc ConvertEdgeArrayToPackedFunc(const EdgeArray& ea) { PackedFunc ConvertEdgeArrayToPackedFunc(const EdgeArray& ea) {
auto body = [ea] (DGLArgs args, DGLRetValue* rv) { auto body = [ea](DGLArgs args, DGLRetValue* rv) {
const int which = args[0]; const int which = args[0];
if (which == 0) { if (which == 0) {
*rv = std::move(ea.src); *rv = std::move(ea.src);
......
...@@ -6,15 +6,16 @@ ...@@ -6,15 +6,16 @@
#ifndef DGL_C_API_COMMON_H_ #ifndef DGL_C_API_COMMON_H_
#define DGL_C_API_COMMON_H_ #define DGL_C_API_COMMON_H_
#include <dgl/array.h>
#include <dgl/graph_interface.h>
#include <dgl/runtime/ndarray.h> #include <dgl/runtime/ndarray.h>
#include <dgl/runtime/packed_func.h> #include <dgl/runtime/packed_func.h>
#include <dgl/runtime/registry.h> #include <dgl/runtime/registry.h>
#include <dgl/array.h>
#include <dgl/graph_interface.h>
#include <algorithm> #include <algorithm>
#include <vector>
#include <string> #include <string>
#include <utility> #include <utility>
#include <vector>
namespace dgl { namespace dgl {
...@@ -36,12 +37,12 @@ dgl::runtime::PackedFunc ConvertNDArrayVectorToPackedFunc( ...@@ -36,12 +37,12 @@ dgl::runtime::PackedFunc ConvertNDArrayVectorToPackedFunc(
* The data type of the NDArray will be IdType, which must be an integer type. * The data type of the NDArray will be IdType, which must be an integer type.
* The element type (DType) of the vector must be convertible to IdType. * The element type (DType) of the vector must be convertible to IdType.
*/ */
template<typename IdType, typename DType> template <typename IdType, typename DType>
dgl::runtime::NDArray CopyVectorToNDArray( dgl::runtime::NDArray CopyVectorToNDArray(const std::vector<DType>& vec) {
const std::vector<DType>& vec) {
using dgl::runtime::NDArray; using dgl::runtime::NDArray;
const int64_t len = vec.size(); const int64_t len = vec.size();
NDArray a = NDArray::Empty({len}, DGLDataType{kDGLInt, sizeof(IdType) * 8, 1}, NDArray a = NDArray::Empty(
{len}, DGLDataType{kDGLInt, sizeof(IdType) * 8, 1},
DGLContext{kDGLCPU, 0}); DGLContext{kDGLCPU, 0});
std::copy(vec.begin(), vec.end(), static_cast<IdType*>(a->data)); std::copy(vec.begin(), vec.end(), static_cast<IdType*>(a->data));
return a; return a;
......
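A hypothetical caller of CopyVectorToNDArray, sketched using only what the template above exposes (the wrapper name is made up, and the snippet assumes c_api_common.h is included):

#include <cstdint>
#include <vector>

// IdType = int64_t fixes the output dtype (kDGLInt, 64 bits, 1 lane);
// DType = int32_t is the element type of the input vector.
dgl::runtime::NDArray DegreesToNDArray(const std::vector<int32_t>& degrees) {
  return dgl::CopyVectorToNDArray<int64_t, int32_t>(degrees);
}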