Unverified commit 8ae50c42 authored by Hongzhi (Steve), Chen and committed by GitHub

[Misc] clang-format auto fix. (#4804)



* [Misc] clang-format auto fix.

* manual

* manual

* manual

* manual

* todo

* fix
Co-authored-by: Steve <ubuntu@ip-172-31-34-29.ap-northeast-1.compute.internal>
parent 81831111
......@@ -6,10 +6,10 @@
#ifndef DGL_ARRAY_CUDA_GE_SPMM_CUH_
#define DGL_ARRAY_CUDA_GE_SPMM_CUH_
#include "macro.cuh"
#include "atomic.cuh"
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
#include "atomic.cuh"
#include "macro.cuh"
namespace dgl {
......@@ -23,23 +23,19 @@ namespace cuda {
* \note GE-SpMM: https://arxiv.org/pdf/2007.03179.pdf
* The grid dimension x and y are reordered for better performance.
*/
template <typename Idx, typename DType,
typename BinaryOp>
template <typename Idx, typename DType, typename BinaryOp>
__global__ void GESpMMKernel(
const DType* __restrict__ ufeat,
const DType* __restrict__ efeat,
DType* __restrict__ out,
const Idx* __restrict__ indptr,
const Idx* __restrict__ indices,
const int64_t num_rows, const int64_t num_cols,
const int64_t feat_len) {
const Idx rid = blockIdx.x * blockDim.y + threadIdx.y; // over vertices dimension
const Idx fid = (blockIdx.y * 64) + threadIdx.x; // over feature dimension
const DType* __restrict__ ufeat, const DType* __restrict__ efeat,
DType* __restrict__ out, const Idx* __restrict__ indptr,
const Idx* __restrict__ indices, const int64_t num_rows,
const int64_t num_cols, const int64_t feat_len) {
const Idx rid =
blockIdx.x * blockDim.y + threadIdx.y; // over vertices dimension
const Idx fid = (blockIdx.y * 64) + threadIdx.x; // over feature dimension
if (rid < num_rows && fid < feat_len) {
const Idx low = __ldg(indptr + rid), high = __ldg(indptr + rid + 1);
DType accum_0 = 0.,
accum_1 = 0.;
DType accum_0 = 0., accum_1 = 0.;
if (blockIdx.y != gridDim.y - 1) { // fid + 32 < feat_len
for (Idx left = low; left < high; left += 32) {
......@@ -109,24 +105,21 @@ __global__ void GESpMMKernel(
}
out[feat_len * rid + fid] = accum_0;
if (fid + 32 < feat_len)
out[feat_len * rid + fid + 32] = accum_1;
if (fid + 32 < feat_len) out[feat_len * rid + fid + 32] = accum_1;
}
}
}
}
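For readers skimming the kernel, the index arithmetic assigns rid = blockIdx.x * blockDim.y + threadIdx.y to a vertex and fid = blockIdx.y * 64 + threadIdx.x to a feature column, and each thread also covers fid + 32, so every block along y owns a 64-wide feature tile. A small host-side sketch of that feature mapping (the launch shape below is illustrative, not taken from this diff):

// Illustrative host-side sketch of the GE-SpMM feature-tile mapping; the
// blockDim.x = 32 and feat_len = 96 values are assumptions for demonstration.
#include <cstdio>

int main() {
  const int blockDimX = 32;                   // threads along the feature tile
  const int feat_len = 96;
  const int gridDimY = (feat_len + 63) / 64;  // one block along y per 64 features
  for (int by = 0; by < gridDimY; ++by) {
    for (int tx = 0; tx < blockDimX; ++tx) {
      const int fid = by * 64 + tx;           // first feature of this thread
      if (fid < feat_len) {
        const bool has_second = (fid + 32) < feat_len;  // second one: fid + 32
        std::printf("block y=%d thread x=%d -> fid %d%s\n", by, tx, fid,
                    has_second ? " and fid+32" : "");
      }
    }
  }
  return 0;
}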
template <typename Idx, typename DType,
typename BinaryOp>
template <typename Idx, typename DType, typename BinaryOp>
void GESpMMCsr(
const CSRMatrix& csr,
NDArray ufeat, NDArray efeat,
NDArray out, int64_t feat_len) {
const Idx *indptr = csr.indptr.Ptr<Idx>();
const Idx *indices = csr.indices.Ptr<Idx>();
const DType *ufeat_data = ufeat.Ptr<DType>();
const DType *efeat_data = efeat.Ptr<DType>();
DType *out_data = out.Ptr<DType>();
const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out,
int64_t feat_len) {
const Idx* indptr = csr.indptr.Ptr<Idx>();
const Idx* indices = csr.indices.Ptr<Idx>();
const DType* ufeat_data = ufeat.Ptr<DType>();
const DType* efeat_data = efeat.Ptr<DType>();
DType* out_data = out.Ptr<DType>();
cudaStream_t stream = runtime::getCurrentCUDAStream();
......@@ -138,12 +131,10 @@ void GESpMMCsr(
const dim3 nthrs(ntx, nty);
const int sh_mem_size = 0;
CUDA_KERNEL_CALL((GESpMMKernel<Idx, DType, BinaryOp>),
nblks, nthrs, sh_mem_size, stream,
ufeat_data, efeat_data, out_data,
indptr, indices,
csr.num_rows, csr.num_cols,
feat_len);
CUDA_KERNEL_CALL(
(GESpMMKernel<Idx, DType, BinaryOp>), nblks, nthrs, sh_mem_size, stream,
ufeat_data, efeat_data, out_data, indptr, indices, csr.num_rows,
csr.num_cols, feat_len);
}
} // namespace cuda
......
......@@ -8,44 +8,46 @@
///////////////////////// Dispatchers //////////////////////////
/* Macro used for switching between broadcasting and non-broadcasting kernels.
* It also copies the auxiliary information for calculating broadcasting offsets
* to GPU.
*/
#define BCAST_IDX_CTX_SWITCH(BCAST, EDGE_MAP, CTX, LHS_OFF, RHS_OFF, ...) do { \
const BcastOff &info = (BCAST); \
if (!info.use_bcast) { \
constexpr bool UseBcast = false; \
if ((EDGE_MAP)) { \
constexpr bool UseIdx = true; \
{ __VA_ARGS__ } \
} else { \
constexpr bool UseIdx = false; \
{ __VA_ARGS__ } \
} \
} else { \
constexpr bool UseBcast = true; \
const DGLContext ctx = (CTX); \
const auto device = runtime::DeviceAPI::Get(ctx); \
(LHS_OFF) = static_cast<int64_t*>( \
device->AllocWorkspace(ctx, sizeof(int64_t) * info.lhs_offset.size())); \
CUDA_CALL(cudaMemcpy((LHS_OFF), &info.lhs_offset[0], \
sizeof(int64_t) * info.lhs_offset.size(), cudaMemcpyHostToDevice)); \
(RHS_OFF) = static_cast<int64_t*>( \
device->AllocWorkspace(ctx, sizeof(int64_t) * info.rhs_offset.size())); \
CUDA_CALL(cudaMemcpy((RHS_OFF), &info.rhs_offset[0], \
sizeof(int64_t) * info.rhs_offset.size(), cudaMemcpyHostToDevice)); \
if ((EDGE_MAP)) { \
constexpr bool UseIdx = true; \
{ __VA_ARGS__ } \
} else { \
constexpr bool UseIdx = false; \
{ __VA_ARGS__ } \
} \
device->FreeWorkspace(ctx, (LHS_OFF)); \
device->FreeWorkspace(ctx, (RHS_OFF)); \
} \
} while (0)
#define BCAST_IDX_CTX_SWITCH(BCAST, EDGE_MAP, CTX, LHS_OFF, RHS_OFF, ...) \
do { \
const BcastOff &info = (BCAST); \
if (!info.use_bcast) { \
constexpr bool UseBcast = false; \
if ((EDGE_MAP)) { \
constexpr bool UseIdx = true; \
{ __VA_ARGS__ } \
} else { \
constexpr bool UseIdx = false; \
{ __VA_ARGS__ } \
} \
} else { \
constexpr bool UseBcast = true; \
const DGLContext ctx = (CTX); \
const auto device = runtime::DeviceAPI::Get(ctx); \
(LHS_OFF) = static_cast<int64_t *>(device->AllocWorkspace( \
ctx, sizeof(int64_t) * info.lhs_offset.size())); \
CUDA_CALL(cudaMemcpy( \
(LHS_OFF), &info.lhs_offset[0], \
sizeof(int64_t) * info.lhs_offset.size(), cudaMemcpyHostToDevice)); \
(RHS_OFF) = static_cast<int64_t *>(device->AllocWorkspace( \
ctx, sizeof(int64_t) * info.rhs_offset.size())); \
CUDA_CALL(cudaMemcpy( \
(RHS_OFF), &info.rhs_offset[0], \
sizeof(int64_t) * info.rhs_offset.size(), cudaMemcpyHostToDevice)); \
if ((EDGE_MAP)) { \
constexpr bool UseIdx = true; \
{ __VA_ARGS__ } \
} else { \
constexpr bool UseIdx = false; \
{ __VA_ARGS__ } \
} \
device->FreeWorkspace(ctx, (LHS_OFF)); \
device->FreeWorkspace(ctx, (RHS_OFF)); \
} \
} while (0)
#endif // DGL_ARRAY_CUDA_MACRO_CUH_
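The macro's purpose is to turn two runtime conditions — info.use_bcast and whether an edge mapping exists — into the compile-time constants UseBcast and UseIdx, so the code passed through __VA_ARGS__ can hand them to a kernel template and each combination gets its own instantiation. A stripped-down, host-only analog of that pattern (hypothetical names; the device workspace allocation and the cudaMemcpy of the offset arrays are omitted):

// Host-only sketch of the constexpr-flag dispatch pattern; FLAG_SWITCH and
// Body are made-up names, not part of DGL.
#include <cstdio>

#define FLAG_SWITCH(COND, ...)        \
  do {                                \
    if ((COND)) {                     \
      constexpr bool UseFlag = true;  \
      { __VA_ARGS__ }                 \
    } else {                          \
      constexpr bool UseFlag = false; \
      { __VA_ARGS__ }                 \
    }                                 \
  } while (0)

template <bool UseFlag>
void Body() {
  std::printf("instantiated with UseFlag=%d\n", static_cast<int>(UseFlag));
}

int main() {
  const bool runtime_condition = true;  // plays the role of info.use_bcast
  FLAG_SWITCH(runtime_condition, { Body<UseFlag>(); });
  return 0;
}

The real macro nests two such switches (broadcast × edge index) and, in the broadcast branch, additionally stages lhs_offset / rhs_offset in device workspace before running the body, then frees the workspace afterwards.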
......@@ -4,14 +4,14 @@
* \brief rowwise sampling
*/
#include <dgl/random.h>
#include <curand_kernel.h>
#include <dgl/array.h>
#include <dgl/array_iterator.h>
#include <curand_kernel.h>
#include <dgl/random.h>
#include "../../runtime/cuda/cuda_common.h"
#include "./dgl_cub.cuh"
#include "./utils.h"
#include "../../runtime/cuda/cuda_common.h"
using namespace dgl::runtime;
......@@ -23,20 +23,15 @@ namespace {
template <typename IdType>
__global__ void _GlobalUniformNegativeSamplingKernel(
const IdType* __restrict__ indptr,
const IdType* __restrict__ indices,
IdType* __restrict__ row,
IdType* __restrict__ col,
int64_t num_row,
int64_t num_col,
int64_t num_samples,
int num_trials,
bool exclude_self_loops,
int32_t random_seed) {
const IdType* __restrict__ indptr, const IdType* __restrict__ indices,
IdType* __restrict__ row, IdType* __restrict__ col, int64_t num_row,
int64_t num_col, int64_t num_samples, int num_trials,
bool exclude_self_loops, int32_t random_seed) {
int64_t tx = blockIdx.x * blockDim.x + threadIdx.x;
const int stride_x = gridDim.x * blockDim.x;
curandStatePhilox4_32_10_t rng; // this allows generating 4 32-bit ints at a time
curandStatePhilox4_32_10_t
rng; // this allows generating 4 32-bit ints at a time
curand_init(random_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
while (tx < num_samples) {
......@@ -50,8 +45,7 @@ __global__ void _GlobalUniformNegativeSamplingKernel(
int64_t u = static_cast<int64_t>(((y_lo << 32L) | z) % num_row);
int64_t v = static_cast<int64_t>(((y_hi << 32L) | w) % num_col);
if (exclude_self_loops && (u == v))
continue;
if (exclude_self_loops && (u == v)) continue;
// binary search of v among indptr[u:u+1]
int64_t b = indptr[u], e = indptr[u + 1] - 1;
......@@ -81,48 +75,47 @@ __global__ void _GlobalUniformNegativeSamplingKernel(
template <typename DType>
struct IsNotMinusOne {
__device__ __forceinline__ bool operator() (const std::pair<DType, DType>& a) {
__device__ __forceinline__ bool operator()(const std::pair<DType, DType>& a) {
return a.first != -1;
}
};
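A sampled pair (u, v) is only kept when v is absent from u's neighbour list; the "binary search of v among indptr[u:u+1]" step in the kernel above performs that check, which presumes the column indices inside each row are sorted. A CPU sketch of the same membership test on a toy CSR:

// Illustrative host-side membership test mirroring the kernel's binary search;
// the toy CSR below is made up.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

bool HasEdge(const std::vector<int64_t>& indptr,
             const std::vector<int64_t>& indices, int64_t u, int64_t v) {
  const auto begin = indices.begin() + indptr[u];
  const auto end = indices.begin() + indptr[u + 1];
  return std::binary_search(begin, end, v);  // assumes sorted columns per row
}

int main() {
  // Row 0 -> {1, 3}, row 1 -> {0}, row 2 -> {} (empty).
  const std::vector<int64_t> indptr = {0, 2, 3, 3};
  const std::vector<int64_t> indices = {1, 3, 0};
  std::printf("%d %d\n", static_cast<int>(HasEdge(indptr, indices, 0, 3)),
              static_cast<int>(HasEdge(indptr, indices, 2, 1)));  // prints 1 0
  return 0;
}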
/*!
* \brief Sort ordered pairs in ascending order, using \a tmp_major and \a tmp_minor as
* temporary buffers, each with \a n elements.
* \brief Sort ordered pairs in ascending order, using \a tmp_major and \a
* tmp_minor as temporary buffers, each with \a n elements.
*/
template <typename IdType>
void SortOrderedPairs(
runtime::DeviceAPI* device,
DGLContext ctx,
IdType* major,
IdType* minor,
IdType* tmp_major,
IdType* tmp_minor,
int64_t n,
cudaStream_t stream) {
runtime::DeviceAPI* device, DGLContext ctx, IdType* major, IdType* minor,
IdType* tmp_major, IdType* tmp_minor, int64_t n, cudaStream_t stream) {
// Sort ordered pairs in lexicographical order by two radix sorts since
// cub's radix sorts are stable.
// We need a 2*n auxiliary storage to store the results from the first radix sort.
// We need a 2*n auxiliary storage to store the results from the first radix
// sort.
size_t s1 = 0, s2 = 0;
void* tmp1 = nullptr;
void* tmp2 = nullptr;
// Radix sort by minor key first, reorder the major key in the progress.
CUDA_CALL(cub::DeviceRadixSort::SortPairs(
tmp1, s1, minor, tmp_minor, major, tmp_major, n, 0, sizeof(IdType) * 8, stream));
tmp1, s1, minor, tmp_minor, major, tmp_major, n, 0, sizeof(IdType) * 8,
stream));
tmp1 = device->AllocWorkspace(ctx, s1);
CUDA_CALL(cub::DeviceRadixSort::SortPairs(
tmp1, s1, minor, tmp_minor, major, tmp_major, n, 0, sizeof(IdType) * 8, stream));
tmp1, s1, minor, tmp_minor, major, tmp_major, n, 0, sizeof(IdType) * 8,
stream));
// Radix sort by major key next.
CUDA_CALL(cub::DeviceRadixSort::SortPairs(
tmp2, s2, tmp_major, major, tmp_minor, minor, n, 0, sizeof(IdType) * 8, stream));
tmp2 = (s2 > s1) ? device->AllocWorkspace(ctx, s2) : tmp1; // reuse buffer if s2 <= s1
tmp2, s2, tmp_major, major, tmp_minor, minor, n, 0, sizeof(IdType) * 8,
stream));
tmp2 = (s2 > s1) ? device->AllocWorkspace(ctx, s2)
: tmp1; // reuse buffer if s2 <= s1
CUDA_CALL(cub::DeviceRadixSort::SortPairs(
tmp2, s2, tmp_major, major, tmp_minor, minor, n, 0, sizeof(IdType) * 8, stream));
tmp2, s2, tmp_major, major, tmp_minor, minor, n, 0, sizeof(IdType) * 8,
stream));
if (tmp1 != tmp2)
device->FreeWorkspace(ctx, tmp2);
if (tmp1 != tmp2) device->FreeWorkspace(ctx, tmp2);
device->FreeWorkspace(ctx, tmp1);
}
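The two SortPairs rounds above are a least-significant-key sort of (major, minor) pairs: sort by the minor key first, then by the major key, and stability guarantees that pairs with equal majors remain ordered by their minors. The same idea on the CPU with std::stable_sort (illustrative only; the device code keeps everything in the cub buffers):

// Host-side analog of the two-pass stable pair sort; data is made up.
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> major = {2, 1, 2, 1};
  std::vector<int> minor = {3, 9, 1, 4};
  std::vector<int> order = {0, 1, 2, 3};  // positions into the two arrays

  // Pass 1: stable sort by the minor key.
  std::stable_sort(order.begin(), order.end(),
                   [&](int a, int b) { return minor[a] < minor[b]; });
  // Pass 2: stable sort by the major key; ties keep their minor order.
  std::stable_sort(order.begin(), order.end(),
                   [&](int a, int b) { return major[a] < major[b]; });

  for (int i : order) std::printf("(%d, %d) ", major[i], minor[i]);
  std::printf("\n");  // (1, 4) (1, 9) (2, 1) (2, 3): lexicographic order
  return 0;
}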
......@@ -130,17 +123,14 @@ void SortOrderedPairs(
template <DGLDeviceType XPU, typename IdType>
std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling(
const CSRMatrix& csr,
int64_t num_samples,
int num_trials,
bool exclude_self_loops,
bool replace,
double redundancy) {
const CSRMatrix& csr, int64_t num_samples, int num_trials,
bool exclude_self_loops, bool replace, double redundancy) {
auto ctx = csr.indptr->ctx;
auto dtype = csr.indptr->dtype;
const int64_t num_row = csr.num_rows;
const int64_t num_col = csr.num_cols;
const int64_t num_actual_samples = static_cast<int64_t>(num_samples * (1 + redundancy));
const int64_t num_actual_samples =
static_cast<int64_t>(num_samples * (1 + redundancy));
IdArray row = Full<IdType>(-1, num_actual_samples, ctx);
IdArray col = Full<IdType>(-1, num_actual_samples, ctx);
IdArray out_row = IdArray::Empty({num_actual_samples}, dtype, ctx);
......@@ -156,22 +146,25 @@ std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling(
std::pair<IdArray, IdArray> result;
int64_t num_out;
CUDA_KERNEL_CALL(_GlobalUniformNegativeSamplingKernel,
nb, nt, 0, stream,
csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(),
row_data, col_data, num_row, num_col, num_actual_samples, num_trials,
exclude_self_loops, RandomEngine::ThreadLocal()->RandInt32());
CUDA_KERNEL_CALL(
_GlobalUniformNegativeSamplingKernel, nb, nt, 0, stream,
csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(), row_data, col_data,
num_row, num_col, num_actual_samples, num_trials, exclude_self_loops,
RandomEngine::ThreadLocal()->RandInt32());
size_t tmp_size = 0;
int64_t* num_out_cuda = static_cast<int64_t*>(device->AllocWorkspace(ctx, sizeof(int64_t)));
int64_t* num_out_cuda =
static_cast<int64_t*>(device->AllocWorkspace(ctx, sizeof(int64_t)));
IsNotMinusOne<IdType> op;
PairIterator<IdType> begin(row_data, col_data);
PairIterator<IdType> out_begin(out_row_data, out_col_data);
CUDA_CALL(cub::DeviceSelect::If(
nullptr, tmp_size, begin, out_begin, num_out_cuda, num_actual_samples, op, stream));
nullptr, tmp_size, begin, out_begin, num_out_cuda, num_actual_samples, op,
stream));
void* tmp = device->AllocWorkspace(ctx, tmp_size);
CUDA_CALL(cub::DeviceSelect::If(
tmp, tmp_size, begin, out_begin, num_out_cuda, num_actual_samples, op, stream));
tmp, tmp_size, begin, out_begin, num_out_cuda, num_actual_samples, op,
stream));
num_out = cuda::GetCUDAScalar(device, ctx, num_out_cuda);
if (!replace) {
......@@ -182,28 +175,33 @@ std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling(
PairIterator<IdType> unique_begin(unique_row_data, unique_col_data);
SortOrderedPairs(
device, ctx, out_row_data, out_col_data, unique_row_data, unique_col_data,
num_out, stream);
device, ctx, out_row_data, out_col_data, unique_row_data,
unique_col_data, num_out, stream);
size_t tmp_size_unique = 0;
void* tmp_unique = nullptr;
CUDA_CALL(cub::DeviceSelect::Unique(
nullptr, tmp_size_unique, out_begin, unique_begin, num_out_cuda, num_out, stream));
tmp_unique = (tmp_size_unique > tmp_size) ?
device->AllocWorkspace(ctx, tmp_size_unique) :
tmp; // reuse buffer
nullptr, tmp_size_unique, out_begin, unique_begin, num_out_cuda,
num_out, stream));
tmp_unique = (tmp_size_unique > tmp_size)
? device->AllocWorkspace(ctx, tmp_size_unique)
: tmp; // reuse buffer
CUDA_CALL(cub::DeviceSelect::Unique(
tmp_unique, tmp_size_unique, out_begin, unique_begin, num_out_cuda, num_out, stream));
tmp_unique, tmp_size_unique, out_begin, unique_begin, num_out_cuda,
num_out, stream));
num_out = cuda::GetCUDAScalar(device, ctx, num_out_cuda);
num_out = std::min(num_samples, num_out);
result = {unique_row.CreateView({num_out}, dtype), unique_col.CreateView({num_out}, dtype)};
result = {
unique_row.CreateView({num_out}, dtype),
unique_col.CreateView({num_out}, dtype)};
if (tmp_unique != tmp)
device->FreeWorkspace(ctx, tmp_unique);
if (tmp_unique != tmp) device->FreeWorkspace(ctx, tmp_unique);
} else {
num_out = std::min(num_samples, num_out);
result = {out_row.CreateView({num_out}, dtype), out_col.CreateView({num_out}, dtype)};
result = {
out_row.CreateView({num_out}, dtype),
out_col.CreateView({num_out}, dtype)};
}
device->FreeWorkspace(ctx, tmp);
......@@ -211,10 +209,10 @@ std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling(
return result;
}
template std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling<kDGLCUDA, int32_t>(
const CSRMatrix&, int64_t, int, bool, bool, double);
template std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling<kDGLCUDA, int64_t>(
const CSRMatrix&, int64_t, int, bool, bool, double);
template std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling<
kDGLCUDA, int32_t>(const CSRMatrix&, int64_t, int, bool, bool, double);
template std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling<
kDGLCUDA, int64_t>(const CSRMatrix&, int64_t, int, bool, bool, double);
}; // namespace impl
}; // namespace aten
......
......@@ -4,15 +4,15 @@
* \brief uniform rowwise sampling
*/
#include <curand_kernel.h>
#include <dgl/random.h>
#include <dgl/runtime/device_api.h>
#include <curand_kernel.h>
#include <numeric>
#include "./dgl_cub.cuh"
#include "../../array/cuda/atomic.cuh"
#include "../../runtime/cuda/cuda_common.h"
#include "./dgl_cub.cuh"
using namespace dgl::aten::cuda;
......@@ -25,29 +25,28 @@ namespace {
constexpr int BLOCK_SIZE = 128;
/**
* @brief Compute the size of each row in the sampled CSR, without replacement.
*
* @tparam IdType The type of node and edge indexes.
* @param num_picks The number of non-zero entries to pick per row.
* @param num_rows The number of rows to pick.
* @param in_rows The set of rows to pick.
* @param in_ptr The index where each row's edges start.
* @param out_deg The size of each row in the sampled matrix, as indexed by
* `in_rows` (output).
*/
template<typename IdType>
* @brief Compute the size of each row in the sampled CSR, without replacement.
*
* @tparam IdType The type of node and edge indexes.
* @param num_picks The number of non-zero entries to pick per row.
* @param num_rows The number of rows to pick.
* @param in_rows The set of rows to pick.
* @param in_ptr The index where each row's edges start.
* @param out_deg The size of each row in the sampled matrix, as indexed by
* `in_rows` (output).
*/
template <typename IdType>
__global__ void _CSRRowWiseSampleDegreeKernel(
const int64_t num_picks,
const int64_t num_rows,
const IdType * const in_rows,
const IdType * const in_ptr,
IdType * const out_deg) {
const int64_t num_picks, const int64_t num_rows,
const IdType* const in_rows, const IdType* const in_ptr,
IdType* const out_deg) {
const int tIdx = threadIdx.x + blockIdx.x * blockDim.x;
if (tIdx < num_rows) {
const int in_row = in_rows[tIdx];
const int out_row = tIdx;
out_deg[out_row] = min(static_cast<IdType>(num_picks), in_ptr[in_row + 1] - in_ptr[in_row]);
out_deg[out_row] = min(
static_cast<IdType>(num_picks), in_ptr[in_row + 1] - in_ptr[in_row]);
if (out_row == num_rows - 1) {
// make the prefixsum work
......@@ -57,23 +56,21 @@ __global__ void _CSRRowWiseSampleDegreeKernel(
}
/**
* @brief Compute the size of each row in the sampled CSR, with replacement.
*
* @tparam IdType The type of node and edge indexes.
* @param num_picks The number of non-zero entries to pick per row.
* @param num_rows The number of rows to pick.
* @param in_rows The set of rows to pick.
* @param in_ptr The index where each row's edges start.
* @param out_deg The size of each row in the sampled matrix, as indexed by
* `in_rows` (output).
*/
template<typename IdType>
* @brief Compute the size of each row in the sampled CSR, with replacement.
*
* @tparam IdType The type of node and edge indexes.
* @param num_picks The number of non-zero entries to pick per row.
* @param num_rows The number of rows to pick.
* @param in_rows The set of rows to pick.
* @param in_ptr The index where each row's edges start.
* @param out_deg The size of each row in the sampled matrix, as indexed by
* `in_rows` (output).
*/
template <typename IdType>
__global__ void _CSRRowWiseSampleDegreeReplaceKernel(
const int64_t num_picks,
const int64_t num_rows,
const IdType * const in_rows,
const IdType * const in_ptr,
IdType * const out_deg) {
const int64_t num_picks, const int64_t num_rows,
const IdType* const in_rows, const IdType* const in_ptr,
IdType* const out_deg) {
const int tIdx = threadIdx.x + blockIdx.x * blockDim.x;
if (tIdx < num_rows) {
......@@ -94,41 +91,36 @@ __global__ void _CSRRowWiseSampleDegreeReplaceKernel(
}
/**
* @brief Perform row-wise uniform sampling on a CSR matrix,
* and generate a COO matrix, without replacement.
*
* @tparam IdType The ID type used for matrices.
* @tparam TILE_SIZE The number of rows covered by each threadblock.
* @param rand_seed The random seed to use.
* @param num_picks The number of non-zeros to pick per row.
* @param num_rows The number of rows to pick.
* @param in_rows The set of rows to pick.
* @param in_ptr The indptr array of the input CSR.
* @param in_index The indices array of the input CSR.
* @param data The data array of the input CSR.
* @param out_ptr The offset to write each row to in the output COO.
* @param out_rows The rows of the output COO (output).
* @param out_cols The columns of the output COO (output).
* @param out_idxs The data array of the output COO (output).
*/
template<typename IdType, int TILE_SIZE>
* @brief Perform row-wise uniform sampling on a CSR matrix,
* and generate a COO matrix, without replacement.
*
* @tparam IdType The ID type used for matrices.
* @tparam TILE_SIZE The number of rows covered by each threadblock.
* @param rand_seed The random seed to use.
* @param num_picks The number of non-zeros to pick per row.
* @param num_rows The number of rows to pick.
* @param in_rows The set of rows to pick.
* @param in_ptr The indptr array of the input CSR.
* @param in_index The indices array of the input CSR.
* @param data The data array of the input CSR.
* @param out_ptr The offset to write each row to in the output COO.
* @param out_rows The rows of the output COO (output).
* @param out_cols The columns of the output COO (output).
* @param out_idxs The data array of the output COO (output).
*/
template <typename IdType, int TILE_SIZE>
__global__ void _CSRRowWiseSampleUniformKernel(
const uint64_t rand_seed,
const int64_t num_picks,
const int64_t num_rows,
const IdType * const in_rows,
const IdType * const in_ptr,
const IdType * const in_index,
const IdType * const data,
const IdType * const out_ptr,
IdType * const out_rows,
IdType * const out_cols,
IdType * const out_idxs) {
const uint64_t rand_seed, const int64_t num_picks, const int64_t num_rows,
const IdType* const in_rows, const IdType* const in_ptr,
const IdType* const in_index, const IdType* const data,
const IdType* const out_ptr, IdType* const out_rows, IdType* const out_cols,
IdType* const out_idxs) {
// we assign one warp per row
assert(blockDim.x == BLOCK_SIZE);
int64_t out_row = blockIdx.x * TILE_SIZE;
const int64_t last_row = min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
const int64_t last_row =
min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
curandStatePhilox4_32_10_t rng;
curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
......@@ -177,41 +169,36 @@ __global__ void _CSRRowWiseSampleUniformKernel(
}
/**
* @brief Perform row-wise uniform sampling on a CSR matrix,
* and generate a COO matrix, with replacement.
*
* @tparam IdType The ID type used for matrices.
* @tparam TILE_SIZE The number of rows covered by each threadblock.
* @param rand_seed The random seed to use.
* @param num_picks The number of non-zeros to pick per row.
* @param num_rows The number of rows to pick.
* @param in_rows The set of rows to pick.
* @param in_ptr The indptr array of the input CSR.
* @param in_index The indices array of the input CSR.
* @param data The data array of the input CSR.
* @param out_ptr The offset to write each row to in the output COO.
* @param out_rows The rows of the output COO (output).
* @param out_cols The columns of the output COO (output).
* @param out_idxs The data array of the output COO (output).
*/
template<typename IdType, int TILE_SIZE>
* @brief Perform row-wise uniform sampling on a CSR matrix,
* and generate a COO matrix, with replacement.
*
* @tparam IdType The ID type used for matrices.
* @tparam TILE_SIZE The number of rows covered by each threadblock.
* @param rand_seed The random seed to use.
* @param num_picks The number of non-zeros to pick per row.
* @param num_rows The number of rows to pick.
* @param in_rows The set of rows to pick.
* @param in_ptr The indptr array of the input CSR.
* @param in_index The indices array of the input CSR.
* @param data The data array of the input CSR.
* @param out_ptr The offset to write each row to in the output COO.
* @param out_rows The rows of the output COO (output).
* @param out_cols The columns of the output COO (output).
* @param out_idxs The data array of the output COO (output).
*/
template <typename IdType, int TILE_SIZE>
__global__ void _CSRRowWiseSampleUniformReplaceKernel(
const uint64_t rand_seed,
const int64_t num_picks,
const int64_t num_rows,
const IdType * const in_rows,
const IdType * const in_ptr,
const IdType * const in_index,
const IdType * const data,
const IdType * const out_ptr,
IdType * const out_rows,
IdType * const out_cols,
IdType * const out_idxs) {
const uint64_t rand_seed, const int64_t num_picks, const int64_t num_rows,
const IdType* const in_rows, const IdType* const in_ptr,
const IdType* const in_index, const IdType* const data,
const IdType* const out_ptr, IdType* const out_rows, IdType* const out_cols,
IdType* const out_idxs) {
// we assign one warp per row
assert(blockDim.x == BLOCK_SIZE);
int64_t out_row = blockIdx.x * TILE_SIZE;
const int64_t last_row = min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
const int64_t last_row =
min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
curandStatePhilox4_32_10_t rng;
curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
......@@ -229,7 +216,8 @@ __global__ void _CSRRowWiseSampleUniformReplaceKernel(
const int64_t out_idx = out_row_start + idx;
out_rows[out_idx] = row;
out_cols[out_idx] = in_index[in_row_start + edge];
out_idxs[out_idx] = data ? data[in_row_start + edge] : in_row_start + edge;
out_idxs[out_idx] =
data ? data[in_row_start + edge] : in_row_start + edge;
}
}
out_row += 1;
......@@ -248,11 +236,14 @@ COOMatrix _CSRRowWiseSamplingUniform(
cudaStream_t stream = runtime::getCurrentCUDAStream();
const int64_t num_rows = rows->shape[0];
const IdType * const slice_rows = static_cast<const IdType*>(rows->data);
IdArray picked_row = NewIdArray(num_rows * num_picks, ctx, sizeof(IdType) * 8);
IdArray picked_col = NewIdArray(num_rows * num_picks, ctx, sizeof(IdType) * 8);
IdArray picked_idx = NewIdArray(num_rows * num_picks, ctx, sizeof(IdType) * 8);
const IdType* const slice_rows = static_cast<const IdType*>(rows->data);
IdArray picked_row =
NewIdArray(num_rows * num_picks, ctx, sizeof(IdType) * 8);
IdArray picked_col =
NewIdArray(num_rows * num_picks, ctx, sizeof(IdType) * 8);
IdArray picked_idx =
NewIdArray(num_rows * num_picks, ctx, sizeof(IdType) * 8);
IdType* const out_rows = static_cast<IdType*>(picked_row->data);
IdType* const out_cols = static_cast<IdType*>(picked_col->data);
IdType* const out_idxs = static_cast<IdType*>(picked_idx->data);
......@@ -261,65 +252,52 @@ COOMatrix _CSRRowWiseSamplingUniform(
const IdType* in_cols = mat.indices.Ptr<IdType>();
const IdType* data = CSRHasData(mat) ? mat.data.Ptr<IdType>() : nullptr;
if (mat.is_pinned) {
CUDA_CALL(cudaHostGetDevicePointer(
&in_ptr, mat.indptr.Ptr<IdType>(), 0));
CUDA_CALL(cudaHostGetDevicePointer(
&in_cols, mat.indices.Ptr<IdType>(), 0));
CUDA_CALL(cudaHostGetDevicePointer(&in_ptr, mat.indptr.Ptr<IdType>(), 0));
CUDA_CALL(cudaHostGetDevicePointer(&in_cols, mat.indices.Ptr<IdType>(), 0));
if (CSRHasData(mat)) {
CUDA_CALL(cudaHostGetDevicePointer(
&data, mat.data.Ptr<IdType>(), 0));
CUDA_CALL(cudaHostGetDevicePointer(&data, mat.data.Ptr<IdType>(), 0));
}
}
// compute degree
IdType * out_deg = static_cast<IdType*>(
IdType* out_deg = static_cast<IdType*>(
device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType)));
if (replace) {
const dim3 block(512);
const dim3 grid((num_rows + block.x - 1) / block.x);
CUDA_KERNEL_CALL(
_CSRRowWiseSampleDegreeReplaceKernel,
grid, block, 0, stream,
num_picks, num_rows, slice_rows, in_ptr, out_deg);
_CSRRowWiseSampleDegreeReplaceKernel, grid, block, 0, stream, num_picks,
num_rows, slice_rows, in_ptr, out_deg);
} else {
const dim3 block(512);
const dim3 grid((num_rows + block.x - 1) / block.x);
CUDA_KERNEL_CALL(
_CSRRowWiseSampleDegreeKernel,
grid, block, 0, stream,
num_picks, num_rows, slice_rows, in_ptr, out_deg);
_CSRRowWiseSampleDegreeKernel, grid, block, 0, stream, num_picks,
num_rows, slice_rows, in_ptr, out_deg);
}
// fill out_ptr
IdType * out_ptr = static_cast<IdType*>(
IdType* out_ptr = static_cast<IdType*>(
device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType)));
size_t prefix_temp_size = 0;
CUDA_CALL(cub::DeviceScan::ExclusiveSum(nullptr, prefix_temp_size,
out_deg,
out_ptr,
num_rows+1,
stream));
void * prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size);
CUDA_CALL(cub::DeviceScan::ExclusiveSum(prefix_temp, prefix_temp_size,
out_deg,
out_ptr,
num_rows+1,
stream));
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
nullptr, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream));
void* prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size);
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
prefix_temp, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream));
device->FreeWorkspace(ctx, prefix_temp);
device->FreeWorkspace(ctx, out_deg);
cudaEvent_t copyEvent;
CUDA_CALL(cudaEventCreate(&copyEvent));
// TODO(dlasalle): use pinned memory to overlap with the actual sampling, and wait on
// a cudaevent
// TODO(dlasalle): use pinned memory to overlap with the actual sampling, and
// wait on a cudaevent
IdType new_len;
// copy using the internal current stream
device->CopyDataFromTo(out_ptr, num_rows * sizeof(new_len), &new_len, 0,
sizeof(new_len),
ctx,
DGLContext{kDGLCPU, 0},
mat.indptr->dtype);
device->CopyDataFromTo(
out_ptr, num_rows * sizeof(new_len), &new_len, 0, sizeof(new_len), ctx,
DGLContext{kDGLCPU, 0}, mat.indptr->dtype);
CUDA_CALL(cudaEventRecord(copyEvent, stream));
const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000);
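To recap the bookkeeping above: out_deg holds one sampled-row size per requested row plus a trailing zero, ExclusiveSum turns it into out_ptr (the per-row write offsets into the output COO), and its last element — copied back into new_len — is the total number of picked entries. The same arithmetic on the host with made-up degrees:

// Host-side sketch of the degree -> offsets step; the degree values are
// invented for illustration.
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  // Per-row sample counts, e.g. min(num_picks, row degree); the trailing 0
  // mirrors the extra slot written by the degree kernel.
  std::vector<int> out_deg = {2, 5, 0, 3, 0};
  std::vector<int> out_ptr(out_deg.size());
  std::exclusive_scan(out_deg.begin(), out_deg.end(), out_ptr.begin(), 0);
  for (int v : out_ptr) std::printf("%d ", v);
  std::printf("\n");  // 0 2 7 7 10 -- the last value is the total output size
  return 0;
}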
......@@ -331,36 +309,16 @@ COOMatrix _CSRRowWiseSamplingUniform(
const dim3 block(BLOCK_SIZE);
const dim3 grid((num_rows + TILE_SIZE - 1) / TILE_SIZE);
CUDA_KERNEL_CALL(
(_CSRRowWiseSampleUniformReplaceKernel<IdType, TILE_SIZE>),
grid, block, 0, stream,
random_seed,
num_picks,
num_rows,
slice_rows,
in_ptr,
in_cols,
data,
out_ptr,
out_rows,
out_cols,
out_idxs);
(_CSRRowWiseSampleUniformReplaceKernel<IdType, TILE_SIZE>), grid, block,
0, stream, random_seed, num_picks, num_rows, slice_rows, in_ptr,
in_cols, data, out_ptr, out_rows, out_cols, out_idxs);
} else { // without replacement
const dim3 block(BLOCK_SIZE);
const dim3 grid((num_rows + TILE_SIZE - 1) / TILE_SIZE);
CUDA_KERNEL_CALL(
(_CSRRowWiseSampleUniformKernel<IdType, TILE_SIZE>),
grid, block, 0, stream,
random_seed,
num_picks,
num_rows,
slice_rows,
in_ptr,
in_cols,
data,
out_ptr,
out_rows,
out_cols,
out_idxs);
(_CSRRowWiseSampleUniformKernel<IdType, TILE_SIZE>), grid, block, 0,
stream, random_seed, num_picks, num_rows, slice_rows, in_ptr, in_cols,
data, out_ptr, out_rows, out_cols, out_idxs);
}
device->FreeWorkspace(ctx, out_ptr);
......@@ -372,8 +330,8 @@ COOMatrix _CSRRowWiseSamplingUniform(
picked_col = picked_col.CreateView({new_len}, picked_col->dtype);
picked_idx = picked_idx.CreateView({new_len}, picked_idx->dtype);
return COOMatrix(mat.num_rows, mat.num_cols, picked_row,
picked_col, picked_idx);
return COOMatrix(
mat.num_rows, mat.num_cols, picked_row, picked_col, picked_idx);
}
template <DGLDeviceType XPU, typename IdType>
......@@ -383,9 +341,11 @@ COOMatrix CSRRowWiseSamplingUniform(
// Basically this is UnitGraph::InEdges().
COOMatrix coo = CSRToCOO(CSRSliceRows(mat, rows), false);
IdArray sliced_rows = IndexSelect(rows, coo.row);
return COOMatrix(mat.num_rows, mat.num_cols, sliced_rows, coo.col, coo.data);
return COOMatrix(
mat.num_rows, mat.num_cols, sliced_rows, coo.col, coo.data);
} else {
return _CSRRowWiseSamplingUniform<XPU, IdType>(mat, rows, num_picks, replace);
return _CSRRowWiseSamplingUniform<XPU, IdType>(
mat, rows, num_picks, replace);
}
}
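Functionally, the non-replacement path keeps min(num_picks, degree) incident edges of each requested row, chosen uniformly. A compact CPU reference of that behaviour built on std::sample (names and shapes are illustrative; the GPU kernels use per-thread curand Philox state rather than std::mt19937):

// Host-side reference of per-row uniform sampling without replacement.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <iterator>
#include <numeric>
#include <random>
#include <vector>

std::vector<int64_t> SampleRow(const std::vector<int64_t>& indptr, int64_t r,
                               int64_t num_picks, std::mt19937& rng) {
  std::vector<int64_t> positions(indptr[r + 1] - indptr[r]);
  std::iota(positions.begin(), positions.end(), indptr[r]);  // edge positions
  std::vector<int64_t> picked;
  std::sample(positions.begin(), positions.end(), std::back_inserter(picked),
              num_picks, rng);
  return picked;  // size is min(num_picks, row degree)
}

int main() {
  const std::vector<int64_t> indptr = {0, 4, 6};  // row 0: 4 edges, row 1: 2
  std::mt19937 rng(0);
  for (int64_t p : SampleRow(indptr, 0, 3, rng))
    std::printf("%lld ", static_cast<long long>(p));
  std::printf("\n");
  return 0;
}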
......
......@@ -8,9 +8,10 @@
#include <string>
#include <vector>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
#include "./atomic.cuh"
#include "./utils.h"
namespace dgl {
......@@ -24,11 +25,9 @@ namespace cuda {
* \note each thread block is responsible for aggregation on a row
* in the result tensor.
*/
template <typename IdType, typename DType,
typename ReduceOp>
template <typename IdType, typename DType, typename ReduceOp>
__global__ void SegmentReduceKernel(
const DType* feat, const IdType* offsets,
DType* out, IdType* arg,
const DType* feat, const IdType* offsets, DType* out, IdType* arg,
int64_t n, int64_t dim) {
for (int row = blockIdx.x; row < n; row += gridDim.x) {
int col = blockIdx.y * blockDim.x + threadIdx.x;
......@@ -39,8 +38,7 @@ __global__ void SegmentReduceKernel(
ReduceOp::Call(&local_accum, &local_arg, feat[i * dim + col], i);
}
out[row * dim + col] = local_accum;
if (ReduceOp::require_arg)
arg[row * dim + col] = local_arg;
if (ReduceOp::require_arg) arg[row * dim + col] = local_arg;
col += gridDim.y * blockDim.x;
}
}
......@@ -53,8 +51,7 @@ __global__ void SegmentReduceKernel(
*/
template <typename IdType, typename DType>
__global__ void ScatterAddKernel(
const DType *feat, const IdType *idx, DType *out,
int64_t n, int64_t dim) {
const DType* feat, const IdType* idx, DType* out, int64_t n, int64_t dim) {
for (int row = blockIdx.x; row < n; row += gridDim.x) {
const int write_row = idx[row];
int col = blockIdx.y * blockDim.x + threadIdx.x;
......@@ -73,7 +70,7 @@ __global__ void ScatterAddKernel(
template <typename IdType, typename DType>
__global__ void UpdateGradMinMaxHeteroKernel(
const DType *feat, const IdType *idx, const IdType *idx_type, DType *out,
const DType* feat, const IdType* idx, const IdType* idx_type, DType* out,
int64_t n, int64_t dim, int type) {
unsigned int tId = threadIdx.x;
unsigned int laneId = tId & 31;
......@@ -100,8 +97,7 @@ __global__ void UpdateGradMinMaxHeteroKernel(
*/
template <typename IdType, typename DType>
__global__ void BackwardSegmentCmpKernel(
const DType *feat, const IdType *arg, DType *out,
int64_t n, int64_t dim) {
const DType* feat, const IdType* arg, DType* out, int64_t n, int64_t dim) {
for (int row = blockIdx.x; row < n; row += gridDim.x) {
int col = blockIdx.y * blockDim.x + threadIdx.x;
while (col < dim) {
......@@ -122,11 +118,7 @@ __global__ void BackwardSegmentCmpKernel(
* \param arg An auxiliary tensor storing ArgMax/Min information,
*/
template <typename IdType, typename DType, typename ReduceOp>
void SegmentReduce(
NDArray feat,
NDArray offsets,
NDArray out,
NDArray arg) {
void SegmentReduce(NDArray feat, NDArray offsets, NDArray out, NDArray arg) {
const DType* feat_data = feat.Ptr<DType>();
const IdType* offsets_data = offsets.Ptr<IdType>();
DType* out_data = out.Ptr<DType>();
......@@ -135,8 +127,7 @@ void SegmentReduce(
cudaStream_t stream = runtime::getCurrentCUDAStream();
int64_t n = out->shape[0];
int64_t dim = 1;
for (int i = 1; i < out->ndim; ++i)
dim *= out->shape[i];
for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i];
const int nbx = FindNumBlocks<'x'>(n);
const int ntx = FindNumThreads(dim);
......@@ -145,10 +136,9 @@ void SegmentReduce(
const dim3 nblks(nbx, nby);
const dim3 nthrs(ntx, nty);
// TODO(zihao): try cub's DeviceSegmentedReduce and compare the performance.
CUDA_KERNEL_CALL((SegmentReduceKernel<IdType, DType, ReduceOp>),
nblks, nthrs, 0, stream,
feat_data, offsets_data, out_data, arg_data,
n, dim);
CUDA_KERNEL_CALL(
(SegmentReduceKernel<IdType, DType, ReduceOp>), nblks, nthrs, 0, stream,
feat_data, offsets_data, out_data, arg_data, n, dim);
}
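Concretely, SegmentReduceKernel collapses each slice feat[offsets[i] : offsets[i+1], :] into one row of out, recording argmin/argmax positions in arg when the reducer requires them. A host-side sketch of the sum case on toy shapes:

// Host-side sketch of segment reduction (sum); shapes and values are made up.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const int64_t dim = 2;
  const std::vector<int64_t> offsets = {0, 2, 3};       // two segments
  const std::vector<double> feat = {1, 2, 3, 4, 5, 6};  // 3 x dim, row-major
  std::vector<double> out((offsets.size() - 1) * dim, 0.0);

  for (size_t row = 0; row + 1 < offsets.size(); ++row)
    for (int64_t col = 0; col < dim; ++col)
      for (int64_t i = offsets[row]; i < offsets[row + 1]; ++i)
        out[row * dim + col] += feat[i * dim + col];

  std::printf("%g %g | %g %g\n", out[0], out[1], out[2], out[3]);
  // 4 6 | 5 6 : segment 0 sums rows 0-1, segment 1 is just row 2.
  return 0;
}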
/*!
......@@ -159,19 +149,15 @@ void SegmentReduce(
* \param out The output tensor.
*/
template <typename IdType, typename DType>
void ScatterAdd(
NDArray feat,
NDArray idx,
NDArray out) {
void ScatterAdd(NDArray feat, NDArray idx, NDArray out) {
const DType* feat_data = feat.Ptr<DType>();
const IdType* idx_data = idx.Ptr<IdType>();
DType *out_data = out.Ptr<DType>();
DType* out_data = out.Ptr<DType>();
cudaStream_t stream = runtime::getCurrentCUDAStream();
int64_t n = feat->shape[0];
int64_t dim = 1;
for (int i = 1; i < out->ndim; ++i)
dim *= out->shape[i];
for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i];
const int nbx = FindNumBlocks<'x'>(n);
const int ntx = FindNumThreads(dim);
......@@ -179,10 +165,9 @@ void ScatterAdd(
const int nty = 1;
const dim3 nblks(nbx, nby);
const dim3 nthrs(ntx, nty);
CUDA_KERNEL_CALL((ScatterAddKernel<IdType, DType>),
nblks, nthrs, 0, stream,
feat_data, idx_data, out_data,
n, dim);
CUDA_KERNEL_CALL(
(ScatterAddKernel<IdType, DType>), nblks, nthrs, 0, stream, feat_data,
idx_data, out_data, n, dim);
}
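ScatterAddKernel goes the other way: row i of feat is accumulated into row idx[i] of out, and because several input rows can target the same output row the device version has to add atomically (note the atomic.cuh include at the top of this file). The serial equivalent on toy data:

// Host-side sketch of scatter-add; indices and values are invented.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const int64_t dim = 2;
  const std::vector<int64_t> idx = {1, 0, 1};           // output row per input
  const std::vector<double> feat = {1, 1, 2, 2, 3, 3};  // 3 x dim
  std::vector<double> out(2 * dim, 0.0);                // 2 output rows

  for (size_t row = 0; row < idx.size(); ++row)
    for (int64_t col = 0; col < dim; ++col)
      out[idx[row] * dim + col] += feat[row * dim + col];  // atomic on the GPU

  std::printf("%g %g | %g %g\n", out[0], out[1], out[2], out[3]);
  // 2 2 | 4 4 : input rows 0 and 2 both land in output row 1.
  return 0;
}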
/*!
......@@ -195,24 +180,26 @@ void ScatterAdd(
* \param list_out List of the output tensors.
*/
template <typename IdType, typename DType>
void UpdateGradMinMax_hetero(const HeteroGraphPtr& graph,
const std::string& op,
const std::vector<NDArray>& list_feat,
const std::vector<NDArray>& list_idx,
const std::vector<NDArray>& list_idx_types,
std::vector<NDArray>* list_out) {
void UpdateGradMinMax_hetero(
const HeteroGraphPtr& graph, const std::string& op,
const std::vector<NDArray>& list_feat, const std::vector<NDArray>& list_idx,
const std::vector<NDArray>& list_idx_types,
std::vector<NDArray>* list_out) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
if (op == "copy_lhs" || op == "copy_rhs") {
std::vector<std::vector<dgl_id_t>> src_dst_ntypes(graph->NumVertexTypes(),
std::vector<dgl_id_t>());
std::vector<std::vector<dgl_id_t>> src_dst_ntypes(
graph->NumVertexTypes(), std::vector<dgl_id_t>());
for (dgl_type_t etype = 0; etype < graph->NumEdgeTypes(); ++etype) {
auto pair = graph->meta_graph()->FindEdge(etype);
const dgl_id_t dst_ntype = pair.first; // graph is reversed
const dgl_id_t src_ntype = pair.second;
auto same_src_dst_ntype = std::find(std::begin(src_dst_ntypes[dst_ntype]),
std::end(src_dst_ntypes[dst_ntype]), src_ntype);
// if op is "copy_lhs", relation type with same src and dst node type will be updated once
if (op == "copy_lhs" && same_src_dst_ntype != std::end(src_dst_ntypes[dst_ntype]))
auto same_src_dst_ntype = std::find(
std::begin(src_dst_ntypes[dst_ntype]),
std::end(src_dst_ntypes[dst_ntype]), src_ntype);
// if op is "copy_lhs", relation type with same src and dst node type will
// be updated once
if (op == "copy_lhs" &&
same_src_dst_ntype != std::end(src_dst_ntypes[dst_ntype]))
continue;
src_dst_ntypes[dst_ntype].push_back(src_ntype);
const DType* feat_data = list_feat[dst_ntype].Ptr<DType>();
......@@ -229,35 +216,31 @@ void UpdateGradMinMax_hetero(const HeteroGraphPtr& graph,
const int nbx = FindNumBlocks<'x'>((n * th_per_row + ntx - 1) / ntx);
const dim3 nblks(nbx);
const dim3 nthrs(ntx);
CUDA_KERNEL_CALL((UpdateGradMinMaxHeteroKernel<IdType, DType>),
nblks, nthrs, 0, stream,
feat_data, idx_data, idx_type_data,
out_data, n, dim, type);
CUDA_KERNEL_CALL(
(UpdateGradMinMaxHeteroKernel<IdType, DType>), nblks, nthrs, 0,
stream, feat_data, idx_data, idx_type_data, out_data, n, dim, type);
}
}
}
/*!
* \brief CUDA implementation of backward phase of Segment Reduce with Min/Max reducer.
* \note math equation: out[arg[i, k], k] = feat[i, k]
* \param feat The input tensor.
* \brief CUDA implementation of backward phase of Segment Reduce with Min/Max
* reducer.
* \note math equation: out[arg[i, k], k] = feat[i, k] \param feat The input
* tensor.
* \param arg The ArgMin/Max information, used for indexing.
* \param out The output tensor.
*/
template <typename IdType, typename DType>
void BackwardSegmentCmp(
NDArray feat,
NDArray arg,
NDArray out) {
void BackwardSegmentCmp(NDArray feat, NDArray arg, NDArray out) {
const DType* feat_data = feat.Ptr<DType>();
const IdType* arg_data = arg.Ptr<IdType>();
DType *out_data = out.Ptr<DType>();
DType* out_data = out.Ptr<DType>();
cudaStream_t stream = runtime::getCurrentCUDAStream();
int64_t n = feat->shape[0];
int64_t dim = 1;
for (int i = 1; i < out->ndim; ++i)
dim *= out->shape[i];
for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i];
const int nbx = FindNumBlocks<'x'>(n);
const int ntx = FindNumThreads(dim);
......@@ -265,10 +248,9 @@ void BackwardSegmentCmp(
const int nty = 1;
const dim3 nblks(nbx, nby);
const dim3 nthrs(ntx, nty);
CUDA_KERNEL_CALL((BackwardSegmentCmpKernel<IdType, DType>),
nblks, nthrs, 0, stream,
feat_data, arg_data, out_data,
n, dim);
CUDA_KERNEL_CALL(
(BackwardSegmentCmpKernel<IdType, DType>), nblks, nthrs, 0, stream,
feat_data, arg_data, out_data, n, dim);
}
} // namespace cuda
......
......@@ -4,12 +4,14 @@
* \brief COO operator GPU implementation
*/
#include <dgl/array.h>
#include <vector>
#include <unordered_set>
#include <numeric>
#include <unordered_set>
#include <vector>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
#include "./atomic.cuh"
#include "./utils.h"
namespace dgl {
......@@ -19,9 +21,8 @@ using namespace cuda;
namespace aten {
namespace impl {
template <typename IdType>
__device__ void _warpReduce(volatile IdType *sdata, IdType tid) {
__device__ void _warpReduce(volatile IdType* sdata, IdType tid) {
sdata[tid] += sdata[tid + 32];
sdata[tid] += sdata[tid + 16];
sdata[tid] += sdata[tid + 8];
......@@ -32,10 +33,8 @@ __device__ void _warpReduce(volatile IdType *sdata, IdType tid) {
template <typename IdType>
__global__ void _COOGetRowNNZKernel(
const IdType* __restrict__ row_indices,
IdType* __restrict__ glb_cnt,
const int64_t row_query,
IdType nnz) {
const IdType* __restrict__ row_indices, IdType* __restrict__ glb_cnt,
const int64_t row_query, IdType nnz) {
__shared__ IdType local_cnt[1024];
IdType tx = threadIdx.x;
IdType bx = blockIdx.x;
......@@ -80,10 +79,9 @@ int64_t COOGetRowNNZ(COOMatrix coo, int64_t row) {
IdType nb = dgl::cuda::FindNumBlocks<'x'>((nnz + nt - 1) / nt);
NDArray rst = NDArray::Empty({1}, coo.row->dtype, coo.row->ctx);
_Fill(rst.Ptr<IdType>(), 1, IdType(0));
CUDA_KERNEL_CALL(_COOGetRowNNZKernel,
nb, nt, 0, stream,
coo.row.Ptr<IdType>(), rst.Ptr<IdType>(),
row, nnz);
CUDA_KERNEL_CALL(
_COOGetRowNNZKernel, nb, nt, 0, stream, coo.row.Ptr<IdType>(),
rst.Ptr<IdType>(), row, nnz);
rst = rst.CopyTo(DGLContext{kDGLCPU, 0});
return *rst.Ptr<IdType>();
}
......@@ -93,8 +91,7 @@ template int64_t COOGetRowNNZ<kDGLCUDA, int64_t>(COOMatrix, int64_t);
template <typename IdType>
__global__ void _COOGetAllRowNNZKernel(
const IdType* __restrict__ row_indices,
IdType* __restrict__ glb_cnts,
const IdType* __restrict__ row_indices, IdType* __restrict__ glb_cnts,
IdType nnz) {
IdType eid = blockIdx.x * blockDim.x + threadIdx.x;
while (eid < nnz) {
......@@ -118,20 +115,18 @@ NDArray COOGetRowNNZ(COOMatrix coo, NDArray rows) {
IdType nb = dgl::cuda::FindNumBlocks<'x'>((nnz + nt - 1) / nt);
NDArray rst = NDArray::Empty({1}, coo.row->dtype, coo.row->ctx);
_Fill(rst.Ptr<IdType>(), 1, IdType(0));
CUDA_KERNEL_CALL(_COOGetRowNNZKernel,
nb, nt, 0, stream,
coo.row.Ptr<IdType>(), rst.Ptr<IdType>(),
row, nnz);
CUDA_KERNEL_CALL(
_COOGetRowNNZKernel, nb, nt, 0, stream, coo.row.Ptr<IdType>(),
rst.Ptr<IdType>(), row, nnz);
return rst;
} else {
IdType nt = 1024;
IdType nb = dgl::cuda::FindNumBlocks<'x'>((nnz + nt - 1) / nt);
NDArray in_degrees = NDArray::Empty({num_rows}, rows->dtype, rows->ctx);
_Fill(in_degrees.Ptr<IdType>(), num_rows, IdType(0));
CUDA_KERNEL_CALL(_COOGetAllRowNNZKernel,
nb, nt, 0, stream,
coo.row.Ptr<IdType>(), in_degrees.Ptr<IdType>(),
nnz);
CUDA_KERNEL_CALL(
_COOGetAllRowNNZKernel, nb, nt, 0, stream, coo.row.Ptr<IdType>(),
in_degrees.Ptr<IdType>(), nnz);
return IndexSelect(in_degrees, rows);
}
}
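For a single queried row, COOGetRowNNZ boils down to counting how many entries of coo.row equal that row id; the kernel reaches the same number in parallel by accumulating per-block partial counts (see the shared local_cnt buffer and the _warpReduce helper above) into glb_cnt. The serial equivalent:

// Host-side equivalent of counting a row's nonzeros in COO; data is made up.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const std::vector<int64_t> coo_row = {0, 2, 2, 1, 2, 0};  // row per nonzero
  const int64_t row_query = 2;
  const int64_t nnz_in_row =
      std::count(coo_row.begin(), coo_row.end(), row_query);
  std::printf("%lld\n", static_cast<long long>(nnz_in_row));  // prints 3
  return 0;
}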
......
......@@ -14,31 +14,28 @@ namespace aten {
namespace impl {
/* This is a cross-device access version of IndexSelectMultiKernel.
* Since the memory access over PCIe is more sensitive to the
* data access alignment (cacheline), we need a separate version here.
*/
* Since the memory access over PCIe is more sensitive to the
* data access alignment (cacheline), we need a separate version here.
*/
template <typename DType, typename IdType>
__global__ void IndexSelectMultiKernelAligned(
const DType* const array,
const int64_t num_feat,
const IdType* const index,
const int64_t length,
const int64_t arr_len,
DType* const out) {
int64_t out_row = blockIdx.x*blockDim.y+threadIdx.y;
const DType* const array, const int64_t num_feat, const IdType* const index,
const int64_t length, const int64_t arr_len, DType* const out) {
int64_t out_row = blockIdx.x * blockDim.y + threadIdx.y;
const int64_t stride = blockDim.y*gridDim.x;
const int64_t stride = blockDim.y * gridDim.x;
while (out_row < length) {
int64_t col = threadIdx.x;
const int64_t in_row = index[out_row];
assert(in_row >= 0 && in_row < arr_len);
const int64_t idx_offset =
((uint64_t)(&array[in_row*num_feat]) % CACHE_LINE_SIZE) / sizeof(DType);
((uint64_t)(&array[in_row * num_feat]) % CACHE_LINE_SIZE) /
sizeof(DType);
col = col - idx_offset;
while (col < num_feat) {
if (col >= 0)
out[out_row*num_feat+col] = array[in_row*num_feat+col];
out[out_row * num_feat + col] = array[in_row * num_feat + col];
col += blockDim.x;
}
out_row += stride;
......
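The idx_offset computed in the kernel above measures, in elements, how far array[in_row * num_feat] sits past a cache-line boundary; starting each thread at col = threadIdx.x - idx_offset (and skipping negative columns) makes the warp's consecutive loads begin exactly on a cache line, which is the point of the comment about alignment-sensitive PCIe access. A small host calculation of that shift (the cache-line size, element type and address below are assumptions):

// Host-side illustration of the cache-line offset math; all constants here
// are placeholders, not DGL's actual values.
#include <cstdint>
#include <cstdio>

int main() {
  constexpr std::uint64_t kCacheLineSize = 128;  // assumed line size in bytes
  using DType = float;                           // assumed element type
  // Pretend the selected row starts 12 bytes past a 128-byte boundary.
  const std::uint64_t row_addr = 0x1000 + 3 * sizeof(DType);
  const std::int64_t idx_offset = (row_addr % kCacheLineSize) / sizeof(DType);
  // With blockDim.x threads, the first pass covers cols -3 .. blockDim.x - 4,
  // whose addresses start exactly at the preceding cache-line boundary; the
  // negative columns are simply skipped by the `if (col >= 0)` guard.
  std::printf("idx_offset = %lld\n", static_cast<long long>(idx_offset));  // 3
  return 0;
}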
......@@ -4,12 +4,11 @@
* \brief Object for selecting items in a set, or selecting items not in a set.
*/
#ifndef DGL_ARRAY_FILTER_H_
#define DGL_ARRAY_FILTER_H_
#include <dgl/runtime/object.h>
#include <dgl/array.h>
#include <dgl/runtime/object.h>
namespace dgl {
namespace array {
......@@ -28,8 +27,7 @@ class Filter : public runtime::Object {
* @return The indices of the items from `test` that are selected by
* this filter.
*/
virtual IdArray find_included_indices(
IdArray test) = 0;
virtual IdArray find_included_indices(IdArray test) = 0;
/**
* @brief From the test set of items, get the indices of those which are
......@@ -40,8 +38,7 @@ class Filter : public runtime::Object {
* @return The indices of the items from `test` that are not selected by this
* filter.
*/
virtual IdArray find_excluded_indices(
IdArray test) = 0;
virtual IdArray find_excluded_indices(IdArray test) = 0;
};
DGL_DEFINE_OBJECT_REF(FilterRef, Filter);
......@@ -50,4 +47,3 @@ DGL_DEFINE_OBJECT_REF(FilterRef, Filter);
} // namespace dgl
#endif // DGL_ARRAY_FILTER_H_
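The two virtuals express complementary queries: given a test array, return the positions whose values the filter accepts, or the positions it rejects. A standalone CPU analog of that contract (a hypothetical class, with plain std containers standing in for IdArray):

// Hypothetical host-side analog of the Filter contract; SetFilter is not a
// DGL class.
#include <cstdint>
#include <cstdio>
#include <unordered_set>
#include <vector>

class SetFilter {
 public:
  explicit SetFilter(const std::vector<int64_t>& items)
      : set_(items.begin(), items.end()) {}

  // Positions i with test[i] in the set (cf. find_included_indices).
  std::vector<int64_t> FindIncludedIndices(const std::vector<int64_t>& test) {
    return Select(test, /*included=*/true);
  }
  // Positions i with test[i] not in the set (cf. find_excluded_indices).
  std::vector<int64_t> FindExcludedIndices(const std::vector<int64_t>& test) {
    return Select(test, /*included=*/false);
  }

 private:
  std::vector<int64_t> Select(const std::vector<int64_t>& test, bool included) {
    std::vector<int64_t> out;
    for (int64_t i = 0; i < static_cast<int64_t>(test.size()); ++i)
      if ((set_.count(test[i]) > 0) == included) out.push_back(i);
    return out;
  }

  std::unordered_set<int64_t> set_;
};

int main() {
  SetFilter filter({2, 5, 7});
  for (int64_t i : filter.FindIncludedIndices({5, 1, 7, 3}))
    std::printf("%lld ", static_cast<long long>(i));  // prints 0 2
  std::printf("\n");
  return 0;
}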
......@@ -7,6 +7,7 @@
#define DGL_ARRAY_UVM_ARRAY_OP_H_
#include <dgl/array.h>
#include <utility>
namespace dgl {
......