"docs/source/git@developer.sourcefind.cn:OpenDAS/dgl.git" did not exist on "2f4146a4c2f48400dc02f86dcdcb3325a1fa0c7d"
Unverified Commit c3aea1b6 authored by Muhammed Fatih BALIN's avatar Muhammed Fatih BALIN Committed by GitHub
Browse files

[Performance][CUDA] Labor UVA optimization (#5885)


Co-authored-by: default avatarXin Yao <xiny@nvidia.com>
Co-authored-by: default avatarRhett Ying <85214957+Rhett-Ying@users.noreply.github.com>
parent b0fc234f
...@@ -31,7 +31,6 @@ ...@@ -31,7 +31,6 @@
#include <algorithm> #include <algorithm>
#include <cmath> #include <cmath>
#include <functional> #include <functional>
#include <iostream>
#include <memory> #include <memory>
#include <numeric> #include <numeric>
#include <string> #include <string>
...@@ -124,7 +123,10 @@ auto compute_importance_sampling_probabilities( ...@@ -124,7 +123,10 @@ auto compute_importance_sampling_probabilities(
var_1 = 0; var_1 = 0;
if (weighted) { if (weighted) {
for (auto j = indptr[rid]; j < indptr[rid + 1]; j++) for (auto j = indptr[rid]; j < indptr[rid + 1]; j++)
var_1 += A[j] * A[j] / std::min(ONE, c * ps[j - indptr[rid]]); // The check for zero is necessary for numerical stability
var_1 += A[j] > 0
? A[j] * A[j] / std::min(ONE, c * ps[j - indptr[rid]])
: 0;
} else { } else {
for (auto j = indptr[rid]; j < indptr[rid + 1]; j++) for (auto j = indptr[rid]; j < indptr[rid + 1]; j++)
var_1 += ONE / std::min(ONE, c * ps[j - indptr[rid]]); var_1 += ONE / std::min(ONE, c * ps[j - indptr[rid]]);
...@@ -208,8 +210,9 @@ std::pair<COOMatrix, FloatArray> CSRLaborPick( ...@@ -208,8 +210,9 @@ std::pair<COOMatrix, FloatArray> CSRLaborPick(
IdArray picked_row = NDArray::Empty({hop_size}, vidtype, ctx); IdArray picked_row = NDArray::Empty({hop_size}, vidtype, ctx);
IdArray picked_col = NDArray::Empty({hop_size}, vidtype, ctx); IdArray picked_col = NDArray::Empty({hop_size}, vidtype, ctx);
IdArray picked_idx = NDArray::Empty({hop_size}, vidtype, ctx); IdArray picked_idx = NDArray::Empty({hop_size}, vidtype, ctx);
FloatArray picked_imp = FloatArray picked_imp = importance_sampling
NDArray::Empty({importance_sampling ? hop_size : 0}, dtype, ctx); ? NDArray::Empty({hop_size}, dtype, ctx)
: NullArray();
IdxType* picked_rdata = picked_row.Ptr<IdxType>(); IdxType* picked_rdata = picked_row.Ptr<IdxType>();
IdxType* picked_cdata = picked_col.Ptr<IdxType>(); IdxType* picked_cdata = picked_col.Ptr<IdxType>();
IdxType* picked_idata = picked_idx.Ptr<IdxType>(); IdxType* picked_idata = picked_idx.Ptr<IdxType>();
......
...@@ -66,11 +66,12 @@ struct TransformOp { ...@@ -66,11 +66,12 @@ struct TransformOp {
const IdType* subindptr; const IdType* subindptr;
const IdType* indices; const IdType* indices;
const IdType* data_arr; const IdType* data_arr;
bool is_pinned;
__host__ __device__ auto operator()(IdType idx) { __host__ __device__ auto operator()(IdType idx) {
const auto in_row = idx_coo[idx]; const auto in_row = idx_coo[idx];
const auto row = rows[in_row]; const auto row = rows[in_row];
const auto in_idx = indptr[row] + idx - subindptr[in_row]; const auto in_idx = indptr[in_row] + idx - subindptr[in_row];
const auto u = indices[in_idx]; const auto u = indices[is_pinned ? idx : in_idx];
const auto data = data_arr ? data_arr[in_idx] : in_idx; const auto data = data_arr ? data_arr[in_idx] : in_idx;
return thrust::make_tuple(row, u, data); return thrust::make_tuple(row, u, data);
} }
...@@ -90,13 +91,14 @@ struct TransformOpImp { ...@@ -90,13 +91,14 @@ struct TransformOpImp {
const IdType* subindptr; const IdType* subindptr;
const IdType* indices; const IdType* indices;
const IdType* data_arr; const IdType* data_arr;
bool is_pinned;
__host__ __device__ auto operator()(IdType idx) { __host__ __device__ auto operator()(IdType idx) {
const auto ps = probs[idx]; const auto ps = probs[idx];
const auto in_row = idx_coo[idx]; const auto in_row = idx_coo[idx];
const auto c = cs[in_row]; const auto c = cs[in_row];
const auto row = rows[in_row]; const auto row = rows[in_row];
const auto in_idx = indptr[row] + idx - subindptr[in_row]; const auto in_idx = indptr[in_row] + idx - subindptr[in_row];
const auto u = indices[in_idx]; const auto u = indices[is_pinned ? idx : in_idx];
const auto w = A[in_idx]; const auto w = A[in_idx];
const auto w2 = B[in_idx]; const auto w2 = B[in_idx];
const auto data = data_arr ? data_arr[in_idx] : in_idx; const auto data = data_arr ? data_arr[in_idx] : in_idx;
...@@ -123,17 +125,16 @@ struct StencilOpFused { ...@@ -123,17 +125,16 @@ struct StencilOpFused {
const ps_t probs; const ps_t probs;
const A_t A; const A_t A;
const IdType* subindptr; const IdType* subindptr;
const IdType* rows;
const IdType* indptr; const IdType* indptr;
const IdType* indices; const IdType* indices;
const IdType* nids; const IdType* nids;
bool is_pinned;
__device__ auto operator()(IdType idx) { __device__ auto operator()(IdType idx) {
const auto in_row = idx_coo[idx]; const auto in_row = idx_coo[idx];
const auto ps = probs[idx]; const auto ps = probs[idx];
IdType rofs = idx - subindptr[in_row]; IdType rofs = idx - subindptr[in_row];
const IdType row = rows[in_row]; const auto in_idx = indptr[in_row] + rofs;
const auto in_idx = indptr[row] + rofs; const auto u = indices[is_pinned ? idx : in_idx];
const auto u = indices[in_idx];
const auto t = nids ? nids[u] : u; // t in the paper const auto t = nids ? nids[u] : u; // t in the paper
curandStatePhilox4_32_10_t rng; curandStatePhilox4_32_10_t rng;
// rolled random number r_t is a function of the random_seed and t // rolled random number r_t is a function of the random_seed and t
...@@ -162,7 +163,10 @@ struct TransformOpMinWith1 { ...@@ -162,7 +163,10 @@ struct TransformOpMinWith1 {
template <typename IdType> template <typename IdType>
struct IndptrFunc { struct IndptrFunc {
const IdType* indptr; const IdType* indptr;
__host__ __device__ auto operator()(IdType row) { return indptr[row]; } const IdType* in_deg;
__host__ __device__ auto operator()(IdType row) {
return indptr[row] + (in_deg ? in_deg[row] : 0);
}
}; };
template <typename FloatType> template <typename FloatType>
...@@ -186,24 +190,26 @@ struct DegreeFunc { ...@@ -186,24 +190,26 @@ struct DegreeFunc {
const IdType num_picks; const IdType num_picks;
const IdType* rows; const IdType* rows;
const IdType* indptr; const IdType* indptr;
const FloatType* ds;
IdType* in_deg; IdType* in_deg;
IdType* inrow_indptr;
FloatType* cs; FloatType* cs;
__host__ __device__ auto operator()(IdType tIdx) { __host__ __device__ auto operator()(IdType tIdx) {
const auto out_row = rows[tIdx]; const auto out_row = rows[tIdx];
const auto d = indptr[out_row + 1] - indptr[out_row]; const auto indptr_val = indptr[out_row];
const auto d = indptr[out_row + 1] - indptr_val;
in_deg[tIdx] = d; in_deg[tIdx] = d;
cs[tIdx] = num_picks / (ds ? ds[tIdx] : (FloatType)d); inrow_indptr[tIdx] = indptr_val;
cs[tIdx] = num_picks / (FloatType)d;
} }
}; };
template <typename IdType, typename FloatType> template <typename IdType, typename FloatType>
__global__ void _CSRRowWiseOneHopExtractorKernel( __global__ void _CSRRowWiseOneHopExtractorKernel(
const uint64_t rand_seed, const IdType hop_size, const IdType* const rows, const uint64_t rand_seed, const IdType hop_size, const IdType* const indptr,
const IdType* const indptr, const IdType* const subindptr, const IdType* const subindptr, const IdType* const indices,
const IdType* const indices, const IdType* const idx_coo, const IdType* const idx_coo, const IdType* const nids,
const IdType* const nids, const FloatType* const A, FloatType* const rands, const FloatType* const A, FloatType* const rands, IdType* const hop,
IdType* const hop, FloatType* const A_l) { FloatType* const A_l) {
IdType tx = static_cast<IdType>(blockIdx.x) * blockDim.x + threadIdx.x; IdType tx = static_cast<IdType>(blockIdx.x) * blockDim.x + threadIdx.x;
const int stride_x = gridDim.x * blockDim.x; const int stride_x = gridDim.x * blockDim.x;
...@@ -212,10 +218,10 @@ __global__ void _CSRRowWiseOneHopExtractorKernel( ...@@ -212,10 +218,10 @@ __global__ void _CSRRowWiseOneHopExtractorKernel(
while (tx < hop_size) { while (tx < hop_size) {
IdType rpos = idx_coo[tx]; IdType rpos = idx_coo[tx];
IdType rofs = tx - subindptr[rpos]; IdType rofs = tx - subindptr[rpos];
const IdType row = rows[rpos]; const auto in_idx = indptr[rpos] + rofs;
const auto in_idx = indptr[row] + rofs; const auto not_pinned = indices != hop;
const auto u = indices[in_idx]; const auto u = indices[not_pinned ? in_idx : tx];
hop[tx] = u; if (not_pinned) hop[tx] = u;
const auto v = nids ? nids[u] : u; const auto v = nids ? nids[u] : u;
// 123123 is just a number with no significance. // 123123 is just a number with no significance.
curand_init(123123, rand_seed, v, &rng); curand_init(123123, rand_seed, v, &rng);
...@@ -226,10 +232,53 @@ __global__ void _CSRRowWiseOneHopExtractorKernel( ...@@ -226,10 +232,53 @@ __global__ void _CSRRowWiseOneHopExtractorKernel(
} }
} }
constexpr int CACHE_LINE_SIZE = 128;
template <typename IdType>
struct AlignmentFunc {
static_assert(CACHE_LINE_SIZE % sizeof(IdType) == 0);
const IdType* in_deg;
const int64_t* perm;
IdType num_rows;
__host__ __device__ auto operator()(IdType row) {
constexpr int num_elements = CACHE_LINE_SIZE / sizeof(IdType);
return in_deg[perm ? perm[row % num_rows] : row] + num_elements - 1;
}
};
template <typename IdType>
__global__ void _CSRRowWiseOneHopExtractorAlignedKernel(
const IdType hop_size, const IdType num_rows, const IdType* const indptr,
const IdType* const subindptr, const IdType* const subindptr_aligned,
const IdType* const indices, IdType* const hop, const int64_t* const perm) {
IdType tx = static_cast<IdType>(blockIdx.x) * blockDim.x + threadIdx.x;
const int stride_x = gridDim.x * blockDim.x;
while (tx < hop_size) {
const IdType rpos_ =
dgl::cuda::_UpperBound(subindptr_aligned, num_rows, tx) - 1;
const IdType rpos = perm ? perm[rpos_] : rpos_;
const auto out_row = subindptr[rpos];
const auto d = subindptr[rpos + 1] - out_row;
const int offset =
((uint64_t)(indices + indptr[rpos] - subindptr_aligned[rpos_]) %
CACHE_LINE_SIZE) /
sizeof(IdType);
const IdType rofs = tx - subindptr_aligned[rpos_] - offset;
if (rofs >= 0 && rofs < d) {
const auto in_idx = indptr[rpos] + rofs;
assert((uint64_t)(indices + in_idx - tx) % CACHE_LINE_SIZE == 0);
const auto u = indices[in_idx];
hop[out_row + rofs] = u;
}
tx += stride_x;
}
}
template <typename IdType, typename FloatType, int BLOCK_CTAS, int TILE_SIZE> template <typename IdType, typename FloatType, int BLOCK_CTAS, int TILE_SIZE>
__global__ void _CSRRowWiseLayerSampleDegreeKernel( __global__ void _CSRRowWiseLayerSampleDegreeKernel(
const IdType num_picks, const IdType num_rows, const IdType* const rows, const IdType num_picks, const IdType num_rows, FloatType* const cs,
FloatType* const cs, const FloatType* const ds, const FloatType* const d2s, const FloatType* const ds, const FloatType* const d2s,
const IdType* const indptr, const FloatType* const probs, const IdType* const indptr, const FloatType* const probs,
const FloatType* const A, const IdType* const subindptr) { const FloatType* const A, const IdType* const subindptr) {
typedef cub::BlockReduce<FloatType, BLOCK_SIZE> BlockReduce; typedef cub::BlockReduce<FloatType, BLOCK_SIZE> BlockReduce;
...@@ -247,21 +296,19 @@ __global__ void _CSRRowWiseLayerSampleDegreeKernel( ...@@ -247,21 +296,19 @@ __global__ void _CSRRowWiseLayerSampleDegreeKernel(
constexpr FloatType ONE = 1; constexpr FloatType ONE = 1;
while (out_row < last_row) { while (out_row < last_row) {
const auto row = rows[out_row]; const auto in_row_start = indptr[out_row];
const auto in_row_start = indptr[row];
const auto out_row_start = subindptr[out_row]; const auto out_row_start = subindptr[out_row];
const IdType degree = indptr[row + 1] - in_row_start; const IdType degree = subindptr[out_row + 1] - out_row_start;
if (degree > 0) { if (degree > 0) {
// stands for k in in arXiv:2210.13339, i.e. fanout // stands for k in in arXiv:2210.13339, i.e. fanout
const auto k = min(num_picks, degree); const auto k = min(num_picks, degree);
// slightly better than NS // slightly better than NS
const FloatType d_ = ds ? ds[row] : degree; const FloatType d_ = ds ? ds[out_row] : degree;
// stands for right handside of Equation (22) in arXiv:2210.13339 // stands for right handside of Equation (22) in arXiv:2210.13339
FloatType var_target = FloatType var_target =
d_ * d_ / k + (ds ? d2s[row] - d_ * d_ / degree : 0); d_ * d_ / k + (ds ? d2s[out_row] - d_ * d_ / degree : 0);
auto c = cs[out_row]; auto c = cs[out_row];
const int num_valid = min(degree, (IdType)CTA_SIZE); const int num_valid = min(degree, (IdType)CTA_SIZE);
...@@ -273,7 +320,7 @@ __global__ void _CSRRowWiseLayerSampleDegreeKernel( ...@@ -273,7 +320,7 @@ __global__ void _CSRRowWiseLayerSampleDegreeKernel(
for (int idx = threadIdx.x; idx < degree; idx += CTA_SIZE) { for (int idx = threadIdx.x; idx < degree; idx += CTA_SIZE) {
const auto w = A[in_row_start + idx]; const auto w = A[in_row_start + idx];
const auto ps = probs ? probs[out_row_start + idx] : w; const auto ps = probs ? probs[out_row_start + idx] : w;
var_1 += w * w / min(ONE, c * ps); var_1 += w > 0 ? w * w / min(ONE, c * ps) : 0;
} }
} else { } else {
for (int idx = threadIdx.x; idx < degree; idx += CTA_SIZE) { for (int idx = threadIdx.x; idx < degree; idx += CTA_SIZE) {
...@@ -298,12 +345,20 @@ __global__ void _CSRRowWiseLayerSampleDegreeKernel( ...@@ -298,12 +345,20 @@ __global__ void _CSRRowWiseLayerSampleDegreeKernel(
} // namespace } // namespace
template <typename IdType>
int log_size(const IdType size) {
if (size <= 0) return 0;
for (int i = 0; i < static_cast<int>(sizeof(IdType)) * 8; i++)
if (((size - 1) >> i) == 0) return i;
return sizeof(IdType) * 8;
}
template <typename IdType, typename FloatType, typename exec_policy_t> template <typename IdType, typename FloatType, typename exec_policy_t>
void compute_importance_sampling_probabilities( void compute_importance_sampling_probabilities(
CSRMatrix mat, const IdType hop_size, cudaStream_t stream, CSRMatrix mat, const IdType hop_size, cudaStream_t stream,
const uint64_t random_seed, const IdType num_rows, const IdType* rows, const uint64_t random_seed, const IdType num_rows, const IdType* indptr,
const IdType* indptr, const IdType* subindptr, const IdType* indices, const IdType* subindptr, const IdType* indices, IdArray idx_coo_arr,
IdArray idx_coo_arr, const IdType* nids, const IdType* nids,
FloatArray cs_arr, // holds the computed cs values, has size num_rows FloatArray cs_arr, // holds the computed cs values, has size num_rows
const bool weighted, const FloatType* A, const FloatType* ds, const bool weighted, const FloatType* A, const FloatType* ds,
const FloatType* d2s, const IdType num_picks, DGLContext ctx, const FloatType* d2s, const IdType num_picks, DGLContext ctx,
...@@ -322,19 +377,15 @@ void compute_importance_sampling_probabilities( ...@@ -322,19 +377,15 @@ void compute_importance_sampling_probabilities(
: NullArray(); : NullArray();
auto A_l = A_l_arr.Ptr<FloatType>(); auto A_l = A_l_arr.Ptr<FloatType>();
const uint64_t max_log_num_vertices = [&]() -> int { const int max_log_num_vertices = log_size(mat.num_cols);
for (int i = 0; i < static_cast<int>(sizeof(IdType)) * 8; i++)
if (mat.num_cols <= ((IdType)1) << i) return i;
return sizeof(IdType) * 8;
}();
{ // extracts the onehop neighborhood cols to a contiguous range into hop_1 { // extracts the onehop neighborhood cols to a contiguous range into hop_1
const dim3 block(BLOCK_SIZE); const dim3 block(BLOCK_SIZE);
const dim3 grid((hop_size + BLOCK_SIZE - 1) / BLOCK_SIZE); const dim3 grid((hop_size + BLOCK_SIZE - 1) / BLOCK_SIZE);
CUDA_KERNEL_CALL( CUDA_KERNEL_CALL(
(_CSRRowWiseOneHopExtractorKernel<IdType, FloatType>), grid, block, 0, (_CSRRowWiseOneHopExtractorKernel<IdType, FloatType>), grid, block, 0,
stream, random_seed, hop_size, rows, indptr, subindptr, indices, stream, random_seed, hop_size, indptr, subindptr, indices, idx_coo,
idx_coo, nids, weighted ? A : nullptr, rands, hop_1, A_l); nids, weighted ? A : nullptr, rands, hop_1, A_l);
} }
int64_t hop_uniq_size = 0; int64_t hop_uniq_size = 0;
IdArray hop_new_arr = NewIdArray(hop_size, ctx, sizeof(IdType) * 8); IdArray hop_new_arr = NewIdArray(hop_size, ctx, sizeof(IdType) * 8);
...@@ -445,7 +496,7 @@ void compute_importance_sampling_probabilities( ...@@ -445,7 +496,7 @@ void compute_importance_sampling_probabilities(
CUDA_KERNEL_CALL( CUDA_KERNEL_CALL(
(_CSRRowWiseLayerSampleDegreeKernel< (_CSRRowWiseLayerSampleDegreeKernel<
IdType, FloatType, BLOCK_CTAS, TILE_SIZE>), IdType, FloatType, BLOCK_CTAS, TILE_SIZE>),
grid, block, 0, stream, (IdType)num_picks, num_rows, rows, cs, grid, block, 0, stream, (IdType)num_picks, num_rows, cs,
weighted ? ds : nullptr, weighted ? d2s : nullptr, indptr, weighted ? ds : nullptr, weighted ? d2s : nullptr, indptr,
probs_found, A, subindptr); probs_found, A, subindptr);
} }
...@@ -484,10 +535,12 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling( ...@@ -484,10 +535,12 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling(
IdType* const nids = IsNullArray(NIDs) ? nullptr : NIDs.Ptr<IdType>(); IdType* const nids = IsNullArray(NIDs) ? nullptr : NIDs.Ptr<IdType>();
FloatType* const A = prob_arr.Ptr<FloatType>(); FloatType* const A = prob_arr.Ptr<FloatType>();
IdType* const indptr = mat.indptr.Ptr<IdType>(); IdType* const indptr_ = mat.indptr.Ptr<IdType>();
IdType* const indices = mat.indices.Ptr<IdType>(); IdType* const indices_ = mat.indices.Ptr<IdType>();
IdType* const data = CSRHasData(mat) ? mat.data.Ptr<IdType>() : nullptr; IdType* const data = CSRHasData(mat) ? mat.data.Ptr<IdType>() : nullptr;
// Read indptr only once in case it is pinned and access is slow.
auto indptr = allocator.alloc_unique<IdType>(num_rows);
// compute in-degrees // compute in-degrees
auto in_deg = allocator.alloc_unique<IdType>(num_rows + 1); auto in_deg = allocator.alloc_unique<IdType>(num_rows + 1);
// cs stands for c_s in arXiv:2210.13339 // cs stands for c_s in arXiv:2210.13339
...@@ -504,11 +557,17 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling( ...@@ -504,11 +557,17 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling(
: NullArray(); : NullArray();
auto d2s = d2s_arr.Ptr<FloatType>(); auto d2s = d2s_arr.Ptr<FloatType>();
thrust::counting_iterator<IdType> iota(0);
thrust::for_each(
exec_policy, iota, iota + num_rows,
DegreeFunc<IdType, FloatType>{
(IdType)num_picks, rows, indptr_, in_deg.get(), indptr.get(), cs});
if (weighted) { if (weighted) {
auto b_offsets = auto b_offsets = thrust::make_transform_iterator(
thrust::make_transform_iterator(rows, IndptrFunc<IdType>{indptr}); iota, IndptrFunc<IdType>{indptr.get(), nullptr});
auto e_offsets = auto e_offsets = thrust::make_transform_iterator(
thrust::make_transform_iterator(rows, IndptrFunc<IdType>{indptr + 1}); iota, IndptrFunc<IdType>{indptr.get(), in_deg.get()});
auto A_A2 = thrust::make_transform_iterator(A, SquareFunc<FloatType>{}); auto A_A2 = thrust::make_transform_iterator(A, SquareFunc<FloatType>{});
auto ds_d2s = thrust::make_zip_iterator(ds, d2s); auto ds_d2s = thrust::make_zip_iterator(ds, d2s);
...@@ -524,13 +583,6 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling( ...@@ -524,13 +583,6 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling(
stream)); stream));
} }
thrust::counting_iterator<IdType> iota(0);
thrust::for_each(
exec_policy, iota, iota + num_rows,
DegreeFunc<IdType, FloatType>{
(IdType)num_picks, rows, indptr, weighted ? ds : nullptr,
in_deg.get(), cs});
// fill subindptr // fill subindptr
IdArray subindptr_arr = NewIdArray(num_rows + 1, ctx, sizeof(IdType) * 8); IdArray subindptr_arr = NewIdArray(num_rows + 1, ctx, sizeof(IdType) * 8);
auto subindptr = subindptr_arr.Ptr<IdType>(); auto subindptr = subindptr_arr.Ptr<IdType>();
...@@ -560,6 +612,38 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling( ...@@ -560,6 +612,38 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling(
auto idx_coo = idx_coo_arr.Ptr<IdType>(); auto idx_coo = idx_coo_arr.Ptr<IdType>();
auto hop_1 = hop_arr.Ptr<IdType>(); auto hop_1 = hop_arr.Ptr<IdType>();
const bool is_pinned = mat.indices.IsPinned();
if (is_pinned) {
const auto res = Sort(rows_arr, log_size(mat.num_rows));
const int64_t* perm = static_cast<int64_t*>(res.second->data);
IdType hop_size; // Shadows the original one as this is temporary
auto subindptr_aligned = allocator.alloc_unique<IdType>(num_rows + 1);
{
auto modified_in_deg = thrust::make_transform_iterator(
iota, AlignmentFunc<IdType>{in_deg.get(), perm, num_rows});
size_t prefix_temp_size = 0;
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
nullptr, prefix_temp_size, modified_in_deg, subindptr_aligned.get(),
num_rows + 1, stream));
auto temp = allocator.alloc_unique<char>(prefix_temp_size);
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
temp.get(), prefix_temp_size, modified_in_deg,
subindptr_aligned.get(), num_rows + 1, stream));
device->CopyDataFromTo(
subindptr_aligned.get(), num_rows * sizeof(hop_size), &hop_size, 0,
sizeof(hop_size), ctx, DGLContext{kDGLCPU, 0}, mat.indptr->dtype);
}
const dim3 block(BLOCK_SIZE);
const dim3 grid((hop_size + BLOCK_SIZE - 1) / BLOCK_SIZE);
CUDA_KERNEL_CALL(
(_CSRRowWiseOneHopExtractorAlignedKernel<IdType>), grid, block, 0,
stream, hop_size, num_rows, indptr.get(), subindptr,
subindptr_aligned.get(), indices_, hop_1, perm);
}
const auto indices = is_pinned ? hop_1 : indices_;
auto rands = auto rands =
allocator.alloc_unique<FloatType>(importance_sampling ? hop_size : 1); allocator.alloc_unique<FloatType>(importance_sampling ? hop_size : 1);
auto probs_found = auto probs_found =
...@@ -575,8 +659,8 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling( ...@@ -575,8 +659,8 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling(
CUDA_KERNEL_CALL( CUDA_KERNEL_CALL(
(_CSRRowWiseLayerSampleDegreeKernel< (_CSRRowWiseLayerSampleDegreeKernel<
IdType, FloatType, BLOCK_CTAS, TILE_SIZE>), IdType, FloatType, BLOCK_CTAS, TILE_SIZE>),
grid, block, 0, stream, (IdType)num_picks, num_rows, rows, cs, ds, d2s, grid, block, 0, stream, (IdType)num_picks, num_rows, cs, ds, d2s,
indptr, nullptr, A, subindptr); indptr.get(), nullptr, A, subindptr);
} }
const uint64_t random_seed = const uint64_t random_seed =
...@@ -587,7 +671,7 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling( ...@@ -587,7 +671,7 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling(
if (importance_sampling) if (importance_sampling)
compute_importance_sampling_probabilities< compute_importance_sampling_probabilities<
IdType, FloatType, decltype(exec_policy)>( IdType, FloatType, decltype(exec_policy)>(
mat, hop_size, stream, random_seed, num_rows, rows, indptr, subindptr, mat, hop_size, stream, random_seed, num_rows, indptr.get(), subindptr,
indices, idx_coo_arr, nids, cs_arr, weighted, A, ds, d2s, indices, idx_coo_arr, nids, cs_arr, weighted, A, ds, d2s,
(IdType)num_picks, ctx, allocator, exec_policy, importance_sampling, (IdType)num_picks, ctx, allocator, exec_policy, importance_sampling,
hop_1, rands.get(), probs_found.get()); hop_1, rands.get(), probs_found.get());
...@@ -621,8 +705,8 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling( ...@@ -621,8 +705,8 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling(
output, output,
TransformOpImp< TransformOpImp<
IdType, FloatType, FloatType*, FloatType*, decltype(one)>{ IdType, FloatType, FloatType*, FloatType*, decltype(one)>{
probs_found.get(), A, one, idx_coo, rows, cs, indptr, subindptr, probs_found.get(), A, one, idx_coo, rows, cs, indptr.get(),
indices, data}); subindptr, indices, data, is_pinned});
auto stencil = auto stencil =
thrust::make_zip_iterator(idx_coo, probs_found.get(), rands.get()); thrust::make_zip_iterator(idx_coo, probs_found.get(), rands.get());
num_edges = num_edges =
...@@ -635,8 +719,8 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling( ...@@ -635,8 +719,8 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling(
output, output,
TransformOpImp< TransformOpImp<
IdType, FloatType, FloatType*, decltype(one), decltype(one)>{ IdType, FloatType, FloatType*, decltype(one), decltype(one)>{
probs_found.get(), one, one, idx_coo, rows, cs, indptr, probs_found.get(), one, one, idx_coo, rows, cs, indptr.get(),
subindptr, indices, data}); subindptr, indices, data, is_pinned});
auto stencil = auto stencil =
thrust::make_zip_iterator(idx_coo, probs_found.get(), rands.get()); thrust::make_zip_iterator(idx_coo, probs_found.get(), rands.get());
num_edges = num_edges =
...@@ -654,12 +738,12 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling( ...@@ -654,12 +738,12 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling(
output, output,
TransformOpImp< TransformOpImp<
IdType, FloatType, decltype(one), FloatType*, FloatType*>{ IdType, FloatType, decltype(one), FloatType*, FloatType*>{
one, A, A, idx_coo, rows, cs, indptr, subindptr, indices, one, A, A, idx_coo, rows, cs, indptr.get(), subindptr, indices,
data}); data, is_pinned});
const auto pred = const auto pred =
StencilOpFused<IdType, FloatType, decltype(one), FloatType*>{ StencilOpFused<IdType, FloatType, decltype(one), FloatType*>{
random_seed, idx_coo, cs, one, A, random_seed, idx_coo, cs, one, A,
subindptr, rows, indptr, indices, nids}; subindptr, indptr.get(), indices, nids, is_pinned};
num_edges = thrust::copy_if( num_edges = thrust::copy_if(
exec_policy, iota, iota + hop_size, iota, exec_policy, iota, iota + hop_size, iota,
transformed_output, pred) - transformed_output, pred) -
...@@ -669,11 +753,12 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling( ...@@ -669,11 +753,12 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling(
picked_row_data, picked_col_data, picked_idx_data); picked_row_data, picked_col_data, picked_idx_data);
auto transformed_output = thrust::make_transform_output_iterator( auto transformed_output = thrust::make_transform_output_iterator(
output, TransformOp<IdType>{ output, TransformOp<IdType>{
idx_coo, rows, indptr, subindptr, indices, data}); idx_coo, rows, indptr.get(), subindptr, indices, data,
is_pinned});
const auto pred = const auto pred =
StencilOpFused<IdType, FloatType, decltype(one), decltype(one)>{ StencilOpFused<IdType, FloatType, decltype(one), decltype(one)>{
random_seed, idx_coo, cs, one, one, random_seed, idx_coo, cs, one, one,
subindptr, rows, indptr, indices, nids}; subindptr, indptr.get(), indices, nids, is_pinned};
num_edges = thrust::copy_if( num_edges = thrust::copy_if(
exec_policy, iota, iota + hop_size, iota, exec_policy, iota, iota + hop_size, iota,
transformed_output, pred) - transformed_output, pred) -
......
...@@ -115,47 +115,44 @@ std::pair<HeteroSubgraph, std::vector<FloatArray>> SampleLabors( ...@@ -115,47 +115,44 @@ std::pair<HeteroSubgraph, std::vector<FloatArray>> SampleLabors(
hg->NumVertices(src_vtype), hg->NumVertices(dst_vtype), hg->NumVertices(src_vtype), hg->NumVertices(dst_vtype),
hg->DataType(), ctx); hg->DataType(), ctx);
induced_edges[etype] = aten::NullArray(hg->DataType(), ctx); induced_edges[etype] = aten::NullArray(hg->DataType(), ctx);
} else if (fanouts[etype] == -1) { subimportances[etype] = NullArray();
const auto& earr = (dir == EdgeDir::kOut)
? hg->OutEdges(etype, nodes_ntype)
: hg->InEdges(etype, nodes_ntype);
subrels[etype] = UnitGraph::CreateFromCOO(
hg->GetRelationGraph(etype)->NumVertexTypes(),
hg->NumVertices(src_vtype), hg->NumVertices(dst_vtype), earr.src,
earr.dst);
induced_edges[etype] = earr.id;
} else { } else {
// sample from one relation graph // sample from one relation graph
auto req_fmt = (dir == EdgeDir::kOut) ? CSR_CODE : CSC_CODE; auto req_fmt = (dir == EdgeDir::kOut) ? CSR_CODE : CSC_CODE;
auto avail_fmt = hg->SelectFormat(etype, req_fmt); auto avail_fmt = hg->SelectFormat(etype, req_fmt);
COOMatrix sampled_coo; COOMatrix sampled_coo;
FloatArray importances; FloatArray importances;
const int64_t fanout =
fanouts[etype] >= 0
? fanouts[etype]
: std::max(
hg->NumVertices(dst_vtype), hg->NumVertices(src_vtype));
switch (avail_fmt) { switch (avail_fmt) {
case SparseFormat::kCOO: case SparseFormat::kCOO:
if (dir == EdgeDir::kIn) { if (dir == EdgeDir::kIn) {
auto fs = aten::COOLaborSampling( auto fs = aten::COOLaborSampling(
aten::COOTranspose(hg->GetCOOMatrix(etype)), nodes_ntype, aten::COOTranspose(hg->GetCOOMatrix(etype)), nodes_ntype,
fanouts[etype], prob[etype], importance_sampling, random_seed, fanout, prob[etype], importance_sampling, random_seed,
NIDs_ntype); NIDs_ntype);
sampled_coo = aten::COOTranspose(fs.first); sampled_coo = aten::COOTranspose(fs.first);
importances = fs.second; importances = fs.second;
} else { } else {
std::tie(sampled_coo, importances) = aten::COOLaborSampling( std::tie(sampled_coo, importances) = aten::COOLaborSampling(
hg->GetCOOMatrix(etype), nodes_ntype, fanouts[etype], hg->GetCOOMatrix(etype), nodes_ntype, fanout, prob[etype],
prob[etype], importance_sampling, random_seed, NIDs_ntype); importance_sampling, random_seed, NIDs_ntype);
} }
break; break;
case SparseFormat::kCSR: case SparseFormat::kCSR:
CHECK(dir == EdgeDir::kOut) CHECK(dir == EdgeDir::kOut)
<< "Cannot sample out edges on CSC matrix."; << "Cannot sample out edges on CSC matrix.";
std::tie(sampled_coo, importances) = aten::CSRLaborSampling( std::tie(sampled_coo, importances) = aten::CSRLaborSampling(
hg->GetCSRMatrix(etype), nodes_ntype, fanouts[etype], prob[etype], hg->GetCSRMatrix(etype), nodes_ntype, fanout, prob[etype],
importance_sampling, random_seed, NIDs_ntype); importance_sampling, random_seed, NIDs_ntype);
break; break;
case SparseFormat::kCSC: case SparseFormat::kCSC:
CHECK(dir == EdgeDir::kIn) << "Cannot sample in edges on CSR matrix."; CHECK(dir == EdgeDir::kIn) << "Cannot sample in edges on CSR matrix.";
std::tie(sampled_coo, importances) = aten::CSRLaborSampling( std::tie(sampled_coo, importances) = aten::CSRLaborSampling(
hg->GetCSCMatrix(etype), nodes_ntype, fanouts[etype], prob[etype], hg->GetCSCMatrix(etype), nodes_ntype, fanout, prob[etype],
importance_sampling, random_seed, NIDs_ntype); importance_sampling, random_seed, NIDs_ntype);
sampled_coo = aten::COOTranspose(sampled_coo); sampled_coo = aten::COOTranspose(sampled_coo);
break; break;
......
...@@ -681,6 +681,120 @@ def _test_sample_neighbors(hypersparse, prob): ...@@ -681,6 +681,120 @@ def _test_sample_neighbors(hypersparse, prob):
assert subg["flips"].num_edges() == 4 assert subg["flips"].num_edges() == 4
def _test_sample_labors(hypersparse, prob):
g, hg = _gen_neighbor_sampling_test_graph(hypersparse, False)
# test with seed nodes [0, 1]
def _test1(p):
subg = dgl.sampling.sample_labors(g, [0, 1], -1, prob=p)[0]
assert subg.num_nodes() == g.num_nodes()
u, v = subg.edges()
u_ans, v_ans, e_ans = g.in_edges([0, 1], form="all")
if p is not None:
emask = F.gather_row(g.edata[p], e_ans)
if p == "prob":
emask = emask != 0
u_ans = F.boolean_mask(u_ans, emask)
v_ans = F.boolean_mask(v_ans, emask)
uv = set(zip(F.asnumpy(u), F.asnumpy(v)))
uv_ans = set(zip(F.asnumpy(u_ans), F.asnumpy(v_ans)))
assert uv == uv_ans
for i in range(10):
subg = dgl.sampling.sample_labors(g, [0, 1], 2, prob=p)[0]
assert subg.num_nodes() == g.num_nodes()
assert subg.num_edges() >= 0
u, v = subg.edges()
assert set(F.asnumpy(F.unique(v))).issubset({0, 1})
assert F.array_equal(
F.astype(g.has_edges_between(u, v), F.int64),
F.ones((subg.num_edges(),), dtype=F.int64),
)
assert F.array_equal(g.edge_ids(u, v), subg.edata[dgl.EID])
edge_set = set(zip(list(F.asnumpy(u)), list(F.asnumpy(v))))
# check no duplication
assert len(edge_set) == subg.num_edges()
if p is not None:
assert not (3, 0) in edge_set
assert not (3, 1) in edge_set
_test1(prob)
# test with seed nodes [0, 2]
def _test2(p):
subg = dgl.sampling.sample_labors(g, [0, 2], -1, prob=p)[0]
assert subg.num_nodes() == g.num_nodes()
u, v = subg.edges()
u_ans, v_ans, e_ans = g.in_edges([0, 2], form="all")
if p is not None:
emask = F.gather_row(g.edata[p], e_ans)
if p == "prob":
emask = emask != 0
u_ans = F.boolean_mask(u_ans, emask)
v_ans = F.boolean_mask(v_ans, emask)
uv = set(zip(F.asnumpy(u), F.asnumpy(v)))
uv_ans = set(zip(F.asnumpy(u_ans), F.asnumpy(v_ans)))
assert uv == uv_ans
for i in range(10):
subg = dgl.sampling.sample_labors(g, [0, 2], 2, prob=p)[0]
assert subg.num_nodes() == g.num_nodes()
assert subg.num_edges() >= 0
u, v = subg.edges()
assert set(F.asnumpy(F.unique(v))).issubset({0, 2})
assert F.array_equal(
F.astype(g.has_edges_between(u, v), F.int64),
F.ones((subg.num_edges(),), dtype=F.int64),
)
assert F.array_equal(g.edge_ids(u, v), subg.edata[dgl.EID])
edge_set = set(zip(list(F.asnumpy(u)), list(F.asnumpy(v))))
# check no duplication
assert len(edge_set) == subg.num_edges()
if p is not None:
assert not (3, 0) in edge_set
_test2(prob)
# test with heterogenous seed nodes
def _test3(p):
subg = dgl.sampling.sample_labors(
hg, {"user": [0, 1], "game": 0}, -1, prob=p
)[0]
assert len(subg.ntypes) == 3
assert len(subg.etypes) == 4
assert subg["follow"].num_edges() == 6 if p is None else 4
assert subg["play"].num_edges() == 1
assert subg["liked-by"].num_edges() == 4
assert subg["flips"].num_edges() == 0
for i in range(10):
subg = dgl.sampling.sample_labors(
hg, {"user": [0, 1], "game": 0}, 2, prob=p
)[0]
assert len(subg.ntypes) == 3
assert len(subg.etypes) == 4
assert subg["follow"].num_edges() >= 0
assert subg["play"].num_edges() >= 0
assert subg["liked-by"].num_edges() >= 0
assert subg["flips"].num_edges() >= 0
_test3(prob)
# test different fanouts for different relations
for i in range(10):
subg = dgl.sampling.sample_labors(
hg,
{"user": [0, 1], "game": 0, "coin": 0},
{"follow": 1, "play": 2, "liked-by": 0, "flips": g.num_nodes()},
)[0]
assert len(subg.ntypes) == 3
assert len(subg.etypes) == 4
assert subg["follow"].num_edges() >= 0
assert subg["play"].num_edges() >= 0
assert subg["liked-by"].num_edges() == 0
assert subg["flips"].num_edges() == 4
def _test_sample_neighbors_outedge(hypersparse): def _test_sample_neighbors_outedge(hypersparse):
g, hg = _gen_neighbor_sampling_test_graph(hypersparse, True) g, hg = _gen_neighbor_sampling_test_graph(hypersparse, True)
...@@ -967,11 +1081,19 @@ def test_sample_neighbors_noprob(): ...@@ -967,11 +1081,19 @@ def test_sample_neighbors_noprob():
# _test_sample_neighbors(True) # _test_sample_neighbors(True)
def test_sample_labors_noprob():
_test_sample_labors(False, None)
def test_sample_neighbors_prob(): def test_sample_neighbors_prob():
_test_sample_neighbors(False, "prob") _test_sample_neighbors(False, "prob")
# _test_sample_neighbors(True) # _test_sample_neighbors(True)
def test_sample_labors_prob():
_test_sample_labors(False, "prob")
def test_sample_neighbors_outedge(): def test_sample_neighbors_outedge():
_test_sample_neighbors_outedge(False) _test_sample_neighbors_outedge(False)
# _test_sample_neighbors_outedge(True) # _test_sample_neighbors_outedge(True)
...@@ -1575,7 +1697,9 @@ if __name__ == "__main__": ...@@ -1575,7 +1697,9 @@ if __name__ == "__main__":
from itertools import product from itertools import product
test_sample_neighbors_noprob() test_sample_neighbors_noprob()
test_sample_labors_noprob()
test_sample_neighbors_prob() test_sample_neighbors_prob()
test_sample_labors_prob()
test_sample_neighbors_mask() test_sample_neighbors_mask()
for args in product(["coo", "csr", "csc"], ["in", "out"], [False, True]): for args in product(["coo", "csr", "csc"], ["in", "out"], [False, True]):
test_sample_neighbors_etype_homogeneous(*args) test_sample_neighbors_etype_homogeneous(*args)
......
...@@ -320,7 +320,9 @@ def _ddp_runner(proc_id, nprocs, g, data, args): ...@@ -320,7 +320,9 @@ def _ddp_runner(proc_id, nprocs, g, data, args):
@parametrize_idtype @parametrize_idtype
@pytest.mark.parametrize("sampler_name", ["full", "neighbor", "neighbor2"]) @pytest.mark.parametrize(
"sampler_name", ["full", "neighbor", "neighbor2", "labor"]
)
@pytest.mark.parametrize( @pytest.mark.parametrize(
"mode", ["cpu", "uva_cuda_indices", "uva_cpu_indices", "pure_gpu"] "mode", ["cpu", "uva_cuda_indices", "uva_cpu_indices", "pure_gpu"]
) )
...@@ -353,6 +355,7 @@ def test_node_dataloader(idtype, sampler_name, mode, use_ddp): ...@@ -353,6 +355,7 @@ def test_node_dataloader(idtype, sampler_name, mode, use_ddp):
"full": dgl.dataloading.MultiLayerFullNeighborSampler(2), "full": dgl.dataloading.MultiLayerFullNeighborSampler(2),
"neighbor": dgl.dataloading.MultiLayerNeighborSampler([3, 3]), "neighbor": dgl.dataloading.MultiLayerNeighborSampler([3, 3]),
"neighbor2": dgl.dataloading.MultiLayerNeighborSampler([3, 3]), "neighbor2": dgl.dataloading.MultiLayerNeighborSampler([3, 3]),
"labor": dgl.dataloading.LaborSampler([3, 3]),
}[sampler_name] }[sampler_name]
for num_workers in [0, 1, 2] if mode == "cpu" else [0]: for num_workers in [0, 1, 2] if mode == "cpu" else [0]:
dataloader = dgl.dataloading.DataLoader( dataloader = dgl.dataloading.DataLoader(
...@@ -405,6 +408,7 @@ def test_node_dataloader(idtype, sampler_name, mode, use_ddp): ...@@ -405,6 +408,7 @@ def test_node_dataloader(idtype, sampler_name, mode, use_ddp):
[{etype: 3 for etype in g2.etypes}] * 2 [{etype: 3 for etype in g2.etypes}] * 2
), ),
"neighbor2": dgl.dataloading.MultiLayerNeighborSampler([3, 3]), "neighbor2": dgl.dataloading.MultiLayerNeighborSampler([3, 3]),
"labor": dgl.dataloading.LaborSampler([3, 3]),
}[sampler_name] }[sampler_name]
for num_workers in [0, 1, 2] if mode == "cpu" else [0]: for num_workers in [0, 1, 2] if mode == "cpu" else [0]:
dataloader = dgl.dataloading.DataLoader( dataloader = dgl.dataloading.DataLoader(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment