Unverified Commit 6e46bbf5 authored by ayasar70's avatar ayasar70 Committed by GitHub
Browse files

[Performance][GPU] Improving Disjoint Union kernel for Graph Dataloaders (#3895)



* Based on issue #3436. Improving _SegmentCopyKernel's GPU utilization by switching to nonzero-based thread assignment

* fixing lint issues

* Update cub for cuda 11.5 compatibility (#3468)

* fixing type mismatch

* tx guaranteed to be smaller than nnz. Hence removing last check

* minor: updating comment

* adding three unit tests for csr slice method to cover some corner cases

* timing repeatkernel

* clean

* clean

* clean

* updating _SegmentMaskColKernel

* Working on requests: removing sorted array check and adding comments to utility functions

* fixing lint issue

* Optimizing disjoint union kernel

* Trying to resolve compilation issue on CI

* [EMPTY] Relevant commit message here

* applying revision requests on cpu/disjoint_union.cc

* removing unnecessary casts

* remove extra space
Co-authored-by: default avatarAbdurrahman Yasar <ayasar@nvidia.com>
Co-authored-by: default avatarnv-dlasalle <63612878+nv-dlasalle@users.noreply.github.com>
Co-authored-by: default avatarJinjing Zhou <VoVAllen@users.noreply.github.com>
Co-authored-by: default avatarQuan (Andy) Gan <coin2028@hotmail.com>
parent 306e0a46
......@@ -857,6 +857,16 @@ std::pair<COOMatrix, IdArray> COOCoalesce(COOMatrix coo) {
return ret;
}
// Dispatcher for the disjoint (block-diagonal) union of COO matrices.
// Selects the device- and id-width-specific implementation from the
// context and dtype of the first input's row array; all inputs are
// expected to share that context/dtype (checked inside the impl).
COOMatrix DisjointUnionCoo(const std::vector<COOMatrix>& coos) {
COOMatrix ret;
// Dispatch on device type (CPU or CUDA), then on id integer width.
ATEN_XPU_SWITCH_CUDA(coos[0].row->ctx.device_type, XPU, "DisjointUnionCoo", {
ATEN_ID_TYPE_SWITCH(coos[0].row->dtype, IdType, {
ret = impl::DisjointUnionCoo<XPU, IdType>(coos);
});
});
return ret;
}
COOMatrix COOLineGraph(const COOMatrix &coo, bool backtracking) {
COOMatrix ret;
ATEN_COO_SWITCH(coo, XPU, IdType, "COOLineGraph", {
......
......@@ -256,6 +256,9 @@ COOMatrix COOSliceMatrix(COOMatrix coo, runtime::NDArray rows, runtime::NDArray
template <DLDeviceType XPU, typename IdType>
std::pair<COOMatrix, IdArray> COOCoalesce(COOMatrix coo);
// Disjoint (block-diagonal) union of multiple COO matrices; the
// device-specific implementation is selected via the XPU parameter.
template <DLDeviceType XPU, typename IdType>
COOMatrix DisjointUnionCoo(const std::vector<COOMatrix>& coos);
template <DLDeviceType XPU, typename IdType>
void COOSort_(COOMatrix* mat, bool sort_column);
......
/**
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* \file array/cpu/disjoint_union.cc
* \brief Disjoint union CPU implementation.
*/
#include <dgl/array.h>
#include <dgl/runtime/parallel_for.h>
#include <tuple>
namespace dgl {
using runtime::NDArray;
namespace aten {
namespace impl {
template <DLDeviceType XPU, typename IdType>
std::tuple<IdArray, IdArray, IdArray> _ComputePrefixSums(const std::vector<COOMatrix>& coos) {
IdArray prefix_src_arr = NewIdArray(
coos.size(), coos[0].row->ctx, coos[0].row->dtype.bits);
IdArray prefix_dst_arr = NewIdArray(
coos.size(), coos[0].row->ctx, coos[0].row->dtype.bits);
IdArray prefix_elm_arr = NewIdArray(
coos.size(), coos[0].row->ctx, coos[0].row->dtype.bits);
auto prefix_src = prefix_src_arr.Ptr<IdType>();
auto prefix_dst = prefix_dst_arr.Ptr<IdType>();
auto prefix_elm = prefix_elm_arr.Ptr<IdType>();
dgl::runtime::parallel_for(0, coos.size(), [&](IdType b, IdType e){
for (IdType i = b; i < e; ++i) {
prefix_src[i] = coos[i].num_rows;
prefix_dst[i] = coos[i].num_cols;
prefix_elm[i] = coos[i].row->shape[0];
}
});
return std::make_tuple(CumSum(prefix_src_arr, true),
CumSum(prefix_dst_arr, true),
CumSum(prefix_elm_arr, true));
}
// Concatenates a list of COO matrices into one block-diagonal COO matrix
// (CPU implementation).  Row/column ids of matrix i are shifted by the total
// rows/columns of matrices 0..i-1; edge data, if any input carries it, is
// shifted by the running edge count.
//
// Fix vs. previous revision: row_sorted/col_sorted were updated from inside
// the parallel_for lambda — unsynchronized concurrent writes to plain bools
// are a data race (UB).  The flags are now folded in the serial validation
// loop.  Also drops the redundant static_cast<IdArray> on std::get results
// and avoids taking Ptr<IdType>() on the null data array.
template <DLDeviceType XPU, typename IdType>
COOMatrix DisjointUnionCoo(const std::vector<COOMatrix>& coos) {
  bool has_data = false;
  bool row_sorted = true;
  bool col_sorted = true;
  // Validate inputs and collect flags serially: writing the sorted flags
  // from the parallel loop below would be a data race.
  for (size_t i = 0; i < coos.size(); ++i) {
    CHECK_SAME_DTYPE(coos[0].row, coos[i].row);
    CHECK_SAME_CONTEXT(coos[0].row, coos[i].row);
    has_data |= COOHasData(coos[i]);
    row_sorted &= coos[i].row_sorted;
    col_sorted &= coos[i].col_sorted;
  }
  // prefix_*[i] is the shift for matrix i; prefix_*[coos.size()] the total.
  auto prefixes = _ComputePrefixSums<XPU, IdType>(coos);
  auto prefix_src = std::get<0>(prefixes).Ptr<IdType>();
  auto prefix_dst = std::get<1>(prefixes).Ptr<IdType>();
  auto prefix_elm = std::get<2>(prefixes).Ptr<IdType>();
  IdArray result_src = NewIdArray(
      prefix_elm[coos.size()], coos[0].row->ctx, coos[0].row->dtype.bits);
  IdArray result_dst = NewIdArray(
      prefix_elm[coos.size()], coos[0].col->ctx, coos[0].col->dtype.bits);
  IdArray result_dat = NullArray();
  if (has_data) {
    result_dat = NewIdArray(
        prefix_elm[coos.size()], coos[0].row->ctx, coos[0].row->dtype.bits);
  }
  auto res_src_data = result_src.Ptr<IdType>();
  auto res_dst_data = result_dst.Ptr<IdType>();
  auto res_dat_data = has_data ? result_dat.Ptr<IdType>() : nullptr;
  // Each input matrix writes a disjoint slice [prefix_elm[i],
  // prefix_elm[i+1]) of the outputs, so the per-matrix copies are
  // safe to run in parallel.
  dgl::runtime::parallel_for(0, coos.size(), [&](IdType b, IdType e) {
    for (IdType i = b; i < e; ++i) {
      const aten::COOMatrix &coo = coos[i];
      auto edges_src = coo.row.Ptr<IdType>();
      auto edges_dst = coo.col.Ptr<IdType>();
      auto edges_dat = coo.data.Ptr<IdType>();
      for (IdType j = 0; j < coo.row->shape[0]; j++) {
        res_src_data[prefix_elm[i] + j] = edges_src[j] + prefix_src[i];
      }
      for (IdType j = 0; j < coo.row->shape[0]; j++) {
        res_dst_data[prefix_elm[i] + j] = edges_dst[j] + prefix_dst[i];
      }
      if (has_data) {
        for (IdType j = 0; j < coo.row->shape[0]; j++) {
          // Matrices without an explicit data array implicitly use the
          // identity edge ids 0..nnz-1.
          const auto d = (!COOHasData(coo)) ? j : edges_dat[j];
          res_dat_data[prefix_elm[i] + j] = d + prefix_elm[i];
        }
      }
    }
  });
  return COOMatrix(
      prefix_src[coos.size()], prefix_dst[coos.size()],
      result_src,
      result_dst,
      result_dat,
      row_sorted,
      col_sorted);
}
template COOMatrix DisjointUnionCoo<kDLCPU, int32_t>(const std::vector<COOMatrix>& coos);
template COOMatrix DisjointUnionCoo<kDLCPU, int64_t>(const std::vector<COOMatrix>& coos);
} // namespace impl
} // namespace aten
} // namespace dgl
/**
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* \file array/gpu/disjoint_union.cu
* \brief Disjoint union GPU implementation.
*/
#include <dgl/runtime/parallel_for.h>
#include <dgl/array.h>
#include <vector>
#include <tuple>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
namespace dgl {
using runtime::NDArray;
namespace aten {
namespace impl {
/*!
 * \brief Kernel that assembles the concatenated output array: one logical
 * thread per output element, advanced in a grid-stride loop (stride =
 * gridDim.x * blockDim.x).
 *
 * Fix: the element count was an `int`, silently truncating 64-bit edge
 * counts passed by the caller when IdType is int64; it is now int64_t.
 *
 * \param arrs   Device array of per-matrix device pointers (may contain NULL
 *               entries for matrices without a data array).
 * \param prefix Per-matrix value shift (prefix sums of row/col/nnz counts).
 * \param offset Prefix sums of nnz counts; offset[i] is where matrix i's
 *               elements start in the output.
 * \param out    Output array of n_elms elements.
 * \param n_arrs Number of input matrices.
 * \param n_elms Total number of output elements.
 */
template <typename IdType>
__global__ void _DisjointUnionKernel(
    IdType** arrs, IdType* prefix, IdType* offset, IdType* out,
    int64_t n_arrs, int64_t n_elms) {
  IdType tx = static_cast<IdType>(blockIdx.x) * blockDim.x + threadIdx.x;
  const int stride_x = gridDim.x * blockDim.x;
  while (tx < n_elms) {
    // Binary-search the nnz prefix sums to find which input matrix element
    // tx belongs to.
    IdType i = dgl::cuda::_UpperBound(offset, n_arrs, tx) - 1;
    if (arrs[i] == NULL) {
      // Missing data array: edge ids are implicitly the identity mapping.
      out[tx] = tx;
    } else {
      IdType j = tx - offset[i];
      out[tx] = arrs[i][j] + prefix[i];
    }
    tx += stride_x;
  }
}
template <DLDeviceType XPU, typename IdType>
std::tuple<IdArray, IdArray, IdArray> _ComputePrefixSums(const std::vector<COOMatrix>& coos) {
IdType n = coos.size(), nbits = coos[0].row->dtype.bits;
IdArray n_rows = NewIdArray(n, CPU, nbits);
IdArray n_cols = NewIdArray(n, CPU, nbits);
IdArray n_elms = NewIdArray(n, CPU, nbits);
IdType* n_rows_data = n_rows.Ptr<IdType>();
IdType* n_cols_data = n_cols.Ptr<IdType>();
IdType* n_elms_data = n_elms.Ptr<IdType>();
dgl::runtime::parallel_for(0, coos.size(), [&](IdType b, IdType e){
for (IdType i = b; i < e; ++i) {
n_rows_data[i] = coos[i].num_rows;
n_cols_data[i] = coos[i].num_cols;
n_elms_data[i] = coos[i].row->shape[0];
}
});
return std::make_tuple(CumSum(n_rows.CopyTo(coos[0].row->ctx), true),
CumSum(n_cols.CopyTo(coos[0].row->ctx), true),
CumSum(n_elms.CopyTo(coos[0].row->ctx), true));
}
/*!
 * \brief Launches _DisjointUnionKernel to copy the per-matrix arrays into
 * one concatenated device array, shifting every element by its matrix's
 * prefix.
 *
 * Fix: n_elms was an `int`, narrowing the IdType (possibly int64) edge
 * count passed by the caller; block count is now computed in 64-bit
 * arithmetic as well.
 *
 * \param arrs   Host array of per-matrix device pointers.
 * \param prefix Device array of per-matrix value shifts.
 * \param offset Device array of nnz prefix sums (element offsets).
 * \param out    Device output array of n_elms elements.
 */
template <DLDeviceType XPU, typename IdType>
void _Merge(IdType** arrs, IdType* prefix, IdType* offset, IdType* out,
            int64_t n_arrs, int64_t n_elms,
            DGLContext ctx, DGLType dtype, cudaStream_t stream) {
  auto device = runtime::DeviceAPI::Get(ctx);
  const int nt = 256;
  const int64_t nb = (n_elms + nt - 1) / nt;
  // The kernel dereferences the pointer table itself, so it must be copied
  // into device memory first.
  IdType** arrs_dev = static_cast<IdType**>(
      device->AllocWorkspace(ctx, n_arrs * sizeof(IdType*)));
  device->CopyDataFromTo(
      arrs, 0, arrs_dev, 0, sizeof(IdType*) * n_arrs,
      DGLContext{kDLCPU, 0}, ctx, dtype, 0);
  CUDA_KERNEL_CALL(_DisjointUnionKernel,
      nb, nt, 0, stream,
      arrs_dev, prefix, offset,
      out, n_arrs, n_elms);
  device->FreeWorkspace(ctx, arrs_dev);
}
// Disjoint (block-diagonal) union of COO matrices on the GPU.  Builds the
// row/col/nnz prefix sums on the device, then launches one merge kernel per
// output array (src, dst, and data when any input has it).
template <DLDeviceType XPU, typename IdType>
COOMatrix DisjointUnionCoo(const std::vector<COOMatrix>& coos) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
auto device = runtime::DeviceAPI::Get(coos[0].row->ctx);
uint64_t src_offset = 0, dst_offset = 0;
bool has_data = false;
bool row_sorted = true;
bool col_sorted = true;
// Validate dtype/context consistency and detect whether any input carries
// an explicit edge-data array (which forces the result to have one).
for (size_t i = 0; i < coos.size(); ++i) {
CHECK_SAME_DTYPE(coos[0].row, coos[i].row);
CHECK_SAME_CONTEXT(coos[0].row, coos[i].row);
has_data |= COOHasData(coos[i]);
}
// Device-resident prefix sums: entry i is the shift applied to matrix i,
// entry coos.size() is the grand total.
auto prefixes = _ComputePrefixSums<XPU, IdType>(coos);
auto prefix_src = static_cast<IdType*>(std::get<0>(prefixes)->data);
auto prefix_dst = static_cast<IdType*>(std::get<1>(prefixes)->data);
auto prefix_elm = static_cast<IdType*>(std::get<2>(prefixes)->data);
// Host-side tables of per-matrix device pointers; _Merge copies each table
// to the device before launching the kernel.
std::unique_ptr<IdType*[]> rows(new IdType*[coos.size()]);
std::unique_ptr<IdType*[]> cols(new IdType*[coos.size()]);
std::unique_ptr<IdType*[]> data(new IdType*[coos.size()]);
for (size_t i = 0; i < coos.size(); i++) {
row_sorted &= coos[i].row_sorted;
col_sorted &= coos[i].col_sorted;
rows[i] = coos[i].row.Ptr<IdType>();
cols[i] = coos[i].col.Ptr<IdType>();
data[i] = coos[i].data.Ptr<IdType>();
}
auto ctx = coos[0].row->ctx;
auto dtype = coos[0].row->dtype;
auto stream = thr_entry->stream;
IdType n_elements = 0;
// Fetch the three totals (last prefix entries) back to the host.
// NOTE(review): only sizeof(IdType) bytes are copied into the 64-bit
// src_offset/dst_offset variables; this relies on their zero-initialization
// (and, presumably, little-endian layout) when IdType is int32 — verify.
// NOTE(review): stream argument 0 is passed here rather than `stream`;
// confirm CopyDataFromTo synchronizes before n_elements is used below.
device->CopyDataFromTo(
&prefix_elm[coos.size()], 0, &n_elements, 0,
sizeof(IdType), coos[0].row->ctx, DGLContext{kDLCPU, 0},
coos[0].row->dtype, 0);
device->CopyDataFromTo(
&prefix_src[coos.size()], 0, &src_offset, 0,
sizeof(IdType), coos[0].row->ctx, DGLContext{kDLCPU, 0},
coos[0].row->dtype, 0);
device->CopyDataFromTo(
&prefix_dst[coos.size()], 0, &dst_offset, 0,
sizeof(IdType), coos[0].row->ctx, DGLContext{kDLCPU, 0},
coos[0].row->dtype, 0);
// Union src array
IdArray result_src = NewIdArray(
n_elements, coos[0].row->ctx, coos[0].row->dtype.bits);
_Merge<XPU, IdType>(rows.get(), prefix_src, prefix_elm, result_src.Ptr<IdType>(),
coos.size(), n_elements, ctx, dtype, stream);
// Union dst array
IdArray result_dst = NewIdArray(
n_elements, coos[0].col->ctx, coos[0].col->dtype.bits);
_Merge<XPU, IdType>(cols.get(), prefix_dst, prefix_elm, result_dst.Ptr<IdType>(),
coos.size(), n_elements, ctx, dtype, stream);
// Union data array if exists and fetch number of elements.  For the data
// array both the value shift and the element offset are the nnz prefix
// sums; NULL entries in `data` make the kernel emit identity edge ids.
IdArray result_dat = NullArray();
if (has_data) {
result_dat = NewIdArray(
n_elements, coos[0].row->ctx, coos[0].row->dtype.bits);
_Merge<XPU, IdType>(data.get(), prefix_elm, prefix_elm, result_dat.Ptr<IdType>(),
coos.size(), n_elements, ctx, dtype, stream);
}
return COOMatrix(
src_offset, dst_offset,
result_src,
result_dst,
result_dat,
row_sorted,
col_sorted);
}
template COOMatrix DisjointUnionCoo<kDLGPU, int32_t>(const std::vector<COOMatrix>& coos);
template COOMatrix DisjointUnionCoo<kDLGPU, int64_t>(const std::vector<COOMatrix>& coos);
} // namespace impl
} // namespace aten
} // namespace dgl
......@@ -9,66 +9,6 @@
namespace dgl {
namespace aten {
///////////////////////// COO Based Operations/////////////////////////
// Concatenates COO matrices into one block-diagonal COO matrix.  Row and
// column ids of the i-th matrix are shifted by the total rows/columns of
// the preceding matrices; when any input carries an explicit edge-data
// array, edge ids are shifted by the running edge count.
COOMatrix DisjointUnionCoo(const std::vector<COOMatrix>& coos) {
  uint64_t src_offset = 0, dst_offset = 0;
  int64_t edge_data_offset = 0;
  bool has_data = false;
  bool row_sorted = true;
  bool col_sorted = true;
  // Check consistency and whether any input has an explicit data array.
  for (size_t i = 0; i < coos.size(); ++i) {
    CHECK_SAME_DTYPE(coos[0].row, coos[i].row);
    CHECK_SAME_CONTEXT(coos[0].row, coos[i].row);
    has_data |= COOHasData(coos[i]);
  }
  std::vector<IdArray> shifted_rows(coos.size());
  std::vector<IdArray> shifted_cols(coos.size());
  std::vector<IdArray> shifted_data;
  for (size_t i = 0; i < coos.size(); ++i) {
    const aten::COOMatrix &coo = coos[i];
    row_sorted = row_sorted && coo.row_sorted;
    col_sorted = col_sorted && coo.col_sorted;
    shifted_rows[i] = coo.row + src_offset;
    shifted_cols[i] = coo.col + dst_offset;
    src_offset += coo.num_rows;
    dst_offset += coo.num_cols;
    if (has_data) {
      // Inputs without a data array implicitly use identity edge ids, so
      // materialize the shifted range for them.
      IdArray shifted = COOHasData(coo)
          ? coo.data + edge_data_offset
          : Range(edge_data_offset,
                  edge_data_offset + coo.row->shape[0],
                  coo.row->dtype.bits,
                  coo.row->ctx);
      shifted_data.push_back(shifted);
      edge_data_offset += coo.row->shape[0];
    }
  }
  return COOMatrix(
      src_offset, dst_offset,
      Concat(shifted_rows),
      Concat(shifted_cols),
      has_data ? Concat(shifted_data) : NullArray(),
      row_sorted,
      col_sorted);
}
std::vector<COOMatrix> DisjointPartitionCooBySizes(
const COOMatrix &coo,
const uint64_t batch_size,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment