Unverified Commit 8ae50c42 authored by Hongzhi (Steve), Chen and committed by GitHub

[Misc] clang-format auto fix. (#4804)



* [Misc] clang-format auto fix.

* manual

* manual

* manual

* manual

* todo

* fix
Co-authored-by: Steve <ubuntu@ip-172-31-34-29.ap-northeast-1.compute.internal>
parent 81831111
......@@ -6,10 +6,11 @@
#include <dgl/array.h>
#include <dgl/array_iterator.h>
#include <dgl/runtime/parallel_for.h>
#include <dgl/random.h>
#include <utility>
#include <dgl/runtime/parallel_for.h>
#include <algorithm>
#include <utility>
using namespace dgl::runtime;
......@@ -19,15 +20,12 @@ namespace impl {
template <DGLDeviceType XPU, typename IdType>
std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling(
const CSRMatrix &csr,
int64_t num_samples,
int num_trials,
bool exclude_self_loops,
bool replace,
double redundancy) {
const CSRMatrix& csr, int64_t num_samples, int num_trials,
bool exclude_self_loops, bool replace, double redundancy) {
const int64_t num_row = csr.num_rows;
const int64_t num_col = csr.num_cols;
const int64_t num_actual_samples = static_cast<int64_t>(num_samples * (1 + redundancy));
const int64_t num_actual_samples =
static_cast<int64_t>(num_samples * (1 + redundancy));
IdArray row = Full<IdType>(-1, num_actual_samples, csr.indptr->ctx);
IdArray col = Full<IdType>(-1, num_actual_samples, csr.indptr->ctx);
IdType* row_data = row.Ptr<IdType>();
......@@ -48,23 +46,30 @@ std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling(
});
PairIterator<IdType> begin(row_data, col_data);
PairIterator<IdType> end = std::remove_if(begin, begin + num_actual_samples,
PairIterator<IdType> end = std::remove_if(
begin, begin + num_actual_samples,
[](const std::pair<IdType, IdType>& val) { return val.first == -1; });
if (!replace) {
std::sort(begin, end,
[](const std::pair<IdType, IdType>& a, const std::pair<IdType, IdType>& b) {
return a.first < b.first || (a.first == b.first && a.second < b.second);
});;
std::sort(
begin, end,
[](const std::pair<IdType, IdType>& a,
const std::pair<IdType, IdType>& b) {
return a.first < b.first ||
(a.first == b.first && a.second < b.second);
});
end = std::unique(begin, end);
}
int64_t num_sampled = std::min(static_cast<int64_t>(end - begin), num_samples);
return {row.CreateView({num_sampled}, row->dtype), col.CreateView({num_sampled}, col->dtype)};
int64_t num_sampled =
std::min(static_cast<int64_t>(end - begin), num_samples);
return {
row.CreateView({num_sampled}, row->dtype),
col.CreateView({num_sampled}, col->dtype)};
}
template std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling<kDGLCPU, int32_t>(
const CSRMatrix&, int64_t, int, bool, bool, double);
template std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling<kDGLCPU, int64_t>(
const CSRMatrix&, int64_t, int, bool, bool, double);
template std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling<
kDGLCPU, int32_t>(const CSRMatrix&, int64_t, int, bool, bool, double);
template std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling<
kDGLCPU, int64_t>(const CSRMatrix&, int64_t, int, bool, bool, double);
}; // namespace impl
}; // namespace aten
......
......@@ -9,6 +9,7 @@
#include <dgl/array.h>
#include <dgl/bcast.h>
#include <dgl/runtime/parallel_for.h>
#include "../selector.h"
namespace dgl {
......@@ -25,38 +26,41 @@ namespace cpu {
* \note it uses node parallel strategy, different threads are responsible
* for the computation of different nodes.
*/
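// In effect, for every nonzero (rid, cid) with edge id eid this computes
// out[eid * dim + k] = Op(lhs_slice, rhs_slice), where the lhs/rhs slices come from
// the source-node, edge, or destination-node features as selected by
// LhsTarget/RhsTarget.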
template <typename IdType, typename DType, typename Op,
int LhsTarget = 0, int RhsTarget = 2>
void SDDMMCsr(const BcastOff& bcast,
const CSRMatrix& csr,
NDArray lhs, NDArray rhs, NDArray out) {
template <
typename IdType, typename DType, typename Op, int LhsTarget = 0,
int RhsTarget = 2>
void SDDMMCsr(
const BcastOff& bcast, const CSRMatrix& csr, NDArray lhs, NDArray rhs,
NDArray out) {
const bool has_idx = !IsNullArray(csr.data);
const IdType* indptr = csr.indptr.Ptr<IdType>();
const IdType* indices = csr.indices.Ptr<IdType>();
const IdType* edges = csr.data.Ptr<IdType>();
const DType* X = lhs.Ptr<DType>();
const DType* Y = rhs.Ptr<DType>();
const int64_t dim = bcast.out_len,
lhs_dim = bcast.lhs_len,
rhs_dim = bcast.rhs_len,
reduce_size = bcast.reduce_size;
const int64_t dim = bcast.out_len, lhs_dim = bcast.lhs_len,
rhs_dim = bcast.rhs_len, reduce_size = bcast.reduce_size;
DType* O = out.Ptr<DType>();
runtime::parallel_for(0, csr.num_rows, [=](IdType b, IdType e) {
for (auto rid = b; rid < e; ++rid) {
const IdType row_start = indptr[rid], row_end = indptr[rid + 1];
for (IdType j = row_start; j < row_end; ++j) {
const IdType cid = indices[j];
const IdType eid = has_idx? edges[j] : j;
const IdType eid = has_idx ? edges[j] : j;
DType* out_off = O + eid * dim;
for (int64_t k = 0; k < dim; ++k) {
const int64_t lhs_add = bcast.use_bcast ? bcast.lhs_offset[k] : k;
const int64_t rhs_add = bcast.use_bcast ? bcast.rhs_offset[k] : k;
const DType* lhs_off = Op::use_lhs
? X + Selector<LhsTarget>::Call(rid, eid, cid) * lhs_dim + lhs_add * reduce_size
: nullptr;
const DType* rhs_off = Op::use_rhs
? Y + Selector<RhsTarget>::Call(rid, eid, cid) * rhs_dim + rhs_add * reduce_size
: nullptr;
const DType* lhs_off =
Op::use_lhs
? X + Selector<LhsTarget>::Call(rid, eid, cid) * lhs_dim +
lhs_add * reduce_size
: nullptr;
const DType* rhs_off =
Op::use_rhs
? Y + Selector<RhsTarget>::Call(rid, eid, cid) * rhs_dim +
rhs_add * reduce_size
: nullptr;
out_off[k] = Op::Call(lhs_off, rhs_off, reduce_size);
}
}
......@@ -74,35 +78,38 @@ void SDDMMCsr(const BcastOff& bcast,
* \note it uses edge parallel strategy, different threads are responsible
* for the computation of different edges.
*/
template <typename IdType, typename DType, typename Op,
int LhsTarget = 0, int RhsTarget = 2>
void SDDMMCoo(const BcastOff& bcast,
const COOMatrix& coo,
NDArray lhs, NDArray rhs, NDArray out) {
template <
typename IdType, typename DType, typename Op, int LhsTarget = 0,
int RhsTarget = 2>
void SDDMMCoo(
const BcastOff& bcast, const COOMatrix& coo, NDArray lhs, NDArray rhs,
NDArray out) {
const bool has_idx = !IsNullArray(coo.data);
const IdType* row = coo.row.Ptr<IdType>();
const IdType* col = coo.col.Ptr<IdType>();
const IdType* edges = coo.data.Ptr<IdType>();
const DType* X = lhs.Ptr<DType>();
const DType* Y = rhs.Ptr<DType>();
const int64_t dim = bcast.out_len,
lhs_dim = bcast.lhs_len,
rhs_dim = bcast.rhs_len,
reduce_size = bcast.reduce_size;
const int64_t dim = bcast.out_len, lhs_dim = bcast.lhs_len,
rhs_dim = bcast.rhs_len, reduce_size = bcast.reduce_size;
DType* O = out.Ptr<DType>();
#pragma omp parallel for
for (int64_t i = 0; i < coo.row->shape[0]; ++i) {
const IdType rid = row[i];
const IdType cid = col[i];
const IdType eid = has_idx? edges[i] : i;
const IdType eid = has_idx ? edges[i] : i;
DType* out_off = O + eid * dim;
for (int64_t k = 0; k < dim; ++k) {
const int64_t lhs_add = bcast.use_bcast ? bcast.lhs_offset[k] : k;
const int64_t rhs_add = bcast.use_bcast ? bcast.rhs_offset[k] : k;
const DType* lhs_off = Op::use_lhs ?
X + Selector<LhsTarget>::Call(rid, eid, cid) * lhs_dim + lhs_add * reduce_size : nullptr;
const DType* rhs_off = Op::use_rhs ?
Y + Selector<RhsTarget>::Call(rid, eid, cid) * rhs_dim + rhs_add * reduce_size : nullptr;
const DType* lhs_off =
Op::use_lhs ? X + Selector<LhsTarget>::Call(rid, eid, cid) * lhs_dim +
lhs_add * reduce_size
: nullptr;
const DType* rhs_off =
Op::use_rhs ? Y + Selector<RhsTarget>::Call(rid, eid, cid) * rhs_dim +
rhs_add * reduce_size
: nullptr;
out_off[k] = Op::Call(lhs_off, rhs_off, bcast.reduce_size);
}
}
......@@ -110,12 +117,13 @@ void SDDMMCoo(const BcastOff& bcast,
namespace op {
//////////////////////////////// binary operators on CPU ////////////////////////////////
////////////////////////// binary operators on CPU /////////////////////////////
template <typename DType>
struct Add {
static constexpr bool use_lhs = true;
static constexpr bool use_rhs = true;
inline static DType Call(const DType* lhs_off, const DType* rhs_off, int64_t len = 1) {
inline static DType Call(
const DType* lhs_off, const DType* rhs_off, int64_t len = 1) {
return *lhs_off + *rhs_off;
}
};
......@@ -124,7 +132,8 @@ template <typename DType>
struct Sub {
static constexpr bool use_lhs = true;
static constexpr bool use_rhs = true;
inline static DType Call(const DType* lhs_off, const DType* rhs_off, int64_t len = 1) {
inline static DType Call(
const DType* lhs_off, const DType* rhs_off, int64_t len = 1) {
return *lhs_off - *rhs_off;
}
};
......@@ -133,7 +142,8 @@ template <typename DType>
struct Mul {
static constexpr bool use_lhs = true;
static constexpr bool use_rhs = true;
inline static DType Call(const DType* lhs_off, const DType* rhs_off, int64_t len = 1) {
inline static DType Call(
const DType* lhs_off, const DType* rhs_off, int64_t len = 1) {
return *lhs_off * *rhs_off;
}
};
......@@ -142,7 +152,8 @@ template <typename DType>
struct Div {
static constexpr bool use_lhs = true;
static constexpr bool use_rhs = true;
inline static DType Call(const DType* lhs_off, const DType* rhs_off, int64_t len = 1) {
inline static DType Call(
const DType* lhs_off, const DType* rhs_off, int64_t len = 1) {
return *lhs_off / *rhs_off;
}
};
......@@ -151,7 +162,8 @@ template <typename DType>
struct CopyLhs {
static constexpr bool use_lhs = true;
static constexpr bool use_rhs = false;
inline static DType Call(const DType* lhs_off, const DType*, int64_t len = 1) {
inline static DType Call(
const DType* lhs_off, const DType*, int64_t len = 1) {
return *lhs_off;
}
};
......@@ -160,7 +172,8 @@ template <typename DType>
struct CopyRhs {
static constexpr bool use_lhs = false;
static constexpr bool use_rhs = true;
inline static DType Call(const DType* , const DType* rhs_off, int64_t len = 1) {
inline static DType Call(
const DType*, const DType* rhs_off, int64_t len = 1) {
return *rhs_off;
}
};
......@@ -169,7 +182,8 @@ template <typename DType>
struct Dot {
static constexpr bool use_lhs = true;
static constexpr bool use_rhs = true;
inline static DType Call(const DType* lhs_off, const DType* rhs_off, int64_t len = 1) {
inline static DType Call(
const DType* lhs_off, const DType* rhs_off, int64_t len = 1) {
DType rst = 0;
for (int64_t l = 0; l < len; ++l) {
rst += lhs_off[l] * rhs_off[l];
......@@ -178,32 +192,32 @@ struct Dot {
}
};
#define SWITCH_OP(op, Op, ...) \
do { \
if ((op) == "add") { \
typedef dgl::aten::cpu::op::Add<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "sub") { \
typedef dgl::aten::cpu::op::Sub<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "mul") { \
typedef dgl::aten::cpu::op::Mul<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "div") { \
typedef dgl::aten::cpu::op::Div<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "copy_lhs") { \
typedef dgl::aten::cpu::op::CopyLhs<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "copy_rhs") { \
typedef dgl::aten::cpu::op::CopyRhs<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "dot") { \
typedef dgl::aten::cpu::op::Dot<DType> Op; \
{ __VA_ARGS__ } \
} else { \
LOG(FATAL) << "Unsupported SDDMM binary operator: " << op; \
} \
#define SWITCH_OP(op, Op, ...) \
do { \
if ((op) == "add") { \
typedef dgl::aten::cpu::op::Add<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "sub") { \
typedef dgl::aten::cpu::op::Sub<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "mul") { \
typedef dgl::aten::cpu::op::Mul<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "div") { \
typedef dgl::aten::cpu::op::Div<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "copy_lhs") { \
typedef dgl::aten::cpu::op::CopyLhs<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "copy_rhs") { \
typedef dgl::aten::cpu::op::CopyRhs<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "dot") { \
typedef dgl::aten::cpu::op::Dot<DType> Op; \
{ __VA_ARGS__ } \
} else { \
LOG(FATAL) << "Unsupported SDDMM binary operator: " << op; \
} \
} while (0)
} // namespace op
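// For illustration, a minimal sketch of how a caller can use SWITCH_OP to turn the
// runtime op string into a compile-time functor type. The helper name
// SDDMMCsrOpDispatch is hypothetical; the real dispatch lives in the .cc files that
// include this header.
template <typename IdType, typename DType>
void SDDMMCsrOpDispatch(
    const std::string& op, const BcastOff& bcast, const CSRMatrix& csr,
    NDArray lhs, NDArray rhs, NDArray out) {
  SWITCH_OP(op, Op, {
    // Here Op is a concrete functor such as op::Add<DType> or op::Dot<DType>,
    // so SDDMMCsr is instantiated once per supported operator.
    SDDMMCsr<IdType, DType, Op>(bcast, csr, lhs, rhs, out);
  });
}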
......
......@@ -7,10 +7,11 @@
#define DGL_ARRAY_CPU_SEGMENT_REDUCE_H_
#include <dgl/array.h>
#include <dgl/runtime/parallel_for.h>
#include <dgl/base_heterograph.h>
#include <vector>
#include <dgl/runtime/parallel_for.h>
#include <string>
#include <vector>
namespace dgl {
namespace aten {
......@@ -26,11 +27,10 @@ template <typename IdType, typename DType>
void SegmentSum(NDArray feat, NDArray offsets, NDArray out) {
int n = out->shape[0];
int dim = 1;
for (int i = 1; i < out->ndim; ++i)
dim *= out->shape[i];
for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i];
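// dim is the per-row feature size, i.e. the product of all trailing dimensions of out.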
const DType* feat_data = feat.Ptr<DType>();
const IdType* offsets_data = offsets.Ptr<IdType>();
DType *out_data = out.Ptr<DType>();
DType* out_data = out.Ptr<DType>();
runtime::parallel_for(0, n, [=](int b, int e) {
for (auto i = b; i < e; ++i) {
for (IdType j = offsets_data[i]; j < offsets_data[i + 1]; ++j) {
......@@ -51,16 +51,14 @@ void SegmentSum(NDArray feat, NDArray offsets, NDArray out) {
* used in backward phase.
*/
template <typename IdType, typename DType, typename Cmp>
void SegmentCmp(NDArray feat, NDArray offsets,
NDArray out, NDArray arg) {
void SegmentCmp(NDArray feat, NDArray offsets, NDArray out, NDArray arg) {
int n = out->shape[0];
int dim = 1;
for (int i = 1; i < out->ndim; ++i)
dim *= out->shape[i];
for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i];
const DType* feat_data = feat.Ptr<DType>();
const IdType* offsets_data = offsets.Ptr<IdType>();
DType *out_data = out.Ptr<DType>();
IdType *arg_data = arg.Ptr<IdType>();
DType* out_data = out.Ptr<DType>();
IdType* arg_data = arg.Ptr<IdType>();
std::fill(out_data, out_data + out.NumElements(), Cmp::zero);
std::fill(arg_data, arg_data + arg.NumElements(), -1);
runtime::parallel_for(0, n, [=](int b, int e) {
......@@ -89,8 +87,7 @@ template <typename IdType, typename DType>
void ScatterAdd(NDArray feat, NDArray idx, NDArray out) {
int n = feat->shape[0];
int dim = 1;
for (int i = 1; i < out->ndim; ++i)
dim *= out->shape[i];
for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i];
const DType* feat_data = feat.Ptr<DType>();
const IdType* idx_data = idx.Ptr<IdType>();
DType* out_data = out.Ptr<DType>();
......@@ -114,24 +111,26 @@ void ScatterAdd(NDArray feat, NDArray idx, NDArray out) {
* \param list_out List of the output tensors.
*/
template <typename IdType, typename DType>
void UpdateGradMinMax_hetero(HeteroGraphPtr graph,
const std::string& op,
const std::vector<NDArray>& list_feat,
const std::vector<NDArray>& list_idx,
const std::vector<NDArray>& list_idx_types,
std::vector<NDArray>* list_out) {
void UpdateGradMinMax_hetero(
HeteroGraphPtr graph, const std::string& op,
const std::vector<NDArray>& list_feat, const std::vector<NDArray>& list_idx,
const std::vector<NDArray>& list_idx_types,
std::vector<NDArray>* list_out) {
if (op == "copy_lhs" || op == "copy_rhs") {
std::vector<std::vector<dgl_id_t>> src_dst_ntypes(graph->NumVertexTypes(),
std::vector<dgl_id_t>());
std::vector<std::vector<dgl_id_t>> src_dst_ntypes(
graph->NumVertexTypes(), std::vector<dgl_id_t>());
for (dgl_type_t etype = 0; etype < graph->NumEdgeTypes(); ++etype) {
auto pair = graph->meta_graph()->FindEdge(etype);
const dgl_id_t dst_ntype = pair.first; // graph is reversed
const dgl_id_t src_ntype = pair.second;
auto same_src_dst_ntype = std::find(std::begin(src_dst_ntypes[dst_ntype]),
std::end(src_dst_ntypes[dst_ntype]), src_ntype);
// if op is "copy_lhs", relation type with same src and dst node type will be updated once
if (op == "copy_lhs" && same_src_dst_ntype != std::end(src_dst_ntypes[dst_ntype]))
auto same_src_dst_ntype = std::find(
std::begin(src_dst_ntypes[dst_ntype]),
std::end(src_dst_ntypes[dst_ntype]), src_ntype);
// if op is "copy_lhs", relation type with same src and dst node type will
// be updated once
if (op == "copy_lhs" &&
same_src_dst_ntype != std::end(src_dst_ntypes[dst_ntype]))
continue;
src_dst_ntypes[dst_ntype].push_back(src_ntype);
const DType* feat_data = list_feat[dst_ntype].Ptr<DType>();
......@@ -149,7 +148,8 @@ void UpdateGradMinMax_hetero(HeteroGraphPtr graph,
if (type == idx_type_data[i * dim + k]) {
const int write_row = idx_data[i * dim + k];
#pragma omp atomic
out_data[write_row * dim + k] += feat_data[i * dim + k]; // feat = dZ
out_data[write_row * dim + k] +=
feat_data[i * dim + k]; // feat = dZ
}
}
}
......@@ -170,8 +170,7 @@ template <typename IdType, typename DType>
void BackwardSegmentCmp(NDArray feat, NDArray arg, NDArray out) {
int n = feat->shape[0];
int dim = 1;
for (int i = 1; i < out->ndim; ++i)
dim *= out->shape[i];
for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i];
const DType* feat_data = feat.Ptr<DType>();
const IdType* arg_data = arg.Ptr<IdType>();
DType* out_data = out.Ptr<DType>();
......
......@@ -4,58 +4,49 @@
* \brief Graph traversal implementation
*/
#include "./traversal.h"
#include <dgl/graph_traversal.h>
#include <algorithm>
#include <queue>
#include "./traversal.h"
namespace dgl {
namespace aten {
namespace impl {
namespace {
// A utility view class to wrap a vector into a queue.
template<typename DType>
template <typename DType>
struct VectorQueueWrapper {
std::vector<DType>* vec;
size_t head = 0;
explicit VectorQueueWrapper(std::vector<DType>* vec): vec(vec) {}
explicit VectorQueueWrapper(std::vector<DType>* vec) : vec(vec) {}
void push(const DType& elem) {
vec->push_back(elem);
}
void push(const DType& elem) { vec->push_back(elem); }
DType top() const {
return vec->operator[](head);
}
DType top() const { return vec->operator[](head); }
void pop() {
++head;
}
void pop() { ++head; }
bool empty() const {
return head == vec->size();
}
bool empty() const { return head == vec->size(); }
size_t size() const {
return vec->size() - head;
}
size_t size() const { return vec->size() - head; }
};
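// For illustration, a minimal usage sketch: the wrapper exposes queue semantics but
// never erases elements, so after a traversal the backing vector holds the full
// visitation order.
//
//   std::vector<int64_t> trace;
//   VectorQueueWrapper<int64_t> queue(&trace);
//   queue.push(1);
//   queue.push(2);
//   queue.pop();    // head advances past 1; trace still holds {1, 2}
//   // queue.top() == 2 and queue.size() == 1, while trace.size() == 2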
// Internal function to merge multiple traversal traces into one ndarray.
// It is similar to zip the vectors together.
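// For example, traces {{1, 2, 3}, {4, 5}} are merged into [1, 4, 2, 5, 3].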
template<typename DType>
IdArray MergeMultipleTraversals(
const std::vector<std::vector<DType>>& traces) {
template <typename DType>
IdArray MergeMultipleTraversals(const std::vector<std::vector<DType>>& traces) {
int64_t max_len = 0, total_len = 0;
for (size_t i = 0; i < traces.size(); ++i) {
const int64_t tracelen = traces[i].size();
max_len = std::max(max_len, tracelen);
total_len += traces[i].size();
}
IdArray ret = IdArray::Empty({total_len},
DGLDataType{kDGLInt, sizeof(DType) * 8, 1},
DGLContext{kDGLCPU, 0});
IdArray ret = IdArray::Empty(
{total_len}, DGLDataType{kDGLInt, sizeof(DType) * 8, 1},
DGLContext{kDGLCPU, 0});
DType* ret_data = static_cast<DType*>(ret->data);
for (int64_t i = 0; i < max_len; ++i) {
for (size_t j = 0; j < traces.size(); ++j) {
......@@ -71,15 +62,15 @@ IdArray MergeMultipleTraversals(
// Internal function to compute sections if multiple traversal traces
// are merged into one ndarray.
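// For example, traces of lengths {3, 2} yield sections [2, 2, 1].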
template<typename DType>
IdArray ComputeMergedSections(
const std::vector<std::vector<DType>>& traces) {
template <typename DType>
IdArray ComputeMergedSections(const std::vector<std::vector<DType>>& traces) {
int64_t max_len = 0;
for (size_t i = 0; i < traces.size(); ++i) {
const int64_t tracelen = traces[i].size();
max_len = std::max(max_len, tracelen);
}
IdArray ret = IdArray::Empty({max_len}, DGLDataType{kDGLInt, 64, 1}, DGLContext{kDGLCPU, 0});
IdArray ret = IdArray::Empty(
{max_len}, DGLDataType{kDGLInt, 64, 1}, DGLContext{kDGLCPU, 0});
int64_t* ret_data = static_cast<int64_t*>(ret->data);
for (int64_t i = 0; i < max_len; ++i) {
int64_t sec_len = 0;
......@@ -101,13 +92,13 @@ Frontiers BFSNodesFrontiers(const CSRMatrix& csr, IdArray source) {
std::vector<IdType> ids;
std::vector<int64_t> sections;
VectorQueueWrapper<IdType> queue(&ids);
auto visit = [&] (const int64_t v) { };
auto make_frontier = [&] () {
if (!queue.empty()) {
// do not push zero-length frontier
sections.push_back(queue.size());
}
};
auto visit = [&](const int64_t v) {};
auto make_frontier = [&]() {
if (!queue.empty()) {
// do not push zero-length frontier
sections.push_back(queue.size());
}
};
BFSTraverseNodes<IdType>(csr, source, &queue, visit, make_frontier);
Frontiers front;
......@@ -116,8 +107,10 @@ Frontiers BFSNodesFrontiers(const CSRMatrix& csr, IdArray source) {
return front;
}
template Frontiers BFSNodesFrontiers<kDGLCPU, int32_t>(const CSRMatrix&, IdArray);
template Frontiers BFSNodesFrontiers<kDGLCPU, int64_t>(const CSRMatrix&, IdArray);
template Frontiers BFSNodesFrontiers<kDGLCPU, int32_t>(
const CSRMatrix&, IdArray);
template Frontiers BFSNodesFrontiers<kDGLCPU, int64_t>(
const CSRMatrix&, IdArray);
template <DGLDeviceType XPU, typename IdType>
Frontiers BFSEdgesFrontiers(const CSRMatrix& csr, IdArray source) {
......@@ -126,16 +119,16 @@ Frontiers BFSEdgesFrontiers(const CSRMatrix& csr, IdArray source) {
// NOTE: std::queue has no top() method.
std::vector<IdType> nodes;
VectorQueueWrapper<IdType> queue(&nodes);
auto visit = [&] (const IdType e) { ids.push_back(e); };
auto visit = [&](const IdType e) { ids.push_back(e); };
bool first_frontier = true;
auto make_frontier = [&] {
if (first_frontier) {
first_frontier = false; // do not push the first section when doing edges
} else if (!queue.empty()) {
// do not push zero-length frontier
sections.push_back(queue.size());
}
};
if (first_frontier) {
first_frontier = false; // do not push the first section when doing edges
} else if (!queue.empty()) {
// do not push zero-length frontier
sections.push_back(queue.size());
}
};
BFSTraverseEdges<IdType>(csr, source, &queue, visit, make_frontier);
Frontiers front;
......@@ -144,21 +137,23 @@ Frontiers BFSEdgesFrontiers(const CSRMatrix& csr, IdArray source) {
return front;
}
template Frontiers BFSEdgesFrontiers<kDGLCPU, int32_t>(const CSRMatrix&, IdArray);
template Frontiers BFSEdgesFrontiers<kDGLCPU, int64_t>(const CSRMatrix&, IdArray);
template Frontiers BFSEdgesFrontiers<kDGLCPU, int32_t>(
const CSRMatrix&, IdArray);
template Frontiers BFSEdgesFrontiers<kDGLCPU, int64_t>(
const CSRMatrix&, IdArray);
template <DGLDeviceType XPU, typename IdType>
Frontiers TopologicalNodesFrontiers(const CSRMatrix& csr) {
std::vector<IdType> ids;
std::vector<int64_t> sections;
VectorQueueWrapper<IdType> queue(&ids);
auto visit = [&] (const uint64_t v) { };
auto make_frontier = [&] () {
if (!queue.empty()) {
// do not push zero-length frontier
sections.push_back(queue.size());
}
};
auto visit = [&](const uint64_t v) {};
auto make_frontier = [&]() {
if (!queue.empty()) {
// do not push zero-length frontier
sections.push_back(queue.size());
}
};
TopologicalNodes<IdType>(csr, &queue, visit, make_frontier);
Frontiers front;
......@@ -167,8 +162,10 @@ Frontiers TopologicalNodesFrontiers(const CSRMatrix& csr) {
return front;
}
template Frontiers TopologicalNodesFrontiers<kDGLCPU, int32_t>(const CSRMatrix&);
template Frontiers TopologicalNodesFrontiers<kDGLCPU, int64_t>(const CSRMatrix&);
template Frontiers TopologicalNodesFrontiers<kDGLCPU, int32_t>(
const CSRMatrix&);
template Frontiers TopologicalNodesFrontiers<kDGLCPU, int64_t>(
const CSRMatrix&);
template <DGLDeviceType XPU, typename IdType>
Frontiers DGLDFSEdges(const CSRMatrix& csr, IdArray source) {
......@@ -177,7 +174,7 @@ Frontiers DGLDFSEdges(const CSRMatrix& csr, IdArray source) {
std::vector<std::vector<IdType>> edges(len);
for (int64_t i = 0; i < len; ++i) {
auto visit = [&] (IdType e, int tag) { edges[i].push_back(e); };
auto visit = [&](IdType e, int tag) { edges[i].push_back(e); };
DFSLabeledEdges<IdType>(csr, src_data[i], false, false, visit);
}
......@@ -191,11 +188,9 @@ template Frontiers DGLDFSEdges<kDGLCPU, int32_t>(const CSRMatrix&, IdArray);
template Frontiers DGLDFSEdges<kDGLCPU, int64_t>(const CSRMatrix&, IdArray);
template <DGLDeviceType XPU, typename IdType>
Frontiers DGLDFSLabeledEdges(const CSRMatrix& csr,
IdArray source,
const bool has_reverse_edge,
const bool has_nontree_edge,
const bool return_labels) {
Frontiers DGLDFSLabeledEdges(
const CSRMatrix& csr, IdArray source, const bool has_reverse_edge,
const bool has_nontree_edge, const bool return_labels) {
const int64_t len = source->shape[0];
const IdType* src_data = static_cast<IdType*>(source->data);
std::vector<std::vector<IdType>> edges(len);
......@@ -206,14 +201,14 @@ Frontiers DGLDFSLabeledEdges(const CSRMatrix& csr,
}
for (int64_t i = 0; i < len; ++i) {
auto visit = [&] (IdType e, int64_t tag) {
auto visit = [&](IdType e, int64_t tag) {
edges[i].push_back(e);
if (return_labels) {
tags[i].push_back(tag);
}
};
DFSLabeledEdges<IdType>(csr, src_data[i],
has_reverse_edge, has_nontree_edge, visit);
DFSLabeledEdges<IdType>(
csr, src_data[i], has_reverse_edge, has_nontree_edge, visit);
}
Frontiers front;
......@@ -226,16 +221,10 @@ Frontiers DGLDFSLabeledEdges(const CSRMatrix& csr,
return front;
}
template Frontiers DGLDFSLabeledEdges<kDGLCPU, int32_t>(const CSRMatrix&,
IdArray,
const bool,
const bool,
const bool);
template Frontiers DGLDFSLabeledEdges<kDGLCPU, int64_t>(const CSRMatrix&,
IdArray,
const bool,
const bool,
const bool);
template Frontiers DGLDFSLabeledEdges<kDGLCPU, int32_t>(
const CSRMatrix&, IdArray, const bool, const bool, const bool);
template Frontiers DGLDFSLabeledEdges<kDGLCPU, int64_t>(
const CSRMatrix&, IdArray, const bool, const bool, const bool);
} // namespace impl
} // namespace aten
......
......@@ -3,15 +3,16 @@
* \file array/cpu/traversal.h
* \brief Graph traversal routines.
*
* Traversal routines generate frontiers. Frontiers can be node frontiers or edge
* frontiers depending on the traversal function. Each frontier is a
* list of nodes/edges (specified by their ids). An optional tag can be specified
* for each node/edge (represented by an int value).
* Traversal routines generate frontiers. Frontiers can be node frontiers or
* edge frontiers depending on the traversal function. Each frontier is a list
* of nodes/edges (specified by their ids). An optional tag can be specified for
* each node/edge (represented by an int value).
*/
#ifndef DGL_ARRAY_CPU_TRAVERSAL_H_
#define DGL_ARRAY_CPU_TRAVERSAL_H_
#include <dgl/graph_interface.h>
#include <stack>
#include <tuple>
#include <vector>
......@@ -43,16 +44,16 @@ namespace impl {
* \param reversed If true, BFS follows the in-edge direction
* \param queue The queue used to do bfs.
* \param visit The function to call when a node is visited.
* \param make_frontier The function to indicate that a new frontier can be made;
* \param make_frontier The function to indicate that a new frontier can be
* made;
*/
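// For example, a BFS from node 0 over edges 0->1, 0->2, 1->3 produces the node
// frontiers [0], [1, 2], [3].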
template<typename IdType, typename Queue, typename VisitFn, typename FrontierFn>
void BFSTraverseNodes(const CSRMatrix& csr,
IdArray source,
Queue* queue,
VisitFn visit,
FrontierFn make_frontier) {
template <
typename IdType, typename Queue, typename VisitFn, typename FrontierFn>
void BFSTraverseNodes(
const CSRMatrix &csr, IdArray source, Queue *queue, VisitFn visit,
FrontierFn make_frontier) {
const int64_t len = source->shape[0];
const IdType *src_data = static_cast<IdType*>(source->data);
const IdType *src_data = static_cast<IdType *>(source->data);
const IdType *indptr_data = static_cast<IdType *>(csr.indptr->data);
const IdType *indices_data = static_cast<IdType *>(csr.indices->data);
......@@ -71,7 +72,7 @@ void BFSTraverseNodes(const CSRMatrix& csr,
for (size_t i = 0; i < size; ++i) {
const IdType u = queue->top();
queue->pop();
for (auto idx = indptr_data[u]; idx < indptr_data[u+1]; ++idx) {
for (auto idx = indptr_data[u]; idx < indptr_data[u + 1]; ++idx) {
auto v = indices_data[idx];
if (!visited[v]) {
visited[v] = true;
......@@ -109,16 +110,16 @@ void BFSTraverseNodes(const CSRMatrix& csr,
* \param queue The queue used to do bfs.
* \param visit The function to call when a node is visited.
* The argument would be edge ID.
* \param make_frontier The function to indicate that a new frontier can be made;
* \param make_frontier The function to indicate that a new frontier can be
* made;
*/
template<typename IdType, typename Queue, typename VisitFn, typename FrontierFn>
void BFSTraverseEdges(const CSRMatrix& csr,
IdArray source,
Queue* queue,
VisitFn visit,
FrontierFn make_frontier) {
template <
typename IdType, typename Queue, typename VisitFn, typename FrontierFn>
void BFSTraverseEdges(
const CSRMatrix &csr, IdArray source, Queue *queue, VisitFn visit,
FrontierFn make_frontier) {
const int64_t len = source->shape[0];
const IdType* src_data = static_cast<IdType*>(source->data);
const IdType *src_data = static_cast<IdType *>(source->data);
const IdType *indptr_data = static_cast<IdType *>(csr.indptr->data);
const IdType *indices_data = static_cast<IdType *>(csr.indices->data);
......@@ -138,7 +139,7 @@ void BFSTraverseEdges(const CSRMatrix& csr,
for (size_t i = 0; i < size; ++i) {
const IdType u = queue->top();
queue->pop();
for (auto idx = indptr_data[u]; idx < indptr_data[u+1]; ++idx) {
for (auto idx = indptr_data[u]; idx < indptr_data[u + 1]; ++idx) {
auto e = eid_data ? eid_data[idx] : idx;
const IdType v = indices_data[idx];
if (!visited[v]) {
......@@ -174,13 +175,14 @@ void BFSTraverseEdges(const CSRMatrix& csr,
* \param reversed If true, follows the in-edge direction
* \param queue The queue used to do bfs.
* \param visit The function to call when a node is visited.
* \param make_frontier The function to indicate that a new frontier can be made;
* \param make_frontier The function to indicate that a new frontier can be
* made;
*/
template<typename IdType, typename Queue, typename VisitFn, typename FrontierFn>
void TopologicalNodes(const CSRMatrix& csr,
Queue* queue,
VisitFn visit,
FrontierFn make_frontier) {
template <
typename IdType, typename Queue, typename VisitFn, typename FrontierFn>
void TopologicalNodes(
const CSRMatrix &csr, Queue *queue, VisitFn visit,
FrontierFn make_frontier) {
int64_t num_visited_nodes = 0;
const IdType *indptr_data = static_cast<IdType *>(csr.indptr->data);
const IdType *indices_data = static_cast<IdType *>(csr.indices->data);
......@@ -206,7 +208,7 @@ void TopologicalNodes(const CSRMatrix& csr,
for (size_t i = 0; i < size; ++i) {
const IdType u = queue->top();
queue->pop();
for (auto idx = indptr_data[u]; idx < indptr_data[u+1]; ++idx) {
for (auto idx = indptr_data[u]; idx < indptr_data[u + 1]; ++idx) {
const IdType v = indices_data[idx];
if (--(degrees[v]) == 0) {
visit(v);
......@@ -219,7 +221,8 @@ void TopologicalNodes(const CSRMatrix& csr,
}
if (num_visited_nodes != num_nodes) {
LOG(FATAL) << "Error in topological traversal: loop detected in the given graph.";
LOG(FATAL)
<< "Error in topological traversal: loop detected in the given graph.";
}
}
......@@ -236,32 +239,29 @@ enum DFSEdgeTag {
* FORWARD(0), REVERSE(1), NONTREE(2)
*
* A FORWARD edge is one in which `u` has been visited but `v` has not.
* A REVERSE edge is one in which both `u` and `v` have been visited and the edge
* is in the DFS tree.
* A NONTREE edge is one in which both `u` and `v` have been visited but the edge
* is NOT in the DFS tree.
* A REVERSE edge is one in which both `u` and `v` have been visited and the
* edge is in the DFS tree. A NONTREE edge is one in which both `u` and `v` have
* been visited but the edge is NOT in the DFS tree.
*
* \param source Source node.
* \param reversed If true, DFS follows the in-edge direction
* \param has_reverse_edge If true, REVERSE edges are included
* \param has_nontree_edge If true, NONTREE edges are included
* \param visit The function to call when an edge is visited; the edge id and its
* tag will be given as the arguments.
* \param visit The function to call when an edge is visited; the edge id and
* its tag will be given as the arguments.
*/
template<typename IdType, typename VisitFn>
void DFSLabeledEdges(const CSRMatrix& csr,
IdType source,
bool has_reverse_edge,
bool has_nontree_edge,
VisitFn visit) {
template <typename IdType, typename VisitFn>
void DFSLabeledEdges(
const CSRMatrix &csr, IdType source, bool has_reverse_edge,
bool has_nontree_edge, VisitFn visit) {
const int64_t num_nodes = csr.num_rows;
CHECK_GE(num_nodes, source) << "source " << source <<
" is out of range [0," << num_nodes << "]";
CHECK_GE(num_nodes, source)
<< "source " << source << " is out of range [0," << num_nodes << "]";
const IdType *indptr_data = static_cast<IdType *>(csr.indptr->data);
const IdType *indices_data = static_cast<IdType *>(csr.indices->data);
const IdType *eid_data = static_cast<IdType *>(csr.data->data);
if (indptr_data[source+1]-indptr_data[source] == 0) {
if (indptr_data[source + 1] - indptr_data[source] == 0) {
// no out-going edges from the source node
return;
}
......@@ -278,7 +278,8 @@ void DFSLabeledEdges(const CSRMatrix& csr,
while (!stack.empty()) {
std::tie(u, i, on_tree) = stack.top();
const IdType v = indices_data[indptr_data[u] + i];
const IdType uv = eid_data ? eid_data[indptr_data[u] + i] : indptr_data[u] + i;
const IdType uv =
eid_data ? eid_data[indptr_data[u] + i] : indptr_data[u] + i;
if (visited[v]) {
if (!on_tree && has_nontree_edge) {
visit(uv, kNonTree);
......@@ -288,7 +289,7 @@ void DFSLabeledEdges(const CSRMatrix& csr,
stack.pop();
// find next one.
if (indptr_data[u] + i < indptr_data[u + 1] - 1) {
stack.push(std::make_tuple(u, i+1, false));
stack.push(std::make_tuple(u, i + 1, false));
}
} else {
visited[v] = true;
......
......@@ -4,9 +4,10 @@
* \brief Array cumsum GPU implementation
*/
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
#include "./dgl_cub.cuh"
#include "./utils.h"
namespace dgl {
using runtime::NDArray;
......@@ -17,7 +18,8 @@ template <DGLDeviceType XPU, typename IdType>
IdArray CumSum(IdArray array, bool prepend_zero) {
const int64_t len = array.NumElements();
if (len == 0)
return !prepend_zero ? array : aten::Full(0, 1, array->dtype.bits, array->ctx);
return !prepend_zero ? array
: aten::Full(0, 1, array->dtype.bits, array->ctx);
auto device = runtime::DeviceAPI::Get(array->ctx);
cudaStream_t stream = runtime::getCurrentCUDAStream();
......
......@@ -5,9 +5,10 @@
*/
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
#include "./dgl_cub.cuh"
#include "./utils.h"
namespace dgl {
using runtime::NDArray;
......@@ -16,14 +17,11 @@ namespace impl {
template <typename IdType>
struct IsNonZeroIndex {
explicit IsNonZeroIndex(const IdType * array) : array_(array) {
}
explicit IsNonZeroIndex(const IdType* array) : array_(array) {}
__device__ bool operator() (const int64_t index) {
return array_[index] != 0;
}
__device__ bool operator()(const int64_t index) { return array_[index] != 0; }
const IdType * array_;
const IdType* array_;
};
template <DGLDeviceType XPU, typename IdType>
......@@ -36,22 +34,23 @@ IdArray NonZero(IdArray array) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
const IdType * const in_data = static_cast<const IdType*>(array->data);
int64_t * const out_data = static_cast<int64_t*>(ret->data);
const IdType* const in_data = static_cast<const IdType*>(array->data);
int64_t* const out_data = static_cast<int64_t*>(ret->data);
IsNonZeroIndex<IdType> comp(in_data);
cub::CountingInputIterator<int64_t> counter(0);
// room for cub to output on GPU
int64_t * d_num_nonzeros = static_cast<int64_t*>(
device->AllocWorkspace(ctx, sizeof(int64_t)));
int64_t* d_num_nonzeros =
static_cast<int64_t*>(device->AllocWorkspace(ctx, sizeof(int64_t)));
size_t temp_size = 0;
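// cub convention: the first DeviceSelect::If call (null temp storage) only computes
// temp_size; the second call with the allocated workspace performs the selection.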
CUDA_CALL(cub::DeviceSelect::If(nullptr, temp_size, counter, out_data,
d_num_nonzeros, len, comp, stream));
void * temp = device->AllocWorkspace(ctx, temp_size);
CUDA_CALL(cub::DeviceSelect::If(temp, temp_size, counter, out_data,
d_num_nonzeros, len, comp, stream));
CUDA_CALL(cub::DeviceSelect::If(
nullptr, temp_size, counter, out_data, d_num_nonzeros, len, comp,
stream));
void* temp = device->AllocWorkspace(ctx, temp_size);
CUDA_CALL(cub::DeviceSelect::If(
temp, temp_size, counter, out_data, d_num_nonzeros, len, comp, stream));
device->FreeWorkspace(ctx, temp);
// copy number of selected elements from GPU to CPU
......
......@@ -4,9 +4,10 @@
* \brief Array sort GPU implementation
*/
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
#include "./dgl_cub.cuh"
#include "./utils.h"
namespace dgl {
using runtime::NDArray;
......@@ -29,26 +30,30 @@ std::pair<IdArray, IdArray> Sort(IdArray array, int num_bits) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
if (num_bits == 0) {
num_bits = sizeof(IdType)*8;
num_bits = sizeof(IdType) * 8;
}
// Allocate workspace
size_t workspace_size = 0;
CUDA_CALL(cub::DeviceRadixSort::SortPairs(nullptr, workspace_size,
keys_in, keys_out, values_in, values_out, nitems, 0, num_bits, stream));
CUDA_CALL(cub::DeviceRadixSort::SortPairs(
nullptr, workspace_size, keys_in, keys_out, values_in, values_out, nitems,
0, num_bits, stream));
void* workspace = device->AllocWorkspace(ctx, workspace_size);
// Compute
CUDA_CALL(cub::DeviceRadixSort::SortPairs(workspace, workspace_size,
keys_in, keys_out, values_in, values_out, nitems, 0, num_bits, stream));
CUDA_CALL(cub::DeviceRadixSort::SortPairs(
workspace, workspace_size, keys_in, keys_out, values_in, values_out,
nitems, 0, num_bits, stream));
device->FreeWorkspace(ctx, workspace);
return std::make_pair(sorted_array, sorted_idx);
}
template std::pair<IdArray, IdArray> Sort<kDGLCUDA, int32_t>(IdArray, int num_bits);
template std::pair<IdArray, IdArray> Sort<kDGLCUDA, int64_t>(IdArray, int num_bits);
template std::pair<IdArray, IdArray> Sort<kDGLCUDA, int32_t>(
IdArray, int num_bits);
template std::pair<IdArray, IdArray> Sort<kDGLCUDA, int64_t>(
IdArray, int num_bits);
} // namespace impl
} // namespace aten
......
......@@ -4,6 +4,7 @@
* \brief COO2CSR
*/
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
......@@ -46,18 +47,15 @@ CSRMatrix COOToCSR<kDGLCUDA, int32_t>(COOMatrix coo) {
if (!COOHasData(coo))
coo.data = aten::Range(0, nnz, coo.row->dtype.bits, coo.row->ctx);
NDArray indptr = aten::NewIdArray(coo.num_rows + 1, coo.row->ctx, coo.row->dtype.bits);
NDArray indptr =
aten::NewIdArray(coo.num_rows + 1, coo.row->ctx, coo.row->dtype.bits);
int32_t* indptr_ptr = static_cast<int32_t*>(indptr->data);
CUSPARSE_CALL(cusparseXcoo2csr(
thr_entry->cusparse_handle,
coo.row.Ptr<int32_t>(),
nnz,
coo.num_rows,
indptr_ptr,
CUSPARSE_INDEX_BASE_ZERO));
return CSRMatrix(coo.num_rows, coo.num_cols,
indptr, coo.col, coo.data, col_sorted);
thr_entry->cusparse_handle, coo.row.Ptr<int32_t>(), nnz, coo.num_rows,
indptr_ptr, CUSPARSE_INDEX_BASE_ZERO));
return CSRMatrix(
coo.num_rows, coo.num_cols, indptr, coo.col, coo.data, col_sorted);
}
/*!
......@@ -77,9 +75,8 @@ CSRMatrix COOToCSR<kDGLCUDA, int32_t>(COOMatrix coo) {
*/
template <typename IdType>
__global__ void _SortedSearchKernelUpperBound(
const IdType* hay, int64_t hay_size,
const IdType* needles, int64_t num_needles,
IdType* pos) {
const IdType* hay, int64_t hay_size, const IdType* needles,
int64_t num_needles, IdType* pos) {
int tx = blockIdx.x * blockDim.x + threadIdx.x;
const int stride_x = gridDim.x * blockDim.x;
while (tx < num_needles) {
......@@ -123,14 +120,12 @@ CSRMatrix COOToCSR<kDGLCUDA, int64_t>(COOMatrix coo) {
const int nt = cuda::FindNumThreads(coo.num_rows);
const int nb = (coo.num_rows + nt - 1) / nt;
IdArray indptr = Full(0, coo.num_rows + 1, nbits, ctx);
CUDA_KERNEL_CALL(_SortedSearchKernelUpperBound,
nb, nt, 0, stream,
coo.row.Ptr<int64_t>(), nnz,
rowids.Ptr<int64_t>(), coo.num_rows,
indptr.Ptr<int64_t>() + 1);
return CSRMatrix(coo.num_rows, coo.num_cols,
indptr, coo.col, coo.data, col_sorted);
CUDA_KERNEL_CALL(
_SortedSearchKernelUpperBound, nb, nt, 0, stream, coo.row.Ptr<int64_t>(),
nnz, rowids.Ptr<int64_t>(), coo.num_rows, indptr.Ptr<int64_t>() + 1);
return CSRMatrix(
coo.num_rows, coo.num_cols, indptr, coo.col, coo.data, col_sorted);
}
template CSRMatrix COOToCSR<kDGLCUDA, int32_t>(COOMatrix coo);
......
......@@ -4,8 +4,9 @@
* \brief Sort COO index
*/
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
#include "../../c_api_common.h"
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
namespace dgl {
......@@ -18,21 +19,20 @@ namespace impl {
///////////////////////////// COOSort_ /////////////////////////////
/**
* @brief Encode row and column IDs into a single scalar per edge.
*
* @tparam IdType The type to encode as.
* @param row The row (src) IDs per edge.
* @param col The column (dst) IDs per edge.
* @param nnz The number of edges.
* @param col_bits The number of bits used to encode the destination. The row
* information is packed into the remaining bits.
* @param key The encoded edges (output).
*/
* @brief Encode row and column IDs into a single scalar per edge.
*
* @tparam IdType The type to encode as.
* @param row The row (src) IDs per edge.
* @param col The column (dst) IDs per edge.
* @param nnz The number of edges.
* @param col_bits The number of bits used to encode the destination. The row
* information is packed into the remaining bits.
* @param key The encoded edges (output).
*/
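// Conceptually the key is packed as (row << col_bits) | col, so sorting the keys
// orders edges by row and then by column; the exact expression is in the kernel body
// below (not shown in this hunk).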
template <typename IdType>
__global__ void _COOEncodeEdgesKernel(
const IdType* const row, const IdType* const col,
const int64_t nnz, const int col_bits, IdType * const key) {
const IdType* const row, const IdType* const col, const int64_t nnz,
const int col_bits, IdType* const key) {
int64_t tx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
if (tx < nnz) {
......@@ -41,20 +41,19 @@ __global__ void _COOEncodeEdgesKernel(
}
/**
* @brief Decode row and column IDs from the encoded edges.
*
* @tparam IdType The type the edges are encoded as.
* @param key The encoded edges.
* @param nnz The number of edges.
* @param col_bits The number of bits used to store the column/dst ID.
* @param row The row (src) IDs per edge (output).
* @param col The col (dst) IDs per edge (output).
*/
* @brief Decode row and column IDs from the encoded edges.
*
* @tparam IdType The type the edges are encoded as.
* @param key The encoded edges.
* @param nnz The number of edges.
* @param col_bits The number of bits used to store the column/dst ID.
* @param row The row (src) IDs per edge (output).
* @param col The col (dst) IDs per edge (output).
*/
template <typename IdType>
__global__ void _COODecodeEdgesKernel(
const IdType* const key, const int64_t nnz, const int col_bits,
IdType * const row, IdType * const col) {
IdType* const row, IdType* const col) {
int64_t tx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
if (tx < nnz) {
......@@ -64,9 +63,7 @@ __global__ void _COODecodeEdgesKernel(
}
}
template<typename T>
template <typename T>
int _NumberOfBits(const T& range) {
if (range <= 1) {
// ranges of 0 or 1 require no bits to store
......@@ -74,12 +71,12 @@ int _NumberOfBits(const T& range) {
}
int bits = 1;
while (bits < static_cast<int>(sizeof(T)*8) && (1 << bits) < range) {
while (bits < static_cast<int>(sizeof(T) * 8) && (1 << bits) < range) {
++bits;
}
CHECK_EQ((range-1) >> bits, 0);
CHECK_NE((range-1) >> (bits-1), 0);
CHECK_EQ((range - 1) >> bits, 0);
CHECK_NE((range - 1) >> (bits - 1), 0);
return bits;
}
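// For example, _NumberOfBits(5) == 3: the values 0..4 fit in 3 bits.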
......@@ -95,20 +92,20 @@ void COOSort_(COOMatrix* coo, bool sort_column) {
const int num_bits = row_bits + col_bits;
const int nt = 256;
const int nb = (nnz+nt-1)/nt;
CHECK(static_cast<int64_t>(nb)*nt >= nnz);
const int nb = (nnz + nt - 1) / nt;
CHECK(static_cast<int64_t>(nb) * nt >= nnz);
IdArray pos = aten::NewIdArray(nnz, coo->row->ctx, coo->row->dtype.bits);
CUDA_KERNEL_CALL(_COOEncodeEdgesKernel, nb, nt, 0, stream,
coo->row.Ptr<IdType>(), coo->col.Ptr<IdType>(),
nnz, col_bits, pos.Ptr<IdType>());
CUDA_KERNEL_CALL(
_COOEncodeEdgesKernel, nb, nt, 0, stream, coo->row.Ptr<IdType>(),
coo->col.Ptr<IdType>(), nnz, col_bits, pos.Ptr<IdType>());
auto sorted = Sort(pos, num_bits);
CUDA_KERNEL_CALL(_COODecodeEdgesKernel, nb, nt, 0, stream,
sorted.first.Ptr<IdType>(), nnz, col_bits,
coo->row.Ptr<IdType>(), coo->col.Ptr<IdType>());
CUDA_KERNEL_CALL(
_COODecodeEdgesKernel, nb, nt, 0, stream, sorted.first.Ptr<IdType>(),
nnz, col_bits, coo->row.Ptr<IdType>(), coo->col.Ptr<IdType>());
if (aten::COOHasData(*coo))
coo->data = IndexSelect(coo->data, sorted.second);
......@@ -138,8 +135,8 @@ template void COOSort_<kDGLCUDA, int64_t>(COOMatrix* coo, bool sort_column);
template <typename IdType>
__global__ void _COOIsSortedKernel(
const IdType* row, const IdType* col,
int64_t nnz, int8_t* row_sorted, int8_t* col_sorted) {
const IdType* row, const IdType* col, int64_t nnz, int8_t* row_sorted,
int8_t* col_sorted) {
int tx = blockIdx.x * blockDim.x + threadIdx.x;
const int stride_x = gridDim.x * blockDim.x;
while (tx < nnz) {
......@@ -148,8 +145,8 @@ __global__ void _COOIsSortedKernel(
col_sorted[0] = 1;
} else {
row_sorted[tx] = static_cast<int8_t>(row[tx - 1] <= row[tx]);
col_sorted[tx] = static_cast<int8_t>(
row[tx - 1] < row[tx] || col[tx - 1] <= col[tx]);
col_sorted[tx] =
static_cast<int8_t>(row[tx - 1] < row[tx] || col[tx - 1] <= col[tx]);
}
tx += stride_x;
}
......@@ -161,18 +158,19 @@ std::pair<bool, bool> COOIsSorted(COOMatrix coo) {
const auto& ctx = coo.row->ctx;
cudaStream_t stream = runtime::getCurrentCUDAStream();
auto device = runtime::DeviceAPI::Get(ctx);
// We allocate a workspace of 2*nnz bytes. It wastes a little bit memory but should
// be fine.
// We allocate a workspace of 2*nnz bytes. It wastes a little bit memory but
// should be fine.
int8_t* row_flags = static_cast<int8_t*>(device->AllocWorkspace(ctx, nnz));
int8_t* col_flags = static_cast<int8_t*>(device->AllocWorkspace(ctx, nnz));
const int nt = cuda::FindNumThreads(nnz);
const int nb = (nnz + nt - 1) / nt;
CUDA_KERNEL_CALL(_COOIsSortedKernel, nb, nt, 0, stream,
coo.row.Ptr<IdType>(), coo.col.Ptr<IdType>(),
nnz, row_flags, col_flags);
CUDA_KERNEL_CALL(
_COOIsSortedKernel, nb, nt, 0, stream, coo.row.Ptr<IdType>(),
coo.col.Ptr<IdType>(), nnz, row_flags, col_flags);
const bool row_sorted = cuda::AllTrue(row_flags, nnz, ctx);
const bool col_sorted = row_sorted? cuda::AllTrue(col_flags, nnz, ctx) : false;
const bool col_sorted =
row_sorted ? cuda::AllTrue(col_flags, nnz, ctx) : false;
device->FreeWorkspace(ctx, row_flags);
device->FreeWorkspace(ctx, col_flags);
......
......@@ -4,6 +4,7 @@
* \brief CSR2COO
*/
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
......@@ -32,20 +33,16 @@ COOMatrix CSRToCOO<kDGLCUDA, int32_t>(CSRMatrix csr) {
NDArray indptr = csr.indptr, indices = csr.indices, data = csr.data;
const int32_t* indptr_ptr = static_cast<int32_t*>(indptr->data);
NDArray row = aten::NewIdArray(indices->shape[0], indptr->ctx, indptr->dtype.bits);
NDArray row =
aten::NewIdArray(indices->shape[0], indptr->ctx, indptr->dtype.bits);
int32_t* row_ptr = static_cast<int32_t*>(row->data);
CUSPARSE_CALL(cusparseXcsr2coo(
thr_entry->cusparse_handle,
indptr_ptr,
indices->shape[0],
csr.num_rows,
row_ptr,
CUSPARSE_INDEX_BASE_ZERO));
return COOMatrix(csr.num_rows, csr.num_cols,
row, indices, data,
true, csr.sorted);
thr_entry->cusparse_handle, indptr_ptr, indices->shape[0], csr.num_rows,
row_ptr, CUSPARSE_INDEX_BASE_ZERO));
return COOMatrix(
csr.num_rows, csr.num_cols, row, indices, data, true, csr.sorted);
}
/*!
......@@ -65,8 +62,8 @@ COOMatrix CSRToCOO<kDGLCUDA, int32_t>(CSRMatrix csr) {
*/
template <typename DType, typename IdType>
__global__ void _RepeatKernel(
const DType* val, const IdType* pos,
DType* out, int64_t n_row, int64_t length) {
const DType* val, const IdType* pos, DType* out, int64_t n_row,
int64_t length) {
IdType tx = static_cast<IdType>(blockIdx.x) * blockDim.x + threadIdx.x;
const int stride_x = gridDim.x * blockDim.x;
while (tx < length) {
......@@ -88,15 +85,13 @@ COOMatrix CSRToCOO<kDGLCUDA, int64_t>(CSRMatrix csr) {
const int nt = 256;
const int nb = (nnz + nt - 1) / nt;
CUDA_KERNEL_CALL(_RepeatKernel,
nb, nt, 0, stream,
rowids.Ptr<int64_t>(),
csr.indptr.Ptr<int64_t>(), ret_row.Ptr<int64_t>(),
csr.num_rows, nnz);
return COOMatrix(csr.num_rows, csr.num_cols,
ret_row, csr.indices, csr.data,
true, csr.sorted);
CUDA_KERNEL_CALL(
_RepeatKernel, nb, nt, 0, stream, rowids.Ptr<int64_t>(),
csr.indptr.Ptr<int64_t>(), ret_row.Ptr<int64_t>(), csr.num_rows, nnz);
return COOMatrix(
csr.num_rows, csr.num_cols, ret_row, csr.indices, csr.data, true,
csr.sorted);
}
template COOMatrix CSRToCOO<kDGLCUDA, int32_t>(CSRMatrix csr);
......@@ -111,8 +106,7 @@ COOMatrix CSRToCOODataAsOrder(CSRMatrix csr) {
template <>
COOMatrix CSRToCOODataAsOrder<kDGLCUDA, int32_t>(CSRMatrix csr) {
COOMatrix coo = CSRToCOO<kDGLCUDA, int32_t>(csr);
if (aten::IsNullArray(coo.data))
return coo;
if (aten::IsNullArray(coo.data)) return coo;
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
auto device = runtime::DeviceAPI::Get(coo.row->ctx);
......@@ -130,21 +124,12 @@ COOMatrix CSRToCOODataAsOrder<kDGLCUDA, int32_t>(CSRMatrix csr) {
size_t workspace_size = 0;
CUSPARSE_CALL(cusparseXcoosort_bufferSizeExt(
thr_entry->cusparse_handle,
coo.num_rows, coo.num_cols,
row->shape[0],
data_ptr,
row_ptr,
&workspace_size));
thr_entry->cusparse_handle, coo.num_rows, coo.num_cols, row->shape[0],
data_ptr, row_ptr, &workspace_size));
void* workspace = device->AllocWorkspace(row->ctx, workspace_size);
CUSPARSE_CALL(cusparseXcoosortByRow(
thr_entry->cusparse_handle,
coo.num_rows, coo.num_cols,
row->shape[0],
data_ptr,
row_ptr,
col_ptr,
workspace));
thr_entry->cusparse_handle, coo.num_rows, coo.num_cols, row->shape[0],
data_ptr, row_ptr, col_ptr, workspace));
device->FreeWorkspace(row->ctx, workspace);
// The row and column field have already been reordered according
......@@ -158,8 +143,7 @@ COOMatrix CSRToCOODataAsOrder<kDGLCUDA, int32_t>(CSRMatrix csr) {
template <>
COOMatrix CSRToCOODataAsOrder<kDGLCUDA, int64_t>(CSRMatrix csr) {
COOMatrix coo = CSRToCOO<kDGLCUDA, int64_t>(csr);
if (aten::IsNullArray(coo.data))
return coo;
if (aten::IsNullArray(coo.data)) return coo;
const auto& sorted = Sort(coo.data);
coo.row = IndexSelect(coo.row, sorted.second);
......
......@@ -4,9 +4,10 @@
* \brief Sort CSR index
*/
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
#include "./dgl_cub.cuh"
#include "./utils.h"
namespace dgl {
......@@ -20,8 +21,8 @@ namespace impl {
*/
template <typename IdType>
__global__ void _SegmentIsSorted(
const IdType* indptr, const IdType* indices,
int64_t num_rows, int8_t* flags) {
const IdType* indptr, const IdType* indices, int64_t num_rows,
int8_t* flags) {
int tx = blockIdx.x * blockDim.x + threadIdx.x;
const int stride_x = gridDim.x * blockDim.x;
while (tx < num_rows) {
......@@ -39,15 +40,15 @@ bool CSRIsSorted(CSRMatrix csr) {
const auto& ctx = csr.indptr->ctx;
cudaStream_t stream = runtime::getCurrentCUDAStream();
auto device = runtime::DeviceAPI::Get(ctx);
// We allocate a workspace of num_rows bytes. It wastes a little bit memory but should
// be fine.
int8_t* flags = static_cast<int8_t*>(device->AllocWorkspace(ctx, csr.num_rows));
// We allocate a workspace of num_rows bytes. It wastes a little bit memory
// but should be fine.
int8_t* flags =
static_cast<int8_t*>(device->AllocWorkspace(ctx, csr.num_rows));
const int nt = cuda::FindNumThreads(csr.num_rows);
const int nb = (csr.num_rows + nt - 1) / nt;
CUDA_KERNEL_CALL(_SegmentIsSorted,
nb, nt, 0, stream,
csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(),
csr.num_rows, flags);
CUDA_KERNEL_CALL(
_SegmentIsSorted, nb, nt, 0, stream, csr.indptr.Ptr<IdType>(),
csr.indices.Ptr<IdType>(), csr.num_rows, flags);
bool ret = cuda::AllTrue(flags, csr.num_rows, ctx);
device->FreeWorkspace(ctx, flags);
return ret;
......@@ -82,10 +83,8 @@ void CSRSort_<kDGLCUDA, int32_t>(CSRMatrix* csr) {
size_t workspace_size = 0;
CUSPARSE_CALL(cusparseXcsrsort_bufferSizeExt(
thr_entry->cusparse_handle,
csr->num_rows, csr->num_cols, nnz,
indptr.Ptr<int32_t>(), indices.Ptr<int32_t>(),
&workspace_size));
thr_entry->cusparse_handle, csr->num_rows, csr->num_cols, nnz,
indptr.Ptr<int32_t>(), indices.Ptr<int32_t>(), &workspace_size));
void* workspace = device->AllocWorkspace(ctx, workspace_size);
cusparseMatDescr_t descr;
......@@ -93,11 +92,8 @@ void CSRSort_<kDGLCUDA, int32_t>(CSRMatrix* csr) {
CUSPARSE_CALL(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL));
CUSPARSE_CALL(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO));
CUSPARSE_CALL(cusparseXcsrsort(
thr_entry->cusparse_handle,
csr->num_rows, csr->num_cols, nnz,
descr,
indptr.Ptr<int32_t>(), indices.Ptr<int32_t>(),
data.Ptr<int32_t>(),
thr_entry->cusparse_handle, csr->num_rows, csr->num_cols, nnz, descr,
indptr.Ptr<int32_t>(), indices.Ptr<int32_t>(), data.Ptr<int32_t>(),
workspace));
csr->sorted = true;
......@@ -115,8 +111,7 @@ void CSRSort_<kDGLCUDA, int64_t>(CSRMatrix* csr) {
const auto& ctx = csr->indptr->ctx;
const int64_t nnz = csr->indices->shape[0];
const auto nbits = csr->indptr->dtype.bits;
if (!aten::CSRHasData(*csr))
csr->data = aten::Range(0, nnz, nbits, ctx);
if (!aten::CSRHasData(*csr)) csr->data = aten::Range(0, nnz, nbits, ctx);
IdArray new_indices = csr->indices.Clone();
IdArray new_data = csr->data.Clone();
......@@ -129,15 +124,15 @@ void CSRSort_<kDGLCUDA, int64_t>(CSRMatrix* csr) {
// Allocate workspace
size_t workspace_size = 0;
CUDA_CALL(cub::DeviceSegmentedRadixSort::SortPairs(nullptr, workspace_size,
key_in, key_out, value_in, value_out,
nnz, csr->num_rows, offsets, offsets + 1, 0, sizeof(int64_t)*8, stream));
CUDA_CALL(cub::DeviceSegmentedRadixSort::SortPairs(
nullptr, workspace_size, key_in, key_out, value_in, value_out, nnz,
csr->num_rows, offsets, offsets + 1, 0, sizeof(int64_t) * 8, stream));
void* workspace = device->AllocWorkspace(ctx, workspace_size);
// Compute
CUDA_CALL(cub::DeviceSegmentedRadixSort::SortPairs(workspace, workspace_size,
key_in, key_out, value_in, value_out,
nnz, csr->num_rows, offsets, offsets + 1, 0, sizeof(int64_t)*8, stream));
CUDA_CALL(cub::DeviceSegmentedRadixSort::SortPairs(
workspace, workspace_size, key_in, key_out, value_in, value_out, nnz,
csr->num_rows, offsets, offsets + 1, 0, sizeof(int64_t) * 8, stream));
csr->sorted = true;
csr->indices = new_indices;
......
......@@ -4,6 +4,7 @@
* \brief CSR transpose (convert to CSC)
*/
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
namespace dgl {
......@@ -33,14 +34,13 @@ CSRMatrix CSRTranspose<kDGLCUDA, int32_t>(CSRMatrix csr) {
const int64_t nnz = indices->shape[0];
const auto& ctx = indptr->ctx;
const auto bits = indptr->dtype.bits;
if (aten::IsNullArray(data))
data = aten::Range(0, nnz, bits, ctx);
if (aten::IsNullArray(data)) data = aten::Range(0, nnz, bits, ctx);
const int32_t* indptr_ptr = static_cast<int32_t*>(indptr->data);
const int32_t* indices_ptr = static_cast<int32_t*>(indices->data);
const void* data_ptr = data->data;
// (BarclayII) csr2csc doesn't seem to clear the content of cscColPtr if nnz == 0.
// We need to do it ourselves.
// (BarclayII) csr2csc doesn't seem to clear the content of cscColPtr if nnz
// == 0. We need to do it ourselves.
NDArray t_indptr = aten::Full(0, csr.num_cols + 1, bits, ctx);
NDArray t_indices = aten::NewIdArray(nnz, ctx, bits);
NDArray t_data = aten::NewIdArray(nnz, ctx, bits);
......@@ -53,40 +53,29 @@ CSRMatrix CSRTranspose<kDGLCUDA, int32_t>(CSRMatrix csr) {
// workspace
size_t workspace_size;
CUSPARSE_CALL(cusparseCsr2cscEx2_bufferSize(
thr_entry->cusparse_handle,
csr.num_rows, csr.num_cols, nnz,
data_ptr, indptr_ptr, indices_ptr,
t_data_ptr, t_indptr_ptr, t_indices_ptr,
CUDA_R_32F,
CUSPARSE_ACTION_NUMERIC,
CUSPARSE_INDEX_BASE_ZERO,
thr_entry->cusparse_handle, csr.num_rows, csr.num_cols, nnz, data_ptr,
indptr_ptr, indices_ptr, t_data_ptr, t_indptr_ptr, t_indices_ptr,
CUDA_R_32F, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO,
CUSPARSE_CSR2CSC_ALG1, // see cusparse doc for reference
&workspace_size));
void* workspace = device->AllocWorkspace(ctx, workspace_size);
CUSPARSE_CALL(cusparseCsr2cscEx2(
thr_entry->cusparse_handle,
csr.num_rows, csr.num_cols, nnz,
data_ptr, indptr_ptr, indices_ptr,
t_data_ptr, t_indptr_ptr, t_indices_ptr,
CUDA_R_32F,
CUSPARSE_ACTION_NUMERIC,
CUSPARSE_INDEX_BASE_ZERO,
thr_entry->cusparse_handle, csr.num_rows, csr.num_cols, nnz, data_ptr,
indptr_ptr, indices_ptr, t_data_ptr, t_indptr_ptr, t_indices_ptr,
CUDA_R_32F, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO,
CUSPARSE_CSR2CSC_ALG1, // see cusparse doc for reference
workspace));
device->FreeWorkspace(ctx, workspace);
#else
CUSPARSE_CALL(cusparseScsr2csc(
thr_entry->cusparse_handle,
csr.num_rows, csr.num_cols, nnz,
thr_entry->cusparse_handle, csr.num_rows, csr.num_cols, nnz,
static_cast<const float*>(data_ptr), indptr_ptr, indices_ptr,
static_cast<float*>(t_data_ptr), t_indices_ptr, t_indptr_ptr,
CUSPARSE_ACTION_NUMERIC,
CUSPARSE_INDEX_BASE_ZERO));
CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO));
#endif
return CSRMatrix(csr.num_cols, csr.num_rows,
t_indptr, t_indices, t_data,
false);
return CSRMatrix(
csr.num_cols, csr.num_rows, t_indptr, t_indices, t_data, false);
}
template <>
......
......@@ -7,8 +7,8 @@
#include <dgl/runtime/device_api.h>
#include "../../runtime/cuda/cuda_common.h"
#include "../filter.h"
#include "../../runtime/cuda/cuda_hashtable.cuh"
#include "../filter.h"
#include "./dgl_cub.cuh"
using namespace dgl::runtime::cuda;
......@@ -20,35 +20,29 @@ namespace {
cudaStream_t cudaStream = runtime::getCurrentCUDAStream();
template<typename IdType, bool include>
template <typename IdType, bool include>
__global__ void _IsInKernel(
DeviceOrderedHashTable<IdType> table,
const IdType * const array,
const int64_t size,
IdType * const mark) {
const int64_t idx = threadIdx.x + blockDim.x*blockIdx.x;
DeviceOrderedHashTable<IdType> table, const IdType* const array,
const int64_t size, IdType* const mark) {
const int64_t idx = threadIdx.x + blockDim.x * blockIdx.x;
if (idx < size) {
mark[idx] = table.Contains(array[idx]) ^ (!include);
}
}
template<typename IdType>
template <typename IdType>
__global__ void _InsertKernel(
const IdType * const prefix,
const int64_t size,
IdType * const result) {
const int64_t idx = threadIdx.x + blockDim.x*blockIdx.x;
const IdType* const prefix, const int64_t size, IdType* const result) {
const int64_t idx = threadIdx.x + blockDim.x * blockIdx.x;
if (idx < size) {
if (prefix[idx] != prefix[idx+1]) {
if (prefix[idx] != prefix[idx + 1]) {
result[prefix[idx]] = idx;
}
}
}
template<typename IdType, bool include>
IdArray _PerformFilter(
const OrderedHashTable<IdType>& table,
IdArray test) {
template <typename IdType, bool include>
IdArray _PerformFilter(const OrderedHashTable<IdType>& table, IdArray test) {
const auto& ctx = test->ctx;
auto device = runtime::DeviceAPI::Get(ctx);
const int64_t size = test->shape[0];
......@@ -60,22 +54,20 @@ IdArray _PerformFilter(
// we need two arrays: 1) to act as a prefixsum
// for the number of entries that will be inserted, and
// 2) to collect the included items.
IdType * prefix = static_cast<IdType*>(
device->AllocWorkspace(ctx, sizeof(IdType)*(size+1)));
IdType* prefix = static_cast<IdType*>(
device->AllocWorkspace(ctx, sizeof(IdType) * (size + 1)));
// will resize down later
IdArray result = aten::NewIdArray(size, ctx, sizeof(IdType)*8);
IdArray result = aten::NewIdArray(size, ctx, sizeof(IdType) * 8);
// mark each index based on its existence in the hashtable
{
const dim3 block(256);
const dim3 grid((size+block.x-1)/block.x);
const dim3 grid((size + block.x - 1) / block.x);
CUDA_KERNEL_CALL((_IsInKernel<IdType, include>),
grid, block, 0, cudaStream,
table.DeviceHandle(),
static_cast<const IdType*>(test->data),
size,
CUDA_KERNEL_CALL(
(_IsInKernel<IdType, include>), grid, block, 0, cudaStream,
table.DeviceHandle(), static_cast<const IdType*>(test->data), size,
prefix);
}
......@@ -83,40 +75,28 @@ IdArray _PerformFilter(
{
size_t workspace_bytes;
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
nullptr,
workspace_bytes,
static_cast<IdType*>(nullptr),
static_cast<IdType*>(nullptr),
size+1, cudaStream));
void * workspace = device->AllocWorkspace(ctx, workspace_bytes);
nullptr, workspace_bytes, static_cast<IdType*>(nullptr),
static_cast<IdType*>(nullptr), size + 1, cudaStream));
void* workspace = device->AllocWorkspace(ctx, workspace_bytes);
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
workspace,
workspace_bytes,
prefix,
prefix,
size+1, cudaStream));
workspace, workspace_bytes, prefix, prefix, size + 1, cudaStream));
device->FreeWorkspace(ctx, workspace);
}
// copy the number of kept entries back to the host using the current stream
IdType num_unique;
device->CopyDataFromTo(prefix+size, 0,
&num_unique, 0,
sizeof(num_unique),
ctx,
DGLContext{kDGLCPU, 0},
test->dtype);
device->CopyDataFromTo(
prefix + size, 0, &num_unique, 0, sizeof(num_unique), ctx,
DGLContext{kDGLCPU, 0}, test->dtype);
// insert items into set
{
const dim3 block(256);
const dim3 grid((size+block.x-1)/block.x);
const dim3 grid((size + block.x - 1) / block.x);
CUDA_KERNEL_CALL(_InsertKernel,
grid, block, 0, cudaStream,
prefix,
size,
CUDA_KERNEL_CALL(
_InsertKernel, grid, block, 0, cudaStream, prefix, size,
static_cast<IdType*>(result->data));
}
device->FreeWorkspace(ctx, prefix);
......@@ -124,16 +104,13 @@ IdArray _PerformFilter(
return result.CreateView({num_unique}, result->dtype);
}
template<typename IdType>
template <typename IdType>
class CudaFilterSet : public Filter {
public:
explicit CudaFilterSet(IdArray array) :
table_(array->shape[0], array->ctx, cudaStream) {
explicit CudaFilterSet(IdArray array)
: table_(array->shape[0], array->ctx, cudaStream) {
table_.FillWithUnique(
static_cast<const IdType*>(array->data),
array->shape[0],
cudaStream);
static_cast<const IdType*>(array->data), array->shape[0], cudaStream);
}
IdArray find_included_indices(IdArray test) override {
......@@ -150,7 +127,7 @@ class CudaFilterSet : public Filter {
} // namespace
template<DGLDeviceType XPU, typename IdType>
template <DGLDeviceType XPU, typename IdType>
FilterRef CreateSetFilter(IdArray set) {
return FilterRef(std::make_shared<CudaFilterSet<IdType>>(set));
}
......
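The _PerformFilter routine above is a standard stream-compaction pipeline: mark the matching elements, take an exclusive prefix sum so each kept element learns its output slot, then scatter the indices. A host-side sketch of the same logic (illustrative only; `contains` stands in for table.Contains(), and the names are hypothetical):

#include <cstdint>
#include <functional>
#include <vector>

// Return the indices of `test` that pass the filter, mirroring the
// mark / ExclusiveSum / scatter steps of the GPU kernels above.
std::vector<int64_t> FilterIndices(
    const std::vector<int64_t>& test,
    const std::function<bool(int64_t)>& contains, bool include) {
  const int64_t size = static_cast<int64_t>(test.size());
  // 1) mark: 1 if the element passes the filter (XOR, as in _IsInKernel).
  std::vector<int64_t> prefix(size + 1, 0);
  for (int64_t i = 0; i < size; ++i)
    prefix[i] = (contains(test[i]) != !include) ? 1 : 0;
  // 2) exclusive prefix sum in place: prefix[i] becomes the output slot of i,
  //    and prefix[size] the total number of kept elements.
  int64_t running = 0;
  for (int64_t i = 0; i <= size; ++i) {
    const int64_t mark = prefix[i];
    prefix[i] = running;
    running += mark;
  }
  // 3) scatter: index i is kept iff its slot differs from the next one
  //    (the same test _InsertKernel applies).
  std::vector<int64_t> result(prefix[size]);
  for (int64_t i = 0; i < size; ++i)
    if (prefix[i] != prefix[i + 1]) result[prefix[i]] = i;
  return result;
}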
/*!
* Copyright (c) 2021 by Contributors
* \file cuda_common.h
* \brief Wrapper to place cub in dgl namespace.
* \brief Wrapper to place cub in dgl namespace.
*/
#ifndef DGL_ARRAY_CUDA_DGL_CUB_CUH_
......
/**
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* \file array/gpu/disjoint_union.cu
* \brief Disjoint union GPU implementation.
*/
#include <dgl/runtime/parallel_for.h>
#include <dgl/array.h>
#include <vector>
#include <dgl/runtime/parallel_for.h>
#include <tuple>
#include <vector>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
......@@ -31,8 +33,8 @@ namespace impl {
template <typename IdType>
__global__ void _DisjointUnionKernel(
IdType** arrs, IdType* prefix, IdType* offset, IdType* out,
int64_t n_arrs, int n_elms) {
IdType** arrs, IdType* prefix, IdType* offset, IdType* out, int64_t n_arrs,
int n_elms) {
IdType tx = static_cast<IdType>(blockIdx.x) * blockDim.x + threadIdx.x;
const int stride_x = gridDim.x * blockDim.x;
while (tx < n_elms) {
......@@ -48,7 +50,8 @@ __global__ void _DisjointUnionKernel(
}
template <DGLDeviceType XPU, typename IdType>
std::tuple<IdArray, IdArray, IdArray> _ComputePrefixSums(const std::vector<COOMatrix>& coos) {
std::tuple<IdArray, IdArray, IdArray> _ComputePrefixSums(
const std::vector<COOMatrix>& coos) {
IdType n = coos.size(), nbits = coos[0].row->dtype.bits;
IdArray n_rows = NewIdArray(n, CPU, nbits);
IdArray n_cols = NewIdArray(n, CPU, nbits);
......@@ -58,7 +61,7 @@ std::tuple<IdArray, IdArray, IdArray> _ComputePrefixSums(const std::vector<COOMa
IdType* n_cols_data = n_cols.Ptr<IdType>();
IdType* n_elms_data = n_elms.Ptr<IdType>();
dgl::runtime::parallel_for(0, coos.size(), [&](IdType b, IdType e){
dgl::runtime::parallel_for(0, coos.size(), [&](IdType b, IdType e) {
for (IdType i = b; i < e; ++i) {
n_rows_data[i] = coos[i].num_rows;
n_cols_data[i] = coos[i].num_cols;
......@@ -66,30 +69,30 @@ std::tuple<IdArray, IdArray, IdArray> _ComputePrefixSums(const std::vector<COOMa
}
});
return std::make_tuple(CumSum(n_rows.CopyTo(coos[0].row->ctx), true),
CumSum(n_cols.CopyTo(coos[0].row->ctx), true),
CumSum(n_elms.CopyTo(coos[0].row->ctx), true));
return std::make_tuple(
CumSum(n_rows.CopyTo(coos[0].row->ctx), true),
CumSum(n_cols.CopyTo(coos[0].row->ctx), true),
CumSum(n_elms.CopyTo(coos[0].row->ctx), true));
}
template <DGLDeviceType XPU, typename IdType>
void _Merge(IdType** arrs, IdType* prefix, IdType* offset, IdType* out,
int64_t n_arrs, int n_elms,
DGLContext ctx, DGLDataType dtype, cudaStream_t stream) {
void _Merge(
IdType** arrs, IdType* prefix, IdType* offset, IdType* out, int64_t n_arrs,
int n_elms, DGLContext ctx, DGLDataType dtype, cudaStream_t stream) {
auto device = runtime::DeviceAPI::Get(ctx);
int nt = 256;
int nb = (n_elms + nt - 1) / nt;
IdType** arrs_dev = static_cast<IdType**>(
device->AllocWorkspace(ctx, n_arrs*sizeof(IdType*)));
device->AllocWorkspace(ctx, n_arrs * sizeof(IdType*)));
device->CopyDataFromTo(
arrs, 0, arrs_dev, 0, sizeof(IdType*)*n_arrs,
DGLContext{kDGLCPU, 0}, ctx, dtype);
arrs, 0, arrs_dev, 0, sizeof(IdType*) * n_arrs, DGLContext{kDGLCPU, 0},
ctx, dtype);
CUDA_KERNEL_CALL(_DisjointUnionKernel,
nb, nt, 0, stream,
arrs_dev, prefix, offset,
out, n_arrs, n_elms);
CUDA_KERNEL_CALL(
_DisjointUnionKernel, nb, nt, 0, stream, arrs_dev, prefix, offset, out,
n_arrs, n_elms);
device->FreeWorkspace(ctx, arrs_dev);
}
......@@ -132,52 +135,50 @@ COOMatrix DisjointUnionCoo(const std::vector<COOMatrix>& coos) {
IdType n_elements = 0;
device->CopyDataFromTo(
&prefix_elm[coos.size()], 0, &n_elements, 0,
sizeof(IdType), coos[0].row->ctx, DGLContext{kDGLCPU, 0},
coos[0].row->dtype);
&prefix_elm[coos.size()], 0, &n_elements, 0, sizeof(IdType),
coos[0].row->ctx, DGLContext{kDGLCPU, 0}, coos[0].row->dtype);
device->CopyDataFromTo(
&prefix_src[coos.size()], 0, &src_offset, 0,
sizeof(IdType), coos[0].row->ctx, DGLContext{kDGLCPU, 0},
coos[0].row->dtype);
&prefix_src[coos.size()], 0, &src_offset, 0, sizeof(IdType),
coos[0].row->ctx, DGLContext{kDGLCPU, 0}, coos[0].row->dtype);
device->CopyDataFromTo(
&prefix_dst[coos.size()], 0, &dst_offset, 0,
sizeof(IdType), coos[0].row->ctx, DGLContext{kDGLCPU, 0},
coos[0].row->dtype);
&prefix_dst[coos.size()], 0, &dst_offset, 0, sizeof(IdType),
coos[0].row->ctx, DGLContext{kDGLCPU, 0}, coos[0].row->dtype);
// Union src array
IdArray result_src = NewIdArray(
n_elements, coos[0].row->ctx, coos[0].row->dtype.bits);
_Merge<XPU, IdType>(rows.get(), prefix_src, prefix_elm, result_src.Ptr<IdType>(),
coos.size(), n_elements, ctx, dtype, stream);
IdArray result_src =
NewIdArray(n_elements, coos[0].row->ctx, coos[0].row->dtype.bits);
_Merge<XPU, IdType>(
rows.get(), prefix_src, prefix_elm, result_src.Ptr<IdType>(), coos.size(),
n_elements, ctx, dtype, stream);
// Union dst array
IdArray result_dst = NewIdArray(
n_elements, coos[0].col->ctx, coos[0].col->dtype.bits);
_Merge<XPU, IdType>(cols.get(), prefix_dst, prefix_elm, result_dst.Ptr<IdType>(),
coos.size(), n_elements, ctx, dtype, stream);
IdArray result_dst =
NewIdArray(n_elements, coos[0].col->ctx, coos[0].col->dtype.bits);
_Merge<XPU, IdType>(
cols.get(), prefix_dst, prefix_elm, result_dst.Ptr<IdType>(), coos.size(),
n_elements, ctx, dtype, stream);
// Union data array if exists and fetch number of elements
IdArray result_dat = NullArray();
if (has_data) {
result_dat = NewIdArray(
n_elements, coos[0].row->ctx, coos[0].row->dtype.bits);
_Merge<XPU, IdType>(data.get(), prefix_elm, prefix_elm, result_dat.Ptr<IdType>(),
coos.size(), n_elements, ctx, dtype, stream);
result_dat =
NewIdArray(n_elements, coos[0].row->ctx, coos[0].row->dtype.bits);
_Merge<XPU, IdType>(
data.get(), prefix_elm, prefix_elm, result_dat.Ptr<IdType>(),
coos.size(), n_elements, ctx, dtype, stream);
}
return COOMatrix(
src_offset, dst_offset,
result_src,
result_dst,
result_dat,
row_sorted,
col_sorted);
src_offset, dst_offset, result_src, result_dst, result_dat, row_sorted,
col_sorted);
}
template COOMatrix DisjointUnionCoo<kDGLCUDA, int32_t>(const std::vector<COOMatrix>& coos);
template COOMatrix DisjointUnionCoo<kDGLCUDA, int64_t>(const std::vector<COOMatrix>& coos);
template COOMatrix DisjointUnionCoo<kDGLCUDA, int32_t>(
const std::vector<COOMatrix>& coos);
template COOMatrix DisjointUnionCoo<kDGLCUDA, int64_t>(
const std::vector<COOMatrix>& coos);
} // namespace impl
} // namespace aten
......
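DisjointUnionCoo above relabels the nodes of each input COO by the cumulative row/column counts and concatenates the edge lists at the cumulative nnz offsets, which is what _ComputePrefixSums and _Merge implement on the GPU. A host-side sketch of that logic with hypothetical types (not the DGL API):

#include <cstdint>
#include <vector>

struct SimpleCoo {
  int64_t num_rows, num_cols;
  std::vector<int64_t> row, col;
};

// Concatenate the edge lists of `coos`, shifting the endpoints of matrix i by
// the cumulative row/column counts of matrices 0..i-1.
SimpleCoo DisjointUnionHost(const std::vector<SimpleCoo>& coos) {
  const size_t n = coos.size();
  std::vector<int64_t> prefix_row(n + 1, 0), prefix_col(n + 1, 0),
      prefix_elm(n + 1, 0);
  for (size_t i = 0; i < n; ++i) {
    prefix_row[i + 1] = prefix_row[i] + coos[i].num_rows;
    prefix_col[i + 1] = prefix_col[i] + coos[i].num_cols;
    prefix_elm[i + 1] =
        prefix_elm[i] + static_cast<int64_t>(coos[i].row.size());
  }
  SimpleCoo out;
  out.num_rows = prefix_row[n];
  out.num_cols = prefix_col[n];
  out.row.resize(prefix_elm[n]);
  out.col.resize(prefix_elm[n]);
  for (size_t i = 0; i < n; ++i) {
    for (size_t e = 0; e < coos[i].row.size(); ++e) {
      out.row[prefix_elm[i] + e] = coos[i].row[e] + prefix_row[i];
      out.col[prefix_elm[i] + e] = coos[i].col[e] + prefix_col[i];
    }
  }
  return out;
}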