Unverified commit 8ae50c42, authored by Hongzhi (Steve), Chen and committed by GitHub

[Misc] clang-format auto fix. (#4804)



* [Misc] clang-format auto fix.

* manual

* manual

* manual

* manual

* todo

* fix
Co-authored-by: Steve <ubuntu@ip-172-31-34-29.ap-northeast-1.compute.internal>
parent 81831111
......@@ -6,10 +6,11 @@
#include <dgl/array.h>
#include <dgl/array_iterator.h>
#include <dgl/runtime/parallel_for.h>
#include <dgl/random.h>
#include <utility>
#include <dgl/runtime/parallel_for.h>
#include <algorithm>
#include <utility>
using namespace dgl::runtime;
......@@ -19,15 +20,12 @@ namespace impl {
template <DGLDeviceType XPU, typename IdType>
std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling(
const CSRMatrix &csr,
int64_t num_samples,
int num_trials,
bool exclude_self_loops,
bool replace,
double redundancy) {
const CSRMatrix& csr, int64_t num_samples, int num_trials,
bool exclude_self_loops, bool replace, double redundancy) {
const int64_t num_row = csr.num_rows;
const int64_t num_col = csr.num_cols;
const int64_t num_actual_samples = static_cast<int64_t>(num_samples * (1 + redundancy));
const int64_t num_actual_samples =
static_cast<int64_t>(num_samples * (1 + redundancy));
IdArray row = Full<IdType>(-1, num_actual_samples, csr.indptr->ctx);
IdArray col = Full<IdType>(-1, num_actual_samples, csr.indptr->ctx);
IdType* row_data = row.Ptr<IdType>();
......@@ -48,23 +46,30 @@ std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling(
});
PairIterator<IdType> begin(row_data, col_data);
PairIterator<IdType> end = std::remove_if(begin, begin + num_actual_samples,
PairIterator<IdType> end = std::remove_if(
begin, begin + num_actual_samples,
[](const std::pair<IdType, IdType>& val) { return val.first == -1; });
if (!replace) {
std::sort(begin, end,
[](const std::pair<IdType, IdType>& a, const std::pair<IdType, IdType>& b) {
return a.first < b.first || (a.first == b.first && a.second < b.second);
});;
std::sort(
begin, end,
[](const std::pair<IdType, IdType>& a,
const std::pair<IdType, IdType>& b) {
return a.first < b.first ||
(a.first == b.first && a.second < b.second);
});
end = std::unique(begin, end);
}
int64_t num_sampled = std::min(static_cast<int64_t>(end - begin), num_samples);
return {row.CreateView({num_sampled}, row->dtype), col.CreateView({num_sampled}, col->dtype)};
int64_t num_sampled =
std::min(static_cast<int64_t>(end - begin), num_samples);
return {
row.CreateView({num_sampled}, row->dtype),
col.CreateView({num_sampled}, col->dtype)};
}
template std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling<kDGLCPU, int32_t>(
const CSRMatrix&, int64_t, int, bool, bool, double);
template std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling<kDGLCPU, int64_t>(
const CSRMatrix&, int64_t, int, bool, bool, double);
template std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling<
kDGLCPU, int32_t>(const CSRMatrix&, int64_t, int, bool, bool, double);
template std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling<
kDGLCPU, int64_t>(const CSRMatrix&, int64_t, int, bool, bool, double);
}; // namespace impl
}; // namespace aten
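CSRGlobalUniformNegativeSampling above oversamples candidate pairs by a redundancy factor, leaves failed slots at -1, filters those out, deduplicates with sort + unique when sampling without replacement, and finally truncates to num_samples. The following standalone sketch reproduces that oversample-filter-deduplicate strategy with plain std::vector and a caller-supplied edge-existence callback instead of DGL's CSRMatrix/IdArray types; all names are illustrative, not part of the DGL API.

#include <algorithm>
#include <cstdint>
#include <random>
#include <utility>
#include <vector>

// Caller-supplied adjacency test; stands in for consulting the CSRMatrix.
using EdgeExists = bool (*)(int64_t src, int64_t dst);

std::vector<std::pair<int64_t, int64_t>> UniformNegativeSample(
    int64_t num_rows, int64_t num_cols, int64_t num_samples, int num_trials,
    bool replace, double redundancy, EdgeExists edge_exists) {
  // Oversample so enough candidates survive filtering and deduplication.
  const int64_t num_actual =
      static_cast<int64_t>(num_samples * (1 + redundancy));
  std::vector<std::pair<int64_t, int64_t>> cand(
      num_actual, std::pair<int64_t, int64_t>(-1, -1));
  std::mt19937_64 rng(0);
  std::uniform_int_distribution<int64_t> row_dist(0, num_rows - 1);
  std::uniform_int_distribution<int64_t> col_dist(0, num_cols - 1);
  for (auto& p : cand) {
    for (int t = 0; t < num_trials; ++t) {
      const int64_t u = row_dist(rng), v = col_dist(rng);
      if (!edge_exists(u, v)) {  // keep the first non-edge hit
        p = {u, v};
        break;
      }
    }
  }
  // Drop slots that never found a negative pair (still marked -1).
  auto end = std::remove_if(
      cand.begin(), cand.end(),
      [](const std::pair<int64_t, int64_t>& p) { return p.first == -1; });
  if (!replace) {  // without replacement: sort + unique deduplicates pairs
    std::sort(cand.begin(), end);
    end = std::unique(cand.begin(), end);
  }
  cand.resize(std::min<int64_t>(end - cand.begin(), num_samples));
  return cand;
}

The redundancy parameter trades extra rejection-sampling work for a higher chance of still holding num_samples distinct pairs after filtering.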
......
......@@ -9,6 +9,7 @@
#include <dgl/array.h>
#include <dgl/bcast.h>
#include <dgl/runtime/parallel_for.h>
#include "../selector.h"
namespace dgl {
......@@ -25,38 +26,41 @@ namespace cpu {
* \note it uses node parallel strategy, different threads are responsible
* for the computation of different nodes.
*/
template <typename IdType, typename DType, typename Op,
int LhsTarget = 0, int RhsTarget = 2>
void SDDMMCsr(const BcastOff& bcast,
const CSRMatrix& csr,
NDArray lhs, NDArray rhs, NDArray out) {
template <
typename IdType, typename DType, typename Op, int LhsTarget = 0,
int RhsTarget = 2>
void SDDMMCsr(
const BcastOff& bcast, const CSRMatrix& csr, NDArray lhs, NDArray rhs,
NDArray out) {
const bool has_idx = !IsNullArray(csr.data);
const IdType* indptr = csr.indptr.Ptr<IdType>();
const IdType* indices = csr.indices.Ptr<IdType>();
const IdType* edges = csr.data.Ptr<IdType>();
const DType* X = lhs.Ptr<DType>();
const DType* Y = rhs.Ptr<DType>();
const int64_t dim = bcast.out_len,
lhs_dim = bcast.lhs_len,
rhs_dim = bcast.rhs_len,
reduce_size = bcast.reduce_size;
const int64_t dim = bcast.out_len, lhs_dim = bcast.lhs_len,
rhs_dim = bcast.rhs_len, reduce_size = bcast.reduce_size;
DType* O = out.Ptr<DType>();
runtime::parallel_for(0, csr.num_rows, [=](IdType b, IdType e) {
for (auto rid = b; rid < e; ++rid) {
const IdType row_start = indptr[rid], row_end = indptr[rid + 1];
for (IdType j = row_start; j < row_end; ++j) {
const IdType cid = indices[j];
const IdType eid = has_idx? edges[j] : j;
const IdType eid = has_idx ? edges[j] : j;
DType* out_off = O + eid * dim;
for (int64_t k = 0; k < dim; ++k) {
const int64_t lhs_add = bcast.use_bcast ? bcast.lhs_offset[k] : k;
const int64_t rhs_add = bcast.use_bcast ? bcast.rhs_offset[k] : k;
const DType* lhs_off = Op::use_lhs
? X + Selector<LhsTarget>::Call(rid, eid, cid) * lhs_dim + lhs_add * reduce_size
: nullptr;
const DType* rhs_off = Op::use_rhs
? Y + Selector<RhsTarget>::Call(rid, eid, cid) * rhs_dim + rhs_add * reduce_size
: nullptr;
const DType* lhs_off =
Op::use_lhs
? X + Selector<LhsTarget>::Call(rid, eid, cid) * lhs_dim +
lhs_add * reduce_size
: nullptr;
const DType* rhs_off =
Op::use_rhs
? Y + Selector<RhsTarget>::Call(rid, eid, cid) * rhs_dim +
rhs_add * reduce_size
: nullptr;
out_off[k] = Op::Call(lhs_off, rhs_off, reduce_size);
}
}
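The hunk above only reflows SDDMMCsr; for reference, the kernel computes, for every nonzero (rid, cid), Op applied to operands selected by LhsTarget/RhsTarget with optional broadcasting, and writes the result at the edge position. A reduced sketch that keeps only the node-parallel CSR traversal and hard-codes the dot-product case (illustrative names, no broadcasting, no Op template):

#include <cstdint>
#include <vector>

// Simplified SDDMM on a CSR graph: for every edge (rid, cid) it writes
// dot(X[rid], Y[cid]) into out[eid].  The Op functor, broadcasting and the
// lhs/rhs target selection of the real kernel are omitted.
void SddmmCsrDot(
    const std::vector<int64_t>& indptr,   // length num_rows + 1
    const std::vector<int64_t>& indices,  // destination id per edge
    const std::vector<float>& X,          // source features, num_rows x dim
    const std::vector<float>& Y,          // destination features, num_cols x dim
    int64_t dim, std::vector<float>* out) {
  const int64_t num_rows = static_cast<int64_t>(indptr.size()) - 1;
  out->assign(indices.size(), 0.f);
  // Node-parallel strategy: each row (node) is handled independently, and
  // every row writes to a disjoint range of edge ids.
  #pragma omp parallel for
  for (int64_t rid = 0; rid < num_rows; ++rid) {
    for (int64_t eid = indptr[rid]; eid < indptr[rid + 1]; ++eid) {
      const int64_t cid = indices[eid];
      float acc = 0.f;
      for (int64_t k = 0; k < dim; ++k)
        acc += X[rid * dim + k] * Y[cid * dim + k];
      (*out)[eid] = acc;
    }
  }
}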
......@@ -74,35 +78,38 @@ void SDDMMCsr(const BcastOff& bcast,
* \note it uses edge parallel strategy, different threads are responsible
* for the computation of different edges.
*/
template <typename IdType, typename DType, typename Op,
int LhsTarget = 0, int RhsTarget = 2>
void SDDMMCoo(const BcastOff& bcast,
const COOMatrix& coo,
NDArray lhs, NDArray rhs, NDArray out) {
template <
typename IdType, typename DType, typename Op, int LhsTarget = 0,
int RhsTarget = 2>
void SDDMMCoo(
const BcastOff& bcast, const COOMatrix& coo, NDArray lhs, NDArray rhs,
NDArray out) {
const bool has_idx = !IsNullArray(coo.data);
const IdType* row = coo.row.Ptr<IdType>();
const IdType* col = coo.col.Ptr<IdType>();
const IdType* edges = coo.data.Ptr<IdType>();
const DType* X = lhs.Ptr<DType>();
const DType* Y = rhs.Ptr<DType>();
const int64_t dim = bcast.out_len,
lhs_dim = bcast.lhs_len,
rhs_dim = bcast.rhs_len,
reduce_size = bcast.reduce_size;
const int64_t dim = bcast.out_len, lhs_dim = bcast.lhs_len,
rhs_dim = bcast.rhs_len, reduce_size = bcast.reduce_size;
DType* O = out.Ptr<DType>();
#pragma omp parallel for
for (int64_t i = 0; i < coo.row->shape[0]; ++i) {
const IdType rid = row[i];
const IdType cid = col[i];
const IdType eid = has_idx? edges[i] : i;
const IdType eid = has_idx ? edges[i] : i;
DType* out_off = O + eid * dim;
for (int64_t k = 0; k < dim; ++k) {
const int64_t lhs_add = bcast.use_bcast ? bcast.lhs_offset[k] : k;
const int64_t rhs_add = bcast.use_bcast ? bcast.rhs_offset[k] : k;
const DType* lhs_off = Op::use_lhs ?
X + Selector<LhsTarget>::Call(rid, eid, cid) * lhs_dim + lhs_add * reduce_size : nullptr;
const DType* rhs_off = Op::use_rhs ?
Y + Selector<RhsTarget>::Call(rid, eid, cid) * rhs_dim + rhs_add * reduce_size : nullptr;
const DType* lhs_off =
Op::use_lhs ? X + Selector<LhsTarget>::Call(rid, eid, cid) * lhs_dim +
lhs_add * reduce_size
: nullptr;
const DType* rhs_off =
Op::use_rhs ? Y + Selector<RhsTarget>::Call(rid, eid, cid) * rhs_dim +
rhs_add * reduce_size
: nullptr;
out_off[k] = Op::Call(lhs_off, rhs_off, bcast.reduce_size);
}
}
......@@ -110,12 +117,13 @@ void SDDMMCoo(const BcastOff& bcast,
namespace op {
//////////////////////////////// binary operators on CPU ////////////////////////////////
////////////////////////// binary operators on CPU /////////////////////////////
template <typename DType>
struct Add {
static constexpr bool use_lhs = true;
static constexpr bool use_rhs = true;
inline static DType Call(const DType* lhs_off, const DType* rhs_off, int64_t len = 1) {
inline static DType Call(
const DType* lhs_off, const DType* rhs_off, int64_t len = 1) {
return *lhs_off + *rhs_off;
}
};
......@@ -124,7 +132,8 @@ template <typename DType>
struct Sub {
static constexpr bool use_lhs = true;
static constexpr bool use_rhs = true;
inline static DType Call(const DType* lhs_off, const DType* rhs_off, int64_t len = 1) {
inline static DType Call(
const DType* lhs_off, const DType* rhs_off, int64_t len = 1) {
return *lhs_off - *rhs_off;
}
};
......@@ -133,7 +142,8 @@ template <typename DType>
struct Mul {
static constexpr bool use_lhs = true;
static constexpr bool use_rhs = true;
inline static DType Call(const DType* lhs_off, const DType* rhs_off, int64_t len = 1) {
inline static DType Call(
const DType* lhs_off, const DType* rhs_off, int64_t len = 1) {
return *lhs_off * *rhs_off;
}
};
......@@ -142,7 +152,8 @@ template <typename DType>
struct Div {
static constexpr bool use_lhs = true;
static constexpr bool use_rhs = true;
inline static DType Call(const DType* lhs_off, const DType* rhs_off, int64_t len = 1) {
inline static DType Call(
const DType* lhs_off, const DType* rhs_off, int64_t len = 1) {
return *lhs_off / *rhs_off;
}
};
......@@ -151,7 +162,8 @@ template <typename DType>
struct CopyLhs {
static constexpr bool use_lhs = true;
static constexpr bool use_rhs = false;
inline static DType Call(const DType* lhs_off, const DType*, int64_t len = 1) {
inline static DType Call(
const DType* lhs_off, const DType*, int64_t len = 1) {
return *lhs_off;
}
};
......@@ -160,7 +172,8 @@ template <typename DType>
struct CopyRhs {
static constexpr bool use_lhs = false;
static constexpr bool use_rhs = true;
inline static DType Call(const DType* , const DType* rhs_off, int64_t len = 1) {
inline static DType Call(
const DType*, const DType* rhs_off, int64_t len = 1) {
return *rhs_off;
}
};
......@@ -169,7 +182,8 @@ template <typename DType>
struct Dot {
static constexpr bool use_lhs = true;
static constexpr bool use_rhs = true;
inline static DType Call(const DType* lhs_off, const DType* rhs_off, int64_t len = 1) {
inline static DType Call(
const DType* lhs_off, const DType* rhs_off, int64_t len = 1) {
DType rst = 0;
for (int64_t l = 0; l < len; ++l) {
rst += lhs_off[l] * rhs_off[l];
......@@ -178,32 +192,32 @@ struct Dot {
}
};
#define SWITCH_OP(op, Op, ...) \
do { \
if ((op) == "add") { \
typedef dgl::aten::cpu::op::Add<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "sub") { \
typedef dgl::aten::cpu::op::Sub<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "mul") { \
typedef dgl::aten::cpu::op::Mul<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "div") { \
typedef dgl::aten::cpu::op::Div<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "copy_lhs") { \
typedef dgl::aten::cpu::op::CopyLhs<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "copy_rhs") { \
typedef dgl::aten::cpu::op::CopyRhs<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "dot") { \
typedef dgl::aten::cpu::op::Dot<DType> Op; \
{ __VA_ARGS__ } \
} else { \
LOG(FATAL) << "Unsupported SDDMM binary operator: " << op; \
} \
#define SWITCH_OP(op, Op, ...) \
do { \
if ((op) == "add") { \
typedef dgl::aten::cpu::op::Add<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "sub") { \
typedef dgl::aten::cpu::op::Sub<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "mul") { \
typedef dgl::aten::cpu::op::Mul<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "div") { \
typedef dgl::aten::cpu::op::Div<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "copy_lhs") { \
typedef dgl::aten::cpu::op::CopyLhs<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "copy_rhs") { \
typedef dgl::aten::cpu::op::CopyRhs<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "dot") { \
typedef dgl::aten::cpu::op::Dot<DType> Op; \
{ __VA_ARGS__ } \
} else { \
LOG(FATAL) << "Unsupported SDDMM binary operator: " << op; \
} \
} while (0)
} // namespace op
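SWITCH_OP binds a runtime operator name to a concrete functor type Op and runs the body with that typedef in scope. A sketch of how such a dispatch is typically wired to the kernels above; the helper below is hypothetical, assumed to sit inside dgl::aten::cpu next to the definitions it uses, and is not the exact DGL call site.

// Hypothetical dispatch helper: SWITCH_OP selects a functor type for the
// given operator string, which then becomes the Op template argument.
template <typename IdType, typename DType>
void SDDMMCsrDispatch(
    const std::string& op, const BcastOff& bcast, const CSRMatrix& csr,
    NDArray lhs, NDArray rhs, NDArray out) {
  SWITCH_OP(op, Op, {
    // Here Op is e.g. dgl::aten::cpu::op::Add<DType> or op::Dot<DType>.
    SDDMMCsr<IdType, DType, Op>(bcast, csr, lhs, rhs, out);
  });
}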
......
......@@ -7,10 +7,11 @@
#define DGL_ARRAY_CPU_SEGMENT_REDUCE_H_
#include <dgl/array.h>
#include <dgl/runtime/parallel_for.h>
#include <dgl/base_heterograph.h>
#include <vector>
#include <dgl/runtime/parallel_for.h>
#include <string>
#include <vector>
namespace dgl {
namespace aten {
......@@ -26,11 +27,10 @@ template <typename IdType, typename DType>
void SegmentSum(NDArray feat, NDArray offsets, NDArray out) {
int n = out->shape[0];
int dim = 1;
for (int i = 1; i < out->ndim; ++i)
dim *= out->shape[i];
for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i];
const DType* feat_data = feat.Ptr<DType>();
const IdType* offsets_data = offsets.Ptr<IdType>();
DType *out_data = out.Ptr<DType>();
DType* out_data = out.Ptr<DType>();
runtime::parallel_for(0, n, [=](int b, int e) {
for (auto i = b; i < e; ++i) {
for (IdType j = offsets_data[i]; j < offsets_data[i + 1]; ++j) {
......@@ -51,16 +51,14 @@ void SegmentSum(NDArray feat, NDArray offsets, NDArray out) {
* used in backward phase.
*/
template <typename IdType, typename DType, typename Cmp>
void SegmentCmp(NDArray feat, NDArray offsets,
NDArray out, NDArray arg) {
void SegmentCmp(NDArray feat, NDArray offsets, NDArray out, NDArray arg) {
int n = out->shape[0];
int dim = 1;
for (int i = 1; i < out->ndim; ++i)
dim *= out->shape[i];
for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i];
const DType* feat_data = feat.Ptr<DType>();
const IdType* offsets_data = offsets.Ptr<IdType>();
DType *out_data = out.Ptr<DType>();
IdType *arg_data = arg.Ptr<IdType>();
DType* out_data = out.Ptr<DType>();
IdType* arg_data = arg.Ptr<IdType>();
std::fill(out_data, out_data + out.NumElements(), Cmp::zero);
std::fill(arg_data, arg_data + arg.NumElements(), -1);
runtime::parallel_for(0, n, [=](int b, int e) {
......@@ -89,8 +87,7 @@ template <typename IdType, typename DType>
void ScatterAdd(NDArray feat, NDArray idx, NDArray out) {
int n = feat->shape[0];
int dim = 1;
for (int i = 1; i < out->ndim; ++i)
dim *= out->shape[i];
for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i];
const DType* feat_data = feat.Ptr<DType>();
const IdType* idx_data = idx.Ptr<IdType>();
DType* out_data = out.Ptr<DType>();
......@@ -114,24 +111,26 @@ void ScatterAdd(NDArray feat, NDArray idx, NDArray out) {
* \param list_out List of the output tensors.
*/
template <typename IdType, typename DType>
void UpdateGradMinMax_hetero(HeteroGraphPtr graph,
const std::string& op,
const std::vector<NDArray>& list_feat,
const std::vector<NDArray>& list_idx,
const std::vector<NDArray>& list_idx_types,
std::vector<NDArray>* list_out) {
void UpdateGradMinMax_hetero(
HeteroGraphPtr graph, const std::string& op,
const std::vector<NDArray>& list_feat, const std::vector<NDArray>& list_idx,
const std::vector<NDArray>& list_idx_types,
std::vector<NDArray>* list_out) {
if (op == "copy_lhs" || op == "copy_rhs") {
std::vector<std::vector<dgl_id_t>> src_dst_ntypes(graph->NumVertexTypes(),
std::vector<dgl_id_t>());
std::vector<std::vector<dgl_id_t>> src_dst_ntypes(
graph->NumVertexTypes(), std::vector<dgl_id_t>());
for (dgl_type_t etype = 0; etype < graph->NumEdgeTypes(); ++etype) {
auto pair = graph->meta_graph()->FindEdge(etype);
const dgl_id_t dst_ntype = pair.first; // graph is reversed
const dgl_id_t src_ntype = pair.second;
auto same_src_dst_ntype = std::find(std::begin(src_dst_ntypes[dst_ntype]),
std::end(src_dst_ntypes[dst_ntype]), src_ntype);
// if op is "copy_lhs", relation type with same src and dst node type will be updated once
if (op == "copy_lhs" && same_src_dst_ntype != std::end(src_dst_ntypes[dst_ntype]))
auto same_src_dst_ntype = std::find(
std::begin(src_dst_ntypes[dst_ntype]),
std::end(src_dst_ntypes[dst_ntype]), src_ntype);
// if op is "copy_lhs", relation type with same src and dst node type will
// be updated once
if (op == "copy_lhs" &&
same_src_dst_ntype != std::end(src_dst_ntypes[dst_ntype]))
continue;
src_dst_ntypes[dst_ntype].push_back(src_ntype);
const DType* feat_data = list_feat[dst_ntype].Ptr<DType>();
......@@ -149,7 +148,8 @@ void UpdateGradMinMax_hetero(HeteroGraphPtr graph,
if (type == idx_type_data[i * dim + k]) {
const int write_row = idx_data[i * dim + k];
#pragma omp atomic
out_data[write_row * dim + k] += feat_data[i * dim + k]; // feat = dZ
out_data[write_row * dim + k] +=
feat_data[i * dim + k]; // feat = dZ
}
}
}
......@@ -170,8 +170,7 @@ template <typename IdType, typename DType>
void BackwardSegmentCmp(NDArray feat, NDArray arg, NDArray out) {
int n = feat->shape[0];
int dim = 1;
for (int i = 1; i < out->ndim; ++i)
dim *= out->shape[i];
for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i];
const DType* feat_data = feat.Ptr<DType>();
const IdType* arg_data = arg.Ptr<IdType>();
DType* out_data = out.Ptr<DType>();
......
......@@ -3,13 +3,15 @@
* \file array/cpu/spmat_op_impl.cc
* \brief CPU implementation of COO sparse matrix operators
*/
#include <dmlc/omp.h>
#include <dgl/runtime/parallel_for.h>
#include <vector>
#include <unordered_set>
#include <unordered_map>
#include <tuple>
#include <dmlc/omp.h>
#include <numeric>
#include <tuple>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "array_utils.h"
namespace dgl {
......@@ -33,11 +35,10 @@ template <DGLDeviceType XPU, typename IdType>
bool COOIsNonZero(COOMatrix coo, int64_t row, int64_t col) {
CHECK(row >= 0 && row < coo.num_rows) << "Invalid row index: " << row;
CHECK(col >= 0 && col < coo.num_cols) << "Invalid col index: " << col;
const IdType* coo_row_data = static_cast<IdType*>(coo.row->data);
const IdType* coo_col_data = static_cast<IdType*>(coo.col->data);
const IdType *coo_row_data = static_cast<IdType *>(coo.row->data);
const IdType *coo_col_data = static_cast<IdType *>(coo.col->data);
for (int64_t i = 0; i < coo.row->shape[0]; ++i) {
if (coo_row_data[i] == row && coo_col_data[i] == col)
return true;
if (coo_row_data[i] == row && coo_col_data[i] == col) return true;
}
return false;
}
......@@ -51,9 +52,9 @@ NDArray COOIsNonZero(COOMatrix coo, NDArray row, NDArray col) {
const auto collen = col->shape[0];
const auto rstlen = std::max(rowlen, collen);
NDArray rst = NDArray::Empty({rstlen}, row->dtype, row->ctx);
IdType* rst_data = static_cast<IdType*>(rst->data);
const IdType* row_data = static_cast<IdType*>(row->data);
const IdType* col_data = static_cast<IdType*>(col->data);
IdType *rst_data = static_cast<IdType *>(rst->data);
const IdType *row_data = static_cast<IdType *>(row->data);
const IdType *col_data = static_cast<IdType *>(col->data);
const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1;
const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1;
const int64_t kmax = std::max(rowlen, collen);
......@@ -61,7 +62,8 @@ NDArray COOIsNonZero(COOMatrix coo, NDArray row, NDArray col) {
for (auto k = b; k < e; ++k) {
int64_t i = row_stride * k;
int64_t j = col_stride * k;
rst_data[k] = COOIsNonZero<XPU, IdType>(coo, row_data[i], col_data[j])? 1 : 0;
rst_data[k] =
COOIsNonZero<XPU, IdType>(coo, row_data[i], col_data[j]) ? 1 : 0;
}
});
return rst;
......@@ -75,11 +77,11 @@ template NDArray COOIsNonZero<kDGLCPU, int64_t>(COOMatrix, NDArray, NDArray);
template <DGLDeviceType XPU, typename IdType>
bool COOHasDuplicate(COOMatrix coo) {
std::unordered_set<std::pair<IdType, IdType>, PairHash> hashmap;
const IdType* src_data = static_cast<IdType*>(coo.row->data);
const IdType* dst_data = static_cast<IdType*>(coo.col->data);
const IdType *src_data = static_cast<IdType *>(coo.row->data);
const IdType *dst_data = static_cast<IdType *>(coo.col->data);
const auto nnz = coo.row->shape[0];
for (IdType eid = 0; eid < nnz; ++eid) {
const auto& p = std::make_pair(src_data[eid], dst_data[eid]);
const auto &p = std::make_pair(src_data[eid], dst_data[eid]);
if (hashmap.count(p)) {
return true;
} else {
......@@ -97,11 +99,10 @@ template bool COOHasDuplicate<kDGLCPU, int64_t>(COOMatrix coo);
template <DGLDeviceType XPU, typename IdType>
int64_t COOGetRowNNZ(COOMatrix coo, int64_t row) {
CHECK(row >= 0 && row < coo.num_rows) << "Invalid row index: " << row;
const IdType* coo_row_data = static_cast<IdType*>(coo.row->data);
const IdType *coo_row_data = static_cast<IdType *>(coo.row->data);
int64_t result = 0;
for (int64_t i = 0; i < coo.row->shape[0]; ++i) {
if (coo_row_data[i] == row)
++result;
if (coo_row_data[i] == row) ++result;
}
return result;
}
......@@ -113,9 +114,9 @@ template <DGLDeviceType XPU, typename IdType>
NDArray COOGetRowNNZ(COOMatrix coo, NDArray rows) {
CHECK_SAME_DTYPE(coo.col, rows);
const auto len = rows->shape[0];
const IdType* vid_data = static_cast<IdType*>(rows->data);
const IdType *vid_data = static_cast<IdType *>(rows->data);
NDArray rst = NDArray::Empty({len}, rows->dtype, rows->ctx);
IdType* rst_data = static_cast<IdType*>(rst->data);
IdType *rst_data = static_cast<IdType *>(rst->data);
#pragma omp parallel for
for (int64_t i = 0; i < len; ++i) {
rst_data[i] = COOGetRowNNZ<XPU, IdType>(coo, vid_data[i]);
......@@ -126,16 +127,17 @@ NDArray COOGetRowNNZ(COOMatrix coo, NDArray rows) {
template NDArray COOGetRowNNZ<kDGLCPU, int32_t>(COOMatrix, NDArray);
template NDArray COOGetRowNNZ<kDGLCPU, int64_t>(COOMatrix, NDArray);
///////////////////////////// COOGetRowDataAndIndices /////////////////////////////
////////////////////////// COOGetRowDataAndIndices /////////////////////////////
template <DGLDeviceType XPU, typename IdType>
std::pair<NDArray, NDArray> COOGetRowDataAndIndices(
COOMatrix coo, int64_t row) {
CHECK(row >= 0 && row < coo.num_rows) << "Invalid row index: " << row;
const IdType* coo_row_data = static_cast<IdType*>(coo.row->data);
const IdType* coo_col_data = static_cast<IdType*>(coo.col->data);
const IdType* coo_data = COOHasData(coo) ? static_cast<IdType*>(coo.data->data) : nullptr;
const IdType *coo_row_data = static_cast<IdType *>(coo.row->data);
const IdType *coo_col_data = static_cast<IdType *>(coo.col->data);
const IdType *coo_data =
COOHasData(coo) ? static_cast<IdType *>(coo.data->data) : nullptr;
std::vector<IdType> indices;
std::vector<IdType> data;
......@@ -147,13 +149,14 @@ std::pair<NDArray, NDArray> COOGetRowDataAndIndices(
}
}
return std::make_pair(NDArray::FromVector(data), NDArray::FromVector(indices));
return std::make_pair(
NDArray::FromVector(data), NDArray::FromVector(indices));
}
template std::pair<NDArray, NDArray>
COOGetRowDataAndIndices<kDGLCPU, int32_t>(COOMatrix, int64_t);
template std::pair<NDArray, NDArray>
COOGetRowDataAndIndices<kDGLCPU, int64_t>(COOMatrix, int64_t);
template std::pair<NDArray, NDArray> COOGetRowDataAndIndices<kDGLCPU, int32_t>(
COOMatrix, int64_t);
template std::pair<NDArray, NDArray> COOGetRowDataAndIndices<kDGLCPU, int64_t>(
COOMatrix, int64_t);
///////////////////////////// COOGetData /////////////////////////////
......@@ -162,34 +165,35 @@ IdArray COOGetData(COOMatrix coo, IdArray rows, IdArray cols) {
const int64_t rowlen = rows->shape[0];
const int64_t collen = cols->shape[0];
CHECK((rowlen == collen) || (rowlen == 1) || (collen == 1))
<< "Invalid row and col Id array:" << rows << " " << cols;
<< "Invalid row and col Id array:" << rows << " " << cols;
const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1;
const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1;
const IdType* row_data = rows.Ptr<IdType>();
const IdType* col_data = cols.Ptr<IdType>();
const IdType *row_data = rows.Ptr<IdType>();
const IdType *col_data = cols.Ptr<IdType>();
const IdType* coo_row = coo.row.Ptr<IdType>();
const IdType* coo_col = coo.col.Ptr<IdType>();
const IdType* data = COOHasData(coo) ? coo.data.Ptr<IdType>() : nullptr;
const IdType *coo_row = coo.row.Ptr<IdType>();
const IdType *coo_col = coo.col.Ptr<IdType>();
const IdType *data = COOHasData(coo) ? coo.data.Ptr<IdType>() : nullptr;
const int64_t nnz = coo.row->shape[0];
const int64_t retlen = std::max(rowlen, collen);
IdArray ret = Full(-1, retlen, rows->dtype.bits, rows->ctx);
IdType* ret_data = ret.Ptr<IdType>();
IdType *ret_data = ret.Ptr<IdType>();
// TODO(minjie): We might need to consider sorting the COO beforehand especially
// when the number of (row, col) pairs is large. Need more benchmarks to justify
// the choice.
// TODO(minjie): We might need to consider sorting the COO beforehand
// especially when the number of (row, col) pairs is large. Need more
// benchmarks to justify the choice.
if (coo.row_sorted) {
parallel_for(0, retlen, [&](size_t b, size_t e) {
for (auto p = b; p < e; ++p) {
const IdType row_id = row_data[p * row_stride], col_id = col_data[p * col_stride];
const IdType row_id = row_data[p * row_stride],
col_id = col_data[p * col_stride];
auto it = std::lower_bound(coo_row, coo_row + nnz, row_id);
for (; it < coo_row + nnz && *it == row_id; ++it) {
const auto idx = it - coo_row;
if (coo_col[idx] == col_id) {
ret_data[p] = data? data[idx] : idx;
ret_data[p] = data ? data[idx] : idx;
break;
}
}
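When coo.row_sorted holds, the branch above binary-searches the row array with std::lower_bound and then scans the contiguous run of that row, instead of scanning all nnz entries per query. A reduced sketch of that lookup (hypothetical helper, no coo.data indirection, edge id taken as the position in the arrays):

#include <algorithm>
#include <cstdint>
#include <vector>

// Look up the edge id of (row_id, col_id) in a row-sorted COO: binary-search
// the first occurrence of row_id, then scan that row's contiguous run for a
// matching column.  Returns -1 when the pair is absent.
int64_t SortedCooLookup(
    const std::vector<int64_t>& coo_row, const std::vector<int64_t>& coo_col,
    int64_t row_id, int64_t col_id) {
  auto it = std::lower_bound(coo_row.begin(), coo_row.end(), row_id);
  for (; it != coo_row.end() && *it == row_id; ++it) {
    const int64_t idx = it - coo_row.begin();
    if (coo_col[idx] == col_id) return idx;
  }
  return -1;
}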
......@@ -198,10 +202,11 @@ IdArray COOGetData(COOMatrix coo, IdArray rows, IdArray cols) {
} else {
#pragma omp parallel for
for (int64_t p = 0; p < retlen; ++p) {
const IdType row_id = row_data[p * row_stride], col_id = col_data[p * col_stride];
const IdType row_id = row_data[p * row_stride],
col_id = col_data[p * col_stride];
for (int64_t idx = 0; idx < nnz; ++idx) {
if (coo_row[idx] == row_id && coo_col[idx] == col_id) {
ret_data[p] = data? data[idx] : idx;
ret_data[p] = data ? data[idx] : idx;
break;
}
}
......@@ -217,8 +222,8 @@ template IdArray COOGetData<kDGLCPU, int64_t>(COOMatrix, IdArray, IdArray);
///////////////////////////// COOGetDataAndIndices /////////////////////////////
template <DGLDeviceType XPU, typename IdType>
std::vector<NDArray> COOGetDataAndIndices(COOMatrix coo, NDArray rows,
NDArray cols) {
std::vector<NDArray> COOGetDataAndIndices(
COOMatrix coo, NDArray rows, NDArray cols) {
CHECK_SAME_DTYPE(coo.col, rows);
CHECK_SAME_DTYPE(coo.col, cols);
const int64_t rowlen = rows->shape[0];
......@@ -226,16 +231,17 @@ std::vector<NDArray> COOGetDataAndIndices(COOMatrix coo, NDArray rows,
const int64_t len = std::max(rowlen, collen);
CHECK((rowlen == collen) || (rowlen == 1) || (collen == 1))
<< "Invalid row and col id array.";
<< "Invalid row and col id array.";
const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1;
const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1;
const IdType* row_data = static_cast<IdType*>(rows->data);
const IdType* col_data = static_cast<IdType*>(cols->data);
const IdType *row_data = static_cast<IdType *>(rows->data);
const IdType *col_data = static_cast<IdType *>(cols->data);
const IdType* coo_row_data = static_cast<IdType*>(coo.row->data);
const IdType* coo_col_data = static_cast<IdType*>(coo.col->data);
const IdType* data = COOHasData(coo) ? static_cast<IdType*>(coo.data->data) : nullptr;
const IdType *coo_row_data = static_cast<IdType *>(coo.row->data);
const IdType *coo_col_data = static_cast<IdType *>(coo.col->data);
const IdType *data =
COOHasData(coo) ? static_cast<IdType *>(coo.data->data) : nullptr;
std::vector<IdType> ret_rows, ret_cols;
std::vector<IdType> ret_data;
......@@ -244,21 +250,27 @@ std::vector<NDArray> COOGetDataAndIndices(COOMatrix coo, NDArray rows,
ret_data.reserve(len);
// NOTE(BarclayII): With a small number of lookups, linear scan is faster.
// The threshold 200 comes from benchmarking both algorithms on a P3.8x instance.
// I also tried sorting plus binary search. The speed gain is only significant for
// medium-sized graphs and lookups, so I didn't include it.
// The threshold 200 comes from benchmarking both algorithms on a P3.8x
// instance. I also tried sorting plus binary search. The speed gain is only
// significant for medium-sized graphs and lookups, so I didn't include it.
if (len >= 200) {
// TODO(BarclayII) Ideally we would want to cache this object. However I'm not sure
// what is the best way to do so since this object is valid for CPU only.
std::unordered_multimap<std::pair<IdType, IdType>, IdType, PairHash> pair_map;
// TODO(BarclayII) Ideally we would want to cache this object. However I'm
// not sure what is the best way to do so since this object is valid for CPU
// only.
std::unordered_multimap<std::pair<IdType, IdType>, IdType, PairHash>
pair_map;
pair_map.reserve(coo.row->shape[0]);
for (int64_t k = 0; k < coo.row->shape[0]; ++k)
pair_map.emplace(std::make_pair(coo_row_data[k], coo_col_data[k]), data ? data[k]: k);
pair_map.emplace(
std::make_pair(coo_row_data[k], coo_col_data[k]), data ? data[k] : k);
for (int64_t i = 0, j = 0; i < rowlen && j < collen; i += row_stride, j += col_stride) {
for (int64_t i = 0, j = 0; i < rowlen && j < collen;
i += row_stride, j += col_stride) {
const IdType row_id = row_data[i], col_id = col_data[j];
CHECK(row_id >= 0 && row_id < coo.num_rows) << "Invalid row index: " << row_id;
CHECK(col_id >= 0 && col_id < coo.num_cols) << "Invalid col index: " << col_id;
CHECK(row_id >= 0 && row_id < coo.num_rows)
<< "Invalid row index: " << row_id;
CHECK(col_id >= 0 && col_id < coo.num_cols)
<< "Invalid col index: " << col_id;
auto range = pair_map.equal_range({row_id, col_id});
for (auto it = range.first; it != range.second; ++it) {
ret_rows.push_back(row_id);
......@@ -267,10 +279,13 @@ std::vector<NDArray> COOGetDataAndIndices(COOMatrix coo, NDArray rows,
}
}
} else {
for (int64_t i = 0, j = 0; i < rowlen && j < collen; i += row_stride, j += col_stride) {
for (int64_t i = 0, j = 0; i < rowlen && j < collen;
i += row_stride, j += col_stride) {
const IdType row_id = row_data[i], col_id = col_data[j];
CHECK(row_id >= 0 && row_id < coo.num_rows) << "Invalid row index: " << row_id;
CHECK(col_id >= 0 && col_id < coo.num_cols) << "Invalid col index: " << col_id;
CHECK(row_id >= 0 && row_id < coo.num_rows)
<< "Invalid row index: " << row_id;
CHECK(col_id >= 0 && col_id < coo.num_cols)
<< "Invalid col index: " << col_id;
for (int64_t k = 0; k < coo.row->shape[0]; ++k) {
if (coo_row_data[k] == row_id && coo_col_data[k] == col_id) {
ret_rows.push_back(row_id);
......@@ -281,9 +296,9 @@ std::vector<NDArray> COOGetDataAndIndices(COOMatrix coo, NDArray rows,
}
}
return {NDArray::FromVector(ret_rows),
NDArray::FromVector(ret_cols),
NDArray::FromVector(ret_data)};
return {
NDArray::FromVector(ret_rows), NDArray::FromVector(ret_cols),
NDArray::FromVector(ret_data)};
}
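Per the NOTE above, COOGetDataAndIndices switches strategy by query count: below roughly 200 lookups it linearly scans the nnz entries for each query, above that it first builds a hash index keyed on (row, col) pairs. A sketch of the index-building half, with a hand-rolled pair hash standing in for DGL's PairHash (illustrative names):

#include <cstddef>
#include <cstdint>
#include <unordered_map>
#include <utility>
#include <vector>

// Hash functor for (row, col) keys; stands in for DGL's PairHash.
struct EdgeKeyHash {
  std::size_t operator()(const std::pair<int64_t, int64_t>& p) const {
    return std::hash<int64_t>()(p.first) * 1000003u ^
           std::hash<int64_t>()(p.second);
  }
};

// Index every (row, col) -> edge id once so that each of the many subsequent
// queries becomes an expected O(1) equal_range lookup.  For only a handful of
// queries, the per-query linear scan wins because it skips this build cost.
std::unordered_multimap<std::pair<int64_t, int64_t>, int64_t, EdgeKeyHash>
BuildEdgeIndex(
    const std::vector<int64_t>& row, const std::vector<int64_t>& col) {
  std::unordered_multimap<std::pair<int64_t, int64_t>, int64_t, EdgeKeyHash> m;
  m.reserve(row.size());
  for (std::size_t k = 0; k < row.size(); ++k)
    m.emplace(std::make_pair(row[k], col[k]), static_cast<int64_t>(k));
  return m;
}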
template std::vector<NDArray> COOGetDataAndIndices<kDGLCPU, int32_t>(
......@@ -304,7 +319,8 @@ template COOMatrix COOTranspose<kDGLCPU, int64_t>(COOMatrix coo);
///////////////////////////// COOToCSR /////////////////////////////
namespace {
template <class IdType> CSRMatrix SortedCOOToCSR(const COOMatrix &coo) {
template <class IdType>
CSRMatrix SortedCOOToCSR(const COOMatrix &coo) {
const int64_t N = coo.num_rows;
const int64_t NNZ = coo.row->shape[0];
const IdType *const row_data = static_cast<IdType *>(coo.row->data);
......@@ -389,11 +405,13 @@ template <class IdType> CSRMatrix SortedCOOToCSR(const COOMatrix &coo) {
std::fill(Bp, Bp + N + 1, 0);
}
return CSRMatrix(coo.num_rows, coo.num_cols, ret_indptr, ret_indices,
ret_data, coo.col_sorted);
return CSRMatrix(
coo.num_rows, coo.num_cols, ret_indptr, ret_indices, ret_data,
coo.col_sorted);
}
template <class IdType> CSRMatrix UnSortedSparseCOOToCSR(const COOMatrix &coo) {
template <class IdType>
CSRMatrix UnSortedSparseCOOToCSR(const COOMatrix &coo) {
const int64_t N = coo.num_rows;
const int64_t NNZ = coo.row->shape[0];
const IdType *const row_data = static_cast<IdType *>(coo.row->data);
......@@ -507,11 +525,13 @@ template <class IdType> CSRMatrix UnSortedSparseCOOToCSR(const COOMatrix &coo) {
Bp[i + 1] += i_start;
}
}
return CSRMatrix(coo.num_rows, coo.num_cols, ret_indptr, ret_indices,
ret_data, coo.col_sorted);
return CSRMatrix(
coo.num_rows, coo.num_cols, ret_indptr, ret_indices, ret_data,
coo.col_sorted);
}
template <class IdType> CSRMatrix UnSortedDenseCOOToCSR(const COOMatrix &coo) {
template <class IdType>
CSRMatrix UnSortedDenseCOOToCSR(const COOMatrix &coo) {
const int64_t N = coo.num_rows;
const int64_t NNZ = coo.row->shape[0];
const IdType *const row_data = static_cast<IdType *>(coo.row->data);
......@@ -597,8 +617,9 @@ template <class IdType> CSRMatrix UnSortedDenseCOOToCSR(const COOMatrix &coo) {
}
CHECK_EQ(Bp[N], NNZ);
return CSRMatrix(coo.num_rows, coo.num_cols, ret_indptr, ret_indices,
ret_data, coo.col_sorted);
return CSRMatrix(
coo.num_rows, coo.num_cols, ret_indptr, ret_indices, ret_data,
coo.col_sorted);
}
} // namespace
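The three conversions above (sorted, unsorted-sparse, unsorted-dense) all reduce to building a CSR indptr from the COO row ids and then placing column ids and edge data. A minimal illustration of the indptr construction by per-row counting plus a prefix sum; for a row-sorted COO the col array can then be reused verbatim as the CSR indices, which is what makes the sorted path cheap. This helper is illustrative, not the DGL implementation.

#include <cstdint>
#include <vector>

// Build a CSR indptr from COO row ids: histogram the rows, then prefix-sum.
std::vector<int64_t> CooRowsToCsrIndptr(
    const std::vector<int64_t>& coo_row, int64_t num_rows) {
  std::vector<int64_t> indptr(num_rows + 1, 0);
  for (int64_t r : coo_row) ++indptr[r + 1];  // count entries per row
  for (int64_t i = 0; i < num_rows; ++i)      // prefix sum -> row offsets
    indptr[i + 1] += indptr[i];
  return indptr;
}

For coo_row = {0, 0, 2} and num_rows = 3 this yields indptr = {0, 2, 2, 3}.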
......@@ -643,9 +664,10 @@ COOMatrix COOSliceRows(COOMatrix coo, int64_t start, int64_t end) {
CHECK(start >= 0 && start < coo.num_rows) << "Invalid start row " << start;
CHECK(end > 0 && end <= coo.num_rows) << "Invalid end row " << end;
const IdType* coo_row_data = static_cast<IdType*>(coo.row->data);
const IdType* coo_col_data = static_cast<IdType*>(coo.col->data);
const IdType* coo_data = COOHasData(coo) ? static_cast<IdType*>(coo.data->data) : nullptr;
const IdType *coo_row_data = static_cast<IdType *>(coo.row->data);
const IdType *coo_col_data = static_cast<IdType *>(coo.col->data);
const IdType *coo_data =
COOHasData(coo) ? static_cast<IdType *>(coo.data->data) : nullptr;
std::vector<IdType> ret_row, ret_col;
std::vector<IdType> ret_data;
......@@ -660,13 +682,9 @@ COOMatrix COOSliceRows(COOMatrix coo, int64_t start, int64_t end) {
}
}
return COOMatrix(
end - start,
coo.num_cols,
NDArray::FromVector(ret_row),
NDArray::FromVector(ret_col),
NDArray::FromVector(ret_data),
coo.row_sorted,
coo.col_sorted);
end - start, coo.num_cols, NDArray::FromVector(ret_row),
NDArray::FromVector(ret_col), NDArray::FromVector(ret_data),
coo.row_sorted, coo.col_sorted);
}
template COOMatrix COOSliceRows<kDGLCPU, int32_t>(COOMatrix, int64_t, int64_t);
......@@ -674,9 +692,10 @@ template COOMatrix COOSliceRows<kDGLCPU, int64_t>(COOMatrix, int64_t, int64_t);
template <DGLDeviceType XPU, typename IdType>
COOMatrix COOSliceRows(COOMatrix coo, NDArray rows) {
const IdType* coo_row_data = static_cast<IdType*>(coo.row->data);
const IdType* coo_col_data = static_cast<IdType*>(coo.col->data);
const IdType* coo_data = COOHasData(coo) ? static_cast<IdType*>(coo.data->data) : nullptr;
const IdType *coo_row_data = static_cast<IdType *>(coo.row->data);
const IdType *coo_col_data = static_cast<IdType *>(coo.col->data);
const IdType *coo_data =
COOHasData(coo) ? static_cast<IdType *>(coo.data->data) : nullptr;
std::vector<IdType> ret_row, ret_col;
std::vector<IdType> ret_data;
......@@ -695,24 +714,27 @@ COOMatrix COOSliceRows(COOMatrix coo, NDArray rows) {
}
return COOMatrix{
rows->shape[0],
coo.num_cols,
NDArray::FromVector(ret_row),
NDArray::FromVector(ret_col),
NDArray::FromVector(ret_data),
coo.row_sorted, coo.col_sorted};
rows->shape[0],
coo.num_cols,
NDArray::FromVector(ret_row),
NDArray::FromVector(ret_col),
NDArray::FromVector(ret_data),
coo.row_sorted,
coo.col_sorted};
}
template COOMatrix COOSliceRows<kDGLCPU, int32_t>(COOMatrix , NDArray);
template COOMatrix COOSliceRows<kDGLCPU, int64_t>(COOMatrix , NDArray);
template COOMatrix COOSliceRows<kDGLCPU, int32_t>(COOMatrix, NDArray);
template COOMatrix COOSliceRows<kDGLCPU, int64_t>(COOMatrix, NDArray);
///////////////////////////// COOSliceMatrix /////////////////////////////
template <DGLDeviceType XPU, typename IdType>
COOMatrix COOSliceMatrix(COOMatrix coo, runtime::NDArray rows, runtime::NDArray cols) {
const IdType* coo_row_data = static_cast<IdType*>(coo.row->data);
const IdType* coo_col_data = static_cast<IdType*>(coo.col->data);
const IdType* coo_data = COOHasData(coo) ? static_cast<IdType*>(coo.data->data) : nullptr;
COOMatrix COOSliceMatrix(
COOMatrix coo, runtime::NDArray rows, runtime::NDArray cols) {
const IdType *coo_row_data = static_cast<IdType *>(coo.row->data);
const IdType *coo_col_data = static_cast<IdType *>(coo.col->data);
const IdType *coo_data =
COOHasData(coo) ? static_cast<IdType *>(coo.data->data) : nullptr;
IdHashMap<IdType> row_map(rows), col_map(cols);
......@@ -733,11 +755,10 @@ COOMatrix COOSliceMatrix(COOMatrix coo, runtime::NDArray rows, runtime::NDArray
}
}
return COOMatrix(rows->shape[0], cols->shape[0],
NDArray::FromVector(ret_row),
NDArray::FromVector(ret_col),
NDArray::FromVector(ret_data),
coo.row_sorted, coo.col_sorted);
return COOMatrix(
rows->shape[0], cols->shape[0], NDArray::FromVector(ret_row),
NDArray::FromVector(ret_col), NDArray::FromVector(ret_data),
coo.row_sorted, coo.col_sorted);
}
template COOMatrix COOSliceMatrix<kDGLCPU, int32_t>(
......@@ -745,36 +766,38 @@ template COOMatrix COOSliceMatrix<kDGLCPU, int32_t>(
template COOMatrix COOSliceMatrix<kDGLCPU, int64_t>(
COOMatrix coo, runtime::NDArray rows, runtime::NDArray cols);
///////////////////////////// COOReorder /////////////////////////////
template <DGLDeviceType XPU, typename IdType>
COOMatrix COOReorder(COOMatrix coo, runtime::NDArray new_row_id_arr,
runtime::NDArray new_col_id_arr) {
COOMatrix COOReorder(
COOMatrix coo, runtime::NDArray new_row_id_arr,
runtime::NDArray new_col_id_arr) {
CHECK_SAME_DTYPE(coo.row, new_row_id_arr);
CHECK_SAME_DTYPE(coo.col, new_col_id_arr);
// Input COO
const IdType* in_rows = static_cast<IdType*>(coo.row->data);
const IdType* in_cols = static_cast<IdType*>(coo.col->data);
const IdType *in_rows = static_cast<IdType *>(coo.row->data);
const IdType *in_cols = static_cast<IdType *>(coo.col->data);
int64_t num_rows = coo.num_rows;
int64_t num_cols = coo.num_cols;
int64_t nnz = coo.row->shape[0];
CHECK_EQ(num_rows, new_row_id_arr->shape[0])
<< "The new row Id array needs to be the same as the number of rows of COO";
<< "The new row Id array needs to be the same as the number of rows of "
"COO";
CHECK_EQ(num_cols, new_col_id_arr->shape[0])
<< "The new col Id array needs to be the same as the number of cols of COO";
<< "The new col Id array needs to be the same as the number of cols of "
"COO";
// New row/col Ids.
const IdType* new_row_ids = static_cast<IdType*>(new_row_id_arr->data);
const IdType* new_col_ids = static_cast<IdType*>(new_col_id_arr->data);
const IdType *new_row_ids = static_cast<IdType *>(new_row_id_arr->data);
const IdType *new_col_ids = static_cast<IdType *>(new_col_id_arr->data);
// Output COO
NDArray out_row_arr = NDArray::Empty({nnz}, coo.row->dtype, coo.row->ctx);
NDArray out_col_arr = NDArray::Empty({nnz}, coo.col->dtype, coo.col->ctx);
NDArray out_data_arr = COOHasData(coo) ? coo.data : NullArray();
IdType *out_row = static_cast<IdType*>(out_row_arr->data);
IdType *out_col = static_cast<IdType*>(out_col_arr->data);
IdType *out_row = static_cast<IdType *>(out_row_arr->data);
IdType *out_col = static_cast<IdType *>(out_col_arr->data);
parallel_for(0, nnz, [=](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
......@@ -785,10 +808,10 @@ COOMatrix COOReorder(COOMatrix coo, runtime::NDArray new_row_id_arr,
return COOMatrix(num_rows, num_cols, out_row_arr, out_col_arr, out_data_arr);
}
template COOMatrix COOReorder<kDGLCPU, int64_t>(COOMatrix csr, runtime::NDArray new_row_ids,
runtime::NDArray new_col_ids);
template COOMatrix COOReorder<kDGLCPU, int32_t>(COOMatrix csr, runtime::NDArray new_row_ids,
runtime::NDArray new_col_ids);
template COOMatrix COOReorder<kDGLCPU, int64_t>(
COOMatrix csr, runtime::NDArray new_row_ids, runtime::NDArray new_col_ids);
template COOMatrix COOReorder<kDGLCPU, int32_t>(
COOMatrix csr, runtime::NDArray new_row_ids, runtime::NDArray new_col_ids);
} // namespace impl
} // namespace aten
......
......@@ -5,10 +5,12 @@
*/
#include <dgl/array.h>
#include <dgl/runtime/parallel_for.h>
#include <vector>
#include <unordered_set>
#include <numeric>
#include <atomic>
#include <numeric>
#include <unordered_set>
#include <vector>
#include "array_utils.h"
namespace dgl {
......@@ -26,8 +28,8 @@ bool CSRIsNonZero(CSRMatrix csr, int64_t row, int64_t col) {
const IdType* indptr_data = static_cast<IdType*>(csr.indptr->data);
const IdType* indices_data = static_cast<IdType*>(csr.indices->data);
if (csr.sorted) {
const IdType *start = indices_data + indptr_data[row];
const IdType *end = indices_data + indptr_data[row + 1];
const IdType* start = indices_data + indptr_data[row];
const IdType* end = indices_data + indptr_data[row + 1];
return std::binary_search(start, end, col);
} else {
for (IdType i = indptr_data[row]; i < indptr_data[row + 1]; ++i) {
......@@ -53,12 +55,15 @@ NDArray CSRIsNonZero(CSRMatrix csr, NDArray row, NDArray col) {
const IdType* col_data = static_cast<IdType*>(col->data);
const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1;
const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1;
runtime::parallel_for(0, std::max(rowlen, collen), 1, [=](int64_t b, int64_t e) {
int64_t i = (row_stride == 0) ? 0 : b;
int64_t j = (col_stride == 0) ? 0 : b;
for (int64_t k = b; i < e && j < e; i += row_stride, j += col_stride, ++k)
rst_data[k] = CSRIsNonZero<XPU, IdType>(csr, row_data[i], col_data[j]) ? 1 : 0;
});
runtime::parallel_for(
0, std::max(rowlen, collen), 1, [=](int64_t b, int64_t e) {
int64_t i = (row_stride == 0) ? 0 : b;
int64_t j = (col_stride == 0) ? 0 : b;
for (int64_t k = b; i < e && j < e;
i += row_stride, j += col_stride, ++k)
rst_data[k] =
CSRIsNonZero<XPU, IdType>(csr, row_data[i], col_data[j]) ? 1 : 0;
});
return rst;
}
......@@ -73,7 +78,7 @@ bool CSRHasDuplicate(CSRMatrix csr) {
const IdType* indices_data = static_cast<IdType*>(csr.indices->data);
for (IdType src = 0; src < csr.num_rows; ++src) {
std::unordered_set<IdType> hashmap;
for (IdType eid = indptr_data[src]; eid < indptr_data[src+1]; ++eid) {
for (IdType eid = indptr_data[src]; eid < indptr_data[src + 1]; ++eid) {
const IdType dst = indices_data[eid];
if (hashmap.count(dst)) {
return true;
......@@ -117,7 +122,7 @@ NDArray CSRGetRowNNZ(CSRMatrix csr, NDArray rows) {
template NDArray CSRGetRowNNZ<kDGLCPU, int32_t>(CSRMatrix, NDArray);
template NDArray CSRGetRowNNZ<kDGLCPU, int64_t>(CSRMatrix, NDArray);
///////////////////////////// CSRGetRowColumnIndices /////////////////////////////
/////////////////////////// CSRGetRowColumnIndices /////////////////////////////
template <DGLDeviceType XPU, typename IdType>
NDArray CSRGetRowColumnIndices(CSRMatrix csr, int64_t row) {
......@@ -140,7 +145,8 @@ NDArray CSRGetRowData(CSRMatrix csr, int64_t row) {
if (CSRHasData(csr))
return csr.data.CreateView({len}, csr.data->dtype, offset);
else
return aten::Range(offset, offset + len, csr.indptr->dtype.bits, csr.indptr->ctx);
return aten::Range(
offset, offset + len, csr.indptr->dtype.bits, csr.indptr->ctx);
}
template NDArray CSRGetRowData<kDGLCPU, int32_t>(CSRMatrix, int64_t);
......@@ -150,12 +156,12 @@ template NDArray CSRGetRowData<kDGLCPU, int64_t>(CSRMatrix, int64_t);
///////////////////////////// CSRGetDataAndIndices /////////////////////////////
template <DGLDeviceType XPU, typename IdType>
void CollectDataIndicesFromSorted(const IdType *indices_data, const IdType *data,
const IdType start, const IdType end, const IdType col,
std::vector<IdType> *col_vec,
std::vector<IdType> *ret_vec) {
const IdType *start_ptr = indices_data + start;
const IdType *end_ptr = indices_data + end;
void CollectDataIndicesFromSorted(
const IdType* indices_data, const IdType* data, const IdType start,
const IdType end, const IdType col, std::vector<IdType>* col_vec,
std::vector<IdType>* ret_vec) {
const IdType* start_ptr = indices_data + start;
const IdType* end_ptr = indices_data + end;
auto it = std::lower_bound(start_ptr, end_ptr, col);
// This might be a multi-graph. We need to collect all of the matched
// columns.
......@@ -173,13 +179,15 @@ void CollectDataIndicesFromSorted(const IdType *indices_data, const IdType *data
}
template <DGLDeviceType XPU, typename IdType>
std::vector<NDArray> CSRGetDataAndIndices(CSRMatrix csr, NDArray rows, NDArray cols) {
// TODO(minjie): more efficient implementation for matrix without duplicate entries
std::vector<NDArray> CSRGetDataAndIndices(
CSRMatrix csr, NDArray rows, NDArray cols) {
// TODO(minjie): more efficient implementation for matrix without duplicate
// entries
const int64_t rowlen = rows->shape[0];
const int64_t collen = cols->shape[0];
CHECK((rowlen == collen) || (rowlen == 1) || (collen == 1))
<< "Invalid row and col id array.";
<< "Invalid row and col id array.";
const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1;
const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1;
......@@ -188,40 +196,43 @@ std::vector<NDArray> CSRGetDataAndIndices(CSRMatrix csr, NDArray rows, NDArray c
const IdType* indptr_data = static_cast<IdType*>(csr.indptr->data);
const IdType* indices_data = static_cast<IdType*>(csr.indices->data);
const IdType* data = CSRHasData(csr)? static_cast<IdType*>(csr.data->data) : nullptr;
const IdType* data =
CSRHasData(csr) ? static_cast<IdType*>(csr.data->data) : nullptr;
std::vector<IdType> ret_rows, ret_cols;
std::vector<IdType> ret_data;
for (int64_t i = 0, j = 0; i < rowlen && j < collen; i += row_stride, j += col_stride) {
for (int64_t i = 0, j = 0; i < rowlen && j < collen;
i += row_stride, j += col_stride) {
const IdType row_id = row_data[i], col_id = col_data[j];
CHECK(row_id >= 0 && row_id < csr.num_rows) << "Invalid row index: " << row_id;
CHECK(col_id >= 0 && col_id < csr.num_cols) << "Invalid col index: " << col_id;
CHECK(row_id >= 0 && row_id < csr.num_rows)
<< "Invalid row index: " << row_id;
CHECK(col_id >= 0 && col_id < csr.num_cols)
<< "Invalid col index: " << col_id;
if (csr.sorted) {
// Here we collect col indices and data.
CollectDataIndicesFromSorted<XPU, IdType>(indices_data, data,
indptr_data[row_id],
indptr_data[row_id + 1],
col_id, &ret_cols,
&ret_data);
CollectDataIndicesFromSorted<XPU, IdType>(
indices_data, data, indptr_data[row_id], indptr_data[row_id + 1],
col_id, &ret_cols, &ret_data);
// We need to add row Ids.
while (ret_rows.size() < ret_data.size()) {
ret_rows.push_back(row_id);
}
} else {
for (IdType i = indptr_data[row_id]; i < indptr_data[row_id+1]; ++i) {
for (IdType i = indptr_data[row_id]; i < indptr_data[row_id + 1]; ++i) {
if (indices_data[i] == col_id) {
ret_rows.push_back(row_id);
ret_cols.push_back(col_id);
ret_data.push_back(data? data[i] : i);
ret_data.push_back(data ? data[i] : i);
}
}
}
}
return {NDArray::FromVector(ret_rows, csr.indptr->ctx),
NDArray::FromVector(ret_cols, csr.indptr->ctx),
NDArray::FromVector(ret_data, csr.data->ctx)};
return {
NDArray::FromVector(ret_rows, csr.indptr->ctx),
NDArray::FromVector(ret_cols, csr.indptr->ctx),
NDArray::FromVector(ret_data, csr.data->ctx)};
}
template std::vector<NDArray> CSRGetDataAndIndices<kDGLCPU, int32_t>(
......@@ -240,9 +251,12 @@ CSRMatrix CSRTranspose(CSRMatrix csr) {
const int64_t nnz = csr.indices->shape[0];
const IdType* Ap = static_cast<IdType*>(csr.indptr->data);
const IdType* Aj = static_cast<IdType*>(csr.indices->data);
const IdType* Ax = CSRHasData(csr)? static_cast<IdType*>(csr.data->data) : nullptr;
NDArray ret_indptr = NDArray::Empty({M + 1}, csr.indptr->dtype, csr.indptr->ctx);
NDArray ret_indices = NDArray::Empty({nnz}, csr.indices->dtype, csr.indices->ctx);
const IdType* Ax =
CSRHasData(csr) ? static_cast<IdType*>(csr.data->data) : nullptr;
NDArray ret_indptr =
NDArray::Empty({M + 1}, csr.indptr->dtype, csr.indptr->ctx);
NDArray ret_indices =
NDArray::Empty({nnz}, csr.indices->dtype, csr.indices->ctx);
NDArray ret_data = NDArray::Empty({nnz}, csr.indptr->dtype, csr.indptr->ctx);
IdType* Bp = static_cast<IdType*>(ret_indptr->data);
IdType* Bi = static_cast<IdType*>(ret_indices->data);
......@@ -263,10 +277,10 @@ CSRMatrix CSRTranspose(CSRMatrix csr) {
Bp[M] = nnz;
for (int64_t i = 0; i < N; ++i) {
for (IdType j = Ap[i]; j < Ap[i+1]; ++j) {
for (IdType j = Ap[i]; j < Ap[i + 1]; ++j) {
const IdType dst = Aj[j];
Bi[Bp[dst]] = i;
Bx[Bp[dst]] = Ax? Ax[j] : j;
Bx[Bp[dst]] = Ax ? Ax[j] : j;
Bp[dst]++;
}
}
......@@ -278,7 +292,8 @@ CSRMatrix CSRTranspose(CSRMatrix csr) {
last = temp;
}
return CSRMatrix{csr.num_cols, csr.num_rows, ret_indptr, ret_indices, ret_data};
return CSRMatrix{
csr.num_cols, csr.num_rows, ret_indptr, ret_indices, ret_data};
}
template CSRMatrix CSRTranspose<kDGLCPU, int32_t>(CSRMatrix csr);
......@@ -293,14 +308,13 @@ COOMatrix CSRToCOO(CSRMatrix csr) {
IdType* ret_row_data = static_cast<IdType*>(ret_row->data);
parallel_for(0, csr.indptr->shape[0] - 1, 10000, [=](int64_t b, int64_t e) {
for (auto i = b; i < e; ++i) {
std::fill(ret_row_data + indptr_data[i],
ret_row_data + indptr_data[i + 1],
i);
std::fill(
ret_row_data + indptr_data[i], ret_row_data + indptr_data[i + 1], i);
}
});
return COOMatrix(csr.num_rows, csr.num_cols,
ret_row, csr.indices, csr.data,
true, csr.sorted);
return COOMatrix(
csr.num_rows, csr.num_cols, ret_row, csr.indices, csr.data, true,
csr.sorted);
}
template COOMatrix CSRToCOO<kDGLCPU, int32_t>(CSRMatrix csr);
......@@ -315,7 +329,8 @@ COOMatrix CSRToCOODataAsOrder(CSRMatrix csr) {
const IdType* indptr_data = static_cast<IdType*>(csr.indptr->data);
const IdType* indices_data = static_cast<IdType*>(csr.indices->data);
// data array should have the same type as the indices arrays
const IdType* data = CSRHasData(csr) ? static_cast<IdType*>(csr.data->data) : nullptr;
const IdType* data =
CSRHasData(csr) ? static_cast<IdType*>(csr.data->data) : nullptr;
NDArray ret_row = NDArray::Empty({nnz}, csr.indices->dtype, csr.indices->ctx);
NDArray ret_col = NDArray::Empty({nnz}, csr.indices->dtype, csr.indices->ctx);
IdType* ret_row_data = static_cast<IdType*>(ret_row->data);
......@@ -343,7 +358,8 @@ CSRMatrix CSRSliceRows(CSRMatrix csr, int64_t start, int64_t end) {
const IdType* indptr = static_cast<IdType*>(csr.indptr->data);
const int64_t num_rows = end - start;
const int64_t nnz = indptr[end] - indptr[start];
IdArray ret_indptr = IdArray::Empty({num_rows + 1}, csr.indptr->dtype, csr.indices->ctx);
IdArray ret_indptr =
IdArray::Empty({num_rows + 1}, csr.indptr->dtype, csr.indices->ctx);
IdType* r_indptr = static_cast<IdType*>(ret_indptr->data);
for (int64_t i = start; i < end + 1; ++i) {
r_indptr[i - start] = indptr[i] - indptr[start];
......@@ -353,13 +369,13 @@ CSRMatrix CSRSliceRows(CSRMatrix csr, int64_t start, int64_t end) {
{nnz}, csr.indices->dtype, indptr[start] * sizeof(IdType));
IdArray ret_data;
if (CSRHasData(csr))
ret_data = csr.data.CreateView({nnz}, csr.data->dtype, indptr[start] * sizeof(IdType));
ret_data = csr.data.CreateView(
{nnz}, csr.data->dtype, indptr[start] * sizeof(IdType));
else
ret_data = aten::Range(indptr[start], indptr[end],
csr.indptr->dtype.bits, csr.indptr->ctx);
return CSRMatrix(num_rows, csr.num_cols,
ret_indptr, ret_indices, ret_data,
csr.sorted);
ret_data = aten::Range(
indptr[start], indptr[end], csr.indptr->dtype.bits, csr.indptr->ctx);
return CSRMatrix(
num_rows, csr.num_cols, ret_indptr, ret_indices, ret_data, csr.sorted);
}
template CSRMatrix CSRSliceRows<kDGLCPU, int32_t>(CSRMatrix, int64_t, int64_t);
......@@ -370,7 +386,8 @@ CSRMatrix CSRSliceRows(CSRMatrix csr, NDArray rows) {
CHECK_SAME_DTYPE(csr.indices, rows);
const IdType* indptr_data = static_cast<IdType*>(csr.indptr->data);
const IdType* indices_data = static_cast<IdType*>(csr.indices->data);
const IdType* data = CSRHasData(csr)? static_cast<IdType*>(csr.data->data) : nullptr;
const IdType* data =
CSRHasData(csr) ? static_cast<IdType*>(csr.data->data) : nullptr;
const auto len = rows->shape[0];
const IdType* rows_data = static_cast<IdType*>(rows->data);
int64_t nnz = 0;
......@@ -389,28 +406,28 @@ CSRMatrix CSRSliceRows(CSRMatrix csr, NDArray rows) {
bool err = false;
std::stringstream err_msg_stream;
// Perform two-round parallel prefix sum using OpenMP
#pragma omp parallel
// Perform two-round parallel prefix sum using OpenMP
#pragma omp parallel
{
int64_t tid = omp_get_thread_num();
int64_t num_threads = omp_get_num_threads();
#pragma omp single
#pragma omp single
{
sums.resize(num_threads + 1);
sums[0] = 0;
sums.resize(num_threads + 1);
sums[0] = 0;
}
int64_t sum = 0;
// First round of parallel prefix sum. All threads perform local prefix sums.
#pragma omp for schedule(static) nowait
// First round of parallel prefix sum. All threads perform local prefix sums.
#pragma omp for schedule(static) nowait
for (int64_t i = 0; i < len; ++i) {
int64_t rid = rows_data[i];
if (rid >= csr.num_rows) {
if (!err_flag.test_and_set()) {
err_msg_stream << "expect row ID " << rid << " to be less than number of rows "
<< csr.num_rows;
err_msg_stream << "expect row ID " << rid
<< " to be less than number of rows " << csr.num_rows;
err = true;
}
} else {
......@@ -419,20 +436,18 @@ CSRMatrix CSRSliceRows(CSRMatrix csr, NDArray rows) {
}
}
sums[tid + 1] = sum;
#pragma omp barrier
#pragma omp barrier
#pragma omp single
#pragma omp single
{
for (int64_t i = 1; i < num_threads; ++i)
sums[i] += sums[i - 1];
for (int64_t i = 1; i < num_threads; ++i) sums[i] += sums[i - 1];
}
int64_t offset = sums[tid];
// Second round of parallel prefix sum. Update the local prefix sums.
#pragma omp for schedule(static)
for (int64_t i = 0; i < len; ++i)
ret_indptr_data[i + 1] += offset;
// Second round of parallel prefix sum. Update the local prefix sums.
#pragma omp for schedule(static)
for (int64_t i = 0; i < len; ++i) ret_indptr_data[i + 1] += offset;
}
if (err) {
LOG(FATAL) << err_msg_stream.str();
......@@ -454,26 +469,30 @@ CSRMatrix CSRSliceRows(CSRMatrix csr, NDArray rows) {
for (auto i = b; i < e; ++i) {
const IdType rid = rows_data[i];
// note: zero is allowed
std::copy(indices_data + indptr_data[rid], indices_data + indptr_data[rid + 1],
ret_indices_data + ret_indptr_data[i]);
std::copy(
indices_data + indptr_data[rid], indices_data + indptr_data[rid + 1],
ret_indices_data + ret_indptr_data[i]);
if (data)
std::copy(data + indptr_data[rid], data + indptr_data[rid + 1],
ret_data + ret_indptr_data[i]);
std::copy(
data + indptr_data[rid], data + indptr_data[rid + 1],
ret_data + ret_indptr_data[i]);
else
std::iota(ret_data + ret_indptr_data[i], ret_data + ret_indptr_data[i + 1],
indptr_data[rid]);
std::iota(
ret_data + ret_indptr_data[i], ret_data + ret_indptr_data[i + 1],
indptr_data[rid]);
}
});
return ret;
}
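CSRSliceRows fills ret_indptr with a two-round parallel prefix sum: each thread first prefix-sums the row lengths in its statically scheduled chunk, one thread then prefix-sums the per-thread totals, and a second statically scheduled loop (which OpenMP maps to the same chunks) adds each thread's offset. A standalone sketch of that pattern (illustrative helper, not the DGL routine):

#include <omp.h>

#include <cstdint>
#include <vector>

// Two-round parallel inclusive prefix sum: round 1, every thread prefix-sums
// its statically scheduled chunk; one thread then prefix-sums the per-thread
// totals; round 2, every thread adds the total of all preceding chunks to its
// chunk.  Both loops use the same static schedule, so OpenMP assigns each
// thread the same iterations in both rounds.
void ParallelPrefixSum(std::vector<int64_t>* vals) {
  const int64_t n = static_cast<int64_t>(vals->size());
  std::vector<int64_t> sums;
  #pragma omp parallel
  {
    const int tid = omp_get_thread_num();
    const int nthr = omp_get_num_threads();
    #pragma omp single
    {
      sums.assign(nthr + 1, 0);
    }
    int64_t local = 0;
    #pragma omp for schedule(static) nowait
    for (int64_t i = 0; i < n; ++i) {  // round 1: local prefix sums
      local += (*vals)[i];
      (*vals)[i] = local;
    }
    sums[tid + 1] = local;
    #pragma omp barrier
    #pragma omp single
    {
      for (int i = 1; i <= nthr; ++i) sums[i] += sums[i - 1];
    }
    #pragma omp for schedule(static)
    for (int64_t i = 0; i < n; ++i)  // round 2: add preceding chunk totals
      (*vals)[i] += sums[tid];
  }
}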
template CSRMatrix CSRSliceRows<kDGLCPU, int32_t>(CSRMatrix , NDArray);
template CSRMatrix CSRSliceRows<kDGLCPU, int64_t>(CSRMatrix , NDArray);
template CSRMatrix CSRSliceRows<kDGLCPU, int32_t>(CSRMatrix, NDArray);
template CSRMatrix CSRSliceRows<kDGLCPU, int64_t>(CSRMatrix, NDArray);
///////////////////////////// CSRSliceMatrix /////////////////////////////
template <DGLDeviceType XPU, typename IdType>
CSRMatrix CSRSliceMatrix(CSRMatrix csr, runtime::NDArray rows, runtime::NDArray cols) {
CSRMatrix CSRSliceMatrix(
CSRMatrix csr, runtime::NDArray rows, runtime::NDArray cols) {
IdHashMap<IdType> hashmap(cols);
const int64_t new_nrows = rows->shape[0];
const int64_t new_ncols = cols->shape[0];
......@@ -482,7 +501,8 @@ CSRMatrix CSRSliceMatrix(CSRMatrix csr, runtime::NDArray rows, runtime::NDArray
const IdType* indptr_data = static_cast<IdType*>(csr.indptr->data);
const IdType* indices_data = static_cast<IdType*>(csr.indices->data);
const IdType* data = has_data? static_cast<IdType*>(csr.data->data) : nullptr;
const IdType* data =
has_data ? static_cast<IdType*>(csr.data->data) : nullptr;
std::vector<IdType> sub_indptr, sub_indices;
std::vector<IdType> sub_data;
......@@ -498,7 +518,7 @@ CSRMatrix CSRSliceMatrix(CSRMatrix csr, runtime::NDArray rows, runtime::NDArray
if (newj != kInvalidId) {
++sub_indptr[i];
sub_indices.push_back(newj);
sub_data.push_back(has_data? data[p] : p);
sub_data.push_back(has_data ? data[p] : p);
}
}
}
......@@ -512,13 +532,13 @@ CSRMatrix CSRSliceMatrix(CSRMatrix csr, runtime::NDArray rows, runtime::NDArray
sub_indptr[new_nrows] = sub_indices.size();
const int64_t nnz = sub_data.size();
NDArray sub_data_arr = NDArray::Empty({nnz}, csr.indptr->dtype, csr.indptr->ctx);
NDArray sub_data_arr =
NDArray::Empty({nnz}, csr.indptr->dtype, csr.indptr->ctx);
IdType* ptr = static_cast<IdType*>(sub_data_arr->data);
std::copy(sub_data.begin(), sub_data.end(), ptr);
return CSRMatrix{new_nrows, new_ncols,
NDArray::FromVector(sub_indptr, csr.indptr->ctx),
NDArray::FromVector(sub_indices, csr.indptr->ctx),
sub_data_arr};
return CSRMatrix{
new_nrows, new_ncols, NDArray::FromVector(sub_indptr, csr.indptr->ctx),
NDArray::FromVector(sub_indices, csr.indptr->ctx), sub_data_arr};
}
template CSRMatrix CSRSliceMatrix<kDGLCPU, int32_t>(
......@@ -529,8 +549,9 @@ template CSRMatrix CSRSliceMatrix<kDGLCPU, int64_t>(
///////////////////////////// CSRReorder /////////////////////////////
template <DGLDeviceType XPU, typename IdType>
CSRMatrix CSRReorder(CSRMatrix csr, runtime::NDArray new_row_id_arr,
runtime::NDArray new_col_id_arr) {
CSRMatrix CSRReorder(
CSRMatrix csr, runtime::NDArray new_row_id_arr,
runtime::NDArray new_col_id_arr) {
CHECK_SAME_DTYPE(csr.indices, new_row_id_arr);
CHECK_SAME_DTYPE(csr.indices, new_col_id_arr);
......@@ -543,21 +564,25 @@ CSRMatrix CSRReorder(CSRMatrix csr, runtime::NDArray new_row_id_arr,
int64_t nnz = csr.indices->shape[0];
CHECK_EQ(nnz, in_indptr[num_rows]);
CHECK_EQ(num_rows, new_row_id_arr->shape[0])
<< "The new row Id array needs to be the same as the number of rows of CSR";
<< "The new row Id array needs to be the same as the number of rows of "
"CSR";
CHECK_EQ(num_cols, new_col_id_arr->shape[0])
<< "The new col Id array needs to be the same as the number of cols of CSR";
<< "The new col Id array needs to be the same as the number of cols of "
"CSR";
// New row/col Ids.
const IdType* new_row_ids = static_cast<IdType*>(new_row_id_arr->data);
const IdType* new_col_ids = static_cast<IdType*>(new_col_id_arr->data);
// Output CSR
NDArray out_indptr_arr = NDArray::Empty({num_rows + 1}, csr.indptr->dtype, csr.indptr->ctx);
NDArray out_indices_arr = NDArray::Empty({nnz}, csr.indices->dtype, csr.indices->ctx);
NDArray out_indptr_arr =
NDArray::Empty({num_rows + 1}, csr.indptr->dtype, csr.indptr->ctx);
NDArray out_indices_arr =
NDArray::Empty({nnz}, csr.indices->dtype, csr.indices->ctx);
NDArray out_data_arr = NDArray::Empty({nnz}, csr.data->dtype, csr.data->ctx);
IdType *out_indptr = static_cast<IdType*>(out_indptr_arr->data);
IdType *out_indices = static_cast<IdType*>(out_indices_arr->data);
IdType *out_data = static_cast<IdType*>(out_data_arr->data);
IdType* out_indptr = static_cast<IdType*>(out_indptr_arr->data);
IdType* out_indices = static_cast<IdType*>(out_indices_arr->data);
IdType* out_data = static_cast<IdType*>(out_data_arr->data);
// Compute the length of rows for the new matrix.
std::vector<IdType> new_row_lens(num_rows, -1);
......@@ -579,12 +604,12 @@ CSRMatrix CSRReorder(CSRMatrix csr, runtime::NDArray new_row_id_arr,
// Here I iterate rows in the order of the old matrix.
parallel_for(0, num_rows, [=](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
const IdType *in_row = in_indices + in_indptr[i];
const IdType *in_row_data = in_data + in_indptr[i];
const IdType* in_row = in_indices + in_indptr[i];
const IdType* in_row_data = in_data + in_indptr[i];
int64_t new_row_id = new_row_ids[i];
IdType *out_row = out_indices + out_indptr[new_row_id];
IdType *out_row_data = out_data + out_indptr[new_row_id];
IdType* out_row = out_indices + out_indptr[new_row_id];
IdType* out_row_data = out_data + out_indptr[new_row_id];
int64_t row_len = new_row_lens[new_row_id];
// Here I iterate col indices in a row in the order of the old matrix.
......@@ -595,14 +620,14 @@ CSRMatrix CSRReorder(CSRMatrix csr, runtime::NDArray new_row_id_arr,
// TODO(zhengda) maybe we should sort the column indices.
}
});
return CSRMatrix(num_rows, num_cols,
out_indptr_arr, out_indices_arr, out_data_arr);
return CSRMatrix(
num_rows, num_cols, out_indptr_arr, out_indices_arr, out_data_arr);
}
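The reordering above is a scatter under two permutations: each old row lands at new_row_ids[i] and every column id is relabelled through new_col_ids. A compact sketch of the same idea on a plain CSR, assuming both arrays are permutations (illustrative names, not the DGL implementation):

#include <cstdint>
#include <numeric>
#include <vector>

struct SimpleCsr {
  int64_t num_rows = 0, num_cols = 0;
  std::vector<int64_t> indptr, indices, data;
};

SimpleCsr ReorderCsr(const SimpleCsr& csr,
                     const std::vector<int64_t>& new_row_ids,
                     const std::vector<int64_t>& new_col_ids) {
  SimpleCsr out;
  out.num_rows = csr.num_rows;
  out.num_cols = csr.num_cols;
  out.indptr.assign(csr.num_rows + 1, 0);
  // The new row inherits the length of the old row mapped onto it.
  for (int64_t i = 0; i < csr.num_rows; ++i)
    out.indptr[new_row_ids[i] + 1] = csr.indptr[i + 1] - csr.indptr[i];
  std::partial_sum(out.indptr.begin(), out.indptr.end(), out.indptr.begin());
  out.indices.resize(csr.indices.size());
  out.data.resize(csr.indices.size());
  for (int64_t i = 0; i < csr.num_rows; ++i) {
    int64_t dst = out.indptr[new_row_ids[i]];
    for (int64_t p = csr.indptr[i]; p < csr.indptr[i + 1]; ++p, ++dst) {
      out.indices[dst] = new_col_ids[csr.indices[p]];  // relabel the column
      out.data[dst] = csr.data.empty() ? p : csr.data[p];
    }
  }
  return out;
}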
template CSRMatrix CSRReorder<kDGLCPU, int64_t>(CSRMatrix csr, runtime::NDArray new_row_ids,
runtime::NDArray new_col_ids);
template CSRMatrix CSRReorder<kDGLCPU, int32_t>(CSRMatrix csr, runtime::NDArray new_row_ids,
runtime::NDArray new_col_ids);
template CSRMatrix CSRReorder<kDGLCPU, int64_t>(
CSRMatrix csr, runtime::NDArray new_row_ids, runtime::NDArray new_col_ids);
template CSRMatrix CSRReorder<kDGLCPU, int32_t>(
CSRMatrix csr, runtime::NDArray new_row_ids, runtime::NDArray new_col_ids);
} // namespace impl
} // namespace aten
......
......@@ -8,14 +8,15 @@
#include <dgl/array.h>
#include <dgl/bcast.h>
#include <dgl/runtime/parallel_for.h>
#include <dgl/runtime/config.h>
#include <dgl/runtime/parallel_for.h>
#include <math.h>
#include <algorithm>
#include <limits>
#include <memory>
#include <algorithm>
#include <vector>
#include "spmm_binary_ops.h"
#if !defined(_WIN32)
#ifdef USE_AVX
......@@ -44,8 +45,9 @@ namespace cpu {
* JIT'ed kernel.
*/
template <typename IdType, typename DType, typename Op>
void SpMMSumCsrXbyak(dgl::ElemWiseAddUpdate<Op>* cpu_spec, const BcastOff& bcast,
const CSRMatrix& csr, const DType* X, const DType* W, DType* O) {
void SpMMSumCsrXbyak(
dgl::ElemWiseAddUpdate<Op>* cpu_spec, const BcastOff& bcast,
const CSRMatrix& csr, const DType* X, const DType* W, DType* O) {
const bool has_idx = !IsNullArray(csr.data);
const IdType* indptr = csr.indptr.Ptr<IdType>();
const IdType* indices = csr.indices.Ptr<IdType>();
......@@ -79,8 +81,9 @@ void SpMMSumCsrXbyak(dgl::ElemWiseAddUpdate<Op>* cpu_spec, const BcastOff& bcast
* for the computation of different nodes.
*/
template <typename IdType, typename DType, typename Op>
void SpMMSumCsrNaive(const BcastOff& bcast, const CSRMatrix& csr, const DType* X,
const DType* W, DType* O) {
void SpMMSumCsrNaive(
const BcastOff& bcast, const CSRMatrix& csr, const DType* X, const DType* W,
DType* O) {
const bool has_idx = !IsNullArray(csr.data);
const IdType* indptr = csr.indptr.Ptr<IdType>();
const IdType* indices = csr.indices.Ptr<IdType>();
......@@ -97,9 +100,9 @@ void SpMMSumCsrNaive(const BcastOff& bcast, const CSRMatrix& csr, const DType* X
const int64_t lhs_add = bcast.use_bcast ? bcast.lhs_offset[k] : k;
const int64_t rhs_add = bcast.use_bcast ? bcast.rhs_offset[k] : k;
const DType* lhs_off =
Op::use_lhs ? X + cid * lhs_dim + lhs_add : nullptr;
Op::use_lhs ? X + cid * lhs_dim + lhs_add : nullptr;
const DType* rhs_off =
Op::use_rhs ? W + eid * rhs_dim + rhs_add : nullptr;
Op::use_rhs ? W + eid * rhs_dim + rhs_add : nullptr;
out_off[k] += Op::Call(lhs_off, rhs_off);
}
}
......@@ -118,8 +121,9 @@ void SpMMSumCsrNaive(const BcastOff& bcast, const CSRMatrix& csr, const DType* X
* for the computation of different nodes.
*/
template <typename IdType, typename DType, typename Op>
void SpMMSumCsr(const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat,
NDArray efeat, NDArray out) {
void SpMMSumCsr(
const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat, NDArray efeat,
NDArray out) {
const bool has_idx = !IsNullArray(csr.data);
const IdType* indptr = csr.indptr.Ptr<IdType>();
const IdType* indices = csr.indices.Ptr<IdType>();
......@@ -135,17 +139,15 @@ void SpMMSumCsr(const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat,
CHECK_NOTNULL(X);
}
if (Op::use_rhs) {
if (has_idx)
CHECK_NOTNULL(edges);
if (has_idx) CHECK_NOTNULL(edges);
CHECK_NOTNULL(W);
}
#if !defined(_WIN32)
#ifdef USE_AVX
#ifdef USE_LIBXSMM
const bool no_libxsmm =
bcast.use_bcast ||
std::is_same<DType, double>::value ||
!dgl::runtime::Config::Global()->IsLibxsmmAvailable();
const bool no_libxsmm = bcast.use_bcast ||
std::is_same<DType, double>::value ||
!dgl::runtime::Config::Global()->IsLibxsmmAvailable();
if (!no_libxsmm) {
SpMMSumCsrLibxsmm<IdType, DType, Op>(bcast, csr, ufeat, efeat, out);
} else {
......@@ -156,14 +158,14 @@ void SpMMSumCsr(const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat,
(dgl::IntelKernel<>::IsEnabled()) ? new ElemWiseUpd() : nullptr);
/* Distribute the kernel among OMP threads */
ElemWiseUpd* cpu_spec = (asm_kernel_ptr && asm_kernel_ptr->applicable())
? asm_kernel_ptr.get()
: nullptr;
? asm_kernel_ptr.get()
: nullptr;
if (cpu_spec && dim > 16 && !bcast.use_bcast) {
SpMMSumCsrXbyak<IdType, DType, Op>(cpu_spec, bcast, csr, X, W, O);
} else {
#endif // USE_AVX
#endif // _WIN32
SpMMSumCsrNaive<IdType, DType, Op>(bcast, csr, X, W, O);
SpMMSumCsrNaive<IdType, DType, Op>(bcast, csr, X, W, O);
#if !defined(_WIN32)
#ifdef USE_AVX
}
......@@ -186,8 +188,9 @@ void SpMMSumCsr(const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat,
* we use atomic operators in the reduction phase.
*/
template <typename IdType, typename DType, typename Op>
void SpMMSumCoo(const BcastOff& bcast, const COOMatrix& coo, NDArray ufeat,
NDArray efeat, NDArray out) {
void SpMMSumCoo(
const BcastOff& bcast, const COOMatrix& coo, NDArray ufeat, NDArray efeat,
NDArray out) {
const bool has_idx = !IsNullArray(coo.data);
const IdType* row = coo.row.Ptr<IdType>();
const IdType* col = coo.col.Ptr<IdType>();
......@@ -210,9 +213,9 @@ void SpMMSumCoo(const BcastOff& bcast, const COOMatrix& coo, NDArray ufeat,
const int64_t lhs_add = bcast.use_bcast ? bcast.lhs_offset[k] : k;
const int64_t rhs_add = bcast.use_bcast ? bcast.rhs_offset[k] : k;
const DType* lhs_off =
Op::use_lhs ? X + rid * lhs_dim + lhs_add : nullptr;
Op::use_lhs ? X + rid * lhs_dim + lhs_add : nullptr;
const DType* rhs_off =
Op::use_rhs ? W + eid * rhs_dim + rhs_add : nullptr;
Op::use_rhs ? W + eid * rhs_dim + rhs_add : nullptr;
const DType val = Op::Call(lhs_off, rhs_off);
if (val != 0) {
#pragma omp atomic
......@@ -232,21 +235,24 @@ void SpMMSumCoo(const BcastOff& bcast, const COOMatrix& coo, NDArray ufeat,
* \param argu Arg-Min/Max on source nodes, which refers the source node indices
* correspond to the minimum/maximum values of reduction result on
* destination nodes. It's useful in computing gradients of Min/Max
* reducer. \param arge Arg-Min/Max on edges. which refers the source node
* indices correspond to the minimum/maximum values of reduction result on
* reducer.
* \param arge Arg-Min/Max on edges. which refers the source node indices
correspond to the minimum/maximum values of reduction result on
* destination nodes. It's useful in computing gradients of Min/Max
* reducer. \note It uses node parallel strategy, different threads are
* responsible for the computation of different nodes. \note The result will
* contain infinity for zero-degree nodes.
* reducer.
* \note It uses node parallel strategy, different threads are responsible for
* the computation of different nodes.
* \note The result will contain infinity for zero-degree nodes.
*/
template <typename IdType, typename DType, typename Op, typename Cmp>
void SpMMCmpCsr(const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat,
NDArray efeat, NDArray out, NDArray argu, NDArray arge) {
void SpMMCmpCsr(
const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat, NDArray efeat,
NDArray out, NDArray argu, NDArray arge) {
const bool has_idx = !IsNullArray(csr.data);
const IdType* indptr = static_cast<IdType*>(csr.indptr->data);
const IdType* indices = static_cast<IdType*>(csr.indices->data);
const IdType* edges =
has_idx ? static_cast<IdType*>(csr.data->data) : nullptr;
has_idx ? static_cast<IdType*>(csr.data->data) : nullptr;
const DType* X = Op::use_lhs ? static_cast<DType*>(ufeat->data) : nullptr;
const DType* W = Op::use_rhs ? static_cast<DType*>(efeat->data) : nullptr;
const int64_t dim = bcast.out_len, lhs_dim = bcast.lhs_len,
......@@ -262,8 +268,7 @@ void SpMMCmpCsr(const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat,
CHECK_NOTNULL(argX);
}
if (Op::use_rhs) {
if (has_idx)
CHECK_NOTNULL(edges);
if (has_idx) CHECK_NOTNULL(edges);
CHECK_NOTNULL(W);
CHECK_NOTNULL(argW);
}
......@@ -271,12 +276,12 @@ void SpMMCmpCsr(const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat,
#ifdef USE_AVX
#ifdef USE_LIBXSMM
const bool no_libxsmm =
bcast.use_bcast ||
std::is_same<DType, double>::value ||
!dgl::runtime::Config::Global()->IsLibxsmmAvailable();
const bool no_libxsmm = bcast.use_bcast ||
std::is_same<DType, double>::value ||
!dgl::runtime::Config::Global()->IsLibxsmmAvailable();
if (!no_libxsmm) {
SpMMCmpCsrLibxsmm<IdType, DType, Op, Cmp>(bcast, csr, ufeat, efeat, out, argu, arge);
SpMMCmpCsrLibxsmm<IdType, DType, Op, Cmp>(
bcast, csr, ufeat, efeat, out, argu, arge);
} else {
#endif // USE_LIBXSMM
#endif // USE_AVX
......@@ -295,9 +300,9 @@ void SpMMCmpCsr(const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat,
const int64_t lhs_add = bcast.use_bcast ? bcast.lhs_offset[k] : k;
const int64_t rhs_add = bcast.use_bcast ? bcast.rhs_offset[k] : k;
const DType* lhs_off =
Op::use_lhs ? X + cid * lhs_dim + lhs_add : nullptr;
Op::use_lhs ? X + cid * lhs_dim + lhs_add : nullptr;
const DType* rhs_off =
Op::use_rhs ? W + eid * rhs_dim + rhs_add : nullptr;
Op::use_rhs ? W + eid * rhs_dim + rhs_add : nullptr;
const DType val = Op::Call(lhs_off, rhs_off);
if (Cmp::Call(out_off[k], val)) {
out_off[k] = val;
......@@ -328,29 +333,31 @@ void SpMMCmpCsr(const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat,
* correspond to the minimum/maximum values of reduction result on
* destination nodes. It's useful in computing gradients of Min/Max
* reducer.
* \param arge Arg-Min/Max on edges. which refers the source node
* indices correspond to the minimum/maximum values of reduction result on
* \param arge Arg-Min/Max on edges. which refers the source node indices
* correspond to the minimum/maximum values of reduction result on
* destination nodes. It's useful in computing gradients of Min/Max
* reducer.
* \param argu_ntype Node type of the arg-Min/Max on source nodes, which refers the
* source node types correspond to the minimum/maximum values of reduction result
* on destination nodes. It's useful in computing gradients of Min/Max reducer.
* \param arge_etype Edge-type of the arg-Min/Max on edges. which refers the source
* node indices correspond to the minimum/maximum values of reduction result on
* destination nodes. It's useful in computing gradients of Min/Max reducer.
* \param argu_ntype Node type of the arg-Min/Max on source nodes, which refers
* the source node types correspond to the minimum/maximum values of
* reduction result on destination nodes. It's useful in computing
* gradients of Min/Max reducer.
* \param arge_etype Edge-type of the arg-Min/Max on edges. which refers the
* source node indices correspond to the minimum/maximum values of
* reduction result on destination nodes. It's useful in computing
* gradients of Min/Max reducer.
* \param src_type Node type of the source nodes of an etype
* \param etype Edge type
*/
template <typename IdType, typename DType, typename Op, typename Cmp>
void SpMMCmpCsrHetero(const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat,
NDArray efeat, NDArray out, NDArray argu, NDArray arge,
NDArray argu_ntype, NDArray arge_etype,
const int ntype, const int etype) {
void SpMMCmpCsrHetero(
const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat, NDArray efeat,
NDArray out, NDArray argu, NDArray arge, NDArray argu_ntype,
NDArray arge_etype, const int ntype, const int etype) {
const bool has_idx = !IsNullArray(csr.data);
const IdType* indptr = static_cast<IdType*>(csr.indptr->data);
const IdType* indices = static_cast<IdType*>(csr.indices->data);
const IdType* edges =
has_idx ? static_cast<IdType*>(csr.data->data) : nullptr;
has_idx ? static_cast<IdType*>(csr.data->data) : nullptr;
const DType* X = Op::use_lhs ? static_cast<DType*>(ufeat->data) : nullptr;
const DType* W = Op::use_rhs ? static_cast<DType*>(efeat->data) : nullptr;
const int64_t dim = bcast.out_len, lhs_dim = bcast.lhs_len,
......@@ -358,8 +365,10 @@ void SpMMCmpCsrHetero(const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat
DType* O = static_cast<DType*>(out->data);
IdType* argX = Op::use_lhs ? static_cast<IdType*>(argu->data) : nullptr;
IdType* argW = Op::use_rhs ? static_cast<IdType*>(arge->data) : nullptr;
IdType* argX_ntype = Op::use_lhs ? static_cast<IdType*>(argu_ntype->data) : nullptr;
IdType* argW_etype = Op::use_rhs ? static_cast<IdType*>(arge_etype->data) : nullptr;
IdType* argX_ntype =
Op::use_lhs ? static_cast<IdType*>(argu_ntype->data) : nullptr;
IdType* argW_etype =
Op::use_rhs ? static_cast<IdType*>(arge_etype->data) : nullptr;
CHECK_NOTNULL(indptr);
CHECK_NOTNULL(O);
if (Op::use_lhs) {
......@@ -368,8 +377,7 @@ void SpMMCmpCsrHetero(const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat
CHECK_NOTNULL(argX);
}
if (Op::use_rhs) {
if (has_idx)
CHECK_NOTNULL(edges);
if (has_idx) CHECK_NOTNULL(edges);
CHECK_NOTNULL(W);
CHECK_NOTNULL(argW);
}
......@@ -389,9 +397,9 @@ void SpMMCmpCsrHetero(const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat
const int64_t lhs_add = bcast.use_bcast ? bcast.lhs_offset[k] : k;
const int64_t rhs_add = bcast.use_bcast ? bcast.rhs_offset[k] : k;
const DType* lhs_off =
Op::use_lhs ? X + cid * lhs_dim + lhs_add : nullptr;
Op::use_lhs ? X + cid * lhs_dim + lhs_add : nullptr;
const DType* rhs_off =
Op::use_rhs ? W + eid * rhs_dim + rhs_add : nullptr;
Op::use_rhs ? W + eid * rhs_dim + rhs_add : nullptr;
const DType val = Op::Call(lhs_off, rhs_off);
if (Cmp::Call(out_off[k], val)) {
out_off[k] = val;
......@@ -410,7 +418,6 @@ void SpMMCmpCsrHetero(const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat
});
}
/*!
* \brief CPU kernel of SpMM-Min/Max on Coo format.
* \param bcast Broadcast information.
......@@ -421,22 +428,25 @@ void SpMMCmpCsrHetero(const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat
* \param argu Arg-Min/Max on source nodes, which refers the source node indices
* correspond to the minimum/maximum values of reduction result on
* destination nodes. It's useful in computing gradients of Min/Max
* reducer. \param arge Arg-Min/Max on edges. which refers the source node
* indices correspond to the minimum/maximum values of reduction result on
* reducer.
* \param arge Arg-Min/Max on edges. which refers the source node indices
* correspond to the minimum/maximum values of reduction result on
* destination nodes. It's useful in computing gradients of Min/Max
* reducer. \note it uses node parallel strategy, different threads are
* responsible for the computation of different nodes. To avoid possible data
* hazard, we use atomic operators in the reduction phase. \note The result will
* contain infinity for zero-degree nodes.
* reducer.
* \note it uses node parallel strategy, different threads are responsible for
* the computation of different nodes. To avoid possible data hazard, we
* use atomic operators in the reduction phase.
* \note The result will contain infinity for zero-degree nodes.
*/
template <typename IdType, typename DType, typename Op, typename Cmp>
void SpMMCmpCoo(const BcastOff& bcast, const COOMatrix& coo, NDArray ufeat,
NDArray efeat, NDArray out, NDArray argu, NDArray arge) {
void SpMMCmpCoo(
const BcastOff& bcast, const COOMatrix& coo, NDArray ufeat, NDArray efeat,
NDArray out, NDArray argu, NDArray arge) {
const bool has_idx = !IsNullArray(coo.data);
const IdType* row = static_cast<IdType*>(coo.row->data);
const IdType* col = static_cast<IdType*>(coo.col->data);
const IdType* edges =
has_idx ? static_cast<IdType*>(coo.data->data) : nullptr;
has_idx ? static_cast<IdType*>(coo.data->data) : nullptr;
const DType* X = Op::use_lhs ? static_cast<DType*>(ufeat->data) : nullptr;
const DType* W = Op::use_rhs ? static_cast<DType*>(efeat->data) : nullptr;
const int64_t dim = bcast.out_len, lhs_dim = bcast.lhs_len,
......@@ -460,9 +470,9 @@ void SpMMCmpCoo(const BcastOff& bcast, const COOMatrix& coo, NDArray ufeat,
const int64_t lhs_add = bcast.use_bcast ? bcast.lhs_offset[k] : k;
const int64_t rhs_add = bcast.use_bcast ? bcast.rhs_offset[k] : k;
const DType* lhs_off =
Op::use_lhs ? X + rid * lhs_dim + lhs_add : nullptr;
Op::use_lhs ? X + rid * lhs_dim + lhs_add : nullptr;
const DType* rhs_off =
Op::use_rhs ? W + eid * rhs_dim + rhs_add : nullptr;
Op::use_rhs ? W + eid * rhs_dim + rhs_add : nullptr;
const DType val = Op::Call(lhs_off, rhs_off);
#pragma omp critical
if (Cmp::Call(out_off[k], val)) {
......@@ -474,7 +484,6 @@ void SpMMCmpCoo(const BcastOff& bcast, const COOMatrix& coo, NDArray ufeat,
}
}
/*!
* \brief CPU kernel of Edge_softmax_csr_forward on Csr format.
* \param bcast Broadcast information.
......@@ -484,28 +493,29 @@ void SpMMCmpCoo(const BcastOff& bcast, const COOMatrix& coo, NDArray ufeat,
* \param out The result of edge_softmax_forward.
*/
template <typename IdType, typename DType, typename Op>
void Edge_softmax_csr_forward(const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat,
NDArray efeat, NDArray out) {
void Edge_softmax_csr_forward(
const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat, NDArray efeat,
NDArray out) {
const bool has_idx = !IsNullArray(csr.data);
const IdType* indptr = static_cast<IdType*>(csr.indptr->data);
const IdType* edges =
has_idx ? static_cast<IdType*>(csr.data->data) : nullptr;
has_idx ? static_cast<IdType*>(csr.data->data) : nullptr;
const DType* W = Op::use_rhs ? static_cast<DType*>(efeat->data) : nullptr;
const int64_t dim = bcast.out_len, rhs_dim = bcast.rhs_len;
runtime::parallel_for(0, csr.num_rows, [&](size_t b, size_t e) {
for (auto rid = b; rid < e; ++rid) {
const IdType row_start = indptr[rid], row_end = indptr[rid + 1];
std::vector<DType> data_e(row_end-row_start, 0);
std::vector<IdType> num(row_end-row_start, 0);
std::vector<DType> data_e(row_end - row_start, 0);
std::vector<IdType> num(row_end - row_start, 0);
for (int64_t k = 0; k < dim; ++k) {
DType max_v = -std::numeric_limits<DType>::infinity();
for (IdType j = row_start; j < row_end; ++j) {
const IdType eid = has_idx ? edges[j] : j;
const int64_t rhs_add = bcast.use_bcast ? bcast.rhs_offset[k] : k;
const DType* rhs_off =
Op::use_rhs ? W + eid * rhs_dim + rhs_add : nullptr;
data_e[j-row_start] = *rhs_off;
num[j-row_start] = eid*rhs_dim+rhs_add;
Op::use_rhs ? W + eid * rhs_dim + rhs_add : nullptr;
data_e[j - row_start] = *rhs_off;
num[j - row_start] = eid * rhs_dim + rhs_add;
max_v = std::max<DType>(max_v, (*rhs_off));
}
DType exp_sum = 0;
......@@ -514,15 +524,14 @@ void Edge_softmax_csr_forward(const BcastOff& bcast, const CSRMatrix& csr, NDArr
element = std::exp(element);
exp_sum += element;
}
for (int i=0; i < row_end-row_start; i++) {
out.Ptr<DType>()[num[i]] = data_e[i]/exp_sum;
for (int i = 0; i < row_end - row_start; i++) {
out.Ptr<DType>()[num[i]] = data_e[i] / exp_sum;
}
}
}
});
}
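The kernel above computes, for each destination row, a softmax over the scores of its incoming edges, tracking the row maximum for numerical stability. A scalar sketch of the same per-row computation, assuming a single feature channel (illustrative, not the DGL API):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <limits>
#include <vector>

// In-place softmax over the edge scores of one CSR row.
// `row_start`/`row_end` delimit the row inside `scores` (one value per edge).
void EdgeSoftmaxRow(std::vector<double>* scores, int64_t row_start,
                    int64_t row_end) {
  if (row_start >= row_end) return;  // zero-degree row: nothing to normalize
  double max_v = -std::numeric_limits<double>::infinity();
  for (int64_t j = row_start; j < row_end; ++j)
    max_v = std::max(max_v, (*scores)[j]);
  double exp_sum = 0;
  for (int64_t j = row_start; j < row_end; ++j) {
    (*scores)[j] = std::exp((*scores)[j] - max_v);  // stable exponentiation
    exp_sum += (*scores)[j];
  }
  for (int64_t j = row_start; j < row_end; ++j) (*scores)[j] /= exp_sum;
}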
/*!
* \brief CPU kernel of Edge_softmax_csr_backward on Csr format.
* \param bcast Broadcast information.
......@@ -532,12 +541,13 @@ void Edge_softmax_csr_forward(const BcastOff& bcast, const CSRMatrix& csr, NDArr
* \param back_out The result of edge_softmax_backward.
*/
template <typename IdType, typename DType, typename Op>
void Edge_softmax_csr_backward(const BcastOff& bcast, const CSRMatrix& csr, NDArray out,
NDArray sds, NDArray back_out) {
void Edge_softmax_csr_backward(
const BcastOff& bcast, const CSRMatrix& csr, NDArray out, NDArray sds,
NDArray back_out) {
const bool has_idx = !IsNullArray(csr.data);
const IdType* indptr = static_cast<IdType*>(csr.indptr->data);
const IdType* edges =
has_idx ? static_cast<IdType*>(csr.data->data) : nullptr;
has_idx ? static_cast<IdType*>(csr.data->data) : nullptr;
const DType* W_out = Op::use_rhs ? static_cast<DType*>(out->data) : nullptr;
const DType* W_sds = Op::use_rhs ? static_cast<DType*>(sds->data) : nullptr;
const int64_t dim = bcast.out_len, rhs_dim = bcast.rhs_len;
......@@ -550,17 +560,18 @@ void Edge_softmax_csr_backward(const BcastOff& bcast, const CSRMatrix& csr, NDAr
const IdType eid = has_idx ? edges[j] : j;
const int64_t rhs_add = bcast.use_bcast ? bcast.rhs_offset[k] : k;
const DType* rhs_off_sds =
Op::use_rhs ? W_sds + eid * rhs_dim + rhs_add : nullptr;
Op::use_rhs ? W_sds + eid * rhs_dim + rhs_add : nullptr;
sum_sds += (*rhs_off_sds);
}
for (IdType j = row_start; j< row_end; ++j) {
for (IdType j = row_start; j < row_end; ++j) {
const IdType eid = has_idx ? edges[j] : j;
const int64_t rhs_add = bcast.use_bcast ? bcast.rhs_offset[k] : k;
const DType* rhs_off_out =
Op::use_rhs ? W_out + eid * rhs_dim + rhs_add : nullptr;
Op::use_rhs ? W_out + eid * rhs_dim + rhs_add : nullptr;
const DType* rhs_off_sds =
Op::use_rhs ? W_sds + eid * rhs_dim + rhs_add : nullptr;
back_out.Ptr<DType>()[eid*rhs_dim+rhs_add] = (*rhs_off_sds) - sum_sds*(*rhs_off_out);
Op::use_rhs ? W_sds + eid * rhs_dim + rhs_add : nullptr;
back_out.Ptr<DType>()[eid * rhs_dim + rhs_add] =
(*rhs_off_sds) - sum_sds * (*rhs_off_out);
}
}
}
......
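The Edge_softmax_csr_backward kernel that closes the file above writes, per edge, back_out = sds - sum_sds * out. Assuming, as the names suggest, that out holds the forward softmax sigma of a row and sds its elementwise product with the upstream gradient g, this is the usual softmax Jacobian-vector product; in LaTeX, with notation introduced here rather than taken from the source:

\frac{\partial L}{\partial z_j}
  = \sigma_j\, g_j - \sigma_j \sum_k \sigma_k\, g_k,
\qquad
\sigma_j = \frac{e^{z_j}}{\sum_k e^{z_k}}, \quad
g_j = \frac{\partial L}{\partial \sigma_j}.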
......@@ -13,13 +13,14 @@
#include <dgl/array.h>
#include <dgl/bcast.h>
#include <dmlc/logging.h>
#include <algorithm>
#if !defined(_WIN32)
#ifdef USE_AVX
#ifdef USE_LIBXSMM
#include <unistd.h>
#include <libxsmm.h>
#include <unistd.h>
#ifdef DEBUG
#include <x86intrin.h>
#endif // DEBUG
......@@ -53,8 +54,10 @@ int32_t GetLLCSize() {
* are assigned to OMP threads.
* \param csr The Csr matrix.
* \param block_csr_array The array containing csr matrices of all blocks.
* \param num_M_blocks Number of blocks to create along the rows of adjacency matrix.
* \param num_K_blocks Number of blocks to create along the columns of adjacency matrix.
* \param num_M_blocks Number of blocks to create along the rows of adjacency
* matrix.
* \param num_K_blocks Number of blocks to create along the columns of adjacency
* matrix.
* \param M_block_size block size along the rows of adjacency matrix.
* \param K_block_size block size along the columns of adjacency matrix.
* \param use_lhs Whether to use lhs.
......@@ -62,38 +65,30 @@ int32_t GetLLCSize() {
*/
template <typename IdType>
inline void SpMMCreateBlocks(
const CSRMatrix& csr,
CSRMatrixInternal<IdType, IdType> *block_csr_array,
IdType num_M_blocks,
IdType num_K_blocks,
IdType M_block_size,
IdType K_block_size,
bool use_lhs, bool use_rhs) {
const CSRMatrix &csr, CSRMatrixInternal<IdType, IdType> *block_csr_array,
IdType num_M_blocks, IdType num_K_blocks, IdType M_block_size,
IdType K_block_size, bool use_lhs, bool use_rhs) {
const IdType M = csr.num_rows;
const IdType K = csr.num_cols;
IdType* indptr = csr.indptr.Ptr<IdType>();
IdType* indices = csr.indices.Ptr<IdType>();
IdType* edges = csr.data.Ptr<IdType>();
IdType *indptr = csr.indptr.Ptr<IdType>();
IdType *indices = csr.indices.Ptr<IdType>();
IdType *edges = csr.data.Ptr<IdType>();
CHECK_NOTNULL(indptr);
if (use_lhs)
CHECK_NOTNULL(indices);
if (use_rhs)
CHECK_NOTNULL(edges);
if (use_lhs) CHECK_NOTNULL(indices);
if (use_rhs) CHECK_NOTNULL(edges);
if (num_K_blocks > 1) {
IdType *indptr_block_buf = reinterpret_cast<IdType *>(aligned_alloc(64,
(M_block_size + 1) * num_M_blocks *
num_K_blocks * sizeof(IdType)));
IdType *indices_block_buf = reinterpret_cast<IdType *>(aligned_alloc(64,
indptr[M] * sizeof(IdType)));
IdType *edges_block_buf = reinterpret_cast<IdType *>(aligned_alloc(64,
indptr[M] * sizeof(IdType)));
IdType *indptr_block_buf = reinterpret_cast<IdType *>(aligned_alloc(
64, (M_block_size + 1) * num_M_blocks * num_K_blocks * sizeof(IdType)));
IdType *indices_block_buf = reinterpret_cast<IdType *>(
aligned_alloc(64, indptr[M] * sizeof(IdType)));
IdType *edges_block_buf = reinterpret_cast<IdType *>(
aligned_alloc(64, indptr[M] * sizeof(IdType)));
#pragma omp parallel
{
IdType *my_cur_col_id = reinterpret_cast<IdType *>(aligned_alloc(64, 2 * M_block_size *
sizeof(IdType)));
IdType *my_cur_col_id = reinterpret_cast<IdType *>(
aligned_alloc(64, 2 * M_block_size * sizeof(IdType)));
#pragma omp for
for (IdType m = 0; m < num_M_blocks; m++) {
......@@ -103,10 +98,8 @@ inline void SpMMCreateBlocks(
IdType cur_indices_id = 0;
IdType *my_indices_block_buf, *my_edges_block_buf;
if (use_lhs)
my_indices_block_buf = indices_block_buf + indptr[M_start];
if (use_rhs)
my_edges_block_buf = edges_block_buf + indptr[M_start];
if (use_lhs) my_indices_block_buf = indices_block_buf + indptr[M_start];
if (use_rhs) my_edges_block_buf = edges_block_buf + indptr[M_start];
for (IdType i = M_start; i < M_end; i++) {
my_cur_col_id[(i - M_start) * 2] = indptr[i];
......@@ -119,16 +112,15 @@ inline void SpMMCreateBlocks(
cur_csr.num_rows = M_end - M_start;
cur_csr.num_cols = K_end - K_start;
// Create csr_ij
IdType *cur_csr_indptr = indptr_block_buf + (m * num_K_blocks + k) * (M_block_size + 1);
IdType *cur_csr_indptr =
indptr_block_buf + (m * num_K_blocks + k) * (M_block_size + 1);
IdType *cur_csr_indices = nullptr, *cur_csr_edges = nullptr;
if (use_lhs)
cur_csr_indices = my_indices_block_buf + cur_indices_id;
if (use_rhs)
cur_csr_edges = my_edges_block_buf + cur_indices_id;
if (use_lhs) cur_csr_indices = my_indices_block_buf + cur_indices_id;
if (use_rhs) cur_csr_edges = my_edges_block_buf + cur_indices_id;
IdType cur_nnz = 0;
for (IdType i = M_start; i < M_end; i++) {
const IdType row_start = my_cur_col_id[(i - M_start) * 2];
const IdType row_end = my_cur_col_id[(i - M_start) * 2 + 1];
const IdType row_end = my_cur_col_id[(i - M_start) * 2 + 1];
cur_csr_indptr[i - M_start] = cur_nnz;
IdType eid;
for (eid = row_start; eid < row_end; eid++) {
......@@ -138,10 +130,8 @@ inline void SpMMCreateBlocks(
break;
}
CHECK_LT(cur_indices_id + cur_nnz, nnz);
if (use_lhs)
cur_csr_indices[cur_nnz] = src;
if (use_rhs)
cur_csr_edges[cur_nnz] = edge;
if (use_lhs) cur_csr_indices[cur_nnz] = src;
if (use_rhs) cur_csr_edges[cur_nnz] = edge;
cur_nnz++;
}
my_cur_col_id[(i - M_start) * 2] = eid;
......@@ -149,10 +139,8 @@ inline void SpMMCreateBlocks(
cur_csr_indptr[cur_csr.num_rows] = cur_nnz;
cur_indices_id += cur_nnz;
cur_csr.indptr = cur_csr_indptr;
if (use_lhs)
cur_csr.indices = cur_csr_indices;
if (use_rhs)
cur_csr.data = cur_csr_edges;
if (use_lhs) cur_csr.indices = cur_csr_indices;
if (use_rhs) cur_csr.data = cur_csr_edges;
block_csr_array[m * num_K_blocks + k] = cur_csr;
}
CHECK_EQ(nnz, cur_indices_id);
......@@ -199,9 +187,7 @@ inline void SpMMCreateBlocks(
*/
template <typename IdType, typename DType, typename Op>
inline libxsmm_meltwfunction_opreduce_vecs_idx SpMMCreateLibxsmmKernel(
bool has_idx,
IdType N,
libxsmm_meltw_opreduce_vecs_flags redop_flag,
bool has_idx, IdType N, libxsmm_meltw_opreduce_vecs_flags redop_flag,
bool is_cmp) {
int _ld = N;
libxsmm_meltw_opreduce_vecs_flags opredop_flags;
......@@ -220,48 +206,61 @@ inline libxsmm_meltwfunction_opreduce_vecs_idx SpMMCreateLibxsmmKernel(
opredop_flags = LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_OP_COPY;
}
// Second, set which of lhs or rhs is considered first and second operand.
// This is needed since libxsmm assumes that the copy operation always copies the first operand.
// So, if we need to copy rhs, we need to set that as the first operand.
// For rhs, we also set whether to use implicit indices or provided indices.
// This is needed since libxsmm assumes that the copy operation always copies
// the first operand. So, if we need to copy rhs, we need to set that as the
// first operand. For rhs, we also set whether to use implicit indices or
// provided indices.
// TODO(Steve): fix this long line in a separate PR.
if (std::is_same<Op, op::CopyLhs<DType>>::value) {
opredop_flags = (libxsmm_meltw_opreduce_vecs_flags)(opredop_flags |
LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_OPORDER_VECIDX_VECIN);
opredop_flags =
(libxsmm_meltw_opreduce_vecs_flags)(opredop_flags |
LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_OPORDER_VECIDX_VECIN);
} else if (std::is_same<Op, op::CopyRhs<DType>>::value) {
opredop_flags = (libxsmm_meltw_opreduce_vecs_flags)(opredop_flags |
LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_OPORDER_VECIN_VECIDX);
opredop_flags =
(libxsmm_meltw_opreduce_vecs_flags)(opredop_flags |
LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_OPORDER_VECIN_VECIDX);
if (!has_idx) {
opredop_flags = (libxsmm_meltw_opreduce_vecs_flags)(opredop_flags |
LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_IMPLICIT_INDEXED_VECIDX);
opredop_flags =
(libxsmm_meltw_opreduce_vecs_flags)(opredop_flags |
LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_IMPLICIT_INDEXED_VECIDX);
}
} else {
opredop_flags = (libxsmm_meltw_opreduce_vecs_flags)(opredop_flags |
LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_OPORDER_VECIDX_VECIN);
opredop_flags =
(libxsmm_meltw_opreduce_vecs_flags)(opredop_flags |
LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_OPORDER_VECIDX_VECIN);
if (has_idx) {
opredop_flags = (libxsmm_meltw_opreduce_vecs_flags)(opredop_flags |
LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_INDEXED_VEC);
opredop_flags =
(libxsmm_meltw_opreduce_vecs_flags)(opredop_flags |
LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_INDEXED_VEC);
} else {
opredop_flags = (libxsmm_meltw_opreduce_vecs_flags)(opredop_flags |
LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_IMPLICIT_INDEXED_VEC);
opredop_flags =
(libxsmm_meltw_opreduce_vecs_flags)(opredop_flags |
LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_IMPLICIT_INDEXED_VEC);
}
}
// Third, we set the Redop in the opredop_flags
opredop_flags = (libxsmm_meltw_opreduce_vecs_flags)(opredop_flags | redop_flag);
// Fourth, in case of Cmp Redop, set whether to record argmax/argmin for lhs/rhs
opredop_flags =
(libxsmm_meltw_opreduce_vecs_flags)(opredop_flags | redop_flag);
// Fourth, in case of Cmp Redop, set whether to record argmax/argmin for
// lhs/rhs
if (is_cmp) {
if (Op::use_lhs) {
opredop_flags = (libxsmm_meltw_opreduce_vecs_flags)(opredop_flags |
LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_RECORD_ARGOP_OFF_VEC_0);
opredop_flags =
(libxsmm_meltw_opreduce_vecs_flags)(opredop_flags |
LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_RECORD_ARGOP_OFF_VEC_0);
}
if (Op::use_rhs) {
opredop_flags = (libxsmm_meltw_opreduce_vecs_flags)(opredop_flags |
LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_RECORD_ARGOP_OFF_VEC_1);
opredop_flags =
(libxsmm_meltw_opreduce_vecs_flags)(opredop_flags |
LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_RECORD_ARGOP_OFF_VEC_1);
}
}
libxsmm_meltwfunction_opreduce_vecs_idx kernel = nullptr;
if (std::is_same<DType, float>::value) {
kernel = libxsmm_dispatch_meltw_opreduce_vecs_idx(
N, &_ld, &_ld, LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32,
(sizeof(IdType) == 8) ? LIBXSMM_DATATYPE_I64 : LIBXSMM_DATATYPE_I32, opredop_flags);
N, &_ld, &_ld, LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32,
(sizeof(IdType) == 8) ? LIBXSMM_DATATYPE_I64 : LIBXSMM_DATATYPE_I32,
opredop_flags);
}
if (kernel == nullptr) {
LOG(FATAL) << "Failed to generate libxsmm kernel for the SpMM operation."
......@@ -278,32 +277,34 @@ inline libxsmm_meltwfunction_opreduce_vecs_idx SpMMCreateLibxsmmKernel(
* \param C The result feature on destination nodes.
* \param has_idx For the edge features, are there indices available.
* \param N Feature size.
* \param num_M_blocks Number of blocks to create along the rows of adjacency matrix.
* \param num_K_blocks Number of blocks to create along the columns of adjacency matrix.
* \param num_M_blocks Number of blocks to create along the rows of adjacency
* matrix.
* \param num_K_blocks Number of blocks to create along the columns of adjacency
* matrix.
* \param M_block_size block size along the rows of adjacency matrix.
* \param kernel The libxsmm kernel.
*/
template <typename IdType, typename DType>
inline void SpMMBlockwiseOpSum(
CSRMatrixInternal<IdType, IdType> *block_csr_array,
const DType *B, const DType *E, DType *C, bool has_idx, IdType N,
IdType num_M_blocks, IdType num_K_blocks, IdType M_block_size,
CSRMatrixInternal<IdType, IdType> *block_csr_array, const DType *B,
const DType *E, DType *C, bool has_idx, IdType N, IdType num_M_blocks,
IdType num_K_blocks, IdType M_block_size,
libxsmm_meltwfunction_opreduce_vecs_idx kernel) {
DType (*in_matrix1)[N] = (DType (*)[N])B;
DType (*in_matrix2)[N] = (DType (*)[N])E;
DType (*output)[N] = (DType (*)[N])C;
DType(*in_matrix1)[N] = (DType(*)[N])B;
DType(*in_matrix2)[N] = (DType(*)[N])E;
DType(*output)[N] = (DType(*)[N])C;
#pragma omp parallel
{
for (IdType k = 0; k < num_K_blocks; k++) {
#pragma omp for schedule(dynamic)
for (IdType m = 0; m < num_M_blocks; m++) {
CSRMatrixInternal<IdType, IdType> cur_csr = block_csr_array[m * num_K_blocks + k];
CSRMatrixInternal<IdType, IdType> cur_csr =
block_csr_array[m * num_K_blocks + k];
const IdType M_start = m * M_block_size;
for (IdType i = 0; i < cur_csr.num_rows; i++) {
const IdType row_start = cur_csr.indptr[i];
const IdType row_end = cur_csr.indptr[i + 1];
const IdType row_end = cur_csr.indptr[i + 1];
const IdType dst = i + M_start;
libxsmm_meltw_opreduce_vecs_idx_param params;
......@@ -335,36 +336,37 @@ inline void SpMMBlockwiseOpSum(
* \param argE Arg-Min/Max on edges.
* \param has_idx For the edge features, are there indices available.
* \param N Feature size.
* \param num_M_blocks Number of blocks to create along the rows of adjacency matrix.
* \param num_K_blocks Number of blocks to create along the columns of adjacency matrix.
* \param num_M_blocks Number of blocks to create along the rows of adjacency
* matrix.
* \param num_K_blocks Number of blocks to create along the columns of adjacency
* matrix.
* \param M_block_size block size along the rows of adjacency matrix.
* \param kernel The libxsmm kernel.
*/
template <typename IdType, typename DType, typename Op, typename Cmp>
inline void SpMMBlockwiseOpCmp(
CSRMatrixInternal<IdType, IdType> *block_csr_array,
const DType *B, const DType *E, DType *C, IdType *argB, IdType *argE,
bool has_idx, IdType N,
IdType num_M_blocks, IdType num_K_blocks, IdType M_block_size,
CSRMatrixInternal<IdType, IdType> *block_csr_array, const DType *B,
const DType *E, DType *C, IdType *argB, IdType *argE, bool has_idx,
IdType N, IdType num_M_blocks, IdType num_K_blocks, IdType M_block_size,
libxsmm_meltwfunction_opreduce_vecs_idx kernel) {
DType (*in_matrix1)[N] = (DType (*)[N])B;
DType (*in_matrix2)[N] = (DType (*)[N])E;
DType (*output)[N] = (DType (*)[N])C;
IdType (*out_matrix1)[N] = (IdType (*)[N])argB;
IdType (*out_matrix2)[N] = (IdType (*)[N])argE;
DType(*in_matrix1)[N] = (DType(*)[N])B;
DType(*in_matrix2)[N] = (DType(*)[N])E;
DType(*output)[N] = (DType(*)[N])C;
IdType(*out_matrix1)[N] = (IdType(*)[N])argB;
IdType(*out_matrix2)[N] = (IdType(*)[N])argE;
#pragma omp parallel
{
for (IdType k = 0; k < num_K_blocks; k++) {
#pragma omp for schedule(dynamic)
for (IdType m = 0; m < num_M_blocks; m++) {
CSRMatrixInternal<IdType, IdType> cur_csr = block_csr_array[m * num_K_blocks + k];
CSRMatrixInternal<IdType, IdType> cur_csr =
block_csr_array[m * num_K_blocks + k];
const IdType M_start = m * M_block_size;
for (IdType i = 0; i < cur_csr.num_rows; i++) {
const IdType row_start = cur_csr.indptr[i];
const IdType row_end = cur_csr.indptr[i + 1];
const IdType row_end = cur_csr.indptr[i + 1];
const IdType dst = i + M_start;
libxsmm_meltw_opreduce_vecs_idx_param params;
......@@ -391,23 +393,21 @@ inline void SpMMBlockwiseOpCmp(
/*!
* \brief Free the tiled CSR matrix data.
* \param block_csr_array The array containing csr matrices of all blocks.
* \param num_M_blocks Number of blocks to create along the rows of adjacency matrix.
* \param num_K_blocks Number of blocks to create along the columns of adjacency matrix.
* \param num_M_blocks Number of blocks to create along the rows of adjacency
* matrix.
* \param num_K_blocks Number of blocks to create along the columns of adjacency
* matrix.
* \param use_lhs Whether to use lhs.
* \param use_rhs Whether to use rhs.
*/
template <typename IdType>
inline void SpMMFreeBlocks(
CSRMatrixInternal<IdType, IdType> *block_csr_array,
IdType num_M_blocks, IdType num_K_blocks,
bool use_lhs, bool use_rhs) {
CSRMatrixInternal<IdType, IdType> *block_csr_array, IdType num_M_blocks,
IdType num_K_blocks, bool use_lhs, bool use_rhs) {
if (num_K_blocks > 1) {
free(block_csr_array[0].indptr);
if (use_lhs)
free(block_csr_array[0].indices);
if (use_rhs)
free(block_csr_array[0].data);
if (use_lhs) free(block_csr_array[0].indices);
if (use_rhs) free(block_csr_array[0].data);
}
free(block_csr_array);
}
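To make the tile layout explicit: the adjacency matrix is covered by num_M_blocks x num_K_blocks sub-CSRs, stored flat so that the tile holding rows starting at m * M_block_size and columns starting at k * K_block_size sits at index m * num_K_blocks + k, exactly as indexed in the code above. A tiny bookkeeping sketch with illustrative helper names:

#include <cstdint>

// Ceiling division: how many tiles of `block_size` cover `total` rows/columns.
inline int64_t NumBlocks(int64_t total, int64_t block_size) {
  return (total + block_size - 1) / block_size;
}

// Flat index of tile (m, k), row-major over the column-block dimension.
inline int64_t BlockIndex(int64_t m, int64_t k, int64_t num_K_blocks) {
  return m * num_K_blocks + k;
}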
......@@ -425,12 +425,8 @@ inline void SpMMFreeBlocks(
*/
template <typename IdType, typename DType, typename Op, typename Redop>
void SpMMRedopCsrOpt(
const BcastOff& bcast,
const CSRMatrix& csr,
NDArray ufeat, NDArray efeat,
NDArray out,
NDArray argu, NDArray arge) {
const BcastOff &bcast, const CSRMatrix &csr, NDArray ufeat, NDArray efeat,
NDArray out, NDArray argu, NDArray arge) {
int32_t llc_size = GetLLCSize();
#ifdef DEBUG
......@@ -440,11 +436,12 @@ void SpMMRedopCsrOpt(
const bool has_idx = !IsNullArray(csr.data);
DType* C = out.Ptr<DType>();
const DType* B = ufeat.Ptr<DType>();
const DType* E = efeat.Ptr<DType>();
DType *C = out.Ptr<DType>();
const DType *B = ufeat.Ptr<DType>();
const DType *E = efeat.Ptr<DType>();
IdType *argB, *argE;
if (std::is_same<Redop, op::Max<DType>>::value || std::is_same<Redop, op::Min<DType>>::value) {
if (std::is_same<Redop, op::Max<DType>>::value ||
std::is_same<Redop, op::Min<DType>>::value) {
argB = argu.Ptr<IdType>();
argE = arge.Ptr<IdType>();
}
......@@ -453,7 +450,7 @@ void SpMMRedopCsrOpt(
const IdType M = csr.num_rows;
const IdType N = bcast.out_len;
const IdType K = csr.num_cols;
const IdType* indptr = csr.indptr.Ptr<IdType>();
const IdType *indptr = csr.indptr.Ptr<IdType>();
CHECK_NOTNULL(indptr);
const IdType total_nnz = indptr[M];
if (M <= 0 || K <= 0 || N <= 0 || total_nnz <= 0) return;
......@@ -461,8 +458,9 @@ void SpMMRedopCsrOpt(
const double avg_degree = total_nnz * 1.0 / M;
const double nnz_prob = avg_degree / K;
IdType K_block_size = std::min((int64_t)K, (int64_t)(llc_size / (N * sizeof(DType) *
nnz_prob * BLOCKING_HEURISTIC_PARAM)));
IdType K_block_size = std::min(
(int64_t)K,
(int64_t)(llc_size / (N * sizeof(DType) * nnz_prob * BLOCKING_HEURISTIC_PARAM)));
IdType M_block_size = M / (nthreads * NUM_BLOCKS_PER_THREAD);
if (M_block_size == 0) M_block_size = 1;
if (K_block_size == 0) K_block_size = 1;
......@@ -471,8 +469,9 @@ void SpMMRedopCsrOpt(
IdType num_K_blocks = (K + K_block_size - 1) / K_block_size;
CSRMatrixInternal<IdType, IdType> *block_csr_array =
(CSRMatrixInternal<IdType, IdType> *)aligned_alloc(64,
sizeof(CSRMatrixInternal<IdType, IdType>) * num_M_blocks * num_K_blocks);
(CSRMatrixInternal<IdType, IdType> *)aligned_alloc(
64, sizeof(CSRMatrixInternal<IdType, IdType>) * num_M_blocks *
num_K_blocks);
#ifdef DEBUG
endTick = __rdtsc();
......@@ -489,14 +488,17 @@ void SpMMRedopCsrOpt(
LOG(INFO) << "total_nnz = " << total_nnz << ", avg_degree = " << avg_degree;
LOG(INFO) << "has_idx = " << has_idx;
LOG(INFO) << "nnz_prob = " << nnz_prob;
LOG(INFO) << "K_block_size = " << K_block_size << ", M_block_size = " << M_block_size;
LOG(INFO) << "num_K_blocks = " << num_K_blocks << ", num_M_blocks = " << num_M_blocks;
LOG(INFO) << "K_block_size = " << K_block_size
<< ", M_block_size = " << M_block_size;
LOG(INFO) << "num_K_blocks = " << num_K_blocks
<< ", num_M_blocks = " << num_M_blocks;
LOG(INFO) << "stage0 ticks = " << (endTick - startTick);
startTick = __rdtsc();
#endif // DEBUG
SpMMCreateBlocks(csr, block_csr_array, num_M_blocks, num_K_blocks, M_block_size, K_block_size,
Op::use_lhs, Op::use_rhs);
SpMMCreateBlocks(
csr, block_csr_array, num_M_blocks, num_K_blocks, M_block_size,
K_block_size, Op::use_lhs, Op::use_rhs);
#ifdef DEBUG
endTick = __rdtsc();
......@@ -506,17 +508,14 @@ void SpMMRedopCsrOpt(
libxsmm_meltwfunction_opreduce_vecs_idx kernel = nullptr;
if (std::is_same<Redop, op::Max<DType>>::value) {
kernel = SpMMCreateLibxsmmKernel<IdType, DType, Op>(has_idx, N,
LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_REDOP_MAX,
true);
kernel = SpMMCreateLibxsmmKernel<IdType, DType, Op>(
has_idx, N, LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_REDOP_MAX, true);
} else if (std::is_same<Redop, op::Min<DType>>::value) {
kernel = SpMMCreateLibxsmmKernel<IdType, DType, Op>(has_idx, N,
LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_REDOP_MIN,
true);
kernel = SpMMCreateLibxsmmKernel<IdType, DType, Op>(
has_idx, N, LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_REDOP_MIN, true);
} else if (std::is_same<Redop, op::Add<DType>>::value) {
kernel = SpMMCreateLibxsmmKernel<IdType, DType, Op>(has_idx, N,
LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_REDOP_SUM,
false);
kernel = SpMMCreateLibxsmmKernel<IdType, DType, Op>(
has_idx, N, LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_REDOP_SUM, false);
}
#ifdef DEBUG
......@@ -525,12 +524,15 @@ void SpMMRedopCsrOpt(
startTick = __rdtsc();
#endif // DEBUG
if (std::is_same<Redop, op::Max<DType>>::value || std::is_same<Redop, op::Min<DType>>::value) {
SpMMBlockwiseOpCmp<IdType, DType, Op, Redop>(block_csr_array, B, E, C, argB, argE, has_idx, N,
num_M_blocks, num_K_blocks, M_block_size, kernel);
if (std::is_same<Redop, op::Max<DType>>::value ||
std::is_same<Redop, op::Min<DType>>::value) {
SpMMBlockwiseOpCmp<IdType, DType, Op, Redop>(
block_csr_array, B, E, C, argB, argE, has_idx, N, num_M_blocks,
num_K_blocks, M_block_size, kernel);
} else {
SpMMBlockwiseOpSum(block_csr_array, B, E, C, has_idx, N, num_M_blocks, num_K_blocks,
M_block_size, kernel);
SpMMBlockwiseOpSum(
block_csr_array, B, E, C, has_idx, N, num_M_blocks, num_K_blocks,
M_block_size, kernel);
}
#ifdef DEBUG
......@@ -539,7 +541,8 @@ void SpMMRedopCsrOpt(
startTick = __rdtsc();
#endif // DEBUG
SpMMFreeBlocks(block_csr_array, num_M_blocks, num_K_blocks, Op::use_lhs, Op::use_rhs);
SpMMFreeBlocks(
block_csr_array, num_M_blocks, num_K_blocks, Op::use_lhs, Op::use_rhs);
#ifdef DEBUG
endTick = __rdtsc();
......@@ -557,10 +560,12 @@ void SpMMRedopCsrOpt(
* \note it uses libxsmm, blocking and dynamic thread scheduling.
*/
template <typename IdType, typename DType, typename Op>
void SpMMSumCsrLibxsmm(const BcastOff& bcast, const CSRMatrix& csr,
NDArray ufeat, NDArray efeat, NDArray out) {
void SpMMSumCsrLibxsmm(
const BcastOff &bcast, const CSRMatrix &csr, NDArray ufeat, NDArray efeat,
NDArray out) {
NDArray dummy;
SpMMRedopCsrOpt<IdType, DType, Op, op::Add<DType>>(bcast, csr, ufeat, efeat, out, dummy, dummy);
SpMMRedopCsrOpt<IdType, DType, Op, op::Add<DType>>(
bcast, csr, ufeat, efeat, out, dummy, dummy);
}
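SpMMSumCsrLibxsmm above forwards to SpMMRedopCsrOpt, which sizes its column blocks from the last-level cache as shown earlier in this file (K_block_size = min(K, llc_size / (N * sizeof(DType) * nnz_prob * BLOCKING_HEURISTIC_PARAM))). A self-contained sketch of that arithmetic; every number below, including the heuristic parameter, is hypothetical:

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Mirrors the visible heuristic: nnz_prob estimates how likely a (row, column)
// pair carries an edge, and the column block shrinks as features get wider.
int64_t PickKBlockSize(int64_t K, int64_t M, int64_t total_nnz, int64_t N,
                       int64_t llc_size, double heuristic_param,
                       size_t dtype_bytes) {
  const double avg_degree = total_nnz * 1.0 / M;
  const double nnz_prob = avg_degree / K;
  int64_t k_block = std::min(
      K, (int64_t)(llc_size / (N * dtype_bytes * nnz_prob * heuristic_param)));
  return k_block == 0 ? 1 : k_block;
}

int main() {
  const int64_t K = 10000;
  const int64_t kbs = PickKBlockSize(K, /*M=*/10000, /*total_nnz=*/5000000,
                                     /*N=*/256, /*llc_size=*/1 << 20,
                                     /*heuristic_param=*/10.0, sizeof(float));
  // With these made-up numbers: K_block_size = 2048, num_K_blocks = 5.
  std::printf("K_block_size = %lld, num_K_blocks = %lld\n", (long long)kbs,
              (long long)((K + kbs - 1) / kbs));
  return 0;
}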
/*!
......@@ -575,9 +580,11 @@ void SpMMSumCsrLibxsmm(const BcastOff& bcast, const CSRMatrix& csr,
* \note it uses libxsmm, blocking and dynamic thread scheduling.
*/
template <typename IdType, typename DType, typename Op, typename Cmp>
void SpMMCmpCsrLibxsmm(const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat,
NDArray efeat, NDArray out, NDArray argu, NDArray arge) {
SpMMRedopCsrOpt<IdType, DType, Op, Cmp>(bcast, csr, ufeat, efeat, out, argu, arge);
void SpMMCmpCsrLibxsmm(
const BcastOff &bcast, const CSRMatrix &csr, NDArray ufeat, NDArray efeat,
NDArray out, NDArray argu, NDArray arge) {
SpMMRedopCsrOpt<IdType, DType, Op, Cmp>(
bcast, csr, ufeat, efeat, out, argu, arge);
}
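These wrappers specialize the naive node-parallel kernels from spmm.h (shown partially earlier in this diff). For orientation, the sum reduction they accelerate amounts to the per-row accumulation below, written here for the copy-from-source operator and a flat float feature of width dim (a rough sketch, not the DGL code path):

#include <cstdint>
#include <vector>

// out must be pre-sized to num_rows * dim and zero-initialized.
void SpmmSumCsrSketch(const std::vector<int64_t>& indptr,
                      const std::vector<int64_t>& indices,
                      const std::vector<float>& u_feat,  // num_cols * dim
                      std::vector<float>* out, int64_t dim) {
  const int64_t num_rows = (int64_t)indptr.size() - 1;
  for (int64_t rid = 0; rid < num_rows; ++rid) {  // DGL parallelizes this loop
    float* out_off = out->data() + rid * dim;
    for (int64_t p = indptr[rid]; p < indptr[rid + 1]; ++p) {
      const int64_t cid = indices[p];
      for (int64_t k = 0; k < dim; ++k) {
        out_off[k] += u_feat[cid * dim + k];  // Op = CopyLhs, Redop = Add
      }
    }
  }
}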
} // namespace cpu
......
......@@ -4,58 +4,49 @@
* \brief Graph traversal implementation
*/
#include "./traversal.h"
#include <dgl/graph_traversal.h>
#include <algorithm>
#include <queue>
#include "./traversal.h"
namespace dgl {
namespace aten {
namespace impl {
namespace {
// A utility view class to wrap a vector into a queue.
template<typename DType>
template <typename DType>
struct VectorQueueWrapper {
std::vector<DType>* vec;
size_t head = 0;
explicit VectorQueueWrapper(std::vector<DType>* vec): vec(vec) {}
explicit VectorQueueWrapper(std::vector<DType>* vec) : vec(vec) {}
void push(const DType& elem) {
vec->push_back(elem);
}
void push(const DType& elem) { vec->push_back(elem); }
DType top() const {
return vec->operator[](head);
}
DType top() const { return vec->operator[](head); }
void pop() {
++head;
}
void pop() { ++head; }
bool empty() const {
return head == vec->size();
}
bool empty() const { return head == vec->size(); }
size_t size() const {
return vec->size() - head;
}
size_t size() const { return vec->size() - head; }
};
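A small usage sketch of the wrapper above: because pop() only advances a head index, everything ever pushed stays in the backing vector, so the same vector serves as the BFS queue and as the visitation-order trace that the frontier functions return (illustrative code, not the DGL API):

#include <cstdint>
#include <cstdio>
#include <vector>

// Same idea as VectorQueueWrapper: a FIFO view over a growing vector.
template <typename T>
struct VecQueue {
  std::vector<T>* vec;
  size_t head = 0;
  explicit VecQueue(std::vector<T>* v) : vec(v) {}
  void push(const T& x) { vec->push_back(x); }
  T top() const { return (*vec)[head]; }
  void pop() { ++head; }
  bool empty() const { return head == vec->size(); }
};

int main() {
  std::vector<int64_t> trace;
  VecQueue<int64_t> q(&trace);
  q.push(0);
  q.push(3);
  q.push(5);
  while (!q.empty()) {
    const int64_t u = q.top();  // a BFS would expand u here
    q.pop();
    (void)u;
  }
  // The full visitation order is still available in `trace`.
  for (int64_t v : trace) std::printf("%lld ", (long long)v);
  std::printf("\n");
  return 0;
}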
// Internal function to merge multiple traversal traces into one ndarray.
// It is similar to zip the vectors together.
template<typename DType>
IdArray MergeMultipleTraversals(
const std::vector<std::vector<DType>>& traces) {
template <typename DType>
IdArray MergeMultipleTraversals(const std::vector<std::vector<DType>>& traces) {
int64_t max_len = 0, total_len = 0;
for (size_t i = 0; i < traces.size(); ++i) {
const int64_t tracelen = traces[i].size();
max_len = std::max(max_len, tracelen);
total_len += traces[i].size();
}
IdArray ret = IdArray::Empty({total_len},
DGLDataType{kDGLInt, sizeof(DType) * 8, 1},
DGLContext{kDGLCPU, 0});
IdArray ret = IdArray::Empty(
{total_len}, DGLDataType{kDGLInt, sizeof(DType) * 8, 1},
DGLContext{kDGLCPU, 0});
DType* ret_data = static_cast<DType*>(ret->data);
for (int64_t i = 0; i < max_len; ++i) {
for (size_t j = 0; j < traces.size(); ++j) {
......@@ -71,15 +62,15 @@ IdArray MergeMultipleTraversals(
// Internal function to compute sections if multiple traversal traces
// are merged into one ndarray.
template<typename DType>
IdArray ComputeMergedSections(
const std::vector<std::vector<DType>>& traces) {
template <typename DType>
IdArray ComputeMergedSections(const std::vector<std::vector<DType>>& traces) {
int64_t max_len = 0;
for (size_t i = 0; i < traces.size(); ++i) {
const int64_t tracelen = traces[i].size();
max_len = std::max(max_len, tracelen);
}
IdArray ret = IdArray::Empty({max_len}, DGLDataType{kDGLInt, 64, 1}, DGLContext{kDGLCPU, 0});
IdArray ret = IdArray::Empty(
{max_len}, DGLDataType{kDGLInt, 64, 1}, DGLContext{kDGLCPU, 0});
int64_t* ret_data = static_cast<int64_t*>(ret->data);
for (int64_t i = 0; i < max_len; ++i) {
int64_t sec_len = 0;
......@@ -101,13 +92,13 @@ Frontiers BFSNodesFrontiers(const CSRMatrix& csr, IdArray source) {
std::vector<IdType> ids;
std::vector<int64_t> sections;
VectorQueueWrapper<IdType> queue(&ids);
auto visit = [&] (const int64_t v) { };
auto make_frontier = [&] () {
if (!queue.empty()) {
// do not push zero-length frontier
sections.push_back(queue.size());
}
};
auto visit = [&](const int64_t v) {};
auto make_frontier = [&]() {
if (!queue.empty()) {
// do not push zero-length frontier
sections.push_back(queue.size());
}
};
BFSTraverseNodes<IdType>(csr, source, &queue, visit, make_frontier);
Frontiers front;
......@@ -116,8 +107,10 @@ Frontiers BFSNodesFrontiers(const CSRMatrix& csr, IdArray source) {
return front;
}
template Frontiers BFSNodesFrontiers<kDGLCPU, int32_t>(const CSRMatrix&, IdArray);
template Frontiers BFSNodesFrontiers<kDGLCPU, int64_t>(const CSRMatrix&, IdArray);
template Frontiers BFSNodesFrontiers<kDGLCPU, int32_t>(
const CSRMatrix&, IdArray);
template Frontiers BFSNodesFrontiers<kDGLCPU, int64_t>(
const CSRMatrix&, IdArray);
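For readers skimming the diff, this is the shape of the result BFSNodesFrontiers assembles: the flat visitation order plus the size of each BFS level. A stand-alone sketch over a plain CSR with a single source node, illustrative names only:

#include <cstdint>
#include <vector>

struct FrontiersSketch {
  std::vector<int64_t> ids;       // all visited nodes, level by level
  std::vector<int64_t> sections;  // length of each level
};

FrontiersSketch BfsNodeFrontiers(const std::vector<int64_t>& indptr,
                                 const std::vector<int64_t>& indices,
                                 int64_t source) {
  const int64_t n = (int64_t)indptr.size() - 1;
  std::vector<bool> visited(n, false);
  FrontiersSketch out;
  out.ids.push_back(source);
  visited[source] = true;
  size_t level_begin = 0;
  while (level_begin < out.ids.size()) {
    const size_t level_end = out.ids.size();
    out.sections.push_back((int64_t)(level_end - level_begin));
    for (size_t i = level_begin; i < level_end; ++i) {
      const int64_t u = out.ids[i];
      for (int64_t p = indptr[u]; p < indptr[u + 1]; ++p) {
        const int64_t v = indices[p];
        if (!visited[v]) {
          visited[v] = true;
          out.ids.push_back(v);  // enqueue into the next level
        }
      }
    }
    level_begin = level_end;
  }
  return out;
}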
template <DGLDeviceType XPU, typename IdType>
Frontiers BFSEdgesFrontiers(const CSRMatrix& csr, IdArray source) {
......@@ -126,16 +119,16 @@ Frontiers BFSEdgesFrontiers(const CSRMatrix& csr, IdArray source) {
// NOTE: std::queue has no top() method.
std::vector<IdType> nodes;
VectorQueueWrapper<IdType> queue(&nodes);
auto visit = [&] (const IdType e) { ids.push_back(e); };
auto visit = [&](const IdType e) { ids.push_back(e); };
bool first_frontier = true;
auto make_frontier = [&] {
if (first_frontier) {
first_frontier = false; // do not push the first section when doing edges
} else if (!queue.empty()) {
// do not push zero-length frontier
sections.push_back(queue.size());
}
};
if (first_frontier) {
first_frontier = false; // do not push the first section when doing edges
} else if (!queue.empty()) {
// do not push zero-length frontier
sections.push_back(queue.size());
}
};
BFSTraverseEdges<IdType>(csr, source, &queue, visit, make_frontier);
Frontiers front;
......@@ -144,21 +137,23 @@ Frontiers BFSEdgesFrontiers(const CSRMatrix& csr, IdArray source) {
return front;
}
template Frontiers BFSEdgesFrontiers<kDGLCPU, int32_t>(const CSRMatrix&, IdArray);
template Frontiers BFSEdgesFrontiers<kDGLCPU, int64_t>(const CSRMatrix&, IdArray);
template Frontiers BFSEdgesFrontiers<kDGLCPU, int32_t>(
const CSRMatrix&, IdArray);
template Frontiers BFSEdgesFrontiers<kDGLCPU, int64_t>(
const CSRMatrix&, IdArray);
template <DGLDeviceType XPU, typename IdType>
Frontiers TopologicalNodesFrontiers(const CSRMatrix& csr) {
std::vector<IdType> ids;
std::vector<int64_t> sections;
VectorQueueWrapper<IdType> queue(&ids);
auto visit = [&] (const uint64_t v) { };
auto make_frontier = [&] () {
if (!queue.empty()) {
// do not push zero-length frontier
sections.push_back(queue.size());
}
};
auto visit = [&](const uint64_t v) {};
auto make_frontier = [&]() {
if (!queue.empty()) {
// do not push zero-length frontier
sections.push_back(queue.size());
}
};
TopologicalNodes<IdType>(csr, &queue, visit, make_frontier);
Frontiers front;
......@@ -167,8 +162,10 @@ Frontiers TopologicalNodesFrontiers(const CSRMatrix& csr) {
return front;
}
template Frontiers TopologicalNodesFrontiers<kDGLCPU, int32_t>(const CSRMatrix&);
template Frontiers TopologicalNodesFrontiers<kDGLCPU, int64_t>(const CSRMatrix&);
template Frontiers TopologicalNodesFrontiers<kDGLCPU, int32_t>(
const CSRMatrix&);
template Frontiers TopologicalNodesFrontiers<kDGLCPU, int64_t>(
const CSRMatrix&);
template <DGLDeviceType XPU, typename IdType>
Frontiers DGLDFSEdges(const CSRMatrix& csr, IdArray source) {
......@@ -177,7 +174,7 @@ Frontiers DGLDFSEdges(const CSRMatrix& csr, IdArray source) {
std::vector<std::vector<IdType>> edges(len);
for (int64_t i = 0; i < len; ++i) {
auto visit = [&] (IdType e, int tag) { edges[i].push_back(e); };
auto visit = [&](IdType e, int tag) { edges[i].push_back(e); };
DFSLabeledEdges<IdType>(csr, src_data[i], false, false, visit);
}
......@@ -191,11 +188,9 @@ template Frontiers DGLDFSEdges<kDGLCPU, int32_t>(const CSRMatrix&, IdArray);
template Frontiers DGLDFSEdges<kDGLCPU, int64_t>(const CSRMatrix&, IdArray);
template <DGLDeviceType XPU, typename IdType>
Frontiers DGLDFSLabeledEdges(const CSRMatrix& csr,
IdArray source,
const bool has_reverse_edge,
const bool has_nontree_edge,
const bool return_labels) {
Frontiers DGLDFSLabeledEdges(
const CSRMatrix& csr, IdArray source, const bool has_reverse_edge,
const bool has_nontree_edge, const bool return_labels) {
const int64_t len = source->shape[0];
const IdType* src_data = static_cast<IdType*>(source->data);
std::vector<std::vector<IdType>> edges(len);
......@@ -206,14 +201,14 @@ Frontiers DGLDFSLabeledEdges(const CSRMatrix& csr,
}
for (int64_t i = 0; i < len; ++i) {
auto visit = [&] (IdType e, int64_t tag) {
auto visit = [&](IdType e, int64_t tag) {
edges[i].push_back(e);
if (return_labels) {
tags[i].push_back(tag);
}
};
DFSLabeledEdges<IdType>(csr, src_data[i],
has_reverse_edge, has_nontree_edge, visit);
DFSLabeledEdges<IdType>(
csr, src_data[i], has_reverse_edge, has_nontree_edge, visit);
}
Frontiers front;
......@@ -226,16 +221,10 @@ Frontiers DGLDFSLabeledEdges(const CSRMatrix& csr,
return front;
}
template Frontiers DGLDFSLabeledEdges<kDGLCPU, int32_t>(const CSRMatrix&,
IdArray,
const bool,
const bool,
const bool);
template Frontiers DGLDFSLabeledEdges<kDGLCPU, int64_t>(const CSRMatrix&,
IdArray,
const bool,
const bool,
const bool);
template Frontiers DGLDFSLabeledEdges<kDGLCPU, int32_t>(
const CSRMatrix&, IdArray, const bool, const bool, const bool);
template Frontiers DGLDFSLabeledEdges<kDGLCPU, int64_t>(
const CSRMatrix&, IdArray, const bool, const bool, const bool);
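DGLDFSLabeledEdges above collects, per source, the out-edges in DFS order together with an optional tag (FORWARD, REVERSE, NONTREE; see the notes in traversal.h below). A rough stand-alone sketch of one such traversal, always reporting all three tags and using the CSR position as the edge id (the real code honours csr.data and the has_* flags):

#include <cstdint>
#include <vector>

enum Tag { kForward = 0, kReverse = 1, kNontree = 2 };

// Iterative DFS from `source`: FORWARD when the edge discovers a new node,
// REVERSE when that tree edge is left again while backtracking, NONTREE when
// the head was already visited.
void DfsLabeledEdgesSketch(const std::vector<int64_t>& indptr,
                           const std::vector<int64_t>& indices, int64_t source,
                           void (*visit)(int64_t edge_id, Tag tag)) {
  const int64_t n = (int64_t)indptr.size() - 1;
  std::vector<bool> visited(n, false);
  struct Frame { int64_t u, next, in_edge; };  // node, next edge offset, entering edge
  std::vector<Frame> stack;
  stack.push_back({source, indptr[source], -1});
  visited[source] = true;
  while (!stack.empty()) {
    Frame& f = stack.back();
    if (f.next == indptr[f.u + 1]) {  // all out-edges of u handled: backtrack
      if (f.in_edge >= 0) visit(f.in_edge, kReverse);
      stack.pop_back();
      continue;
    }
    const int64_t e = f.next++;
    const int64_t v = indices[e];
    if (!visited[v]) {
      visited[v] = true;
      visit(e, kForward);
      stack.push_back({v, indptr[v], e});
    } else {
      visit(e, kNontree);
    }
  }
}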
} // namespace impl
} // namespace aten
......
......@@ -3,15 +3,16 @@
* \file array/cpu/traversal.h
* \brief Graph traversal routines.
*
* Traversal routines generate frontiers. Frontiers can be node frontiers or edge
* frontiers depending on the traversal function. Each frontier is a
* list of nodes/edges (specified by their ids). An optional tag can be specified
* for each node/edge (represented by an int value).
* Traversal routines generate frontiers. Frontiers can be node frontiers or
* edge frontiers depending on the traversal function. Each frontier is a list
* of nodes/edges (specified by their ids). An optional tag can be specified for
* each node/edge (represented by an int value).
*/
#ifndef DGL_ARRAY_CPU_TRAVERSAL_H_
#define DGL_ARRAY_CPU_TRAVERSAL_H_
#include <dgl/graph_interface.h>
#include <stack>
#include <tuple>
#include <vector>
......@@ -43,16 +44,16 @@ namespace impl {
* \param reversed If true, BFS follows the in-edge direction
* \param queue The queue used to do bfs.
* \param visit The function to call when a node is visited.
 * \param make_frontier The function to indicate that a new frontier can be made;
 * \param make_frontier The function to indicate that a new frontier can be
 *        made;
*/
template<typename IdType, typename Queue, typename VisitFn, typename FrontierFn>
void BFSTraverseNodes(const CSRMatrix& csr,
IdArray source,
Queue* queue,
VisitFn visit,
FrontierFn make_frontier) {
template <
typename IdType, typename Queue, typename VisitFn, typename FrontierFn>
void BFSTraverseNodes(
const CSRMatrix &csr, IdArray source, Queue *queue, VisitFn visit,
FrontierFn make_frontier) {
const int64_t len = source->shape[0];
const IdType *src_data = static_cast<IdType*>(source->data);
const IdType *src_data = static_cast<IdType *>(source->data);
const IdType *indptr_data = static_cast<IdType *>(csr.indptr->data);
const IdType *indices_data = static_cast<IdType *>(csr.indices->data);
......@@ -71,7 +72,7 @@ void BFSTraverseNodes(const CSRMatrix& csr,
for (size_t i = 0; i < size; ++i) {
const IdType u = queue->top();
queue->pop();
for (auto idx = indptr_data[u]; idx < indptr_data[u+1]; ++idx) {
for (auto idx = indptr_data[u]; idx < indptr_data[u + 1]; ++idx) {
auto v = indices_data[idx];
if (!visited[v]) {
visited[v] = true;
......@@ -109,16 +110,16 @@ void BFSTraverseNodes(const CSRMatrix& csr,
* \param queue The queue used to do bfs.
* \param visit The function to call when a node is visited.
* The argument would be edge ID.
* \param make_frontier The function to indicate that a new frontier can be made;
* \param make_frontier The function to indicate that a new frontier can be
* made;
*/
template<typename IdType, typename Queue, typename VisitFn, typename FrontierFn>
void BFSTraverseEdges(const CSRMatrix& csr,
IdArray source,
Queue* queue,
VisitFn visit,
FrontierFn make_frontier) {
template <
typename IdType, typename Queue, typename VisitFn, typename FrontierFn>
void BFSTraverseEdges(
const CSRMatrix &csr, IdArray source, Queue *queue, VisitFn visit,
FrontierFn make_frontier) {
const int64_t len = source->shape[0];
const IdType* src_data = static_cast<IdType*>(source->data);
const IdType *src_data = static_cast<IdType *>(source->data);
const IdType *indptr_data = static_cast<IdType *>(csr.indptr->data);
const IdType *indices_data = static_cast<IdType *>(csr.indices->data);
......@@ -138,7 +139,7 @@ void BFSTraverseEdges(const CSRMatrix& csr,
for (size_t i = 0; i < size; ++i) {
const IdType u = queue->top();
queue->pop();
for (auto idx = indptr_data[u]; idx < indptr_data[u+1]; ++idx) {
for (auto idx = indptr_data[u]; idx < indptr_data[u + 1]; ++idx) {
auto e = eid_data ? eid_data[idx] : idx;
const IdType v = indices_data[idx];
if (!visited[v]) {
......@@ -174,13 +175,14 @@ void BFSTraverseEdges(const CSRMatrix& csr,
* \param reversed If true, follows the in-edge direction
* \param queue The queue used to do bfs.
* \param visit The function to call when a node is visited.
* \param make_frontier The function to indicate that a new frontier can be made;
* \param make_frontier The function to indicate that a new frontier can be
* made;
*/
template<typename IdType, typename Queue, typename VisitFn, typename FrontierFn>
void TopologicalNodes(const CSRMatrix& csr,
Queue* queue,
VisitFn visit,
FrontierFn make_frontier) {
template <
typename IdType, typename Queue, typename VisitFn, typename FrontierFn>
void TopologicalNodes(
const CSRMatrix &csr, Queue *queue, VisitFn visit,
FrontierFn make_frontier) {
int64_t num_visited_nodes = 0;
const IdType *indptr_data = static_cast<IdType *>(csr.indptr->data);
const IdType *indices_data = static_cast<IdType *>(csr.indices->data);
......@@ -206,7 +208,7 @@ void TopologicalNodes(const CSRMatrix& csr,
for (size_t i = 0; i < size; ++i) {
const IdType u = queue->top();
queue->pop();
for (auto idx = indptr_data[u]; idx < indptr_data[u+1]; ++idx) {
for (auto idx = indptr_data[u]; idx < indptr_data[u + 1]; ++idx) {
const IdType v = indices_data[idx];
if (--(degrees[v]) == 0) {
visit(v);
......@@ -219,7 +221,8 @@ void TopologicalNodes(const CSRMatrix& csr,
}
if (num_visited_nodes != num_nodes) {
LOG(FATAL) << "Error in topological traversal: loop detected in the given graph.";
LOG(FATAL)
<< "Error in topological traversal: loop detected in the given graph.";
}
}
......@@ -236,32 +239,29 @@ enum DFSEdgeTag {
* FORWARD(0), REVERSE(1), NONTREE(2)
*
* A FORWARD edge is one in which `u` has been visited but `v` has not.
* A REVERSE edge is one in which both `u` and `v` have been visited and the edge
* is in the DFS tree.
* A NONTREE edge is one in which both `u` and `v` have been visited but the edge
* is NOT in the DFS tree.
* A REVERSE edge is one in which both `u` and `v` have been visited and the
* edge is in the DFS tree. A NONTREE edge is one in which both `u` and `v` have
* been visited but the edge is NOT in the DFS tree.
*
* \param source Source node.
* \param reversed If true, DFS follows the in-edge direction
* \param has_reverse_edge If true, REVERSE edges are included
* \param has_nontree_edge If true, NONTREE edges are included
* \param visit The function to call when an edge is visited; the edge id and its
* tag will be given as the arguments.
* \param visit The function to call when an edge is visited; the edge id and
* its tag will be given as the arguments.
*/
template<typename IdType, typename VisitFn>
void DFSLabeledEdges(const CSRMatrix& csr,
IdType source,
bool has_reverse_edge,
bool has_nontree_edge,
VisitFn visit) {
template <typename IdType, typename VisitFn>
void DFSLabeledEdges(
const CSRMatrix &csr, IdType source, bool has_reverse_edge,
bool has_nontree_edge, VisitFn visit) {
const int64_t num_nodes = csr.num_rows;
CHECK_GE(num_nodes, source) << "source " << source <<
" is out of range [0," << num_nodes << "]";
CHECK_GE(num_nodes, source)
<< "source " << source << " is out of range [0," << num_nodes << "]";
const IdType *indptr_data = static_cast<IdType *>(csr.indptr->data);
const IdType *indices_data = static_cast<IdType *>(csr.indices->data);
const IdType *eid_data = static_cast<IdType *>(csr.data->data);
if (indptr_data[source+1]-indptr_data[source] == 0) {
if (indptr_data[source + 1] - indptr_data[source] == 0) {
// no out-going edges from the source node
return;
}
......@@ -278,7 +278,8 @@ void DFSLabeledEdges(const CSRMatrix& csr,
while (!stack.empty()) {
std::tie(u, i, on_tree) = stack.top();
const IdType v = indices_data[indptr_data[u] + i];
const IdType uv = eid_data ? eid_data[indptr_data[u] + i] : indptr_data[u] + i;
const IdType uv =
eid_data ? eid_data[indptr_data[u] + i] : indptr_data[u] + i;
if (visited[v]) {
if (!on_tree && has_nontree_edge) {
visit(uv, kNonTree);
......@@ -288,7 +289,7 @@ void DFSLabeledEdges(const CSRMatrix& csr,
stack.pop();
// find next one.
if (indptr_data[u] + i < indptr_data[u + 1] - 1) {
stack.push(std::make_tuple(u, i+1, false));
stack.push(std::make_tuple(u, i + 1, false));
}
} else {
visited[v] = true;
......
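// Illustrative usage sketch (not part of this commit): a hypothetical visit
// callback for DFSLabeledEdges that buckets edge ids by their tag. kNonTree is
// the value used above; kForward and kReverse are assumed to be the DFSEdgeTag
// values for FORWARD(0) and REVERSE(1) edges.
//
//   std::vector<IdType> forward, reverse, nontree;
//   auto visit = [&](IdType eid, int tag) {
//     if (tag == kForward) forward.push_back(eid);
//     else if (tag == kReverse) reverse.push_back(eid);
//     else nontree.push_back(eid);  // kNonTree
//   };
//   DFSLabeledEdges<IdType>(csr, source, /*has_reverse_edge=*/true,
//                           /*has_nontree_edge=*/true, visit);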
......@@ -4,9 +4,10 @@
* \brief Array cumsum GPU implementation
*/
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
#include "./dgl_cub.cuh"
#include "./utils.h"
namespace dgl {
using runtime::NDArray;
......@@ -17,7 +18,8 @@ template <DGLDeviceType XPU, typename IdType>
IdArray CumSum(IdArray array, bool prepend_zero) {
const int64_t len = array.NumElements();
if (len == 0)
return !prepend_zero ? array : aten::Full(0, 1, array->dtype.bits, array->ctx);
return !prepend_zero ? array
: aten::Full(0, 1, array->dtype.bits, array->ctx);
auto device = runtime::DeviceAPI::Get(array->ctx);
cudaStream_t stream = runtime::getCurrentCUDAStream();
......
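// Worked example (illustrative, not part of this commit): for input [1, 2, 3],
// CumSum with prepend_zero == true returns [0, 1, 3, 6] (length len + 1) and
// with prepend_zero == false returns [1, 3, 6]; the early return above keeps
// the same convention when the input array is empty.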
......@@ -5,9 +5,10 @@
*/
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
#include "./dgl_cub.cuh"
#include "./utils.h"
namespace dgl {
using runtime::NDArray;
......@@ -16,14 +17,11 @@ namespace impl {
template <typename IdType>
struct IsNonZeroIndex {
explicit IsNonZeroIndex(const IdType * array) : array_(array) {
}
explicit IsNonZeroIndex(const IdType* array) : array_(array) {}
__device__ bool operator() (const int64_t index) {
return array_[index] != 0;
}
__device__ bool operator()(const int64_t index) { return array_[index] != 0; }
const IdType * array_;
const IdType* array_;
};
template <DGLDeviceType XPU, typename IdType>
......@@ -36,22 +34,23 @@ IdArray NonZero(IdArray array) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
const IdType * const in_data = static_cast<const IdType*>(array->data);
int64_t * const out_data = static_cast<int64_t*>(ret->data);
const IdType* const in_data = static_cast<const IdType*>(array->data);
int64_t* const out_data = static_cast<int64_t*>(ret->data);
IsNonZeroIndex<IdType> comp(in_data);
cub::CountingInputIterator<int64_t> counter(0);
// room for cub to output on GPU
int64_t * d_num_nonzeros = static_cast<int64_t*>(
device->AllocWorkspace(ctx, sizeof(int64_t)));
int64_t* d_num_nonzeros =
static_cast<int64_t*>(device->AllocWorkspace(ctx, sizeof(int64_t)));
size_t temp_size = 0;
CUDA_CALL(cub::DeviceSelect::If(nullptr, temp_size, counter, out_data,
d_num_nonzeros, len, comp, stream));
void * temp = device->AllocWorkspace(ctx, temp_size);
CUDA_CALL(cub::DeviceSelect::If(temp, temp_size, counter, out_data,
d_num_nonzeros, len, comp, stream));
CUDA_CALL(cub::DeviceSelect::If(
nullptr, temp_size, counter, out_data, d_num_nonzeros, len, comp,
stream));
void* temp = device->AllocWorkspace(ctx, temp_size);
CUDA_CALL(cub::DeviceSelect::If(
temp, temp_size, counter, out_data, d_num_nonzeros, len, comp, stream));
device->FreeWorkspace(ctx, temp);
// copy number of selected elements from GPU to CPU
......
......@@ -4,9 +4,10 @@
* \brief Array sort GPU implementation
*/
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
#include "./dgl_cub.cuh"
#include "./utils.h"
namespace dgl {
using runtime::NDArray;
......@@ -29,26 +30,30 @@ std::pair<IdArray, IdArray> Sort(IdArray array, int num_bits) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
if (num_bits == 0) {
num_bits = sizeof(IdType)*8;
num_bits = sizeof(IdType) * 8;
}
// Allocate workspace
size_t workspace_size = 0;
CUDA_CALL(cub::DeviceRadixSort::SortPairs(nullptr, workspace_size,
keys_in, keys_out, values_in, values_out, nitems, 0, num_bits, stream));
CUDA_CALL(cub::DeviceRadixSort::SortPairs(
nullptr, workspace_size, keys_in, keys_out, values_in, values_out, nitems,
0, num_bits, stream));
void* workspace = device->AllocWorkspace(ctx, workspace_size);
// Compute
CUDA_CALL(cub::DeviceRadixSort::SortPairs(workspace, workspace_size,
keys_in, keys_out, values_in, values_out, nitems, 0, num_bits, stream));
CUDA_CALL(cub::DeviceRadixSort::SortPairs(
workspace, workspace_size, keys_in, keys_out, values_in, values_out,
nitems, 0, num_bits, stream));
device->FreeWorkspace(ctx, workspace);
return std::make_pair(sorted_array, sorted_idx);
}
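// Minimal sketch (not part of this commit) of the two-pass CUB convention used
// above: the first SortPairs call with a null temp-storage pointer only fills
// in the required workspace size, the second call performs the sort. The d_*
// pointers and n are hypothetical device buffers and length from the caller.
template <typename KeyT, typename ValueT>
void _RadixSortPairsSketch(
    const KeyT* d_keys_in, KeyT* d_keys_out, const ValueT* d_vals_in,
    ValueT* d_vals_out, int n, cudaStream_t stream) {
  size_t temp_bytes = 0;
  // Pass 1: size query only (temp storage pointer is nullptr).
  CUDA_CALL(cub::DeviceRadixSort::SortPairs(
      nullptr, temp_bytes, d_keys_in, d_keys_out, d_vals_in, d_vals_out, n, 0,
      sizeof(KeyT) * 8, stream));
  void* d_temp = nullptr;
  CUDA_CALL(cudaMalloc(&d_temp, temp_bytes));
  // Pass 2: the actual sort, using the allocated workspace.
  CUDA_CALL(cub::DeviceRadixSort::SortPairs(
      d_temp, temp_bytes, d_keys_in, d_keys_out, d_vals_in, d_vals_out, n, 0,
      sizeof(KeyT) * 8, stream));
  CUDA_CALL(cudaFree(d_temp));
}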
template std::pair<IdArray, IdArray> Sort<kDGLCUDA, int32_t>(IdArray, int num_bits);
template std::pair<IdArray, IdArray> Sort<kDGLCUDA, int64_t>(IdArray, int num_bits);
template std::pair<IdArray, IdArray> Sort<kDGLCUDA, int32_t>(
IdArray, int num_bits);
template std::pair<IdArray, IdArray> Sort<kDGLCUDA, int64_t>(
IdArray, int num_bits);
} // namespace impl
} // namespace aten
......
......@@ -4,6 +4,7 @@
* \brief COO2CSR
*/
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
......@@ -46,18 +47,15 @@ CSRMatrix COOToCSR<kDGLCUDA, int32_t>(COOMatrix coo) {
if (!COOHasData(coo))
coo.data = aten::Range(0, nnz, coo.row->dtype.bits, coo.row->ctx);
NDArray indptr = aten::NewIdArray(coo.num_rows + 1, coo.row->ctx, coo.row->dtype.bits);
NDArray indptr =
aten::NewIdArray(coo.num_rows + 1, coo.row->ctx, coo.row->dtype.bits);
int32_t* indptr_ptr = static_cast<int32_t*>(indptr->data);
CUSPARSE_CALL(cusparseXcoo2csr(
thr_entry->cusparse_handle,
coo.row.Ptr<int32_t>(),
nnz,
coo.num_rows,
indptr_ptr,
CUSPARSE_INDEX_BASE_ZERO));
return CSRMatrix(coo.num_rows, coo.num_cols,
indptr, coo.col, coo.data, col_sorted);
thr_entry->cusparse_handle, coo.row.Ptr<int32_t>(), nnz, coo.num_rows,
indptr_ptr, CUSPARSE_INDEX_BASE_ZERO));
return CSRMatrix(
coo.num_rows, coo.num_cols, indptr, coo.col, coo.data, col_sorted);
}
/*!
......@@ -77,9 +75,8 @@ CSRMatrix COOToCSR<kDGLCUDA, int32_t>(COOMatrix coo) {
*/
template <typename IdType>
__global__ void _SortedSearchKernelUpperBound(
const IdType* hay, int64_t hay_size,
const IdType* needles, int64_t num_needles,
IdType* pos) {
const IdType* hay, int64_t hay_size, const IdType* needles,
int64_t num_needles, IdType* pos) {
int tx = blockIdx.x * blockDim.x + threadIdx.x;
const int stride_x = gridDim.x * blockDim.x;
while (tx < num_needles) {
......@@ -123,14 +120,12 @@ CSRMatrix COOToCSR<kDGLCUDA, int64_t>(COOMatrix coo) {
const int nt = cuda::FindNumThreads(coo.num_rows);
const int nb = (coo.num_rows + nt - 1) / nt;
IdArray indptr = Full(0, coo.num_rows + 1, nbits, ctx);
CUDA_KERNEL_CALL(_SortedSearchKernelUpperBound,
nb, nt, 0, stream,
coo.row.Ptr<int64_t>(), nnz,
rowids.Ptr<int64_t>(), coo.num_rows,
indptr.Ptr<int64_t>() + 1);
return CSRMatrix(coo.num_rows, coo.num_cols,
indptr, coo.col, coo.data, col_sorted);
CUDA_KERNEL_CALL(
_SortedSearchKernelUpperBound, nb, nt, 0, stream, coo.row.Ptr<int64_t>(),
nnz, rowids.Ptr<int64_t>(), coo.num_rows, indptr.Ptr<int64_t>() + 1);
return CSRMatrix(
coo.num_rows, coo.num_cols, indptr, coo.col, coo.data, col_sorted);
}
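// Worked example (illustrative, not part of this commit): with row-sorted
// coo.row == [0, 0, 1, 3] and num_rows == 4, the upper-bound search above
// writes [2, 3, 3, 4] into indptr[1..4]; together with the leading zero from
// Full() this yields indptr == [0, 2, 3, 3, 4], i.e. row i owns the nonzero
// range [indptr[i], indptr[i + 1]).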
template CSRMatrix COOToCSR<kDGLCUDA, int32_t>(COOMatrix coo);
......
......@@ -4,8 +4,9 @@
* \brief Sort COO index
*/
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
#include "../../c_api_common.h"
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
namespace dgl {
......@@ -18,21 +19,20 @@ namespace impl {
///////////////////////////// COOSort_ /////////////////////////////
/**
* @brief Encode row and column IDs into a single scalar per edge.
*
* @tparam IdType The type to encode as.
* @param row The row (src) IDs per edge.
* @param col The column (dst) IDs per edge.
* @param nnz The number of edges.
* @param col_bits The number of bits used to encode the destination. The row
* information is packed into the remaining bits.
* @param key The encoded edges (output).
*/
* @brief Encode row and column IDs into a single scalar per edge.
*
* @tparam IdType The type to encode as.
* @param row The row (src) IDs per edge.
* @param col The column (dst) IDs per edge.
* @param nnz The number of edges.
* @param col_bits The number of bits used to encode the destination. The row
* information is packed into the remaining bits.
* @param key The encoded edges (output).
*/
template <typename IdType>
__global__ void _COOEncodeEdgesKernel(
const IdType* const row, const IdType* const col,
const int64_t nnz, const int col_bits, IdType * const key) {
const IdType* const row, const IdType* const col, const int64_t nnz,
const int col_bits, IdType* const key) {
int64_t tx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
if (tx < nnz) {
......@@ -41,20 +41,19 @@ __global__ void _COOEncodeEdgesKernel(
}
/**
* @brief Decode row and column IDs from the encoded edges.
*
* @tparam IdType The type the edges are encoded as.
* @param key The encoded edges.
* @param nnz The number of edges.
* @param col_bits The number of bits used to store the column/dst ID.
* @param row The row (src) IDs per edge (output).
* @param col The col (dst) IDs per edge (output).
*/
* @brief Decode row and column IDs from the encoded edges.
*
* @tparam IdType The type the edges are encoded as.
* @param key The encoded edges.
* @param nnz The number of edges.
* @param col_bits The number of bits used to store the column/dst ID.
* @param row The row (src) IDs per edge (output).
* @param col The col (dst) IDs per edge (output).
*/
template <typename IdType>
__global__ void _COODecodeEdgesKernel(
const IdType* const key, const int64_t nnz, const int col_bits,
IdType * const row, IdType * const col) {
IdType* const row, IdType* const col) {
int64_t tx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
if (tx < nnz) {
......@@ -64,9 +63,7 @@ __global__ void _COODecodeEdgesKernel(
}
}
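// Minimal host-side sketch (not part of this commit) of the packing scheme the
// two kernels above operate on, assuming the natural layout described in their
// docs: the column id occupies the low col_bits bits and the row id the
// remaining high bits of a single sort key.
template <typename IdType>
inline IdType _EncodeEdgeSketch(IdType row, IdType col, int col_bits) {
  return (row << col_bits) | col;  // row in high bits, col in low bits
}
template <typename IdType>
inline void _DecodeEdgeSketch(
    IdType key, int col_bits, IdType* row, IdType* col) {
  *col = key & ((static_cast<IdType>(1) << col_bits) - 1);  // low bits
  *row = key >> col_bits;                                    // high bits
}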
template<typename T>
template <typename T>
int _NumberOfBits(const T& range) {
if (range <= 1) {
// ranges of 0 or 1 require no bits to store
......@@ -74,12 +71,12 @@ int _NumberOfBits(const T& range) {
}
int bits = 1;
while (bits < static_cast<int>(sizeof(T)*8) && (1 << bits) < range) {
while (bits < static_cast<int>(sizeof(T) * 8) && (1 << bits) < range) {
++bits;
}
CHECK_EQ((range-1) >> bits, 0);
CHECK_NE((range-1) >> (bits-1), 0);
CHECK_EQ((range - 1) >> bits, 0);
CHECK_NE((range - 1) >> (bits - 1), 0);
return bits;
}
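// Worked example (illustrative, not part of this commit): a hypothetical
// sanity check of _NumberOfBits. Ids in [0, range) need ceil(log2(range))
// bits, so e.g. 5 possible column ids (0..4) need 3 bits and 9 ids need 4.
inline void _NumberOfBitsExampleCheck() {
  CHECK_EQ(_NumberOfBits<int64_t>(2), 1);  // ids 0..1 fit in one bit
  CHECK_EQ(_NumberOfBits<int64_t>(5), 3);  // ids 0..4 fit in three bits
  CHECK_EQ(_NumberOfBits<int64_t>(8), 3);  // ids 0..7 fit in three bits
  CHECK_EQ(_NumberOfBits<int64_t>(9), 4);  // ids 0..8 need four bits
}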
......@@ -95,20 +92,20 @@ void COOSort_(COOMatrix* coo, bool sort_column) {
const int num_bits = row_bits + col_bits;
const int nt = 256;
const int nb = (nnz+nt-1)/nt;
CHECK(static_cast<int64_t>(nb)*nt >= nnz);
const int nb = (nnz + nt - 1) / nt;
CHECK(static_cast<int64_t>(nb) * nt >= nnz);
IdArray pos = aten::NewIdArray(nnz, coo->row->ctx, coo->row->dtype.bits);
CUDA_KERNEL_CALL(_COOEncodeEdgesKernel, nb, nt, 0, stream,
coo->row.Ptr<IdType>(), coo->col.Ptr<IdType>(),
nnz, col_bits, pos.Ptr<IdType>());
CUDA_KERNEL_CALL(
_COOEncodeEdgesKernel, nb, nt, 0, stream, coo->row.Ptr<IdType>(),
coo->col.Ptr<IdType>(), nnz, col_bits, pos.Ptr<IdType>());
auto sorted = Sort(pos, num_bits);
CUDA_KERNEL_CALL(_COODecodeEdgesKernel, nb, nt, 0, stream,
sorted.first.Ptr<IdType>(), nnz, col_bits,
coo->row.Ptr<IdType>(), coo->col.Ptr<IdType>());
CUDA_KERNEL_CALL(
_COODecodeEdgesKernel, nb, nt, 0, stream, sorted.first.Ptr<IdType>(),
nnz, col_bits, coo->row.Ptr<IdType>(), coo->col.Ptr<IdType>());
if (aten::COOHasData(*coo))
coo->data = IndexSelect(coo->data, sorted.second);
......@@ -138,8 +135,8 @@ template void COOSort_<kDGLCUDA, int64_t>(COOMatrix* coo, bool sort_column);
template <typename IdType>
__global__ void _COOIsSortedKernel(
const IdType* row, const IdType* col,
int64_t nnz, int8_t* row_sorted, int8_t* col_sorted) {
const IdType* row, const IdType* col, int64_t nnz, int8_t* row_sorted,
int8_t* col_sorted) {
int tx = blockIdx.x * blockDim.x + threadIdx.x;
const int stride_x = gridDim.x * blockDim.x;
while (tx < nnz) {
......@@ -148,8 +145,8 @@ __global__ void _COOIsSortedKernel(
col_sorted[0] = 1;
} else {
row_sorted[tx] = static_cast<int8_t>(row[tx - 1] <= row[tx]);
col_sorted[tx] = static_cast<int8_t>(
row[tx - 1] < row[tx] || col[tx - 1] <= col[tx]);
col_sorted[tx] =
static_cast<int8_t>(row[tx - 1] < row[tx] || col[tx - 1] <= col[tx]);
}
tx += stride_x;
}
......@@ -161,18 +158,19 @@ std::pair<bool, bool> COOIsSorted(COOMatrix coo) {
const auto& ctx = coo.row->ctx;
cudaStream_t stream = runtime::getCurrentCUDAStream();
auto device = runtime::DeviceAPI::Get(ctx);
// We allocate a workspace of 2*nnz bytes. It wastes a little bit of memory but should
// be fine.
// We allocate a workspace of 2*nnz bytes. It wastes a little bit of memory
// but should be fine.
int8_t* row_flags = static_cast<int8_t*>(device->AllocWorkspace(ctx, nnz));
int8_t* col_flags = static_cast<int8_t*>(device->AllocWorkspace(ctx, nnz));
const int nt = cuda::FindNumThreads(nnz);
const int nb = (nnz + nt - 1) / nt;
CUDA_KERNEL_CALL(_COOIsSortedKernel, nb, nt, 0, stream,
coo.row.Ptr<IdType>(), coo.col.Ptr<IdType>(),
nnz, row_flags, col_flags);
CUDA_KERNEL_CALL(
_COOIsSortedKernel, nb, nt, 0, stream, coo.row.Ptr<IdType>(),
coo.col.Ptr<IdType>(), nnz, row_flags, col_flags);
const bool row_sorted = cuda::AllTrue(row_flags, nnz, ctx);
const bool col_sorted = row_sorted? cuda::AllTrue(col_flags, nnz, ctx) : false;
const bool col_sorted =
row_sorted ? cuda::AllTrue(col_flags, nnz, ctx) : false;
device->FreeWorkspace(ctx, row_flags);
device->FreeWorkspace(ctx, col_flags);
......
......@@ -4,6 +4,7 @@
* \brief CSR2COO
*/
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
......@@ -32,20 +33,16 @@ COOMatrix CSRToCOO<kDGLCUDA, int32_t>(CSRMatrix csr) {
NDArray indptr = csr.indptr, indices = csr.indices, data = csr.data;
const int32_t* indptr_ptr = static_cast<int32_t*>(indptr->data);
NDArray row = aten::NewIdArray(indices->shape[0], indptr->ctx, indptr->dtype.bits);
NDArray row =
aten::NewIdArray(indices->shape[0], indptr->ctx, indptr->dtype.bits);
int32_t* row_ptr = static_cast<int32_t*>(row->data);
CUSPARSE_CALL(cusparseXcsr2coo(
thr_entry->cusparse_handle,
indptr_ptr,
indices->shape[0],
csr.num_rows,
row_ptr,
CUSPARSE_INDEX_BASE_ZERO));
return COOMatrix(csr.num_rows, csr.num_cols,
row, indices, data,
true, csr.sorted);
thr_entry->cusparse_handle, indptr_ptr, indices->shape[0], csr.num_rows,
row_ptr, CUSPARSE_INDEX_BASE_ZERO));
return COOMatrix(
csr.num_rows, csr.num_cols, row, indices, data, true, csr.sorted);
}
/*!
......@@ -65,8 +62,8 @@ COOMatrix CSRToCOO<kDGLCUDA, int32_t>(CSRMatrix csr) {
*/
template <typename DType, typename IdType>
__global__ void _RepeatKernel(
const DType* val, const IdType* pos,
DType* out, int64_t n_row, int64_t length) {
const DType* val, const IdType* pos, DType* out, int64_t n_row,
int64_t length) {
IdType tx = static_cast<IdType>(blockIdx.x) * blockDim.x + threadIdx.x;
const int stride_x = gridDim.x * blockDim.x;
while (tx < length) {
......@@ -88,15 +85,13 @@ COOMatrix CSRToCOO<kDGLCUDA, int64_t>(CSRMatrix csr) {
const int nt = 256;
const int nb = (nnz + nt - 1) / nt;
CUDA_KERNEL_CALL(_RepeatKernel,
nb, nt, 0, stream,
rowids.Ptr<int64_t>(),
csr.indptr.Ptr<int64_t>(), ret_row.Ptr<int64_t>(),
csr.num_rows, nnz);
return COOMatrix(csr.num_rows, csr.num_cols,
ret_row, csr.indices, csr.data,
true, csr.sorted);
CUDA_KERNEL_CALL(
_RepeatKernel, nb, nt, 0, stream, rowids.Ptr<int64_t>(),
csr.indptr.Ptr<int64_t>(), ret_row.Ptr<int64_t>(), csr.num_rows, nnz);
return COOMatrix(
csr.num_rows, csr.num_cols, ret_row, csr.indices, csr.data, true,
csr.sorted);
}
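// Worked example (illustrative, not part of this commit): for
// csr.indptr == [0, 2, 3, 3, 4] the repeat above emits ret_row == [0, 0, 1, 3],
// i.e. row id i is written once per nonzero in [indptr[i], indptr[i + 1]),
// which is exactly the inverse of the upper-bound construction in coo2csr.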
template COOMatrix CSRToCOO<kDGLCUDA, int32_t>(CSRMatrix csr);
......@@ -111,8 +106,7 @@ COOMatrix CSRToCOODataAsOrder(CSRMatrix csr) {
template <>
COOMatrix CSRToCOODataAsOrder<kDGLCUDA, int32_t>(CSRMatrix csr) {
COOMatrix coo = CSRToCOO<kDGLCUDA, int32_t>(csr);
if (aten::IsNullArray(coo.data))
return coo;
if (aten::IsNullArray(coo.data)) return coo;
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
auto device = runtime::DeviceAPI::Get(coo.row->ctx);
......@@ -130,21 +124,12 @@ COOMatrix CSRToCOODataAsOrder<kDGLCUDA, int32_t>(CSRMatrix csr) {
size_t workspace_size = 0;
CUSPARSE_CALL(cusparseXcoosort_bufferSizeExt(
thr_entry->cusparse_handle,
coo.num_rows, coo.num_cols,
row->shape[0],
data_ptr,
row_ptr,
&workspace_size));
thr_entry->cusparse_handle, coo.num_rows, coo.num_cols, row->shape[0],
data_ptr, row_ptr, &workspace_size));
void* workspace = device->AllocWorkspace(row->ctx, workspace_size);
CUSPARSE_CALL(cusparseXcoosortByRow(
thr_entry->cusparse_handle,
coo.num_rows, coo.num_cols,
row->shape[0],
data_ptr,
row_ptr,
col_ptr,
workspace));
thr_entry->cusparse_handle, coo.num_rows, coo.num_cols, row->shape[0],
data_ptr, row_ptr, col_ptr, workspace));
device->FreeWorkspace(row->ctx, workspace);
// The row and column field have already been reordered according
......@@ -158,8 +143,7 @@ COOMatrix CSRToCOODataAsOrder<kDGLCUDA, int32_t>(CSRMatrix csr) {
template <>
COOMatrix CSRToCOODataAsOrder<kDGLCUDA, int64_t>(CSRMatrix csr) {
COOMatrix coo = CSRToCOO<kDGLCUDA, int64_t>(csr);
if (aten::IsNullArray(coo.data))
return coo;
if (aten::IsNullArray(coo.data)) return coo;
const auto& sorted = Sort(coo.data);
coo.row = IndexSelect(coo.row, sorted.second);
......
......@@ -4,9 +4,10 @@
* \brief Sort CSR index
*/
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
#include "./dgl_cub.cuh"
#include "./utils.h"
namespace dgl {
......@@ -20,8 +21,8 @@ namespace impl {
*/
template <typename IdType>
__global__ void _SegmentIsSorted(
const IdType* indptr, const IdType* indices,
int64_t num_rows, int8_t* flags) {
const IdType* indptr, const IdType* indices, int64_t num_rows,
int8_t* flags) {
int tx = blockIdx.x * blockDim.x + threadIdx.x;
const int stride_x = gridDim.x * blockDim.x;
while (tx < num_rows) {
......@@ -39,15 +40,15 @@ bool CSRIsSorted(CSRMatrix csr) {
const auto& ctx = csr.indptr->ctx;
cudaStream_t stream = runtime::getCurrentCUDAStream();
auto device = runtime::DeviceAPI::Get(ctx);
// We allocate a workspace of num_rows bytes. It wastes a little bit of memory but should
// be fine.
int8_t* flags = static_cast<int8_t*>(device->AllocWorkspace(ctx, csr.num_rows));
// We allocate a workspace of num_rows bytes. It wastes a little bit of
// memory but should be fine.
int8_t* flags =
static_cast<int8_t*>(device->AllocWorkspace(ctx, csr.num_rows));
const int nt = cuda::FindNumThreads(csr.num_rows);
const int nb = (csr.num_rows + nt - 1) / nt;
CUDA_KERNEL_CALL(_SegmentIsSorted,
nb, nt, 0, stream,
csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(),
csr.num_rows, flags);
CUDA_KERNEL_CALL(
_SegmentIsSorted, nb, nt, 0, stream, csr.indptr.Ptr<IdType>(),
csr.indices.Ptr<IdType>(), csr.num_rows, flags);
bool ret = cuda::AllTrue(flags, csr.num_rows, ctx);
device->FreeWorkspace(ctx, flags);
return ret;
......@@ -82,10 +83,8 @@ void CSRSort_<kDGLCUDA, int32_t>(CSRMatrix* csr) {
size_t workspace_size = 0;
CUSPARSE_CALL(cusparseXcsrsort_bufferSizeExt(
thr_entry->cusparse_handle,
csr->num_rows, csr->num_cols, nnz,
indptr.Ptr<int32_t>(), indices.Ptr<int32_t>(),
&workspace_size));
thr_entry->cusparse_handle, csr->num_rows, csr->num_cols, nnz,
indptr.Ptr<int32_t>(), indices.Ptr<int32_t>(), &workspace_size));
void* workspace = device->AllocWorkspace(ctx, workspace_size);
cusparseMatDescr_t descr;
......@@ -93,11 +92,8 @@ void CSRSort_<kDGLCUDA, int32_t>(CSRMatrix* csr) {
CUSPARSE_CALL(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL));
CUSPARSE_CALL(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO));
CUSPARSE_CALL(cusparseXcsrsort(
thr_entry->cusparse_handle,
csr->num_rows, csr->num_cols, nnz,
descr,
indptr.Ptr<int32_t>(), indices.Ptr<int32_t>(),
data.Ptr<int32_t>(),
thr_entry->cusparse_handle, csr->num_rows, csr->num_cols, nnz, descr,
indptr.Ptr<int32_t>(), indices.Ptr<int32_t>(), data.Ptr<int32_t>(),
workspace));
csr->sorted = true;
......@@ -115,8 +111,7 @@ void CSRSort_<kDGLCUDA, int64_t>(CSRMatrix* csr) {
const auto& ctx = csr->indptr->ctx;
const int64_t nnz = csr->indices->shape[0];
const auto nbits = csr->indptr->dtype.bits;
if (!aten::CSRHasData(*csr))
csr->data = aten::Range(0, nnz, nbits, ctx);
if (!aten::CSRHasData(*csr)) csr->data = aten::Range(0, nnz, nbits, ctx);
IdArray new_indices = csr->indices.Clone();
IdArray new_data = csr->data.Clone();
......@@ -129,15 +124,15 @@ void CSRSort_<kDGLCUDA, int64_t>(CSRMatrix* csr) {
// Allocate workspace
size_t workspace_size = 0;
CUDA_CALL(cub::DeviceSegmentedRadixSort::SortPairs(nullptr, workspace_size,
key_in, key_out, value_in, value_out,
nnz, csr->num_rows, offsets, offsets + 1, 0, sizeof(int64_t)*8, stream));
CUDA_CALL(cub::DeviceSegmentedRadixSort::SortPairs(
nullptr, workspace_size, key_in, key_out, value_in, value_out, nnz,
csr->num_rows, offsets, offsets + 1, 0, sizeof(int64_t) * 8, stream));
void* workspace = device->AllocWorkspace(ctx, workspace_size);
// Compute
CUDA_CALL(cub::DeviceSegmentedRadixSort::SortPairs(workspace, workspace_size,
key_in, key_out, value_in, value_out,
nnz, csr->num_rows, offsets, offsets + 1, 0, sizeof(int64_t)*8, stream));
CUDA_CALL(cub::DeviceSegmentedRadixSort::SortPairs(
workspace, workspace_size, key_in, key_out, value_in, value_out, nnz,
csr->num_rows, offsets, offsets + 1, 0, sizeof(int64_t) * 8, stream));
csr->sorted = true;
csr->indices = new_indices;
......
......@@ -4,6 +4,7 @@
* \brief CSR transpose (convert to CSC)
*/
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
namespace dgl {
......@@ -33,14 +34,13 @@ CSRMatrix CSRTranspose<kDGLCUDA, int32_t>(CSRMatrix csr) {
const int64_t nnz = indices->shape[0];
const auto& ctx = indptr->ctx;
const auto bits = indptr->dtype.bits;
if (aten::IsNullArray(data))
data = aten::Range(0, nnz, bits, ctx);
if (aten::IsNullArray(data)) data = aten::Range(0, nnz, bits, ctx);
const int32_t* indptr_ptr = static_cast<int32_t*>(indptr->data);
const int32_t* indices_ptr = static_cast<int32_t*>(indices->data);
const void* data_ptr = data->data;
// (BarclayII) csr2csc doesn't seem to clear the content of cscColPtr if nnz == 0.
// We need to do it ourselves.
// (BarclayII) csr2csc doesn't seem to clear the content of cscColPtr if nnz
// == 0. We need to do it ourselves.
NDArray t_indptr = aten::Full(0, csr.num_cols + 1, bits, ctx);
NDArray t_indices = aten::NewIdArray(nnz, ctx, bits);
NDArray t_data = aten::NewIdArray(nnz, ctx, bits);
......@@ -53,40 +53,29 @@ CSRMatrix CSRTranspose<kDGLCUDA, int32_t>(CSRMatrix csr) {
// workspace
size_t workspace_size;
CUSPARSE_CALL(cusparseCsr2cscEx2_bufferSize(
thr_entry->cusparse_handle,
csr.num_rows, csr.num_cols, nnz,
data_ptr, indptr_ptr, indices_ptr,
t_data_ptr, t_indptr_ptr, t_indices_ptr,
CUDA_R_32F,
CUSPARSE_ACTION_NUMERIC,
CUSPARSE_INDEX_BASE_ZERO,
thr_entry->cusparse_handle, csr.num_rows, csr.num_cols, nnz, data_ptr,
indptr_ptr, indices_ptr, t_data_ptr, t_indptr_ptr, t_indices_ptr,
CUDA_R_32F, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO,
CUSPARSE_CSR2CSC_ALG1, // see cusparse doc for reference
&workspace_size));
void* workspace = device->AllocWorkspace(ctx, workspace_size);
CUSPARSE_CALL(cusparseCsr2cscEx2(
thr_entry->cusparse_handle,
csr.num_rows, csr.num_cols, nnz,
data_ptr, indptr_ptr, indices_ptr,
t_data_ptr, t_indptr_ptr, t_indices_ptr,
CUDA_R_32F,
CUSPARSE_ACTION_NUMERIC,
CUSPARSE_INDEX_BASE_ZERO,
thr_entry->cusparse_handle, csr.num_rows, csr.num_cols, nnz, data_ptr,
indptr_ptr, indices_ptr, t_data_ptr, t_indptr_ptr, t_indices_ptr,
CUDA_R_32F, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO,
CUSPARSE_CSR2CSC_ALG1, // see cusparse doc for reference
workspace));
device->FreeWorkspace(ctx, workspace);
#else
CUSPARSE_CALL(cusparseScsr2csc(
thr_entry->cusparse_handle,
csr.num_rows, csr.num_cols, nnz,
thr_entry->cusparse_handle, csr.num_rows, csr.num_cols, nnz,
static_cast<const float*>(data_ptr), indptr_ptr, indices_ptr,
static_cast<float*>(t_data_ptr), t_indices_ptr, t_indptr_ptr,
CUSPARSE_ACTION_NUMERIC,
CUSPARSE_INDEX_BASE_ZERO));
CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO));
#endif
return CSRMatrix(csr.num_cols, csr.num_rows,
t_indptr, t_indices, t_data,
false);
return CSRMatrix(
csr.num_cols, csr.num_rows, t_indptr, t_indices, t_data, false);
}
template <>
......
......@@ -7,8 +7,8 @@
#include <dgl/runtime/device_api.h>
#include "../../runtime/cuda/cuda_common.h"
#include "../filter.h"
#include "../../runtime/cuda/cuda_hashtable.cuh"
#include "../filter.h"
#include "./dgl_cub.cuh"
using namespace dgl::runtime::cuda;
......@@ -20,35 +20,29 @@ namespace {
cudaStream_t cudaStream = runtime::getCurrentCUDAStream();
template<typename IdType, bool include>
template <typename IdType, bool include>
__global__ void _IsInKernel(
DeviceOrderedHashTable<IdType> table,
const IdType * const array,
const int64_t size,
IdType * const mark) {
const int64_t idx = threadIdx.x + blockDim.x*blockIdx.x;
DeviceOrderedHashTable<IdType> table, const IdType* const array,
const int64_t size, IdType* const mark) {
const int64_t idx = threadIdx.x + blockDim.x * blockIdx.x;
if (idx < size) {
mark[idx] = table.Contains(array[idx]) ^ (!include);
}
}
template<typename IdType>
template <typename IdType>
__global__ void _InsertKernel(
const IdType * const prefix,
const int64_t size,
IdType * const result) {
const int64_t idx = threadIdx.x + blockDim.x*blockIdx.x;
const IdType* const prefix, const int64_t size, IdType* const result) {
const int64_t idx = threadIdx.x + blockDim.x * blockIdx.x;
if (idx < size) {
if (prefix[idx] != prefix[idx+1]) {
if (prefix[idx] != prefix[idx + 1]) {
result[prefix[idx]] = idx;
}
}
}
template<typename IdType, bool include>
IdArray _PerformFilter(
const OrderedHashTable<IdType>& table,
IdArray test) {
template <typename IdType, bool include>
IdArray _PerformFilter(const OrderedHashTable<IdType>& table, IdArray test) {
const auto& ctx = test->ctx;
auto device = runtime::DeviceAPI::Get(ctx);
const int64_t size = test->shape[0];
......@@ -60,22 +54,20 @@ IdArray _PerformFilter(
// we need two arrays: 1) to act as a prefix sum
// for the number of entries that will be inserted, and
// 2) to collect the included items.
IdType * prefix = static_cast<IdType*>(
device->AllocWorkspace(ctx, sizeof(IdType)*(size+1)));
IdType* prefix = static_cast<IdType*>(
device->AllocWorkspace(ctx, sizeof(IdType) * (size + 1)));
// will resize down later
IdArray result = aten::NewIdArray(size, ctx, sizeof(IdType)*8);
IdArray result = aten::NewIdArray(size, ctx, sizeof(IdType) * 8);
// mark each index based on its existence in the hashtable
{
const dim3 block(256);
const dim3 grid((size+block.x-1)/block.x);
const dim3 grid((size + block.x - 1) / block.x);
CUDA_KERNEL_CALL((_IsInKernel<IdType, include>),
grid, block, 0, cudaStream,
table.DeviceHandle(),
static_cast<const IdType*>(test->data),
size,
CUDA_KERNEL_CALL(
(_IsInKernel<IdType, include>), grid, block, 0, cudaStream,
table.DeviceHandle(), static_cast<const IdType*>(test->data), size,
prefix);
}
......@@ -83,40 +75,28 @@ IdArray _PerformFilter(
{
size_t workspace_bytes;
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
nullptr,
workspace_bytes,
static_cast<IdType*>(nullptr),
static_cast<IdType*>(nullptr),
size+1, cudaStream));
void * workspace = device->AllocWorkspace(ctx, workspace_bytes);
nullptr, workspace_bytes, static_cast<IdType*>(nullptr),
static_cast<IdType*>(nullptr), size + 1, cudaStream));
void* workspace = device->AllocWorkspace(ctx, workspace_bytes);
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
workspace,
workspace_bytes,
prefix,
prefix,
size+1, cudaStream));
workspace, workspace_bytes, prefix, prefix, size + 1, cudaStream));
device->FreeWorkspace(ctx, workspace);
}
// copy the number of unique items using the current internal stream;
IdType num_unique;
device->CopyDataFromTo(prefix+size, 0,
&num_unique, 0,
sizeof(num_unique),
ctx,
DGLContext{kDGLCPU, 0},
test->dtype);
device->CopyDataFromTo(
prefix + size, 0, &num_unique, 0, sizeof(num_unique), ctx,
DGLContext{kDGLCPU, 0}, test->dtype);
// insert items into set
{
const dim3 block(256);
const dim3 grid((size+block.x-1)/block.x);
const dim3 grid((size + block.x - 1) / block.x);
CUDA_KERNEL_CALL(_InsertKernel,
grid, block, 0, cudaStream,
prefix,
size,
CUDA_KERNEL_CALL(
_InsertKernel, grid, block, 0, cudaStream, prefix, size,
static_cast<IdType*>(result->data));
}
device->FreeWorkspace(ctx, prefix);
......@@ -124,16 +104,13 @@ IdArray _PerformFilter(
return result.CreateView({num_unique}, result->dtype);
}
template<typename IdType>
template <typename IdType>
class CudaFilterSet : public Filter {
public:
explicit CudaFilterSet(IdArray array) :
table_(array->shape[0], array->ctx, cudaStream) {
explicit CudaFilterSet(IdArray array)
: table_(array->shape[0], array->ctx, cudaStream) {
table_.FillWithUnique(
static_cast<const IdType*>(array->data),
array->shape[0],
cudaStream);
static_cast<const IdType*>(array->data), array->shape[0], cudaStream);
}
IdArray find_included_indices(IdArray test) override {
......@@ -150,7 +127,7 @@ class CudaFilterSet : public Filter {
} // namespace
template<DGLDeviceType XPU, typename IdType>
template <DGLDeviceType XPU, typename IdType>
FilterRef CreateSetFilter(IdArray set) {
return FilterRef(std::make_shared<CudaFilterSet<IdType>>(set));
}
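// Illustrative usage sketch (not part of this commit): exercising the
// anonymous-namespace CudaFilterSet above directly. `set_ids` and `test_ids`
// are hypothetical int64 IdArrays already resident on the GPU; the result
// holds the positions of `test_ids` entries that are members of `set_ids`.
inline IdArray _SetFilterUsageSketch(IdArray set_ids, IdArray test_ids) {
  CudaFilterSet<int64_t> filter(set_ids);         // hash the set once
  return filter.find_included_indices(test_ids);  // positions of the hits
}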
......
/*!
* Copyright (c) 2021 by Contributors
* \file cuda_common.h
* \brief Wrapper to place cub in dgl namespace.
* \brief Wrapper to place cub in dgl namespace.
*/
#ifndef DGL_ARRAY_CUDA_DGL_CUB_CUH_
......
/**
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* \file array/gpu/disjoint_union.cu
* \brief Disjoint union GPU implementation.
*/
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* \file array/gpu/disjoint_union.cu
* \brief Disjoint union GPU implementation.
*/
#include <dgl/runtime/parallel_for.h>
#include <dgl/array.h>
#include <vector>
#include <dgl/runtime/parallel_for.h>
#include <tuple>
#include <vector>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
......@@ -31,8 +33,8 @@ namespace impl {
template <typename IdType>
__global__ void _DisjointUnionKernel(
IdType** arrs, IdType* prefix, IdType* offset, IdType* out,
int64_t n_arrs, int n_elms) {
IdType** arrs, IdType* prefix, IdType* offset, IdType* out, int64_t n_arrs,
int n_elms) {
IdType tx = static_cast<IdType>(blockIdx.x) * blockDim.x + threadIdx.x;
const int stride_x = gridDim.x * blockDim.x;
while (tx < n_elms) {
......@@ -48,7 +50,8 @@ __global__ void _DisjointUnionKernel(
}
template <DGLDeviceType XPU, typename IdType>
std::tuple<IdArray, IdArray, IdArray> _ComputePrefixSums(const std::vector<COOMatrix>& coos) {
std::tuple<IdArray, IdArray, IdArray> _ComputePrefixSums(
const std::vector<COOMatrix>& coos) {
IdType n = coos.size(), nbits = coos[0].row->dtype.bits;
IdArray n_rows = NewIdArray(n, CPU, nbits);
IdArray n_cols = NewIdArray(n, CPU, nbits);
......@@ -58,7 +61,7 @@ std::tuple<IdArray, IdArray, IdArray> _ComputePrefixSums(const std::vector<COOMa
IdType* n_cols_data = n_cols.Ptr<IdType>();
IdType* n_elms_data = n_elms.Ptr<IdType>();
dgl::runtime::parallel_for(0, coos.size(), [&](IdType b, IdType e){
dgl::runtime::parallel_for(0, coos.size(), [&](IdType b, IdType e) {
for (IdType i = b; i < e; ++i) {
n_rows_data[i] = coos[i].num_rows;
n_cols_data[i] = coos[i].num_cols;
......@@ -66,30 +69,30 @@ std::tuple<IdArray, IdArray, IdArray> _ComputePrefixSums(const std::vector<COOMa
}
});
return std::make_tuple(CumSum(n_rows.CopyTo(coos[0].row->ctx), true),
CumSum(n_cols.CopyTo(coos[0].row->ctx), true),
CumSum(n_elms.CopyTo(coos[0].row->ctx), true));
return std::make_tuple(
CumSum(n_rows.CopyTo(coos[0].row->ctx), true),
CumSum(n_cols.CopyTo(coos[0].row->ctx), true),
CumSum(n_elms.CopyTo(coos[0].row->ctx), true));
}
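// Worked example (illustrative, not part of this commit): for two input COOs
// with (num_rows, num_cols, nnz) of (3, 4, 5) and (2, 2, 3), the three arrays
// returned above are [0, 3, 5], [0, 4, 6] and [0, 5, 8]; entry i is the
// row/col/edge offset added to graph i's ids when the union is materialized.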
template <DGLDeviceType XPU, typename IdType>
void _Merge(IdType** arrs, IdType* prefix, IdType* offset, IdType* out,
int64_t n_arrs, int n_elms,
DGLContext ctx, DGLDataType dtype, cudaStream_t stream) {
void _Merge(
IdType** arrs, IdType* prefix, IdType* offset, IdType* out, int64_t n_arrs,
int n_elms, DGLContext ctx, DGLDataType dtype, cudaStream_t stream) {
auto device = runtime::DeviceAPI::Get(ctx);
int nt = 256;
int nb = (n_elms + nt - 1) / nt;
IdType** arrs_dev = static_cast<IdType**>(
device->AllocWorkspace(ctx, n_arrs*sizeof(IdType*)));
device->AllocWorkspace(ctx, n_arrs * sizeof(IdType*)));
device->CopyDataFromTo(
arrs, 0, arrs_dev, 0, sizeof(IdType*)*n_arrs,
DGLContext{kDGLCPU, 0}, ctx, dtype);
arrs, 0, arrs_dev, 0, sizeof(IdType*) * n_arrs, DGLContext{kDGLCPU, 0},
ctx, dtype);
CUDA_KERNEL_CALL(_DisjointUnionKernel,
nb, nt, 0, stream,
arrs_dev, prefix, offset,
out, n_arrs, n_elms);
CUDA_KERNEL_CALL(
_DisjointUnionKernel, nb, nt, 0, stream, arrs_dev, prefix, offset, out,
n_arrs, n_elms);
device->FreeWorkspace(ctx, arrs_dev);
}
......@@ -132,52 +135,50 @@ COOMatrix DisjointUnionCoo(const std::vector<COOMatrix>& coos) {
IdType n_elements = 0;
device->CopyDataFromTo(
&prefix_elm[coos.size()], 0, &n_elements, 0,
sizeof(IdType), coos[0].row->ctx, DGLContext{kDGLCPU, 0},
coos[0].row->dtype);
&prefix_elm[coos.size()], 0, &n_elements, 0, sizeof(IdType),
coos[0].row->ctx, DGLContext{kDGLCPU, 0}, coos[0].row->dtype);
device->CopyDataFromTo(
&prefix_src[coos.size()], 0, &src_offset, 0,
sizeof(IdType), coos[0].row->ctx, DGLContext{kDGLCPU, 0},
coos[0].row->dtype);
&prefix_src[coos.size()], 0, &src_offset, 0, sizeof(IdType),
coos[0].row->ctx, DGLContext{kDGLCPU, 0}, coos[0].row->dtype);
device->CopyDataFromTo(
&prefix_dst[coos.size()], 0, &dst_offset, 0,
sizeof(IdType), coos[0].row->ctx, DGLContext{kDGLCPU, 0},
coos[0].row->dtype);
&prefix_dst[coos.size()], 0, &dst_offset, 0, sizeof(IdType),
coos[0].row->ctx, DGLContext{kDGLCPU, 0}, coos[0].row->dtype);
// Union src array
IdArray result_src = NewIdArray(
n_elements, coos[0].row->ctx, coos[0].row->dtype.bits);
_Merge<XPU, IdType>(rows.get(), prefix_src, prefix_elm, result_src.Ptr<IdType>(),
coos.size(), n_elements, ctx, dtype, stream);
IdArray result_src =
NewIdArray(n_elements, coos[0].row->ctx, coos[0].row->dtype.bits);
_Merge<XPU, IdType>(
rows.get(), prefix_src, prefix_elm, result_src.Ptr<IdType>(), coos.size(),
n_elements, ctx, dtype, stream);
// Union dst array
IdArray result_dst = NewIdArray(
n_elements, coos[0].col->ctx, coos[0].col->dtype.bits);
_Merge<XPU, IdType>(cols.get(), prefix_dst, prefix_elm, result_dst.Ptr<IdType>(),
coos.size(), n_elements, ctx, dtype, stream);
IdArray result_dst =
NewIdArray(n_elements, coos[0].col->ctx, coos[0].col->dtype.bits);
_Merge<XPU, IdType>(
cols.get(), prefix_dst, prefix_elm, result_dst.Ptr<IdType>(), coos.size(),
n_elements, ctx, dtype, stream);
// Union data array if exists and fetch number of elements
IdArray result_dat = NullArray();
if (has_data) {
result_dat = NewIdArray(
n_elements, coos[0].row->ctx, coos[0].row->dtype.bits);
_Merge<XPU, IdType>(data.get(), prefix_elm, prefix_elm, result_dat.Ptr<IdType>(),
coos.size(), n_elements, ctx, dtype, stream);
result_dat =
NewIdArray(n_elements, coos[0].row->ctx, coos[0].row->dtype.bits);
_Merge<XPU, IdType>(
data.get(), prefix_elm, prefix_elm, result_dat.Ptr<IdType>(),
coos.size(), n_elements, ctx, dtype, stream);
}
return COOMatrix(
src_offset, dst_offset,
result_src,
result_dst,
result_dat,
row_sorted,
col_sorted);
src_offset, dst_offset, result_src, result_dst, result_dat, row_sorted,
col_sorted);
}
template COOMatrix DisjointUnionCoo<kDGLCUDA, int32_t>(const std::vector<COOMatrix>& coos);
template COOMatrix DisjointUnionCoo<kDGLCUDA, int64_t>(const std::vector<COOMatrix>& coos);
template COOMatrix DisjointUnionCoo<kDGLCUDA, int32_t>(
const std::vector<COOMatrix>& coos);
template COOMatrix DisjointUnionCoo<kDGLCUDA, int64_t>(
const std::vector<COOMatrix>& coos);
} // namespace impl
} // namespace aten
......