Unverified commit f5183820 authored by Tomasz Patejko, committed by GitHub

[Performance, CPU] Rewriting OpenMP pragmas into parallel_for (#3171)

* [CPU, Parallel] Rewriting omp pragmas with parallel_for

* [CPU, Parallel] Decrease number of calls to task function

* [CPU, Parallel] Modify calls to the new interface of parallel_for
parent 21a40279
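For reference, a minimal sketch of the rewrite pattern applied throughout this commit. The parallel_for helper and its range-based callback are the ones defined in <dgl/runtime/parallel_for.h> (see the first hunk below); the loop bound n and the in/out buffers are hypothetical and only illustrate the mapping.

// Before: OpenMP parallelizes the loop directly, one index per iteration.
#pragma omp parallel for
for (int64_t i = 0; i < n; ++i)
  out[i] = 2 * in[i];

// After: parallel_for hands each task a contiguous [b, e) sub-range,
// so the callback runs once per chunk instead of once per element.
dgl::runtime::parallel_for(0, n, [&](size_t b, size_t e) {
  for (auto i = b; i < e; ++i)
    out[i] = 2 * in[i];
});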
......@@ -74,14 +74,12 @@ void parallel_for(
auto chunk_size = divup((end - begin), num_threads);
auto begin_tid = begin + tid * chunk_size;
if (begin_tid < end) {
for (auto i = begin_tid; i < std::min(end, chunk_size + begin_tid); i++) {
f(i);
}
auto end_tid = std::min(end, chunk_size + begin_tid);
f(begin_tid, end_tid);
}
}
#else
for (auto i = begin; i < end; i++)
f(i);
f(begin, end);
#endif
}
......@@ -98,7 +96,7 @@ void parallel_for(
const size_t begin,
const size_t end,
F&& f) {
parallel_for(begin, end, default_grain_size(), f);
parallel_for(begin, end, default_grain_size(), std::forward<F>(f));
}
} // namespace runtime
} // namespace dgl
......
......@@ -5,11 +5,13 @@
*/
#include <dgl/array.h>
#include <dgl/runtime/ndarray.h>
#include <dgl/runtime/parallel_for.h>
#include <numeric>
#include "../arith.h"
namespace dgl {
using runtime::NDArray;
using runtime::parallel_for;
namespace aten {
namespace impl {
......@@ -51,8 +53,7 @@ IdArray BinaryElewise(IdArray lhs, IdArray rhs) {
IdType* ret_data = static_cast<IdType*>(ret->data);
// TODO(BarclayII): this usually incurs lots of overhead in thread spawning, scheduling,
// etc., especially since the workload is very light. Need to replace with parallel_for.
// #pragma omp parallel for
for (int64_t i = 0; i < lhs->shape[0]; ++i) {
for (size_t i = 0; i < lhs->shape[0]; i++) {
ret_data[i] = Op::Call(lhs_data[i], rhs_data[i]);
}
return ret;
......@@ -88,8 +89,7 @@ IdArray BinaryElewise(IdArray lhs, IdType rhs) {
IdType* ret_data = static_cast<IdType*>(ret->data);
// TODO(BarclayII): this usually incurs lots of overhead in thread spawning, scheduling,
// etc., especially since the workload is very light. Need to replace with parallel_for.
// #pragma omp parallel for
for (int64_t i = 0; i < lhs->shape[0]; ++i) {
for (size_t i = 0; i < lhs->shape[0]; i++) {
ret_data[i] = Op::Call(lhs_data[i], rhs);
}
return ret;
......@@ -125,8 +125,7 @@ IdArray BinaryElewise(IdType lhs, IdArray rhs) {
IdType* ret_data = static_cast<IdType*>(ret->data);
// TODO(BarclayII): this usually incurs lots of overhead in thread spawning, scheduling,
// etc., especially since the workload is very light. Need to replace with parallel_for.
// #pragma omp parallel for
for (int64_t i = 0; i < rhs->shape[0]; ++i) {
for (size_t i = 0; i < rhs->shape[0]; i++) {
ret_data[i] = Op::Call(lhs, rhs_data[i]);
}
return ret;
......@@ -162,8 +161,7 @@ IdArray UnaryElewise(IdArray lhs) {
IdType* ret_data = static_cast<IdType*>(ret->data);
// TODO(BarclayII): this usually incurs lots of overhead in thread spawning, scheduling,
// etc., especially since the workload is very light. Need to replace with parallel_for.
// #pragma omp parallel for
for (int64_t i = 0; i < lhs->shape[0]; ++i) {
for (size_t i = 0; i < lhs->shape[0]; i++) {
ret_data[i] = Op::Call(lhs_data[i]);
}
return ret;
......
......@@ -4,11 +4,13 @@
* \brief Array index select CPU implementation
*/
#include <dgl/array.h>
#include <dgl/runtime/parallel_for.h>
#include <tuple>
#include <utility>
namespace dgl {
using runtime::NDArray;
using runtime::parallel_for;
namespace aten {
namespace impl {
......@@ -29,11 +31,12 @@ std::pair<NDArray, IdArray> ConcatSlices(NDArray array, IdArray lengths) {
NDArray concat = NDArray::Empty({total_length}, array->dtype, array->ctx);
DType *concat_data = static_cast<DType *>(concat->data);
#pragma omp parallel for
for (int64_t i = 0; i < rows; ++i) {
for (int64_t j = 0; j < length_data[i]; ++j)
concat_data[offsets_data[i] + j] = array_data[i * stride + j];
}
parallel_for(0, rows, [=](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
for (int64_t j = 0; j < length_data[i]; ++j)
concat_data[offsets_data[i] + j] = array_data[i * stride + j];
}
});
return std::make_pair(concat, offsets);
}
......@@ -56,16 +59,17 @@ std::tuple<NDArray, IdArray, IdArray> Pack(NDArray array, DType pad_value) {
IdArray length = NewIdArray(rows, array->ctx);
int64_t *length_data = static_cast<int64_t *>(length->data);
#pragma omp parallel for
for (int64_t i = 0; i < rows; ++i) {
int64_t j;
for (j = 0; j < cols; ++j) {
const DType val = array_data[i * cols + j];
if (val == pad_value)
break;
parallel_for(0, rows, [=](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
int64_t j;
for (j = 0; j < cols; ++j) {
const DType val = array_data[i * cols + j];
if (val == pad_value)
break;
}
length_data[i] = j;
}
length_data[i] = j;
}
});
auto ret = ConcatSlices<XPU, DType, int64_t>(array, length);
return std::make_tuple(ret.first, length, ret.second);
......
......@@ -4,6 +4,7 @@
* \brief Array scatter CPU implementation
*/
#include <dgl/array.h>
#include <dgl/runtime/parallel_for.h>
namespace dgl {
using runtime::NDArray;
......@@ -39,9 +40,11 @@ void Scatter_(IdArray index, NDArray value, NDArray out) {
const IdType* idx = index.Ptr<IdType>();
const DType* val = value.Ptr<DType>();
DType* outd = out.Ptr<DType>();
#pragma omp parallel for
for (int64_t i = 0; i < len; ++i)
outd[idx[i]] = val[i];
runtime::parallel_for(0, len, [&](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
outd[idx[i]] = val[i];
}
});
}
template void Scatter_<kDLCPU, int32_t, int32_t>(IdArray, NDArray, NDArray);
......
......@@ -4,6 +4,7 @@
* \brief Retrieve entries of a CSR matrix
*/
#include <dgl/array.h>
#include <dgl/runtime/parallel_for.h>
#include <vector>
#include <unordered_set>
#include <numeric>
......@@ -12,7 +13,7 @@
namespace dgl {
using runtime::NDArray;
using runtime::parallel_for;
namespace aten {
namespace impl {
......@@ -70,35 +71,37 @@ NDArray CSRGetData(
if (csr.sorted) {
// use binary search on each row
#pragma omp parallel for
for (int64_t p = 0; p < retlen; ++p) {
const IdType row_id = row_data[p * row_stride], col_id = col_data[p * col_stride];
CHECK(row_id >= 0 && row_id < csr.num_rows) << "Invalid row index: " << row_id;
CHECK(col_id >= 0 && col_id < csr.num_cols) << "Invalid col index: " << col_id;
const IdType *start_ptr = indices_data + indptr_data[row_id];
const IdType *end_ptr = indices_data + indptr_data[row_id + 1];
auto it = std::lower_bound(start_ptr, end_ptr, col_id);
if (it != end_ptr && *it == col_id) {
const IdType idx = it - indices_data;
IdType eid = data ? data[idx] : idx;
ret_data[p] = return_eids ? eid : weight_data[eid];
parallel_for(0, retlen, [&](size_t b, size_t e) {
for (auto p = b; p < e; ++p) {
const IdType row_id = row_data[p * row_stride], col_id = col_data[p * col_stride];
CHECK(row_id >= 0 && row_id < csr.num_rows) << "Invalid row index: " << row_id;
CHECK(col_id >= 0 && col_id < csr.num_cols) << "Invalid col index: " << col_id;
const IdType *start_ptr = indices_data + indptr_data[row_id];
const IdType *end_ptr = indices_data + indptr_data[row_id + 1];
auto it = std::lower_bound(start_ptr, end_ptr, col_id);
if (it != end_ptr && *it == col_id) {
const IdType idx = it - indices_data;
IdType eid = data ? data[idx] : idx;
ret_data[p] = return_eids ? eid : weight_data[eid];
}
}
}
});
} else {
// linear search on each row
#pragma omp parallel for
for (int64_t p = 0; p < retlen; ++p) {
const IdType row_id = row_data[p * row_stride], col_id = col_data[p * col_stride];
CHECK(row_id >= 0 && row_id < csr.num_rows) << "Invalid row index: " << row_id;
CHECK(col_id >= 0 && col_id < csr.num_cols) << "Invalid col index: " << col_id;
for (IdType idx = indptr_data[row_id]; idx < indptr_data[row_id + 1]; ++idx) {
if (indices_data[idx] == col_id) {
IdType eid = data ? data[idx] : idx;
ret_data[p] = return_eids ? eid : weight_data[eid];
break;
parallel_for(0, retlen, [&](size_t b, size_t e) {
for (auto p = b; p < e; ++p) {
const IdType row_id = row_data[p * row_stride], col_id = col_data[p * col_stride];
CHECK(row_id >= 0 && row_id < csr.num_rows) << "Invalid row index: " << row_id;
CHECK(col_id >= 0 && col_id < csr.num_cols) << "Invalid col index: " << col_id;
for (IdType idx = indptr_data[row_id]; idx < indptr_data[row_id + 1]; ++idx) {
if (indices_data[idx] == col_id) {
IdType eid = data ? data[idx] : idx;
ret_data[p] = return_eids ? eid : weight_data[eid];
break;
}
}
}
}
});
}
return ret;
}
......
......@@ -5,6 +5,7 @@
*/
#include <dgl/array.h>
#include <dgl/runtime/parallel_for.h>
#include <parallel_hashmap/phmap.h>
#include <vector>
#include "array_utils.h"
......@@ -12,6 +13,7 @@
namespace dgl {
using dgl::runtime::NDArray;
using dgl::runtime::parallel_for;
namespace aten {
......@@ -26,17 +28,17 @@ void CountNNZPerRow(
const IdType* B_indices,
IdType* C_indptr_data,
int64_t M) {
phmap::flat_hash_set<IdType> set;
#pragma omp parallel for firstprivate(set)
for (int64_t i = 0; i < M; ++i) {
set.clear();
for (IdType u = A_indptr[i]; u < A_indptr[i + 1]; ++u) {
IdType w = A_indices[u];
for (IdType v = B_indptr[w]; v < B_indptr[w + 1]; ++v)
set.insert(B_indices[v]);
parallel_for(0, M, [=](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
phmap::flat_hash_set<IdType> set;
for (IdType u = A_indptr[i]; u < A_indptr[i + 1]; ++u) {
IdType w = A_indices[u];
for (IdType v = B_indptr[w]; v < B_indptr[w + 1]; ++v)
set.insert(B_indices[v]);
}
C_indptr_data[i] = set.size();
}
C_indptr_data[i] = set.size();
}
});
}
template <typename IdType>
......@@ -66,27 +68,27 @@ void ComputeIndicesAndData(
IdType* C_indices_data,
DType* C_weights_data,
int64_t M) {
phmap::flat_hash_map<IdType, DType> map;
#pragma omp parallel for firstprivate(map)
for (int64_t i = 0; i < M; ++i) {
map.clear();
for (IdType u = A_indptr[i]; u < A_indptr[i + 1]; ++u) {
IdType w = A_indices[u];
DType vA = A_data[A_eids ? A_eids[u] : u];
for (IdType v = B_indptr[w]; v < B_indptr[w + 1]; ++v) {
IdType t = B_indices[v];
DType vB = B_data[B_eids ? B_eids[v] : v];
map[t] += vA * vB;
parallel_for(0, M, [=](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
phmap::flat_hash_map<IdType, DType> map;
for (IdType u = A_indptr[i]; u < A_indptr[i + 1]; ++u) {
IdType w = A_indices[u];
DType vA = A_data[A_eids ? A_eids[u] : u];
for (IdType v = B_indptr[w]; v < B_indptr[w + 1]; ++v) {
IdType t = B_indices[v];
DType vB = B_data[B_eids ? B_eids[v] : v];
map[t] += vA * vB;
}
}
}
IdType v = C_indptr_data[i];
for (auto it : map) {
C_indices_data[v] = it.first;
C_weights_data[v] = it.second;
++v;
IdType v = C_indptr_data[i];
for (auto it : map) {
C_indices_data[v] = it.first;
C_weights_data[v] = it.second;
++v;
}
}
}
});
}
}; // namespace
......
......@@ -4,6 +4,7 @@
* \brief CSR sorting
*/
#include <dgl/array.h>
#include <dgl/runtime/parallel_for.h>
#include <numeric>
#include <algorithm>
#include <vector>
......@@ -48,16 +49,14 @@ void CSRSort_(CSRMatrix* csr) {
csr->data = aten::Range(0, nnz, csr->indptr->dtype.bits, csr->indptr->ctx);
}
IdType* eid_data = static_cast<IdType*>(csr->data->data);
#pragma omp parallel
{
std::vector<ShufflePair> reorder_vec;
#pragma omp for
for (int64_t row = 0; row < num_rows; row++) {
runtime::parallel_for(0, num_rows, [=](size_t b, size_t e) {
for (auto row = b; row < e; ++row) {
const int64_t num_cols = indptr_data[row + 1] - indptr_data[row];
std::vector<ShufflePair> reorder_vec(num_cols);
IdType *col = indices_data + indptr_data[row];
IdType *eid = eid_data + indptr_data[row];
reorder_vec.resize(num_cols);
for (int64_t i = 0; i < num_cols; i++) {
reorder_vec[i].first = col[i];
reorder_vec[i].second = eid[i];
......@@ -71,7 +70,8 @@ void CSRSort_(CSRMatrix* csr) {
eid[i] = reorder_vec[i].second;
}
}
}
});
csr->sorted = true;
}
......@@ -101,37 +101,38 @@ std::pair<CSRMatrix, NDArray> CSRSortByTag(
auto out_indices_data = static_cast<IdType *>(output.indices->data);
auto out_eid_data = static_cast<IdType *>(output.data->data);
#pragma omp parallel for
for (IdType src = 0 ; src < num_rows ; ++src) {
const IdType start = indptr_data[src];
const IdType end = indptr_data[src + 1];
auto tag_pos_row = tag_pos_data + src * (num_tags + 1);
std::vector<IdType> pointer(num_tags, 0);
for (IdType ptr = start ; ptr < end ; ++ptr) {
const IdType dst = indices_data[ptr];
const TagType tag = tag_data[dst];
CHECK_LT(tag, num_tags);
++tag_pos_row[tag + 1];
} // count
for (TagType tag = 1 ; tag <= num_tags; ++tag) {
tag_pos_row[tag] += tag_pos_row[tag - 1];
} // cumulate
for (IdType ptr = start ; ptr < end ; ++ptr) {
const IdType dst = indices_data[ptr];
const IdType eid = eid_data[ptr];
const TagType tag = tag_data[dst];
const IdType offset = tag_pos_row[tag] + pointer[tag];
CHECK_LT(offset, tag_pos_row[tag + 1]);
++pointer[tag];
out_indices_data[start + offset] = dst;
out_eid_data[start + offset] = eid;
runtime::parallel_for(0, num_rows, [&](size_t b, size_t e) {
for (auto src = b; src < e; ++src) {
const IdType start = indptr_data[src];
const IdType end = indptr_data[src + 1];
auto tag_pos_row = tag_pos_data + src * (num_tags + 1);
std::vector<IdType> pointer(num_tags, 0);
for (IdType ptr = start ; ptr < end ; ++ptr) {
const IdType dst = indices_data[ptr];
const TagType tag = tag_data[dst];
CHECK_LT(tag, num_tags);
++tag_pos_row[tag + 1];
} // count
for (TagType tag = 1 ; tag <= num_tags; ++tag) {
tag_pos_row[tag] += tag_pos_row[tag - 1];
} // cumulate
for (IdType ptr = start ; ptr < end ; ++ptr) {
const IdType dst = indices_data[ptr];
const IdType eid = eid_data[ptr];
const TagType tag = tag_data[dst];
const IdType offset = tag_pos_row[tag] + pointer[tag];
CHECK_LT(offset, tag_pos_row[tag + 1]);
++pointer[tag];
out_indices_data[start + offset] = dst;
out_eid_data[start + offset] = eid;
}
}
}
});
output.sorted = false;
return std::make_pair(output, tag_pos);
}
......
......@@ -5,6 +5,7 @@
*/
#include <dgl/array.h>
#include <dgl/runtime/parallel_for.h>
#include <parallel_hashmap/phmap.h>
#include <vector>
#include "array_utils.h"
......@@ -25,16 +26,17 @@ void CountNNZPerRow(
IdType* C_indptr_data,
int64_t M) {
int64_t n = A_indptr.size();
phmap::flat_hash_set<IdType> set;
#pragma omp parallel for firstprivate(set)
for (IdType i = 0; i < M; ++i) {
set.clear();
for (int64_t k = 0; k < n; ++k) {
for (IdType u = A_indptr[k][i]; u < A_indptr[k][i + 1]; ++u)
set.insert(A_indices[k][u]);
runtime::parallel_for(0, M, [=](size_t b, size_t e) {
for (size_t i = b; i < e; ++i) {
phmap::flat_hash_set<IdType> set;
for (int64_t k = 0; k < n; ++k) {
for (IdType u = A_indptr[k][i]; u < A_indptr[k][i + 1]; ++u)
set.insert(A_indices[k][u]);
}
C_indptr_data[i] = set.size();
}
C_indptr_data[i] = set.size();
}
});
}
template <typename IdType>
......@@ -61,25 +63,24 @@ void ComputeIndicesAndData(
DType* C_weights_data,
int64_t M) {
int64_t n = A_indptr.size();
phmap::flat_hash_map<IdType, DType> map;
#pragma omp parallel for firstprivate(map)
for (int64_t i = 0; i < M; ++i) {
map.clear();
for (int64_t k = 0; k < n; ++k) {
for (IdType u = A_indptr[k][i]; u < A_indptr[k][i + 1]; ++u) {
IdType kA = A_indices[k][u];
DType vA = A_data[k][A_eids[k] ? A_eids[k][u] : u];
map[kA] += vA;
runtime::parallel_for(0, M, [=](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
phmap::flat_hash_map<IdType, DType> map;
for (int64_t k = 0; k < n; ++k) {
for (IdType u = A_indptr[k][i]; u < A_indptr[k][i + 1]; ++u) {
IdType kA = A_indices[k][u];
DType vA = A_data[k][A_eids[k] ? A_eids[k][u] : u];
map[kA] += vA;
}
}
IdType j = C_indptr_data[i];
for (auto it : map) {
C_indices_data[j] = it.first;
C_weights_data[j] = it.second;
++j;
}
}
IdType j = C_indptr_data[i];
for (auto it : map) {
C_indices_data[j] = it.first;
C_weights_data[j] = it.second;
++j;
}
}
});
}
}; // namespace
......
......@@ -4,7 +4,7 @@
* \brief COO sorting
*/
#include <dgl/array.h>
#include <dgl/runtime/parallel_for.h>
#include <numeric>
#include <algorithm>
#include <vector>
......@@ -54,6 +54,7 @@ CSRMatrix UnionCsr(const std::vector<CSRMatrix>& csrs) {
for (int64_t i = 1; i <= csrs[0].num_rows; ++i) {
std::vector<int64_t> indices_off;
res_indptr[i] = indptr_data[0][i];
indices_off.push_back(indptr_data[0][i-1]);
for (size_t j = 1; j < csrs.size(); ++j) {
res_indptr[i] += indptr_data[j][i];
......@@ -74,7 +75,6 @@ CSRMatrix UnionCsr(const std::vector<CSRMatrix>& csrs) {
}
} // for check out of bound
} // for
res_indices[off] = min;
res_data[off] = data_data[min_idx][indices_off[min_idx]];
indices_off[min_idx] += 1;
......
......@@ -147,6 +147,7 @@ COOMatrix CSRRowWisePick(CSRMatrix mat, IdArray rows,
global_prefix[t+1] += global_prefix[t];
}
}
#pragma omp barrier
const IdxType thread_offset = global_prefix[thread_id];
......
......@@ -8,6 +8,7 @@
#include <dgl/array.h>
#include <dgl/bcast.h>
#include <dgl/runtime/parallel_for.h>
#include "../selector.h"
namespace dgl {
......@@ -40,24 +41,27 @@ void SDDMMCsr(const BcastOff& bcast,
rhs_dim = bcast.rhs_len,
reduce_size = bcast.reduce_size;
DType* O = out.Ptr<DType>();
#pragma omp parallel for
for (IdType rid = 0; rid < csr.num_rows; ++rid) {
const IdType row_start = indptr[rid], row_end = indptr[rid + 1];
for (IdType j = row_start; j < row_end; ++j) {
const IdType cid = indices[j];
const IdType eid = has_idx? edges[j] : j;
DType* out_off = O + eid * dim;
for (int64_t k = 0; k < dim; ++k) {
const int64_t lhs_add = bcast.use_bcast ? bcast.lhs_offset[k] : k;
const int64_t rhs_add = bcast.use_bcast ? bcast.rhs_offset[k] : k;
const DType* lhs_off = Op::use_lhs?
X + Selector<LhsTarget>::Call(rid, eid, cid) * lhs_dim + lhs_add * reduce_size : nullptr;
const DType* rhs_off = Op::use_rhs?
Y + Selector<RhsTarget>::Call(rid, eid, cid) * rhs_dim + rhs_add * reduce_size : nullptr;
out_off[k] = Op::Call(lhs_off, rhs_off, reduce_size);
runtime::parallel_for(0, csr.num_rows, [=](IdType b, IdType e) {
for (auto rid = b; rid < e; ++rid) {
const IdType row_start = indptr[rid], row_end = indptr[rid + 1];
for (IdType j = row_start; j < row_end; ++j) {
const IdType cid = indices[j];
const IdType eid = has_idx? edges[j] : j;
DType* out_off = O + eid * dim;
for (int64_t k = 0; k < dim; ++k) {
const int64_t lhs_add = bcast.use_bcast ? bcast.lhs_offset[k] : k;
const int64_t rhs_add = bcast.use_bcast ? bcast.rhs_offset[k] : k;
const DType* lhs_off = Op::use_lhs
? X + Selector<LhsTarget>::Call(rid, eid, cid) * lhs_dim + lhs_add * reduce_size
: nullptr;
const DType* rhs_off = Op::use_rhs
? Y + Selector<RhsTarget>::Call(rid, eid, cid) * rhs_dim + rhs_add * reduce_size
: nullptr;
out_off[k] = Op::Call(lhs_off, rhs_off, reduce_size);
}
}
}
}
});
}
/*!
......@@ -86,9 +90,8 @@ void SDDMMCoo(const BcastOff& bcast,
rhs_dim = bcast.rhs_len,
reduce_size = bcast.reduce_size;
DType* O = out.Ptr<DType>();
const int64_t nnz = coo.row->shape[0];
#pragma omp parallel for
for (IdType i = 0; i < nnz; ++i) {
for (int64_t i = 0; i < coo.row->shape[0]; ++i) {
const IdType rid = row[i];
const IdType cid = col[i];
const IdType eid = has_idx? edges[i] : i;
......
......@@ -7,6 +7,7 @@
#define DGL_ARRAY_CPU_SEGMENT_REDUCE_H_
#include <dgl/array.h>
#include <dgl/runtime/parallel_for.h>
namespace dgl {
namespace aten {
......@@ -27,14 +28,15 @@ void SegmentSum(NDArray feat, NDArray offsets, NDArray out) {
const DType* feat_data = feat.Ptr<DType>();
const IdType* offsets_data = offsets.Ptr<IdType>();
DType *out_data = out.Ptr<DType>();
#pragma omp parallel for
for (int i = 0; i < n; ++i) {
for (IdType j = offsets_data[i]; j < offsets_data[i + 1]; ++j) {
for (int k = 0; k < dim; ++k) {
out_data[i * dim + k] += feat_data[j * dim + k];
runtime::parallel_for(0, n, [=](int b, int e) {
for (auto i = b; i < e; ++i) {
for (IdType j = offsets_data[i]; j < offsets_data[i + 1]; ++j) {
for (int k = 0; k < dim; ++k) {
out_data[i * dim + k] += feat_data[j * dim + k];
}
}
}
}
});
}
/*!
......@@ -58,18 +60,19 @@ void SegmentCmp(NDArray feat, NDArray offsets,
IdType *arg_data = arg.Ptr<IdType>();
std::fill(out_data, out_data + out.NumElements(), Cmp::zero);
std::fill(arg_data, arg_data + arg.NumElements(), -1);
#pragma omp parallel for
for (int i = 0; i < n; ++i) {
for (IdType j = offsets_data[i]; j < offsets_data[i + 1]; ++j) {
for (int k = 0; k < dim; ++k) {
const DType val = feat_data[j * dim + k];
if (Cmp::Call(out_data[i * dim + k], val)) {
out_data[i * dim + k] = val;
arg_data[i * dim + k] = j;
runtime::parallel_for(0, n, [=](int b, int e) {
for (auto i = b; i < e; ++i) {
for (IdType j = offsets_data[i]; j < offsets_data[i + 1]; ++j) {
for (int k = 0; k < dim; ++k) {
const DType val = feat_data[j * dim + k];
if (Cmp::Call(out_data[i * dim + k], val)) {
out_data[i * dim + k] = val;
arg_data[i * dim + k] = j;
}
}
}
}
}
});
}
/*!
......@@ -114,14 +117,15 @@ void BackwardSegmentCmp(NDArray feat, NDArray arg, NDArray out) {
const DType* feat_data = feat.Ptr<DType>();
const IdType* arg_data = arg.Ptr<IdType>();
DType* out_data = out.Ptr<DType>();
#pragma omp parallel for
for (int i = 0; i < n; ++i) {
for (int k = 0; k < dim; ++k) {
int write_row = arg_data[i * dim + k];
if (write_row >= 0)
out_data[write_row * dim + k] = feat_data[i * dim + k];
runtime::parallel_for(0, n, [=](int b, int e) {
for (auto i = b; i < e; ++i) {
for (int k = 0; k < dim; ++k) {
int write_row = arg_data[i * dim + k];
if (write_row >= 0)
out_data[write_row * dim + k] = feat_data[i * dim + k];
}
}
}
});
}
} // namespace cpu
......
......@@ -4,6 +4,7 @@
* \brief CPU implementation of COO sparse matrix operators
*/
#include <dmlc/omp.h>
#include <dgl/runtime/parallel_for.h>
#include <vector>
#include <unordered_set>
#include <unordered_map>
......@@ -14,6 +15,7 @@
namespace dgl {
using runtime::NDArray;
using runtime::parallel_for;
namespace aten {
namespace impl {
......@@ -55,12 +57,13 @@ NDArray COOIsNonZero(COOMatrix coo, NDArray row, NDArray col) {
const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1;
const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1;
const int64_t kmax = std::max(rowlen, collen);
#pragma omp parallel for
for (int64_t k = 0; k < kmax; ++k) {
int64_t i = row_stride * k;
int64_t j = col_stride * k;
rst_data[k] = COOIsNonZero<XPU, IdType>(coo, row_data[i], col_data[j])? 1 : 0;
}
parallel_for(0, kmax, [=](size_t b, size_t e) {
for (auto k = b; k < e; ++k) {
int64_t i = row_stride * k;
int64_t j = col_stride * k;
rst_data[k] = COOIsNonZero<XPU, IdType>(coo, row_data[i], col_data[j])? 1 : 0;
}
});
return rst;
}
......@@ -114,8 +117,9 @@ NDArray COOGetRowNNZ(COOMatrix coo, NDArray rows) {
NDArray rst = NDArray::Empty({len}, rows->dtype, rows->ctx);
IdType* rst_data = static_cast<IdType*>(rst->data);
#pragma omp parallel for
for (int64_t i = 0; i < len; ++i)
for (int64_t i = 0; i < len; ++i) {
rst_data[i] = COOGetRowNNZ<XPU, IdType>(coo, vid_data[i]);
}
return rst;
}
......@@ -178,18 +182,19 @@ IdArray COOGetData(COOMatrix coo, IdArray rows, IdArray cols) {
// the choice.
if (coo.row_sorted) {
#pragma omp parallel for
for (int64_t p = 0; p < retlen; ++p) {
const IdType row_id = row_data[p * row_stride], col_id = col_data[p * col_stride];
auto it = std::lower_bound(coo_row, coo_row + nnz, row_id);
for (; it < coo_row + nnz && *it == row_id; ++it) {
const auto idx = it - coo_row;
if (coo_col[idx] == col_id) {
ret_data[p] = data? data[idx] : idx;
break;
parallel_for(0, retlen, [&](size_t b, size_t e) {
for (auto p = b; p < e; ++p) {
const IdType row_id = row_data[p * row_stride], col_id = col_data[p * col_stride];
auto it = std::lower_bound(coo_row, coo_row + nnz, row_id);
for (; it < coo_row + nnz && *it == row_id; ++it) {
const auto idx = it - coo_row;
if (coo_col[idx] == col_id) {
ret_data[p] = data? data[idx] : idx;
break;
}
}
}
}
});
} else {
#pragma omp parallel for
for (int64_t p = 0; p < retlen; ++p) {
......@@ -328,67 +333,66 @@ CSRMatrix COOToCSR(COOMatrix coo) {
IdType * const fill_data = data ? nullptr : static_cast<IdType*>(coo.data->data);
if (NNZ > 0) {
#pragma omp parallel
{
const int num_threads = omp_get_num_threads();
const int thread_id = omp_get_thread_num();
// We partition the set of non-zeros among the threads
const int64_t nz_chunk = (NNZ+num_threads-1)/num_threads;
const int64_t nz_start = thread_id*nz_chunk;
const int64_t nz_end = std::min(NNZ, nz_start+nz_chunk);
// Each thread searches the row array for a change, and marks its
// location in Bp. Threads, other than the first, start at the last
// index covered by the previous, in order to detect changes in the row
// array between thread partitions. This means that each thread after
// the first, searches the range [nz_start-1, nz_end). That is,
// if we had 10 non-zeros, and 4 threads, the indexes searched by each
// thread would be:
// 0: [0, 1, 2]
// 1: [2, 3, 4, 5]
// 2: [5, 6, 7, 8]
// 3: [8, 9]
//
// That way, if the row array were [0, 0, 1, 2, 2, 2, 4, 5, 5, 6], each
// change in row would be captured by one thread:
//
// 0: [0, 0, 1] - row 0
// 1: [1, 2, 2, 2] - row 1
// 2: [2, 4, 5, 5] - rows 2, 3, and 4
// 3: [5, 6] - rows 5 and 6
//
int64_t row = 0;
if (nz_start < nz_end) {
row = nz_start == 0 ? 0 : row_data[nz_start-1];
for (int64_t i = nz_start; i < nz_end; ++i) {
while (row != row_data[i]) {
++row;
Bp[row] = i;
auto num_threads = omp_get_max_threads();
parallel_for(0, num_threads, [&](int b, int e) {
for (auto thread_id = b; thread_id < e; ++thread_id) {
// We partition the set of non-zeros among the threads
const int64_t nz_chunk = (NNZ+num_threads-1)/num_threads;
const int64_t nz_start = thread_id*nz_chunk;
const int64_t nz_end = std::min(NNZ, nz_start+nz_chunk);
// Each thread searches the row array for a change, and marks its
// location in Bp. Threads, other than the first, start at the last
// index covered by the previous, in order to detect changes in the row
// array between thread partitions. This means that each thread after
// the first, searches the range [nz_start-1, nz_end). That is,
// if we had 10 non-zeros, and 4 threads, the indexes searched by each
// thread would be:
// 0: [0, 1, 2]
// 1: [2, 3, 4, 5]
// 2: [5, 6, 7, 8]
// 3: [8, 9]
//
// That way, if the row array were [0, 0, 1, 2, 2, 2, 4, 5, 5, 6], each
// change in row would be captured by one thread:
//
// 0: [0, 0, 1] - row 0
// 1: [1, 2, 2, 2] - row 1
// 2: [2, 4, 5, 5] - rows 2, 3, and 4
// 3: [5, 6] - rows 5 and 6
//
int64_t row = 0;
if (nz_start < nz_end) {
row = nz_start == 0 ? 0 : row_data[nz_start-1];
for (int64_t i = nz_start; i < nz_end; ++i) {
while (row != row_data[i]) {
++row;
Bp[row] = i;
}
}
}
// We will not detect the row change for the last row, nor any empty
// rows at the end of the matrix, so the last active thread needs to
// mark all remaining rows in Bp with NNZ.
if (nz_end == NNZ) {
while (row < N) {
++row;
Bp[row] = NNZ;
// We will not detect the row change for the last row, nor any empty
// rows at the end of the matrix, so the last active thread needs to
// mark all remaining rows in Bp with NNZ.
if (nz_end == NNZ) {
while (row < N) {
++row;
Bp[row] = NNZ;
}
}
}
if (fill_data) {
// TODO(minjie): Many of our current implementation assumes that CSR must have
// a data array. This is a temporary workaround. Remove this after:
// - The old immutable graph implementation is deprecated.
// - The old binary reduce kernel is deprecated.
std::iota(fill_data+nz_start,
fill_data+nz_end,
nz_start);
if (fill_data) {
// TODO(minjie): Many of our current implementation assumes that CSR must have
// a data array. This is a temporary workaround. Remove this after:
// - The old immutable graph implementation is deprecated.
// - The old binary reduce kernel is deprecated.
std::iota(fill_data+nz_start,
fill_data+nz_end,
nz_start);
}
}
}
}
});
} else {
std::fill(Bp, Bp+N+1, 0);
}
......@@ -627,11 +631,12 @@ COOMatrix COOReorder(COOMatrix coo, runtime::NDArray new_row_id_arr,
IdType *out_row = static_cast<IdType*>(out_row_arr->data);
IdType *out_col = static_cast<IdType*>(out_col_arr->data);
#pragma omp parallel for
for (int64_t i = 0; i < nnz; i++) {
out_row[i] = new_row_ids[in_rows[i]];
out_col[i] = new_col_ids[in_cols[i]];
}
parallel_for(0, nnz, [=](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
out_row[i] = new_row_ids[in_rows[i]];
out_col[i] = new_col_ids[in_cols[i]];
}
});
return COOMatrix(num_rows, num_cols, out_row_arr, out_col_arr, out_data_arr);
}
......
......@@ -4,6 +4,7 @@
* \brief CSR matrix operator CPU implementation
*/
#include <dgl/array.h>
#include <dgl/runtime/parallel_for.h>
#include <vector>
#include <unordered_set>
#include <numeric>
......@@ -12,6 +13,7 @@
namespace dgl {
using runtime::NDArray;
using runtime::parallel_for;
namespace aten {
namespace impl {
......@@ -491,11 +493,12 @@ CSRMatrix CSRReorder(CSRMatrix csr, runtime::NDArray new_row_id_arr,
// Compute the length of rows for the new matrix.
std::vector<IdType> new_row_lens(num_rows, -1);
#pragma omp parallel for
for (int64_t i = 0; i < num_rows; i++) {
int64_t new_row_id = new_row_ids[i];
new_row_lens[new_row_id] = in_indptr[i + 1] - in_indptr[i];
}
parallel_for(0, num_rows, [=, &new_row_lens](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
int64_t new_row_id = new_row_ids[i];
new_row_lens[new_row_id] = in_indptr[i + 1] - in_indptr[i];
}
});
// Compute the starting location of each row in the new matrix.
out_indptr[0] = 0;
// This is sequential. It should be pretty fast.
......@@ -506,23 +509,24 @@ CSRMatrix CSRReorder(CSRMatrix csr, runtime::NDArray new_row_id_arr,
CHECK_EQ(out_indptr[num_rows], nnz);
// Copy indices and data with the new order.
// Here I iterate rows in the order of the old matrix.
#pragma omp parallel for
for (int64_t i = 0; i < num_rows; i++) {
const IdType *in_row = in_indices + in_indptr[i];
const IdType *in_row_data = in_data + in_indptr[i];
int64_t new_row_id = new_row_ids[i];
IdType *out_row = out_indices + out_indptr[new_row_id];
IdType *out_row_data = out_data + out_indptr[new_row_id];
int64_t row_len = new_row_lens[new_row_id];
// Here I iterate col indices in a row in the order of the old matrix.
for (int64_t j = 0; j < row_len; j++) {
out_row[j] = new_col_ids[in_row[j]];
out_row_data[j] = in_row_data[j];
parallel_for(0, num_rows, [=](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
const IdType *in_row = in_indices + in_indptr[i];
const IdType *in_row_data = in_data + in_indptr[i];
int64_t new_row_id = new_row_ids[i];
IdType *out_row = out_indices + out_indptr[new_row_id];
IdType *out_row_data = out_data + out_indptr[new_row_id];
int64_t row_len = new_row_lens[new_row_id];
// Here I iterate col indices in a row in the order of the old matrix.
for (int64_t j = 0; j < row_len; j++) {
out_row[j] = new_col_ids[in_row[j]];
out_row_data[j] = in_row_data[j];
}
// TODO(zhengda) maybe we should sort the column indices.
}
// TODO(zhengda) maybe we should sort the column indices.
}
});
return CSRMatrix(num_rows, num_cols,
out_indptr_arr, out_indices_arr, out_data_arr);
}
......
......@@ -8,6 +8,7 @@
#include <dgl/array.h>
#include <dgl/bcast.h>
#include <dgl/runtime/parallel_for.h>
#include <algorithm>
#include <limits>
#include <memory>
......@@ -46,16 +47,18 @@ void SpMMSumCsrXbyak(dgl::ElemWiseAddUpdate<Op>* cpu_spec, const BcastOff& bcast
const IdType* indices = csr.indices.Ptr<IdType>();
const IdType* edges = csr.data.Ptr<IdType>();
int64_t dim = bcast.out_len, lhs_dim = bcast.lhs_len, rhs_dim = bcast.rhs_len;
#pragma omp parallel for
for (IdType rid = 0; rid < csr.num_rows; ++rid) {
const IdType row_start = indptr[rid], row_end = indptr[rid + 1];
DType* out_off = O + rid * dim;
for (IdType j = row_start; j < row_end; ++j) {
const IdType cid = indices[j];
const IdType eid = has_idx ? edges[j] : j;
cpu_spec->run(out_off, X + cid * lhs_dim, W + eid * rhs_dim, dim);
runtime::parallel_for(0, csr.num_rows, [&](size_t b, size_t e) {
for (auto rid = b; rid < e; ++rid) {
const IdType row_start = indptr[rid], row_end = indptr[rid + 1];
DType* out_off = O + rid * dim;
for (IdType j = row_start; j < row_end; ++j) {
const IdType cid = indices[j];
const IdType eid = has_idx ? edges[j] : j;
cpu_spec->run(out_off, X + cid * lhs_dim, W + eid * rhs_dim, dim);
}
}
}
});
}
#endif // USE_AVX
#endif // _WIN32
......@@ -79,24 +82,25 @@ void SpMMSumCsrNaive(const BcastOff& bcast, const CSRMatrix& csr, const DType* X
const IdType* indices = csr.indices.Ptr<IdType>();
const IdType* edges = csr.data.Ptr<IdType>();
int64_t dim = bcast.out_len, lhs_dim = bcast.lhs_len, rhs_dim = bcast.rhs_len;
#pragma omp parallel for
for (IdType rid = 0; rid < csr.num_rows; ++rid) {
const IdType row_start = indptr[rid], row_end = indptr[rid + 1];
DType* out_off = O + rid * dim;
for (IdType j = row_start; j < row_end; ++j) {
const IdType cid = indices[j];
const IdType eid = has_idx ? edges[j] : j;
for (int64_t k = 0; k < dim; ++k) {
const int64_t lhs_add = bcast.use_bcast ? bcast.lhs_offset[k] : k;
const int64_t rhs_add = bcast.use_bcast ? bcast.rhs_offset[k] : k;
const DType* lhs_off =
Op::use_lhs ? X + cid * lhs_dim + lhs_add : nullptr;
const DType* rhs_off =
Op::use_rhs ? W + eid * rhs_dim + rhs_add : nullptr;
out_off[k] += Op::Call(lhs_off, rhs_off);
runtime::parallel_for(0, csr.num_rows, [&](size_t b, size_t e) {
for (auto rid = b; rid < e; ++rid) {
const IdType row_start = indptr[rid], row_end = indptr[rid + 1];
DType* out_off = O + rid * dim;
for (IdType j = row_start; j < row_end; ++j) {
const IdType cid = indices[j];
const IdType eid = has_idx ? edges[j] : j;
for (int64_t k = 0; k < dim; ++k) {
const int64_t lhs_add = bcast.use_bcast ? bcast.lhs_offset[k] : k;
const int64_t rhs_add = bcast.use_bcast ? bcast.rhs_offset[k] : k;
const DType* lhs_off =
Op::use_lhs ? X + cid * lhs_dim + lhs_add : nullptr;
const DType* rhs_off =
Op::use_rhs ? W + eid * rhs_dim + rhs_add : nullptr;
out_off[k] += Op::Call(lhs_off, rhs_off);
}
}
}
}
});
}
/*!
......@@ -270,31 +274,32 @@ void SpMMCmpCsr(const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat,
#endif // USE_AVX
#endif // _WIN32
#pragma omp parallel for
for (IdType rid = 0; rid < csr.num_rows; ++rid) {
const IdType row_start = indptr[rid], row_end = indptr[rid + 1];
DType* out_off = O + rid * dim;
IdType* argx_off = argX + rid * dim;
IdType* argw_off = argW + rid * dim;
for (IdType j = row_start; j < row_end; ++j) {
const IdType cid = indices[j];
const IdType eid = has_idx ? edges[j] : j;
for (int64_t k = 0; k < dim; ++k) {
const int64_t lhs_add = bcast.use_bcast ? bcast.lhs_offset[k] : k;
const int64_t rhs_add = bcast.use_bcast ? bcast.rhs_offset[k] : k;
const DType* lhs_off =
Op::use_lhs ? X + cid * lhs_dim + lhs_add : nullptr;
const DType* rhs_off =
Op::use_rhs ? W + eid * rhs_dim + rhs_add : nullptr;
const DType val = Op::Call(lhs_off, rhs_off);
if (Cmp::Call(out_off[k], val)) {
out_off[k] = val;
if (Op::use_lhs) argx_off[k] = cid;
if (Op::use_rhs) argw_off[k] = eid;
runtime::parallel_for(0, csr.num_rows, [&](size_t b, size_t e) {
for (auto rid = b; rid < e; ++rid) {
const IdType row_start = indptr[rid], row_end = indptr[rid + 1];
DType* out_off = O + rid * dim;
IdType* argx_off = argX + rid * dim;
IdType* argw_off = argW + rid * dim;
for (IdType j = row_start; j < row_end; ++j) {
const IdType cid = indices[j];
const IdType eid = has_idx ? edges[j] : j;
for (int64_t k = 0; k < dim; ++k) {
const int64_t lhs_add = bcast.use_bcast ? bcast.lhs_offset[k] : k;
const int64_t rhs_add = bcast.use_bcast ? bcast.rhs_offset[k] : k;
const DType* lhs_off =
Op::use_lhs ? X + cid * lhs_dim + lhs_add : nullptr;
const DType* rhs_off =
Op::use_rhs ? W + eid * rhs_dim + rhs_add : nullptr;
const DType val = Op::Call(lhs_off, rhs_off);
if (Cmp::Call(out_off[k], val)) {
out_off[k] = val;
if (Op::use_lhs) argx_off[k] = cid;
if (Op::use_rhs) argw_off[k] = eid;
}
}
}
}
}
}
});
#if !defined(_WIN32)
#ifdef USE_AVX
#ifdef USE_LIBXSMM
......
......@@ -8,6 +8,7 @@
#include <dgl/immutable_graph.h>
#include <dgl/packed_func_ext.h>
#include <dgl/runtime/container.h>
#include <dgl/runtime/parallel_for.h>
#include <algorithm>
#include "../c_api_common.h"
......@@ -261,34 +262,36 @@ IdArray GraphOp::MapParentIdToSubgraphId(IdArray parent_vids, IdArray query) {
const bool is_sorted = std::is_sorted(parent_data, parent_data + parent_len);
if (is_sorted) {
#pragma omp parallel for
for (int64_t i = 0; i < query_len; i++) {
const dgl_id_t id = query_data[i];
const auto it = std::find(parent_data, parent_data + parent_len, id);
// If the vertex Id doesn't exist, the vid in the subgraph is -1.
if (it != parent_data + parent_len) {
rst_data[i] = it - parent_data;
} else {
rst_data[i] = -1;
runtime::parallel_for(0, query_len, [&](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
const dgl_id_t id = query_data[i];
const auto it = std::find(parent_data, parent_data + parent_len, id);
// If the vertex Id doesn't exist, the vid in the subgraph is -1.
if (it != parent_data + parent_len) {
rst_data[i] = it - parent_data;
} else {
rst_data[i] = -1;
}
}
}
});
} else {
std::unordered_map<dgl_id_t, dgl_id_t> parent_map;
for (int64_t i = 0; i < parent_len; i++) {
const dgl_id_t id = parent_data[i];
parent_map[id] = i;
}
#pragma omp parallel for
for (int64_t i = 0; i < query_len; i++) {
const dgl_id_t id = query_data[i];
auto it = parent_map.find(id);
// If the vertex Id doesn't exist, the vid in the subgraph is -1.
if (it != parent_map.end()) {
rst_data[i] = it->second;
} else {
rst_data[i] = -1;
runtime::parallel_for(0, query_len, [&](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
const dgl_id_t id = query_data[i];
auto it = parent_map.find(id);
// If the vertex Id doesn't exist, the vid in the subgraph is -1.
if (it != parent_map.end()) {
rst_data[i] = it->second;
} else {
rst_data[i] = -1;
}
}
}
});
}
return rst;
}
......@@ -567,14 +570,15 @@ DGL_REGISTER_GLOBAL("transform._CAPI_DGLPartitionWithHalo")
graph_ptr->GetInCSR();
std::vector<std::shared_ptr<HaloSubgraph> > subgs(max_part_id + 1);
int num_partitions = part_nodes.size();
#pragma omp parallel for
for (int i = 0; i < num_partitions; i++) {
auto nodes = aten::VecToIdArray(part_nodes[i]);
HaloSubgraph subg = GraphOp::GetSubgraphWithHalo(graph_ptr, nodes, num_hops);
std::shared_ptr<HaloSubgraph> subg_ptr(new HaloSubgraph(subg));
int part_id = part_ids[i];
subgs[part_id] = subg_ptr;
}
runtime::parallel_for(0, num_partitions, [&](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
auto nodes = aten::VecToIdArray(part_nodes[i]);
HaloSubgraph subg = GraphOp::GetSubgraphWithHalo(graph_ptr, nodes, num_hops);
std::shared_ptr<HaloSubgraph> subg_ptr(new HaloSubgraph(subg));
int part_id = part_ids[i];
subgs[part_id] = subg_ptr;
}
});
List<SubgraphRef> ret_list;
for (size_t i = 0; i < subgs.size(); i++) {
ret_list.push_back(SubgraphRef(subgs[i]));
......@@ -732,24 +736,25 @@ IdArray MapIds(IdArray ids, IdArray range_starts, IdArray range_ends, IdArray ty
const IdType *typed_map_data = static_cast<IdType *>(typed_map->data);
IdType *types_data = static_cast<IdType *>(ret->data);
IdType *per_type_ids_data = static_cast<IdType *>(ret->data) + num_ids;
#pragma omp parallel for
for (int64_t i = 0; i < ids->shape[0]; i++) {
IdType id = ids_data[i];
auto it = std::lower_bound(range_end_data, range_end_data + num_ranges, id);
// The range must exist.
BUG_IF_FAIL(it != range_end_data + num_ranges);
size_t range_id = it - range_end_data;
int type_id = range_id % num_types;
types_data[i] = type_id;
int part_id = range_id / num_types;
BUG_IF_FAIL(part_id < num_parts);
if (part_id == 0) {
per_type_ids_data[i] = id - range_start_data[range_id];
} else {
per_type_ids_data[i] = id - range_start_data[range_id]
runtime::parallel_for(0, ids->shape[0], [&](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
IdType id = ids_data[i];
auto it = std::lower_bound(range_end_data, range_end_data + num_ranges, id);
// The range must exist.
BUG_IF_FAIL(it != range_end_data + num_ranges);
size_t range_id = it - range_end_data;
int type_id = range_id % num_types;
types_data[i] = type_id;
int part_id = range_id / num_types;
BUG_IF_FAIL(part_id < num_parts);
if (part_id == 0) {
per_type_ids_data[i] = id - range_start_data[range_id];
} else {
per_type_ids_data[i] = id - range_start_data[range_id]
+ typed_map_data[num_parts * type_id + part_id - 1];
}
}
}
});
return ret;
}
......
......@@ -8,6 +8,7 @@
#include <dgl/packed_func_ext.h>
#include <dgl/immutable_graph.h>
#include <dgl/runtime/container.h>
#include <dgl/runtime/parallel_for.h>
#include <set>
#include "../c_api_common.h"
......@@ -629,14 +630,19 @@ DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroCreateFormat")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
HeteroGraphRef hg = args[0];
dgl_format_code_t code = hg->GetRelationGraph(0)->GetAllowedFormats();
#if !defined(DGL_USE_CUDA)
#pragma omp parallel for
auto get_format_f = [&](size_t etype_b, size_t etype_e) {
for (auto etype = etype_b; etype < etype_e; ++etype) {
auto bg = std::dynamic_pointer_cast<UnitGraph>(hg->GetRelationGraph(etype));
for (auto format : CodeToSparseFormats(code))
bg->GetFormat(format);
}
};
#if !(defined(DGL_USE_CUDA))
runtime::parallel_for(0, hg->NumEdgeTypes(), get_format_f);
#else
get_format_f(0, hg->NumEdgeTypes());
#endif
for (int64_t etype = 0; etype < hg->NumEdgeTypes(); ++etype) {
auto bg = std::dynamic_pointer_cast<UnitGraph>(hg->GetRelationGraph(etype));
for (auto format : CodeToSparseFormats(code))
bg->GetFormat(format);
}
});
DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroGetFormatGraph")
......
......@@ -9,6 +9,7 @@
#include <dgl/runtime/container.h>
#include <dgl/runtime/ndarray.h>
#include <dgl/runtime/parallel_for.h>
#include <dgl/packed_func_ext.h>
#include <dgl/immutable_graph.h>
#include <dgl/nodeflow.h>
......@@ -829,15 +830,16 @@ DGL_REGISTER_GLOBAL("network._CAPI_FastPull")
char *return_data = new char[ID_size*row_size];
const int64_t local_ids_size = local_ids.size();
// Copy local data
#pragma omp parallel for
for (int64_t i = 0; i < local_ids_size; ++i) {
CHECK_GE(ID_size*row_size, local_ids_orginal[i] * row_size + row_size);
CHECK_GE(data_size, local_ids[i] * row_size + row_size);
CHECK_GE(local_ids[i], 0);
memcpy(return_data + local_ids_orginal[i] * row_size,
local_data_char + local_ids[i] * row_size,
row_size);
}
runtime::parallel_for(0, local_ids_size, [&](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
CHECK_GE(ID_size*row_size, local_ids_orginal[i] * row_size + row_size);
CHECK_GE(data_size, local_ids[i] * row_size + row_size);
CHECK_GE(local_ids[i], 0);
memcpy(return_data + local_ids_orginal[i] * row_size,
local_data_char + local_ids[i] * row_size,
row_size);
}
});
// Recv remote message
for (int i = 0; i < msg_count; ++i) {
KVStoreMsg *kv_msg = recv_kv_message(receiver);
......
......@@ -9,6 +9,7 @@
#include <dgl/runtime/container.h>
#include <dgl/packed_func_ext.h>
#include <dgl/random.h>
#include <dgl/runtime/parallel_for.h>
#include <dmlc/omp.h>
#include <algorithm>
#include <cstdlib>
......@@ -850,19 +851,20 @@ std::vector<NodeFlow> NeighborSamplingImpl(const ImmutableGraphPtr gptr,
BuildCsr(*gptr, neigh_type);
// generate node flows
std::vector<NodeFlow> nflows(num_workers);
#pragma omp parallel for
for (int i = 0; i < num_workers; i++) {
// create per-worker seed nodes.
const int64_t start = (batch_start_id + i) * batch_size;
const int64_t end = std::min(start + batch_size, num_seeds);
// TODO(minjie): the vector allocation/copy is unnecessary
std::vector<dgl_id_t> worker_seeds(end - start);
std::copy(seed_nodes_data + start, seed_nodes_data + end,
worker_seeds.begin());
nflows[i] = SamplerOp::NeighborSample(
runtime::parallel_for(0, num_workers, [&](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
// create per-worker seed nodes.
const int64_t start = (batch_start_id + i) * batch_size;
const int64_t end = std::min(start + batch_size, num_seeds);
// TODO(minjie): the vector allocation/copy is unnecessary
std::vector<dgl_id_t> worker_seeds(end - start);
std::copy(seed_nodes_data + start, seed_nodes_data + end,
worker_seeds.begin());
nflows[i] = SamplerOp::NeighborSample(
gptr.get(), worker_seeds, neigh_type, num_hops, expand_factor,
add_self_loop, probability);
}
}
});
return nflows;
}
......@@ -977,18 +979,19 @@ DGL_REGISTER_GLOBAL("sampling._CAPI_LayerSampling")
BuildCsr(*gptr, neigh_type);
// generate node flows
std::vector<NodeFlow> nflows(num_workers);
#pragma omp parallel for
for (int i = 0; i < num_workers; i++) {
// create per-worker seed nodes.
const int64_t start = (batch_start_id + i) * batch_size;
const int64_t end = std::min(start + batch_size, num_seeds);
// TODO(minjie): the vector allocation/copy is unnecessary
std::vector<dgl_id_t> worker_seeds(end - start);
std::copy(seed_nodes_data + start, seed_nodes_data + end,
worker_seeds.begin());
nflows[i] = SamplerOp::LayerUniformSample(
gptr.get(), worker_seeds, neigh_type, layer_sizes);
}
runtime::parallel_for(0, num_workers, [&](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
// create per-worker seed nodes.
const int64_t start = (batch_start_id + i) * batch_size;
const int64_t end = std::min(start + batch_size, num_seeds);
// TODO(minjie): the vector allocation/copy is unnecessary
std::vector<dgl_id_t> worker_seeds(end - start);
std::copy(seed_nodes_data + start, seed_nodes_data + end,
worker_seeds.begin());
nflows[i] = SamplerOp::LayerUniformSample(
gptr.get(), worker_seeds, neigh_type, layer_sizes);
}
});
*rv = List<NodeFlow>(nflows);
});
......@@ -1466,54 +1469,55 @@ public:
std::vector<SubgraphRef> positive_subgs(num_workers);
std::vector<SubgraphRef> negative_subgs(num_workers);
#pragma omp parallel for
for (int64_t i = 0; i < num_workers; i++) {
const int64_t start = (batch_curr_id_ + i) * batch_size_;
const int64_t end = std::min(start + batch_size_, num_seeds_);
const int64_t num_edges = end - start;
IdArray worker_seeds;
if (replacement_ == false) {
worker_seeds = seed_edges_.CreateView({num_edges}, DLDataType{kDLInt, 64, 1},
sizeof(dgl_id_t) * start);
} else {
std::vector<dgl_id_t> seeds;
const dgl_id_t *seed_edge_ids = static_cast<const dgl_id_t *>(seed_edges_->data);
// sampling of each edge is a standalone event
for (int64_t i = 0; i < num_edges; ++i) {
int64_t seed = static_cast<const int64_t>(
RandomEngine::ThreadLocal()->RandInt(num_seeds_));
seeds.push_back(seed_edge_ids[seed]);
}
runtime::parallel_for(0, num_workers, [&](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
const int64_t start = (batch_curr_id_ + i) * batch_size_;
const int64_t end = std::min(start + batch_size_, num_seeds_);
const int64_t num_edges = end - start;
IdArray worker_seeds;
worker_seeds = aten::VecToIdArray(seeds, seed_edges_->dtype.bits);
}
if (replacement_ == false) {
worker_seeds = seed_edges_.CreateView({num_edges}, DLDataType{kDLInt, 64, 1},
sizeof(dgl_id_t) * start);
} else {
std::vector<dgl_id_t> seeds;
const dgl_id_t *seed_edge_ids = static_cast<const dgl_id_t *>(seed_edges_->data);
// sampling of each edge is a standalone event
for (int64_t i = 0; i < num_edges; ++i) {
int64_t seed = static_cast<const int64_t>(
RandomEngine::ThreadLocal()->RandInt(num_seeds_));
seeds.push_back(seed_edge_ids[seed]);
}
EdgeArray arr = gptr_->FindEdges(worker_seeds);
const dgl_id_t *src_ids = static_cast<const dgl_id_t *>(arr.src->data);
const dgl_id_t *dst_ids = static_cast<const dgl_id_t *>(arr.dst->data);
std::vector<dgl_id_t> src_vec(src_ids, src_ids + num_edges);
std::vector<dgl_id_t> dst_vec(dst_ids, dst_ids + num_edges);
// TODO(zhengda) what if there are duplicates in the src and dst vectors.
worker_seeds = aten::VecToIdArray(seeds, seed_edges_->dtype.bits);
}
Subgraph subg = gptr_->EdgeSubgraph(worker_seeds, false);
positive_subgs[i] = ConvertRef(subg);
// For chunked negative sampling, we accept "chunk-head" for corrupting head
// nodes and "chunk-tail" for corrupting tail nodes.
if (neg_mode_.substr(0, 5) == "chunk") {
NegSubgraph neg_subg = genChunkedNegEdgeSubgraph(subg, neg_mode_.substr(6),
neg_sample_size_,
exclude_positive_,
check_false_neg_);
negative_subgs[i] = ConvertRef(neg_subg);
} else if (neg_mode_ == "head" || neg_mode_ == "tail") {
NegSubgraph neg_subg = genNegEdgeSubgraph(subg, neg_mode_,
neg_sample_size_,
exclude_positive_,
check_false_neg_);
negative_subgs[i] = ConvertRef(neg_subg);
EdgeArray arr = gptr_->FindEdges(worker_seeds);
const dgl_id_t *src_ids = static_cast<const dgl_id_t *>(arr.src->data);
const dgl_id_t *dst_ids = static_cast<const dgl_id_t *>(arr.dst->data);
std::vector<dgl_id_t> src_vec(src_ids, src_ids + num_edges);
std::vector<dgl_id_t> dst_vec(dst_ids, dst_ids + num_edges);
// TODO(zhengda) what if there are duplicates in the src and dst vectors.
Subgraph subg = gptr_->EdgeSubgraph(worker_seeds, false);
positive_subgs[i] = ConvertRef(subg);
// For chunked negative sampling, we accept "chunk-head" for corrupting head
// nodes and "chunk-tail" for corrupting tail nodes.
if (neg_mode_.substr(0, 5) == "chunk") {
NegSubgraph neg_subg = genChunkedNegEdgeSubgraph(subg, neg_mode_.substr(6),
neg_sample_size_,
exclude_positive_,
check_false_neg_);
negative_subgs[i] = ConvertRef(neg_subg);
} else if (neg_mode_ == "head" || neg_mode_ == "tail") {
NegSubgraph neg_subg = genNegEdgeSubgraph(subg, neg_mode_,
neg_sample_size_,
exclude_positive_,
check_false_neg_);
negative_subgs[i] = ConvertRef(neg_subg);
}
}
}
});
if (neg_mode_.size() > 0) {
positive_subgs.insert(positive_subgs.end(), negative_subgs.begin(), negative_subgs.end());
}
......
......@@ -9,6 +9,7 @@
#include <dgl/base_heterograph.h>
#include <dgl/array.h>
#include <dgl/runtime/parallel_for.h>
#include <tuple>
#include <utility>
#include "randomwalks_impl.h"
......@@ -47,25 +48,26 @@ std::pair<IdArray, IdArray> GenericRandomWalk(
IdxType *traces_data = traces.Ptr<IdxType>();
IdxType *eids_data = eids.Ptr<IdxType>();
#pragma omp parallel for
for (int64_t seed_id = 0; seed_id < num_seeds; ++seed_id) {
int64_t i;
dgl_id_t curr = seed_data[seed_id];
traces_data[seed_id * trace_length] = curr;
for (i = 0; i < max_num_steps; ++i) {
const auto &succ = step(traces_data + seed_id * max_num_steps, curr, i);
traces_data[seed_id * trace_length + i + 1] = curr = std::get<0>(succ);
eids_data[seed_id * max_num_steps + i] = std::get<1>(succ);
if (std::get<2>(succ))
break;
runtime::parallel_for(0, num_seeds, [&](size_t seed_begin, size_t seed_end) {
for (auto seed_id = seed_begin; seed_id < seed_end; seed_id++) {
int64_t i;
dgl_id_t curr = seed_data[seed_id];
traces_data[seed_id * trace_length] = curr;
for (i = 0; i < max_num_steps; ++i) {
const auto &succ = step(traces_data + seed_id * max_num_steps, curr, i);
traces_data[seed_id * trace_length + i + 1] = curr = std::get<0>(succ);
eids_data[seed_id * max_num_steps + i] = std::get<1>(succ);
if (std::get<2>(succ))
break;
}
for (; i < max_num_steps; ++i) {
traces_data[seed_id * trace_length + i + 1] = -1;
eids_data[seed_id * max_num_steps + i] = -1;
}
}
for (; i < max_num_steps; ++i) {
traces_data[seed_id * trace_length + i + 1] = -1;
eids_data[seed_id * max_num_steps + i] = -1;
}
}
});
return std::make_pair(traces, eids);
}
......