Unverified commit f5183820 authored by Tomasz Patejko, committed by GitHub

[Performance, CPU] Rewriting OpenMP pragmas into parallel_for (#3171)

* [CPU, Parallel] Rewriting omp pragmas with parallel_for

* [CPU, Parallel] Decrease number of calls to task function

* [CPU, Parallel] Modify calls to new interface of parallel_for
parent 21a40279
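The rewrite applies one pattern throughout: an element-wise `#pragma omp parallel for` loop becomes a call to dgl::runtime::parallel_for, whose functor now receives a half-open sub-range [b, e) and is therefore invoked once per chunk rather than once per element (the "decrease number of calls to task function" item above). A minimal sketch of the pattern, assuming only the parallel_for signature shown in the first hunk below; the Scale kernel and its buffers are hypothetical:

#include <cstddef>
#include <dgl/runtime/parallel_for.h>

// Hypothetical element-wise kernel, used only to illustrate the rewrite.
void Scale(const float* in, float* out, std::size_t n, float alpha) {
  // Before: OpenMP loop over single elements.
  // #pragma omp parallel for
  // for (int64_t i = 0; i < n; ++i) out[i] = alpha * in[i];

  // After: the lambda runs once per chunk and iterates its [b, e) sub-range.
  dgl::runtime::parallel_for(0, n, [=](std::size_t b, std::size_t e) {
    for (auto i = b; i < e; ++i)
      out[i] = alpha * in[i];
  });
}

Hunks that previously relied on firstprivate(set) or firstprivate(map) instead declare the scratch hash set or map inside the lambda, so each worker keeps its own copy without an OpenMP clause.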
......@@ -74,14 +74,12 @@ void parallel_for(
auto chunk_size = divup((end - begin), num_threads);
auto begin_tid = begin + tid * chunk_size;
if (begin_tid < end) {
for (auto i = begin_tid; i < std::min(end, chunk_size + begin_tid); i++) {
f(i);
}
auto end_tid = std::min(end, chunk_size + begin_tid);
f(begin_tid, end_tid);
}
}
#else
for (auto i = begin; i < end; i++)
f(i);
f(begin, end);
#endif
}
......@@ -98,7 +96,7 @@ void parallel_for(
const size_t begin,
const size_t end,
F&& f) {
parallel_for(begin, end, default_grain_size(), f);
parallel_for(begin, end, default_grain_size(), std::forward<F>(f));
}
} // namespace runtime
} // namespace dgl
......
......@@ -5,11 +5,13 @@
*/
#include <dgl/array.h>
#include <dgl/runtime/ndarray.h>
#include <dgl/runtime/parallel_for.h>
#include <numeric>
#include "../arith.h"
namespace dgl {
using runtime::NDArray;
using runtime::parallel_for;
namespace aten {
namespace impl {
......@@ -51,8 +53,7 @@ IdArray BinaryElewise(IdArray lhs, IdArray rhs) {
IdType* ret_data = static_cast<IdType*>(ret->data);
// TODO(BarclayII): this usually incurs lots of overhead in thread spawning, scheduling,
// etc., especially since the workload is very light. Need to replace with parallel_for.
// #pragma omp parallel for
for (int64_t i = 0; i < lhs->shape[0]; ++i) {
for (size_t i = 0; i < lhs->shape[0]; i++) {
ret_data[i] = Op::Call(lhs_data[i], rhs_data[i]);
}
return ret;
......@@ -88,8 +89,7 @@ IdArray BinaryElewise(IdArray lhs, IdType rhs) {
IdType* ret_data = static_cast<IdType*>(ret->data);
// TODO(BarclayII): this usually incurs lots of overhead in thread spawning, scheduling,
// etc., especially since the workload is very light. Need to replace with parallel_for.
// #pragma omp parallel for
for (int64_t i = 0; i < lhs->shape[0]; ++i) {
for (size_t i = 0; i < lhs->shape[0]; i++) {
ret_data[i] = Op::Call(lhs_data[i], rhs);
}
return ret;
......@@ -125,8 +125,7 @@ IdArray BinaryElewise(IdType lhs, IdArray rhs) {
IdType* ret_data = static_cast<IdType*>(ret->data);
// TODO(BarclayII): this usually incurs lots of overhead in thread spawning, scheduling,
// etc., especially since the workload is very light. Need to replace with parallel_for.
// #pragma omp parallel for
for (int64_t i = 0; i < rhs->shape[0]; ++i) {
for (size_t i = 0; i < rhs->shape[0]; i++) {
ret_data[i] = Op::Call(lhs, rhs_data[i]);
}
return ret;
......@@ -162,8 +161,7 @@ IdArray UnaryElewise(IdArray lhs) {
IdType* ret_data = static_cast<IdType*>(ret->data);
// TODO(BarclayII): this usually incurs lots of overhead in thread spawning, scheduling,
// etc., especially since the workload is very light. Need to replace with parallel_for.
// #pragma omp parallel for
for (int64_t i = 0; i < lhs->shape[0]; ++i) {
for (size_t i = 0; i < lhs->shape[0]; i++) {
ret_data[i] = Op::Call(lhs_data[i]);
}
return ret;
......
......@@ -4,11 +4,13 @@
* \brief Array index select CPU implementation
*/
#include <dgl/array.h>
#include <dgl/runtime/parallel_for.h>
#include <tuple>
#include <utility>
namespace dgl {
using runtime::NDArray;
using runtime::parallel_for;
namespace aten {
namespace impl {
......@@ -29,11 +31,12 @@ std::pair<NDArray, IdArray> ConcatSlices(NDArray array, IdArray lengths) {
NDArray concat = NDArray::Empty({total_length}, array->dtype, array->ctx);
DType *concat_data = static_cast<DType *>(concat->data);
#pragma omp parallel for
for (int64_t i = 0; i < rows; ++i) {
parallel_for(0, rows, [=](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
for (int64_t j = 0; j < length_data[i]; ++j)
concat_data[offsets_data[i] + j] = array_data[i * stride + j];
}
});
return std::make_pair(concat, offsets);
}
......@@ -56,8 +59,8 @@ std::tuple<NDArray, IdArray, IdArray> Pack(NDArray array, DType pad_value) {
IdArray length = NewIdArray(rows, array->ctx);
int64_t *length_data = static_cast<int64_t *>(length->data);
#pragma omp parallel for
for (int64_t i = 0; i < rows; ++i) {
parallel_for(0, rows, [=](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
int64_t j;
for (j = 0; j < cols; ++j) {
const DType val = array_data[i * cols + j];
......@@ -66,6 +69,7 @@ std::tuple<NDArray, IdArray, IdArray> Pack(NDArray array, DType pad_value) {
}
length_data[i] = j;
}
});
auto ret = ConcatSlices<XPU, DType, int64_t>(array, length);
return std::make_tuple(ret.first, length, ret.second);
......
......@@ -4,6 +4,7 @@
* \brief Array scatter CPU implementation
*/
#include <dgl/array.h>
#include <dgl/runtime/parallel_for.h>
namespace dgl {
using runtime::NDArray;
......@@ -39,9 +40,11 @@ void Scatter_(IdArray index, NDArray value, NDArray out) {
const IdType* idx = index.Ptr<IdType>();
const DType* val = value.Ptr<DType>();
DType* outd = out.Ptr<DType>();
#pragma omp parallel for
for (int64_t i = 0; i < len; ++i)
runtime::parallel_for(0, len, [&](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
outd[idx[i]] = val[i];
}
});
}
template void Scatter_<kDLCPU, int32_t, int32_t>(IdArray, NDArray, NDArray);
......
......@@ -4,6 +4,7 @@
* \brief Retrieve entries of a CSR matrix
*/
#include <dgl/array.h>
#include <dgl/runtime/parallel_for.h>
#include <vector>
#include <unordered_set>
#include <numeric>
......@@ -12,7 +13,7 @@
namespace dgl {
using runtime::NDArray;
using runtime::parallel_for;
namespace aten {
namespace impl {
......@@ -70,8 +71,8 @@ NDArray CSRGetData(
if (csr.sorted) {
// use binary search on each row
#pragma omp parallel for
for (int64_t p = 0; p < retlen; ++p) {
parallel_for(0, retlen, [&](size_t b, size_t e) {
for (auto p = b; p < e; ++p) {
const IdType row_id = row_data[p * row_stride], col_id = col_data[p * col_stride];
CHECK(row_id >= 0 && row_id < csr.num_rows) << "Invalid row index: " << row_id;
CHECK(col_id >= 0 && col_id < csr.num_cols) << "Invalid col index: " << col_id;
......@@ -84,10 +85,11 @@ NDArray CSRGetData(
ret_data[p] = return_eids ? eid : weight_data[eid];
}
}
});
} else {
// linear search on each row
#pragma omp parallel for
for (int64_t p = 0; p < retlen; ++p) {
parallel_for(0, retlen, [&](size_t b, size_t e) {
for (auto p = b; p < e; ++p) {
const IdType row_id = row_data[p * row_stride], col_id = col_data[p * col_stride];
CHECK(row_id >= 0 && row_id < csr.num_rows) << "Invalid row index: " << row_id;
CHECK(col_id >= 0 && col_id < csr.num_cols) << "Invalid col index: " << col_id;
......@@ -99,6 +101,7 @@ NDArray CSRGetData(
}
}
}
});
}
return ret;
}
......
......@@ -5,6 +5,7 @@
*/
#include <dgl/array.h>
#include <dgl/runtime/parallel_for.h>
#include <parallel_hashmap/phmap.h>
#include <vector>
#include "array_utils.h"
......@@ -12,6 +13,7 @@
namespace dgl {
using dgl::runtime::NDArray;
using dgl::runtime::parallel_for;
namespace aten {
......@@ -26,10 +28,9 @@ void CountNNZPerRow(
const IdType* B_indices,
IdType* C_indptr_data,
int64_t M) {
parallel_for(0, M, [=](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
phmap::flat_hash_set<IdType> set;
#pragma omp parallel for firstprivate(set)
for (int64_t i = 0; i < M; ++i) {
set.clear();
for (IdType u = A_indptr[i]; u < A_indptr[i + 1]; ++u) {
IdType w = A_indices[u];
for (IdType v = B_indptr[w]; v < B_indptr[w + 1]; ++v)
......@@ -37,6 +38,7 @@ void CountNNZPerRow(
}
C_indptr_data[i] = set.size();
}
});
}
template <typename IdType>
......@@ -66,10 +68,9 @@ void ComputeIndicesAndData(
IdType* C_indices_data,
DType* C_weights_data,
int64_t M) {
parallel_for(0, M, [=](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
phmap::flat_hash_map<IdType, DType> map;
#pragma omp parallel for firstprivate(map)
for (int64_t i = 0; i < M; ++i) {
map.clear();
for (IdType u = A_indptr[i]; u < A_indptr[i + 1]; ++u) {
IdType w = A_indices[u];
DType vA = A_data[A_eids ? A_eids[u] : u];
......@@ -87,6 +88,7 @@ void ComputeIndicesAndData(
++v;
}
}
});
}
}; // namespace
......
......@@ -4,6 +4,7 @@
* \brief CSR sorting
*/
#include <dgl/array.h>
#include <dgl/runtime/parallel_for.h>
#include <numeric>
#include <algorithm>
#include <vector>
......@@ -48,16 +49,14 @@ void CSRSort_(CSRMatrix* csr) {
csr->data = aten::Range(0, nnz, csr->indptr->dtype.bits, csr->indptr->ctx);
}
IdType* eid_data = static_cast<IdType*>(csr->data->data);
#pragma omp parallel
{
std::vector<ShufflePair> reorder_vec;
#pragma omp for
for (int64_t row = 0; row < num_rows; row++) {
runtime::parallel_for(0, num_rows, [=](size_t b, size_t e) {
for (auto row = b; row < e; ++row) {
const int64_t num_cols = indptr_data[row + 1] - indptr_data[row];
std::vector<ShufflePair> reorder_vec(num_cols);
IdType *col = indices_data + indptr_data[row];
IdType *eid = eid_data + indptr_data[row];
reorder_vec.resize(num_cols);
for (int64_t i = 0; i < num_cols; i++) {
reorder_vec[i].first = col[i];
reorder_vec[i].second = eid[i];
......@@ -71,7 +70,8 @@ void CSRSort_(CSRMatrix* csr) {
eid[i] = reorder_vec[i].second;
}
}
}
});
csr->sorted = true;
}
......@@ -101,8 +101,8 @@ std::pair<CSRMatrix, NDArray> CSRSortByTag(
auto out_indices_data = static_cast<IdType *>(output.indices->data);
auto out_eid_data = static_cast<IdType *>(output.data->data);
#pragma omp parallel for
for (IdType src = 0 ; src < num_rows ; ++src) {
runtime::parallel_for(0, num_rows, [&](size_t b, size_t e) {
for (auto src = b; src < e; ++src) {
const IdType start = indptr_data[src];
const IdType end = indptr_data[src + 1];
......@@ -132,6 +132,7 @@ std::pair<CSRMatrix, NDArray> CSRSortByTag(
out_eid_data[start + offset] = eid;
}
}
});
output.sorted = false;
return std::make_pair(output, tag_pos);
}
......
......@@ -5,6 +5,7 @@
*/
#include <dgl/array.h>
#include <dgl/runtime/parallel_for.h>
#include <parallel_hashmap/phmap.h>
#include <vector>
#include "array_utils.h"
......@@ -25,16 +26,17 @@ void CountNNZPerRow(
IdType* C_indptr_data,
int64_t M) {
int64_t n = A_indptr.size();
runtime::parallel_for(0, M, [=](size_t b, size_t e) {
for (size_t i = b; i < e; ++i) {
phmap::flat_hash_set<IdType> set;
#pragma omp parallel for firstprivate(set)
for (IdType i = 0; i < M; ++i) {
set.clear();
for (int64_t k = 0; k < n; ++k) {
for (IdType u = A_indptr[k][i]; u < A_indptr[k][i + 1]; ++u)
set.insert(A_indices[k][u]);
}
C_indptr_data[i] = set.size();
}
});
}
template <typename IdType>
......@@ -61,10 +63,9 @@ void ComputeIndicesAndData(
DType* C_weights_data,
int64_t M) {
int64_t n = A_indptr.size();
runtime::parallel_for(0, M, [=](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
phmap::flat_hash_map<IdType, DType> map;
#pragma omp parallel for firstprivate(map)
for (int64_t i = 0; i < M; ++i) {
map.clear();
for (int64_t k = 0; k < n; ++k) {
for (IdType u = A_indptr[k][i]; u < A_indptr[k][i + 1]; ++u) {
IdType kA = A_indices[k][u];
......@@ -72,7 +73,6 @@ void ComputeIndicesAndData(
map[kA] += vA;
}
}
IdType j = C_indptr_data[i];
for (auto it : map) {
C_indices_data[j] = it.first;
......@@ -80,6 +80,7 @@ void ComputeIndicesAndData(
++j;
}
}
});
}
}; // namespace
......
......@@ -4,7 +4,7 @@
* \brief COO sorting
*/
#include <dgl/array.h>
#include <dgl/runtime/parallel_for.h>
#include <numeric>
#include <algorithm>
#include <vector>
......@@ -54,6 +54,7 @@ CSRMatrix UnionCsr(const std::vector<CSRMatrix>& csrs) {
for (int64_t i = 1; i <= csrs[0].num_rows; ++i) {
std::vector<int64_t> indices_off;
res_indptr[i] = indptr_data[0][i];
indices_off.push_back(indptr_data[0][i-1]);
for (size_t j = 1; j < csrs.size(); ++j) {
res_indptr[i] += indptr_data[j][i];
......@@ -74,7 +75,6 @@ CSRMatrix UnionCsr(const std::vector<CSRMatrix>& csrs) {
}
} // for check out of bound
} // for
res_indices[off] = min;
res_data[off] = data_data[min_idx][indices_off[min_idx]];
indices_off[min_idx] += 1;
......
......@@ -147,6 +147,7 @@ COOMatrix CSRRowWisePick(CSRMatrix mat, IdArray rows,
global_prefix[t+1] += global_prefix[t];
}
}
#pragma omp barrier
const IdxType thread_offset = global_prefix[thread_id];
......
......@@ -8,6 +8,7 @@
#include <dgl/array.h>
#include <dgl/bcast.h>
#include <dgl/runtime/parallel_for.h>
#include "../selector.h"
namespace dgl {
......@@ -40,8 +41,8 @@ void SDDMMCsr(const BcastOff& bcast,
rhs_dim = bcast.rhs_len,
reduce_size = bcast.reduce_size;
DType* O = out.Ptr<DType>();
#pragma omp parallel for
for (IdType rid = 0; rid < csr.num_rows; ++rid) {
runtime::parallel_for(0, csr.num_rows, [=](IdType b, IdType e) {
for (auto rid = b; rid < e; ++rid) {
const IdType row_start = indptr[rid], row_end = indptr[rid + 1];
for (IdType j = row_start; j < row_end; ++j) {
const IdType cid = indices[j];
......@@ -50,14 +51,17 @@ void SDDMMCsr(const BcastOff& bcast,
for (int64_t k = 0; k < dim; ++k) {
const int64_t lhs_add = bcast.use_bcast ? bcast.lhs_offset[k] : k;
const int64_t rhs_add = bcast.use_bcast ? bcast.rhs_offset[k] : k;
const DType* lhs_off = Op::use_lhs?
X + Selector<LhsTarget>::Call(rid, eid, cid) * lhs_dim + lhs_add * reduce_size : nullptr;
const DType* rhs_off = Op::use_rhs?
Y + Selector<RhsTarget>::Call(rid, eid, cid) * rhs_dim + rhs_add * reduce_size : nullptr;
const DType* lhs_off = Op::use_lhs
? X + Selector<LhsTarget>::Call(rid, eid, cid) * lhs_dim + lhs_add * reduce_size
: nullptr;
const DType* rhs_off = Op::use_rhs
? Y + Selector<RhsTarget>::Call(rid, eid, cid) * rhs_dim + rhs_add * reduce_size
: nullptr;
out_off[k] = Op::Call(lhs_off, rhs_off, reduce_size);
}
}
}
});
}
/*!
......@@ -86,9 +90,8 @@ void SDDMMCoo(const BcastOff& bcast,
rhs_dim = bcast.rhs_len,
reduce_size = bcast.reduce_size;
DType* O = out.Ptr<DType>();
const int64_t nnz = coo.row->shape[0];
#pragma omp parallel for
for (IdType i = 0; i < nnz; ++i) {
for (int64_t i = 0; i < coo.row->shape[0]; ++i) {
const IdType rid = row[i];
const IdType cid = col[i];
const IdType eid = has_idx? edges[i] : i;
......
......@@ -7,6 +7,7 @@
#define DGL_ARRAY_CPU_SEGMENT_REDUCE_H_
#include <dgl/array.h>
#include <dgl/runtime/parallel_for.h>
namespace dgl {
namespace aten {
......@@ -27,14 +28,15 @@ void SegmentSum(NDArray feat, NDArray offsets, NDArray out) {
const DType* feat_data = feat.Ptr<DType>();
const IdType* offsets_data = offsets.Ptr<IdType>();
DType *out_data = out.Ptr<DType>();
#pragma omp parallel for
for (int i = 0; i < n; ++i) {
runtime::parallel_for(0, n, [=](int b, int e) {
for (auto i = b; i < e; ++i) {
for (IdType j = offsets_data[i]; j < offsets_data[i + 1]; ++j) {
for (int k = 0; k < dim; ++k) {
out_data[i * dim + k] += feat_data[j * dim + k];
}
}
}
});
}
/*!
......@@ -58,8 +60,8 @@ void SegmentCmp(NDArray feat, NDArray offsets,
IdType *arg_data = arg.Ptr<IdType>();
std::fill(out_data, out_data + out.NumElements(), Cmp::zero);
std::fill(arg_data, arg_data + arg.NumElements(), -1);
#pragma omp parallel for
for (int i = 0; i < n; ++i) {
runtime::parallel_for(0, n, [=](int b, int e) {
for (auto i = b; i < e; ++i) {
for (IdType j = offsets_data[i]; j < offsets_data[i + 1]; ++j) {
for (int k = 0; k < dim; ++k) {
const DType val = feat_data[j * dim + k];
......@@ -70,6 +72,7 @@ void SegmentCmp(NDArray feat, NDArray offsets,
}
}
}
});
}
/*!
......@@ -114,14 +117,15 @@ void BackwardSegmentCmp(NDArray feat, NDArray arg, NDArray out) {
const DType* feat_data = feat.Ptr<DType>();
const IdType* arg_data = arg.Ptr<IdType>();
DType* out_data = out.Ptr<DType>();
#pragma omp parallel for
for (int i = 0; i < n; ++i) {
runtime::parallel_for(0, n, [=](int b, int e) {
for (auto i = b; i < e; ++i) {
for (int k = 0; k < dim; ++k) {
int write_row = arg_data[i * dim + k];
if (write_row >= 0)
out_data[write_row * dim + k] = feat_data[i * dim + k];
}
}
});
}
} // namespace cpu
......
......@@ -4,6 +4,7 @@
* \brief CPU implementation of COO sparse matrix operators
*/
#include <dmlc/omp.h>
#include <dgl/runtime/parallel_for.h>
#include <vector>
#include <unordered_set>
#include <unordered_map>
......@@ -14,6 +15,7 @@
namespace dgl {
using runtime::NDArray;
using runtime::parallel_for;
namespace aten {
namespace impl {
......@@ -55,12 +57,13 @@ NDArray COOIsNonZero(COOMatrix coo, NDArray row, NDArray col) {
const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1;
const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1;
const int64_t kmax = std::max(rowlen, collen);
#pragma omp parallel for
for (int64_t k = 0; k < kmax; ++k) {
parallel_for(0, kmax, [=](size_t b, size_t e) {
for (auto k = b; k < e; ++k) {
int64_t i = row_stride * k;
int64_t j = col_stride * k;
rst_data[k] = COOIsNonZero<XPU, IdType>(coo, row_data[i], col_data[j])? 1 : 0;
}
});
return rst;
}
......@@ -114,8 +117,9 @@ NDArray COOGetRowNNZ(COOMatrix coo, NDArray rows) {
NDArray rst = NDArray::Empty({len}, rows->dtype, rows->ctx);
IdType* rst_data = static_cast<IdType*>(rst->data);
#pragma omp parallel for
for (int64_t i = 0; i < len; ++i)
for (int64_t i = 0; i < len; ++i) {
rst_data[i] = COOGetRowNNZ<XPU, IdType>(coo, vid_data[i]);
}
return rst;
}
......@@ -178,8 +182,8 @@ IdArray COOGetData(COOMatrix coo, IdArray rows, IdArray cols) {
// the choice.
if (coo.row_sorted) {
#pragma omp parallel for
for (int64_t p = 0; p < retlen; ++p) {
parallel_for(0, retlen, [&](size_t b, size_t e) {
for (auto p = b; p < e; ++p) {
const IdType row_id = row_data[p * row_stride], col_id = col_data[p * col_stride];
auto it = std::lower_bound(coo_row, coo_row + nnz, row_id);
for (; it < coo_row + nnz && *it == row_id; ++it) {
......@@ -190,6 +194,7 @@ IdArray COOGetData(COOMatrix coo, IdArray rows, IdArray cols) {
}
}
}
});
} else {
#pragma omp parallel for
for (int64_t p = 0; p < retlen; ++p) {
......@@ -328,11 +333,9 @@ CSRMatrix COOToCSR(COOMatrix coo) {
IdType * const fill_data = data ? nullptr : static_cast<IdType*>(coo.data->data);
if (NNZ > 0) {
#pragma omp parallel
{
const int num_threads = omp_get_num_threads();
const int thread_id = omp_get_thread_num();
auto num_threads = omp_get_max_threads();
parallel_for(0, num_threads, [&](int b, int e) {
for (auto thread_id = b; thread_id < e; ++thread_id) {
// We partition the set the of non-zeros among the threads
const int64_t nz_chunk = (NNZ+num_threads-1)/num_threads;
const int64_t nz_start = thread_id*nz_chunk;
......@@ -389,6 +392,7 @@ CSRMatrix COOToCSR(COOMatrix coo) {
}
}
}
});
} else {
std::fill(Bp, Bp+N+1, 0);
}
......@@ -627,11 +631,12 @@ COOMatrix COOReorder(COOMatrix coo, runtime::NDArray new_row_id_arr,
IdType *out_row = static_cast<IdType*>(out_row_arr->data);
IdType *out_col = static_cast<IdType*>(out_col_arr->data);
#pragma omp parallel for
for (int64_t i = 0; i < nnz; i++) {
parallel_for(0, nnz, [=](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
out_row[i] = new_row_ids[in_rows[i]];
out_col[i] = new_col_ids[in_cols[i]];
}
});
return COOMatrix(num_rows, num_cols, out_row_arr, out_col_arr, out_data_arr);
}
......
......@@ -4,6 +4,7 @@
* \brief CSR matrix operator CPU implementation
*/
#include <dgl/array.h>
#include <dgl/runtime/parallel_for.h>
#include <vector>
#include <unordered_set>
#include <numeric>
......@@ -12,6 +13,7 @@
namespace dgl {
using runtime::NDArray;
using runtime::parallel_for;
namespace aten {
namespace impl {
......@@ -491,11 +493,12 @@ CSRMatrix CSRReorder(CSRMatrix csr, runtime::NDArray new_row_id_arr,
// Compute the length of rows for the new matrix.
std::vector<IdType> new_row_lens(num_rows, -1);
#pragma omp parallel for
for (int64_t i = 0; i < num_rows; i++) {
parallel_for(0, num_rows, [=, &new_row_lens](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
int64_t new_row_id = new_row_ids[i];
new_row_lens[new_row_id] = in_indptr[i + 1] - in_indptr[i];
}
});
// Compute the starting location of each row in the new matrix.
out_indptr[0] = 0;
// This is sequential. It should be pretty fast.
......@@ -506,8 +509,8 @@ CSRMatrix CSRReorder(CSRMatrix csr, runtime::NDArray new_row_id_arr,
CHECK_EQ(out_indptr[num_rows], nnz);
// Copy indieces and data with the new order.
// Here I iterate rows in the order of the old matrix.
#pragma omp parallel for
for (int64_t i = 0; i < num_rows; i++) {
parallel_for(0, num_rows, [=](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
const IdType *in_row = in_indices + in_indptr[i];
const IdType *in_row_data = in_data + in_indptr[i];
......@@ -523,6 +526,7 @@ CSRMatrix CSRReorder(CSRMatrix csr, runtime::NDArray new_row_id_arr,
}
// TODO(zhengda) maybe we should sort the column indices.
}
});
return CSRMatrix(num_rows, num_cols,
out_indptr_arr, out_indices_arr, out_data_arr);
}
......
......@@ -8,6 +8,7 @@
#include <dgl/array.h>
#include <dgl/bcast.h>
#include <dgl/runtime/parallel_for.h>
#include <algorithm>
#include <limits>
#include <memory>
......@@ -46,8 +47,9 @@ void SpMMSumCsrXbyak(dgl::ElemWiseAddUpdate<Op>* cpu_spec, const BcastOff& bcast
const IdType* indices = csr.indices.Ptr<IdType>();
const IdType* edges = csr.data.Ptr<IdType>();
int64_t dim = bcast.out_len, lhs_dim = bcast.lhs_len, rhs_dim = bcast.rhs_len;
#pragma omp parallel for
for (IdType rid = 0; rid < csr.num_rows; ++rid) {
runtime::parallel_for(0, csr.num_rows, [&](size_t b, size_t e) {
for (auto rid = b; rid < e; ++rid) {
const IdType row_start = indptr[rid], row_end = indptr[rid + 1];
DType* out_off = O + rid * dim;
for (IdType j = row_start; j < row_end; ++j) {
......@@ -56,6 +58,7 @@ void SpMMSumCsrXbyak(dgl::ElemWiseAddUpdate<Op>* cpu_spec, const BcastOff& bcast
cpu_spec->run(out_off, X + cid * lhs_dim, W + eid * rhs_dim, dim);
}
}
});
}
#endif // USE_AVX
#endif // _WIN32
......@@ -79,8 +82,8 @@ void SpMMSumCsrNaive(const BcastOff& bcast, const CSRMatrix& csr, const DType* X
const IdType* indices = csr.indices.Ptr<IdType>();
const IdType* edges = csr.data.Ptr<IdType>();
int64_t dim = bcast.out_len, lhs_dim = bcast.lhs_len, rhs_dim = bcast.rhs_len;
#pragma omp parallel for
for (IdType rid = 0; rid < csr.num_rows; ++rid) {
runtime::parallel_for(0, csr.num_rows, [&](size_t b, size_t e) {
for (auto rid = b; rid < e; ++rid) {
const IdType row_start = indptr[rid], row_end = indptr[rid + 1];
DType* out_off = O + rid * dim;
for (IdType j = row_start; j < row_end; ++j) {
......@@ -97,6 +100,7 @@ void SpMMSumCsrNaive(const BcastOff& bcast, const CSRMatrix& csr, const DType* X
}
}
}
});
}
/*!
......@@ -270,8 +274,8 @@ void SpMMCmpCsr(const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat,
#endif // USE_AVX
#endif // _WIN32
#pragma omp parallel for
for (IdType rid = 0; rid < csr.num_rows; ++rid) {
runtime::parallel_for(0, csr.num_rows, [&](size_t b, size_t e) {
for (auto rid = b; rid < e; ++rid) {
const IdType row_start = indptr[rid], row_end = indptr[rid + 1];
DType* out_off = O + rid * dim;
IdType* argx_off = argX + rid * dim;
......@@ -295,6 +299,7 @@ void SpMMCmpCsr(const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat,
}
}
}
});
#if !defined(_WIN32)
#ifdef USE_AVX
#ifdef USE_LIBXSMM
......
......@@ -8,6 +8,7 @@
#include <dgl/immutable_graph.h>
#include <dgl/packed_func_ext.h>
#include <dgl/runtime/container.h>
#include <dgl/runtime/parallel_for.h>
#include <algorithm>
#include "../c_api_common.h"
......@@ -261,8 +262,8 @@ IdArray GraphOp::MapParentIdToSubgraphId(IdArray parent_vids, IdArray query) {
const bool is_sorted = std::is_sorted(parent_data, parent_data + parent_len);
if (is_sorted) {
#pragma omp parallel for
for (int64_t i = 0; i < query_len; i++) {
runtime::parallel_for(0, query_len, [&](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
const dgl_id_t id = query_data[i];
const auto it = std::find(parent_data, parent_data + parent_len, id);
// If the vertex Id doesn't exist, the vid in the subgraph is -1.
......@@ -272,14 +273,15 @@ IdArray GraphOp::MapParentIdToSubgraphId(IdArray parent_vids, IdArray query) {
rst_data[i] = -1;
}
}
});
} else {
std::unordered_map<dgl_id_t, dgl_id_t> parent_map;
for (int64_t i = 0; i < parent_len; i++) {
const dgl_id_t id = parent_data[i];
parent_map[id] = i;
}
#pragma omp parallel for
for (int64_t i = 0; i < query_len; i++) {
runtime::parallel_for(0, query_len, [&](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
const dgl_id_t id = query_data[i];
auto it = parent_map.find(id);
// If the vertex Id doesn't exist, the vid in the subgraph is -1.
......@@ -289,6 +291,7 @@ IdArray GraphOp::MapParentIdToSubgraphId(IdArray parent_vids, IdArray query) {
rst_data[i] = -1;
}
}
});
}
return rst;
}
......@@ -567,14 +570,15 @@ DGL_REGISTER_GLOBAL("transform._CAPI_DGLPartitionWithHalo")
graph_ptr->GetInCSR();
std::vector<std::shared_ptr<HaloSubgraph> > subgs(max_part_id + 1);
int num_partitions = part_nodes.size();
#pragma omp parallel for
for (int i = 0; i < num_partitions; i++) {
runtime::parallel_for(0, num_partitions, [&](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
auto nodes = aten::VecToIdArray(part_nodes[i]);
HaloSubgraph subg = GraphOp::GetSubgraphWithHalo(graph_ptr, nodes, num_hops);
std::shared_ptr<HaloSubgraph> subg_ptr(new HaloSubgraph(subg));
int part_id = part_ids[i];
subgs[part_id] = subg_ptr;
}
});
List<SubgraphRef> ret_list;
for (size_t i = 0; i < subgs.size(); i++) {
ret_list.push_back(SubgraphRef(subgs[i]));
......@@ -732,8 +736,8 @@ IdArray MapIds(IdArray ids, IdArray range_starts, IdArray range_ends, IdArray ty
const IdType *typed_map_data = static_cast<IdType *>(typed_map->data);
IdType *types_data = static_cast<IdType *>(ret->data);
IdType *per_type_ids_data = static_cast<IdType *>(ret->data) + num_ids;
#pragma omp parallel for
for (int64_t i = 0; i < ids->shape[0]; i++) {
runtime::parallel_for(0, ids->shape[0], [&](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
IdType id = ids_data[i];
auto it = std::lower_bound(range_end_data, range_end_data + num_ranges, id);
// The range must exist.
......@@ -750,6 +754,7 @@ IdArray MapIds(IdArray ids, IdArray range_starts, IdArray range_ends, IdArray ty
+ typed_map_data[num_parts * type_id + part_id - 1];
}
}
});
return ret;
}
......
......@@ -8,6 +8,7 @@
#include <dgl/packed_func_ext.h>
#include <dgl/immutable_graph.h>
#include <dgl/runtime/container.h>
#include <dgl/runtime/parallel_for.h>
#include <set>
#include "../c_api_common.h"
......@@ -629,14 +630,19 @@ DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroCreateFormat")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
HeteroGraphRef hg = args[0];
dgl_format_code_t code = hg->GetRelationGraph(0)->GetAllowedFormats();
#if !defined(DGL_USE_CUDA)
#pragma omp parallel for
#endif
for (int64_t etype = 0; etype < hg->NumEdgeTypes(); ++etype) {
auto get_format_f = [&](size_t etype_b, size_t etype_e) {
for (auto etype = etype_b; etype < etype_e; ++etype) {
auto bg = std::dynamic_pointer_cast<UnitGraph>(hg->GetRelationGraph(etype));
for (auto format : CodeToSparseFormats(code))
bg->GetFormat(format);
}
};
#if !(defined(DGL_USE_CUDA))
runtime::parallel_for(0, hg->NumEdgeTypes(), get_format_f);
#else
get_format_f(0, hg->NumEdgeTypes());
#endif
});
DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroGetFormatGraph")
......
......@@ -9,6 +9,7 @@
#include <dgl/runtime/container.h>
#include <dgl/runtime/ndarray.h>
#include <dgl/runtime/parallel_for.h>
#include <dgl/packed_func_ext.h>
#include <dgl/immutable_graph.h>
#include <dgl/nodeflow.h>
......@@ -829,8 +830,8 @@ DGL_REGISTER_GLOBAL("network._CAPI_FastPull")
char *return_data = new char[ID_size*row_size];
const int64_t local_ids_size = local_ids.size();
// Copy local data
#pragma omp parallel for
for (int64_t i = 0; i < local_ids_size; ++i) {
runtime::parallel_for(0, local_ids_size, [&](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
CHECK_GE(ID_size*row_size, local_ids_orginal[i] * row_size + row_size);
CHECK_GE(data_size, local_ids[i] * row_size + row_size);
CHECK_GE(local_ids[i], 0);
......@@ -838,6 +839,7 @@ DGL_REGISTER_GLOBAL("network._CAPI_FastPull")
local_data_char + local_ids[i] * row_size,
row_size);
}
});
// Recv remote message
for (int i = 0; i < msg_count; ++i) {
KVStoreMsg *kv_msg = recv_kv_message(receiver);
......
......@@ -9,6 +9,7 @@
#include <dgl/runtime/container.h>
#include <dgl/packed_func_ext.h>
#include <dgl/random.h>
#include <dgl/runtime/parallel_for.h>
#include <dmlc/omp.h>
#include <algorithm>
#include <cstdlib>
......@@ -850,8 +851,8 @@ std::vector<NodeFlow> NeighborSamplingImpl(const ImmutableGraphPtr gptr,
BuildCsr(*gptr, neigh_type);
// generate node flows
std::vector<NodeFlow> nflows(num_workers);
#pragma omp parallel for
for (int i = 0; i < num_workers; i++) {
runtime::parallel_for(0, num_workers, [&](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
// create per-worker seed nodes.
const int64_t start = (batch_start_id + i) * batch_size;
const int64_t end = std::min(start + batch_size, num_seeds);
......@@ -863,6 +864,7 @@ std::vector<NodeFlow> NeighborSamplingImpl(const ImmutableGraphPtr gptr,
gptr.get(), worker_seeds, neigh_type, num_hops, expand_factor,
add_self_loop, probability);
}
});
return nflows;
}
......@@ -977,8 +979,8 @@ DGL_REGISTER_GLOBAL("sampling._CAPI_LayerSampling")
BuildCsr(*gptr, neigh_type);
// generate node flows
std::vector<NodeFlow> nflows(num_workers);
#pragma omp parallel for
for (int i = 0; i < num_workers; i++) {
runtime::parallel_for(0, num_workers, [&](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
// create per-worker seed nodes.
const int64_t start = (batch_start_id + i) * batch_size;
const int64_t end = std::min(start + batch_size, num_seeds);
......@@ -989,6 +991,7 @@ DGL_REGISTER_GLOBAL("sampling._CAPI_LayerSampling")
nflows[i] = SamplerOp::LayerUniformSample(
gptr.get(), worker_seeds, neigh_type, layer_sizes);
}
});
*rv = List<NodeFlow>(nflows);
});
......@@ -1466,8 +1469,8 @@ public:
std::vector<SubgraphRef> positive_subgs(num_workers);
std::vector<SubgraphRef> negative_subgs(num_workers);
#pragma omp parallel for
for (int64_t i = 0; i < num_workers; i++) {
runtime::parallel_for(0, num_workers, [&](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
const int64_t start = (batch_curr_id_ + i) * batch_size_;
const int64_t end = std::min(start + batch_size_, num_seeds_);
const int64_t num_edges = end - start;
......@@ -1514,6 +1517,7 @@ public:
negative_subgs[i] = ConvertRef(neg_subg);
}
}
});
if (neg_mode_.size() > 0) {
positive_subgs.insert(positive_subgs.end(), negative_subgs.begin(), negative_subgs.end());
}
......
......@@ -9,6 +9,7 @@
#include <dgl/base_heterograph.h>
#include <dgl/array.h>
#include <dgl/runtime/parallel_for.h>
#include <tuple>
#include <utility>
#include "randomwalks_impl.h"
......@@ -47,8 +48,8 @@ std::pair<IdArray, IdArray> GenericRandomWalk(
IdxType *traces_data = traces.Ptr<IdxType>();
IdxType *eids_data = eids.Ptr<IdxType>();
#pragma omp parallel for
for (int64_t seed_id = 0; seed_id < num_seeds; ++seed_id) {
runtime::parallel_for(0, num_seeds, [&](size_t seed_begin, size_t seed_end) {
for (auto seed_id = seed_begin; seed_id < seed_end; seed_id++) {
int64_t i;
dgl_id_t curr = seed_data[seed_id];
traces_data[seed_id * trace_length] = curr;
......@@ -66,6 +67,7 @@ std::pair<IdArray, IdArray> GenericRandomWalk(
eids_data[seed_id * max_num_steps + i] = -1;
}
}
});
return std::make_pair(traces, eids);
}
......