Unverified Commit f5183820 authored by Tomasz Patejko, committed by GitHub

[Performance, CPU] Rewriting OpenMP pragmas into parallel_for (#3171)

* [CPU, Parallel] Rewriting omp pragmas with parallel_for

* [CPU, Parallel] Decrease number of calls to task function

* [CPU, Parallel] Modify calls to new interface of parallel_for
parent 21a40279
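The recurring change below replaces OpenMP loop pragmas with DGL's dgl::runtime::parallel_for, whose callback receives a half-open [begin, end) chunk instead of being invoked once per index. A minimal sketch of the before/after pattern (the loop body and the names n, in, out, Transform are illustrative only, not taken from any one file):

  // Before: OpenMP parallelizes the per-element loop directly.
  #pragma omp parallel for
  for (int64_t i = 0; i < n; ++i) {
    out[i] = Transform(in[i]);
  }

  // After: each worker receives a [b, e) chunk and loops over it itself.
  dgl::runtime::parallel_for(0, n, [&](size_t b, size_t e) {
    for (auto i = b; i < e; ++i) {
      out[i] = Transform(in[i]);
    }
  });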
@@ -74,14 +74,12 @@ void parallel_for(
       auto chunk_size = divup((end - begin), num_threads);
       auto begin_tid = begin + tid * chunk_size;
       if (begin_tid < end) {
-        for (auto i = begin_tid; i < std::min(end, chunk_size + begin_tid); i++) {
-          f(i);
-        }
+        auto end_tid = std::min(end, chunk_size + begin_tid);
+        f(begin_tid, end_tid);
       }
     }
 #else
-  for (auto i = begin; i < end; i++)
-    f(i);
+  f(begin, end);
 #endif
 }
@@ -98,7 +96,7 @@ void parallel_for(
     const size_t begin,
     const size_t end,
     F&& f) {
-  parallel_for(begin, end, default_grain_size(), f);
+  parallel_for(begin, end, default_grain_size(), std::forward<F>(f));
 }
 }  // namespace runtime
 }  // namespace dgl
...
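Note the interface change above: the chunked callback is also what the serial (#else) build sees, where f is invoked exactly once as f(begin, end), and the convenience overload now forwards the functor with std::forward. A hedged usage sketch against the updated header; the grain-size value and the ScaleInPlace name are made up for illustration:

  #include <dgl/runtime/parallel_for.h>
  #include <vector>

  void ScaleInPlace(std::vector<double>* v, double a) {
    // Ask the runtime not to split the range into chunks smaller than
    // 1024 elements; the lambda still receives a [b, e) chunk.
    dgl::runtime::parallel_for(0, v->size(), 1024, [&](size_t b, size_t e) {
      for (auto i = b; i < e; ++i)
        (*v)[i] *= a;
    });
  }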
@@ -5,11 +5,13 @@
  */
 #include <dgl/array.h>
 #include <dgl/runtime/ndarray.h>
+#include <dgl/runtime/parallel_for.h>
 #include <numeric>
 #include "../arith.h"
 namespace dgl {
 using runtime::NDArray;
+using runtime::parallel_for;
 namespace aten {
 namespace impl {
@@ -51,8 +53,7 @@ IdArray BinaryElewise(IdArray lhs, IdArray rhs) {
   IdType* ret_data = static_cast<IdType*>(ret->data);
   // TODO(BarclayII): this usually incurs lots of overhead in thread spawning, scheduling,
   // etc., especially since the workload is very light. Need to replace with parallel_for.
-  // #pragma omp parallel for
-  for (int64_t i = 0; i < lhs->shape[0]; ++i) {
+  for (size_t i = 0; i < lhs->shape[0]; i++) {
     ret_data[i] = Op::Call(lhs_data[i], rhs_data[i]);
   }
   return ret;
@@ -88,8 +89,7 @@ IdArray BinaryElewise(IdArray lhs, IdType rhs) {
   IdType* ret_data = static_cast<IdType*>(ret->data);
   // TODO(BarclayII): this usually incurs lots of overhead in thread spawning, scheduling,
   // etc., especially since the workload is very light. Need to replace with parallel_for.
-  // #pragma omp parallel for
-  for (int64_t i = 0; i < lhs->shape[0]; ++i) {
+  for (size_t i = 0; i < lhs->shape[0]; i++) {
     ret_data[i] = Op::Call(lhs_data[i], rhs);
   }
   return ret;
@@ -125,8 +125,7 @@ IdArray BinaryElewise(IdType lhs, IdArray rhs) {
   IdType* ret_data = static_cast<IdType*>(ret->data);
   // TODO(BarclayII): this usually incurs lots of overhead in thread spawning, scheduling,
   // etc., especially since the workload is very light. Need to replace with parallel_for.
-  // #pragma omp parallel for
-  for (int64_t i = 0; i < rhs->shape[0]; ++i) {
+  for (size_t i = 0; i < rhs->shape[0]; i++) {
     ret_data[i] = Op::Call(lhs, rhs_data[i]);
   }
   return ret;
@@ -162,8 +161,7 @@ IdArray UnaryElewise(IdArray lhs) {
   IdType* ret_data = static_cast<IdType*>(ret->data);
   // TODO(BarclayII): this usually incurs lots of overhead in thread spawning, scheduling,
   // etc., especially since the workload is very light. Need to replace with parallel_for.
-  // #pragma omp parallel for
-  for (int64_t i = 0; i < lhs->shape[0]; ++i) {
+  for (size_t i = 0; i < lhs->shape[0]; i++) {
     ret_data[i] = Op::Call(lhs_data[i]);
   }
   return ret;
...
@@ -4,11 +4,13 @@
  * \brief Array index select CPU implementation
  */
 #include <dgl/array.h>
+#include <dgl/runtime/parallel_for.h>
 #include <tuple>
 #include <utility>
 namespace dgl {
 using runtime::NDArray;
+using runtime::parallel_for;
 namespace aten {
 namespace impl {
@@ -29,11 +31,12 @@ std::pair<NDArray, IdArray> ConcatSlices(NDArray array, IdArray lengths) {
   NDArray concat = NDArray::Empty({total_length}, array->dtype, array->ctx);
   DType *concat_data = static_cast<DType *>(concat->data);
-#pragma omp parallel for
-  for (int64_t i = 0; i < rows; ++i) {
+  parallel_for(0, rows, [=](size_t b, size_t e) {
+    for (auto i = b; i < e; ++i) {
       for (int64_t j = 0; j < length_data[i]; ++j)
         concat_data[offsets_data[i] + j] = array_data[i * stride + j];
     }
+  });
   return std::make_pair(concat, offsets);
 }
@@ -56,8 +59,8 @@ std::tuple<NDArray, IdArray, IdArray> Pack(NDArray array, DType pad_value) {
   IdArray length = NewIdArray(rows, array->ctx);
   int64_t *length_data = static_cast<int64_t *>(length->data);
-#pragma omp parallel for
-  for (int64_t i = 0; i < rows; ++i) {
+  parallel_for(0, rows, [=](size_t b, size_t e) {
+    for (auto i = b; i < e; ++i) {
       int64_t j;
       for (j = 0; j < cols; ++j) {
         const DType val = array_data[i * cols + j];
@@ -66,6 +69,7 @@ std::tuple<NDArray, IdArray, IdArray> Pack(NDArray array, DType pad_value) {
       }
       length_data[i] = j;
     }
+  });
   auto ret = ConcatSlices<XPU, DType, int64_t>(array, length);
   return std::make_tuple(ret.first, length, ret.second);
...
@@ -4,6 +4,7 @@
  * \brief Array scatter CPU implementation
  */
 #include <dgl/array.h>
+#include <dgl/runtime/parallel_for.h>
 namespace dgl {
 using runtime::NDArray;
@@ -39,9 +40,11 @@ void Scatter_(IdArray index, NDArray value, NDArray out) {
   const IdType* idx = index.Ptr<IdType>();
   const DType* val = value.Ptr<DType>();
   DType* outd = out.Ptr<DType>();
-#pragma omp parallel for
-  for (int64_t i = 0; i < len; ++i)
+  runtime::parallel_for(0, len, [&](size_t b, size_t e) {
+    for (auto i = b; i < e; ++i) {
       outd[idx[i]] = val[i];
+    }
+  });
 }
 template void Scatter_<kDLCPU, int32_t, int32_t>(IdArray, NDArray, NDArray);
...
@@ -4,6 +4,7 @@
  * \brief Retrieve entries of a CSR matrix
  */
 #include <dgl/array.h>
+#include <dgl/runtime/parallel_for.h>
 #include <vector>
 #include <unordered_set>
 #include <numeric>
@@ -12,7 +13,7 @@
 namespace dgl {
 using runtime::NDArray;
+using runtime::parallel_for;
 namespace aten {
 namespace impl {
@@ -70,8 +71,8 @@ NDArray CSRGetData(
   if (csr.sorted) {
     // use binary search on each row
-#pragma omp parallel for
-    for (int64_t p = 0; p < retlen; ++p) {
+    parallel_for(0, retlen, [&](size_t b, size_t e) {
+      for (auto p = b; p < e; ++p) {
         const IdType row_id = row_data[p * row_stride], col_id = col_data[p * col_stride];
         CHECK(row_id >= 0 && row_id < csr.num_rows) << "Invalid row index: " << row_id;
         CHECK(col_id >= 0 && col_id < csr.num_cols) << "Invalid col index: " << col_id;
@@ -84,10 +85,11 @@ NDArray CSRGetData(
           ret_data[p] = return_eids ? eid : weight_data[eid];
         }
       }
+    });
   } else {
     // linear search on each row
-#pragma omp parallel for
-    for (int64_t p = 0; p < retlen; ++p) {
+    parallel_for(0, retlen, [&](size_t b, size_t e) {
+      for (auto p = b; p < e; ++p) {
         const IdType row_id = row_data[p * row_stride], col_id = col_data[p * col_stride];
         CHECK(row_id >= 0 && row_id < csr.num_rows) << "Invalid row index: " << row_id;
         CHECK(col_id >= 0 && col_id < csr.num_cols) << "Invalid col index: " << col_id;
@@ -99,6 +101,7 @@ NDArray CSRGetData(
         }
       }
     }
+    });
   }
   return ret;
 }
...
@@ -5,6 +5,7 @@
  */
 #include <dgl/array.h>
+#include <dgl/runtime/parallel_for.h>
 #include <parallel_hashmap/phmap.h>
 #include <vector>
 #include "array_utils.h"
@@ -12,6 +13,7 @@
 namespace dgl {
 using dgl::runtime::NDArray;
+using dgl::runtime::parallel_for;
 namespace aten {
@@ -26,10 +28,9 @@ void CountNNZPerRow(
     const IdType* B_indices,
     IdType* C_indptr_data,
     int64_t M) {
+  parallel_for(0, M, [=](size_t b, size_t e) {
+    for (auto i = b; i < e; ++i) {
       phmap::flat_hash_set<IdType> set;
-#pragma omp parallel for firstprivate(set)
-  for (int64_t i = 0; i < M; ++i) {
-    set.clear();
       for (IdType u = A_indptr[i]; u < A_indptr[i + 1]; ++u) {
         IdType w = A_indices[u];
         for (IdType v = B_indptr[w]; v < B_indptr[w + 1]; ++v)
@@ -37,6 +38,7 @@ void CountNNZPerRow(
       }
       C_indptr_data[i] = set.size();
     }
+  });
 }
 template <typename IdType>
@@ -66,10 +68,9 @@ void ComputeIndicesAndData(
     IdType* C_indices_data,
     DType* C_weights_data,
     int64_t M) {
+  parallel_for(0, M, [=](size_t b, size_t e) {
+    for (auto i = b; i < e; ++i) {
       phmap::flat_hash_map<IdType, DType> map;
-#pragma omp parallel for firstprivate(map)
-  for (int64_t i = 0; i < M; ++i) {
-    map.clear();
       for (IdType u = A_indptr[i]; u < A_indptr[i + 1]; ++u) {
         IdType w = A_indices[u];
         DType vA = A_data[A_eids ? A_eids[u] : u];
@@ -87,6 +88,7 @@ void ComputeIndicesAndData(
         ++v;
       }
     }
+  });
 }
 };  // namespace
...
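One semantic difference worth noting in the two hash-based kernels above: the OpenMP version kept a single firstprivate container per thread and cleared it every iteration, while the parallel_for version constructs a fresh container for each row. If per-row construction ever mattered, the chunked interface still admits the old reuse pattern; a sketch only, not part of this commit:

  parallel_for(0, M, [=](size_t b, size_t e) {
    phmap::flat_hash_set<IdType> set;   // one container per chunk
    for (auto i = b; i < e; ++i) {
      set.clear();                      // reuse it across the rows of this chunk
      // ... fill `set` for row i exactly as in the loop body above ...
      C_indptr_data[i] = set.size();
    }
  });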
@@ -4,6 +4,7 @@
  * \brief CSR sorting
  */
 #include <dgl/array.h>
+#include <dgl/runtime/parallel_for.h>
 #include <numeric>
 #include <algorithm>
 #include <vector>
@@ -48,16 +49,14 @@ void CSRSort_(CSRMatrix* csr) {
     csr->data = aten::Range(0, nnz, csr->indptr->dtype.bits, csr->indptr->ctx);
   }
   IdType* eid_data = static_cast<IdType*>(csr->data->data);
-#pragma omp parallel
-  {
-    std::vector<ShufflePair> reorder_vec;
-#pragma omp for
-    for (int64_t row = 0; row < num_rows; row++) {
+  runtime::parallel_for(0, num_rows, [=](size_t b, size_t e) {
+    for (auto row = b; row < e; ++row) {
       const int64_t num_cols = indptr_data[row + 1] - indptr_data[row];
+      std::vector<ShufflePair> reorder_vec(num_cols);
       IdType *col = indices_data + indptr_data[row];
       IdType *eid = eid_data + indptr_data[row];
-      reorder_vec.resize(num_cols);
       for (int64_t i = 0; i < num_cols; i++) {
         reorder_vec[i].first = col[i];
         reorder_vec[i].second = eid[i];
@@ -71,7 +70,8 @@ void CSRSort_(CSRMatrix* csr) {
         eid[i] = reorder_vec[i].second;
       }
     }
-  }
+  });
   csr->sorted = true;
 }
@@ -101,8 +101,8 @@ std::pair<CSRMatrix, NDArray> CSRSortByTag(
   auto out_indices_data = static_cast<IdType *>(output.indices->data);
   auto out_eid_data = static_cast<IdType *>(output.data->data);
-#pragma omp parallel for
-  for (IdType src = 0 ; src < num_rows ; ++src) {
+  runtime::parallel_for(0, num_rows, [&](size_t b, size_t e) {
+    for (auto src = b; src < e; ++src) {
       const IdType start = indptr_data[src];
       const IdType end = indptr_data[src + 1];
@@ -132,6 +132,7 @@ std::pair<CSRMatrix, NDArray> CSRSortByTag(
         out_eid_data[start + offset] = eid;
       }
     }
+  });
   output.sorted = false;
   return std::make_pair(output, tag_pos);
 }
...
@@ -5,6 +5,7 @@
  */
 #include <dgl/array.h>
+#include <dgl/runtime/parallel_for.h>
 #include <parallel_hashmap/phmap.h>
 #include <vector>
 #include "array_utils.h"
@@ -25,16 +26,17 @@ void CountNNZPerRow(
     IdType* C_indptr_data,
     int64_t M) {
   int64_t n = A_indptr.size();
+  runtime::parallel_for(0, M, [=](size_t b, size_t e) {
+    for (size_t i = b; i < e; ++i) {
       phmap::flat_hash_set<IdType> set;
-#pragma omp parallel for firstprivate(set)
-  for (IdType i = 0; i < M; ++i) {
-    set.clear();
       for (int64_t k = 0; k < n; ++k) {
         for (IdType u = A_indptr[k][i]; u < A_indptr[k][i + 1]; ++u)
           set.insert(A_indices[k][u]);
       }
       C_indptr_data[i] = set.size();
     }
+  });
 }
 template <typename IdType>
@@ -61,10 +63,9 @@ void ComputeIndicesAndData(
     DType* C_weights_data,
     int64_t M) {
   int64_t n = A_indptr.size();
+  runtime::parallel_for(0, M, [=](size_t b, size_t e) {
+    for (auto i = b; i < e; ++i) {
       phmap::flat_hash_map<IdType, DType> map;
-#pragma omp parallel for firstprivate(map)
-  for (int64_t i = 0; i < M; ++i) {
-    map.clear();
       for (int64_t k = 0; k < n; ++k) {
         for (IdType u = A_indptr[k][i]; u < A_indptr[k][i + 1]; ++u) {
           IdType kA = A_indices[k][u];
@@ -72,7 +73,6 @@ void ComputeIndicesAndData(
           map[kA] += vA;
         }
       }
       IdType j = C_indptr_data[i];
       for (auto it : map) {
         C_indices_data[j] = it.first;
@@ -80,6 +80,7 @@ void ComputeIndicesAndData(
         ++j;
       }
     }
+  });
 }
 };  // namespace
...
@@ -4,7 +4,7 @@
  * \brief COO sorting
  */
 #include <dgl/array.h>
+#include <dgl/runtime/parallel_for.h>
 #include <numeric>
 #include <algorithm>
 #include <vector>
@@ -54,6 +54,7 @@ CSRMatrix UnionCsr(const std::vector<CSRMatrix>& csrs) {
   for (int64_t i = 1; i <= csrs[0].num_rows; ++i) {
     std::vector<int64_t> indices_off;
     res_indptr[i] = indptr_data[0][i];
     indices_off.push_back(indptr_data[0][i-1]);
     for (size_t j = 1; j < csrs.size(); ++j) {
       res_indptr[i] += indptr_data[j][i];
@@ -74,7 +75,6 @@ CSRMatrix UnionCsr(const std::vector<CSRMatrix>& csrs) {
         }
       }  // for check out of bound
     }  // for
     res_indices[off] = min;
     res_data[off] = data_data[min_idx][indices_off[min_idx]];
     indices_off[min_idx] += 1;
...
@@ -147,6 +147,7 @@ COOMatrix CSRRowWisePick(CSRMatrix mat, IdArray rows,
         global_prefix[t+1] += global_prefix[t];
       }
     }
 #pragma omp barrier
     const IdxType thread_offset = global_prefix[thread_id];
...
@@ -8,6 +8,7 @@
 #include <dgl/array.h>
 #include <dgl/bcast.h>
+#include <dgl/runtime/parallel_for.h>
 #include "../selector.h"
 namespace dgl {
@@ -40,8 +41,8 @@ void SDDMMCsr(const BcastOff& bcast,
       rhs_dim = bcast.rhs_len,
       reduce_size = bcast.reduce_size;
   DType* O = out.Ptr<DType>();
-#pragma omp parallel for
-  for (IdType rid = 0; rid < csr.num_rows; ++rid) {
+  runtime::parallel_for(0, csr.num_rows, [=](IdType b, IdType e) {
+    for (auto rid = b; rid < e; ++rid) {
       const IdType row_start = indptr[rid], row_end = indptr[rid + 1];
       for (IdType j = row_start; j < row_end; ++j) {
         const IdType cid = indices[j];
@@ -50,14 +51,17 @@ void SDDMMCsr(const BcastOff& bcast,
         for (int64_t k = 0; k < dim; ++k) {
           const int64_t lhs_add = bcast.use_bcast ? bcast.lhs_offset[k] : k;
           const int64_t rhs_add = bcast.use_bcast ? bcast.rhs_offset[k] : k;
-          const DType* lhs_off = Op::use_lhs?
-            X + Selector<LhsTarget>::Call(rid, eid, cid) * lhs_dim + lhs_add * reduce_size : nullptr;
-          const DType* rhs_off = Op::use_rhs?
-            Y + Selector<RhsTarget>::Call(rid, eid, cid) * rhs_dim + rhs_add * reduce_size : nullptr;
+          const DType* lhs_off = Op::use_lhs
+            ? X + Selector<LhsTarget>::Call(rid, eid, cid) * lhs_dim + lhs_add * reduce_size
+            : nullptr;
+          const DType* rhs_off = Op::use_rhs
+            ? Y + Selector<RhsTarget>::Call(rid, eid, cid) * rhs_dim + rhs_add * reduce_size
+            : nullptr;
           out_off[k] = Op::Call(lhs_off, rhs_off, reduce_size);
         }
       }
     }
+  });
 }
 /*!
@@ -86,9 +90,8 @@ void SDDMMCoo(const BcastOff& bcast,
       rhs_dim = bcast.rhs_len,
       reduce_size = bcast.reduce_size;
   DType* O = out.Ptr<DType>();
-  const int64_t nnz = coo.row->shape[0];
 #pragma omp parallel for
-  for (IdType i = 0; i < nnz; ++i) {
+  for (int64_t i = 0; i < coo.row->shape[0]; ++i) {
     const IdType rid = row[i];
     const IdType cid = col[i];
     const IdType eid = has_idx? edges[i] : i;
...
@@ -7,6 +7,7 @@
 #define DGL_ARRAY_CPU_SEGMENT_REDUCE_H_
 #include <dgl/array.h>
+#include <dgl/runtime/parallel_for.h>
 namespace dgl {
 namespace aten {
@@ -27,14 +28,15 @@ void SegmentSum(NDArray feat, NDArray offsets, NDArray out) {
   const DType* feat_data = feat.Ptr<DType>();
   const IdType* offsets_data = offsets.Ptr<IdType>();
   DType *out_data = out.Ptr<DType>();
-#pragma omp parallel for
-  for (int i = 0; i < n; ++i) {
+  runtime::parallel_for(0, n, [=](int b, int e) {
+    for (auto i = b; i < e; ++i) {
       for (IdType j = offsets_data[i]; j < offsets_data[i + 1]; ++j) {
         for (int k = 0; k < dim; ++k) {
           out_data[i * dim + k] += feat_data[j * dim + k];
         }
       }
     }
+  });
 }
 /*!
@@ -58,8 +60,8 @@ void SegmentCmp(NDArray feat, NDArray offsets,
   IdType *arg_data = arg.Ptr<IdType>();
   std::fill(out_data, out_data + out.NumElements(), Cmp::zero);
   std::fill(arg_data, arg_data + arg.NumElements(), -1);
-#pragma omp parallel for
-  for (int i = 0; i < n; ++i) {
+  runtime::parallel_for(0, n, [=](int b, int e) {
+    for (auto i = b; i < e; ++i) {
       for (IdType j = offsets_data[i]; j < offsets_data[i + 1]; ++j) {
         for (int k = 0; k < dim; ++k) {
           const DType val = feat_data[j * dim + k];
@@ -70,6 +72,7 @@ void SegmentCmp(NDArray feat, NDArray offsets,
         }
       }
     }
+  });
 }
 /*!
@@ -114,14 +117,15 @@ void BackwardSegmentCmp(NDArray feat, NDArray arg, NDArray out) {
   const DType* feat_data = feat.Ptr<DType>();
   const IdType* arg_data = arg.Ptr<IdType>();
   DType* out_data = out.Ptr<DType>();
-#pragma omp parallel for
-  for (int i = 0; i < n; ++i) {
+  runtime::parallel_for(0, n, [=](int b, int e) {
+    for (auto i = b; i < e; ++i) {
      for (int k = 0; k < dim; ++k) {
        int write_row = arg_data[i * dim + k];
        if (write_row >= 0)
          out_data[write_row * dim + k] = feat_data[i * dim + k];
      }
    }
+  });
 }
 }  // namespace cpu
...
@@ -4,6 +4,7 @@
  * \brief CPU implementation of COO sparse matrix operators
  */
 #include <dmlc/omp.h>
+#include <dgl/runtime/parallel_for.h>
 #include <vector>
 #include <unordered_set>
 #include <unordered_map>
@@ -14,6 +15,7 @@
 namespace dgl {
 using runtime::NDArray;
+using runtime::parallel_for;
 namespace aten {
 namespace impl {
@@ -55,12 +57,13 @@ NDArray COOIsNonZero(COOMatrix coo, NDArray row, NDArray col) {
   const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1;
   const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1;
   const int64_t kmax = std::max(rowlen, collen);
-#pragma omp parallel for
-  for (int64_t k = 0; k < kmax; ++k) {
+  parallel_for(0, kmax, [=](size_t b, size_t e) {
+    for (auto k = b; k < e; ++k) {
       int64_t i = row_stride * k;
      int64_t j = col_stride * k;
      rst_data[k] = COOIsNonZero<XPU, IdType>(coo, row_data[i], col_data[j])? 1 : 0;
    }
+  });
   return rst;
 }
@@ -114,8 +117,9 @@ NDArray COOGetRowNNZ(COOMatrix coo, NDArray rows) {
   NDArray rst = NDArray::Empty({len}, rows->dtype, rows->ctx);
   IdType* rst_data = static_cast<IdType*>(rst->data);
 #pragma omp parallel for
-  for (int64_t i = 0; i < len; ++i)
+  for (int64_t i = 0; i < len; ++i) {
     rst_data[i] = COOGetRowNNZ<XPU, IdType>(coo, vid_data[i]);
+  }
   return rst;
 }
@@ -178,8 +182,8 @@ IdArray COOGetData(COOMatrix coo, IdArray rows, IdArray cols) {
   // the choice.
   if (coo.row_sorted) {
-#pragma omp parallel for
-    for (int64_t p = 0; p < retlen; ++p) {
+    parallel_for(0, retlen, [&](size_t b, size_t e) {
+      for (auto p = b; p < e; ++p) {
        const IdType row_id = row_data[p * row_stride], col_id = col_data[p * col_stride];
        auto it = std::lower_bound(coo_row, coo_row + nnz, row_id);
        for (; it < coo_row + nnz && *it == row_id; ++it) {
@@ -190,6 +194,7 @@ IdArray COOGetData(COOMatrix coo, IdArray rows, IdArray cols) {
        }
      }
    }
+    });
   } else {
 #pragma omp parallel for
     for (int64_t p = 0; p < retlen; ++p) {
@@ -328,11 +333,9 @@ CSRMatrix COOToCSR(COOMatrix coo) {
   IdType * const fill_data = data ? nullptr : static_cast<IdType*>(coo.data->data);
   if (NNZ > 0) {
-#pragma omp parallel
-    {
-      const int num_threads = omp_get_num_threads();
-      const int thread_id = omp_get_thread_num();
+    auto num_threads = omp_get_max_threads();
+    parallel_for(0, num_threads, [&](int b, int e) {
+      for (auto thread_id = b; thread_id < e; ++thread_id) {
        // We partition the set the of non-zeros among the threads
        const int64_t nz_chunk = (NNZ+num_threads-1)/num_threads;
        const int64_t nz_start = thread_id*nz_chunk;
@@ -389,6 +392,7 @@ CSRMatrix COOToCSR(COOMatrix coo) {
        }
      }
    }
+    });
   } else {
     std::fill(Bp, Bp+N+1, 0);
   }
@@ -627,11 +631,12 @@ COOMatrix COOReorder(COOMatrix coo, runtime::NDArray new_row_id_arr,
   IdType *out_row = static_cast<IdType*>(out_row_arr->data);
   IdType *out_col = static_cast<IdType*>(out_col_arr->data);
-#pragma omp parallel for
-  for (int64_t i = 0; i < nnz; i++) {
+  parallel_for(0, nnz, [=](size_t b, size_t e) {
+    for (auto i = b; i < e; ++i) {
      out_row[i] = new_row_ids[in_rows[i]];
      out_col[i] = new_col_ids[in_cols[i]];
    }
+  });
   return COOMatrix(num_rows, num_cols, out_row_arr, out_col_arr, out_data_arr);
 }
...
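The COOToCSR hunk above is the one place where the old code used a bare #pragma omp parallel region with manual work splitting; the rewrite keeps that structure by asking omp_get_max_threads() up front and running parallel_for over thread indices. A sketch of that skeleton, with nz_end added here for completeness (it is not shown in the hunk):

  auto num_threads = omp_get_max_threads();
  parallel_for(0, num_threads, [&](int b, int e) {
    for (auto thread_id = b; thread_id < e; ++thread_id) {
      // Each "virtual thread" id takes a contiguous slice of the non-zeros.
      const int64_t nz_chunk = (NNZ + num_threads - 1) / num_threads;
      const int64_t nz_start = thread_id * nz_chunk;
      const int64_t nz_end = std::min(NNZ, nz_start + nz_chunk);  // assumed bound, not in the hunk
      // ... process non-zeros in [nz_start, nz_end) as in the original body ...
    }
  });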
@@ -4,6 +4,7 @@
  * \brief CSR matrix operator CPU implementation
  */
 #include <dgl/array.h>
+#include <dgl/runtime/parallel_for.h>
 #include <vector>
 #include <unordered_set>
 #include <numeric>
@@ -12,6 +13,7 @@
 namespace dgl {
 using runtime::NDArray;
+using runtime::parallel_for;
 namespace aten {
 namespace impl {
@@ -491,11 +493,12 @@ CSRMatrix CSRReorder(CSRMatrix csr, runtime::NDArray new_row_id_arr,
   // Compute the length of rows for the new matrix.
   std::vector<IdType> new_row_lens(num_rows, -1);
-#pragma omp parallel for
-  for (int64_t i = 0; i < num_rows; i++) {
+  parallel_for(0, num_rows, [=, &new_row_lens](size_t b, size_t e) {
+    for (auto i = b; i < e; ++i) {
      int64_t new_row_id = new_row_ids[i];
      new_row_lens[new_row_id] = in_indptr[i + 1] - in_indptr[i];
    }
+  });
   // Compute the starting location of each row in the new matrix.
   out_indptr[0] = 0;
   // This is sequential. It should be pretty fast.
@@ -506,8 +509,8 @@ CSRMatrix CSRReorder(CSRMatrix csr, runtime::NDArray new_row_id_arr,
   CHECK_EQ(out_indptr[num_rows], nnz);
   // Copy indieces and data with the new order.
   // Here I iterate rows in the order of the old matrix.
-#pragma omp parallel for
-  for (int64_t i = 0; i < num_rows; i++) {
+  parallel_for(0, num_rows, [=](size_t b, size_t e) {
+    for (auto i = b; i < e; ++i) {
      const IdType *in_row = in_indices + in_indptr[i];
      const IdType *in_row_data = in_data + in_indptr[i];
@@ -523,6 +526,7 @@ CSRMatrix CSRReorder(CSRMatrix csr, runtime::NDArray new_row_id_arr,
      }
      // TODO(zhengda) maybe we should sort the column indices.
    }
+  });
   return CSRMatrix(num_rows, num_cols,
                    out_indptr_arr, out_indices_arr, out_data_arr);
 }
...
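The capture lists differ across call sites in this commit: bodies that only write through raw pointers capture by value ([=]), while CSRReorder above mutates a local std::vector and therefore captures it by reference ([=, &new_row_lens]). A minimal illustration of why the by-reference capture is needed; the lens name is hypothetical:

  std::vector<int64_t> lens(num_rows, -1);
  // Capturing `lens` by value would hand each worker its own copy and the
  // writes would be lost; capture the vector by reference instead.
  parallel_for(0, num_rows, [=, &lens](size_t b, size_t e) {
    for (auto i = b; i < e; ++i)
      lens[new_row_ids[i]] = in_indptr[i + 1] - in_indptr[i];
  });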
@@ -8,6 +8,7 @@
 #include <dgl/array.h>
 #include <dgl/bcast.h>
+#include <dgl/runtime/parallel_for.h>
 #include <algorithm>
 #include <limits>
 #include <memory>
@@ -46,8 +47,9 @@ void SpMMSumCsrXbyak(dgl::ElemWiseAddUpdate<Op>* cpu_spec, const BcastOff& bcast
   const IdType* indices = csr.indices.Ptr<IdType>();
   const IdType* edges = csr.data.Ptr<IdType>();
   int64_t dim = bcast.out_len, lhs_dim = bcast.lhs_len, rhs_dim = bcast.rhs_len;
-#pragma omp parallel for
-  for (IdType rid = 0; rid < csr.num_rows; ++rid) {
+  runtime::parallel_for(0, csr.num_rows, [&](size_t b, size_t e) {
+    for (auto rid = b; rid < e; ++rid) {
      const IdType row_start = indptr[rid], row_end = indptr[rid + 1];
      DType* out_off = O + rid * dim;
      for (IdType j = row_start; j < row_end; ++j) {
@@ -56,6 +58,7 @@ void SpMMSumCsrXbyak(dgl::ElemWiseAddUpdate<Op>* cpu_spec, const BcastOff& bcast
        cpu_spec->run(out_off, X + cid * lhs_dim, W + eid * rhs_dim, dim);
      }
    }
+  });
 }
 #endif  // USE_AVX
 #endif  // _WIN32
@@ -79,8 +82,8 @@ void SpMMSumCsrNaive(const BcastOff& bcast, const CSRMatrix& csr, const DType* X
   const IdType* indices = csr.indices.Ptr<IdType>();
   const IdType* edges = csr.data.Ptr<IdType>();
   int64_t dim = bcast.out_len, lhs_dim = bcast.lhs_len, rhs_dim = bcast.rhs_len;
-#pragma omp parallel for
-  for (IdType rid = 0; rid < csr.num_rows; ++rid) {
+  runtime::parallel_for(0, csr.num_rows, [&](size_t b, size_t e) {
+    for (auto rid = b; rid < e; ++rid) {
      const IdType row_start = indptr[rid], row_end = indptr[rid + 1];
      DType* out_off = O + rid * dim;
      for (IdType j = row_start; j < row_end; ++j) {
@@ -97,6 +100,7 @@ void SpMMSumCsrNaive(const BcastOff& bcast, const CSRMatrix& csr, const DType* X
        }
      }
    }
+  });
 }
 /*!
@@ -270,8 +274,8 @@ void SpMMCmpCsr(const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat,
 #endif  // USE_AVX
 #endif  // _WIN32
-#pragma omp parallel for
-  for (IdType rid = 0; rid < csr.num_rows; ++rid) {
+  runtime::parallel_for(0, csr.num_rows, [&](size_t b, size_t e) {
+    for (auto rid = b; rid < e; ++rid) {
      const IdType row_start = indptr[rid], row_end = indptr[rid + 1];
      DType* out_off = O + rid * dim;
      IdType* argx_off = argX + rid * dim;
@@ -295,6 +299,7 @@ void SpMMCmpCsr(const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat,
        }
      }
    }
+  });
 #if !defined(_WIN32)
 #ifdef USE_AVX
 #ifdef USE_LIBXSMM
...
@@ -8,6 +8,7 @@
 #include <dgl/immutable_graph.h>
 #include <dgl/packed_func_ext.h>
 #include <dgl/runtime/container.h>
+#include <dgl/runtime/parallel_for.h>
 #include <algorithm>
 #include "../c_api_common.h"
@@ -261,8 +262,8 @@ IdArray GraphOp::MapParentIdToSubgraphId(IdArray parent_vids, IdArray query) {
   const bool is_sorted = std::is_sorted(parent_data, parent_data + parent_len);
   if (is_sorted) {
-#pragma omp parallel for
-    for (int64_t i = 0; i < query_len; i++) {
+    runtime::parallel_for(0, query_len, [&](size_t b, size_t e) {
+      for (auto i = b; i < e; ++i) {
        const dgl_id_t id = query_data[i];
        const auto it = std::find(parent_data, parent_data + parent_len, id);
        // If the vertex Id doesn't exist, the vid in the subgraph is -1.
@@ -272,14 +273,15 @@ IdArray GraphOp::MapParentIdToSubgraphId(IdArray parent_vids, IdArray query) {
          rst_data[i] = -1;
        }
      }
+    });
   } else {
     std::unordered_map<dgl_id_t, dgl_id_t> parent_map;
     for (int64_t i = 0; i < parent_len; i++) {
       const dgl_id_t id = parent_data[i];
       parent_map[id] = i;
     }
-#pragma omp parallel for
-    for (int64_t i = 0; i < query_len; i++) {
+    runtime::parallel_for(0, query_len, [&](size_t b, size_t e) {
+      for (auto i = b; i < e; ++i) {
        const dgl_id_t id = query_data[i];
        auto it = parent_map.find(id);
        // If the vertex Id doesn't exist, the vid in the subgraph is -1.
@@ -289,6 +291,7 @@ IdArray GraphOp::MapParentIdToSubgraphId(IdArray parent_vids, IdArray query) {
          rst_data[i] = -1;
        }
      }
+    });
   }
   return rst;
 }
@@ -567,14 +570,15 @@ DGL_REGISTER_GLOBAL("transform._CAPI_DGLPartitionWithHalo")
   graph_ptr->GetInCSR();
   std::vector<std::shared_ptr<HaloSubgraph> > subgs(max_part_id + 1);
   int num_partitions = part_nodes.size();
-#pragma omp parallel for
-  for (int i = 0; i < num_partitions; i++) {
+  runtime::parallel_for(0, num_partitions, [&](size_t b, size_t e) {
+    for (auto i = b; i < e; ++i) {
      auto nodes = aten::VecToIdArray(part_nodes[i]);
      HaloSubgraph subg = GraphOp::GetSubgraphWithHalo(graph_ptr, nodes, num_hops);
      std::shared_ptr<HaloSubgraph> subg_ptr(new HaloSubgraph(subg));
      int part_id = part_ids[i];
      subgs[part_id] = subg_ptr;
    }
+  });
   List<SubgraphRef> ret_list;
   for (size_t i = 0; i < subgs.size(); i++) {
     ret_list.push_back(SubgraphRef(subgs[i]));
@@ -732,8 +736,8 @@ IdArray MapIds(IdArray ids, IdArray range_starts, IdArray range_ends, IdArray ty
   const IdType *typed_map_data = static_cast<IdType *>(typed_map->data);
   IdType *types_data = static_cast<IdType *>(ret->data);
   IdType *per_type_ids_data = static_cast<IdType *>(ret->data) + num_ids;
-#pragma omp parallel for
-  for (int64_t i = 0; i < ids->shape[0]; i++) {
+  runtime::parallel_for(0, ids->shape[0], [&](size_t b, size_t e) {
+    for (auto i = b; i < e; ++i) {
      IdType id = ids_data[i];
      auto it = std::lower_bound(range_end_data, range_end_data + num_ranges, id);
      // The range must exist.
@@ -750,6 +754,7 @@ IdArray MapIds(IdArray ids, IdArray range_starts, IdArray range_ends, IdArray ty
          + typed_map_data[num_parts * type_id + part_id - 1];
      }
    }
+  });
   return ret;
 }
...
@@ -8,6 +8,7 @@
 #include <dgl/packed_func_ext.h>
 #include <dgl/immutable_graph.h>
 #include <dgl/runtime/container.h>
+#include <dgl/runtime/parallel_for.h>
 #include <set>
 #include "../c_api_common.h"
@@ -629,14 +630,19 @@ DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroCreateFormat")
 .set_body([] (DGLArgs args, DGLRetValue* rv) {
   HeteroGraphRef hg = args[0];
   dgl_format_code_t code = hg->GetRelationGraph(0)->GetAllowedFormats();
-#if !defined(DGL_USE_CUDA)
-#pragma omp parallel for
-#endif
-  for (int64_t etype = 0; etype < hg->NumEdgeTypes(); ++etype) {
+  auto get_format_f = [&](size_t etype_b, size_t etype_e) {
+    for (auto etype = etype_b; etype < etype_e; ++etype) {
      auto bg = std::dynamic_pointer_cast<UnitGraph>(hg->GetRelationGraph(etype));
      for (auto format : CodeToSparseFormats(code))
        bg->GetFormat(format);
    }
+  };
+#if !(defined(DGL_USE_CUDA))
+  runtime::parallel_for(0, hg->NumEdgeTypes(), get_format_f);
+#else
+  get_format_f(0, hg->NumEdgeTypes());
+#endif
 });
 DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroGetFormatGraph")
...
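The _CAPI_DGLHeteroCreateFormat hunk shows the pattern used when a loop must stay serial in CUDA builds: the chunked body is hoisted into a named lambda and either handed to parallel_for or called once over the whole range. Reduced to its skeleton (names are taken from the hunk above, the loop body is elided):

  auto get_format_f = [&](size_t etype_b, size_t etype_e) {
    for (auto etype = etype_b; etype < etype_e; ++etype) {
      // ... materialize the allowed sparse formats for this edge type ...
    }
  };
  #if !defined(DGL_USE_CUDA)
  runtime::parallel_for(0, hg->NumEdgeTypes(), get_format_f);
  #else
  get_format_f(0, hg->NumEdgeTypes());  // serial fallback on CUDA builds
  #endif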
@@ -9,6 +9,7 @@
 #include <dgl/runtime/container.h>
 #include <dgl/runtime/ndarray.h>
+#include <dgl/runtime/parallel_for.h>
 #include <dgl/packed_func_ext.h>
 #include <dgl/immutable_graph.h>
 #include <dgl/nodeflow.h>
@@ -829,8 +830,8 @@ DGL_REGISTER_GLOBAL("network._CAPI_FastPull")
   char *return_data = new char[ID_size*row_size];
   const int64_t local_ids_size = local_ids.size();
   // Copy local data
-#pragma omp parallel for
-  for (int64_t i = 0; i < local_ids_size; ++i) {
+  runtime::parallel_for(0, local_ids_size, [&](size_t b, size_t e) {
+    for (auto i = b; i < e; ++i) {
      CHECK_GE(ID_size*row_size, local_ids_orginal[i] * row_size + row_size);
      CHECK_GE(data_size, local_ids[i] * row_size + row_size);
      CHECK_GE(local_ids[i], 0);
@@ -838,6 +839,7 @@ DGL_REGISTER_GLOBAL("network._CAPI_FastPull")
             local_data_char + local_ids[i] * row_size,
             row_size);
    }
+  });
   // Recv remote message
   for (int i = 0; i < msg_count; ++i) {
     KVStoreMsg *kv_msg = recv_kv_message(receiver);
...
@@ -9,6 +9,7 @@
 #include <dgl/runtime/container.h>
 #include <dgl/packed_func_ext.h>
 #include <dgl/random.h>
+#include <dgl/runtime/parallel_for.h>
 #include <dmlc/omp.h>
 #include <algorithm>
 #include <cstdlib>
@@ -850,8 +851,8 @@ std::vector<NodeFlow> NeighborSamplingImpl(const ImmutableGraphPtr gptr,
   BuildCsr(*gptr, neigh_type);
   // generate node flows
   std::vector<NodeFlow> nflows(num_workers);
-#pragma omp parallel for
-  for (int i = 0; i < num_workers; i++) {
+  runtime::parallel_for(0, num_workers, [&](size_t b, size_t e) {
+    for (auto i = b; i < e; ++i) {
      // create per-worker seed nodes.
      const int64_t start = (batch_start_id + i) * batch_size;
      const int64_t end = std::min(start + batch_size, num_seeds);
@@ -863,6 +864,7 @@ std::vector<NodeFlow> NeighborSamplingImpl(const ImmutableGraphPtr gptr,
          gptr.get(), worker_seeds, neigh_type, num_hops, expand_factor,
          add_self_loop, probability);
    }
+  });
   return nflows;
 }
@@ -977,8 +979,8 @@ DGL_REGISTER_GLOBAL("sampling._CAPI_LayerSampling")
   BuildCsr(*gptr, neigh_type);
   // generate node flows
   std::vector<NodeFlow> nflows(num_workers);
-#pragma omp parallel for
-  for (int i = 0; i < num_workers; i++) {
+  runtime::parallel_for(0, num_workers, [&](size_t b, size_t e) {
+    for (auto i = b; i < e; ++i) {
      // create per-worker seed nodes.
      const int64_t start = (batch_start_id + i) * batch_size;
      const int64_t end = std::min(start + batch_size, num_seeds);
@@ -989,6 +991,7 @@ DGL_REGISTER_GLOBAL("sampling._CAPI_LayerSampling")
      nflows[i] = SamplerOp::LayerUniformSample(
          gptr.get(), worker_seeds, neigh_type, layer_sizes);
    }
+  });
   *rv = List<NodeFlow>(nflows);
 });
@@ -1466,8 +1469,8 @@ public:
   std::vector<SubgraphRef> positive_subgs(num_workers);
   std::vector<SubgraphRef> negative_subgs(num_workers);
-#pragma omp parallel for
-  for (int64_t i = 0; i < num_workers; i++) {
+  runtime::parallel_for(0, num_workers, [&](size_t b, size_t e) {
+    for (auto i = b; i < e; ++i) {
      const int64_t start = (batch_curr_id_ + i) * batch_size_;
      const int64_t end = std::min(start + batch_size_, num_seeds_);
      const int64_t num_edges = end - start;
@@ -1514,6 +1517,7 @@ public:
        negative_subgs[i] = ConvertRef(neg_subg);
      }
    }
+  });
   if (neg_mode_.size() > 0) {
     positive_subgs.insert(positive_subgs.end(), negative_subgs.begin(), negative_subgs.end());
   }
...
@@ -9,6 +9,7 @@
 #include <dgl/base_heterograph.h>
 #include <dgl/array.h>
+#include <dgl/runtime/parallel_for.h>
 #include <tuple>
 #include <utility>
 #include "randomwalks_impl.h"
@@ -47,8 +48,8 @@ std::pair<IdArray, IdArray> GenericRandomWalk(
   IdxType *traces_data = traces.Ptr<IdxType>();
   IdxType *eids_data = eids.Ptr<IdxType>();
-#pragma omp parallel for
-  for (int64_t seed_id = 0; seed_id < num_seeds; ++seed_id) {
+  runtime::parallel_for(0, num_seeds, [&](size_t seed_begin, size_t seed_end) {
+    for (auto seed_id = seed_begin; seed_id < seed_end; seed_id++) {
      int64_t i;
      dgl_id_t curr = seed_data[seed_id];
      traces_data[seed_id * trace_length] = curr;
@@ -66,6 +67,7 @@ std::pair<IdArray, IdArray> GenericRandomWalk(
        eids_data[seed_id * max_num_steps + i] = -1;
      }
    }
+  });
   return std::make_pair(traces, eids);
 }
...