Unverified commit f5183820, authored by Tomasz Patejko, committed by GitHub

[Performance, CPU] Rewriting OpenMP pragmas into parallel_for (#3171)

* [CPU, Parallel] Rewriting omp pragmas with parallel_for

* [CPU, Parallel] Decrease number of calls to task function

* [CPU, Parallel] Modify calls to new interface of parallel_for
parent 21a40279
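Note on the interface change driving the hunks below: dgl::runtime::parallel_for now hands the task function a half-open chunk [b, e) and invokes it once per chunk instead of once per index. A minimal sketch of the call pattern the call sites migrate to, assuming only the two-argument-range overload visible in the first hunk (process() is a hypothetical per-element helper, not part of this commit):

    #include <dgl/runtime/parallel_for.h>

    void process(size_t i);  // hypothetical per-element work, for illustration only

    void Example(size_t n) {
      // The callback receives the half-open range [b, e) assigned to one chunk,
      // so the function object runs once per chunk rather than once per index.
      dgl::runtime::parallel_for(0, n, [&](size_t b, size_t e) {
        for (auto i = b; i < e; ++i) {
          process(i);
        }
      });
    }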
@@ -74,14 +74,12 @@ void parallel_for(
     auto chunk_size = divup((end - begin), num_threads);
     auto begin_tid = begin + tid * chunk_size;
     if (begin_tid < end) {
-      for (auto i = begin_tid; i < std::min(end, chunk_size + begin_tid); i++) {
-        f(i);
-      }
+      auto end_tid = std::min(end, chunk_size + begin_tid);
+      f(begin_tid, end_tid);
     }
   }
 #else
-  for (auto i = begin; i < end; i++)
-    f(i);
+  f(begin, end);
 #endif
 }
@@ -98,7 +96,7 @@ void parallel_for(
     const size_t begin,
     const size_t end,
     F&& f) {
-  parallel_for(begin, end, default_grain_size(), f);
+  parallel_for(begin, end, default_grain_size(), std::forward<F>(f));
 }
 }  // namespace runtime
 }  // namespace dgl
...
@@ -5,11 +5,13 @@
  */
 #include <dgl/array.h>
 #include <dgl/runtime/ndarray.h>
+#include <dgl/runtime/parallel_for.h>
 #include <numeric>
 #include "../arith.h"
 namespace dgl {
 using runtime::NDArray;
+using runtime::parallel_for;
 namespace aten {
 namespace impl {
@@ -51,8 +53,7 @@ IdArray BinaryElewise(IdArray lhs, IdArray rhs) {
   IdType* ret_data = static_cast<IdType*>(ret->data);
   // TODO(BarclayII): this usually incurs lots of overhead in thread spawning, scheduling,
   // etc., especially since the workload is very light. Need to replace with parallel_for.
-  // #pragma omp parallel for
-  for (int64_t i = 0; i < lhs->shape[0]; ++i) {
+  for (size_t i = 0; i < lhs->shape[0]; i++) {
     ret_data[i] = Op::Call(lhs_data[i], rhs_data[i]);
   }
   return ret;
@@ -88,8 +89,7 @@ IdArray BinaryElewise(IdArray lhs, IdType rhs) {
   IdType* ret_data = static_cast<IdType*>(ret->data);
   // TODO(BarclayII): this usually incurs lots of overhead in thread spawning, scheduling,
   // etc., especially since the workload is very light. Need to replace with parallel_for.
-  // #pragma omp parallel for
-  for (int64_t i = 0; i < lhs->shape[0]; ++i) {
+  for (size_t i = 0; i < lhs->shape[0]; i++) {
     ret_data[i] = Op::Call(lhs_data[i], rhs);
   }
   return ret;
@@ -125,8 +125,7 @@ IdArray BinaryElewise(IdType lhs, IdArray rhs) {
   IdType* ret_data = static_cast<IdType*>(ret->data);
   // TODO(BarclayII): this usually incurs lots of overhead in thread spawning, scheduling,
   // etc., especially since the workload is very light. Need to replace with parallel_for.
-  // #pragma omp parallel for
-  for (int64_t i = 0; i < rhs->shape[0]; ++i) {
+  for (size_t i = 0; i < rhs->shape[0]; i++) {
     ret_data[i] = Op::Call(lhs, rhs_data[i]);
   }
   return ret;
@@ -162,8 +161,7 @@ IdArray UnaryElewise(IdArray lhs) {
   IdType* ret_data = static_cast<IdType*>(ret->data);
   // TODO(BarclayII): this usually incurs lots of overhead in thread spawning, scheduling,
   // etc., especially since the workload is very light. Need to replace with parallel_for.
-  // #pragma omp parallel for
-  for (int64_t i = 0; i < lhs->shape[0]; ++i) {
+  for (size_t i = 0; i < lhs->shape[0]; i++) {
     ret_data[i] = Op::Call(lhs_data[i]);
   }
   return ret;
...
@@ -4,11 +4,13 @@
  * \brief Array index select CPU implementation
  */
 #include <dgl/array.h>
+#include <dgl/runtime/parallel_for.h>
 #include <tuple>
 #include <utility>
 namespace dgl {
 using runtime::NDArray;
+using runtime::parallel_for;
 namespace aten {
 namespace impl {
@@ -29,11 +31,12 @@ std::pair<NDArray, IdArray> ConcatSlices(NDArray array, IdArray lengths) {
   NDArray concat = NDArray::Empty({total_length}, array->dtype, array->ctx);
   DType *concat_data = static_cast<DType *>(concat->data);
-#pragma omp parallel for
-  for (int64_t i = 0; i < rows; ++i) {
+  parallel_for(0, rows, [=](size_t b, size_t e) {
+    for (auto i = b; i < e; ++i) {
       for (int64_t j = 0; j < length_data[i]; ++j)
         concat_data[offsets_data[i] + j] = array_data[i * stride + j];
     }
+  });
   return std::make_pair(concat, offsets);
 }
@@ -56,16 +59,17 @@ std::tuple<NDArray, IdArray, IdArray> Pack(NDArray array, DType pad_value) {
   IdArray length = NewIdArray(rows, array->ctx);
   int64_t *length_data = static_cast<int64_t *>(length->data);
-#pragma omp parallel for
-  for (int64_t i = 0; i < rows; ++i) {
+  parallel_for(0, rows, [=](size_t b, size_t e) {
+    for (auto i = b; i < e; ++i) {
       int64_t j;
       for (j = 0; j < cols; ++j) {
         const DType val = array_data[i * cols + j];
         if (val == pad_value)
           break;
       }
       length_data[i] = j;
     }
-  }
+  });
   auto ret = ConcatSlices<XPU, DType, int64_t>(array, length);
   return std::make_tuple(ret.first, length, ret.second);
...
@@ -4,6 +4,7 @@
  * \brief Array scatter CPU implementation
  */
 #include <dgl/array.h>
+#include <dgl/runtime/parallel_for.h>
 namespace dgl {
 using runtime::NDArray;
@@ -39,9 +40,11 @@ void Scatter_(IdArray index, NDArray value, NDArray out) {
   const IdType* idx = index.Ptr<IdType>();
   const DType* val = value.Ptr<DType>();
   DType* outd = out.Ptr<DType>();
-#pragma omp parallel for
-  for (int64_t i = 0; i < len; ++i)
+  runtime::parallel_for(0, len, [&](size_t b, size_t e) {
+    for (auto i = b; i < e; ++i) {
       outd[idx[i]] = val[i];
+    }
+  });
 }
 template void Scatter_<kDLCPU, int32_t, int32_t>(IdArray, NDArray, NDArray);
...
@@ -4,6 +4,7 @@
  * \brief Retrieve entries of a CSR matrix
  */
 #include <dgl/array.h>
+#include <dgl/runtime/parallel_for.h>
 #include <vector>
 #include <unordered_set>
 #include <numeric>
@@ -12,7 +13,7 @@
 namespace dgl {
 using runtime::NDArray;
+using runtime::parallel_for;
 namespace aten {
 namespace impl {
@@ -70,35 +71,37 @@ NDArray CSRGetData(
   if (csr.sorted) {
     // use binary search on each row
-#pragma omp parallel for
-    for (int64_t p = 0; p < retlen; ++p) {
+    parallel_for(0, retlen, [&](size_t b, size_t e) {
+      for (auto p = b; p < e; ++p) {
         const IdType row_id = row_data[p * row_stride], col_id = col_data[p * col_stride];
         CHECK(row_id >= 0 && row_id < csr.num_rows) << "Invalid row index: " << row_id;
         CHECK(col_id >= 0 && col_id < csr.num_cols) << "Invalid col index: " << col_id;
         const IdType *start_ptr = indices_data + indptr_data[row_id];
         const IdType *end_ptr = indices_data + indptr_data[row_id + 1];
         auto it = std::lower_bound(start_ptr, end_ptr, col_id);
         if (it != end_ptr && *it == col_id) {
           const IdType idx = it - indices_data;
           IdType eid = data ? data[idx] : idx;
           ret_data[p] = return_eids ? eid : weight_data[eid];
         }
-    }
+      }
+    });
   } else {
     // linear search on each row
-#pragma omp parallel for
-    for (int64_t p = 0; p < retlen; ++p) {
+    parallel_for(0, retlen, [&](size_t b, size_t e) {
+      for (auto p = b; p < e; ++p) {
        const IdType row_id = row_data[p * row_stride], col_id = col_data[p * col_stride];
        CHECK(row_id >= 0 && row_id < csr.num_rows) << "Invalid row index: " << row_id;
        CHECK(col_id >= 0 && col_id < csr.num_cols) << "Invalid col index: " << col_id;
        for (IdType idx = indptr_data[row_id]; idx < indptr_data[row_id + 1]; ++idx) {
          if (indices_data[idx] == col_id) {
            IdType eid = data ? data[idx] : idx;
            ret_data[p] = return_eids ? eid : weight_data[eid];
            break;
          }
        }
-    }
+      }
+    });
   }
   return ret;
 }
...
@@ -5,6 +5,7 @@
  */
 #include <dgl/array.h>
+#include <dgl/runtime/parallel_for.h>
 #include <parallel_hashmap/phmap.h>
 #include <vector>
 #include "array_utils.h"
@@ -12,6 +13,7 @@
 namespace dgl {
 using dgl::runtime::NDArray;
+using dgl::runtime::parallel_for;
 namespace aten {
@@ -26,17 +28,17 @@ void CountNNZPerRow(
     const IdType* B_indices,
     IdType* C_indptr_data,
     int64_t M) {
-  phmap::flat_hash_set<IdType> set;
-#pragma omp parallel for firstprivate(set)
-  for (int64_t i = 0; i < M; ++i) {
-    set.clear();
+  parallel_for(0, M, [=](size_t b, size_t e) {
+    for (auto i = b; i < e; ++i) {
+      phmap::flat_hash_set<IdType> set;
       for (IdType u = A_indptr[i]; u < A_indptr[i + 1]; ++u) {
         IdType w = A_indices[u];
         for (IdType v = B_indptr[w]; v < B_indptr[w + 1]; ++v)
           set.insert(B_indices[v]);
       }
       C_indptr_data[i] = set.size();
     }
-  }
+  });
 }
 template <typename IdType>
@@ -66,27 +68,27 @@ void ComputeIndicesAndData(
     IdType* C_indices_data,
     DType* C_weights_data,
     int64_t M) {
-  phmap::flat_hash_map<IdType, DType> map;
-#pragma omp parallel for firstprivate(map)
-  for (int64_t i = 0; i < M; ++i) {
-    map.clear();
+  parallel_for(0, M, [=](size_t b, size_t e) {
+    for (auto i = b; i < e; ++i) {
+      phmap::flat_hash_map<IdType, DType> map;
       for (IdType u = A_indptr[i]; u < A_indptr[i + 1]; ++u) {
         IdType w = A_indices[u];
         DType vA = A_data[A_eids ? A_eids[u] : u];
         for (IdType v = B_indptr[w]; v < B_indptr[w + 1]; ++v) {
           IdType t = B_indices[v];
           DType vB = B_data[B_eids ? B_eids[v] : v];
           map[t] += vA * vB;
         }
       }
       IdType v = C_indptr_data[i];
       for (auto it : map) {
         C_indices_data[v] = it.first;
         C_weights_data[v] = it.second;
         ++v;
       }
     }
-  }
+  });
 }
 };  // namespace
...
@@ -4,6 +4,7 @@
  * \brief CSR sorting
  */
 #include <dgl/array.h>
+#include <dgl/runtime/parallel_for.h>
 #include <numeric>
 #include <algorithm>
 #include <vector>
@@ -48,16 +49,14 @@ void CSRSort_(CSRMatrix* csr) {
     csr->data = aten::Range(0, nnz, csr->indptr->dtype.bits, csr->indptr->ctx);
   }
   IdType* eid_data = static_cast<IdType*>(csr->data->data);
-#pragma omp parallel
-  {
-    std::vector<ShufflePair> reorder_vec;
-#pragma omp for
-    for (int64_t row = 0; row < num_rows; row++) {
+  runtime::parallel_for(0, num_rows, [=](size_t b, size_t e) {
+    for (auto row = b; row < e; ++row) {
       const int64_t num_cols = indptr_data[row + 1] - indptr_data[row];
+      std::vector<ShufflePair> reorder_vec(num_cols);
       IdType *col = indices_data + indptr_data[row];
       IdType *eid = eid_data + indptr_data[row];
-      reorder_vec.resize(num_cols);
       for (int64_t i = 0; i < num_cols; i++) {
         reorder_vec[i].first = col[i];
         reorder_vec[i].second = eid[i];
@@ -71,7 +70,8 @@ void CSRSort_(CSRMatrix* csr) {
         eid[i] = reorder_vec[i].second;
       }
     }
-  }
+  });
   csr->sorted = true;
 }
@@ -101,37 +101,38 @@ std::pair<CSRMatrix, NDArray> CSRSortByTag(
   auto out_indices_data = static_cast<IdType *>(output.indices->data);
   auto out_eid_data = static_cast<IdType *>(output.data->data);
-#pragma omp parallel for
-  for (IdType src = 0 ; src < num_rows ; ++src) {
+  runtime::parallel_for(0, num_rows, [&](size_t b, size_t e) {
+    for (auto src = b; src < e; ++src) {
       const IdType start = indptr_data[src];
      const IdType end = indptr_data[src + 1];
      auto tag_pos_row = tag_pos_data + src * (num_tags + 1);
      std::vector<IdType> pointer(num_tags, 0);
      for (IdType ptr = start ; ptr < end ; ++ptr) {
        const IdType dst = indices_data[ptr];
        const TagType tag = tag_data[dst];
        CHECK_LT(tag, num_tags);
        ++tag_pos_row[tag + 1];
      }  // count
      for (TagType tag = 1 ; tag <= num_tags; ++tag) {
        tag_pos_row[tag] += tag_pos_row[tag - 1];
      }  // cumulate
      for (IdType ptr = start ; ptr < end ; ++ptr) {
        const IdType dst = indices_data[ptr];
        const IdType eid = eid_data[ptr];
        const TagType tag = tag_data[dst];
        const IdType offset = tag_pos_row[tag] + pointer[tag];
        CHECK_LT(offset, tag_pos_row[tag + 1]);
        ++pointer[tag];
        out_indices_data[start + offset] = dst;
        out_eid_data[start + offset] = eid;
      }
    }
-  }
+  });
   output.sorted = false;
   return std::make_pair(output, tag_pos);
 }
...
@@ -5,6 +5,7 @@
  */
 #include <dgl/array.h>
+#include <dgl/runtime/parallel_for.h>
 #include <parallel_hashmap/phmap.h>
 #include <vector>
 #include "array_utils.h"
@@ -25,16 +26,17 @@ void CountNNZPerRow(
     IdType* C_indptr_data,
     int64_t M) {
   int64_t n = A_indptr.size();
-  phmap::flat_hash_set<IdType> set;
-#pragma omp parallel for firstprivate(set)
-  for (IdType i = 0; i < M; ++i) {
-    set.clear();
+  runtime::parallel_for(0, M, [=](size_t b, size_t e) {
+    for (size_t i = b; i < e; ++i) {
+      phmap::flat_hash_set<IdType> set;
       for (int64_t k = 0; k < n; ++k) {
         for (IdType u = A_indptr[k][i]; u < A_indptr[k][i + 1]; ++u)
           set.insert(A_indices[k][u]);
       }
       C_indptr_data[i] = set.size();
     }
-  }
+  });
 }
 template <typename IdType>
@@ -61,25 +63,24 @@ void ComputeIndicesAndData(
     DType* C_weights_data,
     int64_t M) {
   int64_t n = A_indptr.size();
-  phmap::flat_hash_map<IdType, DType> map;
-#pragma omp parallel for firstprivate(map)
-  for (int64_t i = 0; i < M; ++i) {
-    map.clear();
+  runtime::parallel_for(0, M, [=](size_t b, size_t e) {
+    for (auto i = b; i < e; ++i) {
+      phmap::flat_hash_map<IdType, DType> map;
       for (int64_t k = 0; k < n; ++k) {
         for (IdType u = A_indptr[k][i]; u < A_indptr[k][i + 1]; ++u) {
           IdType kA = A_indices[k][u];
           DType vA = A_data[k][A_eids[k] ? A_eids[k][u] : u];
           map[kA] += vA;
         }
       }
       IdType j = C_indptr_data[i];
       for (auto it : map) {
         C_indices_data[j] = it.first;
         C_weights_data[j] = it.second;
         ++j;
       }
     }
-  }
+  });
 }
 };  // namespace
...
@@ -4,7 +4,7 @@
  * \brief COO sorting
  */
 #include <dgl/array.h>
+#include <dgl/runtime/parallel_for.h>
 #include <numeric>
 #include <algorithm>
 #include <vector>
@@ -54,6 +54,7 @@ CSRMatrix UnionCsr(const std::vector<CSRMatrix>& csrs) {
   for (int64_t i = 1; i <= csrs[0].num_rows; ++i) {
     std::vector<int64_t> indices_off;
     res_indptr[i] = indptr_data[0][i];
     indices_off.push_back(indptr_data[0][i-1]);
     for (size_t j = 1; j < csrs.size(); ++j) {
       res_indptr[i] += indptr_data[j][i];
@@ -74,7 +75,6 @@ CSRMatrix UnionCsr(const std::vector<CSRMatrix>& csrs) {
       }
     }  // for check out of bound
   }  // for
-
   res_indices[off] = min;
   res_data[off] = data_data[min_idx][indices_off[min_idx]];
   indices_off[min_idx] += 1;
...
@@ -147,6 +147,7 @@ COOMatrix CSRRowWisePick(CSRMatrix mat, IdArray rows,
         global_prefix[t+1] += global_prefix[t];
       }
     }
+#pragma omp barrier
     const IdxType thread_offset = global_prefix[thread_id];
...
@@ -8,6 +8,7 @@
 #include <dgl/array.h>
 #include <dgl/bcast.h>
+#include <dgl/runtime/parallel_for.h>
 #include "../selector.h"
 namespace dgl {
@@ -40,24 +41,27 @@ void SDDMMCsr(const BcastOff& bcast,
           rhs_dim = bcast.rhs_len,
           reduce_size = bcast.reduce_size;
   DType* O = out.Ptr<DType>();
-#pragma omp parallel for
-  for (IdType rid = 0; rid < csr.num_rows; ++rid) {
+  runtime::parallel_for(0, csr.num_rows, [=](IdType b, IdType e) {
+    for (auto rid = b; rid < e; ++rid) {
       const IdType row_start = indptr[rid], row_end = indptr[rid + 1];
       for (IdType j = row_start; j < row_end; ++j) {
         const IdType cid = indices[j];
         const IdType eid = has_idx? edges[j] : j;
         DType* out_off = O + eid * dim;
         for (int64_t k = 0; k < dim; ++k) {
           const int64_t lhs_add = bcast.use_bcast ? bcast.lhs_offset[k] : k;
           const int64_t rhs_add = bcast.use_bcast ? bcast.rhs_offset[k] : k;
-          const DType* lhs_off = Op::use_lhs?
-            X + Selector<LhsTarget>::Call(rid, eid, cid) * lhs_dim + lhs_add * reduce_size : nullptr;
-          const DType* rhs_off = Op::use_rhs?
-            Y + Selector<RhsTarget>::Call(rid, eid, cid) * rhs_dim + rhs_add * reduce_size : nullptr;
-          out_off[k] = Op::Call(lhs_off, rhs_off, reduce_size);
+          const DType* lhs_off = Op::use_lhs
+            ? X + Selector<LhsTarget>::Call(rid, eid, cid) * lhs_dim + lhs_add * reduce_size
+            : nullptr;
+          const DType* rhs_off = Op::use_rhs
+            ? Y + Selector<RhsTarget>::Call(rid, eid, cid) * rhs_dim + rhs_add * reduce_size
+            : nullptr;
+          out_off[k] = Op::Call(lhs_off, rhs_off, reduce_size);
+        }
       }
     }
-  }
+  });
 }
 /*!
@@ -86,9 +90,8 @@ void SDDMMCoo(const BcastOff& bcast,
           rhs_dim = bcast.rhs_len,
           reduce_size = bcast.reduce_size;
   DType* O = out.Ptr<DType>();
-  const int64_t nnz = coo.row->shape[0];
 #pragma omp parallel for
-  for (IdType i = 0; i < nnz; ++i) {
+  for (int64_t i = 0; i < coo.row->shape[0]; ++i) {
     const IdType rid = row[i];
     const IdType cid = col[i];
     const IdType eid = has_idx? edges[i] : i;
...
@@ -7,6 +7,7 @@
 #define DGL_ARRAY_CPU_SEGMENT_REDUCE_H_
 #include <dgl/array.h>
+#include <dgl/runtime/parallel_for.h>
 namespace dgl {
 namespace aten {
@@ -27,14 +28,15 @@ void SegmentSum(NDArray feat, NDArray offsets, NDArray out) {
   const DType* feat_data = feat.Ptr<DType>();
   const IdType* offsets_data = offsets.Ptr<IdType>();
   DType *out_data = out.Ptr<DType>();
-#pragma omp parallel for
-  for (int i = 0; i < n; ++i) {
+  runtime::parallel_for(0, n, [=](int b, int e) {
+    for (auto i = b; i < e; ++i) {
       for (IdType j = offsets_data[i]; j < offsets_data[i + 1]; ++j) {
         for (int k = 0; k < dim; ++k) {
           out_data[i * dim + k] += feat_data[j * dim + k];
         }
       }
     }
-  }
+  });
 }
 /*!
@@ -58,18 +60,19 @@ void SegmentCmp(NDArray feat, NDArray offsets,
   IdType *arg_data = arg.Ptr<IdType>();
   std::fill(out_data, out_data + out.NumElements(), Cmp::zero);
   std::fill(arg_data, arg_data + arg.NumElements(), -1);
-#pragma omp parallel for
-  for (int i = 0; i < n; ++i) {
+  runtime::parallel_for(0, n, [=](int b, int e) {
+    for (auto i = b; i < e; ++i) {
      for (IdType j = offsets_data[i]; j < offsets_data[i + 1]; ++j) {
        for (int k = 0; k < dim; ++k) {
          const DType val = feat_data[j * dim + k];
          if (Cmp::Call(out_data[i * dim + k], val)) {
            out_data[i * dim + k] = val;
            arg_data[i * dim + k] = j;
          }
        }
      }
    }
-  }
+  });
 }
 /*!
@@ -114,14 +117,15 @@ void BackwardSegmentCmp(NDArray feat, NDArray arg, NDArray out) {
   const DType* feat_data = feat.Ptr<DType>();
   const IdType* arg_data = arg.Ptr<IdType>();
   DType* out_data = out.Ptr<DType>();
-#pragma omp parallel for
-  for (int i = 0; i < n; ++i) {
+  runtime::parallel_for(0, n, [=](int b, int e) {
+    for (auto i = b; i < e; ++i) {
      for (int k = 0; k < dim; ++k) {
        int write_row = arg_data[i * dim + k];
        if (write_row >= 0)
          out_data[write_row * dim + k] = feat_data[i * dim + k];
      }
    }
-  }
+  });
 }
 }  // namespace cpu
...
@@ -4,6 +4,7 @@
  * \brief CPU implementation of COO sparse matrix operators
  */
 #include <dmlc/omp.h>
+#include <dgl/runtime/parallel_for.h>
 #include <vector>
 #include <unordered_set>
 #include <unordered_map>
@@ -14,6 +15,7 @@
 namespace dgl {
 using runtime::NDArray;
+using runtime::parallel_for;
 namespace aten {
 namespace impl {
@@ -55,12 +57,13 @@ NDArray COOIsNonZero(COOMatrix coo, NDArray row, NDArray col) {
   const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1;
   const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1;
   const int64_t kmax = std::max(rowlen, collen);
-#pragma omp parallel for
-  for (int64_t k = 0; k < kmax; ++k) {
+  parallel_for(0, kmax, [=](size_t b, size_t e) {
+    for (auto k = b; k < e; ++k) {
       int64_t i = row_stride * k;
      int64_t j = col_stride * k;
      rst_data[k] = COOIsNonZero<XPU, IdType>(coo, row_data[i], col_data[j])? 1 : 0;
    }
+  });
   return rst;
 }
@@ -114,8 +117,9 @@ NDArray COOGetRowNNZ(COOMatrix coo, NDArray rows) {
   NDArray rst = NDArray::Empty({len}, rows->dtype, rows->ctx);
   IdType* rst_data = static_cast<IdType*>(rst->data);
 #pragma omp parallel for
-  for (int64_t i = 0; i < len; ++i)
+  for (int64_t i = 0; i < len; ++i) {
     rst_data[i] = COOGetRowNNZ<XPU, IdType>(coo, vid_data[i]);
+  }
   return rst;
 }
@@ -178,18 +182,19 @@ IdArray COOGetData(COOMatrix coo, IdArray rows, IdArray cols) {
   // the choice.
   if (coo.row_sorted) {
-#pragma omp parallel for
-    for (int64_t p = 0; p < retlen; ++p) {
+    parallel_for(0, retlen, [&](size_t b, size_t e) {
+      for (auto p = b; p < e; ++p) {
        const IdType row_id = row_data[p * row_stride], col_id = col_data[p * col_stride];
        auto it = std::lower_bound(coo_row, coo_row + nnz, row_id);
        for (; it < coo_row + nnz && *it == row_id; ++it) {
          const auto idx = it - coo_row;
          if (coo_col[idx] == col_id) {
            ret_data[p] = data? data[idx] : idx;
            break;
          }
        }
      }
-    }
+    });
   } else {
 #pragma omp parallel for
     for (int64_t p = 0; p < retlen; ++p) {
@@ -328,67 +333,66 @@ CSRMatrix COOToCSR(COOMatrix coo) {
   IdType * const fill_data = data ? nullptr : static_cast<IdType*>(coo.data->data);
   if (NNZ > 0) {
-#pragma omp parallel
-    {
-      const int num_threads = omp_get_num_threads();
-      const int thread_id = omp_get_thread_num();
+    auto num_threads = omp_get_max_threads();
+    parallel_for(0, num_threads, [&](int b, int e) {
+      for (auto thread_id = b; thread_id < e; ++thread_id) {
        // We partition the set the of non-zeros among the threads
        const int64_t nz_chunk = (NNZ+num_threads-1)/num_threads;
        const int64_t nz_start = thread_id*nz_chunk;
        const int64_t nz_end = std::min(NNZ, nz_start+nz_chunk);
        // Each thread searchs the row array for a change, and marks it's
        // location in Bp. Threads, other than the first, start at the last
        // index covered by the previous, in order to detect changes in the row
        // array between thread partitions. This means that each thread after
        // the first, searches the range [nz_start-1, nz_end). That is,
        // if we had 10 non-zeros, and 4 threads, the indexes searched by each
        // thread would be:
        // 0: [0, 1, 2]
        // 1: [2, 3, 4, 5]
        // 2: [5, 6, 7, 8]
        // 3: [8, 9]
        //
        // That way, if the row array were [0, 0, 1, 2, 2, 2, 4, 5, 5, 6], each
        // change in row would be captured by one thread:
        //
        // 0: [0, 0, 1] - row 0
        // 1: [1, 2, 2, 2] - row 1
        // 2: [2, 4, 5, 5] - rows 2, 3, and 4
        // 3: [5, 6] - rows 5 and 6
        int64_t row = 0;
        if (nz_start < nz_end) {
          row = nz_start == 0 ? 0 : row_data[nz_start-1];
          for (int64_t i = nz_start; i < nz_end; ++i) {
            while (row != row_data[i]) {
              ++row;
              Bp[row] = i;
            }
          }
          // We will not detect the row change for the last row, nor any empty
          // rows at the end of the matrix, so the last active thread needs
          // mark all remaining rows in Bp with NNZ.
          if (nz_end == NNZ) {
            while (row < N) {
              ++row;
              Bp[row] = NNZ;
            }
          }
          if (fill_data) {
            // TODO(minjie): Many of our current implementation assumes that CSR must have
            // a data array. This is a temporary workaround. Remove this after:
            // - The old immutable graph implementation is deprecated.
            // - The old binary reduce kernel is deprecated.
            std::iota(fill_data+nz_start,
                      fill_data+nz_end,
                      nz_start);
          }
        }
-    }
+      }
+    });
   } else {
     std::fill(Bp, Bp+N+1, 0);
   }
@@ -627,11 +631,12 @@ COOMatrix COOReorder(COOMatrix coo, runtime::NDArray new_row_id_arr,
   IdType *out_row = static_cast<IdType*>(out_row_arr->data);
   IdType *out_col = static_cast<IdType*>(out_col_arr->data);
-#pragma omp parallel for
-  for (int64_t i = 0; i < nnz; i++) {
+  parallel_for(0, nnz, [=](size_t b, size_t e) {
+    for (auto i = b; i < e; ++i) {
       out_row[i] = new_row_ids[in_rows[i]];
      out_col[i] = new_col_ids[in_cols[i]];
    }
+  });
   return COOMatrix(num_rows, num_cols, out_row_arr, out_col_arr, out_data_arr);
 }
...
@@ -4,6 +4,7 @@
  * \brief CSR matrix operator CPU implementation
  */
 #include <dgl/array.h>
+#include <dgl/runtime/parallel_for.h>
 #include <vector>
 #include <unordered_set>
 #include <numeric>
@@ -12,6 +13,7 @@
 namespace dgl {
 using runtime::NDArray;
+using runtime::parallel_for;
 namespace aten {
 namespace impl {
@@ -491,11 +493,12 @@ CSRMatrix CSRReorder(CSRMatrix csr, runtime::NDArray new_row_id_arr,
   // Compute the length of rows for the new matrix.
   std::vector<IdType> new_row_lens(num_rows, -1);
-#pragma omp parallel for
-  for (int64_t i = 0; i < num_rows; i++) {
+  parallel_for(0, num_rows, [=, &new_row_lens](size_t b, size_t e) {
+    for (auto i = b; i < e; ++i) {
       int64_t new_row_id = new_row_ids[i];
      new_row_lens[new_row_id] = in_indptr[i + 1] - in_indptr[i];
    }
+  });
   // Compute the starting location of each row in the new matrix.
   out_indptr[0] = 0;
   // This is sequential. It should be pretty fast.
@@ -506,23 +509,24 @@ CSRMatrix CSRReorder(CSRMatrix csr, runtime::NDArray new_row_id_arr,
   CHECK_EQ(out_indptr[num_rows], nnz);
   // Copy indieces and data with the new order.
   // Here I iterate rows in the order of the old matrix.
-#pragma omp parallel for
-  for (int64_t i = 0; i < num_rows; i++) {
+  parallel_for(0, num_rows, [=](size_t b, size_t e) {
+    for (auto i = b; i < e; ++i) {
      const IdType *in_row = in_indices + in_indptr[i];
      const IdType *in_row_data = in_data + in_indptr[i];
      int64_t new_row_id = new_row_ids[i];
      IdType *out_row = out_indices + out_indptr[new_row_id];
      IdType *out_row_data = out_data + out_indptr[new_row_id];
      int64_t row_len = new_row_lens[new_row_id];
      // Here I iterate col indices in a row in the order of the old matrix.
      for (int64_t j = 0; j < row_len; j++) {
        out_row[j] = new_col_ids[in_row[j]];
        out_row_data[j] = in_row_data[j];
      }
      // TODO(zhengda) maybe we should sort the column indices.
    }
-  }
+  });
   return CSRMatrix(num_rows, num_cols,
                    out_indptr_arr, out_indices_arr, out_data_arr);
 }
...
@@ -8,6 +8,7 @@
 #include <dgl/array.h>
 #include <dgl/bcast.h>
+#include <dgl/runtime/parallel_for.h>
 #include <algorithm>
 #include <limits>
 #include <memory>
@@ -46,16 +47,18 @@ void SpMMSumCsrXbyak(dgl::ElemWiseAddUpdate<Op>* cpu_spec, const BcastOff& bcast
   const IdType* indices = csr.indices.Ptr<IdType>();
   const IdType* edges = csr.data.Ptr<IdType>();
   int64_t dim = bcast.out_len, lhs_dim = bcast.lhs_len, rhs_dim = bcast.rhs_len;
-#pragma omp parallel for
-  for (IdType rid = 0; rid < csr.num_rows; ++rid) {
+  runtime::parallel_for(0, csr.num_rows, [&](size_t b, size_t e) {
+    for (auto rid = b; rid < e; ++rid) {
      const IdType row_start = indptr[rid], row_end = indptr[rid + 1];
      DType* out_off = O + rid * dim;
      for (IdType j = row_start; j < row_end; ++j) {
        const IdType cid = indices[j];
        const IdType eid = has_idx ? edges[j] : j;
        cpu_spec->run(out_off, X + cid * lhs_dim, W + eid * rhs_dim, dim);
      }
    }
-  }
+  });
 }
 #endif  // USE_AVX
 #endif  // _WIN32
@@ -79,24 +82,25 @@ void SpMMSumCsrNaive(const BcastOff& bcast, const CSRMatrix& csr, const DType* X
   const IdType* indices = csr.indices.Ptr<IdType>();
   const IdType* edges = csr.data.Ptr<IdType>();
   int64_t dim = bcast.out_len, lhs_dim = bcast.lhs_len, rhs_dim = bcast.rhs_len;
-#pragma omp parallel for
-  for (IdType rid = 0; rid < csr.num_rows; ++rid) {
+  runtime::parallel_for(0, csr.num_rows, [&](size_t b, size_t e) {
+    for (auto rid = b; rid < e; ++rid) {
      const IdType row_start = indptr[rid], row_end = indptr[rid + 1];
      DType* out_off = O + rid * dim;
      for (IdType j = row_start; j < row_end; ++j) {
        const IdType cid = indices[j];
        const IdType eid = has_idx ? edges[j] : j;
        for (int64_t k = 0; k < dim; ++k) {
          const int64_t lhs_add = bcast.use_bcast ? bcast.lhs_offset[k] : k;
          const int64_t rhs_add = bcast.use_bcast ? bcast.rhs_offset[k] : k;
          const DType* lhs_off =
            Op::use_lhs ? X + cid * lhs_dim + lhs_add : nullptr;
          const DType* rhs_off =
            Op::use_rhs ? W + eid * rhs_dim + rhs_add : nullptr;
          out_off[k] += Op::Call(lhs_off, rhs_off);
        }
      }
    }
-  }
+  });
 }
 /*!
@@ -270,31 +274,32 @@ void SpMMCmpCsr(const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat,
 #endif  // USE_AVX
 #endif  // _WIN32
-#pragma omp parallel for
-  for (IdType rid = 0; rid < csr.num_rows; ++rid) {
+  runtime::parallel_for(0, csr.num_rows, [&](size_t b, size_t e) {
+    for (auto rid = b; rid < e; ++rid) {
      const IdType row_start = indptr[rid], row_end = indptr[rid + 1];
      DType* out_off = O + rid * dim;
      IdType* argx_off = argX + rid * dim;
      IdType* argw_off = argW + rid * dim;
      for (IdType j = row_start; j < row_end; ++j) {
        const IdType cid = indices[j];
        const IdType eid = has_idx ? edges[j] : j;
        for (int64_t k = 0; k < dim; ++k) {
          const int64_t lhs_add = bcast.use_bcast ? bcast.lhs_offset[k] : k;
          const int64_t rhs_add = bcast.use_bcast ? bcast.rhs_offset[k] : k;
          const DType* lhs_off =
            Op::use_lhs ? X + cid * lhs_dim + lhs_add : nullptr;
          const DType* rhs_off =
            Op::use_rhs ? W + eid * rhs_dim + rhs_add : nullptr;
          const DType val = Op::Call(lhs_off, rhs_off);
          if (Cmp::Call(out_off[k], val)) {
            out_off[k] = val;
            if (Op::use_lhs) argx_off[k] = cid;
            if (Op::use_rhs) argw_off[k] = eid;
          }
        }
      }
    }
-  }
+  });
 }
 #if !defined(_WIN32)
 #ifdef USE_AVX
 #ifdef USE_LIBXSMM
...
@@ -8,6 +8,7 @@
 #include <dgl/immutable_graph.h>
 #include <dgl/packed_func_ext.h>
 #include <dgl/runtime/container.h>
+#include <dgl/runtime/parallel_for.h>
 #include <algorithm>
 #include "../c_api_common.h"
@@ -261,34 +262,36 @@ IdArray GraphOp::MapParentIdToSubgraphId(IdArray parent_vids, IdArray query) {
   const bool is_sorted = std::is_sorted(parent_data, parent_data + parent_len);
   if (is_sorted) {
-#pragma omp parallel for
-    for (int64_t i = 0; i < query_len; i++) {
+    runtime::parallel_for(0, query_len, [&](size_t b, size_t e) {
+      for (auto i = b; i < e; ++i) {
        const dgl_id_t id = query_data[i];
        const auto it = std::find(parent_data, parent_data + parent_len, id);
        // If the vertex Id doesn't exist, the vid in the subgraph is -1.
        if (it != parent_data + parent_len) {
          rst_data[i] = it - parent_data;
        } else {
          rst_data[i] = -1;
        }
      }
-    }
+    });
   } else {
     std::unordered_map<dgl_id_t, dgl_id_t> parent_map;
     for (int64_t i = 0; i < parent_len; i++) {
       const dgl_id_t id = parent_data[i];
       parent_map[id] = i;
     }
-#pragma omp parallel for
-    for (int64_t i = 0; i < query_len; i++) {
+    runtime::parallel_for(0, query_len, [&](size_t b, size_t e) {
+      for (auto i = b; i < e; ++i) {
        const dgl_id_t id = query_data[i];
        auto it = parent_map.find(id);
        // If the vertex Id doesn't exist, the vid in the subgraph is -1.
        if (it != parent_map.end()) {
          rst_data[i] = it->second;
        } else {
          rst_data[i] = -1;
        }
      }
-    }
+    });
   }
   return rst;
 }
@@ -567,14 +570,15 @@ DGL_REGISTER_GLOBAL("transform._CAPI_DGLPartitionWithHalo")
     graph_ptr->GetInCSR();
     std::vector<std::shared_ptr<HaloSubgraph> > subgs(max_part_id + 1);
     int num_partitions = part_nodes.size();
-#pragma omp parallel for
-    for (int i = 0; i < num_partitions; i++) {
+    runtime::parallel_for(0, num_partitions, [&](size_t b, size_t e) {
+      for (auto i = b; i < e; ++i) {
        auto nodes = aten::VecToIdArray(part_nodes[i]);
        HaloSubgraph subg = GraphOp::GetSubgraphWithHalo(graph_ptr, nodes, num_hops);
        std::shared_ptr<HaloSubgraph> subg_ptr(new HaloSubgraph(subg));
        int part_id = part_ids[i];
        subgs[part_id] = subg_ptr;
      }
+    });
     List<SubgraphRef> ret_list;
     for (size_t i = 0; i < subgs.size(); i++) {
       ret_list.push_back(SubgraphRef(subgs[i]));
@@ -732,24 +736,25 @@ IdArray MapIds(IdArray ids, IdArray range_starts, IdArray range_ends, IdArray ty
   const IdType *typed_map_data = static_cast<IdType *>(typed_map->data);
   IdType *types_data = static_cast<IdType *>(ret->data);
   IdType *per_type_ids_data = static_cast<IdType *>(ret->data) + num_ids;
-#pragma omp parallel for
-  for (int64_t i = 0; i < ids->shape[0]; i++) {
+  runtime::parallel_for(0, ids->shape[0], [&](size_t b, size_t e) {
+    for (auto i = b; i < e; ++i) {
      IdType id = ids_data[i];
      auto it = std::lower_bound(range_end_data, range_end_data + num_ranges, id);
      // The range must exist.
      BUG_IF_FAIL(it != range_end_data + num_ranges);
      size_t range_id = it - range_end_data;
      int type_id = range_id % num_types;
      types_data[i] = type_id;
      int part_id = range_id / num_types;
      BUG_IF_FAIL(part_id < num_parts);
      if (part_id == 0) {
        per_type_ids_data[i] = id - range_start_data[range_id];
      } else {
        per_type_ids_data[i] = id - range_start_data[range_id]
            + typed_map_data[num_parts * type_id + part_id - 1];
      }
    }
-  }
+  });
   return ret;
 }
...
@@ -8,6 +8,7 @@
 #include <dgl/packed_func_ext.h>
 #include <dgl/immutable_graph.h>
 #include <dgl/runtime/container.h>
+#include <dgl/runtime/parallel_for.h>
 #include <set>
 #include "../c_api_common.h"
@@ -629,14 +630,19 @@ DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroCreateFormat")
 .set_body([] (DGLArgs args, DGLRetValue* rv) {
     HeteroGraphRef hg = args[0];
     dgl_format_code_t code = hg->GetRelationGraph(0)->GetAllowedFormats();
-#if !defined(DGL_USE_CUDA)
-#pragma omp parallel for
-#endif
-    for (int64_t etype = 0; etype < hg->NumEdgeTypes(); ++etype) {
-      auto bg = std::dynamic_pointer_cast<UnitGraph>(hg->GetRelationGraph(etype));
-      for (auto format : CodeToSparseFormats(code))
-        bg->GetFormat(format);
-    }
+    auto get_format_f = [&](size_t etype_b, size_t etype_e) {
+      for (auto etype = etype_b; etype < etype_e; ++etype) {
+        auto bg = std::dynamic_pointer_cast<UnitGraph>(hg->GetRelationGraph(etype));
+        for (auto format : CodeToSparseFormats(code))
+          bg->GetFormat(format);
+      }
+    };
+#if !(defined(DGL_USE_CUDA))
+    runtime::parallel_for(0, hg->NumEdgeTypes(), get_format_f);
+#else
+    get_format_f(0, hg->NumEdgeTypes());
+#endif
   });
 DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroGetFormatGraph")
...
@@ -9,6 +9,7 @@
 #include <dgl/runtime/container.h>
 #include <dgl/runtime/ndarray.h>
+#include <dgl/runtime/parallel_for.h>
 #include <dgl/packed_func_ext.h>
 #include <dgl/immutable_graph.h>
 #include <dgl/nodeflow.h>
@@ -829,15 +830,16 @@ DGL_REGISTER_GLOBAL("network._CAPI_FastPull")
       char *return_data = new char[ID_size*row_size];
       const int64_t local_ids_size = local_ids.size();
       // Copy local data
-#pragma omp parallel for
-      for (int64_t i = 0; i < local_ids_size; ++i) {
+      runtime::parallel_for(0, local_ids_size, [&](size_t b, size_t e) {
+        for (auto i = b; i < e; ++i) {
          CHECK_GE(ID_size*row_size, local_ids_orginal[i] * row_size + row_size);
          CHECK_GE(data_size, local_ids[i] * row_size + row_size);
          CHECK_GE(local_ids[i], 0);
          memcpy(return_data + local_ids_orginal[i] * row_size,
                 local_data_char + local_ids[i] * row_size,
                 row_size);
        }
+      });
       // Recv remote message
       for (int i = 0; i < msg_count; ++i) {
         KVStoreMsg *kv_msg = recv_kv_message(receiver);
...
...@@ -9,6 +9,7 @@ ...@@ -9,6 +9,7 @@
#include <dgl/runtime/container.h> #include <dgl/runtime/container.h>
#include <dgl/packed_func_ext.h> #include <dgl/packed_func_ext.h>
#include <dgl/random.h> #include <dgl/random.h>
#include <dgl/runtime/parallel_for.h>
#include <dmlc/omp.h> #include <dmlc/omp.h>
#include <algorithm> #include <algorithm>
#include <cstdlib> #include <cstdlib>
...@@ -850,19 +851,20 @@ std::vector<NodeFlow> NeighborSamplingImpl(const ImmutableGraphPtr gptr, ...@@ -850,19 +851,20 @@ std::vector<NodeFlow> NeighborSamplingImpl(const ImmutableGraphPtr gptr,
BuildCsr(*gptr, neigh_type); BuildCsr(*gptr, neigh_type);
// generate node flows // generate node flows
std::vector<NodeFlow> nflows(num_workers); std::vector<NodeFlow> nflows(num_workers);
#pragma omp parallel for runtime::parallel_for(0, num_workers, [&](size_t b, size_t e) {
for (int i = 0; i < num_workers; i++) { for (auto i = b; i < e; ++i) {
// create per-worker seed nodes. // create per-worker seed nodes.
const int64_t start = (batch_start_id + i) * batch_size; const int64_t start = (batch_start_id + i) * batch_size;
const int64_t end = std::min(start + batch_size, num_seeds); const int64_t end = std::min(start + batch_size, num_seeds);
// TODO(minjie): the vector allocation/copy is unnecessary // TODO(minjie): the vector allocation/copy is unnecessary
std::vector<dgl_id_t> worker_seeds(end - start); std::vector<dgl_id_t> worker_seeds(end - start);
std::copy(seed_nodes_data + start, seed_nodes_data + end, std::copy(seed_nodes_data + start, seed_nodes_data + end,
worker_seeds.begin()); worker_seeds.begin());
nflows[i] = SamplerOp::NeighborSample( nflows[i] = SamplerOp::NeighborSample(
gptr.get(), worker_seeds, neigh_type, num_hops, expand_factor, gptr.get(), worker_seeds, neigh_type, num_hops, expand_factor,
add_self_loop, probability); add_self_loop, probability);
} }
});
return nflows; return nflows;
} }
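As with the OpenMP version it replaces, the rewritten neighbor-sampling loop writes each result only to its own slot nflows[i], so the pre-sized vector can be filled from several threads without locking; the same shape appears again in the layer-sampling hunk below. A reduced sketch of that pattern, where Result and DoWork are stand-ins for NodeFlow and the actual sampling call:

#include <cstdint>
#include <vector>
#include <dgl/runtime/parallel_for.h>

struct Result { int64_t value; };  // stand-in for NodeFlow
Result DoWork(size_t i) { return Result{static_cast<int64_t>(i)}; }  // stand-in for the sampler

std::vector<Result> RunWorkers(size_t num_workers) {
  std::vector<Result> out(num_workers);
  dgl::runtime::parallel_for(0, num_workers, [&](size_t b, size_t e) {
    // The index ranges handed to each thread are disjoint, so every out[i]
    // has exactly one writer and no synchronization is required.
    for (auto i = b; i < e; ++i)
      out[i] = DoWork(i);
  });
  return out;
}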
...@@ -977,18 +979,19 @@ DGL_REGISTER_GLOBAL("sampling._CAPI_LayerSampling") ...@@ -977,18 +979,19 @@ DGL_REGISTER_GLOBAL("sampling._CAPI_LayerSampling")
BuildCsr(*gptr, neigh_type); BuildCsr(*gptr, neigh_type);
// generate node flows // generate node flows
std::vector<NodeFlow> nflows(num_workers); std::vector<NodeFlow> nflows(num_workers);
#pragma omp parallel for runtime::parallel_for(0, num_workers, [&](size_t b, size_t e) {
for (int i = 0; i < num_workers; i++) { for (auto i = b; i < e; ++i) {
// create per-worker seed nodes. // create per-worker seed nodes.
const int64_t start = (batch_start_id + i) * batch_size; const int64_t start = (batch_start_id + i) * batch_size;
const int64_t end = std::min(start + batch_size, num_seeds); const int64_t end = std::min(start + batch_size, num_seeds);
// TODO(minjie): the vector allocation/copy is unnecessary // TODO(minjie): the vector allocation/copy is unnecessary
std::vector<dgl_id_t> worker_seeds(end - start); std::vector<dgl_id_t> worker_seeds(end - start);
std::copy(seed_nodes_data + start, seed_nodes_data + end, std::copy(seed_nodes_data + start, seed_nodes_data + end,
worker_seeds.begin()); worker_seeds.begin());
nflows[i] = SamplerOp::LayerUniformSample( nflows[i] = SamplerOp::LayerUniformSample(
gptr.get(), worker_seeds, neigh_type, layer_sizes); gptr.get(), worker_seeds, neigh_type, layer_sizes);
} }
});
*rv = List<NodeFlow>(nflows); *rv = List<NodeFlow>(nflows);
}); });
...@@ -1466,54 +1469,55 @@ public: ...@@ -1466,54 +1469,55 @@ public:
std::vector<SubgraphRef> positive_subgs(num_workers); std::vector<SubgraphRef> positive_subgs(num_workers);
std::vector<SubgraphRef> negative_subgs(num_workers); std::vector<SubgraphRef> negative_subgs(num_workers);
#pragma omp parallel for runtime::parallel_for(0, num_workers, [&](size_t b, size_t e) {
for (int64_t i = 0; i < num_workers; i++) { for (auto i = b; i < e; ++i) {
const int64_t start = (batch_curr_id_ + i) * batch_size_; const int64_t start = (batch_curr_id_ + i) * batch_size_;
const int64_t end = std::min(start + batch_size_, num_seeds_); const int64_t end = std::min(start + batch_size_, num_seeds_);
const int64_t num_edges = end - start; const int64_t num_edges = end - start;
IdArray worker_seeds; IdArray worker_seeds;
if (replacement_ == false) { if (replacement_ == false) {
worker_seeds = seed_edges_.CreateView({num_edges}, DLDataType{kDLInt, 64, 1}, worker_seeds = seed_edges_.CreateView({num_edges}, DLDataType{kDLInt, 64, 1},
sizeof(dgl_id_t) * start); sizeof(dgl_id_t) * start);
} else { } else {
std::vector<dgl_id_t> seeds; std::vector<dgl_id_t> seeds;
const dgl_id_t *seed_edge_ids = static_cast<const dgl_id_t *>(seed_edges_->data); const dgl_id_t *seed_edge_ids = static_cast<const dgl_id_t *>(seed_edges_->data);
// sampling of each edge is a standalone event // sampling of each edge is a standalone event
for (int64_t i = 0; i < num_edges; ++i) { for (int64_t i = 0; i < num_edges; ++i) {
int64_t seed = static_cast<const int64_t>( int64_t seed = static_cast<const int64_t>(
RandomEngine::ThreadLocal()->RandInt(num_seeds_)); RandomEngine::ThreadLocal()->RandInt(num_seeds_));
seeds.push_back(seed_edge_ids[seed]); seeds.push_back(seed_edge_ids[seed]);
} }
worker_seeds = aten::VecToIdArray(seeds, seed_edges_->dtype.bits); worker_seeds = aten::VecToIdArray(seeds, seed_edges_->dtype.bits);
} }
EdgeArray arr = gptr_->FindEdges(worker_seeds); EdgeArray arr = gptr_->FindEdges(worker_seeds);
const dgl_id_t *src_ids = static_cast<const dgl_id_t *>(arr.src->data); const dgl_id_t *src_ids = static_cast<const dgl_id_t *>(arr.src->data);
const dgl_id_t *dst_ids = static_cast<const dgl_id_t *>(arr.dst->data); const dgl_id_t *dst_ids = static_cast<const dgl_id_t *>(arr.dst->data);
std::vector<dgl_id_t> src_vec(src_ids, src_ids + num_edges); std::vector<dgl_id_t> src_vec(src_ids, src_ids + num_edges);
std::vector<dgl_id_t> dst_vec(dst_ids, dst_ids + num_edges); std::vector<dgl_id_t> dst_vec(dst_ids, dst_ids + num_edges);
// TODO(zhengda) what if there are duplicates in the src and dst vectors. // TODO(zhengda) what if there are duplicates in the src and dst vectors.
Subgraph subg = gptr_->EdgeSubgraph(worker_seeds, false); Subgraph subg = gptr_->EdgeSubgraph(worker_seeds, false);
positive_subgs[i] = ConvertRef(subg); positive_subgs[i] = ConvertRef(subg);
// For chunked negative sampling, we accept "chunk-head" for corrupting head // For chunked negative sampling, we accept "chunk-head" for corrupting head
// nodes and "chunk-tail" for corrupting tail nodes. // nodes and "chunk-tail" for corrupting tail nodes.
if (neg_mode_.substr(0, 5) == "chunk") { if (neg_mode_.substr(0, 5) == "chunk") {
NegSubgraph neg_subg = genChunkedNegEdgeSubgraph(subg, neg_mode_.substr(6), NegSubgraph neg_subg = genChunkedNegEdgeSubgraph(subg, neg_mode_.substr(6),
neg_sample_size_, neg_sample_size_,
exclude_positive_, exclude_positive_,
check_false_neg_); check_false_neg_);
negative_subgs[i] = ConvertRef(neg_subg); negative_subgs[i] = ConvertRef(neg_subg);
} else if (neg_mode_ == "head" || neg_mode_ == "tail") { } else if (neg_mode_ == "head" || neg_mode_ == "tail") {
NegSubgraph neg_subg = genNegEdgeSubgraph(subg, neg_mode_, NegSubgraph neg_subg = genNegEdgeSubgraph(subg, neg_mode_,
neg_sample_size_, neg_sample_size_,
exclude_positive_, exclude_positive_,
check_false_neg_); check_false_neg_);
negative_subgs[i] = ConvertRef(neg_subg); negative_subgs[i] = ConvertRef(neg_subg);
} }
} }
});
if (neg_mode_.size() > 0) { if (neg_mode_.size() > 0) {
positive_subgs.insert(positive_subgs.end(), negative_subgs.begin(), negative_subgs.end()); positive_subgs.insert(positive_subgs.end(), negative_subgs.begin(), negative_subgs.end());
} }
......
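The with-replacement branch above keeps drawing random seeds inside the loop body via RandomEngine::ThreadLocal(), which remains safe under parallel_for because every worker thread sees its own engine, just as it did under OpenMP. A stripped-down sketch of with-replacement sampling inside a range lambda, using a thread_local engine from <random> in place of DGL's RandomEngine (the names below are illustrative only):

#include <cstdint>
#include <random>
#include <vector>
#include <dgl/runtime/parallel_for.h>

// Sketch: draw num_draws ids uniformly with replacement from [0, num_ids).
// A thread_local engine plays the role of RandomEngine::ThreadLocal(), so
// chunks running on different threads never share generator state.
std::vector<int64_t> SampleWithReplacement(int64_t num_ids, size_t num_draws) {
  std::vector<int64_t> out(num_draws);
  dgl::runtime::parallel_for(0, num_draws, [&](size_t b, size_t e) {
    thread_local std::mt19937_64 rng(std::random_device{}());
    std::uniform_int_distribution<int64_t> dist(0, num_ids - 1);
    for (auto i = b; i < e; ++i)
      out[i] = dist(rng);
  });
  return out;
}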
...@@ -9,6 +9,7 @@ ...@@ -9,6 +9,7 @@
#include <dgl/base_heterograph.h> #include <dgl/base_heterograph.h>
#include <dgl/array.h> #include <dgl/array.h>
#include <dgl/runtime/parallel_for.h>
#include <tuple> #include <tuple>
#include <utility> #include <utility>
#include "randomwalks_impl.h" #include "randomwalks_impl.h"
...@@ -47,25 +48,26 @@ std::pair<IdArray, IdArray> GenericRandomWalk( ...@@ -47,25 +48,26 @@ std::pair<IdArray, IdArray> GenericRandomWalk(
IdxType *traces_data = traces.Ptr<IdxType>(); IdxType *traces_data = traces.Ptr<IdxType>();
IdxType *eids_data = eids.Ptr<IdxType>(); IdxType *eids_data = eids.Ptr<IdxType>();
#pragma omp parallel for runtime::parallel_for(0, num_seeds, [&](size_t seed_begin, size_t seed_end) {
for (int64_t seed_id = 0; seed_id < num_seeds; ++seed_id) { for (auto seed_id = seed_begin; seed_id < seed_end; seed_id++) {
int64_t i; int64_t i;
dgl_id_t curr = seed_data[seed_id]; dgl_id_t curr = seed_data[seed_id];
traces_data[seed_id * trace_length] = curr; traces_data[seed_id * trace_length] = curr;
for (i = 0; i < max_num_steps; ++i) { for (i = 0; i < max_num_steps; ++i) {
const auto &succ = step(traces_data + seed_id * max_num_steps, curr, i); const auto &succ = step(traces_data + seed_id * max_num_steps, curr, i);
traces_data[seed_id * trace_length + i + 1] = curr = std::get<0>(succ); traces_data[seed_id * trace_length + i + 1] = curr = std::get<0>(succ);
eids_data[seed_id * max_num_steps + i] = std::get<1>(succ); eids_data[seed_id * max_num_steps + i] = std::get<1>(succ);
if (std::get<2>(succ)) if (std::get<2>(succ))
break; break;
} }
for (; i < max_num_steps; ++i) { for (; i < max_num_steps; ++i) {
traces_data[seed_id * trace_length + i + 1] = -1; traces_data[seed_id * trace_length + i + 1] = -1;
eids_data[seed_id * max_num_steps + i] = -1; eids_data[seed_id * max_num_steps + i] = -1;
} }
} }
});
return std::make_pair(traces, eids); return std::make_pair(traces, eids);
} }
......
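In the random-walk kernel each seed owns a fixed slice of the output buffers: trace_length node slots (the seed plus up to max_num_steps hops) and max_num_steps edge-id slots, with untaken steps padded with -1, which is why the fill loop continues from the index where the walk broke. A compact sketch of that per-seed layout with a toy transition function standing in for the real step callback (everything below is illustrative, not the DGL API):

#include <cstdint>
#include <tuple>
#include <vector>
#include <dgl/runtime/parallel_for.h>

// Toy transition: walk to curr + 1, report curr as the edge id, stop at node 5.
std::tuple<int64_t, int64_t, bool> Step(int64_t curr) {
  return std::make_tuple(curr + 1, curr, curr + 1 >= 5);
}

void Walk(const std::vector<int64_t>& seeds, int64_t max_num_steps,
          std::vector<int64_t>* traces, std::vector<int64_t>* eids) {
  const int64_t trace_length = max_num_steps + 1;   // seed + up to max_num_steps hops
  traces->assign(seeds.size() * trace_length, -1);  // pre-fill with the padding value
  eids->assign(seeds.size() * max_num_steps, -1);
  dgl::runtime::parallel_for(0, seeds.size(), [&](size_t b, size_t e) {
    for (auto s = b; s < e; ++s) {
      int64_t curr = seeds[s];
      (*traces)[s * trace_length] = curr;
      for (int64_t i = 0; i < max_num_steps; ++i) {
        auto succ = Step(curr);
        (*traces)[s * trace_length + i + 1] = curr = std::get<0>(succ);
        (*eids)[s * max_num_steps + i] = std::get<1>(succ);
        if (std::get<2>(succ))
          break;  // remaining slots for this seed stay -1
      }
    }
  });
}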