Unverified Commit f5183820 authored by Tomasz Patejko, committed by GitHub

[Performance, CPU] Rewriting OpenMP pragmas into parallel_for (#3171)

* [CPU, Parallel] Rewriting omp pragmas with parallel_for

* [CPU, Parallel] Decrease number of calls to task function

* [CPU, Parallel] Modify calls to the new interface of parallel_for
parent 21a40279
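
The pattern applied throughout this commit replaces a raw OpenMP loop with DGL's chunked runtime::parallel_for, whose functor receives the [b, e) sub-range assigned to a task. A minimal before/after sketch (ScaleAll and its data are hypothetical placeholders, not code from this commit):

#include <vector>
#include <dgl/runtime/parallel_for.h>

// Hypothetical helper illustrating the rewrite applied in this commit.
void ScaleAll(std::vector<double>* data) {
  // Before: #pragma omp parallel for
  //         for (size_t i = 0; i < data->size(); ++i) (*data)[i] *= 2.0;
  // After: each task receives a [b, e) chunk and loops over it locally.
  dgl::runtime::parallel_for(0, data->size(), [&](size_t b, size_t e) {
    for (auto i = b; i < e; ++i) {
      (*data)[i] *= 2.0;
    }
  });
}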
@@ -6,6 +6,7 @@
#include <dgl/runtime/device_api.h>
#include <dgl/random.h>
#include <dgl/runtime/parallel_for.h>
#include <dmlc/omp.h>
#include <vector>
#include <tuple>
@@ -234,10 +235,11 @@ void KdTreeKNN(const NDArray& data_points, const IdArray& data_offsets,
KDTreeNDArrayAdapter<FloatType, IdType> kdtree(feature_size, current_data_points);
// query
parallel_for(0, q_length, [&](IdType b, IdType e) {
for (auto q = b; q < e; ++q) {
std::vector<IdType> out_buffer(k);
std::vector<FloatType> out_dist_buffer(k);
#pragma omp parallel for firstprivate(out_buffer) firstprivate(out_dist_buffer)
for (IdType q = 0; q < q_length; ++q) {
auto curr_out_offset = k * q + out_offset;
const FloatType* q_point = current_query_pts_data + q * feature_size;
size_t num_matches = kdtree.GetIndex()->knnSearch(
@@ -249,6 +251,7 @@ void KdTreeKNN(const NDArray& data_points, const IdArray& data_offsets,
curr_out_offset++;
}
}
});
}
}
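
In the KNN kernels the scratch buffers that were previously hoisted out of the loop and copied into each thread with firstprivate are now declared inside the loop body, so every query gets fresh, thread-private storage without any OpenMP data-sharing clause. A sketch of the two styles (QueryAll and its buffer use are illustrative only):

#include <cstdint>
#include <vector>
#include <dgl/runtime/parallel_for.h>

// Illustrative only: k scratch slots per query, as in the KNN kernels.
void QueryAll(int64_t q_length, int64_t k) {
  // Before:
  //   std::vector<int64_t> out_buffer(k);
  //   #pragma omp parallel for firstprivate(out_buffer)
  //   for (int64_t q = 0; q < q_length; ++q) { /* use out_buffer */ }
  // After: each iteration owns its scratch space, so no sharing clause
  // is needed, at the cost of one allocation per query.
  dgl::runtime::parallel_for(0, q_length, [&](int64_t b, int64_t e) {
    for (auto q = b; q < e; ++q) {
      std::vector<int64_t> out_buffer(k);  // thread-private by construction
      // ... fill out_buffer for query q ...
    }
  });
}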
@@ -271,8 +274,9 @@ void BruteForceKNN(const NDArray& data_points, const IdArray& data_offsets,
std::vector<FloatType> dist_buffer(k);
#pragma omp parallel for firstprivate(dist_buffer)
for (IdType q_idx = q_start; q_idx < q_end; ++q_idx) {
parallel_for(q_start, q_end, [&](IdType b, IdType e) {
for (auto q_idx = b; q_idx < e; ++q_idx) {
std::vector<FloatType> dist_buffer(k);
for (IdType k_idx = 0; k_idx < k; ++k_idx) {
query_out[q_idx * k + k_idx] = q_idx;
dist_buffer[k_idx] = std::numeric_limits<FloatType>::max();
@@ -295,6 +299,7 @@ void BruteForceKNN(const NDArray& data_points, const IdArray& data_offsets,
worst_dist = dist_buffer[0];
}
}
});
}
}
} // namespace impl
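
The commit message's "Decrease number of calls to task function" refers to parallel_for's chunked interface: the functor is invoked once per [b, e) range rather than once per index, which is why every call site in this diff wraps its body in an inner for loop. A sketch of the contrast (the per-index variant is hypothetical, shown only for comparison):

#include <cstdint>
#include <dgl/runtime/parallel_for.h>

// Hypothetical per-index style (not DGL's API): the functor would run
// once per element, paying the dispatch overhead n times:
//   parallel_for_each(0, n, [&](int64_t i) { out[i] = i; });

// Chunked style used in this commit: one functor call per range, with a
// cheap serial loop inside the chunk.
void FillIota(int64_t n, double* out) {
  dgl::runtime::parallel_for(0, n, [&](int64_t b, int64_t e) {
    for (auto i = b; i < e; ++i) {
      out[i] = static_cast<double>(i);  // stand-in for the real body
    }
  });
}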
@@ -356,8 +361,8 @@ void NNDescent(const NDArray& points, const IdArray& offsets,
IdType segment_size = point_idx_end - point_idx_start;
// random initialization
#pragma omp parallel for
for (IdType i = point_idx_start; i < point_idx_end; ++i) {
runtime::parallel_for(point_idx_start, point_idx_end, [&](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
IdType local_idx = i - point_idx_start;
dgl::RandomEngine::ThreadLocal()->UniformChoice<IdType>(
@@ -374,14 +379,15 @@ void NNDescent(const NDArray& points, const IdArray& offsets,
}
impl::BuildHeap<FloatType, IdType>(neighbors + i * k, neighbors_dists + local_idx * k, k);
}
});
size_t num_updates = 0;
for (int iter = 0; iter < num_iters; ++iter) {
num_updates = 0;
// initialize candidates array as empty value
#pragma omp parallel for
for (IdType i = point_idx_start; i < point_idx_end; ++i) {
runtime::parallel_for(point_idx_start, point_idx_end, [&](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
IdType local_idx = i - point_idx_start;
for (IdType c = 0; c < num_candidates; ++c) {
new_candidates[local_idx * num_candidates + c] = num_nodes;
@@ -392,13 +398,12 @@ void NNDescent(const NDArray& points, const IdArray& offsets,
std::numeric_limits<FloatType>::max();
}
}
});
// randomly select neighbors as candidates
int tid, num_threads;
#pragma omp parallel private(tid, num_threads)
{
tid = omp_get_thread_num();
num_threads = omp_get_num_threads();
int num_threads = omp_get_max_threads();
runtime::parallel_for(0, num_threads, [&](size_t b, size_t e) {
for (auto tid = b; tid < e; ++tid) {
for (IdType i = point_idx_start; i < point_idx_end; ++i) {
IdType local_idx = i - point_idx_start;
for (IdType n = 0; n < k; ++n) {
@@ -437,10 +442,11 @@ void NNDescent(const NDArray& points, const IdArray& offsets,
}
}
}
});
// mark all elements in new_candidates as false
#pragma omp parallel for
for (IdType i = point_idx_start; i < point_idx_end; ++i) {
runtime::parallel_for(point_idx_start, point_idx_end, [&](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
IdType local_idx = i - point_idx_start;
for (IdType n = 0; n < k; ++n) {
IdType n_idx = neighbors[i * k + n];
@@ -453,6 +459,7 @@ void NNDescent(const NDArray& points, const IdArray& offsets,
}
}
}
});
// update neighbors block by block
for (IdType block_start = point_idx_start;
@@ -463,8 +470,8 @@ void NNDescent(const NDArray& points, const IdArray& offsets,
nnd_updates_t updates(block_size);
// generate updates
#pragma omp parallel for
for (IdType i = block_start; i < block_end; ++i) {
runtime::parallel_for(block_start, block_end, [&](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
IdType local_idx = i - point_idx_start;
for (IdType c1 = 0; c1 < num_candidates; ++c1) {
@@ -511,7 +518,9 @@ void NNDescent(const NDArray& points, const IdArray& offsets,
}
}
}
});
int tid;
#pragma omp parallel private(tid, num_threads) reduction(+:num_updates)
{
tid = omp_get_thread_num();
......
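
In NNDescent, the bare #pragma omp parallel region that partitioned work by omp_get_thread_num() becomes a parallel_for over virtual worker ids in [0, omp_get_max_threads()); the final update-application region keeps its OpenMP pragma, since it also carries a reduction(+:num_updates). A minimal sketch of the virtual-worker idiom, with a hypothetical modulo partition standing in for the real candidate-selection logic:

#include <cstdint>
#include <vector>
#include <dmlc/omp.h>
#include <dgl/runtime/parallel_for.h>

// Each virtual worker tid claims the items with i % num_threads == tid.
// The partitioning rule here is illustrative, not NNDescent's.
void PerWorker(int64_t n, std::vector<int>* owner) {
  const int num_threads = omp_get_max_threads();
  dgl::runtime::parallel_for(0, num_threads, [&](size_t b, size_t e) {
    for (auto tid = b; tid < e; ++tid) {
      for (int64_t i = static_cast<int64_t>(tid); i < n; i += num_threads) {
        (*owner)[i] = static_cast<int>(tid);  // placeholder per-item work
      }
    }
  });
}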
@@ -4,16 +4,17 @@
* \brief Call Metis partitioning
*/
#if !defined(_WIN32)
#include <GKlib.h>
#endif // !defined(_WIN32)
#include <dgl/base_heterograph.h>
#include <dgl/packed_func_ext.h>
#include <dgl/runtime/parallel_for.h>
#include "../heterograph.h"
#include "../unit_graph.h"
#if !defined(_WIN32)
#include <GKlib.h>
#endif // !defined(_WIN32)
using namespace dgl::runtime;
namespace dgl {
@@ -252,8 +253,8 @@ DGL_REGISTER_GLOBAL("partition._CAPI_DGLPartitionWithHalo_Hetero")
ugptr->GetOutCSR();
std::vector<std::shared_ptr<HaloHeteroSubgraph>> subgs(max_part_id + 1);
int num_partitions = part_nodes.size();
#pragma omp parallel for
for (int i = 0; i < num_partitions; i++) {
runtime::parallel_for(0, num_partitions, [&](int b, int e) {
for (auto i = b; i < e; i++) {
auto nodes = aten::VecToIdArray(part_nodes[i]);
HaloHeteroSubgraph subg = GetSubgraphWithHalo(hgptr, nodes, num_hops);
std::shared_ptr<HaloHeteroSubgraph> subg_ptr(
@@ -261,6 +262,7 @@ DGL_REGISTER_GLOBAL("partition._CAPI_DGLPartitionWithHalo_Hetero")
int part_id = part_ids[i];
subgs[part_id] = subg_ptr;
}
});
List<HeteroSubgraphRef> ret_list;
for (size_t i = 0; i < subgs.size(); i++) {
ret_list.push_back(HeteroSubgraphRef(subgs[i]));
......
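
The halo-partition loop parallelizes cleanly because each iteration writes only its own slot of the pre-sized subgs vector, so no synchronization is needed as long as the part ids are distinct. A sketch of that pattern (Result and BuildAll are hypothetical placeholders for the subgraph types above):

#include <memory>
#include <vector>
#include <dgl/runtime/parallel_for.h>

// Hypothetical per-partition work product.
struct Result { int part_id; };

// Fill one slot per partition; each iteration owns results[i], so the
// writes never race. Mirrors _CAPI_DGLPartitionWithHalo_Hetero above.
void BuildAll(int num_partitions, std::vector<std::shared_ptr<Result>>* results) {
  results->resize(num_partitions);
  dgl::runtime::parallel_for(0, num_partitions, [&](int b, int e) {
    for (auto i = b; i < e; ++i) {
      auto r = std::make_shared<Result>();
      r->part_id = i;  // stand-in for the real subgraph construction
      (*results)[i] = r;
    }
  });
}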
@@ -7,6 +7,7 @@
#include <dmlc/omp.h>
#include <dgl/runtime/registry.h>
#include <dgl/runtime/packed_func.h>
#include <dgl/runtime/parallel_for.h>
#include <dgl/random.h>
#include <dgl/array.h>
@@ -21,10 +22,12 @@ namespace dgl {
DGL_REGISTER_GLOBAL("rng._CAPI_SetSeed")
.set_body([] (DGLArgs args, DGLRetValue *rv) {
const int seed = args[0];
#pragma omp parallel for
for (int i = 0; i < omp_get_max_threads(); ++i) {
runtime::parallel_for(0, omp_get_max_threads(), [&](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
RandomEngine::ThreadLocal()->SetSeed(seed);
}
});
#ifdef DGL_USE_CUDA
auto* thr_entry = CUDAThreadEntry::ThreadLocal();
if (!thr_entry->curand_gen) {
......
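
RandomEngine::ThreadLocal() returns the engine owned by the calling thread, so the loop body above ignores its index and iterating [0, omp_get_max_threads()) merely runs SetSeed once on each worker. A self-contained sketch of the same idiom with a stand-in thread-local type (ThreadState is hypothetical, not DGL's RandomEngine):

#include <dmlc/omp.h>
#include <dgl/runtime/parallel_for.h>

// Hypothetical stand-in for dgl::RandomEngine: one instance per thread.
struct ThreadState {
  static ThreadState* ThreadLocal() {
    static thread_local ThreadState inst;
    return &inst;
  }
  void SetSeed(int seed) { seed_ = seed; }
  int seed_ = 0;
};

// Run the body once per worker so every thread-local instance observes
// the new seed, mirroring _CAPI_SetSeed above.
void SeedAllWorkers(int seed) {
  dgl::runtime::parallel_for(0, omp_get_max_threads(), [&](size_t b, size_t e) {
    for (auto i = b; i < e; ++i) {
      ThreadState::ThreadLocal()->SetSeed(seed);
    }
  });
}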
@@ -11,6 +11,7 @@
#endif
#include <dgl/runtime/container.h>
#include <dgl/runtime/parallel_for.h>
#include <dgl/packed_func_ext.h>
#include <dgl/array.h>
#include <dgl/random.h>
@@ -454,8 +455,8 @@ DGL_REGISTER_GLOBAL("distributed.rpc._CAPI_DGLRPCFastPull")
DLContext{kDLCPU, 0});
char* return_data = static_cast<char*>(res_tensor->data);
// Copy local data
#pragma omp parallel for
for (int64_t i = 0; i < local_ids.size(); ++i) {
parallel_for(0, local_ids.size(), [&](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
CHECK_GE(ID_size*row_size, local_ids_orginal[i]*row_size+row_size);
CHECK_GE(data_size, local_ids[i] * row_size + row_size);
CHECK_GE(local_ids[i], 0);
@@ -463,6 +464,7 @@ DGL_REGISTER_GLOBAL("distributed.rpc._CAPI_DGLRPCFastPull")
local_data_char + local_ids[i] * row_size,
row_size);
}
});
// Recv remote message
for (int i = 0; i < msg_count; ++i) {
RPCMessage msg;
......
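
The local-copy loop in _CAPI_DGLRPCFastPull is an embarrassingly parallel gather: every iteration copies one row_size-byte row into the response buffer independently, so the range chunks cleanly. A self-contained sketch of the same gather (GatherRows and its arguments are illustrative names, not DGL's):

#include <cstdint>
#include <cstring>
#include <vector>
#include <dgl/runtime/parallel_for.h>

// Copy row ids[i] of `src` (rows of row_size bytes) into row i of `dst`.
// Mirrors the local-data copy in _CAPI_DGLRPCFastPull above.
void GatherRows(const char* src, const std::vector<int64_t>& ids,
                int64_t row_size, char* dst) {
  dgl::runtime::parallel_for(0, ids.size(), [&](size_t b, size_t e) {
    for (auto i = b; i < e; ++i) {
      std::memcpy(dst + i * row_size, src + ids[i] * row_size, row_size);
    }
  });
}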