Unverified Commit f5183820 authored by Tomasz Patejko, committed by GitHub

[Performance, CPU] Rewriting OpenMP pragmas into parallel_for (#3171)

* [CPU, Parallel] Rewriting omp pragmas with parallel_for

* [CPU, Parallel] Decrease number of calls to task function

* [CPU, Parallel] Modify calls to the new interface of parallel_for
parent 21a40279
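
The pattern applied throughout this commit replaces a raw OpenMP loop with DGL's chunked runtime::parallel_for, whose functor receives the [b, e) sub-range assigned to a task. A minimal before/after sketch (ScaleAll and its data are hypothetical placeholders, not code from this commit):

#include <vector>
#include <dgl/runtime/parallel_for.h>

// Hypothetical helper illustrating the rewrite applied in this commit.
void ScaleAll(std::vector<double>* data) {
  // Before: #pragma omp parallel for
  //         for (size_t i = 0; i < data->size(); ++i) (*data)[i] *= 2.0;
  // After: each task receives a [b, e) chunk and loops over it locally.
  dgl::runtime::parallel_for(0, data->size(), [&](size_t b, size_t e) {
    for (auto i = b; i < e; ++i) {
      (*data)[i] *= 2.0;
    }
  });
}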
@@ -6,6 +6,7 @@
#include <dgl/runtime/device_api.h>
#include <dgl/random.h>
#include <dgl/runtime/parallel_for.h>
#include <dmlc/omp.h>
#include <vector>
#include <tuple>
@@ -234,10 +235,11 @@ void KdTreeKNN(const NDArray& data_points, const IdArray& data_offsets,
KDTreeNDArrayAdapter<FloatType, IdType> kdtree(feature_size, current_data_points);
// query
parallel_for(0, q_length, [&](IdType b, IdType e) {
for (auto q = b; q < e; ++q) {
std::vector<IdType> out_buffer(k);
std::vector<FloatType> out_dist_buffer(k);
#pragma omp parallel for firstprivate(out_buffer) firstprivate(out_dist_buffer)
for (IdType q = 0; q < q_length; ++q) {
auto curr_out_offset = k * q + out_offset;
const FloatType* q_point = current_query_pts_data + q * feature_size;
size_t num_matches = kdtree.GetIndex()->knnSearch(
@@ -249,6 +251,7 @@ void KdTreeKNN(const NDArray& data_points, const IdArray& data_offsets,
curr_out_offset++;
}
}
});
}
}
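
In the KNN kernels the scratch buffers that were previously hoisted out of the loop and copied into each thread with firstprivate are now declared inside the loop body, so every query gets fresh, thread-private storage without any OpenMP data-sharing clause. A sketch of the two styles (QueryAll and its buffer use are illustrative only):

#include <cstdint>
#include <vector>
#include <dgl/runtime/parallel_for.h>

// Illustrative only: k scratch slots per query, as in the KNN kernels.
void QueryAll(int64_t q_length, int64_t k) {
  // Before:
  //   std::vector<int64_t> out_buffer(k);
  //   #pragma omp parallel for firstprivate(out_buffer)
  //   for (int64_t q = 0; q < q_length; ++q) { /* use out_buffer */ }
  // After: each iteration owns its scratch space, so no sharing clause
  // is needed, at the cost of one allocation per query.
  dgl::runtime::parallel_for(0, q_length, [&](int64_t b, int64_t e) {
    for (auto q = b; q < e; ++q) {
      std::vector<int64_t> out_buffer(k);  // thread-private by construction
      // ... fill out_buffer for query q ...
    }
  });
}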
@@ -271,8 +274,9 @@ void BruteForceKNN(const NDArray& data_points, const IdArray& data_offsets,
std::vector<FloatType> dist_buffer(k);
#pragma omp parallel for firstprivate(dist_buffer)
for (IdType q_idx = q_start; q_idx < q_end; ++q_idx) {
parallel_for(q_start, q_end, [&](IdType b, IdType e) {
for (auto q_idx = b; q_idx < e; ++q_idx) {
std::vector<FloatType> dist_buffer(k);
for (IdType k_idx = 0; k_idx < k; ++k_idx) {
query_out[q_idx * k + k_idx] = q_idx;
dist_buffer[k_idx] = std::numeric_limits<FloatType>::max();
@@ -295,6 +299,7 @@ void BruteForceKNN(const NDArray& data_points, const IdArray& data_offsets,
worst_dist = dist_buffer[0];
}
}
});
}
}
} // namespace impl
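
The commit message's "Decrease number of calls to task function" refers to parallel_for's chunked interface: the functor is invoked once per [b, e) range rather than once per index, which is why every call site in this diff wraps its body in an inner for loop. A sketch of the contrast (the per-index variant is hypothetical, shown only for comparison):

#include <cstdint>
#include <dgl/runtime/parallel_for.h>

// Hypothetical per-index style (not DGL's API): the functor would run
// once per element, paying the dispatch overhead n times:
//   parallel_for_each(0, n, [&](int64_t i) { out[i] = i; });

// Chunked style used in this commit: one functor call per range, with a
// cheap serial loop inside the chunk.
void FillIota(int64_t n, double* out) {
  dgl::runtime::parallel_for(0, n, [&](int64_t b, int64_t e) {
    for (auto i = b; i < e; ++i) {
      out[i] = static_cast<double>(i);  // stand-in for the real body
    }
  });
}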
@@ -356,8 +361,8 @@ void NNDescent(const NDArray& points, const IdArray& offsets,
IdType segment_size = point_idx_end - point_idx_start;
// random initialization
#pragma omp parallel for
for (IdType i = point_idx_start; i < point_idx_end; ++i) {
runtime::parallel_for(point_idx_start, point_idx_end, [&](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
IdType local_idx = i - point_idx_start;
dgl::RandomEngine::ThreadLocal()->UniformChoice<IdType>(
@@ -374,14 +379,15 @@ void NNDescent(const NDArray& points, const IdArray& offsets,
}
impl::BuildHeap<FloatType, IdType>(neighbors + i * k, neighbors_dists + local_idx * k, k);
}
});
size_t num_updates = 0;
for (int iter = 0; iter < num_iters; ++iter) {
num_updates = 0;
// initialize candidates array as empty value
#pragma omp parallel for
for (IdType i = point_idx_start; i < point_idx_end; ++i) {
runtime::parallel_for(point_idx_start, point_idx_end, [&](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
IdType local_idx = i - point_idx_start;
for (IdType c = 0; c < num_candidates; ++c) {
new_candidates[local_idx * num_candidates + c] = num_nodes;
@@ -392,13 +398,12 @@ void NNDescent(const NDArray& points, const IdArray& offsets,
std::numeric_limits<FloatType>::max();
}
}
});
// randomly select neighbors as candidates
int tid, num_threads;
#pragma omp parallel private(tid, num_threads)
{
tid = omp_get_thread_num();
num_threads = omp_get_num_threads();
int num_threads = omp_get_max_threads();
runtime::parallel_for(0, num_threads, [&](size_t b, size_t e) {
for (auto tid = b; tid < e; ++tid) {
for (IdType i = point_idx_start; i < point_idx_end; ++i) {
IdType local_idx = i - point_idx_start;
for (IdType n = 0; n < k; ++n) {
@@ -437,10 +442,11 @@ void NNDescent(const NDArray& points, const IdArray& offsets,
}
}
}
});
// mark all elements in new_candidates as false
#pragma omp parallel for
for (IdType i = point_idx_start; i < point_idx_end; ++i) {
runtime::parallel_for(point_idx_start, point_idx_end, [&](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
IdType local_idx = i - point_idx_start;
for (IdType n = 0; n < k; ++n) {
IdType n_idx = neighbors[i * k + n];
@@ -453,6 +459,7 @@ void NNDescent(const NDArray& points, const IdArray& offsets,
}
}
}
});
// update neighbors block by block
for (IdType block_start = point_idx_start;
@@ -463,8 +470,8 @@ void NNDescent(const NDArray& points, const IdArray& offsets,
nnd_updates_t updates(block_size);
// generate updates
#pragma omp parallel for
for (IdType i = block_start; i < block_end; ++i) {
runtime::parallel_for(block_start, block_end, [&](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
IdType local_idx = i - point_idx_start;
for (IdType c1 = 0; c1 < num_candidates; ++c1) {
@@ -511,7 +518,9 @@ void NNDescent(const NDArray& points, const IdArray& offsets,
}
}
}
});
int tid;
#pragma omp parallel private(tid, num_threads) reduction(+:num_updates)
{
tid = omp_get_thread_num();
......
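
In NNDescent, the bare #pragma omp parallel region that partitioned work by omp_get_thread_num() becomes a parallel_for over virtual worker ids in [0, omp_get_max_threads()); the final update-application region keeps its OpenMP pragma, since it also carries a reduction(+:num_updates). A minimal sketch of the virtual-worker idiom, with a hypothetical modulo partition standing in for the real candidate-selection logic:

#include <cstdint>
#include <vector>
#include <dmlc/omp.h>
#include <dgl/runtime/parallel_for.h>

// Each virtual worker tid claims the items with i % num_threads == tid.
// The partitioning rule here is illustrative, not NNDescent's.
void PerWorker(int64_t n, std::vector<int>* owner) {
  const int num_threads = omp_get_max_threads();
  dgl::runtime::parallel_for(0, num_threads, [&](size_t b, size_t e) {
    for (auto tid = b; tid < e; ++tid) {
      for (int64_t i = static_cast<int64_t>(tid); i < n; i += num_threads) {
        (*owner)[i] = static_cast<int>(tid);  // placeholder per-item work
      }
    }
  });
}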
@@ -4,16 +4,17 @@
* \brief Call Metis partitioning
*/
#if !defined(_WIN32)
#include <GKlib.h>
#endif // !defined(_WIN32)
#include <dgl/base_heterograph.h>
#include <dgl/packed_func_ext.h>
#include <dgl/runtime/parallel_for.h>
#include "../heterograph.h"
#include "../unit_graph.h"
#if !defined(_WIN32)
#include <GKlib.h>
#endif // !defined(_WIN32)
using namespace dgl::runtime;
namespace dgl {
@@ -252,8 +253,8 @@ DGL_REGISTER_GLOBAL("partition._CAPI_DGLPartitionWithHalo_Hetero")
ugptr->GetOutCSR();
std::vector<std::shared_ptr<HaloHeteroSubgraph>> subgs(max_part_id + 1);
int num_partitions = part_nodes.size();
#pragma omp parallel for
for (int i = 0; i < num_partitions; i++) {
runtime::parallel_for(0, num_partitions, [&](int b, int e) {
for (auto i = b; i < e; i++) {
auto nodes = aten::VecToIdArray(part_nodes[i]);
HaloHeteroSubgraph subg = GetSubgraphWithHalo(hgptr, nodes, num_hops);
std::shared_ptr<HaloHeteroSubgraph> subg_ptr(
@@ -261,6 +262,7 @@ DGL_REGISTER_GLOBAL("partition._CAPI_DGLPartitionWithHalo_Hetero")
int part_id = part_ids[i];
subgs[part_id] = subg_ptr;
}
});
List<HeteroSubgraphRef> ret_list;
for (size_t i = 0; i < subgs.size(); i++) {
ret_list.push_back(HeteroSubgraphRef(subgs[i]));
......
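
The halo-partition loop parallelizes cleanly because each iteration writes only its own slot of the pre-sized subgs vector, so no synchronization is needed as long as the part ids are distinct. A sketch of that pattern (Result and BuildAll are hypothetical placeholders for the subgraph types above):

#include <memory>
#include <vector>
#include <dgl/runtime/parallel_for.h>

// Hypothetical per-partition work product.
struct Result { int part_id; };

// Fill one slot per partition; each iteration owns results[i], so the
// writes never race. Mirrors _CAPI_DGLPartitionWithHalo_Hetero above.
void BuildAll(int num_partitions, std::vector<std::shared_ptr<Result>>* results) {
  results->resize(num_partitions);
  dgl::runtime::parallel_for(0, num_partitions, [&](int b, int e) {
    for (auto i = b; i < e; ++i) {
      auto r = std::make_shared<Result>();
      r->part_id = i;  // stand-in for the real subgraph construction
      (*results)[i] = r;
    }
  });
}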
@@ -7,6 +7,7 @@
#include <dmlc/omp.h>
#include <dgl/runtime/registry.h>
#include <dgl/runtime/packed_func.h>
#include <dgl/runtime/parallel_for.h>
#include <dgl/random.h>
#include <dgl/array.h>
@@ -21,10 +22,12 @@ namespace dgl {
DGL_REGISTER_GLOBAL("rng._CAPI_SetSeed")
.set_body([] (DGLArgs args, DGLRetValue *rv) {
const int seed = args[0];
#pragma omp parallel for
for (int i = 0; i < omp_get_max_threads(); ++i) {
runtime::parallel_for(0, omp_get_max_threads(), [&](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
RandomEngine::ThreadLocal()->SetSeed(seed);
}
});
#ifdef DGL_USE_CUDA
auto* thr_entry = CUDAThreadEntry::ThreadLocal();
if (!thr_entry->curand_gen) {
......
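
RandomEngine::ThreadLocal() returns the engine owned by the calling thread, so the loop body above ignores its index and iterating [0, omp_get_max_threads()) merely runs SetSeed once on each worker. A self-contained sketch of the same idiom with a stand-in thread-local type (ThreadState is hypothetical, not DGL's RandomEngine):

#include <dmlc/omp.h>
#include <dgl/runtime/parallel_for.h>

// Hypothetical stand-in for dgl::RandomEngine: one instance per thread.
struct ThreadState {
  static ThreadState* ThreadLocal() {
    static thread_local ThreadState inst;
    return &inst;
  }
  void SetSeed(int seed) { seed_ = seed; }
  int seed_ = 0;
};

// Run the body once per worker so every thread-local instance observes
// the new seed, mirroring _CAPI_SetSeed above.
void SeedAllWorkers(int seed) {
  dgl::runtime::parallel_for(0, omp_get_max_threads(), [&](size_t b, size_t e) {
    for (auto i = b; i < e; ++i) {
      ThreadState::ThreadLocal()->SetSeed(seed);
    }
  });
}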
@@ -11,6 +11,7 @@
#endif
#include <dgl/runtime/container.h>
#include <dgl/runtime/parallel_for.h>
#include <dgl/packed_func_ext.h>
#include <dgl/array.h>
#include <dgl/random.h>
@@ -454,8 +455,8 @@ DGL_REGISTER_GLOBAL("distributed.rpc._CAPI_DGLRPCFastPull")
DLContext{kDLCPU, 0});
char* return_data = static_cast<char*>(res_tensor->data);
// Copy local data
#pragma omp parallel for
for (int64_t i = 0; i < local_ids.size(); ++i) {
parallel_for(0, local_ids.size(), [&](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
CHECK_GE(ID_size*row_size, local_ids_orginal[i]*row_size+row_size);
CHECK_GE(data_size, local_ids[i] * row_size + row_size);
CHECK_GE(local_ids[i], 0);
@@ -463,6 +464,7 @@ DGL_REGISTER_GLOBAL("distributed.rpc._CAPI_DGLRPCFastPull")
local_data_char + local_ids[i] * row_size,
row_size);
}
});
// Recv remote message
for (int i = 0; i < msg_count; ++i) {
RPCMessage msg;
......
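
The local-copy loop in _CAPI_DGLRPCFastPull is an embarrassingly parallel gather: every iteration copies one row_size-byte row into the response buffer independently, so the range chunks cleanly. A self-contained sketch of the same gather (GatherRows and its arguments are illustrative names, not DGL's):

#include <cstdint>
#include <cstring>
#include <vector>
#include <dgl/runtime/parallel_for.h>

// Copy row ids[i] of `src` (rows of row_size bytes) into row i of `dst`.
// Mirrors the local-data copy in _CAPI_DGLRPCFastPull above.
void GatherRows(const char* src, const std::vector<int64_t>& ids,
                int64_t row_size, char* dst) {
  dgl::runtime::parallel_for(0, ids.size(), [&](size_t b, size_t e) {
    for (auto i = b; i < e; ++i) {
      std::memcpy(dst + i * row_size, src + ids[i] * row_size, row_size);
    }
  });
}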