Unverified Commit f5183820 authored by Tomasz Patejko, committed by GitHub

[Performance, CPU] Rewriting OpenMP pragmas into parallel_for (#3171)

* [CPU, Parallel] Rewriting omp pragmas with parallel_for

* [CPU, Parallel] Decrease number of calls to task function

* [CPU, Parallel] Modify calls to new interface of parallel_for
parent 21a40279
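The recurring change below replaces OpenMP loop pragmas with DGL's dgl::runtime::parallel_for, whose callback receives a half-open [begin, end) chunk instead of being invoked once per index. A minimal sketch of the before/after pattern (the loop body and the names n, in, out, Transform are illustrative only, not taken from any one file):

  // Before: OpenMP parallelizes the per-element loop directly.
  #pragma omp parallel for
  for (int64_t i = 0; i < n; ++i) {
    out[i] = Transform(in[i]);
  }

  // After: each worker receives a [b, e) chunk and loops over it itself.
  dgl::runtime::parallel_for(0, n, [&](size_t b, size_t e) {
    for (auto i = b; i < e; ++i) {
      out[i] = Transform(in[i]);
    }
  });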
@@ -74,14 +74,12 @@ void parallel_for(
       auto chunk_size = divup((end - begin), num_threads);
       auto begin_tid = begin + tid * chunk_size;
       if (begin_tid < end) {
-        for (auto i = begin_tid; i < std::min(end, chunk_size + begin_tid); i++) {
-          f(i);
-        }
+        auto end_tid = std::min(end, chunk_size + begin_tid);
+        f(begin_tid, end_tid);
       }
     }
 #else
-  for (auto i = begin; i < end; i++)
-    f(i);
+  f(begin, end);
 #endif
 }
@@ -98,7 +96,7 @@ void parallel_for(
     const size_t begin,
     const size_t end,
     F&& f) {
-  parallel_for(begin, end, default_grain_size(), f);
+  parallel_for(begin, end, default_grain_size(), std::forward<F>(f));
 }
 }  // namespace runtime
 }  // namespace dgl
...
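Note the interface change above: the chunked callback is also what the serial (#else) build sees, where f is invoked exactly once as f(begin, end), and the convenience overload now forwards the functor with std::forward. A hedged usage sketch against the updated header; the grain-size value and the ScaleInPlace name are made up for illustration:

  #include <dgl/runtime/parallel_for.h>
  #include <vector>

  void ScaleInPlace(std::vector<double>* v, double a) {
    // Ask the runtime not to split the range into chunks smaller than
    // 1024 elements; the lambda still receives a [b, e) chunk.
    dgl::runtime::parallel_for(0, v->size(), 1024, [&](size_t b, size_t e) {
      for (auto i = b; i < e; ++i)
        (*v)[i] *= a;
    });
  }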
@@ -5,11 +5,13 @@
  */
 #include <dgl/array.h>
 #include <dgl/runtime/ndarray.h>
+#include <dgl/runtime/parallel_for.h>
 #include <numeric>
 #include "../arith.h"
 namespace dgl {
 using runtime::NDArray;
+using runtime::parallel_for;
 namespace aten {
 namespace impl {
@@ -51,8 +53,7 @@ IdArray BinaryElewise(IdArray lhs, IdArray rhs) {
   IdType* ret_data = static_cast<IdType*>(ret->data);
   // TODO(BarclayII): this usually incurs lots of overhead in thread spawning, scheduling,
   // etc., especially since the workload is very light. Need to replace with parallel_for.
-  // #pragma omp parallel for
-  for (int64_t i = 0; i < lhs->shape[0]; ++i) {
+  for (size_t i = 0; i < lhs->shape[0]; i++) {
     ret_data[i] = Op::Call(lhs_data[i], rhs_data[i]);
   }
   return ret;
@@ -88,8 +89,7 @@ IdArray BinaryElewise(IdArray lhs, IdType rhs) {
   IdType* ret_data = static_cast<IdType*>(ret->data);
   // TODO(BarclayII): this usually incurs lots of overhead in thread spawning, scheduling,
   // etc., especially since the workload is very light. Need to replace with parallel_for.
-  // #pragma omp parallel for
-  for (int64_t i = 0; i < lhs->shape[0]; ++i) {
+  for (size_t i = 0; i < lhs->shape[0]; i++) {
     ret_data[i] = Op::Call(lhs_data[i], rhs);
   }
   return ret;
@@ -125,8 +125,7 @@ IdArray BinaryElewise(IdType lhs, IdArray rhs) {
   IdType* ret_data = static_cast<IdType*>(ret->data);
   // TODO(BarclayII): this usually incurs lots of overhead in thread spawning, scheduling,
   // etc., especially since the workload is very light. Need to replace with parallel_for.
-  // #pragma omp parallel for
-  for (int64_t i = 0; i < rhs->shape[0]; ++i) {
+  for (size_t i = 0; i < rhs->shape[0]; i++) {
     ret_data[i] = Op::Call(lhs, rhs_data[i]);
   }
   return ret;
@@ -162,8 +161,7 @@ IdArray UnaryElewise(IdArray lhs) {
   IdType* ret_data = static_cast<IdType*>(ret->data);
   // TODO(BarclayII): this usually incurs lots of overhead in thread spawning, scheduling,
   // etc., especially since the workload is very light. Need to replace with parallel_for.
-  // #pragma omp parallel for
-  for (int64_t i = 0; i < lhs->shape[0]; ++i) {
+  for (size_t i = 0; i < lhs->shape[0]; i++) {
     ret_data[i] = Op::Call(lhs_data[i]);
   }
   return ret;
...
@@ -4,11 +4,13 @@
  * \brief Array index select CPU implementation
  */
 #include <dgl/array.h>
+#include <dgl/runtime/parallel_for.h>
 #include <tuple>
 #include <utility>
 namespace dgl {
 using runtime::NDArray;
+using runtime::parallel_for;
 namespace aten {
 namespace impl {
@@ -29,11 +31,12 @@ std::pair<NDArray, IdArray> ConcatSlices(NDArray array, IdArray lengths) {
   NDArray concat = NDArray::Empty({total_length}, array->dtype, array->ctx);
   DType *concat_data = static_cast<DType *>(concat->data);
-#pragma omp parallel for
-  for (int64_t i = 0; i < rows; ++i) {
+  parallel_for(0, rows, [=](size_t b, size_t e) {
+    for (auto i = b; i < e; ++i) {
       for (int64_t j = 0; j < length_data[i]; ++j)
         concat_data[offsets_data[i] + j] = array_data[i * stride + j];
     }
+  });
   return std::make_pair(concat, offsets);
 }
@@ -56,8 +59,8 @@ std::tuple<NDArray, IdArray, IdArray> Pack(NDArray array, DType pad_value) {
   IdArray length = NewIdArray(rows, array->ctx);
   int64_t *length_data = static_cast<int64_t *>(length->data);
-#pragma omp parallel for
-  for (int64_t i = 0; i < rows; ++i) {
+  parallel_for(0, rows, [=](size_t b, size_t e) {
+    for (auto i = b; i < e; ++i) {
       int64_t j;
       for (j = 0; j < cols; ++j) {
         const DType val = array_data[i * cols + j];
@@ -66,6 +69,7 @@ std::tuple<NDArray, IdArray, IdArray> Pack(NDArray array, DType pad_value) {
       }
       length_data[i] = j;
     }
+  });
   auto ret = ConcatSlices<XPU, DType, int64_t>(array, length);
   return std::make_tuple(ret.first, length, ret.second);
...
@@ -4,6 +4,7 @@
  * \brief Array scatter CPU implementation
  */
 #include <dgl/array.h>
+#include <dgl/runtime/parallel_for.h>
 namespace dgl {
 using runtime::NDArray;
@@ -39,9 +40,11 @@ void Scatter_(IdArray index, NDArray value, NDArray out) {
   const IdType* idx = index.Ptr<IdType>();
   const DType* val = value.Ptr<DType>();
   DType* outd = out.Ptr<DType>();
-#pragma omp parallel for
-  for (int64_t i = 0; i < len; ++i)
+  runtime::parallel_for(0, len, [&](size_t b, size_t e) {
+    for (auto i = b; i < e; ++i) {
       outd[idx[i]] = val[i];
+    }
+  });
 }
 template void Scatter_<kDLCPU, int32_t, int32_t>(IdArray, NDArray, NDArray);
...
@@ -4,6 +4,7 @@
  * \brief Retrieve entries of a CSR matrix
  */
 #include <dgl/array.h>
+#include <dgl/runtime/parallel_for.h>
 #include <vector>
 #include <unordered_set>
 #include <numeric>
@@ -12,7 +13,7 @@
 namespace dgl {
 using runtime::NDArray;
+using runtime::parallel_for;
 namespace aten {
 namespace impl {
@@ -70,8 +71,8 @@ NDArray CSRGetData(
   if (csr.sorted) {
     // use binary search on each row
-#pragma omp parallel for
-    for (int64_t p = 0; p < retlen; ++p) {
+    parallel_for(0, retlen, [&](size_t b, size_t e) {
+      for (auto p = b; p < e; ++p) {
         const IdType row_id = row_data[p * row_stride], col_id = col_data[p * col_stride];
         CHECK(row_id >= 0 && row_id < csr.num_rows) << "Invalid row index: " << row_id;
         CHECK(col_id >= 0 && col_id < csr.num_cols) << "Invalid col index: " << col_id;
@@ -84,10 +85,11 @@ NDArray CSRGetData(
           ret_data[p] = return_eids ? eid : weight_data[eid];
         }
       }
+    });
   } else {
     // linear search on each row
-#pragma omp parallel for
-    for (int64_t p = 0; p < retlen; ++p) {
+    parallel_for(0, retlen, [&](size_t b, size_t e) {
+      for (auto p = b; p < e; ++p) {
         const IdType row_id = row_data[p * row_stride], col_id = col_data[p * col_stride];
         CHECK(row_id >= 0 && row_id < csr.num_rows) << "Invalid row index: " << row_id;
         CHECK(col_id >= 0 && col_id < csr.num_cols) << "Invalid col index: " << col_id;
@@ -99,6 +101,7 @@ NDArray CSRGetData(
         }
       }
     }
+    });
   }
   return ret;
 }
...
@@ -5,6 +5,7 @@
  */
 #include <dgl/array.h>
+#include <dgl/runtime/parallel_for.h>
 #include <parallel_hashmap/phmap.h>
 #include <vector>
 #include "array_utils.h"
@@ -12,6 +13,7 @@
 namespace dgl {
 using dgl::runtime::NDArray;
+using dgl::runtime::parallel_for;
 namespace aten {
@@ -26,10 +28,9 @@ void CountNNZPerRow(
     const IdType* B_indices,
     IdType* C_indptr_data,
     int64_t M) {
+  parallel_for(0, M, [=](size_t b, size_t e) {
+    for (auto i = b; i < e; ++i) {
       phmap::flat_hash_set<IdType> set;
-#pragma omp parallel for firstprivate(set)
-  for (int64_t i = 0; i < M; ++i) {
-    set.clear();
       for (IdType u = A_indptr[i]; u < A_indptr[i + 1]; ++u) {
         IdType w = A_indices[u];
         for (IdType v = B_indptr[w]; v < B_indptr[w + 1]; ++v)
@@ -37,6 +38,7 @@ void CountNNZPerRow(
       }
       C_indptr_data[i] = set.size();
     }
+  });
 }
 template <typename IdType>
@@ -66,10 +68,9 @@ void ComputeIndicesAndData(
     IdType* C_indices_data,
     DType* C_weights_data,
     int64_t M) {
+  parallel_for(0, M, [=](size_t b, size_t e) {
+    for (auto i = b; i < e; ++i) {
       phmap::flat_hash_map<IdType, DType> map;
-#pragma omp parallel for firstprivate(map)
-  for (int64_t i = 0; i < M; ++i) {
-    map.clear();
       for (IdType u = A_indptr[i]; u < A_indptr[i + 1]; ++u) {
         IdType w = A_indices[u];
         DType vA = A_data[A_eids ? A_eids[u] : u];
@@ -87,6 +88,7 @@ void ComputeIndicesAndData(
         ++v;
       }
     }
+  });
 }
 };  // namespace
...
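One semantic difference worth noting in the two hash-based kernels above: the OpenMP version kept a single firstprivate container per thread and cleared it every iteration, while the parallel_for version constructs a fresh container for each row. If per-row construction ever mattered, the chunked interface still admits the old reuse pattern; a sketch only, not part of this commit:

  parallel_for(0, M, [=](size_t b, size_t e) {
    phmap::flat_hash_set<IdType> set;   // one container per chunk
    for (auto i = b; i < e; ++i) {
      set.clear();                      // reuse it across the rows of this chunk
      // ... fill `set` for row i exactly as in the loop body above ...
      C_indptr_data[i] = set.size();
    }
  });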
@@ -4,6 +4,7 @@
  * \brief CSR sorting
  */
 #include <dgl/array.h>
+#include <dgl/runtime/parallel_for.h>
 #include <numeric>
 #include <algorithm>
 #include <vector>
@@ -48,16 +49,14 @@ void CSRSort_(CSRMatrix* csr) {
     csr->data = aten::Range(0, nnz, csr->indptr->dtype.bits, csr->indptr->ctx);
   }
   IdType* eid_data = static_cast<IdType*>(csr->data->data);
-#pragma omp parallel
-  {
-    std::vector<ShufflePair> reorder_vec;
-#pragma omp for
-    for (int64_t row = 0; row < num_rows; row++) {
+  runtime::parallel_for(0, num_rows, [=](size_t b, size_t e) {
+    for (auto row = b; row < e; ++row) {
       const int64_t num_cols = indptr_data[row + 1] - indptr_data[row];
+      std::vector<ShufflePair> reorder_vec(num_cols);
       IdType *col = indices_data + indptr_data[row];
       IdType *eid = eid_data + indptr_data[row];
-      reorder_vec.resize(num_cols);
       for (int64_t i = 0; i < num_cols; i++) {
         reorder_vec[i].first = col[i];
         reorder_vec[i].second = eid[i];
@@ -71,7 +70,8 @@ void CSRSort_(CSRMatrix* csr) {
         eid[i] = reorder_vec[i].second;
       }
     }
-  }
+  });
   csr->sorted = true;
 }
@@ -101,8 +101,8 @@ std::pair<CSRMatrix, NDArray> CSRSortByTag(
   auto out_indices_data = static_cast<IdType *>(output.indices->data);
   auto out_eid_data = static_cast<IdType *>(output.data->data);
-#pragma omp parallel for
-  for (IdType src = 0 ; src < num_rows ; ++src) {
+  runtime::parallel_for(0, num_rows, [&](size_t b, size_t e) {
+    for (auto src = b; src < e; ++src) {
       const IdType start = indptr_data[src];
       const IdType end = indptr_data[src + 1];
@@ -132,6 +132,7 @@ std::pair<CSRMatrix, NDArray> CSRSortByTag(
         out_eid_data[start + offset] = eid;
       }
     }
+  });
   output.sorted = false;
   return std::make_pair(output, tag_pos);
 }
...
@@ -5,6 +5,7 @@
  */
 #include <dgl/array.h>
+#include <dgl/runtime/parallel_for.h>
 #include <parallel_hashmap/phmap.h>
 #include <vector>
 #include "array_utils.h"
@@ -25,16 +26,17 @@ void CountNNZPerRow(
     IdType* C_indptr_data,
     int64_t M) {
   int64_t n = A_indptr.size();
+  runtime::parallel_for(0, M, [=](size_t b, size_t e) {
+    for (size_t i = b; i < e; ++i) {
       phmap::flat_hash_set<IdType> set;
-#pragma omp parallel for firstprivate(set)
-  for (IdType i = 0; i < M; ++i) {
-    set.clear();
       for (int64_t k = 0; k < n; ++k) {
         for (IdType u = A_indptr[k][i]; u < A_indptr[k][i + 1]; ++u)
           set.insert(A_indices[k][u]);
       }
       C_indptr_data[i] = set.size();
     }
+  });
 }
 template <typename IdType>
@@ -61,10 +63,9 @@ void ComputeIndicesAndData(
     DType* C_weights_data,
     int64_t M) {
   int64_t n = A_indptr.size();
+  runtime::parallel_for(0, M, [=](size_t b, size_t e) {
+    for (auto i = b; i < e; ++i) {
       phmap::flat_hash_map<IdType, DType> map;
-#pragma omp parallel for firstprivate(map)
-  for (int64_t i = 0; i < M; ++i) {
-    map.clear();
       for (int64_t k = 0; k < n; ++k) {
         for (IdType u = A_indptr[k][i]; u < A_indptr[k][i + 1]; ++u) {
           IdType kA = A_indices[k][u];
@@ -72,7 +73,6 @@ void ComputeIndicesAndData(
           map[kA] += vA;
         }
       }
       IdType j = C_indptr_data[i];
       for (auto it : map) {
         C_indices_data[j] = it.first;
@@ -80,6 +80,7 @@ void ComputeIndicesAndData(
         ++j;
       }
     }
+  });
 }
 };  // namespace
...
@@ -4,7 +4,7 @@
  * \brief COO sorting
  */
 #include <dgl/array.h>
+#include <dgl/runtime/parallel_for.h>
 #include <numeric>
 #include <algorithm>
 #include <vector>
@@ -54,6 +54,7 @@ CSRMatrix UnionCsr(const std::vector<CSRMatrix>& csrs) {
   for (int64_t i = 1; i <= csrs[0].num_rows; ++i) {
     std::vector<int64_t> indices_off;
     res_indptr[i] = indptr_data[0][i];
     indices_off.push_back(indptr_data[0][i-1]);
     for (size_t j = 1; j < csrs.size(); ++j) {
       res_indptr[i] += indptr_data[j][i];
@@ -74,7 +75,6 @@ CSRMatrix UnionCsr(const std::vector<CSRMatrix>& csrs) {
         }
       }  // for check out of bound
     }  // for
     res_indices[off] = min;
     res_data[off] = data_data[min_idx][indices_off[min_idx]];
     indices_off[min_idx] += 1;
...
@@ -147,6 +147,7 @@ COOMatrix CSRRowWisePick(CSRMatrix mat, IdArray rows,
         global_prefix[t+1] += global_prefix[t];
       }
     }
 #pragma omp barrier
     const IdxType thread_offset = global_prefix[thread_id];
...
@@ -8,6 +8,7 @@
 #include <dgl/array.h>
 #include <dgl/bcast.h>
+#include <dgl/runtime/parallel_for.h>
 #include "../selector.h"
 namespace dgl {
@@ -40,8 +41,8 @@ void SDDMMCsr(const BcastOff& bcast,
       rhs_dim = bcast.rhs_len,
       reduce_size = bcast.reduce_size;
   DType* O = out.Ptr<DType>();
-#pragma omp parallel for
-  for (IdType rid = 0; rid < csr.num_rows; ++rid) {
+  runtime::parallel_for(0, csr.num_rows, [=](IdType b, IdType e) {
+    for (auto rid = b; rid < e; ++rid) {
       const IdType row_start = indptr[rid], row_end = indptr[rid + 1];
       for (IdType j = row_start; j < row_end; ++j) {
         const IdType cid = indices[j];
@@ -50,14 +51,17 @@ void SDDMMCsr(const BcastOff& bcast,
         for (int64_t k = 0; k < dim; ++k) {
           const int64_t lhs_add = bcast.use_bcast ? bcast.lhs_offset[k] : k;
           const int64_t rhs_add = bcast.use_bcast ? bcast.rhs_offset[k] : k;
-          const DType* lhs_off = Op::use_lhs?
-            X + Selector<LhsTarget>::Call(rid, eid, cid) * lhs_dim + lhs_add * reduce_size : nullptr;
-          const DType* rhs_off = Op::use_rhs?
-            Y + Selector<RhsTarget>::Call(rid, eid, cid) * rhs_dim + rhs_add * reduce_size : nullptr;
+          const DType* lhs_off = Op::use_lhs
+            ? X + Selector<LhsTarget>::Call(rid, eid, cid) * lhs_dim + lhs_add * reduce_size
+            : nullptr;
+          const DType* rhs_off = Op::use_rhs
+            ? Y + Selector<RhsTarget>::Call(rid, eid, cid) * rhs_dim + rhs_add * reduce_size
+            : nullptr;
           out_off[k] = Op::Call(lhs_off, rhs_off, reduce_size);
         }
       }
     }
+  });
 }
 /*!
@@ -86,9 +90,8 @@ void SDDMMCoo(const BcastOff& bcast,
       rhs_dim = bcast.rhs_len,
       reduce_size = bcast.reduce_size;
   DType* O = out.Ptr<DType>();
-  const int64_t nnz = coo.row->shape[0];
 #pragma omp parallel for
-  for (IdType i = 0; i < nnz; ++i) {
+  for (int64_t i = 0; i < coo.row->shape[0]; ++i) {
     const IdType rid = row[i];
     const IdType cid = col[i];
     const IdType eid = has_idx? edges[i] : i;
...
@@ -7,6 +7,7 @@
 #define DGL_ARRAY_CPU_SEGMENT_REDUCE_H_
 #include <dgl/array.h>
+#include <dgl/runtime/parallel_for.h>
 namespace dgl {
 namespace aten {
@@ -27,14 +28,15 @@ void SegmentSum(NDArray feat, NDArray offsets, NDArray out) {
   const DType* feat_data = feat.Ptr<DType>();
   const IdType* offsets_data = offsets.Ptr<IdType>();
   DType *out_data = out.Ptr<DType>();
-#pragma omp parallel for
-  for (int i = 0; i < n; ++i) {
+  runtime::parallel_for(0, n, [=](int b, int e) {
+    for (auto i = b; i < e; ++i) {
       for (IdType j = offsets_data[i]; j < offsets_data[i + 1]; ++j) {
         for (int k = 0; k < dim; ++k) {
           out_data[i * dim + k] += feat_data[j * dim + k];
         }
       }
     }
+  });
 }
 /*!
@@ -58,8 +60,8 @@ void SegmentCmp(NDArray feat, NDArray offsets,
   IdType *arg_data = arg.Ptr<IdType>();
   std::fill(out_data, out_data + out.NumElements(), Cmp::zero);
   std::fill(arg_data, arg_data + arg.NumElements(), -1);
-#pragma omp parallel for
-  for (int i = 0; i < n; ++i) {
+  runtime::parallel_for(0, n, [=](int b, int e) {
+    for (auto i = b; i < e; ++i) {
       for (IdType j = offsets_data[i]; j < offsets_data[i + 1]; ++j) {
         for (int k = 0; k < dim; ++k) {
           const DType val = feat_data[j * dim + k];
@@ -70,6 +72,7 @@ void SegmentCmp(NDArray feat, NDArray offsets,
         }
       }
     }
+  });
 }
 /*!
@@ -114,14 +117,15 @@ void BackwardSegmentCmp(NDArray feat, NDArray arg, NDArray out) {
   const DType* feat_data = feat.Ptr<DType>();
   const IdType* arg_data = arg.Ptr<IdType>();
   DType* out_data = out.Ptr<DType>();
-#pragma omp parallel for
-  for (int i = 0; i < n; ++i) {
+  runtime::parallel_for(0, n, [=](int b, int e) {
+    for (auto i = b; i < e; ++i) {
      for (int k = 0; k < dim; ++k) {
        int write_row = arg_data[i * dim + k];
        if (write_row >= 0)
          out_data[write_row * dim + k] = feat_data[i * dim + k];
      }
    }
+  });
 }
 }  // namespace cpu
...
@@ -4,6 +4,7 @@
  * \brief CPU implementation of COO sparse matrix operators
  */
 #include <dmlc/omp.h>
+#include <dgl/runtime/parallel_for.h>
 #include <vector>
 #include <unordered_set>
 #include <unordered_map>
@@ -14,6 +15,7 @@
 namespace dgl {
 using runtime::NDArray;
+using runtime::parallel_for;
 namespace aten {
 namespace impl {
@@ -55,12 +57,13 @@ NDArray COOIsNonZero(COOMatrix coo, NDArray row, NDArray col) {
   const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1;
   const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1;
   const int64_t kmax = std::max(rowlen, collen);
-#pragma omp parallel for
-  for (int64_t k = 0; k < kmax; ++k) {
+  parallel_for(0, kmax, [=](size_t b, size_t e) {
+    for (auto k = b; k < e; ++k) {
       int64_t i = row_stride * k;
      int64_t j = col_stride * k;
      rst_data[k] = COOIsNonZero<XPU, IdType>(coo, row_data[i], col_data[j])? 1 : 0;
    }
+  });
   return rst;
 }
@@ -114,8 +117,9 @@ NDArray COOGetRowNNZ(COOMatrix coo, NDArray rows) {
   NDArray rst = NDArray::Empty({len}, rows->dtype, rows->ctx);
   IdType* rst_data = static_cast<IdType*>(rst->data);
 #pragma omp parallel for
-  for (int64_t i = 0; i < len; ++i)
+  for (int64_t i = 0; i < len; ++i) {
     rst_data[i] = COOGetRowNNZ<XPU, IdType>(coo, vid_data[i]);
+  }
   return rst;
 }
@@ -178,8 +182,8 @@ IdArray COOGetData(COOMatrix coo, IdArray rows, IdArray cols) {
   // the choice.
   if (coo.row_sorted) {
-#pragma omp parallel for
-    for (int64_t p = 0; p < retlen; ++p) {
+    parallel_for(0, retlen, [&](size_t b, size_t e) {
+      for (auto p = b; p < e; ++p) {
        const IdType row_id = row_data[p * row_stride], col_id = col_data[p * col_stride];
        auto it = std::lower_bound(coo_row, coo_row + nnz, row_id);
        for (; it < coo_row + nnz && *it == row_id; ++it) {
@@ -190,6 +194,7 @@ IdArray COOGetData(COOMatrix coo, IdArray rows, IdArray cols) {
        }
      }
    }
+    });
   } else {
 #pragma omp parallel for
     for (int64_t p = 0; p < retlen; ++p) {
@@ -328,11 +333,9 @@ CSRMatrix COOToCSR(COOMatrix coo) {
   IdType * const fill_data = data ? nullptr : static_cast<IdType*>(coo.data->data);
   if (NNZ > 0) {
-#pragma omp parallel
-    {
-      const int num_threads = omp_get_num_threads();
-      const int thread_id = omp_get_thread_num();
+    auto num_threads = omp_get_max_threads();
+    parallel_for(0, num_threads, [&](int b, int e) {
+      for (auto thread_id = b; thread_id < e; ++thread_id) {
        // We partition the set the of non-zeros among the threads
        const int64_t nz_chunk = (NNZ+num_threads-1)/num_threads;
        const int64_t nz_start = thread_id*nz_chunk;
@@ -389,6 +392,7 @@ CSRMatrix COOToCSR(COOMatrix coo) {
        }
      }
    }
+    });
   } else {
     std::fill(Bp, Bp+N+1, 0);
   }
@@ -627,11 +631,12 @@ COOMatrix COOReorder(COOMatrix coo, runtime::NDArray new_row_id_arr,
   IdType *out_row = static_cast<IdType*>(out_row_arr->data);
   IdType *out_col = static_cast<IdType*>(out_col_arr->data);
-#pragma omp parallel for
-  for (int64_t i = 0; i < nnz; i++) {
+  parallel_for(0, nnz, [=](size_t b, size_t e) {
+    for (auto i = b; i < e; ++i) {
      out_row[i] = new_row_ids[in_rows[i]];
      out_col[i] = new_col_ids[in_cols[i]];
    }
+  });
   return COOMatrix(num_rows, num_cols, out_row_arr, out_col_arr, out_data_arr);
 }
...
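The COOToCSR hunk above is the one place where the old code used a bare #pragma omp parallel region with manual work splitting; the rewrite keeps that structure by asking omp_get_max_threads() up front and running parallel_for over thread indices. A sketch of that skeleton, with nz_end added here for completeness (it is not shown in the hunk):

  auto num_threads = omp_get_max_threads();
  parallel_for(0, num_threads, [&](int b, int e) {
    for (auto thread_id = b; thread_id < e; ++thread_id) {
      // Each "virtual thread" id takes a contiguous slice of the non-zeros.
      const int64_t nz_chunk = (NNZ + num_threads - 1) / num_threads;
      const int64_t nz_start = thread_id * nz_chunk;
      const int64_t nz_end = std::min(NNZ, nz_start + nz_chunk);  // assumed bound, not in the hunk
      // ... process non-zeros in [nz_start, nz_end) as in the original body ...
    }
  });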
@@ -4,6 +4,7 @@
  * \brief CSR matrix operator CPU implementation
  */
 #include <dgl/array.h>
+#include <dgl/runtime/parallel_for.h>
 #include <vector>
 #include <unordered_set>
 #include <numeric>
@@ -12,6 +13,7 @@
 namespace dgl {
 using runtime::NDArray;
+using runtime::parallel_for;
 namespace aten {
 namespace impl {
@@ -491,11 +493,12 @@ CSRMatrix CSRReorder(CSRMatrix csr, runtime::NDArray new_row_id_arr,
   // Compute the length of rows for the new matrix.
   std::vector<IdType> new_row_lens(num_rows, -1);
-#pragma omp parallel for
-  for (int64_t i = 0; i < num_rows; i++) {
+  parallel_for(0, num_rows, [=, &new_row_lens](size_t b, size_t e) {
+    for (auto i = b; i < e; ++i) {
      int64_t new_row_id = new_row_ids[i];
      new_row_lens[new_row_id] = in_indptr[i + 1] - in_indptr[i];
    }
+  });
   // Compute the starting location of each row in the new matrix.
   out_indptr[0] = 0;
   // This is sequential. It should be pretty fast.
@@ -506,8 +509,8 @@ CSRMatrix CSRReorder(CSRMatrix csr, runtime::NDArray new_row_id_arr,
   CHECK_EQ(out_indptr[num_rows], nnz);
   // Copy indieces and data with the new order.
   // Here I iterate rows in the order of the old matrix.
-#pragma omp parallel for
-  for (int64_t i = 0; i < num_rows; i++) {
+  parallel_for(0, num_rows, [=](size_t b, size_t e) {
+    for (auto i = b; i < e; ++i) {
      const IdType *in_row = in_indices + in_indptr[i];
      const IdType *in_row_data = in_data + in_indptr[i];
@@ -523,6 +526,7 @@ CSRMatrix CSRReorder(CSRMatrix csr, runtime::NDArray new_row_id_arr,
      }
      // TODO(zhengda) maybe we should sort the column indices.
    }
+  });
   return CSRMatrix(num_rows, num_cols,
                    out_indptr_arr, out_indices_arr, out_data_arr);
 }
...
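The capture lists differ across call sites in this commit: bodies that only write through raw pointers capture by value ([=]), while CSRReorder above mutates a local std::vector and therefore captures it by reference ([=, &new_row_lens]). A minimal illustration of why the by-reference capture is needed; the lens name is hypothetical:

  std::vector<int64_t> lens(num_rows, -1);
  // Capturing `lens` by value would hand each worker its own copy and the
  // writes would be lost; capture the vector by reference instead.
  parallel_for(0, num_rows, [=, &lens](size_t b, size_t e) {
    for (auto i = b; i < e; ++i)
      lens[new_row_ids[i]] = in_indptr[i + 1] - in_indptr[i];
  });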
@@ -8,6 +8,7 @@
 #include <dgl/array.h>
 #include <dgl/bcast.h>
+#include <dgl/runtime/parallel_for.h>
 #include <algorithm>
 #include <limits>
 #include <memory>
@@ -46,8 +47,9 @@ void SpMMSumCsrXbyak(dgl::ElemWiseAddUpdate<Op>* cpu_spec, const BcastOff& bcast
   const IdType* indices = csr.indices.Ptr<IdType>();
   const IdType* edges = csr.data.Ptr<IdType>();
   int64_t dim = bcast.out_len, lhs_dim = bcast.lhs_len, rhs_dim = bcast.rhs_len;
-#pragma omp parallel for
-  for (IdType rid = 0; rid < csr.num_rows; ++rid) {
+  runtime::parallel_for(0, csr.num_rows, [&](size_t b, size_t e) {
+    for (auto rid = b; rid < e; ++rid) {
      const IdType row_start = indptr[rid], row_end = indptr[rid + 1];
      DType* out_off = O + rid * dim;
      for (IdType j = row_start; j < row_end; ++j) {
@@ -56,6 +58,7 @@ void SpMMSumCsrXbyak(dgl::ElemWiseAddUpdate<Op>* cpu_spec, const BcastOff& bcast
        cpu_spec->run(out_off, X + cid * lhs_dim, W + eid * rhs_dim, dim);
      }
    }
+  });
 }
 #endif  // USE_AVX
 #endif  // _WIN32
@@ -79,8 +82,8 @@ void SpMMSumCsrNaive(const BcastOff& bcast, const CSRMatrix& csr, const DType* X
   const IdType* indices = csr.indices.Ptr<IdType>();
   const IdType* edges = csr.data.Ptr<IdType>();
   int64_t dim = bcast.out_len, lhs_dim = bcast.lhs_len, rhs_dim = bcast.rhs_len;
-#pragma omp parallel for
-  for (IdType rid = 0; rid < csr.num_rows; ++rid) {
+  runtime::parallel_for(0, csr.num_rows, [&](size_t b, size_t e) {
+    for (auto rid = b; rid < e; ++rid) {
      const IdType row_start = indptr[rid], row_end = indptr[rid + 1];
      DType* out_off = O + rid * dim;
      for (IdType j = row_start; j < row_end; ++j) {
@@ -97,6 +100,7 @@ void SpMMSumCsrNaive(const BcastOff& bcast, const CSRMatrix& csr, const DType* X
        }
      }
    }
+  });
 }
 /*!
@@ -270,8 +274,8 @@ void SpMMCmpCsr(const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat,
 #endif  // USE_AVX
 #endif  // _WIN32
-#pragma omp parallel for
-  for (IdType rid = 0; rid < csr.num_rows; ++rid) {
+  runtime::parallel_for(0, csr.num_rows, [&](size_t b, size_t e) {
+    for (auto rid = b; rid < e; ++rid) {
      const IdType row_start = indptr[rid], row_end = indptr[rid + 1];
      DType* out_off = O + rid * dim;
      IdType* argx_off = argX + rid * dim;
@@ -295,6 +299,7 @@ void SpMMCmpCsr(const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat,
        }
      }
    }
+  });
 #if !defined(_WIN32)
 #ifdef USE_AVX
 #ifdef USE_LIBXSMM
...
@@ -8,6 +8,7 @@
 #include <dgl/immutable_graph.h>
 #include <dgl/packed_func_ext.h>
 #include <dgl/runtime/container.h>
+#include <dgl/runtime/parallel_for.h>
 #include <algorithm>
 #include "../c_api_common.h"
@@ -261,8 +262,8 @@ IdArray GraphOp::MapParentIdToSubgraphId(IdArray parent_vids, IdArray query) {
   const bool is_sorted = std::is_sorted(parent_data, parent_data + parent_len);
   if (is_sorted) {
-#pragma omp parallel for
-    for (int64_t i = 0; i < query_len; i++) {
+    runtime::parallel_for(0, query_len, [&](size_t b, size_t e) {
+      for (auto i = b; i < e; ++i) {
        const dgl_id_t id = query_data[i];
        const auto it = std::find(parent_data, parent_data + parent_len, id);
        // If the vertex Id doesn't exist, the vid in the subgraph is -1.
@@ -272,14 +273,15 @@ IdArray GraphOp::MapParentIdToSubgraphId(IdArray parent_vids, IdArray query) {
          rst_data[i] = -1;
        }
      }
+    });
   } else {
     std::unordered_map<dgl_id_t, dgl_id_t> parent_map;
     for (int64_t i = 0; i < parent_len; i++) {
       const dgl_id_t id = parent_data[i];
       parent_map[id] = i;
     }
-#pragma omp parallel for
-    for (int64_t i = 0; i < query_len; i++) {
+    runtime::parallel_for(0, query_len, [&](size_t b, size_t e) {
+      for (auto i = b; i < e; ++i) {
        const dgl_id_t id = query_data[i];
        auto it = parent_map.find(id);
        // If the vertex Id doesn't exist, the vid in the subgraph is -1.
@@ -289,6 +291,7 @@ IdArray GraphOp::MapParentIdToSubgraphId(IdArray parent_vids, IdArray query) {
          rst_data[i] = -1;
        }
      }
+    });
   }
   return rst;
 }
@@ -567,14 +570,15 @@ DGL_REGISTER_GLOBAL("transform._CAPI_DGLPartitionWithHalo")
   graph_ptr->GetInCSR();
   std::vector<std::shared_ptr<HaloSubgraph> > subgs(max_part_id + 1);
   int num_partitions = part_nodes.size();
-#pragma omp parallel for
-  for (int i = 0; i < num_partitions; i++) {
+  runtime::parallel_for(0, num_partitions, [&](size_t b, size_t e) {
+    for (auto i = b; i < e; ++i) {
      auto nodes = aten::VecToIdArray(part_nodes[i]);
      HaloSubgraph subg = GraphOp::GetSubgraphWithHalo(graph_ptr, nodes, num_hops);
      std::shared_ptr<HaloSubgraph> subg_ptr(new HaloSubgraph(subg));
      int part_id = part_ids[i];
      subgs[part_id] = subg_ptr;
    }
+  });
   List<SubgraphRef> ret_list;
   for (size_t i = 0; i < subgs.size(); i++) {
     ret_list.push_back(SubgraphRef(subgs[i]));
@@ -732,8 +736,8 @@ IdArray MapIds(IdArray ids, IdArray range_starts, IdArray range_ends, IdArray ty
   const IdType *typed_map_data = static_cast<IdType *>(typed_map->data);
   IdType *types_data = static_cast<IdType *>(ret->data);
   IdType *per_type_ids_data = static_cast<IdType *>(ret->data) + num_ids;
-#pragma omp parallel for
-  for (int64_t i = 0; i < ids->shape[0]; i++) {
+  runtime::parallel_for(0, ids->shape[0], [&](size_t b, size_t e) {
+    for (auto i = b; i < e; ++i) {
      IdType id = ids_data[i];
      auto it = std::lower_bound(range_end_data, range_end_data + num_ranges, id);
      // The range must exist.
@@ -750,6 +754,7 @@ IdArray MapIds(IdArray ids, IdArray range_starts, IdArray range_ends, IdArray ty
          + typed_map_data[num_parts * type_id + part_id - 1];
      }
    }
+  });
   return ret;
 }
...
@@ -8,6 +8,7 @@
 #include <dgl/packed_func_ext.h>
 #include <dgl/immutable_graph.h>
 #include <dgl/runtime/container.h>
+#include <dgl/runtime/parallel_for.h>
 #include <set>
 #include "../c_api_common.h"
@@ -629,14 +630,19 @@ DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroCreateFormat")
 .set_body([] (DGLArgs args, DGLRetValue* rv) {
   HeteroGraphRef hg = args[0];
   dgl_format_code_t code = hg->GetRelationGraph(0)->GetAllowedFormats();
-#if !defined(DGL_USE_CUDA)
-#pragma omp parallel for
-#endif
-  for (int64_t etype = 0; etype < hg->NumEdgeTypes(); ++etype) {
+  auto get_format_f = [&](size_t etype_b, size_t etype_e) {
+    for (auto etype = etype_b; etype < etype_e; ++etype) {
      auto bg = std::dynamic_pointer_cast<UnitGraph>(hg->GetRelationGraph(etype));
      for (auto format : CodeToSparseFormats(code))
        bg->GetFormat(format);
    }
+  };
+#if !(defined(DGL_USE_CUDA))
+  runtime::parallel_for(0, hg->NumEdgeTypes(), get_format_f);
+#else
+  get_format_f(0, hg->NumEdgeTypes());
+#endif
 });
 DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroGetFormatGraph")
...
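The _CAPI_DGLHeteroCreateFormat hunk shows the pattern used when a loop must stay serial in CUDA builds: the chunked body is hoisted into a named lambda and either handed to parallel_for or called once over the whole range. Reduced to its skeleton (names are taken from the hunk above, the loop body is elided):

  auto get_format_f = [&](size_t etype_b, size_t etype_e) {
    for (auto etype = etype_b; etype < etype_e; ++etype) {
      // ... materialize the allowed sparse formats for this edge type ...
    }
  };
  #if !defined(DGL_USE_CUDA)
  runtime::parallel_for(0, hg->NumEdgeTypes(), get_format_f);
  #else
  get_format_f(0, hg->NumEdgeTypes());  // serial fallback on CUDA builds
  #endif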
@@ -9,6 +9,7 @@
 #include <dgl/runtime/container.h>
 #include <dgl/runtime/ndarray.h>
+#include <dgl/runtime/parallel_for.h>
 #include <dgl/packed_func_ext.h>
 #include <dgl/immutable_graph.h>
 #include <dgl/nodeflow.h>
@@ -829,8 +830,8 @@ DGL_REGISTER_GLOBAL("network._CAPI_FastPull")
   char *return_data = new char[ID_size*row_size];
   const int64_t local_ids_size = local_ids.size();
   // Copy local data
-#pragma omp parallel for
-  for (int64_t i = 0; i < local_ids_size; ++i) {
+  runtime::parallel_for(0, local_ids_size, [&](size_t b, size_t e) {
+    for (auto i = b; i < e; ++i) {
      CHECK_GE(ID_size*row_size, local_ids_orginal[i] * row_size + row_size);
      CHECK_GE(data_size, local_ids[i] * row_size + row_size);
      CHECK_GE(local_ids[i], 0);
@@ -838,6 +839,7 @@ DGL_REGISTER_GLOBAL("network._CAPI_FastPull")
             local_data_char + local_ids[i] * row_size,
             row_size);
    }
+  });
   // Recv remote message
   for (int i = 0; i < msg_count; ++i) {
     KVStoreMsg *kv_msg = recv_kv_message(receiver);
...
@@ -9,6 +9,7 @@
 #include <dgl/runtime/container.h>
 #include <dgl/packed_func_ext.h>
 #include <dgl/random.h>
+#include <dgl/runtime/parallel_for.h>
 #include <dmlc/omp.h>
 #include <algorithm>
 #include <cstdlib>
@@ -850,8 +851,8 @@ std::vector<NodeFlow> NeighborSamplingImpl(const ImmutableGraphPtr gptr,
   BuildCsr(*gptr, neigh_type);
   // generate node flows
   std::vector<NodeFlow> nflows(num_workers);
-#pragma omp parallel for
-  for (int i = 0; i < num_workers; i++) {
+  runtime::parallel_for(0, num_workers, [&](size_t b, size_t e) {
+    for (auto i = b; i < e; ++i) {
      // create per-worker seed nodes.
      const int64_t start = (batch_start_id + i) * batch_size;
      const int64_t end = std::min(start + batch_size, num_seeds);
@@ -863,6 +864,7 @@ std::vector<NodeFlow> NeighborSamplingImpl(const ImmutableGraphPtr gptr,
          gptr.get(), worker_seeds, neigh_type, num_hops, expand_factor,
          add_self_loop, probability);
    }
+  });
   return nflows;
 }
@@ -977,8 +979,8 @@ DGL_REGISTER_GLOBAL("sampling._CAPI_LayerSampling")
   BuildCsr(*gptr, neigh_type);
   // generate node flows
   std::vector<NodeFlow> nflows(num_workers);
-#pragma omp parallel for
-  for (int i = 0; i < num_workers; i++) {
+  runtime::parallel_for(0, num_workers, [&](size_t b, size_t e) {
+    for (auto i = b; i < e; ++i) {
      // create per-worker seed nodes.
      const int64_t start = (batch_start_id + i) * batch_size;
      const int64_t end = std::min(start + batch_size, num_seeds);
@@ -989,6 +991,7 @@ DGL_REGISTER_GLOBAL("sampling._CAPI_LayerSampling")
      nflows[i] = SamplerOp::LayerUniformSample(
          gptr.get(), worker_seeds, neigh_type, layer_sizes);
    }
+  });
   *rv = List<NodeFlow>(nflows);
 });
@@ -1466,8 +1469,8 @@ public:
   std::vector<SubgraphRef> positive_subgs(num_workers);
   std::vector<SubgraphRef> negative_subgs(num_workers);
-#pragma omp parallel for
-  for (int64_t i = 0; i < num_workers; i++) {
+  runtime::parallel_for(0, num_workers, [&](size_t b, size_t e) {
+    for (auto i = b; i < e; ++i) {
      const int64_t start = (batch_curr_id_ + i) * batch_size_;
      const int64_t end = std::min(start + batch_size_, num_seeds_);
      const int64_t num_edges = end - start;
@@ -1514,6 +1517,7 @@ public:
        negative_subgs[i] = ConvertRef(neg_subg);
      }
    }
+  });
   if (neg_mode_.size() > 0) {
     positive_subgs.insert(positive_subgs.end(), negative_subgs.begin(), negative_subgs.end());
   }
...
@@ -9,6 +9,7 @@
 #include <dgl/base_heterograph.h>
 #include <dgl/array.h>
+#include <dgl/runtime/parallel_for.h>
 #include <tuple>
 #include <utility>
 #include "randomwalks_impl.h"
@@ -47,8 +48,8 @@ std::pair<IdArray, IdArray> GenericRandomWalk(
   IdxType *traces_data = traces.Ptr<IdxType>();
   IdxType *eids_data = eids.Ptr<IdxType>();
-#pragma omp parallel for
-  for (int64_t seed_id = 0; seed_id < num_seeds; ++seed_id) {
+  runtime::parallel_for(0, num_seeds, [&](size_t seed_begin, size_t seed_end) {
+    for (auto seed_id = seed_begin; seed_id < seed_end; seed_id++) {
      int64_t i;
      dgl_id_t curr = seed_data[seed_id];
      traces_data[seed_id * trace_length] = curr;
@@ -66,6 +67,7 @@ std::pair<IdArray, IdArray> GenericRandomWalk(
        eids_data[seed_id * max_num_steps + i] = -1;
      }
    }
+  });
   return std::make_pair(traces, eids);
 }
...