Unverified commit f5183820 authored by Tomasz Patejko, committed by GitHub

[Performance, CPU] Rewriting OpenMP pragmas into parallel_for (#3171)

* [CPU, Parallel] Rewriting omp pragmas with parallel_for

* [CPU, Parallel] Decrease number of calls to task function

* [CPU, Parallel] Modify calls to the new interface of parallel_for
parent 21a40279
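For reference, a minimal sketch of the rewrite pattern applied throughout this commit. The parallel_for helper and its range-based callback are the ones defined in <dgl/runtime/parallel_for.h> (see the first hunk below); the loop bound n and the in/out buffers are hypothetical and only illustrate the mapping.

// Before: OpenMP parallelizes the loop directly, one index per iteration.
#pragma omp parallel for
for (int64_t i = 0; i < n; ++i)
  out[i] = 2 * in[i];

// After: parallel_for hands each task a contiguous [b, e) sub-range,
// so the callback runs once per chunk instead of once per element.
dgl::runtime::parallel_for(0, n, [&](size_t b, size_t e) {
  for (auto i = b; i < e; ++i)
    out[i] = 2 * in[i];
});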
......@@ -74,14 +74,12 @@ void parallel_for(
auto chunk_size = divup((end - begin), num_threads);
auto begin_tid = begin + tid * chunk_size;
if (begin_tid < end) {
for (auto i = begin_tid; i < std::min(end, chunk_size + begin_tid); i++) {
f(i);
}
auto end_tid = std::min(end, chunk_size + begin_tid);
f(begin_tid, end_tid);
}
}
#else
for (auto i = begin; i < end; i++)
f(i);
f(begin, end);
#endif
}
......@@ -98,7 +96,7 @@ void parallel_for(
const size_t begin,
const size_t end,
F&& f) {
parallel_for(begin, end, default_grain_size(), f);
parallel_for(begin, end, default_grain_size(), std::forward<F>(f));
}
} // namespace runtime
} // namespace dgl
......
......@@ -5,11 +5,13 @@
*/
#include <dgl/array.h>
#include <dgl/runtime/ndarray.h>
#include <dgl/runtime/parallel_for.h>
#include <numeric>
#include "../arith.h"
namespace dgl {
using runtime::NDArray;
using runtime::parallel_for;
namespace aten {
namespace impl {
......@@ -51,8 +53,7 @@ IdArray BinaryElewise(IdArray lhs, IdArray rhs) {
IdType* ret_data = static_cast<IdType*>(ret->data);
// TODO(BarclayII): this usually incurs lots of overhead in thread spawning, scheduling,
// etc., especially since the workload is very light. Need to replace with parallel_for.
// #pragma omp parallel for
for (int64_t i = 0; i < lhs->shape[0]; ++i) {
for (size_t i = 0; i < lhs->shape[0]; i++) {
ret_data[i] = Op::Call(lhs_data[i], rhs_data[i]);
}
return ret;
......@@ -88,8 +89,7 @@ IdArray BinaryElewise(IdArray lhs, IdType rhs) {
IdType* ret_data = static_cast<IdType*>(ret->data);
// TODO(BarclayII): this usually incurs lots of overhead in thread spawning, scheduling,
// etc., especially since the workload is very light. Need to replace with parallel_for.
// #pragma omp parallel for
for (int64_t i = 0; i < lhs->shape[0]; ++i) {
for (size_t i = 0; i < lhs->shape[0]; i++) {
ret_data[i] = Op::Call(lhs_data[i], rhs);
}
return ret;
......@@ -125,8 +125,7 @@ IdArray BinaryElewise(IdType lhs, IdArray rhs) {
IdType* ret_data = static_cast<IdType*>(ret->data);
// TODO(BarclayII): this usually incurs lots of overhead in thread spawning, scheduling,
// etc., especially since the workload is very light. Need to replace with parallel_for.
// #pragma omp parallel for
for (int64_t i = 0; i < rhs->shape[0]; ++i) {
for (size_t i = 0; i < rhs->shape[0]; i++) {
ret_data[i] = Op::Call(lhs, rhs_data[i]);
}
return ret;
......@@ -162,8 +161,7 @@ IdArray UnaryElewise(IdArray lhs) {
IdType* ret_data = static_cast<IdType*>(ret->data);
// TODO(BarclayII): this usually incurs lots of overhead in thread spawning, scheduling,
// etc., especially since the workload is very light. Need to replace with parallel_for.
// #pragma omp parallel for
for (int64_t i = 0; i < lhs->shape[0]; ++i) {
for (size_t i = 0; i < lhs->shape[0]; i++) {
ret_data[i] = Op::Call(lhs_data[i]);
}
return ret;
......
......@@ -4,11 +4,13 @@
* \brief Array index select CPU implementation
*/
#include <dgl/array.h>
#include <dgl/runtime/parallel_for.h>
#include <tuple>
#include <utility>
namespace dgl {
using runtime::NDArray;
using runtime::parallel_for;
namespace aten {
namespace impl {
......@@ -29,11 +31,12 @@ std::pair<NDArray, IdArray> ConcatSlices(NDArray array, IdArray lengths) {
NDArray concat = NDArray::Empty({total_length}, array->dtype, array->ctx);
DType *concat_data = static_cast<DType *>(concat->data);
#pragma omp parallel for
for (int64_t i = 0; i < rows; ++i) {
for (int64_t j = 0; j < length_data[i]; ++j)
concat_data[offsets_data[i] + j] = array_data[i * stride + j];
}
parallel_for(0, rows, [=](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
for (int64_t j = 0; j < length_data[i]; ++j)
concat_data[offsets_data[i] + j] = array_data[i * stride + j];
}
});
return std::make_pair(concat, offsets);
}
......@@ -56,16 +59,17 @@ std::tuple<NDArray, IdArray, IdArray> Pack(NDArray array, DType pad_value) {
IdArray length = NewIdArray(rows, array->ctx);
int64_t *length_data = static_cast<int64_t *>(length->data);
#pragma omp parallel for
for (int64_t i = 0; i < rows; ++i) {
int64_t j;
for (j = 0; j < cols; ++j) {
const DType val = array_data[i * cols + j];
if (val == pad_value)
break;
parallel_for(0, rows, [=](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
int64_t j;
for (j = 0; j < cols; ++j) {
const DType val = array_data[i * cols + j];
if (val == pad_value)
break;
}
length_data[i] = j;
}
length_data[i] = j;
}
});
auto ret = ConcatSlices<XPU, DType, int64_t>(array, length);
return std::make_tuple(ret.first, length, ret.second);
......
......@@ -4,6 +4,7 @@
* \brief Array scatter CPU implementation
*/
#include <dgl/array.h>
#include <dgl/runtime/parallel_for.h>
namespace dgl {
using runtime::NDArray;
......@@ -39,9 +40,11 @@ void Scatter_(IdArray index, NDArray value, NDArray out) {
const IdType* idx = index.Ptr<IdType>();
const DType* val = value.Ptr<DType>();
DType* outd = out.Ptr<DType>();
#pragma omp parallel for
for (int64_t i = 0; i < len; ++i)
outd[idx[i]] = val[i];
runtime::parallel_for(0, len, [&](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
outd[idx[i]] = val[i];
}
});
}
template void Scatter_<kDLCPU, int32_t, int32_t>(IdArray, NDArray, NDArray);
......
......@@ -4,6 +4,7 @@
* \brief Retrieve entries of a CSR matrix
*/
#include <dgl/array.h>
#include <dgl/runtime/parallel_for.h>
#include <vector>
#include <unordered_set>
#include <numeric>
......@@ -12,7 +13,7 @@
namespace dgl {
using runtime::NDArray;
using runtime::parallel_for;
namespace aten {
namespace impl {
......@@ -70,35 +71,37 @@ NDArray CSRGetData(
if (csr.sorted) {
// use binary search on each row
#pragma omp parallel for
for (int64_t p = 0; p < retlen; ++p) {
const IdType row_id = row_data[p * row_stride], col_id = col_data[p * col_stride];
CHECK(row_id >= 0 && row_id < csr.num_rows) << "Invalid row index: " << row_id;
CHECK(col_id >= 0 && col_id < csr.num_cols) << "Invalid col index: " << col_id;
const IdType *start_ptr = indices_data + indptr_data[row_id];
const IdType *end_ptr = indices_data + indptr_data[row_id + 1];
auto it = std::lower_bound(start_ptr, end_ptr, col_id);
if (it != end_ptr && *it == col_id) {
const IdType idx = it - indices_data;
IdType eid = data ? data[idx] : idx;
ret_data[p] = return_eids ? eid : weight_data[eid];
parallel_for(0, retlen, [&](size_t b, size_t e) {
for (auto p = b; p < e; ++p) {
const IdType row_id = row_data[p * row_stride], col_id = col_data[p * col_stride];
CHECK(row_id >= 0 && row_id < csr.num_rows) << "Invalid row index: " << row_id;
CHECK(col_id >= 0 && col_id < csr.num_cols) << "Invalid col index: " << col_id;
const IdType *start_ptr = indices_data + indptr_data[row_id];
const IdType *end_ptr = indices_data + indptr_data[row_id + 1];
auto it = std::lower_bound(start_ptr, end_ptr, col_id);
if (it != end_ptr && *it == col_id) {
const IdType idx = it - indices_data;
IdType eid = data ? data[idx] : idx;
ret_data[p] = return_eids ? eid : weight_data[eid];
}
}
}
});
} else {
// linear search on each row
#pragma omp parallel for
for (int64_t p = 0; p < retlen; ++p) {
const IdType row_id = row_data[p * row_stride], col_id = col_data[p * col_stride];
CHECK(row_id >= 0 && row_id < csr.num_rows) << "Invalid row index: " << row_id;
CHECK(col_id >= 0 && col_id < csr.num_cols) << "Invalid col index: " << col_id;
for (IdType idx = indptr_data[row_id]; idx < indptr_data[row_id + 1]; ++idx) {
if (indices_data[idx] == col_id) {
IdType eid = data ? data[idx] : idx;
ret_data[p] = return_eids ? eid : weight_data[eid];
break;
parallel_for(0, retlen, [&](size_t b, size_t e) {
for (auto p = b; p < e; ++p) {
const IdType row_id = row_data[p * row_stride], col_id = col_data[p * col_stride];
CHECK(row_id >= 0 && row_id < csr.num_rows) << "Invalid row index: " << row_id;
CHECK(col_id >= 0 && col_id < csr.num_cols) << "Invalid col index: " << col_id;
for (IdType idx = indptr_data[row_id]; idx < indptr_data[row_id + 1]; ++idx) {
if (indices_data[idx] == col_id) {
IdType eid = data ? data[idx] : idx;
ret_data[p] = return_eids ? eid : weight_data[eid];
break;
}
}
}
}
});
}
return ret;
}
......
......@@ -5,6 +5,7 @@
*/
#include <dgl/array.h>
#include <dgl/runtime/parallel_for.h>
#include <parallel_hashmap/phmap.h>
#include <vector>
#include "array_utils.h"
......@@ -12,6 +13,7 @@
namespace dgl {
using dgl::runtime::NDArray;
using dgl::runtime::parallel_for;
namespace aten {
......@@ -26,17 +28,17 @@ void CountNNZPerRow(
const IdType* B_indices,
IdType* C_indptr_data,
int64_t M) {
phmap::flat_hash_set<IdType> set;
#pragma omp parallel for firstprivate(set)
for (int64_t i = 0; i < M; ++i) {
set.clear();
for (IdType u = A_indptr[i]; u < A_indptr[i + 1]; ++u) {
IdType w = A_indices[u];
for (IdType v = B_indptr[w]; v < B_indptr[w + 1]; ++v)
set.insert(B_indices[v]);
parallel_for(0, M, [=](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
phmap::flat_hash_set<IdType> set;
for (IdType u = A_indptr[i]; u < A_indptr[i + 1]; ++u) {
IdType w = A_indices[u];
for (IdType v = B_indptr[w]; v < B_indptr[w + 1]; ++v)
set.insert(B_indices[v]);
}
C_indptr_data[i] = set.size();
}
C_indptr_data[i] = set.size();
}
});
}
template <typename IdType>
......@@ -66,27 +68,27 @@ void ComputeIndicesAndData(
IdType* C_indices_data,
DType* C_weights_data,
int64_t M) {
phmap::flat_hash_map<IdType, DType> map;
#pragma omp parallel for firstprivate(map)
for (int64_t i = 0; i < M; ++i) {
map.clear();
for (IdType u = A_indptr[i]; u < A_indptr[i + 1]; ++u) {
IdType w = A_indices[u];
DType vA = A_data[A_eids ? A_eids[u] : u];
for (IdType v = B_indptr[w]; v < B_indptr[w + 1]; ++v) {
IdType t = B_indices[v];
DType vB = B_data[B_eids ? B_eids[v] : v];
map[t] += vA * vB;
parallel_for(0, M, [=](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
phmap::flat_hash_map<IdType, DType> map;
for (IdType u = A_indptr[i]; u < A_indptr[i + 1]; ++u) {
IdType w = A_indices[u];
DType vA = A_data[A_eids ? A_eids[u] : u];
for (IdType v = B_indptr[w]; v < B_indptr[w + 1]; ++v) {
IdType t = B_indices[v];
DType vB = B_data[B_eids ? B_eids[v] : v];
map[t] += vA * vB;
}
}
}
IdType v = C_indptr_data[i];
for (auto it : map) {
C_indices_data[v] = it.first;
C_weights_data[v] = it.second;
++v;
IdType v = C_indptr_data[i];
for (auto it : map) {
C_indices_data[v] = it.first;
C_weights_data[v] = it.second;
++v;
}
}
}
});
}
}; // namespace
......
......@@ -4,6 +4,7 @@
* \brief CSR sorting
*/
#include <dgl/array.h>
#include <dgl/runtime/parallel_for.h>
#include <numeric>
#include <algorithm>
#include <vector>
......@@ -48,16 +49,14 @@ void CSRSort_(CSRMatrix* csr) {
csr->data = aten::Range(0, nnz, csr->indptr->dtype.bits, csr->indptr->ctx);
}
IdType* eid_data = static_cast<IdType*>(csr->data->data);
#pragma omp parallel
{
std::vector<ShufflePair> reorder_vec;
#pragma omp for
for (int64_t row = 0; row < num_rows; row++) {
runtime::parallel_for(0, num_rows, [=](size_t b, size_t e) {
for (auto row = b; row < e; ++row) {
const int64_t num_cols = indptr_data[row + 1] - indptr_data[row];
std::vector<ShufflePair> reorder_vec(num_cols);
IdType *col = indices_data + indptr_data[row];
IdType *eid = eid_data + indptr_data[row];
reorder_vec.resize(num_cols);
for (int64_t i = 0; i < num_cols; i++) {
reorder_vec[i].first = col[i];
reorder_vec[i].second = eid[i];
......@@ -71,7 +70,8 @@ void CSRSort_(CSRMatrix* csr) {
eid[i] = reorder_vec[i].second;
}
}
}
});
csr->sorted = true;
}
......@@ -101,37 +101,38 @@ std::pair<CSRMatrix, NDArray> CSRSortByTag(
auto out_indices_data = static_cast<IdType *>(output.indices->data);
auto out_eid_data = static_cast<IdType *>(output.data->data);
#pragma omp parallel for
for (IdType src = 0 ; src < num_rows ; ++src) {
const IdType start = indptr_data[src];
const IdType end = indptr_data[src + 1];
auto tag_pos_row = tag_pos_data + src * (num_tags + 1);
std::vector<IdType> pointer(num_tags, 0);
for (IdType ptr = start ; ptr < end ; ++ptr) {
const IdType dst = indices_data[ptr];
const TagType tag = tag_data[dst];
CHECK_LT(tag, num_tags);
++tag_pos_row[tag + 1];
} // count
for (TagType tag = 1 ; tag <= num_tags; ++tag) {
tag_pos_row[tag] += tag_pos_row[tag - 1];
} // cumulate
for (IdType ptr = start ; ptr < end ; ++ptr) {
const IdType dst = indices_data[ptr];
const IdType eid = eid_data[ptr];
const TagType tag = tag_data[dst];
const IdType offset = tag_pos_row[tag] + pointer[tag];
CHECK_LT(offset, tag_pos_row[tag + 1]);
++pointer[tag];
out_indices_data[start + offset] = dst;
out_eid_data[start + offset] = eid;
runtime::parallel_for(0, num_rows, [&](size_t b, size_t e) {
for (auto src = b; src < e; ++src) {
const IdType start = indptr_data[src];
const IdType end = indptr_data[src + 1];
auto tag_pos_row = tag_pos_data + src * (num_tags + 1);
std::vector<IdType> pointer(num_tags, 0);
for (IdType ptr = start ; ptr < end ; ++ptr) {
const IdType dst = indices_data[ptr];
const TagType tag = tag_data[dst];
CHECK_LT(tag, num_tags);
++tag_pos_row[tag + 1];
} // count
for (TagType tag = 1 ; tag <= num_tags; ++tag) {
tag_pos_row[tag] += tag_pos_row[tag - 1];
} // cumulate
for (IdType ptr = start ; ptr < end ; ++ptr) {
const IdType dst = indices_data[ptr];
const IdType eid = eid_data[ptr];
const TagType tag = tag_data[dst];
const IdType offset = tag_pos_row[tag] + pointer[tag];
CHECK_LT(offset, tag_pos_row[tag + 1]);
++pointer[tag];
out_indices_data[start + offset] = dst;
out_eid_data[start + offset] = eid;
}
}
}
});
output.sorted = false;
return std::make_pair(output, tag_pos);
}
......
......@@ -5,6 +5,7 @@
*/
#include <dgl/array.h>
#include <dgl/runtime/parallel_for.h>
#include <parallel_hashmap/phmap.h>
#include <vector>
#include "array_utils.h"
......@@ -25,16 +26,17 @@ void CountNNZPerRow(
IdType* C_indptr_data,
int64_t M) {
int64_t n = A_indptr.size();
phmap::flat_hash_set<IdType> set;
#pragma omp parallel for firstprivate(set)
for (IdType i = 0; i < M; ++i) {
set.clear();
for (int64_t k = 0; k < n; ++k) {
for (IdType u = A_indptr[k][i]; u < A_indptr[k][i + 1]; ++u)
set.insert(A_indices[k][u]);
runtime::parallel_for(0, M, [=](size_t b, size_t e) {
for (size_t i = b; i < e; ++i) {
phmap::flat_hash_set<IdType> set;
for (int64_t k = 0; k < n; ++k) {
for (IdType u = A_indptr[k][i]; u < A_indptr[k][i + 1]; ++u)
set.insert(A_indices[k][u]);
}
C_indptr_data[i] = set.size();
}
C_indptr_data[i] = set.size();
}
});
}
template <typename IdType>
......@@ -61,25 +63,24 @@ void ComputeIndicesAndData(
DType* C_weights_data,
int64_t M) {
int64_t n = A_indptr.size();
phmap::flat_hash_map<IdType, DType> map;
#pragma omp parallel for firstprivate(map)
for (int64_t i = 0; i < M; ++i) {
map.clear();
for (int64_t k = 0; k < n; ++k) {
for (IdType u = A_indptr[k][i]; u < A_indptr[k][i + 1]; ++u) {
IdType kA = A_indices[k][u];
DType vA = A_data[k][A_eids[k] ? A_eids[k][u] : u];
map[kA] += vA;
runtime::parallel_for(0, M, [=](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
phmap::flat_hash_map<IdType, DType> map;
for (int64_t k = 0; k < n; ++k) {
for (IdType u = A_indptr[k][i]; u < A_indptr[k][i + 1]; ++u) {
IdType kA = A_indices[k][u];
DType vA = A_data[k][A_eids[k] ? A_eids[k][u] : u];
map[kA] += vA;
}
}
IdType j = C_indptr_data[i];
for (auto it : map) {
C_indices_data[j] = it.first;
C_weights_data[j] = it.second;
++j;
}
}
IdType j = C_indptr_data[i];
for (auto it : map) {
C_indices_data[j] = it.first;
C_weights_data[j] = it.second;
++j;
}
}
});
}
}; // namespace
......
......@@ -4,7 +4,7 @@
* \brief COO sorting
*/
#include <dgl/array.h>
#include <dgl/runtime/parallel_for.h>
#include <numeric>
#include <algorithm>
#include <vector>
......@@ -54,6 +54,7 @@ CSRMatrix UnionCsr(const std::vector<CSRMatrix>& csrs) {
for (int64_t i = 1; i <= csrs[0].num_rows; ++i) {
std::vector<int64_t> indices_off;
res_indptr[i] = indptr_data[0][i];
indices_off.push_back(indptr_data[0][i-1]);
for (size_t j = 1; j < csrs.size(); ++j) {
res_indptr[i] += indptr_data[j][i];
......@@ -74,7 +75,6 @@ CSRMatrix UnionCsr(const std::vector<CSRMatrix>& csrs) {
}
} // for check out of bound
} // for
res_indices[off] = min;
res_data[off] = data_data[min_idx][indices_off[min_idx]];
indices_off[min_idx] += 1;
......
......@@ -147,6 +147,7 @@ COOMatrix CSRRowWisePick(CSRMatrix mat, IdArray rows,
global_prefix[t+1] += global_prefix[t];
}
}
#pragma omp barrier
const IdxType thread_offset = global_prefix[thread_id];
......
......@@ -8,6 +8,7 @@
#include <dgl/array.h>
#include <dgl/bcast.h>
#include <dgl/runtime/parallel_for.h>
#include "../selector.h"
namespace dgl {
......@@ -40,24 +41,27 @@ void SDDMMCsr(const BcastOff& bcast,
rhs_dim = bcast.rhs_len,
reduce_size = bcast.reduce_size;
DType* O = out.Ptr<DType>();
#pragma omp parallel for
for (IdType rid = 0; rid < csr.num_rows; ++rid) {
const IdType row_start = indptr[rid], row_end = indptr[rid + 1];
for (IdType j = row_start; j < row_end; ++j) {
const IdType cid = indices[j];
const IdType eid = has_idx? edges[j] : j;
DType* out_off = O + eid * dim;
for (int64_t k = 0; k < dim; ++k) {
const int64_t lhs_add = bcast.use_bcast ? bcast.lhs_offset[k] : k;
const int64_t rhs_add = bcast.use_bcast ? bcast.rhs_offset[k] : k;
const DType* lhs_off = Op::use_lhs?
X + Selector<LhsTarget>::Call(rid, eid, cid) * lhs_dim + lhs_add * reduce_size : nullptr;
const DType* rhs_off = Op::use_rhs?
Y + Selector<RhsTarget>::Call(rid, eid, cid) * rhs_dim + rhs_add * reduce_size : nullptr;
out_off[k] = Op::Call(lhs_off, rhs_off, reduce_size);
runtime::parallel_for(0, csr.num_rows, [=](IdType b, IdType e) {
for (auto rid = b; rid < e; ++rid) {
const IdType row_start = indptr[rid], row_end = indptr[rid + 1];
for (IdType j = row_start; j < row_end; ++j) {
const IdType cid = indices[j];
const IdType eid = has_idx? edges[j] : j;
DType* out_off = O + eid * dim;
for (int64_t k = 0; k < dim; ++k) {
const int64_t lhs_add = bcast.use_bcast ? bcast.lhs_offset[k] : k;
const int64_t rhs_add = bcast.use_bcast ? bcast.rhs_offset[k] : k;
const DType* lhs_off = Op::use_lhs
? X + Selector<LhsTarget>::Call(rid, eid, cid) * lhs_dim + lhs_add * reduce_size
: nullptr;
const DType* rhs_off = Op::use_rhs
? Y + Selector<RhsTarget>::Call(rid, eid, cid) * rhs_dim + rhs_add * reduce_size
: nullptr;
out_off[k] = Op::Call(lhs_off, rhs_off, reduce_size);
}
}
}
}
});
}
/*!
......@@ -86,9 +90,8 @@ void SDDMMCoo(const BcastOff& bcast,
rhs_dim = bcast.rhs_len,
reduce_size = bcast.reduce_size;
DType* O = out.Ptr<DType>();
const int64_t nnz = coo.row->shape[0];
#pragma omp parallel for
for (IdType i = 0; i < nnz; ++i) {
for (int64_t i = 0; i < coo.row->shape[0]; ++i) {
const IdType rid = row[i];
const IdType cid = col[i];
const IdType eid = has_idx? edges[i] : i;
......
......@@ -7,6 +7,7 @@
#define DGL_ARRAY_CPU_SEGMENT_REDUCE_H_
#include <dgl/array.h>
#include <dgl/runtime/parallel_for.h>
namespace dgl {
namespace aten {
......@@ -27,14 +28,15 @@ void SegmentSum(NDArray feat, NDArray offsets, NDArray out) {
const DType* feat_data = feat.Ptr<DType>();
const IdType* offsets_data = offsets.Ptr<IdType>();
DType *out_data = out.Ptr<DType>();
#pragma omp parallel for
for (int i = 0; i < n; ++i) {
for (IdType j = offsets_data[i]; j < offsets_data[i + 1]; ++j) {
for (int k = 0; k < dim; ++k) {
out_data[i * dim + k] += feat_data[j * dim + k];
runtime::parallel_for(0, n, [=](int b, int e) {
for (auto i = b; i < e; ++i) {
for (IdType j = offsets_data[i]; j < offsets_data[i + 1]; ++j) {
for (int k = 0; k < dim; ++k) {
out_data[i * dim + k] += feat_data[j * dim + k];
}
}
}
}
});
}
/*!
......@@ -58,18 +60,19 @@ void SegmentCmp(NDArray feat, NDArray offsets,
IdType *arg_data = arg.Ptr<IdType>();
std::fill(out_data, out_data + out.NumElements(), Cmp::zero);
std::fill(arg_data, arg_data + arg.NumElements(), -1);
#pragma omp parallel for
for (int i = 0; i < n; ++i) {
for (IdType j = offsets_data[i]; j < offsets_data[i + 1]; ++j) {
for (int k = 0; k < dim; ++k) {
const DType val = feat_data[j * dim + k];
if (Cmp::Call(out_data[i * dim + k], val)) {
out_data[i * dim + k] = val;
arg_data[i * dim + k] = j;
runtime::parallel_for(0, n, [=](int b, int e) {
for (auto i = b; i < e; ++i) {
for (IdType j = offsets_data[i]; j < offsets_data[i + 1]; ++j) {
for (int k = 0; k < dim; ++k) {
const DType val = feat_data[j * dim + k];
if (Cmp::Call(out_data[i * dim + k], val)) {
out_data[i * dim + k] = val;
arg_data[i * dim + k] = j;
}
}
}
}
}
});
}
/*!
......@@ -114,14 +117,15 @@ void BackwardSegmentCmp(NDArray feat, NDArray arg, NDArray out) {
const DType* feat_data = feat.Ptr<DType>();
const IdType* arg_data = arg.Ptr<IdType>();
DType* out_data = out.Ptr<DType>();
#pragma omp parallel for
for (int i = 0; i < n; ++i) {
for (int k = 0; k < dim; ++k) {
int write_row = arg_data[i * dim + k];
if (write_row >= 0)
out_data[write_row * dim + k] = feat_data[i * dim + k];
runtime::parallel_for(0, n, [=](int b, int e) {
for (auto i = b; i < e; ++i) {
for (int k = 0; k < dim; ++k) {
int write_row = arg_data[i * dim + k];
if (write_row >= 0)
out_data[write_row * dim + k] = feat_data[i * dim + k];
}
}
}
});
}
} // namespace cpu
......
......@@ -4,6 +4,7 @@
* \brief CPU implementation of COO sparse matrix operators
*/
#include <dmlc/omp.h>
#include <dgl/runtime/parallel_for.h>
#include <vector>
#include <unordered_set>
#include <unordered_map>
......@@ -14,6 +15,7 @@
namespace dgl {
using runtime::NDArray;
using runtime::parallel_for;
namespace aten {
namespace impl {
......@@ -55,12 +57,13 @@ NDArray COOIsNonZero(COOMatrix coo, NDArray row, NDArray col) {
const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1;
const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1;
const int64_t kmax = std::max(rowlen, collen);
#pragma omp parallel for
for (int64_t k = 0; k < kmax; ++k) {
int64_t i = row_stride * k;
int64_t j = col_stride * k;
rst_data[k] = COOIsNonZero<XPU, IdType>(coo, row_data[i], col_data[j])? 1 : 0;
}
parallel_for(0, kmax, [=](size_t b, size_t e) {
for (auto k = b; k < e; ++k) {
int64_t i = row_stride * k;
int64_t j = col_stride * k;
rst_data[k] = COOIsNonZero<XPU, IdType>(coo, row_data[i], col_data[j])? 1 : 0;
}
});
return rst;
}
......@@ -114,8 +117,9 @@ NDArray COOGetRowNNZ(COOMatrix coo, NDArray rows) {
NDArray rst = NDArray::Empty({len}, rows->dtype, rows->ctx);
IdType* rst_data = static_cast<IdType*>(rst->data);
#pragma omp parallel for
for (int64_t i = 0; i < len; ++i)
for (int64_t i = 0; i < len; ++i) {
rst_data[i] = COOGetRowNNZ<XPU, IdType>(coo, vid_data[i]);
}
return rst;
}
......@@ -178,18 +182,19 @@ IdArray COOGetData(COOMatrix coo, IdArray rows, IdArray cols) {
// the choice.
if (coo.row_sorted) {
#pragma omp parallel for
for (int64_t p = 0; p < retlen; ++p) {
const IdType row_id = row_data[p * row_stride], col_id = col_data[p * col_stride];
auto it = std::lower_bound(coo_row, coo_row + nnz, row_id);
for (; it < coo_row + nnz && *it == row_id; ++it) {
const auto idx = it - coo_row;
if (coo_col[idx] == col_id) {
ret_data[p] = data? data[idx] : idx;
break;
parallel_for(0, retlen, [&](size_t b, size_t e) {
for (auto p = b; p < e; ++p) {
const IdType row_id = row_data[p * row_stride], col_id = col_data[p * col_stride];
auto it = std::lower_bound(coo_row, coo_row + nnz, row_id);
for (; it < coo_row + nnz && *it == row_id; ++it) {
const auto idx = it - coo_row;
if (coo_col[idx] == col_id) {
ret_data[p] = data? data[idx] : idx;
break;
}
}
}
}
});
} else {
#pragma omp parallel for
for (int64_t p = 0; p < retlen; ++p) {
......@@ -328,67 +333,66 @@ CSRMatrix COOToCSR(COOMatrix coo) {
IdType * const fill_data = data ? nullptr : static_cast<IdType*>(coo.data->data);
if (NNZ > 0) {
#pragma omp parallel
{
const int num_threads = omp_get_num_threads();
const int thread_id = omp_get_thread_num();
// We partition the set of non-zeros among the threads
const int64_t nz_chunk = (NNZ+num_threads-1)/num_threads;
const int64_t nz_start = thread_id*nz_chunk;
const int64_t nz_end = std::min(NNZ, nz_start+nz_chunk);
// Each thread searches the row array for a change, and marks its
// location in Bp. Threads, other than the first, start at the last
// index covered by the previous, in order to detect changes in the row
// array between thread partitions. This means that each thread after
// the first, searches the range [nz_start-1, nz_end). That is,
// if we had 10 non-zeros, and 4 threads, the indexes searched by each
// thread would be:
// 0: [0, 1, 2]
// 1: [2, 3, 4, 5]
// 2: [5, 6, 7, 8]
// 3: [8, 9]
//
// That way, if the row array were [0, 0, 1, 2, 2, 2, 4, 5, 5, 6], each
// change in row would be captured by one thread:
//
// 0: [0, 0, 1] - row 0
// 1: [1, 2, 2, 2] - row 1
// 2: [2, 4, 5, 5] - rows 2, 3, and 4
// 3: [5, 6] - rows 5 and 6
//
int64_t row = 0;
if (nz_start < nz_end) {
row = nz_start == 0 ? 0 : row_data[nz_start-1];
for (int64_t i = nz_start; i < nz_end; ++i) {
while (row != row_data[i]) {
++row;
Bp[row] = i;
auto num_threads = omp_get_max_threads();
parallel_for(0, num_threads, [&](int b, int e) {
for (auto thread_id = b; thread_id < e; ++thread_id) {
// We partition the set of non-zeros among the threads
const int64_t nz_chunk = (NNZ+num_threads-1)/num_threads;
const int64_t nz_start = thread_id*nz_chunk;
const int64_t nz_end = std::min(NNZ, nz_start+nz_chunk);
// Each thread searches the row array for a change, and marks its
// location in Bp. Threads, other than the first, start at the last
// index covered by the previous, in order to detect changes in the row
// array between thread partitions. This means that each thread after
// the first, searches the range [nz_start-1, nz_end). That is,
// if we had 10 non-zeros, and 4 threads, the indexes searched by each
// thread would be:
// 0: [0, 1, 2]
// 1: [2, 3, 4, 5]
// 2: [5, 6, 7, 8]
// 3: [8, 9]
//
// That way, if the row array were [0, 0, 1, 2, 2, 2, 4, 5, 5, 6], each
// change in row would be captured by one thread:
//
// 0: [0, 0, 1] - row 0
// 1: [1, 2, 2, 2] - row 1
// 2: [2, 4, 5, 5] - rows 2, 3, and 4
// 3: [5, 6] - rows 5 and 6
//
int64_t row = 0;
if (nz_start < nz_end) {
row = nz_start == 0 ? 0 : row_data[nz_start-1];
for (int64_t i = nz_start; i < nz_end; ++i) {
while (row != row_data[i]) {
++row;
Bp[row] = i;
}
}
}
// We will not detect the row change for the last row, nor any empty
// rows at the end of the matrix, so the last active thread needs to
// mark all remaining rows in Bp with NNZ.
if (nz_end == NNZ) {
while (row < N) {
++row;
Bp[row] = NNZ;
// We will not detect the row change for the last row, nor any empty
// rows at the end of the matrix, so the last active thread needs to
// mark all remaining rows in Bp with NNZ.
if (nz_end == NNZ) {
while (row < N) {
++row;
Bp[row] = NNZ;
}
}
}
if (fill_data) {
// TODO(minjie): Many of our current implementation assumes that CSR must have
// a data array. This is a temporary workaround. Remove this after:
// - The old immutable graph implementation is deprecated.
// - The old binary reduce kernel is deprecated.
std::iota(fill_data+nz_start,
fill_data+nz_end,
nz_start);
if (fill_data) {
// TODO(minjie): Many of our current implementation assumes that CSR must have
// a data array. This is a temporary workaround. Remove this after:
// - The old immutable graph implementation is deprecated.
// - The old binary reduce kernel is deprecated.
std::iota(fill_data+nz_start,
fill_data+nz_end,
nz_start);
}
}
}
}
});
} else {
std::fill(Bp, Bp+N+1, 0);
}
......@@ -627,11 +631,12 @@ COOMatrix COOReorder(COOMatrix coo, runtime::NDArray new_row_id_arr,
IdType *out_row = static_cast<IdType*>(out_row_arr->data);
IdType *out_col = static_cast<IdType*>(out_col_arr->data);
#pragma omp parallel for
for (int64_t i = 0; i < nnz; i++) {
out_row[i] = new_row_ids[in_rows[i]];
out_col[i] = new_col_ids[in_cols[i]];
}
parallel_for(0, nnz, [=](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
out_row[i] = new_row_ids[in_rows[i]];
out_col[i] = new_col_ids[in_cols[i]];
}
});
return COOMatrix(num_rows, num_cols, out_row_arr, out_col_arr, out_data_arr);
}
......
......@@ -4,6 +4,7 @@
* \brief CSR matrix operator CPU implementation
*/
#include <dgl/array.h>
#include <dgl/runtime/parallel_for.h>
#include <vector>
#include <unordered_set>
#include <numeric>
......@@ -12,6 +13,7 @@
namespace dgl {
using runtime::NDArray;
using runtime::parallel_for;
namespace aten {
namespace impl {
......@@ -491,11 +493,12 @@ CSRMatrix CSRReorder(CSRMatrix csr, runtime::NDArray new_row_id_arr,
// Compute the length of rows for the new matrix.
std::vector<IdType> new_row_lens(num_rows, -1);
#pragma omp parallel for
for (int64_t i = 0; i < num_rows; i++) {
int64_t new_row_id = new_row_ids[i];
new_row_lens[new_row_id] = in_indptr[i + 1] - in_indptr[i];
}
parallel_for(0, num_rows, [=, &new_row_lens](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
int64_t new_row_id = new_row_ids[i];
new_row_lens[new_row_id] = in_indptr[i + 1] - in_indptr[i];
}
});
// Compute the starting location of each row in the new matrix.
out_indptr[0] = 0;
// This is sequential. It should be pretty fast.
......@@ -506,23 +509,24 @@ CSRMatrix CSRReorder(CSRMatrix csr, runtime::NDArray new_row_id_arr,
CHECK_EQ(out_indptr[num_rows], nnz);
// Copy indices and data with the new order.
// Here I iterate rows in the order of the old matrix.
#pragma omp parallel for
for (int64_t i = 0; i < num_rows; i++) {
const IdType *in_row = in_indices + in_indptr[i];
const IdType *in_row_data = in_data + in_indptr[i];
int64_t new_row_id = new_row_ids[i];
IdType *out_row = out_indices + out_indptr[new_row_id];
IdType *out_row_data = out_data + out_indptr[new_row_id];
int64_t row_len = new_row_lens[new_row_id];
// Here I iterate col indices in a row in the order of the old matrix.
for (int64_t j = 0; j < row_len; j++) {
out_row[j] = new_col_ids[in_row[j]];
out_row_data[j] = in_row_data[j];
parallel_for(0, num_rows, [=](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
const IdType *in_row = in_indices + in_indptr[i];
const IdType *in_row_data = in_data + in_indptr[i];
int64_t new_row_id = new_row_ids[i];
IdType *out_row = out_indices + out_indptr[new_row_id];
IdType *out_row_data = out_data + out_indptr[new_row_id];
int64_t row_len = new_row_lens[new_row_id];
// Here I iterate col indices in a row in the order of the old matrix.
for (int64_t j = 0; j < row_len; j++) {
out_row[j] = new_col_ids[in_row[j]];
out_row_data[j] = in_row_data[j];
}
// TODO(zhengda) maybe we should sort the column indices.
}
// TODO(zhengda) maybe we should sort the column indices.
}
});
return CSRMatrix(num_rows, num_cols,
out_indptr_arr, out_indices_arr, out_data_arr);
}
......
......@@ -8,6 +8,7 @@
#include <dgl/array.h>
#include <dgl/bcast.h>
#include <dgl/runtime/parallel_for.h>
#include <algorithm>
#include <limits>
#include <memory>
......@@ -46,16 +47,18 @@ void SpMMSumCsrXbyak(dgl::ElemWiseAddUpdate<Op>* cpu_spec, const BcastOff& bcast
const IdType* indices = csr.indices.Ptr<IdType>();
const IdType* edges = csr.data.Ptr<IdType>();
int64_t dim = bcast.out_len, lhs_dim = bcast.lhs_len, rhs_dim = bcast.rhs_len;
#pragma omp parallel for
for (IdType rid = 0; rid < csr.num_rows; ++rid) {
const IdType row_start = indptr[rid], row_end = indptr[rid + 1];
DType* out_off = O + rid * dim;
for (IdType j = row_start; j < row_end; ++j) {
const IdType cid = indices[j];
const IdType eid = has_idx ? edges[j] : j;
cpu_spec->run(out_off, X + cid * lhs_dim, W + eid * rhs_dim, dim);
runtime::parallel_for(0, csr.num_rows, [&](size_t b, size_t e) {
for (auto rid = b; rid < e; ++rid) {
const IdType row_start = indptr[rid], row_end = indptr[rid + 1];
DType* out_off = O + rid * dim;
for (IdType j = row_start; j < row_end; ++j) {
const IdType cid = indices[j];
const IdType eid = has_idx ? edges[j] : j;
cpu_spec->run(out_off, X + cid * lhs_dim, W + eid * rhs_dim, dim);
}
}
}
});
}
#endif // USE_AVX
#endif // _WIN32
......@@ -79,24 +82,25 @@ void SpMMSumCsrNaive(const BcastOff& bcast, const CSRMatrix& csr, const DType* X
const IdType* indices = csr.indices.Ptr<IdType>();
const IdType* edges = csr.data.Ptr<IdType>();
int64_t dim = bcast.out_len, lhs_dim = bcast.lhs_len, rhs_dim = bcast.rhs_len;
#pragma omp parallel for
for (IdType rid = 0; rid < csr.num_rows; ++rid) {
const IdType row_start = indptr[rid], row_end = indptr[rid + 1];
DType* out_off = O + rid * dim;
for (IdType j = row_start; j < row_end; ++j) {
const IdType cid = indices[j];
const IdType eid = has_idx ? edges[j] : j;
for (int64_t k = 0; k < dim; ++k) {
const int64_t lhs_add = bcast.use_bcast ? bcast.lhs_offset[k] : k;
const int64_t rhs_add = bcast.use_bcast ? bcast.rhs_offset[k] : k;
const DType* lhs_off =
Op::use_lhs ? X + cid * lhs_dim + lhs_add : nullptr;
const DType* rhs_off =
Op::use_rhs ? W + eid * rhs_dim + rhs_add : nullptr;
out_off[k] += Op::Call(lhs_off, rhs_off);
runtime::parallel_for(0, csr.num_rows, [&](size_t b, size_t e) {
for (auto rid = b; rid < e; ++rid) {
const IdType row_start = indptr[rid], row_end = indptr[rid + 1];
DType* out_off = O + rid * dim;
for (IdType j = row_start; j < row_end; ++j) {
const IdType cid = indices[j];
const IdType eid = has_idx ? edges[j] : j;
for (int64_t k = 0; k < dim; ++k) {
const int64_t lhs_add = bcast.use_bcast ? bcast.lhs_offset[k] : k;
const int64_t rhs_add = bcast.use_bcast ? bcast.rhs_offset[k] : k;
const DType* lhs_off =
Op::use_lhs ? X + cid * lhs_dim + lhs_add : nullptr;
const DType* rhs_off =
Op::use_rhs ? W + eid * rhs_dim + rhs_add : nullptr;
out_off[k] += Op::Call(lhs_off, rhs_off);
}
}
}
}
});
}
/*!
......@@ -270,31 +274,32 @@ void SpMMCmpCsr(const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat,
#endif // USE_AVX
#endif // _WIN32
#pragma omp parallel for
for (IdType rid = 0; rid < csr.num_rows; ++rid) {
const IdType row_start = indptr[rid], row_end = indptr[rid + 1];
DType* out_off = O + rid * dim;
IdType* argx_off = argX + rid * dim;
IdType* argw_off = argW + rid * dim;
for (IdType j = row_start; j < row_end; ++j) {
const IdType cid = indices[j];
const IdType eid = has_idx ? edges[j] : j;
for (int64_t k = 0; k < dim; ++k) {
const int64_t lhs_add = bcast.use_bcast ? bcast.lhs_offset[k] : k;
const int64_t rhs_add = bcast.use_bcast ? bcast.rhs_offset[k] : k;
const DType* lhs_off =
Op::use_lhs ? X + cid * lhs_dim + lhs_add : nullptr;
const DType* rhs_off =
Op::use_rhs ? W + eid * rhs_dim + rhs_add : nullptr;
const DType val = Op::Call(lhs_off, rhs_off);
if (Cmp::Call(out_off[k], val)) {
out_off[k] = val;
if (Op::use_lhs) argx_off[k] = cid;
if (Op::use_rhs) argw_off[k] = eid;
runtime::parallel_for(0, csr.num_rows, [&](size_t b, size_t e) {
for (auto rid = b; rid < e; ++rid) {
const IdType row_start = indptr[rid], row_end = indptr[rid + 1];
DType* out_off = O + rid * dim;
IdType* argx_off = argX + rid * dim;
IdType* argw_off = argW + rid * dim;
for (IdType j = row_start; j < row_end; ++j) {
const IdType cid = indices[j];
const IdType eid = has_idx ? edges[j] : j;
for (int64_t k = 0; k < dim; ++k) {
const int64_t lhs_add = bcast.use_bcast ? bcast.lhs_offset[k] : k;
const int64_t rhs_add = bcast.use_bcast ? bcast.rhs_offset[k] : k;
const DType* lhs_off =
Op::use_lhs ? X + cid * lhs_dim + lhs_add : nullptr;
const DType* rhs_off =
Op::use_rhs ? W + eid * rhs_dim + rhs_add : nullptr;
const DType val = Op::Call(lhs_off, rhs_off);
if (Cmp::Call(out_off[k], val)) {
out_off[k] = val;
if (Op::use_lhs) argx_off[k] = cid;
if (Op::use_rhs) argw_off[k] = eid;
}
}
}
}
}
}
});
#if !defined(_WIN32)
#ifdef USE_AVX
#ifdef USE_LIBXSMM
......
......@@ -8,6 +8,7 @@
#include <dgl/immutable_graph.h>
#include <dgl/packed_func_ext.h>
#include <dgl/runtime/container.h>
#include <dgl/runtime/parallel_for.h>
#include <algorithm>
#include "../c_api_common.h"
......@@ -261,34 +262,36 @@ IdArray GraphOp::MapParentIdToSubgraphId(IdArray parent_vids, IdArray query) {
const bool is_sorted = std::is_sorted(parent_data, parent_data + parent_len);
if (is_sorted) {
#pragma omp parallel for
for (int64_t i = 0; i < query_len; i++) {
const dgl_id_t id = query_data[i];
const auto it = std::find(parent_data, parent_data + parent_len, id);
// If the vertex Id doesn't exist, the vid in the subgraph is -1.
if (it != parent_data + parent_len) {
rst_data[i] = it - parent_data;
} else {
rst_data[i] = -1;
runtime::parallel_for(0, query_len, [&](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
const dgl_id_t id = query_data[i];
const auto it = std::find(parent_data, parent_data + parent_len, id);
// If the vertex Id doesn't exist, the vid in the subgraph is -1.
if (it != parent_data + parent_len) {
rst_data[i] = it - parent_data;
} else {
rst_data[i] = -1;
}
}
}
});
} else {
std::unordered_map<dgl_id_t, dgl_id_t> parent_map;
for (int64_t i = 0; i < parent_len; i++) {
const dgl_id_t id = parent_data[i];
parent_map[id] = i;
}
#pragma omp parallel for
for (int64_t i = 0; i < query_len; i++) {
const dgl_id_t id = query_data[i];
auto it = parent_map.find(id);
// If the vertex Id doesn't exist, the vid in the subgraph is -1.
if (it != parent_map.end()) {
rst_data[i] = it->second;
} else {
rst_data[i] = -1;
runtime::parallel_for(0, query_len, [&](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
const dgl_id_t id = query_data[i];
auto it = parent_map.find(id);
// If the vertex Id doesn't exist, the vid in the subgraph is -1.
if (it != parent_map.end()) {
rst_data[i] = it->second;
} else {
rst_data[i] = -1;
}
}
}
});
}
return rst;
}
......@@ -567,14 +570,15 @@ DGL_REGISTER_GLOBAL("transform._CAPI_DGLPartitionWithHalo")
graph_ptr->GetInCSR();
std::vector<std::shared_ptr<HaloSubgraph> > subgs(max_part_id + 1);
int num_partitions = part_nodes.size();
#pragma omp parallel for
for (int i = 0; i < num_partitions; i++) {
auto nodes = aten::VecToIdArray(part_nodes[i]);
HaloSubgraph subg = GraphOp::GetSubgraphWithHalo(graph_ptr, nodes, num_hops);
std::shared_ptr<HaloSubgraph> subg_ptr(new HaloSubgraph(subg));
int part_id = part_ids[i];
subgs[part_id] = subg_ptr;
}
runtime::parallel_for(0, num_partitions, [&](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
auto nodes = aten::VecToIdArray(part_nodes[i]);
HaloSubgraph subg = GraphOp::GetSubgraphWithHalo(graph_ptr, nodes, num_hops);
std::shared_ptr<HaloSubgraph> subg_ptr(new HaloSubgraph(subg));
int part_id = part_ids[i];
subgs[part_id] = subg_ptr;
}
});
List<SubgraphRef> ret_list;
for (size_t i = 0; i < subgs.size(); i++) {
ret_list.push_back(SubgraphRef(subgs[i]));
......@@ -732,24 +736,25 @@ IdArray MapIds(IdArray ids, IdArray range_starts, IdArray range_ends, IdArray ty
const IdType *typed_map_data = static_cast<IdType *>(typed_map->data);
IdType *types_data = static_cast<IdType *>(ret->data);
IdType *per_type_ids_data = static_cast<IdType *>(ret->data) + num_ids;
#pragma omp parallel for
for (int64_t i = 0; i < ids->shape[0]; i++) {
IdType id = ids_data[i];
auto it = std::lower_bound(range_end_data, range_end_data + num_ranges, id);
// The range must exist.
BUG_IF_FAIL(it != range_end_data + num_ranges);
size_t range_id = it - range_end_data;
int type_id = range_id % num_types;
types_data[i] = type_id;
int part_id = range_id / num_types;
BUG_IF_FAIL(part_id < num_parts);
if (part_id == 0) {
per_type_ids_data[i] = id - range_start_data[range_id];
} else {
per_type_ids_data[i] = id - range_start_data[range_id]
runtime::parallel_for(0, ids->shape[0], [&](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
IdType id = ids_data[i];
auto it = std::lower_bound(range_end_data, range_end_data + num_ranges, id);
// The range must exist.
BUG_IF_FAIL(it != range_end_data + num_ranges);
size_t range_id = it - range_end_data;
int type_id = range_id % num_types;
types_data[i] = type_id;
int part_id = range_id / num_types;
BUG_IF_FAIL(part_id < num_parts);
if (part_id == 0) {
per_type_ids_data[i] = id - range_start_data[range_id];
} else {
per_type_ids_data[i] = id - range_start_data[range_id]
+ typed_map_data[num_parts * type_id + part_id - 1];
}
}
}
});
return ret;
}
......
......@@ -8,6 +8,7 @@
#include <dgl/packed_func_ext.h>
#include <dgl/immutable_graph.h>
#include <dgl/runtime/container.h>
#include <dgl/runtime/parallel_for.h>
#include <set>
#include "../c_api_common.h"
......@@ -629,14 +630,19 @@ DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroCreateFormat")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
HeteroGraphRef hg = args[0];
dgl_format_code_t code = hg->GetRelationGraph(0)->GetAllowedFormats();
#if !defined(DGL_USE_CUDA)
#pragma omp parallel for
auto get_format_f = [&](size_t etype_b, size_t etype_e) {
for (auto etype = etype_b; etype < etype_e; ++etype) {
auto bg = std::dynamic_pointer_cast<UnitGraph>(hg->GetRelationGraph(etype));
for (auto format : CodeToSparseFormats(code))
bg->GetFormat(format);
}
};
#if !(defined(DGL_USE_CUDA))
runtime::parallel_for(0, hg->NumEdgeTypes(), get_format_f);
#else
get_format_f(0, hg->NumEdgeTypes());
#endif
for (int64_t etype = 0; etype < hg->NumEdgeTypes(); ++etype) {
auto bg = std::dynamic_pointer_cast<UnitGraph>(hg->GetRelationGraph(etype));
for (auto format : CodeToSparseFormats(code))
bg->GetFormat(format);
}
});
DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroGetFormatGraph")
......
......@@ -9,6 +9,7 @@
#include <dgl/runtime/container.h>
#include <dgl/runtime/ndarray.h>
#include <dgl/runtime/parallel_for.h>
#include <dgl/packed_func_ext.h>
#include <dgl/immutable_graph.h>
#include <dgl/nodeflow.h>
......@@ -829,15 +830,16 @@ DGL_REGISTER_GLOBAL("network._CAPI_FastPull")
char *return_data = new char[ID_size*row_size];
const int64_t local_ids_size = local_ids.size();
// Copy local data
#pragma omp parallel for
for (int64_t i = 0; i < local_ids_size; ++i) {
CHECK_GE(ID_size*row_size, local_ids_orginal[i] * row_size + row_size);
CHECK_GE(data_size, local_ids[i] * row_size + row_size);
CHECK_GE(local_ids[i], 0);
memcpy(return_data + local_ids_orginal[i] * row_size,
local_data_char + local_ids[i] * row_size,
row_size);
}
runtime::parallel_for(0, local_ids_size, [&](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
CHECK_GE(ID_size*row_size, local_ids_orginal[i] * row_size + row_size);
CHECK_GE(data_size, local_ids[i] * row_size + row_size);
CHECK_GE(local_ids[i], 0);
memcpy(return_data + local_ids_orginal[i] * row_size,
local_data_char + local_ids[i] * row_size,
row_size);
}
});
// Recv remote message
for (int i = 0; i < msg_count; ++i) {
KVStoreMsg *kv_msg = recv_kv_message(receiver);
......
......@@ -9,6 +9,7 @@
#include <dgl/runtime/container.h>
#include <dgl/packed_func_ext.h>
#include <dgl/random.h>
#include <dgl/runtime/parallel_for.h>
#include <dmlc/omp.h>
#include <algorithm>
#include <cstdlib>
......@@ -850,19 +851,20 @@ std::vector<NodeFlow> NeighborSamplingImpl(const ImmutableGraphPtr gptr,
BuildCsr(*gptr, neigh_type);
// generate node flows
std::vector<NodeFlow> nflows(num_workers);
#pragma omp parallel for
for (int i = 0; i < num_workers; i++) {
// create per-worker seed nodes.
const int64_t start = (batch_start_id + i) * batch_size;
const int64_t end = std::min(start + batch_size, num_seeds);
// TODO(minjie): the vector allocation/copy is unnecessary
std::vector<dgl_id_t> worker_seeds(end - start);
std::copy(seed_nodes_data + start, seed_nodes_data + end,
worker_seeds.begin());
nflows[i] = SamplerOp::NeighborSample(
runtime::parallel_for(0, num_workers, [&](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
// create per-worker seed nodes.
const int64_t start = (batch_start_id + i) * batch_size;
const int64_t end = std::min(start + batch_size, num_seeds);
// TODO(minjie): the vector allocation/copy is unnecessary
std::vector<dgl_id_t> worker_seeds(end - start);
std::copy(seed_nodes_data + start, seed_nodes_data + end,
worker_seeds.begin());
nflows[i] = SamplerOp::NeighborSample(
gptr.get(), worker_seeds, neigh_type, num_hops, expand_factor,
add_self_loop, probability);
}
}
});
return nflows;
}
......@@ -977,18 +979,19 @@ DGL_REGISTER_GLOBAL("sampling._CAPI_LayerSampling")
BuildCsr(*gptr, neigh_type);
// generate node flows
std::vector<NodeFlow> nflows(num_workers);
#pragma omp parallel for
for (int i = 0; i < num_workers; i++) {
// create per-worker seed nodes.
const int64_t start = (batch_start_id + i) * batch_size;
const int64_t end = std::min(start + batch_size, num_seeds);
// TODO(minjie): the vector allocation/copy is unnecessary
std::vector<dgl_id_t> worker_seeds(end - start);
std::copy(seed_nodes_data + start, seed_nodes_data + end,
worker_seeds.begin());
nflows[i] = SamplerOp::LayerUniformSample(
gptr.get(), worker_seeds, neigh_type, layer_sizes);
}
runtime::parallel_for(0, num_workers, [&](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
// create per-worker seed nodes.
const int64_t start = (batch_start_id + i) * batch_size;
const int64_t end = std::min(start + batch_size, num_seeds);
// TODO(minjie): the vector allocation/copy is unnecessary
std::vector<dgl_id_t> worker_seeds(end - start);
std::copy(seed_nodes_data + start, seed_nodes_data + end,
worker_seeds.begin());
nflows[i] = SamplerOp::LayerUniformSample(
gptr.get(), worker_seeds, neigh_type, layer_sizes);
}
});
*rv = List<NodeFlow>(nflows);
});
......@@ -1466,54 +1469,55 @@ public:
std::vector<SubgraphRef> positive_subgs(num_workers);
std::vector<SubgraphRef> negative_subgs(num_workers);
#pragma omp parallel for
for (int64_t i = 0; i < num_workers; i++) {
const int64_t start = (batch_curr_id_ + i) * batch_size_;
const int64_t end = std::min(start + batch_size_, num_seeds_);
const int64_t num_edges = end - start;
IdArray worker_seeds;
if (replacement_ == false) {
worker_seeds = seed_edges_.CreateView({num_edges}, DLDataType{kDLInt, 64, 1},
sizeof(dgl_id_t) * start);
} else {
std::vector<dgl_id_t> seeds;
const dgl_id_t *seed_edge_ids = static_cast<const dgl_id_t *>(seed_edges_->data);
// sampling of each edge is a standalone event
for (int64_t i = 0; i < num_edges; ++i) {
int64_t seed = static_cast<const int64_t>(
RandomEngine::ThreadLocal()->RandInt(num_seeds_));
seeds.push_back(seed_edge_ids[seed]);
}
runtime::parallel_for(0, num_workers, [&](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
const int64_t start = (batch_curr_id_ + i) * batch_size_;
const int64_t end = std::min(start + batch_size_, num_seeds_);
const int64_t num_edges = end - start;
IdArray worker_seeds;
worker_seeds = aten::VecToIdArray(seeds, seed_edges_->dtype.bits);
}
if (replacement_ == false) {
worker_seeds = seed_edges_.CreateView({num_edges}, DLDataType{kDLInt, 64, 1},
sizeof(dgl_id_t) * start);
} else {
std::vector<dgl_id_t> seeds;
const dgl_id_t *seed_edge_ids = static_cast<const dgl_id_t *>(seed_edges_->data);
// sampling of each edge is a standalone event
for (int64_t i = 0; i < num_edges; ++i) {
int64_t seed = static_cast<const int64_t>(
RandomEngine::ThreadLocal()->RandInt(num_seeds_));
seeds.push_back(seed_edge_ids[seed]);
}
EdgeArray arr = gptr_->FindEdges(worker_seeds);
const dgl_id_t *src_ids = static_cast<const dgl_id_t *>(arr.src->data);
const dgl_id_t *dst_ids = static_cast<const dgl_id_t *>(arr.dst->data);
std::vector<dgl_id_t> src_vec(src_ids, src_ids + num_edges);
std::vector<dgl_id_t> dst_vec(dst_ids, dst_ids + num_edges);
// TODO(zhengda) what if there are duplicates in the src and dst vectors.
worker_seeds = aten::VecToIdArray(seeds, seed_edges_->dtype.bits);
}
Subgraph subg = gptr_->EdgeSubgraph(worker_seeds, false);
positive_subgs[i] = ConvertRef(subg);
// For chunked negative sampling, we accept "chunk-head" for corrupting head
// nodes and "chunk-tail" for corrupting tail nodes.
if (neg_mode_.substr(0, 5) == "chunk") {
NegSubgraph neg_subg = genChunkedNegEdgeSubgraph(subg, neg_mode_.substr(6),
neg_sample_size_,
exclude_positive_,
check_false_neg_);
negative_subgs[i] = ConvertRef(neg_subg);
} else if (neg_mode_ == "head" || neg_mode_ == "tail") {
NegSubgraph neg_subg = genNegEdgeSubgraph(subg, neg_mode_,
neg_sample_size_,
exclude_positive_,
check_false_neg_);
negative_subgs[i] = ConvertRef(neg_subg);
EdgeArray arr = gptr_->FindEdges(worker_seeds);
const dgl_id_t *src_ids = static_cast<const dgl_id_t *>(arr.src->data);
const dgl_id_t *dst_ids = static_cast<const dgl_id_t *>(arr.dst->data);
std::vector<dgl_id_t> src_vec(src_ids, src_ids + num_edges);
std::vector<dgl_id_t> dst_vec(dst_ids, dst_ids + num_edges);
// TODO(zhengda) what if there are duplicates in the src and dst vectors.
Subgraph subg = gptr_->EdgeSubgraph(worker_seeds, false);
positive_subgs[i] = ConvertRef(subg);
// For chunked negative sampling, we accept "chunk-head" for corrupting head
// nodes and "chunk-tail" for corrupting tail nodes.
if (neg_mode_.substr(0, 5) == "chunk") {
NegSubgraph neg_subg = genChunkedNegEdgeSubgraph(subg, neg_mode_.substr(6),
neg_sample_size_,
exclude_positive_,
check_false_neg_);
negative_subgs[i] = ConvertRef(neg_subg);
} else if (neg_mode_ == "head" || neg_mode_ == "tail") {
NegSubgraph neg_subg = genNegEdgeSubgraph(subg, neg_mode_,
neg_sample_size_,
exclude_positive_,
check_false_neg_);
negative_subgs[i] = ConvertRef(neg_subg);
}
}
}
});
if (neg_mode_.size() > 0) {
positive_subgs.insert(positive_subgs.end(), negative_subgs.begin(), negative_subgs.end());
}
......
......@@ -9,6 +9,7 @@
#include <dgl/base_heterograph.h>
#include <dgl/array.h>
#include <dgl/runtime/parallel_for.h>
#include <tuple>
#include <utility>
#include "randomwalks_impl.h"
......@@ -47,25 +48,26 @@ std::pair<IdArray, IdArray> GenericRandomWalk(
IdxType *traces_data = traces.Ptr<IdxType>();
IdxType *eids_data = eids.Ptr<IdxType>();
#pragma omp parallel for
for (int64_t seed_id = 0; seed_id < num_seeds; ++seed_id) {
int64_t i;
dgl_id_t curr = seed_data[seed_id];
traces_data[seed_id * trace_length] = curr;
for (i = 0; i < max_num_steps; ++i) {
const auto &succ = step(traces_data + seed_id * max_num_steps, curr, i);
traces_data[seed_id * trace_length + i + 1] = curr = std::get<0>(succ);
eids_data[seed_id * max_num_steps + i] = std::get<1>(succ);
if (std::get<2>(succ))
break;
runtime::parallel_for(0, num_seeds, [&](size_t seed_begin, size_t seed_end) {
for (auto seed_id = seed_begin; seed_id < seed_end; seed_id++) {
int64_t i;
dgl_id_t curr = seed_data[seed_id];
traces_data[seed_id * trace_length] = curr;
for (i = 0; i < max_num_steps; ++i) {
const auto &succ = step(traces_data + seed_id * max_num_steps, curr, i);
traces_data[seed_id * trace_length + i + 1] = curr = std::get<0>(succ);
eids_data[seed_id * max_num_steps + i] = std::get<1>(succ);
if (std::get<2>(succ))
break;
}
for (; i < max_num_steps; ++i) {
traces_data[seed_id * trace_length + i + 1] = -1;
eids_data[seed_id * max_num_steps + i] = -1;
}
}
for (; i < max_num_steps; ++i) {
traces_data[seed_id * trace_length + i + 1] = -1;
eids_data[seed_id * max_num_steps + i] = -1;
}
}
});
return std::make_pair(traces, eids);
}
......