Unverified Commit 692c9a5b authored by Guolin Ke's avatar Guolin Ke Committed by GitHub
Browse files

stable multi-threading sum reduction (#3385)

* Update serial_tree_learner.cpp

* Update src/treelearner/serial_tree_learner.cpp

* stable multi-threading reduction

* Update src/treelearner/serial_tree_learner.cpp

* more fixes

* Apply suggestions from code review

* Apply suggestions from code review

* Update src/boosting/gbdt.cpp
parent f8f6c513
...@@ -25,6 +25,7 @@ class Threading { ...@@ -25,6 +25,7 @@ class Threading {
BlockInfo<INDEX_T>(num_threads, cnt, min_cnt_per_block, out_nblock, BlockInfo<INDEX_T>(num_threads, cnt, min_cnt_per_block, out_nblock,
block_size); block_size);
} }
template <typename INDEX_T> template <typename INDEX_T>
static inline void BlockInfo(int num_threads, INDEX_T cnt, static inline void BlockInfo(int num_threads, INDEX_T cnt,
INDEX_T min_cnt_per_block, int* out_nblock, INDEX_T min_cnt_per_block, int* out_nblock,
...@@ -38,6 +39,7 @@ class Threading { ...@@ -38,6 +39,7 @@ class Threading {
*block_size = cnt; *block_size = cnt;
} }
} }
template <typename INDEX_T> template <typename INDEX_T>
static inline void BlockInfoForceSize(int num_threads, INDEX_T cnt, static inline void BlockInfoForceSize(int num_threads, INDEX_T cnt,
INDEX_T min_cnt_per_block, INDEX_T min_cnt_per_block,
...@@ -55,6 +57,14 @@ class Threading { ...@@ -55,6 +57,14 @@ class Threading {
} }
} }
template <typename INDEX_T>
static inline void BlockInfoForceSize(INDEX_T cnt, INDEX_T min_cnt_per_block,
                                      int* out_nblock, INDEX_T* block_size) {
  // Convenience overload: derive the thread count from the OpenMP runtime
  // and forward to the explicit-thread-count version.
  const int nthreads = OMP_NUM_THREADS();
  BlockInfoForceSize<INDEX_T>(nthreads, cnt, min_cnt_per_block, out_nblock,
                              block_size);
}
template <typename INDEX_T> template <typename INDEX_T>
static inline int For( static inline int For(
INDEX_T start, INDEX_T end, INDEX_T min_block_size, INDEX_T start, INDEX_T end, INDEX_T min_block_size,
...@@ -74,6 +84,37 @@ class Threading { ...@@ -74,6 +84,37 @@ class Threading {
OMP_THROW_EX(); OMP_THROW_EX();
return n_block; return n_block;
} }
template <typename INDEX_T, typename VAL1_T, typename VAL2_T>
static inline int SumReduction(
    INDEX_T start, INDEX_T end, INDEX_T min_block_size,
    const std::function<void(int, INDEX_T, INDEX_T, VAL1_T* res1,
                             VAL2_T* res2)>& inner_fun,
    VAL1_T* res1, VAL2_T* res2) {
  // Split [start, end) into equally-sized blocks, compute a partial pair of
  // sums per block in parallel, then combine the partials sequentially so the
  // reduction order (and thus the floating-point result) is deterministic,
  // independent of thread scheduling.
  int nblock = 1;
  INDEX_T block_len = end - start;
  BlockInfoForceSize<INDEX_T>(end - start, min_block_size, &nblock,
                              &block_len);
  // One partial accumulator pair per block, zero-initialized.
  std::vector<VAL1_T> partial1(nblock, static_cast<VAL1_T>(0));
  std::vector<VAL2_T> partial2(nblock, static_cast<VAL2_T>(0));
  OMP_INIT_EX();
#pragma omp parallel for schedule(static, 1)
  for (int i = 0; i < nblock; ++i) {
    OMP_LOOP_EX_BEGIN();
    const INDEX_T lo = start + block_len * i;
    // The final block may be short; clamp to the overall end.
    const INDEX_T hi = std::min(end, lo + block_len);
    inner_fun(i, lo, hi, &partial1[i], &partial2[i]);
    OMP_LOOP_EX_END();
  }
  OMP_THROW_EX();
  // Deterministic serial combine of the per-block partial sums.
  VAL1_T total1 = 0;
  VAL2_T total2 = 0;
  for (int i = 0; i < nblock; ++i) {
    total1 += partial1[i];
    total2 += partial2[i];
  }
  *res1 = total1;
  *res2 = total2;
  return nblock;
}
}; };
template <typename INDEX_T, bool TWO_BUFFER> template <typename INDEX_T, bool TWO_BUFFER>
......
...@@ -6,6 +6,7 @@ ...@@ -6,6 +6,7 @@
#define LIGHTGBM_TREELEARNER_LEAF_SPLITS_HPP_ #define LIGHTGBM_TREELEARNER_LEAF_SPLITS_HPP_
#include <LightGBM/meta.h> #include <LightGBM/meta.h>
#include <LightGBM/utils/threading.h>
#include <limits> #include <limits>
#include <vector> #include <vector>
...@@ -67,15 +68,16 @@ class LeafSplits { ...@@ -67,15 +68,16 @@ class LeafSplits {
num_data_in_leaf_ = num_data_; num_data_in_leaf_ = num_data_;
leaf_index_ = 0; leaf_index_ = 0;
data_indices_ = nullptr; data_indices_ = nullptr;
double tmp_sum_gradients = 0.0f; Threading::SumReduction<data_size_t, double, double>(
double tmp_sum_hessians = 0.0f; 0, num_data_in_leaf_, 2048,
#pragma omp parallel for schedule(static) reduction(+:tmp_sum_gradients, tmp_sum_hessians) [=](int, data_size_t start, data_size_t end, double* s1, double* s2) {
for (data_size_t i = 0; i < num_data_in_leaf_; ++i) { *s1 = *s2 = 0;
tmp_sum_gradients += gradients[i]; for (data_size_t i = start; i < end; ++i) {
tmp_sum_hessians += hessians[i]; *s1 += gradients[i];
} *s2 += hessians[i];
sum_gradients_ = tmp_sum_gradients; }
sum_hessians_ = tmp_sum_hessians; },
&sum_gradients_, &sum_hessians_);
} }
/*! /*!
...@@ -88,16 +90,17 @@ class LeafSplits { ...@@ -88,16 +90,17 @@ class LeafSplits {
void Init(int leaf, const DataPartition* data_partition, const score_t* gradients, const score_t* hessians) { void Init(int leaf, const DataPartition* data_partition, const score_t* gradients, const score_t* hessians) {
leaf_index_ = leaf; leaf_index_ = leaf;
data_indices_ = data_partition->GetIndexOnLeaf(leaf, &num_data_in_leaf_); data_indices_ = data_partition->GetIndexOnLeaf(leaf, &num_data_in_leaf_);
double tmp_sum_gradients = 0.0f; Threading::SumReduction<data_size_t, double, double>(
double tmp_sum_hessians = 0.0f; 0, num_data_in_leaf_, 2048,
#pragma omp parallel for schedule(static) reduction(+:tmp_sum_gradients, tmp_sum_hessians) [=](int, data_size_t start, data_size_t end, double* s1, double* s2) {
for (data_size_t i = 0; i < num_data_in_leaf_; ++i) { *s1 = *s2 = 0;
data_size_t idx = data_indices_[i]; for (data_size_t i = start; i < end; ++i) {
tmp_sum_gradients += gradients[idx]; data_size_t idx = data_indices_[i];
tmp_sum_hessians += hessians[idx]; *s1 += gradients[idx];
} *s2 += hessians[idx];
sum_gradients_ = tmp_sum_gradients; }
sum_hessians_ = tmp_sum_hessians; },
&sum_gradients_, &sum_hessians_);
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment