Unverified Commit ca7a01cd authored by Guolin Ke's avatar Guolin Ke Committed by GitHub
Browse files

speed up multi-threading sum (#3485)

* speed up multi-threading sum

* Apply suggestions from code review
parent ba0a1f8d
......@@ -71,7 +71,7 @@ class Threading {
const std::function<void(int, INDEX_T, INDEX_T)>& inner_fun) {
int n_block = 1;
INDEX_T num_inner = end - start;
BlockInfo<INDEX_T>(end - start, min_block_size, &n_block, &num_inner);
BlockInfo<INDEX_T>(num_inner, min_block_size, &n_block, &num_inner);
OMP_INIT_EX();
#pragma omp parallel for schedule(static, 1)
for (int i = 0; i < n_block; ++i) {
......@@ -93,20 +93,16 @@ class Threading {
VAL1_T* res1, VAL2_T* res2) {
int n_block = 1;
INDEX_T num_inner = end - start;
BlockInfoForceSize<INDEX_T>(end - start, min_block_size, &n_block,
BlockInfoForceSize<INDEX_T>(num_inner, min_block_size, &n_block,
&num_inner);
std::vector<VAL1_T> val_1s(n_block, static_cast<VAL1_T>(0));
std::vector<VAL2_T> val_2s(n_block, static_cast<VAL2_T>(0));
OMP_INIT_EX();
#pragma omp parallel for schedule(static, 1)
#pragma omp parallel for schedule(static)
for (int i = 0; i < n_block; ++i) {
OMP_LOOP_EX_BEGIN();
INDEX_T inner_start = start + num_inner * i;
INDEX_T inner_end = std::min(end, inner_start + num_inner);
inner_fun(i, inner_start, inner_end, &val_1s[i], &val_2s[i]);
OMP_LOOP_EX_END();
}
OMP_THROW_EX();
*res1 = 0;
*res2 = 0;
for (int i = 0; i < n_block; ++i) {
......
......@@ -60,7 +60,7 @@ class LeafSplits {
}
/*!
 * \brief Init splits on the current leaf, it will traverse all data to sum up the results
 * \param gradients
 * \param hessians
 */
......@@ -68,10 +68,17 @@ class LeafSplits {
num_data_in_leaf_ = num_data_;
leaf_index_ = 0;
data_indices_ = nullptr;
if (num_data_in_leaf_ < 4096) {
sum_gradients_ = 0.0f;
sum_hessians_ = 0.0f;
for (data_size_t i = 0; i < num_data_in_leaf_; ++i) {
sum_gradients_ += gradients[i];
sum_hessians_ += hessians[i];
}
} else {
Threading::SumReduction<data_size_t, double, double>(
0, num_data_in_leaf_, 2048,
[=](int, data_size_t start, data_size_t end, double* s1, double* s2) {
*s1 = *s2 = 0;
for (data_size_t i = start; i < end; ++i) {
*s1 += gradients[i];
*s2 += hessians[i];
......@@ -79,6 +86,7 @@ class LeafSplits {
},
&sum_gradients_, &sum_hessians_);
}
}
/*!
* \brief Init splits on current leaf of partial data.
......@@ -87,21 +95,31 @@ class LeafSplits {
* \param gradients
* \param hessians
*/
void Init(int leaf, const DataPartition* data_partition, const score_t* gradients, const score_t* hessians) {
void Init(int leaf, const DataPartition* data_partition,
const score_t* gradients, const score_t* hessians) {
leaf_index_ = leaf;
data_indices_ = data_partition->GetIndexOnLeaf(leaf, &num_data_in_leaf_);
if (num_data_in_leaf_ < 4096) {
sum_gradients_ = 0.0f;
sum_hessians_ = 0.0f;
for (data_size_t i = 0; i < num_data_in_leaf_; ++i) {
const data_size_t idx = data_indices_[i];
sum_gradients_ += gradients[idx];
sum_hessians_ += hessians[idx];
}
} else {
Threading::SumReduction<data_size_t, double, double>(
0, num_data_in_leaf_, 2048,
[=](int, data_size_t start, data_size_t end, double* s1, double* s2) {
*s1 = *s2 = 0;
for (data_size_t i = start; i < end; ++i) {
data_size_t idx = data_indices_[i];
const data_size_t idx = data_indices_[i];
*s1 += gradients[idx];
*s2 += hessians[idx];
}
},
&sum_gradients_, &sum_hessians_);
}
}
/*!
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment