Unverified Commit 692c9a5b authored by Guolin Ke's avatar Guolin Ke Committed by GitHub
Browse files

stable multi-threading sum reduction (#3385)

* Update serial_tree_learner.cpp

* Update src/treelearner/serial_tree_learner.cpp

* stable multi-threading reduction

* Update src/treelearner/serial_tree_learner.cpp

* more fixes

* Apply suggestions from code review

* Apply suggestions from code review

* Update src/boosting/gbdt.cpp
parent f8f6c513
...@@ -25,6 +25,7 @@ class Threading { ...@@ -25,6 +25,7 @@ class Threading {
BlockInfo<INDEX_T>(num_threads, cnt, min_cnt_per_block, out_nblock, BlockInfo<INDEX_T>(num_threads, cnt, min_cnt_per_block, out_nblock,
block_size); block_size);
} }
template <typename INDEX_T> template <typename INDEX_T>
static inline void BlockInfo(int num_threads, INDEX_T cnt, static inline void BlockInfo(int num_threads, INDEX_T cnt,
INDEX_T min_cnt_per_block, int* out_nblock, INDEX_T min_cnt_per_block, int* out_nblock,
...@@ -38,6 +39,7 @@ class Threading { ...@@ -38,6 +39,7 @@ class Threading {
*block_size = cnt; *block_size = cnt;
} }
} }
template <typename INDEX_T> template <typename INDEX_T>
static inline void BlockInfoForceSize(int num_threads, INDEX_T cnt, static inline void BlockInfoForceSize(int num_threads, INDEX_T cnt,
INDEX_T min_cnt_per_block, INDEX_T min_cnt_per_block,
...@@ -55,6 +57,14 @@ class Threading { ...@@ -55,6 +57,14 @@ class Threading {
} }
} }
template <typename INDEX_T>
static inline void BlockInfoForceSize(INDEX_T cnt, INDEX_T min_cnt_per_block,
                                      int* out_nblock, INDEX_T* block_size) {
  // Convenience overload: derive the thread count from the OpenMP runtime
  // and forward to the explicit-thread-count version.
  const int nthreads = OMP_NUM_THREADS();
  BlockInfoForceSize<INDEX_T>(nthreads, cnt, min_cnt_per_block, out_nblock,
                              block_size);
}
template <typename INDEX_T> template <typename INDEX_T>
static inline int For( static inline int For(
INDEX_T start, INDEX_T end, INDEX_T min_block_size, INDEX_T start, INDEX_T end, INDEX_T min_block_size,
...@@ -74,6 +84,37 @@ class Threading { ...@@ -74,6 +84,37 @@ class Threading {
OMP_THROW_EX(); OMP_THROW_EX();
return n_block; return n_block;
} }
template <typename INDEX_T, typename VAL1_T, typename VAL2_T>
static inline int SumReduction(
    INDEX_T start, INDEX_T end, INDEX_T min_block_size,
    const std::function<void(int, INDEX_T, INDEX_T, VAL1_T* res1,
                             VAL2_T* res2)>& inner_fun,
    VAL1_T* res1, VAL2_T* res2) {
  // Split [start, end) into equally-sized blocks, compute a partial pair of
  // sums per block in parallel, then combine the partials sequentially so the
  // reduction order (and thus the floating-point result) is deterministic,
  // independent of thread scheduling.
  int nblock = 1;
  INDEX_T block_len = end - start;
  BlockInfoForceSize<INDEX_T>(end - start, min_block_size, &nblock,
                              &block_len);
  // One partial accumulator pair per block, zero-initialized.
  std::vector<VAL1_T> partial1(nblock, static_cast<VAL1_T>(0));
  std::vector<VAL2_T> partial2(nblock, static_cast<VAL2_T>(0));
  OMP_INIT_EX();
#pragma omp parallel for schedule(static, 1)
  for (int i = 0; i < nblock; ++i) {
    OMP_LOOP_EX_BEGIN();
    const INDEX_T lo = start + block_len * i;
    // The final block may be short; clamp to the overall end.
    const INDEX_T hi = std::min(end, lo + block_len);
    inner_fun(i, lo, hi, &partial1[i], &partial2[i]);
    OMP_LOOP_EX_END();
  }
  OMP_THROW_EX();
  // Deterministic serial combine of the per-block partial sums.
  VAL1_T total1 = 0;
  VAL2_T total2 = 0;
  for (int i = 0; i < nblock; ++i) {
    total1 += partial1[i];
    total2 += partial2[i];
  }
  *res1 = total1;
  *res2 = total2;
  return nblock;
}
}; };
template <typename INDEX_T, bool TWO_BUFFER> template <typename INDEX_T, bool TWO_BUFFER>
......
...@@ -6,6 +6,7 @@ ...@@ -6,6 +6,7 @@
#define LIGHTGBM_TREELEARNER_LEAF_SPLITS_HPP_ #define LIGHTGBM_TREELEARNER_LEAF_SPLITS_HPP_
#include <LightGBM/meta.h> #include <LightGBM/meta.h>
#include <LightGBM/utils/threading.h>
#include <limits> #include <limits>
#include <vector> #include <vector>
...@@ -67,15 +68,16 @@ class LeafSplits { ...@@ -67,15 +68,16 @@ class LeafSplits {
num_data_in_leaf_ = num_data_; num_data_in_leaf_ = num_data_;
leaf_index_ = 0; leaf_index_ = 0;
data_indices_ = nullptr; data_indices_ = nullptr;
double tmp_sum_gradients = 0.0f; Threading::SumReduction<data_size_t, double, double>(
double tmp_sum_hessians = 0.0f; 0, num_data_in_leaf_, 2048,
#pragma omp parallel for schedule(static) reduction(+:tmp_sum_gradients, tmp_sum_hessians) [=](int, data_size_t start, data_size_t end, double* s1, double* s2) {
for (data_size_t i = 0; i < num_data_in_leaf_; ++i) { *s1 = *s2 = 0;
tmp_sum_gradients += gradients[i]; for (data_size_t i = start; i < end; ++i) {
tmp_sum_hessians += hessians[i]; *s1 += gradients[i];
} *s2 += hessians[i];
sum_gradients_ = tmp_sum_gradients; }
sum_hessians_ = tmp_sum_hessians; },
&sum_gradients_, &sum_hessians_);
} }
/*! /*!
...@@ -88,16 +90,17 @@ class LeafSplits { ...@@ -88,16 +90,17 @@ class LeafSplits {
void Init(int leaf, const DataPartition* data_partition, const score_t* gradients, const score_t* hessians) { void Init(int leaf, const DataPartition* data_partition, const score_t* gradients, const score_t* hessians) {
leaf_index_ = leaf; leaf_index_ = leaf;
data_indices_ = data_partition->GetIndexOnLeaf(leaf, &num_data_in_leaf_); data_indices_ = data_partition->GetIndexOnLeaf(leaf, &num_data_in_leaf_);
double tmp_sum_gradients = 0.0f; Threading::SumReduction<data_size_t, double, double>(
double tmp_sum_hessians = 0.0f; 0, num_data_in_leaf_, 2048,
#pragma omp parallel for schedule(static) reduction(+:tmp_sum_gradients, tmp_sum_hessians) [=](int, data_size_t start, data_size_t end, double* s1, double* s2) {
for (data_size_t i = 0; i < num_data_in_leaf_; ++i) { *s1 = *s2 = 0;
data_size_t idx = data_indices_[i]; for (data_size_t i = start; i < end; ++i) {
tmp_sum_gradients += gradients[idx]; data_size_t idx = data_indices_[i];
tmp_sum_hessians += hessians[idx]; *s1 += gradients[idx];
} *s2 += hessians[idx];
sum_gradients_ = tmp_sum_gradients; }
sum_hessians_ = tmp_sum_hessians; },
&sum_gradients_, &sum_hessians_);
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment