Unverified commit ca7a01cd, authored by Guolin Ke, committed by GitHub
Browse files

speed up multi-threading sum (#3485)

* speed up multi-threading sum

* Apply suggestions from code review
parent ba0a1f8d
...@@ -71,7 +71,7 @@ class Threading { ...@@ -71,7 +71,7 @@ class Threading {
const std::function<void(int, INDEX_T, INDEX_T)>& inner_fun) { const std::function<void(int, INDEX_T, INDEX_T)>& inner_fun) {
int n_block = 1; int n_block = 1;
INDEX_T num_inner = end - start; INDEX_T num_inner = end - start;
BlockInfo<INDEX_T>(end - start, min_block_size, &n_block, &num_inner); BlockInfo<INDEX_T>(num_inner, min_block_size, &n_block, &num_inner);
OMP_INIT_EX(); OMP_INIT_EX();
#pragma omp parallel for schedule(static, 1) #pragma omp parallel for schedule(static, 1)
for (int i = 0; i < n_block; ++i) { for (int i = 0; i < n_block; ++i) {
...@@ -93,20 +93,16 @@ class Threading { ...@@ -93,20 +93,16 @@ class Threading {
VAL1_T* res1, VAL2_T* res2) { VAL1_T* res1, VAL2_T* res2) {
int n_block = 1; int n_block = 1;
INDEX_T num_inner = end - start; INDEX_T num_inner = end - start;
BlockInfoForceSize<INDEX_T>(end - start, min_block_size, &n_block, BlockInfoForceSize<INDEX_T>(num_inner, min_block_size, &n_block,
&num_inner); &num_inner);
std::vector<VAL1_T> val_1s(n_block, static_cast<VAL1_T>(0)); std::vector<VAL1_T> val_1s(n_block, static_cast<VAL1_T>(0));
std::vector<VAL2_T> val_2s(n_block, static_cast<VAL2_T>(0)); std::vector<VAL2_T> val_2s(n_block, static_cast<VAL2_T>(0));
OMP_INIT_EX(); #pragma omp parallel for schedule(static)
#pragma omp parallel for schedule(static, 1)
for (int i = 0; i < n_block; ++i) { for (int i = 0; i < n_block; ++i) {
OMP_LOOP_EX_BEGIN();
INDEX_T inner_start = start + num_inner * i; INDEX_T inner_start = start + num_inner * i;
INDEX_T inner_end = std::min(end, inner_start + num_inner); INDEX_T inner_end = std::min(end, inner_start + num_inner);
inner_fun(i, inner_start, inner_end, &val_1s[i], &val_2s[i]); inner_fun(i, inner_start, inner_end, &val_1s[i], &val_2s[i]);
OMP_LOOP_EX_END();
} }
OMP_THROW_EX();
*res1 = 0; *res1 = 0;
*res2 = 0; *res2 = 0;
for (int i = 0; i < n_block; ++i) { for (int i = 0; i < n_block; ++i) {
......
...@@ -60,47 +60,65 @@ class LeafSplits { ...@@ -60,47 +60,65 @@ class LeafSplits {
} }
/*! /*!
* \brief Init splits on current leaf, it will traverse all data to sum up the results * \brief Init splits on the current leaf, it will traverse all data to sum up the results
* \param gradients * \param gradients
* \param hessians * \param hessians
*/ */
void Init(const score_t* gradients, const score_t* hessians) { void Init(const score_t* gradients, const score_t* hessians) {
num_data_in_leaf_ = num_data_; num_data_in_leaf_ = num_data_;
leaf_index_ = 0; leaf_index_ = 0;
data_indices_ = nullptr; data_indices_ = nullptr;
Threading::SumReduction<data_size_t, double, double>( if (num_data_in_leaf_ < 4096) {
0, num_data_in_leaf_, 2048, sum_gradients_ = 0.0f;
[=](int, data_size_t start, data_size_t end, double* s1, double* s2) { sum_hessians_ = 0.0f;
*s1 = *s2 = 0; for (data_size_t i = 0; i < num_data_in_leaf_; ++i) {
for (data_size_t i = start; i < end; ++i) { sum_gradients_ += gradients[i];
*s1 += gradients[i]; sum_hessians_ += hessians[i];
*s2 += hessians[i]; }
} } else {
}, Threading::SumReduction<data_size_t, double, double>(
&sum_gradients_, &sum_hessians_); 0, num_data_in_leaf_, 2048,
[=](int, data_size_t start, data_size_t end, double* s1, double* s2) {
for (data_size_t i = start; i < end; ++i) {
*s1 += gradients[i];
*s2 += hessians[i];
}
},
&sum_gradients_, &sum_hessians_);
}
} }
/*! /*!
* \brief Init splits on current leaf of partial data. * \brief Init splits on current leaf of partial data.
* \param leaf Index of current leaf * \param leaf Index of current leaf
* \param data_partition current data partition * \param data_partition current data partition
* \param gradients * \param gradients
* \param hessians * \param hessians
*/ */
void Init(int leaf, const DataPartition* data_partition, const score_t* gradients, const score_t* hessians) { void Init(int leaf, const DataPartition* data_partition,
const score_t* gradients, const score_t* hessians) {
leaf_index_ = leaf; leaf_index_ = leaf;
data_indices_ = data_partition->GetIndexOnLeaf(leaf, &num_data_in_leaf_); data_indices_ = data_partition->GetIndexOnLeaf(leaf, &num_data_in_leaf_);
Threading::SumReduction<data_size_t, double, double>( if (num_data_in_leaf_ < 4096) {
0, num_data_in_leaf_, 2048, sum_gradients_ = 0.0f;
[=](int, data_size_t start, data_size_t end, double* s1, double* s2) { sum_hessians_ = 0.0f;
*s1 = *s2 = 0; for (data_size_t i = 0; i < num_data_in_leaf_; ++i) {
for (data_size_t i = start; i < end; ++i) { const data_size_t idx = data_indices_[i];
data_size_t idx = data_indices_[i]; sum_gradients_ += gradients[idx];
*s1 += gradients[idx]; sum_hessians_ += hessians[idx];
*s2 += hessians[idx]; }
} } else {
}, Threading::SumReduction<data_size_t, double, double>(
&sum_gradients_, &sum_hessians_); 0, num_data_in_leaf_, 2048,
[=](int, data_size_t start, data_size_t end, double* s1, double* s2) {
for (data_size_t i = start; i < end; ++i) {
const data_size_t idx = data_indices_[i];
*s1 += gradients[idx];
*s2 += hessians[idx];
}
},
&sum_gradients_, &sum_hessians_);
}
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment