Squashed commit of the following:

commit 70f88c54f6820d4c3824d97b34042c617fb21635 Author: Guolin Ke <i@yumumu.me> Date: Wed Feb 22 19:40:10 2017 +0800 further reduce guided commit f54620b807c68f5ac6cef521d787ca2750efe464 Author: Guolin Ke <i@yumumu.me> Date: Wed Feb 22 19:32:36 2017 +0800 avoid using guided

Squashed commit of the following:
commit 70f88c54f6820d4c3824d97b34042c617fb21635 Author: Guolin Ke <i@yumumu.me> Date: Wed Feb 22 19:40:10 2017 +0800 further reduce guided commit f54620b807c68f5ac6cef521d787ca2750efe464 Author: Guolin Ke <i@yumumu.me> Date: Wed Feb 22 19:32:36 2017 +0800 avoid using guided
9ea487b1 · Guolin Ke · a8bb3951 · 9ea487b1 · 9ea487b1 · 9ea487b1
Commit 9ea487b1 authored Feb 22, 2017 by Guolin Ke
9 changed files
--- a/src/c_api.cpp
+++ b/src/c_api.cpp
@@ -349,7 +349,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromMat(const void* data,
      reinterpret_cast<const Dataset*>(reference));
  }
-#pragma omp parallel for schedule(guided)
+#pragma omp parallel for schedule(static)
  for (int i = 0; i < nrow; ++i) {
    const int tid = omp_get_thread_num();
    auto one_row = get_row_fun(i);
@@ -409,7 +409,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromCSR(const void* indptr,
      reinterpret_cast<const Dataset*>(reference));
  }
-#pragma omp parallel for schedule(guided)
+#pragma omp parallel for schedule(static)
  for (int i = 0; i < nindptr - 1; ++i) {
    const int tid = omp_get_thread_num();
    auto one_row = get_row_fun(i);
@@ -444,7 +444,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromCSC(const void* col_ptr,
    auto sample_indices = rand.Sample(nrow, sample_cnt);
    std::vector<std::vector<double>> sample_values(ncol_ptr - 1);
    std::vector<std::vector<int>> sample_idx(ncol_ptr - 1);
-#pragma omp parallel for schedule(guided)
+#pragma omp parallel for schedule(static)
    for (int i = 0; i < static_cast<int>(sample_values.size()); ++i) {
      CSC_RowIterator col_it(col_ptr, col_ptr_type, indices, data, data_type, ncol_ptr, nelem, i);
      for (int j = 0; j < sample_cnt; j++) {
@@ -463,7 +463,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromCSC(const void* col_ptr,
      reinterpret_cast<const Dataset*>(reference));
  }
-#pragma omp parallel for schedule(guided)
+#pragma omp parallel for schedule(static)
  for (int i = 0; i < ncol_ptr - 1; ++i) {
    const int tid = omp_get_thread_num();
    int feature_idx = ret->InnerFeatureIndex(i);
@@ -843,7 +843,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForCSR(BoosterHandle handle,
  auto get_row_fun = RowFunctionFromCSR(indptr, indptr_type, indices, data, data_type, nindptr, nelem);
  int64_t num_preb_in_one_row = GetNumPredOneRow(ref_booster, predict_type, num_iteration);
  int nrow = static_cast<int>(nindptr - 1);
-#pragma omp parallel for schedule(guided)
+#pragma omp parallel for schedule(static)
  for (int i = 0; i < nrow; ++i) {
    auto one_row = get_row_fun(i);
    auto predicton_result = predictor.GetPredictFunction()(one_row);
@@ -915,7 +915,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForMat(BoosterHandle handle,
  auto predictor = ref_booster->NewPredictor(static_cast<int>(num_iteration), predict_type);
  auto get_row_fun = RowPairFunctionFromDenseMatric(data, nrow, ncol, data_type, is_row_major);
  int64_t num_preb_in_one_row = GetNumPredOneRow(ref_booster, predict_type, num_iteration);
-#pragma omp parallel for schedule(guided)
+#pragma omp parallel for schedule(static)
  for (int i = 0; i < nrow; ++i) {
    auto one_row = get_row_fun(i);
    auto predicton_result = predictor.GetPredictFunction()(one_row);

--- a/src/io/dataset.cpp
+++ b/src/io/dataset.cpp
@@ -16,8 +16,6 @@ namespace LightGBM {
 const char* Dataset::binary_file_token = "______LightGBM_Binary_File_Token______\n";
 Dataset::Dataset() {
  data_filename_ = "noname";
  num_data_ = 0;
@@ -206,7 +204,7 @@ void Dataset::CreateValid(const Dataset* dataset) {
 void Dataset::ReSize(data_size_t num_data) {
  if (num_data_ != num_data) {
    num_data_ = num_data;
-#pragma omp parallel for schedule(guided)
+#pragma omp parallel for schedule(static)
    for (int group = 0; group < num_groups_; ++group) {
      feature_groups_[group]->bin_data_->ReSize(num_data_);
    }
@@ -215,7 +213,7 @@ void Dataset::ReSize(data_size_t num_data) {
 void Dataset::CopySubset(const Dataset* fullset, const data_size_t* used_indices, data_size_t num_used_indices, bool need_meta_data) {
  CHECK(num_used_indices == num_data_);
-#pragma omp parallel for schedule(guided)
+#pragma omp parallel for schedule(static)
  for (int group = 0; group < num_groups_; ++group) {
    feature_groups_[group]->CopySubset(fullset->feature_groups_[group].get(), used_indices, num_used_indices);
  }
@@ -407,7 +405,8 @@ void Dataset::ConstructHistograms(
    ptr_ordered_grad = ordered_gradients;
    ptr_ordered_hess = ordered_hessians;
  }
-#pragma omp parallel for schedule(guided)
+#pragma omp parallel for schedule(static)
  for (int group = 0; group < num_groups_; ++group) {
    bool is_groud_used = false;
    const int f_cnt = group_feature_cnt_[group];

--- a/src/io/dataset_loader.cpp
+++ b/src/io/dataset_loader.cpp
@@ -748,7 +748,7 @@ void DatasetLoader::ExtractFeaturesFromMemory(std::vector<std::string>& text_dat
  double tmp_label = 0.0f;
  if (predict_fun_ == nullptr) {
    // if doesn't need to prediction with initial model
-#pragma omp parallel for schedule(guided) private(oneline_features) firstprivate(tmp_label)
+#pragma omp parallel for schedule(static) private(oneline_features) firstprivate(tmp_label)
    for (data_size_t i = 0; i < dataset->num_data_; ++i) {
      const int tid = omp_get_thread_num();
      oneline_features.clear();
@@ -781,7 +781,7 @@ void DatasetLoader::ExtractFeaturesFromMemory(std::vector<std::string>& text_dat
  } else {
    // if need to prediction with initial model
    std::vector<double> init_score(dataset->num_data_ * num_class_);
-#pragma omp parallel for schedule(guided) private(oneline_features) firstprivate(tmp_label)
+#pragma omp parallel for schedule(static) private(oneline_features) firstprivate(tmp_label)
    for (data_size_t i = 0; i < dataset->num_data_; ++i) {
      const int tid = omp_get_thread_num();
      oneline_features.clear();

--- a/src/metric/rank_metric.hpp
+++ b/src/metric/rank_metric.hpp
@@ -90,7 +90,7 @@ public:
    }
    std::vector<double> tmp_dcg(eval_at_.size(), 0.0f);
    if (query_weights_ == nullptr) {
-#pragma omp parallel for schedule(guided) firstprivate(tmp_dcg)
+#pragma omp parallel for schedule(static) firstprivate(tmp_dcg)
      for (data_size_t i = 0; i < num_queries_; ++i) {
        const int tid = omp_get_thread_num();
        // if all doc in this query are all negative, let its NDCG=1
@@ -110,7 +110,7 @@ public:
        }
      }
    } else {
-#pragma omp parallel for schedule(guided) firstprivate(tmp_dcg)
+#pragma omp parallel for schedule(static) firstprivate(tmp_dcg)
      for (data_size_t i = 0; i < num_queries_; ++i) {
        const int tid = omp_get_thread_num();
        // if all doc in this query are all negative, let its NDCG=1

--- a/src/objective/rank_objective.hpp
+++ b/src/objective/rank_objective.hpp
@@ -52,7 +52,7 @@ public:
    num_queries_ = metadata.num_queries();
    // cache inverse max DCG, avoid computation many times
    inverse_max_dcgs_.resize(num_queries_);
-#pragma omp parallel for schedule(guided)
+#pragma omp parallel for schedule(static)
    for (data_size_t i = 0; i < num_queries_; ++i) {
      inverse_max_dcgs_[i] = DCGCalculator::CalMaxDCGAtK(optimize_pos_at_,
        label_ + query_boundaries_[i],

--- a/src/treelearner/data_parallel_tree_learner.cpp
+++ b/src/treelearner/data_parallel_tree_learner.cpp
@@ -132,7 +132,7 @@ void DataParallelTreeLearner::FindBestThresholds() {
    ordered_gradients_.data(), ordered_hessians_.data(),
    smaller_leaf_histogram_array_[0].RawData() - 1);
  // construct local histograms
-#pragma omp parallel for schedule(guided)
+#pragma omp parallel for schedule(static)
  for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
    if ((!is_feature_used_.empty() && is_feature_used_[feature_index] == false)) continue;
    // copy to buffer
@@ -146,7 +146,7 @@ void DataParallelTreeLearner::FindBestThresholds() {
  std::vector<SplitInfo> smaller_best(num_threads_, SplitInfo());
  std::vector<SplitInfo> larger_best(num_threads_, SplitInfo());
-#pragma omp parallel for schedule(guided)
+#pragma omp parallel for schedule(static)
  for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
    if (!is_feature_aggregated_[feature_index]) continue;
    const int tid = omp_get_thread_num();

--- a/src/treelearner/serial_tree_learner.cpp
+++ b/src/treelearner/serial_tree_learner.cpp
@@ -67,6 +67,12 @@ void SerialTreeLearner::Init(const Dataset* train_data) {
  if (has_ordered_bin_) {
    is_data_in_leaf_.resize(num_data_);
    std::fill(is_data_in_leaf_.begin(), is_data_in_leaf_.end(), 0);
+    order_bin_indices_.clear();
+    for (int i = 0; i < static_cast<int>(ordered_bins_.size()); i++) {
+      if (ordered_bins_[i] != nullptr) {
+        order_bin_indices_.push_back(i);
+      }
+    }
  }
  Log::Info("Number of data: %d, number of used features: %d", num_data_, num_features_);
 }
@@ -103,6 +109,12 @@ void SerialTreeLearner::ResetTrainingData(const Dataset* train_data) {
  if (has_ordered_bin_) {
    is_data_in_leaf_.resize(num_data_);
    std::fill(is_data_in_leaf_.begin(), is_data_in_leaf_.end(), 0);
+    order_bin_indices_.clear();
+    for (int i = 0; i < static_cast<int>(ordered_bins_.size()); i++) {
+      if (ordered_bins_[i] != nullptr) {
+        order_bin_indices_.push_back(i);
+      }
+    }
  }
 }
@@ -220,12 +232,9 @@ void SerialTreeLearner::BeforeTrain() {
  if (has_ordered_bin_) {
    if (data_partition_->leaf_count(0) == num_data_) {
      // use all data, pass nullptr
-      #pragma omp parallel for schedule(guided)
+      #pragma omp parallel for schedule(static)
-      for (int i = 0; i < static_cast<int>(ordered_bins_.size()); ++i) {
+      for (int i = 0; i < static_cast<int>(order_bin_indices_.size()); ++i) {
-        auto ptr = ordered_bins_[i].get();
+        ordered_bins_[order_bin_indices_[i]]->Init(nullptr, tree_config_->num_leaves);
-        if (ptr != nullptr) {
-          ptr->Init(nullptr, tree_config_->num_leaves);
-        }
      }
    } else {
      // bagging, only use part of data
@@ -239,12 +248,9 @@ void SerialTreeLearner::BeforeTrain() {
        is_data_in_leaf_[indices[i]] = 1;
      }
      // initialize ordered bin
-      #pragma omp parallel for schedule(guided)
+      #pragma omp parallel for schedule(static)
-      for (int i = 0; i < static_cast<int>(ordered_bins_.size()); ++i) {
+      for (int i = 0; i < static_cast<int>(order_bin_indices_.size()); ++i) {
-        auto ptr = ordered_bins_[i].get();
+        ordered_bins_[order_bin_indices_[i]]->Init(is_data_in_leaf_.data(), tree_config_->num_leaves);
-        if (ptr != nullptr) {
-          ptr->Init(is_data_in_leaf_.data(), tree_config_->num_leaves);
-        }
      }
 #pragma omp parallel for schedule(static)
      for (data_size_t i = begin; i < end; ++i) {
@@ -312,11 +318,8 @@ bool SerialTreeLearner::BeforeFindBestSplit(int left_leaf, int right_leaf) {
    }
    // split the ordered bin
    #pragma omp parallel for schedule(guided)
-    for (int i = 0; i < static_cast<int>(ordered_bins_.size()); ++i) {
+    for (int i = 0; i < static_cast<int>(order_bin_indices_.size()); ++i) {
-      auto ptr = ordered_bins_[i].get();
+      ordered_bins_[order_bin_indices_[i]]->Split(left_leaf, right_leaf, is_data_in_leaf_.data(), mark);
-      if (ptr != nullptr) {
-        ptr->Split(left_leaf, right_leaf, is_data_in_leaf_.data(), mark);
-      }
    }
 #pragma omp parallel for schedule(static)
    for (data_size_t i = begin; i < end; ++i) {
@@ -328,7 +331,7 @@ bool SerialTreeLearner::BeforeFindBestSplit(int left_leaf, int right_leaf) {
 void SerialTreeLearner::FindBestThresholds() {
  std::vector<int8_t> is_feature_used(num_features_, 0);
-#pragma omp parallel for schedule(guided)
+#pragma omp parallel for schedule(static)
  for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
    if (!is_feature_used_[feature_index]) continue;
    if (parent_leaf_histogram_array_ != nullptr 
@@ -364,7 +367,7 @@ void SerialTreeLearner::FindBestThresholds() {
  std::vector<SplitInfo> smaller_best(num_threads_);
  std::vector<SplitInfo> larger_best(num_threads_);
  // find splits
-  #pragma omp parallel for schedule(guided)
+  #pragma omp parallel for schedule(static)
  for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
    if (!is_feature_used[feature_index]) { continue; }
    const int tid = omp_get_thread_num();

--- a/src/treelearner/serial_tree_learner.h
+++ b/src/treelearner/serial_tree_learner.h
@@ -43,7 +43,7 @@ public:
  }
  void AddPredictionToScore(double* out_score) const override {
-    #pragma omp parallel for schedule(guided)
+    #pragma omp parallel for schedule(static)
    for (int i = 0; i < data_partition_->num_leaves(); ++i) {
      double output = static_cast<double>(last_trained_tree_->LeafOutput(i));
      data_size_t cnt_leaf_data = 0;
@@ -144,6 +144,7 @@ protected:
  /*! \brief config of tree learner*/
  const TreeConfig* tree_config_;
  int num_threads_;
+  std::vector<int> order_bin_indices_;
 };
 inline data_size_t SerialTreeLearner::GetGlobalDataCountInLeaf(int leafIdx) const {

--- a/src/treelearner/voting_parallel_tree_learner.cpp
+++ b/src/treelearner/voting_parallel_tree_learner.cpp
@@ -250,7 +250,7 @@ void VotingParallelTreeLearner::CopyLocalHistogram(const std::vector<int>& small
 void VotingParallelTreeLearner::FindBestThresholds() {
  // use local data to find local best splits
  std::vector<int8_t> is_feature_used(num_features_, 0);
-#pragma omp parallel for schedule(guided)
+#pragma omp parallel for schedule(static)
  for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
    if (!is_feature_used_[feature_index]) continue;
    if (parent_leaf_histogram_array_ != nullptr
@@ -288,7 +288,7 @@ void VotingParallelTreeLearner::FindBestThresholds() {
  std::vector<SplitInfo> larger_bestsplit_per_features(num_features_);
  // find splits
-#pragma omp parallel for schedule(guided)
+#pragma omp parallel for schedule(static)
  for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
    if (!is_feature_used[feature_index]) { continue; }
    train_data_->FixHistogram(feature_index,
@@ -360,7 +360,7 @@ void VotingParallelTreeLearner::FindBestThresholds() {
  std::vector<SplitInfo> smaller_best(num_threads_);
  std::vector<SplitInfo> larger_best(num_threads_);
  // find best split from local aggregated histograms
-#pragma omp parallel for schedule(guided)
+#pragma omp parallel for schedule(static)
  for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
    const int tid = omp_get_thread_num();
    if (smaller_is_feature_aggregated_[feature_index]) {