don't save num_thread as possible (#2839)

* don't cache `num_thread`, to avoid change outside * rename * update document * Update docs/Parameters.rst * Update include/LightGBM/config.h * Apply suggestions from code review Co-Authored-By: Nikita Titov <nekit94-08@mail.ru> * Apply suggestions from code review Co-Authored-By: Nikita Titov <nekit94-08@mail.ru> Co-authored-by: Nikita Titov <nekit94-08@mail.ru>

don't save num_thread as possible (#2839)
* don't cache `num_thread`, to avoid change outside * rename * update document * Update docs/Parameters.rst * Update include/LightGBM/config.h * Apply suggestions from code review Co-Authored-By: Nikita Titov <nekit94-08@mail.ru> * Apply suggestions from code review Co-Authored-By: Nikita Titov <nekit94-08@mail.ru> Co-authored-by: Nikita Titov <nekit94-08@mail.ru>
0aa7bfee · Guolin Ke · GitHub · 5a80b788 · 0aa7bfee · 0aa7bfee
Unverified Commit 0aa7bfee authored Mar 02, 2020 by Guolin Ke Committed by GitHub Mar 02, 2020
18 changed files
--- a/docs/Parameters.rst
+++ b/docs/Parameters.rst
@@ -171,6 +171,8 @@ Core Parameters
   -  for parallel learning, do not use all CPU cores because this will cause poor performance for the network communication
+   -  **Note**: please **don't** change this during training, especially when running multiple jobs simultaneously by external packages, otherwise it may cause undesirable errors
 -  ``device_type`` :raw-html:`<a id="device_type" title="Permalink to this parameter" href="#device_type">&#x1F517;&#xFE0E;</a>`, default = ``cpu``, type = enum, options: ``cpu``, ``gpu``, aliases: ``device``
   -  device for the tree learning, you can use GPU to achieve the faster learning

--- a/include/LightGBM/config.h
+++ b/include/LightGBM/config.h
@@ -192,6 +192,7 @@ struct Config {
  // desc = do not set it too large if your dataset is small (for instance, do not use 64 threads for a dataset with 10,000 rows)
  // desc = be aware a task manager or any similar CPU monitoring tool might report that cores not being fully utilized. **This is normal**
  // desc = for parallel learning, do not use all CPU cores because this will cause poor performance for the network communication
+  // desc = **Note**: please **don't** change this during training, especially when running multiple jobs simultaneously by external packages, otherwise it may cause undesirable errors
  int num_threads = 0;
  // [doc-only]

--- a/include/LightGBM/dataset.h
+++ b/include/LightGBM/dataset.h
@@ -292,10 +292,7 @@ struct TrainingTempState {
      return;
    }
    multi_val_bin.reset(bin);
-    int num_threads = 1;
+    int num_threads = OMP_NUM_THREADS();
-#pragma omp parallel
-#pragma omp master
-    { num_threads = omp_get_num_threads(); }
    num_bin_aligned =
        (bin->num_bin() + kAlignedSize - 1) / kAlignedSize * kAlignedSize;
    size_t new_size = static_cast<size_t>(num_bin_aligned) * 2 * num_threads;

--- a/include/LightGBM/utils/array_args.h
+++ b/include/LightGBM/utils/array_args.h
@@ -21,12 +21,7 @@ template<typename VAL_T>
 class ArrayArgs {
 public:
  inline static size_t ArgMaxMT(const std::vector<VAL_T>& array) {
-    int num_threads = 1;
+    int num_threads = OMP_NUM_THREADS();
-#pragma omp parallel
-#pragma omp master
-    {
-      num_threads = omp_get_num_threads();
-    }
    std::vector<size_t> arg_maxs(num_threads, 0);
    int n_blocks = Threading::For<size_t>(
        0, array.size(), 1024,

--- a/include/LightGBM/utils/common.h
+++ b/include/LightGBM/utils/common.h
@@ -727,12 +727,7 @@ template<typename _RanIt, typename _Pr, typename _VTRanIt> inline
 static void ParallelSort(_RanIt _First, _RanIt _Last, _Pr _Pred, _VTRanIt*) {
  size_t len = _Last - _First;
  const size_t kMinInnerLen = 1024;
-  int num_threads = 1;
+  int num_threads = OMP_NUM_THREADS();
-  #pragma omp parallel
-  #pragma omp master
-  {
-    num_threads = omp_get_num_threads();
-  }
  if (len <= kMinInnerLen || num_threads <= 1) {
    std::sort(_First, _Last, _Pred);
    return;
@@ -1032,10 +1027,7 @@ class Timer {
 public:
  Timer() {
 #ifdef TIMETAG
-    int num_threads = 1;
+    int num_threads = OMP_NUM_THREADS();
-#pragma omp parallel
-#pragma omp master
-    { num_threads = omp_get_num_threads(); }
    start_time_.resize(num_threads);
    stats_.resize(num_threads);
 #endif  // TIMETAG

--- a/include/LightGBM/utils/openmp_wrapper.h
+++ b/include/LightGBM/utils/openmp_wrapper.h
@@ -15,6 +15,14 @@
 #include <stdexcept>
 #include <vector>
+inline int OMP_NUM_THREADS() {
+  int ret = 1;
+#pragma omp parallel
+#pragma omp master
+  { ret = omp_get_num_threads(); }
+  return ret;
+}
 class ThreadExceptionHelper {
 public:
  ThreadExceptionHelper() {
@@ -70,6 +78,7 @@ class ThreadExceptionHelper {
  inline void omp_set_num_threads(int) {}
  inline int omp_get_num_threads() {return 1;}
  inline int omp_get_thread_num() {return 0;}
+  inline int OMP_NUM_THREADS() { return 1; }
 #ifdef __cplusplus
 };  // extern "C"
 #endif

--- a/include/LightGBM/utils/threading.h
+++ b/include/LightGBM/utils/threading.h
@@ -21,10 +21,7 @@ class Threading {
  template <typename INDEX_T>
  static inline void BlockInfo(INDEX_T cnt, INDEX_T min_cnt_per_block,
                               int* out_nblock, INDEX_T* block_size) {
-    int num_threads = 1;
+    int num_threads = OMP_NUM_THREADS();
-#pragma omp parallel
-#pragma omp master
-    { num_threads = omp_get_num_threads(); }
    BlockInfo<INDEX_T>(num_threads, cnt, min_cnt_per_block, out_nblock,
                       block_size);
  }
@@ -84,10 +81,7 @@ class ParallelPartitionRunner {
 public:
  ParallelPartitionRunner(INDEX_T num_data, INDEX_T min_block_size)
      : min_block_size_(min_block_size) {
-    num_threads_ = 1;
+    num_threads_ = OMP_NUM_THREADS();
-#pragma omp parallel
-#pragma omp master
-    { num_threads_ = omp_get_num_threads(); }
    left_.resize(num_data);
    if (TWO_BUFFER) {
      right_.resize(num_data);

--- a/src/application/predictor.hpp
+++ b/src/application/predictor.hpp
@@ -56,16 +56,13 @@ class Predictor {
      }
    }
-#pragma omp parallel
-#pragma omp master
-    { num_threads_ = omp_get_num_threads(); }
    boosting->InitPredict(num_iteration, predict_contrib);
    boosting_ = boosting;
    num_pred_one_row_ = boosting_->NumPredictOneRow(
        num_iteration, predict_leaf_index, predict_contrib);
    num_feature_ = boosting_->MaxFeatureIdx() + 1;
    predict_buf_.resize(
-        num_threads_,
+        OMP_NUM_THREADS(),
        std::vector<double, Common::AlignmentAllocator<double, kAlignedSize>>(
            num_feature_, 0.0f));
    const int kFeatureThreshold = 100000;
@@ -281,7 +278,6 @@ class Predictor {
  PredictionEarlyStopInstance early_stop_;
  int num_feature_;
  int num_pred_one_row_;
-  int num_threads_;
  std::vector<std::vector<double, Common::AlignmentAllocator<double, kAlignedSize>>> predict_buf_;
 };

--- a/src/c_api.cpp
+++ b/src/c_api.cpp
@@ -1529,12 +1529,7 @@ int LGBM_BoosterPredictForCSC(BoosterHandle handle,
  if (config.num_threads > 0) {
    omp_set_num_threads(config.num_threads);
  }
-  int num_threads = 1;
+  int num_threads = OMP_NUM_THREADS();
-  #pragma omp parallel
-  #pragma omp master
-  {
-    num_threads = omp_get_num_threads();
-  }
  int ncol = static_cast<int>(ncol_ptr - 1);
  std::vector<std::vector<CSC_RowIterator>> iterators(num_threads, std::vector<CSC_RowIterator>());
  for (int i = 0; i < num_threads; ++i) {

--- a/src/io/dataset.cpp
+++ b/src/io/dataset.cpp
@@ -506,10 +506,7 @@ MultiValBin* Dataset::GetMultiBinFromSparseFeatures() const {
  }
  const auto& offsets = feature_groups_[multi_group_id]->bin_offsets_;
  const int num_feature = feature_groups_[multi_group_id]->num_feature_;
-  int num_threads = 1;
+  int num_threads = OMP_NUM_THREADS();
-#pragma omp parallel
-#pragma omp master
-  { num_threads = omp_get_num_threads(); }
  std::vector<std::vector<std::unique_ptr<BinIterator>>> iters(num_threads);
  std::vector<uint32_t> most_freq_bins;
@@ -539,10 +536,7 @@ MultiValBin* Dataset::GetMultiBinFromSparseFeatures() const {
 MultiValBin* Dataset::GetMultiBinFromAllFeatures() const {
  Common::FunctionTimer fun_time("Dataset::GetMultiBinFromAllFeatures",
                                 global_timer);
-  int num_threads = 1;
+  int num_threads = OMP_NUM_THREADS();
-#pragma omp parallel
-#pragma omp master
-  { num_threads = omp_get_num_threads(); }
  double sum_dense_ratio = 0;
  std::unique_ptr<MultiValBin> ret;
@@ -1185,10 +1179,7 @@ void Dataset::ConstructHistogramsMultiVal(
  if (multi_val_bin == nullptr) {
    return;
  }
-  int num_threads = 1;
+  int num_threads = OMP_NUM_THREADS();
-#pragma omp parallel
-#pragma omp master
-  { num_threads = omp_get_num_threads(); }
  global_timer.Start("Dataset::sparse_bin_histogram");
  const int num_bin = multi_val_bin->num_bin();

--- a/src/io/multi_val_sparse_bin.hpp
+++ b/src/io/multi_val_sparse_bin.hpp
@@ -25,10 +25,7 @@ class MultiValSparseBin : public MultiValBin {
        estimate_element_per_row_(estimate_element_per_row) {
    row_ptr_.resize(num_data_ + 1, 0);
    INDEX_T estimate_num_data = static_cast<INDEX_T>(estimate_element_per_row_ * 1.1 * num_data_);
-    int num_threads = 1;
+    int num_threads = OMP_NUM_THREADS();
-#pragma omp parallel
-#pragma omp master
-    { num_threads = omp_get_num_threads(); }
    if (num_threads > 1) {
      t_data_.resize(num_threads - 1);
      for (size_t i = 0; i < t_data_.size(); ++i) {

--- a/src/io/sparse_bin.hpp
+++ b/src/io/sparse_bin.hpp
@@ -73,12 +73,7 @@ class SparseBin: public Bin {
  explicit SparseBin(data_size_t num_data)
    : num_data_(num_data) {
-    int num_threads = 1;
+    int num_threads = OMP_NUM_THREADS();
-    #pragma omp parallel
-    #pragma omp master
-    {
-      num_threads = omp_get_num_threads();
-    }
    push_buffers_.resize(num_threads);
  }

--- a/src/metric/map_metric.hpp
+++ b/src/metric/map_metric.hpp
@@ -23,12 +23,6 @@ class MapMetric:public Metric {
    // get eval position
    eval_at_ = config.eval_at;
    DCGCalculator::DefaultEvalAt(&eval_at_);
-    // get number of threads
-    #pragma omp parallel
-    #pragma omp master
-    {
-      num_threads_ = omp_get_num_threads();
-    }
  }
  ~MapMetric() {
@@ -110,8 +104,9 @@ class MapMetric:public Metric {
  }
  std::vector<double> Eval(const double* score, const ObjectiveFunction*) const override {
    // some buffers for multi-threading sum up
+    int num_threads = OMP_NUM_THREADS();
    std::vector<std::vector<double>> result_buffer_;
-    for (int i = 0; i < num_threads_; ++i) {
+    for (int i = 0; i < num_threads; ++i) {
      result_buffer_.emplace_back(eval_at_.size(), 0.0f);
    }
    std::vector<double> tmp_map(eval_at_.size(), 0.0f);
@@ -139,7 +134,7 @@ class MapMetric:public Metric {
    // Get final average MAP
    std::vector<double> result(eval_at_.size(), 0.0f);
    for (size_t j = 0; j < result.size(); ++j) {
-      for (int i = 0; i < num_threads_; ++i) {
+      for (int i = 0; i < num_threads; ++i) {
        result[j] += result_buffer_[i][j];
      }
      result[j] /= sum_query_weights_;
@@ -162,8 +157,6 @@ class MapMetric:public Metric {
  double sum_query_weights_;
  /*! \brief Evaluate position of Nmap */
  std::vector<data_size_t> eval_at_;
-  /*! \brief Number of threads */
-  int num_threads_;
  std::vector<std::string> name_;
  std::vector<data_size_t> npos_per_query_;
 };

--- a/src/metric/rank_metric.hpp
+++ b/src/metric/rank_metric.hpp
@@ -26,12 +26,6 @@ class NDCGMetric:public Metric {
    DCGCalculator::DefaultLabelGain(&label_gain);
    // initialize DCG calculator
    DCGCalculator::Init(label_gain);
-    // get number of threads
-    #pragma omp parallel
-    #pragma omp master
-    {
-      num_threads_ = omp_get_num_threads();
-    }
  }
  ~NDCGMetric() {
@@ -89,9 +83,10 @@ class NDCGMetric:public Metric {
  }
  std::vector<double> Eval(const double* score, const ObjectiveFunction*) const override {
+    int num_threads = OMP_NUM_THREADS();
    // some buffers for multi-threading sum up
    std::vector<std::vector<double>> result_buffer_;
-    for (int i = 0; i < num_threads_; ++i) {
+    for (int i = 0; i < num_threads; ++i) {
      result_buffer_.emplace_back(eval_at_.size(), 0.0f);
    }
    std::vector<double> tmp_dcg(eval_at_.size(), 0.0f);
@@ -139,7 +134,7 @@ class NDCGMetric:public Metric {
    // Get final average NDCG
    std::vector<double> result(eval_at_.size(), 0.0f);
    for (size_t j = 0; j < result.size(); ++j) {
-      for (int i = 0; i < num_threads_; ++i) {
+      for (int i = 0; i < num_threads; ++i) {
        result[j] += result_buffer_[i][j];
      }
      result[j] /= sum_query_weights_;
@@ -166,8 +161,6 @@ class NDCGMetric:public Metric {
  std::vector<data_size_t> eval_at_;
  /*! \brief Cache the inverse max dcg for all queries */
  std::vector<std::vector<double>> inverse_max_dcgs_;
-  /*! \brief Number of threads */
-  int num_threads_;
 };
 }  // namespace LightGBM

--- a/src/treelearner/data_parallel_tree_learner.cpp
+++ b/src/treelearner/data_parallel_tree_learner.cpp
@@ -165,8 +165,9 @@ void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplits() {
 template <typename TREELEARNER_T>
 void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(const std::vector<int8_t>&, bool) {
-  std::vector<SplitInfo> smaller_bests_per_thread(this->num_threads_, SplitInfo());
+  int num_threads = OMP_NUM_THREADS();
-  std::vector<SplitInfo> larger_bests_per_thread(this->num_threads_, SplitInfo());
+  std::vector<SplitInfo> smaller_bests_per_thread(num_threads, SplitInfo());
+  std::vector<SplitInfo> larger_bests_per_thread(num_threads, SplitInfo());
  std::vector<int8_t> smaller_node_used_features(this->num_features_, 1);
  std::vector<int8_t> larger_node_used_features(this->num_features_, 1);
  if (this->config_->feature_fraction_bynode < 1.0f) {

--- a/src/treelearner/serial_tree_learner.cpp
+++ b/src/treelearner/serial_tree_learner.cpp
@@ -21,11 +21,6 @@ namespace LightGBM {
 SerialTreeLearner::SerialTreeLearner(const Config* config)
  :config_(config) {
  random_ = Random(config_->feature_fraction_seed);
-  #pragma omp parallel
-  #pragma omp master
-  {
-    num_threads_ = omp_get_num_threads();
-  }
 }
 SerialTreeLearner::~SerialTreeLearner() {
@@ -400,8 +395,9 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(
    const std::vector<int8_t>& is_feature_used, bool use_subtract) {
  Common::FunctionTimer fun_timer(
      "SerialTreeLearner::FindBestSplitsFromHistograms", global_timer);
-  std::vector<SplitInfo> smaller_best(num_threads_);
+  int num_threads = OMP_NUM_THREADS();
-  std::vector<SplitInfo> larger_best(num_threads_);
+  std::vector<SplitInfo> smaller_best(num_threads);
+  std::vector<SplitInfo> larger_best(num_threads);
  std::vector<int8_t> smaller_node_used_features(num_features_, 1);
  std::vector<int8_t> larger_node_used_features(num_features_, 1);
  if (config_->feature_fraction_bynode < 1.0f) {

--- a/src/treelearner/serial_tree_learner.h
+++ b/src/treelearner/serial_tree_learner.h
@@ -189,7 +189,6 @@ class SerialTreeLearner: public TreeLearner {
  HistogramPool histogram_pool_;
  /*! \brief config of tree learner*/
  const Config* config_;
-  int num_threads_;
  std::vector<int> ordered_bin_indices_;
  bool is_constant_hessian_;
  std::unique_ptr<TrainingTempState> temp_state_;

--- a/src/treelearner/voting_parallel_tree_learner.cpp
+++ b/src/treelearner/voting_parallel_tree_learner.cpp
@@ -349,8 +349,9 @@ void VotingParallelTreeLearner<TREELEARNER_T>::FindBestSplits() {
 template <typename TREELEARNER_T>
 void VotingParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(const std::vector<int8_t>&, bool) {
-  std::vector<SplitInfo> smaller_bests_per_thread(this->num_threads_);
+  int num_threads = OMP_NUM_THREADS();
-  std::vector<SplitInfo> larger_best_per_thread(this->num_threads_);
+  std::vector<SplitInfo> smaller_bests_per_thread(num_threads);
+  std::vector<SplitInfo> larger_best_per_thread(num_threads);
  std::vector<int8_t> smaller_node_used_features(this->num_features_, 1);
  std::vector<int8_t> larger_node_used_features(this->num_features_, 1);
  if (this->config_->feature_fraction_bynode < 1.0f) {