Unverified Commit 8ed371ce authored by James Lamb's avatar James Lamb Committed by GitHub
Browse files

set explicit number of threads in every OpenMP `parallel` region (#6135)

parent 992f5056
......@@ -153,7 +153,7 @@ int Tree::SplitCategorical(int leaf, int feature, int real_feature, const uint32
void Tree::AddPredictionToScore(const Dataset* data, data_size_t num_data, double* score) const {
if (!is_linear_ && num_leaves_ <= 1) {
if (leaf_value_[0] != 0.0f) {
#pragma omp parallel for schedule(static, 512) if (num_data >= 1024)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_data >= 1024)
for (data_size_t i = 0; i < num_data; ++i) {
score[i] += leaf_value_[0];
}
......@@ -234,7 +234,7 @@ void Tree::AddPredictionToScore(const Dataset* data,
data_size_t num_data, double* score) const {
if (!is_linear_ && num_leaves_ <= 1) {
if (leaf_value_[0] != 0.0f) {
#pragma omp parallel for schedule(static, 512) if (num_data >= 1024)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_data >= 1024)
for (data_size_t i = 0; i < num_data; ++i) {
score[used_data_indices[i]] += leaf_value_[0];
}
......
......@@ -61,13 +61,13 @@ class BinaryMetric: public Metric {
double sum_loss = 0.0f;
if (objective == nullptr) {
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
// add loss
sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], score[i]);
}
} else {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
// add loss
sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], score[i]) * weights_[i];
......@@ -75,7 +75,7 @@ class BinaryMetric: public Metric {
}
} else {
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
double prob = 0;
objective->ConvertOutput(&score[i], &prob);
......@@ -83,7 +83,7 @@ class BinaryMetric: public Metric {
sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], prob);
}
} else {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
double prob = 0;
objective->ConvertOutput(&score[i], &prob);
......
......@@ -111,7 +111,7 @@ class MapMetric:public Metric {
}
std::vector<double> tmp_map(eval_at_.size(), 0.0f);
if (query_weights_ == nullptr) {
#pragma omp parallel for schedule(guided) firstprivate(tmp_map)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(guided) firstprivate(tmp_map)
for (data_size_t i = 0; i < num_queries_; ++i) {
const int tid = omp_get_thread_num();
CalMapAtK(eval_at_, npos_per_query_[i], label_ + query_boundaries_[i],
......@@ -121,7 +121,7 @@ class MapMetric:public Metric {
}
}
} else {
#pragma omp parallel for schedule(guided) firstprivate(tmp_map)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(guided) firstprivate(tmp_map)
for (data_size_t i = 0; i < num_queries_; ++i) {
const int tid = omp_get_thread_num();
CalMapAtK(eval_at_, npos_per_query_[i], label_ + query_boundaries_[i],
......
......@@ -63,7 +63,7 @@ class MulticlassMetric: public Metric {
}
if (objective != nullptr) {
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
std::vector<double> raw_score(num_tree_per_iteration);
for (int k = 0; k < num_tree_per_iteration; ++k) {
......@@ -76,7 +76,7 @@ class MulticlassMetric: public Metric {
sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], &rec, config_);
}
} else {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
std::vector<double> raw_score(num_tree_per_iteration);
for (int k = 0; k < num_tree_per_iteration; ++k) {
......@@ -91,7 +91,7 @@ class MulticlassMetric: public Metric {
}
} else {
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
std::vector<double> rec(num_tree_per_iteration);
for (int k = 0; k < num_tree_per_iteration; ++k) {
......@@ -102,7 +102,7 @@ class MulticlassMetric: public Metric {
sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], &rec, config_);
}
} else {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
std::vector<double> rec(num_tree_per_iteration);
for (int k = 0; k < num_tree_per_iteration; ++k) {
......
......@@ -57,7 +57,7 @@ class NDCGMetric:public Metric {
}
inverse_max_dcgs_.resize(num_queries_);
// cache the inverse max DCG for all queries, used to calculate NDCG
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_queries_; ++i) {
inverse_max_dcgs_[i].resize(eval_at_.size(), 0.0f);
DCGCalculator::CalMaxDCG(eval_at_, label_ + query_boundaries_[i],
......@@ -92,7 +92,7 @@ class NDCGMetric:public Metric {
}
std::vector<double> tmp_dcg(eval_at_.size(), 0.0f);
if (query_weights_ == nullptr) {
#pragma omp parallel for schedule(static) firstprivate(tmp_dcg)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) firstprivate(tmp_dcg)
for (data_size_t i = 0; i < num_queries_; ++i) {
const int tid = omp_get_thread_num();
// if all doc in this query are all negative, let its NDCG=1
......@@ -112,7 +112,7 @@ class NDCGMetric:public Metric {
}
}
} else {
#pragma omp parallel for schedule(static) firstprivate(tmp_dcg)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) firstprivate(tmp_dcg)
for (data_size_t i = 0; i < num_queries_; ++i) {
const int tid = omp_get_thread_num();
// if all doc in this query are all negative, let its NDCG=1
......
......@@ -59,13 +59,13 @@ class RegressionMetric: public Metric {
double sum_loss = 0.0f;
if (objective == nullptr) {
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
// add loss
sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], score[i], config_);
}
} else {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
// add loss
sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], score[i], config_) * weights_[i];
......@@ -73,7 +73,7 @@ class RegressionMetric: public Metric {
}
} else {
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
// add loss
double t = 0;
......@@ -81,7 +81,7 @@ class RegressionMetric: public Metric {
sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], t, config_);
}
} else {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
// add loss
double t = 0;
......
......@@ -107,26 +107,26 @@ class CrossEntropyMetric : public Metric {
double sum_loss = 0.0f;
if (objective == nullptr) {
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
sum_loss += XentLoss(label_[i], score[i]); // NOTE: does not work unless score is a probability
}
} else {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
sum_loss += XentLoss(label_[i], score[i]) * weights_[i]; // NOTE: does not work unless score is a probability
}
}
} else {
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
double p = 0;
objective->ConvertOutput(&score[i], &p);
sum_loss += XentLoss(label_[i], p);
}
} else {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
double p = 0;
objective->ConvertOutput(&score[i], &p);
......@@ -192,13 +192,13 @@ class CrossEntropyLambdaMetric : public Metric {
double sum_loss = 0.0f;
if (objective == nullptr) {
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
double hhat = std::log1p(std::exp(score[i])); // auto-convert
sum_loss += XentLambdaLoss(label_[i], 1.0f, hhat);
}
} else {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
double hhat = std::log1p(std::exp(score[i])); // auto-convert
sum_loss += XentLambdaLoss(label_[i], weights_[i], hhat);
......@@ -206,14 +206,14 @@ class CrossEntropyLambdaMetric : public Metric {
}
} else {
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
double hhat = 0;
objective->ConvertOutput(&score[i], &hhat); // NOTE: this only works if objective = "xentlambda"
sum_loss += XentLambdaLoss(label_[i], 1.0f, hhat);
}
} else {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
double hhat = 0;
objective->ConvertOutput(&score[i], &hhat); // NOTE: this only works if objective = "xentlambda"
......@@ -299,26 +299,26 @@ class KullbackLeiblerDivergence : public Metric {
double sum_loss = 0.0f;
if (objective == nullptr) {
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
sum_loss += XentLoss(label_[i], score[i]); // NOTE: does not work unless score is a probability
}
} else {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
sum_loss += XentLoss(label_[i], score[i]) * weights_[i]; // NOTE: does not work unless score is a probability
}
}
} else {
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
double p = 0;
objective->ConvertOutput(&score[i], &p);
sum_loss += XentLoss(label_[i], p);
}
} else {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
double p = 0;
objective->ConvertOutput(&score[i], &p);
......
......@@ -63,7 +63,7 @@ class BinaryLogloss: public ObjectiveFunction {
data_size_t cnt_positive = 0;
data_size_t cnt_negative = 0;
// count for positive and negative samples
#pragma omp parallel for schedule(static) reduction(+:cnt_positive, cnt_negative)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:cnt_positive, cnt_negative)
for (data_size_t i = 0; i < num_data_; ++i) {
if (is_pos_(label_[i])) {
++cnt_positive;
......@@ -107,7 +107,7 @@ class BinaryLogloss: public ObjectiveFunction {
return;
}
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
// get label and label weights
const int is_pos = is_pos_(label_[i]);
......@@ -120,7 +120,7 @@ class BinaryLogloss: public ObjectiveFunction {
hessians[i] = static_cast<score_t>(abs_response * (sigmoid_ - abs_response) * label_weight);
}
} else {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
// get label and label weights
const int is_pos = is_pos_(label_[i]);
......@@ -140,14 +140,14 @@ class BinaryLogloss: public ObjectiveFunction {
double suml = 0.0f;
double sumw = 0.0f;
if (weights_ != nullptr) {
#pragma omp parallel for schedule(static) reduction(+:suml, sumw) if (!deterministic_)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:suml, sumw) if (!deterministic_)
for (data_size_t i = 0; i < num_data_; ++i) {
suml += is_pos_(label_[i]) * weights_[i];
sumw += weights_[i];
}
} else {
sumw = static_cast<double>(num_data_);
#pragma omp parallel for schedule(static) reduction(+:suml) if (!deterministic_)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:suml) if (!deterministic_)
for (data_size_t i = 0; i < num_data_; ++i) {
suml += is_pos_(label_[i]);
}
......
......@@ -86,7 +86,7 @@ class MulticlassSoftmax: public ObjectiveFunction {
void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override {
if (weights_ == nullptr) {
std::vector<double> rec;
#pragma omp parallel for schedule(static) private(rec)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) private(rec)
for (data_size_t i = 0; i < num_data_; ++i) {
rec.resize(num_class_);
for (int k = 0; k < num_class_; ++k) {
......@@ -107,7 +107,7 @@ class MulticlassSoftmax: public ObjectiveFunction {
}
} else {
std::vector<double> rec;
#pragma omp parallel for schedule(static) private(rec)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) private(rec)
for (data_size_t i = 0; i < num_data_; ++i) {
rec.resize(num_class_);
for (int k = 0; k < num_class_; ++k) {
......
......@@ -58,7 +58,7 @@ class RankingObjective : public ObjectiveFunction {
void GetGradients(const double* score, score_t* gradients,
score_t* hessians) const override {
#pragma omp parallel for schedule(guided)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(guided)
for (data_size_t i = 0; i < num_queries_; ++i) {
const data_size_t start = query_boundaries_[i];
const data_size_t cnt = query_boundaries_[i + 1] - query_boundaries_[i];
......@@ -157,7 +157,7 @@ class LambdarankNDCG : public RankingObjective {
DCGCalculator::CheckMetadata(metadata, num_queries_);
DCGCalculator::CheckLabel(label_, num_data_);
inverse_max_dcgs_.resize(num_queries_);
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_queries_; ++i) {
inverse_max_dcgs_[i] = DCGCalculator::CalMaxDCGAtK(
truncation_level_, label_ + query_boundaries_[i],
......
......@@ -115,7 +115,7 @@ class RegressionL2loss: public ObjectiveFunction {
label_ = metadata.label();
if (sqrt_) {
trans_label_.resize(num_data_);
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data; ++i) {
trans_label_[i] = Common::Sign(label_[i]) * std::sqrt(std::fabs(label_[i]));
}
......@@ -127,13 +127,13 @@ class RegressionL2loss: public ObjectiveFunction {
void GetGradients(const double* score, score_t* gradients,
score_t* hessians) const override {
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
gradients[i] = static_cast<score_t>(score[i] - label_[i]);
hessians[i] = 1.0f;
}
} else {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
gradients[i] = static_cast<score_t>(static_cast<score_t>((score[i] - label_[i])) * weights_[i]);
hessians[i] = static_cast<score_t>(weights_[i]);
......@@ -174,14 +174,14 @@ class RegressionL2loss: public ObjectiveFunction {
double suml = 0.0f;
double sumw = 0.0f;
if (weights_ != nullptr) {
#pragma omp parallel for schedule(static) reduction(+:suml, sumw) if (!deterministic_)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:suml, sumw) if (!deterministic_)
for (data_size_t i = 0; i < num_data_; ++i) {
suml += static_cast<double>(label_[i]) * weights_[i];
sumw += weights_[i];
}
} else {
sumw = static_cast<double>(num_data_);
#pragma omp parallel for schedule(static) reduction(+:suml) if (!deterministic_)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:suml) if (!deterministic_)
for (data_size_t i = 0; i < num_data_; ++i) {
suml += label_[i];
}
......@@ -217,14 +217,14 @@ class RegressionL1loss: public RegressionL2loss {
void GetGradients(const double* score, score_t* gradients,
score_t* hessians) const override {
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
const double diff = score[i] - label_[i];
gradients[i] = static_cast<score_t>(Common::Sign(diff));
hessians[i] = 1.0f;
}
} else {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
const double diff = score[i] - label_[i];
gradients[i] = static_cast<score_t>(Common::Sign(diff) * weights_[i]);
......@@ -313,7 +313,7 @@ class RegressionHuberLoss: public RegressionL2loss {
void GetGradients(const double* score, score_t* gradients,
score_t* hessians) const override {
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
const double diff = score[i] - label_[i];
if (std::abs(diff) <= alpha_) {
......@@ -324,7 +324,7 @@ class RegressionHuberLoss: public RegressionL2loss {
hessians[i] = 1.0f;
}
} else {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
const double diff = score[i] - label_[i];
if (std::abs(diff) <= alpha_) {
......@@ -362,14 +362,14 @@ class RegressionFairLoss: public RegressionL2loss {
void GetGradients(const double* score, score_t* gradients,
score_t* hessians) const override {
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
const double x = score[i] - label_[i];
gradients[i] = static_cast<score_t>(c_ * x / (std::fabs(x) + c_));
hessians[i] = static_cast<score_t>(c_ * c_ / ((std::fabs(x) + c_) * (std::fabs(x) + c_)));
}
} else {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
const double x = score[i] - label_[i];
gradients[i] = static_cast<score_t>(c_ * x / (std::fabs(x) + c_) * weights_[i]);
......@@ -441,14 +441,14 @@ class RegressionPoissonLoss: public RegressionL2loss {
score_t* hessians) const override {
double exp_max_delta_step_ = std::exp(max_delta_step_);
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
double exp_score = std::exp(score[i]);
gradients[i] = static_cast<score_t>(exp_score - label_[i]);
hessians[i] = static_cast<score_t>(exp_score * exp_max_delta_step_);
}
} else {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
double exp_score = std::exp(score[i]);
gradients[i] = static_cast<score_t>((exp_score - label_[i]) * weights_[i]);
......@@ -493,7 +493,7 @@ class RegressionQuantileloss : public RegressionL2loss {
void GetGradients(const double* score, score_t* gradients,
score_t* hessians) const override {
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
score_t delta = static_cast<score_t>(score[i] - label_[i]);
if (delta >= 0) {
......@@ -504,7 +504,7 @@ class RegressionQuantileloss : public RegressionL2loss {
hessians[i] = 1.0f;
}
} else {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
score_t delta = static_cast<score_t>(score[i] - label_[i]);
if (delta >= 0) {
......@@ -598,12 +598,12 @@ class RegressionMAPELOSS : public RegressionL1loss {
}
label_weight_.resize(num_data);
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
label_weight_[i] = 1.0f / std::max(1.0f, std::fabs(label_[i]));
}
} else {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
label_weight_[i] = 1.0f / std::max(1.0f, std::fabs(label_[i])) * weights_[i];
}
......@@ -613,14 +613,14 @@ class RegressionMAPELOSS : public RegressionL1loss {
void GetGradients(const double* score, score_t* gradients,
score_t* hessians) const override {
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
const double diff = score[i] - label_[i];
gradients[i] = static_cast<score_t>(Common::Sign(diff) * label_weight_[i]);
hessians[i] = 1.0f;
}
} else {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
const double diff = score[i] - label_[i];
gradients[i] = static_cast<score_t>(Common::Sign(diff) * label_weight_[i]);
......@@ -690,14 +690,14 @@ class RegressionGammaLoss : public RegressionPoissonLoss {
void GetGradients(const double* score, score_t* gradients,
score_t* hessians) const override {
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
double exp_score = std::exp(-score[i]);
gradients[i] = static_cast<score_t>(1.0 - label_[i] * exp_score);
hessians[i] = static_cast<score_t>(label_[i] * exp_score);
}
} else {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
double exp_score = std::exp(-score[i]);
gradients[i] = static_cast<score_t>((1.0 - label_[i] * exp_score) * weights_[i]);
......@@ -728,7 +728,7 @@ class RegressionTweedieLoss: public RegressionPoissonLoss {
void GetGradients(const double* score, score_t* gradients,
score_t* hessians) const override {
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
double exp_1_score = std::exp((1 - rho_) * score[i]);
double exp_2_score = std::exp((2 - rho_) * score[i]);
......@@ -737,7 +737,7 @@ class RegressionTweedieLoss: public RegressionPoissonLoss {
(2 - rho_) * exp_2_score);
}
} else {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
double exp_1_score = std::exp((1 - rho_) * score[i]);
double exp_2_score = std::exp((2 - rho_) * score[i]);
......
......@@ -77,7 +77,7 @@ class CrossEntropy: public ObjectiveFunction {
void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override {
if (weights_ == nullptr) {
// compute pointwise gradients and Hessians with implied unit weights
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
const double z = 1.0f / (1.0f + std::exp(-score[i]));
gradients[i] = static_cast<score_t>(z - label_[i]);
......@@ -85,7 +85,7 @@ class CrossEntropy: public ObjectiveFunction {
}
} else {
// compute pointwise gradients and Hessians with given weights
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
const double z = 1.0f / (1.0f + std::exp(-score[i]));
gradients[i] = static_cast<score_t>((z - label_[i]) * weights_[i]);
......@@ -114,7 +114,7 @@ class CrossEntropy: public ObjectiveFunction {
double suml = 0.0f;
double sumw = 0.0f;
if (weights_ != nullptr) {
#pragma omp parallel for schedule(static) reduction(+:suml, sumw) if (!deterministic_)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:suml, sumw) if (!deterministic_)
for (data_size_t i = 0; i < num_data_; ++i) {
suml += static_cast<double>(label_[i]) * weights_[i];
......@@ -122,7 +122,7 @@ class CrossEntropy: public ObjectiveFunction {
}
} else {
sumw = static_cast<double>(num_data_);
#pragma omp parallel for schedule(static) reduction(+:suml) if (!deterministic_)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:suml) if (!deterministic_)
for (data_size_t i = 0; i < num_data_; ++i) {
suml += label_[i];
......@@ -190,7 +190,7 @@ class CrossEntropyLambda: public ObjectiveFunction {
void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override {
if (weights_ == nullptr) {
// compute pointwise gradients and Hessians with implied unit weights; exactly equivalent to CrossEntropy with unit weights
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
const double z = 1.0f / (1.0f + std::exp(-score[i]));
gradients[i] = static_cast<score_t>(z - label_[i]);
......@@ -198,7 +198,7 @@ class CrossEntropyLambda: public ObjectiveFunction {
}
} else {
// compute pointwise gradients and Hessians with given weights
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
const double w = weights_[i];
const double y = label_[i];
......@@ -244,7 +244,7 @@ class CrossEntropyLambda: public ObjectiveFunction {
double suml = 0.0f;
double sumw = 0.0f;
if (weights_ != nullptr) {
#pragma omp parallel for schedule(static) reduction(+:suml, sumw) if (!deterministic_)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:suml, sumw) if (!deterministic_)
for (data_size_t i = 0; i < num_data_; ++i) {
suml += static_cast<double>(label_[i]) * weights_[i];
......@@ -252,7 +252,7 @@ class CrossEntropyLambda: public ObjectiveFunction {
}
} else {
sumw = static_cast<double>(num_data_);
#pragma omp parallel for schedule(static) reduction(+:suml) if (!deterministic_)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:suml) if (!deterministic_)
for (data_size_t i = 0; i < num_data_; ++i) {
suml += label_[i];
......
......@@ -79,7 +79,7 @@ class ColSampler {
static_cast<int>(valid_feature_indices_.size()), used_cnt_bytree_);
int omp_loop_size = static_cast<int>(used_feature_indices_.size());
#pragma omp parallel for schedule(static, 512) if (omp_loop_size >= 1024)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (omp_loop_size >= 1024)
for (int i = 0; i < omp_loop_size; ++i) {
int used_feature = valid_feature_indices_[used_feature_indices_[i]];
int inner_feature_index = train_data_->InnerFeatureIndex(used_feature);
......@@ -142,7 +142,7 @@ class ColSampler {
auto sampled_indices = random_.Sample(
static_cast<int>((*allowed_used_feature_indices).size()), used_feature_cnt);
int omp_loop_size = static_cast<int>(sampled_indices.size());
#pragma omp parallel for schedule(static, 512) if (omp_loop_size >= 1024)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (omp_loop_size >= 1024)
for (int i = 0; i < omp_loop_size; ++i) {
int used_feature =
valid_feature_indices_[(*allowed_used_feature_indices)[sampled_indices[i]]];
......@@ -168,7 +168,7 @@ class ColSampler {
auto sampled_indices = random_.Sample(
static_cast<int>((*allowed_valid_feature_indices).size()), used_feature_cnt);
int omp_loop_size = static_cast<int>(sampled_indices.size());
#pragma omp parallel for schedule(static, 512) if (omp_loop_size >= 1024)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (omp_loop_size >= 1024)
for (int i = 0; i < omp_loop_size; ++i) {
int used_feature = (*allowed_valid_feature_indices)[sampled_indices[i]];
int inner_feature_index = train_data_->InnerFeatureIndex(used_feature);
......
......@@ -405,7 +405,7 @@ void CUDASingleGPUTreeLearner::RenewTreeOutput(Tree* tree, const ObjectiveFuncti
}
std::vector<int> n_nozeroworker_perleaf(cuda_tree->num_leaves(), 1);
int num_machines = Network::num_machines();
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int i = 0; i < cuda_tree->num_leaves(); ++i) {
const double output = static_cast<double>(cuda_tree->LeafOutput(i));
data_size_t cnt_leaf_data = leaf_num_data_[i];
......
......@@ -228,7 +228,7 @@ void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplits(const Tree* tree) {
if (local_data_on_smaller_leaf <= 0) {
// clear histogram buffer before synchronizing
// otherwise histogram contents from the previous iteration will be sent
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int feature_index = 0; feature_index < this->num_features_; ++feature_index) {
if (this->col_sampler_.is_feature_used_bytree()[feature_index] == false)
continue;
......@@ -249,7 +249,7 @@ void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplits(const Tree* tree) {
// construct local histograms
global_timer.Start("DataParallelTreeLearner::ReduceHistogram");
global_timer.Start("DataParallelTreeLearner::ReduceHistogram::Copy");
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int feature_index = 0; feature_index < this->num_features_; ++feature_index) {
if (this->col_sampler_.is_feature_used_bytree()[feature_index] == false)
continue;
......@@ -318,7 +318,7 @@ void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(const
if (parent_num_bits > 16 && larger_leaf_num_bits <= 16) {
CHECK_LE(smaller_leaf_num_bits, 16);
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int feature_index = 0; feature_index < this->num_features_; ++feature_index) {
OMP_LOOP_EX_BEGIN();
if (!is_feature_aggregated_[feature_index]) continue;
......@@ -330,7 +330,7 @@ void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(const
}
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int feature_index = 0; feature_index < this->num_features_; ++feature_index) {
OMP_LOOP_EX_BEGIN();
if (!is_feature_aggregated_[feature_index]) continue;
......
......@@ -52,7 +52,7 @@ class DataPartition {
if (used_data_indices_ == nullptr) {
// if using all data
leaf_count_[0] = num_data_;
#pragma omp parallel for schedule(static, 512) if (num_data_ >= 1024)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_data_ >= 1024)
for (data_size_t i = 0; i < num_data_; ++i) {
indices_[i] = i;
}
......
......@@ -1692,7 +1692,7 @@ class HistogramPool {
auto& ref_feature_meta = *feature_meta;
const int num_feature = train_data->num_features();
ref_feature_meta.resize(num_feature);
#pragma omp parallel for schedule(static, 512) if (num_feature >= 1024)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_feature >= 1024)
for (int i = 0; i < num_feature; ++i) {
if (USE_DATA) {
ref_feature_meta[i].num_bin = train_data->FeatureNumBin(i);
......@@ -1749,7 +1749,7 @@ class HistogramPool {
if (config->use_quantized_grad) {
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int i = old_cache_size; i < cache_size; ++i) {
OMP_LOOP_EX_BEGIN();
pool_[i].reset(new FeatureHistogram[train_data->num_features()]);
......@@ -1763,7 +1763,7 @@ class HistogramPool {
OMP_THROW_EX();
} else {
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int i = old_cache_size; i < cache_size; ++i) {
OMP_LOOP_EX_BEGIN();
pool_[i].reset(new FeatureHistogram[train_data->num_features()]);
......@@ -1787,7 +1787,7 @@ class HistogramPool {
old_config->extra_trees != config->extra_trees ||
old_config->max_delta_step != config->max_delta_step ||
old_config->path_smooth != config->path_smooth) {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int i = 0; i < cache_size_; ++i) {
for (int j = 0; j < train_data->num_features(); ++j) {
pool_[i][j].ResetFunc();
......
......@@ -191,7 +191,7 @@ void GPUTreeLearner::WaitAndGetHistograms(hist_t* histograms) {
HistType* hist_outputs = reinterpret_cast<HistType*>(host_histogram_outputs_);
// when the output is ready, the computation is done
histograms_wait_obj_.wait();
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int i = 0; i < num_dense_feature_groups_; ++i) {
if (!feature_masks_[i]) {
continue;
......@@ -359,7 +359,7 @@ void GPUTreeLearner::AllocateGPUMemory() {
0, num_data_ * sizeof(Feature4)));
}
// building Feature4 bundles; each thread handles dword_features_ features
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int i = 0; i < static_cast<int>(dense_feature_group_map_.size() / dword_features_); ++i) {
int tid = omp_get_thread_num();
Feature4* host4 = host4_ptrs[tid];
......@@ -451,7 +451,7 @@ void GPUTreeLearner::AllocateGPUMemory() {
BinIterator* bin_iter = train_data_->FeatureGroupIterator(dense_dword_ind[i]);
if (dynamic_cast<DenseBinIterator<uint8_t, true>*>(bin_iter) != 0) {
DenseBinIterator<uint8_t, true> iter = *static_cast<DenseBinIterator<uint8_t, true>*>(bin_iter);
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int j = 0; j < num_data_; ++j) {
host4[j].s[i >> 1] |= (uint8_t)((iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i]
+ ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1)))
......@@ -464,14 +464,14 @@ void GPUTreeLearner::AllocateGPUMemory() {
BinIterator* bin_iter = train_data_->FeatureGroupIterator(dense_dword_ind[i]);
if (dynamic_cast<DenseBinIterator<uint8_t, false>*>(bin_iter) != 0) {
DenseBinIterator<uint8_t, false> iter = *static_cast<DenseBinIterator<uint8_t, false>*>(bin_iter);
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int j = 0; j < num_data_; ++j) {
host4[j].s[i] = (uint8_t)(iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i]
+ ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1)));
}
} else if (dynamic_cast<DenseBinIterator<uint8_t, true>*>(bin_iter) != 0) {
DenseBinIterator<uint8_t, true> iter = *static_cast<DenseBinIterator<uint8_t, true>*>(bin_iter);
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int j = 0; j < num_data_; ++j) {
host4[j].s[i] = (uint8_t)(iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i]
+ ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1)));
......@@ -485,7 +485,7 @@ void GPUTreeLearner::AllocateGPUMemory() {
}
// fill the leftover features
if (dword_features_ == 8) {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int j = 0; j < num_data_; ++j) {
for (int i = k; i < dword_features_; ++i) {
// fill this empty feature with some "random" value
......@@ -493,7 +493,7 @@ void GPUTreeLearner::AllocateGPUMemory() {
}
}
} else if (dword_features_ == 4) {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int j = 0; j < num_data_; ++j) {
for (int i = k; i < dword_features_; ++i) {
// fill this empty feature with some "random" value
......@@ -572,7 +572,7 @@ void GPUTreeLearner::BuildGPUKernels() {
// currently we don't use constant memory
int use_constants = 0;
OMP_INIT_EX();
#pragma omp parallel for schedule(guided)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(guided)
for (int i = 0; i <= kMaxLogWorkgroupsPerFeature; ++i) {
OMP_LOOP_EX_BEGIN();
boost::compute::program program;
......@@ -811,7 +811,7 @@ void GPUTreeLearner::BeforeTrain() {
// transfer the indices to GPU
indices_future_ = boost::compute::copy_async(indices, indices + cnt, device_data_indices_->begin(), queue_);
if (!share_state_->is_constant_hessian) {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < cnt; ++i) {
ordered_hessians_[i] = hessians_[indices[i]];
}
......@@ -827,7 +827,7 @@ void GPUTreeLearner::BeforeTrain() {
histogram_fulldata_kernels_[i].set_arg(6, const_hessian);
}
}
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < cnt; ++i) {
ordered_gradients_[i] = gradients_[indices[i]];
}
......@@ -865,7 +865,7 @@ bool GPUTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int ri
indices_future_ = boost::compute::copy_async(indices + begin, indices + end, device_data_indices_->begin(), queue_);
if (!share_state_->is_constant_hessian) {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = begin; i < end; ++i) {
ordered_hessians_[i - begin] = hessians_[indices[i]];
}
......@@ -873,7 +873,7 @@ bool GPUTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int ri
hessians_future_ = queue_.enqueue_write_buffer_async(device_hessians_, 0, (end - begin) * sizeof(score_t), ptr_pinned_hessians_);
}
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = begin; i < end; ++i) {
ordered_gradients_[i - begin] = gradients_[indices[i]];
}
......@@ -907,7 +907,7 @@ bool GPUTreeLearner::ConstructGPUHistogramsAsync(
// generate and copy ordered_gradients if gradients is not null
if (gradients != nullptr) {
if (num_data != num_data_) {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data; ++i) {
ordered_gradients[i] = gradients[data_indices[i]];
}
......@@ -919,7 +919,7 @@ bool GPUTreeLearner::ConstructGPUHistogramsAsync(
// generate and copy ordered_hessians if Hessians is not null
if (hessians != nullptr && !share_state_->is_constant_hessian) {
if (num_data != num_data_) {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data; ++i) {
ordered_hessians[i] = hessians[data_indices[i]];
}
......@@ -930,7 +930,7 @@ bool GPUTreeLearner::ConstructGPUHistogramsAsync(
}
// converted indices in is_feature_used to feature-group indices
std::vector<int8_t> is_feature_group_used(num_feature_groups_, 0);
#pragma omp parallel for schedule(static, 1024) if (num_features_ >= 2048)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 1024) if (num_features_ >= 2048)
for (int i = 0; i < num_features_; ++i) {
if (is_feature_used[i]) {
is_feature_group_used[train_data_->Feature2Group(i)] = 1;
......@@ -938,7 +938,7 @@ bool GPUTreeLearner::ConstructGPUHistogramsAsync(
}
// construct the feature masks for dense feature-groups
int used_dense_feature_groups = 0;
#pragma omp parallel for schedule(static, 1024) reduction(+:used_dense_feature_groups) if (num_dense_feature_groups_ >= 2048)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 1024) reduction(+:used_dense_feature_groups) if (num_dense_feature_groups_ >= 2048)
for (int i = 0; i < num_dense_feature_groups_; ++i) {
if (is_feature_group_used[dense_feature_group_map_[i]]) {
feature_masks_[i] = 1;
......@@ -973,7 +973,7 @@ bool GPUTreeLearner::ConstructGPUHistogramsAsync(
void GPUTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract) {
std::vector<int8_t> is_sparse_feature_used(num_features_, 0);
std::vector<int8_t> is_dense_feature_used(num_features_, 0);
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
if (!col_sampler_.is_feature_used_bytree()[feature_index]) continue;
if (!is_feature_used[feature_index]) continue;
......
......@@ -216,7 +216,7 @@ void GradientDiscretizer::RenewIntGradTreeOutput(
data_size_t leaf_cnt = 0;
const data_size_t* data_indices = data_partition->GetIndexOnLeaf(leaf_id, &leaf_cnt);
double sum_gradient = 0.0f, sum_hessian = 0.0f;
#pragma omp parallel for schedule(static) reduction(+:sum_gradient, sum_hessian)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_gradient, sum_hessian)
for (data_size_t i = 0; i < leaf_cnt; ++i) {
const data_size_t index = data_indices[i];
const score_t grad = gradients[index];
......@@ -242,7 +242,7 @@ void GradientDiscretizer::RenewIntGradTreeOutput(
data_size_t leaf_cnt = 0;
const data_size_t* data_indices = data_partition->GetIndexOnLeaf(leaf_id, &leaf_cnt);
double sum_gradient = 0.0f, sum_hessian = 0.0f;
#pragma omp parallel for schedule(static) reduction(+:sum_gradient, sum_hessian)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_gradient, sum_hessian)
for (data_size_t i = 0; i < leaf_cnt; ++i) {
const data_size_t index = data_indices[i];
const score_t grad = gradients[index];
......
......@@ -95,7 +95,7 @@ class LeafSplits {
data_indices_ = nullptr;
double tmp_sum_gradients = 0.0f;
double tmp_sum_hessians = 0.0f;
#pragma omp parallel for schedule(static, 512) reduction(+:tmp_sum_gradients, tmp_sum_hessians) if (num_data_in_leaf_ >= 1024 && !deterministic_)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) reduction(+:tmp_sum_gradients, tmp_sum_hessians) if (num_data_in_leaf_ >= 1024 && !deterministic_)
for (data_size_t i = 0; i < num_data_in_leaf_; ++i) {
tmp_sum_gradients += gradients[i];
tmp_sum_hessians += hessians[i];
......@@ -120,7 +120,7 @@ class LeafSplits {
double tmp_sum_hessians = 0.0f;
const int16_t* packed_int_gradients_and_hessians = reinterpret_cast<const int16_t*>(int_gradients_and_hessians);
int64_t tmp_sum_gradients_and_hessians = 0;
#pragma omp parallel for schedule(static, 512) reduction(+:tmp_sum_gradients, tmp_sum_hessians, tmp_sum_gradients_and_hessians) if (num_data_in_leaf_ >= 1024 && !deterministic_)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) reduction(+:tmp_sum_gradients, tmp_sum_hessians, tmp_sum_gradients_and_hessians) if (num_data_in_leaf_ >= 1024 && !deterministic_)
for (data_size_t i = 0; i < num_data_in_leaf_; ++i) {
tmp_sum_gradients += int_gradients_and_hessians[2 * i + 1] * grad_scale;
tmp_sum_hessians += int_gradients_and_hessians[2 * i] * hess_scale;
......@@ -149,7 +149,7 @@ class LeafSplits {
data_indices_ = data_partition->GetIndexOnLeaf(leaf, &num_data_in_leaf_);
double tmp_sum_gradients = 0.0f;
double tmp_sum_hessians = 0.0f;
#pragma omp parallel for schedule(static, 512) reduction(+:tmp_sum_gradients, tmp_sum_hessians) if (num_data_in_leaf_ >= 1024 && !deterministic_)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) reduction(+:tmp_sum_gradients, tmp_sum_hessians) if (num_data_in_leaf_ >= 1024 && !deterministic_)
for (data_size_t i = 0; i < num_data_in_leaf_; ++i) {
const data_size_t idx = data_indices_[i];
tmp_sum_gradients += gradients[idx];
......@@ -177,7 +177,7 @@ class LeafSplits {
double tmp_sum_hessians = 0.0f;
const int16_t* packed_int_gradients_and_hessians = reinterpret_cast<const int16_t*>(int_gradients_and_hessians);
int64_t tmp_sum_gradients_and_hessians = 0;
#pragma omp parallel for schedule(static, 512) reduction(+:tmp_sum_gradients, tmp_sum_hessians, tmp_sum_gradients_and_hessians) if (num_data_in_leaf_ >= 1024 && deterministic_)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) reduction(+:tmp_sum_gradients, tmp_sum_hessians, tmp_sum_gradients_and_hessians) if (num_data_in_leaf_ >= 1024 && deterministic_)
for (data_size_t i = 0; i < num_data_in_leaf_; ++i) {
const data_size_t idx = data_indices_[i];
tmp_sum_gradients += int_gradients_and_hessians[2 * idx + 1] * grad_scale;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment