Unverified Commit 8ed371ce authored by James Lamb's avatar James Lamb Committed by GitHub
Browse files

set explicit number of threads in every OpenMP `parallel` region (#6135)

parent 992f5056
......@@ -153,7 +153,7 @@ int Tree::SplitCategorical(int leaf, int feature, int real_feature, const uint32
void Tree::AddPredictionToScore(const Dataset* data, data_size_t num_data, double* score) const {
if (!is_linear_ && num_leaves_ <= 1) {
if (leaf_value_[0] != 0.0f) {
#pragma omp parallel for schedule(static, 512) if (num_data >= 1024)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_data >= 1024)
for (data_size_t i = 0; i < num_data; ++i) {
score[i] += leaf_value_[0];
}
......@@ -234,7 +234,7 @@ void Tree::AddPredictionToScore(const Dataset* data,
data_size_t num_data, double* score) const {
if (!is_linear_ && num_leaves_ <= 1) {
if (leaf_value_[0] != 0.0f) {
#pragma omp parallel for schedule(static, 512) if (num_data >= 1024)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_data >= 1024)
for (data_size_t i = 0; i < num_data; ++i) {
score[used_data_indices[i]] += leaf_value_[0];
}
......
......@@ -61,13 +61,13 @@ class BinaryMetric: public Metric {
double sum_loss = 0.0f;
if (objective == nullptr) {
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
// add loss
sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], score[i]);
}
} else {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
// add loss
sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], score[i]) * weights_[i];
......@@ -75,7 +75,7 @@ class BinaryMetric: public Metric {
}
} else {
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
double prob = 0;
objective->ConvertOutput(&score[i], &prob);
......@@ -83,7 +83,7 @@ class BinaryMetric: public Metric {
sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], prob);
}
} else {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
double prob = 0;
objective->ConvertOutput(&score[i], &prob);
......
......@@ -111,7 +111,7 @@ class MapMetric:public Metric {
}
std::vector<double> tmp_map(eval_at_.size(), 0.0f);
if (query_weights_ == nullptr) {
#pragma omp parallel for schedule(guided) firstprivate(tmp_map)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(guided) firstprivate(tmp_map)
for (data_size_t i = 0; i < num_queries_; ++i) {
const int tid = omp_get_thread_num();
CalMapAtK(eval_at_, npos_per_query_[i], label_ + query_boundaries_[i],
......@@ -121,7 +121,7 @@ class MapMetric:public Metric {
}
}
} else {
#pragma omp parallel for schedule(guided) firstprivate(tmp_map)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(guided) firstprivate(tmp_map)
for (data_size_t i = 0; i < num_queries_; ++i) {
const int tid = omp_get_thread_num();
CalMapAtK(eval_at_, npos_per_query_[i], label_ + query_boundaries_[i],
......
......@@ -63,7 +63,7 @@ class MulticlassMetric: public Metric {
}
if (objective != nullptr) {
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
std::vector<double> raw_score(num_tree_per_iteration);
for (int k = 0; k < num_tree_per_iteration; ++k) {
......@@ -76,7 +76,7 @@ class MulticlassMetric: public Metric {
sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], &rec, config_);
}
} else {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
std::vector<double> raw_score(num_tree_per_iteration);
for (int k = 0; k < num_tree_per_iteration; ++k) {
......@@ -91,7 +91,7 @@ class MulticlassMetric: public Metric {
}
} else {
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
std::vector<double> rec(num_tree_per_iteration);
for (int k = 0; k < num_tree_per_iteration; ++k) {
......@@ -102,7 +102,7 @@ class MulticlassMetric: public Metric {
sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], &rec, config_);
}
} else {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
std::vector<double> rec(num_tree_per_iteration);
for (int k = 0; k < num_tree_per_iteration; ++k) {
......
......@@ -57,7 +57,7 @@ class NDCGMetric:public Metric {
}
inverse_max_dcgs_.resize(num_queries_);
// cache the inverse max DCG for all queries, used to calculate NDCG
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_queries_; ++i) {
inverse_max_dcgs_[i].resize(eval_at_.size(), 0.0f);
DCGCalculator::CalMaxDCG(eval_at_, label_ + query_boundaries_[i],
......@@ -92,7 +92,7 @@ class NDCGMetric:public Metric {
}
std::vector<double> tmp_dcg(eval_at_.size(), 0.0f);
if (query_weights_ == nullptr) {
#pragma omp parallel for schedule(static) firstprivate(tmp_dcg)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) firstprivate(tmp_dcg)
for (data_size_t i = 0; i < num_queries_; ++i) {
const int tid = omp_get_thread_num();
// if all doc in this query are all negative, let its NDCG=1
......@@ -112,7 +112,7 @@ class NDCGMetric:public Metric {
}
}
} else {
#pragma omp parallel for schedule(static) firstprivate(tmp_dcg)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) firstprivate(tmp_dcg)
for (data_size_t i = 0; i < num_queries_; ++i) {
const int tid = omp_get_thread_num();
// if all doc in this query are all negative, let its NDCG=1
......
......@@ -59,13 +59,13 @@ class RegressionMetric: public Metric {
double sum_loss = 0.0f;
if (objective == nullptr) {
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
// add loss
sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], score[i], config_);
}
} else {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
// add loss
sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], score[i], config_) * weights_[i];
......@@ -73,7 +73,7 @@ class RegressionMetric: public Metric {
}
} else {
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
// add loss
double t = 0;
......@@ -81,7 +81,7 @@ class RegressionMetric: public Metric {
sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], t, config_);
}
} else {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
// add loss
double t = 0;
......
......@@ -107,26 +107,26 @@ class CrossEntropyMetric : public Metric {
double sum_loss = 0.0f;
if (objective == nullptr) {
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
sum_loss += XentLoss(label_[i], score[i]); // NOTE: does not work unless score is a probability
}
} else {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
sum_loss += XentLoss(label_[i], score[i]) * weights_[i]; // NOTE: does not work unless score is a probability
}
}
} else {
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
double p = 0;
objective->ConvertOutput(&score[i], &p);
sum_loss += XentLoss(label_[i], p);
}
} else {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
double p = 0;
objective->ConvertOutput(&score[i], &p);
......@@ -192,13 +192,13 @@ class CrossEntropyLambdaMetric : public Metric {
double sum_loss = 0.0f;
if (objective == nullptr) {
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
double hhat = std::log1p(std::exp(score[i])); // auto-convert
sum_loss += XentLambdaLoss(label_[i], 1.0f, hhat);
}
} else {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
double hhat = std::log1p(std::exp(score[i])); // auto-convert
sum_loss += XentLambdaLoss(label_[i], weights_[i], hhat);
......@@ -206,14 +206,14 @@ class CrossEntropyLambdaMetric : public Metric {
}
} else {
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
double hhat = 0;
objective->ConvertOutput(&score[i], &hhat); // NOTE: this only works if objective = "xentlambda"
sum_loss += XentLambdaLoss(label_[i], 1.0f, hhat);
}
} else {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
double hhat = 0;
objective->ConvertOutput(&score[i], &hhat); // NOTE: this only works if objective = "xentlambda"
......@@ -299,26 +299,26 @@ class KullbackLeiblerDivergence : public Metric {
double sum_loss = 0.0f;
if (objective == nullptr) {
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
sum_loss += XentLoss(label_[i], score[i]); // NOTE: does not work unless score is a probability
}
} else {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
sum_loss += XentLoss(label_[i], score[i]) * weights_[i]; // NOTE: does not work unless score is a probability
}
}
} else {
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
double p = 0;
objective->ConvertOutput(&score[i], &p);
sum_loss += XentLoss(label_[i], p);
}
} else {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
double p = 0;
objective->ConvertOutput(&score[i], &p);
......
......@@ -63,7 +63,7 @@ class BinaryLogloss: public ObjectiveFunction {
data_size_t cnt_positive = 0;
data_size_t cnt_negative = 0;
// count for positive and negative samples
#pragma omp parallel for schedule(static) reduction(+:cnt_positive, cnt_negative)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:cnt_positive, cnt_negative)
for (data_size_t i = 0; i < num_data_; ++i) {
if (is_pos_(label_[i])) {
++cnt_positive;
......@@ -107,7 +107,7 @@ class BinaryLogloss: public ObjectiveFunction {
return;
}
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
// get label and label weights
const int is_pos = is_pos_(label_[i]);
......@@ -120,7 +120,7 @@ class BinaryLogloss: public ObjectiveFunction {
hessians[i] = static_cast<score_t>(abs_response * (sigmoid_ - abs_response) * label_weight);
}
} else {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
// get label and label weights
const int is_pos = is_pos_(label_[i]);
......@@ -140,14 +140,14 @@ class BinaryLogloss: public ObjectiveFunction {
double suml = 0.0f;
double sumw = 0.0f;
if (weights_ != nullptr) {
#pragma omp parallel for schedule(static) reduction(+:suml, sumw) if (!deterministic_)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:suml, sumw) if (!deterministic_)
for (data_size_t i = 0; i < num_data_; ++i) {
suml += is_pos_(label_[i]) * weights_[i];
sumw += weights_[i];
}
} else {
sumw = static_cast<double>(num_data_);
#pragma omp parallel for schedule(static) reduction(+:suml) if (!deterministic_)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:suml) if (!deterministic_)
for (data_size_t i = 0; i < num_data_; ++i) {
suml += is_pos_(label_[i]);
}
......
......@@ -86,7 +86,7 @@ class MulticlassSoftmax: public ObjectiveFunction {
void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override {
if (weights_ == nullptr) {
std::vector<double> rec;
#pragma omp parallel for schedule(static) private(rec)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) private(rec)
for (data_size_t i = 0; i < num_data_; ++i) {
rec.resize(num_class_);
for (int k = 0; k < num_class_; ++k) {
......@@ -107,7 +107,7 @@ class MulticlassSoftmax: public ObjectiveFunction {
}
} else {
std::vector<double> rec;
#pragma omp parallel for schedule(static) private(rec)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) private(rec)
for (data_size_t i = 0; i < num_data_; ++i) {
rec.resize(num_class_);
for (int k = 0; k < num_class_; ++k) {
......
......@@ -58,7 +58,7 @@ class RankingObjective : public ObjectiveFunction {
void GetGradients(const double* score, score_t* gradients,
score_t* hessians) const override {
#pragma omp parallel for schedule(guided)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(guided)
for (data_size_t i = 0; i < num_queries_; ++i) {
const data_size_t start = query_boundaries_[i];
const data_size_t cnt = query_boundaries_[i + 1] - query_boundaries_[i];
......@@ -157,7 +157,7 @@ class LambdarankNDCG : public RankingObjective {
DCGCalculator::CheckMetadata(metadata, num_queries_);
DCGCalculator::CheckLabel(label_, num_data_);
inverse_max_dcgs_.resize(num_queries_);
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_queries_; ++i) {
inverse_max_dcgs_[i] = DCGCalculator::CalMaxDCGAtK(
truncation_level_, label_ + query_boundaries_[i],
......
......@@ -115,7 +115,7 @@ class RegressionL2loss: public ObjectiveFunction {
label_ = metadata.label();
if (sqrt_) {
trans_label_.resize(num_data_);
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data; ++i) {
trans_label_[i] = Common::Sign(label_[i]) * std::sqrt(std::fabs(label_[i]));
}
......@@ -127,13 +127,13 @@ class RegressionL2loss: public ObjectiveFunction {
void GetGradients(const double* score, score_t* gradients,
score_t* hessians) const override {
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
gradients[i] = static_cast<score_t>(score[i] - label_[i]);
hessians[i] = 1.0f;
}
} else {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
gradients[i] = static_cast<score_t>(static_cast<score_t>((score[i] - label_[i])) * weights_[i]);
hessians[i] = static_cast<score_t>(weights_[i]);
......@@ -174,14 +174,14 @@ class RegressionL2loss: public ObjectiveFunction {
double suml = 0.0f;
double sumw = 0.0f;
if (weights_ != nullptr) {
#pragma omp parallel for schedule(static) reduction(+:suml, sumw) if (!deterministic_)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:suml, sumw) if (!deterministic_)
for (data_size_t i = 0; i < num_data_; ++i) {
suml += static_cast<double>(label_[i]) * weights_[i];
sumw += weights_[i];
}
} else {
sumw = static_cast<double>(num_data_);
#pragma omp parallel for schedule(static) reduction(+:suml) if (!deterministic_)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:suml) if (!deterministic_)
for (data_size_t i = 0; i < num_data_; ++i) {
suml += label_[i];
}
......@@ -217,14 +217,14 @@ class RegressionL1loss: public RegressionL2loss {
void GetGradients(const double* score, score_t* gradients,
score_t* hessians) const override {
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
const double diff = score[i] - label_[i];
gradients[i] = static_cast<score_t>(Common::Sign(diff));
hessians[i] = 1.0f;
}
} else {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
const double diff = score[i] - label_[i];
gradients[i] = static_cast<score_t>(Common::Sign(diff) * weights_[i]);
......@@ -313,7 +313,7 @@ class RegressionHuberLoss: public RegressionL2loss {
void GetGradients(const double* score, score_t* gradients,
score_t* hessians) const override {
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
const double diff = score[i] - label_[i];
if (std::abs(diff) <= alpha_) {
......@@ -324,7 +324,7 @@ class RegressionHuberLoss: public RegressionL2loss {
hessians[i] = 1.0f;
}
} else {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
const double diff = score[i] - label_[i];
if (std::abs(diff) <= alpha_) {
......@@ -362,14 +362,14 @@ class RegressionFairLoss: public RegressionL2loss {
void GetGradients(const double* score, score_t* gradients,
score_t* hessians) const override {
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
const double x = score[i] - label_[i];
gradients[i] = static_cast<score_t>(c_ * x / (std::fabs(x) + c_));
hessians[i] = static_cast<score_t>(c_ * c_ / ((std::fabs(x) + c_) * (std::fabs(x) + c_)));
}
} else {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
const double x = score[i] - label_[i];
gradients[i] = static_cast<score_t>(c_ * x / (std::fabs(x) + c_) * weights_[i]);
......@@ -441,14 +441,14 @@ class RegressionPoissonLoss: public RegressionL2loss {
score_t* hessians) const override {
double exp_max_delta_step_ = std::exp(max_delta_step_);
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
double exp_score = std::exp(score[i]);
gradients[i] = static_cast<score_t>(exp_score - label_[i]);
hessians[i] = static_cast<score_t>(exp_score * exp_max_delta_step_);
}
} else {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
double exp_score = std::exp(score[i]);
gradients[i] = static_cast<score_t>((exp_score - label_[i]) * weights_[i]);
......@@ -493,7 +493,7 @@ class RegressionQuantileloss : public RegressionL2loss {
void GetGradients(const double* score, score_t* gradients,
score_t* hessians) const override {
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
score_t delta = static_cast<score_t>(score[i] - label_[i]);
if (delta >= 0) {
......@@ -504,7 +504,7 @@ class RegressionQuantileloss : public RegressionL2loss {
hessians[i] = 1.0f;
}
} else {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
score_t delta = static_cast<score_t>(score[i] - label_[i]);
if (delta >= 0) {
......@@ -598,12 +598,12 @@ class RegressionMAPELOSS : public RegressionL1loss {
}
label_weight_.resize(num_data);
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
label_weight_[i] = 1.0f / std::max(1.0f, std::fabs(label_[i]));
}
} else {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
label_weight_[i] = 1.0f / std::max(1.0f, std::fabs(label_[i])) * weights_[i];
}
......@@ -613,14 +613,14 @@ class RegressionMAPELOSS : public RegressionL1loss {
void GetGradients(const double* score, score_t* gradients,
score_t* hessians) const override {
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
const double diff = score[i] - label_[i];
gradients[i] = static_cast<score_t>(Common::Sign(diff) * label_weight_[i]);
hessians[i] = 1.0f;
}
} else {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
const double diff = score[i] - label_[i];
gradients[i] = static_cast<score_t>(Common::Sign(diff) * label_weight_[i]);
......@@ -690,14 +690,14 @@ class RegressionGammaLoss : public RegressionPoissonLoss {
void GetGradients(const double* score, score_t* gradients,
score_t* hessians) const override {
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
double exp_score = std::exp(-score[i]);
gradients[i] = static_cast<score_t>(1.0 - label_[i] * exp_score);
hessians[i] = static_cast<score_t>(label_[i] * exp_score);
}
} else {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
double exp_score = std::exp(-score[i]);
gradients[i] = static_cast<score_t>((1.0 - label_[i] * exp_score) * weights_[i]);
......@@ -728,7 +728,7 @@ class RegressionTweedieLoss: public RegressionPoissonLoss {
void GetGradients(const double* score, score_t* gradients,
score_t* hessians) const override {
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
double exp_1_score = std::exp((1 - rho_) * score[i]);
double exp_2_score = std::exp((2 - rho_) * score[i]);
......@@ -737,7 +737,7 @@ class RegressionTweedieLoss: public RegressionPoissonLoss {
(2 - rho_) * exp_2_score);
}
} else {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
double exp_1_score = std::exp((1 - rho_) * score[i]);
double exp_2_score = std::exp((2 - rho_) * score[i]);
......
......@@ -77,7 +77,7 @@ class CrossEntropy: public ObjectiveFunction {
void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override {
if (weights_ == nullptr) {
// compute pointwise gradients and Hessians with implied unit weights
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
const double z = 1.0f / (1.0f + std::exp(-score[i]));
gradients[i] = static_cast<score_t>(z - label_[i]);
......@@ -85,7 +85,7 @@ class CrossEntropy: public ObjectiveFunction {
}
} else {
// compute pointwise gradients and Hessians with given weights
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
const double z = 1.0f / (1.0f + std::exp(-score[i]));
gradients[i] = static_cast<score_t>((z - label_[i]) * weights_[i]);
......@@ -114,7 +114,7 @@ class CrossEntropy: public ObjectiveFunction {
double suml = 0.0f;
double sumw = 0.0f;
if (weights_ != nullptr) {
#pragma omp parallel for schedule(static) reduction(+:suml, sumw) if (!deterministic_)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:suml, sumw) if (!deterministic_)
for (data_size_t i = 0; i < num_data_; ++i) {
suml += static_cast<double>(label_[i]) * weights_[i];
......@@ -122,7 +122,7 @@ class CrossEntropy: public ObjectiveFunction {
}
} else {
sumw = static_cast<double>(num_data_);
#pragma omp parallel for schedule(static) reduction(+:suml) if (!deterministic_)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:suml) if (!deterministic_)
for (data_size_t i = 0; i < num_data_; ++i) {
suml += label_[i];
......@@ -190,7 +190,7 @@ class CrossEntropyLambda: public ObjectiveFunction {
void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override {
if (weights_ == nullptr) {
// compute pointwise gradients and Hessians with implied unit weights; exactly equivalent to CrossEntropy with unit weights
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
const double z = 1.0f / (1.0f + std::exp(-score[i]));
gradients[i] = static_cast<score_t>(z - label_[i]);
......@@ -198,7 +198,7 @@ class CrossEntropyLambda: public ObjectiveFunction {
}
} else {
// compute pointwise gradients and Hessians with given weights
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
const double w = weights_[i];
const double y = label_[i];
......@@ -244,7 +244,7 @@ class CrossEntropyLambda: public ObjectiveFunction {
double suml = 0.0f;
double sumw = 0.0f;
if (weights_ != nullptr) {
#pragma omp parallel for schedule(static) reduction(+:suml, sumw) if (!deterministic_)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:suml, sumw) if (!deterministic_)
for (data_size_t i = 0; i < num_data_; ++i) {
suml += static_cast<double>(label_[i]) * weights_[i];
......@@ -252,7 +252,7 @@ class CrossEntropyLambda: public ObjectiveFunction {
}
} else {
sumw = static_cast<double>(num_data_);
#pragma omp parallel for schedule(static) reduction(+:suml) if (!deterministic_)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:suml) if (!deterministic_)
for (data_size_t i = 0; i < num_data_; ++i) {
suml += label_[i];
......
......@@ -79,7 +79,7 @@ class ColSampler {
static_cast<int>(valid_feature_indices_.size()), used_cnt_bytree_);
int omp_loop_size = static_cast<int>(used_feature_indices_.size());
#pragma omp parallel for schedule(static, 512) if (omp_loop_size >= 1024)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (omp_loop_size >= 1024)
for (int i = 0; i < omp_loop_size; ++i) {
int used_feature = valid_feature_indices_[used_feature_indices_[i]];
int inner_feature_index = train_data_->InnerFeatureIndex(used_feature);
......@@ -142,7 +142,7 @@ class ColSampler {
auto sampled_indices = random_.Sample(
static_cast<int>((*allowed_used_feature_indices).size()), used_feature_cnt);
int omp_loop_size = static_cast<int>(sampled_indices.size());
#pragma omp parallel for schedule(static, 512) if (omp_loop_size >= 1024)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (omp_loop_size >= 1024)
for (int i = 0; i < omp_loop_size; ++i) {
int used_feature =
valid_feature_indices_[(*allowed_used_feature_indices)[sampled_indices[i]]];
......@@ -168,7 +168,7 @@ class ColSampler {
auto sampled_indices = random_.Sample(
static_cast<int>((*allowed_valid_feature_indices).size()), used_feature_cnt);
int omp_loop_size = static_cast<int>(sampled_indices.size());
#pragma omp parallel for schedule(static, 512) if (omp_loop_size >= 1024)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (omp_loop_size >= 1024)
for (int i = 0; i < omp_loop_size; ++i) {
int used_feature = (*allowed_valid_feature_indices)[sampled_indices[i]];
int inner_feature_index = train_data_->InnerFeatureIndex(used_feature);
......
......@@ -405,7 +405,7 @@ void CUDASingleGPUTreeLearner::RenewTreeOutput(Tree* tree, const ObjectiveFuncti
}
std::vector<int> n_nozeroworker_perleaf(cuda_tree->num_leaves(), 1);
int num_machines = Network::num_machines();
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int i = 0; i < cuda_tree->num_leaves(); ++i) {
const double output = static_cast<double>(cuda_tree->LeafOutput(i));
data_size_t cnt_leaf_data = leaf_num_data_[i];
......
......@@ -228,7 +228,7 @@ void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplits(const Tree* tree) {
if (local_data_on_smaller_leaf <= 0) {
// clear histogram buffer before synchronizing
// otherwise histogram contents from the previous iteration will be sent
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int feature_index = 0; feature_index < this->num_features_; ++feature_index) {
if (this->col_sampler_.is_feature_used_bytree()[feature_index] == false)
continue;
......@@ -249,7 +249,7 @@ void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplits(const Tree* tree) {
// construct local histograms
global_timer.Start("DataParallelTreeLearner::ReduceHistogram");
global_timer.Start("DataParallelTreeLearner::ReduceHistogram::Copy");
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int feature_index = 0; feature_index < this->num_features_; ++feature_index) {
if (this->col_sampler_.is_feature_used_bytree()[feature_index] == false)
continue;
......@@ -318,7 +318,7 @@ void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(const
if (parent_num_bits > 16 && larger_leaf_num_bits <= 16) {
CHECK_LE(smaller_leaf_num_bits, 16);
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int feature_index = 0; feature_index < this->num_features_; ++feature_index) {
OMP_LOOP_EX_BEGIN();
if (!is_feature_aggregated_[feature_index]) continue;
......@@ -330,7 +330,7 @@ void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(const
}
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int feature_index = 0; feature_index < this->num_features_; ++feature_index) {
OMP_LOOP_EX_BEGIN();
if (!is_feature_aggregated_[feature_index]) continue;
......
......@@ -52,7 +52,7 @@ class DataPartition {
if (used_data_indices_ == nullptr) {
// if using all data
leaf_count_[0] = num_data_;
#pragma omp parallel for schedule(static, 512) if (num_data_ >= 1024)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_data_ >= 1024)
for (data_size_t i = 0; i < num_data_; ++i) {
indices_[i] = i;
}
......
......@@ -1692,7 +1692,7 @@ class HistogramPool {
auto& ref_feature_meta = *feature_meta;
const int num_feature = train_data->num_features();
ref_feature_meta.resize(num_feature);
#pragma omp parallel for schedule(static, 512) if (num_feature >= 1024)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_feature >= 1024)
for (int i = 0; i < num_feature; ++i) {
if (USE_DATA) {
ref_feature_meta[i].num_bin = train_data->FeatureNumBin(i);
......@@ -1749,7 +1749,7 @@ class HistogramPool {
if (config->use_quantized_grad) {
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int i = old_cache_size; i < cache_size; ++i) {
OMP_LOOP_EX_BEGIN();
pool_[i].reset(new FeatureHistogram[train_data->num_features()]);
......@@ -1763,7 +1763,7 @@ class HistogramPool {
OMP_THROW_EX();
} else {
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int i = old_cache_size; i < cache_size; ++i) {
OMP_LOOP_EX_BEGIN();
pool_[i].reset(new FeatureHistogram[train_data->num_features()]);
......@@ -1787,7 +1787,7 @@ class HistogramPool {
old_config->extra_trees != config->extra_trees ||
old_config->max_delta_step != config->max_delta_step ||
old_config->path_smooth != config->path_smooth) {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int i = 0; i < cache_size_; ++i) {
for (int j = 0; j < train_data->num_features(); ++j) {
pool_[i][j].ResetFunc();
......
......@@ -191,7 +191,7 @@ void GPUTreeLearner::WaitAndGetHistograms(hist_t* histograms) {
HistType* hist_outputs = reinterpret_cast<HistType*>(host_histogram_outputs_);
// when the output is ready, the computation is done
histograms_wait_obj_.wait();
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int i = 0; i < num_dense_feature_groups_; ++i) {
if (!feature_masks_[i]) {
continue;
......@@ -359,7 +359,7 @@ void GPUTreeLearner::AllocateGPUMemory() {
0, num_data_ * sizeof(Feature4)));
}
// building Feature4 bundles; each thread handles dword_features_ features
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int i = 0; i < static_cast<int>(dense_feature_group_map_.size() / dword_features_); ++i) {
int tid = omp_get_thread_num();
Feature4* host4 = host4_ptrs[tid];
......@@ -451,7 +451,7 @@ void GPUTreeLearner::AllocateGPUMemory() {
BinIterator* bin_iter = train_data_->FeatureGroupIterator(dense_dword_ind[i]);
if (dynamic_cast<DenseBinIterator<uint8_t, true>*>(bin_iter) != 0) {
DenseBinIterator<uint8_t, true> iter = *static_cast<DenseBinIterator<uint8_t, true>*>(bin_iter);
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int j = 0; j < num_data_; ++j) {
host4[j].s[i >> 1] |= (uint8_t)((iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i]
+ ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1)))
......@@ -464,14 +464,14 @@ void GPUTreeLearner::AllocateGPUMemory() {
BinIterator* bin_iter = train_data_->FeatureGroupIterator(dense_dword_ind[i]);
if (dynamic_cast<DenseBinIterator<uint8_t, false>*>(bin_iter) != 0) {
DenseBinIterator<uint8_t, false> iter = *static_cast<DenseBinIterator<uint8_t, false>*>(bin_iter);
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int j = 0; j < num_data_; ++j) {
host4[j].s[i] = (uint8_t)(iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i]
+ ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1)));
}
} else if (dynamic_cast<DenseBinIterator<uint8_t, true>*>(bin_iter) != 0) {
DenseBinIterator<uint8_t, true> iter = *static_cast<DenseBinIterator<uint8_t, true>*>(bin_iter);
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int j = 0; j < num_data_; ++j) {
host4[j].s[i] = (uint8_t)(iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i]
+ ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1)));
......@@ -485,7 +485,7 @@ void GPUTreeLearner::AllocateGPUMemory() {
}
// fill the leftover features
if (dword_features_ == 8) {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int j = 0; j < num_data_; ++j) {
for (int i = k; i < dword_features_; ++i) {
// fill this empty feature with some "random" value
......@@ -493,7 +493,7 @@ void GPUTreeLearner::AllocateGPUMemory() {
}
}
} else if (dword_features_ == 4) {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int j = 0; j < num_data_; ++j) {
for (int i = k; i < dword_features_; ++i) {
// fill this empty feature with some "random" value
......@@ -572,7 +572,7 @@ void GPUTreeLearner::BuildGPUKernels() {
// currently we don't use constant memory
int use_constants = 0;
OMP_INIT_EX();
#pragma omp parallel for schedule(guided)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(guided)
for (int i = 0; i <= kMaxLogWorkgroupsPerFeature; ++i) {
OMP_LOOP_EX_BEGIN();
boost::compute::program program;
......@@ -811,7 +811,7 @@ void GPUTreeLearner::BeforeTrain() {
// transfer the indices to GPU
indices_future_ = boost::compute::copy_async(indices, indices + cnt, device_data_indices_->begin(), queue_);
if (!share_state_->is_constant_hessian) {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < cnt; ++i) {
ordered_hessians_[i] = hessians_[indices[i]];
}
......@@ -827,7 +827,7 @@ void GPUTreeLearner::BeforeTrain() {
histogram_fulldata_kernels_[i].set_arg(6, const_hessian);
}
}
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < cnt; ++i) {
ordered_gradients_[i] = gradients_[indices[i]];
}
......@@ -865,7 +865,7 @@ bool GPUTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int ri
indices_future_ = boost::compute::copy_async(indices + begin, indices + end, device_data_indices_->begin(), queue_);
if (!share_state_->is_constant_hessian) {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = begin; i < end; ++i) {
ordered_hessians_[i - begin] = hessians_[indices[i]];
}
......@@ -873,7 +873,7 @@ bool GPUTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int ri
hessians_future_ = queue_.enqueue_write_buffer_async(device_hessians_, 0, (end - begin) * sizeof(score_t), ptr_pinned_hessians_);
}
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = begin; i < end; ++i) {
ordered_gradients_[i - begin] = gradients_[indices[i]];
}
......@@ -907,7 +907,7 @@ bool GPUTreeLearner::ConstructGPUHistogramsAsync(
// generate and copy ordered_gradients if gradients is not null
if (gradients != nullptr) {
if (num_data != num_data_) {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data; ++i) {
ordered_gradients[i] = gradients[data_indices[i]];
}
......@@ -919,7 +919,7 @@ bool GPUTreeLearner::ConstructGPUHistogramsAsync(
// generate and copy ordered_hessians if Hessians is not null
if (hessians != nullptr && !share_state_->is_constant_hessian) {
if (num_data != num_data_) {
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data; ++i) {
ordered_hessians[i] = hessians[data_indices[i]];
}
......@@ -930,7 +930,7 @@ bool GPUTreeLearner::ConstructGPUHistogramsAsync(
}
// converted indices in is_feature_used to feature-group indices
std::vector<int8_t> is_feature_group_used(num_feature_groups_, 0);
#pragma omp parallel for schedule(static, 1024) if (num_features_ >= 2048)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 1024) if (num_features_ >= 2048)
for (int i = 0; i < num_features_; ++i) {
if (is_feature_used[i]) {
is_feature_group_used[train_data_->Feature2Group(i)] = 1;
......@@ -938,7 +938,7 @@ bool GPUTreeLearner::ConstructGPUHistogramsAsync(
}
// construct the feature masks for dense feature-groups
int used_dense_feature_groups = 0;
#pragma omp parallel for schedule(static, 1024) reduction(+:used_dense_feature_groups) if (num_dense_feature_groups_ >= 2048)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 1024) reduction(+:used_dense_feature_groups) if (num_dense_feature_groups_ >= 2048)
for (int i = 0; i < num_dense_feature_groups_; ++i) {
if (is_feature_group_used[dense_feature_group_map_[i]]) {
feature_masks_[i] = 1;
......@@ -973,7 +973,7 @@ bool GPUTreeLearner::ConstructGPUHistogramsAsync(
void GPUTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract) {
std::vector<int8_t> is_sparse_feature_used(num_features_, 0);
std::vector<int8_t> is_dense_feature_used(num_features_, 0);
#pragma omp parallel for schedule(static)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
if (!col_sampler_.is_feature_used_bytree()[feature_index]) continue;
if (!is_feature_used[feature_index]) continue;
......
......@@ -216,7 +216,7 @@ void GradientDiscretizer::RenewIntGradTreeOutput(
data_size_t leaf_cnt = 0;
const data_size_t* data_indices = data_partition->GetIndexOnLeaf(leaf_id, &leaf_cnt);
double sum_gradient = 0.0f, sum_hessian = 0.0f;
#pragma omp parallel for schedule(static) reduction(+:sum_gradient, sum_hessian)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_gradient, sum_hessian)
for (data_size_t i = 0; i < leaf_cnt; ++i) {
const data_size_t index = data_indices[i];
const score_t grad = gradients[index];
......@@ -242,7 +242,7 @@ void GradientDiscretizer::RenewIntGradTreeOutput(
data_size_t leaf_cnt = 0;
const data_size_t* data_indices = data_partition->GetIndexOnLeaf(leaf_id, &leaf_cnt);
double sum_gradient = 0.0f, sum_hessian = 0.0f;
#pragma omp parallel for schedule(static) reduction(+:sum_gradient, sum_hessian)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_gradient, sum_hessian)
for (data_size_t i = 0; i < leaf_cnt; ++i) {
const data_size_t index = data_indices[i];
const score_t grad = gradients[index];
......
......@@ -95,7 +95,7 @@ class LeafSplits {
data_indices_ = nullptr;
double tmp_sum_gradients = 0.0f;
double tmp_sum_hessians = 0.0f;
#pragma omp parallel for schedule(static, 512) reduction(+:tmp_sum_gradients, tmp_sum_hessians) if (num_data_in_leaf_ >= 1024 && !deterministic_)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) reduction(+:tmp_sum_gradients, tmp_sum_hessians) if (num_data_in_leaf_ >= 1024 && !deterministic_)
for (data_size_t i = 0; i < num_data_in_leaf_; ++i) {
tmp_sum_gradients += gradients[i];
tmp_sum_hessians += hessians[i];
......@@ -120,7 +120,7 @@ class LeafSplits {
double tmp_sum_hessians = 0.0f;
const int16_t* packed_int_gradients_and_hessians = reinterpret_cast<const int16_t*>(int_gradients_and_hessians);
int64_t tmp_sum_gradients_and_hessians = 0;
#pragma omp parallel for schedule(static, 512) reduction(+:tmp_sum_gradients, tmp_sum_hessians, tmp_sum_gradients_and_hessians) if (num_data_in_leaf_ >= 1024 && !deterministic_)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) reduction(+:tmp_sum_gradients, tmp_sum_hessians, tmp_sum_gradients_and_hessians) if (num_data_in_leaf_ >= 1024 && !deterministic_)
for (data_size_t i = 0; i < num_data_in_leaf_; ++i) {
tmp_sum_gradients += int_gradients_and_hessians[2 * i + 1] * grad_scale;
tmp_sum_hessians += int_gradients_and_hessians[2 * i] * hess_scale;
......@@ -149,7 +149,7 @@ class LeafSplits {
data_indices_ = data_partition->GetIndexOnLeaf(leaf, &num_data_in_leaf_);
double tmp_sum_gradients = 0.0f;
double tmp_sum_hessians = 0.0f;
#pragma omp parallel for schedule(static, 512) reduction(+:tmp_sum_gradients, tmp_sum_hessians) if (num_data_in_leaf_ >= 1024 && !deterministic_)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) reduction(+:tmp_sum_gradients, tmp_sum_hessians) if (num_data_in_leaf_ >= 1024 && !deterministic_)
for (data_size_t i = 0; i < num_data_in_leaf_; ++i) {
const data_size_t idx = data_indices_[i];
tmp_sum_gradients += gradients[idx];
......@@ -177,7 +177,7 @@ class LeafSplits {
double tmp_sum_hessians = 0.0f;
const int16_t* packed_int_gradients_and_hessians = reinterpret_cast<const int16_t*>(int_gradients_and_hessians);
int64_t tmp_sum_gradients_and_hessians = 0;
#pragma omp parallel for schedule(static, 512) reduction(+:tmp_sum_gradients, tmp_sum_hessians, tmp_sum_gradients_and_hessians) if (num_data_in_leaf_ >= 1024 && deterministic_)
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) reduction(+:tmp_sum_gradients, tmp_sum_hessians, tmp_sum_gradients_and_hessians) if (num_data_in_leaf_ >= 1024 && deterministic_)
for (data_size_t i = 0; i < num_data_in_leaf_; ++i) {
const data_size_t idx = data_indices_[i];
tmp_sum_gradients += int_gradients_and_hessians[2 * idx + 1] * grad_scale;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment