Unverified Commit 8ed371ce authored by James Lamb's avatar James Lamb Committed by GitHub
Browse files

set explicit number of threads in every OpenMP `parallel` region (#6135)

parent 992f5056
...@@ -153,7 +153,7 @@ int Tree::SplitCategorical(int leaf, int feature, int real_feature, const uint32 ...@@ -153,7 +153,7 @@ int Tree::SplitCategorical(int leaf, int feature, int real_feature, const uint32
void Tree::AddPredictionToScore(const Dataset* data, data_size_t num_data, double* score) const { void Tree::AddPredictionToScore(const Dataset* data, data_size_t num_data, double* score) const {
if (!is_linear_ && num_leaves_ <= 1) { if (!is_linear_ && num_leaves_ <= 1) {
if (leaf_value_[0] != 0.0f) { if (leaf_value_[0] != 0.0f) {
#pragma omp parallel for schedule(static, 512) if (num_data >= 1024) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_data >= 1024)
for (data_size_t i = 0; i < num_data; ++i) { for (data_size_t i = 0; i < num_data; ++i) {
score[i] += leaf_value_[0]; score[i] += leaf_value_[0];
} }
...@@ -234,7 +234,7 @@ void Tree::AddPredictionToScore(const Dataset* data, ...@@ -234,7 +234,7 @@ void Tree::AddPredictionToScore(const Dataset* data,
data_size_t num_data, double* score) const { data_size_t num_data, double* score) const {
if (!is_linear_ && num_leaves_ <= 1) { if (!is_linear_ && num_leaves_ <= 1) {
if (leaf_value_[0] != 0.0f) { if (leaf_value_[0] != 0.0f) {
#pragma omp parallel for schedule(static, 512) if (num_data >= 1024) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_data >= 1024)
for (data_size_t i = 0; i < num_data; ++i) { for (data_size_t i = 0; i < num_data; ++i) {
score[used_data_indices[i]] += leaf_value_[0]; score[used_data_indices[i]] += leaf_value_[0];
} }
......
...@@ -61,13 +61,13 @@ class BinaryMetric: public Metric { ...@@ -61,13 +61,13 @@ class BinaryMetric: public Metric {
double sum_loss = 0.0f; double sum_loss = 0.0f;
if (objective == nullptr) { if (objective == nullptr) {
if (weights_ == nullptr) { if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) reduction(+:sum_loss) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
// add loss // add loss
sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], score[i]); sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], score[i]);
} }
} else { } else {
#pragma omp parallel for schedule(static) reduction(+:sum_loss) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
// add loss // add loss
sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], score[i]) * weights_[i]; sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], score[i]) * weights_[i];
...@@ -75,7 +75,7 @@ class BinaryMetric: public Metric { ...@@ -75,7 +75,7 @@ class BinaryMetric: public Metric {
} }
} else { } else {
if (weights_ == nullptr) { if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) reduction(+:sum_loss) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
double prob = 0; double prob = 0;
objective->ConvertOutput(&score[i], &prob); objective->ConvertOutput(&score[i], &prob);
...@@ -83,7 +83,7 @@ class BinaryMetric: public Metric { ...@@ -83,7 +83,7 @@ class BinaryMetric: public Metric {
sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], prob); sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], prob);
} }
} else { } else {
#pragma omp parallel for schedule(static) reduction(+:sum_loss) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
double prob = 0; double prob = 0;
objective->ConvertOutput(&score[i], &prob); objective->ConvertOutput(&score[i], &prob);
......
...@@ -111,7 +111,7 @@ class MapMetric:public Metric { ...@@ -111,7 +111,7 @@ class MapMetric:public Metric {
} }
std::vector<double> tmp_map(eval_at_.size(), 0.0f); std::vector<double> tmp_map(eval_at_.size(), 0.0f);
if (query_weights_ == nullptr) { if (query_weights_ == nullptr) {
#pragma omp parallel for schedule(guided) firstprivate(tmp_map) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(guided) firstprivate(tmp_map)
for (data_size_t i = 0; i < num_queries_; ++i) { for (data_size_t i = 0; i < num_queries_; ++i) {
const int tid = omp_get_thread_num(); const int tid = omp_get_thread_num();
CalMapAtK(eval_at_, npos_per_query_[i], label_ + query_boundaries_[i], CalMapAtK(eval_at_, npos_per_query_[i], label_ + query_boundaries_[i],
...@@ -121,7 +121,7 @@ class MapMetric:public Metric { ...@@ -121,7 +121,7 @@ class MapMetric:public Metric {
} }
} }
} else { } else {
#pragma omp parallel for schedule(guided) firstprivate(tmp_map) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(guided) firstprivate(tmp_map)
for (data_size_t i = 0; i < num_queries_; ++i) { for (data_size_t i = 0; i < num_queries_; ++i) {
const int tid = omp_get_thread_num(); const int tid = omp_get_thread_num();
CalMapAtK(eval_at_, npos_per_query_[i], label_ + query_boundaries_[i], CalMapAtK(eval_at_, npos_per_query_[i], label_ + query_boundaries_[i],
......
...@@ -63,7 +63,7 @@ class MulticlassMetric: public Metric { ...@@ -63,7 +63,7 @@ class MulticlassMetric: public Metric {
} }
if (objective != nullptr) { if (objective != nullptr) {
if (weights_ == nullptr) { if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) reduction(+:sum_loss) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
std::vector<double> raw_score(num_tree_per_iteration); std::vector<double> raw_score(num_tree_per_iteration);
for (int k = 0; k < num_tree_per_iteration; ++k) { for (int k = 0; k < num_tree_per_iteration; ++k) {
...@@ -76,7 +76,7 @@ class MulticlassMetric: public Metric { ...@@ -76,7 +76,7 @@ class MulticlassMetric: public Metric {
sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], &rec, config_); sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], &rec, config_);
} }
} else { } else {
#pragma omp parallel for schedule(static) reduction(+:sum_loss) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
std::vector<double> raw_score(num_tree_per_iteration); std::vector<double> raw_score(num_tree_per_iteration);
for (int k = 0; k < num_tree_per_iteration; ++k) { for (int k = 0; k < num_tree_per_iteration; ++k) {
...@@ -91,7 +91,7 @@ class MulticlassMetric: public Metric { ...@@ -91,7 +91,7 @@ class MulticlassMetric: public Metric {
} }
} else { } else {
if (weights_ == nullptr) { if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) reduction(+:sum_loss) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
std::vector<double> rec(num_tree_per_iteration); std::vector<double> rec(num_tree_per_iteration);
for (int k = 0; k < num_tree_per_iteration; ++k) { for (int k = 0; k < num_tree_per_iteration; ++k) {
...@@ -102,7 +102,7 @@ class MulticlassMetric: public Metric { ...@@ -102,7 +102,7 @@ class MulticlassMetric: public Metric {
sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], &rec, config_); sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], &rec, config_);
} }
} else { } else {
#pragma omp parallel for schedule(static) reduction(+:sum_loss) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
std::vector<double> rec(num_tree_per_iteration); std::vector<double> rec(num_tree_per_iteration);
for (int k = 0; k < num_tree_per_iteration; ++k) { for (int k = 0; k < num_tree_per_iteration; ++k) {
......
...@@ -57,7 +57,7 @@ class NDCGMetric:public Metric { ...@@ -57,7 +57,7 @@ class NDCGMetric:public Metric {
} }
inverse_max_dcgs_.resize(num_queries_); inverse_max_dcgs_.resize(num_queries_);
// cache the inverse max DCG for all queries, used to calculate NDCG // cache the inverse max DCG for all queries, used to calculate NDCG
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_queries_; ++i) { for (data_size_t i = 0; i < num_queries_; ++i) {
inverse_max_dcgs_[i].resize(eval_at_.size(), 0.0f); inverse_max_dcgs_[i].resize(eval_at_.size(), 0.0f);
DCGCalculator::CalMaxDCG(eval_at_, label_ + query_boundaries_[i], DCGCalculator::CalMaxDCG(eval_at_, label_ + query_boundaries_[i],
...@@ -92,7 +92,7 @@ class NDCGMetric:public Metric { ...@@ -92,7 +92,7 @@ class NDCGMetric:public Metric {
} }
std::vector<double> tmp_dcg(eval_at_.size(), 0.0f); std::vector<double> tmp_dcg(eval_at_.size(), 0.0f);
if (query_weights_ == nullptr) { if (query_weights_ == nullptr) {
#pragma omp parallel for schedule(static) firstprivate(tmp_dcg) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) firstprivate(tmp_dcg)
for (data_size_t i = 0; i < num_queries_; ++i) { for (data_size_t i = 0; i < num_queries_; ++i) {
const int tid = omp_get_thread_num(); const int tid = omp_get_thread_num();
// if all doc in this query are all negative, let its NDCG=1 // if all doc in this query are all negative, let its NDCG=1
...@@ -112,7 +112,7 @@ class NDCGMetric:public Metric { ...@@ -112,7 +112,7 @@ class NDCGMetric:public Metric {
} }
} }
} else { } else {
#pragma omp parallel for schedule(static) firstprivate(tmp_dcg) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) firstprivate(tmp_dcg)
for (data_size_t i = 0; i < num_queries_; ++i) { for (data_size_t i = 0; i < num_queries_; ++i) {
const int tid = omp_get_thread_num(); const int tid = omp_get_thread_num();
// if all doc in this query are all negative, let its NDCG=1 // if all doc in this query are all negative, let its NDCG=1
......
...@@ -59,13 +59,13 @@ class RegressionMetric: public Metric { ...@@ -59,13 +59,13 @@ class RegressionMetric: public Metric {
double sum_loss = 0.0f; double sum_loss = 0.0f;
if (objective == nullptr) { if (objective == nullptr) {
if (weights_ == nullptr) { if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) reduction(+:sum_loss) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
// add loss // add loss
sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], score[i], config_); sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], score[i], config_);
} }
} else { } else {
#pragma omp parallel for schedule(static) reduction(+:sum_loss) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
// add loss // add loss
sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], score[i], config_) * weights_[i]; sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], score[i], config_) * weights_[i];
...@@ -73,7 +73,7 @@ class RegressionMetric: public Metric { ...@@ -73,7 +73,7 @@ class RegressionMetric: public Metric {
} }
} else { } else {
if (weights_ == nullptr) { if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) reduction(+:sum_loss) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
// add loss // add loss
double t = 0; double t = 0;
...@@ -81,7 +81,7 @@ class RegressionMetric: public Metric { ...@@ -81,7 +81,7 @@ class RegressionMetric: public Metric {
sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], t, config_); sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], t, config_);
} }
} else { } else {
#pragma omp parallel for schedule(static) reduction(+:sum_loss) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
// add loss // add loss
double t = 0; double t = 0;
......
...@@ -107,26 +107,26 @@ class CrossEntropyMetric : public Metric { ...@@ -107,26 +107,26 @@ class CrossEntropyMetric : public Metric {
double sum_loss = 0.0f; double sum_loss = 0.0f;
if (objective == nullptr) { if (objective == nullptr) {
if (weights_ == nullptr) { if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) reduction(+:sum_loss) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
sum_loss += XentLoss(label_[i], score[i]); // NOTE: does not work unless score is a probability sum_loss += XentLoss(label_[i], score[i]); // NOTE: does not work unless score is a probability
} }
} else { } else {
#pragma omp parallel for schedule(static) reduction(+:sum_loss) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
sum_loss += XentLoss(label_[i], score[i]) * weights_[i]; // NOTE: does not work unless score is a probability sum_loss += XentLoss(label_[i], score[i]) * weights_[i]; // NOTE: does not work unless score is a probability
} }
} }
} else { } else {
if (weights_ == nullptr) { if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) reduction(+:sum_loss) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
double p = 0; double p = 0;
objective->ConvertOutput(&score[i], &p); objective->ConvertOutput(&score[i], &p);
sum_loss += XentLoss(label_[i], p); sum_loss += XentLoss(label_[i], p);
} }
} else { } else {
#pragma omp parallel for schedule(static) reduction(+:sum_loss) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
double p = 0; double p = 0;
objective->ConvertOutput(&score[i], &p); objective->ConvertOutput(&score[i], &p);
...@@ -192,13 +192,13 @@ class CrossEntropyLambdaMetric : public Metric { ...@@ -192,13 +192,13 @@ class CrossEntropyLambdaMetric : public Metric {
double sum_loss = 0.0f; double sum_loss = 0.0f;
if (objective == nullptr) { if (objective == nullptr) {
if (weights_ == nullptr) { if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) reduction(+:sum_loss) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
double hhat = std::log1p(std::exp(score[i])); // auto-convert double hhat = std::log1p(std::exp(score[i])); // auto-convert
sum_loss += XentLambdaLoss(label_[i], 1.0f, hhat); sum_loss += XentLambdaLoss(label_[i], 1.0f, hhat);
} }
} else { } else {
#pragma omp parallel for schedule(static) reduction(+:sum_loss) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
double hhat = std::log1p(std::exp(score[i])); // auto-convert double hhat = std::log1p(std::exp(score[i])); // auto-convert
sum_loss += XentLambdaLoss(label_[i], weights_[i], hhat); sum_loss += XentLambdaLoss(label_[i], weights_[i], hhat);
...@@ -206,14 +206,14 @@ class CrossEntropyLambdaMetric : public Metric { ...@@ -206,14 +206,14 @@ class CrossEntropyLambdaMetric : public Metric {
} }
} else { } else {
if (weights_ == nullptr) { if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) reduction(+:sum_loss) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
double hhat = 0; double hhat = 0;
objective->ConvertOutput(&score[i], &hhat); // NOTE: this only works if objective = "xentlambda" objective->ConvertOutput(&score[i], &hhat); // NOTE: this only works if objective = "xentlambda"
sum_loss += XentLambdaLoss(label_[i], 1.0f, hhat); sum_loss += XentLambdaLoss(label_[i], 1.0f, hhat);
} }
} else { } else {
#pragma omp parallel for schedule(static) reduction(+:sum_loss) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
double hhat = 0; double hhat = 0;
objective->ConvertOutput(&score[i], &hhat); // NOTE: this only works if objective = "xentlambda" objective->ConvertOutput(&score[i], &hhat); // NOTE: this only works if objective = "xentlambda"
...@@ -299,26 +299,26 @@ class KullbackLeiblerDivergence : public Metric { ...@@ -299,26 +299,26 @@ class KullbackLeiblerDivergence : public Metric {
double sum_loss = 0.0f; double sum_loss = 0.0f;
if (objective == nullptr) { if (objective == nullptr) {
if (weights_ == nullptr) { if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) reduction(+:sum_loss) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
sum_loss += XentLoss(label_[i], score[i]); // NOTE: does not work unless score is a probability sum_loss += XentLoss(label_[i], score[i]); // NOTE: does not work unless score is a probability
} }
} else { } else {
#pragma omp parallel for schedule(static) reduction(+:sum_loss) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
sum_loss += XentLoss(label_[i], score[i]) * weights_[i]; // NOTE: does not work unless score is a probability sum_loss += XentLoss(label_[i], score[i]) * weights_[i]; // NOTE: does not work unless score is a probability
} }
} }
} else { } else {
if (weights_ == nullptr) { if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) reduction(+:sum_loss) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
double p = 0; double p = 0;
objective->ConvertOutput(&score[i], &p); objective->ConvertOutput(&score[i], &p);
sum_loss += XentLoss(label_[i], p); sum_loss += XentLoss(label_[i], p);
} }
} else { } else {
#pragma omp parallel for schedule(static) reduction(+:sum_loss) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
double p = 0; double p = 0;
objective->ConvertOutput(&score[i], &p); objective->ConvertOutput(&score[i], &p);
......
...@@ -63,7 +63,7 @@ class BinaryLogloss: public ObjectiveFunction { ...@@ -63,7 +63,7 @@ class BinaryLogloss: public ObjectiveFunction {
data_size_t cnt_positive = 0; data_size_t cnt_positive = 0;
data_size_t cnt_negative = 0; data_size_t cnt_negative = 0;
// count for positive and negative samples // count for positive and negative samples
#pragma omp parallel for schedule(static) reduction(+:cnt_positive, cnt_negative) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:cnt_positive, cnt_negative)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
if (is_pos_(label_[i])) { if (is_pos_(label_[i])) {
++cnt_positive; ++cnt_positive;
...@@ -107,7 +107,7 @@ class BinaryLogloss: public ObjectiveFunction { ...@@ -107,7 +107,7 @@ class BinaryLogloss: public ObjectiveFunction {
return; return;
} }
if (weights_ == nullptr) { if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
// get label and label weights // get label and label weights
const int is_pos = is_pos_(label_[i]); const int is_pos = is_pos_(label_[i]);
...@@ -120,7 +120,7 @@ class BinaryLogloss: public ObjectiveFunction { ...@@ -120,7 +120,7 @@ class BinaryLogloss: public ObjectiveFunction {
hessians[i] = static_cast<score_t>(abs_response * (sigmoid_ - abs_response) * label_weight); hessians[i] = static_cast<score_t>(abs_response * (sigmoid_ - abs_response) * label_weight);
} }
} else { } else {
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
// get label and label weights // get label and label weights
const int is_pos = is_pos_(label_[i]); const int is_pos = is_pos_(label_[i]);
...@@ -140,14 +140,14 @@ class BinaryLogloss: public ObjectiveFunction { ...@@ -140,14 +140,14 @@ class BinaryLogloss: public ObjectiveFunction {
double suml = 0.0f; double suml = 0.0f;
double sumw = 0.0f; double sumw = 0.0f;
if (weights_ != nullptr) { if (weights_ != nullptr) {
#pragma omp parallel for schedule(static) reduction(+:suml, sumw) if (!deterministic_) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:suml, sumw) if (!deterministic_)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
suml += is_pos_(label_[i]) * weights_[i]; suml += is_pos_(label_[i]) * weights_[i];
sumw += weights_[i]; sumw += weights_[i];
} }
} else { } else {
sumw = static_cast<double>(num_data_); sumw = static_cast<double>(num_data_);
#pragma omp parallel for schedule(static) reduction(+:suml) if (!deterministic_) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:suml) if (!deterministic_)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
suml += is_pos_(label_[i]); suml += is_pos_(label_[i]);
} }
......
...@@ -86,7 +86,7 @@ class MulticlassSoftmax: public ObjectiveFunction { ...@@ -86,7 +86,7 @@ class MulticlassSoftmax: public ObjectiveFunction {
void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override { void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override {
if (weights_ == nullptr) { if (weights_ == nullptr) {
std::vector<double> rec; std::vector<double> rec;
#pragma omp parallel for schedule(static) private(rec) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) private(rec)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
rec.resize(num_class_); rec.resize(num_class_);
for (int k = 0; k < num_class_; ++k) { for (int k = 0; k < num_class_; ++k) {
...@@ -107,7 +107,7 @@ class MulticlassSoftmax: public ObjectiveFunction { ...@@ -107,7 +107,7 @@ class MulticlassSoftmax: public ObjectiveFunction {
} }
} else { } else {
std::vector<double> rec; std::vector<double> rec;
#pragma omp parallel for schedule(static) private(rec) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) private(rec)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
rec.resize(num_class_); rec.resize(num_class_);
for (int k = 0; k < num_class_; ++k) { for (int k = 0; k < num_class_; ++k) {
......
...@@ -58,7 +58,7 @@ class RankingObjective : public ObjectiveFunction { ...@@ -58,7 +58,7 @@ class RankingObjective : public ObjectiveFunction {
void GetGradients(const double* score, score_t* gradients, void GetGradients(const double* score, score_t* gradients,
score_t* hessians) const override { score_t* hessians) const override {
#pragma omp parallel for schedule(guided) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(guided)
for (data_size_t i = 0; i < num_queries_; ++i) { for (data_size_t i = 0; i < num_queries_; ++i) {
const data_size_t start = query_boundaries_[i]; const data_size_t start = query_boundaries_[i];
const data_size_t cnt = query_boundaries_[i + 1] - query_boundaries_[i]; const data_size_t cnt = query_boundaries_[i + 1] - query_boundaries_[i];
...@@ -157,7 +157,7 @@ class LambdarankNDCG : public RankingObjective { ...@@ -157,7 +157,7 @@ class LambdarankNDCG : public RankingObjective {
DCGCalculator::CheckMetadata(metadata, num_queries_); DCGCalculator::CheckMetadata(metadata, num_queries_);
DCGCalculator::CheckLabel(label_, num_data_); DCGCalculator::CheckLabel(label_, num_data_);
inverse_max_dcgs_.resize(num_queries_); inverse_max_dcgs_.resize(num_queries_);
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_queries_; ++i) { for (data_size_t i = 0; i < num_queries_; ++i) {
inverse_max_dcgs_[i] = DCGCalculator::CalMaxDCGAtK( inverse_max_dcgs_[i] = DCGCalculator::CalMaxDCGAtK(
truncation_level_, label_ + query_boundaries_[i], truncation_level_, label_ + query_boundaries_[i],
......
...@@ -115,7 +115,7 @@ class RegressionL2loss: public ObjectiveFunction { ...@@ -115,7 +115,7 @@ class RegressionL2loss: public ObjectiveFunction {
label_ = metadata.label(); label_ = metadata.label();
if (sqrt_) { if (sqrt_) {
trans_label_.resize(num_data_); trans_label_.resize(num_data_);
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data; ++i) { for (data_size_t i = 0; i < num_data; ++i) {
trans_label_[i] = Common::Sign(label_[i]) * std::sqrt(std::fabs(label_[i])); trans_label_[i] = Common::Sign(label_[i]) * std::sqrt(std::fabs(label_[i]));
} }
...@@ -127,13 +127,13 @@ class RegressionL2loss: public ObjectiveFunction { ...@@ -127,13 +127,13 @@ class RegressionL2loss: public ObjectiveFunction {
void GetGradients(const double* score, score_t* gradients, void GetGradients(const double* score, score_t* gradients,
score_t* hessians) const override { score_t* hessians) const override {
if (weights_ == nullptr) { if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
gradients[i] = static_cast<score_t>(score[i] - label_[i]); gradients[i] = static_cast<score_t>(score[i] - label_[i]);
hessians[i] = 1.0f; hessians[i] = 1.0f;
} }
} else { } else {
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
gradients[i] = static_cast<score_t>(static_cast<score_t>((score[i] - label_[i])) * weights_[i]); gradients[i] = static_cast<score_t>(static_cast<score_t>((score[i] - label_[i])) * weights_[i]);
hessians[i] = static_cast<score_t>(weights_[i]); hessians[i] = static_cast<score_t>(weights_[i]);
...@@ -174,14 +174,14 @@ class RegressionL2loss: public ObjectiveFunction { ...@@ -174,14 +174,14 @@ class RegressionL2loss: public ObjectiveFunction {
double suml = 0.0f; double suml = 0.0f;
double sumw = 0.0f; double sumw = 0.0f;
if (weights_ != nullptr) { if (weights_ != nullptr) {
#pragma omp parallel for schedule(static) reduction(+:suml, sumw) if (!deterministic_) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:suml, sumw) if (!deterministic_)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
suml += static_cast<double>(label_[i]) * weights_[i]; suml += static_cast<double>(label_[i]) * weights_[i];
sumw += weights_[i]; sumw += weights_[i];
} }
} else { } else {
sumw = static_cast<double>(num_data_); sumw = static_cast<double>(num_data_);
#pragma omp parallel for schedule(static) reduction(+:suml) if (!deterministic_) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:suml) if (!deterministic_)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
suml += label_[i]; suml += label_[i];
} }
...@@ -217,14 +217,14 @@ class RegressionL1loss: public RegressionL2loss { ...@@ -217,14 +217,14 @@ class RegressionL1loss: public RegressionL2loss {
void GetGradients(const double* score, score_t* gradients, void GetGradients(const double* score, score_t* gradients,
score_t* hessians) const override { score_t* hessians) const override {
if (weights_ == nullptr) { if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
const double diff = score[i] - label_[i]; const double diff = score[i] - label_[i];
gradients[i] = static_cast<score_t>(Common::Sign(diff)); gradients[i] = static_cast<score_t>(Common::Sign(diff));
hessians[i] = 1.0f; hessians[i] = 1.0f;
} }
} else { } else {
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
const double diff = score[i] - label_[i]; const double diff = score[i] - label_[i];
gradients[i] = static_cast<score_t>(Common::Sign(diff) * weights_[i]); gradients[i] = static_cast<score_t>(Common::Sign(diff) * weights_[i]);
...@@ -313,7 +313,7 @@ class RegressionHuberLoss: public RegressionL2loss { ...@@ -313,7 +313,7 @@ class RegressionHuberLoss: public RegressionL2loss {
void GetGradients(const double* score, score_t* gradients, void GetGradients(const double* score, score_t* gradients,
score_t* hessians) const override { score_t* hessians) const override {
if (weights_ == nullptr) { if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
const double diff = score[i] - label_[i]; const double diff = score[i] - label_[i];
if (std::abs(diff) <= alpha_) { if (std::abs(diff) <= alpha_) {
...@@ -324,7 +324,7 @@ class RegressionHuberLoss: public RegressionL2loss { ...@@ -324,7 +324,7 @@ class RegressionHuberLoss: public RegressionL2loss {
hessians[i] = 1.0f; hessians[i] = 1.0f;
} }
} else { } else {
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
const double diff = score[i] - label_[i]; const double diff = score[i] - label_[i];
if (std::abs(diff) <= alpha_) { if (std::abs(diff) <= alpha_) {
...@@ -362,14 +362,14 @@ class RegressionFairLoss: public RegressionL2loss { ...@@ -362,14 +362,14 @@ class RegressionFairLoss: public RegressionL2loss {
void GetGradients(const double* score, score_t* gradients, void GetGradients(const double* score, score_t* gradients,
score_t* hessians) const override { score_t* hessians) const override {
if (weights_ == nullptr) { if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
const double x = score[i] - label_[i]; const double x = score[i] - label_[i];
gradients[i] = static_cast<score_t>(c_ * x / (std::fabs(x) + c_)); gradients[i] = static_cast<score_t>(c_ * x / (std::fabs(x) + c_));
hessians[i] = static_cast<score_t>(c_ * c_ / ((std::fabs(x) + c_) * (std::fabs(x) + c_))); hessians[i] = static_cast<score_t>(c_ * c_ / ((std::fabs(x) + c_) * (std::fabs(x) + c_)));
} }
} else { } else {
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
const double x = score[i] - label_[i]; const double x = score[i] - label_[i];
gradients[i] = static_cast<score_t>(c_ * x / (std::fabs(x) + c_) * weights_[i]); gradients[i] = static_cast<score_t>(c_ * x / (std::fabs(x) + c_) * weights_[i]);
...@@ -441,14 +441,14 @@ class RegressionPoissonLoss: public RegressionL2loss { ...@@ -441,14 +441,14 @@ class RegressionPoissonLoss: public RegressionL2loss {
score_t* hessians) const override { score_t* hessians) const override {
double exp_max_delta_step_ = std::exp(max_delta_step_); double exp_max_delta_step_ = std::exp(max_delta_step_);
if (weights_ == nullptr) { if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
double exp_score = std::exp(score[i]); double exp_score = std::exp(score[i]);
gradients[i] = static_cast<score_t>(exp_score - label_[i]); gradients[i] = static_cast<score_t>(exp_score - label_[i]);
hessians[i] = static_cast<score_t>(exp_score * exp_max_delta_step_); hessians[i] = static_cast<score_t>(exp_score * exp_max_delta_step_);
} }
} else { } else {
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
double exp_score = std::exp(score[i]); double exp_score = std::exp(score[i]);
gradients[i] = static_cast<score_t>((exp_score - label_[i]) * weights_[i]); gradients[i] = static_cast<score_t>((exp_score - label_[i]) * weights_[i]);
...@@ -493,7 +493,7 @@ class RegressionQuantileloss : public RegressionL2loss { ...@@ -493,7 +493,7 @@ class RegressionQuantileloss : public RegressionL2loss {
void GetGradients(const double* score, score_t* gradients, void GetGradients(const double* score, score_t* gradients,
score_t* hessians) const override { score_t* hessians) const override {
if (weights_ == nullptr) { if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
score_t delta = static_cast<score_t>(score[i] - label_[i]); score_t delta = static_cast<score_t>(score[i] - label_[i]);
if (delta >= 0) { if (delta >= 0) {
...@@ -504,7 +504,7 @@ class RegressionQuantileloss : public RegressionL2loss { ...@@ -504,7 +504,7 @@ class RegressionQuantileloss : public RegressionL2loss {
hessians[i] = 1.0f; hessians[i] = 1.0f;
} }
} else { } else {
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
score_t delta = static_cast<score_t>(score[i] - label_[i]); score_t delta = static_cast<score_t>(score[i] - label_[i]);
if (delta >= 0) { if (delta >= 0) {
...@@ -598,12 +598,12 @@ class RegressionMAPELOSS : public RegressionL1loss { ...@@ -598,12 +598,12 @@ class RegressionMAPELOSS : public RegressionL1loss {
} }
label_weight_.resize(num_data); label_weight_.resize(num_data);
if (weights_ == nullptr) { if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
label_weight_[i] = 1.0f / std::max(1.0f, std::fabs(label_[i])); label_weight_[i] = 1.0f / std::max(1.0f, std::fabs(label_[i]));
} }
} else { } else {
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
label_weight_[i] = 1.0f / std::max(1.0f, std::fabs(label_[i])) * weights_[i]; label_weight_[i] = 1.0f / std::max(1.0f, std::fabs(label_[i])) * weights_[i];
} }
...@@ -613,14 +613,14 @@ class RegressionMAPELOSS : public RegressionL1loss { ...@@ -613,14 +613,14 @@ class RegressionMAPELOSS : public RegressionL1loss {
void GetGradients(const double* score, score_t* gradients, void GetGradients(const double* score, score_t* gradients,
score_t* hessians) const override { score_t* hessians) const override {
if (weights_ == nullptr) { if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
const double diff = score[i] - label_[i]; const double diff = score[i] - label_[i];
gradients[i] = static_cast<score_t>(Common::Sign(diff) * label_weight_[i]); gradients[i] = static_cast<score_t>(Common::Sign(diff) * label_weight_[i]);
hessians[i] = 1.0f; hessians[i] = 1.0f;
} }
} else { } else {
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
const double diff = score[i] - label_[i]; const double diff = score[i] - label_[i];
gradients[i] = static_cast<score_t>(Common::Sign(diff) * label_weight_[i]); gradients[i] = static_cast<score_t>(Common::Sign(diff) * label_weight_[i]);
...@@ -690,14 +690,14 @@ class RegressionGammaLoss : public RegressionPoissonLoss { ...@@ -690,14 +690,14 @@ class RegressionGammaLoss : public RegressionPoissonLoss {
void GetGradients(const double* score, score_t* gradients, void GetGradients(const double* score, score_t* gradients,
score_t* hessians) const override { score_t* hessians) const override {
if (weights_ == nullptr) { if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
double exp_score = std::exp(-score[i]); double exp_score = std::exp(-score[i]);
gradients[i] = static_cast<score_t>(1.0 - label_[i] * exp_score); gradients[i] = static_cast<score_t>(1.0 - label_[i] * exp_score);
hessians[i] = static_cast<score_t>(label_[i] * exp_score); hessians[i] = static_cast<score_t>(label_[i] * exp_score);
} }
} else { } else {
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
double exp_score = std::exp(-score[i]); double exp_score = std::exp(-score[i]);
gradients[i] = static_cast<score_t>((1.0 - label_[i] * exp_score) * weights_[i]); gradients[i] = static_cast<score_t>((1.0 - label_[i] * exp_score) * weights_[i]);
...@@ -728,7 +728,7 @@ class RegressionTweedieLoss: public RegressionPoissonLoss { ...@@ -728,7 +728,7 @@ class RegressionTweedieLoss: public RegressionPoissonLoss {
void GetGradients(const double* score, score_t* gradients, void GetGradients(const double* score, score_t* gradients,
score_t* hessians) const override { score_t* hessians) const override {
if (weights_ == nullptr) { if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
double exp_1_score = std::exp((1 - rho_) * score[i]); double exp_1_score = std::exp((1 - rho_) * score[i]);
double exp_2_score = std::exp((2 - rho_) * score[i]); double exp_2_score = std::exp((2 - rho_) * score[i]);
...@@ -737,7 +737,7 @@ class RegressionTweedieLoss: public RegressionPoissonLoss { ...@@ -737,7 +737,7 @@ class RegressionTweedieLoss: public RegressionPoissonLoss {
(2 - rho_) * exp_2_score); (2 - rho_) * exp_2_score);
} }
} else { } else {
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
double exp_1_score = std::exp((1 - rho_) * score[i]); double exp_1_score = std::exp((1 - rho_) * score[i]);
double exp_2_score = std::exp((2 - rho_) * score[i]); double exp_2_score = std::exp((2 - rho_) * score[i]);
......
...@@ -77,7 +77,7 @@ class CrossEntropy: public ObjectiveFunction { ...@@ -77,7 +77,7 @@ class CrossEntropy: public ObjectiveFunction {
void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override { void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override {
if (weights_ == nullptr) { if (weights_ == nullptr) {
// compute pointwise gradients and Hessians with implied unit weights // compute pointwise gradients and Hessians with implied unit weights
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
const double z = 1.0f / (1.0f + std::exp(-score[i])); const double z = 1.0f / (1.0f + std::exp(-score[i]));
gradients[i] = static_cast<score_t>(z - label_[i]); gradients[i] = static_cast<score_t>(z - label_[i]);
...@@ -85,7 +85,7 @@ class CrossEntropy: public ObjectiveFunction { ...@@ -85,7 +85,7 @@ class CrossEntropy: public ObjectiveFunction {
} }
} else { } else {
// compute pointwise gradients and Hessians with given weights // compute pointwise gradients and Hessians with given weights
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
const double z = 1.0f / (1.0f + std::exp(-score[i])); const double z = 1.0f / (1.0f + std::exp(-score[i]));
gradients[i] = static_cast<score_t>((z - label_[i]) * weights_[i]); gradients[i] = static_cast<score_t>((z - label_[i]) * weights_[i]);
...@@ -114,7 +114,7 @@ class CrossEntropy: public ObjectiveFunction { ...@@ -114,7 +114,7 @@ class CrossEntropy: public ObjectiveFunction {
double suml = 0.0f; double suml = 0.0f;
double sumw = 0.0f; double sumw = 0.0f;
if (weights_ != nullptr) { if (weights_ != nullptr) {
#pragma omp parallel for schedule(static) reduction(+:suml, sumw) if (!deterministic_) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:suml, sumw) if (!deterministic_)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
suml += static_cast<double>(label_[i]) * weights_[i]; suml += static_cast<double>(label_[i]) * weights_[i];
...@@ -122,7 +122,7 @@ class CrossEntropy: public ObjectiveFunction { ...@@ -122,7 +122,7 @@ class CrossEntropy: public ObjectiveFunction {
} }
} else { } else {
sumw = static_cast<double>(num_data_); sumw = static_cast<double>(num_data_);
#pragma omp parallel for schedule(static) reduction(+:suml) if (!deterministic_) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:suml) if (!deterministic_)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
suml += label_[i]; suml += label_[i];
...@@ -190,7 +190,7 @@ class CrossEntropyLambda: public ObjectiveFunction { ...@@ -190,7 +190,7 @@ class CrossEntropyLambda: public ObjectiveFunction {
void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override { void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override {
if (weights_ == nullptr) { if (weights_ == nullptr) {
// compute pointwise gradients and Hessians with implied unit weights; exactly equivalent to CrossEntropy with unit weights // compute pointwise gradients and Hessians with implied unit weights; exactly equivalent to CrossEntropy with unit weights
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
const double z = 1.0f / (1.0f + std::exp(-score[i])); const double z = 1.0f / (1.0f + std::exp(-score[i]));
gradients[i] = static_cast<score_t>(z - label_[i]); gradients[i] = static_cast<score_t>(z - label_[i]);
...@@ -198,7 +198,7 @@ class CrossEntropyLambda: public ObjectiveFunction { ...@@ -198,7 +198,7 @@ class CrossEntropyLambda: public ObjectiveFunction {
} }
} else { } else {
// compute pointwise gradients and Hessians with given weights // compute pointwise gradients and Hessians with given weights
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
const double w = weights_[i]; const double w = weights_[i];
const double y = label_[i]; const double y = label_[i];
...@@ -244,7 +244,7 @@ class CrossEntropyLambda: public ObjectiveFunction { ...@@ -244,7 +244,7 @@ class CrossEntropyLambda: public ObjectiveFunction {
double suml = 0.0f; double suml = 0.0f;
double sumw = 0.0f; double sumw = 0.0f;
if (weights_ != nullptr) { if (weights_ != nullptr) {
#pragma omp parallel for schedule(static) reduction(+:suml, sumw) if (!deterministic_) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:suml, sumw) if (!deterministic_)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
suml += static_cast<double>(label_[i]) * weights_[i]; suml += static_cast<double>(label_[i]) * weights_[i];
...@@ -252,7 +252,7 @@ class CrossEntropyLambda: public ObjectiveFunction { ...@@ -252,7 +252,7 @@ class CrossEntropyLambda: public ObjectiveFunction {
} }
} else { } else {
sumw = static_cast<double>(num_data_); sumw = static_cast<double>(num_data_);
#pragma omp parallel for schedule(static) reduction(+:suml) if (!deterministic_) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:suml) if (!deterministic_)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
suml += label_[i]; suml += label_[i];
......
...@@ -79,7 +79,7 @@ class ColSampler { ...@@ -79,7 +79,7 @@ class ColSampler {
static_cast<int>(valid_feature_indices_.size()), used_cnt_bytree_); static_cast<int>(valid_feature_indices_.size()), used_cnt_bytree_);
int omp_loop_size = static_cast<int>(used_feature_indices_.size()); int omp_loop_size = static_cast<int>(used_feature_indices_.size());
#pragma omp parallel for schedule(static, 512) if (omp_loop_size >= 1024) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (omp_loop_size >= 1024)
for (int i = 0; i < omp_loop_size; ++i) { for (int i = 0; i < omp_loop_size; ++i) {
int used_feature = valid_feature_indices_[used_feature_indices_[i]]; int used_feature = valid_feature_indices_[used_feature_indices_[i]];
int inner_feature_index = train_data_->InnerFeatureIndex(used_feature); int inner_feature_index = train_data_->InnerFeatureIndex(used_feature);
...@@ -142,7 +142,7 @@ class ColSampler { ...@@ -142,7 +142,7 @@ class ColSampler {
auto sampled_indices = random_.Sample( auto sampled_indices = random_.Sample(
static_cast<int>((*allowed_used_feature_indices).size()), used_feature_cnt); static_cast<int>((*allowed_used_feature_indices).size()), used_feature_cnt);
int omp_loop_size = static_cast<int>(sampled_indices.size()); int omp_loop_size = static_cast<int>(sampled_indices.size());
#pragma omp parallel for schedule(static, 512) if (omp_loop_size >= 1024) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (omp_loop_size >= 1024)
for (int i = 0; i < omp_loop_size; ++i) { for (int i = 0; i < omp_loop_size; ++i) {
int used_feature = int used_feature =
valid_feature_indices_[(*allowed_used_feature_indices)[sampled_indices[i]]]; valid_feature_indices_[(*allowed_used_feature_indices)[sampled_indices[i]]];
...@@ -168,7 +168,7 @@ class ColSampler { ...@@ -168,7 +168,7 @@ class ColSampler {
auto sampled_indices = random_.Sample( auto sampled_indices = random_.Sample(
static_cast<int>((*allowed_valid_feature_indices).size()), used_feature_cnt); static_cast<int>((*allowed_valid_feature_indices).size()), used_feature_cnt);
int omp_loop_size = static_cast<int>(sampled_indices.size()); int omp_loop_size = static_cast<int>(sampled_indices.size());
#pragma omp parallel for schedule(static, 512) if (omp_loop_size >= 1024) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (omp_loop_size >= 1024)
for (int i = 0; i < omp_loop_size; ++i) { for (int i = 0; i < omp_loop_size; ++i) {
int used_feature = (*allowed_valid_feature_indices)[sampled_indices[i]]; int used_feature = (*allowed_valid_feature_indices)[sampled_indices[i]];
int inner_feature_index = train_data_->InnerFeatureIndex(used_feature); int inner_feature_index = train_data_->InnerFeatureIndex(used_feature);
......
...@@ -405,7 +405,7 @@ void CUDASingleGPUTreeLearner::RenewTreeOutput(Tree* tree, const ObjectiveFuncti ...@@ -405,7 +405,7 @@ void CUDASingleGPUTreeLearner::RenewTreeOutput(Tree* tree, const ObjectiveFuncti
} }
std::vector<int> n_nozeroworker_perleaf(cuda_tree->num_leaves(), 1); std::vector<int> n_nozeroworker_perleaf(cuda_tree->num_leaves(), 1);
int num_machines = Network::num_machines(); int num_machines = Network::num_machines();
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int i = 0; i < cuda_tree->num_leaves(); ++i) { for (int i = 0; i < cuda_tree->num_leaves(); ++i) {
const double output = static_cast<double>(cuda_tree->LeafOutput(i)); const double output = static_cast<double>(cuda_tree->LeafOutput(i));
data_size_t cnt_leaf_data = leaf_num_data_[i]; data_size_t cnt_leaf_data = leaf_num_data_[i];
......
...@@ -228,7 +228,7 @@ void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplits(const Tree* tree) { ...@@ -228,7 +228,7 @@ void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplits(const Tree* tree) {
if (local_data_on_smaller_leaf <= 0) { if (local_data_on_smaller_leaf <= 0) {
// clear histogram buffer before synchronizing // clear histogram buffer before synchronizing
// otherwise histogram contents from the previous iteration will be sent // otherwise histogram contents from the previous iteration will be sent
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int feature_index = 0; feature_index < this->num_features_; ++feature_index) { for (int feature_index = 0; feature_index < this->num_features_; ++feature_index) {
if (this->col_sampler_.is_feature_used_bytree()[feature_index] == false) if (this->col_sampler_.is_feature_used_bytree()[feature_index] == false)
continue; continue;
...@@ -249,7 +249,7 @@ void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplits(const Tree* tree) { ...@@ -249,7 +249,7 @@ void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplits(const Tree* tree) {
// construct local histograms // construct local histograms
global_timer.Start("DataParallelTreeLearner::ReduceHistogram"); global_timer.Start("DataParallelTreeLearner::ReduceHistogram");
global_timer.Start("DataParallelTreeLearner::ReduceHistogram::Copy"); global_timer.Start("DataParallelTreeLearner::ReduceHistogram::Copy");
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int feature_index = 0; feature_index < this->num_features_; ++feature_index) { for (int feature_index = 0; feature_index < this->num_features_; ++feature_index) {
if (this->col_sampler_.is_feature_used_bytree()[feature_index] == false) if (this->col_sampler_.is_feature_used_bytree()[feature_index] == false)
continue; continue;
...@@ -318,7 +318,7 @@ void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(const ...@@ -318,7 +318,7 @@ void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(const
if (parent_num_bits > 16 && larger_leaf_num_bits <= 16) { if (parent_num_bits > 16 && larger_leaf_num_bits <= 16) {
CHECK_LE(smaller_leaf_num_bits, 16); CHECK_LE(smaller_leaf_num_bits, 16);
OMP_INIT_EX(); OMP_INIT_EX();
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int feature_index = 0; feature_index < this->num_features_; ++feature_index) { for (int feature_index = 0; feature_index < this->num_features_; ++feature_index) {
OMP_LOOP_EX_BEGIN(); OMP_LOOP_EX_BEGIN();
if (!is_feature_aggregated_[feature_index]) continue; if (!is_feature_aggregated_[feature_index]) continue;
...@@ -330,7 +330,7 @@ void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(const ...@@ -330,7 +330,7 @@ void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(const
} }
OMP_INIT_EX(); OMP_INIT_EX();
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int feature_index = 0; feature_index < this->num_features_; ++feature_index) { for (int feature_index = 0; feature_index < this->num_features_; ++feature_index) {
OMP_LOOP_EX_BEGIN(); OMP_LOOP_EX_BEGIN();
if (!is_feature_aggregated_[feature_index]) continue; if (!is_feature_aggregated_[feature_index]) continue;
......
...@@ -52,7 +52,7 @@ class DataPartition { ...@@ -52,7 +52,7 @@ class DataPartition {
if (used_data_indices_ == nullptr) { if (used_data_indices_ == nullptr) {
// if using all data // if using all data
leaf_count_[0] = num_data_; leaf_count_[0] = num_data_;
#pragma omp parallel for schedule(static, 512) if (num_data_ >= 1024) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_data_ >= 1024)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
indices_[i] = i; indices_[i] = i;
} }
......
...@@ -1692,7 +1692,7 @@ class HistogramPool { ...@@ -1692,7 +1692,7 @@ class HistogramPool {
auto& ref_feature_meta = *feature_meta; auto& ref_feature_meta = *feature_meta;
const int num_feature = train_data->num_features(); const int num_feature = train_data->num_features();
ref_feature_meta.resize(num_feature); ref_feature_meta.resize(num_feature);
#pragma omp parallel for schedule(static, 512) if (num_feature >= 1024) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_feature >= 1024)
for (int i = 0; i < num_feature; ++i) { for (int i = 0; i < num_feature; ++i) {
if (USE_DATA) { if (USE_DATA) {
ref_feature_meta[i].num_bin = train_data->FeatureNumBin(i); ref_feature_meta[i].num_bin = train_data->FeatureNumBin(i);
...@@ -1749,7 +1749,7 @@ class HistogramPool { ...@@ -1749,7 +1749,7 @@ class HistogramPool {
if (config->use_quantized_grad) { if (config->use_quantized_grad) {
OMP_INIT_EX(); OMP_INIT_EX();
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int i = old_cache_size; i < cache_size; ++i) { for (int i = old_cache_size; i < cache_size; ++i) {
OMP_LOOP_EX_BEGIN(); OMP_LOOP_EX_BEGIN();
pool_[i].reset(new FeatureHistogram[train_data->num_features()]); pool_[i].reset(new FeatureHistogram[train_data->num_features()]);
...@@ -1763,7 +1763,7 @@ class HistogramPool { ...@@ -1763,7 +1763,7 @@ class HistogramPool {
OMP_THROW_EX(); OMP_THROW_EX();
} else { } else {
OMP_INIT_EX(); OMP_INIT_EX();
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int i = old_cache_size; i < cache_size; ++i) { for (int i = old_cache_size; i < cache_size; ++i) {
OMP_LOOP_EX_BEGIN(); OMP_LOOP_EX_BEGIN();
pool_[i].reset(new FeatureHistogram[train_data->num_features()]); pool_[i].reset(new FeatureHistogram[train_data->num_features()]);
...@@ -1787,7 +1787,7 @@ class HistogramPool { ...@@ -1787,7 +1787,7 @@ class HistogramPool {
old_config->extra_trees != config->extra_trees || old_config->extra_trees != config->extra_trees ||
old_config->max_delta_step != config->max_delta_step || old_config->max_delta_step != config->max_delta_step ||
old_config->path_smooth != config->path_smooth) { old_config->path_smooth != config->path_smooth) {
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int i = 0; i < cache_size_; ++i) { for (int i = 0; i < cache_size_; ++i) {
for (int j = 0; j < train_data->num_features(); ++j) { for (int j = 0; j < train_data->num_features(); ++j) {
pool_[i][j].ResetFunc(); pool_[i][j].ResetFunc();
......
...@@ -191,7 +191,7 @@ void GPUTreeLearner::WaitAndGetHistograms(hist_t* histograms) { ...@@ -191,7 +191,7 @@ void GPUTreeLearner::WaitAndGetHistograms(hist_t* histograms) {
HistType* hist_outputs = reinterpret_cast<HistType*>(host_histogram_outputs_); HistType* hist_outputs = reinterpret_cast<HistType*>(host_histogram_outputs_);
// when the output is ready, the computation is done // when the output is ready, the computation is done
histograms_wait_obj_.wait(); histograms_wait_obj_.wait();
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int i = 0; i < num_dense_feature_groups_; ++i) { for (int i = 0; i < num_dense_feature_groups_; ++i) {
if (!feature_masks_[i]) { if (!feature_masks_[i]) {
continue; continue;
...@@ -359,7 +359,7 @@ void GPUTreeLearner::AllocateGPUMemory() { ...@@ -359,7 +359,7 @@ void GPUTreeLearner::AllocateGPUMemory() {
0, num_data_ * sizeof(Feature4))); 0, num_data_ * sizeof(Feature4)));
} }
// building Feature4 bundles; each thread handles dword_features_ features // building Feature4 bundles; each thread handles dword_features_ features
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int i = 0; i < static_cast<int>(dense_feature_group_map_.size() / dword_features_); ++i) { for (int i = 0; i < static_cast<int>(dense_feature_group_map_.size() / dword_features_); ++i) {
int tid = omp_get_thread_num(); int tid = omp_get_thread_num();
Feature4* host4 = host4_ptrs[tid]; Feature4* host4 = host4_ptrs[tid];
...@@ -451,7 +451,7 @@ void GPUTreeLearner::AllocateGPUMemory() { ...@@ -451,7 +451,7 @@ void GPUTreeLearner::AllocateGPUMemory() {
BinIterator* bin_iter = train_data_->FeatureGroupIterator(dense_dword_ind[i]); BinIterator* bin_iter = train_data_->FeatureGroupIterator(dense_dword_ind[i]);
if (dynamic_cast<DenseBinIterator<uint8_t, true>*>(bin_iter) != 0) { if (dynamic_cast<DenseBinIterator<uint8_t, true>*>(bin_iter) != 0) {
DenseBinIterator<uint8_t, true> iter = *static_cast<DenseBinIterator<uint8_t, true>*>(bin_iter); DenseBinIterator<uint8_t, true> iter = *static_cast<DenseBinIterator<uint8_t, true>*>(bin_iter);
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int j = 0; j < num_data_; ++j) { for (int j = 0; j < num_data_; ++j) {
host4[j].s[i >> 1] |= (uint8_t)((iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i] host4[j].s[i >> 1] |= (uint8_t)((iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i]
+ ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1))) + ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1)))
...@@ -464,14 +464,14 @@ void GPUTreeLearner::AllocateGPUMemory() { ...@@ -464,14 +464,14 @@ void GPUTreeLearner::AllocateGPUMemory() {
BinIterator* bin_iter = train_data_->FeatureGroupIterator(dense_dword_ind[i]); BinIterator* bin_iter = train_data_->FeatureGroupIterator(dense_dword_ind[i]);
if (dynamic_cast<DenseBinIterator<uint8_t, false>*>(bin_iter) != 0) { if (dynamic_cast<DenseBinIterator<uint8_t, false>*>(bin_iter) != 0) {
DenseBinIterator<uint8_t, false> iter = *static_cast<DenseBinIterator<uint8_t, false>*>(bin_iter); DenseBinIterator<uint8_t, false> iter = *static_cast<DenseBinIterator<uint8_t, false>*>(bin_iter);
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int j = 0; j < num_data_; ++j) { for (int j = 0; j < num_data_; ++j) {
host4[j].s[i] = (uint8_t)(iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i] host4[j].s[i] = (uint8_t)(iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i]
+ ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1))); + ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1)));
} }
} else if (dynamic_cast<DenseBinIterator<uint8_t, true>*>(bin_iter) != 0) { } else if (dynamic_cast<DenseBinIterator<uint8_t, true>*>(bin_iter) != 0) {
DenseBinIterator<uint8_t, true> iter = *static_cast<DenseBinIterator<uint8_t, true>*>(bin_iter); DenseBinIterator<uint8_t, true> iter = *static_cast<DenseBinIterator<uint8_t, true>*>(bin_iter);
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int j = 0; j < num_data_; ++j) { for (int j = 0; j < num_data_; ++j) {
host4[j].s[i] = (uint8_t)(iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i] host4[j].s[i] = (uint8_t)(iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i]
+ ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1))); + ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1)));
...@@ -485,7 +485,7 @@ void GPUTreeLearner::AllocateGPUMemory() { ...@@ -485,7 +485,7 @@ void GPUTreeLearner::AllocateGPUMemory() {
} }
// fill the leftover features // fill the leftover features
if (dword_features_ == 8) { if (dword_features_ == 8) {
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int j = 0; j < num_data_; ++j) { for (int j = 0; j < num_data_; ++j) {
for (int i = k; i < dword_features_; ++i) { for (int i = k; i < dword_features_; ++i) {
// fill this empty feature with some "random" value // fill this empty feature with some "random" value
...@@ -493,7 +493,7 @@ void GPUTreeLearner::AllocateGPUMemory() { ...@@ -493,7 +493,7 @@ void GPUTreeLearner::AllocateGPUMemory() {
} }
} }
} else if (dword_features_ == 4) { } else if (dword_features_ == 4) {
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int j = 0; j < num_data_; ++j) { for (int j = 0; j < num_data_; ++j) {
for (int i = k; i < dword_features_; ++i) { for (int i = k; i < dword_features_; ++i) {
// fill this empty feature with some "random" value // fill this empty feature with some "random" value
...@@ -572,7 +572,7 @@ void GPUTreeLearner::BuildGPUKernels() { ...@@ -572,7 +572,7 @@ void GPUTreeLearner::BuildGPUKernels() {
// currently we don't use constant memory // currently we don't use constant memory
int use_constants = 0; int use_constants = 0;
OMP_INIT_EX(); OMP_INIT_EX();
#pragma omp parallel for schedule(guided) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(guided)
for (int i = 0; i <= kMaxLogWorkgroupsPerFeature; ++i) { for (int i = 0; i <= kMaxLogWorkgroupsPerFeature; ++i) {
OMP_LOOP_EX_BEGIN(); OMP_LOOP_EX_BEGIN();
boost::compute::program program; boost::compute::program program;
...@@ -811,7 +811,7 @@ void GPUTreeLearner::BeforeTrain() { ...@@ -811,7 +811,7 @@ void GPUTreeLearner::BeforeTrain() {
// transfer the indices to GPU // transfer the indices to GPU
indices_future_ = boost::compute::copy_async(indices, indices + cnt, device_data_indices_->begin(), queue_); indices_future_ = boost::compute::copy_async(indices, indices + cnt, device_data_indices_->begin(), queue_);
if (!share_state_->is_constant_hessian) { if (!share_state_->is_constant_hessian) {
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < cnt; ++i) { for (data_size_t i = 0; i < cnt; ++i) {
ordered_hessians_[i] = hessians_[indices[i]]; ordered_hessians_[i] = hessians_[indices[i]];
} }
...@@ -827,7 +827,7 @@ void GPUTreeLearner::BeforeTrain() { ...@@ -827,7 +827,7 @@ void GPUTreeLearner::BeforeTrain() {
histogram_fulldata_kernels_[i].set_arg(6, const_hessian); histogram_fulldata_kernels_[i].set_arg(6, const_hessian);
} }
} }
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < cnt; ++i) { for (data_size_t i = 0; i < cnt; ++i) {
ordered_gradients_[i] = gradients_[indices[i]]; ordered_gradients_[i] = gradients_[indices[i]];
} }
...@@ -865,7 +865,7 @@ bool GPUTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int ri ...@@ -865,7 +865,7 @@ bool GPUTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int ri
indices_future_ = boost::compute::copy_async(indices + begin, indices + end, device_data_indices_->begin(), queue_); indices_future_ = boost::compute::copy_async(indices + begin, indices + end, device_data_indices_->begin(), queue_);
if (!share_state_->is_constant_hessian) { if (!share_state_->is_constant_hessian) {
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = begin; i < end; ++i) { for (data_size_t i = begin; i < end; ++i) {
ordered_hessians_[i - begin] = hessians_[indices[i]]; ordered_hessians_[i - begin] = hessians_[indices[i]];
} }
...@@ -873,7 +873,7 @@ bool GPUTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int ri ...@@ -873,7 +873,7 @@ bool GPUTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int ri
hessians_future_ = queue_.enqueue_write_buffer_async(device_hessians_, 0, (end - begin) * sizeof(score_t), ptr_pinned_hessians_); hessians_future_ = queue_.enqueue_write_buffer_async(device_hessians_, 0, (end - begin) * sizeof(score_t), ptr_pinned_hessians_);
} }
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = begin; i < end; ++i) { for (data_size_t i = begin; i < end; ++i) {
ordered_gradients_[i - begin] = gradients_[indices[i]]; ordered_gradients_[i - begin] = gradients_[indices[i]];
} }
...@@ -907,7 +907,7 @@ bool GPUTreeLearner::ConstructGPUHistogramsAsync( ...@@ -907,7 +907,7 @@ bool GPUTreeLearner::ConstructGPUHistogramsAsync(
// generate and copy ordered_gradients if gradients is not null // generate and copy ordered_gradients if gradients is not null
if (gradients != nullptr) { if (gradients != nullptr) {
if (num_data != num_data_) { if (num_data != num_data_) {
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data; ++i) { for (data_size_t i = 0; i < num_data; ++i) {
ordered_gradients[i] = gradients[data_indices[i]]; ordered_gradients[i] = gradients[data_indices[i]];
} }
...@@ -919,7 +919,7 @@ bool GPUTreeLearner::ConstructGPUHistogramsAsync( ...@@ -919,7 +919,7 @@ bool GPUTreeLearner::ConstructGPUHistogramsAsync(
// generate and copy ordered_hessians if Hessians is not null // generate and copy ordered_hessians if Hessians is not null
if (hessians != nullptr && !share_state_->is_constant_hessian) { if (hessians != nullptr && !share_state_->is_constant_hessian) {
if (num_data != num_data_) { if (num_data != num_data_) {
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data; ++i) { for (data_size_t i = 0; i < num_data; ++i) {
ordered_hessians[i] = hessians[data_indices[i]]; ordered_hessians[i] = hessians[data_indices[i]];
} }
...@@ -930,7 +930,7 @@ bool GPUTreeLearner::ConstructGPUHistogramsAsync( ...@@ -930,7 +930,7 @@ bool GPUTreeLearner::ConstructGPUHistogramsAsync(
} }
// converted indices in is_feature_used to feature-group indices // converted indices in is_feature_used to feature-group indices
std::vector<int8_t> is_feature_group_used(num_feature_groups_, 0); std::vector<int8_t> is_feature_group_used(num_feature_groups_, 0);
#pragma omp parallel for schedule(static, 1024) if (num_features_ >= 2048) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 1024) if (num_features_ >= 2048)
for (int i = 0; i < num_features_; ++i) { for (int i = 0; i < num_features_; ++i) {
if (is_feature_used[i]) { if (is_feature_used[i]) {
is_feature_group_used[train_data_->Feature2Group(i)] = 1; is_feature_group_used[train_data_->Feature2Group(i)] = 1;
...@@ -938,7 +938,7 @@ bool GPUTreeLearner::ConstructGPUHistogramsAsync( ...@@ -938,7 +938,7 @@ bool GPUTreeLearner::ConstructGPUHistogramsAsync(
} }
// construct the feature masks for dense feature-groups // construct the feature masks for dense feature-groups
int used_dense_feature_groups = 0; int used_dense_feature_groups = 0;
#pragma omp parallel for schedule(static, 1024) reduction(+:used_dense_feature_groups) if (num_dense_feature_groups_ >= 2048) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 1024) reduction(+:used_dense_feature_groups) if (num_dense_feature_groups_ >= 2048)
for (int i = 0; i < num_dense_feature_groups_; ++i) { for (int i = 0; i < num_dense_feature_groups_; ++i) {
if (is_feature_group_used[dense_feature_group_map_[i]]) { if (is_feature_group_used[dense_feature_group_map_[i]]) {
feature_masks_[i] = 1; feature_masks_[i] = 1;
...@@ -973,7 +973,7 @@ bool GPUTreeLearner::ConstructGPUHistogramsAsync( ...@@ -973,7 +973,7 @@ bool GPUTreeLearner::ConstructGPUHistogramsAsync(
void GPUTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract) { void GPUTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract) {
std::vector<int8_t> is_sparse_feature_used(num_features_, 0); std::vector<int8_t> is_sparse_feature_used(num_features_, 0);
std::vector<int8_t> is_dense_feature_used(num_features_, 0); std::vector<int8_t> is_dense_feature_used(num_features_, 0);
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int feature_index = 0; feature_index < num_features_; ++feature_index) { for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
if (!col_sampler_.is_feature_used_bytree()[feature_index]) continue; if (!col_sampler_.is_feature_used_bytree()[feature_index]) continue;
if (!is_feature_used[feature_index]) continue; if (!is_feature_used[feature_index]) continue;
......
...@@ -216,7 +216,7 @@ void GradientDiscretizer::RenewIntGradTreeOutput( ...@@ -216,7 +216,7 @@ void GradientDiscretizer::RenewIntGradTreeOutput(
data_size_t leaf_cnt = 0; data_size_t leaf_cnt = 0;
const data_size_t* data_indices = data_partition->GetIndexOnLeaf(leaf_id, &leaf_cnt); const data_size_t* data_indices = data_partition->GetIndexOnLeaf(leaf_id, &leaf_cnt);
double sum_gradient = 0.0f, sum_hessian = 0.0f; double sum_gradient = 0.0f, sum_hessian = 0.0f;
#pragma omp parallel for schedule(static) reduction(+:sum_gradient, sum_hessian) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_gradient, sum_hessian)
for (data_size_t i = 0; i < leaf_cnt; ++i) { for (data_size_t i = 0; i < leaf_cnt; ++i) {
const data_size_t index = data_indices[i]; const data_size_t index = data_indices[i];
const score_t grad = gradients[index]; const score_t grad = gradients[index];
...@@ -242,7 +242,7 @@ void GradientDiscretizer::RenewIntGradTreeOutput( ...@@ -242,7 +242,7 @@ void GradientDiscretizer::RenewIntGradTreeOutput(
data_size_t leaf_cnt = 0; data_size_t leaf_cnt = 0;
const data_size_t* data_indices = data_partition->GetIndexOnLeaf(leaf_id, &leaf_cnt); const data_size_t* data_indices = data_partition->GetIndexOnLeaf(leaf_id, &leaf_cnt);
double sum_gradient = 0.0f, sum_hessian = 0.0f; double sum_gradient = 0.0f, sum_hessian = 0.0f;
#pragma omp parallel for schedule(static) reduction(+:sum_gradient, sum_hessian) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_gradient, sum_hessian)
for (data_size_t i = 0; i < leaf_cnt; ++i) { for (data_size_t i = 0; i < leaf_cnt; ++i) {
const data_size_t index = data_indices[i]; const data_size_t index = data_indices[i];
const score_t grad = gradients[index]; const score_t grad = gradients[index];
......
...@@ -95,7 +95,7 @@ class LeafSplits { ...@@ -95,7 +95,7 @@ class LeafSplits {
data_indices_ = nullptr; data_indices_ = nullptr;
double tmp_sum_gradients = 0.0f; double tmp_sum_gradients = 0.0f;
double tmp_sum_hessians = 0.0f; double tmp_sum_hessians = 0.0f;
#pragma omp parallel for schedule(static, 512) reduction(+:tmp_sum_gradients, tmp_sum_hessians) if (num_data_in_leaf_ >= 1024 && !deterministic_) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) reduction(+:tmp_sum_gradients, tmp_sum_hessians) if (num_data_in_leaf_ >= 1024 && !deterministic_)
for (data_size_t i = 0; i < num_data_in_leaf_; ++i) { for (data_size_t i = 0; i < num_data_in_leaf_; ++i) {
tmp_sum_gradients += gradients[i]; tmp_sum_gradients += gradients[i];
tmp_sum_hessians += hessians[i]; tmp_sum_hessians += hessians[i];
...@@ -120,7 +120,7 @@ class LeafSplits { ...@@ -120,7 +120,7 @@ class LeafSplits {
double tmp_sum_hessians = 0.0f; double tmp_sum_hessians = 0.0f;
const int16_t* packed_int_gradients_and_hessians = reinterpret_cast<const int16_t*>(int_gradients_and_hessians); const int16_t* packed_int_gradients_and_hessians = reinterpret_cast<const int16_t*>(int_gradients_and_hessians);
int64_t tmp_sum_gradients_and_hessians = 0; int64_t tmp_sum_gradients_and_hessians = 0;
#pragma omp parallel for schedule(static, 512) reduction(+:tmp_sum_gradients, tmp_sum_hessians, tmp_sum_gradients_and_hessians) if (num_data_in_leaf_ >= 1024 && !deterministic_) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) reduction(+:tmp_sum_gradients, tmp_sum_hessians, tmp_sum_gradients_and_hessians) if (num_data_in_leaf_ >= 1024 && !deterministic_)
for (data_size_t i = 0; i < num_data_in_leaf_; ++i) { for (data_size_t i = 0; i < num_data_in_leaf_; ++i) {
tmp_sum_gradients += int_gradients_and_hessians[2 * i + 1] * grad_scale; tmp_sum_gradients += int_gradients_and_hessians[2 * i + 1] * grad_scale;
tmp_sum_hessians += int_gradients_and_hessians[2 * i] * hess_scale; tmp_sum_hessians += int_gradients_and_hessians[2 * i] * hess_scale;
...@@ -149,7 +149,7 @@ class LeafSplits { ...@@ -149,7 +149,7 @@ class LeafSplits {
data_indices_ = data_partition->GetIndexOnLeaf(leaf, &num_data_in_leaf_); data_indices_ = data_partition->GetIndexOnLeaf(leaf, &num_data_in_leaf_);
double tmp_sum_gradients = 0.0f; double tmp_sum_gradients = 0.0f;
double tmp_sum_hessians = 0.0f; double tmp_sum_hessians = 0.0f;
#pragma omp parallel for schedule(static, 512) reduction(+:tmp_sum_gradients, tmp_sum_hessians) if (num_data_in_leaf_ >= 1024 && !deterministic_) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) reduction(+:tmp_sum_gradients, tmp_sum_hessians) if (num_data_in_leaf_ >= 1024 && !deterministic_)
for (data_size_t i = 0; i < num_data_in_leaf_; ++i) { for (data_size_t i = 0; i < num_data_in_leaf_; ++i) {
const data_size_t idx = data_indices_[i]; const data_size_t idx = data_indices_[i];
tmp_sum_gradients += gradients[idx]; tmp_sum_gradients += gradients[idx];
...@@ -177,7 +177,7 @@ class LeafSplits { ...@@ -177,7 +177,7 @@ class LeafSplits {
double tmp_sum_hessians = 0.0f; double tmp_sum_hessians = 0.0f;
const int16_t* packed_int_gradients_and_hessians = reinterpret_cast<const int16_t*>(int_gradients_and_hessians); const int16_t* packed_int_gradients_and_hessians = reinterpret_cast<const int16_t*>(int_gradients_and_hessians);
int64_t tmp_sum_gradients_and_hessians = 0; int64_t tmp_sum_gradients_and_hessians = 0;
#pragma omp parallel for schedule(static, 512) reduction(+:tmp_sum_gradients, tmp_sum_hessians, tmp_sum_gradients_and_hessians) if (num_data_in_leaf_ >= 1024 && deterministic_) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) reduction(+:tmp_sum_gradients, tmp_sum_hessians, tmp_sum_gradients_and_hessians) if (num_data_in_leaf_ >= 1024 && deterministic_)
for (data_size_t i = 0; i < num_data_in_leaf_; ++i) { for (data_size_t i = 0; i < num_data_in_leaf_; ++i) {
const data_size_t idx = data_indices_[i]; const data_size_t idx = data_indices_[i];
tmp_sum_gradients += int_gradients_and_hessians[2 * idx + 1] * grad_scale; tmp_sum_gradients += int_gradients_and_hessians[2 * idx + 1] * grad_scale;
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment