Unverified Commit 8ed371ce authored by James Lamb's avatar James Lamb Committed by GitHub
Browse files

set explicit number of threads in every OpenMP `parallel` region (#6135)

parent 992f5056
...@@ -18,3 +18,28 @@ cmakelint \ ...@@ -18,3 +18,28 @@ cmakelint \
${cmake_files} \ ${cmake_files} \
|| exit -1 || exit -1
echo "done running cmakelint" echo "done running cmakelint"
echo "checking that all OpenMP pragmas specify num_threads()"
get_omp_pragmas_without_num_threads() {
grep \
-n \
-R \
--include='*.c' \
--include='*.cc' \
--include='*.cpp' \
--include='*.h' \
--include='*.hpp' \
'pragma omp parallel' \
| grep -v ' num_threads' \
| grep -v 'openmp_wrapper.h'
}
PROBLEMATIC_LINES=$(
get_omp_pragmas_without_num_threads
)
if test "${PROBLEMATIC_LINES}" != ""; then
get_omp_pragmas_without_num_threads
echo "Found '#pragma omp parallel' not using explicit num_threads() configuration. Fix those."
echo "For details, see https://www.openmp.org/spec-html/5.0/openmpse14.html#x54-800002.6"
exit -1
fi
echo "done checking OpenMP pragmas"
...@@ -226,7 +226,7 @@ SEXP LGBM_DatasetGetSubset_R(SEXP handle, ...@@ -226,7 +226,7 @@ SEXP LGBM_DatasetGetSubset_R(SEXP handle,
int32_t len = static_cast<int32_t>(Rf_asInteger(len_used_row_indices)); int32_t len = static_cast<int32_t>(Rf_asInteger(len_used_row_indices));
std::vector<int32_t> idxvec(len); std::vector<int32_t> idxvec(len);
// convert from one-based to zero-based index // convert from one-based to zero-based index
#pragma omp parallel for schedule(static, 512) if (len >= 1024) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (len >= 1024)
for (int32_t i = 0; i < len; ++i) { for (int32_t i = 0; i < len; ++i) {
idxvec[i] = static_cast<int32_t>(INTEGER(used_row_indices)[i] - 1); idxvec[i] = static_cast<int32_t>(INTEGER(used_row_indices)[i] - 1);
} }
...@@ -339,7 +339,7 @@ SEXP LGBM_DatasetSetField_R(SEXP handle, ...@@ -339,7 +339,7 @@ SEXP LGBM_DatasetSetField_R(SEXP handle,
const char* name = CHAR(PROTECT(Rf_asChar(field_name))); const char* name = CHAR(PROTECT(Rf_asChar(field_name)));
if (!strcmp("group", name) || !strcmp("query", name)) { if (!strcmp("group", name) || !strcmp("query", name)) {
std::vector<int32_t> vec(len); std::vector<int32_t> vec(len);
#pragma omp parallel for schedule(static, 512) if (len >= 1024) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (len >= 1024)
for (int i = 0; i < len; ++i) { for (int i = 0; i < len; ++i) {
vec[i] = static_cast<int32_t>(INTEGER(field_data)[i]); vec[i] = static_cast<int32_t>(INTEGER(field_data)[i]);
} }
...@@ -348,7 +348,7 @@ SEXP LGBM_DatasetSetField_R(SEXP handle, ...@@ -348,7 +348,7 @@ SEXP LGBM_DatasetSetField_R(SEXP handle,
CHECK_CALL(LGBM_DatasetSetField(R_ExternalPtrAddr(handle), name, REAL(field_data), len, C_API_DTYPE_FLOAT64)); CHECK_CALL(LGBM_DatasetSetField(R_ExternalPtrAddr(handle), name, REAL(field_data), len, C_API_DTYPE_FLOAT64));
} else { } else {
std::vector<float> vec(len); std::vector<float> vec(len);
#pragma omp parallel for schedule(static, 512) if (len >= 1024) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (len >= 1024)
for (int i = 0; i < len; ++i) { for (int i = 0; i < len; ++i) {
vec[i] = static_cast<float>(REAL(field_data)[i]); vec[i] = static_cast<float>(REAL(field_data)[i]);
} }
...@@ -372,19 +372,19 @@ SEXP LGBM_DatasetGetField_R(SEXP handle, ...@@ -372,19 +372,19 @@ SEXP LGBM_DatasetGetField_R(SEXP handle,
if (!strcmp("group", name) || !strcmp("query", name)) { if (!strcmp("group", name) || !strcmp("query", name)) {
auto p_data = reinterpret_cast<const int32_t*>(res); auto p_data = reinterpret_cast<const int32_t*>(res);
// convert from boundaries to size // convert from boundaries to size
#pragma omp parallel for schedule(static, 512) if (out_len >= 1024) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (out_len >= 1024)
for (int i = 0; i < out_len - 1; ++i) { for (int i = 0; i < out_len - 1; ++i) {
INTEGER(field_data)[i] = p_data[i + 1] - p_data[i]; INTEGER(field_data)[i] = p_data[i + 1] - p_data[i];
} }
} else if (!strcmp("init_score", name)) { } else if (!strcmp("init_score", name)) {
auto p_data = reinterpret_cast<const double*>(res); auto p_data = reinterpret_cast<const double*>(res);
#pragma omp parallel for schedule(static, 512) if (out_len >= 1024) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (out_len >= 1024)
for (int i = 0; i < out_len; ++i) { for (int i = 0; i < out_len; ++i) {
REAL(field_data)[i] = p_data[i]; REAL(field_data)[i] = p_data[i];
} }
} else { } else {
auto p_data = reinterpret_cast<const float*>(res); auto p_data = reinterpret_cast<const float*>(res);
#pragma omp parallel for schedule(static, 512) if (out_len >= 1024) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (out_len >= 1024)
for (int i = 0; i < out_len; ++i) { for (int i = 0; i < out_len; ++i) {
REAL(field_data)[i] = p_data[i]; REAL(field_data)[i] = p_data[i];
} }
...@@ -611,7 +611,7 @@ SEXP LGBM_BoosterUpdateOneIterCustom_R(SEXP handle, ...@@ -611,7 +611,7 @@ SEXP LGBM_BoosterUpdateOneIterCustom_R(SEXP handle,
int is_finished = 0; int is_finished = 0;
int int_len = Rf_asInteger(len); int int_len = Rf_asInteger(len);
std::vector<float> tgrad(int_len), thess(int_len); std::vector<float> tgrad(int_len), thess(int_len);
#pragma omp parallel for schedule(static, 512) if (int_len >= 1024) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (int_len >= 1024)
for (int j = 0; j < int_len; ++j) { for (int j = 0; j < int_len; ++j) {
tgrad[j] = static_cast<float>(REAL(grad)[j]); tgrad[j] = static_cast<float>(REAL(grad)[j]);
thess[j] = static_cast<float>(REAL(hess)[j]); thess[j] = static_cast<float>(REAL(hess)[j]);
......
...@@ -361,7 +361,7 @@ class FeatureGroup { ...@@ -361,7 +361,7 @@ class FeatureGroup {
inline void FinishLoad() { inline void FinishLoad() {
if (is_multi_val_) { if (is_multi_val_) {
OMP_INIT_EX(); OMP_INIT_EX();
#pragma omp parallel for schedule(guided) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(guided)
for (int i = 0; i < num_feature_; ++i) { for (int i = 0; i < num_feature_; ++i) {
OMP_LOOP_EX_BEGIN(); OMP_LOOP_EX_BEGIN();
multi_bin_data_[i]->FinishLoad(); multi_bin_data_[i]->FinishLoad();
......
...@@ -185,7 +185,7 @@ class Tree { ...@@ -185,7 +185,7 @@ class Tree {
* \param rate The factor of shrinkage * \param rate The factor of shrinkage
*/ */
virtual inline void Shrinkage(double rate) { virtual inline void Shrinkage(double rate) {
#pragma omp parallel for schedule(static, 1024) if (num_leaves_ >= 2048) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 1024) if (num_leaves_ >= 2048)
for (int i = 0; i < num_leaves_ - 1; ++i) { for (int i = 0; i < num_leaves_ - 1; ++i) {
leaf_value_[i] = MaybeRoundToZero(leaf_value_[i] * rate); leaf_value_[i] = MaybeRoundToZero(leaf_value_[i] * rate);
internal_value_[i] = MaybeRoundToZero(internal_value_[i] * rate); internal_value_[i] = MaybeRoundToZero(internal_value_[i] * rate);
...@@ -210,7 +210,7 @@ class Tree { ...@@ -210,7 +210,7 @@ class Tree {
inline double shrinkage() const { return shrinkage_; } inline double shrinkage() const { return shrinkage_; }
virtual inline void AddBias(double val) { virtual inline void AddBias(double val) {
#pragma omp parallel for schedule(static, 1024) if (num_leaves_ >= 2048) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 1024) if (num_leaves_ >= 2048)
for (int i = 0; i < num_leaves_ - 1; ++i) { for (int i = 0; i < num_leaves_ - 1; ++i) {
leaf_value_[i] = MaybeRoundToZero(leaf_value_[i] + val); leaf_value_[i] = MaybeRoundToZero(leaf_value_[i] + val);
internal_value_[i] = MaybeRoundToZero(internal_value_[i] + val); internal_value_[i] = MaybeRoundToZero(internal_value_[i] + val);
...@@ -218,7 +218,7 @@ class Tree { ...@@ -218,7 +218,7 @@ class Tree {
leaf_value_[num_leaves_ - 1] = leaf_value_[num_leaves_ - 1] =
MaybeRoundToZero(leaf_value_[num_leaves_ - 1] + val); MaybeRoundToZero(leaf_value_[num_leaves_ - 1] + val);
if (is_linear_) { if (is_linear_) {
#pragma omp parallel for schedule(static, 1024) if (num_leaves_ >= 2048) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 1024) if (num_leaves_ >= 2048)
for (int i = 0; i < num_leaves_ - 1; ++i) { for (int i = 0; i < num_leaves_ - 1; ++i) {
leaf_const_[i] = MaybeRoundToZero(leaf_const_[i] + val); leaf_const_[i] = MaybeRoundToZero(leaf_const_[i] + val);
} }
......
...@@ -691,7 +691,7 @@ static void ParallelSort(_RanIt _First, _RanIt _Last, _Pr _Pred, _VTRanIt*) { ...@@ -691,7 +691,7 @@ static void ParallelSort(_RanIt _First, _RanIt _Last, _Pr _Pred, _VTRanIt*) {
size_t inner_size = (len + num_threads - 1) / num_threads; size_t inner_size = (len + num_threads - 1) / num_threads;
inner_size = std::max(inner_size, kMinInnerLen); inner_size = std::max(inner_size, kMinInnerLen);
num_threads = static_cast<int>((len + inner_size - 1) / inner_size); num_threads = static_cast<int>((len + inner_size - 1) / inner_size);
#pragma omp parallel for schedule(static, 1) #pragma omp parallel for num_threads(num_threads) schedule(static, 1)
for (int i = 0; i < num_threads; ++i) { for (int i = 0; i < num_threads; ++i) {
size_t left = inner_size*i; size_t left = inner_size*i;
size_t right = left + inner_size; size_t right = left + inner_size;
...@@ -707,7 +707,7 @@ static void ParallelSort(_RanIt _First, _RanIt _Last, _Pr _Pred, _VTRanIt*) { ...@@ -707,7 +707,7 @@ static void ParallelSort(_RanIt _First, _RanIt _Last, _Pr _Pred, _VTRanIt*) {
// Recursive merge // Recursive merge
while (s < len) { while (s < len) {
int loop_size = static_cast<int>((len + s * 2 - 1) / (s * 2)); int loop_size = static_cast<int>((len + s * 2 - 1) / (s * 2));
#pragma omp parallel for schedule(static, 1) #pragma omp parallel for num_threads(num_threads) schedule(static, 1)
for (int i = 0; i < loop_size; ++i) { for (int i = 0; i < loop_size; ++i) {
size_t left = i * 2 * s; size_t left = i * 2 * s;
size_t mid = left + s; size_t mid = left + s;
......
...@@ -73,7 +73,7 @@ class Threading { ...@@ -73,7 +73,7 @@ class Threading {
INDEX_T num_inner = end - start; INDEX_T num_inner = end - start;
BlockInfo<INDEX_T>(num_inner, min_block_size, &n_block, &num_inner); BlockInfo<INDEX_T>(num_inner, min_block_size, &n_block, &num_inner);
OMP_INIT_EX(); OMP_INIT_EX();
#pragma omp parallel for schedule(static, 1) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 1)
for (int i = 0; i < n_block; ++i) { for (int i = 0; i < n_block; ++i) {
OMP_LOOP_EX_BEGIN(); OMP_LOOP_EX_BEGIN();
INDEX_T inner_start = start + num_inner * i; INDEX_T inner_start = start + num_inner * i;
......
...@@ -227,7 +227,7 @@ void Application::Predict() { ...@@ -227,7 +227,7 @@ void Application::Predict() {
TextReader<int> result_reader(config_.output_result.c_str(), false); TextReader<int> result_reader(config_.output_result.c_str(), false);
result_reader.ReadAllLines(); result_reader.ReadAllLines();
std::vector<std::vector<int>> pred_leaf(result_reader.Lines().size()); std::vector<std::vector<int>> pred_leaf(result_reader.Lines().size());
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int i = 0; i < static_cast<int>(result_reader.Lines().size()); ++i) { for (int i = 0; i < static_cast<int>(result_reader.Lines().size()); ++i) {
pred_leaf[i] = Common::StringToArray<int>(result_reader.Lines()[i], '\t'); pred_leaf[i] = Common::StringToArray<int>(result_reader.Lines()[i], '\t');
// Free memory // Free memory
......
...@@ -233,7 +233,7 @@ class Predictor { ...@@ -233,7 +233,7 @@ class Predictor {
std::vector<std::pair<int, double>> oneline_features; std::vector<std::pair<int, double>> oneline_features;
std::vector<std::string> result_to_write(lines.size()); std::vector<std::string> result_to_write(lines.size());
OMP_INIT_EX(); OMP_INIT_EX();
#pragma omp parallel for schedule(static) firstprivate(oneline_features) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) firstprivate(oneline_features)
for (data_size_t i = 0; i < static_cast<data_size_t>(lines.size()); ++i) { for (data_size_t i = 0; i < static_cast<data_size_t>(lines.size()); ++i) {
OMP_LOOP_EX_BEGIN(); OMP_LOOP_EX_BEGIN();
oneline_features.clear(); oneline_features.clear();
......
...@@ -255,7 +255,7 @@ void GBDT::RefitTree(const std::vector<std::vector<int>>& tree_leaf_prediction) ...@@ -255,7 +255,7 @@ void GBDT::RefitTree(const std::vector<std::vector<int>>& tree_leaf_prediction)
std::vector<int> leaf_pred(num_data_); std::vector<int> leaf_pred(num_data_);
if (linear_tree_) { if (linear_tree_) {
std::vector<int> max_leaves_by_thread = std::vector<int>(OMP_NUM_THREADS(), 0); std::vector<int> max_leaves_by_thread = std::vector<int>(OMP_NUM_THREADS(), 0);
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int i = 0; i < static_cast<int>(tree_leaf_prediction.size()); ++i) { for (int i = 0; i < static_cast<int>(tree_leaf_prediction.size()); ++i) {
int tid = omp_get_thread_num(); int tid = omp_get_thread_num();
for (size_t j = 0; j < tree_leaf_prediction[i].size(); ++j) { for (size_t j = 0; j < tree_leaf_prediction[i].size(); ++j) {
...@@ -270,7 +270,7 @@ void GBDT::RefitTree(const std::vector<std::vector<int>>& tree_leaf_prediction) ...@@ -270,7 +270,7 @@ void GBDT::RefitTree(const std::vector<std::vector<int>>& tree_leaf_prediction)
Boosting(); Boosting();
for (int tree_id = 0; tree_id < num_tree_per_iteration_; ++tree_id) { for (int tree_id = 0; tree_id < num_tree_per_iteration_; ++tree_id) {
int model_index = iter * num_tree_per_iteration_ + tree_id; int model_index = iter * num_tree_per_iteration_ + tree_id;
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int i = 0; i < num_data_; ++i) { for (int i = 0; i < num_data_; ++i) {
leaf_pred[i] = tree_leaf_prediction[i][model_index]; leaf_pred[i] = tree_leaf_prediction[i][model_index];
CHECK_LT(leaf_pred[i], models_[model_index]->num_leaves()); CHECK_LT(leaf_pred[i], models_[model_index]->num_leaves());
...@@ -348,7 +348,7 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { ...@@ -348,7 +348,7 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) {
if (data_sample_strategy_->IsHessianChange()) { if (data_sample_strategy_->IsHessianChange()) {
// need to copy customized gradients when using GOSS // need to copy customized gradients when using GOSS
int64_t total_size = static_cast<int64_t>(num_data_) * num_tree_per_iteration_; int64_t total_size = static_cast<int64_t>(num_data_) * num_tree_per_iteration_;
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int64_t i = 0; i < total_size; ++i) { for (int64_t i = 0; i < total_size; ++i) {
gradients_[i] = gradients[i]; gradients_[i] = gradients[i];
hessians_[i] = hessians[i]; hessians_[i] = hessians[i];
...@@ -669,7 +669,7 @@ void GBDT::GetPredictAt(int data_idx, double* out_result, int64_t* out_len) { ...@@ -669,7 +669,7 @@ void GBDT::GetPredictAt(int data_idx, double* out_result, int64_t* out_len) {
} }
#endif // USE_CUDA #endif // USE_CUDA
if (objective_function_ != nullptr) { if (objective_function_ != nullptr) {
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data; ++i) { for (data_size_t i = 0; i < num_data; ++i) {
std::vector<double> tree_pred(num_tree_per_iteration_); std::vector<double> tree_pred(num_tree_per_iteration_);
for (int j = 0; j < num_tree_per_iteration_; ++j) { for (int j = 0; j < num_tree_per_iteration_; ++j) {
...@@ -682,7 +682,7 @@ void GBDT::GetPredictAt(int data_idx, double* out_result, int64_t* out_len) { ...@@ -682,7 +682,7 @@ void GBDT::GetPredictAt(int data_idx, double* out_result, int64_t* out_len) {
} }
} }
} else { } else {
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_data; ++i) { for (data_size_t i = 0; i < num_data; ++i) {
for (int j = 0; j < num_tree_per_iteration_; ++j) { for (int j = 0; j < num_tree_per_iteration_; ++j) {
out_result[j * num_data + i] = static_cast<double>(raw_scores[j * num_data + i]); out_result[j * num_data + i] = static_cast<double>(raw_scores[j * num_data + i]);
......
...@@ -434,7 +434,7 @@ class GBDT : public GBDTBase { ...@@ -434,7 +434,7 @@ class GBDT : public GBDTBase {
} }
start_iteration_for_pred_ = start_iteration; start_iteration_for_pred_ = start_iteration;
if (is_pred_contrib) { if (is_pred_contrib) {
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int i = 0; i < static_cast<int>(models_.size()); ++i) { for (int i = 0; i < static_cast<int>(models_.size()); ++i) {
models_[i]->RecomputeMaxDepth(); models_[i]->RecomputeMaxDepth();
} }
......
...@@ -354,7 +354,7 @@ std::string GBDT::SaveModelToString(int start_iteration, int num_iteration, int ...@@ -354,7 +354,7 @@ std::string GBDT::SaveModelToString(int start_iteration, int num_iteration, int
std::vector<std::string> tree_strs(num_used_model - start_model); std::vector<std::string> tree_strs(num_used_model - start_model);
std::vector<size_t> tree_sizes(num_used_model - start_model); std::vector<size_t> tree_sizes(num_used_model - start_model);
// output tree models // output tree models
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int i = start_model; i < num_used_model; ++i) { for (int i = start_model; i < num_used_model; ++i) {
const int idx = i - start_model; const int idx = i - start_model;
tree_strs[idx] = "Tree=" + std::to_string(idx) + '\n'; tree_strs[idx] = "Tree=" + std::to_string(idx) + '\n';
...@@ -552,7 +552,7 @@ bool GBDT::LoadModelFromString(const char* buffer, size_t len) { ...@@ -552,7 +552,7 @@ bool GBDT::LoadModelFromString(const char* buffer, size_t len) {
models_.emplace_back(nullptr); models_.emplace_back(nullptr);
} }
OMP_INIT_EX(); OMP_INIT_EX();
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int i = 0; i < num_trees; ++i) { for (int i = 0; i < num_trees; ++i) {
OMP_LOOP_EX_BEGIN(); OMP_LOOP_EX_BEGIN();
auto cur_p = p + tree_boundries[i]; auto cur_p = p + tree_boundries[i];
......
...@@ -97,7 +97,7 @@ class RF : public GBDT { ...@@ -97,7 +97,7 @@ class RF : public GBDT {
} }
size_t total_size = static_cast<size_t>(num_data_) * num_tree_per_iteration_; size_t total_size = static_cast<size_t>(num_data_) * num_tree_per_iteration_;
std::vector<double> tmp_scores(total_size, 0.0f); std::vector<double> tmp_scores(total_size, 0.0f);
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int j = 0; j < num_tree_per_iteration_; ++j) { for (int j = 0; j < num_tree_per_iteration_; ++j) {
size_t offset = static_cast<size_t>(j)* num_data_; size_t offset = static_cast<size_t>(j)* num_data_;
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
......
...@@ -39,7 +39,7 @@ class ScoreUpdater { ...@@ -39,7 +39,7 @@ class ScoreUpdater {
Log::Fatal("Number of class for initial score error"); Log::Fatal("Number of class for initial score error");
} }
has_init_score_ = true; has_init_score_ = true;
#pragma omp parallel for schedule(static, 512) if (total_size >= 1024) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (total_size >= 1024)
for (int64_t i = 0; i < total_size; ++i) { for (int64_t i = 0; i < total_size; ++i) {
score_[i] = init_score[i]; score_[i] = init_score[i];
} }
...@@ -54,7 +54,7 @@ class ScoreUpdater { ...@@ -54,7 +54,7 @@ class ScoreUpdater {
virtual inline void AddScore(double val, int cur_tree_id) { virtual inline void AddScore(double val, int cur_tree_id) {
Common::FunctionTimer fun_timer("ScoreUpdater::AddScore", global_timer); Common::FunctionTimer fun_timer("ScoreUpdater::AddScore", global_timer);
const size_t offset = static_cast<size_t>(num_data_) * cur_tree_id; const size_t offset = static_cast<size_t>(num_data_) * cur_tree_id;
#pragma omp parallel for schedule(static, 512) if (num_data_ >= 1024) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_data_ >= 1024)
for (int i = 0; i < num_data_; ++i) { for (int i = 0; i < num_data_; ++i) {
score_[offset + i] += val; score_[offset + i] += val;
} }
...@@ -62,7 +62,7 @@ class ScoreUpdater { ...@@ -62,7 +62,7 @@ class ScoreUpdater {
virtual inline void MultiplyScore(double val, int cur_tree_id) { virtual inline void MultiplyScore(double val, int cur_tree_id) {
const size_t offset = static_cast<size_t>(num_data_) * cur_tree_id; const size_t offset = static_cast<size_t>(num_data_) * cur_tree_id;
#pragma omp parallel for schedule(static, 512) if (num_data_ >= 1024) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_data_ >= 1024)
for (int i = 0; i < num_data_; ++i) { for (int i = 0; i < num_data_; ++i) {
score_[offset + i] *= val; score_[offset + i] *= val;
} }
......
...@@ -437,7 +437,7 @@ class Booster { ...@@ -437,7 +437,7 @@ class Booster {
int64_t num_pred_in_one_row = boosting_->NumPredictOneRow(start_iteration, num_iteration, is_predict_leaf, predict_contrib); int64_t num_pred_in_one_row = boosting_->NumPredictOneRow(start_iteration, num_iteration, is_predict_leaf, predict_contrib);
auto pred_fun = predictor.GetPredictFunction(); auto pred_fun = predictor.GetPredictFunction();
OMP_INIT_EX(); OMP_INIT_EX();
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int i = 0; i < nrow; ++i) { for (int i = 0; i < nrow; ++i) {
OMP_LOOP_EX_BEGIN(); OMP_LOOP_EX_BEGIN();
auto one_row = get_row_fun(i); auto one_row = get_row_fun(i);
...@@ -459,7 +459,7 @@ class Booster { ...@@ -459,7 +459,7 @@ class Booster {
auto pred_sparse_fun = predictor.GetPredictSparseFunction(); auto pred_sparse_fun = predictor.GetPredictSparseFunction();
std::vector<std::vector<std::unordered_map<int, double>>>& agg = *agg_ptr; std::vector<std::vector<std::unordered_map<int, double>>>& agg = *agg_ptr;
OMP_INIT_EX(); OMP_INIT_EX();
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int64_t i = 0; i < nrow; ++i) { for (int64_t i = 0; i < nrow; ++i) {
OMP_LOOP_EX_BEGIN(); OMP_LOOP_EX_BEGIN();
auto one_row = get_row_fun(i); auto one_row = get_row_fun(i);
...@@ -551,7 +551,7 @@ class Booster { ...@@ -551,7 +551,7 @@ class Booster {
indptr_index++; indptr_index++;
int64_t matrix_start_index = m * static_cast<int64_t>(agg.size()); int64_t matrix_start_index = m * static_cast<int64_t>(agg.size());
OMP_INIT_EX(); OMP_INIT_EX();
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int64_t i = 0; i < static_cast<int64_t>(agg.size()); ++i) { for (int64_t i = 0; i < static_cast<int64_t>(agg.size()); ++i) {
OMP_LOOP_EX_BEGIN(); OMP_LOOP_EX_BEGIN();
auto row_vector = agg[i]; auto row_vector = agg[i];
...@@ -663,7 +663,7 @@ class Booster { ...@@ -663,7 +663,7 @@ class Booster {
} }
// Note: we parallelize across matrices instead of rows because of the column_counts[m][col_idx] increment inside the loop // Note: we parallelize across matrices instead of rows because of the column_counts[m][col_idx] increment inside the loop
OMP_INIT_EX(); OMP_INIT_EX();
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int m = 0; m < num_matrices; ++m) { for (int m = 0; m < num_matrices; ++m) {
OMP_LOOP_EX_BEGIN(); OMP_LOOP_EX_BEGIN();
for (int64_t i = 0; i < static_cast<int64_t>(agg.size()); ++i) { for (int64_t i = 0; i < static_cast<int64_t>(agg.size()); ++i) {
...@@ -1074,7 +1074,7 @@ int LGBM_DatasetPushRows(DatasetHandle dataset, ...@@ -1074,7 +1074,7 @@ int LGBM_DatasetPushRows(DatasetHandle dataset,
p_dataset->ResizeRaw(p_dataset->num_numeric_features() + nrow); p_dataset->ResizeRaw(p_dataset->num_numeric_features() + nrow);
} }
OMP_INIT_EX(); OMP_INIT_EX();
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int i = 0; i < nrow; ++i) { for (int i = 0; i < nrow; ++i) {
OMP_LOOP_EX_BEGIN(); OMP_LOOP_EX_BEGIN();
const int tid = omp_get_thread_num(); const int tid = omp_get_thread_num();
...@@ -1116,7 +1116,7 @@ int LGBM_DatasetPushRowsWithMetadata(DatasetHandle dataset, ...@@ -1116,7 +1116,7 @@ int LGBM_DatasetPushRowsWithMetadata(DatasetHandle dataset,
const int max_omp_threads = p_dataset->omp_max_threads() > 0 ? p_dataset->omp_max_threads() : OMP_NUM_THREADS(); const int max_omp_threads = p_dataset->omp_max_threads() > 0 ? p_dataset->omp_max_threads() : OMP_NUM_THREADS();
OMP_INIT_EX(); OMP_INIT_EX();
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int i = 0; i < nrow; ++i) { for (int i = 0; i < nrow; ++i) {
OMP_LOOP_EX_BEGIN(); OMP_LOOP_EX_BEGIN();
// convert internal thread id to be unique based on external thread id // convert internal thread id to be unique based on external thread id
...@@ -1153,7 +1153,7 @@ int LGBM_DatasetPushRowsByCSR(DatasetHandle dataset, ...@@ -1153,7 +1153,7 @@ int LGBM_DatasetPushRowsByCSR(DatasetHandle dataset,
p_dataset->ResizeRaw(p_dataset->num_numeric_features() + nrow); p_dataset->ResizeRaw(p_dataset->num_numeric_features() + nrow);
} }
OMP_INIT_EX(); OMP_INIT_EX();
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int i = 0; i < nrow; ++i) { for (int i = 0; i < nrow; ++i) {
OMP_LOOP_EX_BEGIN(); OMP_LOOP_EX_BEGIN();
const int tid = omp_get_thread_num(); const int tid = omp_get_thread_num();
...@@ -1199,7 +1199,7 @@ int LGBM_DatasetPushRowsByCSRWithMetadata(DatasetHandle dataset, ...@@ -1199,7 +1199,7 @@ int LGBM_DatasetPushRowsByCSRWithMetadata(DatasetHandle dataset,
const int max_omp_threads = p_dataset->omp_max_threads() > 0 ? p_dataset->omp_max_threads() : OMP_NUM_THREADS(); const int max_omp_threads = p_dataset->omp_max_threads() > 0 ? p_dataset->omp_max_threads() : OMP_NUM_THREADS();
OMP_INIT_EX(); OMP_INIT_EX();
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int i = 0; i < nrow; ++i) { for (int i = 0; i < nrow; ++i) {
OMP_LOOP_EX_BEGIN(); OMP_LOOP_EX_BEGIN();
// convert internal thread id to be unique based on external thread id // convert internal thread id to be unique based on external thread id
...@@ -1319,7 +1319,7 @@ int LGBM_DatasetCreateFromMats(int32_t nmat, ...@@ -1319,7 +1319,7 @@ int LGBM_DatasetCreateFromMats(int32_t nmat,
int32_t start_row = 0; int32_t start_row = 0;
for (int j = 0; j < nmat; ++j) { for (int j = 0; j < nmat; ++j) {
OMP_INIT_EX(); OMP_INIT_EX();
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int i = 0; i < nrow[j]; ++i) { for (int i = 0; i < nrow[j]; ++i) {
OMP_LOOP_EX_BEGIN(); OMP_LOOP_EX_BEGIN();
const int tid = omp_get_thread_num(); const int tid = omp_get_thread_num();
...@@ -1394,7 +1394,7 @@ int LGBM_DatasetCreateFromCSR(const void* indptr, ...@@ -1394,7 +1394,7 @@ int LGBM_DatasetCreateFromCSR(const void* indptr,
} }
} }
OMP_INIT_EX(); OMP_INIT_EX();
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int i = 0; i < nindptr - 1; ++i) { for (int i = 0; i < nindptr - 1; ++i) {
OMP_LOOP_EX_BEGIN(); OMP_LOOP_EX_BEGIN();
const int tid = omp_get_thread_num(); const int tid = omp_get_thread_num();
...@@ -1465,7 +1465,7 @@ int LGBM_DatasetCreateFromCSRFunc(void* get_row_funptr, ...@@ -1465,7 +1465,7 @@ int LGBM_DatasetCreateFromCSRFunc(void* get_row_funptr,
OMP_INIT_EX(); OMP_INIT_EX();
std::vector<std::pair<int, double>> thread_buffer; std::vector<std::pair<int, double>> thread_buffer;
#pragma omp parallel for schedule(static) private(thread_buffer) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) private(thread_buffer)
for (int i = 0; i < num_rows; ++i) { for (int i = 0; i < num_rows; ++i) {
OMP_LOOP_EX_BEGIN(); OMP_LOOP_EX_BEGIN();
{ {
...@@ -1506,7 +1506,7 @@ int LGBM_DatasetCreateFromCSC(const void* col_ptr, ...@@ -1506,7 +1506,7 @@ int LGBM_DatasetCreateFromCSC(const void* col_ptr,
std::vector<std::vector<double>> sample_values(ncol_ptr - 1); std::vector<std::vector<double>> sample_values(ncol_ptr - 1);
std::vector<std::vector<int>> sample_idx(ncol_ptr - 1); std::vector<std::vector<int>> sample_idx(ncol_ptr - 1);
OMP_INIT_EX(); OMP_INIT_EX();
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int i = 0; i < static_cast<int>(sample_values.size()); ++i) { for (int i = 0; i < static_cast<int>(sample_values.size()); ++i) {
OMP_LOOP_EX_BEGIN(); OMP_LOOP_EX_BEGIN();
CSC_RowIterator col_it(col_ptr, col_ptr_type, indices, data, data_type, ncol_ptr, nelem, i); CSC_RowIterator col_it(col_ptr, col_ptr_type, indices, data, data_type, ncol_ptr, nelem, i);
...@@ -1534,7 +1534,7 @@ int LGBM_DatasetCreateFromCSC(const void* col_ptr, ...@@ -1534,7 +1534,7 @@ int LGBM_DatasetCreateFromCSC(const void* col_ptr,
reinterpret_cast<const Dataset*>(reference)); reinterpret_cast<const Dataset*>(reference));
} }
OMP_INIT_EX(); OMP_INIT_EX();
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int i = 0; i < ncol_ptr - 1; ++i) { for (int i = 0; i < ncol_ptr - 1; ++i) {
OMP_LOOP_EX_BEGIN(); OMP_LOOP_EX_BEGIN();
const int tid = omp_get_thread_num(); const int tid = omp_get_thread_num();
......
...@@ -536,7 +536,7 @@ MultiValBin* Dataset::GetMultiBinFromSparseFeatures(const std::vector<uint32_t>& ...@@ -536,7 +536,7 @@ MultiValBin* Dataset::GetMultiBinFromSparseFeatures(const std::vector<uint32_t>&
std::vector<uint32_t> most_freq_bins; std::vector<uint32_t> most_freq_bins;
double sum_sparse_rate = 0; double sum_sparse_rate = 0;
for (int i = 0; i < num_feature; ++i) { for (int i = 0; i < num_feature; ++i) {
#pragma omp parallel for schedule(static, 1) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 1)
for (int tid = 0; tid < num_threads; ++tid) { for (int tid = 0; tid < num_threads; ++tid) {
iters[tid].emplace_back( iters[tid].emplace_back(
feature_groups_[multi_group_id]->SubFeatureIterator(i)); feature_groups_[multi_group_id]->SubFeatureIterator(i));
...@@ -584,7 +584,7 @@ MultiValBin* Dataset::GetMultiBinFromAllFeatures(const std::vector<uint32_t>& of ...@@ -584,7 +584,7 @@ MultiValBin* Dataset::GetMultiBinFromAllFeatures(const std::vector<uint32_t>& of
for (int fid = 0; fid < feature_groups_[gid]->num_feature_; ++fid) { for (int fid = 0; fid < feature_groups_[gid]->num_feature_; ++fid) {
const auto& bin_mapper = feature_groups_[gid]->bin_mappers_[fid]; const auto& bin_mapper = feature_groups_[gid]->bin_mappers_[fid];
most_freq_bins.push_back(bin_mapper->GetMostFreqBin()); most_freq_bins.push_back(bin_mapper->GetMostFreqBin());
#pragma omp parallel for schedule(static, 1) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 1)
for (int tid = 0; tid < num_threads; ++tid) { for (int tid = 0; tid < num_threads; ++tid) {
iters[tid].emplace_back( iters[tid].emplace_back(
feature_groups_[gid]->SubFeatureIterator(fid)); feature_groups_[gid]->SubFeatureIterator(fid));
...@@ -823,7 +823,7 @@ void Dataset::ReSize(data_size_t num_data) { ...@@ -823,7 +823,7 @@ void Dataset::ReSize(data_size_t num_data) {
if (num_data_ != num_data) { if (num_data_ != num_data) {
num_data_ = num_data; num_data_ = num_data;
OMP_INIT_EX(); OMP_INIT_EX();
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int group = 0; group < num_groups_; ++group) { for (int group = 0; group < num_groups_; ++group) {
OMP_LOOP_EX_BEGIN(); OMP_LOOP_EX_BEGIN();
feature_groups_[group]->ReSize(num_data_); feature_groups_[group]->ReSize(num_data_);
...@@ -856,7 +856,7 @@ void Dataset::CopySubrow(const Dataset* fullset, ...@@ -856,7 +856,7 @@ void Dataset::CopySubrow(const Dataset* fullset,
int num_copy_tasks = static_cast<int>(group_ids.size()); int num_copy_tasks = static_cast<int>(group_ids.size());
OMP_INIT_EX(); OMP_INIT_EX();
#pragma omp parallel for schedule(dynamic) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(dynamic)
for (int task_id = 0; task_id < num_copy_tasks; ++task_id) { for (int task_id = 0; task_id < num_copy_tasks; ++task_id) {
OMP_LOOP_EX_BEGIN(); OMP_LOOP_EX_BEGIN();
int group = group_ids[task_id]; int group = group_ids[task_id];
...@@ -875,7 +875,7 @@ void Dataset::CopySubrow(const Dataset* fullset, ...@@ -875,7 +875,7 @@ void Dataset::CopySubrow(const Dataset* fullset,
num_numeric_features_ = fullset->num_numeric_features_; num_numeric_features_ = fullset->num_numeric_features_;
if (has_raw_) { if (has_raw_) {
ResizeRaw(num_used_indices); ResizeRaw(num_used_indices);
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int i = 0; i < num_used_indices; ++i) { for (int i = 0; i < num_used_indices; ++i) {
for (int j = 0; j < num_numeric_features_; ++j) { for (int j = 0; j < num_numeric_features_; ++j) {
raw_data_[j][i] = fullset->raw_data_[j][used_indices[i]]; raw_data_[j][i] = fullset->raw_data_[j][used_indices[i]];
...@@ -1282,7 +1282,7 @@ void Dataset::ConstructHistogramsInner( ...@@ -1282,7 +1282,7 @@ void Dataset::ConstructHistogramsInner(
int16_t* ordered_gradients_and_hessians = reinterpret_cast<int16_t*>(ordered_gradients); int16_t* ordered_gradients_and_hessians = reinterpret_cast<int16_t*>(ordered_gradients);
const int16_t* gradients_and_hessians = reinterpret_cast<const int16_t*>(gradients); const int16_t* gradients_and_hessians = reinterpret_cast<const int16_t*>(gradients);
if (USE_INDICES) { if (USE_INDICES) {
#pragma omp parallel for schedule(static, 512) if (num_data >= 1024) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_data >= 1024)
for (data_size_t i = 0; i < num_data; ++i) { for (data_size_t i = 0; i < num_data; ++i) {
ordered_gradients_and_hessians[i] = gradients_and_hessians[data_indices[i]]; ordered_gradients_and_hessians[i] = gradients_and_hessians[data_indices[i]];
} }
...@@ -1292,7 +1292,7 @@ void Dataset::ConstructHistogramsInner( ...@@ -1292,7 +1292,7 @@ void Dataset::ConstructHistogramsInner(
} else { } else {
if (USE_INDICES) { if (USE_INDICES) {
if (USE_HESSIAN) { if (USE_HESSIAN) {
#pragma omp parallel for schedule(static, 512) if (num_data >= 1024) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_data >= 1024)
for (data_size_t i = 0; i < num_data; ++i) { for (data_size_t i = 0; i < num_data; ++i) {
ordered_gradients[i] = gradients[data_indices[i]]; ordered_gradients[i] = gradients[data_indices[i]];
ordered_hessians[i] = hessians[data_indices[i]]; ordered_hessians[i] = hessians[data_indices[i]];
...@@ -1300,7 +1300,7 @@ void Dataset::ConstructHistogramsInner( ...@@ -1300,7 +1300,7 @@ void Dataset::ConstructHistogramsInner(
ptr_ordered_grad = ordered_gradients; ptr_ordered_grad = ordered_gradients;
ptr_ordered_hess = ordered_hessians; ptr_ordered_hess = ordered_hessians;
} else { } else {
#pragma omp parallel for schedule(static, 512) if (num_data >= 1024) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_data >= 1024)
for (data_size_t i = 0; i < num_data; ++i) { for (data_size_t i = 0; i < num_data; ++i) {
ordered_gradients[i] = gradients[data_indices[i]]; ordered_gradients[i] = gradients[data_indices[i]];
} }
......
...@@ -625,7 +625,7 @@ Dataset* DatasetLoader::ConstructFromSampleData(double** sample_values, ...@@ -625,7 +625,7 @@ Dataset* DatasetLoader::ConstructFromSampleData(double** sample_values,
if (Network::num_machines() == 1) { if (Network::num_machines() == 1) {
// if only one machine, find bin locally // if only one machine, find bin locally
OMP_INIT_EX(); OMP_INIT_EX();
#pragma omp parallel for schedule(guided) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(guided)
for (int i = 0; i < num_col; ++i) { for (int i = 0; i < num_col; ++i) {
OMP_LOOP_EX_BEGIN(); OMP_LOOP_EX_BEGIN();
if (ignore_features_.count(i) > 0) { if (ignore_features_.count(i) > 0) {
...@@ -674,7 +674,7 @@ Dataset* DatasetLoader::ConstructFromSampleData(double** sample_values, ...@@ -674,7 +674,7 @@ Dataset* DatasetLoader::ConstructFromSampleData(double** sample_values,
} }
len[num_machines - 1] = num_total_features - start[num_machines - 1]; len[num_machines - 1] = num_total_features - start[num_machines - 1];
OMP_INIT_EX(); OMP_INIT_EX();
#pragma omp parallel for schedule(guided) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(guided)
for (int i = 0; i < len[rank]; ++i) { for (int i = 0; i < len[rank]; ++i) {
OMP_LOOP_EX_BEGIN(); OMP_LOOP_EX_BEGIN();
if (ignore_features_.count(start[rank] + i) > 0) { if (ignore_features_.count(start[rank] + i) > 0) {
...@@ -1136,7 +1136,7 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines, ...@@ -1136,7 +1136,7 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
if (num_machines == 1) { if (num_machines == 1) {
// if only one machine, find bin locally // if only one machine, find bin locally
OMP_INIT_EX(); OMP_INIT_EX();
#pragma omp parallel for schedule(guided) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(guided)
for (int i = 0; i < static_cast<int>(sample_values.size()); ++i) { for (int i = 0; i < static_cast<int>(sample_values.size()); ++i) {
OMP_LOOP_EX_BEGIN(); OMP_LOOP_EX_BEGIN();
if (ignore_features_.count(i) > 0) { if (ignore_features_.count(i) > 0) {
...@@ -1177,7 +1177,7 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines, ...@@ -1177,7 +1177,7 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
} }
len[num_machines - 1] = dataset->num_total_features_ - start[num_machines - 1]; len[num_machines - 1] = dataset->num_total_features_ - start[num_machines - 1];
OMP_INIT_EX(); OMP_INIT_EX();
#pragma omp parallel for schedule(guided) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(guided)
for (int i = 0; i < len[rank]; ++i) { for (int i = 0; i < len[rank]; ++i) {
OMP_LOOP_EX_BEGIN(); OMP_LOOP_EX_BEGIN();
if (ignore_features_.count(start[rank] + i) > 0) { if (ignore_features_.count(start[rank] + i) > 0) {
...@@ -1268,7 +1268,7 @@ void DatasetLoader::ExtractFeaturesFromMemory(std::vector<std::string>* text_dat ...@@ -1268,7 +1268,7 @@ void DatasetLoader::ExtractFeaturesFromMemory(std::vector<std::string>* text_dat
if (!predict_fun_) { if (!predict_fun_) {
OMP_INIT_EX(); OMP_INIT_EX();
// if doesn't need to prediction with initial model // if doesn't need to prediction with initial model
#pragma omp parallel for schedule(static) private(oneline_features) firstprivate(tmp_label, feature_row) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) private(oneline_features) firstprivate(tmp_label, feature_row)
for (data_size_t i = 0; i < dataset->num_data_; ++i) { for (data_size_t i = 0; i < dataset->num_data_; ++i) {
OMP_LOOP_EX_BEGIN(); OMP_LOOP_EX_BEGIN();
const int tid = omp_get_thread_num(); const int tid = omp_get_thread_num();
...@@ -1319,7 +1319,7 @@ void DatasetLoader::ExtractFeaturesFromMemory(std::vector<std::string>* text_dat ...@@ -1319,7 +1319,7 @@ void DatasetLoader::ExtractFeaturesFromMemory(std::vector<std::string>* text_dat
OMP_INIT_EX(); OMP_INIT_EX();
// if need to prediction with initial model // if need to prediction with initial model
std::vector<double> init_score(static_cast<size_t>(dataset->num_data_) * num_class_); std::vector<double> init_score(static_cast<size_t>(dataset->num_data_) * num_class_);
#pragma omp parallel for schedule(static) private(oneline_features) firstprivate(tmp_label, feature_row) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) private(oneline_features) firstprivate(tmp_label, feature_row)
for (data_size_t i = 0; i < dataset->num_data_; ++i) { for (data_size_t i = 0; i < dataset->num_data_; ++i) {
OMP_LOOP_EX_BEGIN(); OMP_LOOP_EX_BEGIN();
const int tid = omp_get_thread_num(); const int tid = omp_get_thread_num();
...@@ -1394,7 +1394,7 @@ void DatasetLoader::ExtractFeaturesFromFile(const char* filename, const Parser* ...@@ -1394,7 +1394,7 @@ void DatasetLoader::ExtractFeaturesFromFile(const char* filename, const Parser*
double tmp_label = 0.0f; double tmp_label = 0.0f;
std::vector<float> feature_row(dataset->num_features_); std::vector<float> feature_row(dataset->num_features_);
OMP_INIT_EX(); OMP_INIT_EX();
#pragma omp parallel for schedule(static) private(oneline_features) firstprivate(tmp_label, feature_row) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) private(oneline_features) firstprivate(tmp_label, feature_row)
for (data_size_t i = 0; i < static_cast<data_size_t>(lines.size()); ++i) { for (data_size_t i = 0; i < static_cast<data_size_t>(lines.size()); ++i) {
OMP_LOOP_EX_BEGIN(); OMP_LOOP_EX_BEGIN();
const int tid = omp_get_thread_num(); const int tid = omp_get_thread_num();
......
...@@ -101,7 +101,7 @@ void Metadata::Init(const Metadata& fullset, const data_size_t* used_indices, da ...@@ -101,7 +101,7 @@ void Metadata::Init(const Metadata& fullset, const data_size_t* used_indices, da
num_data_ = num_used_indices; num_data_ = num_used_indices;
label_ = std::vector<label_t>(num_used_indices); label_ = std::vector<label_t>(num_used_indices);
#pragma omp parallel for schedule(static, 512) if (num_used_indices >= 1024) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_used_indices >= 1024)
for (data_size_t i = 0; i < num_used_indices; ++i) { for (data_size_t i = 0; i < num_used_indices; ++i) {
label_[i] = fullset.label_[used_indices[i]]; label_[i] = fullset.label_[used_indices[i]];
} }
...@@ -109,7 +109,7 @@ void Metadata::Init(const Metadata& fullset, const data_size_t* used_indices, da ...@@ -109,7 +109,7 @@ void Metadata::Init(const Metadata& fullset, const data_size_t* used_indices, da
if (!fullset.weights_.empty()) { if (!fullset.weights_.empty()) {
weights_ = std::vector<label_t>(num_used_indices); weights_ = std::vector<label_t>(num_used_indices);
num_weights_ = num_used_indices; num_weights_ = num_used_indices;
#pragma omp parallel for schedule(static, 512) if (num_used_indices >= 1024) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_used_indices >= 1024)
for (data_size_t i = 0; i < num_used_indices; ++i) { for (data_size_t i = 0; i < num_used_indices; ++i) {
weights_[i] = fullset.weights_[used_indices[i]]; weights_[i] = fullset.weights_[used_indices[i]];
} }
...@@ -121,7 +121,7 @@ void Metadata::Init(const Metadata& fullset, const data_size_t* used_indices, da ...@@ -121,7 +121,7 @@ void Metadata::Init(const Metadata& fullset, const data_size_t* used_indices, da
int num_class = static_cast<int>(fullset.num_init_score_ / fullset.num_data_); int num_class = static_cast<int>(fullset.num_init_score_ / fullset.num_data_);
init_score_ = std::vector<double>(static_cast<size_t>(num_used_indices) * num_class); init_score_ = std::vector<double>(static_cast<size_t>(num_used_indices) * num_class);
num_init_score_ = static_cast<int64_t>(num_used_indices) * num_class; num_init_score_ = static_cast<int64_t>(num_used_indices) * num_class;
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int k = 0; k < num_class; ++k) { for (int k = 0; k < num_class; ++k) {
const size_t offset_dest = static_cast<size_t>(k) * num_data_; const size_t offset_dest = static_cast<size_t>(k) * num_data_;
const size_t offset_src = static_cast<size_t>(k) * fullset.num_data_; const size_t offset_src = static_cast<size_t>(k) * fullset.num_data_;
...@@ -173,7 +173,7 @@ void Metadata::PartitionLabel(const std::vector<data_size_t>& used_indices) { ...@@ -173,7 +173,7 @@ void Metadata::PartitionLabel(const std::vector<data_size_t>& used_indices) {
auto old_label = label_; auto old_label = label_;
num_data_ = static_cast<data_size_t>(used_indices.size()); num_data_ = static_cast<data_size_t>(used_indices.size());
label_ = std::vector<label_t>(num_data_); label_ = std::vector<label_t>(num_data_);
#pragma omp parallel for schedule(static, 512) if (num_data_ >= 1024) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_data_ >= 1024)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
label_[i] = old_label[used_indices[i]]; label_[i] = old_label[used_indices[i]];
} }
...@@ -255,7 +255,7 @@ void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector<data ...@@ -255,7 +255,7 @@ void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector<data
auto old_weights = weights_; auto old_weights = weights_;
num_weights_ = num_data_; num_weights_ = num_data_;
weights_ = std::vector<label_t>(num_data_); weights_ = std::vector<label_t>(num_data_);
#pragma omp parallel for schedule(static, 512) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512)
for (int i = 0; i < static_cast<int>(used_data_indices.size()); ++i) { for (int i = 0; i < static_cast<int>(used_data_indices.size()); ++i) {
weights_[i] = old_weights[used_data_indices[i]]; weights_[i] = old_weights[used_data_indices[i]];
} }
...@@ -274,7 +274,7 @@ void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector<data ...@@ -274,7 +274,7 @@ void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector<data
auto old_positions = positions_; auto old_positions = positions_;
num_positions_ = num_data_; num_positions_ = num_data_;
positions_ = std::vector<data_size_t>(num_data_); positions_ = std::vector<data_size_t>(num_data_);
#pragma omp parallel for schedule(static, 512) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512)
for (int i = 0; i < static_cast<int>(used_data_indices.size()); ++i) { for (int i = 0; i < static_cast<int>(used_data_indices.size()); ++i) {
positions_[i] = old_positions[used_data_indices[i]]; positions_[i] = old_positions[used_data_indices[i]];
} }
...@@ -335,7 +335,7 @@ void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector<data ...@@ -335,7 +335,7 @@ void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector<data
int num_class = static_cast<int>(num_init_score_ / num_all_data); int num_class = static_cast<int>(num_init_score_ / num_all_data);
num_init_score_ = static_cast<int64_t>(num_data_) * num_class; num_init_score_ = static_cast<int64_t>(num_data_) * num_class;
init_score_ = std::vector<double>(num_init_score_); init_score_ = std::vector<double>(num_init_score_);
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int k = 0; k < num_class; ++k) { for (int k = 0; k < num_class; ++k) {
const size_t offset_dest = static_cast<size_t>(k) * num_data_; const size_t offset_dest = static_cast<size_t>(k) * num_data_;
const size_t offset_src = static_cast<size_t>(k) * num_all_data; const size_t offset_src = static_cast<size_t>(k) * num_all_data;
...@@ -369,7 +369,7 @@ void Metadata::SetInitScore(const double* init_score, data_size_t len) { ...@@ -369,7 +369,7 @@ void Metadata::SetInitScore(const double* init_score, data_size_t len) {
if (init_score_.empty()) { init_score_.resize(len); } if (init_score_.empty()) { init_score_.resize(len); }
num_init_score_ = len; num_init_score_ = len;
#pragma omp parallel for schedule(static, 512) if (num_init_score_ >= 1024) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_init_score_ >= 1024)
for (int64_t i = 0; i < num_init_score_; ++i) { for (int64_t i = 0; i < num_init_score_; ++i) {
init_score_[i] = Common::AvoidInf(init_score[i]); init_score_[i] = Common::AvoidInf(init_score[i]);
} }
...@@ -413,7 +413,7 @@ void Metadata::SetLabel(const label_t* label, data_size_t len) { ...@@ -413,7 +413,7 @@ void Metadata::SetLabel(const label_t* label, data_size_t len) {
} }
if (label_.empty()) { label_.resize(num_data_); } if (label_.empty()) { label_.resize(num_data_); }
#pragma omp parallel for schedule(static, 512) if (num_data_ >= 1024) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_data_ >= 1024)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
label_[i] = Common::AvoidInf(label[i]); label_[i] = Common::AvoidInf(label[i]);
} }
...@@ -452,7 +452,7 @@ void Metadata::SetWeights(const label_t* weights, data_size_t len) { ...@@ -452,7 +452,7 @@ void Metadata::SetWeights(const label_t* weights, data_size_t len) {
if (weights_.empty()) { weights_.resize(num_data_); } if (weights_.empty()) { weights_.resize(num_data_); }
num_weights_ = num_data_; num_weights_ = num_data_;
#pragma omp parallel for schedule(static, 512) if (num_weights_ >= 1024) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_weights_ >= 1024)
for (data_size_t i = 0; i < num_weights_; ++i) { for (data_size_t i = 0; i < num_weights_; ++i) {
weights_[i] = Common::AvoidInf(weights[i]); weights_[i] = Common::AvoidInf(weights[i]);
} }
...@@ -492,7 +492,7 @@ void Metadata::SetQuery(const data_size_t* query, data_size_t len) { ...@@ -492,7 +492,7 @@ void Metadata::SetQuery(const data_size_t* query, data_size_t len) {
return; return;
} }
data_size_t sum = 0; data_size_t sum = 0;
#pragma omp parallel for schedule(static) reduction(+:sum) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum)
for (data_size_t i = 0; i < len; ++i) { for (data_size_t i = 0; i < len; ++i) {
sum += query[i]; sum += query[i];
} }
...@@ -554,7 +554,7 @@ void Metadata::SetPosition(const data_size_t* positions, data_size_t len) { ...@@ -554,7 +554,7 @@ void Metadata::SetPosition(const data_size_t* positions, data_size_t len) {
Log::Debug("number of unique positions found = %ld", position_ids_.size()); Log::Debug("number of unique positions found = %ld", position_ids_.size());
#pragma omp parallel for schedule(static, 512) if (num_positions_ >= 1024) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_positions_ >= 1024)
for (data_size_t i = 0; i < num_positions_; ++i) { for (data_size_t i = 0; i < num_positions_; ++i) {
positions_[i] = map_id2pos.at(positions[i]); positions_[i] = map_id2pos.at(positions[i]);
} }
...@@ -590,7 +590,7 @@ void Metadata::LoadWeights() { ...@@ -590,7 +590,7 @@ void Metadata::LoadWeights() {
Log::Info("Loading weights..."); Log::Info("Loading weights...");
num_weights_ = static_cast<data_size_t>(reader.Lines().size()); num_weights_ = static_cast<data_size_t>(reader.Lines().size());
weights_ = std::vector<label_t>(num_weights_); weights_ = std::vector<label_t>(num_weights_);
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_weights_; ++i) { for (data_size_t i = 0; i < num_weights_; ++i) {
double tmp_weight = 0.0f; double tmp_weight = 0.0f;
Common::Atof(reader.Lines()[i].c_str(), &tmp_weight); Common::Atof(reader.Lines()[i].c_str(), &tmp_weight);
...@@ -645,7 +645,7 @@ void Metadata::LoadInitialScore(const std::string& data_filename) { ...@@ -645,7 +645,7 @@ void Metadata::LoadInitialScore(const std::string& data_filename) {
init_score_ = std::vector<double>(num_init_score_); init_score_ = std::vector<double>(num_init_score_);
if (num_class == 1) { if (num_class == 1) {
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_line; ++i) { for (data_size_t i = 0; i < num_line; ++i) {
double tmp = 0.0f; double tmp = 0.0f;
Common::Atof(reader.Lines()[i].c_str(), &tmp); Common::Atof(reader.Lines()[i].c_str(), &tmp);
...@@ -653,7 +653,7 @@ void Metadata::LoadInitialScore(const std::string& data_filename) { ...@@ -653,7 +653,7 @@ void Metadata::LoadInitialScore(const std::string& data_filename) {
} }
} else { } else {
std::vector<std::string> oneline_init_score; std::vector<std::string> oneline_init_score;
#pragma omp parallel for schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (data_size_t i = 0; i < num_line; ++i) { for (data_size_t i = 0; i < num_line; ++i) {
double tmp = 0.0f; double tmp = 0.0f;
oneline_init_score = Common::Split(reader.Lines()[i].c_str(), '\t'); oneline_init_score = Common::Split(reader.Lines()[i].c_str(), '\t');
......
...@@ -271,7 +271,7 @@ class MultiValDenseBin : public MultiValBin { ...@@ -271,7 +271,7 @@ class MultiValDenseBin : public MultiValBin {
data_size_t block_size = num_data_; data_size_t block_size = num_data_;
Threading::BlockInfo<data_size_t>(num_data_, 1024, &n_block, Threading::BlockInfo<data_size_t>(num_data_, 1024, &n_block,
&block_size); &block_size);
#pragma omp parallel for schedule(static, 1) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 1)
for (int tid = 0; tid < n_block; ++tid) { for (int tid = 0; tid < n_block; ++tid) {
data_size_t start = tid * block_size; data_size_t start = tid * block_size;
data_size_t end = std::min(num_data_, start + block_size); data_size_t end = std::min(num_data_, start + block_size);
......
...@@ -85,7 +85,7 @@ class MultiValSparseBin : public MultiValBin { ...@@ -85,7 +85,7 @@ class MultiValSparseBin : public MultiValBin {
offsets[tid + 1] = offsets[tid] + sizes[tid + 1]; offsets[tid + 1] = offsets[tid] + sizes[tid + 1];
} }
data_.resize(row_ptr_[num_data_]); data_.resize(row_ptr_[num_data_]);
#pragma omp parallel for schedule(static, 1) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 1)
for (int tid = 0; tid < static_cast<int>(t_data_.size()); ++tid) { for (int tid = 0; tid < static_cast<int>(t_data_.size()); ++tid) {
std::copy_n(t_data_[tid].data(), sizes[tid + 1], std::copy_n(t_data_[tid].data(), sizes[tid + 1],
data_.data() + offsets[tid]); data_.data() + offsets[tid]);
...@@ -344,7 +344,7 @@ class MultiValSparseBin : public MultiValBin { ...@@ -344,7 +344,7 @@ class MultiValSparseBin : public MultiValBin {
num_data_, 1024, &n_block, &block_size); num_data_, 1024, &n_block, &block_size);
std::vector<INDEX_T> sizes(t_data_.size() + 1, 0); std::vector<INDEX_T> sizes(t_data_.size() + 1, 0);
const int pre_alloc_size = 50; const int pre_alloc_size = 50;
#pragma omp parallel for schedule(static, 1) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 1)
for (int tid = 0; tid < n_block; ++tid) { for (int tid = 0; tid < n_block; ++tid) {
data_size_t start = tid * block_size; data_size_t start = tid * block_size;
data_size_t end = std::min(num_data_, start + block_size); data_size_t end = std::min(num_data_, start + block_size);
......
...@@ -56,7 +56,7 @@ void MultiValBinWrapper::HistMove(const std::vector<hist_t, ...@@ -56,7 +56,7 @@ void MultiValBinWrapper::HistMove(const std::vector<hist_t,
if (HIST_BITS == 32) { if (HIST_BITS == 32) {
const int64_t* src = reinterpret_cast<const int64_t*>(hist_buf.data()) + hist_buf.size() / 2 - const int64_t* src = reinterpret_cast<const int64_t*>(hist_buf.data()) + hist_buf.size() / 2 -
static_cast<size_t>(num_bin_aligned_); static_cast<size_t>(num_bin_aligned_);
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static) num_threads(num_threads_)
for (int i = 0; i < static_cast<int>(hist_move_src_.size()); ++i) { for (int i = 0; i < static_cast<int>(hist_move_src_.size()); ++i) {
std::copy_n(src + hist_move_src_[i] / 2, hist_move_size_[i] / 2, std::copy_n(src + hist_move_src_[i] / 2, hist_move_size_[i] / 2,
reinterpret_cast<int64_t*>(origin_hist_data_) + hist_move_dest_[i] / 2); reinterpret_cast<int64_t*>(origin_hist_data_) + hist_move_dest_[i] / 2);
...@@ -65,14 +65,14 @@ void MultiValBinWrapper::HistMove(const std::vector<hist_t, ...@@ -65,14 +65,14 @@ void MultiValBinWrapper::HistMove(const std::vector<hist_t,
const int32_t* src = reinterpret_cast<const int32_t*>(hist_buf.data()) + hist_buf.size() / 2 - const int32_t* src = reinterpret_cast<const int32_t*>(hist_buf.data()) + hist_buf.size() / 2 -
static_cast<size_t>(num_bin_aligned_); static_cast<size_t>(num_bin_aligned_);
if (is_use_subcol_) { if (is_use_subcol_) {
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static) num_threads(num_threads_)
for (int i = 0; i < static_cast<int>(hist_move_src_.size()); ++i) { for (int i = 0; i < static_cast<int>(hist_move_src_.size()); ++i) {
std::copy_n(src + hist_move_src_[i] / 2, hist_move_size_[i] / 2, std::copy_n(src + hist_move_src_[i] / 2, hist_move_size_[i] / 2,
reinterpret_cast<int32_t*>(origin_hist_data_) + hist_move_dest_[i] / 2); reinterpret_cast<int32_t*>(origin_hist_data_) + hist_move_dest_[i] / 2);
} }
} else { } else {
int32_t* orig_ptr = reinterpret_cast<int32_t*>(origin_hist_data_); int32_t* orig_ptr = reinterpret_cast<int32_t*>(origin_hist_data_);
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static) num_threads(num_threads_)
for (int i = 0; i < num_bin_; ++i) { for (int i = 0; i < num_bin_; ++i) {
orig_ptr[i] = src[i]; orig_ptr[i] = src[i];
} }
...@@ -81,7 +81,7 @@ void MultiValBinWrapper::HistMove(const std::vector<hist_t, ...@@ -81,7 +81,7 @@ void MultiValBinWrapper::HistMove(const std::vector<hist_t,
} else { } else {
const hist_t* src = hist_buf.data() + hist_buf.size() - const hist_t* src = hist_buf.data() + hist_buf.size() -
2 * static_cast<size_t>(num_bin_aligned_); 2 * static_cast<size_t>(num_bin_aligned_);
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static) num_threads(num_threads_)
for (int i = 0; i < static_cast<int>(hist_move_src_.size()); ++i) { for (int i = 0; i < static_cast<int>(hist_move_src_.size()); ++i) {
std::copy_n(src + hist_move_src_[i], hist_move_size_[i], std::copy_n(src + hist_move_src_[i], hist_move_size_[i],
origin_hist_data_ + hist_move_dest_[i]); origin_hist_data_ + hist_move_dest_[i]);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment