Unverified Commit 0df10887 authored by James Lamb's avatar James Lamb Committed by GitHub
Browse files

[ci] [c++] use 'pre-commit' to run 'cpplint', upgrade to 'cpplint' 2.0.2 (#7002)

* [ci] [c++] use 'pre-commit' to run 'cpplint', upgrade to 'cpplint' 2.0.2

* remove bashisms

* one more pipefail use

* another pipefail
parent d9bdda6c
#!/bin/bash #!/bin/bash
set -e -E -u -o pipefail set -e -u
echo "running cpplint"
cpplint \
--filter=-build/c++11,-build/include_subdir,-build/header_guard,-whitespace/line_length \
--recursive ./src ./include ./R-package ./swig ./tests \
|| exit 1
echo "done running cpplint"
echo "checking that all OpenMP pragmas specify num_threads()" echo "checking that all OpenMP pragmas specify num_threads()"
get_omp_pragmas_without_num_threads() { get_omp_pragmas_without_num_threads() {
...@@ -28,11 +21,11 @@ get_omp_pragmas_without_num_threads() { ...@@ -28,11 +21,11 @@ get_omp_pragmas_without_num_threads() {
# consider this a failure and stop execution of the script. # consider this a failure and stop execution of the script.
# #
# ref: https://www.gnu.org/software/grep/manual/html_node/Exit-Status.html # ref: https://www.gnu.org/software/grep/manual/html_node/Exit-Status.html
set +e +o pipefail set +e
PROBLEMATIC_LINES=$( PROBLEMATIC_LINES=$(
get_omp_pragmas_without_num_threads get_omp_pragmas_without_num_threads
) )
set -e -o pipefail set -e
if test "${PROBLEMATIC_LINES}" != ""; then if test "${PROBLEMATIC_LINES}" != ""; then
get_omp_pragmas_without_num_threads get_omp_pragmas_without_num_threads
echo "Found '#pragma omp parallel' not using explicit num_threads() configuration. Fix those." echo "Found '#pragma omp parallel' not using explicit num_threads() configuration. Fix those."
......
...@@ -105,7 +105,6 @@ if [[ $TASK == "lint" ]]; then ...@@ -105,7 +105,6 @@ if [[ $TASK == "lint" ]]; then
conda create -q -y -n "${CONDA_ENV}" \ conda create -q -y -n "${CONDA_ENV}" \
"${CONDA_PYTHON_REQUIREMENT}" \ "${CONDA_PYTHON_REQUIREMENT}" \
'biome>=1.9.3' \ 'biome>=1.9.3' \
'cpplint>=1.6.0' \
'matplotlib-base>=3.9.1' \ 'matplotlib-base>=3.9.1' \
'mypy>=1.11.1' \ 'mypy>=1.11.1' \
'pre-commit>=3.8.0' \ 'pre-commit>=3.8.0' \
...@@ -118,8 +117,6 @@ if [[ $TASK == "lint" ]]; then ...@@ -118,8 +117,6 @@ if [[ $TASK == "lint" ]]; then
bash ./.ci/run-pre-commit-mypy.sh || exit 1 bash ./.ci/run-pre-commit-mypy.sh || exit 1
echo "Linting R code" echo "Linting R code"
Rscript ./.ci/lint-r-code.R "${BUILD_DIRECTORY}" || exit 1 Rscript ./.ci/lint-r-code.R "${BUILD_DIRECTORY}" || exit 1
echo "Linting C++ code"
bash ./.ci/lint-cpp.sh || exit 1
echo "Linting JavaScript code" echo "Linting JavaScript code"
bash ./.ci/lint-js.sh || exit 1 bash ./.ci/lint-js.sh || exit 1
exit 0 exit 0
......
...@@ -22,6 +22,22 @@ repos: ...@@ -22,6 +22,22 @@ repos:
hooks: hooks:
- id: cmakelint - id: cmakelint
args: ["--linelength=120", "--filter=-convention/filename,-package/stdargs,-readability/wonkycase"] args: ["--linelength=120", "--filter=-convention/filename,-package/stdargs,-readability/wonkycase"]
- repo: https://github.com/cpplint/cpplint
rev: '2.0.2'
hooks:
- id: cpplint
args:
- --recursive
- --filter=-build/c++11,-build/include_subdir,-build/include_what_you_use,-build/header_guard,-whitespace/indent_namespace,-whitespace/line_length
- repo: local
hooks:
- id: check-omp-pragmas
name: check-omp-pragmas
entry: sh
args:
- .ci/check-omp-pragmas.sh
language: system
pass_filenames: false
- repo: https://github.com/adrienverge/yamllint - repo: https://github.com/adrienverge/yamllint
rev: v1.37.1 rev: v1.37.1
hooks: hooks:
......
...@@ -554,9 +554,13 @@ class Dataset { ...@@ -554,9 +554,13 @@ class Dataset {
} }
inline void FinishOneRow(int tid, data_size_t row_idx, const std::vector<bool>& is_feature_added) { inline void FinishOneRow(int tid, data_size_t row_idx, const std::vector<bool>& is_feature_added) {
if (is_finish_load_) { return; } if (is_finish_load_) {
return;
}
for (auto fidx : feature_need_push_zeros_) { for (auto fidx : feature_need_push_zeros_) {
if (is_feature_added[fidx]) { continue; } if (is_feature_added[fidx]) {
continue;
}
const int group = feature2group_[fidx]; const int group = feature2group_[fidx];
const int sub_feature = feature2subfeature_[fidx]; const int sub_feature = feature2subfeature_[fidx];
feature_groups_[group]->PushData(tid, sub_feature, row_idx, 0.0f); feature_groups_[group]->PushData(tid, sub_feature, row_idx, 0.0f);
...@@ -587,10 +591,14 @@ class Dataset { ...@@ -587,10 +591,14 @@ class Dataset {
} }
inline void PushOneRow(int tid, data_size_t row_idx, const std::vector<std::pair<int, double>>& feature_values) { inline void PushOneRow(int tid, data_size_t row_idx, const std::vector<std::pair<int, double>>& feature_values) {
if (is_finish_load_) { return; } if (is_finish_load_) {
return;
}
std::vector<bool> is_feature_added(num_features_, false); std::vector<bool> is_feature_added(num_features_, false);
for (auto& inner_data : feature_values) { for (auto& inner_data : feature_values) {
if (inner_data.first >= num_total_features_) { continue; } if (inner_data.first >= num_total_features_) {
continue;
}
int feature_idx = used_feature_map_[inner_data.first]; int feature_idx = used_feature_map_[inner_data.first];
if (feature_idx >= 0) { if (feature_idx >= 0) {
is_feature_added[feature_idx] = true; is_feature_added[feature_idx] = true;
......
...@@ -112,17 +112,33 @@ class ArrayArgs { ...@@ -112,17 +112,33 @@ class ArrayArgs {
VAL_T v = ref[end - 1]; VAL_T v = ref[end - 1];
for (;;) { for (;;) {
while (ref[++i] > v) {} while (ref[++i] > v) {}
while (v > ref[--j]) { if (j == start) { break; } } while (v > ref[--j]) {
if (i >= j) { break; } if (j == start) {
break;
}
}
if (i >= j) {
break;
}
std::swap(ref[i], ref[j]); std::swap(ref[i], ref[j]);
if (ref[i] == v) { p++; std::swap(ref[p], ref[i]); } if (ref[i] == v) {
if (v == ref[j]) { q--; std::swap(ref[j], ref[q]); } p++;
std::swap(ref[p], ref[i]);
}
if (v == ref[j]) {
q--;
std::swap(ref[j], ref[q]);
}
} }
std::swap(ref[i], ref[end - 1]); std::swap(ref[i], ref[end - 1]);
j = i - 1; j = i - 1;
i = i + 1; i = i + 1;
for (int k = start; k <= p; k++, j--) { std::swap(ref[k], ref[j]); } for (int k = start; k <= p; k++, j--) {
for (int k = end - 2; k >= q; k--, i++) { std::swap(ref[i], ref[k]); } std::swap(ref[k], ref[j]);
}
for (int k = end - 2; k >= q; k--, i++) {
std::swap(ref[i], ref[k]);
}
*l = j; *l = j;
*r = i; *r = i;
} }
......
...@@ -315,9 +315,18 @@ inline static const char* Atof(const char* p, double* out) { ...@@ -315,9 +315,18 @@ inline static const char* Atof(const char* p, double* out) {
} }
if (expon > 308) expon = 308; if (expon > 308) expon = 308;
// Calculate scaling factor. // Calculate scaling factor.
while (expon >= 50) { scale *= 1E50; expon -= 50; } while (expon >= 50) {
while (expon >= 8) { scale *= 1E8; expon -= 8; } scale *= 1E50;
while (expon > 0) { scale *= 10.0; expon -= 1; } expon -= 50;
}
while (expon >= 8) {
scale *= 1E8;
expon -= 8;
}
while (expon > 0) {
scale *= 10.0;
expon -= 1;
}
} }
// Return signed and scaled floating point result. // Return signed and scaled floating point result.
*out = sign * (frac ? (value / scale) : (value * scale)); *out = sign * (frac ? (value / scale) : (value * scale));
...@@ -713,7 +722,9 @@ static void ParallelSort(_RanIt _First, _RanIt _Last, _Pr _Pred, _VTRanIt*) { ...@@ -713,7 +722,9 @@ static void ParallelSort(_RanIt _First, _RanIt _Last, _Pr _Pred, _VTRanIt*) {
size_t mid = left + s; size_t mid = left + s;
size_t right = mid + s; size_t right = mid + s;
right = std::min(len, right); right = std::min(len, right);
if (mid >= right) { continue; } if (mid >= right) {
continue;
}
std::copy(_First + left, _First + mid, buf + left); std::copy(_First + left, _First + mid, buf + left);
std::merge(buf + left, buf + mid, _First + mid, _First + right, _First + left, _Pred); std::merge(buf + left, buf + mid, _First + mid, _First + right, _First + left, _Pred);
} }
......
...@@ -62,9 +62,13 @@ class ThreadExceptionHelper { ...@@ -62,9 +62,13 @@ class ThreadExceptionHelper {
} }
void CaptureException() { void CaptureException() {
// only catch first exception. // only catch first exception.
if (ex_ptr_ != nullptr) { return; } if (ex_ptr_ != nullptr) {
return;
}
std::unique_lock<std::mutex> guard(lock_); std::unique_lock<std::mutex> guard(lock_);
if (ex_ptr_ != nullptr) { return; } if (ex_ptr_ != nullptr) {
return;
}
ex_ptr_ = std::current_exception(); ex_ptr_ = std::current_exception();
} }
......
...@@ -124,7 +124,9 @@ class TextReader { ...@@ -124,7 +124,9 @@ class TextReader {
++i; ++i;
++total_cnt; ++total_cnt;
// skip end of line // skip end of line
while ((buffer_process[i] == '\n' || buffer_process[i] == '\r') && i < read_cnt) { ++i; } while ((buffer_process[i] == '\n' || buffer_process[i] == '\r') && i < read_cnt) {
++i;
}
last_i = i; last_i = i;
} else { } else {
++i; ++i;
...@@ -284,7 +286,9 @@ class TextReader { ...@@ -284,7 +286,9 @@ class TextReader {
++i; ++i;
++total_cnt; ++total_cnt;
// skip end of line // skip end of line
while ((buffer_process[i] == '\n' || buffer_process[i] == '\r') && i < read_cnt) { ++i; } while ((buffer_process[i] == '\n' || buffer_process[i] == '\r') && i < read_cnt) {
++i;
}
last_i = i; last_i = i;
} else { } else {
++i; ++i;
......
...@@ -121,7 +121,9 @@ void Application::LoadData() { ...@@ -121,7 +121,9 @@ void Application::LoadData() {
if (config_.is_provide_training_metric) { if (config_.is_provide_training_metric) {
for (auto metric_type : config_.metric) { for (auto metric_type : config_.metric) {
auto metric = std::unique_ptr<Metric>(Metric::CreateMetric(metric_type, config_)); auto metric = std::unique_ptr<Metric>(Metric::CreateMetric(metric_type, config_));
if (metric == nullptr) { continue; } if (metric == nullptr) {
continue;
}
metric->Init(train_data_->metadata(), train_data_->num_data()); metric->Init(train_data_->metadata(), train_data_->num_data());
train_metric_.push_back(std::move(metric)); train_metric_.push_back(std::move(metric));
} }
...@@ -149,7 +151,9 @@ void Application::LoadData() { ...@@ -149,7 +151,9 @@ void Application::LoadData() {
valid_metrics_.emplace_back(); valid_metrics_.emplace_back();
for (auto metric_type : config_.metric) { for (auto metric_type : config_.metric) {
auto metric = std::unique_ptr<Metric>(Metric::CreateMetric(metric_type, config_)); auto metric = std::unique_ptr<Metric>(Metric::CreateMetric(metric_type, config_));
if (metric == nullptr) { continue; } if (metric == nullptr) {
continue;
}
metric->Init(valid_datas_.back()->metadata(), metric->Init(valid_datas_.back()->metadata(),
valid_datas_.back()->num_data()); valid_datas_.back()->num_data());
valid_metrics_.back().push_back(std::move(metric)); valid_metrics_.back().push_back(std::move(metric));
......
...@@ -210,7 +210,9 @@ void GBDT::AddValidDataset(const Dataset* valid_data, ...@@ -210,7 +210,9 @@ void GBDT::AddValidDataset(const Dataset* valid_data,
if (early_stopping_round_ > 0) { if (early_stopping_round_ > 0) {
auto num_metrics = valid_metrics.size(); auto num_metrics = valid_metrics.size();
if (es_first_metric_only_) { num_metrics = 1; } if (es_first_metric_only_) {
num_metrics = 1;
}
best_iter_.emplace_back(num_metrics, 0); best_iter_.emplace_back(num_metrics, 0);
best_score_.emplace_back(num_metrics, kMinScore); best_score_.emplace_back(num_metrics, kMinScore);
best_msg_.emplace_back(num_metrics); best_msg_.emplace_back(num_metrics);
...@@ -452,7 +454,9 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { ...@@ -452,7 +454,9 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) {
} }
void GBDT::RollbackOneIter() { void GBDT::RollbackOneIter() {
if (iter_ <= 0) { return; } if (iter_ <= 0) {
return;
}
// reset score // reset score
for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
auto curr_tree = models_.size() - num_tree_per_iteration_ + cur_tree_id; auto curr_tree = models_.size() - num_tree_per_iteration_ + cur_tree_id;
...@@ -588,7 +592,9 @@ std::string GBDT::OutputMetric(int iter) { ...@@ -588,7 +592,9 @@ std::string GBDT::OutputMetric(int iter) {
msg_buf << tmp_buf.str() << '\n'; msg_buf << tmp_buf.str() << '\n';
} }
} }
if (es_first_metric_only_ && j > 0) { continue; } if (es_first_metric_only_ && j > 0) {
continue;
}
if (ret.empty() && early_stopping_round_ > 0) { if (ret.empty() && early_stopping_round_ > 0) {
auto cur_score = valid_metrics_[i][j]->factor_to_bigger_better() * test_scores.back(); auto cur_score = valid_metrics_[i][j]->factor_to_bigger_better() * test_scores.back();
if (cur_score - best_score_[i][j] > early_stopping_min_delta_) { if (cur_score - best_score_[i][j] > early_stopping_min_delta_) {
...@@ -596,7 +602,9 @@ std::string GBDT::OutputMetric(int iter) { ...@@ -596,7 +602,9 @@ std::string GBDT::OutputMetric(int iter) {
best_iter_[i][j] = iter; best_iter_[i][j] = iter;
meet_early_stopping_pairs.emplace_back(i, j); meet_early_stopping_pairs.emplace_back(i, j);
} else { } else {
if (iter - best_iter_[i][j] >= early_stopping_round_) { ret = best_msg_[i][j]; } if (iter - best_iter_[i][j] >= early_stopping_round_) {
ret = best_msg_[i][j];
}
} }
} }
} }
......
...@@ -30,7 +30,9 @@ class GOSSStrategy : public SampleStrategy { ...@@ -30,7 +30,9 @@ class GOSSStrategy : public SampleStrategy {
void Bagging(int iter, TreeLearner* tree_learner, score_t* gradients, score_t* hessians) override { void Bagging(int iter, TreeLearner* tree_learner, score_t* gradients, score_t* hessians) override {
bag_data_cnt_ = num_data_; bag_data_cnt_ = num_data_;
// not subsample for first iterations // not subsample for first iterations
if (iter < static_cast<int>(1.0f / config_->learning_rate)) { return; } if (iter < static_cast<int>(1.0f / config_->learning_rate)) {
return;
}
auto left_cnt = bagging_runner_.Run<true>( auto left_cnt = bagging_runner_.Run<true>(
num_data_, num_data_,
[=](int, data_size_t cur_start, data_size_t cur_cnt, data_size_t* left, [=](int, data_size_t cur_start, data_size_t cur_cnt, data_size_t* left,
......
...@@ -182,7 +182,9 @@ class RF : public GBDT { ...@@ -182,7 +182,9 @@ class RF : public GBDT {
} }
void RollbackOneIter() override { void RollbackOneIter() override {
if (iter_ <= 0) { return; } if (iter_ <= 0) {
return;
}
int cur_iter = iter_ + num_init_iteration_ - 1; int cur_iter = iter_ + num_init_iteration_ - 1;
// reset score // reset score
for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
......
...@@ -218,7 +218,9 @@ class Booster { ...@@ -218,7 +218,9 @@ class Booster {
for (auto metric_type : config_.metric) { for (auto metric_type : config_.metric) {
auto metric = std::unique_ptr<Metric>( auto metric = std::unique_ptr<Metric>(
Metric::CreateMetric(metric_type, config_)); Metric::CreateMetric(metric_type, config_));
if (metric == nullptr) { continue; } if (metric == nullptr) {
continue;
}
metric->Init(train_data_->metadata(), train_data_->num_data()); metric->Init(train_data_->metadata(), train_data_->num_data());
train_metric_.push_back(std::move(metric)); train_metric_.push_back(std::move(metric));
} }
...@@ -394,7 +396,9 @@ class Booster { ...@@ -394,7 +396,9 @@ class Booster {
valid_metrics_.emplace_back(); valid_metrics_.emplace_back();
for (auto metric_type : config_.metric) { for (auto metric_type : config_.metric) {
auto metric = std::unique_ptr<Metric>(Metric::CreateMetric(metric_type, config_)); auto metric = std::unique_ptr<Metric>(Metric::CreateMetric(metric_type, config_));
if (metric == nullptr) { continue; } if (metric == nullptr) {
continue;
}
metric->Init(valid_data->metadata(), valid_data->num_data()); metric->Init(valid_data->metadata(), valid_data->num_data());
valid_metrics_.back().push_back(std::move(metric)); valid_metrics_.back().push_back(std::move(metric));
} }
...@@ -1603,7 +1607,9 @@ int LGBM_DatasetCreateFromCSC(const void* col_ptr, ...@@ -1603,7 +1607,9 @@ int LGBM_DatasetCreateFromCSC(const void* col_ptr,
OMP_LOOP_EX_BEGIN(); OMP_LOOP_EX_BEGIN();
const int tid = omp_get_thread_num(); const int tid = omp_get_thread_num();
int feature_idx = ret->InnerFeatureIndex(i); int feature_idx = ret->InnerFeatureIndex(i);
if (feature_idx < 0) { continue; } if (feature_idx < 0) {
continue;
}
int group = ret->Feature2Group(feature_idx); int group = ret->Feature2Group(feature_idx);
int sub_feature = ret->Feture2SubFeature(feature_idx); int sub_feature = ret->Feture2SubFeature(feature_idx);
CSC_RowIterator col_it(col_ptr, col_ptr_type, indices, data, data_type, ncol_ptr, nelem, i); CSC_RowIterator col_it(col_ptr, col_ptr_type, indices, data, data_type, ncol_ptr, nelem, i);
...@@ -1614,7 +1620,9 @@ int LGBM_DatasetCreateFromCSC(const void* col_ptr, ...@@ -1614,7 +1620,9 @@ int LGBM_DatasetCreateFromCSC(const void* col_ptr,
auto pair = col_it.NextNonZero(); auto pair = col_it.NextNonZero();
row_idx = pair.first; row_idx = pair.first;
// no more data // no more data
if (row_idx < 0) { break; } if (row_idx < 0) {
break;
}
ret->PushOneData(tid, row_idx, group, feature_idx, sub_feature, pair.second); ret->PushOneData(tid, row_idx, group, feature_idx, sub_feature, pair.second);
} }
} else { } else {
...@@ -1838,7 +1846,9 @@ int LGBM_DatasetSetField(DatasetHandle handle, ...@@ -1838,7 +1846,9 @@ int LGBM_DatasetSetField(DatasetHandle handle,
} else if (type == C_API_DTYPE_FLOAT64) { } else if (type == C_API_DTYPE_FLOAT64) {
is_success = dataset->SetDoubleField(field_name, reinterpret_cast<const double*>(field_data), static_cast<int32_t>(num_element)); is_success = dataset->SetDoubleField(field_name, reinterpret_cast<const double*>(field_data), static_cast<int32_t>(num_element));
} }
if (!is_success) { Log::Fatal("Input data type error or field not found"); } if (!is_success) {
Log::Fatal("Input data type error or field not found");
}
API_END(); API_END();
} }
...@@ -1875,8 +1885,12 @@ int LGBM_DatasetGetField(DatasetHandle handle, ...@@ -1875,8 +1885,12 @@ int LGBM_DatasetGetField(DatasetHandle handle,
*out_type = C_API_DTYPE_FLOAT64; *out_type = C_API_DTYPE_FLOAT64;
is_success = true; is_success = true;
} }
if (!is_success) { Log::Fatal("Field not found"); } if (!is_success) {
if (*out_ptr == nullptr) { *out_len = 0; } Log::Fatal("Field not found");
}
if (*out_ptr == nullptr) {
*out_len = 0;
}
API_END(); API_END();
} }
......
...@@ -131,7 +131,9 @@ namespace LightGBM { ...@@ -131,7 +131,9 @@ namespace LightGBM {
upper_bounds[bin_cnt] = distinct_values[i]; upper_bounds[bin_cnt] = distinct_values[i];
++bin_cnt; ++bin_cnt;
lower_bounds[bin_cnt] = distinct_values[i + 1]; lower_bounds[bin_cnt] = distinct_values[i + 1];
if (bin_cnt >= max_bin - 1) { break; } if (bin_cnt >= max_bin - 1) {
break;
}
cur_cnt_inbin = 0; cur_cnt_inbin = 0;
if (!is_big_count_value[i]) { if (!is_big_count_value[i]) {
--rest_bin_cnt; --rest_bin_cnt;
......
...@@ -665,7 +665,9 @@ Dataset* DatasetLoader::ConstructFromSampleData(double** sample_values, ...@@ -665,7 +665,9 @@ Dataset* DatasetLoader::ConstructFromSampleData(double** sample_values,
std::vector<int> start(num_machines); std::vector<int> start(num_machines);
std::vector<int> len(num_machines); std::vector<int> len(num_machines);
int step = (num_total_features + num_machines - 1) / num_machines; int step = (num_total_features + num_machines - 1) / num_machines;
if (step < 1) { step = 1; } if (step < 1) {
step = 1;
}
start[0] = 0; start[0] = 0;
for (int i = 0; i < num_machines - 1; ++i) { for (int i = 0; i < num_machines - 1; ++i) {
...@@ -1168,7 +1170,9 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines, ...@@ -1168,7 +1170,9 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
std::vector<int> start(num_machines); std::vector<int> start(num_machines);
std::vector<int> len(num_machines); std::vector<int> len(num_machines);
int step = (dataset->num_total_features_ + num_machines - 1) / num_machines; int step = (dataset->num_total_features_ + num_machines - 1) / num_machines;
if (step < 1) { step = 1; } if (step < 1) {
step = 1;
}
start[0] = 0; start[0] = 0;
for (int i = 0; i < num_machines - 1; ++i) { for (int i = 0; i < num_machines - 1; ++i) {
...@@ -1284,7 +1288,9 @@ void DatasetLoader::ExtractFeaturesFromMemory(std::vector<std::string>* text_dat ...@@ -1284,7 +1288,9 @@ void DatasetLoader::ExtractFeaturesFromMemory(std::vector<std::string>* text_dat
std::vector<bool> is_feature_added(dataset->num_features_, false); std::vector<bool> is_feature_added(dataset->num_features_, false);
// push data // push data
for (auto& inner_data : oneline_features) { for (auto& inner_data : oneline_features) {
if (inner_data.first >= dataset->num_total_features_) { continue; } if (inner_data.first >= dataset->num_total_features_) {
continue;
}
int feature_idx = dataset->used_feature_map_[inner_data.first]; int feature_idx = dataset->used_feature_map_[inner_data.first];
if (feature_idx >= 0) { if (feature_idx >= 0) {
is_feature_added[feature_idx] = true; is_feature_added[feature_idx] = true;
...@@ -1341,7 +1347,9 @@ void DatasetLoader::ExtractFeaturesFromMemory(std::vector<std::string>* text_dat ...@@ -1341,7 +1347,9 @@ void DatasetLoader::ExtractFeaturesFromMemory(std::vector<std::string>* text_dat
// push data // push data
std::vector<bool> is_feature_added(dataset->num_features_, false); std::vector<bool> is_feature_added(dataset->num_features_, false);
for (auto& inner_data : oneline_features) { for (auto& inner_data : oneline_features) {
if (inner_data.first >= dataset->num_total_features_) { continue; } if (inner_data.first >= dataset->num_total_features_) {
continue;
}
int feature_idx = dataset->used_feature_map_[inner_data.first]; int feature_idx = dataset->used_feature_map_[inner_data.first];
if (feature_idx >= 0) { if (feature_idx >= 0) {
is_feature_added[feature_idx] = true; is_feature_added[feature_idx] = true;
...@@ -1414,7 +1422,9 @@ void DatasetLoader::ExtractFeaturesFromFile(const char* filename, const Parser* ...@@ -1414,7 +1422,9 @@ void DatasetLoader::ExtractFeaturesFromFile(const char* filename, const Parser*
std::vector<bool> is_feature_added(dataset->num_features_, false); std::vector<bool> is_feature_added(dataset->num_features_, false);
// push data // push data
for (auto& inner_data : oneline_features) { for (auto& inner_data : oneline_features) {
if (inner_data.first >= dataset->num_total_features_) { continue; } if (inner_data.first >= dataset->num_total_features_) {
continue;
}
int feature_idx = dataset->used_feature_map_[inner_data.first]; int feature_idx = dataset->used_feature_map_[inner_data.first];
if (feature_idx >= 0) { if (feature_idx >= 0) {
is_feature_added[feature_idx] = true; is_feature_added[feature_idx] = true;
......
...@@ -56,7 +56,9 @@ void Metadata::Init(data_size_t num_data, int weight_idx, int query_idx) { ...@@ -56,7 +56,9 @@ void Metadata::Init(data_size_t num_data, int weight_idx, int query_idx) {
Log::Info("Using query id in data file, ignoring the additional query file"); Log::Info("Using query id in data file, ignoring the additional query file");
query_boundaries_.clear(); query_boundaries_.clear();
} }
if (!query_weights_.empty()) { query_weights_.clear(); } if (!query_weights_.empty()) {
query_weights_.clear();
}
queries_ = std::vector<data_size_t>(num_data_, 0); queries_ = std::vector<data_size_t>(num_data_, 0);
query_load_from_file_ = false; query_load_from_file_ = false;
} }
...@@ -401,7 +403,9 @@ void Metadata::InsertInitScores(const double* init_scores, data_size_t start_ind ...@@ -401,7 +403,9 @@ void Metadata::InsertInitScores(const double* init_scores, data_size_t start_ind
// Note that len here is row count, not num_init_score, so we compare against num_data // Note that len here is row count, not num_init_score, so we compare against num_data
Log::Fatal("Inserted initial score data is too large for dataset"); Log::Fatal("Inserted initial score data is too large for dataset");
} }
if (init_score_.empty()) { init_score_.resize(num_init_score_); } if (init_score_.empty()) {
init_score_.resize(num_init_score_);
}
int nclasses = num_init_score_classes(); int nclasses = num_init_score_classes();
...@@ -455,7 +459,9 @@ void Metadata::InsertLabels(const label_t* labels, data_size_t start_index, data ...@@ -455,7 +459,9 @@ void Metadata::InsertLabels(const label_t* labels, data_size_t start_index, data
if (start_index + len > num_data_) { if (start_index + len > num_data_) {
Log::Fatal("Inserted label data is too large for dataset"); Log::Fatal("Inserted label data is too large for dataset");
} }
if (label_.empty()) { label_.resize(num_data_); } if (label_.empty()) {
label_.resize(num_data_);
}
memcpy(label_.data() + start_index, labels, sizeof(label_t) * len); memcpy(label_.data() + start_index, labels, sizeof(label_t) * len);
...@@ -511,7 +517,9 @@ void Metadata::InsertWeights(const label_t* weights, data_size_t start_index, da ...@@ -511,7 +517,9 @@ void Metadata::InsertWeights(const label_t* weights, data_size_t start_index, da
if (start_index + len > num_weights_) { if (start_index + len > num_weights_) {
Log::Fatal("Inserted weight data is too large for dataset"); Log::Fatal("Inserted weight data is too large for dataset");
} }
if (weights_.empty()) { weights_.resize(num_weights_); } if (weights_.empty()) {
weights_.resize(num_weights_);
}
memcpy(weights_.data() + start_index, weights, sizeof(label_t) * len); memcpy(weights_.data() + start_index, weights, sizeof(label_t) * len);
...@@ -797,20 +805,26 @@ void Metadata::LoadFromMemory(const void* memory) { ...@@ -797,20 +805,26 @@ void Metadata::LoadFromMemory(const void* memory) {
num_queries_ = *(reinterpret_cast<const data_size_t*>(mem_ptr)); num_queries_ = *(reinterpret_cast<const data_size_t*>(mem_ptr));
mem_ptr += VirtualFileWriter::AlignedSize(sizeof(num_queries_)); mem_ptr += VirtualFileWriter::AlignedSize(sizeof(num_queries_));
if (!label_.empty()) { label_.clear(); } if (!label_.empty()) {
label_.clear();
}
label_ = std::vector<label_t>(num_data_); label_ = std::vector<label_t>(num_data_);
std::memcpy(label_.data(), mem_ptr, sizeof(label_t) * num_data_); std::memcpy(label_.data(), mem_ptr, sizeof(label_t) * num_data_);
mem_ptr += VirtualFileWriter::AlignedSize(sizeof(label_t) * num_data_); mem_ptr += VirtualFileWriter::AlignedSize(sizeof(label_t) * num_data_);
if (num_weights_ > 0) { if (num_weights_ > 0) {
if (!weights_.empty()) { weights_.clear(); } if (!weights_.empty()) {
weights_.clear();
}
weights_ = std::vector<label_t>(num_weights_); weights_ = std::vector<label_t>(num_weights_);
std::memcpy(weights_.data(), mem_ptr, sizeof(label_t) * num_weights_); std::memcpy(weights_.data(), mem_ptr, sizeof(label_t) * num_weights_);
mem_ptr += VirtualFileWriter::AlignedSize(sizeof(label_t) * num_weights_); mem_ptr += VirtualFileWriter::AlignedSize(sizeof(label_t) * num_weights_);
weight_load_from_file_ = true; weight_load_from_file_ = true;
} }
if (num_queries_ > 0) { if (num_queries_ > 0) {
if (!query_boundaries_.empty()) { query_boundaries_.clear(); } if (!query_boundaries_.empty()) {
query_boundaries_.clear();
}
query_boundaries_ = std::vector<data_size_t>(num_queries_ + 1); query_boundaries_ = std::vector<data_size_t>(num_queries_ + 1);
std::memcpy(query_boundaries_.data(), mem_ptr, sizeof(data_size_t) * (num_queries_ + 1)); std::memcpy(query_boundaries_.data(), mem_ptr, sizeof(data_size_t) * (num_queries_ + 1));
mem_ptr += VirtualFileWriter::AlignedSize(sizeof(data_size_t) * mem_ptr += VirtualFileWriter::AlignedSize(sizeof(data_size_t) *
......
...@@ -31,7 +31,9 @@ void DCGCalculator::DefaultEvalAt(std::vector<int>* eval_at) { ...@@ -31,7 +31,9 @@ void DCGCalculator::DefaultEvalAt(std::vector<int>* eval_at) {
} }
void DCGCalculator::DefaultLabelGain(std::vector<double>* label_gain) { void DCGCalculator::DefaultLabelGain(std::vector<double>* label_gain) {
if (!label_gain->empty()) { return; } if (!label_gain->empty()) {
return;
}
// label_gain = 2^i - 1, may overflow, so we use 31 here // label_gain = 2^i - 1, may overflow, so we use 31 here
const int max_label = 31; const int max_label = 31;
label_gain->push_back(0.0f); label_gain->push_back(0.0f);
...@@ -60,7 +62,9 @@ double DCGCalculator::CalMaxDCGAtK(data_size_t k, const label_t* label, data_siz ...@@ -60,7 +62,9 @@ double DCGCalculator::CalMaxDCGAtK(data_size_t k, const label_t* label, data_siz
} }
int top_label = static_cast<int>(label_gain_.size()) - 1; int top_label = static_cast<int>(label_gain_.size()) - 1;
if (k > num_data) { k = num_data; } if (k > num_data) {
k = num_data;
}
// start from top label, and accumulate DCG // start from top label, and accumulate DCG
for (data_size_t j = 0; j < k; ++j) { for (data_size_t j = 0; j < k; ++j) {
while (top_label > 0 && label_cnt[top_label] <= 0) { while (top_label > 0 && label_cnt[top_label] <= 0) {
...@@ -90,7 +94,9 @@ void DCGCalculator::CalMaxDCG(const std::vector<data_size_t>& ks, ...@@ -90,7 +94,9 @@ void DCGCalculator::CalMaxDCG(const std::vector<data_size_t>& ks,
// calculate k Max DCG by one pass // calculate k Max DCG by one pass
for (size_t i = 0; i < ks.size(); ++i) { for (size_t i = 0; i < ks.size(); ++i) {
data_size_t cur_k = ks[i]; data_size_t cur_k = ks[i];
if (cur_k > num_data) { cur_k = num_data; } if (cur_k > num_data) {
cur_k = num_data;
}
for (data_size_t j = cur_left; j < cur_k; ++j) { for (data_size_t j = cur_left; j < cur_k; ++j) {
while (top_label > 0 && label_cnt[top_label] <= 0) { while (top_label > 0 && label_cnt[top_label] <= 0) {
top_label -= 1; top_label -= 1;
...@@ -121,7 +127,9 @@ void DCGCalculator::CalDCG(const std::vector<data_size_t>& ks, const label_t* la ...@@ -121,7 +127,9 @@ void DCGCalculator::CalDCG(const std::vector<data_size_t>& ks, const label_t* la
// calculate multi dcg by one pass // calculate multi dcg by one pass
for (size_t i = 0; i < ks.size(); ++i) { for (size_t i = 0; i < ks.size(); ++i) {
data_size_t cur_k = ks[i]; data_size_t cur_k = ks[i];
if (cur_k > num_data) { cur_k = num_data; } if (cur_k > num_data) {
cur_k = num_data;
}
for (data_size_t j = cur_left; j < cur_k; ++j) { for (data_size_t j = cur_left; j < cur_k; ++j) {
data_size_t idx = sorted_idx[j]; data_size_t idx = sorted_idx[j];
cur_result += label_gain_[static_cast<int>(label[idx])] * discount_[j]; cur_result += label_gain_[static_cast<int>(label[idx])] * discount_[j];
......
...@@ -86,7 +86,9 @@ class MapMetric:public Metric { ...@@ -86,7 +86,9 @@ class MapMetric:public Metric {
data_size_t cur_left = 0; data_size_t cur_left = 0;
for (size_t i = 0; i < ks.size(); ++i) { for (size_t i = 0; i < ks.size(); ++i) {
data_size_t cur_k = static_cast<data_size_t>(ks[i]); data_size_t cur_k = static_cast<data_size_t>(ks[i]);
if (cur_k > num_data) { cur_k = num_data; } if (cur_k > num_data) {
cur_k = num_data;
}
for (data_size_t j = cur_left; j < cur_k; ++j) { for (data_size_t j = cur_left; j < cur_k; ++j) {
data_size_t idx = sorted_idx[j]; data_size_t idx = sorted_idx[j];
if (label[idx] > 0.5f) { if (label[idx] > 0.5f) {
......
...@@ -68,7 +68,9 @@ RecursiveHalvingMap::RecursiveHalvingMap(int in_k, RecursiveHalvingNodeType _typ ...@@ -68,7 +68,9 @@ RecursiveHalvingMap::RecursiveHalvingMap(int in_k, RecursiveHalvingNodeType _typ
RecursiveHalvingMap RecursiveHalvingMap::Construct(int rank, int num_machines) { RecursiveHalvingMap RecursiveHalvingMap::Construct(int rank, int num_machines) {
// construct all recursive halving map for all machines // construct all recursive halving map for all machines
int k = 0; int k = 0;
while ((1 << k) <= num_machines) { ++k; } while ((1 << k) <= num_machines) {
++k;
}
// let 1 << k <= num_machines // let 1 << k <= num_machines
--k; --k;
// distance of each communication // distance of each communication
......
...@@ -206,11 +206,17 @@ class LambdarankNDCG : public RankingObjective { ...@@ -206,11 +206,17 @@ class LambdarankNDCG : public RankingObjective {
double sum_lambdas = 0.0; double sum_lambdas = 0.0;
// start accumulate lambdas by pairs that contain at least one document above truncation level // start accumulate lambdas by pairs that contain at least one document above truncation level
for (data_size_t i = 0; i < cnt - 1 && i < truncation_level_; ++i) { for (data_size_t i = 0; i < cnt - 1 && i < truncation_level_; ++i) {
if (score[sorted_idx[i]] == kMinScore) { continue; } if (score[sorted_idx[i]] == kMinScore) {
continue;
}
for (data_size_t j = i + 1; j < cnt; ++j) { for (data_size_t j = i + 1; j < cnt; ++j) {
if (score[sorted_idx[j]] == kMinScore) { continue; } if (score[sorted_idx[j]] == kMinScore) {
continue;
}
// skip pairs with the same labels // skip pairs with the same labels
if (label[sorted_idx[i]] == label[sorted_idx[j]]) { continue; } if (label[sorted_idx[i]] == label[sorted_idx[j]]) {
continue;
}
data_size_t high_rank, low_rank; data_size_t high_rank, low_rank;
if (label[sorted_idx[i]] > label[sorted_idx[j]]) { if (label[sorted_idx[i]] > label[sorted_idx[j]]) {
high_rank = i; high_rank = i;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment