Commit 33d0378f authored by Guolin Ke's avatar Guolin Ke Committed by Nikita Titov
Browse files

avoid nan and inf in weight/label/init_score (#2377)

* avoid nan and inf in weight/label/init_score

* use prefix increment
parent de1f3cb3
...@@ -20,7 +20,7 @@ Metadata::Metadata() { ...@@ -20,7 +20,7 @@ Metadata::Metadata() {
init_score_load_from_file_ = false; init_score_load_from_file_ = false;
} }
void Metadata::Init(const char * data_filename, const char* initscore_file) { void Metadata::Init(const char* data_filename, const char* initscore_file) {
data_filename_ = data_filename; data_filename_ = data_filename;
// for lambdarank, it needs query data for partition data in parallel learning // for lambdarank, it needs query data for partition data in parallel learning
LoadQueryBoundaries(); LoadQueryBoundaries();
...@@ -40,12 +40,8 @@ void Metadata::Init(data_size_t num_data, int weight_idx, int query_idx) { ...@@ -40,12 +40,8 @@ void Metadata::Init(data_size_t num_data, int weight_idx, int query_idx) {
Log::Info("Using weights in data file, ignoring the additional weights file"); Log::Info("Using weights in data file, ignoring the additional weights file");
weights_.clear(); weights_.clear();
} }
weights_ = std::vector<label_t>(num_data_); weights_ = std::vector<label_t>(num_data_, 0.0f);
num_weights_ = num_data_; num_weights_ = num_data_;
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_weights_; ++i) {
weights_[i] = 0.0f;
}
weight_load_from_file_ = false; weight_load_from_file_ = false;
} }
if (query_idx >= 0) { if (query_idx >= 0) {
...@@ -54,11 +50,7 @@ void Metadata::Init(data_size_t num_data, int weight_idx, int query_idx) { ...@@ -54,11 +50,7 @@ void Metadata::Init(data_size_t num_data, int weight_idx, int query_idx) {
query_boundaries_.clear(); query_boundaries_.clear();
} }
if (!query_weights_.empty()) { query_weights_.clear(); } if (!query_weights_.empty()) { query_weights_.clear(); }
queries_ = std::vector<data_size_t>(num_data_); queries_ = std::vector<data_size_t>(num_data_, 0);
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
queries_[i] = 0;
}
query_load_from_file_ = false; query_load_from_file_ = false;
} }
} }
...@@ -67,16 +59,16 @@ void Metadata::Init(const Metadata& fullset, const data_size_t* used_indices, da ...@@ -67,16 +59,16 @@ void Metadata::Init(const Metadata& fullset, const data_size_t* used_indices, da
num_data_ = num_used_indices; num_data_ = num_used_indices;
label_ = std::vector<label_t>(num_used_indices); label_ = std::vector<label_t>(num_used_indices);
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_used_indices; i++) { for (data_size_t i = 0; i < num_used_indices; ++i) {
label_[i] = fullset.label_[used_indices[i]]; label_[i] = fullset.label_[used_indices[i]];
} }
if (!fullset.weights_.empty()) { if (!fullset.weights_.empty()) {
weights_ = std::vector<label_t>(num_used_indices); weights_ = std::vector<label_t>(num_used_indices);
num_weights_ = num_used_indices; num_weights_ = num_used_indices;
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_used_indices; i++) { for (data_size_t i = 0; i < num_used_indices; ++i) {
weights_[i] = fullset.weights_[used_indices[i]]; weights_[i] = fullset.weights_[used_indices[i]];
} }
} else { } else {
...@@ -85,12 +77,14 @@ void Metadata::Init(const Metadata& fullset, const data_size_t* used_indices, da ...@@ -85,12 +77,14 @@ void Metadata::Init(const Metadata& fullset, const data_size_t* used_indices, da
if (!fullset.init_score_.empty()) { if (!fullset.init_score_.empty()) {
int num_class = static_cast<int>(fullset.num_init_score_ / fullset.num_data_); int num_class = static_cast<int>(fullset.num_init_score_ / fullset.num_data_);
init_score_ = std::vector<double>(num_used_indices*num_class); init_score_ = std::vector<double>(static_cast<size_t>(num_used_indices) * num_class);
num_init_score_ = static_cast<int64_t>(num_used_indices) * num_class; num_init_score_ = static_cast<int64_t>(num_used_indices) * num_class;
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (int k = 0; k < num_class; ++k) { for (int k = 0; k < num_class; ++k) {
for (data_size_t i = 0; i < num_used_indices; i++) { const size_t offset_dest = static_cast<size_t>(k) * num_data_;
init_score_[k*num_data_ + i] = fullset.init_score_[k* fullset.num_data_ + used_indices[i]]; const size_t offset_src = static_cast<size_t>(k) * fullset.num_data_;
for (data_size_t i = 0; i < num_used_indices; ++i) {
init_score_[offset_dest + i] = fullset.init_score_[offset_src + used_indices[i]];
} }
} }
} else { } else {
...@@ -137,7 +131,7 @@ void Metadata::PartitionLabel(const std::vector<data_size_t>& used_indices) { ...@@ -137,7 +131,7 @@ void Metadata::PartitionLabel(const std::vector<data_size_t>& used_indices) {
auto old_label = label_; auto old_label = label_;
num_data_ = static_cast<data_size_t>(used_indices.size()); num_data_ = static_cast<data_size_t>(used_indices.size());
label_ = std::vector<label_t>(num_data_); label_ = std::vector<label_t>(num_data_);
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
label_[i] = old_label[used_indices[i]]; label_[i] = old_label[used_indices[i]];
} }
...@@ -208,7 +202,7 @@ void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector<data ...@@ -208,7 +202,7 @@ void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector<data
auto old_weights = weights_; auto old_weights = weights_;
num_weights_ = num_data_; num_weights_ = num_data_;
weights_ = std::vector<label_t>(num_data_); weights_ = std::vector<label_t>(num_data_);
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (int i = 0; i < static_cast<int>(used_data_indices.size()); ++i) { for (int i = 0; i < static_cast<int>(used_data_indices.size()); ++i) {
weights_[i] = old_weights[used_data_indices[i]]; weights_[i] = old_weights[used_data_indices[i]];
} }
...@@ -269,10 +263,12 @@ void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector<data ...@@ -269,10 +263,12 @@ void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector<data
int num_class = static_cast<int>(num_init_score_ / num_all_data); int num_class = static_cast<int>(num_init_score_ / num_all_data);
num_init_score_ = static_cast<int64_t>(num_data_) * num_class; num_init_score_ = static_cast<int64_t>(num_data_) * num_class;
init_score_ = std::vector<double>(num_init_score_); init_score_ = std::vector<double>(num_init_score_);
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (int k = 0; k < num_class; ++k) { for (int k = 0; k < num_class; ++k) {
const size_t offset_dest = static_cast<size_t>(k) * num_data_;
const size_t offset_src = static_cast<size_t>(k) * num_all_data;
for (size_t i = 0; i < used_data_indices.size(); ++i) { for (size_t i = 0; i < used_data_indices.size(); ++i) {
init_score_[k * num_data_ + i] = old_scores[k * num_all_data + used_data_indices[i]]; init_score_[offset_dest + i] = old_scores[offset_src + used_data_indices[i]];
} }
} }
old_scores.clear(); old_scores.clear();
...@@ -297,9 +293,9 @@ void Metadata::SetInitScore(const double* init_score, data_size_t len) { ...@@ -297,9 +293,9 @@ void Metadata::SetInitScore(const double* init_score, data_size_t len) {
if (!init_score_.empty()) { init_score_.clear(); } if (!init_score_.empty()) { init_score_.clear(); }
num_init_score_ = len; num_init_score_ = len;
init_score_ = std::vector<double>(len); init_score_ = std::vector<double>(len);
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (int64_t i = 0; i < num_init_score_; ++i) { for (int64_t i = 0; i < num_init_score_; ++i) {
init_score_[i] = init_score[i]; init_score_[i] = Common::AvoidInf(init_score[i]);
} }
init_score_load_from_file_ = false; init_score_load_from_file_ = false;
} }
...@@ -314,9 +310,9 @@ void Metadata::SetLabel(const label_t* label, data_size_t len) { ...@@ -314,9 +310,9 @@ void Metadata::SetLabel(const label_t* label, data_size_t len) {
} }
if (!label_.empty()) { label_.clear(); } if (!label_.empty()) { label_.clear(); }
label_ = std::vector<label_t>(num_data_); label_ = std::vector<label_t>(num_data_);
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
label_[i] = label[i]; label_[i] = Common::AvoidInf(label[i]);
} }
} }
...@@ -334,9 +330,9 @@ void Metadata::SetWeights(const label_t* weights, data_size_t len) { ...@@ -334,9 +330,9 @@ void Metadata::SetWeights(const label_t* weights, data_size_t len) {
if (!weights_.empty()) { weights_.clear(); } if (!weights_.empty()) { weights_.clear(); }
num_weights_ = num_data_; num_weights_ = num_data_;
weights_ = std::vector<label_t>(num_weights_); weights_ = std::vector<label_t>(num_weights_);
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_weights_; ++i) { for (data_size_t i = 0; i < num_weights_; ++i) {
weights_[i] = weights[i]; weights_[i] = Common::AvoidInf(weights[i]);
} }
LoadQueryWeights(); LoadQueryWeights();
weight_load_from_file_ = false; weight_load_from_file_ = false;
...@@ -351,7 +347,7 @@ void Metadata::SetQuery(const data_size_t* query, data_size_t len) { ...@@ -351,7 +347,7 @@ void Metadata::SetQuery(const data_size_t* query, data_size_t len) {
return; return;
} }
data_size_t sum = 0; data_size_t sum = 0;
#pragma omp parallel for schedule(static) reduction(+:sum) #pragma omp parallel for schedule(static) reduction(+:sum)
for (data_size_t i = 0; i < len; ++i) { for (data_size_t i = 0; i < len; ++i) {
sum += query[i]; sum += query[i];
} }
...@@ -382,11 +378,11 @@ void Metadata::LoadWeights() { ...@@ -382,11 +378,11 @@ void Metadata::LoadWeights() {
Log::Info("Loading weights..."); Log::Info("Loading weights...");
num_weights_ = static_cast<data_size_t>(reader.Lines().size()); num_weights_ = static_cast<data_size_t>(reader.Lines().size());
weights_ = std::vector<label_t>(num_weights_); weights_ = std::vector<label_t>(num_weights_);
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_weights_; ++i) { for (data_size_t i = 0; i < num_weights_; ++i) {
double tmp_weight = 0.0f; double tmp_weight = 0.0f;
Common::Atof(reader.Lines()[i].c_str(), &tmp_weight); Common::Atof(reader.Lines()[i].c_str(), &tmp_weight);
weights_[i] = static_cast<label_t>(tmp_weight); weights_[i] = Common::AvoidInf(static_cast<label_t>(tmp_weight));
} }
weight_load_from_file_ = true; weight_load_from_file_ = true;
} }
...@@ -413,15 +409,15 @@ void Metadata::LoadInitialScore(const char* initscore_file) { ...@@ -413,15 +409,15 @@ void Metadata::LoadInitialScore(const char* initscore_file) {
init_score_ = std::vector<double>(num_init_score_); init_score_ = std::vector<double>(num_init_score_);
if (num_class == 1) { if (num_class == 1) {
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_line; ++i) { for (data_size_t i = 0; i < num_line; ++i) {
double tmp = 0.0f; double tmp = 0.0f;
Common::Atof(reader.Lines()[i].c_str(), &tmp); Common::Atof(reader.Lines()[i].c_str(), &tmp);
init_score_[i] = static_cast<double>(tmp); init_score_[i] = Common::AvoidInf(static_cast<double>(tmp));
} }
} else { } else {
std::vector<std::string> oneline_init_score; std::vector<std::string> oneline_init_score;
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_line; ++i) { for (data_size_t i = 0; i < num_line; ++i) {
double tmp = 0.0f; double tmp = 0.0f;
oneline_init_score = Common::Split(reader.Lines()[i].c_str(), '\t'); oneline_init_score = Common::Split(reader.Lines()[i].c_str(), '\t');
...@@ -430,7 +426,7 @@ void Metadata::LoadInitialScore(const char* initscore_file) { ...@@ -430,7 +426,7 @@ void Metadata::LoadInitialScore(const char* initscore_file) {
} }
for (int k = 0; k < num_class; ++k) { for (int k = 0; k < num_class; ++k) {
Common::Atof(oneline_init_score[k].c_str(), &tmp); Common::Atof(oneline_init_score[k].c_str(), &tmp);
init_score_[k * num_line + i] = static_cast<double>(tmp); init_score_[static_cast<size_t>(k) * num_line + i] = Common::AvoidInf(static_cast<double>(tmp));
} }
} }
} }
...@@ -487,21 +483,21 @@ void Metadata::LoadFromMemory(const void* memory) { ...@@ -487,21 +483,21 @@ void Metadata::LoadFromMemory(const void* memory) {
if (!label_.empty()) { label_.clear(); } if (!label_.empty()) { label_.clear(); }
label_ = std::vector<label_t>(num_data_); label_ = std::vector<label_t>(num_data_);
std::memcpy(label_.data(), mem_ptr, sizeof(label_t)*num_data_); std::memcpy(label_.data(), mem_ptr, sizeof(label_t) * num_data_);
mem_ptr += sizeof(label_t)*num_data_; mem_ptr += sizeof(label_t) * num_data_;
if (num_weights_ > 0) { if (num_weights_ > 0) {
if (!weights_.empty()) { weights_.clear(); } if (!weights_.empty()) { weights_.clear(); }
weights_ = std::vector<label_t>(num_weights_); weights_ = std::vector<label_t>(num_weights_);
std::memcpy(weights_.data(), mem_ptr, sizeof(label_t)*num_weights_); std::memcpy(weights_.data(), mem_ptr, sizeof(label_t) * num_weights_);
mem_ptr += sizeof(label_t)*num_weights_; mem_ptr += sizeof(label_t) * num_weights_;
weight_load_from_file_ = true; weight_load_from_file_ = true;
} }
if (num_queries_ > 0) { if (num_queries_ > 0) {
if (!query_boundaries_.empty()) { query_boundaries_.clear(); } if (!query_boundaries_.empty()) { query_boundaries_.clear(); }
query_boundaries_ = std::vector<data_size_t>(num_queries_ + 1); query_boundaries_ = std::vector<data_size_t>(num_queries_ + 1);
std::memcpy(query_boundaries_.data(), mem_ptr, sizeof(data_size_t)*(num_queries_ + 1)); std::memcpy(query_boundaries_.data(), mem_ptr, sizeof(data_size_t) * (num_queries_ + 1));
mem_ptr += sizeof(data_size_t)*(num_queries_ + 1); mem_ptr += sizeof(data_size_t) * (num_queries_ + 1);
query_load_from_file_ = true; query_load_from_file_ = true;
} }
LoadQueryWeights(); LoadQueryWeights();
...@@ -520,7 +516,7 @@ void Metadata::SaveBinaryToFile(const VirtualFileWriter* writer) const { ...@@ -520,7 +516,7 @@ void Metadata::SaveBinaryToFile(const VirtualFileWriter* writer) const {
} }
} }
size_t Metadata::SizesInByte() const { size_t Metadata::SizesInByte() const {
size_t size = sizeof(num_data_) + sizeof(num_weights_) size_t size = sizeof(num_data_) + sizeof(num_weights_)
+ sizeof(num_queries_); + sizeof(num_queries_);
size += sizeof(label_t) * num_data_; size += sizeof(label_t) * num_data_;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment