Unverified Commit 530b5cef authored by shiyu1994's avatar shiyu1994 Committed by GitHub
Browse files

Fix #3557 and potential issue with dense multi-val feature groups. (#3590)

Fix num_total_bin_ and bin_offsets_ of FeatureGroup
if a dense multi val feature group with non zero most freq bin
is the first feature group of the dataset.
parent c5d9d243
...@@ -37,7 +37,8 @@ class FeatureGroup { ...@@ -37,7 +37,8 @@ class FeatureGroup {
*/ */
FeatureGroup(int num_feature, int8_t is_multi_val, FeatureGroup(int num_feature, int8_t is_multi_val,
std::vector<std::unique_ptr<BinMapper>>* bin_mappers, std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
data_size_t num_data) : num_feature_(num_feature), is_multi_val_(is_multi_val > 0), is_sparse_(false) { data_size_t num_data, int group_id) :
num_feature_(num_feature), is_multi_val_(is_multi_val > 0), is_sparse_(false) {
CHECK_EQ(static_cast<int>(bin_mappers->size()), num_feature); CHECK_EQ(static_cast<int>(bin_mappers->size()), num_feature);
auto& ref_bin_mappers = *bin_mappers; auto& ref_bin_mappers = *bin_mappers;
double sum_sparse_rate = 0.0f; double sum_sparse_rate = 0.0f;
...@@ -55,6 +56,12 @@ class FeatureGroup { ...@@ -55,6 +56,12 @@ class FeatureGroup {
} }
// use bin at zero to store most_freq_bin only when not using dense multi val bin // use bin at zero to store most_freq_bin only when not using dense multi val bin
num_total_bin_ = offset; num_total_bin_ = offset;
// however, we should force to leave one bin, if dense multi val bin is the first bin
// and its first feature has most freq bin > 0
if (group_id == 0 && num_feature_ > 0 && is_dense_multi_val_ &&
bin_mappers_[0]->GetMostFreqBin() > 0) {
num_total_bin_ = 1;
}
bin_offsets_.emplace_back(num_total_bin_); bin_offsets_.emplace_back(num_total_bin_);
for (int i = 0; i < num_feature_; ++i) { for (int i = 0; i < num_feature_; ++i) {
auto num_bin = bin_mappers_[i]->num_bin(); auto num_bin = bin_mappers_[i]->num_bin();
...@@ -109,7 +116,8 @@ class FeatureGroup { ...@@ -109,7 +116,8 @@ class FeatureGroup {
* \param local_used_indices Local used indices, empty means using all data * \param local_used_indices Local used indices, empty means using all data
*/ */
FeatureGroup(const void* memory, data_size_t num_all_data, FeatureGroup(const void* memory, data_size_t num_all_data,
const std::vector<data_size_t>& local_used_indices) { const std::vector<data_size_t>& local_used_indices,
int group_id) {
const char* memory_ptr = reinterpret_cast<const char*>(memory); const char* memory_ptr = reinterpret_cast<const char*>(memory);
// get is_sparse // get is_sparse
is_multi_val_ = *(reinterpret_cast<const bool*>(memory_ptr)); is_multi_val_ = *(reinterpret_cast<const bool*>(memory_ptr));
...@@ -122,19 +130,33 @@ class FeatureGroup { ...@@ -122,19 +130,33 @@ class FeatureGroup {
memory_ptr += VirtualFileWriter::AlignedSize(sizeof(num_feature_)); memory_ptr += VirtualFileWriter::AlignedSize(sizeof(num_feature_));
// get bin mapper // get bin mapper
bin_mappers_.clear(); bin_mappers_.clear();
for (int i = 0; i < num_feature_; ++i) {
bin_mappers_.emplace_back(new BinMapper(memory_ptr));
memory_ptr += bin_mappers_[i]->SizesInByte();
}
bin_offsets_.clear(); bin_offsets_.clear();
// start from 1, due to need to store zero bin in this slot int offset = 1;
if (is_dense_multi_val_) {
offset = 0;
}
// use bin at zero to store most_freq_bin only when not using dense multi val bin
num_total_bin_ = offset;
// however, we should force to leave one bin, if dense multi val bin is the first bin
// and its first feature has most freq bin > 0
if (group_id == 0 && num_feature_ > 0 && is_dense_multi_val_ &&
bin_mappers_[0]->GetMostFreqBin() > 0) {
num_total_bin_ = 1; num_total_bin_ = 1;
}
bin_offsets_.emplace_back(num_total_bin_); bin_offsets_.emplace_back(num_total_bin_);
for (int i = 0; i < num_feature_; ++i) { for (int i = 0; i < num_feature_; ++i) {
bin_mappers_.emplace_back(new BinMapper(memory_ptr));
auto num_bin = bin_mappers_[i]->num_bin(); auto num_bin = bin_mappers_[i]->num_bin();
if (bin_mappers_[i]->GetMostFreqBin() == 0) { if (bin_mappers_[i]->GetMostFreqBin() == 0) {
num_bin -= 1; num_bin -= offset;
} }
num_total_bin_ += num_bin; num_total_bin_ += num_bin;
bin_offsets_.emplace_back(num_total_bin_); bin_offsets_.emplace_back(num_total_bin_);
memory_ptr += bin_mappers_[i]->SizesInByte();
} }
data_size_t num_data = num_all_data; data_size_t num_data = num_all_data;
if (!local_used_indices.empty()) { if (!local_used_indices.empty()) {
...@@ -210,7 +232,7 @@ class FeatureGroup { ...@@ -210,7 +232,7 @@ class FeatureGroup {
} }
} }
void AddFeaturesFrom(const FeatureGroup* other) { void AddFeaturesFrom(const FeatureGroup* other, int group_id) {
CHECK(is_multi_val_); CHECK(is_multi_val_);
CHECK(other->is_multi_val_); CHECK(other->is_multi_val_);
// every time when new features are added, we need to reconsider sparse or dense // every time when new features are added, we need to reconsider sparse or dense
...@@ -231,6 +253,12 @@ class FeatureGroup { ...@@ -231,6 +253,12 @@ class FeatureGroup {
} }
bin_offsets_.clear(); bin_offsets_.clear();
num_total_bin_ = offset; num_total_bin_ = offset;
// however, we should force to leave one bin, if dense multi val bin is the first bin
// and its first feature has most freq bin > 0
if (group_id == 0 && num_feature_ > 0 && is_dense_multi_val_ &&
bin_mappers_[0]->GetMostFreqBin() > 0) {
num_total_bin_ = 1;
}
bin_offsets_.emplace_back(num_total_bin_); bin_offsets_.emplace_back(num_total_bin_);
for (int i = 0; i < num_feature_; ++i) { for (int i = 0; i < num_feature_; ++i) {
auto num_bin = bin_mappers_[i]->num_bin(); auto num_bin = bin_mappers_[i]->num_bin();
...@@ -248,6 +276,7 @@ class FeatureGroup { ...@@ -248,6 +276,7 @@ class FeatureGroup {
num_bin -= offset; num_bin -= offset;
} }
num_total_bin_ += num_bin; num_total_bin_ += num_bin;
bin_offsets_.emplace_back(num_total_bin_);
multi_bin_data_.emplace_back(other->multi_bin_data_[i]->Clone()); multi_bin_data_.emplace_back(other->multi_bin_data_[i]->Clone());
} }
num_feature_ += other->num_feature_; num_feature_ += other->num_feature_;
...@@ -407,7 +436,8 @@ class FeatureGroup { ...@@ -407,7 +436,8 @@ class FeatureGroup {
FeatureGroup& operator=(const FeatureGroup&) = delete; FeatureGroup& operator=(const FeatureGroup&) = delete;
/*! \brief Deep copy */ /*! \brief Deep copy */
FeatureGroup(const FeatureGroup& other) { FeatureGroup(const FeatureGroup& other, bool should_handle_dense_mv,
int group_id) {
num_feature_ = other.num_feature_; num_feature_ = other.num_feature_;
is_multi_val_ = other.is_multi_val_; is_multi_val_ = other.is_multi_val_;
is_dense_multi_val_ = other.is_dense_multi_val_; is_dense_multi_val_ = other.is_dense_multi_val_;
...@@ -427,6 +457,17 @@ class FeatureGroup { ...@@ -427,6 +457,17 @@ class FeatureGroup {
multi_bin_data_.emplace_back(other.multi_bin_data_[i]->Clone()); multi_bin_data_.emplace_back(other.multi_bin_data_[i]->Clone());
} }
} }
if (should_handle_dense_mv && is_dense_multi_val_ && group_id > 0) {
// this feature group was the first feature group, but now no longer is,
// so we need to eliminate its special empty bin for multi val dense bin
if (bin_mappers_[0]->GetMostFreqBin() > 0 && bin_offsets_[0] == 1) {
for (size_t i = 0; i < bin_offsets_.size(); ++i) {
bin_offsets_[i] -= 1;
}
num_total_bin_ -= 1;
}
}
} }
private: private:
......
...@@ -392,7 +392,7 @@ void Dataset::Construct(std::vector<std::unique_ptr<BinMapper>>* bin_mappers, ...@@ -392,7 +392,7 @@ void Dataset::Construct(std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
++cur_fidx; ++cur_fidx;
} }
feature_groups_.emplace_back(std::unique_ptr<FeatureGroup>( feature_groups_.emplace_back(std::unique_ptr<FeatureGroup>(
new FeatureGroup(cur_cnt_features, group_is_multi_val[i], &cur_bin_mappers, num_data_))); new FeatureGroup(cur_cnt_features, group_is_multi_val[i], &cur_bin_mappers, num_data_, i)));
num_total_bin += feature_groups_[i]->num_total_bin_; num_total_bin += feature_groups_[i]->num_total_bin_;
group_bin_boundaries_.push_back(num_total_bin); group_bin_boundaries_.push_back(num_total_bin);
} }
...@@ -1282,7 +1282,8 @@ void Dataset::AddFeaturesFrom(Dataset* other) { ...@@ -1282,7 +1282,8 @@ void Dataset::AddFeaturesFrom(Dataset* other) {
PushVector(&group_feature_cnt_, other->group_feature_cnt_); PushVector(&group_feature_cnt_, other->group_feature_cnt_);
feature_groups_.reserve(other->feature_groups_.size()); feature_groups_.reserve(other->feature_groups_.size());
for (auto& fg : other->feature_groups_) { for (auto& fg : other->feature_groups_) {
feature_groups_.emplace_back(new FeatureGroup(*fg)); const int cur_group_id = static_cast<int>(feature_groups_.size());
feature_groups_.emplace_back(new FeatureGroup(*fg, true, cur_group_id));
} }
for (auto feature_idx : other->used_feature_map_) { for (auto feature_idx : other->used_feature_map_) {
if (feature_idx >= 0) { if (feature_idx >= 0) {
...@@ -1311,34 +1312,37 @@ void Dataset::AddFeaturesFrom(Dataset* other) { ...@@ -1311,34 +1312,37 @@ void Dataset::AddFeaturesFrom(Dataset* other) {
int f_cnt = group_feature_cnt_[i]; int f_cnt = group_feature_cnt_[i];
features_in_group.emplace_back(); features_in_group.emplace_back();
for (int j = 0; j < f_cnt; ++j) { for (int j = 0; j < f_cnt; ++j) {
features_in_group.back().push_back(f_start + j); const int real_fidx = real_feature_idx_[f_start + j];
features_in_group.back().push_back(real_fidx);
} }
} }
feature_groups_[mv_gid]->AddFeaturesFrom( feature_groups_[mv_gid]->AddFeaturesFrom(
other->feature_groups_[other_mv_gid].get()); other->feature_groups_[other_mv_gid].get(), mv_gid);
for (int i = 0; i < other->num_groups_; ++i) { for (int i = 0; i < other->num_groups_; ++i) {
int f_start = other->group_feature_start_[i]; int f_start = other->group_feature_start_[i];
int f_cnt = other->group_feature_cnt_[i]; int f_cnt = other->group_feature_cnt_[i];
if (i == other_mv_gid) { if (i == other_mv_gid) {
for (int j = 0; j < f_cnt; ++j) { for (int j = 0; j < f_cnt; ++j) {
features_in_group[mv_gid].push_back(f_start + j); const int real_fidx = other->real_feature_idx_[f_start + j] + num_total_features_;
features_in_group[mv_gid].push_back(real_fidx);
} }
} else { } else {
features_in_group.emplace_back(); features_in_group.emplace_back();
for (int j = 0; j < f_cnt; ++j) { for (int j = 0; j < f_cnt; ++j) {
features_in_group.back().push_back(f_start + j); const int real_fidx = other->real_feature_idx_[f_start + j] + num_total_features_;
features_in_group.back().push_back(real_fidx);
} }
feature_groups_.emplace_back( feature_groups_.emplace_back(
new FeatureGroup(*other->feature_groups_[i])); new FeatureGroup(*other->feature_groups_[i], false, -1));
} }
} }
// regenerate other fields // regenerate other fields
num_groups_ += other->num_groups_ - 1; num_groups_ += other->num_groups_ - 1;
CHECK(num_groups_ == static_cast<int>(features_in_group.size())); CHECK(num_groups_ == static_cast<int>(features_in_group.size()));
num_features_ += other->num_features_; num_features_ += other->num_features_;
int cur_fidx = 0; int cur_fidx = 0;
used_feature_map_ = std::vector<int>(num_total_features_, -1); used_feature_map_ =
std::vector<int>(num_total_features_ + other->num_total_features_, -1);
real_feature_idx_.resize(num_features_); real_feature_idx_.resize(num_features_);
feature2group_.resize(num_features_); feature2group_.resize(num_features_);
feature2subfeature_.resize(num_features_); feature2subfeature_.resize(num_features_);
......
...@@ -543,7 +543,7 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b ...@@ -543,7 +543,7 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b
dataset->feature_groups_.emplace_back(std::unique_ptr<FeatureGroup>( dataset->feature_groups_.emplace_back(std::unique_ptr<FeatureGroup>(
new FeatureGroup(buffer.data(), new FeatureGroup(buffer.data(),
*num_global_data, *num_global_data,
*used_data_indices))); *used_data_indices, i)));
} }
dataset->feature_groups_.shrink_to_fit(); dataset->feature_groups_.shrink_to_fit();
dataset->is_finish_load_ = true; dataset->is_finish_load_ = true;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment