Unverified Commit 530b5cef authored by shiyu1994's avatar shiyu1994 Committed by GitHub
Browse files

Fix #3557 and potential issue with dense multi-val feature groups. (#3590)

Fix num_total_bin_ and bin_offsets_ of FeatureGroup
when a dense multi-val feature group with a non-zero most-freq bin
is the first feature group of the dataset.
parent c5d9d243
......@@ -37,7 +37,8 @@ class FeatureGroup {
*/
FeatureGroup(int num_feature, int8_t is_multi_val,
std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
data_size_t num_data) : num_feature_(num_feature), is_multi_val_(is_multi_val > 0), is_sparse_(false) {
data_size_t num_data, int group_id) :
num_feature_(num_feature), is_multi_val_(is_multi_val > 0), is_sparse_(false) {
CHECK_EQ(static_cast<int>(bin_mappers->size()), num_feature);
auto& ref_bin_mappers = *bin_mappers;
double sum_sparse_rate = 0.0f;
......@@ -55,6 +56,12 @@ class FeatureGroup {
}
// use bin at zero to store most_freq_bin only when not using dense multi val bin
num_total_bin_ = offset;
// however, we should force to leave one bin, if dense multi val bin is the first bin
// and its first feature has most freq bin > 0
if (group_id == 0 && num_feature_ > 0 && is_dense_multi_val_ &&
bin_mappers_[0]->GetMostFreqBin() > 0) {
num_total_bin_ = 1;
}
bin_offsets_.emplace_back(num_total_bin_);
for (int i = 0; i < num_feature_; ++i) {
auto num_bin = bin_mappers_[i]->num_bin();
......@@ -109,7 +116,8 @@ class FeatureGroup {
* \param local_used_indices Local used indices, empty means using all data
*/
FeatureGroup(const void* memory, data_size_t num_all_data,
const std::vector<data_size_t>& local_used_indices) {
const std::vector<data_size_t>& local_used_indices,
int group_id) {
const char* memory_ptr = reinterpret_cast<const char*>(memory);
// get is_sparse
is_multi_val_ = *(reinterpret_cast<const bool*>(memory_ptr));
......@@ -122,19 +130,33 @@ class FeatureGroup {
memory_ptr += VirtualFileWriter::AlignedSize(sizeof(num_feature_));
// get bin mapper
bin_mappers_.clear();
for (int i = 0; i < num_feature_; ++i) {
bin_mappers_.emplace_back(new BinMapper(memory_ptr));
memory_ptr += bin_mappers_[i]->SizesInByte();
}
bin_offsets_.clear();
// start from 1, due to need to store zero bin in this slot
int offset = 1;
if (is_dense_multi_val_) {
offset = 0;
}
// use bin at zero to store most_freq_bin only when not using dense multi val bin
num_total_bin_ = offset;
// however, we should force to leave one bin, if dense multi val bin is the first bin
// and its first feature has most freq bin > 0
if (group_id == 0 && num_feature_ > 0 && is_dense_multi_val_ &&
bin_mappers_[0]->GetMostFreqBin() > 0) {
num_total_bin_ = 1;
}
bin_offsets_.emplace_back(num_total_bin_);
for (int i = 0; i < num_feature_; ++i) {
bin_mappers_.emplace_back(new BinMapper(memory_ptr));
auto num_bin = bin_mappers_[i]->num_bin();
if (bin_mappers_[i]->GetMostFreqBin() == 0) {
num_bin -= 1;
num_bin -= offset;
}
num_total_bin_ += num_bin;
bin_offsets_.emplace_back(num_total_bin_);
memory_ptr += bin_mappers_[i]->SizesInByte();
}
data_size_t num_data = num_all_data;
if (!local_used_indices.empty()) {
......@@ -210,7 +232,7 @@ class FeatureGroup {
}
}
void AddFeaturesFrom(const FeatureGroup* other) {
void AddFeaturesFrom(const FeatureGroup* other, int group_id) {
CHECK(is_multi_val_);
CHECK(other->is_multi_val_);
// every time when new features are added, we need to reconsider sparse or dense
......@@ -231,6 +253,12 @@ class FeatureGroup {
}
bin_offsets_.clear();
num_total_bin_ = offset;
// however, we should force to leave one bin, if dense multi val bin is the first bin
// and its first feature has most freq bin > 0
if (group_id == 0 && num_feature_ > 0 && is_dense_multi_val_ &&
bin_mappers_[0]->GetMostFreqBin() > 0) {
num_total_bin_ = 1;
}
bin_offsets_.emplace_back(num_total_bin_);
for (int i = 0; i < num_feature_; ++i) {
auto num_bin = bin_mappers_[i]->num_bin();
......@@ -248,6 +276,7 @@ class FeatureGroup {
num_bin -= offset;
}
num_total_bin_ += num_bin;
bin_offsets_.emplace_back(num_total_bin_);
multi_bin_data_.emplace_back(other->multi_bin_data_[i]->Clone());
}
num_feature_ += other->num_feature_;
......@@ -407,7 +436,8 @@ class FeatureGroup {
FeatureGroup& operator=(const FeatureGroup&) = delete;
/*! \brief Deep copy */
FeatureGroup(const FeatureGroup& other) {
FeatureGroup(const FeatureGroup& other, bool should_handle_dense_mv,
int group_id) {
num_feature_ = other.num_feature_;
is_multi_val_ = other.is_multi_val_;
is_dense_multi_val_ = other.is_dense_multi_val_;
......@@ -427,6 +457,17 @@ class FeatureGroup {
multi_bin_data_.emplace_back(other.multi_bin_data_[i]->Clone());
}
}
if (should_handle_dense_mv && is_dense_multi_val_ && group_id > 0) {
// this feature group was the first feature group, but now no longer is,
// so we need to eliminate its special empty bin for multi val dense bin
if (bin_mappers_[0]->GetMostFreqBin() > 0 && bin_offsets_[0] == 1) {
for (size_t i = 0; i < bin_offsets_.size(); ++i) {
bin_offsets_[i] -= 1;
}
num_total_bin_ -= 1;
}
}
}
private:
......
......@@ -392,7 +392,7 @@ void Dataset::Construct(std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
++cur_fidx;
}
feature_groups_.emplace_back(std::unique_ptr<FeatureGroup>(
new FeatureGroup(cur_cnt_features, group_is_multi_val[i], &cur_bin_mappers, num_data_)));
new FeatureGroup(cur_cnt_features, group_is_multi_val[i], &cur_bin_mappers, num_data_, i)));
num_total_bin += feature_groups_[i]->num_total_bin_;
group_bin_boundaries_.push_back(num_total_bin);
}
......@@ -1282,7 +1282,8 @@ void Dataset::AddFeaturesFrom(Dataset* other) {
PushVector(&group_feature_cnt_, other->group_feature_cnt_);
feature_groups_.reserve(other->feature_groups_.size());
for (auto& fg : other->feature_groups_) {
feature_groups_.emplace_back(new FeatureGroup(*fg));
const int cur_group_id = static_cast<int>(feature_groups_.size());
feature_groups_.emplace_back(new FeatureGroup(*fg, true, cur_group_id));
}
for (auto feature_idx : other->used_feature_map_) {
if (feature_idx >= 0) {
......@@ -1311,34 +1312,37 @@ void Dataset::AddFeaturesFrom(Dataset* other) {
int f_cnt = group_feature_cnt_[i];
features_in_group.emplace_back();
for (int j = 0; j < f_cnt; ++j) {
features_in_group.back().push_back(f_start + j);
const int real_fidx = real_feature_idx_[f_start + j];
features_in_group.back().push_back(real_fidx);
}
}
feature_groups_[mv_gid]->AddFeaturesFrom(
other->feature_groups_[other_mv_gid].get());
other->feature_groups_[other_mv_gid].get(), mv_gid);
for (int i = 0; i < other->num_groups_; ++i) {
int f_start = other->group_feature_start_[i];
int f_cnt = other->group_feature_cnt_[i];
if (i == other_mv_gid) {
for (int j = 0; j < f_cnt; ++j) {
features_in_group[mv_gid].push_back(f_start + j);
const int real_fidx = other->real_feature_idx_[f_start + j] + num_total_features_;
features_in_group[mv_gid].push_back(real_fidx);
}
} else {
features_in_group.emplace_back();
for (int j = 0; j < f_cnt; ++j) {
features_in_group.back().push_back(f_start + j);
const int real_fidx = other->real_feature_idx_[f_start + j] + num_total_features_;
features_in_group.back().push_back(real_fidx);
}
feature_groups_.emplace_back(
new FeatureGroup(*other->feature_groups_[i]));
new FeatureGroup(*other->feature_groups_[i], false, -1));
}
}
// regenerate other fields
num_groups_ += other->num_groups_ - 1;
CHECK(num_groups_ == static_cast<int>(features_in_group.size()));
num_features_ += other->num_features_;
int cur_fidx = 0;
used_feature_map_ = std::vector<int>(num_total_features_, -1);
used_feature_map_ =
std::vector<int>(num_total_features_ + other->num_total_features_, -1);
real_feature_idx_.resize(num_features_);
feature2group_.resize(num_features_);
feature2subfeature_.resize(num_features_);
......
......@@ -543,7 +543,7 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b
dataset->feature_groups_.emplace_back(std::unique_ptr<FeatureGroup>(
new FeatureGroup(buffer.data(),
*num_global_data,
*used_data_indices)));
*used_data_indices, i)));
}
dataset->feature_groups_.shrink_to_fit();
dataset->is_finish_load_ = true;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment