Unverified commit 53977f36, authored by Guolin Ke, committed by GitHub

Fix add features (#2754)



* fix subset bug

* typo

* add fixme tag

* bin mapper

* fix test

* fix add_features_from

* Update dataset.cpp

* fix merge bug

* added Python merge code

* added test for add_features

* Update dataset.cpp

* Update src/io/dataset.cpp

* continue implementing

* warn users about categorical features

Co-authored-by: StrikerRUS <nekit94-12@hotmail.com>
Co-authored-by: Nikita Titov <nekit94-08@mail.ru>
parent ceb6265f
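
For context, a minimal usage sketch of the Python API this commit fixes (shapes and column indices are illustrative; `add_features_from`, `construct`, `get_data`, and `set_categorical_feature` are the Dataset methods touched by or referenced in this PR):

    import numpy as np
    import lightgbm as lgb

    X1 = np.random.random((100, 5))
    X2 = np.random.random((100, 3))

    # Both Datasets must be constructed before merging; free_raw_data=False
    # keeps the Python-side matrices so they can be merged along with the
    # underlying C++ Dataset.
    d1 = lgb.Dataset(X1, free_raw_data=False).construct()
    d2 = lgb.Dataset(X2, free_raw_data=False).construct()

    d1.add_features_from(d2)
    print(d1.get_data().shape)  # (100, 8)

    # The merge resets categorical features to "auto" and warns, so they
    # must be declared again for the merged Dataset.
    d1.set_categorical_feature([0, 2])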
include/LightGBM/feature_group.h

 /*!
  * Copyright (c) 2017 Microsoft Corporation. All rights reserved.
- * Licensed under the MIT License. See LICENSE file in the project root for license information.
+ * Licensed under the MIT License. See LICENSE file in the project root for
+ * license information.
  */
 #ifndef LIGHTGBM_FEATURE_GROUP_H_
 #define LIGHTGBM_FEATURE_GROUP_H_
@@ -17,7 +18,8 @@ namespace LightGBM {
 class Dataset;
 class DatasetLoader;
-/*! \brief Using to store data and providing some operations on one feature group*/
+/*! \brief Using to store data and providing some operations on one feature
+ * group*/
 class FeatureGroup {
  public:
   friend Dataset;
@@ -122,9 +124,11 @@ class FeatureGroup {
     for (int i = 0; i < num_feature_; ++i) {
       int addi = bin_mappers_[i]->GetMostFreqBin() == 0 ? 0 : 1;
       if (bin_mappers_[i]->sparse_rate() >= kSparseThreshold) {
-        multi_bin_data_.emplace_back(Bin::CreateSparseBin(num_data, bin_mappers_[i]->num_bin() + addi));
+        multi_bin_data_.emplace_back(Bin::CreateSparseBin(
+            num_data, bin_mappers_[i]->num_bin() + addi));
       } else {
-        multi_bin_data_.emplace_back(Bin::CreateDenseBin(num_data, bin_mappers_[i]->num_bin() + addi));
+        multi_bin_data_.emplace_back(
+            Bin::CreateDenseBin(num_data, bin_mappers_[i]->num_bin() + addi));
       }
       multi_bin_data_.back()->LoadFromMemory(memory_ptr, local_used_indices);
       memory_ptr += multi_bin_data_.back()->SizesInByte();
@@ -141,8 +145,7 @@ class FeatureGroup {
   }
   /*! \brief Destructor */
-  ~FeatureGroup() {
-  }
+  ~FeatureGroup() {}

   /*!
    * \brief Push one record, will auto convert to bin and push to bin data
@@ -150,9 +153,12 @@ class FeatureGroup {
    * \param idx Index of record
    * \param value feature value of record
    */
-  inline void PushData(int tid, int sub_feature_idx, data_size_t line_idx, double value) {
+  inline void PushData(int tid, int sub_feature_idx, data_size_t line_idx,
+                       double value) {
     uint32_t bin = bin_mappers_[sub_feature_idx]->ValueToBin(value);
-    if (bin == bin_mappers_[sub_feature_idx]->GetMostFreqBin()) { return; }
+    if (bin == bin_mappers_[sub_feature_idx]->GetMostFreqBin()) {
+      return;
+    }
     if (bin_mappers_[sub_feature_idx]->GetMostFreqBin() == 0) {
       bin -= 1;
     }
@@ -184,6 +190,23 @@ class FeatureGroup {
     }
   }

+  void AddFeaturesFrom(const FeatureGroup* other) {
+    CHECK(is_multi_val_);
+    CHECK(other->is_multi_val_);
+    for (int i = 0; i < other->num_feature_; ++i) {
+      const auto& other_bin_mapper = other->bin_mappers_[i];
+      bin_mappers_.emplace_back(new BinMapper(*other_bin_mapper));
+      auto num_bin = other_bin_mapper->num_bin();
+      if (other_bin_mapper->GetMostFreqBin() == 0) {
+        num_bin -= 1;
+      }
+      num_total_bin_ += num_bin;
+      bin_offsets_.emplace_back(num_total_bin_);
+      multi_bin_data_.emplace_back(other->multi_bin_data_[i]->Clone());
+    }
+    num_feature_ += other->num_feature_;
+  }
+
   inline BinIterator* SubFeatureIterator(int sub_feature) {
     uint32_t most_freq_bin = bin_mappers_[sub_feature]->GetMostFreqBin();
     if (!is_multi_val_) {
@@ -194,14 +217,15 @@ class FeatureGroup {
       int addi = bin_mappers_[sub_feature]->GetMostFreqBin() == 0 ? 0 : 1;
       uint32_t min_bin = 1;
       uint32_t max_bin = bin_mappers_[sub_feature]->num_bin() - 1 + addi;
-      return multi_bin_data_[sub_feature]->GetIterator(min_bin, max_bin, most_freq_bin);
+      return multi_bin_data_[sub_feature]->GetIterator(min_bin, max_bin,
+                                                       most_freq_bin);
     }
   }

   inline void FinishLoad() {
     if (is_multi_val_) {
       OMP_INIT_EX();
 #pragma omp parallel for schedule(guided)
       for (int i = 0; i < num_feature_; ++i) {
         OMP_LOOP_EX_BEGIN();
         multi_bin_data_[i]->FinishLoad();
@@ -213,11 +237,6 @@ class FeatureGroup {
     }
   }

-  /*!
-   * \brief Returns a BinIterator that can access the entire feature group's raw data.
-   *        The RawGet() function of the iterator should be called for best efficiency.
-   * \return A pointer to the BinIterator object
-   */
   inline BinIterator* FeatureGroupIterator() {
     if (is_multi_val_) {
       return nullptr;
@@ -377,7 +396,8 @@ class FeatureGroup {
     }
     is_multi_val_ = true;
   } else {
-    if (force_sparse || (!force_dense && num_feature_ == 1 &&
+    if (force_sparse ||
+        (!force_dense && num_feature_ == 1 &&
          bin_mappers_[0]->sparse_rate() >= kSparseThreshold)) {
       is_sparse_ = true;
       bin_data_.reset(Bin::CreateSparseBin(num_data, num_total_bin_));
@@ -404,7 +424,6 @@ class FeatureGroup {
   int num_total_bin_;
 };

 }  // namespace LightGBM

 #endif  // LIGHTGBM_FEATURE_GROUP_H_
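
The bin bookkeeping in FeatureGroup::AddFeaturesFrom above hinges on one detail: a feature whose most frequent bin is 0 does not materialize that bin, so it contributes num_bin() - 1 bins to the group total. A hypothetical Python sketch of just this offset arithmetic (illustrative names, not LightGBM API):

    # Hypothetical mirror of the bin-offset bookkeeping in
    # FeatureGroup::AddFeaturesFrom (not LightGBM API).
    def merge_bin_offsets(num_total_bin, bin_offsets, other_features):
        """other_features: list of (num_bin, most_freq_bin) per appended feature."""
        for num_bin, most_freq_bin in other_features:
            if most_freq_bin == 0:
                num_bin -= 1  # most-frequent bin 0 is implicit, not stored
            num_total_bin += num_bin
            bin_offsets.append(num_total_bin)
        return num_total_bin, bin_offsets

    # e.g. a group holding 10 bins absorbing two features:
    total, offsets = merge_bin_offsets(10, [0, 10], [(5, 0), (3, 2)])
    print(total, offsets)  # 17 [0, 10, 14, 17]
    # the feature with most_freq_bin == 0 contributed only 4 of its 5 bins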
python-package/lightgbm/basic.py

@@ -1904,6 +1904,76 @@ class Dataset(object):
         if self.handle is None or other.handle is None:
             raise ValueError('Both source and target Datasets must be constructed before adding features')
         _safe_call(_LIB.LGBM_DatasetAddFeaturesFrom(self.handle, other.handle))
+        was_none = self.data is None
+        old_self_data_type = type(self.data).__name__
+        if other.data is None:
+            self.data = None
+        elif self.data is not None:
+            if isinstance(self.data, np.ndarray):
+                if isinstance(other.data, np.ndarray):
+                    self.data = np.hstack((self.data, other.data))
+                elif scipy.sparse.issparse(other.data):
+                    self.data = np.hstack((self.data, other.data.toarray()))
+                elif isinstance(other.data, DataFrame):
+                    self.data = np.hstack((self.data, other.data.values))
+                elif isinstance(other.data, DataTable):
+                    self.data = np.hstack((self.data, other.data.to_numpy()))
+                else:
+                    self.data = None
+            elif scipy.sparse.issparse(self.data):
+                sparse_format = self.data.getformat()
+                if isinstance(other.data, np.ndarray) or scipy.sparse.issparse(other.data):
+                    self.data = scipy.sparse.hstack((self.data, other.data), format=sparse_format)
+                elif isinstance(other.data, DataFrame):
+                    self.data = scipy.sparse.hstack((self.data, other.data.values), format=sparse_format)
+                elif isinstance(other.data, DataTable):
+                    self.data = scipy.sparse.hstack((self.data, other.data.to_numpy()), format=sparse_format)
+                else:
+                    self.data = None
+            elif isinstance(self.data, DataFrame):
+                if not PANDAS_INSTALLED:
+                    raise LightGBMError("Cannot add features to DataFrame type of raw data "
+                                        "without pandas installed")
+                from pandas import concat
+                if isinstance(other.data, np.ndarray):
+                    self.data = concat((self.data, DataFrame(other.data)),
+                                       axis=1, ignore_index=True)
+                elif scipy.sparse.issparse(other.data):
+                    self.data = concat((self.data, DataFrame(other.data.toarray())),
+                                       axis=1, ignore_index=True)
+                elif isinstance(other.data, DataFrame):
+                    self.data = concat((self.data, other.data),
+                                       axis=1, ignore_index=True)
+                elif isinstance(other.data, DataTable):
+                    self.data = concat((self.data, DataFrame(other.data.to_numpy())),
+                                       axis=1, ignore_index=True)
+                else:
+                    self.data = None
+            elif isinstance(self.data, DataTable):
+                if isinstance(other.data, np.ndarray):
+                    self.data = DataTable(np.hstack((self.data.to_numpy(), other.data)))
+                elif scipy.sparse.issparse(other.data):
+                    self.data = DataTable(np.hstack((self.data.to_numpy(), other.data.toarray())))
+                elif isinstance(other.data, DataFrame):
+                    self.data = DataTable(np.hstack((self.data.to_numpy(), other.data.values)))
+                elif isinstance(other.data, DataTable):
+                    self.data = DataTable(np.hstack((self.data.to_numpy(), other.data.to_numpy())))
+                else:
+                    self.data = None
+            else:
+                self.data = None
+        if self.data is None:
+            err_msg = ("Cannot add features from {} type of raw data to "
+                       "{} type of raw data.\n").format(type(other.data).__name__,
+                                                        old_self_data_type)
+            err_msg += ("Set free_raw_data=False when constructing the Dataset to avoid this"
+                        if was_none else "Freeing raw data")
+            warnings.warn(err_msg)
+        self.feature_name = self.get_feature_name()
+        warnings.warn("Resetting categorical features.\n"
+                      "You can set new categorical features via ``set_categorical_feature`` method")
+        self.categorical_feature = "auto"
+        self.pandas_categorical = None
+        return self

     def _dump_text(self, filename):
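One detail in the merge code above is easy to miss: scipy.sparse.hstack does not guarantee the output format unless the format argument is given, which is why the code captures self.data.getformat() first. A standalone check (plain numpy/scipy, nothing LightGBM-specific):

    import numpy as np
    import scipy.sparse

    a = scipy.sparse.csr_matrix(np.eye(3))
    b = np.ones((3, 2))  # dense blocks are accepted and converted

    default = scipy.sparse.hstack((a, b))
    preserved = scipy.sparse.hstack((a, b), format=a.getformat())
    # the default format is not guaranteed (typically COO); format= pins it
    print(default.getformat(), preserved.getformat())  # e.g. "coo csr"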
src/io/dataset.cpp

@@ -365,14 +365,17 @@ void Dataset::Construct(std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
   real_feature_idx_.resize(num_features_);
   feature2group_.resize(num_features_);
   feature2subfeature_.resize(num_features_);
-  int num_multi_val_group = 0;
   feature_need_push_zeros_.clear();
+  group_bin_boundaries_.clear();
+  uint64_t num_total_bin = 0;
+  group_bin_boundaries_.push_back(num_total_bin);
+  group_feature_start_.resize(num_groups_);
+  group_feature_cnt_.resize(num_groups_);
   for (int i = 0; i < num_groups_; ++i) {
     auto cur_features = features_in_group[i];
     int cur_cnt_features = static_cast<int>(cur_features.size());
-    if (group_is_multi_val[i]) {
-      ++num_multi_val_group;
-    }
+    group_feature_start_[i] = cur_fidx;
+    group_feature_cnt_[i] = cur_cnt_features;
     // get bin_mappers
     std::vector<std::unique_ptr<BinMapper>> cur_bin_mappers;
     for (int j = 0; j < cur_cnt_features; ++j) {
@@ -388,32 +391,11 @@ void Dataset::Construct(std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
       }
       ++cur_fidx;
     }
-    feature_groups_.emplace_back(std::unique_ptr<FeatureGroup>(new FeatureGroup(
-        cur_cnt_features, group_is_multi_val[i], &cur_bin_mappers, num_data_)));
-  }
-  feature_groups_.shrink_to_fit();
-  group_bin_boundaries_.clear();
-  uint64_t num_total_bin = 0;
-  group_bin_boundaries_.push_back(num_total_bin);
-  for (int i = 0; i < num_groups_; ++i) {
+    feature_groups_.emplace_back(std::unique_ptr<FeatureGroup>(
+        new FeatureGroup(cur_cnt_features, group_is_multi_val[i], &cur_bin_mappers, num_data_)));
     num_total_bin += feature_groups_[i]->num_total_bin_;
     group_bin_boundaries_.push_back(num_total_bin);
   }
-  int last_group = 0;
-  group_feature_start_.reserve(num_groups_);
-  group_feature_cnt_.reserve(num_groups_);
-  group_feature_start_.push_back(0);
-  group_feature_cnt_.push_back(1);
-  for (int i = 1; i < num_features_; ++i) {
-    const int group = feature2group_[i];
-    if (group == last_group) {
-      group_feature_cnt_.back() = group_feature_cnt_.back() + 1;
-    } else {
-      group_feature_start_.push_back(i);
-      group_feature_cnt_.push_back(1);
-      last_group = group;
-    }
-  }
   if (!io_config.max_bin_by_feature.empty()) {
     CHECK_EQ(static_cast<size_t>(num_total_features_),
              io_config.max_bin_by_feature.size());
@@ -725,8 +707,13 @@ void Dataset::CreateValid(const Dataset* dataset) {
   num_groups_ = num_features_;
   feature2group_.clear();
   feature2subfeature_.clear();
-  // copy feature bin mapper data
   feature_need_push_zeros_.clear();
+  group_bin_boundaries_.clear();
+  uint64_t num_total_bin = 0;
+  group_bin_boundaries_.push_back(num_total_bin);
+  group_feature_start_.resize(num_groups_);
+  group_feature_cnt_.resize(num_groups_);
   for (int i = 0; i < num_features_; ++i) {
     std::vector<std::unique_ptr<BinMapper>> bin_mappers;
     bin_mappers.emplace_back(new BinMapper(*(dataset->FeatureBinMapper(i))));
@@ -737,6 +724,10 @@ void Dataset::CreateValid(const Dataset* dataset) {
     feature_groups_.emplace_back(new FeatureGroup(&bin_mappers, num_data_));
     feature2group_.push_back(i);
     feature2subfeature_.push_back(0);
+    num_total_bin += feature_groups_[i]->num_total_bin_;
+    group_bin_boundaries_.push_back(num_total_bin);
+    group_feature_start_[i] = i;
+    group_feature_cnt_[i] = 1;
   }
   feature_groups_.shrink_to_fit();
@@ -745,28 +736,6 @@ void Dataset::CreateValid(const Dataset* dataset) {
   feature_names_ = dataset->feature_names_;
   label_idx_ = dataset->label_idx_;
   real_feature_idx_ = dataset->real_feature_idx_;
-  group_bin_boundaries_.clear();
-  uint64_t num_total_bin = 0;
-  group_bin_boundaries_.push_back(num_total_bin);
-  for (int i = 0; i < num_groups_; ++i) {
-    num_total_bin += feature_groups_[i]->num_total_bin_;
-    group_bin_boundaries_.push_back(num_total_bin);
-  }
-  int last_group = 0;
-  group_feature_start_.reserve(num_groups_);
-  group_feature_cnt_.reserve(num_groups_);
-  group_feature_start_.push_back(0);
-  group_feature_cnt_.push_back(1);
-  for (int i = 1; i < num_features_; ++i) {
-    const int group = feature2group_[i];
-    if (group == last_group) {
-      group_feature_cnt_.back() = group_feature_cnt_.back() + 1;
-    } else {
-      group_feature_start_.push_back(i);
-      group_feature_cnt_.push_back(1);
-      last_group = group;
-    }
-  }
   forced_bin_bounds_ = dataset->forced_bin_bounds_;
 }
@@ -1319,10 +1288,11 @@ void Dataset::ConstructHistogramsInner(
   int multi_val_groud_id = -1;
   used_dense_group.reserve(num_groups_);
   for (int group = 0; group < num_groups_; ++group) {
+    const int f_start = group_feature_start_[group];
     const int f_cnt = group_feature_cnt_[group];
     bool is_group_used = false;
     for (int j = 0; j < f_cnt; ++j) {
-      const int fidx = group_feature_start_[group] + j;
+      const int fidx = f_start + j;
       if (is_feature_used[fidx]) {
         is_group_used = true;
         break;
@@ -1494,13 +1464,23 @@ void Dataset::AddFeaturesFrom(Dataset* other) {
         "Cannot add features from other Dataset with a different number of "
         "rows");
   }
-  PushVector(&feature_names_, other->feature_names_);
+  int mv_gid = -1;
+  int other_mv_gid = -1;
+  for (int i = 0; i < num_groups_; ++i) {
+    if (IsMultiGroup(i)) {
+      mv_gid = i;
+    }
+  }
+  for (int i = 0; i < other->num_groups_; ++i) {
+    if (other->IsMultiGroup(i)) {
+      other_mv_gid = i;
+    }
+  }
+  // At most one side has a multi-val group: the groups can simply be merged
+  if (mv_gid < 0 || other_mv_gid < 0) {
     PushVector(&feature2subfeature_, other->feature2subfeature_);
     PushVector(&group_feature_cnt_, other->group_feature_cnt_);
-  PushVector(&forced_bin_bounds_, other->forced_bin_bounds_);
     feature_groups_.reserve(other->feature_groups_.size());
-  // FIXME: fix the multiple multi-val feature groups, they need to be merged
-  // into one multi-val group
     for (auto& fg : other->feature_groups_) {
       feature_groups_.emplace_back(new FeatureGroup(*fg));
     }
@@ -1511,7 +1491,8 @@ void Dataset::AddFeaturesFrom(Dataset* other) {
         used_feature_map_.push_back(-1);  // Unused feature.
       }
     }
-    PushOffset(&real_feature_idx_, other->real_feature_idx_, num_total_features_);
+    PushOffset(&real_feature_idx_, other->real_feature_idx_,
+               num_total_features_);
     PushOffset(&feature2group_, other->feature2group_, num_groups_);
     auto bin_offset = group_bin_boundaries_.back();
     // Skip the leading 0 when copying group_bin_boundaries.
@@ -1519,12 +1500,95 @@ void Dataset::AddFeaturesFrom(Dataset* other) {
     for (auto i = other->group_bin_boundaries_.begin() + 1;
          i < other->group_bin_boundaries_.end(); ++i) {
      group_bin_boundaries_.push_back(*i + bin_offset);
    }
-  PushOffset(&group_feature_start_, other->group_feature_start_, num_features_);
-  PushClearIfEmpty(&max_bin_by_feature_, num_total_features_, other->max_bin_by_feature_, other->num_total_features_, -1);
-  num_features_ += other->num_features_;
+    PushOffset(&group_feature_start_, other->group_feature_start_,
+               num_features_);
+    num_groups_ += other->num_groups_;
+    num_features_ += other->num_features_;
+  } else {
+    std::vector<std::vector<int>> features_in_group;
+    for (int i = 0; i < num_groups_; ++i) {
+      int f_start = group_feature_start_[i];
+      int f_cnt = group_feature_cnt_[i];
+      features_in_group.emplace_back();
+      for (int j = 0; j < f_cnt; ++j) {
+        features_in_group.back().push_back(f_start + j);
+      }
+    }
+    feature_groups_[mv_gid]->AddFeaturesFrom(
+        other->feature_groups_[other_mv_gid].get());
+    for (int i = 0; i < other->num_groups_; ++i) {
+      int f_start = other->group_feature_start_[i];
+      int f_cnt = other->group_feature_cnt_[i];
+      if (i == other_mv_gid) {
+        for (int j = 0; j < f_cnt; ++j) {
+          features_in_group[mv_gid].push_back(f_start + j);
+        }
+      } else {
+        features_in_group.emplace_back();
+        for (int j = 0; j < f_cnt; ++j) {
+          features_in_group.back().push_back(f_start + j);
+        }
+        feature_groups_.emplace_back(
+            new FeatureGroup(*other->feature_groups_[i]));
+      }
+    }
+    // regenerate other fields
+    num_groups_ += other->num_groups_ - 1;
+    CHECK(num_groups_ == static_cast<int>(features_in_group.size()));
+    num_features_ += other->num_features_;
+    int cur_fidx = 0;
+    used_feature_map_ = std::vector<int>(num_total_features_, -1);
+    real_feature_idx_.resize(num_features_);
+    feature2group_.resize(num_features_);
+    feature2subfeature_.resize(num_features_);
+    group_feature_start_.resize(num_groups_);
+    group_feature_cnt_.resize(num_groups_);
+    group_bin_boundaries_.clear();
+    uint64_t num_total_bin = 0;
+    group_bin_boundaries_.push_back(num_total_bin);
+    for (int i = 0; i < num_groups_; ++i) {
+      auto cur_features = features_in_group[i];
+      int cur_cnt_features = static_cast<int>(cur_features.size());
+      group_feature_start_[i] = cur_fidx;
+      group_feature_cnt_[i] = cur_cnt_features;
+      for (int j = 0; j < cur_cnt_features; ++j) {
+        int real_fidx = cur_features[j];
+        used_feature_map_[real_fidx] = cur_fidx;
+        real_feature_idx_[cur_fidx] = real_fidx;
+        feature2group_[cur_fidx] = i;
+        feature2subfeature_[cur_fidx] = j;
+        ++cur_fidx;
+      }
+      num_total_bin += feature_groups_[i]->num_total_bin_;
+      group_bin_boundaries_.push_back(num_total_bin);
+    }
+  }
+  std::unordered_set<std::string> feature_names_set;
+  for (const auto& val : feature_names_) {
+    feature_names_set.emplace(val);
+  }
+  for (const auto& val : other->feature_names_) {
+    std::string new_name = val;
+    int cnt = 2;
+    while (feature_names_set.count(new_name)) {
+      new_name = "D" + std::to_string(cnt) + "_" + val;
+      ++cnt;
+    }
+    if (new_name != val) {
+      Log::Warning(
+          "Found the same feature name (%s) in Dataset::AddFeaturesFrom, "
+          "changing its name to (%s)",
+          val.c_str(), new_name.c_str());
+    }
+    feature_names_set.emplace(new_name);
+    feature_names_.push_back(new_name);
+  }
+  PushVector(&forced_bin_bounds_, other->forced_bin_bounds_);
+  PushClearIfEmpty(&max_bin_by_feature_, num_total_features_,
+                   other->max_bin_by_feature_, other->num_total_features_, -1);
   num_total_features_ += other->num_total_features_;
-  num_groups_ += other->num_groups_;
 }
 }  // namespace LightGBM
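
The "regenerate other fields" step above rebuilds every per-group index from features_in_group instead of patching the old arrays. A hypothetical Python sketch of the invariants it restores (the names mirror the C++ members, but this is not LightGBM API):

    # Hypothetical sketch of the per-group index regeneration (not LightGBM API).
    def regenerate_group_fields(features_in_group, bins_per_group):
        group_feature_start, group_feature_cnt = [], []
        feature2group, feature2subfeature, real_feature_idx = [], [], []
        group_bin_boundaries = [0]
        cur_fidx = 0
        for gid, (feats, n_bins) in enumerate(zip(features_in_group, bins_per_group)):
            group_feature_start.append(cur_fidx)
            group_feature_cnt.append(len(feats))
            for sub, real_fidx in enumerate(feats):
                real_feature_idx.append(real_fidx)
                feature2group.append(gid)
                feature2subfeature.append(sub)
                cur_fidx += 1
            group_bin_boundaries.append(group_bin_boundaries[-1] + n_bins)
        return (group_feature_start, group_feature_cnt, feature2group,
                feature2subfeature, real_feature_idx, group_bin_boundaries)

    # two groups holding real features [0, 2] and [1], with 8 and 4 bins:
    print(regenerate_group_fields([[0, 2], [1]], [8, 4]))
    # ([0, 2], [2, 1], [0, 0, 1], [0, 1, 0], [0, 2, 1], [0, 8, 12])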
tests/python_package_test/test_basic.py

@@ -188,6 +188,42 @@ class TestBasic(unittest.TestCase):
             d1txt = d1f.read()
         self.assertEqual(dtxt, d1txt)

+    @unittest.skipIf(not lgb.compat.PANDAS_INSTALLED, 'pandas is not installed')
+    def test_add_features_from_different_sources(self):
+        import pandas as pd
+        n_row = 100
+        n_col = 5
+        X = np.random.random((n_row, n_col))
+        xxs = [X, sparse.csr_matrix(X), pd.DataFrame(X)]
+        names = ['col_%d' % i for i in range(n_col)]
+        for x_1 in xxs:
+            # test that method works even with free_raw_data=True
+            d1 = lgb.Dataset(x_1, feature_name=names, free_raw_data=True).construct()
+            d2 = lgb.Dataset(x_1, feature_name=names, free_raw_data=True).construct()
+            d1.add_features_from(d2)
+            self.assertIsNone(d1.data)
+            # test that method works but sets raw data to None in case of non-mergeable data types
+            d1 = lgb.Dataset(x_1, feature_name=names, free_raw_data=False).construct()
+            d2 = lgb.Dataset([X[:n_row // 2, :], X[n_row // 2:, :]],
+                             feature_name=names, free_raw_data=False).construct()
+            d1.add_features_from(d2)
+            self.assertIsNone(d1.data)
+            # test that method works for different data types
+            d1 = lgb.Dataset(x_1, feature_name=names, free_raw_data=False).construct()
+            res_feature_names = [name for name in names]
+            for idx, x_2 in enumerate(xxs, 2):
+                original_type = type(d1.get_data())
+                d2 = lgb.Dataset(x_2, feature_name=names, free_raw_data=False).construct()
+                d1.add_features_from(d2)
+                self.assertIsInstance(d1.get_data(), original_type)
+                self.assertTupleEqual(d1.get_data().shape, (n_row, n_col * idx))
+                res_feature_names += ['D{}_{}'.format(idx, name) for name in names]
+            self.assertListEqual(d1.feature_name, res_feature_names)
+
     def test_cegb_affects_behavior(self):
         X = np.random.random((100, 5))
         X[:, [1, 3]] = 0
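
The D{idx}_{name} entries asserted by res_feature_names in the test come from the duplicate-name handling added in Dataset::AddFeaturesFrom. A hypothetical Python mirror of that loop (illustrative only, not LightGBM API):

    # Hypothetical mirror of the C++ duplicate-name handling (not LightGBM API).
    def merge_feature_names(existing, incoming):
        taken = set(existing)
        merged = list(existing)
        for name in incoming:
            new_name, cnt = name, 2
            while new_name in taken:
                # clashing names become D2_<name>, then D3_<name>, and so on
                new_name = "D{}_{}".format(cnt, name)
                cnt += 1
            taken.add(new_name)
            merged.append(new_name)
        return merged

    print(merge_feature_names(["col_0", "col_1"], ["col_0", "f2"]))
    # ['col_0', 'col_1', 'D2_col_0', 'f2']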