Unverified Commit 699d4381 authored by Guolin Ke's avatar Guolin Ke Committed by GitHub
Browse files

fix bug in feature fraction (#1099)

* fix feature fraction

* fix bugs.
parent a957bd62
......@@ -361,7 +361,15 @@ public:
/*!
 * \brief Get the total number of bins
 * \return The last entry of group_bin_boundaries_ (presumably the cumulative bin count over all feature groups -- confirm)
 */
inline uint64_t NumTotalBin() const {
  const auto& last_boundary = group_bin_boundaries_.back();
  return last_boundary;
}
/*!
 * \brief Collect the indices of all features that have a non-negative entry in used_feature_map_
 * \return Indices in [0, num_total_features_) whose used_feature_map_ entry is >= 0, in ascending order
 */
inline std::vector<int> ValidFeatureIndices() const {
  std::vector<int> valid_indices;
  int feature_idx = 0;
  while (feature_idx < num_total_features_) {
    // a negative map entry marks a feature that is skipped here
    if (used_feature_map_[feature_idx] >= 0) {
      valid_indices.push_back(feature_idx);
    }
    ++feature_idx;
  }
  return valid_indices;
}
/*!
 * \brief Resize this object for num_data rows (inferred from the name; the definition is not visible here -- confirm)
 * \param num_data Target number of data rows
 */
void ReSize(data_size_t num_data);
/*!
 * \brief Copy a subset from another dataset (semantics inferred from the signature; the definition is not visible here -- confirm)
 * \param fullset Source dataset
 * \param used_indices Array of indices into fullset; presumably the rows to copy -- confirm
 * \param num_used_indices Number of entries in used_indices
 * \param need_meta_data Presumably whether meta data is copied as well -- confirm
 */
void CopySubset(const Dataset* fullset, const data_size_t* used_indices, data_size_t num_used_indices, bool need_meta_data);
......
......@@ -78,6 +78,7 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian
// initialize data partition
data_partition_.reset(new DataPartition(num_data_, tree_config_->num_leaves));
is_feature_used_.resize(num_features_);
valid_feature_indices_ = train_data_->ValidFeatureIndices();
// initialize ordered gradients and hessians
ordered_gradients_.resize(num_data_);
ordered_hessians_.resize(num_data_);
......@@ -237,16 +238,19 @@ void SerialTreeLearner::BeforeTrain() {
histogram_pool_.ResetMap();
if (tree_config_->feature_fraction < 1) {
int used_feature_cnt = static_cast<int>(train_data_->num_total_features()*tree_config_->feature_fraction);
int used_feature_cnt = static_cast<int>(valid_feature_indices_.size()*tree_config_->feature_fraction);
// at least use one feature
used_feature_cnt = std::max(used_feature_cnt, 1);
// initialize used features
std::memset(is_feature_used_.data(), 0, sizeof(int8_t) * num_features_);
// Get used feature at current tree
auto used_feature_indices = random_.Sample(train_data_->num_total_features(), used_feature_cnt);
int omp_loop_size = static_cast<int>(used_feature_indices.size());
auto sampled_indices = random_.Sample(valid_feature_indices_.size(), used_feature_cnt);
int omp_loop_size = static_cast<int>(sampled_indices.size());
#pragma omp parallel for schedule(static, 512) if (omp_loop_size >= 1024)
for (int i = 0; i < omp_loop_size; ++i) {
int inner_feature_index = train_data_->InnerFeatureIndex(used_feature_indices[i]);
if (inner_feature_index < 0) { continue; }
int used_feature = valid_feature_indices_[sampled_indices[i]];
int inner_feature_index = train_data_->InnerFeatureIndex(used_feature);
CHECK(inner_feature_index >= 0);
is_feature_used_[inner_feature_index] = 1;
}
} else {
......
......@@ -125,6 +125,7 @@ protected:
std::unique_ptr<LeafSplits> smaller_leaf_splits_;
/*! \brief stores best thresholds for all feature for larger leaf */
std::unique_ptr<LeafSplits> larger_leaf_splits_;
/*! \brief feature indices returned by Dataset::ValidFeatureIndices(); feature_fraction sampling draws positions in this list */
std::vector<int> valid_feature_indices_;
#ifdef USE_GPU
/*! \brief gradients of current iteration, ordered for cache optimized, aligned to 4K page */
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment