Unverified commit 699d4381, authored by Guolin Ke, committed by GitHub
Browse files

fix bug in feature fraction (#1099)

* fix feature fraction

* fix bugs.
parent a957bd62
...@@ -361,7 +361,15 @@ public: ...@@ -361,7 +361,15 @@ public:
inline uint64_t NumTotalBin() const { inline uint64_t NumTotalBin() const {
return group_bin_boundaries_.back(); return group_bin_boundaries_.back();
} }
/*!
* \brief Collect the indices of all features whose entry in used_feature_map_ is non-negative
* \return Indices in [0, num_total_features_), in ascending order
*/
inline std::vector<int> ValidFeatureIndices() const {
  std::vector<int> valid_indices;
  int feature_idx = 0;
  while (feature_idx < num_total_features_) {
    // skip every feature whose map entry is negative
    if (!(used_feature_map_[feature_idx] < 0)) {
      valid_indices.push_back(feature_idx);
    }
    ++feature_idx;
  }
  return valid_indices;
}
void ReSize(data_size_t num_data); void ReSize(data_size_t num_data);
void CopySubset(const Dataset* fullset, const data_size_t* used_indices, data_size_t num_used_indices, bool need_meta_data); void CopySubset(const Dataset* fullset, const data_size_t* used_indices, data_size_t num_used_indices, bool need_meta_data);
......
...@@ -78,6 +78,7 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian ...@@ -78,6 +78,7 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian
// initialize data partition // initialize data partition
data_partition_.reset(new DataPartition(num_data_, tree_config_->num_leaves)); data_partition_.reset(new DataPartition(num_data_, tree_config_->num_leaves));
is_feature_used_.resize(num_features_); is_feature_used_.resize(num_features_);
valid_feature_indices_ = train_data_->ValidFeatureIndices();
// initialize ordered gradients and hessians // initialize ordered gradients and hessians
ordered_gradients_.resize(num_data_); ordered_gradients_.resize(num_data_);
ordered_hessians_.resize(num_data_); ordered_hessians_.resize(num_data_);
...@@ -237,16 +238,19 @@ void SerialTreeLearner::BeforeTrain() { ...@@ -237,16 +238,19 @@ void SerialTreeLearner::BeforeTrain() {
histogram_pool_.ResetMap(); histogram_pool_.ResetMap();
if (tree_config_->feature_fraction < 1) { if (tree_config_->feature_fraction < 1) {
int used_feature_cnt = static_cast<int>(train_data_->num_total_features()*tree_config_->feature_fraction); int used_feature_cnt = static_cast<int>(valid_feature_indices_.size()*tree_config_->feature_fraction);
// at least use one feature
used_feature_cnt = std::max(used_feature_cnt, 1);
// initialize used features // initialize used features
std::memset(is_feature_used_.data(), 0, sizeof(int8_t) * num_features_); std::memset(is_feature_used_.data(), 0, sizeof(int8_t) * num_features_);
// Get used feature at current tree // Get used feature at current tree
auto used_feature_indices = random_.Sample(train_data_->num_total_features(), used_feature_cnt); auto sampled_indices = random_.Sample(valid_feature_indices_.size(), used_feature_cnt);
int omp_loop_size = static_cast<int>(used_feature_indices.size()); int omp_loop_size = static_cast<int>(sampled_indices.size());
#pragma omp parallel for schedule(static, 512) if (omp_loop_size >= 1024) #pragma omp parallel for schedule(static, 512) if (omp_loop_size >= 1024)
for (int i = 0; i < omp_loop_size; ++i) { for (int i = 0; i < omp_loop_size; ++i) {
int inner_feature_index = train_data_->InnerFeatureIndex(used_feature_indices[i]); int used_feature = valid_feature_indices_[sampled_indices[i]];
if (inner_feature_index < 0) { continue; } int inner_feature_index = train_data_->InnerFeatureIndex(used_feature);
CHECK(inner_feature_index >= 0);
is_feature_used_[inner_feature_index] = 1; is_feature_used_[inner_feature_index] = 1;
} }
} else { } else {
......
...@@ -125,6 +125,7 @@ protected: ...@@ -125,6 +125,7 @@ protected:
std::unique_ptr<LeafSplits> smaller_leaf_splits_; std::unique_ptr<LeafSplits> smaller_leaf_splits_;
/*! \brief stores best thresholds for all feature for larger leaf */ /*! \brief stores best thresholds for all feature for larger leaf */
std::unique_ptr<LeafSplits> larger_leaf_splits_; std::unique_ptr<LeafSplits> larger_leaf_splits_;
std::vector<int> valid_feature_indices_;
#ifdef USE_GPU #ifdef USE_GPU
/*! \brief gradients of current iteration, ordered for cache optimized, aligned to 4K page */ /*! \brief gradients of current iteration, ordered for cache optimized, aligned to 4K page */
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment