Unverified Commit d163c2c1 authored by shiyu1994's avatar shiyu1994 Committed by GitHub
Browse files

Log warnings for number of bins of categorical features (#4448)

* log a warning when the number of bins of a categorical feature exceeds the configured maximum number of bins

* log only one warning for all categorical features

* Add #include <memory> for unique_ptr

* remove useless param description
parent 17d4e007
......@@ -7,6 +7,7 @@
#include <LightGBM/dataset.h>
#include <memory>
#include <string>
#include <unordered_set>
#include <vector>
......@@ -63,6 +64,16 @@ class DatasetLoader {
/*! \brief Check can load from binary file */
std::string CheckCanLoadFromBin(const char* filename);
/*! \brief Check the number of bins for categorical features.
* The number of bins of a categorical feature may exceed the configured maximum value.
* When at least one such feature is found, the warning messages are logged once
* for the whole set of features rather than once per offending feature.
*
* \param bin_mappers the bin_mappers of all features
* \param max_bin max_bin from Config; used as the limit for every feature when max_bin_by_feature is empty
* \param max_bin_by_feature max_bin_by_feature from Config; when non-empty, entry i is the limit compared against bin_mappers[i]
*/
void CheckCategoricalFeatureNumBin(const std::vector<std::unique_ptr<BinMapper>>& bin_mappers, const int max_bin, const std::vector<int>& max_bin_by_feature) const;
const Config& config_;
/*! \brief Random generator*/
Random random_;
......
......@@ -805,6 +805,7 @@ Dataset* DatasetLoader::ConstructFromSampleData(double** sample_values,
cp_ptr += bin_mappers[i]->SizesInByte();
}
}
CheckCategoricalFeatureNumBin(bin_mappers, config_.max_bin, config_.max_bin_by_feature);
auto dataset = std::unique_ptr<Dataset>(new Dataset(num_data));
dataset->Construct(&bin_mappers, num_total_features, forced_bin_bounds, sample_indices, sample_values, num_per_col, num_col, total_sample_size, config_);
if (dataset->has_raw()) {
......@@ -1184,6 +1185,7 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
cp_ptr += bin_mappers[i]->SizesInByte();
}
}
CheckCategoricalFeatureNumBin(bin_mappers, config_.max_bin, config_.max_bin_by_feature);
dataset->Construct(&bin_mappers, dataset->num_total_features_, forced_bin_bounds, Common::Vector2Ptr<int>(&sample_indices).data(),
Common::Vector2Ptr<double>(&sample_values).data(),
Common::VectorSize<int>(sample_indices).data(), static_cast<int>(sample_indices.size()), sample_data.size(), config_);
......@@ -1463,4 +1465,44 @@ std::vector<std::vector<double>> DatasetLoader::GetForcedBins(std::string forced
return forced_bins;
}
void DatasetLoader::CheckCategoricalFeatureNumBin(
const std::vector<std::unique_ptr<BinMapper>>& bin_mappers,
const int max_bin, const std::vector<int>& max_bin_by_feature) const {
bool need_warning = false;
if (bin_mappers.size() < 1024) {
for (size_t i = 0; i < bin_mappers.size(); ++i) {
const int max_bin_for_this_feature = max_bin_by_feature.empty() ? max_bin : max_bin_by_feature[i];
if (bin_mappers[i]->bin_type() == BinType::CategoricalBin && bin_mappers[i]->num_bin() > max_bin_for_this_feature) {
need_warning = true;
break;
}
}
} else {
const int num_threads = OMP_NUM_THREADS();
std::vector<bool> thread_need_warning(num_threads, false);
Threading::For<size_t>(0, bin_mappers.size(), 1,
[&bin_mappers, &thread_need_warning, &max_bin_by_feature, max_bin] (int thread_index, size_t start, size_t end) {
for (size_t i = start; i < end; ++i) {
thread_need_warning[thread_index] = false;
const int max_bin_for_this_feature = max_bin_by_feature.empty() ? max_bin : max_bin_by_feature[i];
if (bin_mappers[i]->bin_type() == BinType::CategoricalBin && bin_mappers[i]->num_bin() > max_bin_for_this_feature) {
thread_need_warning[thread_index] = true;
break;
}
}
});
for (int thread_index = 0; thread_index < num_threads; ++thread_index) {
if (thread_need_warning[thread_index]) {
need_warning = true;
break;
}
}
}
if (need_warning) {
Log::Warning("Categorical features with more bins than the configured maximum bin number found.");
Log::Warning("For categorical features, max_bin and max_bin_by_feature may be ignored with a large number of categories.");
}
}
} // namespace LightGBM
......@@ -506,10 +506,30 @@ void CUDATreeLearner::InitGPU(int num_gpu) {
} else {
Log::Fatal("bin size %d cannot run on GPU", max_num_bin_);
}
if (max_num_bin_ == 65) {
// ignore the feature groups that contain categorical features when producing warnings about max_bin.
// these groups may contain larger number of bins due to categorical features, but not due to the setting of max_bin.
int max_num_bin_no_categorical = 0;
int cur_feature_group = 0;
bool categorical_feature_found = false;
for (int inner_feature_index = 0; inner_feature_index < num_features_; ++inner_feature_index) {
const int feature_group = train_data_->Feature2Group(inner_feature_index);
const BinMapper* feature_bin_mapper = train_data_->FeatureBinMapper(inner_feature_index);
if (feature_bin_mapper->bin_type() == BinType::CategoricalBin) {
categorical_feature_found = true;
}
if (feature_group != cur_feature_group || inner_feature_index == num_features_ - 1) {
if (!categorical_feature_found) {
max_num_bin_no_categorical = std::max(max_num_bin_no_categorical, train_data_->FeatureGroupNumBin(cur_feature_group));
}
categorical_feature_found = false;
cur_feature_group = feature_group;
}
}
if (max_num_bin_no_categorical == 65) {
Log::Warning("Setting max_bin to 63 is suggested for best performance");
}
if (max_num_bin_ == 17) {
if (max_num_bin_no_categorical == 17) {
Log::Warning("Setting max_bin to 15 is suggested for best performance");
}
......
......@@ -719,10 +719,30 @@ void GPUTreeLearner::InitGPU(int platform_id, int device_id) {
} else {
Log::Fatal("bin size %d cannot run on GPU", max_num_bin_);
}
if (max_num_bin_ == 65) {
// ignore the feature groups that contain categorical features when producing warnings about max_bin.
// these groups may contain larger number of bins due to categorical features, but not due to the setting of max_bin.
int max_num_bin_no_categorical = 0;
int cur_feature_group = 0;
bool categorical_feature_found = false;
for (int inner_feature_index = 0; inner_feature_index < num_features_; ++inner_feature_index) {
const int feature_group = train_data_->Feature2Group(inner_feature_index);
const BinMapper* feature_bin_mapper = train_data_->FeatureBinMapper(inner_feature_index);
if (feature_bin_mapper->bin_type() == BinType::CategoricalBin) {
categorical_feature_found = true;
}
if (feature_group != cur_feature_group || inner_feature_index == num_features_ - 1) {
if (!categorical_feature_found) {
max_num_bin_no_categorical = std::max(max_num_bin_no_categorical, train_data_->FeatureGroupNumBin(cur_feature_group));
}
categorical_feature_found = false;
cur_feature_group = feature_group;
}
}
if (max_num_bin_no_categorical == 65) {
Log::Warning("Setting max_bin to 63 is suggested for best performance");
}
if (max_num_bin_ == 17) {
if (max_num_bin_no_categorical == 17) {
Log::Warning("Setting max_bin to 15 is suggested for best performance");
}
ctx_ = boost::compute::context(dev_);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment