Unverified Commit 0584065c authored by Guolin Ke's avatar Guolin Ke Committed by GitHub
Browse files

add per-feature-penalties (#1449)

* add per-feature-penalties

* fix comment
parent dfbb8836
......@@ -347,6 +347,12 @@ Learning Control Parameters
- you need to specify all features in order. For example, ``mc=-1,0,1`` means decreasing for 1st feature, non-constraint for 2nd feature and increasing for the 3rd feature
- ``feature_contri``, default = ``None``, type = multi-double, aliases: ``fc``, ``fp``, ``feature_penalty``
- used to control feature's split gain, will use ``gain[i] = max(0, feature_contri[i]) * gain[i]`` to replace the split gain of i-th feature
- you need to specify all features in order
- ``forcedsplits_filename``, default = ``""``, type = string, aliases: ``fs``, ``forced_splits_filename``, ``forced_splits_file``, ``forced_splits``
- path to a ``.json`` file that specifies splits to force at the top of every decision tree before best-first learning commences
......
......@@ -353,6 +353,13 @@ public:
// desc = ``1`` means increasing, ``-1`` means decreasing, ``0`` means non-constraint
// desc = you need to specify all features in order. For example, ``mc=-1,0,1`` means decreasing for 1st feature, non-constraint for 2nd feature and increasing for the 3rd feature
std::vector<int8_t> monotone_constraints;
// type = multi-double
// alias = fc, fp, feature_penalty
// default = None
// desc = used to control feature's split gain, will use ``gain[i] = max(0, feature_contri[i]) * gain[i]`` to replace the split gain of i-th feature
// desc = you need to specify all features in order
std::vector<double> feature_contri;
// alias = fs, forced_splits_filename, forced_splits_file, forced_splits
// desc = path to a ``.json`` file that specifies splits to force at the top of every decision tree before best-first learning commences
......
......@@ -443,6 +443,14 @@ public:
}
}
// Returns the split-gain penalty multiplier for the i-th inner feature.
// When no per-feature penalties were configured, feature_penalty_ is left
// empty and every feature gets the neutral multiplier 1.0.
inline double FeaturePenalte(int i) const {
  return feature_penalty_.empty() ? 1.0 : feature_penalty_[i];
}
bool HasMonotone() const {
if (monotone_types_.empty()) {
return false;
......@@ -605,6 +613,7 @@ private:
std::vector<int> group_feature_start_;
std::vector<int> group_feature_cnt_;
std::vector<int8_t> monotone_types_;
std::vector<double> feature_penalty_;
bool is_finish_load_;
};
......
......@@ -176,6 +176,15 @@ public:
return true;
}
inline static bool CheckAll(const std::vector<VAL_T>& array, VAL_T t) {
for (size_t i = 0; i < array.size(); ++i) {
if (array[i] != t) {
return false;
}
}
return true;
}
};
} // namespace LightGBM
......
......@@ -57,6 +57,9 @@ std::unordered_map<std::string, std::string> Config::alias_table({
{"topk", "top_k"},
{"mc", "monotone_constraints"},
{"monotone_constraint", "monotone_constraints"},
{"fc", "feature_contri"},
{"fp", "feature_contri"},
{"feature_penalty", "feature_contri"},
{"fs", "forcedsplits_filename"},
{"forced_splits_filename", "forcedsplits_filename"},
{"forced_splits_file", "forcedsplits_filename"},
......@@ -172,6 +175,7 @@ std::unordered_set<std::string> Config::parameter_set({
"max_cat_to_onehot",
"top_k",
"monotone_constraints",
"feature_contri",
"forcedsplits_filename",
"verbosity",
"max_bin",
......@@ -336,6 +340,10 @@ void Config::GetMembersFromString(const std::unordered_map<std::string, std::str
monotone_constraints = Common::StringToArray<int8_t>(tmp_str, ',');
}
if (GetString(params, "feature_contri", &tmp_str)) {
feature_contri = Common::StringToArray<double>(tmp_str, ',');
}
GetString(params, "forcedsplits_filename", &forcedsplits_filename);
GetInt(params, "verbosity", &verbosity);
......@@ -523,6 +531,7 @@ std::string Config::SaveMembersToString() const {
str_buf << "[max_cat_to_onehot: " << max_cat_to_onehot << "]\n";
str_buf << "[top_k: " << top_k << "]\n";
str_buf << "[monotone_constraints: " << Common::Join(Common::ArrayCast<int8_t, int>(monotone_constraints),",") << "]\n";
str_buf << "[feature_contri: " << Common::Join(feature_contri,",") << "]\n";
str_buf << "[forcedsplits_filename: " << forcedsplits_filename << "]\n";
str_buf << "[verbosity: " << verbosity << "]\n";
str_buf << "[max_bin: " << max_bin << "]\n";
......
......@@ -306,6 +306,19 @@ void Dataset::Construct(
monotone_types_.clear();
}
}
if (!io_config.feature_contri.empty()) {
CHECK(static_cast<size_t>(num_total_features_) == io_config.feature_contri.size());
feature_penalty_.resize(num_features_);
for (int i = 0; i < num_total_features_; ++i) {
int inner_fidx = InnerFeatureIndex(i);
if (inner_fidx >= 0) {
feature_penalty_[inner_fidx] = std::max(0.0, io_config.feature_contri[i]);
}
}
if (ArrayArgs<double>::CheckAll(feature_penalty_, 1.0)) {
feature_penalty_.clear();
}
}
}
void Dataset::FinishLoad() {
......@@ -350,6 +363,7 @@ void Dataset::CopyFeatureMapperFrom(const Dataset* dataset) {
group_feature_start_ = dataset->group_feature_start_;
group_feature_cnt_ = dataset->group_feature_cnt_;
monotone_types_ = dataset->monotone_types_;
feature_penalty_ = dataset->feature_penalty_;
}
void Dataset::CreateValid(const Dataset* dataset) {
......@@ -403,6 +417,7 @@ void Dataset::CreateValid(const Dataset* dataset) {
}
}
monotone_types_ = dataset->monotone_types_;
feature_penalty_ = dataset->feature_penalty_;
}
void Dataset::ReSize(data_size_t num_data) {
......@@ -555,7 +570,8 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
// get size of header
size_t size_of_header = sizeof(num_data_) + sizeof(num_features_) + sizeof(num_total_features_)
+ sizeof(int) * num_total_features_ + sizeof(label_idx_) + sizeof(num_groups_)
+ 3 * sizeof(int) * num_features_ + sizeof(uint64_t) * (num_groups_ + 1) + 2 * sizeof(int) * num_groups_ + sizeof(int8_t) * num_features_;
+ 3 * sizeof(int) * num_features_ + sizeof(uint64_t) * (num_groups_ + 1) + 2 * sizeof(int) * num_groups_ + sizeof(int8_t) * num_features_
+ sizeof(double) * num_features_;
// size of feature names
for (int i = 0; i < num_total_features_; ++i) {
size_of_header += feature_names_[i].size() + sizeof(int);
......@@ -581,6 +597,13 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
if (ArrayArgs<int8_t>::CheckAllZero(monotone_types_)) {
monotone_types_.clear();
}
if (feature_penalty_.empty()) {
ArrayArgs<double>::Assign(&feature_penalty_, 1.0, num_features_);
}
writer->Write(feature_penalty_.data(), sizeof(double) * num_features_);
if (ArrayArgs<double>::CheckAll(feature_penalty_, 1.0)) {
feature_penalty_.clear();
}
// write feature names
for (int i = 0; i < num_total_features_; ++i) {
int str_len = static_cast<int>(feature_names_[i].size());
......
......@@ -380,6 +380,17 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b
dataset->monotone_types_.clear();
}
const double* tmp_ptr_feature_penalty = reinterpret_cast<const double*>(mem_ptr);
dataset->feature_penalty_.clear();
for (int i = 0; i < dataset->num_features_; ++i) {
dataset->feature_penalty_.push_back(tmp_ptr_feature_penalty[i]);
}
mem_ptr += sizeof(double) * (dataset->num_features_);
if (ArrayArgs<double>::CheckAll(dataset->feature_penalty_, 1)) {
dataset->feature_penalty_.clear();
}
// get feature names
dataset->feature_names_.clear();
// write feature names
......
......@@ -19,6 +19,7 @@ public:
int8_t bias = 0;
uint32_t default_bin;
int8_t monotone_type;
double penalty;
/*! \brief pointer of tree config */
const Config* config;
BinType bin_type;
......@@ -77,6 +78,7 @@ public:
output->default_left = true;
output->gain = kMinScore;
find_best_threshold_fun_(sum_gradient, sum_hessian + 2 * kEpsilon, num_data, min_constraint, max_constraint, output);
output->gain *= meta_->penalty;
}
void FindBestThresholdNumerical(double sum_gradient, double sum_hessian, data_size_t num_data, double min_constraint, double max_constraint,
......@@ -707,6 +709,7 @@ public:
feature_metas_[i].default_bin = train_data->FeatureBinMapper(i)->GetDefaultBin();
feature_metas_[i].missing_type = train_data->FeatureBinMapper(i)->missing_type();
feature_metas_[i].monotone_type = train_data->FeatureMonotone(i);
feature_metas_[i].penalty = train_data->FeaturePenalte(i);
if (train_data->FeatureBinMapper(i)->GetDefaultBin() == 0) {
feature_metas_[i].bias = 1;
} else {
......
......@@ -70,6 +70,7 @@ void VotingParallelTreeLearner<TREELEARNER_T>::Init(const Dataset* train_data, b
feature_metas_[i].default_bin = train_data->FeatureBinMapper(i)->GetDefaultBin();
feature_metas_[i].missing_type = train_data->FeatureBinMapper(i)->missing_type();
feature_metas_[i].monotone_type = train_data->FeatureMonotone(i);
feature_metas_[i].penalty = train_data->FeaturePenalte(i);
if (train_data->FeatureBinMapper(i)->GetDefaultBin() == 0) {
feature_metas_[i].bias = 1;
} else {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment