"...git@developer.sourcefind.cn:tianlh/lightgbm-dcu.git" did not exist on "3ed0027bffe820ba6c81f32b82f02ef8302c6aad"
Unverified commit c920e634, authored by Guolin Ke and committed by GitHub
Browse files

average predictions for constant features (#1735)

* average predictions for constant features

* fix possible numerical issues in std::log.

* fix pylint

* fix bugs in c_api

* fix styles

* clean code for multi class

* rewrite test

* fix pylint

* skip test_constant_features

* refine test

* fix tests

* fix tests

* update FAQ

* fix test

* Update FAQ.rst
parent e39f1f91
...@@ -106,15 +106,12 @@ LightGBM ...@@ -106,15 +106,12 @@ LightGBM
-------------- --------------
- **Question 9**: When I'm trying to specify a categorical column with the ``categorical_feature`` parameter, - **Question 9**: When I'm trying to specify a categorical column with the ``categorical_feature`` parameter,
I get the following sequence of errors, but there are no negative values in the column. I get the following sequence of warnings, but there are no negative values in the column.
:: ::
[LightGBM] [Warning] Met negative value in categorical features, will convert it to NaN [LightGBM] [Warning] Met negative value in categorical features, will convert it to NaN
[LightGBM] [Fatal] Cannot construct Dataset since there are no useful features. [LightGBM] [Warning] There are no meaningful features, as all feature values are constant.
It should be at least two unique rows.
If the num_row (num_data) is small, you can set min_data=1 and min_data_in_bin=1 to fix this.
Otherwise, please make sure you are using the right dataset
- **Solution 9**: The column you're trying to pass via ``categorical_feature`` likely contains very large values. - **Solution 9**: The column you're trying to pass via ``categorical_feature`` likely contains very large values.
Categorical features in LightGBM are limited by int32 range, Categorical features in LightGBM are limited by int32 range,
......
...@@ -42,7 +42,9 @@ public: ...@@ -42,7 +42,9 @@ public:
const data_size_t*, const data_size_t*,
data_size_t) const { return ori_output; } data_size_t) const { return ori_output; }
virtual double BoostFromScore() const { return 0.0f; } virtual double BoostFromScore(int /*class_id*/) const { return 0.0; }
virtual bool ClassNeedTrain(int /*class_id*/) const { return true; }
virtual bool SkipEmptyClass() const { return false; } virtual bool SkipEmptyClass() const { return false; }
......
...@@ -18,15 +18,6 @@ ...@@ -18,15 +18,6 @@
namespace LightGBM { namespace LightGBM {
#ifdef TIMETAG
std::chrono::duration<double, std::milli> boosting_time;
std::chrono::duration<double, std::milli> train_score_time;
std::chrono::duration<double, std::milli> out_of_bag_score_time;
std::chrono::duration<double, std::milli> valid_score_time;
std::chrono::duration<double, std::milli> metric_time;
std::chrono::duration<double, std::milli> bagging_time;
std::chrono::duration<double, std::milli> tree_time;
#endif // TIMETAG
GBDT::GBDT() : iter_(0), GBDT::GBDT() : iter_(0),
train_data_(nullptr), train_data_(nullptr),
...@@ -50,21 +41,12 @@ need_re_bagging_(false) { ...@@ -50,21 +41,12 @@ need_re_bagging_(false) {
} }
GBDT::~GBDT() { GBDT::~GBDT() {
#ifdef TIMETAG
Log::Info("GBDT::boosting costs %f", boosting_time * 1e-3);
Log::Info("GBDT::train_score costs %f", train_score_time * 1e-3);
Log::Info("GBDT::out_of_bag_score costs %f", out_of_bag_score_time * 1e-3);
Log::Info("GBDT::valid_score costs %f", valid_score_time * 1e-3);
Log::Info("GBDT::metric costs %f", metric_time * 1e-3);
Log::Info("GBDT::bagging costs %f", bagging_time * 1e-3);
Log::Info("GBDT::tree costs %f", tree_time * 1e-3);
#endif
} }
void GBDT::Init(const Config* config, const Dataset* train_data, const ObjectiveFunction* objective_function, void GBDT::Init(const Config* config, const Dataset* train_data, const ObjectiveFunction* objective_function,
const std::vector<const Metric*>& training_metrics) { const std::vector<const Metric*>& training_metrics) {
CHECK(train_data != nullptr); CHECK(train_data != nullptr);
CHECK(train_data->num_features() > 0);
train_data_ = train_data; train_data_ = train_data;
iter_ = 0; iter_ = 0;
num_iteration_for_pred_ = 0; num_iteration_for_pred_ = 0;
...@@ -125,45 +107,11 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective ...@@ -125,45 +107,11 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective
// if need bagging, create buffer // if need bagging, create buffer
ResetBaggingConfig(config_.get(), true); ResetBaggingConfig(config_.get(), true);
// reset config for tree learner
class_need_train_ = std::vector<bool>(num_tree_per_iteration_, true); class_need_train_ = std::vector<bool>(num_tree_per_iteration_, true);
if (objective_function_ != nullptr && objective_function_->SkipEmptyClass()) { if (objective_function_ != nullptr && objective_function_->SkipEmptyClass()) {
CHECK(num_tree_per_iteration_ == num_class_); CHECK(num_tree_per_iteration_ == num_class_);
for (int i = 0; i < num_class_; ++i) {
class_default_output_ = std::vector<double>(num_tree_per_iteration_, 0.0f); class_need_train_[i] = objective_function_->ClassNeedTrain(i);
auto label = train_data_->metadata().label();
if (num_tree_per_iteration_ > 1) {
// multi-class
std::vector<data_size_t> cnt_per_class(num_tree_per_iteration_, 0);
for (data_size_t i = 0; i < num_data_; ++i) {
int index = static_cast<int>(label[i]);
CHECK(index < num_tree_per_iteration_);
++cnt_per_class[index];
}
for (int i = 0; i < num_tree_per_iteration_; ++i) {
if (cnt_per_class[i] == num_data_) {
class_need_train_[i] = false;
class_default_output_[i] = -std::log(kEpsilon);
} else if (cnt_per_class[i] == 0) {
class_need_train_[i] = false;
class_default_output_[i] = -std::log(1.0f / kEpsilon - 1.0f);
}
}
} else {
// binary class
data_size_t cnt_pos = 0;
for (data_size_t i = 0; i < num_data_; ++i) {
if (label[i] > 0) {
++cnt_pos;
}
}
if (cnt_pos == 0) {
class_need_train_[0] = false;
class_default_output_[0] = -std::log(1.0f / kEpsilon - 1.0f);
} else if (cnt_pos == num_data_) {
class_need_train_[0] = false;
class_default_output_[0] = -std::log(kEpsilon);
}
} }
} }
} }
...@@ -294,27 +242,6 @@ void GBDT::Bagging(int iter) { ...@@ -294,27 +242,6 @@ void GBDT::Bagging(int iter) {
} }
} }
/* If the custom "average" is implemented it will be used in place of the label average (if enabled)
*
* An improvement to this is to have options to explicitly choose
* (i) standard average
* (ii) custom average if available
* (iii) any user defined scalar bias (e.g. using a new option "init_score" that overrides (i) and (ii) )
*
* (i) and (ii) could be selected as say "auto_init_score" = 0 or 1 etc..
*
*/
double ObtainAutomaticInitialScore(const ObjectiveFunction* fobj) {
double init_score = 0.0f;
if (fobj != nullptr) {
init_score = fobj->BoostFromScore();
}
if (Network::num_machines() > 1) {
init_score = Network::GlobalSyncUpByMean(init_score);
}
return init_score;
}
void GBDT::Train(int snapshot_freq, const std::string& model_output_path) { void GBDT::Train(int snapshot_freq, const std::string& model_output_path) {
bool is_finished = false; bool is_finished = false;
auto start_time = std::chrono::steady_clock::now(); auto start_time = std::chrono::steady_clock::now();
...@@ -360,17 +287,36 @@ void GBDT::RefitTree(const std::vector<std::vector<int>>& tree_leaf_prediction) ...@@ -360,17 +287,36 @@ void GBDT::RefitTree(const std::vector<std::vector<int>>& tree_leaf_prediction)
} }
} }
double GBDT::BoostFromAverage() { /* If the custom "average" is implemented it will be used inplace of the label average (if enabled)
*
* An improvement to this is to have options to explicitly choose
* (i) standard average
* (ii) custom average if available
* (iii) any user defined scalar bias (e.g. using a new option "init_score" that overrides (i) and (ii) )
*
* (i) and (ii) could be selected via, say, "auto_init_score" = 0 or 1, etc.
*
*/
double ObtainAutomaticInitialScore(const ObjectiveFunction* fobj, int class_id) {
  // Without an objective there is nothing to ask, so the score stays at zero.
  double score = (fobj != nullptr) ? fobj->BoostFromScore(class_id) : 0.0;
  // When more than one machine takes part, use the value returned by
  // Network::GlobalSyncUpByMean instead of the purely local one.
  if (Network::num_machines() > 1) {
    score = Network::GlobalSyncUpByMean(score);
  }
  return score;
}
double GBDT::BoostFromAverage(int class_id) {
// boosting from average label; or customized "average" if implemented for the current objective // boosting from average label; or customized "average" if implemented for the current objective
if (models_.empty() && !train_score_updater_->has_init_score() if (models_.empty() && !train_score_updater_->has_init_score() && objective_function_ != nullptr) {
&& num_class_ <= 1 if (config_->boost_from_average || (train_data_ != nullptr && train_data_->num_features() == 0)) {
&& objective_function_ != nullptr) { double init_score = ObtainAutomaticInitialScore(objective_function_, class_id);
if (config_->boost_from_average) {
double init_score = ObtainAutomaticInitialScore(objective_function_);
if (std::fabs(init_score) > kEpsilon) { if (std::fabs(init_score) > kEpsilon) {
train_score_updater_->AddScore(init_score, 0); train_score_updater_->AddScore(init_score, class_id);
for (auto& score_updater : valid_score_updater_) { for (auto& score_updater : valid_score_updater_) {
score_updater->AddScore(init_score, 0); score_updater->AddScore(init_score, class_id);
} }
Log::Info("Start training from score %lf", init_score); Log::Info("Start training from score %lf", init_score);
return init_score; return init_score;
...@@ -385,46 +331,26 @@ double GBDT::BoostFromAverage() { ...@@ -385,46 +331,26 @@ double GBDT::BoostFromAverage() {
} }
bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) {
double init_score = 0.0f; std::vector<double> init_scores(num_tree_per_iteration_, 0.0);
// boosting first // boosting first
if (gradients == nullptr || hessians == nullptr) { if (gradients == nullptr || hessians == nullptr) {
init_score = BoostFromAverage(); for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
#ifdef TIMETAG init_scores[cur_tree_id] = BoostFromAverage(cur_tree_id);
auto start_time = std::chrono::steady_clock::now(); }
#endif
Boosting(); Boosting();
gradients = gradients_.data(); gradients = gradients_.data();
hessians = hessians_.data(); hessians = hessians_.data();
#ifdef TIMETAG
boosting_time += std::chrono::steady_clock::now() - start_time;
#endif
} }
#ifdef TIMETAG
auto start_time = std::chrono::steady_clock::now();
#endif
// bagging logic // bagging logic
Bagging(iter_); Bagging(iter_);
#ifdef TIMETAG
bagging_time += std::chrono::steady_clock::now() - start_time;
#endif
bool should_continue = false; bool should_continue = false;
for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
#ifdef TIMETAG
start_time = std::chrono::steady_clock::now();
#endif
const size_t bias = static_cast<size_t>(cur_tree_id) * num_data_; const size_t bias = static_cast<size_t>(cur_tree_id) * num_data_;
std::unique_ptr<Tree> new_tree(new Tree(2)); std::unique_ptr<Tree> new_tree(new Tree(2));
if (class_need_train_[cur_tree_id]) { if (class_need_train_[cur_tree_id] && train_data_->num_features() > 0) {
auto grad = gradients + bias; auto grad = gradients + bias;
auto hess = hessians + bias; auto hess = hessians + bias;
// need to copy gradients for bagging subset. // need to copy gradients for bagging subset.
if (is_use_subset_ && bag_data_cnt_ < num_data_) { if (is_use_subset_ && bag_data_cnt_ < num_data_) {
for (int i = 0; i < bag_data_cnt_; ++i) { for (int i = 0; i < bag_data_cnt_; ++i) {
...@@ -434,14 +360,9 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { ...@@ -434,14 +360,9 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) {
grad = gradients_.data() + bias; grad = gradients_.data() + bias;
hess = hessians_.data() + bias; hess = hessians_.data() + bias;
} }
new_tree.reset(tree_learner_->Train(grad, hess, is_constant_hessian_, forced_splits_json_)); new_tree.reset(tree_learner_->Train(grad, hess, is_constant_hessian_, forced_splits_json_));
} }
#ifdef TIMETAG
tree_time += std::chrono::steady_clock::now() - start_time;
#endif
if (new_tree->num_leaves() > 1) { if (new_tree->num_leaves() > 1) {
should_continue = true; should_continue = true;
tree_learner_->RenewTreeOutput(new_tree.get(), objective_function_, train_score_updater_->score() + bias, tree_learner_->RenewTreeOutput(new_tree.get(), objective_function_, train_score_updater_->score() + bias,
...@@ -450,13 +371,20 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { ...@@ -450,13 +371,20 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) {
new_tree->Shrinkage(shrinkage_rate_); new_tree->Shrinkage(shrinkage_rate_);
// update score // update score
UpdateScore(new_tree.get(), cur_tree_id); UpdateScore(new_tree.get(), cur_tree_id);
if (std::fabs(init_score) > kEpsilon) { if (std::fabs(init_scores[cur_tree_id]) > kEpsilon) {
new_tree->AddBias(init_score); new_tree->AddBias(init_scores[cur_tree_id]);
} }
} else { } else {
// only add default score one-time // only add default score one-time
if (!class_need_train_[cur_tree_id] && models_.size() < static_cast<size_t>(num_tree_per_iteration_)) { if (models_.size() < static_cast<size_t>(num_tree_per_iteration_)) {
auto output = class_default_output_[cur_tree_id]; double output = 0.0;
if (!class_need_train_[cur_tree_id]) {
if (objective_function_ != nullptr) {
output = objective_function_->BoostFromScore(cur_tree_id);
}
} else {
output = init_scores[cur_tree_id];
}
new_tree->AsConstantTree(output); new_tree->AsConstantTree(output);
// updates scores // updates scores
train_score_updater_->AddScore(output, cur_tree_id); train_score_updater_->AddScore(output, cur_tree_id);
...@@ -471,8 +399,10 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { ...@@ -471,8 +399,10 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) {
if (!should_continue) { if (!should_continue) {
Log::Warning("Stopped training because there are no more leaves that meet the split requirements"); Log::Warning("Stopped training because there are no more leaves that meet the split requirements");
for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { if (models_.size() > static_cast<size_t>(num_tree_per_iteration_)) {
models_.pop_back(); for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
models_.pop_back();
}
} }
return true; return true;
} }
...@@ -501,17 +431,9 @@ void GBDT::RollbackOneIter() { ...@@ -501,17 +431,9 @@ void GBDT::RollbackOneIter() {
bool GBDT::EvalAndCheckEarlyStopping() { bool GBDT::EvalAndCheckEarlyStopping() {
bool is_met_early_stopping = false; bool is_met_early_stopping = false;
#ifdef TIMETAG
auto start_time = std::chrono::steady_clock::now();
#endif
// print message for metric // print message for metric
auto best_msg = OutputMetric(iter_); auto best_msg = OutputMetric(iter_);
#ifdef TIMETAG
metric_time += std::chrono::steady_clock::now() - start_time;
#endif
is_met_early_stopping = !best_msg.empty(); is_met_early_stopping = !best_msg.empty();
if (is_met_early_stopping) { if (is_met_early_stopping) {
...@@ -528,52 +450,24 @@ bool GBDT::EvalAndCheckEarlyStopping() { ...@@ -528,52 +450,24 @@ bool GBDT::EvalAndCheckEarlyStopping() {
void GBDT::UpdateScore(const Tree* tree, const int cur_tree_id) { void GBDT::UpdateScore(const Tree* tree, const int cur_tree_id) {
#ifdef TIMETAG
auto start_time = std::chrono::steady_clock::now();
#endif
// update training score // update training score
if (!is_use_subset_) { if (!is_use_subset_) {
train_score_updater_->AddScore(tree_learner_.get(), tree, cur_tree_id); train_score_updater_->AddScore(tree_learner_.get(), tree, cur_tree_id);
#ifdef TIMETAG
train_score_time += std::chrono::steady_clock::now() - start_time;
#endif
#ifdef TIMETAG
start_time = std::chrono::steady_clock::now();
#endif
// we need to predict out-of-bag scores of data for boosting // we need to predict out-of-bag scores of data for boosting
if (num_data_ - bag_data_cnt_ > 0) { if (num_data_ - bag_data_cnt_ > 0) {
train_score_updater_->AddScore(tree, bag_data_indices_.data() + bag_data_cnt_, num_data_ - bag_data_cnt_, cur_tree_id); train_score_updater_->AddScore(tree, bag_data_indices_.data() + bag_data_cnt_, num_data_ - bag_data_cnt_, cur_tree_id);
} }
#ifdef TIMETAG
out_of_bag_score_time += std::chrono::steady_clock::now() - start_time;
#endif
} else { } else {
train_score_updater_->AddScore(tree, cur_tree_id); train_score_updater_->AddScore(tree, cur_tree_id);
#ifdef TIMETAG
train_score_time += std::chrono::steady_clock::now() - start_time;
#endif
} }
#ifdef TIMETAG
start_time = std::chrono::steady_clock::now();
#endif
// update validation score // update validation score
for (auto& score_updater : valid_score_updater_) { for (auto& score_updater : valid_score_updater_) {
score_updater->AddScore(tree, cur_tree_id); score_updater->AddScore(tree, cur_tree_id);
} }
#ifdef TIMETAG
valid_score_time += std::chrono::steady_clock::now() - start_time;
#endif
} }
std::vector<double> GBDT::EvalOneMetric(const Metric* metric, const double* score) const { std::vector<double> GBDT::EvalOneMetric(const Metric* metric, const double* score) const {
......
...@@ -407,7 +407,7 @@ protected: ...@@ -407,7 +407,7 @@ protected:
*/ */
std::string OutputMetric(int iter); std::string OutputMetric(int iter);
double BoostFromAverage(); double BoostFromAverage(int class_id);
/*! \brief current iteration */ /*! \brief current iteration */
int iter_; int iter_;
...@@ -481,7 +481,6 @@ protected: ...@@ -481,7 +481,6 @@ protected:
std::unique_ptr<Dataset> tmp_subset_; std::unique_ptr<Dataset> tmp_subset_;
bool is_use_subset_; bool is_use_subset_;
std::vector<bool> class_need_train_; std::vector<bool> class_need_train_;
std::vector<double> class_default_output_;
bool is_constant_hessian_; bool is_constant_hessian_;
std::unique_ptr<ObjectiveFunction> loaded_objective_; std::unique_ptr<ObjectiveFunction> loaded_objective_;
bool average_output_; bool average_output_;
......
...@@ -52,7 +52,6 @@ public: ...@@ -52,7 +52,6 @@ public:
Booster(const Dataset* train_data, Booster(const Dataset* train_data,
const char* parameters) { const char* parameters) {
CHECK(train_data->num_features() > 0);
auto param = Config::Str2Map(parameters); auto param = Config::Str2Map(parameters);
config_.Set(param); config_.Set(param);
if (config_.num_threads > 0) { if (config_.num_threads > 0) {
...@@ -116,7 +115,6 @@ public: ...@@ -116,7 +115,6 @@ public:
void ResetTrainingData(const Dataset* train_data) { void ResetTrainingData(const Dataset* train_data) {
if (train_data != train_data_) { if (train_data != train_data_) {
CHECK(train_data->num_features() > 0);
std::lock_guard<std::mutex> lock(mutex_); std::lock_guard<std::mutex> lock(mutex_);
train_data_ = train_data; train_data_ = train_data;
CreateObjectiveAndMetrics(); CreateObjectiveAndMetrics();
......
...@@ -226,14 +226,11 @@ void Dataset::Construct( ...@@ -226,14 +226,11 @@ void Dataset::Construct(
} }
} }
if (used_features.empty()) { if (used_features.empty()) {
Log::Fatal("Cannot construct Dataset since there are no useful features.\n" Log::Warning("There are no meaningful features, as all feature values are constant.");
"It should be at least two unique rows.\n"
"If the num_row (num_data) is small, you can set min_data=1 and min_data_in_bin=1 to fix this.\n"
"Otherwise, please make sure you are using the right dataset");
} }
auto features_in_group = NoGroup(used_features); auto features_in_group = NoGroup(used_features);
if (io_config.enable_bundle) { if (io_config.enable_bundle && !used_features.empty()) {
features_in_group = FastFeatureBundling(bin_mappers, features_in_group = FastFeatureBundling(bin_mappers,
sample_non_zero_indices, num_per_col, total_sample_cnt, sample_non_zero_indices, num_per_col, total_sample_cnt,
used_features, io_config.max_conflict_rate, used_features, io_config.max_conflict_rate,
...@@ -323,14 +320,16 @@ void Dataset::Construct( ...@@ -323,14 +320,16 @@ void Dataset::Construct(
void Dataset::FinishLoad() { void Dataset::FinishLoad() {
if (is_finish_load_) { return; } if (is_finish_load_) { return; }
OMP_INIT_EX(); if (num_groups_ > 0) {
#pragma omp parallel for schedule(guided) OMP_INIT_EX();
for (int i = 0; i < num_groups_; ++i) { #pragma omp parallel for schedule(guided)
OMP_LOOP_EX_BEGIN(); for (int i = 0; i < num_groups_; ++i) {
feature_groups_[i]->bin_data_->FinishLoad(); OMP_LOOP_EX_BEGIN();
OMP_LOOP_EX_END(); feature_groups_[i]->bin_data_->FinishLoad();
OMP_LOOP_EX_END();
}
OMP_THROW_EX();
} }
OMP_THROW_EX();
is_finish_load_ = true; is_finish_load_ = true;
} }
......
...@@ -629,9 +629,6 @@ void DatasetLoader::CheckDataset(const Dataset* dataset) { ...@@ -629,9 +629,6 @@ void DatasetLoader::CheckDataset(const Dataset* dataset) {
if (dataset->num_data_ <= 0) { if (dataset->num_data_ <= 0) {
Log::Fatal("Data file %s is empty", dataset->data_filename_.c_str()); Log::Fatal("Data file %s is empty", dataset->data_filename_.c_str());
} }
if (dataset->feature_groups_.empty()) {
Log::Fatal("No usable features in data file %s", dataset->data_filename_.c_str());
}
if (dataset->feature_names_.size() != static_cast<size_t>(dataset->num_total_features_)) { if (dataset->feature_names_.size() != static_cast<size_t>(dataset->num_total_features_)) {
Log::Fatal("Size of feature name error, should be %d, got %d", dataset->num_total_features_, Log::Fatal("Size of feature name error, should be %d, got %d", dataset->num_total_features_,
static_cast<int>(dataset->feature_names_.size())); static_cast<int>(dataset->feature_names_.size()));
......
...@@ -121,28 +121,34 @@ public: ...@@ -121,28 +121,34 @@ public:
} }
// implement custom average to boost from (if enabled among options) // implement custom average to boost from (if enabled among options)
double BoostFromScore() const override { double BoostFromScore(int) const override {
double suml = 0.0f; double suml = 0.0f;
double sumw = 0.0f; double sumw = 0.0f;
if (weights_ != nullptr) { if (weights_ != nullptr) {
#pragma omp parallel for schedule(static) reduction(+:suml,sumw) #pragma omp parallel for schedule(static) reduction(+:suml,sumw)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
suml += label_[i] * weights_[i]; suml += is_pos_(label_[i]) * weights_[i];
sumw += weights_[i]; sumw += weights_[i];
} }
} else { } else {
sumw = static_cast<double>(num_data_); sumw = static_cast<double>(num_data_);
#pragma omp parallel for schedule(static) reduction(+:suml) #pragma omp parallel for schedule(static) reduction(+:suml)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
suml += label_[i]; suml += is_pos_(label_[i]);
} }
} }
double pavg = suml / sumw; double pavg = suml / sumw;
pavg = std::min(pavg, 1.0 - kEpsilon);
pavg = std::max<double>(pavg, kEpsilon);
double initscore = std::log(pavg / (1.0f - pavg)) / sigmoid_; double initscore = std::log(pavg / (1.0f - pavg)) / sigmoid_;
Log::Info("[%s:%s]: pavg=%f -> initscore=%f", GetName(), __func__, pavg, initscore); Log::Info("[%s:%s]: pavg=%f -> initscore=%f", GetName(), __func__, pavg, initscore);
return initscore; return initscore;
} }
bool ClassNeedTrain(int /*class_id*/) const override {
  // The class id is ignored; the answer depends only on num_data_ being positive.
  const bool has_data = (num_data_ > 0);
  return has_data;
}
const char* GetName() const override { const char* GetName() const override {
return "binary"; return "binary";
} }
......
...@@ -43,11 +43,25 @@ public: ...@@ -43,11 +43,25 @@ public:
label_ = metadata.label(); label_ = metadata.label();
weights_ = metadata.weights(); weights_ = metadata.weights();
label_int_.resize(num_data_); label_int_.resize(num_data_);
class_init_probs_.resize(num_class_, 0.0);
double sum_weight = 0.0;
for (int i = 0; i < num_data_; ++i) { for (int i = 0; i < num_data_; ++i) {
label_int_[i] = static_cast<int>(label_[i]); label_int_[i] = static_cast<int>(label_[i]);
if (label_int_[i] < 0 || label_int_[i] >= num_class_) { if (label_int_[i] < 0 || label_int_[i] >= num_class_) {
Log::Fatal("Label must be in [0, %d), but found %d in label", num_class_, label_int_[i]); Log::Fatal("Label must be in [0, %d), but found %d in label", num_class_, label_int_[i]);
} }
if (weights_ == nullptr) {
class_init_probs_[label_int_[i]] += 1.0;
} else {
class_init_probs_[label_int_[i]] += weights_[i];
sum_weight += weights_[i];
}
}
if (weights_ == nullptr) {
sum_weight = num_data_;
}
for (int i = 0; i < num_class_; ++i) {
class_init_probs_[i] /= sum_weight;
} }
} }
...@@ -120,6 +134,19 @@ public: ...@@ -120,6 +134,19 @@ public:
bool NeedAccuratePrediction() const override { return false; } bool NeedAccuratePrediction() const override { return false; }
double BoostFromScore(int class_id) const override {
  // Clamp the stored class probability from below by kEpsilon before taking
  // the logarithm, so the argument of std::log is never smaller than kEpsilon.
  const double clamped_prob = std::max<double>(kEpsilon, class_init_probs_[class_id]);
  return std::log(clamped_prob);
}
bool ClassNeedTrain(int class_id) const override {
  // A class is skipped when the magnitude of its stored probability is within
  // kEpsilon of 0 or of 1; every other value means the class is trained.
  const double magnitude = std::fabs(class_init_probs_[class_id]);
  const bool is_degenerate = (magnitude <= kEpsilon) || (magnitude >= 1.0 - kEpsilon);
  return !is_degenerate;
}
private: private:
/*! \brief Number of data */ /*! \brief Number of data */
data_size_t num_data_; data_size_t num_data_;
...@@ -131,6 +158,7 @@ private: ...@@ -131,6 +158,7 @@ private:
std::vector<int> label_int_; std::vector<int> label_int_;
/*! \brief Weights for data */ /*! \brief Weights for data */
const label_t* weights_; const label_t* weights_;
std::vector<double> class_init_probs_;
}; };
/*! /*!
...@@ -212,6 +240,14 @@ public: ...@@ -212,6 +240,14 @@ public:
bool NeedAccuratePrediction() const override { return false; } bool NeedAccuratePrediction() const override { return false; }
double BoostFromScore(int class_id) const override {
  // Delegate to the per-class binary loss; it is always queried with id 0.
  const auto& class_loss = binary_loss_[class_id];
  return class_loss->BoostFromScore(0);
}
bool ClassNeedTrain(int class_id) const override {
  // Delegate to the per-class binary loss; it is always queried with id 0.
  const auto& class_loss = binary_loss_[class_id];
  return class_loss->ClassNeedTrain(0);
}
private: private:
/*! \brief Number of data */ /*! \brief Number of data */
data_size_t num_data_; data_size_t num_data_;
......
...@@ -139,7 +139,7 @@ public: ...@@ -139,7 +139,7 @@ public:
} }
} }
double BoostFromScore() const override { double BoostFromScore(int) const override {
double suml = 0.0f; double suml = 0.0f;
double sumw = 0.0f; double sumw = 0.0f;
if (weights_ != nullptr) { if (weights_ != nullptr) {
...@@ -201,7 +201,7 @@ public: ...@@ -201,7 +201,7 @@ public:
} }
} }
double BoostFromScore() const override { double BoostFromScore(int) const override {
const double alpha = 0.5; const double alpha = 0.5;
if (weights_ != nullptr) { if (weights_ != nullptr) {
#define data_reader(i) (label_[i]) #define data_reader(i) (label_[i])
...@@ -436,8 +436,8 @@ public: ...@@ -436,8 +436,8 @@ public:
return "poisson"; return "poisson";
} }
double BoostFromScore() const override { double BoostFromScore(int) const override {
return std::log(RegressionL2loss::BoostFromScore()); return std::log(RegressionL2loss::BoostFromScore(0));
} }
bool IsConstantHessian() const override { bool IsConstantHessian() const override {
...@@ -493,7 +493,7 @@ public: ...@@ -493,7 +493,7 @@ public:
return "quantile"; return "quantile";
} }
double BoostFromScore() const override { double BoostFromScore(int) const override {
if (weights_ != nullptr) { if (weights_ != nullptr) {
#define data_reader(i) (label_[i]) #define data_reader(i) (label_[i])
#define weight_reader(i) (weights_[i]) #define weight_reader(i) (weights_[i])
...@@ -600,7 +600,7 @@ public: ...@@ -600,7 +600,7 @@ public:
} }
} }
double BoostFromScore() const override { double BoostFromScore(int) const override {
const double alpha = 0.5; const double alpha = 0.5;
#define data_reader(i) (label_[i]) #define data_reader(i) (label_[i])
#define weight_reader(i) (label_weight_[i]) #define weight_reader(i) (label_weight_[i])
......
...@@ -104,7 +104,7 @@ public: ...@@ -104,7 +104,7 @@ public:
} }
// implement custom average to boost from (if enabled among options) // implement custom average to boost from (if enabled among options)
double BoostFromScore() const override { double BoostFromScore(int) const override {
double suml = 0.0f; double suml = 0.0f;
double sumw = 0.0f; double sumw = 0.0f;
if (weights_ != nullptr) { if (weights_ != nullptr) {
...@@ -121,6 +121,8 @@ public: ...@@ -121,6 +121,8 @@ public:
} }
} }
double pavg = suml / sumw; double pavg = suml / sumw;
pavg = std::min(pavg, 1.0 - kEpsilon);
pavg = std::max<double>(pavg, kEpsilon);
double initscore = std::log(pavg / (1.0f - pavg)); double initscore = std::log(pavg / (1.0f - pavg));
Log::Info("[%s:%s]: pavg = %f -> initscore = %f", GetName(), __func__, pavg, initscore); Log::Info("[%s:%s]: pavg = %f -> initscore = %f", GetName(), __func__, pavg, initscore);
return initscore; return initscore;
...@@ -229,7 +231,7 @@ public: ...@@ -229,7 +231,7 @@ public:
return str_buf.str(); return str_buf.str();
} }
double BoostFromScore() const override { double BoostFromScore(int) const override {
double suml = 0.0f; double suml = 0.0f;
double sumw = 0.0f; double sumw = 0.0f;
if (weights_ != nullptr) { if (weights_ != nullptr) {
......
...@@ -739,3 +739,55 @@ class TestEngine(unittest.TestCase): ...@@ -739,3 +739,55 @@ class TestEngine(unittest.TestCase):
pred = gbm.predict(X) pred = gbm.predict(X)
pred_mean = pred.mean() pred_mean = pred.mean()
self.assertGreater(pred_mean, 18) self.assertGreater(pred_mean, 18)
def test_constant_features(self, y_true=None, expected_pred=None, more_params=None):
    """Train on one all-ones feature column and compare predictions with ``expected_pred``.

    When called with the defaults (``y_true`` or ``expected_pred`` is ``None``)
    the method does nothing.

    y_true: labels, one per training row.
    expected_pred: value(s) each prediction is compared against with ``np.allclose``.
    more_params: optional parameter overrides merged on top of the base parameters.
    """
    if y_true is None or expected_pred is None:
        return
    X_train = np.ones((len(y_true), 1))
    y_train = np.array(y_true)
    params = {
        'objective': 'regression',
        'num_class': 1,
        'verbose': -1,
        'min_data': 1,
        'num_leaves': 2,
        'learning_rate': 1,
        'min_data_in_bin': 1,
        'boost_from_average': True
    }
    # more_params defaults to None, and dict.update(None) raises TypeError,
    # so only merge overrides that were actually supplied.
    if more_params is not None:
        params.update(more_params)
    lgb_train = lgb.Dataset(X_train, y_train, params=params)
    gbm = lgb.train(params, lgb_train,
                    num_boost_round=2)
    pred = gbm.predict(X_train)
    self.assertTrue(np.allclose(pred, expected_pred))
def test_constant_features_regression(self):
    """Run the constant-feature check with the regression objective."""
    objective = {'objective': 'regression'}
    # (labels, expected prediction) pairs, checked in order.
    cases = (
        ([0.0, 10.0, 0.0, 10.0], 5.0),
        ([0.0, 1.0, 2.0, 3.0], 1.5),
        ([-1.0, 1.0, -2.0, 2.0], 0.0),
    )
    for labels, expected in cases:
        self.test_constant_features(labels, expected, objective)
def test_constant_features_binary(self):
    """Run the constant-feature check with the binary objective."""
    objective = {'objective': 'binary'}
    # (labels, expected prediction) pairs, checked in order.
    cases = (
        ([0.0, 10.0, 0.0, 10.0], 0.5),
        ([0.0, 1.0, 2.0, 3.0], 0.75),
    )
    for labels, expected in cases:
        self.test_constant_features(labels, expected, objective)
def test_constant_features_multiclass(self):
    """Run the constant-feature check with the three-class multiclass objective."""
    objective = {'objective': 'multiclass', 'num_class': 3}
    # (labels, expected per-class prediction) pairs, checked in order.
    cases = (
        ([0.0, 1.0, 2.0, 0.0], [0.5, 0.25, 0.25]),
        ([0.0, 1.0, 2.0, 1.0], [0.25, 0.5, 0.25]),
    )
    for labels, expected in cases:
        self.test_constant_features(labels, expected, objective)
def test_constant_features_multiclassova(self):
    """Run the constant-feature check with the three-class multiclassova objective."""
    objective = {'objective': 'multiclassova', 'num_class': 3}
    # (labels, expected per-class prediction) pairs, checked in order.
    cases = (
        ([0.0, 1.0, 2.0, 0.0], [0.5, 0.25, 0.25]),
        ([0.0, 1.0, 2.0, 1.0], [0.25, 0.5, 0.25]),
    )
    for labels, expected in cases:
        self.test_constant_features(labels, expected, objective)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment