Unverified Commit c920e634 authored by Guolin Ke's avatar Guolin Ke Committed by GitHub
Browse files

average predictions for constant features (#1735)

* average predictions for constant features

* fix possible numerical issues in std::log.

* fix pylint

* fix bugs in c_api

* fix styles

* clean code for multi class

* rewrite test

* fix pylint

* skip test_constant_features

* refine test

* fix tests

* fix tests

* update FAQ

* fix test

* Update FAQ.rst
parent e39f1f91
......@@ -106,15 +106,12 @@ LightGBM
--------------
- **Question 9**: When I'm trying to specify a categorical column with the ``categorical_feature`` parameter,
I get the following sequence of warnings, but there are no negative values in the column.
::
[LightGBM] [Warning] Met negative value in categorical features, will convert it to NaN
[LightGBM] [Fatal] Cannot construct Dataset since there are no useful features.
It should be at least two unique rows.
If the num_row (num_data) is small, you can set min_data=1 and min_data_in_bin=1 to fix this.
Otherwise, please make sure you are using the right dataset
[LightGBM] [Warning] There are no meaningful features, as all feature values are constant.
- **Solution 9**: The column you're trying to pass via ``categorical_feature`` likely contains very large values.
Categorical features in LightGBM are limited by int32 range,
......
......@@ -42,7 +42,9 @@ public:
const data_size_t*,
data_size_t) const { return ori_output; }
virtual double BoostFromScore() const { return 0.0f; }
virtual double BoostFromScore(int /*class_id*/) const { return 0.0; }
virtual bool ClassNeedTrain(int /*class_id*/) const { return true; }
virtual bool SkipEmptyClass() const { return false; }
......
......@@ -18,15 +18,6 @@
namespace LightGBM {
#ifdef TIMETAG
std::chrono::duration<double, std::milli> boosting_time;
std::chrono::duration<double, std::milli> train_score_time;
std::chrono::duration<double, std::milli> out_of_bag_score_time;
std::chrono::duration<double, std::milli> valid_score_time;
std::chrono::duration<double, std::milli> metric_time;
std::chrono::duration<double, std::milli> bagging_time;
std::chrono::duration<double, std::milli> tree_time;
#endif // TIMETAG
GBDT::GBDT() : iter_(0),
train_data_(nullptr),
......@@ -50,21 +41,12 @@ need_re_bagging_(false) {
}
// Destructor. When built with TIMETAG, dumps the per-phase timing totals
// accumulated over the whole training run before the object goes away.
// NOTE(review): the *_time accumulators are the file-scope
// std::chrono::duration<double, std::milli> globals declared above;
// the 1e-3 factor presumably rescales milliseconds to seconds — confirm
// that Log::Info's "%f" varargs handling of a duration value is intended.
GBDT::~GBDT() {
#ifdef TIMETAG
Log::Info("GBDT::boosting costs %f", boosting_time * 1e-3);
Log::Info("GBDT::train_score costs %f", train_score_time * 1e-3);
Log::Info("GBDT::out_of_bag_score costs %f", out_of_bag_score_time * 1e-3);
Log::Info("GBDT::valid_score costs %f", valid_score_time * 1e-3);
Log::Info("GBDT::metric costs %f", metric_time * 1e-3);
Log::Info("GBDT::bagging costs %f", bagging_time * 1e-3);
Log::Info("GBDT::tree costs %f", tree_time * 1e-3);
#endif
}
void GBDT::Init(const Config* config, const Dataset* train_data, const ObjectiveFunction* objective_function,
const std::vector<const Metric*>& training_metrics) {
CHECK(train_data != nullptr);
CHECK(train_data->num_features() > 0);
train_data_ = train_data;
iter_ = 0;
num_iteration_for_pred_ = 0;
......@@ -125,45 +107,11 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective
// if need bagging, create buffer
ResetBaggingConfig(config_.get(), true);
// reset config for tree learner
class_need_train_ = std::vector<bool>(num_tree_per_iteration_, true);
if (objective_function_ != nullptr && objective_function_->SkipEmptyClass()) {
CHECK(num_tree_per_iteration_ == num_class_);
class_default_output_ = std::vector<double>(num_tree_per_iteration_, 0.0f);
auto label = train_data_->metadata().label();
if (num_tree_per_iteration_ > 1) {
// multi-class
std::vector<data_size_t> cnt_per_class(num_tree_per_iteration_, 0);
for (data_size_t i = 0; i < num_data_; ++i) {
int index = static_cast<int>(label[i]);
CHECK(index < num_tree_per_iteration_);
++cnt_per_class[index];
}
for (int i = 0; i < num_tree_per_iteration_; ++i) {
if (cnt_per_class[i] == num_data_) {
class_need_train_[i] = false;
class_default_output_[i] = -std::log(kEpsilon);
} else if (cnt_per_class[i] == 0) {
class_need_train_[i] = false;
class_default_output_[i] = -std::log(1.0f / kEpsilon - 1.0f);
}
}
} else {
// binary class
data_size_t cnt_pos = 0;
for (data_size_t i = 0; i < num_data_; ++i) {
if (label[i] > 0) {
++cnt_pos;
}
}
if (cnt_pos == 0) {
class_need_train_[0] = false;
class_default_output_[0] = -std::log(1.0f / kEpsilon - 1.0f);
} else if (cnt_pos == num_data_) {
class_need_train_[0] = false;
class_default_output_[0] = -std::log(kEpsilon);
}
for (int i = 0; i < num_class_; ++i) {
class_need_train_[i] = objective_function_->ClassNeedTrain(i);
}
}
}
......@@ -294,27 +242,6 @@ void GBDT::Bagging(int iter) {
}
}
/*!
 * \brief Compute the initial score that boosting starts from.
 *
 * If the objective implements a custom "average" (BoostFromScore), that value
 * is used in place of the plain label average; with no objective the score is
 * zero. In distributed mode the score is averaged across all machines so that
 * every worker begins from the same baseline.
 *
 * A possible refinement is an explicit option selecting between (i) the
 * standard average, (ii) the objective-specific average when available, or
 * (iii) a user-supplied scalar bias overriding both.
 */
double ObtainAutomaticInitialScore(const ObjectiveFunction* fobj) {
  double score = (fobj != nullptr) ? fobj->BoostFromScore() : 0.0;
  if (Network::num_machines() > 1) {
    score = Network::GlobalSyncUpByMean(score);
  }
  return score;
}
void GBDT::Train(int snapshot_freq, const std::string& model_output_path) {
bool is_finished = false;
auto start_time = std::chrono::steady_clock::now();
......@@ -360,17 +287,36 @@ void GBDT::RefitTree(const std::vector<std::vector<int>>& tree_leaf_prediction)
}
}
double GBDT::BoostFromAverage() {
/*!
 * \brief Compute the per-class initial score that boosting starts from.
 *
 * If the objective implements a custom "average" (BoostFromScore), that value
 * is used in place of the plain label average; with no objective the score is
 * zero. In distributed mode the score is averaged across all machines so that
 * every worker begins from the same baseline.
 *
 * A possible refinement is an explicit option selecting between (i) the
 * standard average, (ii) the objective-specific average when available, or
 * (iii) a user-supplied scalar bias overriding both.
 */
double ObtainAutomaticInitialScore(const ObjectiveFunction* fobj, int class_id) {
  double score = (fobj != nullptr) ? fobj->BoostFromScore(class_id) : 0.0;
  if (Network::num_machines() > 1) {
    score = Network::GlobalSyncUpByMean(score);
  }
  return score;
}
double GBDT::BoostFromAverage(int class_id) {
// boosting from average label; or customized "average" if implemented for the current objective
if (models_.empty() && !train_score_updater_->has_init_score()
&& num_class_ <= 1
&& objective_function_ != nullptr) {
if (config_->boost_from_average) {
double init_score = ObtainAutomaticInitialScore(objective_function_);
if (models_.empty() && !train_score_updater_->has_init_score() && objective_function_ != nullptr) {
if (config_->boost_from_average || (train_data_ != nullptr && train_data_->num_features() == 0)) {
double init_score = ObtainAutomaticInitialScore(objective_function_, class_id);
if (std::fabs(init_score) > kEpsilon) {
train_score_updater_->AddScore(init_score, 0);
train_score_updater_->AddScore(init_score, class_id);
for (auto& score_updater : valid_score_updater_) {
score_updater->AddScore(init_score, 0);
score_updater->AddScore(init_score, class_id);
}
Log::Info("Start training from score %lf", init_score);
return init_score;
......@@ -385,46 +331,26 @@ double GBDT::BoostFromAverage() {
}
bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) {
double init_score = 0.0f;
std::vector<double> init_scores(num_tree_per_iteration_, 0.0);
// boosting first
if (gradients == nullptr || hessians == nullptr) {
init_score = BoostFromAverage();
#ifdef TIMETAG
auto start_time = std::chrono::steady_clock::now();
#endif
for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
init_scores[cur_tree_id] = BoostFromAverage(cur_tree_id);
}
Boosting();
gradients = gradients_.data();
hessians = hessians_.data();
#ifdef TIMETAG
boosting_time += std::chrono::steady_clock::now() - start_time;
#endif
}
#ifdef TIMETAG
auto start_time = std::chrono::steady_clock::now();
#endif
// bagging logic
Bagging(iter_);
#ifdef TIMETAG
bagging_time += std::chrono::steady_clock::now() - start_time;
#endif
bool should_continue = false;
for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
#ifdef TIMETAG
start_time = std::chrono::steady_clock::now();
#endif
const size_t bias = static_cast<size_t>(cur_tree_id) * num_data_;
std::unique_ptr<Tree> new_tree(new Tree(2));
if (class_need_train_[cur_tree_id]) {
if (class_need_train_[cur_tree_id] && train_data_->num_features() > 0) {
auto grad = gradients + bias;
auto hess = hessians + bias;
// need to copy gradients for bagging subset.
if (is_use_subset_ && bag_data_cnt_ < num_data_) {
for (int i = 0; i < bag_data_cnt_; ++i) {
......@@ -434,14 +360,9 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) {
grad = gradients_.data() + bias;
hess = hessians_.data() + bias;
}
new_tree.reset(tree_learner_->Train(grad, hess, is_constant_hessian_, forced_splits_json_));
}
#ifdef TIMETAG
tree_time += std::chrono::steady_clock::now() - start_time;
#endif
if (new_tree->num_leaves() > 1) {
should_continue = true;
tree_learner_->RenewTreeOutput(new_tree.get(), objective_function_, train_score_updater_->score() + bias,
......@@ -450,13 +371,20 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) {
new_tree->Shrinkage(shrinkage_rate_);
// update score
UpdateScore(new_tree.get(), cur_tree_id);
if (std::fabs(init_score) > kEpsilon) {
new_tree->AddBias(init_score);
if (std::fabs(init_scores[cur_tree_id]) > kEpsilon) {
new_tree->AddBias(init_scores[cur_tree_id]);
}
} else {
// only add default score one-time
if (!class_need_train_[cur_tree_id] && models_.size() < static_cast<size_t>(num_tree_per_iteration_)) {
auto output = class_default_output_[cur_tree_id];
if (models_.size() < static_cast<size_t>(num_tree_per_iteration_)) {
double output = 0.0;
if (!class_need_train_[cur_tree_id]) {
if (objective_function_ != nullptr) {
output = objective_function_->BoostFromScore(cur_tree_id);
}
} else {
output = init_scores[cur_tree_id];
}
new_tree->AsConstantTree(output);
// updates scores
train_score_updater_->AddScore(output, cur_tree_id);
......@@ -471,9 +399,11 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) {
if (!should_continue) {
Log::Warning("Stopped training because there are no more leaves that meet the split requirements");
if (models_.size() > static_cast<size_t>(num_tree_per_iteration_)) {
for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
models_.pop_back();
}
}
return true;
}
......@@ -501,17 +431,9 @@ void GBDT::RollbackOneIter() {
bool GBDT::EvalAndCheckEarlyStopping() {
bool is_met_early_stopping = false;
#ifdef TIMETAG
auto start_time = std::chrono::steady_clock::now();
#endif
// print message for metric
auto best_msg = OutputMetric(iter_);
#ifdef TIMETAG
metric_time += std::chrono::steady_clock::now() - start_time;
#endif
is_met_early_stopping = !best_msg.empty();
if (is_met_early_stopping) {
......@@ -528,52 +450,24 @@ bool GBDT::EvalAndCheckEarlyStopping() {
// Adds the newly trained tree's outputs into the cached training and
// validation scores for model slot `cur_tree_id`.
void GBDT::UpdateScore(const Tree* tree, const int cur_tree_id) {
#ifdef TIMETAG
auto start_time = std::chrono::steady_clock::now();
#endif
// update training score
if (!is_use_subset_) {
// Tree learner saw the full dataset: reuse its internal data partition
// for a fast per-leaf score update.
train_score_updater_->AddScore(tree_learner_.get(), tree, cur_tree_id);
#ifdef TIMETAG
train_score_time += std::chrono::steady_clock::now() - start_time;
#endif
#ifdef TIMETAG
start_time = std::chrono::steady_clock::now();
#endif
// we need to predict out-of-bag scores of data for boosting
if (num_data_ - bag_data_cnt_ > 0) {
train_score_updater_->AddScore(tree, bag_data_indices_.data() + bag_data_cnt_, num_data_ - bag_data_cnt_, cur_tree_id);
}
#ifdef TIMETAG
out_of_bag_score_time += std::chrono::steady_clock::now() - start_time;
#endif
} else {
// Subset bagging: the learner only saw the bag, so score every row by
// running the tree over the whole training set.
train_score_updater_->AddScore(tree, cur_tree_id);
#ifdef TIMETAG
train_score_time += std::chrono::steady_clock::now() - start_time;
#endif
}
#ifdef TIMETAG
start_time = std::chrono::steady_clock::now();
#endif
// update validation score
for (auto& score_updater : valid_score_updater_) {
score_updater->AddScore(tree, cur_tree_id);
}
#ifdef TIMETAG
valid_score_time += std::chrono::steady_clock::now() - start_time;
#endif
}
std::vector<double> GBDT::EvalOneMetric(const Metric* metric, const double* score) const {
......
......@@ -407,7 +407,7 @@ protected:
*/
std::string OutputMetric(int iter);
double BoostFromAverage();
double BoostFromAverage(int class_id);
/*! \brief current iteration */
int iter_;
......@@ -481,7 +481,6 @@ protected:
std::unique_ptr<Dataset> tmp_subset_;
bool is_use_subset_;
std::vector<bool> class_need_train_;
std::vector<double> class_default_output_;
bool is_constant_hessian_;
std::unique_ptr<ObjectiveFunction> loaded_objective_;
bool average_output_;
......
......@@ -52,7 +52,6 @@ public:
Booster(const Dataset* train_data,
const char* parameters) {
CHECK(train_data->num_features() > 0);
auto param = Config::Str2Map(parameters);
config_.Set(param);
if (config_.num_threads > 0) {
......@@ -116,7 +115,6 @@ public:
void ResetTrainingData(const Dataset* train_data) {
if (train_data != train_data_) {
CHECK(train_data->num_features() > 0);
std::lock_guard<std::mutex> lock(mutex_);
train_data_ = train_data;
CreateObjectiveAndMetrics();
......
......@@ -226,14 +226,11 @@ void Dataset::Construct(
}
}
if (used_features.empty()) {
Log::Fatal("Cannot construct Dataset since there are no useful features.\n"
"It should be at least two unique rows.\n"
"If the num_row (num_data) is small, you can set min_data=1 and min_data_in_bin=1 to fix this.\n"
"Otherwise, please make sure you are using the right dataset");
Log::Warning("There are no meaningful features, as all feature values are constant.");
}
auto features_in_group = NoGroup(used_features);
if (io_config.enable_bundle) {
if (io_config.enable_bundle && !used_features.empty()) {
features_in_group = FastFeatureBundling(bin_mappers,
sample_non_zero_indices, num_per_col, total_sample_cnt,
used_features, io_config.max_conflict_rate,
......@@ -323,14 +320,16 @@ void Dataset::Construct(
// Finalizes the bin data of every feature group after loading.
// Idempotent: subsequent calls return immediately.
void Dataset::FinishLoad() {
  if (is_finish_load_) { return; }
  if (num_groups_ > 0) {
    OMP_INIT_EX();
    // Fix: the source carried two consecutive `parallel for` pragmas
    // (diff residue), which would nest parallel regions / bind the first
    // pragma to the second instead of the loop. Exactly one is correct.
    #pragma omp parallel for schedule(guided)
    for (int i = 0; i < num_groups_; ++i) {
      OMP_LOOP_EX_BEGIN();
      feature_groups_[i]->bin_data_->FinishLoad();
      OMP_LOOP_EX_END();
    }
    OMP_THROW_EX();
  }
  is_finish_load_ = true;
}
......
......@@ -629,9 +629,6 @@ void DatasetLoader::CheckDataset(const Dataset* dataset) {
if (dataset->num_data_ <= 0) {
Log::Fatal("Data file %s is empty", dataset->data_filename_.c_str());
}
if (dataset->feature_groups_.empty()) {
Log::Fatal("No usable features in data file %s", dataset->data_filename_.c_str());
}
if (dataset->feature_names_.size() != static_cast<size_t>(dataset->num_total_features_)) {
Log::Fatal("Size of feature name error, should be %d, got %d", dataset->num_total_features_,
static_cast<int>(dataset->feature_names_.size()));
......
......@@ -121,28 +121,34 @@ public:
}
// implement custom average to boost from (if enabled among options)
double BoostFromScore() const override {
double BoostFromScore(int) const override {
double suml = 0.0f;
double sumw = 0.0f;
if (weights_ != nullptr) {
#pragma omp parallel for schedule(static) reduction(+:suml,sumw)
for (data_size_t i = 0; i < num_data_; ++i) {
suml += label_[i] * weights_[i];
suml += is_pos_(label_[i]) * weights_[i];
sumw += weights_[i];
}
} else {
sumw = static_cast<double>(num_data_);
#pragma omp parallel for schedule(static) reduction(+:suml)
for (data_size_t i = 0; i < num_data_; ++i) {
suml += label_[i];
suml += is_pos_(label_[i]);
}
}
double pavg = suml / sumw;
pavg = std::min(pavg, 1.0 - kEpsilon);
pavg = std::max<double>(pavg, kEpsilon);
double initscore = std::log(pavg / (1.0f - pavg)) / sigmoid_;
Log::Info("[%s:%s]: pavg=%f -> initscore=%f", GetName(), __func__, pavg, initscore);
return initscore;
}
// The single binary model needs training whenever any data rows exist;
// class_id is unused because the binary objective has only one model.
bool ClassNeedTrain(int /*class_id*/) const override {
return num_data_ > 0;
}
const char* GetName() const override {
return "binary";
}
......
......@@ -43,11 +43,25 @@ public:
label_ = metadata.label();
weights_ = metadata.weights();
label_int_.resize(num_data_);
class_init_probs_.resize(num_class_, 0.0);
double sum_weight = 0.0;
for (int i = 0; i < num_data_; ++i) {
label_int_[i] = static_cast<int>(label_[i]);
if (label_int_[i] < 0 || label_int_[i] >= num_class_) {
Log::Fatal("Label must be in [0, %d), but found %d in label", num_class_, label_int_[i]);
}
if (weights_ == nullptr) {
class_init_probs_[label_int_[i]] += 1.0;
} else {
class_init_probs_[label_int_[i]] += weights_[i];
sum_weight += weights_[i];
}
}
if (weights_ == nullptr) {
sum_weight = num_data_;
}
for (int i = 0; i < num_class_; ++i) {
class_init_probs_[i] /= sum_weight;
}
}
......@@ -120,6 +134,19 @@ public:
bool NeedAccuratePrediction() const override { return false; }
// Initial raw score for one class: log of its empirical prior probability,
// clamped below by kEpsilon so the logarithm stays finite.
double BoostFromScore(int class_id) const override {
  const double prob = std::max<double>(kEpsilon, class_init_probs_[class_id]);
  return std::log(prob);
}
// A class only needs boosting when its prior probability is strictly
// inside (kEpsilon, 1 - kEpsilon); degenerate all-or-nothing classes are
// handled by a constant score instead.
bool ClassNeedTrain(int class_id) const override {
  const double prob = std::fabs(class_init_probs_[class_id]);
  return prob > kEpsilon && prob < 1.0 - kEpsilon;
}
private:
/*! \brief Number of data */
data_size_t num_data_;
......@@ -131,6 +158,7 @@ private:
std::vector<int> label_int_;
/*! \brief Weights for data */
const label_t* weights_;
std::vector<double> class_init_probs_;
};
/*!
......@@ -212,6 +240,14 @@ public:
bool NeedAccuratePrediction() const override { return false; }
// Per-class initial score: forwarded to that class's one-vs-all binary
// loss (which internally uses class id 0, being a single-class objective).
double BoostFromScore(int class_id) const override {
return binary_loss_[class_id]->BoostFromScore(0);
}
// Whether this class's model needs boosting: forwarded to the underlying
// one-vs-all binary loss for the class.
bool ClassNeedTrain(int class_id) const override {
return binary_loss_[class_id]->ClassNeedTrain(0);
}
private:
/*! \brief Number of data */
data_size_t num_data_;
......
......@@ -139,7 +139,7 @@ public:
}
}
double BoostFromScore() const override {
double BoostFromScore(int) const override {
double suml = 0.0f;
double sumw = 0.0f;
if (weights_ != nullptr) {
......@@ -201,7 +201,7 @@ public:
}
}
double BoostFromScore() const override {
double BoostFromScore(int) const override {
const double alpha = 0.5;
if (weights_ != nullptr) {
#define data_reader(i) (label_[i])
......@@ -436,8 +436,8 @@ public:
return "poisson";
}
double BoostFromScore() const override {
return std::log(RegressionL2loss::BoostFromScore());
double BoostFromScore(int) const override {
return std::log(RegressionL2loss::BoostFromScore(0));
}
bool IsConstantHessian() const override {
......@@ -493,7 +493,7 @@ public:
return "quantile";
}
double BoostFromScore() const override {
double BoostFromScore(int) const override {
if (weights_ != nullptr) {
#define data_reader(i) (label_[i])
#define weight_reader(i) (weights_[i])
......@@ -600,7 +600,7 @@ public:
}
}
double BoostFromScore() const override {
double BoostFromScore(int) const override {
const double alpha = 0.5;
#define data_reader(i) (label_[i])
#define weight_reader(i) (label_weight_[i])
......
......@@ -104,7 +104,7 @@ public:
}
// implement custom average to boost from (if enabled among options)
double BoostFromScore() const override {
double BoostFromScore(int) const override {
double suml = 0.0f;
double sumw = 0.0f;
if (weights_ != nullptr) {
......@@ -121,6 +121,8 @@ public:
}
}
double pavg = suml / sumw;
pavg = std::min(pavg, 1.0 - kEpsilon);
pavg = std::max<double>(pavg, kEpsilon);
double initscore = std::log(pavg / (1.0f - pavg));
Log::Info("[%s:%s]: pavg = %f -> initscore = %f", GetName(), __func__, pavg, initscore);
return initscore;
......@@ -229,7 +231,7 @@ public:
return str_buf.str();
}
double BoostFromScore() const override {
double BoostFromScore(int) const override {
double suml = 0.0f;
double sumw = 0.0f;
if (weights_ != nullptr) {
......
......@@ -739,3 +739,55 @@ class TestEngine(unittest.TestCase):
pred = gbm.predict(X)
pred_mean = pred.mean()
self.assertGreater(pred_mean, 18)
def test_constant_features(self, y_true=None, expected_pred=None, more_params=None):
    """Train on a single constant feature and check constant predictions.

    Acts as a no-op when invoked directly by the unittest runner (all
    arguments default to None); the objective-specific tests below call it
    with concrete labels, the expected prediction, and parameter overrides.
    """
    if y_true is not None and expected_pred is not None:
        X_train = np.ones((len(y_true), 1))
        y_train = np.array(y_true)
        params = {
            'objective': 'regression',
            'num_class': 1,
            'verbose': -1,
            'min_data': 1,
            'num_leaves': 2,
            'learning_rate': 1,
            'min_data_in_bin': 1,
            'boost_from_average': True
        }
        # Guard: dict.update(None) raises TypeError, and more_params
        # defaults to None, so only merge when overrides were given.
        if more_params is not None:
            params.update(more_params)
        lgb_train = lgb.Dataset(X_train, y_train, params=params)
        gbm = lgb.train(params, lgb_train,
                        num_boost_round=2)
        pred = gbm.predict(X_train)
        self.assertTrue(np.allclose(pred, expected_pred))
def test_constant_features_regression(self):
    """Regression on a constant feature predicts the (weighted) label mean."""
    overrides = {'objective': 'regression'}
    cases = [
        ([0.0, 10.0, 0.0, 10.0], 5.0),
        ([0.0, 1.0, 2.0, 3.0], 1.5),
        ([-1.0, 1.0, -2.0, 2.0], 0.0),
    ]
    for labels, expected in cases:
        self.test_constant_features(labels, expected, overrides)
def test_constant_features_binary(self):
    """Binary objective on a constant feature predicts the positive rate."""
    overrides = {'objective': 'binary'}
    cases = [
        ([0.0, 10.0, 0.0, 10.0], 0.5),
        ([0.0, 1.0, 2.0, 3.0], 0.75),
    ]
    for labels, expected in cases:
        self.test_constant_features(labels, expected, overrides)
def test_constant_features_multiclass(self):
    """Multiclass softmax on a constant feature predicts class frequencies."""
    overrides = {
        'objective': 'multiclass',
        'num_class': 3
    }
    cases = [
        ([0.0, 1.0, 2.0, 0.0], [0.5, 0.25, 0.25]),
        ([0.0, 1.0, 2.0, 1.0], [0.25, 0.5, 0.25]),
    ]
    for labels, expected in cases:
        self.test_constant_features(labels, expected, overrides)
def test_constant_features_multiclassova(self):
    """One-vs-all multiclass on a constant feature predicts class frequencies."""
    overrides = {
        'objective': 'multiclassova',
        'num_class': 3
    }
    cases = [
        ([0.0, 1.0, 2.0, 0.0], [0.5, 0.25, 0.25]),
        ([0.0, 1.0, 2.0, 1.0], [0.25, 0.5, 0.25]),
    ]
    for labels, expected in cases:
        self.test_constant_features(labels, expected, overrides)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment