Unverified Commit c920e634 authored by Guolin Ke's avatar Guolin Ke Committed by GitHub
Browse files

average predictions for constant features (#1735)

* average predictions for constant features

* fix possible numerical issues in std::log.

* fix pylint

* fix bugs in c_api

* fix styles

* clean code for multi class

* rewrite test

* fix pylint

* skip test_constant_features

* refine test

* fix tests

* fix tests

* update FAQ

* fix test

* Update FAQ.rst
parent e39f1f91
......@@ -106,15 +106,12 @@ LightGBM
--------------
- **Question 9**: When I'm trying to specify a categorical column with the ``categorical_feature`` parameter,
I get the following sequence of warnings, but there are no negative values in the column.
::
[LightGBM] [Warning] Met negative value in categorical features, will convert it to NaN
[LightGBM] [Fatal] Cannot construct Dataset since there are no useful features.
It should be at least two unique rows.
If the num_row (num_data) is small, you can set min_data=1 and min_data_in_bin=1 to fix this.
Otherwise, please make sure you are using the right dataset
[LightGBM] [Warning] There are no meaningful features, as all feature values are constant.
- **Solution 9**: The column you're trying to pass via ``categorical_feature`` likely contains very large values.
Categorical features in LightGBM are limited by int32 range,
......
......@@ -42,7 +42,9 @@ public:
const data_size_t*,
data_size_t) const { return ori_output; }
virtual double BoostFromScore() const { return 0.0f; }
virtual double BoostFromScore(int /*class_id*/) const { return 0.0; }
virtual bool ClassNeedTrain(int /*class_id*/) const { return true; }
virtual bool SkipEmptyClass() const { return false; }
......
......@@ -18,15 +18,6 @@
namespace LightGBM {
#ifdef TIMETAG
std::chrono::duration<double, std::milli> boosting_time;
std::chrono::duration<double, std::milli> train_score_time;
std::chrono::duration<double, std::milli> out_of_bag_score_time;
std::chrono::duration<double, std::milli> valid_score_time;
std::chrono::duration<double, std::milli> metric_time;
std::chrono::duration<double, std::milli> bagging_time;
std::chrono::duration<double, std::milli> tree_time;
#endif // TIMETAG
GBDT::GBDT() : iter_(0),
train_data_(nullptr),
......@@ -50,21 +41,12 @@ need_re_bagging_(false) {
}
// Destructor. When built with TIMETAG, dumps the per-phase timing totals
// accumulated over the whole training run before the object goes away.
// NOTE(review): the *_time accumulators are the file-scope
// std::chrono::duration<double, std::milli> globals declared above;
// the 1e-3 factor presumably rescales milliseconds to seconds — confirm
// that Log::Info's "%f" varargs handling of a duration value is intended.
GBDT::~GBDT() {
#ifdef TIMETAG
Log::Info("GBDT::boosting costs %f", boosting_time * 1e-3);
Log::Info("GBDT::train_score costs %f", train_score_time * 1e-3);
Log::Info("GBDT::out_of_bag_score costs %f", out_of_bag_score_time * 1e-3);
Log::Info("GBDT::valid_score costs %f", valid_score_time * 1e-3);
Log::Info("GBDT::metric costs %f", metric_time * 1e-3);
Log::Info("GBDT::bagging costs %f", bagging_time * 1e-3);
Log::Info("GBDT::tree costs %f", tree_time * 1e-3);
#endif
}
void GBDT::Init(const Config* config, const Dataset* train_data, const ObjectiveFunction* objective_function,
const std::vector<const Metric*>& training_metrics) {
CHECK(train_data != nullptr);
CHECK(train_data->num_features() > 0);
train_data_ = train_data;
iter_ = 0;
num_iteration_for_pred_ = 0;
......@@ -125,45 +107,11 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective
// if need bagging, create buffer
ResetBaggingConfig(config_.get(), true);
// reset config for tree learner
class_need_train_ = std::vector<bool>(num_tree_per_iteration_, true);
if (objective_function_ != nullptr && objective_function_->SkipEmptyClass()) {
CHECK(num_tree_per_iteration_ == num_class_);
class_default_output_ = std::vector<double>(num_tree_per_iteration_, 0.0f);
auto label = train_data_->metadata().label();
if (num_tree_per_iteration_ > 1) {
// multi-class
std::vector<data_size_t> cnt_per_class(num_tree_per_iteration_, 0);
for (data_size_t i = 0; i < num_data_; ++i) {
int index = static_cast<int>(label[i]);
CHECK(index < num_tree_per_iteration_);
++cnt_per_class[index];
}
for (int i = 0; i < num_tree_per_iteration_; ++i) {
if (cnt_per_class[i] == num_data_) {
class_need_train_[i] = false;
class_default_output_[i] = -std::log(kEpsilon);
} else if (cnt_per_class[i] == 0) {
class_need_train_[i] = false;
class_default_output_[i] = -std::log(1.0f / kEpsilon - 1.0f);
}
}
} else {
// binary class
data_size_t cnt_pos = 0;
for (data_size_t i = 0; i < num_data_; ++i) {
if (label[i] > 0) {
++cnt_pos;
}
}
if (cnt_pos == 0) {
class_need_train_[0] = false;
class_default_output_[0] = -std::log(1.0f / kEpsilon - 1.0f);
} else if (cnt_pos == num_data_) {
class_need_train_[0] = false;
class_default_output_[0] = -std::log(kEpsilon);
}
for (int i = 0; i < num_class_; ++i) {
class_need_train_[i] = objective_function_->ClassNeedTrain(i);
}
}
}
......@@ -294,27 +242,6 @@ void GBDT::Bagging(int iter) {
}
}
/*!
 * \brief Compute the initial score that boosting starts from.
 *
 * If the objective implements a custom "average" (BoostFromScore), that value
 * is used in place of the plain label average; with no objective the score is
 * zero. In distributed mode the score is averaged across all machines so that
 * every worker begins from the same baseline.
 *
 * A possible refinement is an explicit option selecting between (i) the
 * standard average, (ii) the objective-specific average when available, or
 * (iii) a user-supplied scalar bias overriding both.
 */
double ObtainAutomaticInitialScore(const ObjectiveFunction* fobj) {
  double score = (fobj != nullptr) ? fobj->BoostFromScore() : 0.0;
  if (Network::num_machines() > 1) {
    score = Network::GlobalSyncUpByMean(score);
  }
  return score;
}
void GBDT::Train(int snapshot_freq, const std::string& model_output_path) {
bool is_finished = false;
auto start_time = std::chrono::steady_clock::now();
......@@ -360,17 +287,36 @@ void GBDT::RefitTree(const std::vector<std::vector<int>>& tree_leaf_prediction)
}
}
double GBDT::BoostFromAverage() {
/*!
 * \brief Compute the per-class initial score that boosting starts from.
 *
 * If the objective implements a custom "average" (BoostFromScore), that value
 * is used in place of the plain label average; with no objective the score is
 * zero. In distributed mode the score is averaged across all machines so that
 * every worker begins from the same baseline.
 *
 * A possible refinement is an explicit option selecting between (i) the
 * standard average, (ii) the objective-specific average when available, or
 * (iii) a user-supplied scalar bias overriding both.
 */
double ObtainAutomaticInitialScore(const ObjectiveFunction* fobj, int class_id) {
  double score = (fobj != nullptr) ? fobj->BoostFromScore(class_id) : 0.0;
  if (Network::num_machines() > 1) {
    score = Network::GlobalSyncUpByMean(score);
  }
  return score;
}
double GBDT::BoostFromAverage(int class_id) {
// boosting from average label; or customized "average" if implemented for the current objective
if (models_.empty() && !train_score_updater_->has_init_score()
&& num_class_ <= 1
&& objective_function_ != nullptr) {
if (config_->boost_from_average) {
double init_score = ObtainAutomaticInitialScore(objective_function_);
if (models_.empty() && !train_score_updater_->has_init_score() && objective_function_ != nullptr) {
if (config_->boost_from_average || (train_data_ != nullptr && train_data_->num_features() == 0)) {
double init_score = ObtainAutomaticInitialScore(objective_function_, class_id);
if (std::fabs(init_score) > kEpsilon) {
train_score_updater_->AddScore(init_score, 0);
train_score_updater_->AddScore(init_score, class_id);
for (auto& score_updater : valid_score_updater_) {
score_updater->AddScore(init_score, 0);
score_updater->AddScore(init_score, class_id);
}
Log::Info("Start training from score %lf", init_score);
return init_score;
......@@ -385,46 +331,26 @@ double GBDT::BoostFromAverage() {
}
bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) {
double init_score = 0.0f;
std::vector<double> init_scores(num_tree_per_iteration_, 0.0);
// boosting first
if (gradients == nullptr || hessians == nullptr) {
init_score = BoostFromAverage();
#ifdef TIMETAG
auto start_time = std::chrono::steady_clock::now();
#endif
for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
init_scores[cur_tree_id] = BoostFromAverage(cur_tree_id);
}
Boosting();
gradients = gradients_.data();
hessians = hessians_.data();
#ifdef TIMETAG
boosting_time += std::chrono::steady_clock::now() - start_time;
#endif
}
#ifdef TIMETAG
auto start_time = std::chrono::steady_clock::now();
#endif
// bagging logic
Bagging(iter_);
#ifdef TIMETAG
bagging_time += std::chrono::steady_clock::now() - start_time;
#endif
bool should_continue = false;
for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
#ifdef TIMETAG
start_time = std::chrono::steady_clock::now();
#endif
const size_t bias = static_cast<size_t>(cur_tree_id) * num_data_;
std::unique_ptr<Tree> new_tree(new Tree(2));
if (class_need_train_[cur_tree_id]) {
if (class_need_train_[cur_tree_id] && train_data_->num_features() > 0) {
auto grad = gradients + bias;
auto hess = hessians + bias;
// need to copy gradients for bagging subset.
if (is_use_subset_ && bag_data_cnt_ < num_data_) {
for (int i = 0; i < bag_data_cnt_; ++i) {
......@@ -434,14 +360,9 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) {
grad = gradients_.data() + bias;
hess = hessians_.data() + bias;
}
new_tree.reset(tree_learner_->Train(grad, hess, is_constant_hessian_, forced_splits_json_));
}
#ifdef TIMETAG
tree_time += std::chrono::steady_clock::now() - start_time;
#endif
if (new_tree->num_leaves() > 1) {
should_continue = true;
tree_learner_->RenewTreeOutput(new_tree.get(), objective_function_, train_score_updater_->score() + bias,
......@@ -450,13 +371,20 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) {
new_tree->Shrinkage(shrinkage_rate_);
// update score
UpdateScore(new_tree.get(), cur_tree_id);
if (std::fabs(init_score) > kEpsilon) {
new_tree->AddBias(init_score);
if (std::fabs(init_scores[cur_tree_id]) > kEpsilon) {
new_tree->AddBias(init_scores[cur_tree_id]);
}
} else {
// only add default score one-time
if (!class_need_train_[cur_tree_id] && models_.size() < static_cast<size_t>(num_tree_per_iteration_)) {
auto output = class_default_output_[cur_tree_id];
if (models_.size() < static_cast<size_t>(num_tree_per_iteration_)) {
double output = 0.0;
if (!class_need_train_[cur_tree_id]) {
if (objective_function_ != nullptr) {
output = objective_function_->BoostFromScore(cur_tree_id);
}
} else {
output = init_scores[cur_tree_id];
}
new_tree->AsConstantTree(output);
// updates scores
train_score_updater_->AddScore(output, cur_tree_id);
......@@ -471,9 +399,11 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) {
if (!should_continue) {
Log::Warning("Stopped training because there are no more leaves that meet the split requirements");
if (models_.size() > static_cast<size_t>(num_tree_per_iteration_)) {
for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
models_.pop_back();
}
}
return true;
}
......@@ -501,17 +431,9 @@ void GBDT::RollbackOneIter() {
bool GBDT::EvalAndCheckEarlyStopping() {
bool is_met_early_stopping = false;
#ifdef TIMETAG
auto start_time = std::chrono::steady_clock::now();
#endif
// print message for metric
auto best_msg = OutputMetric(iter_);
#ifdef TIMETAG
metric_time += std::chrono::steady_clock::now() - start_time;
#endif
is_met_early_stopping = !best_msg.empty();
if (is_met_early_stopping) {
......@@ -528,52 +450,24 @@ bool GBDT::EvalAndCheckEarlyStopping() {
// Adds the newly trained tree's outputs into the cached training and
// validation scores for model slot `cur_tree_id`.
void GBDT::UpdateScore(const Tree* tree, const int cur_tree_id) {
#ifdef TIMETAG
auto start_time = std::chrono::steady_clock::now();
#endif
// update training score
if (!is_use_subset_) {
// Tree learner saw the full dataset: reuse its internal data partition
// for a fast per-leaf score update.
train_score_updater_->AddScore(tree_learner_.get(), tree, cur_tree_id);
#ifdef TIMETAG
train_score_time += std::chrono::steady_clock::now() - start_time;
#endif
#ifdef TIMETAG
start_time = std::chrono::steady_clock::now();
#endif
// we need to predict out-of-bag scores of data for boosting
if (num_data_ - bag_data_cnt_ > 0) {
train_score_updater_->AddScore(tree, bag_data_indices_.data() + bag_data_cnt_, num_data_ - bag_data_cnt_, cur_tree_id);
}
#ifdef TIMETAG
out_of_bag_score_time += std::chrono::steady_clock::now() - start_time;
#endif
} else {
// Subset bagging: the learner only saw the bag, so score every row by
// running the tree over the whole training set.
train_score_updater_->AddScore(tree, cur_tree_id);
#ifdef TIMETAG
train_score_time += std::chrono::steady_clock::now() - start_time;
#endif
}
#ifdef TIMETAG
start_time = std::chrono::steady_clock::now();
#endif
// update validation score
for (auto& score_updater : valid_score_updater_) {
score_updater->AddScore(tree, cur_tree_id);
}
#ifdef TIMETAG
valid_score_time += std::chrono::steady_clock::now() - start_time;
#endif
}
std::vector<double> GBDT::EvalOneMetric(const Metric* metric, const double* score) const {
......
......@@ -407,7 +407,7 @@ protected:
*/
std::string OutputMetric(int iter);
double BoostFromAverage();
double BoostFromAverage(int class_id);
/*! \brief current iteration */
int iter_;
......@@ -481,7 +481,6 @@ protected:
std::unique_ptr<Dataset> tmp_subset_;
bool is_use_subset_;
std::vector<bool> class_need_train_;
std::vector<double> class_default_output_;
bool is_constant_hessian_;
std::unique_ptr<ObjectiveFunction> loaded_objective_;
bool average_output_;
......
......@@ -52,7 +52,6 @@ public:
Booster(const Dataset* train_data,
const char* parameters) {
CHECK(train_data->num_features() > 0);
auto param = Config::Str2Map(parameters);
config_.Set(param);
if (config_.num_threads > 0) {
......@@ -116,7 +115,6 @@ public:
void ResetTrainingData(const Dataset* train_data) {
if (train_data != train_data_) {
CHECK(train_data->num_features() > 0);
std::lock_guard<std::mutex> lock(mutex_);
train_data_ = train_data;
CreateObjectiveAndMetrics();
......
......@@ -226,14 +226,11 @@ void Dataset::Construct(
}
}
if (used_features.empty()) {
Log::Fatal("Cannot construct Dataset since there are no useful features.\n"
"It should be at least two unique rows.\n"
"If the num_row (num_data) is small, you can set min_data=1 and min_data_in_bin=1 to fix this.\n"
"Otherwise, please make sure you are using the right dataset");
Log::Warning("There are no meaningful features, as all feature values are constant.");
}
auto features_in_group = NoGroup(used_features);
if (io_config.enable_bundle) {
if (io_config.enable_bundle && !used_features.empty()) {
features_in_group = FastFeatureBundling(bin_mappers,
sample_non_zero_indices, num_per_col, total_sample_cnt,
used_features, io_config.max_conflict_rate,
......@@ -323,14 +320,16 @@ void Dataset::Construct(
// Finalizes the bin data of every feature group after loading.
// Idempotent: subsequent calls return immediately.
void Dataset::FinishLoad() {
  if (is_finish_load_) { return; }
  if (num_groups_ > 0) {
    OMP_INIT_EX();
    // Fix: the source carried two consecutive `parallel for` pragmas
    // (diff residue), which would nest parallel regions / bind the first
    // pragma to the second instead of the loop. Exactly one is correct.
    #pragma omp parallel for schedule(guided)
    for (int i = 0; i < num_groups_; ++i) {
      OMP_LOOP_EX_BEGIN();
      feature_groups_[i]->bin_data_->FinishLoad();
      OMP_LOOP_EX_END();
    }
    OMP_THROW_EX();
  }
  is_finish_load_ = true;
}
......
......@@ -629,9 +629,6 @@ void DatasetLoader::CheckDataset(const Dataset* dataset) {
if (dataset->num_data_ <= 0) {
Log::Fatal("Data file %s is empty", dataset->data_filename_.c_str());
}
if (dataset->feature_groups_.empty()) {
Log::Fatal("No usable features in data file %s", dataset->data_filename_.c_str());
}
if (dataset->feature_names_.size() != static_cast<size_t>(dataset->num_total_features_)) {
Log::Fatal("Size of feature name error, should be %d, got %d", dataset->num_total_features_,
static_cast<int>(dataset->feature_names_.size()));
......
......@@ -121,28 +121,34 @@ public:
}
// implement custom average to boost from (if enabled among options)
double BoostFromScore() const override {
double BoostFromScore(int) const override {
double suml = 0.0f;
double sumw = 0.0f;
if (weights_ != nullptr) {
#pragma omp parallel for schedule(static) reduction(+:suml,sumw)
for (data_size_t i = 0; i < num_data_; ++i) {
suml += label_[i] * weights_[i];
suml += is_pos_(label_[i]) * weights_[i];
sumw += weights_[i];
}
} else {
sumw = static_cast<double>(num_data_);
#pragma omp parallel for schedule(static) reduction(+:suml)
for (data_size_t i = 0; i < num_data_; ++i) {
suml += label_[i];
suml += is_pos_(label_[i]);
}
}
double pavg = suml / sumw;
pavg = std::min(pavg, 1.0 - kEpsilon);
pavg = std::max<double>(pavg, kEpsilon);
double initscore = std::log(pavg / (1.0f - pavg)) / sigmoid_;
Log::Info("[%s:%s]: pavg=%f -> initscore=%f", GetName(), __func__, pavg, initscore);
return initscore;
}
// The single binary model needs training whenever any data rows exist;
// class_id is unused because the binary objective has only one model.
bool ClassNeedTrain(int /*class_id*/) const override {
return num_data_ > 0;
}
const char* GetName() const override {
return "binary";
}
......
......@@ -43,11 +43,25 @@ public:
label_ = metadata.label();
weights_ = metadata.weights();
label_int_.resize(num_data_);
class_init_probs_.resize(num_class_, 0.0);
double sum_weight = 0.0;
for (int i = 0; i < num_data_; ++i) {
label_int_[i] = static_cast<int>(label_[i]);
if (label_int_[i] < 0 || label_int_[i] >= num_class_) {
Log::Fatal("Label must be in [0, %d), but found %d in label", num_class_, label_int_[i]);
}
if (weights_ == nullptr) {
class_init_probs_[label_int_[i]] += 1.0;
} else {
class_init_probs_[label_int_[i]] += weights_[i];
sum_weight += weights_[i];
}
}
if (weights_ == nullptr) {
sum_weight = num_data_;
}
for (int i = 0; i < num_class_; ++i) {
class_init_probs_[i] /= sum_weight;
}
}
......@@ -120,6 +134,19 @@ public:
bool NeedAccuratePrediction() const override { return false; }
// Initial raw score for one class: log of its empirical prior probability,
// clamped below by kEpsilon so the logarithm stays finite.
double BoostFromScore(int class_id) const override {
  const double prob = std::max<double>(kEpsilon, class_init_probs_[class_id]);
  return std::log(prob);
}
// A class only needs boosting when its prior probability is strictly
// inside (kEpsilon, 1 - kEpsilon); degenerate all-or-nothing classes are
// handled by a constant score instead.
bool ClassNeedTrain(int class_id) const override {
  const double prob = std::fabs(class_init_probs_[class_id]);
  return prob > kEpsilon && prob < 1.0 - kEpsilon;
}
private:
/*! \brief Number of data */
data_size_t num_data_;
......@@ -131,6 +158,7 @@ private:
std::vector<int> label_int_;
/*! \brief Weights for data */
const label_t* weights_;
std::vector<double> class_init_probs_;
};
/*!
......@@ -212,6 +240,14 @@ public:
bool NeedAccuratePrediction() const override { return false; }
// Per-class initial score: forwarded to that class's one-vs-all binary
// loss (which internally uses class id 0, being a single-class objective).
double BoostFromScore(int class_id) const override {
return binary_loss_[class_id]->BoostFromScore(0);
}
// Whether this class's model needs boosting: forwarded to the underlying
// one-vs-all binary loss for the class.
bool ClassNeedTrain(int class_id) const override {
return binary_loss_[class_id]->ClassNeedTrain(0);
}
private:
/*! \brief Number of data */
data_size_t num_data_;
......
......@@ -139,7 +139,7 @@ public:
}
}
double BoostFromScore() const override {
double BoostFromScore(int) const override {
double suml = 0.0f;
double sumw = 0.0f;
if (weights_ != nullptr) {
......@@ -201,7 +201,7 @@ public:
}
}
double BoostFromScore() const override {
double BoostFromScore(int) const override {
const double alpha = 0.5;
if (weights_ != nullptr) {
#define data_reader(i) (label_[i])
......@@ -436,8 +436,8 @@ public:
return "poisson";
}
double BoostFromScore() const override {
return std::log(RegressionL2loss::BoostFromScore());
double BoostFromScore(int) const override {
return std::log(RegressionL2loss::BoostFromScore(0));
}
bool IsConstantHessian() const override {
......@@ -493,7 +493,7 @@ public:
return "quantile";
}
double BoostFromScore() const override {
double BoostFromScore(int) const override {
if (weights_ != nullptr) {
#define data_reader(i) (label_[i])
#define weight_reader(i) (weights_[i])
......@@ -600,7 +600,7 @@ public:
}
}
double BoostFromScore() const override {
double BoostFromScore(int) const override {
const double alpha = 0.5;
#define data_reader(i) (label_[i])
#define weight_reader(i) (label_weight_[i])
......
......@@ -104,7 +104,7 @@ public:
}
// implement custom average to boost from (if enabled among options)
double BoostFromScore() const override {
double BoostFromScore(int) const override {
double suml = 0.0f;
double sumw = 0.0f;
if (weights_ != nullptr) {
......@@ -121,6 +121,8 @@ public:
}
}
double pavg = suml / sumw;
pavg = std::min(pavg, 1.0 - kEpsilon);
pavg = std::max<double>(pavg, kEpsilon);
double initscore = std::log(pavg / (1.0f - pavg));
Log::Info("[%s:%s]: pavg = %f -> initscore = %f", GetName(), __func__, pavg, initscore);
return initscore;
......@@ -229,7 +231,7 @@ public:
return str_buf.str();
}
double BoostFromScore() const override {
double BoostFromScore(int) const override {
double suml = 0.0f;
double sumw = 0.0f;
if (weights_ != nullptr) {
......
......@@ -739,3 +739,55 @@ class TestEngine(unittest.TestCase):
pred = gbm.predict(X)
pred_mean = pred.mean()
self.assertGreater(pred_mean, 18)
def test_constant_features(self, y_true=None, expected_pred=None, more_params=None):
    """Train on a single constant feature and check constant predictions.

    Acts as a no-op when invoked directly by the unittest runner (all
    arguments default to None); the objective-specific tests below call it
    with concrete labels, the expected prediction, and parameter overrides.
    """
    if y_true is not None and expected_pred is not None:
        X_train = np.ones((len(y_true), 1))
        y_train = np.array(y_true)
        params = {
            'objective': 'regression',
            'num_class': 1,
            'verbose': -1,
            'min_data': 1,
            'num_leaves': 2,
            'learning_rate': 1,
            'min_data_in_bin': 1,
            'boost_from_average': True
        }
        # Guard: dict.update(None) raises TypeError, and more_params
        # defaults to None, so only merge when overrides were given.
        if more_params is not None:
            params.update(more_params)
        lgb_train = lgb.Dataset(X_train, y_train, params=params)
        gbm = lgb.train(params, lgb_train,
                        num_boost_round=2)
        pred = gbm.predict(X_train)
        self.assertTrue(np.allclose(pred, expected_pred))
def test_constant_features_regression(self):
    """Regression on a constant feature predicts the (weighted) label mean."""
    overrides = {'objective': 'regression'}
    cases = [
        ([0.0, 10.0, 0.0, 10.0], 5.0),
        ([0.0, 1.0, 2.0, 3.0], 1.5),
        ([-1.0, 1.0, -2.0, 2.0], 0.0),
    ]
    for labels, expected in cases:
        self.test_constant_features(labels, expected, overrides)
def test_constant_features_binary(self):
    """Binary objective on a constant feature predicts the positive rate."""
    overrides = {'objective': 'binary'}
    cases = [
        ([0.0, 10.0, 0.0, 10.0], 0.5),
        ([0.0, 1.0, 2.0, 3.0], 0.75),
    ]
    for labels, expected in cases:
        self.test_constant_features(labels, expected, overrides)
def test_constant_features_multiclass(self):
    """Multiclass softmax on a constant feature predicts class frequencies."""
    overrides = {
        'objective': 'multiclass',
        'num_class': 3
    }
    cases = [
        ([0.0, 1.0, 2.0, 0.0], [0.5, 0.25, 0.25]),
        ([0.0, 1.0, 2.0, 1.0], [0.25, 0.5, 0.25]),
    ]
    for labels, expected in cases:
        self.test_constant_features(labels, expected, overrides)
def test_constant_features_multiclassova(self):
    """One-vs-all multiclass on a constant feature predicts class frequencies."""
    overrides = {
        'objective': 'multiclassova',
        'num_class': 3
    }
    cases = [
        ([0.0, 1.0, 2.0, 0.0], [0.5, 0.25, 0.25]),
        ([0.0, 1.0, 2.0, 1.0], [0.25, 0.5, 0.25]),
    ]
    for labels, expected in cases:
        self.test_constant_features(labels, expected, overrides)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment