add FitByExistingTree.

8a6bd5ec · Guolin Ke · 32ef85da · 8a6bd5ec · 8a6bd5ec · 8a6bd5ec
Commit 8a6bd5ec authored Mar 25, 2017 by Guolin Ke
8 changed files
--- a/include/LightGBM/tree_learner.h
+++ b/include/LightGBM/tree_learner.h
@@ -43,6 +43,11 @@ public:
  */
  virtual Tree* Train(const score_t* gradients, const score_t* hessians) = 0;

+  /*!
+  * \brief Use an existing tree to fit the new gradients and hessians.
+  */
+  virtual Tree* FitByExistingTree(const Tree* old_tree, const score_t* gradients, const score_t* hessians) const = 0;
+
  /*!
  * \brief Set bagging data
  * \param used_indices Used data indices
@@ -55,7 +60,7 @@ public:
  * \brief Using last trained tree to predict score then adding to out_score;
  * \param out_score output score
  */
-  virtual void AddPredictionToScore(double* out_score) const = 0;
+  virtual void AddPredictionToScore(const Tree* tree, double* out_score) const = 0;

  TreeLearner() = default;
  /*! \brief Disable copy */

--- a/src/boosting/gbdt.cpp
+++ b/src/boosting/gbdt.cpp
@@ -452,7 +452,7 @@ void GBDT::UpdateScore(const Tree* tree, const int curr_class) {
 #endif
  // update training score
  if (!is_use_subset_) {
-    train_score_updater_->AddScore(tree_learner_.get(), curr_class);
+    train_score_updater_->AddScore(tree_learner_.get(), tree, curr_class);
  } else {
    train_score_updater_->AddScore(tree, curr_class);
  }

--- a/src/boosting/score_updater.hpp
+++ b/src/boosting/score_updater.hpp
@@ -70,19 +70,19 @@ public:
  /*!
  * \brief Adding prediction score, only used for training data.
  *        The training data is partitioned into tree leaves after training
-  *        Based on which We can get prediction quckily.
+  *        Based on which we can get predictions quickly.
  * \param tree_learner
  * \param curr_class Current class for multiclass training
  */
-  inline void AddScore(const TreeLearner* tree_learner, int curr_class) {
-    tree_learner->AddPredictionToScore(score_.data() + curr_class * num_data_);
+  inline void AddScore(const TreeLearner* tree_learner, const Tree* tree, int curr_class) {
+    tree_learner->AddPredictionToScore(tree, score_.data() + curr_class * num_data_);
  }
  /*!
  * \brief Using tree model to get prediction number, then adding to scores for parts of data
  *        Used for prediction of training out-of-bag data
  * \param tree Trained tree model
-  * \param data_indices Indices of data that will be proccessed
-  * \param data_cnt Number of data that will be proccessed
+  * \param data_indices Indices of data that will be processed
+  * \param data_cnt Number of data that will be processed
  * \param curr_class Current class for multiclass training
  */
  inline void AddScore(const Tree* tree, const data_size_t* data_indices,

--- a/src/treelearner/feature_histogram.hpp
+++ b/src/treelearner/feature_histogram.hpp
@@ -8,7 +8,7 @@

 #include <cstring>

-namespace LightGBM 
+namespace LightGBM
 {

 class FeatureMetainfo {
@@ -45,10 +45,10 @@ public:
    data_ = data;
    if (bin_type == BinType::NumericalBin) {
      find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdNumerical, this, std::placeholders::_1
-        , std::placeholders::_2, std::placeholders::_3, std::placeholders::_4);
+                                           , std::placeholders::_2, std::placeholders::_3, std::placeholders::_4);
    } else {
      find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdCategorical, this, std::placeholders::_1
-        , std::placeholders::_2, std::placeholders::_3, std::placeholders::_4);
+                                           , std::placeholders::_2, std::placeholders::_3, std::placeholders::_4);
    }
  }

@@ -68,12 +68,12 @@ public:
  }

  void FindBestThreshold(double sum_gradient, double sum_hessian, data_size_t num_data,
-    SplitInfo* output) {
+                         SplitInfo* output) {
    find_best_threshold_fun_(sum_gradient, sum_hessian + 2 * kEpsilon, num_data, output);
  }

  void FindBestThresholdNumerical(double sum_gradient, double sum_hessian, data_size_t num_data,
-    SplitInfo* output) {
+                                  SplitInfo* output) {
    double best_sum_left_gradient = NAN;
    double best_sum_left_hessian = NAN;
    double best_gain = kMinScore;
@@ -82,7 +82,8 @@ public:
    double sum_right_gradient = 0.0f;
    double sum_right_hessian = kEpsilon;
    data_size_t right_count = 0;
-    double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian);
+    double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian,
+                                         meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
    double min_gain_shift = gain_shift + meta_->tree_config->min_gain_to_split;
    is_splittable_ = false;
    const int bias = meta_->bias;
@@ -95,7 +96,7 @@ public:
      right_count += data_[t].cnt;
      // if data not enough, or sum hessian too small
      if (right_count < meta_->tree_config->min_data_in_leaf
-        || sum_right_hessian < meta_->tree_config->min_sum_hessian_in_leaf) continue;
+          || sum_right_hessian < meta_->tree_config->min_sum_hessian_in_leaf) continue;
      data_size_t left_count = num_data - right_count;
      // if data not enough
      if (left_count < meta_->tree_config->min_data_in_leaf) break;
@@ -106,8 +107,10 @@ public:

      double sum_left_gradient = sum_gradient - sum_right_gradient;
      // current split gain
-      double current_gain = GetLeafSplitGain(sum_left_gradient, sum_left_hessian)
-        + GetLeafSplitGain(sum_right_gradient, sum_right_hessian);
+      double current_gain = GetLeafSplitGain(sum_left_gradient, sum_left_hessian,
+                                             meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2)
+        + GetLeafSplitGain(sum_right_gradient, sum_right_hessian,
+                           meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
      // gain with split is worse than without split
      if (current_gain <= min_gain_shift) continue;

@@ -126,12 +129,14 @@ public:
    if (is_splittable_) {
      // update split information
      output->threshold = best_threshold;
-      output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian);
+      output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian,
+                                                        meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
      output->left_count = best_left_count;
      output->left_sum_gradient = best_sum_left_gradient;
      output->left_sum_hessian = best_sum_left_hessian - kEpsilon;
      output->right_output = CalculateSplittedLeafOutput(sum_gradient - best_sum_left_gradient,
-        sum_hessian - best_sum_left_hessian);
+                                                         sum_hessian - best_sum_left_hessian,
+                                                         meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
      output->right_count = num_data - best_left_count;
      output->right_sum_gradient = sum_gradient - best_sum_left_gradient;
      output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon;
@@ -142,13 +147,14 @@ public:
  }

  void FindBestThresholdCategorical(double sum_gradient, double sum_hessian, data_size_t num_data,
-    SplitInfo* output) {
+                                    SplitInfo* output) {
    double best_gain = kMinScore;
    uint32_t best_threshold = static_cast<uint32_t>(meta_->num_bin);
    data_size_t best_left_count = 0;
    double best_sum_left_gradient = 0.0f;
    double best_sum_left_hessian = 0.0f;
-    double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian);
+    double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian,
+                                         meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
    double min_gain_shift = gain_shift + meta_->tree_config->min_gain_to_split;
    is_splittable_ = false;
    const int bias = meta_->bias;
@@ -158,7 +164,7 @@ public:
    for (; t >= t_end; --t) {
      // if data not enough, or sum hessian too small
      if (data_[t].cnt < meta_->tree_config->min_data_in_leaf
-        || data_[t].sum_hessians < meta_->tree_config->min_sum_hessian_in_leaf) continue;
+          || data_[t].sum_hessians < meta_->tree_config->min_sum_hessian_in_leaf) continue;
      data_size_t other_count = num_data - data_[t].cnt;
      // if data not enough
      if (other_count < meta_->tree_config->min_data_in_leaf) continue;
@@ -169,8 +175,10 @@ public:

      double sum_other_gradient = sum_gradient - data_[t].sum_gradients;
      // current split gain
-      double current_gain = GetLeafSplitGain(sum_other_gradient, sum_other_hessian)
-        + GetLeafSplitGain(data_[t].sum_gradients, data_[t].sum_hessians + kEpsilon);
+      double current_gain = GetLeafSplitGain(sum_other_gradient, sum_other_hessian,
+                                             meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2)
+        + GetLeafSplitGain(data_[t].sum_gradients, data_[t].sum_hessians + kEpsilon,
+                           meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
      // gain with split is worse than without split
      if (current_gain <= min_gain_shift) continue;

@@ -199,12 +207,14 @@ public:
      data_size_t other_count = num_data - cnt_bin0;
      double sum_other_hessian = sum_hessian - sum_bin0_hessian - kEpsilon;
      if (cnt_bin0 >= meta_->tree_config->min_data_in_leaf
-        && sum_bin0_hessian >= meta_->tree_config->min_sum_hessian_in_leaf
-        && other_count >= meta_->tree_config->min_data_in_leaf
-        && sum_other_hessian >= meta_->tree_config->min_sum_hessian_in_leaf) {
+          && sum_bin0_hessian >= meta_->tree_config->min_sum_hessian_in_leaf
+          && other_count >= meta_->tree_config->min_data_in_leaf
+          && sum_other_hessian >= meta_->tree_config->min_sum_hessian_in_leaf) {
        double sum_other_gradient = sum_gradient - sum_bin0_gradient;
-        double current_gain = GetLeafSplitGain(sum_other_gradient, sum_other_hessian)
-          + GetLeafSplitGain(sum_bin0_gradient, sum_bin0_hessian + kEpsilon);
+        double current_gain = GetLeafSplitGain(sum_other_gradient, sum_other_hessian,
+                                               meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2)
+          + GetLeafSplitGain(sum_bin0_gradient, sum_bin0_hessian + kEpsilon,
+                             meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
        if (current_gain > min_gain_shift) {
          is_splittable_ = true;
          // better split point
@@ -221,12 +231,14 @@ public:
    if (is_splittable_) {
      // update split information
      output->threshold = best_threshold;
-      output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian);
+      output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian,
+                                                        meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
      output->left_count = best_left_count;
      output->left_sum_gradient = best_sum_left_gradient;
      output->left_sum_hessian = best_sum_left_hessian - kEpsilon;
      output->right_output = CalculateSplittedLeafOutput(sum_gradient - best_sum_left_gradient,
-        sum_hessian - best_sum_left_hessian);
+                                                         sum_hessian - best_sum_left_hessian,
+                                                         meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
      output->right_count = num_data - best_left_count;
      output->right_sum_gradient = sum_gradient - best_sum_left_gradient;
      output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon;
@@ -260,18 +272,17 @@ public:
  */
  void set_is_splittable(bool val) { is_splittable_ = val; }

-private:
  /*!
  * \brief Calculate the split gain based on regularized sum_gradients and sum_hessians
  * \param sum_gradients
  * \param sum_hessians
  * \return split gain
  */
-  double GetLeafSplitGain(double sum_gradients, double sum_hessians) const {
+  static double GetLeafSplitGain(double sum_gradients, double sum_hessians, double l1, double l2) {
    double abs_sum_gradients = std::fabs(sum_gradients);
-    double reg_abs_sum_gradients = std::max(0.0, abs_sum_gradients - meta_->tree_config->lambda_l1);
+    double reg_abs_sum_gradients = std::max(0.0, abs_sum_gradients - l1);
    return (reg_abs_sum_gradients * reg_abs_sum_gradients)
-      / (sum_hessians + meta_->tree_config->lambda_l2);
+      / (sum_hessians + l2);

  }

@@ -281,12 +292,15 @@ private:
  * \param sum_hessians
  * \return leaf output
  */
-  double CalculateSplittedLeafOutput(double sum_gradients, double sum_hessians) const {
+  static double CalculateSplittedLeafOutput(double sum_gradients, double sum_hessians, double l1, double l2) {
    double abs_sum_gradients = std::fabs(sum_gradients);
-    double reg_abs_sum_gradients = std::max(0.0, abs_sum_gradients - meta_->tree_config->lambda_l1);
+    double reg_abs_sum_gradients = std::max(0.0, abs_sum_gradients - l1);
    return -std::copysign(reg_abs_sum_gradients, sum_gradients)
-      / (sum_hessians + meta_->tree_config->lambda_l2);
+      / (sum_hessians + l2);
  }
+
+private:
+
  const FeatureMetainfo* meta_;
  /*! \brief sum of gradient of each bin */
  HistogramBinEntry* data_;
@@ -346,7 +360,7 @@ public:
  void DynamicChangeSize(const Dataset* train_data, const TreeConfig* tree_config, int cache_size, int total_size) {
    if (feature_metas_.empty()) {
      feature_metas_.resize(train_data->num_features());
-#pragma omp parallel for schedule(static)
+      #pragma omp parallel for schedule(static)
      for (int i = 0; i < train_data->num_features(); ++i) {
        feature_metas_[i].num_bin = train_data->FeatureNumBin(i);
        if (train_data->FeatureBinMapper(i)->GetDefaultBin() == 0) {
@@ -363,7 +377,7 @@ public:
    Reset(cache_size, total_size);
    pool_.resize(cache_size);
    data_.resize(cache_size);
-#pragma omp parallel for schedule(static)
+    #pragma omp parallel for schedule(static)
    for (int i = old_cache_size; i < cache_size_; ++i) {
      pool_[i].reset(new FeatureHistogram[train_data->num_features()]);
      data_[i].resize(num_total_bin);
@@ -382,7 +396,7 @@ public:
  }

  void ResetConfig(const TreeConfig* tree_config) {
-#pragma omp parallel for schedule(static)
+    #pragma omp parallel for schedule(static)
    for (int i = 0; i < static_cast<int>(feature_metas_.size()); ++i) {
      feature_metas_[i].tree_config = tree_config;
    }

--- a/src/treelearner/parallel_tree_learner.h
+++ b/src/treelearner/parallel_tree_learner.h
@@ -103,7 +103,7 @@ public:
  void ResetConfig(const TreeConfig* tree_config) override;
 protected:
  void BeforeTrain() override;
-  bool BeforeFindBestSplit(int left_leaf, int right_leaf) override;
+  bool BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) override;
  void FindBestThresholds() override;
  void FindBestSplitsForLeaves() override;
  void Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) override;

--- a/src/treelearner/serial_tree_learner.cpp
+++ b/src/treelearner/serial_tree_learner.cpp
@@ -179,8 +179,6 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians
 #endif

  auto tree = std::unique_ptr<Tree>(new Tree(tree_config_->num_leaves));
-  // save pointer to last trained tree
-  last_trained_tree_ = tree.get();
  // root leaf
  int left_leaf = 0;
  int cur_depth = 1;
@@ -191,7 +189,7 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians
    start_time = std::chrono::steady_clock::now();
  #endif
    // some initial works before finding best split
-    if (BeforeFindBestSplit(left_leaf, right_leaf)) {
+    if (BeforeFindBestSplit(tree.get(), left_leaf, right_leaf)) {
    #ifdef TIMETAG
      init_split_time += std::chrono::steady_clock::now() - start_time;
    #endif
@@ -223,6 +221,29 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians
  return tree.release();
 }

+Tree* SerialTreeLearner::FitByExistingTree(const Tree* old_tree, const score_t* gradients, const score_t *hessians) const {
+  auto tree = std::unique_ptr<Tree>(new Tree(*old_tree));  // work on a copy; old_tree itself is never modified
+  CHECK(data_partition_->num_leaves() >= tree->num_leaves());
+  #pragma omp parallel for schedule(static)
+  for (int i = 0; i < tree->num_leaves(); ++i) {  // bound by the tree: the partition may report more leaves than the tree has
+    data_size_t cnt_leaf_data = 0;
+    auto tmp_idx = data_partition_->GetIndexOnLeaf(i, &cnt_leaf_data);
+    double sum_grad = 0.0f;
+    double sum_hess = 0.0f;
+    for (data_size_t j = 0; j < cnt_leaf_data; ++j) {
+      auto idx = tmp_idx[j];
+      sum_grad += gradients[idx];
+      sum_hess += hessians[idx];
+    }
+    // a non-positive hessian sum is replaced by kEpsilon before computing the leaf output
+    if (sum_hess <= 0) sum_hess = kEpsilon;
+    double output = FeatureHistogram::CalculateSplittedLeafOutput(sum_grad, sum_hess,
+                                                                  tree_config_->lambda_l1, tree_config_->lambda_l2);
+    tree->SetLeafOutput(i, output);  // each iteration writes only its own leaf i
+  }
+  return tree.release();  // ownership of the refitted copy passes to the caller
+}
+
 void SerialTreeLearner::BeforeTrain() {

  // reset histogram pool
@@ -305,11 +326,11 @@ void SerialTreeLearner::BeforeTrain() {
  }
 }

-bool SerialTreeLearner::BeforeFindBestSplit(int left_leaf, int right_leaf) {
+bool SerialTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) {
  // check depth of current leaf
  if (tree_config_->max_depth > 0) {
    // only need to check left leaf, since right leaf is in same level of left leaf
-    if (last_trained_tree_->leaf_depth(left_leaf) >= tree_config_->max_depth) {
+    if (tree->leaf_depth(left_leaf) >= tree_config_->max_depth) {
      best_split_per_leaf_[left_leaf].gain = kMinScore;
      if (right_leaf >= 0) {
        best_split_per_leaf_[right_leaf].gain = kMinScore;

--- a/src/treelearner/serial_tree_learner.h
+++ b/src/treelearner/serial_tree_learner.h
@@ -38,15 +38,18 @@ public:

  Tree* Train(const score_t* gradients, const score_t *hessians) override;

+  Tree* FitByExistingTree(const Tree* old_tree, const score_t* gradients, const score_t* hessians) const override;
+
  void SetBaggingData(const data_size_t* used_indices, data_size_t num_data) override {
    data_partition_->SetUsedDataIndices(used_indices, num_data);
  }

-  void AddPredictionToScore(double* out_score) const override {
-    if (last_trained_tree_->num_leaves() <= 1) { return; }
+  void AddPredictionToScore(const Tree* tree, double* out_score) const override {
+    if (tree->num_leaves() <= 1) { return; }
+    CHECK(tree->num_leaves() <= data_partition_->num_leaves());
    #pragma omp parallel for schedule(static)
    for (int i = 0; i < data_partition_->num_leaves(); ++i) {
-      double output = static_cast<double>(last_trained_tree_->LeafOutput(i));
+      double output = static_cast<double>(tree->LeafOutput(i));
      data_size_t cnt_leaf_data = 0;
      auto tmp_idx = data_partition_->GetIndexOnLeaf(i, &cnt_leaf_data);
      for (data_size_t j = 0; j < cnt_leaf_data; ++j) {
@@ -64,7 +67,7 @@ protected:
  /*!
  * \brief Some initial works before FindBestSplit
  */
-  virtual bool BeforeFindBestSplit(int left_leaf, int right_leaf);
+  virtual bool BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf);


  /*!
@@ -95,9 +98,6 @@ protected:
  * \return The number of data in the leaf_idx leaf
  */
  inline virtual data_size_t GetGlobalDataCountInLeaf(int leaf_idx) const;
-
-  /*! \brief Last trained decision tree */
-  const Tree* last_trained_tree_;
  /*! \brief number of data */
  data_size_t num_data_;
  /*! \brief number of features */

--- a/src/treelearner/voting_parallel_tree_learner.cpp
+++ b/src/treelearner/voting_parallel_tree_learner.cpp
@@ -133,8 +133,8 @@ void VotingParallelTreeLearner::BeforeTrain() {
  global_data_count_in_leaf_[0] = std::get<0>(data);
 }

-bool VotingParallelTreeLearner::BeforeFindBestSplit(int left_leaf, int right_leaf) {
-  if (SerialTreeLearner::BeforeFindBestSplit(left_leaf, right_leaf)) {
+bool VotingParallelTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) {
+  if (SerialTreeLearner::BeforeFindBestSplit(tree, left_leaf, right_leaf)) {
    data_size_t num_data_in_left_child = GetGlobalDataCountInLeaf(left_leaf);
    data_size_t num_data_in_right_child = GetGlobalDataCountInLeaf(right_leaf);
    if (right_leaf < 0) {