Unverified Commit e50a9151 authored by Belinda Trotta's avatar Belinda Trotta Committed by GitHub
Browse files

Path smoothing (#2950)

* Path smoothing

* Try to fix issue with gpu version.

* Fix failing CI for R package.

* Minor fixes.

* Minor refactor.

* Restore old code to get CI working.

* Fix style issues.

* Fix ci for R package.

* Minor fixes for docs and code style.

* Update docs.
parent 6823af94
...@@ -98,7 +98,7 @@ if grep -q -R "WARNING" "$LOG_FILE_NAME"; then ...@@ -98,7 +98,7 @@ if grep -q -R "WARNING" "$LOG_FILE_NAME"; then
exit -1 exit -1
fi fi
ALLOWED_CHECK_NOTES=2 ALLOWED_CHECK_NOTES=3
NUM_CHECK_NOTES=$( NUM_CHECK_NOTES=$(
cat ${LOG_FILE_NAME} \ cat ${LOG_FILE_NAME} \
| grep -e '^Status: .* NOTE.*' \ | grep -e '^Status: .* NOTE.*' \
......
...@@ -81,4 +81,6 @@ Deal with Over-fitting ...@@ -81,4 +81,6 @@ Deal with Over-fitting
- Try ``extra_trees`` - Try ``extra_trees``
- Try increasing ``path_smooth``
.. _Optuna: https://medium.com/optuna/lightgbm-tuner-new-optuna-integration-for-hyperparameter-optimization-8b7095e99258 .. _Optuna: https://medium.com/optuna/lightgbm-tuner-new-optuna-integration-for-hyperparameter-optimization-8b7095e99258
...@@ -522,6 +522,22 @@ Learning Control Parameters ...@@ -522,6 +522,22 @@ Learning Control Parameters
- applied once per forest - applied once per forest
- ``path_smooth`` :raw-html:`<a id="path_smooth" title="Permalink to this parameter" href="#path_smooth">&#x1F517;&#xFE0E;</a>`, default = ``0``, type = double, constraints: ``path_smooth >= 0.0``
- controls smoothing applied to tree nodes
- helps prevent overfitting on leaves with few samples
- if set to zero, no smoothing is applied
- if ``path_smooth > 0`` then ``min_data_in_leaf`` must be at least ``2``
- larger values give stronger regularisation
- the weight of each node is ``(n / path_smooth) * w + w_p / (n / path_smooth + 1)``, where ``n`` is the number of samples in the node, ``w`` is the optimal node weight to minimise the loss (approximately ``-sum_gradients / sum_hessians``), and ``w_p`` is the weight of the parent node
- note that the parent output ``w_p`` itself has smoothing applied, unless it is the root node, so that the smoothing effect accumulates with the tree depth
- ``verbosity`` :raw-html:`<a id="verbosity" title="Permalink to this parameter" href="#verbosity">&#x1F517;&#xFE0E;</a>`, default = ``1``, type = int, aliases: ``verbose`` - ``verbosity`` :raw-html:`<a id="verbosity" title="Permalink to this parameter" href="#verbosity">&#x1F517;&#xFE0E;</a>`, default = ``1``, type = int, aliases: ``verbose``
- controls the level of LightGBM's verbosity - controls the level of LightGBM's verbosity
......
...@@ -495,6 +495,16 @@ struct Config { ...@@ -495,6 +495,16 @@ struct Config {
// desc = applied once per forest // desc = applied once per forest
std::vector<double> cegb_penalty_feature_coupled; std::vector<double> cegb_penalty_feature_coupled;
// check = >= 0.0
// desc = controls smoothing applied to tree nodes
// desc = helps prevent overfitting on leaves with few samples
// desc = if set to zero, no smoothing is applied
// desc = if ``path_smooth > 0`` then ``min_data_in_leaf`` must be at least ``2``
// desc = larger values give stronger regularisation
// descl2 = the weight of each node is ``(n / path_smooth) * w + w_p / (n / path_smooth + 1)``, where ``n`` is the number of samples in the node, ``w`` is the optimal node weight to minimise the loss (approximately ``-sum_gradients / sum_hessians``), and ``w_p`` is the weight of the parent node
// descl2 = note that the parent output ``w_p`` itself has smoothing applied, unless it is the root node, so that the smoothing effect accumulates with the tree depth
double path_smooth = 0;
// alias = verbose // alias = verbose
// desc = controls the level of LightGBM's verbosity // desc = controls the level of LightGBM's verbosity
// desc = ``< 0``: Fatal, ``= 0``: Error (Warning), ``= 1``: Info, ``> 1``: Debug // desc = ``< 0``: Fatal, ``= 0``: Error (Warning), ``= 1``: Info, ``> 1``: Debug
......
...@@ -142,6 +142,9 @@ class Tree { ...@@ -142,6 +142,9 @@ class Tree {
/*! \brief Get depth of specific leaf*/ /*! \brief Get depth of specific leaf*/
inline int leaf_depth(int leaf_idx) const { return leaf_depth_[leaf_idx]; } inline int leaf_depth(int leaf_idx) const { return leaf_depth_[leaf_idx]; }
/*! \brief Get parent node index of specific leaf*/
inline int leaf_parent(int leaf_idx) const { return leaf_parent_[leaf_idx]; }
/*! \brief Get feature of specific split*/ /*! \brief Get feature of specific split*/
inline int split_feature(int split_idx) const { return split_feature_[split_idx]; } inline int split_feature(int split_idx) const { return split_feature_[split_idx]; }
...@@ -163,8 +166,6 @@ class Tree { ...@@ -163,8 +166,6 @@ class Tree {
return split_feature_inner_[node_idx]; return split_feature_inner_[node_idx];
} }
inline int leaf_parent(int leaf_idx) const { return leaf_parent_[leaf_idx]; }
inline uint32_t threshold_in_bin(int node_idx) const { inline uint32_t threshold_in_bin(int node_idx) const {
return threshold_in_bin_[node_idx]; return threshold_in_bin_[node_idx];
} }
......
...@@ -314,6 +314,14 @@ void Config::CheckParamConflict() { ...@@ -314,6 +314,14 @@ void Config::CheckParamConflict() {
force_col_wise = true; force_col_wise = true;
force_row_wise = false; force_row_wise = false;
} }
// min_data_in_leaf must be at least 2 if path smoothing is active. This is because, when a split is
// evaluated, the leaf count is estimated from the proportion of the hessian in the leaf, rounded up to the
// nearest int, so it can be 1 even when the leaf actually contains no data. In rare cases this causes a bug:
// with path smoothing, the calculated split gain can be positive even with zero gradient and hessian.
if (path_smooth > kEpsilon && min_data_in_leaf < 2) {
min_data_in_leaf = 2;
Log::Warning("min_data_in_leaf has been increased to 2 because this is required when path smoothing is active.");
}
if (is_parallel && monotone_constraints_method == std::string("intermediate")) { if (is_parallel && monotone_constraints_method == std::string("intermediate")) {
// In distributed mode, local node doesn't have histograms on all features, cannot perform "intermediate" monotone constraints. // In distributed mode, local node doesn't have histograms on all features, cannot perform "intermediate" monotone constraints.
Log::Warning("Cannot use \"intermediate\" monotone constraints in parallel learning, auto set to \"basic\" method."); Log::Warning("Cannot use \"intermediate\" monotone constraints in parallel learning, auto set to \"basic\" method.");
......
...@@ -229,6 +229,7 @@ const std::unordered_set<std::string>& Config::parameter_set() { ...@@ -229,6 +229,7 @@ const std::unordered_set<std::string>& Config::parameter_set() {
"cegb_penalty_split", "cegb_penalty_split",
"cegb_penalty_feature_lazy", "cegb_penalty_feature_lazy",
"cegb_penalty_feature_coupled", "cegb_penalty_feature_coupled",
"path_smooth",
"verbosity", "verbosity",
"input_model", "input_model",
"output_model", "output_model",
...@@ -450,6 +451,9 @@ void Config::GetMembersFromString(const std::unordered_map<std::string, std::str ...@@ -450,6 +451,9 @@ void Config::GetMembersFromString(const std::unordered_map<std::string, std::str
cegb_penalty_feature_coupled = Common::StringToArray<double>(tmp_str, ','); cegb_penalty_feature_coupled = Common::StringToArray<double>(tmp_str, ',');
} }
GetDouble(params, "path_smooth", &path_smooth);
CHECK_GE(path_smooth, 0.0);
GetInt(params, "verbosity", &verbosity); GetInt(params, "verbosity", &verbosity);
GetString(params, "input_model", &input_model); GetString(params, "input_model", &input_model);
...@@ -654,6 +658,7 @@ std::string Config::SaveMembersToString() const { ...@@ -654,6 +658,7 @@ std::string Config::SaveMembersToString() const {
str_buf << "[cegb_penalty_split: " << cegb_penalty_split << "]\n"; str_buf << "[cegb_penalty_split: " << cegb_penalty_split << "]\n";
str_buf << "[cegb_penalty_feature_lazy: " << Common::Join(cegb_penalty_feature_lazy, ",") << "]\n"; str_buf << "[cegb_penalty_feature_lazy: " << Common::Join(cegb_penalty_feature_lazy, ",") << "]\n";
str_buf << "[cegb_penalty_feature_coupled: " << Common::Join(cegb_penalty_feature_coupled, ",") << "]\n"; str_buf << "[cegb_penalty_feature_coupled: " << Common::Join(cegb_penalty_feature_coupled, ",") << "]\n";
str_buf << "[path_smooth: " << path_smooth << "]\n";
str_buf << "[verbosity: " << verbosity << "]\n"; str_buf << "[verbosity: " << verbosity << "]\n";
str_buf << "[max_bin: " << max_bin << "]\n"; str_buf << "[max_bin: " << max_bin << "]\n";
str_buf << "[max_bin_by_feature: " << Common::Join(max_bin_by_feature, ",") << "]\n"; str_buf << "[max_bin_by_feature: " << Common::Join(max_bin_by_feature, ",") << "]\n";
......
This diff is collapsed.
...@@ -1090,8 +1090,8 @@ void GPUTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* right ...@@ -1090,8 +1090,8 @@ void GPUTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* right
Log::Fatal("Bug in GPU histogram! split %d: %d, smaller_leaf: %d, larger_leaf: %d\n", best_split_info.left_count, best_split_info.right_count, smaller_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->num_data_in_leaf()); Log::Fatal("Bug in GPU histogram! split %d: %d, smaller_leaf: %d, larger_leaf: %d\n", best_split_info.left_count, best_split_info.right_count, smaller_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->num_data_in_leaf());
} }
} else { } else {
smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), best_split_info.right_sum_gradient, best_split_info.right_sum_hessian); smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), best_split_info.right_sum_gradient, best_split_info.right_sum_hessian, best_split_info.right_output);
larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), best_split_info.left_sum_gradient, best_split_info.left_sum_hessian); larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), best_split_info.left_sum_gradient, best_split_info.left_sum_hessian, best_split_info.left_output);
if ((best_split_info.left_count != larger_leaf_splits_->num_data_in_leaf()) || if ((best_split_info.left_count != larger_leaf_splits_->num_data_in_leaf()) ||
(best_split_info.right_count!= smaller_leaf_splits_->num_data_in_leaf())) { (best_split_info.right_count!= smaller_leaf_splits_->num_data_in_leaf())) {
Log::Fatal("Bug in GPU histogram! split %d: %d, smaller_leaf: %d, larger_leaf: %d\n", best_split_info.left_count, best_split_info.right_count, smaller_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->num_data_in_leaf()); Log::Fatal("Bug in GPU histogram! split %d: %d, smaller_leaf: %d, larger_leaf: %d\n", best_split_info.left_count, best_split_info.right_count, smaller_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->num_data_in_leaf());
......
...@@ -21,7 +21,7 @@ class LeafSplits { ...@@ -21,7 +21,7 @@ class LeafSplits {
public: public:
explicit LeafSplits(data_size_t num_data) explicit LeafSplits(data_size_t num_data)
:num_data_in_leaf_(num_data), num_data_(num_data), :num_data_in_leaf_(num_data), num_data_(num_data),
data_indices_(nullptr) { data_indices_(nullptr), weight_(0) {
} }
void ResetNumData(data_size_t num_data) { void ResetNumData(data_size_t num_data) {
num_data_ = num_data; num_data_ = num_data;
...@@ -37,11 +37,13 @@ class LeafSplits { ...@@ -37,11 +37,13 @@ class LeafSplits {
* \param sum_gradients * \param sum_gradients
* \param sum_hessians * \param sum_hessians
*/ */
void Init(int leaf, const DataPartition* data_partition, double sum_gradients, double sum_hessians) { void Init(int leaf, const DataPartition* data_partition, double sum_gradients,
double sum_hessians, double weight) {
leaf_index_ = leaf; leaf_index_ = leaf;
data_indices_ = data_partition->GetIndexOnLeaf(leaf, &num_data_in_leaf_); data_indices_ = data_partition->GetIndexOnLeaf(leaf, &num_data_in_leaf_);
sum_gradients_ = sum_gradients; sum_gradients_ = sum_gradients;
sum_hessians_ = sum_hessians; sum_hessians_ = sum_hessians;
weight_ = weight;
} }
/*! /*!
...@@ -135,6 +137,10 @@ class LeafSplits { ...@@ -135,6 +137,10 @@ class LeafSplits {
/*! \brief Get indices of data of current leaf */ /*! \brief Get indices of data of current leaf */
const data_size_t* data_indices() const { return data_indices_; } const data_size_t* data_indices() const { return data_indices_; }
/*! \brief Get weight of current leaf */
double weight() const { return weight_; }
private: private:
/*! \brief current leaf index */ /*! \brief current leaf index */
...@@ -149,6 +155,8 @@ class LeafSplits { ...@@ -149,6 +155,8 @@ class LeafSplits {
double sum_hessians_; double sum_hessians_;
/*! \brief indices of data of current leaf */ /*! \brief indices of data of current leaf */
const data_size_t* data_indices_; const data_size_t* data_indices_;
/*! \brief weight of current leaf */
double weight_;
}; };
} // namespace LightGBM } // namespace LightGBM
......
...@@ -214,9 +214,16 @@ Tree* SerialTreeLearner::FitByExistingTree(const Tree* old_tree, const score_t* ...@@ -214,9 +214,16 @@ Tree* SerialTreeLearner::FitByExistingTree(const Tree* old_tree, const score_t*
sum_grad += gradients[idx]; sum_grad += gradients[idx];
sum_hess += hessians[idx]; sum_hess += hessians[idx];
} }
double output = FeatureHistogram::CalculateSplittedLeafOutput<true, true>( double output;
if ((config_->path_smooth > kEpsilon) & (i > 0)) {
output = FeatureHistogram::CalculateSplittedLeafOutput<true, true, true>(
sum_grad, sum_hess, config_->lambda_l1, config_->lambda_l2, sum_grad, sum_hess, config_->lambda_l1, config_->lambda_l2,
config_->max_delta_step); config_->max_delta_step, config_->path_smooth, cnt_leaf_data, tree->leaf_parent(i));
} else {
output = FeatureHistogram::CalculateSplittedLeafOutput<true, true, false>(
sum_grad, sum_hess, config_->lambda_l1, config_->lambda_l2,
config_->max_delta_step, config_->path_smooth, cnt_leaf_data, 0);
}
auto old_leaf_output = tree->LeafOutput(i); auto old_leaf_output = tree->LeafOutput(i);
auto new_leaf_output = output * tree->shrinkage(); auto new_leaf_output = output * tree->shrinkage();
tree->SetLeafOutput(i, config_->refit_decay_rate * old_leaf_output + (1.0 - config_->refit_decay_rate) * new_leaf_output); tree->SetLeafOutput(i, config_->refit_decay_rate * old_leaf_output + (1.0 - config_->refit_decay_rate) * new_leaf_output);
...@@ -449,6 +456,7 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, int* left_leaf, ...@@ -449,6 +456,7 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, int* left_leaf,
left_leaf_splits->sum_hessians(), left_leaf_splits->sum_hessians(),
left_threshold, left_threshold,
left_leaf_splits->num_data_in_leaf(), left_leaf_splits->num_data_in_leaf(),
left_leaf_splits->weight(),
&left_split); &left_split);
left_split.feature = left_feature; left_split.feature = left_feature;
forceSplitMap[*left_leaf] = left_split; forceSplitMap[*left_leaf] = left_split;
...@@ -470,6 +478,7 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, int* left_leaf, ...@@ -470,6 +478,7 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, int* left_leaf,
right_leaf_splits->sum_hessians(), right_leaf_splits->sum_hessians(),
right_threshold, right_threshold,
right_leaf_splits->num_data_in_leaf(), right_leaf_splits->num_data_in_leaf(),
right_leaf_splits->weight(),
&right_split); &right_split);
right_split.feature = right_feature; right_split.feature = right_feature;
forceSplitMap[*right_leaf] = right_split; forceSplitMap[*right_leaf] = right_split;
...@@ -613,18 +622,22 @@ void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf, ...@@ -613,18 +622,22 @@ void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf,
CHECK_GT(best_split_info.left_count, 0); CHECK_GT(best_split_info.left_count, 0);
smaller_leaf_splits_->Init(*left_leaf, data_partition_.get(), smaller_leaf_splits_->Init(*left_leaf, data_partition_.get(),
best_split_info.left_sum_gradient, best_split_info.left_sum_gradient,
best_split_info.left_sum_hessian); best_split_info.left_sum_hessian,
best_split_info.left_output);
larger_leaf_splits_->Init(*right_leaf, data_partition_.get(), larger_leaf_splits_->Init(*right_leaf, data_partition_.get(),
best_split_info.right_sum_gradient, best_split_info.right_sum_gradient,
best_split_info.right_sum_hessian); best_split_info.right_sum_hessian,
best_split_info.right_output);
} else { } else {
CHECK_GT(best_split_info.right_count, 0); CHECK_GT(best_split_info.right_count, 0);
smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(),
best_split_info.right_sum_gradient, best_split_info.right_sum_gradient,
best_split_info.right_sum_hessian); best_split_info.right_sum_hessian,
best_split_info.right_output);
larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), larger_leaf_splits_->Init(*left_leaf, data_partition_.get(),
best_split_info.left_sum_gradient, best_split_info.left_sum_gradient,
best_split_info.left_sum_hessian); best_split_info.left_sum_hessian,
best_split_info.left_output);
} }
auto leaves_need_update = constraints_->Update( auto leaves_need_update = constraints_->Update(
tree, is_numerical_split, *left_leaf, *right_leaf, tree, is_numerical_split, *left_leaf, *right_leaf,
...@@ -685,9 +698,19 @@ void SerialTreeLearner::ComputeBestSplitForFeature( ...@@ -685,9 +698,19 @@ void SerialTreeLearner::ComputeBestSplitForFeature(
return; return;
} }
SplitInfo new_split; SplitInfo new_split;
double parent_output;
if (leaf_splits->leaf_index() == 0) {
// for root leaf the "parent" output is its own output because we don't apply any smoothing to the root
parent_output = FeatureHistogram::CalculateSplittedLeafOutput<true, true, true, false>(
leaf_splits->sum_gradients(), leaf_splits->sum_hessians(), config_->lambda_l1,
config_->lambda_l2, config_->max_delta_step, constraints_->Get(leaf_splits->leaf_index()),
config_->path_smooth, static_cast<data_size_t>(num_data), 0);
} else {
parent_output = leaf_splits->weight();
}
histogram_array_[feature_index].FindBestThreshold( histogram_array_[feature_index].FindBestThreshold(
leaf_splits->sum_gradients(), leaf_splits->sum_hessians(), num_data, leaf_splits->sum_gradients(), leaf_splits->sum_hessians(), num_data,
constraints_->Get(leaf_splits->leaf_index()), &new_split); constraints_->Get(leaf_splits->leaf_index()), parent_output, &new_split);
new_split.feature = real_fidx; new_split.feature = real_fidx;
if (cegb_ != nullptr) { if (cegb_ != nullptr) {
new_split.gain -= new_split.gain -=
......
...@@ -436,17 +436,21 @@ void VotingParallelTreeLearner<TREELEARNER_T>::Split(Tree* tree, int best_Leaf, ...@@ -436,17 +436,21 @@ void VotingParallelTreeLearner<TREELEARNER_T>::Split(Tree* tree, int best_Leaf,
if (best_split_info.left_count < best_split_info.right_count) { if (best_split_info.left_count < best_split_info.right_count) {
smaller_leaf_splits_global_->Init(*left_leaf, this->data_partition_.get(), smaller_leaf_splits_global_->Init(*left_leaf, this->data_partition_.get(),
best_split_info.left_sum_gradient, best_split_info.left_sum_gradient,
best_split_info.left_sum_hessian); best_split_info.left_sum_hessian,
best_split_info.left_output);
larger_leaf_splits_global_->Init(*right_leaf, this->data_partition_.get(), larger_leaf_splits_global_->Init(*right_leaf, this->data_partition_.get(),
best_split_info.right_sum_gradient, best_split_info.right_sum_gradient,
best_split_info.right_sum_hessian); best_split_info.right_sum_hessian,
best_split_info.right_output);
} else { } else {
smaller_leaf_splits_global_->Init(*right_leaf, this->data_partition_.get(), smaller_leaf_splits_global_->Init(*right_leaf, this->data_partition_.get(),
best_split_info.right_sum_gradient, best_split_info.right_sum_gradient,
best_split_info.right_sum_hessian); best_split_info.right_sum_hessian,
best_split_info.right_output);
larger_leaf_splits_global_->Init(*left_leaf, this->data_partition_.get(), larger_leaf_splits_global_->Init(*left_leaf, this->data_partition_.get(),
best_split_info.left_sum_gradient, best_split_info.left_sum_gradient,
best_split_info.left_sum_hessian); best_split_info.left_sum_hessian,
best_split_info.left_output);
} }
} }
......
...@@ -2118,6 +2118,23 @@ class TestEngine(unittest.TestCase): ...@@ -2118,6 +2118,23 @@ class TestEngine(unittest.TestCase):
err_new = mean_squared_error(y, predicted_new) err_new = mean_squared_error(y, predicted_new)
self.assertLess(err, err_new) self.assertLess(err, err_new)
def test_path_smoothing(self):
    """Check that enabling path smoothing acts as a regularizer.

    A smoothed model should fit the training data less closely than an
    otherwise-identical unsmoothed model, so its training MSE is larger.
    """
    X, y = load_boston(return_X_y=True)
    train_set = lgb.Dataset(X, label=y)
    base_params = {'objective': 'regression',
                   'num_leaves': 32,
                   'verbose': -1,
                   'seed': 0}
    # Baseline booster with no smoothing applied.
    booster = lgb.train(base_params, train_set, num_boost_round=10)
    err_base = mean_squared_error(y, booster.predict(X))
    # Identical setup with path smoothing turned on.
    smooth_params = dict(base_params, path_smooth=1)
    booster = lgb.train(smooth_params, train_set, num_boost_round=10)
    err_smooth = mean_squared_error(y, booster.predict(X))
    # Stronger regularization must increase error on the training set.
    self.assertLess(err_base, err_smooth)
@unittest.skipIf(not lgb.compat.PANDAS_INSTALLED, 'pandas is not installed') @unittest.skipIf(not lgb.compat.PANDAS_INSTALLED, 'pandas is not installed')
def test_trees_to_dataframe(self): def test_trees_to_dataframe(self):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment