"...git@developer.sourcefind.cn:tianlh/lightgbm-dcu.git" did not exist on "a1fdeb1f6d2aeb5d069ee6ace6d9679c4a899ac3"
Commit 6a7470a2 authored by Guolin Ke

Add Random Forest Mode (#678)

* add draft of RF.

* fix score bugs.

* fix scores.

* fix tests.

* update document

* fix GetPredictAt
parent c05cfa89
@@ -39,8 +39,9 @@ The parameter format is `key1=value1 key2=value2 ... `. And parameters can be s
 * `binary`, binary classification application
 * `lambdarank`, [lambdarank](https://pdfs.semanticscholar.org/fc9a/e09f9ced555558fdf1e997c0a5411fb51f15.pdf) application
 * `multiclass`, multi-class classification application, should set `num_class` as well
-* `boosting`, default=`gbdt`, type=enum, options=`gbdt`,`dart`, alias=`boost`,`boosting_type`
+* `boosting`, default=`gbdt`, type=enum, options=`gbdt`,`rf`,`dart`,`goss`, alias=`boost`,`boosting_type`
 * `gbdt`, traditional Gradient Boosting Decision Tree
+* `rf`, Random Forest
 * `dart`, [Dropouts meet Multiple Additive Regression Trees](https://arxiv.org/abs/1505.01866)
 * `goss`, Gradient-based One-Side Sampling
 * `data`, default=`""`, type=string, alias=`train`,`train_data`
......
@@ -51,9 +51,11 @@ Some important parameters: 
 * ```binary```, binary classification application
 * ```lambdarank```, lambdarank application
 * ```multiclass```, multi-class classification application, should set ```num_class``` as well
-* ```boosting```, default=```gbdt```, type=enum, options=```gbdt```,```dart```, alias=```boost```,```boosting_type```
-* ```gbdt```, traditional Gradient Boosting Decision Tree
-* ```dart```, [Dropouts meet Multiple Additive Regression Trees](https://arxiv.org/abs/1505.01866)
+* `boosting`, default=`gbdt`, type=enum, options=`gbdt`,`rf`,`dart`,`goss`, alias=`boost`,`boosting_type`
+* `gbdt`, traditional Gradient Boosting Decision Tree
+* `rf`, Random Forest
+* `dart`, [Dropouts meet Multiple Additive Regression Trees](https://arxiv.org/abs/1505.01866)
+* `goss`, Gradient-based One-Side Sampling
 * ```data```, default=```""```, type=string, alias=```train```,```train_data```
 * training data, LightGBM will train from this data
 * ```valid```, default=```""```, type=multi-string, alias=```test```,```valid_data```,```test_data```
......
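The new mode piggybacks on the existing bagging machinery, and `RF::Init` in `rf.hpp` further down rejects runs that do not subsample both rows and features. As an illustration only (the data file name and values are placeholder choices, not shipped defaults), a minimal parameter set in the `key1=value1` format described above might look like:

```
boosting = rf
objective = binary
bagging_freq = 1
bagging_fraction = 0.5
feature_fraction = 0.5
data = train.txt
```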
@@ -2,6 +2,7 @@
 #include "gbdt.h"
 #include "dart.hpp"
 #include "goss.hpp"
+#include "rf.hpp"
 
 namespace LightGBM {
@@ -34,6 +35,8 @@ Boosting* Boosting::CreateBoosting(const std::string& type, const char* filename
     return new DART();
   } else if (type == std::string("goss")) {
     return new GOSS();
+  } else if (type == std::string("rf")) {
+    return new RF();
   } else {
     return nullptr;
   }
@@ -47,6 +50,8 @@ Boosting* Boosting::CreateBoosting(const std::string& type, const char* filename
     ret.reset(new DART());
   } else if (type == std::string("goss")) {
     ret.reset(new GOSS());
+  } else if (type == std::string("rf")) {
+    ret.reset(new RF());
   } else {
     Log::Fatal("unknown boosting type %s", type.c_str());
   }
......
@@ -39,6 +39,12 @@ public:
     sum_weight_ = 0.0f;
   }
 
+  void ResetConfig(const BoostingConfig* config) override {
+    GBDT::ResetConfig(config);
+    random_for_drop_ = Random(gbdt_config_->drop_seed);
+    sum_weight_ = 0.0f;
+  }
+
   /*!
   * \brief one training iteration
   */
......
@@ -44,9 +44,10 @@ GBDT::GBDT()
   boost_from_average_(false) {
   #pragma omp parallel
   #pragma omp master
   {
     num_threads_ = omp_get_num_threads();
   }
+  average_output_ = false;
 }
 
 GBDT::~GBDT() {
@@ -164,10 +165,9 @@ void GBDT::ResetTrainingData(const Dataset* train_data, const ObjectiveFunction*
   }
   objective_function_ = objective_function;
-  num_tree_per_iteration_ = num_class_;
   if (objective_function_ != nullptr) {
     is_constant_hessian_ = objective_function_->IsConstantHessian();
-    num_tree_per_iteration_ = objective_function_->NumTreePerIteration();
+    CHECK(num_tree_per_iteration_ == objective_function_->NumTreePerIteration());
   } else {
     is_constant_hessian_ = false;
   }
@@ -608,6 +608,10 @@ void GBDT::UpdateScore(const Tree* tree, const int cur_tree_id) {
 #endif
 }
 
+std::vector<double> GBDT::EvalOneMetric(const Metric* metric, const double* score) const {
+  return metric->Eval(score, objective_function_);
+}
+
 std::string GBDT::OutputMetric(int iter) {
   bool need_output = (iter % gbdt_config_->output_freq) == 0;
   std::string ret = "";
@@ -617,7 +621,7 @@ std::string GBDT::OutputMetric(int iter) {
   if (need_output) {
     for (auto& sub_metric : training_metrics_) {
       auto name = sub_metric->GetName();
-      auto scores = sub_metric->Eval(train_score_updater_->score(), objective_function_);
+      auto scores = EvalOneMetric(sub_metric, train_score_updater_->score());
       for (size_t k = 0; k < name.size(); ++k) {
         std::stringstream tmp_buf;
         tmp_buf << "Iteration:" << iter
@@ -634,8 +638,7 @@ std::string GBDT::OutputMetric(int iter) {
   if (need_output || early_stopping_round_ > 0) {
     for (size_t i = 0; i < valid_metrics_.size(); ++i) {
       for (size_t j = 0; j < valid_metrics_[i].size(); ++j) {
-        auto test_scores = valid_metrics_[i][j]->Eval(valid_score_updater_[i]->score(),
-                                                      objective_function_);
+        auto test_scores = EvalOneMetric(valid_metrics_[i][j], valid_score_updater_[i]->score());
         auto name = valid_metrics_[i][j]->GetName();
         for (size_t k = 0; k < name.size(); ++k) {
           std::stringstream tmp_buf;
@@ -674,7 +677,7 @@ std::vector<double> GBDT::GetEvalAt(int data_idx) const {
   std::vector<double> ret;
   if (data_idx == 0) {
     for (auto& sub_metric : training_metrics_) {
-      auto scores = sub_metric->Eval(train_score_updater_->score(), objective_function_);
+      auto scores = EvalOneMetric(sub_metric, train_score_updater_->score());
       for (auto score : scores) {
         ret.push_back(score);
       }
@@ -682,8 +685,7 @@ std::vector<double> GBDT::GetEvalAt(int data_idx) const {
   } else {
     auto used_idx = data_idx - 1;
     for (size_t j = 0; j < valid_metrics_[used_idx].size(); ++j) {
-      auto test_scores = valid_metrics_[used_idx][j]->Eval(valid_score_updater_[used_idx]->score(),
-                                                           objective_function_);
+      auto test_scores = EvalOneMetric(valid_metrics_[used_idx][j], valid_score_updater_[used_idx]->score());
       for (auto score : test_scores) {
         ret.push_back(score);
       }
@@ -712,7 +714,7 @@ void GBDT::GetPredictAt(int data_idx, double* out_result, int64_t* out_len) {
     num_data = valid_score_updater_[used_idx]->num_data();
     *out_len = static_cast<int64_t>(num_data) * num_class_;
   }
-  if (objective_function_ != nullptr) {
+  if (objective_function_ != nullptr && !average_output_) {
     #pragma omp parallel for schedule(static)
     for (data_size_t i = 0; i < num_data; ++i) {
       std::vector<double> tree_pred(num_tree_per_iteration_);
@@ -842,7 +844,12 @@ std::string GBDT::ModelToIfElse(int num_iteration) const {
   // Predict
   str_buf << "void GBDT::Predict(const double* features, double *output, const PredictionEarlyStopInstance* early_stop) const {" << std::endl;
   str_buf << "\t" << "PredictRaw(features, output, early_stop);" << std::endl;
-  str_buf << "\t" << "if (objective_function_ != nullptr) {" << std::endl;
+  str_buf << "\t" << "if (average_output_) {" << std::endl;
+  str_buf << "\t\t" << "for (int k = 0; k < num_tree_per_iteration_; ++k) {" << std::endl;
+  str_buf << "\t\t\t" << "output[k] /= num_iteration_for_pred_;" << std::endl;
+  str_buf << "\t\t" << "}" << std::endl;
+  str_buf << "\t" << "}" << std::endl;
+  str_buf << "\t" << "else if (objective_function_ != nullptr) {" << std::endl;
   str_buf << "\t\t" << "objective_function_->ConvertOutput(output, output);" << std::endl;
   str_buf << "\t" << "}" << std::endl;
   str_buf << "}" << std::endl;
@@ -920,6 +927,10 @@ std::string GBDT::SaveModelToString(int num_iteration) const {
     ss << "boost_from_average" << std::endl;
   }
 
+  if (average_output_) {
+    ss << "average_output" << std::endl;
+  }
+
   ss << "feature_names=" << Common::Join(feature_names_, " ") << std::endl;
   ss << "feature_infos=" << Common::Join(feature_infos_, " ") << std::endl;
@@ -999,6 +1010,11 @@ bool GBDT::LoadModelFromString(const std::string& model_str) {
   if (line.size() > 0) {
     boost_from_average_ = true;
   }
 
+  // get average_output
+  line = Common::FindFromLines(lines, "average_output");
+  if (line.size() > 0) {
+    average_output_ = true;
+  }
+
   // get feature names
   line = Common::FindFromLines(lines, "feature_names=");
   if (line.size() > 0) {
......
@@ -275,17 +275,19 @@ protected:
   * \param tree Trained tree of this iteration
   * \param cur_tree_id Current tree for multiclass training
   */
-  void UpdateScoreOutOfBag(const Tree* tree, const int cur_tree_id);
+  virtual void UpdateScoreOutOfBag(const Tree* tree, const int cur_tree_id);
   /*!
   * \brief calculate the objective function
   */
-  void Boosting();
+  virtual void Boosting();
   /*!
   * \brief updating score after tree was trained
   * \param tree Trained tree of this iteration
   * \param cur_tree_id Current tree for multiclass training
   */
   virtual void UpdateScore(const Tree* tree, const int cur_tree_id);
+
+  virtual std::vector<double> EvalOneMetric(const Metric* metric, const double* score) const;
   /*!
   * \brief Print metric result of current iteration
   * \param iter Current iteration
@@ -373,6 +375,7 @@ protected:
   std::vector<double> class_default_output_;
   bool is_constant_hessian_;
   std::unique_ptr<ObjectiveFunction> loaded_objective_;
+  bool average_output_;
 };
 
 } // namespace LightGBM
......
@@ -28,7 +28,11 @@ void GBDT::PredictRaw(const double* features, double* output, const PredictionEa
 void GBDT::Predict(const double* features, double* output, const PredictionEarlyStopInstance* early_stop) const {
   PredictRaw(features, output, early_stop);
-  if (objective_function_ != nullptr) {
+  if (average_output_) {
+    for (int k = 0; k < num_tree_per_iteration_; ++k) {
+      output[k] /= num_iteration_for_pred_;
+    }
+  } else if (objective_function_ != nullptr) {
     objective_function_->ConvertOutput(output, output);
   }
 }
......
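In isolation, the effect of the `average_output_` branch: `PredictRaw` leaves the sum of per-tree outputs in `output[k]`, and dividing by `num_iteration_for_pred_` turns that sum into the forest mean. A self-contained sketch (function and variable names are local to this example, not the LightGBM API):

```cpp
#include <cstdio>
#include <vector>

// Each tree's output is already converted (e.g. a probability for binary
// objectives), so the forest prediction is a plain mean, not a boosted sum.
double ForestAverage(const std::vector<double>& per_tree_output) {
  double raw = 0.0;
  for (double v : per_tree_output) raw += v;  // what PredictRaw accumulates
  return raw / per_tree_output.size();        // the average_output_ division
}

int main() {
  std::vector<double> outputs = {0.8, 0.6, 0.7};  // three trees' converted outputs
  std::printf("%f\n", ForestAverage(outputs));    // prints 0.700000
  return 0;
}
```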
#ifndef LIGHTGBM_BOOSTING_RF_H_
#define LIGHTGBM_BOOSTING_RF_H_

#include <LightGBM/boosting.h>
#include <LightGBM/metric.h>
#include "score_updater.hpp"
#include "gbdt.h"

#include <cstdio>
#include <vector>
#include <string>
#include <fstream>

namespace LightGBM {
/*!
* \brief Random Forest implementation
*/
class RF : public GBDT {
public:
  RF() : GBDT() {
    average_output_ = true;
  }

  ~RF() {}

  void Init(const BoostingConfig* config, const Dataset* train_data, const ObjectiveFunction* objective_function,
            const std::vector<const Metric*>& training_metrics) override {
    CHECK(config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f);
    CHECK(config->tree_config.feature_fraction < 1.0f && config->tree_config.feature_fraction > 0.0f);
    GBDT::Init(config, train_data, objective_function, training_metrics);
    if (num_init_iteration_ > 0) {
      // stored scores are averages over trees, so rescale a loaded model's scores
      for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
        MultiplyScore(cur_tree_id, 1.0f / num_init_iteration_);
      }
    } else {
      CHECK(train_data->metadata().init_score() == nullptr);
    }
    // cannot use RF for multi-class
    CHECK(num_tree_per_iteration_ == 1);
    // no shrinkage rate for RF
    shrinkage_rate_ = 1.0f;
    // only boosting one time
    Boosting();
    if (is_use_subset_ && bag_data_cnt_ < num_data_) {
      size_t total_size = static_cast<size_t>(num_data_) * num_tree_per_iteration_;
      tmp_grad_.resize(total_size);
      tmp_hess_.resize(total_size);
    }
  }

  void ResetConfig(const BoostingConfig* config) override {
    CHECK(config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f);
    CHECK(config->tree_config.feature_fraction < 1.0f && config->tree_config.feature_fraction > 0.0f);
    GBDT::ResetConfig(config);
    // no shrinkage rate for RF
    shrinkage_rate_ = 1.0f;
  }

  void ResetTrainingData(const Dataset* train_data, const ObjectiveFunction* objective_function,
                         const std::vector<const Metric*>& training_metrics) override {
    GBDT::ResetTrainingData(train_data, objective_function, training_metrics);
    if (iter_ + num_init_iteration_ > 0) {
      for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
        train_score_updater_->MultiplyScore(1.0f / (iter_ + num_init_iteration_), cur_tree_id);
      }
    }
    // cannot use RF for multi-class
    CHECK(num_tree_per_iteration_ == 1);
    // only boosting one time
    Boosting();
    if (is_use_subset_ && bag_data_cnt_ < num_data_) {
      size_t total_size = static_cast<size_t>(num_data_) * num_tree_per_iteration_;
      tmp_grad_.resize(total_size);
      tmp_hess_.resize(total_size);
    }
  }

  void Boosting() override {
    if (objective_function_ == nullptr) {
      Log::Fatal("No objective function provided");
    }
    // gradients are computed once, at a constant zero score, so every tree
    // fits the same target instead of the residual of the previous trees
    std::vector<double> tmp_score(num_tree_per_iteration_ * num_data_, 0.0f);
    objective_function_->
      GetGradients(tmp_score.data(), gradients_.data(), hessians_.data());
  }

  bool TrainOneIter(const score_t* gradient, const score_t* hessian, bool is_eval) override {
    // bagging logic
    Bagging(iter_);
    if (gradient == nullptr || hessian == nullptr) {
      gradient = gradients_.data();
      hessian = hessians_.data();
    }
    if (is_use_subset_ && bag_data_cnt_ < num_data_) {
      // get sub gradients
      for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
        size_t bias = static_cast<size_t>(cur_tree_id) * num_data_;
        // cannot use multi-threading here
        for (int i = 0; i < bag_data_cnt_; ++i) {
          tmp_grad_[bias + i] = gradient[bias + bag_data_indices_[i]];
          tmp_hess_[bias + i] = hessian[bias + bag_data_indices_[i]];
        }
      }
      gradient = tmp_grad_.data();
      hessian = tmp_hess_.data();
    }
    for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
      std::unique_ptr<Tree> new_tree(new Tree(2));
      if (class_need_train_[cur_tree_id]) {
        size_t bias = static_cast<size_t>(cur_tree_id) * num_data_;
        new_tree.reset(
          tree_learner_->Train(gradient + bias, hessian + bias, is_constant_hessian_));
      }
      if (new_tree->num_leaves() > 1) {
        // update score: scale the stored average back to a sum, add the new
        // tree, then rescale to an average over one more tree
        MultiplyScore(cur_tree_id, (iter_ + num_init_iteration_));
        ConvertTreeOutput(new_tree.get());
        UpdateScore(new_tree.get(), cur_tree_id);
        UpdateScoreOutOfBag(new_tree.get(), cur_tree_id);
        MultiplyScore(cur_tree_id, 1.0 / (iter_ + num_init_iteration_ + 1));
      } else {
        // only add default score one time
        if (!class_need_train_[cur_tree_id] && models_.size() < static_cast<size_t>(num_tree_per_iteration_)) {
          double output = class_default_output_[cur_tree_id];
          objective_function_->ConvertOutput(&output, &output);
          new_tree->Split(0, 0, BinType::NumericalBin, 0, 0, 0,
                          output, output, 0, 0, -1, 0, 0, 0);
          train_score_updater_->AddScore(output, cur_tree_id);
          for (auto& score_updater : valid_score_updater_) {
            score_updater->AddScore(output, cur_tree_id);
          }
        }
      }
      // add model
      models_.push_back(std::move(new_tree));
    }
    ++iter_;
    if (is_eval) {
      return EvalAndCheckEarlyStopping();
    } else {
      return false;
    }
  }

  void RollbackOneIter() override {
    if (iter_ <= 0) { return; }
    int cur_iter = iter_ + num_init_iteration_ - 1;
    // reset score: negate the tree via Shrinkage(-1.0) so AddScore subtracts
    // it, then restore the average over the remaining trees
    for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
      auto curr_tree = cur_iter * num_tree_per_iteration_ + cur_tree_id;
      models_[curr_tree]->Shrinkage(-1.0);
      MultiplyScore(cur_tree_id, (iter_ + num_init_iteration_));
      train_score_updater_->AddScore(models_[curr_tree].get(), cur_tree_id);
      for (auto& score_updater : valid_score_updater_) {
        score_updater->AddScore(models_[curr_tree].get(), cur_tree_id);
      }
      MultiplyScore(cur_tree_id, 1.0f / (iter_ + num_init_iteration_ - 1));
    }
    // remove model
    for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
      models_.pop_back();
    }
    --iter_;
  }

  void MultiplyScore(const int cur_tree_id, double val) {
    train_score_updater_->MultiplyScore(val, cur_tree_id);
    for (auto& score_updater : valid_score_updater_) {
      score_updater->MultiplyScore(val, cur_tree_id);
    }
  }

  void ConvertTreeOutput(Tree* tree) {
    // convert leaf outputs at training time, so the averaged scores need
    // no further conversion at prediction time
    tree->Shrinkage(1.0f);
    for (int i = 0; i < tree->num_leaves(); ++i) {
      double output = tree->LeafOutput(i);
      objective_function_->ConvertOutput(&output, &output);
      tree->SetLeafOutput(i, output);
    }
  }

  void AddValidDataset(const Dataset* valid_data,
                       const std::vector<const Metric*>& valid_metrics) override {
    GBDT::AddValidDataset(valid_data, valid_metrics);
    if (iter_ + num_init_iteration_ > 0) {
      for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
        valid_score_updater_.back()->MultiplyScore(1.0f / (iter_ + num_init_iteration_), cur_tree_id);
      }
    }
  }

  bool NeedAccuratePrediction() const override {
    // no early stopping for prediction
    return true;
  }

  std::vector<double> EvalOneMetric(const Metric* metric, const double* score) const override {
    // scores are already converted and averaged, so evaluate the metric
    // without the objective's output conversion
    return metric->Eval(score, nullptr);
  }

private:
  std::vector<score_t> tmp_grad_;
  std::vector<score_t> tmp_hess_;
};

} // namespace LightGBM
#endif  // LIGHTGBM_BOOSTING_RF_H_
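The `MultiplyScore` bookkeeping in `TrainOneIter` and `RollbackOneIter` above maintains the invariant that the stored scores always equal the mean over trees so far. A standalone sketch of the two updates (names are illustrative, not LightGBM API):

```cpp
#include <cstdio>

// Adding tree n+1: scale the stored average back to a sum, add the new
// tree's output, then rescale. This is the MultiplyScore(n) / AddScore /
// MultiplyScore(1/(n+1)) sequence in TrainOneIter.
double AddTree(double avg, int n, double tree_output) {
  return (avg * n + tree_output) / (n + 1);
}

// Removing the last of n+1 trees inverts the update, as in RollbackOneIter
// (which negates the tree via Shrinkage(-1.0) so AddScore subtracts it).
double RemoveLastTree(double avg, int n, double tree_output) {
  return (avg * (n + 1) - tree_output) / n;
}

int main() {
  double avg = 0.0;
  avg = AddTree(avg, 0, 0.2);         // avg = 0.2
  avg = AddTree(avg, 1, 0.6);         // avg = 0.4
  avg = RemoveLastTree(avg, 1, 0.6);  // back to 0.2
  std::printf("%f\n", avg);
  return 0;
}
```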
@@ -58,6 +58,14 @@ public:
       score_[offset + i] += val;
     }
   }
 
+  inline void MultiplyScore(double val, int cur_tree_id) {
+    int64_t offset = static_cast<int64_t>(cur_tree_id) * num_data_;
+    #pragma omp parallel for schedule(static)
+    for (int64_t i = 0; i < num_data_; ++i) {
+      score_[offset + i] *= val;
+    }
+  }
+
   /*!
   * \brief Using tree model to get prediction number, then adding to scores for all data
   *        Note: this function generally will be used on validation data too.
......
@@ -43,6 +43,8 @@ std::string GetBoostingType(const std::unordered_map<std::string, std::string>&
     boosting_type = "dart";
   } else if (value == std::string("goss")) {
     boosting_type = "goss";
+  } else if (value == std::string("rf") || value == std::string("randomforest")) {
+    boosting_type = "rf";
   } else {
     Log::Fatal("Unknown boosting type %s", value.c_str());
   }
......
@@ -50,6 +50,31 @@ class TestEngine(unittest.TestCase):
         self.assertLess(ret, 0.15)
         self.assertAlmostEqual(evals_result['valid_0']['binary_logloss'][-1], ret, places=5)
 
+    def test_rf(self):
+        X, y = load_breast_cancer(True)
+        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
+        params = {
+            'boosting_type': 'rf',
+            'objective': 'binary',
+            'bagging_freq': 1,
+            'bagging_fraction': 0.5,
+            'feature_fraction': 0.5,
+            'num_leaves': 50,
+            'metric': 'binary_logloss',
+            'verbose': -1
+        }
+        lgb_train = lgb.Dataset(X_train, y_train)
+        lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
+        evals_result = {}
+        gbm = lgb.train(params, lgb_train,
+                        num_boost_round=50,
+                        valid_sets=lgb_eval,
+                        verbose_eval=False,
+                        evals_result=evals_result)
+        ret = log_loss(y_test, gbm.predict(X_test))
+        self.assertLess(ret, 0.25)
+        self.assertAlmostEqual(evals_result['valid_0']['binary_logloss'][-1], ret, places=5)
+
     def test_regreesion(self):
         X, y = load_boston(True)
         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
......