"...git@developer.sourcefind.cn:tianlh/lightgbm-dcu.git" did not exist on "a1fdeb1f6d2aeb5d069ee6ace6d9679c4a899ac3"
Commit 6a7470a2 authored by Guolin Ke

Add Random Forest Mode (#678)

* add draft of RF.

* fix score bugs.

* fix scores.

* fix tests.

* update document

* fix GetPredictAt
parent c05cfa89
@@ -39,8 +39,9 @@ The parameter format is `key1=value1 key2=value2 ... `. And parameters can be s
 * `binary`, binary classification application
 * `lambdarank`, [lambdarank](https://pdfs.semanticscholar.org/fc9a/e09f9ced555558fdf1e997c0a5411fb51f15.pdf) application
 * `multiclass`, multi-class classification application, should set `num_class` as well
-* `boosting`, default=`gbdt`, type=enum, options=`gbdt`,`dart`, alias=`boost`,`boosting_type`
+* `boosting`, default=`gbdt`, type=enum, options=`gbdt`,`rf`,`dart`,`goss`, alias=`boost`,`boosting_type`
 * `gbdt`, traditional Gradient Boosting Decision Tree
+* `rf`, Random Forest
 * `dart`, [Dropouts meet Multiple Additive Regression Trees](https://arxiv.org/abs/1505.01866)
 * `goss`, Gradient-based One-Side Sampling
 * `data`, default=`""`, type=string, alias=`train`,`train_data`
......
@@ -51,9 +51,11 @@ Some important parameters: 
 * ```binary```, binary classification application
 * ```lambdarank```, lambdarank application
 * ```multiclass```, multi-class classification application, should set ```num_class``` as well
-* ```boosting```, default=```gbdt```, type=enum, options=```gbdt```,```dart```, alias=```boost```,```boosting_type```
-* ```gbdt```, traditional Gradient Boosting Decision Tree
-* ```dart```, [Dropouts meet Multiple Additive Regression Trees](https://arxiv.org/abs/1505.01866)
+* `boosting`, default=`gbdt`, type=enum, options=`gbdt`,`rf`,`dart`,`goss`, alias=`boost`,`boosting_type`
+* `gbdt`, traditional Gradient Boosting Decision Tree
+* `rf`, Random Forest
+* `dart`, [Dropouts meet Multiple Additive Regression Trees](https://arxiv.org/abs/1505.01866)
+* `goss`, Gradient-based One-Side Sampling
 * ```data```, default=```""```, type=string, alias=```train```,```train_data```
 * training data, LightGBM will train from this data
 * ```valid```, default=```""```, type=multi-string, alias=```test```,```valid_data```,```test_data```
......
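The new mode piggybacks on the existing bagging machinery, and `RF::Init` in `rf.hpp` further down rejects runs that do not subsample both rows and features. As an illustration only (the data file name and values are placeholder choices, not shipped defaults), a minimal parameter set in the `key1=value1` format described above might look like:

```
boosting = rf
objective = binary
bagging_freq = 1
bagging_fraction = 0.5
feature_fraction = 0.5
data = train.txt
```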
@@ -2,6 +2,7 @@
 #include "gbdt.h"
 #include "dart.hpp"
 #include "goss.hpp"
+#include "rf.hpp"
 
 namespace LightGBM {
@@ -34,6 +35,8 @@ Boosting* Boosting::CreateBoosting(const std::string& type, const char* filename
     return new DART();
   } else if (type == std::string("goss")) {
     return new GOSS();
+  } else if (type == std::string("rf")) {
+    return new RF();
   } else {
     return nullptr;
   }
@@ -47,6 +50,8 @@ Boosting* Boosting::CreateBoosting(const std::string& type, const char* filename
     ret.reset(new DART());
   } else if (type == std::string("goss")) {
     ret.reset(new GOSS());
+  } else if (type == std::string("rf")) {
+    ret.reset(new RF());
   } else {
     Log::Fatal("unknown boosting type %s", type.c_str());
   }
......
@@ -39,6 +39,12 @@ public:
     sum_weight_ = 0.0f;
   }
 
+  void ResetConfig(const BoostingConfig* config) override {
+    GBDT::ResetConfig(config);
+    random_for_drop_ = Random(gbdt_config_->drop_seed);
+    sum_weight_ = 0.0f;
+  }
+
   /*!
   * \brief one training iteration
   */
......
@@ -44,9 +44,10 @@ GBDT::GBDT()
   boost_from_average_(false) {
   #pragma omp parallel
   #pragma omp master
   {
     num_threads_ = omp_get_num_threads();
   }
+  average_output_ = false;
 }
 
 GBDT::~GBDT() {
@@ -164,10 +165,9 @@ void GBDT::ResetTrainingData(const Dataset* train_data, const ObjectiveFunction*
   }
   objective_function_ = objective_function;
-  num_tree_per_iteration_ = num_class_;
   if (objective_function_ != nullptr) {
     is_constant_hessian_ = objective_function_->IsConstantHessian();
-    num_tree_per_iteration_ = objective_function_->NumTreePerIteration();
+    CHECK(num_tree_per_iteration_ == objective_function_->NumTreePerIteration());
   } else {
     is_constant_hessian_ = false;
   }
@@ -608,6 +608,10 @@ void GBDT::UpdateScore(const Tree* tree, const int cur_tree_id) {
 #endif
 }
 
+std::vector<double> GBDT::EvalOneMetric(const Metric* metric, const double* score) const {
+  return metric->Eval(score, objective_function_);
+}
+
 std::string GBDT::OutputMetric(int iter) {
   bool need_output = (iter % gbdt_config_->output_freq) == 0;
   std::string ret = "";
@@ -617,7 +621,7 @@ std::string GBDT::OutputMetric(int iter) {
   if (need_output) {
     for (auto& sub_metric : training_metrics_) {
       auto name = sub_metric->GetName();
-      auto scores = sub_metric->Eval(train_score_updater_->score(), objective_function_);
+      auto scores = EvalOneMetric(sub_metric, train_score_updater_->score());
       for (size_t k = 0; k < name.size(); ++k) {
         std::stringstream tmp_buf;
         tmp_buf << "Iteration:" << iter
@@ -634,8 +638,7 @@ std::string GBDT::OutputMetric(int iter) {
   if (need_output || early_stopping_round_ > 0) {
     for (size_t i = 0; i < valid_metrics_.size(); ++i) {
       for (size_t j = 0; j < valid_metrics_[i].size(); ++j) {
-        auto test_scores = valid_metrics_[i][j]->Eval(valid_score_updater_[i]->score(),
-                                                      objective_function_);
+        auto test_scores = EvalOneMetric(valid_metrics_[i][j], valid_score_updater_[i]->score());
         auto name = valid_metrics_[i][j]->GetName();
         for (size_t k = 0; k < name.size(); ++k) {
           std::stringstream tmp_buf;
@@ -674,7 +677,7 @@ std::vector<double> GBDT::GetEvalAt(int data_idx) const {
   std::vector<double> ret;
   if (data_idx == 0) {
     for (auto& sub_metric : training_metrics_) {
-      auto scores = sub_metric->Eval(train_score_updater_->score(), objective_function_);
+      auto scores = EvalOneMetric(sub_metric, train_score_updater_->score());
       for (auto score : scores) {
         ret.push_back(score);
       }
@@ -682,8 +685,7 @@ std::vector<double> GBDT::GetEvalAt(int data_idx) const {
   } else {
     auto used_idx = data_idx - 1;
     for (size_t j = 0; j < valid_metrics_[used_idx].size(); ++j) {
-      auto test_scores = valid_metrics_[used_idx][j]->Eval(valid_score_updater_[used_idx]->score(),
-                                                           objective_function_);
+      auto test_scores = EvalOneMetric(valid_metrics_[used_idx][j], valid_score_updater_[used_idx]->score());
       for (auto score : test_scores) {
         ret.push_back(score);
       }
@@ -712,7 +714,7 @@ void GBDT::GetPredictAt(int data_idx, double* out_result, int64_t* out_len) {
     num_data = valid_score_updater_[used_idx]->num_data();
     *out_len = static_cast<int64_t>(num_data) * num_class_;
   }
-  if (objective_function_ != nullptr) {
+  if (objective_function_ != nullptr && !average_output_) {
     #pragma omp parallel for schedule(static)
     for (data_size_t i = 0; i < num_data; ++i) {
       std::vector<double> tree_pred(num_tree_per_iteration_);
@@ -842,7 +844,12 @@ std::string GBDT::ModelToIfElse(int num_iteration) const {
   // Predict
   str_buf << "void GBDT::Predict(const double* features, double *output, const PredictionEarlyStopInstance* early_stop) const {" << std::endl;
   str_buf << "\t" << "PredictRaw(features, output, early_stop);" << std::endl;
-  str_buf << "\t" << "if (objective_function_ != nullptr) {" << std::endl;
+  str_buf << "\t" << "if (average_output_) {" << std::endl;
+  str_buf << "\t\t" << "for (int k = 0; k < num_tree_per_iteration_; ++k) {" << std::endl;
+  str_buf << "\t\t\t" << "output[k] /= num_iteration_for_pred_;" << std::endl;
+  str_buf << "\t\t" << "}" << std::endl;
+  str_buf << "\t" << "}" << std::endl;
+  str_buf << "\t" << "else if (objective_function_ != nullptr) {" << std::endl;
   str_buf << "\t\t" << "objective_function_->ConvertOutput(output, output);" << std::endl;
   str_buf << "\t" << "}" << std::endl;
   str_buf << "}" << std::endl;
@@ -920,6 +927,10 @@ std::string GBDT::SaveModelToString(int num_iteration) const {
     ss << "boost_from_average" << std::endl;
   }
 
+  if (average_output_) {
+    ss << "average_output" << std::endl;
+  }
+
   ss << "feature_names=" << Common::Join(feature_names_, " ") << std::endl;
   ss << "feature_infos=" << Common::Join(feature_infos_, " ") << std::endl;
@@ -999,6 +1010,11 @@ bool GBDT::LoadModelFromString(const std::string& model_str) {
   if (line.size() > 0) {
     boost_from_average_ = true;
   }
 
+  // get average_output
+  line = Common::FindFromLines(lines, "average_output");
+  if (line.size() > 0) {
+    average_output_ = true;
+  }
+
   // get feature names
   line = Common::FindFromLines(lines, "feature_names=");
   if (line.size() > 0) {
......
@@ -275,17 +275,19 @@ protected:
   * \param tree Trained tree of this iteration
   * \param cur_tree_id Current tree for multiclass training
   */
-  void UpdateScoreOutOfBag(const Tree* tree, const int cur_tree_id);
+  virtual void UpdateScoreOutOfBag(const Tree* tree, const int cur_tree_id);
   /*!
   * \brief calculate the objective function
   */
-  void Boosting();
+  virtual void Boosting();
   /*!
   * \brief updating score after tree was trained
   * \param tree Trained tree of this iteration
   * \param cur_tree_id Current tree for multiclass training
   */
   virtual void UpdateScore(const Tree* tree, const int cur_tree_id);
+
+  virtual std::vector<double> EvalOneMetric(const Metric* metric, const double* score) const;
   /*!
   * \brief Print metric result of current iteration
   * \param iter Current iteration
@@ -373,6 +375,7 @@ protected:
   std::vector<double> class_default_output_;
   bool is_constant_hessian_;
   std::unique_ptr<ObjectiveFunction> loaded_objective_;
+  bool average_output_;
 };
 
 } // namespace LightGBM
......
@@ -28,7 +28,11 @@ void GBDT::PredictRaw(const double* features, double* output, const PredictionEa
 void GBDT::Predict(const double* features, double* output, const PredictionEarlyStopInstance* early_stop) const {
   PredictRaw(features, output, early_stop);
-  if (objective_function_ != nullptr) {
+  if (average_output_) {
+    for (int k = 0; k < num_tree_per_iteration_; ++k) {
+      output[k] /= num_iteration_for_pred_;
+    }
+  } else if (objective_function_ != nullptr) {
     objective_function_->ConvertOutput(output, output);
   }
 }
......
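In isolation, the effect of the `average_output_` branch: `PredictRaw` leaves the sum of per-tree outputs in `output[k]`, and dividing by `num_iteration_for_pred_` turns that sum into the forest mean. A self-contained sketch (function and variable names are local to this example, not the LightGBM API):

```cpp
#include <cstdio>
#include <vector>

// Each tree's output is already converted (e.g. a probability for binary
// objectives), so the forest prediction is a plain mean, not a boosted sum.
double ForestAverage(const std::vector<double>& per_tree_output) {
  double raw = 0.0;
  for (double v : per_tree_output) raw += v;  // what PredictRaw accumulates
  return raw / per_tree_output.size();        // the average_output_ division
}

int main() {
  std::vector<double> outputs = {0.8, 0.6, 0.7};  // three trees' converted outputs
  std::printf("%f\n", ForestAverage(outputs));    // prints 0.700000
  return 0;
}
```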
#ifndef LIGHTGBM_BOOSTING_RF_H_
#define LIGHTGBM_BOOSTING_RF_H_

#include <LightGBM/boosting.h>
#include <LightGBM/metric.h>
#include "score_updater.hpp"
#include "gbdt.h"

#include <cstdio>
#include <vector>
#include <string>
#include <fstream>

namespace LightGBM {
/*!
* \brief Random Forest implementation
*/
class RF : public GBDT {
public:
  RF() : GBDT() {
    average_output_ = true;
  }

  ~RF() {}

  void Init(const BoostingConfig* config, const Dataset* train_data, const ObjectiveFunction* objective_function,
            const std::vector<const Metric*>& training_metrics) override {
    CHECK(config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f);
    CHECK(config->tree_config.feature_fraction < 1.0f && config->tree_config.feature_fraction > 0.0f);
    GBDT::Init(config, train_data, objective_function, training_metrics);
    if (num_init_iteration_ > 0) {
      // stored scores are averages over trees, so rescale a loaded model's scores
      for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
        MultiplyScore(cur_tree_id, 1.0f / num_init_iteration_);
      }
    } else {
      CHECK(train_data->metadata().init_score() == nullptr);
    }
    // cannot use RF for multi-class
    CHECK(num_tree_per_iteration_ == 1);
    // no shrinkage rate for RF
    shrinkage_rate_ = 1.0f;
    // only boosting one time
    Boosting();
    if (is_use_subset_ && bag_data_cnt_ < num_data_) {
      size_t total_size = static_cast<size_t>(num_data_) * num_tree_per_iteration_;
      tmp_grad_.resize(total_size);
      tmp_hess_.resize(total_size);
    }
  }

  void ResetConfig(const BoostingConfig* config) override {
    CHECK(config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f);
    CHECK(config->tree_config.feature_fraction < 1.0f && config->tree_config.feature_fraction > 0.0f);
    GBDT::ResetConfig(config);
    // no shrinkage rate for RF
    shrinkage_rate_ = 1.0f;
  }

  void ResetTrainingData(const Dataset* train_data, const ObjectiveFunction* objective_function,
                         const std::vector<const Metric*>& training_metrics) override {
    GBDT::ResetTrainingData(train_data, objective_function, training_metrics);
    if (iter_ + num_init_iteration_ > 0) {
      for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
        train_score_updater_->MultiplyScore(1.0f / (iter_ + num_init_iteration_), cur_tree_id);
      }
    }
    // cannot use RF for multi-class
    CHECK(num_tree_per_iteration_ == 1);
    // only boosting one time
    Boosting();
    if (is_use_subset_ && bag_data_cnt_ < num_data_) {
      size_t total_size = static_cast<size_t>(num_data_) * num_tree_per_iteration_;
      tmp_grad_.resize(total_size);
      tmp_hess_.resize(total_size);
    }
  }

  void Boosting() override {
    if (objective_function_ == nullptr) {
      Log::Fatal("No objective function provided");
    }
    // gradients are computed once, at a constant zero score, so every tree
    // fits the same target instead of the residual of the previous trees
    std::vector<double> tmp_score(num_tree_per_iteration_ * num_data_, 0.0f);
    objective_function_->
      GetGradients(tmp_score.data(), gradients_.data(), hessians_.data());
  }

  bool TrainOneIter(const score_t* gradient, const score_t* hessian, bool is_eval) override {
    // bagging logic
    Bagging(iter_);
    if (gradient == nullptr || hessian == nullptr) {
      gradient = gradients_.data();
      hessian = hessians_.data();
    }
    if (is_use_subset_ && bag_data_cnt_ < num_data_) {
      // get sub gradients
      for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
        size_t bias = static_cast<size_t>(cur_tree_id) * num_data_;
        // cannot use multi-threading here
        for (int i = 0; i < bag_data_cnt_; ++i) {
          tmp_grad_[bias + i] = gradient[bias + bag_data_indices_[i]];
          tmp_hess_[bias + i] = hessian[bias + bag_data_indices_[i]];
        }
      }
      gradient = tmp_grad_.data();
      hessian = tmp_hess_.data();
    }
    for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
      std::unique_ptr<Tree> new_tree(new Tree(2));
      if (class_need_train_[cur_tree_id]) {
        size_t bias = static_cast<size_t>(cur_tree_id) * num_data_;
        new_tree.reset(
          tree_learner_->Train(gradient + bias, hessian + bias, is_constant_hessian_));
      }
      if (new_tree->num_leaves() > 1) {
        // update score: scale the stored average back to a sum, add the new
        // tree, then rescale to an average over one more tree
        MultiplyScore(cur_tree_id, (iter_ + num_init_iteration_));
        ConvertTreeOutput(new_tree.get());
        UpdateScore(new_tree.get(), cur_tree_id);
        UpdateScoreOutOfBag(new_tree.get(), cur_tree_id);
        MultiplyScore(cur_tree_id, 1.0 / (iter_ + num_init_iteration_ + 1));
      } else {
        // only add default score one time
        if (!class_need_train_[cur_tree_id] && models_.size() < static_cast<size_t>(num_tree_per_iteration_)) {
          double output = class_default_output_[cur_tree_id];
          objective_function_->ConvertOutput(&output, &output);
          new_tree->Split(0, 0, BinType::NumericalBin, 0, 0, 0,
                          output, output, 0, 0, -1, 0, 0, 0);
          train_score_updater_->AddScore(output, cur_tree_id);
          for (auto& score_updater : valid_score_updater_) {
            score_updater->AddScore(output, cur_tree_id);
          }
        }
      }
      // add model
      models_.push_back(std::move(new_tree));
    }
    ++iter_;
    if (is_eval) {
      return EvalAndCheckEarlyStopping();
    } else {
      return false;
    }
  }

  void RollbackOneIter() override {
    if (iter_ <= 0) { return; }
    int cur_iter = iter_ + num_init_iteration_ - 1;
    // reset score: negate the tree via Shrinkage(-1.0) so AddScore subtracts
    // it, then restore the average over the remaining trees
    for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
      auto curr_tree = cur_iter * num_tree_per_iteration_ + cur_tree_id;
      models_[curr_tree]->Shrinkage(-1.0);
      MultiplyScore(cur_tree_id, (iter_ + num_init_iteration_));
      train_score_updater_->AddScore(models_[curr_tree].get(), cur_tree_id);
      for (auto& score_updater : valid_score_updater_) {
        score_updater->AddScore(models_[curr_tree].get(), cur_tree_id);
      }
      MultiplyScore(cur_tree_id, 1.0f / (iter_ + num_init_iteration_ - 1));
    }
    // remove model
    for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
      models_.pop_back();
    }
    --iter_;
  }

  void MultiplyScore(const int cur_tree_id, double val) {
    train_score_updater_->MultiplyScore(val, cur_tree_id);
    for (auto& score_updater : valid_score_updater_) {
      score_updater->MultiplyScore(val, cur_tree_id);
    }
  }

  void ConvertTreeOutput(Tree* tree) {
    // convert leaf outputs at training time, so the averaged scores need
    // no further conversion at prediction time
    tree->Shrinkage(1.0f);
    for (int i = 0; i < tree->num_leaves(); ++i) {
      double output = tree->LeafOutput(i);
      objective_function_->ConvertOutput(&output, &output);
      tree->SetLeafOutput(i, output);
    }
  }

  void AddValidDataset(const Dataset* valid_data,
                       const std::vector<const Metric*>& valid_metrics) override {
    GBDT::AddValidDataset(valid_data, valid_metrics);
    if (iter_ + num_init_iteration_ > 0) {
      for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
        valid_score_updater_.back()->MultiplyScore(1.0f / (iter_ + num_init_iteration_), cur_tree_id);
      }
    }
  }

  bool NeedAccuratePrediction() const override {
    // no early stopping for prediction
    return true;
  }

  std::vector<double> EvalOneMetric(const Metric* metric, const double* score) const override {
    // scores are already converted and averaged, so evaluate the metric
    // without the objective's output conversion
    return metric->Eval(score, nullptr);
  }

private:
  std::vector<score_t> tmp_grad_;
  std::vector<score_t> tmp_hess_;
};

} // namespace LightGBM
#endif  // LIGHTGBM_BOOSTING_RF_H_
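The `MultiplyScore` bookkeeping in `TrainOneIter` and `RollbackOneIter` above maintains the invariant that the stored scores always equal the mean over trees so far. A standalone sketch of the two updates (names are illustrative, not LightGBM API):

```cpp
#include <cstdio>

// Adding tree n+1: scale the stored average back to a sum, add the new
// tree's output, then rescale. This is the MultiplyScore(n) / AddScore /
// MultiplyScore(1/(n+1)) sequence in TrainOneIter.
double AddTree(double avg, int n, double tree_output) {
  return (avg * n + tree_output) / (n + 1);
}

// Removing the last of n+1 trees inverts the update, as in RollbackOneIter
// (which negates the tree via Shrinkage(-1.0) so AddScore subtracts it).
double RemoveLastTree(double avg, int n, double tree_output) {
  return (avg * (n + 1) - tree_output) / n;
}

int main() {
  double avg = 0.0;
  avg = AddTree(avg, 0, 0.2);         // avg = 0.2
  avg = AddTree(avg, 1, 0.6);         // avg = 0.4
  avg = RemoveLastTree(avg, 1, 0.6);  // back to 0.2
  std::printf("%f\n", avg);
  return 0;
}
```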
@@ -58,6 +58,14 @@ public:
       score_[offset + i] += val;
     }
   }
 
+  inline void MultiplyScore(double val, int cur_tree_id) {
+    int64_t offset = static_cast<int64_t>(cur_tree_id) * num_data_;
+    #pragma omp parallel for schedule(static)
+    for (int64_t i = 0; i < num_data_; ++i) {
+      score_[offset + i] *= val;
+    }
+  }
+
   /*!
   * \brief Using tree model to get prediction number, then adding to scores for all data
   *        Note: this function generally will be used on validation data too.
......
@@ -43,6 +43,8 @@ std::string GetBoostingType(const std::unordered_map<std::string, std::string>&
     boosting_type = "dart";
   } else if (value == std::string("goss")) {
     boosting_type = "goss";
+  } else if (value == std::string("rf") || value == std::string("randomforest")) {
+    boosting_type = "rf";
   } else {
     Log::Fatal("Unknown boosting type %s", value.c_str());
   }
......
@@ -50,6 +50,31 @@ class TestEngine(unittest.TestCase):
         self.assertLess(ret, 0.15)
         self.assertAlmostEqual(evals_result['valid_0']['binary_logloss'][-1], ret, places=5)
 
+    def test_rf(self):
+        X, y = load_breast_cancer(True)
+        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
+        params = {
+            'boosting_type': 'rf',
+            'objective': 'binary',
+            'bagging_freq': 1,
+            'bagging_fraction': 0.5,
+            'feature_fraction': 0.5,
+            'num_leaves': 50,
+            'metric': 'binary_logloss',
+            'verbose': -1
+        }
+        lgb_train = lgb.Dataset(X_train, y_train)
+        lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
+        evals_result = {}
+        gbm = lgb.train(params, lgb_train,
+                        num_boost_round=50,
+                        valid_sets=lgb_eval,
+                        verbose_eval=False,
+                        evals_result=evals_result)
+        ret = log_loss(y_test, gbm.predict(X_test))
+        self.assertLess(ret, 0.25)
+        self.assertAlmostEqual(evals_result['valid_0']['binary_logloss'][-1], ret, places=5)
+
     def test_regreesion(self):
         X, y = load_boston(True)
         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
......