"vscode:/vscode.git/clone" did not exist on "3ccdea1a085162f3195abcd589e889672e099042"
rf.hpp 7.78 KB
Newer Older
1
2
3
4
/*!
 * Copyright (c) 2017 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */
Guolin Ke's avatar
Guolin Ke committed
5
6
7
8
9
10
11
#ifndef LIGHTGBM_BOOSTING_RF_H_
#define LIGHTGBM_BOOSTING_RF_H_

#include <LightGBM/boosting.h>
#include <LightGBM/metric.h>

#include <string>
12
#include <cstdio>
Guolin Ke's avatar
Guolin Ke committed
13
#include <fstream>
14
15
16
17
18
19
#include <memory>
#include <utility>
#include <vector>

#include "gbdt.h"
#include "score_updater.hpp"
Guolin Ke's avatar
Guolin Ke committed
20
21
22
23
24

namespace LightGBM {
/*!
* \brief Rondom Forest implementation
*/
Guolin Ke's avatar
Guolin Ke committed
25
class RF : public GBDT {
Nikita Titov's avatar
Nikita Titov committed
26
 public:
Guolin Ke's avatar
Guolin Ke committed
27
  RF() : GBDT() {
Guolin Ke's avatar
Guolin Ke committed
28
29
30
31
32
    average_output_ = true;
  }

  ~RF() {}

Guolin Ke's avatar
Guolin Ke committed
33
  void Init(const Config* config, const Dataset* train_data, const ObjectiveFunction* objective_function,
Guolin Ke's avatar
Guolin Ke committed
34
    const std::vector<const Metric*>& training_metrics) override {
Guolin Ke's avatar
Guolin Ke committed
35
    CHECK(config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f);
36
    CHECK(config->feature_fraction <= 1.0f && config->feature_fraction > 0.0f);
Guolin Ke's avatar
Guolin Ke committed
37
38
39
40
41
42
43
    GBDT::Init(config, train_data, objective_function, training_metrics);

    if (num_init_iteration_ > 0) {
      for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
        MultiplyScore(cur_tree_id, 1.0f / num_init_iteration_);
      }
    } else {
Nikita Titov's avatar
Nikita Titov committed
44
      CHECK_EQ(train_data->metadata().init_score(), nullptr);
Guolin Ke's avatar
Guolin Ke committed
45
    }
Nikita Titov's avatar
Nikita Titov committed
46
    CHECK_EQ(num_tree_per_iteration_, num_class_);
Guolin Ke's avatar
Guolin Ke committed
47
48
49
    // not shrinkage rate for the RF
    shrinkage_rate_ = 1.0f;
    // only boosting one time
Guolin Ke's avatar
Guolin Ke committed
50
    Boosting();
Guolin Ke's avatar
Guolin Ke committed
51
    if (is_use_subset_ && bag_data_cnt_ < num_data_) {
52
53
      tmp_grad_.resize(num_data_);
      tmp_hess_.resize(num_data_);
Guolin Ke's avatar
Guolin Ke committed
54
55
56
    }
  }

Guolin Ke's avatar
Guolin Ke committed
57
  void ResetConfig(const Config* config) override {
Guolin Ke's avatar
Guolin Ke committed
58
    CHECK(config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f);
59
    CHECK(config->feature_fraction <= 1.0f && config->feature_fraction > 0.0f);
Guolin Ke's avatar
Guolin Ke committed
60
61
62
63
64
65
    GBDT::ResetConfig(config);
    // not shrinkage rate for the RF
    shrinkage_rate_ = 1.0f;
  }

  void ResetTrainingData(const Dataset* train_data, const ObjectiveFunction* objective_function,
Guolin Ke's avatar
Guolin Ke committed
66
    const std::vector<const Metric*>& training_metrics) override {
Guolin Ke's avatar
Guolin Ke committed
67
68
69
70
71
72
    GBDT::ResetTrainingData(train_data, objective_function, training_metrics);
    if (iter_ + num_init_iteration_ > 0) {
      for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
        train_score_updater_->MultiplyScore(1.0f / (iter_ + num_init_iteration_), cur_tree_id);
      }
    }
Nikita Titov's avatar
Nikita Titov committed
73
    CHECK_EQ(num_tree_per_iteration_, num_class_);
Guolin Ke's avatar
Guolin Ke committed
74
    // only boosting one time
Guolin Ke's avatar
Guolin Ke committed
75
    Boosting();
Guolin Ke's avatar
Guolin Ke committed
76
    if (is_use_subset_ && bag_data_cnt_ < num_data_) {
77
78
      tmp_grad_.resize(num_data_);
      tmp_hess_.resize(num_data_);
Guolin Ke's avatar
Guolin Ke committed
79
80
81
    }
  }

Guolin Ke's avatar
Guolin Ke committed
82
83
  void Boosting() override {
    if (objective_function_ == nullptr) {
84
      Log::Fatal("RF mode do not support custom objective function, please use built-in objectives.");
Guolin Ke's avatar
Guolin Ke committed
85
86
87
88
    }
    init_scores_.resize(num_tree_per_iteration_, 0.0);
    for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
      init_scores_[cur_tree_id] = BoostFromAverage(cur_tree_id, false);
89
    }
Guolin Ke's avatar
Guolin Ke committed
90
91
92
93
    size_t total_size = static_cast<size_t>(num_data_) * num_tree_per_iteration_;
    std::vector<double> tmp_scores(total_size, 0.0f);
    #pragma omp parallel for schedule(static)
    for (int j = 0; j < num_tree_per_iteration_; ++j) {
94
      size_t offset = static_cast<size_t>(j)* num_data_;
Guolin Ke's avatar
Guolin Ke committed
95
      for (data_size_t i = 0; i < num_data_; ++i) {
96
        tmp_scores[offset + i] = init_scores_[j];
97
      }
Guolin Ke's avatar
Guolin Ke committed
98
    }
Guolin Ke's avatar
Guolin Ke committed
99
100
    objective_function_->
      GetGradients(tmp_scores.data(), gradients_.data(), hessians_.data());
Guolin Ke's avatar
Guolin Ke committed
101
102
  }

Guolin Ke's avatar
Guolin Ke committed
103
  bool TrainOneIter(const score_t* gradients, const score_t* hessians) override {
Guolin Ke's avatar
Guolin Ke committed
104
105
    // bagging logic
    Bagging(iter_);
Nikita Titov's avatar
Nikita Titov committed
106
107
    CHECK_EQ(gradients, nullptr);
    CHECK_EQ(hessians, nullptr);
Guolin Ke's avatar
Guolin Ke committed
108

109
110
    gradients = gradients_.data();
    hessians = hessians_.data();
Guolin Ke's avatar
Guolin Ke committed
111
112
    for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
      std::unique_ptr<Tree> new_tree(new Tree(2));
113
      size_t offset = static_cast<size_t>(cur_tree_id)* num_data_;
Guolin Ke's avatar
Guolin Ke committed
114
      if (class_need_train_[cur_tree_id]) {
115
116
        auto grad = gradients + offset;
        auto hess = hessians + offset;
Guolin Ke's avatar
Guolin Ke committed
117
118
119
120
121
122
123
124
125

        // need to copy gradients for bagging subset.
        if (is_use_subset_ && bag_data_cnt_ < num_data_) {
          for (int i = 0; i < bag_data_cnt_; ++i) {
            tmp_grad_[i] = grad[bag_data_indices_[i]];
            tmp_hess_[i] = hess[bag_data_indices_[i]];
          }
          grad = tmp_grad_.data();
          hess = tmp_hess_.data();
Guolin Ke's avatar
Guolin Ke committed
126
        }
Guolin Ke's avatar
Guolin Ke committed
127

128
        new_tree.reset(tree_learner_->Train(grad, hess, forced_splits_json_));
Guolin Ke's avatar
Guolin Ke committed
129
      }
Guolin Ke's avatar
Guolin Ke committed
130

Guolin Ke's avatar
Guolin Ke committed
131
      if (new_tree->num_leaves() > 1) {
132
133
134
        double pred = init_scores_[cur_tree_id];
        auto residual_getter = [pred](const label_t* label, int i) {return static_cast<double>(label[i]) - pred; };
        tree_learner_->RenewTreeOutput(new_tree.get(), objective_function_, residual_getter,
135
          num_data_, bag_data_indices_.data(), bag_data_cnt_);
Guolin Ke's avatar
Guolin Ke committed
136
137
138
        if (std::fabs(init_scores_[cur_tree_id]) > kEpsilon) {
          new_tree->AddBias(init_scores_[cur_tree_id]);
        }
Guolin Ke's avatar
Guolin Ke committed
139
140
141
142
        // update score
        MultiplyScore(cur_tree_id, (iter_ + num_init_iteration_));
        UpdateScore(new_tree.get(), cur_tree_id);
        MultiplyScore(cur_tree_id, 1.0 / (iter_ + num_init_iteration_ + 1));
Guolin Ke's avatar
Guolin Ke committed
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
      } else {
        // only add default score one-time
        if (models_.size() < static_cast<size_t>(num_tree_per_iteration_)) {
          double output = 0.0;
          if (!class_need_train_[cur_tree_id]) {
            if (objective_function_ != nullptr) {
              output = objective_function_->BoostFromScore(cur_tree_id);
            } else {
              output = init_scores_[cur_tree_id];
            }
          }
          new_tree->AsConstantTree(output);
          MultiplyScore(cur_tree_id, (iter_ + num_init_iteration_));
          UpdateScore(new_tree.get(), cur_tree_id);
          MultiplyScore(cur_tree_id, 1.0 / (iter_ + num_init_iteration_ + 1));
        }
Guolin Ke's avatar
Guolin Ke committed
159
160
161
162
163
      }
      // add model
      models_.push_back(std::move(new_tree));
    }
    ++iter_;
Guolin Ke's avatar
Guolin Ke committed
164
    return false;
Guolin Ke's avatar
Guolin Ke committed
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
  }

  void RollbackOneIter() override {
    if (iter_ <= 0) { return; }
    int cur_iter = iter_ + num_init_iteration_ - 1;
    // reset score
    for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
      auto curr_tree = cur_iter * num_tree_per_iteration_ + cur_tree_id;
      models_[curr_tree]->Shrinkage(-1.0);
      MultiplyScore(cur_tree_id, (iter_ + num_init_iteration_));
      train_score_updater_->AddScore(models_[curr_tree].get(), cur_tree_id);
      for (auto& score_updater : valid_score_updater_) {
        score_updater->AddScore(models_[curr_tree].get(), cur_tree_id);
      }
      MultiplyScore(cur_tree_id, 1.0f / (iter_ + num_init_iteration_ - 1));
    }
    // remove model
    for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
      models_.pop_back();
    }
    --iter_;
  }

  void MultiplyScore(const int cur_tree_id, double val) {
    train_score_updater_->MultiplyScore(val, cur_tree_id);
    for (auto& score_updater : valid_score_updater_) {
      score_updater->MultiplyScore(val, cur_tree_id);
    }
  }

  void AddValidDataset(const Dataset* valid_data,
Guolin Ke's avatar
Guolin Ke committed
196
    const std::vector<const Metric*>& valid_metrics) override {
Guolin Ke's avatar
Guolin Ke committed
197
198
199
200
201
202
203
204
205
206
207
208
209
    GBDT::AddValidDataset(valid_data, valid_metrics);
    if (iter_ + num_init_iteration_ > 0) {
      for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
        valid_score_updater_.back()->MultiplyScore(1.0f / (iter_ + num_init_iteration_), cur_tree_id);
      }
    }
  }

  bool NeedAccuratePrediction() const override {
    // No early stopping for prediction
    return true;
  };

Nikita Titov's avatar
Nikita Titov committed
210
 private:
Guolin Ke's avatar
Guolin Ke committed
211
212
  std::vector<score_t> tmp_grad_;
  std::vector<score_t> tmp_hess_;
Guolin Ke's avatar
Guolin Ke committed
213
  std::vector<double> init_scores_;
Guolin Ke's avatar
Guolin Ke committed
214
215
216
};

}  // namespace LightGBM
217
#endif  // LIGHTGBM_BOOSTING_RF_H_