gbdt.h 14.6 KB
Newer Older
Guolin Ke's avatar
Guolin Ke committed
1
2
3
4
#ifndef LIGHTGBM_BOOSTING_GBDT_H_
#define LIGHTGBM_BOOSTING_GBDT_H_

#include <LightGBM/boosting.h>
5
#include <LightGBM/objective_function.h>
Guolin Ke's avatar
Guolin Ke committed
6
#include <LightGBM/prediction_early_stop.h>
7

Guolin Ke's avatar
Guolin Ke committed
8
9
10
11
12
#include "score_updater.hpp"

#include <cstdio>
#include <vector>
#include <string>
13
#include <fstream>
Guolin Ke's avatar
Guolin Ke committed
14
#include <memory>
15
#include <mutex>
Guolin Ke's avatar
Guolin Ke committed
16
17

namespace LightGBM {
Guolin Ke's avatar
Guolin Ke committed
18

Guolin Ke's avatar
Guolin Ke committed
19
20
21
/*!
* \brief GBDT algorithm implementation. including Training, prediction, bagging.
*/
Guolin Ke's avatar
Guolin Ke committed
22
class GBDT: public GBDTBase {
Guolin Ke's avatar
Guolin Ke committed
23
public:
Guolin Ke's avatar
Guolin Ke committed
24

Guolin Ke's avatar
Guolin Ke committed
25
26
27
  /*!
  * \brief Constructor
  */
28
  GBDT();
Guolin Ke's avatar
Guolin Ke committed
29

Guolin Ke's avatar
Guolin Ke committed
30
31
32
33
  /*!
  * \brief Destructor
  */
  ~GBDT();
Guolin Ke's avatar
Guolin Ke committed
34

Guolin Ke's avatar
Guolin Ke committed
35
  /*!
Qiwei Ye's avatar
Qiwei Ye committed
36
  * \brief Initialization logic
zhangyafeikimi's avatar
zhangyafeikimi committed
37
  * \param gbdt_config Config for boosting
Guolin Ke's avatar
Guolin Ke committed
38
  * \param train_data Training data
39
  * \param objective_function Training objective function
Guolin Ke's avatar
Guolin Ke committed
40
41
  * \param training_metrics Training metrics
  */
42
  void Init(const BoostingConfig* gbdt_config, const Dataset* train_data, const ObjectiveFunction* objective_function,
Guolin Ke's avatar
Guolin Ke committed
43
            const std::vector<const Metric*>& training_metrics) override;
wxchan's avatar
wxchan committed
44
45

  /*!
Guolin Ke's avatar
Guolin Ke committed
46
  * \brief Merge model from other boosting object. Will insert to the front of current boosting object
wxchan's avatar
wxchan committed
47
48
49
50
51
52
53
54
55
56
57
58
  * \param other
  */
  void MergeFrom(const Boosting* other) override {
    auto other_gbdt = reinterpret_cast<const GBDT*>(other);
    // tmp move to other vector
    auto original_models = std::move(models_);
    models_ = std::vector<std::unique_ptr<Tree>>();
    // push model from other first
    for (const auto& tree : other_gbdt->models_) {
      auto new_tree = std::unique_ptr<Tree>(new Tree(*(tree.get())));
      models_.push_back(std::move(new_tree));
    }
Guolin Ke's avatar
Guolin Ke committed
59
    num_init_iteration_ = static_cast<int>(models_.size()) / num_tree_per_iteration_;
wxchan's avatar
wxchan committed
60
61
62
63
64
    // push model in current object
    for (const auto& tree : original_models) {
      auto new_tree = std::unique_ptr<Tree>(new Tree(*(tree.get())));
      models_.push_back(std::move(new_tree));
    }
Guolin Ke's avatar
Guolin Ke committed
65
    num_iteration_for_pred_ = static_cast<int>(models_.size()) / num_tree_per_iteration_;
wxchan's avatar
wxchan committed
66
67
  }

Guolin Ke's avatar
Guolin Ke committed
68
69
70
71
72
73
  /*!
  * \brief Reset the training data
  * \param train_data New Training data
  * \param objective_function Training objective function
  * \param training_metrics Training metrics
  */
74
75
  void ResetTrainingData(const Dataset* train_data, const ObjectiveFunction* objective_function,
                         const std::vector<const Metric*>& training_metrics) override;
wxchan's avatar
wxchan committed
76

Guolin Ke's avatar
Guolin Ke committed
77
78
79
80
81
82
  /*!
  * \brief Reset Boosting Config
  * \param gbdt_config Config for boosting
  */
  void ResetConfig(const BoostingConfig* gbdt_config) override;

Guolin Ke's avatar
Guolin Ke committed
83
  /*!
Qiwei Ye's avatar
Qiwei Ye committed
84
85
86
  * \brief Adding a validation dataset
  * \param valid_data Validation dataset
  * \param valid_metrics Metrics for validation dataset
Guolin Ke's avatar
Guolin Ke committed
87
  */
wxchan's avatar
wxchan committed
88
  void AddValidDataset(const Dataset* valid_data,
89
                       const std::vector<const Metric*>& valid_metrics) override;
Guolin Ke's avatar
Guolin Ke committed
90

Guolin Ke's avatar
Guolin Ke committed
91
92
93
94
95
  /*!
  * \brief Perform a full training procedure
  * \param snapshot_freq frequence of snapshot
  * \param model_output_path path of model file
  */
Guolin Ke's avatar
Guolin Ke committed
96
97
  void Train(int snapshot_freq, const std::string& model_output_path) override;

Guolin Ke's avatar
Guolin Ke committed
98
  /*!
Guolin Ke's avatar
Guolin Ke committed
99
  * \brief Training logic
Guolin Ke's avatar
Guolin Ke committed
100
101
102
  * \param gradients nullptr for using default objective, otherwise use self-defined boosting
  * \param hessians nullptr for using default objective, otherwise use self-defined boosting
  * \return True if cannot train any more
Guolin Ke's avatar
Guolin Ke committed
103
  */
Guolin Ke's avatar
Guolin Ke committed
104
  virtual bool TrainOneIter(const score_t* gradients, const score_t* hessians) override;
105

wxchan's avatar
wxchan committed
106
107
108
109
110
  /*!
  * \brief Rollback one iteration
  */
  void RollbackOneIter() override;

Guolin Ke's avatar
Guolin Ke committed
111
112
113
  /*!
  * \brief Get current iteration
  */
Guolin Ke's avatar
Guolin Ke committed
114
  int GetCurrentIteration() const override { return static_cast<int>(models_.size()) / num_tree_per_iteration_; }
wxchan's avatar
wxchan committed
115

Guolin Ke's avatar
Guolin Ke committed
116
117
118
119
  /*!
  * \brief Can use early stopping for prediction or not
  * \return True if cannot use early stopping for prediction
  */
120
  bool NeedAccuratePrediction() const override {
121
122
123
124
125
126
127
    if (objective_function_ == nullptr) {
      return true;
    } else {
      return objective_function_->NeedAccuratePrediction();
    }
  }

Guolin Ke's avatar
Guolin Ke committed
128
129
130
131
132
  /*!
  * \brief Get evaluation result at data_idx data
  * \param data_idx 0: training data, 1: 1st validation data
  * \return evaluation result
  */
133
  std::vector<double> GetEvalAt(int data_idx) const override;
134

Guolin Ke's avatar
Guolin Ke committed
135
136
  /*!
  * \brief Get current training score
Guolin Ke's avatar
Guolin Ke committed
137
  * \param out_len length of returned score
Guolin Ke's avatar
Guolin Ke committed
138
139
  * \return training score
  */
140
  virtual const double* GetTrainingScore(int64_t* out_len) override;
141

Guolin Ke's avatar
Guolin Ke committed
142
143
144
145
146
  /*!
  * \brief Get size of prediction at data_idx data
  * \param data_idx 0: training data, 1: 1st validation data
  * \return The size of prediction
  */
Guolin Ke's avatar
Guolin Ke committed
147
148
149
150
151
152
153
154
  virtual int64_t GetNumPredictAt(int data_idx) const override {
    CHECK(data_idx >= 0 && data_idx <= static_cast<int>(valid_score_updater_.size()));
    data_size_t num_data = train_data_->num_data();
    if (data_idx > 0) {
      num_data = valid_score_updater_[data_idx - 1]->num_data();
    }
    return num_data * num_class_;
  }
Guolin Ke's avatar
Guolin Ke committed
155

Guolin Ke's avatar
Guolin Ke committed
156
157
158
159
  /*!
  * \brief Get prediction result at data_idx data
  * \param data_idx 0: training data, 1: 1st validation data
  * \param result used to store prediction result, should allocate memory before call this function
160
  * \param out_len length of returned score
Guolin Ke's avatar
Guolin Ke committed
161
  */
Guolin Ke's avatar
Guolin Ke committed
162
  void GetPredictAt(int data_idx, double* out_result, int64_t* out_len) override;
Guolin Ke's avatar
Guolin Ke committed
163

Guolin Ke's avatar
Guolin Ke committed
164
165
166
167
168
169
170
  /*!
  * \brief Get number of prediction for one data
  * \param num_iteration number of used iterations
  * \param is_pred_leaf True if predicting  leaf index
  * \param is_pred_contrib True if predicting feature contribution
  * \return number of prediction
  */
171
  inline int NumPredictOneRow(int num_iteration, bool is_pred_leaf, bool is_pred_contrib) const override {
Guolin Ke's avatar
Guolin Ke committed
172
173
174
175
176
177
178
179
    int num_preb_in_one_row = num_class_;
    if (is_pred_leaf) {
      int max_iteration = GetCurrentIteration();
      if (num_iteration > 0) {
        num_preb_in_one_row *= static_cast<int>(std::min(max_iteration, num_iteration));
      } else {
        num_preb_in_one_row *= max_iteration;
      }
180
181
    } else if (is_pred_contrib) {
      num_preb_in_one_row = max_feature_idx_ + 2; // +1 for 0-based indexing, +1 for baseline
Guolin Ke's avatar
Guolin Ke committed
182
183
184
    }
    return num_preb_in_one_row;
  }
Guolin Ke's avatar
Guolin Ke committed
185

cbecker's avatar
cbecker committed
186
  void PredictRaw(const double* features, double* output,
187
                  const PredictionEarlyStopInstance* earlyStop) const override;
wxchan's avatar
wxchan committed
188

cbecker's avatar
cbecker committed
189
190
  void Predict(const double* features, double* output,
               const PredictionEarlyStopInstance* earlyStop) const override;
Guolin Ke's avatar
Guolin Ke committed
191

192
  void PredictLeafIndex(const double* features, double* output) const override;
wxchan's avatar
wxchan committed
193

194
195
196
  void PredictContrib(const double* features, double* output,
                      const PredictionEarlyStopInstance* earlyStop) const override;

Guolin Ke's avatar
Guolin Ke committed
197
  /*!
wxchan's avatar
wxchan committed
198
  * \brief Dump model to json format string
199
  * \param num_iteration Number of iterations that want to dump, -1 means dump all
wxchan's avatar
wxchan committed
200
  * \return Json format string of model
Guolin Ke's avatar
Guolin Ke committed
201
  */
202
  std::string DumpModel(int num_iteration) const override;
wxchan's avatar
wxchan committed
203

204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
  /*!
  * \brief Translate model to if-else statement
  * \param num_iteration Number of iterations that want to translate, -1 means translate all
  * \return if-else format codes of model
  */
  std::string ModelToIfElse(int num_iteration) const override;

  /*!
  * \brief Translate model to if-else statement
  * \param num_iteration Number of iterations that want to translate, -1 means translate all
  * \param filename Filename that want to save to
  * \return is_finish Is training finished or not
  */
  bool SaveModelToIfElse(int num_iteration, const char* filename) const override;

wxchan's avatar
wxchan committed
219
220
  /*!
  * \brief Save model to file
wxchan's avatar
wxchan committed
221
  * \param num_iterations Number of model that want to save, -1 means save all
wxchan's avatar
wxchan committed
222
  * \param filename Filename that want to save to
223
  * \return is_finish Is training finished or not
wxchan's avatar
wxchan committed
224
  */
225
  virtual bool SaveModelToFile(int num_iterations, const char* filename) const override;
wxchan's avatar
wxchan committed
226

227
228
  /*!
  * \brief Save model to string
wxchan's avatar
wxchan committed
229
  * \param num_iterations Number of model that want to save, -1 means save all
230
231
  * \return Non-empty string if succeeded
  */
232
  virtual std::string SaveModelToString(int num_iterations) const override;
233

Guolin Ke's avatar
Guolin Ke committed
234
235
236
  /*!
  * \brief Restore from a serialized string
  */
237
  bool LoadModelFromString(const std::string& model_str) override;
wxchan's avatar
wxchan committed
238

239
240
241
242
243
244
245
246
  /*!
  * \brief Calculate feature importances
  * \param num_iteration Number of model that want to use for feature importance, -1 means use all
  * \param importance_type: 0 for split, 1 for gain
  * \return vector of feature_importance
  */
  std::vector<double> FeatureImportance(int num_iteration, int importance_type) const override;

Guolin Ke's avatar
Guolin Ke committed
247
248
249
250
251
  /*!
  * \brief Get max feature index of this model
  * \return Max feature index of this model
  */
  inline int MaxFeatureIdx() const override { return max_feature_idx_; }
Guolin Ke's avatar
Guolin Ke committed
252

wxchan's avatar
wxchan committed
253
254
255
256
257
258
  /*!
  * \brief Get feature names of this model
  * \return Feature names of this model
  */
  inline std::vector<std::string> FeatureNames() const override { return feature_names_; }

Guolin Ke's avatar
Guolin Ke committed
259
260
261
262
263
264
  /*!
  * \brief Get index of label column
  * \return index of label column
  */
  inline int LabelIdx() const override { return label_idx_; }

Guolin Ke's avatar
Guolin Ke committed
265
266
267
268
  /*!
  * \brief Get number of weak sub-models
  * \return Number of weak sub-models
  */
wxchan's avatar
wxchan committed
269
  inline int NumberOfTotalModel() const override { return static_cast<int>(models_.size()); }
Guolin Ke's avatar
Guolin Ke committed
270

Guolin Ke's avatar
Guolin Ke committed
271
272
273
274
  /*!
  * \brief Get number of tree per iteration
  * \return number of tree per iteration
  */
Guolin Ke's avatar
Guolin Ke committed
275
  inline int NumModelPerIteration() const override { return num_tree_per_iteration_; }
Guolin Ke's avatar
Guolin Ke committed
276

277
278
279
280
  /*!
  * \brief Get number of classes
  * \return Number of classes
  */
Guolin Ke's avatar
Guolin Ke committed
281
  inline int NumberOfClasses() const override { return num_class_; }
282

283
  inline void InitPredict(int num_iteration) override {
Guolin Ke's avatar
Guolin Ke committed
284
    num_iteration_for_pred_ = static_cast<int>(models_.size()) / num_tree_per_iteration_;
wxchan's avatar
wxchan committed
285
    if (num_iteration > 0) {
Guolin Ke's avatar
Guolin Ke committed
286
      num_iteration_for_pred_ = std::min(num_iteration, num_iteration_for_pred_);
287
288
    }
  }
wxchan's avatar
wxchan committed
289

Guolin Ke's avatar
Guolin Ke committed
290
  inline double GetLeafValue(int tree_idx, int leaf_idx) const override {
Guolin Ke's avatar
Guolin Ke committed
291
292
293
294
295
    CHECK(tree_idx >= 0 && static_cast<size_t>(tree_idx) < models_.size());
    CHECK(leaf_idx >= 0 && leaf_idx < models_[tree_idx]->num_leaves());
    return models_[tree_idx]->LeafOutput(leaf_idx);
  }

Guolin Ke's avatar
Guolin Ke committed
296
  inline void SetLeafValue(int tree_idx, int leaf_idx, double val) override {
Guolin Ke's avatar
Guolin Ke committed
297
298
299
300
301
    CHECK(tree_idx >= 0 && static_cast<size_t>(tree_idx) < models_.size());
    CHECK(leaf_idx >= 0 && leaf_idx < models_[tree_idx]->num_leaves());
    models_[tree_idx]->SetLeafOutput(leaf_idx, val);
  }

302
303
304
  /*!
  * \brief Get Type name of this boosting object
  */
Guolin Ke's avatar
Guolin Ke committed
305
  virtual const char* SubModelName() const override { return "tree"; }
306

307
protected:
Guolin Ke's avatar
Guolin Ke committed
308
309
310
311
312
313
314
315
316

  /*!
  * \brief Print eval result and check early stopping
  */
  bool EvalAndCheckEarlyStopping();

  /*!
  * \brief reset config for bagging
  */
Guolin Ke's avatar
Guolin Ke committed
317
  void ResetBaggingConfig(const BoostingConfig* config, bool is_change_dataset);
Guolin Ke's avatar
Guolin Ke committed
318

Guolin Ke's avatar
Guolin Ke committed
319
320
321
322
  /*!
  * \brief Implement bagging logic
  * \param iter Current interation
  */
323
324
325
326
327
328
329
330
331
  virtual void Bagging(int iter);

  /*!
  * \brief Helper function for bagging, used for multi-threading optimization
  * \param start start indice of bagging
  * \param cnt count
  * \param buffer output buffer
  * \return count of left size
  */
Guolin Ke's avatar
Guolin Ke committed
332
  data_size_t BaggingHelper(Random& cur_rand, data_size_t start, data_size_t cnt, data_size_t* buffer);
Guolin Ke's avatar
Guolin Ke committed
333

Guolin Ke's avatar
Guolin Ke committed
334
335
336
  /*!
  * \brief calculate the object function
  */
Guolin Ke's avatar
Guolin Ke committed
337
  virtual void Boosting();
Guolin Ke's avatar
Guolin Ke committed
338

Guolin Ke's avatar
Guolin Ke committed
339
  /*!
Qiwei Ye's avatar
Qiwei Ye committed
340
  * \brief updating score after tree was trained
Guolin Ke's avatar
Guolin Ke committed
341
  * \param tree Trained tree of this iteration
342
  * \param cur_tree_id Current tree for multiclass training
Guolin Ke's avatar
Guolin Ke committed
343
  */
344
  virtual void UpdateScore(const Tree* tree, const int cur_tree_id);
Guolin Ke's avatar
Guolin Ke committed
345

Guolin Ke's avatar
Guolin Ke committed
346
347
348
349
  /*!
  * \brief eval results for one metric

  */
Guolin Ke's avatar
Guolin Ke committed
350
  virtual std::vector<double> EvalOneMetric(const Metric* metric, const double* score) const;
Guolin Ke's avatar
Guolin Ke committed
351

Guolin Ke's avatar
Guolin Ke committed
352
  /*!
Hui Xue's avatar
Hui Xue committed
353
  * \brief Print metric result of current iteration
Guolin Ke's avatar
Guolin Ke committed
354
  * \param iter Current interation
Guolin Ke's avatar
Guolin Ke committed
355
  * \return best_msg if met early_stopping
Guolin Ke's avatar
Guolin Ke committed
356
  */
Guolin Ke's avatar
Guolin Ke committed
357
  std::string OutputMetric(int iter);
358

Guolin Ke's avatar
Guolin Ke committed
359
360
  double BoostFromAverage();

361
362
  /*! \brief current iteration */
  int iter_;
Guolin Ke's avatar
Guolin Ke committed
363
364
365
  /*! \brief Pointer to training data */
  const Dataset* train_data_;
  /*! \brief Config of gbdt */
Guolin Ke's avatar
Guolin Ke committed
366
  std::unique_ptr<BoostingConfig> gbdt_config_;
Hui Xue's avatar
Hui Xue committed
367
  /*! \brief Tree learner, will use this class to learn trees */
368
  std::unique_ptr<TreeLearner> tree_learner_;
Guolin Ke's avatar
Guolin Ke committed
369
  /*! \brief Objective function */
370
  const ObjectiveFunction* objective_function_;
Hui Xue's avatar
Hui Xue committed
371
  /*! \brief Store and update training data's score */
Guolin Ke's avatar
Guolin Ke committed
372
  std::unique_ptr<ScoreUpdater> train_score_updater_;
Guolin Ke's avatar
Guolin Ke committed
373
374
375
  /*! \brief Metrics for training data */
  std::vector<const Metric*> training_metrics_;
  /*! \brief Store and update validation data's scores */
Guolin Ke's avatar
Guolin Ke committed
376
  std::vector<std::unique_ptr<ScoreUpdater>> valid_score_updater_;
Guolin Ke's avatar
Guolin Ke committed
377
378
  /*! \brief Metric for validation data */
  std::vector<std::vector<const Metric*>> valid_metrics_;
wxchan's avatar
wxchan committed
379
380
  /*! \brief Number of rounds for early stopping */
  int early_stopping_round_;
Guolin Ke's avatar
Guolin Ke committed
381
  /*! \brief Best iteration(s) for early stopping */
wxchan's avatar
wxchan committed
382
  std::vector<std::vector<int>> best_iter_;
Guolin Ke's avatar
Guolin Ke committed
383
  /*! \brief Best score(s) for early stopping */
384
  std::vector<std::vector<double>> best_score_;
Guolin Ke's avatar
Guolin Ke committed
385
386
  /*! \brief output message of best iteration */
  std::vector<std::vector<std::string>> best_msg_;
Guolin Ke's avatar
Guolin Ke committed
387
  /*! \brief Trained models(trees) */
Guolin Ke's avatar
Guolin Ke committed
388
  std::vector<std::unique_ptr<Tree>> models_;
Guolin Ke's avatar
Guolin Ke committed
389
390
391
  /*! \brief Max feature index of training data*/
  int max_feature_idx_;
  /*! \brief First order derivative of training data */
392
  std::vector<score_t> gradients_;
Guolin Ke's avatar
Guolin Ke committed
393
  /*! \brief Secend order derivative of training data */
394
  std::vector<score_t> hessians_;
Guolin Ke's avatar
Guolin Ke committed
395
  /*! \brief Store the indices of in-bag data */
Guolin Ke's avatar
Guolin Ke committed
396
  std::vector<data_size_t> bag_data_indices_;
Guolin Ke's avatar
Guolin Ke committed
397
398
  /*! \brief Number of in-bag data */
  data_size_t bag_data_cnt_;
399
400
  /*! \brief Store the indices of in-bag data */
  std::vector<data_size_t> tmp_indices_;
wxchan's avatar
wxchan committed
401
  /*! \brief Number of training data */
Guolin Ke's avatar
Guolin Ke committed
402
  data_size_t num_data_;
403
404
405
  /*! \brief Number of trees per iterations */
  int num_tree_per_iteration_;
  /*! \brief Number of class */
406
  int num_class_;
Guolin Ke's avatar
Guolin Ke committed
407
408
  /*! \brief Index of label column */
  data_size_t label_idx_;
409
  /*! \brief number of used model */
wxchan's avatar
wxchan committed
410
  int num_iteration_for_pred_;
Guolin Ke's avatar
Guolin Ke committed
411
412
  /*! \brief Shrinkage rate for one iteration */
  double shrinkage_rate_;
wxchan's avatar
wxchan committed
413
414
  /*! \brief Number of loaded initial models */
  int num_init_iteration_;
Guolin Ke's avatar
Guolin Ke committed
415
416
  /*! \brief Feature names */
  std::vector<std::string> feature_names_;
Guolin Ke's avatar
Guolin Ke committed
417
  std::vector<std::string> feature_infos_;
418
419
420
421
422
423
424
425
426
427
428
429
  /*! \brief number of threads */
  int num_threads_;
  /*! \brief Buffer for multi-threading bagging */
  std::vector<data_size_t> offsets_buf_;
  /*! \brief Buffer for multi-threading bagging */
  std::vector<data_size_t> left_cnts_buf_;
  /*! \brief Buffer for multi-threading bagging */
  std::vector<data_size_t> right_cnts_buf_;
  /*! \brief Buffer for multi-threading bagging */
  std::vector<data_size_t> left_write_pos_buf_;
  /*! \brief Buffer for multi-threading bagging */
  std::vector<data_size_t> right_write_pos_buf_;
Guolin Ke's avatar
Guolin Ke committed
430
431
  std::unique_ptr<Dataset> tmp_subset_;
  bool is_use_subset_;
432
433
  std::vector<bool> class_need_train_;
  std::vector<double> class_default_output_;
434
  bool is_constant_hessian_;
435
  std::unique_ptr<ObjectiveFunction> loaded_objective_;
Guolin Ke's avatar
Guolin Ke committed
436
  bool average_output_;
Guolin Ke's avatar
Guolin Ke committed
437
  bool need_re_bagging_;
Guolin Ke's avatar
Guolin Ke committed
438
439
440
};

}  // namespace LightGBM
Guolin Ke's avatar
Guolin Ke committed
441
#endif   // LightGBM_BOOSTING_GBDT_H_