gbdt.cpp 17.1 KB
Newer Older
Guolin Ke's avatar
Guolin Ke committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
#include "gbdt.h"

#include <LightGBM/utils/common.h>

#include <LightGBM/feature.h>
#include <LightGBM/objective_function.h>
#include <LightGBM/metric.h>

#include <ctime>

#include <sstream>
#include <chrono>
#include <string>
#include <vector>
15
#include <utility>
Guolin Ke's avatar
Guolin Ke committed
16
17
18

namespace LightGBM {

19
GBDT::GBDT()
20
  : train_score_updater_(nullptr),
Guolin Ke's avatar
Guolin Ke committed
21
22
23
24
25
  gradients_(nullptr), hessians_(nullptr),
  out_of_bag_data_indices_(nullptr), bag_data_indices_(nullptr) {
}

GBDT::~GBDT() {
26
27
28
  for (auto& tree_learner: tree_learner_){
    if (tree_learner != nullptr) { delete tree_learner; }  
  }
Guolin Ke's avatar
Guolin Ke committed
29
30
31
32
33
34
35
36
37
38
39
40
41
  if (gradients_ != nullptr) { delete[] gradients_; }
  if (hessians_ != nullptr) { delete[] hessians_; }
  if (out_of_bag_data_indices_ != nullptr) { delete[] out_of_bag_data_indices_; }
  if (bag_data_indices_ != nullptr) { delete[] bag_data_indices_; }
  for (auto& tree : models_) {
    if (tree != nullptr) { delete tree; }
  }
  if (train_score_updater_ != nullptr) { delete train_score_updater_; }
  for (auto& score_tracker : valid_score_updater_) {
    if (score_tracker != nullptr) { delete score_tracker; }
  }
}

42
43
44
45
46
47
void GBDT::Init(const BoostingConfig* config, const Dataset* train_data, const ObjectiveFunction* object_function,
     const std::vector<const Metric*>& training_metrics) {
  gbdt_config_ = dynamic_cast<const GBDTConfig*>(config);
  iter_ = 0;
  max_feature_idx_ = 0;
  early_stopping_round_ = gbdt_config_->early_stopping_round;
Guolin Ke's avatar
Guolin Ke committed
48
  train_data_ = train_data;
49
50
  num_class_ = config->num_class;
  tree_learner_ = std::vector<TreeLearner*>(num_class_, nullptr);
Guolin Ke's avatar
Guolin Ke committed
51
  // create tree learner
52
53
54
55
56
57
  for (int i = 0; i < num_class_; ++i){
      tree_learner_[i] =
        TreeLearner::CreateTreeLearner(gbdt_config_->tree_learner_type, gbdt_config_->tree_config);
      // init tree learner
      tree_learner_[i]->Init(train_data_);
  }
Guolin Ke's avatar
Guolin Ke committed
58
59
60
61
62
63
  object_function_ = object_function;
  // push training metrics
  for (const auto& metric : training_metrics) {
    training_metrics_.push_back(metric);
  }
  // create score tracker
64
  train_score_updater_ = new ScoreUpdater(train_data_, num_class_);
Guolin Ke's avatar
Guolin Ke committed
65
66
  num_data_ = train_data_->num_data();
  // create buffer for gradients and hessians
67
  if (object_function_ != nullptr) {
68
69
    gradients_ = new score_t[num_data_ * num_class_];
    hessians_ = new score_t[num_data_ * num_class_];
70
  }
Guolin Ke's avatar
Guolin Ke committed
71
72

  // get max feature index
73
  max_feature_idx_ = train_data_->num_total_features() - 1;
Guolin Ke's avatar
Guolin Ke committed
74
75
  // get label index
  label_idx_ = train_data_->label_idx();
Guolin Ke's avatar
Guolin Ke committed
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
  // if need bagging, create buffer
  if (gbdt_config_->bagging_fraction < 1.0 && gbdt_config_->bagging_freq > 0) {
    out_of_bag_data_indices_ = new data_size_t[num_data_];
    bag_data_indices_ = new data_size_t[num_data_];
  } else {
    out_of_bag_data_cnt_ = 0;
    out_of_bag_data_indices_ = nullptr;
    bag_data_cnt_ = num_data_;
    bag_data_indices_ = nullptr;
  }
  // initialize random generator
  random_ = Random(gbdt_config_->bagging_seed);

}

void GBDT::AddDataset(const Dataset* valid_data,
         const std::vector<const Metric*>& valid_metrics) {
  // for a validation dataset, we need its score and metric
94
  valid_score_updater_.push_back(new ScoreUpdater(valid_data, num_class_));
Guolin Ke's avatar
Guolin Ke committed
95
  valid_metrics_.emplace_back();
wxchan's avatar
wxchan committed
96
97
  best_iter_.emplace_back();
  best_score_.emplace_back();
Guolin Ke's avatar
Guolin Ke committed
98
99
  for (const auto& metric : valid_metrics) {
    valid_metrics_.back().push_back(metric);
wxchan's avatar
wxchan committed
100
101
    best_iter_.back().push_back(0);
    best_score_.back().push_back(-1);
Guolin Ke's avatar
Guolin Ke committed
102
103
104
105
  }
}


106
void GBDT::Bagging(int iter, const int curr_class) {
Guolin Ke's avatar
Guolin Ke committed
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
  // if need bagging
  if (out_of_bag_data_indices_ != nullptr && iter % gbdt_config_->bagging_freq == 0) {
    // if doesn't have query data
    if (train_data_->metadata().query_boundaries() == nullptr) {
      bag_data_cnt_ =
        static_cast<data_size_t>(gbdt_config_->bagging_fraction * num_data_);
      out_of_bag_data_cnt_ = num_data_ - bag_data_cnt_;
      data_size_t cur_left_cnt = 0;
      data_size_t cur_right_cnt = 0;
      // random bagging, minimal unit is one record
      for (data_size_t i = 0; i < num_data_; ++i) {
        double prob =
          (bag_data_cnt_ - cur_left_cnt) / static_cast<double>(num_data_ - i);
        if (random_.NextDouble() < prob) {
          bag_data_indices_[cur_left_cnt++] = i;
        } else {
          out_of_bag_data_indices_[cur_right_cnt++] = i;
        }
      }
    } else {
      // if have query data
      const data_size_t* query_boundaries = train_data_->metadata().query_boundaries();
      data_size_t num_query = train_data_->metadata().num_queries();
      data_size_t bag_query_cnt =
          static_cast<data_size_t>(num_query * gbdt_config_->bagging_fraction);
      data_size_t cur_left_query_cnt = 0;
      data_size_t cur_left_cnt = 0;
      data_size_t cur_right_cnt = 0;
      // random bagging, minimal unit is one query
      for (data_size_t i = 0; i < num_query; ++i) {
        double prob =
            (bag_query_cnt - cur_left_query_cnt) / static_cast<double>(num_query - i);
        if (random_.NextDouble() < prob) {
          for (data_size_t j = query_boundaries[i]; j < query_boundaries[i + 1]; ++j) {
            bag_data_indices_[cur_left_cnt++] = j;
          }
          cur_left_query_cnt++;
        } else {
          for (data_size_t j = query_boundaries[i]; j < query_boundaries[i + 1]; ++j) {
            out_of_bag_data_indices_[cur_right_cnt++] = j;
          }
        }
      }
      bag_data_cnt_ = cur_left_cnt;
      out_of_bag_data_cnt_ = num_data_ - bag_data_cnt_;
    }
153
    Log::Info("re-bagging, using %d data to train", bag_data_cnt_);
Guolin Ke's avatar
Guolin Ke committed
154
    // set bagging data to tree learner
155
    tree_learner_[curr_class]->SetBaggingData(bag_data_indices_, bag_data_cnt_);
Guolin Ke's avatar
Guolin Ke committed
156
157
158
  }
}

159
void GBDT::UpdateScoreOutOfBag(const Tree* tree, const int curr_class) {
Hui Xue's avatar
Hui Xue committed
160
  // we need to predict out-of-bag socres of data for boosting
Guolin Ke's avatar
Guolin Ke committed
161
162
  if (out_of_bag_data_indices_ != nullptr) {
    train_score_updater_->
163
      AddScore(tree, out_of_bag_data_indices_, out_of_bag_data_cnt_, curr_class);
Guolin Ke's avatar
Guolin Ke committed
164
165
166
  }
}

167
bool GBDT::TrainOneIter(const score_t* gradient, const score_t* hessian, bool is_eval) {
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
    // boosting first
    if (gradient == nullptr || hessian == nullptr) {
      Boosting();
      gradient = gradients_;
      hessian = hessians_;
    }
    
    for (int curr_class = 0; curr_class < num_class_; ++curr_class){
      // bagging logic
      Bagging(iter_, curr_class);
      
      // train a new tree
      Tree * new_tree = tree_learner_[curr_class]->Train(gradient + curr_class * num_data_, hessian+ curr_class * num_data_);
      // if cannot learn a new tree, then stop
      if (new_tree->num_leaves() <= 1) {
        Log::Info("Can't training anymore, there isn't any leaf meets split requirements.");
        return true;
      }
      
      // shrinkage by learning rate
      new_tree->Shrinkage(gbdt_config_->learning_rate);
      // update score
      UpdateScore(new_tree, curr_class);
      UpdateScoreOutOfBag(new_tree, curr_class);
      
      // add model
      models_.push_back(new_tree);
    }
  
197
198
199
200
201
202
203
204
205
206
  bool is_met_early_stopping = false;
  // print message for metric
  if (is_eval) {
    is_met_early_stopping = OutputMetric(iter_ + 1);
  }
  ++iter_;
  if (is_met_early_stopping) {
    Log::Info("Early stopping at iteration %d, the best iteration round is %d",
      iter_, iter_ - early_stopping_round_);
    // pop last early_stopping_round_ models
207
    for (int i = 0; i < early_stopping_round_ * num_class_; ++i) {
208
209
210
211
212
213
      delete models_.back();
      models_.pop_back();
    }
  }
  return is_met_early_stopping;
  
Guolin Ke's avatar
Guolin Ke committed
214
215
}

216
void GBDT::UpdateScore(const Tree* tree, const int curr_class) {
Guolin Ke's avatar
Guolin Ke committed
217
  // update training score
218
  train_score_updater_->AddScore(tree_learner_[curr_class], curr_class);
Guolin Ke's avatar
Guolin Ke committed
219
  // update validation score
Guolin Ke's avatar
Guolin Ke committed
220
221
  for (auto& score_updater : valid_score_updater_) {
    score_updater->AddScore(tree, curr_class);
Guolin Ke's avatar
Guolin Ke committed
222
223
224
  }
}

wxchan's avatar
wxchan committed
225
226
bool GBDT::OutputMetric(int iter) {
  bool ret = false;
Guolin Ke's avatar
Guolin Ke committed
227
  // print training metric
228
229
230
231
232
233
  if ((iter % gbdt_config_->output_freq) == 0) {
    for (auto& sub_metric : training_metrics_) {
      auto name = sub_metric->GetName();
      auto scores = sub_metric->Eval(train_score_updater_->score());
      Log::Info("Iteration:%d, %s : %s", iter, name, Common::ArrayToString<float>(scores, ' ').c_str());
    }
Guolin Ke's avatar
Guolin Ke committed
234
235
  }
  // print validation metric
236
237
238
239
240
241
242
  if ((iter % gbdt_config_->output_freq) == 0 || early_stopping_round_ > 0) {
    for (size_t i = 0; i < valid_metrics_.size(); ++i) {
      for (size_t j = 0; j < valid_metrics_[i].size(); ++j) {
        auto test_scores = valid_metrics_[i][j]->Eval(valid_score_updater_[i]->score());
        if ((iter % gbdt_config_->output_freq) == 0) {
          auto name = valid_metrics_[i][j]->GetName();
          Log::Info("Iteration:%d, %s : %s", iter, name, Common::ArrayToString<float>(test_scores, ' ').c_str());
wxchan's avatar
wxchan committed
243
        }
244
245
246
247
248
249
250
251
252
253
        if (!ret && early_stopping_round_ > 0) {
          bool the_bigger_the_better = valid_metrics_[i][j]->is_bigger_better();
          if (best_score_[i][j] < 0
            || (!the_bigger_the_better && test_scores.back() < best_score_[i][j])
            || (the_bigger_the_better && test_scores.back() > best_score_[i][j])) {
            best_score_[i][j] = test_scores.back();
            best_iter_[i][j] = iter;
          } else {
            if (iter - best_iter_[i][j] >= early_stopping_round_) ret = true;
          }
wxchan's avatar
wxchan committed
254
255
        }
      }
Guolin Ke's avatar
Guolin Ke committed
256
257
    }
  }
wxchan's avatar
wxchan committed
258
  return ret;
Guolin Ke's avatar
Guolin Ke committed
259
260
}

261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
/*! \brief Get eval result */
std::vector<std::string> GBDT::EvalCurrent(bool is_eval_train) const {
  std::vector<std::string> ret;
  if (is_eval_train) {
    for (auto& sub_metric : training_metrics_) {
      auto name = sub_metric->GetName();
      auto scores = sub_metric->Eval(train_score_updater_->score());
      std::stringstream str_buf;
      str_buf << name << " : " << Common::ArrayToString<float>(scores, ' ');
      ret.emplace_back(str_buf.str());
    }
  }

  for (size_t i = 0; i < valid_metrics_.size(); ++i) {
    for (size_t j = 0; j < valid_metrics_[i].size(); ++j) {
      auto name = valid_metrics_[i][j]->GetName();
      auto test_scores = valid_metrics_[i][j]->Eval(valid_score_updater_[i]->score());
      std::stringstream str_buf;
      str_buf << name << " : " << Common::ArrayToString<float>(test_scores, ' ');
      ret.emplace_back(str_buf.str());
    }
  }
  return ret;
}

/*! \brief Get prediction result */
const std::vector<const score_t*> GBDT::PredictCurrent(bool is_predict_train) const {
  std::vector<const score_t*> ret;
  if (is_predict_train) {
    ret.push_back(train_score_updater_->score());
  }
  for (size_t i = 0; i < valid_metrics_.size(); ++i) {
    ret.push_back(valid_score_updater_[i]->score());
  }
  return ret;
}

Guolin Ke's avatar
Guolin Ke committed
298
void GBDT::Boosting() {
299
300
301
  if (object_function_ == nullptr) {
    Log::Fatal("No object function provided");
  }
Hui Xue's avatar
Hui Xue committed
302
  // objective function will calculate gradients and hessians
Guolin Ke's avatar
Guolin Ke committed
303
304
305
306
  object_function_->
    GetGradients(train_score_updater_->score(), gradients_, hessians_);
}

307
308
309
310
311
312
313
void GBDT::SaveModelToFile(bool is_finish, const char* filename) {

  // first time to this function, open file
  if (saved_model_size_ == -1) {
    model_output_file_.open(filename);
    // output model type
    model_output_file_ << "gbdt" << std::endl;
314
315
    // output number of class
    model_output_file_ << "num_class=" << num_class_ << std::endl;
316
317
318
319
320
321
322
323
324
325
326
327
328
    // output label index
    model_output_file_ << "label_index=" << label_idx_ << std::endl;
    // output max_feature_idx
    model_output_file_ << "max_feature_idx=" << max_feature_idx_ << std::endl;
    // output sigmoid parameter
    model_output_file_ << "sigmoid=" << object_function_->GetSigmoid() << std::endl;
    model_output_file_ << std::endl;
    saved_model_size_ = 0;
  }
  // already saved
  if (!model_output_file_.is_open()) {
    return;
  }
329
  int rest = static_cast<int>(models_.size()) - early_stopping_round_ * num_class_;
330
331
332
333
334
  // output tree models
  for (int i = saved_model_size_; i < rest; ++i) {
    model_output_file_ << "Tree=" << i << std::endl;
    model_output_file_ << models_[i]->ToString() << std::endl;
  }
Guolin Ke's avatar
Guolin Ke committed
335
336
337
  
  saved_model_size_ = Common::Max(saved_model_size_, rest);
  
338
339
340
341
342
343
344
345
346
  model_output_file_.flush();
  // training finished, can close file
  if (is_finish) {
    for (int i = saved_model_size_; i < static_cast<int>(models_.size()); ++i) {
      model_output_file_ << "Tree=" << i << std::endl;
      model_output_file_ << models_[i]->ToString() << std::endl;
    }
    model_output_file_ << std::endl << FeatureImportance() << std::endl;
    model_output_file_.close();
Guolin Ke's avatar
Guolin Ke committed
347
348
349
  }
}

350
void GBDT::ModelsFromString(const std::string& model_str) {
Guolin Ke's avatar
Guolin Ke committed
351
352
353
354
  // use serialized string to restore this object
  models_.clear();
  std::vector<std::string> lines = Common::Split(model_str.c_str(), '\n');
  size_t i = 0;
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
  
  // get number of class
  while (i < lines.size()) {
    size_t find_pos = lines[i].find("num_class=");
    if (find_pos != std::string::npos) {
      std::vector<std::string> strs = Common::Split(lines[i].c_str(), '=');
      Common::Atoi(strs[1].c_str(), &num_class_);
      ++i;
      break;
    } else {
      ++i;
    }
  }
  if (i == lines.size()) {
    Log::Fatal("Model file doesn't contain number of class");
    return;
  }
Guolin Ke's avatar
Guolin Ke committed
372
373

  // get index of label
374
  i = 0;
Guolin Ke's avatar
Guolin Ke committed
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
  while (i < lines.size()) {
    size_t find_pos = lines[i].find("label_index=");
    if (find_pos != std::string::npos) {
      std::vector<std::string> strs = Common::Split(lines[i].c_str(), '=');
      Common::Atoi(strs[1].c_str(), &label_idx_);
      ++i;
      break;
    } else {
      ++i;
    }
  }
  if (i == lines.size()) {
    Log::Fatal("Model file doesn't contain label index");
    return;
  }

Guolin Ke's avatar
Guolin Ke committed
391
  // get max_feature_idx first
Guolin Ke's avatar
Guolin Ke committed
392
  i = 0;
Guolin Ke's avatar
Guolin Ke committed
393
394
395
396
397
398
399
400
401
402
403
404
  while (i < lines.size()) {
    size_t find_pos = lines[i].find("max_feature_idx=");
    if (find_pos != std::string::npos) {
      std::vector<std::string> strs = Common::Split(lines[i].c_str(), '=');
      Common::Atoi(strs[1].c_str(), &max_feature_idx_);
      ++i;
      break;
    } else {
      ++i;
    }
  }
  if (i == lines.size()) {
405
    Log::Fatal("Model file doesn't contain max_feature_idx");
Guolin Ke's avatar
Guolin Ke committed
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
    return;
  }
  // get sigmoid parameter
  i = 0;
  while (i < lines.size()) {
    size_t find_pos = lines[i].find("sigmoid=");
    if (find_pos != std::string::npos) {
      std::vector<std::string> strs = Common::Split(lines[i].c_str(), '=');
      Common::Atof(strs[1].c_str(), &sigmoid_);
      ++i;
      break;
    } else {
      ++i;
    }
  }
  // if sigmoid doesn't exists
  if (i == lines.size()) {
    sigmoid_ = -1.0;
  }
  // get tree models
  i = 0;
  while (i < lines.size()) {
    size_t find_pos = lines[i].find("Tree=");
    if (find_pos != std::string::npos) {
      ++i;
      int start = static_cast<int>(i);
      while (i < lines.size() && lines[i].find("Tree=") == std::string::npos) { ++i; }
      int end = static_cast<int>(i);
      std::string tree_str = Common::Join(lines, start, end, '\n');
      models_.push_back(new Tree(tree_str));
    } else {
      ++i;
    }
  }
440
  Log::Info("%d models has been loaded\n", models_.size());
Guolin Ke's avatar
Guolin Ke committed
441
442
}

443
std::string GBDT::FeatureImportance() const {
444
  std::vector<size_t> feature_importances(max_feature_idx_ + 1, 0);
445
    for (size_t iter = 0; iter < models_.size(); ++iter) {
446
447
        for (int split_idx = 0; split_idx < models_[iter]->num_leaves() - 1; ++split_idx) {
            ++feature_importances[models_[iter]->split_feature_real(split_idx)];
wxchan's avatar
wxchan committed
448
449
        }
    }
450
451
452
453
454
455
456
457
458
    // store the importance first
    std::vector<std::pair<size_t, std::string>> pairs;
    for (size_t i = 0; i < feature_importances.size(); ++i) {
      pairs.emplace_back(feature_importances[i], train_data_->feature_names()[i]);
    }
    // sort the importance
    std::sort(pairs.begin(), pairs.end(),
      [](const std::pair<size_t, std::string>& lhs,
        const std::pair<size_t, std::string>& rhs) {
459
      return lhs.first > rhs.first;
460
    });
461
    std::stringstream str_buf;
462
    // write to model file
463
    str_buf << std::endl << "feature importances:" << std::endl;
464
    for (size_t i = 0; i < pairs.size(); ++i) {
465
      str_buf << pairs[i].second << "=" << std::to_string(pairs[i].first) << std::endl;
466
    }
467
    return str_buf.str();
wxchan's avatar
wxchan committed
468
469
}

470
471
472
473
474
475
float GBDT::PredictRaw(const float* value, int num_used_model) const {
  if (num_used_model < 0) {
    num_used_model = static_cast<int>(models_.size());
  }
  float ret = 0.0f;
  for (int i = 0; i < num_used_model; ++i) {
Guolin Ke's avatar
Guolin Ke committed
476
477
478
479
480
    ret += models_[i]->Predict(value);
  }
  return ret;
}

481
482
483
484
485
486
float GBDT::Predict(const float* value, int num_used_model) const {
  if (num_used_model < 0) {
    num_used_model = static_cast<int>(models_.size());
  }
  float ret = 0.0f;
  for (int i = 0; i < num_used_model; ++i) {
Guolin Ke's avatar
Guolin Ke committed
487
488
489
490
    ret += models_[i]->Predict(value);
  }
  // if need sigmoid transform
  if (sigmoid_ > 0) {
491
    ret = 1.0f / (1.0f + std::exp(- 2.0f * sigmoid_ * ret));
Guolin Ke's avatar
Guolin Ke committed
492
493
494
495
  }
  return ret;
}

496
497
498
499
500
501
502
503
504
505
506
507
508
509
std::vector<float> GBDT::PredictMulticlass(const float* value, int num_used_model) const {
  if (num_used_model < 0) {
      num_used_model = static_cast<int>(models_.size()) / num_class_;
  }
  std::vector<float> ret(num_class_, 0.0f);
  for (int i = 0; i < num_used_model; ++i) {
    for (int j = 0; j < num_class_; ++j){
        ret[j] += models_[i * num_class_ + j] -> Predict(value);
    }
  }
  Common::Softmax(&ret);
  return ret;
}

510
511
512
513
std::vector<int> GBDT::PredictLeafIndex(const float* value, int num_used_model) const {
  if (num_used_model < 0) {
    num_used_model = static_cast<int>(models_.size());
  }
wxchan's avatar
wxchan committed
514
  std::vector<int> ret;
515
  for (int i = 0; i < num_used_model; ++i) {
wxchan's avatar
wxchan committed
516
517
518
519
520
    ret.push_back(models_[i]->PredictLeafIndex(value));
  }
  return ret;
}

Guolin Ke's avatar
Guolin Ke committed
521
}  // namespace LightGBM