gbdt.cpp 29.2 KB
Newer Older
1
2
3
4
/*!
 * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */
Guolin Ke's avatar
Guolin Ke committed
5
6
7
#include "gbdt.h"

#include <LightGBM/metric.h>
Guolin Ke's avatar
Guolin Ke committed
8
#include <LightGBM/network.h>
9
10
11
12
#include <LightGBM/objective_function.h>
#include <LightGBM/prediction_early_stop.h>
#include <LightGBM/utils/common.h>
#include <LightGBM/utils/openmp_wrapper.h>
Guolin Ke's avatar
Guolin Ke committed
13

14
#include <chrono>
Guolin Ke's avatar
Guolin Ke committed
15
16
17
18
19
#include <ctime>
#include <sstream>

namespace LightGBM {

Guolin Ke's avatar
Guolin Ke committed
20

Guolin Ke's avatar
Guolin Ke committed
21
GBDT::GBDT() : iter_(0),
Guolin Ke's avatar
Guolin Ke committed
22
23
24
train_data_(nullptr),
objective_function_(nullptr),
early_stopping_round_(0),
25
es_first_metric_only_(false),
Guolin Ke's avatar
Guolin Ke committed
26
27
28
29
30
31
max_feature_idx_(0),
num_tree_per_iteration_(1),
num_class_(1),
num_iteration_for_pred_(0),
shrinkage_rate_(0.1f),
num_init_iteration_(0),
Guolin Ke's avatar
Guolin Ke committed
32
33
need_re_bagging_(false),
balanced_bagging_(false) {
Guolin Ke's avatar
Guolin Ke committed
34
35
  #pragma omp parallel
  #pragma omp master
Guolin Ke's avatar
Guolin Ke committed
36
37
38
39
  {
    num_threads_ = omp_get_num_threads();
  }
  average_output_ = false;
Guolin Ke's avatar
Guolin Ke committed
40
  tree_learner_ = nullptr;
Guolin Ke's avatar
Guolin Ke committed
41
42
43
44
45
}

GBDT::~GBDT() {
}

Guolin Ke's avatar
Guolin Ke committed
46
void GBDT::Init(const Config* config, const Dataset* train_data, const ObjectiveFunction* objective_function,
47
                const std::vector<const Metric*>& training_metrics) {
Guolin Ke's avatar
Guolin Ke committed
48
  CHECK(train_data != nullptr);
49
  train_data_ = train_data;
50
  iter_ = 0;
wxchan's avatar
wxchan committed
51
  num_iteration_for_pred_ = 0;
52
  max_feature_idx_ = 0;
wxchan's avatar
wxchan committed
53
  num_class_ = config->num_class;
Guolin Ke's avatar
Guolin Ke committed
54
55
  config_ = std::unique_ptr<Config>(new Config(*config));
  early_stopping_round_ = config_->early_stopping_round;
56
  es_first_metric_only_ = config_->first_metric_only;
Guolin Ke's avatar
Guolin Ke committed
57
  shrinkage_rate_ = config_->learning_rate;
58

59
  std::string forced_splits_path = config->forcedsplits_filename;
60
  // load forced_splits file
61
62
63
64
65
66
67
68
  if (forced_splits_path != "") {
      std::ifstream forced_splits_file(forced_splits_path.c_str());
      std::stringstream buffer;
      buffer << forced_splits_file.rdbuf();
      std::string err;
      forced_splits_json_ = Json::parse(buffer.str(), err);
  }

69
70
71
72
  objective_function_ = objective_function;
  num_tree_per_iteration_ = num_class_;
  if (objective_function_ != nullptr) {
    is_constant_hessian_ = objective_function_->IsConstantHessian();
Guolin Ke's avatar
Guolin Ke committed
73
    num_tree_per_iteration_ = objective_function_->NumModelPerIteration();
74
75
76
77
  } else {
    is_constant_hessian_ = false;
  }

Guolin Ke's avatar
Guolin Ke committed
78
  tree_learner_ = std::unique_ptr<TreeLearner>(TreeLearner::CreateTreeLearner(config_->tree_learner, config_->device_type, config_.get()));
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105

  // init tree learner
  tree_learner_->Init(train_data_, is_constant_hessian_);

  // push training metrics
  training_metrics_.clear();
  for (const auto& metric : training_metrics) {
    training_metrics_.push_back(metric);
  }
  training_metrics_.shrink_to_fit();

  train_score_updater_.reset(new ScoreUpdater(train_data_, num_tree_per_iteration_));

  num_data_ = train_data_->num_data();
  // create buffer for gradients and hessians
  if (objective_function_ != nullptr) {
    size_t total_size = static_cast<size_t>(num_data_) * num_tree_per_iteration_;
    gradients_.resize(total_size);
    hessians_.resize(total_size);
  }
  // get max feature index
  max_feature_idx_ = train_data_->num_total_features() - 1;
  // get label index
  label_idx_ = train_data_->label_idx();
  // get feature names
  feature_names_ = train_data_->feature_names();
  feature_infos_ = train_data_->feature_infos();
106
  monotone_constraints_ = config->monotone_constraints;
107
108

  // if need bagging, create buffer
Guolin Ke's avatar
Guolin Ke committed
109
  ResetBaggingConfig(config_.get(), true);
110
111
112
113

  class_need_train_ = std::vector<bool>(num_tree_per_iteration_, true);
  if (objective_function_ != nullptr && objective_function_->SkipEmptyClass()) {
    CHECK(num_tree_per_iteration_ == num_class_);
114
115
    for (int i = 0; i < num_class_; ++i) {
      class_need_train_[i] = objective_function_->ClassNeedTrain(i);
116
117
    }
  }
wxchan's avatar
wxchan committed
118
119
120
}

void GBDT::AddValidDataset(const Dataset* valid_data,
121
                           const std::vector<const Metric*>& valid_metrics) {
wxchan's avatar
wxchan committed
122
  if (!train_data_->CheckAlign(*valid_data)) {
123
    Log::Fatal("Cannot add validation data, since it has different bin mappers with training data");
124
  }
Guolin Ke's avatar
Guolin Ke committed
125
  // for a validation dataset, we need its score and metric
126
  auto new_score_updater = std::unique_ptr<ScoreUpdater>(new ScoreUpdater(valid_data, num_tree_per_iteration_));
wxchan's avatar
wxchan committed
127
128
  // update score
  for (int i = 0; i < iter_; ++i) {
129
130
131
    for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
      auto curr_tree = (i + num_init_iteration_) * num_tree_per_iteration_ + cur_tree_id;
      new_score_updater->AddScore(models_[curr_tree].get(), cur_tree_id);
wxchan's avatar
wxchan committed
132
133
    }
  }
Guolin Ke's avatar
Guolin Ke committed
134
  valid_score_updater_.push_back(std::move(new_score_updater));
Guolin Ke's avatar
Guolin Ke committed
135
136
137
138
  valid_metrics_.emplace_back();
  for (const auto& metric : valid_metrics) {
    valid_metrics_.back().push_back(metric);
  }
Guolin Ke's avatar
Guolin Ke committed
139
  valid_metrics_.back().shrink_to_fit();
140

141
142
143
144
145
146
147
  if (early_stopping_round_ > 0) {
    auto num_metrics = valid_metrics.size();
    if (es_first_metric_only_) { num_metrics = 1; }
    best_iter_.emplace_back(num_metrics, 0);
    best_score_.emplace_back(num_metrics, kMinScore);
    best_msg_.emplace_back(num_metrics);
  }
Guolin Ke's avatar
Guolin Ke committed
148
149
}

Guolin Ke's avatar
Guolin Ke committed
150
151
152
153
154
155
156
157
158
159
void GBDT::Boosting() {
  if (objective_function_ == nullptr) {
    Log::Fatal("No object function provided");
  }
  // objective function will calculate gradients and hessians
  int64_t num_score = 0;
  objective_function_->
    GetGradients(GetTrainingScore(&num_score), gradients_.data(), hessians_.data());
}

160
data_size_t GBDT::BaggingHelper(Random& cur_rand, data_size_t start, data_size_t cnt, data_size_t* buffer) {
161
162
163
  if (cnt <= 0) {
    return 0;
  }
Guolin Ke's avatar
Guolin Ke committed
164
  data_size_t bag_data_cnt = static_cast<data_size_t>(config_->bagging_fraction * cnt);
165
166
  data_size_t cur_left_cnt = 0;
  data_size_t cur_right_cnt = 0;
Guolin Ke's avatar
Guolin Ke committed
167
  auto right_buffer = buffer + bag_data_cnt;
168
169
  // random bagging, minimal unit is one record
  for (data_size_t i = 0; i < cnt; ++i) {
Guolin Ke's avatar
Guolin Ke committed
170
    float prob = (bag_data_cnt - cur_left_cnt) / static_cast<float>(cnt - i);
Guolin Ke's avatar
Guolin Ke committed
171
    if (cur_rand.NextFloat() < prob) {
172
173
      buffer[cur_left_cnt++] = start + i;
    } else {
Guolin Ke's avatar
Guolin Ke committed
174
      right_buffer[cur_right_cnt++] = start + i;
175
176
177
178
179
    }
  }
  CHECK(cur_left_cnt == bag_data_cnt);
  return cur_left_cnt;
}
Guolin Ke's avatar
Guolin Ke committed
180

Guolin Ke's avatar
Guolin Ke committed
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
data_size_t GBDT::BalancedBaggingHelper(Random& cur_rand, data_size_t start, data_size_t cnt, data_size_t* buffer) {
  if (cnt <= 0) {
    return 0;
  }
  auto label_ptr = train_data_->metadata().label();
  data_size_t cur_left_cnt = 0;
  data_size_t cur_right_pos = cnt - 1;
  // from right to left
  auto right_buffer = buffer;
  // random bagging, minimal unit is one record
  for (data_size_t i = 0; i < cnt; ++i) {
    bool is_pos = label_ptr[start + i] > 0;
    bool is_in_bag = false;
    if (is_pos) {
      is_in_bag = cur_rand.NextFloat() < config_->pos_bagging_fraction;
    } else {
      is_in_bag = cur_rand.NextFloat() < config_->neg_bagging_fraction;
    }
    if (is_in_bag) {
      buffer[cur_left_cnt++] = start + i;
    } else {
      right_buffer[cur_right_pos--] = start + i;
    }
  }
  // reverse right buffer
  std::reverse(buffer + cur_left_cnt, buffer + cnt);
  return cur_left_cnt;
}

210
void GBDT::Bagging(int iter) {
Guolin Ke's avatar
Guolin Ke committed
211
  // if need bagging
Guolin Ke's avatar
Guolin Ke committed
212
  if ((bag_data_cnt_ < num_data_ && iter % config_->bagging_freq == 0)
Guolin Ke's avatar
Guolin Ke committed
213
      || need_re_bagging_) {
Guolin Ke's avatar
Guolin Ke committed
214
    need_re_bagging_ = false;
Guolin Ke's avatar
Guolin Ke committed
215
    const data_size_t min_inner_size = 1000;
216
217
    data_size_t inner_size = (num_data_ + num_threads_ - 1) / num_threads_;
    if (inner_size < min_inner_size) { inner_size = min_inner_size; }
218
    OMP_INIT_EX();
219
    #pragma omp parallel for schedule(static, 1)
220
    for (int i = 0; i < num_threads_; ++i) {
221
      OMP_LOOP_EX_BEGIN();
222
223
224
225
226
227
      left_cnts_buf_[i] = 0;
      right_cnts_buf_[i] = 0;
      data_size_t cur_start = i * inner_size;
      if (cur_start > num_data_) { continue; }
      data_size_t cur_cnt = inner_size;
      if (cur_start + cur_cnt > num_data_) { cur_cnt = num_data_ - cur_start; }
Guolin Ke's avatar
Guolin Ke committed
228
      Random cur_rand(config_->bagging_seed + iter * num_threads_ + i);
Guolin Ke's avatar
Guolin Ke committed
229
230
231
232
233
234
      data_size_t cur_left_count = 0;
      if (balanced_bagging_) {
        cur_left_count = BalancedBaggingHelper(cur_rand, cur_start, cur_cnt, tmp_indices_.data() + cur_start);
      } else {
        cur_left_count = BaggingHelper(cur_rand, cur_start, cur_cnt, tmp_indices_.data() + cur_start);
      }
235
236
237
      offsets_buf_[i] = cur_start;
      left_cnts_buf_[i] = cur_left_count;
      right_cnts_buf_[i] = cur_cnt - cur_left_count;
238
      OMP_LOOP_EX_END();
239
    }
240
    OMP_THROW_EX();
241
242
243
244
245
246
247
248
249
    data_size_t left_cnt = 0;
    left_write_pos_buf_[0] = 0;
    right_write_pos_buf_[0] = 0;
    for (int i = 1; i < num_threads_; ++i) {
      left_write_pos_buf_[i] = left_write_pos_buf_[i - 1] + left_cnts_buf_[i - 1];
      right_write_pos_buf_[i] = right_write_pos_buf_[i - 1] + right_cnts_buf_[i - 1];
    }
    left_cnt = left_write_pos_buf_[num_threads_ - 1] + left_cnts_buf_[num_threads_ - 1];

Guolin Ke's avatar
Guolin Ke committed
250
    #pragma omp parallel for schedule(static, 1)
251
    for (int i = 0; i < num_threads_; ++i) {
252
      OMP_LOOP_EX_BEGIN();
253
254
      if (left_cnts_buf_[i] > 0) {
        std::memcpy(bag_data_indices_.data() + left_write_pos_buf_[i],
255
                    tmp_indices_.data() + offsets_buf_[i], left_cnts_buf_[i] * sizeof(data_size_t));
Guolin Ke's avatar
Guolin Ke committed
256
      }
257
258
      if (right_cnts_buf_[i] > 0) {
        std::memcpy(bag_data_indices_.data() + left_cnt + right_write_pos_buf_[i],
259
                    tmp_indices_.data() + offsets_buf_[i] + left_cnts_buf_[i], right_cnts_buf_[i] * sizeof(data_size_t));
Guolin Ke's avatar
Guolin Ke committed
260
      }
261
      OMP_LOOP_EX_END();
Guolin Ke's avatar
Guolin Ke committed
262
    }
263
    OMP_THROW_EX();
Guolin Ke's avatar
Guolin Ke committed
264
    bag_data_cnt_ = left_cnt;
Guolin Ke's avatar
Guolin Ke committed
265
    Log::Debug("Re-bagging, using %d data to train", bag_data_cnt_);
Guolin Ke's avatar
Guolin Ke committed
266
    // set bagging data to tree learner
Guolin Ke's avatar
Guolin Ke committed
267
268
269
270
    if (!is_use_subset_) {
      tree_learner_->SetBaggingData(bag_data_indices_.data(), bag_data_cnt_);
    } else {
      // get subset
Guolin Ke's avatar
Guolin Ke committed
271
272
      tmp_subset_->ReSize(bag_data_cnt_);
      tmp_subset_->CopySubset(train_data_, bag_data_indices_.data(), bag_data_cnt_, false);
Guolin Ke's avatar
Guolin Ke committed
273
274
      tree_learner_->ResetTrainingData(tmp_subset_.get());
    }
Guolin Ke's avatar
Guolin Ke committed
275
276
277
  }
}

Guolin Ke's avatar
Guolin Ke committed
278
279
280
void GBDT::Train(int snapshot_freq, const std::string& model_output_path) {
  bool is_finished = false;
  auto start_time = std::chrono::steady_clock::now();
Guolin Ke's avatar
Guolin Ke committed
281
  for (int iter = 0; iter < config_->num_iterations && !is_finished; ++iter) {
Guolin Ke's avatar
Guolin Ke committed
282
283
284
285
    is_finished = TrainOneIter(nullptr, nullptr);
    if (!is_finished) {
      is_finished = EvalAndCheckEarlyStopping();
    }
Guolin Ke's avatar
Guolin Ke committed
286
287
288
289
290
291
292
    auto end_time = std::chrono::steady_clock::now();
    // output used time per iteration
    Log::Info("%f seconds elapsed, finished iteration %d", std::chrono::duration<double,
              std::milli>(end_time - start_time) * 1e-3, iter + 1);
    if (snapshot_freq > 0
        && (iter + 1) % snapshot_freq == 0) {
      std::string snapshot_out = model_output_path + ".snapshot_iter_" + std::to_string(iter + 1);
293
      SaveModelToFile(0, -1, snapshot_out.c_str());
Guolin Ke's avatar
Guolin Ke committed
294
295
296
297
    }
  }
}

298
299
300
301
302
303
304
305
306
307
308
309
310
void GBDT::RefitTree(const std::vector<std::vector<int>>& tree_leaf_prediction) {
  CHECK(tree_leaf_prediction.size() > 0);
  CHECK(static_cast<size_t>(num_data_) == tree_leaf_prediction.size());
  CHECK(static_cast<size_t>(models_.size()) == tree_leaf_prediction[0].size());
  int num_iterations = static_cast<int>(models_.size() / num_tree_per_iteration_);
  std::vector<int> leaf_pred(num_data_);
  for (int iter = 0; iter < num_iterations; ++iter) {
    Boosting();
    for (int tree_id = 0; tree_id < num_tree_per_iteration_; ++tree_id) {
      int model_index = iter * num_tree_per_iteration_ + tree_id;
      #pragma omp parallel for schedule(static)
      for (int i = 0; i < num_data_; ++i) {
        leaf_pred[i] = tree_leaf_prediction[i][model_index];
Guolin Ke's avatar
Guolin Ke committed
311
        CHECK(leaf_pred[i] < models_[model_index]->num_leaves());
312
313
314
315
316
317
318
319
320
321
322
      }
      size_t bias = static_cast<size_t>(tree_id) * num_data_;
      auto grad = gradients_.data() + bias;
      auto hess = hessians_.data() + bias;
      auto new_tree = tree_learner_->FitByExistingTree(models_[model_index].get(), leaf_pred, grad, hess);
      train_score_updater_->AddScore(tree_learner_.get(), new_tree, tree_id);
      models_[model_index].reset(new_tree);
    }
  }
}

323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
/* If the custom "average" is implemented it will be used inplace of the label average (if enabled)
*
* An improvement to this is to have options to explicitly choose
* (i) standard average
* (ii) custom average if available
* (iii) any user defined scalar bias (e.g. using a new option "init_score" that overrides (i) and (ii) )
*
* (i) and (ii) could be selected as say "auto_init_score" = 0 or 1 etc..
*
*/
double ObtainAutomaticInitialScore(const ObjectiveFunction* fobj, int class_id) {
  double init_score = 0.0;
  if (fobj != nullptr) {
    init_score = fobj->BoostFromScore(class_id);
  }
  if (Network::num_machines() > 1) {
    init_score = Network::GlobalSyncUpByMean(init_score);
  }
  return init_score;
}

Guolin Ke's avatar
Guolin Ke committed
344
double GBDT::BoostFromAverage(int class_id, bool update_scorer) {
345
  // boosting from average label; or customized "average" if implemented for the current objective
346
347
348
  if (models_.empty() && !train_score_updater_->has_init_score() && objective_function_ != nullptr) {
    if (config_->boost_from_average || (train_data_ != nullptr && train_data_->num_features() == 0)) {
      double init_score = ObtainAutomaticInitialScore(objective_function_, class_id);
349
      if (std::fabs(init_score) > kEpsilon) {
Guolin Ke's avatar
Guolin Ke committed
350
351
352
353
354
        if (update_scorer) {
          train_score_updater_->AddScore(init_score, class_id);
          for (auto& score_updater : valid_score_updater_) {
            score_updater->AddScore(init_score, class_id);
          }
355
356
357
        }
        Log::Info("Start training from score %lf", init_score);
        return init_score;
Guolin Ke's avatar
Guolin Ke committed
358
      }
359
360
361
    } else if (std::string(objective_function_->GetName()) == std::string("regression_l1")
               || std::string(objective_function_->GetName()) == std::string("quantile")
               || std::string(objective_function_->GetName()) == std::string("mape")) {
362
      Log::Warning("Disabling boost_from_average in %s may cause the slow convergence", objective_function_->GetName());
363
    }
364
  }
Guolin Ke's avatar
Guolin Ke committed
365
366
  return 0.0f;
}
Guolin Ke's avatar
Guolin Ke committed
367

Guolin Ke's avatar
Guolin Ke committed
368
bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) {
369
  std::vector<double> init_scores(num_tree_per_iteration_, 0.0);
Guolin Ke's avatar
Guolin Ke committed
370
  // boosting first
Guolin Ke's avatar
Guolin Ke committed
371
  if (gradients == nullptr || hessians == nullptr) {
372
    for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
Guolin Ke's avatar
Guolin Ke committed
373
      init_scores[cur_tree_id] = BoostFromAverage(cur_tree_id, true);
374
    }
Guolin Ke's avatar
Guolin Ke committed
375
    Boosting();
Guolin Ke's avatar
Guolin Ke committed
376
377
    gradients = gradients_.data();
    hessians = hessians_.data();
Guolin Ke's avatar
Guolin Ke committed
378
  }
379
380
  // bagging logic
  Bagging(iter_);
Guolin Ke's avatar
Guolin Ke committed
381

Guolin Ke's avatar
Guolin Ke committed
382
  bool should_continue = false;
383
  for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
384
    const size_t bias = static_cast<size_t>(cur_tree_id) * num_data_;
385
    std::unique_ptr<Tree> new_tree(new Tree(2));
386
    if (class_need_train_[cur_tree_id] && train_data_->num_features() > 0) {
Guolin Ke's avatar
Guolin Ke committed
387
388
389
390
391
392
393
394
395
396
397
      auto grad = gradients + bias;
      auto hess = hessians + bias;
      // need to copy gradients for bagging subset.
      if (is_use_subset_ && bag_data_cnt_ < num_data_) {
        for (int i = 0; i < bag_data_cnt_; ++i) {
          gradients_[bias + i] = grad[bag_data_indices_[i]];
          hessians_[bias + i] = hess[bag_data_indices_[i]];
        }
        grad = gradients_.data() + bias;
        hess = hessians_.data() + bias;
      }
398
      new_tree.reset(tree_learner_->Train(grad, hess, is_constant_hessian_, forced_splits_json_));
399
    }
Guolin Ke's avatar
Guolin Ke committed
400

Guolin Ke's avatar
Guolin Ke committed
401
    if (new_tree->num_leaves() > 1) {
Guolin Ke's avatar
Guolin Ke committed
402
      should_continue = true;
403
404
405
      auto score_ptr = train_score_updater_->score() + bias;
      auto residual_getter = [score_ptr](const label_t* label, int i) {return static_cast<double>(label[i]) - score_ptr[i]; };
      tree_learner_->RenewTreeOutput(new_tree.get(), objective_function_, residual_getter,
406
                                     num_data_, bag_data_indices_.data(), bag_data_cnt_);
Guolin Ke's avatar
Guolin Ke committed
407
408
409
      // shrinkage by learning rate
      new_tree->Shrinkage(shrinkage_rate_);
      // update score
410
      UpdateScore(new_tree.get(), cur_tree_id);
411
412
      if (std::fabs(init_scores[cur_tree_id]) > kEpsilon) {
        new_tree->AddBias(init_scores[cur_tree_id]);
Guolin Ke's avatar
Guolin Ke committed
413
      }
414
415
    } else {
      // only add default score one-time
416
417
418
419
420
421
422
423
424
      if (models_.size() < static_cast<size_t>(num_tree_per_iteration_)) {
        double output = 0.0;
        if (!class_need_train_[cur_tree_id]) {
          if (objective_function_ != nullptr) {
            output = objective_function_->BoostFromScore(cur_tree_id);
          }
        } else {
          output = init_scores[cur_tree_id];
        }
425
        new_tree->AsConstantTree(output);
Guolin Ke's avatar
Guolin Ke committed
426
        // updates scores
427
        train_score_updater_->AddScore(output, cur_tree_id);
428
        for (auto& score_updater : valid_score_updater_) {
429
          score_updater->AddScore(output, cur_tree_id);
430
431
432
        }
      }
    }
Guolin Ke's avatar
Guolin Ke committed
433
434
435
    // add model
    models_.push_back(std::move(new_tree));
  }
Guolin Ke's avatar
Guolin Ke committed
436

Guolin Ke's avatar
Guolin Ke committed
437
  if (!should_continue) {
438
    Log::Warning("Stopped training because there are no more leaves that meet the split requirements");
439
440
441
442
    if (models_.size() > static_cast<size_t>(num_tree_per_iteration_)) {
      for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
        models_.pop_back();
      }
Guolin Ke's avatar
Guolin Ke committed
443
444
445
    }
    return true;
  }
446

Guolin Ke's avatar
Guolin Ke committed
447
448
  ++iter_;
  return false;
Guolin Ke's avatar
Guolin Ke committed
449
}
450

wxchan's avatar
wxchan committed
451
void GBDT::RollbackOneIter() {
452
  if (iter_ <= 0) { return; }
wxchan's avatar
wxchan committed
453
  // reset score
454
  for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
Guolin Ke's avatar
Guolin Ke committed
455
    auto curr_tree = models_.size() - num_tree_per_iteration_ + cur_tree_id;
wxchan's avatar
wxchan committed
456
    models_[curr_tree]->Shrinkage(-1.0);
457
    train_score_updater_->AddScore(models_[curr_tree].get(), cur_tree_id);
wxchan's avatar
wxchan committed
458
    for (auto& score_updater : valid_score_updater_) {
459
      score_updater->AddScore(models_[curr_tree].get(), cur_tree_id);
wxchan's avatar
wxchan committed
460
461
462
    }
  }
  // remove model
463
  for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
wxchan's avatar
wxchan committed
464
465
466
467
468
    models_.pop_back();
  }
  --iter_;
}

Guolin Ke's avatar
Guolin Ke committed
469
bool GBDT::EvalAndCheckEarlyStopping() {
470
471
  bool is_met_early_stopping = false;
  // print message for metric
Guolin Ke's avatar
Guolin Ke committed
472
  auto best_msg = OutputMetric(iter_);
Guolin Ke's avatar
Guolin Ke committed
473
474


Guolin Ke's avatar
Guolin Ke committed
475
  is_met_early_stopping = !best_msg.empty();
476
477
  if (is_met_early_stopping) {
    Log::Info("Early stopping at iteration %d, the best iteration round is %d",
478
              iter_, iter_ - early_stopping_round_);
Guolin Ke's avatar
Guolin Ke committed
479
    Log::Info("Output of best iteration round:\n%s", best_msg.c_str());
480
    // pop last early_stopping_round_ models
481
    for (int i = 0; i < early_stopping_round_ * num_tree_per_iteration_; ++i) {
482
483
484
485
      models_.pop_back();
    }
  }
  return is_met_early_stopping;
Guolin Ke's avatar
Guolin Ke committed
486
487
}

488
void GBDT::UpdateScore(const Tree* tree, const int cur_tree_id) {
Guolin Ke's avatar
Guolin Ke committed
489
  // update training score
Guolin Ke's avatar
Guolin Ke committed
490
  if (!is_use_subset_) {
491
    train_score_updater_->AddScore(tree_learner_.get(), tree, cur_tree_id);
Guolin Ke's avatar
Guolin Ke committed
492
493
494
495
496
497

    // we need to predict out-of-bag scores of data for boosting
    if (num_data_ - bag_data_cnt_ > 0) {
      train_score_updater_->AddScore(tree, bag_data_indices_.data() + bag_data_cnt_, num_data_ - bag_data_cnt_, cur_tree_id);
    }

Guolin Ke's avatar
Guolin Ke committed
498
  } else {
499
    train_score_updater_->AddScore(tree, cur_tree_id);
Guolin Ke's avatar
Guolin Ke committed
500
  }
Guolin Ke's avatar
Guolin Ke committed
501
502


Guolin Ke's avatar
Guolin Ke committed
503
  // update validation score
Guolin Ke's avatar
Guolin Ke committed
504
  for (auto& score_updater : valid_score_updater_) {
505
    score_updater->AddScore(tree, cur_tree_id);
Guolin Ke's avatar
Guolin Ke committed
506
507
508
  }
}

Guolin Ke's avatar
Guolin Ke committed
509
510
511
512
std::vector<double> GBDT::EvalOneMetric(const Metric* metric, const double* score) const {
  return metric->Eval(score, objective_function_);
}

Guolin Ke's avatar
Guolin Ke committed
513
std::string GBDT::OutputMetric(int iter) {
Guolin Ke's avatar
Guolin Ke committed
514
  bool need_output = (iter % config_->metric_freq) == 0;
Guolin Ke's avatar
Guolin Ke committed
515
516
  std::string ret = "";
  std::stringstream msg_buf;
517
  std::vector<std::pair<size_t, size_t>> meet_early_stopping_pairs;
Guolin Ke's avatar
Guolin Ke committed
518
  // print training metric
Guolin Ke's avatar
Guolin Ke committed
519
  if (need_output) {
520
521
    for (auto& sub_metric : training_metrics_) {
      auto name = sub_metric->GetName();
Guolin Ke's avatar
Guolin Ke committed
522
      auto scores = EvalOneMetric(sub_metric, train_score_updater_->score());
Guolin Ke's avatar
Guolin Ke committed
523
      for (size_t k = 0; k < name.size(); ++k) {
Guolin Ke's avatar
Guolin Ke committed
524
525
526
527
528
529
        std::stringstream tmp_buf;
        tmp_buf << "Iteration:" << iter
          << ", training " << name[k]
          << " : " << scores[k];
        Log::Info(tmp_buf.str().c_str());
        if (early_stopping_round_ > 0) {
530
          msg_buf << tmp_buf.str() << '\n';
Guolin Ke's avatar
Guolin Ke committed
531
        }
532
      }
533
    }
Guolin Ke's avatar
Guolin Ke committed
534
535
  }
  // print validation metric
Guolin Ke's avatar
Guolin Ke committed
536
  if (need_output || early_stopping_round_ > 0) {
537
538
    for (size_t i = 0; i < valid_metrics_.size(); ++i) {
      for (size_t j = 0; j < valid_metrics_[i].size(); ++j) {
Guolin Ke's avatar
Guolin Ke committed
539
        auto test_scores = EvalOneMetric(valid_metrics_[i][j], valid_score_updater_[i]->score());
Guolin Ke's avatar
Guolin Ke committed
540
541
542
543
544
545
546
547
548
549
        auto name = valid_metrics_[i][j]->GetName();
        for (size_t k = 0; k < name.size(); ++k) {
          std::stringstream tmp_buf;
          tmp_buf << "Iteration:" << iter
            << ", valid_" << i + 1 << " " << name[k]
            << " : " << test_scores[k];
          if (need_output) {
            Log::Info(tmp_buf.str().c_str());
          }
          if (early_stopping_round_ > 0) {
550
            msg_buf << tmp_buf.str() << '\n';
551
          }
wxchan's avatar
wxchan committed
552
        }
553
        if (es_first_metric_only_ && j > 0) { continue; }
Guolin Ke's avatar
Guolin Ke committed
554
        if (ret.empty() && early_stopping_round_ > 0) {
555
556
557
          auto cur_score = valid_metrics_[i][j]->factor_to_bigger_better() * test_scores.back();
          if (cur_score > best_score_[i][j]) {
            best_score_[i][j] = cur_score;
558
            best_iter_[i][j] = iter;
Guolin Ke's avatar
Guolin Ke committed
559
            meet_early_stopping_pairs.emplace_back(i, j);
560
          } else {
Guolin Ke's avatar
Guolin Ke committed
561
            if (iter - best_iter_[i][j] >= early_stopping_round_) { ret = best_msg_[i][j]; }
562
          }
wxchan's avatar
wxchan committed
563
564
        }
      }
Guolin Ke's avatar
Guolin Ke committed
565
566
    }
  }
Guolin Ke's avatar
Guolin Ke committed
567
568
569
  for (auto& pair : meet_early_stopping_pairs) {
    best_msg_[pair.first][pair.second] = msg_buf.str();
  }
wxchan's avatar
wxchan committed
570
  return ret;
Guolin Ke's avatar
Guolin Ke committed
571
572
}

573
/*! \brief Get eval result */
574
std::vector<double> GBDT::GetEvalAt(int data_idx) const {
Guolin Ke's avatar
Guolin Ke committed
575
  CHECK(data_idx >= 0 && data_idx <= static_cast<int>(valid_score_updater_.size()));
576
577
  std::vector<double> ret;
  if (data_idx == 0) {
578
    for (auto& sub_metric : training_metrics_) {
Guolin Ke's avatar
Guolin Ke committed
579
      auto scores = EvalOneMetric(sub_metric, train_score_updater_->score());
580
581
582
      for (auto score : scores) {
        ret.push_back(score);
      }
583
    }
584
  } else {
585
586
    auto used_idx = data_idx - 1;
    for (size_t j = 0; j < valid_metrics_[used_idx].size(); ++j) {
Guolin Ke's avatar
Guolin Ke committed
587
      auto test_scores = EvalOneMetric(valid_metrics_[used_idx][j], valid_score_updater_[used_idx]->score());
588
589
590
      for (auto score : test_scores) {
        ret.push_back(score);
      }
591
592
593
594
595
    }
  }
  return ret;
}

Guolin Ke's avatar
Guolin Ke committed
596
/*! \brief Get training scores result */
597
const double* GBDT::GetTrainingScore(int64_t* out_len) {
598
  *out_len = static_cast<int64_t>(train_score_updater_->num_data()) * num_class_;
Guolin Ke's avatar
Guolin Ke committed
599
  return train_score_updater_->score();
600
601
}

602
603
604
void GBDT::PredictContrib(const double* features, double* output, const PredictionEarlyStopInstance* early_stop) const {
  int early_stop_round_counter = 0;
  // set zero
Guolin Ke's avatar
Guolin Ke committed
605
606
  const int num_features = max_feature_idx_ + 1;
  std::memset(output, 0, sizeof(double) * num_tree_per_iteration_ * (num_features + 1));
607
608
609
  for (int i = 0; i < num_iteration_for_pred_; ++i) {
    // predict all the trees for one iteration
    for (int k = 0; k < num_tree_per_iteration_; ++k) {
Guolin Ke's avatar
Guolin Ke committed
610
      models_[i * num_tree_per_iteration_ + k]->PredictContrib(features, num_features, output + k*(num_features + 1));
611
612
613
614
615
616
617
618
619
620
621
622
    }
    // check early stopping
    ++early_stop_round_counter;
    if (early_stop->round_period == early_stop_round_counter) {
      if (early_stop->callback_function(output, num_tree_per_iteration_)) {
        return;
      }
      early_stop_round_counter = 0;
    }
  }
}

Guolin Ke's avatar
Guolin Ke committed
623
624
void GBDT::GetPredictAt(int data_idx, double* out_result, int64_t* out_len) {
  CHECK(data_idx >= 0 && data_idx <= static_cast<int>(valid_score_updater_.size()));
Guolin Ke's avatar
Guolin Ke committed
625

626
  const double* raw_scores = nullptr;
Guolin Ke's avatar
Guolin Ke committed
627
628
  data_size_t num_data = 0;
  if (data_idx == 0) {
wxchan's avatar
wxchan committed
629
    raw_scores = GetTrainingScore(out_len);
Guolin Ke's avatar
Guolin Ke committed
630
631
632
633
634
    num_data = train_score_updater_->num_data();
  } else {
    auto used_idx = data_idx - 1;
    raw_scores = valid_score_updater_[used_idx]->score();
    num_data = valid_score_updater_[used_idx]->num_data();
635
    *out_len = static_cast<int64_t>(num_data) * num_class_;
Guolin Ke's avatar
Guolin Ke committed
636
  }
Guolin Ke's avatar
Guolin Ke committed
637
  if (objective_function_ != nullptr) {
Guolin Ke's avatar
Guolin Ke committed
638
639
    #pragma omp parallel for schedule(static)
    for (data_size_t i = 0; i < num_data; ++i) {
Guolin Ke's avatar
Guolin Ke committed
640
      std::vector<double> tree_pred(num_tree_per_iteration_);
641
      for (int j = 0; j < num_tree_per_iteration_; ++j) {
Guolin Ke's avatar
Guolin Ke committed
642
        tree_pred[j] = raw_scores[j * num_data + i];
643
      }
Guolin Ke's avatar
Guolin Ke committed
644
645
      std::vector<double> tmp_result(num_class_);
      objective_function_->ConvertOutput(tree_pred.data(), tmp_result.data());
Guolin Ke's avatar
Guolin Ke committed
646
      for (int j = 0; j < num_class_; ++j) {
647
        out_result[j * num_data + i] = static_cast<double>(tmp_result[j]);
Guolin Ke's avatar
Guolin Ke committed
648
649
      }
    }
650
  } else {
Guolin Ke's avatar
Guolin Ke committed
651
    #pragma omp parallel for schedule(static)
Guolin Ke's avatar
Guolin Ke committed
652
    for (data_size_t i = 0; i < num_data; ++i) {
653
      for (int j = 0; j < num_tree_per_iteration_; ++j) {
Guolin Ke's avatar
Guolin Ke committed
654
        out_result[j * num_data + i] = static_cast<double>(raw_scores[j * num_data + i]);
Guolin Ke's avatar
Guolin Ke committed
655
656
657
658
659
      }
    }
  }
}

Guolin Ke's avatar
Guolin Ke committed
660
661
662
void GBDT::ResetTrainingData(const Dataset* train_data, const ObjectiveFunction* objective_function,
                             const std::vector<const Metric*>& training_metrics) {
  if (train_data != train_data_ && !train_data_->CheckAlign(*train_data)) {
663
    Log::Fatal("Cannot reset training data, since new training data has different bin mappers");
wxchan's avatar
wxchan committed
664
665
  }

Guolin Ke's avatar
Guolin Ke committed
666
667
668
669
670
671
  objective_function_ = objective_function;
  if (objective_function_ != nullptr) {
    is_constant_hessian_ = objective_function_->IsConstantHessian();
    CHECK(num_tree_per_iteration_ == objective_function_->NumModelPerIteration());
  } else {
    is_constant_hessian_ = false;
672
673
  }

Guolin Ke's avatar
Guolin Ke committed
674
675
676
677
  // push training metrics
  training_metrics_.clear();
  for (const auto& metric : training_metrics) {
    training_metrics_.push_back(metric);
678
  }
Guolin Ke's avatar
Guolin Ke committed
679
  training_metrics_.shrink_to_fit();
680

Guolin Ke's avatar
Guolin Ke committed
681
682
683
684
685
  if (train_data != train_data_) {
    train_data_ = train_data;
    // not same training data, need reset score and others
    // create score tracker
    train_score_updater_.reset(new ScoreUpdater(train_data_, num_tree_per_iteration_));
686

Guolin Ke's avatar
Guolin Ke committed
687
688
689
690
691
692
    // update score
    for (int i = 0; i < iter_; ++i) {
      for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
        auto curr_tree = (i + num_init_iteration_) * num_tree_per_iteration_ + cur_tree_id;
        train_score_updater_->AddScore(models_[curr_tree].get(), cur_tree_id);
      }
693
694
    }

Guolin Ke's avatar
Guolin Ke committed
695
    num_data_ = train_data_->num_data();
696

Guolin Ke's avatar
Guolin Ke committed
697
698
699
700
701
702
    // create buffer for gradients and hessians
    if (objective_function_ != nullptr) {
      size_t total_size = static_cast<size_t>(num_data_) * num_tree_per_iteration_;
      gradients_.resize(total_size);
      hessians_.resize(total_size);
    }
703

Guolin Ke's avatar
Guolin Ke committed
704
705
706
707
    max_feature_idx_ = train_data_->num_total_features() - 1;
    label_idx_ = train_data_->label_idx();
    feature_names_ = train_data_->feature_names();
    feature_infos_ = train_data_->feature_infos();
708

Guolin Ke's avatar
Guolin Ke committed
709
    tree_learner_->ResetTrainingData(train_data);
Guolin Ke's avatar
Guolin Ke committed
710
    ResetBaggingConfig(config_.get(), true);
711
  }
712
713
}

Guolin Ke's avatar
Guolin Ke committed
714
715
void GBDT::ResetConfig(const Config* config) {
  auto new_config = std::unique_ptr<Config>(new Config(*config));
Guolin Ke's avatar
Guolin Ke committed
716
717
718
  early_stopping_round_ = new_config->early_stopping_round;
  shrinkage_rate_ = new_config->learning_rate;
  if (tree_learner_ != nullptr) {
Guolin Ke's avatar
Guolin Ke committed
719
    tree_learner_->ResetConfig(new_config.get());
720
  }
Guolin Ke's avatar
Guolin Ke committed
721
722
  if (train_data_ != nullptr) {
    ResetBaggingConfig(new_config.get(), false);
723
  }
Guolin Ke's avatar
Guolin Ke committed
724
  config_.reset(new_config.release());
Guolin Ke's avatar
Guolin Ke committed
725
726
}

Guolin Ke's avatar
Guolin Ke committed
727
void GBDT::ResetBaggingConfig(const Config* config, bool is_change_dataset) {
Guolin Ke's avatar
Guolin Ke committed
728
  // if need bagging, create buffer
Guolin Ke's avatar
Guolin Ke committed
729
730
731
732
733
734
  data_size_t num_pos_data = 0;
  if (objective_function_ != nullptr) {
    num_pos_data = objective_function_->NumPositiveData();
  }
  bool balance_bagging_cond = (config->pos_bagging_fraction < 1.0 || config->neg_bagging_fraction < 1.0) && (num_pos_data > 0);
  if ((config->bagging_fraction < 1.0 || balance_bagging_cond) && config->bagging_freq > 0) {
735
736
    need_re_bagging_ = false;
    if (!is_change_dataset &&
Guolin Ke's avatar
Guolin Ke committed
737
738
      config_.get() != nullptr && config_->bagging_fraction == config->bagging_fraction && config_->bagging_freq == config->bagging_freq
      && config_->pos_bagging_fraction == config->pos_bagging_fraction && config_->neg_bagging_fraction == config->neg_bagging_fraction) {
739
740
      return;
    }
Guolin Ke's avatar
Guolin Ke committed
741
742
    if (balance_bagging_cond) {
      balanced_bagging_ = true;
743
      bag_data_cnt_ = static_cast<data_size_t>(num_pos_data * config->pos_bagging_fraction)
Guolin Ke's avatar
Guolin Ke committed
744
745
746
747
                      + static_cast<data_size_t>((num_data_ - num_pos_data) * config->neg_bagging_fraction);
    } else {
      bag_data_cnt_ = static_cast<data_size_t>(config->bagging_fraction * num_data_);
    }
Guolin Ke's avatar
Guolin Ke committed
748
749
    bag_data_indices_.resize(num_data_);
    tmp_indices_.resize(num_data_);
750

Guolin Ke's avatar
Guolin Ke committed
751
752
753
754
755
    offsets_buf_.resize(num_threads_);
    left_cnts_buf_.resize(num_threads_);
    right_cnts_buf_.resize(num_threads_);
    left_write_pos_buf_.resize(num_threads_);
    right_write_pos_buf_.resize(num_threads_);
756

Guolin Ke's avatar
Guolin Ke committed
757
    double average_bag_rate = (bag_data_cnt_ / num_data_) / config->bagging_freq;
Guolin Ke's avatar
Guolin Ke committed
758
759
760
761
762
    int sparse_group = 0;
    for (int i = 0; i < train_data_->num_feature_groups(); ++i) {
      if (train_data_->FeatureGroupIsSparse(i)) {
        ++sparse_group;
      }
Guolin Ke's avatar
Guolin Ke committed
763
    }
Guolin Ke's avatar
Guolin Ke committed
764
765
766
767
768
769
770
771
772
773
    is_use_subset_ = false;
    const int group_threshold_usesubset = 100;
    const int sparse_group_threshold_usesubset = train_data_->num_feature_groups() / 4;
    if (average_bag_rate <= 0.5
        && (train_data_->num_feature_groups() < group_threshold_usesubset || sparse_group < sparse_group_threshold_usesubset)) {
      if (tmp_subset_ == nullptr || is_change_dataset) {
        tmp_subset_.reset(new Dataset(bag_data_cnt_));
        tmp_subset_->CopyFeatureMapperFrom(train_data_);
      }
      is_use_subset_ = true;
774
      Log::Debug("Use subset for bagging");
Guolin Ke's avatar
Guolin Ke committed
775
776
    }

777
    need_re_bagging_ = true;
778

Guolin Ke's avatar
Guolin Ke committed
779
780
781
782
783
    if (is_use_subset_ && bag_data_cnt_ < num_data_) {
      if (objective_function_ == nullptr) {
        size_t total_size = static_cast<size_t>(num_data_) * num_tree_per_iteration_;
        gradients_.resize(total_size);
        hessians_.resize(total_size);
784
      }
785
    }
786
  } else {
Guolin Ke's avatar
Guolin Ke committed
787
788
789
790
    bag_data_cnt_ = num_data_;
    bag_data_indices_.clear();
    tmp_indices_.clear();
    is_use_subset_ = false;
791
  }
wxchan's avatar
wxchan committed
792
793
}

Guolin Ke's avatar
Guolin Ke committed
794
}  // namespace LightGBM