"python-package/vscode:/vscode.git/clone" did not exist on "ef408f552a293f3bc7e98f89b9d15564e0b17938"
gbdt.cpp 29.1 KB
Newer Older
1
2
3
4
/*!
 * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */
Guolin Ke's avatar
Guolin Ke committed
5
6
7
#include "gbdt.h"

#include <LightGBM/metric.h>
Guolin Ke's avatar
Guolin Ke committed
8
#include <LightGBM/network.h>
9
10
11
12
#include <LightGBM/objective_function.h>
#include <LightGBM/prediction_early_stop.h>
#include <LightGBM/utils/common.h>
#include <LightGBM/utils/openmp_wrapper.h>
Guolin Ke's avatar
Guolin Ke committed
13

14
#include <chrono>
Guolin Ke's avatar
Guolin Ke committed
15
16
17
18
19
#include <ctime>
#include <sstream>

namespace LightGBM {

Guolin Ke's avatar
Guolin Ke committed
20

Guolin Ke's avatar
Guolin Ke committed
21
GBDT::GBDT() : iter_(0),
Guolin Ke's avatar
Guolin Ke committed
22
23
24
train_data_(nullptr),
objective_function_(nullptr),
early_stopping_round_(0),
25
es_first_metric_only_(false),
Guolin Ke's avatar
Guolin Ke committed
26
27
28
29
30
31
max_feature_idx_(0),
num_tree_per_iteration_(1),
num_class_(1),
num_iteration_for_pred_(0),
shrinkage_rate_(0.1f),
num_init_iteration_(0),
Guolin Ke's avatar
Guolin Ke committed
32
33
need_re_bagging_(false),
balanced_bagging_(false) {
Guolin Ke's avatar
Guolin Ke committed
34
35
  #pragma omp parallel
  #pragma omp master
Guolin Ke's avatar
Guolin Ke committed
36
37
38
39
  {
    num_threads_ = omp_get_num_threads();
  }
  average_output_ = false;
Guolin Ke's avatar
Guolin Ke committed
40
  tree_learner_ = nullptr;
Guolin Ke's avatar
Guolin Ke committed
41
42
43
44
45
}

GBDT::~GBDT() {
}

Guolin Ke's avatar
Guolin Ke committed
46
void GBDT::Init(const Config* config, const Dataset* train_data, const ObjectiveFunction* objective_function,
47
                const std::vector<const Metric*>& training_metrics) {
Guolin Ke's avatar
Guolin Ke committed
48
  CHECK(train_data != nullptr);
49
  train_data_ = train_data;
50
  iter_ = 0;
wxchan's avatar
wxchan committed
51
  num_iteration_for_pred_ = 0;
52
  max_feature_idx_ = 0;
wxchan's avatar
wxchan committed
53
  num_class_ = config->num_class;
Guolin Ke's avatar
Guolin Ke committed
54
55
  config_ = std::unique_ptr<Config>(new Config(*config));
  early_stopping_round_ = config_->early_stopping_round;
56
  es_first_metric_only_ = config_->first_metric_only;
Guolin Ke's avatar
Guolin Ke committed
57
  shrinkage_rate_ = config_->learning_rate;
58

59
  std::string forced_splits_path = config->forcedsplits_filename;
60
  // load forced_splits file
61
62
63
64
65
66
67
68
  if (forced_splits_path != "") {
      std::ifstream forced_splits_file(forced_splits_path.c_str());
      std::stringstream buffer;
      buffer << forced_splits_file.rdbuf();
      std::string err;
      forced_splits_json_ = Json::parse(buffer.str(), err);
  }

69
70
71
72
  objective_function_ = objective_function;
  num_tree_per_iteration_ = num_class_;
  if (objective_function_ != nullptr) {
    is_constant_hessian_ = objective_function_->IsConstantHessian();
Guolin Ke's avatar
Guolin Ke committed
73
    num_tree_per_iteration_ = objective_function_->NumModelPerIteration();
74
75
76
77
  } else {
    is_constant_hessian_ = false;
  }

Guolin Ke's avatar
Guolin Ke committed
78
  tree_learner_ = std::unique_ptr<TreeLearner>(TreeLearner::CreateTreeLearner(config_->tree_learner, config_->device_type, config_.get()));
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107

  // init tree learner
  tree_learner_->Init(train_data_, is_constant_hessian_);

  // push training metrics
  training_metrics_.clear();
  for (const auto& metric : training_metrics) {
    training_metrics_.push_back(metric);
  }
  training_metrics_.shrink_to_fit();

  train_score_updater_.reset(new ScoreUpdater(train_data_, num_tree_per_iteration_));

  num_data_ = train_data_->num_data();
  // create buffer for gradients and hessians
  if (objective_function_ != nullptr) {
    size_t total_size = static_cast<size_t>(num_data_) * num_tree_per_iteration_;
    gradients_.resize(total_size);
    hessians_.resize(total_size);
  }
  // get max feature index
  max_feature_idx_ = train_data_->num_total_features() - 1;
  // get label index
  label_idx_ = train_data_->label_idx();
  // get feature names
  feature_names_ = train_data_->feature_names();
  feature_infos_ = train_data_->feature_infos();

  // if need bagging, create buffer
Guolin Ke's avatar
Guolin Ke committed
108
  ResetBaggingConfig(config_.get(), true);
109
110
111
112

  class_need_train_ = std::vector<bool>(num_tree_per_iteration_, true);
  if (objective_function_ != nullptr && objective_function_->SkipEmptyClass()) {
    CHECK(num_tree_per_iteration_ == num_class_);
113
114
    for (int i = 0; i < num_class_; ++i) {
      class_need_train_[i] = objective_function_->ClassNeedTrain(i);
115
116
    }
  }
wxchan's avatar
wxchan committed
117
118
119
}

void GBDT::AddValidDataset(const Dataset* valid_data,
120
                           const std::vector<const Metric*>& valid_metrics) {
wxchan's avatar
wxchan committed
121
  if (!train_data_->CheckAlign(*valid_data)) {
122
    Log::Fatal("Cannot add validation data, since it has different bin mappers with training data");
123
  }
Guolin Ke's avatar
Guolin Ke committed
124
  // for a validation dataset, we need its score and metric
125
  auto new_score_updater = std::unique_ptr<ScoreUpdater>(new ScoreUpdater(valid_data, num_tree_per_iteration_));
wxchan's avatar
wxchan committed
126
127
  // update score
  for (int i = 0; i < iter_; ++i) {
128
129
130
    for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
      auto curr_tree = (i + num_init_iteration_) * num_tree_per_iteration_ + cur_tree_id;
      new_score_updater->AddScore(models_[curr_tree].get(), cur_tree_id);
wxchan's avatar
wxchan committed
131
132
    }
  }
Guolin Ke's avatar
Guolin Ke committed
133
  valid_score_updater_.push_back(std::move(new_score_updater));
Guolin Ke's avatar
Guolin Ke committed
134
135
136
137
  valid_metrics_.emplace_back();
  for (const auto& metric : valid_metrics) {
    valid_metrics_.back().push_back(metric);
  }
Guolin Ke's avatar
Guolin Ke committed
138
  valid_metrics_.back().shrink_to_fit();
139

140
141
142
143
144
145
146
  if (early_stopping_round_ > 0) {
    auto num_metrics = valid_metrics.size();
    if (es_first_metric_only_) { num_metrics = 1; }
    best_iter_.emplace_back(num_metrics, 0);
    best_score_.emplace_back(num_metrics, kMinScore);
    best_msg_.emplace_back(num_metrics);
  }
Guolin Ke's avatar
Guolin Ke committed
147
148
}

Guolin Ke's avatar
Guolin Ke committed
149
150
151
152
153
154
155
156
157
158
void GBDT::Boosting() {
  if (objective_function_ == nullptr) {
    Log::Fatal("No object function provided");
  }
  // objective function will calculate gradients and hessians
  int64_t num_score = 0;
  objective_function_->
    GetGradients(GetTrainingScore(&num_score), gradients_.data(), hessians_.data());
}

159
data_size_t GBDT::BaggingHelper(Random& cur_rand, data_size_t start, data_size_t cnt, data_size_t* buffer) {
160
161
162
  if (cnt <= 0) {
    return 0;
  }
Guolin Ke's avatar
Guolin Ke committed
163
  data_size_t bag_data_cnt = static_cast<data_size_t>(config_->bagging_fraction * cnt);
164
165
  data_size_t cur_left_cnt = 0;
  data_size_t cur_right_cnt = 0;
Guolin Ke's avatar
Guolin Ke committed
166
  auto right_buffer = buffer + bag_data_cnt;
167
168
  // random bagging, minimal unit is one record
  for (data_size_t i = 0; i < cnt; ++i) {
Guolin Ke's avatar
Guolin Ke committed
169
    float prob = (bag_data_cnt - cur_left_cnt) / static_cast<float>(cnt - i);
Guolin Ke's avatar
Guolin Ke committed
170
    if (cur_rand.NextFloat() < prob) {
171
172
      buffer[cur_left_cnt++] = start + i;
    } else {
Guolin Ke's avatar
Guolin Ke committed
173
      right_buffer[cur_right_cnt++] = start + i;
174
175
176
177
178
    }
  }
  CHECK(cur_left_cnt == bag_data_cnt);
  return cur_left_cnt;
}
Guolin Ke's avatar
Guolin Ke committed
179

Guolin Ke's avatar
Guolin Ke committed
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
data_size_t GBDT::BalancedBaggingHelper(Random& cur_rand, data_size_t start, data_size_t cnt, data_size_t* buffer) {
  if (cnt <= 0) {
    return 0;
  }
  auto label_ptr = train_data_->metadata().label();
  data_size_t cur_left_cnt = 0;
  data_size_t cur_right_pos = cnt - 1;
  // from right to left
  auto right_buffer = buffer;
  // random bagging, minimal unit is one record
  for (data_size_t i = 0; i < cnt; ++i) {
    bool is_pos = label_ptr[start + i] > 0;
    bool is_in_bag = false;
    if (is_pos) {
      is_in_bag = cur_rand.NextFloat() < config_->pos_bagging_fraction;
    } else {
      is_in_bag = cur_rand.NextFloat() < config_->neg_bagging_fraction;
    }
    if (is_in_bag) {
      buffer[cur_left_cnt++] = start + i;
    } else {
      right_buffer[cur_right_pos--] = start + i;
    }
  }
  // reverse right buffer
  std::reverse(buffer + cur_left_cnt, buffer + cnt);
  return cur_left_cnt;
}

209
void GBDT::Bagging(int iter) {
Guolin Ke's avatar
Guolin Ke committed
210
  // if need bagging
Guolin Ke's avatar
Guolin Ke committed
211
  if ((bag_data_cnt_ < num_data_ && iter % config_->bagging_freq == 0)
Guolin Ke's avatar
Guolin Ke committed
212
      || need_re_bagging_) {
Guolin Ke's avatar
Guolin Ke committed
213
    need_re_bagging_ = false;
Guolin Ke's avatar
Guolin Ke committed
214
    const data_size_t min_inner_size = 1000;
215
216
    data_size_t inner_size = (num_data_ + num_threads_ - 1) / num_threads_;
    if (inner_size < min_inner_size) { inner_size = min_inner_size; }
217
    OMP_INIT_EX();
218
    #pragma omp parallel for schedule(static, 1)
219
    for (int i = 0; i < num_threads_; ++i) {
220
      OMP_LOOP_EX_BEGIN();
221
222
223
224
225
226
      left_cnts_buf_[i] = 0;
      right_cnts_buf_[i] = 0;
      data_size_t cur_start = i * inner_size;
      if (cur_start > num_data_) { continue; }
      data_size_t cur_cnt = inner_size;
      if (cur_start + cur_cnt > num_data_) { cur_cnt = num_data_ - cur_start; }
Guolin Ke's avatar
Guolin Ke committed
227
      Random cur_rand(config_->bagging_seed + iter * num_threads_ + i);
Guolin Ke's avatar
Guolin Ke committed
228
229
230
231
232
233
      data_size_t cur_left_count = 0;
      if (balanced_bagging_) {
        cur_left_count = BalancedBaggingHelper(cur_rand, cur_start, cur_cnt, tmp_indices_.data() + cur_start);
      } else {
        cur_left_count = BaggingHelper(cur_rand, cur_start, cur_cnt, tmp_indices_.data() + cur_start);
      }
234
235
236
      offsets_buf_[i] = cur_start;
      left_cnts_buf_[i] = cur_left_count;
      right_cnts_buf_[i] = cur_cnt - cur_left_count;
237
      OMP_LOOP_EX_END();
238
    }
239
    OMP_THROW_EX();
240
241
242
243
244
245
246
247
248
    data_size_t left_cnt = 0;
    left_write_pos_buf_[0] = 0;
    right_write_pos_buf_[0] = 0;
    for (int i = 1; i < num_threads_; ++i) {
      left_write_pos_buf_[i] = left_write_pos_buf_[i - 1] + left_cnts_buf_[i - 1];
      right_write_pos_buf_[i] = right_write_pos_buf_[i - 1] + right_cnts_buf_[i - 1];
    }
    left_cnt = left_write_pos_buf_[num_threads_ - 1] + left_cnts_buf_[num_threads_ - 1];

Guolin Ke's avatar
Guolin Ke committed
249
    #pragma omp parallel for schedule(static, 1)
250
    for (int i = 0; i < num_threads_; ++i) {
251
      OMP_LOOP_EX_BEGIN();
252
253
      if (left_cnts_buf_[i] > 0) {
        std::memcpy(bag_data_indices_.data() + left_write_pos_buf_[i],
254
                    tmp_indices_.data() + offsets_buf_[i], left_cnts_buf_[i] * sizeof(data_size_t));
Guolin Ke's avatar
Guolin Ke committed
255
      }
256
257
      if (right_cnts_buf_[i] > 0) {
        std::memcpy(bag_data_indices_.data() + left_cnt + right_write_pos_buf_[i],
258
                    tmp_indices_.data() + offsets_buf_[i] + left_cnts_buf_[i], right_cnts_buf_[i] * sizeof(data_size_t));
Guolin Ke's avatar
Guolin Ke committed
259
      }
260
      OMP_LOOP_EX_END();
Guolin Ke's avatar
Guolin Ke committed
261
    }
262
    OMP_THROW_EX();
Guolin Ke's avatar
Guolin Ke committed
263
    bag_data_cnt_ = left_cnt;
Guolin Ke's avatar
Guolin Ke committed
264
    Log::Debug("Re-bagging, using %d data to train", bag_data_cnt_);
Guolin Ke's avatar
Guolin Ke committed
265
    // set bagging data to tree learner
Guolin Ke's avatar
Guolin Ke committed
266
267
268
269
    if (!is_use_subset_) {
      tree_learner_->SetBaggingData(bag_data_indices_.data(), bag_data_cnt_);
    } else {
      // get subset
Guolin Ke's avatar
Guolin Ke committed
270
271
      tmp_subset_->ReSize(bag_data_cnt_);
      tmp_subset_->CopySubset(train_data_, bag_data_indices_.data(), bag_data_cnt_, false);
Guolin Ke's avatar
Guolin Ke committed
272
273
      tree_learner_->ResetTrainingData(tmp_subset_.get());
    }
Guolin Ke's avatar
Guolin Ke committed
274
275
276
  }
}

Guolin Ke's avatar
Guolin Ke committed
277
278
279
void GBDT::Train(int snapshot_freq, const std::string& model_output_path) {
  bool is_finished = false;
  auto start_time = std::chrono::steady_clock::now();
Guolin Ke's avatar
Guolin Ke committed
280
  for (int iter = 0; iter < config_->num_iterations && !is_finished; ++iter) {
Guolin Ke's avatar
Guolin Ke committed
281
282
283
284
    is_finished = TrainOneIter(nullptr, nullptr);
    if (!is_finished) {
      is_finished = EvalAndCheckEarlyStopping();
    }
Guolin Ke's avatar
Guolin Ke committed
285
286
287
288
289
290
291
    auto end_time = std::chrono::steady_clock::now();
    // output used time per iteration
    Log::Info("%f seconds elapsed, finished iteration %d", std::chrono::duration<double,
              std::milli>(end_time - start_time) * 1e-3, iter + 1);
    if (snapshot_freq > 0
        && (iter + 1) % snapshot_freq == 0) {
      std::string snapshot_out = model_output_path + ".snapshot_iter_" + std::to_string(iter + 1);
292
      SaveModelToFile(0, -1, snapshot_out.c_str());
Guolin Ke's avatar
Guolin Ke committed
293
294
295
296
    }
  }
}

297
298
299
300
301
302
303
304
305
306
307
308
309
void GBDT::RefitTree(const std::vector<std::vector<int>>& tree_leaf_prediction) {
  CHECK(tree_leaf_prediction.size() > 0);
  CHECK(static_cast<size_t>(num_data_) == tree_leaf_prediction.size());
  CHECK(static_cast<size_t>(models_.size()) == tree_leaf_prediction[0].size());
  int num_iterations = static_cast<int>(models_.size() / num_tree_per_iteration_);
  std::vector<int> leaf_pred(num_data_);
  for (int iter = 0; iter < num_iterations; ++iter) {
    Boosting();
    for (int tree_id = 0; tree_id < num_tree_per_iteration_; ++tree_id) {
      int model_index = iter * num_tree_per_iteration_ + tree_id;
      #pragma omp parallel for schedule(static)
      for (int i = 0; i < num_data_; ++i) {
        leaf_pred[i] = tree_leaf_prediction[i][model_index];
Guolin Ke's avatar
Guolin Ke committed
310
        CHECK(leaf_pred[i] < models_[model_index]->num_leaves());
311
312
313
314
315
316
317
318
319
320
321
      }
      size_t bias = static_cast<size_t>(tree_id) * num_data_;
      auto grad = gradients_.data() + bias;
      auto hess = hessians_.data() + bias;
      auto new_tree = tree_learner_->FitByExistingTree(models_[model_index].get(), leaf_pred, grad, hess);
      train_score_updater_->AddScore(tree_learner_.get(), new_tree, tree_id);
      models_[model_index].reset(new_tree);
    }
  }
}

322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
/* If the custom "average" is implemented it will be used inplace of the label average (if enabled)
*
* An improvement to this is to have options to explicitly choose
* (i) standard average
* (ii) custom average if available
* (iii) any user defined scalar bias (e.g. using a new option "init_score" that overrides (i) and (ii) )
*
* (i) and (ii) could be selected as say "auto_init_score" = 0 or 1 etc..
*
*/
double ObtainAutomaticInitialScore(const ObjectiveFunction* fobj, int class_id) {
  double init_score = 0.0;
  if (fobj != nullptr) {
    init_score = fobj->BoostFromScore(class_id);
  }
  if (Network::num_machines() > 1) {
    init_score = Network::GlobalSyncUpByMean(init_score);
  }
  return init_score;
}

Guolin Ke's avatar
Guolin Ke committed
343
double GBDT::BoostFromAverage(int class_id, bool update_scorer) {
344
  // boosting from average label; or customized "average" if implemented for the current objective
345
346
347
  if (models_.empty() && !train_score_updater_->has_init_score() && objective_function_ != nullptr) {
    if (config_->boost_from_average || (train_data_ != nullptr && train_data_->num_features() == 0)) {
      double init_score = ObtainAutomaticInitialScore(objective_function_, class_id);
348
      if (std::fabs(init_score) > kEpsilon) {
Guolin Ke's avatar
Guolin Ke committed
349
350
351
352
353
        if (update_scorer) {
          train_score_updater_->AddScore(init_score, class_id);
          for (auto& score_updater : valid_score_updater_) {
            score_updater->AddScore(init_score, class_id);
          }
354
355
356
        }
        Log::Info("Start training from score %lf", init_score);
        return init_score;
Guolin Ke's avatar
Guolin Ke committed
357
      }
358
359
360
    } else if (std::string(objective_function_->GetName()) == std::string("regression_l1")
               || std::string(objective_function_->GetName()) == std::string("quantile")
               || std::string(objective_function_->GetName()) == std::string("mape")) {
361
      Log::Warning("Disabling boost_from_average in %s may cause the slow convergence", objective_function_->GetName());
362
    }
363
  }
Guolin Ke's avatar
Guolin Ke committed
364
365
  return 0.0f;
}
Guolin Ke's avatar
Guolin Ke committed
366

Guolin Ke's avatar
Guolin Ke committed
367
bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) {
368
  std::vector<double> init_scores(num_tree_per_iteration_, 0.0);
Guolin Ke's avatar
Guolin Ke committed
369
  // boosting first
Guolin Ke's avatar
Guolin Ke committed
370
  if (gradients == nullptr || hessians == nullptr) {
371
    for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
Guolin Ke's avatar
Guolin Ke committed
372
      init_scores[cur_tree_id] = BoostFromAverage(cur_tree_id, true);
373
    }
Guolin Ke's avatar
Guolin Ke committed
374
    Boosting();
Guolin Ke's avatar
Guolin Ke committed
375
376
    gradients = gradients_.data();
    hessians = hessians_.data();
Guolin Ke's avatar
Guolin Ke committed
377
  }
378
379
  // bagging logic
  Bagging(iter_);
Guolin Ke's avatar
Guolin Ke committed
380

Guolin Ke's avatar
Guolin Ke committed
381
  bool should_continue = false;
382
  for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
383
    const size_t bias = static_cast<size_t>(cur_tree_id) * num_data_;
384
    std::unique_ptr<Tree> new_tree(new Tree(2));
385
    if (class_need_train_[cur_tree_id] && train_data_->num_features() > 0) {
Guolin Ke's avatar
Guolin Ke committed
386
387
388
389
390
391
392
393
394
395
396
      auto grad = gradients + bias;
      auto hess = hessians + bias;
      // need to copy gradients for bagging subset.
      if (is_use_subset_ && bag_data_cnt_ < num_data_) {
        for (int i = 0; i < bag_data_cnt_; ++i) {
          gradients_[bias + i] = grad[bag_data_indices_[i]];
          hessians_[bias + i] = hess[bag_data_indices_[i]];
        }
        grad = gradients_.data() + bias;
        hess = hessians_.data() + bias;
      }
397
      new_tree.reset(tree_learner_->Train(grad, hess, is_constant_hessian_, forced_splits_json_));
398
    }
Guolin Ke's avatar
Guolin Ke committed
399

Guolin Ke's avatar
Guolin Ke committed
400
    if (new_tree->num_leaves() > 1) {
Guolin Ke's avatar
Guolin Ke committed
401
      should_continue = true;
402
403
404
      auto score_ptr = train_score_updater_->score() + bias;
      auto residual_getter = [score_ptr](const label_t* label, int i) {return static_cast<double>(label[i]) - score_ptr[i]; };
      tree_learner_->RenewTreeOutput(new_tree.get(), objective_function_, residual_getter,
405
                                     num_data_, bag_data_indices_.data(), bag_data_cnt_);
Guolin Ke's avatar
Guolin Ke committed
406
407
408
      // shrinkage by learning rate
      new_tree->Shrinkage(shrinkage_rate_);
      // update score
409
      UpdateScore(new_tree.get(), cur_tree_id);
410
411
      if (std::fabs(init_scores[cur_tree_id]) > kEpsilon) {
        new_tree->AddBias(init_scores[cur_tree_id]);
Guolin Ke's avatar
Guolin Ke committed
412
      }
413
414
    } else {
      // only add default score one-time
415
416
417
418
419
420
421
422
423
      if (models_.size() < static_cast<size_t>(num_tree_per_iteration_)) {
        double output = 0.0;
        if (!class_need_train_[cur_tree_id]) {
          if (objective_function_ != nullptr) {
            output = objective_function_->BoostFromScore(cur_tree_id);
          }
        } else {
          output = init_scores[cur_tree_id];
        }
424
        new_tree->AsConstantTree(output);
Guolin Ke's avatar
Guolin Ke committed
425
        // updates scores
426
        train_score_updater_->AddScore(output, cur_tree_id);
427
        for (auto& score_updater : valid_score_updater_) {
428
          score_updater->AddScore(output, cur_tree_id);
429
430
431
        }
      }
    }
Guolin Ke's avatar
Guolin Ke committed
432
433
434
    // add model
    models_.push_back(std::move(new_tree));
  }
Guolin Ke's avatar
Guolin Ke committed
435

Guolin Ke's avatar
Guolin Ke committed
436
  if (!should_continue) {
437
    Log::Warning("Stopped training because there are no more leaves that meet the split requirements");
438
439
440
441
    if (models_.size() > static_cast<size_t>(num_tree_per_iteration_)) {
      for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
        models_.pop_back();
      }
Guolin Ke's avatar
Guolin Ke committed
442
443
444
    }
    return true;
  }
445

Guolin Ke's avatar
Guolin Ke committed
446
447
  ++iter_;
  return false;
Guolin Ke's avatar
Guolin Ke committed
448
}
449

wxchan's avatar
wxchan committed
450
void GBDT::RollbackOneIter() {
451
  if (iter_ <= 0) { return; }
wxchan's avatar
wxchan committed
452
  // reset score
453
  for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
Guolin Ke's avatar
Guolin Ke committed
454
    auto curr_tree = models_.size() - num_tree_per_iteration_ + cur_tree_id;
wxchan's avatar
wxchan committed
455
    models_[curr_tree]->Shrinkage(-1.0);
456
    train_score_updater_->AddScore(models_[curr_tree].get(), cur_tree_id);
wxchan's avatar
wxchan committed
457
    for (auto& score_updater : valid_score_updater_) {
458
      score_updater->AddScore(models_[curr_tree].get(), cur_tree_id);
wxchan's avatar
wxchan committed
459
460
461
    }
  }
  // remove model
462
  for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
wxchan's avatar
wxchan committed
463
464
465
466
467
    models_.pop_back();
  }
  --iter_;
}

Guolin Ke's avatar
Guolin Ke committed
468
bool GBDT::EvalAndCheckEarlyStopping() {
469
470
  bool is_met_early_stopping = false;
  // print message for metric
Guolin Ke's avatar
Guolin Ke committed
471
  auto best_msg = OutputMetric(iter_);
Guolin Ke's avatar
Guolin Ke committed
472
473


Guolin Ke's avatar
Guolin Ke committed
474
  is_met_early_stopping = !best_msg.empty();
475
476
  if (is_met_early_stopping) {
    Log::Info("Early stopping at iteration %d, the best iteration round is %d",
477
              iter_, iter_ - early_stopping_round_);
Guolin Ke's avatar
Guolin Ke committed
478
    Log::Info("Output of best iteration round:\n%s", best_msg.c_str());
479
    // pop last early_stopping_round_ models
480
    for (int i = 0; i < early_stopping_round_ * num_tree_per_iteration_; ++i) {
481
482
483
484
      models_.pop_back();
    }
  }
  return is_met_early_stopping;
Guolin Ke's avatar
Guolin Ke committed
485
486
}

487
void GBDT::UpdateScore(const Tree* tree, const int cur_tree_id) {
Guolin Ke's avatar
Guolin Ke committed
488
  // update training score
Guolin Ke's avatar
Guolin Ke committed
489
  if (!is_use_subset_) {
490
    train_score_updater_->AddScore(tree_learner_.get(), tree, cur_tree_id);
Guolin Ke's avatar
Guolin Ke committed
491
492
493
494
495
496

    // we need to predict out-of-bag scores of data for boosting
    if (num_data_ - bag_data_cnt_ > 0) {
      train_score_updater_->AddScore(tree, bag_data_indices_.data() + bag_data_cnt_, num_data_ - bag_data_cnt_, cur_tree_id);
    }

Guolin Ke's avatar
Guolin Ke committed
497
  } else {
498
    train_score_updater_->AddScore(tree, cur_tree_id);
Guolin Ke's avatar
Guolin Ke committed
499
  }
Guolin Ke's avatar
Guolin Ke committed
500
501


Guolin Ke's avatar
Guolin Ke committed
502
  // update validation score
Guolin Ke's avatar
Guolin Ke committed
503
  for (auto& score_updater : valid_score_updater_) {
504
    score_updater->AddScore(tree, cur_tree_id);
Guolin Ke's avatar
Guolin Ke committed
505
506
507
  }
}

Guolin Ke's avatar
Guolin Ke committed
508
509
510
511
std::vector<double> GBDT::EvalOneMetric(const Metric* metric, const double* score) const {
  return metric->Eval(score, objective_function_);
}

Guolin Ke's avatar
Guolin Ke committed
512
std::string GBDT::OutputMetric(int iter) {
Guolin Ke's avatar
Guolin Ke committed
513
  bool need_output = (iter % config_->metric_freq) == 0;
Guolin Ke's avatar
Guolin Ke committed
514
515
  std::string ret = "";
  std::stringstream msg_buf;
516
  std::vector<std::pair<size_t, size_t>> meet_early_stopping_pairs;
Guolin Ke's avatar
Guolin Ke committed
517
  // print training metric
Guolin Ke's avatar
Guolin Ke committed
518
  if (need_output) {
519
520
    for (auto& sub_metric : training_metrics_) {
      auto name = sub_metric->GetName();
Guolin Ke's avatar
Guolin Ke committed
521
      auto scores = EvalOneMetric(sub_metric, train_score_updater_->score());
Guolin Ke's avatar
Guolin Ke committed
522
      for (size_t k = 0; k < name.size(); ++k) {
Guolin Ke's avatar
Guolin Ke committed
523
524
525
526
527
528
        std::stringstream tmp_buf;
        tmp_buf << "Iteration:" << iter
          << ", training " << name[k]
          << " : " << scores[k];
        Log::Info(tmp_buf.str().c_str());
        if (early_stopping_round_ > 0) {
529
          msg_buf << tmp_buf.str() << '\n';
Guolin Ke's avatar
Guolin Ke committed
530
        }
531
      }
532
    }
Guolin Ke's avatar
Guolin Ke committed
533
534
  }
  // print validation metric
Guolin Ke's avatar
Guolin Ke committed
535
  if (need_output || early_stopping_round_ > 0) {
536
537
    for (size_t i = 0; i < valid_metrics_.size(); ++i) {
      for (size_t j = 0; j < valid_metrics_[i].size(); ++j) {
Guolin Ke's avatar
Guolin Ke committed
538
        auto test_scores = EvalOneMetric(valid_metrics_[i][j], valid_score_updater_[i]->score());
Guolin Ke's avatar
Guolin Ke committed
539
540
541
542
543
544
545
546
547
548
        auto name = valid_metrics_[i][j]->GetName();
        for (size_t k = 0; k < name.size(); ++k) {
          std::stringstream tmp_buf;
          tmp_buf << "Iteration:" << iter
            << ", valid_" << i + 1 << " " << name[k]
            << " : " << test_scores[k];
          if (need_output) {
            Log::Info(tmp_buf.str().c_str());
          }
          if (early_stopping_round_ > 0) {
549
            msg_buf << tmp_buf.str() << '\n';
550
          }
wxchan's avatar
wxchan committed
551
        }
552
        if (es_first_metric_only_ && j > 0) { continue; }
Guolin Ke's avatar
Guolin Ke committed
553
        if (ret.empty() && early_stopping_round_ > 0) {
554
555
556
          auto cur_score = valid_metrics_[i][j]->factor_to_bigger_better() * test_scores.back();
          if (cur_score > best_score_[i][j]) {
            best_score_[i][j] = cur_score;
557
            best_iter_[i][j] = iter;
Guolin Ke's avatar
Guolin Ke committed
558
            meet_early_stopping_pairs.emplace_back(i, j);
559
          } else {
Guolin Ke's avatar
Guolin Ke committed
560
            if (iter - best_iter_[i][j] >= early_stopping_round_) { ret = best_msg_[i][j]; }
561
          }
wxchan's avatar
wxchan committed
562
563
        }
      }
Guolin Ke's avatar
Guolin Ke committed
564
565
    }
  }
Guolin Ke's avatar
Guolin Ke committed
566
567
568
  for (auto& pair : meet_early_stopping_pairs) {
    best_msg_[pair.first][pair.second] = msg_buf.str();
  }
wxchan's avatar
wxchan committed
569
  return ret;
Guolin Ke's avatar
Guolin Ke committed
570
571
}

572
/*! \brief Get eval result */
573
std::vector<double> GBDT::GetEvalAt(int data_idx) const {
Guolin Ke's avatar
Guolin Ke committed
574
  CHECK(data_idx >= 0 && data_idx <= static_cast<int>(valid_score_updater_.size()));
575
576
  std::vector<double> ret;
  if (data_idx == 0) {
577
    for (auto& sub_metric : training_metrics_) {
Guolin Ke's avatar
Guolin Ke committed
578
      auto scores = EvalOneMetric(sub_metric, train_score_updater_->score());
579
580
581
      for (auto score : scores) {
        ret.push_back(score);
      }
582
    }
583
  } else {
584
585
    auto used_idx = data_idx - 1;
    for (size_t j = 0; j < valid_metrics_[used_idx].size(); ++j) {
Guolin Ke's avatar
Guolin Ke committed
586
      auto test_scores = EvalOneMetric(valid_metrics_[used_idx][j], valid_score_updater_[used_idx]->score());
587
588
589
      for (auto score : test_scores) {
        ret.push_back(score);
      }
590
591
592
593
594
    }
  }
  return ret;
}

Guolin Ke's avatar
Guolin Ke committed
595
/*! \brief Get training scores result */
596
const double* GBDT::GetTrainingScore(int64_t* out_len) {
597
  *out_len = static_cast<int64_t>(train_score_updater_->num_data()) * num_class_;
Guolin Ke's avatar
Guolin Ke committed
598
  return train_score_updater_->score();
599
600
}

601
602
603
void GBDT::PredictContrib(const double* features, double* output, const PredictionEarlyStopInstance* early_stop) const {
  int early_stop_round_counter = 0;
  // set zero
Guolin Ke's avatar
Guolin Ke committed
604
605
  const int num_features = max_feature_idx_ + 1;
  std::memset(output, 0, sizeof(double) * num_tree_per_iteration_ * (num_features + 1));
606
607
608
  for (int i = 0; i < num_iteration_for_pred_; ++i) {
    // predict all the trees for one iteration
    for (int k = 0; k < num_tree_per_iteration_; ++k) {
Guolin Ke's avatar
Guolin Ke committed
609
      models_[i * num_tree_per_iteration_ + k]->PredictContrib(features, num_features, output + k*(num_features + 1));
610
611
612
613
614
615
616
617
618
619
620
621
    }
    // check early stopping
    ++early_stop_round_counter;
    if (early_stop->round_period == early_stop_round_counter) {
      if (early_stop->callback_function(output, num_tree_per_iteration_)) {
        return;
      }
      early_stop_round_counter = 0;
    }
  }
}

Guolin Ke's avatar
Guolin Ke committed
622
623
void GBDT::GetPredictAt(int data_idx, double* out_result, int64_t* out_len) {
  CHECK(data_idx >= 0 && data_idx <= static_cast<int>(valid_score_updater_.size()));
Guolin Ke's avatar
Guolin Ke committed
624

625
  const double* raw_scores = nullptr;
Guolin Ke's avatar
Guolin Ke committed
626
627
  data_size_t num_data = 0;
  if (data_idx == 0) {
wxchan's avatar
wxchan committed
628
    raw_scores = GetTrainingScore(out_len);
Guolin Ke's avatar
Guolin Ke committed
629
630
631
632
633
    num_data = train_score_updater_->num_data();
  } else {
    auto used_idx = data_idx - 1;
    raw_scores = valid_score_updater_[used_idx]->score();
    num_data = valid_score_updater_[used_idx]->num_data();
634
    *out_len = static_cast<int64_t>(num_data) * num_class_;
Guolin Ke's avatar
Guolin Ke committed
635
  }
Guolin Ke's avatar
Guolin Ke committed
636
  if (objective_function_ != nullptr) {
Guolin Ke's avatar
Guolin Ke committed
637
638
    #pragma omp parallel for schedule(static)
    for (data_size_t i = 0; i < num_data; ++i) {
Guolin Ke's avatar
Guolin Ke committed
639
      std::vector<double> tree_pred(num_tree_per_iteration_);
640
      for (int j = 0; j < num_tree_per_iteration_; ++j) {
Guolin Ke's avatar
Guolin Ke committed
641
        tree_pred[j] = raw_scores[j * num_data + i];
642
      }
Guolin Ke's avatar
Guolin Ke committed
643
644
      std::vector<double> tmp_result(num_class_);
      objective_function_->ConvertOutput(tree_pred.data(), tmp_result.data());
Guolin Ke's avatar
Guolin Ke committed
645
      for (int j = 0; j < num_class_; ++j) {
646
        out_result[j * num_data + i] = static_cast<double>(tmp_result[j]);
Guolin Ke's avatar
Guolin Ke committed
647
648
      }
    }
649
  } else {
Guolin Ke's avatar
Guolin Ke committed
650
    #pragma omp parallel for schedule(static)
Guolin Ke's avatar
Guolin Ke committed
651
    for (data_size_t i = 0; i < num_data; ++i) {
652
      for (int j = 0; j < num_tree_per_iteration_; ++j) {
Guolin Ke's avatar
Guolin Ke committed
653
        out_result[j * num_data + i] = static_cast<double>(raw_scores[j * num_data + i]);
Guolin Ke's avatar
Guolin Ke committed
654
655
656
657
658
      }
    }
  }
}

Guolin Ke's avatar
Guolin Ke committed
659
660
661
void GBDT::ResetTrainingData(const Dataset* train_data, const ObjectiveFunction* objective_function,
                             const std::vector<const Metric*>& training_metrics) {
  if (train_data != train_data_ && !train_data_->CheckAlign(*train_data)) {
662
    Log::Fatal("Cannot reset training data, since new training data has different bin mappers");
wxchan's avatar
wxchan committed
663
664
  }

Guolin Ke's avatar
Guolin Ke committed
665
666
667
668
669
670
  objective_function_ = objective_function;
  if (objective_function_ != nullptr) {
    is_constant_hessian_ = objective_function_->IsConstantHessian();
    CHECK(num_tree_per_iteration_ == objective_function_->NumModelPerIteration());
  } else {
    is_constant_hessian_ = false;
671
672
  }

Guolin Ke's avatar
Guolin Ke committed
673
674
675
676
  // push training metrics
  training_metrics_.clear();
  for (const auto& metric : training_metrics) {
    training_metrics_.push_back(metric);
677
  }
Guolin Ke's avatar
Guolin Ke committed
678
  training_metrics_.shrink_to_fit();
679

Guolin Ke's avatar
Guolin Ke committed
680
681
682
683
684
  if (train_data != train_data_) {
    train_data_ = train_data;
    // not same training data, need reset score and others
    // create score tracker
    train_score_updater_.reset(new ScoreUpdater(train_data_, num_tree_per_iteration_));
685

Guolin Ke's avatar
Guolin Ke committed
686
687
688
689
690
691
    // update score
    for (int i = 0; i < iter_; ++i) {
      for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
        auto curr_tree = (i + num_init_iteration_) * num_tree_per_iteration_ + cur_tree_id;
        train_score_updater_->AddScore(models_[curr_tree].get(), cur_tree_id);
      }
692
693
    }

Guolin Ke's avatar
Guolin Ke committed
694
    num_data_ = train_data_->num_data();
695

Guolin Ke's avatar
Guolin Ke committed
696
697
698
699
700
701
    // create buffer for gradients and hessians
    if (objective_function_ != nullptr) {
      size_t total_size = static_cast<size_t>(num_data_) * num_tree_per_iteration_;
      gradients_.resize(total_size);
      hessians_.resize(total_size);
    }
702

Guolin Ke's avatar
Guolin Ke committed
703
704
705
706
    max_feature_idx_ = train_data_->num_total_features() - 1;
    label_idx_ = train_data_->label_idx();
    feature_names_ = train_data_->feature_names();
    feature_infos_ = train_data_->feature_infos();
707

Guolin Ke's avatar
Guolin Ke committed
708
    tree_learner_->ResetTrainingData(train_data);
Guolin Ke's avatar
Guolin Ke committed
709
    ResetBaggingConfig(config_.get(), true);
710
  }
711
712
}

Guolin Ke's avatar
Guolin Ke committed
713
714
void GBDT::ResetConfig(const Config* config) {
  auto new_config = std::unique_ptr<Config>(new Config(*config));
Guolin Ke's avatar
Guolin Ke committed
715
716
717
  early_stopping_round_ = new_config->early_stopping_round;
  shrinkage_rate_ = new_config->learning_rate;
  if (tree_learner_ != nullptr) {
Guolin Ke's avatar
Guolin Ke committed
718
    tree_learner_->ResetConfig(new_config.get());
719
  }
Guolin Ke's avatar
Guolin Ke committed
720
721
  if (train_data_ != nullptr) {
    ResetBaggingConfig(new_config.get(), false);
722
  }
Guolin Ke's avatar
Guolin Ke committed
723
  config_.reset(new_config.release());
Guolin Ke's avatar
Guolin Ke committed
724
725
}

Guolin Ke's avatar
Guolin Ke committed
726
void GBDT::ResetBaggingConfig(const Config* config, bool is_change_dataset) {
Guolin Ke's avatar
Guolin Ke committed
727
  // if need bagging, create buffer
Guolin Ke's avatar
Guolin Ke committed
728
729
730
731
732
733
  data_size_t num_pos_data = 0;
  if (objective_function_ != nullptr) {
    num_pos_data = objective_function_->NumPositiveData();
  }
  bool balance_bagging_cond = (config->pos_bagging_fraction < 1.0 || config->neg_bagging_fraction < 1.0) && (num_pos_data > 0);
  if ((config->bagging_fraction < 1.0 || balance_bagging_cond) && config->bagging_freq > 0) {
734
735
    need_re_bagging_ = false;
    if (!is_change_dataset &&
Guolin Ke's avatar
Guolin Ke committed
736
737
      config_.get() != nullptr && config_->bagging_fraction == config->bagging_fraction && config_->bagging_freq == config->bagging_freq
      && config_->pos_bagging_fraction == config->pos_bagging_fraction && config_->neg_bagging_fraction == config->neg_bagging_fraction) {
738
739
      return;
    }
Guolin Ke's avatar
Guolin Ke committed
740
741
    if (balance_bagging_cond) {
      balanced_bagging_ = true;
742
      bag_data_cnt_ = static_cast<data_size_t>(num_pos_data * config->pos_bagging_fraction)
Guolin Ke's avatar
Guolin Ke committed
743
744
745
746
                      + static_cast<data_size_t>((num_data_ - num_pos_data) * config->neg_bagging_fraction);
    } else {
      bag_data_cnt_ = static_cast<data_size_t>(config->bagging_fraction * num_data_);
    }
Guolin Ke's avatar
Guolin Ke committed
747
748
    bag_data_indices_.resize(num_data_);
    tmp_indices_.resize(num_data_);
749

Guolin Ke's avatar
Guolin Ke committed
750
751
752
753
754
    offsets_buf_.resize(num_threads_);
    left_cnts_buf_.resize(num_threads_);
    right_cnts_buf_.resize(num_threads_);
    left_write_pos_buf_.resize(num_threads_);
    right_write_pos_buf_.resize(num_threads_);
755

Guolin Ke's avatar
Guolin Ke committed
756
    double average_bag_rate = (bag_data_cnt_ / num_data_) / config->bagging_freq;
Guolin Ke's avatar
Guolin Ke committed
757
758
759
760
761
    int sparse_group = 0;
    for (int i = 0; i < train_data_->num_feature_groups(); ++i) {
      if (train_data_->FeatureGroupIsSparse(i)) {
        ++sparse_group;
      }
Guolin Ke's avatar
Guolin Ke committed
762
    }
Guolin Ke's avatar
Guolin Ke committed
763
764
765
766
767
768
769
770
771
772
    is_use_subset_ = false;
    const int group_threshold_usesubset = 100;
    const int sparse_group_threshold_usesubset = train_data_->num_feature_groups() / 4;
    if (average_bag_rate <= 0.5
        && (train_data_->num_feature_groups() < group_threshold_usesubset || sparse_group < sparse_group_threshold_usesubset)) {
      if (tmp_subset_ == nullptr || is_change_dataset) {
        tmp_subset_.reset(new Dataset(bag_data_cnt_));
        tmp_subset_->CopyFeatureMapperFrom(train_data_);
      }
      is_use_subset_ = true;
773
      Log::Debug("Use subset for bagging");
Guolin Ke's avatar
Guolin Ke committed
774
775
    }

776
    need_re_bagging_ = true;
777

Guolin Ke's avatar
Guolin Ke committed
778
779
780
781
782
    if (is_use_subset_ && bag_data_cnt_ < num_data_) {
      if (objective_function_ == nullptr) {
        size_t total_size = static_cast<size_t>(num_data_) * num_tree_per_iteration_;
        gradients_.resize(total_size);
        hessians_.resize(total_size);
783
      }
784
    }
785
  } else {
Guolin Ke's avatar
Guolin Ke committed
786
787
788
789
    bag_data_cnt_ = num_data_;
    bag_data_indices_.clear();
    tmp_indices_.clear();
    is_use_subset_ = false;
790
  }
wxchan's avatar
wxchan committed
791
792
}

Guolin Ke's avatar
Guolin Ke committed
793
}  // namespace LightGBM