gbdt.cpp 42.3 KB
Newer Older
Guolin Ke's avatar
Guolin Ke committed
1
2
#include "gbdt.h"

3
#include <LightGBM/utils/openmp_wrapper.h>
4

Guolin Ke's avatar
Guolin Ke committed
5
6
7
8
#include <LightGBM/utils/common.h>

#include <LightGBM/objective_function.h>
#include <LightGBM/metric.h>
cbecker's avatar
cbecker committed
9
#include <LightGBM/prediction_early_stop.h>
Guolin Ke's avatar
Guolin Ke committed
10
#include <LightGBM/network.h>
Guolin Ke's avatar
Guolin Ke committed
11
12
13
14
15
16
17

#include <ctime>

#include <sstream>
#include <chrono>
#include <string>
#include <vector>
18
#include <utility>
Guolin Ke's avatar
Guolin Ke committed
19
20
21

namespace LightGBM {

Guolin Ke's avatar
Guolin Ke committed
22
23
24
#ifdef TIMETAG
std::chrono::duration<double, std::milli> boosting_time;
std::chrono::duration<double, std::milli> train_score_time;
Guolin Ke's avatar
Guolin Ke committed
25
std::chrono::duration<double, std::milli> out_of_bag_score_time;
Guolin Ke's avatar
Guolin Ke committed
26
27
28
29
30
31
32
std::chrono::duration<double, std::milli> valid_score_time;
std::chrono::duration<double, std::milli> metric_time;
std::chrono::duration<double, std::milli> bagging_time;
std::chrono::duration<double, std::milli> sub_gradient_time;
std::chrono::duration<double, std::milli> tree_time;
#endif // TIMETAG

33
GBDT::GBDT()
34
  :iter_(0),
35
  train_data_(nullptr),
36
  objective_function_(nullptr),
37
38
  early_stopping_round_(0),
  max_feature_idx_(0),
39
  num_tree_per_iteration_(1),
40
  num_class_(1),
41
  num_iteration_for_pred_(0),
42
  shrinkage_rate_(0.1f),
43
44
  num_init_iteration_(0),
  boost_from_average_(false) {
Guolin Ke's avatar
Guolin Ke committed
45
46
  #pragma omp parallel
  #pragma omp master
Guolin Ke's avatar
Guolin Ke committed
47
48
49
50
  {
    num_threads_ = omp_get_num_threads();
  }
  average_output_ = false;
Guolin Ke's avatar
Guolin Ke committed
51
  tree_learner_ = nullptr;
Guolin Ke's avatar
Guolin Ke committed
52
53
54
}

GBDT::~GBDT() {
Guolin Ke's avatar
Guolin Ke committed
55
  #ifdef TIMETAG
Guolin Ke's avatar
Guolin Ke committed
56
57
  Log::Info("GBDT::boosting costs %f", boosting_time * 1e-3);
  Log::Info("GBDT::train_score costs %f", train_score_time * 1e-3);
Guolin Ke's avatar
Guolin Ke committed
58
  Log::Info("GBDT::out_of_bag_score costs %f", out_of_bag_score_time * 1e-3);
Guolin Ke's avatar
Guolin Ke committed
59
60
61
62
63
  Log::Info("GBDT::valid_score costs %f", valid_score_time * 1e-3);
  Log::Info("GBDT::metric costs %f", metric_time * 1e-3);
  Log::Info("GBDT::bagging costs %f", bagging_time * 1e-3);
  Log::Info("GBDT::sub_gradient costs %f", sub_gradient_time * 1e-3);
  Log::Info("GBDT::tree costs %f", tree_time * 1e-3);
Guolin Ke's avatar
Guolin Ke committed
64
  #endif
Guolin Ke's avatar
Guolin Ke committed
65
66
}

67
void GBDT::Init(const BoostingConfig* config, const Dataset* train_data, const ObjectiveFunction* objective_function,
68
                const std::vector<const Metric*>& training_metrics) {
Guolin Ke's avatar
Guolin Ke committed
69
  CHECK(train_data->num_features() > 0);
70
  train_data_ = train_data;
71
  iter_ = 0;
wxchan's avatar
wxchan committed
72
  num_iteration_for_pred_ = 0;
73
  max_feature_idx_ = 0;
wxchan's avatar
wxchan committed
74
  num_class_ = config->num_class;
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
  gbdt_config_ = std::unique_ptr<BoostingConfig>(new BoostingConfig(*config));
  early_stopping_round_ = gbdt_config_->early_stopping_round;
  shrinkage_rate_ = gbdt_config_->learning_rate;

  objective_function_ = objective_function;
  num_tree_per_iteration_ = num_class_;
  if (objective_function_ != nullptr) {
    is_constant_hessian_ = objective_function_->IsConstantHessian();
    num_tree_per_iteration_ = objective_function_->NumTreePerIteration();
  } else {
    is_constant_hessian_ = false;
  }

  tree_learner_ = std::unique_ptr<TreeLearner>(TreeLearner::CreateTreeLearner(gbdt_config_->tree_learner_type, gbdt_config_->device_type, &gbdt_config_->tree_config));

  // init tree learner
  tree_learner_->Init(train_data_, is_constant_hessian_);

  // push training metrics
  training_metrics_.clear();
  for (const auto& metric : training_metrics) {
    training_metrics_.push_back(metric);
  }
  training_metrics_.shrink_to_fit();

  train_score_updater_.reset(new ScoreUpdater(train_data_, num_tree_per_iteration_));

  num_data_ = train_data_->num_data();
  // create buffer for gradients and hessians
  if (objective_function_ != nullptr) {
    size_t total_size = static_cast<size_t>(num_data_) * num_tree_per_iteration_;
    gradients_.resize(total_size);
    hessians_.resize(total_size);
  }
  // get max feature index
  max_feature_idx_ = train_data_->num_total_features() - 1;
  // get label index
  label_idx_ = train_data_->label_idx();
  // get feature names
  feature_names_ = train_data_->feature_names();
  feature_infos_ = train_data_->feature_infos();

  // if need bagging, create buffer
Guolin Ke's avatar
Guolin Ke committed
118
  ResetBaggingConfig(gbdt_config_.get(), true);
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160

  // reset config for tree learner
  class_need_train_ = std::vector<bool>(num_tree_per_iteration_, true);
  if (objective_function_ != nullptr && objective_function_->SkipEmptyClass()) {
    CHECK(num_tree_per_iteration_ == num_class_);
    // + 1 here for the binary classification
    class_default_output_ = std::vector<double>(num_tree_per_iteration_, 0.0f);
    auto label = train_data_->metadata().label();
    if (num_tree_per_iteration_ > 1) {
      // multi-class
      std::vector<data_size_t> cnt_per_class(num_tree_per_iteration_, 0);
      for (data_size_t i = 0; i < num_data_; ++i) {
        int index = static_cast<int>(label[i]);
        CHECK(index < num_tree_per_iteration_);
        ++cnt_per_class[index];
      }
      for (int i = 0; i < num_tree_per_iteration_; ++i) {
        if (cnt_per_class[i] == num_data_) {
          class_need_train_[i] = false;
          class_default_output_[i] = -std::log(kEpsilon);
        } else if (cnt_per_class[i] == 0) {
          class_need_train_[i] = false;
          class_default_output_[i] = -std::log(1.0f / kEpsilon - 1.0f);
        }
      }
    } else {
      // binary class
      data_size_t cnt_pos = 0;
      for (data_size_t i = 0; i < num_data_; ++i) {
        if (label[i] > 0) {
          ++cnt_pos;
        }
      }
      if (cnt_pos == 0) {
        class_need_train_[0] = false;
        class_default_output_[0] = -std::log(1.0f / kEpsilon - 1.0f);
      } else if (cnt_pos == num_data_) {
        class_need_train_[0] = false;
        class_default_output_[0] = -std::log(kEpsilon);
      }
    }
  }
wxchan's avatar
wxchan committed
161
162
}

163
void GBDT::ResetTrainingData(const Dataset* train_data, const ObjectiveFunction* objective_function,
164
                             const std::vector<const Metric*>& training_metrics) {
165
  if (train_data != train_data_ && !train_data_->CheckAlign(*train_data)) {
wxchan's avatar
wxchan committed
166
167
    Log::Fatal("cannot reset training data, since new training data has different bin mappers");
  }
Guolin Ke's avatar
Guolin Ke committed
168
  CHECK(train_data->num_features() > 0);
169
170
171
  objective_function_ = objective_function;
  if (objective_function_ != nullptr) {
    is_constant_hessian_ = objective_function_->IsConstantHessian();
Guolin Ke's avatar
Guolin Ke committed
172
    CHECK(num_tree_per_iteration_ == objective_function_->NumTreePerIteration());
173
174
175
  } else {
    is_constant_hessian_ = false;
  }
Guolin Ke's avatar
Guolin Ke committed
176

177
178
179
180
181
182
  // push training metrics
  training_metrics_.clear();
  for (const auto& metric : training_metrics) {
    training_metrics_.push_back(metric);
  }
  training_metrics_.shrink_to_fit();
Guolin Ke's avatar
Guolin Ke committed
183

184
185
  if (train_data != train_data_) {
    train_data_ = train_data;
wxchan's avatar
wxchan committed
186
187
    // not same training data, need reset score and others
    // create score tracker
188
    train_score_updater_.reset(new ScoreUpdater(train_data_, num_tree_per_iteration_));
wxchan's avatar
wxchan committed
189
190
    // update score
    for (int i = 0; i < iter_; ++i) {
191
192
193
      for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
        auto curr_tree = (i + num_init_iteration_) * num_tree_per_iteration_ + cur_tree_id;
        train_score_updater_->AddScore(models_[curr_tree].get(), cur_tree_id);
wxchan's avatar
wxchan committed
194
195
      }
    }
196
197
    num_data_ = train_data_->num_data();

wxchan's avatar
wxchan committed
198
    // create buffer for gradients and hessians
199
200
201
202
203
    if (objective_function_ != nullptr) {
      size_t total_size = static_cast<size_t>(num_data_) * num_tree_per_iteration_;
      gradients_.resize(total_size);
      hessians_.resize(total_size);
    }
204

wxchan's avatar
wxchan committed
205
    // get max feature index
206
    max_feature_idx_ = train_data_->num_total_features() - 1;
wxchan's avatar
wxchan committed
207
    // get label index
208
    label_idx_ = train_data_->label_idx();
209
    // get feature names
210
211
212
213
    feature_names_ = train_data_->feature_names();

    feature_infos_ = train_data_->feature_infos();

Guolin Ke's avatar
Guolin Ke committed
214
    ResetBaggingConfig(gbdt_config_.get(), true);
Guolin Ke's avatar
Guolin Ke committed
215

216
    tree_learner_->ResetTrainingData(train_data);
Guolin Ke's avatar
Guolin Ke committed
217
  }
218
}
Guolin Ke's avatar
Guolin Ke committed
219

220
221
222
223
void GBDT::ResetConfig(const BoostingConfig* config) {
  auto new_config = std::unique_ptr<BoostingConfig>(new BoostingConfig(*config));
  early_stopping_round_ = new_config->early_stopping_round;
  shrinkage_rate_ = new_config->learning_rate;
Guolin Ke's avatar
Guolin Ke committed
224
  if (tree_learner_ != nullptr) {
Guolin Ke's avatar
Guolin Ke committed
225
    ResetBaggingConfig(new_config.get(), false);
Guolin Ke's avatar
Guolin Ke committed
226
227
    tree_learner_->ResetConfig(&new_config->tree_config);
  }
228
229
230
  gbdt_config_.reset(new_config.release());
}

Guolin Ke's avatar
Guolin Ke committed
231
void GBDT::ResetBaggingConfig(const BoostingConfig* config, bool is_change_dataset) {
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
  // if need bagging, create buffer
  if (config->bagging_fraction < 1.0 && config->bagging_freq > 0) {
    bag_data_cnt_ =
      static_cast<data_size_t>(config->bagging_fraction * num_data_);
    bag_data_indices_.resize(num_data_);
    tmp_indices_.resize(num_data_);
    offsets_buf_.resize(num_threads_);
    left_cnts_buf_.resize(num_threads_);
    right_cnts_buf_.resize(num_threads_);
    left_write_pos_buf_.resize(num_threads_);
    right_write_pos_buf_.resize(num_threads_);
    double average_bag_rate = config->bagging_fraction / config->bagging_freq;
    int sparse_group = 0;
    for (int i = 0; i < train_data_->num_feature_groups(); ++i) {
      if (train_data_->FeatureGroupIsSparse(i)) {
        ++sparse_group;
Guolin Ke's avatar
Guolin Ke committed
248
      }
wxchan's avatar
wxchan committed
249
    }
250
251
252
253
254
    is_use_subset_ = false;
    const int group_threshold_usesubset = 100;
    const int sparse_group_threshold_usesubset = train_data_->num_feature_groups() / 4;
    if (average_bag_rate <= 0.5
        && (train_data_->num_feature_groups() < group_threshold_usesubset || sparse_group < sparse_group_threshold_usesubset)) {
Guolin Ke's avatar
Guolin Ke committed
255
256
257
258
      if (tmp_subset_ == nullptr || is_change_dataset) {
        tmp_subset_.reset(new Dataset(bag_data_cnt_));
        tmp_subset_->CopyFeatureMapperFrom(train_data_);
      }
259
260
      is_use_subset_ = true;
      Log::Debug("use subset for bagging");
261
    }
262
263
264
265
266
  } else {
    bag_data_cnt_ = num_data_;
    bag_data_indices_.clear();
    tmp_indices_.clear();
    is_use_subset_ = false;
Guolin Ke's avatar
Guolin Ke committed
267
  }
Guolin Ke's avatar
Guolin Ke committed
268
269
270
271

  if (is_change_dataset) {
    need_re_bagging_ = true;
  }
Guolin Ke's avatar
Guolin Ke committed
272
273
}

wxchan's avatar
wxchan committed
274
void GBDT::AddValidDataset(const Dataset* valid_data,
275
                           const std::vector<const Metric*>& valid_metrics) {
wxchan's avatar
wxchan committed
276
277
  if (!train_data_->CheckAlign(*valid_data)) {
    Log::Fatal("cannot add validation data, since it has different bin mappers with training data");
278
  }
Guolin Ke's avatar
Guolin Ke committed
279
  // for a validation dataset, we need its score and metric
280
  auto new_score_updater = std::unique_ptr<ScoreUpdater>(new ScoreUpdater(valid_data, num_tree_per_iteration_));
wxchan's avatar
wxchan committed
281
282
  // update score
  for (int i = 0; i < iter_; ++i) {
283
284
285
    for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
      auto curr_tree = (i + num_init_iteration_) * num_tree_per_iteration_ + cur_tree_id;
      new_score_updater->AddScore(models_[curr_tree].get(), cur_tree_id);
wxchan's avatar
wxchan committed
286
287
    }
  }
Guolin Ke's avatar
Guolin Ke committed
288
  valid_score_updater_.push_back(std::move(new_score_updater));
Guolin Ke's avatar
Guolin Ke committed
289
  valid_metrics_.emplace_back();
290
291
292
  if (early_stopping_round_ > 0) {
    best_iter_.emplace_back();
    best_score_.emplace_back();
Guolin Ke's avatar
Guolin Ke committed
293
    best_msg_.emplace_back();
294
  }
Guolin Ke's avatar
Guolin Ke committed
295
296
  for (const auto& metric : valid_metrics) {
    valid_metrics_.back().push_back(metric);
297
298
299
    if (early_stopping_round_ > 0) {
      best_iter_.back().push_back(0);
      best_score_.back().push_back(kMinScore);
Guolin Ke's avatar
Guolin Ke committed
300
      best_msg_.back().emplace_back();
301
    }
Guolin Ke's avatar
Guolin Ke committed
302
  }
Guolin Ke's avatar
Guolin Ke committed
303
  valid_metrics_.back().shrink_to_fit();
Guolin Ke's avatar
Guolin Ke committed
304
305
}

306
data_size_t GBDT::BaggingHelper(Random& cur_rand, data_size_t start, data_size_t cnt, data_size_t* buffer) {
307
308
309
  if (cnt <= 0) {
    return 0;
  }
310
311
312
313
  data_size_t bag_data_cnt =
    static_cast<data_size_t>(gbdt_config_->bagging_fraction * cnt);
  data_size_t cur_left_cnt = 0;
  data_size_t cur_right_cnt = 0;
Guolin Ke's avatar
Guolin Ke committed
314
  auto right_buffer = buffer + bag_data_cnt;
315
316
  // random bagging, minimal unit is one record
  for (data_size_t i = 0; i < cnt; ++i) {
Guolin Ke's avatar
Guolin Ke committed
317
318
319
    float prob =
      (bag_data_cnt - cur_left_cnt) / static_cast<float>(cnt - i);
    if (cur_rand.NextFloat() < prob) {
320
321
      buffer[cur_left_cnt++] = start + i;
    } else {
Guolin Ke's avatar
Guolin Ke committed
322
      right_buffer[cur_right_cnt++] = start + i;
323
324
325
326
327
    }
  }
  CHECK(cur_left_cnt == bag_data_cnt);
  return cur_left_cnt;
}
Guolin Ke's avatar
Guolin Ke committed
328

329
void GBDT::Bagging(int iter) {
Guolin Ke's avatar
Guolin Ke committed
330
  // if need bagging
Guolin Ke's avatar
Guolin Ke committed
331
332
  if ( (bag_data_cnt_ < num_data_ && iter % gbdt_config_->bagging_freq == 0) || need_re_bagging_) {
    need_re_bagging_ = false;
Guolin Ke's avatar
Guolin Ke committed
333
    const data_size_t min_inner_size = 1000;
334
335
    data_size_t inner_size = (num_data_ + num_threads_ - 1) / num_threads_;
    if (inner_size < min_inner_size) { inner_size = min_inner_size; }
336
    OMP_INIT_EX();
Guolin Ke's avatar
Guolin Ke committed
337
    #pragma omp parallel for schedule(static,1)
338
    for (int i = 0; i < num_threads_; ++i) {
339
      OMP_LOOP_EX_BEGIN();
340
341
342
343
344
345
      left_cnts_buf_[i] = 0;
      right_cnts_buf_[i] = 0;
      data_size_t cur_start = i * inner_size;
      if (cur_start > num_data_) { continue; }
      data_size_t cur_cnt = inner_size;
      if (cur_start + cur_cnt > num_data_) { cur_cnt = num_data_ - cur_start; }
Guolin Ke's avatar
Guolin Ke committed
346
347
      Random cur_rand(gbdt_config_->bagging_seed + iter * num_threads_ + i);
      data_size_t cur_left_count = BaggingHelper(cur_rand, cur_start, cur_cnt, tmp_indices_.data() + cur_start);
348
349
350
      offsets_buf_[i] = cur_start;
      left_cnts_buf_[i] = cur_left_count;
      right_cnts_buf_[i] = cur_cnt - cur_left_count;
351
      OMP_LOOP_EX_END();
352
    }
353
    OMP_THROW_EX();
354
355
356
357
358
359
360
361
362
    data_size_t left_cnt = 0;
    left_write_pos_buf_[0] = 0;
    right_write_pos_buf_[0] = 0;
    for (int i = 1; i < num_threads_; ++i) {
      left_write_pos_buf_[i] = left_write_pos_buf_[i - 1] + left_cnts_buf_[i - 1];
      right_write_pos_buf_[i] = right_write_pos_buf_[i - 1] + right_cnts_buf_[i - 1];
    }
    left_cnt = left_write_pos_buf_[num_threads_ - 1] + left_cnts_buf_[num_threads_ - 1];

Guolin Ke's avatar
Guolin Ke committed
363
    #pragma omp parallel for schedule(static, 1)
364
    for (int i = 0; i < num_threads_; ++i) {
365
      OMP_LOOP_EX_BEGIN();
366
367
      if (left_cnts_buf_[i] > 0) {
        std::memcpy(bag_data_indices_.data() + left_write_pos_buf_[i],
368
                    tmp_indices_.data() + offsets_buf_[i], left_cnts_buf_[i] * sizeof(data_size_t));
Guolin Ke's avatar
Guolin Ke committed
369
      }
370
371
      if (right_cnts_buf_[i] > 0) {
        std::memcpy(bag_data_indices_.data() + left_cnt + right_write_pos_buf_[i],
372
                    tmp_indices_.data() + offsets_buf_[i] + left_cnts_buf_[i], right_cnts_buf_[i] * sizeof(data_size_t));
Guolin Ke's avatar
Guolin Ke committed
373
      }
374
      OMP_LOOP_EX_END();
Guolin Ke's avatar
Guolin Ke committed
375
    }
376
    OMP_THROW_EX();
Guolin Ke's avatar
Guolin Ke committed
377
    bag_data_cnt_ = left_cnt;
Guolin Ke's avatar
Guolin Ke committed
378
    Log::Debug("Re-bagging, using %d data to train", bag_data_cnt_);
Guolin Ke's avatar
Guolin Ke committed
379
    // set bagging data to tree learner
Guolin Ke's avatar
Guolin Ke committed
380
381
382
383
    if (!is_use_subset_) {
      tree_learner_->SetBaggingData(bag_data_indices_.data(), bag_data_cnt_);
    } else {
      // get subset
Guolin Ke's avatar
Guolin Ke committed
384
385
      tmp_subset_->ReSize(bag_data_cnt_);
      tmp_subset_->CopySubset(train_data_, bag_data_indices_.data(), bag_data_cnt_, false);
Guolin Ke's avatar
Guolin Ke committed
386
387
      tree_learner_->ResetTrainingData(tmp_subset_.get());
    }
Guolin Ke's avatar
Guolin Ke committed
388
389
390
  }
}

391
void GBDT::UpdateScoreOutOfBag(const Tree* tree, const int cur_tree_id) {
Guolin Ke's avatar
Guolin Ke committed
392
  #ifdef TIMETAG
Guolin Ke's avatar
Guolin Ke committed
393
  auto start_time = std::chrono::steady_clock::now();
Guolin Ke's avatar
Guolin Ke committed
394
  #endif
395
  // we need to predict out-of-bag scores of data for boosting
Guolin Ke's avatar
Guolin Ke committed
396
  if (num_data_ - bag_data_cnt_ > 0 && !is_use_subset_) {
397
    train_score_updater_->AddScore(tree, bag_data_indices_.data() + bag_data_cnt_, num_data_ - bag_data_cnt_, cur_tree_id);
Guolin Ke's avatar
Guolin Ke committed
398
  }
Guolin Ke's avatar
Guolin Ke committed
399
  #ifdef TIMETAG
Guolin Ke's avatar
Guolin Ke committed
400
  out_of_bag_score_time += std::chrono::steady_clock::now() - start_time;
Guolin Ke's avatar
Guolin Ke committed
401
  #endif
Guolin Ke's avatar
Guolin Ke committed
402
403
}

404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
/* If the custom "average" is implemented it will be used inplace of the label average (if enabled)
 *
 * An improvement to this is to have options to explicitly choose
 * (i) standard average
 * (ii) custom average if available
 * (iii) any user defined scalar bias (e.g. using a new option "init_score" that overrides (i) and (ii) )
 *
 * (i) and (ii) could be selected as say "auto_init_score" = 0 or 1 etc..
 *
 */
double ObtainAutomaticInitialScore(const ObjectiveFunction* objf, const float* label, data_size_t num_data) {
  double init_score = 0.0f;
  bool got_custom = false;
  if (objf != nullptr) {
    got_custom = objf->GetCustomAverage(&init_score);
  }
  if (!got_custom) {
    double sum_label = 0.0f;
    #pragma omp parallel for schedule(static) reduction(+:sum_label)
    for (data_size_t i = 0; i < num_data; ++i) {
      sum_label += label[i];
    }
    init_score = sum_label / num_data;
Guolin Ke's avatar
Guolin Ke committed
427
428
429
430
431
432
  }
  if (Network::num_machines() > 1) {
    double global_init_score = 0.0f;
    Network::Allreduce(reinterpret_cast<char*>(&init_score),
                       sizeof(init_score), sizeof(init_score),
                       reinterpret_cast<char*>(&global_init_score),
433
                       [] (const char* src, char* dst, int len) {
Guolin Ke's avatar
Guolin Ke committed
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
      int used_size = 0;
      const int type_size = sizeof(double);
      const double *p1;
      double *p2;
      while (used_size < len) {
        p1 = reinterpret_cast<const double *>(src);
        p2 = reinterpret_cast<double *>(dst);
        *p2 += *p1;
        src += type_size;
        dst += type_size;
        used_size += type_size;
      }
    });
    return global_init_score / Network::num_machines();
  } else {
    return init_score;
  }
}

Guolin Ke's avatar
Guolin Ke committed
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
void GBDT::Train(int snapshot_freq, const std::string& model_output_path) {
  bool is_finished = false;
  bool need_eval = true;
  auto start_time = std::chrono::steady_clock::now();
  for (int iter = 0; iter < gbdt_config_->num_iterations && !is_finished; ++iter) {
    is_finished = TrainOneIter(nullptr, nullptr, need_eval);
    auto end_time = std::chrono::steady_clock::now();
    // output used time per iteration
    Log::Info("%f seconds elapsed, finished iteration %d", std::chrono::duration<double,
              std::milli>(end_time - start_time) * 1e-3, iter + 1);
    if (snapshot_freq > 0
        && (iter + 1) % snapshot_freq == 0) {
      std::string snapshot_out = model_output_path + ".snapshot_iter_" + std::to_string(iter + 1);
      SaveModelToFile(-1, snapshot_out.c_str());
    }
  }
  SaveModelToFile(-1, model_output_path.c_str());
}

472
bool GBDT::TrainOneIter(const score_t* gradient, const score_t* hessian, bool is_eval) {
473
  // boosting from average label; or customized "average" if implemented for the current objective
Guolin Ke's avatar
Guolin Ke committed
474
475
  if (models_.empty()
      && gbdt_config_->boost_from_average
476
      && !train_score_updater_->has_init_score()
477
478
479
      && num_class_ <= 1
      && objective_function_ != nullptr
      && objective_function_->BoostFromAverage()) {
480
481
    auto label = train_data_->metadata().label();
    double init_score = ObtainAutomaticInitialScore(objective_function_, label, num_data_);
482
    std::unique_ptr<Tree> new_tree(new Tree(2));
483
    new_tree->AsConstantTree(init_score);
484
485
486
487
488
    train_score_updater_->AddScore(init_score, 0);
    for (auto& score_updater : valid_score_updater_) {
      score_updater->AddScore(init_score, 0);
    }
    models_.push_back(std::move(new_tree));
489
490
    boost_from_average_ = true;
  }
Guolin Ke's avatar
Guolin Ke committed
491

Guolin Ke's avatar
Guolin Ke committed
492
493
  // boosting first
  if (gradient == nullptr || hessian == nullptr) {
Guolin Ke's avatar
Guolin Ke committed
494
    #ifdef TIMETAG
Guolin Ke's avatar
Guolin Ke committed
495
    auto start_time = std::chrono::steady_clock::now();
Guolin Ke's avatar
Guolin Ke committed
496
    #endif
Guolin Ke's avatar
Guolin Ke committed
497
    Boosting();
498
499
    gradient = gradients_.data();
    hessian = hessians_.data();
Guolin Ke's avatar
Guolin Ke committed
500
    #ifdef TIMETAG
Guolin Ke's avatar
Guolin Ke committed
501
    boosting_time += std::chrono::steady_clock::now() - start_time;
Guolin Ke's avatar
Guolin Ke committed
502
    #endif
Guolin Ke's avatar
Guolin Ke committed
503
  }
Guolin Ke's avatar
Guolin Ke committed
504
  #ifdef TIMETAG
Guolin Ke's avatar
Guolin Ke committed
505
  auto start_time = std::chrono::steady_clock::now();
Guolin Ke's avatar
Guolin Ke committed
506
  #endif
507
508
  // bagging logic
  Bagging(iter_);
Guolin Ke's avatar
Guolin Ke committed
509
  #ifdef TIMETAG
Guolin Ke's avatar
Guolin Ke committed
510
  bagging_time += std::chrono::steady_clock::now() - start_time;
Guolin Ke's avatar
Guolin Ke committed
511
  #endif
Guolin Ke's avatar
Guolin Ke committed
512
  // need to use subset gradient and hessian
Guolin Ke's avatar
Guolin Ke committed
513
  if (is_use_subset_ && bag_data_cnt_ < num_data_) {
Guolin Ke's avatar
Guolin Ke committed
514
    #ifdef TIMETAG
Guolin Ke's avatar
Guolin Ke committed
515
    start_time = std::chrono::steady_clock::now();
Guolin Ke's avatar
Guolin Ke committed
516
    #endif
517
518
519
520
521
    if (gradients_.empty()) {
      size_t total_size = static_cast<size_t>(num_data_) * num_tree_per_iteration_;
      gradients_.resize(total_size);
      hessians_.resize(total_size);
    }
Guolin Ke's avatar
Guolin Ke committed
522
    // get sub gradients
523
    for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
524
      size_t bias = static_cast<size_t>(cur_tree_id)* num_data_;
525
      // cannot multi-threading here.
Guolin Ke's avatar
Guolin Ke committed
526
      for (int i = 0; i < bag_data_cnt_; ++i) {
527
528
        gradients_[bias + i] = gradient[bias + bag_data_indices_[i]];
        hessians_[bias + i] = hessian[bias + bag_data_indices_[i]];
Guolin Ke's avatar
Guolin Ke committed
529
530
      }
    }
531
532
    gradient = gradients_.data();
    hessian = hessians_.data();
Guolin Ke's avatar
Guolin Ke committed
533
    #ifdef TIMETAG
Guolin Ke's avatar
Guolin Ke committed
534
    sub_gradient_time += std::chrono::steady_clock::now() - start_time;
Guolin Ke's avatar
Guolin Ke committed
535
    #endif
Guolin Ke's avatar
Guolin Ke committed
536
  }
Guolin Ke's avatar
Guolin Ke committed
537
  bool should_continue = false;
538
  for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
Guolin Ke's avatar
Guolin Ke committed
539
    #ifdef TIMETAG
Guolin Ke's avatar
Guolin Ke committed
540
    start_time = std::chrono::steady_clock::now();
Guolin Ke's avatar
Guolin Ke committed
541
    #endif
542
    std::unique_ptr<Tree> new_tree(new Tree(2));
543
    if (class_need_train_[cur_tree_id]) {
544
      size_t bias = static_cast<size_t>(cur_tree_id)* num_data_;
545
      new_tree.reset(
546
        tree_learner_->Train(gradient + bias, hessian + bias, is_constant_hessian_));
547
    }
Guolin Ke's avatar
Guolin Ke committed
548
    #ifdef TIMETAG
Guolin Ke's avatar
Guolin Ke committed
549
    tree_time += std::chrono::steady_clock::now() - start_time;
Guolin Ke's avatar
Guolin Ke committed
550
    #endif
Guolin Ke's avatar
Guolin Ke committed
551
552

    if (new_tree->num_leaves() > 1) {
Guolin Ke's avatar
Guolin Ke committed
553
554
555
556
      should_continue = true;
      // shrinkage by learning rate
      new_tree->Shrinkage(shrinkage_rate_);
      // update score
557
558
      UpdateScore(new_tree.get(), cur_tree_id);
      UpdateScoreOutOfBag(new_tree.get(), cur_tree_id);
559
560
    } else {
      // only add default score one-time
561
562
      if (!class_need_train_[cur_tree_id] && models_.size() < static_cast<size_t>(num_tree_per_iteration_)) {
        auto output = class_default_output_[cur_tree_id];
563
        new_tree->AsConstantTree(output);
564
        train_score_updater_->AddScore(output, cur_tree_id);
565
        for (auto& score_updater : valid_score_updater_) {
566
          score_updater->AddScore(output, cur_tree_id);
567
568
569
        }
      }
    }
Guolin Ke's avatar
Guolin Ke committed
570
571
572
    // add model
    models_.push_back(std::move(new_tree));
  }
Guolin Ke's avatar
Guolin Ke committed
573
  if (!should_continue) {
Guolin Ke's avatar
Guolin Ke committed
574
    Log::Warning("Stopped training because there are no more leaves that meet the split requirements.");
575
    for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
Guolin Ke's avatar
Guolin Ke committed
576
577
578
579
      models_.pop_back();
    }
    return true;
  }
Guolin Ke's avatar
Guolin Ke committed
580
581
582
583
584
585
  ++iter_;
  if (is_eval) {
    return EvalAndCheckEarlyStopping();
  } else {
    return false;
  }
586

Guolin Ke's avatar
Guolin Ke committed
587
}
588

wxchan's avatar
wxchan committed
589
void GBDT::RollbackOneIter() {
590
  if (iter_ <= 0) { return; }
wxchan's avatar
wxchan committed
591
592
  int cur_iter = iter_ + num_init_iteration_ - 1;
  // reset score
593
594
  for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
    auto curr_tree = cur_iter * num_tree_per_iteration_ + cur_tree_id;
wxchan's avatar
wxchan committed
595
    models_[curr_tree]->Shrinkage(-1.0);
596
    train_score_updater_->AddScore(models_[curr_tree].get(), cur_tree_id);
wxchan's avatar
wxchan committed
597
    for (auto& score_updater : valid_score_updater_) {
598
      score_updater->AddScore(models_[curr_tree].get(), cur_tree_id);
wxchan's avatar
wxchan committed
599
600
601
    }
  }
  // remove model
602
  for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
wxchan's avatar
wxchan committed
603
604
605
606
607
    models_.pop_back();
  }
  --iter_;
}

Guolin Ke's avatar
Guolin Ke committed
608
bool GBDT::EvalAndCheckEarlyStopping() {
609
  bool is_met_early_stopping = false;
Guolin Ke's avatar
Guolin Ke committed
610
  #ifdef TIMETAG
Guolin Ke's avatar
Guolin Ke committed
611
  auto start_time = std::chrono::steady_clock::now();
Guolin Ke's avatar
Guolin Ke committed
612
  #endif
613
  // print message for metric
Guolin Ke's avatar
Guolin Ke committed
614
  auto best_msg = OutputMetric(iter_);
Guolin Ke's avatar
Guolin Ke committed
615
  #ifdef TIMETAG
Guolin Ke's avatar
Guolin Ke committed
616
  metric_time += std::chrono::steady_clock::now() - start_time;
Guolin Ke's avatar
Guolin Ke committed
617
  #endif
Guolin Ke's avatar
Guolin Ke committed
618
  is_met_early_stopping = !best_msg.empty();
619
620
  if (is_met_early_stopping) {
    Log::Info("Early stopping at iteration %d, the best iteration round is %d",
621
              iter_, iter_ - early_stopping_round_);
Guolin Ke's avatar
Guolin Ke committed
622
    Log::Info("Output of best iteration round:\n%s", best_msg.c_str());
623
    // pop last early_stopping_round_ models
624
    for (int i = 0; i < early_stopping_round_ * num_tree_per_iteration_; ++i) {
625
626
627
628
      models_.pop_back();
    }
  }
  return is_met_early_stopping;
Guolin Ke's avatar
Guolin Ke committed
629
630
}

631
void GBDT::UpdateScore(const Tree* tree, const int cur_tree_id) {
Guolin Ke's avatar
Guolin Ke committed
632
  #ifdef TIMETAG
Guolin Ke's avatar
Guolin Ke committed
633
  auto start_time = std::chrono::steady_clock::now();
Guolin Ke's avatar
Guolin Ke committed
634
  #endif
Guolin Ke's avatar
Guolin Ke committed
635
  // update training score
Guolin Ke's avatar
Guolin Ke committed
636
  if (!is_use_subset_) {
637
    train_score_updater_->AddScore(tree_learner_.get(), tree, cur_tree_id);
Guolin Ke's avatar
Guolin Ke committed
638
  } else {
639
    train_score_updater_->AddScore(tree, cur_tree_id);
Guolin Ke's avatar
Guolin Ke committed
640
  }
Guolin Ke's avatar
Guolin Ke committed
641
  #ifdef TIMETAG
Guolin Ke's avatar
Guolin Ke committed
642
  train_score_time += std::chrono::steady_clock::now() - start_time;
Guolin Ke's avatar
Guolin Ke committed
643
644
  #endif
  #ifdef TIMETAG
Guolin Ke's avatar
Guolin Ke committed
645
  start_time = std::chrono::steady_clock::now();
Guolin Ke's avatar
Guolin Ke committed
646
  #endif
Guolin Ke's avatar
Guolin Ke committed
647
  // update validation score
Guolin Ke's avatar
Guolin Ke committed
648
  for (auto& score_updater : valid_score_updater_) {
649
    score_updater->AddScore(tree, cur_tree_id);
Guolin Ke's avatar
Guolin Ke committed
650
  }
Guolin Ke's avatar
Guolin Ke committed
651
  #ifdef TIMETAG
Guolin Ke's avatar
Guolin Ke committed
652
  valid_score_time += std::chrono::steady_clock::now() - start_time;
Guolin Ke's avatar
Guolin Ke committed
653
  #endif
Guolin Ke's avatar
Guolin Ke committed
654
655
}

Guolin Ke's avatar
Guolin Ke committed
656
657
658
659
std::vector<double> GBDT::EvalOneMetric(const Metric* metric, const double* score) const {
  return metric->Eval(score, objective_function_);
}

Guolin Ke's avatar
Guolin Ke committed
660
661
662
663
std::string GBDT::OutputMetric(int iter) {
  bool need_output = (iter % gbdt_config_->output_freq) == 0;
  std::string ret = "";
  std::stringstream msg_buf;
664
  std::vector<std::pair<size_t, size_t>> meet_early_stopping_pairs;
Guolin Ke's avatar
Guolin Ke committed
665
  // print training metric
Guolin Ke's avatar
Guolin Ke committed
666
  if (need_output) {
667
668
    for (auto& sub_metric : training_metrics_) {
      auto name = sub_metric->GetName();
Guolin Ke's avatar
Guolin Ke committed
669
      auto scores = EvalOneMetric(sub_metric, train_score_updater_->score());
Guolin Ke's avatar
Guolin Ke committed
670
      for (size_t k = 0; k < name.size(); ++k) {
Guolin Ke's avatar
Guolin Ke committed
671
672
673
674
675
676
677
678
        std::stringstream tmp_buf;
        tmp_buf << "Iteration:" << iter
          << ", training " << name[k]
          << " : " << scores[k];
        Log::Info(tmp_buf.str().c_str());
        if (early_stopping_round_ > 0) {
          msg_buf << tmp_buf.str() << std::endl;
        }
679
      }
680
    }
Guolin Ke's avatar
Guolin Ke committed
681
682
  }
  // print validation metric
Guolin Ke's avatar
Guolin Ke committed
683
  if (need_output || early_stopping_round_ > 0) {
684
685
    for (size_t i = 0; i < valid_metrics_.size(); ++i) {
      for (size_t j = 0; j < valid_metrics_[i].size(); ++j) {
Guolin Ke's avatar
Guolin Ke committed
686
        auto test_scores = EvalOneMetric(valid_metrics_[i][j], valid_score_updater_[i]->score());
Guolin Ke's avatar
Guolin Ke committed
687
688
689
690
691
692
693
694
695
696
697
        auto name = valid_metrics_[i][j]->GetName();
        for (size_t k = 0; k < name.size(); ++k) {
          std::stringstream tmp_buf;
          tmp_buf << "Iteration:" << iter
            << ", valid_" << i + 1 << " " << name[k]
            << " : " << test_scores[k];
          if (need_output) {
            Log::Info(tmp_buf.str().c_str());
          }
          if (early_stopping_round_ > 0) {
            msg_buf << tmp_buf.str() << std::endl;
698
          }
wxchan's avatar
wxchan committed
699
        }
Guolin Ke's avatar
Guolin Ke committed
700
        if (ret.empty() && early_stopping_round_ > 0) {
701
702
703
          auto cur_score = valid_metrics_[i][j]->factor_to_bigger_better() * test_scores.back();
          if (cur_score > best_score_[i][j]) {
            best_score_[i][j] = cur_score;
704
            best_iter_[i][j] = iter;
Guolin Ke's avatar
Guolin Ke committed
705
            meet_early_stopping_pairs.emplace_back(i, j);
706
          } else {
Guolin Ke's avatar
Guolin Ke committed
707
            if (iter - best_iter_[i][j] >= early_stopping_round_) { ret = best_msg_[i][j]; }
708
          }
wxchan's avatar
wxchan committed
709
710
        }
      }
Guolin Ke's avatar
Guolin Ke committed
711
712
    }
  }
Guolin Ke's avatar
Guolin Ke committed
713
714
715
  for (auto& pair : meet_early_stopping_pairs) {
    best_msg_[pair.first][pair.second] = msg_buf.str();
  }
wxchan's avatar
wxchan committed
716
  return ret;
Guolin Ke's avatar
Guolin Ke committed
717
718
}

719
/*! \brief Get eval result */
720
std::vector<double> GBDT::GetEvalAt(int data_idx) const {
Guolin Ke's avatar
Guolin Ke committed
721
  CHECK(data_idx >= 0 && data_idx <= static_cast<int>(valid_score_updater_.size()));
722
723
  std::vector<double> ret;
  if (data_idx == 0) {
724
    for (auto& sub_metric : training_metrics_) {
Guolin Ke's avatar
Guolin Ke committed
725
      auto scores = EvalOneMetric(sub_metric, train_score_updater_->score());
726
727
728
      for (auto score : scores) {
        ret.push_back(score);
      }
729
    }
730
  } else {
731
732
    auto used_idx = data_idx - 1;
    for (size_t j = 0; j < valid_metrics_[used_idx].size(); ++j) {
Guolin Ke's avatar
Guolin Ke committed
733
      auto test_scores = EvalOneMetric(valid_metrics_[used_idx][j], valid_score_updater_[used_idx]->score());
734
735
736
      for (auto score : test_scores) {
        ret.push_back(score);
      }
737
738
739
740
741
    }
  }
  return ret;
}

Guolin Ke's avatar
Guolin Ke committed
742
/*! \brief Get training scores result */
743
const double* GBDT::GetTrainingScore(int64_t* out_len) {
744
  *out_len = static_cast<int64_t>(train_score_updater_->num_data()) * num_class_;
Guolin Ke's avatar
Guolin Ke committed
745
  return train_score_updater_->score();
746
747
}

748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
void GBDT::PredictContrib(const double* features, double* output, const PredictionEarlyStopInstance* early_stop) const {
  int early_stop_round_counter = 0;
  // set zero
  const int num_features = max_feature_idx_+1;
  std::memset(output, 0, sizeof(double) * num_tree_per_iteration_ * (num_features+1));
  for (int i = 0; i < num_iteration_for_pred_; ++i) {
    // predict all the trees for one iteration
    for (int k = 0; k < num_tree_per_iteration_; ++k) {
      models_[i * num_tree_per_iteration_ + k]->PredictContrib(features, num_features, output + k*(num_features+1));
    }
    // check early stopping
    ++early_stop_round_counter;
    if (early_stop->round_period == early_stop_round_counter) {
      if (early_stop->callback_function(output, num_tree_per_iteration_)) {
        return;
      }
      early_stop_round_counter = 0;
    }
  }
}

Guolin Ke's avatar
Guolin Ke committed
769
770
void GBDT::GetPredictAt(int data_idx, double* out_result, int64_t* out_len) {
  CHECK(data_idx >= 0 && data_idx <= static_cast<int>(valid_score_updater_.size()));
Guolin Ke's avatar
Guolin Ke committed
771

772
  const double* raw_scores = nullptr;
Guolin Ke's avatar
Guolin Ke committed
773
774
  data_size_t num_data = 0;
  if (data_idx == 0) {
wxchan's avatar
wxchan committed
775
    raw_scores = GetTrainingScore(out_len);
Guolin Ke's avatar
Guolin Ke committed
776
777
778
779
780
    num_data = train_score_updater_->num_data();
  } else {
    auto used_idx = data_idx - 1;
    raw_scores = valid_score_updater_[used_idx]->score();
    num_data = valid_score_updater_[used_idx]->num_data();
781
    *out_len = static_cast<int64_t>(num_data) * num_class_;
Guolin Ke's avatar
Guolin Ke committed
782
  }
Guolin Ke's avatar
Guolin Ke committed
783
  if (objective_function_ != nullptr && !average_output_) {
Guolin Ke's avatar
Guolin Ke committed
784
785
    #pragma omp parallel for schedule(static)
    for (data_size_t i = 0; i < num_data; ++i) {
Guolin Ke's avatar
Guolin Ke committed
786
      std::vector<double> tree_pred(num_tree_per_iteration_);
787
      for (int j = 0; j < num_tree_per_iteration_; ++j) {
Guolin Ke's avatar
Guolin Ke committed
788
        tree_pred[j] = raw_scores[j * num_data + i];
789
      }
Guolin Ke's avatar
Guolin Ke committed
790
791
      std::vector<double> tmp_result(num_class_);
      objective_function_->ConvertOutput(tree_pred.data(), tmp_result.data());
Guolin Ke's avatar
Guolin Ke committed
792
      for (int j = 0; j < num_class_; ++j) {
793
        out_result[j * num_data + i] = static_cast<double>(tmp_result[j]);
Guolin Ke's avatar
Guolin Ke committed
794
795
      }
    }
796
  } else {
Guolin Ke's avatar
Guolin Ke committed
797
    #pragma omp parallel for schedule(static)
Guolin Ke's avatar
Guolin Ke committed
798
    for (data_size_t i = 0; i < num_data; ++i) {
Guolin Ke's avatar
Guolin Ke committed
799
      std::vector<double> tmp_result(num_tree_per_iteration_);
800
      for (int j = 0; j < num_tree_per_iteration_; ++j) {
Guolin Ke's avatar
Guolin Ke committed
801
        out_result[j * num_data + i] = static_cast<double>(raw_scores[j * num_data + i]);
Guolin Ke's avatar
Guolin Ke committed
802
803
804
805
806
      }
    }
  }
}

Guolin Ke's avatar
Guolin Ke committed
807
void GBDT::Boosting() {
808
  if (objective_function_ == nullptr) {
809
810
    Log::Fatal("No object function provided");
  }
Hui Xue's avatar
Hui Xue committed
811
  // objective function will calculate gradients and hessians
812
  int64_t num_score = 0;
813
  objective_function_->
Guolin Ke's avatar
Guolin Ke committed
814
    GetGradients(GetTrainingScore(&num_score), gradients_.data(), hessians_.data());
Guolin Ke's avatar
Guolin Ke committed
815
816
}

817
std::string GBDT::DumpModel(int num_iteration) const {
Guolin Ke's avatar
Guolin Ke committed
818
  std::stringstream str_buf;
wxchan's avatar
wxchan committed
819

Guolin Ke's avatar
Guolin Ke committed
820
  str_buf << "{";
Guolin Ke's avatar
Guolin Ke committed
821
  str_buf << "\"name\":\"" << SubModelName() << "\"," << std::endl;
Guolin Ke's avatar
Guolin Ke committed
822
  str_buf << "\"num_class\":" << num_class_ << "," << std::endl;
823
  str_buf << "\"num_tree_per_iteration\":" << num_tree_per_iteration_ << "," << std::endl;
Guolin Ke's avatar
Guolin Ke committed
824
825
  str_buf << "\"label_index\":" << label_idx_ << "," << std::endl;
  str_buf << "\"max_feature_idx\":" << max_feature_idx_ << "," << std::endl;
wxchan's avatar
wxchan committed
826

827
828
829
  str_buf << "\"feature_names\":[\""
    << Common::Join(feature_names_, "\",\"") << "\"],"
    << std::endl;
Guolin Ke's avatar
Guolin Ke committed
830

Guolin Ke's avatar
Guolin Ke committed
831
  str_buf << "\"tree_info\":[";
832
833
  int num_used_model = static_cast<int>(models_.size());
  if (num_iteration > 0) {
Guolin Ke's avatar
Guolin Ke committed
834
    num_iteration += boost_from_average_ ? 1 : 0;
835
    num_used_model = std::min(num_iteration * num_tree_per_iteration_, num_used_model);
836
  }
837
  for (int i = 0; i < num_used_model; ++i) {
wxchan's avatar
wxchan committed
838
    if (i > 0) {
Guolin Ke's avatar
Guolin Ke committed
839
      str_buf << ",";
wxchan's avatar
wxchan committed
840
    }
Guolin Ke's avatar
Guolin Ke committed
841
842
843
844
    str_buf << "{";
    str_buf << "\"tree_index\":" << i << ",";
    str_buf << models_[i]->ToJSON();
    str_buf << "}";
wxchan's avatar
wxchan committed
845
  }
Guolin Ke's avatar
Guolin Ke committed
846
  str_buf << "]" << std::endl;
wxchan's avatar
wxchan committed
847

Guolin Ke's avatar
Guolin Ke committed
848
  str_buf << "}" << std::endl;
wxchan's avatar
wxchan committed
849

Guolin Ke's avatar
Guolin Ke committed
850
  return str_buf.str();
wxchan's avatar
wxchan committed
851
852
}

853
854
855
std::string GBDT::ModelToIfElse(int num_iteration) const {
  std::stringstream str_buf;

856
857
858
859
  str_buf << "#include \"gbdt.h\"" << std::endl;
  str_buf << "#include <LightGBM/utils/common.h>" << std::endl;
  str_buf << "#include <LightGBM/objective_function.h>" << std::endl;
  str_buf << "#include <LightGBM/metric.h>" << std::endl;
cbecker's avatar
cbecker committed
860
  str_buf << "#include <LightGBM/prediction_early_stop.h>" << std::endl;
861
862
863
864
865
866
867
868
  str_buf << "#include <ctime>" << std::endl;
  str_buf << "#include <sstream>" << std::endl;
  str_buf << "#include <chrono>" << std::endl;
  str_buf << "#include <string>" << std::endl;
  str_buf << "#include <vector>" << std::endl;
  str_buf << "#include <utility>" << std::endl;
  str_buf << "namespace LightGBM {" << std::endl;

869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
  int num_used_model = static_cast<int>(models_.size());
  if (num_iteration > 0) {
    num_iteration += boost_from_average_ ? 1 : 0;
    num_used_model = std::min(num_iteration * num_tree_per_iteration_, num_used_model);
  }

  // PredictRaw
  for (int i = 0; i < num_used_model; ++i) {
    str_buf << models_[i]->ToIfElse(i, false) << std::endl;
  }

  str_buf << "double (*PredictTreePtr[])(const double*) = { ";
  for (int i = 0; i < num_used_model; ++i) {
    if (i > 0) {
      str_buf << " , ";
    }
    str_buf << "PredictTree" << i;
  }
  str_buf << " };" << std::endl << std::endl;

  std::stringstream pred_str_buf;

891
  pred_str_buf << "\t" << "int early_stop_round_counter = 0;" << std::endl;
892
  pred_str_buf << "\t" << "std::memset(output, 0, sizeof(double) * num_tree_per_iteration_);" << std::endl;
cbecker's avatar
cbecker committed
893
  pred_str_buf << "\t" << "for (int i = 0; i < num_iteration_for_pred_; ++i) {" << std::endl;
894
  pred_str_buf << "\t\t" << "for (int k = 0; k < num_tree_per_iteration_; ++k) {" << std::endl;
cbecker's avatar
cbecker committed
895
  pred_str_buf << "\t\t\t" << "output[k] += (*PredictTreePtr[i * num_tree_per_iteration_ + k])(features);" << std::endl;
896
  pred_str_buf << "\t\t" << "}" << std::endl;
897
898
899
  pred_str_buf << "\t\t" << "++early_stop_round_counter;" << std::endl;
  pred_str_buf << "\t\t" << "if (early_stop->round_period == early_stop_round_counter) {" << std::endl;
  pred_str_buf << "\t\t\t" << "if (early_stop->callback_function(output, num_tree_per_iteration_))" << std::endl;
cbecker's avatar
cbecker committed
900
  pred_str_buf << "\t\t\t\t" << "return;" << std::endl;
901
  pred_str_buf << "\t\t\t" << "early_stop_round_counter = 0;" << std::endl;
902
903
904
  pred_str_buf << "\t\t" << "}" << std::endl;
  pred_str_buf << "\t" << "}" << std::endl;

905
  str_buf << "void GBDT::PredictRaw(const double* features, double *output, const PredictionEarlyStopInstance* early_stop) const {" << std::endl;
906
907
908
909
910
  str_buf << pred_str_buf.str();
  str_buf << "}" << std::endl;
  str_buf << std::endl;

  // Predict
911
912
  str_buf << "void GBDT::Predict(const double* features, double *output, const PredictionEarlyStopInstance* early_stop) const {" << std::endl;
  str_buf << "\t" << "PredictRaw(features, output, early_stop);" << std::endl;
Guolin Ke's avatar
Guolin Ke committed
913
914
915
916
917
918
  str_buf << "\t" << "if (average_output_) {" << std::endl;
  str_buf << "\t\t" << "for (int k = 0; k < num_tree_per_iteration_; ++k) {" << std::endl;
  str_buf << "\t\t\t" << "output[k] /= num_iteration_for_pred_;" << std::endl;
  str_buf << "\t\t" << "}" << std::endl;
  str_buf << "\t" << "}" << std::endl;
  str_buf << "\t" << "else if (objective_function_ != nullptr) {" << std::endl;
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
  str_buf << "\t\t" << "objective_function_->ConvertOutput(output, output);" << std::endl;
  str_buf << "\t" << "}" << std::endl;
  str_buf << "}" << std::endl;
  str_buf << std::endl;

  // PredictLeafIndex
  for (int i = 0; i < num_used_model; ++i) {
    str_buf << models_[i]->ToIfElse(i, true) << std::endl;
  }

  str_buf << "double (*PredictTreeLeafPtr[])(const double*) = { ";
  for (int i = 0; i < num_used_model; ++i) {
    if (i > 0) {
      str_buf << " , ";
    }
    str_buf << "PredictTree" << i << "Leaf";
  }
  str_buf << " };" << std::endl << std::endl;

  str_buf << "void GBDT::PredictLeafIndex(const double* features, double *output) const {" << std::endl;
  str_buf << "\t" << "int total_tree = num_iteration_for_pred_ * num_tree_per_iteration_;" << std::endl;
  str_buf << "\t" << "for (int i = 0; i < total_tree; ++i) {" << std::endl;
  str_buf << "\t\t" << "output[i] = (*PredictTreeLeafPtr[i])(features);" << std::endl;
  str_buf << "\t" << "}" << std::endl;
  str_buf << "}" << std::endl;
944
945
946

  str_buf << "}  // namespace LightGBM" << std::endl;

947
948
949
950
951
952
  return str_buf.str();
}

bool GBDT::SaveModelToIfElse(int num_iteration, const char* filename) const {
  /*! \brief File to write models */
  std::ofstream output_file;
953
954
955
  std::ifstream ifs(filename);
  if (ifs.good()) {
    std::string origin((std::istreambuf_iterator<char>(ifs)),
956
      (std::istreambuf_iterator<char>()));
957
958
959
960
961
962
963
964
965
966
967
    output_file.open(filename);
    output_file << "#define USE_HARD_CODE 0" << std::endl;
    output_file << "#ifndef USE_HARD_CODE" << std::endl;
    output_file << origin << std::endl;
    output_file << "#else" << std::endl;
    output_file << ModelToIfElse(num_iteration);
    output_file << "#endif" << std::endl;
  } else {
    output_file.open(filename);
    output_file << ModelToIfElse(num_iteration);
  }
968

969
  ifs.close();
970
971
972
973
974
  output_file.close();

  return (bool)output_file;
}

Guolin Ke's avatar
Guolin Ke committed
975
std::string GBDT::SaveModelToString(int num_iteration) const {
976
  std::stringstream ss;
977

978
979
980
981
  // output model type
  ss << SubModelName() << std::endl;
  // output number of class
  ss << "num_class=" << num_class_ << std::endl;
982
  ss << "num_tree_per_iteration=" << num_tree_per_iteration_ << std::endl;
983
984
985
986
  // output label index
  ss << "label_index=" << label_idx_ << std::endl;
  // output max_feature_idx
  ss << "max_feature_idx=" << max_feature_idx_ << std::endl;
987
988
989
  // output objective
  if (objective_function_ != nullptr) {
    ss << "objective=" << objective_function_->ToString() << std::endl;
990
  }
991

992
993
994
  if (boost_from_average_) {
    ss << "boost_from_average" << std::endl;
  }
Guolin Ke's avatar
Guolin Ke committed
995

Guolin Ke's avatar
Guolin Ke committed
996
997
998
999
  if (average_output_) {
    ss << "average_output" << std::endl;
  }

1000
  ss << "feature_names=" << Common::Join(feature_names_, " ") << std::endl;
1001

1002
  ss << "feature_infos=" << Common::Join(feature_infos_, " ") << std::endl;
1003

1004
1005
  std::vector<double> feature_importances = FeatureImportance(num_iteration, 0);

1006
1007
  ss << std::endl;
  int num_used_model = static_cast<int>(models_.size());
Guolin Ke's avatar
Guolin Ke committed
1008
1009
  if (num_iteration > 0) {
    num_iteration += boost_from_average_ ? 1 : 0;
1010
    num_used_model = std::min(num_iteration * num_tree_per_iteration_, num_used_model);
1011
1012
1013
1014
1015
1016
1017
  }
  // output tree models
  for (int i = 0; i < num_used_model; ++i) {
    ss << "Tree=" << i << std::endl;
    ss << models_[i]->ToString() << std::endl;
  }

1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
  // store the importance first
  std::vector<std::pair<size_t, std::string>> pairs;
  for (size_t i = 0; i < feature_importances.size(); ++i) {
    size_t feature_importances_int = static_cast<size_t>(feature_importances[i]);
    if (feature_importances_int > 0) {
      pairs.emplace_back(feature_importances_int, feature_names_[i]);
    }
  }
  // sort the importance
  std::sort(pairs.begin(), pairs.end(),
            [] (const std::pair<size_t, std::string>& lhs,
                const std::pair<size_t, std::string>& rhs) {
    return lhs.first > rhs.first;
  });
1032
1033
1034
1035
1036
1037
  ss << std::endl << "feature importances:" << std::endl;
  for (size_t i = 0; i < pairs.size(); ++i) {
    ss << pairs[i].second << "=" << std::to_string(pairs[i].first) << std::endl;
  }

  return ss.str();
1038
1039
}

1040
bool GBDT::SaveModelToFile(int num_iteration, const char* filename) const {
wxchan's avatar
wxchan committed
1041
1042
1043
  /*! \brief File to write models */
  std::ofstream output_file;
  output_file.open(filename);
1044

1045
  output_file << SaveModelToString(num_iteration);
1046

wxchan's avatar
wxchan committed
1047
  output_file.close();
1048
1049

  return (bool)output_file;
Guolin Ke's avatar
Guolin Ke committed
1050
1051
}

1052
bool GBDT::LoadModelFromString(const std::string& model_str) {
Guolin Ke's avatar
Guolin Ke committed
1053
1054
  // use serialized string to restore this object
  models_.clear();
Guolin Ke's avatar
Guolin Ke committed
1055
  std::vector<std::string> lines = Common::SplitLines(model_str.c_str());
1056
1057

  // get number of classes
1058
1059
1060
1061
  auto line = Common::FindFromLines(lines, "num_class=");
  if (line.size() > 0) {
    Common::Atoi(Common::Split(line.c_str(), '=')[1].c_str(), &num_class_);
  } else {
1062
    Log::Fatal("Model file doesn't specify the number of classes");
1063
    return false;
1064
  }
1065
1066
1067
1068
1069
1070
1071
1072

  line = Common::FindFromLines(lines, "num_tree_per_iteration=");
  if (line.size() > 0) {
    Common::Atoi(Common::Split(line.c_str(), '=')[1].c_str(), &num_tree_per_iteration_);
  } else {
    num_tree_per_iteration_ = num_class_;
  }

Guolin Ke's avatar
Guolin Ke committed
1073
  // get index of label
1074
1075
1076
1077
  line = Common::FindFromLines(lines, "label_index=");
  if (line.size() > 0) {
    Common::Atoi(Common::Split(line.c_str(), '=')[1].c_str(), &label_idx_);
  } else {
1078
    Log::Fatal("Model file doesn't specify the label index");
1079
    return false;
Guolin Ke's avatar
Guolin Ke committed
1080
  }
Guolin Ke's avatar
Guolin Ke committed
1081
  // get max_feature_idx first
1082
1083
1084
1085
  line = Common::FindFromLines(lines, "max_feature_idx=");
  if (line.size() > 0) {
    Common::Atoi(Common::Split(line.c_str(), '=')[1].c_str(), &max_feature_idx_);
  } else {
1086
    Log::Fatal("Model file doesn't specify max_feature_idx");
1087
    return false;
Guolin Ke's avatar
Guolin Ke committed
1088
  }
1089
1090
1091
1092
1093
  // get boost_from_average_
  line = Common::FindFromLines(lines, "boost_from_average");
  if (line.size() > 0) {
    boost_from_average_ = true;
  }
Guolin Ke's avatar
Guolin Ke committed
1094
1095
1096
1097
1098
  // get average_output
  line = Common::FindFromLines(lines, "average_output");
  if (line.size() > 0) {
    average_output_ = true;
  }
Guolin Ke's avatar
Guolin Ke committed
1099
1100
1101
  // get feature names
  line = Common::FindFromLines(lines, "feature_names=");
  if (line.size() > 0) {
Guolin Ke's avatar
Guolin Ke committed
1102
    feature_names_ = Common::Split(line.substr(std::strlen("feature_names=")).c_str(), ' ');
Guolin Ke's avatar
Guolin Ke committed
1103
1104
    if (feature_names_.size() != static_cast<size_t>(max_feature_idx_ + 1)) {
      Log::Fatal("Wrong size of feature_names");
1105
      return false;
Guolin Ke's avatar
Guolin Ke committed
1106
    }
1107
  } else {
Guolin Ke's avatar
Guolin Ke committed
1108
    Log::Fatal("Model file doesn't contain feature names");
1109
    return false;
Guolin Ke's avatar
Guolin Ke committed
1110
1111
  }

Guolin Ke's avatar
Guolin Ke committed
1112
1113
  line = Common::FindFromLines(lines, "feature_infos=");
  if (line.size() > 0) {
Guolin Ke's avatar
Guolin Ke committed
1114
    feature_infos_ = Common::Split(line.substr(std::strlen("feature_infos=")).c_str(), ' ');
Guolin Ke's avatar
Guolin Ke committed
1115
1116
1117
1118
1119
1120
1121
1122
1123
    if (feature_infos_.size() != static_cast<size_t>(max_feature_idx_ + 1)) {
      Log::Fatal("Wrong size of feature_infos");
      return false;
    }
  } else {
    Log::Fatal("Model file doesn't contain feature infos");
    return false;
  }

1124
1125
1126
1127
1128
1129
1130
1131
  line = Common::FindFromLines(lines, "objective=");

  if (line.size() > 0) {
    auto str = Common::Split(line.c_str(), '=')[1];
    loaded_objective_.reset(ObjectiveFunction::CreateObjectiveFunction(str));
    objective_function_ = loaded_objective_.get();
  }

Guolin Ke's avatar
Guolin Ke committed
1132
  // get tree models
1133
  size_t i = 0;
Guolin Ke's avatar
Guolin Ke committed
1134
1135
1136
1137
1138
1139
1140
  while (i < lines.size()) {
    size_t find_pos = lines[i].find("Tree=");
    if (find_pos != std::string::npos) {
      ++i;
      int start = static_cast<int>(i);
      while (i < lines.size() && lines[i].find("Tree=") == std::string::npos) { ++i; }
      int end = static_cast<int>(i);
Guolin Ke's avatar
Guolin Ke committed
1141
      std::string tree_str = Common::Join<std::string>(lines, start, end, "\n");
1142
      models_.emplace_back(new Tree(tree_str));
Guolin Ke's avatar
Guolin Ke committed
1143
1144
1145
1146
    } else {
      ++i;
    }
  }
1147
  Log::Info("Finished loading %d models", models_.size());
1148
  num_iteration_for_pred_ = static_cast<int>(models_.size()) / num_tree_per_iteration_;
wxchan's avatar
wxchan committed
1149
  num_init_iteration_ = num_iteration_for_pred_;
1150
  iter_ = 0;
1151
1152

  return true;
Guolin Ke's avatar
Guolin Ke committed
1153
1154
}

1155
1156
1157
1158
1159
1160
1161
std::vector<double> GBDT::FeatureImportance(int num_iteration, int importance_type) const {

  int num_used_model = static_cast<int>(models_.size());
  if (num_iteration > 0) {
    num_iteration += boost_from_average_ ? 1 : 0;
    num_used_model = std::min(num_iteration * num_tree_per_iteration_, num_used_model);
  }
1162

1163
1164
1165
1166
1167
1168
1169
  std::vector<double> feature_importances(max_feature_idx_ + 1, 0.0);
  if (importance_type == 0) {
    for (int iter = boost_from_average_ ? 1 : 0; iter < num_used_model; ++iter) {
      for (int split_idx = 0; split_idx < models_[iter]->num_leaves() - 1; ++split_idx) {
        if (models_[iter]->split_gain(split_idx) > 0) {
          feature_importances[models_[iter]->split_feature(split_idx)] += 1.0;
        }
Guolin Ke's avatar
Guolin Ke committed
1170
      }
wxchan's avatar
wxchan committed
1171
    }
1172
1173
1174
1175
1176
1177
1178
  } else if (importance_type == 1) {
    for (int iter = boost_from_average_ ? 1 : 0; iter < num_used_model; ++iter) {
      for (int split_idx = 0; split_idx < models_[iter]->num_leaves() - 1; ++split_idx) {
        if (models_[iter]->split_gain(split_idx) > 0) {
          feature_importances[models_[iter]->split_feature(split_idx)] += models_[iter]->split_gain(split_idx);
        }
      }
1179
    }
1180
1181
  } else {
    Log::Fatal("Unknown importance type: only support split=0 and gain=1.");
1182
  }
1183
  return feature_importances;
wxchan's avatar
wxchan committed
1184
1185
}

Guolin Ke's avatar
Guolin Ke committed
1186
}  // namespace LightGBM