gbdt.cpp 24.4 KB
Newer Older
Guolin Ke's avatar
Guolin Ke committed
1
2
#include "gbdt.h"

3
#include <LightGBM/utils/openmp_wrapper.h>
4

Guolin Ke's avatar
Guolin Ke committed
5
6
7
8
9
10
11
12
13
14
15
#include <LightGBM/utils/common.h>

#include <LightGBM/objective_function.h>
#include <LightGBM/metric.h>

#include <ctime>

#include <sstream>
#include <chrono>
#include <string>
#include <vector>
16
#include <utility>
Guolin Ke's avatar
Guolin Ke committed
17
18
19

namespace LightGBM {

20
GBDT::GBDT()
21
  :iter_(0),
22
23
24
25
26
  train_data_(nullptr),
  object_function_(nullptr),
  early_stopping_round_(0),
  max_feature_idx_(0),
  num_class_(1),
27
  sigmoid_(-1.0f),
28
  num_iteration_for_pred_(0),
29
  shrinkage_rate_(0.1f),
wxchan's avatar
wxchan committed
30
  num_init_iteration_(0) {
31
32
33
34
35
#pragma omp parallel
#pragma omp master
    {
      num_threads_ = omp_get_num_threads();
    }
Guolin Ke's avatar
Guolin Ke committed
36
37
38
39
40
}

GBDT::~GBDT() {
}

41
42
43
void GBDT::Init(const BoostingConfig* config, const Dataset* train_data, const ObjectiveFunction* object_function,
     const std::vector<const Metric*>& training_metrics) {
  iter_ = 0;
wxchan's avatar
wxchan committed
44
  num_iteration_for_pred_ = 0;
45
  max_feature_idx_ = 0;
wxchan's avatar
wxchan committed
46
47
  num_class_ = config->num_class;
  train_data_ = nullptr;
Guolin Ke's avatar
Guolin Ke committed
48
  gbdt_config_ = nullptr;
49
  tree_learner_ = nullptr;
wxchan's avatar
wxchan committed
50
51
52
53
54
  ResetTrainingData(config, train_data, object_function, training_metrics);
}

void GBDT::ResetTrainingData(const BoostingConfig* config, const Dataset* train_data, const ObjectiveFunction* object_function,
  const std::vector<const Metric*>& training_metrics) {
Guolin Ke's avatar
Guolin Ke committed
55
  auto new_config = std::unique_ptr<BoostingConfig>(new BoostingConfig(*config));
wxchan's avatar
wxchan committed
56
57
58
  if (train_data_ != nullptr && !train_data_->CheckAlign(*train_data)) {
    Log::Fatal("cannot reset training data, since new training data has different bin mappers");
  }
Guolin Ke's avatar
Guolin Ke committed
59
60
61
  early_stopping_round_ = new_config->early_stopping_round;
  shrinkage_rate_ = new_config->learning_rate;

Guolin Ke's avatar
Guolin Ke committed
62
  object_function_ = object_function;
Guolin Ke's avatar
Guolin Ke committed
63

Guolin Ke's avatar
Guolin Ke committed
64
  sigmoid_ = -1.0f;
wxchan's avatar
wxchan committed
65
  if (object_function_ != nullptr
Guolin Ke's avatar
Guolin Ke committed
66
67
    && std::string(object_function_->GetName()) == std::string("binary")) {
    // only binary classification need sigmoid transform
Guolin Ke's avatar
Guolin Ke committed
68
    sigmoid_ = new_config->sigmoid;
69
  }
Guolin Ke's avatar
Guolin Ke committed
70

Guolin Ke's avatar
Guolin Ke committed
71
  if (train_data_ != train_data && train_data != nullptr) {
72
73
    if (tree_learner_ == nullptr) {
      tree_learner_ = std::unique_ptr<TreeLearner>(TreeLearner::CreateTreeLearner(new_config->tree_learner_type, &new_config->tree_config));
Guolin Ke's avatar
Guolin Ke committed
74
75
    }
    // init tree learner
76
    tree_learner_->Init(train_data);
Guolin Ke's avatar
Guolin Ke committed
77

Guolin Ke's avatar
Guolin Ke committed
78
79
80
81
82
83
    // push training metrics
    training_metrics_.clear();
    for (const auto& metric : training_metrics) {
      training_metrics_.push_back(metric);
    }
    training_metrics_.shrink_to_fit();
wxchan's avatar
wxchan committed
84
85
86
87
88
89
90
91
92
93
94
95
96
    // not same training data, need reset score and others
    // create score tracker
    train_score_updater_.reset(new ScoreUpdater(train_data, num_class_));
    // update score
    for (int i = 0; i < iter_; ++i) {
      for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
        auto curr_tree = (i + num_init_iteration_) * num_class_ + curr_class;
        train_score_updater_->AddScore(models_[curr_tree].get(), curr_class);
      }
    }
    num_data_ = train_data->num_data();
    // create buffer for gradients and hessians
    if (object_function_ != nullptr) {
97
98
99
      size_t total_size = static_cast<size_t>(num_data_) * num_class_;
      gradients_.resize(total_size);
      hessians_.resize(total_size);
wxchan's avatar
wxchan committed
100
101
102
103
104
    }
    // get max feature index
    max_feature_idx_ = train_data->num_total_features() - 1;
    // get label index
    label_idx_ = train_data->label_idx();
105
106
    // get feature names
    feature_names_ = train_data->feature_names();
Guolin Ke's avatar
Guolin Ke committed
107
108
  }

Guolin Ke's avatar
Guolin Ke committed
109
110
  if ((train_data_ != train_data && train_data != nullptr)
    || (gbdt_config_ != nullptr && gbdt_config_->bagging_fraction != new_config->bagging_fraction)) {
wxchan's avatar
wxchan committed
111
    // if need bagging, create buffer
Guolin Ke's avatar
Guolin Ke committed
112
    if (new_config->bagging_fraction < 1.0 && new_config->bagging_freq > 0) {
113
114
      bag_data_cnt_ =
        static_cast<data_size_t>(new_config->bagging_fraction * num_data_);
115
      bag_data_indices_.resize(num_data_);
116
117
118
119
120
121
      tmp_indices_.resize(num_data_);
      offsets_buf_.resize(num_threads_);
      left_cnts_buf_.resize(num_threads_);
      right_cnts_buf_.resize(num_threads_);
      left_write_pos_buf_.resize(num_threads_);
      right_write_pos_buf_.resize(num_threads_);
Guolin Ke's avatar
Guolin Ke committed
122
123
      double average_bag_rate = new_config->bagging_fraction / new_config->bagging_freq;
      is_use_subset_ = false;
124
      if (average_bag_rate <= 0.5) {
Guolin Ke's avatar
Guolin Ke committed
125
        tmp_subset_.reset(new Dataset(bag_data_cnt_));
126
        tmp_subset_->CopyFeatureMapperFrom(train_data);
Guolin Ke's avatar
Guolin Ke committed
127
128
129
        is_use_subset_ = true;
        Log::Debug("use subset for bagging");
      }
wxchan's avatar
wxchan committed
130
131
132
    } else {
      bag_data_cnt_ = num_data_;
      bag_data_indices_.clear();
133
      tmp_indices_.clear();
Guolin Ke's avatar
Guolin Ke committed
134
      is_use_subset_ = false;
wxchan's avatar
wxchan committed
135
    }
Guolin Ke's avatar
Guolin Ke committed
136
  }
wxchan's avatar
wxchan committed
137
  train_data_ = train_data;
Guolin Ke's avatar
Guolin Ke committed
138
139
  if (train_data_ != nullptr) {
    // reset config for tree learner
140
    tree_learner_->ResetConfig(&new_config->tree_config);
Guolin Ke's avatar
Guolin Ke committed
141
  }
Guolin Ke's avatar
Guolin Ke committed
142
  gbdt_config_.reset(new_config.release());
Guolin Ke's avatar
Guolin Ke committed
143
144
}

wxchan's avatar
wxchan committed
145
void GBDT::AddValidDataset(const Dataset* valid_data,
Guolin Ke's avatar
Guolin Ke committed
146
  const std::vector<const Metric*>& valid_metrics) {
wxchan's avatar
wxchan committed
147
148
  if (!train_data_->CheckAlign(*valid_data)) {
    Log::Fatal("cannot add validation data, since it has different bin mappers with training data");
149
  }
Guolin Ke's avatar
Guolin Ke committed
150
  // for a validation dataset, we need its score and metric
Guolin Ke's avatar
Guolin Ke committed
151
  auto new_score_updater = std::unique_ptr<ScoreUpdater>(new ScoreUpdater(valid_data, num_class_));
wxchan's avatar
wxchan committed
152
153
154
155
156
157
158
  // update score
  for (int i = 0; i < iter_; ++i) {
    for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
      auto curr_tree = (i + num_init_iteration_) * num_class_ + curr_class;
      new_score_updater->AddScore(models_[curr_tree].get(), curr_class);
    }
  }
Guolin Ke's avatar
Guolin Ke committed
159
  valid_score_updater_.push_back(std::move(new_score_updater));
Guolin Ke's avatar
Guolin Ke committed
160
  valid_metrics_.emplace_back();
161
162
163
  if (early_stopping_round_ > 0) {
    best_iter_.emplace_back();
    best_score_.emplace_back();
Guolin Ke's avatar
Guolin Ke committed
164
    best_msg_.emplace_back();
165
  }
Guolin Ke's avatar
Guolin Ke committed
166
167
  for (const auto& metric : valid_metrics) {
    valid_metrics_.back().push_back(metric);
168
169
170
    if (early_stopping_round_ > 0) {
      best_iter_.back().push_back(0);
      best_score_.back().push_back(kMinScore);
Guolin Ke's avatar
Guolin Ke committed
171
      best_msg_.back().emplace_back();
172
    }
Guolin Ke's avatar
Guolin Ke committed
173
  }
Guolin Ke's avatar
Guolin Ke committed
174
  valid_metrics_.back().shrink_to_fit();
Guolin Ke's avatar
Guolin Ke committed
175
176
}

Guolin Ke's avatar
Guolin Ke committed
177
data_size_t GBDT::BaggingHelper(Random& cur_rand, data_size_t start, data_size_t cnt, data_size_t* buffer){
178
179
180
  if (cnt <= 0) {
    return 0;
  }
181
182
183
184
  data_size_t bag_data_cnt =
    static_cast<data_size_t>(gbdt_config_->bagging_fraction * cnt);
  data_size_t cur_left_cnt = 0;
  data_size_t cur_right_cnt = 0;
Guolin Ke's avatar
Guolin Ke committed
185
  auto right_buffer = buffer + bag_data_cnt;
186
187
  // random bagging, minimal unit is one record
  for (data_size_t i = 0; i < cnt; ++i) {
Guolin Ke's avatar
Guolin Ke committed
188
189
190
    float prob =
      (bag_data_cnt - cur_left_cnt) / static_cast<float>(cnt - i);
    if (cur_rand.NextFloat() < prob) {
191
192
      buffer[cur_left_cnt++] = start + i;
    } else {
Guolin Ke's avatar
Guolin Ke committed
193
      right_buffer[cur_right_cnt++] = start + i;
194
195
196
197
198
    }
  }
  CHECK(cur_left_cnt == bag_data_cnt);
  return cur_left_cnt;
}
Guolin Ke's avatar
Guolin Ke committed
199

Guolin Ke's avatar
Guolin Ke committed
200
201


202
void GBDT::Bagging(int iter) {
Guolin Ke's avatar
Guolin Ke committed
203
  // if need bagging
204
  if (bag_data_cnt_ < num_data_ && iter % gbdt_config_->bagging_freq == 0) {
Guolin Ke's avatar
Guolin Ke committed
205
    const data_size_t min_inner_size = 1000;
206
207
208
    data_size_t inner_size = (num_data_ + num_threads_ - 1) / num_threads_;
    if (inner_size < min_inner_size) { inner_size = min_inner_size; }

Guolin Ke's avatar
Guolin Ke committed
209
#pragma omp parallel for schedule(static,1)
210
211
212
213
214
215
216
    for (int i = 0; i < num_threads_; ++i) {
      left_cnts_buf_[i] = 0;
      right_cnts_buf_[i] = 0;
      data_size_t cur_start = i * inner_size;
      if (cur_start > num_data_) { continue; }
      data_size_t cur_cnt = inner_size;
      if (cur_start + cur_cnt > num_data_) { cur_cnt = num_data_ - cur_start; }
Guolin Ke's avatar
Guolin Ke committed
217
218
      Random cur_rand(gbdt_config_->bagging_seed + iter * num_threads_ + i);
      data_size_t cur_left_count = BaggingHelper(cur_rand, cur_start, cur_cnt, tmp_indices_.data() + cur_start);
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
      offsets_buf_[i] = cur_start;
      left_cnts_buf_[i] = cur_left_count;
      right_cnts_buf_[i] = cur_cnt - cur_left_count;
    }
    data_size_t left_cnt = 0;
    left_write_pos_buf_[0] = 0;
    right_write_pos_buf_[0] = 0;
    for (int i = 1; i < num_threads_; ++i) {
      left_write_pos_buf_[i] = left_write_pos_buf_[i - 1] + left_cnts_buf_[i - 1];
      right_write_pos_buf_[i] = right_write_pos_buf_[i - 1] + right_cnts_buf_[i - 1];
    }
    left_cnt = left_write_pos_buf_[num_threads_ - 1] + left_cnts_buf_[num_threads_ - 1];

#pragma omp parallel for schedule(static, 1)
    for (int i = 0; i < num_threads_; ++i) {
      if (left_cnts_buf_[i] > 0) {
        std::memcpy(bag_data_indices_.data() + left_write_pos_buf_[i],
          tmp_indices_.data() + offsets_buf_[i], left_cnts_buf_[i] * sizeof(data_size_t));
Guolin Ke's avatar
Guolin Ke committed
237
      }
238
239
240
      if (right_cnts_buf_[i] > 0) {
        std::memcpy(bag_data_indices_.data() + left_cnt + right_write_pos_buf_[i],
          tmp_indices_.data() + offsets_buf_[i] + left_cnts_buf_[i], right_cnts_buf_[i] * sizeof(data_size_t));
Guolin Ke's avatar
Guolin Ke committed
241
242
      }
    }
Guolin Ke's avatar
Guolin Ke committed
243
244
    bag_data_cnt_ = left_cnt;
    CHECK(bag_data_indices_[bag_data_cnt_ - 1] > bag_data_indices_[bag_data_cnt_]);
Guolin Ke's avatar
Guolin Ke committed
245
    Log::Debug("Re-bagging, using %d data to train", bag_data_cnt_);
Guolin Ke's avatar
Guolin Ke committed
246
    // set bagging data to tree learner
Guolin Ke's avatar
Guolin Ke committed
247
248
249
250
    if (!is_use_subset_) {
      tree_learner_->SetBaggingData(bag_data_indices_.data(), bag_data_cnt_);
    } else {
      // get subset
Guolin Ke's avatar
Guolin Ke committed
251
252
      tmp_subset_->ReSize(bag_data_cnt_);
      tmp_subset_->CopySubset(train_data_, bag_data_indices_.data(), bag_data_cnt_, false);
Guolin Ke's avatar
Guolin Ke committed
253
254
      tree_learner_->ResetTrainingData(tmp_subset_.get());
    }
Guolin Ke's avatar
Guolin Ke committed
255
256
257
  }
}

258
void GBDT::UpdateScoreOutOfBag(const Tree* tree, const int curr_class) {
Hui Xue's avatar
Hui Xue committed
259
  // we need to predict out-of-bag socres of data for boosting
Guolin Ke's avatar
Guolin Ke committed
260
  if (num_data_ - bag_data_cnt_ > 0 && !is_use_subset_) {
261
    train_score_updater_->AddScore(tree, bag_data_indices_.data() + bag_data_cnt_, num_data_ - bag_data_cnt_, curr_class);
Guolin Ke's avatar
Guolin Ke committed
262
263
264
  }
}

265
bool GBDT::TrainOneIter(const score_t* gradient, const score_t* hessian, bool is_eval) {
Guolin Ke's avatar
Guolin Ke committed
266
267
268
269
270
271
  // boosting first
  if (gradient == nullptr || hessian == nullptr) {
    Boosting();
    gradient = gradients_.data();
    hessian = hessians_.data();
  }
272
273
  // bagging logic
  Bagging(iter_);
Guolin Ke's avatar
Guolin Ke committed
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
  if (is_use_subset_ && bag_data_cnt_ < num_data_) {
    if (gradients_.empty()) {
      size_t total_size = static_cast<size_t>(num_data_) * num_class_;
      gradients_.resize(total_size);
      hessians_.resize(total_size);
    }    
    // get sub gradients
    for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
      auto bias = curr_class * num_data_;
      for (int i = 0; i < bag_data_cnt_; ++i) {
        gradients_[bias + i] = gradient[bias + bag_data_indices_[i]];
        hessians_[bias + i] = hessian[bias + bag_data_indices_[i]];
      }
    }
    gradient = gradients_.data();
    hessian = hessians_.data();
  }
Guolin Ke's avatar
Guolin Ke committed
291
292
  for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
    // train a new tree
293
    std::unique_ptr<Tree> new_tree(tree_learner_->Train(gradient + curr_class * num_data_, hessian + curr_class * num_data_));
Guolin Ke's avatar
Guolin Ke committed
294
295
296
297
    // if cannot learn a new tree, then stop
    if (new_tree->num_leaves() <= 1) {
      Log::Info("Stopped training because there are no more leafs that meet the split requirements.");
      return true;
298
    }
299

Guolin Ke's avatar
Guolin Ke committed
300
301
302
303
304
    // shrinkage by learning rate
    new_tree->Shrinkage(shrinkage_rate_);
    // update score
    UpdateScore(new_tree.get(), curr_class);
    UpdateScoreOutOfBag(new_tree.get(), curr_class);
305

Guolin Ke's avatar
Guolin Ke committed
306
307
308
309
310
311
312
313
314
    // add model
    models_.push_back(std::move(new_tree));
  }
  ++iter_;
  if (is_eval) {
    return EvalAndCheckEarlyStopping();
  } else {
    return false;
  }
315

Guolin Ke's avatar
Guolin Ke committed
316
}
317

wxchan's avatar
wxchan committed
318
void GBDT::RollbackOneIter() {
319
  if (iter_ <= 0) { return; }
wxchan's avatar
wxchan committed
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
  int cur_iter = iter_ + num_init_iteration_ - 1;
  // reset score
  for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
    auto curr_tree = cur_iter * num_class_ + curr_class;
    models_[curr_tree]->Shrinkage(-1.0);
    train_score_updater_->AddScore(models_[curr_tree].get(), curr_class);
    for (auto& score_updater : valid_score_updater_) {
      score_updater->AddScore(models_[curr_tree].get(), curr_class);
    }
  }
  // remove model
  for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
    models_.pop_back();
  }
  --iter_;
}

Guolin Ke's avatar
Guolin Ke committed
337
bool GBDT::EvalAndCheckEarlyStopping() {
338
339
  bool is_met_early_stopping = false;
  // print message for metric
Guolin Ke's avatar
Guolin Ke committed
340
341
  auto best_msg = OutputMetric(iter_);
  is_met_early_stopping = !best_msg.empty();
342
343
344
  if (is_met_early_stopping) {
    Log::Info("Early stopping at iteration %d, the best iteration round is %d",
      iter_, iter_ - early_stopping_round_);
Guolin Ke's avatar
Guolin Ke committed
345
    Log::Info("Output of best iteration round:\n%s", best_msg.c_str());
346
    // pop last early_stopping_round_ models
347
    for (int i = 0; i < early_stopping_round_ * num_class_; ++i) {
348
349
350
351
      models_.pop_back();
    }
  }
  return is_met_early_stopping;
Guolin Ke's avatar
Guolin Ke committed
352
353
}

354
void GBDT::UpdateScore(const Tree* tree, const int curr_class) {
Guolin Ke's avatar
Guolin Ke committed
355
  // update training score
Guolin Ke's avatar
Guolin Ke committed
356
357
358
359
360
  if (!is_use_subset_) {
    train_score_updater_->AddScore(tree_learner_.get(), curr_class);
  } else {
    train_score_updater_->AddScore(tree, curr_class);
  }
Guolin Ke's avatar
Guolin Ke committed
361
  // update validation score
Guolin Ke's avatar
Guolin Ke committed
362
363
  for (auto& score_updater : valid_score_updater_) {
    score_updater->AddScore(tree, curr_class);
Guolin Ke's avatar
Guolin Ke committed
364
365
366
  }
}

Guolin Ke's avatar
Guolin Ke committed
367
368
369
370
std::string GBDT::OutputMetric(int iter) {
  bool need_output = (iter % gbdt_config_->output_freq) == 0;
  std::string ret = "";
  std::stringstream msg_buf;
371
  std::vector<std::pair<size_t, size_t>> meet_early_stopping_pairs;
Guolin Ke's avatar
Guolin Ke committed
372
  // print training metric
Guolin Ke's avatar
Guolin Ke committed
373
  if (need_output) {
374
375
376
    for (auto& sub_metric : training_metrics_) {
      auto name = sub_metric->GetName();
      auto scores = sub_metric->Eval(train_score_updater_->score());
Guolin Ke's avatar
Guolin Ke committed
377
      for (size_t k = 0; k < name.size(); ++k) {
Guolin Ke's avatar
Guolin Ke committed
378
379
380
381
382
383
384
385
        std::stringstream tmp_buf;
        tmp_buf << "Iteration:" << iter
          << ", training " << name[k]
          << " : " << scores[k];
        Log::Info(tmp_buf.str().c_str());
        if (early_stopping_round_ > 0) {
          msg_buf << tmp_buf.str() << std::endl;
        }
386
      }
387
    }
Guolin Ke's avatar
Guolin Ke committed
388
389
  }
  // print validation metric
Guolin Ke's avatar
Guolin Ke committed
390
  if (need_output || early_stopping_round_ > 0) {
391
392
393
    for (size_t i = 0; i < valid_metrics_.size(); ++i) {
      for (size_t j = 0; j < valid_metrics_[i].size(); ++j) {
        auto test_scores = valid_metrics_[i][j]->Eval(valid_score_updater_[i]->score());
Guolin Ke's avatar
Guolin Ke committed
394
395
396
397
398
399
400
401
402
403
404
        auto name = valid_metrics_[i][j]->GetName();
        for (size_t k = 0; k < name.size(); ++k) {
          std::stringstream tmp_buf;
          tmp_buf << "Iteration:" << iter
            << ", valid_" << i + 1 << " " << name[k]
            << " : " << test_scores[k];
          if (need_output) {
            Log::Info(tmp_buf.str().c_str());
          }
          if (early_stopping_round_ > 0) {
            msg_buf << tmp_buf.str() << std::endl;
405
          }
wxchan's avatar
wxchan committed
406
        }
Guolin Ke's avatar
Guolin Ke committed
407
        if (ret.empty() && early_stopping_round_ > 0) {
408
409
410
          auto cur_score = valid_metrics_[i][j]->factor_to_bigger_better() * test_scores.back();
          if (cur_score > best_score_[i][j]) {
            best_score_[i][j] = cur_score;
411
            best_iter_[i][j] = iter;
Guolin Ke's avatar
Guolin Ke committed
412
            meet_early_stopping_pairs.emplace_back(i, j);
413
          } else {
Guolin Ke's avatar
Guolin Ke committed
414
            if (iter - best_iter_[i][j] >= early_stopping_round_) { ret = best_msg_[i][j]; }
415
          }
wxchan's avatar
wxchan committed
416
417
        }
      }
Guolin Ke's avatar
Guolin Ke committed
418
419
    }
  }
Guolin Ke's avatar
Guolin Ke committed
420
421
422
  for (auto& pair : meet_early_stopping_pairs) {
    best_msg_[pair.first][pair.second] = msg_buf.str();
  }
wxchan's avatar
wxchan committed
423
  return ret;
Guolin Ke's avatar
Guolin Ke committed
424
425
}

426
/*! \brief Get eval result */
427
std::vector<double> GBDT::GetEvalAt(int data_idx) const {
Guolin Ke's avatar
Guolin Ke committed
428
  CHECK(data_idx >= 0 && data_idx <= static_cast<int>(valid_score_updater_.size()));
429
430
  std::vector<double> ret;
  if (data_idx == 0) {
431
432
    for (auto& sub_metric : training_metrics_) {
      auto scores = sub_metric->Eval(train_score_updater_->score());
433
434
435
      for (auto score : scores) {
        ret.push_back(score);
      }
436
437
    }
  }
438
439
440
441
442
443
444
  else {
    auto used_idx = data_idx - 1;
    for (size_t j = 0; j < valid_metrics_[used_idx].size(); ++j) {
      auto test_scores = valid_metrics_[used_idx][j]->Eval(valid_score_updater_[used_idx]->score());
      for (auto score : test_scores) {
        ret.push_back(score);
      }
445
446
447
448
449
    }
  }
  return ret;
}

Guolin Ke's avatar
Guolin Ke committed
450
/*! \brief Get training scores result */
451
const double* GBDT::GetTrainingScore(int64_t* out_len) {
452
  *out_len = static_cast<int64_t>(train_score_updater_->num_data()) * num_class_;
Guolin Ke's avatar
Guolin Ke committed
453
  return train_score_updater_->score();
454
455
}

Guolin Ke's avatar
Guolin Ke committed
456
457
void GBDT::GetPredictAt(int data_idx, double* out_result, int64_t* out_len) {
  CHECK(data_idx >= 0 && data_idx <= static_cast<int>(valid_score_updater_.size()));
Guolin Ke's avatar
Guolin Ke committed
458

459
  const double* raw_scores = nullptr;
Guolin Ke's avatar
Guolin Ke committed
460
461
  data_size_t num_data = 0;
  if (data_idx == 0) {
wxchan's avatar
wxchan committed
462
    raw_scores = GetTrainingScore(out_len);
Guolin Ke's avatar
Guolin Ke committed
463
464
465
466
467
    num_data = train_score_updater_->num_data();
  } else {
    auto used_idx = data_idx - 1;
    raw_scores = valid_score_updater_[used_idx]->score();
    num_data = valid_score_updater_[used_idx]->num_data();
468
    *out_len = static_cast<int64_t>(num_data) * num_class_;
Guolin Ke's avatar
Guolin Ke committed
469
470
  }
  if (num_class_ > 1) {
wxchan's avatar
wxchan committed
471
#pragma omp parallel for schedule(static)
Guolin Ke's avatar
Guolin Ke committed
472
    for (data_size_t i = 0; i < num_data; ++i) {
473
      std::vector<double> tmp_result(num_class_);
Guolin Ke's avatar
Guolin Ke committed
474
      for (int j = 0; j < num_class_; ++j) {
475
        tmp_result[j] = raw_scores[j * num_data + i];
Guolin Ke's avatar
Guolin Ke committed
476
477
478
      }
      Common::Softmax(&tmp_result);
      for (int j = 0; j < num_class_; ++j) {
Guolin Ke's avatar
Guolin Ke committed
479
        out_result[j * num_data + i] = static_cast<double>(tmp_result[j]);
Guolin Ke's avatar
Guolin Ke committed
480
481
      }
    }
Guolin Ke's avatar
Guolin Ke committed
482
  } else if(sigmoid_ > 0.0f){
wxchan's avatar
wxchan committed
483
#pragma omp parallel for schedule(static)
Guolin Ke's avatar
Guolin Ke committed
484
    for (data_size_t i = 0; i < num_data; ++i) {
485
      out_result[i] = static_cast<double>(1.0f / (1.0f + std::exp(- sigmoid_ * raw_scores[i])));
Guolin Ke's avatar
Guolin Ke committed
486
487
    }
  } else {
wxchan's avatar
wxchan committed
488
#pragma omp parallel for schedule(static)
Guolin Ke's avatar
Guolin Ke committed
489
    for (data_size_t i = 0; i < num_data; ++i) {
Guolin Ke's avatar
Guolin Ke committed
490
      out_result[i] = static_cast<double>(raw_scores[i]);
Guolin Ke's avatar
Guolin Ke committed
491
492
493
494
495
    }
  }

}

Guolin Ke's avatar
Guolin Ke committed
496
void GBDT::Boosting() {
497
498
499
  if (object_function_ == nullptr) {
    Log::Fatal("No object function provided");
  }
Hui Xue's avatar
Hui Xue committed
500
  // objective function will calculate gradients and hessians
501
  int64_t num_score = 0;
Guolin Ke's avatar
Guolin Ke committed
502
  object_function_->
Guolin Ke's avatar
Guolin Ke committed
503
    GetGradients(GetTrainingScore(&num_score), gradients_.data(), hessians_.data());
Guolin Ke's avatar
Guolin Ke committed
504
505
}

506
std::string GBDT::DumpModel(int num_iteration) const {
Guolin Ke's avatar
Guolin Ke committed
507
  std::stringstream str_buf;
wxchan's avatar
wxchan committed
508

Guolin Ke's avatar
Guolin Ke committed
509
  str_buf << "{";
Guolin Ke's avatar
Guolin Ke committed
510
  str_buf << "\"name\":\"" << SubModelName() << "\"," << std::endl;
Guolin Ke's avatar
Guolin Ke committed
511
512
513
514
  str_buf << "\"num_class\":" << num_class_ << "," << std::endl;
  str_buf << "\"label_index\":" << label_idx_ << "," << std::endl;
  str_buf << "\"max_feature_idx\":" << max_feature_idx_ << "," << std::endl;
  str_buf << "\"sigmoid\":" << sigmoid_ << "," << std::endl;
wxchan's avatar
wxchan committed
515

Guolin Ke's avatar
Guolin Ke committed
516
  str_buf << "\"feature_names\":[\"" 
517
     << Common::Join(feature_names_, "\",\"") << "\"]," 
Guolin Ke's avatar
Guolin Ke committed
518
519
     << std::endl;

Guolin Ke's avatar
Guolin Ke committed
520
  str_buf << "\"tree_info\":[";
521
522
523
524
525
  int num_used_model = static_cast<int>(models_.size());
  if (num_iteration > 0) {
    num_used_model = std::min(num_iteration * num_class_, num_used_model);
  } 
  for (int i = 0; i < num_used_model; ++i) {
wxchan's avatar
wxchan committed
526
    if (i > 0) {
Guolin Ke's avatar
Guolin Ke committed
527
      str_buf << ",";
wxchan's avatar
wxchan committed
528
    }
Guolin Ke's avatar
Guolin Ke committed
529
530
531
532
    str_buf << "{";
    str_buf << "\"tree_index\":" << i << ",";
    str_buf << models_[i]->ToJSON();
    str_buf << "}";
wxchan's avatar
wxchan committed
533
  }
Guolin Ke's avatar
Guolin Ke committed
534
  str_buf << "]" << std::endl;
wxchan's avatar
wxchan committed
535

Guolin Ke's avatar
Guolin Ke committed
536
  str_buf << "}" << std::endl;
wxchan's avatar
wxchan committed
537

Guolin Ke's avatar
Guolin Ke committed
538
  return str_buf.str();
wxchan's avatar
wxchan committed
539
540
}

541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
std::string GBDT::SaveModelToString(int num_iterations) const {
    std::stringstream ss;

    // output model type
    ss << SubModelName() << std::endl;
    // output number of class
    ss << "num_class=" << num_class_ << std::endl;
    // output label index
    ss << "label_index=" << label_idx_ << std::endl;
    // output max_feature_idx
    ss << "max_feature_idx=" << max_feature_idx_ << std::endl;
    // output objective name
    if (object_function_ != nullptr) {
      ss << "objective=" << object_function_->GetName() << std::endl;
    }
    // output sigmoid parameter
    ss << "sigmoid=" << sigmoid_ << std::endl;

    ss << "feature_names=" << Common::Join(feature_names_, " ") << std::endl;

    ss << std::endl;
    int num_used_model = static_cast<int>(models_.size());
    if (num_iterations > 0) {
      num_used_model = std::min(num_iterations * num_class_, num_used_model);
    }
    // output tree models
    for (int i = 0; i < num_used_model; ++i) {
      ss << "Tree=" << i << std::endl;
      ss << models_[i]->ToString() << std::endl;
    }

    std::vector<std::pair<size_t, std::string>> pairs = FeatureImportance();
    ss << std::endl << "feature importances:" << std::endl;
    for (size_t i = 0; i < pairs.size(); ++i) {
      ss << pairs[i].second << "=" << std::to_string(pairs[i].first) << std::endl;
    }

    return ss.str();
}

581
bool GBDT::SaveModelToFile(int num_iteration, const char* filename) const {
wxchan's avatar
wxchan committed
582
583
584
  /*! \brief File to write models */
  std::ofstream output_file;
  output_file.open(filename);
585

586
  output_file << SaveModelToString(num_iteration);
587

wxchan's avatar
wxchan committed
588
  output_file.close();
589
590

  return (bool)output_file;
Guolin Ke's avatar
Guolin Ke committed
591
592
}

593
bool GBDT::LoadModelFromString(const std::string& model_str) {
Guolin Ke's avatar
Guolin Ke committed
594
595
596
  // use serialized string to restore this object
  models_.clear();
  std::vector<std::string> lines = Common::Split(model_str.c_str(), '\n');
597
598

  // get number of classes
599
600
601
602
  auto line = Common::FindFromLines(lines, "num_class=");
  if (line.size() > 0) {
    Common::Atoi(Common::Split(line.c_str(), '=')[1].c_str(), &num_class_);
  } else {
603
    Log::Fatal("Model file doesn't specify the number of classes");
604
    return false;
605
  }
Guolin Ke's avatar
Guolin Ke committed
606
  // get index of label
607
608
609
610
  line = Common::FindFromLines(lines, "label_index=");
  if (line.size() > 0) {
    Common::Atoi(Common::Split(line.c_str(), '=')[1].c_str(), &label_idx_);
  } else {
611
    Log::Fatal("Model file doesn't specify the label index");
612
    return false;
Guolin Ke's avatar
Guolin Ke committed
613
  }
Guolin Ke's avatar
Guolin Ke committed
614
  // get max_feature_idx first
615
616
617
618
  line = Common::FindFromLines(lines, "max_feature_idx=");
  if (line.size() > 0) {
    Common::Atoi(Common::Split(line.c_str(), '=')[1].c_str(), &max_feature_idx_);
  } else {
619
    Log::Fatal("Model file doesn't specify max_feature_idx");
620
    return false;
Guolin Ke's avatar
Guolin Ke committed
621
622
  }
  // get sigmoid parameter
623
624
625
626
  line = Common::FindFromLines(lines, "sigmoid=");
  if (line.size() > 0) {
    Common::Atof(Common::Split(line.c_str(), '=')[1].c_str(), &sigmoid_);
  } else {
627
    sigmoid_ = -1.0f;
Guolin Ke's avatar
Guolin Ke committed
628
  }
Guolin Ke's avatar
Guolin Ke committed
629
630
631
  // get feature names
  line = Common::FindFromLines(lines, "feature_names=");
  if (line.size() > 0) {
Guolin Ke's avatar
Guolin Ke committed
632
    feature_names_ = Common::Split(line.substr(std::strlen("feature_names=")).c_str(), " ");
Guolin Ke's avatar
Guolin Ke committed
633
634
    if (feature_names_.size() != static_cast<size_t>(max_feature_idx_ + 1)) {
      Log::Fatal("Wrong size of feature_names");
635
      return false;
Guolin Ke's avatar
Guolin Ke committed
636
    }
Guolin Ke's avatar
Guolin Ke committed
637
638
  }
  else {
Guolin Ke's avatar
Guolin Ke committed
639
    Log::Fatal("Model file doesn't contain feature names");
640
    return false;
Guolin Ke's avatar
Guolin Ke committed
641
642
  }

Guolin Ke's avatar
Guolin Ke committed
643
  // get tree models
644
  size_t i = 0;
Guolin Ke's avatar
Guolin Ke committed
645
646
647
648
649
650
651
  while (i < lines.size()) {
    size_t find_pos = lines[i].find("Tree=");
    if (find_pos != std::string::npos) {
      ++i;
      int start = static_cast<int>(i);
      while (i < lines.size() && lines[i].find("Tree=") == std::string::npos) { ++i; }
      int end = static_cast<int>(i);
Guolin Ke's avatar
Guolin Ke committed
652
      std::string tree_str = Common::Join<std::string>(lines, start, end, "\n");
Guolin Ke's avatar
Guolin Ke committed
653
654
      auto new_tree = std::unique_ptr<Tree>(new Tree(tree_str));
      models_.push_back(std::move(new_tree));
Guolin Ke's avatar
Guolin Ke committed
655
656
657
658
    } else {
      ++i;
    }
  }
659
  Log::Info("Finished loading %d models", models_.size());
wxchan's avatar
wxchan committed
660
661
  num_iteration_for_pred_ = static_cast<int>(models_.size()) / num_class_;
  num_init_iteration_ = num_iteration_for_pred_;
662
  iter_ = 0;
663
664

  return true;
Guolin Ke's avatar
Guolin Ke committed
665
666
}

wxchan's avatar
wxchan committed
667
std::vector<std::pair<size_t, std::string>> GBDT::FeatureImportance() const {
668

669
  std::vector<size_t> feature_importances(max_feature_idx_ + 1, 0);
670
    for (size_t iter = 0; iter < models_.size(); ++iter) {
671
        for (int split_idx = 0; split_idx < models_[iter]->num_leaves() - 1; ++split_idx) {
Guolin Ke's avatar
Guolin Ke committed
672
            ++feature_importances[models_[iter]->split_feature(split_idx)];
wxchan's avatar
wxchan committed
673
674
        }
    }
675
676
677
    // store the importance first
    std::vector<std::pair<size_t, std::string>> pairs;
    for (size_t i = 0; i < feature_importances.size(); ++i) {
Guolin Ke's avatar
Guolin Ke committed
678
      if (feature_importances[i] > 0) {
679
        pairs.emplace_back(feature_importances[i], feature_names_[i]);
Guolin Ke's avatar
Guolin Ke committed
680
      }
681
682
683
684
685
    }
    // sort the importance
    std::sort(pairs.begin(), pairs.end(),
      [](const std::pair<size_t, std::string>& lhs,
        const std::pair<size_t, std::string>& rhs) {
686
      return lhs.first > rhs.first;
687
    });
wxchan's avatar
wxchan committed
688
    return pairs;
wxchan's avatar
wxchan committed
689
690
}

691
692
std::vector<double> GBDT::PredictRaw(const double* value) const {
  std::vector<double> ret(num_class_, 0.0f);
wxchan's avatar
wxchan committed
693
  for (int i = 0; i < num_iteration_for_pred_; ++i) {
694
695
696
    for (int j = 0; j < num_class_; ++j) {
      ret[j] += models_[i * num_class_ + j]->Predict(value);
    }
Guolin Ke's avatar
Guolin Ke committed
697
698
699
700
  }
  return ret;
}

701
std::vector<double> GBDT::Predict(const double* value) const {
702
  std::vector<double> ret(num_class_, 0.0f);
wxchan's avatar
wxchan committed
703
  for (int i = 0; i < num_iteration_for_pred_; ++i) {
704
705
    for (int j = 0; j < num_class_; ++j) {
      ret[j] += models_[i * num_class_ + j]->Predict(value);
706
707
    }
  }
708
709
  // if need sigmoid transform
  if (sigmoid_ > 0 && num_class_ == 1) {
710
    ret[0] = 1.0f / (1.0f + std::exp(-sigmoid_ * ret[0]));
711
712
713
  } else if (num_class_ > 1) {
    Common::Softmax(&ret);
  }
714
715
716
  return ret;
}

717
std::vector<int> GBDT::PredictLeafIndex(const double* value) const {
wxchan's avatar
wxchan committed
718
  std::vector<int> ret;
wxchan's avatar
wxchan committed
719
  for (int i = 0; i < num_iteration_for_pred_; ++i) {
720
721
722
    for (int j = 0; j < num_class_; ++j) {
      ret.push_back(models_[i * num_class_ + j]->PredictLeafIndex(value));
    }
wxchan's avatar
wxchan committed
723
724
725
726
  }
  return ret;
}

Guolin Ke's avatar
Guolin Ke committed
727
}  // namespace LightGBM