gbdt.cpp 26.2 KB
Newer Older
Guolin Ke's avatar
Guolin Ke committed
1
2
#include "gbdt.h"

3
#include <LightGBM/utils/openmp_wrapper.h>
4

Guolin Ke's avatar
Guolin Ke committed
5
6
7
8
9
10
11
12
13
14
15
16
#include <LightGBM/utils/common.h>

#include <LightGBM/feature.h>
#include <LightGBM/objective_function.h>
#include <LightGBM/metric.h>

#include <ctime>

#include <sstream>
#include <chrono>
#include <string>
#include <vector>
17
#include <utility>
Guolin Ke's avatar
Guolin Ke committed
18
19
20

namespace LightGBM {

21
GBDT::GBDT()
22
  :iter_(0),
23
24
25
26
27
28
  train_data_(nullptr),
  object_function_(nullptr),
  early_stopping_round_(0),
  max_feature_idx_(0),
  num_class_(1),
  sigmoid_(1.0f),
29
  num_iteration_for_pred_(0),
30
  shrinkage_rate_(0.1f),
wxchan's avatar
wxchan committed
31
  num_init_iteration_(0) {
32
33
34
35
36
#pragma omp parallel
#pragma omp master
    {
      num_threads_ = omp_get_num_threads();
    }
Guolin Ke's avatar
Guolin Ke committed
37
38
39
}

GBDT::~GBDT() {
Guolin Ke's avatar
Guolin Ke committed
40

Guolin Ke's avatar
Guolin Ke committed
41
42
}

43
44
45
void GBDT::Init(const BoostingConfig* config, const Dataset* train_data, const ObjectiveFunction* object_function,
     const std::vector<const Metric*>& training_metrics) {
  iter_ = 0;
wxchan's avatar
wxchan committed
46
  num_iteration_for_pred_ = 0;
47
  max_feature_idx_ = 0;
wxchan's avatar
wxchan committed
48
49
  num_class_ = config->num_class;
  train_data_ = nullptr;
Guolin Ke's avatar
Guolin Ke committed
50
  gbdt_config_ = nullptr;
51
  tree_learner_ = nullptr;
wxchan's avatar
wxchan committed
52
53
54
55
56
  ResetTrainingData(config, train_data, object_function, training_metrics);
}

void GBDT::ResetTrainingData(const BoostingConfig* config, const Dataset* train_data, const ObjectiveFunction* object_function,
  const std::vector<const Metric*>& training_metrics) {
Guolin Ke's avatar
Guolin Ke committed
57
  auto new_config = std::unique_ptr<BoostingConfig>(new BoostingConfig(*config));
wxchan's avatar
wxchan committed
58
59
60
  if (train_data_ != nullptr && !train_data_->CheckAlign(*train_data)) {
    Log::Fatal("cannot reset training data, since new training data has different bin mappers");
  }
Guolin Ke's avatar
Guolin Ke committed
61
62
63
  early_stopping_round_ = new_config->early_stopping_round;
  shrinkage_rate_ = new_config->learning_rate;

Guolin Ke's avatar
Guolin Ke committed
64
  object_function_ = object_function;
Guolin Ke's avatar
Guolin Ke committed
65

Guolin Ke's avatar
Guolin Ke committed
66
  sigmoid_ = -1.0f;
wxchan's avatar
wxchan committed
67
  if (object_function_ != nullptr
Guolin Ke's avatar
Guolin Ke committed
68
69
    && std::string(object_function_->GetName()) == std::string("binary")) {
    // only binary classification need sigmoid transform
Guolin Ke's avatar
Guolin Ke committed
70
    sigmoid_ = new_config->sigmoid;
71
  }
Guolin Ke's avatar
Guolin Ke committed
72

Guolin Ke's avatar
Guolin Ke committed
73
  if (train_data_ != train_data && train_data != nullptr) {
74
75
    if (tree_learner_ == nullptr) {
      tree_learner_ = std::unique_ptr<TreeLearner>(TreeLearner::CreateTreeLearner(new_config->tree_learner_type, &new_config->tree_config));
Guolin Ke's avatar
Guolin Ke committed
76
77
    }
    // init tree learner
78
    tree_learner_->Init(train_data);
Guolin Ke's avatar
Guolin Ke committed
79

Guolin Ke's avatar
Guolin Ke committed
80
81
82
83
84
85
    // push training metrics
    training_metrics_.clear();
    for (const auto& metric : training_metrics) {
      training_metrics_.push_back(metric);
    }
    training_metrics_.shrink_to_fit();
wxchan's avatar
wxchan committed
86
87
88
89
90
91
92
93
94
95
96
97
98
    // not same training data, need reset score and others
    // create score tracker
    train_score_updater_.reset(new ScoreUpdater(train_data, num_class_));
    // update score
    for (int i = 0; i < iter_; ++i) {
      for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
        auto curr_tree = (i + num_init_iteration_) * num_class_ + curr_class;
        train_score_updater_->AddScore(models_[curr_tree].get(), curr_class);
      }
    }
    num_data_ = train_data->num_data();
    // create buffer for gradients and hessians
    if (object_function_ != nullptr) {
99
100
101
      size_t total_size = static_cast<size_t>(num_data_) * num_class_;
      gradients_.resize(total_size);
      hessians_.resize(total_size);
wxchan's avatar
wxchan committed
102
103
104
105
106
    }
    // get max feature index
    max_feature_idx_ = train_data->num_total_features() - 1;
    // get label index
    label_idx_ = train_data->label_idx();
107
108
    // get feature names
    feature_names_ = train_data->feature_names();
109
110
111
112
113
114
115
116
117
118
    // get feature infos
    feature_infos_.clear();
    for (int i = 0; i < max_feature_idx_ + 1; ++i) {
      int feature_idx = train_data->GetInnerFeatureIndex(i);
      if (feature_idx < 0) { 
        feature_infos_.push_back("trival feature"); 
      } else {
        feature_infos_.push_back(train_data->FeatureAt(feature_idx)->bin_mapper()->bin_info());
      }
    }
Guolin Ke's avatar
Guolin Ke committed
119
120
  }

Guolin Ke's avatar
Guolin Ke committed
121
122
  if ((train_data_ != train_data && train_data != nullptr)
    || (gbdt_config_ != nullptr && gbdt_config_->bagging_fraction != new_config->bagging_fraction)) {
wxchan's avatar
wxchan committed
123
    // if need bagging, create buffer
Guolin Ke's avatar
Guolin Ke committed
124
    if (new_config->bagging_fraction < 1.0 && new_config->bagging_freq > 0) {
125
126
      bag_data_cnt_ =
        static_cast<data_size_t>(new_config->bagging_fraction * num_data_);
127
      bag_data_indices_.resize(num_data_);
128
129
130
131
132
133
      tmp_indices_.resize(num_data_);
      offsets_buf_.resize(num_threads_);
      left_cnts_buf_.resize(num_threads_);
      right_cnts_buf_.resize(num_threads_);
      left_write_pos_buf_.resize(num_threads_);
      right_write_pos_buf_.resize(num_threads_);
Guolin Ke's avatar
Guolin Ke committed
134
135
      double average_bag_rate = new_config->bagging_fraction / new_config->bagging_freq;
      is_use_subset_ = false;
Guolin Ke's avatar
Guolin Ke committed
136
137
138
      if (average_bag_rate < 0.5) {
        tmp_subset_.reset(new Dataset(bag_data_cnt_));
        tmp_subset_->CopyFeatureMapperFrom(train_data, false);
Guolin Ke's avatar
Guolin Ke committed
139
140
141
        is_use_subset_ = true;
        Log::Debug("use subset for bagging");
      }
wxchan's avatar
wxchan committed
142
143
144
    } else {
      bag_data_cnt_ = num_data_;
      bag_data_indices_.clear();
145
      tmp_indices_.clear();
Guolin Ke's avatar
Guolin Ke committed
146
      is_use_subset_ = false;
wxchan's avatar
wxchan committed
147
    }
Guolin Ke's avatar
Guolin Ke committed
148
  }
wxchan's avatar
wxchan committed
149
  train_data_ = train_data;
Guolin Ke's avatar
Guolin Ke committed
150
151
  if (train_data_ != nullptr) {
    // reset config for tree learner
152
    tree_learner_->ResetConfig(&new_config->tree_config);
Guolin Ke's avatar
Guolin Ke committed
153
  }
Guolin Ke's avatar
Guolin Ke committed
154
  gbdt_config_.reset(new_config.release());
Guolin Ke's avatar
Guolin Ke committed
155
156
}

wxchan's avatar
wxchan committed
157
void GBDT::AddValidDataset(const Dataset* valid_data,
Guolin Ke's avatar
Guolin Ke committed
158
  const std::vector<const Metric*>& valid_metrics) {
wxchan's avatar
wxchan committed
159
160
  if (!train_data_->CheckAlign(*valid_data)) {
    Log::Fatal("cannot add validation data, since it has different bin mappers with training data");
161
  }
Guolin Ke's avatar
Guolin Ke committed
162
  // for a validation dataset, we need its score and metric
Guolin Ke's avatar
Guolin Ke committed
163
  auto new_score_updater = std::unique_ptr<ScoreUpdater>(new ScoreUpdater(valid_data, num_class_));
wxchan's avatar
wxchan committed
164
165
166
167
168
169
170
  // update score
  for (int i = 0; i < iter_; ++i) {
    for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
      auto curr_tree = (i + num_init_iteration_) * num_class_ + curr_class;
      new_score_updater->AddScore(models_[curr_tree].get(), curr_class);
    }
  }
Guolin Ke's avatar
Guolin Ke committed
171
  valid_score_updater_.push_back(std::move(new_score_updater));
Guolin Ke's avatar
Guolin Ke committed
172
  valid_metrics_.emplace_back();
173
174
175
  if (early_stopping_round_ > 0) {
    best_iter_.emplace_back();
    best_score_.emplace_back();
Guolin Ke's avatar
Guolin Ke committed
176
    best_msg_.emplace_back();
177
  }
Guolin Ke's avatar
Guolin Ke committed
178
179
  for (const auto& metric : valid_metrics) {
    valid_metrics_.back().push_back(metric);
180
181
182
    if (early_stopping_round_ > 0) {
      best_iter_.back().push_back(0);
      best_score_.back().push_back(kMinScore);
Guolin Ke's avatar
Guolin Ke committed
183
      best_msg_.back().emplace_back();
184
    }
Guolin Ke's avatar
Guolin Ke committed
185
  }
Guolin Ke's avatar
Guolin Ke committed
186
  valid_metrics_.back().shrink_to_fit();
Guolin Ke's avatar
Guolin Ke committed
187
188
}

Guolin Ke's avatar
Guolin Ke committed
189
data_size_t GBDT::BaggingHelper(Random& cur_rand, data_size_t start, data_size_t cnt, data_size_t* buffer){
190
191
192
193
  data_size_t bag_data_cnt =
    static_cast<data_size_t>(gbdt_config_->bagging_fraction * cnt);
  data_size_t cur_left_cnt = 0;
  data_size_t cur_right_cnt = 0;
Guolin Ke's avatar
Guolin Ke committed
194
  auto right_buffer = buffer + bag_data_cnt;
195
196
  // random bagging, minimal unit is one record
  for (data_size_t i = 0; i < cnt; ++i) {
Guolin Ke's avatar
Guolin Ke committed
197
198
199
    float prob =
      (bag_data_cnt - cur_left_cnt) / static_cast<float>(cnt - i);
    if (cur_rand.NextFloat() < prob) {
200
201
      buffer[cur_left_cnt++] = start + i;
    } else {
Guolin Ke's avatar
Guolin Ke committed
202
      right_buffer[cur_right_cnt++] = start + i;
203
204
    }
  }
Guolin Ke's avatar
Guolin Ke committed
205
  CHECK(buffer[bag_data_cnt - 1] > buffer[bag_data_cnt]);
206
207
208
  CHECK(cur_left_cnt == bag_data_cnt);
  return cur_left_cnt;
}
Guolin Ke's avatar
Guolin Ke committed
209

Guolin Ke's avatar
Guolin Ke committed
210
211


212
void GBDT::Bagging(int iter) {
Guolin Ke's avatar
Guolin Ke committed
213
  // if need bagging
214
  if (bag_data_cnt_ < num_data_ && iter % gbdt_config_->bagging_freq == 0) {
Guolin Ke's avatar
Guolin Ke committed
215
    const data_size_t min_inner_size = 1000;
216
217
218
    data_size_t inner_size = (num_data_ + num_threads_ - 1) / num_threads_;
    if (inner_size < min_inner_size) { inner_size = min_inner_size; }

Guolin Ke's avatar
Guolin Ke committed
219
#pragma omp parallel for schedule(static,1)
220
221
222
223
224
225
226
    for (int i = 0; i < num_threads_; ++i) {
      left_cnts_buf_[i] = 0;
      right_cnts_buf_[i] = 0;
      data_size_t cur_start = i * inner_size;
      if (cur_start > num_data_) { continue; }
      data_size_t cur_cnt = inner_size;
      if (cur_start + cur_cnt > num_data_) { cur_cnt = num_data_ - cur_start; }
Guolin Ke's avatar
Guolin Ke committed
227
228
      Random cur_rand(gbdt_config_->bagging_seed + iter * num_threads_ + i);
      data_size_t cur_left_count = BaggingHelper(cur_rand, cur_start, cur_cnt, tmp_indices_.data() + cur_start);
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
      offsets_buf_[i] = cur_start;
      left_cnts_buf_[i] = cur_left_count;
      right_cnts_buf_[i] = cur_cnt - cur_left_count;
    }
    data_size_t left_cnt = 0;
    left_write_pos_buf_[0] = 0;
    right_write_pos_buf_[0] = 0;
    for (int i = 1; i < num_threads_; ++i) {
      left_write_pos_buf_[i] = left_write_pos_buf_[i - 1] + left_cnts_buf_[i - 1];
      right_write_pos_buf_[i] = right_write_pos_buf_[i - 1] + right_cnts_buf_[i - 1];
    }
    left_cnt = left_write_pos_buf_[num_threads_ - 1] + left_cnts_buf_[num_threads_ - 1];

#pragma omp parallel for schedule(static, 1)
    for (int i = 0; i < num_threads_; ++i) {
      if (left_cnts_buf_[i] > 0) {
        std::memcpy(bag_data_indices_.data() + left_write_pos_buf_[i],
          tmp_indices_.data() + offsets_buf_[i], left_cnts_buf_[i] * sizeof(data_size_t));
Guolin Ke's avatar
Guolin Ke committed
247
      }
248
249
250
      if (right_cnts_buf_[i] > 0) {
        std::memcpy(bag_data_indices_.data() + left_cnt + right_write_pos_buf_[i],
          tmp_indices_.data() + offsets_buf_[i] + left_cnts_buf_[i], right_cnts_buf_[i] * sizeof(data_size_t));
Guolin Ke's avatar
Guolin Ke committed
251
252
      }
    }
Guolin Ke's avatar
Guolin Ke committed
253
254
    bag_data_cnt_ = left_cnt;
    CHECK(bag_data_indices_[bag_data_cnt_ - 1] > bag_data_indices_[bag_data_cnt_]);
Guolin Ke's avatar
Guolin Ke committed
255
    Log::Debug("Re-bagging, using %d data to train", bag_data_cnt_);
Guolin Ke's avatar
Guolin Ke committed
256
    // set bagging data to tree learner
Guolin Ke's avatar
Guolin Ke committed
257
258
259
260
    if (!is_use_subset_) {
      tree_learner_->SetBaggingData(bag_data_indices_.data(), bag_data_cnt_);
    } else {
      // get subset
Guolin Ke's avatar
Guolin Ke committed
261
262
      tmp_subset_->ReSize(bag_data_cnt_);
      tmp_subset_->CopySubset(train_data_, bag_data_indices_.data(), bag_data_cnt_, false);
Guolin Ke's avatar
Guolin Ke committed
263
264
      tree_learner_->ResetTrainingData(tmp_subset_.get());
    }
Guolin Ke's avatar
Guolin Ke committed
265
266
267
  }
}

268
void GBDT::UpdateScoreOutOfBag(const Tree* tree, const int curr_class) {
Hui Xue's avatar
Hui Xue committed
269
  // we need to predict out-of-bag socres of data for boosting
Guolin Ke's avatar
Guolin Ke committed
270
  if (num_data_ - bag_data_cnt_ > 0 && !is_use_subset_) {
271
    train_score_updater_->AddScore(tree, bag_data_indices_.data() + bag_data_cnt_, num_data_ - bag_data_cnt_, curr_class);
Guolin Ke's avatar
Guolin Ke committed
272
273
274
  }
}

275
bool GBDT::TrainOneIter(const score_t* gradient, const score_t* hessian, bool is_eval) {
Guolin Ke's avatar
Guolin Ke committed
276
277
278
279
280
281
  // boosting first
  if (gradient == nullptr || hessian == nullptr) {
    Boosting();
    gradient = gradients_.data();
    hessian = hessians_.data();
  }
282
283
  // bagging logic
  Bagging(iter_);
Guolin Ke's avatar
Guolin Ke committed
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
  if (is_use_subset_ && bag_data_cnt_ < num_data_) {
    if (gradients_.empty()) {
      size_t total_size = static_cast<size_t>(num_data_) * num_class_;
      gradients_.resize(total_size);
      hessians_.resize(total_size);
    }    
    // get sub gradients
    for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
      auto bias = curr_class * num_data_;
      for (int i = 0; i < bag_data_cnt_; ++i) {
        gradients_[bias + i] = gradient[bias + bag_data_indices_[i]];
        hessians_[bias + i] = hessian[bias + bag_data_indices_[i]];
      }
    }
    gradient = gradients_.data();
    hessian = hessians_.data();
  }
Guolin Ke's avatar
Guolin Ke committed
301
302
  for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
    // train a new tree
303
    std::unique_ptr<Tree> new_tree(tree_learner_->Train(gradient + curr_class * num_data_, hessian + curr_class * num_data_));
Guolin Ke's avatar
Guolin Ke committed
304
305
306
307
    // if cannot learn a new tree, then stop
    if (new_tree->num_leaves() <= 1) {
      Log::Info("Stopped training because there are no more leafs that meet the split requirements.");
      return true;
308
    }
309

Guolin Ke's avatar
Guolin Ke committed
310
311
312
313
314
    // shrinkage by learning rate
    new_tree->Shrinkage(shrinkage_rate_);
    // update score
    UpdateScore(new_tree.get(), curr_class);
    UpdateScoreOutOfBag(new_tree.get(), curr_class);
315

Guolin Ke's avatar
Guolin Ke committed
316
317
318
319
320
321
322
323
324
    // add model
    models_.push_back(std::move(new_tree));
  }
  ++iter_;
  if (is_eval) {
    return EvalAndCheckEarlyStopping();
  } else {
    return false;
  }
325

Guolin Ke's avatar
Guolin Ke committed
326
}
327

wxchan's avatar
wxchan committed
328
void GBDT::RollbackOneIter() {
329
  if (iter_ <= 0) { return; }
wxchan's avatar
wxchan committed
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
  int cur_iter = iter_ + num_init_iteration_ - 1;
  // reset score
  for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
    auto curr_tree = cur_iter * num_class_ + curr_class;
    models_[curr_tree]->Shrinkage(-1.0);
    train_score_updater_->AddScore(models_[curr_tree].get(), curr_class);
    for (auto& score_updater : valid_score_updater_) {
      score_updater->AddScore(models_[curr_tree].get(), curr_class);
    }
  }
  // remove model
  for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
    models_.pop_back();
  }
  --iter_;
}

Guolin Ke's avatar
Guolin Ke committed
347
bool GBDT::EvalAndCheckEarlyStopping() {
348
349
  bool is_met_early_stopping = false;
  // print message for metric
Guolin Ke's avatar
Guolin Ke committed
350
351
  auto best_msg = OutputMetric(iter_);
  is_met_early_stopping = !best_msg.empty();
352
353
354
  if (is_met_early_stopping) {
    Log::Info("Early stopping at iteration %d, the best iteration round is %d",
      iter_, iter_ - early_stopping_round_);
Guolin Ke's avatar
Guolin Ke committed
355
    Log::Info("Output of best iteration round:\n%s", best_msg.c_str());
356
    // pop last early_stopping_round_ models
357
    for (int i = 0; i < early_stopping_round_ * num_class_; ++i) {
358
359
360
361
      models_.pop_back();
    }
  }
  return is_met_early_stopping;
Guolin Ke's avatar
Guolin Ke committed
362
363
}

364
void GBDT::UpdateScore(const Tree* tree, const int curr_class) {
Guolin Ke's avatar
Guolin Ke committed
365
  // update training score
Guolin Ke's avatar
Guolin Ke committed
366
367
368
369
370
  if (!is_use_subset_) {
    train_score_updater_->AddScore(tree_learner_.get(), curr_class);
  } else {
    train_score_updater_->AddScore(tree, curr_class);
  }
Guolin Ke's avatar
Guolin Ke committed
371
  // update validation score
Guolin Ke's avatar
Guolin Ke committed
372
373
  for (auto& score_updater : valid_score_updater_) {
    score_updater->AddScore(tree, curr_class);
Guolin Ke's avatar
Guolin Ke committed
374
375
376
  }
}

Guolin Ke's avatar
Guolin Ke committed
377
378
379
380
std::string GBDT::OutputMetric(int iter) {
  bool need_output = (iter % gbdt_config_->output_freq) == 0;
  std::string ret = "";
  std::stringstream msg_buf;
381
  std::vector<std::pair<size_t, size_t>> meet_early_stopping_pairs;
Guolin Ke's avatar
Guolin Ke committed
382
  // print training metric
Guolin Ke's avatar
Guolin Ke committed
383
  if (need_output) {
384
385
386
    for (auto& sub_metric : training_metrics_) {
      auto name = sub_metric->GetName();
      auto scores = sub_metric->Eval(train_score_updater_->score());
Guolin Ke's avatar
Guolin Ke committed
387
      for (size_t k = 0; k < name.size(); ++k) {
Guolin Ke's avatar
Guolin Ke committed
388
389
390
391
392
393
394
395
        std::stringstream tmp_buf;
        tmp_buf << "Iteration:" << iter
          << ", training " << name[k]
          << " : " << scores[k];
        Log::Info(tmp_buf.str().c_str());
        if (early_stopping_round_ > 0) {
          msg_buf << tmp_buf.str() << std::endl;
        }
396
      }
397
    }
Guolin Ke's avatar
Guolin Ke committed
398
399
  }
  // print validation metric
Guolin Ke's avatar
Guolin Ke committed
400
  if (need_output || early_stopping_round_ > 0) {
401
402
403
    for (size_t i = 0; i < valid_metrics_.size(); ++i) {
      for (size_t j = 0; j < valid_metrics_[i].size(); ++j) {
        auto test_scores = valid_metrics_[i][j]->Eval(valid_score_updater_[i]->score());
Guolin Ke's avatar
Guolin Ke committed
404
405
406
407
408
409
410
411
412
413
414
        auto name = valid_metrics_[i][j]->GetName();
        for (size_t k = 0; k < name.size(); ++k) {
          std::stringstream tmp_buf;
          tmp_buf << "Iteration:" << iter
            << ", valid_" << i + 1 << " " << name[k]
            << " : " << test_scores[k];
          if (need_output) {
            Log::Info(tmp_buf.str().c_str());
          }
          if (early_stopping_round_ > 0) {
            msg_buf << tmp_buf.str() << std::endl;
415
          }
wxchan's avatar
wxchan committed
416
        }
Guolin Ke's avatar
Guolin Ke committed
417
        if (ret.empty() && early_stopping_round_ > 0) {
418
419
420
          auto cur_score = valid_metrics_[i][j]->factor_to_bigger_better() * test_scores.back();
          if (cur_score > best_score_[i][j]) {
            best_score_[i][j] = cur_score;
421
            best_iter_[i][j] = iter;
Guolin Ke's avatar
Guolin Ke committed
422
            meet_early_stopping_pairs.emplace_back(i, j);
423
          } else {
Guolin Ke's avatar
Guolin Ke committed
424
            if (iter - best_iter_[i][j] >= early_stopping_round_) { ret = best_msg_[i][j]; }
425
          }
wxchan's avatar
wxchan committed
426
427
        }
      }
Guolin Ke's avatar
Guolin Ke committed
428
429
    }
  }
Guolin Ke's avatar
Guolin Ke committed
430
431
432
  for (auto& pair : meet_early_stopping_pairs) {
    best_msg_[pair.first][pair.second] = msg_buf.str();
  }
wxchan's avatar
wxchan committed
433
  return ret;
Guolin Ke's avatar
Guolin Ke committed
434
435
}

436
/*! \brief Get eval result */
437
std::vector<double> GBDT::GetEvalAt(int data_idx) const {
Guolin Ke's avatar
Guolin Ke committed
438
  CHECK(data_idx >= 0 && data_idx <= static_cast<int>(valid_score_updater_.size()));
439
440
  std::vector<double> ret;
  if (data_idx == 0) {
441
442
    for (auto& sub_metric : training_metrics_) {
      auto scores = sub_metric->Eval(train_score_updater_->score());
443
444
445
      for (auto score : scores) {
        ret.push_back(score);
      }
446
447
    }
  }
448
449
450
451
452
453
454
  else {
    auto used_idx = data_idx - 1;
    for (size_t j = 0; j < valid_metrics_[used_idx].size(); ++j) {
      auto test_scores = valid_metrics_[used_idx][j]->Eval(valid_score_updater_[used_idx]->score());
      for (auto score : test_scores) {
        ret.push_back(score);
      }
455
456
457
458
459
    }
  }
  return ret;
}

Guolin Ke's avatar
Guolin Ke committed
460
/*! \brief Get training scores result */
461
const double* GBDT::GetTrainingScore(int64_t* out_len) {
462
  *out_len = static_cast<int64_t>(train_score_updater_->num_data()) * num_class_;
Guolin Ke's avatar
Guolin Ke committed
463
  return train_score_updater_->score();
464
465
}

Guolin Ke's avatar
Guolin Ke committed
466
467
void GBDT::GetPredictAt(int data_idx, double* out_result, int64_t* out_len) {
  CHECK(data_idx >= 0 && data_idx <= static_cast<int>(valid_score_updater_.size()));
Guolin Ke's avatar
Guolin Ke committed
468

469
  const double* raw_scores = nullptr;
Guolin Ke's avatar
Guolin Ke committed
470
471
  data_size_t num_data = 0;
  if (data_idx == 0) {
wxchan's avatar
wxchan committed
472
    raw_scores = GetTrainingScore(out_len);
Guolin Ke's avatar
Guolin Ke committed
473
474
475
476
477
    num_data = train_score_updater_->num_data();
  } else {
    auto used_idx = data_idx - 1;
    raw_scores = valid_score_updater_[used_idx]->score();
    num_data = valid_score_updater_[used_idx]->num_data();
478
    *out_len = static_cast<int64_t>(num_data) * num_class_;
Guolin Ke's avatar
Guolin Ke committed
479
480
  }
  if (num_class_ > 1) {
wxchan's avatar
wxchan committed
481
#pragma omp parallel for schedule(static)
Guolin Ke's avatar
Guolin Ke committed
482
    for (data_size_t i = 0; i < num_data; ++i) {
483
      std::vector<double> tmp_result(num_class_);
Guolin Ke's avatar
Guolin Ke committed
484
      for (int j = 0; j < num_class_; ++j) {
485
        tmp_result[j] = raw_scores[j * num_data + i];
Guolin Ke's avatar
Guolin Ke committed
486
487
488
      }
      Common::Softmax(&tmp_result);
      for (int j = 0; j < num_class_; ++j) {
Guolin Ke's avatar
Guolin Ke committed
489
        out_result[j * num_data + i] = static_cast<double>(tmp_result[j]);
Guolin Ke's avatar
Guolin Ke committed
490
491
      }
    }
Guolin Ke's avatar
Guolin Ke committed
492
  } else if(sigmoid_ > 0.0f){
wxchan's avatar
wxchan committed
493
#pragma omp parallel for schedule(static)
Guolin Ke's avatar
Guolin Ke committed
494
    for (data_size_t i = 0; i < num_data; ++i) {
Guolin Ke's avatar
Guolin Ke committed
495
      out_result[i] = static_cast<double>(1.0f / (1.0f + std::exp(-2.0f * sigmoid_ * raw_scores[i])));
Guolin Ke's avatar
Guolin Ke committed
496
497
    }
  } else {
wxchan's avatar
wxchan committed
498
#pragma omp parallel for schedule(static)
Guolin Ke's avatar
Guolin Ke committed
499
    for (data_size_t i = 0; i < num_data; ++i) {
Guolin Ke's avatar
Guolin Ke committed
500
      out_result[i] = static_cast<double>(raw_scores[i]);
Guolin Ke's avatar
Guolin Ke committed
501
502
503
504
505
    }
  }

}

Guolin Ke's avatar
Guolin Ke committed
506
void GBDT::Boosting() {
507
508
509
  if (object_function_ == nullptr) {
    Log::Fatal("No object function provided");
  }
Hui Xue's avatar
Hui Xue committed
510
  // objective function will calculate gradients and hessians
511
  int64_t num_score = 0;
Guolin Ke's avatar
Guolin Ke committed
512
  object_function_->
Guolin Ke's avatar
Guolin Ke committed
513
    GetGradients(GetTrainingScore(&num_score), gradients_.data(), hessians_.data());
Guolin Ke's avatar
Guolin Ke committed
514
515
}

516
std::string GBDT::DumpModel(int num_iteration) const {
Guolin Ke's avatar
Guolin Ke committed
517
  std::stringstream str_buf;
wxchan's avatar
wxchan committed
518

Guolin Ke's avatar
Guolin Ke committed
519
  str_buf << "{";
Guolin Ke's avatar
Guolin Ke committed
520
  str_buf << "\"name\":\"" << SubModelName() << "\"," << std::endl;
Guolin Ke's avatar
Guolin Ke committed
521
522
523
524
  str_buf << "\"num_class\":" << num_class_ << "," << std::endl;
  str_buf << "\"label_index\":" << label_idx_ << "," << std::endl;
  str_buf << "\"max_feature_idx\":" << max_feature_idx_ << "," << std::endl;
  str_buf << "\"sigmoid\":" << sigmoid_ << "," << std::endl;
wxchan's avatar
wxchan committed
525

Guolin Ke's avatar
Guolin Ke committed
526
  str_buf << "\"feature_names\":[\"" 
527
     << Common::Join(feature_names_, "\",\"") << "\"]," 
Guolin Ke's avatar
Guolin Ke committed
528
529
     << std::endl;

Guolin Ke's avatar
Guolin Ke committed
530
  str_buf << "\"tree_info\":[";
531
532
533
534
535
  int num_used_model = static_cast<int>(models_.size());
  if (num_iteration > 0) {
    num_used_model = std::min(num_iteration * num_class_, num_used_model);
  } 
  for (int i = 0; i < num_used_model; ++i) {
wxchan's avatar
wxchan committed
536
    if (i > 0) {
Guolin Ke's avatar
Guolin Ke committed
537
      str_buf << ",";
wxchan's avatar
wxchan committed
538
    }
Guolin Ke's avatar
Guolin Ke committed
539
540
541
542
    str_buf << "{";
    str_buf << "\"tree_index\":" << i << ",";
    str_buf << models_[i]->ToJSON();
    str_buf << "}";
wxchan's avatar
wxchan committed
543
  }
Guolin Ke's avatar
Guolin Ke committed
544
  str_buf << "]" << std::endl;
wxchan's avatar
wxchan committed
545

Guolin Ke's avatar
Guolin Ke committed
546
  str_buf << "}" << std::endl;
wxchan's avatar
wxchan committed
547

Guolin Ke's avatar
Guolin Ke committed
548
  return str_buf.str();
wxchan's avatar
wxchan committed
549
550
}

551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
std::string GBDT::SaveModelToString(int num_iterations) const {
    std::stringstream ss;

    // output model type
    ss << SubModelName() << std::endl;
    // output number of class
    ss << "num_class=" << num_class_ << std::endl;
    // output label index
    ss << "label_index=" << label_idx_ << std::endl;
    // output max_feature_idx
    ss << "max_feature_idx=" << max_feature_idx_ << std::endl;
    // output objective name
    if (object_function_ != nullptr) {
      ss << "objective=" << object_function_->GetName() << std::endl;
    }
    // output sigmoid parameter
    ss << "sigmoid=" << sigmoid_ << std::endl;

    ss << "feature_names=" << Common::Join(feature_names_, " ") << std::endl;

    ss << std::endl;
    int num_used_model = static_cast<int>(models_.size());
    if (num_iterations > 0) {
      num_used_model = std::min(num_iterations * num_class_, num_used_model);
    }
    // output tree models
    for (int i = 0; i < num_used_model; ++i) {
      ss << "Tree=" << i << std::endl;
      ss << models_[i]->ToString() << std::endl;
    }

    std::vector<std::pair<size_t, std::string>> pairs = FeatureImportance();
    ss << std::endl << "feature importances:" << std::endl;
    for (size_t i = 0; i < pairs.size(); ++i) {
      ss << pairs[i].second << "=" << std::to_string(pairs[i].first) << std::endl;
    }

    ss << std::endl << "feature information:" << std::endl;
Tsukasa OMOTO's avatar
Tsukasa OMOTO committed
589
    for (int i = 0; i < max_feature_idx_ + 1; ++i) {
590
591
592
593
594
595
      ss << feature_names_[i] << "=" << feature_infos_[i] << std::endl;
    }

    return ss.str();
}

596
bool GBDT::SaveModelToFile(int num_iteration, const char* filename) const {
wxchan's avatar
wxchan committed
597
598
599
  /*! \brief File to write models */
  std::ofstream output_file;
  output_file.open(filename);
600

601
  output_file << SaveModelToString(num_iteration);
602

wxchan's avatar
wxchan committed
603
  output_file.close();
604
605

  return (bool)output_file;
Guolin Ke's avatar
Guolin Ke committed
606
607
}

608
bool GBDT::LoadModelFromString(const std::string& model_str) {
Guolin Ke's avatar
Guolin Ke committed
609
610
611
  // use serialized string to restore this object
  models_.clear();
  std::vector<std::string> lines = Common::Split(model_str.c_str(), '\n');
612
613

  // get number of classes
614
615
616
617
  auto line = Common::FindFromLines(lines, "num_class=");
  if (line.size() > 0) {
    Common::Atoi(Common::Split(line.c_str(), '=')[1].c_str(), &num_class_);
  } else {
618
    Log::Fatal("Model file doesn't specify the number of classes");
619
    return false;
620
  }
Guolin Ke's avatar
Guolin Ke committed
621
  // get index of label
622
623
624
625
  line = Common::FindFromLines(lines, "label_index=");
  if (line.size() > 0) {
    Common::Atoi(Common::Split(line.c_str(), '=')[1].c_str(), &label_idx_);
  } else {
626
    Log::Fatal("Model file doesn't specify the label index");
627
    return false;
Guolin Ke's avatar
Guolin Ke committed
628
  }
Guolin Ke's avatar
Guolin Ke committed
629
  // get max_feature_idx first
630
631
632
633
  line = Common::FindFromLines(lines, "max_feature_idx=");
  if (line.size() > 0) {
    Common::Atoi(Common::Split(line.c_str(), '=')[1].c_str(), &max_feature_idx_);
  } else {
634
    Log::Fatal("Model file doesn't specify max_feature_idx");
635
    return false;
Guolin Ke's avatar
Guolin Ke committed
636
637
  }
  // get sigmoid parameter
638
639
640
641
  line = Common::FindFromLines(lines, "sigmoid=");
  if (line.size() > 0) {
    Common::Atof(Common::Split(line.c_str(), '=')[1].c_str(), &sigmoid_);
  } else {
642
    sigmoid_ = -1.0f;
Guolin Ke's avatar
Guolin Ke committed
643
  }
Guolin Ke's avatar
Guolin Ke committed
644
645
646
  // get feature names
  line = Common::FindFromLines(lines, "feature_names=");
  if (line.size() > 0) {
Guolin Ke's avatar
Guolin Ke committed
647
    feature_names_ = Common::Split(line.substr(std::strlen("feature_names=")).c_str(), " ");
Guolin Ke's avatar
Guolin Ke committed
648
649
    if (feature_names_.size() != static_cast<size_t>(max_feature_idx_ + 1)) {
      Log::Fatal("Wrong size of feature_names");
650
      return false;
Guolin Ke's avatar
Guolin Ke committed
651
652
653
    }
  } else {
    Log::Fatal("Model file doesn't contain feature names");
654
    return false;
Guolin Ke's avatar
Guolin Ke committed
655
656
  }

657
  // returns offset, or lines.size() if not found.
Guolin Ke's avatar
Guolin Ke committed
658
  auto find_string_lineno = [&lines](const std::string &str, size_t start_line)
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
  {
    size_t i = start_line;
    size_t featinfo_find_pos = std::string::npos;
    while (i < lines.size()) {
      featinfo_find_pos = lines[i].find(str);
      if (featinfo_find_pos != std::string::npos)
        break;
      ++i;
    }

    return i;
  };

  // load feature information
  {
Guolin Ke's avatar
Guolin Ke committed
674
    size_t finfo_line_idx = find_string_lineno("feature information:", 0);
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696

    if (finfo_line_idx >= lines.size()) {
      Log::Fatal("Model file doesn't contain feature information");
      return false;
    }

    feature_infos_.resize(max_feature_idx_ + 1);

    // search for each feature name
    for (int i=0; i < max_feature_idx_ + 1; i++) {
      const auto feat_name = feature_names_[i];
      size_t line_idx = find_string_lineno(feat_name + "=", finfo_line_idx + 1);
      if (line_idx >= lines.size()) {
        Log::Fatal(("Model file doesn't contain feature information for feature " + feat_name).c_str());
        return false;
      }

      const auto this_line = lines[line_idx];
      feature_infos_[i] = this_line.substr((feat_name + "=").size());
    }
  }

Guolin Ke's avatar
Guolin Ke committed
697
  // get tree models
698
  size_t i = 0;
Guolin Ke's avatar
Guolin Ke committed
699
700
701
702
703
704
705
  while (i < lines.size()) {
    size_t find_pos = lines[i].find("Tree=");
    if (find_pos != std::string::npos) {
      ++i;
      int start = static_cast<int>(i);
      while (i < lines.size() && lines[i].find("Tree=") == std::string::npos) { ++i; }
      int end = static_cast<int>(i);
Guolin Ke's avatar
Guolin Ke committed
706
      std::string tree_str = Common::Join<std::string>(lines, start, end, "\n");
Guolin Ke's avatar
Guolin Ke committed
707
708
      auto new_tree = std::unique_ptr<Tree>(new Tree(tree_str));
      models_.push_back(std::move(new_tree));
Guolin Ke's avatar
Guolin Ke committed
709
710
711
712
    } else {
      ++i;
    }
  }
713
  Log::Info("Finished loading %d models", models_.size());
wxchan's avatar
wxchan committed
714
715
  num_iteration_for_pred_ = static_cast<int>(models_.size()) / num_class_;
  num_init_iteration_ = num_iteration_for_pred_;
716
  iter_ = 0;
717
718

  return true;
Guolin Ke's avatar
Guolin Ke committed
719
720
}

wxchan's avatar
wxchan committed
721
std::vector<std::pair<size_t, std::string>> GBDT::FeatureImportance() const {
722

723
  std::vector<size_t> feature_importances(max_feature_idx_ + 1, 0);
724
    for (size_t iter = 0; iter < models_.size(); ++iter) {
725
726
        for (int split_idx = 0; split_idx < models_[iter]->num_leaves() - 1; ++split_idx) {
            ++feature_importances[models_[iter]->split_feature_real(split_idx)];
wxchan's avatar
wxchan committed
727
728
        }
    }
729
730
731
    // store the importance first
    std::vector<std::pair<size_t, std::string>> pairs;
    for (size_t i = 0; i < feature_importances.size(); ++i) {
Guolin Ke's avatar
Guolin Ke committed
732
      if (feature_importances[i] > 0) {
733
        pairs.emplace_back(feature_importances[i], feature_names_[i]);
Guolin Ke's avatar
Guolin Ke committed
734
      }
735
736
737
738
739
    }
    // sort the importance
    std::sort(pairs.begin(), pairs.end(),
      [](const std::pair<size_t, std::string>& lhs,
        const std::pair<size_t, std::string>& rhs) {
740
      return lhs.first > rhs.first;
741
    });
wxchan's avatar
wxchan committed
742
    return pairs;
wxchan's avatar
wxchan committed
743
744
}

745
746
std::vector<double> GBDT::PredictRaw(const double* value) const {
  std::vector<double> ret(num_class_, 0.0f);
wxchan's avatar
wxchan committed
747
  for (int i = 0; i < num_iteration_for_pred_; ++i) {
748
749
750
    for (int j = 0; j < num_class_; ++j) {
      ret[j] += models_[i * num_class_ + j]->Predict(value);
    }
Guolin Ke's avatar
Guolin Ke committed
751
752
753
754
  }
  return ret;
}

755
std::vector<double> GBDT::Predict(const double* value) const {
756
  std::vector<double> ret(num_class_, 0.0f);
wxchan's avatar
wxchan committed
757
  for (int i = 0; i < num_iteration_for_pred_; ++i) {
758
759
    for (int j = 0; j < num_class_; ++j) {
      ret[j] += models_[i * num_class_ + j]->Predict(value);
760
761
    }
  }
762
763
764
765
766
767
  // if need sigmoid transform
  if (sigmoid_ > 0 && num_class_ == 1) {
    ret[0] = 1.0f / (1.0f + std::exp(- 2.0f * sigmoid_ * ret[0]));
  } else if (num_class_ > 1) {
    Common::Softmax(&ret);
  }
768
769
770
  return ret;
}

771
std::vector<int> GBDT::PredictLeafIndex(const double* value) const {
wxchan's avatar
wxchan committed
772
  std::vector<int> ret;
wxchan's avatar
wxchan committed
773
  for (int i = 0; i < num_iteration_for_pred_; ++i) {
774
775
776
    for (int j = 0; j < num_class_; ++j) {
      ret.push_back(models_[i * num_class_ + j]->PredictLeafIndex(value));
    }
wxchan's avatar
wxchan committed
777
778
779
780
  }
  return ret;
}

Guolin Ke's avatar
Guolin Ke committed
781
}  // namespace LightGBM