gbdt.cpp 26.3 KB
Newer Older
Guolin Ke's avatar
Guolin Ke committed
1
2
#include "gbdt.h"

3
#include <LightGBM/utils/openmp_wrapper.h>
4

Guolin Ke's avatar
Guolin Ke committed
5
6
7
8
9
10
11
12
13
14
15
16
#include <LightGBM/utils/common.h>

#include <LightGBM/feature.h>
#include <LightGBM/objective_function.h>
#include <LightGBM/metric.h>

#include <ctime>

#include <sstream>
#include <chrono>
#include <string>
#include <vector>
17
#include <utility>
Guolin Ke's avatar
Guolin Ke committed
18
19
20

namespace LightGBM {

21
GBDT::GBDT()
22
  :iter_(0),
23
24
25
26
27
  train_data_(nullptr),
  object_function_(nullptr),
  early_stopping_round_(0),
  max_feature_idx_(0),
  num_class_(1),
28
  sigmoid_(-1.0f),
29
  num_iteration_for_pred_(0),
30
  shrinkage_rate_(0.1f),
wxchan's avatar
wxchan committed
31
  num_init_iteration_(0) {
32
33
34
35
36
#pragma omp parallel
#pragma omp master
    {
      num_threads_ = omp_get_num_threads();
    }
Guolin Ke's avatar
Guolin Ke committed
37
38
39
}

GBDT::~GBDT() {
Guolin Ke's avatar
Guolin Ke committed
40

Guolin Ke's avatar
Guolin Ke committed
41
42
}

43
44
45
void GBDT::Init(const BoostingConfig* config, const Dataset* train_data, const ObjectiveFunction* object_function,
     const std::vector<const Metric*>& training_metrics) {
  iter_ = 0;
wxchan's avatar
wxchan committed
46
  num_iteration_for_pred_ = 0;
47
  max_feature_idx_ = 0;
wxchan's avatar
wxchan committed
48
49
  num_class_ = config->num_class;
  train_data_ = nullptr;
Guolin Ke's avatar
Guolin Ke committed
50
  gbdt_config_ = nullptr;
51
  tree_learner_ = nullptr;
wxchan's avatar
wxchan committed
52
53
54
55
56
  ResetTrainingData(config, train_data, object_function, training_metrics);
}

void GBDT::ResetTrainingData(const BoostingConfig* config, const Dataset* train_data, const ObjectiveFunction* object_function,
  const std::vector<const Metric*>& training_metrics) {
Guolin Ke's avatar
Guolin Ke committed
57
  auto new_config = std::unique_ptr<BoostingConfig>(new BoostingConfig(*config));
wxchan's avatar
wxchan committed
58
59
60
  if (train_data_ != nullptr && !train_data_->CheckAlign(*train_data)) {
    Log::Fatal("cannot reset training data, since new training data has different bin mappers");
  }
Guolin Ke's avatar
Guolin Ke committed
61
62
63
  early_stopping_round_ = new_config->early_stopping_round;
  shrinkage_rate_ = new_config->learning_rate;

Guolin Ke's avatar
Guolin Ke committed
64
  object_function_ = object_function;
Guolin Ke's avatar
Guolin Ke committed
65

Guolin Ke's avatar
Guolin Ke committed
66
  sigmoid_ = -1.0f;
wxchan's avatar
wxchan committed
67
  if (object_function_ != nullptr
Guolin Ke's avatar
Guolin Ke committed
68
69
    && std::string(object_function_->GetName()) == std::string("binary")) {
    // only binary classification need sigmoid transform
Guolin Ke's avatar
Guolin Ke committed
70
    sigmoid_ = new_config->sigmoid;
71
  }
Guolin Ke's avatar
Guolin Ke committed
72

Guolin Ke's avatar
Guolin Ke committed
73
  if (train_data_ != train_data && train_data != nullptr) {
74
75
    if (tree_learner_ == nullptr) {
      tree_learner_ = std::unique_ptr<TreeLearner>(TreeLearner::CreateTreeLearner(new_config->tree_learner_type, &new_config->tree_config));
Guolin Ke's avatar
Guolin Ke committed
76
77
    }
    // init tree learner
78
    tree_learner_->Init(train_data);
Guolin Ke's avatar
Guolin Ke committed
79

Guolin Ke's avatar
Guolin Ke committed
80
81
82
83
84
85
    // push training metrics
    training_metrics_.clear();
    for (const auto& metric : training_metrics) {
      training_metrics_.push_back(metric);
    }
    training_metrics_.shrink_to_fit();
wxchan's avatar
wxchan committed
86
87
88
89
90
91
92
93
94
95
96
97
98
    // not same training data, need reset score and others
    // create score tracker
    train_score_updater_.reset(new ScoreUpdater(train_data, num_class_));
    // update score
    for (int i = 0; i < iter_; ++i) {
      for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
        auto curr_tree = (i + num_init_iteration_) * num_class_ + curr_class;
        train_score_updater_->AddScore(models_[curr_tree].get(), curr_class);
      }
    }
    num_data_ = train_data->num_data();
    // create buffer for gradients and hessians
    if (object_function_ != nullptr) {
99
100
101
      size_t total_size = static_cast<size_t>(num_data_) * num_class_;
      gradients_.resize(total_size);
      hessians_.resize(total_size);
wxchan's avatar
wxchan committed
102
103
104
105
106
    }
    // get max feature index
    max_feature_idx_ = train_data->num_total_features() - 1;
    // get label index
    label_idx_ = train_data->label_idx();
107
108
    // get feature names
    feature_names_ = train_data->feature_names();
109
110
111
112
113
114
115
116
117
118
    // get feature infos
    feature_infos_.clear();
    for (int i = 0; i < max_feature_idx_ + 1; ++i) {
      int feature_idx = train_data->GetInnerFeatureIndex(i);
      if (feature_idx < 0) { 
        feature_infos_.push_back("trival feature"); 
      } else {
        feature_infos_.push_back(train_data->FeatureAt(feature_idx)->bin_mapper()->bin_info());
      }
    }
Guolin Ke's avatar
Guolin Ke committed
119
120
  }

Guolin Ke's avatar
Guolin Ke committed
121
122
  if ((train_data_ != train_data && train_data != nullptr)
    || (gbdt_config_ != nullptr && gbdt_config_->bagging_fraction != new_config->bagging_fraction)) {
wxchan's avatar
wxchan committed
123
    // if need bagging, create buffer
Guolin Ke's avatar
Guolin Ke committed
124
    if (new_config->bagging_fraction < 1.0 && new_config->bagging_freq > 0) {
125
126
      bag_data_cnt_ =
        static_cast<data_size_t>(new_config->bagging_fraction * num_data_);
127
      bag_data_indices_.resize(num_data_);
128
129
130
131
132
133
      tmp_indices_.resize(num_data_);
      offsets_buf_.resize(num_threads_);
      left_cnts_buf_.resize(num_threads_);
      right_cnts_buf_.resize(num_threads_);
      left_write_pos_buf_.resize(num_threads_);
      right_write_pos_buf_.resize(num_threads_);
Guolin Ke's avatar
Guolin Ke committed
134
135
      double average_bag_rate = new_config->bagging_fraction / new_config->bagging_freq;
      is_use_subset_ = false;
Guolin Ke's avatar
Guolin Ke committed
136
137
138
      if (average_bag_rate < 0.5) {
        tmp_subset_.reset(new Dataset(bag_data_cnt_));
        tmp_subset_->CopyFeatureMapperFrom(train_data, false);
Guolin Ke's avatar
Guolin Ke committed
139
140
141
        is_use_subset_ = true;
        Log::Debug("use subset for bagging");
      }
wxchan's avatar
wxchan committed
142
143
144
    } else {
      bag_data_cnt_ = num_data_;
      bag_data_indices_.clear();
145
      tmp_indices_.clear();
Guolin Ke's avatar
Guolin Ke committed
146
      is_use_subset_ = false;
wxchan's avatar
wxchan committed
147
    }
Guolin Ke's avatar
Guolin Ke committed
148
  }
wxchan's avatar
wxchan committed
149
  train_data_ = train_data;
Guolin Ke's avatar
Guolin Ke committed
150
151
  if (train_data_ != nullptr) {
    // reset config for tree learner
152
    tree_learner_->ResetConfig(&new_config->tree_config);
Guolin Ke's avatar
Guolin Ke committed
153
  }
Guolin Ke's avatar
Guolin Ke committed
154
  gbdt_config_.reset(new_config.release());
Guolin Ke's avatar
Guolin Ke committed
155
156
}

wxchan's avatar
wxchan committed
157
void GBDT::AddValidDataset(const Dataset* valid_data,
Guolin Ke's avatar
Guolin Ke committed
158
  const std::vector<const Metric*>& valid_metrics) {
wxchan's avatar
wxchan committed
159
160
  if (!train_data_->CheckAlign(*valid_data)) {
    Log::Fatal("cannot add validation data, since it has different bin mappers with training data");
161
  }
Guolin Ke's avatar
Guolin Ke committed
162
  // for a validation dataset, we need its score and metric
Guolin Ke's avatar
Guolin Ke committed
163
  auto new_score_updater = std::unique_ptr<ScoreUpdater>(new ScoreUpdater(valid_data, num_class_));
wxchan's avatar
wxchan committed
164
165
166
167
168
169
170
  // update score
  for (int i = 0; i < iter_; ++i) {
    for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
      auto curr_tree = (i + num_init_iteration_) * num_class_ + curr_class;
      new_score_updater->AddScore(models_[curr_tree].get(), curr_class);
    }
  }
Guolin Ke's avatar
Guolin Ke committed
171
  valid_score_updater_.push_back(std::move(new_score_updater));
Guolin Ke's avatar
Guolin Ke committed
172
  valid_metrics_.emplace_back();
173
174
175
  if (early_stopping_round_ > 0) {
    best_iter_.emplace_back();
    best_score_.emplace_back();
Guolin Ke's avatar
Guolin Ke committed
176
    best_msg_.emplace_back();
177
  }
Guolin Ke's avatar
Guolin Ke committed
178
179
  for (const auto& metric : valid_metrics) {
    valid_metrics_.back().push_back(metric);
180
181
182
    if (early_stopping_round_ > 0) {
      best_iter_.back().push_back(0);
      best_score_.back().push_back(kMinScore);
Guolin Ke's avatar
Guolin Ke committed
183
      best_msg_.back().emplace_back();
184
    }
Guolin Ke's avatar
Guolin Ke committed
185
  }
Guolin Ke's avatar
Guolin Ke committed
186
  valid_metrics_.back().shrink_to_fit();
Guolin Ke's avatar
Guolin Ke committed
187
188
}

Guolin Ke's avatar
Guolin Ke committed
189
data_size_t GBDT::BaggingHelper(Random& cur_rand, data_size_t start, data_size_t cnt, data_size_t* buffer){
190
191
192
  if (cnt <= 0) {
    return 0;
  }
193
194
195
196
  data_size_t bag_data_cnt =
    static_cast<data_size_t>(gbdt_config_->bagging_fraction * cnt);
  data_size_t cur_left_cnt = 0;
  data_size_t cur_right_cnt = 0;
Guolin Ke's avatar
Guolin Ke committed
197
  auto right_buffer = buffer + bag_data_cnt;
198
199
  // random bagging, minimal unit is one record
  for (data_size_t i = 0; i < cnt; ++i) {
Guolin Ke's avatar
Guolin Ke committed
200
201
202
    float prob =
      (bag_data_cnt - cur_left_cnt) / static_cast<float>(cnt - i);
    if (cur_rand.NextFloat() < prob) {
203
204
      buffer[cur_left_cnt++] = start + i;
    } else {
Guolin Ke's avatar
Guolin Ke committed
205
      right_buffer[cur_right_cnt++] = start + i;
206
207
    }
  }
Guolin Ke's avatar
Guolin Ke committed
208
  CHECK(buffer[bag_data_cnt - 1] > buffer[bag_data_cnt]);
209
210
211
  CHECK(cur_left_cnt == bag_data_cnt);
  return cur_left_cnt;
}
Guolin Ke's avatar
Guolin Ke committed
212

Guolin Ke's avatar
Guolin Ke committed
213
214


215
void GBDT::Bagging(int iter) {
Guolin Ke's avatar
Guolin Ke committed
216
  // if need bagging
217
  if (bag_data_cnt_ < num_data_ && iter % gbdt_config_->bagging_freq == 0) {
Guolin Ke's avatar
Guolin Ke committed
218
    const data_size_t min_inner_size = 1000;
219
220
221
    data_size_t inner_size = (num_data_ + num_threads_ - 1) / num_threads_;
    if (inner_size < min_inner_size) { inner_size = min_inner_size; }

Guolin Ke's avatar
Guolin Ke committed
222
#pragma omp parallel for schedule(static,1)
223
224
225
226
227
228
229
    for (int i = 0; i < num_threads_; ++i) {
      left_cnts_buf_[i] = 0;
      right_cnts_buf_[i] = 0;
      data_size_t cur_start = i * inner_size;
      if (cur_start > num_data_) { continue; }
      data_size_t cur_cnt = inner_size;
      if (cur_start + cur_cnt > num_data_) { cur_cnt = num_data_ - cur_start; }
Guolin Ke's avatar
Guolin Ke committed
230
231
      Random cur_rand(gbdt_config_->bagging_seed + iter * num_threads_ + i);
      data_size_t cur_left_count = BaggingHelper(cur_rand, cur_start, cur_cnt, tmp_indices_.data() + cur_start);
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
      offsets_buf_[i] = cur_start;
      left_cnts_buf_[i] = cur_left_count;
      right_cnts_buf_[i] = cur_cnt - cur_left_count;
    }
    data_size_t left_cnt = 0;
    left_write_pos_buf_[0] = 0;
    right_write_pos_buf_[0] = 0;
    for (int i = 1; i < num_threads_; ++i) {
      left_write_pos_buf_[i] = left_write_pos_buf_[i - 1] + left_cnts_buf_[i - 1];
      right_write_pos_buf_[i] = right_write_pos_buf_[i - 1] + right_cnts_buf_[i - 1];
    }
    left_cnt = left_write_pos_buf_[num_threads_ - 1] + left_cnts_buf_[num_threads_ - 1];

#pragma omp parallel for schedule(static, 1)
    for (int i = 0; i < num_threads_; ++i) {
      if (left_cnts_buf_[i] > 0) {
        std::memcpy(bag_data_indices_.data() + left_write_pos_buf_[i],
          tmp_indices_.data() + offsets_buf_[i], left_cnts_buf_[i] * sizeof(data_size_t));
Guolin Ke's avatar
Guolin Ke committed
250
      }
251
252
253
      if (right_cnts_buf_[i] > 0) {
        std::memcpy(bag_data_indices_.data() + left_cnt + right_write_pos_buf_[i],
          tmp_indices_.data() + offsets_buf_[i] + left_cnts_buf_[i], right_cnts_buf_[i] * sizeof(data_size_t));
Guolin Ke's avatar
Guolin Ke committed
254
255
      }
    }
Guolin Ke's avatar
Guolin Ke committed
256
257
    bag_data_cnt_ = left_cnt;
    CHECK(bag_data_indices_[bag_data_cnt_ - 1] > bag_data_indices_[bag_data_cnt_]);
Guolin Ke's avatar
Guolin Ke committed
258
    Log::Debug("Re-bagging, using %d data to train", bag_data_cnt_);
Guolin Ke's avatar
Guolin Ke committed
259
    // set bagging data to tree learner
Guolin Ke's avatar
Guolin Ke committed
260
261
262
263
    if (!is_use_subset_) {
      tree_learner_->SetBaggingData(bag_data_indices_.data(), bag_data_cnt_);
    } else {
      // get subset
Guolin Ke's avatar
Guolin Ke committed
264
265
      tmp_subset_->ReSize(bag_data_cnt_);
      tmp_subset_->CopySubset(train_data_, bag_data_indices_.data(), bag_data_cnt_, false);
Guolin Ke's avatar
Guolin Ke committed
266
267
      tree_learner_->ResetTrainingData(tmp_subset_.get());
    }
Guolin Ke's avatar
Guolin Ke committed
268
269
270
  }
}

271
void GBDT::UpdateScoreOutOfBag(const Tree* tree, const int curr_class) {
Hui Xue's avatar
Hui Xue committed
272
  // we need to predict out-of-bag socres of data for boosting
Guolin Ke's avatar
Guolin Ke committed
273
  if (num_data_ - bag_data_cnt_ > 0 && !is_use_subset_) {
274
    train_score_updater_->AddScore(tree, bag_data_indices_.data() + bag_data_cnt_, num_data_ - bag_data_cnt_, curr_class);
Guolin Ke's avatar
Guolin Ke committed
275
276
277
  }
}

278
bool GBDT::TrainOneIter(const score_t* gradient, const score_t* hessian, bool is_eval) {
Guolin Ke's avatar
Guolin Ke committed
279
280
281
282
283
284
  // boosting first
  if (gradient == nullptr || hessian == nullptr) {
    Boosting();
    gradient = gradients_.data();
    hessian = hessians_.data();
  }
285
286
  // bagging logic
  Bagging(iter_);
Guolin Ke's avatar
Guolin Ke committed
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
  if (is_use_subset_ && bag_data_cnt_ < num_data_) {
    if (gradients_.empty()) {
      size_t total_size = static_cast<size_t>(num_data_) * num_class_;
      gradients_.resize(total_size);
      hessians_.resize(total_size);
    }    
    // get sub gradients
    for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
      auto bias = curr_class * num_data_;
      for (int i = 0; i < bag_data_cnt_; ++i) {
        gradients_[bias + i] = gradient[bias + bag_data_indices_[i]];
        hessians_[bias + i] = hessian[bias + bag_data_indices_[i]];
      }
    }
    gradient = gradients_.data();
    hessian = hessians_.data();
  }
Guolin Ke's avatar
Guolin Ke committed
304
305
  for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
    // train a new tree
306
    std::unique_ptr<Tree> new_tree(tree_learner_->Train(gradient + curr_class * num_data_, hessian + curr_class * num_data_));
Guolin Ke's avatar
Guolin Ke committed
307
308
309
310
    // if cannot learn a new tree, then stop
    if (new_tree->num_leaves() <= 1) {
      Log::Info("Stopped training because there are no more leafs that meet the split requirements.");
      return true;
311
    }
312

Guolin Ke's avatar
Guolin Ke committed
313
314
315
316
317
    // shrinkage by learning rate
    new_tree->Shrinkage(shrinkage_rate_);
    // update score
    UpdateScore(new_tree.get(), curr_class);
    UpdateScoreOutOfBag(new_tree.get(), curr_class);
318

Guolin Ke's avatar
Guolin Ke committed
319
320
321
322
323
324
325
326
327
    // add model
    models_.push_back(std::move(new_tree));
  }
  ++iter_;
  if (is_eval) {
    return EvalAndCheckEarlyStopping();
  } else {
    return false;
  }
328

Guolin Ke's avatar
Guolin Ke committed
329
}
330

wxchan's avatar
wxchan committed
331
void GBDT::RollbackOneIter() {
332
  if (iter_ <= 0) { return; }
wxchan's avatar
wxchan committed
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
  int cur_iter = iter_ + num_init_iteration_ - 1;
  // reset score
  for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
    auto curr_tree = cur_iter * num_class_ + curr_class;
    models_[curr_tree]->Shrinkage(-1.0);
    train_score_updater_->AddScore(models_[curr_tree].get(), curr_class);
    for (auto& score_updater : valid_score_updater_) {
      score_updater->AddScore(models_[curr_tree].get(), curr_class);
    }
  }
  // remove model
  for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
    models_.pop_back();
  }
  --iter_;
}

Guolin Ke's avatar
Guolin Ke committed
350
bool GBDT::EvalAndCheckEarlyStopping() {
351
352
  bool is_met_early_stopping = false;
  // print message for metric
Guolin Ke's avatar
Guolin Ke committed
353
354
  auto best_msg = OutputMetric(iter_);
  is_met_early_stopping = !best_msg.empty();
355
356
357
  if (is_met_early_stopping) {
    Log::Info("Early stopping at iteration %d, the best iteration round is %d",
      iter_, iter_ - early_stopping_round_);
Guolin Ke's avatar
Guolin Ke committed
358
    Log::Info("Output of best iteration round:\n%s", best_msg.c_str());
359
    // pop last early_stopping_round_ models
360
    for (int i = 0; i < early_stopping_round_ * num_class_; ++i) {
361
362
363
364
      models_.pop_back();
    }
  }
  return is_met_early_stopping;
Guolin Ke's avatar
Guolin Ke committed
365
366
}

367
void GBDT::UpdateScore(const Tree* tree, const int curr_class) {
Guolin Ke's avatar
Guolin Ke committed
368
  // update training score
Guolin Ke's avatar
Guolin Ke committed
369
370
371
372
373
  if (!is_use_subset_) {
    train_score_updater_->AddScore(tree_learner_.get(), curr_class);
  } else {
    train_score_updater_->AddScore(tree, curr_class);
  }
Guolin Ke's avatar
Guolin Ke committed
374
  // update validation score
Guolin Ke's avatar
Guolin Ke committed
375
376
  for (auto& score_updater : valid_score_updater_) {
    score_updater->AddScore(tree, curr_class);
Guolin Ke's avatar
Guolin Ke committed
377
378
379
  }
}

Guolin Ke's avatar
Guolin Ke committed
380
381
382
383
std::string GBDT::OutputMetric(int iter) {
  bool need_output = (iter % gbdt_config_->output_freq) == 0;
  std::string ret = "";
  std::stringstream msg_buf;
384
  std::vector<std::pair<size_t, size_t>> meet_early_stopping_pairs;
Guolin Ke's avatar
Guolin Ke committed
385
  // print training metric
Guolin Ke's avatar
Guolin Ke committed
386
  if (need_output) {
387
388
389
    for (auto& sub_metric : training_metrics_) {
      auto name = sub_metric->GetName();
      auto scores = sub_metric->Eval(train_score_updater_->score());
Guolin Ke's avatar
Guolin Ke committed
390
      for (size_t k = 0; k < name.size(); ++k) {
Guolin Ke's avatar
Guolin Ke committed
391
392
393
394
395
396
397
398
        std::stringstream tmp_buf;
        tmp_buf << "Iteration:" << iter
          << ", training " << name[k]
          << " : " << scores[k];
        Log::Info(tmp_buf.str().c_str());
        if (early_stopping_round_ > 0) {
          msg_buf << tmp_buf.str() << std::endl;
        }
399
      }
400
    }
Guolin Ke's avatar
Guolin Ke committed
401
402
  }
  // print validation metric
Guolin Ke's avatar
Guolin Ke committed
403
  if (need_output || early_stopping_round_ > 0) {
404
405
406
    for (size_t i = 0; i < valid_metrics_.size(); ++i) {
      for (size_t j = 0; j < valid_metrics_[i].size(); ++j) {
        auto test_scores = valid_metrics_[i][j]->Eval(valid_score_updater_[i]->score());
Guolin Ke's avatar
Guolin Ke committed
407
408
409
410
411
412
413
414
415
416
417
        auto name = valid_metrics_[i][j]->GetName();
        for (size_t k = 0; k < name.size(); ++k) {
          std::stringstream tmp_buf;
          tmp_buf << "Iteration:" << iter
            << ", valid_" << i + 1 << " " << name[k]
            << " : " << test_scores[k];
          if (need_output) {
            Log::Info(tmp_buf.str().c_str());
          }
          if (early_stopping_round_ > 0) {
            msg_buf << tmp_buf.str() << std::endl;
418
          }
wxchan's avatar
wxchan committed
419
        }
Guolin Ke's avatar
Guolin Ke committed
420
        if (ret.empty() && early_stopping_round_ > 0) {
421
422
423
          auto cur_score = valid_metrics_[i][j]->factor_to_bigger_better() * test_scores.back();
          if (cur_score > best_score_[i][j]) {
            best_score_[i][j] = cur_score;
424
            best_iter_[i][j] = iter;
Guolin Ke's avatar
Guolin Ke committed
425
            meet_early_stopping_pairs.emplace_back(i, j);
426
          } else {
Guolin Ke's avatar
Guolin Ke committed
427
            if (iter - best_iter_[i][j] >= early_stopping_round_) { ret = best_msg_[i][j]; }
428
          }
wxchan's avatar
wxchan committed
429
430
        }
      }
Guolin Ke's avatar
Guolin Ke committed
431
432
    }
  }
Guolin Ke's avatar
Guolin Ke committed
433
434
435
  for (auto& pair : meet_early_stopping_pairs) {
    best_msg_[pair.first][pair.second] = msg_buf.str();
  }
wxchan's avatar
wxchan committed
436
  return ret;
Guolin Ke's avatar
Guolin Ke committed
437
438
}

439
/*! \brief Get eval result */
440
std::vector<double> GBDT::GetEvalAt(int data_idx) const {
Guolin Ke's avatar
Guolin Ke committed
441
  CHECK(data_idx >= 0 && data_idx <= static_cast<int>(valid_score_updater_.size()));
442
443
  std::vector<double> ret;
  if (data_idx == 0) {
444
445
    for (auto& sub_metric : training_metrics_) {
      auto scores = sub_metric->Eval(train_score_updater_->score());
446
447
448
      for (auto score : scores) {
        ret.push_back(score);
      }
449
450
    }
  }
451
452
453
454
455
456
457
  else {
    auto used_idx = data_idx - 1;
    for (size_t j = 0; j < valid_metrics_[used_idx].size(); ++j) {
      auto test_scores = valid_metrics_[used_idx][j]->Eval(valid_score_updater_[used_idx]->score());
      for (auto score : test_scores) {
        ret.push_back(score);
      }
458
459
460
461
462
    }
  }
  return ret;
}

Guolin Ke's avatar
Guolin Ke committed
463
/*! \brief Get training scores result */
464
const double* GBDT::GetTrainingScore(int64_t* out_len) {
465
  *out_len = static_cast<int64_t>(train_score_updater_->num_data()) * num_class_;
Guolin Ke's avatar
Guolin Ke committed
466
  return train_score_updater_->score();
467
468
}

Guolin Ke's avatar
Guolin Ke committed
469
470
void GBDT::GetPredictAt(int data_idx, double* out_result, int64_t* out_len) {
  CHECK(data_idx >= 0 && data_idx <= static_cast<int>(valid_score_updater_.size()));
Guolin Ke's avatar
Guolin Ke committed
471

472
  const double* raw_scores = nullptr;
Guolin Ke's avatar
Guolin Ke committed
473
474
  data_size_t num_data = 0;
  if (data_idx == 0) {
wxchan's avatar
wxchan committed
475
    raw_scores = GetTrainingScore(out_len);
Guolin Ke's avatar
Guolin Ke committed
476
477
478
479
480
    num_data = train_score_updater_->num_data();
  } else {
    auto used_idx = data_idx - 1;
    raw_scores = valid_score_updater_[used_idx]->score();
    num_data = valid_score_updater_[used_idx]->num_data();
481
    *out_len = static_cast<int64_t>(num_data) * num_class_;
Guolin Ke's avatar
Guolin Ke committed
482
483
  }
  if (num_class_ > 1) {
wxchan's avatar
wxchan committed
484
#pragma omp parallel for schedule(static)
Guolin Ke's avatar
Guolin Ke committed
485
    for (data_size_t i = 0; i < num_data; ++i) {
486
      std::vector<double> tmp_result(num_class_);
Guolin Ke's avatar
Guolin Ke committed
487
      for (int j = 0; j < num_class_; ++j) {
488
        tmp_result[j] = raw_scores[j * num_data + i];
Guolin Ke's avatar
Guolin Ke committed
489
490
491
      }
      Common::Softmax(&tmp_result);
      for (int j = 0; j < num_class_; ++j) {
Guolin Ke's avatar
Guolin Ke committed
492
        out_result[j * num_data + i] = static_cast<double>(tmp_result[j]);
Guolin Ke's avatar
Guolin Ke committed
493
494
      }
    }
Guolin Ke's avatar
Guolin Ke committed
495
  } else if(sigmoid_ > 0.0f){
wxchan's avatar
wxchan committed
496
#pragma omp parallel for schedule(static)
Guolin Ke's avatar
Guolin Ke committed
497
    for (data_size_t i = 0; i < num_data; ++i) {
498
      out_result[i] = static_cast<double>(1.0f / (1.0f + std::exp(- sigmoid_ * raw_scores[i])));
Guolin Ke's avatar
Guolin Ke committed
499
500
    }
  } else {
wxchan's avatar
wxchan committed
501
#pragma omp parallel for schedule(static)
Guolin Ke's avatar
Guolin Ke committed
502
    for (data_size_t i = 0; i < num_data; ++i) {
Guolin Ke's avatar
Guolin Ke committed
503
      out_result[i] = static_cast<double>(raw_scores[i]);
Guolin Ke's avatar
Guolin Ke committed
504
505
506
507
508
    }
  }

}

Guolin Ke's avatar
Guolin Ke committed
509
void GBDT::Boosting() {
510
511
512
  if (object_function_ == nullptr) {
    Log::Fatal("No object function provided");
  }
Hui Xue's avatar
Hui Xue committed
513
  // objective function will calculate gradients and hessians
514
  int64_t num_score = 0;
Guolin Ke's avatar
Guolin Ke committed
515
  object_function_->
Guolin Ke's avatar
Guolin Ke committed
516
    GetGradients(GetTrainingScore(&num_score), gradients_.data(), hessians_.data());
Guolin Ke's avatar
Guolin Ke committed
517
518
}

519
std::string GBDT::DumpModel(int num_iteration) const {
Guolin Ke's avatar
Guolin Ke committed
520
  std::stringstream str_buf;
wxchan's avatar
wxchan committed
521

Guolin Ke's avatar
Guolin Ke committed
522
  str_buf << "{";
Guolin Ke's avatar
Guolin Ke committed
523
  str_buf << "\"name\":\"" << SubModelName() << "\"," << std::endl;
Guolin Ke's avatar
Guolin Ke committed
524
525
526
527
  str_buf << "\"num_class\":" << num_class_ << "," << std::endl;
  str_buf << "\"label_index\":" << label_idx_ << "," << std::endl;
  str_buf << "\"max_feature_idx\":" << max_feature_idx_ << "," << std::endl;
  str_buf << "\"sigmoid\":" << sigmoid_ << "," << std::endl;
wxchan's avatar
wxchan committed
528

Guolin Ke's avatar
Guolin Ke committed
529
  str_buf << "\"feature_names\":[\"" 
530
     << Common::Join(feature_names_, "\",\"") << "\"]," 
Guolin Ke's avatar
Guolin Ke committed
531
532
     << std::endl;

Guolin Ke's avatar
Guolin Ke committed
533
  str_buf << "\"tree_info\":[";
534
535
536
537
538
  int num_used_model = static_cast<int>(models_.size());
  if (num_iteration > 0) {
    num_used_model = std::min(num_iteration * num_class_, num_used_model);
  } 
  for (int i = 0; i < num_used_model; ++i) {
wxchan's avatar
wxchan committed
539
    if (i > 0) {
Guolin Ke's avatar
Guolin Ke committed
540
      str_buf << ",";
wxchan's avatar
wxchan committed
541
    }
Guolin Ke's avatar
Guolin Ke committed
542
543
544
545
    str_buf << "{";
    str_buf << "\"tree_index\":" << i << ",";
    str_buf << models_[i]->ToJSON();
    str_buf << "}";
wxchan's avatar
wxchan committed
546
  }
Guolin Ke's avatar
Guolin Ke committed
547
  str_buf << "]" << std::endl;
wxchan's avatar
wxchan committed
548

Guolin Ke's avatar
Guolin Ke committed
549
  str_buf << "}" << std::endl;
wxchan's avatar
wxchan committed
550

Guolin Ke's avatar
Guolin Ke committed
551
  return str_buf.str();
wxchan's avatar
wxchan committed
552
553
}

554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
std::string GBDT::SaveModelToString(int num_iterations) const {
    std::stringstream ss;

    // output model type
    ss << SubModelName() << std::endl;
    // output number of class
    ss << "num_class=" << num_class_ << std::endl;
    // output label index
    ss << "label_index=" << label_idx_ << std::endl;
    // output max_feature_idx
    ss << "max_feature_idx=" << max_feature_idx_ << std::endl;
    // output objective name
    if (object_function_ != nullptr) {
      ss << "objective=" << object_function_->GetName() << std::endl;
    }
    // output sigmoid parameter
    ss << "sigmoid=" << sigmoid_ << std::endl;

    ss << "feature_names=" << Common::Join(feature_names_, " ") << std::endl;

    ss << std::endl;
    int num_used_model = static_cast<int>(models_.size());
    if (num_iterations > 0) {
      num_used_model = std::min(num_iterations * num_class_, num_used_model);
    }
    // output tree models
    for (int i = 0; i < num_used_model; ++i) {
      ss << "Tree=" << i << std::endl;
      ss << models_[i]->ToString() << std::endl;
    }

    std::vector<std::pair<size_t, std::string>> pairs = FeatureImportance();
    ss << std::endl << "feature importances:" << std::endl;
    for (size_t i = 0; i < pairs.size(); ++i) {
      ss << pairs[i].second << "=" << std::to_string(pairs[i].first) << std::endl;
    }

    ss << std::endl << "feature information:" << std::endl;
Tsukasa OMOTO's avatar
Tsukasa OMOTO committed
592
    for (int i = 0; i < max_feature_idx_ + 1; ++i) {
593
594
595
596
597
598
      ss << feature_names_[i] << "=" << feature_infos_[i] << std::endl;
    }

    return ss.str();
}

599
bool GBDT::SaveModelToFile(int num_iteration, const char* filename) const {
wxchan's avatar
wxchan committed
600
601
602
  /*! \brief File to write models */
  std::ofstream output_file;
  output_file.open(filename);
603

604
  output_file << SaveModelToString(num_iteration);
605

wxchan's avatar
wxchan committed
606
  output_file.close();
607
608

  return (bool)output_file;
Guolin Ke's avatar
Guolin Ke committed
609
610
}

611
bool GBDT::LoadModelFromString(const std::string& model_str) {
Guolin Ke's avatar
Guolin Ke committed
612
613
614
  // use serialized string to restore this object
  models_.clear();
  std::vector<std::string> lines = Common::Split(model_str.c_str(), '\n');
615
616

  // get number of classes
617
618
619
620
  auto line = Common::FindFromLines(lines, "num_class=");
  if (line.size() > 0) {
    Common::Atoi(Common::Split(line.c_str(), '=')[1].c_str(), &num_class_);
  } else {
621
    Log::Fatal("Model file doesn't specify the number of classes");
622
    return false;
623
  }
Guolin Ke's avatar
Guolin Ke committed
624
  // get index of label
625
626
627
628
  line = Common::FindFromLines(lines, "label_index=");
  if (line.size() > 0) {
    Common::Atoi(Common::Split(line.c_str(), '=')[1].c_str(), &label_idx_);
  } else {
629
    Log::Fatal("Model file doesn't specify the label index");
630
    return false;
Guolin Ke's avatar
Guolin Ke committed
631
  }
Guolin Ke's avatar
Guolin Ke committed
632
  // get max_feature_idx first
633
634
635
636
  line = Common::FindFromLines(lines, "max_feature_idx=");
  if (line.size() > 0) {
    Common::Atoi(Common::Split(line.c_str(), '=')[1].c_str(), &max_feature_idx_);
  } else {
637
    Log::Fatal("Model file doesn't specify max_feature_idx");
638
    return false;
Guolin Ke's avatar
Guolin Ke committed
639
640
  }
  // get sigmoid parameter
641
642
643
644
  line = Common::FindFromLines(lines, "sigmoid=");
  if (line.size() > 0) {
    Common::Atof(Common::Split(line.c_str(), '=')[1].c_str(), &sigmoid_);
  } else {
645
    sigmoid_ = -1.0f;
Guolin Ke's avatar
Guolin Ke committed
646
  }
Guolin Ke's avatar
Guolin Ke committed
647
648
649
  // get feature names
  line = Common::FindFromLines(lines, "feature_names=");
  if (line.size() > 0) {
Guolin Ke's avatar
Guolin Ke committed
650
    feature_names_ = Common::Split(line.substr(std::strlen("feature_names=")).c_str(), " ");
Guolin Ke's avatar
Guolin Ke committed
651
652
    if (feature_names_.size() != static_cast<size_t>(max_feature_idx_ + 1)) {
      Log::Fatal("Wrong size of feature_names");
653
      return false;
Guolin Ke's avatar
Guolin Ke committed
654
655
656
    }
  } else {
    Log::Fatal("Model file doesn't contain feature names");
657
    return false;
Guolin Ke's avatar
Guolin Ke committed
658
659
  }

660
  // returns offset, or lines.size() if not found.
Guolin Ke's avatar
Guolin Ke committed
661
  auto find_string_lineno = [&lines](const std::string &str, size_t start_line)
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
  {
    size_t i = start_line;
    size_t featinfo_find_pos = std::string::npos;
    while (i < lines.size()) {
      featinfo_find_pos = lines[i].find(str);
      if (featinfo_find_pos != std::string::npos)
        break;
      ++i;
    }

    return i;
  };

  // load feature information
  {
Guolin Ke's avatar
Guolin Ke committed
677
    size_t finfo_line_idx = find_string_lineno("feature information:", 0);
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699

    if (finfo_line_idx >= lines.size()) {
      Log::Fatal("Model file doesn't contain feature information");
      return false;
    }

    feature_infos_.resize(max_feature_idx_ + 1);

    // search for each feature name
    for (int i=0; i < max_feature_idx_ + 1; i++) {
      const auto feat_name = feature_names_[i];
      size_t line_idx = find_string_lineno(feat_name + "=", finfo_line_idx + 1);
      if (line_idx >= lines.size()) {
        Log::Fatal(("Model file doesn't contain feature information for feature " + feat_name).c_str());
        return false;
      }

      const auto this_line = lines[line_idx];
      feature_infos_[i] = this_line.substr((feat_name + "=").size());
    }
  }

Guolin Ke's avatar
Guolin Ke committed
700
  // get tree models
701
  size_t i = 0;
Guolin Ke's avatar
Guolin Ke committed
702
703
704
705
706
707
708
  while (i < lines.size()) {
    size_t find_pos = lines[i].find("Tree=");
    if (find_pos != std::string::npos) {
      ++i;
      int start = static_cast<int>(i);
      while (i < lines.size() && lines[i].find("Tree=") == std::string::npos) { ++i; }
      int end = static_cast<int>(i);
Guolin Ke's avatar
Guolin Ke committed
709
      std::string tree_str = Common::Join<std::string>(lines, start, end, "\n");
Guolin Ke's avatar
Guolin Ke committed
710
711
      auto new_tree = std::unique_ptr<Tree>(new Tree(tree_str));
      models_.push_back(std::move(new_tree));
Guolin Ke's avatar
Guolin Ke committed
712
713
714
715
    } else {
      ++i;
    }
  }
716
  Log::Info("Finished loading %d models", models_.size());
wxchan's avatar
wxchan committed
717
718
  num_iteration_for_pred_ = static_cast<int>(models_.size()) / num_class_;
  num_init_iteration_ = num_iteration_for_pred_;
719
  iter_ = 0;
720
721

  return true;
Guolin Ke's avatar
Guolin Ke committed
722
723
}

wxchan's avatar
wxchan committed
724
std::vector<std::pair<size_t, std::string>> GBDT::FeatureImportance() const {
725

726
  std::vector<size_t> feature_importances(max_feature_idx_ + 1, 0);
727
    for (size_t iter = 0; iter < models_.size(); ++iter) {
728
729
        for (int split_idx = 0; split_idx < models_[iter]->num_leaves() - 1; ++split_idx) {
            ++feature_importances[models_[iter]->split_feature_real(split_idx)];
wxchan's avatar
wxchan committed
730
731
        }
    }
732
733
734
    // store the importance first
    std::vector<std::pair<size_t, std::string>> pairs;
    for (size_t i = 0; i < feature_importances.size(); ++i) {
Guolin Ke's avatar
Guolin Ke committed
735
      if (feature_importances[i] > 0) {
736
        pairs.emplace_back(feature_importances[i], feature_names_[i]);
Guolin Ke's avatar
Guolin Ke committed
737
      }
738
739
740
741
742
    }
    // sort the importance
    std::sort(pairs.begin(), pairs.end(),
      [](const std::pair<size_t, std::string>& lhs,
        const std::pair<size_t, std::string>& rhs) {
743
      return lhs.first > rhs.first;
744
    });
wxchan's avatar
wxchan committed
745
    return pairs;
wxchan's avatar
wxchan committed
746
747
}

748
749
std::vector<double> GBDT::PredictRaw(const double* value) const {
  std::vector<double> ret(num_class_, 0.0f);
wxchan's avatar
wxchan committed
750
  for (int i = 0; i < num_iteration_for_pred_; ++i) {
751
752
753
    for (int j = 0; j < num_class_; ++j) {
      ret[j] += models_[i * num_class_ + j]->Predict(value);
    }
Guolin Ke's avatar
Guolin Ke committed
754
755
756
757
  }
  return ret;
}

758
std::vector<double> GBDT::Predict(const double* value) const {
759
  std::vector<double> ret(num_class_, 0.0f);
wxchan's avatar
wxchan committed
760
  for (int i = 0; i < num_iteration_for_pred_; ++i) {
761
762
    for (int j = 0; j < num_class_; ++j) {
      ret[j] += models_[i * num_class_ + j]->Predict(value);
763
764
    }
  }
765
766
  // if need sigmoid transform
  if (sigmoid_ > 0 && num_class_ == 1) {
767
    ret[0] = 1.0f / (1.0f + std::exp(-sigmoid_ * ret[0]));
768
769
770
  } else if (num_class_ > 1) {
    Common::Softmax(&ret);
  }
771
772
773
  return ret;
}

774
std::vector<int> GBDT::PredictLeafIndex(const double* value) const {
wxchan's avatar
wxchan committed
775
  std::vector<int> ret;
wxchan's avatar
wxchan committed
776
  for (int i = 0; i < num_iteration_for_pred_; ++i) {
777
778
779
    for (int j = 0; j < num_class_; ++j) {
      ret.push_back(models_[i * num_class_ + j]->PredictLeafIndex(value));
    }
wxchan's avatar
wxchan committed
780
781
782
783
  }
  return ret;
}

Guolin Ke's avatar
Guolin Ke committed
784
}  // namespace LightGBM