gbdt.cpp 20.4 KB
Newer Older
Guolin Ke's avatar
Guolin Ke committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
#include "gbdt.h"

#include <LightGBM/utils/common.h>

#include <LightGBM/feature.h>
#include <LightGBM/objective_function.h>
#include <LightGBM/metric.h>

#include <ctime>

#include <sstream>
#include <chrono>
#include <string>
#include <vector>
15
#include <utility>
Guolin Ke's avatar
Guolin Ke committed
16
17
18

namespace LightGBM {

19
20
21
22
/*!
 * \brief Construct an empty booster.
 *
 * Sets saved_model_size_ to -1 (SaveModelToFile treats a negative value as
 * "model file not opened yet") and zeroes the prediction/initial iteration
 * counters. All other state is set up by Init().
 */
GBDT::GBDT()
  : saved_model_size_(-1),
    num_iteration_for_pred_(0),
    num_init_iteration_(0) {
}

/*! \brief Destructor; the body is intentionally empty. */
GBDT::~GBDT() {
}

30
31
void GBDT::Init(const BoostingConfig* config, const Dataset* train_data, const ObjectiveFunction* object_function,
     const std::vector<const Metric*>& training_metrics) {
Guolin Ke's avatar
Guolin Ke committed
32
  gbdt_config_ = config;
33
  iter_ = 0;
34
  saved_model_size_ = -1;
Guolin Ke's avatar
Guolin Ke committed
35
  num_iteration_for_pred_ = 0;
36
37
  max_feature_idx_ = 0;
  early_stopping_round_ = gbdt_config_->early_stopping_round;
Guolin Ke's avatar
Guolin Ke committed
38
  shrinkage_rate_ = gbdt_config_->learning_rate;
39
  num_class_ = config->num_class;
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
  train_data_ = nullptr;
  ResetTrainingData(train_data, object_function, training_metrics);
  // initialize random generator
  random_ = Random(gbdt_config_->bagging_seed);

}

void GBDT::ResetConfig(const BoostingConfig* config) {
  gbdt_config_ = config;
  early_stopping_round_ = gbdt_config_->early_stopping_round;
  shrinkage_rate_ = gbdt_config_->learning_rate;
  // create tree learner
  tree_learner_.clear();
  for (int i = 0; i < num_class_; ++i) {
    auto new_tree_learner = std::unique_ptr<TreeLearner>(TreeLearner::CreateTreeLearner(gbdt_config_->tree_learner_type, gbdt_config_->tree_config));
    new_tree_learner->Init(train_data_);
    // init tree learner
    tree_learner_.push_back(std::move(new_tree_learner));
  }
  tree_learner_.shrink_to_fit();
  // if need bagging, create buffer
  if (gbdt_config_->bagging_fraction < 1.0 && gbdt_config_->bagging_freq > 0) {
    out_of_bag_data_indices_ = std::vector<data_size_t>(num_data_);
    bag_data_indices_ = std::vector<data_size_t>(num_data_);
  } else {
    out_of_bag_data_cnt_ = 0;
    out_of_bag_data_indices_.clear();
    bag_data_cnt_ = num_data_;
    bag_data_indices_.clear();
  }
  // initialize random generator
  random_ = Random(gbdt_config_->bagging_seed);
}

void GBDT::ResetTrainingData(const Dataset* train_data, const ObjectiveFunction* object_function, const std::vector<const Metric*>& training_metrics) {
  if (train_data_ != nullptr && !train_data_->CheckAlign(*train_data)) {
    Log::Fatal("cannot reset training data, since new training data has different bin mappers");
  }
  train_data_ = train_data;
Guolin Ke's avatar
Guolin Ke committed
79
  // create tree learner
80
  tree_learner_.clear();
Guolin Ke's avatar
Guolin Ke committed
81
82
83
84
85
  for (int i = 0; i < num_class_; ++i) {
    auto new_tree_learner = std::unique_ptr<TreeLearner>(TreeLearner::CreateTreeLearner(gbdt_config_->tree_learner_type, gbdt_config_->tree_config));
    new_tree_learner->Init(train_data_);
    // init tree learner
    tree_learner_.push_back(std::move(new_tree_learner));
86
  }
Guolin Ke's avatar
Guolin Ke committed
87
  tree_learner_.shrink_to_fit();
Guolin Ke's avatar
Guolin Ke committed
88
89
  object_function_ = object_function;
  // push training metrics
90
  training_metrics_.clear();
Guolin Ke's avatar
Guolin Ke committed
91
92
93
  for (const auto& metric : training_metrics) {
    training_metrics_.push_back(metric);
  }
Guolin Ke's avatar
Guolin Ke committed
94
  training_metrics_.shrink_to_fit();
Guolin Ke's avatar
Guolin Ke committed
95
  // create score tracker
Guolin Ke's avatar
Guolin Ke committed
96
  train_score_updater_.reset(new ScoreUpdater(train_data_, num_class_));
Guolin Ke's avatar
Guolin Ke committed
97
98
  num_data_ = train_data_->num_data();
  // create buffer for gradients and hessians
99
  if (object_function_ != nullptr) {
Guolin Ke's avatar
Guolin Ke committed
100
101
102
103
    gradients_ = std::vector<score_t>(num_data_ * num_class_);
    hessians_ = std::vector<score_t>(num_data_ * num_class_);
  }
  sigmoid_ = -1.0f;
104
  if (object_function_ != nullptr
Guolin Ke's avatar
Guolin Ke committed
105
106
107
    && std::string(object_function_->GetName()) == std::string("binary")) {
    // only binary classification need sigmoid transform
    sigmoid_ = gbdt_config_->sigmoid;
108
  }
Guolin Ke's avatar
Guolin Ke committed
109
  // get max feature index
110
  max_feature_idx_ = train_data_->num_total_features() - 1;
Guolin Ke's avatar
Guolin Ke committed
111
112
  // get label index
  label_idx_ = train_data_->label_idx();
Guolin Ke's avatar
Guolin Ke committed
113
114
  // if need bagging, create buffer
  if (gbdt_config_->bagging_fraction < 1.0 && gbdt_config_->bagging_freq > 0) {
Guolin Ke's avatar
Guolin Ke committed
115
116
    out_of_bag_data_indices_ = std::vector<data_size_t>(num_data_);
    bag_data_indices_ = std::vector<data_size_t>(num_data_);
Guolin Ke's avatar
Guolin Ke committed
117
118
  } else {
    out_of_bag_data_cnt_ = 0;
Guolin Ke's avatar
Guolin Ke committed
119
    out_of_bag_data_indices_.clear();
Guolin Ke's avatar
Guolin Ke committed
120
    bag_data_cnt_ = num_data_;
Guolin Ke's avatar
Guolin Ke committed
121
    bag_data_indices_.clear();
Guolin Ke's avatar
Guolin Ke committed
122
  }
123
124
125
126
127
128
  // update score
  for (int i = 0; i < iter_; ++i) {
    for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
      auto curr_tree = i * num_class_ + curr_class;
      train_score_updater_->AddScore(models_[curr_tree].get(), curr_class);
    }
129
130
  }
}
131
132

void GBDT::AddValidDataset(const Dataset* valid_data,
Guolin Ke's avatar
Guolin Ke committed
133
  const std::vector<const Metric*>& valid_metrics) {
134
135
  if (!train_data_->CheckAlign(*valid_data)) {
    Log::Fatal("cannot add validation data, since it has different bin mappers with training data");
136
  }
Guolin Ke's avatar
Guolin Ke committed
137
  // for a validation dataset, we need its score and metric
Guolin Ke's avatar
Guolin Ke committed
138
  auto new_score_updater = std::unique_ptr<ScoreUpdater>(new ScoreUpdater(valid_data, num_class_));
139
140
141
142
143
144
145
  // update score
  for (int i = 0; i < iter_; ++i) {
    for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
      auto curr_tree = i * num_class_ + curr_class;
      new_score_updater->AddScore(models_[curr_tree].get(), curr_class);
    }
  }
Guolin Ke's avatar
Guolin Ke committed
146
  valid_score_updater_.push_back(std::move(new_score_updater));
Guolin Ke's avatar
Guolin Ke committed
147
  valid_metrics_.emplace_back();
148
149
150
151
  if (early_stopping_round_ > 0) {
    best_iter_.emplace_back();
    best_score_.emplace_back();
  }
Guolin Ke's avatar
Guolin Ke committed
152
153
  for (const auto& metric : valid_metrics) {
    valid_metrics_.back().push_back(metric);
154
155
156
157
    if (early_stopping_round_ > 0) {
      best_iter_.back().push_back(0);
      best_score_.back().push_back(kMinScore);
    }
Guolin Ke's avatar
Guolin Ke committed
158
  }
Guolin Ke's avatar
Guolin Ke committed
159
  valid_metrics_.back().shrink_to_fit();
Guolin Ke's avatar
Guolin Ke committed
160
161
162
}


163
void GBDT::Bagging(int iter, const int curr_class) {
Guolin Ke's avatar
Guolin Ke committed
164
  // if need bagging
Guolin Ke's avatar
Guolin Ke committed
165
  if (out_of_bag_data_indices_.size() > 0 && iter % gbdt_config_->bagging_freq == 0) {
Guolin Ke's avatar
Guolin Ke committed
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
    // if doesn't have query data
    if (train_data_->metadata().query_boundaries() == nullptr) {
      bag_data_cnt_ =
        static_cast<data_size_t>(gbdt_config_->bagging_fraction * num_data_);
      out_of_bag_data_cnt_ = num_data_ - bag_data_cnt_;
      data_size_t cur_left_cnt = 0;
      data_size_t cur_right_cnt = 0;
      // random bagging, minimal unit is one record
      for (data_size_t i = 0; i < num_data_; ++i) {
        double prob =
          (bag_data_cnt_ - cur_left_cnt) / static_cast<double>(num_data_ - i);
        if (random_.NextDouble() < prob) {
          bag_data_indices_[cur_left_cnt++] = i;
        } else {
          out_of_bag_data_indices_[cur_right_cnt++] = i;
        }
      }
    } else {
      // if have query data
      const data_size_t* query_boundaries = train_data_->metadata().query_boundaries();
      data_size_t num_query = train_data_->metadata().num_queries();
      data_size_t bag_query_cnt =
          static_cast<data_size_t>(num_query * gbdt_config_->bagging_fraction);
      data_size_t cur_left_query_cnt = 0;
      data_size_t cur_left_cnt = 0;
      data_size_t cur_right_cnt = 0;
      // random bagging, minimal unit is one query
      for (data_size_t i = 0; i < num_query; ++i) {
        double prob =
            (bag_query_cnt - cur_left_query_cnt) / static_cast<double>(num_query - i);
        if (random_.NextDouble() < prob) {
          for (data_size_t j = query_boundaries[i]; j < query_boundaries[i + 1]; ++j) {
            bag_data_indices_[cur_left_cnt++] = j;
          }
          cur_left_query_cnt++;
        } else {
          for (data_size_t j = query_boundaries[i]; j < query_boundaries[i + 1]; ++j) {
            out_of_bag_data_indices_[cur_right_cnt++] = j;
          }
        }
      }
      bag_data_cnt_ = cur_left_cnt;
      out_of_bag_data_cnt_ = num_data_ - bag_data_cnt_;
    }
Guolin Ke's avatar
Guolin Ke committed
210
    Log::Debug("Re-bagging, using %d data to train", bag_data_cnt_);
Guolin Ke's avatar
Guolin Ke committed
211
    // set bagging data to tree learner
Guolin Ke's avatar
Guolin Ke committed
212
    tree_learner_[curr_class]->SetBaggingData(bag_data_indices_.data(), bag_data_cnt_);
Guolin Ke's avatar
Guolin Ke committed
213
214
215
  }
}

216
void GBDT::UpdateScoreOutOfBag(const Tree* tree, const int curr_class) {
Hui Xue's avatar
Hui Xue committed
217
  // we need to predict out-of-bag socres of data for boosting
Guolin Ke's avatar
Guolin Ke committed
218
219
  if (out_of_bag_data_indices_.size() > 0) {
    train_score_updater_->AddScore(tree, out_of_bag_data_indices_.data(), out_of_bag_data_cnt_, curr_class);
Guolin Ke's avatar
Guolin Ke committed
220
221
222
  }
}

223
bool GBDT::TrainOneIter(const score_t* gradient, const score_t* hessian, bool is_eval) {
Guolin Ke's avatar
Guolin Ke committed
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
  // boosting first
  if (gradient == nullptr || hessian == nullptr) {
    Boosting();
    gradient = gradients_.data();
    hessian = hessians_.data();
  }

  for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
    // bagging logic
    Bagging(iter_, curr_class);

    // train a new tree
    std::unique_ptr<Tree> new_tree(tree_learner_[curr_class]->Train(gradient + curr_class * num_data_, hessian + curr_class * num_data_));
    // if cannot learn a new tree, then stop
    if (new_tree->num_leaves() <= 1) {
      Log::Info("Stopped training because there are no more leafs that meet the split requirements.");
      return true;
241
    }
242

Guolin Ke's avatar
Guolin Ke committed
243
244
245
246
247
    // shrinkage by learning rate
    new_tree->Shrinkage(shrinkage_rate_);
    // update score
    UpdateScore(new_tree.get(), curr_class);
    UpdateScoreOutOfBag(new_tree.get(), curr_class);
248

Guolin Ke's avatar
Guolin Ke committed
249
250
251
252
253
254
255
256
257
    // add model
    models_.push_back(std::move(new_tree));
  }
  ++iter_;
  if (is_eval) {
    return EvalAndCheckEarlyStopping();
  } else {
    return false;
  }
258

Guolin Ke's avatar
Guolin Ke committed
259
}
260

261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
void GBDT::RollbackOneIter() {
  if (iter_ == 0) { return; }
  int cur_iter = iter_ - 1;
  // reset score
  for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
    auto curr_tree = cur_iter * num_class_ + curr_class;
    models_[curr_tree]->Shrinkage(-1.0);
    train_score_updater_->AddScore(models_[curr_tree].get(), curr_class);
    for (auto& score_updater : valid_score_updater_) {
      score_updater->AddScore(models_[curr_tree].get(), curr_class);
    }
  }
  // remove model
  for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
    models_.pop_back();
  }
  --iter_;
}

Guolin Ke's avatar
Guolin Ke committed
280
bool GBDT::EvalAndCheckEarlyStopping() {
281
282
  bool is_met_early_stopping = false;
  // print message for metric
Guolin Ke's avatar
Guolin Ke committed
283
  is_met_early_stopping = OutputMetric(iter_);
284
285
286
287
  if (is_met_early_stopping) {
    Log::Info("Early stopping at iteration %d, the best iteration round is %d",
      iter_, iter_ - early_stopping_round_);
    // pop last early_stopping_round_ models
288
    for (int i = 0; i < early_stopping_round_ * num_class_; ++i) {
289
290
291
292
      models_.pop_back();
    }
  }
  return is_met_early_stopping;
Guolin Ke's avatar
Guolin Ke committed
293
294
}

295
void GBDT::UpdateScore(const Tree* tree, const int curr_class) {
Guolin Ke's avatar
Guolin Ke committed
296
  // update training score
Guolin Ke's avatar
Guolin Ke committed
297
  train_score_updater_->AddScore(tree_learner_[curr_class].get(), curr_class);
Guolin Ke's avatar
Guolin Ke committed
298
  // update validation score
Guolin Ke's avatar
Guolin Ke committed
299
300
  for (auto& score_updater : valid_score_updater_) {
    score_updater->AddScore(tree, curr_class);
Guolin Ke's avatar
Guolin Ke committed
301
302
303
  }
}

wxchan's avatar
wxchan committed
304
305
bool GBDT::OutputMetric(int iter) {
  bool ret = false;
Guolin Ke's avatar
Guolin Ke committed
306
  // print training metric
307
308
309
310
  if ((iter % gbdt_config_->output_freq) == 0) {
    for (auto& sub_metric : training_metrics_) {
      auto name = sub_metric->GetName();
      auto scores = sub_metric->Eval(train_score_updater_->score());
Guolin Ke's avatar
Guolin Ke committed
311
      for (size_t k = 0; k < name.size(); ++k) {
Guolin Ke's avatar
Guolin Ke committed
312
        Log::Info("Iteration:%d, training %s : %f", iter, name[k].c_str(), scores[k]);
313
      }
314
    }
Guolin Ke's avatar
Guolin Ke committed
315
316
  }
  // print validation metric
317
318
319
320
321
322
  if ((iter % gbdt_config_->output_freq) == 0 || early_stopping_round_ > 0) {
    for (size_t i = 0; i < valid_metrics_.size(); ++i) {
      for (size_t j = 0; j < valid_metrics_[i].size(); ++j) {
        auto test_scores = valid_metrics_[i][j]->Eval(valid_score_updater_[i]->score());
        if ((iter % gbdt_config_->output_freq) == 0) {
          auto name = valid_metrics_[i][j]->GetName();
Guolin Ke's avatar
Guolin Ke committed
323
          for (size_t k = 0; k < name.size(); ++k) {
Guolin Ke's avatar
Guolin Ke committed
324
            Log::Info("Iteration:%d, valid_%d %s : %f", iter, i + 1, name[k].c_str(), test_scores[k]);
325
          }
wxchan's avatar
wxchan committed
326
        }
327
        if (!ret && early_stopping_round_ > 0) {
328
329
330
          auto cur_score = valid_metrics_[i][j]->factor_to_bigger_better() * test_scores.back();
          if (cur_score > best_score_[i][j]) {
            best_score_[i][j] = cur_score;
331
332
            best_iter_[i][j] = iter;
          } else {
333
            if (iter - best_iter_[i][j] >= early_stopping_round_) { ret = true; }
334
          }
wxchan's avatar
wxchan committed
335
336
        }
      }
Guolin Ke's avatar
Guolin Ke committed
337
338
    }
  }
wxchan's avatar
wxchan committed
339
  return ret;
Guolin Ke's avatar
Guolin Ke committed
340
341
}

342
/*! \brief Get eval result */
343
344
345
346
std::vector<double> GBDT::GetEvalAt(int data_idx) const {
  CHECK(data_idx >= 0 && data_idx <= static_cast<int>(valid_metrics_.size()));
  std::vector<double> ret;
  if (data_idx == 0) {
347
348
    for (auto& sub_metric : training_metrics_) {
      auto scores = sub_metric->Eval(train_score_updater_->score());
349
350
351
      for (auto score : scores) {
        ret.push_back(score);
      }
352
353
    }
  }
354
355
356
357
358
359
360
  else {
    auto used_idx = data_idx - 1;
    for (size_t j = 0; j < valid_metrics_[used_idx].size(); ++j) {
      auto test_scores = valid_metrics_[used_idx][j]->Eval(valid_score_updater_[used_idx]->score());
      for (auto score : test_scores) {
        ret.push_back(score);
      }
361
362
363
364
365
    }
  }
  return ret;
}

Guolin Ke's avatar
Guolin Ke committed
366
/*! \brief Get training scores result */
367
const score_t* GBDT::GetTrainingScore(data_size_t* out_len) {
Guolin Ke's avatar
Guolin Ke committed
368
369
  *out_len = train_score_updater_->num_data() * num_class_;
  return train_score_updater_->score();
370
371
}

Guolin Ke's avatar
Guolin Ke committed
372
void GBDT::GetPredictAt(int data_idx, score_t* out_result, data_size_t* out_len) {
Guolin Ke's avatar
Guolin Ke committed
373
374
375
376
377
378
  CHECK(data_idx >= 0 && data_idx <= static_cast<int>(valid_metrics_.size()));
  std::vector<double> ret;

  const score_t* raw_scores = nullptr;
  data_size_t num_data = 0;
  if (data_idx == 0) {
Guolin Ke's avatar
Guolin Ke committed
379
    raw_scores = GetTrainingScore(out_len);
Guolin Ke's avatar
Guolin Ke committed
380
381
382
383
384
    num_data = train_score_updater_->num_data();
  } else {
    auto used_idx = data_idx - 1;
    raw_scores = valid_score_updater_[used_idx]->score();
    num_data = valid_score_updater_[used_idx]->num_data();
Guolin Ke's avatar
Guolin Ke committed
385
    *out_len = num_data * num_class_;
Guolin Ke's avatar
Guolin Ke committed
386
387
  }
  if (num_class_ > 1) {
Guolin Ke's avatar
Guolin Ke committed
388
#pragma omp parallel for schedule(static)
Guolin Ke's avatar
Guolin Ke committed
389
390
391
392
393
394
395
396
397
398
    for (data_size_t i = 0; i < num_data; ++i) {
      std::vector<double> tmp_result;
      for (int j = 0; j < num_class_; ++j) {
        tmp_result.push_back(raw_scores[j * num_data + i]);
      }
      Common::Softmax(&tmp_result);
      for (int j = 0; j < num_class_; ++j) {
        out_result[j * num_data + i] = static_cast<score_t>(tmp_result[i]);
      }
    }
Guolin Ke's avatar
Guolin Ke committed
399
  } else if(sigmoid_ > 0.0f){
Guolin Ke's avatar
Guolin Ke committed
400
#pragma omp parallel for schedule(static)
Guolin Ke's avatar
Guolin Ke committed
401
402
403
404
    for (data_size_t i = 0; i < num_data; ++i) {
      out_result[i] = static_cast<score_t>(1.0f / (1.0f + std::exp(-2.0f * sigmoid_ * raw_scores[i])));
    }
  } else {
Guolin Ke's avatar
Guolin Ke committed
405
#pragma omp parallel for schedule(static)
Guolin Ke's avatar
Guolin Ke committed
406
407
408
409
410
411
412
    for (data_size_t i = 0; i < num_data; ++i) {
      out_result[i] = raw_scores[i];
    }
  }

}

Guolin Ke's avatar
Guolin Ke committed
413
void GBDT::Boosting() {
414
415
416
  if (object_function_ == nullptr) {
    Log::Fatal("No object function provided");
  }
Hui Xue's avatar
Hui Xue committed
417
  // objective function will calculate gradients and hessians
Guolin Ke's avatar
Guolin Ke committed
418
  int num_score = 0;
Guolin Ke's avatar
Guolin Ke committed
419
  object_function_->
Guolin Ke's avatar
Guolin Ke committed
420
    GetGradients(GetTrainingScore(&num_score), gradients_.data(), hessians_.data());
Guolin Ke's avatar
Guolin Ke committed
421
422
}

Guolin Ke's avatar
Guolin Ke committed
423
void GBDT::SaveModelToFile(int num_iteration, bool is_finish, const char* filename) {
424
  // first time to this function, open file
Guolin Ke's avatar
Guolin Ke committed
425
  if (saved_model_size_ < 0) {
426
427
    model_output_file_.open(filename);
    // output model type
428
    model_output_file_ << Name() << std::endl;
429
430
    // output number of class
    model_output_file_ << "num_class=" << num_class_ << std::endl;
431
432
433
434
    // output label index
    model_output_file_ << "label_index=" << label_idx_ << std::endl;
    // output max_feature_idx
    model_output_file_ << "max_feature_idx=" << max_feature_idx_ << std::endl;
Guolin Ke's avatar
Guolin Ke committed
435
436
437
438
    // output objective name
    if (object_function_ != nullptr) {
      model_output_file_ << "objective=" << object_function_->GetName() << std::endl;
    }
439
    // output sigmoid parameter
Guolin Ke's avatar
Guolin Ke committed
440
    model_output_file_ << "sigmoid=" << sigmoid_ << std::endl;
441
442
443
444
445
446
447
    model_output_file_ << std::endl;
    saved_model_size_ = 0;
  }
  // already saved
  if (!model_output_file_.is_open()) {
    return;
  }
Guolin Ke's avatar
Guolin Ke committed
448
449
  int num_used_model = 0;
  if (num_iteration == NO_LIMIT) {
Guolin Ke's avatar
Guolin Ke committed
450
451
    num_used_model = static_cast<int>(models_.size());
  } else {
Guolin Ke's avatar
Guolin Ke committed
452
    num_used_model = num_iteration * num_class_;
Guolin Ke's avatar
Guolin Ke committed
453
454
  }
  int rest = num_used_model - early_stopping_round_ * num_class_;
455
456
457
458
459
  // output tree models
  for (int i = saved_model_size_; i < rest; ++i) {
    model_output_file_ << "Tree=" << i << std::endl;
    model_output_file_ << models_[i]->ToString() << std::endl;
  }
460

Guolin Ke's avatar
Guolin Ke committed
461
  saved_model_size_ = std::max(saved_model_size_, rest);
462

463
464
465
  model_output_file_.flush();
  // training finished, can close file
  if (is_finish) {
Guolin Ke's avatar
Guolin Ke committed
466
    for (int i = saved_model_size_; i < num_used_model; ++i) {
467
468
469
470
471
      model_output_file_ << "Tree=" << i << std::endl;
      model_output_file_ << models_[i]->ToString() << std::endl;
    }
    model_output_file_ << std::endl << FeatureImportance() << std::endl;
    model_output_file_.close();
Guolin Ke's avatar
Guolin Ke committed
472
473
474
  }
}

Guolin Ke's avatar
Guolin Ke committed
475
void GBDT::LoadModelFromString(const std::string& model_str) {
Guolin Ke's avatar
Guolin Ke committed
476
477
478
  // use serialized string to restore this object
  models_.clear();
  std::vector<std::string> lines = Common::Split(model_str.c_str(), '\n');
479
480

  // get number of classes
481
482
483
484
  auto line = Common::FindFromLines(lines, "num_class=");
  if (line.size() > 0) {
    Common::Atoi(Common::Split(line.c_str(), '=')[1].c_str(), &num_class_);
  } else {
485
    Log::Fatal("Model file doesn't specify the number of classes");
486
487
    return;
  }
Guolin Ke's avatar
Guolin Ke committed
488
  // get index of label
489
490
491
492
  line = Common::FindFromLines(lines, "label_index=");
  if (line.size() > 0) {
    Common::Atoi(Common::Split(line.c_str(), '=')[1].c_str(), &label_idx_);
  } else {
493
    Log::Fatal("Model file doesn't specify the label index");
Guolin Ke's avatar
Guolin Ke committed
494
495
    return;
  }
Guolin Ke's avatar
Guolin Ke committed
496
  // get max_feature_idx first
497
498
499
500
  line = Common::FindFromLines(lines, "max_feature_idx=");
  if (line.size() > 0) {
    Common::Atoi(Common::Split(line.c_str(), '=')[1].c_str(), &max_feature_idx_);
  } else {
501
    Log::Fatal("Model file doesn't specify max_feature_idx");
Guolin Ke's avatar
Guolin Ke committed
502
503
504
    return;
  }
  // get sigmoid parameter
505
506
507
508
  line = Common::FindFromLines(lines, "sigmoid=");
  if (line.size() > 0) {
    Common::Atof(Common::Split(line.c_str(), '=')[1].c_str(), &sigmoid_);
  } else {
509
    sigmoid_ = -1.0f;
Guolin Ke's avatar
Guolin Ke committed
510
511
  }
  // get tree models
512
  size_t i = 0;
Guolin Ke's avatar
Guolin Ke committed
513
514
515
516
517
518
519
  while (i < lines.size()) {
    size_t find_pos = lines[i].find("Tree=");
    if (find_pos != std::string::npos) {
      ++i;
      int start = static_cast<int>(i);
      while (i < lines.size() && lines[i].find("Tree=") == std::string::npos) { ++i; }
      int end = static_cast<int>(i);
520
      std::string tree_str = Common::Join<std::string>(lines, start, end, '\n');
Guolin Ke's avatar
Guolin Ke committed
521
522
      auto new_tree = std::unique_ptr<Tree>(new Tree(tree_str));
      models_.push_back(std::move(new_tree));
Guolin Ke's avatar
Guolin Ke committed
523
524
525
526
    } else {
      ++i;
    }
  }
527
  Log::Info("Finished loading %d models", models_.size());
Guolin Ke's avatar
Guolin Ke committed
528
  num_iteration_for_pred_ = static_cast<int>(models_.size()) / num_class_;
529
  num_init_iteration_ = num_iteration_for_pred_;
Guolin Ke's avatar
Guolin Ke committed
530
531
}

532
std::string GBDT::FeatureImportance() const {
533
  std::vector<size_t> feature_importances(max_feature_idx_ + 1, 0);
534
    for (size_t iter = 0; iter < models_.size(); ++iter) {
535
536
        for (int split_idx = 0; split_idx < models_[iter]->num_leaves() - 1; ++split_idx) {
            ++feature_importances[models_[iter]->split_feature_real(split_idx)];
wxchan's avatar
wxchan committed
537
538
        }
    }
539
540
541
    // store the importance first
    std::vector<std::pair<size_t, std::string>> pairs;
    for (size_t i = 0; i < feature_importances.size(); ++i) {
Guolin Ke's avatar
Guolin Ke committed
542
543
544
      if (feature_importances[i] > 0) {
        pairs.emplace_back(feature_importances[i], train_data_->feature_names()[i]);
      }
545
546
547
548
549
    }
    // sort the importance
    std::sort(pairs.begin(), pairs.end(),
      [](const std::pair<size_t, std::string>& lhs,
        const std::pair<size_t, std::string>& rhs) {
550
      return lhs.first > rhs.first;
551
    });
552
    std::stringstream str_buf;
553
    // write to model file
554
    str_buf << std::endl << "feature importances:" << std::endl;
555
    for (size_t i = 0; i < pairs.size(); ++i) {
556
      str_buf << pairs[i].second << "=" << std::to_string(pairs[i].first) << std::endl;
557
    }
558
    return str_buf.str();
wxchan's avatar
wxchan committed
559
560
}

561
562
std::vector<double> GBDT::PredictRaw(const double* value) const {
  std::vector<double> ret(num_class_, 0.0f);
Guolin Ke's avatar
Guolin Ke committed
563
  for (int i = 0; i < num_iteration_for_pred_; ++i) {
564
565
566
    for (int j = 0; j < num_class_; ++j) {
      ret[j] += models_[i * num_class_ + j]->Predict(value);
    }
Guolin Ke's avatar
Guolin Ke committed
567
568
569
570
  }
  return ret;
}

571
std::vector<double> GBDT::Predict(const double* value) const {
572
  std::vector<double> ret(num_class_, 0.0f);
Guolin Ke's avatar
Guolin Ke committed
573
  for (int i = 0; i < num_iteration_for_pred_; ++i) {
574
575
    for (int j = 0; j < num_class_; ++j) {
      ret[j] += models_[i * num_class_ + j]->Predict(value);
576
577
    }
  }
578
579
580
581
582
583
  // if need sigmoid transform
  if (sigmoid_ > 0 && num_class_ == 1) {
    ret[0] = 1.0f / (1.0f + std::exp(- 2.0f * sigmoid_ * ret[0]));
  } else if (num_class_ > 1) {
    Common::Softmax(&ret);
  }
584
585
586
  return ret;
}

587
std::vector<int> GBDT::PredictLeafIndex(const double* value) const {
wxchan's avatar
wxchan committed
588
  std::vector<int> ret;
Guolin Ke's avatar
Guolin Ke committed
589
  for (int i = 0; i < num_iteration_for_pred_; ++i) {
590
591
592
    for (int j = 0; j < num_class_; ++j) {
      ret.push_back(models_[i * num_class_ + j]->PredictLeafIndex(value));
    }
wxchan's avatar
wxchan committed
593
594
595
596
  }
  return ret;
}

Guolin Ke's avatar
Guolin Ke committed
597
}  // namespace LightGBM