"tests/vscode:/vscode.git/clone" did not exist on "c6512e0185383d1575fe0ce0645e87ba1b64c3fd"
gbdt.cpp 19.6 KB
Newer Older
Guolin Ke's avatar
Guolin Ke committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
#include "gbdt.h"

#include <LightGBM/utils/common.h>

#include <LightGBM/feature.h>
#include <LightGBM/objective_function.h>
#include <LightGBM/metric.h>

#include <algorithm>
#include <chrono>
#include <ctime>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
Guolin Ke's avatar
Guolin Ke committed
16
17
18

namespace LightGBM {

19
20
21
22
/*!
 * \brief Default constructor.
 *
 * saved_model_size_ starts at -1, which SaveModelToFile interprets as
 * "the model file has not been opened yet"; both iteration counters start at zero.
 */
GBDT::GBDT()
    : saved_model_size_(-1), num_iteration_for_pred_(0), num_init_iteration_(0) {}

/*! \brief Destructor; this definition performs no explicit cleanup of its own. */
GBDT::~GBDT() {}

30
31
32
void GBDT::Init(const BoostingConfig* config, const Dataset* train_data, const ObjectiveFunction* object_function,
     const std::vector<const Metric*>& training_metrics) {
  iter_ = 0;
33
  saved_model_size_ = -1;
Guolin Ke's avatar
Guolin Ke committed
34
  num_iteration_for_pred_ = 0;
35
  max_feature_idx_ = 0;
36
  num_class_ = config->num_class;
37
  train_data_ = nullptr;
38
  ResetTrainingData(config, train_data, object_function, training_metrics);
39
40
}

41
42
void GBDT::ResetTrainingData(const BoostingConfig* config, const Dataset* train_data, const ObjectiveFunction* object_function,
  const std::vector<const Metric*>& training_metrics) {
43
44
45
  if (train_data_ != nullptr && !train_data_->CheckAlign(*train_data)) {
    Log::Fatal("cannot reset training data, since new training data has different bin mappers");
  }
46
47
48
  gbdt_config_ = config;
  early_stopping_round_ = gbdt_config_->early_stopping_round;
  shrinkage_rate_ = gbdt_config_->learning_rate;
Guolin Ke's avatar
Guolin Ke committed
49
  random_ = Random(gbdt_config_->bagging_seed);
Guolin Ke's avatar
Guolin Ke committed
50
  // create tree learner
51
  tree_learner_.clear();
Guolin Ke's avatar
Guolin Ke committed
52
53
  for (int i = 0; i < num_class_; ++i) {
    auto new_tree_learner = std::unique_ptr<TreeLearner>(TreeLearner::CreateTreeLearner(gbdt_config_->tree_learner_type, gbdt_config_->tree_config));
Guolin Ke's avatar
Guolin Ke committed
54
    new_tree_learner->Init(train_data);
Guolin Ke's avatar
Guolin Ke committed
55
56
    // init tree learner
    tree_learner_.push_back(std::move(new_tree_learner));
57
  }
Guolin Ke's avatar
Guolin Ke committed
58
  tree_learner_.shrink_to_fit();
Guolin Ke's avatar
Guolin Ke committed
59
60
  object_function_ = object_function;
  // push training metrics
61
  training_metrics_.clear();
Guolin Ke's avatar
Guolin Ke committed
62
63
64
  for (const auto& metric : training_metrics) {
    training_metrics_.push_back(metric);
  }
Guolin Ke's avatar
Guolin Ke committed
65
66
  training_metrics_.shrink_to_fit();
  sigmoid_ = -1.0f;
67
  if (object_function_ != nullptr
Guolin Ke's avatar
Guolin Ke committed
68
69
70
    && std::string(object_function_->GetName()) == std::string("binary")) {
    // only binary classification need sigmoid transform
    sigmoid_ = gbdt_config_->sigmoid;
71
  }
Guolin Ke's avatar
Guolin Ke committed
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
  if (train_data_ != train_data) {
    // not same training data, need reset score and others
    // create score tracker
    train_score_updater_.reset(new ScoreUpdater(train_data, num_class_));
    // update score
    for (int i = 0; i < iter_; ++i) {
      for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
        auto curr_tree = (i + num_init_iteration_) * num_class_ + curr_class;
        train_score_updater_->AddScore(models_[curr_tree].get(), curr_class);
      }
    }
    num_data_ = train_data->num_data();
    // create buffer for gradients and hessians
    if (object_function_ != nullptr) {
      gradients_ = std::vector<score_t>(num_data_ * num_class_);
      hessians_ = std::vector<score_t>(num_data_ * num_class_);
    }
    // get max feature index
    max_feature_idx_ = train_data->num_total_features() - 1;
    // get label index
    label_idx_ = train_data->label_idx();
    // if need bagging, create buffer
    if (gbdt_config_->bagging_fraction < 1.0 && gbdt_config_->bagging_freq > 0) {
      out_of_bag_data_indices_ = std::vector<data_size_t>(num_data_);
      bag_data_indices_ = std::vector<data_size_t>(num_data_);
    } else {
      out_of_bag_data_cnt_ = 0;
      out_of_bag_data_indices_.clear();
      bag_data_cnt_ = num_data_;
      bag_data_indices_.clear();
102
    }
103
  }
Guolin Ke's avatar
Guolin Ke committed
104
  train_data_ = train_data;
105
}
106
107

void GBDT::AddValidDataset(const Dataset* valid_data,
Guolin Ke's avatar
Guolin Ke committed
108
  const std::vector<const Metric*>& valid_metrics) {
109
110
  if (!train_data_->CheckAlign(*valid_data)) {
    Log::Fatal("cannot add validation data, since it has different bin mappers with training data");
111
  }
Guolin Ke's avatar
Guolin Ke committed
112
  // for a validation dataset, we need its score and metric
Guolin Ke's avatar
Guolin Ke committed
113
  auto new_score_updater = std::unique_ptr<ScoreUpdater>(new ScoreUpdater(valid_data, num_class_));
114
115
116
  // update score
  for (int i = 0; i < iter_; ++i) {
    for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
Guolin Ke's avatar
Guolin Ke committed
117
      auto curr_tree = (i + num_init_iteration_) * num_class_ + curr_class;
118
119
120
      new_score_updater->AddScore(models_[curr_tree].get(), curr_class);
    }
  }
Guolin Ke's avatar
Guolin Ke committed
121
  valid_score_updater_.push_back(std::move(new_score_updater));
Guolin Ke's avatar
Guolin Ke committed
122
  valid_metrics_.emplace_back();
123
124
125
126
  if (early_stopping_round_ > 0) {
    best_iter_.emplace_back();
    best_score_.emplace_back();
  }
Guolin Ke's avatar
Guolin Ke committed
127
128
  for (const auto& metric : valid_metrics) {
    valid_metrics_.back().push_back(metric);
129
130
131
132
    if (early_stopping_round_ > 0) {
      best_iter_.back().push_back(0);
      best_score_.back().push_back(kMinScore);
    }
Guolin Ke's avatar
Guolin Ke committed
133
  }
Guolin Ke's avatar
Guolin Ke committed
134
  valid_metrics_.back().shrink_to_fit();
Guolin Ke's avatar
Guolin Ke committed
135
136
137
}


138
void GBDT::Bagging(int iter, const int curr_class) {
Guolin Ke's avatar
Guolin Ke committed
139
  // if need bagging
Guolin Ke's avatar
Guolin Ke committed
140
  if (out_of_bag_data_indices_.size() > 0 && iter % gbdt_config_->bagging_freq == 0) {
Guolin Ke's avatar
Guolin Ke committed
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
    // if doesn't have query data
    if (train_data_->metadata().query_boundaries() == nullptr) {
      bag_data_cnt_ =
        static_cast<data_size_t>(gbdt_config_->bagging_fraction * num_data_);
      out_of_bag_data_cnt_ = num_data_ - bag_data_cnt_;
      data_size_t cur_left_cnt = 0;
      data_size_t cur_right_cnt = 0;
      // random bagging, minimal unit is one record
      for (data_size_t i = 0; i < num_data_; ++i) {
        double prob =
          (bag_data_cnt_ - cur_left_cnt) / static_cast<double>(num_data_ - i);
        if (random_.NextDouble() < prob) {
          bag_data_indices_[cur_left_cnt++] = i;
        } else {
          out_of_bag_data_indices_[cur_right_cnt++] = i;
        }
      }
    } else {
      // if have query data
      const data_size_t* query_boundaries = train_data_->metadata().query_boundaries();
      data_size_t num_query = train_data_->metadata().num_queries();
      data_size_t bag_query_cnt =
          static_cast<data_size_t>(num_query * gbdt_config_->bagging_fraction);
      data_size_t cur_left_query_cnt = 0;
      data_size_t cur_left_cnt = 0;
      data_size_t cur_right_cnt = 0;
      // random bagging, minimal unit is one query
      for (data_size_t i = 0; i < num_query; ++i) {
        double prob =
            (bag_query_cnt - cur_left_query_cnt) / static_cast<double>(num_query - i);
        if (random_.NextDouble() < prob) {
          for (data_size_t j = query_boundaries[i]; j < query_boundaries[i + 1]; ++j) {
            bag_data_indices_[cur_left_cnt++] = j;
          }
          cur_left_query_cnt++;
        } else {
          for (data_size_t j = query_boundaries[i]; j < query_boundaries[i + 1]; ++j) {
            out_of_bag_data_indices_[cur_right_cnt++] = j;
          }
        }
      }
      bag_data_cnt_ = cur_left_cnt;
      out_of_bag_data_cnt_ = num_data_ - bag_data_cnt_;
    }
Guolin Ke's avatar
Guolin Ke committed
185
    Log::Debug("Re-bagging, using %d data to train", bag_data_cnt_);
Guolin Ke's avatar
Guolin Ke committed
186
    // set bagging data to tree learner
Guolin Ke's avatar
Guolin Ke committed
187
    tree_learner_[curr_class]->SetBaggingData(bag_data_indices_.data(), bag_data_cnt_);
Guolin Ke's avatar
Guolin Ke committed
188
189
190
  }
}

191
void GBDT::UpdateScoreOutOfBag(const Tree* tree, const int curr_class) {
Hui Xue's avatar
Hui Xue committed
192
  // we need to predict out-of-bag socres of data for boosting
Guolin Ke's avatar
Guolin Ke committed
193
194
  if (out_of_bag_data_indices_.size() > 0) {
    train_score_updater_->AddScore(tree, out_of_bag_data_indices_.data(), out_of_bag_data_cnt_, curr_class);
Guolin Ke's avatar
Guolin Ke committed
195
196
197
  }
}

198
bool GBDT::TrainOneIter(const score_t* gradient, const score_t* hessian, bool is_eval) {
Guolin Ke's avatar
Guolin Ke committed
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
  // boosting first
  if (gradient == nullptr || hessian == nullptr) {
    Boosting();
    gradient = gradients_.data();
    hessian = hessians_.data();
  }

  for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
    // bagging logic
    Bagging(iter_, curr_class);

    // train a new tree
    std::unique_ptr<Tree> new_tree(tree_learner_[curr_class]->Train(gradient + curr_class * num_data_, hessian + curr_class * num_data_));
    // if cannot learn a new tree, then stop
    if (new_tree->num_leaves() <= 1) {
      Log::Info("Stopped training because there are no more leafs that meet the split requirements.");
      return true;
216
    }
217

Guolin Ke's avatar
Guolin Ke committed
218
219
220
221
222
    // shrinkage by learning rate
    new_tree->Shrinkage(shrinkage_rate_);
    // update score
    UpdateScore(new_tree.get(), curr_class);
    UpdateScoreOutOfBag(new_tree.get(), curr_class);
223

Guolin Ke's avatar
Guolin Ke committed
224
225
226
227
228
229
230
231
232
    // add model
    models_.push_back(std::move(new_tree));
  }
  ++iter_;
  if (is_eval) {
    return EvalAndCheckEarlyStopping();
  } else {
    return false;
  }
233

Guolin Ke's avatar
Guolin Ke committed
234
}
235

236
237
void GBDT::RollbackOneIter() {
  if (iter_ == 0) { return; }
Guolin Ke's avatar
Guolin Ke committed
238
  int cur_iter = iter_ + num_init_iteration_ - 1;
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
  // reset score
  for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
    auto curr_tree = cur_iter * num_class_ + curr_class;
    models_[curr_tree]->Shrinkage(-1.0);
    train_score_updater_->AddScore(models_[curr_tree].get(), curr_class);
    for (auto& score_updater : valid_score_updater_) {
      score_updater->AddScore(models_[curr_tree].get(), curr_class);
    }
  }
  // remove model
  for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
    models_.pop_back();
  }
  --iter_;
}

Guolin Ke's avatar
Guolin Ke committed
255
bool GBDT::EvalAndCheckEarlyStopping() {
256
257
  bool is_met_early_stopping = false;
  // print message for metric
Guolin Ke's avatar
Guolin Ke committed
258
  is_met_early_stopping = OutputMetric(iter_);
259
260
261
262
  if (is_met_early_stopping) {
    Log::Info("Early stopping at iteration %d, the best iteration round is %d",
      iter_, iter_ - early_stopping_round_);
    // pop last early_stopping_round_ models
263
    for (int i = 0; i < early_stopping_round_ * num_class_; ++i) {
264
265
266
267
      models_.pop_back();
    }
  }
  return is_met_early_stopping;
Guolin Ke's avatar
Guolin Ke committed
268
269
}

270
void GBDT::UpdateScore(const Tree* tree, const int curr_class) {
Guolin Ke's avatar
Guolin Ke committed
271
  // update training score
Guolin Ke's avatar
Guolin Ke committed
272
  train_score_updater_->AddScore(tree_learner_[curr_class].get(), curr_class);
Guolin Ke's avatar
Guolin Ke committed
273
  // update validation score
Guolin Ke's avatar
Guolin Ke committed
274
275
  for (auto& score_updater : valid_score_updater_) {
    score_updater->AddScore(tree, curr_class);
Guolin Ke's avatar
Guolin Ke committed
276
277
278
  }
}

wxchan's avatar
wxchan committed
279
280
bool GBDT::OutputMetric(int iter) {
  bool ret = false;
Guolin Ke's avatar
Guolin Ke committed
281
  // print training metric
282
283
284
285
  if ((iter % gbdt_config_->output_freq) == 0) {
    for (auto& sub_metric : training_metrics_) {
      auto name = sub_metric->GetName();
      auto scores = sub_metric->Eval(train_score_updater_->score());
Guolin Ke's avatar
Guolin Ke committed
286
      for (size_t k = 0; k < name.size(); ++k) {
Guolin Ke's avatar
Guolin Ke committed
287
        Log::Info("Iteration:%d, training %s : %f", iter, name[k].c_str(), scores[k]);
288
      }
289
    }
Guolin Ke's avatar
Guolin Ke committed
290
291
  }
  // print validation metric
292
293
294
295
296
297
  if ((iter % gbdt_config_->output_freq) == 0 || early_stopping_round_ > 0) {
    for (size_t i = 0; i < valid_metrics_.size(); ++i) {
      for (size_t j = 0; j < valid_metrics_[i].size(); ++j) {
        auto test_scores = valid_metrics_[i][j]->Eval(valid_score_updater_[i]->score());
        if ((iter % gbdt_config_->output_freq) == 0) {
          auto name = valid_metrics_[i][j]->GetName();
Guolin Ke's avatar
Guolin Ke committed
298
          for (size_t k = 0; k < name.size(); ++k) {
Guolin Ke's avatar
Guolin Ke committed
299
            Log::Info("Iteration:%d, valid_%d %s : %f", iter, i + 1, name[k].c_str(), test_scores[k]);
300
          }
wxchan's avatar
wxchan committed
301
        }
302
        if (!ret && early_stopping_round_ > 0) {
303
304
305
          auto cur_score = valid_metrics_[i][j]->factor_to_bigger_better() * test_scores.back();
          if (cur_score > best_score_[i][j]) {
            best_score_[i][j] = cur_score;
306
307
            best_iter_[i][j] = iter;
          } else {
308
            if (iter - best_iter_[i][j] >= early_stopping_round_) { ret = true; }
309
          }
wxchan's avatar
wxchan committed
310
311
        }
      }
Guolin Ke's avatar
Guolin Ke committed
312
313
    }
  }
wxchan's avatar
wxchan committed
314
  return ret;
Guolin Ke's avatar
Guolin Ke committed
315
316
}

317
/*! \brief Get eval result */
318
319
320
321
std::vector<double> GBDT::GetEvalAt(int data_idx) const {
  CHECK(data_idx >= 0 && data_idx <= static_cast<int>(valid_metrics_.size()));
  std::vector<double> ret;
  if (data_idx == 0) {
322
323
    for (auto& sub_metric : training_metrics_) {
      auto scores = sub_metric->Eval(train_score_updater_->score());
324
325
326
      for (auto score : scores) {
        ret.push_back(score);
      }
327
328
    }
  }
329
330
331
332
333
334
335
  else {
    auto used_idx = data_idx - 1;
    for (size_t j = 0; j < valid_metrics_[used_idx].size(); ++j) {
      auto test_scores = valid_metrics_[used_idx][j]->Eval(valid_score_updater_[used_idx]->score());
      for (auto score : test_scores) {
        ret.push_back(score);
      }
336
337
338
339
340
    }
  }
  return ret;
}

Guolin Ke's avatar
Guolin Ke committed
341
/*! \brief Get training scores result */
342
const score_t* GBDT::GetTrainingScore(data_size_t* out_len) {
Guolin Ke's avatar
Guolin Ke committed
343
344
  *out_len = train_score_updater_->num_data() * num_class_;
  return train_score_updater_->score();
345
346
}

Guolin Ke's avatar
Guolin Ke committed
347
void GBDT::GetPredictAt(int data_idx, score_t* out_result, data_size_t* out_len) {
Guolin Ke's avatar
Guolin Ke committed
348
349
350
351
352
353
  CHECK(data_idx >= 0 && data_idx <= static_cast<int>(valid_metrics_.size()));
  std::vector<double> ret;

  const score_t* raw_scores = nullptr;
  data_size_t num_data = 0;
  if (data_idx == 0) {
Guolin Ke's avatar
Guolin Ke committed
354
    raw_scores = GetTrainingScore(out_len);
Guolin Ke's avatar
Guolin Ke committed
355
356
357
358
359
    num_data = train_score_updater_->num_data();
  } else {
    auto used_idx = data_idx - 1;
    raw_scores = valid_score_updater_[used_idx]->score();
    num_data = valid_score_updater_[used_idx]->num_data();
Guolin Ke's avatar
Guolin Ke committed
360
    *out_len = num_data * num_class_;
Guolin Ke's avatar
Guolin Ke committed
361
362
  }
  if (num_class_ > 1) {
Guolin Ke's avatar
Guolin Ke committed
363
#pragma omp parallel for schedule(static)
Guolin Ke's avatar
Guolin Ke committed
364
365
366
367
368
369
370
371
372
373
    for (data_size_t i = 0; i < num_data; ++i) {
      std::vector<double> tmp_result;
      for (int j = 0; j < num_class_; ++j) {
        tmp_result.push_back(raw_scores[j * num_data + i]);
      }
      Common::Softmax(&tmp_result);
      for (int j = 0; j < num_class_; ++j) {
        out_result[j * num_data + i] = static_cast<score_t>(tmp_result[i]);
      }
    }
Guolin Ke's avatar
Guolin Ke committed
374
  } else if(sigmoid_ > 0.0f){
Guolin Ke's avatar
Guolin Ke committed
375
#pragma omp parallel for schedule(static)
Guolin Ke's avatar
Guolin Ke committed
376
377
378
379
    for (data_size_t i = 0; i < num_data; ++i) {
      out_result[i] = static_cast<score_t>(1.0f / (1.0f + std::exp(-2.0f * sigmoid_ * raw_scores[i])));
    }
  } else {
Guolin Ke's avatar
Guolin Ke committed
380
#pragma omp parallel for schedule(static)
Guolin Ke's avatar
Guolin Ke committed
381
382
383
384
385
386
387
    for (data_size_t i = 0; i < num_data; ++i) {
      out_result[i] = raw_scores[i];
    }
  }

}

Guolin Ke's avatar
Guolin Ke committed
388
void GBDT::Boosting() {
389
390
391
  if (object_function_ == nullptr) {
    Log::Fatal("No object function provided");
  }
Hui Xue's avatar
Hui Xue committed
392
  // objective function will calculate gradients and hessians
Guolin Ke's avatar
Guolin Ke committed
393
  int num_score = 0;
Guolin Ke's avatar
Guolin Ke committed
394
  object_function_->
Guolin Ke's avatar
Guolin Ke committed
395
    GetGradients(GetTrainingScore(&num_score), gradients_.data(), hessians_.data());
Guolin Ke's avatar
Guolin Ke committed
396
397
}

Guolin Ke's avatar
Guolin Ke committed
398
void GBDT::SaveModelToFile(int num_iteration, bool is_finish, const char* filename) {
399
  // first time to this function, open file
Guolin Ke's avatar
Guolin Ke committed
400
  if (saved_model_size_ < 0) {
401
402
    model_output_file_.open(filename);
    // output model type
403
    model_output_file_ << Name() << std::endl;
404
405
    // output number of class
    model_output_file_ << "num_class=" << num_class_ << std::endl;
406
407
408
409
    // output label index
    model_output_file_ << "label_index=" << label_idx_ << std::endl;
    // output max_feature_idx
    model_output_file_ << "max_feature_idx=" << max_feature_idx_ << std::endl;
Guolin Ke's avatar
Guolin Ke committed
410
411
412
413
    // output objective name
    if (object_function_ != nullptr) {
      model_output_file_ << "objective=" << object_function_->GetName() << std::endl;
    }
414
    // output sigmoid parameter
Guolin Ke's avatar
Guolin Ke committed
415
    model_output_file_ << "sigmoid=" << sigmoid_ << std::endl;
416
417
418
419
420
421
422
    model_output_file_ << std::endl;
    saved_model_size_ = 0;
  }
  // already saved
  if (!model_output_file_.is_open()) {
    return;
  }
Guolin Ke's avatar
Guolin Ke committed
423
424
  int num_used_model = 0;
  if (num_iteration == NO_LIMIT) {
Guolin Ke's avatar
Guolin Ke committed
425
426
    num_used_model = static_cast<int>(models_.size());
  } else {
Guolin Ke's avatar
Guolin Ke committed
427
    num_used_model = num_iteration * num_class_;
Guolin Ke's avatar
Guolin Ke committed
428
429
  }
  int rest = num_used_model - early_stopping_round_ * num_class_;
430
431
432
433
434
  // output tree models
  for (int i = saved_model_size_; i < rest; ++i) {
    model_output_file_ << "Tree=" << i << std::endl;
    model_output_file_ << models_[i]->ToString() << std::endl;
  }
435

Guolin Ke's avatar
Guolin Ke committed
436
  saved_model_size_ = std::max(saved_model_size_, rest);
437

438
439
440
  model_output_file_.flush();
  // training finished, can close file
  if (is_finish) {
Guolin Ke's avatar
Guolin Ke committed
441
    for (int i = saved_model_size_; i < num_used_model; ++i) {
442
443
444
445
446
      model_output_file_ << "Tree=" << i << std::endl;
      model_output_file_ << models_[i]->ToString() << std::endl;
    }
    model_output_file_ << std::endl << FeatureImportance() << std::endl;
    model_output_file_.close();
Guolin Ke's avatar
Guolin Ke committed
447
448
449
  }
}

Guolin Ke's avatar
Guolin Ke committed
450
void GBDT::LoadModelFromString(const std::string& model_str) {
Guolin Ke's avatar
Guolin Ke committed
451
452
453
  // use serialized string to restore this object
  models_.clear();
  std::vector<std::string> lines = Common::Split(model_str.c_str(), '\n');
454
455

  // get number of classes
456
457
458
459
  auto line = Common::FindFromLines(lines, "num_class=");
  if (line.size() > 0) {
    Common::Atoi(Common::Split(line.c_str(), '=')[1].c_str(), &num_class_);
  } else {
460
    Log::Fatal("Model file doesn't specify the number of classes");
461
462
    return;
  }
Guolin Ke's avatar
Guolin Ke committed
463
  // get index of label
464
465
466
467
  line = Common::FindFromLines(lines, "label_index=");
  if (line.size() > 0) {
    Common::Atoi(Common::Split(line.c_str(), '=')[1].c_str(), &label_idx_);
  } else {
468
    Log::Fatal("Model file doesn't specify the label index");
Guolin Ke's avatar
Guolin Ke committed
469
470
    return;
  }
Guolin Ke's avatar
Guolin Ke committed
471
  // get max_feature_idx first
472
473
474
475
  line = Common::FindFromLines(lines, "max_feature_idx=");
  if (line.size() > 0) {
    Common::Atoi(Common::Split(line.c_str(), '=')[1].c_str(), &max_feature_idx_);
  } else {
476
    Log::Fatal("Model file doesn't specify max_feature_idx");
Guolin Ke's avatar
Guolin Ke committed
477
478
479
    return;
  }
  // get sigmoid parameter
480
481
482
483
  line = Common::FindFromLines(lines, "sigmoid=");
  if (line.size() > 0) {
    Common::Atof(Common::Split(line.c_str(), '=')[1].c_str(), &sigmoid_);
  } else {
484
    sigmoid_ = -1.0f;
Guolin Ke's avatar
Guolin Ke committed
485
486
  }
  // get tree models
487
  size_t i = 0;
Guolin Ke's avatar
Guolin Ke committed
488
489
490
491
492
493
494
  while (i < lines.size()) {
    size_t find_pos = lines[i].find("Tree=");
    if (find_pos != std::string::npos) {
      ++i;
      int start = static_cast<int>(i);
      while (i < lines.size() && lines[i].find("Tree=") == std::string::npos) { ++i; }
      int end = static_cast<int>(i);
495
      std::string tree_str = Common::Join<std::string>(lines, start, end, '\n');
Guolin Ke's avatar
Guolin Ke committed
496
497
      auto new_tree = std::unique_ptr<Tree>(new Tree(tree_str));
      models_.push_back(std::move(new_tree));
Guolin Ke's avatar
Guolin Ke committed
498
499
500
501
    } else {
      ++i;
    }
  }
502
  Log::Info("Finished loading %d models", models_.size());
Guolin Ke's avatar
Guolin Ke committed
503
  num_iteration_for_pred_ = static_cast<int>(models_.size()) / num_class_;
504
  num_init_iteration_ = num_iteration_for_pred_;
Guolin Ke's avatar
Guolin Ke committed
505
506
}

507
std::string GBDT::FeatureImportance() const {
508
  std::vector<size_t> feature_importances(max_feature_idx_ + 1, 0);
509
    for (size_t iter = 0; iter < models_.size(); ++iter) {
510
511
        for (int split_idx = 0; split_idx < models_[iter]->num_leaves() - 1; ++split_idx) {
            ++feature_importances[models_[iter]->split_feature_real(split_idx)];
wxchan's avatar
wxchan committed
512
513
        }
    }
514
515
516
    // store the importance first
    std::vector<std::pair<size_t, std::string>> pairs;
    for (size_t i = 0; i < feature_importances.size(); ++i) {
Guolin Ke's avatar
Guolin Ke committed
517
518
519
      if (feature_importances[i] > 0) {
        pairs.emplace_back(feature_importances[i], train_data_->feature_names()[i]);
      }
520
521
522
523
524
    }
    // sort the importance
    std::sort(pairs.begin(), pairs.end(),
      [](const std::pair<size_t, std::string>& lhs,
        const std::pair<size_t, std::string>& rhs) {
525
      return lhs.first > rhs.first;
526
    });
527
    std::stringstream str_buf;
528
    // write to model file
529
    str_buf << std::endl << "feature importances:" << std::endl;
530
    for (size_t i = 0; i < pairs.size(); ++i) {
531
      str_buf << pairs[i].second << "=" << std::to_string(pairs[i].first) << std::endl;
532
    }
533
    return str_buf.str();
wxchan's avatar
wxchan committed
534
535
}

536
537
std::vector<double> GBDT::PredictRaw(const double* value) const {
  std::vector<double> ret(num_class_, 0.0f);
Guolin Ke's avatar
Guolin Ke committed
538
  for (int i = 0; i < num_iteration_for_pred_; ++i) {
539
540
541
    for (int j = 0; j < num_class_; ++j) {
      ret[j] += models_[i * num_class_ + j]->Predict(value);
    }
Guolin Ke's avatar
Guolin Ke committed
542
543
544
545
  }
  return ret;
}

546
std::vector<double> GBDT::Predict(const double* value) const {
547
  std::vector<double> ret(num_class_, 0.0f);
Guolin Ke's avatar
Guolin Ke committed
548
  for (int i = 0; i < num_iteration_for_pred_; ++i) {
549
550
    for (int j = 0; j < num_class_; ++j) {
      ret[j] += models_[i * num_class_ + j]->Predict(value);
551
552
    }
  }
553
554
555
556
557
558
  // if need sigmoid transform
  if (sigmoid_ > 0 && num_class_ == 1) {
    ret[0] = 1.0f / (1.0f + std::exp(- 2.0f * sigmoid_ * ret[0]));
  } else if (num_class_ > 1) {
    Common::Softmax(&ret);
  }
559
560
561
  return ret;
}

562
std::vector<int> GBDT::PredictLeafIndex(const double* value) const {
wxchan's avatar
wxchan committed
563
  std::vector<int> ret;
Guolin Ke's avatar
Guolin Ke committed
564
  for (int i = 0; i < num_iteration_for_pred_; ++i) {
565
566
567
    for (int j = 0; j < num_class_; ++j) {
      ret.push_back(models_[i * num_class_ + j]->PredictLeafIndex(value));
    }
wxchan's avatar
wxchan committed
568
569
570
571
  }
  return ret;
}

Guolin Ke's avatar
Guolin Ke committed
572
}  // namespace LightGBM