gbdt.cpp 17.6 KB
Newer Older
Guolin Ke's avatar
Guolin Ke committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
#include "gbdt.h"

#include <LightGBM/utils/common.h>

#include <LightGBM/feature.h>
#include <LightGBM/objective_function.h>
#include <LightGBM/metric.h>

#include <ctime>

#include <sstream>
#include <chrono>
#include <string>
#include <vector>
15
#include <utility>
Guolin Ke's avatar
Guolin Ke committed
16
17
18

namespace LightGBM {

Guolin Ke's avatar
Guolin Ke committed
19
20
/*!
 * \brief Construct an empty booster.
 *        saved_model_size_ starts negative, which SaveModelToFile treats as
 *        "output file not opened yet"; no iteration is usable for prediction.
 */
GBDT::GBDT()
  : saved_model_size_(-1),
    num_used_model_(0) {
}

/*! \brief Destructor; no explicit cleanup is performed in this body */
GBDT::~GBDT() {
}

27
28
void GBDT::Init(const BoostingConfig* config, const Dataset* train_data, const ObjectiveFunction* object_function,
     const std::vector<const Metric*>& training_metrics) {
Guolin Ke's avatar
Guolin Ke committed
29
  gbdt_config_ = config;
30
  iter_ = 0;
31
  saved_model_size_ = -1;
Guolin Ke's avatar
Guolin Ke committed
32
  num_used_model_ = 0;
33
34
  max_feature_idx_ = 0;
  early_stopping_round_ = gbdt_config_->early_stopping_round;
Guolin Ke's avatar
Guolin Ke committed
35
  shrinkage_rate_ = gbdt_config_->learning_rate;
Guolin Ke's avatar
Guolin Ke committed
36
  train_data_ = train_data;
37
  num_class_ = config->num_class;
Guolin Ke's avatar
Guolin Ke committed
38
  // create tree learner
Guolin Ke's avatar
Guolin Ke committed
39
40
41
42
43
  for (int i = 0; i < num_class_; ++i) {
    auto new_tree_learner = std::unique_ptr<TreeLearner>(TreeLearner::CreateTreeLearner(gbdt_config_->tree_learner_type, gbdt_config_->tree_config));
    new_tree_learner->Init(train_data_);
    // init tree learner
    tree_learner_.push_back(std::move(new_tree_learner));
44
  }
Guolin Ke's avatar
Guolin Ke committed
45
  tree_learner_.shrink_to_fit();
Guolin Ke's avatar
Guolin Ke committed
46
47
48
49
50
  object_function_ = object_function;
  // push training metrics
  for (const auto& metric : training_metrics) {
    training_metrics_.push_back(metric);
  }
Guolin Ke's avatar
Guolin Ke committed
51
  training_metrics_.shrink_to_fit();
Guolin Ke's avatar
Guolin Ke committed
52
  // create score tracker
Guolin Ke's avatar
Guolin Ke committed
53
  train_score_updater_.reset(new ScoreUpdater(train_data_, num_class_));
Guolin Ke's avatar
Guolin Ke committed
54
55
  num_data_ = train_data_->num_data();
  // create buffer for gradients and hessians
56
  if (object_function_ != nullptr) {
Guolin Ke's avatar
Guolin Ke committed
57
58
59
60
61
62
63
64
    gradients_ = std::vector<score_t>(num_data_ * num_class_);
    hessians_ = std::vector<score_t>(num_data_ * num_class_);
  }
  sigmoid_ = -1.0f;
  if (object_function_ != nullptr 
    && std::string(object_function_->GetName()) == std::string("binary")) {
    // only binary classification need sigmoid transform
    sigmoid_ = gbdt_config_->sigmoid;
65
  }
Guolin Ke's avatar
Guolin Ke committed
66
  // get max feature index
67
  max_feature_idx_ = train_data_->num_total_features() - 1;
Guolin Ke's avatar
Guolin Ke committed
68
69
  // get label index
  label_idx_ = train_data_->label_idx();
Guolin Ke's avatar
Guolin Ke committed
70
71
  // if need bagging, create buffer
  if (gbdt_config_->bagging_fraction < 1.0 && gbdt_config_->bagging_freq > 0) {
Guolin Ke's avatar
Guolin Ke committed
72
73
    out_of_bag_data_indices_ = std::vector<data_size_t>(num_data_);
    bag_data_indices_ = std::vector<data_size_t>(num_data_);
Guolin Ke's avatar
Guolin Ke committed
74
75
  } else {
    out_of_bag_data_cnt_ = 0;
Guolin Ke's avatar
Guolin Ke committed
76
    out_of_bag_data_indices_.clear();
Guolin Ke's avatar
Guolin Ke committed
77
    bag_data_cnt_ = num_data_;
Guolin Ke's avatar
Guolin Ke committed
78
    bag_data_indices_.clear();
Guolin Ke's avatar
Guolin Ke committed
79
80
81
82
83
84
85
  }
  // initialize random generator
  random_ = Random(gbdt_config_->bagging_seed);

}

void GBDT::AddDataset(const Dataset* valid_data,
Guolin Ke's avatar
Guolin Ke committed
86
  const std::vector<const Metric*>& valid_metrics) {
87
88
89
  if (iter_ > 0) {
    Log::Fatal("Cannot add validation data after training started");
  }
Guolin Ke's avatar
Guolin Ke committed
90
  // for a validation dataset, we need its score and metric
Guolin Ke's avatar
Guolin Ke committed
91
92
  auto new_score_updater = std::unique_ptr<ScoreUpdater>(new ScoreUpdater(valid_data, num_class_));
  valid_score_updater_.push_back(std::move(new_score_updater));
Guolin Ke's avatar
Guolin Ke committed
93
  valid_metrics_.emplace_back();
94
95
96
97
  if (early_stopping_round_ > 0) {
    best_iter_.emplace_back();
    best_score_.emplace_back();
  }
Guolin Ke's avatar
Guolin Ke committed
98
99
  for (const auto& metric : valid_metrics) {
    valid_metrics_.back().push_back(metric);
100
101
102
103
    if (early_stopping_round_ > 0) {
      best_iter_.back().push_back(0);
      best_score_.back().push_back(kMinScore);
    }
Guolin Ke's avatar
Guolin Ke committed
104
  }
Guolin Ke's avatar
Guolin Ke committed
105
  valid_metrics_.back().shrink_to_fit();
Guolin Ke's avatar
Guolin Ke committed
106
107
108
}


109
void GBDT::Bagging(int iter, const int curr_class) {
Guolin Ke's avatar
Guolin Ke committed
110
  // if need bagging
Guolin Ke's avatar
Guolin Ke committed
111
  if (out_of_bag_data_indices_.size() > 0 && iter % gbdt_config_->bagging_freq == 0) {
Guolin Ke's avatar
Guolin Ke committed
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
    // if doesn't have query data
    if (train_data_->metadata().query_boundaries() == nullptr) {
      bag_data_cnt_ =
        static_cast<data_size_t>(gbdt_config_->bagging_fraction * num_data_);
      out_of_bag_data_cnt_ = num_data_ - bag_data_cnt_;
      data_size_t cur_left_cnt = 0;
      data_size_t cur_right_cnt = 0;
      // random bagging, minimal unit is one record
      for (data_size_t i = 0; i < num_data_; ++i) {
        double prob =
          (bag_data_cnt_ - cur_left_cnt) / static_cast<double>(num_data_ - i);
        if (random_.NextDouble() < prob) {
          bag_data_indices_[cur_left_cnt++] = i;
        } else {
          out_of_bag_data_indices_[cur_right_cnt++] = i;
        }
      }
    } else {
      // if have query data
      const data_size_t* query_boundaries = train_data_->metadata().query_boundaries();
      data_size_t num_query = train_data_->metadata().num_queries();
      data_size_t bag_query_cnt =
          static_cast<data_size_t>(num_query * gbdt_config_->bagging_fraction);
      data_size_t cur_left_query_cnt = 0;
      data_size_t cur_left_cnt = 0;
      data_size_t cur_right_cnt = 0;
      // random bagging, minimal unit is one query
      for (data_size_t i = 0; i < num_query; ++i) {
        double prob =
            (bag_query_cnt - cur_left_query_cnt) / static_cast<double>(num_query - i);
        if (random_.NextDouble() < prob) {
          for (data_size_t j = query_boundaries[i]; j < query_boundaries[i + 1]; ++j) {
            bag_data_indices_[cur_left_cnt++] = j;
          }
          cur_left_query_cnt++;
        } else {
          for (data_size_t j = query_boundaries[i]; j < query_boundaries[i + 1]; ++j) {
            out_of_bag_data_indices_[cur_right_cnt++] = j;
          }
        }
      }
      bag_data_cnt_ = cur_left_cnt;
      out_of_bag_data_cnt_ = num_data_ - bag_data_cnt_;
    }
Guolin Ke's avatar
Guolin Ke committed
156
    Log::Debug("Re-bagging, using %d data to train", bag_data_cnt_);
Guolin Ke's avatar
Guolin Ke committed
157
    // set bagging data to tree learner
Guolin Ke's avatar
Guolin Ke committed
158
    tree_learner_[curr_class]->SetBaggingData(bag_data_indices_.data(), bag_data_cnt_);
Guolin Ke's avatar
Guolin Ke committed
159
160
161
  }
}

162
void GBDT::UpdateScoreOutOfBag(const Tree* tree, const int curr_class) {
Hui Xue's avatar
Hui Xue committed
163
  // we need to predict out-of-bag socres of data for boosting
Guolin Ke's avatar
Guolin Ke committed
164
165
  if (out_of_bag_data_indices_.size() > 0) {
    train_score_updater_->AddScore(tree, out_of_bag_data_indices_.data(), out_of_bag_data_cnt_, curr_class);
Guolin Ke's avatar
Guolin Ke committed
166
167
168
  }
}

169
bool GBDT::TrainOneIter(const score_t* gradient, const score_t* hessian, bool is_eval) {
Guolin Ke's avatar
Guolin Ke committed
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
  // boosting first
  if (gradient == nullptr || hessian == nullptr) {
    Boosting();
    gradient = gradients_.data();
    hessian = hessians_.data();
  }

  for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
    // bagging logic
    Bagging(iter_, curr_class);

    // train a new tree
    std::unique_ptr<Tree> new_tree(tree_learner_[curr_class]->Train(gradient + curr_class * num_data_, hessian + curr_class * num_data_));
    // if cannot learn a new tree, then stop
    if (new_tree->num_leaves() <= 1) {
      Log::Info("Stopped training because there are no more leafs that meet the split requirements.");
      return true;
187
    }
188

Guolin Ke's avatar
Guolin Ke committed
189
190
191
192
193
    // shrinkage by learning rate
    new_tree->Shrinkage(shrinkage_rate_);
    // update score
    UpdateScore(new_tree.get(), curr_class);
    UpdateScoreOutOfBag(new_tree.get(), curr_class);
194

Guolin Ke's avatar
Guolin Ke committed
195
196
197
198
199
200
201
202
203
    // add model
    models_.push_back(std::move(new_tree));
  }
  ++iter_;
  if (is_eval) {
    return EvalAndCheckEarlyStopping();
  } else {
    return false;
  }
204

Guolin Ke's avatar
Guolin Ke committed
205
}
206

Guolin Ke's avatar
Guolin Ke committed
207
bool GBDT::EvalAndCheckEarlyStopping() {
208
209
  bool is_met_early_stopping = false;
  // print message for metric
Guolin Ke's avatar
Guolin Ke committed
210
  is_met_early_stopping = OutputMetric(iter_);
211
212
213
214
  if (is_met_early_stopping) {
    Log::Info("Early stopping at iteration %d, the best iteration round is %d",
      iter_, iter_ - early_stopping_round_);
    // pop last early_stopping_round_ models
215
    for (int i = 0; i < early_stopping_round_ * num_class_; ++i) {
216
217
218
219
      models_.pop_back();
    }
  }
  return is_met_early_stopping;
Guolin Ke's avatar
Guolin Ke committed
220
221
}

222
void GBDT::UpdateScore(const Tree* tree, const int curr_class) {
Guolin Ke's avatar
Guolin Ke committed
223
  // update training score
Guolin Ke's avatar
Guolin Ke committed
224
  train_score_updater_->AddScore(tree_learner_[curr_class].get(), curr_class);
Guolin Ke's avatar
Guolin Ke committed
225
  // update validation score
Guolin Ke's avatar
Guolin Ke committed
226
227
  for (auto& score_updater : valid_score_updater_) {
    score_updater->AddScore(tree, curr_class);
Guolin Ke's avatar
Guolin Ke committed
228
229
230
  }
}

wxchan's avatar
wxchan committed
231
232
bool GBDT::OutputMetric(int iter) {
  bool ret = false;
Guolin Ke's avatar
Guolin Ke committed
233
  // print training metric
234
235
236
237
  if ((iter % gbdt_config_->output_freq) == 0) {
    for (auto& sub_metric : training_metrics_) {
      auto name = sub_metric->GetName();
      auto scores = sub_metric->Eval(train_score_updater_->score());
Guolin Ke's avatar
Guolin Ke committed
238
      for (size_t k = 0; k < name.size(); ++k) {
239
240
        Log::Info("Iteration: %d, %s : %f", iter, name[k].c_str(), scores[k]);
      }
241
    }
Guolin Ke's avatar
Guolin Ke committed
242
243
  }
  // print validation metric
244
245
246
247
248
249
  if ((iter % gbdt_config_->output_freq) == 0 || early_stopping_round_ > 0) {
    for (size_t i = 0; i < valid_metrics_.size(); ++i) {
      for (size_t j = 0; j < valid_metrics_[i].size(); ++j) {
        auto test_scores = valid_metrics_[i][j]->Eval(valid_score_updater_[i]->score());
        if ((iter % gbdt_config_->output_freq) == 0) {
          auto name = valid_metrics_[i][j]->GetName();
Guolin Ke's avatar
Guolin Ke committed
250
          for (size_t k = 0; k < name.size(); ++k) {
251
252
            Log::Info("Iteration: %d, %s : %f", iter, name[k].c_str(), test_scores[k]);
          }
wxchan's avatar
wxchan committed
253
        }
254
        if (!ret && early_stopping_round_ > 0) {
255
256
257
          auto cur_score = valid_metrics_[i][j]->factor_to_bigger_better() * test_scores.back();
          if (cur_score > best_score_[i][j]) {
            best_score_[i][j] = cur_score;
258
259
            best_iter_[i][j] = iter;
          } else {
260
            if (iter - best_iter_[i][j] >= early_stopping_round_) { ret = true; }
261
          }
wxchan's avatar
wxchan committed
262
263
        }
      }
Guolin Ke's avatar
Guolin Ke committed
264
265
    }
  }
wxchan's avatar
wxchan committed
266
  return ret;
Guolin Ke's avatar
Guolin Ke committed
267
268
}

269
/*! \brief Get eval result */
270
271
272
273
std::vector<double> GBDT::GetEvalAt(int data_idx) const {
  CHECK(data_idx >= 0 && data_idx <= static_cast<int>(valid_metrics_.size()));
  std::vector<double> ret;
  if (data_idx == 0) {
274
275
    for (auto& sub_metric : training_metrics_) {
      auto scores = sub_metric->Eval(train_score_updater_->score());
276
277
278
      for (auto score : scores) {
        ret.push_back(score);
      }
279
280
    }
  }
281
282
283
284
285
286
287
  else {
    auto used_idx = data_idx - 1;
    for (size_t j = 0; j < valid_metrics_[used_idx].size(); ++j) {
      auto test_scores = valid_metrics_[used_idx][j]->Eval(valid_score_updater_[used_idx]->score());
      for (auto score : test_scores) {
        ret.push_back(score);
      }
288
289
290
291
292
    }
  }
  return ret;
}

Guolin Ke's avatar
Guolin Ke committed
293
/*! \brief Get training scores result */
294
const score_t* GBDT::GetTrainingScore(data_size_t* out_len) {
Guolin Ke's avatar
Guolin Ke committed
295
296
  *out_len = train_score_updater_->num_data() * num_class_;
  return train_score_updater_->score();
297
298
}

Guolin Ke's avatar
Guolin Ke committed
299
void GBDT::GetPredictAt(int data_idx, score_t* out_result, data_size_t* out_len) const {
Guolin Ke's avatar
Guolin Ke committed
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
  CHECK(data_idx >= 0 && data_idx <= static_cast<int>(valid_metrics_.size()));
  std::vector<double> ret;

  const score_t* raw_scores = nullptr;
  data_size_t num_data = 0;
  if (data_idx == 0) {
    raw_scores = train_score_updater_->score();
    num_data = train_score_updater_->num_data();
  } else {
    auto used_idx = data_idx - 1;
    raw_scores = valid_score_updater_[used_idx]->score();
    num_data = valid_score_updater_[used_idx]->num_data();
  }
  *out_len = num_data * num_class_;

  if (num_class_ > 1) {
#pragma omp parallel for schedule(guided)
    for (data_size_t i = 0; i < num_data; ++i) {
      std::vector<double> tmp_result;
      for (int j = 0; j < num_class_; ++j) {
        tmp_result.push_back(raw_scores[j * num_data + i]);
      }
      Common::Softmax(&tmp_result);
      for (int j = 0; j < num_class_; ++j) {
        out_result[j * num_data + i] = static_cast<score_t>(tmp_result[i]);
      }
    }
Guolin Ke's avatar
Guolin Ke committed
327
  } else if(sigmoid_ > 0.0f){
Guolin Ke's avatar
Guolin Ke committed
328
329
330
331
332
333
334
335
336
337
338
339
340
#pragma omp parallel for schedule(guided)
    for (data_size_t i = 0; i < num_data; ++i) {
      out_result[i] = static_cast<score_t>(1.0f / (1.0f + std::exp(-2.0f * sigmoid_ * raw_scores[i])));
    }
  } else {
#pragma omp parallel for schedule(guided)
    for (data_size_t i = 0; i < num_data; ++i) {
      out_result[i] = raw_scores[i];
    }
  }

}

Guolin Ke's avatar
Guolin Ke committed
341
void GBDT::Boosting() {
342
343
344
  if (object_function_ == nullptr) {
    Log::Fatal("No object function provided");
  }
Hui Xue's avatar
Hui Xue committed
345
  // objective function will calculate gradients and hessians
Guolin Ke's avatar
Guolin Ke committed
346
  int num_score = 0;
Guolin Ke's avatar
Guolin Ke committed
347
  object_function_->
Guolin Ke's avatar
Guolin Ke committed
348
    GetGradients(GetTrainingScore(&num_score), gradients_.data(), hessians_.data());
Guolin Ke's avatar
Guolin Ke committed
349
350
}

Guolin Ke's avatar
Guolin Ke committed
351
void GBDT::SaveModelToFile(int num_used_model, bool is_finish, const char* filename) {
352
  // first time to this function, open file
Guolin Ke's avatar
Guolin Ke committed
353
  if (saved_model_size_ < 0) {
354
355
    model_output_file_.open(filename);
    // output model type
356
    model_output_file_ << Name() << std::endl;
357
358
    // output number of class
    model_output_file_ << "num_class=" << num_class_ << std::endl;
359
360
361
362
    // output label index
    model_output_file_ << "label_index=" << label_idx_ << std::endl;
    // output max_feature_idx
    model_output_file_ << "max_feature_idx=" << max_feature_idx_ << std::endl;
Guolin Ke's avatar
Guolin Ke committed
363
364
365
366
    // output objective name
    if (object_function_ != nullptr) {
      model_output_file_ << "objective=" << object_function_->GetName() << std::endl;
    }
367
    // output sigmoid parameter
Guolin Ke's avatar
Guolin Ke committed
368
    model_output_file_ << "sigmoid=" << sigmoid_ << std::endl;
369
370
371
372
373
374
375
    model_output_file_ << std::endl;
    saved_model_size_ = 0;
  }
  // already saved
  if (!model_output_file_.is_open()) {
    return;
  }
Guolin Ke's avatar
Guolin Ke committed
376
  if (num_used_model == NO_LIMIT) {
Guolin Ke's avatar
Guolin Ke committed
377
378
379
380
381
    num_used_model = static_cast<int>(models_.size());
  } else {
    num_used_model = num_used_model * num_class_;
  }
  int rest = num_used_model - early_stopping_round_ * num_class_;
382
383
384
385
386
  // output tree models
  for (int i = saved_model_size_; i < rest; ++i) {
    model_output_file_ << "Tree=" << i << std::endl;
    model_output_file_ << models_[i]->ToString() << std::endl;
  }
387

Guolin Ke's avatar
Guolin Ke committed
388
  saved_model_size_ = std::max(saved_model_size_, rest);
389

390
391
392
  model_output_file_.flush();
  // training finished, can close file
  if (is_finish) {
Guolin Ke's avatar
Guolin Ke committed
393
    for (int i = saved_model_size_; i < num_used_model; ++i) {
394
395
396
397
398
      model_output_file_ << "Tree=" << i << std::endl;
      model_output_file_ << models_[i]->ToString() << std::endl;
    }
    model_output_file_ << std::endl << FeatureImportance() << std::endl;
    model_output_file_.close();
Guolin Ke's avatar
Guolin Ke committed
399
400
401
  }
}

Guolin Ke's avatar
Guolin Ke committed
402
void GBDT::LoadModelFromString(const std::string& model_str) {
Guolin Ke's avatar
Guolin Ke committed
403
404
405
  // use serialized string to restore this object
  models_.clear();
  std::vector<std::string> lines = Common::Split(model_str.c_str(), '\n');
406
407

  // get number of classes
408
409
410
411
  auto line = Common::FindFromLines(lines, "num_class=");
  if (line.size() > 0) {
    Common::Atoi(Common::Split(line.c_str(), '=')[1].c_str(), &num_class_);
  } else {
412
    Log::Fatal("Model file doesn't specify the number of classes");
413
414
    return;
  }
Guolin Ke's avatar
Guolin Ke committed
415
  // get index of label
416
417
418
419
  line = Common::FindFromLines(lines, "label_index=");
  if (line.size() > 0) {
    Common::Atoi(Common::Split(line.c_str(), '=')[1].c_str(), &label_idx_);
  } else {
420
    Log::Fatal("Model file doesn't specify the label index");
Guolin Ke's avatar
Guolin Ke committed
421
422
    return;
  }
Guolin Ke's avatar
Guolin Ke committed
423
  // get max_feature_idx first
424
425
426
427
  line = Common::FindFromLines(lines, "max_feature_idx=");
  if (line.size() > 0) {
    Common::Atoi(Common::Split(line.c_str(), '=')[1].c_str(), &max_feature_idx_);
  } else {
428
    Log::Fatal("Model file doesn't specify max_feature_idx");
Guolin Ke's avatar
Guolin Ke committed
429
430
431
    return;
  }
  // get sigmoid parameter
432
433
434
435
  line = Common::FindFromLines(lines, "sigmoid=");
  if (line.size() > 0) {
    Common::Atof(Common::Split(line.c_str(), '=')[1].c_str(), &sigmoid_);
  } else {
436
    sigmoid_ = -1.0f;
Guolin Ke's avatar
Guolin Ke committed
437
438
  }
  // get tree models
439
  size_t i = 0;
Guolin Ke's avatar
Guolin Ke committed
440
441
442
443
444
445
446
  while (i < lines.size()) {
    size_t find_pos = lines[i].find("Tree=");
    if (find_pos != std::string::npos) {
      ++i;
      int start = static_cast<int>(i);
      while (i < lines.size() && lines[i].find("Tree=") == std::string::npos) { ++i; }
      int end = static_cast<int>(i);
447
      std::string tree_str = Common::Join<std::string>(lines, start, end, '\n');
Guolin Ke's avatar
Guolin Ke committed
448
449
      auto new_tree = std::unique_ptr<Tree>(new Tree(tree_str));
      models_.push_back(std::move(new_tree));
Guolin Ke's avatar
Guolin Ke committed
450
451
452
453
    } else {
      ++i;
    }
  }
454
  Log::Info("Finished loading %d models", models_.size());
455
  num_used_model_ = static_cast<int>(models_.size()) / num_class_;
Guolin Ke's avatar
Guolin Ke committed
456
457
}

458
std::string GBDT::FeatureImportance() const {
459
  std::vector<size_t> feature_importances(max_feature_idx_ + 1, 0);
460
    for (size_t iter = 0; iter < models_.size(); ++iter) {
461
462
        for (int split_idx = 0; split_idx < models_[iter]->num_leaves() - 1; ++split_idx) {
            ++feature_importances[models_[iter]->split_feature_real(split_idx)];
wxchan's avatar
wxchan committed
463
464
        }
    }
465
466
467
    // store the importance first
    std::vector<std::pair<size_t, std::string>> pairs;
    for (size_t i = 0; i < feature_importances.size(); ++i) {
Guolin Ke's avatar
Guolin Ke committed
468
469
470
      if (feature_importances[i] > 0) {
        pairs.emplace_back(feature_importances[i], train_data_->feature_names()[i]);
      }
471
472
473
474
475
    }
    // sort the importance
    std::sort(pairs.begin(), pairs.end(),
      [](const std::pair<size_t, std::string>& lhs,
        const std::pair<size_t, std::string>& rhs) {
476
      return lhs.first > rhs.first;
477
    });
478
    std::stringstream str_buf;
479
    // write to model file
480
    str_buf << std::endl << "feature importances:" << std::endl;
481
    for (size_t i = 0; i < pairs.size(); ++i) {
482
      str_buf << pairs[i].second << "=" << std::to_string(pairs[i].first) << std::endl;
483
    }
484
    return str_buf.str();
wxchan's avatar
wxchan committed
485
486
}

487
488
std::vector<double> GBDT::PredictRaw(const double* value) const {
  std::vector<double> ret(num_class_, 0.0f);
489
  for (int i = 0; i < num_used_model_; ++i) {
490
491
492
    for (int j = 0; j < num_class_; ++j) {
      ret[j] += models_[i * num_class_ + j]->Predict(value);
    }
Guolin Ke's avatar
Guolin Ke committed
493
494
495
496
  }
  return ret;
}

497
std::vector<double> GBDT::Predict(const double* value) const {
498
  std::vector<double> ret(num_class_, 0.0f);
499
  for (int i = 0; i < num_used_model_; ++i) {
500
501
    for (int j = 0; j < num_class_; ++j) {
      ret[j] += models_[i * num_class_ + j]->Predict(value);
502
503
    }
  }
504
505
506
507
508
509
  // if need sigmoid transform
  if (sigmoid_ > 0 && num_class_ == 1) {
    ret[0] = 1.0f / (1.0f + std::exp(- 2.0f * sigmoid_ * ret[0]));
  } else if (num_class_ > 1) {
    Common::Softmax(&ret);
  }
510
511
512
  return ret;
}

513
std::vector<int> GBDT::PredictLeafIndex(const double* value) const {
wxchan's avatar
wxchan committed
514
  std::vector<int> ret;
515
  for (int i = 0; i < num_used_model_; ++i) {
516
517
518
    for (int j = 0; j < num_class_; ++j) {
      ret.push_back(models_[i * num_class_ + j]->PredictLeafIndex(value));
    }
wxchan's avatar
wxchan committed
519
520
521
522
  }
  return ret;
}

Guolin Ke's avatar
Guolin Ke committed
523
}  // namespace LightGBM