"git@developer.sourcefind.cn:tianlh/lightgbm-dcu.git" did not exist on "cbc56c74e4437f44615cda5cce6028e03c62a9e0"
gbdt.cpp 26.2 KB
Newer Older
Guolin Ke's avatar
Guolin Ke committed
1
2
#include "gbdt.h"

3
#include <LightGBM/utils/openmp_wrapper.h>
4

Guolin Ke's avatar
Guolin Ke committed
5
6
7
8
9
10
11
12
13
14
15
16
#include <LightGBM/utils/common.h>

#include <LightGBM/feature.h>
#include <LightGBM/objective_function.h>
#include <LightGBM/metric.h>

#include <ctime>

#include <sstream>
#include <chrono>
#include <string>
#include <vector>
17
#include <utility>
Guolin Ke's avatar
Guolin Ke committed
18
19
20

namespace LightGBM {

21
GBDT::GBDT()
22
  :iter_(0),
23
24
25
26
27
  train_data_(nullptr),
  object_function_(nullptr),
  early_stopping_round_(0),
  max_feature_idx_(0),
  num_class_(1),
28
  sigmoid_(-1.0f),
29
  num_iteration_for_pred_(0),
30
  shrinkage_rate_(0.1f),
wxchan's avatar
wxchan committed
31
  num_init_iteration_(0) {
32
33
34
35
36
#pragma omp parallel
#pragma omp master
    {
      num_threads_ = omp_get_num_threads();
    }
Guolin Ke's avatar
Guolin Ke committed
37
38
39
}

GBDT::~GBDT() {
Guolin Ke's avatar
Guolin Ke committed
40

Guolin Ke's avatar
Guolin Ke committed
41
42
}

43
44
45
void GBDT::Init(const BoostingConfig* config, const Dataset* train_data, const ObjectiveFunction* object_function,
     const std::vector<const Metric*>& training_metrics) {
  iter_ = 0;
wxchan's avatar
wxchan committed
46
  num_iteration_for_pred_ = 0;
47
  max_feature_idx_ = 0;
wxchan's avatar
wxchan committed
48
49
  num_class_ = config->num_class;
  train_data_ = nullptr;
Guolin Ke's avatar
Guolin Ke committed
50
  gbdt_config_ = nullptr;
51
  tree_learner_ = nullptr;
wxchan's avatar
wxchan committed
52
53
54
55
56
  ResetTrainingData(config, train_data, object_function, training_metrics);
}

void GBDT::ResetTrainingData(const BoostingConfig* config, const Dataset* train_data, const ObjectiveFunction* object_function,
  const std::vector<const Metric*>& training_metrics) {
Guolin Ke's avatar
Guolin Ke committed
57
  auto new_config = std::unique_ptr<BoostingConfig>(new BoostingConfig(*config));
wxchan's avatar
wxchan committed
58
59
60
  if (train_data_ != nullptr && !train_data_->CheckAlign(*train_data)) {
    Log::Fatal("cannot reset training data, since new training data has different bin mappers");
  }
Guolin Ke's avatar
Guolin Ke committed
61
62
63
  early_stopping_round_ = new_config->early_stopping_round;
  shrinkage_rate_ = new_config->learning_rate;

Guolin Ke's avatar
Guolin Ke committed
64
  object_function_ = object_function;
Guolin Ke's avatar
Guolin Ke committed
65

Guolin Ke's avatar
Guolin Ke committed
66
  sigmoid_ = -1.0f;
wxchan's avatar
wxchan committed
67
  if (object_function_ != nullptr
Guolin Ke's avatar
Guolin Ke committed
68
69
    && std::string(object_function_->GetName()) == std::string("binary")) {
    // only binary classification need sigmoid transform
Guolin Ke's avatar
Guolin Ke committed
70
    sigmoid_ = new_config->sigmoid;
71
  }
Guolin Ke's avatar
Guolin Ke committed
72

Guolin Ke's avatar
Guolin Ke committed
73
  if (train_data_ != train_data && train_data != nullptr) {
74
75
    if (tree_learner_ == nullptr) {
      tree_learner_ = std::unique_ptr<TreeLearner>(TreeLearner::CreateTreeLearner(new_config->tree_learner_type, &new_config->tree_config));
Guolin Ke's avatar
Guolin Ke committed
76
77
    }
    // init tree learner
78
    tree_learner_->Init(train_data);
Guolin Ke's avatar
Guolin Ke committed
79

Guolin Ke's avatar
Guolin Ke committed
80
81
82
83
84
85
    // push training metrics
    training_metrics_.clear();
    for (const auto& metric : training_metrics) {
      training_metrics_.push_back(metric);
    }
    training_metrics_.shrink_to_fit();
wxchan's avatar
wxchan committed
86
87
88
89
90
91
92
93
94
95
96
97
98
    // not same training data, need reset score and others
    // create score tracker
    train_score_updater_.reset(new ScoreUpdater(train_data, num_class_));
    // update score
    for (int i = 0; i < iter_; ++i) {
      for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
        auto curr_tree = (i + num_init_iteration_) * num_class_ + curr_class;
        train_score_updater_->AddScore(models_[curr_tree].get(), curr_class);
      }
    }
    num_data_ = train_data->num_data();
    // create buffer for gradients and hessians
    if (object_function_ != nullptr) {
99
100
101
      size_t total_size = static_cast<size_t>(num_data_) * num_class_;
      gradients_.resize(total_size);
      hessians_.resize(total_size);
wxchan's avatar
wxchan committed
102
103
104
105
106
    }
    // get max feature index
    max_feature_idx_ = train_data->num_total_features() - 1;
    // get label index
    label_idx_ = train_data->label_idx();
107
108
    // get feature names
    feature_names_ = train_data->feature_names();
109
110
111
112
113
114
115
116
117
118
    // get feature infos
    feature_infos_.clear();
    for (int i = 0; i < max_feature_idx_ + 1; ++i) {
      int feature_idx = train_data->GetInnerFeatureIndex(i);
      if (feature_idx < 0) { 
        feature_infos_.push_back("trival feature"); 
      } else {
        feature_infos_.push_back(train_data->FeatureAt(feature_idx)->bin_mapper()->bin_info());
      }
    }
Guolin Ke's avatar
Guolin Ke committed
119
120
  }

Guolin Ke's avatar
Guolin Ke committed
121
122
  if ((train_data_ != train_data && train_data != nullptr)
    || (gbdt_config_ != nullptr && gbdt_config_->bagging_fraction != new_config->bagging_fraction)) {
wxchan's avatar
wxchan committed
123
    // if need bagging, create buffer
Guolin Ke's avatar
Guolin Ke committed
124
    if (new_config->bagging_fraction < 1.0 && new_config->bagging_freq > 0) {
125
126
      bag_data_cnt_ =
        static_cast<data_size_t>(new_config->bagging_fraction * num_data_);
127
      bag_data_indices_.resize(num_data_);
128
129
130
131
132
133
      tmp_indices_.resize(num_data_);
      offsets_buf_.resize(num_threads_);
      left_cnts_buf_.resize(num_threads_);
      right_cnts_buf_.resize(num_threads_);
      left_write_pos_buf_.resize(num_threads_);
      right_write_pos_buf_.resize(num_threads_);
Guolin Ke's avatar
Guolin Ke committed
134
135
      double average_bag_rate = new_config->bagging_fraction / new_config->bagging_freq;
      is_use_subset_ = false;
136
      if (average_bag_rate <= 0.5) {
Guolin Ke's avatar
Guolin Ke committed
137
        tmp_subset_.reset(new Dataset(bag_data_cnt_));
138
        tmp_subset_->CopyFeatureMapperFrom(train_data);
Guolin Ke's avatar
Guolin Ke committed
139
140
141
        is_use_subset_ = true;
        Log::Debug("use subset for bagging");
      }
wxchan's avatar
wxchan committed
142
143
144
    } else {
      bag_data_cnt_ = num_data_;
      bag_data_indices_.clear();
145
      tmp_indices_.clear();
Guolin Ke's avatar
Guolin Ke committed
146
      is_use_subset_ = false;
wxchan's avatar
wxchan committed
147
    }
Guolin Ke's avatar
Guolin Ke committed
148
  }
wxchan's avatar
wxchan committed
149
  train_data_ = train_data;
Guolin Ke's avatar
Guolin Ke committed
150
151
  if (train_data_ != nullptr) {
    // reset config for tree learner
152
    tree_learner_->ResetConfig(&new_config->tree_config);
Guolin Ke's avatar
Guolin Ke committed
153
  }
Guolin Ke's avatar
Guolin Ke committed
154
  gbdt_config_.reset(new_config.release());
Guolin Ke's avatar
Guolin Ke committed
155
156
}

wxchan's avatar
wxchan committed
157
void GBDT::AddValidDataset(const Dataset* valid_data,
Guolin Ke's avatar
Guolin Ke committed
158
  const std::vector<const Metric*>& valid_metrics) {
wxchan's avatar
wxchan committed
159
160
  if (!train_data_->CheckAlign(*valid_data)) {
    Log::Fatal("cannot add validation data, since it has different bin mappers with training data");
161
  }
Guolin Ke's avatar
Guolin Ke committed
162
  // for a validation dataset, we need its score and metric
Guolin Ke's avatar
Guolin Ke committed
163
  auto new_score_updater = std::unique_ptr<ScoreUpdater>(new ScoreUpdater(valid_data, num_class_));
wxchan's avatar
wxchan committed
164
165
166
167
168
169
170
  // update score
  for (int i = 0; i < iter_; ++i) {
    for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
      auto curr_tree = (i + num_init_iteration_) * num_class_ + curr_class;
      new_score_updater->AddScore(models_[curr_tree].get(), curr_class);
    }
  }
Guolin Ke's avatar
Guolin Ke committed
171
  valid_score_updater_.push_back(std::move(new_score_updater));
Guolin Ke's avatar
Guolin Ke committed
172
  valid_metrics_.emplace_back();
173
174
175
  if (early_stopping_round_ > 0) {
    best_iter_.emplace_back();
    best_score_.emplace_back();
Guolin Ke's avatar
Guolin Ke committed
176
    best_msg_.emplace_back();
177
  }
Guolin Ke's avatar
Guolin Ke committed
178
179
  for (const auto& metric : valid_metrics) {
    valid_metrics_.back().push_back(metric);
180
181
182
    if (early_stopping_round_ > 0) {
      best_iter_.back().push_back(0);
      best_score_.back().push_back(kMinScore);
Guolin Ke's avatar
Guolin Ke committed
183
      best_msg_.back().emplace_back();
184
    }
Guolin Ke's avatar
Guolin Ke committed
185
  }
Guolin Ke's avatar
Guolin Ke committed
186
  valid_metrics_.back().shrink_to_fit();
Guolin Ke's avatar
Guolin Ke committed
187
188
}

Guolin Ke's avatar
Guolin Ke committed
189
data_size_t GBDT::BaggingHelper(Random& cur_rand, data_size_t start, data_size_t cnt, data_size_t* buffer){
190
191
192
  if (cnt <= 0) {
    return 0;
  }
193
194
195
196
  data_size_t bag_data_cnt =
    static_cast<data_size_t>(gbdt_config_->bagging_fraction * cnt);
  data_size_t cur_left_cnt = 0;
  data_size_t cur_right_cnt = 0;
Guolin Ke's avatar
Guolin Ke committed
197
  auto right_buffer = buffer + bag_data_cnt;
198
199
  // random bagging, minimal unit is one record
  for (data_size_t i = 0; i < cnt; ++i) {
Guolin Ke's avatar
Guolin Ke committed
200
201
202
    float prob =
      (bag_data_cnt - cur_left_cnt) / static_cast<float>(cnt - i);
    if (cur_rand.NextFloat() < prob) {
203
204
      buffer[cur_left_cnt++] = start + i;
    } else {
Guolin Ke's avatar
Guolin Ke committed
205
      right_buffer[cur_right_cnt++] = start + i;
206
207
208
209
210
    }
  }
  CHECK(cur_left_cnt == bag_data_cnt);
  return cur_left_cnt;
}
Guolin Ke's avatar
Guolin Ke committed
211

Guolin Ke's avatar
Guolin Ke committed
212
213


214
void GBDT::Bagging(int iter) {
Guolin Ke's avatar
Guolin Ke committed
215
  // if need bagging
216
  if (bag_data_cnt_ < num_data_ && iter % gbdt_config_->bagging_freq == 0) {
Guolin Ke's avatar
Guolin Ke committed
217
    const data_size_t min_inner_size = 1000;
218
219
220
    data_size_t inner_size = (num_data_ + num_threads_ - 1) / num_threads_;
    if (inner_size < min_inner_size) { inner_size = min_inner_size; }

Guolin Ke's avatar
Guolin Ke committed
221
#pragma omp parallel for schedule(static,1)
222
223
224
225
226
227
228
    for (int i = 0; i < num_threads_; ++i) {
      left_cnts_buf_[i] = 0;
      right_cnts_buf_[i] = 0;
      data_size_t cur_start = i * inner_size;
      if (cur_start > num_data_) { continue; }
      data_size_t cur_cnt = inner_size;
      if (cur_start + cur_cnt > num_data_) { cur_cnt = num_data_ - cur_start; }
Guolin Ke's avatar
Guolin Ke committed
229
230
      Random cur_rand(gbdt_config_->bagging_seed + iter * num_threads_ + i);
      data_size_t cur_left_count = BaggingHelper(cur_rand, cur_start, cur_cnt, tmp_indices_.data() + cur_start);
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
      offsets_buf_[i] = cur_start;
      left_cnts_buf_[i] = cur_left_count;
      right_cnts_buf_[i] = cur_cnt - cur_left_count;
    }
    data_size_t left_cnt = 0;
    left_write_pos_buf_[0] = 0;
    right_write_pos_buf_[0] = 0;
    for (int i = 1; i < num_threads_; ++i) {
      left_write_pos_buf_[i] = left_write_pos_buf_[i - 1] + left_cnts_buf_[i - 1];
      right_write_pos_buf_[i] = right_write_pos_buf_[i - 1] + right_cnts_buf_[i - 1];
    }
    left_cnt = left_write_pos_buf_[num_threads_ - 1] + left_cnts_buf_[num_threads_ - 1];

#pragma omp parallel for schedule(static, 1)
    for (int i = 0; i < num_threads_; ++i) {
      if (left_cnts_buf_[i] > 0) {
        std::memcpy(bag_data_indices_.data() + left_write_pos_buf_[i],
          tmp_indices_.data() + offsets_buf_[i], left_cnts_buf_[i] * sizeof(data_size_t));
Guolin Ke's avatar
Guolin Ke committed
249
      }
250
251
252
      if (right_cnts_buf_[i] > 0) {
        std::memcpy(bag_data_indices_.data() + left_cnt + right_write_pos_buf_[i],
          tmp_indices_.data() + offsets_buf_[i] + left_cnts_buf_[i], right_cnts_buf_[i] * sizeof(data_size_t));
Guolin Ke's avatar
Guolin Ke committed
253
254
      }
    }
Guolin Ke's avatar
Guolin Ke committed
255
256
    bag_data_cnt_ = left_cnt;
    CHECK(bag_data_indices_[bag_data_cnt_ - 1] > bag_data_indices_[bag_data_cnt_]);
Guolin Ke's avatar
Guolin Ke committed
257
    Log::Debug("Re-bagging, using %d data to train", bag_data_cnt_);
Guolin Ke's avatar
Guolin Ke committed
258
    // set bagging data to tree learner
Guolin Ke's avatar
Guolin Ke committed
259
260
261
262
    if (!is_use_subset_) {
      tree_learner_->SetBaggingData(bag_data_indices_.data(), bag_data_cnt_);
    } else {
      // get subset
Guolin Ke's avatar
Guolin Ke committed
263
264
      tmp_subset_->ReSize(bag_data_cnt_);
      tmp_subset_->CopySubset(train_data_, bag_data_indices_.data(), bag_data_cnt_, false);
Guolin Ke's avatar
Guolin Ke committed
265
266
      tree_learner_->ResetTrainingData(tmp_subset_.get());
    }
Guolin Ke's avatar
Guolin Ke committed
267
268
269
  }
}

270
void GBDT::UpdateScoreOutOfBag(const Tree* tree, const int curr_class) {
Hui Xue's avatar
Hui Xue committed
271
  // we need to predict out-of-bag socres of data for boosting
Guolin Ke's avatar
Guolin Ke committed
272
  if (num_data_ - bag_data_cnt_ > 0 && !is_use_subset_) {
273
    train_score_updater_->AddScore(tree, bag_data_indices_.data() + bag_data_cnt_, num_data_ - bag_data_cnt_, curr_class);
Guolin Ke's avatar
Guolin Ke committed
274
275
276
  }
}

277
bool GBDT::TrainOneIter(const score_t* gradient, const score_t* hessian, bool is_eval) {
Guolin Ke's avatar
Guolin Ke committed
278
279
280
281
282
283
  // boosting first
  if (gradient == nullptr || hessian == nullptr) {
    Boosting();
    gradient = gradients_.data();
    hessian = hessians_.data();
  }
284
285
  // bagging logic
  Bagging(iter_);
Guolin Ke's avatar
Guolin Ke committed
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
  if (is_use_subset_ && bag_data_cnt_ < num_data_) {
    if (gradients_.empty()) {
      size_t total_size = static_cast<size_t>(num_data_) * num_class_;
      gradients_.resize(total_size);
      hessians_.resize(total_size);
    }    
    // get sub gradients
    for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
      auto bias = curr_class * num_data_;
      for (int i = 0; i < bag_data_cnt_; ++i) {
        gradients_[bias + i] = gradient[bias + bag_data_indices_[i]];
        hessians_[bias + i] = hessian[bias + bag_data_indices_[i]];
      }
    }
    gradient = gradients_.data();
    hessian = hessians_.data();
  }
Guolin Ke's avatar
Guolin Ke committed
303
304
  for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
    // train a new tree
305
    std::unique_ptr<Tree> new_tree(tree_learner_->Train(gradient + curr_class * num_data_, hessian + curr_class * num_data_));
Guolin Ke's avatar
Guolin Ke committed
306
307
308
309
    // if cannot learn a new tree, then stop
    if (new_tree->num_leaves() <= 1) {
      Log::Info("Stopped training because there are no more leafs that meet the split requirements.");
      return true;
310
    }
311

Guolin Ke's avatar
Guolin Ke committed
312
313
314
315
316
    // shrinkage by learning rate
    new_tree->Shrinkage(shrinkage_rate_);
    // update score
    UpdateScore(new_tree.get(), curr_class);
    UpdateScoreOutOfBag(new_tree.get(), curr_class);
317

Guolin Ke's avatar
Guolin Ke committed
318
319
320
321
322
323
324
325
326
    // add model
    models_.push_back(std::move(new_tree));
  }
  ++iter_;
  if (is_eval) {
    return EvalAndCheckEarlyStopping();
  } else {
    return false;
  }
327

Guolin Ke's avatar
Guolin Ke committed
328
}
329

wxchan's avatar
wxchan committed
330
void GBDT::RollbackOneIter() {
331
  if (iter_ <= 0) { return; }
wxchan's avatar
wxchan committed
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
  int cur_iter = iter_ + num_init_iteration_ - 1;
  // reset score
  for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
    auto curr_tree = cur_iter * num_class_ + curr_class;
    models_[curr_tree]->Shrinkage(-1.0);
    train_score_updater_->AddScore(models_[curr_tree].get(), curr_class);
    for (auto& score_updater : valid_score_updater_) {
      score_updater->AddScore(models_[curr_tree].get(), curr_class);
    }
  }
  // remove model
  for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
    models_.pop_back();
  }
  --iter_;
}

Guolin Ke's avatar
Guolin Ke committed
349
bool GBDT::EvalAndCheckEarlyStopping() {
350
351
  bool is_met_early_stopping = false;
  // print message for metric
Guolin Ke's avatar
Guolin Ke committed
352
353
  auto best_msg = OutputMetric(iter_);
  is_met_early_stopping = !best_msg.empty();
354
355
356
  if (is_met_early_stopping) {
    Log::Info("Early stopping at iteration %d, the best iteration round is %d",
      iter_, iter_ - early_stopping_round_);
Guolin Ke's avatar
Guolin Ke committed
357
    Log::Info("Output of best iteration round:\n%s", best_msg.c_str());
358
    // pop last early_stopping_round_ models
359
    for (int i = 0; i < early_stopping_round_ * num_class_; ++i) {
360
361
362
363
      models_.pop_back();
    }
  }
  return is_met_early_stopping;
Guolin Ke's avatar
Guolin Ke committed
364
365
}

366
void GBDT::UpdateScore(const Tree* tree, const int curr_class) {
Guolin Ke's avatar
Guolin Ke committed
367
  // update training score
Guolin Ke's avatar
Guolin Ke committed
368
369
370
371
372
  if (!is_use_subset_) {
    train_score_updater_->AddScore(tree_learner_.get(), curr_class);
  } else {
    train_score_updater_->AddScore(tree, curr_class);
  }
Guolin Ke's avatar
Guolin Ke committed
373
  // update validation score
Guolin Ke's avatar
Guolin Ke committed
374
375
  for (auto& score_updater : valid_score_updater_) {
    score_updater->AddScore(tree, curr_class);
Guolin Ke's avatar
Guolin Ke committed
376
377
378
  }
}

Guolin Ke's avatar
Guolin Ke committed
379
380
381
382
std::string GBDT::OutputMetric(int iter) {
  bool need_output = (iter % gbdt_config_->output_freq) == 0;
  std::string ret = "";
  std::stringstream msg_buf;
383
  std::vector<std::pair<size_t, size_t>> meet_early_stopping_pairs;
Guolin Ke's avatar
Guolin Ke committed
384
  // print training metric
Guolin Ke's avatar
Guolin Ke committed
385
  if (need_output) {
386
387
388
    for (auto& sub_metric : training_metrics_) {
      auto name = sub_metric->GetName();
      auto scores = sub_metric->Eval(train_score_updater_->score());
Guolin Ke's avatar
Guolin Ke committed
389
      for (size_t k = 0; k < name.size(); ++k) {
Guolin Ke's avatar
Guolin Ke committed
390
391
392
393
394
395
396
397
        std::stringstream tmp_buf;
        tmp_buf << "Iteration:" << iter
          << ", training " << name[k]
          << " : " << scores[k];
        Log::Info(tmp_buf.str().c_str());
        if (early_stopping_round_ > 0) {
          msg_buf << tmp_buf.str() << std::endl;
        }
398
      }
399
    }
Guolin Ke's avatar
Guolin Ke committed
400
401
  }
  // print validation metric
Guolin Ke's avatar
Guolin Ke committed
402
  if (need_output || early_stopping_round_ > 0) {
403
404
405
    for (size_t i = 0; i < valid_metrics_.size(); ++i) {
      for (size_t j = 0; j < valid_metrics_[i].size(); ++j) {
        auto test_scores = valid_metrics_[i][j]->Eval(valid_score_updater_[i]->score());
Guolin Ke's avatar
Guolin Ke committed
406
407
408
409
410
411
412
413
414
415
416
        auto name = valid_metrics_[i][j]->GetName();
        for (size_t k = 0; k < name.size(); ++k) {
          std::stringstream tmp_buf;
          tmp_buf << "Iteration:" << iter
            << ", valid_" << i + 1 << " " << name[k]
            << " : " << test_scores[k];
          if (need_output) {
            Log::Info(tmp_buf.str().c_str());
          }
          if (early_stopping_round_ > 0) {
            msg_buf << tmp_buf.str() << std::endl;
417
          }
wxchan's avatar
wxchan committed
418
        }
Guolin Ke's avatar
Guolin Ke committed
419
        if (ret.empty() && early_stopping_round_ > 0) {
420
421
422
          auto cur_score = valid_metrics_[i][j]->factor_to_bigger_better() * test_scores.back();
          if (cur_score > best_score_[i][j]) {
            best_score_[i][j] = cur_score;
423
            best_iter_[i][j] = iter;
Guolin Ke's avatar
Guolin Ke committed
424
            meet_early_stopping_pairs.emplace_back(i, j);
425
          } else {
Guolin Ke's avatar
Guolin Ke committed
426
            if (iter - best_iter_[i][j] >= early_stopping_round_) { ret = best_msg_[i][j]; }
427
          }
wxchan's avatar
wxchan committed
428
429
        }
      }
Guolin Ke's avatar
Guolin Ke committed
430
431
    }
  }
Guolin Ke's avatar
Guolin Ke committed
432
433
434
  for (auto& pair : meet_early_stopping_pairs) {
    best_msg_[pair.first][pair.second] = msg_buf.str();
  }
wxchan's avatar
wxchan committed
435
  return ret;
Guolin Ke's avatar
Guolin Ke committed
436
437
}

438
/*! \brief Get eval result */
439
std::vector<double> GBDT::GetEvalAt(int data_idx) const {
Guolin Ke's avatar
Guolin Ke committed
440
  CHECK(data_idx >= 0 && data_idx <= static_cast<int>(valid_score_updater_.size()));
441
442
  std::vector<double> ret;
  if (data_idx == 0) {
443
444
    for (auto& sub_metric : training_metrics_) {
      auto scores = sub_metric->Eval(train_score_updater_->score());
445
446
447
      for (auto score : scores) {
        ret.push_back(score);
      }
448
449
    }
  }
450
451
452
453
454
455
456
  else {
    auto used_idx = data_idx - 1;
    for (size_t j = 0; j < valid_metrics_[used_idx].size(); ++j) {
      auto test_scores = valid_metrics_[used_idx][j]->Eval(valid_score_updater_[used_idx]->score());
      for (auto score : test_scores) {
        ret.push_back(score);
      }
457
458
459
460
461
    }
  }
  return ret;
}

Guolin Ke's avatar
Guolin Ke committed
462
/*! \brief Get training scores result */
463
const double* GBDT::GetTrainingScore(int64_t* out_len) {
464
  *out_len = static_cast<int64_t>(train_score_updater_->num_data()) * num_class_;
Guolin Ke's avatar
Guolin Ke committed
465
  return train_score_updater_->score();
466
467
}

Guolin Ke's avatar
Guolin Ke committed
468
469
void GBDT::GetPredictAt(int data_idx, double* out_result, int64_t* out_len) {
  CHECK(data_idx >= 0 && data_idx <= static_cast<int>(valid_score_updater_.size()));
Guolin Ke's avatar
Guolin Ke committed
470

471
  const double* raw_scores = nullptr;
Guolin Ke's avatar
Guolin Ke committed
472
473
  data_size_t num_data = 0;
  if (data_idx == 0) {
wxchan's avatar
wxchan committed
474
    raw_scores = GetTrainingScore(out_len);
Guolin Ke's avatar
Guolin Ke committed
475
476
477
478
479
    num_data = train_score_updater_->num_data();
  } else {
    auto used_idx = data_idx - 1;
    raw_scores = valid_score_updater_[used_idx]->score();
    num_data = valid_score_updater_[used_idx]->num_data();
480
    *out_len = static_cast<int64_t>(num_data) * num_class_;
Guolin Ke's avatar
Guolin Ke committed
481
482
  }
  if (num_class_ > 1) {
wxchan's avatar
wxchan committed
483
#pragma omp parallel for schedule(static)
Guolin Ke's avatar
Guolin Ke committed
484
    for (data_size_t i = 0; i < num_data; ++i) {
485
      std::vector<double> tmp_result(num_class_);
Guolin Ke's avatar
Guolin Ke committed
486
      for (int j = 0; j < num_class_; ++j) {
487
        tmp_result[j] = raw_scores[j * num_data + i];
Guolin Ke's avatar
Guolin Ke committed
488
489
490
      }
      Common::Softmax(&tmp_result);
      for (int j = 0; j < num_class_; ++j) {
Guolin Ke's avatar
Guolin Ke committed
491
        out_result[j * num_data + i] = static_cast<double>(tmp_result[j]);
Guolin Ke's avatar
Guolin Ke committed
492
493
      }
    }
Guolin Ke's avatar
Guolin Ke committed
494
  } else if(sigmoid_ > 0.0f){
wxchan's avatar
wxchan committed
495
#pragma omp parallel for schedule(static)
Guolin Ke's avatar
Guolin Ke committed
496
    for (data_size_t i = 0; i < num_data; ++i) {
497
      out_result[i] = static_cast<double>(1.0f / (1.0f + std::exp(- sigmoid_ * raw_scores[i])));
Guolin Ke's avatar
Guolin Ke committed
498
499
    }
  } else {
wxchan's avatar
wxchan committed
500
#pragma omp parallel for schedule(static)
Guolin Ke's avatar
Guolin Ke committed
501
    for (data_size_t i = 0; i < num_data; ++i) {
Guolin Ke's avatar
Guolin Ke committed
502
      out_result[i] = static_cast<double>(raw_scores[i]);
Guolin Ke's avatar
Guolin Ke committed
503
504
505
506
507
    }
  }

}

Guolin Ke's avatar
Guolin Ke committed
508
void GBDT::Boosting() {
509
510
511
  if (object_function_ == nullptr) {
    Log::Fatal("No object function provided");
  }
Hui Xue's avatar
Hui Xue committed
512
  // objective function will calculate gradients and hessians
513
  int64_t num_score = 0;
Guolin Ke's avatar
Guolin Ke committed
514
  object_function_->
Guolin Ke's avatar
Guolin Ke committed
515
    GetGradients(GetTrainingScore(&num_score), gradients_.data(), hessians_.data());
Guolin Ke's avatar
Guolin Ke committed
516
517
}

518
std::string GBDT::DumpModel(int num_iteration) const {
Guolin Ke's avatar
Guolin Ke committed
519
  std::stringstream str_buf;
wxchan's avatar
wxchan committed
520

Guolin Ke's avatar
Guolin Ke committed
521
  str_buf << "{";
Guolin Ke's avatar
Guolin Ke committed
522
  str_buf << "\"name\":\"" << SubModelName() << "\"," << std::endl;
Guolin Ke's avatar
Guolin Ke committed
523
524
525
526
  str_buf << "\"num_class\":" << num_class_ << "," << std::endl;
  str_buf << "\"label_index\":" << label_idx_ << "," << std::endl;
  str_buf << "\"max_feature_idx\":" << max_feature_idx_ << "," << std::endl;
  str_buf << "\"sigmoid\":" << sigmoid_ << "," << std::endl;
wxchan's avatar
wxchan committed
527

Guolin Ke's avatar
Guolin Ke committed
528
  str_buf << "\"feature_names\":[\"" 
529
     << Common::Join(feature_names_, "\",\"") << "\"]," 
Guolin Ke's avatar
Guolin Ke committed
530
531
     << std::endl;

Guolin Ke's avatar
Guolin Ke committed
532
  str_buf << "\"tree_info\":[";
533
534
535
536
537
  int num_used_model = static_cast<int>(models_.size());
  if (num_iteration > 0) {
    num_used_model = std::min(num_iteration * num_class_, num_used_model);
  } 
  for (int i = 0; i < num_used_model; ++i) {
wxchan's avatar
wxchan committed
538
    if (i > 0) {
Guolin Ke's avatar
Guolin Ke committed
539
      str_buf << ",";
wxchan's avatar
wxchan committed
540
    }
Guolin Ke's avatar
Guolin Ke committed
541
542
543
544
    str_buf << "{";
    str_buf << "\"tree_index\":" << i << ",";
    str_buf << models_[i]->ToJSON();
    str_buf << "}";
wxchan's avatar
wxchan committed
545
  }
Guolin Ke's avatar
Guolin Ke committed
546
  str_buf << "]" << std::endl;
wxchan's avatar
wxchan committed
547

Guolin Ke's avatar
Guolin Ke committed
548
  str_buf << "}" << std::endl;
wxchan's avatar
wxchan committed
549

Guolin Ke's avatar
Guolin Ke committed
550
  return str_buf.str();
wxchan's avatar
wxchan committed
551
552
}

553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
std::string GBDT::SaveModelToString(int num_iterations) const {
    std::stringstream ss;

    // output model type
    ss << SubModelName() << std::endl;
    // output number of class
    ss << "num_class=" << num_class_ << std::endl;
    // output label index
    ss << "label_index=" << label_idx_ << std::endl;
    // output max_feature_idx
    ss << "max_feature_idx=" << max_feature_idx_ << std::endl;
    // output objective name
    if (object_function_ != nullptr) {
      ss << "objective=" << object_function_->GetName() << std::endl;
    }
    // output sigmoid parameter
    ss << "sigmoid=" << sigmoid_ << std::endl;

    ss << "feature_names=" << Common::Join(feature_names_, " ") << std::endl;

    ss << std::endl;
    int num_used_model = static_cast<int>(models_.size());
    if (num_iterations > 0) {
      num_used_model = std::min(num_iterations * num_class_, num_used_model);
    }
    // output tree models
    for (int i = 0; i < num_used_model; ++i) {
      ss << "Tree=" << i << std::endl;
      ss << models_[i]->ToString() << std::endl;
    }

    std::vector<std::pair<size_t, std::string>> pairs = FeatureImportance();
    ss << std::endl << "feature importances:" << std::endl;
    for (size_t i = 0; i < pairs.size(); ++i) {
      ss << pairs[i].second << "=" << std::to_string(pairs[i].first) << std::endl;
    }

    ss << std::endl << "feature information:" << std::endl;
Tsukasa OMOTO's avatar
Tsukasa OMOTO committed
591
    for (int i = 0; i < max_feature_idx_ + 1; ++i) {
592
593
594
595
596
597
      ss << feature_names_[i] << "=" << feature_infos_[i] << std::endl;
    }

    return ss.str();
}

598
bool GBDT::SaveModelToFile(int num_iteration, const char* filename) const {
wxchan's avatar
wxchan committed
599
600
601
  /*! \brief File to write models */
  std::ofstream output_file;
  output_file.open(filename);
602

603
  output_file << SaveModelToString(num_iteration);
604

wxchan's avatar
wxchan committed
605
  output_file.close();
606
607

  return (bool)output_file;
Guolin Ke's avatar
Guolin Ke committed
608
609
}

610
bool GBDT::LoadModelFromString(const std::string& model_str) {
Guolin Ke's avatar
Guolin Ke committed
611
612
613
  // use serialized string to restore this object
  models_.clear();
  std::vector<std::string> lines = Common::Split(model_str.c_str(), '\n');
614
615

  // get number of classes
616
617
618
619
  auto line = Common::FindFromLines(lines, "num_class=");
  if (line.size() > 0) {
    Common::Atoi(Common::Split(line.c_str(), '=')[1].c_str(), &num_class_);
  } else {
620
    Log::Fatal("Model file doesn't specify the number of classes");
621
    return false;
622
  }
Guolin Ke's avatar
Guolin Ke committed
623
  // get index of label
624
625
626
627
  line = Common::FindFromLines(lines, "label_index=");
  if (line.size() > 0) {
    Common::Atoi(Common::Split(line.c_str(), '=')[1].c_str(), &label_idx_);
  } else {
628
    Log::Fatal("Model file doesn't specify the label index");
629
    return false;
Guolin Ke's avatar
Guolin Ke committed
630
  }
Guolin Ke's avatar
Guolin Ke committed
631
  // get max_feature_idx first
632
633
634
635
  line = Common::FindFromLines(lines, "max_feature_idx=");
  if (line.size() > 0) {
    Common::Atoi(Common::Split(line.c_str(), '=')[1].c_str(), &max_feature_idx_);
  } else {
636
    Log::Fatal("Model file doesn't specify max_feature_idx");
637
    return false;
Guolin Ke's avatar
Guolin Ke committed
638
639
  }
  // get sigmoid parameter
640
641
642
643
  line = Common::FindFromLines(lines, "sigmoid=");
  if (line.size() > 0) {
    Common::Atof(Common::Split(line.c_str(), '=')[1].c_str(), &sigmoid_);
  } else {
644
    sigmoid_ = -1.0f;
Guolin Ke's avatar
Guolin Ke committed
645
  }
Guolin Ke's avatar
Guolin Ke committed
646
647
648
  // get feature names
  line = Common::FindFromLines(lines, "feature_names=");
  if (line.size() > 0) {
Guolin Ke's avatar
Guolin Ke committed
649
    feature_names_ = Common::Split(line.substr(std::strlen("feature_names=")).c_str(), " ");
Guolin Ke's avatar
Guolin Ke committed
650
651
    if (feature_names_.size() != static_cast<size_t>(max_feature_idx_ + 1)) {
      Log::Fatal("Wrong size of feature_names");
652
      return false;
Guolin Ke's avatar
Guolin Ke committed
653
654
655
    }
  } else {
    Log::Fatal("Model file doesn't contain feature names");
656
    return false;
Guolin Ke's avatar
Guolin Ke committed
657
658
  }

659
  // returns offset, or lines.size() if not found.
Guolin Ke's avatar
Guolin Ke committed
660
  auto find_string_lineno = [&lines](const std::string &str, size_t start_line)
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
  {
    size_t i = start_line;
    size_t featinfo_find_pos = std::string::npos;
    while (i < lines.size()) {
      featinfo_find_pos = lines[i].find(str);
      if (featinfo_find_pos != std::string::npos)
        break;
      ++i;
    }

    return i;
  };

  // load feature information
  {
Guolin Ke's avatar
Guolin Ke committed
676
    size_t finfo_line_idx = find_string_lineno("feature information:", 0);
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698

    if (finfo_line_idx >= lines.size()) {
      Log::Fatal("Model file doesn't contain feature information");
      return false;
    }

    feature_infos_.resize(max_feature_idx_ + 1);

    // search for each feature name
    for (int i=0; i < max_feature_idx_ + 1; i++) {
      const auto feat_name = feature_names_[i];
      size_t line_idx = find_string_lineno(feat_name + "=", finfo_line_idx + 1);
      if (line_idx >= lines.size()) {
        Log::Fatal(("Model file doesn't contain feature information for feature " + feat_name).c_str());
        return false;
      }

      const auto this_line = lines[line_idx];
      feature_infos_[i] = this_line.substr((feat_name + "=").size());
    }
  }

Guolin Ke's avatar
Guolin Ke committed
699
  // get tree models
700
  size_t i = 0;
Guolin Ke's avatar
Guolin Ke committed
701
702
703
704
705
706
707
  while (i < lines.size()) {
    size_t find_pos = lines[i].find("Tree=");
    if (find_pos != std::string::npos) {
      ++i;
      int start = static_cast<int>(i);
      while (i < lines.size() && lines[i].find("Tree=") == std::string::npos) { ++i; }
      int end = static_cast<int>(i);
Guolin Ke's avatar
Guolin Ke committed
708
      std::string tree_str = Common::Join<std::string>(lines, start, end, "\n");
Guolin Ke's avatar
Guolin Ke committed
709
710
      auto new_tree = std::unique_ptr<Tree>(new Tree(tree_str));
      models_.push_back(std::move(new_tree));
Guolin Ke's avatar
Guolin Ke committed
711
712
713
714
    } else {
      ++i;
    }
  }
715
  Log::Info("Finished loading %d models", models_.size());
wxchan's avatar
wxchan committed
716
717
  num_iteration_for_pred_ = static_cast<int>(models_.size()) / num_class_;
  num_init_iteration_ = num_iteration_for_pred_;
718
  iter_ = 0;
719
720

  return true;
Guolin Ke's avatar
Guolin Ke committed
721
722
}

wxchan's avatar
wxchan committed
723
std::vector<std::pair<size_t, std::string>> GBDT::FeatureImportance() const {
724

725
  std::vector<size_t> feature_importances(max_feature_idx_ + 1, 0);
726
    for (size_t iter = 0; iter < models_.size(); ++iter) {
727
728
        for (int split_idx = 0; split_idx < models_[iter]->num_leaves() - 1; ++split_idx) {
            ++feature_importances[models_[iter]->split_feature_real(split_idx)];
wxchan's avatar
wxchan committed
729
730
        }
    }
731
732
733
    // store the importance first
    std::vector<std::pair<size_t, std::string>> pairs;
    for (size_t i = 0; i < feature_importances.size(); ++i) {
Guolin Ke's avatar
Guolin Ke committed
734
      if (feature_importances[i] > 0) {
735
        pairs.emplace_back(feature_importances[i], feature_names_[i]);
Guolin Ke's avatar
Guolin Ke committed
736
      }
737
738
739
740
741
    }
    // sort the importance
    std::sort(pairs.begin(), pairs.end(),
      [](const std::pair<size_t, std::string>& lhs,
        const std::pair<size_t, std::string>& rhs) {
742
      return lhs.first > rhs.first;
743
    });
wxchan's avatar
wxchan committed
744
    return pairs;
wxchan's avatar
wxchan committed
745
746
}

747
748
std::vector<double> GBDT::PredictRaw(const double* value) const {
  std::vector<double> ret(num_class_, 0.0f);
wxchan's avatar
wxchan committed
749
  for (int i = 0; i < num_iteration_for_pred_; ++i) {
750
751
752
    for (int j = 0; j < num_class_; ++j) {
      ret[j] += models_[i * num_class_ + j]->Predict(value);
    }
Guolin Ke's avatar
Guolin Ke committed
753
754
755
756
  }
  return ret;
}

757
std::vector<double> GBDT::Predict(const double* value) const {
758
  std::vector<double> ret(num_class_, 0.0f);
wxchan's avatar
wxchan committed
759
  for (int i = 0; i < num_iteration_for_pred_; ++i) {
760
761
    for (int j = 0; j < num_class_; ++j) {
      ret[j] += models_[i * num_class_ + j]->Predict(value);
762
763
    }
  }
764
765
  // if need sigmoid transform
  if (sigmoid_ > 0 && num_class_ == 1) {
766
    ret[0] = 1.0f / (1.0f + std::exp(-sigmoid_ * ret[0]));
767
768
769
  } else if (num_class_ > 1) {
    Common::Softmax(&ret);
  }
770
771
772
  return ret;
}

773
std::vector<int> GBDT::PredictLeafIndex(const double* value) const {
wxchan's avatar
wxchan committed
774
  std::vector<int> ret;
wxchan's avatar
wxchan committed
775
  for (int i = 0; i < num_iteration_for_pred_; ++i) {
776
777
778
    for (int j = 0; j < num_class_; ++j) {
      ret.push_back(models_[i * num_class_ + j]->PredictLeafIndex(value));
    }
wxchan's avatar
wxchan committed
779
780
781
782
  }
  return ret;
}

Guolin Ke's avatar
Guolin Ke committed
783
}  // namespace LightGBM