gbdt_model_text.cpp 24.3 KB
Newer Older
1
2
3
4
/*!
 * Copyright (c) 2017 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */
5
#include <LightGBM/config.h>
Guolin Ke's avatar
Guolin Ke committed
6
#include <LightGBM/metric.h>
7
#include <LightGBM/objective_function.h>
8
#include <LightGBM/utils/array_args.h>
9
#include <LightGBM/utils/common.h>
Guolin Ke's avatar
Guolin Ke committed
10

11
#include <algorithm>
12
13
#include <string>
#include <sstream>
14
15
#include <unordered_map>
#include <utility>
16
17
#include <vector>

18
19
#include "gbdt.h"

Guolin Ke's avatar
Guolin Ke committed
20
21
namespace LightGBM {

22
// Version tag of the model format; emitted as the "version" field by
// GBDT::DumpModel and as the "version=" line by GBDT::SaveModelToString.
const char* kModelVersion = "v4";
23

24
std::string GBDT::DumpModel(int start_iteration, int num_iteration, int feature_importance_type) const {
Guolin Ke's avatar
Guolin Ke committed
25
  std::stringstream str_buf;
26
  Common::C_stringstream(str_buf);
Guolin Ke's avatar
Guolin Ke committed
27
28

  str_buf << "{";
29
30
31
32
33
34
  str_buf << "\"name\":\"" << SubModelName() << "\"," << '\n';
  str_buf << "\"version\":\"" << kModelVersion << "\"," << '\n';
  str_buf << "\"num_class\":" << num_class_ << "," << '\n';
  str_buf << "\"num_tree_per_iteration\":" << num_tree_per_iteration_ << "," << '\n';
  str_buf << "\"label_index\":" << label_idx_ << "," << '\n';
  str_buf << "\"max_feature_idx\":" << max_feature_idx_ << "," << '\n';
35
36
37
  if (objective_function_ != nullptr) {
    str_buf << "\"objective\":\"" << objective_function_->ToString() << "\",\n";
  }
Guolin Ke's avatar
Guolin Ke committed
38

39
40
  str_buf << "\"average_output\":" << (average_output_ ? "true" : "false") << ",\n";

41
  str_buf << "\"feature_names\":[\"" << CommonC::Join(feature_names_, "\",\"")
42
43
44
          << "\"]," << '\n';

  str_buf << "\"monotone_constraints\":["
45
          << CommonC::Join(monotone_constraints_, ",") << "]," << '\n';
Guolin Ke's avatar
Guolin Ke committed
46

47
48
49
50
  str_buf << "\"feature_infos\":" << "{";
  bool first_obj = true;
  for (size_t i = 0; i < feature_infos_.size(); ++i) {
    std::stringstream json_str_buf;
51
    Common::C_stringstream(json_str_buf);
52
53
54
55
    auto strs = Common::Split(feature_infos_[i].c_str(), ":");
    if (strs[0][0] == '[') {
      strs[0].erase(0, 1);  // remove '['
      strs[1].erase(strs[1].size() - 1);  // remove ']'
56
57
58
59
60
61
      double max_, min_;
      Common::Atof(strs[0].c_str(), &min_);
      Common::Atof(strs[1].c_str(), &max_);
      json_str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2);
      json_str_buf << "{\"min_value\":" << Common::AvoidInf(min_) << ",";
      json_str_buf << "\"max_value\":" << Common::AvoidInf(max_) << ",";
62
63
      json_str_buf << "\"values\":[]}";
    } else if (strs[0] != "none") {  // categorical feature
64
      auto vals = CommonC::StringToArray<int>(feature_infos_[i], ':');
65
66
67
68
      auto max_idx = ArrayArgs<int>::ArgMax(vals);
      auto min_idx = ArrayArgs<int>::ArgMin(vals);
      json_str_buf << "{\"min_value\":" << vals[min_idx] << ",";
      json_str_buf << "\"max_value\":" << vals[max_idx] << ",";
69
      json_str_buf << "\"values\":[" << CommonC::Join(vals, ",") << "]}";
70
71
72
73
74
75
76
77
78
79
80
81
    } else {  // unused feature
      continue;
    }
    if (!first_obj) {
      str_buf << ",";
    }
    str_buf << "\"" << feature_names_[i] << "\":";
    str_buf << json_str_buf.str();
    first_obj = false;
  }
  str_buf << "}," << '\n';

Guolin Ke's avatar
Guolin Ke committed
82
83
  str_buf << "\"tree_info\":[";
  int num_used_model = static_cast<int>(models_.size());
84
85
86
  int total_iteration = num_used_model / num_tree_per_iteration_;
  start_iteration = std::max(start_iteration, 0);
  start_iteration = std::min(start_iteration, total_iteration);
Guolin Ke's avatar
Guolin Ke committed
87
  if (num_iteration > 0) {
88
89
    int end_iteration = start_iteration + num_iteration;
    num_used_model = std::min(end_iteration * num_tree_per_iteration_ , num_used_model);
Guolin Ke's avatar
Guolin Ke committed
90
  }
91
92
93
  int start_model = start_iteration * num_tree_per_iteration_;
  for (int i = start_model; i < num_used_model; ++i) {
    if (i > start_model) {
Guolin Ke's avatar
Guolin Ke committed
94
95
96
97
98
99
100
      str_buf << ",";
    }
    str_buf << "{";
    str_buf << "\"tree_index\":" << i << ",";
    str_buf << models_[i]->ToJSON();
    str_buf << "}";
  }
101
102
  str_buf << "]," << '\n';

103
104
  std::vector<double> feature_importances = FeatureImportance(
      num_iteration, feature_importance_type);
105
106
107
108
109
110
111
112
113
  // store the importance first
  std::vector<std::pair<size_t, std::string>> pairs;
  for (size_t i = 0; i < feature_importances.size(); ++i) {
    size_t feature_importances_int = static_cast<size_t>(feature_importances[i]);
    if (feature_importances_int > 0) {
      pairs.emplace_back(feature_importances_int, feature_names_[i]);
    }
  }
  str_buf << '\n' << "\"feature_importances\":" << "{";
114
115
  for (size_t i = 0; i < pairs.size(); ++i) {
    if (i > 0) {
116
117
      str_buf << ",";
    }
118
    str_buf << "\"" << pairs[i].second << "\":" << std::to_string(pairs[i].first);
119
120
  }
  str_buf << "}" << '\n';
Guolin Ke's avatar
Guolin Ke committed
121

122
  str_buf << "}" << '\n';
Guolin Ke's avatar
Guolin Ke committed
123
124
125
126
127
128

  return str_buf.str();
}

std::string GBDT::ModelToIfElse(int num_iteration) const {
  std::stringstream str_buf;
129
  Common::C_stringstream(str_buf);
Guolin Ke's avatar
Guolin Ke committed
130

131
132
133
134
135
136
137
138
139
140
141
142
  str_buf << "#include \"gbdt.h\"" << '\n';
  str_buf << "#include <LightGBM/utils/common.h>" << '\n';
  str_buf << "#include <LightGBM/objective_function.h>" << '\n';
  str_buf << "#include <LightGBM/metric.h>" << '\n';
  str_buf << "#include <LightGBM/prediction_early_stop.h>" << '\n';
  str_buf << "#include <ctime>" << '\n';
  str_buf << "#include <sstream>" << '\n';
  str_buf << "#include <chrono>" << '\n';
  str_buf << "#include <string>" << '\n';
  str_buf << "#include <vector>" << '\n';
  str_buf << "#include <utility>" << '\n';
  str_buf << "namespace LightGBM {" << '\n';
Guolin Ke's avatar
Guolin Ke committed
143
144
145
146
147
148
149
150

  int num_used_model = static_cast<int>(models_.size());
  if (num_iteration > 0) {
    num_used_model = std::min(num_iteration * num_tree_per_iteration_, num_used_model);
  }

  // PredictRaw
  for (int i = 0; i < num_used_model; ++i) {
151
    str_buf << models_[i]->ToIfElse(i, false) << '\n';
Guolin Ke's avatar
Guolin Ke committed
152
153
154
155
156
157
158
159
160
  }

  str_buf << "double (*PredictTreePtr[])(const double*) = { ";
  for (int i = 0; i < num_used_model; ++i) {
    if (i > 0) {
      str_buf << " , ";
    }
    str_buf << "PredictTree" << i;
  }
161
  str_buf << " };" << '\n' << '\n';
Guolin Ke's avatar
Guolin Ke committed
162
163

  std::stringstream pred_str_buf;
164
  Common::C_stringstream(pred_str_buf);
Guolin Ke's avatar
Guolin Ke committed
165

166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
  pred_str_buf << "\t" << "int early_stop_round_counter = 0;" << '\n';
  pred_str_buf << "\t" << "std::memset(output, 0, sizeof(double) * num_tree_per_iteration_);" << '\n';
  pred_str_buf << "\t" << "for (int i = 0; i < num_iteration_for_pred_; ++i) {" << '\n';
  pred_str_buf << "\t\t" << "for (int k = 0; k < num_tree_per_iteration_; ++k) {" << '\n';
  pred_str_buf << "\t\t\t" << "output[k] += (*PredictTreePtr[i * num_tree_per_iteration_ + k])(features);" << '\n';
  pred_str_buf << "\t\t" << "}" << '\n';
  pred_str_buf << "\t\t" << "++early_stop_round_counter;" << '\n';
  pred_str_buf << "\t\t" << "if (early_stop->round_period == early_stop_round_counter) {" << '\n';
  pred_str_buf << "\t\t\t" << "if (early_stop->callback_function(output, num_tree_per_iteration_))" << '\n';
  pred_str_buf << "\t\t\t\t" << "return;" << '\n';
  pred_str_buf << "\t\t\t" << "early_stop_round_counter = 0;" << '\n';
  pred_str_buf << "\t\t" << "}" << '\n';
  pred_str_buf << "\t" << "}" << '\n';

  str_buf << "void GBDT::PredictRaw(const double* features, double *output, const PredictionEarlyStopInstance* early_stop) const {" << '\n';
Guolin Ke's avatar
Guolin Ke committed
181
  str_buf << pred_str_buf.str();
182
183
  str_buf << "}" << '\n';
  str_buf << '\n';
Guolin Ke's avatar
Guolin Ke committed
184

185
186
187
  // PredictRawByMap
  str_buf << "double (*PredictTreeByMapPtr[])(const std::unordered_map<int, double>&) = { ";
  for (int i = 0; i < num_used_model; ++i) {
Guolin Ke's avatar
Guolin Ke committed
188
189
190
191
    if (i > 0) {
      str_buf << " , ";
    }
    str_buf << "PredictTree" << i << "ByMap";
192
  }
193
  str_buf << " };" << '\n' << '\n';
194
195

  std::stringstream pred_str_buf_map;
196
  Common::C_stringstream(pred_str_buf_map);
197

198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
  pred_str_buf_map << "\t" << "int early_stop_round_counter = 0;" << '\n';
  pred_str_buf_map << "\t" << "std::memset(output, 0, sizeof(double) * num_tree_per_iteration_);" << '\n';
  pred_str_buf_map << "\t" << "for (int i = 0; i < num_iteration_for_pred_; ++i) {" << '\n';
  pred_str_buf_map << "\t\t" << "for (int k = 0; k < num_tree_per_iteration_; ++k) {" << '\n';
  pred_str_buf_map << "\t\t\t" << "output[k] += (*PredictTreeByMapPtr[i * num_tree_per_iteration_ + k])(features);" << '\n';
  pred_str_buf_map << "\t\t" << "}" << '\n';
  pred_str_buf_map << "\t\t" << "++early_stop_round_counter;" << '\n';
  pred_str_buf_map << "\t\t" << "if (early_stop->round_period == early_stop_round_counter) {" << '\n';
  pred_str_buf_map << "\t\t\t" << "if (early_stop->callback_function(output, num_tree_per_iteration_))" << '\n';
  pred_str_buf_map << "\t\t\t\t" << "return;" << '\n';
  pred_str_buf_map << "\t\t\t" << "early_stop_round_counter = 0;" << '\n';
  pred_str_buf_map << "\t\t" << "}" << '\n';
  pred_str_buf_map << "\t" << "}" << '\n';

  str_buf << "void GBDT::PredictRawByMap(const std::unordered_map<int, double>& features, double* output, const PredictionEarlyStopInstance* early_stop) const {" << '\n';
213
  str_buf << pred_str_buf_map.str();
214
215
  str_buf << "}" << '\n';
  str_buf << '\n';
216

Guolin Ke's avatar
Guolin Ke committed
217
  // Predict
218
219
220
221
222
223
224
  str_buf << "void GBDT::Predict(const double* features, double *output, const PredictionEarlyStopInstance* early_stop) const {" << '\n';
  str_buf << "\t" << "PredictRaw(features, output, early_stop);" << '\n';
  str_buf << "\t" << "if (average_output_) {" << '\n';
  str_buf << "\t\t" << "for (int k = 0; k < num_tree_per_iteration_; ++k) {" << '\n';
  str_buf << "\t\t\t" << "output[k] /= num_iteration_for_pred_;" << '\n';
  str_buf << "\t\t" << "}" << '\n';
  str_buf << "\t" << "}" << '\n';
Guolin Ke's avatar
Guolin Ke committed
225
  str_buf << "\t" << "if (objective_function_ != nullptr) {" << '\n';
226
227
228
229
  str_buf << "\t\t" << "objective_function_->ConvertOutput(output, output);" << '\n';
  str_buf << "\t" << "}" << '\n';
  str_buf << "}" << '\n';
  str_buf << '\n';
Guolin Ke's avatar
Guolin Ke committed
230

231
  // PredictByMap
232
233
234
235
236
237
238
  str_buf << "void GBDT::PredictByMap(const std::unordered_map<int, double>& features, double* output, const PredictionEarlyStopInstance* early_stop) const {" << '\n';
  str_buf << "\t" << "PredictRawByMap(features, output, early_stop);" << '\n';
  str_buf << "\t" << "if (average_output_) {" << '\n';
  str_buf << "\t\t" << "for (int k = 0; k < num_tree_per_iteration_; ++k) {" << '\n';
  str_buf << "\t\t\t" << "output[k] /= num_iteration_for_pred_;" << '\n';
  str_buf << "\t\t" << "}" << '\n';
  str_buf << "\t" << "}" << '\n';
Guolin Ke's avatar
Guolin Ke committed
239
  str_buf << "\t" << "if (objective_function_ != nullptr) {" << '\n';
240
241
242
243
  str_buf << "\t\t" << "objective_function_->ConvertOutput(output, output);" << '\n';
  str_buf << "\t" << "}" << '\n';
  str_buf << "}" << '\n';
  str_buf << '\n';
244
245


Guolin Ke's avatar
Guolin Ke committed
246
247
  // PredictLeafIndex
  for (int i = 0; i < num_used_model; ++i) {
248
    str_buf << models_[i]->ToIfElse(i, true) << '\n';
Guolin Ke's avatar
Guolin Ke committed
249
250
251
252
253
254
255
256
257
  }

  str_buf << "double (*PredictTreeLeafPtr[])(const double*) = { ";
  for (int i = 0; i < num_used_model; ++i) {
    if (i > 0) {
      str_buf << " , ";
    }
    str_buf << "PredictTree" << i << "Leaf";
  }
258
  str_buf << " };" << '\n' << '\n';
Guolin Ke's avatar
Guolin Ke committed
259

260
261
262
263
264
265
  str_buf << "void GBDT::PredictLeafIndex(const double* features, double *output) const {" << '\n';
  str_buf << "\t" << "int total_tree = num_iteration_for_pred_ * num_tree_per_iteration_;" << '\n';
  str_buf << "\t" << "for (int i = 0; i < total_tree; ++i) {" << '\n';
  str_buf << "\t\t" << "output[i] = (*PredictTreeLeafPtr[i])(features);" << '\n';
  str_buf << "\t" << "}" << '\n';
  str_buf << "}" << '\n';
Guolin Ke's avatar
Guolin Ke committed
266

267
  // PredictLeafIndexByMap
268
269
  str_buf << "double (*PredictTreeLeafByMapPtr[])(const std::unordered_map<int, double>&) = { ";
  for (int i = 0; i < num_used_model; ++i) {
Guolin Ke's avatar
Guolin Ke committed
270
271
272
273
    if (i > 0) {
      str_buf << " , ";
    }
    str_buf << "PredictTree" << i << "LeafByMap";
274
  }
275
  str_buf << " };" << '\n' << '\n';
276

277
278
279
280
281
282
  str_buf << "void GBDT::PredictLeafIndexByMap(const std::unordered_map<int, double>& features, double* output) const {" << '\n';
  str_buf << "\t" << "int total_tree = num_iteration_for_pred_ * num_tree_per_iteration_;" << '\n';
  str_buf << "\t" << "for (int i = 0; i < total_tree; ++i) {" << '\n';
  str_buf << "\t\t" << "output[i] = (*PredictTreeLeafByMapPtr[i])(features);" << '\n';
  str_buf << "\t" << "}" << '\n';
  str_buf << "}" << '\n';
283

284
  str_buf << "}  // namespace LightGBM" << '\n';
Guolin Ke's avatar
Guolin Ke committed
285
286
287
288
289
290
291
292
293
294
295
296

  return str_buf.str();
}

/*!
 * \brief Write the if/else C++ form of the model to a file.
 *
 * When the file can already be read, its previous content is kept inside an
 * "#ifndef USE_HARD_CODE" section and the generated code goes in the "#else"
 * section; otherwise only the generated code is written.
 *
 * \param num_iteration Forwarded to ModelToIfElse()
 * \param filename Path of the file to (re)write
 * \return State of the output stream after it has been closed
 */
bool GBDT::SaveModelToIfElse(int num_iteration, const char* filename) const {
  std::ifstream existing(filename);
  const bool wrap_existing = existing.good();

  // Capture the current content before the file is reopened for writing.
  std::string previous_content;
  if (wrap_existing) {
    previous_content.assign(std::istreambuf_iterator<char>(existing),
                            std::istreambuf_iterator<char>());
  }

  /*! \brief File to write models */
  std::ofstream output_file;
  output_file.open(filename);
  if (wrap_existing) {
    output_file << "#define USE_HARD_CODE 0" << '\n';
    output_file << "#ifndef USE_HARD_CODE" << '\n';
    output_file << previous_content << '\n';
    output_file << "#else" << '\n';
  }
  output_file << ModelToIfElse(num_iteration);
  if (wrap_existing) {
    output_file << "#endif" << '\n';
  }

  existing.close();
  output_file.close();

  return static_cast<bool>(output_file);
}

314
std::string GBDT::SaveModelToString(int start_iteration, int num_iteration, int feature_importance_type) const {
Guolin Ke's avatar
Guolin Ke committed
315
  std::stringstream ss;
316
  Common::C_stringstream(ss);
Guolin Ke's avatar
Guolin Ke committed
317
318

  // output model type
319
320
  ss << SubModelName() << '\n';
  ss << "version=" << kModelVersion << '\n';
Guolin Ke's avatar
Guolin Ke committed
321
  // output number of class
322
323
  ss << "num_class=" << num_class_ << '\n';
  ss << "num_tree_per_iteration=" << num_tree_per_iteration_ << '\n';
Guolin Ke's avatar
Guolin Ke committed
324
  // output label index
325
  ss << "label_index=" << label_idx_ << '\n';
Guolin Ke's avatar
Guolin Ke committed
326
  // output max_feature_idx
327
  ss << "max_feature_idx=" << max_feature_idx_ << '\n';
Guolin Ke's avatar
Guolin Ke committed
328
329
  // output objective
  if (objective_function_ != nullptr) {
330
    ss << "objective=" << objective_function_->ToString() << '\n';
Guolin Ke's avatar
Guolin Ke committed
331
332
333
  }

  if (average_output_) {
334
    ss << "average_output" << '\n';
Guolin Ke's avatar
Guolin Ke committed
335
336
  }

337
  ss << "feature_names=" << CommonC::Join(feature_names_, " ") << '\n';
Guolin Ke's avatar
Guolin Ke committed
338

339
  if (monotone_constraints_.size() != 0) {
340
    ss << "monotone_constraints=" << CommonC::Join(monotone_constraints_, " ")
341
342
343
       << '\n';
  }

344
  ss << "feature_infos=" << CommonC::Join(feature_infos_, " ") << '\n';
Guolin Ke's avatar
Guolin Ke committed
345
346

  int num_used_model = static_cast<int>(models_.size());
347
348
349
  int total_iteration = num_used_model / num_tree_per_iteration_;
  start_iteration = std::max(start_iteration, 0);
  start_iteration = std::min(start_iteration, total_iteration);
Guolin Ke's avatar
Guolin Ke committed
350
  if (num_iteration > 0) {
351
352
    int end_iteration = start_iteration + num_iteration;
    num_used_model = std::min(end_iteration * num_tree_per_iteration_, num_used_model);
Guolin Ke's avatar
Guolin Ke committed
353
  }
354

355
356
357
358
  int start_model = start_iteration * num_tree_per_iteration_;

  std::vector<std::string> tree_strs(num_used_model - start_model);
  std::vector<size_t> tree_sizes(num_used_model - start_model);
Guolin Ke's avatar
Guolin Ke committed
359
  // output tree models
360
  #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
361
362
363
364
365
  for (int i = start_model; i < num_used_model; ++i) {
    const int idx = i - start_model;
    tree_strs[idx] = "Tree=" + std::to_string(idx) + '\n';
    tree_strs[idx] += models_[i]->ToString() + '\n';
    tree_sizes[idx] = tree_strs[idx].size();
366
367
  }

368
  ss << "tree_sizes=" << CommonC::Join(tree_sizes, " ") << '\n';
369
370
  ss << '\n';

371
  for (int i = 0; i < num_used_model - start_model; ++i) {
372
373
    ss << tree_strs[i];
    tree_strs[i].clear();
Guolin Ke's avatar
Guolin Ke committed
374
  }
Guolin Ke's avatar
Guolin Ke committed
375
  ss << "end of trees" << "\n";
376
377
  std::vector<double> feature_importances = FeatureImportance(
      num_iteration, feature_importance_type);
Guolin Ke's avatar
Guolin Ke committed
378
379
380
381
382
383
384
385
386
  // store the importance first
  std::vector<std::pair<size_t, std::string>> pairs;
  for (size_t i = 0; i < feature_importances.size(); ++i) {
    size_t feature_importances_int = static_cast<size_t>(feature_importances[i]);
    if (feature_importances_int > 0) {
      pairs.emplace_back(feature_importances_int, feature_names_[i]);
    }
  }
  // sort the importance
387
388
389
  std::stable_sort(pairs.begin(), pairs.end(),
                   [](const std::pair<size_t, std::string>& lhs,
                      const std::pair<size_t, std::string>& rhs) {
Guolin Ke's avatar
Guolin Ke committed
390
391
    return lhs.first > rhs.first;
  });
392
  ss << '\n' << "feature_importances:" << '\n';
Guolin Ke's avatar
Guolin Ke committed
393
  for (size_t i = 0; i < pairs.size(); ++i) {
394
    ss << pairs[i].second << "=" << std::to_string(pairs[i].first) << '\n';
Guolin Ke's avatar
Guolin Ke committed
395
  }
Guolin Ke's avatar
Guolin Ke committed
396
  if (config_ != nullptr) {
Guolin Ke's avatar
Guolin Ke committed
397
    ss << "\nparameters:" << '\n';
Guolin Ke's avatar
Guolin Ke committed
398
    ss << config_->ToString() << "\n";
Guolin Ke's avatar
Guolin Ke committed
399
400
401
402
403
    ss << "end of parameters" << '\n';
  } else if (!loaded_parameter_.empty()) {
    ss << "\nparameters:" << '\n';
    ss << loaded_parameter_ << "\n";
    ss << "end of parameters" << '\n';
Guolin Ke's avatar
Guolin Ke committed
404
  }
405
406
407
408
409
  if (!parser_config_str_.empty()) {
    ss << "\nparser:" << '\n';
    ss << parser_config_str_ << "\n";
    ss << "end of parser" << '\n';
  }
Nikita Titov's avatar
Nikita Titov committed
410
  return ss.str();
Guolin Ke's avatar
Guolin Ke committed
411
412
}

413
bool GBDT::SaveModelToFile(int start_iteration, int num_iteration, int feature_importance_type, const char* filename) const {
Guolin Ke's avatar
Guolin Ke committed
414
  /*! \brief File to write models */
415
416
417
418
  auto writer = VirtualFileWriter::Make(filename);
  if (!writer->Init()) {
    Log::Fatal("Model file %s is not available for writes", filename);
  }
419
  std::string str_to_write = SaveModelToString(start_iteration, num_iteration, feature_importance_type);
420
421
  auto size = writer->Write(str_to_write.c_str(), str_to_write.size());
  return size > 0;
Guolin Ke's avatar
Guolin Ke committed
422
423
}

424
bool GBDT::LoadModelFromString(const char* buffer, size_t len) {
Guolin Ke's avatar
Guolin Ke committed
425
426
  // use serialized string to restore this object
  models_.clear();
427
428
429
430
431
432
433
  auto c_str = buffer;
  auto p = c_str;
  auto end = p + len;
  std::unordered_map<std::string, std::string> key_vals;
  while (p < end) {
    auto line_len = Common::GetLine(p);
    if (line_len > 0) {
434
      std::string cur_line(p, line_len);
435
436
437
438
      if (!Common::StartsWith(cur_line, "Tree=")) {
        auto strs = Common::Split(cur_line.c_str(), '=');
        if (strs.size() == 1) {
          key_vals[strs[0]] = "";
439
        } else if (strs.size() == 2) {
440
          key_vals[strs[0]] = strs[1];
441
        } else if (strs.size() > 2) {
Guolin Ke's avatar
Guolin Ke committed
442
          if (strs[0] == "feature_names") {
Guolin Ke's avatar
Guolin Ke committed
443
            key_vals[strs[0]] = cur_line.substr(std::strlen("feature_names="));
444
445
          } else if (strs[0] == "monotone_constraints") {
            key_vals[strs[0]] = cur_line.substr(std::strlen("monotone_constraints="));
446
          } else {
Guolin Ke's avatar
Guolin Ke committed
447
448
            // Use first 128 chars to avoid exceed the message buffer.
            Log::Fatal("Wrong line at model file: %s", cur_line.substr(0, std::min<size_t>(128, cur_line.size())).c_str());
449
          }
450
        }
451
      } else {
452
453
454
455
456
457
        break;
      }
    }
    p += line_len;
    p = Common::SkipNewLine(p);
  }
Guolin Ke's avatar
Guolin Ke committed
458
459

  // get number of classes
460
461
  if (key_vals.count("num_class")) {
    Common::Atoi(key_vals["num_class"].c_str(), &num_class_);
Guolin Ke's avatar
Guolin Ke committed
462
463
464
465
466
  } else {
    Log::Fatal("Model file doesn't specify the number of classes");
    return false;
  }

467
468
  if (key_vals.count("num_tree_per_iteration")) {
    Common::Atoi(key_vals["num_tree_per_iteration"].c_str(), &num_tree_per_iteration_);
Guolin Ke's avatar
Guolin Ke committed
469
470
471
472
473
  } else {
    num_tree_per_iteration_ = num_class_;
  }

  // get index of label
474
475
  if (key_vals.count("label_index")) {
    Common::Atoi(key_vals["label_index"].c_str(), &label_idx_);
Guolin Ke's avatar
Guolin Ke committed
476
477
478
479
  } else {
    Log::Fatal("Model file doesn't specify the label index");
    return false;
  }
480

Guolin Ke's avatar
Guolin Ke committed
481
  // get max_feature_idx first
482
483
  if (key_vals.count("max_feature_idx")) {
    Common::Atoi(key_vals["max_feature_idx"].c_str(), &max_feature_idx_);
Guolin Ke's avatar
Guolin Ke committed
484
485
486
487
  } else {
    Log::Fatal("Model file doesn't specify max_feature_idx");
    return false;
  }
488

Guolin Ke's avatar
Guolin Ke committed
489
  // get average_output
490
  if (key_vals.count("average_output")) {
Guolin Ke's avatar
Guolin Ke committed
491
492
    average_output_ = true;
  }
493

Guolin Ke's avatar
Guolin Ke committed
494
  // get feature names
495
496
  if (key_vals.count("feature_names")) {
    feature_names_ = Common::Split(key_vals["feature_names"].c_str(), ' ');
Guolin Ke's avatar
Guolin Ke committed
497
498
499
500
501
    if (feature_names_.size() != static_cast<size_t>(max_feature_idx_ + 1)) {
      Log::Fatal("Wrong size of feature_names");
      return false;
    }
  } else {
502
    Log::Fatal("Model file doesn't contain feature_names");
Guolin Ke's avatar
Guolin Ke committed
503
504
505
    return false;
  }

506
507
  // get monotone_constraints
  if (key_vals.count("monotone_constraints")) {
508
    monotone_constraints_ = CommonC::StringToArray<int8_t>(key_vals["monotone_constraints"].c_str(), ' ');
509
510
511
512
513
514
    if (monotone_constraints_.size() != static_cast<size_t>(max_feature_idx_ + 1)) {
      Log::Fatal("Wrong size of monotone_constraints");
      return false;
    }
  }

515
516
  if (key_vals.count("feature_infos")) {
    feature_infos_ = Common::Split(key_vals["feature_infos"].c_str(), ' ');
Guolin Ke's avatar
Guolin Ke committed
517
518
519
520
521
    if (feature_infos_.size() != static_cast<size_t>(max_feature_idx_ + 1)) {
      Log::Fatal("Wrong size of feature_infos");
      return false;
    }
  } else {
522
    Log::Fatal("Model file doesn't contain feature_infos");
Guolin Ke's avatar
Guolin Ke committed
523
524
525
    return false;
  }

526
527
  if (key_vals.count("objective")) {
    auto str = key_vals["objective"];
528
    loaded_objective_.reset(ObjectiveFunction::CreateObjectiveFunction(ParseObjectiveAlias(str)));
Guolin Ke's avatar
Guolin Ke committed
529
530
    objective_function_ = loaded_objective_.get();
  }
531

532
533
534
535
  if (!key_vals.count("tree_sizes")) {
    while (p < end) {
      auto line_len = Common::GetLine(p);
      if (line_len > 0) {
536
        std::string cur_line(p, line_len);
537
538
539
540
541
542
        if (Common::StartsWith(cur_line, "Tree=")) {
          p += line_len;
          p = Common::SkipNewLine(p);
          size_t used_len = 0;
          models_.emplace_back(new Tree(p, &used_len));
          p += used_len;
543
        } else {
544
545
546
547
548
549
          break;
        }
      }
      p = Common::SkipNewLine(p);
    }
  } else {
550
    std::vector<size_t> tree_sizes = CommonC::StringToArray<size_t>(key_vals["tree_sizes"].c_str(), ' ');
551
    std::vector<size_t> tree_boundaries(tree_sizes.size() + 1, 0);
552
553
    int num_trees = static_cast<int>(tree_sizes.size());
    for (int i = 0; i < num_trees; ++i) {
554
      tree_boundaries[i + 1] = tree_boundaries[i] + tree_sizes[i];
555
      models_.emplace_back(nullptr);
Guolin Ke's avatar
Guolin Ke committed
556
    }
557
    OMP_INIT_EX();
558
    #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
559
560
    for (int i = 0; i < num_trees; ++i) {
      OMP_LOOP_EX_BEGIN();
561
      auto cur_p = p + tree_boundaries[i];
562
563
564
565
566
567
568
569
570
571
572
573
574
      auto line_len = Common::GetLine(cur_p);
      std::string cur_line(cur_p, line_len);
      if (Common::StartsWith(cur_line, "Tree=")) {
        cur_p += line_len;
        cur_p = Common::SkipNewLine(cur_p);
        size_t used_len = 0;
        models_[i].reset(new Tree(cur_p, &used_len));
      } else {
        Log::Fatal("Model format error, expect a tree here. met %s", cur_line.c_str());
      }
      OMP_LOOP_EX_END();
    }
    OMP_THROW_EX();
Guolin Ke's avatar
Guolin Ke committed
575
576
577
578
  }
  num_iteration_for_pred_ = static_cast<int>(models_.size()) / num_tree_per_iteration_;
  num_init_iteration_ = num_iteration_for_pred_;
  iter_ = 0;
579
  bool is_inparameter = false, is_inparser = false;
Guolin Ke's avatar
Guolin Ke committed
580
  std::stringstream ss;
581
  Common::C_stringstream(ss);
Guolin Ke's avatar
Guolin Ke committed
582
583
584
  while (p < end) {
    auto line_len = Common::GetLine(p);
    if (line_len > 0) {
585
      std::string cur_line(p, line_len);
Guolin Ke's avatar
Guolin Ke committed
586
587
588
589
590
591
      if (cur_line == std::string("parameters:")) {
        is_inparameter = true;
      } else if (cur_line == std::string("end of parameters")) {
        break;
      } else if (is_inparameter) {
        ss << cur_line << "\n";
592
593
594
595
596
        if (Common::StartsWith(cur_line, "[linear_tree: ")) {
          int is_linear = 0;
          Common::Atoi(cur_line.substr(14, 1).c_str(), &is_linear);
          linear_tree_ = static_cast<bool>(is_linear);
        }
Guolin Ke's avatar
Guolin Ke committed
597
598
599
600
601
602
603
604
      }
    }
    p += line_len;
    p = Common::SkipNewLine(p);
  }
  if (!ss.str().empty()) {
    loaded_parameter_ = ss.str();
  }
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
  ss.clear();
  ss.str("");
  while (p < end) {
    auto line_len = Common::GetLine(p);
    if (line_len > 0) {
      std::string cur_line(p, line_len);
      if (cur_line == std::string("parser:")) {
        is_inparser = true;
      } else if (cur_line == std::string("end of parser")) {
        p += line_len;
        p = Common::SkipNewLine(p);
        break;
      } else if (is_inparser) {
        ss << cur_line << "\n";
      }
    }
    p += line_len;
    p = Common::SkipNewLine(p);
  }
  parser_config_str_ = ss.str();
  ss.clear();
  ss.str("");
Guolin Ke's avatar
Guolin Ke committed
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
  return true;
}

std::vector<double> GBDT::FeatureImportance(int num_iteration, int importance_type) const {
  int num_used_model = static_cast<int>(models_.size());
  if (num_iteration > 0) {
    num_iteration += 0;
    num_used_model = std::min(num_iteration * num_tree_per_iteration_, num_used_model);
  }

  std::vector<double> feature_importances(max_feature_idx_ + 1, 0.0);
  if (importance_type == 0) {
    for (int iter = 0; iter < num_used_model; ++iter) {
      for (int split_idx = 0; split_idx < models_[iter]->num_leaves() - 1; ++split_idx) {
        if (models_[iter]->split_gain(split_idx) > 0) {
642
#ifdef DEBUG
643
          CHECK_GE(models_[iter]->split_feature(split_idx), 0);
644
#endif
Guolin Ke's avatar
Guolin Ke committed
645
646
647
648
649
650
651
652
          feature_importances[models_[iter]->split_feature(split_idx)] += 1.0;
        }
      }
    }
  } else if (importance_type == 1) {
    for (int iter = 0; iter < num_used_model; ++iter) {
      for (int split_idx = 0; split_idx < models_[iter]->num_leaves() - 1; ++split_idx) {
        if (models_[iter]->split_gain(split_idx) > 0) {
653
#ifdef DEBUG
654
          CHECK_GE(models_[iter]->split_feature(split_idx), 0);
655
#endif
Guolin Ke's avatar
Guolin Ke committed
656
657
658
659
660
          feature_importances[models_[iter]->split_feature(split_idx)] += models_[iter]->split_gain(split_idx);
        }
      }
    }
  } else {
661
    Log::Fatal("Unknown importance type: only support split=0 and gain=1");
Guolin Ke's avatar
Guolin Ke committed
662
  }
Nikita Titov's avatar
Nikita Titov committed
663
  return feature_importances;
Guolin Ke's avatar
Guolin Ke committed
664
665
666
}

}  // namespace LightGBM