"include/vscode:/vscode.git/clone" did not exist on "13ed38ca55dc7d454712d794281c628f62301a0f"
tree.cpp 28.7 KB
Newer Older
1
2
3
4
/*!
 * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */
5
6
7
8
9

#include <functional>
#include <iomanip>
#include <sstream>

Guolin Ke's avatar
Guolin Ke committed
10
11
12
#include <LightGBM/tree.h>

#include <LightGBM/dataset.h>
13
14
#include <LightGBM/utils/common.h>
#include <LightGBM/utils/threading.h>
Guolin Ke's avatar
Guolin Ke committed
15
16
17
18
19

namespace LightGBM {

Tree::Tree(int max_leaves)
  :max_leaves_(max_leaves) {
Guolin Ke's avatar
Guolin Ke committed
20
21
22
23
24
25
  left_child_.resize(max_leaves_ - 1);
  right_child_.resize(max_leaves_ - 1);
  split_feature_inner_.resize(max_leaves_ - 1);
  split_feature_.resize(max_leaves_ - 1);
  threshold_in_bin_.resize(max_leaves_ - 1);
  threshold_.resize(max_leaves_ - 1);
Guolin Ke's avatar
Guolin Ke committed
26
  decision_type_.resize(max_leaves_ - 1, 0);
Guolin Ke's avatar
Guolin Ke committed
27
28
29
  split_gain_.resize(max_leaves_ - 1);
  leaf_parent_.resize(max_leaves_);
  leaf_value_.resize(max_leaves_);
30
  leaf_weight_.resize(max_leaves_);
Guolin Ke's avatar
Guolin Ke committed
31
32
  leaf_count_.resize(max_leaves_);
  internal_value_.resize(max_leaves_ - 1);
33
  internal_weight_.resize(max_leaves_ - 1);
Guolin Ke's avatar
Guolin Ke committed
34
35
  internal_count_.resize(max_leaves_ - 1);
  leaf_depth_.resize(max_leaves_);
Guolin Ke's avatar
Guolin Ke committed
36
37
  // root is in the depth 0
  leaf_depth_[0] = 0;
Guolin Ke's avatar
Guolin Ke committed
38
  num_leaves_ = 1;
39
  leaf_value_[0] = 0.0f;
40
  leaf_weight_[0] = 0.0f;
Guolin Ke's avatar
Guolin Ke committed
41
  leaf_parent_[0] = -1;
Guolin Ke's avatar
Guolin Ke committed
42
  shrinkage_ = 1.0f;
43
  num_cat_ = 0;
44
45
  cat_boundaries_.push_back(0);
  cat_boundaries_inner_.push_back(0);
46
  max_depth_ = -1;
Guolin Ke's avatar
Guolin Ke committed
47
}
Guolin Ke's avatar
Guolin Ke committed
48

Guolin Ke's avatar
Guolin Ke committed
49
50
51
Tree::~Tree() {
}

52
53
int Tree::Split(int leaf, int feature, int real_feature, uint32_t threshold_bin,
                double threshold_double, double left_value, double right_value,
54
55
                int left_cnt, int right_cnt, double left_weight, double right_weight, float gain,
                MissingType missing_type, bool default_left) {
56
  Split(leaf, feature, real_feature, left_value, right_value, left_cnt, right_cnt, left_weight, right_weight, gain);
Guolin Ke's avatar
Guolin Ke committed
57
  int new_node_idx = num_leaves_ - 1;
Guolin Ke's avatar
Guolin Ke committed
58
  decision_type_[new_node_idx] = 0;
59
  SetDecisionType(&decision_type_[new_node_idx], false, kCategoricalMask);
Guolin Ke's avatar
Guolin Ke committed
60
  SetDecisionType(&decision_type_[new_node_idx], default_left, kDefaultLeftMask);
Guolin Ke's avatar
Guolin Ke committed
61
  SetMissingType(&decision_type_[new_node_idx], static_cast<int8_t>(missing_type));
Guolin Ke's avatar
Guolin Ke committed
62
  threshold_in_bin_[new_node_idx] = threshold_bin;
Guolin Ke's avatar
Guolin Ke committed
63
  threshold_[new_node_idx] = threshold_double;
64
65
66
  ++num_leaves_;
  return num_leaves_ - 1;
}
Guolin Ke's avatar
Guolin Ke committed
67

68
69
int Tree::SplitCategorical(int leaf, int feature, int real_feature, const uint32_t* threshold_bin, int num_threshold_bin,
                           const uint32_t* threshold, int num_threshold, double left_value, double right_value,
70
71
                           data_size_t left_cnt, data_size_t right_cnt, double left_weight, double right_weight, float gain, MissingType missing_type) {
  Split(leaf, feature, real_feature, left_value, right_value, left_cnt, right_cnt, left_weight, right_weight, gain);
72
73
74
  int new_node_idx = num_leaves_ - 1;
  decision_type_[new_node_idx] = 0;
  SetDecisionType(&decision_type_[new_node_idx], true, kCategoricalMask);
Guolin Ke's avatar
Guolin Ke committed
75
  SetMissingType(&decision_type_[new_node_idx], static_cast<int8_t>(missing_type));
76
77
  threshold_in_bin_[new_node_idx] = num_cat_;
  threshold_[new_node_idx] = num_cat_;
78
  ++num_cat_;
79
80
81
82
83
84
85
86
  cat_boundaries_.push_back(cat_boundaries_.back() + num_threshold);
  for (int i = 0; i < num_threshold; ++i) {
    cat_threshold_.push_back(threshold[i]);
  }
  cat_boundaries_inner_.push_back(cat_boundaries_inner_.back() + num_threshold_bin);
  for (int i = 0; i < num_threshold_bin; ++i) {
    cat_threshold_inner_.push_back(threshold_bin[i]);
  }
Guolin Ke's avatar
Guolin Ke committed
87
88
89
90
  ++num_leaves_;
  return num_leaves_ - 1;
}

Guolin Ke's avatar
Guolin Ke committed
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#define PredictionFun(niter, fidx_in_iter, start_pos, decision_fun, iter_idx, \
                      data_idx)                                               \
  std::vector<std::unique_ptr<BinIterator>> iter((niter));                    \
  for (int i = 0; i < (niter); ++i) {                                         \
    iter[i].reset(data->FeatureIterator((fidx_in_iter)));                     \
    iter[i]->Reset((start_pos));                                              \
  }                                                                           \
  for (data_size_t i = start; i < end; ++i) {                                 \
    int node = 0;                                                             \
    while (node >= 0) {                                                       \
      node = decision_fun(iter[(iter_idx)]->Get((data_idx)), node,            \
                          default_bins[node], max_bins[node]);                \
    }                                                                         \
    score[(data_idx)] += static_cast<double>(leaf_value_[~node]);             \
105
106
  }\

107
void Tree::AddPredictionToScore(const Dataset* data, data_size_t num_data, double* score) const {
Guolin Ke's avatar
Guolin Ke committed
108
109
  if (num_leaves_ <= 1) {
    if (leaf_value_[0] != 0.0f) {
Guolin Ke's avatar
Guolin Ke committed
110
#pragma omp parallel for schedule(static, 512) if (num_data >= 1024)
Guolin Ke's avatar
Guolin Ke committed
111
112
113
114
115
116
      for (data_size_t i = 0; i < num_data; ++i) {
        score[i] += leaf_value_[0];
      }
    }
    return;
  }
Guolin Ke's avatar
Guolin Ke committed
117
118
119
120
121
122
123
124
  std::vector<uint32_t> default_bins(num_leaves_ - 1);
  std::vector<uint32_t> max_bins(num_leaves_ - 1);
  for (int i = 0; i < num_leaves_ - 1; ++i) {
    const int fidx = split_feature_inner_[i];
    auto bin_mapper = data->FeatureBinMapper(fidx);
    default_bins[i] = bin_mapper->GetDefaultBin();
    max_bins[i] = bin_mapper->num_bin() - 1;
  }
125
  if (num_cat_ > 0) {
126
    if (data->num_features() > num_leaves_ - 1) {
Guolin Ke's avatar
Guolin Ke committed
127
      Threading::For<data_size_t>(0, num_data, 512, [this, &data, score, &default_bins, &max_bins]
128
129
      (int, data_size_t start, data_size_t end) {
        PredictionFun(num_leaves_ - 1, split_feature_inner_[i], start, DecisionInner, node, i);
130
131
      });
    } else {
Guolin Ke's avatar
Guolin Ke committed
132
      Threading::For<data_size_t>(0, num_data, 512, [this, &data, score, &default_bins, &max_bins]
133
134
      (int, data_size_t start, data_size_t end) {
        PredictionFun(data->num_features(), i, start, DecisionInner, split_feature_inner_[node], i);
135
136
      });
    }
Guolin Ke's avatar
Guolin Ke committed
137
  } else {
138
    if (data->num_features() > num_leaves_ - 1) {
Guolin Ke's avatar
Guolin Ke committed
139
      Threading::For<data_size_t>(0, num_data, 512, [this, &data, score, &default_bins, &max_bins]
140
141
      (int, data_size_t start, data_size_t end) {
        PredictionFun(num_leaves_ - 1, split_feature_inner_[i], start, NumericalDecisionInner, node, i);
142
143
      });
    } else {
Guolin Ke's avatar
Guolin Ke committed
144
      Threading::For<data_size_t>(0, num_data, 512, [this, &data, score, &default_bins, &max_bins]
145
146
      (int, data_size_t start, data_size_t end) {
        PredictionFun(data->num_features(), i, start, NumericalDecisionInner, split_feature_inner_[node], i);
147
148
      });
    }
Guolin Ke's avatar
Guolin Ke committed
149
  }
Guolin Ke's avatar
Guolin Ke committed
150
151
}

Guolin Ke's avatar
Guolin Ke committed
152
void Tree::AddPredictionToScore(const Dataset* data,
Guolin Ke's avatar
Guolin Ke committed
153
154
155
                                const data_size_t* used_data_indices,
                                data_size_t num_data, double* score) const {
  if (num_leaves_ <= 1) {
Guolin Ke's avatar
Guolin Ke committed
156
    if (leaf_value_[0] != 0.0f) {
Guolin Ke's avatar
Guolin Ke committed
157
#pragma omp parallel for schedule(static, 512) if (num_data >= 1024)
Guolin Ke's avatar
Guolin Ke committed
158
159
160
161
      for (data_size_t i = 0; i < num_data; ++i) {
        score[used_data_indices[i]] += leaf_value_[0];
      }
    }
Guolin Ke's avatar
Guolin Ke committed
162
    return;
Guolin Ke's avatar
Guolin Ke committed
163
  }
Guolin Ke's avatar
Guolin Ke committed
164
165
166
167
168
169
170
171
  std::vector<uint32_t> default_bins(num_leaves_ - 1);
  std::vector<uint32_t> max_bins(num_leaves_ - 1);
  for (int i = 0; i < num_leaves_ - 1; ++i) {
    const int fidx = split_feature_inner_[i];
    auto bin_mapper = data->FeatureBinMapper(fidx);
    default_bins[i] = bin_mapper->GetDefaultBin();
    max_bins[i] = bin_mapper->num_bin() - 1;
  }
172
  if (num_cat_ > 0) {
173
    if (data->num_features() > num_leaves_ - 1) {
Guolin Ke's avatar
Guolin Ke committed
174
      Threading::For<data_size_t>(0, num_data, 512, [this, &data, score, used_data_indices, &default_bins, &max_bins]
175
176
      (int, data_size_t start, data_size_t end) {
        PredictionFun(num_leaves_ - 1, split_feature_inner_[i], used_data_indices[start], DecisionInner, node, used_data_indices[i]);
177
178
      });
    } else {
Guolin Ke's avatar
Guolin Ke committed
179
      Threading::For<data_size_t>(0, num_data, 512, [this, &data, score, used_data_indices, &default_bins, &max_bins]
180
181
      (int, data_size_t start, data_size_t end) {
        PredictionFun(data->num_features(), i, used_data_indices[start], DecisionInner, split_feature_inner_[node], used_data_indices[i]);
182
183
      });
    }
Guolin Ke's avatar
Guolin Ke committed
184
  } else {
185
    if (data->num_features() > num_leaves_ - 1) {
Guolin Ke's avatar
Guolin Ke committed
186
      Threading::For<data_size_t>(0, num_data, 512, [this, &data, score, used_data_indices, &default_bins, &max_bins]
187
188
      (int, data_size_t start, data_size_t end) {
        PredictionFun(num_leaves_ - 1, split_feature_inner_[i], used_data_indices[start], NumericalDecisionInner, node, used_data_indices[i]);
189
190
      });
    } else {
Guolin Ke's avatar
Guolin Ke committed
191
      Threading::For<data_size_t>(0, num_data, 512, [this, &data, score, used_data_indices, &default_bins, &max_bins]
192
193
      (int, data_size_t start, data_size_t end) {
        PredictionFun(data->num_features(), i, used_data_indices[start], NumericalDecisionInner, split_feature_inner_[node], used_data_indices[i]);
194
195
      });
    }
Guolin Ke's avatar
Guolin Ke committed
196
  }
Guolin Ke's avatar
Guolin Ke committed
197
198
}

199
200
#undef PredictionFun

201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
double Tree::GetUpperBoundValue() const {
  double upper_bound = leaf_value_[0];
  for (int i = 1; i < num_leaves_; ++i) {
    if (leaf_value_[i] > upper_bound) {
      upper_bound = leaf_value_[i];
    }
  }
  return upper_bound;
}

double Tree::GetLowerBoundValue() const {
  double lower_bound = leaf_value_[0];
  for (int i = 1; i < num_leaves_; ++i) {
    if (leaf_value_[i] < lower_bound) {
      lower_bound = leaf_value_[i];
    }
  }
  return lower_bound;
}

Guolin Ke's avatar
Guolin Ke committed
221
std::string Tree::ToString() const {
222
  std::stringstream str_buf;
223
224
  str_buf << "num_leaves=" << num_leaves_ << '\n';
  str_buf << "num_cat=" << num_cat_ << '\n';
225
  str_buf << "split_feature="
226
    << Common::ArrayToStringFast(split_feature_, num_leaves_ - 1) << '\n';
227
  str_buf << "split_gain="
228
    << Common::ArrayToStringFast(split_gain_, num_leaves_ - 1) << '\n';
229
  str_buf << "threshold="
230
    << Common::ArrayToString(threshold_, num_leaves_ - 1) << '\n';
231
  str_buf << "decision_type="
232
    << Common::ArrayToStringFast(Common::ArrayCast<int8_t, int>(decision_type_), num_leaves_ - 1) << '\n';
233
  str_buf << "left_child="
234
    << Common::ArrayToStringFast(left_child_, num_leaves_ - 1) << '\n';
235
  str_buf << "right_child="
236
    << Common::ArrayToStringFast(right_child_, num_leaves_ - 1) << '\n';
237
  str_buf << "leaf_value="
238
    << Common::ArrayToString(leaf_value_, num_leaves_) << '\n';
239
240
  str_buf << "leaf_weight="
    << Common::ArrayToString(leaf_weight_, num_leaves_) << '\n';
241
  str_buf << "leaf_count="
242
    << Common::ArrayToStringFast(leaf_count_, num_leaves_) << '\n';
243
  str_buf << "internal_value="
244
    << Common::ArrayToStringFast(internal_value_, num_leaves_ - 1) << '\n';
245
246
  str_buf << "internal_weight="
    << Common::ArrayToStringFast(internal_weight_, num_leaves_ - 1) << '\n';
247
  str_buf << "internal_count="
248
    << Common::ArrayToStringFast(internal_count_, num_leaves_ - 1) << '\n';
249
250
  if (num_cat_ > 0) {
    str_buf << "cat_boundaries="
251
      << Common::ArrayToStringFast(cat_boundaries_, num_cat_ + 1) << '\n';
252
    str_buf << "cat_threshold="
253
      << Common::ArrayToStringFast(cat_threshold_, cat_threshold_.size()) << '\n';
254
  }
255
256
  str_buf << "shrinkage=" << shrinkage_ << '\n';
  str_buf << '\n';
257
  return str_buf.str();
Guolin Ke's avatar
Guolin Ke committed
258
259
}

Guolin Ke's avatar
Guolin Ke committed
260
std::string Tree::ToJSON() const {
261
  std::stringstream str_buf;
262
  str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2);
263
264
265
  str_buf << "\"num_leaves\":" << num_leaves_ << "," << '\n';
  str_buf << "\"num_cat\":" << num_cat_ << "," << '\n';
  str_buf << "\"shrinkage\":" << shrinkage_ << "," << '\n';
wxchan's avatar
wxchan committed
266
  if (num_leaves_ == 1) {
267
    str_buf << "\"tree_structure\":{" << "\"leaf_value\":" << leaf_value_[0] << "}" << '\n';
wxchan's avatar
wxchan committed
268
  } else {
269
    str_buf << "\"tree_structure\":" << NodeToJSON(0) << '\n';
wxchan's avatar
wxchan committed
270
  }
wxchan's avatar
wxchan committed
271

272
  return str_buf.str();
wxchan's avatar
wxchan committed
273
274
}

Guolin Ke's avatar
Guolin Ke committed
275
std::string Tree::NodeToJSON(int index) const {
276
  std::stringstream str_buf;
277
  str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2);
wxchan's avatar
wxchan committed
278
279
  if (index >= 0) {
    // non-leaf
280
281
282
    str_buf << "{" << '\n';
    str_buf << "\"split_index\":" << index << "," << '\n';
    str_buf << "\"split_feature\":" << split_feature_[index] << "," << '\n';
Guolin Ke's avatar
Guolin Ke committed
283
    str_buf << "\"split_gain\":" << Common::AvoidInf(split_gain_[index]) << "," << '\n';
284
    if (GetDecisionType(decision_type_[index], kCategoricalMask)) {
285
286
287
288
289
290
291
292
293
294
295
      int cat_idx = static_cast<int>(threshold_[index]);
      std::vector<int> cats;
      for (int i = cat_boundaries_[cat_idx]; i < cat_boundaries_[cat_idx + 1]; ++i) {
        for (int j = 0; j < 32; ++j) {
          int cat = (i - cat_boundaries_[cat_idx]) * 32 + j;
          if (Common::FindInBitset(cat_threshold_.data() + cat_boundaries_[cat_idx],
                                   cat_boundaries_[cat_idx + 1] - cat_boundaries_[cat_idx], cat)) {
            cats.push_back(cat);
          }
        }
      }
296
297
      str_buf << "\"threshold\":\"" << Common::Join(cats, "||") << "\"," << '\n';
      str_buf << "\"decision_type\":\"==\"," << '\n';
298
    } else {
299
300
      str_buf << "\"threshold\":" << Common::AvoidInf(threshold_[index]) << "," << '\n';
      str_buf << "\"decision_type\":\"<=\"," << '\n';
301
302
    }
    if (GetDecisionType(decision_type_[index], kDefaultLeftMask)) {
303
      str_buf << "\"default_left\":true," << '\n';
304
    } else {
305
      str_buf << "\"default_left\":false," << '\n';
306
307
    }
    uint8_t missing_type = GetMissingType(decision_type_[index]);
308
    if (missing_type == MissingType::None) {
309
      str_buf << "\"missing_type\":\"None\"," << '\n';
310
    } else if (missing_type == MissingType::Zero) {
311
      str_buf << "\"missing_type\":\"Zero\"," << '\n';
312
    } else {
313
      str_buf << "\"missing_type\":\"NaN\"," << '\n';
314
    }
315
    str_buf << "\"internal_value\":" << internal_value_[index] << "," << '\n';
316
    str_buf << "\"internal_weight\":" << internal_weight_[index] << "," << '\n';
317
318
319
    str_buf << "\"internal_count\":" << internal_count_[index] << "," << '\n';
    str_buf << "\"left_child\":" << NodeToJSON(left_child_[index]) << "," << '\n';
    str_buf << "\"right_child\":" << NodeToJSON(right_child_[index]) << '\n';
320
    str_buf << "}";
wxchan's avatar
wxchan committed
321
322
323
  } else {
    // leaf
    index = ~index;
324
325
326
    str_buf << "{" << '\n';
    str_buf << "\"leaf_index\":" << index << "," << '\n';
    str_buf << "\"leaf_value\":" << leaf_value_[index] << "," << '\n';
327
    str_buf << "\"leaf_weight\":" << leaf_weight_[index] << "," << '\n';
328
    str_buf << "\"leaf_count\":" << leaf_count_[index] << '\n';
329
    str_buf << "}";
wxchan's avatar
wxchan committed
330
331
  }

332
  return str_buf.str();
wxchan's avatar
wxchan committed
333
334
}

Guolin Ke's avatar
Guolin Ke committed
335
336
337
338
std::string Tree::NumericalDecisionIfElse(int node) const {
  std::stringstream str_buf;
  uint8_t missing_type = GetMissingType(decision_type_[node]);
  bool default_left = GetDecisionType(decision_type_[node], kDefaultLeftMask);
Nikita Titov's avatar
Nikita Titov committed
339
  if (missing_type == MissingType::None
340
      || (missing_type == MissingType::Zero && default_left && kZeroThreshold < threshold_[node])) {
Guolin Ke's avatar
Guolin Ke committed
341
    str_buf << "if (fval <= " << threshold_[node] << ") {";
342
  } else if (missing_type == MissingType::Zero) {
Guolin Ke's avatar
Guolin Ke committed
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
    if (default_left) {
      str_buf << "if (fval <= " << threshold_[node] << " || Tree::IsZero(fval)" << " || std::isnan(fval)) {";
    } else {
      str_buf << "if (fval <= " << threshold_[node] << " && !Tree::IsZero(fval)" << " && !std::isnan(fval)) {";
    }
  } else {
    if (default_left) {
      str_buf << "if (fval <= " << threshold_[node] << " || std::isnan(fval)) {";
    } else {
      str_buf << "if (fval <= " << threshold_[node] << " && !std::isnan(fval)) {";
    }
  }
  return str_buf.str();
}

std::string Tree::CategoricalDecisionIfElse(int node) const {
  uint8_t missing_type = GetMissingType(decision_type_[node]);
  std::stringstream str_buf;
361
  if (missing_type == MissingType::NaN) {
Guolin Ke's avatar
Guolin Ke committed
362
363
364
365
    str_buf << "if (std::isnan(fval)) { int_fval = -1; } else { int_fval = static_cast<int>(fval); }";
  } else {
    str_buf << "if (std::isnan(fval)) { int_fval = 0; } else { int_fval = static_cast<int>(fval); }";
  }
366
  int cat_idx = static_cast<int>(threshold_[node]);
367
368
369
370
  str_buf << "if (int_fval >= 0 && int_fval < 32 * (";
  str_buf << cat_boundaries_[cat_idx + 1] - cat_boundaries_[cat_idx];
  str_buf << ") && (((cat_threshold[" << cat_boundaries_[cat_idx];
  str_buf << " + int_fval / 32] >> (int_fval & 31)) & 1))) {";
Guolin Ke's avatar
Guolin Ke committed
371
372
373
  return str_buf.str();
}

Guolin Ke's avatar
Guolin Ke committed
374
std::string Tree::ToIfElse(int index, bool predict_leaf_index) const {
375
376
  std::stringstream str_buf;
  str_buf << "double PredictTree" << index;
Guolin Ke's avatar
Guolin Ke committed
377
  if (predict_leaf_index) {
378
379
380
    str_buf << "Leaf";
  }
  str_buf << "(const double* arr) { ";
381
382
  if (num_leaves_ <= 1) {
    str_buf << "return " << leaf_value_[0] << ";";
383
  } else {
384
385
386
387
388
389
390
391
    str_buf << "const std::vector<uint32_t> cat_threshold = {";
    for (size_t i = 0; i < cat_threshold_.size(); ++i) {
      if (i != 0) {
        str_buf << ",";
      }
      str_buf << cat_threshold_[i];
    }
    str_buf << "};";
392
393
394
395
396
    // use this for the missing value conversion
    str_buf << "double fval = 0.0f; ";
    if (num_cat_ > 0) {
      str_buf << "int int_fval = 0; ";
    }
Guolin Ke's avatar
Guolin Ke committed
397
    str_buf << NodeToIfElse(0, predict_leaf_index);
398
  }
399
  str_buf << " }" << '\n';
400

401
  // Predict func by Map to ifelse
402
  str_buf << "double PredictTree" << index;
Guolin Ke's avatar
Guolin Ke committed
403
  if (predict_leaf_index) {
Guolin Ke's avatar
Guolin Ke committed
404
405
406
    str_buf << "LeafByMap";
  } else {
    str_buf << "ByMap";
407
408
409
  }
  str_buf << "(const std::unordered_map<int, double>& arr) { ";
  if (num_leaves_ <= 1) {
Guolin Ke's avatar
Guolin Ke committed
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
    str_buf << "return " << leaf_value_[0] << ";";
  } else {
    str_buf << "const std::vector<uint32_t> cat_threshold = {";
    for (size_t i = 0; i < cat_threshold_.size(); ++i) {
      if (i != 0) {
        str_buf << ",";
      }
      str_buf << cat_threshold_[i];
    }
    str_buf << "};";
    // use this for the missing value conversion
    str_buf << "double fval = 0.0f; ";
    if (num_cat_ > 0) {
      str_buf << "int int_fval = 0; ";
    }
Guolin Ke's avatar
Guolin Ke committed
425
    str_buf << NodeToIfElseByMap(0, predict_leaf_index);
426
  }
427
  str_buf << " }" << '\n';
428

429
430
431
  return str_buf.str();
}

Guolin Ke's avatar
Guolin Ke committed
432
std::string Tree::NodeToIfElse(int index, bool predict_leaf_index) const {
433
434
435
436
  std::stringstream str_buf;
  str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2);
  if (index >= 0) {
    // non-leaf
437
    str_buf << "fval = arr[" << split_feature_[index] << "];";
Guolin Ke's avatar
Guolin Ke committed
438
    if (GetDecisionType(decision_type_[index], kCategoricalMask) == 0) {
439
      str_buf << NumericalDecisionIfElse(index);
440
    } else {
441
      str_buf << CategoricalDecisionIfElse(index);
442
443
    }
    // left subtree
Guolin Ke's avatar
Guolin Ke committed
444
    str_buf << NodeToIfElse(left_child_[index], predict_leaf_index);
445
446
    str_buf << " } else { ";
    // right subtree
Guolin Ke's avatar
Guolin Ke committed
447
    str_buf << NodeToIfElse(right_child_[index], predict_leaf_index);
448
449
450
451
    str_buf << " }";
  } else {
    // leaf
    str_buf << "return ";
Guolin Ke's avatar
Guolin Ke committed
452
    if (predict_leaf_index) {
453
454
455
456
457
458
459
460
461
462
      str_buf << ~index;
    } else {
      str_buf << leaf_value_[~index];
    }
    str_buf << ";";
  }

  return str_buf.str();
}

Guolin Ke's avatar
Guolin Ke committed
463
std::string Tree::NodeToIfElseByMap(int index, bool predict_leaf_index) const {
464
465
466
467
468
469
470
471
472
473
474
  std::stringstream str_buf;
  str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2);
  if (index >= 0) {
    // non-leaf
    str_buf << "fval = arr.count(" << split_feature_[index] << ") > 0 ? arr.at(" << split_feature_[index] << ") : 0.0f;";
    if (GetDecisionType(decision_type_[index], kCategoricalMask) == 0) {
      str_buf << NumericalDecisionIfElse(index);
    } else {
      str_buf << CategoricalDecisionIfElse(index);
    }
    // left subtree
Guolin Ke's avatar
Guolin Ke committed
475
    str_buf << NodeToIfElseByMap(left_child_[index], predict_leaf_index);
476
477
    str_buf << " } else { ";
    // right subtree
Guolin Ke's avatar
Guolin Ke committed
478
    str_buf << NodeToIfElseByMap(right_child_[index], predict_leaf_index);
479
480
481
482
    str_buf << " }";
  } else {
    // leaf
    str_buf << "return ";
Guolin Ke's avatar
Guolin Ke committed
483
    if (predict_leaf_index) {
484
485
486
487
488
489
490
491
492
493
      str_buf << ~index;
    } else {
      str_buf << leaf_value_[~index];
    }
    str_buf << ";";
  }

  return str_buf.str();
}

494
495
Tree::Tree(const char* str, size_t* used_len) {
  auto p = str;
Guolin Ke's avatar
Guolin Ke committed
496
  std::unordered_map<std::string, std::string> key_vals;
497
  const int max_num_line = 17;
498
499
500
501
  int read_line = 0;
  while (read_line < max_num_line) {
    if (*p == '\r' || *p == '\n') break;
    auto start = p;
502
    while (*p != '=') ++p;
503
504
505
506
507
508
509
510
511
512
513
    std::string key(start, p - start);
    ++p;
    start = p;
    while (*p != '\r' && *p != '\n') ++p;
    key_vals[key] = std::string(start, p - start);
    ++read_line;
    if (*p == '\r') ++p;
    if (*p == '\n') ++p;
  }
  *used_len = p - str;

514
  if (key_vals.count("num_leaves") <= 0) {
515
    Log::Fatal("Tree model should contain num_leaves field");
Guolin Ke's avatar
Guolin Ke committed
516
517
518
519
  }

  Common::Atoi(key_vals["num_leaves"].c_str(), &num_leaves_);

520
  if (key_vals.count("num_cat") <= 0) {
521
    Log::Fatal("Tree model should contain num_cat field");
522
523
524
525
  }

  Common::Atoi(key_vals["num_cat"].c_str(), &num_cat_);

526
  if (key_vals.count("leaf_value")) {
527
    leaf_value_ = Common::StringToArray<double>(key_vals["leaf_value"], num_leaves_);
528
529
530
  } else {
    Log::Fatal("Tree model string format error, should contain leaf_value field");
  }
531

Guolin Ke's avatar
Guolin Ke committed
532
533
534
535
536
  if (key_vals.count("shrinkage")) {
    Common::Atof(key_vals["shrinkage"].c_str(), &shrinkage_);
  } else {
    shrinkage_ = 1.0f;
  }
537

538
539
  if (num_leaves_ <= 1) { return; }

Guolin Ke's avatar
Guolin Ke committed
540
  if (key_vals.count("left_child")) {
541
    left_child_ = Common::StringToArrayFast<int>(key_vals["left_child"], num_leaves_ - 1);
Guolin Ke's avatar
Guolin Ke committed
542
543
  } else {
    Log::Fatal("Tree model string format error, should contain left_child field");
544
545
  }

Guolin Ke's avatar
Guolin Ke committed
546
  if (key_vals.count("right_child")) {
547
    right_child_ = Common::StringToArrayFast<int>(key_vals["right_child"], num_leaves_ - 1);
Guolin Ke's avatar
Guolin Ke committed
548
549
550
551
552
  } else {
    Log::Fatal("Tree model string format error, should contain right_child field");
  }

  if (key_vals.count("split_feature")) {
553
    split_feature_ = Common::StringToArrayFast<int>(key_vals["split_feature"], num_leaves_ - 1);
Guolin Ke's avatar
Guolin Ke committed
554
555
556
557
558
  } else {
    Log::Fatal("Tree model string format error, should contain split_feature field");
  }

  if (key_vals.count("threshold")) {
559
    threshold_ = Common::StringToArray<double>(key_vals["threshold"], num_leaves_ - 1);
Guolin Ke's avatar
Guolin Ke committed
560
561
562
563
564
  } else {
    Log::Fatal("Tree model string format error, should contain threshold field");
  }

  if (key_vals.count("split_gain")) {
565
    split_gain_ = Common::StringToArrayFast<float>(key_vals["split_gain"], num_leaves_ - 1);
Guolin Ke's avatar
Guolin Ke committed
566
567
568
569
570
  } else {
    split_gain_.resize(num_leaves_ - 1);
  }

  if (key_vals.count("internal_count")) {
571
    internal_count_ = Common::StringToArrayFast<int>(key_vals["internal_count"], num_leaves_ - 1);
Guolin Ke's avatar
Guolin Ke committed
572
573
574
575
576
  } else {
    internal_count_.resize(num_leaves_ - 1);
  }

  if (key_vals.count("internal_value")) {
577
    internal_value_ = Common::StringToArrayFast<double>(key_vals["internal_value"], num_leaves_ - 1);
Guolin Ke's avatar
Guolin Ke committed
578
579
580
581
  } else {
    internal_value_.resize(num_leaves_ - 1);
  }

582
583
  if (key_vals.count("internal_weight")) {
    internal_weight_ = Common::StringToArrayFast<double>(key_vals["internal_weight"], num_leaves_ - 1);
584
  } else {
585
586
587
588
589
    internal_weight_.resize(num_leaves_ - 1);
  }

  if (key_vals.count("leaf_weight")) {
    leaf_weight_ = Common::StringToArrayFast<double>(key_vals["leaf_weight"], num_leaves_);
590
  } else {
591
592
593
    leaf_weight_.resize(num_leaves_);
  }

Guolin Ke's avatar
Guolin Ke committed
594
  if (key_vals.count("leaf_count")) {
595
    leaf_count_ = Common::StringToArrayFast<int>(key_vals["leaf_count"], num_leaves_);
Guolin Ke's avatar
Guolin Ke committed
596
597
598
599
600
  } else {
    leaf_count_.resize(num_leaves_);
  }

  if (key_vals.count("decision_type")) {
601
    decision_type_ = Common::StringToArrayFast<int8_t>(key_vals["decision_type"], num_leaves_ - 1);
Guolin Ke's avatar
Guolin Ke committed
602
603
604
605
  } else {
    decision_type_ = std::vector<int8_t>(num_leaves_ - 1, 0);
  }

606
607
  if (num_cat_ > 0) {
    if (key_vals.count("cat_boundaries")) {
608
      cat_boundaries_ = Common::StringToArrayFast<int>(key_vals["cat_boundaries"], num_cat_ + 1);
609
610
611
612
613
    } else {
      Log::Fatal("Tree model should contain cat_boundaries field.");
    }

    if (key_vals.count("cat_threshold")) {
614
      cat_threshold_ = Common::StringToArrayFast<uint32_t>(key_vals["cat_threshold"], cat_boundaries_.back());
615
    } else {
616
      Log::Fatal("Tree model should contain cat_threshold field");
617
618
    }
  }
619
  max_depth_ = -1;
Guolin Ke's avatar
Guolin Ke committed
620
621
}

Guolin Ke's avatar
Guolin Ke committed
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
void Tree::ExtendPath(PathElement *unique_path, int unique_depth,
                      double zero_fraction, double one_fraction, int feature_index) {
  unique_path[unique_depth].feature_index = feature_index;
  unique_path[unique_depth].zero_fraction = zero_fraction;
  unique_path[unique_depth].one_fraction = one_fraction;
  unique_path[unique_depth].pweight = (unique_depth == 0 ? 1 : 0);
  for (int i = unique_depth - 1; i >= 0; i--) {
    unique_path[i + 1].pweight += one_fraction*unique_path[i].pweight*(i + 1)
      / static_cast<double>(unique_depth + 1);
    unique_path[i].pweight = zero_fraction*unique_path[i].pweight*(unique_depth - i)
      / static_cast<double>(unique_depth + 1);
  }
}

void Tree::UnwindPath(PathElement *unique_path, int unique_depth, int path_index) {
  const double one_fraction = unique_path[path_index].one_fraction;
  const double zero_fraction = unique_path[path_index].zero_fraction;
  double next_one_portion = unique_path[unique_depth].pweight;

  for (int i = unique_depth - 1; i >= 0; --i) {
    if (one_fraction != 0) {
      const double tmp = unique_path[i].pweight;
      unique_path[i].pweight = next_one_portion*(unique_depth + 1)
        / static_cast<double>((i + 1)*one_fraction);
      next_one_portion = tmp - unique_path[i].pweight*zero_fraction*(unique_depth - i)
        / static_cast<double>(unique_depth + 1);
    } else {
      unique_path[i].pweight = (unique_path[i].pweight*(unique_depth + 1))
        / static_cast<double>(zero_fraction*(unique_depth - i));
    }
  }

  for (int i = path_index; i < unique_depth; ++i) {
    unique_path[i].feature_index = unique_path[i + 1].feature_index;
    unique_path[i].zero_fraction = unique_path[i + 1].zero_fraction;
    unique_path[i].one_fraction = unique_path[i + 1].one_fraction;
  }
}

double Tree::UnwoundPathSum(const PathElement *unique_path, int unique_depth, int path_index) {
  const double one_fraction = unique_path[path_index].one_fraction;
  const double zero_fraction = unique_path[path_index].zero_fraction;
  double next_one_portion = unique_path[unique_depth].pweight;
  double total = 0;
  for (int i = unique_depth - 1; i >= 0; --i) {
    if (one_fraction != 0) {
      const double tmp = next_one_portion*(unique_depth + 1)
        / static_cast<double>((i + 1)*one_fraction);
      total += tmp;
      next_one_portion = unique_path[i].pweight - tmp*zero_fraction*((unique_depth - i)
                                                                     / static_cast<double>(unique_depth + 1));
    } else {
      total += (unique_path[i].pweight / zero_fraction) / ((unique_depth - i)
                                                           / static_cast<double>(unique_depth + 1));
    }
  }
  return total;
}

// recursive computation of SHAP values for a decision tree
void Tree::TreeSHAP(const double *feature_values, double *phi,
                    int node, int unique_depth,
                    PathElement *parent_unique_path, double parent_zero_fraction,
                    double parent_one_fraction, int parent_feature_index) const {
  // extend the unique path
687
  PathElement* unique_path = parent_unique_path + unique_depth;
Guolin Ke's avatar
Guolin Ke committed
688
689
690
691
692
693
694
695
696
697
698
699
700
701
  if (unique_depth > 0) std::copy(parent_unique_path, parent_unique_path + unique_depth, unique_path);
  ExtendPath(unique_path, unique_depth, parent_zero_fraction,
             parent_one_fraction, parent_feature_index);

  // leaf node
  if (node < 0) {
    for (int i = 1; i <= unique_depth; ++i) {
      const double w = UnwoundPathSum(unique_path, unique_depth, i);
      const PathElement &el = unique_path[i];
      phi[el.feature_index] += w*(el.one_fraction - el.zero_fraction)*leaf_value_[~node];
    }

    // internal node
  } else {
702
    const int hot_index = Decision(feature_values[split_feature_[node]], node);
Guolin Ke's avatar
Guolin Ke committed
703
704
705
706
707
708
709
710
711
712
713
    const int cold_index = (hot_index == left_child_[node] ? right_child_[node] : left_child_[node]);
    const double w = data_count(node);
    const double hot_zero_fraction = data_count(hot_index) / w;
    const double cold_zero_fraction = data_count(cold_index) / w;
    double incoming_zero_fraction = 1;
    double incoming_one_fraction = 1;

    // see if we have already split on this feature,
    // if so we undo that split so we can redo it for this node
    int path_index = 0;
    for (; path_index <= unique_depth; ++path_index) {
714
      if (unique_path[path_index].feature_index == split_feature_[node]) break;
Guolin Ke's avatar
Guolin Ke committed
715
716
717
718
719
720
721
722
723
    }
    if (path_index != unique_depth + 1) {
      incoming_zero_fraction = unique_path[path_index].zero_fraction;
      incoming_one_fraction = unique_path[path_index].one_fraction;
      UnwindPath(unique_path, unique_depth, path_index);
      unique_depth -= 1;
    }

    TreeSHAP(feature_values, phi, hot_index, unique_depth + 1, unique_path,
724
             hot_zero_fraction*incoming_zero_fraction, incoming_one_fraction, split_feature_[node]);
Guolin Ke's avatar
Guolin Ke committed
725
726

    TreeSHAP(feature_values, phi, cold_index, unique_depth + 1, unique_path,
727
             cold_zero_fraction*incoming_zero_fraction, 0, split_feature_[node]);
Guolin Ke's avatar
Guolin Ke committed
728
729
730
  }
}

731
732
733
734
735
double Tree::ExpectedValue() const {
  if (num_leaves_ == 1) return LeafOutput(0);
  const double total_count = internal_count_[0];
  double exp_value = 0.0;
  for (int i = 0; i < num_leaves(); ++i) {
Guolin Ke's avatar
Guolin Ke committed
736
    exp_value += (leaf_count_[i] / total_count)*LeafOutput(i);
Guolin Ke's avatar
Guolin Ke committed
737
  }
738
  return exp_value;
Guolin Ke's avatar
Guolin Ke committed
739
740
}

741
742
743
744
745
746
747
748
749
750
751
void Tree::RecomputeMaxDepth() {
  if (num_leaves_ == 1) {
    max_depth_ = 0;
  } else {
    if (leaf_depth_.size() == 0) {
      RecomputeLeafDepths(0, 0);
    }
    max_depth_ = leaf_depth_[0];
    for (int i = 1; i < num_leaves(); ++i) {
      if (max_depth_ < leaf_depth_[i]) max_depth_ = leaf_depth_[i];
    }
Guolin Ke's avatar
Guolin Ke committed
752
753
754
  }
}

Guolin Ke's avatar
Guolin Ke committed
755
}  // namespace LightGBM