tree.cpp 31.2 KB
Newer Older
1
2
3
4
/*!
 * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */
Guolin Ke's avatar
Guolin Ke committed
5
6
7
#include <LightGBM/tree.h>

#include <LightGBM/dataset.h>
8
9
#include <LightGBM/utils/common.h>
#include <LightGBM/utils/threading.h>
Guolin Ke's avatar
Guolin Ke committed
10

11
12
13
14
#include <functional>
#include <iomanip>
#include <sstream>

Guolin Ke's avatar
Guolin Ke committed
15
16
namespace LightGBM {

17
18
Tree::Tree(int max_leaves, bool track_branch_features)
  :max_leaves_(max_leaves), track_branch_features_(track_branch_features) {
Guolin Ke's avatar
Guolin Ke committed
19
20
21
22
23
24
  left_child_.resize(max_leaves_ - 1);
  right_child_.resize(max_leaves_ - 1);
  split_feature_inner_.resize(max_leaves_ - 1);
  split_feature_.resize(max_leaves_ - 1);
  threshold_in_bin_.resize(max_leaves_ - 1);
  threshold_.resize(max_leaves_ - 1);
Guolin Ke's avatar
Guolin Ke committed
25
  decision_type_.resize(max_leaves_ - 1, 0);
Guolin Ke's avatar
Guolin Ke committed
26
27
28
  split_gain_.resize(max_leaves_ - 1);
  leaf_parent_.resize(max_leaves_);
  leaf_value_.resize(max_leaves_);
29
  leaf_weight_.resize(max_leaves_);
Guolin Ke's avatar
Guolin Ke committed
30
31
  leaf_count_.resize(max_leaves_);
  internal_value_.resize(max_leaves_ - 1);
32
  internal_weight_.resize(max_leaves_ - 1);
Guolin Ke's avatar
Guolin Ke committed
33
34
  internal_count_.resize(max_leaves_ - 1);
  leaf_depth_.resize(max_leaves_);
35
36
37
  if (track_branch_features_) {
    branch_features_ = std::vector<std::vector<int>>(max_leaves_);
  }
Guolin Ke's avatar
Guolin Ke committed
38
39
  // root is in the depth 0
  leaf_depth_[0] = 0;
Guolin Ke's avatar
Guolin Ke committed
40
  num_leaves_ = 1;
41
  leaf_value_[0] = 0.0f;
42
  leaf_weight_[0] = 0.0f;
Guolin Ke's avatar
Guolin Ke committed
43
  leaf_parent_[0] = -1;
Guolin Ke's avatar
Guolin Ke committed
44
  shrinkage_ = 1.0f;
45
  num_cat_ = 0;
46
47
  cat_boundaries_.push_back(0);
  cat_boundaries_inner_.push_back(0);
48
  max_depth_ = -1;
Guolin Ke's avatar
Guolin Ke committed
49
}
Guolin Ke's avatar
Guolin Ke committed
50

Guolin Ke's avatar
Guolin Ke committed
51
52
53
Tree::~Tree() {
}

54
55
int Tree::Split(int leaf, int feature, int real_feature, uint32_t threshold_bin,
                double threshold_double, double left_value, double right_value,
56
57
                int left_cnt, int right_cnt, double left_weight, double right_weight, float gain,
                MissingType missing_type, bool default_left) {
58
  Split(leaf, feature, real_feature, left_value, right_value, left_cnt, right_cnt, left_weight, right_weight, gain);
Guolin Ke's avatar
Guolin Ke committed
59
  int new_node_idx = num_leaves_ - 1;
Guolin Ke's avatar
Guolin Ke committed
60
  decision_type_[new_node_idx] = 0;
61
  SetDecisionType(&decision_type_[new_node_idx], false, kCategoricalMask);
Guolin Ke's avatar
Guolin Ke committed
62
  SetDecisionType(&decision_type_[new_node_idx], default_left, kDefaultLeftMask);
Guolin Ke's avatar
Guolin Ke committed
63
  SetMissingType(&decision_type_[new_node_idx], static_cast<int8_t>(missing_type));
Guolin Ke's avatar
Guolin Ke committed
64
  threshold_in_bin_[new_node_idx] = threshold_bin;
Guolin Ke's avatar
Guolin Ke committed
65
  threshold_[new_node_idx] = threshold_double;
66
67
68
  ++num_leaves_;
  return num_leaves_ - 1;
}
Guolin Ke's avatar
Guolin Ke committed
69

70
71
int Tree::SplitCategorical(int leaf, int feature, int real_feature, const uint32_t* threshold_bin, int num_threshold_bin,
                           const uint32_t* threshold, int num_threshold, double left_value, double right_value,
72
73
                           data_size_t left_cnt, data_size_t right_cnt, double left_weight, double right_weight, float gain, MissingType missing_type) {
  Split(leaf, feature, real_feature, left_value, right_value, left_cnt, right_cnt, left_weight, right_weight, gain);
74
75
76
  int new_node_idx = num_leaves_ - 1;
  decision_type_[new_node_idx] = 0;
  SetDecisionType(&decision_type_[new_node_idx], true, kCategoricalMask);
Guolin Ke's avatar
Guolin Ke committed
77
  SetMissingType(&decision_type_[new_node_idx], static_cast<int8_t>(missing_type));
78
79
  threshold_in_bin_[new_node_idx] = num_cat_;
  threshold_[new_node_idx] = num_cat_;
80
  ++num_cat_;
81
82
83
84
85
86
87
88
  cat_boundaries_.push_back(cat_boundaries_.back() + num_threshold);
  for (int i = 0; i < num_threshold; ++i) {
    cat_threshold_.push_back(threshold[i]);
  }
  cat_boundaries_inner_.push_back(cat_boundaries_inner_.back() + num_threshold_bin);
  for (int i = 0; i < num_threshold_bin; ++i) {
    cat_threshold_inner_.push_back(threshold_bin[i]);
  }
Guolin Ke's avatar
Guolin Ke committed
89
90
91
92
  ++num_leaves_;
  return num_leaves_ - 1;
}

Guolin Ke's avatar
Guolin Ke committed
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#define PredictionFun(niter, fidx_in_iter, start_pos, decision_fun, iter_idx, \
                      data_idx)                                               \
  std::vector<std::unique_ptr<BinIterator>> iter((niter));                    \
  for (int i = 0; i < (niter); ++i) {                                         \
    iter[i].reset(data->FeatureIterator((fidx_in_iter)));                     \
    iter[i]->Reset((start_pos));                                              \
  }                                                                           \
  for (data_size_t i = start; i < end; ++i) {                                 \
    int node = 0;                                                             \
    while (node >= 0) {                                                       \
      node = decision_fun(iter[(iter_idx)]->Get((data_idx)), node,            \
                          default_bins[node], max_bins[node]);                \
    }                                                                         \
    score[(data_idx)] += static_cast<double>(leaf_value_[~node]);             \
107
108
  }\

109
void Tree::AddPredictionToScore(const Dataset* data, data_size_t num_data, double* score) const {
Guolin Ke's avatar
Guolin Ke committed
110
111
  if (num_leaves_ <= 1) {
    if (leaf_value_[0] != 0.0f) {
Guolin Ke's avatar
Guolin Ke committed
112
#pragma omp parallel for schedule(static, 512) if (num_data >= 1024)
Guolin Ke's avatar
Guolin Ke committed
113
114
115
116
117
118
      for (data_size_t i = 0; i < num_data; ++i) {
        score[i] += leaf_value_[0];
      }
    }
    return;
  }
Guolin Ke's avatar
Guolin Ke committed
119
120
121
122
123
124
125
126
  std::vector<uint32_t> default_bins(num_leaves_ - 1);
  std::vector<uint32_t> max_bins(num_leaves_ - 1);
  for (int i = 0; i < num_leaves_ - 1; ++i) {
    const int fidx = split_feature_inner_[i];
    auto bin_mapper = data->FeatureBinMapper(fidx);
    default_bins[i] = bin_mapper->GetDefaultBin();
    max_bins[i] = bin_mapper->num_bin() - 1;
  }
127
  if (num_cat_ > 0) {
128
    if (data->num_features() > num_leaves_ - 1) {
Guolin Ke's avatar
Guolin Ke committed
129
      Threading::For<data_size_t>(0, num_data, 512, [this, &data, score, &default_bins, &max_bins]
130
131
      (int, data_size_t start, data_size_t end) {
        PredictionFun(num_leaves_ - 1, split_feature_inner_[i], start, DecisionInner, node, i);
132
133
      });
    } else {
Guolin Ke's avatar
Guolin Ke committed
134
      Threading::For<data_size_t>(0, num_data, 512, [this, &data, score, &default_bins, &max_bins]
135
136
      (int, data_size_t start, data_size_t end) {
        PredictionFun(data->num_features(), i, start, DecisionInner, split_feature_inner_[node], i);
137
138
      });
    }
Guolin Ke's avatar
Guolin Ke committed
139
  } else {
140
    if (data->num_features() > num_leaves_ - 1) {
Guolin Ke's avatar
Guolin Ke committed
141
      Threading::For<data_size_t>(0, num_data, 512, [this, &data, score, &default_bins, &max_bins]
142
143
      (int, data_size_t start, data_size_t end) {
        PredictionFun(num_leaves_ - 1, split_feature_inner_[i], start, NumericalDecisionInner, node, i);
144
145
      });
    } else {
Guolin Ke's avatar
Guolin Ke committed
146
      Threading::For<data_size_t>(0, num_data, 512, [this, &data, score, &default_bins, &max_bins]
147
148
      (int, data_size_t start, data_size_t end) {
        PredictionFun(data->num_features(), i, start, NumericalDecisionInner, split_feature_inner_[node], i);
149
150
      });
    }
Guolin Ke's avatar
Guolin Ke committed
151
  }
Guolin Ke's avatar
Guolin Ke committed
152
153
}

Guolin Ke's avatar
Guolin Ke committed
154
void Tree::AddPredictionToScore(const Dataset* data,
Guolin Ke's avatar
Guolin Ke committed
155
156
157
                                const data_size_t* used_data_indices,
                                data_size_t num_data, double* score) const {
  if (num_leaves_ <= 1) {
Guolin Ke's avatar
Guolin Ke committed
158
    if (leaf_value_[0] != 0.0f) {
Guolin Ke's avatar
Guolin Ke committed
159
#pragma omp parallel for schedule(static, 512) if (num_data >= 1024)
Guolin Ke's avatar
Guolin Ke committed
160
161
162
163
      for (data_size_t i = 0; i < num_data; ++i) {
        score[used_data_indices[i]] += leaf_value_[0];
      }
    }
Guolin Ke's avatar
Guolin Ke committed
164
    return;
Guolin Ke's avatar
Guolin Ke committed
165
  }
Guolin Ke's avatar
Guolin Ke committed
166
167
168
169
170
171
172
173
  std::vector<uint32_t> default_bins(num_leaves_ - 1);
  std::vector<uint32_t> max_bins(num_leaves_ - 1);
  for (int i = 0; i < num_leaves_ - 1; ++i) {
    const int fidx = split_feature_inner_[i];
    auto bin_mapper = data->FeatureBinMapper(fidx);
    default_bins[i] = bin_mapper->GetDefaultBin();
    max_bins[i] = bin_mapper->num_bin() - 1;
  }
174
  if (num_cat_ > 0) {
175
    if (data->num_features() > num_leaves_ - 1) {
Guolin Ke's avatar
Guolin Ke committed
176
      Threading::For<data_size_t>(0, num_data, 512, [this, &data, score, used_data_indices, &default_bins, &max_bins]
177
178
      (int, data_size_t start, data_size_t end) {
        PredictionFun(num_leaves_ - 1, split_feature_inner_[i], used_data_indices[start], DecisionInner, node, used_data_indices[i]);
179
180
      });
    } else {
Guolin Ke's avatar
Guolin Ke committed
181
      Threading::For<data_size_t>(0, num_data, 512, [this, &data, score, used_data_indices, &default_bins, &max_bins]
182
183
      (int, data_size_t start, data_size_t end) {
        PredictionFun(data->num_features(), i, used_data_indices[start], DecisionInner, split_feature_inner_[node], used_data_indices[i]);
184
185
      });
    }
Guolin Ke's avatar
Guolin Ke committed
186
  } else {
187
    if (data->num_features() > num_leaves_ - 1) {
Guolin Ke's avatar
Guolin Ke committed
188
      Threading::For<data_size_t>(0, num_data, 512, [this, &data, score, used_data_indices, &default_bins, &max_bins]
189
190
      (int, data_size_t start, data_size_t end) {
        PredictionFun(num_leaves_ - 1, split_feature_inner_[i], used_data_indices[start], NumericalDecisionInner, node, used_data_indices[i]);
191
192
      });
    } else {
Guolin Ke's avatar
Guolin Ke committed
193
      Threading::For<data_size_t>(0, num_data, 512, [this, &data, score, used_data_indices, &default_bins, &max_bins]
194
195
      (int, data_size_t start, data_size_t end) {
        PredictionFun(data->num_features(), i, used_data_indices[start], NumericalDecisionInner, split_feature_inner_[node], used_data_indices[i]);
196
197
      });
    }
Guolin Ke's avatar
Guolin Ke committed
198
  }
Guolin Ke's avatar
Guolin Ke committed
199
200
}

201
202
#undef PredictionFun

203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
double Tree::GetUpperBoundValue() const {
  double upper_bound = leaf_value_[0];
  for (int i = 1; i < num_leaves_; ++i) {
    if (leaf_value_[i] > upper_bound) {
      upper_bound = leaf_value_[i];
    }
  }
  return upper_bound;
}

double Tree::GetLowerBoundValue() const {
  double lower_bound = leaf_value_[0];
  for (int i = 1; i < num_leaves_; ++i) {
    if (leaf_value_[i] < lower_bound) {
      lower_bound = leaf_value_[i];
    }
  }
  return lower_bound;
}

Guolin Ke's avatar
Guolin Ke committed
223
std::string Tree::ToString() const {
224
  std::stringstream str_buf;
225
226
  str_buf << "num_leaves=" << num_leaves_ << '\n';
  str_buf << "num_cat=" << num_cat_ << '\n';
227
  str_buf << "split_feature="
228
    << Common::ArrayToStringFast(split_feature_, num_leaves_ - 1) << '\n';
229
  str_buf << "split_gain="
230
    << Common::ArrayToStringFast(split_gain_, num_leaves_ - 1) << '\n';
231
  str_buf << "threshold="
232
    << Common::ArrayToString(threshold_, num_leaves_ - 1) << '\n';
233
  str_buf << "decision_type="
234
    << Common::ArrayToStringFast(Common::ArrayCast<int8_t, int>(decision_type_), num_leaves_ - 1) << '\n';
235
  str_buf << "left_child="
236
    << Common::ArrayToStringFast(left_child_, num_leaves_ - 1) << '\n';
237
  str_buf << "right_child="
238
    << Common::ArrayToStringFast(right_child_, num_leaves_ - 1) << '\n';
239
  str_buf << "leaf_value="
240
    << Common::ArrayToString(leaf_value_, num_leaves_) << '\n';
241
242
  str_buf << "leaf_weight="
    << Common::ArrayToString(leaf_weight_, num_leaves_) << '\n';
243
  str_buf << "leaf_count="
244
    << Common::ArrayToStringFast(leaf_count_, num_leaves_) << '\n';
245
  str_buf << "internal_value="
246
    << Common::ArrayToStringFast(internal_value_, num_leaves_ - 1) << '\n';
247
248
  str_buf << "internal_weight="
    << Common::ArrayToStringFast(internal_weight_, num_leaves_ - 1) << '\n';
249
  str_buf << "internal_count="
250
    << Common::ArrayToStringFast(internal_count_, num_leaves_ - 1) << '\n';
251
252
  if (num_cat_ > 0) {
    str_buf << "cat_boundaries="
253
      << Common::ArrayToStringFast(cat_boundaries_, num_cat_ + 1) << '\n';
254
    str_buf << "cat_threshold="
255
      << Common::ArrayToStringFast(cat_threshold_, cat_threshold_.size()) << '\n';
256
  }
257
258
  str_buf << "shrinkage=" << shrinkage_ << '\n';
  str_buf << '\n';
259
  return str_buf.str();
Guolin Ke's avatar
Guolin Ke committed
260
261
}

Guolin Ke's avatar
Guolin Ke committed
262
std::string Tree::ToJSON() const {
263
  std::stringstream str_buf;
264
  str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2);
265
266
267
  str_buf << "\"num_leaves\":" << num_leaves_ << "," << '\n';
  str_buf << "\"num_cat\":" << num_cat_ << "," << '\n';
  str_buf << "\"shrinkage\":" << shrinkage_ << "," << '\n';
wxchan's avatar
wxchan committed
268
  if (num_leaves_ == 1) {
269
    str_buf << "\"tree_structure\":{" << "\"leaf_value\":" << leaf_value_[0] << "}" << '\n';
wxchan's avatar
wxchan committed
270
  } else {
271
    str_buf << "\"tree_structure\":" << NodeToJSON(0) << '\n';
wxchan's avatar
wxchan committed
272
  }
wxchan's avatar
wxchan committed
273

274
  return str_buf.str();
wxchan's avatar
wxchan committed
275
276
}

Guolin Ke's avatar
Guolin Ke committed
277
std::string Tree::NodeToJSON(int index) const {
278
  std::stringstream str_buf;
279
  str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2);
wxchan's avatar
wxchan committed
280
281
  if (index >= 0) {
    // non-leaf
282
283
284
    str_buf << "{" << '\n';
    str_buf << "\"split_index\":" << index << "," << '\n';
    str_buf << "\"split_feature\":" << split_feature_[index] << "," << '\n';
Guolin Ke's avatar
Guolin Ke committed
285
    str_buf << "\"split_gain\":" << Common::AvoidInf(split_gain_[index]) << "," << '\n';
286
    if (GetDecisionType(decision_type_[index], kCategoricalMask)) {
287
288
289
290
291
292
293
294
295
296
297
      int cat_idx = static_cast<int>(threshold_[index]);
      std::vector<int> cats;
      for (int i = cat_boundaries_[cat_idx]; i < cat_boundaries_[cat_idx + 1]; ++i) {
        for (int j = 0; j < 32; ++j) {
          int cat = (i - cat_boundaries_[cat_idx]) * 32 + j;
          if (Common::FindInBitset(cat_threshold_.data() + cat_boundaries_[cat_idx],
                                   cat_boundaries_[cat_idx + 1] - cat_boundaries_[cat_idx], cat)) {
            cats.push_back(cat);
          }
        }
      }
298
299
      str_buf << "\"threshold\":\"" << Common::Join(cats, "||") << "\"," << '\n';
      str_buf << "\"decision_type\":\"==\"," << '\n';
300
    } else {
301
302
      str_buf << "\"threshold\":" << Common::AvoidInf(threshold_[index]) << "," << '\n';
      str_buf << "\"decision_type\":\"<=\"," << '\n';
303
304
    }
    if (GetDecisionType(decision_type_[index], kDefaultLeftMask)) {
305
      str_buf << "\"default_left\":true," << '\n';
306
    } else {
307
      str_buf << "\"default_left\":false," << '\n';
308
309
    }
    uint8_t missing_type = GetMissingType(decision_type_[index]);
310
    if (missing_type == MissingType::None) {
311
      str_buf << "\"missing_type\":\"None\"," << '\n';
312
    } else if (missing_type == MissingType::Zero) {
313
      str_buf << "\"missing_type\":\"Zero\"," << '\n';
314
    } else {
315
      str_buf << "\"missing_type\":\"NaN\"," << '\n';
316
    }
317
    str_buf << "\"internal_value\":" << internal_value_[index] << "," << '\n';
318
    str_buf << "\"internal_weight\":" << internal_weight_[index] << "," << '\n';
319
320
321
    str_buf << "\"internal_count\":" << internal_count_[index] << "," << '\n';
    str_buf << "\"left_child\":" << NodeToJSON(left_child_[index]) << "," << '\n';
    str_buf << "\"right_child\":" << NodeToJSON(right_child_[index]) << '\n';
322
    str_buf << "}";
wxchan's avatar
wxchan committed
323
324
325
  } else {
    // leaf
    index = ~index;
326
327
328
    str_buf << "{" << '\n';
    str_buf << "\"leaf_index\":" << index << "," << '\n';
    str_buf << "\"leaf_value\":" << leaf_value_[index] << "," << '\n';
329
    str_buf << "\"leaf_weight\":" << leaf_weight_[index] << "," << '\n';
330
    str_buf << "\"leaf_count\":" << leaf_count_[index] << '\n';
331
    str_buf << "}";
wxchan's avatar
wxchan committed
332
333
  }

334
  return str_buf.str();
wxchan's avatar
wxchan committed
335
336
}

Guolin Ke's avatar
Guolin Ke committed
337
338
339
340
std::string Tree::NumericalDecisionIfElse(int node) const {
  std::stringstream str_buf;
  uint8_t missing_type = GetMissingType(decision_type_[node]);
  bool default_left = GetDecisionType(decision_type_[node], kDefaultLeftMask);
Nikita Titov's avatar
Nikita Titov committed
341
  if (missing_type == MissingType::None
342
      || (missing_type == MissingType::Zero && default_left && kZeroThreshold < threshold_[node])) {
Guolin Ke's avatar
Guolin Ke committed
343
    str_buf << "if (fval <= " << threshold_[node] << ") {";
344
  } else if (missing_type == MissingType::Zero) {
Guolin Ke's avatar
Guolin Ke committed
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
    if (default_left) {
      str_buf << "if (fval <= " << threshold_[node] << " || Tree::IsZero(fval)" << " || std::isnan(fval)) {";
    } else {
      str_buf << "if (fval <= " << threshold_[node] << " && !Tree::IsZero(fval)" << " && !std::isnan(fval)) {";
    }
  } else {
    if (default_left) {
      str_buf << "if (fval <= " << threshold_[node] << " || std::isnan(fval)) {";
    } else {
      str_buf << "if (fval <= " << threshold_[node] << " && !std::isnan(fval)) {";
    }
  }
  return str_buf.str();
}

std::string Tree::CategoricalDecisionIfElse(int node) const {
  uint8_t missing_type = GetMissingType(decision_type_[node]);
  std::stringstream str_buf;
363
  if (missing_type == MissingType::NaN) {
Guolin Ke's avatar
Guolin Ke committed
364
365
366
367
    str_buf << "if (std::isnan(fval)) { int_fval = -1; } else { int_fval = static_cast<int>(fval); }";
  } else {
    str_buf << "if (std::isnan(fval)) { int_fval = 0; } else { int_fval = static_cast<int>(fval); }";
  }
368
  int cat_idx = static_cast<int>(threshold_[node]);
369
370
371
372
  str_buf << "if (int_fval >= 0 && int_fval < 32 * (";
  str_buf << cat_boundaries_[cat_idx + 1] - cat_boundaries_[cat_idx];
  str_buf << ") && (((cat_threshold[" << cat_boundaries_[cat_idx];
  str_buf << " + int_fval / 32] >> (int_fval & 31)) & 1))) {";
Guolin Ke's avatar
Guolin Ke committed
373
374
375
  return str_buf.str();
}

Guolin Ke's avatar
Guolin Ke committed
376
std::string Tree::ToIfElse(int index, bool predict_leaf_index) const {
377
378
  std::stringstream str_buf;
  str_buf << "double PredictTree" << index;
Guolin Ke's avatar
Guolin Ke committed
379
  if (predict_leaf_index) {
380
381
382
    str_buf << "Leaf";
  }
  str_buf << "(const double* arr) { ";
383
384
  if (num_leaves_ <= 1) {
    str_buf << "return " << leaf_value_[0] << ";";
385
  } else {
386
387
388
389
390
391
392
393
    str_buf << "const std::vector<uint32_t> cat_threshold = {";
    for (size_t i = 0; i < cat_threshold_.size(); ++i) {
      if (i != 0) {
        str_buf << ",";
      }
      str_buf << cat_threshold_[i];
    }
    str_buf << "};";
394
395
396
397
398
    // use this for the missing value conversion
    str_buf << "double fval = 0.0f; ";
    if (num_cat_ > 0) {
      str_buf << "int int_fval = 0; ";
    }
Guolin Ke's avatar
Guolin Ke committed
399
    str_buf << NodeToIfElse(0, predict_leaf_index);
400
  }
401
  str_buf << " }" << '\n';
402

403
  // Predict func by Map to ifelse
404
  str_buf << "double PredictTree" << index;
Guolin Ke's avatar
Guolin Ke committed
405
  if (predict_leaf_index) {
Guolin Ke's avatar
Guolin Ke committed
406
407
408
    str_buf << "LeafByMap";
  } else {
    str_buf << "ByMap";
409
410
411
  }
  str_buf << "(const std::unordered_map<int, double>& arr) { ";
  if (num_leaves_ <= 1) {
Guolin Ke's avatar
Guolin Ke committed
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
    str_buf << "return " << leaf_value_[0] << ";";
  } else {
    str_buf << "const std::vector<uint32_t> cat_threshold = {";
    for (size_t i = 0; i < cat_threshold_.size(); ++i) {
      if (i != 0) {
        str_buf << ",";
      }
      str_buf << cat_threshold_[i];
    }
    str_buf << "};";
    // use this for the missing value conversion
    str_buf << "double fval = 0.0f; ";
    if (num_cat_ > 0) {
      str_buf << "int int_fval = 0; ";
    }
Guolin Ke's avatar
Guolin Ke committed
427
    str_buf << NodeToIfElseByMap(0, predict_leaf_index);
428
  }
429
  str_buf << " }" << '\n';
430

431
432
433
  return str_buf.str();
}

Guolin Ke's avatar
Guolin Ke committed
434
std::string Tree::NodeToIfElse(int index, bool predict_leaf_index) const {
435
436
437
438
  std::stringstream str_buf;
  str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2);
  if (index >= 0) {
    // non-leaf
439
    str_buf << "fval = arr[" << split_feature_[index] << "];";
Guolin Ke's avatar
Guolin Ke committed
440
    if (GetDecisionType(decision_type_[index], kCategoricalMask) == 0) {
441
      str_buf << NumericalDecisionIfElse(index);
442
    } else {
443
      str_buf << CategoricalDecisionIfElse(index);
444
445
    }
    // left subtree
Guolin Ke's avatar
Guolin Ke committed
446
    str_buf << NodeToIfElse(left_child_[index], predict_leaf_index);
447
448
    str_buf << " } else { ";
    // right subtree
Guolin Ke's avatar
Guolin Ke committed
449
    str_buf << NodeToIfElse(right_child_[index], predict_leaf_index);
450
451
452
453
    str_buf << " }";
  } else {
    // leaf
    str_buf << "return ";
Guolin Ke's avatar
Guolin Ke committed
454
    if (predict_leaf_index) {
455
456
457
458
459
460
461
462
463
464
      str_buf << ~index;
    } else {
      str_buf << leaf_value_[~index];
    }
    str_buf << ";";
  }

  return str_buf.str();
}

Guolin Ke's avatar
Guolin Ke committed
465
std::string Tree::NodeToIfElseByMap(int index, bool predict_leaf_index) const {
466
467
468
469
470
471
472
473
474
475
476
  std::stringstream str_buf;
  str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2);
  if (index >= 0) {
    // non-leaf
    str_buf << "fval = arr.count(" << split_feature_[index] << ") > 0 ? arr.at(" << split_feature_[index] << ") : 0.0f;";
    if (GetDecisionType(decision_type_[index], kCategoricalMask) == 0) {
      str_buf << NumericalDecisionIfElse(index);
    } else {
      str_buf << CategoricalDecisionIfElse(index);
    }
    // left subtree
Guolin Ke's avatar
Guolin Ke committed
477
    str_buf << NodeToIfElseByMap(left_child_[index], predict_leaf_index);
478
479
    str_buf << " } else { ";
    // right subtree
Guolin Ke's avatar
Guolin Ke committed
480
    str_buf << NodeToIfElseByMap(right_child_[index], predict_leaf_index);
481
482
483
484
    str_buf << " }";
  } else {
    // leaf
    str_buf << "return ";
Guolin Ke's avatar
Guolin Ke committed
485
    if (predict_leaf_index) {
486
487
488
489
490
491
492
493
494
495
      str_buf << ~index;
    } else {
      str_buf << leaf_value_[~index];
    }
    str_buf << ";";
  }

  return str_buf.str();
}

496
497
Tree::Tree(const char* str, size_t* used_len) {
  auto p = str;
Guolin Ke's avatar
Guolin Ke committed
498
  std::unordered_map<std::string, std::string> key_vals;
499
  const int max_num_line = 17;
500
501
502
503
  int read_line = 0;
  while (read_line < max_num_line) {
    if (*p == '\r' || *p == '\n') break;
    auto start = p;
504
    while (*p != '=') ++p;
505
506
507
508
509
510
511
512
513
514
515
    std::string key(start, p - start);
    ++p;
    start = p;
    while (*p != '\r' && *p != '\n') ++p;
    key_vals[key] = std::string(start, p - start);
    ++read_line;
    if (*p == '\r') ++p;
    if (*p == '\n') ++p;
  }
  *used_len = p - str;

516
  if (key_vals.count("num_leaves") <= 0) {
517
    Log::Fatal("Tree model should contain num_leaves field");
Guolin Ke's avatar
Guolin Ke committed
518
519
520
521
  }

  Common::Atoi(key_vals["num_leaves"].c_str(), &num_leaves_);

522
  if (key_vals.count("num_cat") <= 0) {
523
    Log::Fatal("Tree model should contain num_cat field");
524
525
526
527
  }

  Common::Atoi(key_vals["num_cat"].c_str(), &num_cat_);

528
  if (key_vals.count("leaf_value")) {
529
    leaf_value_ = Common::StringToArray<double>(key_vals["leaf_value"], num_leaves_);
530
531
532
  } else {
    Log::Fatal("Tree model string format error, should contain leaf_value field");
  }
533

Guolin Ke's avatar
Guolin Ke committed
534
535
536
537
538
  if (key_vals.count("shrinkage")) {
    Common::Atof(key_vals["shrinkage"].c_str(), &shrinkage_);
  } else {
    shrinkage_ = 1.0f;
  }
539

540
541
  if (num_leaves_ <= 1) { return; }

Guolin Ke's avatar
Guolin Ke committed
542
  if (key_vals.count("left_child")) {
543
    left_child_ = Common::StringToArrayFast<int>(key_vals["left_child"], num_leaves_ - 1);
Guolin Ke's avatar
Guolin Ke committed
544
545
  } else {
    Log::Fatal("Tree model string format error, should contain left_child field");
546
547
  }

Guolin Ke's avatar
Guolin Ke committed
548
  if (key_vals.count("right_child")) {
549
    right_child_ = Common::StringToArrayFast<int>(key_vals["right_child"], num_leaves_ - 1);
Guolin Ke's avatar
Guolin Ke committed
550
551
552
553
554
  } else {
    Log::Fatal("Tree model string format error, should contain right_child field");
  }

  if (key_vals.count("split_feature")) {
555
    split_feature_ = Common::StringToArrayFast<int>(key_vals["split_feature"], num_leaves_ - 1);
Guolin Ke's avatar
Guolin Ke committed
556
557
558
559
560
  } else {
    Log::Fatal("Tree model string format error, should contain split_feature field");
  }

  if (key_vals.count("threshold")) {
561
    threshold_ = Common::StringToArray<double>(key_vals["threshold"], num_leaves_ - 1);
Guolin Ke's avatar
Guolin Ke committed
562
563
564
565
566
  } else {
    Log::Fatal("Tree model string format error, should contain threshold field");
  }

  if (key_vals.count("split_gain")) {
567
    split_gain_ = Common::StringToArrayFast<float>(key_vals["split_gain"], num_leaves_ - 1);
Guolin Ke's avatar
Guolin Ke committed
568
569
570
571
572
  } else {
    split_gain_.resize(num_leaves_ - 1);
  }

  if (key_vals.count("internal_count")) {
573
    internal_count_ = Common::StringToArrayFast<int>(key_vals["internal_count"], num_leaves_ - 1);
Guolin Ke's avatar
Guolin Ke committed
574
575
576
577
578
  } else {
    internal_count_.resize(num_leaves_ - 1);
  }

  if (key_vals.count("internal_value")) {
579
    internal_value_ = Common::StringToArrayFast<double>(key_vals["internal_value"], num_leaves_ - 1);
Guolin Ke's avatar
Guolin Ke committed
580
581
582
583
  } else {
    internal_value_.resize(num_leaves_ - 1);
  }

584
585
  if (key_vals.count("internal_weight")) {
    internal_weight_ = Common::StringToArrayFast<double>(key_vals["internal_weight"], num_leaves_ - 1);
586
  } else {
587
588
589
590
    internal_weight_.resize(num_leaves_ - 1);
  }

  if (key_vals.count("leaf_weight")) {
Guolin Ke's avatar
Guolin Ke committed
591
    leaf_weight_ = Common::StringToArray<double>(key_vals["leaf_weight"], num_leaves_);
592
  } else {
593
594
595
    leaf_weight_.resize(num_leaves_);
  }

Guolin Ke's avatar
Guolin Ke committed
596
  if (key_vals.count("leaf_count")) {
597
    leaf_count_ = Common::StringToArrayFast<int>(key_vals["leaf_count"], num_leaves_);
Guolin Ke's avatar
Guolin Ke committed
598
599
600
601
602
  } else {
    leaf_count_.resize(num_leaves_);
  }

  if (key_vals.count("decision_type")) {
603
    decision_type_ = Common::StringToArrayFast<int8_t>(key_vals["decision_type"], num_leaves_ - 1);
Guolin Ke's avatar
Guolin Ke committed
604
605
606
607
  } else {
    decision_type_ = std::vector<int8_t>(num_leaves_ - 1, 0);
  }

608
609
  if (num_cat_ > 0) {
    if (key_vals.count("cat_boundaries")) {
610
      cat_boundaries_ = Common::StringToArrayFast<int>(key_vals["cat_boundaries"], num_cat_ + 1);
611
612
613
614
615
    } else {
      Log::Fatal("Tree model should contain cat_boundaries field.");
    }

    if (key_vals.count("cat_threshold")) {
616
      cat_threshold_ = Common::StringToArrayFast<uint32_t>(key_vals["cat_threshold"], cat_boundaries_.back());
617
    } else {
618
      Log::Fatal("Tree model should contain cat_threshold field");
619
620
    }
  }
621
  max_depth_ = -1;
Guolin Ke's avatar
Guolin Ke committed
622
623
}

Guolin Ke's avatar
Guolin Ke committed
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
void Tree::ExtendPath(PathElement *unique_path, int unique_depth,
                      double zero_fraction, double one_fraction, int feature_index) {
  unique_path[unique_depth].feature_index = feature_index;
  unique_path[unique_depth].zero_fraction = zero_fraction;
  unique_path[unique_depth].one_fraction = one_fraction;
  unique_path[unique_depth].pweight = (unique_depth == 0 ? 1 : 0);
  for (int i = unique_depth - 1; i >= 0; i--) {
    unique_path[i + 1].pweight += one_fraction*unique_path[i].pweight*(i + 1)
      / static_cast<double>(unique_depth + 1);
    unique_path[i].pweight = zero_fraction*unique_path[i].pweight*(unique_depth - i)
      / static_cast<double>(unique_depth + 1);
  }
}

void Tree::UnwindPath(PathElement *unique_path, int unique_depth, int path_index) {
  const double one_fraction = unique_path[path_index].one_fraction;
  const double zero_fraction = unique_path[path_index].zero_fraction;
  double next_one_portion = unique_path[unique_depth].pweight;

  for (int i = unique_depth - 1; i >= 0; --i) {
    if (one_fraction != 0) {
      const double tmp = unique_path[i].pweight;
      unique_path[i].pweight = next_one_portion*(unique_depth + 1)
        / static_cast<double>((i + 1)*one_fraction);
      next_one_portion = tmp - unique_path[i].pweight*zero_fraction*(unique_depth - i)
        / static_cast<double>(unique_depth + 1);
    } else {
      unique_path[i].pweight = (unique_path[i].pweight*(unique_depth + 1))
        / static_cast<double>(zero_fraction*(unique_depth - i));
    }
  }

  for (int i = path_index; i < unique_depth; ++i) {
    unique_path[i].feature_index = unique_path[i + 1].feature_index;
    unique_path[i].zero_fraction = unique_path[i + 1].zero_fraction;
    unique_path[i].one_fraction = unique_path[i + 1].one_fraction;
  }
}

double Tree::UnwoundPathSum(const PathElement *unique_path, int unique_depth, int path_index) {
  const double one_fraction = unique_path[path_index].one_fraction;
  const double zero_fraction = unique_path[path_index].zero_fraction;
  double next_one_portion = unique_path[unique_depth].pweight;
  double total = 0;
  for (int i = unique_depth - 1; i >= 0; --i) {
    if (one_fraction != 0) {
      const double tmp = next_one_portion*(unique_depth + 1)
        / static_cast<double>((i + 1)*one_fraction);
      total += tmp;
      next_one_portion = unique_path[i].pweight - tmp*zero_fraction*((unique_depth - i)
                                                                     / static_cast<double>(unique_depth + 1));
    } else {
      total += (unique_path[i].pweight / zero_fraction) / ((unique_depth - i)
                                                           / static_cast<double>(unique_depth + 1));
    }
  }
  return total;
}

// recursive computation of SHAP values for a decision tree
void Tree::TreeSHAP(const double *feature_values, double *phi,
                    int node, int unique_depth,
                    PathElement *parent_unique_path, double parent_zero_fraction,
                    double parent_one_fraction, int parent_feature_index) const {
  // extend the unique path
689
  PathElement* unique_path = parent_unique_path + unique_depth;
690
691
692
  if (unique_depth > 0) {
    std::copy(parent_unique_path, parent_unique_path + unique_depth, unique_path);
  }
Guolin Ke's avatar
Guolin Ke committed
693
694
695
696
697
698
699
700
701
702
703
704
705
  ExtendPath(unique_path, unique_depth, parent_zero_fraction,
             parent_one_fraction, parent_feature_index);

  // leaf node
  if (node < 0) {
    for (int i = 1; i <= unique_depth; ++i) {
      const double w = UnwoundPathSum(unique_path, unique_depth, i);
      const PathElement &el = unique_path[i];
      phi[el.feature_index] += w*(el.one_fraction - el.zero_fraction)*leaf_value_[~node];
    }

    // internal node
  } else {
706
    const int hot_index = Decision(feature_values[split_feature_[node]], node);
Guolin Ke's avatar
Guolin Ke committed
707
708
709
710
711
712
713
714
715
716
717
    const int cold_index = (hot_index == left_child_[node] ? right_child_[node] : left_child_[node]);
    const double w = data_count(node);
    const double hot_zero_fraction = data_count(hot_index) / w;
    const double cold_zero_fraction = data_count(cold_index) / w;
    double incoming_zero_fraction = 1;
    double incoming_one_fraction = 1;

    // see if we have already split on this feature,
    // if so we undo that split so we can redo it for this node
    int path_index = 0;
    for (; path_index <= unique_depth; ++path_index) {
718
      if (unique_path[path_index].feature_index == split_feature_[node]) break;
Guolin Ke's avatar
Guolin Ke committed
719
720
721
722
723
724
725
726
727
    }
    if (path_index != unique_depth + 1) {
      incoming_zero_fraction = unique_path[path_index].zero_fraction;
      incoming_one_fraction = unique_path[path_index].one_fraction;
      UnwindPath(unique_path, unique_depth, path_index);
      unique_depth -= 1;
    }

    TreeSHAP(feature_values, phi, hot_index, unique_depth + 1, unique_path,
728
             hot_zero_fraction*incoming_zero_fraction, incoming_one_fraction, split_feature_[node]);
Guolin Ke's avatar
Guolin Ke committed
729
730

    TreeSHAP(feature_values, phi, cold_index, unique_depth + 1, unique_path,
731
             cold_zero_fraction*incoming_zero_fraction, 0, split_feature_[node]);
Guolin Ke's avatar
Guolin Ke committed
732
733
734
  }
}

735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
// recursive sparse computation of SHAP values for a decision tree
void Tree::TreeSHAPByMap(const std::unordered_map<int, double>& feature_values, std::unordered_map<int, double>* phi,
                         int node, int unique_depth,
                         PathElement *parent_unique_path, double parent_zero_fraction,
                         double parent_one_fraction, int parent_feature_index) const {
  // extend the unique path
  PathElement* unique_path = parent_unique_path + unique_depth;
  if (unique_depth > 0) {
    std::copy(parent_unique_path, parent_unique_path + unique_depth, unique_path);
  }
  ExtendPath(unique_path, unique_depth, parent_zero_fraction,
             parent_one_fraction, parent_feature_index);

  // leaf node
  if (node < 0) {
    for (int i = 1; i <= unique_depth; ++i) {
      const double w = UnwoundPathSum(unique_path, unique_depth, i);
      const PathElement &el = unique_path[i];
      (*phi)[el.feature_index] += w*(el.one_fraction - el.zero_fraction)*leaf_value_[~node];
    }

  // internal node
  } else {
    const int hot_index = Decision(feature_values.count(split_feature_[node]) > 0 ? feature_values.at(split_feature_[node]) : 0.0f, node);
    const int cold_index = (hot_index == left_child_[node] ? right_child_[node] : left_child_[node]);
    const double w = data_count(node);
    const double hot_zero_fraction = data_count(hot_index) / w;
    const double cold_zero_fraction = data_count(cold_index) / w;
    double incoming_zero_fraction = 1;
    double incoming_one_fraction = 1;

    // see if we have already split on this feature,
    // if so we undo that split so we can redo it for this node
    int path_index = 0;
    for (; path_index <= unique_depth; ++path_index) {
      if (unique_path[path_index].feature_index == split_feature_[node]) break;
    }
    if (path_index != unique_depth + 1) {
      incoming_zero_fraction = unique_path[path_index].zero_fraction;
      incoming_one_fraction = unique_path[path_index].one_fraction;
      UnwindPath(unique_path, unique_depth, path_index);
      unique_depth -= 1;
    }

    TreeSHAPByMap(feature_values, phi, hot_index, unique_depth + 1, unique_path,
                  hot_zero_fraction*incoming_zero_fraction, incoming_one_fraction, split_feature_[node]);

    TreeSHAPByMap(feature_values, phi, cold_index, unique_depth + 1, unique_path,
                  cold_zero_fraction*incoming_zero_fraction, 0, split_feature_[node]);
  }
}

787
788
789
790
791
double Tree::ExpectedValue() const {
  if (num_leaves_ == 1) return LeafOutput(0);
  const double total_count = internal_count_[0];
  double exp_value = 0.0;
  for (int i = 0; i < num_leaves(); ++i) {
Guolin Ke's avatar
Guolin Ke committed
792
    exp_value += (leaf_count_[i] / total_count)*LeafOutput(i);
Guolin Ke's avatar
Guolin Ke committed
793
  }
794
  return exp_value;
Guolin Ke's avatar
Guolin Ke committed
795
796
}

797
798
799
800
801
802
803
804
805
806
807
void Tree::RecomputeMaxDepth() {
  if (num_leaves_ == 1) {
    max_depth_ = 0;
  } else {
    if (leaf_depth_.size() == 0) {
      RecomputeLeafDepths(0, 0);
    }
    max_depth_ = leaf_depth_[0];
    for (int i = 1; i < num_leaves(); ++i) {
      if (max_depth_ < leaf_depth_[i]) max_depth_ = leaf_depth_[i];
    }
Guolin Ke's avatar
Guolin Ke committed
808
809
810
  }
}

Guolin Ke's avatar
Guolin Ke committed
811
}  // namespace LightGBM