"vscode:/vscode.git/clone" did not exist on "3a657c8bf245777bb3cdc6de535b851674ec278c"
tree.cpp 41.6 KB
Newer Older
1
2
3
4
/*!
 * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */
Guolin Ke's avatar
Guolin Ke committed
5
6
7
#include <LightGBM/tree.h>

#include <LightGBM/dataset.h>
8
9
#include <LightGBM/utils/common.h>
#include <LightGBM/utils/threading.h>
Guolin Ke's avatar
Guolin Ke committed
10

11
12
13
14
#include <functional>
#include <iomanip>
#include <sstream>

Guolin Ke's avatar
Guolin Ke committed
15
16
namespace LightGBM {

17
Tree::Tree(int max_leaves, bool track_branch_features, bool is_linear)
18
  :max_leaves_(max_leaves), track_branch_features_(track_branch_features) {
Guolin Ke's avatar
Guolin Ke committed
19
20
21
22
23
24
  left_child_.resize(max_leaves_ - 1);
  right_child_.resize(max_leaves_ - 1);
  split_feature_inner_.resize(max_leaves_ - 1);
  split_feature_.resize(max_leaves_ - 1);
  threshold_in_bin_.resize(max_leaves_ - 1);
  threshold_.resize(max_leaves_ - 1);
Guolin Ke's avatar
Guolin Ke committed
25
  decision_type_.resize(max_leaves_ - 1, 0);
Guolin Ke's avatar
Guolin Ke committed
26
27
28
  split_gain_.resize(max_leaves_ - 1);
  leaf_parent_.resize(max_leaves_);
  leaf_value_.resize(max_leaves_);
29
  leaf_weight_.resize(max_leaves_);
Guolin Ke's avatar
Guolin Ke committed
30
31
  leaf_count_.resize(max_leaves_);
  internal_value_.resize(max_leaves_ - 1);
32
  internal_weight_.resize(max_leaves_ - 1);
Guolin Ke's avatar
Guolin Ke committed
33
34
  internal_count_.resize(max_leaves_ - 1);
  leaf_depth_.resize(max_leaves_);
35
36
37
  if (track_branch_features_) {
    branch_features_ = std::vector<std::vector<int>>(max_leaves_);
  }
Guolin Ke's avatar
Guolin Ke committed
38
39
  // root is in the depth 0
  leaf_depth_[0] = 0;
Guolin Ke's avatar
Guolin Ke committed
40
  num_leaves_ = 1;
41
  leaf_value_[0] = 0.0f;
42
  leaf_weight_[0] = 0.0f;
Guolin Ke's avatar
Guolin Ke committed
43
  leaf_parent_[0] = -1;
Guolin Ke's avatar
Guolin Ke committed
44
  shrinkage_ = 1.0f;
45
  num_cat_ = 0;
46
47
  cat_boundaries_.push_back(0);
  cat_boundaries_inner_.push_back(0);
48
  max_depth_ = -1;
49
50
51
52
53
54
55
  is_linear_ = is_linear;
  if (is_linear_) {
    leaf_coeff_.resize(max_leaves_);
    leaf_const_ = std::vector<double>(max_leaves_, 0);
    leaf_features_.resize(max_leaves_);
    leaf_features_inner_.resize(max_leaves_);
  }
56
  #ifdef USE_CUDA
57
  is_cuda_tree_ = false;
58
  #endif  // USE_CUDA
Guolin Ke's avatar
Guolin Ke committed
59
}
Guolin Ke's avatar
Guolin Ke committed
60

61
62
int Tree::Split(int leaf, int feature, int real_feature, uint32_t threshold_bin,
                double threshold_double, double left_value, double right_value,
63
64
                int left_cnt, int right_cnt, double left_weight, double right_weight, float gain,
                MissingType missing_type, bool default_left) {
65
  Split(leaf, feature, real_feature, left_value, right_value, left_cnt, right_cnt, left_weight, right_weight, gain);
Guolin Ke's avatar
Guolin Ke committed
66
  int new_node_idx = num_leaves_ - 1;
Guolin Ke's avatar
Guolin Ke committed
67
  decision_type_[new_node_idx] = 0;
68
  SetDecisionType(&decision_type_[new_node_idx], false, kCategoricalMask);
Guolin Ke's avatar
Guolin Ke committed
69
  SetDecisionType(&decision_type_[new_node_idx], default_left, kDefaultLeftMask);
Guolin Ke's avatar
Guolin Ke committed
70
  SetMissingType(&decision_type_[new_node_idx], static_cast<int8_t>(missing_type));
Guolin Ke's avatar
Guolin Ke committed
71
  threshold_in_bin_[new_node_idx] = threshold_bin;
Guolin Ke's avatar
Guolin Ke committed
72
  threshold_[new_node_idx] = threshold_double;
73
74
75
  ++num_leaves_;
  return num_leaves_ - 1;
}
Guolin Ke's avatar
Guolin Ke committed
76

77
78
int Tree::SplitCategorical(int leaf, int feature, int real_feature, const uint32_t* threshold_bin, int num_threshold_bin,
                           const uint32_t* threshold, int num_threshold, double left_value, double right_value,
79
80
                           data_size_t left_cnt, data_size_t right_cnt, double left_weight, double right_weight, float gain, MissingType missing_type) {
  Split(leaf, feature, real_feature, left_value, right_value, left_cnt, right_cnt, left_weight, right_weight, gain);
81
82
83
  int new_node_idx = num_leaves_ - 1;
  decision_type_[new_node_idx] = 0;
  SetDecisionType(&decision_type_[new_node_idx], true, kCategoricalMask);
Guolin Ke's avatar
Guolin Ke committed
84
  SetMissingType(&decision_type_[new_node_idx], static_cast<int8_t>(missing_type));
85
86
  threshold_in_bin_[new_node_idx] = num_cat_;
  threshold_[new_node_idx] = num_cat_;
87
  ++num_cat_;
88
89
90
91
92
93
94
95
  cat_boundaries_.push_back(cat_boundaries_.back() + num_threshold);
  for (int i = 0; i < num_threshold; ++i) {
    cat_threshold_.push_back(threshold[i]);
  }
  cat_boundaries_inner_.push_back(cat_boundaries_inner_.back() + num_threshold_bin);
  for (int i = 0; i < num_threshold_bin; ++i) {
    cat_threshold_inner_.push_back(threshold_bin[i]);
  }
Guolin Ke's avatar
Guolin Ke committed
96
97
98
99
  ++num_leaves_;
  return num_leaves_ - 1;
}

Guolin Ke's avatar
Guolin Ke committed
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#define PredictionFun(niter, fidx_in_iter, start_pos, decision_fun, iter_idx, \
                      data_idx)                                               \
  std::vector<std::unique_ptr<BinIterator>> iter((niter));                    \
  for (int i = 0; i < (niter); ++i) {                                         \
    iter[i].reset(data->FeatureIterator((fidx_in_iter)));                     \
    iter[i]->Reset((start_pos));                                              \
  }                                                                           \
  for (data_size_t i = start; i < end; ++i) {                                 \
    int node = 0;                                                             \
    while (node >= 0) {                                                       \
      node = decision_fun(iter[(iter_idx)]->Get((data_idx)), node,            \
                          default_bins[node], max_bins[node]);                \
    }                                                                         \
    score[(data_idx)] += static_cast<double>(leaf_value_[~node]);             \
114
115
  }\

116
117
118
119
120
121
122
123
124
125

#define PredictionFunLinear(niter, fidx_in_iter, start_pos, decision_fun,     \
                            iter_idx, data_idx)                               \
  std::vector<std::unique_ptr<BinIterator>> iter((niter));                    \
  for (int i = 0; i < (niter); ++i) {                                         \
    iter[i].reset(data->FeatureIterator((fidx_in_iter)));                     \
    iter[i]->Reset((start_pos));                                              \
  }                                                                           \
  for (data_size_t i = start; i < end; ++i) {                                 \
    int node = 0;                                                             \
126
127
128
129
130
131
    if (num_leaves_ > 1) {                                                    \
      while (node >= 0) {                                                     \
        node = decision_fun(iter[(iter_idx)]->Get((data_idx)), node,          \
                            default_bins[node], max_bins[node]);              \
      }                                                                       \
      node = ~node;                                                           \
132
    }                                                                         \
133
    double add_score = leaf_const_[node];                                     \
134
    bool nan_found = false;                                                   \
135
136
137
    const double* coeff_ptr = leaf_coeff_[node].data();                       \
    const float** data_ptr = feat_ptr[node].data();                           \
    for (size_t j = 0; j < leaf_features_inner_[node].size(); ++j) {          \
138
139
140
141
142
143
144
145
       float feat_val = data_ptr[j][(data_idx)];                              \
       if (std::isnan(feat_val)) {                                            \
          nan_found = true;                                                   \
          break;                                                              \
       }                                                                      \
       add_score += coeff_ptr[j] * feat_val;                                  \
    }                                                                         \
    if (nan_found) {                                                          \
146
       score[(data_idx)] += leaf_value_[node];                                \
147
148
149
150
151
152
    } else {                                                                  \
      score[(data_idx)] += add_score;                                         \
    }                                                                         \
}\


153
void Tree::AddPredictionToScore(const Dataset* data, data_size_t num_data, double* score) const {
154
  if (!is_linear_ && num_leaves_ <= 1) {
Guolin Ke's avatar
Guolin Ke committed
155
    if (leaf_value_[0] != 0.0f) {
156
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_data >= 1024)
Guolin Ke's avatar
Guolin Ke committed
157
158
159
160
161
162
      for (data_size_t i = 0; i < num_data; ++i) {
        score[i] += leaf_value_[0];
      }
    }
    return;
  }
Guolin Ke's avatar
Guolin Ke committed
163
164
165
166
167
168
169
170
  std::vector<uint32_t> default_bins(num_leaves_ - 1);
  std::vector<uint32_t> max_bins(num_leaves_ - 1);
  for (int i = 0; i < num_leaves_ - 1; ++i) {
    const int fidx = split_feature_inner_[i];
    auto bin_mapper = data->FeatureBinMapper(fidx);
    default_bins[i] = bin_mapper->GetDefaultBin();
    max_bins[i] = bin_mapper->num_bin() - 1;
  }
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
  if (is_linear_) {
    std::vector<std::vector<const float*>> feat_ptr(num_leaves_);
    for (int leaf_num = 0; leaf_num < num_leaves_; ++leaf_num) {
      for (int feat : leaf_features_inner_[leaf_num]) {
        feat_ptr[leaf_num].push_back(data->raw_index(feat));
      }
    }
    if (num_cat_ > 0) {
      if (data->num_features() > num_leaves_ - 1) {
        Threading::For<data_size_t>(0, num_data, 512, [this, &data, score, &default_bins, &max_bins, &feat_ptr]
        (int, data_size_t start, data_size_t end) {
          PredictionFunLinear(num_leaves_ - 1, split_feature_inner_[i], start, DecisionInner, node, i);
        });
      } else {
        Threading::For<data_size_t>(0, num_data, 512, [this, &data, score, &default_bins, &max_bins, &feat_ptr]
        (int, data_size_t start, data_size_t end) {
          PredictionFunLinear(data->num_features(), i, start, DecisionInner, split_feature_inner_[node], i);
        });
      }
190
    } else {
191
192
193
194
195
196
197
198
199
200
201
      if (data->num_features() > num_leaves_ - 1) {
        Threading::For<data_size_t>(0, num_data, 512, [this, &data, score, &default_bins, &max_bins, &feat_ptr]
        (int, data_size_t start, data_size_t end) {
          PredictionFunLinear(num_leaves_ - 1, split_feature_inner_[i], start, NumericalDecisionInner, node, i);
        });
      } else {
        Threading::For<data_size_t>(0, num_data, 512, [this, &data, score, &default_bins, &max_bins, &feat_ptr]
        (int, data_size_t start, data_size_t end) {
          PredictionFunLinear(data->num_features(), i, start, NumericalDecisionInner, split_feature_inner_[node], i);
        });
      }
202
    }
Guolin Ke's avatar
Guolin Ke committed
203
  } else {
204
205
206
207
208
209
210
211
212
213
214
215
    if (num_cat_ > 0) {
      if (data->num_features() > num_leaves_ - 1) {
        Threading::For<data_size_t>(0, num_data, 512, [this, &data, score, &default_bins, &max_bins]
        (int, data_size_t start, data_size_t end) {
          PredictionFun(num_leaves_ - 1, split_feature_inner_[i], start, DecisionInner, node, i);
        });
      } else {
        Threading::For<data_size_t>(0, num_data, 512, [this, &data, score, &default_bins, &max_bins]
        (int, data_size_t start, data_size_t end) {
          PredictionFun(data->num_features(), i, start, DecisionInner, split_feature_inner_[node], i);
        });
      }
216
    } else {
217
218
219
220
221
222
223
224
225
226
227
      if (data->num_features() > num_leaves_ - 1) {
        Threading::For<data_size_t>(0, num_data, 512, [this, &data, score, &default_bins, &max_bins]
        (int, data_size_t start, data_size_t end) {
          PredictionFun(num_leaves_ - 1, split_feature_inner_[i], start, NumericalDecisionInner, node, i);
        });
      } else {
        Threading::For<data_size_t>(0, num_data, 512, [this, &data, score, &default_bins, &max_bins]
        (int, data_size_t start, data_size_t end) {
          PredictionFun(data->num_features(), i, start, NumericalDecisionInner, split_feature_inner_[node], i);
        });
      }
228
    }
Guolin Ke's avatar
Guolin Ke committed
229
  }
Guolin Ke's avatar
Guolin Ke committed
230
231
}

Guolin Ke's avatar
Guolin Ke committed
232
void Tree::AddPredictionToScore(const Dataset* data,
233
234
235
  const data_size_t* used_data_indices,
  data_size_t num_data, double* score) const {
  if (!is_linear_ && num_leaves_ <= 1) {
Guolin Ke's avatar
Guolin Ke committed
236
    if (leaf_value_[0] != 0.0f) {
237
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_data >= 1024)
Guolin Ke's avatar
Guolin Ke committed
238
239
240
241
      for (data_size_t i = 0; i < num_data; ++i) {
        score[used_data_indices[i]] += leaf_value_[0];
      }
    }
Guolin Ke's avatar
Guolin Ke committed
242
    return;
Guolin Ke's avatar
Guolin Ke committed
243
  }
Guolin Ke's avatar
Guolin Ke committed
244
245
246
247
248
249
250
251
  std::vector<uint32_t> default_bins(num_leaves_ - 1);
  std::vector<uint32_t> max_bins(num_leaves_ - 1);
  for (int i = 0; i < num_leaves_ - 1; ++i) {
    const int fidx = split_feature_inner_[i];
    auto bin_mapper = data->FeatureBinMapper(fidx);
    default_bins[i] = bin_mapper->GetDefaultBin();
    max_bins[i] = bin_mapper->num_bin() - 1;
  }
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
  if (is_linear_) {
    std::vector<std::vector<const float*>> feat_ptr(num_leaves_);
    for (int leaf_num = 0; leaf_num < num_leaves_; ++leaf_num) {
      for (int feat : leaf_features_inner_[leaf_num]) {
        feat_ptr[leaf_num].push_back(data->raw_index(feat));
      }
    }
    if (num_cat_ > 0) {
      if (data->num_features() > num_leaves_ - 1) {
        Threading::For<data_size_t>(0, num_data, 512, [this, &data, score, used_data_indices, &default_bins, &max_bins, &feat_ptr]
        (int, data_size_t start, data_size_t end) {
          PredictionFunLinear(num_leaves_ - 1, split_feature_inner_[i], used_data_indices[start], DecisionInner,
                              node, used_data_indices[i]);
        });
      } else {
        Threading::For<data_size_t>(0, num_data, 512, [this, &data, score, used_data_indices, &default_bins, &max_bins, &feat_ptr]
        (int, data_size_t start, data_size_t end) {
          PredictionFunLinear(data->num_features(), i, used_data_indices[start], DecisionInner, split_feature_inner_[node], used_data_indices[i]);
        });
      }
272
    } else {
273
274
275
276
277
278
279
280
281
282
283
284
285
      if (data->num_features() > num_leaves_ - 1) {
        Threading::For<data_size_t>(0, num_data, 512, [this, &data, score, used_data_indices, &default_bins, &max_bins, &feat_ptr]
        (int, data_size_t start, data_size_t end) {
          PredictionFunLinear(num_leaves_ - 1, split_feature_inner_[i], used_data_indices[start], NumericalDecisionInner,
                              node, used_data_indices[i]);
        });
      } else {
        Threading::For<data_size_t>(0, num_data, 512, [this, &data, score, used_data_indices, &default_bins, &max_bins, &feat_ptr]
        (int, data_size_t start, data_size_t end) {
          PredictionFunLinear(data->num_features(), i, used_data_indices[start], NumericalDecisionInner,
                              split_feature_inner_[node], used_data_indices[i]);
        });
      }
286
    }
Guolin Ke's avatar
Guolin Ke committed
287
  } else {
288
289
290
291
292
293
294
295
296
297
298
299
    if (num_cat_ > 0) {
      if (data->num_features() > num_leaves_ - 1) {
        Threading::For<data_size_t>(0, num_data, 512, [this, &data, score, used_data_indices, &default_bins, &max_bins]
        (int, data_size_t start, data_size_t end) {
          PredictionFun(num_leaves_ - 1, split_feature_inner_[i], used_data_indices[start], DecisionInner, node, used_data_indices[i]);
        });
      } else {
        Threading::For<data_size_t>(0, num_data, 512, [this, &data, score, used_data_indices, &default_bins, &max_bins]
        (int, data_size_t start, data_size_t end) {
          PredictionFun(data->num_features(), i, used_data_indices[start], DecisionInner, split_feature_inner_[node], used_data_indices[i]);
        });
      }
300
    } else {
301
302
303
304
305
306
307
308
309
310
311
      if (data->num_features() > num_leaves_ - 1) {
        Threading::For<data_size_t>(0, num_data, 512, [this, &data, score, used_data_indices, &default_bins, &max_bins]
        (int, data_size_t start, data_size_t end) {
          PredictionFun(num_leaves_ - 1, split_feature_inner_[i], used_data_indices[start], NumericalDecisionInner, node, used_data_indices[i]);
        });
      } else {
        Threading::For<data_size_t>(0, num_data, 512, [this, &data, score, used_data_indices, &default_bins, &max_bins]
        (int, data_size_t start, data_size_t end) {
          PredictionFun(data->num_features(), i, used_data_indices[start], NumericalDecisionInner, split_feature_inner_[node], used_data_indices[i]);
        });
      }
312
    }
Guolin Ke's avatar
Guolin Ke committed
313
  }
Guolin Ke's avatar
Guolin Ke committed
314
315
}

316
#undef PredictionFun
317
#undef PredictionFunLinear
318

319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
double Tree::GetUpperBoundValue() const {
  double upper_bound = leaf_value_[0];
  for (int i = 1; i < num_leaves_; ++i) {
    if (leaf_value_[i] > upper_bound) {
      upper_bound = leaf_value_[i];
    }
  }
  return upper_bound;
}

double Tree::GetLowerBoundValue() const {
  double lower_bound = leaf_value_[0];
  for (int i = 1; i < num_leaves_; ++i) {
    if (leaf_value_[i] < lower_bound) {
      lower_bound = leaf_value_[i];
    }
  }
  return lower_bound;
}

Guolin Ke's avatar
Guolin Ke committed
339
std::string Tree::ToString() const {
340
  std::stringstream str_buf;
341
342
343
344
  Common::C_stringstream(str_buf);

  using CommonC::ArrayToString;

345
346
  str_buf << "num_leaves=" << num_leaves_ << '\n';
  str_buf << "num_cat=" << num_cat_ << '\n';
347
  str_buf << "split_feature="
348
    << ArrayToString(split_feature_, num_leaves_ - 1) << '\n';
349
  str_buf << "split_gain="
350
    << ArrayToString(split_gain_, num_leaves_ - 1) << '\n';
351
  str_buf << "threshold="
352
    << ArrayToString<true>(threshold_, num_leaves_ - 1) << '\n';
353
  str_buf << "decision_type="
354
    << ArrayToString(Common::ArrayCast<int8_t, int>(decision_type_), num_leaves_ - 1) << '\n';
355
  str_buf << "left_child="
356
    << ArrayToString(left_child_, num_leaves_ - 1) << '\n';
357
  str_buf << "right_child="
358
    << ArrayToString(right_child_, num_leaves_ - 1) << '\n';
359
  str_buf << "leaf_value="
360
    << ArrayToString<true>(leaf_value_, num_leaves_) << '\n';
361
  str_buf << "leaf_weight="
362
    << ArrayToString<true>(leaf_weight_, num_leaves_) << '\n';
363
  str_buf << "leaf_count="
364
    << ArrayToString(leaf_count_, num_leaves_) << '\n';
365
  str_buf << "internal_value="
366
    << ArrayToString(internal_value_, num_leaves_ - 1) << '\n';
367
  str_buf << "internal_weight="
368
    << ArrayToString(internal_weight_, num_leaves_ - 1) << '\n';
369
  str_buf << "internal_count="
370
    << ArrayToString(internal_count_, num_leaves_ - 1) << '\n';
371
372
  if (num_cat_ > 0) {
    str_buf << "cat_boundaries="
373
      << ArrayToString(cat_boundaries_, num_cat_ + 1) << '\n';
374
    str_buf << "cat_threshold="
375
      << ArrayToString(cat_threshold_, cat_threshold_.size()) << '\n';
376
  }
377
378
379
380
  str_buf << "is_linear=" << is_linear_ << '\n';

  if (is_linear_) {
    str_buf << "leaf_const="
381
      << ArrayToString<true>(leaf_const_, num_leaves_) << '\n';
382
383
    std::vector<int> num_feat(num_leaves_);
    for (int i = 0; i < num_leaves_; ++i) {
384
      num_feat[i] = static_cast<int>(leaf_coeff_[i].size());
385
386
387
388
389
390
391
392
393
394
395
396
397
398
    }
    str_buf << "num_features="
      << ArrayToString(num_feat, num_leaves_) << '\n';
    str_buf << "leaf_features=";
    for (int i = 0; i < num_leaves_; ++i) {
      if (num_feat[i] > 0) {
        str_buf << ArrayToString(leaf_features_[i], leaf_features_[i].size()) << ' ';
      }
      str_buf << ' ';
    }
    str_buf << '\n';
    str_buf << "leaf_coeff=";
    for (int i = 0; i < num_leaves_; ++i) {
      if (num_feat[i] > 0) {
399
        str_buf << ArrayToString<true>(leaf_coeff_[i], leaf_coeff_[i].size()) << ' ';
400
401
402
403
404
      }
      str_buf << ' ';
    }
    str_buf << '\n';
  }
405
406
  str_buf << "shrinkage=" << shrinkage_ << '\n';
  str_buf << '\n';
407

408
  return str_buf.str();
Guolin Ke's avatar
Guolin Ke committed
409
410
}

Guolin Ke's avatar
Guolin Ke committed
411
std::string Tree::ToJSON() const {
412
  std::stringstream str_buf;
413
  Common::C_stringstream(str_buf);
414
  str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2);
415
416
417
  str_buf << "\"num_leaves\":" << num_leaves_ << "," << '\n';
  str_buf << "\"num_cat\":" << num_cat_ << "," << '\n';
  str_buf << "\"shrinkage\":" << shrinkage_ << "," << '\n';
wxchan's avatar
wxchan committed
418
  if (num_leaves_ == 1) {
419
420
421
422
423
424
    if (is_linear_) {
      str_buf << "\"tree_structure\":{" << "\"leaf_value\":" << leaf_value_[0] << ", " << "\n";
      str_buf << LinearModelToJSON(0) << "}" << "\n";
    } else {
      str_buf << "\"tree_structure\":{" << "\"leaf_value\":" << leaf_value_[0] << "}" << '\n';
    }
wxchan's avatar
wxchan committed
425
  } else {
426
    str_buf << "\"tree_structure\":" << NodeToJSON(0) << '\n';
wxchan's avatar
wxchan committed
427
  }
428
429
  return str_buf.str();
}
wxchan's avatar
wxchan committed
430

431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
std::string Tree::LinearModelToJSON(int index) const {
  std::stringstream str_buf;
  Common::C_stringstream(str_buf);
  str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2);
  str_buf << "\"leaf_const\":" << leaf_const_[index] << "," << "\n";
  int num_features = static_cast<int>(leaf_features_[index].size());
  if (num_features > 0) {
    str_buf << "\"leaf_features\":[";
    for (int i = 0; i < num_features - 1; ++i) {
      str_buf << leaf_features_[index][i] << ", ";
    }
    str_buf << leaf_features_[index][num_features - 1] << "]" << ", " << "\n";
    str_buf << "\"leaf_coeff\":[";
    for (int i = 0; i < num_features - 1; ++i) {
      str_buf << leaf_coeff_[index][i] << ", ";
    }
    str_buf << leaf_coeff_[index][num_features - 1] << "]" << "\n";
  } else {
    str_buf << "\"leaf_features\":[],\n";
    str_buf << "\"leaf_coeff\":[]\n";
  }
452
  return str_buf.str();
wxchan's avatar
wxchan committed
453
454
}

Guolin Ke's avatar
Guolin Ke committed
455
std::string Tree::NodeToJSON(int index) const {
456
  std::stringstream str_buf;
457
  Common::C_stringstream(str_buf);
458
  str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2);
wxchan's avatar
wxchan committed
459
460
  if (index >= 0) {
    // non-leaf
461
462
463
    str_buf << "{" << '\n';
    str_buf << "\"split_index\":" << index << "," << '\n';
    str_buf << "\"split_feature\":" << split_feature_[index] << "," << '\n';
Guolin Ke's avatar
Guolin Ke committed
464
    str_buf << "\"split_gain\":" << Common::AvoidInf(split_gain_[index]) << "," << '\n';
465
    if (GetDecisionType(decision_type_[index], kCategoricalMask)) {
466
467
468
469
470
471
472
473
474
475
476
      int cat_idx = static_cast<int>(threshold_[index]);
      std::vector<int> cats;
      for (int i = cat_boundaries_[cat_idx]; i < cat_boundaries_[cat_idx + 1]; ++i) {
        for (int j = 0; j < 32; ++j) {
          int cat = (i - cat_boundaries_[cat_idx]) * 32 + j;
          if (Common::FindInBitset(cat_threshold_.data() + cat_boundaries_[cat_idx],
                                   cat_boundaries_[cat_idx + 1] - cat_boundaries_[cat_idx], cat)) {
            cats.push_back(cat);
          }
        }
      }
477
      str_buf << "\"threshold\":\"" << CommonC::Join(cats, "||") << "\"," << '\n';
478
      str_buf << "\"decision_type\":\"==\"," << '\n';
479
    } else {
480
481
      str_buf << "\"threshold\":" << Common::AvoidInf(threshold_[index]) << "," << '\n';
      str_buf << "\"decision_type\":\"<=\"," << '\n';
482
483
    }
    if (GetDecisionType(decision_type_[index], kDefaultLeftMask)) {
484
      str_buf << "\"default_left\":true," << '\n';
485
    } else {
486
      str_buf << "\"default_left\":false," << '\n';
487
488
    }
    uint8_t missing_type = GetMissingType(decision_type_[index]);
489
    if (missing_type == MissingType::None) {
490
      str_buf << "\"missing_type\":\"None\"," << '\n';
491
    } else if (missing_type == MissingType::Zero) {
492
      str_buf << "\"missing_type\":\"Zero\"," << '\n';
493
    } else {
494
      str_buf << "\"missing_type\":\"NaN\"," << '\n';
495
    }
496
    str_buf << "\"internal_value\":" << internal_value_[index] << "," << '\n';
497
    str_buf << "\"internal_weight\":" << internal_weight_[index] << "," << '\n';
498
499
500
    str_buf << "\"internal_count\":" << internal_count_[index] << "," << '\n';
    str_buf << "\"left_child\":" << NodeToJSON(left_child_[index]) << "," << '\n';
    str_buf << "\"right_child\":" << NodeToJSON(right_child_[index]) << '\n';
501
    str_buf << "}";
wxchan's avatar
wxchan committed
502
503
504
  } else {
    // leaf
    index = ~index;
505
506
507
    str_buf << "{" << '\n';
    str_buf << "\"leaf_index\":" << index << "," << '\n';
    str_buf << "\"leaf_value\":" << leaf_value_[index] << "," << '\n';
508
    str_buf << "\"leaf_weight\":" << leaf_weight_[index] << "," << '\n';
509
510
511
512
513
514
    if (is_linear_) {
      str_buf << "\"leaf_count\":" << leaf_count_[index] << "," << '\n';
      str_buf << LinearModelToJSON(index);
    } else {
      str_buf << "\"leaf_count\":" << leaf_count_[index] << '\n';
    }
515
    str_buf << "}";
wxchan's avatar
wxchan committed
516
  }
517
  return str_buf.str();
wxchan's avatar
wxchan committed
518
519
}

Guolin Ke's avatar
Guolin Ke committed
520
521
std::string Tree::NumericalDecisionIfElse(int node) const {
  std::stringstream str_buf;
522
  Common::C_stringstream(str_buf);
523
  str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2);
Guolin Ke's avatar
Guolin Ke committed
524
525
  uint8_t missing_type = GetMissingType(decision_type_[node]);
  bool default_left = GetDecisionType(decision_type_[node], kDefaultLeftMask);
José Morales's avatar
José Morales committed
526
527
528
529
  if (missing_type != MissingType::NaN) {
    str_buf << "if (std::isnan(fval)) fval = 0.0;";
  }
  if (missing_type == MissingType::Zero) {
Guolin Ke's avatar
Guolin Ke committed
530
    if (default_left) {
José Morales's avatar
José Morales committed
531
      str_buf << "if (Tree::IsZero(fval)) {";
Guolin Ke's avatar
Guolin Ke committed
532
    } else {
José Morales's avatar
José Morales committed
533
      str_buf << "if (!Tree::IsZero(fval)) {";
Guolin Ke's avatar
Guolin Ke committed
534
    }
José Morales's avatar
José Morales committed
535
  } else if (missing_type == MissingType::NaN) {
Guolin Ke's avatar
Guolin Ke committed
536
    if (default_left) {
José Morales's avatar
José Morales committed
537
      str_buf << "if (std::isnan(fval)) {";
Guolin Ke's avatar
Guolin Ke committed
538
    } else {
José Morales's avatar
José Morales committed
539
      str_buf << "if (!std::isnan(fval)) {";
Guolin Ke's avatar
Guolin Ke committed
540
    }
José Morales's avatar
José Morales committed
541
542
  } else {
    str_buf << "if (fval <= " << threshold_[node] << ") {";
Guolin Ke's avatar
Guolin Ke committed
543
544
545
546
547
548
  }
  return str_buf.str();
}

std::string Tree::CategoricalDecisionIfElse(int node) const {
  std::stringstream str_buf;
549
  Common::C_stringstream(str_buf);
550
  int cat_idx = static_cast<int>(threshold_[node]);
José Morales's avatar
José Morales committed
551
  str_buf << "if (std::isnan(fval)) { int_fval = -1; } else { int_fval = static_cast<int>(fval); }";
552
553
554
555
  str_buf << "if (int_fval >= 0 && int_fval < 32 * (";
  str_buf << cat_boundaries_[cat_idx + 1] - cat_boundaries_[cat_idx];
  str_buf << ") && (((cat_threshold[" << cat_boundaries_[cat_idx];
  str_buf << " + int_fval / 32] >> (int_fval & 31)) & 1))) {";
Guolin Ke's avatar
Guolin Ke committed
556
557
558
  return str_buf.str();
}

Guolin Ke's avatar
Guolin Ke committed
559
std::string Tree::ToIfElse(int index, bool predict_leaf_index) const {
560
  std::stringstream str_buf;
561
  Common::C_stringstream(str_buf);
562
  str_buf << "double PredictTree" << index;
Guolin Ke's avatar
Guolin Ke committed
563
  if (predict_leaf_index) {
564
565
566
    str_buf << "Leaf";
  }
  str_buf << "(const double* arr) { ";
567
568
  if (num_leaves_ <= 1) {
    str_buf << "return " << leaf_value_[0] << ";";
569
  } else {
570
571
572
573
574
575
576
577
    str_buf << "const std::vector<uint32_t> cat_threshold = {";
    for (size_t i = 0; i < cat_threshold_.size(); ++i) {
      if (i != 0) {
        str_buf << ",";
      }
      str_buf << cat_threshold_[i];
    }
    str_buf << "};";
578
579
580
581
582
    // use this for the missing value conversion
    str_buf << "double fval = 0.0f; ";
    if (num_cat_ > 0) {
      str_buf << "int int_fval = 0; ";
    }
Guolin Ke's avatar
Guolin Ke committed
583
    str_buf << NodeToIfElse(0, predict_leaf_index);
584
  }
585
  str_buf << " }" << '\n';
586

587
  // Predict func by Map to ifelse
588
  str_buf << "double PredictTree" << index;
Guolin Ke's avatar
Guolin Ke committed
589
  if (predict_leaf_index) {
Guolin Ke's avatar
Guolin Ke committed
590
591
592
    str_buf << "LeafByMap";
  } else {
    str_buf << "ByMap";
593
594
595
  }
  str_buf << "(const std::unordered_map<int, double>& arr) { ";
  if (num_leaves_ <= 1) {
Guolin Ke's avatar
Guolin Ke committed
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
    str_buf << "return " << leaf_value_[0] << ";";
  } else {
    str_buf << "const std::vector<uint32_t> cat_threshold = {";
    for (size_t i = 0; i < cat_threshold_.size(); ++i) {
      if (i != 0) {
        str_buf << ",";
      }
      str_buf << cat_threshold_[i];
    }
    str_buf << "};";
    // use this for the missing value conversion
    str_buf << "double fval = 0.0f; ";
    if (num_cat_ > 0) {
      str_buf << "int int_fval = 0; ";
    }
Guolin Ke's avatar
Guolin Ke committed
611
    str_buf << NodeToIfElseByMap(0, predict_leaf_index);
612
  }
613
  str_buf << " }" << '\n';
614

615
616
617
  return str_buf.str();
}

Guolin Ke's avatar
Guolin Ke committed
618
std::string Tree::NodeToIfElse(int index, bool predict_leaf_index) const {
619
  std::stringstream str_buf;
620
  Common::C_stringstream(str_buf);
621
622
623
  str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2);
  if (index >= 0) {
    // non-leaf
624
    str_buf << "fval = arr[" << split_feature_[index] << "];";
Guolin Ke's avatar
Guolin Ke committed
625
    if (GetDecisionType(decision_type_[index], kCategoricalMask) == 0) {
626
      str_buf << NumericalDecisionIfElse(index);
627
    } else {
628
      str_buf << CategoricalDecisionIfElse(index);
629
630
    }
    // left subtree
Guolin Ke's avatar
Guolin Ke committed
631
    str_buf << NodeToIfElse(left_child_[index], predict_leaf_index);
632
633
    str_buf << " } else { ";
    // right subtree
Guolin Ke's avatar
Guolin Ke committed
634
    str_buf << NodeToIfElse(right_child_[index], predict_leaf_index);
635
636
637
638
    str_buf << " }";
  } else {
    // leaf
    str_buf << "return ";
Guolin Ke's avatar
Guolin Ke committed
639
    if (predict_leaf_index) {
640
641
642
643
644
645
646
647
648
649
      str_buf << ~index;
    } else {
      str_buf << leaf_value_[~index];
    }
    str_buf << ";";
  }

  return str_buf.str();
}

Guolin Ke's avatar
Guolin Ke committed
650
std::string Tree::NodeToIfElseByMap(int index, bool predict_leaf_index) const {
651
  std::stringstream str_buf;
652
  Common::C_stringstream(str_buf);
653
654
655
656
657
658
659
660
661
662
  str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2);
  if (index >= 0) {
    // non-leaf
    str_buf << "fval = arr.count(" << split_feature_[index] << ") > 0 ? arr.at(" << split_feature_[index] << ") : 0.0f;";
    if (GetDecisionType(decision_type_[index], kCategoricalMask) == 0) {
      str_buf << NumericalDecisionIfElse(index);
    } else {
      str_buf << CategoricalDecisionIfElse(index);
    }
    // left subtree
Guolin Ke's avatar
Guolin Ke committed
663
    str_buf << NodeToIfElseByMap(left_child_[index], predict_leaf_index);
664
665
    str_buf << " } else { ";
    // right subtree
Guolin Ke's avatar
Guolin Ke committed
666
    str_buf << NodeToIfElseByMap(right_child_[index], predict_leaf_index);
667
668
669
670
    str_buf << " }";
  } else {
    // leaf
    str_buf << "return ";
Guolin Ke's avatar
Guolin Ke committed
671
    if (predict_leaf_index) {
672
673
674
675
676
677
678
679
680
681
      str_buf << ~index;
    } else {
      str_buf << leaf_value_[~index];
    }
    str_buf << ";";
  }

  return str_buf.str();
}

682
683
Tree::Tree(const char* str, size_t* used_len) {
  auto p = str;
Guolin Ke's avatar
Guolin Ke committed
684
  std::unordered_map<std::string, std::string> key_vals;
685
  const int max_num_line = 22;
686
687
688
689
  int read_line = 0;
  while (read_line < max_num_line) {
    if (*p == '\r' || *p == '\n') break;
    auto start = p;
690
    while (*p != '=') ++p;
691
692
693
694
695
696
697
698
699
700
701
    std::string key(start, p - start);
    ++p;
    start = p;
    while (*p != '\r' && *p != '\n') ++p;
    key_vals[key] = std::string(start, p - start);
    ++read_line;
    if (*p == '\r') ++p;
    if (*p == '\n') ++p;
  }
  *used_len = p - str;

702
  if (key_vals.count("num_leaves") <= 0) {
703
    Log::Fatal("Tree model should contain num_leaves field");
Guolin Ke's avatar
Guolin Ke committed
704
705
706
707
  }

  Common::Atoi(key_vals["num_leaves"].c_str(), &num_leaves_);

708
  if (key_vals.count("num_cat") <= 0) {
709
    Log::Fatal("Tree model should contain num_cat field");
710
711
712
713
  }

  Common::Atoi(key_vals["num_cat"].c_str(), &num_cat_);

714
  if (key_vals.count("leaf_value")) {
715
    leaf_value_ = CommonC::StringToArray<double>(key_vals["leaf_value"], num_leaves_);
716
717
718
  } else {
    Log::Fatal("Tree model string format error, should contain leaf_value field");
  }
719

Guolin Ke's avatar
Guolin Ke committed
720
  if (key_vals.count("shrinkage")) {
721
    CommonC::Atof(key_vals["shrinkage"].c_str(), &shrinkage_);
Guolin Ke's avatar
Guolin Ke committed
722
723
724
  } else {
    shrinkage_ = 1.0f;
  }
725

726
727
728
729
  if (key_vals.count("is_linear")) {
    int is_linear_int;
    Common::Atoi(key_vals["is_linear"].c_str(), &is_linear_int);
    is_linear_ = static_cast<bool>(is_linear_int);
730
731
  } else {
    is_linear_ = false;
732
733
  }

734
  #ifdef USE_CUDA
735
  is_cuda_tree_ = false;
736
  #endif  // USE_CUDA
737

738
739
740
  if ((num_leaves_ <= 1) && !is_linear_) {
    return;
  }
741

Guolin Ke's avatar
Guolin Ke committed
742
  if (key_vals.count("left_child")) {
743
    left_child_ = CommonC::StringToArrayFast<int>(key_vals["left_child"], num_leaves_ - 1);
Guolin Ke's avatar
Guolin Ke committed
744
745
  } else {
    Log::Fatal("Tree model string format error, should contain left_child field");
746
747
  }

Guolin Ke's avatar
Guolin Ke committed
748
  if (key_vals.count("right_child")) {
749
    right_child_ = CommonC::StringToArrayFast<int>(key_vals["right_child"], num_leaves_ - 1);
Guolin Ke's avatar
Guolin Ke committed
750
751
752
753
754
  } else {
    Log::Fatal("Tree model string format error, should contain right_child field");
  }

  if (key_vals.count("split_feature")) {
755
    split_feature_ = CommonC::StringToArrayFast<int>(key_vals["split_feature"], num_leaves_ - 1);
Guolin Ke's avatar
Guolin Ke committed
756
757
758
759
760
  } else {
    Log::Fatal("Tree model string format error, should contain split_feature field");
  }

  if (key_vals.count("threshold")) {
761
    threshold_ = CommonC::StringToArray<double>(key_vals["threshold"], num_leaves_ - 1);
Guolin Ke's avatar
Guolin Ke committed
762
763
764
765
766
  } else {
    Log::Fatal("Tree model string format error, should contain threshold field");
  }

  if (key_vals.count("split_gain")) {
767
    split_gain_ = CommonC::StringToArrayFast<float>(key_vals["split_gain"], num_leaves_ - 1);
Guolin Ke's avatar
Guolin Ke committed
768
769
770
771
772
  } else {
    split_gain_.resize(num_leaves_ - 1);
  }

  if (key_vals.count("internal_count")) {
773
    internal_count_ = CommonC::StringToArrayFast<int>(key_vals["internal_count"], num_leaves_ - 1);
Guolin Ke's avatar
Guolin Ke committed
774
775
776
777
778
  } else {
    internal_count_.resize(num_leaves_ - 1);
  }

  if (key_vals.count("internal_value")) {
779
    internal_value_ = CommonC::StringToArrayFast<double>(key_vals["internal_value"], num_leaves_ - 1);
Guolin Ke's avatar
Guolin Ke committed
780
781
782
783
  } else {
    internal_value_.resize(num_leaves_ - 1);
  }

784
  if (key_vals.count("internal_weight")) {
785
    internal_weight_ = CommonC::StringToArrayFast<double>(key_vals["internal_weight"], num_leaves_ - 1);
786
  } else {
787
788
789
790
    internal_weight_.resize(num_leaves_ - 1);
  }

  if (key_vals.count("leaf_weight")) {
791
    leaf_weight_ = CommonC::StringToArray<double>(key_vals["leaf_weight"], num_leaves_);
792
  } else {
793
794
795
    leaf_weight_.resize(num_leaves_);
  }

Guolin Ke's avatar
Guolin Ke committed
796
  if (key_vals.count("leaf_count")) {
797
    leaf_count_ = CommonC::StringToArrayFast<int>(key_vals["leaf_count"], num_leaves_);
Guolin Ke's avatar
Guolin Ke committed
798
799
800
801
802
  } else {
    leaf_count_.resize(num_leaves_);
  }

  if (key_vals.count("decision_type")) {
803
    decision_type_ = CommonC::StringToArrayFast<int8_t>(key_vals["decision_type"], num_leaves_ - 1);
Guolin Ke's avatar
Guolin Ke committed
804
805
806
807
  } else {
    decision_type_ = std::vector<int8_t>(num_leaves_ - 1, 0);
  }

808
809
  if (is_linear_) {
    if (key_vals.count("leaf_const")) {
810
      leaf_const_ = Common::StringToArray<double>(key_vals["leaf_const"], num_leaves_);
811
812
813
814
815
816
817
818
819
820
821
822
    } else {
      leaf_const_.resize(num_leaves_);
    }
    std::vector<int> num_feat;
    if (key_vals.count("num_features")) {
      num_feat = Common::StringToArrayFast<int>(key_vals["num_features"], num_leaves_);
    }
    leaf_coeff_.resize(num_leaves_);
    leaf_features_.resize(num_leaves_);
    leaf_features_inner_.resize(num_leaves_);
    if (num_feat.size() > 0) {
      int total_num_feat = 0;
823
824
825
      for (size_t i = 0; i < num_feat.size(); ++i) {
        total_num_feat += num_feat[i];
      }
826
827
828
829
830
831
      std::vector<int> all_leaf_features;
      if (key_vals.count("leaf_features")) {
        all_leaf_features = Common::StringToArrayFast<int>(key_vals["leaf_features"], total_num_feat);
      }
      std::vector<double> all_leaf_coeff;
      if (key_vals.count("leaf_coeff")) {
832
        all_leaf_coeff = Common::StringToArray<double>(key_vals["leaf_coeff"], total_num_feat);
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
      }
      int sum_num_feat = 0;
      for (int i = 0; i < num_leaves_; ++i) {
        if (num_feat[i] > 0) {
          if (key_vals.count("leaf_features"))  {
            leaf_features_[i].assign(all_leaf_features.begin() + sum_num_feat, all_leaf_features.begin() + sum_num_feat + num_feat[i]);
          }
          if (key_vals.count("leaf_coeff")) {
            leaf_coeff_[i].assign(all_leaf_coeff.begin() + sum_num_feat, all_leaf_coeff.begin() + sum_num_feat + num_feat[i]);
          }
        }
        sum_num_feat += num_feat[i];
      }
    }
  }

849
850
  if (num_cat_ > 0) {
    if (key_vals.count("cat_boundaries")) {
851
      cat_boundaries_ = CommonC::StringToArrayFast<int>(key_vals["cat_boundaries"], num_cat_ + 1);
852
853
854
855
856
    } else {
      Log::Fatal("Tree model should contain cat_boundaries field.");
    }

    if (key_vals.count("cat_threshold")) {
857
      cat_threshold_ = CommonC::StringToArrayFast<uint32_t>(key_vals["cat_threshold"], cat_boundaries_.back());
858
    } else {
859
      Log::Fatal("Tree model should contain cat_threshold field");
860
861
    }
  }
862
  max_depth_ = -1;
Guolin Ke's avatar
Guolin Ke committed
863
864
}

Guolin Ke's avatar
Guolin Ke committed
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
void Tree::ExtendPath(PathElement *unique_path, int unique_depth,
                      double zero_fraction, double one_fraction, int feature_index) {
  unique_path[unique_depth].feature_index = feature_index;
  unique_path[unique_depth].zero_fraction = zero_fraction;
  unique_path[unique_depth].one_fraction = one_fraction;
  unique_path[unique_depth].pweight = (unique_depth == 0 ? 1 : 0);
  for (int i = unique_depth - 1; i >= 0; i--) {
    unique_path[i + 1].pweight += one_fraction*unique_path[i].pweight*(i + 1)
      / static_cast<double>(unique_depth + 1);
    unique_path[i].pweight = zero_fraction*unique_path[i].pweight*(unique_depth - i)
      / static_cast<double>(unique_depth + 1);
  }
}

void Tree::UnwindPath(PathElement *unique_path, int unique_depth, int path_index) {
  const double one_fraction = unique_path[path_index].one_fraction;
  const double zero_fraction = unique_path[path_index].zero_fraction;
  double next_one_portion = unique_path[unique_depth].pweight;

  for (int i = unique_depth - 1; i >= 0; --i) {
    if (one_fraction != 0) {
      const double tmp = unique_path[i].pweight;
      unique_path[i].pweight = next_one_portion*(unique_depth + 1)
        / static_cast<double>((i + 1)*one_fraction);
      next_one_portion = tmp - unique_path[i].pweight*zero_fraction*(unique_depth - i)
        / static_cast<double>(unique_depth + 1);
    } else {
      unique_path[i].pweight = (unique_path[i].pweight*(unique_depth + 1))
        / static_cast<double>(zero_fraction*(unique_depth - i));
    }
  }

  for (int i = path_index; i < unique_depth; ++i) {
    unique_path[i].feature_index = unique_path[i + 1].feature_index;
    unique_path[i].zero_fraction = unique_path[i + 1].zero_fraction;
    unique_path[i].one_fraction = unique_path[i + 1].one_fraction;
  }
}

double Tree::UnwoundPathSum(const PathElement *unique_path, int unique_depth, int path_index) {
  const double one_fraction = unique_path[path_index].one_fraction;
  const double zero_fraction = unique_path[path_index].zero_fraction;
  double next_one_portion = unique_path[unique_depth].pweight;
  double total = 0;
  for (int i = unique_depth - 1; i >= 0; --i) {
    if (one_fraction != 0) {
      const double tmp = next_one_portion*(unique_depth + 1)
        / static_cast<double>((i + 1)*one_fraction);
      total += tmp;
      next_one_portion = unique_path[i].pweight - tmp*zero_fraction*((unique_depth - i)
                                                                     / static_cast<double>(unique_depth + 1));
    } else {
      total += (unique_path[i].pweight / zero_fraction) / ((unique_depth - i)
                                                           / static_cast<double>(unique_depth + 1));
    }
  }
  return total;
}

// recursive computation of SHAP values for a decision tree
void Tree::TreeSHAP(const double *feature_values, double *phi,
                    int node, int unique_depth,
                    PathElement *parent_unique_path, double parent_zero_fraction,
                    double parent_one_fraction, int parent_feature_index) const {
  // extend the unique path
930
  PathElement* unique_path = parent_unique_path + unique_depth;
931
932
933
  if (unique_depth > 0) {
    std::copy(parent_unique_path, parent_unique_path + unique_depth, unique_path);
  }
Guolin Ke's avatar
Guolin Ke committed
934
935
936
937
938
939
940
941
942
943
944
945
946
  ExtendPath(unique_path, unique_depth, parent_zero_fraction,
             parent_one_fraction, parent_feature_index);

  // leaf node
  if (node < 0) {
    for (int i = 1; i <= unique_depth; ++i) {
      const double w = UnwoundPathSum(unique_path, unique_depth, i);
      const PathElement &el = unique_path[i];
      phi[el.feature_index] += w*(el.one_fraction - el.zero_fraction)*leaf_value_[~node];
    }

    // internal node
  } else {
947
    const int hot_index = Decision(feature_values[split_feature_[node]], node);
Guolin Ke's avatar
Guolin Ke committed
948
949
950
951
952
953
954
955
956
957
958
    const int cold_index = (hot_index == left_child_[node] ? right_child_[node] : left_child_[node]);
    const double w = data_count(node);
    const double hot_zero_fraction = data_count(hot_index) / w;
    const double cold_zero_fraction = data_count(cold_index) / w;
    double incoming_zero_fraction = 1;
    double incoming_one_fraction = 1;

    // see if we have already split on this feature,
    // if so we undo that split so we can redo it for this node
    int path_index = 0;
    for (; path_index <= unique_depth; ++path_index) {
959
      if (unique_path[path_index].feature_index == split_feature_[node]) break;
Guolin Ke's avatar
Guolin Ke committed
960
961
962
963
964
965
966
967
968
    }
    if (path_index != unique_depth + 1) {
      incoming_zero_fraction = unique_path[path_index].zero_fraction;
      incoming_one_fraction = unique_path[path_index].one_fraction;
      UnwindPath(unique_path, unique_depth, path_index);
      unique_depth -= 1;
    }

    TreeSHAP(feature_values, phi, hot_index, unique_depth + 1, unique_path,
969
             hot_zero_fraction*incoming_zero_fraction, incoming_one_fraction, split_feature_[node]);
Guolin Ke's avatar
Guolin Ke committed
970
971

    TreeSHAP(feature_values, phi, cold_index, unique_depth + 1, unique_path,
972
             cold_zero_fraction*incoming_zero_fraction, 0, split_feature_[node]);
Guolin Ke's avatar
Guolin Ke committed
973
974
975
  }
}

976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
// recursive sparse computation of SHAP values for a decision tree
void Tree::TreeSHAPByMap(const std::unordered_map<int, double>& feature_values, std::unordered_map<int, double>* phi,
                         int node, int unique_depth,
                         PathElement *parent_unique_path, double parent_zero_fraction,
                         double parent_one_fraction, int parent_feature_index) const {
  // extend the unique path
  PathElement* unique_path = parent_unique_path + unique_depth;
  if (unique_depth > 0) {
    std::copy(parent_unique_path, parent_unique_path + unique_depth, unique_path);
  }
  ExtendPath(unique_path, unique_depth, parent_zero_fraction,
             parent_one_fraction, parent_feature_index);

  // leaf node
  if (node < 0) {
    for (int i = 1; i <= unique_depth; ++i) {
      const double w = UnwoundPathSum(unique_path, unique_depth, i);
      const PathElement &el = unique_path[i];
      (*phi)[el.feature_index] += w*(el.one_fraction - el.zero_fraction)*leaf_value_[~node];
    }

  // internal node
  } else {
    const int hot_index = Decision(feature_values.count(split_feature_[node]) > 0 ? feature_values.at(split_feature_[node]) : 0.0f, node);
    const int cold_index = (hot_index == left_child_[node] ? right_child_[node] : left_child_[node]);
    const double w = data_count(node);
    const double hot_zero_fraction = data_count(hot_index) / w;
    const double cold_zero_fraction = data_count(cold_index) / w;
    double incoming_zero_fraction = 1;
    double incoming_one_fraction = 1;

    // see if we have already split on this feature,
    // if so we undo that split so we can redo it for this node
    int path_index = 0;
    for (; path_index <= unique_depth; ++path_index) {
      if (unique_path[path_index].feature_index == split_feature_[node]) break;
    }
    if (path_index != unique_depth + 1) {
      incoming_zero_fraction = unique_path[path_index].zero_fraction;
      incoming_one_fraction = unique_path[path_index].one_fraction;
      UnwindPath(unique_path, unique_depth, path_index);
      unique_depth -= 1;
    }

    TreeSHAPByMap(feature_values, phi, hot_index, unique_depth + 1, unique_path,
                  hot_zero_fraction*incoming_zero_fraction, incoming_one_fraction, split_feature_[node]);

    TreeSHAPByMap(feature_values, phi, cold_index, unique_depth + 1, unique_path,
                  cold_zero_fraction*incoming_zero_fraction, 0, split_feature_[node]);
  }
}

1028
1029
1030
1031
1032
double Tree::ExpectedValue() const {
  if (num_leaves_ == 1) return LeafOutput(0);
  const double total_count = internal_count_[0];
  double exp_value = 0.0;
  for (int i = 0; i < num_leaves(); ++i) {
Guolin Ke's avatar
Guolin Ke committed
1033
    exp_value += (leaf_count_[i] / total_count)*LeafOutput(i);
Guolin Ke's avatar
Guolin Ke committed
1034
  }
1035
  return exp_value;
Guolin Ke's avatar
Guolin Ke committed
1036
1037
}

1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
void Tree::RecomputeMaxDepth() {
  if (num_leaves_ == 1) {
    max_depth_ = 0;
  } else {
    if (leaf_depth_.size() == 0) {
      RecomputeLeafDepths(0, 0);
    }
    max_depth_ = leaf_depth_[0];
    for (int i = 1; i < num_leaves(); ++i) {
      if (max_depth_ < leaf_depth_[i]) max_depth_ = leaf_depth_[i];
    }
Guolin Ke's avatar
Guolin Ke committed
1049
1050
1051
  }
}

Guolin Ke's avatar
Guolin Ke committed
1052
}  // namespace LightGBM