tree.cpp 41.7 KB
Newer Older
1
2
3
4
/*!
 * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */
Guolin Ke's avatar
Guolin Ke committed
5
6
7
#include <LightGBM/tree.h>

#include <LightGBM/dataset.h>
8
9
#include <LightGBM/utils/common.h>
#include <LightGBM/utils/threading.h>
Guolin Ke's avatar
Guolin Ke committed
10

11
#include <algorithm>
12
13
14
#include <functional>
#include <iomanip>
#include <sstream>
15
16
17
#include <string>
#include <unordered_map>
#include <vector>
18

Guolin Ke's avatar
Guolin Ke committed
19
20
namespace LightGBM {

21
Tree::Tree(int max_leaves, bool track_branch_features, bool is_linear)
22
  :max_leaves_(max_leaves), track_branch_features_(track_branch_features) {
Guolin Ke's avatar
Guolin Ke committed
23
24
25
26
27
28
  left_child_.resize(max_leaves_ - 1);
  right_child_.resize(max_leaves_ - 1);
  split_feature_inner_.resize(max_leaves_ - 1);
  split_feature_.resize(max_leaves_ - 1);
  threshold_in_bin_.resize(max_leaves_ - 1);
  threshold_.resize(max_leaves_ - 1);
Guolin Ke's avatar
Guolin Ke committed
29
  decision_type_.resize(max_leaves_ - 1, 0);
Guolin Ke's avatar
Guolin Ke committed
30
31
32
  split_gain_.resize(max_leaves_ - 1);
  leaf_parent_.resize(max_leaves_);
  leaf_value_.resize(max_leaves_);
33
  leaf_weight_.resize(max_leaves_);
Guolin Ke's avatar
Guolin Ke committed
34
35
  leaf_count_.resize(max_leaves_);
  internal_value_.resize(max_leaves_ - 1);
36
  internal_weight_.resize(max_leaves_ - 1);
Guolin Ke's avatar
Guolin Ke committed
37
38
  internal_count_.resize(max_leaves_ - 1);
  leaf_depth_.resize(max_leaves_);
39
40
41
  if (track_branch_features_) {
    branch_features_ = std::vector<std::vector<int>>(max_leaves_);
  }
Guolin Ke's avatar
Guolin Ke committed
42
43
  // root is in the depth 0
  leaf_depth_[0] = 0;
Guolin Ke's avatar
Guolin Ke committed
44
  num_leaves_ = 1;
45
  leaf_value_[0] = 0.0f;
46
  leaf_weight_[0] = 0.0f;
Guolin Ke's avatar
Guolin Ke committed
47
  leaf_parent_[0] = -1;
Guolin Ke's avatar
Guolin Ke committed
48
  shrinkage_ = 1.0f;
49
  num_cat_ = 0;
50
51
  cat_boundaries_.push_back(0);
  cat_boundaries_inner_.push_back(0);
52
  max_depth_ = -1;
53
54
55
56
57
58
59
  is_linear_ = is_linear;
  if (is_linear_) {
    leaf_coeff_.resize(max_leaves_);
    leaf_const_ = std::vector<double>(max_leaves_, 0);
    leaf_features_.resize(max_leaves_);
    leaf_features_inner_.resize(max_leaves_);
  }
60
  #ifdef USE_CUDA
61
  is_cuda_tree_ = false;
62
  #endif  // USE_CUDA
Guolin Ke's avatar
Guolin Ke committed
63
}
Guolin Ke's avatar
Guolin Ke committed
64

65
66
int Tree::Split(int leaf, int feature, int real_feature, uint32_t threshold_bin,
                double threshold_double, double left_value, double right_value,
67
68
                int left_cnt, int right_cnt, double left_weight, double right_weight, float gain,
                MissingType missing_type, bool default_left) {
69
  Split(leaf, feature, real_feature, left_value, right_value, left_cnt, right_cnt, left_weight, right_weight, gain);
Guolin Ke's avatar
Guolin Ke committed
70
  int new_node_idx = num_leaves_ - 1;
Guolin Ke's avatar
Guolin Ke committed
71
  decision_type_[new_node_idx] = 0;
72
  SetDecisionType(&decision_type_[new_node_idx], false, kCategoricalMask);
Guolin Ke's avatar
Guolin Ke committed
73
  SetDecisionType(&decision_type_[new_node_idx], default_left, kDefaultLeftMask);
Guolin Ke's avatar
Guolin Ke committed
74
  SetMissingType(&decision_type_[new_node_idx], static_cast<int8_t>(missing_type));
Guolin Ke's avatar
Guolin Ke committed
75
  threshold_in_bin_[new_node_idx] = threshold_bin;
Guolin Ke's avatar
Guolin Ke committed
76
  threshold_[new_node_idx] = threshold_double;
77
78
79
  ++num_leaves_;
  return num_leaves_ - 1;
}
Guolin Ke's avatar
Guolin Ke committed
80

81
82
int Tree::SplitCategorical(int leaf, int feature, int real_feature, const uint32_t* threshold_bin, int num_threshold_bin,
                           const uint32_t* threshold, int num_threshold, double left_value, double right_value,
83
84
                           data_size_t left_cnt, data_size_t right_cnt, double left_weight, double right_weight, float gain, MissingType missing_type) {
  Split(leaf, feature, real_feature, left_value, right_value, left_cnt, right_cnt, left_weight, right_weight, gain);
85
86
87
  int new_node_idx = num_leaves_ - 1;
  decision_type_[new_node_idx] = 0;
  SetDecisionType(&decision_type_[new_node_idx], true, kCategoricalMask);
Guolin Ke's avatar
Guolin Ke committed
88
  SetMissingType(&decision_type_[new_node_idx], static_cast<int8_t>(missing_type));
89
90
  threshold_in_bin_[new_node_idx] = num_cat_;
  threshold_[new_node_idx] = num_cat_;
91
  ++num_cat_;
92
93
94
95
96
97
98
99
  cat_boundaries_.push_back(cat_boundaries_.back() + num_threshold);
  for (int i = 0; i < num_threshold; ++i) {
    cat_threshold_.push_back(threshold[i]);
  }
  cat_boundaries_inner_.push_back(cat_boundaries_inner_.back() + num_threshold_bin);
  for (int i = 0; i < num_threshold_bin; ++i) {
    cat_threshold_inner_.push_back(threshold_bin[i]);
  }
Guolin Ke's avatar
Guolin Ke committed
100
101
102
103
  ++num_leaves_;
  return num_leaves_ - 1;
}

Guolin Ke's avatar
Guolin Ke committed
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#define PredictionFun(niter, fidx_in_iter, start_pos, decision_fun, iter_idx, \
                      data_idx)                                               \
  std::vector<std::unique_ptr<BinIterator>> iter((niter));                    \
  for (int i = 0; i < (niter); ++i) {                                         \
    iter[i].reset(data->FeatureIterator((fidx_in_iter)));                     \
    iter[i]->Reset((start_pos));                                              \
  }                                                                           \
  for (data_size_t i = start; i < end; ++i) {                                 \
    int node = 0;                                                             \
    while (node >= 0) {                                                       \
      node = decision_fun(iter[(iter_idx)]->Get((data_idx)), node,            \
                          default_bins[node], max_bins[node]);                \
    }                                                                         \
    score[(data_idx)] += static_cast<double>(leaf_value_[~node]);             \
118
119
  }\

120
121
122
123
124
125
126
127
128
129

#define PredictionFunLinear(niter, fidx_in_iter, start_pos, decision_fun,     \
                            iter_idx, data_idx)                               \
  std::vector<std::unique_ptr<BinIterator>> iter((niter));                    \
  for (int i = 0; i < (niter); ++i) {                                         \
    iter[i].reset(data->FeatureIterator((fidx_in_iter)));                     \
    iter[i]->Reset((start_pos));                                              \
  }                                                                           \
  for (data_size_t i = start; i < end; ++i) {                                 \
    int node = 0;                                                             \
130
131
132
133
134
135
    if (num_leaves_ > 1) {                                                    \
      while (node >= 0) {                                                     \
        node = decision_fun(iter[(iter_idx)]->Get((data_idx)), node,          \
                            default_bins[node], max_bins[node]);              \
      }                                                                       \
      node = ~node;                                                           \
136
    }                                                                         \
137
    double add_score = leaf_const_[node];                                     \
138
    bool nan_found = false;                                                   \
139
140
141
    const double* coeff_ptr = leaf_coeff_[node].data();                       \
    const float** data_ptr = feat_ptr[node].data();                           \
    for (size_t j = 0; j < leaf_features_inner_[node].size(); ++j) {          \
142
143
144
145
146
147
148
149
       float feat_val = data_ptr[j][(data_idx)];                              \
       if (std::isnan(feat_val)) {                                            \
          nan_found = true;                                                   \
          break;                                                              \
       }                                                                      \
       add_score += coeff_ptr[j] * feat_val;                                  \
    }                                                                         \
    if (nan_found) {                                                          \
150
       score[(data_idx)] += leaf_value_[node];                                \
151
152
153
154
155
156
    } else {                                                                  \
      score[(data_idx)] += add_score;                                         \
    }                                                                         \
}\


157
void Tree::AddPredictionToScore(const Dataset* data, data_size_t num_data, double* score) const {
158
  if (!is_linear_ && num_leaves_ <= 1) {
Guolin Ke's avatar
Guolin Ke committed
159
    if (leaf_value_[0] != 0.0f) {
160
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_data >= 1024)
Guolin Ke's avatar
Guolin Ke committed
161
162
163
164
165
166
      for (data_size_t i = 0; i < num_data; ++i) {
        score[i] += leaf_value_[0];
      }
    }
    return;
  }
Guolin Ke's avatar
Guolin Ke committed
167
168
169
170
171
172
173
174
  std::vector<uint32_t> default_bins(num_leaves_ - 1);
  std::vector<uint32_t> max_bins(num_leaves_ - 1);
  for (int i = 0; i < num_leaves_ - 1; ++i) {
    const int fidx = split_feature_inner_[i];
    auto bin_mapper = data->FeatureBinMapper(fidx);
    default_bins[i] = bin_mapper->GetDefaultBin();
    max_bins[i] = bin_mapper->num_bin() - 1;
  }
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
  if (is_linear_) {
    std::vector<std::vector<const float*>> feat_ptr(num_leaves_);
    for (int leaf_num = 0; leaf_num < num_leaves_; ++leaf_num) {
      for (int feat : leaf_features_inner_[leaf_num]) {
        feat_ptr[leaf_num].push_back(data->raw_index(feat));
      }
    }
    if (num_cat_ > 0) {
      if (data->num_features() > num_leaves_ - 1) {
        Threading::For<data_size_t>(0, num_data, 512, [this, &data, score, &default_bins, &max_bins, &feat_ptr]
        (int, data_size_t start, data_size_t end) {
          PredictionFunLinear(num_leaves_ - 1, split_feature_inner_[i], start, DecisionInner, node, i);
        });
      } else {
        Threading::For<data_size_t>(0, num_data, 512, [this, &data, score, &default_bins, &max_bins, &feat_ptr]
        (int, data_size_t start, data_size_t end) {
          PredictionFunLinear(data->num_features(), i, start, DecisionInner, split_feature_inner_[node], i);
        });
      }
194
    } else {
195
196
197
198
199
200
201
202
203
204
205
      if (data->num_features() > num_leaves_ - 1) {
        Threading::For<data_size_t>(0, num_data, 512, [this, &data, score, &default_bins, &max_bins, &feat_ptr]
        (int, data_size_t start, data_size_t end) {
          PredictionFunLinear(num_leaves_ - 1, split_feature_inner_[i], start, NumericalDecisionInner, node, i);
        });
      } else {
        Threading::For<data_size_t>(0, num_data, 512, [this, &data, score, &default_bins, &max_bins, &feat_ptr]
        (int, data_size_t start, data_size_t end) {
          PredictionFunLinear(data->num_features(), i, start, NumericalDecisionInner, split_feature_inner_[node], i);
        });
      }
206
    }
Guolin Ke's avatar
Guolin Ke committed
207
  } else {
208
209
210
211
212
213
214
215
216
217
218
219
    if (num_cat_ > 0) {
      if (data->num_features() > num_leaves_ - 1) {
        Threading::For<data_size_t>(0, num_data, 512, [this, &data, score, &default_bins, &max_bins]
        (int, data_size_t start, data_size_t end) {
          PredictionFun(num_leaves_ - 1, split_feature_inner_[i], start, DecisionInner, node, i);
        });
      } else {
        Threading::For<data_size_t>(0, num_data, 512, [this, &data, score, &default_bins, &max_bins]
        (int, data_size_t start, data_size_t end) {
          PredictionFun(data->num_features(), i, start, DecisionInner, split_feature_inner_[node], i);
        });
      }
220
    } else {
221
222
223
224
225
226
227
228
229
230
231
      if (data->num_features() > num_leaves_ - 1) {
        Threading::For<data_size_t>(0, num_data, 512, [this, &data, score, &default_bins, &max_bins]
        (int, data_size_t start, data_size_t end) {
          PredictionFun(num_leaves_ - 1, split_feature_inner_[i], start, NumericalDecisionInner, node, i);
        });
      } else {
        Threading::For<data_size_t>(0, num_data, 512, [this, &data, score, &default_bins, &max_bins]
        (int, data_size_t start, data_size_t end) {
          PredictionFun(data->num_features(), i, start, NumericalDecisionInner, split_feature_inner_[node], i);
        });
      }
232
    }
Guolin Ke's avatar
Guolin Ke committed
233
  }
Guolin Ke's avatar
Guolin Ke committed
234
235
}

Guolin Ke's avatar
Guolin Ke committed
236
void Tree::AddPredictionToScore(const Dataset* data,
237
238
239
  const data_size_t* used_data_indices,
  data_size_t num_data, double* score) const {
  if (!is_linear_ && num_leaves_ <= 1) {
Guolin Ke's avatar
Guolin Ke committed
240
    if (leaf_value_[0] != 0.0f) {
241
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_data >= 1024)
Guolin Ke's avatar
Guolin Ke committed
242
243
244
245
      for (data_size_t i = 0; i < num_data; ++i) {
        score[used_data_indices[i]] += leaf_value_[0];
      }
    }
Guolin Ke's avatar
Guolin Ke committed
246
    return;
Guolin Ke's avatar
Guolin Ke committed
247
  }
Guolin Ke's avatar
Guolin Ke committed
248
249
250
251
252
253
254
255
  std::vector<uint32_t> default_bins(num_leaves_ - 1);
  std::vector<uint32_t> max_bins(num_leaves_ - 1);
  for (int i = 0; i < num_leaves_ - 1; ++i) {
    const int fidx = split_feature_inner_[i];
    auto bin_mapper = data->FeatureBinMapper(fidx);
    default_bins[i] = bin_mapper->GetDefaultBin();
    max_bins[i] = bin_mapper->num_bin() - 1;
  }
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
  if (is_linear_) {
    std::vector<std::vector<const float*>> feat_ptr(num_leaves_);
    for (int leaf_num = 0; leaf_num < num_leaves_; ++leaf_num) {
      for (int feat : leaf_features_inner_[leaf_num]) {
        feat_ptr[leaf_num].push_back(data->raw_index(feat));
      }
    }
    if (num_cat_ > 0) {
      if (data->num_features() > num_leaves_ - 1) {
        Threading::For<data_size_t>(0, num_data, 512, [this, &data, score, used_data_indices, &default_bins, &max_bins, &feat_ptr]
        (int, data_size_t start, data_size_t end) {
          PredictionFunLinear(num_leaves_ - 1, split_feature_inner_[i], used_data_indices[start], DecisionInner,
                              node, used_data_indices[i]);
        });
      } else {
        Threading::For<data_size_t>(0, num_data, 512, [this, &data, score, used_data_indices, &default_bins, &max_bins, &feat_ptr]
        (int, data_size_t start, data_size_t end) {
          PredictionFunLinear(data->num_features(), i, used_data_indices[start], DecisionInner, split_feature_inner_[node], used_data_indices[i]);
        });
      }
276
    } else {
277
278
279
280
281
282
283
284
285
286
287
288
289
      if (data->num_features() > num_leaves_ - 1) {
        Threading::For<data_size_t>(0, num_data, 512, [this, &data, score, used_data_indices, &default_bins, &max_bins, &feat_ptr]
        (int, data_size_t start, data_size_t end) {
          PredictionFunLinear(num_leaves_ - 1, split_feature_inner_[i], used_data_indices[start], NumericalDecisionInner,
                              node, used_data_indices[i]);
        });
      } else {
        Threading::For<data_size_t>(0, num_data, 512, [this, &data, score, used_data_indices, &default_bins, &max_bins, &feat_ptr]
        (int, data_size_t start, data_size_t end) {
          PredictionFunLinear(data->num_features(), i, used_data_indices[start], NumericalDecisionInner,
                              split_feature_inner_[node], used_data_indices[i]);
        });
      }
290
    }
Guolin Ke's avatar
Guolin Ke committed
291
  } else {
292
293
294
295
296
297
298
299
300
301
302
303
    if (num_cat_ > 0) {
      if (data->num_features() > num_leaves_ - 1) {
        Threading::For<data_size_t>(0, num_data, 512, [this, &data, score, used_data_indices, &default_bins, &max_bins]
        (int, data_size_t start, data_size_t end) {
          PredictionFun(num_leaves_ - 1, split_feature_inner_[i], used_data_indices[start], DecisionInner, node, used_data_indices[i]);
        });
      } else {
        Threading::For<data_size_t>(0, num_data, 512, [this, &data, score, used_data_indices, &default_bins, &max_bins]
        (int, data_size_t start, data_size_t end) {
          PredictionFun(data->num_features(), i, used_data_indices[start], DecisionInner, split_feature_inner_[node], used_data_indices[i]);
        });
      }
304
    } else {
305
306
307
308
309
310
311
312
313
314
315
      if (data->num_features() > num_leaves_ - 1) {
        Threading::For<data_size_t>(0, num_data, 512, [this, &data, score, used_data_indices, &default_bins, &max_bins]
        (int, data_size_t start, data_size_t end) {
          PredictionFun(num_leaves_ - 1, split_feature_inner_[i], used_data_indices[start], NumericalDecisionInner, node, used_data_indices[i]);
        });
      } else {
        Threading::For<data_size_t>(0, num_data, 512, [this, &data, score, used_data_indices, &default_bins, &max_bins]
        (int, data_size_t start, data_size_t end) {
          PredictionFun(data->num_features(), i, used_data_indices[start], NumericalDecisionInner, split_feature_inner_[node], used_data_indices[i]);
        });
      }
316
    }
Guolin Ke's avatar
Guolin Ke committed
317
  }
Guolin Ke's avatar
Guolin Ke committed
318
319
}

320
#undef PredictionFun
321
#undef PredictionFunLinear
322

323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
double Tree::GetUpperBoundValue() const {
  double upper_bound = leaf_value_[0];
  for (int i = 1; i < num_leaves_; ++i) {
    if (leaf_value_[i] > upper_bound) {
      upper_bound = leaf_value_[i];
    }
  }
  return upper_bound;
}

double Tree::GetLowerBoundValue() const {
  double lower_bound = leaf_value_[0];
  for (int i = 1; i < num_leaves_; ++i) {
    if (leaf_value_[i] < lower_bound) {
      lower_bound = leaf_value_[i];
    }
  }
  return lower_bound;
}

Guolin Ke's avatar
Guolin Ke committed
343
std::string Tree::ToString() const {
344
  std::stringstream str_buf;
345
346
347
348
  Common::C_stringstream(str_buf);

  using CommonC::ArrayToString;

349
350
  str_buf << "num_leaves=" << num_leaves_ << '\n';
  str_buf << "num_cat=" << num_cat_ << '\n';
351
  str_buf << "split_feature="
352
    << ArrayToString(split_feature_, num_leaves_ - 1) << '\n';
353
  str_buf << "split_gain="
354
    << ArrayToString(split_gain_, num_leaves_ - 1) << '\n';
355
  str_buf << "threshold="
356
    << ArrayToString<true>(threshold_, num_leaves_ - 1) << '\n';
357
  str_buf << "decision_type="
358
    << ArrayToString(Common::ArrayCast<int8_t, int>(decision_type_), num_leaves_ - 1) << '\n';
359
  str_buf << "left_child="
360
    << ArrayToString(left_child_, num_leaves_ - 1) << '\n';
361
  str_buf << "right_child="
362
    << ArrayToString(right_child_, num_leaves_ - 1) << '\n';
363
  str_buf << "leaf_value="
364
    << ArrayToString<true>(leaf_value_, num_leaves_) << '\n';
365
  str_buf << "leaf_weight="
366
    << ArrayToString<true>(leaf_weight_, num_leaves_) << '\n';
367
  str_buf << "leaf_count="
368
    << ArrayToString(leaf_count_, num_leaves_) << '\n';
369
  str_buf << "internal_value="
370
    << ArrayToString(internal_value_, num_leaves_ - 1) << '\n';
371
  str_buf << "internal_weight="
372
    << ArrayToString(internal_weight_, num_leaves_ - 1) << '\n';
373
  str_buf << "internal_count="
374
    << ArrayToString(internal_count_, num_leaves_ - 1) << '\n';
375
376
  if (num_cat_ > 0) {
    str_buf << "cat_boundaries="
377
      << ArrayToString(cat_boundaries_, num_cat_ + 1) << '\n';
378
    str_buf << "cat_threshold="
379
      << ArrayToString(cat_threshold_, cat_threshold_.size()) << '\n';
380
  }
381
382
383
384
  str_buf << "is_linear=" << is_linear_ << '\n';

  if (is_linear_) {
    str_buf << "leaf_const="
385
      << ArrayToString<true>(leaf_const_, num_leaves_) << '\n';
386
387
    std::vector<int> num_feat(num_leaves_);
    for (int i = 0; i < num_leaves_; ++i) {
388
      num_feat[i] = static_cast<int>(leaf_coeff_[i].size());
389
390
391
392
393
394
395
396
397
398
399
400
401
402
    }
    str_buf << "num_features="
      << ArrayToString(num_feat, num_leaves_) << '\n';
    str_buf << "leaf_features=";
    for (int i = 0; i < num_leaves_; ++i) {
      if (num_feat[i] > 0) {
        str_buf << ArrayToString(leaf_features_[i], leaf_features_[i].size()) << ' ';
      }
      str_buf << ' ';
    }
    str_buf << '\n';
    str_buf << "leaf_coeff=";
    for (int i = 0; i < num_leaves_; ++i) {
      if (num_feat[i] > 0) {
403
        str_buf << ArrayToString<true>(leaf_coeff_[i], leaf_coeff_[i].size()) << ' ';
404
405
406
407
408
      }
      str_buf << ' ';
    }
    str_buf << '\n';
  }
409
410
  str_buf << "shrinkage=" << shrinkage_ << '\n';
  str_buf << '\n';
411

412
  return str_buf.str();
Guolin Ke's avatar
Guolin Ke committed
413
414
}

Guolin Ke's avatar
Guolin Ke committed
415
std::string Tree::ToJSON() const {
416
  std::stringstream str_buf;
417
  Common::C_stringstream(str_buf);
418
  str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2);
419
420
421
  str_buf << "\"num_leaves\":" << num_leaves_ << "," << '\n';
  str_buf << "\"num_cat\":" << num_cat_ << "," << '\n';
  str_buf << "\"shrinkage\":" << shrinkage_ << "," << '\n';
wxchan's avatar
wxchan committed
422
  if (num_leaves_ == 1) {
423
424
    str_buf << "\"tree_structure\":{";
    str_buf << "\"leaf_value\":" << leaf_value_[0] << ", " << '\n';
425
    if (is_linear_) {
426
427
      str_buf << "\"leaf_count\":" << leaf_count_[0] << ", " << '\n';
      str_buf << LinearModelToJSON(0);
428
    } else {
429
      str_buf << "\"leaf_count\":" << leaf_count_[0];
430
    }
431
    str_buf << "}" << '\n';
wxchan's avatar
wxchan committed
432
  } else {
433
    str_buf << "\"tree_structure\":" << NodeToJSON(0) << '\n';
wxchan's avatar
wxchan committed
434
  }
435
436
  return str_buf.str();
}
wxchan's avatar
wxchan committed
437

438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
std::string Tree::LinearModelToJSON(int index) const {
  std::stringstream str_buf;
  Common::C_stringstream(str_buf);
  str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2);
  str_buf << "\"leaf_const\":" << leaf_const_[index] << "," << "\n";
  int num_features = static_cast<int>(leaf_features_[index].size());
  if (num_features > 0) {
    str_buf << "\"leaf_features\":[";
    for (int i = 0; i < num_features - 1; ++i) {
      str_buf << leaf_features_[index][i] << ", ";
    }
    str_buf << leaf_features_[index][num_features - 1] << "]" << ", " << "\n";
    str_buf << "\"leaf_coeff\":[";
    for (int i = 0; i < num_features - 1; ++i) {
      str_buf << leaf_coeff_[index][i] << ", ";
    }
    str_buf << leaf_coeff_[index][num_features - 1] << "]" << "\n";
  } else {
    str_buf << "\"leaf_features\":[],\n";
    str_buf << "\"leaf_coeff\":[]\n";
  }
459
  return str_buf.str();
wxchan's avatar
wxchan committed
460
461
}

Guolin Ke's avatar
Guolin Ke committed
462
std::string Tree::NodeToJSON(int index) const {
463
  std::stringstream str_buf;
464
  Common::C_stringstream(str_buf);
465
  str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2);
wxchan's avatar
wxchan committed
466
467
  if (index >= 0) {
    // non-leaf
468
469
470
    str_buf << "{" << '\n';
    str_buf << "\"split_index\":" << index << "," << '\n';
    str_buf << "\"split_feature\":" << split_feature_[index] << "," << '\n';
Guolin Ke's avatar
Guolin Ke committed
471
    str_buf << "\"split_gain\":" << Common::AvoidInf(split_gain_[index]) << "," << '\n';
472
    if (GetDecisionType(decision_type_[index], kCategoricalMask)) {
473
474
475
476
477
478
479
480
481
482
483
      int cat_idx = static_cast<int>(threshold_[index]);
      std::vector<int> cats;
      for (int i = cat_boundaries_[cat_idx]; i < cat_boundaries_[cat_idx + 1]; ++i) {
        for (int j = 0; j < 32; ++j) {
          int cat = (i - cat_boundaries_[cat_idx]) * 32 + j;
          if (Common::FindInBitset(cat_threshold_.data() + cat_boundaries_[cat_idx],
                                   cat_boundaries_[cat_idx + 1] - cat_boundaries_[cat_idx], cat)) {
            cats.push_back(cat);
          }
        }
      }
484
      str_buf << "\"threshold\":\"" << CommonC::Join(cats, "||") << "\"," << '\n';
485
      str_buf << "\"decision_type\":\"==\"," << '\n';
486
    } else {
487
488
      str_buf << "\"threshold\":" << Common::AvoidInf(threshold_[index]) << "," << '\n';
      str_buf << "\"decision_type\":\"<=\"," << '\n';
489
490
    }
    if (GetDecisionType(decision_type_[index], kDefaultLeftMask)) {
491
      str_buf << "\"default_left\":true," << '\n';
492
    } else {
493
      str_buf << "\"default_left\":false," << '\n';
494
495
    }
    uint8_t missing_type = GetMissingType(decision_type_[index]);
496
    if (missing_type == MissingType::None) {
497
      str_buf << "\"missing_type\":\"None\"," << '\n';
498
    } else if (missing_type == MissingType::Zero) {
499
      str_buf << "\"missing_type\":\"Zero\"," << '\n';
500
    } else {
501
      str_buf << "\"missing_type\":\"NaN\"," << '\n';
502
    }
503
    str_buf << "\"internal_value\":" << internal_value_[index] << "," << '\n';
504
    str_buf << "\"internal_weight\":" << internal_weight_[index] << "," << '\n';
505
506
507
    str_buf << "\"internal_count\":" << internal_count_[index] << "," << '\n';
    str_buf << "\"left_child\":" << NodeToJSON(left_child_[index]) << "," << '\n';
    str_buf << "\"right_child\":" << NodeToJSON(right_child_[index]) << '\n';
508
    str_buf << "}";
wxchan's avatar
wxchan committed
509
510
511
  } else {
    // leaf
    index = ~index;
512
513
514
    str_buf << "{" << '\n';
    str_buf << "\"leaf_index\":" << index << "," << '\n';
    str_buf << "\"leaf_value\":" << leaf_value_[index] << "," << '\n';
515
    str_buf << "\"leaf_weight\":" << leaf_weight_[index] << "," << '\n';
516
517
518
519
520
521
    if (is_linear_) {
      str_buf << "\"leaf_count\":" << leaf_count_[index] << "," << '\n';
      str_buf << LinearModelToJSON(index);
    } else {
      str_buf << "\"leaf_count\":" << leaf_count_[index] << '\n';
    }
522
    str_buf << "}";
wxchan's avatar
wxchan committed
523
  }
524
  return str_buf.str();
wxchan's avatar
wxchan committed
525
526
}

Guolin Ke's avatar
Guolin Ke committed
527
528
std::string Tree::NumericalDecisionIfElse(int node) const {
  std::stringstream str_buf;
529
  Common::C_stringstream(str_buf);
530
  str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2);
Guolin Ke's avatar
Guolin Ke committed
531
532
  uint8_t missing_type = GetMissingType(decision_type_[node]);
  bool default_left = GetDecisionType(decision_type_[node], kDefaultLeftMask);
José Morales's avatar
José Morales committed
533
534
535
536
  if (missing_type != MissingType::NaN) {
    str_buf << "if (std::isnan(fval)) fval = 0.0;";
  }
  if (missing_type == MissingType::Zero) {
Guolin Ke's avatar
Guolin Ke committed
537
    if (default_left) {
José Morales's avatar
José Morales committed
538
      str_buf << "if (Tree::IsZero(fval)) {";
Guolin Ke's avatar
Guolin Ke committed
539
    } else {
José Morales's avatar
José Morales committed
540
      str_buf << "if (!Tree::IsZero(fval)) {";
Guolin Ke's avatar
Guolin Ke committed
541
    }
José Morales's avatar
José Morales committed
542
  } else if (missing_type == MissingType::NaN) {
Guolin Ke's avatar
Guolin Ke committed
543
    if (default_left) {
José Morales's avatar
José Morales committed
544
      str_buf << "if (std::isnan(fval)) {";
Guolin Ke's avatar
Guolin Ke committed
545
    } else {
José Morales's avatar
José Morales committed
546
      str_buf << "if (!std::isnan(fval)) {";
Guolin Ke's avatar
Guolin Ke committed
547
    }
José Morales's avatar
José Morales committed
548
549
  } else {
    str_buf << "if (fval <= " << threshold_[node] << ") {";
Guolin Ke's avatar
Guolin Ke committed
550
551
552
553
554
555
  }
  return str_buf.str();
}

std::string Tree::CategoricalDecisionIfElse(int node) const {
  std::stringstream str_buf;
556
  Common::C_stringstream(str_buf);
557
  int cat_idx = static_cast<int>(threshold_[node]);
José Morales's avatar
José Morales committed
558
  str_buf << "if (std::isnan(fval)) { int_fval = -1; } else { int_fval = static_cast<int>(fval); }";
559
560
561
562
  str_buf << "if (int_fval >= 0 && int_fval < 32 * (";
  str_buf << cat_boundaries_[cat_idx + 1] - cat_boundaries_[cat_idx];
  str_buf << ") && (((cat_threshold[" << cat_boundaries_[cat_idx];
  str_buf << " + int_fval / 32] >> (int_fval & 31)) & 1))) {";
Guolin Ke's avatar
Guolin Ke committed
563
564
565
  return str_buf.str();
}

Guolin Ke's avatar
Guolin Ke committed
566
std::string Tree::ToIfElse(int index, bool predict_leaf_index) const {
567
  std::stringstream str_buf;
568
  Common::C_stringstream(str_buf);
569
  str_buf << "double PredictTree" << index;
Guolin Ke's avatar
Guolin Ke committed
570
  if (predict_leaf_index) {
571
572
573
    str_buf << "Leaf";
  }
  str_buf << "(const double* arr) { ";
574
575
  if (num_leaves_ <= 1) {
    str_buf << "return " << leaf_value_[0] << ";";
576
  } else {
577
578
579
580
581
582
583
584
    str_buf << "const std::vector<uint32_t> cat_threshold = {";
    for (size_t i = 0; i < cat_threshold_.size(); ++i) {
      if (i != 0) {
        str_buf << ",";
      }
      str_buf << cat_threshold_[i];
    }
    str_buf << "};";
585
586
587
588
589
    // use this for the missing value conversion
    str_buf << "double fval = 0.0f; ";
    if (num_cat_ > 0) {
      str_buf << "int int_fval = 0; ";
    }
Guolin Ke's avatar
Guolin Ke committed
590
    str_buf << NodeToIfElse(0, predict_leaf_index);
591
  }
592
  str_buf << " }" << '\n';
593

594
  // Predict func by Map to ifelse
595
  str_buf << "double PredictTree" << index;
Guolin Ke's avatar
Guolin Ke committed
596
  if (predict_leaf_index) {
Guolin Ke's avatar
Guolin Ke committed
597
598
599
    str_buf << "LeafByMap";
  } else {
    str_buf << "ByMap";
600
601
602
  }
  str_buf << "(const std::unordered_map<int, double>& arr) { ";
  if (num_leaves_ <= 1) {
Guolin Ke's avatar
Guolin Ke committed
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
    str_buf << "return " << leaf_value_[0] << ";";
  } else {
    str_buf << "const std::vector<uint32_t> cat_threshold = {";
    for (size_t i = 0; i < cat_threshold_.size(); ++i) {
      if (i != 0) {
        str_buf << ",";
      }
      str_buf << cat_threshold_[i];
    }
    str_buf << "};";
    // use this for the missing value conversion
    str_buf << "double fval = 0.0f; ";
    if (num_cat_ > 0) {
      str_buf << "int int_fval = 0; ";
    }
Guolin Ke's avatar
Guolin Ke committed
618
    str_buf << NodeToIfElseByMap(0, predict_leaf_index);
619
  }
620
  str_buf << " }" << '\n';
621

622
623
624
  return str_buf.str();
}

Guolin Ke's avatar
Guolin Ke committed
625
std::string Tree::NodeToIfElse(int index, bool predict_leaf_index) const {
626
  std::stringstream str_buf;
627
  Common::C_stringstream(str_buf);
628
629
630
  str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2);
  if (index >= 0) {
    // non-leaf
631
    str_buf << "fval = arr[" << split_feature_[index] << "];";
Guolin Ke's avatar
Guolin Ke committed
632
    if (GetDecisionType(decision_type_[index], kCategoricalMask) == 0) {
633
      str_buf << NumericalDecisionIfElse(index);
634
    } else {
635
      str_buf << CategoricalDecisionIfElse(index);
636
637
    }
    // left subtree
Guolin Ke's avatar
Guolin Ke committed
638
    str_buf << NodeToIfElse(left_child_[index], predict_leaf_index);
639
640
    str_buf << " } else { ";
    // right subtree
Guolin Ke's avatar
Guolin Ke committed
641
    str_buf << NodeToIfElse(right_child_[index], predict_leaf_index);
642
643
644
645
    str_buf << " }";
  } else {
    // leaf
    str_buf << "return ";
Guolin Ke's avatar
Guolin Ke committed
646
    if (predict_leaf_index) {
647
648
649
650
651
652
653
654
655
656
      str_buf << ~index;
    } else {
      str_buf << leaf_value_[~index];
    }
    str_buf << ";";
  }

  return str_buf.str();
}

Guolin Ke's avatar
Guolin Ke committed
657
std::string Tree::NodeToIfElseByMap(int index, bool predict_leaf_index) const {
658
  std::stringstream str_buf;
659
  Common::C_stringstream(str_buf);
660
661
662
663
664
665
666
667
668
669
  str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2);
  if (index >= 0) {
    // non-leaf
    str_buf << "fval = arr.count(" << split_feature_[index] << ") > 0 ? arr.at(" << split_feature_[index] << ") : 0.0f;";
    if (GetDecisionType(decision_type_[index], kCategoricalMask) == 0) {
      str_buf << NumericalDecisionIfElse(index);
    } else {
      str_buf << CategoricalDecisionIfElse(index);
    }
    // left subtree
Guolin Ke's avatar
Guolin Ke committed
670
    str_buf << NodeToIfElseByMap(left_child_[index], predict_leaf_index);
671
672
    str_buf << " } else { ";
    // right subtree
Guolin Ke's avatar
Guolin Ke committed
673
    str_buf << NodeToIfElseByMap(right_child_[index], predict_leaf_index);
674
675
676
677
    str_buf << " }";
  } else {
    // leaf
    str_buf << "return ";
Guolin Ke's avatar
Guolin Ke committed
678
    if (predict_leaf_index) {
679
680
681
682
683
684
685
686
687
688
      str_buf << ~index;
    } else {
      str_buf << leaf_value_[~index];
    }
    str_buf << ";";
  }

  return str_buf.str();
}

689
690
Tree::Tree(const char* str, size_t* used_len) {
  auto p = str;
Guolin Ke's avatar
Guolin Ke committed
691
  std::unordered_map<std::string, std::string> key_vals;
692
  const int max_num_line = 22;
693
694
695
696
  int read_line = 0;
  while (read_line < max_num_line) {
    if (*p == '\r' || *p == '\n') break;
    auto start = p;
697
    while (*p != '=') ++p;
698
699
700
701
702
703
704
705
706
707
708
    std::string key(start, p - start);
    ++p;
    start = p;
    while (*p != '\r' && *p != '\n') ++p;
    key_vals[key] = std::string(start, p - start);
    ++read_line;
    if (*p == '\r') ++p;
    if (*p == '\n') ++p;
  }
  *used_len = p - str;

709
  if (key_vals.count("num_leaves") <= 0) {
710
    Log::Fatal("Tree model should contain num_leaves field");
Guolin Ke's avatar
Guolin Ke committed
711
712
713
714
  }

  Common::Atoi(key_vals["num_leaves"].c_str(), &num_leaves_);

715
  if (key_vals.count("num_cat") <= 0) {
716
    Log::Fatal("Tree model should contain num_cat field");
717
718
719
720
  }

  Common::Atoi(key_vals["num_cat"].c_str(), &num_cat_);

721
  if (key_vals.count("leaf_value")) {
722
    leaf_value_ = CommonC::StringToArray<double>(key_vals["leaf_value"], num_leaves_);
723
724
725
  } else {
    Log::Fatal("Tree model string format error, should contain leaf_value field");
  }
726

Guolin Ke's avatar
Guolin Ke committed
727
  if (key_vals.count("shrinkage")) {
728
    CommonC::Atof(key_vals["shrinkage"].c_str(), &shrinkage_);
Guolin Ke's avatar
Guolin Ke committed
729
730
731
  } else {
    shrinkage_ = 1.0f;
  }
732

733
734
735
736
  if (key_vals.count("is_linear")) {
    int is_linear_int;
    Common::Atoi(key_vals["is_linear"].c_str(), &is_linear_int);
    is_linear_ = static_cast<bool>(is_linear_int);
737
738
  } else {
    is_linear_ = false;
739
740
  }

741
742
743
744
745
746
  if (key_vals.count("leaf_count")) {
    leaf_count_ = CommonC::StringToArrayFast<int>(key_vals["leaf_count"], num_leaves_);
  } else {
    leaf_count_.resize(num_leaves_);
  }

747
  #ifdef USE_CUDA
748
  is_cuda_tree_ = false;
749
  #endif  // USE_CUDA
750

751
752
753
  if ((num_leaves_ <= 1) && !is_linear_) {
    return;
  }
754

Guolin Ke's avatar
Guolin Ke committed
755
  if (key_vals.count("left_child")) {
756
    left_child_ = CommonC::StringToArrayFast<int>(key_vals["left_child"], num_leaves_ - 1);
Guolin Ke's avatar
Guolin Ke committed
757
758
  } else {
    Log::Fatal("Tree model string format error, should contain left_child field");
759
760
  }

Guolin Ke's avatar
Guolin Ke committed
761
  if (key_vals.count("right_child")) {
762
    right_child_ = CommonC::StringToArrayFast<int>(key_vals["right_child"], num_leaves_ - 1);
Guolin Ke's avatar
Guolin Ke committed
763
764
765
766
767
  } else {
    Log::Fatal("Tree model string format error, should contain right_child field");
  }

  if (key_vals.count("split_feature")) {
768
    split_feature_ = CommonC::StringToArrayFast<int>(key_vals["split_feature"], num_leaves_ - 1);
Guolin Ke's avatar
Guolin Ke committed
769
770
771
772
773
  } else {
    Log::Fatal("Tree model string format error, should contain split_feature field");
  }

  if (key_vals.count("threshold")) {
774
    threshold_ = CommonC::StringToArray<double>(key_vals["threshold"], num_leaves_ - 1);
Guolin Ke's avatar
Guolin Ke committed
775
776
777
778
779
  } else {
    Log::Fatal("Tree model string format error, should contain threshold field");
  }

  if (key_vals.count("split_gain")) {
780
    split_gain_ = CommonC::StringToArrayFast<float>(key_vals["split_gain"], num_leaves_ - 1);
Guolin Ke's avatar
Guolin Ke committed
781
782
783
784
785
  } else {
    split_gain_.resize(num_leaves_ - 1);
  }

  if (key_vals.count("internal_count")) {
786
    internal_count_ = CommonC::StringToArrayFast<int>(key_vals["internal_count"], num_leaves_ - 1);
Guolin Ke's avatar
Guolin Ke committed
787
788
789
790
791
  } else {
    internal_count_.resize(num_leaves_ - 1);
  }

  if (key_vals.count("internal_value")) {
792
    internal_value_ = CommonC::StringToArrayFast<double>(key_vals["internal_value"], num_leaves_ - 1);
Guolin Ke's avatar
Guolin Ke committed
793
794
795
796
  } else {
    internal_value_.resize(num_leaves_ - 1);
  }

797
  if (key_vals.count("internal_weight")) {
798
    internal_weight_ = CommonC::StringToArrayFast<double>(key_vals["internal_weight"], num_leaves_ - 1);
799
  } else {
800
801
802
803
    internal_weight_.resize(num_leaves_ - 1);
  }

  if (key_vals.count("leaf_weight")) {
804
    leaf_weight_ = CommonC::StringToArray<double>(key_vals["leaf_weight"], num_leaves_);
805
  } else {
806
807
808
    leaf_weight_.resize(num_leaves_);
  }

Guolin Ke's avatar
Guolin Ke committed
809
  if (key_vals.count("decision_type")) {
810
    decision_type_ = CommonC::StringToArrayFast<int8_t>(key_vals["decision_type"], num_leaves_ - 1);
Guolin Ke's avatar
Guolin Ke committed
811
812
813
814
  } else {
    decision_type_ = std::vector<int8_t>(num_leaves_ - 1, 0);
  }

815
816
  if (is_linear_) {
    if (key_vals.count("leaf_const")) {
817
      leaf_const_ = Common::StringToArray<double>(key_vals["leaf_const"], num_leaves_);
818
819
820
821
822
823
824
825
826
827
828
829
    } else {
      leaf_const_.resize(num_leaves_);
    }
    std::vector<int> num_feat;
    if (key_vals.count("num_features")) {
      num_feat = Common::StringToArrayFast<int>(key_vals["num_features"], num_leaves_);
    }
    leaf_coeff_.resize(num_leaves_);
    leaf_features_.resize(num_leaves_);
    leaf_features_inner_.resize(num_leaves_);
    if (num_feat.size() > 0) {
      int total_num_feat = 0;
830
831
832
      for (size_t i = 0; i < num_feat.size(); ++i) {
        total_num_feat += num_feat[i];
      }
833
834
835
836
837
838
      std::vector<int> all_leaf_features;
      if (key_vals.count("leaf_features")) {
        all_leaf_features = Common::StringToArrayFast<int>(key_vals["leaf_features"], total_num_feat);
      }
      std::vector<double> all_leaf_coeff;
      if (key_vals.count("leaf_coeff")) {
839
        all_leaf_coeff = Common::StringToArray<double>(key_vals["leaf_coeff"], total_num_feat);
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
      }
      int sum_num_feat = 0;
      for (int i = 0; i < num_leaves_; ++i) {
        if (num_feat[i] > 0) {
          if (key_vals.count("leaf_features"))  {
            leaf_features_[i].assign(all_leaf_features.begin() + sum_num_feat, all_leaf_features.begin() + sum_num_feat + num_feat[i]);
          }
          if (key_vals.count("leaf_coeff")) {
            leaf_coeff_[i].assign(all_leaf_coeff.begin() + sum_num_feat, all_leaf_coeff.begin() + sum_num_feat + num_feat[i]);
          }
        }
        sum_num_feat += num_feat[i];
      }
    }
  }

856
857
  if (num_cat_ > 0) {
    if (key_vals.count("cat_boundaries")) {
858
      cat_boundaries_ = CommonC::StringToArrayFast<int>(key_vals["cat_boundaries"], num_cat_ + 1);
859
860
861
862
863
    } else {
      Log::Fatal("Tree model should contain cat_boundaries field.");
    }

    if (key_vals.count("cat_threshold")) {
864
      cat_threshold_ = CommonC::StringToArrayFast<uint32_t>(key_vals["cat_threshold"], cat_boundaries_.back());
865
    } else {
866
      Log::Fatal("Tree model should contain cat_threshold field");
867
868
    }
  }
869
  max_depth_ = -1;
Guolin Ke's avatar
Guolin Ke committed
870
871
}

Guolin Ke's avatar
Guolin Ke committed
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
void Tree::ExtendPath(PathElement *unique_path, int unique_depth,
                      double zero_fraction, double one_fraction, int feature_index) {
  unique_path[unique_depth].feature_index = feature_index;
  unique_path[unique_depth].zero_fraction = zero_fraction;
  unique_path[unique_depth].one_fraction = one_fraction;
  unique_path[unique_depth].pweight = (unique_depth == 0 ? 1 : 0);
  for (int i = unique_depth - 1; i >= 0; i--) {
    unique_path[i + 1].pweight += one_fraction*unique_path[i].pweight*(i + 1)
      / static_cast<double>(unique_depth + 1);
    unique_path[i].pweight = zero_fraction*unique_path[i].pweight*(unique_depth - i)
      / static_cast<double>(unique_depth + 1);
  }
}

void Tree::UnwindPath(PathElement *unique_path, int unique_depth, int path_index) {
  const double one_fraction = unique_path[path_index].one_fraction;
  const double zero_fraction = unique_path[path_index].zero_fraction;
  double next_one_portion = unique_path[unique_depth].pweight;

  for (int i = unique_depth - 1; i >= 0; --i) {
    if (one_fraction != 0) {
      const double tmp = unique_path[i].pweight;
      unique_path[i].pweight = next_one_portion*(unique_depth + 1)
        / static_cast<double>((i + 1)*one_fraction);
      next_one_portion = tmp - unique_path[i].pweight*zero_fraction*(unique_depth - i)
        / static_cast<double>(unique_depth + 1);
    } else {
      unique_path[i].pweight = (unique_path[i].pweight*(unique_depth + 1))
        / static_cast<double>(zero_fraction*(unique_depth - i));
    }
  }

  for (int i = path_index; i < unique_depth; ++i) {
    unique_path[i].feature_index = unique_path[i + 1].feature_index;
    unique_path[i].zero_fraction = unique_path[i + 1].zero_fraction;
    unique_path[i].one_fraction = unique_path[i + 1].one_fraction;
  }
}

double Tree::UnwoundPathSum(const PathElement *unique_path, int unique_depth, int path_index) {
  const double one_fraction = unique_path[path_index].one_fraction;
  const double zero_fraction = unique_path[path_index].zero_fraction;
  double next_one_portion = unique_path[unique_depth].pweight;
  double total = 0;
  for (int i = unique_depth - 1; i >= 0; --i) {
    if (one_fraction != 0) {
      const double tmp = next_one_portion*(unique_depth + 1)
        / static_cast<double>((i + 1)*one_fraction);
      total += tmp;
      next_one_portion = unique_path[i].pweight - tmp*zero_fraction*((unique_depth - i)
                                                                     / static_cast<double>(unique_depth + 1));
    } else {
      total += (unique_path[i].pweight / zero_fraction) / ((unique_depth - i)
                                                           / static_cast<double>(unique_depth + 1));
    }
  }
  return total;
}

// recursive computation of SHAP values for a decision tree
void Tree::TreeSHAP(const double *feature_values, double *phi,
                    int node, int unique_depth,
                    PathElement *parent_unique_path, double parent_zero_fraction,
                    double parent_one_fraction, int parent_feature_index) const {
  // extend the unique path
937
  PathElement* unique_path = parent_unique_path + unique_depth;
938
939
940
  if (unique_depth > 0) {
    std::copy(parent_unique_path, parent_unique_path + unique_depth, unique_path);
  }
Guolin Ke's avatar
Guolin Ke committed
941
942
943
944
945
946
947
948
949
950
951
952
953
  ExtendPath(unique_path, unique_depth, parent_zero_fraction,
             parent_one_fraction, parent_feature_index);

  // leaf node
  if (node < 0) {
    for (int i = 1; i <= unique_depth; ++i) {
      const double w = UnwoundPathSum(unique_path, unique_depth, i);
      const PathElement &el = unique_path[i];
      phi[el.feature_index] += w*(el.one_fraction - el.zero_fraction)*leaf_value_[~node];
    }

    // internal node
  } else {
954
    const int hot_index = Decision(feature_values[split_feature_[node]], node);
Guolin Ke's avatar
Guolin Ke committed
955
956
957
958
959
960
961
962
963
964
965
    const int cold_index = (hot_index == left_child_[node] ? right_child_[node] : left_child_[node]);
    const double w = data_count(node);
    const double hot_zero_fraction = data_count(hot_index) / w;
    const double cold_zero_fraction = data_count(cold_index) / w;
    double incoming_zero_fraction = 1;
    double incoming_one_fraction = 1;

    // see if we have already split on this feature,
    // if so we undo that split so we can redo it for this node
    int path_index = 0;
    for (; path_index <= unique_depth; ++path_index) {
966
      if (unique_path[path_index].feature_index == split_feature_[node]) break;
Guolin Ke's avatar
Guolin Ke committed
967
968
969
970
971
972
973
974
975
    }
    if (path_index != unique_depth + 1) {
      incoming_zero_fraction = unique_path[path_index].zero_fraction;
      incoming_one_fraction = unique_path[path_index].one_fraction;
      UnwindPath(unique_path, unique_depth, path_index);
      unique_depth -= 1;
    }

    TreeSHAP(feature_values, phi, hot_index, unique_depth + 1, unique_path,
976
             hot_zero_fraction*incoming_zero_fraction, incoming_one_fraction, split_feature_[node]);
Guolin Ke's avatar
Guolin Ke committed
977
978

    TreeSHAP(feature_values, phi, cold_index, unique_depth + 1, unique_path,
979
             cold_zero_fraction*incoming_zero_fraction, 0, split_feature_[node]);
Guolin Ke's avatar
Guolin Ke committed
980
981
982
  }
}

983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
// recursive sparse computation of SHAP values for a decision tree
void Tree::TreeSHAPByMap(const std::unordered_map<int, double>& feature_values, std::unordered_map<int, double>* phi,
                         int node, int unique_depth,
                         PathElement *parent_unique_path, double parent_zero_fraction,
                         double parent_one_fraction, int parent_feature_index) const {
  // extend the unique path
  PathElement* unique_path = parent_unique_path + unique_depth;
  if (unique_depth > 0) {
    std::copy(parent_unique_path, parent_unique_path + unique_depth, unique_path);
  }
  ExtendPath(unique_path, unique_depth, parent_zero_fraction,
             parent_one_fraction, parent_feature_index);

  // leaf node
  if (node < 0) {
    for (int i = 1; i <= unique_depth; ++i) {
      const double w = UnwoundPathSum(unique_path, unique_depth, i);
      const PathElement &el = unique_path[i];
      (*phi)[el.feature_index] += w*(el.one_fraction - el.zero_fraction)*leaf_value_[~node];
    }

  // internal node
  } else {
    const int hot_index = Decision(feature_values.count(split_feature_[node]) > 0 ? feature_values.at(split_feature_[node]) : 0.0f, node);
    const int cold_index = (hot_index == left_child_[node] ? right_child_[node] : left_child_[node]);
    const double w = data_count(node);
    const double hot_zero_fraction = data_count(hot_index) / w;
    const double cold_zero_fraction = data_count(cold_index) / w;
    double incoming_zero_fraction = 1;
    double incoming_one_fraction = 1;

    // see if we have already split on this feature,
    // if so we undo that split so we can redo it for this node
    int path_index = 0;
    for (; path_index <= unique_depth; ++path_index) {
      if (unique_path[path_index].feature_index == split_feature_[node]) break;
    }
    if (path_index != unique_depth + 1) {
      incoming_zero_fraction = unique_path[path_index].zero_fraction;
      incoming_one_fraction = unique_path[path_index].one_fraction;
      UnwindPath(unique_path, unique_depth, path_index);
      unique_depth -= 1;
    }

    TreeSHAPByMap(feature_values, phi, hot_index, unique_depth + 1, unique_path,
                  hot_zero_fraction*incoming_zero_fraction, incoming_one_fraction, split_feature_[node]);

    TreeSHAPByMap(feature_values, phi, cold_index, unique_depth + 1, unique_path,
                  cold_zero_fraction*incoming_zero_fraction, 0, split_feature_[node]);
  }
}

1035
1036
1037
1038
1039
double Tree::ExpectedValue() const {
  if (num_leaves_ == 1) return LeafOutput(0);
  const double total_count = internal_count_[0];
  double exp_value = 0.0;
  for (int i = 0; i < num_leaves(); ++i) {
Guolin Ke's avatar
Guolin Ke committed
1040
    exp_value += (leaf_count_[i] / total_count)*LeafOutput(i);
Guolin Ke's avatar
Guolin Ke committed
1041
  }
1042
  return exp_value;
Guolin Ke's avatar
Guolin Ke committed
1043
1044
}

1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
void Tree::RecomputeMaxDepth() {
  if (num_leaves_ == 1) {
    max_depth_ = 0;
  } else {
    if (leaf_depth_.size() == 0) {
      RecomputeLeafDepths(0, 0);
    }
    max_depth_ = leaf_depth_[0];
    for (int i = 1; i < num_leaves(); ++i) {
      if (max_depth_ < leaf_depth_[i]) max_depth_ = leaf_depth_[i];
    }
Guolin Ke's avatar
Guolin Ke committed
1056
1057
1058
  }
}

Guolin Ke's avatar
Guolin Ke committed
1059
}  // namespace LightGBM