tree.h 25.6 KB
Newer Older
1
2
3
4
/*!
 * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */
Guolin Ke's avatar
Guolin Ke committed
5
6
7
#ifndef LIGHTGBM_TREE_H_
#define LIGHTGBM_TREE_H_

8
9
10
#include <LightGBM/dataset.h>
#include <LightGBM/meta.h>

Guolin Ke's avatar
Guolin Ke committed
11
#include <string>
12
#include <map>
13
14
15
#include <memory>
#include <unordered_map>
#include <vector>
Guolin Ke's avatar
Guolin Ke committed
16
17
18

namespace LightGBM {

Guolin Ke's avatar
Guolin Ke committed
19
20
#define kCategoricalMask (1)
#define kDefaultLeftMask (2)
Guolin Ke's avatar
Guolin Ke committed
21
22
23
24
25

/*!
* \brief Tree model
*/
class Tree {
Nikita Titov's avatar
Nikita Titov committed
26
 public:
Guolin Ke's avatar
Guolin Ke committed
27
28
29
  /*!
  * \brief Constructor
  * \param max_leaves The number of max leaves
30
  * \param track_branch_features Whether to keep track of ancestors of leaf nodes
31
  * \param is_linear Whether the tree has linear models at each leaf
Guolin Ke's avatar
Guolin Ke committed
32
  */
33
  explicit Tree(int max_leaves, bool track_branch_features, bool is_linear);
Guolin Ke's avatar
Guolin Ke committed
34
35

  /*!
36
  * \brief Constructor, from a string
Guolin Ke's avatar
Guolin Ke committed
37
  * \param str Model string
38
  * \param used_len used count of str
Guolin Ke's avatar
Guolin Ke committed
39
  */
40
  Tree(const char* str, size_t* used_len);
Guolin Ke's avatar
Guolin Ke committed
41

42
  ~Tree() noexcept = default;
Guolin Ke's avatar
Guolin Ke committed
43
44

  /*!
Qiwei Ye's avatar
Qiwei Ye committed
45
46
47
  * \brief Performing a split on tree leaves.
  * \param leaf Index of leaf to be split
  * \param feature Index of feature; the converted index after removing useless features
Guolin Ke's avatar
Guolin Ke committed
48
  * \param real_feature Index of feature, the original index on data
49
  * \param threshold_bin Threshold(bin) of split
50
  * \param threshold_double Threshold on feature value
Guolin Ke's avatar
Guolin Ke committed
51
52
  * \param left_value Model Left child output
  * \param right_value Model Right child output
Guolin Ke's avatar
Guolin Ke committed
53
54
  * \param left_cnt Count of left child
  * \param right_cnt Count of right child
55
56
  * \param left_weight Weight of left child
  * \param right_weight Weight of right child
Guolin Ke's avatar
Guolin Ke committed
57
  * \param gain Split gain
Guolin Ke's avatar
Guolin Ke committed
58
59
  * \param missing_type missing type
  * \param default_left default direction for missing value
Guolin Ke's avatar
Guolin Ke committed
60
61
  * \return The index of new leaf.
  */
62
63
  int Split(int leaf, int feature, int real_feature, uint32_t threshold_bin,
            double threshold_double, double left_value, double right_value,
64
65
            int left_cnt, int right_cnt, double left_weight, double right_weight,
            float gain, MissingType missing_type, bool default_left);
Guolin Ke's avatar
Guolin Ke committed
66

67
68
69
70
71
72
73
  /*!
  * \brief Performing a split on tree leaves, with categorical feature
  * \param leaf Index of leaf to be split
  * \param feature Index of feature; the converted index after removing useless features
  * \param real_feature Index of feature, the original index on data
  * \param threshold_bin Threshold(bin) of split, use bitset to represent
  * \param num_threshold_bin size of threshold_bin
74
75
  * \param threshold Thresholds of real feature value, use bitset to represent
  * \param num_threshold size of threshold
76
77
78
79
  * \param left_value Model Left child output
  * \param right_value Model Right child output
  * \param left_cnt Count of left child
  * \param right_cnt Count of right child
80
81
  * \param left_weight Weight of left child
  * \param right_weight Weight of right child
82
83
84
  * \param gain Split gain
  * \return The index of new leaf.
  */
85
86
  int SplitCategorical(int leaf, int feature, int real_feature, const uint32_t* threshold_bin, int num_threshold_bin,
                       const uint32_t* threshold, int num_threshold, double left_value, double right_value,
87
                       int left_cnt, int right_cnt, double left_weight, double right_weight, float gain, MissingType missing_type);
88

Guolin Ke's avatar
Guolin Ke committed
89
  /*! \brief Get the output of one leaf */
90
  inline double LeafOutput(int leaf) const { return leaf_value_[leaf]; }
Guolin Ke's avatar
Guolin Ke committed
91

Guolin Ke's avatar
Guolin Ke committed
92
93
  /*! \brief Set the output of one leaf */
  inline void SetLeafOutput(int leaf, double output) {
94
    leaf_value_[leaf] = MaybeRoundToZero(output);
Guolin Ke's avatar
Guolin Ke committed
95
96
  }

Guolin Ke's avatar
Guolin Ke committed
97
  /*!
Qiwei Ye's avatar
Qiwei Ye committed
98
  * \brief Adding prediction value of this tree model to scores
Guolin Ke's avatar
Guolin Ke committed
99
100
101
102
  * \param data The dataset
  * \param num_data Number of total data
  * \param score Will add prediction to score
  */
103
104
105
  void AddPredictionToScore(const Dataset* data,
                            data_size_t num_data,
                            double* score) const;
Guolin Ke's avatar
Guolin Ke committed
106
107

  /*!
108
  * \brief Adding prediction value of this tree model to scores
Guolin Ke's avatar
Guolin Ke committed
109
110
111
112
113
114
  * \param data The dataset
  * \param used_data_indices Indices of used data
  * \param num_data Number of total data
  * \param score Will add prediction to score
  */
  void AddPredictionToScore(const Dataset* data,
Qiwei Ye's avatar
Qiwei Ye committed
115
                            const data_size_t* used_data_indices,
116
                            data_size_t num_data, double* score) const;
Guolin Ke's avatar
Guolin Ke committed
117

118
119
120
121
122
123
124
125
126
127
  /*!
  * \brief Get upper bound leaf value of this tree model
  */
  double GetUpperBoundValue() const;

  /*!
  * \brief Get lower bound leaf value of this tree model
  */
  double GetLowerBoundValue() const;

Guolin Ke's avatar
Guolin Ke committed
128
  /*!
129
  * \brief Prediction on one record
Guolin Ke's avatar
Guolin Ke committed
130
131
132
  * \param feature_values Feature value of this record
  * \return Prediction result
  */
133
  inline double Predict(const double* feature_values) const;
134
  inline double PredictByMap(const std::unordered_map<int, double>& feature_values) const;
135

136
  inline int PredictLeafIndex(const double* feature_values) const;
137
138
  inline int PredictLeafIndexByMap(const std::unordered_map<int, double>& feature_values) const;

139
  inline void PredictContrib(const double* feature_values, int num_features, double* output);
140
141
  inline void PredictContribByMap(const std::unordered_map<int, double>& feature_values,
                                  int num_features, std::unordered_map<int, double>* output);
142

Guolin Ke's avatar
Guolin Ke committed
143
144
145
  /*! \brief Get Number of leaves*/
  inline int num_leaves() const { return num_leaves_; }

Guolin Ke's avatar
Guolin Ke committed
146
147
148
  /*! \brief Get depth of specific leaf*/
  inline int leaf_depth(int leaf_idx) const { return leaf_depth_[leaf_idx]; }

Belinda Trotta's avatar
Belinda Trotta committed
149
150
151
  /*! \brief Get parent of specific leaf*/
  inline int leaf_parent(int leaf_idx) const {return leaf_parent_[leaf_idx]; }

152
  /*! \brief Get feature of specific split (original feature index)*/
Guolin Ke's avatar
Guolin Ke committed
153
  inline int split_feature(int split_idx) const { return split_feature_[split_idx]; }
wxchan's avatar
wxchan committed
154

155
156
157
  /*! \brief Get feature of specific split*/
  inline int split_feature_inner(int split_idx) const { return split_feature_inner_[split_idx]; }

158
159
160
  /*! \brief Get features on leaf's branch*/
  inline std::vector<int> branch_features(int leaf) const { return branch_features_[leaf]; }

Guolin Ke's avatar
Guolin Ke committed
161
162
  inline double split_gain(int split_idx) const { return split_gain_[split_idx]; }

163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
  inline double internal_value(int node_idx) const {
    return internal_value_[node_idx];
  }

  inline bool IsNumericalSplit(int node_idx) const {
    return !GetDecisionType(decision_type_[node_idx], kCategoricalMask);
  }

  inline int left_child(int node_idx) const { return left_child_[node_idx]; }

  inline int right_child(int node_idx) const { return right_child_[node_idx]; }

  inline uint32_t threshold_in_bin(int node_idx) const {
    return threshold_in_bin_[node_idx];
  }

179
  /*! \brief Get the number of data points that fall at or below this node*/
Guolin Ke's avatar
Guolin Ke committed
180
  inline int data_count(int node) const { return node >= 0 ? internal_count_[node] : leaf_count_[~node]; }
181

Guolin Ke's avatar
Guolin Ke committed
182
183
  /*!
  * \brief Shrinkage for the tree's output
184
  *        shrinkage rate (a.k.a learning rate) is used to tune the training process
Guolin Ke's avatar
Guolin Ke committed
185
186
  * \param rate The factor of shrinkage
  */
187
  inline void Shrinkage(double rate) {
188
189
190
191
#pragma omp parallel for schedule(static, 1024) if (num_leaves_ >= 2048)
    for (int i = 0; i < num_leaves_ - 1; ++i) {
      leaf_value_[i] = MaybeRoundToZero(leaf_value_[i] * rate);
      internal_value_[i] = MaybeRoundToZero(internal_value_[i] * rate);
192
193
194
195
196
197
      if (is_linear_) {
        leaf_const_[i] = MaybeRoundToZero(leaf_const_[i] * rate);
        for (size_t j = 0; j < leaf_coeff_[i].size(); ++j) {
          leaf_coeff_[i][j] = MaybeRoundToZero(leaf_coeff_[i][j] * rate);
        }
      }
Guolin Ke's avatar
Guolin Ke committed
198
    }
199
200
    leaf_value_[num_leaves_ - 1] =
        MaybeRoundToZero(leaf_value_[num_leaves_ - 1] * rate);
201
202
203
204
205
206
    if (is_linear_) {
      leaf_const_[num_leaves_ - 1] = MaybeRoundToZero(leaf_const_[num_leaves_ - 1] * rate);
      for (size_t j = 0; j < leaf_coeff_[num_leaves_ - 1].size(); ++j) {
        leaf_coeff_[num_leaves_ - 1][j] = MaybeRoundToZero(leaf_coeff_[num_leaves_ - 1][j] * rate);
      }
    }
Guolin Ke's avatar
Guolin Ke committed
207
    shrinkage_ *= rate;
Guolin Ke's avatar
Guolin Ke committed
208
209
  }

210
  inline double shrinkage() const { return shrinkage_; }
211

Guolin Ke's avatar
Guolin Ke committed
212
  inline void AddBias(double val) {
213
214
215
216
#pragma omp parallel for schedule(static, 1024) if (num_leaves_ >= 2048)
    for (int i = 0; i < num_leaves_ - 1; ++i) {
      leaf_value_[i] = MaybeRoundToZero(leaf_value_[i] + val);
      internal_value_[i] = MaybeRoundToZero(internal_value_[i] + val);
Guolin Ke's avatar
Guolin Ke committed
217
    }
218
219
    leaf_value_[num_leaves_ - 1] =
        MaybeRoundToZero(leaf_value_[num_leaves_ - 1] + val);
220
221
222
223
224
225
226
    if (is_linear_) {
#pragma omp parallel for schedule(static, 1024) if (num_leaves_ >= 2048)
      for (int i = 0; i < num_leaves_ - 1; ++i) {
        leaf_const_[i] = MaybeRoundToZero(leaf_const_[i] + val);
      }
      leaf_const_[num_leaves_ - 1] = MaybeRoundToZero(leaf_const_[num_leaves_ - 1] + val);
    }
Guolin Ke's avatar
Guolin Ke committed
227
228
229
230
    // force to 1.0
    shrinkage_ = 1.0f;
  }

231
232
233
234
  inline void AsConstantTree(double val) {
    num_leaves_ = 1;
    shrinkage_ = 1.0f;
    leaf_value_[0] = val;
235
236
237
    if (is_linear_) {
      leaf_const_[0] = val;
    }
238
239
  }

wxchan's avatar
wxchan committed
240
  /*! \brief Serialize this object to string*/
Guolin Ke's avatar
Guolin Ke committed
241
  std::string ToString() const;
Guolin Ke's avatar
Guolin Ke committed
242

wxchan's avatar
wxchan committed
243
  /*! \brief Serialize this object to json*/
Guolin Ke's avatar
Guolin Ke committed
244
  std::string ToJSON() const;
wxchan's avatar
wxchan committed
245

246
  /*! \brief Serialize this object to if-else statement*/
Guolin Ke's avatar
Guolin Ke committed
247
  std::string ToIfElse(int index, bool predict_leaf_index) const;
248

Guolin Ke's avatar
Guolin Ke committed
249
  inline static bool IsZero(double fval) {
250
    return (fval >= -kZeroThreshold && fval <= kZeroThreshold);
Guolin Ke's avatar
Guolin Ke committed
251
252
  }

253
  inline static double MaybeRoundToZero(double fval) {
254
    return IsZero(fval) ? 0 : fval;
255
256
  }

Guolin Ke's avatar
Guolin Ke committed
257
258
259
260
261
262
263
  inline static bool GetDecisionType(int8_t decision_type, int8_t mask) {
    return (decision_type & mask) > 0;
  }

  inline static void SetDecisionType(int8_t* decision_type, bool input, int8_t mask) {
    if (input) {
      (*decision_type) |= mask;
Guolin Ke's avatar
Guolin Ke committed
264
    } else {
Guolin Ke's avatar
Guolin Ke committed
265
      (*decision_type) &= (127 - mask);
Guolin Ke's avatar
Guolin Ke committed
266
267
268
    }
  }

Guolin Ke's avatar
Guolin Ke committed
269
270
271
272
273
274
275
276
277
  inline static int8_t GetMissingType(int8_t decision_type) {
    return (decision_type >> 2) & 3;
  }

  inline static void SetMissingType(int8_t* decision_type, int8_t input) {
    (*decision_type) &= 3;
    (*decision_type) |= (input << 2);
  }

278
279
  void RecomputeMaxDepth();

280
  int NextLeafId() const { return num_leaves_; }
281

282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
  /*! \brief Get the linear model constant term (bias) of one leaf */
  inline double LeafConst(int leaf) const { return leaf_const_[leaf]; }

  /*! \brief Get the linear model coefficients of one leaf */
  inline std::vector<double> LeafCoeffs(int leaf) const { return leaf_coeff_[leaf]; }

  /*! \brief Get the linear model features of one leaf */
  inline std::vector<int> LeafFeaturesInner(int leaf) const {return leaf_features_inner_[leaf]; }

  /*! \brief Get the linear model features of one leaf */
  inline std::vector<int> LeafFeatures(int leaf) const {return leaf_features_[leaf]; }

  /*! \brief Set the linear model coefficients on one leaf */
  inline void SetLeafCoeffs(int leaf, const std::vector<double>& output) {
    leaf_coeff_[leaf].resize(output.size());
    for (size_t i = 0; i < output.size(); ++i) {
      leaf_coeff_[leaf][i] = MaybeRoundToZero(output[i]);
    }
  }

  /*! \brief Set the linear model constant term (bias) on one leaf */
  inline void SetLeafConst(int leaf, double output) {
    leaf_const_[leaf] = MaybeRoundToZero(output);
  }

  /*! \brief Set the linear model features on one leaf */
  inline void SetLeafFeaturesInner(int leaf, const std::vector<int>& features) {
    leaf_features_inner_[leaf] = features;
  }

  /*! \brief Set the linear model features on one leaf */
  inline void SetLeafFeatures(int leaf, const std::vector<int>& features) {
    leaf_features_[leaf] = features;
  }

  inline bool is_linear() const { return is_linear_; }

  inline void SetIsLinear(bool is_linear) {
    is_linear_ = is_linear;
  }

Nikita Titov's avatar
Nikita Titov committed
323
 private:
Guolin Ke's avatar
Guolin Ke committed
324
  std::string NumericalDecisionIfElse(int node) const;
Guolin Ke's avatar
Guolin Ke committed
325

Guolin Ke's avatar
Guolin Ke committed
326
  std::string CategoricalDecisionIfElse(int node) const;
327
328
329

  inline int NumericalDecision(double fval, int node) const {
    uint8_t missing_type = GetMissingType(decision_type_[node]);
330
331
    if (std::isnan(fval) && missing_type != MissingType::NaN) {
      fval = 0.0f;
Guolin Ke's avatar
Guolin Ke committed
332
    }
333
334
    if ((missing_type == MissingType::Zero && IsZero(fval))
        || (missing_type == MissingType::NaN && std::isnan(fval))) {
335
336
      if (GetDecisionType(decision_type_[node], kDefaultLeftMask)) {
        return left_child_[node];
Guolin Ke's avatar
Guolin Ke committed
337
      } else {
338
        return right_child_[node];
Guolin Ke's avatar
Guolin Ke committed
339
340
      }
    }
341
342
343
344
345
    if (fval <= threshold_[node]) {
      return left_child_[node];
    } else {
      return right_child_[node];
    }
Guolin Ke's avatar
Guolin Ke committed
346
  }
Guolin Ke's avatar
Guolin Ke committed
347

348
349
  inline int NumericalDecisionInner(uint32_t fval, int node, uint32_t default_bin, uint32_t max_bin) const {
    uint8_t missing_type = GetMissingType(decision_type_[node]);
350
351
    if ((missing_type == MissingType::Zero && fval == default_bin)
        || (missing_type == MissingType::NaN && fval == max_bin)) {
352
353
354
355
356
357
358
359
      if (GetDecisionType(decision_type_[node], kDefaultLeftMask)) {
        return left_child_[node];
      } else {
        return right_child_[node];
      }
    }
    if (fval <= threshold_in_bin_[node]) {
      return left_child_[node];
360
    } else {
361
      return right_child_[node];
362
363
    }
  }
Guolin Ke's avatar
Guolin Ke committed
364

365
366
367
368
369
370
371
  inline int CategoricalDecision(double fval, int node) const {
    uint8_t missing_type = GetMissingType(decision_type_[node]);
    int int_fval = static_cast<int>(fval);
    if (int_fval < 0) {
      return right_child_[node];;
    } else if (std::isnan(fval)) {
      // NaN is always in the right
372
      if (missing_type == MissingType::NaN) {
373
374
375
376
        return right_child_[node];
      }
      int_fval = 0;
    }
377
    int cat_idx = static_cast<int>(threshold_[node]);
378
379
    if (Common::FindInBitset(cat_threshold_.data() + cat_boundaries_[cat_idx],
                             cat_boundaries_[cat_idx + 1] - cat_boundaries_[cat_idx], int_fval)) {
380
381
382
383
      return left_child_[node];
    }
    return right_child_[node];
  }
Guolin Ke's avatar
Guolin Ke committed
384

385
  inline int CategoricalDecisionInner(uint32_t fval, int node) const {
386
    int cat_idx = static_cast<int>(threshold_in_bin_[node]);
387
388
    if (Common::FindInBitset(cat_threshold_inner_.data() + cat_boundaries_inner_[cat_idx],
                             cat_boundaries_inner_[cat_idx + 1] - cat_boundaries_inner_[cat_idx], fval)) {
389
390
391
392
      return left_child_[node];
    }
    return right_child_[node];
  }
Guolin Ke's avatar
Guolin Ke committed
393

394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
  inline int Decision(double fval, int node) const {
    if (GetDecisionType(decision_type_[node], kCategoricalMask)) {
      return CategoricalDecision(fval, node);
    } else {
      return NumericalDecision(fval, node);
    }
  }

  inline int DecisionInner(uint32_t fval, int node, uint32_t default_bin, uint32_t max_bin) const {
    if (GetDecisionType(decision_type_[node], kCategoricalMask)) {
      return CategoricalDecisionInner(fval, node);
    } else {
      return NumericalDecisionInner(fval, node, default_bin, max_bin);
    }
  }

410
411
  inline void Split(int leaf, int feature, int real_feature, double left_value, double right_value, int left_cnt, int right_cnt,
                    double left_weight, double right_weight, float gain);
Guolin Ke's avatar
Guolin Ke committed
412
  /*!
Qiwei Ye's avatar
Qiwei Ye committed
413
  * \brief Find leaf index of which record belongs by features
Guolin Ke's avatar
Guolin Ke committed
414
415
416
  * \param feature_values Feature value of this record
  * \return Leaf index
  */
417
  inline int GetLeaf(const double* feature_values) const;
418
  inline int GetLeafByMap(const std::unordered_map<int, double>& feature_values) const;
Guolin Ke's avatar
Guolin Ke committed
419

wxchan's avatar
wxchan committed
420
  /*! \brief Serialize one node to json*/
Guolin Ke's avatar
Guolin Ke committed
421
  std::string NodeToJSON(int index) const;
wxchan's avatar
wxchan committed
422

423
  /*! \brief Serialize one node to if-else statement*/
Guolin Ke's avatar
Guolin Ke committed
424
  std::string NodeToIfElse(int index, bool predict_leaf_index) const;
Guolin Ke's avatar
Guolin Ke committed
425

Guolin Ke's avatar
Guolin Ke committed
426
  std::string NodeToIfElseByMap(int index, bool predict_leaf_index) const;
427

428
  double ExpectedValue() const;
Guolin Ke's avatar
Guolin Ke committed
429

430
431
  /*! \brief This is used fill in leaf_depth_ after reloading a model*/
  inline void RecomputeLeafDepths(int node = 0, int depth = 0);
Guolin Ke's avatar
Guolin Ke committed
432
433
434
435
436
437
438
439
440
441

  /*!
  * \brief Used by TreeSHAP for data we keep about our decision path
  */
  struct PathElement {
    int feature_index;
    double zero_fraction;
    double one_fraction;

    // note that pweight is included for convenience and is not tied with the other attributes,
442
    // the pweight of the i'th path element is the permutation weight of paths with i-1 ones in them
Guolin Ke's avatar
Guolin Ke committed
443
444
445
446
447
448
    double pweight;

    PathElement() {}
    PathElement(int i, double z, double o, double w) : feature_index(i), zero_fraction(z), one_fraction(o), pweight(w) {}
  };

449
  /*! \brief Polynomial time algorithm for SHAP values (arXiv:1706.06060)*/
Guolin Ke's avatar
Guolin Ke committed
450
451
452
453
  void TreeSHAP(const double *feature_values, double *phi,
                int node, int unique_depth,
                PathElement *parent_unique_path, double parent_zero_fraction,
                double parent_one_fraction, int parent_feature_index) const;
454

455
456
457
458
459
460
  void TreeSHAPByMap(const std::unordered_map<int, double>& feature_values,
                     std::unordered_map<int, double>* phi,
                     int node, int unique_depth,
                     PathElement *parent_unique_path, double parent_zero_fraction,
                     double parent_one_fraction, int parent_feature_index) const;

461
  /*! \brief Extend our decision path with a fraction of one and zero extensions for TreeSHAP*/
Guolin Ke's avatar
Guolin Ke committed
462
463
  static void ExtendPath(PathElement *unique_path, int unique_depth,
                         double zero_fraction, double one_fraction, int feature_index);
464
465

  /*! \brief Undo a previous extension of the decision path for TreeSHAP*/
Guolin Ke's avatar
Guolin Ke committed
466
  static void UnwindPath(PathElement *unique_path, int unique_depth, int path_index);
467

468
  /*! determine what the total permutation weight would be if we unwound a previous extension in the decision path*/
Guolin Ke's avatar
Guolin Ke committed
469
  static double UnwoundPathSum(const PathElement *unique_path, int unique_depth, int path_index);
470

Guolin Ke's avatar
Guolin Ke committed
471
472
  /*! \brief Number of max leaves*/
  int max_leaves_;
473
  /*! \brief Number of current leaves*/
Guolin Ke's avatar
Guolin Ke committed
474
475
476
  int num_leaves_;
  // following values used for non-leaf node
  /*! \brief A non-leaf node's left child */
Guolin Ke's avatar
Guolin Ke committed
477
  std::vector<int> left_child_;
Guolin Ke's avatar
Guolin Ke committed
478
  /*! \brief A non-leaf node's right child */
Guolin Ke's avatar
Guolin Ke committed
479
  std::vector<int> right_child_;
Guolin Ke's avatar
Guolin Ke committed
480
  /*! \brief A non-leaf node's split feature */
Guolin Ke's avatar
Guolin Ke committed
481
  std::vector<int> split_feature_inner_;
Guolin Ke's avatar
Guolin Ke committed
482
  /*! \brief A non-leaf node's split feature, the original index */
Guolin Ke's avatar
Guolin Ke committed
483
  std::vector<int> split_feature_;
Guolin Ke's avatar
Guolin Ke committed
484
  /*! \brief A non-leaf node's split threshold in bin */
Guolin Ke's avatar
Guolin Ke committed
485
  std::vector<uint32_t> threshold_in_bin_;
Guolin Ke's avatar
Guolin Ke committed
486
  /*! \brief A non-leaf node's split threshold in feature value */
Guolin Ke's avatar
Guolin Ke committed
487
  std::vector<double> threshold_;
488
  int num_cat_;
489
490
491
492
  std::vector<int> cat_boundaries_inner_;
  std::vector<uint32_t> cat_threshold_inner_;
  std::vector<int> cat_boundaries_;
  std::vector<uint32_t> cat_threshold_;
493
  /*! \brief Store the information for categorical feature handle and missing value handle. */
494
  std::vector<int8_t> decision_type_;
Guolin Ke's avatar
Guolin Ke committed
495
  /*! \brief A non-leaf node's split gain */
496
  std::vector<float> split_gain_;
Guolin Ke's avatar
Guolin Ke committed
497
498
  // used for leaf node
  /*! \brief The parent of leaf */
Guolin Ke's avatar
Guolin Ke committed
499
  std::vector<int> leaf_parent_;
Guolin Ke's avatar
Guolin Ke committed
500
  /*! \brief Output of leaves */
Guolin Ke's avatar
Guolin Ke committed
501
  std::vector<double> leaf_value_;
502
503
  /*! \brief weight of leaves */
  std::vector<double> leaf_weight_;
Guolin Ke's avatar
Guolin Ke committed
504
  /*! \brief DataCount of leaves */
505
  std::vector<int> leaf_count_;
Guolin Ke's avatar
Guolin Ke committed
506
507
  /*! \brief Output of non-leaf nodes */
  std::vector<double> internal_value_;
508
509
  /*! \brief weight of non-leaf nodes */
  std::vector<double> internal_weight_;
Guolin Ke's avatar
Guolin Ke committed
510
  /*! \brief DataCount of non-leaf nodes */
511
  std::vector<int> internal_count_;
Guolin Ke's avatar
Guolin Ke committed
512
  /*! \brief Depth for leaves */
Guolin Ke's avatar
Guolin Ke committed
513
  std::vector<int> leaf_depth_;
514
515
516
517
  /*! \brief whether to keep track of ancestor nodes for each leaf (only needed when feature interactions are restricted) */
  bool track_branch_features_;
  /*! \brief Features on leaf's branch, original index */
  std::vector<std::vector<int>> branch_features_;
Guolin Ke's avatar
Guolin Ke committed
518
  double shrinkage_;
519
  int max_depth_;
520
521
522
523
524
525
526
527
528
529
  /*! \brief Tree has linear model at each leaf */
  bool is_linear_;
  /*! \brief coefficients of linear models on leaves */
  std::vector<std::vector<double>> leaf_coeff_;
  /*! \brief constant term (bias) of linear models on leaves */
  std::vector<double> leaf_const_;
  /* \brief features used in leaf linear models; indexing is relative to num_total_features_ */
  std::vector<std::vector<int>> leaf_features_;
  /* \brief features used in leaf linear models; indexing is relative to used_features_ */
  std::vector<std::vector<int>> leaf_features_inner_;
Guolin Ke's avatar
Guolin Ke committed
530
531
};

532
inline void Tree::Split(int leaf, int feature, int real_feature,
533
                        double left_value, double right_value, int left_cnt, int right_cnt,
534
                        double left_weight, double right_weight, float gain) {
535
536
537
538
539
540
541
542
543
544
545
546
547
548
  int new_node_idx = num_leaves_ - 1;
  // update parent info
  int parent = leaf_parent_[leaf];
  if (parent >= 0) {
    // if cur node is left child
    if (left_child_[parent] == ~leaf) {
      left_child_[parent] = new_node_idx;
    } else {
      right_child_[parent] = new_node_idx;
    }
  }
  // add new node
  split_feature_inner_[new_node_idx] = feature;
  split_feature_[new_node_idx] = real_feature;
Guolin Ke's avatar
Guolin Ke committed
549
  split_gain_[new_node_idx] = gain;
550
551
552
553
554
555
556
  // add two new leaves
  left_child_[new_node_idx] = ~leaf;
  right_child_[new_node_idx] = ~num_leaves_;
  // update new leaves
  leaf_parent_[leaf] = new_node_idx;
  leaf_parent_[num_leaves_] = new_node_idx;
  // save current leaf value to internal node before change
557
  internal_weight_[new_node_idx] = leaf_weight_[leaf];
558
559
560
  internal_value_[new_node_idx] = leaf_value_[leaf];
  internal_count_[new_node_idx] = left_cnt + right_cnt;
  leaf_value_[leaf] = std::isnan(left_value) ? 0.0f : left_value;
561
  leaf_weight_[leaf] = left_weight;
562
563
  leaf_count_[leaf] = left_cnt;
  leaf_value_[num_leaves_] = std::isnan(right_value) ? 0.0f : right_value;
564
  leaf_weight_[num_leaves_] = right_weight;
565
566
567
568
  leaf_count_[num_leaves_] = right_cnt;
  // update leaf depth
  leaf_depth_[num_leaves_] = leaf_depth_[leaf] + 1;
  leaf_depth_[leaf]++;
569
570
571
572
573
  if (track_branch_features_) {
    branch_features_[num_leaves_] = branch_features_[leaf];
    branch_features_[num_leaves_].push_back(split_feature_[new_node_idx]);
    branch_features_[leaf].push_back(split_feature_[new_node_idx]);
  }
574
575
}

576
inline double Tree::Predict(const double* feature_values) const {
577
  if (is_linear_) {
578
      int leaf = (num_leaves_ > 1) ? GetLeaf(feature_values) : 0;
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
      double output = leaf_const_[leaf];
      bool nan_found = false;
      for (size_t i = 0; i < leaf_features_[leaf].size(); ++i) {
        int feat_raw = leaf_features_[leaf][i];
        double feat_val = feature_values[feat_raw];
        if (std::isnan(feat_val)) {
          nan_found = true;
          break;
        } else {
          output += leaf_coeff_[leaf][i] * feat_val;
        }
      }
      if (nan_found) {
        return LeafOutput(leaf);
      } else {
        return output;
      }
Guolin Ke's avatar
Guolin Ke committed
596
  } else {
597
598
599
600
601
602
    if (num_leaves_ > 1) {
      int leaf = GetLeaf(feature_values);
      return LeafOutput(leaf);
    } else {
      return leaf_value_[0];
    }
Guolin Ke's avatar
Guolin Ke committed
603
  }
Guolin Ke's avatar
Guolin Ke committed
604
605
}

606
inline double Tree::PredictByMap(const std::unordered_map<int, double>& feature_values) const {
607
  if (is_linear_) {
608
    int leaf = (num_leaves_ > 1) ? GetLeafByMap(feature_values) : 0;
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
    double output = leaf_const_[leaf];
    bool nan_found = false;
    for (size_t i = 0; i < leaf_features_[leaf].size(); ++i) {
      int feat = leaf_features_[leaf][i];
      auto val_it = feature_values.find(feat);
      if (val_it != feature_values.end()) {
        double feat_val = val_it->second;
        if (std::isnan(feat_val)) {
          nan_found = true;
          break;
        } else {
          output += leaf_coeff_[leaf][i] * feat_val;
        }
      }
    }
    if (nan_found) {
      return LeafOutput(leaf);
    } else {
      return output;
    }
629
  } else {
630
631
632
633
634
635
    if (num_leaves_ > 1) {
      int leaf = GetLeafByMap(feature_values);
      return LeafOutput(leaf);
    } else {
      return leaf_value_[0];
    }
636
637
638
  }
}

639
inline int Tree::PredictLeafIndex(const double* feature_values) const {
Guolin Ke's avatar
Guolin Ke committed
640
641
642
643
644
645
  if (num_leaves_ > 1) {
    int leaf = GetLeaf(feature_values);
    return leaf;
  } else {
    return 0;
  }
wxchan's avatar
wxchan committed
646
647
}

648
649
650
651
652
653
654
655
656
inline int Tree::PredictLeafIndexByMap(const std::unordered_map<int, double>& feature_values) const {
  if (num_leaves_ > 1) {
    int leaf = GetLeafByMap(feature_values);
    return leaf;
  } else {
    return 0;
  }
}

657
658
inline void Tree::PredictContrib(const double* feature_values, int num_features, double* output) {
  output[num_features] += ExpectedValue();
659
  // Run the recursion with preallocated space for the unique path data
660
  if (num_leaves_ > 1) {
661
    CHECK_GE(max_depth_, 0);
662
663
664
    const int max_path_len = max_depth_ + 1;
    std::vector<PathElement> unique_path_data(max_path_len*(max_path_len + 1) / 2);
    TreeSHAP(feature_values, output, 0, 0, unique_path_data.data(), 1, 1, -1);
665
666
667
  }
}

668
669
670
671
672
673
674
675
676
677
678
679
inline void Tree::PredictContribByMap(const std::unordered_map<int, double>& feature_values,
                                      int num_features, std::unordered_map<int, double>* output) {
  (*output)[num_features] += ExpectedValue();
  // Run the recursion with preallocated space for the unique path data
  if (num_leaves_ > 1) {
    CHECK_GE(max_depth_, 0);
    const int max_path_len = max_depth_ + 1;
    std::vector<PathElement> unique_path_data(max_path_len*(max_path_len + 1) / 2);
    TreeSHAPByMap(feature_values, output, 0, 0, unique_path_data.data(), 1, 1, -1);
  }
}

680
681
682
683
684
inline void Tree::RecomputeLeafDepths(int node, int depth) {
  if (node == 0) leaf_depth_.resize(num_leaves());
  if (node < 0) {
    leaf_depth_[~node] = depth;
  } else {
Guolin Ke's avatar
Guolin Ke committed
685
686
    RecomputeLeafDepths(left_child_[node], depth + 1);
    RecomputeLeafDepths(right_child_[node], depth + 1);
687
  }
688
689
}

690
inline int Tree::GetLeaf(const double* feature_values) const {
Guolin Ke's avatar
Guolin Ke committed
691
  int node = 0;
692
  if (num_cat_ > 0) {
Guolin Ke's avatar
Guolin Ke committed
693
    while (node >= 0) {
694
      node = Decision(feature_values[split_feature_[node]], node);
Guolin Ke's avatar
Guolin Ke committed
695
696
697
    }
  } else {
    while (node >= 0) {
698
      node = NumericalDecision(feature_values[split_feature_[node]], node);
Guolin Ke's avatar
Guolin Ke committed
699
700
701
702
703
    }
  }
  return ~node;
}

704
705
706
707
708
709
710
711
712
713
714
715
716
717
inline int Tree::GetLeafByMap(const std::unordered_map<int, double>& feature_values) const {
  int node = 0;
  if (num_cat_ > 0) {
    while (node >= 0) {
      node = Decision(feature_values.count(split_feature_[node]) > 0 ? feature_values.at(split_feature_[node]) : 0.0f, node);
    }
  } else {
    while (node >= 0) {
      node = NumericalDecision(feature_values.count(split_feature_[node]) > 0 ? feature_values.at(split_feature_[node]) : 0.0f, node);
    }
  }
  return ~node;
}

Guolin Ke's avatar
Guolin Ke committed
718
719
}  // namespace LightGBM

Guolin Ke's avatar
Guolin Ke committed
720
#endif   // LightGBM_TREE_H_