tree.h 25.7 KB
Newer Older
1
2
3
4
/*!
 * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */
Guolin Ke's avatar
Guolin Ke committed
5
6
7
#ifndef LIGHTGBM_TREE_H_
#define LIGHTGBM_TREE_H_

8
9
10
#include <LightGBM/dataset.h>
#include <LightGBM/meta.h>

Guolin Ke's avatar
Guolin Ke committed
11
#include <string>
12
#include <map>
13
14
15
#include <memory>
#include <unordered_map>
#include <vector>
Guolin Ke's avatar
Guolin Ke committed
16
17
18

namespace LightGBM {

Guolin Ke's avatar
Guolin Ke committed
19
20
#define kCategoricalMask (1)
#define kDefaultLeftMask (2)
Guolin Ke's avatar
Guolin Ke committed
21
22
23
24
25

/*!
* \brief Tree model
*/
class Tree {
Nikita Titov's avatar
Nikita Titov committed
26
 public:
Guolin Ke's avatar
Guolin Ke committed
27
28
29
  /*!
  * \brief Constructor
  * \param max_leaves The number of max leaves
30
  * \param track_branch_features Whether to keep track of ancestors of leaf nodes
31
  * \param is_linear Whether the tree has linear models at each leaf
Guolin Ke's avatar
Guolin Ke committed
32
  */
33
  explicit Tree(int max_leaves, bool track_branch_features, bool is_linear);
Guolin Ke's avatar
Guolin Ke committed
34
35

  /*!
36
  * \brief Constructor, from a string
Guolin Ke's avatar
Guolin Ke committed
37
  * \param str Model string
38
  * \param used_len used count of str
Guolin Ke's avatar
Guolin Ke committed
39
  */
40
  Tree(const char* str, size_t* used_len);
Guolin Ke's avatar
Guolin Ke committed
41

42
  ~Tree() noexcept = default;
Guolin Ke's avatar
Guolin Ke committed
43
44

  /*!
Qiwei Ye's avatar
Qiwei Ye committed
45
46
47
  * \brief Performing a split on tree leaves.
  * \param leaf Index of leaf to be split
  * \param feature Index of feature; the converted index after removing useless features
Guolin Ke's avatar
Guolin Ke committed
48
  * \param real_feature Index of feature, the original index on data
49
  * \param threshold_bin Threshold(bin) of split
50
  * \param threshold_double Threshold on feature value
Guolin Ke's avatar
Guolin Ke committed
51
52
  * \param left_value Model Left child output
  * \param right_value Model Right child output
Guolin Ke's avatar
Guolin Ke committed
53
54
  * \param left_cnt Count of left child
  * \param right_cnt Count of right child
55
56
  * \param left_weight Weight of left child
  * \param right_weight Weight of right child
Guolin Ke's avatar
Guolin Ke committed
57
  * \param gain Split gain
Guolin Ke's avatar
Guolin Ke committed
58
59
  * \param missing_type missing type
  * \param default_left default direction for missing value
Guolin Ke's avatar
Guolin Ke committed
60
61
  * \return The index of new leaf.
  */
62
63
  int Split(int leaf, int feature, int real_feature, uint32_t threshold_bin,
            double threshold_double, double left_value, double right_value,
64
65
            int left_cnt, int right_cnt, double left_weight, double right_weight,
            float gain, MissingType missing_type, bool default_left);
Guolin Ke's avatar
Guolin Ke committed
66

67
68
69
70
71
72
73
  /*!
  * \brief Performing a split on tree leaves, with categorical feature
  * \param leaf Index of leaf to be split
  * \param feature Index of feature; the converted index after removing useless features
  * \param real_feature Index of feature, the original index on data
  * \param threshold_bin Threshold(bin) of split, use bitset to represent
  * \param num_threshold_bin size of threshold_bin
74
75
  * \param threshold Thresholds of real feature value, use bitset to represent
  * \param num_threshold size of threshold
76
77
78
79
  * \param left_value Model Left child output
  * \param right_value Model Right child output
  * \param left_cnt Count of left child
  * \param right_cnt Count of right child
80
81
  * \param left_weight Weight of left child
  * \param right_weight Weight of right child
82
83
84
  * \param gain Split gain
  * \return The index of new leaf.
  */
85
86
  int SplitCategorical(int leaf, int feature, int real_feature, const uint32_t* threshold_bin, int num_threshold_bin,
                       const uint32_t* threshold, int num_threshold, double left_value, double right_value,
87
                       int left_cnt, int right_cnt, double left_weight, double right_weight, float gain, MissingType missing_type);
88

Guolin Ke's avatar
Guolin Ke committed
89
  /*! \brief Get the output of one leaf */
90
  inline double LeafOutput(int leaf) const { return leaf_value_[leaf]; }
Guolin Ke's avatar
Guolin Ke committed
91

Guolin Ke's avatar
Guolin Ke committed
92
93
  /*! \brief Set the output of one leaf */
  inline void SetLeafOutput(int leaf, double output) {
94
    leaf_value_[leaf] = MaybeRoundToZero(output);
Guolin Ke's avatar
Guolin Ke committed
95
96
  }

Guolin Ke's avatar
Guolin Ke committed
97
  /*!
Qiwei Ye's avatar
Qiwei Ye committed
98
  * \brief Adding prediction value of this tree model to scores
Guolin Ke's avatar
Guolin Ke committed
99
100
101
102
  * \param data The dataset
  * \param num_data Number of total data
  * \param score Will add prediction to score
  */
103
104
105
  void AddPredictionToScore(const Dataset* data,
                            data_size_t num_data,
                            double* score) const;
Guolin Ke's avatar
Guolin Ke committed
106
107

  /*!
108
  * \brief Adding prediction value of this tree model to scores
Guolin Ke's avatar
Guolin Ke committed
109
110
111
112
113
114
  * \param data The dataset
  * \param used_data_indices Indices of used data
  * \param num_data Number of total data
  * \param score Will add prediction to score
  */
  void AddPredictionToScore(const Dataset* data,
Qiwei Ye's avatar
Qiwei Ye committed
115
                            const data_size_t* used_data_indices,
116
                            data_size_t num_data, double* score) const;
Guolin Ke's avatar
Guolin Ke committed
117

118
119
120
121
122
123
124
125
126
127
  /*!
  * \brief Get upper bound leaf value of this tree model
  */
  double GetUpperBoundValue() const;

  /*!
  * \brief Get lower bound leaf value of this tree model
  */
  double GetLowerBoundValue() const;

Guolin Ke's avatar
Guolin Ke committed
128
  /*!
129
  * \brief Prediction on one record
Guolin Ke's avatar
Guolin Ke committed
130
131
132
  * \param feature_values Feature value of this record
  * \return Prediction result
  */
133
  inline double Predict(const double* feature_values) const;
134
  inline double PredictByMap(const std::unordered_map<int, double>& feature_values) const;
135

136
  inline int PredictLeafIndex(const double* feature_values) const;
137
138
  inline int PredictLeafIndexByMap(const std::unordered_map<int, double>& feature_values) const;

139
  inline void PredictContrib(const double* feature_values, int num_features, double* output);
140
141
  inline void PredictContribByMap(const std::unordered_map<int, double>& feature_values,
                                  int num_features, std::unordered_map<int, double>* output);
142

Guolin Ke's avatar
Guolin Ke committed
143
144
145
  /*! \brief Get Number of leaves*/
  inline int num_leaves() const { return num_leaves_; }

Guolin Ke's avatar
Guolin Ke committed
146
147
148
  /*! \brief Get depth of specific leaf*/
  inline int leaf_depth(int leaf_idx) const { return leaf_depth_[leaf_idx]; }

Belinda Trotta's avatar
Belinda Trotta committed
149
150
151
  /*! \brief Get parent of specific leaf*/
  inline int leaf_parent(int leaf_idx) const {return leaf_parent_[leaf_idx]; }

152
  /*! \brief Get feature of specific split (original feature index)*/
Guolin Ke's avatar
Guolin Ke committed
153
  inline int split_feature(int split_idx) const { return split_feature_[split_idx]; }
wxchan's avatar
wxchan committed
154

155
156
157
  /*! \brief Get feature of specific split*/
  inline int split_feature_inner(int split_idx) const { return split_feature_inner_[split_idx]; }

158
159
160
  /*! \brief Get features on leaf's branch*/
  inline std::vector<int> branch_features(int leaf) const { return branch_features_[leaf]; }

Guolin Ke's avatar
Guolin Ke committed
161
162
  inline double split_gain(int split_idx) const { return split_gain_[split_idx]; }

163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
  inline double internal_value(int node_idx) const {
    return internal_value_[node_idx];
  }

  inline bool IsNumericalSplit(int node_idx) const {
    return !GetDecisionType(decision_type_[node_idx], kCategoricalMask);
  }

  inline int left_child(int node_idx) const { return left_child_[node_idx]; }

  inline int right_child(int node_idx) const { return right_child_[node_idx]; }

  inline uint32_t threshold_in_bin(int node_idx) const {
    return threshold_in_bin_[node_idx];
  }

179
  /*! \brief Get the number of data points that fall at or below this node*/
Guolin Ke's avatar
Guolin Ke committed
180
  inline int data_count(int node) const { return node >= 0 ? internal_count_[node] : leaf_count_[~node]; }
181

Guolin Ke's avatar
Guolin Ke committed
182
183
  /*!
  * \brief Shrinkage for the tree's output
184
  *        shrinkage rate (a.k.a learning rate) is used to tune the training process
Guolin Ke's avatar
Guolin Ke committed
185
186
  * \param rate The factor of shrinkage
  */
187
  inline void Shrinkage(double rate) {
188
189
190
191
#pragma omp parallel for schedule(static, 1024) if (num_leaves_ >= 2048)
    for (int i = 0; i < num_leaves_ - 1; ++i) {
      leaf_value_[i] = MaybeRoundToZero(leaf_value_[i] * rate);
      internal_value_[i] = MaybeRoundToZero(internal_value_[i] * rate);
192
193
194
195
196
197
      if (is_linear_) {
        leaf_const_[i] = MaybeRoundToZero(leaf_const_[i] * rate);
        for (size_t j = 0; j < leaf_coeff_[i].size(); ++j) {
          leaf_coeff_[i][j] = MaybeRoundToZero(leaf_coeff_[i][j] * rate);
        }
      }
Guolin Ke's avatar
Guolin Ke committed
198
    }
199
200
    leaf_value_[num_leaves_ - 1] =
        MaybeRoundToZero(leaf_value_[num_leaves_ - 1] * rate);
201
202
203
204
205
206
    if (is_linear_) {
      leaf_const_[num_leaves_ - 1] = MaybeRoundToZero(leaf_const_[num_leaves_ - 1] * rate);
      for (size_t j = 0; j < leaf_coeff_[num_leaves_ - 1].size(); ++j) {
        leaf_coeff_[num_leaves_ - 1][j] = MaybeRoundToZero(leaf_coeff_[num_leaves_ - 1][j] * rate);
      }
    }
Guolin Ke's avatar
Guolin Ke committed
207
    shrinkage_ *= rate;
Guolin Ke's avatar
Guolin Ke committed
208
209
  }

210
  inline double shrinkage() const { return shrinkage_; }
211

Guolin Ke's avatar
Guolin Ke committed
212
  inline void AddBias(double val) {
213
214
215
216
#pragma omp parallel for schedule(static, 1024) if (num_leaves_ >= 2048)
    for (int i = 0; i < num_leaves_ - 1; ++i) {
      leaf_value_[i] = MaybeRoundToZero(leaf_value_[i] + val);
      internal_value_[i] = MaybeRoundToZero(internal_value_[i] + val);
Guolin Ke's avatar
Guolin Ke committed
217
    }
218
219
    leaf_value_[num_leaves_ - 1] =
        MaybeRoundToZero(leaf_value_[num_leaves_ - 1] + val);
220
221
222
223
224
225
226
    if (is_linear_) {
#pragma omp parallel for schedule(static, 1024) if (num_leaves_ >= 2048)
      for (int i = 0; i < num_leaves_ - 1; ++i) {
        leaf_const_[i] = MaybeRoundToZero(leaf_const_[i] + val);
      }
      leaf_const_[num_leaves_ - 1] = MaybeRoundToZero(leaf_const_[num_leaves_ - 1] + val);
    }
Guolin Ke's avatar
Guolin Ke committed
227
228
229
230
    // force to 1.0
    shrinkage_ = 1.0f;
  }

231
232
233
234
  inline void AsConstantTree(double val) {
    num_leaves_ = 1;
    shrinkage_ = 1.0f;
    leaf_value_[0] = val;
235
236
237
    if (is_linear_) {
      leaf_const_[0] = val;
    }
238
239
  }

wxchan's avatar
wxchan committed
240
  /*! \brief Serialize this object to string*/
Guolin Ke's avatar
Guolin Ke committed
241
  std::string ToString() const;
Guolin Ke's avatar
Guolin Ke committed
242

wxchan's avatar
wxchan committed
243
  /*! \brief Serialize this object to json*/
Guolin Ke's avatar
Guolin Ke committed
244
  std::string ToJSON() const;
wxchan's avatar
wxchan committed
245

246
247
248
  /*! \brief Serialize linear model of tree node to json*/
  std::string LinearModelToJSON(int index) const;

249
  /*! \brief Serialize this object to if-else statement*/
Guolin Ke's avatar
Guolin Ke committed
250
  std::string ToIfElse(int index, bool predict_leaf_index) const;
251

Guolin Ke's avatar
Guolin Ke committed
252
  inline static bool IsZero(double fval) {
253
    return (fval >= -kZeroThreshold && fval <= kZeroThreshold);
Guolin Ke's avatar
Guolin Ke committed
254
255
  }

256
  inline static double MaybeRoundToZero(double fval) {
257
    return IsZero(fval) ? 0 : fval;
258
259
  }

Guolin Ke's avatar
Guolin Ke committed
260
261
262
263
264
265
266
  inline static bool GetDecisionType(int8_t decision_type, int8_t mask) {
    return (decision_type & mask) > 0;
  }

  inline static void SetDecisionType(int8_t* decision_type, bool input, int8_t mask) {
    if (input) {
      (*decision_type) |= mask;
Guolin Ke's avatar
Guolin Ke committed
267
    } else {
Guolin Ke's avatar
Guolin Ke committed
268
      (*decision_type) &= (127 - mask);
Guolin Ke's avatar
Guolin Ke committed
269
270
271
    }
  }

Guolin Ke's avatar
Guolin Ke committed
272
273
274
275
276
277
278
279
280
  inline static int8_t GetMissingType(int8_t decision_type) {
    return (decision_type >> 2) & 3;
  }

  inline static void SetMissingType(int8_t* decision_type, int8_t input) {
    (*decision_type) &= 3;
    (*decision_type) |= (input << 2);
  }

281
282
  void RecomputeMaxDepth();

283
  int NextLeafId() const { return num_leaves_; }
284

285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
  /*! \brief Get the linear model constant term (bias) of one leaf */
  inline double LeafConst(int leaf) const { return leaf_const_[leaf]; }

  /*! \brief Get the linear model coefficients of one leaf */
  inline std::vector<double> LeafCoeffs(int leaf) const { return leaf_coeff_[leaf]; }

  /*! \brief Get the linear model features of one leaf */
  inline std::vector<int> LeafFeaturesInner(int leaf) const {return leaf_features_inner_[leaf]; }

  /*! \brief Get the linear model features of one leaf */
  inline std::vector<int> LeafFeatures(int leaf) const {return leaf_features_[leaf]; }

  /*! \brief Set the linear model coefficients on one leaf */
  inline void SetLeafCoeffs(int leaf, const std::vector<double>& output) {
    leaf_coeff_[leaf].resize(output.size());
    for (size_t i = 0; i < output.size(); ++i) {
      leaf_coeff_[leaf][i] = MaybeRoundToZero(output[i]);
    }
  }

  /*! \brief Set the linear model constant term (bias) on one leaf */
  inline void SetLeafConst(int leaf, double output) {
    leaf_const_[leaf] = MaybeRoundToZero(output);
  }

  /*! \brief Set the linear model features on one leaf */
  inline void SetLeafFeaturesInner(int leaf, const std::vector<int>& features) {
    leaf_features_inner_[leaf] = features;
  }

  /*! \brief Set the linear model features on one leaf */
  inline void SetLeafFeatures(int leaf, const std::vector<int>& features) {
    leaf_features_[leaf] = features;
  }

  inline bool is_linear() const { return is_linear_; }

  inline void SetIsLinear(bool is_linear) {
    is_linear_ = is_linear;
  }

Nikita Titov's avatar
Nikita Titov committed
326
 private:
Guolin Ke's avatar
Guolin Ke committed
327
  std::string NumericalDecisionIfElse(int node) const;
Guolin Ke's avatar
Guolin Ke committed
328

Guolin Ke's avatar
Guolin Ke committed
329
  std::string CategoricalDecisionIfElse(int node) const;
330
331
332

  inline int NumericalDecision(double fval, int node) const {
    uint8_t missing_type = GetMissingType(decision_type_[node]);
333
334
    if (std::isnan(fval) && missing_type != MissingType::NaN) {
      fval = 0.0f;
Guolin Ke's avatar
Guolin Ke committed
335
    }
336
337
    if ((missing_type == MissingType::Zero && IsZero(fval))
        || (missing_type == MissingType::NaN && std::isnan(fval))) {
338
339
      if (GetDecisionType(decision_type_[node], kDefaultLeftMask)) {
        return left_child_[node];
Guolin Ke's avatar
Guolin Ke committed
340
      } else {
341
        return right_child_[node];
Guolin Ke's avatar
Guolin Ke committed
342
343
      }
    }
344
345
346
347
348
    if (fval <= threshold_[node]) {
      return left_child_[node];
    } else {
      return right_child_[node];
    }
Guolin Ke's avatar
Guolin Ke committed
349
  }
Guolin Ke's avatar
Guolin Ke committed
350

351
352
  inline int NumericalDecisionInner(uint32_t fval, int node, uint32_t default_bin, uint32_t max_bin) const {
    uint8_t missing_type = GetMissingType(decision_type_[node]);
353
354
    if ((missing_type == MissingType::Zero && fval == default_bin)
        || (missing_type == MissingType::NaN && fval == max_bin)) {
355
356
357
358
359
360
361
362
      if (GetDecisionType(decision_type_[node], kDefaultLeftMask)) {
        return left_child_[node];
      } else {
        return right_child_[node];
      }
    }
    if (fval <= threshold_in_bin_[node]) {
      return left_child_[node];
363
    } else {
364
      return right_child_[node];
365
366
    }
  }
Guolin Ke's avatar
Guolin Ke committed
367

368
369
370
371
372
373
374
  inline int CategoricalDecision(double fval, int node) const {
    uint8_t missing_type = GetMissingType(decision_type_[node]);
    int int_fval = static_cast<int>(fval);
    if (int_fval < 0) {
      return right_child_[node];;
    } else if (std::isnan(fval)) {
      // NaN is always in the right
375
      if (missing_type == MissingType::NaN) {
376
377
378
379
        return right_child_[node];
      }
      int_fval = 0;
    }
380
    int cat_idx = static_cast<int>(threshold_[node]);
381
382
    if (Common::FindInBitset(cat_threshold_.data() + cat_boundaries_[cat_idx],
                             cat_boundaries_[cat_idx + 1] - cat_boundaries_[cat_idx], int_fval)) {
383
384
385
386
      return left_child_[node];
    }
    return right_child_[node];
  }
Guolin Ke's avatar
Guolin Ke committed
387

388
  inline int CategoricalDecisionInner(uint32_t fval, int node) const {
389
    int cat_idx = static_cast<int>(threshold_in_bin_[node]);
390
391
    if (Common::FindInBitset(cat_threshold_inner_.data() + cat_boundaries_inner_[cat_idx],
                             cat_boundaries_inner_[cat_idx + 1] - cat_boundaries_inner_[cat_idx], fval)) {
392
393
394
395
      return left_child_[node];
    }
    return right_child_[node];
  }
Guolin Ke's avatar
Guolin Ke committed
396

397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
  inline int Decision(double fval, int node) const {
    if (GetDecisionType(decision_type_[node], kCategoricalMask)) {
      return CategoricalDecision(fval, node);
    } else {
      return NumericalDecision(fval, node);
    }
  }

  inline int DecisionInner(uint32_t fval, int node, uint32_t default_bin, uint32_t max_bin) const {
    if (GetDecisionType(decision_type_[node], kCategoricalMask)) {
      return CategoricalDecisionInner(fval, node);
    } else {
      return NumericalDecisionInner(fval, node, default_bin, max_bin);
    }
  }

413
414
  inline void Split(int leaf, int feature, int real_feature, double left_value, double right_value, int left_cnt, int right_cnt,
                    double left_weight, double right_weight, float gain);
Guolin Ke's avatar
Guolin Ke committed
415
  /*!
Qiwei Ye's avatar
Qiwei Ye committed
416
  * \brief Find leaf index of which record belongs by features
Guolin Ke's avatar
Guolin Ke committed
417
418
419
  * \param feature_values Feature value of this record
  * \return Leaf index
  */
420
  inline int GetLeaf(const double* feature_values) const;
421
  inline int GetLeafByMap(const std::unordered_map<int, double>& feature_values) const;
Guolin Ke's avatar
Guolin Ke committed
422

wxchan's avatar
wxchan committed
423
  /*! \brief Serialize one node to json*/
Guolin Ke's avatar
Guolin Ke committed
424
  std::string NodeToJSON(int index) const;
wxchan's avatar
wxchan committed
425

426
  /*! \brief Serialize one node to if-else statement*/
Guolin Ke's avatar
Guolin Ke committed
427
  std::string NodeToIfElse(int index, bool predict_leaf_index) const;
Guolin Ke's avatar
Guolin Ke committed
428

Guolin Ke's avatar
Guolin Ke committed
429
  std::string NodeToIfElseByMap(int index, bool predict_leaf_index) const;
430

431
  double ExpectedValue() const;
Guolin Ke's avatar
Guolin Ke committed
432

433
434
  /*! \brief This is used fill in leaf_depth_ after reloading a model*/
  inline void RecomputeLeafDepths(int node = 0, int depth = 0);
Guolin Ke's avatar
Guolin Ke committed
435
436
437
438
439
440
441
442
443
444

  /*!
  * \brief Used by TreeSHAP for data we keep about our decision path
  */
  struct PathElement {
    int feature_index;
    double zero_fraction;
    double one_fraction;

    // note that pweight is included for convenience and is not tied with the other attributes,
445
    // the pweight of the i'th path element is the permutation weight of paths with i-1 ones in them
Guolin Ke's avatar
Guolin Ke committed
446
447
448
449
450
451
    double pweight;

    PathElement() {}
    PathElement(int i, double z, double o, double w) : feature_index(i), zero_fraction(z), one_fraction(o), pweight(w) {}
  };

452
  /*! \brief Polynomial time algorithm for SHAP values (arXiv:1706.06060)*/
Guolin Ke's avatar
Guolin Ke committed
453
454
455
456
  void TreeSHAP(const double *feature_values, double *phi,
                int node, int unique_depth,
                PathElement *parent_unique_path, double parent_zero_fraction,
                double parent_one_fraction, int parent_feature_index) const;
457

458
459
460
461
462
463
  void TreeSHAPByMap(const std::unordered_map<int, double>& feature_values,
                     std::unordered_map<int, double>* phi,
                     int node, int unique_depth,
                     PathElement *parent_unique_path, double parent_zero_fraction,
                     double parent_one_fraction, int parent_feature_index) const;

464
  /*! \brief Extend our decision path with a fraction of one and zero extensions for TreeSHAP*/
Guolin Ke's avatar
Guolin Ke committed
465
466
  static void ExtendPath(PathElement *unique_path, int unique_depth,
                         double zero_fraction, double one_fraction, int feature_index);
467
468

  /*! \brief Undo a previous extension of the decision path for TreeSHAP*/
Guolin Ke's avatar
Guolin Ke committed
469
  static void UnwindPath(PathElement *unique_path, int unique_depth, int path_index);
470

471
  /*! determine what the total permutation weight would be if we unwound a previous extension in the decision path*/
Guolin Ke's avatar
Guolin Ke committed
472
  static double UnwoundPathSum(const PathElement *unique_path, int unique_depth, int path_index);
473

Guolin Ke's avatar
Guolin Ke committed
474
475
  /*! \brief Number of max leaves*/
  int max_leaves_;
476
  /*! \brief Number of current leaves*/
Guolin Ke's avatar
Guolin Ke committed
477
478
479
  int num_leaves_;
  // following values used for non-leaf node
  /*! \brief A non-leaf node's left child */
Guolin Ke's avatar
Guolin Ke committed
480
  std::vector<int> left_child_;
Guolin Ke's avatar
Guolin Ke committed
481
  /*! \brief A non-leaf node's right child */
Guolin Ke's avatar
Guolin Ke committed
482
  std::vector<int> right_child_;
Guolin Ke's avatar
Guolin Ke committed
483
  /*! \brief A non-leaf node's split feature */
Guolin Ke's avatar
Guolin Ke committed
484
  std::vector<int> split_feature_inner_;
Guolin Ke's avatar
Guolin Ke committed
485
  /*! \brief A non-leaf node's split feature, the original index */
Guolin Ke's avatar
Guolin Ke committed
486
  std::vector<int> split_feature_;
Guolin Ke's avatar
Guolin Ke committed
487
  /*! \brief A non-leaf node's split threshold in bin */
Guolin Ke's avatar
Guolin Ke committed
488
  std::vector<uint32_t> threshold_in_bin_;
Guolin Ke's avatar
Guolin Ke committed
489
  /*! \brief A non-leaf node's split threshold in feature value */
Guolin Ke's avatar
Guolin Ke committed
490
  std::vector<double> threshold_;
491
  int num_cat_;
492
493
494
495
  std::vector<int> cat_boundaries_inner_;
  std::vector<uint32_t> cat_threshold_inner_;
  std::vector<int> cat_boundaries_;
  std::vector<uint32_t> cat_threshold_;
496
  /*! \brief Store the information for categorical feature handle and missing value handle. */
497
  std::vector<int8_t> decision_type_;
Guolin Ke's avatar
Guolin Ke committed
498
  /*! \brief A non-leaf node's split gain */
499
  std::vector<float> split_gain_;
Guolin Ke's avatar
Guolin Ke committed
500
501
  // used for leaf node
  /*! \brief The parent of leaf */
Guolin Ke's avatar
Guolin Ke committed
502
  std::vector<int> leaf_parent_;
Guolin Ke's avatar
Guolin Ke committed
503
  /*! \brief Output of leaves */
Guolin Ke's avatar
Guolin Ke committed
504
  std::vector<double> leaf_value_;
505
506
  /*! \brief weight of leaves */
  std::vector<double> leaf_weight_;
Guolin Ke's avatar
Guolin Ke committed
507
  /*! \brief DataCount of leaves */
508
  std::vector<int> leaf_count_;
Guolin Ke's avatar
Guolin Ke committed
509
510
  /*! \brief Output of non-leaf nodes */
  std::vector<double> internal_value_;
511
512
  /*! \brief weight of non-leaf nodes */
  std::vector<double> internal_weight_;
Guolin Ke's avatar
Guolin Ke committed
513
  /*! \brief DataCount of non-leaf nodes */
514
  std::vector<int> internal_count_;
Guolin Ke's avatar
Guolin Ke committed
515
  /*! \brief Depth for leaves */
Guolin Ke's avatar
Guolin Ke committed
516
  std::vector<int> leaf_depth_;
517
518
519
520
  /*! \brief whether to keep track of ancestor nodes for each leaf (only needed when feature interactions are restricted) */
  bool track_branch_features_;
  /*! \brief Features on leaf's branch, original index */
  std::vector<std::vector<int>> branch_features_;
Guolin Ke's avatar
Guolin Ke committed
521
  double shrinkage_;
522
  int max_depth_;
523
524
525
526
527
528
529
530
531
532
  /*! \brief Tree has linear model at each leaf */
  bool is_linear_;
  /*! \brief coefficients of linear models on leaves */
  std::vector<std::vector<double>> leaf_coeff_;
  /*! \brief constant term (bias) of linear models on leaves */
  std::vector<double> leaf_const_;
  /* \brief features used in leaf linear models; indexing is relative to num_total_features_ */
  std::vector<std::vector<int>> leaf_features_;
  /* \brief features used in leaf linear models; indexing is relative to used_features_ */
  std::vector<std::vector<int>> leaf_features_inner_;
Guolin Ke's avatar
Guolin Ke committed
533
534
};

535
inline void Tree::Split(int leaf, int feature, int real_feature,
536
                        double left_value, double right_value, int left_cnt, int right_cnt,
537
                        double left_weight, double right_weight, float gain) {
538
539
540
541
542
543
544
545
546
547
548
549
550
551
  int new_node_idx = num_leaves_ - 1;
  // update parent info
  int parent = leaf_parent_[leaf];
  if (parent >= 0) {
    // if cur node is left child
    if (left_child_[parent] == ~leaf) {
      left_child_[parent] = new_node_idx;
    } else {
      right_child_[parent] = new_node_idx;
    }
  }
  // add new node
  split_feature_inner_[new_node_idx] = feature;
  split_feature_[new_node_idx] = real_feature;
Guolin Ke's avatar
Guolin Ke committed
552
  split_gain_[new_node_idx] = gain;
553
554
555
556
557
558
559
  // add two new leaves
  left_child_[new_node_idx] = ~leaf;
  right_child_[new_node_idx] = ~num_leaves_;
  // update new leaves
  leaf_parent_[leaf] = new_node_idx;
  leaf_parent_[num_leaves_] = new_node_idx;
  // save current leaf value to internal node before change
560
  internal_weight_[new_node_idx] = leaf_weight_[leaf];
561
562
563
  internal_value_[new_node_idx] = leaf_value_[leaf];
  internal_count_[new_node_idx] = left_cnt + right_cnt;
  leaf_value_[leaf] = std::isnan(left_value) ? 0.0f : left_value;
564
  leaf_weight_[leaf] = left_weight;
565
566
  leaf_count_[leaf] = left_cnt;
  leaf_value_[num_leaves_] = std::isnan(right_value) ? 0.0f : right_value;
567
  leaf_weight_[num_leaves_] = right_weight;
568
569
570
571
  leaf_count_[num_leaves_] = right_cnt;
  // update leaf depth
  leaf_depth_[num_leaves_] = leaf_depth_[leaf] + 1;
  leaf_depth_[leaf]++;
572
573
574
575
576
  if (track_branch_features_) {
    branch_features_[num_leaves_] = branch_features_[leaf];
    branch_features_[num_leaves_].push_back(split_feature_[new_node_idx]);
    branch_features_[leaf].push_back(split_feature_[new_node_idx]);
  }
577
578
}

579
inline double Tree::Predict(const double* feature_values) const {
580
  if (is_linear_) {
581
      int leaf = (num_leaves_ > 1) ? GetLeaf(feature_values) : 0;
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
      double output = leaf_const_[leaf];
      bool nan_found = false;
      for (size_t i = 0; i < leaf_features_[leaf].size(); ++i) {
        int feat_raw = leaf_features_[leaf][i];
        double feat_val = feature_values[feat_raw];
        if (std::isnan(feat_val)) {
          nan_found = true;
          break;
        } else {
          output += leaf_coeff_[leaf][i] * feat_val;
        }
      }
      if (nan_found) {
        return LeafOutput(leaf);
      } else {
        return output;
      }
Guolin Ke's avatar
Guolin Ke committed
599
  } else {
600
601
602
603
604
605
    if (num_leaves_ > 1) {
      int leaf = GetLeaf(feature_values);
      return LeafOutput(leaf);
    } else {
      return leaf_value_[0];
    }
Guolin Ke's avatar
Guolin Ke committed
606
  }
Guolin Ke's avatar
Guolin Ke committed
607
608
}

609
inline double Tree::PredictByMap(const std::unordered_map<int, double>& feature_values) const {
610
  if (is_linear_) {
611
    int leaf = (num_leaves_ > 1) ? GetLeafByMap(feature_values) : 0;
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
    double output = leaf_const_[leaf];
    bool nan_found = false;
    for (size_t i = 0; i < leaf_features_[leaf].size(); ++i) {
      int feat = leaf_features_[leaf][i];
      auto val_it = feature_values.find(feat);
      if (val_it != feature_values.end()) {
        double feat_val = val_it->second;
        if (std::isnan(feat_val)) {
          nan_found = true;
          break;
        } else {
          output += leaf_coeff_[leaf][i] * feat_val;
        }
      }
    }
    if (nan_found) {
      return LeafOutput(leaf);
    } else {
      return output;
    }
632
  } else {
633
634
635
636
637
638
    if (num_leaves_ > 1) {
      int leaf = GetLeafByMap(feature_values);
      return LeafOutput(leaf);
    } else {
      return leaf_value_[0];
    }
639
640
641
  }
}

642
inline int Tree::PredictLeafIndex(const double* feature_values) const {
Guolin Ke's avatar
Guolin Ke committed
643
644
645
646
647
648
  if (num_leaves_ > 1) {
    int leaf = GetLeaf(feature_values);
    return leaf;
  } else {
    return 0;
  }
wxchan's avatar
wxchan committed
649
650
}

651
652
653
654
655
656
657
658
659
inline int Tree::PredictLeafIndexByMap(const std::unordered_map<int, double>& feature_values) const {
  if (num_leaves_ > 1) {
    int leaf = GetLeafByMap(feature_values);
    return leaf;
  } else {
    return 0;
  }
}

660
661
inline void Tree::PredictContrib(const double* feature_values, int num_features, double* output) {
  output[num_features] += ExpectedValue();
662
  // Run the recursion with preallocated space for the unique path data
663
  if (num_leaves_ > 1) {
664
    CHECK_GE(max_depth_, 0);
665
666
667
    const int max_path_len = max_depth_ + 1;
    std::vector<PathElement> unique_path_data(max_path_len*(max_path_len + 1) / 2);
    TreeSHAP(feature_values, output, 0, 0, unique_path_data.data(), 1, 1, -1);
668
669
670
  }
}

671
672
673
674
675
676
677
678
679
680
681
682
inline void Tree::PredictContribByMap(const std::unordered_map<int, double>& feature_values,
                                      int num_features, std::unordered_map<int, double>* output) {
  (*output)[num_features] += ExpectedValue();
  // Run the recursion with preallocated space for the unique path data
  if (num_leaves_ > 1) {
    CHECK_GE(max_depth_, 0);
    const int max_path_len = max_depth_ + 1;
    std::vector<PathElement> unique_path_data(max_path_len*(max_path_len + 1) / 2);
    TreeSHAPByMap(feature_values, output, 0, 0, unique_path_data.data(), 1, 1, -1);
  }
}

683
684
685
686
687
inline void Tree::RecomputeLeafDepths(int node, int depth) {
  if (node == 0) leaf_depth_.resize(num_leaves());
  if (node < 0) {
    leaf_depth_[~node] = depth;
  } else {
Guolin Ke's avatar
Guolin Ke committed
688
689
    RecomputeLeafDepths(left_child_[node], depth + 1);
    RecomputeLeafDepths(right_child_[node], depth + 1);
690
  }
691
692
}

693
inline int Tree::GetLeaf(const double* feature_values) const {
Guolin Ke's avatar
Guolin Ke committed
694
  int node = 0;
695
  if (num_cat_ > 0) {
Guolin Ke's avatar
Guolin Ke committed
696
    while (node >= 0) {
697
      node = Decision(feature_values[split_feature_[node]], node);
Guolin Ke's avatar
Guolin Ke committed
698
699
700
    }
  } else {
    while (node >= 0) {
701
      node = NumericalDecision(feature_values[split_feature_[node]], node);
Guolin Ke's avatar
Guolin Ke committed
702
703
704
705
706
    }
  }
  return ~node;
}

707
708
709
710
711
712
713
714
715
716
717
718
719
720
inline int Tree::GetLeafByMap(const std::unordered_map<int, double>& feature_values) const {
  int node = 0;
  if (num_cat_ > 0) {
    while (node >= 0) {
      node = Decision(feature_values.count(split_feature_[node]) > 0 ? feature_values.at(split_feature_[node]) : 0.0f, node);
    }
  } else {
    while (node >= 0) {
      node = NumericalDecision(feature_values.count(split_feature_[node]) > 0 ? feature_values.at(split_feature_[node]) : 0.0f, node);
    }
  }
  return ~node;
}

Guolin Ke's avatar
Guolin Ke committed
721
722
}  // namespace LightGBM

Guolin Ke's avatar
Guolin Ke committed
723
#endif   // LightGBM_TREE_H_