tree.h 25.9 KB
Newer Older
1
2
3
4
/*!
 * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */
Guolin Ke's avatar
Guolin Ke committed
5
6
7
#ifndef LIGHTGBM_TREE_H_
#define LIGHTGBM_TREE_H_

8
9
10
#include <LightGBM/dataset.h>
#include <LightGBM/meta.h>

Guolin Ke's avatar
Guolin Ke committed
11
#include <string>
12
#include <map>
13
14
15
#include <memory>
#include <unordered_map>
#include <vector>
Guolin Ke's avatar
Guolin Ke committed
16
17
18

namespace LightGBM {

Guolin Ke's avatar
Guolin Ke committed
19
20
#define kCategoricalMask (1)
#define kDefaultLeftMask (2)
Guolin Ke's avatar
Guolin Ke committed
21
22
23
24
25

/*!
* \brief Tree model
*/
class Tree {
Nikita Titov's avatar
Nikita Titov committed
26
 public:
Guolin Ke's avatar
Guolin Ke committed
27
28
29
  /*!
  * \brief Constructor
  * \param max_leaves The number of max leaves
30
  * \param track_branch_features Whether to keep track of ancestors of leaf nodes
31
  * \param is_linear Whether the tree has linear models at each leaf
Guolin Ke's avatar
Guolin Ke committed
32
  */
33
  explicit Tree(int max_leaves, bool track_branch_features, bool is_linear);
Guolin Ke's avatar
Guolin Ke committed
34
35

  /*!
36
  * \brief Constructor, from a string
Guolin Ke's avatar
Guolin Ke committed
37
  * \param str Model string
38
  * \param used_len used count of str
Guolin Ke's avatar
Guolin Ke committed
39
  */
40
  Tree(const char* str, size_t* used_len);
Guolin Ke's avatar
Guolin Ke committed
41

42
  virtual ~Tree() noexcept = default;
Guolin Ke's avatar
Guolin Ke committed
43
44

  /*!
Qiwei Ye's avatar
Qiwei Ye committed
45
46
47
  * \brief Performing a split on tree leaves.
  * \param leaf Index of leaf to be split
  * \param feature Index of feature; the converted index after removing useless features
Guolin Ke's avatar
Guolin Ke committed
48
  * \param real_feature Index of feature, the original index on data
49
  * \param threshold_bin Threshold(bin) of split
50
  * \param threshold_double Threshold on feature value
Guolin Ke's avatar
Guolin Ke committed
51
52
  * \param left_value Model Left child output
  * \param right_value Model Right child output
Guolin Ke's avatar
Guolin Ke committed
53
54
  * \param left_cnt Count of left child
  * \param right_cnt Count of right child
55
56
  * \param left_weight Weight of left child
  * \param right_weight Weight of right child
Guolin Ke's avatar
Guolin Ke committed
57
  * \param gain Split gain
Guolin Ke's avatar
Guolin Ke committed
58
59
  * \param missing_type missing type
  * \param default_left default direction for missing value
Guolin Ke's avatar
Guolin Ke committed
60
61
  * \return The index of new leaf.
  */
62
63
  int Split(int leaf, int feature, int real_feature, uint32_t threshold_bin,
            double threshold_double, double left_value, double right_value,
64
65
            int left_cnt, int right_cnt, double left_weight, double right_weight,
            float gain, MissingType missing_type, bool default_left);
Guolin Ke's avatar
Guolin Ke committed
66

67
68
69
70
71
72
73
  /*!
  * \brief Performing a split on tree leaves, with categorical feature
  * \param leaf Index of leaf to be split
  * \param feature Index of feature; the converted index after removing useless features
  * \param real_feature Index of feature, the original index on data
  * \param threshold_bin Threshold(bin) of split, use bitset to represent
  * \param num_threshold_bin size of threshold_bin
74
75
  * \param threshold Thresholds of real feature value, use bitset to represent
  * \param num_threshold size of threshold
76
77
78
79
  * \param left_value Model Left child output
  * \param right_value Model Right child output
  * \param left_cnt Count of left child
  * \param right_cnt Count of right child
80
81
  * \param left_weight Weight of left child
  * \param right_weight Weight of right child
82
83
84
  * \param gain Split gain
  * \return The index of new leaf.
  */
85
86
  int SplitCategorical(int leaf, int feature, int real_feature, const uint32_t* threshold_bin, int num_threshold_bin,
                       const uint32_t* threshold, int num_threshold, double left_value, double right_value,
87
                       int left_cnt, int right_cnt, double left_weight, double right_weight, float gain, MissingType missing_type);
88

Guolin Ke's avatar
Guolin Ke committed
89
  /*! \brief Get the output of one leaf */
90
  inline double LeafOutput(int leaf) const { return leaf_value_[leaf]; }
Guolin Ke's avatar
Guolin Ke committed
91

Guolin Ke's avatar
Guolin Ke committed
92
93
  /*! \brief Set the output of one leaf */
  inline void SetLeafOutput(int leaf, double output) {
94
    leaf_value_[leaf] = MaybeRoundToZero(output);
Guolin Ke's avatar
Guolin Ke committed
95
96
  }

Guolin Ke's avatar
Guolin Ke committed
97
  /*!
Qiwei Ye's avatar
Qiwei Ye committed
98
  * \brief Adding prediction value of this tree model to scores
Guolin Ke's avatar
Guolin Ke committed
99
100
101
102
  * \param data The dataset
  * \param num_data Number of total data
  * \param score Will add prediction to score
  */
103
  virtual void AddPredictionToScore(const Dataset* data,
104
105
                            data_size_t num_data,
                            double* score) const;
Guolin Ke's avatar
Guolin Ke committed
106
107

  /*!
108
  * \brief Adding prediction value of this tree model to scores
Guolin Ke's avatar
Guolin Ke committed
109
110
111
112
113
  * \param data The dataset
  * \param used_data_indices Indices of used data
  * \param num_data Number of total data
  * \param score Will add prediction to score
  */
114
  virtual void AddPredictionToScore(const Dataset* data,
Qiwei Ye's avatar
Qiwei Ye committed
115
                            const data_size_t* used_data_indices,
116
                            data_size_t num_data, double* score) const;
Guolin Ke's avatar
Guolin Ke committed
117

118
119
120
121
122
123
124
125
126
127
  /*!
  * \brief Get upper bound leaf value of this tree model
  */
  double GetUpperBoundValue() const;

  /*!
  * \brief Get lower bound leaf value of this tree model
  */
  double GetLowerBoundValue() const;

Guolin Ke's avatar
Guolin Ke committed
128
  /*!
129
  * \brief Prediction on one record
Guolin Ke's avatar
Guolin Ke committed
130
131
132
  * \param feature_values Feature value of this record
  * \return Prediction result
  */
133
  inline double Predict(const double* feature_values) const;
134
  inline double PredictByMap(const std::unordered_map<int, double>& feature_values) const;
135

136
  inline int PredictLeafIndex(const double* feature_values) const;
137
138
  inline int PredictLeafIndexByMap(const std::unordered_map<int, double>& feature_values) const;

139
  inline void PredictContrib(const double* feature_values, int num_features, double* output);
140
141
  inline void PredictContribByMap(const std::unordered_map<int, double>& feature_values,
                                  int num_features, std::unordered_map<int, double>* output);
142

Guolin Ke's avatar
Guolin Ke committed
143
144
145
  /*! \brief Get Number of leaves*/
  inline int num_leaves() const { return num_leaves_; }

Guolin Ke's avatar
Guolin Ke committed
146
147
148
  /*! \brief Get depth of specific leaf*/
  inline int leaf_depth(int leaf_idx) const { return leaf_depth_[leaf_idx]; }

Belinda Trotta's avatar
Belinda Trotta committed
149
150
151
  /*! \brief Get parent of specific leaf*/
  inline int leaf_parent(int leaf_idx) const {return leaf_parent_[leaf_idx]; }

152
  /*! \brief Get feature of specific split (original feature index)*/
Guolin Ke's avatar
Guolin Ke committed
153
  inline int split_feature(int split_idx) const { return split_feature_[split_idx]; }
wxchan's avatar
wxchan committed
154

155
156
157
  /*! \brief Get feature of specific split*/
  inline int split_feature_inner(int split_idx) const { return split_feature_inner_[split_idx]; }

158
159
160
  /*! \brief Get features on leaf's branch*/
  inline std::vector<int> branch_features(int leaf) const { return branch_features_[leaf]; }

Guolin Ke's avatar
Guolin Ke committed
161
162
  inline double split_gain(int split_idx) const { return split_gain_[split_idx]; }

163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
  inline double internal_value(int node_idx) const {
    return internal_value_[node_idx];
  }

  inline bool IsNumericalSplit(int node_idx) const {
    return !GetDecisionType(decision_type_[node_idx], kCategoricalMask);
  }

  inline int left_child(int node_idx) const { return left_child_[node_idx]; }

  inline int right_child(int node_idx) const { return right_child_[node_idx]; }

  inline uint32_t threshold_in_bin(int node_idx) const {
    return threshold_in_bin_[node_idx];
  }

179
  /*! \brief Get the number of data points that fall at or below this node*/
Guolin Ke's avatar
Guolin Ke committed
180
  inline int data_count(int node) const { return node >= 0 ? internal_count_[node] : leaf_count_[~node]; }
181

Guolin Ke's avatar
Guolin Ke committed
182
183
  /*!
  * \brief Shrinkage for the tree's output
184
  *        shrinkage rate (a.k.a learning rate) is used to tune the training process
Guolin Ke's avatar
Guolin Ke committed
185
186
  * \param rate The factor of shrinkage
  */
187
  virtual inline void Shrinkage(double rate) {
188
189
190
191
#pragma omp parallel for schedule(static, 1024) if (num_leaves_ >= 2048)
    for (int i = 0; i < num_leaves_ - 1; ++i) {
      leaf_value_[i] = MaybeRoundToZero(leaf_value_[i] * rate);
      internal_value_[i] = MaybeRoundToZero(internal_value_[i] * rate);
192
193
194
195
196
197
      if (is_linear_) {
        leaf_const_[i] = MaybeRoundToZero(leaf_const_[i] * rate);
        for (size_t j = 0; j < leaf_coeff_[i].size(); ++j) {
          leaf_coeff_[i][j] = MaybeRoundToZero(leaf_coeff_[i][j] * rate);
        }
      }
Guolin Ke's avatar
Guolin Ke committed
198
    }
199
200
    leaf_value_[num_leaves_ - 1] =
        MaybeRoundToZero(leaf_value_[num_leaves_ - 1] * rate);
201
202
203
204
205
206
    if (is_linear_) {
      leaf_const_[num_leaves_ - 1] = MaybeRoundToZero(leaf_const_[num_leaves_ - 1] * rate);
      for (size_t j = 0; j < leaf_coeff_[num_leaves_ - 1].size(); ++j) {
        leaf_coeff_[num_leaves_ - 1][j] = MaybeRoundToZero(leaf_coeff_[num_leaves_ - 1][j] * rate);
      }
    }
Guolin Ke's avatar
Guolin Ke committed
207
    shrinkage_ *= rate;
Guolin Ke's avatar
Guolin Ke committed
208
209
  }

210
  inline double shrinkage() const { return shrinkage_; }
211

212
  virtual inline void AddBias(double val) {
213
214
215
216
#pragma omp parallel for schedule(static, 1024) if (num_leaves_ >= 2048)
    for (int i = 0; i < num_leaves_ - 1; ++i) {
      leaf_value_[i] = MaybeRoundToZero(leaf_value_[i] + val);
      internal_value_[i] = MaybeRoundToZero(internal_value_[i] + val);
Guolin Ke's avatar
Guolin Ke committed
217
    }
218
219
    leaf_value_[num_leaves_ - 1] =
        MaybeRoundToZero(leaf_value_[num_leaves_ - 1] + val);
220
221
222
223
224
225
226
    if (is_linear_) {
#pragma omp parallel for schedule(static, 1024) if (num_leaves_ >= 2048)
      for (int i = 0; i < num_leaves_ - 1; ++i) {
        leaf_const_[i] = MaybeRoundToZero(leaf_const_[i] + val);
      }
      leaf_const_[num_leaves_ - 1] = MaybeRoundToZero(leaf_const_[num_leaves_ - 1] + val);
    }
Guolin Ke's avatar
Guolin Ke committed
227
228
229
230
    // force to 1.0
    shrinkage_ = 1.0f;
  }

231
  virtual inline void AsConstantTree(double val) {
232
233
234
    num_leaves_ = 1;
    shrinkage_ = 1.0f;
    leaf_value_[0] = val;
235
236
237
    if (is_linear_) {
      leaf_const_[0] = val;
    }
238
239
  }

wxchan's avatar
wxchan committed
240
  /*! \brief Serialize this object to string*/
Guolin Ke's avatar
Guolin Ke committed
241
  std::string ToString() const;
Guolin Ke's avatar
Guolin Ke committed
242

wxchan's avatar
wxchan committed
243
  /*! \brief Serialize this object to json*/
Guolin Ke's avatar
Guolin Ke committed
244
  std::string ToJSON() const;
wxchan's avatar
wxchan committed
245

246
247
248
  /*! \brief Serialize linear model of tree node to json*/
  std::string LinearModelToJSON(int index) const;

249
  /*! \brief Serialize this object to if-else statement*/
Guolin Ke's avatar
Guolin Ke committed
250
  std::string ToIfElse(int index, bool predict_leaf_index) const;
251

Guolin Ke's avatar
Guolin Ke committed
252
  inline static bool IsZero(double fval) {
253
    return (fval >= -kZeroThreshold && fval <= kZeroThreshold);
Guolin Ke's avatar
Guolin Ke committed
254
255
  }

256
  inline static double MaybeRoundToZero(double fval) {
257
    return IsZero(fval) ? 0 : fval;
258
259
  }

Guolin Ke's avatar
Guolin Ke committed
260
261
262
263
264
265
266
  inline static bool GetDecisionType(int8_t decision_type, int8_t mask) {
    return (decision_type & mask) > 0;
  }

  inline static void SetDecisionType(int8_t* decision_type, bool input, int8_t mask) {
    if (input) {
      (*decision_type) |= mask;
Guolin Ke's avatar
Guolin Ke committed
267
    } else {
Guolin Ke's avatar
Guolin Ke committed
268
      (*decision_type) &= (127 - mask);
Guolin Ke's avatar
Guolin Ke committed
269
270
271
    }
  }

Guolin Ke's avatar
Guolin Ke committed
272
273
274
275
276
277
278
279
280
  inline static int8_t GetMissingType(int8_t decision_type) {
    return (decision_type >> 2) & 3;
  }

  inline static void SetMissingType(int8_t* decision_type, int8_t input) {
    (*decision_type) &= 3;
    (*decision_type) |= (input << 2);
  }

281
282
  void RecomputeMaxDepth();

283
  int NextLeafId() const { return num_leaves_; }
284

285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
  /*! \brief Get the linear model constant term (bias) of one leaf */
  inline double LeafConst(int leaf) const { return leaf_const_[leaf]; }

  /*! \brief Get the linear model coefficients of one leaf */
  inline std::vector<double> LeafCoeffs(int leaf) const { return leaf_coeff_[leaf]; }

  /*! \brief Get the linear model features of one leaf */
  inline std::vector<int> LeafFeaturesInner(int leaf) const {return leaf_features_inner_[leaf]; }

  /*! \brief Get the linear model features of one leaf */
  inline std::vector<int> LeafFeatures(int leaf) const {return leaf_features_[leaf]; }

  /*! \brief Set the linear model coefficients on one leaf */
  inline void SetLeafCoeffs(int leaf, const std::vector<double>& output) {
    leaf_coeff_[leaf].resize(output.size());
    for (size_t i = 0; i < output.size(); ++i) {
      leaf_coeff_[leaf][i] = MaybeRoundToZero(output[i]);
    }
  }

  /*! \brief Set the linear model constant term (bias) on one leaf */
  inline void SetLeafConst(int leaf, double output) {
    leaf_const_[leaf] = MaybeRoundToZero(output);
  }

  /*! \brief Set the linear model features on one leaf */
  inline void SetLeafFeaturesInner(int leaf, const std::vector<int>& features) {
    leaf_features_inner_[leaf] = features;
  }

  /*! \brief Set the linear model features on one leaf */
  inline void SetLeafFeatures(int leaf, const std::vector<int>& features) {
    leaf_features_[leaf] = features;
  }

  inline bool is_linear() const { return is_linear_; }

322
323
324
325
  #ifdef USE_CUDA_EXP
  inline bool is_cuda_tree() const { return is_cuda_tree_; }
  #endif  // USE_CUDA_EXP

326
327
328
329
  inline void SetIsLinear(bool is_linear) {
    is_linear_ = is_linear;
  }

330
 protected:
Guolin Ke's avatar
Guolin Ke committed
331
  std::string NumericalDecisionIfElse(int node) const;
Guolin Ke's avatar
Guolin Ke committed
332

Guolin Ke's avatar
Guolin Ke committed
333
  std::string CategoricalDecisionIfElse(int node) const;
334
335
336

  inline int NumericalDecision(double fval, int node) const {
    uint8_t missing_type = GetMissingType(decision_type_[node]);
337
338
    if (std::isnan(fval) && missing_type != MissingType::NaN) {
      fval = 0.0f;
Guolin Ke's avatar
Guolin Ke committed
339
    }
340
341
    if ((missing_type == MissingType::Zero && IsZero(fval))
        || (missing_type == MissingType::NaN && std::isnan(fval))) {
342
343
      if (GetDecisionType(decision_type_[node], kDefaultLeftMask)) {
        return left_child_[node];
Guolin Ke's avatar
Guolin Ke committed
344
      } else {
345
        return right_child_[node];
Guolin Ke's avatar
Guolin Ke committed
346
347
      }
    }
348
349
350
351
352
    if (fval <= threshold_[node]) {
      return left_child_[node];
    } else {
      return right_child_[node];
    }
Guolin Ke's avatar
Guolin Ke committed
353
  }
Guolin Ke's avatar
Guolin Ke committed
354

355
356
  inline int NumericalDecisionInner(uint32_t fval, int node, uint32_t default_bin, uint32_t max_bin) const {
    uint8_t missing_type = GetMissingType(decision_type_[node]);
357
358
    if ((missing_type == MissingType::Zero && fval == default_bin)
        || (missing_type == MissingType::NaN && fval == max_bin)) {
359
360
361
362
363
364
365
366
      if (GetDecisionType(decision_type_[node], kDefaultLeftMask)) {
        return left_child_[node];
      } else {
        return right_child_[node];
      }
    }
    if (fval <= threshold_in_bin_[node]) {
      return left_child_[node];
367
    } else {
368
      return right_child_[node];
369
370
    }
  }
Guolin Ke's avatar
Guolin Ke committed
371

372
  inline int CategoricalDecision(double fval, int node) const {
373
374
375
376
377
378
    int int_fval;
    if (std::isnan(fval)) {
      return right_child_[node];
    } else {
      int_fval = static_cast<int>(fval);
      if (int_fval < 0) {
379
380
381
        return right_child_[node];
      }
    }
382
    int cat_idx = static_cast<int>(threshold_[node]);
383
384
    if (Common::FindInBitset(cat_threshold_.data() + cat_boundaries_[cat_idx],
                             cat_boundaries_[cat_idx + 1] - cat_boundaries_[cat_idx], int_fval)) {
385
386
387
388
      return left_child_[node];
    }
    return right_child_[node];
  }
Guolin Ke's avatar
Guolin Ke committed
389

390
  inline int CategoricalDecisionInner(uint32_t fval, int node) const {
391
    int cat_idx = static_cast<int>(threshold_in_bin_[node]);
392
393
    if (Common::FindInBitset(cat_threshold_inner_.data() + cat_boundaries_inner_[cat_idx],
                             cat_boundaries_inner_[cat_idx + 1] - cat_boundaries_inner_[cat_idx], fval)) {
394
395
396
397
      return left_child_[node];
    }
    return right_child_[node];
  }
Guolin Ke's avatar
Guolin Ke committed
398

399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
  inline int Decision(double fval, int node) const {
    if (GetDecisionType(decision_type_[node], kCategoricalMask)) {
      return CategoricalDecision(fval, node);
    } else {
      return NumericalDecision(fval, node);
    }
  }

  inline int DecisionInner(uint32_t fval, int node, uint32_t default_bin, uint32_t max_bin) const {
    if (GetDecisionType(decision_type_[node], kCategoricalMask)) {
      return CategoricalDecisionInner(fval, node);
    } else {
      return NumericalDecisionInner(fval, node, default_bin, max_bin);
    }
  }

415
416
  inline void Split(int leaf, int feature, int real_feature, double left_value, double right_value, int left_cnt, int right_cnt,
                    double left_weight, double right_weight, float gain);
Guolin Ke's avatar
Guolin Ke committed
417
  /*!
Qiwei Ye's avatar
Qiwei Ye committed
418
  * \brief Find leaf index of which record belongs by features
Guolin Ke's avatar
Guolin Ke committed
419
420
421
  * \param feature_values Feature value of this record
  * \return Leaf index
  */
422
  inline int GetLeaf(const double* feature_values) const;
423
  inline int GetLeafByMap(const std::unordered_map<int, double>& feature_values) const;
Guolin Ke's avatar
Guolin Ke committed
424

wxchan's avatar
wxchan committed
425
  /*! \brief Serialize one node to json*/
Guolin Ke's avatar
Guolin Ke committed
426
  std::string NodeToJSON(int index) const;
wxchan's avatar
wxchan committed
427

428
  /*! \brief Serialize one node to if-else statement*/
Guolin Ke's avatar
Guolin Ke committed
429
  std::string NodeToIfElse(int index, bool predict_leaf_index) const;
Guolin Ke's avatar
Guolin Ke committed
430

Guolin Ke's avatar
Guolin Ke committed
431
  std::string NodeToIfElseByMap(int index, bool predict_leaf_index) const;
432

433
  double ExpectedValue() const;
Guolin Ke's avatar
Guolin Ke committed
434

435
436
  /*! \brief This is used fill in leaf_depth_ after reloading a model*/
  inline void RecomputeLeafDepths(int node = 0, int depth = 0);
Guolin Ke's avatar
Guolin Ke committed
437
438
439
440
441
442
443
444
445
446

  /*!
  * \brief Used by TreeSHAP for data we keep about our decision path
  */
  struct PathElement {
    int feature_index;
    double zero_fraction;
    double one_fraction;

    // note that pweight is included for convenience and is not tied with the other attributes,
447
    // the pweight of the i'th path element is the permutation weight of paths with i-1 ones in them
Guolin Ke's avatar
Guolin Ke committed
448
449
450
451
452
453
    double pweight;

    PathElement() {}
    PathElement(int i, double z, double o, double w) : feature_index(i), zero_fraction(z), one_fraction(o), pweight(w) {}
  };

454
  /*! \brief Polynomial time algorithm for SHAP values (arXiv:1706.06060)*/
Guolin Ke's avatar
Guolin Ke committed
455
456
457
458
  void TreeSHAP(const double *feature_values, double *phi,
                int node, int unique_depth,
                PathElement *parent_unique_path, double parent_zero_fraction,
                double parent_one_fraction, int parent_feature_index) const;
459

460
461
462
463
464
465
  void TreeSHAPByMap(const std::unordered_map<int, double>& feature_values,
                     std::unordered_map<int, double>* phi,
                     int node, int unique_depth,
                     PathElement *parent_unique_path, double parent_zero_fraction,
                     double parent_one_fraction, int parent_feature_index) const;

466
  /*! \brief Extend our decision path with a fraction of one and zero extensions for TreeSHAP*/
Guolin Ke's avatar
Guolin Ke committed
467
468
  static void ExtendPath(PathElement *unique_path, int unique_depth,
                         double zero_fraction, double one_fraction, int feature_index);
469
470

  /*! \brief Undo a previous extension of the decision path for TreeSHAP*/
Guolin Ke's avatar
Guolin Ke committed
471
  static void UnwindPath(PathElement *unique_path, int unique_depth, int path_index);
472

473
  /*! determine what the total permutation weight would be if we unwound a previous extension in the decision path*/
Guolin Ke's avatar
Guolin Ke committed
474
  static double UnwoundPathSum(const PathElement *unique_path, int unique_depth, int path_index);
475

Guolin Ke's avatar
Guolin Ke committed
476
477
  /*! \brief Number of max leaves*/
  int max_leaves_;
478
  /*! \brief Number of current leaves*/
Guolin Ke's avatar
Guolin Ke committed
479
480
481
  int num_leaves_;
  // following values used for non-leaf node
  /*! \brief A non-leaf node's left child */
Guolin Ke's avatar
Guolin Ke committed
482
  std::vector<int> left_child_;
Guolin Ke's avatar
Guolin Ke committed
483
  /*! \brief A non-leaf node's right child */
Guolin Ke's avatar
Guolin Ke committed
484
  std::vector<int> right_child_;
Guolin Ke's avatar
Guolin Ke committed
485
  /*! \brief A non-leaf node's split feature */
Guolin Ke's avatar
Guolin Ke committed
486
  std::vector<int> split_feature_inner_;
Guolin Ke's avatar
Guolin Ke committed
487
  /*! \brief A non-leaf node's split feature, the original index */
Guolin Ke's avatar
Guolin Ke committed
488
  std::vector<int> split_feature_;
Guolin Ke's avatar
Guolin Ke committed
489
  /*! \brief A non-leaf node's split threshold in bin */
Guolin Ke's avatar
Guolin Ke committed
490
  std::vector<uint32_t> threshold_in_bin_;
Guolin Ke's avatar
Guolin Ke committed
491
  /*! \brief A non-leaf node's split threshold in feature value */
Guolin Ke's avatar
Guolin Ke committed
492
  std::vector<double> threshold_;
493
  int num_cat_;
494
495
496
497
  std::vector<int> cat_boundaries_inner_;
  std::vector<uint32_t> cat_threshold_inner_;
  std::vector<int> cat_boundaries_;
  std::vector<uint32_t> cat_threshold_;
498
  /*! \brief Store the information for categorical feature handle and missing value handle. */
499
  std::vector<int8_t> decision_type_;
Guolin Ke's avatar
Guolin Ke committed
500
  /*! \brief A non-leaf node's split gain */
501
  std::vector<float> split_gain_;
Guolin Ke's avatar
Guolin Ke committed
502
503
  // used for leaf node
  /*! \brief The parent of leaf */
Guolin Ke's avatar
Guolin Ke committed
504
  std::vector<int> leaf_parent_;
Guolin Ke's avatar
Guolin Ke committed
505
  /*! \brief Output of leaves */
Guolin Ke's avatar
Guolin Ke committed
506
  std::vector<double> leaf_value_;
507
508
  /*! \brief weight of leaves */
  std::vector<double> leaf_weight_;
Guolin Ke's avatar
Guolin Ke committed
509
  /*! \brief DataCount of leaves */
510
  std::vector<int> leaf_count_;
Guolin Ke's avatar
Guolin Ke committed
511
512
  /*! \brief Output of non-leaf nodes */
  std::vector<double> internal_value_;
513
514
  /*! \brief weight of non-leaf nodes */
  std::vector<double> internal_weight_;
Guolin Ke's avatar
Guolin Ke committed
515
  /*! \brief DataCount of non-leaf nodes */
516
  std::vector<int> internal_count_;
Guolin Ke's avatar
Guolin Ke committed
517
  /*! \brief Depth for leaves */
Guolin Ke's avatar
Guolin Ke committed
518
  std::vector<int> leaf_depth_;
519
520
521
522
  /*! \brief whether to keep track of ancestor nodes for each leaf (only needed when feature interactions are restricted) */
  bool track_branch_features_;
  /*! \brief Features on leaf's branch, original index */
  std::vector<std::vector<int>> branch_features_;
Guolin Ke's avatar
Guolin Ke committed
523
  double shrinkage_;
524
  int max_depth_;
525
526
527
528
529
530
531
532
533
534
  /*! \brief Tree has linear model at each leaf */
  bool is_linear_;
  /*! \brief coefficients of linear models on leaves */
  std::vector<std::vector<double>> leaf_coeff_;
  /*! \brief constant term (bias) of linear models on leaves */
  std::vector<double> leaf_const_;
  /* \brief features used in leaf linear models; indexing is relative to num_total_features_ */
  std::vector<std::vector<int>> leaf_features_;
  /* \brief features used in leaf linear models; indexing is relative to used_features_ */
  std::vector<std::vector<int>> leaf_features_inner_;
535
536
537
538
  #ifdef USE_CUDA_EXP
  /*! \brief Marks whether this tree is a CUDATree */
  bool is_cuda_tree_;
  #endif  // USE_CUDA_EXP
Guolin Ke's avatar
Guolin Ke committed
539
540
};

541
inline void Tree::Split(int leaf, int feature, int real_feature,
542
                        double left_value, double right_value, int left_cnt, int right_cnt,
543
                        double left_weight, double right_weight, float gain) {
544
545
546
547
548
549
550
551
552
553
554
555
556
557
  int new_node_idx = num_leaves_ - 1;
  // update parent info
  int parent = leaf_parent_[leaf];
  if (parent >= 0) {
    // if cur node is left child
    if (left_child_[parent] == ~leaf) {
      left_child_[parent] = new_node_idx;
    } else {
      right_child_[parent] = new_node_idx;
    }
  }
  // add new node
  split_feature_inner_[new_node_idx] = feature;
  split_feature_[new_node_idx] = real_feature;
Guolin Ke's avatar
Guolin Ke committed
558
  split_gain_[new_node_idx] = gain;
559
560
561
562
563
564
565
  // add two new leaves
  left_child_[new_node_idx] = ~leaf;
  right_child_[new_node_idx] = ~num_leaves_;
  // update new leaves
  leaf_parent_[leaf] = new_node_idx;
  leaf_parent_[num_leaves_] = new_node_idx;
  // save current leaf value to internal node before change
566
  internal_weight_[new_node_idx] = leaf_weight_[leaf];
567
568
569
  internal_value_[new_node_idx] = leaf_value_[leaf];
  internal_count_[new_node_idx] = left_cnt + right_cnt;
  leaf_value_[leaf] = std::isnan(left_value) ? 0.0f : left_value;
570
  leaf_weight_[leaf] = left_weight;
571
572
  leaf_count_[leaf] = left_cnt;
  leaf_value_[num_leaves_] = std::isnan(right_value) ? 0.0f : right_value;
573
  leaf_weight_[num_leaves_] = right_weight;
574
575
576
577
  leaf_count_[num_leaves_] = right_cnt;
  // update leaf depth
  leaf_depth_[num_leaves_] = leaf_depth_[leaf] + 1;
  leaf_depth_[leaf]++;
578
579
580
581
582
  if (track_branch_features_) {
    branch_features_[num_leaves_] = branch_features_[leaf];
    branch_features_[num_leaves_].push_back(split_feature_[new_node_idx]);
    branch_features_[leaf].push_back(split_feature_[new_node_idx]);
  }
583
584
}

585
inline double Tree::Predict(const double* feature_values) const {
586
  if (is_linear_) {
587
      int leaf = (num_leaves_ > 1) ? GetLeaf(feature_values) : 0;
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
      double output = leaf_const_[leaf];
      bool nan_found = false;
      for (size_t i = 0; i < leaf_features_[leaf].size(); ++i) {
        int feat_raw = leaf_features_[leaf][i];
        double feat_val = feature_values[feat_raw];
        if (std::isnan(feat_val)) {
          nan_found = true;
          break;
        } else {
          output += leaf_coeff_[leaf][i] * feat_val;
        }
      }
      if (nan_found) {
        return LeafOutput(leaf);
      } else {
        return output;
      }
Guolin Ke's avatar
Guolin Ke committed
605
  } else {
606
607
608
609
610
611
    if (num_leaves_ > 1) {
      int leaf = GetLeaf(feature_values);
      return LeafOutput(leaf);
    } else {
      return leaf_value_[0];
    }
Guolin Ke's avatar
Guolin Ke committed
612
  }
Guolin Ke's avatar
Guolin Ke committed
613
614
}

615
inline double Tree::PredictByMap(const std::unordered_map<int, double>& feature_values) const {
616
  if (is_linear_) {
617
    int leaf = (num_leaves_ > 1) ? GetLeafByMap(feature_values) : 0;
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
    double output = leaf_const_[leaf];
    bool nan_found = false;
    for (size_t i = 0; i < leaf_features_[leaf].size(); ++i) {
      int feat = leaf_features_[leaf][i];
      auto val_it = feature_values.find(feat);
      if (val_it != feature_values.end()) {
        double feat_val = val_it->second;
        if (std::isnan(feat_val)) {
          nan_found = true;
          break;
        } else {
          output += leaf_coeff_[leaf][i] * feat_val;
        }
      }
    }
    if (nan_found) {
      return LeafOutput(leaf);
    } else {
      return output;
    }
638
  } else {
639
640
641
642
643
644
    if (num_leaves_ > 1) {
      int leaf = GetLeafByMap(feature_values);
      return LeafOutput(leaf);
    } else {
      return leaf_value_[0];
    }
645
646
647
  }
}

648
inline int Tree::PredictLeafIndex(const double* feature_values) const {
Guolin Ke's avatar
Guolin Ke committed
649
650
651
652
653
654
  if (num_leaves_ > 1) {
    int leaf = GetLeaf(feature_values);
    return leaf;
  } else {
    return 0;
  }
wxchan's avatar
wxchan committed
655
656
}

657
658
659
660
661
662
663
664
665
inline int Tree::PredictLeafIndexByMap(const std::unordered_map<int, double>& feature_values) const {
  if (num_leaves_ > 1) {
    int leaf = GetLeafByMap(feature_values);
    return leaf;
  } else {
    return 0;
  }
}

666
667
inline void Tree::PredictContrib(const double* feature_values, int num_features, double* output) {
  output[num_features] += ExpectedValue();
668
  // Run the recursion with preallocated space for the unique path data
669
  if (num_leaves_ > 1) {
670
    CHECK_GE(max_depth_, 0);
671
672
673
    const int max_path_len = max_depth_ + 1;
    std::vector<PathElement> unique_path_data(max_path_len*(max_path_len + 1) / 2);
    TreeSHAP(feature_values, output, 0, 0, unique_path_data.data(), 1, 1, -1);
674
675
676
  }
}

677
678
679
680
681
682
683
684
685
686
687
688
inline void Tree::PredictContribByMap(const std::unordered_map<int, double>& feature_values,
                                      int num_features, std::unordered_map<int, double>* output) {
  (*output)[num_features] += ExpectedValue();
  // Run the recursion with preallocated space for the unique path data
  if (num_leaves_ > 1) {
    CHECK_GE(max_depth_, 0);
    const int max_path_len = max_depth_ + 1;
    std::vector<PathElement> unique_path_data(max_path_len*(max_path_len + 1) / 2);
    TreeSHAPByMap(feature_values, output, 0, 0, unique_path_data.data(), 1, 1, -1);
  }
}

689
690
691
692
693
inline void Tree::RecomputeLeafDepths(int node, int depth) {
  if (node == 0) leaf_depth_.resize(num_leaves());
  if (node < 0) {
    leaf_depth_[~node] = depth;
  } else {
Guolin Ke's avatar
Guolin Ke committed
694
695
    RecomputeLeafDepths(left_child_[node], depth + 1);
    RecomputeLeafDepths(right_child_[node], depth + 1);
696
  }
697
698
}

699
inline int Tree::GetLeaf(const double* feature_values) const {
Guolin Ke's avatar
Guolin Ke committed
700
  int node = 0;
701
  if (num_cat_ > 0) {
Guolin Ke's avatar
Guolin Ke committed
702
    while (node >= 0) {
703
      node = Decision(feature_values[split_feature_[node]], node);
Guolin Ke's avatar
Guolin Ke committed
704
705
706
    }
  } else {
    while (node >= 0) {
707
      node = NumericalDecision(feature_values[split_feature_[node]], node);
Guolin Ke's avatar
Guolin Ke committed
708
709
710
711
712
    }
  }
  return ~node;
}

713
714
715
716
717
718
719
720
721
722
723
724
725
726
inline int Tree::GetLeafByMap(const std::unordered_map<int, double>& feature_values) const {
  int node = 0;
  if (num_cat_ > 0) {
    while (node >= 0) {
      node = Decision(feature_values.count(split_feature_[node]) > 0 ? feature_values.at(split_feature_[node]) : 0.0f, node);
    }
  } else {
    while (node >= 0) {
      node = NumericalDecision(feature_values.count(split_feature_[node]) > 0 ? feature_values.at(split_feature_[node]) : 0.0f, node);
    }
  }
  return ~node;
}

Guolin Ke's avatar
Guolin Ke committed
727
728
}  // namespace LightGBM

Guolin Ke's avatar
Guolin Ke committed
729
#endif   // LightGBM_TREE_H_