tree.h 26 KB
Newer Older
1
2
3
4
/*!
 * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */
Guolin Ke's avatar
Guolin Ke committed
5
6
7
#ifndef LIGHTGBM_TREE_H_
#define LIGHTGBM_TREE_H_

8
9
10
#include <LightGBM/dataset.h>
#include <LightGBM/meta.h>

Guolin Ke's avatar
Guolin Ke committed
11
#include <string>
12
#include <map>
13
14
15
#include <memory>
#include <unordered_map>
#include <vector>
Guolin Ke's avatar
Guolin Ke committed
16
17
18

namespace LightGBM {

Guolin Ke's avatar
Guolin Ke committed
19
20
#define kCategoricalMask (1)
#define kDefaultLeftMask (2)
Guolin Ke's avatar
Guolin Ke committed
21
22
23
24
25

/*!
* \brief Tree model
*/
class Tree {
Nikita Titov's avatar
Nikita Titov committed
26
 public:
Guolin Ke's avatar
Guolin Ke committed
27
28
29
  /*!
  * \brief Constructor
  * \param max_leaves The number of max leaves
30
  * \param track_branch_features Whether to keep track of ancestors of leaf nodes
31
  * \param is_linear Whether the tree has linear models at each leaf
Guolin Ke's avatar
Guolin Ke committed
32
  */
33
  explicit Tree(int max_leaves, bool track_branch_features, bool is_linear);
Guolin Ke's avatar
Guolin Ke committed
34
35

  /*!
36
  * \brief Constructor, from a string
Guolin Ke's avatar
Guolin Ke committed
37
  * \param str Model string
38
  * \param used_len used count of str
Guolin Ke's avatar
Guolin Ke committed
39
  */
40
  Tree(const char* str, size_t* used_len);
Guolin Ke's avatar
Guolin Ke committed
41

42
  virtual ~Tree() noexcept = default;
Guolin Ke's avatar
Guolin Ke committed
43
44

  /*!
Qiwei Ye's avatar
Qiwei Ye committed
45
46
47
  * \brief Performing a split on tree leaves.
  * \param leaf Index of leaf to be split
  * \param feature Index of feature; the converted index after removing useless features
Guolin Ke's avatar
Guolin Ke committed
48
  * \param real_feature Index of feature, the original index on data
49
  * \param threshold_bin Threshold(bin) of split
50
  * \param threshold_double Threshold on feature value
Guolin Ke's avatar
Guolin Ke committed
51
52
  * \param left_value Model Left child output
  * \param right_value Model Right child output
Guolin Ke's avatar
Guolin Ke committed
53
54
  * \param left_cnt Count of left child
  * \param right_cnt Count of right child
55
56
  * \param left_weight Weight of left child
  * \param right_weight Weight of right child
Guolin Ke's avatar
Guolin Ke committed
57
  * \param gain Split gain
Guolin Ke's avatar
Guolin Ke committed
58
59
  * \param missing_type missing type
  * \param default_left default direction for missing value
Guolin Ke's avatar
Guolin Ke committed
60
61
  * \return The index of new leaf.
  */
62
63
  int Split(int leaf, int feature, int real_feature, uint32_t threshold_bin,
            double threshold_double, double left_value, double right_value,
64
65
            int left_cnt, int right_cnt, double left_weight, double right_weight,
            float gain, MissingType missing_type, bool default_left);
Guolin Ke's avatar
Guolin Ke committed
66

67
68
69
70
71
72
73
  /*!
  * \brief Performing a split on tree leaves, with categorical feature
  * \param leaf Index of leaf to be split
  * \param feature Index of feature; the converted index after removing useless features
  * \param real_feature Index of feature, the original index on data
  * \param threshold_bin Threshold(bin) of split, use bitset to represent
  * \param num_threshold_bin size of threshold_bin
74
75
  * \param threshold Thresholds of real feature value, use bitset to represent
  * \param num_threshold size of threshold
76
77
78
79
  * \param left_value Model Left child output
  * \param right_value Model Right child output
  * \param left_cnt Count of left child
  * \param right_cnt Count of right child
80
81
  * \param left_weight Weight of left child
  * \param right_weight Weight of right child
82
83
84
  * \param gain Split gain
  * \return The index of new leaf.
  */
85
86
  int SplitCategorical(int leaf, int feature, int real_feature, const uint32_t* threshold_bin, int num_threshold_bin,
                       const uint32_t* threshold, int num_threshold, double left_value, double right_value,
87
                       int left_cnt, int right_cnt, double left_weight, double right_weight, float gain, MissingType missing_type);
88

Guolin Ke's avatar
Guolin Ke committed
89
  /*! \brief Get the output of one leaf */
90
  inline double LeafOutput(int leaf) const { return leaf_value_[leaf]; }
Guolin Ke's avatar
Guolin Ke committed
91

Guolin Ke's avatar
Guolin Ke committed
92
93
  /*! \brief Set the output of one leaf */
  inline void SetLeafOutput(int leaf, double output) {
94
    leaf_value_[leaf] = MaybeRoundToZero(output);
Guolin Ke's avatar
Guolin Ke committed
95
96
  }

Guolin Ke's avatar
Guolin Ke committed
97
  /*!
Qiwei Ye's avatar
Qiwei Ye committed
98
  * \brief Adding prediction value of this tree model to scores
Guolin Ke's avatar
Guolin Ke committed
99
100
101
102
  * \param data The dataset
  * \param num_data Number of total data
  * \param score Will add prediction to score
  */
103
  virtual void AddPredictionToScore(const Dataset* data,
104
105
                            data_size_t num_data,
                            double* score) const;
Guolin Ke's avatar
Guolin Ke committed
106
107

  /*!
108
  * \brief Adding prediction value of this tree model to scores
Guolin Ke's avatar
Guolin Ke committed
109
110
111
112
113
  * \param data The dataset
  * \param used_data_indices Indices of used data
  * \param num_data Number of total data
  * \param score Will add prediction to score
  */
114
  virtual void AddPredictionToScore(const Dataset* data,
Qiwei Ye's avatar
Qiwei Ye committed
115
                            const data_size_t* used_data_indices,
116
                            data_size_t num_data, double* score) const;
Guolin Ke's avatar
Guolin Ke committed
117

118
119
120
121
122
123
124
125
126
127
  /*!
  * \brief Get upper bound leaf value of this tree model
  */
  double GetUpperBoundValue() const;

  /*!
  * \brief Get lower bound leaf value of this tree model
  */
  double GetLowerBoundValue() const;

Guolin Ke's avatar
Guolin Ke committed
128
  /*!
129
  * \brief Prediction on one record
Guolin Ke's avatar
Guolin Ke committed
130
131
132
  * \param feature_values Feature value of this record
  * \return Prediction result
  */
133
  inline double Predict(const double* feature_values) const;
134
  inline double PredictByMap(const std::unordered_map<int, double>& feature_values) const;
135

136
  inline int PredictLeafIndex(const double* feature_values) const;
137
138
  inline int PredictLeafIndexByMap(const std::unordered_map<int, double>& feature_values) const;

139
  inline void PredictContrib(const double* feature_values, int num_features, double* output);
140
141
  inline void PredictContribByMap(const std::unordered_map<int, double>& feature_values,
                                  int num_features, std::unordered_map<int, double>* output);
142

Guolin Ke's avatar
Guolin Ke committed
143
144
145
  /*! \brief Get Number of leaves*/
  inline int num_leaves() const { return num_leaves_; }

Guolin Ke's avatar
Guolin Ke committed
146
147
148
  /*! \brief Get depth of specific leaf*/
  inline int leaf_depth(int leaf_idx) const { return leaf_depth_[leaf_idx]; }

Belinda Trotta's avatar
Belinda Trotta committed
149
150
151
  /*! \brief Get parent of specific leaf*/
  inline int leaf_parent(int leaf_idx) const {return leaf_parent_[leaf_idx]; }

152
  /*! \brief Get feature of specific split (original feature index)*/
Guolin Ke's avatar
Guolin Ke committed
153
  inline int split_feature(int split_idx) const { return split_feature_[split_idx]; }
wxchan's avatar
wxchan committed
154

155
156
157
  /*! \brief Get feature of specific split*/
  inline int split_feature_inner(int split_idx) const { return split_feature_inner_[split_idx]; }

158
159
160
  /*! \brief Get features on leaf's branch*/
  inline std::vector<int> branch_features(int leaf) const { return branch_features_[leaf]; }

Guolin Ke's avatar
Guolin Ke committed
161
162
  inline double split_gain(int split_idx) const { return split_gain_[split_idx]; }

163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
  inline double internal_value(int node_idx) const {
    return internal_value_[node_idx];
  }

  inline bool IsNumericalSplit(int node_idx) const {
    return !GetDecisionType(decision_type_[node_idx], kCategoricalMask);
  }

  inline int left_child(int node_idx) const { return left_child_[node_idx]; }

  inline int right_child(int node_idx) const { return right_child_[node_idx]; }

  inline uint32_t threshold_in_bin(int node_idx) const {
    return threshold_in_bin_[node_idx];
  }

179
  /*! \brief Get the number of data points that fall at or below this node*/
Guolin Ke's avatar
Guolin Ke committed
180
  inline int data_count(int node) const { return node >= 0 ? internal_count_[node] : leaf_count_[~node]; }
181

Guolin Ke's avatar
Guolin Ke committed
182
183
  /*!
  * \brief Shrinkage for the tree's output
184
  *        shrinkage rate (a.k.a learning rate) is used to tune the training process
Guolin Ke's avatar
Guolin Ke committed
185
186
  * \param rate The factor of shrinkage
  */
187
  virtual inline void Shrinkage(double rate) {
188
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 1024) if (num_leaves_ >= 2048)
189
190
191
    for (int i = 0; i < num_leaves_ - 1; ++i) {
      leaf_value_[i] = MaybeRoundToZero(leaf_value_[i] * rate);
      internal_value_[i] = MaybeRoundToZero(internal_value_[i] * rate);
192
193
194
195
196
197
      if (is_linear_) {
        leaf_const_[i] = MaybeRoundToZero(leaf_const_[i] * rate);
        for (size_t j = 0; j < leaf_coeff_[i].size(); ++j) {
          leaf_coeff_[i][j] = MaybeRoundToZero(leaf_coeff_[i][j] * rate);
        }
      }
Guolin Ke's avatar
Guolin Ke committed
198
    }
199
200
    leaf_value_[num_leaves_ - 1] =
        MaybeRoundToZero(leaf_value_[num_leaves_ - 1] * rate);
201
202
203
204
205
206
    if (is_linear_) {
      leaf_const_[num_leaves_ - 1] = MaybeRoundToZero(leaf_const_[num_leaves_ - 1] * rate);
      for (size_t j = 0; j < leaf_coeff_[num_leaves_ - 1].size(); ++j) {
        leaf_coeff_[num_leaves_ - 1][j] = MaybeRoundToZero(leaf_coeff_[num_leaves_ - 1][j] * rate);
      }
    }
Guolin Ke's avatar
Guolin Ke committed
207
    shrinkage_ *= rate;
Guolin Ke's avatar
Guolin Ke committed
208
209
  }

210
  inline double shrinkage() const { return shrinkage_; }
211

212
  virtual inline void AddBias(double val) {
213
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 1024) if (num_leaves_ >= 2048)
214
215
216
    for (int i = 0; i < num_leaves_ - 1; ++i) {
      leaf_value_[i] = MaybeRoundToZero(leaf_value_[i] + val);
      internal_value_[i] = MaybeRoundToZero(internal_value_[i] + val);
Guolin Ke's avatar
Guolin Ke committed
217
    }
218
219
    leaf_value_[num_leaves_ - 1] =
        MaybeRoundToZero(leaf_value_[num_leaves_ - 1] + val);
220
    if (is_linear_) {
221
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 1024) if (num_leaves_ >= 2048)
222
223
224
225
226
      for (int i = 0; i < num_leaves_ - 1; ++i) {
        leaf_const_[i] = MaybeRoundToZero(leaf_const_[i] + val);
      }
      leaf_const_[num_leaves_ - 1] = MaybeRoundToZero(leaf_const_[num_leaves_ - 1] + val);
    }
Guolin Ke's avatar
Guolin Ke committed
227
228
229
230
    // force to 1.0
    shrinkage_ = 1.0f;
  }

231
  virtual inline void AsConstantTree(double val, int count = 0) {
232
233
234
    num_leaves_ = 1;
    shrinkage_ = 1.0f;
    leaf_value_[0] = val;
235
236
237
    if (is_linear_) {
      leaf_const_[0] = val;
    }
238
    leaf_count_[0] = count;
239
240
  }

wxchan's avatar
wxchan committed
241
  /*! \brief Serialize this object to string*/
Guolin Ke's avatar
Guolin Ke committed
242
  std::string ToString() const;
Guolin Ke's avatar
Guolin Ke committed
243

wxchan's avatar
wxchan committed
244
  /*! \brief Serialize this object to json*/
Guolin Ke's avatar
Guolin Ke committed
245
  std::string ToJSON() const;
wxchan's avatar
wxchan committed
246

247
248
249
  /*! \brief Serialize linear model of tree node to json*/
  std::string LinearModelToJSON(int index) const;

250
  /*! \brief Serialize this object to if-else statement*/
Guolin Ke's avatar
Guolin Ke committed
251
  std::string ToIfElse(int index, bool predict_leaf_index) const;
252

Guolin Ke's avatar
Guolin Ke committed
253
  inline static bool IsZero(double fval) {
254
    return (fval >= -kZeroThreshold && fval <= kZeroThreshold);
Guolin Ke's avatar
Guolin Ke committed
255
256
  }

257
  inline static double MaybeRoundToZero(double fval) {
258
    return IsZero(fval) ? 0 : fval;
259
260
  }

Guolin Ke's avatar
Guolin Ke committed
261
262
263
264
265
266
267
  inline static bool GetDecisionType(int8_t decision_type, int8_t mask) {
    return (decision_type & mask) > 0;
  }

  inline static void SetDecisionType(int8_t* decision_type, bool input, int8_t mask) {
    if (input) {
      (*decision_type) |= mask;
Guolin Ke's avatar
Guolin Ke committed
268
    } else {
Guolin Ke's avatar
Guolin Ke committed
269
      (*decision_type) &= (127 - mask);
Guolin Ke's avatar
Guolin Ke committed
270
271
272
    }
  }

Guolin Ke's avatar
Guolin Ke committed
273
274
275
276
277
278
279
280
281
  inline static int8_t GetMissingType(int8_t decision_type) {
    return (decision_type >> 2) & 3;
  }

  inline static void SetMissingType(int8_t* decision_type, int8_t input) {
    (*decision_type) &= 3;
    (*decision_type) |= (input << 2);
  }

282
283
  void RecomputeMaxDepth();

284
  int NextLeafId() const { return num_leaves_; }
285

286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
  /*! \brief Get the linear model constant term (bias) of one leaf */
  inline double LeafConst(int leaf) const { return leaf_const_[leaf]; }

  /*! \brief Get the linear model coefficients of one leaf */
  inline std::vector<double> LeafCoeffs(int leaf) const { return leaf_coeff_[leaf]; }

  /*! \brief Get the linear model features of one leaf */
  inline std::vector<int> LeafFeaturesInner(int leaf) const {return leaf_features_inner_[leaf]; }

  /*! \brief Get the linear model features of one leaf */
  inline std::vector<int> LeafFeatures(int leaf) const {return leaf_features_[leaf]; }

  /*! \brief Set the linear model coefficients on one leaf */
  inline void SetLeafCoeffs(int leaf, const std::vector<double>& output) {
    leaf_coeff_[leaf].resize(output.size());
    for (size_t i = 0; i < output.size(); ++i) {
      leaf_coeff_[leaf][i] = MaybeRoundToZero(output[i]);
    }
  }

  /*! \brief Set the linear model constant term (bias) on one leaf */
  inline void SetLeafConst(int leaf, double output) {
    leaf_const_[leaf] = MaybeRoundToZero(output);
  }

  /*! \brief Set the linear model features on one leaf */
  inline void SetLeafFeaturesInner(int leaf, const std::vector<int>& features) {
    leaf_features_inner_[leaf] = features;
  }

  /*! \brief Set the linear model features on one leaf */
  inline void SetLeafFeatures(int leaf, const std::vector<int>& features) {
    leaf_features_[leaf] = features;
  }

  inline bool is_linear() const { return is_linear_; }

323
  #ifdef USE_CUDA
324
  inline bool is_cuda_tree() const { return is_cuda_tree_; }
325
  #endif  // USE_CUDA
326

327
328
329
330
  inline void SetIsLinear(bool is_linear) {
    is_linear_ = is_linear;
  }

331
 protected:
Guolin Ke's avatar
Guolin Ke committed
332
  std::string NumericalDecisionIfElse(int node) const;
Guolin Ke's avatar
Guolin Ke committed
333

Guolin Ke's avatar
Guolin Ke committed
334
  std::string CategoricalDecisionIfElse(int node) const;
335
336
337

  inline int NumericalDecision(double fval, int node) const {
    uint8_t missing_type = GetMissingType(decision_type_[node]);
338
339
    if (std::isnan(fval) && missing_type != MissingType::NaN) {
      fval = 0.0f;
Guolin Ke's avatar
Guolin Ke committed
340
    }
341
342
    if ((missing_type == MissingType::Zero && IsZero(fval))
        || (missing_type == MissingType::NaN && std::isnan(fval))) {
343
344
      if (GetDecisionType(decision_type_[node], kDefaultLeftMask)) {
        return left_child_[node];
Guolin Ke's avatar
Guolin Ke committed
345
      } else {
346
        return right_child_[node];
Guolin Ke's avatar
Guolin Ke committed
347
348
      }
    }
349
350
351
352
353
    if (fval <= threshold_[node]) {
      return left_child_[node];
    } else {
      return right_child_[node];
    }
Guolin Ke's avatar
Guolin Ke committed
354
  }
Guolin Ke's avatar
Guolin Ke committed
355

356
357
  inline int NumericalDecisionInner(uint32_t fval, int node, uint32_t default_bin, uint32_t max_bin) const {
    uint8_t missing_type = GetMissingType(decision_type_[node]);
358
359
    if ((missing_type == MissingType::Zero && fval == default_bin)
        || (missing_type == MissingType::NaN && fval == max_bin)) {
360
361
362
363
364
365
366
367
      if (GetDecisionType(decision_type_[node], kDefaultLeftMask)) {
        return left_child_[node];
      } else {
        return right_child_[node];
      }
    }
    if (fval <= threshold_in_bin_[node]) {
      return left_child_[node];
368
    } else {
369
      return right_child_[node];
370
371
    }
  }
Guolin Ke's avatar
Guolin Ke committed
372

373
  inline int CategoricalDecision(double fval, int node) const {
374
375
376
377
378
379
    int int_fval;
    if (std::isnan(fval)) {
      return right_child_[node];
    } else {
      int_fval = static_cast<int>(fval);
      if (int_fval < 0) {
380
381
382
        return right_child_[node];
      }
    }
383
    int cat_idx = static_cast<int>(threshold_[node]);
384
385
    if (Common::FindInBitset(cat_threshold_.data() + cat_boundaries_[cat_idx],
                             cat_boundaries_[cat_idx + 1] - cat_boundaries_[cat_idx], int_fval)) {
386
387
388
389
      return left_child_[node];
    }
    return right_child_[node];
  }
Guolin Ke's avatar
Guolin Ke committed
390

391
  inline int CategoricalDecisionInner(uint32_t fval, int node) const {
392
    int cat_idx = static_cast<int>(threshold_in_bin_[node]);
393
394
    if (Common::FindInBitset(cat_threshold_inner_.data() + cat_boundaries_inner_[cat_idx],
                             cat_boundaries_inner_[cat_idx + 1] - cat_boundaries_inner_[cat_idx], fval)) {
395
396
397
398
      return left_child_[node];
    }
    return right_child_[node];
  }
Guolin Ke's avatar
Guolin Ke committed
399

400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
  inline int Decision(double fval, int node) const {
    if (GetDecisionType(decision_type_[node], kCategoricalMask)) {
      return CategoricalDecision(fval, node);
    } else {
      return NumericalDecision(fval, node);
    }
  }

  inline int DecisionInner(uint32_t fval, int node, uint32_t default_bin, uint32_t max_bin) const {
    if (GetDecisionType(decision_type_[node], kCategoricalMask)) {
      return CategoricalDecisionInner(fval, node);
    } else {
      return NumericalDecisionInner(fval, node, default_bin, max_bin);
    }
  }

416
417
  inline void Split(int leaf, int feature, int real_feature, double left_value, double right_value, int left_cnt, int right_cnt,
                    double left_weight, double right_weight, float gain);
Guolin Ke's avatar
Guolin Ke committed
418
  /*!
Qiwei Ye's avatar
Qiwei Ye committed
419
  * \brief Find leaf index of which record belongs by features
Guolin Ke's avatar
Guolin Ke committed
420
421
422
  * \param feature_values Feature value of this record
  * \return Leaf index
  */
423
  inline int GetLeaf(const double* feature_values) const;
424
  inline int GetLeafByMap(const std::unordered_map<int, double>& feature_values) const;
Guolin Ke's avatar
Guolin Ke committed
425

wxchan's avatar
wxchan committed
426
  /*! \brief Serialize one node to json*/
Guolin Ke's avatar
Guolin Ke committed
427
  std::string NodeToJSON(int index) const;
wxchan's avatar
wxchan committed
428

429
  /*! \brief Serialize one node to if-else statement*/
Guolin Ke's avatar
Guolin Ke committed
430
  std::string NodeToIfElse(int index, bool predict_leaf_index) const;
Guolin Ke's avatar
Guolin Ke committed
431

Guolin Ke's avatar
Guolin Ke committed
432
  std::string NodeToIfElseByMap(int index, bool predict_leaf_index) const;
433

434
  double ExpectedValue() const;
Guolin Ke's avatar
Guolin Ke committed
435

436
437
  /*! \brief This is used fill in leaf_depth_ after reloading a model*/
  inline void RecomputeLeafDepths(int node = 0, int depth = 0);
Guolin Ke's avatar
Guolin Ke committed
438
439
440
441
442
443
444
445
446
447

  /*!
  * \brief Used by TreeSHAP for data we keep about our decision path
  */
  struct PathElement {
    int feature_index;
    double zero_fraction;
    double one_fraction;

    // note that pweight is included for convenience and is not tied with the other attributes,
448
    // the pweight of the i'th path element is the permutation weight of paths with i-1 ones in them
Guolin Ke's avatar
Guolin Ke committed
449
450
451
452
453
454
    double pweight;

    PathElement() {}
    PathElement(int i, double z, double o, double w) : feature_index(i), zero_fraction(z), one_fraction(o), pweight(w) {}
  };

455
  /*! \brief Polynomial time algorithm for SHAP values (arXiv:1706.06060)*/
Guolin Ke's avatar
Guolin Ke committed
456
457
458
459
  void TreeSHAP(const double *feature_values, double *phi,
                int node, int unique_depth,
                PathElement *parent_unique_path, double parent_zero_fraction,
                double parent_one_fraction, int parent_feature_index) const;
460

461
462
463
464
465
466
  void TreeSHAPByMap(const std::unordered_map<int, double>& feature_values,
                     std::unordered_map<int, double>* phi,
                     int node, int unique_depth,
                     PathElement *parent_unique_path, double parent_zero_fraction,
                     double parent_one_fraction, int parent_feature_index) const;

467
  /*! \brief Extend our decision path with a fraction of one and zero extensions for TreeSHAP*/
Guolin Ke's avatar
Guolin Ke committed
468
469
  static void ExtendPath(PathElement *unique_path, int unique_depth,
                         double zero_fraction, double one_fraction, int feature_index);
470
471

  /*! \brief Undo a previous extension of the decision path for TreeSHAP*/
Guolin Ke's avatar
Guolin Ke committed
472
  static void UnwindPath(PathElement *unique_path, int unique_depth, int path_index);
473

474
  /*! determine what the total permutation weight would be if we unwound a previous extension in the decision path*/
Guolin Ke's avatar
Guolin Ke committed
475
  static double UnwoundPathSum(const PathElement *unique_path, int unique_depth, int path_index);
476

Guolin Ke's avatar
Guolin Ke committed
477
478
  /*! \brief Number of max leaves*/
  int max_leaves_;
479
  /*! \brief Number of current leaves*/
Guolin Ke's avatar
Guolin Ke committed
480
481
482
  int num_leaves_;
  // following values used for non-leaf node
  /*! \brief A non-leaf node's left child */
Guolin Ke's avatar
Guolin Ke committed
483
  std::vector<int> left_child_;
Guolin Ke's avatar
Guolin Ke committed
484
  /*! \brief A non-leaf node's right child */
Guolin Ke's avatar
Guolin Ke committed
485
  std::vector<int> right_child_;
Guolin Ke's avatar
Guolin Ke committed
486
  /*! \brief A non-leaf node's split feature */
Guolin Ke's avatar
Guolin Ke committed
487
  std::vector<int> split_feature_inner_;
Guolin Ke's avatar
Guolin Ke committed
488
  /*! \brief A non-leaf node's split feature, the original index */
Guolin Ke's avatar
Guolin Ke committed
489
  std::vector<int> split_feature_;
Guolin Ke's avatar
Guolin Ke committed
490
  /*! \brief A non-leaf node's split threshold in bin */
Guolin Ke's avatar
Guolin Ke committed
491
  std::vector<uint32_t> threshold_in_bin_;
Guolin Ke's avatar
Guolin Ke committed
492
  /*! \brief A non-leaf node's split threshold in feature value */
Guolin Ke's avatar
Guolin Ke committed
493
  std::vector<double> threshold_;
494
  int num_cat_;
495
496
497
498
  std::vector<int> cat_boundaries_inner_;
  std::vector<uint32_t> cat_threshold_inner_;
  std::vector<int> cat_boundaries_;
  std::vector<uint32_t> cat_threshold_;
499
  /*! \brief Store the information for categorical feature handle and missing value handle. */
500
  std::vector<int8_t> decision_type_;
Guolin Ke's avatar
Guolin Ke committed
501
  /*! \brief A non-leaf node's split gain */
502
  std::vector<float> split_gain_;
Guolin Ke's avatar
Guolin Ke committed
503
504
  // used for leaf node
  /*! \brief The parent of leaf */
Guolin Ke's avatar
Guolin Ke committed
505
  std::vector<int> leaf_parent_;
Guolin Ke's avatar
Guolin Ke committed
506
  /*! \brief Output of leaves */
Guolin Ke's avatar
Guolin Ke committed
507
  std::vector<double> leaf_value_;
508
509
  /*! \brief weight of leaves */
  std::vector<double> leaf_weight_;
Guolin Ke's avatar
Guolin Ke committed
510
  /*! \brief DataCount of leaves */
511
  std::vector<int> leaf_count_;
Guolin Ke's avatar
Guolin Ke committed
512
513
  /*! \brief Output of non-leaf nodes */
  std::vector<double> internal_value_;
514
515
  /*! \brief weight of non-leaf nodes */
  std::vector<double> internal_weight_;
Guolin Ke's avatar
Guolin Ke committed
516
  /*! \brief DataCount of non-leaf nodes */
517
  std::vector<int> internal_count_;
Guolin Ke's avatar
Guolin Ke committed
518
  /*! \brief Depth for leaves */
Guolin Ke's avatar
Guolin Ke committed
519
  std::vector<int> leaf_depth_;
520
521
522
523
  /*! \brief whether to keep track of ancestor nodes for each leaf (only needed when feature interactions are restricted) */
  bool track_branch_features_;
  /*! \brief Features on leaf's branch, original index */
  std::vector<std::vector<int>> branch_features_;
Guolin Ke's avatar
Guolin Ke committed
524
  double shrinkage_;
525
  int max_depth_;
526
527
528
529
530
531
532
533
534
535
  /*! \brief Tree has linear model at each leaf */
  bool is_linear_;
  /*! \brief coefficients of linear models on leaves */
  std::vector<std::vector<double>> leaf_coeff_;
  /*! \brief constant term (bias) of linear models on leaves */
  std::vector<double> leaf_const_;
  /* \brief features used in leaf linear models; indexing is relative to num_total_features_ */
  std::vector<std::vector<int>> leaf_features_;
  /* \brief features used in leaf linear models; indexing is relative to used_features_ */
  std::vector<std::vector<int>> leaf_features_inner_;
536
  #ifdef USE_CUDA
537
538
  /*! \brief Marks whether this tree is a CUDATree */
  bool is_cuda_tree_;
539
  #endif  // USE_CUDA
Guolin Ke's avatar
Guolin Ke committed
540
541
};

542
inline void Tree::Split(int leaf, int feature, int real_feature,
543
                        double left_value, double right_value, int left_cnt, int right_cnt,
544
                        double left_weight, double right_weight, float gain) {
545
546
547
548
549
550
551
552
553
554
555
556
557
558
  int new_node_idx = num_leaves_ - 1;
  // update parent info
  int parent = leaf_parent_[leaf];
  if (parent >= 0) {
    // if cur node is left child
    if (left_child_[parent] == ~leaf) {
      left_child_[parent] = new_node_idx;
    } else {
      right_child_[parent] = new_node_idx;
    }
  }
  // add new node
  split_feature_inner_[new_node_idx] = feature;
  split_feature_[new_node_idx] = real_feature;
Guolin Ke's avatar
Guolin Ke committed
559
  split_gain_[new_node_idx] = gain;
560
561
562
563
564
565
566
  // add two new leaves
  left_child_[new_node_idx] = ~leaf;
  right_child_[new_node_idx] = ~num_leaves_;
  // update new leaves
  leaf_parent_[leaf] = new_node_idx;
  leaf_parent_[num_leaves_] = new_node_idx;
  // save current leaf value to internal node before change
567
  internal_weight_[new_node_idx] = left_weight + right_weight;
568
569
570
  internal_value_[new_node_idx] = leaf_value_[leaf];
  internal_count_[new_node_idx] = left_cnt + right_cnt;
  leaf_value_[leaf] = std::isnan(left_value) ? 0.0f : left_value;
571
  leaf_weight_[leaf] = left_weight;
572
573
  leaf_count_[leaf] = left_cnt;
  leaf_value_[num_leaves_] = std::isnan(right_value) ? 0.0f : right_value;
574
  leaf_weight_[num_leaves_] = right_weight;
575
576
577
578
  leaf_count_[num_leaves_] = right_cnt;
  // update leaf depth
  leaf_depth_[num_leaves_] = leaf_depth_[leaf] + 1;
  leaf_depth_[leaf]++;
579
580
581
582
583
  if (track_branch_features_) {
    branch_features_[num_leaves_] = branch_features_[leaf];
    branch_features_[num_leaves_].push_back(split_feature_[new_node_idx]);
    branch_features_[leaf].push_back(split_feature_[new_node_idx]);
  }
584
585
}

586
inline double Tree::Predict(const double* feature_values) const {
587
  if (is_linear_) {
588
      int leaf = (num_leaves_ > 1) ? GetLeaf(feature_values) : 0;
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
      double output = leaf_const_[leaf];
      bool nan_found = false;
      for (size_t i = 0; i < leaf_features_[leaf].size(); ++i) {
        int feat_raw = leaf_features_[leaf][i];
        double feat_val = feature_values[feat_raw];
        if (std::isnan(feat_val)) {
          nan_found = true;
          break;
        } else {
          output += leaf_coeff_[leaf][i] * feat_val;
        }
      }
      if (nan_found) {
        return LeafOutput(leaf);
      } else {
        return output;
      }
Guolin Ke's avatar
Guolin Ke committed
606
  } else {
607
608
609
610
611
612
    if (num_leaves_ > 1) {
      int leaf = GetLeaf(feature_values);
      return LeafOutput(leaf);
    } else {
      return leaf_value_[0];
    }
Guolin Ke's avatar
Guolin Ke committed
613
  }
Guolin Ke's avatar
Guolin Ke committed
614
615
}

616
inline double Tree::PredictByMap(const std::unordered_map<int, double>& feature_values) const {
617
  if (is_linear_) {
618
    int leaf = (num_leaves_ > 1) ? GetLeafByMap(feature_values) : 0;
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
    double output = leaf_const_[leaf];
    bool nan_found = false;
    for (size_t i = 0; i < leaf_features_[leaf].size(); ++i) {
      int feat = leaf_features_[leaf][i];
      auto val_it = feature_values.find(feat);
      if (val_it != feature_values.end()) {
        double feat_val = val_it->second;
        if (std::isnan(feat_val)) {
          nan_found = true;
          break;
        } else {
          output += leaf_coeff_[leaf][i] * feat_val;
        }
      }
    }
    if (nan_found) {
      return LeafOutput(leaf);
    } else {
      return output;
    }
639
  } else {
640
641
642
643
644
645
    if (num_leaves_ > 1) {
      int leaf = GetLeafByMap(feature_values);
      return LeafOutput(leaf);
    } else {
      return leaf_value_[0];
    }
646
647
648
  }
}

649
inline int Tree::PredictLeafIndex(const double* feature_values) const {
Guolin Ke's avatar
Guolin Ke committed
650
651
652
653
654
655
  if (num_leaves_ > 1) {
    int leaf = GetLeaf(feature_values);
    return leaf;
  } else {
    return 0;
  }
wxchan's avatar
wxchan committed
656
657
}

658
659
660
661
662
663
664
665
666
inline int Tree::PredictLeafIndexByMap(const std::unordered_map<int, double>& feature_values) const {
  if (num_leaves_ > 1) {
    int leaf = GetLeafByMap(feature_values);
    return leaf;
  } else {
    return 0;
  }
}

667
668
inline void Tree::PredictContrib(const double* feature_values, int num_features, double* output) {
  output[num_features] += ExpectedValue();
669
  // Run the recursion with preallocated space for the unique path data
670
  if (num_leaves_ > 1) {
671
    CHECK_GE(max_depth_, 0);
672
673
674
    const int max_path_len = max_depth_ + 1;
    std::vector<PathElement> unique_path_data(max_path_len*(max_path_len + 1) / 2);
    TreeSHAP(feature_values, output, 0, 0, unique_path_data.data(), 1, 1, -1);
675
676
677
  }
}

678
679
680
681
682
683
684
685
686
687
688
689
inline void Tree::PredictContribByMap(const std::unordered_map<int, double>& feature_values,
                                      int num_features, std::unordered_map<int, double>* output) {
  (*output)[num_features] += ExpectedValue();
  // Run the recursion with preallocated space for the unique path data
  if (num_leaves_ > 1) {
    CHECK_GE(max_depth_, 0);
    const int max_path_len = max_depth_ + 1;
    std::vector<PathElement> unique_path_data(max_path_len*(max_path_len + 1) / 2);
    TreeSHAPByMap(feature_values, output, 0, 0, unique_path_data.data(), 1, 1, -1);
  }
}

690
691
692
693
694
inline void Tree::RecomputeLeafDepths(int node, int depth) {
  if (node == 0) leaf_depth_.resize(num_leaves());
  if (node < 0) {
    leaf_depth_[~node] = depth;
  } else {
Guolin Ke's avatar
Guolin Ke committed
695
696
    RecomputeLeafDepths(left_child_[node], depth + 1);
    RecomputeLeafDepths(right_child_[node], depth + 1);
697
  }
698
699
}

700
inline int Tree::GetLeaf(const double* feature_values) const {
Guolin Ke's avatar
Guolin Ke committed
701
  int node = 0;
702
  if (num_cat_ > 0) {
Guolin Ke's avatar
Guolin Ke committed
703
    while (node >= 0) {
704
      node = Decision(feature_values[split_feature_[node]], node);
Guolin Ke's avatar
Guolin Ke committed
705
706
707
    }
  } else {
    while (node >= 0) {
708
      node = NumericalDecision(feature_values[split_feature_[node]], node);
Guolin Ke's avatar
Guolin Ke committed
709
710
711
712
713
    }
  }
  return ~node;
}

714
715
716
717
718
719
720
721
722
723
724
725
726
727
inline int Tree::GetLeafByMap(const std::unordered_map<int, double>& feature_values) const {
  int node = 0;
  if (num_cat_ > 0) {
    while (node >= 0) {
      node = Decision(feature_values.count(split_feature_[node]) > 0 ? feature_values.at(split_feature_[node]) : 0.0f, node);
    }
  } else {
    while (node >= 0) {
      node = NumericalDecision(feature_values.count(split_feature_[node]) > 0 ? feature_values.at(split_feature_[node]) : 0.0f, node);
    }
  }
  return ~node;
}

Guolin Ke's avatar
Guolin Ke committed
728
729
}  // namespace LightGBM

Guolin Ke's avatar
Guolin Ke committed
730
#endif   // LightGBM_TREE_H_