tree.h 26.1 KB
Newer Older
1
2
3
4
/*!
 * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */
5
6
#ifndef LIGHTGBM_INCLUDE_LIGHTGBM_TREE_H_
#define LIGHTGBM_INCLUDE_LIGHTGBM_TREE_H_
Guolin Ke's avatar
Guolin Ke committed
7

8
9
10
#include <LightGBM/dataset.h>
#include <LightGBM/meta.h>

11
#include <cstdint>
Guolin Ke's avatar
Guolin Ke committed
12
#include <string>
13
#include <map>
14
15
16
#include <memory>
#include <unordered_map>
#include <vector>
Guolin Ke's avatar
Guolin Ke committed
17
18
19

namespace LightGBM {

Guolin Ke's avatar
Guolin Ke committed
20
21
#define kCategoricalMask (1)
#define kDefaultLeftMask (2)
Guolin Ke's avatar
Guolin Ke committed
22
23
24
25
26

/*!
* \brief Tree model
*/
class Tree {
Nikita Titov's avatar
Nikita Titov committed
27
 public:
Guolin Ke's avatar
Guolin Ke committed
28
29
30
  /*!
  * \brief Constructor
  * \param max_leaves The number of max leaves
31
  * \param track_branch_features Whether to keep track of ancestors of leaf nodes
32
  * \param is_linear Whether the tree has linear models at each leaf
Guolin Ke's avatar
Guolin Ke committed
33
  */
34
  explicit Tree(int max_leaves, bool track_branch_features, bool is_linear);
Guolin Ke's avatar
Guolin Ke committed
35
36

  /*!
37
  * \brief Constructor, from a string
Guolin Ke's avatar
Guolin Ke committed
38
  * \param str Model string
39
  * \param used_len used count of str
Guolin Ke's avatar
Guolin Ke committed
40
  */
41
  Tree(const char* str, size_t* used_len);
Guolin Ke's avatar
Guolin Ke committed
42

43
  virtual ~Tree() noexcept = default;
Guolin Ke's avatar
Guolin Ke committed
44
45

  /*!
Qiwei Ye's avatar
Qiwei Ye committed
46
47
48
  * \brief Performing a split on tree leaves.
  * \param leaf Index of leaf to be split
  * \param feature Index of feature; the converted index after removing useless features
Guolin Ke's avatar
Guolin Ke committed
49
  * \param real_feature Index of feature, the original index on data
50
  * \param threshold_bin Threshold(bin) of split
51
  * \param threshold_double Threshold on feature value
Guolin Ke's avatar
Guolin Ke committed
52
53
  * \param left_value Model Left child output
  * \param right_value Model Right child output
Guolin Ke's avatar
Guolin Ke committed
54
55
  * \param left_cnt Count of left child
  * \param right_cnt Count of right child
56
57
  * \param left_weight Weight of left child
  * \param right_weight Weight of right child
Guolin Ke's avatar
Guolin Ke committed
58
  * \param gain Split gain
Guolin Ke's avatar
Guolin Ke committed
59
60
  * \param missing_type missing type
  * \param default_left default direction for missing value
Guolin Ke's avatar
Guolin Ke committed
61
62
  * \return The index of new leaf.
  */
63
64
  int Split(int leaf, int feature, int real_feature, uint32_t threshold_bin,
            double threshold_double, double left_value, double right_value,
65
66
            int left_cnt, int right_cnt, double left_weight, double right_weight,
            float gain, MissingType missing_type, bool default_left);
Guolin Ke's avatar
Guolin Ke committed
67

68
69
70
71
72
73
74
  /*!
  * \brief Performing a split on tree leaves, with categorical feature
  * \param leaf Index of leaf to be split
  * \param feature Index of feature; the converted index after removing useless features
  * \param real_feature Index of feature, the original index on data
  * \param threshold_bin Threshold(bin) of split, use bitset to represent
  * \param num_threshold_bin size of threshold_bin
75
76
  * \param threshold Thresholds of real feature value, use bitset to represent
  * \param num_threshold size of threshold
77
78
79
80
  * \param left_value Model Left child output
  * \param right_value Model Right child output
  * \param left_cnt Count of left child
  * \param right_cnt Count of right child
81
82
  * \param left_weight Weight of left child
  * \param right_weight Weight of right child
83
84
85
  * \param gain Split gain
  * \return The index of new leaf.
  */
86
87
  int SplitCategorical(int leaf, int feature, int real_feature, const uint32_t* threshold_bin, int num_threshold_bin,
                       const uint32_t* threshold, int num_threshold, double left_value, double right_value,
88
                       int left_cnt, int right_cnt, double left_weight, double right_weight, float gain, MissingType missing_type);
89

Guolin Ke's avatar
Guolin Ke committed
90
  /*! \brief Get the output of one leaf */
91
  inline double LeafOutput(int leaf) const { return leaf_value_[leaf]; }
Guolin Ke's avatar
Guolin Ke committed
92

Guolin Ke's avatar
Guolin Ke committed
93
94
  /*! \brief Set the output of one leaf */
  inline void SetLeafOutput(int leaf, double output) {
95
    leaf_value_[leaf] = MaybeRoundToZero(output);
Guolin Ke's avatar
Guolin Ke committed
96
97
  }

Guolin Ke's avatar
Guolin Ke committed
98
  /*!
Qiwei Ye's avatar
Qiwei Ye committed
99
  * \brief Adding prediction value of this tree model to scores
Guolin Ke's avatar
Guolin Ke committed
100
101
102
103
  * \param data The dataset
  * \param num_data Number of total data
  * \param score Will add prediction to score
  */
104
  virtual void AddPredictionToScore(const Dataset* data,
105
106
                            data_size_t num_data,
                            double* score) const;
Guolin Ke's avatar
Guolin Ke committed
107
108

  /*!
109
  * \brief Adding prediction value of this tree model to scores
Guolin Ke's avatar
Guolin Ke committed
110
111
112
113
114
  * \param data The dataset
  * \param used_data_indices Indices of used data
  * \param num_data Number of total data
  * \param score Will add prediction to score
  */
115
  virtual void AddPredictionToScore(const Dataset* data,
Qiwei Ye's avatar
Qiwei Ye committed
116
                            const data_size_t* used_data_indices,
117
                            data_size_t num_data, double* score) const;
Guolin Ke's avatar
Guolin Ke committed
118

119
120
121
122
123
124
125
126
127
128
  /*!
  * \brief Get upper bound leaf value of this tree model
  */
  double GetUpperBoundValue() const;

  /*!
  * \brief Get lower bound leaf value of this tree model
  */
  double GetLowerBoundValue() const;

Guolin Ke's avatar
Guolin Ke committed
129
  /*!
130
  * \brief Prediction on one record
Guolin Ke's avatar
Guolin Ke committed
131
132
133
  * \param feature_values Feature value of this record
  * \return Prediction result
  */
134
  inline double Predict(const double* feature_values) const;
135
  inline double PredictByMap(const std::unordered_map<int, double>& feature_values) const;
136

137
  inline int PredictLeafIndex(const double* feature_values) const;
138
139
  inline int PredictLeafIndexByMap(const std::unordered_map<int, double>& feature_values) const;

140
  inline void PredictContrib(const double* feature_values, int num_features, double* output);
141
142
  inline void PredictContribByMap(const std::unordered_map<int, double>& feature_values,
                                  int num_features, std::unordered_map<int, double>* output);
143

Guolin Ke's avatar
Guolin Ke committed
144
145
146
  /*! \brief Get Number of leaves*/
  inline int num_leaves() const { return num_leaves_; }

Guolin Ke's avatar
Guolin Ke committed
147
148
149
  /*! \brief Get depth of specific leaf*/
  inline int leaf_depth(int leaf_idx) const { return leaf_depth_[leaf_idx]; }

Belinda Trotta's avatar
Belinda Trotta committed
150
151
152
  /*! \brief Get parent of specific leaf*/
  inline int leaf_parent(int leaf_idx) const {return leaf_parent_[leaf_idx]; }

153
  /*! \brief Get feature of specific split (original feature index)*/
Guolin Ke's avatar
Guolin Ke committed
154
  inline int split_feature(int split_idx) const { return split_feature_[split_idx]; }
wxchan's avatar
wxchan committed
155

156
157
158
  /*! \brief Get feature of specific split*/
  inline int split_feature_inner(int split_idx) const { return split_feature_inner_[split_idx]; }

159
160
161
  /*! \brief Get features on leaf's branch*/
  inline std::vector<int> branch_features(int leaf) const { return branch_features_[leaf]; }

Guolin Ke's avatar
Guolin Ke committed
162
163
  inline double split_gain(int split_idx) const { return split_gain_[split_idx]; }

164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
  inline double internal_value(int node_idx) const {
    return internal_value_[node_idx];
  }

  inline bool IsNumericalSplit(int node_idx) const {
    return !GetDecisionType(decision_type_[node_idx], kCategoricalMask);
  }

  inline int left_child(int node_idx) const { return left_child_[node_idx]; }

  inline int right_child(int node_idx) const { return right_child_[node_idx]; }

  inline uint32_t threshold_in_bin(int node_idx) const {
    return threshold_in_bin_[node_idx];
  }

180
  /*! \brief Get the number of data points that fall at or below this node*/
Guolin Ke's avatar
Guolin Ke committed
181
  inline int data_count(int node) const { return node >= 0 ? internal_count_[node] : leaf_count_[~node]; }
182

Guolin Ke's avatar
Guolin Ke committed
183
184
  /*!
  * \brief Shrinkage for the tree's output
185
  *        shrinkage rate (a.k.a learning rate) is used to tune the training process
Guolin Ke's avatar
Guolin Ke committed
186
187
  * \param rate The factor of shrinkage
  */
188
  virtual inline void Shrinkage(double rate) {
189
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 1024) if (num_leaves_ >= 2048)
190
191
192
    for (int i = 0; i < num_leaves_ - 1; ++i) {
      leaf_value_[i] = MaybeRoundToZero(leaf_value_[i] * rate);
      internal_value_[i] = MaybeRoundToZero(internal_value_[i] * rate);
193
194
195
196
197
198
      if (is_linear_) {
        leaf_const_[i] = MaybeRoundToZero(leaf_const_[i] * rate);
        for (size_t j = 0; j < leaf_coeff_[i].size(); ++j) {
          leaf_coeff_[i][j] = MaybeRoundToZero(leaf_coeff_[i][j] * rate);
        }
      }
Guolin Ke's avatar
Guolin Ke committed
199
    }
200
201
    leaf_value_[num_leaves_ - 1] =
        MaybeRoundToZero(leaf_value_[num_leaves_ - 1] * rate);
202
203
204
205
206
207
    if (is_linear_) {
      leaf_const_[num_leaves_ - 1] = MaybeRoundToZero(leaf_const_[num_leaves_ - 1] * rate);
      for (size_t j = 0; j < leaf_coeff_[num_leaves_ - 1].size(); ++j) {
        leaf_coeff_[num_leaves_ - 1][j] = MaybeRoundToZero(leaf_coeff_[num_leaves_ - 1][j] * rate);
      }
    }
Guolin Ke's avatar
Guolin Ke committed
208
    shrinkage_ *= rate;
Guolin Ke's avatar
Guolin Ke committed
209
210
  }

211
  inline double shrinkage() const { return shrinkage_; }
212

213
  virtual inline void AddBias(double val) {
214
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 1024) if (num_leaves_ >= 2048)
215
216
217
    for (int i = 0; i < num_leaves_ - 1; ++i) {
      leaf_value_[i] = MaybeRoundToZero(leaf_value_[i] + val);
      internal_value_[i] = MaybeRoundToZero(internal_value_[i] + val);
Guolin Ke's avatar
Guolin Ke committed
218
    }
219
220
    leaf_value_[num_leaves_ - 1] =
        MaybeRoundToZero(leaf_value_[num_leaves_ - 1] + val);
221
    if (is_linear_) {
222
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 1024) if (num_leaves_ >= 2048)
223
224
225
226
227
      for (int i = 0; i < num_leaves_ - 1; ++i) {
        leaf_const_[i] = MaybeRoundToZero(leaf_const_[i] + val);
      }
      leaf_const_[num_leaves_ - 1] = MaybeRoundToZero(leaf_const_[num_leaves_ - 1] + val);
    }
Guolin Ke's avatar
Guolin Ke committed
228
229
230
231
    // force to 1.0
    shrinkage_ = 1.0f;
  }

232
  virtual inline void AsConstantTree(double val, int count = 0) {
233
234
235
    num_leaves_ = 1;
    shrinkage_ = 1.0f;
    leaf_value_[0] = val;
236
237
238
    if (is_linear_) {
      leaf_const_[0] = val;
    }
239
    leaf_count_[0] = count;
240
241
  }

wxchan's avatar
wxchan committed
242
  /*! \brief Serialize this object to string*/
Guolin Ke's avatar
Guolin Ke committed
243
  std::string ToString() const;
Guolin Ke's avatar
Guolin Ke committed
244

wxchan's avatar
wxchan committed
245
  /*! \brief Serialize this object to json*/
Guolin Ke's avatar
Guolin Ke committed
246
  std::string ToJSON() const;
wxchan's avatar
wxchan committed
247

248
249
250
  /*! \brief Serialize linear model of tree node to json*/
  std::string LinearModelToJSON(int index) const;

251
  /*! \brief Serialize this object to if-else statement*/
Guolin Ke's avatar
Guolin Ke committed
252
  std::string ToIfElse(int index, bool predict_leaf_index) const;
253

Guolin Ke's avatar
Guolin Ke committed
254
  inline static bool IsZero(double fval) {
255
    return (fval >= -kZeroThreshold && fval <= kZeroThreshold);
Guolin Ke's avatar
Guolin Ke committed
256
257
  }

258
  inline static double MaybeRoundToZero(double fval) {
259
    return IsZero(fval) ? 0 : fval;
260
261
  }

Guolin Ke's avatar
Guolin Ke committed
262
263
264
265
266
267
268
  inline static bool GetDecisionType(int8_t decision_type, int8_t mask) {
    return (decision_type & mask) > 0;
  }

  inline static void SetDecisionType(int8_t* decision_type, bool input, int8_t mask) {
    if (input) {
      (*decision_type) |= mask;
Guolin Ke's avatar
Guolin Ke committed
269
    } else {
Guolin Ke's avatar
Guolin Ke committed
270
      (*decision_type) &= (127 - mask);
Guolin Ke's avatar
Guolin Ke committed
271
272
273
    }
  }

Guolin Ke's avatar
Guolin Ke committed
274
275
276
277
278
279
280
281
282
  inline static int8_t GetMissingType(int8_t decision_type) {
    return (decision_type >> 2) & 3;
  }

  inline static void SetMissingType(int8_t* decision_type, int8_t input) {
    (*decision_type) &= 3;
    (*decision_type) |= (input << 2);
  }

283
284
  void RecomputeMaxDepth();

285
  int NextLeafId() const { return num_leaves_; }
286

287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
  /*! \brief Get the linear model constant term (bias) of one leaf */
  inline double LeafConst(int leaf) const { return leaf_const_[leaf]; }

  /*! \brief Get the linear model coefficients of one leaf */
  inline std::vector<double> LeafCoeffs(int leaf) const { return leaf_coeff_[leaf]; }

  /*! \brief Get the linear model features of one leaf */
  inline std::vector<int> LeafFeaturesInner(int leaf) const {return leaf_features_inner_[leaf]; }

  /*! \brief Get the linear model features of one leaf */
  inline std::vector<int> LeafFeatures(int leaf) const {return leaf_features_[leaf]; }

  /*! \brief Set the linear model coefficients on one leaf */
  inline void SetLeafCoeffs(int leaf, const std::vector<double>& output) {
    leaf_coeff_[leaf].resize(output.size());
    for (size_t i = 0; i < output.size(); ++i) {
      leaf_coeff_[leaf][i] = MaybeRoundToZero(output[i]);
    }
  }

  /*! \brief Set the linear model constant term (bias) on one leaf */
  inline void SetLeafConst(int leaf, double output) {
    leaf_const_[leaf] = MaybeRoundToZero(output);
  }

  /*! \brief Set the linear model features on one leaf */
  inline void SetLeafFeaturesInner(int leaf, const std::vector<int>& features) {
    leaf_features_inner_[leaf] = features;
  }

  /*! \brief Set the linear model features on one leaf */
  inline void SetLeafFeatures(int leaf, const std::vector<int>& features) {
    leaf_features_[leaf] = features;
  }

  inline bool is_linear() const { return is_linear_; }

324
  #ifdef USE_CUDA
325
  inline bool is_cuda_tree() const { return is_cuda_tree_; }
326
  #endif  // USE_CUDA
327

328
329
330
331
  inline void SetIsLinear(bool is_linear) {
    is_linear_ = is_linear;
  }

332
 protected:
Guolin Ke's avatar
Guolin Ke committed
333
  std::string NumericalDecisionIfElse(int node) const;
Guolin Ke's avatar
Guolin Ke committed
334

Guolin Ke's avatar
Guolin Ke committed
335
  std::string CategoricalDecisionIfElse(int node) const;
336
337
338

  inline int NumericalDecision(double fval, int node) const {
    uint8_t missing_type = GetMissingType(decision_type_[node]);
339
340
    if (std::isnan(fval) && missing_type != MissingType::NaN) {
      fval = 0.0f;
Guolin Ke's avatar
Guolin Ke committed
341
    }
342
343
    if ((missing_type == MissingType::Zero && IsZero(fval))
        || (missing_type == MissingType::NaN && std::isnan(fval))) {
344
345
      if (GetDecisionType(decision_type_[node], kDefaultLeftMask)) {
        return left_child_[node];
Guolin Ke's avatar
Guolin Ke committed
346
      } else {
347
        return right_child_[node];
Guolin Ke's avatar
Guolin Ke committed
348
349
      }
    }
350
351
352
353
354
    if (fval <= threshold_[node]) {
      return left_child_[node];
    } else {
      return right_child_[node];
    }
Guolin Ke's avatar
Guolin Ke committed
355
  }
Guolin Ke's avatar
Guolin Ke committed
356

357
358
  inline int NumericalDecisionInner(uint32_t fval, int node, uint32_t default_bin, uint32_t max_bin) const {
    uint8_t missing_type = GetMissingType(decision_type_[node]);
359
360
    if ((missing_type == MissingType::Zero && fval == default_bin)
        || (missing_type == MissingType::NaN && fval == max_bin)) {
361
362
363
364
365
366
367
368
      if (GetDecisionType(decision_type_[node], kDefaultLeftMask)) {
        return left_child_[node];
      } else {
        return right_child_[node];
      }
    }
    if (fval <= threshold_in_bin_[node]) {
      return left_child_[node];
369
    } else {
370
      return right_child_[node];
371
372
    }
  }
Guolin Ke's avatar
Guolin Ke committed
373

374
  inline int CategoricalDecision(double fval, int node) const {
375
376
377
378
379
380
    int int_fval;
    if (std::isnan(fval)) {
      return right_child_[node];
    } else {
      int_fval = static_cast<int>(fval);
      if (int_fval < 0) {
381
382
383
        return right_child_[node];
      }
    }
384
    int cat_idx = static_cast<int>(threshold_[node]);
385
386
    if (Common::FindInBitset(cat_threshold_.data() + cat_boundaries_[cat_idx],
                             cat_boundaries_[cat_idx + 1] - cat_boundaries_[cat_idx], int_fval)) {
387
388
389
390
      return left_child_[node];
    }
    return right_child_[node];
  }
Guolin Ke's avatar
Guolin Ke committed
391

392
  inline int CategoricalDecisionInner(uint32_t fval, int node) const {
393
    int cat_idx = static_cast<int>(threshold_in_bin_[node]);
394
395
    if (Common::FindInBitset(cat_threshold_inner_.data() + cat_boundaries_inner_[cat_idx],
                             cat_boundaries_inner_[cat_idx + 1] - cat_boundaries_inner_[cat_idx], fval)) {
396
397
398
399
      return left_child_[node];
    }
    return right_child_[node];
  }
Guolin Ke's avatar
Guolin Ke committed
400

401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
  inline int Decision(double fval, int node) const {
    if (GetDecisionType(decision_type_[node], kCategoricalMask)) {
      return CategoricalDecision(fval, node);
    } else {
      return NumericalDecision(fval, node);
    }
  }

  inline int DecisionInner(uint32_t fval, int node, uint32_t default_bin, uint32_t max_bin) const {
    if (GetDecisionType(decision_type_[node], kCategoricalMask)) {
      return CategoricalDecisionInner(fval, node);
    } else {
      return NumericalDecisionInner(fval, node, default_bin, max_bin);
    }
  }

417
418
  inline void Split(int leaf, int feature, int real_feature, double left_value, double right_value, int left_cnt, int right_cnt,
                    double left_weight, double right_weight, float gain);
Guolin Ke's avatar
Guolin Ke committed
419
  /*!
Qiwei Ye's avatar
Qiwei Ye committed
420
  * \brief Find leaf index of which record belongs by features
Guolin Ke's avatar
Guolin Ke committed
421
422
423
  * \param feature_values Feature value of this record
  * \return Leaf index
  */
424
  inline int GetLeaf(const double* feature_values) const;
425
  inline int GetLeafByMap(const std::unordered_map<int, double>& feature_values) const;
Guolin Ke's avatar
Guolin Ke committed
426

wxchan's avatar
wxchan committed
427
  /*! \brief Serialize one node to json*/
Guolin Ke's avatar
Guolin Ke committed
428
  std::string NodeToJSON(int index) const;
wxchan's avatar
wxchan committed
429

430
  /*! \brief Serialize one node to if-else statement*/
Guolin Ke's avatar
Guolin Ke committed
431
  std::string NodeToIfElse(int index, bool predict_leaf_index) const;
Guolin Ke's avatar
Guolin Ke committed
432

Guolin Ke's avatar
Guolin Ke committed
433
  std::string NodeToIfElseByMap(int index, bool predict_leaf_index) const;
434

435
  double ExpectedValue() const;
Guolin Ke's avatar
Guolin Ke committed
436

437
438
  /*! \brief This is used fill in leaf_depth_ after reloading a model*/
  inline void RecomputeLeafDepths(int node = 0, int depth = 0);
Guolin Ke's avatar
Guolin Ke committed
439
440
441
442
443
444
445
446
447
448

  /*!
  * \brief Used by TreeSHAP for data we keep about our decision path
  */
  struct PathElement {
    int feature_index;
    double zero_fraction;
    double one_fraction;

    // note that pweight is included for convenience and is not tied with the other attributes,
449
    // the pweight of the i'th path element is the permutation weight of paths with i-1 ones in them
Guolin Ke's avatar
Guolin Ke committed
450
451
452
453
454
455
    double pweight;

    PathElement() {}
    PathElement(int i, double z, double o, double w) : feature_index(i), zero_fraction(z), one_fraction(o), pweight(w) {}
  };

456
  /*! \brief Polynomial time algorithm for SHAP values (arXiv:1706.06060)*/
Guolin Ke's avatar
Guolin Ke committed
457
458
459
460
  void TreeSHAP(const double *feature_values, double *phi,
                int node, int unique_depth,
                PathElement *parent_unique_path, double parent_zero_fraction,
                double parent_one_fraction, int parent_feature_index) const;
461

462
463
464
465
466
467
  void TreeSHAPByMap(const std::unordered_map<int, double>& feature_values,
                     std::unordered_map<int, double>* phi,
                     int node, int unique_depth,
                     PathElement *parent_unique_path, double parent_zero_fraction,
                     double parent_one_fraction, int parent_feature_index) const;

468
  /*! \brief Extend our decision path with a fraction of one and zero extensions for TreeSHAP*/
Guolin Ke's avatar
Guolin Ke committed
469
470
  static void ExtendPath(PathElement *unique_path, int unique_depth,
                         double zero_fraction, double one_fraction, int feature_index);
471
472

  /*! \brief Undo a previous extension of the decision path for TreeSHAP*/
Guolin Ke's avatar
Guolin Ke committed
473
  static void UnwindPath(PathElement *unique_path, int unique_depth, int path_index);
474

475
  /*! determine what the total permutation weight would be if we unwound a previous extension in the decision path*/
Guolin Ke's avatar
Guolin Ke committed
476
  static double UnwoundPathSum(const PathElement *unique_path, int unique_depth, int path_index);
477

Guolin Ke's avatar
Guolin Ke committed
478
479
  /*! \brief Number of max leaves*/
  int max_leaves_;
480
  /*! \brief Number of current leaves*/
Guolin Ke's avatar
Guolin Ke committed
481
482
483
  int num_leaves_;
  // following values used for non-leaf node
  /*! \brief A non-leaf node's left child */
Guolin Ke's avatar
Guolin Ke committed
484
  std::vector<int> left_child_;
Guolin Ke's avatar
Guolin Ke committed
485
  /*! \brief A non-leaf node's right child */
Guolin Ke's avatar
Guolin Ke committed
486
  std::vector<int> right_child_;
Guolin Ke's avatar
Guolin Ke committed
487
  /*! \brief A non-leaf node's split feature */
Guolin Ke's avatar
Guolin Ke committed
488
  std::vector<int> split_feature_inner_;
Guolin Ke's avatar
Guolin Ke committed
489
  /*! \brief A non-leaf node's split feature, the original index */
Guolin Ke's avatar
Guolin Ke committed
490
  std::vector<int> split_feature_;
Guolin Ke's avatar
Guolin Ke committed
491
  /*! \brief A non-leaf node's split threshold in bin */
Guolin Ke's avatar
Guolin Ke committed
492
  std::vector<uint32_t> threshold_in_bin_;
Guolin Ke's avatar
Guolin Ke committed
493
  /*! \brief A non-leaf node's split threshold in feature value */
Guolin Ke's avatar
Guolin Ke committed
494
  std::vector<double> threshold_;
495
  int num_cat_;
496
497
498
499
  std::vector<int> cat_boundaries_inner_;
  std::vector<uint32_t> cat_threshold_inner_;
  std::vector<int> cat_boundaries_;
  std::vector<uint32_t> cat_threshold_;
500
  /*! \brief Store the information for categorical feature handle and missing value handle. */
501
  std::vector<int8_t> decision_type_;
Guolin Ke's avatar
Guolin Ke committed
502
  /*! \brief A non-leaf node's split gain */
503
  std::vector<float> split_gain_;
Guolin Ke's avatar
Guolin Ke committed
504
505
  // used for leaf node
  /*! \brief The parent of leaf */
Guolin Ke's avatar
Guolin Ke committed
506
  std::vector<int> leaf_parent_;
Guolin Ke's avatar
Guolin Ke committed
507
  /*! \brief Output of leaves */
Guolin Ke's avatar
Guolin Ke committed
508
  std::vector<double> leaf_value_;
509
510
  /*! \brief weight of leaves */
  std::vector<double> leaf_weight_;
Guolin Ke's avatar
Guolin Ke committed
511
  /*! \brief DataCount of leaves */
512
  std::vector<int> leaf_count_;
Guolin Ke's avatar
Guolin Ke committed
513
514
  /*! \brief Output of non-leaf nodes */
  std::vector<double> internal_value_;
515
516
  /*! \brief weight of non-leaf nodes */
  std::vector<double> internal_weight_;
Guolin Ke's avatar
Guolin Ke committed
517
  /*! \brief DataCount of non-leaf nodes */
518
  std::vector<int> internal_count_;
Guolin Ke's avatar
Guolin Ke committed
519
  /*! \brief Depth for leaves */
Guolin Ke's avatar
Guolin Ke committed
520
  std::vector<int> leaf_depth_;
521
522
523
524
  /*! \brief whether to keep track of ancestor nodes for each leaf (only needed when feature interactions are restricted) */
  bool track_branch_features_;
  /*! \brief Features on leaf's branch, original index */
  std::vector<std::vector<int>> branch_features_;
Guolin Ke's avatar
Guolin Ke committed
525
  double shrinkage_;
526
  int max_depth_;
527
528
529
530
531
532
533
534
535
536
  /*! \brief Tree has linear model at each leaf */
  bool is_linear_;
  /*! \brief coefficients of linear models on leaves */
  std::vector<std::vector<double>> leaf_coeff_;
  /*! \brief constant term (bias) of linear models on leaves */
  std::vector<double> leaf_const_;
  /* \brief features used in leaf linear models; indexing is relative to num_total_features_ */
  std::vector<std::vector<int>> leaf_features_;
  /* \brief features used in leaf linear models; indexing is relative to used_features_ */
  std::vector<std::vector<int>> leaf_features_inner_;
537
  #ifdef USE_CUDA
538
539
  /*! \brief Marks whether this tree is a CUDATree */
  bool is_cuda_tree_;
540
  #endif  // USE_CUDA
Guolin Ke's avatar
Guolin Ke committed
541
542
};

543
inline void Tree::Split(int leaf, int feature, int real_feature,
544
                        double left_value, double right_value, int left_cnt, int right_cnt,
545
                        double left_weight, double right_weight, float gain) {
546
547
548
549
550
551
552
553
554
555
556
557
558
559
  int new_node_idx = num_leaves_ - 1;
  // update parent info
  int parent = leaf_parent_[leaf];
  if (parent >= 0) {
    // if cur node is left child
    if (left_child_[parent] == ~leaf) {
      left_child_[parent] = new_node_idx;
    } else {
      right_child_[parent] = new_node_idx;
    }
  }
  // add new node
  split_feature_inner_[new_node_idx] = feature;
  split_feature_[new_node_idx] = real_feature;
Guolin Ke's avatar
Guolin Ke committed
560
  split_gain_[new_node_idx] = gain;
561
562
563
564
565
566
567
  // add two new leaves
  left_child_[new_node_idx] = ~leaf;
  right_child_[new_node_idx] = ~num_leaves_;
  // update new leaves
  leaf_parent_[leaf] = new_node_idx;
  leaf_parent_[num_leaves_] = new_node_idx;
  // save current leaf value to internal node before change
568
  internal_weight_[new_node_idx] = left_weight + right_weight;
569
570
571
  internal_value_[new_node_idx] = leaf_value_[leaf];
  internal_count_[new_node_idx] = left_cnt + right_cnt;
  leaf_value_[leaf] = std::isnan(left_value) ? 0.0f : left_value;
572
  leaf_weight_[leaf] = left_weight;
573
574
  leaf_count_[leaf] = left_cnt;
  leaf_value_[num_leaves_] = std::isnan(right_value) ? 0.0f : right_value;
575
  leaf_weight_[num_leaves_] = right_weight;
576
577
578
579
  leaf_count_[num_leaves_] = right_cnt;
  // update leaf depth
  leaf_depth_[num_leaves_] = leaf_depth_[leaf] + 1;
  leaf_depth_[leaf]++;
580
581
582
583
584
  if (track_branch_features_) {
    branch_features_[num_leaves_] = branch_features_[leaf];
    branch_features_[num_leaves_].push_back(split_feature_[new_node_idx]);
    branch_features_[leaf].push_back(split_feature_[new_node_idx]);
  }
585
586
}

587
inline double Tree::Predict(const double* feature_values) const {
588
  if (is_linear_) {
589
      int leaf = (num_leaves_ > 1) ? GetLeaf(feature_values) : 0;
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
      double output = leaf_const_[leaf];
      bool nan_found = false;
      for (size_t i = 0; i < leaf_features_[leaf].size(); ++i) {
        int feat_raw = leaf_features_[leaf][i];
        double feat_val = feature_values[feat_raw];
        if (std::isnan(feat_val)) {
          nan_found = true;
          break;
        } else {
          output += leaf_coeff_[leaf][i] * feat_val;
        }
      }
      if (nan_found) {
        return LeafOutput(leaf);
      } else {
        return output;
      }
Guolin Ke's avatar
Guolin Ke committed
607
  } else {
608
609
610
611
612
613
    if (num_leaves_ > 1) {
      int leaf = GetLeaf(feature_values);
      return LeafOutput(leaf);
    } else {
      return leaf_value_[0];
    }
Guolin Ke's avatar
Guolin Ke committed
614
  }
Guolin Ke's avatar
Guolin Ke committed
615
616
}

617
inline double Tree::PredictByMap(const std::unordered_map<int, double>& feature_values) const {
618
  if (is_linear_) {
619
    int leaf = (num_leaves_ > 1) ? GetLeafByMap(feature_values) : 0;
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
    double output = leaf_const_[leaf];
    bool nan_found = false;
    for (size_t i = 0; i < leaf_features_[leaf].size(); ++i) {
      int feat = leaf_features_[leaf][i];
      auto val_it = feature_values.find(feat);
      if (val_it != feature_values.end()) {
        double feat_val = val_it->second;
        if (std::isnan(feat_val)) {
          nan_found = true;
          break;
        } else {
          output += leaf_coeff_[leaf][i] * feat_val;
        }
      }
    }
    if (nan_found) {
      return LeafOutput(leaf);
    } else {
      return output;
    }
640
  } else {
641
642
643
644
645
646
    if (num_leaves_ > 1) {
      int leaf = GetLeafByMap(feature_values);
      return LeafOutput(leaf);
    } else {
      return leaf_value_[0];
    }
647
648
649
  }
}

650
inline int Tree::PredictLeafIndex(const double* feature_values) const {
Guolin Ke's avatar
Guolin Ke committed
651
652
653
654
655
656
  if (num_leaves_ > 1) {
    int leaf = GetLeaf(feature_values);
    return leaf;
  } else {
    return 0;
  }
wxchan's avatar
wxchan committed
657
658
}

659
660
661
662
663
664
665
666
667
inline int Tree::PredictLeafIndexByMap(const std::unordered_map<int, double>& feature_values) const {
  if (num_leaves_ > 1) {
    int leaf = GetLeafByMap(feature_values);
    return leaf;
  } else {
    return 0;
  }
}

668
669
inline void Tree::PredictContrib(const double* feature_values, int num_features, double* output) {
  output[num_features] += ExpectedValue();
670
  // Run the recursion with preallocated space for the unique path data
671
  if (num_leaves_ > 1) {
672
    CHECK_GE(max_depth_, 0);
673
674
675
    const int max_path_len = max_depth_ + 1;
    std::vector<PathElement> unique_path_data(max_path_len*(max_path_len + 1) / 2);
    TreeSHAP(feature_values, output, 0, 0, unique_path_data.data(), 1, 1, -1);
676
677
678
  }
}

679
680
681
682
683
684
685
686
687
688
689
690
inline void Tree::PredictContribByMap(const std::unordered_map<int, double>& feature_values,
                                      int num_features, std::unordered_map<int, double>* output) {
  (*output)[num_features] += ExpectedValue();
  // Run the recursion with preallocated space for the unique path data
  if (num_leaves_ > 1) {
    CHECK_GE(max_depth_, 0);
    const int max_path_len = max_depth_ + 1;
    std::vector<PathElement> unique_path_data(max_path_len*(max_path_len + 1) / 2);
    TreeSHAPByMap(feature_values, output, 0, 0, unique_path_data.data(), 1, 1, -1);
  }
}

691
692
693
694
695
inline void Tree::RecomputeLeafDepths(int node, int depth) {
  if (node == 0) leaf_depth_.resize(num_leaves());
  if (node < 0) {
    leaf_depth_[~node] = depth;
  } else {
Guolin Ke's avatar
Guolin Ke committed
696
697
    RecomputeLeafDepths(left_child_[node], depth + 1);
    RecomputeLeafDepths(right_child_[node], depth + 1);
698
  }
699
700
}

701
inline int Tree::GetLeaf(const double* feature_values) const {
Guolin Ke's avatar
Guolin Ke committed
702
  int node = 0;
703
  if (num_cat_ > 0) {
Guolin Ke's avatar
Guolin Ke committed
704
    while (node >= 0) {
705
      node = Decision(feature_values[split_feature_[node]], node);
Guolin Ke's avatar
Guolin Ke committed
706
707
708
    }
  } else {
    while (node >= 0) {
709
      node = NumericalDecision(feature_values[split_feature_[node]], node);
Guolin Ke's avatar
Guolin Ke committed
710
711
712
713
714
    }
  }
  return ~node;
}

715
716
717
718
719
720
721
722
723
724
725
726
727
728
inline int Tree::GetLeafByMap(const std::unordered_map<int, double>& feature_values) const {
  int node = 0;
  if (num_cat_ > 0) {
    while (node >= 0) {
      node = Decision(feature_values.count(split_feature_[node]) > 0 ? feature_values.at(split_feature_[node]) : 0.0f, node);
    }
  } else {
    while (node >= 0) {
      node = NumericalDecision(feature_values.count(split_feature_[node]) > 0 ? feature_values.at(split_feature_[node]) : 0.0f, node);
    }
  }
  return ~node;
}

Guolin Ke's avatar
Guolin Ke committed
729
730
}  // namespace LightGBM

731
#endif   // LIGHTGBM_INCLUDE_LIGHTGBM_TREE_H_