tree.h 9.5 KB
Newer Older
Guolin Ke's avatar
Guolin Ke committed
1
2
3
4
5
6
7
8
#ifndef LIGHTGBM_TREE_H_
#define LIGHTGBM_TREE_H_

#include <LightGBM/meta.h>
#include <LightGBM/dataset.h>

#include <string>
#include <vector>
Guolin Ke's avatar
Guolin Ke committed
9
#include <memory>
Guolin Ke's avatar
Guolin Ke committed
10
11
12

namespace LightGBM {

13
/*! \brief Bound on a leaf output's magnitude: Tree::Shrinkage clamps every leaf value into [-kMaxTreeOutput, kMaxTreeOutput] */
#define kMaxTreeOutput (100)
Guolin Ke's avatar
Guolin Ke committed
14
15
/*! \brief Bit 0 of a node's decision_type; Tree::GetLeaf uses it as the index into decision_funs (presumably set = categorical split; confirm against the table's definition) */
#define kCategoricalMask (1)
/*! \brief Bit 1 of a node's decision_type; when set, Tree::ConvertMissingValue maps a missing value onto the threshold so that it takes the left branch */
#define kDefaultLeftMask (2)
Guolin Ke's avatar
Guolin Ke committed
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36

/*!
* \brief Tree model
*/
class Tree {
public:
  /*!
  * \brief Constructor
  * \param max_leaves The number of max leaves
  */
  explicit Tree(int max_leaves);

  /*!
  * \brief Constructor, from a string
  * \param str Model string
  */
  explicit Tree(const std::string& str);

  ~Tree();

  /*!
Qiwei Ye's avatar
Qiwei Ye committed
37
38
39
  * \brief Performing a split on tree leaves.
  * \param leaf Index of leaf to be split
  * \param feature Index of feature; the converted index after removing useless features
40
  * \param bin_type type of this feature, numerical or categorical
Guolin Ke's avatar
Guolin Ke committed
41
42
  * \param threshold Threshold(bin) of split
  * \param real_feature Index of feature, the original index on data
43
  * \param threshold_double Threshold on feature value
Guolin Ke's avatar
Guolin Ke committed
44
45
  * \param left_value Model Left child output
  * \param right_value Model Right child output
Guolin Ke's avatar
Guolin Ke committed
46
47
  * \param left_cnt Count of left child
  * \param right_cnt Count of right child
Guolin Ke's avatar
Guolin Ke committed
48
  * \param gain Split gain
Guolin Ke's avatar
Guolin Ke committed
49
50
  * \param missing_type missing type
  * \param default_left default direction for missing value
Guolin Ke's avatar
Guolin Ke committed
51
52
  * \return The index of new leaf.
  */
Guolin Ke's avatar
Guolin Ke committed
53
54
  int Split(int leaf, int feature, BinType bin_type, uint32_t threshold, int real_feature, 
            double threshold_double, double left_value, double right_value, 
Guolin Ke's avatar
Guolin Ke committed
55
            data_size_t left_cnt, data_size_t right_cnt, double gain, MissingType missing_type, bool default_left);
Guolin Ke's avatar
Guolin Ke committed
56

Guolin Ke's avatar
Guolin Ke committed
57
  /*! \brief Get the output of one leaf */
58
  inline double LeafOutput(int leaf) const {
    // Unchecked lookup: `leaf` must be a valid index into leaf_value_.
    return leaf_value_[leaf];
  }
Guolin Ke's avatar
Guolin Ke committed
59

Guolin Ke's avatar
Guolin Ke committed
60
61
62
63
64
  /*!
  * \brief Overwrite the output of one leaf
  * \param leaf Index of the leaf (not range-checked)
  * \param output New output value stored for that leaf
  */
  inline void SetLeafOutput(int leaf, double output) { leaf_value_[leaf] = output; }

Guolin Ke's avatar
Guolin Ke committed
65
  /*!
Qiwei Ye's avatar
Qiwei Ye committed
66
  * \brief Adding prediction value of this tree model to scores
Guolin Ke's avatar
Guolin Ke committed
67
68
69
70
  * \param data The dataset
  * \param num_data Number of total data
  * \param score Will add prediction to score
  */
71
72
73
  void AddPredictionToScore(const Dataset* data,
                            data_size_t num_data,
                            double* score) const;
Guolin Ke's avatar
Guolin Ke committed
74
75

  /*!
Qiwei Ye's avatar
Qiwei Ye committed
76
  * \brief Adding prediction value of this tree model to scores
Guolin Ke's avatar
Guolin Ke committed
77
78
79
80
81
82
  * \param data The dataset
  * \param used_data_indices Indices of used data
  * \param num_data Number of total data
  * \param score Will add prediction to score
  */
  void AddPredictionToScore(const Dataset* data,
Qiwei Ye's avatar
Qiwei Ye committed
83
                            const data_size_t* used_data_indices,
84
                            data_size_t num_data, double* score) const;
Guolin Ke's avatar
Guolin Ke committed
85
86

  /*!
87
  * \brief Prediction on one record
Guolin Ke's avatar
Guolin Ke committed
88
89
90
  * \param feature_values Feature value of this record
  * \return Prediction result
  */
91
92
  inline double Predict(const double* feature_values) const;
  inline int PredictLeafIndex(const double* feature_values) const;
Guolin Ke's avatar
Guolin Ke committed
93
94
95
96

  /*! \brief Get the current number of leaves of this tree */
  inline int num_leaves() const {
    return num_leaves_;
  }

Guolin Ke's avatar
Guolin Ke committed
97
98
99
  /*!
  * \brief Get the depth of one leaf
  * \param leaf_idx Index of the leaf (not range-checked)
  * \return The value stored in leaf_depth_ for that leaf
  */
  inline int leaf_depth(int leaf_idx) const {
    return leaf_depth_[leaf_idx];
  }

wxchan's avatar
wxchan committed
100
  /*! \brief Get feature of specific split*/
Guolin Ke's avatar
Guolin Ke committed
101
  inline int split_feature(int split_idx) const {
    // Reads split_feature_ (the original feature index), not split_feature_inner_.
    return split_feature_[split_idx];
  }
wxchan's avatar
wxchan committed
102

Guolin Ke's avatar
Guolin Ke committed
103
104
  /*! \brief Get gain of specific split*/
  inline double split_gain(int split_idx) const {
    return split_gain_[split_idx];
  }

Guolin Ke's avatar
Guolin Ke committed
105
106
  /*!
  * \brief Shrinkage for the tree's output
Qiwei Ye's avatar
Qiwei Ye committed
107
  *        shrinkage rate (a.k.a learning rate) is used to tune the traning process
Guolin Ke's avatar
Guolin Ke committed
108
109
  * \param rate The factor of shrinkage
  */
110
  inline void Shrinkage(double rate) {
Guolin Ke's avatar
Guolin Ke committed
111
    #pragma omp parallel for schedule(static, 512) if (num_leaves_ >= 1024)
Guolin Ke's avatar
Guolin Ke committed
112
    for (int i = 0; i < num_leaves_; ++i) {
Guolin Ke's avatar
Guolin Ke committed
113
      leaf_value_[i] *= rate;
114
115
      if (leaf_value_[i] > kMaxTreeOutput) { leaf_value_[i] = kMaxTreeOutput; } 
      else if (leaf_value_[i] < -kMaxTreeOutput) { leaf_value_[i] = -kMaxTreeOutput; }
Guolin Ke's avatar
Guolin Ke committed
116
    }
Guolin Ke's avatar
Guolin Ke committed
117
    shrinkage_ *= rate;
Guolin Ke's avatar
Guolin Ke committed
118
119
  }

wxchan's avatar
wxchan committed
120
  /*! \brief Serialize this object to string*/
Guolin Ke's avatar
Guolin Ke committed
121
122
  std::string ToString();

wxchan's avatar
wxchan committed
123
124
125
  /*! \brief Serialize this object to json*/
  std::string ToJSON();

126
127
128
  /*! \brief Serialize this object to if-else statement*/
  std::string ToIfElse(int index, bool is_predict_leaf_index);

129
  /*!
  * \brief Equality test used for categorical splits
  * \param fval Feature value of the record
  * \param threshold Category stored in the split node
  * \return true when both arguments are equal after conversion to int
  */
  template<typename T>
  inline static bool CategoricalDecision(T fval, T threshold) {
    // Both sides are converted to int first, so a fractional part is discarded before comparing.
    return static_cast<int>(fval) == static_cast<int>(threshold);
  }

Guolin Ke's avatar
Guolin Ke committed
138
  /*!
  * \brief Ordering test used for numerical splits
  * \param fval Feature value of the record
  * \param threshold Threshold stored in the split node
  * \return true when fval <= threshold (the threshold itself counts as true)
  */
  template<typename T>
  inline static bool NumericalDecision(T fval, T threshold) {
    return fval <= threshold;
  }

Guolin Ke's avatar
Guolin Ke committed
147
148
149
  /*!
  * \brief Check whether a feature value is treated as zero
  * \param fval Feature value
  * \return true when fval lies in (-kZeroAsMissingValueRange, kZeroAsMissingValueRange]
  */
  inline static bool IsZero(double fval) {
    // The lower bound is exclusive and the upper bound inclusive.
    // kZeroAsMissingValueRange is declared outside this header (presumably LightGBM/meta.h).
    return fval > -kZeroAsMissingValueRange && fval <= kZeroAsMissingValueRange;
  }

Guolin Ke's avatar
Guolin Ke committed
155
156
157
158
159
160
161
  /*!
  * \brief Test one flag of a packed decision_type byte
  * \param decision_type Packed flags of a split node
  * \param mask Bit(s) to test, e.g. kCategoricalMask or kDefaultLeftMask
  * \return true when the masked value is positive
  */
  inline static bool GetDecisionType(int8_t decision_type, int8_t mask) {
    const int selected = decision_type & mask;
    return selected > 0;
  }

  /*!
  * \brief Set or clear one flag of a packed decision_type byte
  * \param decision_type Packed flags to modify in place
  * \param input true to set the bit(s) in mask, false to clear them
  * \param mask Bit(s) to change, e.g. kCategoricalMask or kDefaultLeftMask
  */
  inline static void SetDecisionType(int8_t* decision_type, bool input, int8_t mask) {
    if (input) {
      (*decision_type) |= mask;
    } else {
      // For a mask in [0, 127], 127 - mask is the same as 0x7F & ~mask:
      // it clears the mask bits (and bit 7) and keeps every other bit.
      (*decision_type) &= (127 - mask);
    }
  }

Guolin Ke's avatar
Guolin Ke committed
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
  /*!
  * \brief Extract the missing type from a packed decision_type byte
  * \param decision_type Packed flags of a split node
  * \return Bits 2-3 of decision_type, as a value in [0, 3]
  */
  inline static int8_t GetMissingType(int8_t decision_type) {
    const int shifted = decision_type >> 2;
    return static_cast<int8_t>(shifted & 3);
  }

  /*!
  * \brief Store a missing type in a packed decision_type byte
  * \param decision_type Packed flags to modify in place; only its two lowest bits are kept
  * \param input Missing type, written starting at bit 2
  */
  inline static void SetMissingType(int8_t* decision_type, int8_t input) {
    const int kept_flags = (*decision_type) & 3;
    *decision_type = static_cast<int8_t>(kept_flags | (input << 2));
  }

  /*!
  * \brief Map a missing bin onto a bin that takes the node's default direction
  * \param fval Bin of the record
  * \param threshold Threshold bin of the split
  * \param decision_type Packed flags of the split node
  * \param default_bin Bin treated as missing when the missing type is 1
  * \param max_bin Bin treated as missing when the missing type is 2
  * \return threshold (default left) or threshold + 1 (default right) for a missing bin, otherwise fval unchanged
  */
  inline static uint32_t ConvertMissingValue(uint32_t fval, uint32_t threshold, int8_t decision_type, uint32_t default_bin, uint32_t max_bin) {
    const uint8_t missing_type = GetMissingType(decision_type);
    const bool hits_default_bin = (missing_type == 1) && (fval == default_bin);
    const bool hits_max_bin = (missing_type == 2) && (fval == max_bin);
    if (!hits_default_bin && !hits_max_bin) {
      return fval;
    }
    return GetDecisionType(decision_type, kDefaultLeftMask) ? threshold : threshold + 1;
  }

  /*!
  * \brief Map a missing feature value onto a value that takes the node's default direction
  *        A NaN is first replaced by 0 unless the missing type is 2. The value then counts as
  *        missing when (type 1 and IsZero(fval)) or (type 2 and fval is NaN).
  * \param fval Feature value of the record
  * \param threshold Threshold of the split
  * \param decision_type Packed flags of the split node
  * \return threshold (default left), a sentinel strictly greater than threshold (default right),
  *         or the (possibly NaN-replaced) fval when the value is not missing
  */
  inline static double ConvertMissingValue(double fval, double threshold, int8_t decision_type) {
    const uint8_t missing_type = GetMissingType(decision_type);
    if (std::isnan(fval) && missing_type != 2) {
      fval = 0.0f;
    }
    if ((missing_type == 1 && IsZero(fval))
        || (missing_type == 2 && std::isnan(fval))) {
      if (GetDecisionType(decision_type, kDefaultLeftMask)) {
        fval = threshold;
      } else {
        // The previous sentinel, 10 * threshold, is <= threshold whenever threshold <= 0,
        // which sent "default right" records to the left child. 2 * |threshold| + 1 is
        // strictly greater than any finite threshold, and it also differs from the
        // threshold after truncation to int, so CategoricalDecision rejects it too.
        const double magnitude = threshold < 0.0 ? -threshold : threshold;
        fval = 2.0 * magnitude + 1.0;
      }
    }
    return fval;
  }
Guolin Ke's avatar
Guolin Ke committed
206

Guolin Ke's avatar
Guolin Ke committed
207
  /*!
  * \brief Get the textual name of a decision type
  * \param type Decision type value
  * \return "no_greater" when type is 0, "is" for every other value
  */
  inline static const char* GetDecisionTypeName(int8_t type) {
    return type == 0 ? "no_greater" : "is";
  }
Guolin Ke's avatar
Guolin Ke committed
214

215
216
  static std::vector<bool(*)(uint32_t, uint32_t)> inner_decision_funs;
  static std::vector<bool(*)(double, double)> decision_funs;
Guolin Ke's avatar
Guolin Ke committed
217

218
private:
Guolin Ke's avatar
Guolin Ke committed
219
220

  /*!
Qiwei Ye's avatar
Qiwei Ye committed
221
  * \brief Find the index of the leaf that a record falls into, based on its feature values
Guolin Ke's avatar
Guolin Ke committed
222
223
224
  * \param feature_values Feature value of this record
  * \return Leaf index
  */
225
  inline int GetLeaf(const double* feature_values) const;
Guolin Ke's avatar
Guolin Ke committed
226

wxchan's avatar
wxchan committed
227
228
229
  /*! \brief Serialize one node to json*/
  inline std::string NodeToJSON(int index);

230
231
232
  /*! \brief Serialize one node to if-else statement*/
  inline std::string NodeToIfElse(int index, bool is_predict_leaf_index);

Guolin Ke's avatar
Guolin Ke committed
233
234
235
236
237
238
  /*! \brief Number of max leaves*/
  int max_leaves_;
  /*! \brief Number of current leaves*/
  int num_leaves_;
  // following values used for non-leaf node
  /*! \brief A non-leaf node's left child */
Guolin Ke's avatar
Guolin Ke committed
239
  std::vector<int> left_child_;
Guolin Ke's avatar
Guolin Ke committed
240
  /*! \brief A non-leaf node's right child */
Guolin Ke's avatar
Guolin Ke committed
241
  std::vector<int> right_child_;
Guolin Ke's avatar
Guolin Ke committed
242
  /*! \brief A non-leaf node's split feature */
Guolin Ke's avatar
Guolin Ke committed
243
  std::vector<int> split_feature_inner_;
Guolin Ke's avatar
Guolin Ke committed
244
  /*! \brief A non-leaf node's split feature, the original index */
Guolin Ke's avatar
Guolin Ke committed
245
  std::vector<int> split_feature_;
Guolin Ke's avatar
Guolin Ke committed
246
  /*! \brief A non-leaf node's split threshold in bin */
Guolin Ke's avatar
Guolin Ke committed
247
  std::vector<uint32_t> threshold_in_bin_;
Guolin Ke's avatar
Guolin Ke committed
248
  /*! \brief A non-leaf node's split threshold in feature value */
Guolin Ke's avatar
Guolin Ke committed
249
  std::vector<double> threshold_;
Guolin Ke's avatar
Guolin Ke committed
250
  /*! \brief Store the information for categorical feature handling and missing value handling. */
251
  std::vector<int8_t> decision_type_;
Guolin Ke's avatar
Guolin Ke committed
252
  /*! \brief A non-leaf node's split gain */
Guolin Ke's avatar
Guolin Ke committed
253
  std::vector<double> split_gain_;
Guolin Ke's avatar
Guolin Ke committed
254
255
  // used for leaf node
  /*! \brief The parent of leaf */
Guolin Ke's avatar
Guolin Ke committed
256
  std::vector<int> leaf_parent_;
Guolin Ke's avatar
Guolin Ke committed
257
  /*! \brief Output of leaves */
Guolin Ke's avatar
Guolin Ke committed
258
  std::vector<double> leaf_value_;
Guolin Ke's avatar
Guolin Ke committed
259
260
261
262
263
264
  /*! \brief DataCount of leaves */
  std::vector<data_size_t> leaf_count_;
  /*! \brief Output of non-leaf nodes */
  std::vector<double> internal_value_;
  /*! \brief DataCount of non-leaf nodes */
  std::vector<data_size_t> internal_count_;
Guolin Ke's avatar
Guolin Ke committed
265
  /*! \brief Depth for leaves */
Guolin Ke's avatar
Guolin Ke committed
266
  std::vector<int> leaf_depth_;
Guolin Ke's avatar
Guolin Ke committed
267
  double shrinkage_;
268
  bool has_categorical_;
Guolin Ke's avatar
Guolin Ke committed
269
270
};

271
inline double Tree::Predict(const double* feature_values) const {
Guolin Ke's avatar
Guolin Ke committed
272
273
274
275
276
277
  if (num_leaves_ > 1) {
    int leaf = GetLeaf(feature_values);
    return LeafOutput(leaf);
  } else {
    return 0.0f;
  }
Guolin Ke's avatar
Guolin Ke committed
278
279
}

280
inline int Tree::PredictLeafIndex(const double* feature_values) const {
Guolin Ke's avatar
Guolin Ke committed
281
282
283
284
285
286
  if (num_leaves_ > 1) {
    int leaf = GetLeaf(feature_values);
    return leaf;
  } else {
    return 0;
  }
wxchan's avatar
wxchan committed
287
288
}

289
inline int Tree::GetLeaf(const double* feature_values) const {
Guolin Ke's avatar
Guolin Ke committed
290
  int node = 0;
Guolin Ke's avatar
Guolin Ke committed
291
292
  if (has_categorical_) {
    while (node >= 0) {
Guolin Ke's avatar
Guolin Ke committed
293
294
      double fval = ConvertMissingValue(feature_values[split_feature_[node]], threshold_[node], decision_type_[node]);
      if (decision_funs[GetDecisionType(decision_type_[node], kCategoricalMask)](
Guolin Ke's avatar
Guolin Ke committed
295
        fval,
Guolin Ke's avatar
Guolin Ke committed
296
297
298
299
300
301
302
303
        threshold_[node])) {
        node = left_child_[node];
      } else {
        node = right_child_[node];
      }
    }
  } else {
    while (node >= 0) {
Guolin Ke's avatar
Guolin Ke committed
304
      double fval = ConvertMissingValue(feature_values[split_feature_[node]], threshold_[node], decision_type_[node]);
Guolin Ke's avatar
Guolin Ke committed
305
      if (NumericalDecision<double>(
Guolin Ke's avatar
Guolin Ke committed
306
        fval,
Guolin Ke's avatar
Guolin Ke committed
307
308
309
310
311
        threshold_[node])) {
        node = left_child_[node];
      } else {
        node = right_child_[node];
      }
Guolin Ke's avatar
Guolin Ke committed
312
313
314
315
316
317
318
    }
  }
  return ~node;
}

}  // namespace LightGBM

Guolin Ke's avatar
Guolin Ke committed
319
#endif   // LIGHTGBM_TREE_H_