tree.cpp 9.6 KB
Newer Older
Guolin Ke's avatar
Guolin Ke committed
1
2
3
4
5
6
7
8
9
10
11
12
13
#include <LightGBM/tree.h>

#include <LightGBM/utils/threading.h>
#include <LightGBM/utils/common.h>

#include <LightGBM/dataset.h>
#include <LightGBM/feature.h>

#include <sstream>
#include <unordered_map>
#include <functional>
#include <vector>
#include <string>
Guolin Ke's avatar
Guolin Ke committed
14
#include <memory>
15
#include <iomanip>
Guolin Ke's avatar
Guolin Ke committed
16
17
18

namespace LightGBM {

Guolin Ke's avatar
Guolin Ke committed
19
20
21
22
23
24
std::vector<std::function<bool(unsigned int, unsigned int)>> Tree::inner_decision_funs = 
          {Tree::NumericalDecision<unsigned int>, Tree::CategoricalDecision<unsigned int> };
std::vector<std::function<bool(double, double)>> Tree::decision_funs = 
          { Tree::NumericalDecision<double>, Tree::CategoricalDecision<double> };


Guolin Ke's avatar
Guolin Ke committed
25
26
27
Tree::Tree(int max_leaves)
  :max_leaves_(max_leaves) {

Guolin Ke's avatar
Guolin Ke committed
28
29
30
31
32
33
34
  num_leaves_ = 0;
  left_child_ = std::vector<int>(max_leaves_ - 1);
  right_child_ = std::vector<int>(max_leaves_ - 1);
  split_feature_ = std::vector<int>(max_leaves_ - 1);
  split_feature_real_ = std::vector<int>(max_leaves_ - 1);
  threshold_in_bin_ = std::vector<unsigned int>(max_leaves_ - 1);
  threshold_ = std::vector<double>(max_leaves_ - 1);
Guolin Ke's avatar
Guolin Ke committed
35
  decision_type_ = std::vector<int8_t>(max_leaves_ - 1);
Guolin Ke's avatar
Guolin Ke committed
36
37
38
  split_gain_ = std::vector<double>(max_leaves_ - 1);
  leaf_parent_ = std::vector<int>(max_leaves_);
  leaf_value_ = std::vector<double>(max_leaves_);
Guolin Ke's avatar
Guolin Ke committed
39
  leaf_count_ = std::vector<data_size_t>(max_leaves_);
Guolin Ke's avatar
Guolin Ke committed
40
  internal_value_ = std::vector<double>(max_leaves_ - 1);
Guolin Ke's avatar
Guolin Ke committed
41
  internal_count_ = std::vector<data_size_t>(max_leaves_ - 1);
Guolin Ke's avatar
Guolin Ke committed
42
  leaf_depth_ = std::vector<int>(max_leaves_);
Guolin Ke's avatar
Guolin Ke committed
43
44
  // root is in the depth 0
  leaf_depth_[0] = 0;
Guolin Ke's avatar
Guolin Ke committed
45
46
47
48
  num_leaves_ = 1;
  leaf_parent_[0] = -1;
}
Tree::~Tree() {
Guolin Ke's avatar
Guolin Ke committed
49

Guolin Ke's avatar
Guolin Ke committed
50
51
}

Guolin Ke's avatar
Guolin Ke committed
52
53
54
int Tree::Split(int leaf, int feature, BinType bin_type, unsigned int threshold_bin, int real_feature,
    double threshold_double, double left_value,
    double right_value, data_size_t left_cnt, data_size_t right_cnt, double gain) {
Guolin Ke's avatar
Guolin Ke committed
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
  int new_node_idx = num_leaves_ - 1;
  // update parent info
  int parent = leaf_parent_[leaf];
  if (parent >= 0) {
    // if cur node is left child
    if (left_child_[parent] == ~leaf) {
      left_child_[parent] = new_node_idx;
    } else {
      right_child_[parent] = new_node_idx;
    }
  }
  // add new node
  split_feature_[new_node_idx] = feature;
  split_feature_real_[new_node_idx] = real_feature;
  threshold_in_bin_[new_node_idx] = threshold_bin;
Guolin Ke's avatar
Guolin Ke committed
70
71
72
73
74
75
  threshold_[new_node_idx] = threshold_double;
  if (bin_type == BinType::NumericalBin) {
    decision_type_[new_node_idx] = 0;
  } else {
    decision_type_[new_node_idx] = 1;
  }
Guolin Ke's avatar
Guolin Ke committed
76
77
78
79
80
81
82
  split_gain_[new_node_idx] = gain;
  // add two new leaves
  left_child_[new_node_idx] = ~leaf;
  right_child_[new_node_idx] = ~num_leaves_;
  // update new leaves
  leaf_parent_[leaf] = new_node_idx;
  leaf_parent_[num_leaves_] = new_node_idx;
83
84
  // save current leaf value to internal node before change
  internal_value_[new_node_idx] = leaf_value_[leaf];
Guolin Ke's avatar
Guolin Ke committed
85
  internal_count_[new_node_idx] = left_cnt + right_cnt;
Guolin Ke's avatar
Guolin Ke committed
86
  leaf_value_[leaf] = left_value;
Guolin Ke's avatar
Guolin Ke committed
87
  leaf_count_[leaf] = left_cnt;
Guolin Ke's avatar
Guolin Ke committed
88
  leaf_value_[num_leaves_] = right_value;
Guolin Ke's avatar
Guolin Ke committed
89
  leaf_count_[num_leaves_] = right_cnt;
Guolin Ke's avatar
Guolin Ke committed
90
91
92
  // update leaf depth
  leaf_depth_[num_leaves_] = leaf_depth_[leaf] + 1;
  leaf_depth_[leaf]++;
Guolin Ke's avatar
Guolin Ke committed
93
94
95
96
97

  ++num_leaves_;
  return num_leaves_ - 1;
}

98
void Tree::AddPredictionToScore(const Dataset* data, data_size_t num_data, double* score) const {
Guolin Ke's avatar
Guolin Ke committed
99
  Threading::For<data_size_t>(0, num_data, [this, data, score](int, data_size_t start, data_size_t end) {
Guolin Ke's avatar
Guolin Ke committed
100
    std::vector<std::unique_ptr<BinIterator>> iterators(data->num_features());
101
    for (int i = 0; i < data->num_features(); ++i) {
Guolin Ke's avatar
Guolin Ke committed
102
      iterators[i].reset(data->FeatureAt(i)->bin_data()->GetIterator(start));
Guolin Ke's avatar
Guolin Ke committed
103
    }
104
    for (data_size_t i = start; i < end; ++i) {
105
      score[i] += static_cast<double>(leaf_value_[GetLeaf(iterators, i)]);
Guolin Ke's avatar
Guolin Ke committed
106
107
108
109
110
    }
  });
}

void Tree::AddPredictionToScore(const Dataset* data, const data_size_t* used_data_indices,
111
                                             data_size_t num_data, double* score) const {
Guolin Ke's avatar
Guolin Ke committed
112
113
  Threading::For<data_size_t>(0, num_data,
      [this, data, used_data_indices, score](int, data_size_t start, data_size_t end) {
Guolin Ke's avatar
Guolin Ke committed
114
    std::vector<std::unique_ptr<BinIterator>> iterators(data->num_features());
115
    for (int i = 0; i < data->num_features(); ++i) {
Guolin Ke's avatar
Guolin Ke committed
116
      iterators[i].reset(data->FeatureAt(i)->bin_data()->GetIterator(used_data_indices[start]));
Guolin Ke's avatar
Guolin Ke committed
117
    }
118
    for (data_size_t i = start; i < end; ++i) {
119
      score[used_data_indices[i]] += static_cast<double>(leaf_value_[GetLeaf(iterators, used_data_indices[i])]);
Guolin Ke's avatar
Guolin Ke committed
120
121
122
123
124
    }
  });
}

std::string Tree::ToString() {
125
126
127
  std::stringstream str_buf;
  str_buf << "num_leaves=" << num_leaves_ << std::endl;
  str_buf << "split_feature="
Guolin Ke's avatar
Guolin Ke committed
128
    << Common::ArrayToString<int>(split_feature_real_, num_leaves_ - 1, ' ') << std::endl;
129
  str_buf << "split_gain="
Guolin Ke's avatar
Guolin Ke committed
130
    << Common::ArrayToString<double>(split_gain_, num_leaves_ - 1, ' ') << std::endl;
131
  str_buf << "threshold="
Guolin Ke's avatar
Guolin Ke committed
132
    << Common::ArrayToString<double>(threshold_, num_leaves_ - 1, ' ') << std::endl;
133
  str_buf << "decision_type="
Guolin Ke's avatar
Guolin Ke committed
134
    << Common::ArrayToString<int>(Common::ArrayCast<int8_t, int>(decision_type_), num_leaves_ - 1, ' ') << std::endl;
135
  str_buf << "left_child="
Guolin Ke's avatar
Guolin Ke committed
136
    << Common::ArrayToString<int>(left_child_, num_leaves_ - 1, ' ') << std::endl;
137
  str_buf << "right_child="
Guolin Ke's avatar
Guolin Ke committed
138
    << Common::ArrayToString<int>(right_child_, num_leaves_ - 1, ' ') << std::endl;
139
  str_buf << "leaf_parent="
Guolin Ke's avatar
Guolin Ke committed
140
    << Common::ArrayToString<int>(leaf_parent_, num_leaves_, ' ') << std::endl;
141
  str_buf << "leaf_value="
Guolin Ke's avatar
Guolin Ke committed
142
    << Common::ArrayToString<double>(leaf_value_, num_leaves_, ' ') << std::endl;
143
  str_buf << "leaf_count="
Guolin Ke's avatar
Guolin Ke committed
144
    << Common::ArrayToString<data_size_t>(leaf_count_, num_leaves_, ' ') << std::endl;
145
  str_buf << "internal_value="
Guolin Ke's avatar
Guolin Ke committed
146
    << Common::ArrayToString<double>(internal_value_, num_leaves_ - 1, ' ') << std::endl;
147
  str_buf << "internal_count="
Guolin Ke's avatar
Guolin Ke committed
148
    << Common::ArrayToString<data_size_t>(internal_count_, num_leaves_ - 1, ' ') << std::endl;
149
150
  str_buf << std::endl;
  return str_buf.str();
Guolin Ke's avatar
Guolin Ke committed
151
152
}

wxchan's avatar
wxchan committed
153
std::string Tree::ToJSON() {
154
  std::stringstream str_buf;
155
  str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2);
156
  str_buf << "\"num_leaves\":" << num_leaves_ << "," << std::endl;
wxchan's avatar
wxchan committed
157

158
  str_buf << "\"tree_structure\":" << NodeToJSON(0) << std::endl;
wxchan's avatar
wxchan committed
159

160
  return str_buf.str();
wxchan's avatar
wxchan committed
161
162
163
}

std::string Tree::NodeToJSON(int index) {
164
  std::stringstream str_buf;
165
  str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2);
wxchan's avatar
wxchan committed
166
167
  if (index >= 0) {
    // non-leaf
168
169
170
171
172
173
174
175
176
177
178
    str_buf << "{" << std::endl;
    str_buf << "\"split_index\":" << index << "," << std::endl;
    str_buf << "\"split_feature\":" << split_feature_real_[index] << "," << std::endl;
    str_buf << "\"split_gain\":" << split_gain_[index] << "," << std::endl;
    str_buf << "\"threshold\":" << threshold_[index] << "," << std::endl;
    str_buf << "\"decision_type\":\"" << Tree::GetDecisionTypeName(decision_type_[index]) << "\"," << std::endl;
    str_buf << "\"internal_value\":" << internal_value_[index] << "," << std::endl;
    str_buf << "\"internal_count\":" << internal_count_[index] << "," << std::endl;
    str_buf << "\"left_child\":" << NodeToJSON(left_child_[index]) << "," << std::endl;
    str_buf << "\"right_child\":" << NodeToJSON(right_child_[index]) << std::endl;
    str_buf << "}";
wxchan's avatar
wxchan committed
179
180
181
  } else {
    // leaf
    index = ~index;
182
183
184
185
186
187
    str_buf << "{" << std::endl;
    str_buf << "\"leaf_index\":" << index << "," << std::endl;
    str_buf << "\"leaf_parent\":" << leaf_parent_[index] << "," << std::endl;
    str_buf << "\"leaf_value\":" << leaf_value_[index] << "," << std::endl;
    str_buf << "\"leaf_count\":" << leaf_count_[index] << std::endl;
    str_buf << "}";
wxchan's avatar
wxchan committed
188
189
  }

190
  return str_buf.str();
wxchan's avatar
wxchan committed
191
192
}

Guolin Ke's avatar
Guolin Ke committed
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
Tree::Tree(const std::string& str) {
  std::vector<std::string> lines = Common::Split(str.c_str(), '\n');
  std::unordered_map<std::string, std::string> key_vals;
  for (const std::string& line : lines) {
    std::vector<std::string> tmp_strs = Common::Split(line.c_str(), '=');
    if (tmp_strs.size() == 2) {
      std::string key = Common::Trim(tmp_strs[0]);
      std::string val = Common::Trim(tmp_strs[1]);
      if (key.size() > 0 && val.size() > 0) {
        key_vals[key] = val;
      }
    }
  }
  if (key_vals.count("num_leaves") <= 0 || key_vals.count("split_feature") <= 0
    || key_vals.count("split_gain") <= 0 || key_vals.count("threshold") <= 0
    || key_vals.count("left_child") <= 0 || key_vals.count("right_child") <= 0
209
    || key_vals.count("leaf_parent") <= 0 || key_vals.count("leaf_value") <= 0
Guolin Ke's avatar
Guolin Ke committed
210
211
212
    || key_vals.count("internal_value") <= 0 || key_vals.count("internal_count") <= 0
    || key_vals.count("leaf_count") <= 0 || key_vals.count("decision_type") <= 0
    ) {
213
    Log::Fatal("Tree model string format error");
Guolin Ke's avatar
Guolin Ke committed
214
215
216
217
  }

  Common::Atoi(key_vals["num_leaves"].c_str(), &num_leaves_);

Guolin Ke's avatar
Guolin Ke committed
218
219
220
221
222
223
224
225
226
227
228
229
230
  left_child_ = Common::StringToArray<int>(key_vals["left_child"], ' ', num_leaves_ - 1);
  right_child_ = Common::StringToArray<int>(key_vals["right_child"], ' ', num_leaves_ - 1);
  split_feature_real_ = Common::StringToArray<int>(key_vals["split_feature"], ' ', num_leaves_ - 1);
  threshold_ = Common::StringToArray<double>(key_vals["threshold"], ' ', num_leaves_ - 1);
  split_gain_ = Common::StringToArray<double>(key_vals["split_gain"], ' ', num_leaves_ - 1);
  internal_count_ = Common::StringToArray<data_size_t>(key_vals["internal_count"], ' ', num_leaves_ - 1);
  internal_value_ = Common::StringToArray<double>(key_vals["internal_value"], ' ', num_leaves_ - 1);
  decision_type_ = Common::StringToArray<int8_t>(key_vals["decision_type"], ' ', num_leaves_ - 1);

  leaf_count_ = Common::StringToArray<data_size_t>(key_vals["leaf_count"], ' ', num_leaves_);
  leaf_parent_ = Common::StringToArray<int>(key_vals["leaf_parent"], ' ', num_leaves_);
  leaf_value_ = Common::StringToArray<double>(key_vals["leaf_value"], ' ', num_leaves_);

Guolin Ke's avatar
Guolin Ke committed
231
232
233
}

}  // namespace LightGBM