"src/vscode:/vscode.git/clone" did not exist on "19e085c9925ac039cf2ce17649013b1ef69385ee"
leaf_splits.hpp 9.97 KB
Newer Older
1
2
3
4
/*!
 * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */
Guolin Ke's avatar
Guolin Ke committed
5
6
7
#ifndef LIGHTGBM_TREELEARNER_LEAF_SPLITS_HPP_
#define LIGHTGBM_TREELEARNER_LEAF_SPLITS_HPP_

Guolin Ke's avatar
Guolin Ke committed
8
#include <LightGBM/config.h>
9
#include <LightGBM/meta.h>
10
#include <LightGBM/utils/threading.h>
11

12
#include <limits>
Guolin Ke's avatar
Guolin Ke committed
13
14
#include <vector>

15
16
#include "data_partition.hpp"

Guolin Ke's avatar
Guolin Ke committed
17
18
19
namespace LightGBM {

/*!
Qiwei Ye's avatar
Qiwei Ye committed
20
* \brief used to find split candidates for a leaf
Guolin Ke's avatar
Guolin Ke committed
21
22
*/
class LeafSplits {
Nikita Titov's avatar
Nikita Titov committed
23
 public:
Guolin Ke's avatar
Guolin Ke committed
24
  LeafSplits(data_size_t num_data, const Config* config)
25
26
27
      : deterministic_(false),
        num_data_in_leaf_(num_data),
        num_data_(num_data),
Belinda Trotta's avatar
Belinda Trotta committed
28
    data_indices_(nullptr), weight_(0) {
29
30
31
    if (config != nullptr) {
      deterministic_ = config->deterministic;
    }
Guolin Ke's avatar
Guolin Ke committed
32
  }
Guolin Ke's avatar
Guolin Ke committed
33
34
35
36
  void ResetNumData(data_size_t num_data) {
    num_data_ = num_data;
    num_data_in_leaf_ = num_data;
  }
Guolin Ke's avatar
Guolin Ke committed
37
38
39
40
  ~LeafSplits() {
  }

  /*!
Qiwei Ye's avatar
Qiwei Ye committed
41
  * \brief Init split on current leaf on partial data. 
Guolin Ke's avatar
Guolin Ke committed
42
43
44
45
46
  * \param leaf Index of current leaf
  * \param data_partition current data partition
  * \param sum_gradients
  * \param sum_hessians
  */
Belinda Trotta's avatar
Belinda Trotta committed
47
48
  void Init(int leaf, const DataPartition* data_partition, double sum_gradients,
            double sum_hessians, double weight) {
Guolin Ke's avatar
Guolin Ke committed
49
    leaf_index_ = leaf;
Guolin Ke's avatar
Guolin Ke committed
50
    data_indices_ = data_partition->GetIndexOnLeaf(leaf, &num_data_in_leaf_);
Guolin Ke's avatar
Guolin Ke committed
51
52
    sum_gradients_ = sum_gradients;
    sum_hessians_ = sum_hessians;
Belinda Trotta's avatar
Belinda Trotta committed
53
    weight_ = weight;
54
55
  }

56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
  /*!
  * \brief Init split on current leaf on partial data. 
  * \param leaf Index of current leaf
  * \param data_partition current data partition
  * \param sum_gradients
  * \param sum_hessians
  * \param sum_gradients_and_hessians
  * \param weight
  */
  void Init(int leaf, const DataPartition* data_partition, double sum_gradients,
            double sum_hessians, int64_t sum_gradients_and_hessians, double weight) {
    leaf_index_ = leaf;
    data_indices_ = data_partition->GetIndexOnLeaf(leaf, &num_data_in_leaf_);
    sum_gradients_ = sum_gradients;
    sum_hessians_ = sum_hessians;
    int_sum_gradients_and_hessians_ = sum_gradients_and_hessians;
    weight_ = weight;
  }

75
76
77
78
79
80
81
82
83
84
  /*!
  * \brief Init split on current leaf on partial data.
  * \param leaf Index of current leaf
  * \param sum_gradients
  * \param sum_hessians
  */
  void Init(int leaf, double sum_gradients, double sum_hessians) {
    leaf_index_ = leaf;
    sum_gradients_ = sum_gradients;
    sum_hessians_ = sum_hessians;
Guolin Ke's avatar
Guolin Ke committed
85
86
  }

Guolin Ke's avatar
Guolin Ke committed
87
  /*!
88
89
90
91
   * \brief Init splits on the current leaf, it will traverse all data to sum up the results
   * \param gradients
   * \param hessians
   */
92
  void Init(const score_t* gradients, const score_t* hessians) {
Guolin Ke's avatar
Guolin Ke committed
93
94
95
    num_data_in_leaf_ = num_data_;
    leaf_index_ = 0;
    data_indices_ = nullptr;
Guolin Ke's avatar
Guolin Ke committed
96
97
    double tmp_sum_gradients = 0.0f;
    double tmp_sum_hessians = 0.0f;
98
#pragma omp parallel for schedule(static, 512) reduction(+:tmp_sum_gradients, tmp_sum_hessians) if (num_data_in_leaf_ >= 1024 && !deterministic_)
Guolin Ke's avatar
Guolin Ke committed
99
100
101
    for (data_size_t i = 0; i < num_data_in_leaf_; ++i) {
      tmp_sum_gradients += gradients[i];
      tmp_sum_hessians += hessians[i];
102
    }
Guolin Ke's avatar
Guolin Ke committed
103
104
    sum_gradients_ = tmp_sum_gradients;
    sum_hessians_ = tmp_sum_hessians;
Guolin Ke's avatar
Guolin Ke committed
105
106
  }

107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138

  /*!
   * \brief Init splits on the current leaf, it will traverse all data to sum up the results
   * \param int_gradients_and_hessians Discretized gradients and hessians
   * \param grad_scale Scaling factor to recover original gradients from discretized gradients
   * \param hess_scale Scaling factor to recover original hessians from discretized hessians
   */
  void Init(const int8_t* int_gradients_and_hessians,
    const double grad_scale, const double hess_scale) {
    num_data_in_leaf_ = num_data_;
    leaf_index_ = 0;
    data_indices_ = nullptr;
    double tmp_sum_gradients = 0.0f;
    double tmp_sum_hessians = 0.0f;
    const int16_t* packed_int_gradients_and_hessians = reinterpret_cast<const int16_t*>(int_gradients_and_hessians);
    int64_t tmp_sum_gradients_and_hessians = 0;
#pragma omp parallel for schedule(static, 512) reduction(+:tmp_sum_gradients, tmp_sum_hessians, tmp_sum_gradients_and_hessians) if (num_data_in_leaf_ >= 1024 && !deterministic_)
    for (data_size_t i = 0; i < num_data_in_leaf_; ++i) {
      tmp_sum_gradients += int_gradients_and_hessians[2 * i + 1] * grad_scale;
      tmp_sum_hessians += int_gradients_and_hessians[2 * i] * hess_scale;
      const int16_t packed_int_grad_and_hess = packed_int_gradients_and_hessians[i];
      const int64_t packed_long_int_grad_and_hess =
        (static_cast<int64_t>(static_cast<int8_t>(packed_int_grad_and_hess >> 8)) << 32) |
        (static_cast<int64_t>(packed_int_grad_and_hess & 0x00ff));
      tmp_sum_gradients_and_hessians += packed_long_int_grad_and_hess;
    }
    sum_gradients_ = tmp_sum_gradients;
    sum_hessians_ = tmp_sum_hessians;
    int_sum_gradients_and_hessians_ = tmp_sum_gradients_and_hessians;
  }


Guolin Ke's avatar
Guolin Ke committed
139
  /*!
140
141
142
143
144
145
146
147
   * \brief Init splits on current leaf of partial data.
   * \param leaf Index of current leaf
   * \param data_partition current data partition
   * \param gradients
   * \param hessians
   */
  void Init(int leaf, const DataPartition* data_partition,
            const score_t* gradients, const score_t* hessians) {
Guolin Ke's avatar
Guolin Ke committed
148
    leaf_index_ = leaf;
Guolin Ke's avatar
Guolin Ke committed
149
    data_indices_ = data_partition->GetIndexOnLeaf(leaf, &num_data_in_leaf_);
Guolin Ke's avatar
Guolin Ke committed
150
151
    double tmp_sum_gradients = 0.0f;
    double tmp_sum_hessians = 0.0f;
152
#pragma omp parallel for schedule(static, 512) reduction(+:tmp_sum_gradients, tmp_sum_hessians) if (num_data_in_leaf_ >= 1024 && !deterministic_)
Guolin Ke's avatar
Guolin Ke committed
153
154
155
156
    for (data_size_t i = 0; i < num_data_in_leaf_; ++i) {
      const data_size_t idx = data_indices_[i];
      tmp_sum_gradients += gradients[idx];
      tmp_sum_hessians += hessians[idx];
157
    }
Guolin Ke's avatar
Guolin Ke committed
158
159
    sum_gradients_ = tmp_sum_gradients;
    sum_hessians_ = tmp_sum_hessians;
Guolin Ke's avatar
Guolin Ke committed
160
161
162
  }


163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
  /*!
   * \brief Init splits on current leaf of partial data.
   * \param leaf Index of current leaf
   * \param data_partition current data partition
   * \param int_gradients_and_hessians Discretized gradients and hessians
   * \param grad_scale Scaling factor to recover original gradients from discretized gradients
   * \param hess_scale Scaling factor to recover original hessians from discretized hessians
   */
  void Init(int leaf, const DataPartition* data_partition,
            const int8_t* int_gradients_and_hessians,
            const score_t grad_scale, const score_t hess_scale) {
    leaf_index_ = leaf;
    data_indices_ = data_partition->GetIndexOnLeaf(leaf, &num_data_in_leaf_);
    double tmp_sum_gradients = 0.0f;
    double tmp_sum_hessians = 0.0f;
    const int16_t* packed_int_gradients_and_hessians = reinterpret_cast<const int16_t*>(int_gradients_and_hessians);
    int64_t tmp_sum_gradients_and_hessians = 0;
#pragma omp parallel for schedule(static, 512) reduction(+:tmp_sum_gradients, tmp_sum_hessians, tmp_sum_gradients_and_hessians) if (num_data_in_leaf_ >= 1024 && deterministic_)
    for (data_size_t i = 0; i < num_data_in_leaf_; ++i) {
      const data_size_t idx = data_indices_[i];
      tmp_sum_gradients += int_gradients_and_hessians[2 * idx + 1] * grad_scale;
      tmp_sum_hessians += int_gradients_and_hessians[2 * idx] * hess_scale;
      const int16_t packed_int_grad_and_hess = packed_int_gradients_and_hessians[i];
      const int64_t packed_long_int_grad_and_hess =
        (static_cast<int64_t>(static_cast<int8_t>(packed_int_grad_and_hess >> 8)) << 32) |
        (static_cast<int64_t>(packed_int_grad_and_hess & 0x00ff));
      tmp_sum_gradients_and_hessians += packed_long_int_grad_and_hess;
    }
    sum_gradients_ = tmp_sum_gradients;
    sum_hessians_ = tmp_sum_hessians;
    int_sum_gradients_and_hessians_ = tmp_sum_gradients_and_hessians;
  }


Guolin Ke's avatar
Guolin Ke committed
197
198
199
200
201
  /*!
  * \brief Init splits on current leaf, only update sum_gradients and sum_hessians
  * \param sum_gradients
  * \param sum_hessians
  */
Guolin Ke's avatar
Guolin Ke committed
202
  void Init(double sum_gradients, double sum_hessians) {
Guolin Ke's avatar
Guolin Ke committed
203
204
205
206
207
    leaf_index_ = 0;
    sum_gradients_ = sum_gradients;
    sum_hessians_ = sum_hessians;
  }

208
209
210
211
212
213
214
215
216
217
218
219
220
  /*!
  * \brief Init splits on current leaf, only update sum_gradients and sum_hessians
  * \param sum_gradients
  * \param sum_hessians
  * \param int_sum_gradients_and_hessians
  */
  void Init(double sum_gradients, double sum_hessians, int64_t int_sum_gradients_and_hessians) {
    leaf_index_ = 0;
    sum_gradients_ = sum_gradients;
    sum_hessians_ = sum_hessians;
    int_sum_gradients_and_hessians_ = int_sum_gradients_and_hessians;
  }

Guolin Ke's avatar
Guolin Ke committed
221
222
223
224
225
  /*!
  * \brief Init splits on current leaf
  */
  void Init() {
    leaf_index_ = -1;
Guolin Ke's avatar
Guolin Ke committed
226
227
    data_indices_ = nullptr;
    num_data_in_leaf_ = 0;
Guolin Ke's avatar
Guolin Ke committed
228
229
230
231
  }


  /*! \brief Get current leaf index */
232
  int leaf_index() const { return leaf_index_; }
Guolin Ke's avatar
Guolin Ke committed
233

Andrew Ziem's avatar
Andrew Ziem committed
234
  /*! \brief Get number of data in current leaf */
Guolin Ke's avatar
Guolin Ke committed
235
236
237
  data_size_t num_data_in_leaf() const { return num_data_in_leaf_; }

  /*! \brief Get sum of gradients of current leaf */
Guolin Ke's avatar
Guolin Ke committed
238
  double sum_gradients() const { return sum_gradients_; }
239

Andrew Ziem's avatar
Andrew Ziem committed
240
  /*! \brief Get sum of Hessians of current leaf */
Guolin Ke's avatar
Guolin Ke committed
241
  double sum_hessians() const { return sum_hessians_; }
Guolin Ke's avatar
Guolin Ke committed
242

243
244
245
  /*! \brief Get sum of discretized gradients and Hessians of current leaf */
  int64_t int_sum_gradients_and_hessians() const { return int_sum_gradients_and_hessians_; }

Guolin Ke's avatar
Guolin Ke committed
246
  /*! \brief Get indices of data of current leaf */
Guolin Ke's avatar
Guolin Ke committed
247
  const data_size_t* data_indices() const { return data_indices_; }
Guolin Ke's avatar
Guolin Ke committed
248

Belinda Trotta's avatar
Belinda Trotta committed
249
250
251
252
  /*! \brief Get weight of current leaf */
  double weight() const { return weight_; }


Guolin Ke's avatar
Guolin Ke committed
253

Nikita Titov's avatar
Nikita Titov committed
254
 private:
255
  bool deterministic_;
Guolin Ke's avatar
Guolin Ke committed
256
257
258
259
260
261
262
  /*! \brief current leaf index */
  int leaf_index_;
  /*! \brief number of data on current leaf */
  data_size_t num_data_in_leaf_;
  /*! \brief number of all training data */
  data_size_t num_data_;
  /*! \brief sum of gradients of current leaf */
Guolin Ke's avatar
Guolin Ke committed
263
  double sum_gradients_;
Andrew Ziem's avatar
Andrew Ziem committed
264
  /*! \brief sum of Hessians of current leaf */
Guolin Ke's avatar
Guolin Ke committed
265
  double sum_hessians_;
266
267
  /*! \brief sum of discretized gradients and Hessians of current leaf */
  int64_t int_sum_gradients_and_hessians_;
Guolin Ke's avatar
Guolin Ke committed
268
  /*! \brief indices of data of current leaf */
Guolin Ke's avatar
Guolin Ke committed
269
  const data_size_t* data_indices_;
Belinda Trotta's avatar
Belinda Trotta committed
270
271
  /*! \brief weight of current leaf */
  double weight_;
Guolin Ke's avatar
Guolin Ke committed
272
273
274
};

}  // namespace LightGBM
Guolin Ke's avatar
Guolin Ke committed
275
#endif   // LightGBM_TREELEARNER_LEAF_SPLITS_HPP_