/*!
 * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for
 * license information.
 */
#ifndef LIGHTGBM_TREELEARNER_FEATURE_HISTOGRAM_HPP_
#define LIGHTGBM_TREELEARNER_FEATURE_HISTOGRAM_HPP_

9
#include <LightGBM/bin.h>
Guolin Ke's avatar
Guolin Ke committed
10
#include <LightGBM/dataset.h>
11
#include <LightGBM/utils/array_args.h>
Guolin Ke's avatar
Guolin Ke committed
12

13
#include <algorithm>
14
#include <cmath>
15
16
17
18
19
#include <cstring>
#include <memory>
#include <utility>
#include <vector>

20
#include "monotone_constraints.hpp"
Nikita Titov's avatar
Nikita Titov committed
21
#include "split_info.hpp"
Guolin Ke's avatar
Guolin Ke committed
22

23
namespace LightGBM {
Guolin Ke's avatar
Guolin Ke committed
24

Guolin Ke's avatar
Guolin Ke committed
25
class FeatureMetainfo {
26
 public:
Guolin Ke's avatar
Guolin Ke committed
27
  int num_bin;
Guolin Ke's avatar
Guolin Ke committed
28
  MissingType missing_type;
29
  int8_t offset = 0;
Guolin Ke's avatar
Guolin Ke committed
30
  uint32_t default_bin;
31
32
  int8_t monotone_type = 0;
  double penalty = 1.0;
Guolin Ke's avatar
Guolin Ke committed
33
  /*! \brief pointer of tree config */
Guolin Ke's avatar
Guolin Ke committed
34
  const Config* config;
35
  BinType bin_type;
36
37
38
  /*! \brief random number generator for extremely randomized trees */
  mutable Random rand;
  ;
Guolin Ke's avatar
Guolin Ke committed
39
};
Guolin Ke's avatar
Guolin Ke committed
40
/*!
41
42
43
 * \brief FeatureHistogram is used to construct and store a histogram for a
 * feature.
 */
Guolin Ke's avatar
Guolin Ke committed
44
class FeatureHistogram {
45
 public:
46
  FeatureHistogram() { data_ = nullptr; }
Guolin Ke's avatar
Guolin Ke committed
47

48
  // data_ points to externally managed memory, so there is nothing to release.
  ~FeatureHistogram() = default;
Guolin Ke's avatar
Guolin Ke committed
49

Guolin Ke's avatar
Guolin Ke committed
50
51
52
53
54
  /*! \brief Disable copy assignment (histograms reference external buffers) */
  FeatureHistogram& operator=(const FeatureHistogram&) = delete;
  /*! \brief Disable copy construction */
  FeatureHistogram(const FeatureHistogram&) = delete;

Guolin Ke's avatar
Guolin Ke committed
55
  /*!
   * \brief Init the feature histogram
   * \param data histogram buffer (gradient/hessian entries), owned by the caller
   * \param meta immutable per-feature meta information shared across histograms
   */
  void Init(hist_t* data, const FeatureMetainfo* meta) {
    meta_ = meta;
    data_ = data;
    // Bind the threshold-search function once, so FindBestThreshold does not
    // re-dispatch on bin type / config on every call.
    if (meta_->bin_type == BinType::NumericalBin) {
      FuncForNumrical();
    } else {
      FuncForCategorical();
    }
  }

70
  /*! \brief Raw pointer to the histogram buffer (not owned) */
  hist_t* RawData() { return data_; }
Guolin Ke's avatar
Guolin Ke committed
71
  /*!
72
73
74
   * \brief Subtract current histograms with other
   * \param other The histogram that want to subtract
   */
Guolin Ke's avatar
Guolin Ke committed
75
  void Subtract(const FeatureHistogram& other) {
76
77
    for (int i = 0; i < (meta_->num_bin - meta_->offset) * 2; ++i) {
      data_[i] -= other.data_[i];
Guolin Ke's avatar
Guolin Ke committed
78
79
    }
  }
80

81
82
83
84
  void FindBestThreshold(double sum_gradient, double sum_hessian,
                         data_size_t num_data,
                         const ConstraintEntry& constraints,
                         SplitInfo* output) {
Guolin Ke's avatar
Guolin Ke committed
85
    output->default_left = true;
Guolin Ke's avatar
Guolin Ke committed
86
    output->gain = kMinScore;
87
88
    find_best_threshold_fun_(sum_gradient, sum_hessian + 2 * kEpsilon, num_data,
                             constraints, output);
Guolin Ke's avatar
Guolin Ke committed
89
    output->gain *= meta_->penalty;
90
91
  }

92
93
94
  template <bool USE_RAND, bool USE_L1, bool USE_MAX_OUTPUT>
  double BeforeNumercal(double sum_gradient, double sum_hessian,
                        SplitInfo* output, int* rand_threshold) {
Guolin Ke's avatar
Guolin Ke committed
95
    is_splittable_ = false;
96
97
98
99
100
101
102
103
104
    output->monotone_type = meta_->monotone_type;
    double gain_shift = GetLeafGain<USE_L1, USE_MAX_OUTPUT>(
        sum_gradient, sum_hessian, meta_->config->lambda_l1,
        meta_->config->lambda_l2, meta_->config->max_delta_step);
    *rand_threshold = 0;
    if (USE_RAND) {
      if (meta_->num_bin - 2 > 0) {
        *rand_threshold = meta_->rand.NextInt(0, meta_->num_bin - 2);
      }
105
    }
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
    return gain_shift + meta_->config->min_gain_to_split;
  }

  /*! \brief Lift extra_trees / monotone-constraint config into template flags. */
  void FuncForNumrical() {
    const bool has_monotone = !meta_->config->monotone_constraints.empty();
    if (has_monotone) {
      if (meta_->config->extra_trees) {
        FuncForNumricalL1<true, true>();
      } else {
        FuncForNumricalL1<false, true>();
      }
    } else {
      if (meta_->config->extra_trees) {
        FuncForNumricalL1<true, false>();
      } else {
        FuncForNumricalL1<false, false>();
      }
    }
  }
  /*! \brief Lift lambda_l1 / max_delta_step config into template flags. */
  template <bool USE_RAND, bool USE_MC>
  void FuncForNumricalL1() {
    const bool use_max_output = meta_->config->max_delta_step > 0;
    if (use_max_output) {
      if (meta_->config->lambda_l1 > 0) {
        FuncForNumricalL2<USE_RAND, USE_MC, true, true>();
      } else {
        FuncForNumricalL2<USE_RAND, USE_MC, false, true>();
      }
    } else {
      if (meta_->config->lambda_l1 > 0) {
        FuncForNumricalL2<USE_RAND, USE_MC, true, false>();
      } else {
        FuncForNumricalL2<USE_RAND, USE_MC, false, false>();
      }
    }
  }

  /*!
   * \brief Bind find_best_threshold_fun_ for a numerical feature.
   *
   * The bound lambda runs BeforeNumercal and then one or two
   * FindBestThresholdSequence scans. The three trailing template flags of
   * FindBestThresholdSequence appear to select scan direction and the
   * missing-value handling (skip-default-bin for MissingType::Zero,
   * NA-as-missing for MissingType::NaN) — confirm against its definition,
   * which is outside this view.
   */
  template <bool USE_RAND, bool USE_MC, bool USE_L1, bool USE_MAX_OUTPUT>
  void FuncForNumricalL2() {
    if (meta_->num_bin > 2 && meta_->missing_type != MissingType::None) {
      if (meta_->missing_type == MissingType::Zero) {
        // Missing encoded as zero: scan in both directions, skipping the
        // default bin.
        find_best_threshold_fun_ =
            [=](double sum_gradient, double sum_hessian, data_size_t num_data,
                const ConstraintEntry& constraints, SplitInfo* output) {
              int rand_threshold = 0;
              double min_gain_shift =
                  BeforeNumercal<USE_RAND, USE_L1, USE_MAX_OUTPUT>(
                      sum_gradient, sum_hessian, output, &rand_threshold);
              FindBestThresholdSequence<USE_RAND, USE_MC, USE_L1,
                                        USE_MAX_OUTPUT, true, true, false>(
                  sum_gradient, sum_hessian, num_data, constraints,
                  min_gain_shift, output, rand_threshold);
              FindBestThresholdSequence<USE_RAND, USE_MC, USE_L1,
                                        USE_MAX_OUTPUT, false, true, false>(
                  sum_gradient, sum_hessian, num_data, constraints,
                  min_gain_shift, output, rand_threshold);
            };
      } else {
        // Missing encoded as NaN: scan in both directions, treating the NA
        // bin as missing.
        find_best_threshold_fun_ =
            [=](double sum_gradient, double sum_hessian, data_size_t num_data,
                const ConstraintEntry& constraints, SplitInfo* output) {
              int rand_threshold = 0;
              double min_gain_shift =
                  BeforeNumercal<USE_RAND, USE_L1, USE_MAX_OUTPUT>(
                      sum_gradient, sum_hessian, output, &rand_threshold);
              FindBestThresholdSequence<USE_RAND, USE_MC, USE_L1,
                                        USE_MAX_OUTPUT, true, false, true>(
                  sum_gradient, sum_hessian, num_data, constraints,
                  min_gain_shift, output, rand_threshold);
              FindBestThresholdSequence<USE_RAND, USE_MC, USE_L1,
                                        USE_MAX_OUTPUT, false, false, true>(
                  sum_gradient, sum_hessian, num_data, constraints,
                  min_gain_shift, output, rand_threshold);
            };
      }
    } else {
      // No usable missing-value handling: a single scan suffices.
      if (meta_->missing_type != MissingType::NaN) {
        find_best_threshold_fun_ =
            [=](double sum_gradient, double sum_hessian, data_size_t num_data,
                const ConstraintEntry& constraints, SplitInfo* output) {
              int rand_threshold = 0;
              double min_gain_shift =
                  BeforeNumercal<USE_RAND, USE_L1, USE_MAX_OUTPUT>(
                      sum_gradient, sum_hessian, output, &rand_threshold);
              FindBestThresholdSequence<USE_RAND, USE_MC, USE_L1,
                                        USE_MAX_OUTPUT, true, false, false>(
                  sum_gradient, sum_hessian, num_data, constraints,
                  min_gain_shift, output, rand_threshold);
            };
      } else {
        // All values are NaN-capable but too few bins to branch on missing:
        // same single scan, but send missing values to the right child.
        find_best_threshold_fun_ =
            [=](double sum_gradient, double sum_hessian, data_size_t num_data,
                const ConstraintEntry& constraints, SplitInfo* output) {
              int rand_threshold = 0;
              double min_gain_shift =
                  BeforeNumercal<USE_RAND, USE_L1, USE_MAX_OUTPUT>(
                      sum_gradient, sum_hessian, output, &rand_threshold);
              FindBestThresholdSequence<USE_RAND, USE_MC, USE_L1,
                                        USE_MAX_OUTPUT, true, false, false>(
                  sum_gradient, sum_hessian, num_data, constraints,
                  min_gain_shift, output, rand_threshold);
              output->default_left = false;
            };
      }
    }
  }
210

211
  /*! \brief Lift extra_trees / monotone-constraint config into template flags. */
  void FuncForCategorical() {
    const bool has_monotone = !meta_->config->monotone_constraints.empty();
    if (has_monotone) {
      if (meta_->config->extra_trees) {
        FuncForCategoricalL1<true, true>();
      } else {
        FuncForCategoricalL1<false, true>();
      }
    } else {
      if (meta_->config->extra_trees) {
        FuncForCategoricalL1<true, false>();
      } else {
        FuncForCategoricalL1<false, false>();
      }
    }
  }
  template <bool USE_RAND, bool USE_MC>
  void FuncForCategoricalL1() {
    if (meta_->config->lambda_l1 > 0) {
      if (meta_->config->max_delta_step > 0) {
        find_best_threshold_fun_ =
            std::bind(&FeatureHistogram::FindBestThresholdCategoricalInner<
                          USE_RAND, USE_MC, true, true>,
                      this, std::placeholders::_1, std::placeholders::_2,
                      std::placeholders::_3, std::placeholders::_4,
                      std::placeholders::_5);
      } else {
        find_best_threshold_fun_ =
            std::bind(&FeatureHistogram::FindBestThresholdCategoricalInner<
                          USE_RAND, USE_MC, true, false>,
                      this, std::placeholders::_1, std::placeholders::_2,
                      std::placeholders::_3, std::placeholders::_4,
                      std::placeholders::_5);
      }
244
    } else {
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
      if (meta_->config->max_delta_step > 0) {
        find_best_threshold_fun_ =
            std::bind(&FeatureHistogram::FindBestThresholdCategoricalInner<
                          USE_RAND, USE_MC, false, true>,
                      this, std::placeholders::_1, std::placeholders::_2,
                      std::placeholders::_3, std::placeholders::_4,
                      std::placeholders::_5);
      } else {
        find_best_threshold_fun_ =
            std::bind(&FeatureHistogram::FindBestThresholdCategoricalInner<
                          USE_RAND, USE_MC, false, false>,
                      this, std::placeholders::_1, std::placeholders::_2,
                      std::placeholders::_3, std::placeholders::_4,
                      std::placeholders::_5);
      }
260
261
262
    }
  }

263
264
265
266
267
268
269
  /*!
   * \brief Find the best categorical split for this feature.
   *
   * Two strategies: one-hot (one category vs. the rest) when the feature has
   * at most max_cat_to_onehot bins, otherwise a many-vs-many search over
   * categories sorted by their gradient/hessian ratio.
   * \param constraints monotone constraint entry (only used when USE_MC)
   * \param[out] output best split found; only written when a split beats
   *             min_gain_shift
   */
  template <bool USE_RAND, bool USE_MC, bool USE_L1, bool USE_MAX_OUTPUT>
  void FindBestThresholdCategoricalInner(double sum_gradient,
                                         double sum_hessian,
                                         data_size_t num_data,
                                         const ConstraintEntry& constraints,
                                         SplitInfo* output) {
    is_splittable_ = false;
    output->default_left = false;
    double best_gain = kMinScore;
    data_size_t best_left_count = 0;
    double best_sum_left_gradient = 0;
    double best_sum_left_hessian = 0;
    // Gain of the unsplit leaf; splits must beat this plus min_gain_to_split.
    double gain_shift = GetLeafGain<USE_L1, USE_MAX_OUTPUT>(
        sum_gradient, sum_hessian, meta_->config->lambda_l1,
        meta_->config->lambda_l2, meta_->config->max_delta_step);

    double min_gain_shift = gain_shift + meta_->config->min_gain_to_split;
    bool is_full_categorical = meta_->missing_type == MissingType::None;
    int used_bin = meta_->num_bin - 1 + is_full_categorical;

    std::vector<int> sorted_idx;
    double l2 = meta_->config->lambda_l2;
    bool use_onehot = meta_->num_bin <= meta_->config->max_cat_to_onehot;
    int best_threshold = -1;
    int best_dir = 1;
    // Converts a hessian sum to an (approximate) data count.
    const double cnt_factor = num_data / sum_hessian;
    int rand_threshold = 0;
    if (use_onehot) {
      if (USE_RAND) {
        if (used_bin > 0) {
          rand_threshold = meta_->rand.NextInt(0, used_bin);
        }
      }
      // Try each category as the sole left child.
      for (int t = 0; t < used_bin; ++t) {
        const auto grad = GET_GRAD(data_, t);
        const auto hess = GET_HESS(data_, t);
        data_size_t cnt =
            static_cast<data_size_t>(Common::RoundInt(hess * cnt_factor));
        // if data not enough, or sum hessian too small
        if (cnt < meta_->config->min_data_in_leaf ||
            hess < meta_->config->min_sum_hessian_in_leaf)
          continue;
        data_size_t other_count = num_data - cnt;
        // if data not enough
        if (other_count < meta_->config->min_data_in_leaf) continue;

        double sum_other_hessian = sum_hessian - hess - kEpsilon;
        // if sum hessian too small
        if (sum_other_hessian < meta_->config->min_sum_hessian_in_leaf)
          continue;

        double sum_other_gradient = sum_gradient - grad;
        // Extra-trees mode: only evaluate the randomly chosen category.
        if (USE_RAND) {
          if (t != rand_threshold) {
            continue;
          }
        }
        // current split gain
        double current_gain = GetSplitGains<USE_MC, USE_L1, USE_MAX_OUTPUT>(
            sum_other_gradient, sum_other_hessian, grad, hess + kEpsilon,
            meta_->config->lambda_l1, l2, meta_->config->max_delta_step,
            constraints, 0);
        // gain with split is worse than without split
        if (current_gain <= min_gain_shift) continue;

        // mark to is splittable
        is_splittable_ = true;
        // better split point
        if (current_gain > best_gain) {
          best_threshold = t;
          best_sum_left_gradient = grad;
          best_sum_left_hessian = hess + kEpsilon;
          best_left_count = cnt;
          best_gain = current_gain;
        }
      }
    } else {
      // Many-vs-many: keep only categories with enough (smoothed) mass.
      for (int i = 0; i < used_bin; ++i) {
        if (Common::RoundInt(GET_HESS(data_, i) * cnt_factor) >=
            meta_->config->cat_smooth) {
          sorted_idx.push_back(i);
        }
      }
      used_bin = static_cast<int>(sorted_idx.size());

      l2 += meta_->config->cat_l2;

      // Smoothed gradient/hessian ratio used as the category ordering key.
      auto ctr_fun = [this](double sum_grad, double sum_hess) {
        return (sum_grad) / (sum_hess + meta_->config->cat_smooth);
      };
      std::sort(sorted_idx.begin(), sorted_idx.end(),
                [this, &ctr_fun](int i, int j) {
                  return ctr_fun(GET_GRAD(data_, i), GET_HESS(data_, i)) <
                         ctr_fun(GET_GRAD(data_, j), GET_HESS(data_, j));
                });

      // Scan the sorted categories from both ends.
      std::vector<int> find_direction(1, 1);
      std::vector<int> start_position(1, 0);
      find_direction.push_back(-1);
      start_position.push_back(used_bin - 1);
      const int max_num_cat =
          std::min(meta_->config->max_cat_threshold, (used_bin + 1) / 2);
      int max_threshold = std::max(std::min(max_num_cat, used_bin) - 1, 0);
      if (USE_RAND) {
        if (max_threshold > 0) {
          rand_threshold = meta_->rand.NextInt(0, max_threshold);
        }
      }

      is_splittable_ = false;
      for (size_t out_i = 0; out_i < find_direction.size(); ++out_i) {
        auto dir = find_direction[out_i];
        auto start_pos = start_position[out_i];
        data_size_t min_data_per_group = meta_->config->min_data_per_group;
        data_size_t cnt_cur_group = 0;
        double sum_left_gradient = 0.0f;
        double sum_left_hessian = kEpsilon;
        data_size_t left_count = 0;
        for (int i = 0; i < used_bin && i < max_num_cat; ++i) {
          auto t = sorted_idx[start_pos];
          start_pos += dir;
          const auto grad = GET_GRAD(data_, t);
          const auto hess = GET_HESS(data_, t);
          data_size_t cnt =
              static_cast<data_size_t>(Common::RoundInt(hess * cnt_factor));

          // Grow the left partition by one category.
          sum_left_gradient += grad;
          sum_left_hessian += hess;
          left_count += cnt;
          cnt_cur_group += cnt;

          if (left_count < meta_->config->min_data_in_leaf ||
              sum_left_hessian < meta_->config->min_sum_hessian_in_leaf)
            continue;
          data_size_t right_count = num_data - left_count;
          // Right side can only shrink from here, so break rather than skip.
          if (right_count < meta_->config->min_data_in_leaf ||
              right_count < min_data_per_group)
            break;

          double sum_right_hessian = sum_hessian - sum_left_hessian;
          if (sum_right_hessian < meta_->config->min_sum_hessian_in_leaf) break;

          if (cnt_cur_group < min_data_per_group) continue;

          cnt_cur_group = 0;

          double sum_right_gradient = sum_gradient - sum_left_gradient;
          // Extra-trees mode: only evaluate the randomly chosen prefix size.
          if (USE_RAND) {
            if (i != rand_threshold) {
              continue;
            }
          }
          double current_gain = GetSplitGains<USE_MC, USE_L1, USE_MAX_OUTPUT>(
              sum_left_gradient, sum_left_hessian, sum_right_gradient,
              sum_right_hessian, meta_->config->lambda_l1, l2,
              meta_->config->max_delta_step, constraints, 0);
          if (current_gain <= min_gain_shift) continue;
          is_splittable_ = true;
          if (current_gain > best_gain) {
            best_left_count = left_count;
            best_sum_left_gradient = sum_left_gradient;
            best_sum_left_hessian = sum_left_hessian;
            best_threshold = i;
            best_gain = current_gain;
            best_dir = dir;
          }
        }
      }
    }

    if (is_splittable_) {
      output->left_output =
          CalculateSplittedLeafOutput<USE_MC, USE_L1, USE_MAX_OUTPUT>(
              best_sum_left_gradient, best_sum_left_hessian,
              meta_->config->lambda_l1, l2, meta_->config->max_delta_step,
              constraints);
      output->left_count = best_left_count;
      output->left_sum_gradient = best_sum_left_gradient;
      output->left_sum_hessian = best_sum_left_hessian - kEpsilon;
      output->right_output =
          CalculateSplittedLeafOutput<USE_MC, USE_L1, USE_MAX_OUTPUT>(
              sum_gradient - best_sum_left_gradient,
              sum_hessian - best_sum_left_hessian, meta_->config->lambda_l1, l2,
              meta_->config->max_delta_step, constraints);
      output->right_count = num_data - best_left_count;
      output->right_sum_gradient = sum_gradient - best_sum_left_gradient;
      output->right_sum_hessian =
          sum_hessian - best_sum_left_hessian - kEpsilon;
      output->gain = best_gain - min_gain_shift;
      if (use_onehot) {
        output->num_cat_threshold = 1;
        output->cat_threshold =
            std::vector<uint32_t>(1, static_cast<uint32_t>(best_threshold));
      } else {
        // best_threshold is a prefix length into sorted_idx; materialize the
        // chosen categories in scan order.
        output->num_cat_threshold = best_threshold + 1;
        output->cat_threshold =
            std::vector<uint32_t>(output->num_cat_threshold);
        if (best_dir == 1) {
          for (int i = 0; i < output->num_cat_threshold; ++i) {
            auto t = sorted_idx[i];
            output->cat_threshold[i] = t;
          }
        } else {
          for (int i = 0; i < output->num_cat_threshold; ++i) {
            auto t = sorted_idx[used_bin - 1 - i];
            output->cat_threshold[i] = t;
          }
        }
      }
      // Monotone constraints do not apply to categorical splits.
      output->monotone_type = 0;
    }
  }

476
  void GatherInfoForThreshold(double sum_gradient, double sum_hessian,
477
478
                              uint32_t threshold, data_size_t num_data,
                              SplitInfo* output) {
479
    if (meta_->bin_type == BinType::NumericalBin) {
480
481
      GatherInfoForThresholdNumerical(sum_gradient, sum_hessian, threshold,
                                      num_data, output);
482
    } else {
483
484
      GatherInfoForThresholdCategorical(sum_gradient, sum_hessian, threshold,
                                        num_data, output);
485
486
487
488
    }
  }

  /*!
   * \brief Fill SplitInfo for a fixed numerical threshold (forced split).
   *
   * Accumulates the right partition by scanning bins from right to left,
   * derives the left partition by subtraction, and rejects the split if its
   * gain does not beat the unsplit leaf.
   */
  void GatherInfoForThresholdNumerical(double sum_gradient, double sum_hessian,
                                       uint32_t threshold, data_size_t num_data,
                                       SplitInfo* output) {
    double gain_shift = GetLeafGain<true, true>(
        sum_gradient, sum_hessian, meta_->config->lambda_l1,
        meta_->config->lambda_l2, meta_->config->max_delta_step);
    double min_gain_shift = gain_shift + meta_->config->min_gain_to_split;

    const int8_t offset = meta_->offset;

    // Right-partition accumulators; left is derived by subtraction below.
    double sum_right_gradient = 0.0f;
    double sum_right_hessian = kEpsilon;
    data_size_t right_count = 0;

    // Missing-value handling mirrors the search path: Zero means skip the
    // default bin, NaN means the last bin holds missing values.
    bool use_na_as_missing = false;
    bool skip_default_bin = false;
    if (meta_->missing_type == MissingType::Zero) {
      skip_default_bin = true;
    } else if (meta_->missing_type == MissingType::NaN) {
      use_na_as_missing = true;
    }

    int t = meta_->num_bin - 1 - offset - use_na_as_missing;
    const int t_end = 1 - offset;
    // Converts a hessian sum to an (approximate) data count.
    const double cnt_factor = num_data / sum_hessian;
    // from right to left, and we don't need data in bin0
    for (; t >= t_end; --t) {
      // Bins below the threshold belong to the left partition.
      if (static_cast<uint32_t>(t + offset) < threshold) {
        break;
      }

      // need to skip default bin
      if (skip_default_bin &&
          (t + offset) == static_cast<int>(meta_->default_bin)) {
        continue;
      }
      const auto grad = GET_GRAD(data_, t);
      const auto hess = GET_HESS(data_, t);
      data_size_t cnt =
          static_cast<data_size_t>(Common::RoundInt(hess * cnt_factor));
      sum_right_gradient += grad;
      sum_right_hessian += hess;
      right_count += cnt;
    }
    double sum_left_gradient = sum_gradient - sum_right_gradient;
    double sum_left_hessian = sum_hessian - sum_right_hessian;
    data_size_t left_count = num_data - right_count;
    double current_gain =
        GetLeafGain<true, true>(
            sum_left_gradient, sum_left_hessian, meta_->config->lambda_l1,
            meta_->config->lambda_l2, meta_->config->max_delta_step) +
        GetLeafGain<true, true>(
            sum_right_gradient, sum_right_hessian, meta_->config->lambda_l1,
            meta_->config->lambda_l2, meta_->config->max_delta_step);

    // gain with split is worse than without split
    if (std::isnan(current_gain) || current_gain <= min_gain_shift) {
      output->gain = kMinScore;
      Log::Warning(
          "'Forced Split' will be ignored since the gain getting worse. ");
      return;
    }

    // update split information
    output->threshold = threshold;
    output->left_output = CalculateSplittedLeafOutput<true, true>(
        sum_left_gradient, sum_left_hessian, meta_->config->lambda_l1,
        meta_->config->lambda_l2, meta_->config->max_delta_step);
    output->left_count = left_count;
    output->left_sum_gradient = sum_left_gradient;
    output->left_sum_hessian = sum_left_hessian - kEpsilon;
    output->right_output = CalculateSplittedLeafOutput<true, true>(
        sum_gradient - sum_left_gradient, sum_hessian - sum_left_hessian,
        meta_->config->lambda_l1, meta_->config->lambda_l2,
        meta_->config->max_delta_step);
    output->right_count = num_data - left_count;
    output->right_sum_gradient = sum_gradient - sum_left_gradient;
    output->right_sum_hessian = sum_hessian - sum_left_hessian - kEpsilon;
    output->gain = current_gain - min_gain_shift;
    output->default_left = true;
  }

572
573
574
575
  /*!
   * \brief Fill SplitInfo for a given one-hot categorical split (forced
   *        split): the category `threshold` goes left, everything else right.
   */
  void GatherInfoForThresholdCategorical(double sum_gradient,
                                         double sum_hessian, uint32_t threshold,
                                         data_size_t num_data,
                                         SplitInfo* output) {
    // get SplitInfo for a given one-hot categorical split.
    output->default_left = false;
    double gain_shift = GetLeafGain<true, true>(
        sum_gradient, sum_hessian, meta_->config->lambda_l1,
        meta_->config->lambda_l2, meta_->config->max_delta_step);
    double min_gain_shift = gain_shift + meta_->config->min_gain_to_split;
    bool is_full_categorical = meta_->missing_type == MissingType::None;
    int used_bin = meta_->num_bin - 1 + is_full_categorical;
    if (threshold >= static_cast<uint32_t>(used_bin)) {
      output->gain = kMinScore;
      Log::Warning("Invalid categorical threshold split");
      return;
    }
    // Converts a hessian sum to an (approximate) data count.
    const double cnt_factor = num_data / sum_hessian;
    const auto grad = GET_GRAD(data_, threshold);
    const auto hess = GET_HESS(data_, threshold);
    data_size_t cnt =
        static_cast<data_size_t>(Common::RoundInt(hess * cnt_factor));

    double l2 = meta_->config->lambda_l2;
    data_size_t left_count = cnt;
    data_size_t right_count = num_data - left_count;
    double sum_left_hessian = hess + kEpsilon;
    double sum_right_hessian = sum_hessian - sum_left_hessian;
    double sum_left_gradient = grad;
    double sum_right_gradient = sum_gradient - sum_left_gradient;
    // current split gain
    double current_gain =
        GetLeafGain<true, true>(sum_right_gradient, sum_right_hessian,
                                meta_->config->lambda_l1, l2,
                                meta_->config->max_delta_step) +
        GetLeafGain<true, true>(sum_left_gradient, sum_left_hessian,
                                meta_->config->lambda_l1, l2,
                                meta_->config->max_delta_step);
    // Reject the forced split if it does not beat the unsplit leaf.
    if (std::isnan(current_gain) || current_gain <= min_gain_shift) {
      output->gain = kMinScore;
      Log::Warning(
          "'Forced Split' will be ignored since the gain getting worse.");
      return;
    }

    output->left_output = CalculateSplittedLeafOutput<true, true>(
        sum_left_gradient, sum_left_hessian, meta_->config->lambda_l1, l2,
        meta_->config->max_delta_step);
    output->left_count = left_count;
    output->left_sum_gradient = sum_left_gradient;
    output->left_sum_hessian = sum_left_hessian - kEpsilon;
    output->right_output = CalculateSplittedLeafOutput<true, true>(
        sum_right_gradient, sum_right_hessian, meta_->config->lambda_l1, l2,
        meta_->config->max_delta_step);
    output->right_count = right_count;
    output->right_sum_gradient = sum_gradient - sum_left_gradient;
    output->right_sum_hessian = sum_right_hessian - kEpsilon;
    output->gain = current_gain - min_gain_shift;
    output->num_cat_threshold = 1;
    output->cat_threshold = std::vector<uint32_t>(1, threshold);
  }

Guolin Ke's avatar
Guolin Ke committed
634
  /*!
   * \brief Binary size of this histogram
   */
  int SizeOfHistgram() const {
    // Each stored bin occupies kHistEntrySize bytes.
    return (meta_->num_bin - meta_->offset) * kHistEntrySize;
  }

  /*!
   * \brief Restore histogram from memory
   * \param memory_data source buffer; must hold at least SizeOfHistgram() bytes
   */
  void FromMemory(char* memory_data) {
    std::memcpy(data_, memory_data,
                (meta_->num_bin - meta_->offset) * kHistEntrySize);
  }

  /*!
   * \brief True if this histogram can be splitted
   *        (set by the most recent threshold search)
   */
  bool is_splittable() { return is_splittable_; }

  /*!
   * \brief Set splittable to this histogram
   */
  void set_is_splittable(bool val) { is_splittable_ = val; }

659
660
661
662
663
  /*! \brief L1 soft-thresholding: shrink |s| by l1 (floored at 0), keep sign. */
  static double ThresholdL1(double s, double l1) {
    return Common::Sign(s) * std::max(0.0, std::fabs(s) - l1);
  }

664
665
666
667
668
669
670
671
672
673
674
  /*!
   * \brief Optimal leaf output for the given gradient statistics.
   *
   * With USE_L1, the gradient sum is soft-thresholded by l1 first; with
   * USE_MAX_OUTPUT, the result is capped at +/- max_delta_step (when > 0).
   */
  template <bool USE_L1, bool USE_MAX_OUTPUT>
  static double CalculateSplittedLeafOutput(double sum_gradients,
                                            double sum_hessians, double l1,
                                            double l2, double max_delta_step) {
    const double numerator =
        USE_L1 ? ThresholdL1(sum_gradients, l1) : sum_gradients;
    const double ret = -numerator / (sum_hessians + l2);
    if (USE_MAX_OUTPUT) {
      if (max_delta_step > 0 && std::fabs(ret) > max_delta_step) {
        return Common::Sign(ret) * max_delta_step;
      }
    }
    return ret;
  }

687
688
689
690
691
692
693
694
695
696
697
698
699
700
  /*!
   * \brief Leaf output, additionally clamped into the monotone-constraint
   *        interval [constraints.min, constraints.max] when USE_MC is set.
   */
  template <bool USE_MC, bool USE_L1, bool USE_MAX_OUTPUT>
  static double CalculateSplittedLeafOutput(
      double sum_gradients, double sum_hessians, double l1, double l2,
      double max_delta_step, const ConstraintEntry& constraints) {
    const double raw_output = CalculateSplittedLeafOutput<USE_L1, USE_MAX_OUTPUT>(
        sum_gradients, sum_hessians, l1, l2, max_delta_step);
    if (!USE_MC) {
      return raw_output;
    }
    // Lower bound is checked first, matching the original if/else-if order.
    if (raw_output < constraints.min) {
      return constraints.min;
    }
    if (raw_output > constraints.max) {
      return constraints.max;
    }
    return raw_output;
  }

703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
 private:
  /*!
   * \brief Total gain of a candidate split: left-leaf gain + right-leaf gain.
   *        Without monotone constraints the closed-form leaf gain is used.
   *        With constraints, child outputs are clamped first and a split
   *        whose outputs violate the required ordering scores 0.
   */
  template <bool USE_MC, bool USE_L1, bool USE_MAX_OUTPUT>
  static double GetSplitGains(double sum_left_gradients,
                              double sum_left_hessians,
                              double sum_right_gradients,
                              double sum_right_hessians, double l1, double l2,
                              double max_delta_step,
                              const ConstraintEntry& constraints,
                              int8_t monotone_constraint) {
    if (!USE_MC) {
      const double gain_left = GetLeafGain<USE_L1, USE_MAX_OUTPUT>(
          sum_left_gradients, sum_left_hessians, l1, l2, max_delta_step);
      const double gain_right = GetLeafGain<USE_L1, USE_MAX_OUTPUT>(
          sum_right_gradients, sum_right_hessians, l1, l2, max_delta_step);
      return gain_left + gain_right;
    }
    const double output_left =
        CalculateSplittedLeafOutput<USE_MC, USE_L1, USE_MAX_OUTPUT>(
            sum_left_gradients, sum_left_hessians, l1, l2, max_delta_step,
            constraints);
    const double output_right =
        CalculateSplittedLeafOutput<USE_MC, USE_L1, USE_MAX_OUTPUT>(
            sum_right_gradients, sum_right_hessians, l1, l2, max_delta_step,
            constraints);
    // Increasing constraint demands left <= right; decreasing the reverse.
    const bool breaks_increasing =
        (monotone_constraint > 0) && (output_left > output_right);
    const bool breaks_decreasing =
        (monotone_constraint < 0) && (output_left < output_right);
    if (breaks_increasing || breaks_decreasing) {
      return 0;
    }
    // Evaluate the gain at the (possibly clamped) outputs.
    return GetLeafGainGivenOutput<USE_L1>(sum_left_gradients,
                                          sum_left_hessians, l1, l2,
                                          output_left) +
           GetLeafGainGivenOutput<USE_L1>(sum_right_gradients,
                                          sum_right_hessians, l1, l2,
                                          output_right);
  }
Guolin Ke's avatar
Guolin Ke committed
738

739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
  /*!
   * \brief Gain contribution of a single leaf.
   *        Closed form G'^2 / (H + l2) when no output cap applies; with
   *        USE_MAX_OUTPUT the capped output has no closed form, so the gain
   *        is evaluated at the actual output value.
   */
  template <bool USE_L1, bool USE_MAX_OUTPUT>
  static double GetLeafGain(double sum_gradients, double sum_hessians,
                            double l1, double l2, double max_delta_step) {
    if (USE_MAX_OUTPUT) {
      const double output = CalculateSplittedLeafOutput<USE_L1, USE_MAX_OUTPUT>(
          sum_gradients, sum_hessians, l1, l2, max_delta_step);
      return GetLeafGainGivenOutput<USE_L1>(sum_gradients, sum_hessians, l1,
                                            l2, output);
    }
    // Compile-time ternary: L1-shrunk gradient sum when USE_L1 is set.
    const double g = USE_L1 ? ThresholdL1(sum_gradients, l1) : sum_gradients;
    return (g * g) / (sum_hessians + l2);
  }

757
758
759
760
761
762
763
764
765
766
767
  /*!
   * \brief Gain of a leaf whose output value w is already fixed:
   *        -(2 G' w + (H + l2) w^2), with G' the (optionally L1-shrunk)
   *        gradient sum.
   */
  template <bool USE_L1>
  static double GetLeafGainGivenOutput(double sum_gradients,
                                       double sum_hessians, double l1,
                                       double l2, double output) {
    const double g = USE_L1 ? ThresholdL1(sum_gradients, l1) : sum_gradients;
    return -(2.0 * g * output + (sum_hessians + l2) * output * output);
  }
Guolin Ke's avatar
Guolin Ke committed
769

770
771
772
773
774
775
776
  /*!
   * \brief Scan histogram bins in one direction and record the best numerical
   *        split found in *output (only if it beats output->gain).
   * \tparam USE_RAND         only evaluate the pre-drawn rand_threshold
   *                          (extremely randomized trees)
   * \tparam USE_MC           apply monotone constraints to child outputs
   * \tparam USE_L1           apply L1 regularization
   * \tparam USE_MAX_OUTPUT   cap leaf output magnitude by max_delta_step
   * \tparam REVERSE          scan from the last bin toward the first; the
   *                          resulting split sends default/missing left
   * \tparam SKIP_DEFAULT_BIN skip meta_->default_bin during the scan
   * \tparam NA_AS_MISSING    treat NA as missing (affects scan start / seed
   *                          of the left accumulators)
   * \param sum_gradient   total gradient sum of the leaf
   * \param sum_hessian    total hessian sum of the leaf (kEpsilon-padded)
   * \param num_data       number of data points in the leaf
   * \param constraints    monotone output bounds for this leaf
   * \param min_gain_shift gain a split must exceed to be considered
   * \param output         best split so far; updated in place when beaten
   * \param rand_threshold pre-drawn threshold index for USE_RAND
   */
  template <bool USE_RAND, bool USE_MC, bool USE_L1, bool USE_MAX_OUTPUT,
            bool REVERSE, bool SKIP_DEFAULT_BIN, bool NA_AS_MISSING>
  void FindBestThresholdSequence(double sum_gradient, double sum_hessian,
                                 data_size_t num_data,
                                 const ConstraintEntry& constraints,
                                 double min_gain_shift, SplitInfo* output,
                                 int rand_threshold) {
    const int8_t offset = meta_->offset;
    double best_sum_left_gradient = NAN;
    double best_sum_left_hessian = NAN;
    double best_gain = kMinScore;
    data_size_t best_left_count = 0;
    uint32_t best_threshold = static_cast<uint32_t>(meta_->num_bin);
    // Bin counts are not stored; they are recovered from hessian mass:
    // cnt ~= hess * (num_data / sum_hessian).
    const double cnt_factor = num_data / sum_hessian;
    if (REVERSE) {
      // Right-side accumulators; hessian seeded with kEpsilon to avoid
      // division by zero (subtracted back out when writing the result).
      double sum_right_gradient = 0.0f;
      double sum_right_hessian = kEpsilon;
      data_size_t right_count = 0;

      int t = meta_->num_bin - 1 - offset - NA_AS_MISSING;
      const int t_end = 1 - offset;

      // from right to left, and we don't need data in bin0
      for (; t >= t_end; --t) {
        // need to skip default bin
        if (SKIP_DEFAULT_BIN) {
          if ((t + offset) == static_cast<int>(meta_->default_bin)) {
            continue;
          }
        }
        const auto grad = GET_GRAD(data_, t);
        const auto hess = GET_HESS(data_, t);
        data_size_t cnt =
            static_cast<data_size_t>(Common::RoundInt(hess * cnt_factor));
        sum_right_gradient += grad;
        sum_right_hessian += hess;
        right_count += cnt;
        // if data not enough, or sum hessian too small
        if (right_count < meta_->config->min_data_in_leaf ||
            sum_right_hessian < meta_->config->min_sum_hessian_in_leaf)
          continue;
        data_size_t left_count = num_data - right_count;
        // if data not enough
        if (left_count < meta_->config->min_data_in_leaf) break;

        double sum_left_hessian = sum_hessian - sum_right_hessian;
        // if sum hessian too small
        if (sum_left_hessian < meta_->config->min_sum_hessian_in_leaf) break;

        double sum_left_gradient = sum_gradient - sum_right_gradient;
        // Extremely randomized trees: only the pre-drawn threshold counts.
        if (USE_RAND) {
          if (t - 1 + offset != rand_threshold) {
            continue;
          }
        }
        // current split gain
        double current_gain = GetSplitGains<USE_MC, USE_L1, USE_MAX_OUTPUT>(
            sum_left_gradient, sum_left_hessian, sum_right_gradient,
            sum_right_hessian, meta_->config->lambda_l1,
            meta_->config->lambda_l2, meta_->config->max_delta_step,
            constraints, meta_->monotone_type);
        // gain with split is worse than without split
        if (current_gain <= min_gain_shift) continue;

        // mark to is splittable
        is_splittable_ = true;
        // better split point
        if (current_gain > best_gain) {
          best_left_count = left_count;
          best_sum_left_gradient = sum_left_gradient;
          best_sum_left_hessian = sum_left_hessian;
          // left is <= threshold, right is > threshold.  so this is t-1
          best_threshold = static_cast<uint32_t>(t - 1 + offset);
          best_gain = current_gain;
        }
      }
    } else {
      // Forward scan: left-side accumulators, same kEpsilon seeding.
      double sum_left_gradient = 0.0f;
      double sum_left_hessian = kEpsilon;
      data_size_t left_count = 0;

      int t = 0;
      const int t_end = meta_->num_bin - 2 - offset;

      // NA-as-missing with offset 1: seed left with the leaf totals and
      // subtract all stored bins, so the remainder (the un-stored NA mass)
      // starts on the left; t = -1 makes the first loop iteration evaluate
      // this seed split before any bin is added.
      if (NA_AS_MISSING) {
        if (offset == 1) {
          sum_left_gradient = sum_gradient;
          sum_left_hessian = sum_hessian - kEpsilon;
          left_count = num_data;
          for (int i = 0; i < meta_->num_bin - offset; ++i) {
            const auto grad = GET_GRAD(data_, i);
            const auto hess = GET_HESS(data_, i);
            data_size_t cnt =
                static_cast<data_size_t>(Common::RoundInt(hess * cnt_factor));
            sum_left_gradient -= grad;
            sum_left_hessian -= hess;
            left_count -= cnt;
          }
          t = -1;
        }
      }

      for (; t <= t_end; ++t) {
        // need to skip default bin
        if (SKIP_DEFAULT_BIN) {
          if ((t + offset) == static_cast<int>(meta_->default_bin)) {
            continue;
          }
        }
        if (t >= 0) {
          sum_left_gradient += GET_GRAD(data_, t);
          sum_left_hessian += GET_HESS(data_, t);
          left_count += static_cast<data_size_t>(
              Common::RoundInt(GET_HESS(data_, t) * cnt_factor));
        }
        // if data not enough, or sum hessian too small
        if (left_count < meta_->config->min_data_in_leaf ||
            sum_left_hessian < meta_->config->min_sum_hessian_in_leaf)
          continue;
        data_size_t right_count = num_data - left_count;
        // if data not enough
        if (right_count < meta_->config->min_data_in_leaf) break;

        double sum_right_hessian = sum_hessian - sum_left_hessian;
        // if sum hessian too small
        if (sum_right_hessian < meta_->config->min_sum_hessian_in_leaf) break;

        double sum_right_gradient = sum_gradient - sum_left_gradient;
        // Extremely randomized trees: only the pre-drawn threshold counts.
        if (USE_RAND) {
          if (t + offset != rand_threshold) {
            continue;
          }
        }
        // current split gain
        double current_gain = GetSplitGains<USE_MC, USE_L1, USE_MAX_OUTPUT>(
            sum_left_gradient, sum_left_hessian, sum_right_gradient,
            sum_right_hessian, meta_->config->lambda_l1,
            meta_->config->lambda_l2, meta_->config->max_delta_step,
            constraints, meta_->monotone_type);
        // gain with split is worse than without split
        if (current_gain <= min_gain_shift) continue;

        // mark to is splittable
        is_splittable_ = true;
        // better split point
        if (current_gain > best_gain) {
          best_left_count = left_count;
          best_sum_left_gradient = sum_left_gradient;
          best_sum_left_hessian = sum_left_hessian;
          best_threshold = static_cast<uint32_t>(t + offset);
          best_gain = current_gain;
        }
      }
    }

    if (is_splittable_ && best_gain > output->gain + min_gain_shift) {
      // update split information
      output->threshold = best_threshold;
      output->left_output =
          CalculateSplittedLeafOutput<USE_MC, USE_L1, USE_MAX_OUTPUT>(
              best_sum_left_gradient, best_sum_left_hessian,
              meta_->config->lambda_l1, meta_->config->lambda_l2,
              meta_->config->max_delta_step, constraints);
      output->left_count = best_left_count;
      output->left_sum_gradient = best_sum_left_gradient;
      // remove the kEpsilon seed before reporting the hessian sums
      output->left_sum_hessian = best_sum_left_hessian - kEpsilon;
      output->right_output =
          CalculateSplittedLeafOutput<USE_MC, USE_L1, USE_MAX_OUTPUT>(
              sum_gradient - best_sum_left_gradient,
              sum_hessian - best_sum_left_hessian, meta_->config->lambda_l1,
              meta_->config->lambda_l2, meta_->config->max_delta_step,
              constraints);
      output->right_count = num_data - best_left_count;
      output->right_sum_gradient = sum_gradient - best_sum_left_gradient;
      output->right_sum_hessian =
          sum_hessian - best_sum_left_hessian - kEpsilon;
      output->gain = best_gain - min_gain_shift;
      // reverse scan means default/missing values go to the left child
      output->default_left = REVERSE;
    }
  }

Guolin Ke's avatar
Guolin Ke committed
950
  /*! \brief Per-feature meta information (bin count, offset, default bin,
   *  monotone type, config); not owned */
  const FeatureMetainfo* meta_;
  /*! \brief Histogram storage: per-bin gradient/hessian entries accessed via
   *  GET_GRAD/GET_HESS; points into a buffer owned by HistogramPool */
  hist_t* data_;
  /*! \brief Set during threshold search when a split beating the minimum
   *  gain is found; defaults to true so a fresh histogram is searched */
  bool is_splittable_ = true;

  /*! \brief Bound split-search routine for this feature
   *  (sum_gradient, sum_hessian, num_data, constraints, output) */
  std::function<void(double, double, data_size_t, const ConstraintEntry&,
                     SplitInfo*)>
      find_best_threshold_fun_;
};
Nikita Titov's avatar
Nikita Titov committed
959

Guolin Ke's avatar
Guolin Ke committed
960
class HistogramPool {
961
 public:
Guolin Ke's avatar
Guolin Ke committed
962
  /*!
963
964
   * \brief Constructor
   */
Guolin Ke's avatar
Guolin Ke committed
965
  HistogramPool() {
Guolin Ke's avatar
Guolin Ke committed
966
967
    cache_size_ = 0;
    total_size_ = 0;
Guolin Ke's avatar
Guolin Ke committed
968
  }
969

Guolin Ke's avatar
Guolin Ke committed
970
  /*!
971
972
973
   * \brief Destructor
   */
  ~HistogramPool() {}
974

Guolin Ke's avatar
Guolin Ke committed
975
  /*!
976
977
978
979
   * \brief Reset pool size
   * \param cache_size Max cache size
   * \param total_size Total size will be used
   */
Guolin Ke's avatar
Guolin Ke committed
980
  void Reset(int cache_size, int total_size) {
Guolin Ke's avatar
Guolin Ke committed
981
982
    cache_size_ = cache_size;
    // at least need 2 bucket to store smaller leaf and larger leaf
983
    CHECK_GE(cache_size_, 2);
Guolin Ke's avatar
Guolin Ke committed
984
985
986
987
988
989
    total_size_ = total_size;
    if (cache_size_ > total_size_) {
      cache_size_ = total_size_;
    }
    is_enough_ = (cache_size_ == total_size_);
    if (!is_enough_) {
990
991
992
      mapper_.resize(total_size_);
      inverse_mapper_.resize(cache_size_);
      last_used_time_.resize(cache_size_);
Guolin Ke's avatar
Guolin Ke committed
993
994
995
      ResetMap();
    }
  }
996

Guolin Ke's avatar
Guolin Ke committed
997
  /*!
998
999
   * \brief Reset mapper
   */
Guolin Ke's avatar
Guolin Ke committed
1000
1001
1002
1003
1004
1005
1006
1007
  void ResetMap() {
    if (!is_enough_) {
      cur_time_ = 0;
      std::fill(mapper_.begin(), mapper_.end(), -1);
      std::fill(inverse_mapper_.begin(), inverse_mapper_.end(), -1);
      std::fill(last_used_time_.begin(), last_used_time_.end(), 0);
    }
  }
1008
1009
1010
  template <bool USE_DATA, bool USE_CONFIG>
  static void SetFeatureInfo(const Dataset* train_data, const Config* config,
                             std::vector<FeatureMetainfo>* feature_meta) {
1011
1012
1013
    auto& ref_feature_meta = *feature_meta;
    const int num_feature = train_data->num_features();
    ref_feature_meta.resize(num_feature);
1014
#pragma omp parallel for schedule(static, 512) if (num_feature >= 1024)
1015
    for (int i = 0; i < num_feature; ++i) {
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
      if (USE_DATA) {
        ref_feature_meta[i].num_bin = train_data->FeatureNumBin(i);
        ref_feature_meta[i].default_bin =
            train_data->FeatureBinMapper(i)->GetDefaultBin();
        ref_feature_meta[i].missing_type =
            train_data->FeatureBinMapper(i)->missing_type();
        if (train_data->FeatureBinMapper(i)->GetMostFreqBin() == 0) {
          ref_feature_meta[i].offset = 1;
        } else {
          ref_feature_meta[i].offset = 0;
        }
        ref_feature_meta[i].bin_type =
            train_data->FeatureBinMapper(i)->bin_type();
1029
      }
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
      if (USE_CONFIG) {
        const int real_fidx = train_data->RealFeatureIndex(i);
        if (!config->monotone_constraints.empty()) {
          ref_feature_meta[i].monotone_type =
              config->monotone_constraints[real_fidx];
        } else {
          ref_feature_meta[i].monotone_type = 0;
        }
        if (!config->feature_contri.empty()) {
          ref_feature_meta[i].penalty = config->feature_contri[real_fidx];
        } else {
          ref_feature_meta[i].penalty = 1.0;
        }
        ref_feature_meta[i].rand = Random(config->extra_seed + i);
1044
1045
1046
1047
1048
      }
      ref_feature_meta[i].config = config;
    }
  }

1049
1050
  void DynamicChangeSize(const Dataset* train_data, bool is_hist_colwise,
                         const Config* config, int cache_size, int total_size) {
Guolin Ke's avatar
Guolin Ke committed
1051
    if (feature_metas_.empty()) {
1052
      SetFeatureInfo<true, true>(train_data, config, &feature_metas_);
1053
      uint64_t bin_cnt_over_features = 0;
1054
      for (int i = 0; i < train_data->num_features(); ++i) {
1055
1056
        bin_cnt_over_features +=
            static_cast<uint64_t>(feature_metas_[i].num_bin);
Guolin Ke's avatar
Guolin Ke committed
1057
      }
1058
      Log::Info("Total Bins %d", bin_cnt_over_features);
Guolin Ke's avatar
Guolin Ke committed
1059
    }
Guolin Ke's avatar
Guolin Ke committed
1060
    int old_cache_size = static_cast<int>(pool_.size());
Guolin Ke's avatar
Guolin Ke committed
1061
    Reset(cache_size, total_size);
Guolin Ke's avatar
Guolin Ke committed
1062
1063
1064
1065
1066

    if (cache_size > old_cache_size) {
      pool_.resize(cache_size);
      data_.resize(cache_size);
    }
1067
    int num_total_bin = static_cast<int>(train_data->NumTotalBin());
Guolin Ke's avatar
Guolin Ke committed
1068

1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
    std::vector<int> offsets;
    if (is_hist_colwise) {
      int offset = 0;
      for (int j = 0; j < train_data->num_features(); ++j) {
        offset += train_data->SubFeatureBinOffset(j);
        offsets.push_back(offset);
        auto num_bin = train_data->FeatureNumBin(j);
        if (train_data->FeatureBinMapper(j)->GetMostFreqBin() == 0) {
          num_bin -= 1;
        }
        offset += num_bin;
      }
    } else {
      num_total_bin = 1;
      for (int j = 0; j < train_data->num_features(); ++j) {
        offsets.push_back(num_total_bin);
        num_total_bin += train_data->FeatureBinMapper(j)->num_bin();
        if (train_data->FeatureBinMapper(j)->GetMostFreqBin() == 0) {
          num_total_bin -= 1;
        }
      }
    }
1091
    OMP_INIT_EX();
1092
#pragma omp parallel for schedule(static)
Guolin Ke's avatar
Guolin Ke committed
1093
    for (int i = old_cache_size; i < cache_size; ++i) {
1094
      OMP_LOOP_EX_BEGIN();
Guolin Ke's avatar
Guolin Ke committed
1095
      pool_[i].reset(new FeatureHistogram[train_data->num_features()]);
1096
      data_[i].resize(num_total_bin * 2);
Guolin Ke's avatar
Guolin Ke committed
1097
      for (int j = 0; j < train_data->num_features(); ++j) {
1098
        pool_[i][j].Init(data_[i].data() + offsets[j] * 2, &feature_metas_[j]);
Guolin Ke's avatar
Guolin Ke committed
1099
      }
1100
      OMP_LOOP_EX_END();
Guolin Ke's avatar
Guolin Ke committed
1101
    }
1102
    OMP_THROW_EX();
Guolin Ke's avatar
Guolin Ke committed
1103
1104
  }

1105
  void ResetConfig(const Dataset* train_data, const Config* config) {
1106
    SetFeatureInfo<false, true>(train_data, config, &feature_metas_);
Guolin Ke's avatar
Guolin Ke committed
1107
  }
1108

Guolin Ke's avatar
Guolin Ke committed
1109
  /*!
1110
1111
1112
1113
1114
1115
   * \brief Get data for the specific index
   * \param idx which index want to get
   * \param out output data will store into this
   * \return True if this index is in the pool, False if this index is not in
   * the pool
   */
Guolin Ke's avatar
Guolin Ke committed
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
  bool Get(int idx, FeatureHistogram** out) {
    if (is_enough_) {
      *out = pool_[idx].get();
      return true;
    } else if (mapper_[idx] >= 0) {
      int slot = mapper_[idx];
      *out = pool_[slot].get();
      last_used_time_[slot] = ++cur_time_;
      return true;
    } else {
1126
      // choose the least used slot
Guolin Ke's avatar
Guolin Ke committed
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
      int slot = static_cast<int>(ArrayArgs<int>::ArgMin(last_used_time_));
      *out = pool_[slot].get();
      last_used_time_[slot] = ++cur_time_;

      // reset previous mapper
      if (inverse_mapper_[slot] >= 0) mapper_[inverse_mapper_[slot]] = -1;

      // update current mapper
      mapper_[idx] = slot;
      inverse_mapper_[slot] = idx;
      return false;
    }
  }

  /*!
1142
1143
1144
1145
   * \brief Move data from one index to another index
   * \param src_idx
   * \param dst_idx
   */
Guolin Ke's avatar
Guolin Ke committed
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
  void Move(int src_idx, int dst_idx) {
    if (is_enough_) {
      std::swap(pool_[src_idx], pool_[dst_idx]);
      return;
    }
    if (mapper_[src_idx] < 0) {
      return;
    }
    // get slot of src idx
    int slot = mapper_[src_idx];
    // reset src_idx
    mapper_[src_idx] = -1;

    // move to dst idx
    mapper_[dst_idx] = slot;
    last_used_time_[slot] = ++cur_time_;
    inverse_mapper_[slot] = dst_idx;
  }
1164

1165
 private:
Guolin Ke's avatar
Guolin Ke committed
1166
  std::vector<std::unique_ptr<FeatureHistogram[]>> pool_;
1167
1168
1169
  std::vector<
      std::vector<hist_t, Common::AlignmentAllocator<hist_t, kAlignedSize>>>
      data_;
Guolin Ke's avatar
Guolin Ke committed
1170
  std::vector<FeatureMetainfo> feature_metas_;
Guolin Ke's avatar
Guolin Ke committed
1171
1172
1173
1174
1175
1176
1177
1178
1179
  int cache_size_;
  int total_size_;
  bool is_enough_ = false;
  std::vector<int> mapper_;
  std::vector<int> inverse_mapper_;
  std::vector<int> last_used_time_;
  int cur_time_ = 0;
};

Guolin Ke's avatar
Guolin Ke committed
1180
}  // namespace LightGBM
1181
#endif  // LIGHTGBM_TREELEARNER_FEATURE_HISTOGRAM_HPP_