/*!
 * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for
 * license information.
 */
#ifndef LIGHTGBM_TREELEARNER_FEATURE_HISTOGRAM_HPP_
#define LIGHTGBM_TREELEARNER_FEATURE_HISTOGRAM_HPP_

#include <LightGBM/bin.h>
#include <LightGBM/dataset.h>
#include <LightGBM/utils/array_args.h>

#include <algorithm>
#include <cmath>
#include <cstring>
#include <memory>
#include <utility>
#include <vector>

#include "monotone_constraints.hpp"
#include "split_info.hpp"
23
namespace LightGBM {
Guolin Ke's avatar
Guolin Ke committed
24

Guolin Ke's avatar
Guolin Ke committed
25
class FeatureMetainfo {
26
 public:
Guolin Ke's avatar
Guolin Ke committed
27
  int num_bin;
Guolin Ke's avatar
Guolin Ke committed
28
  MissingType missing_type;
29
  int8_t offset = 0;
Guolin Ke's avatar
Guolin Ke committed
30
  uint32_t default_bin;
31
32
  int8_t monotone_type = 0;
  double penalty = 1.0;
Guolin Ke's avatar
Guolin Ke committed
33
  /*! \brief pointer of tree config */
Guolin Ke's avatar
Guolin Ke committed
34
  const Config* config;
35
  BinType bin_type;
36
37
  /*! \brief random number generator for extremely randomized trees */
  mutable Random rand;
Guolin Ke's avatar
Guolin Ke committed
38
};
Guolin Ke's avatar
Guolin Ke committed
39
/*!
40
41
42
 * \brief FeatureHistogram is used to construct and store a histogram for a
 * feature.
 */
Guolin Ke's avatar
Guolin Ke committed
43
class FeatureHistogram {
44
 public:
45
  FeatureHistogram() { data_ = nullptr; }
Guolin Ke's avatar
Guolin Ke committed
46

47
  ~FeatureHistogram() {}
Guolin Ke's avatar
Guolin Ke committed
48

Guolin Ke's avatar
Guolin Ke committed
49
50
51
52
53
  /*! \brief Disable copy */
  FeatureHistogram& operator=(const FeatureHistogram&) = delete;
  /*! \brief Disable copy */
  FeatureHistogram(const FeatureHistogram&) = delete;

Guolin Ke's avatar
Guolin Ke committed
54
  /*!
55
56
57
58
   * \brief Init the feature histogram
   * \param feature the feature data for this histogram
   * \param min_num_data_one_leaf minimal number of data in one leaf
   */
59
  void Init(hist_t* data, const FeatureMetainfo* meta) {
Guolin Ke's avatar
Guolin Ke committed
60
61
    meta_ = meta;
    data_ = data;
62
63
64
65
    ResetFunc();
  }

  void ResetFunc() {
66
    if (meta_->bin_type == BinType::NumericalBin) {
67
      FuncForNumrical();
68
    } else {
69
      FuncForCategorical();
70
    }
Guolin Ke's avatar
Guolin Ke committed
71
72
  }

73
  hist_t* RawData() { return data_; }
74

Guolin Ke's avatar
Guolin Ke committed
75
  /*!
76
77
78
   * \brief Subtract current histograms with other
   * \param other The histogram that want to subtract
   */
Guolin Ke's avatar
Guolin Ke committed
79
  void Subtract(const FeatureHistogram& other) {
80
81
    for (int i = 0; i < (meta_->num_bin - meta_->offset) * 2; ++i) {
      data_[i] -= other.data_[i];
Guolin Ke's avatar
Guolin Ke committed
82
83
    }
  }
84

85
86
87
88
  void FindBestThreshold(double sum_gradient, double sum_hessian,
                         data_size_t num_data,
                         const ConstraintEntry& constraints,
                         SplitInfo* output) {
Guolin Ke's avatar
Guolin Ke committed
89
    output->default_left = true;
Guolin Ke's avatar
Guolin Ke committed
90
    output->gain = kMinScore;
91
92
    find_best_threshold_fun_(sum_gradient, sum_hessian + 2 * kEpsilon, num_data,
                             constraints, output);
Guolin Ke's avatar
Guolin Ke committed
93
    output->gain *= meta_->penalty;
94
95
  }

96
97
98
  template <bool USE_RAND, bool USE_L1, bool USE_MAX_OUTPUT>
  double BeforeNumercal(double sum_gradient, double sum_hessian,
                        SplitInfo* output, int* rand_threshold) {
Guolin Ke's avatar
Guolin Ke committed
99
    is_splittable_ = false;
100
101
102
103
104
105
106
107
108
    output->monotone_type = meta_->monotone_type;
    double gain_shift = GetLeafGain<USE_L1, USE_MAX_OUTPUT>(
        sum_gradient, sum_hessian, meta_->config->lambda_l1,
        meta_->config->lambda_l2, meta_->config->max_delta_step);
    *rand_threshold = 0;
    if (USE_RAND) {
      if (meta_->num_bin - 2 > 0) {
        *rand_threshold = meta_->rand.NextInt(0, meta_->num_bin - 2);
      }
109
    }
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
    return gain_shift + meta_->config->min_gain_to_split;
  }

  /*!
   * \brief Dispatch to FuncForNumricalL1 with compile-time flags for
   *        extra-trees randomization and monotone constraints.
   */
  void FuncForNumrical() {
    const bool use_extra_trees = meta_->config->extra_trees;
    const bool use_mc = !meta_->config->monotone_constraints.empty();
    if (use_extra_trees) {
      if (use_mc) {
        FuncForNumricalL1<true, true>();
      } else {
        FuncForNumricalL1<true, false>();
      }
    } else {
      if (use_mc) {
        FuncForNumricalL1<false, true>();
      } else {
        FuncForNumricalL1<false, false>();
      }
    }
  }
  /*!
   * \brief Second dispatch stage: lift L1 regularization and max-delta-step
   *        usage into compile-time flags.
   */
  template <bool USE_RAND, bool USE_MC>
  void FuncForNumricalL1() {
    const bool use_l1 = meta_->config->lambda_l1 > 0;
    const bool use_max_output = meta_->config->max_delta_step > 0;
    if (use_l1) {
      if (use_max_output) {
        FuncForNumricalL2<USE_RAND, USE_MC, true, true>();
      } else {
        FuncForNumricalL2<USE_RAND, USE_MC, true, false>();
      }
    } else {
      if (use_max_output) {
        FuncForNumricalL2<USE_RAND, USE_MC, false, true>();
      } else {
        FuncForNumricalL2<USE_RAND, USE_MC, false, false>();
      }
    }
  }

  template <bool USE_RAND, bool USE_MC, bool USE_L1, bool USE_MAX_OUTPUT>
  void FuncForNumricalL2() {
147
148
149
150
151
152
153
154
155
156

#define TEMPLATE_PREFIX USE_RAND, USE_MC, USE_L1, USE_MAX_OUTPUT
#define LAMBDA_ARGUMENTS                                         \
  double sum_gradient, double sum_hessian, data_size_t num_data, \
      const ConstraintEntry &constraints, SplitInfo *output
#define BEFORE_ARGUMENTS sum_gradient, sum_hessian, output, &rand_threshold
#define FUNC_ARGUMENTS                                                      \
  sum_gradient, sum_hessian, num_data, constraints, min_gain_shift, output, \
      rand_threshold

Guolin Ke's avatar
Guolin Ke committed
157
158
    if (meta_->num_bin > 2 && meta_->missing_type != MissingType::None) {
      if (meta_->missing_type == MissingType::Zero) {
159
160
161
162
163
164
165
166
167
168
        find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS) {
          int rand_threshold = 0;
          double min_gain_shift =
              BeforeNumercal<USE_RAND, USE_L1, USE_MAX_OUTPUT>(
                  BEFORE_ARGUMENTS);
          FindBestThresholdSequentially<TEMPLATE_PREFIX, true, true, false>(
              FUNC_ARGUMENTS);
          FindBestThresholdSequentially<TEMPLATE_PREFIX, false, true, false>(
              FUNC_ARGUMENTS);
        };
Guolin Ke's avatar
Guolin Ke committed
169
      } else {
170
171
172
173
174
175
176
177
178
179
        find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS) {
          int rand_threshold = 0;
          double min_gain_shift =
              BeforeNumercal<USE_RAND, USE_L1, USE_MAX_OUTPUT>(
                  BEFORE_ARGUMENTS);
          FindBestThresholdSequentially<TEMPLATE_PREFIX, true, false, true>(
              FUNC_ARGUMENTS);
          FindBestThresholdSequentially<TEMPLATE_PREFIX, false, false, true>(
              FUNC_ARGUMENTS);
        };
Guolin Ke's avatar
Guolin Ke committed
180
      }
181
    } else {
182
      if (meta_->missing_type != MissingType::NaN) {
183
184
185
186
187
188
189
190
        find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS) {
          int rand_threshold = 0;
          double min_gain_shift =
              BeforeNumercal<USE_RAND, USE_L1, USE_MAX_OUTPUT>(
                  BEFORE_ARGUMENTS);
          FindBestThresholdSequentially<TEMPLATE_PREFIX, true, false, false>(
              FUNC_ARGUMENTS);
        };
191
      } else {
192
193
194
195
196
197
198
199
200
201
        find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS) {
          int rand_threshold = 0;
          double min_gain_shift =
              BeforeNumercal<USE_RAND, USE_L1, USE_MAX_OUTPUT>(
                  BEFORE_ARGUMENTS);
          FindBestThresholdSequentially<USE_RAND, USE_MC, USE_L1,
                                        USE_MAX_OUTPUT, true, false, false>(
              FUNC_ARGUMENTS);
          output->default_left = false;
        };
Guolin Ke's avatar
Guolin Ke committed
202
      }
Guolin Ke's avatar
Guolin Ke committed
203
    }
204
205
206
207
#undef TEMPLATE_PREFIX
#undef LAMBDA_ARGUMENTS
#undef BEFORE_ARGUMENTS
#undef FUNC_ARGURMENTS
Guolin Ke's avatar
Guolin Ke committed
208
  }
209

210
  /*!
   * \brief Dispatch to FuncForCategoricalL1 with compile-time flags for
   *        extra-trees randomization and monotone constraints.
   */
  void FuncForCategorical() {
    if (meta_->config->extra_trees) {
      if (meta_->config->monotone_constraints.empty()) {
        FuncForCategoricalL1<true, false>();
      } else {
        FuncForCategoricalL1<true, true>();
      }
    } else {
      if (meta_->config->monotone_constraints.empty()) {
        FuncForCategoricalL1<false, false>();
      } else {
        FuncForCategoricalL1<false, true>();
      }
    }
  }
225

226
227
  template <bool USE_RAND, bool USE_MC>
  void FuncForCategoricalL1() {
228
229
230
#define ARGUMENTS                                                      \
  std::placeholders::_1, std::placeholders::_2, std::placeholders::_3, \
      std::placeholders::_4, std::placeholders::_5
231
232
233
234
235
    if (meta_->config->lambda_l1 > 0) {
      if (meta_->config->max_delta_step > 0) {
        find_best_threshold_fun_ =
            std::bind(&FeatureHistogram::FindBestThresholdCategoricalInner<
                          USE_RAND, USE_MC, true, true>,
236
                      this, ARGUMENTS);
237
238
239
240
      } else {
        find_best_threshold_fun_ =
            std::bind(&FeatureHistogram::FindBestThresholdCategoricalInner<
                          USE_RAND, USE_MC, true, false>,
241
                      this, ARGUMENTS);
242
      }
243
    } else {
244
245
246
247
      if (meta_->config->max_delta_step > 0) {
        find_best_threshold_fun_ =
            std::bind(&FeatureHistogram::FindBestThresholdCategoricalInner<
                          USE_RAND, USE_MC, false, true>,
248
                      this, ARGUMENTS);
249
250
251
252
      } else {
        find_best_threshold_fun_ =
            std::bind(&FeatureHistogram::FindBestThresholdCategoricalInner<
                          USE_RAND, USE_MC, false, false>,
253
                      this, ARGUMENTS);
254
      }
255
    }
256
#undef ARGUMENTS
257
258
  }

259
260
261
262
263
264
265
  template <bool USE_RAND, bool USE_MC, bool USE_L1, bool USE_MAX_OUTPUT>
  void FindBestThresholdCategoricalInner(double sum_gradient,
                                         double sum_hessian,
                                         data_size_t num_data,
                                         const ConstraintEntry& constraints,
                                         SplitInfo* output) {
    is_splittable_ = false;
Guolin Ke's avatar
Guolin Ke committed
266
    output->default_left = false;
267
    double best_gain = kMinScore;
268
    data_size_t best_left_count = 0;
ChenZhiyong's avatar
ChenZhiyong committed
269
270
    double best_sum_left_gradient = 0;
    double best_sum_left_hessian = 0;
271
272
273
    double gain_shift = GetLeafGain<USE_L1, USE_MAX_OUTPUT>(
        sum_gradient, sum_hessian, meta_->config->lambda_l1,
        meta_->config->lambda_l2, meta_->config->max_delta_step);
274

Guolin Ke's avatar
Guolin Ke committed
275
    double min_gain_shift = gain_shift + meta_->config->min_gain_to_split;
ChenZhiyong's avatar
ChenZhiyong committed
276
    bool is_full_categorical = meta_->missing_type == MissingType::None;
277
    int used_bin = meta_->num_bin - 1 + is_full_categorical;
ChenZhiyong's avatar
ChenZhiyong committed
278

Guolin Ke's avatar
Guolin Ke committed
279
    std::vector<int> sorted_idx;
Guolin Ke's avatar
Guolin Ke committed
280
281
    double l2 = meta_->config->lambda_l2;
    bool use_onehot = meta_->num_bin <= meta_->config->max_cat_to_onehot;
282
283
    int best_threshold = -1;
    int best_dir = 1;
284
    const double cnt_factor = num_data / sum_hessian;
285
    int rand_threshold = 0;
286
    if (use_onehot) {
287
      if (USE_RAND) {
288
        if (used_bin > 0) {
289
          rand_threshold = meta_->rand.NextInt(0, used_bin);
290
291
        }
      }
292
      for (int t = 0; t < used_bin; ++t) {
293
294
        const auto grad = GET_GRAD(data_, t);
        const auto hess = GET_HESS(data_, t);
295
296
        data_size_t cnt =
            static_cast<data_size_t>(Common::RoundInt(hess * cnt_factor));
297
        // if data not enough, or sum hessian too small
298
        if (cnt < meta_->config->min_data_in_leaf ||
299
            hess < meta_->config->min_sum_hessian_in_leaf) {
300
          continue;
301
        }
302
        data_size_t other_count = num_data - cnt;
303
        // if data not enough
304
305
306
        if (other_count < meta_->config->min_data_in_leaf) {
          continue;
        }
ChenZhiyong's avatar
ChenZhiyong committed
307

308
        double sum_other_hessian = sum_hessian - hess - kEpsilon;
309
        // if sum hessian too small
310
        if (sum_other_hessian < meta_->config->min_sum_hessian_in_leaf) {
311
          continue;
312
        }
ChenZhiyong's avatar
ChenZhiyong committed
313

314
        double sum_other_gradient = sum_gradient - grad;
315
        if (USE_RAND) {
316
317
318
319
          if (t != rand_threshold) {
            continue;
          }
        }
320
        // current split gain
321
322
323
324
        double current_gain = GetSplitGains<USE_MC, USE_L1, USE_MAX_OUTPUT>(
            sum_other_gradient, sum_other_hessian, grad, hess + kEpsilon,
            meta_->config->lambda_l1, l2, meta_->config->max_delta_step,
            constraints, 0);
325
        // gain with split is worse than without split
326
327
328
        if (current_gain <= min_gain_shift) {
          continue;
        }
329
330

        // mark to is splittable
ChenZhiyong's avatar
ChenZhiyong committed
331
        is_splittable_ = true;
332
        // better split point
ChenZhiyong's avatar
ChenZhiyong committed
333
        if (current_gain > best_gain) {
334
          best_threshold = t;
335
336
337
          best_sum_left_gradient = grad;
          best_sum_left_hessian = hess + kEpsilon;
          best_left_count = cnt;
ChenZhiyong's avatar
ChenZhiyong committed
338
          best_gain = current_gain;
339
340
341
342
        }
      }
    } else {
      for (int i = 0; i < used_bin; ++i) {
343
344
        if (Common::RoundInt(GET_HESS(data_, i) * cnt_factor) >=
            meta_->config->cat_smooth) {
345
346
347
348
349
          sorted_idx.push_back(i);
        }
      }
      used_bin = static_cast<int>(sorted_idx.size());

Guolin Ke's avatar
Guolin Ke committed
350
      l2 += meta_->config->cat_l2;
351
352

      auto ctr_fun = [this](double sum_grad, double sum_hess) {
Guolin Ke's avatar
Guolin Ke committed
353
        return (sum_grad) / (sum_hess + meta_->config->cat_smooth);
354
355
      };
      std::sort(sorted_idx.begin(), sorted_idx.end(),
356
357
358
359
                [this, &ctr_fun](int i, int j) {
                  return ctr_fun(GET_GRAD(data_, i), GET_HESS(data_, i)) <
                         ctr_fun(GET_GRAD(data_, j), GET_HESS(data_, j));
                });
360
361
362
363
364

      std::vector<int> find_direction(1, 1);
      std::vector<int> start_position(1, 0);
      find_direction.push_back(-1);
      start_position.push_back(used_bin - 1);
365
366
      const int max_num_cat =
          std::min(meta_->config->max_cat_threshold, (used_bin + 1) / 2);
367
      int max_threshold = std::max(std::min(max_num_cat, used_bin) - 1, 0);
368
      if (USE_RAND) {
369
        if (max_threshold > 0) {
370
          rand_threshold = meta_->rand.NextInt(0, max_threshold);
371
        }
372
      }
373

374
375
376
377
      is_splittable_ = false;
      for (size_t out_i = 0; out_i < find_direction.size(); ++out_i) {
        auto dir = find_direction[out_i];
        auto start_pos = start_position[out_i];
Guolin Ke's avatar
Guolin Ke committed
378
        data_size_t min_data_per_group = meta_->config->min_data_per_group;
379
380
381
382
383
384
385
        data_size_t cnt_cur_group = 0;
        double sum_left_gradient = 0.0f;
        double sum_left_hessian = kEpsilon;
        data_size_t left_count = 0;
        for (int i = 0; i < used_bin && i < max_num_cat; ++i) {
          auto t = sorted_idx[start_pos];
          start_pos += dir;
386
387
          const auto grad = GET_GRAD(data_, t);
          const auto hess = GET_HESS(data_, t);
388
389
          data_size_t cnt =
              static_cast<data_size_t>(Common::RoundInt(hess * cnt_factor));
390

391
392
393
394
          sum_left_gradient += grad;
          sum_left_hessian += hess;
          left_count += cnt;
          cnt_cur_group += cnt;
395

396
          if (left_count < meta_->config->min_data_in_leaf ||
397
              sum_left_hessian < meta_->config->min_sum_hessian_in_leaf) {
398
            continue;
399
          }
400
          data_size_t right_count = num_data - left_count;
401
          if (right_count < meta_->config->min_data_in_leaf ||
402
              right_count < min_data_per_group) {
403
            break;
404
          }
405
406

          double sum_right_hessian = sum_hessian - sum_left_hessian;
407
408
409
          if (sum_right_hessian < meta_->config->min_sum_hessian_in_leaf) {
            break;
          }
410

411
412
413
          if (cnt_cur_group < min_data_per_group) {
            continue;
          }
414
415
416
417

          cnt_cur_group = 0;

          double sum_right_gradient = sum_gradient - sum_left_gradient;
418
          if (USE_RAND) {
419
420
            if (i != rand_threshold) {
              continue;
421
            }
422
          }
423
424
425
426
          double current_gain = GetSplitGains<USE_MC, USE_L1, USE_MAX_OUTPUT>(
              sum_left_gradient, sum_left_hessian, sum_right_gradient,
              sum_right_hessian, meta_->config->lambda_l1, l2,
              meta_->config->max_delta_step, constraints, 0);
427
428
429
          if (current_gain <= min_gain_shift) {
            continue;
          }
430
431
432
433
434
435
436
437
438
          is_splittable_ = true;
          if (current_gain > best_gain) {
            best_left_count = left_count;
            best_sum_left_gradient = sum_left_gradient;
            best_sum_left_hessian = sum_left_hessian;
            best_threshold = i;
            best_gain = current_gain;
            best_dir = dir;
          }
ChenZhiyong's avatar
ChenZhiyong committed
439
        }
440
441
      }
    }
442

443
    if (is_splittable_) {
444
445
446
447
448
      output->left_output =
          CalculateSplittedLeafOutput<USE_MC, USE_L1, USE_MAX_OUTPUT>(
              best_sum_left_gradient, best_sum_left_hessian,
              meta_->config->lambda_l1, l2, meta_->config->max_delta_step,
              constraints);
449
450
451
      output->left_count = best_left_count;
      output->left_sum_gradient = best_sum_left_gradient;
      output->left_sum_hessian = best_sum_left_hessian - kEpsilon;
452
453
454
455
456
      output->right_output =
          CalculateSplittedLeafOutput<USE_MC, USE_L1, USE_MAX_OUTPUT>(
              sum_gradient - best_sum_left_gradient,
              sum_hessian - best_sum_left_hessian, meta_->config->lambda_l1, l2,
              meta_->config->max_delta_step, constraints);
457
458
      output->right_count = num_data - best_left_count;
      output->right_sum_gradient = sum_gradient - best_sum_left_gradient;
459
460
      output->right_sum_hessian =
          sum_hessian - best_sum_left_hessian - kEpsilon;
Guolin Ke's avatar
Guolin Ke committed
461
      output->gain = best_gain - min_gain_shift;
462
463
      if (use_onehot) {
        output->num_cat_threshold = 1;
464
465
        output->cat_threshold =
            std::vector<uint32_t>(1, static_cast<uint32_t>(best_threshold));
ChenZhiyong's avatar
ChenZhiyong committed
466
      } else {
467
        output->num_cat_threshold = best_threshold + 1;
468
469
        output->cat_threshold =
            std::vector<uint32_t>(output->num_cat_threshold);
470
471
472
473
474
475
476
477
478
479
        if (best_dir == 1) {
          for (int i = 0; i < output->num_cat_threshold; ++i) {
            auto t = sorted_idx[i];
            output->cat_threshold[i] = t;
          }
        } else {
          for (int i = 0; i < output->num_cat_threshold; ++i) {
            auto t = sorted_idx[used_bin - 1 - i];
            output->cat_threshold[i] = t;
          }
ChenZhiyong's avatar
ChenZhiyong committed
480
481
        }
      }
Guolin Ke's avatar
Guolin Ke committed
482
      output->monotone_type = 0;
483
    }
484
485
  }

486
  void GatherInfoForThreshold(double sum_gradient, double sum_hessian,
487
488
                              uint32_t threshold, data_size_t num_data,
                              SplitInfo* output) {
489
    if (meta_->bin_type == BinType::NumericalBin) {
490
491
      GatherInfoForThresholdNumerical(sum_gradient, sum_hessian, threshold,
                                      num_data, output);
492
    } else {
493
494
      GatherInfoForThresholdCategorical(sum_gradient, sum_hessian, threshold,
                                        num_data, output);
495
496
497
498
    }
  }

  void GatherInfoForThresholdNumerical(double sum_gradient, double sum_hessian,
499
500
501
502
503
                                       uint32_t threshold, data_size_t num_data,
                                       SplitInfo* output) {
    double gain_shift = GetLeafGain<true, true>(
        sum_gradient, sum_hessian, meta_->config->lambda_l1,
        meta_->config->lambda_l2, meta_->config->max_delta_step);
Guolin Ke's avatar
Guolin Ke committed
504
    double min_gain_shift = gain_shift + meta_->config->min_gain_to_split;
505
506

    // do stuff here
507
    const int8_t offset = meta_->offset;
508
509
510
511
512
513

    double sum_right_gradient = 0.0f;
    double sum_right_hessian = kEpsilon;
    data_size_t right_count = 0;

    // set values
514
515
    bool use_na_as_missing = false;
    bool skip_default_bin = false;
516
517
    if (meta_->missing_type == MissingType::Zero) {
      skip_default_bin = true;
518
    } else if (meta_->missing_type == MissingType::NaN) {
519
520
521
      use_na_as_missing = true;
    }

522
523
    int t = meta_->num_bin - 1 - offset - use_na_as_missing;
    const int t_end = 1 - offset;
524
    const double cnt_factor = num_data / sum_hessian;
525
526
    // from right to left, and we don't need data in bin0
    for (; t >= t_end; --t) {
527
528
529
      if (static_cast<uint32_t>(t + offset) < threshold) {
        break;
      }
530
531

      // need to skip default bin
532
533
534
535
      if (skip_default_bin &&
          (t + offset) == static_cast<int>(meta_->default_bin)) {
        continue;
      }
536
537
      const auto grad = GET_GRAD(data_, t);
      const auto hess = GET_HESS(data_, t);
538
539
      data_size_t cnt =
          static_cast<data_size_t>(Common::RoundInt(hess * cnt_factor));
540
541
542
      sum_right_gradient += grad;
      sum_right_hessian += hess;
      right_count += cnt;
543
544
545
546
    }
    double sum_left_gradient = sum_gradient - sum_right_gradient;
    double sum_left_hessian = sum_hessian - sum_right_hessian;
    data_size_t left_count = num_data - right_count;
547
548
549
550
551
552
553
    double current_gain =
        GetLeafGain<true, true>(
            sum_left_gradient, sum_left_hessian, meta_->config->lambda_l1,
            meta_->config->lambda_l2, meta_->config->max_delta_step) +
        GetLeafGain<true, true>(
            sum_right_gradient, sum_right_hessian, meta_->config->lambda_l1,
            meta_->config->lambda_l2, meta_->config->max_delta_step);
554
555
556
557

    // gain with split is worse than without split
    if (std::isnan(current_gain) || current_gain <= min_gain_shift) {
      output->gain = kMinScore;
558
      Log::Warning(
559
          "'Forced Split' will be ignored since the gain getting worse.");
560
      return;
561
    }
562
563
564

    // update split information
    output->threshold = threshold;
565
566
567
    output->left_output = CalculateSplittedLeafOutput<true, true>(
        sum_left_gradient, sum_left_hessian, meta_->config->lambda_l1,
        meta_->config->lambda_l2, meta_->config->max_delta_step);
568
569
570
    output->left_count = left_count;
    output->left_sum_gradient = sum_left_gradient;
    output->left_sum_hessian = sum_left_hessian - kEpsilon;
571
572
573
574
    output->right_output = CalculateSplittedLeafOutput<true, true>(
        sum_gradient - sum_left_gradient, sum_hessian - sum_left_hessian,
        meta_->config->lambda_l1, meta_->config->lambda_l2,
        meta_->config->max_delta_step);
575
576
577
    output->right_count = num_data - left_count;
    output->right_sum_gradient = sum_gradient - sum_left_gradient;
    output->right_sum_hessian = sum_hessian - sum_left_hessian - kEpsilon;
578
    output->gain = current_gain - min_gain_shift;
579
580
581
    output->default_left = true;
  }

582
583
584
585
  void GatherInfoForThresholdCategorical(double sum_gradient,
                                         double sum_hessian, uint32_t threshold,
                                         data_size_t num_data,
                                         SplitInfo* output) {
586
587
    // get SplitInfo for a given one-hot categorical split.
    output->default_left = false;
588
589
590
    double gain_shift = GetLeafGain<true, true>(
        sum_gradient, sum_hessian, meta_->config->lambda_l1,
        meta_->config->lambda_l2, meta_->config->max_delta_step);
Guolin Ke's avatar
Guolin Ke committed
591
    double min_gain_shift = gain_shift + meta_->config->min_gain_to_split;
592
593
594
595
596
597
598
    bool is_full_categorical = meta_->missing_type == MissingType::None;
    int used_bin = meta_->num_bin - 1 + is_full_categorical;
    if (threshold >= static_cast<uint32_t>(used_bin)) {
      output->gain = kMinScore;
      Log::Warning("Invalid categorical threshold split");
      return;
    }
599
600
601
    const double cnt_factor = num_data / sum_hessian;
    const auto grad = GET_GRAD(data_, threshold);
    const auto hess = GET_HESS(data_, threshold);
602
603
    data_size_t cnt =
        static_cast<data_size_t>(Common::RoundInt(hess * cnt_factor));
604

Guolin Ke's avatar
Guolin Ke committed
605
    double l2 = meta_->config->lambda_l2;
606
    data_size_t left_count = cnt;
607
    data_size_t right_count = num_data - left_count;
608
    double sum_left_hessian = hess + kEpsilon;
609
    double sum_right_hessian = sum_hessian - sum_left_hessian;
610
    double sum_left_gradient = grad;
611
612
    double sum_right_gradient = sum_gradient - sum_left_gradient;
    // current split gain
613
614
615
616
617
618
619
    double current_gain =
        GetLeafGain<true, true>(sum_right_gradient, sum_right_hessian,
                                meta_->config->lambda_l1, l2,
                                meta_->config->max_delta_step) +
        GetLeafGain<true, true>(sum_left_gradient, sum_left_hessian,
                                meta_->config->lambda_l1, l2,
                                meta_->config->max_delta_step);
620
621
    if (std::isnan(current_gain) || current_gain <= min_gain_shift) {
      output->gain = kMinScore;
622
623
      Log::Warning(
          "'Forced Split' will be ignored since the gain getting worse.");
624
625
626
      return;
    }

627
628
629
    output->left_output = CalculateSplittedLeafOutput<true, true>(
        sum_left_gradient, sum_left_hessian, meta_->config->lambda_l1, l2,
        meta_->config->max_delta_step);
630
631
632
    output->left_count = left_count;
    output->left_sum_gradient = sum_left_gradient;
    output->left_sum_hessian = sum_left_hessian - kEpsilon;
633
634
635
    output->right_output = CalculateSplittedLeafOutput<true, true>(
        sum_right_gradient, sum_right_hessian, meta_->config->lambda_l1, l2,
        meta_->config->max_delta_step);
636
637
638
639
640
641
642
643
    output->right_count = right_count;
    output->right_sum_gradient = sum_gradient - sum_left_gradient;
    output->right_sum_hessian = sum_right_hessian - kEpsilon;
    output->gain = current_gain - min_gain_shift;
    output->num_cat_threshold = 1;
    output->cat_threshold = std::vector<uint32_t>(1, threshold);
  }

Guolin Ke's avatar
Guolin Ke committed
644
  /*!
645
646
   * \brief Binary size of this histogram
   */
Guolin Ke's avatar
Guolin Ke committed
647
  int SizeOfHistgram() const {
648
    return (meta_->num_bin - meta_->offset) * kHistEntrySize;
Guolin Ke's avatar
Guolin Ke committed
649
650
651
  }

  /*!
652
653
   * \brief Restore histogram from memory
   */
Guolin Ke's avatar
Guolin Ke committed
654
  void FromMemory(char* memory_data) {
655
656
    std::memcpy(data_, memory_data,
                (meta_->num_bin - meta_->offset) * kHistEntrySize);
Guolin Ke's avatar
Guolin Ke committed
657
658
659
  }

  /*!
660
661
   * \brief True if this histogram can be splitted
   */
Guolin Ke's avatar
Guolin Ke committed
662
663
664
  bool is_splittable() { return is_splittable_; }

  /*!
665
666
   * \brief Set splittable to this histogram
   */
Guolin Ke's avatar
Guolin Ke committed
667
668
  void set_is_splittable(bool val) { is_splittable_ = val; }

669
670
671
672
673
  /*! \brief L1 soft-thresholding: shrink |s| by l1 (floored at zero),
   *         preserving the sign of s. */
  static double ThresholdL1(double s, double l1) {
    double magnitude = std::fabs(s) - l1;
    if (magnitude < 0.0) {
      magnitude = 0.0;
    }
    return Common::Sign(s) * magnitude;
  }

674
675
676
677
678
679
680
681
682
683
684
  /*!
   * \brief Optimal leaf output -g / (h + l2), optionally with L1
   *        soft-thresholding on the gradient sum and clipping at
   *        max_delta_step.
   */
  template <bool USE_L1, bool USE_MAX_OUTPUT>
  static double CalculateSplittedLeafOutput(double sum_gradients,
                                            double sum_hessians, double l1,
                                            double l2, double max_delta_step) {
    if (USE_L1) {
      double ret = -ThresholdL1(sum_gradients, l1) / (sum_hessians + l2);
      if (USE_MAX_OUTPUT) {
        if (max_delta_step > 0 && std::fabs(ret) > max_delta_step) {
          return Common::Sign(ret) * max_delta_step;
        }
      }
      return ret;
    } else {
      double ret = -sum_gradients / (sum_hessians + l2);
      if (USE_MAX_OUTPUT) {
        if (max_delta_step > 0 && std::fabs(ret) > max_delta_step) {
          return Common::Sign(ret) * max_delta_step;
        }
      }
      return ret;
    }
  }

697
698
699
700
701
702
703
704
705
706
707
708
709
710
  /*!
   * \brief Leaf output as above, additionally clamped to the monotone
   *        constraint interval [constraints.min, constraints.max] when
   *        USE_MC is set.
   */
  template <bool USE_MC, bool USE_L1, bool USE_MAX_OUTPUT>
  static double CalculateSplittedLeafOutput(
      double sum_gradients, double sum_hessians, double l1, double l2,
      double max_delta_step, const ConstraintEntry& constraints) {
    double ret = CalculateSplittedLeafOutput<USE_L1, USE_MAX_OUTPUT>(
        sum_gradients, sum_hessians, l1, l2, max_delta_step);
    if (USE_MC) {
      if (ret < constraints.min) {
        ret = constraints.min;
      } else if (ret > constraints.max) {
        ret = constraints.max;
      }
    }
    return ret;
  }

713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
 private:
  /*!
   * \brief Total gain of a candidate split (left leaf gain + right leaf gain).
   *        With monotone constraints, outputs are clamped first and a split
   *        violating the constraint direction scores 0.
   */
  template <bool USE_MC, bool USE_L1, bool USE_MAX_OUTPUT>
  static double GetSplitGains(double sum_left_gradients,
                              double sum_left_hessians,
                              double sum_right_gradients,
                              double sum_right_hessians, double l1, double l2,
                              double max_delta_step,
                              const ConstraintEntry& constraints,
                              int8_t monotone_constraint) {
    if (!USE_MC) {
      return GetLeafGain<USE_L1, USE_MAX_OUTPUT>(sum_left_gradients,
                                                 sum_left_hessians, l1, l2,
                                                 max_delta_step) +
             GetLeafGain<USE_L1, USE_MAX_OUTPUT>(sum_right_gradients,
                                                 sum_right_hessians, l1, l2,
                                                 max_delta_step);
    } else {
      double left_output =
          CalculateSplittedLeafOutput<USE_MC, USE_L1, USE_MAX_OUTPUT>(
              sum_left_gradients, sum_left_hessians, l1, l2, max_delta_step,
              constraints);
      double right_output =
          CalculateSplittedLeafOutput<USE_MC, USE_L1, USE_MAX_OUTPUT>(
              sum_right_gradients, sum_right_hessians, l1, l2, max_delta_step,
              constraints);
      // a split whose leaf ordering violates the monotone direction is useless
      if (((monotone_constraint > 0) && (left_output > right_output)) ||
          ((monotone_constraint < 0) && (left_output < right_output))) {
        return 0;
      }
      return GetLeafGainGivenOutput<USE_L1>(
                 sum_left_gradients, sum_left_hessians, l1, l2, left_output) +
             GetLeafGainGivenOutput<USE_L1>(
                 sum_right_gradients, sum_right_hessians, l1, l2, right_output);
    }
  }
  /*!
   * \brief Gain of a single leaf. Without an output cap this is the
   *        closed form g^2 / (h + l2) (with g optionally L1-thresholded);
   *        with a cap the output is computed first and the gain evaluated
   *        at that (possibly clamped) output.
   * \tparam USE_L1 apply L1 regularization
   * \tparam USE_MAX_OUTPUT clamp |output| to max_delta_step when it is > 0
   * \return leaf gain
   */
  template <bool USE_L1, bool USE_MAX_OUTPUT>
  static double GetLeafGain(double sum_gradients, double sum_hessians,
                            double l1, double l2, double max_delta_step) {
    if (USE_MAX_OUTPUT) {
      // output may be clamped, so the closed form does not apply
      const double capped_output =
          CalculateSplittedLeafOutput<USE_L1, USE_MAX_OUTPUT>(
              sum_gradients, sum_hessians, l1, l2, max_delta_step);
      return GetLeafGainGivenOutput<USE_L1>(sum_gradients, sum_hessians, l1,
                                            l2, capped_output);
    }
    const double numerator =
        USE_L1 ? ThresholdL1(sum_gradients, l1) : sum_gradients;
    return (numerator * numerator) / (sum_hessians + l2);
  }

  /*!
   * \brief Gain of a leaf evaluated at a fixed output value:
   *        -(2*g*w + (h + l2)*w^2), with g optionally L1-thresholded.
   *        Used when the output was clipped (monotone constraints or
   *        max_delta_step), so the unconstrained closed form is invalid.
   * \tparam USE_L1 apply L1 regularization
   * \param output the (possibly clipped) leaf output value w
   * \return leaf gain at the given output
   */
  template <bool USE_L1>
  static double GetLeafGainGivenOutput(double sum_gradients,
                                       double sum_hessians, double l1,
                                       double l2, double output) {
    const double grad =
        USE_L1 ? ThresholdL1(sum_gradients, l1) : sum_gradients;
    return -(2.0 * grad * output + (sum_hessians + l2) * output * output);
  }
  /*!
   * \brief Scan this feature's histogram once to find the best numerical
   *        split threshold, writing the winner (if any improves on
   *        output->gain) into *output.
   * \tparam USE_RAND extremely-randomized mode: only the pre-drawn
   *         rand_threshold bin is considered
   * \tparam USE_MC apply monotone constraints to leaf outputs
   * \tparam USE_L1 apply L1 regularization
   * \tparam USE_MAX_OUTPUT cap |leaf output| by max_delta_step
   * \tparam REVERSE scan bins right-to-left (accumulating the right leaf)
   *         instead of left-to-right (accumulating the left leaf)
   * \tparam SKIP_DEFAULT_BIN skip the feature's default bin during the scan
   * \tparam NA_AS_MISSING treat NA as missing (affects scan bounds and the
   *         left-to-right initialization below)
   * \param sum_gradient total gradient sum of the leaf being split
   * \param sum_hessian total hessian sum of the leaf being split
   * \param num_data number of data points in the leaf
   * \param constraints allowed output range under monotone constraints
   * \param min_gain_shift gain a split must exceed to be worthwhile
   * \param output best split found so far; updated in place when beaten
   * \param rand_threshold the single bin to evaluate when USE_RAND
   */
  template <bool USE_RAND, bool USE_MC, bool USE_L1, bool USE_MAX_OUTPUT,
            bool REVERSE, bool SKIP_DEFAULT_BIN, bool NA_AS_MISSING>
  void FindBestThresholdSequentially(double sum_gradient, double sum_hessian,
                                     data_size_t num_data,
                                     const ConstraintEntry& constraints,
                                     double min_gain_shift, SplitInfo* output,
                                     int rand_threshold) {
    // offset is 1 when bin 0 (most frequent bin) is not stored in data_
    const int8_t offset = meta_->offset;
    double best_sum_left_gradient = NAN;
    double best_sum_left_hessian = NAN;
    double best_gain = kMinScore;
    data_size_t best_left_count = 0;
    uint32_t best_threshold = static_cast<uint32_t>(meta_->num_bin);
    // approximate per-bin data counts from hessians:
    // count ~= hessian * (num_data / sum_hessian)
    const double cnt_factor = num_data / sum_hessian;
    if (REVERSE) {
      // accumulate the right leaf while moving the threshold leftwards
      double sum_right_gradient = 0.0f;
      double sum_right_hessian = kEpsilon;
      data_size_t right_count = 0;

      int t = meta_->num_bin - 1 - offset - NA_AS_MISSING;
      const int t_end = 1 - offset;

      // from right to left, and we don't need data in bin0
      for (; t >= t_end; --t) {
        // need to skip default bin
        if (SKIP_DEFAULT_BIN) {
          if ((t + offset) == static_cast<int>(meta_->default_bin)) {
            continue;
          }
        }
        // GET_GRAD/GET_HESS read the accumulated gradient/hessian of bin t
        // from the interleaved histogram buffer
        const auto grad = GET_GRAD(data_, t);
        const auto hess = GET_HESS(data_, t);
        data_size_t cnt =
            static_cast<data_size_t>(Common::RoundInt(hess * cnt_factor));
        sum_right_gradient += grad;
        sum_right_hessian += hess;
        right_count += cnt;
        // if data not enough, or sum hessian too small
        if (right_count < meta_->config->min_data_in_leaf ||
            sum_right_hessian < meta_->config->min_sum_hessian_in_leaf) {
          continue;
        }
        data_size_t left_count = num_data - right_count;
        // if data not enough; the left side only shrinks from here, so stop
        if (left_count < meta_->config->min_data_in_leaf) {
          break;
        }

        double sum_left_hessian = sum_hessian - sum_right_hessian;
        // if sum hessian too small
        if (sum_left_hessian < meta_->config->min_sum_hessian_in_leaf) {
          break;
        }

        double sum_left_gradient = sum_gradient - sum_right_gradient;
        if (USE_RAND) {
          // extra-trees: evaluate only the pre-drawn random threshold
          if (t - 1 + offset != rand_threshold) {
            continue;
          }
        }
        // current split gain
        double current_gain = GetSplitGains<USE_MC, USE_L1, USE_MAX_OUTPUT>(
            sum_left_gradient, sum_left_hessian, sum_right_gradient,
            sum_right_hessian, meta_->config->lambda_l1,
            meta_->config->lambda_l2, meta_->config->max_delta_step,
            constraints, meta_->monotone_type);
        // gain with split is worse than without split
        if (current_gain <= min_gain_shift) {
          continue;
        }

        // mark to is splittable
        is_splittable_ = true;
        // better split point
        if (current_gain > best_gain) {
          best_left_count = left_count;
          best_sum_left_gradient = sum_left_gradient;
          best_sum_left_hessian = sum_left_hessian;
          // left is <= threshold, right is > threshold.  so this is t-1
          best_threshold = static_cast<uint32_t>(t - 1 + offset);
          best_gain = current_gain;
        }
      }
    } else {
      // accumulate the left leaf while moving the threshold rightwards
      double sum_left_gradient = 0.0f;
      double sum_left_hessian = kEpsilon;
      data_size_t left_count = 0;

      int t = 0;
      const int t_end = meta_->num_bin - 2 - offset;

      if (NA_AS_MISSING) {
        if (offset == 1) {
          // bin 0 is not stored: derive its statistics by subtracting all
          // stored bins from the leaf totals, and start the scan at t = -1
          // so the first threshold puts only bin 0 on the left
          sum_left_gradient = sum_gradient;
          sum_left_hessian = sum_hessian - kEpsilon;
          left_count = num_data;
          for (int i = 0; i < meta_->num_bin - offset; ++i) {
            const auto grad = GET_GRAD(data_, i);
            const auto hess = GET_HESS(data_, i);
            data_size_t cnt =
                static_cast<data_size_t>(Common::RoundInt(hess * cnt_factor));
            sum_left_gradient -= grad;
            sum_left_hessian -= hess;
            left_count -= cnt;
          }
          t = -1;
        }
      }

      for (; t <= t_end; ++t) {
        // need to skip default bin
        if (SKIP_DEFAULT_BIN) {
          if ((t + offset) == static_cast<int>(meta_->default_bin)) {
            continue;
          }
        }
        if (t >= 0) {
          sum_left_gradient += GET_GRAD(data_, t);
          sum_left_hessian += GET_HESS(data_, t);
          left_count += static_cast<data_size_t>(
              Common::RoundInt(GET_HESS(data_, t) * cnt_factor));
        }
        // if data not enough, or sum hessian too small
        if (left_count < meta_->config->min_data_in_leaf ||
            sum_left_hessian < meta_->config->min_sum_hessian_in_leaf) {
          continue;
        }
        data_size_t right_count = num_data - left_count;
        // if data not enough; the right side only shrinks from here, so stop
        if (right_count < meta_->config->min_data_in_leaf) {
          break;
        }

        double sum_right_hessian = sum_hessian - sum_left_hessian;
        // if sum hessian too small
        if (sum_right_hessian < meta_->config->min_sum_hessian_in_leaf) {
          break;
        }

        double sum_right_gradient = sum_gradient - sum_left_gradient;
        if (USE_RAND) {
          // extra-trees: evaluate only the pre-drawn random threshold
          if (t + offset != rand_threshold) {
            continue;
          }
        }
        // current split gain
        double current_gain = GetSplitGains<USE_MC, USE_L1, USE_MAX_OUTPUT>(
            sum_left_gradient, sum_left_hessian, sum_right_gradient,
            sum_right_hessian, meta_->config->lambda_l1,
            meta_->config->lambda_l2, meta_->config->max_delta_step,
            constraints, meta_->monotone_type);
        // gain with split is worse than without split
        if (current_gain <= min_gain_shift) {
          continue;
        }

        // mark to is splittable
        is_splittable_ = true;
        // better split point
        if (current_gain > best_gain) {
          best_left_count = left_count;
          best_sum_left_gradient = sum_left_gradient;
          best_sum_left_hessian = sum_left_hessian;
          best_threshold = static_cast<uint32_t>(t + offset);
          best_gain = current_gain;
        }
      }
    }

    if (is_splittable_ && best_gain > output->gain + min_gain_shift) {
      // update split information
      output->threshold = best_threshold;
      output->left_output =
          CalculateSplittedLeafOutput<USE_MC, USE_L1, USE_MAX_OUTPUT>(
              best_sum_left_gradient, best_sum_left_hessian,
              meta_->config->lambda_l1, meta_->config->lambda_l2,
              meta_->config->max_delta_step, constraints);
      output->left_count = best_left_count;
      output->left_sum_gradient = best_sum_left_gradient;
      // remove the kEpsilon that seeded the accumulator above
      output->left_sum_hessian = best_sum_left_hessian - kEpsilon;
      output->right_output =
          CalculateSplittedLeafOutput<USE_MC, USE_L1, USE_MAX_OUTPUT>(
              sum_gradient - best_sum_left_gradient,
              sum_hessian - best_sum_left_hessian, meta_->config->lambda_l1,
              meta_->config->lambda_l2, meta_->config->max_delta_step,
              constraints);
      output->right_count = num_data - best_left_count;
      output->right_sum_gradient = sum_gradient - best_sum_left_gradient;
      output->right_sum_hessian =
          sum_hessian - best_sum_left_hessian - kEpsilon;
      output->gain = best_gain - min_gain_shift;
      // REVERSE scans accumulate the right leaf, so missing/default values
      // fall to the left side
      output->default_left = REVERSE;
    }
  }

  /*! \brief meta information of this feature (not owned; shared across
   *  histograms of the same feature) */
  const FeatureMetainfo* meta_;
  /*! \brief sum of gradient of each bin */
  hist_t* data_;
  /*! \brief false once a scan proves no valid split exists for this feature
   *  in the current leaf; checked before accepting a best split */
  bool is_splittable_ = true;

  /*! \brief dispatch target bound to the template instantiation of the
   *  threshold search that matches this feature's configuration
   *  (signature: sum_gradient, sum_hessian, num_data, constraints, output) */
  std::function<void(double, double, data_size_t, const ConstraintEntry&,
                     SplitInfo*)>
      find_best_threshold_fun_;
};
class HistogramPool {
985
 public:
Guolin Ke's avatar
Guolin Ke committed
986
  /*!
987
988
   * \brief Constructor
   */
Guolin Ke's avatar
Guolin Ke committed
989
  HistogramPool() {
Guolin Ke's avatar
Guolin Ke committed
990
991
    cache_size_ = 0;
    total_size_ = 0;
Guolin Ke's avatar
Guolin Ke committed
992
  }
993

Guolin Ke's avatar
Guolin Ke committed
994
  /*!
995
996
997
   * \brief Destructor
   */
  ~HistogramPool() {}
998

Guolin Ke's avatar
Guolin Ke committed
999
  /*!
1000
1001
1002
1003
   * \brief Reset pool size
   * \param cache_size Max cache size
   * \param total_size Total size will be used
   */
Guolin Ke's avatar
Guolin Ke committed
1004
  void Reset(int cache_size, int total_size) {
Guolin Ke's avatar
Guolin Ke committed
1005
1006
    cache_size_ = cache_size;
    // at least need 2 bucket to store smaller leaf and larger leaf
1007
    CHECK_GE(cache_size_, 2);
Guolin Ke's avatar
Guolin Ke committed
1008
1009
1010
1011
1012
1013
    total_size_ = total_size;
    if (cache_size_ > total_size_) {
      cache_size_ = total_size_;
    }
    is_enough_ = (cache_size_ == total_size_);
    if (!is_enough_) {
1014
1015
1016
      mapper_.resize(total_size_);
      inverse_mapper_.resize(cache_size_);
      last_used_time_.resize(cache_size_);
Guolin Ke's avatar
Guolin Ke committed
1017
1018
1019
      ResetMap();
    }
  }
1020

Guolin Ke's avatar
Guolin Ke committed
1021
  /*!
1022
1023
   * \brief Reset mapper
   */
Guolin Ke's avatar
Guolin Ke committed
1024
1025
1026
1027
1028
1029
1030
1031
  void ResetMap() {
    if (!is_enough_) {
      cur_time_ = 0;
      std::fill(mapper_.begin(), mapper_.end(), -1);
      std::fill(inverse_mapper_.begin(), inverse_mapper_.end(), -1);
      std::fill(last_used_time_.begin(), last_used_time_.end(), 0);
    }
  }
1032
1033
1034
  template <bool USE_DATA, bool USE_CONFIG>
  static void SetFeatureInfo(const Dataset* train_data, const Config* config,
                             std::vector<FeatureMetainfo>* feature_meta) {
1035
1036
1037
    auto& ref_feature_meta = *feature_meta;
    const int num_feature = train_data->num_features();
    ref_feature_meta.resize(num_feature);
1038
#pragma omp parallel for schedule(static, 512) if (num_feature >= 1024)
1039
    for (int i = 0; i < num_feature; ++i) {
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
      if (USE_DATA) {
        ref_feature_meta[i].num_bin = train_data->FeatureNumBin(i);
        ref_feature_meta[i].default_bin =
            train_data->FeatureBinMapper(i)->GetDefaultBin();
        ref_feature_meta[i].missing_type =
            train_data->FeatureBinMapper(i)->missing_type();
        if (train_data->FeatureBinMapper(i)->GetMostFreqBin() == 0) {
          ref_feature_meta[i].offset = 1;
        } else {
          ref_feature_meta[i].offset = 0;
        }
        ref_feature_meta[i].bin_type =
            train_data->FeatureBinMapper(i)->bin_type();
1053
      }
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
      if (USE_CONFIG) {
        const int real_fidx = train_data->RealFeatureIndex(i);
        if (!config->monotone_constraints.empty()) {
          ref_feature_meta[i].monotone_type =
              config->monotone_constraints[real_fidx];
        } else {
          ref_feature_meta[i].monotone_type = 0;
        }
        if (!config->feature_contri.empty()) {
          ref_feature_meta[i].penalty = config->feature_contri[real_fidx];
        } else {
          ref_feature_meta[i].penalty = 1.0;
        }
        ref_feature_meta[i].rand = Random(config->extra_seed + i);
1068
1069
1070
1071
1072
      }
      ref_feature_meta[i].config = config;
    }
  }

1073
1074
  void DynamicChangeSize(const Dataset* train_data, bool is_hist_colwise,
                         const Config* config, int cache_size, int total_size) {
Guolin Ke's avatar
Guolin Ke committed
1075
    if (feature_metas_.empty()) {
1076
      SetFeatureInfo<true, true>(train_data, config, &feature_metas_);
1077
      uint64_t bin_cnt_over_features = 0;
1078
      for (int i = 0; i < train_data->num_features(); ++i) {
1079
1080
        bin_cnt_over_features +=
            static_cast<uint64_t>(feature_metas_[i].num_bin);
Guolin Ke's avatar
Guolin Ke committed
1081
      }
1082
      Log::Info("Total Bins %d", bin_cnt_over_features);
Guolin Ke's avatar
Guolin Ke committed
1083
    }
Guolin Ke's avatar
Guolin Ke committed
1084
    int old_cache_size = static_cast<int>(pool_.size());
Guolin Ke's avatar
Guolin Ke committed
1085
    Reset(cache_size, total_size);
Guolin Ke's avatar
Guolin Ke committed
1086
1087
1088
1089
1090

    if (cache_size > old_cache_size) {
      pool_.resize(cache_size);
      data_.resize(cache_size);
    }
1091
    int num_total_bin = static_cast<int>(train_data->NumTotalBin());
Guolin Ke's avatar
Guolin Ke committed
1092

1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
    std::vector<int> offsets;
    if (is_hist_colwise) {
      int offset = 0;
      for (int j = 0; j < train_data->num_features(); ++j) {
        offset += train_data->SubFeatureBinOffset(j);
        offsets.push_back(offset);
        auto num_bin = train_data->FeatureNumBin(j);
        if (train_data->FeatureBinMapper(j)->GetMostFreqBin() == 0) {
          num_bin -= 1;
        }
        offset += num_bin;
      }
    } else {
      num_total_bin = 1;
      for (int j = 0; j < train_data->num_features(); ++j) {
        offsets.push_back(num_total_bin);
        num_total_bin += train_data->FeatureBinMapper(j)->num_bin();
        if (train_data->FeatureBinMapper(j)->GetMostFreqBin() == 0) {
          num_total_bin -= 1;
        }
      }
    }
1115
    OMP_INIT_EX();
1116
#pragma omp parallel for schedule(static)
Guolin Ke's avatar
Guolin Ke committed
1117
    for (int i = old_cache_size; i < cache_size; ++i) {
1118
      OMP_LOOP_EX_BEGIN();
Guolin Ke's avatar
Guolin Ke committed
1119
      pool_[i].reset(new FeatureHistogram[train_data->num_features()]);
1120
      data_[i].resize(num_total_bin * 2);
Guolin Ke's avatar
Guolin Ke committed
1121
      for (int j = 0; j < train_data->num_features(); ++j) {
1122
        pool_[i][j].Init(data_[i].data() + offsets[j] * 2, &feature_metas_[j]);
Guolin Ke's avatar
Guolin Ke committed
1123
      }
1124
      OMP_LOOP_EX_END();
Guolin Ke's avatar
Guolin Ke committed
1125
    }
1126
    OMP_THROW_EX();
Guolin Ke's avatar
Guolin Ke committed
1127
1128
  }

1129
  void ResetConfig(const Dataset* train_data, const Config* config) {
1130
1131
    CHECK_GT(train_data->num_features(), 0);
    const Config* old_config = feature_metas_[0].config;
1132
    SetFeatureInfo<false, true>(train_data, config, &feature_metas_);
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
    // if need to reset the function pointers
    if (old_config->lambda_l1 != config->lambda_l1 ||
        old_config->monotone_constraints != config->monotone_constraints ||
        old_config->extra_trees != config->extra_trees ||
        old_config->max_delta_step != config->max_delta_step) {
#pragma omp parallel for schedule(static)
      for (int i = 0; i < cache_size_; ++i) {
        for (int j = 0; j < train_data->num_features(); ++j) {
          pool_[i][j].ResetFunc();
        }
      }
    }
Guolin Ke's avatar
Guolin Ke committed
1145
  }
1146

Guolin Ke's avatar
Guolin Ke committed
1147
  /*!
1148
1149
1150
1151
1152
1153
   * \brief Get data for the specific index
   * \param idx which index want to get
   * \param out output data will store into this
   * \return True if this index is in the pool, False if this index is not in
   * the pool
   */
Guolin Ke's avatar
Guolin Ke committed
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
  bool Get(int idx, FeatureHistogram** out) {
    if (is_enough_) {
      *out = pool_[idx].get();
      return true;
    } else if (mapper_[idx] >= 0) {
      int slot = mapper_[idx];
      *out = pool_[slot].get();
      last_used_time_[slot] = ++cur_time_;
      return true;
    } else {
1164
      // choose the least used slot
Guolin Ke's avatar
Guolin Ke committed
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
      int slot = static_cast<int>(ArrayArgs<int>::ArgMin(last_used_time_));
      *out = pool_[slot].get();
      last_used_time_[slot] = ++cur_time_;

      // reset previous mapper
      if (inverse_mapper_[slot] >= 0) mapper_[inverse_mapper_[slot]] = -1;

      // update current mapper
      mapper_[idx] = slot;
      inverse_mapper_[slot] = idx;
      return false;
    }
  }

  /*!
1180
1181
1182
1183
   * \brief Move data from one index to another index
   * \param src_idx
   * \param dst_idx
   */
Guolin Ke's avatar
Guolin Ke committed
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
  void Move(int src_idx, int dst_idx) {
    if (is_enough_) {
      std::swap(pool_[src_idx], pool_[dst_idx]);
      return;
    }
    if (mapper_[src_idx] < 0) {
      return;
    }
    // get slot of src idx
    int slot = mapper_[src_idx];
    // reset src_idx
    mapper_[src_idx] = -1;

    // move to dst idx
    mapper_[dst_idx] = slot;
    last_used_time_[slot] = ++cur_time_;
    inverse_mapper_[slot] = dst_idx;
  }
1202

1203
 private:
Guolin Ke's avatar
Guolin Ke committed
1204
  std::vector<std::unique_ptr<FeatureHistogram[]>> pool_;
1205
1206
1207
  std::vector<
      std::vector<hist_t, Common::AlignmentAllocator<hist_t, kAlignedSize>>>
      data_;
Guolin Ke's avatar
Guolin Ke committed
1208
  std::vector<FeatureMetainfo> feature_metas_;
Guolin Ke's avatar
Guolin Ke committed
1209
1210
1211
1212
1213
1214
1215
1216
1217
  int cache_size_;
  int total_size_;
  bool is_enough_ = false;
  std::vector<int> mapper_;
  std::vector<int> inverse_mapper_;
  std::vector<int> last_used_time_;
  int cur_time_ = 0;
};

}  // namespace LightGBM
#endif  // LIGHTGBM_TREELEARNER_FEATURE_HISTOGRAM_HPP_