/*!
 * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */
#ifndef LIGHTGBM_BIN_H_
#define LIGHTGBM_BIN_H_

#include <LightGBM/meta.h>
#include <LightGBM/utils/common.h>
#include <LightGBM/utils/file_io.h>

#include <algorithm>
#include <cstdint>
#include <functional>
#include <iomanip>
#include <limits>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>

namespace LightGBM {

21
22
23
24
25
26
27
28
29
30
31
enum BinType {
  NumericalBin,
  CategoricalBin
};

enum MissingType {
  None,
  Zero,
  NaN
};

32
typedef double hist_t;
Guolin Ke's avatar
Guolin Ke committed
33
34
35
typedef uint64_t hist_cnt_t;
// check at compile time
static_assert(sizeof(hist_t) == sizeof(hist_cnt_t), "Histogram entry size is not correct");
36

37
38
const size_t kHistEntrySize = 2 * sizeof(hist_t);
const int kHistOffset = 2;
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
const double kSparseThreshold = 0.7;

#define GET_GRAD(hist, i) hist[(i) << 1]
#define GET_HESS(hist, i) hist[((i) << 1) + 1]

inline static void HistogramSumReducer(const char* src, char* dst, int type_size, comm_size_t len) {
  comm_size_t used_size = 0;
  const hist_t* p1;
  hist_t* p2;
  while (used_size < len) {
    // convert
    p1 = reinterpret_cast<const hist_t*>(src);
    p2 = reinterpret_cast<hist_t*>(dst);
    *p2 += *p1;
    src += type_size;
    dst += type_size;
    used_size += type_size;
56
  }
57
}
58
59
60
61

/*! \brief This class used to convert feature values into bin,
*          and store some meta information for bin*/
class BinMapper {
Nikita Titov's avatar
Nikita Titov committed
62
 public:
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
  BinMapper();
  BinMapper(const BinMapper& other);
  explicit BinMapper(const void* memory);
  ~BinMapper();

  bool CheckAlign(const BinMapper& other) const {
    if (num_bin_ != other.num_bin_) {
      return false;
    }
    if (missing_type_ != other.missing_type_) {
      return false;
    }
    if (bin_type_ == BinType::NumericalBin) {
      for (int i = 0; i < num_bin_; ++i) {
        if (bin_upper_bound_[i] != other.bin_upper_bound_[i]) {
          return false;
Guolin Ke's avatar
Guolin Ke committed
79
        }
80
81
82
83
84
      }
    } else {
      for (int i = 0; i < num_bin_; i++) {
        if (bin_2_categorical_[i] != other.bin_2_categorical_[i]) {
          return false;
85
        }
86
87
      }
    }
88
89
    return true;
  }
90

91
92
  /*! \brief Get number of bins */
  inline int num_bin() const { return num_bin_; }
93

94
95
  /*! \brief Missing Type */
  inline MissingType missing_type() const { return missing_type_; }
96

Lingyi Hu's avatar
Lingyi Hu committed
97
98
  /*! \brief True if bin is trivial (contains only one bin) */
  inline bool is_trivial() const { return is_trivial_; }
99

100
101
  /*! \brief Sparsity of this bin ( num_zero_bins / num_data ) */
  inline double sparse_rate() const { return sparse_rate_; }
102

103
104
105
106
  /*!
  * \brief Save binary data to file
  * \param file File want to write
  */
107
  void SaveBinaryToFile(const VirtualFileWriter* writer) const;
108

109
110
111
112
113
114
115
116
117
118
  /*!
  * \brief Mapping bin into feature value
  * \param bin
  * \return Feature value of this bin
  */
  inline double BinToValue(uint32_t bin) const {
    if (bin_type_ == BinType::NumericalBin) {
      return bin_upper_bound_[bin];
    } else {
      return bin_2_categorical_[bin];
Guolin Ke's avatar
Guolin Ke committed
119
    }
120
  }
Guolin Ke's avatar
Guolin Ke committed
121

122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
  /*!
  * \brief Maximum categorical value
  * \return Maximum categorical value for categorical features, 0 for numerical features  
  */
  inline int MaxCatValue() const {
    if (bin_2_categorical_.size() == 0) {
      return 0;
    }
    int max_cat_value = bin_2_categorical_[0];
    for (size_t i = 1; i < bin_2_categorical_.size(); ++i) {
      if (bin_2_categorical_[i] > max_cat_value) {
        max_cat_value = bin_2_categorical_[i];
      }
    }
    return max_cat_value;
  }

139
140
141
142
  /*!
  * \brief Get sizes in byte of this object
  */
  size_t SizesInByte() const;
143

144
  /*!
145
  * \brief Mapping feature value into bin
146
147
148
149
150
151
152
153
154
155
156
157
  * \param value
  * \return bin for this feature value
  */
  inline uint32_t ValueToBin(double value) const;

  /*!
  * \brief Get the default bin when value is 0
  * \return default bin
  */
  inline uint32_t GetDefaultBin() const {
    return default_bin_;
  }
Guolin Ke's avatar
Guolin Ke committed
158
159
160
161
162

  inline uint32_t GetMostFreqBin() const {
    return most_freq_bin_;
  }

163
164
  /*!
  * \brief Construct feature value to bin mapper according feature values
165
  * \param values (Sampled) values of this feature, Note: not include zero.
166
167
168
169
170
  * \param num_values number of values.
  * \param total_sample_cnt number of total sample count, equal with values.size() + num_zeros
  * \param max_bin The maximal number of bin
  * \param min_data_in_bin min number of data in one bin
  * \param min_split_data
171
  * \param pre_filter
172
173
174
  * \param bin_type Type of this bin
  * \param use_missing True to enable missing value handle
  * \param zero_as_missing True to use zero as missing value
175
  * \param forced_upper_bounds Vector of split points that must be used (if this has size less than max_bin, remaining splits are found by the algorithm)
176
  */
177
  void FindBin(double* values, int num_values, size_t total_sample_cnt, int max_bin, int min_data_in_bin, int min_split_data, bool pre_filter, BinType bin_type,
178
               bool use_missing, bool zero_as_missing, const std::vector<double>& forced_upper_bounds);
179
180

  /*!
181
  * \brief Serializing this object to buffer
182
183
184
185
186
  * \param buffer The destination
  */
  void CopyTo(char* buffer) const;

  /*!
187
  * \brief Deserializing this object from buffer
188
189
190
191
192
193
194
195
196
197
198
199
  * \param buffer The source
  */
  void CopyFrom(const char* buffer);

  /*!
  * \brief Get bin types
  */
  inline BinType bin_type() const { return bin_type_; }

  /*!
  * \brief Get bin info
  */
200
  inline std::string bin_info_string() const {
201
202
203
204
205
206
207
    if (bin_type_ == BinType::CategoricalBin) {
      return Common::Join(bin_2_categorical_, ":");
    } else {
      std::stringstream str_buf;
      str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2);
      str_buf << '[' << min_val_ << ':' << max_val_ << ']';
      return str_buf.str();
208
    }
209
  }
Guolin Ke's avatar
Guolin Ke committed
210

Nikita Titov's avatar
Nikita Titov committed
211
 private:
212
213
214
215
216
  /*! \brief Number of bins */
  int num_bin_;
  MissingType missing_type_;
  /*! \brief Store upper bound for each bin */
  std::vector<double> bin_upper_bound_;
Lingyi Hu's avatar
Lingyi Hu committed
217
218
  /*! \brief True if this feature is trivial */
  bool is_trivial_;
219
220
221
222
223
224
225
226
  /*! \brief Sparse rate of this bins( num_bin0/num_data ) */
  double sparse_rate_;
  /*! \brief Type of this bin */
  BinType bin_type_;
  /*! \brief Mapper from categorical to bin */
  std::unordered_map<int, unsigned int> categorical_2_bin_;
  /*! \brief Mapper from bin to categorical */
  std::vector<int> bin_2_categorical_;
227
  /*! \brief minimal feature value */
228
229
230
231
232
  double min_val_;
  /*! \brief maximum feature value */
  double max_val_;
  /*! \brief bin value of feature value 0 */
  uint32_t default_bin_;
Guolin Ke's avatar
Guolin Ke committed
233
234

  uint32_t most_freq_bin_;
235
236
237
238
};

/*! \brief Iterator for one bin column */
class BinIterator {
Nikita Titov's avatar
Nikita Titov committed
239
 public:
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
  /*!
  * \brief Get bin data on specific row index
  * \param idx Index of this data
  * \return Bin data
  */
  virtual uint32_t Get(data_size_t idx) = 0;
  virtual uint32_t RawGet(data_size_t idx) = 0;
  virtual void Reset(data_size_t idx) = 0;
  virtual ~BinIterator() = default;
};

/*!
* \brief Interface for bin data. This class will store bin data for one feature.
*        unlike OrderedBin, this class will store data by original order.
*        Note that it may cause cache misses when construct histogram,
*        but it doesn't need to re-order operation, So it will be faster than OrderedBin for dense feature
*/
class Bin {
Nikita Titov's avatar
Nikita Titov committed
258
 public:
259
260
261
  /*! \brief virtual destructor */
  virtual ~Bin() {}
  /*!
262
263
  * \brief Initialize for pushing.  By default, no action needed.
  * \param num_thread The number of external threads that will be calling the push APIs
264
  * \param omp_max_threads The maximum number of OpenMP threads to allocate for
265
  */
266
  virtual void InitStreaming(uint32_t /*num_thread*/, int32_t /*omp_max_threads*/) { }
267
  /*!
268
  * \brief Push one record
269
  * \param tid Thread id
270
271
272
273
274
  * \param idx Index of record
  * \param value bin value of record
  */
  virtual void Push(int tid, data_size_t idx, uint32_t value) = 0;

275
  virtual void CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) = 0;
276
277
278
279
  /*!
  * \brief Get bin iterator of this bin for specific feature
  * \param min_bin min_bin of current used feature
  * \param max_bin max_bin of current used feature
Guolin Ke's avatar
Guolin Ke committed
280
  * \param most_freq_bin
281
282
  * \return Iterator of this bin
  */
Guolin Ke's avatar
Guolin Ke committed
283
  virtual BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const = 0;
284
285
286
287
288

  /*!
  * \brief Save binary data to file
  * \param file File want to write
  */
289
  virtual void SaveBinaryToFile(const VirtualFileWriter* writer) const = 0;
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306

  /*!
  * \brief Load from memory
  * \param memory
  * \param local_used_indices
  */
  virtual void LoadFromMemory(const void* memory,
    const std::vector<data_size_t>& local_used_indices) = 0;

  /*!
  * \brief Get sizes in byte of this object
  */
  virtual size_t SizesInByte() const = 0;

  /*! \brief Number of all data */
  virtual data_size_t num_data() const = 0;

307
308
309
  /*! \brief Get data pointer */
  virtual void* get_data() = 0;

310
311
312
313
314
315
316
317
318
319
  virtual void ReSize(data_size_t num_data) = 0;

  /*!
  * \brief Construct histogram of this feature,
  *        Note: We use ordered_gradients and ordered_hessians to improve cache hit chance
  *        The naive solution is using gradients[data_indices[i]] for data_indices[i] to get gradients,
           which is not cache friendly, since the access of memory is not continuous.
  *        ordered_gradients and ordered_hessians are preprocessed, and they are re-ordered by data_indices.
  *        Ordered_gradients[i] is aligned with data_indices[i]'s gradients (same for ordered_hessians).
  * \param data_indices Used data indices in current leaf
320
321
  * \param start start index in data_indices
  * \param end end index in data_indices
322
323
324
325
326
  * \param ordered_gradients Pointer to gradients, the data_indices[i]-th data's gradient is ordered_gradients[i]
  * \param ordered_hessians Pointer to hessians, the data_indices[i]-th data's hessian is ordered_hessians[i]
  * \param out Output Result
  */
  virtual void ConstructHistogram(
327
    const data_size_t* data_indices, data_size_t start, data_size_t end,
328
    const score_t* ordered_gradients, const score_t* ordered_hessians,
329
    hist_t* out) const = 0;
330

331
  virtual void ConstructHistogram(data_size_t start, data_size_t end,
332
    const score_t* ordered_gradients, const score_t* ordered_hessians,
333
    hist_t* out) const = 0;
334
335
336
337
338
339
340
341
342

  /*!
  * \brief Construct histogram of this feature,
  *        Note: We use ordered_gradients and ordered_hessians to improve cache hit chance
  *        The naive solution is using gradients[data_indices[i]] for data_indices[i] to get gradients,
  which is not cache friendly, since the access of memory is not continuous.
  *        ordered_gradients and ordered_hessians are preprocessed, and they are re-ordered by data_indices.
  *        Ordered_gradients[i] is aligned with data_indices[i]'s gradients (same for ordered_hessians).
  * \param data_indices Used data indices in current leaf
343
344
  * \param start start index in data_indices
  * \param end end index in data_indices
345
346
347
  * \param ordered_gradients Pointer to gradients, the data_indices[i]-th data's gradient is ordered_gradients[i]
  * \param out Output Result
  */
348
  virtual void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
349
                                  const score_t* ordered_gradients, hist_t* out) const = 0;
350

351
  virtual void ConstructHistogram(data_size_t start, data_size_t end,
352
                                  const score_t* ordered_gradients, hist_t* out) const = 0;
353

354
  virtual data_size_t Split(uint32_t min_bin, uint32_t max_bin,
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
                            uint32_t default_bin, uint32_t most_freq_bin,
                            MissingType missing_type, bool default_left,
                            uint32_t threshold, const data_size_t* data_indices,
                            data_size_t cnt,
                            data_size_t* lte_indices,
                            data_size_t* gt_indices) const = 0;

  virtual data_size_t SplitCategorical(
      uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin,
      const uint32_t* threshold, int num_threshold,
      const data_size_t* data_indices, data_size_t cnt,
      data_size_t* lte_indices, data_size_t* gt_indices) const = 0;

  virtual data_size_t Split(uint32_t max_bin, uint32_t default_bin,
                            uint32_t most_freq_bin, MissingType missing_type,
                            bool default_left, uint32_t threshold,
                            const data_size_t* data_indices, data_size_t cnt,
                            data_size_t* lte_indices,
                            data_size_t* gt_indices) const = 0;

  virtual data_size_t SplitCategorical(
      uint32_t max_bin, uint32_t most_freq_bin, const uint32_t* threshold,
      int num_threshold, const data_size_t* data_indices, data_size_t cnt,
      data_size_t* lte_indices, data_size_t* gt_indices) const = 0;

380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
  /*!
  * \brief After pushed all feature data, call this could have better refactor for bin data
  */
  virtual void FinishLoad() = 0;

  /*!
  * \brief Create object for bin data of one feature, used for dense feature
  * \param num_data Total number of data
  * \param num_bin Number of bin
  * \return The bin data object
  */
  static Bin* CreateDenseBin(data_size_t num_data, int num_bin);

  /*!
  * \brief Create object for bin data of one feature, used for sparse feature
  * \param num_data Total number of data
  * \param num_bin Number of bin
  * \return The bin data object
  */
  static Bin* CreateSparseBin(data_size_t num_data, int num_bin);
400
401
402
403
404

  /*!
  * \brief Deep copy the bin
  */
  virtual Bin* Clone() = 0;
405
406
407
408

  virtual const void* GetColWiseData(uint8_t* bit_type, bool* is_sparse, std::vector<BinIterator*>* bin_iterator, const int num_threads) const = 0;

  virtual const void* GetColWiseData(uint8_t* bit_type, bool* is_sparse, BinIterator** bin_iterator) const = 0;
409
410
};

411
412

class MultiValBin {
413
 public:
414
415
416
417
418
419
  virtual ~MultiValBin() {}

  virtual data_size_t num_data() const = 0;

  virtual int32_t num_bin() const = 0;

420
  virtual double num_element_per_row() const = 0;
421

422
  virtual const std::vector<uint32_t>& offsets() const = 0;
423

424
  virtual void PushOneRow(int tid, data_size_t idx, const std::vector<uint32_t>& values) = 0;
425

426
427
428
  virtual void CopySubrow(const MultiValBin* full_bin,
                          const data_size_t* used_indices,
                          data_size_t num_used_indices) = 0;
429

430
431
  virtual MultiValBin* CreateLike(data_size_t num_data, int num_bin,
                                  int num_feature,
432
433
                                  double estimate_element_per_row,
                                  const std::vector<uint32_t>& offsets) const = 0;
434

435
436
437
438
439
440
441
  virtual void CopySubcol(const MultiValBin* full_bin,
                          const std::vector<int>& used_feature_index,
                          const std::vector<uint32_t>& lower,
                          const std::vector<uint32_t>& upper,
                          const std::vector<uint32_t>& delta) = 0;

  virtual void ReSize(data_size_t num_data, int num_bin, int num_feature,
442
                      double estimate_element_per_row, const std::vector<uint32_t>& offsets) = 0;
443
444
445
446
447
448

  virtual void CopySubrowAndSubcol(
      const MultiValBin* full_bin, const data_size_t* used_indices,
      data_size_t num_used_indices, const std::vector<int>& used_feature_index,
      const std::vector<uint32_t>& lower, const std::vector<uint32_t>& upper,
      const std::vector<uint32_t>& delta) = 0;
449

Guolin Ke's avatar
Guolin Ke committed
450
451
452
453
454
  virtual void ConstructHistogram(const data_size_t* data_indices,
                                  data_size_t start, data_size_t end,
                                  const score_t* gradients,
                                  const score_t* hessians,
                                  hist_t* out) const = 0;
455
456

  virtual void ConstructHistogram(data_size_t start, data_size_t end,
Guolin Ke's avatar
Guolin Ke committed
457
458
459
                                  const score_t* gradients,
                                  const score_t* hessians,
                                  hist_t* out) const = 0;
460

Guolin Ke's avatar
Guolin Ke committed
461
462
463
464
465
466
  virtual void ConstructHistogramOrdered(const data_size_t* data_indices,
                                         data_size_t start, data_size_t end,
                                         const score_t* ordered_gradients,
                                         const score_t* ordered_hessians,
                                         hist_t* out) const = 0;

467
468
469
470
  virtual void FinishLoad() = 0;

  virtual bool IsSparse() = 0;

471
  static MultiValBin* CreateMultiValBin(data_size_t num_data, int num_bin,
472
                                        int num_feature, double sparse_rate, const std::vector<uint32_t>& offsets);
473
474

  static MultiValBin* CreateMultiValDenseBin(data_size_t num_data, int num_bin,
475
                                             int num_feature, const std::vector<uint32_t>& offsets);
476
477

  static MultiValBin* CreateMultiValSparseBin(data_size_t num_data, int num_bin, double estimate_element_per_row);
478

479
480
  static constexpr double multi_val_bin_sparse_threshold = 0.25f;

481
  virtual MultiValBin* Clone() = 0;
482

483
  #ifdef USE_CUDA
484
485
486
487
488
  virtual const void* GetRowWiseData(uint8_t* bit_type,
    size_t* total_size,
    bool* is_sparse,
    const void** out_data_ptr,
    uint8_t* data_ptr_bit_type) const = 0;
489
  #endif  // USE_CUDA
490
491
};

492
493
inline uint32_t BinMapper::ValueToBin(double value) const {
  if (std::isnan(value)) {
494
495
496
    if (bin_type_ == BinType::CategoricalBin) {
      return 0;
    } else if (missing_type_ == MissingType::NaN) {
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
      return num_bin_ - 1;
    } else {
      value = 0.0f;
    }
  }
  if (bin_type_ == BinType::NumericalBin) {
    // binary search to find bin
    int l = 0;
    int r = num_bin_ - 1;
    if (missing_type_ == MissingType::NaN) {
      r -= 1;
    }
    while (l < r) {
      int m = (r + l - 1) / 2;
      if (value <= bin_upper_bound_[m]) {
        r = m;
513
      } else {
514
        l = m + 1;
515
516
      }
    }
517
518
519
520
521
    return l;
  } else {
    int int_value = static_cast<int>(value);
    // convert negative value to NaN bin
    if (int_value < 0) {
522
      return 0;
523
524
525
    }
    if (categorical_2_bin_.count(int_value)) {
      return categorical_2_bin_.at(int_value);
Guolin Ke's avatar
Guolin Ke committed
526
    } else {
527
      return 0;
Guolin Ke's avatar
Guolin Ke committed
528
529
    }
  }
530
}
}  // namespace LightGBM

#endif   // LIGHTGBM_BIN_H_