/*!
 * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */
#ifndef LIGHTGBM_BIN_H_
#define LIGHTGBM_BIN_H_

#include <LightGBM/meta.h>
#include <LightGBM/utils/common.h>
#include <LightGBM/utils/file_io.h>

#include <cmath>
#include <functional>
#include <iomanip>
#include <limits>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>

namespace LightGBM {

21
22
23
24
25
26
27
28
29
30
31
enum BinType {
  NumericalBin,
  CategoricalBin
};

enum MissingType {
  None,
  Zero,
  NaN
};

32
typedef double hist_t;
33
typedef int32_t int_hist_t;
Guolin Ke's avatar
Guolin Ke committed
34
35
36
typedef uint64_t hist_cnt_t;
// check at compile time
static_assert(sizeof(hist_t) == sizeof(hist_cnt_t), "Histogram entry size is not correct");
37

38
const size_t kHistEntrySize = 2 * sizeof(hist_t);
39
40
const size_t kInt32HistEntrySize = 2 * sizeof(int_hist_t);
const size_t kInt16HistEntrySize = 2 * sizeof(int16_t);
41
const int kHistOffset = 2;
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
const double kSparseThreshold = 0.7;

#define GET_GRAD(hist, i) hist[(i) << 1]
#define GET_HESS(hist, i) hist[((i) << 1) + 1]

inline static void HistogramSumReducer(const char* src, char* dst, int type_size, comm_size_t len) {
  comm_size_t used_size = 0;
  const hist_t* p1;
  hist_t* p2;
  while (used_size < len) {
    // convert
    p1 = reinterpret_cast<const hist_t*>(src);
    p2 = reinterpret_cast<hist_t*>(dst);
    *p2 += *p1;
    src += type_size;
    dst += type_size;
    used_size += type_size;
59
  }
60
}
61

62
63
64
65
inline static void Int32HistogramSumReducer(const char* src, char* dst, int type_size, comm_size_t len) {
  const int64_t* src_ptr = reinterpret_cast<const int64_t*>(src);
  int64_t* dst_ptr = reinterpret_cast<int64_t*>(dst);
  const comm_size_t steps = (len + (type_size * 2) - 1) / (type_size * 2);
66
  #pragma omp parallel for schedule(static) num_threads(OMP_NUM_THREADS())
67
68
69
70
71
72
73
74
75
  for (comm_size_t i = 0; i < steps; ++i) {
    dst_ptr[i] += src_ptr[i];
  }
}

inline static void Int16HistogramSumReducer(const char* src, char* dst, int type_size, comm_size_t len) {
  const int32_t* src_ptr = reinterpret_cast<const int32_t*>(src);
  int32_t* dst_ptr = reinterpret_cast<int32_t*>(dst);
  const comm_size_t steps = (len + (type_size * 2) - 1) / (type_size * 2);
76
  #pragma omp parallel for schedule(static) num_threads(OMP_NUM_THREADS())
77
78
79
80
81
  for (comm_size_t i = 0; i < steps; ++i) {
    dst_ptr[i] += src_ptr[i];
  }
}

82
83
84
/*! \brief This class used to convert feature values into bin,
*          and store some meta information for bin*/
class BinMapper {
Nikita Titov's avatar
Nikita Titov committed
85
 public:
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
  BinMapper();
  BinMapper(const BinMapper& other);
  explicit BinMapper(const void* memory);
  ~BinMapper();

  bool CheckAlign(const BinMapper& other) const {
    if (num_bin_ != other.num_bin_) {
      return false;
    }
    if (missing_type_ != other.missing_type_) {
      return false;
    }
    if (bin_type_ == BinType::NumericalBin) {
      for (int i = 0; i < num_bin_; ++i) {
        if (bin_upper_bound_[i] != other.bin_upper_bound_[i]) {
          return false;
Guolin Ke's avatar
Guolin Ke committed
102
        }
103
104
105
106
107
      }
    } else {
      for (int i = 0; i < num_bin_; i++) {
        if (bin_2_categorical_[i] != other.bin_2_categorical_[i]) {
          return false;
108
        }
109
110
      }
    }
111
112
    return true;
  }
113

114
115
  /*! \brief Get number of bins */
  inline int num_bin() const { return num_bin_; }
116

117
118
  /*! \brief Missing Type */
  inline MissingType missing_type() const { return missing_type_; }
119

Lingyi Hu's avatar
Lingyi Hu committed
120
121
  /*! \brief True if bin is trivial (contains only one bin) */
  inline bool is_trivial() const { return is_trivial_; }
122

123
124
  /*! \brief Sparsity of this bin ( num_zero_bins / num_data ) */
  inline double sparse_rate() const { return sparse_rate_; }
125

126
127
128
129
  /*!
  * \brief Save binary data to file
  * \param file File want to write
  */
130
  void SaveBinaryToFile(BinaryWriter* writer) const;
131

132
133
134
135
136
137
138
139
140
141
  /*!
  * \brief Mapping bin into feature value
  * \param bin
  * \return Feature value of this bin
  */
  inline double BinToValue(uint32_t bin) const {
    if (bin_type_ == BinType::NumericalBin) {
      return bin_upper_bound_[bin];
    } else {
      return bin_2_categorical_[bin];
Guolin Ke's avatar
Guolin Ke committed
142
    }
143
  }
Guolin Ke's avatar
Guolin Ke committed
144

145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
  /*!
  * \brief Maximum categorical value
  * \return Maximum categorical value for categorical features, 0 for numerical features  
  */
  inline int MaxCatValue() const {
    if (bin_2_categorical_.size() == 0) {
      return 0;
    }
    int max_cat_value = bin_2_categorical_[0];
    for (size_t i = 1; i < bin_2_categorical_.size(); ++i) {
      if (bin_2_categorical_[i] > max_cat_value) {
        max_cat_value = bin_2_categorical_[i];
      }
    }
    return max_cat_value;
  }

162
163
164
165
  /*!
  * \brief Get sizes in byte of this object
  */
  size_t SizesInByte() const;
166

167
  /*!
168
  * \brief Mapping feature value into bin
169
170
171
172
173
174
175
176
177
178
179
180
  * \param value
  * \return bin for this feature value
  */
  inline uint32_t ValueToBin(double value) const;

  /*!
  * \brief Get the default bin when value is 0
  * \return default bin
  */
  inline uint32_t GetDefaultBin() const {
    return default_bin_;
  }
Guolin Ke's avatar
Guolin Ke committed
181
182
183
184
185

  inline uint32_t GetMostFreqBin() const {
    return most_freq_bin_;
  }

186
187
  /*!
  * \brief Construct feature value to bin mapper according feature values
188
  * \param values (Sampled) values of this feature, Note: not include zero.
189
190
191
192
193
  * \param num_values number of values.
  * \param total_sample_cnt number of total sample count, equal with values.size() + num_zeros
  * \param max_bin The maximal number of bin
  * \param min_data_in_bin min number of data in one bin
  * \param min_split_data
194
  * \param pre_filter
195
196
197
  * \param bin_type Type of this bin
  * \param use_missing True to enable missing value handle
  * \param zero_as_missing True to use zero as missing value
198
  * \param forced_upper_bounds Vector of split points that must be used (if this has size less than max_bin, remaining splits are found by the algorithm)
199
  */
200
  void FindBin(double* values, int num_values, size_t total_sample_cnt, int max_bin, int min_data_in_bin, int min_split_data, bool pre_filter, BinType bin_type,
201
               bool use_missing, bool zero_as_missing, const std::vector<double>& forced_upper_bounds);
202
203

  /*!
204
  * \brief Serializing this object to buffer
205
206
207
208
209
  * \param buffer The destination
  */
  void CopyTo(char* buffer) const;

  /*!
210
  * \brief Deserializing this object from buffer
211
212
213
214
215
216
217
218
219
220
221
222
  * \param buffer The source
  */
  void CopyFrom(const char* buffer);

  /*!
  * \brief Get bin types
  */
  inline BinType bin_type() const { return bin_type_; }

  /*!
  * \brief Get bin info
  */
223
  inline std::string bin_info_string() const {
224
225
226
227
228
229
230
    if (bin_type_ == BinType::CategoricalBin) {
      return Common::Join(bin_2_categorical_, ":");
    } else {
      std::stringstream str_buf;
      str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2);
      str_buf << '[' << min_val_ << ':' << max_val_ << ']';
      return str_buf.str();
231
    }
232
  }
Guolin Ke's avatar
Guolin Ke committed
233

Nikita Titov's avatar
Nikita Titov committed
234
 private:
235
236
237
238
239
  /*! \brief Number of bins */
  int num_bin_;
  MissingType missing_type_;
  /*! \brief Store upper bound for each bin */
  std::vector<double> bin_upper_bound_;
Lingyi Hu's avatar
Lingyi Hu committed
240
241
  /*! \brief True if this feature is trivial */
  bool is_trivial_;
242
243
244
245
246
247
248
249
  /*! \brief Sparse rate of this bins( num_bin0/num_data ) */
  double sparse_rate_;
  /*! \brief Type of this bin */
  BinType bin_type_;
  /*! \brief Mapper from categorical to bin */
  std::unordered_map<int, unsigned int> categorical_2_bin_;
  /*! \brief Mapper from bin to categorical */
  std::vector<int> bin_2_categorical_;
250
  /*! \brief minimal feature value */
251
252
253
254
255
  double min_val_;
  /*! \brief maximum feature value */
  double max_val_;
  /*! \brief bin value of feature value 0 */
  uint32_t default_bin_;
Guolin Ke's avatar
Guolin Ke committed
256
257

  uint32_t most_freq_bin_;
258
259
260
261
};

/*! \brief Iterator for one bin column */
class BinIterator {
Nikita Titov's avatar
Nikita Titov committed
262
 public:
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
  /*!
  * \brief Get bin data on specific row index
  * \param idx Index of this data
  * \return Bin data
  */
  virtual uint32_t Get(data_size_t idx) = 0;
  virtual uint32_t RawGet(data_size_t idx) = 0;
  virtual void Reset(data_size_t idx) = 0;
  virtual ~BinIterator() = default;
};

/*!
* \brief Interface for bin data. This class will store bin data for one feature.
*        unlike OrderedBin, this class will store data by original order.
*        Note that it may cause cache misses when construct histogram,
*        but it doesn't need to re-order operation, So it will be faster than OrderedBin for dense feature
*/
class Bin {
Nikita Titov's avatar
Nikita Titov committed
281
 public:
282
283
284
  /*! \brief virtual destructor */
  virtual ~Bin() {}
  /*!
285
286
  * \brief Initialize for pushing.  By default, no action needed.
  * \param num_thread The number of external threads that will be calling the push APIs
287
  * \param omp_max_threads The maximum number of OpenMP threads to allocate for
288
  */
289
  virtual void InitStreaming(uint32_t /*num_thread*/, int32_t /*omp_max_threads*/) { }
290
  /*!
291
  * \brief Push one record
292
  * \param tid Thread id
293
294
295
296
297
  * \param idx Index of record
  * \param value bin value of record
  */
  virtual void Push(int tid, data_size_t idx, uint32_t value) = 0;

298
  virtual void CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) = 0;
299
300
301
302
  /*!
  * \brief Get bin iterator of this bin for specific feature
  * \param min_bin min_bin of current used feature
  * \param max_bin max_bin of current used feature
Guolin Ke's avatar
Guolin Ke committed
303
  * \param most_freq_bin
304
305
  * \return Iterator of this bin
  */
Guolin Ke's avatar
Guolin Ke committed
306
  virtual BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const = 0;
307
308
309
310
311

  /*!
  * \brief Save binary data to file
  * \param file File want to write
  */
312
  virtual void SaveBinaryToFile(BinaryWriter* writer) const = 0;
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329

  /*!
  * \brief Load from memory
  * \param memory
  * \param local_used_indices
  */
  virtual void LoadFromMemory(const void* memory,
    const std::vector<data_size_t>& local_used_indices) = 0;

  /*!
  * \brief Get sizes in byte of this object
  */
  virtual size_t SizesInByte() const = 0;

  /*! \brief Number of all data */
  virtual data_size_t num_data() const = 0;

330
331
332
  /*! \brief Get data pointer */
  virtual void* get_data() = 0;

333
334
335
336
337
338
339
340
341
342
  virtual void ReSize(data_size_t num_data) = 0;

  /*!
  * \brief Construct histogram of this feature,
  *        Note: We use ordered_gradients and ordered_hessians to improve cache hit chance
  *        The naive solution is using gradients[data_indices[i]] for data_indices[i] to get gradients,
           which is not cache friendly, since the access of memory is not continuous.
  *        ordered_gradients and ordered_hessians are preprocessed, and they are re-ordered by data_indices.
  *        Ordered_gradients[i] is aligned with data_indices[i]'s gradients (same for ordered_hessians).
  * \param data_indices Used data indices in current leaf
343
344
  * \param start start index in data_indices
  * \param end end index in data_indices
345
346
347
348
349
  * \param ordered_gradients Pointer to gradients, the data_indices[i]-th data's gradient is ordered_gradients[i]
  * \param ordered_hessians Pointer to hessians, the data_indices[i]-th data's hessian is ordered_hessians[i]
  * \param out Output Result
  */
  virtual void ConstructHistogram(
350
    const data_size_t* data_indices, data_size_t start, data_size_t end,
351
    const score_t* ordered_gradients, const score_t* ordered_hessians,
352
    hist_t* out) const = 0;
353

354
  virtual void ConstructHistogram(data_size_t start, data_size_t end,
355
    const score_t* ordered_gradients, const score_t* ordered_hessians,
356
    hist_t* out) const = 0;
357

358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
  virtual void ConstructHistogramInt8(
    const data_size_t* data_indices, data_size_t start, data_size_t end,
    const score_t* ordered_gradients, const score_t* ordered_hessians,
    hist_t* out) const = 0;

  virtual void ConstructHistogramInt8(data_size_t start, data_size_t end,
    const score_t* ordered_gradients, const score_t* ordered_hessians,
    hist_t* out) const = 0;

  virtual void ConstructHistogramInt16(
    const data_size_t* data_indices, data_size_t start, data_size_t end,
    const score_t* ordered_gradients, const score_t* ordered_hessians,
    hist_t* out) const = 0;

  virtual void ConstructHistogramInt16(data_size_t start, data_size_t end,
    const score_t* ordered_gradients, const score_t* ordered_hessians,
    hist_t* out) const = 0;

  virtual void ConstructHistogramInt32(
    const data_size_t* data_indices, data_size_t start, data_size_t end,
    const score_t* ordered_gradients, const score_t* ordered_hessians,
    hist_t* out) const = 0;

  virtual void ConstructHistogramInt32(data_size_t start, data_size_t end,
    const score_t* ordered_gradients, const score_t* ordered_hessians,
    hist_t* out) const = 0;

385
386
387
388
389
390
391
392
  /*!
  * \brief Construct histogram of this feature,
  *        Note: We use ordered_gradients and ordered_hessians to improve cache hit chance
  *        The naive solution is using gradients[data_indices[i]] for data_indices[i] to get gradients,
  which is not cache friendly, since the access of memory is not continuous.
  *        ordered_gradients and ordered_hessians are preprocessed, and they are re-ordered by data_indices.
  *        Ordered_gradients[i] is aligned with data_indices[i]'s gradients (same for ordered_hessians).
  * \param data_indices Used data indices in current leaf
393
394
  * \param start start index in data_indices
  * \param end end index in data_indices
395
396
397
  * \param ordered_gradients Pointer to gradients, the data_indices[i]-th data's gradient is ordered_gradients[i]
  * \param out Output Result
  */
398
  virtual void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
399
                                  const score_t* ordered_gradients, hist_t* out) const = 0;
400

401
  virtual void ConstructHistogram(data_size_t start, data_size_t end,
402
                                  const score_t* ordered_gradients, hist_t* out) const = 0;
403

404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
  virtual void ConstructHistogramInt8(const data_size_t* data_indices, data_size_t start, data_size_t end,
                                       const score_t* ordered_gradients, hist_t* out) const = 0;

  virtual void ConstructHistogramInt8(data_size_t start, data_size_t end,
                                       const score_t* ordered_gradients, hist_t* out) const = 0;

  virtual void ConstructHistogramInt16(const data_size_t* data_indices, data_size_t start, data_size_t end,
                                       const score_t* ordered_gradients, hist_t* out) const = 0;

  virtual void ConstructHistogramInt16(data_size_t start, data_size_t end,
                                       const score_t* ordered_gradients, hist_t* out) const = 0;

  virtual void ConstructHistogramInt32(const data_size_t* data_indices, data_size_t start, data_size_t end,
                                       const score_t* ordered_gradients, hist_t* out) const = 0;

  virtual void ConstructHistogramInt32(data_size_t start, data_size_t end,
                                       const score_t* ordered_gradients, hist_t* out) const = 0;

422
  virtual data_size_t Split(uint32_t min_bin, uint32_t max_bin,
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
                            uint32_t default_bin, uint32_t most_freq_bin,
                            MissingType missing_type, bool default_left,
                            uint32_t threshold, const data_size_t* data_indices,
                            data_size_t cnt,
                            data_size_t* lte_indices,
                            data_size_t* gt_indices) const = 0;

  virtual data_size_t SplitCategorical(
      uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin,
      const uint32_t* threshold, int num_threshold,
      const data_size_t* data_indices, data_size_t cnt,
      data_size_t* lte_indices, data_size_t* gt_indices) const = 0;

  virtual data_size_t Split(uint32_t max_bin, uint32_t default_bin,
                            uint32_t most_freq_bin, MissingType missing_type,
                            bool default_left, uint32_t threshold,
                            const data_size_t* data_indices, data_size_t cnt,
                            data_size_t* lte_indices,
                            data_size_t* gt_indices) const = 0;

  virtual data_size_t SplitCategorical(
      uint32_t max_bin, uint32_t most_freq_bin, const uint32_t* threshold,
      int num_threshold, const data_size_t* data_indices, data_size_t cnt,
      data_size_t* lte_indices, data_size_t* gt_indices) const = 0;

448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
  /*!
  * \brief After pushed all feature data, call this could have better refactor for bin data
  */
  virtual void FinishLoad() = 0;

  /*!
  * \brief Create object for bin data of one feature, used for dense feature
  * \param num_data Total number of data
  * \param num_bin Number of bin
  * \return The bin data object
  */
  static Bin* CreateDenseBin(data_size_t num_data, int num_bin);

  /*!
  * \brief Create object for bin data of one feature, used for sparse feature
  * \param num_data Total number of data
  * \param num_bin Number of bin
  * \return The bin data object
  */
  static Bin* CreateSparseBin(data_size_t num_data, int num_bin);
468
469
470
471
472

  /*!
  * \brief Deep copy the bin
  */
  virtual Bin* Clone() = 0;
473
474
475
476

  virtual const void* GetColWiseData(uint8_t* bit_type, bool* is_sparse, std::vector<BinIterator*>* bin_iterator, const int num_threads) const = 0;

  virtual const void* GetColWiseData(uint8_t* bit_type, bool* is_sparse, BinIterator** bin_iterator) const = 0;
477
478
};

479
480

class MultiValBin {
481
 public:
482
483
484
485
486
487
  virtual ~MultiValBin() {}

  virtual data_size_t num_data() const = 0;

  virtual int32_t num_bin() const = 0;

488
  virtual double num_element_per_row() const = 0;
489

490
  virtual const std::vector<uint32_t>& offsets() const = 0;
491

492
  virtual void PushOneRow(int tid, data_size_t idx, const std::vector<uint32_t>& values) = 0;
493

494
495
496
  virtual void CopySubrow(const MultiValBin* full_bin,
                          const data_size_t* used_indices,
                          data_size_t num_used_indices) = 0;
497

498
499
  virtual MultiValBin* CreateLike(data_size_t num_data, int num_bin,
                                  int num_feature,
500
501
                                  double estimate_element_per_row,
                                  const std::vector<uint32_t>& offsets) const = 0;
502

503
504
505
506
507
508
509
  virtual void CopySubcol(const MultiValBin* full_bin,
                          const std::vector<int>& used_feature_index,
                          const std::vector<uint32_t>& lower,
                          const std::vector<uint32_t>& upper,
                          const std::vector<uint32_t>& delta) = 0;

  virtual void ReSize(data_size_t num_data, int num_bin, int num_feature,
510
                      double estimate_element_per_row, const std::vector<uint32_t>& offsets) = 0;
511
512
513
514
515
516

  virtual void CopySubrowAndSubcol(
      const MultiValBin* full_bin, const data_size_t* used_indices,
      data_size_t num_used_indices, const std::vector<int>& used_feature_index,
      const std::vector<uint32_t>& lower, const std::vector<uint32_t>& upper,
      const std::vector<uint32_t>& delta) = 0;
517

Guolin Ke's avatar
Guolin Ke committed
518
519
520
521
522
  virtual void ConstructHistogram(const data_size_t* data_indices,
                                  data_size_t start, data_size_t end,
                                  const score_t* gradients,
                                  const score_t* hessians,
                                  hist_t* out) const = 0;
523
524

  virtual void ConstructHistogram(data_size_t start, data_size_t end,
Guolin Ke's avatar
Guolin Ke committed
525
526
527
                                  const score_t* gradients,
                                  const score_t* hessians,
                                  hist_t* out) const = 0;
528

Guolin Ke's avatar
Guolin Ke committed
529
530
531
532
533
534
  virtual void ConstructHistogramOrdered(const data_size_t* data_indices,
                                         data_size_t start, data_size_t end,
                                         const score_t* ordered_gradients,
                                         const score_t* ordered_hessians,
                                         hist_t* out) const = 0;

535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
  virtual void ConstructHistogramInt32(const data_size_t* data_indices,
                                       data_size_t start, data_size_t end,
                                       const score_t* gradients,
                                       const score_t* hessians,
                                       hist_t* out) const = 0;

  virtual void ConstructHistogramInt32(data_size_t start, data_size_t end,
                                       const score_t* gradients,
                                       const score_t* hessians,
                                       hist_t* out) const = 0;

  virtual void ConstructHistogramOrderedInt32(const data_size_t* data_indices,
                                              data_size_t start, data_size_t end,
                                              const score_t* ordered_gradients,
                                              const score_t* ordered_hessians,
                                              hist_t* out) const = 0;

  virtual void ConstructHistogramInt16(const data_size_t* data_indices,
                                       data_size_t start, data_size_t end,
                                       const score_t* gradients,
                                       const score_t* hessians,
                                       hist_t* out) const = 0;

  virtual void ConstructHistogramInt16(data_size_t start, data_size_t end,
                                       const score_t* gradients,
                                       const score_t* hessians,
                                       hist_t* out) const = 0;

  virtual void ConstructHistogramOrderedInt16(const data_size_t* data_indices,
                                              data_size_t start, data_size_t end,
                                              const score_t* ordered_gradients,
                                              const score_t* ordered_hessians,
                                              hist_t* out) const = 0;

  virtual void ConstructHistogramInt8(const data_size_t* data_indices,
                                      data_size_t start, data_size_t end,
                                      const score_t* gradients,
                                      const score_t* hessians,
                                      hist_t* out) const = 0;

  virtual void ConstructHistogramInt8(data_size_t start, data_size_t end,
                                      const score_t* gradients,
                                      const score_t* hessians,
                                      hist_t* out) const = 0;

  virtual void ConstructHistogramOrderedInt8(const data_size_t* data_indices,
                                             data_size_t start, data_size_t end,
                                             const score_t* ordered_gradients,
                                             const score_t* ordered_hessians,
                                             hist_t* out) const = 0;

586
587
588
589
  virtual void FinishLoad() = 0;

  virtual bool IsSparse() = 0;

590
  static MultiValBin* CreateMultiValBin(data_size_t num_data, int num_bin,
591
                                        int num_feature, double sparse_rate, const std::vector<uint32_t>& offsets);
592
593

  static MultiValBin* CreateMultiValDenseBin(data_size_t num_data, int num_bin,
594
                                             int num_feature, const std::vector<uint32_t>& offsets);
595
596

  static MultiValBin* CreateMultiValSparseBin(data_size_t num_data, int num_bin, double estimate_element_per_row);
597

598
599
  static constexpr double multi_val_bin_sparse_threshold = 0.25f;

600
  virtual MultiValBin* Clone() = 0;
601

602
  #ifdef USE_CUDA
603
604
605
606
607
  virtual const void* GetRowWiseData(uint8_t* bit_type,
    size_t* total_size,
    bool* is_sparse,
    const void** out_data_ptr,
    uint8_t* data_ptr_bit_type) const = 0;
608
  #endif  // USE_CUDA
609
610
};

611
612
inline uint32_t BinMapper::ValueToBin(double value) const {
  if (std::isnan(value)) {
613
614
615
    if (bin_type_ == BinType::CategoricalBin) {
      return 0;
    } else if (missing_type_ == MissingType::NaN) {
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
      return num_bin_ - 1;
    } else {
      value = 0.0f;
    }
  }
  if (bin_type_ == BinType::NumericalBin) {
    // binary search to find bin
    int l = 0;
    int r = num_bin_ - 1;
    if (missing_type_ == MissingType::NaN) {
      r -= 1;
    }
    while (l < r) {
      int m = (r + l - 1) / 2;
      if (value <= bin_upper_bound_[m]) {
        r = m;
632
      } else {
633
        l = m + 1;
634
635
      }
    }
636
637
638
639
640
    return l;
  } else {
    int int_value = static_cast<int>(value);
    // convert negative value to NaN bin
    if (int_value < 0) {
641
      return 0;
642
643
644
    }
    if (categorical_2_bin_.count(int_value)) {
      return categorical_2_bin_.at(int_value);
Guolin Ke's avatar
Guolin Ke committed
645
    } else {
646
      return 0;
Guolin Ke's avatar
Guolin Ke committed
647
648
    }
  }
649
}

}  // namespace LightGBM

#endif  // LIGHTGBM_BIN_H_