/*!
 * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */
#ifndef LIGHTGBM_INCLUDE_LIGHTGBM_BIN_H_
#define LIGHTGBM_INCLUDE_LIGHTGBM_BIN_H_

#include <LightGBM/meta.h>
#include <LightGBM/utils/common.h>
#include <LightGBM/utils/file_io.h>

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <functional>
#include <iomanip>
#include <limits>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>

namespace LightGBM {

/*! \brief Type of a feature's bins: ordered numerical bins or unordered categorical bins */
enum BinType {
  NumericalBin,
  CategoricalBin
};

/*! \brief How missing values are represented for a feature */
enum MissingType {
  None,  // feature has no missing values
  Zero,  // zero is used as the missing value (see FindBin's zero_as_missing)
  NaN    // NaN is used as the missing value; the last bin is reserved for it
};

/*! \brief Type of one histogram accumulator (gradient or hessian sum) */
typedef double hist_t;
/*! \brief Type of one integer-quantized histogram accumulator */
typedef int32_t int_hist_t;
/*! \brief Type used to store counts in a histogram slot */
typedef uint64_t hist_cnt_t;
// check at compile time
static_assert(sizeof(hist_t) == sizeof(hist_cnt_t), "Histogram entry size is not correct");

/*! \brief Bytes per histogram entry: one gradient slot plus one hessian slot */
const size_t kHistEntrySize = 2 * sizeof(hist_t);
const size_t kInt32HistEntrySize = 2 * sizeof(int_hist_t);
const size_t kInt16HistEntrySize = 2 * sizeof(int16_t);
/*! \brief Number of slots each bin occupies in a packed histogram (gradient + hessian) */
const int kHistOffset = 2;
/*! \brief Sparsity ratio above which a feature is treated as sparse */
const double kSparseThreshold = 0.7;

// Accessors into a packed histogram: entry i stores gradient at 2*i and hessian at 2*i+1.
#define GET_GRAD(hist, i) hist[(i) << 1]
#define GET_HESS(hist, i) hist[((i) << 1) + 1]

/*!
* \brief Reducer used during distributed training to sum histograms element-wise.
*        Interprets src and dst as arrays of hist_t and accumulates src into dst in place.
* \param src Source buffer holding histogram values to add
* \param dst Destination buffer, accumulated in place
* \param type_size Size in bytes of one element (stride of the walk)
* \param len Total number of bytes to reduce
*/
inline static void HistogramSumReducer(const char* src, char* dst, int type_size, comm_size_t len) {
  comm_size_t used_size = 0;
  while (used_size < len) {
    // reinterpret the raw byte buffers as histogram entries and accumulate
    const hist_t* p1 = reinterpret_cast<const hist_t*>(src);
    hist_t* p2 = reinterpret_cast<hist_t*>(dst);
    *p2 += *p1;
    src += type_size;
    dst += type_size;
    used_size += type_size;
  }
}
inline static void Int32HistogramSumReducer(const char* src, char* dst, int type_size, comm_size_t len) {
  const int64_t* src_ptr = reinterpret_cast<const int64_t*>(src);
  int64_t* dst_ptr = reinterpret_cast<int64_t*>(dst);
  const comm_size_t steps = (len + (type_size * 2) - 1) / (type_size * 2);
67
  #pragma omp parallel for schedule(static) num_threads(OMP_NUM_THREADS())
68
69
70
71
72
73
74
75
76
  for (comm_size_t i = 0; i < steps; ++i) {
    dst_ptr[i] += src_ptr[i];
  }
}

inline static void Int16HistogramSumReducer(const char* src, char* dst, int type_size, comm_size_t len) {
  const int32_t* src_ptr = reinterpret_cast<const int32_t*>(src);
  int32_t* dst_ptr = reinterpret_cast<int32_t*>(dst);
  const comm_size_t steps = (len + (type_size * 2) - 1) / (type_size * 2);
77
  #pragma omp parallel for schedule(static) num_threads(OMP_NUM_THREADS())
78
79
80
81
82
  for (comm_size_t i = 0; i < steps; ++i) {
    dst_ptr[i] += src_ptr[i];
  }
}

83
84
85
/*! \brief This class used to convert feature values into bin,
*          and store some meta information for bin*/
class BinMapper {
Nikita Titov's avatar
Nikita Titov committed
86
 public:
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
  BinMapper();
  BinMapper(const BinMapper& other);
  explicit BinMapper(const void* memory);
  ~BinMapper();

  bool CheckAlign(const BinMapper& other) const {
    if (num_bin_ != other.num_bin_) {
      return false;
    }
    if (missing_type_ != other.missing_type_) {
      return false;
    }
    if (bin_type_ == BinType::NumericalBin) {
      for (int i = 0; i < num_bin_; ++i) {
        if (bin_upper_bound_[i] != other.bin_upper_bound_[i]) {
          return false;
Guolin Ke's avatar
Guolin Ke committed
103
        }
104
105
106
107
108
      }
    } else {
      for (int i = 0; i < num_bin_; i++) {
        if (bin_2_categorical_[i] != other.bin_2_categorical_[i]) {
          return false;
109
        }
110
111
      }
    }
112
113
    return true;
  }
114

115
116
  /*! \brief Get number of bins */
  inline int num_bin() const { return num_bin_; }
117

118
119
  /*! \brief Missing Type */
  inline MissingType missing_type() const { return missing_type_; }
120

Lingyi Hu's avatar
Lingyi Hu committed
121
122
  /*! \brief True if bin is trivial (contains only one bin) */
  inline bool is_trivial() const { return is_trivial_; }
123

124
125
  /*! \brief Sparsity of this bin ( num_zero_bins / num_data ) */
  inline double sparse_rate() const { return sparse_rate_; }
126

127
128
129
130
  /*!
  * \brief Save binary data to file
  * \param file File want to write
  */
131
  void SaveBinaryToFile(BinaryWriter* writer) const;
132

133
134
135
136
137
138
139
140
141
142
  /*!
  * \brief Mapping bin into feature value
  * \param bin
  * \return Feature value of this bin
  */
  inline double BinToValue(uint32_t bin) const {
    if (bin_type_ == BinType::NumericalBin) {
      return bin_upper_bound_[bin];
    } else {
      return bin_2_categorical_[bin];
Guolin Ke's avatar
Guolin Ke committed
143
    }
144
  }
Guolin Ke's avatar
Guolin Ke committed
145

146
147
  /*!
  * \brief Maximum categorical value
148
  * \return Maximum categorical value for categorical features, 0 for numerical features
149
150
151
152
153
154
155
156
157
158
159
160
161
162
  */
  inline int MaxCatValue() const {
    if (bin_2_categorical_.size() == 0) {
      return 0;
    }
    int max_cat_value = bin_2_categorical_[0];
    for (size_t i = 1; i < bin_2_categorical_.size(); ++i) {
      if (bin_2_categorical_[i] > max_cat_value) {
        max_cat_value = bin_2_categorical_[i];
      }
    }
    return max_cat_value;
  }

163
164
165
166
  /*!
  * \brief Get sizes in byte of this object
  */
  size_t SizesInByte() const;
167

168
  /*!
169
  * \brief Mapping feature value into bin
170
171
172
173
174
175
176
177
178
179
180
181
  * \param value
  * \return bin for this feature value
  */
  inline uint32_t ValueToBin(double value) const;

  /*!
  * \brief Get the default bin when value is 0
  * \return default bin
  */
  inline uint32_t GetDefaultBin() const {
    return default_bin_;
  }
Guolin Ke's avatar
Guolin Ke committed
182
183
184
185
186

  inline uint32_t GetMostFreqBin() const {
    return most_freq_bin_;
  }

187
188
  /*!
  * \brief Construct feature value to bin mapper according feature values
189
  * \param values (Sampled) values of this feature, Note: not include zero.
190
191
192
193
194
  * \param num_values number of values.
  * \param total_sample_cnt number of total sample count, equal with values.size() + num_zeros
  * \param max_bin The maximal number of bin
  * \param min_data_in_bin min number of data in one bin
  * \param min_split_data
195
  * \param pre_filter
196
197
198
  * \param bin_type Type of this bin
  * \param use_missing True to enable missing value handle
  * \param zero_as_missing True to use zero as missing value
199
  * \param forced_upper_bounds Vector of split points that must be used (if this has size less than max_bin, remaining splits are found by the algorithm)
200
  */
201
  void FindBin(double* values, int num_values, size_t total_sample_cnt, int max_bin, int min_data_in_bin, int min_split_data, bool pre_filter, BinType bin_type,
202
               bool use_missing, bool zero_as_missing, const std::vector<double>& forced_upper_bounds);
203
204

  /*!
205
  * \brief Serializing this object to buffer
206
207
208
209
210
  * \param buffer The destination
  */
  void CopyTo(char* buffer) const;

  /*!
211
  * \brief Deserializing this object from buffer
212
213
214
215
216
217
218
219
220
221
222
223
  * \param buffer The source
  */
  void CopyFrom(const char* buffer);

  /*!
  * \brief Get bin types
  */
  inline BinType bin_type() const { return bin_type_; }

  /*!
  * \brief Get bin info
  */
224
  inline std::string bin_info_string() const {
225
226
227
228
229
230
231
    if (bin_type_ == BinType::CategoricalBin) {
      return Common::Join(bin_2_categorical_, ":");
    } else {
      std::stringstream str_buf;
      str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2);
      str_buf << '[' << min_val_ << ':' << max_val_ << ']';
      return str_buf.str();
232
    }
233
  }
Guolin Ke's avatar
Guolin Ke committed
234

Nikita Titov's avatar
Nikita Titov committed
235
 private:
236
237
238
239
240
  /*! \brief Number of bins */
  int num_bin_;
  MissingType missing_type_;
  /*! \brief Store upper bound for each bin */
  std::vector<double> bin_upper_bound_;
Lingyi Hu's avatar
Lingyi Hu committed
241
242
  /*! \brief True if this feature is trivial */
  bool is_trivial_;
243
244
245
246
247
248
249
250
  /*! \brief Sparse rate of this bins( num_bin0/num_data ) */
  double sparse_rate_;
  /*! \brief Type of this bin */
  BinType bin_type_;
  /*! \brief Mapper from categorical to bin */
  std::unordered_map<int, unsigned int> categorical_2_bin_;
  /*! \brief Mapper from bin to categorical */
  std::vector<int> bin_2_categorical_;
251
  /*! \brief minimal feature value */
252
253
254
255
256
  double min_val_;
  /*! \brief maximum feature value */
  double max_val_;
  /*! \brief bin value of feature value 0 */
  uint32_t default_bin_;
Guolin Ke's avatar
Guolin Ke committed
257
258

  uint32_t most_freq_bin_;
259
260
261
262
};

/*! \brief Iterator for one bin column */
class BinIterator {
 public:
  /*!
  * \brief Get bin data on specific row index
  * \param idx Index of this data
  * \return Bin data
  */
  virtual uint32_t Get(data_size_t idx) = 0;
  /*!
  * \brief Get raw (un-remapped) bin value on specific row index
  * \param idx Index of this data
  * \return Raw bin value
  */
  virtual uint32_t RawGet(data_size_t idx) = 0;
  /*! \brief Reset the iterator to start from row idx */
  virtual void Reset(data_size_t idx) = 0;
  virtual ~BinIterator() = default;
};

/*!
* \brief Interface for bin data. This class will store bin data for one feature.
*        unlike OrderedBin, this class will store data by original order.
*        Note that it may cause cache misses when construct histogram,
*        but it doesn't need to re-order operation, So it will be faster than OrderedBin for dense feature
*/
class Bin {
Nikita Titov's avatar
Nikita Titov committed
282
 public:
283
284
285
  /*! \brief virtual destructor */
  virtual ~Bin() {}
  /*!
286
287
  * \brief Initialize for pushing.  By default, no action needed.
  * \param num_thread The number of external threads that will be calling the push APIs
288
  * \param omp_max_threads The maximum number of OpenMP threads to allocate for
289
  */
290
  virtual void InitStreaming(uint32_t /*num_thread*/, int32_t /*omp_max_threads*/) { }
291
  /*!
292
  * \brief Push one record
293
  * \param tid Thread id
294
295
296
297
298
  * \param idx Index of record
  * \param value bin value of record
  */
  virtual void Push(int tid, data_size_t idx, uint32_t value) = 0;

299
  virtual void CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) = 0;
300
301
302
303
  /*!
  * \brief Get bin iterator of this bin for specific feature
  * \param min_bin min_bin of current used feature
  * \param max_bin max_bin of current used feature
Guolin Ke's avatar
Guolin Ke committed
304
  * \param most_freq_bin
305
306
  * \return Iterator of this bin
  */
Guolin Ke's avatar
Guolin Ke committed
307
  virtual BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const = 0;
308
309
310
311
312

  /*!
  * \brief Save binary data to file
  * \param file File want to write
  */
313
  virtual void SaveBinaryToFile(BinaryWriter* writer) const = 0;
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330

  /*!
  * \brief Load from memory
  * \param memory
  * \param local_used_indices
  */
  virtual void LoadFromMemory(const void* memory,
    const std::vector<data_size_t>& local_used_indices) = 0;

  /*!
  * \brief Get sizes in byte of this object
  */
  virtual size_t SizesInByte() const = 0;

  /*! \brief Number of all data */
  virtual data_size_t num_data() const = 0;

331
332
333
  /*! \brief Get data pointer */
  virtual void* get_data() = 0;

334
335
336
337
338
339
340
341
342
343
  virtual void ReSize(data_size_t num_data) = 0;

  /*!
  * \brief Construct histogram of this feature,
  *        Note: We use ordered_gradients and ordered_hessians to improve cache hit chance
  *        The naive solution is using gradients[data_indices[i]] for data_indices[i] to get gradients,
           which is not cache friendly, since the access of memory is not continuous.
  *        ordered_gradients and ordered_hessians are preprocessed, and they are re-ordered by data_indices.
  *        Ordered_gradients[i] is aligned with data_indices[i]'s gradients (same for ordered_hessians).
  * \param data_indices Used data indices in current leaf
344
345
  * \param start start index in data_indices
  * \param end end index in data_indices
346
347
348
349
350
  * \param ordered_gradients Pointer to gradients, the data_indices[i]-th data's gradient is ordered_gradients[i]
  * \param ordered_hessians Pointer to hessians, the data_indices[i]-th data's hessian is ordered_hessians[i]
  * \param out Output Result
  */
  virtual void ConstructHistogram(
351
    const data_size_t* data_indices, data_size_t start, data_size_t end,
352
    const score_t* ordered_gradients, const score_t* ordered_hessians,
353
    hist_t* out) const = 0;
354

355
  virtual void ConstructHistogram(data_size_t start, data_size_t end,
356
    const score_t* ordered_gradients, const score_t* ordered_hessians,
357
    hist_t* out) const = 0;
358

359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
  virtual void ConstructHistogramInt8(
    const data_size_t* data_indices, data_size_t start, data_size_t end,
    const score_t* ordered_gradients, const score_t* ordered_hessians,
    hist_t* out) const = 0;

  virtual void ConstructHistogramInt8(data_size_t start, data_size_t end,
    const score_t* ordered_gradients, const score_t* ordered_hessians,
    hist_t* out) const = 0;

  virtual void ConstructHistogramInt16(
    const data_size_t* data_indices, data_size_t start, data_size_t end,
    const score_t* ordered_gradients, const score_t* ordered_hessians,
    hist_t* out) const = 0;

  virtual void ConstructHistogramInt16(data_size_t start, data_size_t end,
    const score_t* ordered_gradients, const score_t* ordered_hessians,
    hist_t* out) const = 0;

  virtual void ConstructHistogramInt32(
    const data_size_t* data_indices, data_size_t start, data_size_t end,
    const score_t* ordered_gradients, const score_t* ordered_hessians,
    hist_t* out) const = 0;

  virtual void ConstructHistogramInt32(data_size_t start, data_size_t end,
    const score_t* ordered_gradients, const score_t* ordered_hessians,
    hist_t* out) const = 0;

386
387
388
389
390
391
392
393
  /*!
  * \brief Construct histogram of this feature,
  *        Note: We use ordered_gradients and ordered_hessians to improve cache hit chance
  *        The naive solution is using gradients[data_indices[i]] for data_indices[i] to get gradients,
  which is not cache friendly, since the access of memory is not continuous.
  *        ordered_gradients and ordered_hessians are preprocessed, and they are re-ordered by data_indices.
  *        Ordered_gradients[i] is aligned with data_indices[i]'s gradients (same for ordered_hessians).
  * \param data_indices Used data indices in current leaf
394
395
  * \param start start index in data_indices
  * \param end end index in data_indices
396
397
398
  * \param ordered_gradients Pointer to gradients, the data_indices[i]-th data's gradient is ordered_gradients[i]
  * \param out Output Result
  */
399
  virtual void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
400
                                  const score_t* ordered_gradients, hist_t* out) const = 0;
401

402
  virtual void ConstructHistogram(data_size_t start, data_size_t end,
403
                                  const score_t* ordered_gradients, hist_t* out) const = 0;
404

405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
  virtual void ConstructHistogramInt8(const data_size_t* data_indices, data_size_t start, data_size_t end,
                                       const score_t* ordered_gradients, hist_t* out) const = 0;

  virtual void ConstructHistogramInt8(data_size_t start, data_size_t end,
                                       const score_t* ordered_gradients, hist_t* out) const = 0;

  virtual void ConstructHistogramInt16(const data_size_t* data_indices, data_size_t start, data_size_t end,
                                       const score_t* ordered_gradients, hist_t* out) const = 0;

  virtual void ConstructHistogramInt16(data_size_t start, data_size_t end,
                                       const score_t* ordered_gradients, hist_t* out) const = 0;

  virtual void ConstructHistogramInt32(const data_size_t* data_indices, data_size_t start, data_size_t end,
                                       const score_t* ordered_gradients, hist_t* out) const = 0;

  virtual void ConstructHistogramInt32(data_size_t start, data_size_t end,
                                       const score_t* ordered_gradients, hist_t* out) const = 0;

423
  virtual data_size_t Split(uint32_t min_bin, uint32_t max_bin,
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
                            uint32_t default_bin, uint32_t most_freq_bin,
                            MissingType missing_type, bool default_left,
                            uint32_t threshold, const data_size_t* data_indices,
                            data_size_t cnt,
                            data_size_t* lte_indices,
                            data_size_t* gt_indices) const = 0;

  virtual data_size_t SplitCategorical(
      uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin,
      const uint32_t* threshold, int num_threshold,
      const data_size_t* data_indices, data_size_t cnt,
      data_size_t* lte_indices, data_size_t* gt_indices) const = 0;

  virtual data_size_t Split(uint32_t max_bin, uint32_t default_bin,
                            uint32_t most_freq_bin, MissingType missing_type,
                            bool default_left, uint32_t threshold,
                            const data_size_t* data_indices, data_size_t cnt,
                            data_size_t* lte_indices,
                            data_size_t* gt_indices) const = 0;

  virtual data_size_t SplitCategorical(
      uint32_t max_bin, uint32_t most_freq_bin, const uint32_t* threshold,
      int num_threshold, const data_size_t* data_indices, data_size_t cnt,
      data_size_t* lte_indices, data_size_t* gt_indices) const = 0;

449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
  /*!
  * \brief After pushed all feature data, call this could have better refactor for bin data
  */
  virtual void FinishLoad() = 0;

  /*!
  * \brief Create object for bin data of one feature, used for dense feature
  * \param num_data Total number of data
  * \param num_bin Number of bin
  * \return The bin data object
  */
  static Bin* CreateDenseBin(data_size_t num_data, int num_bin);

  /*!
  * \brief Create object for bin data of one feature, used for sparse feature
  * \param num_data Total number of data
  * \param num_bin Number of bin
  * \return The bin data object
  */
  static Bin* CreateSparseBin(data_size_t num_data, int num_bin);
469
470
471
472
473

  /*!
  * \brief Deep copy the bin
  */
  virtual Bin* Clone() = 0;
474
475
476
477

  virtual const void* GetColWiseData(uint8_t* bit_type, bool* is_sparse, std::vector<BinIterator*>* bin_iterator, const int num_threads) const = 0;

  virtual const void* GetColWiseData(uint8_t* bit_type, bool* is_sparse, BinIterator** bin_iterator) const = 0;
478
479
};

480
481

class MultiValBin {
482
 public:
483
484
485
486
487
488
  virtual ~MultiValBin() {}

  virtual data_size_t num_data() const = 0;

  virtual int32_t num_bin() const = 0;

489
  virtual double num_element_per_row() const = 0;
490

491
  virtual const std::vector<uint32_t>& offsets() const = 0;
492

493
  virtual void PushOneRow(int tid, data_size_t idx, const std::vector<uint32_t>& values) = 0;
494

495
496
497
  virtual void CopySubrow(const MultiValBin* full_bin,
                          const data_size_t* used_indices,
                          data_size_t num_used_indices) = 0;
498

499
500
  virtual MultiValBin* CreateLike(data_size_t num_data, int num_bin,
                                  int num_feature,
501
502
                                  double estimate_element_per_row,
                                  const std::vector<uint32_t>& offsets) const = 0;
503

504
505
506
507
508
509
510
  virtual void CopySubcol(const MultiValBin* full_bin,
                          const std::vector<int>& used_feature_index,
                          const std::vector<uint32_t>& lower,
                          const std::vector<uint32_t>& upper,
                          const std::vector<uint32_t>& delta) = 0;

  virtual void ReSize(data_size_t num_data, int num_bin, int num_feature,
511
                      double estimate_element_per_row, const std::vector<uint32_t>& offsets) = 0;
512
513
514
515
516
517

  virtual void CopySubrowAndSubcol(
      const MultiValBin* full_bin, const data_size_t* used_indices,
      data_size_t num_used_indices, const std::vector<int>& used_feature_index,
      const std::vector<uint32_t>& lower, const std::vector<uint32_t>& upper,
      const std::vector<uint32_t>& delta) = 0;
518

Guolin Ke's avatar
Guolin Ke committed
519
520
521
522
523
  virtual void ConstructHistogram(const data_size_t* data_indices,
                                  data_size_t start, data_size_t end,
                                  const score_t* gradients,
                                  const score_t* hessians,
                                  hist_t* out) const = 0;
524
525

  virtual void ConstructHistogram(data_size_t start, data_size_t end,
Guolin Ke's avatar
Guolin Ke committed
526
527
528
                                  const score_t* gradients,
                                  const score_t* hessians,
                                  hist_t* out) const = 0;
529

Guolin Ke's avatar
Guolin Ke committed
530
531
532
533
534
535
  virtual void ConstructHistogramOrdered(const data_size_t* data_indices,
                                         data_size_t start, data_size_t end,
                                         const score_t* ordered_gradients,
                                         const score_t* ordered_hessians,
                                         hist_t* out) const = 0;

536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
  virtual void ConstructHistogramInt32(const data_size_t* data_indices,
                                       data_size_t start, data_size_t end,
                                       const score_t* gradients,
                                       const score_t* hessians,
                                       hist_t* out) const = 0;

  virtual void ConstructHistogramInt32(data_size_t start, data_size_t end,
                                       const score_t* gradients,
                                       const score_t* hessians,
                                       hist_t* out) const = 0;

  virtual void ConstructHistogramOrderedInt32(const data_size_t* data_indices,
                                              data_size_t start, data_size_t end,
                                              const score_t* ordered_gradients,
                                              const score_t* ordered_hessians,
                                              hist_t* out) const = 0;

  virtual void ConstructHistogramInt16(const data_size_t* data_indices,
                                       data_size_t start, data_size_t end,
                                       const score_t* gradients,
                                       const score_t* hessians,
                                       hist_t* out) const = 0;

  virtual void ConstructHistogramInt16(data_size_t start, data_size_t end,
                                       const score_t* gradients,
                                       const score_t* hessians,
                                       hist_t* out) const = 0;

  virtual void ConstructHistogramOrderedInt16(const data_size_t* data_indices,
                                              data_size_t start, data_size_t end,
                                              const score_t* ordered_gradients,
                                              const score_t* ordered_hessians,
                                              hist_t* out) const = 0;

  virtual void ConstructHistogramInt8(const data_size_t* data_indices,
                                      data_size_t start, data_size_t end,
                                      const score_t* gradients,
                                      const score_t* hessians,
                                      hist_t* out) const = 0;

  virtual void ConstructHistogramInt8(data_size_t start, data_size_t end,
                                      const score_t* gradients,
                                      const score_t* hessians,
                                      hist_t* out) const = 0;

  virtual void ConstructHistogramOrderedInt8(const data_size_t* data_indices,
                                             data_size_t start, data_size_t end,
                                             const score_t* ordered_gradients,
                                             const score_t* ordered_hessians,
                                             hist_t* out) const = 0;

587
588
589
590
  virtual void FinishLoad() = 0;

  virtual bool IsSparse() = 0;

591
  static MultiValBin* CreateMultiValBin(data_size_t num_data, int num_bin,
592
                                        int num_feature, double sparse_rate, const std::vector<uint32_t>& offsets);
593
594

  static MultiValBin* CreateMultiValDenseBin(data_size_t num_data, int num_bin,
595
                                             int num_feature, const std::vector<uint32_t>& offsets);
596
597

  static MultiValBin* CreateMultiValSparseBin(data_size_t num_data, int num_bin, double estimate_element_per_row);
598

599
600
  static constexpr double multi_val_bin_sparse_threshold = 0.25f;

601
  virtual MultiValBin* Clone() = 0;
602

603
  #ifdef USE_CUDA
604
605
606
607
608
  virtual const void* GetRowWiseData(uint8_t* bit_type,
    size_t* total_size,
    bool* is_sparse,
    const void** out_data_ptr,
    uint8_t* data_ptr_bit_type) const = 0;
609
  #endif  // USE_CUDA
610
611
};

612
613
inline uint32_t BinMapper::ValueToBin(double value) const {
  if (std::isnan(value)) {
614
615
616
    if (bin_type_ == BinType::CategoricalBin) {
      return 0;
    } else if (missing_type_ == MissingType::NaN) {
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
      return num_bin_ - 1;
    } else {
      value = 0.0f;
    }
  }
  if (bin_type_ == BinType::NumericalBin) {
    // binary search to find bin
    int l = 0;
    int r = num_bin_ - 1;
    if (missing_type_ == MissingType::NaN) {
      r -= 1;
    }
    while (l < r) {
      int m = (r + l - 1) / 2;
      if (value <= bin_upper_bound_[m]) {
        r = m;
633
      } else {
634
        l = m + 1;
635
636
      }
    }
637
638
639
640
641
    return l;
  } else {
    int int_value = static_cast<int>(value);
    // convert negative value to NaN bin
    if (int_value < 0) {
642
      return 0;
643
644
645
    }
    if (categorical_2_bin_.count(int_value)) {
      return categorical_2_bin_.at(int_value);
Guolin Ke's avatar
Guolin Ke committed
646
    } else {
647
      return 0;
Guolin Ke's avatar
Guolin Ke committed
648
649
    }
  }
650
}
Guolin Ke's avatar
Guolin Ke committed
651
652
653

}  // namespace LightGBM

#endif   // LIGHTGBM_INCLUDE_LIGHTGBM_BIN_H_