/*!
 * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */
#ifndef LIGHTGBM_BIN_H_
#define LIGHTGBM_BIN_H_

#include <LightGBM/meta.h>
#include <LightGBM/utils/common.h>
#include <LightGBM/utils/file_io.h>

#include <cmath>
#include <cstdint>
#include <functional>
#include <iomanip>
#include <limits>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>

namespace LightGBM {

21
22
23
24
25
26
27
28
29
30
31
enum BinType {
  NumericalBin,
  CategoricalBin
};

enum MissingType {
  None,
  Zero,
  NaN
};

32
typedef double hist_t;
33
typedef int32_t int_hist_t;
Guolin Ke's avatar
Guolin Ke committed
34
35
36
typedef uint64_t hist_cnt_t;
// check at compile time
static_assert(sizeof(hist_t) == sizeof(hist_cnt_t), "Histogram entry size is not correct");
37

38
const size_t kHistEntrySize = 2 * sizeof(hist_t);
39
40
const size_t kInt32HistEntrySize = 2 * sizeof(int_hist_t);
const size_t kInt16HistEntrySize = 2 * sizeof(int16_t);
41
const int kHistOffset = 2;
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
const double kSparseThreshold = 0.7;

#define GET_GRAD(hist, i) hist[(i) << 1]
#define GET_HESS(hist, i) hist[((i) << 1) + 1]

inline static void HistogramSumReducer(const char* src, char* dst, int type_size, comm_size_t len) {
  comm_size_t used_size = 0;
  const hist_t* p1;
  hist_t* p2;
  while (used_size < len) {
    // convert
    p1 = reinterpret_cast<const hist_t*>(src);
    p2 = reinterpret_cast<hist_t*>(dst);
    *p2 += *p1;
    src += type_size;
    dst += type_size;
    used_size += type_size;
59
  }
60
}
61

62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
inline static void Int32HistogramSumReducer(const char* src, char* dst, int type_size, comm_size_t len) {
  const int64_t* src_ptr = reinterpret_cast<const int64_t*>(src);
  int64_t* dst_ptr = reinterpret_cast<int64_t*>(dst);
  const comm_size_t steps = (len + (type_size * 2) - 1) / (type_size * 2);
  const int num_threads = OMP_NUM_THREADS();
  #pragma omp parallel for schedule(static) num_threads(num_threads)
  for (comm_size_t i = 0; i < steps; ++i) {
    dst_ptr[i] += src_ptr[i];
  }
}

inline static void Int16HistogramSumReducer(const char* src, char* dst, int type_size, comm_size_t len) {
  const int32_t* src_ptr = reinterpret_cast<const int32_t*>(src);
  int32_t* dst_ptr = reinterpret_cast<int32_t*>(dst);
  const comm_size_t steps = (len + (type_size * 2) - 1) / (type_size * 2);
  const int num_threads = OMP_NUM_THREADS();
  #pragma omp parallel for schedule(static) num_threads(num_threads)
  for (comm_size_t i = 0; i < steps; ++i) {
    dst_ptr[i] += src_ptr[i];
  }
}

84
85
86
/*! \brief This class used to convert feature values into bin,
*          and store some meta information for bin*/
class BinMapper {
Nikita Titov's avatar
Nikita Titov committed
87
 public:
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
  BinMapper();
  BinMapper(const BinMapper& other);
  explicit BinMapper(const void* memory);
  ~BinMapper();

  bool CheckAlign(const BinMapper& other) const {
    if (num_bin_ != other.num_bin_) {
      return false;
    }
    if (missing_type_ != other.missing_type_) {
      return false;
    }
    if (bin_type_ == BinType::NumericalBin) {
      for (int i = 0; i < num_bin_; ++i) {
        if (bin_upper_bound_[i] != other.bin_upper_bound_[i]) {
          return false;
Guolin Ke's avatar
Guolin Ke committed
104
        }
105
106
107
108
109
      }
    } else {
      for (int i = 0; i < num_bin_; i++) {
        if (bin_2_categorical_[i] != other.bin_2_categorical_[i]) {
          return false;
110
        }
111
112
      }
    }
113
114
    return true;
  }
115

116
117
  /*! \brief Get number of bins */
  inline int num_bin() const { return num_bin_; }
118

119
120
  /*! \brief Missing Type */
  inline MissingType missing_type() const { return missing_type_; }
121

Lingyi Hu's avatar
Lingyi Hu committed
122
123
  /*! \brief True if bin is trivial (contains only one bin) */
  inline bool is_trivial() const { return is_trivial_; }
124

125
126
  /*! \brief Sparsity of this bin ( num_zero_bins / num_data ) */
  inline double sparse_rate() const { return sparse_rate_; }
127

128
129
130
131
  /*!
  * \brief Save binary data to file
  * \param file File want to write
  */
132
  void SaveBinaryToFile(BinaryWriter* writer) const;
133

134
135
136
137
138
139
140
141
142
143
  /*!
  * \brief Mapping bin into feature value
  * \param bin
  * \return Feature value of this bin
  */
  inline double BinToValue(uint32_t bin) const {
    if (bin_type_ == BinType::NumericalBin) {
      return bin_upper_bound_[bin];
    } else {
      return bin_2_categorical_[bin];
Guolin Ke's avatar
Guolin Ke committed
144
    }
145
  }
Guolin Ke's avatar
Guolin Ke committed
146

147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
  /*!
  * \brief Maximum categorical value
  * \return Maximum categorical value for categorical features, 0 for numerical features  
  */
  inline int MaxCatValue() const {
    if (bin_2_categorical_.size() == 0) {
      return 0;
    }
    int max_cat_value = bin_2_categorical_[0];
    for (size_t i = 1; i < bin_2_categorical_.size(); ++i) {
      if (bin_2_categorical_[i] > max_cat_value) {
        max_cat_value = bin_2_categorical_[i];
      }
    }
    return max_cat_value;
  }

164
165
166
167
  /*!
  * \brief Get sizes in byte of this object
  */
  size_t SizesInByte() const;
168

169
  /*!
170
  * \brief Mapping feature value into bin
171
172
173
174
175
176
177
178
179
180
181
182
  * \param value
  * \return bin for this feature value
  */
  inline uint32_t ValueToBin(double value) const;

  /*!
  * \brief Get the default bin when value is 0
  * \return default bin
  */
  inline uint32_t GetDefaultBin() const {
    return default_bin_;
  }
Guolin Ke's avatar
Guolin Ke committed
183
184
185
186
187

  inline uint32_t GetMostFreqBin() const {
    return most_freq_bin_;
  }

188
189
  /*!
  * \brief Construct feature value to bin mapper according feature values
190
  * \param values (Sampled) values of this feature, Note: not include zero.
191
192
193
194
195
  * \param num_values number of values.
  * \param total_sample_cnt number of total sample count, equal with values.size() + num_zeros
  * \param max_bin The maximal number of bin
  * \param min_data_in_bin min number of data in one bin
  * \param min_split_data
196
  * \param pre_filter
197
198
199
  * \param bin_type Type of this bin
  * \param use_missing True to enable missing value handle
  * \param zero_as_missing True to use zero as missing value
200
  * \param forced_upper_bounds Vector of split points that must be used (if this has size less than max_bin, remaining splits are found by the algorithm)
201
  */
202
  void FindBin(double* values, int num_values, size_t total_sample_cnt, int max_bin, int min_data_in_bin, int min_split_data, bool pre_filter, BinType bin_type,
203
               bool use_missing, bool zero_as_missing, const std::vector<double>& forced_upper_bounds);
204
205

  /*!
206
  * \brief Serializing this object to buffer
207
208
209
210
211
  * \param buffer The destination
  */
  void CopyTo(char* buffer) const;

  /*!
212
  * \brief Deserializing this object from buffer
213
214
215
216
217
218
219
220
221
222
223
224
  * \param buffer The source
  */
  void CopyFrom(const char* buffer);

  /*!
  * \brief Get bin types
  */
  inline BinType bin_type() const { return bin_type_; }

  /*!
  * \brief Get bin info
  */
225
  inline std::string bin_info_string() const {
226
227
228
229
230
231
232
    if (bin_type_ == BinType::CategoricalBin) {
      return Common::Join(bin_2_categorical_, ":");
    } else {
      std::stringstream str_buf;
      str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2);
      str_buf << '[' << min_val_ << ':' << max_val_ << ']';
      return str_buf.str();
233
    }
234
  }
Guolin Ke's avatar
Guolin Ke committed
235

Nikita Titov's avatar
Nikita Titov committed
236
 private:
237
238
239
240
241
  /*! \brief Number of bins */
  int num_bin_;
  MissingType missing_type_;
  /*! \brief Store upper bound for each bin */
  std::vector<double> bin_upper_bound_;
Lingyi Hu's avatar
Lingyi Hu committed
242
243
  /*! \brief True if this feature is trivial */
  bool is_trivial_;
244
245
246
247
248
249
250
251
  /*! \brief Sparse rate of this bins( num_bin0/num_data ) */
  double sparse_rate_;
  /*! \brief Type of this bin */
  BinType bin_type_;
  /*! \brief Mapper from categorical to bin */
  std::unordered_map<int, unsigned int> categorical_2_bin_;
  /*! \brief Mapper from bin to categorical */
  std::vector<int> bin_2_categorical_;
252
  /*! \brief minimal feature value */
253
254
255
256
257
  double min_val_;
  /*! \brief maximum feature value */
  double max_val_;
  /*! \brief bin value of feature value 0 */
  uint32_t default_bin_;
Guolin Ke's avatar
Guolin Ke committed
258
259

  uint32_t most_freq_bin_;
260
261
262
263
};

/*! \brief Iterator for one bin column */
class BinIterator {
Nikita Titov's avatar
Nikita Titov committed
264
 public:
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
  /*!
  * \brief Get bin data on specific row index
  * \param idx Index of this data
  * \return Bin data
  */
  virtual uint32_t Get(data_size_t idx) = 0;
  virtual uint32_t RawGet(data_size_t idx) = 0;
  virtual void Reset(data_size_t idx) = 0;
  virtual ~BinIterator() = default;
};

/*!
* \brief Interface for bin data. This class will store bin data for one feature.
*        unlike OrderedBin, this class will store data by original order.
*        Note that it may cause cache misses when construct histogram,
*        but it doesn't need to re-order operation, So it will be faster than OrderedBin for dense feature
*/
class Bin {
Nikita Titov's avatar
Nikita Titov committed
283
 public:
284
285
286
  /*! \brief virtual destructor */
  virtual ~Bin() {}
  /*!
287
288
  * \brief Initialize for pushing.  By default, no action needed.
  * \param num_thread The number of external threads that will be calling the push APIs
289
  * \param omp_max_threads The maximum number of OpenMP threads to allocate for
290
  */
291
  virtual void InitStreaming(uint32_t /*num_thread*/, int32_t /*omp_max_threads*/) { }
292
  /*!
293
  * \brief Push one record
294
  * \param tid Thread id
295
296
297
298
299
  * \param idx Index of record
  * \param value bin value of record
  */
  virtual void Push(int tid, data_size_t idx, uint32_t value) = 0;

300
  virtual void CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) = 0;
301
302
303
304
  /*!
  * \brief Get bin iterator of this bin for specific feature
  * \param min_bin min_bin of current used feature
  * \param max_bin max_bin of current used feature
Guolin Ke's avatar
Guolin Ke committed
305
  * \param most_freq_bin
306
307
  * \return Iterator of this bin
  */
Guolin Ke's avatar
Guolin Ke committed
308
  virtual BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const = 0;
309
310
311
312
313

  /*!
  * \brief Save binary data to file
  * \param file File want to write
  */
314
  virtual void SaveBinaryToFile(BinaryWriter* writer) const = 0;
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331

  /*!
  * \brief Load from memory
  * \param memory
  * \param local_used_indices
  */
  virtual void LoadFromMemory(const void* memory,
    const std::vector<data_size_t>& local_used_indices) = 0;

  /*!
  * \brief Get sizes in byte of this object
  */
  virtual size_t SizesInByte() const = 0;

  /*! \brief Number of all data */
  virtual data_size_t num_data() const = 0;

332
333
334
  /*! \brief Get data pointer */
  virtual void* get_data() = 0;

335
336
337
338
339
340
341
342
343
344
  virtual void ReSize(data_size_t num_data) = 0;

  /*!
  * \brief Construct histogram of this feature,
  *        Note: We use ordered_gradients and ordered_hessians to improve cache hit chance
  *        The naive solution is using gradients[data_indices[i]] for data_indices[i] to get gradients,
           which is not cache friendly, since the access of memory is not continuous.
  *        ordered_gradients and ordered_hessians are preprocessed, and they are re-ordered by data_indices.
  *        Ordered_gradients[i] is aligned with data_indices[i]'s gradients (same for ordered_hessians).
  * \param data_indices Used data indices in current leaf
345
346
  * \param start start index in data_indices
  * \param end end index in data_indices
347
348
349
350
351
  * \param ordered_gradients Pointer to gradients, the data_indices[i]-th data's gradient is ordered_gradients[i]
  * \param ordered_hessians Pointer to hessians, the data_indices[i]-th data's hessian is ordered_hessians[i]
  * \param out Output Result
  */
  virtual void ConstructHistogram(
352
    const data_size_t* data_indices, data_size_t start, data_size_t end,
353
    const score_t* ordered_gradients, const score_t* ordered_hessians,
354
    hist_t* out) const = 0;
355

356
  virtual void ConstructHistogram(data_size_t start, data_size_t end,
357
    const score_t* ordered_gradients, const score_t* ordered_hessians,
358
    hist_t* out) const = 0;
359

360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
  virtual void ConstructHistogramInt8(
    const data_size_t* data_indices, data_size_t start, data_size_t end,
    const score_t* ordered_gradients, const score_t* ordered_hessians,
    hist_t* out) const = 0;

  virtual void ConstructHistogramInt8(data_size_t start, data_size_t end,
    const score_t* ordered_gradients, const score_t* ordered_hessians,
    hist_t* out) const = 0;

  virtual void ConstructHistogramInt16(
    const data_size_t* data_indices, data_size_t start, data_size_t end,
    const score_t* ordered_gradients, const score_t* ordered_hessians,
    hist_t* out) const = 0;

  virtual void ConstructHistogramInt16(data_size_t start, data_size_t end,
    const score_t* ordered_gradients, const score_t* ordered_hessians,
    hist_t* out) const = 0;

  virtual void ConstructHistogramInt32(
    const data_size_t* data_indices, data_size_t start, data_size_t end,
    const score_t* ordered_gradients, const score_t* ordered_hessians,
    hist_t* out) const = 0;

  virtual void ConstructHistogramInt32(data_size_t start, data_size_t end,
    const score_t* ordered_gradients, const score_t* ordered_hessians,
    hist_t* out) const = 0;

387
388
389
390
391
392
393
394
  /*!
  * \brief Construct histogram of this feature,
  *        Note: We use ordered_gradients and ordered_hessians to improve cache hit chance
  *        The naive solution is using gradients[data_indices[i]] for data_indices[i] to get gradients,
  which is not cache friendly, since the access of memory is not continuous.
  *        ordered_gradients and ordered_hessians are preprocessed, and they are re-ordered by data_indices.
  *        Ordered_gradients[i] is aligned with data_indices[i]'s gradients (same for ordered_hessians).
  * \param data_indices Used data indices in current leaf
395
396
  * \param start start index in data_indices
  * \param end end index in data_indices
397
398
399
  * \param ordered_gradients Pointer to gradients, the data_indices[i]-th data's gradient is ordered_gradients[i]
  * \param out Output Result
  */
400
  virtual void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
401
                                  const score_t* ordered_gradients, hist_t* out) const = 0;
402

403
  virtual void ConstructHistogram(data_size_t start, data_size_t end,
404
                                  const score_t* ordered_gradients, hist_t* out) const = 0;
405

406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
  virtual void ConstructHistogramInt8(const data_size_t* data_indices, data_size_t start, data_size_t end,
                                       const score_t* ordered_gradients, hist_t* out) const = 0;

  virtual void ConstructHistogramInt8(data_size_t start, data_size_t end,
                                       const score_t* ordered_gradients, hist_t* out) const = 0;

  virtual void ConstructHistogramInt16(const data_size_t* data_indices, data_size_t start, data_size_t end,
                                       const score_t* ordered_gradients, hist_t* out) const = 0;

  virtual void ConstructHistogramInt16(data_size_t start, data_size_t end,
                                       const score_t* ordered_gradients, hist_t* out) const = 0;

  virtual void ConstructHistogramInt32(const data_size_t* data_indices, data_size_t start, data_size_t end,
                                       const score_t* ordered_gradients, hist_t* out) const = 0;

  virtual void ConstructHistogramInt32(data_size_t start, data_size_t end,
                                       const score_t* ordered_gradients, hist_t* out) const = 0;

424
  virtual data_size_t Split(uint32_t min_bin, uint32_t max_bin,
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
                            uint32_t default_bin, uint32_t most_freq_bin,
                            MissingType missing_type, bool default_left,
                            uint32_t threshold, const data_size_t* data_indices,
                            data_size_t cnt,
                            data_size_t* lte_indices,
                            data_size_t* gt_indices) const = 0;

  virtual data_size_t SplitCategorical(
      uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin,
      const uint32_t* threshold, int num_threshold,
      const data_size_t* data_indices, data_size_t cnt,
      data_size_t* lte_indices, data_size_t* gt_indices) const = 0;

  virtual data_size_t Split(uint32_t max_bin, uint32_t default_bin,
                            uint32_t most_freq_bin, MissingType missing_type,
                            bool default_left, uint32_t threshold,
                            const data_size_t* data_indices, data_size_t cnt,
                            data_size_t* lte_indices,
                            data_size_t* gt_indices) const = 0;

  virtual data_size_t SplitCategorical(
      uint32_t max_bin, uint32_t most_freq_bin, const uint32_t* threshold,
      int num_threshold, const data_size_t* data_indices, data_size_t cnt,
      data_size_t* lte_indices, data_size_t* gt_indices) const = 0;

450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
  /*!
  * \brief After pushed all feature data, call this could have better refactor for bin data
  */
  virtual void FinishLoad() = 0;

  /*!
  * \brief Create object for bin data of one feature, used for dense feature
  * \param num_data Total number of data
  * \param num_bin Number of bin
  * \return The bin data object
  */
  static Bin* CreateDenseBin(data_size_t num_data, int num_bin);

  /*!
  * \brief Create object for bin data of one feature, used for sparse feature
  * \param num_data Total number of data
  * \param num_bin Number of bin
  * \return The bin data object
  */
  static Bin* CreateSparseBin(data_size_t num_data, int num_bin);
470
471
472
473
474

  /*!
  * \brief Deep copy the bin
  */
  virtual Bin* Clone() = 0;
475
476
477
478

  virtual const void* GetColWiseData(uint8_t* bit_type, bool* is_sparse, std::vector<BinIterator*>* bin_iterator, const int num_threads) const = 0;

  virtual const void* GetColWiseData(uint8_t* bit_type, bool* is_sparse, BinIterator** bin_iterator) const = 0;
479
480
};

481
482

class MultiValBin {
483
 public:
484
485
486
487
488
489
  virtual ~MultiValBin() {}

  virtual data_size_t num_data() const = 0;

  virtual int32_t num_bin() const = 0;

490
  virtual double num_element_per_row() const = 0;
491

492
  virtual const std::vector<uint32_t>& offsets() const = 0;
493

494
  virtual void PushOneRow(int tid, data_size_t idx, const std::vector<uint32_t>& values) = 0;
495

496
497
498
  virtual void CopySubrow(const MultiValBin* full_bin,
                          const data_size_t* used_indices,
                          data_size_t num_used_indices) = 0;
499

500
501
  virtual MultiValBin* CreateLike(data_size_t num_data, int num_bin,
                                  int num_feature,
502
503
                                  double estimate_element_per_row,
                                  const std::vector<uint32_t>& offsets) const = 0;
504

505
506
507
508
509
510
511
  virtual void CopySubcol(const MultiValBin* full_bin,
                          const std::vector<int>& used_feature_index,
                          const std::vector<uint32_t>& lower,
                          const std::vector<uint32_t>& upper,
                          const std::vector<uint32_t>& delta) = 0;

  virtual void ReSize(data_size_t num_data, int num_bin, int num_feature,
512
                      double estimate_element_per_row, const std::vector<uint32_t>& offsets) = 0;
513
514
515
516
517
518

  virtual void CopySubrowAndSubcol(
      const MultiValBin* full_bin, const data_size_t* used_indices,
      data_size_t num_used_indices, const std::vector<int>& used_feature_index,
      const std::vector<uint32_t>& lower, const std::vector<uint32_t>& upper,
      const std::vector<uint32_t>& delta) = 0;
519

Guolin Ke's avatar
Guolin Ke committed
520
521
522
523
524
  virtual void ConstructHistogram(const data_size_t* data_indices,
                                  data_size_t start, data_size_t end,
                                  const score_t* gradients,
                                  const score_t* hessians,
                                  hist_t* out) const = 0;
525
526

  virtual void ConstructHistogram(data_size_t start, data_size_t end,
Guolin Ke's avatar
Guolin Ke committed
527
528
529
                                  const score_t* gradients,
                                  const score_t* hessians,
                                  hist_t* out) const = 0;
530

Guolin Ke's avatar
Guolin Ke committed
531
532
533
534
535
536
  virtual void ConstructHistogramOrdered(const data_size_t* data_indices,
                                         data_size_t start, data_size_t end,
                                         const score_t* ordered_gradients,
                                         const score_t* ordered_hessians,
                                         hist_t* out) const = 0;

537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
  virtual void ConstructHistogramInt32(const data_size_t* data_indices,
                                       data_size_t start, data_size_t end,
                                       const score_t* gradients,
                                       const score_t* hessians,
                                       hist_t* out) const = 0;

  virtual void ConstructHistogramInt32(data_size_t start, data_size_t end,
                                       const score_t* gradients,
                                       const score_t* hessians,
                                       hist_t* out) const = 0;

  virtual void ConstructHistogramOrderedInt32(const data_size_t* data_indices,
                                              data_size_t start, data_size_t end,
                                              const score_t* ordered_gradients,
                                              const score_t* ordered_hessians,
                                              hist_t* out) const = 0;

  virtual void ConstructHistogramInt16(const data_size_t* data_indices,
                                       data_size_t start, data_size_t end,
                                       const score_t* gradients,
                                       const score_t* hessians,
                                       hist_t* out) const = 0;

  virtual void ConstructHistogramInt16(data_size_t start, data_size_t end,
                                       const score_t* gradients,
                                       const score_t* hessians,
                                       hist_t* out) const = 0;

  virtual void ConstructHistogramOrderedInt16(const data_size_t* data_indices,
                                              data_size_t start, data_size_t end,
                                              const score_t* ordered_gradients,
                                              const score_t* ordered_hessians,
                                              hist_t* out) const = 0;

  virtual void ConstructHistogramInt8(const data_size_t* data_indices,
                                      data_size_t start, data_size_t end,
                                      const score_t* gradients,
                                      const score_t* hessians,
                                      hist_t* out) const = 0;

  virtual void ConstructHistogramInt8(data_size_t start, data_size_t end,
                                      const score_t* gradients,
                                      const score_t* hessians,
                                      hist_t* out) const = 0;

  virtual void ConstructHistogramOrderedInt8(const data_size_t* data_indices,
                                             data_size_t start, data_size_t end,
                                             const score_t* ordered_gradients,
                                             const score_t* ordered_hessians,
                                             hist_t* out) const = 0;

588
589
590
591
  virtual void FinishLoad() = 0;

  virtual bool IsSparse() = 0;

592
  static MultiValBin* CreateMultiValBin(data_size_t num_data, int num_bin,
593
                                        int num_feature, double sparse_rate, const std::vector<uint32_t>& offsets);
594
595

  static MultiValBin* CreateMultiValDenseBin(data_size_t num_data, int num_bin,
596
                                             int num_feature, const std::vector<uint32_t>& offsets);
597
598

  static MultiValBin* CreateMultiValSparseBin(data_size_t num_data, int num_bin, double estimate_element_per_row);
599

600
601
  static constexpr double multi_val_bin_sparse_threshold = 0.25f;

602
  virtual MultiValBin* Clone() = 0;
603

604
  #ifdef USE_CUDA
605
606
607
608
609
  virtual const void* GetRowWiseData(uint8_t* bit_type,
    size_t* total_size,
    bool* is_sparse,
    const void** out_data_ptr,
    uint8_t* data_ptr_bit_type) const = 0;
610
  #endif  // USE_CUDA
611
612
};

613
614
inline uint32_t BinMapper::ValueToBin(double value) const {
  if (std::isnan(value)) {
615
616
617
    if (bin_type_ == BinType::CategoricalBin) {
      return 0;
    } else if (missing_type_ == MissingType::NaN) {
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
      return num_bin_ - 1;
    } else {
      value = 0.0f;
    }
  }
  if (bin_type_ == BinType::NumericalBin) {
    // binary search to find bin
    int l = 0;
    int r = num_bin_ - 1;
    if (missing_type_ == MissingType::NaN) {
      r -= 1;
    }
    while (l < r) {
      int m = (r + l - 1) / 2;
      if (value <= bin_upper_bound_[m]) {
        r = m;
634
      } else {
635
        l = m + 1;
636
637
      }
    }
638
639
640
641
642
    return l;
  } else {
    int int_value = static_cast<int>(value);
    // convert negative value to NaN bin
    if (int_value < 0) {
643
      return 0;
644
645
646
    }
    if (categorical_2_bin_.count(int_value)) {
      return categorical_2_bin_.at(int_value);
Guolin Ke's avatar
Guolin Ke committed
647
    } else {
648
      return 0;
Guolin Ke's avatar
Guolin Ke committed
649
650
    }
  }
651
}

}  // namespace LightGBM

#endif   // LIGHTGBM_BIN_H_