feature_group.h 22.3 KB
Newer Older
1
2
/*!
 * Copyright (c) 2017 Microsoft Corporation. All rights reserved.
Guolin Ke's avatar
Guolin Ke committed
3
4
 * Licensed under the MIT License. See LICENSE file in the project root for
 * license information.
5
 */
Guolin Ke's avatar
Guolin Ke committed
6
7
8
#ifndef LIGHTGBM_FEATURE_GROUP_H_
#define LIGHTGBM_FEATURE_GROUP_H_

9
10
11
12
#include <LightGBM/bin.h>
#include <LightGBM/meta.h>
#include <LightGBM/utils/random.h>

13
#include <cstdint>
14
15
16
17
#include <cstdio>
#include <memory>
#include <vector>

Guolin Ke's avatar
Guolin Ke committed
18
19
20
21
namespace LightGBM {

class Dataset;
class DatasetLoader;
Guolin Ke's avatar
Guolin Ke committed
22
struct TrainingShareStates;
23
class MultiValBinWrapper;
Guolin Ke's avatar
Guolin Ke committed
24
25
/*! \brief Using to store data and providing some operations on one feature
 * group*/
Guolin Ke's avatar
Guolin Ke committed
26
class FeatureGroup {
Nikita Titov's avatar
Nikita Titov committed
27
 public:
Guolin Ke's avatar
Guolin Ke committed
28
29
  friend Dataset;
  friend DatasetLoader;
30
31
  friend TrainingShareStates;
  friend MultiValBinWrapper;
Guolin Ke's avatar
Guolin Ke committed
32
33
34
35
36
37
38
  /*!
  * \brief Constructor
  * \param num_feature number of features of this group
  * \param is_multi_val >0 if this group stores one bin per feature (multi-value group)
  * \param bin_mappers Bin mapper for features; ownership of each mapper is taken (released)
  * \param num_data Total number of data
  * \param group_id Id of group; group 0 may need a reserved bin (see below)
  */
  FeatureGroup(int num_feature, int8_t is_multi_val,
    std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
    data_size_t num_data, int group_id) :
    num_feature_(num_feature), is_multi_val_(is_multi_val > 0), is_sparse_(false) {
    CHECK_EQ(static_cast<int>(bin_mappers->size()), num_feature);
    auto& ref_bin_mappers = *bin_mappers;
    double sum_sparse_rate = 0.0f;
    // Take ownership of the mappers and accumulate sparsity to decide the
    // multi-value representation below.
    for (int i = 0; i < num_feature_; ++i) {
      bin_mappers_.emplace_back(ref_bin_mappers[i].release());
      sum_sparse_rate += bin_mappers_.back()->sparse_rate();
    }
    sum_sparse_rate /= num_feature_;
    int offset = 1;
    is_dense_multi_val_ = false;
    if (sum_sparse_rate < MultiValBin::multi_val_bin_sparse_threshold && is_multi_val_) {
      // use dense multi val bin
      offset = 0;
      is_dense_multi_val_ = true;
    }
    // use bin at zero to store most_freq_bin only when not using dense multi val bin
    num_total_bin_ = offset;
    // however, we should force to leave one bin, if dense multi val bin is the first bin
    // and its first feature has most freq bin > 0
    if (group_id == 0 && num_feature_ > 0 && is_dense_multi_val_ &&
      bin_mappers_[0]->GetMostFreqBin() > 0) {
      num_total_bin_ = 1;
    }
    // Build cumulative per-feature bin offsets; a feature whose most frequent
    // bin is 0 shares the reserved zero bin, so `offset` is subtracted.
    bin_offsets_.emplace_back(num_total_bin_);
    for (int i = 0; i < num_feature_; ++i) {
      auto num_bin = bin_mappers_[i]->num_bin();
      if (bin_mappers_[i]->GetMostFreqBin() == 0) {
        num_bin -= offset;
      }
      num_total_bin_ += num_bin;
      bin_offsets_.emplace_back(num_total_bin_);
    }
    CreateBinData(num_data, is_multi_val_, true, false);
  }

  FeatureGroup(const FeatureGroup& other, int num_data) {
    num_feature_ = other.num_feature_;
    is_multi_val_ = other.is_multi_val_;
81
    is_dense_multi_val_ = other.is_dense_multi_val_;
Guolin Ke's avatar
Guolin Ke committed
82
83
84
85
86
87
88
    is_sparse_ = other.is_sparse_;
    num_total_bin_ = other.num_total_bin_;
    bin_offsets_ = other.bin_offsets_;

    bin_mappers_.reserve(other.bin_mappers_.size());
    for (auto& bin_mapper : other.bin_mappers_) {
      bin_mappers_.emplace_back(new BinMapper(*bin_mapper));
89
    }
Guolin Ke's avatar
Guolin Ke committed
90
    CreateBinData(num_data, is_multi_val_, !is_sparse_, is_sparse_);
Guolin Ke's avatar
Guolin Ke committed
91
  }
Guolin Ke's avatar
Guolin Ke committed
92

93
94
  /*!
   * \brief Constructor for a single-feature, non-multi-value group
   * \param bin_mappers Must contain exactly one bin mapper; ownership is taken
   * \param num_data Total number of data
   */
  FeatureGroup(std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
    data_size_t num_data) : num_feature_(1), is_multi_val_(false) {
    CHECK_EQ(static_cast<int>(bin_mappers->size()), 1);
    // use bin at zero to store default_bin
    num_total_bin_ = 1;
    is_dense_multi_val_ = false;
    bin_offsets_.emplace_back(num_total_bin_);
    auto& ref_bin_mappers = *bin_mappers;
    for (int i = 0; i < num_feature_; ++i) {
      bin_mappers_.emplace_back(ref_bin_mappers[i].release());
      auto num_bin = bin_mappers_[i]->num_bin();
      // When the most frequent bin is 0 it shares the reserved zero bin.
      if (bin_mappers_[i]->GetMostFreqBin() == 0) {
        num_bin -= 1;
      }
      num_total_bin_ += num_bin;
      bin_offsets_.emplace_back(num_total_bin_);
    }
    CreateBinData(num_data, false, false, false);
  }
112

Guolin Ke's avatar
Guolin Ke committed
113
  /*!
   * \brief Constructor from memory when data is present
   * \param memory Pointer of memory
   * \param num_all_data Number of global data
   * \param local_used_indices Local used indices, empty means using all data
   * \param group_id Id of group
   */
  FeatureGroup(const void* memory,
               data_size_t num_all_data,
               const std::vector<data_size_t>& local_used_indices,
               int group_id) {
    // Load the definition schema first
    const char* memory_ptr = LoadDefinitionFromMemory(memory, group_id);

    // Allocate memory for the data; when a subset of rows is requested,
    // only that many rows are allocated.
    data_size_t num_data = num_all_data;
    if (!local_used_indices.empty()) {
      num_data = static_cast<data_size_t>(local_used_indices.size());
    }
    AllocateBins(num_data);

    // Now load the actual data; multi-value groups store one bin per feature
    // serialized back-to-back.
    if (is_multi_val_) {
      for (int i = 0; i < num_feature_; ++i) {
        multi_bin_data_[i]->LoadFromMemory(memory_ptr, local_used_indices);
        memory_ptr += multi_bin_data_[i]->SizesInByte();
      }
    } else {
      bin_data_->LoadFromMemory(memory_ptr, local_used_indices);
    }
  }

  /*!
   * \brief Constructor from definition in memory (without data)
   * \param memory Pointer of memory
   * \param num_data Number of data rows to allocate (empty) bins for
   * \param group_id Id of group
   */
  FeatureGroup(const void* memory, data_size_t num_data, int group_id) {
    LoadDefinitionFromMemory(memory, group_id);
    AllocateBins(num_data);
  }

  /*! \brief Destructor */
  ~FeatureGroup() {}

  /*!
   * \brief Load the overall definition of the feature group from binary serialized data
   * \param memory Pointer of memory
   * \param group_id Id of group
   * \return Pointer just past the definition, i.e. where the bin data (if any) starts
   */
  const char* LoadDefinitionFromMemory(const void* memory, int group_id) {
    const char* memory_ptr = reinterpret_cast<const char*>(memory);
    // Read the flags one aligned field at a time, in the same order that
    // SerializeToBinary writes them.
    is_multi_val_ = *(reinterpret_cast<const bool*>(memory_ptr));
    memory_ptr += VirtualFileWriter::AlignedSize(sizeof(is_multi_val_));
    is_dense_multi_val_ = *(reinterpret_cast<const bool*>(memory_ptr));
    memory_ptr += VirtualFileWriter::AlignedSize(sizeof(is_dense_multi_val_));
    is_sparse_ = *(reinterpret_cast<const bool*>(memory_ptr));
    memory_ptr += VirtualFileWriter::AlignedSize(sizeof(is_sparse_));
    num_feature_ = *(reinterpret_cast<const int*>(memory_ptr));
    memory_ptr += VirtualFileWriter::AlignedSize(sizeof(num_feature_));

    // get bin mapper(s)
    bin_mappers_.clear();
    for (int i = 0; i < num_feature_; ++i) {
      bin_mappers_.emplace_back(new BinMapper(memory_ptr));
      memory_ptr += bin_mappers_[i]->SizesInByte();
    }

    // Recompute bin offsets from the mappers (they are not serialized).
    bin_offsets_.clear();
    int offset = 1;
    if (is_dense_multi_val_) {
      offset = 0;
    }
    // use bin at zero to store most_freq_bin only when not using dense multi val bin
    num_total_bin_ = offset;
    // however, we should force to leave one bin, if dense multi val bin is the first bin
    // and its first feature has most freq bin > 0
    if (group_id == 0 && num_feature_ > 0 && is_dense_multi_val_ &&
      bin_mappers_[0]->GetMostFreqBin() > 0) {
      num_total_bin_ = 1;
    }
    bin_offsets_.emplace_back(num_total_bin_);
    for (int i = 0; i < num_feature_; ++i) {
      auto num_bin = bin_mappers_[i]->num_bin();
      if (bin_mappers_[i]->GetMostFreqBin() == 0) {
        num_bin -= offset;
      }
      num_total_bin_ += num_bin;
      bin_offsets_.emplace_back(num_total_bin_);
    }

    return memory_ptr;
  }

  /*!
   * \brief Allocate the bins described by the (already loaded) definition
   * \param num_data Number of data rows
   */
  inline void AllocateBins(data_size_t num_data) {
    if (is_multi_val_) {
      // One bin container per feature; reserve one extra bin when the most
      // frequent bin is non-zero.
      for (int i = 0; i < num_feature_; ++i) {
        const int addi = bin_mappers_[i]->GetMostFreqBin() == 0 ? 0 : 1;
        const auto n_bin = bin_mappers_[i]->num_bin() + addi;
        if (bin_mappers_[i]->sparse_rate() >= kSparseThreshold) {
          multi_bin_data_.emplace_back(Bin::CreateSparseBin(num_data, n_bin));
        } else {
          multi_bin_data_.emplace_back(Bin::CreateDenseBin(num_data, n_bin));
        }
      }
    } else if (is_sparse_) {
      bin_data_.reset(Bin::CreateSparseBin(num_data, num_total_bin_));
    } else {
      bin_data_.reset(Bin::CreateDenseBin(num_data, num_total_bin_));
    }
  }
230

231
232
233
  /*!
  * \brief Initialize for pushing in a streaming fashion.  By default, no action needed.
  * \param num_thread The number of external threads that will be calling the push APIs
234
  * \param omp_max_threads The maximum number of OpenMP threads to allocate for
235
  */
236
  void InitStreaming(int32_t num_thread, int32_t omp_max_threads) {
237
238
    if (is_multi_val_) {
      for (int i = 0; i < num_feature_; ++i) {
239
        multi_bin_data_[i]->InitStreaming(num_thread, omp_max_threads);
240
241
      }
    } else {
242
      bin_data_->InitStreaming(num_thread, omp_max_threads);
243
244
245
    }
  }

Guolin Ke's avatar
Guolin Ke committed
246
  /*!
   * \brief Push one record, will auto convert to bin and push to bin data
   * \param tid Thread id
   * \param sub_feature_idx Index of the subfeature
   * \param line_idx Index of record
   * \param value feature value of record
   */
  inline void PushData(int tid, int sub_feature_idx, data_size_t line_idx, double value) {
    uint32_t bin = bin_mappers_[sub_feature_idx]->ValueToBin(value);
    // The most frequent bin is implicit (never stored), so skip it.
    if (bin == bin_mappers_[sub_feature_idx]->GetMostFreqBin()) {
      return;
    }
    // When the most frequent bin is 0, stored bins are shifted down by one so
    // bin indices stay contiguous.
    if (bin_mappers_[sub_feature_idx]->GetMostFreqBin() == 0) {
      bin -= 1;
    }
    if (is_multi_val_) {
      // Per-feature bin: slot 0 is reserved, so data starts at 1.
      multi_bin_data_[sub_feature_idx]->Push(tid, line_idx, bin + 1);
    } else {
      // Shared bin: translate to the group-wide bin range of this sub-feature.
      bin += bin_offsets_[sub_feature_idx];
      bin_data_->Push(tid, line_idx, bin);
    }
  }

Guolin Ke's avatar
Guolin Ke committed
269
270
271
272
273
274
275
276
277
278
  /*! \brief Resize the bin storage of this group to hold `num_data` rows */
  void ReSize(int num_data) {
    if (is_multi_val_) {
      for (int feat = 0; feat < num_feature_; ++feat) {
        multi_bin_data_[feat]->ReSize(num_data);
      }
    } else {
      bin_data_->ReSize(num_data);
    }
  }

279
  inline void CopySubrow(const FeatureGroup* full_feature, const data_size_t* used_indices, data_size_t num_used_indices) {
280
    if (!is_multi_val_) {
281
      bin_data_->CopySubrow(full_feature->bin_data_.get(), used_indices, num_used_indices);
282
283
    } else {
      for (int i = 0; i < num_feature_; ++i) {
284
        multi_bin_data_[i]->CopySubrow(full_feature->multi_bin_data_[i].get(), used_indices, num_used_indices);
285
286
      }
    }
Guolin Ke's avatar
Guolin Ke committed
287
288
  }

289
290
291
292
293
294
295
296
  /*!
   * \brief Copy a subset of rows for a single column
   * \param full_feature Source group (must have matching layout)
   * \param used_indices Row indices to copy
   * \param num_used_indices Number of indices
   * \param fidx Sub-feature index (only used for multi-value groups)
   */
  inline void CopySubrowByCol(const FeatureGroup* full_feature, const data_size_t* used_indices, data_size_t num_used_indices, int fidx) {
    if (is_multi_val_) {
      multi_bin_data_[fidx]->CopySubrow(full_feature->multi_bin_data_[fidx].get(), used_indices, num_used_indices);
    } else {
      // Non multi-value groups share one bin, so the whole group is copied.
      bin_data_->CopySubrow(full_feature->bin_data_.get(), used_indices, num_used_indices);
    }
  }

297
  /*!
   * \brief Append all features of another multi-value group into this one,
   *        recomputing the sparse/dense representation and bin offsets
   * \param other Group to take features from (bin mappers and bins are cloned)
   * \param group_id Id of this group; group 0 may need a reserved bin
   */
  void AddFeaturesFrom(const FeatureGroup* other, int group_id) {
    CHECK(is_multi_val_);
    CHECK(other->is_multi_val_);
    // every time when new features are added, we need to reconsider sparse or dense
    double sum_sparse_rate = 0.0f;
    for (int i = 0; i < num_feature_; ++i) {
      sum_sparse_rate += bin_mappers_[i]->sparse_rate();
    }
    for (int i = 0; i < other->num_feature_; ++i) {
      sum_sparse_rate += other->bin_mappers_[i]->sparse_rate();
    }
    sum_sparse_rate /= (num_feature_ + other->num_feature_);
    int offset = 1;
    is_dense_multi_val_ = false;
    if (sum_sparse_rate < MultiValBin::multi_val_bin_sparse_threshold && is_multi_val_) {
      // use dense multi val bin
      offset = 0;
      is_dense_multi_val_ = true;
    }
    bin_offsets_.clear();
    num_total_bin_ = offset;
    // however, we should force to leave one bin, if dense multi val bin is the first bin
    // and its first feature has most freq bin > 0
    if (group_id == 0 && num_feature_ > 0 && is_dense_multi_val_ &&
      bin_mappers_[0]->GetMostFreqBin() > 0) {
      num_total_bin_ = 1;
    }
    // Rebuild offsets for the existing features with the (possibly new) offset.
    bin_offsets_.emplace_back(num_total_bin_);
    for (int i = 0; i < num_feature_; ++i) {
      auto num_bin = bin_mappers_[i]->num_bin();
      if (bin_mappers_[i]->GetMostFreqBin() == 0) {
        num_bin -= offset;
      }
      num_total_bin_ += num_bin;
      bin_offsets_.emplace_back(num_total_bin_);
    }
    // Clone and append the other group's mappers and per-feature bins.
    for (int i = 0; i < other->num_feature_; ++i) {
      const auto& other_bin_mapper = other->bin_mappers_[i];
      bin_mappers_.emplace_back(new BinMapper(*other_bin_mapper));
      auto num_bin = other_bin_mapper->num_bin();
      if (other_bin_mapper->GetMostFreqBin() == 0) {
        num_bin -= offset;
      }
      num_total_bin_ += num_bin;
      bin_offsets_.emplace_back(num_total_bin_);
      multi_bin_data_.emplace_back(other->multi_bin_data_[i]->Clone());
    }
    num_feature_ += other->num_feature_;
  }

zhangyafeikimi's avatar
zhangyafeikimi committed
347
  /*!
   * \brief Get an iterator over the bins of one sub-feature
   * \param sub_feature Index of the sub-feature within this group
   * \return Caller-owned iterator (allocated by the underlying bin)
   */
  inline BinIterator* SubFeatureIterator(int sub_feature) {
    uint32_t most_freq_bin = bin_mappers_[sub_feature]->GetMostFreqBin();
    if (!is_multi_val_) {
      // Shared bin: the sub-feature occupies the half-open offset range.
      uint32_t min_bin = bin_offsets_[sub_feature];
      uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
      return bin_data_->GetIterator(min_bin, max_bin, most_freq_bin);
    } else {
      // Per-feature bin: slot 0 is reserved, data starts at 1; one extra bin
      // exists when the most frequent bin is non-zero.
      int addi = bin_mappers_[sub_feature]->GetMostFreqBin() == 0 ? 0 : 1;
      uint32_t min_bin = 1;
      uint32_t max_bin = bin_mappers_[sub_feature]->num_bin() - 1 + addi;
      return multi_bin_data_[sub_feature]->GetIterator(min_bin, max_bin,
                                                       most_freq_bin);
    }
  }

  /*!
   * \brief Finish loading data into this group; for multi-value groups the
   *        per-feature bins are finalized in parallel with OpenMP.
   */
  inline void FinishLoad() {
    if (is_multi_val_) {
      OMP_INIT_EX();
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(guided)
      for (int i = 0; i < num_feature_; ++i) {
        OMP_LOOP_EX_BEGIN();
        multi_bin_data_[i]->FinishLoad();
        OMP_LOOP_EX_END();
      }
      // rethrows any exception captured inside the parallel loop
      OMP_THROW_EX();
    } else {
      bin_data_->FinishLoad();
    }
  }
376

377
  /*!
   * \brief Get an iterator over the whole group's shared bin
   * \return Iterator, or nullptr for multi-value groups (no shared bin exists)
   */
  inline BinIterator* FeatureGroupIterator() {
    if (is_multi_val_) {
      return nullptr;
    }
    const uint32_t min_bin = bin_offsets_[0];
    const uint32_t max_bin = bin_offsets_.back() - 1;
    const uint32_t most_freq_bin = 0;
    return bin_data_->GetIterator(min_bin, max_bin, most_freq_bin);
  }
Guolin Ke's avatar
Guolin Ke committed
386

387
388
389
390
391
392
393
394
395
396
397
  /*!
   * \brief Size in bytes of this group's shared bin data.
   *        Returns 0 for multi-value groups, which have no shared bin —
   *        mirrors FeatureGroupData(), which returns nullptr in that case.
   *        (Previously this dereferenced the null bin_data_ for multi-value
   *        groups.)
   */
  inline size_t FeatureGroupSizesInByte() {
    if (is_multi_val_) {
      return 0;
    }
    return bin_data_->SizesInByte();
  }

  /*!
   * \brief Raw pointer to this group's shared bin data.
   * \return Data pointer, or nullptr for multi-value groups (no shared bin)
   */
  inline void* FeatureGroupData() {
    if (is_multi_val_) {
      return nullptr;
    }
    return bin_data_->get_data();
  }

398
399
400
401
402
  /*!
   * \brief Partition rows by a split on one sub-feature of this group
   * \param sub_feature Index of the sub-feature within this group
   * \param threshold Threshold value; for categorical splits, a bitset of
   *        categories (num_threshold words)
   * \param num_threshold Size of the categorical threshold bitset
   * \param default_left Whether rows with missing values go left (numerical only)
   * \param data_indices Row indices to partition
   * \param cnt Number of rows
   * \param lte_indices Output buffer for rows on the <= side
   * \param gt_indices Output buffer for rows on the > side
   * \return Count returned by the underlying Bin::Split/SplitCategorical
   *         (presumably the number of rows on the <= side — per Bin's contract)
   */
  inline data_size_t Split(int sub_feature, const uint32_t* threshold,
                           int num_threshold, bool default_left,
                           const data_size_t* data_indices, data_size_t cnt,
                           data_size_t* lte_indices,
                           data_size_t* gt_indices) const {
    uint32_t default_bin = bin_mappers_[sub_feature]->GetDefaultBin();
    uint32_t most_freq_bin = bin_mappers_[sub_feature]->GetMostFreqBin();
    if (!is_multi_val_) {
      uint32_t min_bin = bin_offsets_[sub_feature];
      uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
      if (bin_mappers_[sub_feature]->bin_type() == BinType::NumericalBin) {
        auto missing_type = bin_mappers_[sub_feature]->missing_type();
        // Single-feature groups use the overload without min_bin.
        if (num_feature_ == 1) {
          return bin_data_->Split(max_bin, default_bin, most_freq_bin,
                                  missing_type, default_left, *threshold,
                                  data_indices, cnt, lte_indices, gt_indices);
        } else {
          return bin_data_->Split(min_bin, max_bin, default_bin, most_freq_bin,
                                  missing_type, default_left, *threshold,
                                  data_indices, cnt, lte_indices, gt_indices);
        }
      } else {
        if (num_feature_ == 1) {
          return bin_data_->SplitCategorical(max_bin, most_freq_bin, threshold,
                                             num_threshold, data_indices, cnt,
                                             lte_indices, gt_indices);
        } else {
          return bin_data_->SplitCategorical(
              min_bin, max_bin, most_freq_bin, threshold, num_threshold,
              data_indices, cnt, lte_indices, gt_indices);
        }
      }
    } else {
      // Multi-value group: the sub-feature has its own bin starting at 1.
      int addi = bin_mappers_[sub_feature]->GetMostFreqBin() == 0 ? 0 : 1;
      uint32_t max_bin = bin_mappers_[sub_feature]->num_bin() - 1 + addi;
      if (bin_mappers_[sub_feature]->bin_type() == BinType::NumericalBin) {
        auto missing_type = bin_mappers_[sub_feature]->missing_type();
        return multi_bin_data_[sub_feature]->Split(
            max_bin, default_bin, most_freq_bin, missing_type, default_left,
            *threshold, data_indices, cnt, lte_indices, gt_indices);
      } else {
        return multi_bin_data_[sub_feature]->SplitCategorical(
            max_bin, most_freq_bin, threshold, num_threshold, data_indices, cnt,
            lte_indices, gt_indices);
      }
    }
  }
445

Guolin Ke's avatar
Guolin Ke committed
446
  /*!
   * \brief From bin to feature value
   * \param sub_feature_idx Index of the sub-feature within this group
   * \param bin Bin index to convert
   * \return Feature value represented by this bin
   */
  inline double BinToValue(int sub_feature_idx, uint32_t bin) const {
    return bin_mappers_[sub_feature_idx]->BinToValue(bin);
  }

  /*!
456
457
458
   * \brief Write to binary stream
   * \param writer Writer
   * \param include_data Whether to write data (true) or just header information (false)
Guolin Ke's avatar
Guolin Ke committed
459
   */
460
  void SerializeToBinary(BinaryWriter* writer, bool include_data = true) const {
461
    writer->AlignedWrite(&is_multi_val_, sizeof(is_multi_val_));
462
    writer->AlignedWrite(&is_dense_multi_val_, sizeof(is_dense_multi_val_));
463
464
    writer->AlignedWrite(&is_sparse_, sizeof(is_sparse_));
    writer->AlignedWrite(&num_feature_, sizeof(num_feature_));
Guolin Ke's avatar
Guolin Ke committed
465
    for (int i = 0; i < num_feature_; ++i) {
466
      bin_mappers_[i]->SaveBinaryToFile(writer);
Guolin Ke's avatar
Guolin Ke committed
467
    }
468
469
470
471
472
473
474
475

    if (include_data) {
      if (is_multi_val_) {
        for (int i = 0; i < num_feature_; ++i) {
          multi_bin_data_[i]->SaveBinaryToFile(writer);
        }
      } else {
        bin_data_->SaveBinaryToFile(writer);
476
477
      }
    }
Guolin Ke's avatar
Guolin Ke committed
478
  }
479

Guolin Ke's avatar
Guolin Ke committed
480
  /*!
Guolin Ke's avatar
Guolin Ke committed
481
482
   * \brief Get sizes in byte of this object
   */
483
  size_t SizesInByte(bool include_data = true) const {
484
    size_t ret = VirtualFileWriter::AlignedSize(sizeof(is_multi_val_)) +
485
                 VirtualFileWriter::AlignedSize(sizeof(is_dense_multi_val_)) +
486
487
                 VirtualFileWriter::AlignedSize(sizeof(is_sparse_)) +
                 VirtualFileWriter::AlignedSize(sizeof(num_feature_));
Guolin Ke's avatar
Guolin Ke committed
488
489
490
    for (int i = 0; i < num_feature_; ++i) {
      ret += bin_mappers_[i]->SizesInByte();
    }
491
492
493
494
495
496
497
    if (include_data) {
      if (!is_multi_val_) {
        ret += bin_data_->SizesInByte();
      } else {
        for (int i = 0; i < num_feature_; ++i) {
          ret += multi_bin_data_[i]->SizesInByte();
        }
498
499
      }
    }
Guolin Ke's avatar
Guolin Ke committed
500
501
    return ret;
  }
502

Guolin Ke's avatar
Guolin Ke committed
503
504
  /*! \brief Disable copy */
  FeatureGroup& operator=(const FeatureGroup&) = delete;

  /*!
   * \brief Deep copy
   * \param other Group to copy (mappers and bin data are cloned)
   * \param should_handle_dense_mv Whether to fix up the reserved bin when the
   *        group is no longer the first group
   * \param group_id New id of this group
   */
  FeatureGroup(const FeatureGroup& other, bool should_handle_dense_mv,
    int group_id) {
    num_feature_ = other.num_feature_;
    is_multi_val_ = other.is_multi_val_;
    is_dense_multi_val_ = other.is_dense_multi_val_;
    is_sparse_ = other.is_sparse_;
    num_total_bin_ = other.num_total_bin_;
    bin_offsets_ = other.bin_offsets_;

    bin_mappers_.reserve(other.bin_mappers_.size());
    for (auto& bin_mapper : other.bin_mappers_) {
      bin_mappers_.emplace_back(new BinMapper(*bin_mapper));
    }
    // Clone the bin data so the two groups share no storage.
    if (!is_multi_val_) {
      bin_data_.reset(other.bin_data_->Clone());
    } else {
      multi_bin_data_.clear();
      for (int i = 0; i < num_feature_; ++i) {
        multi_bin_data_.emplace_back(other.multi_bin_data_[i]->Clone());
      }
    }

    if (should_handle_dense_mv && is_dense_multi_val_ && group_id > 0) {
      // this feature group was the first feature group, but now no longer is,
      // so we need to eliminate its special empty bin for multi val dense bin
      if (bin_mappers_[0]->GetMostFreqBin() > 0 && bin_offsets_[0] == 1) {
        for (size_t i = 0; i < bin_offsets_.size(); ++i) {
          bin_offsets_[i] -= 1;
        }
        num_total_bin_ -= 1;
      }
    }
  }
Guolin Ke's avatar
Guolin Ke committed
540

541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
  /*!
   * \brief Get raw column-wise data (multi-iterator variant)
   * \param sub_feature_index >= 0 selects a sub-feature of a multi-value
   *        group; negative selects the shared bin of a non-multi-value group
   * \param bit_type Output: bit width of the stored values
   * \param is_sparse Output: whether the underlying storage is sparse
   * \param bin_iterator Output: one iterator per thread
   * \param num_threads Number of iterators to create
   */
  const void* GetColWiseData(const int sub_feature_index,
    uint8_t* bit_type,
    bool* is_sparse,
    std::vector<BinIterator*>* bin_iterator,
    const int num_threads) const {
    if (sub_feature_index < 0) {
      CHECK(!is_multi_val_);
      return bin_data_->GetColWiseData(bit_type, is_sparse, bin_iterator, num_threads);
    }
    CHECK(is_multi_val_);
    return multi_bin_data_[sub_feature_index]->GetColWiseData(bit_type, is_sparse, bin_iterator, num_threads);
  }

  /*!
   * \brief Get raw column-wise data (single-iterator variant)
   * \param sub_feature_index >= 0 selects a sub-feature of a multi-value
   *        group; negative selects the shared bin of a non-multi-value group
   */
  const void* GetColWiseData(const int sub_feature_index,
    uint8_t* bit_type,
    bool* is_sparse,
    BinIterator** bin_iterator) const {
    if (sub_feature_index < 0) {
      CHECK(!is_multi_val_);
      return bin_data_->GetColWiseData(bit_type, is_sparse, bin_iterator);
    }
    CHECK(is_multi_val_);
    return multi_bin_data_[sub_feature_index]->GetColWiseData(bit_type, is_sparse, bin_iterator);
  }

  /*! \brief Largest bin index used by a sub-feature (inclusive) */
  uint32_t feature_max_bin(const int sub_feature_index) {
    if (is_multi_val_) {
      // extra bin exists when the most frequent bin is non-zero
      const int addi = bin_mappers_[sub_feature_index]->GetMostFreqBin() == 0 ? 0 : 1;
      return bin_mappers_[sub_feature_index]->num_bin() - 1 + addi;
    }
    return bin_offsets_[sub_feature_index + 1] - 1;
  }

  /*! \brief Smallest bin index used by a sub-feature */
  uint32_t feature_min_bin(const int sub_feature_index) {
    // per-feature bins reserve slot 0, so data starts at 1
    return is_multi_val_ ? 1 : bin_offsets_[sub_feature_index];
  }

Nikita Titov's avatar
Nikita Titov committed
585
 private:
Guolin Ke's avatar
Guolin Ke committed
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
  /*!
   * \brief Create the bin storage for this group
   * \param num_data Number of data rows
   * \param is_multi_val Whether to create one bin per feature
   * \param force_dense Force a dense shared bin (ignored for multi-value)
   * \param force_sparse Force a sparse shared bin (ignored for multi-value)
   */
  void CreateBinData(int num_data, bool is_multi_val, bool force_dense, bool force_sparse) {
    if (is_multi_val) {
      multi_bin_data_.clear();
      for (int i = 0; i < num_feature_; ++i) {
        // reserve one extra bin when the most frequent bin is non-zero
        int addi = bin_mappers_[i]->GetMostFreqBin() == 0 ? 0 : 1;
        if (bin_mappers_[i]->sparse_rate() >= kSparseThreshold) {
          multi_bin_data_.emplace_back(Bin::CreateSparseBin(
              num_data, bin_mappers_[i]->num_bin() + addi));
        } else {
          multi_bin_data_.emplace_back(
              Bin::CreateDenseBin(num_data, bin_mappers_[i]->num_bin() + addi));
        }
      }
      is_multi_val_ = true;
    } else {
      // Shared bin: sparse only when forced, or for a single sufficiently
      // sparse feature (and not forced dense).
      if (force_sparse ||
          (!force_dense && num_feature_ == 1 &&
           bin_mappers_[0]->sparse_rate() >= kSparseThreshold)) {
        is_sparse_ = true;
        bin_data_.reset(Bin::CreateSparseBin(num_data, num_total_bin_));
      } else {
        is_sparse_ = false;
        bin_data_.reset(Bin::CreateDenseBin(num_data, num_total_bin_));
      }
      is_multi_val_ = false;
    }
  }

Guolin Ke's avatar
Guolin Ke committed
614
615
616
617
618
619
620
621
  /*! \brief Number of features */
  int num_feature_;
  /*! \brief Bin mapper for sub features */
  std::vector<std::unique_ptr<BinMapper>> bin_mappers_;
  /*! \brief Bin offsets for sub features */
  std::vector<uint32_t> bin_offsets_;
  /*! \brief Bin data of this feature */
  std::unique_ptr<Bin> bin_data_;
  /*! \brief Per-feature bin data, used when is_multi_val_ is true */
  std::vector<std::unique_ptr<Bin>> multi_bin_data_;
  /*! \brief True if this group stores one bin per feature (multi-value group) */
  bool is_multi_val_;
  /*! \brief True if the multi-value representation is dense */
  bool is_dense_multi_val_;
  /*! \brief True if this feature is sparse */
  bool is_sparse_;
  /*! \brief Total number of bins in this group (including any reserved bin) */
  int num_total_bin_;
};

}  // namespace LightGBM

Guolin Ke's avatar
Guolin Ke committed
632
#endif  // LIGHTGBM_FEATURE_GROUP_H_