/*!
 * Copyright (c) 2017 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for
 * license information.
 */
#ifndef LIGHTGBM_FEATURE_GROUP_H_
#define LIGHTGBM_FEATURE_GROUP_H_

#include <LightGBM/bin.h>
#include <LightGBM/meta.h>
#include <LightGBM/utils/random.h>

#include <cstdio>
#include <memory>
#include <vector>

namespace LightGBM {

class Dataset;
class DatasetLoader;
Guolin Ke's avatar
Guolin Ke committed
21
struct TrainingShareStates;
22
class MultiValBinWrapper;
Guolin Ke's avatar
Guolin Ke committed
23
24
/*! \brief Using to store data and providing some operations on one feature
 * group*/
Guolin Ke's avatar
Guolin Ke committed
25
class FeatureGroup {
Nikita Titov's avatar
Nikita Titov committed
26
 public:
Guolin Ke's avatar
Guolin Ke committed
27
28
  friend Dataset;
  friend DatasetLoader;
29
30
  friend TrainingShareStates;
  friend MultiValBinWrapper;
Guolin Ke's avatar
Guolin Ke committed
31
32
33
34
35
36
37
  /*!
  * \brief Constructor
  * \param num_feature number of features of this group
  * \param bin_mappers Bin mapper for features
  * \param num_data Total number of data
  * \param is_enable_sparse True if enable sparse feature
  */
Guolin Ke's avatar
Guolin Ke committed
38
  FeatureGroup(int num_feature, int8_t is_multi_val,
Guolin Ke's avatar
Guolin Ke committed
39
    std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
40
41
    data_size_t num_data, int group_id) :
    num_feature_(num_feature), is_multi_val_(is_multi_val > 0), is_sparse_(false) {
Nikita Titov's avatar
Nikita Titov committed
42
    CHECK_EQ(static_cast<int>(bin_mappers->size()), num_feature);
Guolin Ke's avatar
Guolin Ke committed
43
    auto& ref_bin_mappers = *bin_mappers;
44
    double sum_sparse_rate = 0.0f;
Guolin Ke's avatar
Guolin Ke committed
45
    for (int i = 0; i < num_feature_; ++i) {
Guolin Ke's avatar
Guolin Ke committed
46
      bin_mappers_.emplace_back(ref_bin_mappers[i].release());
47
48
49
50
51
52
53
54
55
56
57
58
      sum_sparse_rate += bin_mappers_.back()->sparse_rate();
    }
    sum_sparse_rate /= num_feature_;
    int offset = 1;
    is_dense_multi_val_ = false;
    if (sum_sparse_rate < MultiValBin::multi_val_bin_sparse_threshold && is_multi_val_) {
      // use dense multi val bin
      offset = 0;
      is_dense_multi_val_ = true;
    }
    // use bin at zero to store most_freq_bin only when not using dense multi val bin
    num_total_bin_ = offset;
59
60
61
62
63
64
    // however, we should force to leave one bin, if dense multi val bin is the first bin
    // and its first feature has most freq bin > 0
    if (group_id == 0 && num_feature_ > 0 && is_dense_multi_val_ &&
      bin_mappers_[0]->GetMostFreqBin() > 0) {
      num_total_bin_ = 1;
    }
65
66
    bin_offsets_.emplace_back(num_total_bin_);
    for (int i = 0; i < num_feature_; ++i) {
Guolin Ke's avatar
Guolin Ke committed
67
      auto num_bin = bin_mappers_[i]->num_bin();
Guolin Ke's avatar
Guolin Ke committed
68
      if (bin_mappers_[i]->GetMostFreqBin() == 0) {
69
        num_bin -= offset;
Guolin Ke's avatar
Guolin Ke committed
70
71
72
73
      }
      num_total_bin_ += num_bin;
      bin_offsets_.emplace_back(num_total_bin_);
    }
Guolin Ke's avatar
Guolin Ke committed
74
75
76
77
78
79
    CreateBinData(num_data, is_multi_val_, true, false);
  }

  FeatureGroup(const FeatureGroup& other, int num_data) {
    num_feature_ = other.num_feature_;
    is_multi_val_ = other.is_multi_val_;
80
    is_dense_multi_val_ = other.is_dense_multi_val_;
Guolin Ke's avatar
Guolin Ke committed
81
82
83
84
85
86
87
    is_sparse_ = other.is_sparse_;
    num_total_bin_ = other.num_total_bin_;
    bin_offsets_ = other.bin_offsets_;

    bin_mappers_.reserve(other.bin_mappers_.size());
    for (auto& bin_mapper : other.bin_mappers_) {
      bin_mappers_.emplace_back(new BinMapper(*bin_mapper));
88
    }
Guolin Ke's avatar
Guolin Ke committed
89
    CreateBinData(num_data, is_multi_val_, !is_sparse_, is_sparse_);
Guolin Ke's avatar
Guolin Ke committed
90
  }
Guolin Ke's avatar
Guolin Ke committed
91

92
93
  FeatureGroup(std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
    data_size_t num_data) : num_feature_(1), is_multi_val_(false) {
Nikita Titov's avatar
Nikita Titov committed
94
    CHECK_EQ(static_cast<int>(bin_mappers->size()), 1);
95
    // use bin at zero to store default_bin
Guolin Ke's avatar
Guolin Ke committed
96
    num_total_bin_ = 1;
97
    is_dense_multi_val_ = false;
Guolin Ke's avatar
Guolin Ke committed
98
    bin_offsets_.emplace_back(num_total_bin_);
Guolin Ke's avatar
Guolin Ke committed
99
    auto& ref_bin_mappers = *bin_mappers;
Guolin Ke's avatar
Guolin Ke committed
100
    for (int i = 0; i < num_feature_; ++i) {
Guolin Ke's avatar
Guolin Ke committed
101
      bin_mappers_.emplace_back(ref_bin_mappers[i].release());
Guolin Ke's avatar
Guolin Ke committed
102
      auto num_bin = bin_mappers_[i]->num_bin();
Guolin Ke's avatar
Guolin Ke committed
103
      if (bin_mappers_[i]->GetMostFreqBin() == 0) {
Guolin Ke's avatar
Guolin Ke committed
104
105
106
107
108
        num_bin -= 1;
      }
      num_total_bin_ += num_bin;
      bin_offsets_.emplace_back(num_total_bin_);
    }
Guolin Ke's avatar
Guolin Ke committed
109
    CreateBinData(num_data, false, false, false);
Guolin Ke's avatar
Guolin Ke committed
110
  }
111

Guolin Ke's avatar
Guolin Ke committed
112
  /*!
113
   * \brief Constructor from memory when data is present
Guolin Ke's avatar
Guolin Ke committed
114
115
116
   * \param memory Pointer of memory
   * \param num_all_data Number of global data
   * \param local_used_indices Local used indices, empty means using all data
117
   * \param group_id Id of group
Guolin Ke's avatar
Guolin Ke committed
118
   */
119
120
  FeatureGroup(const void* memory,
               data_size_t num_all_data,
121
122
               const std::vector<data_size_t>& local_used_indices,
               int group_id) {
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
    // Load the definition schema first
    const char* memory_ptr = LoadDefinitionFromMemory(memory, group_id);

    // Allocate memory for the data
    data_size_t num_data = num_all_data;
    if (!local_used_indices.empty()) {
      num_data = static_cast<data_size_t>(local_used_indices.size());
    }
    AllocateBins(num_data);

    // Now load the actual data
    if (is_multi_val_) {
      for (int i = 0; i < num_feature_; ++i) {
        multi_bin_data_[i]->LoadFromMemory(memory_ptr, local_used_indices);
        memory_ptr += multi_bin_data_[i]->SizesInByte();
      }
    } else {
      bin_data_->LoadFromMemory(memory_ptr, local_used_indices);
    }
  }

  /*!
   * \brief Constructor from definition in memory (without data)
   * \param memory Pointer of memory
   * \param local_used_indices Local used indices, empty means using all data
   */
  FeatureGroup(const void* memory, data_size_t num_data, int group_id) {
    LoadDefinitionFromMemory(memory, group_id);
    AllocateBins(num_data);
  }

  /*! \brief Destructor */
  ~FeatureGroup() {}

  /*!
   * \brief Load the overall definition of the feature group from binary serialized data
   * \param memory Pointer of memory
   * \param group_id Id of group
   */
  const char* LoadDefinitionFromMemory(const void* memory, int group_id) {
Guolin Ke's avatar
Guolin Ke committed
163
164
    const char* memory_ptr = reinterpret_cast<const char*>(memory);
    // get is_sparse
165
    is_multi_val_ = *(reinterpret_cast<const bool*>(memory_ptr));
166
    memory_ptr += VirtualFileWriter::AlignedSize(sizeof(is_multi_val_));
167
168
    is_dense_multi_val_ = *(reinterpret_cast<const bool*>(memory_ptr));
    memory_ptr += VirtualFileWriter::AlignedSize(sizeof(is_dense_multi_val_));
Guolin Ke's avatar
Guolin Ke committed
169
    is_sparse_ = *(reinterpret_cast<const bool*>(memory_ptr));
170
    memory_ptr += VirtualFileWriter::AlignedSize(sizeof(is_sparse_));
Guolin Ke's avatar
Guolin Ke committed
171
    num_feature_ = *(reinterpret_cast<const int*>(memory_ptr));
172
    memory_ptr += VirtualFileWriter::AlignedSize(sizeof(num_feature_));
173

174
175
    // get bin mapper(s)
    bin_mappers_.clear();
176
177
178
179
180
    for (int i = 0; i < num_feature_; ++i) {
      bin_mappers_.emplace_back(new BinMapper(memory_ptr));
      memory_ptr += bin_mappers_[i]->SizesInByte();
    }

Guolin Ke's avatar
Guolin Ke committed
181
    bin_offsets_.clear();
182
183
184
185
186
187
188
189
190
191
192
193
    int offset = 1;
    if (is_dense_multi_val_) {
      offset = 0;
    }
    // use bin at zero to store most_freq_bin only when not using dense multi val bin
    num_total_bin_ = offset;
    // however, we should force to leave one bin, if dense multi val bin is the first bin
    // and its first feature has most freq bin > 0
    if (group_id == 0 && num_feature_ > 0 && is_dense_multi_val_ &&
      bin_mappers_[0]->GetMostFreqBin() > 0) {
      num_total_bin_ = 1;
    }
Guolin Ke's avatar
Guolin Ke committed
194
195
196
    bin_offsets_.emplace_back(num_total_bin_);
    for (int i = 0; i < num_feature_; ++i) {
      auto num_bin = bin_mappers_[i]->num_bin();
Guolin Ke's avatar
Guolin Ke committed
197
      if (bin_mappers_[i]->GetMostFreqBin() == 0) {
198
        num_bin -= offset;
Guolin Ke's avatar
Guolin Ke committed
199
200
201
202
      }
      num_total_bin_ += num_bin;
      bin_offsets_.emplace_back(num_total_bin_);
    }
203
204
205
206
207
208
209
210
211

    return memory_ptr;
  }

  /*!
   * \brief Allocate the bins
   * \param num_all_data Number of global data
   */
  inline void AllocateBins(data_size_t num_data) {
212
213
214
215
    if (is_multi_val_) {
      for (int i = 0; i < num_feature_; ++i) {
        int addi = bin_mappers_[i]->GetMostFreqBin() == 0 ? 0 : 1;
        if (bin_mappers_[i]->sparse_rate() >= kSparseThreshold) {
216
          multi_bin_data_.emplace_back(Bin::CreateSparseBin(num_data, bin_mappers_[i]->num_bin() + addi));
217
        } else {
218
          multi_bin_data_.emplace_back(Bin::CreateDenseBin(num_data, bin_mappers_[i]->num_bin() + addi));
219
220
        }
      }
Guolin Ke's avatar
Guolin Ke committed
221
    } else {
222
223
224
225
226
      if (is_sparse_) {
        bin_data_.reset(Bin::CreateSparseBin(num_data, num_total_bin_));
      } else {
        bin_data_.reset(Bin::CreateDenseBin(num_data, num_total_bin_));
      }
Guolin Ke's avatar
Guolin Ke committed
227
228
    }
  }
229

230
231
232
  /*!
  * \brief Initialize for pushing in a streaming fashion.  By default, no action needed.
  * \param num_thread The number of external threads that will be calling the push APIs
233
  * \param omp_max_threads The maximum number of OpenMP threads to allocate for
234
  */
235
  void InitStreaming(int32_t num_thread, int32_t omp_max_threads) {
236
237
    if (is_multi_val_) {
      for (int i = 0; i < num_feature_; ++i) {
238
        multi_bin_data_[i]->InitStreaming(num_thread, omp_max_threads);
239
240
      }
    } else {
241
      bin_data_->InitStreaming(num_thread, omp_max_threads);
242
243
244
    }
  }

Guolin Ke's avatar
Guolin Ke committed
245
  /*!
Guolin Ke's avatar
Guolin Ke committed
246
247
   * \brief Push one record, will auto convert to bin and push to bin data
   * \param tid Thread id
248
249
   * \param sub_feature_idx Index of the subfeature
   * \param line_idx Index of record
Guolin Ke's avatar
Guolin Ke committed
250
251
   * \param value feature value of record
   */
252
  inline void PushData(int tid, int sub_feature_idx, data_size_t line_idx, double value) {
Guolin Ke's avatar
Guolin Ke committed
253
    uint32_t bin = bin_mappers_[sub_feature_idx]->ValueToBin(value);
Guolin Ke's avatar
Guolin Ke committed
254
255
256
    if (bin == bin_mappers_[sub_feature_idx]->GetMostFreqBin()) {
      return;
    }
Guolin Ke's avatar
Guolin Ke committed
257
    if (bin_mappers_[sub_feature_idx]->GetMostFreqBin() == 0) {
Guolin Ke's avatar
Guolin Ke committed
258
259
      bin -= 1;
    }
260
261
262
263
264
265
    if (is_multi_val_) {
      multi_bin_data_[sub_feature_idx]->Push(tid, line_idx, bin + 1);
    } else {
      bin += bin_offsets_[sub_feature_idx];
      bin_data_->Push(tid, line_idx, bin);
    }
Guolin Ke's avatar
Guolin Ke committed
266
267
  }

Guolin Ke's avatar
Guolin Ke committed
268
269
270
271
272
273
274
275
276
277
  void ReSize(int num_data) {
    if (!is_multi_val_) {
      bin_data_->ReSize(num_data);
    } else {
      for (int i = 0; i < num_feature_; ++i) {
        multi_bin_data_[i]->ReSize(num_data);
      }
    }
  }

278
  inline void CopySubrow(const FeatureGroup* full_feature, const data_size_t* used_indices, data_size_t num_used_indices) {
279
    if (!is_multi_val_) {
280
      bin_data_->CopySubrow(full_feature->bin_data_.get(), used_indices, num_used_indices);
281
282
    } else {
      for (int i = 0; i < num_feature_; ++i) {
283
        multi_bin_data_[i]->CopySubrow(full_feature->multi_bin_data_[i].get(), used_indices, num_used_indices);
284
285
      }
    }
Guolin Ke's avatar
Guolin Ke committed
286
287
  }

288
289
290
291
292
293
294
295
  inline void CopySubrowByCol(const FeatureGroup* full_feature, const data_size_t* used_indices, data_size_t num_used_indices, int fidx) {
    if (!is_multi_val_) {
      bin_data_->CopySubrow(full_feature->bin_data_.get(), used_indices, num_used_indices);
    } else {
      multi_bin_data_[fidx]->CopySubrow(full_feature->multi_bin_data_[fidx].get(), used_indices, num_used_indices);
    }
  }

296
  void AddFeaturesFrom(const FeatureGroup* other, int group_id) {
Guolin Ke's avatar
Guolin Ke committed
297
298
    CHECK(is_multi_val_);
    CHECK(other->is_multi_val_);
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
    // every time when new features are added, we need to reconsider sparse or dense
    double sum_sparse_rate = 0.0f;
    for (int i = 0; i < num_feature_; ++i) {
      sum_sparse_rate += bin_mappers_[i]->sparse_rate();
    }
    for (int i = 0; i < other->num_feature_; ++i) {
      sum_sparse_rate += other->bin_mappers_[i]->sparse_rate();
    }
    sum_sparse_rate /= (num_feature_ + other->num_feature_);
    int offset = 1;
    is_dense_multi_val_ = false;
    if (sum_sparse_rate < MultiValBin::multi_val_bin_sparse_threshold && is_multi_val_) {
      // use dense multi val bin
      offset = 0;
      is_dense_multi_val_ = true;
    }
    bin_offsets_.clear();
    num_total_bin_ = offset;
317
318
319
320
321
322
    // however, we should force to leave one bin, if dense multi val bin is the first bin
    // and its first feature has most freq bin > 0
    if (group_id == 0 && num_feature_ > 0 && is_dense_multi_val_ &&
      bin_mappers_[0]->GetMostFreqBin() > 0) {
      num_total_bin_ = 1;
    }
323
324
325
326
327
328
329
330
331
    bin_offsets_.emplace_back(num_total_bin_);
    for (int i = 0; i < num_feature_; ++i) {
      auto num_bin = bin_mappers_[i]->num_bin();
      if (bin_mappers_[i]->GetMostFreqBin() == 0) {
        num_bin -= offset;
      }
      num_total_bin_ += num_bin;
      bin_offsets_.emplace_back(num_total_bin_);
    }
Guolin Ke's avatar
Guolin Ke committed
332
333
334
335
336
    for (int i = 0; i < other->num_feature_; ++i) {
      const auto& other_bin_mapper = other->bin_mappers_[i];
      bin_mappers_.emplace_back(new BinMapper(*other_bin_mapper));
      auto num_bin = other_bin_mapper->num_bin();
      if (other_bin_mapper->GetMostFreqBin() == 0) {
337
        num_bin -= offset;
Guolin Ke's avatar
Guolin Ke committed
338
339
      }
      num_total_bin_ += num_bin;
340
      bin_offsets_.emplace_back(num_total_bin_);
Guolin Ke's avatar
Guolin Ke committed
341
342
343
344
345
      multi_bin_data_.emplace_back(other->multi_bin_data_[i]->Clone());
    }
    num_feature_ += other->num_feature_;
  }

zhangyafeikimi's avatar
zhangyafeikimi committed
346
  inline BinIterator* SubFeatureIterator(int sub_feature) {
Guolin Ke's avatar
Guolin Ke committed
347
    uint32_t most_freq_bin = bin_mappers_[sub_feature]->GetMostFreqBin();
348
349
350
351
352
353
354
355
    if (!is_multi_val_) {
      uint32_t min_bin = bin_offsets_[sub_feature];
      uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
      return bin_data_->GetIterator(min_bin, max_bin, most_freq_bin);
    } else {
      int addi = bin_mappers_[sub_feature]->GetMostFreqBin() == 0 ? 0 : 1;
      uint32_t min_bin = 1;
      uint32_t max_bin = bin_mappers_[sub_feature]->num_bin() - 1 + addi;
Guolin Ke's avatar
Guolin Ke committed
356
357
      return multi_bin_data_[sub_feature]->GetIterator(min_bin, max_bin,
                                                       most_freq_bin);
358
359
360
361
362
363
    }
  }

  inline void FinishLoad() {
    if (is_multi_val_) {
      OMP_INIT_EX();
Guolin Ke's avatar
Guolin Ke committed
364
#pragma omp parallel for schedule(guided)
365
366
367
368
369
370
371
372
373
      for (int i = 0; i < num_feature_; ++i) {
        OMP_LOOP_EX_BEGIN();
        multi_bin_data_[i]->FinishLoad();
        OMP_LOOP_EX_END();
      }
      OMP_THROW_EX();
    } else {
      bin_data_->FinishLoad();
    }
Guolin Ke's avatar
Guolin Ke committed
374
  }
375

376
  inline BinIterator* FeatureGroupIterator() {
377
378
379
    if (is_multi_val_) {
      return nullptr;
    }
380
381
    uint32_t min_bin = bin_offsets_[0];
    uint32_t max_bin = bin_offsets_.back() - 1;
Guolin Ke's avatar
Guolin Ke committed
382
383
    uint32_t most_freq_bin = 0;
    return bin_data_->GetIterator(min_bin, max_bin, most_freq_bin);
384
  }
Guolin Ke's avatar
Guolin Ke committed
385

386
387
388
389
390
391
392
393
394
395
396
  inline size_t FeatureGroupSizesInByte() {
    return bin_data_->SizesInByte();
  }

  inline void* FeatureGroupData() {
    if (is_multi_val_) {
      return nullptr;
    }
    return bin_data_->get_data();
  }

397
398
399
400
401
  inline data_size_t Split(int sub_feature, const uint32_t* threshold,
                           int num_threshold, bool default_left,
                           const data_size_t* data_indices, data_size_t cnt,
                           data_size_t* lte_indices,
                           data_size_t* gt_indices) const {
Guolin Ke's avatar
Guolin Ke committed
402
    uint32_t default_bin = bin_mappers_[sub_feature]->GetDefaultBin();
Guolin Ke's avatar
Guolin Ke committed
403
    uint32_t most_freq_bin = bin_mappers_[sub_feature]->GetMostFreqBin();
404
405
406
407
408
    if (!is_multi_val_) {
      uint32_t min_bin = bin_offsets_[sub_feature];
      uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
      if (bin_mappers_[sub_feature]->bin_type() == BinType::NumericalBin) {
        auto missing_type = bin_mappers_[sub_feature]->missing_type();
409
410
411
412
413
414
415
416
417
        if (num_feature_ == 1) {
          return bin_data_->Split(max_bin, default_bin, most_freq_bin,
                                  missing_type, default_left, *threshold,
                                  data_indices, cnt, lte_indices, gt_indices);
        } else {
          return bin_data_->Split(min_bin, max_bin, default_bin, most_freq_bin,
                                  missing_type, default_left, *threshold,
                                  data_indices, cnt, lte_indices, gt_indices);
        }
418
      } else {
419
420
421
422
423
424
425
426
427
        if (num_feature_ == 1) {
          return bin_data_->SplitCategorical(max_bin, most_freq_bin, threshold,
                                             num_threshold, data_indices, cnt,
                                             lte_indices, gt_indices);
        } else {
          return bin_data_->SplitCategorical(
              min_bin, max_bin, most_freq_bin, threshold, num_threshold,
              data_indices, cnt, lte_indices, gt_indices);
        }
428
      }
429
    } else {
430
431
432
433
      int addi = bin_mappers_[sub_feature]->GetMostFreqBin() == 0 ? 0 : 1;
      uint32_t max_bin = bin_mappers_[sub_feature]->num_bin() - 1 + addi;
      if (bin_mappers_[sub_feature]->bin_type() == BinType::NumericalBin) {
        auto missing_type = bin_mappers_[sub_feature]->missing_type();
434
435
436
        return multi_bin_data_[sub_feature]->Split(
            max_bin, default_bin, most_freq_bin, missing_type, default_left,
            *threshold, data_indices, cnt, lte_indices, gt_indices);
437
      } else {
438
439
440
        return multi_bin_data_[sub_feature]->SplitCategorical(
            max_bin, most_freq_bin, threshold, num_threshold, data_indices, cnt,
            lte_indices, gt_indices);
441
      }
442
    }
Guolin Ke's avatar
Guolin Ke committed
443
  }
444

Guolin Ke's avatar
Guolin Ke committed
445
  /*!
Guolin Ke's avatar
Guolin Ke committed
446
447
448
449
   * \brief From bin to feature value
   * \param bin
   * \return FeatureGroup value of this bin
   */
Guolin Ke's avatar
Guolin Ke committed
450
451
452
453
454
  inline double BinToValue(int sub_feature_idx, uint32_t bin) const {
    return bin_mappers_[sub_feature_idx]->BinToValue(bin);
  }

  /*!
455
456
457
   * \brief Write to binary stream
   * \param writer Writer
   * \param include_data Whether to write data (true) or just header information (false)
Guolin Ke's avatar
Guolin Ke committed
458
   */
459
  void SerializeToBinary(BinaryWriter* writer, bool include_data = true) const {
460
    writer->AlignedWrite(&is_multi_val_, sizeof(is_multi_val_));
461
    writer->AlignedWrite(&is_dense_multi_val_, sizeof(is_dense_multi_val_));
462
463
    writer->AlignedWrite(&is_sparse_, sizeof(is_sparse_));
    writer->AlignedWrite(&num_feature_, sizeof(num_feature_));
Guolin Ke's avatar
Guolin Ke committed
464
    for (int i = 0; i < num_feature_; ++i) {
465
      bin_mappers_[i]->SaveBinaryToFile(writer);
Guolin Ke's avatar
Guolin Ke committed
466
    }
467
468
469
470
471
472
473
474

    if (include_data) {
      if (is_multi_val_) {
        for (int i = 0; i < num_feature_; ++i) {
          multi_bin_data_[i]->SaveBinaryToFile(writer);
        }
      } else {
        bin_data_->SaveBinaryToFile(writer);
475
476
      }
    }
Guolin Ke's avatar
Guolin Ke committed
477
  }
478

Guolin Ke's avatar
Guolin Ke committed
479
  /*!
Guolin Ke's avatar
Guolin Ke committed
480
481
   * \brief Get sizes in byte of this object
   */
482
  size_t SizesInByte(bool include_data = true) const {
483
    size_t ret = VirtualFileWriter::AlignedSize(sizeof(is_multi_val_)) +
484
                 VirtualFileWriter::AlignedSize(sizeof(is_dense_multi_val_)) +
485
486
                 VirtualFileWriter::AlignedSize(sizeof(is_sparse_)) +
                 VirtualFileWriter::AlignedSize(sizeof(num_feature_));
Guolin Ke's avatar
Guolin Ke committed
487
488
489
    for (int i = 0; i < num_feature_; ++i) {
      ret += bin_mappers_[i]->SizesInByte();
    }
490
491
492
493
494
495
496
    if (include_data) {
      if (!is_multi_val_) {
        ret += bin_data_->SizesInByte();
      } else {
        for (int i = 0; i < num_feature_; ++i) {
          ret += multi_bin_data_[i]->SizesInByte();
        }
497
498
      }
    }
Guolin Ke's avatar
Guolin Ke committed
499
500
    return ret;
  }
501

Guolin Ke's avatar
Guolin Ke committed
502
503
  /*! \brief Disable copy */
  FeatureGroup& operator=(const FeatureGroup&) = delete;
504

505
  /*! \brief Deep copy */
506
507
  FeatureGroup(const FeatureGroup& other, bool should_handle_dense_mv,
    int group_id) {
508
    num_feature_ = other.num_feature_;
509
    is_multi_val_ = other.is_multi_val_;
510
    is_dense_multi_val_ = other.is_dense_multi_val_;
511
512
513
514
515
    is_sparse_ = other.is_sparse_;
    num_total_bin_ = other.num_total_bin_;
    bin_offsets_ = other.bin_offsets_;

    bin_mappers_.reserve(other.bin_mappers_.size());
516
    for (auto& bin_mapper : other.bin_mappers_) {
517
518
      bin_mappers_.emplace_back(new BinMapper(*bin_mapper));
    }
519
520
521
522
523
524
525
526
    if (!is_multi_val_) {
      bin_data_.reset(other.bin_data_->Clone());
    } else {
      multi_bin_data_.clear();
      for (int i = 0; i < num_feature_; ++i) {
        multi_bin_data_.emplace_back(other.multi_bin_data_[i]->Clone());
      }
    }
527
528
529
530
531
532
533
534
535
536
537

    if (should_handle_dense_mv && is_dense_multi_val_ && group_id > 0) {
      // this feature group was the first feature group, but now no longer is,
      // so we need to eliminate its special empty bin for multi val dense bin
      if (bin_mappers_[0]->GetMostFreqBin() > 0 && bin_offsets_[0] == 1) {
        for (size_t i = 0; i < bin_offsets_.size(); ++i) {
          bin_offsets_[i] -= 1;
        }
        num_total_bin_ -= 1;
      }
    }
538
  }
Guolin Ke's avatar
Guolin Ke committed
539

540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
  const void* GetColWiseData(const int sub_feature_index,
    uint8_t* bit_type,
    bool* is_sparse,
    std::vector<BinIterator*>* bin_iterator,
    const int num_threads) const {
    if (sub_feature_index >= 0) {
      CHECK(is_multi_val_);
      return multi_bin_data_[sub_feature_index]->GetColWiseData(bit_type, is_sparse, bin_iterator, num_threads);
    } else {
      CHECK(!is_multi_val_);
      return bin_data_->GetColWiseData(bit_type, is_sparse, bin_iterator, num_threads);
    }
  }

  const void* GetColWiseData(const int sub_feature_index,
    uint8_t* bit_type,
    bool* is_sparse,
    BinIterator** bin_iterator) const {
    if (sub_feature_index >= 0) {
      CHECK(is_multi_val_);
      return multi_bin_data_[sub_feature_index]->GetColWiseData(bit_type, is_sparse, bin_iterator);
    } else {
      CHECK(!is_multi_val_);
      return bin_data_->GetColWiseData(bit_type, is_sparse, bin_iterator);
    }
  }

  uint32_t feature_max_bin(const int sub_feature_index) {
    if (!is_multi_val_) {
      return bin_offsets_[sub_feature_index + 1] - 1;
    } else {
      int addi = bin_mappers_[sub_feature_index]->GetMostFreqBin() == 0 ? 0 : 1;
      return bin_mappers_[sub_feature_index]->num_bin() - 1 + addi;
    }
  }

  uint32_t feature_min_bin(const int sub_feature_index) {
    if (!is_multi_val_) {
      return bin_offsets_[sub_feature_index];
    } else {
      return 1;
    }
  }

Nikita Titov's avatar
Nikita Titov committed
584
 private:
Guolin Ke's avatar
Guolin Ke committed
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
  void CreateBinData(int num_data, bool is_multi_val, bool force_dense, bool force_sparse) {
    if (is_multi_val) {
      multi_bin_data_.clear();
      for (int i = 0; i < num_feature_; ++i) {
        int addi = bin_mappers_[i]->GetMostFreqBin() == 0 ? 0 : 1;
        if (bin_mappers_[i]->sparse_rate() >= kSparseThreshold) {
          multi_bin_data_.emplace_back(Bin::CreateSparseBin(
              num_data, bin_mappers_[i]->num_bin() + addi));
        } else {
          multi_bin_data_.emplace_back(
              Bin::CreateDenseBin(num_data, bin_mappers_[i]->num_bin() + addi));
        }
      }
      is_multi_val_ = true;
    } else {
Guolin Ke's avatar
Guolin Ke committed
600
601
602
      if (force_sparse ||
          (!force_dense && num_feature_ == 1 &&
           bin_mappers_[0]->sparse_rate() >= kSparseThreshold)) {
Guolin Ke's avatar
Guolin Ke committed
603
604
605
606
607
608
609
610
611
612
        is_sparse_ = true;
        bin_data_.reset(Bin::CreateSparseBin(num_data, num_total_bin_));
      } else {
        is_sparse_ = false;
        bin_data_.reset(Bin::CreateDenseBin(num_data, num_total_bin_));
      }
      is_multi_val_ = false;
    }
  }

Guolin Ke's avatar
Guolin Ke committed
613
614
615
616
617
618
619
620
  /*! \brief Number of features */
  int num_feature_;
  /*! \brief Bin mapper for sub features */
  std::vector<std::unique_ptr<BinMapper>> bin_mappers_;
  /*! \brief Bin offsets for sub features */
  std::vector<uint32_t> bin_offsets_;
  /*! \brief Bin data of this feature */
  std::unique_ptr<Bin> bin_data_;
621
  std::vector<std::unique_ptr<Bin>> multi_bin_data_;
Guolin Ke's avatar
Guolin Ke committed
622
  /*! \brief True if this feature is sparse */
623
  bool is_multi_val_;
624
  bool is_dense_multi_val_;
Guolin Ke's avatar
Guolin Ke committed
625
626
627
628
629
630
  bool is_sparse_;
  int num_total_bin_;
};

}  // namespace LightGBM

#endif  // LIGHTGBM_FEATURE_GROUP_H_