feature_group.h 19 KB
Newer Older
1
2
/*!
 * Copyright (c) 2017 Microsoft Corporation. All rights reserved.
Guolin Ke's avatar
Guolin Ke committed
3
4
 * Licensed under the MIT License. See LICENSE file in the project root for
 * license information.
5
 */
Guolin Ke's avatar
Guolin Ke committed
6
7
8
#ifndef LIGHTGBM_FEATURE_GROUP_H_
#define LIGHTGBM_FEATURE_GROUP_H_

9
10
11
12
#include <LightGBM/bin.h>
#include <LightGBM/meta.h>
#include <LightGBM/utils/random.h>

13
14
15
16
#include <cstdio>
#include <memory>
#include <vector>

Guolin Ke's avatar
Guolin Ke committed
17
18
19
20
namespace LightGBM {

class Dataset;
class DatasetLoader;
Guolin Ke's avatar
Guolin Ke committed
21
struct TrainingShareStates;
22
class MultiValBinWrapper;
Guolin Ke's avatar
Guolin Ke committed
23
24
/*! \brief Using to store data and providing some operations on one feature
 * group*/
Guolin Ke's avatar
Guolin Ke committed
25
class FeatureGroup {
Nikita Titov's avatar
Nikita Titov committed
26
 public:
Guolin Ke's avatar
Guolin Ke committed
27
28
  friend Dataset;
  friend DatasetLoader;
29
30
  friend TrainingShareStates;
  friend MultiValBinWrapper;
Guolin Ke's avatar
Guolin Ke committed
31
32
33
34
35
36
37
  /*!
  * \brief Constructor
  * \param num_feature number of features of this group
  * \param is_multi_val > 0 if this group stores one bin vector per feature
  *        (multi-value group) instead of a single fused bin vector
  * \param bin_mappers Bin mapper for features; ownership of each mapper is
  *        taken (released) from the input vector
  * \param num_data Total number of data
  * \param group_id Index of this group inside the dataset; group 0 gets
  *        special handling for dense multi-value bins
  */
  FeatureGroup(int num_feature, int8_t is_multi_val,
    std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
    data_size_t num_data, int group_id) :
    num_feature_(num_feature), is_multi_val_(is_multi_val > 0), is_sparse_(false) {
    CHECK_EQ(static_cast<int>(bin_mappers->size()), num_feature);
    auto& ref_bin_mappers = *bin_mappers;
    // take ownership of the mappers and accumulate the average sparse rate,
    // which decides dense vs. sparse storage for multi-value groups
    double sum_sparse_rate = 0.0f;
    for (int i = 0; i < num_feature_; ++i) {
      bin_mappers_.emplace_back(ref_bin_mappers[i].release());
      sum_sparse_rate += bin_mappers_.back()->sparse_rate();
    }
    sum_sparse_rate /= num_feature_;
    int offset = 1;
    is_dense_multi_val_ = false;
    if (sum_sparse_rate < MultiValBin::multi_val_bin_sparse_threshold && is_multi_val_) {
      // use dense multi val bin
      offset = 0;
      is_dense_multi_val_ = true;
    }
    // use bin at zero to store most_freq_bin only when not using dense multi val bin
    num_total_bin_ = offset;
    // however, we should force to leave one bin, if dense multi val bin is the first bin
    // and its first feature has most freq bin > 0
    if (group_id == 0 && num_feature_ > 0 && is_dense_multi_val_ &&
      bin_mappers_[0]->GetMostFreqBin() > 0) {
      num_total_bin_ = 1;
    }
    bin_offsets_.emplace_back(num_total_bin_);
    for (int i = 0; i < num_feature_; ++i) {
      auto num_bin = bin_mappers_[i]->num_bin();
      if (bin_mappers_[i]->GetMostFreqBin() == 0) {
        // most-frequent bin 0 is stored implicitly, so one bin fewer is needed
        num_bin -= offset;
      }
      num_total_bin_ += num_bin;
      bin_offsets_.emplace_back(num_total_bin_);
    }
    CreateBinData(num_data, is_multi_val_, true, false);
  }

  FeatureGroup(const FeatureGroup& other, int num_data) {
    num_feature_ = other.num_feature_;
    is_multi_val_ = other.is_multi_val_;
80
    is_dense_multi_val_ = other.is_dense_multi_val_;
Guolin Ke's avatar
Guolin Ke committed
81
82
83
84
85
86
87
    is_sparse_ = other.is_sparse_;
    num_total_bin_ = other.num_total_bin_;
    bin_offsets_ = other.bin_offsets_;

    bin_mappers_.reserve(other.bin_mappers_.size());
    for (auto& bin_mapper : other.bin_mappers_) {
      bin_mappers_.emplace_back(new BinMapper(*bin_mapper));
88
    }
Guolin Ke's avatar
Guolin Ke committed
89
    CreateBinData(num_data, is_multi_val_, !is_sparse_, is_sparse_);
Guolin Ke's avatar
Guolin Ke committed
90
  }
Guolin Ke's avatar
Guolin Ke committed
91

92
93
  /*!
   * \brief Constructor for a group holding exactly one feature
   * \param bin_mappers Bin mapper for the feature (must contain exactly one);
   *        ownership is taken (released) from the input vector
   * \param num_data Total number of data
   */
  FeatureGroup(std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
    data_size_t num_data) : num_feature_(1), is_multi_val_(false) {
    CHECK_EQ(static_cast<int>(bin_mappers->size()), 1);
    // use bin at zero to store default_bin
    num_total_bin_ = 1;
    is_dense_multi_val_ = false;
    bin_offsets_.emplace_back(num_total_bin_);
    auto& ref_bin_mappers = *bin_mappers;
    for (int i = 0; i < num_feature_; ++i) {
      bin_mappers_.emplace_back(ref_bin_mappers[i].release());
      auto num_bin = bin_mappers_[i]->num_bin();
      if (bin_mappers_[i]->GetMostFreqBin() == 0) {
        // most-frequent bin 0 is stored implicitly, so one bin fewer is needed
        num_bin -= 1;
      }
      num_total_bin_ += num_bin;
      bin_offsets_.emplace_back(num_total_bin_);
    }
    CreateBinData(num_data, false, false, false);
  }
111

Guolin Ke's avatar
Guolin Ke committed
112
  /*!
   * \brief Constructor from memory
   * \param memory Pointer of memory
   * \param num_all_data Number of global data
   * \param local_used_indices Local used indices, empty means using all data
   * \param group_id Index of this group inside the dataset; group 0 gets
   *        special handling for dense multi-value bins
   */
  FeatureGroup(const void* memory, data_size_t num_all_data,
               const std::vector<data_size_t>& local_used_indices,
               int group_id) {
    const char* memory_ptr = reinterpret_cast<const char*>(memory);
    // read the flags in the same order SaveBinaryToFile wrote them
    is_multi_val_ = *(reinterpret_cast<const bool*>(memory_ptr));
    memory_ptr += VirtualFileWriter::AlignedSize(sizeof(is_multi_val_));
    is_dense_multi_val_ = *(reinterpret_cast<const bool*>(memory_ptr));
    memory_ptr += VirtualFileWriter::AlignedSize(sizeof(is_dense_multi_val_));
    is_sparse_ = *(reinterpret_cast<const bool*>(memory_ptr));
    memory_ptr += VirtualFileWriter::AlignedSize(sizeof(is_sparse_));
    num_feature_ = *(reinterpret_cast<const int*>(memory_ptr));
    memory_ptr += VirtualFileWriter::AlignedSize(sizeof(num_feature_));
    // get bin mapper
    bin_mappers_.clear();
    for (int i = 0; i < num_feature_; ++i) {
      bin_mappers_.emplace_back(new BinMapper(memory_ptr));
      memory_ptr += bin_mappers_[i]->SizesInByte();
    }

    // rebuild bin offsets exactly as the in-memory constructor does
    bin_offsets_.clear();
    int offset = 1;
    if (is_dense_multi_val_) {
      offset = 0;
    }
    // use bin at zero to store most_freq_bin only when not using dense multi val bin
    num_total_bin_ = offset;
    // however, we should force to leave one bin, if dense multi val bin is the first bin
    // and its first feature has most freq bin > 0
    if (group_id == 0 && num_feature_ > 0 && is_dense_multi_val_ &&
      bin_mappers_[0]->GetMostFreqBin() > 0) {
      num_total_bin_ = 1;
    }
    bin_offsets_.emplace_back(num_total_bin_);
    for (int i = 0; i < num_feature_; ++i) {
      auto num_bin = bin_mappers_[i]->num_bin();
      if (bin_mappers_[i]->GetMostFreqBin() == 0) {
        num_bin -= offset;
      }
      num_total_bin_ += num_bin;
      bin_offsets_.emplace_back(num_total_bin_);
    }
    // when a subset of rows is requested, allocate only for that subset
    data_size_t num_data = num_all_data;
    if (!local_used_indices.empty()) {
      num_data = static_cast<data_size_t>(local_used_indices.size());
    }
    if (is_multi_val_) {
      for (int i = 0; i < num_feature_; ++i) {
        // reserve one extra bin when the most frequent bin is not zero
        int addi = bin_mappers_[i]->GetMostFreqBin() == 0 ? 0 : 1;
        if (bin_mappers_[i]->sparse_rate() >= kSparseThreshold) {
          multi_bin_data_.emplace_back(Bin::CreateSparseBin(
              num_data, bin_mappers_[i]->num_bin() + addi));
        } else {
          multi_bin_data_.emplace_back(
              Bin::CreateDenseBin(num_data, bin_mappers_[i]->num_bin() + addi));
        }
        multi_bin_data_.back()->LoadFromMemory(memory_ptr, local_used_indices);
        memory_ptr += multi_bin_data_.back()->SizesInByte();
      }
    } else {
      if (is_sparse_) {
        bin_data_.reset(Bin::CreateSparseBin(num_data, num_total_bin_));
      } else {
        bin_data_.reset(Bin::CreateDenseBin(num_data, num_total_bin_));
      }
      // get bin data
      bin_data_->LoadFromMemory(memory_ptr, local_used_indices);
    }
  }
188

Guolin Ke's avatar
Guolin Ke committed
189
  /*! \brief Destructor; members (smart pointers, vectors) release their own resources */
  ~FeatureGroup() {}
Guolin Ke's avatar
Guolin Ke committed
191
192

  /*!
Guolin Ke's avatar
Guolin Ke committed
193
194
195
196
197
198
199
   * \brief Push one record, will auto convert to bin and push to bin data
   * \param tid Thread id
   * \param idx Index of record
   * \param value feature value of record
   */
  inline void PushData(int tid, int sub_feature_idx, data_size_t line_idx,
                       double value) {
Guolin Ke's avatar
Guolin Ke committed
200
    uint32_t bin = bin_mappers_[sub_feature_idx]->ValueToBin(value);
Guolin Ke's avatar
Guolin Ke committed
201
202
203
    if (bin == bin_mappers_[sub_feature_idx]->GetMostFreqBin()) {
      return;
    }
Guolin Ke's avatar
Guolin Ke committed
204
    if (bin_mappers_[sub_feature_idx]->GetMostFreqBin() == 0) {
Guolin Ke's avatar
Guolin Ke committed
205
206
      bin -= 1;
    }
207
208
209
210
211
212
    if (is_multi_val_) {
      multi_bin_data_[sub_feature_idx]->Push(tid, line_idx, bin + 1);
    } else {
      bin += bin_offsets_[sub_feature_idx];
      bin_data_->Push(tid, line_idx, bin);
    }
Guolin Ke's avatar
Guolin Ke committed
213
214
  }

Guolin Ke's avatar
Guolin Ke committed
215
216
217
218
219
220
221
222
223
224
  /*! \brief Resize the underlying bin storage to hold num_data rows */
  void ReSize(int num_data) {
    if (is_multi_val_) {
      for (auto& bin : multi_bin_data_) {
        bin->ReSize(num_data);
      }
    } else {
      bin_data_->ReSize(num_data);
    }
  }

225
  inline void CopySubrow(const FeatureGroup* full_feature, const data_size_t* used_indices, data_size_t num_used_indices) {
226
    if (!is_multi_val_) {
227
      bin_data_->CopySubrow(full_feature->bin_data_.get(), used_indices, num_used_indices);
228
229
    } else {
      for (int i = 0; i < num_feature_; ++i) {
230
        multi_bin_data_[i]->CopySubrow(full_feature->multi_bin_data_[i].get(), used_indices, num_used_indices);
231
232
      }
    }
Guolin Ke's avatar
Guolin Ke committed
233
234
  }

235
236
237
238
239
240
241
242
  /*! \brief Copy the given rows for one feature column; for non-multi-val
   *  groups the whole fused bin is copied regardless of fidx */
  inline void CopySubrowByCol(const FeatureGroup* full_feature, const data_size_t* used_indices, data_size_t num_used_indices, int fidx) {
    if (is_multi_val_) {
      multi_bin_data_[fidx]->CopySubrow(full_feature->multi_bin_data_[fidx].get(), used_indices, num_used_indices);
    } else {
      bin_data_->CopySubrow(full_feature->bin_data_.get(), used_indices, num_used_indices);
    }
  }

243
  /*!
   * \brief Append the features of another multi-value group to this one.
   *        Both groups must be multi-value; the other group's mappers and
   *        bin data are deep-copied.
   * \param other Source group to take features from
   * \param group_id Index of this group inside the dataset
   */
  void AddFeaturesFrom(const FeatureGroup* other, int group_id) {
    CHECK(is_multi_val_);
    CHECK(other->is_multi_val_);
    // every time when new features are added, we need to reconsider sparse or dense
    double sum_sparse_rate = 0.0f;
    for (int i = 0; i < num_feature_; ++i) {
      sum_sparse_rate += bin_mappers_[i]->sparse_rate();
    }
    for (int i = 0; i < other->num_feature_; ++i) {
      sum_sparse_rate += other->bin_mappers_[i]->sparse_rate();
    }
    sum_sparse_rate /= (num_feature_ + other->num_feature_);
    int offset = 1;
    is_dense_multi_val_ = false;
    if (sum_sparse_rate < MultiValBin::multi_val_bin_sparse_threshold && is_multi_val_) {
      // use dense multi val bin
      offset = 0;
      is_dense_multi_val_ = true;
    }
    // rebuild all offsets from scratch with the (possibly new) offset
    bin_offsets_.clear();
    num_total_bin_ = offset;
    // however, we should force to leave one bin, if dense multi val bin is the first bin
    // and its first feature has most freq bin > 0
    if (group_id == 0 && num_feature_ > 0 && is_dense_multi_val_ &&
      bin_mappers_[0]->GetMostFreqBin() > 0) {
      num_total_bin_ = 1;
    }
    bin_offsets_.emplace_back(num_total_bin_);
    for (int i = 0; i < num_feature_; ++i) {
      auto num_bin = bin_mappers_[i]->num_bin();
      if (bin_mappers_[i]->GetMostFreqBin() == 0) {
        num_bin -= offset;
      }
      num_total_bin_ += num_bin;
      bin_offsets_.emplace_back(num_total_bin_);
    }
    // append the other group's features, extending the offsets as we go
    for (int i = 0; i < other->num_feature_; ++i) {
      const auto& other_bin_mapper = other->bin_mappers_[i];
      bin_mappers_.emplace_back(new BinMapper(*other_bin_mapper));
      auto num_bin = other_bin_mapper->num_bin();
      if (other_bin_mapper->GetMostFreqBin() == 0) {
        num_bin -= offset;
      }
      num_total_bin_ += num_bin;
      bin_offsets_.emplace_back(num_total_bin_);
      multi_bin_data_.emplace_back(other->multi_bin_data_[i]->Clone());
    }
    num_feature_ += other->num_feature_;
  }

zhangyafeikimi's avatar
zhangyafeikimi committed
293
  /*! \brief Create an iterator over the bins of one feature in this group */
  inline BinIterator* SubFeatureIterator(int sub_feature) {
    const uint32_t most_freq_bin = bin_mappers_[sub_feature]->GetMostFreqBin();
    if (is_multi_val_) {
      // one extra bin is reserved when the most frequent bin is not zero
      const int addi = most_freq_bin == 0 ? 0 : 1;
      const uint32_t min_bin = 1;
      const uint32_t max_bin = bin_mappers_[sub_feature]->num_bin() - 1 + addi;
      return multi_bin_data_[sub_feature]->GetIterator(min_bin, max_bin,
                                                       most_freq_bin);
    }
    const uint32_t min_bin = bin_offsets_[sub_feature];
    const uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
    return bin_data_->GetIterator(min_bin, max_bin, most_freq_bin);
  }

  /*! \brief Finish loading: finalize the bin storage after all pushes.
   *  Multi-value groups finalize each per-feature bin in parallel. */
  inline void FinishLoad() {
    if (is_multi_val_) {
      OMP_INIT_EX();
#pragma omp parallel for schedule(guided)
      for (int i = 0; i < num_feature_; ++i) {
        OMP_LOOP_EX_BEGIN();
        multi_bin_data_[i]->FinishLoad();
        OMP_LOOP_EX_END();
      }
      OMP_THROW_EX();
    } else {
      bin_data_->FinishLoad();
    }
  }
322

323
  /*! \brief Create an iterator over the whole group's fused bin range;
   *  returns nullptr for multi-value groups, which have no fused bin */
  inline BinIterator* FeatureGroupIterator() {
    if (is_multi_val_) {
      return nullptr;
    }
    const uint32_t start_bin = bin_offsets_[0];
    const uint32_t end_bin = bin_offsets_.back() - 1;
    const uint32_t most_freq_bin = 0;
    return bin_data_->GetIterator(start_bin, end_bin, most_freq_bin);
  }
Guolin Ke's avatar
Guolin Ke committed
332

333
334
335
336
337
338
339
340
341
342
343
  /*! \brief Size in bytes of the fused bin data.
   *  NOTE(review): unlike FeatureGroupData(), this does not guard against
   *  multi-value groups, where bin_data_ is not set — presumably callers only
   *  invoke it on non-multi-val groups; confirm at call sites. */
  inline size_t FeatureGroupSizesInByte() {
    return bin_data_->SizesInByte();
  }

  /*! \brief Raw pointer to the fused bin data, or nullptr for multi-value groups */
  inline void* FeatureGroupData() {
    return is_multi_val_ ? nullptr : bin_data_->get_data();
  }

344
345
346
347
348
  /*!
   * \brief Partition the given rows by a split on one feature of this group
   * \param sub_feature Index of the feature within this group
   * \param threshold Threshold(s): a single bin threshold for numerical
   *        features, a bitset for categorical features
   * \param num_threshold Number of entries in threshold (categorical only)
   * \param default_left True if default/missing values go to the <= side
   * \param data_indices Row indices to partition
   * \param cnt Number of entries in data_indices
   * \param lte_indices Output buffer for rows on the <= side
   * \param gt_indices Output buffer for rows on the > side
   * \return The value returned by the underlying Bin::Split /
   *         Bin::SplitCategorical — presumably the number of rows placed in
   *         lte_indices; confirm against the Bin interface
   */
  inline data_size_t Split(int sub_feature, const uint32_t* threshold,
                           int num_threshold, bool default_left,
                           const data_size_t* data_indices, data_size_t cnt,
                           data_size_t* lte_indices,
                           data_size_t* gt_indices) const {
    uint32_t default_bin = bin_mappers_[sub_feature]->GetDefaultBin();
    uint32_t most_freq_bin = bin_mappers_[sub_feature]->GetMostFreqBin();
    if (!is_multi_val_) {
      uint32_t min_bin = bin_offsets_[sub_feature];
      uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
      if (bin_mappers_[sub_feature]->bin_type() == BinType::NumericalBin) {
        auto missing_type = bin_mappers_[sub_feature]->missing_type();
        // single-feature groups use the overload without min_bin
        if (num_feature_ == 1) {
          return bin_data_->Split(max_bin, default_bin, most_freq_bin,
                                  missing_type, default_left, *threshold,
                                  data_indices, cnt, lte_indices, gt_indices);
        } else {
          return bin_data_->Split(min_bin, max_bin, default_bin, most_freq_bin,
                                  missing_type, default_left, *threshold,
                                  data_indices, cnt, lte_indices, gt_indices);
        }
      } else {
        if (num_feature_ == 1) {
          return bin_data_->SplitCategorical(max_bin, most_freq_bin, threshold,
                                             num_threshold, data_indices, cnt,
                                             lte_indices, gt_indices);
        } else {
          return bin_data_->SplitCategorical(
              min_bin, max_bin, most_freq_bin, threshold, num_threshold,
              data_indices, cnt, lte_indices, gt_indices);
        }
      }
    } else {
      // multi-value bins store each feature separately; one extra bin is
      // reserved when the most frequent bin is not zero
      int addi = bin_mappers_[sub_feature]->GetMostFreqBin() == 0 ? 0 : 1;
      uint32_t max_bin = bin_mappers_[sub_feature]->num_bin() - 1 + addi;
      if (bin_mappers_[sub_feature]->bin_type() == BinType::NumericalBin) {
        auto missing_type = bin_mappers_[sub_feature]->missing_type();
        return multi_bin_data_[sub_feature]->Split(
            max_bin, default_bin, most_freq_bin, missing_type, default_left,
            *threshold, data_indices, cnt, lte_indices, gt_indices);
      } else {
        return multi_bin_data_[sub_feature]->SplitCategorical(
            max_bin, most_freq_bin, threshold, num_threshold, data_indices, cnt,
            lte_indices, gt_indices);
      }
    }
  }
391

Guolin Ke's avatar
Guolin Ke committed
392
  /*!
Guolin Ke's avatar
Guolin Ke committed
393
394
395
396
   * \brief From bin to feature value
   * \param bin
   * \return FeatureGroup value of this bin
   */
Guolin Ke's avatar
Guolin Ke committed
397
398
399
400
401
  inline double BinToValue(int sub_feature_idx, uint32_t bin) const {
    return bin_mappers_[sub_feature_idx]->BinToValue(bin);
  }

  /*!
Guolin Ke's avatar
Guolin Ke committed
402
403
404
   * \brief Save binary data to file
   * \param file File want to write
   */
405
  void SaveBinaryToFile(const VirtualFileWriter* writer) const {
406
    writer->AlignedWrite(&is_multi_val_, sizeof(is_multi_val_));
407
    writer->AlignedWrite(&is_dense_multi_val_, sizeof(is_dense_multi_val_));
408
409
    writer->AlignedWrite(&is_sparse_, sizeof(is_sparse_));
    writer->AlignedWrite(&num_feature_, sizeof(num_feature_));
Guolin Ke's avatar
Guolin Ke committed
410
    for (int i = 0; i < num_feature_; ++i) {
411
      bin_mappers_[i]->SaveBinaryToFile(writer);
Guolin Ke's avatar
Guolin Ke committed
412
    }
413
414
415
416
417
418
419
    if (is_multi_val_) {
      for (int i = 0; i < num_feature_; ++i) {
        multi_bin_data_[i]->SaveBinaryToFile(writer);
      }
    } else {
      bin_data_->SaveBinaryToFile(writer);
    }
Guolin Ke's avatar
Guolin Ke committed
420
  }
421

Guolin Ke's avatar
Guolin Ke committed
422
  /*!
Guolin Ke's avatar
Guolin Ke committed
423
424
   * \brief Get sizes in byte of this object
   */
Guolin Ke's avatar
Guolin Ke committed
425
  size_t SizesInByte() const {
426
    size_t ret = VirtualFileWriter::AlignedSize(sizeof(is_multi_val_)) +
427
                 VirtualFileWriter::AlignedSize(sizeof(is_dense_multi_val_)) +
428
429
                 VirtualFileWriter::AlignedSize(sizeof(is_sparse_)) +
                 VirtualFileWriter::AlignedSize(sizeof(num_feature_));
Guolin Ke's avatar
Guolin Ke committed
430
431
432
    for (int i = 0; i < num_feature_; ++i) {
      ret += bin_mappers_[i]->SizesInByte();
    }
433
434
435
436
437
438
439
    if (!is_multi_val_) {
      ret += bin_data_->SizesInByte();
    } else {
      for (int i = 0; i < num_feature_; ++i) {
        ret += multi_bin_data_[i]->SizesInByte();
      }
    }
Guolin Ke's avatar
Guolin Ke committed
440
441
    return ret;
  }
442

Guolin Ke's avatar
Guolin Ke committed
443
444
  /*! \brief Disable copy assignment */
  FeatureGroup& operator=(const FeatureGroup&) = delete;

  /*!
   * \brief Deep copy: clones bin mappers and bin data
   * \param other Group to copy from
   * \param should_handle_dense_mv If true, adjust offsets when this group was
   *        the first group but no longer is after the copy
   * \param group_id New index of this group inside the dataset
   */
  FeatureGroup(const FeatureGroup& other, bool should_handle_dense_mv,
    int group_id) {
    num_feature_ = other.num_feature_;
    is_multi_val_ = other.is_multi_val_;
    is_dense_multi_val_ = other.is_dense_multi_val_;
    is_sparse_ = other.is_sparse_;
    num_total_bin_ = other.num_total_bin_;
    bin_offsets_ = other.bin_offsets_;

    bin_mappers_.reserve(other.bin_mappers_.size());
    for (auto& bin_mapper : other.bin_mappers_) {
      bin_mappers_.emplace_back(new BinMapper(*bin_mapper));
    }
    if (!is_multi_val_) {
      bin_data_.reset(other.bin_data_->Clone());
    } else {
      multi_bin_data_.clear();
      for (int i = 0; i < num_feature_; ++i) {
        multi_bin_data_.emplace_back(other.multi_bin_data_[i]->Clone());
      }
    }

    if (should_handle_dense_mv && is_dense_multi_val_ && group_id > 0) {
      // this feature group was the first feature group, but now no longer is,
      // so we need to eliminate its special empty bin for multi val dense bin
      if (bin_mappers_[0]->GetMostFreqBin() > 0 && bin_offsets_[0] == 1) {
        for (size_t i = 0; i < bin_offsets_.size(); ++i) {
          bin_offsets_[i] -= 1;
        }
        num_total_bin_ -= 1;
      }
    }
  }
Guolin Ke's avatar
Guolin Ke committed
480

Nikita Titov's avatar
Nikita Titov committed
481
 private:
Guolin Ke's avatar
Guolin Ke committed
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
  /*!
   * \brief Allocate the bin storage for this group and set the storage flags
   * \param num_data Number of rows to allocate for
   * \param is_multi_val True to allocate one bin vector per feature
   * \param force_dense Force a dense fused bin (ignored when is_multi_val)
   * \param force_sparse Force a sparse fused bin; takes precedence over the
   *        sparse-rate heuristic (ignored when is_multi_val)
   */
  void CreateBinData(int num_data, bool is_multi_val, bool force_dense, bool force_sparse) {
    if (is_multi_val) {
      multi_bin_data_.clear();
      for (int i = 0; i < num_feature_; ++i) {
        // reserve one extra bin when the most frequent bin is not zero
        int addi = bin_mappers_[i]->GetMostFreqBin() == 0 ? 0 : 1;
        if (bin_mappers_[i]->sparse_rate() >= kSparseThreshold) {
          multi_bin_data_.emplace_back(Bin::CreateSparseBin(
              num_data, bin_mappers_[i]->num_bin() + addi));
        } else {
          multi_bin_data_.emplace_back(
              Bin::CreateDenseBin(num_data, bin_mappers_[i]->num_bin() + addi));
        }
      }
      is_multi_val_ = true;
    } else {
      // sparse only when forced, or (not forced dense) single sparse feature
      if (force_sparse ||
          (!force_dense && num_feature_ == 1 &&
           bin_mappers_[0]->sparse_rate() >= kSparseThreshold)) {
        is_sparse_ = true;
        bin_data_.reset(Bin::CreateSparseBin(num_data, num_total_bin_));
      } else {
        is_sparse_ = false;
        bin_data_.reset(Bin::CreateDenseBin(num_data, num_total_bin_));
      }
      is_multi_val_ = false;
    }
  }

Guolin Ke's avatar
Guolin Ke committed
510
511
512
513
514
515
516
517
  /*! \brief Number of features */
  int num_feature_;
  /*! \brief Bin mapper for sub features */
  std::vector<std::unique_ptr<BinMapper>> bin_mappers_;
  /*! \brief Bin offsets for sub features */
  std::vector<uint32_t> bin_offsets_;
  /*! \brief Fused bin data of this group (used when not multi-value) */
  std::unique_ptr<Bin> bin_data_;
  /*! \brief Per-feature bin data (used when multi-value) */
  std::vector<std::unique_ptr<Bin>> multi_bin_data_;
  /*! \brief True if this group stores one bin vector per feature */
  bool is_multi_val_;
  /*! \brief True if the multi-value bin is stored densely */
  bool is_dense_multi_val_;
  /*! \brief True if this feature group is sparse */
  bool is_sparse_;
  /*! \brief Total number of bins across all features in this group */
  int num_total_bin_;
};

}  // namespace LightGBM

Guolin Ke's avatar
Guolin Ke committed
528
#endif  // LIGHTGBM_FEATURE_GROUP_H_