feature_group.h 20.4 KB
Newer Older
1
2
/*!
 * Copyright (c) 2017 Microsoft Corporation. All rights reserved.
Guolin Ke's avatar
Guolin Ke committed
3
4
 * Licensed under the MIT License. See LICENSE file in the project root for
 * license information.
5
 */
Guolin Ke's avatar
Guolin Ke committed
6
7
8
#ifndef LIGHTGBM_FEATURE_GROUP_H_
#define LIGHTGBM_FEATURE_GROUP_H_

9
10
11
12
#include <LightGBM/bin.h>
#include <LightGBM/meta.h>
#include <LightGBM/utils/random.h>

13
14
15
16
#include <cstdio>
#include <memory>
#include <vector>

Guolin Ke's avatar
Guolin Ke committed
17
18
19
20
namespace LightGBM {

class Dataset;
class DatasetLoader;
Guolin Ke's avatar
Guolin Ke committed
21
struct TrainingShareStates;
22
class MultiValBinWrapper;
Guolin Ke's avatar
Guolin Ke committed
23
24
/*! \brief Using to store data and providing some operations on one feature
 * group*/
Guolin Ke's avatar
Guolin Ke committed
25
class FeatureGroup {
Nikita Titov's avatar
Nikita Titov committed
26
 public:
Guolin Ke's avatar
Guolin Ke committed
27
28
  friend Dataset;
  friend DatasetLoader;
29
30
  friend TrainingShareStates;
  friend MultiValBinWrapper;
Guolin Ke's avatar
Guolin Ke committed
31
32
33
34
35
36
37
  /*!
  * \brief Constructor
  * \param num_feature number of features of this group
  * \param is_multi_val positive if each feature should be stored in its own bin (multi-value group)
  * \param bin_mappers Bin mapper for features; ownership of each mapper is taken (released from the input vector)
  * \param num_data Total number of data
  * \param group_id Index of this group in the dataset; group 0 may keep a reserved bin (see below)
  */
  FeatureGroup(int num_feature, int8_t is_multi_val,
    std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
    data_size_t num_data, int group_id) :
    num_feature_(num_feature), is_multi_val_(is_multi_val > 0), is_sparse_(false) {
    CHECK_EQ(static_cast<int>(bin_mappers->size()), num_feature);
    auto& ref_bin_mappers = *bin_mappers;
    // average sparse rate over all features decides dense vs sparse multi-value storage
    double sum_sparse_rate = 0.0f;
    for (int i = 0; i < num_feature_; ++i) {
      bin_mappers_.emplace_back(ref_bin_mappers[i].release());
      sum_sparse_rate += bin_mappers_.back()->sparse_rate();
    }
    sum_sparse_rate /= num_feature_;
    int offset = 1;
    is_dense_multi_val_ = false;
    if (sum_sparse_rate < MultiValBin::multi_val_bin_sparse_threshold && is_multi_val_) {
      // use dense multi val bin
      offset = 0;
      is_dense_multi_val_ = true;
    }
    // use bin at zero to store most_freq_bin only when not using dense multi val bin
    num_total_bin_ = offset;
    // however, we should force to leave one bin, if dense multi val bin is the first bin
    // and its first feature has most freq bin > 0
    if (group_id == 0 && num_feature_ > 0 && is_dense_multi_val_ &&
      bin_mappers_[0]->GetMostFreqBin() > 0) {
      num_total_bin_ = 1;
    }
    // cumulative offsets: bin_offsets_[i] is the first bin id of feature i
    bin_offsets_.emplace_back(num_total_bin_);
    for (int i = 0; i < num_feature_; ++i) {
      auto num_bin = bin_mappers_[i]->num_bin();
      if (bin_mappers_[i]->GetMostFreqBin() == 0) {
        // the most frequent bin shares the reserved zero bin, so don't count it
        num_bin -= offset;
      }
      num_total_bin_ += num_bin;
      bin_offsets_.emplace_back(num_total_bin_);
    }
    // force_dense = true: fresh non-multi-value groups always start dense
    CreateBinData(num_data, is_multi_val_, true, false);
  }

  FeatureGroup(const FeatureGroup& other, int num_data) {
    num_feature_ = other.num_feature_;
    is_multi_val_ = other.is_multi_val_;
80
    is_dense_multi_val_ = other.is_dense_multi_val_;
Guolin Ke's avatar
Guolin Ke committed
81
82
83
84
85
86
87
    is_sparse_ = other.is_sparse_;
    num_total_bin_ = other.num_total_bin_;
    bin_offsets_ = other.bin_offsets_;

    bin_mappers_.reserve(other.bin_mappers_.size());
    for (auto& bin_mapper : other.bin_mappers_) {
      bin_mappers_.emplace_back(new BinMapper(*bin_mapper));
88
    }
Guolin Ke's avatar
Guolin Ke committed
89
    CreateBinData(num_data, is_multi_val_, !is_sparse_, is_sparse_);
Guolin Ke's avatar
Guolin Ke committed
90
  }
Guolin Ke's avatar
Guolin Ke committed
91

92
93
  FeatureGroup(std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
    data_size_t num_data) : num_feature_(1), is_multi_val_(false) {
Nikita Titov's avatar
Nikita Titov committed
94
    CHECK_EQ(static_cast<int>(bin_mappers->size()), 1);
95
    // use bin at zero to store default_bin
Guolin Ke's avatar
Guolin Ke committed
96
    num_total_bin_ = 1;
97
    is_dense_multi_val_ = false;
Guolin Ke's avatar
Guolin Ke committed
98
    bin_offsets_.emplace_back(num_total_bin_);
Guolin Ke's avatar
Guolin Ke committed
99
    auto& ref_bin_mappers = *bin_mappers;
Guolin Ke's avatar
Guolin Ke committed
100
    for (int i = 0; i < num_feature_; ++i) {
Guolin Ke's avatar
Guolin Ke committed
101
      bin_mappers_.emplace_back(ref_bin_mappers[i].release());
Guolin Ke's avatar
Guolin Ke committed
102
      auto num_bin = bin_mappers_[i]->num_bin();
Guolin Ke's avatar
Guolin Ke committed
103
      if (bin_mappers_[i]->GetMostFreqBin() == 0) {
Guolin Ke's avatar
Guolin Ke committed
104
105
106
107
108
        num_bin -= 1;
      }
      num_total_bin_ += num_bin;
      bin_offsets_.emplace_back(num_total_bin_);
    }
Guolin Ke's avatar
Guolin Ke committed
109
    CreateBinData(num_data, false, false, false);
Guolin Ke's avatar
Guolin Ke committed
110
  }
111

Guolin Ke's avatar
Guolin Ke committed
112
  /*!
   * \brief Constructor from memory
   * \param memory Pointer of memory
   * \param num_all_data Number of global data
   * \param local_used_indices Local used indices, empty means using all data
   * \param group_id Index of this group; group 0 may keep a reserved bin for
   *        dense multi-value storage (must match what SaveBinaryToFile wrote)
   */
  FeatureGroup(const void* memory, data_size_t num_all_data,
               const std::vector<data_size_t>& local_used_indices,
               int group_id) {
    const char* memory_ptr = reinterpret_cast<const char*>(memory);
    // read layout flags in the same order SaveBinaryToFile writes them
    is_multi_val_ = *(reinterpret_cast<const bool*>(memory_ptr));
    memory_ptr += VirtualFileWriter::AlignedSize(sizeof(is_multi_val_));
    is_dense_multi_val_ = *(reinterpret_cast<const bool*>(memory_ptr));
    memory_ptr += VirtualFileWriter::AlignedSize(sizeof(is_dense_multi_val_));
    is_sparse_ = *(reinterpret_cast<const bool*>(memory_ptr));
    memory_ptr += VirtualFileWriter::AlignedSize(sizeof(is_sparse_));
    num_feature_ = *(reinterpret_cast<const int*>(memory_ptr));
    memory_ptr += VirtualFileWriter::AlignedSize(sizeof(num_feature_));
    // get bin mapper
    bin_mappers_.clear();
    for (int i = 0; i < num_feature_; ++i) {
      bin_mappers_.emplace_back(new BinMapper(memory_ptr));
      memory_ptr += bin_mappers_[i]->SizesInByte();
    }

    // rebuild bin offsets; this mirrors the logic of the main constructor
    bin_offsets_.clear();
    int offset = 1;
    if (is_dense_multi_val_) {
      offset = 0;
    }
    // use bin at zero to store most_freq_bin only when not using dense multi val bin
    num_total_bin_ = offset;
    // however, we should force to leave one bin, if dense multi val bin is the first bin
    // and its first feature has most freq bin > 0
    if (group_id == 0 && num_feature_ > 0 && is_dense_multi_val_ &&
      bin_mappers_[0]->GetMostFreqBin() > 0) {
      num_total_bin_ = 1;
    }
    bin_offsets_.emplace_back(num_total_bin_);
    for (int i = 0; i < num_feature_; ++i) {
      auto num_bin = bin_mappers_[i]->num_bin();
      if (bin_mappers_[i]->GetMostFreqBin() == 0) {
        num_bin -= offset;
      }
      num_total_bin_ += num_bin;
      bin_offsets_.emplace_back(num_total_bin_);
    }
    // when a subset of rows is requested, only that many rows are loaded
    data_size_t num_data = num_all_data;
    if (!local_used_indices.empty()) {
      num_data = static_cast<data_size_t>(local_used_indices.size());
    }
    if (is_multi_val_) {
      // one bin per feature; +1 bin unless the most frequent bin is bin 0
      for (int i = 0; i < num_feature_; ++i) {
        int addi = bin_mappers_[i]->GetMostFreqBin() == 0 ? 0 : 1;
        if (bin_mappers_[i]->sparse_rate() >= kSparseThreshold) {
          multi_bin_data_.emplace_back(Bin::CreateSparseBin(
              num_data, bin_mappers_[i]->num_bin() + addi));
        } else {
          multi_bin_data_.emplace_back(
              Bin::CreateDenseBin(num_data, bin_mappers_[i]->num_bin() + addi));
        }
        multi_bin_data_.back()->LoadFromMemory(memory_ptr, local_used_indices);
        memory_ptr += multi_bin_data_.back()->SizesInByte();
      }
    } else {
      if (is_sparse_) {
        bin_data_.reset(Bin::CreateSparseBin(num_data, num_total_bin_));
      } else {
        bin_data_.reset(Bin::CreateDenseBin(num_data, num_total_bin_));
      }
      // get bin data
      bin_data_->LoadFromMemory(memory_ptr, local_used_indices);
    }
  }
188

Guolin Ke's avatar
Guolin Ke committed
189
  /*! \brief Destructor; members release their own resources via RAII */
  ~FeatureGroup() {}
Guolin Ke's avatar
Guolin Ke committed
191
192

  /*!
   * \brief Push one record, will auto convert to bin and push to bin data
   * \param tid Thread id
   * \param sub_feature_idx Index of the feature inside this group
   * \param line_idx Index of record
   * \param value feature value of record
   */
  inline void PushData(int tid, int sub_feature_idx, data_size_t line_idx,
                       double value) {
    uint32_t bin = bin_mappers_[sub_feature_idx]->ValueToBin(value);
    // the most frequent bin is implicit (never stored), so skip it
    if (bin == bin_mappers_[sub_feature_idx]->GetMostFreqBin()) {
      return;
    }
    if (bin_mappers_[sub_feature_idx]->GetMostFreqBin() == 0) {
      // bin 0 is the implicit most-frequent bin; shift remaining bins down
      bin -= 1;
    }
    if (is_multi_val_) {
      // per-feature bins reserve value 0 for "not set", hence the +1
      multi_bin_data_[sub_feature_idx]->Push(tid, line_idx, bin + 1);
    } else {
      // shared bin data: offset into this feature's bin range
      bin += bin_offsets_[sub_feature_idx];
      bin_data_->Push(tid, line_idx, bin);
    }
  }

Guolin Ke's avatar
Guolin Ke committed
215
216
217
218
219
220
221
222
223
224
  /*!
   * \brief Resize the underlying bin storage to hold num_data rows.
   * \param num_data New number of data rows
   */
  void ReSize(int num_data) {
    if (is_multi_val_) {
      // multi-value groups store one bin per feature
      for (int fid = 0; fid < num_feature_; ++fid) {
        multi_bin_data_[fid]->ReSize(num_data);
      }
    } else {
      bin_data_->ReSize(num_data);
    }
  }

225
  inline void CopySubrow(const FeatureGroup* full_feature, const data_size_t* used_indices, data_size_t num_used_indices) {
226
    if (!is_multi_val_) {
227
      bin_data_->CopySubrow(full_feature->bin_data_.get(), used_indices, num_used_indices);
228
229
    } else {
      for (int i = 0; i < num_feature_; ++i) {
230
        multi_bin_data_[i]->CopySubrow(full_feature->multi_bin_data_[i].get(), used_indices, num_used_indices);
231
232
      }
    }
Guolin Ke's avatar
Guolin Ke committed
233
234
  }

235
236
237
238
239
240
241
242
  /*!
   * \brief Copy selected rows from another group, restricted to one column
   *        when the group is multi-value.
   * \param full_feature Source group (same layout as this one)
   * \param used_indices Row indices to copy from the source
   * \param num_used_indices Number of indices in used_indices
   * \param fidx Sub-feature index (only consulted for multi-value groups)
   */
  inline void CopySubrowByCol(const FeatureGroup* full_feature, const data_size_t* used_indices, data_size_t num_used_indices, int fidx) {
    if (is_multi_val_) {
      multi_bin_data_[fidx]->CopySubrow(full_feature->multi_bin_data_[fidx].get(), used_indices, num_used_indices);
    } else {
      bin_data_->CopySubrow(full_feature->bin_data_.get(), used_indices, num_used_indices);
    }
  }

243
  /*!
   * \brief Append all features (mappers and bin data) of another multi-value
   *        group into this one, rebuilding the bin offsets from scratch.
   * \param other Group whose features are appended; its bin data is cloned
   * \param group_id Index of this group after merging (group 0 may keep a
   *        reserved bin for dense multi-value storage)
   */
  void AddFeaturesFrom(const FeatureGroup* other, int group_id) {
    CHECK(is_multi_val_);
    CHECK(other->is_multi_val_);
    // every time when new features are added, we need to reconsider sparse or dense
    double sum_sparse_rate = 0.0f;
    for (int i = 0; i < num_feature_; ++i) {
      sum_sparse_rate += bin_mappers_[i]->sparse_rate();
    }
    for (int i = 0; i < other->num_feature_; ++i) {
      sum_sparse_rate += other->bin_mappers_[i]->sparse_rate();
    }
    sum_sparse_rate /= (num_feature_ + other->num_feature_);
    int offset = 1;
    is_dense_multi_val_ = false;
    if (sum_sparse_rate < MultiValBin::multi_val_bin_sparse_threshold && is_multi_val_) {
      // use dense multi val bin
      offset = 0;
      is_dense_multi_val_ = true;
    }
    bin_offsets_.clear();
    num_total_bin_ = offset;
    // however, we should force to leave one bin, if dense multi val bin is the first bin
    // and its first feature has most freq bin > 0
    if (group_id == 0 && num_feature_ > 0 && is_dense_multi_val_ &&
      bin_mappers_[0]->GetMostFreqBin() > 0) {
      num_total_bin_ = 1;
    }
    // rebuild offsets for the features already in this group
    bin_offsets_.emplace_back(num_total_bin_);
    for (int i = 0; i < num_feature_; ++i) {
      auto num_bin = bin_mappers_[i]->num_bin();
      if (bin_mappers_[i]->GetMostFreqBin() == 0) {
        num_bin -= offset;
      }
      num_total_bin_ += num_bin;
      bin_offsets_.emplace_back(num_total_bin_);
    }
    // append the other group's features: deep-copy mappers, clone bin data
    for (int i = 0; i < other->num_feature_; ++i) {
      const auto& other_bin_mapper = other->bin_mappers_[i];
      bin_mappers_.emplace_back(new BinMapper(*other_bin_mapper));
      auto num_bin = other_bin_mapper->num_bin();
      if (other_bin_mapper->GetMostFreqBin() == 0) {
        num_bin -= offset;
      }
      num_total_bin_ += num_bin;
      bin_offsets_.emplace_back(num_total_bin_);
      multi_bin_data_.emplace_back(other->multi_bin_data_[i]->Clone());
    }
    num_feature_ += other->num_feature_;
  }

zhangyafeikimi's avatar
zhangyafeikimi committed
293
  /*!
   * \brief Create an iterator over one feature's bins.
   * \param sub_feature Index of the feature inside this group
   * \return New iterator (caller takes ownership); the bin range depends on
   *         whether this group uses shared or per-feature (multi-value) storage
   */
  inline BinIterator* SubFeatureIterator(int sub_feature) {
    uint32_t most_freq_bin = bin_mappers_[sub_feature]->GetMostFreqBin();
    if (!is_multi_val_) {
      // shared storage: the feature occupies [bin_offsets_[i], bin_offsets_[i+1])
      uint32_t min_bin = bin_offsets_[sub_feature];
      uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
      return bin_data_->GetIterator(min_bin, max_bin, most_freq_bin);
    } else {
      // per-feature storage: bin 0 is reserved; an extra bin exists when the
      // most frequent bin is not bin 0
      int addi = bin_mappers_[sub_feature]->GetMostFreqBin() == 0 ? 0 : 1;
      uint32_t min_bin = 1;
      uint32_t max_bin = bin_mappers_[sub_feature]->num_bin() - 1 + addi;
      return multi_bin_data_[sub_feature]->GetIterator(min_bin, max_bin,
                                                       most_freq_bin);
    }
  }

  /*!
   * \brief Finish loading: flush pending pushed data into the bin storage.
   *        For multi-value groups this runs in parallel across features,
   *        re-throwing any worker exception after the loop.
   */
  inline void FinishLoad() {
    if (is_multi_val_) {
      OMP_INIT_EX();
#pragma omp parallel for schedule(guided)
      for (int i = 0; i < num_feature_; ++i) {
        OMP_LOOP_EX_BEGIN();
        multi_bin_data_[i]->FinishLoad();
        OMP_LOOP_EX_END();
      }
      OMP_THROW_EX();
    } else {
      bin_data_->FinishLoad();
    }
  }
322

323
  /*!
   * \brief Create an iterator over the whole group's shared bin data.
   * \return New iterator (caller takes ownership), or nullptr for
   *         multi-value groups, which have no single shared bin data
   */
  inline BinIterator* FeatureGroupIterator() {
    if (is_multi_val_) {
      return nullptr;
    }
    // full range of the shared storage; 0 acts as the most-frequent bin
    const uint32_t start_bin = bin_offsets_.front();
    const uint32_t end_bin = bin_offsets_.back() - 1;
    const uint32_t most_freq = 0;
    return bin_data_->GetIterator(start_bin, end_bin, most_freq);
  }
Guolin Ke's avatar
Guolin Ke committed
332

333
334
335
336
337
338
339
340
341
342
343
  inline size_t FeatureGroupSizesInByte() {
    return bin_data_->SizesInByte();
  }

  /*!
   * \brief Raw pointer to the shared bin data buffer.
   * \return nullptr for multi-value groups (no shared buffer), otherwise
   *         the underlying data pointer of bin_data_
   */
  inline void* FeatureGroupData() {
    return is_multi_val_ ? nullptr : bin_data_->get_data();
  }

344
345
346
347
348
  /*!
   * \brief Partition data indices by a threshold on one feature.
   * \param sub_feature Index of the feature inside this group
   * \param threshold Pointer to the threshold; a single value for numerical
   *        features, a bitset for categorical features
   * \param num_threshold Number of threshold words (categorical only)
   * \param default_left True if records in the default bin go to the left side
   * \param data_indices Indices of the records to split
   * \param cnt Number of records in data_indices
   * \param lte_indices Output: indices going to the <= (left) side
   * \param gt_indices Output: indices going to the > (right) side
   * \return Number of indices written to lte_indices
   */
  inline data_size_t Split(int sub_feature, const uint32_t* threshold,
                           int num_threshold, bool default_left,
                           const data_size_t* data_indices, data_size_t cnt,
                           data_size_t* lte_indices,
                           data_size_t* gt_indices) const {
    uint32_t default_bin = bin_mappers_[sub_feature]->GetDefaultBin();
    uint32_t most_freq_bin = bin_mappers_[sub_feature]->GetMostFreqBin();
    if (!is_multi_val_) {
      uint32_t min_bin = bin_offsets_[sub_feature];
      uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
      if (bin_mappers_[sub_feature]->bin_type() == BinType::NumericalBin) {
        auto missing_type = bin_mappers_[sub_feature]->missing_type();
        // single-feature groups use the overload without min_bin
        if (num_feature_ == 1) {
          return bin_data_->Split(max_bin, default_bin, most_freq_bin,
                                  missing_type, default_left, *threshold,
                                  data_indices, cnt, lte_indices, gt_indices);
        } else {
          return bin_data_->Split(min_bin, max_bin, default_bin, most_freq_bin,
                                  missing_type, default_left, *threshold,
                                  data_indices, cnt, lte_indices, gt_indices);
        }
      } else {
        // categorical: threshold is interpreted as a bitset of bins going left
        if (num_feature_ == 1) {
          return bin_data_->SplitCategorical(max_bin, most_freq_bin, threshold,
                                             num_threshold, data_indices, cnt,
                                             lte_indices, gt_indices);
        } else {
          return bin_data_->SplitCategorical(
              min_bin, max_bin, most_freq_bin, threshold, num_threshold,
              data_indices, cnt, lte_indices, gt_indices);
        }
      }
    } else {
      // per-feature storage: bins start at 1; extra bin unless most freq bin is 0
      int addi = bin_mappers_[sub_feature]->GetMostFreqBin() == 0 ? 0 : 1;
      uint32_t max_bin = bin_mappers_[sub_feature]->num_bin() - 1 + addi;
      if (bin_mappers_[sub_feature]->bin_type() == BinType::NumericalBin) {
        auto missing_type = bin_mappers_[sub_feature]->missing_type();
        return multi_bin_data_[sub_feature]->Split(
            max_bin, default_bin, most_freq_bin, missing_type, default_left,
            *threshold, data_indices, cnt, lte_indices, gt_indices);
      } else {
        return multi_bin_data_[sub_feature]->SplitCategorical(
            max_bin, most_freq_bin, threshold, num_threshold, data_indices, cnt,
            lte_indices, gt_indices);
      }
    }
  }
391

Guolin Ke's avatar
Guolin Ke committed
392
  /*!
Guolin Ke's avatar
Guolin Ke committed
393
394
395
396
   * \brief From bin to feature value
   * \param bin
   * \return FeatureGroup value of this bin
   */
Guolin Ke's avatar
Guolin Ke committed
397
398
399
400
401
  inline double BinToValue(int sub_feature_idx, uint32_t bin) const {
    return bin_mappers_[sub_feature_idx]->BinToValue(bin);
  }

  /*!
Guolin Ke's avatar
Guolin Ke committed
402
403
404
   * \brief Save binary data to file
   * \param file File want to write
   */
405
  void SaveBinaryToFile(const VirtualFileWriter* writer) const {
406
    writer->AlignedWrite(&is_multi_val_, sizeof(is_multi_val_));
407
    writer->AlignedWrite(&is_dense_multi_val_, sizeof(is_dense_multi_val_));
408
409
    writer->AlignedWrite(&is_sparse_, sizeof(is_sparse_));
    writer->AlignedWrite(&num_feature_, sizeof(num_feature_));
Guolin Ke's avatar
Guolin Ke committed
410
    for (int i = 0; i < num_feature_; ++i) {
411
      bin_mappers_[i]->SaveBinaryToFile(writer);
Guolin Ke's avatar
Guolin Ke committed
412
    }
413
414
415
416
417
418
419
    if (is_multi_val_) {
      for (int i = 0; i < num_feature_; ++i) {
        multi_bin_data_[i]->SaveBinaryToFile(writer);
      }
    } else {
      bin_data_->SaveBinaryToFile(writer);
    }
Guolin Ke's avatar
Guolin Ke committed
420
  }
421

Guolin Ke's avatar
Guolin Ke committed
422
  /*!
Guolin Ke's avatar
Guolin Ke committed
423
424
   * \brief Get sizes in byte of this object
   */
Guolin Ke's avatar
Guolin Ke committed
425
  size_t SizesInByte() const {
426
    size_t ret = VirtualFileWriter::AlignedSize(sizeof(is_multi_val_)) +
427
                 VirtualFileWriter::AlignedSize(sizeof(is_dense_multi_val_)) +
428
429
                 VirtualFileWriter::AlignedSize(sizeof(is_sparse_)) +
                 VirtualFileWriter::AlignedSize(sizeof(num_feature_));
Guolin Ke's avatar
Guolin Ke committed
430
431
432
    for (int i = 0; i < num_feature_; ++i) {
      ret += bin_mappers_[i]->SizesInByte();
    }
433
434
435
436
437
438
439
    if (!is_multi_val_) {
      ret += bin_data_->SizesInByte();
    } else {
      for (int i = 0; i < num_feature_; ++i) {
        ret += multi_bin_data_[i]->SizesInByte();
      }
    }
Guolin Ke's avatar
Guolin Ke committed
440
441
    return ret;
  }
442

Guolin Ke's avatar
Guolin Ke committed
443
444
  /*! \brief Disable copy */
  FeatureGroup& operator=(const FeatureGroup&) = delete;

  /*!
   * \brief Deep copy, including the bin data contents.
   * \param other Group to copy from
   * \param should_handle_dense_mv Whether to re-check the reserved bin that
   *        only the first dense multi-value group may own
   * \param group_id Index this copy will have in the dataset
   */
  FeatureGroup(const FeatureGroup& other, bool should_handle_dense_mv,
    int group_id) {
    num_feature_ = other.num_feature_;
    is_multi_val_ = other.is_multi_val_;
    is_dense_multi_val_ = other.is_dense_multi_val_;
    is_sparse_ = other.is_sparse_;
    num_total_bin_ = other.num_total_bin_;
    bin_offsets_ = other.bin_offsets_;

    // deep-copy the bin mappers
    bin_mappers_.reserve(other.bin_mappers_.size());
    for (auto& bin_mapper : other.bin_mappers_) {
      bin_mappers_.emplace_back(new BinMapper(*bin_mapper));
    }
    // deep-copy the bin data (contents included, unlike the resize ctor)
    if (!is_multi_val_) {
      bin_data_.reset(other.bin_data_->Clone());
    } else {
      multi_bin_data_.clear();
      for (int i = 0; i < num_feature_; ++i) {
        multi_bin_data_.emplace_back(other.multi_bin_data_[i]->Clone());
      }
    }

    if (should_handle_dense_mv && is_dense_multi_val_ && group_id > 0) {
      // this feature group was the first feature group, but now no longer is,
      // so we need to eliminate its special empty bin for multi val dense bin
      if (bin_mappers_[0]->GetMostFreqBin() > 0 && bin_offsets_[0] == 1) {
        for (size_t i = 0; i < bin_offsets_.size(); ++i) {
          bin_offsets_[i] -= 1;
        }
        num_total_bin_ -= 1;
      }
    }
  }
Guolin Ke's avatar
Guolin Ke committed
480

481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
  /*!
   * \brief Expose the raw column-wise data of one bin (multi-threaded variant).
   * \param sub_feature_index Sub-feature index for multi-value groups, or a
   *        negative value to address the shared bin data
   * \param bit_type Output: bits per stored bin value
   * \param is_sparse Output: whether the returned data is sparse
   * \param bin_iterator Output: per-thread iterators for sparse access
   * \param num_threads Number of threads that will read the data
   * \return Pointer to the underlying column data
   */
  const void* GetColWiseData(const int sub_feature_index,
    uint8_t* bit_type,
    bool* is_sparse,
    std::vector<BinIterator*>* bin_iterator,
    const int num_threads) const {
    if (sub_feature_index < 0) {
      // negative index selects the shared (non-multi-value) bin data
      CHECK(!is_multi_val_);
      return bin_data_->GetColWiseData(bit_type, is_sparse, bin_iterator, num_threads);
    }
    CHECK(is_multi_val_);
    return multi_bin_data_[sub_feature_index]->GetColWiseData(bit_type, is_sparse, bin_iterator, num_threads);
  }

  /*!
   * \brief Expose the raw column-wise data of one bin (single-iterator variant).
   * \param sub_feature_index Sub-feature index for multi-value groups, or a
   *        negative value to address the shared bin data
   * \param bit_type Output: bits per stored bin value
   * \param is_sparse Output: whether the returned data is sparse
   * \param bin_iterator Output: iterator for sparse access
   * \return Pointer to the underlying column data
   */
  const void* GetColWiseData(const int sub_feature_index,
    uint8_t* bit_type,
    bool* is_sparse,
    BinIterator** bin_iterator) const {
    if (sub_feature_index < 0) {
      // negative index selects the shared (non-multi-value) bin data
      CHECK(!is_multi_val_);
      return bin_data_->GetColWiseData(bit_type, is_sparse, bin_iterator);
    }
    CHECK(is_multi_val_);
    return multi_bin_data_[sub_feature_index]->GetColWiseData(bit_type, is_sparse, bin_iterator);
  }

  /*!
   * \brief Largest bin index used by one feature of this group.
   * \param sub_feature_index Index of the feature inside this group
   */
  uint32_t feature_max_bin(const int sub_feature_index) {
    if (is_multi_val_) {
      // an extra bin exists when the most frequent bin is not bin 0
      const int extra = bin_mappers_[sub_feature_index]->GetMostFreqBin() == 0 ? 0 : 1;
      return bin_mappers_[sub_feature_index]->num_bin() - 1 + extra;
    }
    return bin_offsets_[sub_feature_index + 1] - 1;
  }

  /*!
   * \brief Smallest bin index used by one feature of this group.
   * \param sub_feature_index Index of the feature inside this group
   */
  uint32_t feature_min_bin(const int sub_feature_index) {
    // multi-value bins always start at 1 (bin 0 is reserved)
    return is_multi_val_ ? 1 : bin_offsets_[sub_feature_index];
  }

Nikita Titov's avatar
Nikita Titov committed
525
 private:
Guolin Ke's avatar
Guolin Ke committed
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
  /*!
   * \brief Allocate the bin storage for this group.
   * \param num_data Number of data rows the storage must hold
   * \param is_multi_val True to create one bin per feature (multi-value)
   * \param force_dense Force dense storage for the shared bin
   * \param force_sparse Force sparse storage for the shared bin
   */
  void CreateBinData(int num_data, bool is_multi_val, bool force_dense, bool force_sparse) {
    if (is_multi_val) {
      multi_bin_data_.clear();
      for (int i = 0; i < num_feature_; ++i) {
        // reserve one extra bin unless the most frequent bin is bin 0
        int addi = bin_mappers_[i]->GetMostFreqBin() == 0 ? 0 : 1;
        // per-feature dense/sparse choice based on the feature's sparse rate
        if (bin_mappers_[i]->sparse_rate() >= kSparseThreshold) {
          multi_bin_data_.emplace_back(Bin::CreateSparseBin(
              num_data, bin_mappers_[i]->num_bin() + addi));
        } else {
          multi_bin_data_.emplace_back(
              Bin::CreateDenseBin(num_data, bin_mappers_[i]->num_bin() + addi));
        }
      }
      is_multi_val_ = true;
    } else {
      // default heuristic: only single-feature, highly sparse groups go sparse
      if (force_sparse ||
          (!force_dense && num_feature_ == 1 &&
           bin_mappers_[0]->sparse_rate() >= kSparseThreshold)) {
        is_sparse_ = true;
        bin_data_.reset(Bin::CreateSparseBin(num_data, num_total_bin_));
      } else {
        is_sparse_ = false;
        bin_data_.reset(Bin::CreateDenseBin(num_data, num_total_bin_));
      }
      is_multi_val_ = false;
    }
  }

Guolin Ke's avatar
Guolin Ke committed
554
555
556
557
558
559
560
561
  /*! \brief Number of features */
  int num_feature_;
  /*! \brief Bin mapper for sub features */
  std::vector<std::unique_ptr<BinMapper>> bin_mappers_;
  /*! \brief Bin offsets for sub features */
  std::vector<uint32_t> bin_offsets_;
  /*! \brief Bin data of this feature */
  std::unique_ptr<Bin> bin_data_;
  /*! \brief One bin per feature, used instead of bin_data_ when multi-value */
  std::vector<std::unique_ptr<Bin>> multi_bin_data_;
  /*! \brief True if this group stores each feature in its own bin (multi-value) */
  bool is_multi_val_;
  /*! \brief True if multi-value bins are stored densely */
  bool is_dense_multi_val_;
  /*! \brief True if the shared bin data is stored sparsely */
  bool is_sparse_;
  /*! \brief Total number of bins across all features in this group */
  int num_total_bin_;
};

}  // namespace LightGBM

Guolin Ke's avatar
Guolin Ke committed
572
#endif  // LIGHTGBM_FEATURE_GROUP_H_