feature_group.h 20.8 KB
Newer Older
1
2
/*!
 * Copyright (c) 2017 Microsoft Corporation. All rights reserved.
Guolin Ke's avatar
Guolin Ke committed
3
4
 * Licensed under the MIT License. See LICENSE file in the project root for
 * license information.
5
 */
Guolin Ke's avatar
Guolin Ke committed
6
7
8
#ifndef LIGHTGBM_FEATURE_GROUP_H_
#define LIGHTGBM_FEATURE_GROUP_H_

9
10
11
12
#include <LightGBM/bin.h>
#include <LightGBM/meta.h>
#include <LightGBM/utils/random.h>

13
14
15
16
#include <cstdio>
#include <memory>
#include <vector>

Guolin Ke's avatar
Guolin Ke committed
17
18
19
20
namespace LightGBM {

class Dataset;
class DatasetLoader;
Guolin Ke's avatar
Guolin Ke committed
21
struct TrainingShareStates;
22
class MultiValBinWrapper;
Guolin Ke's avatar
Guolin Ke committed
23
24
/*! \brief Using to store data and providing some operations on one feature
 * group*/
Guolin Ke's avatar
Guolin Ke committed
25
class FeatureGroup {
Nikita Titov's avatar
Nikita Titov committed
26
 public:
Guolin Ke's avatar
Guolin Ke committed
27
28
  friend Dataset;
  friend DatasetLoader;
29
30
  friend TrainingShareStates;
  friend MultiValBinWrapper;
Guolin Ke's avatar
Guolin Ke committed
31
32
33
34
35
36
37
  /*!
  * \brief Constructor for a (possibly multi-value) feature group
  * \param num_feature number of features of this group
  * \param is_multi_val > 0 if every feature keeps its own bin object (multi-value group)
  * \param bin_mappers Bin mappers for the features; ownership of each mapper is taken (pointers are released from the input vector)
  * \param num_data Total number of data
  * \param group_id Index of this group in the dataset; group 0 is special-cased below
  */
  FeatureGroup(int num_feature, int8_t is_multi_val,
    std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
    data_size_t num_data, int group_id) :
    num_feature_(num_feature), is_multi_val_(is_multi_val > 0), is_sparse_(false) {
    CHECK_EQ(static_cast<int>(bin_mappers->size()), num_feature);
    auto& ref_bin_mappers = *bin_mappers;
    double sum_sparse_rate = 0.0f;
    // take ownership of the mappers and accumulate their sparsity
    for (int i = 0; i < num_feature_; ++i) {
      bin_mappers_.emplace_back(ref_bin_mappers[i].release());
      sum_sparse_rate += bin_mappers_.back()->sparse_rate();
    }
    sum_sparse_rate /= num_feature_;
    int offset = 1;
    is_dense_multi_val_ = false;
    if (sum_sparse_rate < MultiValBin::multi_val_bin_sparse_threshold && is_multi_val_) {
      // use dense multi val bin
      offset = 0;
      is_dense_multi_val_ = true;
    }
    // use bin at zero to store most_freq_bin only when not using dense multi val bin
    num_total_bin_ = offset;
    // however, we should force to leave one bin, if dense multi val bin is the first bin
    // and its first feature has most freq bin > 0
    if (group_id == 0 && num_feature_ > 0 && is_dense_multi_val_ &&
      bin_mappers_[0]->GetMostFreqBin() > 0) {
      num_total_bin_ = 1;
    }
    // cumulative bin offsets: bin_offsets_[i] is the first bin id of feature i
    bin_offsets_.emplace_back(num_total_bin_);
    for (int i = 0; i < num_feature_; ++i) {
      auto num_bin = bin_mappers_[i]->num_bin();
      if (bin_mappers_[i]->GetMostFreqBin() == 0) {
        // most-frequent bin 0 is stored implicitly, so it does not consume a slot
        num_bin -= offset;
      }
      num_total_bin_ += num_bin;
      bin_offsets_.emplace_back(num_total_bin_);
    }
    // force_dense=true: a fresh group never starts as a sparse single bin
    CreateBinData(num_data, is_multi_val_, true, false);
  }

  FeatureGroup(const FeatureGroup& other, int num_data) {
    num_feature_ = other.num_feature_;
    is_multi_val_ = other.is_multi_val_;
80
    is_dense_multi_val_ = other.is_dense_multi_val_;
Guolin Ke's avatar
Guolin Ke committed
81
82
83
84
85
86
87
    is_sparse_ = other.is_sparse_;
    num_total_bin_ = other.num_total_bin_;
    bin_offsets_ = other.bin_offsets_;

    bin_mappers_.reserve(other.bin_mappers_.size());
    for (auto& bin_mapper : other.bin_mappers_) {
      bin_mappers_.emplace_back(new BinMapper(*bin_mapper));
88
    }
Guolin Ke's avatar
Guolin Ke committed
89
    CreateBinData(num_data, is_multi_val_, !is_sparse_, is_sparse_);
Guolin Ke's avatar
Guolin Ke committed
90
  }
Guolin Ke's avatar
Guolin Ke committed
91

92
93
  /*!
  * \brief Constructor for a single-feature, non-multi-value group
  * \param bin_mappers Must contain exactly one mapper; ownership is taken
  * \param num_data Total number of data
  */
  FeatureGroup(std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
    data_size_t num_data) : num_feature_(1), is_multi_val_(false) {
    CHECK_EQ(static_cast<int>(bin_mappers->size()), 1);
    // use bin at zero to store default_bin
    num_total_bin_ = 1;
    is_dense_multi_val_ = false;
    bin_offsets_.emplace_back(num_total_bin_);
    auto& ref_bin_mappers = *bin_mappers;
    for (int i = 0; i < num_feature_; ++i) {
      bin_mappers_.emplace_back(ref_bin_mappers[i].release());
      auto num_bin = bin_mappers_[i]->num_bin();
      if (bin_mappers_[i]->GetMostFreqBin() == 0) {
        // most-frequent bin 0 is implicit; it shares the reserved zero slot
        num_bin -= 1;
      }
      num_total_bin_ += num_bin;
      bin_offsets_.emplace_back(num_total_bin_);
    }
    // single-feature group: let CreateBinData pick dense vs sparse from sparse_rate
    CreateBinData(num_data, false, false, false);
  }
111

Guolin Ke's avatar
Guolin Ke committed
112
  /*!
   * \brief Constructor from memory (deserialization; layout must match SaveBinaryToFile)
   * \param memory Pointer of memory
   * \param num_all_data Number of global data
   * \param local_used_indices Local used indices, empty means using all data
   * \param group_id Index of this group in the dataset; group 0 is special-cased below
   */
  FeatureGroup(const void* memory, data_size_t num_all_data,
               const std::vector<data_size_t>& local_used_indices,
               int group_id) {
    const char* memory_ptr = reinterpret_cast<const char*>(memory);
    // read the flag header in the same order it was written
    is_multi_val_ = *(reinterpret_cast<const bool*>(memory_ptr));
    memory_ptr += VirtualFileWriter::AlignedSize(sizeof(is_multi_val_));
    is_dense_multi_val_ = *(reinterpret_cast<const bool*>(memory_ptr));
    memory_ptr += VirtualFileWriter::AlignedSize(sizeof(is_dense_multi_val_));
    is_sparse_ = *(reinterpret_cast<const bool*>(memory_ptr));
    memory_ptr += VirtualFileWriter::AlignedSize(sizeof(is_sparse_));
    num_feature_ = *(reinterpret_cast<const int*>(memory_ptr));
    memory_ptr += VirtualFileWriter::AlignedSize(sizeof(num_feature_));
    // get bin mapper
    bin_mappers_.clear();
    for (int i = 0; i < num_feature_; ++i) {
      bin_mappers_.emplace_back(new BinMapper(memory_ptr));
      memory_ptr += bin_mappers_[i]->SizesInByte();
    }

    // rebuild bin offsets exactly as the in-memory constructor does
    bin_offsets_.clear();
    int offset = 1;
    if (is_dense_multi_val_) {
      offset = 0;
    }
    // use bin at zero to store most_freq_bin only when not using dense multi val bin
    num_total_bin_ = offset;
    // however, we should force to leave one bin, if dense multi val bin is the first bin
    // and its first feature has most freq bin > 0
    if (group_id == 0 && num_feature_ > 0 && is_dense_multi_val_ &&
      bin_mappers_[0]->GetMostFreqBin() > 0) {
      num_total_bin_ = 1;
    }
    bin_offsets_.emplace_back(num_total_bin_);
    for (int i = 0; i < num_feature_; ++i) {
      auto num_bin = bin_mappers_[i]->num_bin();
      if (bin_mappers_[i]->GetMostFreqBin() == 0) {
        num_bin -= offset;
      }
      num_total_bin_ += num_bin;
      bin_offsets_.emplace_back(num_total_bin_);
    }
    // when a subset of rows is requested, size the containers to that subset
    data_size_t num_data = num_all_data;
    if (!local_used_indices.empty()) {
      num_data = static_cast<data_size_t>(local_used_indices.size());
    }
    if (is_multi_val_) {
      for (int i = 0; i < num_feature_; ++i) {
        // +1 slot unless the most-frequent bin is 0 (then it is implicit)
        int addi = bin_mappers_[i]->GetMostFreqBin() == 0 ? 0 : 1;
        if (bin_mappers_[i]->sparse_rate() >= kSparseThreshold) {
          multi_bin_data_.emplace_back(Bin::CreateSparseBin(
              num_data, bin_mappers_[i]->num_bin() + addi));
        } else {
          multi_bin_data_.emplace_back(
              Bin::CreateDenseBin(num_data, bin_mappers_[i]->num_bin() + addi));
        }
        multi_bin_data_.back()->LoadFromMemory(memory_ptr, local_used_indices);
        memory_ptr += multi_bin_data_.back()->SizesInByte();
      }
    } else {
      if (is_sparse_) {
        bin_data_.reset(Bin::CreateSparseBin(num_data, num_total_bin_));
      } else {
        bin_data_.reset(Bin::CreateDenseBin(num_data, num_total_bin_));
      }
      // get bin data
      bin_data_->LoadFromMemory(memory_ptr, local_used_indices);
    }
  }
188

Guolin Ke's avatar
Guolin Ke committed
189
  /*! \brief Destructor */
Guolin Ke's avatar
Guolin Ke committed
190
  ~FeatureGroup() {}
Guolin Ke's avatar
Guolin Ke committed
191

192
193
194
195
196
197
198
199
200
201
202
203
204
205
  /*!
  * \brief Initialize for pushing in a streaming fashion.  By default, no action needed.
  * \param num_thread The number of external threads that will be calling the push APIs
  */
  void InitStreaming(int32_t num_thread) {
    if (!is_multi_val_) {
      bin_data_->InitStreaming(num_thread);
      return;
    }
    // multi-value group: every per-feature bin container must be prepared
    for (auto& bin : multi_bin_data_) {
      bin->InitStreaming(num_thread);
    }
  }

Guolin Ke's avatar
Guolin Ke committed
206
  /*!
Guolin Ke's avatar
Guolin Ke committed
207
208
   * \brief Push one record, will auto convert to bin and push to bin data
   * \param tid Thread id
209
210
   * \param sub_feature_idx Index of the subfeature
   * \param line_idx Index of record
Guolin Ke's avatar
Guolin Ke committed
211
212
   * \param value feature value of record
   */
213
  inline void PushData(int tid, int sub_feature_idx, data_size_t line_idx, double value) {
Guolin Ke's avatar
Guolin Ke committed
214
    uint32_t bin = bin_mappers_[sub_feature_idx]->ValueToBin(value);
Guolin Ke's avatar
Guolin Ke committed
215
216
217
    if (bin == bin_mappers_[sub_feature_idx]->GetMostFreqBin()) {
      return;
    }
Guolin Ke's avatar
Guolin Ke committed
218
    if (bin_mappers_[sub_feature_idx]->GetMostFreqBin() == 0) {
Guolin Ke's avatar
Guolin Ke committed
219
220
      bin -= 1;
    }
221
222
223
224
225
226
    if (is_multi_val_) {
      multi_bin_data_[sub_feature_idx]->Push(tid, line_idx, bin + 1);
    } else {
      bin += bin_offsets_[sub_feature_idx];
      bin_data_->Push(tid, line_idx, bin);
    }
Guolin Ke's avatar
Guolin Ke committed
227
228
  }

Guolin Ke's avatar
Guolin Ke committed
229
230
231
232
233
234
235
236
237
238
  /*! \brief Resize the underlying bin container(s) to hold \p num_data rows */
  void ReSize(int num_data) {
    if (is_multi_val_) {
      for (auto& bin : multi_bin_data_) {
        bin->ReSize(num_data);
      }
    } else {
      bin_data_->ReSize(num_data);
    }
  }

239
  inline void CopySubrow(const FeatureGroup* full_feature, const data_size_t* used_indices, data_size_t num_used_indices) {
240
    if (!is_multi_val_) {
241
      bin_data_->CopySubrow(full_feature->bin_data_.get(), used_indices, num_used_indices);
242
243
    } else {
      for (int i = 0; i < num_feature_; ++i) {
244
        multi_bin_data_[i]->CopySubrow(full_feature->multi_bin_data_[i].get(), used_indices, num_used_indices);
245
246
      }
    }
Guolin Ke's avatar
Guolin Ke committed
247
248
  }

249
250
251
252
253
254
255
256
  /*! \brief Like CopySubrow, but for a multi-value group copies only sub-feature \p fidx */
  inline void CopySubrowByCol(const FeatureGroup* full_feature, const data_size_t* used_indices, data_size_t num_used_indices, int fidx) {
    if (is_multi_val_) {
      multi_bin_data_[fidx]->CopySubrow(full_feature->multi_bin_data_[fidx].get(), used_indices, num_used_indices);
    } else {
      // non-multi-value groups have a single shared container; fidx is irrelevant
      bin_data_->CopySubrow(full_feature->bin_data_.get(), used_indices, num_used_indices);
    }
  }

257
  /*!
  * \brief Append all features of \p other to this group (both must be multi-value),
  *        rebuilding bin offsets and re-deciding the dense/sparse multi-val layout.
  * \param other Source group; its bin mappers are deep-copied, its bin data cloned
  * \param group_id Index of this group after merging; group 0 is special-cased below
  */
  void AddFeaturesFrom(const FeatureGroup* other, int group_id) {
    CHECK(is_multi_val_);
    CHECK(other->is_multi_val_);
    // every time when new features are added, we need to reconsider sparse or dense
    double sum_sparse_rate = 0.0f;
    for (int i = 0; i < num_feature_; ++i) {
      sum_sparse_rate += bin_mappers_[i]->sparse_rate();
    }
    for (int i = 0; i < other->num_feature_; ++i) {
      sum_sparse_rate += other->bin_mappers_[i]->sparse_rate();
    }
    sum_sparse_rate /= (num_feature_ + other->num_feature_);
    int offset = 1;
    is_dense_multi_val_ = false;
    if (sum_sparse_rate < MultiValBin::multi_val_bin_sparse_threshold && is_multi_val_) {
      // use dense multi val bin
      offset = 0;
      is_dense_multi_val_ = true;
    }
    // rebuild the offsets from scratch under the (possibly new) offset rule
    bin_offsets_.clear();
    num_total_bin_ = offset;
    // however, we should force to leave one bin, if dense multi val bin is the first bin
    // and its first feature has most freq bin > 0
    if (group_id == 0 && num_feature_ > 0 && is_dense_multi_val_ &&
      bin_mappers_[0]->GetMostFreqBin() > 0) {
      num_total_bin_ = 1;
    }
    bin_offsets_.emplace_back(num_total_bin_);
    for (int i = 0; i < num_feature_; ++i) {
      auto num_bin = bin_mappers_[i]->num_bin();
      if (bin_mappers_[i]->GetMostFreqBin() == 0) {
        num_bin -= offset;
      }
      num_total_bin_ += num_bin;
      bin_offsets_.emplace_back(num_total_bin_);
    }
    // append the other group's features: mapper deep-copy + bin-data clone
    for (int i = 0; i < other->num_feature_; ++i) {
      const auto& other_bin_mapper = other->bin_mappers_[i];
      bin_mappers_.emplace_back(new BinMapper(*other_bin_mapper));
      auto num_bin = other_bin_mapper->num_bin();
      if (other_bin_mapper->GetMostFreqBin() == 0) {
        num_bin -= offset;
      }
      num_total_bin_ += num_bin;
      bin_offsets_.emplace_back(num_total_bin_);
      multi_bin_data_.emplace_back(other->multi_bin_data_[i]->Clone());
    }
    num_feature_ += other->num_feature_;
  }

zhangyafeikimi's avatar
zhangyafeikimi committed
307
  /*!
  * \brief Create an iterator over the bins of one sub-feature.
  * \param sub_feature Index of the sub-feature inside this group
  * \return Iterator owned by the caller context of the bin object
  */
  inline BinIterator* SubFeatureIterator(int sub_feature) {
    const uint32_t most_freq_bin = bin_mappers_[sub_feature]->GetMostFreqBin();
    if (is_multi_val_) {
      // per-feature container: bins start at 1; add one slot unless bin 0 is most frequent
      const int addi = most_freq_bin == 0 ? 0 : 1;
      const uint32_t max_bin = bin_mappers_[sub_feature]->num_bin() - 1 + addi;
      return multi_bin_data_[sub_feature]->GetIterator(1, max_bin, most_freq_bin);
    }
    // shared container: the sub-feature occupies [bin_offsets_[i], bin_offsets_[i+1])
    return bin_data_->GetIterator(bin_offsets_[sub_feature],
                                  bin_offsets_[sub_feature + 1] - 1,
                                  most_freq_bin);
  }

  /*!
  * \brief Finalize bin containers after all data has been pushed.
  *        Multi-value groups finalize each per-feature container in parallel.
  */
  inline void FinishLoad() {
    if (is_multi_val_) {
      // OMP_*_EX macros capture exceptions thrown inside the parallel region
      OMP_INIT_EX();
#pragma omp parallel for schedule(guided)
      for (int i = 0; i < num_feature_; ++i) {
        OMP_LOOP_EX_BEGIN();
        multi_bin_data_[i]->FinishLoad();
        OMP_LOOP_EX_END();
      }
      OMP_THROW_EX();
    } else {
      bin_data_->FinishLoad();
    }
  }
336

337
  /*!
  * \brief Create an iterator over the whole group's shared bin container.
  * \return Iterator over all bins, or nullptr for multi-value groups
  *         (they have no single shared container)
  */
  inline BinIterator* FeatureGroupIterator() {
    if (is_multi_val_) {
      return nullptr;
    }
    const uint32_t most_freq_bin = 0;
    return bin_data_->GetIterator(bin_offsets_.front(),
                                  bin_offsets_.back() - 1,
                                  most_freq_bin);
  }
Guolin Ke's avatar
Guolin Ke committed
346

347
348
349
350
351
352
353
354
355
356
357
  /*!
  * \brief Size in bytes of the shared bin container.
  *        NOTE(review): must only be called when !is_multi_val_ — CreateBinData
  *        never allocates bin_data_ for multi-value groups, so bin_data_ would be
  *        null here (sibling FeatureGroupData guards this case; this method does not).
  */
  inline size_t FeatureGroupSizesInByte() {
    return bin_data_->SizesInByte();
  }

  /*!
  * \brief Raw pointer to the shared bin container's data,
  *        or nullptr for multi-value groups (no shared container exists).
  */
  inline void* FeatureGroupData() {
    return is_multi_val_ ? nullptr : bin_data_->get_data();
  }

358
359
360
361
362
  /*!
  * \brief Partition data indices by a split on one sub-feature.
  * \param sub_feature Index of the sub-feature inside this group
  * \param threshold Split threshold(s); a single value for numerical features,
  *        an array (bitset) of num_threshold entries for categorical features
  * \param num_threshold Number of entries in \p threshold (categorical only)
  * \param default_left True if rows in the default bin go to the <= side
  * \param data_indices Indices to partition
  * \param cnt Number of indices in \p data_indices
  * \param lte_indices Output: indices that go to the <= side
  * \param gt_indices Output: indices that go to the > side
  * \return Number of indices written to \p lte_indices
  */
  inline data_size_t Split(int sub_feature, const uint32_t* threshold,
                           int num_threshold, bool default_left,
                           const data_size_t* data_indices, data_size_t cnt,
                           data_size_t* lte_indices,
                           data_size_t* gt_indices) const {
    uint32_t default_bin = bin_mappers_[sub_feature]->GetDefaultBin();
    uint32_t most_freq_bin = bin_mappers_[sub_feature]->GetMostFreqBin();
    if (!is_multi_val_) {
      uint32_t min_bin = bin_offsets_[sub_feature];
      uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
      if (bin_mappers_[sub_feature]->bin_type() == BinType::NumericalBin) {
        auto missing_type = bin_mappers_[sub_feature]->missing_type();
        // single-feature groups use the overload without min_bin
        if (num_feature_ == 1) {
          return bin_data_->Split(max_bin, default_bin, most_freq_bin,
                                  missing_type, default_left, *threshold,
                                  data_indices, cnt, lte_indices, gt_indices);
        } else {
          return bin_data_->Split(min_bin, max_bin, default_bin, most_freq_bin,
                                  missing_type, default_left, *threshold,
                                  data_indices, cnt, lte_indices, gt_indices);
        }
      } else {
        if (num_feature_ == 1) {
          return bin_data_->SplitCategorical(max_bin, most_freq_bin, threshold,
                                             num_threshold, data_indices, cnt,
                                             lte_indices, gt_indices);
        } else {
          return bin_data_->SplitCategorical(
              min_bin, max_bin, most_freq_bin, threshold, num_threshold,
              data_indices, cnt, lte_indices, gt_indices);
        }
      }
    } else {
      // multi-value group: bins start at 1; +1 slot unless most-frequent bin is 0
      int addi = bin_mappers_[sub_feature]->GetMostFreqBin() == 0 ? 0 : 1;
      uint32_t max_bin = bin_mappers_[sub_feature]->num_bin() - 1 + addi;
      if (bin_mappers_[sub_feature]->bin_type() == BinType::NumericalBin) {
        auto missing_type = bin_mappers_[sub_feature]->missing_type();
        return multi_bin_data_[sub_feature]->Split(
            max_bin, default_bin, most_freq_bin, missing_type, default_left,
            *threshold, data_indices, cnt, lte_indices, gt_indices);
      } else {
        return multi_bin_data_[sub_feature]->SplitCategorical(
            max_bin, most_freq_bin, threshold, num_threshold, data_indices, cnt,
            lte_indices, gt_indices);
      }
    }
  }
405

Guolin Ke's avatar
Guolin Ke committed
406
  /*!
   * \brief Convert a bin index of one sub-feature back to a feature value
   * \param sub_feature_idx Index of the sub-feature inside this group
   * \param bin Bin index
   * \return Representative feature value of this bin
   */
  inline double BinToValue(int sub_feature_idx, uint32_t bin) const {
    return bin_mappers_[sub_feature_idx]->BinToValue(bin);
  }

  /*!
Guolin Ke's avatar
Guolin Ke committed
416
417
418
   * \brief Save binary data to file
   * \param file File want to write
   */
419
  void SaveBinaryToFile(const VirtualFileWriter* writer) const {
420
    writer->AlignedWrite(&is_multi_val_, sizeof(is_multi_val_));
421
    writer->AlignedWrite(&is_dense_multi_val_, sizeof(is_dense_multi_val_));
422
423
    writer->AlignedWrite(&is_sparse_, sizeof(is_sparse_));
    writer->AlignedWrite(&num_feature_, sizeof(num_feature_));
Guolin Ke's avatar
Guolin Ke committed
424
    for (int i = 0; i < num_feature_; ++i) {
425
      bin_mappers_[i]->SaveBinaryToFile(writer);
Guolin Ke's avatar
Guolin Ke committed
426
    }
427
428
429
430
431
432
433
    if (is_multi_val_) {
      for (int i = 0; i < num_feature_; ++i) {
        multi_bin_data_[i]->SaveBinaryToFile(writer);
      }
    } else {
      bin_data_->SaveBinaryToFile(writer);
    }
Guolin Ke's avatar
Guolin Ke committed
434
  }
435

Guolin Ke's avatar
Guolin Ke committed
436
  /*!
Guolin Ke's avatar
Guolin Ke committed
437
438
   * \brief Get sizes in byte of this object
   */
Guolin Ke's avatar
Guolin Ke committed
439
  size_t SizesInByte() const {
440
    size_t ret = VirtualFileWriter::AlignedSize(sizeof(is_multi_val_)) +
441
                 VirtualFileWriter::AlignedSize(sizeof(is_dense_multi_val_)) +
442
443
                 VirtualFileWriter::AlignedSize(sizeof(is_sparse_)) +
                 VirtualFileWriter::AlignedSize(sizeof(num_feature_));
Guolin Ke's avatar
Guolin Ke committed
444
445
446
    for (int i = 0; i < num_feature_; ++i) {
      ret += bin_mappers_[i]->SizesInByte();
    }
447
448
449
450
451
452
453
    if (!is_multi_val_) {
      ret += bin_data_->SizesInByte();
    } else {
      for (int i = 0; i < num_feature_; ++i) {
        ret += multi_bin_data_[i]->SizesInByte();
      }
    }
Guolin Ke's avatar
Guolin Ke committed
454
455
    return ret;
  }
456

Guolin Ke's avatar
Guolin Ke committed
457
458
  /*! \brief Disable copy */
  FeatureGroup& operator=(const FeatureGroup&) = delete;

  /*!
  * \brief Deep copy, including the bin data itself.
  * \param other Source group
  * \param should_handle_dense_mv True if the group-0 dense multi-val correction below may apply
  * \param group_id New index of this group (may differ from the source's)
  */
  FeatureGroup(const FeatureGroup& other, bool should_handle_dense_mv,
    int group_id) {
    num_feature_ = other.num_feature_;
    is_multi_val_ = other.is_multi_val_;
    is_dense_multi_val_ = other.is_dense_multi_val_;
    is_sparse_ = other.is_sparse_;
    num_total_bin_ = other.num_total_bin_;
    bin_offsets_ = other.bin_offsets_;

    // deep-copy the mappers
    bin_mappers_.reserve(other.bin_mappers_.size());
    for (auto& bin_mapper : other.bin_mappers_) {
      bin_mappers_.emplace_back(new BinMapper(*bin_mapper));
    }
    // clone the bin storage, matching the source's layout
    if (!is_multi_val_) {
      bin_data_.reset(other.bin_data_->Clone());
    } else {
      multi_bin_data_.clear();
      for (int i = 0; i < num_feature_; ++i) {
        multi_bin_data_.emplace_back(other.multi_bin_data_[i]->Clone());
      }
    }

    if (should_handle_dense_mv && is_dense_multi_val_ && group_id > 0) {
      // this feature group was the first feature group, but now no longer is,
      // so we need to eliminate its special empty bin for multi val dense bin
      if (bin_mappers_[0]->GetMostFreqBin() > 0 && bin_offsets_[0] == 1) {
        for (size_t i = 0; i < bin_offsets_.size(); ++i) {
          bin_offsets_[i] -= 1;
        }
        num_total_bin_ -= 1;
      }
    }
  }
Guolin Ke's avatar
Guolin Ke committed
494

495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
  /*!
  * \brief Column-wise raw data access (multi-threaded iterator variant).
  * \param sub_feature_index >= 0 selects a sub-feature of a multi-value group;
  *        a negative value selects the shared container of a non-multi-value group
  * \param bit_type Output: bit width of the stored values
  * \param is_sparse Output: whether the underlying container is sparse
  * \param bin_iterator Output: per-thread iterators
  * \param num_threads Number of iterators to create
  */
  const void* GetColWiseData(const int sub_feature_index,
    uint8_t* bit_type,
    bool* is_sparse,
    std::vector<BinIterator*>* bin_iterator,
    const int num_threads) const {
    if (sub_feature_index < 0) {
      CHECK(!is_multi_val_);
      return bin_data_->GetColWiseData(bit_type, is_sparse, bin_iterator, num_threads);
    }
    CHECK(is_multi_val_);
    return multi_bin_data_[sub_feature_index]->GetColWiseData(bit_type, is_sparse, bin_iterator, num_threads);
  }

  /*!
  * \brief Column-wise raw data access (single-iterator variant).
  * \param sub_feature_index >= 0 selects a sub-feature of a multi-value group;
  *        a negative value selects the shared container of a non-multi-value group
  * \param bit_type Output: bit width of the stored values
  * \param is_sparse Output: whether the underlying container is sparse
  * \param bin_iterator Output: single iterator
  */
  const void* GetColWiseData(const int sub_feature_index,
    uint8_t* bit_type,
    bool* is_sparse,
    BinIterator** bin_iterator) const {
    if (sub_feature_index < 0) {
      CHECK(!is_multi_val_);
      return bin_data_->GetColWiseData(bit_type, is_sparse, bin_iterator);
    }
    CHECK(is_multi_val_);
    return multi_bin_data_[sub_feature_index]->GetColWiseData(bit_type, is_sparse, bin_iterator);
  }

  /*! \brief Largest bin id used by one sub-feature in its container */
  uint32_t feature_max_bin(const int sub_feature_index) {
    if (is_multi_val_) {
      // +1 slot unless the most-frequent bin is 0 (then it is implicit)
      const int addi = bin_mappers_[sub_feature_index]->GetMostFreqBin() == 0 ? 0 : 1;
      return bin_mappers_[sub_feature_index]->num_bin() - 1 + addi;
    }
    return bin_offsets_[sub_feature_index + 1] - 1;
  }

  /*! \brief Smallest bin id used by one sub-feature in its container */
  uint32_t feature_min_bin(const int sub_feature_index) {
    // multi-value containers always start their bins at 1
    return is_multi_val_ ? 1 : bin_offsets_[sub_feature_index];
  }

Nikita Titov's avatar
Nikita Titov committed
539
 private:
Guolin Ke's avatar
Guolin Ke committed
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
  /*!
  * \brief Allocate the bin container(s); also sets is_multi_val_ and is_sparse_ as side effects.
  * \param num_data Number of rows to allocate for
  * \param is_multi_val True to create one container per feature
  * \param force_dense Force the shared container to be dense (non-multi-value only)
  * \param force_sparse Force the shared container to be sparse (non-multi-value only)
  */
  void CreateBinData(int num_data, bool is_multi_val, bool force_dense, bool force_sparse) {
    if (is_multi_val) {
      multi_bin_data_.clear();
      for (int i = 0; i < num_feature_; ++i) {
        // +1 slot unless the most-frequent bin is 0 (then it is implicit)
        int addi = bin_mappers_[i]->GetMostFreqBin() == 0 ? 0 : 1;
        if (bin_mappers_[i]->sparse_rate() >= kSparseThreshold) {
          multi_bin_data_.emplace_back(Bin::CreateSparseBin(
              num_data, bin_mappers_[i]->num_bin() + addi));
        } else {
          multi_bin_data_.emplace_back(
              Bin::CreateDenseBin(num_data, bin_mappers_[i]->num_bin() + addi));
        }
      }
      is_multi_val_ = true;
    } else {
      // auto-pick sparse only for single-feature groups above the sparsity threshold
      if (force_sparse ||
          (!force_dense && num_feature_ == 1 &&
           bin_mappers_[0]->sparse_rate() >= kSparseThreshold)) {
        is_sparse_ = true;
        bin_data_.reset(Bin::CreateSparseBin(num_data, num_total_bin_));
      } else {
        is_sparse_ = false;
        bin_data_.reset(Bin::CreateDenseBin(num_data, num_total_bin_));
      }
      is_multi_val_ = false;
    }
  }

Guolin Ke's avatar
Guolin Ke committed
568
569
570
571
572
573
574
575
  /*! \brief Number of features */
  int num_feature_;
  /*! \brief Bin mapper for sub features */
  std::vector<std::unique_ptr<BinMapper>> bin_mappers_;
  /*! \brief Bin offsets for sub features */
  std::vector<uint32_t> bin_offsets_;
  /*! \brief Bin data of this feature (used only when !is_multi_val_) */
  std::unique_ptr<Bin> bin_data_;
  /*! \brief Per-feature bin data (used only when is_multi_val_) */
  std::vector<std::unique_ptr<Bin>> multi_bin_data_;
  /*! \brief True if each feature keeps its own bin container */
  bool is_multi_val_;
  /*! \brief True if the multi-value representation is dense */
  bool is_dense_multi_val_;
  /*! \brief True if this feature is sparse */
  bool is_sparse_;
  /*! \brief Total number of bins across all sub features */
  int num_total_bin_;
};

}  // namespace LightGBM

Guolin Ke's avatar
Guolin Ke committed
586
#endif  // LIGHTGBM_FEATURE_GROUP_H_