feature_group.h 21 KB
Newer Older
1
2
/*!
 * Copyright (c) 2017 Microsoft Corporation. All rights reserved.
Guolin Ke's avatar
Guolin Ke committed
3
4
 * Licensed under the MIT License. See LICENSE file in the project root for
 * license information.
5
 */
Guolin Ke's avatar
Guolin Ke committed
6
7
8
#ifndef LIGHTGBM_FEATURE_GROUP_H_
#define LIGHTGBM_FEATURE_GROUP_H_

9
10
11
12
#include <LightGBM/bin.h>
#include <LightGBM/meta.h>
#include <LightGBM/utils/random.h>

13
14
15
16
#include <cstdio>
#include <memory>
#include <vector>

Guolin Ke's avatar
Guolin Ke committed
17
18
19
20
namespace LightGBM {

class Dataset;
class DatasetLoader;
Guolin Ke's avatar
Guolin Ke committed
21
struct TrainingShareStates;
22
class MultiValBinWrapper;
Guolin Ke's avatar
Guolin Ke committed
23
24
/*! \brief Using to store data and providing some operations on one feature
 * group*/
Guolin Ke's avatar
Guolin Ke committed
25
class FeatureGroup {
Nikita Titov's avatar
Nikita Titov committed
26
 public:
Guolin Ke's avatar
Guolin Ke committed
27
28
  friend Dataset;
  friend DatasetLoader;
29
30
  friend TrainingShareStates;
  friend MultiValBinWrapper;
Guolin Ke's avatar
Guolin Ke committed
31
32
33
34
35
36
37
  /*!
  * \brief Constructor for a (possibly multi-value) feature group.
  *        Takes ownership of the given bin mappers and builds the cumulative
  *        bin-offset table, then allocates the bin storage.
  * \param num_feature Number of features in this group
  * \param is_multi_val Non-zero if each feature keeps its own bin storage
  *        (multi-value mode)
  * \param bin_mappers Bin mappers for the features; their pointers are
  *        released into this group
  * \param num_data Total number of data rows
  * \param group_id Index of this group in the dataset; group 0 gets special
  *        handling for dense multi-value bins (see below)
  */
  FeatureGroup(int num_feature, int8_t is_multi_val,
    std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
    data_size_t num_data, int group_id) :
    num_feature_(num_feature), is_multi_val_(is_multi_val > 0), is_sparse_(false) {
    CHECK_EQ(static_cast<int>(bin_mappers->size()), num_feature);
    auto& ref_bin_mappers = *bin_mappers;
    double sum_sparse_rate = 0.0f;
    for (int i = 0; i < num_feature_; ++i) {
      // take ownership of each mapper and accumulate sparsity for the
      // dense-vs-sparse multi-value decision below
      bin_mappers_.emplace_back(ref_bin_mappers[i].release());
      sum_sparse_rate += bin_mappers_.back()->sparse_rate();
    }
    sum_sparse_rate /= num_feature_;
    int offset = 1;
    is_dense_multi_val_ = false;
    if (sum_sparse_rate < MultiValBin::multi_val_bin_sparse_threshold && is_multi_val_) {
      // use dense multi val bin
      offset = 0;
      is_dense_multi_val_ = true;
    }
    // use bin at zero to store most_freq_bin only when not using dense multi val bin
    num_total_bin_ = offset;
    // however, we should force to leave one bin, if dense multi val bin is the first bin
    // and its first feature has most freq bin > 0
    if (group_id == 0 && num_feature_ > 0 && is_dense_multi_val_ &&
      bin_mappers_[0]->GetMostFreqBin() > 0) {
      num_total_bin_ = 1;
    }
    // build cumulative per-feature bin offsets; a feature whose most-frequent
    // bin is 0 shares the reserved zero bin, so it contributes one bin fewer
    bin_offsets_.emplace_back(num_total_bin_);
    for (int i = 0; i < num_feature_; ++i) {
      auto num_bin = bin_mappers_[i]->num_bin();
      if (bin_mappers_[i]->GetMostFreqBin() == 0) {
        num_bin -= offset;
      }
      num_total_bin_ += num_bin;
      bin_offsets_.emplace_back(num_total_bin_);
    }
    // force_dense=true: a freshly constructed group always starts dense
    CreateBinData(num_data, is_multi_val_, true, false);
  }

  /*!
   * \brief Copy metadata from another group and allocate fresh bin storage
   *        sized for a (possibly different) number of rows. Bin mappers are
   *        deep-copied; bin contents are NOT copied.
   * \param other Group whose metadata (offsets, flags, mappers) is copied
   * \param num_data Number of rows to allocate in the new bin storage
   */
  FeatureGroup(const FeatureGroup& other, int num_data) {
    num_feature_ = other.num_feature_;
    is_multi_val_ = other.is_multi_val_;
    is_dense_multi_val_ = other.is_dense_multi_val_;
    is_sparse_ = other.is_sparse_;
    num_total_bin_ = other.num_total_bin_;
    bin_offsets_ = other.bin_offsets_;

    bin_mappers_.reserve(other.bin_mappers_.size());
    for (auto& bin_mapper : other.bin_mappers_) {
      bin_mappers_.emplace_back(new BinMapper(*bin_mapper));
    }
    // preserve the source's dense/sparse choice rather than re-deciding it
    CreateBinData(num_data, is_multi_val_, !is_sparse_, is_sparse_);
  }
Guolin Ke's avatar
Guolin Ke committed
91

92
93
  /*!
   * \brief Constructor for a single-feature, non-multi-value group.
   * \param bin_mappers Bin mappers; must contain exactly one, whose ownership
   *        is released into this group
   * \param num_data Total number of data rows
   */
  FeatureGroup(std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
    data_size_t num_data) : num_feature_(1), is_multi_val_(false) {
    CHECK_EQ(static_cast<int>(bin_mappers->size()), 1);
    // use bin at zero to store default_bin
    num_total_bin_ = 1;
    is_dense_multi_val_ = false;
    bin_offsets_.emplace_back(num_total_bin_);
    auto& ref_bin_mappers = *bin_mappers;
    for (int i = 0; i < num_feature_; ++i) {
      bin_mappers_.emplace_back(ref_bin_mappers[i].release());
      auto num_bin = bin_mappers_[i]->num_bin();
      if (bin_mappers_[i]->GetMostFreqBin() == 0) {
        // most-frequent bin shares the reserved zero bin
        num_bin -= 1;
      }
      num_total_bin_ += num_bin;
      bin_offsets_.emplace_back(num_total_bin_);
    }
    // single feature, never multi-val; let CreateBinData pick dense vs sparse
    CreateBinData(num_data, false, false, false);
  }
111

Guolin Ke's avatar
Guolin Ke committed
112
  /*!
   * \brief Constructor from memory
   *        Deserializes a group previously written by SaveBinaryToFile; the
   *        read order here must mirror that write order exactly
   *        (is_multi_val, is_dense_multi_val, is_sparse, num_feature,
   *        mappers, then bin data).
   * \param memory Pointer of memory
   * \param num_all_data Number of global data
   * \param local_used_indices Local used indices, empty means using all data
   * \param group_id Index of this group; group 0 gets special handling for
   *        dense multi-value bins
   */
  FeatureGroup(const void* memory, data_size_t num_all_data,
               const std::vector<data_size_t>& local_used_indices,
               int group_id) {
    const char* memory_ptr = reinterpret_cast<const char*>(memory);
    // get is_sparse
    is_multi_val_ = *(reinterpret_cast<const bool*>(memory_ptr));
    memory_ptr += VirtualFileWriter::AlignedSize(sizeof(is_multi_val_));
    is_dense_multi_val_ = *(reinterpret_cast<const bool*>(memory_ptr));
    memory_ptr += VirtualFileWriter::AlignedSize(sizeof(is_dense_multi_val_));
    is_sparse_ = *(reinterpret_cast<const bool*>(memory_ptr));
    memory_ptr += VirtualFileWriter::AlignedSize(sizeof(is_sparse_));
    num_feature_ = *(reinterpret_cast<const int*>(memory_ptr));
    memory_ptr += VirtualFileWriter::AlignedSize(sizeof(num_feature_));
    // get bin mapper
    bin_mappers_.clear();
    for (int i = 0; i < num_feature_; ++i) {
      bin_mappers_.emplace_back(new BinMapper(memory_ptr));
      memory_ptr += bin_mappers_[i]->SizesInByte();
    }

    // rebuild the bin-offset table the same way the in-memory constructor does
    bin_offsets_.clear();
    int offset = 1;
    if (is_dense_multi_val_) {
      offset = 0;
    }
    // use bin at zero to store most_freq_bin only when not using dense multi val bin
    num_total_bin_ = offset;
    // however, we should force to leave one bin, if dense multi val bin is the first bin
    // and its first feature has most freq bin > 0
    if (group_id == 0 && num_feature_ > 0 && is_dense_multi_val_ &&
      bin_mappers_[0]->GetMostFreqBin() > 0) {
      num_total_bin_ = 1;
    }
    bin_offsets_.emplace_back(num_total_bin_);
    for (int i = 0; i < num_feature_; ++i) {
      auto num_bin = bin_mappers_[i]->num_bin();
      if (bin_mappers_[i]->GetMostFreqBin() == 0) {
        num_bin -= offset;
      }
      num_total_bin_ += num_bin;
      bin_offsets_.emplace_back(num_total_bin_);
    }
    // when a subset of rows is requested, size the bins for the subset
    data_size_t num_data = num_all_data;
    if (!local_used_indices.empty()) {
      num_data = static_cast<data_size_t>(local_used_indices.size());
    }
    if (is_multi_val_) {
      for (int i = 0; i < num_feature_; ++i) {
        // extra bin slot when most_freq_bin != 0 (bin 0 is not shared then)
        int addi = bin_mappers_[i]->GetMostFreqBin() == 0 ? 0 : 1;
        if (bin_mappers_[i]->sparse_rate() >= kSparseThreshold) {
          multi_bin_data_.emplace_back(Bin::CreateSparseBin(
              num_data, bin_mappers_[i]->num_bin() + addi));
        } else {
          multi_bin_data_.emplace_back(
              Bin::CreateDenseBin(num_data, bin_mappers_[i]->num_bin() + addi));
        }
        multi_bin_data_.back()->LoadFromMemory(memory_ptr, local_used_indices);
        memory_ptr += multi_bin_data_.back()->SizesInByte();
      }
    } else {
      if (is_sparse_) {
        bin_data_.reset(Bin::CreateSparseBin(num_data, num_total_bin_));
      } else {
        bin_data_.reset(Bin::CreateDenseBin(num_data, num_total_bin_));
      }
      // get bin data
      bin_data_->LoadFromMemory(memory_ptr, local_used_indices);
    }
  }
188

Guolin Ke's avatar
Guolin Ke committed
189
  /*! \brief Destructor */
Guolin Ke's avatar
Guolin Ke committed
190
  ~FeatureGroup() {}
Guolin Ke's avatar
Guolin Ke committed
191

192
193
194
  /*!
  * \brief Initialize for pushing in a streaming fashion.  By default, no action needed.
  * \param num_thread The number of external threads that will be calling the push APIs
195
  * \param omp_max_threads The maximum number of OpenMP threads to allocate for
196
  */
197
  void InitStreaming(int32_t num_thread, int32_t omp_max_threads) {
198
199
    if (is_multi_val_) {
      for (int i = 0; i < num_feature_; ++i) {
200
        multi_bin_data_[i]->InitStreaming(num_thread, omp_max_threads);
201
202
      }
    } else {
203
      bin_data_->InitStreaming(num_thread, omp_max_threads);
204
205
206
    }
  }

Guolin Ke's avatar
Guolin Ke committed
207
  /*!
Guolin Ke's avatar
Guolin Ke committed
208
209
   * \brief Push one record, will auto convert to bin and push to bin data
   * \param tid Thread id
210
211
   * \param sub_feature_idx Index of the subfeature
   * \param line_idx Index of record
Guolin Ke's avatar
Guolin Ke committed
212
213
   * \param value feature value of record
   */
214
  inline void PushData(int tid, int sub_feature_idx, data_size_t line_idx, double value) {
Guolin Ke's avatar
Guolin Ke committed
215
    uint32_t bin = bin_mappers_[sub_feature_idx]->ValueToBin(value);
Guolin Ke's avatar
Guolin Ke committed
216
217
218
    if (bin == bin_mappers_[sub_feature_idx]->GetMostFreqBin()) {
      return;
    }
Guolin Ke's avatar
Guolin Ke committed
219
    if (bin_mappers_[sub_feature_idx]->GetMostFreqBin() == 0) {
Guolin Ke's avatar
Guolin Ke committed
220
221
      bin -= 1;
    }
222
223
224
225
226
227
    if (is_multi_val_) {
      multi_bin_data_[sub_feature_idx]->Push(tid, line_idx, bin + 1);
    } else {
      bin += bin_offsets_[sub_feature_idx];
      bin_data_->Push(tid, line_idx, bin);
    }
Guolin Ke's avatar
Guolin Ke committed
228
229
  }

Guolin Ke's avatar
Guolin Ke committed
230
231
232
233
234
235
236
237
238
239
  /*! \brief Resize the underlying bin storage to hold num_data rows. */
  void ReSize(int num_data) {
    if (is_multi_val_) {
      for (int fidx = 0; fidx < num_feature_; ++fidx) {
        multi_bin_data_[fidx]->ReSize(num_data);
      }
    } else {
      bin_data_->ReSize(num_data);
    }
  }

240
  inline void CopySubrow(const FeatureGroup* full_feature, const data_size_t* used_indices, data_size_t num_used_indices) {
241
    if (!is_multi_val_) {
242
      bin_data_->CopySubrow(full_feature->bin_data_.get(), used_indices, num_used_indices);
243
244
    } else {
      for (int i = 0; i < num_feature_; ++i) {
245
        multi_bin_data_[i]->CopySubrow(full_feature->multi_bin_data_[i].get(), used_indices, num_used_indices);
246
247
      }
    }
Guolin Ke's avatar
Guolin Ke committed
248
249
  }

250
251
252
253
254
255
256
257
  /*!
   * \brief Copy a subset of rows for one column only.
   *        For non-multi-value groups the whole shared bin is copied
   *        (fidx is unused there); for multi-value groups only feature fidx.
   */
  inline void CopySubrowByCol(const FeatureGroup* full_feature, const data_size_t* used_indices, data_size_t num_used_indices, int fidx) {
    if (is_multi_val_) {
      multi_bin_data_[fidx]->CopySubrow(full_feature->multi_bin_data_[fidx].get(), used_indices, num_used_indices);
    } else {
      bin_data_->CopySubrow(full_feature->bin_data_.get(), used_indices, num_used_indices);
    }
  }

258
  /*!
   * \brief Merge another multi-value group's features into this one.
   *        Deep-copies the other group's mappers and clones its bin data,
   *        then rebuilds the bin-offset table from scratch. Both groups
   *        must be in multi-value mode.
   * \param other Group whose features are appended (not modified)
   * \param group_id New index of this group; group 0 gets special handling
   *        for dense multi-value bins
   */
  void AddFeaturesFrom(const FeatureGroup* other, int group_id) {
    CHECK(is_multi_val_);
    CHECK(other->is_multi_val_);
    // every time when new features are added, we need to reconsider sparse or dense
    double sum_sparse_rate = 0.0f;
    for (int i = 0; i < num_feature_; ++i) {
      sum_sparse_rate += bin_mappers_[i]->sparse_rate();
    }
    for (int i = 0; i < other->num_feature_; ++i) {
      sum_sparse_rate += other->bin_mappers_[i]->sparse_rate();
    }
    sum_sparse_rate /= (num_feature_ + other->num_feature_);
    int offset = 1;
    is_dense_multi_val_ = false;
    if (sum_sparse_rate < MultiValBin::multi_val_bin_sparse_threshold && is_multi_val_) {
      // use dense multi val bin
      offset = 0;
      is_dense_multi_val_ = true;
    }
    bin_offsets_.clear();
    num_total_bin_ = offset;
    // however, we should force to leave one bin, if dense multi val bin is the first bin
    // and its first feature has most freq bin > 0
    if (group_id == 0 && num_feature_ > 0 && is_dense_multi_val_ &&
      bin_mappers_[0]->GetMostFreqBin() > 0) {
      num_total_bin_ = 1;
    }
    // re-accumulate offsets for the existing features first...
    bin_offsets_.emplace_back(num_total_bin_);
    for (int i = 0; i < num_feature_; ++i) {
      auto num_bin = bin_mappers_[i]->num_bin();
      if (bin_mappers_[i]->GetMostFreqBin() == 0) {
        num_bin -= offset;
      }
      num_total_bin_ += num_bin;
      bin_offsets_.emplace_back(num_total_bin_);
    }
    // ...then append the other group's features (mapper copy + bin clone)
    for (int i = 0; i < other->num_feature_; ++i) {
      const auto& other_bin_mapper = other->bin_mappers_[i];
      bin_mappers_.emplace_back(new BinMapper(*other_bin_mapper));
      auto num_bin = other_bin_mapper->num_bin();
      if (other_bin_mapper->GetMostFreqBin() == 0) {
        num_bin -= offset;
      }
      num_total_bin_ += num_bin;
      bin_offsets_.emplace_back(num_total_bin_);
      multi_bin_data_.emplace_back(other->multi_bin_data_[i]->Clone());
    }
    num_feature_ += other->num_feature_;
  }

zhangyafeikimi's avatar
zhangyafeikimi committed
308
  /*!
   * \brief Create an iterator over the stored bins of one sub-feature.
   * \param sub_feature Index of the sub-feature within this group
   * \return Newly created BinIterator (caller manages its lifetime)
   */
  inline BinIterator* SubFeatureIterator(int sub_feature) {
    const uint32_t most_freq_bin = bin_mappers_[sub_feature]->GetMostFreqBin();
    if (is_multi_val_) {
      // per-feature bins start at 1; one extra slot when most_freq_bin != 0
      const int addi = most_freq_bin == 0 ? 0 : 1;
      const uint32_t lo = 1;
      const uint32_t hi = bin_mappers_[sub_feature]->num_bin() - 1 + addi;
      return multi_bin_data_[sub_feature]->GetIterator(lo, hi, most_freq_bin);
    }
    // shared bin storage: the feature's range comes from the offset table
    const uint32_t lo = bin_offsets_[sub_feature];
    const uint32_t hi = bin_offsets_[sub_feature + 1] - 1;
    return bin_data_->GetIterator(lo, hi, most_freq_bin);
  }

  /*!
   * \brief Finalize bin data after all pushes are done.
   *        In multi-value mode the per-feature bins are finalized in parallel;
   *        the OMP_* macros capture and rethrow any exception after the loop.
   */
  inline void FinishLoad() {
    if (is_multi_val_) {
      OMP_INIT_EX();
#pragma omp parallel for schedule(guided)
      for (int i = 0; i < num_feature_; ++i) {
        OMP_LOOP_EX_BEGIN();
        multi_bin_data_[i]->FinishLoad();
        OMP_LOOP_EX_END();
      }
      OMP_THROW_EX();
    } else {
      bin_data_->FinishLoad();
    }
  }
337

338
  /*!
   * \brief Create an iterator over this group's whole shared bin range.
   * \return BinIterator over all bins, or nullptr for multi-value groups
   *         (which have no single shared bin storage)
   */
  inline BinIterator* FeatureGroupIterator() {
    if (is_multi_val_) {
      return nullptr;
    }
    const uint32_t first_bin = bin_offsets_[0];
    const uint32_t last_bin = bin_offsets_.back() - 1;
    const uint32_t most_freq_bin = 0;
    return bin_data_->GetIterator(first_bin, last_bin, most_freq_bin);
  }
Guolin Ke's avatar
Guolin Ke committed
347

348
349
350
351
352
353
354
355
356
357
358
  /*!
   * \brief Size in bytes of the group's shared bin data.
   * NOTE(review): unlike FeatureGroupData() below, this does not guard
   * against is_multi_val_; bin_data_ is only created for non-multi-value
   * groups (see CreateBinData), so callers must not invoke this on a
   * multi-value group.
   */
  inline size_t FeatureGroupSizesInByte() {
    return bin_data_->SizesInByte();
  }

  /*! \brief Raw pointer to the shared bin data, or nullptr for multi-value groups. */
  inline void* FeatureGroupData() {
    if (is_multi_val_) {
      return nullptr;
    }
    return bin_data_->get_data();
  }

359
360
361
362
363
  /*!
   * \brief Partition data indices by a split on one sub-feature.
   *        Numerical splits dereference a single threshold value; categorical
   *        splits forward the full threshold array with its length.
   * \param sub_feature Index of the sub-feature within this group
   * \param threshold Threshold value(s) for the split
   * \param num_threshold Number of threshold entries (categorical splits)
   * \param default_left True if default values go to the left (<=) side
   * \param data_indices Row indices to partition
   * \param cnt Number of row indices
   * \param lte_indices Output buffer for rows on the <= side
   * \param gt_indices Output buffer for rows on the > side
   * \return Number of rows written to lte_indices
   */
  inline data_size_t Split(int sub_feature, const uint32_t* threshold,
                           int num_threshold, bool default_left,
                           const data_size_t* data_indices, data_size_t cnt,
                           data_size_t* lte_indices,
                           data_size_t* gt_indices) const {
    uint32_t default_bin = bin_mappers_[sub_feature]->GetDefaultBin();
    uint32_t most_freq_bin = bin_mappers_[sub_feature]->GetMostFreqBin();
    if (!is_multi_val_) {
      uint32_t min_bin = bin_offsets_[sub_feature];
      uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
      if (bin_mappers_[sub_feature]->bin_type() == BinType::NumericalBin) {
        auto missing_type = bin_mappers_[sub_feature]->missing_type();
        // single-feature groups use the Bin overload without min_bin
        if (num_feature_ == 1) {
          return bin_data_->Split(max_bin, default_bin, most_freq_bin,
                                  missing_type, default_left, *threshold,
                                  data_indices, cnt, lte_indices, gt_indices);
        } else {
          return bin_data_->Split(min_bin, max_bin, default_bin, most_freq_bin,
                                  missing_type, default_left, *threshold,
                                  data_indices, cnt, lte_indices, gt_indices);
        }
      } else {
        if (num_feature_ == 1) {
          return bin_data_->SplitCategorical(max_bin, most_freq_bin, threshold,
                                             num_threshold, data_indices, cnt,
                                             lte_indices, gt_indices);
        } else {
          return bin_data_->SplitCategorical(
              min_bin, max_bin, most_freq_bin, threshold, num_threshold,
              data_indices, cnt, lte_indices, gt_indices);
        }
      }
    } else {
      // multi-value bins always start at 1; extra slot when most_freq_bin != 0
      int addi = bin_mappers_[sub_feature]->GetMostFreqBin() == 0 ? 0 : 1;
      uint32_t max_bin = bin_mappers_[sub_feature]->num_bin() - 1 + addi;
      if (bin_mappers_[sub_feature]->bin_type() == BinType::NumericalBin) {
        auto missing_type = bin_mappers_[sub_feature]->missing_type();
        return multi_bin_data_[sub_feature]->Split(
            max_bin, default_bin, most_freq_bin, missing_type, default_left,
            *threshold, data_indices, cnt, lte_indices, gt_indices);
      } else {
        return multi_bin_data_[sub_feature]->SplitCategorical(
            max_bin, most_freq_bin, threshold, num_threshold, data_indices, cnt,
            lte_indices, gt_indices);
      }
    }
  }
406

Guolin Ke's avatar
Guolin Ke committed
407
  /*!
   * \brief From bin to feature value
   * \param sub_feature_idx Index of the sub-feature within this group
   * \param bin Bin index to convert
   * \return Feature value represented by this bin
   */
  inline double BinToValue(int sub_feature_idx, uint32_t bin) const {
    return bin_mappers_[sub_feature_idx]->BinToValue(bin);
  }

  /*!
Guolin Ke's avatar
Guolin Ke committed
417
418
419
   * \brief Save binary data to file
   * \param file File want to write
   */
420
  void SaveBinaryToFile(const VirtualFileWriter* writer) const {
421
    writer->AlignedWrite(&is_multi_val_, sizeof(is_multi_val_));
422
    writer->AlignedWrite(&is_dense_multi_val_, sizeof(is_dense_multi_val_));
423
424
    writer->AlignedWrite(&is_sparse_, sizeof(is_sparse_));
    writer->AlignedWrite(&num_feature_, sizeof(num_feature_));
Guolin Ke's avatar
Guolin Ke committed
425
    for (int i = 0; i < num_feature_; ++i) {
426
      bin_mappers_[i]->SaveBinaryToFile(writer);
Guolin Ke's avatar
Guolin Ke committed
427
    }
428
429
430
431
432
433
434
    if (is_multi_val_) {
      for (int i = 0; i < num_feature_; ++i) {
        multi_bin_data_[i]->SaveBinaryToFile(writer);
      }
    } else {
      bin_data_->SaveBinaryToFile(writer);
    }
Guolin Ke's avatar
Guolin Ke committed
435
  }
436

Guolin Ke's avatar
Guolin Ke committed
437
  /*!
Guolin Ke's avatar
Guolin Ke committed
438
439
   * \brief Get sizes in byte of this object
   */
Guolin Ke's avatar
Guolin Ke committed
440
  size_t SizesInByte() const {
441
    size_t ret = VirtualFileWriter::AlignedSize(sizeof(is_multi_val_)) +
442
                 VirtualFileWriter::AlignedSize(sizeof(is_dense_multi_val_)) +
443
444
                 VirtualFileWriter::AlignedSize(sizeof(is_sparse_)) +
                 VirtualFileWriter::AlignedSize(sizeof(num_feature_));
Guolin Ke's avatar
Guolin Ke committed
445
446
447
    for (int i = 0; i < num_feature_; ++i) {
      ret += bin_mappers_[i]->SizesInByte();
    }
448
449
450
451
452
453
454
    if (!is_multi_val_) {
      ret += bin_data_->SizesInByte();
    } else {
      for (int i = 0; i < num_feature_; ++i) {
        ret += multi_bin_data_[i]->SizesInByte();
      }
    }
Guolin Ke's avatar
Guolin Ke committed
455
456
    return ret;
  }
457

Guolin Ke's avatar
Guolin Ke committed
458
459
  /*! \brief Disable copy */
  FeatureGroup& operator=(const FeatureGroup&) = delete;

  /*!
   * \brief Deep copy: clones mappers, offsets, and bin contents.
   * \param other Group to copy
   * \param should_handle_dense_mv If true, fix up offsets when a dense
   *        multi-value group moves out of position 0 (see below)
   * \param group_id New index of the copied group
   */
  FeatureGroup(const FeatureGroup& other, bool should_handle_dense_mv,
    int group_id) {
    num_feature_ = other.num_feature_;
    is_multi_val_ = other.is_multi_val_;
    is_dense_multi_val_ = other.is_dense_multi_val_;
    is_sparse_ = other.is_sparse_;
    num_total_bin_ = other.num_total_bin_;
    bin_offsets_ = other.bin_offsets_;

    bin_mappers_.reserve(other.bin_mappers_.size());
    for (auto& bin_mapper : other.bin_mappers_) {
      bin_mappers_.emplace_back(new BinMapper(*bin_mapper));
    }
    if (!is_multi_val_) {
      bin_data_.reset(other.bin_data_->Clone());
    } else {
      multi_bin_data_.clear();
      for (int i = 0; i < num_feature_; ++i) {
        multi_bin_data_.emplace_back(other.multi_bin_data_[i]->Clone());
      }
    }

    if (should_handle_dense_mv && is_dense_multi_val_ && group_id > 0) {
      // this feature group was the first feature group, but now no longer is,
      // so we need to eliminate its special empty bin for multi val dense bin
      if (bin_mappers_[0]->GetMostFreqBin() > 0 && bin_offsets_[0] == 1) {
        for (size_t i = 0; i < bin_offsets_.size(); ++i) {
          bin_offsets_[i] -= 1;
        }
        num_total_bin_ -= 1;
      }
    }
  }
Guolin Ke's avatar
Guolin Ke committed
495

496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
  /*!
   * \brief Get raw column-wise data with multiple iterators.
   * \param sub_feature_index >= 0 selects one feature of a multi-value group;
   *        a negative value selects the group's shared bin data
   * \param bit_type Output: bit width of the stored data
   * \param is_sparse Output: whether the storage is sparse
   * \param bin_iterator Output: iterators over the data
   * \param num_threads Number of threads the iterators are created for
   */
  const void* GetColWiseData(const int sub_feature_index,
    uint8_t* bit_type,
    bool* is_sparse,
    std::vector<BinIterator*>* bin_iterator,
    const int num_threads) const {
    if (sub_feature_index < 0) {
      CHECK(!is_multi_val_);
      return bin_data_->GetColWiseData(bit_type, is_sparse, bin_iterator, num_threads);
    }
    CHECK(is_multi_val_);
    return multi_bin_data_[sub_feature_index]->GetColWiseData(bit_type, is_sparse, bin_iterator, num_threads);
  }

  /*!
   * \brief Get raw column-wise data with a single iterator.
   *        Same dispatch rule as the multi-iterator overload above.
   */
  const void* GetColWiseData(const int sub_feature_index,
    uint8_t* bit_type,
    bool* is_sparse,
    BinIterator** bin_iterator) const {
    if (sub_feature_index < 0) {
      CHECK(!is_multi_val_);
      return bin_data_->GetColWiseData(bit_type, is_sparse, bin_iterator);
    }
    CHECK(is_multi_val_);
    return multi_bin_data_[sub_feature_index]->GetColWiseData(bit_type, is_sparse, bin_iterator);
  }

  uint32_t feature_max_bin(const int sub_feature_index) {
    if (!is_multi_val_) {
      return bin_offsets_[sub_feature_index + 1] - 1;
    } else {
      int addi = bin_mappers_[sub_feature_index]->GetMostFreqBin() == 0 ? 0 : 1;
      return bin_mappers_[sub_feature_index]->num_bin() - 1 + addi;
    }
  }

  uint32_t feature_min_bin(const int sub_feature_index) {
    if (!is_multi_val_) {
      return bin_offsets_[sub_feature_index];
    } else {
      return 1;
    }
  }

Nikita Titov's avatar
Nikita Titov committed
540
 private:
Guolin Ke's avatar
Guolin Ke committed
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
  /*!
   * \brief Allocate the underlying Bin storage and set is_multi_val_ /
   *        is_sparse_ accordingly.
   * \param num_data Number of rows to allocate
   * \param is_multi_val True to create one bin per feature
   * \param force_dense Force dense storage in single-bin mode
   * \param force_sparse Force sparse storage in single-bin mode
   */
  void CreateBinData(int num_data, bool is_multi_val, bool force_dense, bool force_sparse) {
    if (is_multi_val) {
      multi_bin_data_.clear();
      for (int i = 0; i < num_feature_; ++i) {
        // extra bin slot when most_freq_bin != 0 (bin 0 is not shared then)
        int addi = bin_mappers_[i]->GetMostFreqBin() == 0 ? 0 : 1;
        if (bin_mappers_[i]->sparse_rate() >= kSparseThreshold) {
          multi_bin_data_.emplace_back(Bin::CreateSparseBin(
              num_data, bin_mappers_[i]->num_bin() + addi));
        } else {
          multi_bin_data_.emplace_back(
              Bin::CreateDenseBin(num_data, bin_mappers_[i]->num_bin() + addi));
        }
      }
      is_multi_val_ = true;
    } else {
      // single-bin mode: sparse only when forced, or for a lone sufficiently
      // sparse feature (and dense was not forced)
      if (force_sparse ||
          (!force_dense && num_feature_ == 1 &&
           bin_mappers_[0]->sparse_rate() >= kSparseThreshold)) {
        is_sparse_ = true;
        bin_data_.reset(Bin::CreateSparseBin(num_data, num_total_bin_));
      } else {
        is_sparse_ = false;
        bin_data_.reset(Bin::CreateDenseBin(num_data, num_total_bin_));
      }
      is_multi_val_ = false;
    }
  }

Guolin Ke's avatar
Guolin Ke committed
569
570
571
572
573
574
575
576
  /*! \brief Number of features */
  int num_feature_;
  /*! \brief Bin mapper for sub features */
  std::vector<std::unique_ptr<BinMapper>> bin_mappers_;
  /*! \brief Bin offsets for sub features */
  std::vector<uint32_t> bin_offsets_;
  /*! \brief Bin data of this feature */
  std::unique_ptr<Bin> bin_data_;
  /*! \brief Per-feature bin data, used only in multi-value mode */
  std::vector<std::unique_ptr<Bin>> multi_bin_data_;
  /*! \brief True if each sub-feature keeps its own bin storage (multi-value mode) */
  bool is_multi_val_;
  /*! \brief True if multi-value bins are stored densely */
  bool is_dense_multi_val_;
  /*! \brief True if this feature is sparse */
  bool is_sparse_;
  /*! \brief Total number of bins across all sub-features (including the reserved offset bin) */
  int num_total_bin_;
};

}  // namespace LightGBM

Guolin Ke's avatar
Guolin Ke committed
587
#endif  // LIGHTGBM_FEATURE_GROUP_H_