feature_group.h 15 KB
Newer Older
1
2
/*!
 * Copyright (c) 2017 Microsoft Corporation. All rights reserved.
Guolin Ke's avatar
Guolin Ke committed
3
4
 * Licensed under the MIT License. See LICENSE file in the project root for
 * license information.
5
 */
Guolin Ke's avatar
Guolin Ke committed
6
7
8
#ifndef LIGHTGBM_FEATURE_GROUP_H_
#define LIGHTGBM_FEATURE_GROUP_H_

9
10
11
12
#include <LightGBM/bin.h>
#include <LightGBM/meta.h>
#include <LightGBM/utils/random.h>

13
14
15
16
#include <cstdio>
#include <memory>
#include <vector>

Guolin Ke's avatar
Guolin Ke committed
17
18
19
20
namespace LightGBM {

class Dataset;
class DatasetLoader;
Guolin Ke's avatar
Guolin Ke committed
21
22
/*! \brief Using to store data and providing some operations on one feature
 * group*/
Guolin Ke's avatar
Guolin Ke committed
23
class FeatureGroup {
Nikita Titov's avatar
Nikita Titov committed
24
 public:
Guolin Ke's avatar
Guolin Ke committed
25
26
27
28
29
30
31
32
33
  friend Dataset;
  friend DatasetLoader;
  /*!
  * \brief Constructor
  * \param num_feature number of features of this group
  * \param bin_mappers Bin mapper for features
  * \param num_data Total number of data
  * \param is_enable_sparse True if enable sparse feature
  */
Guolin Ke's avatar
Guolin Ke committed
34
  FeatureGroup(int num_feature, int8_t is_multi_val,
Guolin Ke's avatar
Guolin Ke committed
35
    std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
Guolin Ke's avatar
Guolin Ke committed
36
    data_size_t num_data) : num_feature_(num_feature), is_multi_val_(is_multi_val > 0), is_sparse_(false) {
Nikita Titov's avatar
Nikita Titov committed
37
    CHECK_EQ(static_cast<int>(bin_mappers->size()), num_feature);
Guolin Ke's avatar
Guolin Ke committed
38
    // use bin at zero to store most_freq_bin
Guolin Ke's avatar
Guolin Ke committed
39
40
    num_total_bin_ = 1;
    bin_offsets_.emplace_back(num_total_bin_);
Guolin Ke's avatar
Guolin Ke committed
41
    auto& ref_bin_mappers = *bin_mappers;
Guolin Ke's avatar
Guolin Ke committed
42
    for (int i = 0; i < num_feature_; ++i) {
Guolin Ke's avatar
Guolin Ke committed
43
      bin_mappers_.emplace_back(ref_bin_mappers[i].release());
Guolin Ke's avatar
Guolin Ke committed
44
      auto num_bin = bin_mappers_[i]->num_bin();
Guolin Ke's avatar
Guolin Ke committed
45
      if (bin_mappers_[i]->GetMostFreqBin() == 0) {
Guolin Ke's avatar
Guolin Ke committed
46
47
48
49
50
        num_bin -= 1;
      }
      num_total_bin_ += num_bin;
      bin_offsets_.emplace_back(num_total_bin_);
    }
Guolin Ke's avatar
Guolin Ke committed
51
52
53
54
55
56
57
58
59
60
61
62
63
    CreateBinData(num_data, is_multi_val_, true, false);
  }

  FeatureGroup(const FeatureGroup& other, int num_data) {
    num_feature_ = other.num_feature_;
    is_multi_val_ = other.is_multi_val_;
    is_sparse_ = other.is_sparse_;
    num_total_bin_ = other.num_total_bin_;
    bin_offsets_ = other.bin_offsets_;

    bin_mappers_.reserve(other.bin_mappers_.size());
    for (auto& bin_mapper : other.bin_mappers_) {
      bin_mappers_.emplace_back(new BinMapper(*bin_mapper));
64
    }
Guolin Ke's avatar
Guolin Ke committed
65
    CreateBinData(num_data, is_multi_val_, !is_sparse_, is_sparse_);
Guolin Ke's avatar
Guolin Ke committed
66
  }
Guolin Ke's avatar
Guolin Ke committed
67

68
69
  FeatureGroup(std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
    data_size_t num_data) : num_feature_(1), is_multi_val_(false) {
Nikita Titov's avatar
Nikita Titov committed
70
    CHECK_EQ(static_cast<int>(bin_mappers->size()), 1);
71
    // use bin at zero to store default_bin
Guolin Ke's avatar
Guolin Ke committed
72
73
    num_total_bin_ = 1;
    bin_offsets_.emplace_back(num_total_bin_);
Guolin Ke's avatar
Guolin Ke committed
74
    auto& ref_bin_mappers = *bin_mappers;
Guolin Ke's avatar
Guolin Ke committed
75
    for (int i = 0; i < num_feature_; ++i) {
Guolin Ke's avatar
Guolin Ke committed
76
      bin_mappers_.emplace_back(ref_bin_mappers[i].release());
Guolin Ke's avatar
Guolin Ke committed
77
      auto num_bin = bin_mappers_[i]->num_bin();
Guolin Ke's avatar
Guolin Ke committed
78
      if (bin_mappers_[i]->GetMostFreqBin() == 0) {
Guolin Ke's avatar
Guolin Ke committed
79
80
81
82
83
        num_bin -= 1;
      }
      num_total_bin_ += num_bin;
      bin_offsets_.emplace_back(num_total_bin_);
    }
Guolin Ke's avatar
Guolin Ke committed
84
    CreateBinData(num_data, false, false, false);
Guolin Ke's avatar
Guolin Ke committed
85
  }
86

Guolin Ke's avatar
Guolin Ke committed
87
  /*!
Guolin Ke's avatar
Guolin Ke committed
88
89
90
91
92
   * \brief Constructor from memory
   * \param memory Pointer of memory
   * \param num_all_data Number of global data
   * \param local_used_indices Local used indices, empty means using all data
   */
Guolin Ke's avatar
Guolin Ke committed
93
  FeatureGroup(const void* memory, data_size_t num_all_data,
Guolin Ke's avatar
Guolin Ke committed
94
               const std::vector<data_size_t>& local_used_indices) {
Guolin Ke's avatar
Guolin Ke committed
95
96
    const char* memory_ptr = reinterpret_cast<const char*>(memory);
    // get is_sparse
97
    is_multi_val_ = *(reinterpret_cast<const bool*>(memory_ptr));
98
    memory_ptr += VirtualFileWriter::AlignedSize(sizeof(is_multi_val_));
Guolin Ke's avatar
Guolin Ke committed
99
    is_sparse_ = *(reinterpret_cast<const bool*>(memory_ptr));
100
    memory_ptr += VirtualFileWriter::AlignedSize(sizeof(is_sparse_));
Guolin Ke's avatar
Guolin Ke committed
101
    num_feature_ = *(reinterpret_cast<const int*>(memory_ptr));
102
    memory_ptr += VirtualFileWriter::AlignedSize(sizeof(num_feature_));
Guolin Ke's avatar
Guolin Ke committed
103
104
105
106
107
108
109
110
111
    // get bin mapper
    bin_mappers_.clear();
    bin_offsets_.clear();
    // start from 1, due to need to store zero bin in this slot
    num_total_bin_ = 1;
    bin_offsets_.emplace_back(num_total_bin_);
    for (int i = 0; i < num_feature_; ++i) {
      bin_mappers_.emplace_back(new BinMapper(memory_ptr));
      auto num_bin = bin_mappers_[i]->num_bin();
Guolin Ke's avatar
Guolin Ke committed
112
      if (bin_mappers_[i]->GetMostFreqBin() == 0) {
Guolin Ke's avatar
Guolin Ke committed
113
114
115
116
117
118
119
120
121
122
        num_bin -= 1;
      }
      num_total_bin_ += num_bin;
      bin_offsets_.emplace_back(num_total_bin_);
      memory_ptr += bin_mappers_[i]->SizesInByte();
    }
    data_size_t num_data = num_all_data;
    if (!local_used_indices.empty()) {
      num_data = static_cast<data_size_t>(local_used_indices.size());
    }
123
124
125
126
    if (is_multi_val_) {
      for (int i = 0; i < num_feature_; ++i) {
        int addi = bin_mappers_[i]->GetMostFreqBin() == 0 ? 0 : 1;
        if (bin_mappers_[i]->sparse_rate() >= kSparseThreshold) {
Guolin Ke's avatar
Guolin Ke committed
127
128
          multi_bin_data_.emplace_back(Bin::CreateSparseBin(
              num_data, bin_mappers_[i]->num_bin() + addi));
129
        } else {
Guolin Ke's avatar
Guolin Ke committed
130
131
          multi_bin_data_.emplace_back(
              Bin::CreateDenseBin(num_data, bin_mappers_[i]->num_bin() + addi));
132
133
134
135
        }
        multi_bin_data_.back()->LoadFromMemory(memory_ptr, local_used_indices);
        memory_ptr += multi_bin_data_.back()->SizesInByte();
      }
Guolin Ke's avatar
Guolin Ke committed
136
    } else {
137
138
139
140
141
142
143
      if (is_sparse_) {
        bin_data_.reset(Bin::CreateSparseBin(num_data, num_total_bin_));
      } else {
        bin_data_.reset(Bin::CreateDenseBin(num_data, num_total_bin_));
      }
      // get bin data
      bin_data_->LoadFromMemory(memory_ptr, local_used_indices);
Guolin Ke's avatar
Guolin Ke committed
144
145
    }
  }
146

Guolin Ke's avatar
Guolin Ke committed
147
  /*! \brief Destructor */
Guolin Ke's avatar
Guolin Ke committed
148
  ~FeatureGroup() {}
Guolin Ke's avatar
Guolin Ke committed
149
150

  /*!
Guolin Ke's avatar
Guolin Ke committed
151
152
153
154
155
156
157
   * \brief Push one record, will auto convert to bin and push to bin data
   * \param tid Thread id
   * \param idx Index of record
   * \param value feature value of record
   */
  inline void PushData(int tid, int sub_feature_idx, data_size_t line_idx,
                       double value) {
Guolin Ke's avatar
Guolin Ke committed
158
    uint32_t bin = bin_mappers_[sub_feature_idx]->ValueToBin(value);
Guolin Ke's avatar
Guolin Ke committed
159
160
161
    if (bin == bin_mappers_[sub_feature_idx]->GetMostFreqBin()) {
      return;
    }
Guolin Ke's avatar
Guolin Ke committed
162
    if (bin_mappers_[sub_feature_idx]->GetMostFreqBin() == 0) {
Guolin Ke's avatar
Guolin Ke committed
163
164
      bin -= 1;
    }
165
166
167
168
169
170
    if (is_multi_val_) {
      multi_bin_data_[sub_feature_idx]->Push(tid, line_idx, bin + 1);
    } else {
      bin += bin_offsets_[sub_feature_idx];
      bin_data_->Push(tid, line_idx, bin);
    }
Guolin Ke's avatar
Guolin Ke committed
171
172
  }

Guolin Ke's avatar
Guolin Ke committed
173
174
175
176
177
178
179
180
181
182
  void ReSize(int num_data) {
    if (!is_multi_val_) {
      bin_data_->ReSize(num_data);
    } else {
      for (int i = 0; i < num_feature_; ++i) {
        multi_bin_data_[i]->ReSize(num_data);
      }
    }
  }

183
  inline void CopySubrow(const FeatureGroup* full_feature, const data_size_t* used_indices, data_size_t num_used_indices) {
184
    if (!is_multi_val_) {
185
      bin_data_->CopySubrow(full_feature->bin_data_.get(), used_indices, num_used_indices);
186
187
    } else {
      for (int i = 0; i < num_feature_; ++i) {
188
        multi_bin_data_[i]->CopySubrow(full_feature->multi_bin_data_[i].get(), used_indices, num_used_indices);
189
190
      }
    }
Guolin Ke's avatar
Guolin Ke committed
191
192
  }

Guolin Ke's avatar
Guolin Ke committed
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
  void AddFeaturesFrom(const FeatureGroup* other) {
    CHECK(is_multi_val_);
    CHECK(other->is_multi_val_);
    for (int i = 0; i < other->num_feature_; ++i) {
      const auto& other_bin_mapper = other->bin_mappers_[i];
      bin_mappers_.emplace_back(new BinMapper(*other_bin_mapper));
      auto num_bin = other_bin_mapper->num_bin();
      if (other_bin_mapper->GetMostFreqBin() == 0) {
        num_bin -= 1;
      }
      num_total_bin_ += num_bin;
      bin_offsets_.emplace_back(num_total_bin_);
      multi_bin_data_.emplace_back(other->multi_bin_data_[i]->Clone());
    }
    num_feature_ += other->num_feature_;
  }

zhangyafeikimi's avatar
zhangyafeikimi committed
210
  inline BinIterator* SubFeatureIterator(int sub_feature) {
Guolin Ke's avatar
Guolin Ke committed
211
    uint32_t most_freq_bin = bin_mappers_[sub_feature]->GetMostFreqBin();
212
213
214
215
216
217
218
219
    if (!is_multi_val_) {
      uint32_t min_bin = bin_offsets_[sub_feature];
      uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
      return bin_data_->GetIterator(min_bin, max_bin, most_freq_bin);
    } else {
      int addi = bin_mappers_[sub_feature]->GetMostFreqBin() == 0 ? 0 : 1;
      uint32_t min_bin = 1;
      uint32_t max_bin = bin_mappers_[sub_feature]->num_bin() - 1 + addi;
Guolin Ke's avatar
Guolin Ke committed
220
221
      return multi_bin_data_[sub_feature]->GetIterator(min_bin, max_bin,
                                                       most_freq_bin);
222
223
224
225
226
227
    }
  }

  inline void FinishLoad() {
    if (is_multi_val_) {
      OMP_INIT_EX();
Guolin Ke's avatar
Guolin Ke committed
228
#pragma omp parallel for schedule(guided)
229
230
231
232
233
234
235
236
237
      for (int i = 0; i < num_feature_; ++i) {
        OMP_LOOP_EX_BEGIN();
        multi_bin_data_[i]->FinishLoad();
        OMP_LOOP_EX_END();
      }
      OMP_THROW_EX();
    } else {
      bin_data_->FinishLoad();
    }
Guolin Ke's avatar
Guolin Ke committed
238
  }
239

240
  inline BinIterator* FeatureGroupIterator() {
241
242
243
    if (is_multi_val_) {
      return nullptr;
    }
244
245
    uint32_t min_bin = bin_offsets_[0];
    uint32_t max_bin = bin_offsets_.back() - 1;
Guolin Ke's avatar
Guolin Ke committed
246
247
    uint32_t most_freq_bin = 0;
    return bin_data_->GetIterator(min_bin, max_bin, most_freq_bin);
248
  }
Guolin Ke's avatar
Guolin Ke committed
249

250
251
252
253
254
255
256
257
258
259
260
  inline size_t FeatureGroupSizesInByte() {
    return bin_data_->SizesInByte();
  }

  inline void* FeatureGroupData() {
    if (is_multi_val_) {
      return nullptr;
    }
    return bin_data_->get_data();
  }

261
262
263
264
265
  inline data_size_t Split(int sub_feature, const uint32_t* threshold,
                           int num_threshold, bool default_left,
                           const data_size_t* data_indices, data_size_t cnt,
                           data_size_t* lte_indices,
                           data_size_t* gt_indices) const {
Guolin Ke's avatar
Guolin Ke committed
266
    uint32_t default_bin = bin_mappers_[sub_feature]->GetDefaultBin();
Guolin Ke's avatar
Guolin Ke committed
267
    uint32_t most_freq_bin = bin_mappers_[sub_feature]->GetMostFreqBin();
268
269
270
271
272
    if (!is_multi_val_) {
      uint32_t min_bin = bin_offsets_[sub_feature];
      uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
      if (bin_mappers_[sub_feature]->bin_type() == BinType::NumericalBin) {
        auto missing_type = bin_mappers_[sub_feature]->missing_type();
273
274
275
276
277
278
279
280
281
        if (num_feature_ == 1) {
          return bin_data_->Split(max_bin, default_bin, most_freq_bin,
                                  missing_type, default_left, *threshold,
                                  data_indices, cnt, lte_indices, gt_indices);
        } else {
          return bin_data_->Split(min_bin, max_bin, default_bin, most_freq_bin,
                                  missing_type, default_left, *threshold,
                                  data_indices, cnt, lte_indices, gt_indices);
        }
282
      } else {
283
284
285
286
287
288
289
290
291
        if (num_feature_ == 1) {
          return bin_data_->SplitCategorical(max_bin, most_freq_bin, threshold,
                                             num_threshold, data_indices, cnt,
                                             lte_indices, gt_indices);
        } else {
          return bin_data_->SplitCategorical(
              min_bin, max_bin, most_freq_bin, threshold, num_threshold,
              data_indices, cnt, lte_indices, gt_indices);
        }
292
      }
293
    } else {
294
295
296
297
      int addi = bin_mappers_[sub_feature]->GetMostFreqBin() == 0 ? 0 : 1;
      uint32_t max_bin = bin_mappers_[sub_feature]->num_bin() - 1 + addi;
      if (bin_mappers_[sub_feature]->bin_type() == BinType::NumericalBin) {
        auto missing_type = bin_mappers_[sub_feature]->missing_type();
298
299
300
        return multi_bin_data_[sub_feature]->Split(
            max_bin, default_bin, most_freq_bin, missing_type, default_left,
            *threshold, data_indices, cnt, lte_indices, gt_indices);
301
      } else {
302
303
304
        return multi_bin_data_[sub_feature]->SplitCategorical(
            max_bin, most_freq_bin, threshold, num_threshold, data_indices, cnt,
            lte_indices, gt_indices);
305
      }
306
    }
Guolin Ke's avatar
Guolin Ke committed
307
  }
308

Guolin Ke's avatar
Guolin Ke committed
309
  /*!
Guolin Ke's avatar
Guolin Ke committed
310
311
312
313
   * \brief From bin to feature value
   * \param bin
   * \return FeatureGroup value of this bin
   */
Guolin Ke's avatar
Guolin Ke committed
314
315
316
317
318
  inline double BinToValue(int sub_feature_idx, uint32_t bin) const {
    return bin_mappers_[sub_feature_idx]->BinToValue(bin);
  }

  /*!
Guolin Ke's avatar
Guolin Ke committed
319
320
321
   * \brief Save binary data to file
   * \param file File want to write
   */
322
  void SaveBinaryToFile(const VirtualFileWriter* writer) const {
323
324
325
    writer->AlignedWrite(&is_multi_val_, sizeof(is_multi_val_));
    writer->AlignedWrite(&is_sparse_, sizeof(is_sparse_));
    writer->AlignedWrite(&num_feature_, sizeof(num_feature_));
Guolin Ke's avatar
Guolin Ke committed
326
    for (int i = 0; i < num_feature_; ++i) {
327
      bin_mappers_[i]->SaveBinaryToFile(writer);
Guolin Ke's avatar
Guolin Ke committed
328
    }
329
330
331
332
333
334
335
    if (is_multi_val_) {
      for (int i = 0; i < num_feature_; ++i) {
        multi_bin_data_[i]->SaveBinaryToFile(writer);
      }
    } else {
      bin_data_->SaveBinaryToFile(writer);
    }
Guolin Ke's avatar
Guolin Ke committed
336
  }
337

Guolin Ke's avatar
Guolin Ke committed
338
  /*!
Guolin Ke's avatar
Guolin Ke committed
339
340
   * \brief Get sizes in byte of this object
   */
Guolin Ke's avatar
Guolin Ke committed
341
  size_t SizesInByte() const {
342
343
344
    size_t ret = VirtualFileWriter::AlignedSize(sizeof(is_multi_val_)) +
                 VirtualFileWriter::AlignedSize(sizeof(is_sparse_)) +
                 VirtualFileWriter::AlignedSize(sizeof(num_feature_));
Guolin Ke's avatar
Guolin Ke committed
345
346
347
    for (int i = 0; i < num_feature_; ++i) {
      ret += bin_mappers_[i]->SizesInByte();
    }
348
349
350
351
352
353
354
    if (!is_multi_val_) {
      ret += bin_data_->SizesInByte();
    } else {
      for (int i = 0; i < num_feature_; ++i) {
        ret += multi_bin_data_[i]->SizesInByte();
      }
    }
Guolin Ke's avatar
Guolin Ke committed
355
356
    return ret;
  }
357

Guolin Ke's avatar
Guolin Ke committed
358
359
  /*! \brief Disable copy */
  FeatureGroup& operator=(const FeatureGroup&) = delete;
360

361
  /*! \brief Deep copy */
362
  FeatureGroup(const FeatureGroup& other) {
363
    num_feature_ = other.num_feature_;
364
    is_multi_val_ = other.is_multi_val_;
365
366
367
368
369
    is_sparse_ = other.is_sparse_;
    num_total_bin_ = other.num_total_bin_;
    bin_offsets_ = other.bin_offsets_;

    bin_mappers_.reserve(other.bin_mappers_.size());
370
    for (auto& bin_mapper : other.bin_mappers_) {
371
372
      bin_mappers_.emplace_back(new BinMapper(*bin_mapper));
    }
373
374
375
376
377
378
379
380
    if (!is_multi_val_) {
      bin_data_.reset(other.bin_data_->Clone());
    } else {
      multi_bin_data_.clear();
      for (int i = 0; i < num_feature_; ++i) {
        multi_bin_data_.emplace_back(other.multi_bin_data_[i]->Clone());
      }
    }
381
  }
Guolin Ke's avatar
Guolin Ke committed
382

Nikita Titov's avatar
Nikita Titov committed
383
 private:
Guolin Ke's avatar
Guolin Ke committed
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
  void CreateBinData(int num_data, bool is_multi_val, bool force_dense, bool force_sparse) {
    if (is_multi_val) {
      multi_bin_data_.clear();
      for (int i = 0; i < num_feature_; ++i) {
        int addi = bin_mappers_[i]->GetMostFreqBin() == 0 ? 0 : 1;
        if (bin_mappers_[i]->sparse_rate() >= kSparseThreshold) {
          multi_bin_data_.emplace_back(Bin::CreateSparseBin(
              num_data, bin_mappers_[i]->num_bin() + addi));
        } else {
          multi_bin_data_.emplace_back(
              Bin::CreateDenseBin(num_data, bin_mappers_[i]->num_bin() + addi));
        }
      }
      is_multi_val_ = true;
    } else {
Guolin Ke's avatar
Guolin Ke committed
399
400
401
      if (force_sparse ||
          (!force_dense && num_feature_ == 1 &&
           bin_mappers_[0]->sparse_rate() >= kSparseThreshold)) {
Guolin Ke's avatar
Guolin Ke committed
402
403
404
405
406
407
408
409
410
411
        is_sparse_ = true;
        bin_data_.reset(Bin::CreateSparseBin(num_data, num_total_bin_));
      } else {
        is_sparse_ = false;
        bin_data_.reset(Bin::CreateDenseBin(num_data, num_total_bin_));
      }
      is_multi_val_ = false;
    }
  }

Guolin Ke's avatar
Guolin Ke committed
412
413
414
415
416
417
418
419
  /*! \brief Number of features */
  int num_feature_;
  /*! \brief Bin mapper for sub features */
  std::vector<std::unique_ptr<BinMapper>> bin_mappers_;
  /*! \brief Bin offsets for sub features */
  std::vector<uint32_t> bin_offsets_;
  /*! \brief Bin data of this feature */
  std::unique_ptr<Bin> bin_data_;
420
  std::vector<std::unique_ptr<Bin>> multi_bin_data_;
Guolin Ke's avatar
Guolin Ke committed
421
  /*! \brief True if this feature is sparse */
422
  bool is_multi_val_;
Guolin Ke's avatar
Guolin Ke committed
423
424
425
426
427
428
  bool is_sparse_;
  int num_total_bin_;
};

}  // namespace LightGBM

Guolin Ke's avatar
Guolin Ke committed
429
#endif  // LIGHTGBM_FEATURE_GROUP_H_