feature_group.h 12.3 KB
Newer Older
1
2
3
4
/*!
 * Copyright (c) 2017 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */
Guolin Ke's avatar
Guolin Ke committed
5
6
7
8
#ifndef LIGHTGBM_FEATURE_GROUP_H_
#define LIGHTGBM_FEATURE_GROUP_H_

#include <LightGBM/bin.h>
9
10
#include <LightGBM/meta.h>
#include <LightGBM/utils/random.h>
Guolin Ke's avatar
Guolin Ke committed
11
12
13
14
15
16
17
18
19
20
21

#include <cstdio>
#include <memory>
#include <vector>

namespace LightGBM {

class Dataset;
class DatasetLoader;
/*! \brief Using to store data and providing some operations on one feature group*/
class FeatureGroup {
Nikita Titov's avatar
Nikita Titov committed
22
 public:
Guolin Ke's avatar
Guolin Ke committed
23
24
25
26
27
28
29
30
  friend Dataset;
  friend DatasetLoader;
  /*!
  * \brief Constructor
  * \param num_feature number of features of this group
  * \param bin_mappers Bin mapper for features
  * \param num_data Total number of data
  * \param is_enable_sparse True if enable sparse feature
31
  * \param sparse_threshold Threshold for treating a feature as a sparse feature
Guolin Ke's avatar
Guolin Ke committed
32
  */
33
  FeatureGroup(int num_feature, bool is_multi_val,
Guolin Ke's avatar
Guolin Ke committed
34
    std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
35
    data_size_t num_data) : num_feature_(num_feature), is_multi_val_(is_multi_val), is_sparse_(false) {
Guolin Ke's avatar
Guolin Ke committed
36
    CHECK(static_cast<int>(bin_mappers->size()) == num_feature);
Guolin Ke's avatar
Guolin Ke committed
37
    // use bin at zero to store most_freq_bin
Guolin Ke's avatar
Guolin Ke committed
38
39
40
    num_total_bin_ = 1;
    bin_offsets_.emplace_back(num_total_bin_);
    for (int i = 0; i < num_feature_; ++i) {
Guolin Ke's avatar
Guolin Ke committed
41
      bin_mappers_.emplace_back(bin_mappers->at(i).release());
Guolin Ke's avatar
Guolin Ke committed
42
      auto num_bin = bin_mappers_[i]->num_bin();
Guolin Ke's avatar
Guolin Ke committed
43
      if (bin_mappers_[i]->GetMostFreqBin() == 0) {
Guolin Ke's avatar
Guolin Ke committed
44
45
46
47
48
        num_bin -= 1;
      }
      num_total_bin_ += num_bin;
      bin_offsets_.emplace_back(num_total_bin_);
    }
49
50
51
52
53
54
55
56
57
58
59
60
61
    if (is_multi_val_) {
      multi_bin_data_.clear();
      for (int i = 0; i < num_feature_; ++i) {
        int addi = bin_mappers_[i]->GetMostFreqBin() == 0 ? 0 : 1;
        if (bin_mappers_[i]->sparse_rate() >= kSparseThreshold) {
          multi_bin_data_.emplace_back(Bin::CreateSparseBin(num_data, bin_mappers_[i]->num_bin() + addi));
        } else {
          multi_bin_data_.emplace_back(Bin::CreateDenseBin(num_data, bin_mappers_[i]->num_bin() + addi));
        }
      }
    } else {
      bin_data_.reset(Bin::CreateDenseBin(num_data, num_total_bin_));
    }
Guolin Ke's avatar
Guolin Ke committed
62
  }
Guolin Ke's avatar
Guolin Ke committed
63

64
65
66
67
  FeatureGroup(std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
    data_size_t num_data) : num_feature_(1), is_multi_val_(false) {
    CHECK(static_cast<int>(bin_mappers->size()) == 1);
    // use bin at zero to store default_bin
Guolin Ke's avatar
Guolin Ke committed
68
69
70
    num_total_bin_ = 1;
    bin_offsets_.emplace_back(num_total_bin_);
    for (int i = 0; i < num_feature_; ++i) {
Guolin Ke's avatar
Guolin Ke committed
71
      bin_mappers_.emplace_back(bin_mappers->at(i).release());
Guolin Ke's avatar
Guolin Ke committed
72
      auto num_bin = bin_mappers_[i]->num_bin();
Guolin Ke's avatar
Guolin Ke committed
73
      if (bin_mappers_[i]->GetMostFreqBin() == 0) {
Guolin Ke's avatar
Guolin Ke committed
74
75
76
77
78
        num_bin -= 1;
      }
      num_total_bin_ += num_bin;
      bin_offsets_.emplace_back(num_total_bin_);
    }
79
80
    if (bin_mappers_[0]->sparse_rate() >=  kSparseThreshold) {
      is_sparse_ = true;
Guolin Ke's avatar
Guolin Ke committed
81
82
      bin_data_.reset(Bin::CreateSparseBin(num_data, num_total_bin_));
    } else {
83
      is_sparse_ = false;
Guolin Ke's avatar
Guolin Ke committed
84
85
86
      bin_data_.reset(Bin::CreateDenseBin(num_data, num_total_bin_));
    }
  }
87

Guolin Ke's avatar
Guolin Ke committed
88
89
90
91
92
93
94
95
96
97
  /*!
  * \brief Constructor from memory
  * \param memory Pointer of memory
  * \param num_all_data Number of global data
  * \param local_used_indices Local used indices, empty means using all data
  */
  FeatureGroup(const void* memory, data_size_t num_all_data,
    const std::vector<data_size_t>& local_used_indices) {
    const char* memory_ptr = reinterpret_cast<const char*>(memory);
    // get is_sparse
98
99
    is_multi_val_ = *(reinterpret_cast<const bool*>(memory_ptr));
    memory_ptr += sizeof(is_multi_val_);
Guolin Ke's avatar
Guolin Ke committed
100
101
102
103
104
105
106
107
108
109
110
111
112
    is_sparse_ = *(reinterpret_cast<const bool*>(memory_ptr));
    memory_ptr += sizeof(is_sparse_);
    num_feature_ = *(reinterpret_cast<const int*>(memory_ptr));
    memory_ptr += sizeof(num_feature_);
    // get bin mapper
    bin_mappers_.clear();
    bin_offsets_.clear();
    // start from 1, due to need to store zero bin in this slot
    num_total_bin_ = 1;
    bin_offsets_.emplace_back(num_total_bin_);
    for (int i = 0; i < num_feature_; ++i) {
      bin_mappers_.emplace_back(new BinMapper(memory_ptr));
      auto num_bin = bin_mappers_[i]->num_bin();
Guolin Ke's avatar
Guolin Ke committed
113
      if (bin_mappers_[i]->GetMostFreqBin() == 0) {
Guolin Ke's avatar
Guolin Ke committed
114
115
116
117
118
119
120
121
122
123
        num_bin -= 1;
      }
      num_total_bin_ += num_bin;
      bin_offsets_.emplace_back(num_total_bin_);
      memory_ptr += bin_mappers_[i]->SizesInByte();
    }
    data_size_t num_data = num_all_data;
    if (!local_used_indices.empty()) {
      num_data = static_cast<data_size_t>(local_used_indices.size());
    }
124
125
126
127
128
129
130
131
132
133
134
    if (is_multi_val_) {
      for (int i = 0; i < num_feature_; ++i) {
        int addi = bin_mappers_[i]->GetMostFreqBin() == 0 ? 0 : 1;
        if (bin_mappers_[i]->sparse_rate() >= kSparseThreshold) {
          multi_bin_data_.emplace_back(Bin::CreateSparseBin(num_data, bin_mappers_[i]->num_bin() + addi));
        } else {
          multi_bin_data_.emplace_back(Bin::CreateDenseBin(num_data, bin_mappers_[i]->num_bin() + addi));
        }
        multi_bin_data_.back()->LoadFromMemory(memory_ptr, local_used_indices);
        memory_ptr += multi_bin_data_.back()->SizesInByte();
      }
Guolin Ke's avatar
Guolin Ke committed
135
    } else {
136
137
138
139
140
141
142
      if (is_sparse_) {
        bin_data_.reset(Bin::CreateSparseBin(num_data, num_total_bin_));
      } else {
        bin_data_.reset(Bin::CreateDenseBin(num_data, num_total_bin_));
      }
      // get bin data
      bin_data_->LoadFromMemory(memory_ptr, local_used_indices);
Guolin Ke's avatar
Guolin Ke committed
143
144
145
146
147
148
149
150
151
152
153
154
155
156
    }
  }
  /*! \brief Destructor */
  ~FeatureGroup() {
  }

  /*!
  * \brief Push one record, will auto convert to bin and push to bin data
  * \param tid Thread id
  * \param idx Index of record
  * \param value feature value of record
  */
  inline void PushData(int tid, int sub_feature_idx, data_size_t line_idx, double value) {
    uint32_t bin = bin_mappers_[sub_feature_idx]->ValueToBin(value);
Guolin Ke's avatar
Guolin Ke committed
157
158
    if (bin == bin_mappers_[sub_feature_idx]->GetMostFreqBin()) { return; }
    if (bin_mappers_[sub_feature_idx]->GetMostFreqBin() == 0) {
Guolin Ke's avatar
Guolin Ke committed
159
160
      bin -= 1;
    }
161
162
163
164
165
166
    if (is_multi_val_) {
      multi_bin_data_[sub_feature_idx]->Push(tid, line_idx, bin + 1);
    } else {
      bin += bin_offsets_[sub_feature_idx];
      bin_data_->Push(tid, line_idx, bin);
    }
Guolin Ke's avatar
Guolin Ke committed
167
168
169
  }

  inline void CopySubset(const FeatureGroup* full_feature, const data_size_t* used_indices, data_size_t num_used_indices) {
170
171
172
173
174
175
176
    if (!is_multi_val_) {
      bin_data_->CopySubset(full_feature->bin_data_.get(), used_indices, num_used_indices);
    } else {
      for (int i = 0; i < num_feature_; ++i) {
        multi_bin_data_[i]->CopySubset(full_feature->multi_bin_data_[i].get(), used_indices, num_used_indices);
      }
    }
Guolin Ke's avatar
Guolin Ke committed
177
178
  }

zhangyafeikimi's avatar
zhangyafeikimi committed
179
  inline BinIterator* SubFeatureIterator(int sub_feature) {
Guolin Ke's avatar
Guolin Ke committed
180
    uint32_t most_freq_bin = bin_mappers_[sub_feature]->GetMostFreqBin();
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
    if (!is_multi_val_) {
      uint32_t min_bin = bin_offsets_[sub_feature];
      uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
      return bin_data_->GetIterator(min_bin, max_bin, most_freq_bin);
    } else {
      int addi = bin_mappers_[sub_feature]->GetMostFreqBin() == 0 ? 0 : 1;
      uint32_t min_bin = 1;
      uint32_t max_bin = bin_mappers_[sub_feature]->num_bin() - 1 + addi;
      return multi_bin_data_[sub_feature]->GetIterator(min_bin, max_bin, most_freq_bin);
    }
  }

  inline void FinishLoad() {
    if (is_multi_val_) {
      OMP_INIT_EX();
      #pragma omp parallel for schedule(guided)
      for (int i = 0; i < num_feature_; ++i) {
        OMP_LOOP_EX_BEGIN();
        multi_bin_data_[i]->FinishLoad();
        OMP_LOOP_EX_END();
      }
      OMP_THROW_EX();
    } else {
      bin_data_->FinishLoad();
    }
Guolin Ke's avatar
Guolin Ke committed
206
  }
207

208
209
210
211
212
213
  /*!
   * \brief Returns a BinIterator that can access the entire feature group's raw data.
   *        The RawGet() function of the iterator should be called for best efficiency.
   * \return A pointer to the BinIterator object
   */
  inline BinIterator* FeatureGroupIterator() {
214
215
216
    if (is_multi_val_) {
      return nullptr;
    }
217
218
    uint32_t min_bin = bin_offsets_[0];
    uint32_t max_bin = bin_offsets_.back() - 1;
Guolin Ke's avatar
Guolin Ke committed
219
220
    uint32_t most_freq_bin = 0;
    return bin_data_->GetIterator(min_bin, max_bin, most_freq_bin);
221
  }
Guolin Ke's avatar
Guolin Ke committed
222
223
224

  inline data_size_t Split(
    int sub_feature,
225
226
    const uint32_t* threshold,
    int num_threshold,
Guolin Ke's avatar
Guolin Ke committed
227
    bool default_left,
Guolin Ke's avatar
Guolin Ke committed
228
229
230
    data_size_t* data_indices, data_size_t num_data,
    data_size_t* lte_indices, data_size_t* gt_indices) const {
    uint32_t default_bin = bin_mappers_[sub_feature]->GetDefaultBin();
Guolin Ke's avatar
Guolin Ke committed
231
    uint32_t most_freq_bin = bin_mappers_[sub_feature]->GetMostFreqBin();
232
233
234
235
236
237
238
239
240
241
    if (!is_multi_val_) {
      uint32_t min_bin = bin_offsets_[sub_feature];
      uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
      if (bin_mappers_[sub_feature]->bin_type() == BinType::NumericalBin) {
        auto missing_type = bin_mappers_[sub_feature]->missing_type();
        return bin_data_->Split(min_bin, max_bin, default_bin, most_freq_bin, missing_type, default_left,
          *threshold, data_indices, num_data, lte_indices, gt_indices);
      } else {
        return bin_data_->SplitCategorical(min_bin, max_bin, most_freq_bin, threshold, num_threshold, data_indices, num_data, lte_indices, gt_indices);
      }
242
    } else {
243
244
245
246
247
248
249
250
251
252
      int addi = bin_mappers_[sub_feature]->GetMostFreqBin() == 0 ? 0 : 1;
      uint32_t min_bin = 1;
      uint32_t max_bin = bin_mappers_[sub_feature]->num_bin() - 1 + addi;
      if (bin_mappers_[sub_feature]->bin_type() == BinType::NumericalBin) {
        auto missing_type = bin_mappers_[sub_feature]->missing_type();
        return multi_bin_data_[sub_feature]->Split(min_bin, max_bin, default_bin, most_freq_bin, missing_type, default_left,
          *threshold, data_indices, num_data, lte_indices, gt_indices);
      } else {
        return multi_bin_data_[sub_feature]->SplitCategorical(min_bin, max_bin, most_freq_bin, threshold, num_threshold, data_indices, num_data, lte_indices, gt_indices);
      }
253
    }
Guolin Ke's avatar
Guolin Ke committed
254
255
256
257
258
259
260
261
262
263
264
265
266
267
  }
  /*!
  * \brief From bin to feature value
  * \param bin
  * \return FeatureGroup value of this bin
  */
  inline double BinToValue(int sub_feature_idx, uint32_t bin) const {
    return bin_mappers_[sub_feature_idx]->BinToValue(bin);
  }

  /*!
  * \brief Save binary data to file
  * \param file File want to write
  */
268
  void SaveBinaryToFile(const VirtualFileWriter* writer) const {
269
    writer->Write(&is_multi_val_, sizeof(is_multi_val_));
270
271
    writer->Write(&is_sparse_, sizeof(is_sparse_));
    writer->Write(&num_feature_, sizeof(num_feature_));
Guolin Ke's avatar
Guolin Ke committed
272
    for (int i = 0; i < num_feature_; ++i) {
273
      bin_mappers_[i]->SaveBinaryToFile(writer);
Guolin Ke's avatar
Guolin Ke committed
274
    }
275
276
277
278
279
280
281
    if (is_multi_val_) {
      for (int i = 0; i < num_feature_; ++i) {
        multi_bin_data_[i]->SaveBinaryToFile(writer);
      }
    } else {
      bin_data_->SaveBinaryToFile(writer);
    }
Guolin Ke's avatar
Guolin Ke committed
282
283
284
285
286
  }
  /*!
  * \brief Get sizes in byte of this object
  */
  size_t SizesInByte() const {
287
    size_t ret = sizeof(is_multi_val_) + sizeof(is_sparse_) + sizeof(num_feature_);
Guolin Ke's avatar
Guolin Ke committed
288
289
290
    for (int i = 0; i < num_feature_; ++i) {
      ret += bin_mappers_[i]->SizesInByte();
    }
291
292
293
294
295
296
297
    if (!is_multi_val_) {
      ret += bin_data_->SizesInByte();
    } else {
      for (int i = 0; i < num_feature_; ++i) {
        ret += multi_bin_data_[i]->SizesInByte();
      }
    }
Guolin Ke's avatar
Guolin Ke committed
298
299
300
301
    return ret;
  }
  /*! \brief Disable copy */
  FeatureGroup& operator=(const FeatureGroup&) = delete;
302
  /*! \brief Deep copy */
303
  FeatureGroup(const FeatureGroup& other) {
304
    num_feature_ = other.num_feature_;
305
    is_multi_val_ = other.is_multi_val_;
306
307
308
309
310
    is_sparse_ = other.is_sparse_;
    num_total_bin_ = other.num_total_bin_;
    bin_offsets_ = other.bin_offsets_;

    bin_mappers_.reserve(other.bin_mappers_.size());
311
    for (auto& bin_mapper : other.bin_mappers_) {
312
313
      bin_mappers_.emplace_back(new BinMapper(*bin_mapper));
    }
314
315
316
317
318
319
320
321
    if (!is_multi_val_) {
      bin_data_.reset(other.bin_data_->Clone());
    } else {
      multi_bin_data_.clear();
      for (int i = 0; i < num_feature_; ++i) {
        multi_bin_data_.emplace_back(other.multi_bin_data_[i]->Clone());
      }
    }
322
  }
Guolin Ke's avatar
Guolin Ke committed
323

Nikita Titov's avatar
Nikita Titov committed
324
 private:
Guolin Ke's avatar
Guolin Ke committed
325
326
327
328
329
330
331
332
  /*! \brief Number of features */
  int num_feature_;
  /*! \brief Bin mapper for sub features */
  std::vector<std::unique_ptr<BinMapper>> bin_mappers_;
  /*! \brief Bin offsets for sub features */
  std::vector<uint32_t> bin_offsets_;
  /*! \brief Bin data of this feature */
  std::unique_ptr<Bin> bin_data_;
333
  std::vector<std::unique_ptr<Bin>> multi_bin_data_;
Guolin Ke's avatar
Guolin Ke committed
334
  /*! \brief True if this feature is sparse */
335
  bool is_multi_val_;
Guolin Ke's avatar
Guolin Ke committed
336
337
338
339
340
341
342
343
  bool is_sparse_;
  int num_total_bin_;
};


}  // namespace LightGBM

#endif   // LIGHTGBM_FEATURE_GROUP_H_