dataset.h 16.9 KB
Newer Older
Guolin Ke's avatar
Guolin Ke committed
1
2
#ifndef LIGHTGBM_DATASET_H_
#define LIGHTGBM_DATASET_H_
Guolin Ke's avatar
Guolin Ke committed
3
4
5

#include <LightGBM/utils/random.h>
#include <LightGBM/utils/text_reader.h>
6
#include <LightGBM/utils/openmp_wrapper.h>
Guolin Ke's avatar
Guolin Ke committed
7
8

#include <LightGBM/meta.h>
Guolin Ke's avatar
Guolin Ke committed
9
#include <LightGBM/config.h>
Guolin Ke's avatar
Guolin Ke committed
10
#include <LightGBM/feature_group.h>
Guolin Ke's avatar
Guolin Ke committed
11
12
13
14
15

#include <vector>
#include <utility>
#include <functional>
#include <string>
Guolin Ke's avatar
Guolin Ke committed
16
#include <unordered_set>
17
#include <mutex>
Guolin Ke's avatar
Guolin Ke committed
18
19
20
21

namespace LightGBM {

/*! \brief forward declaration */
Guolin Ke's avatar
Guolin Ke committed
22
class DatasetLoader;
Guolin Ke's avatar
Guolin Ke committed
23
/*!
* \brief This class is used to store some meta(non-feature) data for training data,
*        e.g. labels, weights, initial scores, query level information.
*
*        Some details:
*        1. Label, used for training.
*        2. Weights, weights of records, optional
*        3. Query Boundaries, necessary for lambdarank.
*           The documents of i-th query is in [ query_boundaries[i], query_boundaries[i+1] )
*        4. Query Weights, auto calculate by weights and query_boundaries(if both of them are existed)
*           the weight for i-th query is sum of the weights of records in [ query_boundaries[i], query_boundaries[i+1] ) / (query_boundaries[i+1] - query_boundaries[i])
*        5. Initial score. optional. if existing, the model will boost from this score, otherwise will start from 0.
*/
class Metadata {
public:
38
  /*!
Guolin Ke's avatar
Guolin Ke committed
39
40
41
42
  * \brief Null constructor
  */
  Metadata();
  /*!
Qiwei Ye's avatar
Qiwei Ye committed
43
  * \brief Initialization will load query level information, since it is needed for sampling data
  * \param data_filename Filename of data
  */
47
  void Init(const char* data_filename);
Guolin Ke's avatar
Guolin Ke committed
48
  /*!
Guolin Ke's avatar
Guolin Ke committed
49
50
  * \brief Init as a subset of another Metadata
  * \param metadata Metadata to take the subset from
  * \param used_indices Indices of the records to use
  * \param num_used_indices Number of entries in used_indices
  */
  void Init(const Metadata& metadata, const data_size_t* used_indices, data_size_t num_used_indices);
  /*!
Guolin Ke's avatar
Guolin Ke committed
56
57
58
59
60
61
62
63
  * \brief Initialize from binary memory
  * \param memory Pointer to memory
  */
  void LoadFromMemory(const void* memory);
  /*! \brief Destructor */
  ~Metadata();

  /*!
Guolin Ke's avatar
Guolin Ke committed
64
  * \brief Initial work, will allocate space for label, weight(if exists) and query(if exists)
Guolin Ke's avatar
Guolin Ke committed
65
  * \param num_data Number of training data
Guolin Ke's avatar
Guolin Ke committed
66
67
  * \param weight_idx Index of weight column, < 0 means doesn't exist
  * \param query_idx Index of query id column, < 0 means doesn't exist
Guolin Ke's avatar
Guolin Ke committed
68
  */
69
  void Init(data_size_t num_data, int weight_idx, int query_idx);
Guolin Ke's avatar
Guolin Ke committed
70
71
72
73
74
75
76
77
78
79
80
81
82

  /*!
  * \brief Partition label by used indices
  * \param used_indices Indices of locally used data
  */
  void PartitionLabel(const std::vector<data_size_t>& used_indices);

  /*!
  * \brief Partition meta data according to local used indices if need
  * \param num_all_data Number of total training data, including other machines' data on parallel learning
  * \param used_data_indices Indices of local used training data
  */
  void CheckOrPartition(data_size_t num_all_data,
83
                        const std::vector<data_size_t>& used_data_indices);
Guolin Ke's avatar
Guolin Ke committed
84

Guolin Ke's avatar
Guolin Ke committed
85
86
87
88
  void SetLabel(const float* label, data_size_t len);

  void SetWeights(const float* weights, data_size_t len);

Guolin Ke's avatar
Guolin Ke committed
89
  void SetQuery(const data_size_t* query, data_size_t len);
Guolin Ke's avatar
Guolin Ke committed
90

Guolin Ke's avatar
Guolin Ke committed
91
92
93
94
  /*!
  * \brief Set initial scores
  * \param init_score Initial scores, this class will manage memory for init_score.
  */
95
  void SetInitScore(const double* init_score, data_size_t len);
Guolin Ke's avatar
Guolin Ke committed
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112


  /*!
  * \brief Save binary data to file
  * \param file File want to write
  */
  void SaveBinaryToFile(FILE* file) const;

  /*!
  * \brief Get sizes in byte of this object
  */
  size_t SizesInByte() const;

  /*!
  * \brief Get pointer of label
  * \return Pointer of label
  */
Guolin Ke's avatar
Guolin Ke committed
113
  inline const float* label() const { return label_.data(); }
Guolin Ke's avatar
Guolin Ke committed
114
115
116
117
118
119

  /*!
  * \brief Set label for one record
  * \param idx Index of this record
  * \param value Label value of this record
  */
120
  inline void SetLabelAt(data_size_t idx, float value)
Guolin Ke's avatar
Guolin Ke committed
121
  {
122
    label_[idx] = value;
Guolin Ke's avatar
Guolin Ke committed
123
124
  }

Guolin Ke's avatar
Guolin Ke committed
125
126
127
128
129
  /*!
  * \brief Set Weight for one record
  * \param idx Index of this record
  * \param value Weight value of this record
  */
130
  inline void SetWeightAt(data_size_t idx, float value)
Guolin Ke's avatar
Guolin Ke committed
131
  {
132
    weights_[idx] = value;
Guolin Ke's avatar
Guolin Ke committed
133
134
135
136
137
138
139
  }

  /*!
  * \brief Set Query Id for one record
  * \param idx Index of this record
  * \param value Query Id value of this record
  */
Guolin Ke's avatar
Guolin Ke committed
140
  inline void SetQueryAt(data_size_t idx, data_size_t value)
Guolin Ke's avatar
Guolin Ke committed
141
142
143
144
  {
    queries_[idx] = static_cast<data_size_t>(value);
  }

Guolin Ke's avatar
Guolin Ke committed
145
  /*!
Hui Xue's avatar
Hui Xue committed
146
  * \brief Get weights, if not exists, will return nullptr
Guolin Ke's avatar
Guolin Ke committed
147
148
  * \return Pointer of weights
  */
Guolin Ke's avatar
Guolin Ke committed
149
  inline const float* weights() const {
Guolin Ke's avatar
Guolin Ke committed
150
    if (!weights_.empty()) {
Guolin Ke's avatar
Guolin Ke committed
151
152
153
154
155
      return weights_.data();
    } else {
      return nullptr;
    }
  }
Guolin Ke's avatar
Guolin Ke committed
156
157

  /*!
Hui Xue's avatar
Hui Xue committed
158
  * \brief Get data boundaries on queries, if not exists, will return nullptr
159
  *        we assume data will order by query,
Guolin Ke's avatar
Guolin Ke committed
160
161
162
163
  *        the interval of [query_boundaris[i], query_boundaris[i+1])
  *        is the data indices for query i.
  * \return Pointer of data boundaries on queries
  */
164
  inline const data_size_t* query_boundaries() const {
Guolin Ke's avatar
Guolin Ke committed
165
    if (!query_boundaries_.empty()) {
Guolin Ke's avatar
Guolin Ke committed
166
167
168
169
170
      return query_boundaries_.data();
    } else {
      return nullptr;
    }
  }
Guolin Ke's avatar
Guolin Ke committed
171
172
173
174
175

  /*!
  * \brief Get Number of queries
  * \return Number of queries
  */
176
  inline data_size_t num_queries() const { return num_queries_; }
Guolin Ke's avatar
Guolin Ke committed
177
178

  /*!
Hui Xue's avatar
Hui Xue committed
179
  * \brief Get weights for queries, if not exists, will return nullptr
Guolin Ke's avatar
Guolin Ke committed
180
181
  * \return Pointer of weights for queries
  */
182
  inline const float* query_weights() const {
Guolin Ke's avatar
Guolin Ke committed
183
    if (!query_weights_.empty()) {
Guolin Ke's avatar
Guolin Ke committed
184
185
186
187
188
      return query_weights_.data();
    } else {
      return nullptr;
    }
  }
Guolin Ke's avatar
Guolin Ke committed
189
190

  /*!
Hui Xue's avatar
Hui Xue committed
191
  * \brief Get initial scores, if not exists, will return nullptr
Guolin Ke's avatar
Guolin Ke committed
192
193
  * \return Pointer of initial scores
  */
194
  inline const double* init_score() const {
Guolin Ke's avatar
Guolin Ke committed
195
    if (!init_score_.empty()) {
Guolin Ke's avatar
Guolin Ke committed
196
197
198
199
200
      return init_score_.data();
    } else {
      return nullptr;
    }
  }
Guolin Ke's avatar
Guolin Ke committed
201

202
203
204
  /*!
  * \brief Get size of initial scores
  */
Guolin Ke's avatar
Guolin Ke committed
205
  inline int64_t num_init_score() const { return num_init_score_; }
206

Guolin Ke's avatar
Guolin Ke committed
207
208
209
210
  /*! \brief Disable copy */
  Metadata& operator=(const Metadata&) = delete;
  /*! \brief Disable copy */
  Metadata(const Metadata&) = delete;
Guolin Ke's avatar
Guolin Ke committed
211
212

private:
Guolin Ke's avatar
Guolin Ke committed
213
214
  /*! \brief Load initial scores from file */
  void LoadInitialScore();
Guolin Ke's avatar
Guolin Ke committed
215
216
217
218
219
220
221
222
223
224
225
226
227
  /*! \brief Load weights from file */
  void LoadWeights();
  /*! \brief Load query boundaries from file */
  void LoadQueryBoundaries();
  /*! \brief Load query weights */
  void LoadQueryWeights();
  /*! \brief Filename of current data */
  const char* data_filename_;
  /*! \brief Number of data */
  data_size_t num_data_;
  /*! \brief Number of weights, used to check correct weight file */
  data_size_t num_weights_;
  /*! \brief Label data */
Guolin Ke's avatar
Guolin Ke committed
228
  std::vector<float> label_;
Guolin Ke's avatar
Guolin Ke committed
229
  /*! \brief Weights data */
Guolin Ke's avatar
Guolin Ke committed
230
  std::vector<float> weights_;
Guolin Ke's avatar
Guolin Ke committed
231
  /*! \brief Query boundaries */
Guolin Ke's avatar
Guolin Ke committed
232
  std::vector<data_size_t> query_boundaries_;
Guolin Ke's avatar
Guolin Ke committed
233
  /*! \brief Query weights */
Guolin Ke's avatar
Guolin Ke committed
234
  std::vector<float> query_weights_;
Guolin Ke's avatar
Guolin Ke committed
235
236
237
  /*! \brief Number of queries */
  data_size_t num_queries_;
  /*! \brief Number of Initial score, used to check correct init score file */
Guolin Ke's avatar
Guolin Ke committed
238
  int64_t num_init_score_;
Guolin Ke's avatar
Guolin Ke committed
239
  /*! \brief Initial score */
Guolin Ke's avatar
Guolin Ke committed
240
  std::vector<double> init_score_;
Guolin Ke's avatar
Guolin Ke committed
241
  /*! \brief Queries data */
Guolin Ke's avatar
Guolin Ke committed
242
  std::vector<data_size_t> queries_;
243
244
  /*! \brief mutex for threading safe call */
  std::mutex mutex_;
245
246
247
  bool weight_load_from_file_;
  bool query_load_from_file_;
  bool init_score_load_from_file_;
Guolin Ke's avatar
Guolin Ke committed
248
249
250
251
252
253
};


/*! \brief Interface for Parser */
class Parser {
public:
  /*! \brief Virtual destructor, so implementations can be destroyed through a Parser pointer */
  virtual ~Parser() = default;

  /*!
  * \brief Parse one line with label
  * \param str One line record, string format, should end with '\0'
  * \param out_features Output columns, store in (column_idx, values)
  * \param out_label Label will store to this if exists
  */
  virtual void ParseOneLine(const char* str,
                            std::vector<std::pair<int, double>>* out_features,
                            double* out_label) const = 0;

  /*!
  * \brief Create a object of parser, will auto choose the format depend on file
  * \param filename One Filename of data
  * \param has_header NOTE(review): presumably true when the first line of the file is a header row - confirm against the implementation
  * \param num_features Pass num_features of this data file if you know, <=0 means don't know
  * \param label_idx index of label column
  * \return Object of parser. NOTE(review): returned as a raw pointer, presumably owned by the caller - confirm
  */
  static Parser* CreateParser(const char* filename, bool has_header, int num_features, int label_idx);
};

/*! \brief The main class of data set,
*          which is used for training or validation
*/
class Dataset {
public:
Guolin Ke's avatar
Guolin Ke committed
282
  friend DatasetLoader;
Guolin Ke's avatar
Guolin Ke committed
283

284
  LIGHTGBM_EXPORT Dataset();
Guolin Ke's avatar
Guolin Ke committed
285

286
  LIGHTGBM_EXPORT Dataset(data_size_t num_data);
Guolin Ke's avatar
Guolin Ke committed
287

Guolin Ke's avatar
Guolin Ke committed
288
289
  void Construct(
    std::vector<std::unique_ptr<BinMapper>>& bin_mappers,
290
291
    int** sample_non_zero_indices,
    const int* num_per_col,
Guolin Ke's avatar
Guolin Ke committed
292
293
294
    size_t total_sample_cnt,
    const IOConfig& io_config);

Guolin Ke's avatar
Guolin Ke committed
295
  /*! \brief Destructor */
296
  LIGHTGBM_EXPORT ~Dataset();
Guolin Ke's avatar
Guolin Ke committed
297

298
  /*!
  * \brief Check whether another dataset has the same feature layout as this one
  * \param other Dataset to compare with
  * \return true when num_features_, num_total_features_ and label_idx_ are equal
  *         and every used feature's BinMapper passes CheckAlign against the
  *         corresponding BinMapper of other; false otherwise
  */
  LIGHTGBM_EXPORT bool CheckAlign(const Dataset& other) const {
    const bool same_layout = num_features_ == other.num_features_
                             && num_total_features_ == other.num_total_features_
                             && label_idx_ == other.label_idx_;
    if (!same_layout) {
      return false;
    }
    for (int fidx = 0; fidx < num_features_; ++fidx) {
      const BinMapper* mine = FeatureBinMapper(fidx);
      const BinMapper* theirs = other.FeatureBinMapper(fidx);
      if (!mine->CheckAlign(*theirs)) {
        return false;
      }
    }
    return true;
  }

Guolin Ke's avatar
Guolin Ke committed
316
  inline void PushOneRow(int tid, data_size_t row_idx, const std::vector<double>& feature_values) {
Guolin Ke's avatar
Guolin Ke committed
317
    if (is_finish_load_) { return; }
Guolin Ke's avatar
Guolin Ke committed
318
    for (size_t i = 0; i < feature_values.size() && i < static_cast<size_t>(num_total_features_); ++i) {
Guolin Ke's avatar
Guolin Ke committed
319
320
      int feature_idx = used_feature_map_[i];
      if (feature_idx >= 0) {
Guolin Ke's avatar
Guolin Ke committed
321
322
323
        const int group = feature2group_[feature_idx];
        const int sub_feature = feature2subfeature_[feature_idx];
        feature_groups_[group]->PushData(tid, sub_feature, row_idx, feature_values[i]);
Guolin Ke's avatar
Guolin Ke committed
324
325
326
327
      }
    }
  }

328
  inline void PushOneRow(int tid, data_size_t row_idx, const std::vector<std::pair<int, double>>& feature_values) {
Guolin Ke's avatar
Guolin Ke committed
329
    if (is_finish_load_) { return; }
330
    for (auto& inner_data : feature_values) {
331
      if (inner_data.first >= num_total_features_) { continue; }
332
333
      int feature_idx = used_feature_map_[inner_data.first];
      if (feature_idx >= 0) {
Guolin Ke's avatar
Guolin Ke committed
334
335
336
        const int group = feature2group_[feature_idx];
        const int sub_feature = feature2subfeature_[feature_idx];
        feature_groups_[group]->PushData(tid, sub_feature, row_idx, inner_data.second);
337
338
339
340
      }
    }
  }

Guolin Ke's avatar
Guolin Ke committed
341
342
343
344
345
346
347
348
349
  /*!
  * \brief Push a single value straight into one feature group
  *        Unlike PushOneRow, this does not look at is_finish_load_.
  * \param tid Forwarded unchanged to FeatureGroup::PushData
  * \param row_idx Index of the row being pushed
  * \param group Index into feature_groups_
  * \param sub_feature Index of the feature inside that group
  * \param value Value to push
  */
  inline void PushOneData(int tid, data_size_t row_idx, int group, int sub_feature, double value) {
    const auto& target_group = feature_groups_[group];
    target_group->PushData(tid, sub_feature, row_idx, value);
  }

  /*!
  * \brief Look up real_feature_idx_ for a used feature
  * \param fidx Index into real_feature_idx_ (no range check is performed)
  * \return real_feature_idx_[fidx]
  */
  inline int RealFeatureIndex(int fidx) const { return real_feature_idx_[fidx]; }

  inline int InnerFeatureIndex(int col_idx) const {
Guolin Ke's avatar
Guolin Ke committed
350
    return used_feature_map_[col_idx];
Guolin Ke's avatar
Guolin Ke committed
351
  }
Guolin Ke's avatar
Guolin Ke committed
352
353
354
355
356
357
358
359
360
  /*!
  * \brief Get the group that holds a used feature
  * \param feature_idx Index into feature2group_ (no range check is performed)
  * \return feature2group_[feature_idx]
  */
  inline int Feature2Group(int feature_idx) const { return feature2group_[feature_idx]; }
  /*!
  * \brief Get the position of a used feature inside its group
  *        (the misspelled name is part of the public interface and is kept)
  * \param feature_idx Index into feature2subfeature_ (no range check is performed)
  * \return feature2subfeature_[feature_idx]
  */
  inline int Feture2SubFeature(int feature_idx) const { return feature2subfeature_[feature_idx]; }
  /*!
  * \brief Get the last entry of group_bin_boundaries_
  *        group_bin_boundaries_ must be non-empty when this is called.
  * \return group_bin_boundaries_.back()
  */
  inline uint64_t NumTotalBin() const { return group_bin_boundaries_.back(); }
Guolin Ke's avatar
Guolin Ke committed
361

Guolin Ke's avatar
Guolin Ke committed
362
363
364
  void ReSize(data_size_t num_data);

  void CopySubset(const Dataset* fullset, const data_size_t* used_indices, data_size_t num_used_indices, bool need_meta_data);
Guolin Ke's avatar
Guolin Ke committed
365

366
  LIGHTGBM_EXPORT void FinishLoad();
Guolin Ke's avatar
Guolin Ke committed
367

368
  LIGHTGBM_EXPORT bool SetFloatField(const char* field_name, const float* field_data, data_size_t num_element);
Guolin Ke's avatar
Guolin Ke committed
369

370
  LIGHTGBM_EXPORT bool SetDoubleField(const char* field_name, const double* field_data, data_size_t num_element);
Guolin Ke's avatar
Guolin Ke committed
371

372
  LIGHTGBM_EXPORT bool SetIntField(const char* field_name, const int* field_data, data_size_t num_element);
373

374
  LIGHTGBM_EXPORT bool GetFloatField(const char* field_name, data_size_t* out_len, const float** out_ptr);
375

376
  LIGHTGBM_EXPORT bool GetDoubleField(const char* field_name, data_size_t* out_len, const double** out_ptr);
Guolin Ke's avatar
Guolin Ke committed
377

378
  LIGHTGBM_EXPORT bool GetIntField(const char* field_name, data_size_t* out_len, const int** out_ptr);
379

Guolin Ke's avatar
Guolin Ke committed
380
381
382
  /*!
  * \brief Save current dataset into binary file, will save to "filename.bin"
  */
383
  LIGHTGBM_EXPORT void SaveBinaryFile(const char* bin_filename);
Guolin Ke's avatar
Guolin Ke committed
384

385
  LIGHTGBM_EXPORT void CopyFeatureMapperFrom(const Dataset* dataset);
Guolin Ke's avatar
Guolin Ke committed
386

Guolin Ke's avatar
Guolin Ke committed
387
388
  LIGHTGBM_EXPORT void CreateValid(const Dataset* dataset);

389
390
391
392
393
394
395
396
  void ConstructHistograms(const std::vector<int8_t>& is_feature_used,
                           const data_size_t* data_indices, data_size_t num_data,
                           int leaf_idx,
                           std::vector<std::unique_ptr<OrderedBin>>& ordered_bins,
                           const score_t* gradients, const score_t* hessians,
                           score_t* ordered_gradients, score_t* ordered_hessians,
                           bool is_constant_hessian,
                           HistogramBinEntry* histogram_data) const;
Guolin Ke's avatar
Guolin Ke committed
397
398

  void FixHistogram(int feature_idx, double sum_gradient, double sum_hessian, data_size_t num_data,
399
                    HistogramBinEntry* data) const;
Guolin Ke's avatar
Guolin Ke committed
400

401
402
403
404
  /*!
  * \brief Forward a split request to the feature group that owns a feature
  * \param feature Used feature index
  * \param threshold Forwarded unchanged to FeatureGroup::Split
  * \param data_indices Forwarded unchanged to FeatureGroup::Split
  * \param num_data Forwarded unchanged to FeatureGroup::Split
  * \param lte_indices Forwarded unchanged to FeatureGroup::Split
  * \param gt_indices Forwarded unchanged to FeatureGroup::Split
  * \return Whatever FeatureGroup::Split returns
  *         NOTE(review): presumably the number of indices written to lte_indices - confirm
  */
  inline data_size_t Split(int feature,
                           uint32_t threshold,
                           data_size_t* data_indices, data_size_t num_data,
                           data_size_t* lte_indices, data_size_t* gt_indices) const {
    const auto& owner = feature_groups_[feature2group_[feature]];
    return owner->Split(feature2subfeature_[feature], threshold,
                        data_indices, num_data, lte_indices, gt_indices);
  }

  /*!
  * \brief Get the bin offset of a used feature
  * \param i Used feature index
  * \return 1 when the feature is sub-feature 0 of its group, 0 otherwise
  */
  inline int SubFeatureBinOffset(int i) const {
    return feature2subfeature_[i] == 0 ? 1 : 0;
  }

  inline int FeatureNumBin(int i) const {
    const int group = feature2group_[i];
    const int sub_feature = feature2subfeature_[i];
422
    return feature_groups_[group]->bin_mappers_[sub_feature]->num_bin();
Guolin Ke's avatar
Guolin Ke committed
423
  }
424

Guolin Ke's avatar
Guolin Ke committed
425
426
427
428
429
430
431
432
433
  /*!
  * \brief Get the BinMapper of a used feature
  * \param i Used feature index
  * \return Pointer obtained via get() from the feature group's bin_mappers_ entry;
  *         the entry itself is not released or modified
  */
  inline const BinMapper* FeatureBinMapper(int i) const {
    const auto& owner = feature_groups_[feature2group_[i]];
    return owner->bin_mappers_[feature2subfeature_[i]].get();
  }

  /*!
  * \brief Get a bin iterator for a used feature
  * \param i Used feature index
  * \return Result of FeatureGroup::SubFeatureIterator for that feature
  *         NOTE(review): ownership of the returned pointer is not visible here - confirm
  */
  inline BinIterator* FeatureIterator(int i) const {
    const auto& owner = feature_groups_[feature2group_[i]];
    return owner->SubFeatureIterator(feature2subfeature_[i]);
  }

  /*!
  * \brief Convert a bin threshold of a used feature back to a value
  * \param i Used feature index
  * \param threshold Bin passed to BinMapper::BinToValue
  * \return BinToValue(threshold) of the BinMapper that belongs to feature i
  */
  inline double RealThreshold(int i, uint32_t threshold) const {
    const auto& mapper = feature_groups_[feature2group_[i]]->bin_mappers_[feature2subfeature_[i]];
    return mapper->BinToValue(threshold);
  }

  inline void CreateOrderedBins(std::vector<std::unique_ptr<OrderedBin>>* ordered_bins) const {
    ordered_bins->resize(num_groups_);
445
446
    OMP_INIT_EX();
    #pragma omp parallel for schedule(guided)
Guolin Ke's avatar
Guolin Ke committed
447
    for (int i = 0; i < num_groups_; ++i) {
448
449
450
      OMP_LOOP_EX_BEGIN();
      ordered_bins->at(i).reset(feature_groups_[i]->bin_data_->CreateOrderedBin());
      OMP_LOOP_EX_END();
Guolin Ke's avatar
Guolin Ke committed
451
    }
452
    OMP_THROW_EX();
Guolin Ke's avatar
Guolin Ke committed
453
  }
Guolin Ke's avatar
Guolin Ke committed
454
455
456
457
458
459
460
461
462
463

  /*!
  * \brief Get meta data
  * \return Const reference to metadata_
  */
  inline const Metadata& metadata() const {
    return metadata_;
  }

  /*!
  * \brief Get Number of used features
  * \return Current value of num_features_
  */
  inline int num_features() const {
    return num_features_;
  }

464
465
466
  /*!
  * \brief Get Number of total features
  * \return Current value of num_total_features_
  */
  inline int num_total_features() const {
    return num_total_features_;
  }

Guolin Ke's avatar
Guolin Ke committed
467
468
469
470
  /*!
  * \brief Get the index of label column
  * \return Current value of label_idx_
  */
  inline int label_idx() const {
    return label_idx_;
  }

  /*! \brief Get names of current data set */
Guolin Ke's avatar
Guolin Ke committed
471
472
473
474
475
476
477
478
479
  inline const std::vector<std::string>& feature_names() const { return feature_names_; }

  /*!
  * \brief Replace the stored feature names
  *        When the number of names differs from num_total_features_, a warning
  *        is logged and the stored names are left unchanged.
  * \param feature_names New names, one per original column
  */
  inline void set_feature_names(const std::vector<std::string>& feature_names) {
    const size_t expected = static_cast<size_t>(num_total_features_);
    if (feature_names.size() == expected) {
      feature_names_ = feature_names;
    } else {
      Log::Warning("size of feature_names error, should equal with total number of features");
    }
  }
Guolin Ke's avatar
Guolin Ke committed
480

Guolin Ke's avatar
Guolin Ke committed
481
482
483
484
485
486
487
488
489
490
491
492
493
494
  /*!
  * \brief Collect the bin description of every original column
  * \return One string per column in [0, num_total_features_): "none" for columns
  *         that are not mapped to a used feature, otherwise the bin_info() of
  *         that feature's BinMapper
  */
  inline std::vector<std::string> feature_infos() const {
    std::vector<std::string> bufs;
    if (num_total_features_ > 0) {
      bufs.reserve(static_cast<size_t>(num_total_features_));
    }
    for (int i = 0; i < num_total_features_; ++i) {
      const int fidx = used_feature_map_[i];
      // Any negative entry marks an unused column (same test as PushOneRow);
      // only non-negative values are valid arguments for FeatureBinMapper.
      if (fidx < 0) {
        bufs.emplace_back("none");
      } else {
        bufs.push_back(FeatureBinMapper(fidx)->bin_info());
      }
    }
    return bufs;
  }

Guolin Ke's avatar
Guolin Ke committed
495
496
497
498
499
500
501
502
503
504
505
  /*!
  * \brief Get Number of data
  * \return Current value of num_data_
  */
  inline data_size_t num_data() const {
    return num_data_;
  }

  /*! \brief Disable copy */
  Dataset& operator=(const Dataset&) = delete;
  /*! \brief Disable copy */
  Dataset(const Dataset&) = delete;

private:
  const char* data_filename_;
  /*! \brief Store used features */
Guolin Ke's avatar
Guolin Ke committed
506
  std::vector<std::unique_ptr<FeatureGroup>> feature_groups_;
Guolin Ke's avatar
Guolin Ke committed
507
508
509
510
  /*! \brief Mapper from real feature index to used index*/
  std::vector<int> used_feature_map_;
  /*! \brief Number of used features*/
  int num_features_;
511
512
  /*! \brief Number of total features*/
  int num_total_features_;
Guolin Ke's avatar
Guolin Ke committed
513
514
515
516
  /*! \brief Number of total data*/
  data_size_t num_data_;
  /*! \brief Store some label level data*/
  Metadata metadata_;
Guolin Ke's avatar
Guolin Ke committed
517
518
519
520
  /*! \brief index of label column */
  int label_idx_ = 0;
  /*! \brief store feature names */
  std::vector<std::string> feature_names_;
521
522
  /*! \brief token for binary files; NOTE(review): presumably identifies LightGBM binary dataset files - confirm */
  static const char* binary_file_token;
Guolin Ke's avatar
Guolin Ke committed
523
524
525
526
527
528
529
  int num_groups_;
  std::vector<int> real_feature_idx_;
  std::vector<int> feature2group_;
  std::vector<int> feature2subfeature_;
  std::vector<uint64_t> group_bin_boundaries_;
  std::vector<int> group_feature_start_;
  std::vector<int> group_feature_cnt_;
Guolin Ke's avatar
Guolin Ke committed
530
  bool is_finish_load_;
Guolin Ke's avatar
Guolin Ke committed
531
532
533
534
};

}  // namespace LightGBM

Guolin Ke's avatar
Guolin Ke committed
535
#endif   // LIGHTGBM_DATASET_H_