/*!
 * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */
#ifndef LIGHTGBM_DATASET_H_
#define LIGHTGBM_DATASET_H_
Guolin Ke's avatar
Guolin Ke committed
7

8
9
10
11
12
13
14
#include <LightGBM/config.h>
#include <LightGBM/feature_group.h>
#include <LightGBM/meta.h>
#include <LightGBM/utils/openmp_wrapper.h>
#include <LightGBM/utils/random.h>
#include <LightGBM/utils/text_reader.h>

Guolin Ke's avatar
Guolin Ke committed
15
#include <string>
16
17
#include <functional>
#include <memory>
18
#include <mutex>
19
20
21
#include <unordered_set>
#include <utility>
#include <vector>
Guolin Ke's avatar
Guolin Ke committed
22
23
24
25

namespace LightGBM {

/*! \brief Forward declaration; DatasetLoader is declared a friend of Dataset below */
class DatasetLoader;
Guolin Ke's avatar
Guolin Ke committed
27
/*!
* \brief This class is used to store some meta(non-feature) data for training data,
*        e.g. labels, weights, initial scores, query level information.
*
*        Some details:
*        1. Label, used for training.
*        2. Weights, weights of records, optional
*        3. Query Boundaries, necessary for lambdarank.
*           The documents of i-th query is in [ query_boundaries[i], query_boundaries[i+1] )
*        4. Query Weights, auto calculate by weights and query_boundaries(if both of them are existed)
*           the weight for i-th query is sum(weights[query_boundaries[i]], .., weights[query_boundaries[i+1] - 1]) / (query_boundaries[i+1] - query_boundaries[i])
*        5. Initial score. optional. if existing, the model will boost from this score, otherwise will start from 0.
*/
class Metadata {
Nikita Titov's avatar
Nikita Titov committed
41
 public:
42
  /*!
43
  * \brief Null constructor
Guolin Ke's avatar
Guolin Ke committed
44
45
46
  */
  Metadata();
  /*!
47
  * \brief Initialization will load query level informations, since it is need for sampling data
Guolin Ke's avatar
Guolin Ke committed
48
49
  * \param data_filename Filename of data
  */
50
  void Init(const char* data_filename);
Guolin Ke's avatar
Guolin Ke committed
51
  /*!
Guolin Ke's avatar
Guolin Ke committed
52
53
  * \brief init as subset
  * \param metadata Filename of data
54
  * \param used_indices
Guolin Ke's avatar
Guolin Ke committed
55
56
57
58
  * \param num_used_indices
  */
  void Init(const Metadata& metadata, const data_size_t* used_indices, data_size_t num_used_indices);
  /*!
Guolin Ke's avatar
Guolin Ke committed
59
60
61
62
63
64
65
66
  * \brief Initial with binary memory
  * \param memory Pointer to memory
  */
  void LoadFromMemory(const void* memory);
  /*! \brief Destructor */
  ~Metadata();

  /*!
Guolin Ke's avatar
Guolin Ke committed
67
  * \brief Initial work, will allocate space for label, weight(if exists) and query(if exists)
Guolin Ke's avatar
Guolin Ke committed
68
  * \param num_data Number of training data
Guolin Ke's avatar
Guolin Ke committed
69
70
  * \param weight_idx Index of weight column, < 0 means doesn't exists
  * \param query_idx Index of query id column, < 0 means doesn't exists
Guolin Ke's avatar
Guolin Ke committed
71
  */
72
  void Init(data_size_t num_data, int weight_idx, int query_idx);
Guolin Ke's avatar
Guolin Ke committed
73
74
75

  /*!
  * \brief Partition label by used indices
76
  * \param used_indices Indices of local used
Guolin Ke's avatar
Guolin Ke committed
77
78
79
80
81
82
83
84
85
  */
  void PartitionLabel(const std::vector<data_size_t>& used_indices);

  /*!
  * \brief Partition meta data according to local used indices if need
  * \param num_all_data Number of total training data, including other machines' data on parallel learning
  * \param used_data_indices Indices of local used training data
  */
  void CheckOrPartition(data_size_t num_all_data,
86
                        const std::vector<data_size_t>& used_data_indices);
Guolin Ke's avatar
Guolin Ke committed
87

88
  void SetLabel(const label_t* label, data_size_t len);
Guolin Ke's avatar
Guolin Ke committed
89

90
  void SetWeights(const label_t* weights, data_size_t len);
Guolin Ke's avatar
Guolin Ke committed
91

Guolin Ke's avatar
Guolin Ke committed
92
  void SetQuery(const data_size_t* query, data_size_t len);
Guolin Ke's avatar
Guolin Ke committed
93

Guolin Ke's avatar
Guolin Ke committed
94
95
96
97
  /*!
  * \brief Set initial scores
  * \param init_score Initial scores, this class will manage memory for init_score.
  */
98
  void SetInitScore(const double* init_score, data_size_t len);
Guolin Ke's avatar
Guolin Ke committed
99
100
101
102
103
104


  /*!
  * \brief Save binary data to file
  * \param file File want to write
  */
105
  void SaveBinaryToFile(const VirtualFileWriter* writer) const;
Guolin Ke's avatar
Guolin Ke committed
106
107
108
109
110
111
112
113
114
115

  /*!
  * \brief Get sizes in byte of this object
  */
  size_t SizesInByte() const;

  /*!
  * \brief Get pointer of label
  * \return Pointer of label
  */
116
  inline const label_t* label() const { return label_.data(); }
Guolin Ke's avatar
Guolin Ke committed
117
118
119
120
121
122

  /*!
  * \brief Set label for one record
  * \param idx Index of this record
  * \param value Label value of this record
  */
123
  inline void SetLabelAt(data_size_t idx, label_t value) {
124
    label_[idx] = value;
Guolin Ke's avatar
Guolin Ke committed
125
126
  }

Guolin Ke's avatar
Guolin Ke committed
127
128
129
130
131
  /*!
  * \brief Set Weight for one record
  * \param idx Index of this record
  * \param value Weight value of this record
  */
132
  inline void SetWeightAt(data_size_t idx, label_t value) {
133
    weights_[idx] = value;
Guolin Ke's avatar
Guolin Ke committed
134
135
136
137
138
139
140
  }

  /*!
  * \brief Set Query Id for one record
  * \param idx Index of this record
  * \param value Query Id value of this record
  */
141
  inline void SetQueryAt(data_size_t idx, data_size_t value) {
Guolin Ke's avatar
Guolin Ke committed
142
143
144
    queries_[idx] = static_cast<data_size_t>(value);
  }

Guolin Ke's avatar
Guolin Ke committed
145
  /*!
Hui Xue's avatar
Hui Xue committed
146
  * \brief Get weights, if not exists, will return nullptr
Guolin Ke's avatar
Guolin Ke committed
147
148
  * \return Pointer of weights
  */
149
  inline const label_t* weights() const {
Guolin Ke's avatar
Guolin Ke committed
150
    if (!weights_.empty()) {
Guolin Ke's avatar
Guolin Ke committed
151
152
153
154
155
      return weights_.data();
    } else {
      return nullptr;
    }
  }
Guolin Ke's avatar
Guolin Ke committed
156
157

  /*!
Hui Xue's avatar
Hui Xue committed
158
  * \brief Get data boundaries on queries, if not exists, will return nullptr
159
  *        we assume data will order by query,
Guolin Ke's avatar
Guolin Ke committed
160
161
162
163
  *        the interval of [query_boundaris[i], query_boundaris[i+1])
  *        is the data indices for query i.
  * \return Pointer of data boundaries on queries
  */
164
  inline const data_size_t* query_boundaries() const {
Guolin Ke's avatar
Guolin Ke committed
165
    if (!query_boundaries_.empty()) {
Guolin Ke's avatar
Guolin Ke committed
166
167
168
169
170
      return query_boundaries_.data();
    } else {
      return nullptr;
    }
  }
Guolin Ke's avatar
Guolin Ke committed
171
172
173
174
175

  /*!
  * \brief Get Number of queries
  * \return Number of queries
  */
176
  inline data_size_t num_queries() const { return num_queries_; }
Guolin Ke's avatar
Guolin Ke committed
177
178

  /*!
Hui Xue's avatar
Hui Xue committed
179
  * \brief Get weights for queries, if not exists, will return nullptr
Guolin Ke's avatar
Guolin Ke committed
180
181
  * \return Pointer of weights for queries
  */
182
  inline const label_t* query_weights() const {
Guolin Ke's avatar
Guolin Ke committed
183
    if (!query_weights_.empty()) {
Guolin Ke's avatar
Guolin Ke committed
184
185
186
187
188
      return query_weights_.data();
    } else {
      return nullptr;
    }
  }
Guolin Ke's avatar
Guolin Ke committed
189
190

  /*!
Hui Xue's avatar
Hui Xue committed
191
  * \brief Get initial scores, if not exists, will return nullptr
Guolin Ke's avatar
Guolin Ke committed
192
193
  * \return Pointer of initial scores
  */
194
  inline const double* init_score() const {
Guolin Ke's avatar
Guolin Ke committed
195
    if (!init_score_.empty()) {
Guolin Ke's avatar
Guolin Ke committed
196
197
198
199
200
      return init_score_.data();
    } else {
      return nullptr;
    }
  }
Guolin Ke's avatar
Guolin Ke committed
201

202
203
204
  /*!
  * \brief Get size of initial scores
  */
Guolin Ke's avatar
Guolin Ke committed
205
  inline int64_t num_init_score() const { return num_init_score_; }
206

Guolin Ke's avatar
Guolin Ke committed
207
208
209
210
  /*! \brief Disable copy */
  Metadata& operator=(const Metadata&) = delete;
  /*! \brief Disable copy */
  Metadata(const Metadata&) = delete;
Guolin Ke's avatar
Guolin Ke committed
211

Nikita Titov's avatar
Nikita Titov committed
212
 private:
Guolin Ke's avatar
Guolin Ke committed
213
  /*! \brief Load initial scores from file */
214
  void LoadInitialScore();
Guolin Ke's avatar
Guolin Ke committed
215
216
217
218
219
220
221
  /*! \brief Load wights from file */
  void LoadWeights();
  /*! \brief Load query boundaries from file */
  void LoadQueryBoundaries();
  /*! \brief Load query wights */
  void LoadQueryWeights();
  /*! \brief Filename of current data */
Guolin Ke's avatar
Guolin Ke committed
222
  std::string data_filename_;
Guolin Ke's avatar
Guolin Ke committed
223
224
225
226
227
  /*! \brief Number of data */
  data_size_t num_data_;
  /*! \brief Number of weights, used to check correct weight file */
  data_size_t num_weights_;
  /*! \brief Label data */
228
  std::vector<label_t> label_;
Guolin Ke's avatar
Guolin Ke committed
229
  /*! \brief Weights data */
230
  std::vector<label_t> weights_;
Guolin Ke's avatar
Guolin Ke committed
231
  /*! \brief Query boundaries */
Guolin Ke's avatar
Guolin Ke committed
232
  std::vector<data_size_t> query_boundaries_;
Guolin Ke's avatar
Guolin Ke committed
233
  /*! \brief Query weights */
234
  std::vector<label_t> query_weights_;
Guolin Ke's avatar
Guolin Ke committed
235
236
237
  /*! \brief Number of querys */
  data_size_t num_queries_;
  /*! \brief Number of Initial score, used to check correct weight file */
Guolin Ke's avatar
Guolin Ke committed
238
  int64_t num_init_score_;
Guolin Ke's avatar
Guolin Ke committed
239
  /*! \brief Initial score */
Guolin Ke's avatar
Guolin Ke committed
240
  std::vector<double> init_score_;
Guolin Ke's avatar
Guolin Ke committed
241
  /*! \brief Queries data */
Guolin Ke's avatar
Guolin Ke committed
242
  std::vector<data_size_t> queries_;
243
244
  /*! \brief mutex for threading safe call */
  std::mutex mutex_;
245
246
247
  bool weight_load_from_file_;
  bool query_load_from_file_;
  bool init_score_load_from_file_;
Guolin Ke's avatar
Guolin Ke committed
248
249
250
251
252
};


/*! \brief Interface for Parser */
class Parser {
 public:
  /*! \brief virtual destructor */
  virtual ~Parser() = default;

  /*!
  * \brief Parse one line with label
  * \param str One line record, string format, should end with '\0'
  * \param out_features Output columns, store in (column_idx, values)
  * \param out_label Label will store to this if exists
  */
  virtual void ParseOneLine(const char* str,
                            std::vector<std::pair<int, double>>* out_features, double* out_label) const = 0;

  /*!
  * \brief Get the number of features reported by this parser
  * \return Number of features
  */
  virtual int NumFeatures() const = 0;

  /*!
  * \brief Create an object of parser, will auto choose the format depend on file
  * \param filename One Filename of data
  * \param header NOTE(review): presumably whether the file starts with a header line; confirm in the implementation
  * \param num_features Pass num_features of this data file if you know, <=0 means don't know
  * \param label_idx index of label column
  * \return Object of parser
  */
  static Parser* CreateParser(const char* filename, bool header, int num_features, int label_idx);
};

278
struct TrainingShareStates {
279
  int num_threads = 0;
280
281
282
283
284
285
286
  bool is_colwise = true;
  bool is_use_subcol = false;
  bool is_use_subrow = false;
  bool is_subrow_copied = false;
  bool is_constant_hessian = true;
  const data_size_t* bagging_use_indices;
  data_size_t bagging_indices_cnt;
287
288
  int num_bin_aligned;
  std::unique_ptr<MultiValBin> multi_val_bin;
289
  std::unique_ptr<MultiValBin> multi_val_bin_subset;
290
291
292
  std::vector<uint32_t> hist_move_src;
  std::vector<uint32_t> hist_move_dest;
  std::vector<uint32_t> hist_move_size;
293
294
  std::vector<hist_t, Common::AlignmentAllocator<hist_t, kAlignedSize>>
      hist_buf;
295
296

  void SetMultiValBin(MultiValBin* bin) {
297
    num_threads = OMP_NUM_THREADS();
298
299
300
301
    if (bin == nullptr) {
      return;
    }
    multi_val_bin.reset(bin);
Nikita Titov's avatar
Nikita Titov committed
302
    num_bin_aligned =
303
304
305
306
307
308
309
310
        (bin->num_bin() + kAlignedSize - 1) / kAlignedSize * kAlignedSize;
    size_t new_size = static_cast<size_t>(num_bin_aligned) * 2 * num_threads;
    if (new_size > hist_buf.size()) {
      hist_buf.resize(static_cast<size_t>(num_bin_aligned) * 2 * num_threads);
    }
  }

  hist_t* TempBuf() {
311
    if (!is_use_subcol) {
312
313
314
315
316
317
      return nullptr;
    }
    return hist_buf.data() + hist_buf.size() - num_bin_aligned * 2;
  }

  void HistMove(const hist_t* src, hist_t* dest) {
318
    if (!is_use_subcol) {
319
320
321
322
323
324
325
326
327
328
      return;
    }
#pragma omp parallel for schedule(static)
    for (int i = 0; i < static_cast<int>(hist_move_src.size()); ++i) {
      std::copy_n(src + hist_move_src[i], hist_move_size[i],
                  dest + hist_move_dest[i]);
    }
  }
};

Guolin Ke's avatar
Guolin Ke committed
329
/*! \brief The main class of data set,
*          which is used for training or validation
*/
class Dataset {
Nikita Titov's avatar
Nikita Titov committed
333
 public:
Guolin Ke's avatar
Guolin Ke committed
334
  friend DatasetLoader;
Guolin Ke's avatar
Guolin Ke committed
335

336
  LIGHTGBM_EXPORT Dataset();
Guolin Ke's avatar
Guolin Ke committed
337

338
  LIGHTGBM_EXPORT Dataset(data_size_t num_data);
Guolin Ke's avatar
Guolin Ke committed
339

Guolin Ke's avatar
Guolin Ke committed
340
  void Construct(
Guolin Ke's avatar
Guolin Ke committed
341
    std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
342
    int num_total_features,
343
    const std::vector<std::vector<double>>& forced_bins,
344
    int** sample_non_zero_indices,
Guolin Ke's avatar
Guolin Ke committed
345
    double** sample_values,
346
    const int* num_per_col,
347
    int num_sample_col,
Guolin Ke's avatar
Guolin Ke committed
348
    size_t total_sample_cnt,
Guolin Ke's avatar
Guolin Ke committed
349
    const Config& io_config);
Guolin Ke's avatar
Guolin Ke committed
350

Guolin Ke's avatar
Guolin Ke committed
351
  /*! \brief Destructor */
352
  LIGHTGBM_EXPORT ~Dataset();
Guolin Ke's avatar
Guolin Ke committed
353

354
  LIGHTGBM_EXPORT bool CheckAlign(const Dataset& other) const {
355
356
357
358
359
360
361
362
363
364
    if (num_features_ != other.num_features_) {
      return false;
    }
    if (num_total_features_ != other.num_total_features_) {
      return false;
    }
    if (label_idx_ != other.label_idx_) {
      return false;
    }
    for (int i = 0; i < num_features_; ++i) {
Guolin Ke's avatar
Guolin Ke committed
365
      if (!FeatureBinMapper(i)->CheckAlign(*(other.FeatureBinMapper(i)))) {
366
367
368
369
370
371
        return false;
      }
    }
    return true;
  }

Guolin Ke's avatar
Guolin Ke committed
372
373
374
375
376
377
378
379
380
381
  inline void FinishOneRow(int tid, data_size_t row_idx, const std::vector<bool>& is_feature_added) {
    if (is_finish_load_) { return; }
    for (auto fidx : feature_need_push_zeros_) {
      if (is_feature_added[fidx]) { continue; }
      const int group = feature2group_[fidx];
      const int sub_feature = feature2subfeature_[fidx];
      feature_groups_[group]->PushData(tid, sub_feature, row_idx, 0.0f);
    }
  }

Guolin Ke's avatar
Guolin Ke committed
382
  inline void PushOneRow(int tid, data_size_t row_idx, const std::vector<double>& feature_values) {
Guolin Ke's avatar
Guolin Ke committed
383
    if (is_finish_load_) { return; }
Guolin Ke's avatar
Guolin Ke committed
384
    for (size_t i = 0; i < feature_values.size() && i < static_cast<size_t>(num_total_features_); ++i) {
Guolin Ke's avatar
Guolin Ke committed
385
386
      int feature_idx = used_feature_map_[i];
      if (feature_idx >= 0) {
Guolin Ke's avatar
Guolin Ke committed
387
388
389
        const int group = feature2group_[feature_idx];
        const int sub_feature = feature2subfeature_[feature_idx];
        feature_groups_[group]->PushData(tid, sub_feature, row_idx, feature_values[i]);
Guolin Ke's avatar
Guolin Ke committed
390
391
392
393
      }
    }
  }

394
  inline void PushOneRow(int tid, data_size_t row_idx, const std::vector<std::pair<int, double>>& feature_values) {
Guolin Ke's avatar
Guolin Ke committed
395
    if (is_finish_load_) { return; }
Guolin Ke's avatar
Guolin Ke committed
396
    std::vector<bool> is_feature_added(num_features_, false);
397
    for (auto& inner_data : feature_values) {
398
      if (inner_data.first >= num_total_features_) { continue; }
399
400
      int feature_idx = used_feature_map_[inner_data.first];
      if (feature_idx >= 0) {
Guolin Ke's avatar
Guolin Ke committed
401
        is_feature_added[feature_idx] = true;
Guolin Ke's avatar
Guolin Ke committed
402
403
404
        const int group = feature2group_[feature_idx];
        const int sub_feature = feature2subfeature_[feature_idx];
        feature_groups_[group]->PushData(tid, sub_feature, row_idx, inner_data.second);
405
406
      }
    }
Guolin Ke's avatar
Guolin Ke committed
407
    FinishOneRow(tid, row_idx, is_feature_added);
408
409
  }

Guolin Ke's avatar
Guolin Ke committed
410
411
412
413
414
415
416
417
418
  inline void PushOneData(int tid, data_size_t row_idx, int group, int sub_feature, double value) {
    feature_groups_[group]->PushData(tid, sub_feature, row_idx, value);
  }

  inline int RealFeatureIndex(int fidx) const {
    return real_feature_idx_[fidx];
  }

  inline int InnerFeatureIndex(int col_idx) const {
Guolin Ke's avatar
Guolin Ke committed
419
    return used_feature_map_[col_idx];
Guolin Ke's avatar
Guolin Ke committed
420
  }
Guolin Ke's avatar
Guolin Ke committed
421
422
423
424
425
426
  inline int Feature2Group(int feature_idx) const {
    return feature2group_[feature_idx];
  }
  inline int Feture2SubFeature(int feature_idx) const {
    return feature2subfeature_[feature_idx];
  }
427
428
429
  inline uint64_t GroupBinBoundary(int group_idx) const {
    return group_bin_boundaries_[group_idx];
  }
Guolin Ke's avatar
Guolin Ke committed
430
431
432
  inline uint64_t NumTotalBin() const {
    return group_bin_boundaries_.back();
  }
433

434
435
436
437
438
439
440
441
442
  inline std::vector<int> ValidFeatureIndices() const {
    std::vector<int> ret;
    for (int i = 0; i < num_total_features_; ++i) {
      if (used_feature_map_[i] >= 0) {
        ret.push_back(i);
      }
    }
    return ret;
  }
Guolin Ke's avatar
Guolin Ke committed
443
444
  void ReSize(data_size_t num_data);

445
  void CopySubrow(const Dataset* fullset, const data_size_t* used_indices, data_size_t num_used_indices, bool need_meta_data);
Guolin Ke's avatar
Guolin Ke committed
446

447
448
449
450
  MultiValBin* GetMultiBinFromSparseFeatures() const;

  MultiValBin* GetMultiBinFromAllFeatures() const;

451
452
453
454
  TrainingShareStates* GetShareStates(
      score_t* gradients, score_t* hessians,
      const std::vector<int8_t>& is_feature_used, bool is_constant_hessian,
      bool force_colwise, bool force_rowwise) const;
455

456
  LIGHTGBM_EXPORT void FinishLoad();
Guolin Ke's avatar
Guolin Ke committed
457

458
  LIGHTGBM_EXPORT bool SetFloatField(const char* field_name, const float* field_data, data_size_t num_element);
Guolin Ke's avatar
Guolin Ke committed
459

460
  LIGHTGBM_EXPORT bool SetDoubleField(const char* field_name, const double* field_data, data_size_t num_element);
Guolin Ke's avatar
Guolin Ke committed
461

462
  LIGHTGBM_EXPORT bool SetIntField(const char* field_name, const int* field_data, data_size_t num_element);
463

464
  LIGHTGBM_EXPORT bool GetFloatField(const char* field_name, data_size_t* out_len, const float** out_ptr);
465

466
  LIGHTGBM_EXPORT bool GetDoubleField(const char* field_name, data_size_t* out_len, const double** out_ptr);
Guolin Ke's avatar
Guolin Ke committed
467

468
  LIGHTGBM_EXPORT bool GetIntField(const char* field_name, data_size_t* out_len, const int** out_ptr);
469

Guolin Ke's avatar
Guolin Ke committed
470
471
472
  /*!
  * \brief Save current dataset into binary file, will save to "filename.bin"
  */
473
  LIGHTGBM_EXPORT void SaveBinaryFile(const char* bin_filename);
Guolin Ke's avatar
Guolin Ke committed
474

475
476
  LIGHTGBM_EXPORT void DumpTextFile(const char* text_filename);

477
  LIGHTGBM_EXPORT void CopyFeatureMapperFrom(const Dataset* dataset);
Guolin Ke's avatar
Guolin Ke committed
478

Guolin Ke's avatar
Guolin Ke committed
479
480
  LIGHTGBM_EXPORT void CreateValid(const Dataset* dataset);

481
  void InitTrain(const std::vector<int8_t>& is_feature_used,
482
                 TrainingShareStates* share_state) const;
483

Guolin Ke's avatar
Guolin Ke committed
484
485
486
487
488
489
490
491
492
493
494
  template <bool USE_INDICES, bool USE_HESSIAN>
  void ConstructHistogramsInner(const std::vector<int8_t>& is_feature_used,
                                const data_size_t* data_indices,
                                data_size_t num_data, const score_t* gradients,
                                const score_t* hessians,
                                score_t* ordered_gradients,
                                score_t* ordered_hessians,
                                TrainingShareStates* share_state,
                                hist_t* hist_data) const;

  template <bool USE_INDICES, bool ORDERED>
495
496
497
498
  void ConstructHistogramsMultiVal(const data_size_t* data_indices,
                                   data_size_t num_data,
                                   const score_t* gradients,
                                   const score_t* hessians,
499
                                   TrainingShareStates* share_state,
Guolin Ke's avatar
Guolin Ke committed
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
                                   hist_t* hist_data) const;

  inline void ConstructHistograms(
      const std::vector<int8_t>& is_feature_used,
      const data_size_t* data_indices, data_size_t num_data,
      const score_t* gradients, const score_t* hessians,
      score_t* ordered_gradients, score_t* ordered_hessians,
      TrainingShareStates* share_state, hist_t* hist_data) const {
    if (num_data <= 0) {
      return;
    }
    bool use_indices = data_indices != nullptr && (num_data < num_data_);
    if (share_state->is_constant_hessian) {
      if (use_indices) {
        ConstructHistogramsInner<true, false>(
            is_feature_used, data_indices, num_data, gradients, hessians,
            ordered_gradients, ordered_hessians, share_state, hist_data);
      } else {
        ConstructHistogramsInner<false, false>(
            is_feature_used, data_indices, num_data, gradients, hessians,
            ordered_gradients, ordered_hessians, share_state, hist_data);
      }
    } else {
      if (use_indices) {
        ConstructHistogramsInner<true, true>(
            is_feature_used, data_indices, num_data, gradients, hessians,
            ordered_gradients, ordered_hessians, share_state, hist_data);
      } else {
        ConstructHistogramsInner<false, true>(
            is_feature_used, data_indices, num_data, gradients, hessians,
            ordered_gradients, ordered_hessians, share_state, hist_data);
      }
    }
  }
Guolin Ke's avatar
Guolin Ke committed
534

535
  void FixHistogram(int feature_idx, double sum_gradient, double sum_hessian, hist_t* data) const;
Guolin Ke's avatar
Guolin Ke committed
536

537
538
  inline data_size_t Split(int feature, const uint32_t* threshold,
                           int num_threshold, bool default_left,
539
                           const data_size_t* data_indices,
540
541
                           data_size_t cnt, data_size_t* lte_indices,
                           data_size_t* gt_indices) const {
Guolin Ke's avatar
Guolin Ke committed
542
543
    const int group = feature2group_[feature];
    const int sub_feature = feature2subfeature_[feature];
544
545
546
    return feature_groups_[group]->Split(
        sub_feature, threshold, num_threshold, default_left, data_indices,
        cnt, lte_indices, gt_indices);
Guolin Ke's avatar
Guolin Ke committed
547
548
549
550
551
552
553
554
555
556
557
558
559
560
  }

  inline int SubFeatureBinOffset(int i) const {
    const int sub_feature = feature2subfeature_[i];
    if (sub_feature == 0) {
      return 1;
    } else {
      return 0;
    }
  }

  inline int FeatureNumBin(int i) const {
    const int group = feature2group_[i];
    const int sub_feature = feature2subfeature_[i];
561
    return feature_groups_[group]->bin_mappers_[sub_feature]->num_bin();
Guolin Ke's avatar
Guolin Ke committed
562
  }
Guolin Ke's avatar
Guolin Ke committed
563

564
565
566
  inline int FeatureGroupNumBin(int group) const {
    return feature_groups_[group]->num_total_bin_;
  }
567

Guolin Ke's avatar
Guolin Ke committed
568
569
570
571
572
573
  inline const BinMapper* FeatureBinMapper(int i) const {
    const int group = feature2group_[i];
    const int sub_feature = feature2subfeature_[i];
    return feature_groups_[group]->bin_mappers_[sub_feature].get();
  }

574
575
576
577
  inline const Bin* FeatureGroupBin(int group) const {
    return feature_groups_[group]->bin_data_.get();
  }

Guolin Ke's avatar
Guolin Ke committed
578
579
580
  inline BinIterator* FeatureIterator(int i) const {
    const int group = feature2group_[i];
    const int sub_feature = feature2subfeature_[i];
zhangyafeikimi's avatar
zhangyafeikimi committed
581
    return feature_groups_[group]->SubFeatureIterator(sub_feature);
Guolin Ke's avatar
Guolin Ke committed
582
583
  }

584
585
586
  inline BinIterator* FeatureGroupIterator(int group) const {
    return feature_groups_[group]->FeatureGroupIterator();
  }
587

588
589
590
591
  inline bool IsMultiGroup(int i) const {
    return feature_groups_[i]->is_multi_val_;
  }

Guolin Ke's avatar
Guolin Ke committed
592
593
594
595
596
597
  inline double RealThreshold(int i, uint32_t threshold) const {
    const int group = feature2group_[i];
    const int sub_feature = feature2subfeature_[i];
    return feature_groups_[group]->bin_mappers_[sub_feature]->BinToValue(threshold);
  }

598
599
600
601
602
603
604
  // given a real threshold, find the closest threshold bin
  inline uint32_t BinThreshold(int i, double threshold_double) const {
    const int group = feature2group_[i];
    const int sub_feature = feature2subfeature_[i];
    return feature_groups_[group]->bin_mappers_[sub_feature]->ValueToBin(threshold_double);
  }

Guolin Ke's avatar
Guolin Ke committed
605
606
607
608
609
610
611
612
613
  /*!
  * \brief Get meta data pointer
  * \return Pointer of meta data
  */
  inline const Metadata& metadata() const { return metadata_; }

  /*! \brief Get Number of used features */
  inline int num_features() const { return num_features_; }

614
615
616
  /*! \brief Get Number of feature groups */
  inline int num_feature_groups() const { return num_groups_;}

617
618
619
  /*! \brief Get Number of total features */
  inline int num_total_features() const { return num_total_features_; }

Guolin Ke's avatar
Guolin Ke committed
620
621
622
623
  /*! \brief Get the index of label column */
  inline int label_idx() const { return label_idx_; }

  /*! \brief Get names of current data set */
Guolin Ke's avatar
Guolin Ke committed
624
625
626
627
  inline const std::vector<std::string>& feature_names() const { return feature_names_; }

  inline void set_feature_names(const std::vector<std::string>& feature_names) {
    if (feature_names.size() != static_cast<size_t>(num_total_features_)) {
628
      Log::Fatal("Size of feature_names error, should equal with total number of features");
Guolin Ke's avatar
Guolin Ke committed
629
630
    }
    feature_names_ = std::vector<std::string>(feature_names);
Guolin Ke's avatar
Guolin Ke committed
631
    std::unordered_set<std::string> feature_name_set;
632
633
    // replace ' ' in feature_names with '_'
    bool spaceInFeatureName = false;
634
    for (auto& feature_name : feature_names_) {
635
636
637
      // check json
      if (!Common::CheckAllowedJSON(feature_name)) {
        Log::Fatal("Do not support special JSON characters in feature name.");
638
      }
639
      if (feature_name.find(' ') != std::string::npos) {
640
641
642
        spaceInFeatureName = true;
        std::replace(feature_name.begin(), feature_name.end(), ' ', '_');
      }
Guolin Ke's avatar
Guolin Ke committed
643
644
645
646
      if (feature_name_set.count(feature_name) > 0) {
        Log::Fatal("Feature (%s) appears more than one time.", feature_name.c_str());
      }
      feature_name_set.insert(feature_name);
647
    }
648
    if (spaceInFeatureName) {
649
650
      Log::Warning("Find whitespaces in feature_names, replace with underlines");
    }
Guolin Ke's avatar
Guolin Ke committed
651
  }
Guolin Ke's avatar
Guolin Ke committed
652

Guolin Ke's avatar
Guolin Ke committed
653
654
  inline std::vector<std::string> feature_infos() const {
    std::vector<std::string> bufs;
655
    for (int i = 0; i < num_total_features_; ++i) {
Guolin Ke's avatar
Guolin Ke committed
656
      int fidx = used_feature_map_[i];
657
      if (fidx < 0) {
Guolin Ke's avatar
Guolin Ke committed
658
659
660
        bufs.push_back("none");
      } else {
        const auto bin_mapper = FeatureBinMapper(fidx);
661
        bufs.push_back(bin_mapper->bin_info_string());
Guolin Ke's avatar
Guolin Ke committed
662
663
664
665
666
      }
    }
    return bufs;
  }

Guolin Ke's avatar
Guolin Ke committed
667
668
669
670
671
672
673
674
  /*! \brief Get Number of data */
  inline data_size_t num_data() const { return num_data_; }

  /*! \brief Disable copy */
  Dataset& operator=(const Dataset&) = delete;
  /*! \brief Disable copy */
  Dataset(const Dataset&) = delete;

675
  void AddFeaturesFrom(Dataset* other);
676

Nikita Titov's avatar
Nikita Titov committed
677
 private:
Guolin Ke's avatar
Guolin Ke committed
678
  std::string data_filename_;
Guolin Ke's avatar
Guolin Ke committed
679
  /*! \brief Store used features */
Guolin Ke's avatar
Guolin Ke committed
680
  std::vector<std::unique_ptr<FeatureGroup>> feature_groups_;
Guolin Ke's avatar
Guolin Ke committed
681
682
683
684
  /*! \brief Mapper from real feature index to used index*/
  std::vector<int> used_feature_map_;
  /*! \brief Number of used features*/
  int num_features_;
685
686
  /*! \brief Number of total features*/
  int num_total_features_;
Guolin Ke's avatar
Guolin Ke committed
687
688
689
690
  /*! \brief Number of total data*/
  data_size_t num_data_;
  /*! \brief Store some label level data*/
  Metadata metadata_;
Guolin Ke's avatar
Guolin Ke committed
691
692
693
694
  /*! \brief index of label column */
  int label_idx_ = 0;
  /*! \brief store feature names */
  std::vector<std::string> feature_names_;
695
696
  /*! \brief store feature names */
  static const char* binary_file_token;
Guolin Ke's avatar
Guolin Ke committed
697
698
699
700
701
702
703
  int num_groups_;
  std::vector<int> real_feature_idx_;
  std::vector<int> feature2group_;
  std::vector<int> feature2subfeature_;
  std::vector<uint64_t> group_bin_boundaries_;
  std::vector<int> group_feature_start_;
  std::vector<int> group_feature_cnt_;
Guolin Ke's avatar
Guolin Ke committed
704
  bool is_finish_load_;
705
  int max_bin_;
Belinda Trotta's avatar
Belinda Trotta committed
706
  std::vector<int32_t> max_bin_by_feature_;
707
  std::vector<std::vector<double>> forced_bin_bounds_;
708
709
710
711
  int bin_construct_sample_cnt_;
  int min_data_in_bin_;
  bool use_missing_;
  bool zero_as_missing_;
Guolin Ke's avatar
Guolin Ke committed
712
  std::vector<int> feature_need_push_zeros_;
Guolin Ke's avatar
Guolin Ke committed
713
714
715
716
};

}  // namespace LightGBM

#endif   // LIGHTGBM_DATASET_H_