/*!
 * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */
#ifndef LIGHTGBM_DATASET_H_
#define LIGHTGBM_DATASET_H_
Guolin Ke's avatar
Guolin Ke committed
7

Guolin Ke's avatar
Guolin Ke committed
8
#include <LightGBM/config.h>
#include <LightGBM/feature_group.h>
#include <LightGBM/meta.h>
#include <LightGBM/utils/common.h>
#include <LightGBM/utils/openmp_wrapper.h>
#include <LightGBM/utils/random.h>
#include <LightGBM/utils/text_reader.h>

#include <string>
#include <algorithm>
#include <cstdint>
#include <functional>
#include <memory>
#include <mutex>
#include <unordered_set>
#include <utility>
#include <vector>
Guolin Ke's avatar
Guolin Ke committed
23
24
25
26

namespace LightGBM {

/*! \brief Forward declaration; DatasetLoader is declared a friend of Dataset below */
class DatasetLoader;
Guolin Ke's avatar
Guolin Ke committed
28
/*!
Hui Xue's avatar
Hui Xue committed
29
* \brief This class is used to store some meta(non-feature) data for training data,
30
*        e.g. labels, weights, initial scores, query level informations.
Guolin Ke's avatar
Guolin Ke committed
31
*
Qiwei Ye's avatar
Qiwei Ye committed
32
*        Some details:
33
*        1. Label, used for training.
Qiwei Ye's avatar
Qiwei Ye committed
34
35
*        2. Weights, weighs of records, optional
*        3. Query Boundaries, necessary for lambdarank.
36
37
38
39
*           The documents of i-th query is in [ query_boundaries[i], query_boundaries[i+1] )
*        4. Query Weights, auto calculate by weights and query_boundaries(if both of them are existed)
*           the weight for i-th query is sum(query_boundaries[i] , .., query_boundaries[i+1]) / (query_boundaries[i + 1] -  query_boundaries[i+1])
*        5. Initial score. optional. if existing, the model will boost from this score, otherwise will start from 0.
Guolin Ke's avatar
Guolin Ke committed
40
41
*/
class Metadata {
Nikita Titov's avatar
Nikita Titov committed
42
 public:
43
  /*!
44
  * \brief Null constructor
Guolin Ke's avatar
Guolin Ke committed
45
46
47
  */
  Metadata();
  /*!
48
  * \brief Initialization will load query level informations, since it is need for sampling data
Guolin Ke's avatar
Guolin Ke committed
49
50
51
  * \param data_filename Filename of data
  * \param init_score_filename Filename of initial score
  */
52
  void Init(const char* data_filename, const char* initscore_file);
Guolin Ke's avatar
Guolin Ke committed
53
  /*!
Guolin Ke's avatar
Guolin Ke committed
54
55
  * \brief init as subset
  * \param metadata Filename of data
56
  * \param used_indices
Guolin Ke's avatar
Guolin Ke committed
57
58
59
60
  * \param num_used_indices
  */
  void Init(const Metadata& metadata, const data_size_t* used_indices, data_size_t num_used_indices);
  /*!
Guolin Ke's avatar
Guolin Ke committed
61
62
63
64
65
66
67
68
  * \brief Initial with binary memory
  * \param memory Pointer to memory
  */
  void LoadFromMemory(const void* memory);
  /*! \brief Destructor */
  ~Metadata();

  /*!
Guolin Ke's avatar
Guolin Ke committed
69
  * \brief Initial work, will allocate space for label, weight(if exists) and query(if exists)
Guolin Ke's avatar
Guolin Ke committed
70
  * \param num_data Number of training data
Guolin Ke's avatar
Guolin Ke committed
71
72
  * \param weight_idx Index of weight column, < 0 means doesn't exists
  * \param query_idx Index of query id column, < 0 means doesn't exists
Guolin Ke's avatar
Guolin Ke committed
73
  */
74
  void Init(data_size_t num_data, int weight_idx, int query_idx);
Guolin Ke's avatar
Guolin Ke committed
75
76
77

  /*!
  * \brief Partition label by used indices
78
  * \param used_indices Indices of local used
Guolin Ke's avatar
Guolin Ke committed
79
80
81
82
83
84
85
86
87
  */
  void PartitionLabel(const std::vector<data_size_t>& used_indices);

  /*!
  * \brief Partition meta data according to local used indices if need
  * \param num_all_data Number of total training data, including other machines' data on parallel learning
  * \param used_data_indices Indices of local used training data
  */
  void CheckOrPartition(data_size_t num_all_data,
88
                        const std::vector<data_size_t>& used_data_indices);
Guolin Ke's avatar
Guolin Ke committed
89

90
  void SetLabel(const label_t* label, data_size_t len);
Guolin Ke's avatar
Guolin Ke committed
91

92
  void SetWeights(const label_t* weights, data_size_t len);
Guolin Ke's avatar
Guolin Ke committed
93

Guolin Ke's avatar
Guolin Ke committed
94
  void SetQuery(const data_size_t* query, data_size_t len);
Guolin Ke's avatar
Guolin Ke committed
95

Guolin Ke's avatar
Guolin Ke committed
96
97
98
99
  /*!
  * \brief Set initial scores
  * \param init_score Initial scores, this class will manage memory for init_score.
  */
100
  void SetInitScore(const double* init_score, data_size_t len);
Guolin Ke's avatar
Guolin Ke committed
101
102
103
104
105
106


  /*!
  * \brief Save binary data to file
  * \param file File want to write
  */
107
  void SaveBinaryToFile(const VirtualFileWriter* writer) const;
Guolin Ke's avatar
Guolin Ke committed
108
109
110
111
112
113
114
115
116
117

  /*!
  * \brief Get sizes in byte of this object
  */
  size_t SizesInByte() const;

  /*!
  * \brief Get pointer of label
  * \return Pointer of label
  */
118
  inline const label_t* label() const { return label_.data(); }
Guolin Ke's avatar
Guolin Ke committed
119
120
121
122
123
124

  /*!
  * \brief Set label for one record
  * \param idx Index of this record
  * \param value Label value of this record
  */
125
  inline void SetLabelAt(data_size_t idx, label_t value) {
126
    label_[idx] = value;
Guolin Ke's avatar
Guolin Ke committed
127
128
  }

Guolin Ke's avatar
Guolin Ke committed
129
130
131
132
133
  /*!
  * \brief Set Weight for one record
  * \param idx Index of this record
  * \param value Weight value of this record
  */
134
  inline void SetWeightAt(data_size_t idx, label_t value) {
135
    weights_[idx] = value;
Guolin Ke's avatar
Guolin Ke committed
136
137
138
139
140
141
142
  }

  /*!
  * \brief Set Query Id for one record
  * \param idx Index of this record
  * \param value Query Id value of this record
  */
143
  inline void SetQueryAt(data_size_t idx, data_size_t value) {
Guolin Ke's avatar
Guolin Ke committed
144
145
146
    queries_[idx] = static_cast<data_size_t>(value);
  }

Guolin Ke's avatar
Guolin Ke committed
147
  /*!
Hui Xue's avatar
Hui Xue committed
148
  * \brief Get weights, if not exists, will return nullptr
Guolin Ke's avatar
Guolin Ke committed
149
150
  * \return Pointer of weights
  */
151
  inline const label_t* weights() const {
Guolin Ke's avatar
Guolin Ke committed
152
    if (!weights_.empty()) {
Guolin Ke's avatar
Guolin Ke committed
153
154
155
156
157
      return weights_.data();
    } else {
      return nullptr;
    }
  }
Guolin Ke's avatar
Guolin Ke committed
158
159

  /*!
Hui Xue's avatar
Hui Xue committed
160
  * \brief Get data boundaries on queries, if not exists, will return nullptr
161
  *        we assume data will order by query,
Guolin Ke's avatar
Guolin Ke committed
162
163
164
165
  *        the interval of [query_boundaris[i], query_boundaris[i+1])
  *        is the data indices for query i.
  * \return Pointer of data boundaries on queries
  */
166
  inline const data_size_t* query_boundaries() const {
Guolin Ke's avatar
Guolin Ke committed
167
    if (!query_boundaries_.empty()) {
Guolin Ke's avatar
Guolin Ke committed
168
169
170
171
172
      return query_boundaries_.data();
    } else {
      return nullptr;
    }
  }
Guolin Ke's avatar
Guolin Ke committed
173
174
175
176
177

  /*!
  * \brief Get Number of queries
  * \return Number of queries
  */
178
  inline data_size_t num_queries() const { return num_queries_; }
Guolin Ke's avatar
Guolin Ke committed
179
180

  /*!
Hui Xue's avatar
Hui Xue committed
181
  * \brief Get weights for queries, if not exists, will return nullptr
Guolin Ke's avatar
Guolin Ke committed
182
183
  * \return Pointer of weights for queries
  */
184
  inline const label_t* query_weights() const {
Guolin Ke's avatar
Guolin Ke committed
185
    if (!query_weights_.empty()) {
Guolin Ke's avatar
Guolin Ke committed
186
187
188
189
190
      return query_weights_.data();
    } else {
      return nullptr;
    }
  }
Guolin Ke's avatar
Guolin Ke committed
191
192

  /*!
Hui Xue's avatar
Hui Xue committed
193
  * \brief Get initial scores, if not exists, will return nullptr
Guolin Ke's avatar
Guolin Ke committed
194
195
  * \return Pointer of initial scores
  */
196
  inline const double* init_score() const {
Guolin Ke's avatar
Guolin Ke committed
197
    if (!init_score_.empty()) {
Guolin Ke's avatar
Guolin Ke committed
198
199
200
201
202
      return init_score_.data();
    } else {
      return nullptr;
    }
  }
Guolin Ke's avatar
Guolin Ke committed
203

204
205
206
  /*!
  * \brief Get size of initial scores
  */
Guolin Ke's avatar
Guolin Ke committed
207
  inline int64_t num_init_score() const { return num_init_score_; }
208

Guolin Ke's avatar
Guolin Ke committed
209
210
211
212
  /*! \brief Disable copy */
  Metadata& operator=(const Metadata&) = delete;
  /*! \brief Disable copy */
  Metadata(const Metadata&) = delete;
Guolin Ke's avatar
Guolin Ke committed
213

Nikita Titov's avatar
Nikita Titov committed
214
 private:
Guolin Ke's avatar
Guolin Ke committed
215
  /*! \brief Load initial scores from file */
216
  void LoadInitialScore(const char* initscore_file);
Guolin Ke's avatar
Guolin Ke committed
217
218
219
220
221
222
223
  /*! \brief Load wights from file */
  void LoadWeights();
  /*! \brief Load query boundaries from file */
  void LoadQueryBoundaries();
  /*! \brief Load query wights */
  void LoadQueryWeights();
  /*! \brief Filename of current data */
Guolin Ke's avatar
Guolin Ke committed
224
  std::string data_filename_;
Guolin Ke's avatar
Guolin Ke committed
225
226
227
228
229
  /*! \brief Number of data */
  data_size_t num_data_;
  /*! \brief Number of weights, used to check correct weight file */
  data_size_t num_weights_;
  /*! \brief Label data */
230
  std::vector<label_t> label_;
Guolin Ke's avatar
Guolin Ke committed
231
  /*! \brief Weights data */
232
  std::vector<label_t> weights_;
Guolin Ke's avatar
Guolin Ke committed
233
  /*! \brief Query boundaries */
Guolin Ke's avatar
Guolin Ke committed
234
  std::vector<data_size_t> query_boundaries_;
Guolin Ke's avatar
Guolin Ke committed
235
  /*! \brief Query weights */
236
  std::vector<label_t> query_weights_;
Guolin Ke's avatar
Guolin Ke committed
237
238
239
  /*! \brief Number of querys */
  data_size_t num_queries_;
  /*! \brief Number of Initial score, used to check correct weight file */
Guolin Ke's avatar
Guolin Ke committed
240
  int64_t num_init_score_;
Guolin Ke's avatar
Guolin Ke committed
241
  /*! \brief Initial score */
Guolin Ke's avatar
Guolin Ke committed
242
  std::vector<double> init_score_;
Guolin Ke's avatar
Guolin Ke committed
243
  /*! \brief Queries data */
Guolin Ke's avatar
Guolin Ke committed
244
  std::vector<data_size_t> queries_;
245
246
  /*! \brief mutex for threading safe call */
  std::mutex mutex_;
247
248
249
  bool weight_load_from_file_;
  bool query_load_from_file_;
  bool init_score_load_from_file_;
Guolin Ke's avatar
Guolin Ke committed
250
251
252
253
254
};


/*! \brief Interface for Parser */
class Parser {
 public:
  /*! \brief virtual destructor */
  virtual ~Parser() = default;

  /*!
  * \brief Parse one line with label
  * \param str One line record, string format, should end with '\0'
  * \param out_features Output columns, store in (column_idx, values)
  * \param out_label Label will store to this if exists
  */
  virtual void ParseOneLine(const char* str,
                            std::vector<std::pair<int, double>>* out_features,
                            double* out_label) const = 0;

  /*!
  * \brief Number of features reported by this parser
  *        NOTE(review): exact meaning (e.g. whether the label column is counted)
  *        is defined by the concrete parsers -- confirm there.
  */
  virtual int NumFeatures() const = 0;

  /*!
  * \brief Create an object of parser, will auto choose the format depend on file
  * \param filename One Filename of data
  * \param header Header flag for the data file
  *        NOTE(review): presumably true when the file starts with a header line -- confirm in the implementation
  * \param num_features Pass num_features of this data file if you know, <=0 means don't know
  * \param label_idx index of label column
  * \return Object of parser (raw pointer; the factory is defined out of line --
  *         presumably the caller takes ownership, confirm in the implementation)
  */
  static Parser* CreateParser(const char* filename, bool header, int num_features, int label_idx);
};

280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
struct TrainingTempState {
  std::vector<hist_t, Common::AlignmentAllocator<hist_t, kAlignedSize>>
      hist_buf;
  int num_bin_aligned;
  bool use_subfeature;
  std::unique_ptr<MultiValBin> multi_val_bin;
  std::unique_ptr<MultiValBin> multi_val_bin_subfeature;
  std::vector<uint32_t> hist_move_src;
  std::vector<uint32_t> hist_move_dest;
  std::vector<uint32_t> hist_move_size;

  void SetMultiValBin(MultiValBin* bin) {
    if (bin == nullptr) {
      return;
    }
    multi_val_bin.reset(bin);
    int num_threads = 1;
#pragma omp parallel
#pragma omp master
    { num_threads = omp_get_num_threads(); }
Nikita Titov's avatar
Nikita Titov committed
300
    num_bin_aligned =
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
        (bin->num_bin() + kAlignedSize - 1) / kAlignedSize * kAlignedSize;
    size_t new_size = static_cast<size_t>(num_bin_aligned) * 2 * num_threads;
    if (new_size > hist_buf.size()) {
      hist_buf.resize(static_cast<size_t>(num_bin_aligned) * 2 * num_threads);
    }
  }

  hist_t* TempBuf() {
    if (!use_subfeature) {
      return nullptr;
    }
    return hist_buf.data() + hist_buf.size() - num_bin_aligned * 2;
  }

  void HistMove(const hist_t* src, hist_t* dest) {
    if (!use_subfeature) {
      return;
    }
#pragma omp parallel for schedule(static)
    for (int i = 0; i < static_cast<int>(hist_move_src.size()); ++i) {
      std::copy_n(src + hist_move_src[i], hist_move_size[i],
                  dest + hist_move_dest[i]);
    }
  }
};

Guolin Ke's avatar
Guolin Ke committed
327
/*! \brief The main class of data set,
328
*          which are used to training or validation
Guolin Ke's avatar
Guolin Ke committed
329
330
*/
class Dataset {
Nikita Titov's avatar
Nikita Titov committed
331
 public:
Guolin Ke's avatar
Guolin Ke committed
332
  friend DatasetLoader;
Guolin Ke's avatar
Guolin Ke committed
333

334
  LIGHTGBM_EXPORT Dataset();
Guolin Ke's avatar
Guolin Ke committed
335

336
  LIGHTGBM_EXPORT Dataset(data_size_t num_data);
Guolin Ke's avatar
Guolin Ke committed
337

Guolin Ke's avatar
Guolin Ke committed
338
  void Construct(
Guolin Ke's avatar
Guolin Ke committed
339
    std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
340
    int num_total_features,
341
    const std::vector<std::vector<double>>& forced_bins,
342
    int** sample_non_zero_indices,
Guolin Ke's avatar
Guolin Ke committed
343
    double** sample_values,
344
    const int* num_per_col,
345
    int num_sample_col,
Guolin Ke's avatar
Guolin Ke committed
346
    size_t total_sample_cnt,
Guolin Ke's avatar
Guolin Ke committed
347
    const Config& io_config);
Guolin Ke's avatar
Guolin Ke committed
348

Guolin Ke's avatar
Guolin Ke committed
349
  /*! \brief Destructor */
350
  LIGHTGBM_EXPORT ~Dataset();
Guolin Ke's avatar
Guolin Ke committed
351

352
  LIGHTGBM_EXPORT bool CheckAlign(const Dataset& other) const {
353
354
355
356
357
358
359
360
361
362
    if (num_features_ != other.num_features_) {
      return false;
    }
    if (num_total_features_ != other.num_total_features_) {
      return false;
    }
    if (label_idx_ != other.label_idx_) {
      return false;
    }
    for (int i = 0; i < num_features_; ++i) {
Guolin Ke's avatar
Guolin Ke committed
363
      if (!FeatureBinMapper(i)->CheckAlign(*(other.FeatureBinMapper(i)))) {
364
365
366
367
368
369
        return false;
      }
    }
    return true;
  }

Guolin Ke's avatar
Guolin Ke committed
370
371
372
373
374
375
376
377
378
379
  inline void FinishOneRow(int tid, data_size_t row_idx, const std::vector<bool>& is_feature_added) {
    if (is_finish_load_) { return; }
    for (auto fidx : feature_need_push_zeros_) {
      if (is_feature_added[fidx]) { continue; }
      const int group = feature2group_[fidx];
      const int sub_feature = feature2subfeature_[fidx];
      feature_groups_[group]->PushData(tid, sub_feature, row_idx, 0.0f);
    }
  }

Guolin Ke's avatar
Guolin Ke committed
380
  inline void PushOneRow(int tid, data_size_t row_idx, const std::vector<double>& feature_values) {
Guolin Ke's avatar
Guolin Ke committed
381
    if (is_finish_load_) { return; }
Guolin Ke's avatar
Guolin Ke committed
382
    for (size_t i = 0; i < feature_values.size() && i < static_cast<size_t>(num_total_features_); ++i) {
Guolin Ke's avatar
Guolin Ke committed
383
384
      int feature_idx = used_feature_map_[i];
      if (feature_idx >= 0) {
Guolin Ke's avatar
Guolin Ke committed
385
386
387
        const int group = feature2group_[feature_idx];
        const int sub_feature = feature2subfeature_[feature_idx];
        feature_groups_[group]->PushData(tid, sub_feature, row_idx, feature_values[i]);
Guolin Ke's avatar
Guolin Ke committed
388
389
390
391
      }
    }
  }

392
  inline void PushOneRow(int tid, data_size_t row_idx, const std::vector<std::pair<int, double>>& feature_values) {
Guolin Ke's avatar
Guolin Ke committed
393
    if (is_finish_load_) { return; }
Guolin Ke's avatar
Guolin Ke committed
394
    std::vector<bool> is_feature_added(num_features_, false);
395
    for (auto& inner_data : feature_values) {
396
      if (inner_data.first >= num_total_features_) { continue; }
397
398
      int feature_idx = used_feature_map_[inner_data.first];
      if (feature_idx >= 0) {
Guolin Ke's avatar
Guolin Ke committed
399
        is_feature_added[feature_idx] = true;
Guolin Ke's avatar
Guolin Ke committed
400
401
402
        const int group = feature2group_[feature_idx];
        const int sub_feature = feature2subfeature_[feature_idx];
        feature_groups_[group]->PushData(tid, sub_feature, row_idx, inner_data.second);
403
404
      }
    }
Guolin Ke's avatar
Guolin Ke committed
405
    FinishOneRow(tid, row_idx, is_feature_added);
406
407
  }

Guolin Ke's avatar
Guolin Ke committed
408
409
410
411
412
413
414
415
416
  inline void PushOneData(int tid, data_size_t row_idx, int group, int sub_feature, double value) {
    feature_groups_[group]->PushData(tid, sub_feature, row_idx, value);
  }

  inline int RealFeatureIndex(int fidx) const {
    return real_feature_idx_[fidx];
  }

  inline int InnerFeatureIndex(int col_idx) const {
Guolin Ke's avatar
Guolin Ke committed
417
    return used_feature_map_[col_idx];
Guolin Ke's avatar
Guolin Ke committed
418
  }
Guolin Ke's avatar
Guolin Ke committed
419
420
421
422
423
424
  inline int Feature2Group(int feature_idx) const {
    return feature2group_[feature_idx];
  }
  inline int Feture2SubFeature(int feature_idx) const {
    return feature2subfeature_[feature_idx];
  }
425
426
427
  inline uint64_t GroupBinBoundary(int group_idx) const {
    return group_bin_boundaries_[group_idx];
  }
Guolin Ke's avatar
Guolin Ke committed
428
429
430
  inline uint64_t NumTotalBin() const {
    return group_bin_boundaries_.back();
  }
431

432
433
434
435
436
437
438
439
440
  inline std::vector<int> ValidFeatureIndices() const {
    std::vector<int> ret;
    for (int i = 0; i < num_total_features_; ++i) {
      if (used_feature_map_[i] >= 0) {
        ret.push_back(i);
      }
    }
    return ret;
  }
Guolin Ke's avatar
Guolin Ke committed
441
442
443
  void ReSize(data_size_t num_data);

  void CopySubset(const Dataset* fullset, const data_size_t* used_indices, data_size_t num_used_indices, bool need_meta_data);
Guolin Ke's avatar
Guolin Ke committed
444

445
446
447
448
  MultiValBin* GetMultiBinFromSparseFeatures() const;

  MultiValBin* GetMultiBinFromAllFeatures() const;

449
  TrainingTempState* TestMultiThreadingMethod(
Nikita Titov's avatar
Nikita Titov committed
450
451
    score_t* gradients, score_t* hessians,
    const std::vector<int8_t>& is_feature_used, bool is_constant_hessian,
452
453
    bool force_colwise, bool force_rowwise, bool* is_hist_col_wise) const;

454
  LIGHTGBM_EXPORT void FinishLoad();
Guolin Ke's avatar
Guolin Ke committed
455

456
  LIGHTGBM_EXPORT bool SetFloatField(const char* field_name, const float* field_data, data_size_t num_element);
Guolin Ke's avatar
Guolin Ke committed
457

458
  LIGHTGBM_EXPORT bool SetDoubleField(const char* field_name, const double* field_data, data_size_t num_element);
Guolin Ke's avatar
Guolin Ke committed
459

460
  LIGHTGBM_EXPORT bool SetIntField(const char* field_name, const int* field_data, data_size_t num_element);
461

462
  LIGHTGBM_EXPORT bool GetFloatField(const char* field_name, data_size_t* out_len, const float** out_ptr);
463

464
  LIGHTGBM_EXPORT bool GetDoubleField(const char* field_name, data_size_t* out_len, const double** out_ptr);
Guolin Ke's avatar
Guolin Ke committed
465

466
  LIGHTGBM_EXPORT bool GetIntField(const char* field_name, data_size_t* out_len, const int** out_ptr);
467

Guolin Ke's avatar
Guolin Ke committed
468
469
470
  /*!
  * \brief Save current dataset into binary file, will save to "filename.bin"
  */
471
  LIGHTGBM_EXPORT void SaveBinaryFile(const char* bin_filename);
Guolin Ke's avatar
Guolin Ke committed
472

473
474
  LIGHTGBM_EXPORT void DumpTextFile(const char* text_filename);

475
  LIGHTGBM_EXPORT void CopyFeatureMapperFrom(const Dataset* dataset);
Guolin Ke's avatar
Guolin Ke committed
476

Guolin Ke's avatar
Guolin Ke committed
477
478
  LIGHTGBM_EXPORT void CreateValid(const Dataset* dataset);

479
480
481
482
  void InitTrain(const std::vector<int8_t>& is_feature_used,
                 bool is_colwise,
                 TrainingTempState* temp_state) const;

483
  void ConstructHistograms(const std::vector<int8_t>& is_feature_used,
484
485
486
487
488
                           const data_size_t* data_indices,
                           data_size_t num_data, const score_t* gradients,
                           const score_t* hessians, score_t* ordered_gradients,
                           score_t* ordered_hessians, bool is_constant_hessian,
                           bool is_colwise, TrainingTempState* temp_state,
489
490
                           hist_t* histogram_data) const;

491
492
493
494
  void ConstructHistogramsMultiVal(const data_size_t* data_indices,
                                   data_size_t num_data,
                                   const score_t* gradients,
                                   const score_t* hessians,
495
                                   bool is_constant_hessian,
496
                                   TrainingTempState* temp_state,
497
                                   hist_t* histogram_data) const;
Guolin Ke's avatar
Guolin Ke committed
498

499
  void FixHistogram(int feature_idx, double sum_gradient, double sum_hessian, hist_t* data) const;
Guolin Ke's avatar
Guolin Ke committed
500

501
  inline data_size_t Split(int feature,
502
                           const uint32_t* threshold, int num_threshold,  bool default_left,
503
504
                           data_size_t* data_indices, data_size_t num_data,
                           data_size_t* lte_indices, data_size_t* gt_indices) const {
Guolin Ke's avatar
Guolin Ke committed
505
506
    const int group = feature2group_[feature];
    const int sub_feature = feature2subfeature_[feature];
507
    return feature_groups_[group]->Split(sub_feature, threshold, num_threshold, default_left, data_indices, num_data, lte_indices, gt_indices);
Guolin Ke's avatar
Guolin Ke committed
508
509
510
511
512
513
514
515
516
517
518
519
520
521
  }

  inline int SubFeatureBinOffset(int i) const {
    const int sub_feature = feature2subfeature_[i];
    if (sub_feature == 0) {
      return 1;
    } else {
      return 0;
    }
  }

  inline int FeatureNumBin(int i) const {
    const int group = feature2group_[i];
    const int sub_feature = feature2subfeature_[i];
522
    return feature_groups_[group]->bin_mappers_[sub_feature]->num_bin();
Guolin Ke's avatar
Guolin Ke committed
523
  }
Guolin Ke's avatar
Guolin Ke committed
524

525
526
527
  inline int FeatureGroupNumBin(int group) const {
    return feature_groups_[group]->num_total_bin_;
  }
528

Guolin Ke's avatar
Guolin Ke committed
529
530
531
532
533
534
  inline const BinMapper* FeatureBinMapper(int i) const {
    const int group = feature2group_[i];
    const int sub_feature = feature2subfeature_[i];
    return feature_groups_[group]->bin_mappers_[sub_feature].get();
  }

535
536
537
538
  inline const Bin* FeatureGroupBin(int group) const {
    return feature_groups_[group]->bin_data_.get();
  }

Guolin Ke's avatar
Guolin Ke committed
539
540
541
  inline BinIterator* FeatureIterator(int i) const {
    const int group = feature2group_[i];
    const int sub_feature = feature2subfeature_[i];
zhangyafeikimi's avatar
zhangyafeikimi committed
542
    return feature_groups_[group]->SubFeatureIterator(sub_feature);
Guolin Ke's avatar
Guolin Ke committed
543
544
  }

545
546
547
  inline BinIterator* FeatureGroupIterator(int group) const {
    return feature_groups_[group]->FeatureGroupIterator();
  }
548

549
550
551
552
  inline bool IsMultiGroup(int i) const {
    return feature_groups_[i]->is_multi_val_;
  }

Guolin Ke's avatar
Guolin Ke committed
553
554
555
556
557
558
  inline double RealThreshold(int i, uint32_t threshold) const {
    const int group = feature2group_[i];
    const int sub_feature = feature2subfeature_[i];
    return feature_groups_[group]->bin_mappers_[sub_feature]->BinToValue(threshold);
  }

559
560
561
562
563
564
565
  // given a real threshold, find the closest threshold bin
  inline uint32_t BinThreshold(int i, double threshold_double) const {
    const int group = feature2group_[i];
    const int sub_feature = feature2subfeature_[i];
    return feature_groups_[group]->bin_mappers_[sub_feature]->ValueToBin(threshold_double);
  }

Guolin Ke's avatar
Guolin Ke committed
566
567
568
569
570
571
572
573
574
  /*!
  * \brief Get meta data pointer
  * \return Pointer of meta data
  */
  inline const Metadata& metadata() const { return metadata_; }

  /*! \brief Get Number of used features */
  inline int num_features() const { return num_features_; }

575
576
577
  /*! \brief Get Number of feature groups */
  inline int num_feature_groups() const { return num_groups_;}

578
579
580
  /*! \brief Get Number of total features */
  inline int num_total_features() const { return num_total_features_; }

Guolin Ke's avatar
Guolin Ke committed
581
582
583
584
  /*! \brief Get the index of label column */
  inline int label_idx() const { return label_idx_; }

  /*! \brief Get names of current data set */
Guolin Ke's avatar
Guolin Ke committed
585
586
587
588
  inline const std::vector<std::string>& feature_names() const { return feature_names_; }

  inline void set_feature_names(const std::vector<std::string>& feature_names) {
    if (feature_names.size() != static_cast<size_t>(num_total_features_)) {
589
      Log::Fatal("Size of feature_names error, should equal with total number of features");
Guolin Ke's avatar
Guolin Ke committed
590
591
    }
    feature_names_ = std::vector<std::string>(feature_names);
Guolin Ke's avatar
Guolin Ke committed
592
    std::unordered_set<std::string> feature_name_set;
593
594
    // replace ' ' in feature_names with '_'
    bool spaceInFeatureName = false;
595
    for (auto& feature_name : feature_names_) {
596
597
      // check ascii
      if (!Common::CheckASCII(feature_name)) {
598
599
600
601
602
        Log::Fatal("Do not support non-ASCII characters in feature name.");
      }
      // check json
      if (!Common::CheckAllowedJSON(feature_name)) {
        Log::Fatal("Do not support special JSON characters in feature name.");
603
      }
604
      if (feature_name.find(' ') != std::string::npos) {
605
606
607
        spaceInFeatureName = true;
        std::replace(feature_name.begin(), feature_name.end(), ' ', '_');
      }
Guolin Ke's avatar
Guolin Ke committed
608
609
610
611
      if (feature_name_set.count(feature_name) > 0) {
        Log::Fatal("Feature (%s) appears more than one time.", feature_name.c_str());
      }
      feature_name_set.insert(feature_name);
612
    }
613
    if (spaceInFeatureName) {
614
615
      Log::Warning("Find whitespaces in feature_names, replace with underlines");
    }
Guolin Ke's avatar
Guolin Ke committed
616
  }
Guolin Ke's avatar
Guolin Ke committed
617

Guolin Ke's avatar
Guolin Ke committed
618
619
620
621
622
623
624
625
626
627
628
629
630
631
  inline std::vector<std::string> feature_infos() const {
    std::vector<std::string> bufs;
    for (int i = 0; i < num_total_features_; i++) {
      int fidx = used_feature_map_[i];
      if (fidx == -1) {
        bufs.push_back("none");
      } else {
        const auto bin_mapper = FeatureBinMapper(fidx);
        bufs.push_back(bin_mapper->bin_info());
      }
    }
    return bufs;
  }

Guolin Ke's avatar
Guolin Ke committed
632
633
634
635
636
637
638
639
  /*! \brief Get Number of data */
  inline data_size_t num_data() const { return num_data_; }

  /*! \brief Disable copy */
  Dataset& operator=(const Dataset&) = delete;
  /*! \brief Disable copy */
  Dataset(const Dataset&) = delete;

640
  void AddFeaturesFrom(Dataset* other);
641

Nikita Titov's avatar
Nikita Titov committed
642
 private:
Guolin Ke's avatar
Guolin Ke committed
643
  std::string data_filename_;
Guolin Ke's avatar
Guolin Ke committed
644
  /*! \brief Store used features */
Guolin Ke's avatar
Guolin Ke committed
645
  std::vector<std::unique_ptr<FeatureGroup>> feature_groups_;
Guolin Ke's avatar
Guolin Ke committed
646
647
648
649
  /*! \brief Mapper from real feature index to used index*/
  std::vector<int> used_feature_map_;
  /*! \brief Number of used features*/
  int num_features_;
650
651
  /*! \brief Number of total features*/
  int num_total_features_;
Guolin Ke's avatar
Guolin Ke committed
652
653
654
655
  /*! \brief Number of total data*/
  data_size_t num_data_;
  /*! \brief Store some label level data*/
  Metadata metadata_;
Guolin Ke's avatar
Guolin Ke committed
656
657
658
659
  /*! \brief index of label column */
  int label_idx_ = 0;
  /*! \brief store feature names */
  std::vector<std::string> feature_names_;
660
661
  /*! \brief store feature names */
  static const char* binary_file_token;
Guolin Ke's avatar
Guolin Ke committed
662
663
664
665
666
667
668
  int num_groups_;
  std::vector<int> real_feature_idx_;
  std::vector<int> feature2group_;
  std::vector<int> feature2subfeature_;
  std::vector<uint64_t> group_bin_boundaries_;
  std::vector<int> group_feature_start_;
  std::vector<int> group_feature_cnt_;
Guolin Ke's avatar
Guolin Ke committed
669
  bool is_finish_load_;
670
  int max_bin_;
Belinda Trotta's avatar
Belinda Trotta committed
671
  std::vector<int32_t> max_bin_by_feature_;
672
  std::vector<std::vector<double>> forced_bin_bounds_;
673
674
675
676
  int bin_construct_sample_cnt_;
  int min_data_in_bin_;
  bool use_missing_;
  bool zero_as_missing_;
Guolin Ke's avatar
Guolin Ke committed
677
  std::vector<int> feature_need_push_zeros_;
Guolin Ke's avatar
Guolin Ke committed
678
679
680
681
};

}  // namespace LightGBM

#endif   // LIGHTGBM_DATASET_H_