/*!
 * \file dataset_loader.h
 * \brief Declaration of LightGBM::DatasetLoader.
 */
#ifndef LIGHTGBM_DATASET_LOADER_H_
#define LIGHTGBM_DATASET_LOADER_H_

#include <LightGBM/dataset.h>

namespace LightGBM {

class DatasetLoader {
public:

  DatasetLoader(const IOConfig& io_config, const PredictFunction& predict_fun);

  ~DatasetLoader();

Guolin Ke's avatar
Guolin Ke committed
15
  void SetHeader(const char* filename);
Guolin Ke's avatar
Guolin Ke committed
16
17
18
19

  Dataset* LoadFromFile(const char* filename, int rank, int num_machines);

  Dataset* LoadFromFile(const char* filename) {
Guolin Ke's avatar
Guolin Ke committed
20
    return LoadFromFile(filename, 0, 1);
Guolin Ke's avatar
Guolin Ke committed
21
22
  }

Guolin Ke's avatar
Guolin Ke committed
23
  Dataset* LoadFromFileAlignWithOtherDataset(const char* filename, const Dataset* train_data);
Guolin Ke's avatar
Guolin Ke committed
24
25
26

  Dataset* LoadFromBinFile(const char* bin_filename, int rank, int num_machines);

Guolin Ke's avatar
Guolin Ke committed
27
  Dataset* CostructFromSampleData(std::vector<std::vector<double>>& sample_values, size_t total_sample_size, data_size_t num_data);
Guolin Ke's avatar
Guolin Ke committed
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61

  /*! \brief Disable copy */
  DatasetLoader& operator=(const DatasetLoader&) = delete;
  /*! \brief Disable copy */
  DatasetLoader(const DatasetLoader&) = delete;

private:
  void CheckDataset(const Dataset* dataset);

  std::vector<std::string> LoadTextDataToMemory(const char* filename, const Metadata& metadata, int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices);

  std::vector<std::string> SampleTextDataFromMemory(const std::vector<std::string>& data);

  std::vector<std::string> SampleTextDataFromFile(const char* filename, const Metadata& metadata, int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices);

  void ConstructBinMappersFromTextData(int rank, int num_machines, const std::vector<std::string>& sample_data, const Parser* parser, Dataset* dataset);

  /*! \brief Extract local features from memory */
  void ExtractFeaturesFromMemory(std::vector<std::string>& text_data, const Parser* parser, Dataset* dataset);

  /*! \brief Extract local features from file */
  void ExtractFeaturesFromFile(const char* filename, const Parser* parser, const std::vector<data_size_t>& used_data_indices, Dataset* dataset);

  /*! \brief Check can load from binary file */
  bool CheckCanLoadFromBin(const char* filename);

  const IOConfig& io_config_;
  /*! \brief Random generator*/
  Random random_;
  /*! \brief prediction function for initial model */
  const PredictFunction& predict_fun_;
  /*! \brief index of label column */
  int label_idx_ = 0;
  /*! \brief index of weight column */
Guolin Ke's avatar
Guolin Ke committed
62
  int weight_idx_ = NO_SPECIFIC;
Guolin Ke's avatar
Guolin Ke committed
63
  /*! \brief index of group column */
Guolin Ke's avatar
Guolin Ke committed
64
  int group_idx_ = NO_SPECIFIC;
Guolin Ke's avatar
Guolin Ke committed
65
66
67
68
69
70
71
72
73
74
  /*! \brief Mapper from real feature index to used index*/
  std::unordered_set<int> ignore_features_;
  /*! \brief store feature names */
  std::vector<std::string> feature_names_;

};

}

#endif // LIGHTGBM_DATASET_LOADER_H_