dataset_loader.h 2.57 KB
Newer Older
Guolin Ke's avatar
Guolin Ke committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
#ifndef LIGHTGBM_DATASET_LOADER_H_
#define LIGHTGBM_DATASET_LOADER_H_

#include <LightGBM/dataset.h>

namespace LightGBM {

/*!
 * \brief Declares the entry points that produce Dataset objects from a data
 *        file, a binary file, or in-memory sample values.
 *
 * The loader is non-copyable. The IO configuration and the prediction function
 * are held as reference members (io_config_, predict_fun_), so whatever objects
 * those references are bound to must stay alive for as long as the loader is used.
 */
class DatasetLoader {
public:
  /*!
   * \brief Create a loader.
   * \param io_config IO configuration
   * \param predict_fun Prediction function for the initial model
   * \note NOTE(review): presumably these arguments are what io_config_ and
   *       predict_fun_ get bound to; if so, do not pass temporaries - confirm
   *       against the constructor definition.
   */
  DatasetLoader(const IOConfig& io_config, const PredictFunction& predict_fun);

  ~DatasetLoader();

  /*!
   * \brief NOTE(review): judging by the name, prepares header-related state
   *        from the given file - confirm against the definition.
   *        The spelling "Headder" is kept so existing callers keep compiling.
   * \param filename Path of the data file
   */
  void SetHeadder(const char* filename);

  /*!
   * \brief Load a dataset from a file.
   * \param filename Path of the data file
   * \param rank Rank of this machine
   * \param num_machines Total number of machines
   * \return Pointer to the loaded dataset (ownership is not visible from this header)
   */
  Dataset* LoadFromFile(const char* filename, int rank, int num_machines);

  /*!
   * \brief Convenience overload: forwards to the three-argument LoadFromFile
   *        with rank = 0 and num_machines = 1.
   * \param filename Path of the data file
   * \return Whatever the three-argument overload returns
   */
  Dataset* LoadFromFile(const char* filename) {
    return LoadFromFile(filename, 0, 1);
  }

  /*!
   * \brief Load a dataset from a file, given another dataset.
   *        NOTE(review): presumably the result is aligned with the layout of
   *        \a other - confirm against the definition.
   * \param filename Path of the data file
   * \param other Existing dataset passed alongside the file
   * \return Pointer to the loaded dataset
   */
  Dataset* LoadFromFileLikeOthers(const char* filename, const Dataset* other);

  /*!
   * \brief Load a dataset from a binary file.
   * \param bin_filename Path of the binary file
   * \param rank Rank of this machine
   * \param num_machines Total number of machines
   * \return Pointer to the loaded dataset
   */
  Dataset* LoadFromBinFile(const char* bin_filename, int rank, int num_machines);

  /*!
   * \brief Build a dataset from sampled values.
   *        The spelling "Costruct" is kept so existing callers keep compiling.
   * \param sample_values Sampled values; taken by non-const reference, so the
   *        callee is allowed to modify them
   * \param num_data Number of data rows (NOTE(review): presumably of the full
   *        dataset rather than of the sample - confirm)
   * \return Pointer to the constructed dataset
   */
  Dataset* CostructFromSampleData(std::vector<std::vector<double>>& sample_values, data_size_t num_data);

  /*! \brief Disable copy */
  DatasetLoader& operator=(const DatasetLoader&) = delete;
  /*! \brief Disable copy */
  DatasetLoader(const DatasetLoader&) = delete;

private:
  /*! \brief Check the given dataset (the exact checks live in the definition) */
  void CheckDataset(const Dataset* dataset);

  /*! \brief Load text data into memory; num_global_data and used_data_indices are output pointers */
  std::vector<std::string> LoadTextDataToMemory(const char* filename, const Metadata& metadata, int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices);

  /*! \brief Sample text lines from data that is already in memory */
  std::vector<std::string> SampleTextDataFromMemory(const std::vector<std::string>& data);

  /*! \brief Sample text lines from a file; num_global_data and used_data_indices are output pointers */
  std::vector<std::string> SampleTextDataFromFile(const char* filename, const Metadata& metadata, int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices);

  /*! \brief Construct bin mappers for the dataset from sampled text data */
  void ConstructBinMappersFromTextData(int rank, int num_machines, const std::vector<std::string>& sample_data, const Parser* parser, Dataset* dataset);

  /*! \brief Extract local features from memory */
  void ExtractFeaturesFromMemory(std::vector<std::string>& text_data, const Parser* parser, Dataset* dataset);

  /*! \brief Extract local features from file */
  void ExtractFeaturesFromFile(const char* filename, const Parser* parser, const std::vector<data_size_t>& used_data_indices, Dataset* dataset);

  /*! \brief Check can load from binary file */
  bool CheckCanLoadFromBin(const char* filename);

  /*! \brief IO configuration; reference member, the referenced object must outlive the loader */
  const IOConfig& io_config_;
  /*! \brief Random generator*/
  Random random_;
  /*! \brief prediction function for initial model; reference member, the referenced object must outlive the loader */
  const PredictFunction& predict_fun_;
  /*! \brief index of label column */
  int label_idx_ = 0;
  /*! \brief index of weight column (defaults to -1) */
  int weight_idx_ = -1;
  /*! \brief index of group column (defaults to -1) */
  int group_idx_ = -1;
  /*! \brief Set of feature indices; NOTE(review): presumably the features to ignore while loading - confirm */
  std::unordered_set<int> ignore_features_;
  /*! \brief store feature names */
  std::vector<std::string> feature_names_;
};

}

#endif // LIGHTGBM_DATASET_LOADER_H_