dataset_loader.h 2.46 KB
Newer Older
Guolin Ke's avatar
Guolin Ke committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#ifndef LIGHTGBM_DATASET_LOADER_H_
#define LIGHTGBM_DATASET_LOADER_H_

#include <LightGBM/dataset.h>

namespace LightGBM {

/*!
* \brief Builds Dataset objects from data files.
*
* The loader keeps references to the IOConfig and PredictFunction passed to
* its constructor (see io_config_ / predict_fun_), so both objects must
* outlive the loader. Copying is disabled.
*/
class DatasetLoader {
public:
  /*!
  * \brief Constructor
  * \param io_config IO configuration; stored by reference, not copied
  * \param predict_fun Prediction function for the initial model; stored by reference
  */
  DatasetLoader(const IOConfig& io_config, const PredictFunction& predict_fun);

  /*! \brief Destructor */
  ~DatasetLoader();

  /*!
  * \brief Configure header-related state from a data file
  * \param filename Path of the data file
  * \note The spelling "SetHeadder" is kept on purpose: renaming it would break
  *       existing callers. Presumably this fills label_idx_, weight_idx_,
  *       group_idx_, ignore_features_ and feature_names_ -- confirm against
  *       the implementation.
  */
  void SetHeadder(const char* filename);

  /*!
  * \brief Load a dataset from a file
  * \param filename Path of the data file
  * \param rank Rank of the local machine (presumably 0-based -- confirm)
  * \param num_machines Total number of machines
  * \return Pointer to the loaded dataset
  *         (NOTE(review): ownership is presumably transferred to the caller -- confirm)
  */
  Dataset* LoadFromFile(const char* filename, int rank, int num_machines);

  /*!
  * \brief Load a dataset from a file, using rank 0 and a single machine
  * \param filename Path of the data file
  * \return Result of LoadFromFile(filename, 0, 1)
  */
  Dataset* LoadFromFile(const char* filename) {
    // The result must be returned explicitly: flowing off the end of a
    // non-void function is undefined behaviour.
    return LoadFromFile(filename, 0, 1);
  }

  /*!
  * \brief Load a dataset from a file, given an already existing dataset
  * \param filename Path of the data file
  * \param other Existing dataset (presumably its bin mappers / feature layout
  *        are reused for the new dataset -- confirm against the implementation)
  * \return Pointer to the loaded dataset
  */
  Dataset* LoadFromFileLikeOthers(const char* filename, const Dataset* other);

  /*!
  * \brief Load a dataset from a binary file
  * \param bin_filename Path of the binary file
  * \param rank Rank of the local machine
  * \param num_machines Total number of machines
  * \return Pointer to the loaded dataset
  */
  Dataset* LoadFromBinFile(const char* bin_filename, int rank, int num_machines);


  /*! \brief Disable copy */
  DatasetLoader& operator=(const DatasetLoader&) = delete;
  /*! \brief Disable copy */
  DatasetLoader(const DatasetLoader&) = delete;

private:
  /*! \brief Check a dataset (the exact checks live in the implementation file) */
  void CheckDataset(const Dataset* dataset);

  /*!
  * \brief Load text data into memory
  * \param filename Path of the data file
  * \param metadata Metadata associated with the data
  * \param rank Rank of the local machine
  * \param num_machines Total number of machines
  * \param num_global_data Output parameter (presumably the global number of records -- confirm)
  * \param used_data_indices Output parameter (presumably indices of the records kept locally -- confirm)
  * \return Text lines, one string per element
  */
  std::vector<std::string> LoadTextDataToMemory(const char* filename,
                                                const Metadata& metadata,
                                                int rank, int num_machines,
                                                int* num_global_data,
                                                std::vector<data_size_t>* used_data_indices);

  /*!
  * \brief Sample text data from lines that are already in memory
  * \param data Text lines to sample from
  * \return Sampled text lines
  */
  std::vector<std::string> SampleTextDataFromMemory(const std::vector<std::string>& data);

  /*!
  * \brief Sample text data directly from a file
  * \param filename Path of the data file
  * \param metadata Metadata associated with the data
  * \param rank Rank of the local machine
  * \param num_machines Total number of machines
  * \param num_global_data Output parameter (presumably the global number of records -- confirm)
  * \param used_data_indices Output parameter (presumably indices of the records kept locally -- confirm)
  * \return Sampled text lines
  */
  std::vector<std::string> SampleTextDataFromFile(const char* filename,
                                                  const Metadata& metadata,
                                                  int rank, int num_machines,
                                                  int* num_global_data,
                                                  std::vector<data_size_t>* used_data_indices);

  /*!
  * \brief Construct bin mappers from sampled text data
  * \param rank Rank of the local machine
  * \param num_machines Total number of machines
  * \param sample_data Sampled text lines
  * \param parser Parser for the text lines
  * \param dataset Dataset to update
  */
  void ConstructBinMappersFromTextData(int rank, int num_machines,
                                       const std::vector<std::string>& sample_data,
                                       const Parser* parser, Dataset* dataset);

  /*!
  * \brief Extract local features from memory
  * \param text_data Text lines; taken by non-const reference, so the callee may modify them
  * \param parser Parser for the text lines
  * \param dataset Dataset to update
  */
  void ExtractFeaturesFromMemory(std::vector<std::string>& text_data,
                                 const Parser* parser, Dataset* dataset);

  /*!
  * \brief Extract local features from file
  * \param filename Path of the data file
  * \param parser Parser for the text lines
  * \param used_data_indices Indices of the records to use
  * \param dataset Dataset to update
  */
  void ExtractFeaturesFromFile(const char* filename, const Parser* parser,
                               const std::vector<data_size_t>& used_data_indices,
                               Dataset* dataset);

  /*! \brief Check can load from binary file */
  bool CheckCanLoadFromBin(const char* filename);


  /*! \brief IO configuration; a reference, so the referenced object must outlive this loader */
  const IOConfig& io_config_;
  /*! \brief Random generator*/
  Random random_;
  /*! \brief prediction function for initial model; a reference, so the referenced object must outlive this loader */
  const PredictFunction& predict_fun_;
  /*! \brief index of label column */
  int label_idx_ = 0;
  /*! \brief index of weight column (defaults to -1) */
  int weight_idx_ = -1;
  /*! \brief index of group column (defaults to -1) */
  int group_idx_ = -1;
  /*! \brief Set of feature indices to ignore */
  std::unordered_set<int> ignore_features_;
  /*! \brief store feature names */
  std::vector<std::string> feature_names_;

};

}

#endif // LIGHTGBM_DATASET_LOADER_H_