// dataset_loader.h
#ifndef LIGHTGBM_DATASET_LOADER_H_
#define LIGHTGBM_DATASET_LOADER_H_

#include <LightGBM/dataset.h>

namespace LightGBM {

class DatasetLoader {
Nikita Titov's avatar
Nikita Titov committed
9
 public:
Guolin Ke's avatar
Guolin Ke committed
10
  LIGHTGBM_EXPORT DatasetLoader(const Config& io_config, const PredictFunction& predict_fun, int num_class, const char* filename);
Guolin Ke's avatar
Guolin Ke committed
11

12
  LIGHTGBM_EXPORT ~DatasetLoader();
Guolin Ke's avatar
Guolin Ke committed
13

14
  LIGHTGBM_EXPORT Dataset* LoadFromFile(const char* filename, const char* initscore_file, int rank, int num_machines);
Guolin Ke's avatar
Guolin Ke committed
15

16
17
  LIGHTGBM_EXPORT Dataset* LoadFromFile(const char* filename, const char* initscore_file) {
    return LoadFromFile(filename, initscore_file, 0, 1);
Guolin Ke's avatar
Guolin Ke committed
18
19
  }

20
  LIGHTGBM_EXPORT Dataset* LoadFromFileAlignWithOtherDataset(const char* filename, const char* initscore_file, const Dataset* train_data);
Guolin Ke's avatar
Guolin Ke committed
21

22
23
  LIGHTGBM_EXPORT Dataset* CostructFromSampleData(double** sample_values,
    int** sample_indices, int num_col, const int* num_per_col,
Guolin Ke's avatar
Guolin Ke committed
24
    size_t total_sample_size, data_size_t num_data);
Guolin Ke's avatar
Guolin Ke committed
25
26
27
28
29
30

  /*! \brief Disable copy */
  DatasetLoader& operator=(const DatasetLoader&) = delete;
  /*! \brief Disable copy */
  DatasetLoader(const DatasetLoader&) = delete;

Nikita Titov's avatar
Nikita Titov committed
31
 private:
32
33
  Dataset* LoadFromBinFile(const char* data_filename, const char* bin_filename, int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices);

Guolin Ke's avatar
Guolin Ke committed
34
35
  void SetHeader(const char* filename);

Guolin Ke's avatar
Guolin Ke committed
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
  void CheckDataset(const Dataset* dataset);

  std::vector<std::string> LoadTextDataToMemory(const char* filename, const Metadata& metadata, int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices);

  std::vector<std::string> SampleTextDataFromMemory(const std::vector<std::string>& data);

  std::vector<std::string> SampleTextDataFromFile(const char* filename, const Metadata& metadata, int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices);

  void ConstructBinMappersFromTextData(int rank, int num_machines, const std::vector<std::string>& sample_data, const Parser* parser, Dataset* dataset);

  /*! \brief Extract local features from memory */
  void ExtractFeaturesFromMemory(std::vector<std::string>& text_data, const Parser* parser, Dataset* dataset);

  /*! \brief Extract local features from file */
  void ExtractFeaturesFromFile(const char* filename, const Parser* parser, const std::vector<data_size_t>& used_data_indices, Dataset* dataset);

  /*! \brief Check can load from binary file */
53
  std::string CheckCanLoadFromBin(const char* filename);
Guolin Ke's avatar
Guolin Ke committed
54

Guolin Ke's avatar
Guolin Ke committed
55
  const Config& config_;
Guolin Ke's avatar
Guolin Ke committed
56
57
58
59
  /*! \brief Random generator*/
  Random random_;
  /*! \brief prediction function for initial model */
  const PredictFunction& predict_fun_;
60
61
  /*! \brief number of classes */
  int num_class_;
Guolin Ke's avatar
Guolin Ke committed
62
  /*! \brief index of label column */
Guolin Ke's avatar
Guolin Ke committed
63
  int label_idx_;
Guolin Ke's avatar
Guolin Ke committed
64
  /*! \brief index of weight column */
Guolin Ke's avatar
Guolin Ke committed
65
  int weight_idx_;
Guolin Ke's avatar
Guolin Ke committed
66
  /*! \brief index of group column */
Guolin Ke's avatar
Guolin Ke committed
67
  int group_idx_;
Guolin Ke's avatar
Guolin Ke committed
68
69
70
71
  /*! \brief Mapper from real feature index to used index*/
  std::unordered_set<int> ignore_features_;
  /*! \brief store feature names */
  std::vector<std::string> feature_names_;
72
73
  /*! \brief Mapper from real feature index to used index*/
  std::unordered_set<int> categorical_features_;
Guolin Ke's avatar
Guolin Ke committed
74
75
76
77
};

}

78
#endif  // LIGHTGBM_DATASET_LOADER_H_