".github/vscode:/vscode.git/clone" did not exist on "a77260f03eb735923125024b0f36df9f63a6e0f3"
dataset_loader.h 4.69 KB
Newer Older
1
2
3
4
/*!
 * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */
5
6
#ifndef LIGHTGBM_INCLUDE_LIGHTGBM_DATASET_LOADER_H_
#define LIGHTGBM_INCLUDE_LIGHTGBM_DATASET_LOADER_H_
Guolin Ke's avatar
Guolin Ke committed
7

8
9
#include <LightGBM/dataset.h>

10
#include <memory>
11
12
13
14
#include <string>
#include <unordered_set>
#include <vector>

Guolin Ke's avatar
Guolin Ke committed
15
16
17
namespace LightGBM {

class DatasetLoader {
Nikita Titov's avatar
Nikita Titov committed
18
 public:
Guolin Ke's avatar
Guolin Ke committed
19
  LIGHTGBM_EXPORT DatasetLoader(const Config& io_config, const PredictFunction& predict_fun, int num_class, const char* filename);
Guolin Ke's avatar
Guolin Ke committed
20

21
  LIGHTGBM_EXPORT ~DatasetLoader();
Guolin Ke's avatar
Guolin Ke committed
22

23
  LIGHTGBM_EXPORT Dataset* LoadFromFile(const char* filename, int rank, int num_machines);
Guolin Ke's avatar
Guolin Ke committed
24

25
26
  LIGHTGBM_EXPORT Dataset* LoadFromFile(const char* filename) {
    return LoadFromFile(filename, 0, 1);
Guolin Ke's avatar
Guolin Ke committed
27
28
  }

29
  LIGHTGBM_EXPORT Dataset* LoadFromFileAlignWithOtherDataset(const char* filename, const Dataset* train_data);
Guolin Ke's avatar
Guolin Ke committed
30

31
32
  LIGHTGBM_EXPORT Dataset* LoadFromSerializedReference(const char* buffer, size_t buffer_size, data_size_t num_data, int32_t num_classes);

33
  LIGHTGBM_EXPORT Dataset* ConstructFromSampleData(double** sample_values,
34
35
36
37
38
39
                                                   int** sample_indices,
                                                   int num_col,
                                                   const int* num_per_col,
                                                   size_t total_sample_size,
                                                   data_size_t num_local_data,
                                                   int64_t num_dist_data);
Guolin Ke's avatar
Guolin Ke committed
40
41
42
43
44
45

  /*! \brief Disable copy */
  DatasetLoader& operator=(const DatasetLoader&) = delete;
  /*! \brief Disable copy */
  DatasetLoader(const DatasetLoader&) = delete;

46
47
48
  static std::vector<std::vector<double>> GetForcedBins(std::string forced_bins_path, int num_total_features,
                                                        const std::unordered_set<int>& categorical_features);

Nikita Titov's avatar
Nikita Titov committed
49
 private:
50
51
  void LoadHeaderFromMemory(Dataset* dataset, const char* buffer);

52
53
  Dataset* LoadFromBinFile(const char* data_filename, const char* bin_filename, int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices);

Guolin Ke's avatar
Guolin Ke committed
54
55
  void SetHeader(const char* filename);

56
  void CheckDataset(const Dataset* dataset, bool is_load_from_binary);
Guolin Ke's avatar
Guolin Ke committed
57
58
59
60
61
62
63
64
65
66

  std::vector<std::string> LoadTextDataToMemory(const char* filename, const Metadata& metadata, int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices);

  std::vector<std::string> SampleTextDataFromMemory(const std::vector<std::string>& data);

  std::vector<std::string> SampleTextDataFromFile(const char* filename, const Metadata& metadata, int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices);

  void ConstructBinMappersFromTextData(int rank, int num_machines, const std::vector<std::string>& sample_data, const Parser* parser, Dataset* dataset);

  /*! \brief Extract local features from memory */
Guolin Ke's avatar
Guolin Ke committed
67
  void ExtractFeaturesFromMemory(std::vector<std::string>* text_data, const Parser* parser, Dataset* dataset);
Guolin Ke's avatar
Guolin Ke committed
68
69
70
71
72

  /*! \brief Extract local features from file */
  void ExtractFeaturesFromFile(const char* filename, const Parser* parser, const std::vector<data_size_t>& used_data_indices, Dataset* dataset);

  /*! \brief Check can load from binary file */
73
  std::string CheckCanLoadFromBin(const char* filename);
Guolin Ke's avatar
Guolin Ke committed
74

75
76
77
78
79
80
81
82
83
84
  /*! \brief Check the number of bins for categorical features.
   * The number of bins for categorical features may exceed the configured maximum value.
   * Log warnings when such cases happen.
   *
   * \param bin_mappers the bin_mappers of all features
   * \param max_bin max_bin from Config
   * \param max_bin_by_feature max_bin_by_feature from Config
   */
  void CheckCategoricalFeatureNumBin(const std::vector<std::unique_ptr<BinMapper>>& bin_mappers, const int max_bin, const std::vector<int>& max_bin_by_feature) const;

Guolin Ke's avatar
Guolin Ke committed
85
  const Config& config_;
Guolin Ke's avatar
Guolin Ke committed
86
87
88
  /*! \brief Random generator*/
  Random random_;
  /*! \brief prediction function for initial model */
89
  const PredictFunction predict_fun_;
90
91
  /*! \brief number of classes */
  int num_class_;
Guolin Ke's avatar
Guolin Ke committed
92
  /*! \brief index of label column */
Guolin Ke's avatar
Guolin Ke committed
93
  int label_idx_;
Guolin Ke's avatar
Guolin Ke committed
94
  /*! \brief index of weight column */
Guolin Ke's avatar
Guolin Ke committed
95
  int weight_idx_;
Guolin Ke's avatar
Guolin Ke committed
96
  /*! \brief index of group column */
Guolin Ke's avatar
Guolin Ke committed
97
  int group_idx_;
Guolin Ke's avatar
Guolin Ke committed
98
99
100
101
  /*! \brief Mapper from real feature index to used index*/
  std::unordered_set<int> ignore_features_;
  /*! \brief store feature names */
  std::vector<std::string> feature_names_;
102
103
  /*! \brief Mapper from real feature index to used index*/
  std::unordered_set<int> categorical_features_;
104
105
  /*! \brief Whether to store raw feature values */
  bool store_raw_;
Guolin Ke's avatar
Guolin Ke committed
106
107
};

108
}  // namespace LightGBM
Guolin Ke's avatar
Guolin Ke committed
109

110
#endif  // LIGHTGBM_INCLUDE_LIGHTGBM_DATASET_LOADER_H_