Commit 1c08e71e authored by Guolin Ke
Browse files

use dataset_loader to load data

parent 8696709e
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
namespace LightGBM { namespace LightGBM {
class DatasetLoader;
class Dataset; class Dataset;
class Boosting; class Boosting;
class ObjectiveFunction; class ObjectiveFunction;
...@@ -59,6 +60,8 @@ private: ...@@ -59,6 +60,8 @@ private:
/*! \brief All configs */ /*! \brief All configs */
OverallConfig config_; OverallConfig config_;
/*! \brief Dataset loader */
DatasetLoader* dataset_loader_;
/*! \brief Training data */ /*! \brief Training data */
Dataset* train_data_; Dataset* train_data_;
/*! \brief Validation data */ /*! \brief Validation data */
......
...@@ -93,7 +93,6 @@ public: ...@@ -93,7 +93,6 @@ public:
std::string output_model = "LightGBM_model.txt"; std::string output_model = "LightGBM_model.txt";
std::string output_result = "LightGBM_predict_result.txt"; std::string output_result = "LightGBM_predict_result.txt";
std::string input_model = ""; std::string input_model = "";
std::string input_init_score = "";
int verbosity = 1; int verbosity = 1;
int num_model_predict = -1; int num_model_predict = -1;
bool is_pre_partition = false; bool is_pre_partition = false;
...@@ -318,7 +317,6 @@ struct ParameterAlias { ...@@ -318,7 +317,6 @@ struct ParameterAlias {
{ "model_out", "output_model" }, { "model_out", "output_model" },
{ "model_input", "input_model" }, { "model_input", "input_model" },
{ "model_in", "input_model" }, { "model_in", "input_model" },
{ "init_score", "input_init_score"},
{ "predict_result", "output_result" }, { "predict_result", "output_result" },
{ "prediction_result", "output_result" }, { "prediction_result", "output_result" },
{ "valid", "valid_data" }, { "valid", "valid_data" },
......
#ifndef LIGHTGBM_DATA_H_ #ifndef LIGHTGBM_DATASET_H_
#define LIGHTGBM_DATA_H_ #define LIGHTGBM_DATASET_H_
#include <LightGBM/utils/random.h> #include <LightGBM/utils/random.h>
#include <LightGBM/utils/text_reader.h> #include <LightGBM/utils/text_reader.h>
...@@ -18,6 +18,7 @@ namespace LightGBM { ...@@ -18,6 +18,7 @@ namespace LightGBM {
/*! \brief forward declaration */ /*! \brief forward declaration */
class Feature; class Feature;
class BinMapper; class BinMapper;
class DatasetLoader;
/*! /*!
* \brief This class is used to store some meta(non-feature) data for training data, * \brief This class is used to store some meta(non-feature) data for training data,
...@@ -44,13 +45,7 @@ public: ...@@ -44,13 +45,7 @@ public:
* \param init_score_filename Filename of initial score * \param init_score_filename Filename of initial score
* \param num_class Number of classes * \param num_class Number of classes
*/ */
void Init(const char* data_filename, const char* init_score_filename, const int num_class); void Init(const char* data_filename, const int num_class);
/*!
* \brief Initialize, only load initial score
* \param init_score_filename Filename of initial score
* \param num_class Number of classes
*/
void Init(const char* init_score_filename, const int num_class);
/*! /*!
* \brief Initial with binary memory * \brief Initial with binary memory
* \param memory Pointer to memory * \param memory Pointer to memory
...@@ -177,11 +172,10 @@ public: ...@@ -177,11 +172,10 @@ public:
* \return Pointer of initial scores * \return Pointer of initial scores
*/ */
inline const float* init_score() const { return init_score_; } inline const float* init_score() const { return init_score_; }
/*! \brief Load initial scores from file */
void LoadInitialScore();
private: private:
/*! \brief Load initial scores from file */
void LoadInitialScore();
/*! \brief Load wights from file */ /*! \brief Load wights from file */
void LoadWeights(); void LoadWeights();
/*! \brief Load query boundaries from file */ /*! \brief Load query boundaries from file */
...@@ -190,8 +184,6 @@ private: ...@@ -190,8 +184,6 @@ private:
void LoadQueryWeights(); void LoadQueryWeights();
/*! \brief Filename of current data */ /*! \brief Filename of current data */
const char* data_filename_; const char* data_filename_;
/*! \brief Filename of initial scores */
const char* init_score_filename_;
/*! \brief Number of data */ /*! \brief Number of data */
data_size_t num_data_; data_size_t num_data_;
/*! \brief Number of classes */ /*! \brief Number of classes */
...@@ -251,79 +243,16 @@ using PredictFunction = ...@@ -251,79 +243,16 @@ using PredictFunction =
*/ */
class Dataset { class Dataset {
public: public:
/*! friend DatasetLoader;
* \brief Constructor
* \param data_filename Filename of dataset
* \param init_score_filename Filename of initial score
* \param io_config configs for IO
* \param predict_fun Used for initial model, will give a prediction score based on this function, then set as initial score
*/
Dataset(const char* data_filename, const char* init_score_filename,
const IOConfig& io_config, const PredictFunction& predict_fun);
/*! Dataset();
* \brief Constructor
* \param data_filename Filename of dataset
* \param io_config configs for IO
* \param predict_fun Used for initial model, will give a prediction score based on this function, then set as initial score
*/
Dataset(const char* data_filename,
const IOConfig& io_config, const PredictFunction& predict_fun)
: Dataset(data_filename, "", io_config, predict_fun) {
}
/*!
* \brief Constructor, without filename, used to load data from memory
* \param io_config configs for IO
* \param predict_fun Used for initial model, will give a prediction score based on this function, then set as initial score
*/
Dataset(const IOConfig& io_config, const PredictFunction& predict_fun);
/*! \brief Destructor */ /*! \brief Destructor */
~Dataset(); ~Dataset();
/*! \brief Init Dataset with specific binmapper */
void InitByBinMapper(std::vector<const BinMapper*> bin_mappers, data_size_t num_data);
/*! \brief push raw data into dataset */
void PushData(const std::vector<std::vector<std::pair<int, float>>>& datas, data_size_t start_idx, bool is_finished);
void SetField(const char* field_name, const void* field_data, data_size_t num_element, int type); void SetField(const char* field_name, const void* field_data, data_size_t num_element, int type);
/*!
* \brief Load training data on parallel training
* \param rank Rank of local machine
* \param num_machines Total number of all machines
* \param is_pre_partition True if data file is pre-partitioned
* \param use_two_round_loading True if need to use two round loading
*/
void LoadTrainData(int rank, int num_machines, bool is_pre_partition,
bool use_two_round_loading);
/*!
* \brief Load training data on single machine training
* \param use_two_round_loading True if need to use two round loading
*/
inline void LoadTrainData(bool use_two_round_loading) {
LoadTrainData(0, 1, false, use_two_round_loading);
}
/*!
* \brief Load data and use bin mapper from other data set, general this function is used to extract feature for validation data
* \param train_set Other loaded data set
* \param use_two_round_loading True if need to use two round loading
*/
void LoadValidationData(const Dataset* train_set, bool use_two_round_loading);
/*!
* \brief Load data set from binary file
* \param bin_filename filename of bin data
* \param rank Rank of local machine
* \param num_machines Total number of all machines
* \param is_pre_partition True if data file is pre-partitioned
*/
void LoadDataFromBinFile(const char* bin_filename, int rank, int num_machines, bool is_pre_partition);
/*! /*!
* \brief Save current dataset into binary file, will save to "filename.bin" * \brief Save current dataset into binary file, will save to "filename.bin"
*/ */
...@@ -331,6 +260,8 @@ public: ...@@ -331,6 +260,8 @@ public:
std::vector<const BinMapper*> GetBinMappers() const; std::vector<const BinMapper*> GetBinMappers() const;
void CopyFeatureMetadataTo(Dataset *dataset, bool is_enable_sparse) const;
/*! /*!
* \brief Get a feature pointer for specific index * \brief Get a feature pointer for specific index
* \param i Index for feature * \param i Index for feature
...@@ -365,57 +296,7 @@ public: ...@@ -365,57 +296,7 @@ public:
Dataset(const Dataset&) = delete; Dataset(const Dataset&) = delete;
private: private:
/*!
* \brief Load data content on memory. if num_machines > 1 and !is_pre_partition, will partition data
* \param rank Rank of local machine
* \param num_machines Total number of all machines
* \param is_pre_partition True if data file is pre-partitioned
*/
void LoadDataToMemory(int rank, int num_machines, bool is_pre_partition);
/*!
* \brief Sample data from memory, need load data to memory first
* \param out_data Store the sampled data
*/
void SampleDataFromMemory(std::vector<std::string>* out_data);
/*!
* \brief Sample data from file
* \param rank Rank of local machine
* \param num_machines Total number of all machines
* \param is_pre_partition True if data file is pre-partitioned
* \param out_data Store the sampled data
*/
void SampleDataFromFile(int rank, int num_machines,
bool is_pre_partition, std::vector<std::string>* out_data);
/*!
* \brief Get feature bin mapper from sampled data.
* if num_machines > 1, differnt machines will construct bin mapper for different features, then have a global sync up
* \param rank Rank of local machine
* \param num_machines Total number of all machines
*/
void ConstructBinMappers(int rank, int num_machines,
const std::vector<std::string>& sample_data);
/*! \brief Extract local features from memory */
void ExtractFeaturesFromMemory();
/*! \brief Extract local features from file */
void ExtractFeaturesFromFile();
/*! \brief Check can load from binary file */
void CheckCanLoadFromBin();
/*! \brief Check this data set is null or not */
void CheckDataset();
/*! \brief Filename of data */
const char* data_filename_; const char* data_filename_;
/*! \brief A reader class that can read text data */
TextReader<data_size_t>* text_reader_;
/*! \brief A parser class that can parse data */
Parser* parser_;
/*! \brief Store used features */ /*! \brief Store used features */
std::vector<Feature*> features_; std::vector<Feature*> features_;
/*! \brief Mapper from real feature index to used index*/ /*! \brief Mapper from real feature index to used index*/
...@@ -430,32 +311,12 @@ private: ...@@ -430,32 +311,12 @@ private:
int num_class_; int num_class_;
/*! \brief Store some label level data*/ /*! \brief Store some label level data*/
Metadata metadata_; Metadata metadata_;
/*! \brief Random generator*/
Random random_;
/*! \brief The maximal number of bin that feature values will bucket in */
int max_bin_;
/*! \brief True if enable sparse */
bool is_enable_sparse_;
/*! \brief True if dataset is loaded from binary file */ /*! \brief True if dataset is loaded from binary file */
bool is_loading_from_binfile_; bool is_loading_from_binfile_;
/*! \brief Number of global data, used for distributed learning */
size_t global_num_data_ = 0;
/*! \brief used to local used data indices */
std::vector<data_size_t> used_data_indices_;
/*! \brief prediction function for initial model */
const PredictFunction& predict_fun_;
/*! \brief index of label column */ /*! \brief index of label column */
int label_idx_ = 0; int label_idx_ = 0;
/*! \brief index of weight column */
int weight_idx_ = -1;
/*! \brief index of group column */
int group_idx_ = -1;
/*! \brief Mapper from real feature index to used index*/
std::unordered_set<int> ignore_features_;
/*! \brief store feature names */ /*! \brief store feature names */
std::vector<std::string> feature_names_; std::vector<std::string> feature_names_;
/*! \brief store feature names */
int bin_construct_sample_cnt_;
}; };
} // namespace LightGBM } // namespace LightGBM
......
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
#include <LightGBM/network.h> #include <LightGBM/network.h>
#include <LightGBM/dataset.h> #include <LightGBM/dataset.h>
#include <LightGBM/dataset_loader.h>
#include <LightGBM/boosting.h> #include <LightGBM/boosting.h>
#include <LightGBM/objective_function.h> #include <LightGBM/objective_function.h>
#include <LightGBM/metric.h> #include <LightGBM/metric.h>
...@@ -26,7 +27,7 @@ ...@@ -26,7 +27,7 @@
namespace LightGBM { namespace LightGBM {
Application::Application(int argc, char** argv) Application::Application(int argc, char** argv)
:train_data_(nullptr), boosting_(nullptr), objective_fun_(nullptr) { :dataset_loader_(nullptr), train_data_(nullptr), boosting_(nullptr), objective_fun_(nullptr) {
LoadParameters(argc, argv); LoadParameters(argc, argv);
// set number of threads for openmp // set number of threads for openmp
if (config_.num_threads > 0) { if (config_.num_threads > 0) {
...@@ -35,6 +36,7 @@ Application::Application(int argc, char** argv) ...@@ -35,6 +36,7 @@ Application::Application(int argc, char** argv)
} }
Application::~Application() { Application::~Application() {
if (dataset_loader_ != nullptr) { delete dataset_loader_; }
if (train_data_ != nullptr) { delete train_data_; } if (train_data_ != nullptr) { delete train_data_; }
for (auto& data : valid_datas_) { for (auto& data : valid_datas_) {
if (data != nullptr) { delete data; } if (data != nullptr) { delete data; }
...@@ -141,19 +143,17 @@ void Application::LoadData() { ...@@ -141,19 +143,17 @@ void Application::LoadData() {
config_.io_config.data_random_seed = config_.io_config.data_random_seed =
GlobalSyncUpByMin<int>(config_.io_config.data_random_seed); GlobalSyncUpByMin<int>(config_.io_config.data_random_seed);
} }
train_data_ = new Dataset(config_.io_config.data_filename.c_str(),
config_.io_config.input_init_score.c_str(), dataset_loader_ = new DatasetLoader(config_.io_config, predict_fun);
config_.io_config, dataset_loader_->SetHeadder(config_.io_config.data_filename.c_str());
predict_fun);
// load Training data // load Training data
if (config_.is_parallel_find_bin) { if (config_.is_parallel_find_bin) {
// load data for parallel training // load data for parallel training
train_data_->LoadTrainData(Network::rank(), Network::num_machines(), train_data_ = dataset_loader_->LoadFromFile(config_.io_config.data_filename.c_str(),
config_.io_config.is_pre_partition, Network::rank(), Network::num_machines());
config_.io_config.use_two_round_loading);
} else { } else {
// load data for single machine // load data for single machine
train_data_->LoadTrainData(config_.io_config.use_two_round_loading); train_data_ = dataset_loader_->LoadFromFile(config_.io_config.data_filename.c_str(), 0, 1);
} }
// need save binary file // need save binary file
if (config_.io_config.is_save_binary_file) { if (config_.io_config.is_save_binary_file) {
...@@ -173,13 +173,8 @@ void Application::LoadData() { ...@@ -173,13 +173,8 @@ void Application::LoadData() {
// Add validation data, if it exists // Add validation data, if it exists
for (size_t i = 0; i < config_.io_config.valid_data_filenames.size(); ++i) { for (size_t i = 0; i < config_.io_config.valid_data_filenames.size(); ++i) {
// add // add
valid_datas_.push_back( valid_datas_.push_back(dataset_loader_->LoadFromFileLikeOthers(config_.io_config.valid_data_filenames[i].c_str(),
new Dataset(config_.io_config.valid_data_filenames[i].c_str(), train_data_));
config_.io_config,
predict_fun));
// load validation data like train data
valid_datas_.back()->LoadValidationData(train_data_,
config_.io_config.use_two_round_loading);
// need save binary file // need save binary file
if (config_.io_config.is_save_binary_file) { if (config_.io_config.is_save_binary_file) {
valid_datas_.back()->SaveBinaryFile(nullptr); valid_datas_.back()->SaveBinaryFile(nullptr);
......
#include <LightGBM/c_api.h> #include <LightGBM/c_api.h>
#include <LightGBM/dataset.h> #include <LightGBM/dataset.h>
#include <LightGBM/boosting.h> #include <LightGBM/boosting.h>
...@@ -10,6 +9,7 @@ ...@@ -10,6 +9,7 @@
#include <vector> #include <vector>
#include <string> #include <string>
#include <cstring> #include <cstring>
#include <memory>
namespace LightGBM { namespace LightGBM {
...@@ -100,3 +100,6 @@ private: ...@@ -100,3 +100,6 @@ private:
}; };
} }
using namespace LightGBM;
...@@ -202,7 +202,6 @@ void IOConfig::Set(const std::unordered_map<std::string, std::string>& params) { ...@@ -202,7 +202,6 @@ void IOConfig::Set(const std::unordered_map<std::string, std::string>& params) {
GetString(params, "output_model", &output_model); GetString(params, "output_model", &output_model);
GetString(params, "input_model", &input_model); GetString(params, "input_model", &input_model);
GetString(params, "output_result", &output_result); GetString(params, "output_result", &output_result);
GetString(params, "input_init_score", &input_init_score);
std::string tmp_str = ""; std::string tmp_str = "";
if (GetString(params, "valid_data", &tmp_str)) { if (GetString(params, "valid_data", &tmp_str)) {
valid_data_filenames = Common::Split(tmp_str.c_str(), ','); valid_data_filenames = Common::Split(tmp_str.c_str(), ',');
......
This diff is collapsed.
This diff is collapsed.
...@@ -14,9 +14,8 @@ Metadata::Metadata() ...@@ -14,9 +14,8 @@ Metadata::Metadata()
} }
void Metadata::Init(const char * data_filename, const char* init_score_filename, const int num_class) { void Metadata::Init(const char * data_filename, const int num_class) {
data_filename_ = data_filename; data_filename_ = data_filename;
init_score_filename_ = init_score_filename;
num_class_ = num_class; num_class_ = num_class;
// for lambdarank, it needs query data for partition data in parallel learning // for lambdarank, it needs query data for partition data in parallel learning
LoadQueryBoundaries(); LoadQueryBoundaries();
...@@ -25,11 +24,6 @@ void Metadata::Init(const char * data_filename, const char* init_score_filename, ...@@ -25,11 +24,6 @@ void Metadata::Init(const char * data_filename, const char* init_score_filename,
LoadInitialScore(); LoadInitialScore();
} }
void Metadata::Init(const char* init_score_filename, const int num_class) {
init_score_filename_ = init_score_filename;
num_class_ = num_class;
LoadInitialScore();
}
Metadata::~Metadata() { Metadata::~Metadata() {
...@@ -294,10 +288,14 @@ void Metadata::LoadWeights() { ...@@ -294,10 +288,14 @@ void Metadata::LoadWeights() {
void Metadata::LoadInitialScore() { void Metadata::LoadInitialScore() {
num_init_score_ = 0; num_init_score_ = 0;
if (init_score_filename_[0] == '\0') { return; } std::string init_score_filename(data_filename_);
TextReader<size_t> reader(init_score_filename_, false); // default weight file name
init_score_filename.append(".init");
TextReader<size_t> reader(init_score_filename.c_str(), false);
reader.ReadAllLines(); reader.ReadAllLines();
if (reader.Lines().size() <= 0) {
return;
}
Log::Info("Loading initial scores..."); Log::Info("Loading initial scores...");
num_init_score_ = static_cast<data_size_t>(reader.Lines().size()); num_init_score_ = static_cast<data_size_t>(reader.Lines().size());
......
...@@ -161,6 +161,7 @@ ...@@ -161,6 +161,7 @@
<ClInclude Include="..\include\LightGBM\config.h" /> <ClInclude Include="..\include\LightGBM\config.h" />
<ClInclude Include="..\include\LightGBM\c_api.h" /> <ClInclude Include="..\include\LightGBM\c_api.h" />
<ClInclude Include="..\include\LightGBM\dataset.h" /> <ClInclude Include="..\include\LightGBM\dataset.h" />
<ClInclude Include="..\include\LightGBM\dataset_loader.h" />
<ClInclude Include="..\include\LightGBM\feature.h" /> <ClInclude Include="..\include\LightGBM\feature.h" />
<ClInclude Include="..\include\LightGBM\meta.h" /> <ClInclude Include="..\include\LightGBM\meta.h" />
<ClInclude Include="..\include\LightGBM\metric.h" /> <ClInclude Include="..\include\LightGBM\metric.h" />
...@@ -208,6 +209,7 @@ ...@@ -208,6 +209,7 @@
<ClCompile Include="..\src\io\bin.cpp" /> <ClCompile Include="..\src\io\bin.cpp" />
<ClCompile Include="..\src\io\config.cpp" /> <ClCompile Include="..\src\io\config.cpp" />
<ClCompile Include="..\src\io\dataset.cpp" /> <ClCompile Include="..\src\io\dataset.cpp" />
<ClCompile Include="..\src\io\dataset_loader.cpp" />
<ClCompile Include="..\src\io\metadata.cpp" /> <ClCompile Include="..\src\io\metadata.cpp" />
<ClCompile Include="..\src\io\parser.cpp" /> <ClCompile Include="..\src\io\parser.cpp" />
<ClCompile Include="..\src\io\tree.cpp" /> <ClCompile Include="..\src\io\tree.cpp" />
......
...@@ -168,6 +168,9 @@ ...@@ -168,6 +168,9 @@
<ClInclude Include="..\include\LightGBM\c_api.h"> <ClInclude Include="..\include\LightGBM\c_api.h">
<Filter>include\LightGBM</Filter> <Filter>include\LightGBM</Filter>
</ClInclude> </ClInclude>
<ClInclude Include="..\include\LightGBM\dataset_loader.h">
<Filter>include\LightGBM</Filter>
</ClInclude>
</ItemGroup> </ItemGroup>
<ItemGroup> <ItemGroup>
<ClCompile Include="..\src\application\application.cpp"> <ClCompile Include="..\src\application\application.cpp">
...@@ -236,5 +239,8 @@ ...@@ -236,5 +239,8 @@
<ClCompile Include="..\src\c_api.cpp"> <ClCompile Include="..\src\c_api.cpp">
<Filter>src</Filter> <Filter>src</Filter>
</ClCompile> </ClCompile>
<ClCompile Include="..\src\io\dataset_loader.cpp">
<Filter>src\io</Filter>
</ClCompile>
</ItemGroup> </ItemGroup>
</Project> </Project>
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment