Commit 1c08e71e authored by Guolin Ke
Browse files

use dataset_loader to load data

parent 8696709e
......@@ -8,6 +8,7 @@
namespace LightGBM {
class DatasetLoader;
class Dataset;
class Boosting;
class ObjectiveFunction;
......@@ -59,6 +60,8 @@ private:
/*! \brief All configs */
OverallConfig config_;
/*! \brief Dataset loader */
DatasetLoader* dataset_loader_;
/*! \brief Training data */
Dataset* train_data_;
/*! \brief Validation data */
......
......@@ -93,7 +93,6 @@ public:
std::string output_model = "LightGBM_model.txt";
std::string output_result = "LightGBM_predict_result.txt";
std::string input_model = "";
std::string input_init_score = "";
int verbosity = 1;
int num_model_predict = -1;
bool is_pre_partition = false;
......@@ -318,7 +317,6 @@ struct ParameterAlias {
{ "model_out", "output_model" },
{ "model_input", "input_model" },
{ "model_in", "input_model" },
{ "init_score", "input_init_score"},
{ "predict_result", "output_result" },
{ "prediction_result", "output_result" },
{ "valid", "valid_data" },
......
#ifndef LIGHTGBM_DATA_H_
#define LIGHTGBM_DATA_H_
#ifndef LIGHTGBM_DATASET_H_
#define LIGHTGBM_DATASET_H_
#include <LightGBM/utils/random.h>
#include <LightGBM/utils/text_reader.h>
......@@ -18,6 +18,7 @@ namespace LightGBM {
/*! \brief forward declaration */
class Feature;
class BinMapper;
class DatasetLoader;
/*!
* \brief This class is used to store some meta(non-feature) data for training data,
......@@ -44,13 +45,7 @@ public:
* \param init_score_filename Filename of initial score
* \param num_class Number of classes
*/
void Init(const char* data_filename, const char* init_score_filename, const int num_class);
/*!
* \brief Initialize, only load initial score
* \param init_score_filename Filename of initial score
* \param num_class Number of classes
*/
void Init(const char* init_score_filename, const int num_class);
void Init(const char* data_filename, const int num_class);
/*!
* \brief Initial with binary memory
* \param memory Pointer to memory
......@@ -177,11 +172,10 @@ public:
* \return Pointer of initial scores
*/
inline const float* init_score() const { return init_score_; }
/*! \brief Load initial scores from file */
void LoadInitialScore();
private:
/*! \brief Load initial scores from file */
void LoadInitialScore();
/*! \brief Load weights from file */
void LoadWeights();
/*! \brief Load query boundaries from file */
......@@ -190,8 +184,6 @@ private:
void LoadQueryWeights();
/*! \brief Filename of current data */
const char* data_filename_;
/*! \brief Filename of initial scores */
const char* init_score_filename_;
/*! \brief Number of data */
data_size_t num_data_;
/*! \brief Number of classes */
......@@ -251,79 +243,16 @@ using PredictFunction =
*/
class Dataset {
public:
/*!
* \brief Constructor
* \param data_filename Filename of dataset
* \param init_score_filename Filename of initial score
* \param io_config configs for IO
* \param predict_fun Used for initial model, will give a prediction score based on this function, then set as initial score
*/
Dataset(const char* data_filename, const char* init_score_filename,
const IOConfig& io_config, const PredictFunction& predict_fun);
friend DatasetLoader;
/*!
* \brief Constructor
* \param data_filename Filename of dataset
* \param io_config configs for IO
* \param predict_fun Used for initial model, will give a prediction score based on this function, then set as initial score
*/
Dataset(const char* data_filename,
const IOConfig& io_config, const PredictFunction& predict_fun)
: Dataset(data_filename, "", io_config, predict_fun) {
}
/*!
* \brief Constructor, without filename, used to load data from memory
* \param io_config configs for IO
* \param predict_fun Used for initial model, will give a prediction score based on this function, then set as initial score
*/
Dataset(const IOConfig& io_config, const PredictFunction& predict_fun);
Dataset();
/*! \brief Destructor */
~Dataset();
/*! \brief Init Dataset with specific binmapper */
void InitByBinMapper(std::vector<const BinMapper*> bin_mappers, data_size_t num_data);
/*! \brief push raw data into dataset */
void PushData(const std::vector<std::vector<std::pair<int, float>>>& datas, data_size_t start_idx, bool is_finished);
void SetField(const char* field_name, const void* field_data, data_size_t num_element, int type);
/*!
* \brief Load training data on parallel training
* \param rank Rank of local machine
* \param num_machines Total number of all machines
* \param is_pre_partition True if data file is pre-partitioned
* \param use_two_round_loading True if need to use two round loading
*/
void LoadTrainData(int rank, int num_machines, bool is_pre_partition,
bool use_two_round_loading);
/*!
* \brief Load training data on single machine training
* \param use_two_round_loading True if need to use two round loading
*/
inline void LoadTrainData(bool use_two_round_loading) {
LoadTrainData(0, 1, false, use_two_round_loading);
}
/*!
* \brief Load data and use the bin mappers from another data set; generally this function is used to extract features for validation data
* \param train_set Other loaded data set
* \param use_two_round_loading True if need to use two round loading
*/
void LoadValidationData(const Dataset* train_set, bool use_two_round_loading);
/*!
* \brief Load data set from binary file
* \param bin_filename filename of bin data
* \param rank Rank of local machine
* \param num_machines Total number of all machines
* \param is_pre_partition True if data file is pre-partitioned
*/
void LoadDataFromBinFile(const char* bin_filename, int rank, int num_machines, bool is_pre_partition);
/*!
* \brief Save current dataset into binary file, will save to "filename.bin"
*/
......@@ -331,6 +260,8 @@ public:
std::vector<const BinMapper*> GetBinMappers() const;
void CopyFeatureMetadataTo(Dataset *dataset, bool is_enable_sparse) const;
/*!
* \brief Get a feature pointer for specific index
* \param i Index for feature
......@@ -365,57 +296,7 @@ public:
Dataset(const Dataset&) = delete;
private:
/*!
* \brief Load data content on memory. if num_machines > 1 and !is_pre_partition, will partition data
* \param rank Rank of local machine
* \param num_machines Total number of all machines
* \param is_pre_partition True if data file is pre-partitioned
*/
void LoadDataToMemory(int rank, int num_machines, bool is_pre_partition);
/*!
* \brief Sample data from memory, need load data to memory first
* \param out_data Store the sampled data
*/
void SampleDataFromMemory(std::vector<std::string>* out_data);
/*!
* \brief Sample data from file
* \param rank Rank of local machine
* \param num_machines Total number of all machines
* \param is_pre_partition True if data file is pre-partitioned
* \param out_data Store the sampled data
*/
void SampleDataFromFile(int rank, int num_machines,
bool is_pre_partition, std::vector<std::string>* out_data);
/*!
* \brief Get feature bin mapper from sampled data.
* if num_machines > 1, different machines will construct bin mapper for different features, then have a global sync up
* \param rank Rank of local machine
* \param num_machines Total number of all machines
*/
void ConstructBinMappers(int rank, int num_machines,
const std::vector<std::string>& sample_data);
/*! \brief Extract local features from memory */
void ExtractFeaturesFromMemory();
/*! \brief Extract local features from file */
void ExtractFeaturesFromFile();
/*! \brief Check can load from binary file */
void CheckCanLoadFromBin();
/*! \brief Check this data set is null or not */
void CheckDataset();
/*! \brief Filename of data */
const char* data_filename_;
/*! \brief A reader class that can read text data */
TextReader<data_size_t>* text_reader_;
/*! \brief A parser class that can parse data */
Parser* parser_;
/*! \brief Store used features */
std::vector<Feature*> features_;
/*! \brief Mapper from real feature index to used index*/
......@@ -430,32 +311,12 @@ private:
int num_class_;
/*! \brief Store some label level data*/
Metadata metadata_;
/*! \brief Random generator*/
Random random_;
/*! \brief The maximal number of bin that feature values will bucket in */
int max_bin_;
/*! \brief True if enable sparse */
bool is_enable_sparse_;
/*! \brief True if dataset is loaded from binary file */
bool is_loading_from_binfile_;
/*! \brief Number of global data, used for distributed learning */
size_t global_num_data_ = 0;
/*! \brief Indices of the data used locally */
std::vector<data_size_t> used_data_indices_;
/*! \brief prediction function for initial model */
const PredictFunction& predict_fun_;
/*! \brief index of label column */
int label_idx_ = 0;
/*! \brief index of weight column */
int weight_idx_ = -1;
/*! \brief index of group column */
int group_idx_ = -1;
/*! \brief Set of feature indices to ignore */
std::unordered_set<int> ignore_features_;
/*! \brief store feature names */
std::vector<std::string> feature_names_;
/*! \brief Sample count used for bin construction */
int bin_construct_sample_cnt_;
};
} // namespace LightGBM
......
......@@ -5,6 +5,7 @@
#include <LightGBM/network.h>
#include <LightGBM/dataset.h>
#include <LightGBM/dataset_loader.h>
#include <LightGBM/boosting.h>
#include <LightGBM/objective_function.h>
#include <LightGBM/metric.h>
......@@ -26,7 +27,7 @@
namespace LightGBM {
Application::Application(int argc, char** argv)
:train_data_(nullptr), boosting_(nullptr), objective_fun_(nullptr) {
:dataset_loader_(nullptr), train_data_(nullptr), boosting_(nullptr), objective_fun_(nullptr) {
LoadParameters(argc, argv);
// set number of threads for openmp
if (config_.num_threads > 0) {
......@@ -35,6 +36,7 @@ Application::Application(int argc, char** argv)
}
Application::~Application() {
if (dataset_loader_ != nullptr) { delete dataset_loader_; }
if (train_data_ != nullptr) { delete train_data_; }
for (auto& data : valid_datas_) {
if (data != nullptr) { delete data; }
......@@ -141,19 +143,17 @@ void Application::LoadData() {
config_.io_config.data_random_seed =
GlobalSyncUpByMin<int>(config_.io_config.data_random_seed);
}
train_data_ = new Dataset(config_.io_config.data_filename.c_str(),
config_.io_config.input_init_score.c_str(),
config_.io_config,
predict_fun);
dataset_loader_ = new DatasetLoader(config_.io_config, predict_fun);
dataset_loader_->SetHeadder(config_.io_config.data_filename.c_str());
// load Training data
if (config_.is_parallel_find_bin) {
// load data for parallel training
train_data_->LoadTrainData(Network::rank(), Network::num_machines(),
config_.io_config.is_pre_partition,
config_.io_config.use_two_round_loading);
train_data_ = dataset_loader_->LoadFromFile(config_.io_config.data_filename.c_str(),
Network::rank(), Network::num_machines());
} else {
// load data for single machine
train_data_->LoadTrainData(config_.io_config.use_two_round_loading);
train_data_ = dataset_loader_->LoadFromFile(config_.io_config.data_filename.c_str(), 0, 1);
}
// need save binary file
if (config_.io_config.is_save_binary_file) {
......@@ -173,13 +173,8 @@ void Application::LoadData() {
// Add validation data, if it exists
for (size_t i = 0; i < config_.io_config.valid_data_filenames.size(); ++i) {
// add
valid_datas_.push_back(
new Dataset(config_.io_config.valid_data_filenames[i].c_str(),
config_.io_config,
predict_fun));
// load validation data like train data
valid_datas_.back()->LoadValidationData(train_data_,
config_.io_config.use_two_round_loading);
valid_datas_.push_back(dataset_loader_->LoadFromFileLikeOthers(config_.io_config.valid_data_filenames[i].c_str(),
train_data_));
// need save binary file
if (config_.io_config.is_save_binary_file) {
valid_datas_.back()->SaveBinaryFile(nullptr);
......
#include <LightGBM/c_api.h>
#include <LightGBM/dataset.h>
#include <LightGBM/boosting.h>
......@@ -10,6 +9,7 @@
#include <vector>
#include <string>
#include <cstring>
#include <memory>
namespace LightGBM {
......@@ -100,3 +100,6 @@ private:
};
}
using namespace LightGBM;
......@@ -202,7 +202,6 @@ void IOConfig::Set(const std::unordered_map<std::string, std::string>& params) {
GetString(params, "output_model", &output_model);
GetString(params, "input_model", &input_model);
GetString(params, "output_result", &output_result);
GetString(params, "input_init_score", &input_init_score);
std::string tmp_str = "";
if (GetString(params, "valid_data", &tmp_str)) {
valid_data_filenames = Common::Split(tmp_str.c_str(), ',');
......
This diff is collapsed.
This diff is collapsed.
......@@ -14,9 +14,8 @@ Metadata::Metadata()
}
void Metadata::Init(const char * data_filename, const char* init_score_filename, const int num_class) {
void Metadata::Init(const char * data_filename, const int num_class) {
data_filename_ = data_filename;
init_score_filename_ = init_score_filename;
num_class_ = num_class;
// for lambdarank, it needs query data for partition data in parallel learning
LoadQueryBoundaries();
......@@ -25,11 +24,6 @@ void Metadata::Init(const char * data_filename, const char* init_score_filename,
LoadInitialScore();
}
/*!
* \brief Initialize from an initial score file only: records the filename and
*        the number of classes, then calls LoadInitialScore()
* \param init_score_filename Filename of initial score
* \param num_class Number of classes
*/
void Metadata::Init(const char* init_score_filename, const int num_class) {
// only the pointer is stored (the member is a const char*), not a copy of the string
init_score_filename_ = init_score_filename;
num_class_ = num_class;
LoadInitialScore();
}
Metadata::~Metadata() {
......@@ -294,10 +288,14 @@ void Metadata::LoadWeights() {
void Metadata::LoadInitialScore() {
num_init_score_ = 0;
if (init_score_filename_[0] == '\0') { return; }
TextReader<size_t> reader(init_score_filename_, false);
std::string init_score_filename(data_filename_);
// default initial score file name
init_score_filename.append(".init");
TextReader<size_t> reader(init_score_filename.c_str(), false);
reader.ReadAllLines();
if (reader.Lines().size() <= 0) {
return;
}
Log::Info("Loading initial scores...");
num_init_score_ = static_cast<data_size_t>(reader.Lines().size());
......
......@@ -161,6 +161,7 @@
<ClInclude Include="..\include\LightGBM\config.h" />
<ClInclude Include="..\include\LightGBM\c_api.h" />
<ClInclude Include="..\include\LightGBM\dataset.h" />
<ClInclude Include="..\include\LightGBM\dataset_loader.h" />
<ClInclude Include="..\include\LightGBM\feature.h" />
<ClInclude Include="..\include\LightGBM\meta.h" />
<ClInclude Include="..\include\LightGBM\metric.h" />
......@@ -208,6 +209,7 @@
<ClCompile Include="..\src\io\bin.cpp" />
<ClCompile Include="..\src\io\config.cpp" />
<ClCompile Include="..\src\io\dataset.cpp" />
<ClCompile Include="..\src\io\dataset_loader.cpp" />
<ClCompile Include="..\src\io\metadata.cpp" />
<ClCompile Include="..\src\io\parser.cpp" />
<ClCompile Include="..\src\io\tree.cpp" />
......
......@@ -168,6 +168,9 @@
<ClInclude Include="..\include\LightGBM\c_api.h">
<Filter>include\LightGBM</Filter>
</ClInclude>
<ClInclude Include="..\include\LightGBM\dataset_loader.h">
<Filter>include\LightGBM</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\src\application\application.cpp">
......@@ -236,5 +239,8 @@
<ClCompile Include="..\src\c_api.cpp">
<Filter>src</Filter>
</ClCompile>
<ClCompile Include="..\src\io\dataset_loader.cpp">
<Filter>src\io</Filter>
</ClCompile>
</ItemGroup>
</Project>
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment