Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tianlh
LightGBM-DCU
Commits
1c08e71e
Commit
1c08e71e
authored
Nov 04, 2016
by
Guolin Ke
Browse files
use dataset_loader to load data
parent
8696709e
Changes
11
Hide whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
898 additions
and
1067 deletions
+898
-1067
include/LightGBM/application.h
include/LightGBM/application.h
+3
-0
include/LightGBM/config.h
include/LightGBM/config.h
+0
-2
include/LightGBM/dataset.h
include/LightGBM/dataset.h
+10
-149
src/application/application.cpp
src/application/application.cpp
+11
-16
src/c_api.cpp
src/c_api.cpp
+4
-1
src/io/config.cpp
src/io/config.cpp
+0
-1
src/io/dataset.cpp
src/io/dataset.cpp
+21
-888
src/io/dataset_loader.cpp
src/io/dataset_loader.cpp
+833
-0
src/io/metadata.cpp
src/io/metadata.cpp
+8
-10
windows/LightGBM.vcxproj
windows/LightGBM.vcxproj
+2
-0
windows/LightGBM.vcxproj.filters
windows/LightGBM.vcxproj.filters
+6
-0
No files found.
include/LightGBM/application.h
View file @
1c08e71e
...
...
@@ -8,6 +8,7 @@
namespace
LightGBM
{
class
DatasetLoader
;
class
Dataset
;
class
Boosting
;
class
ObjectiveFunction
;
...
...
@@ -59,6 +60,8 @@ private:
/*! \brief All configs */
OverallConfig
config_
;
/*! \brief Dataset loader */
DatasetLoader
*
dataset_loader_
;
/*! \brief Training data */
Dataset
*
train_data_
;
/*! \brief Validation data */
...
...
include/LightGBM/config.h
View file @
1c08e71e
...
...
@@ -93,7 +93,6 @@ public:
std
::
string
output_model
=
"LightGBM_model.txt"
;
std
::
string
output_result
=
"LightGBM_predict_result.txt"
;
std
::
string
input_model
=
""
;
std
::
string
input_init_score
=
""
;
int
verbosity
=
1
;
int
num_model_predict
=
-
1
;
bool
is_pre_partition
=
false
;
...
...
@@ -318,7 +317,6 @@ struct ParameterAlias {
{
"model_out"
,
"output_model"
},
{
"model_input"
,
"input_model"
},
{
"model_in"
,
"input_model"
},
{
"init_score"
,
"input_init_score"
},
{
"predict_result"
,
"output_result"
},
{
"prediction_result"
,
"output_result"
},
{
"valid"
,
"valid_data"
},
...
...
include/LightGBM/dataset.h
View file @
1c08e71e
#ifndef LIGHTGBM_DATA_H_
#define LIGHTGBM_DATA_H_
#ifndef LIGHTGBM_DATA
SET
_H_
#define LIGHTGBM_DATA
SET
_H_
#include <LightGBM/utils/random.h>
#include <LightGBM/utils/text_reader.h>
...
...
@@ -18,6 +18,7 @@ namespace LightGBM {
/*! \brief forward declaration */
class
Feature
;
class
BinMapper
;
class
DatasetLoader
;
/*!
* \brief This class is used to store some meta(non-feature) data for training data,
...
...
@@ -44,13 +45,7 @@ public:
* \param init_score_filename Filename of initial score
* \param num_class Number of classes
*/
void
Init
(
const
char
*
data_filename
,
const
char
*
init_score_filename
,
const
int
num_class
);
/*!
* \brief Initialize, only load initial score
* \param init_score_filename Filename of initial score
* \param num_class Number of classes
*/
void
Init
(
const
char
*
init_score_filename
,
const
int
num_class
);
void
Init
(
const
char
*
data_filename
,
const
int
num_class
);
/*!
* \brief Initial with binary memory
* \param memory Pointer to memory
...
...
@@ -177,11 +172,10 @@ public:
* \return Pointer of initial scores
*/
inline
const
float
*
init_score
()
const
{
return
init_score_
;
}
/*! \brief Load initial scores from file */
void
LoadInitialScore
();
private:
/*! \brief Load initial scores from file */
void
LoadInitialScore
();
/*! \brief Load wights from file */
void
LoadWeights
();
/*! \brief Load query boundaries from file */
...
...
@@ -190,8 +184,6 @@ private:
void
LoadQueryWeights
();
/*! \brief Filename of current data */
const
char
*
data_filename_
;
/*! \brief Filename of initial scores */
const
char
*
init_score_filename_
;
/*! \brief Number of data */
data_size_t
num_data_
;
/*! \brief Number of classes */
...
...
@@ -251,79 +243,16 @@ using PredictFunction =
*/
class
Dataset
{
public:
/*!
* \brief Constructor
* \param data_filename Filename of dataset
* \param init_score_filename Filename of initial score
* \param io_config configs for IO
* \param predict_fun Used for initial model, will give a prediction score based on this function, then set as initial score
*/
Dataset
(
const
char
*
data_filename
,
const
char
*
init_score_filename
,
const
IOConfig
&
io_config
,
const
PredictFunction
&
predict_fun
);
friend
DatasetLoader
;
/*!
* \brief Constructor
* \param data_filename Filename of dataset
* \param io_config configs for IO
* \param predict_fun Used for initial model, will give a prediction score based on this function, then set as initial score
*/
Dataset
(
const
char
*
data_filename
,
const
IOConfig
&
io_config
,
const
PredictFunction
&
predict_fun
)
:
Dataset
(
data_filename
,
""
,
io_config
,
predict_fun
)
{
}
/*!
* \brief Constructor, without filename, used to load data from memory
* \param io_config configs for IO
* \param predict_fun Used for initial model, will give a prediction score based on this function, then set as initial score
*/
Dataset
(
const
IOConfig
&
io_config
,
const
PredictFunction
&
predict_fun
);
Dataset
();
/*! \brief Destructor */
~
Dataset
();
/*! \brief Init Dataset with specific binmapper */
void
InitByBinMapper
(
std
::
vector
<
const
BinMapper
*>
bin_mappers
,
data_size_t
num_data
);
/*! \brief push raw data into dataset */
void
PushData
(
const
std
::
vector
<
std
::
vector
<
std
::
pair
<
int
,
float
>>>&
datas
,
data_size_t
start_idx
,
bool
is_finished
);
void
SetField
(
const
char
*
field_name
,
const
void
*
field_data
,
data_size_t
num_element
,
int
type
);
/*!
* \brief Load training data on parallel training
* \param rank Rank of local machine
* \param num_machines Total number of all machines
* \param is_pre_partition True if data file is pre-partitioned
* \param use_two_round_loading True if need to use two round loading
*/
void
LoadTrainData
(
int
rank
,
int
num_machines
,
bool
is_pre_partition
,
bool
use_two_round_loading
);
/*!
* \brief Load training data on single machine training
* \param use_two_round_loading True if need to use two round loading
*/
inline
void
LoadTrainData
(
bool
use_two_round_loading
)
{
LoadTrainData
(
0
,
1
,
false
,
use_two_round_loading
);
}
/*!
* \brief Load data and use bin mapper from other data set, general this function is used to extract feature for validation data
* \param train_set Other loaded data set
* \param use_two_round_loading True if need to use two round loading
*/
void
LoadValidationData
(
const
Dataset
*
train_set
,
bool
use_two_round_loading
);
/*!
* \brief Load data set from binary file
* \param bin_filename filename of bin data
* \param rank Rank of local machine
* \param num_machines Total number of all machines
* \param is_pre_partition True if data file is pre-partitioned
*/
void
LoadDataFromBinFile
(
const
char
*
bin_filename
,
int
rank
,
int
num_machines
,
bool
is_pre_partition
);
/*!
* \brief Save current dataset into binary file, will save to "filename.bin"
*/
...
...
@@ -331,6 +260,8 @@ public:
std
::
vector
<
const
BinMapper
*>
GetBinMappers
()
const
;
void
CopyFeatureMetadataTo
(
Dataset
*
dataset
,
bool
is_enable_sparse
)
const
;
/*!
* \brief Get a feature pointer for specific index
* \param i Index for feature
...
...
@@ -365,57 +296,7 @@ public:
Dataset
(
const
Dataset
&
)
=
delete
;
private:
/*!
* \brief Load data content on memory. if num_machines > 1 and !is_pre_partition, will partition data
* \param rank Rank of local machine
* \param num_machines Total number of all machines
* \param is_pre_partition True if data file is pre-partitioned
*/
void
LoadDataToMemory
(
int
rank
,
int
num_machines
,
bool
is_pre_partition
);
/*!
* \brief Sample data from memory, need load data to memory first
* \param out_data Store the sampled data
*/
void
SampleDataFromMemory
(
std
::
vector
<
std
::
string
>*
out_data
);
/*!
* \brief Sample data from file
* \param rank Rank of local machine
* \param num_machines Total number of all machines
* \param is_pre_partition True if data file is pre-partitioned
* \param out_data Store the sampled data
*/
void
SampleDataFromFile
(
int
rank
,
int
num_machines
,
bool
is_pre_partition
,
std
::
vector
<
std
::
string
>*
out_data
);
/*!
* \brief Get feature bin mapper from sampled data.
* if num_machines > 1, differnt machines will construct bin mapper for different features, then have a global sync up
* \param rank Rank of local machine
* \param num_machines Total number of all machines
*/
void
ConstructBinMappers
(
int
rank
,
int
num_machines
,
const
std
::
vector
<
std
::
string
>&
sample_data
);
/*! \brief Extract local features from memory */
void
ExtractFeaturesFromMemory
();
/*! \brief Extract local features from file */
void
ExtractFeaturesFromFile
();
/*! \brief Check can load from binary file */
void
CheckCanLoadFromBin
();
/*! \brief Check this data set is null or not */
void
CheckDataset
();
/*! \brief Filename of data */
const
char
*
data_filename_
;
/*! \brief A reader class that can read text data */
TextReader
<
data_size_t
>*
text_reader_
;
/*! \brief A parser class that can parse data */
Parser
*
parser_
;
/*! \brief Store used features */
std
::
vector
<
Feature
*>
features_
;
/*! \brief Mapper from real feature index to used index*/
...
...
@@ -430,32 +311,12 @@ private:
int
num_class_
;
/*! \brief Store some label level data*/
Metadata
metadata_
;
/*! \brief Random generator*/
Random
random_
;
/*! \brief The maximal number of bin that feature values will bucket in */
int
max_bin_
;
/*! \brief True if enable sparse */
bool
is_enable_sparse_
;
/*! \brief True if dataset is loaded from binary file */
bool
is_loading_from_binfile_
;
/*! \brief Number of global data, used for distributed learning */
size_t
global_num_data_
=
0
;
/*! \brief used to local used data indices */
std
::
vector
<
data_size_t
>
used_data_indices_
;
/*! \brief prediction function for initial model */
const
PredictFunction
&
predict_fun_
;
/*! \brief index of label column */
int
label_idx_
=
0
;
/*! \brief index of weight column */
int
weight_idx_
=
-
1
;
/*! \brief index of group column */
int
group_idx_
=
-
1
;
/*! \brief Mapper from real feature index to used index*/
std
::
unordered_set
<
int
>
ignore_features_
;
/*! \brief store feature names */
std
::
vector
<
std
::
string
>
feature_names_
;
/*! \brief store feature names */
int
bin_construct_sample_cnt_
;
};
}
// namespace LightGBM
...
...
src/application/application.cpp
View file @
1c08e71e
...
...
@@ -5,6 +5,7 @@
#include <LightGBM/network.h>
#include <LightGBM/dataset.h>
#include <LightGBM/dataset_loader.h>
#include <LightGBM/boosting.h>
#include <LightGBM/objective_function.h>
#include <LightGBM/metric.h>
...
...
@@ -26,7 +27,7 @@
namespace
LightGBM
{
Application
::
Application
(
int
argc
,
char
**
argv
)
:
train_data_
(
nullptr
),
boosting_
(
nullptr
),
objective_fun_
(
nullptr
)
{
:
dataset_loader_
(
nullptr
),
train_data_
(
nullptr
),
boosting_
(
nullptr
),
objective_fun_
(
nullptr
)
{
LoadParameters
(
argc
,
argv
);
// set number of threads for openmp
if
(
config_
.
num_threads
>
0
)
{
...
...
@@ -35,6 +36,7 @@ Application::Application(int argc, char** argv)
}
Application
::~
Application
()
{
if
(
dataset_loader_
!=
nullptr
)
{
delete
dataset_loader_
;
}
if
(
train_data_
!=
nullptr
)
{
delete
train_data_
;
}
for
(
auto
&
data
:
valid_datas_
)
{
if
(
data
!=
nullptr
)
{
delete
data
;
}
...
...
@@ -141,19 +143,17 @@ void Application::LoadData() {
config_
.
io_config
.
data_random_seed
=
GlobalSyncUpByMin
<
int
>
(
config_
.
io_config
.
data_random_seed
);
}
train_data_
=
new
Dataset
(
config_
.
io_config
.
data_filename
.
c_str
(),
config_
.
io_config
.
input_init_score
.
c_str
(),
config_
.
io_config
,
predict_fun
);
dataset_loader_
=
new
DatasetLoader
(
config_
.
io_config
,
predict_fun
);
dataset_loader_
->
SetHeadder
(
config_
.
io_config
.
data_filename
.
c_str
());
// load Training data
if
(
config_
.
is_parallel_find_bin
)
{
// load data for parallel training
train_data_
->
LoadTrainData
(
Network
::
rank
(),
Network
::
num_machines
(),
config_
.
io_config
.
is_pre_partition
,
config_
.
io_config
.
use_two_round_loading
);
train_data_
=
dataset_loader_
->
LoadFromFile
(
config_
.
io_config
.
data_filename
.
c_str
(),
Network
::
rank
(),
Network
::
num_machines
());
}
else
{
// load data for single machine
train_data_
->
LoadTrainData
(
config_
.
io_config
.
use_two_round_loading
);
train_data_
=
dataset_loader_
->
LoadFromFile
(
config_
.
io_config
.
data_filename
.
c_str
(),
0
,
1
);
}
// need save binary file
if
(
config_
.
io_config
.
is_save_binary_file
)
{
...
...
@@ -173,13 +173,8 @@ void Application::LoadData() {
// Add validation data, if it exists
for
(
size_t
i
=
0
;
i
<
config_
.
io_config
.
valid_data_filenames
.
size
();
++
i
)
{
// add
valid_datas_
.
push_back
(
new
Dataset
(
config_
.
io_config
.
valid_data_filenames
[
i
].
c_str
(),
config_
.
io_config
,
predict_fun
));
// load validation data like train data
valid_datas_
.
back
()
->
LoadValidationData
(
train_data_
,
config_
.
io_config
.
use_two_round_loading
);
valid_datas_
.
push_back
(
dataset_loader_
->
LoadFromFileLikeOthers
(
config_
.
io_config
.
valid_data_filenames
[
i
].
c_str
(),
train_data_
));
// need save binary file
if
(
config_
.
io_config
.
is_save_binary_file
)
{
valid_datas_
.
back
()
->
SaveBinaryFile
(
nullptr
);
...
...
src/c_api.cpp
View file @
1c08e71e
#include <LightGBM/c_api.h>
#include <LightGBM/dataset.h>
#include <LightGBM/boosting.h>
...
...
@@ -10,6 +9,7 @@
#include <vector>
#include <string>
#include <cstring>
#include <memory>
namespace
LightGBM
{
...
...
@@ -100,3 +100,6 @@ private:
};
}
using
namespace
LightGBM
;
src/io/config.cpp
View file @
1c08e71e
...
...
@@ -202,7 +202,6 @@ void IOConfig::Set(const std::unordered_map<std::string, std::string>& params) {
GetString
(
params
,
"output_model"
,
&
output_model
);
GetString
(
params
,
"input_model"
,
&
input_model
);
GetString
(
params
,
"output_result"
,
&
output_result
);
GetString
(
params
,
"input_init_score"
,
&
input_init_score
);
std
::
string
tmp_str
=
""
;
if
(
GetString
(
params
,
"valid_data"
,
&
tmp_str
))
{
valid_data_filenames
=
Common
::
Split
(
tmp_str
.
c_str
(),
','
);
...
...
src/io/dataset.cpp
View file @
1c08e71e
...
...
@@ -15,295 +15,32 @@
namespace
LightGBM
{
Dataset
::
Dataset
(
const
char
*
data_filename
,
const
char
*
init_score_filename
,
const
IOConfig
&
io_config
,
const
PredictFunction
&
predict_fun
)
:
data_filename_
(
data_filename
),
random_
(
io_config
.
data_random_seed
),
max_bin_
(
io_config
.
max_bin
),
is_enable_sparse_
(
io_config
.
is_enable_sparse
),
predict_fun_
(
predict_fun
),
bin_construct_sample_cnt_
(
io_config
.
bin_construct_sample_cnt
)
{
num_class_
=
io_config
.
num_class
;
if
(
io_config
.
enable_load_from_binary_file
)
{
CheckCanLoadFromBin
();
}
if
(
is_loading_from_binfile_
&&
predict_fun
!=
nullptr
)
{
Log
::
Info
(
"Cannot initialize prediction by using a binary file, using text file instead"
);
is_loading_from_binfile_
=
false
;
}
if
(
!
is_loading_from_binfile_
)
{
// load weight, query information and initialize score
metadata_
.
Init
(
data_filename
,
init_score_filename
,
num_class_
);
// create text reader
text_reader_
=
new
TextReader
<
data_size_t
>
(
data_filename
,
io_config
.
has_header
);
std
::
unordered_map
<
std
::
string
,
int
>
name2idx
;
// get column names
if
(
io_config
.
has_header
)
{
std
::
string
first_line
=
text_reader_
->
first_line
();
feature_names_
=
Common
::
Split
(
first_line
.
c_str
(),
"
\t
,"
);
for
(
size_t
i
=
0
;
i
<
feature_names_
.
size
();
++
i
)
{
name2idx
[
feature_names_
[
i
]]
=
static_cast
<
int
>
(
i
);
}
}
std
::
string
name_prefix
(
"name:"
);
// load label idx
if
(
io_config
.
label_column
.
size
()
>
0
)
{
if
(
Common
::
StartsWith
(
io_config
.
label_column
,
name_prefix
))
{
std
::
string
name
=
io_config
.
label_column
.
substr
(
name_prefix
.
size
());
if
(
name2idx
.
count
(
name
)
>
0
)
{
label_idx_
=
name2idx
[
name
];
Log
::
Info
(
"Using column %s as label"
,
name
.
c_str
());
}
else
{
Log
::
Fatal
(
"Could not find label column %s in data file"
,
name
.
c_str
());
}
}
else
{
if
(
!
Common
::
AtoiAndCheck
(
io_config
.
label_column
.
c_str
(),
&
label_idx_
))
{
Log
::
Fatal
(
"label_column is not a number, \
if you want to use a column name, \
please add the prefix
\"
name:
\"
to the column name"
);
}
Log
::
Info
(
"Using column number %d as label"
,
label_idx_
);
}
}
if
(
feature_names_
.
size
()
>
0
)
{
// erase label column name
feature_names_
.
erase
(
feature_names_
.
begin
()
+
label_idx_
);
}
// load ignore columns
if
(
io_config
.
ignore_column
.
size
()
>
0
)
{
if
(
Common
::
StartsWith
(
io_config
.
ignore_column
,
name_prefix
))
{
std
::
string
names
=
io_config
.
ignore_column
.
substr
(
name_prefix
.
size
());
for
(
auto
name
:
Common
::
Split
(
names
.
c_str
(),
','
))
{
if
(
name2idx
.
count
(
name
)
>
0
)
{
int
tmp
=
name2idx
[
name
];
// skip for label column
if
(
tmp
>
label_idx_
)
{
tmp
-=
1
;
}
ignore_features_
.
emplace
(
tmp
);
}
else
{
Log
::
Fatal
(
"Could not find ignore column %s in data file"
,
name
.
c_str
());
}
}
}
else
{
for
(
auto
token
:
Common
::
Split
(
io_config
.
ignore_column
.
c_str
(),
','
))
{
int
tmp
=
0
;
if
(
!
Common
::
AtoiAndCheck
(
token
.
c_str
(),
&
tmp
))
{
Log
::
Fatal
(
"ignore_column is not a number, \
if you want to use a column name, \
please add the prefix
\"
name:
\"
to the column name"
);
}
// skip for label column
if
(
tmp
>
label_idx_
)
{
tmp
-=
1
;
}
ignore_features_
.
emplace
(
tmp
);
}
}
}
// load weight idx
if
(
io_config
.
weight_column
.
size
()
>
0
)
{
if
(
Common
::
StartsWith
(
io_config
.
weight_column
,
name_prefix
))
{
std
::
string
name
=
io_config
.
weight_column
.
substr
(
name_prefix
.
size
());
if
(
name2idx
.
count
(
name
)
>
0
)
{
weight_idx_
=
name2idx
[
name
];
Log
::
Info
(
"Using column %s as weight"
,
name
.
c_str
());
}
else
{
Log
::
Fatal
(
"Could not find weight column %s in data file"
,
name
.
c_str
());
}
}
else
{
if
(
!
Common
::
AtoiAndCheck
(
io_config
.
weight_column
.
c_str
(),
&
weight_idx_
))
{
Log
::
Fatal
(
"weight_column is not a number, \
if you want to use a column name, \
please add the prefix
\"
name:
\"
to the column name"
);
}
Log
::
Info
(
"Using column number %d as weight"
,
weight_idx_
);
}
// skip for label column
if
(
weight_idx_
>
label_idx_
)
{
weight_idx_
-=
1
;
}
ignore_features_
.
emplace
(
weight_idx_
);
}
if
(
io_config
.
group_column
.
size
()
>
0
)
{
if
(
Common
::
StartsWith
(
io_config
.
group_column
,
name_prefix
))
{
std
::
string
name
=
io_config
.
group_column
.
substr
(
name_prefix
.
size
());
if
(
name2idx
.
count
(
name
)
>
0
)
{
group_idx_
=
name2idx
[
name
];
Log
::
Info
(
"Using column %s as group/query id"
,
name
.
c_str
());
}
else
{
Log
::
Fatal
(
"Could not find group/query column %s in data file"
,
name
.
c_str
());
}
}
else
{
if
(
!
Common
::
AtoiAndCheck
(
io_config
.
group_column
.
c_str
(),
&
group_idx_
))
{
Log
::
Fatal
(
"group_column is not a number, \
if you want to use a column name, \
please add the prefix
\"
name:
\"
to the column name"
);
}
Log
::
Info
(
"Using column number %d as group/query id"
,
group_idx_
);
}
// skip for label column
if
(
group_idx_
>
label_idx_
)
{
group_idx_
-=
1
;
}
ignore_features_
.
emplace
(
group_idx_
);
}
// create text parser
parser_
=
Parser
::
CreateParser
(
data_filename_
,
io_config
.
has_header
,
0
,
label_idx_
);
if
(
parser_
==
nullptr
)
{
Log
::
Fatal
(
"Could not recognize data format of %s"
,
data_filename_
);
}
}
else
{
// only need to load initialize score, other meta data will be loaded from binary file
metadata_
.
Init
(
init_score_filename
,
num_class_
);
Log
::
Info
(
"Loading data set from binary file"
);
parser_
=
nullptr
;
text_reader_
=
nullptr
;
}
Dataset
::
Dataset
()
{
num_class_
=
1
;
num_data_
=
0
;
is_loading_from_binfile_
=
false
;
}
Dataset
::
Dataset
(
const
IOConfig
&
io_config
,
const
PredictFunction
&
predict_fun
)
:
data_filename_
(
""
),
random_
(
io_config
.
data_random_seed
),
max_bin_
(
io_config
.
max_bin
),
is_enable_sparse_
(
io_config
.
is_enable_sparse
),
predict_fun_
(
predict_fun
),
bin_construct_sample_cnt_
(
io_config
.
bin_construct_sample_cnt
)
{
num_class_
=
io_config
.
num_class
;
parser_
=
nullptr
;
text_reader_
=
nullptr
;
}
Dataset
::~
Dataset
()
{
if
(
parser_
!=
nullptr
)
{
delete
parser_
;
}
if
(
text_reader_
!=
nullptr
)
{
delete
text_reader_
;
}
for
(
auto
&
feature
:
features_
)
{
delete
feature
;
}
features_
.
clear
();
}
void
Dataset
::
LoadDataToMemory
(
int
rank
,
int
num_machines
,
bool
is_pre_partition
)
{
used_data_indices_
.
clear
();
if
(
num_machines
==
1
||
is_pre_partition
)
{
// read all lines
num_data_
=
text_reader_
->
ReadAllLines
();
global_num_data_
=
num_data_
;
}
else
{
// need partition data
// get query data
const
data_size_t
*
query_boundaries
=
metadata_
.
query_boundaries
();
if
(
query_boundaries
==
nullptr
)
{
// if not contain query data, minimal sample unit is one record
global_num_data_
=
text_reader_
->
ReadAndFilterLines
([
this
,
rank
,
num_machines
](
data_size_t
)
{
if
(
random_
.
NextInt
(
0
,
num_machines
)
==
rank
)
{
return
true
;
}
else
{
return
false
;
}
},
&
used_data_indices_
);
}
else
{
// if contain query data, minimal sample unit is one query
data_size_t
num_queries
=
metadata_
.
num_queries
();
data_size_t
qid
=
-
1
;
bool
is_query_used
=
false
;
global_num_data_
=
text_reader_
->
ReadAndFilterLines
(
[
this
,
rank
,
num_machines
,
&
qid
,
&
query_boundaries
,
&
is_query_used
,
num_queries
]
(
data_size_t
line_idx
)
{
if
(
qid
>=
num_queries
)
{
Log
::
Fatal
(
"Current query exceeds the range of the query file, please ensure the query file is correct"
);
}
if
(
line_idx
>=
query_boundaries
[
qid
+
1
])
{
// if is new query
is_query_used
=
false
;
if
(
random_
.
NextInt
(
0
,
num_machines
)
==
rank
)
{
is_query_used
=
true
;
}
++
qid
;
}
return
is_query_used
;
},
&
used_data_indices_
);
}
// set number of data
num_data_
=
static_cast
<
data_size_t
>
(
used_data_indices_
.
size
());
}
}
void
Dataset
::
SampleDataFromMemory
(
std
::
vector
<
std
::
string
>*
out_data
)
{
const
size_t
sample_cnt
=
static_cast
<
size_t
>
(
num_data_
<
bin_construct_sample_cnt_
?
num_data_
:
bin_construct_sample_cnt_
);
std
::
vector
<
size_t
>
sample_indices
=
random_
.
Sample
(
num_data_
,
sample_cnt
);
out_data
->
clear
();
for
(
size_t
i
=
0
;
i
<
sample_indices
.
size
();
++
i
)
{
const
size_t
idx
=
sample_indices
[
i
];
out_data
->
push_back
(
text_reader_
->
Lines
()[
idx
]);
}
}
void
Dataset
::
SampleDataFromFile
(
int
rank
,
int
num_machines
,
bool
is_pre_partition
,
std
::
vector
<
std
::
string
>*
out_data
)
{
used_data_indices_
.
clear
();
const
data_size_t
sample_cnt
=
static_cast
<
data_size_t
>
(
bin_construct_sample_cnt_
);
if
(
num_machines
==
1
||
is_pre_partition
)
{
num_data_
=
static_cast
<
data_size_t
>
(
text_reader_
->
SampleFromFile
(
random_
,
sample_cnt
,
out_data
));
global_num_data_
=
num_data_
;
}
else
{
// need partition data
// get query data
const
data_size_t
*
query_boundaries
=
metadata_
.
query_boundaries
();
if
(
query_boundaries
==
nullptr
)
{
// if not contain query file, minimal sample unit is one record
global_num_data_
=
text_reader_
->
SampleAndFilterFromFile
([
this
,
rank
,
num_machines
]
(
data_size_t
)
{
if
(
random_
.
NextInt
(
0
,
num_machines
)
==
rank
)
{
return
true
;
}
else
{
return
false
;
}
},
&
used_data_indices_
,
random_
,
sample_cnt
,
out_data
);
}
else
{
// if contain query file, minimal sample unit is one query
data_size_t
num_queries
=
metadata_
.
num_queries
();
data_size_t
qid
=
-
1
;
bool
is_query_used
=
false
;
global_num_data_
=
text_reader_
->
SampleAndFilterFromFile
(
[
this
,
rank
,
num_machines
,
&
qid
,
&
query_boundaries
,
&
is_query_used
,
num_queries
]
(
data_size_t
line_idx
)
{
if
(
qid
>=
num_queries
)
{
Log
::
Fatal
(
"Query id exceeds the range of the query file, \
please ensure the query file is correct"
);
}
if
(
line_idx
>=
query_boundaries
[
qid
+
1
])
{
// if is new query
is_query_used
=
false
;
if
(
random_
.
NextInt
(
0
,
num_machines
)
==
rank
)
{
is_query_used
=
true
;
}
++
qid
;
}
return
is_query_used
;
},
&
used_data_indices_
,
random_
,
sample_cnt
,
out_data
);
}
num_data_
=
static_cast
<
data_size_t
>
(
used_data_indices_
.
size
());
}
}
void
Dataset
::
InitByBinMapper
(
std
::
vector
<
const
BinMapper
*>
bin_mappers
,
data_size_t
num_data
)
{
num_data_
=
num_data
;
global_num_data_
=
num_data_
;
// initialize label
metadata_
.
Init
(
num_data_
,
num_class_
,
-
1
,
-
1
);
// free old memory
for
(
auto
&
feature
:
features_
)
{
delete
feature
;
}
features_
.
clear
();
used_feature_map_
=
std
::
vector
<
int
>
(
bin_mappers
.
size
(),
-
1
);
for
(
size_t
i
=
0
;
i
<
bin_mappers
.
size
();
++
i
)
{
if
(
bin_mappers
[
i
]
!=
nullptr
)
{
features_
.
push_back
(
new
Feature
(
static_cast
<
int
>
(
i
),
new
BinMapper
(
bin_mappers
[
i
]),
num_data_
,
is_enable_sparse_
));
used_feature_map_
[
i
]
=
static_cast
<
int
>
(
features_
.
size
());
}
void
Dataset
::
CopyFeatureMetadataTo
(
Dataset
*
dataset
,
bool
is_enable_sparse
)
const
{
dataset
->
features_
.
clear
();
// copy feature bin mapper data
for
(
Feature
*
feature
:
features_
)
{
dataset
->
features_
.
push_back
(
new
Feature
(
feature
->
feature_index
(),
new
BinMapper
(
*
feature
->
bin_mapper
()),
dataset
->
num_data_
,
is_enable_sparse
));
}
num_features_
=
static_cast
<
int
>
(
features_
.
size
());
dataset
->
used_feature_map_
=
used_feature_map_
;
dataset
->
num_features_
=
static_cast
<
int
>
(
dataset
->
features_
.
size
());
dataset
->
num_total_features_
=
num_total_features_
;
dataset
->
feature_names_
=
feature_names_
;
}
std
::
vector
<
const
BinMapper
*>
Dataset
::
GetBinMappers
()
const
{
...
...
@@ -314,27 +51,6 @@ std::vector<const BinMapper*> Dataset::GetBinMappers() const {
return
ret
;
}
void
Dataset
::
PushData
(
const
std
::
vector
<
std
::
vector
<
std
::
pair
<
int
,
float
>>>&
datas
,
data_size_t
start_idx
,
bool
is_finished
)
{
// if doesn't need to prediction with initial model
#pragma omp parallel for schedule(guided)
for
(
data_size_t
i
=
0
;
i
<
static_cast
<
int
>
(
datas
.
size
());
++
i
)
{
const
int
tid
=
omp_get_thread_num
();
for
(
auto
&
inner_data
:
datas
[
i
])
{
int
feature_idx
=
used_feature_map_
[
inner_data
.
first
];
if
(
feature_idx
>=
0
)
{
// if is used feature
features_
[
feature_idx
]
->
PushData
(
tid
,
start_idx
+
i
,
inner_data
.
second
);
}
}
}
if
(
is_finished
)
{
#pragma omp parallel for schedule(guided)
for
(
int
i
=
0
;
i
<
num_features_
;
++
i
)
{
features_
[
i
]
->
FinishLoad
();
}
}
}
void
Dataset
::
SetField
(
const
char
*
field_name
,
const
void
*
field_data
,
data_size_t
num_element
,
int
type
)
{
std
::
string
name
(
field_name
);
name
=
Common
::
Trim
(
name
);
...
...
@@ -343,7 +59,7 @@ void Dataset::SetField(const char* field_name, const void* field_data, data_size
Log
::
Fatal
(
"type of label should be float"
);
}
metadata_
.
SetLabel
(
static_cast
<
const
float
*>
(
field_data
),
num_element
);
}
else
if
(
name
==
std
::
string
(
"weight"
)
||
name
==
std
::
string
(
"weights"
))
{
}
else
if
(
name
==
std
::
string
(
"weight"
)
||
name
==
std
::
string
(
"weights"
))
{
if
(
type
!=
0
)
{
Log
::
Fatal
(
"type of weights should be float"
);
}
...
...
@@ -363,396 +79,8 @@ void Dataset::SetField(const char* field_name, const void* field_data, data_size
}
}
void
Dataset
::
ConstructBinMappers
(
int
rank
,
int
num_machines
,
const
std
::
vector
<
std
::
string
>&
sample_data
)
{
// sample_values[i][j], means the value of j-th sample on i-th feature
std
::
vector
<
std
::
vector
<
double
>>
sample_values
;
// temp buffer for one line features and label
std
::
vector
<
std
::
pair
<
int
,
double
>>
oneline_features
;
double
label
;
for
(
size_t
i
=
0
;
i
<
sample_data
.
size
();
++
i
)
{
oneline_features
.
clear
();
// parse features
parser_
->
ParseOneLine
(
sample_data
[
i
].
c_str
(),
&
oneline_features
,
&
label
);
// push 0 first, then edit the value according existing feature values
for
(
auto
&
feature_values
:
sample_values
)
{
feature_values
.
push_back
(
0.0
);
}
for
(
std
::
pair
<
int
,
double
>&
inner_data
:
oneline_features
)
{
if
(
static_cast
<
size_t
>
(
inner_data
.
first
)
>=
sample_values
.
size
())
{
// if need expand feature set
size_t
need_size
=
inner_data
.
first
-
sample_values
.
size
()
+
1
;
for
(
size_t
j
=
0
;
j
<
need_size
;
++
j
)
{
// push i+1 0
sample_values
.
emplace_back
(
i
+
1
,
0.0
f
);
}
}
// edit the feature value
sample_values
[
inner_data
.
first
][
i
]
=
inner_data
.
second
;
}
}
features_
.
clear
();
// -1 means doesn't use this feature
used_feature_map_
=
std
::
vector
<
int
>
(
sample_values
.
size
(),
-
1
);
num_total_features_
=
static_cast
<
int
>
(
sample_values
.
size
());
// check the range of label_idx, weight_idx and group_idx
CHECK
(
label_idx_
>=
0
&&
label_idx_
<=
num_total_features_
);
CHECK
(
weight_idx_
<
0
||
weight_idx_
<
num_total_features_
);
CHECK
(
group_idx_
<
0
||
group_idx_
<
num_total_features_
);
// fill feature_names_ if not header
if
(
feature_names_
.
size
()
<=
0
)
{
for
(
int
i
=
0
;
i
<
num_total_features_
;
++
i
)
{
std
::
stringstream
str_buf
;
str_buf
<<
"Column_"
<<
i
;
feature_names_
.
push_back
(
str_buf
.
str
());
}
}
// start find bins
if
(
num_machines
==
1
)
{
std
::
vector
<
BinMapper
*>
bin_mappers
(
sample_values
.
size
());
// if only one machine, find bin locally
#pragma omp parallel for schedule(guided)
for
(
int
i
=
0
;
i
<
static_cast
<
int
>
(
sample_values
.
size
());
++
i
)
{
if
(
ignore_features_
.
count
(
i
)
>
0
)
{
bin_mappers
[
i
]
=
nullptr
;
continue
;
}
bin_mappers
[
i
]
=
new
BinMapper
();
bin_mappers
[
i
]
->
FindBin
(
&
sample_values
[
i
],
max_bin_
);
}
for
(
size_t
i
=
0
;
i
<
sample_values
.
size
();
++
i
)
{
if
(
bin_mappers
[
i
]
==
nullptr
)
{
Log
::
Warning
(
"Ignoring feature %s"
,
feature_names_
[
i
].
c_str
());
}
else
if
(
!
bin_mappers
[
i
]
->
is_trival
())
{
// map real feature index to used feature index
used_feature_map_
[
i
]
=
static_cast
<
int
>
(
features_
.
size
());
// push new feature
features_
.
push_back
(
new
Feature
(
static_cast
<
int
>
(
i
),
bin_mappers
[
i
],
num_data_
,
is_enable_sparse_
));
}
else
{
// if feature is trival(only 1 bin), free spaces
Log
::
Warning
(
"Ignoring feature %s, only has one value"
,
feature_names_
[
i
].
c_str
());
delete
bin_mappers
[
i
];
}
}
}
else
{
// if have multi-machines, need find bin distributed
// different machines will find bin for different features
// start and len will store the process feature indices for different machines
// machine i will find bins for features in [ strat[i], start[i] + len[i] )
int
*
start
=
new
int
[
num_machines
];
int
*
len
=
new
int
[
num_machines
];
int
total_num_feature
=
static_cast
<
int
>
(
sample_values
.
size
());
int
step
=
(
total_num_feature
+
num_machines
-
1
)
/
num_machines
;
if
(
step
<
1
)
{
step
=
1
;
}
start
[
0
]
=
0
;
for
(
int
i
=
0
;
i
<
num_machines
-
1
;
++
i
)
{
len
[
i
]
=
Common
::
Min
<
int
>
(
step
,
total_num_feature
-
start
[
i
]);
start
[
i
+
1
]
=
start
[
i
]
+
len
[
i
];
}
len
[
num_machines
-
1
]
=
total_num_feature
-
start
[
num_machines
-
1
];
// get size of bin mapper with max_bin_ size
int
type_size
=
BinMapper
::
SizeForSpecificBin
(
max_bin_
);
// since sizes of different feature may not be same, we expand all bin mapper to type_size
int
buffer_size
=
type_size
*
total_num_feature
;
char
*
input_buffer
=
new
char
[
buffer_size
];
char
*
output_buffer
=
new
char
[
buffer_size
];
// find local feature bins and copy to buffer
#pragma omp parallel for schedule(guided)
for
(
int
i
=
0
;
i
<
len
[
rank
];
++
i
)
{
BinMapper
*
bin_mapper
=
new
BinMapper
();
bin_mapper
->
FindBin
(
&
sample_values
[
start
[
rank
]
+
i
],
max_bin_
);
bin_mapper
->
CopyTo
(
input_buffer
+
i
*
type_size
);
// don't need this any more
delete
bin_mapper
;
}
// convert to binary size
for
(
int
i
=
0
;
i
<
num_machines
;
++
i
)
{
start
[
i
]
*=
type_size
;
len
[
i
]
*=
type_size
;
}
// gather global feature bin mappers
Network
::
Allgather
(
input_buffer
,
buffer_size
,
start
,
len
,
output_buffer
);
// restore features bins from buffer
for
(
int
i
=
0
;
i
<
total_num_feature
;
++
i
)
{
if
(
ignore_features_
.
count
(
i
)
>
0
)
{
Log
::
Warning
(
"Ignoring feature %s"
,
feature_names_
[
i
].
c_str
());
continue
;
}
BinMapper
*
bin_mapper
=
new
BinMapper
();
bin_mapper
->
CopyFrom
(
output_buffer
+
i
*
type_size
);
if
(
!
bin_mapper
->
is_trival
())
{
used_feature_map_
[
i
]
=
static_cast
<
int
>
(
features_
.
size
());
features_
.
push_back
(
new
Feature
(
static_cast
<
int
>
(
i
),
bin_mapper
,
num_data_
,
is_enable_sparse_
));
}
else
{
Log
::
Warning
(
"Ignoring feature %s, only has one value"
,
feature_names_
[
i
].
c_str
());
delete
bin_mapper
;
}
}
// free buffer
delete
[]
start
;
delete
[]
len
;
delete
[]
input_buffer
;
delete
[]
output_buffer
;
}
num_features_
=
static_cast
<
int
>
(
features_
.
size
());
}
/*!
* \brief Loads the training data for this Dataset, either from the original text
*        file (one- or two-round loading) or from a previously saved ".bin" file.
*        Populates features_, metadata_ and the per-machine data partition.
* \param rank Rank of this machine (used for data partitioning in parallel training)
* \param num_machines Total number of machines in parallel training
* \param is_pre_partition True if the data file is already partitioned per machine
* \param use_two_round_loading True to sample/parse from file instead of loading all text into memory
*/
void Dataset::LoadTrainData(int rank, int num_machines, bool is_pre_partition, bool use_two_round_loading) {
  // don't support query id in data file when training in parallel
  if (num_machines > 1 && !is_pre_partition) {
    if (group_idx_ > 0) {
      Log::Fatal("Using a query id without pre-partitioning the data file is not supported for parallel training. \
Please use an additional query file or pre-partition the data");
    }
  }
  used_data_indices_.clear();
  if (!is_loading_from_binfile_) {
    if (!use_two_round_loading) {
      // one-round loading: read the whole text file to memory first
      LoadDataToMemory(rank, num_machines, is_pre_partition);
      std::vector<std::string> sample_data;
      // sample data (subset of lines used to find bin boundaries)
      SampleDataFromMemory(&sample_data);
      // construct feature bin mappers
      ConstructBinMappers(rank, num_machines, sample_data);
      // initialize label
      metadata_.Init(num_data_, num_class_, weight_idx_, group_idx_);
      // extract features
      ExtractFeaturesFromMemory();
    } else {
      std::vector<std::string> sample_data;
      // two-round loading: first pass samples lines directly from the file
      SampleDataFromFile(rank, num_machines, is_pre_partition, &sample_data);
      // construct feature bin mappers
      ConstructBinMappers(rank, num_machines, sample_data);
      // initialize label
      metadata_.Init(num_data_, num_class_, weight_idx_, group_idx_);
      // extract features (second pass over the file)
      ExtractFeaturesFromFile();
    }
  } else {
    // binary cache exists: "<data_filename_>.bin"
    std::string bin_filename(data_filename_);
    bin_filename.append(".bin");
    // load data from binary file
    LoadDataFromBinFile(bin_filename.c_str(), rank, num_machines, is_pre_partition);
  }
  // check meta data (and partition weights/queries to this machine's subset)
  metadata_.CheckOrPartition(static_cast<data_size_t>(global_num_data_), used_data_indices_);
  // free memory
  used_data_indices_.clear();
  used_data_indices_.shrink_to_fit();
  // need to check training data
  CheckDataset();
}
/*!
* \brief Loads validation data, reusing the bin mappers / feature mapping of the
*        training set so validation features are binned identically.
* \param train_set Training Dataset whose BinMappers and feature layout are copied
* \param use_two_round_loading True to stream/parse from file instead of loading all text into memory
*/
void Dataset::LoadValidationData(const Dataset* train_set, bool use_two_round_loading) {
  used_data_indices_.clear();
  if (!is_loading_from_binfile_) {
    if (!use_two_round_loading) {
      // read data in memory; validation is never partitioned (rank 0 of 1)
      LoadDataToMemory(0, 1, false);
      // initialize label
      metadata_.Init(num_data_, num_class_, weight_idx_, group_idx_);
      features_.clear();
      // copy feature bin mapper data from the training set (deep copy of BinMapper)
      for (Feature* feature : train_set->features_) {
        features_.push_back(new Feature(feature->feature_index(), new BinMapper(*feature->bin_mapper()), num_data_, is_enable_sparse_));
      }
      used_feature_map_ = train_set->used_feature_map_;
      num_features_ = static_cast<int>(features_.size());
      num_total_features_ = train_set->num_total_features_;
      feature_names_ = train_set->feature_names_;
      // extract features
      ExtractFeaturesFromMemory();
    } else {
      // Get number of lines of data file
      num_data_ = static_cast<data_size_t>(text_reader_->CountLine());
      // initialize label
      metadata_.Init(num_data_, num_class_, weight_idx_, group_idx_);
      features_.clear();
      // copy feature bin mapper data from the training set (deep copy of BinMapper)
      for (Feature* feature : train_set->features_) {
        features_.push_back(new Feature(feature->feature_index(), new BinMapper(*feature->bin_mapper()), num_data_, is_enable_sparse_));
      }
      used_feature_map_ = train_set->used_feature_map_;
      num_features_ = static_cast<int>(features_.size());
      num_total_features_ = train_set->num_total_features_;
      feature_names_ = train_set->feature_names_;
      // extract features
      ExtractFeaturesFromFile();
    }
  } else {
    // binary cache exists: "<data_filename_>.bin"
    std::string bin_filename(data_filename_);
    bin_filename.append(".bin");
    // load from binary file
    LoadDataFromBinFile(bin_filename.c_str(), 0, 1, false);
  }
  // not need to check validation data
  // check meta data
  metadata_.CheckOrPartition(static_cast<data_size_t>(global_num_data_), used_data_indices_);
  // CheckDataset();
}
/*!
* \brief Parses the in-memory text lines, pushing feature values into the Feature
*        bins and setting label / weight / query metadata per record. If a
*        prediction function is set, also computes per-record initial scores.
*        Text lines are freed as they are consumed.
*/
void Dataset::ExtractFeaturesFromMemory() {
  std::vector<std::pair<int, double>> oneline_features;
  double tmp_label = 0.0f;
  if (predict_fun_ == nullptr) {
    // if doesn't need to prediction with initial model
    // NOTE: oneline_features/tmp_label are per-thread copies via private/firstprivate
    #pragma omp parallel for schedule(guided) private(oneline_features) firstprivate(tmp_label)
    for (data_size_t i = 0; i < num_data_; ++i) {
      const int tid = omp_get_thread_num();
      oneline_features.clear();
      // parser
      parser_->ParseOneLine(text_reader_->Lines()[i].c_str(), &oneline_features, &tmp_label);
      // set label
      metadata_.SetLabelAt(i, static_cast<float>(tmp_label));
      // free processed line:
      text_reader_->Lines()[i].clear();
      // shrink_to_fit will be very slow in linux, and seems not free memory, disable for now
      // text_reader_->Lines()[i].shrink_to_fit();
      // push data
      for (auto& inner_data : oneline_features) {
        // map raw column index to used feature index; negative means unused column
        int feature_idx = used_feature_map_[inner_data.first];
        if (feature_idx >= 0) {
          // if is used feature
          features_[feature_idx]->PushData(tid, i, inner_data.second);
        } else {
          if (inner_data.first == weight_idx_) {
            metadata_.SetWeightAt(i, static_cast<float>(inner_data.second));
          } else if (inner_data.first == group_idx_) {
            metadata_.SetQueryAt(i, static_cast<data_size_t>(inner_data.second));
          }
        }
      }
    }
  } else {
    // if need to prediction with initial model
    // init_score laid out class-major: [class][record]
    float* init_score = new float[num_data_ * num_class_];
    #pragma omp parallel for schedule(guided) private(oneline_features) firstprivate(tmp_label)
    for (data_size_t i = 0; i < num_data_; ++i) {
      const int tid = omp_get_thread_num();
      oneline_features.clear();
      // parser
      parser_->ParseOneLine(text_reader_->Lines()[i].c_str(), &oneline_features, &tmp_label);
      // set initial score
      std::vector<double> oneline_init_score = predict_fun_(oneline_features);
      for (int k = 0; k < num_class_; ++k) {
        init_score[k * num_data_ + i] = static_cast<float>(oneline_init_score[k]);
      }
      // set label
      metadata_.SetLabelAt(i, static_cast<float>(tmp_label));
      // free processed line:
      text_reader_->Lines()[i].clear();
      // shrink_to_fit will be very slow in linux, and seems not free memory, disable for now
      // text_reader_->Lines()[i].shrink_to_fit();
      // push data
      for (auto& inner_data : oneline_features) {
        int feature_idx = used_feature_map_[inner_data.first];
        if (feature_idx >= 0) {
          // if is used feature
          features_[feature_idx]->PushData(tid, i, inner_data.second);
        } else {
          if (inner_data.first == weight_idx_) {
            metadata_.SetWeightAt(i, static_cast<float>(inner_data.second));
          } else if (inner_data.first == group_idx_) {
            metadata_.SetQueryAt(i, static_cast<data_size_t>(inner_data.second));
          }
        }
      }
    }
    // NOTE(review): SetInitScore presumably copies the buffer, since we delete it
    // right after — confirm it does not retain the pointer (double free otherwise).
    metadata_.SetInitScore(init_score, num_data_ * num_class_);
    delete[] init_score;
  }
  #pragma omp parallel for schedule(guided)
  for (int i = 0; i < num_features_; ++i) {
    features_[i]->FinishLoad();
  }
  // text data can be free after loaded feature values
  text_reader_->Clear();
}
/*!
* \brief Second-pass feature extraction for two-round loading: streams the data
*        file in chunks through text_reader_ and, for each chunk, parses lines in
*        parallel to fill Feature bins and metadata. If a prediction function is
*        set, also fills the initial-score buffer.
*/
void Dataset::ExtractFeaturesFromFile() {
  float* init_score = nullptr;
  if (predict_fun_ != nullptr) {
    // class-major layout: [class][record]
    init_score = new float[num_data_ * num_class_];
  }
  // callback applied to each chunk of lines; start_idx is the global index of lines[0]
  std::function<void(data_size_t, const std::vector<std::string>&)> process_fun =
    [this, &init_score]
    (data_size_t start_idx, const std::vector<std::string>& lines) {
    std::vector<std::pair<int, double>> oneline_features;
    double tmp_label = 0.0f;
    #pragma omp parallel for schedule(static) private(oneline_features) firstprivate(tmp_label)
    for (data_size_t i = 0; i < static_cast<data_size_t>(lines.size()); ++i) {
      const int tid = omp_get_thread_num();
      oneline_features.clear();
      // parser
      parser_->ParseOneLine(lines[i].c_str(), &oneline_features, &tmp_label);
      // set initial score
      if (init_score != nullptr) {
        std::vector<double> oneline_init_score = predict_fun_(oneline_features);
        for (int k = 0; k < num_class_; ++k) {
          init_score[k * num_data_ + start_idx + i] = static_cast<float>(oneline_init_score[k]);
        }
      }
      // set label
      metadata_.SetLabelAt(start_idx + i, static_cast<float>(tmp_label));
      // push data
      for (auto& inner_data : oneline_features) {
        // map raw column index to used feature index; negative means unused column
        int feature_idx = used_feature_map_[inner_data.first];
        if (feature_idx >= 0) {
          // if is used feature
          features_[feature_idx]->PushData(tid, start_idx + i, inner_data.second);
        } else {
          if (inner_data.first == weight_idx_) {
            metadata_.SetWeightAt(start_idx + i, static_cast<float>(inner_data.second));
          } else if (inner_data.first == group_idx_) {
            metadata_.SetQueryAt(start_idx + i, static_cast<data_size_t>(inner_data.second));
          }
        }
      }
    }
  };
  if (used_data_indices_.size() > 0) {
    // only need part of data (this machine's partition)
    text_reader_->ReadPartAndProcessParallel(used_data_indices_, process_fun);
  } else {
    // need full data
    text_reader_->ReadAllAndProcessParallel(process_fun);
  }
  // NOTE(review): SetInitScore presumably copies the buffer, since we delete it
  // right after — confirm it does not retain the pointer.
  if (init_score != nullptr) {
    metadata_.SetInitScore(init_score, num_data_ * num_class_);
    delete[] init_score;
  }
  #pragma omp parallel for schedule(guided)
  for (int i = 0; i < num_features_; ++i) {
    features_[i]->FinishLoad();
  }
}
void
Dataset
::
SaveBinaryFile
(
const
char
*
bin_filename
)
{
if
(
!
is_loading_from_binfile_
)
{
// if not pass a filename, just append ".bin" of original file
if
(
bin_filename
==
nullptr
||
bin_filename
[
0
]
==
'\0'
)
{
...
...
@@ -761,11 +89,11 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
bin_filename
=
bin_filename_str
.
c_str
();
}
FILE
*
file
;
#ifdef _MSC_VER
#ifdef _MSC_VER
fopen_s
(
&
file
,
bin_filename
,
"wb"
);
#else
#else
file
=
fopen
(
bin_filename
,
"wb"
);
#endif
#endif
if
(
file
==
NULL
)
{
Log
::
Fatal
(
"Cannot write binary data to %s "
,
bin_filename
);
}
...
...
@@ -773,17 +101,14 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
Log
::
Info
(
"Saving data to binary file %s"
,
data_filename_
);
// get size of header
size_t
size_of_header
=
sizeof
(
global_
num_data_
)
+
sizeof
(
is_enable_sparse
_
)
+
sizeof
(
max_bin_
)
+
sizeof
(
num_data_
)
+
sizeof
(
num_features_
)
+
sizeof
(
num_total_features_
)
+
sizeof
(
size_t
)
+
sizeof
(
int
)
*
used_feature_map_
.
size
();
size_t
size_of_header
=
sizeof
(
num_data_
)
+
sizeof
(
num_features_
)
+
sizeof
(
num_total_features
_
)
+
sizeof
(
size_t
)
+
sizeof
(
int
)
*
used_feature_map_
.
size
();
// size of feature names
for
(
int
i
=
0
;
i
<
num_total_features_
;
++
i
)
{
size_of_header
+=
feature_names_
[
i
].
size
()
+
sizeof
(
int
);
}
fwrite
(
&
size_of_header
,
sizeof
(
size_of_header
),
1
,
file
);
// write header
fwrite
(
&
global_num_data_
,
sizeof
(
global_num_data_
),
1
,
file
);
fwrite
(
&
is_enable_sparse_
,
sizeof
(
is_enable_sparse_
),
1
,
file
);
fwrite
(
&
max_bin_
,
sizeof
(
max_bin_
),
1
,
file
);
fwrite
(
&
num_data_
,
sizeof
(
num_data_
),
1
,
file
);
fwrite
(
&
num_features_
,
sizeof
(
num_features_
),
1
,
file
);
fwrite
(
&
num_total_features_
,
sizeof
(
num_features_
),
1
,
file
);
...
...
@@ -817,196 +142,4 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
}
}
/*!
* \brief Probes for a binary cache of the data file ("<data_filename_>.bin")
*        and records the result in is_loading_from_binfile_. The file is only
*        opened to test existence/readability, then closed immediately.
*/
void Dataset::CheckCanLoadFromBin() {
  std::string bin_filename(data_filename_);
  bin_filename.append(".bin");
  FILE* file;
#ifdef _MSC_VER
  fopen_s(&file, bin_filename.c_str(), "rb");
#else
  file = fopen(bin_filename.c_str(), "rb");
#endif
  // nullptr (not NULL) for consistency with the rest of the file
  if (file == nullptr) {
    is_loading_from_binfile_ = false;
  } else {
    is_loading_from_binfile_ = true;
    fclose(file);
  }
}
/*!
* \brief Restores a Dataset from a binary cache file previously written by
*        SaveBinaryFile. Layout read here must mirror the write order:
*        header size, header (globals + feature map + feature names),
*        metadata size, metadata, then per-feature [size, payload] records.
*        Also performs per-machine row sampling when running distributed
*        without pre-partitioned data.
* \param bin_filename Path of the ".bin" cache file
* \param rank This machine's rank (for random partitioning)
* \param num_machines Number of machines in parallel training
* \param is_pre_partition True if data was already partitioned per machine
*/
void Dataset::LoadDataFromBinFile(const char* bin_filename, int rank, int num_machines, bool is_pre_partition) {
  FILE* file;
#ifdef _MSC_VER
  fopen_s(&file, bin_filename, "rb");
#else
  file = fopen(bin_filename, "rb");
#endif
  if (file == NULL) {
    Log::Fatal("Cannot read binary data from %s", bin_filename);
  }
  // buffer to read binary file (grown on demand below)
  size_t buffer_size = 16 * 1024 * 1024;
  char* buffer = new char[buffer_size];
  // read size of header
  size_t read_cnt = fread(buffer, sizeof(size_t), 1, file);
  if (read_cnt != 1) {
    Log::Fatal("Binary file error: header has the wrong size");
  }
  size_t size_of_head = *(reinterpret_cast<size_t*>(buffer));
  // re-allocmate space if not enough
  if (size_of_head > buffer_size) {
    delete[] buffer;
    buffer_size = size_of_head;
    buffer = new char[buffer_size];
  }
  // read header
  read_cnt = fread(buffer, 1, size_of_head, file);
  if (read_cnt != size_of_head) {
    Log::Fatal("Binary file error: header is incorrect");
  }
  // get header: fields are unpacked in the exact order they were written
  const char* mem_ptr = buffer;
  global_num_data_ = *(reinterpret_cast<const size_t*>(mem_ptr));
  mem_ptr += sizeof(global_num_data_);
  is_enable_sparse_ = *(reinterpret_cast<const bool*>(mem_ptr));
  mem_ptr += sizeof(is_enable_sparse_);
  max_bin_ = *(reinterpret_cast<const int*>(mem_ptr));
  mem_ptr += sizeof(max_bin_);
  num_data_ = *(reinterpret_cast<const data_size_t*>(mem_ptr));
  mem_ptr += sizeof(num_data_);
  num_features_ = *(reinterpret_cast<const int*>(mem_ptr));
  mem_ptr += sizeof(num_features_);
  num_total_features_ = *(reinterpret_cast<const int*>(mem_ptr));
  mem_ptr += sizeof(num_total_features_);
  size_t num_used_feature_map = *(reinterpret_cast<const size_t*>(mem_ptr));
  mem_ptr += sizeof(num_used_feature_map);
  const int* tmp_feature_map = reinterpret_cast<const int*>(mem_ptr);
  used_feature_map_.clear();
  for (size_t i = 0; i < num_used_feature_map; ++i) {
    used_feature_map_.push_back(tmp_feature_map[i]);
  }
  mem_ptr += sizeof(int) * num_used_feature_map;
  // get feature names
  feature_names_.clear();
  // each name is serialized as [int length][length chars]
  for (int i = 0; i < num_total_features_; ++i) {
    int str_len = *(reinterpret_cast<const int*>(mem_ptr));
    mem_ptr += sizeof(int);
    std::stringstream str_buf;
    for (int j = 0; j < str_len; ++j) {
      char tmp_char = *(reinterpret_cast<const char*>(mem_ptr));
      mem_ptr += sizeof(char);
      str_buf << tmp_char;
    }
    feature_names_.emplace_back(str_buf.str());
  }
  // read size of meta data
  read_cnt = fread(buffer, sizeof(size_t), 1, file);
  if (read_cnt != 1) {
    Log::Fatal("Binary file error: meta data has the wrong size");
  }
  size_t size_of_metadata = *(reinterpret_cast<size_t*>(buffer));
  // re-allocate space if not enough
  if (size_of_metadata > buffer_size) {
    delete[] buffer;
    buffer_size = size_of_metadata;
    buffer = new char[buffer_size];
  }
  // read meta data
  read_cnt = fread(buffer, 1, size_of_metadata, file);
  if (read_cnt != size_of_metadata) {
    Log::Fatal("Binary file error: meta data is incorrect");
  }
  // load meta data
  metadata_.LoadFromMemory(buffer);
  used_data_indices_.clear();
  // NOTE(review): global_num_data_ was already read from the header above and is
  // overwritten here with num_data_ — confirm this is intentional.
  global_num_data_ = num_data_;
  // sample local used data if need to partition
  if (num_machines > 1 && !is_pre_partition) {
    const data_size_t* query_boundaries = metadata_.query_boundaries();
    if (query_boundaries == nullptr) {
      // if not contain query file, minimal sample unit is one record
      for (data_size_t i = 0; i < num_data_; ++i) {
        if (random_.NextInt(0, num_machines) == rank) {
          used_data_indices_.push_back(i);
        }
      }
    } else {
      // if contain query file, minimal sample unit is one query
      data_size_t num_queries = metadata_.num_queries();
      data_size_t qid = -1;
      bool is_query_used = false;
      for (data_size_t i = 0; i < num_data_; ++i) {
        if (qid >= num_queries) {
          Log::Fatal("Current query exceeds the range of the query file, please ensure the query file is correct");
        }
        if (i >= query_boundaries[qid + 1]) {
          // if is new query: decide once per query whether this rank keeps it
          is_query_used = false;
          if (random_.NextInt(0, num_machines) == rank) {
            is_query_used = true;
          }
          ++qid;
        }
        if (is_query_used) {
          used_data_indices_.push_back(i);
        }
      }
    }
    num_data_ = static_cast<data_size_t>(used_data_indices_.size());
  }
  metadata_.PartitionLabel(used_data_indices_);
  // read feature data
  for (int i = 0; i < num_features_; ++i) {
    // read feature size
    read_cnt = fread(buffer, sizeof(size_t), 1, file);
    if (read_cnt != 1) {
      Log::Fatal("Binary file error: feature %d has the wrong size", i);
    }
    size_t size_of_feature = *(reinterpret_cast<size_t*>(buffer));
    // re-allocate space if not enough
    if (size_of_feature > buffer_size) {
      delete[] buffer;
      buffer_size = size_of_feature;
      buffer = new char[buffer_size];
    }
    read_cnt = fread(buffer, 1, size_of_feature, file);
    if (read_cnt != size_of_feature) {
      // NOTE(review): read_cnt is size_t but formatted with %d — consider %zu
      Log::Fatal("Binary file error: feature %d is incorrect, read count: %d", i, read_cnt);
    }
    features_.push_back(new Feature(buffer, static_cast<data_size_t>(global_num_data_), used_data_indices_));
  }
  delete[] buffer;
  fclose(file);
}
/*!
* \brief Validates the loaded training data: it must contain at least one record
*        and at least one usable (non-trivial) feature. Aborts via Log::Fatal
*        otherwise.
*/
void Dataset::CheckDataset() {
  if (num_data_ <= 0) {
    Log::Fatal("Data file %s is empty", data_filename_);
  }
  // empty() instead of "size() <= 0": size() is unsigned, so <= 0 is just == 0
  if (features_.empty()) {
    Log::Fatal("No usable features in data file %s", data_filename_);
  }
}
}
// namespace LightGBM
src/io/dataset_loader.cpp
0 → 100644
View file @
1c08e71e
#include <omp.h>
#include <LightGBM/utils/log.h>
#include <LightGBM/dataset_loader.h>
#include <LightGBM/feature.h>
#include <LightGBM/network.h>
namespace
LightGBM
{
/*!
* \brief Constructs a loader bound to an I/O configuration and an optional
*        prediction function (used to compute initial scores while loading).
* \param io_config I/O options (header, column roles, two-round loading, ...);
*                  stored by reference — must outlive this loader
* \param predict_fun Callback producing per-class initial scores for one parsed
*                    row; stored by reference — must outlive this loader
*/
DatasetLoader::DatasetLoader(const IOConfig& io_config, const PredictFunction& predict_fun)
  :io_config_(io_config), predict_fun_(predict_fun) {
}

/*! \brief Destructor; the loader owns no resources of its own. */
DatasetLoader::~DatasetLoader() {
}
/*!
* \brief Resolves label / weight / group / ignored column indices from the
*        configured column specifiers, which may be numeric indices or
*        "name:<column>" references into the file's header row. Also fills
*        feature_names_ from the header (label column removed). Weight and
*        group columns are added to ignore_features_ so they are not treated
*        as features.
* \param filename Data file whose header row (if any) supplies column names
* \note "SetHeadder" is a typo for "SetHeader" — renaming would require
*       updating all callers.
*/
void DatasetLoader::SetHeadder(const char* filename) {
  TextReader<data_size_t> text_reader(filename, io_config_.has_header);
  // column name -> raw column index (including the label column)
  std::unordered_map<std::string, int> name2idx;
  // get column names
  if (io_config_.has_header) {
    std::string first_line = text_reader.first_line();
    feature_names_ = Common::Split(first_line.c_str(), "\t,");
    for (size_t i = 0; i < feature_names_.size(); ++i) {
      name2idx[feature_names_[i]] = static_cast<int>(i);
    }
  }
  std::string name_prefix("name:");
  // load label idx
  if (io_config_.label_column.size() > 0) {
    if (Common::StartsWith(io_config_.label_column, name_prefix)) {
      // specifier is "name:<column name>"
      std::string name = io_config_.label_column.substr(name_prefix.size());
      if (name2idx.count(name) > 0) {
        label_idx_ = name2idx[name];
        Log::Info("Using column %s as label", name.c_str());
      } else {
        Log::Fatal("Could not find label column %s in data file", name.c_str());
      }
    } else {
      // specifier is a plain column number
      if (!Common::AtoiAndCheck(io_config_.label_column.c_str(), &label_idx_)) {
        Log::Fatal("label_column is not a number, \
if you want to use a column name, \
please add the prefix \"name:\" to the column name");
      }
      Log::Info("Using column number %d as label", label_idx_);
    }
  }
  if (feature_names_.size() > 0) {
    // erase label column name
    feature_names_.erase(feature_names_.begin() + label_idx_);
  }
  // load ignore columns
  // indices below are shifted down by one when they come after the label
  // column, since the label is removed from the feature index space
  if (io_config_.ignore_column.size() > 0) {
    if (Common::StartsWith(io_config_.ignore_column, name_prefix)) {
      std::string names = io_config_.ignore_column.substr(name_prefix.size());
      for (auto name : Common::Split(names.c_str(), ',')) {
        if (name2idx.count(name) > 0) {
          int tmp = name2idx[name];
          // skip for label column
          if (tmp > label_idx_) {
            tmp -= 1;
          }
          ignore_features_.emplace(tmp);
        } else {
          Log::Fatal("Could not find ignore column %s in data file", name.c_str());
        }
      }
    } else {
      for (auto token : Common::Split(io_config_.ignore_column.c_str(), ',')) {
        int tmp = 0;
        if (!Common::AtoiAndCheck(token.c_str(), &tmp)) {
          Log::Fatal("ignore_column is not a number, \
if you want to use a column name, \
please add the prefix \"name:\" to the column name");
        }
        // skip for label column
        if (tmp > label_idx_) {
          tmp -= 1;
        }
        ignore_features_.emplace(tmp);
      }
    }
  }
  // load weight idx
  if (io_config_.weight_column.size() > 0) {
    if (Common::StartsWith(io_config_.weight_column, name_prefix)) {
      std::string name = io_config_.weight_column.substr(name_prefix.size());
      if (name2idx.count(name) > 0) {
        weight_idx_ = name2idx[name];
        Log::Info("Using column %s as weight", name.c_str());
      } else {
        Log::Fatal("Could not find weight column %s in data file", name.c_str());
      }
    } else {
      if (!Common::AtoiAndCheck(io_config_.weight_column.c_str(), &weight_idx_)) {
        Log::Fatal("weight_column is not a number, \
if you want to use a column name, \
please add the prefix \"name:\" to the column name");
      }
      Log::Info("Using column number %d as weight", weight_idx_);
    }
    // skip for label column
    if (weight_idx_ > label_idx_) {
      weight_idx_ -= 1;
    }
    // weight column is metadata, not a feature
    ignore_features_.emplace(weight_idx_);
  }
  // load group/query idx
  if (io_config_.group_column.size() > 0) {
    if (Common::StartsWith(io_config_.group_column, name_prefix)) {
      std::string name = io_config_.group_column.substr(name_prefix.size());
      if (name2idx.count(name) > 0) {
        group_idx_ = name2idx[name];
        Log::Info("Using column %s as group/query id", name.c_str());
      } else {
        Log::Fatal("Could not find group/query column %s in data file", name.c_str());
      }
    } else {
      if (!Common::AtoiAndCheck(io_config_.group_column.c_str(), &group_idx_)) {
        Log::Fatal("group_column is not a number, \
if you want to use a column name, \
please add the prefix \"name:\" to the column name");
      }
      Log::Info("Using column number %d as group/query id", group_idx_);
    }
    // skip for label column
    if (group_idx_ > label_idx_) {
      group_idx_ -= 1;
    }
    // group column is metadata, not a feature
    ignore_features_.emplace(group_idx_);
  }
}
/*!
* \brief Loads a training Dataset from a data file, choosing between one-round
*        loading (all text in memory), two-round loading (sample + stream), or
*        a pre-built binary cache ("<filename>.bin") if one exists.
* \param filename Path of the data file
* \param rank This machine's rank (for data partitioning in parallel training)
* \param num_machines Number of machines in parallel training
* \return Heap-allocated Dataset; caller takes ownership
*/
Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_machines) {
  // don't support query id in data file when training in parallel
  if (num_machines > 1 && !io_config_.is_pre_partition) {
    if (group_idx_ > 0) {
      Log::Fatal("Using a query id without pre-partitioning the data file is not supported for parallel training. \
Please use an additional query file or pre-partition the data");
    }
  }
  auto parser = Parser::CreateParser(filename, io_config_.has_header, 0, label_idx_);
  if (parser == nullptr) {
    Log::Fatal("Could not recognize data format of %s", filename);
  }
  data_size_t num_global_data = 0;
  std::vector<data_size_t> used_data_indices;
  Dataset* dataset = new Dataset();
  dataset->data_filename_ = filename;
  dataset->num_class_ = io_config_.num_class;
  dataset->metadata_.Init(filename, dataset->num_class_);
  bool is_loading_from_binfile = CheckCanLoadFromBin(filename);
  if (!is_loading_from_binfile) {
    if (!io_config_.use_two_round_loading) {
      // read data to memory
      auto text_data = LoadTextDataToMemory(filename, dataset->metadata_, rank, num_machines, &num_global_data, &used_data_indices);
      dataset->num_data_ = static_cast<data_size_t>(text_data.size());
      // sample data
      auto sample_data = SampleTextDataFromMemory(text_data);
      // construct feature bin mappers
      ConstructBinMappersFromTextData(rank, num_machines, sample_data, parser, dataset);
      // initialize label
      // (dataset->num_class_ for consistency with the two-round branch; it was
      //  assigned from io_config_.num_class above, so the value is identical)
      dataset->metadata_.Init(dataset->num_data_, dataset->num_class_, weight_idx_, group_idx_);
      // extract features
      ExtractFeaturesFromMemory(text_data, parser, dataset);
      text_data.clear();
    } else {
      // sample data from file
      auto sample_data = SampleTextDataFromFile(filename, dataset->metadata_, rank, num_machines, &num_global_data, &used_data_indices);
      if (used_data_indices.size() > 0) {
        // partitioned: this machine only keeps the sampled row subset
        dataset->num_data_ = static_cast<data_size_t>(used_data_indices.size());
      } else {
        dataset->num_data_ = num_global_data;
      }
      // construct feature bin mappers
      ConstructBinMappersFromTextData(rank, num_machines, sample_data, parser, dataset);
      // initialize label
      dataset->metadata_.Init(dataset->num_data_, dataset->num_class_, weight_idx_, group_idx_);
      // extract features
      ExtractFeaturesFromFile(filename, parser, used_data_indices, dataset);
    }
  } else {
    // load data from binary file; the freshly constructed dataset is replaced
    delete dataset;
    std::string bin_filename(filename);
    bin_filename.append(".bin");
    dataset = LoadFromBinFile(bin_filename.c_str(), rank, num_machines);
  }
  // check meta data
  dataset->metadata_.CheckOrPartition(num_global_data, used_data_indices);
  // need to check training data
  CheckDataset(dataset);
  delete parser;
  return dataset;
}
/*!
* \brief Loads a Dataset (typically validation data) whose feature bin mappers
*        are copied from an existing Dataset, so both are binned identically.
*        Falls back to a binary cache ("<filename>.bin") if one exists.
* \param filename Path of the data file
* \param other Reference Dataset whose feature metadata is copied
* \return Heap-allocated Dataset; caller takes ownership
*/
Dataset* DatasetLoader::LoadFromFileLikeOthers(const char* filename, const Dataset* other) {
  auto parser = Parser::CreateParser(filename, io_config_.has_header, 0, label_idx_);
  if (parser == nullptr) {
    Log::Fatal("Could not recognize data format of %s", filename);
  }
  data_size_t num_global_data = 0;
  std::vector<data_size_t> used_data_indices;
  Dataset* dataset = new Dataset();
  dataset->data_filename_ = filename;
  dataset->num_class_ = io_config_.num_class;
  dataset->metadata_.Init(filename, dataset->num_class_);
  bool is_loading_from_binfile = CheckCanLoadFromBin(filename);
  if (!is_loading_from_binfile) {
    if (!io_config_.use_two_round_loading) {
      // read data in memory; no partitioning for validation data (rank 0 of 1)
      auto text_data = LoadTextDataToMemory(filename, dataset->metadata_, 0, 1, &num_global_data, &used_data_indices);
      dataset->num_data_ = static_cast<data_size_t>(text_data.size());
      // initialize label
      dataset->metadata_.Init(dataset->num_data_, dataset->num_class_, weight_idx_, group_idx_);
      // bin mappers / feature layout come from the reference dataset
      other->CopyFeatureMetadataTo(dataset, io_config_.is_enable_sparse);
      // extract features
      ExtractFeaturesFromMemory(text_data, parser, dataset);
      text_data.clear();
    } else {
      TextReader<data_size_t> text_reader(filename, io_config_.has_header);
      // Get number of lines of data file
      dataset->num_data_ = static_cast<data_size_t>(text_reader.CountLine());
      num_global_data = dataset->num_data_;
      // initialize label
      dataset->metadata_.Init(dataset->num_data_, dataset->num_class_, weight_idx_, group_idx_);
      // bin mappers / feature layout come from the reference dataset
      other->CopyFeatureMetadataTo(dataset, io_config_.is_enable_sparse);
      // extract features
      ExtractFeaturesFromFile(filename, parser, used_data_indices, dataset);
    }
  } else {
    // load data from binary file; the freshly constructed dataset is replaced
    delete dataset;
    std::string bin_filename(filename);
    bin_filename.append(".bin");
    dataset = LoadFromBinFile(bin_filename.c_str(), 0, 1);
  }
  // not need to check validation data
  // check meta data
  dataset->metadata_.CheckOrPartition(num_global_data, used_data_indices);
  delete parser;
  return dataset;
}
/*!
* \brief Restores a Dataset from a binary cache file written by
*        Dataset::SaveBinaryFile. Reads header, feature map, feature names,
*        metadata and per-feature payloads in write order, and performs
*        per-machine row sampling when running distributed without
*        pre-partitioned data.
* \param bin_filename Path of the ".bin" cache file
* \param rank This machine's rank (for random partitioning)
* \param num_machines Number of machines in parallel training
* \return Heap-allocated Dataset with is_loading_from_binfile_ set; caller takes ownership
*/
Dataset* DatasetLoader::LoadFromBinFile(const char* bin_filename, int rank, int num_machines) {
  Dataset* dataset = new Dataset();
  FILE* file;
#ifdef _MSC_VER
  fopen_s(&file, bin_filename, "rb");
#else
  file = fopen(bin_filename, "rb");
#endif
  if (file == NULL) {
    Log::Fatal("Could not read binary data from %s", bin_filename);
  }
  // buffer to read binary file (grown on demand below)
  size_t buffer_size = 16 * 1024 * 1024;
  char* buffer = new char[buffer_size];
  // read size of header
  size_t read_cnt = fread(buffer, sizeof(size_t), 1, file);
  if (read_cnt != 1) {
    Log::Fatal("Binary file error: header has the wrong size");
  }
  size_t size_of_head = *(reinterpret_cast<size_t*>(buffer));
  // re-allocmate space if not enough
  if (size_of_head > buffer_size) {
    delete[] buffer;
    buffer_size = size_of_head;
    buffer = new char[buffer_size];
  }
  // read header
  read_cnt = fread(buffer, 1, size_of_head, file);
  if (read_cnt != size_of_head) {
    Log::Fatal("Binary file error: header is incorrect");
  }
  // get header: fields are unpacked in the exact order they were written
  const char* mem_ptr = buffer;
  dataset->num_data_ = *(reinterpret_cast<const data_size_t*>(mem_ptr));
  mem_ptr += sizeof(dataset->num_data_);
  dataset->num_features_ = *(reinterpret_cast<const int*>(mem_ptr));
  mem_ptr += sizeof(dataset->num_features_);
  dataset->num_total_features_ = *(reinterpret_cast<const int*>(mem_ptr));
  mem_ptr += sizeof(dataset->num_total_features_);
  size_t num_used_feature_map = *(reinterpret_cast<const size_t*>(mem_ptr));
  mem_ptr += sizeof(num_used_feature_map);
  const int* tmp_feature_map = reinterpret_cast<const int*>(mem_ptr);
  dataset->used_feature_map_.clear();
  for (size_t i = 0; i < num_used_feature_map; ++i) {
    dataset->used_feature_map_.push_back(tmp_feature_map[i]);
  }
  mem_ptr += sizeof(int) * num_used_feature_map;
  // get feature names
  // NOTE(review): names are restored into the LOADER's feature_names_, not the
  // dataset's — confirm this is intentional (the dataset is the return value).
  feature_names_.clear();
  // each name is serialized as [int length][length chars]
  for (int i = 0; i < dataset->num_total_features_; ++i) {
    int str_len = *(reinterpret_cast<const int*>(mem_ptr));
    mem_ptr += sizeof(int);
    std::stringstream str_buf;
    for (int j = 0; j < str_len; ++j) {
      char tmp_char = *(reinterpret_cast<const char*>(mem_ptr));
      mem_ptr += sizeof(char);
      str_buf << tmp_char;
    }
    feature_names_.emplace_back(str_buf.str());
  }
  // read size of meta data
  read_cnt = fread(buffer, sizeof(size_t), 1, file);
  if (read_cnt != 1) {
    Log::Fatal("Binary file error: meta data has the wrong size");
  }
  size_t size_of_metadata = *(reinterpret_cast<size_t*>(buffer));
  // re-allocate space if not enough
  if (size_of_metadata > buffer_size) {
    delete[] buffer;
    buffer_size = size_of_metadata;
    buffer = new char[buffer_size];
  }
  // read meta data
  read_cnt = fread(buffer, 1, size_of_metadata, file);
  if (read_cnt != size_of_metadata) {
    Log::Fatal("Binary file error: meta data is incorrect");
  }
  // load meta data
  dataset->metadata_.LoadFromMemory(buffer);
  std::vector<data_size_t> used_data_indices;
  data_size_t num_global_data = dataset->num_data_;
  // sample local used data if need to partition
  if (num_machines > 1 && !io_config_.is_pre_partition) {
    const data_size_t* query_boundaries = dataset->metadata_.query_boundaries();
    if (query_boundaries == nullptr) {
      // if not contain query file, minimal sample unit is one record
      for (data_size_t i = 0; i < dataset->num_data_; ++i) {
        if (random_.NextInt(0, num_machines) == rank) {
          used_data_indices.push_back(i);
        }
      }
    } else {
      // if contain query file, minimal sample unit is one query
      data_size_t num_queries = dataset->metadata_.num_queries();
      data_size_t qid = -1;
      bool is_query_used = false;
      for (data_size_t i = 0; i < dataset->num_data_; ++i) {
        if (qid >= num_queries) {
          Log::Fatal("Current query exceeds the range of the query file, please ensure the query file is correct");
        }
        if (i >= query_boundaries[qid + 1]) {
          // if is new query: decide once per query whether this rank keeps it
          is_query_used = false;
          if (random_.NextInt(0, num_machines) == rank) {
            is_query_used = true;
          }
          ++qid;
        }
        if (is_query_used) {
          used_data_indices.push_back(i);
        }
      }
    }
    dataset->num_data_ = static_cast<data_size_t>(used_data_indices.size());
  }
  dataset->metadata_.PartitionLabel(used_data_indices);
  // read feature data
  for (int i = 0; i < dataset->num_features_; ++i) {
    // read feature size
    read_cnt = fread(buffer, sizeof(size_t), 1, file);
    if (read_cnt != 1) {
      Log::Fatal("Binary file error: feature %d has the wrong size", i);
    }
    size_t size_of_feature = *(reinterpret_cast<size_t*>(buffer));
    // re-allocate space if not enough
    if (size_of_feature > buffer_size) {
      delete[] buffer;
      buffer_size = size_of_feature;
      buffer = new char[buffer_size];
    }
    read_cnt = fread(buffer, 1, size_of_feature, file);
    if (read_cnt != size_of_feature) {
      // NOTE(review): read_cnt is size_t but formatted with %d — consider %zu
      Log::Fatal("Binary file error: feature %d is incorrect, read count: %d", i, read_cnt);
    }
    dataset->features_.push_back(new Feature(buffer, num_global_data, used_data_indices));
  }
  delete[] buffer;
  fclose(file);
  dataset->is_loading_from_binfile_ = true;
  return dataset;
}
// ---- private functions ----
/*!
* \brief Validate a freshly loaded dataset; aborts via Log::Fatal on failure.
* \param dataset Dataset to check (must be non-null)
*/
void DatasetLoader::CheckDataset(const Dataset* dataset) {
  // a dataset without any rows cannot be used for training or prediction
  if (dataset->num_data_ <= 0) {
    Log::Fatal("Data file %s is empty", dataset->data_filename_);
  }
  // all features may have been dropped (ignored or trivial single-bin features)
  if (dataset->features_.empty()) {
    Log::Fatal("No usable features in data file %s", dataset->data_filename_);
  }
}
// Read the whole text data file into memory. When running on multiple machines
// without pre-partitioned data, keep only the rows assigned to this rank.
// *num_global_data receives the total row count over all machines;
// *used_data_indices receives the global indices of the locally kept rows
// (left empty when every row is kept).
std::vector<std::string> DatasetLoader::LoadTextDataToMemory(const char* filename, const Metadata& metadata,
  int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices) {
  TextReader<data_size_t> text_reader(filename, io_config_.has_header);
  used_data_indices->clear();
  if (num_machines == 1 || io_config_.is_pre_partition) {
    // read all lines
    *num_global_data = text_reader.ReadAllLines();
  } else {
    // need partition data
    // get query data
    const data_size_t* query_boundaries = metadata.query_boundaries();
    if (query_boundaries == nullptr) {
      // if not contain query data, minimal sample unit is one record:
      // each record is assigned to exactly one machine at random
      *num_global_data = text_reader.ReadAndFilterLines([this, rank, num_machines](data_size_t) {
        if (random_.NextInt(0, num_machines) == rank) {
          return true;
        } else {
          return false;
        }
      }, used_data_indices);
    } else {
      // if contain query data, minimal sample unit is one query, so that all
      // records of a query stay on the same machine (needed for ranking tasks)
      data_size_t num_queries = metadata.num_queries();
      // current query id; starts at -1 so the first row crosses query_boundaries[0]
      data_size_t qid = -1;
      // whether the current query was assigned to this rank
      bool is_query_used = false;
      *num_global_data = text_reader.ReadAndFilterLines(
        [this, rank, num_machines, &qid, &query_boundaries, &is_query_used, num_queries]
        (data_size_t line_idx) {
        if (qid >= num_queries) {
          Log::Fatal("Current query exceeds the range of the query file, please ensure the query file is correct");
        }
        if (line_idx >= query_boundaries[qid + 1]) {
          // if is new query: re-draw whether this rank keeps it
          is_query_used = false;
          if (random_.NextInt(0, num_machines) == rank) {
            is_query_used = true;
          }
          ++qid;
        }
        return is_query_used;
      }, used_data_indices);
    }
  }
  // move the reader's internal line buffer out to the caller without copying
  return std::move(text_reader.Lines());
}
/*!
* \brief Randomly sample lines from data already loaded in memory.
*        Used to collect a subset of rows for bin-boundary construction.
* \param data All text lines of the data file
* \return At most io_config_.bin_construct_sample_cnt randomly chosen lines
*/
std::vector<std::string> DatasetLoader::SampleTextDataFromMemory(const std::vector<std::string>& data) {
  // take at most bin_construct_sample_cnt lines; if fewer are available, take them all
  // (cast avoids a signed/unsigned comparison between size_t and int)
  const size_t sample_cnt = static_cast<size_t>(
      data.size() < static_cast<size_t>(io_config_.bin_construct_sample_cnt)
      ? data.size() : io_config_.bin_construct_sample_cnt);
  // draw sample_cnt distinct indices in [0, data.size())
  std::vector<size_t> sample_indices = random_.Sample(data.size(), sample_cnt);
  std::vector<std::string> out;
  // reserve up front to avoid repeated reallocation while copying the lines
  out.reserve(sample_indices.size());
  for (const size_t idx : sample_indices) {
    out.push_back(data[idx]);
  }
  return out;
}
// Sample lines for bin construction directly from the file, without loading
// the whole file into memory. When running on multiple machines without
// pre-partitioned data, the sampled lines come only from the rows assigned to
// this rank. *num_global_data receives the total row count over all machines;
// *used_data_indices receives the global indices of the locally kept rows.
std::vector<std::string> DatasetLoader::SampleTextDataFromFile(const char* filename, const Metadata& metadata,
  int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices) {
  const data_size_t sample_cnt = static_cast<data_size_t>(io_config_.bin_construct_sample_cnt);
  TextReader<data_size_t> text_reader(filename, io_config_.has_header);
  std::vector<std::string> out_data;
  if (num_machines == 1 || io_config_.is_pre_partition) {
    // keep every row locally; just reservoir-sample from the file
    *num_global_data = static_cast<data_size_t>(text_reader.SampleFromFile(random_, sample_cnt, &out_data));
  } else {
    // need partition data
    // get query data
    const data_size_t* query_boundaries = metadata.query_boundaries();
    if (query_boundaries == nullptr) {
      // if not contain query file, minimal sample unit is one record:
      // each record is assigned to exactly one machine at random
      *num_global_data = text_reader.SampleAndFilterFromFile([this, rank, num_machines]
      (data_size_t) {
        if (random_.NextInt(0, num_machines) == rank) {
          return true;
        } else {
          return false;
        }
      }, used_data_indices, random_, sample_cnt, &out_data);
    } else {
      // if contain query file, minimal sample unit is one query, so that all
      // records of a query stay on the same machine (needed for ranking tasks)
      data_size_t num_queries = metadata.num_queries();
      // current query id; starts at -1 so the first row crosses query_boundaries[0]
      data_size_t qid = -1;
      // whether the current query was assigned to this rank
      bool is_query_used = false;
      *num_global_data = text_reader.SampleAndFilterFromFile(
        [this, rank, num_machines, &qid, &query_boundaries, &is_query_used, num_queries]
        (data_size_t line_idx) {
        if (qid >= num_queries) {
          // NOTE: the string below uses a line continuation, so the message is
          // emitted as a single line
          Log::Fatal("Query id exceeds the range of the query file, \
please ensure the query file is correct");
        }
        if (line_idx >= query_boundaries[qid + 1]) {
          // if is new query: re-draw whether this rank keeps it
          is_query_used = false;
          if (random_.NextInt(0, num_machines) == rank) {
            is_query_used = true;
          }
          ++qid;
        }
        return is_query_used;
      }, used_data_indices, random_, sample_cnt, &out_data);
    }
  }
  return out_data;
}
// Build the per-feature BinMapper objects (feature-value discretization) from
// a set of sampled text rows, and register the resulting usable features on
// the dataset. On a single machine every feature's bins are found locally; on
// multiple machines the features are split across ranks, each rank bins its
// share, and the results are exchanged with an Allgather.
void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
  const std::vector<std::string>& sample_data, const Parser* parser, Dataset* dataset) {
  // sample_values[i][j], means the value of j-th sample on i-th feature
  std::vector<std::vector<double>> sample_values;
  // temp buffer for one line features and label
  std::vector<std::pair<int, double>> oneline_features;
  double label;
  for (size_t i = 0; i < sample_data.size(); ++i) {
    oneline_features.clear();
    // parse features
    parser->ParseOneLine(sample_data[i].c_str(), &oneline_features, &label);
    // push 0 first, then edit the value according existing feature values
    for (auto& feature_values : sample_values) {
      feature_values.push_back(0.0);
    }
    for (std::pair<int, double>& inner_data : oneline_features) {
      if (static_cast<size_t>(inner_data.first) >= sample_values.size()) {
        // if need expand feature set (the parsed line referenced a feature
        // index we have not seen yet)
        size_t need_size = inner_data.first - sample_values.size() + 1;
        for (size_t j = 0; j < need_size; ++j) {
          // each new feature column gets i+1 zeros, one per sample seen so far
          sample_values.emplace_back(i + 1, 0.0f);
        }
      }
      // edit the feature value
      sample_values[inner_data.first][i] = inner_data.second;
    }
  }
  dataset->features_.clear();
  // -1 means doesn't use this feature
  dataset->used_feature_map_ = std::vector<int>(sample_values.size(), -1);
  dataset->num_total_features_ = static_cast<int>(sample_values.size());
  // check the range of label_idx, weight_idx and group_idx
  CHECK(label_idx_ >= 0 && label_idx_ <= dataset->num_total_features_);
  CHECK(weight_idx_ < 0 || weight_idx_ < dataset->num_total_features_);
  CHECK(group_idx_ < 0 || group_idx_ < dataset->num_total_features_);
  // fill feature_names_ if not header: synthesize "Column_<i>" names
  if (feature_names_.size() <= 0) {
    for (int i = 0; i < dataset->num_total_features_; ++i) {
      std::stringstream str_buf;
      str_buf << "Column_" << i;
      feature_names_.push_back(str_buf.str());
    }
  }
  dataset->feature_names_ = feature_names_;
  // start find bins
  if (num_machines == 1) {
    std::vector<BinMapper*> bin_mappers(sample_values.size());
    // if only one machine, find bin locally
    #pragma omp parallel for schedule(guided)
    for (int i = 0; i < static_cast<int>(sample_values.size()); ++i) {
      if (ignore_features_.count(i) > 0) {
        // feature explicitly ignored by configuration
        bin_mappers[i] = nullptr;
        continue;
      }
      bin_mappers[i] = new BinMapper();
      bin_mappers[i]->FindBin(&sample_values[i], io_config_.max_bin);
    }
    for (size_t i = 0; i < sample_values.size(); ++i) {
      if (bin_mappers[i] == nullptr) {
        Log::Warning("Ignoring feature %s", feature_names_[i].c_str());
      } else if (!bin_mappers[i]->is_trival()) {
        // map real feature index to used feature index
        dataset->used_feature_map_[i] = static_cast<int>(dataset->features_.size());
        // push new feature (the Feature takes ownership of the BinMapper)
        dataset->features_.push_back(new Feature(static_cast<int>(i), bin_mappers[i],
          dataset->num_data_, io_config_.is_enable_sparse));
      } else {
        // if feature is trival(only 1 bin), free spaces
        Log::Warning("Ignoring feature %s, only has one value", feature_names_[i].c_str());
        delete bin_mappers[i];
      }
    }
  } else {
    // if have multi-machines, need find bin distributed
    // different machines will find bin for different features
    // start and len will store the process feature indices for different machines
    // machine i will find bins for features in [ start[i], start[i] + len[i] )
    int* start = new int[num_machines];
    int* len = new int[num_machines];
    int total_num_feature = static_cast<int>(sample_values.size());
    // ceiling division: features per machine
    int step = (total_num_feature + num_machines - 1) / num_machines;
    if (step < 1) { step = 1; }
    start[0] = 0;
    for (int i = 0; i < num_machines - 1; ++i) {
      len[i] = Common::Min<int>(step, total_num_feature - start[i]);
      start[i + 1] = start[i] + len[i];
    }
    // last machine takes whatever remains
    len[num_machines - 1] = total_num_feature - start[num_machines - 1];
    // get size of bin mapper with max_bin_ size
    int type_size = BinMapper::SizeForSpecificBin(io_config_.max_bin);
    // since sizes of different feature may not be same, we expand all bin mapper to type_size
    int buffer_size = type_size * total_num_feature;
    char* input_buffer = new char[buffer_size];
    char* output_buffer = new char[buffer_size];
    // find local feature bins and copy to buffer
    #pragma omp parallel for schedule(guided)
    for (int i = 0; i < len[rank]; ++i) {
      BinMapper* bin_mapper = new BinMapper();
      bin_mapper->FindBin(&sample_values[start[rank] + i], io_config_.max_bin);
      bin_mapper->CopyTo(input_buffer + i * type_size);
      // don't need this any more
      delete bin_mapper;
    }
    // convert to binary size (element counts -> byte offsets/lengths)
    for (int i = 0; i < num_machines; ++i) {
      start[i] *= type_size;
      len[i] *= type_size;
    }
    // gather global feature bin mappers
    Network::Allgather(input_buffer, buffer_size, start, len, output_buffer);
    // restore features bins from buffer
    for (int i = 0; i < total_num_feature; ++i) {
      if (ignore_features_.count(i) > 0) {
        Log::Warning("Ignoring feature %s", feature_names_[i].c_str());
        continue;
      }
      BinMapper* bin_mapper = new BinMapper();
      bin_mapper->CopyFrom(output_buffer + i * type_size);
      if (!bin_mapper->is_trival()) {
        // map real feature index to used feature index; Feature takes ownership
        dataset->used_feature_map_[i] = static_cast<int>(dataset->features_.size());
        dataset->features_.push_back(new Feature(static_cast<int>(i), bin_mapper,
          dataset->num_data_, io_config_.is_enable_sparse));
      } else {
        Log::Warning("Ignoring feature %s, only has one value", feature_names_[i].c_str());
        delete bin_mapper;
      }
    }
    // free buffer
    delete[] start;
    delete[] len;
    delete[] input_buffer;
    delete[] output_buffer;
  }
  dataset->num_features_ = static_cast<int>(dataset->features_.size());
}
/*!
* \brief Extract local features from text lines already loaded in memory.
*        Parses every line, pushes feature values into the dataset's binned
*        feature columns, sets labels/weights/query ids on the metadata, and
*        (when an initial model is given via predict_fun_) computes per-row
*        initial scores. Consumed lines are cleared to release memory.
*/
void DatasetLoader::ExtractFeaturesFromMemory(std::vector<std::string>& text_data, const Parser* parser, Dataset* dataset) {
  std::vector<std::pair<int, double>> oneline_features;
  double tmp_label = 0.0f;
  if (predict_fun_ == nullptr) {
    // if doesn't need to prediction with initial model
    // (oneline_features/tmp_label are per-thread copies via the OpenMP clauses)
    #pragma omp parallel for schedule(guided) private(oneline_features) firstprivate(tmp_label)
    for (data_size_t i = 0; i < dataset->num_data_; ++i) {
      const int tid = omp_get_thread_num();
      oneline_features.clear();
      // parser
      parser->ParseOneLine(text_data[i].c_str(), &oneline_features, &tmp_label);
      // set label
      dataset->metadata_.SetLabelAt(i, static_cast<float>(tmp_label));
      // free processed line:
      text_data[i].clear();
      // shrink_to_fit will be very slow in linux, and seems not free memory, disable for now
      // text_reader_->Lines()[i].shrink_to_fit();
      // push data
      for (auto& inner_data : oneline_features) {
        int feature_idx = dataset->used_feature_map_[inner_data.first];
        if (feature_idx >= 0) {
          // if is used feature
          dataset->features_[feature_idx]->PushData(tid, i, inner_data.second);
        } else {
          // columns not used as features may still carry weight / query id
          if (inner_data.first == weight_idx_) {
            dataset->metadata_.SetWeightAt(i, static_cast<float>(inner_data.second));
          } else if (inner_data.first == group_idx_) {
            dataset->metadata_.SetQueryAt(i, static_cast<data_size_t>(inner_data.second));
          }
        }
      }
    }
  } else {
    // if need to prediction with initial model
    // init_score layout: score of class k for row i is at [k * num_data_ + i]
    float* init_score = new float[dataset->num_data_ * dataset->num_class_];
    #pragma omp parallel for schedule(guided) private(oneline_features) firstprivate(tmp_label)
    for (data_size_t i = 0; i < dataset->num_data_; ++i) {
      const int tid = omp_get_thread_num();
      oneline_features.clear();
      // parser
      parser->ParseOneLine(text_data[i].c_str(), &oneline_features, &tmp_label);
      // set initial score
      std::vector<double> oneline_init_score = predict_fun_(oneline_features);
      for (int k = 0; k < dataset->num_class_; ++k) {
        init_score[k * dataset->num_data_ + i] = static_cast<float>(oneline_init_score[k]);
      }
      // set label
      dataset->metadata_.SetLabelAt(i, static_cast<float>(tmp_label));
      // free processed line:
      text_data[i].clear();
      // shrink_to_fit will be very slow in linux, and seems not free memory, disable for now
      // text_reader_->Lines()[i].shrink_to_fit();
      // push data
      for (auto& inner_data : oneline_features) {
        int feature_idx = dataset->used_feature_map_[inner_data.first];
        if (feature_idx >= 0) {
          // if is used feature
          dataset->features_[feature_idx]->PushData(tid, i, inner_data.second);
        } else {
          // columns not used as features may still carry weight / query id
          if (inner_data.first == weight_idx_) {
            dataset->metadata_.SetWeightAt(i, static_cast<float>(inner_data.second));
          } else if (inner_data.first == group_idx_) {
            dataset->metadata_.SetQueryAt(i, static_cast<data_size_t>(inner_data.second));
          }
        }
      }
    }
    // SetInitScore presumably copies the buffer (it is freed right below) —
    // metadata_ manages its own copy of the init score
    dataset->metadata_.SetInitScore(init_score, dataset->num_data_ * dataset->num_class_);
    delete[] init_score;
  }
  // finalize every feature column after all rows are pushed
  #pragma omp parallel for schedule(guided)
  for (int i = 0; i < dataset->num_features_; ++i) {
    dataset->features_[i]->FinishLoad();
  }
  // text data can be free after loaded feature values
  text_data.clear();
}
/*!
* \brief Extract local features directly from a text file.
*        Reads either all rows or only used_data_indices, parses each chunk of
*        lines via process_fun (run in parallel), pushes feature values into
*        the dataset's binned feature columns, sets labels/weights/query ids,
*        and optionally computes initial scores from predict_fun_.
*/
void DatasetLoader::ExtractFeaturesFromFile(const char* filename, const Parser* parser,
  const std::vector<data_size_t>& used_data_indices, Dataset* dataset) {
  float* init_score = nullptr;
  if (predict_fun_ != nullptr) {
    // init_score layout: score of class k for row i is at [k * num_data_ + i]
    init_score = new float[dataset->num_data_ * dataset->num_class_];
  }
  // callback invoked by the text reader for each chunk of lines;
  // start_idx is the row index of lines[0] within the local dataset
  std::function<void(data_size_t, const std::vector<std::string>&)> process_fun =
    [this, &init_score, &parser, &dataset]
  (data_size_t start_idx, const std::vector<std::string>& lines) {
    std::vector<std::pair<int, double>> oneline_features;
    double tmp_label = 0.0f;
    // oneline_features/tmp_label are per-thread copies via the OpenMP clauses
    #pragma omp parallel for schedule(static) private(oneline_features) firstprivate(tmp_label)
    for (data_size_t i = 0; i < static_cast<data_size_t>(lines.size()); ++i) {
      const int tid = omp_get_thread_num();
      oneline_features.clear();
      // parser
      parser->ParseOneLine(lines[i].c_str(), &oneline_features, &tmp_label);
      // set initial score
      if (init_score != nullptr) {
        std::vector<double> oneline_init_score = predict_fun_(oneline_features);
        for (int k = 0; k < dataset->num_class_; ++k) {
          init_score[k * dataset->num_data_ + start_idx + i] = static_cast<float>(oneline_init_score[k]);
        }
      }
      // set label
      dataset->metadata_.SetLabelAt(start_idx + i, static_cast<float>(tmp_label));
      // push data
      for (auto& inner_data : oneline_features) {
        int feature_idx = dataset->used_feature_map_[inner_data.first];
        if (feature_idx >= 0) {
          // if is used feature
          dataset->features_[feature_idx]->PushData(tid, start_idx + i, inner_data.second);
        } else {
          // columns not used as features may still carry weight / query id
          if (inner_data.first == weight_idx_) {
            dataset->metadata_.SetWeightAt(start_idx + i, static_cast<float>(inner_data.second));
          } else if (inner_data.first == group_idx_) {
            dataset->metadata_.SetQueryAt(start_idx + i, static_cast<data_size_t>(inner_data.second));
          }
        }
      }
    }
  };
  TextReader<data_size_t> text_reader(filename, io_config_.has_header);
  if (used_data_indices.size() > 0) {
    // only need part of data
    text_reader.ReadPartAndProcessParallel(used_data_indices, process_fun);
  } else {
    // need full data
    text_reader.ReadAllAndProcessParallel(process_fun);
  }
  // SetInitScore presumably copies the buffer (it is freed right below) —
  // metadata_ manages its own copy of the init score
  if (init_score != nullptr) {
    dataset->metadata_.SetInitScore(init_score, dataset->num_data_ * dataset->num_class_);
    delete[] init_score;
  }
  // finalize every feature column after all rows are pushed
  #pragma omp parallel for schedule(guided)
  for (int i = 0; i < dataset->num_features_; ++i) {
    dataset->features_[i]->FinishLoad();
  }
}
/*!
* \brief Check whether a pre-built binary version of the data file exists.
*        The binary cache uses the original file name with a ".bin" suffix.
* \param filename Path of the original (text) data file
* \return true if "<filename>.bin" can be opened for reading, false otherwise
*/
bool DatasetLoader::CheckCanLoadFromBin(const char* filename) {
  std::string bin_filename(filename);
  bin_filename.append(".bin");
  // initialize so the pointer is well-defined on every #ifdef branch
  FILE* file = nullptr;
#ifdef _MSC_VER
  // MSVC deprecates plain fopen; use the secure variant
  fopen_s(&file, bin_filename.c_str(), "rb");
#else
  file = fopen(bin_filename.c_str(), "rb");
#endif
  if (file == nullptr) {
    return false;
  }
  // only probing for existence/readability, so close immediately
  fclose(file);
  return true;
}
}
\ No newline at end of file
src/io/metadata.cpp
View file @
1c08e71e
...
...
@@ -14,9 +14,8 @@ Metadata::Metadata()
}
void
Metadata
::
Init
(
const
char
*
data_filename
,
const
char
*
init_score_filename
,
const
int
num_class
)
{
void
Metadata
::
Init
(
const
char
*
data_filename
,
const
int
num_class
)
{
data_filename_
=
data_filename
;
init_score_filename_
=
init_score_filename
;
num_class_
=
num_class
;
// for lambdarank, it needs query data for partition data in parallel learning
LoadQueryBoundaries
();
...
...
@@ -25,11 +24,6 @@ void Metadata::Init(const char * data_filename, const char* init_score_filename,
LoadInitialScore
();
}
void
Metadata
::
Init
(
const
char
*
init_score_filename
,
const
int
num_class
)
{
init_score_filename_
=
init_score_filename
;
num_class_
=
num_class
;
LoadInitialScore
();
}
Metadata
::~
Metadata
()
{
...
...
@@ -294,10 +288,14 @@ void Metadata::LoadWeights() {
void
Metadata
::
LoadInitialScore
()
{
num_init_score_
=
0
;
if
(
init_score_filename_
[
0
]
==
'\0'
)
{
return
;
}
TextReader
<
size_t
>
reader
(
init_score_filename_
,
false
);
std
::
string
init_score_filename
(
data_filename_
);
// default weight file name
init_score_filename
.
append
(
".init"
);
TextReader
<
size_t
>
reader
(
init_score_filename
.
c_str
(),
false
);
reader
.
ReadAllLines
();
if
(
reader
.
Lines
().
size
()
<=
0
)
{
return
;
}
Log
::
Info
(
"Loading initial scores..."
);
num_init_score_
=
static_cast
<
data_size_t
>
(
reader
.
Lines
().
size
());
...
...
windows/LightGBM.vcxproj
View file @
1c08e71e
...
...
@@ -161,6 +161,7 @@
<ClInclude
Include=
"..\include\LightGBM\config.h"
/>
<ClInclude
Include=
"..\include\LightGBM\c_api.h"
/>
<ClInclude
Include=
"..\include\LightGBM\dataset.h"
/>
<ClInclude
Include=
"..\include\LightGBM\dataset_loader.h"
/>
<ClInclude
Include=
"..\include\LightGBM\feature.h"
/>
<ClInclude
Include=
"..\include\LightGBM\meta.h"
/>
<ClInclude
Include=
"..\include\LightGBM\metric.h"
/>
...
...
@@ -208,6 +209,7 @@
<ClCompile
Include=
"..\src\io\bin.cpp"
/>
<ClCompile
Include=
"..\src\io\config.cpp"
/>
<ClCompile
Include=
"..\src\io\dataset.cpp"
/>
<ClCompile
Include=
"..\src\io\dataset_loader.cpp"
/>
<ClCompile
Include=
"..\src\io\metadata.cpp"
/>
<ClCompile
Include=
"..\src\io\parser.cpp"
/>
<ClCompile
Include=
"..\src\io\tree.cpp"
/>
...
...
windows/LightGBM.vcxproj.filters
View file @
1c08e71e
...
...
@@ -168,6 +168,9 @@
<ClInclude
Include=
"..\include\LightGBM\c_api.h"
>
<Filter>
include\LightGBM
</Filter>
</ClInclude>
<ClInclude
Include=
"..\include\LightGBM\dataset_loader.h"
>
<Filter>
include\LightGBM
</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<ClCompile
Include=
"..\src\application\application.cpp"
>
...
...
@@ -236,5 +239,8 @@
<ClCompile
Include=
"..\src\c_api.cpp"
>
<Filter>
src
</Filter>
</ClCompile>
<ClCompile
Include=
"..\src\io\dataset_loader.cpp"
>
<Filter>
src\io
</Filter>
</ClCompile>
</ItemGroup>
</Project>
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment