"python-package/vscode:/vscode.git/clone" did not exist on "b918b5b2ff2dd776527e134f26eb8c0f0f096cfa"
Commit b23a2c31 authored by xuehui's avatar xuehui Committed by GitHub
Browse files

Merge pull request #44 from guolinke/master

To solve #41
parents 2af0dccd 3a06ce35
......@@ -85,6 +85,12 @@ public:
*/
virtual int MaxFeatureIdx() const = 0;
/*!
* \brief Get index of label column
* \return index of label column
*/
virtual int LabelIdx() const = 0;
/*!
* \brief Get number of weak sub-models
* \return Number of weak sub-models
......
......@@ -100,6 +100,20 @@ public:
bool use_two_round_loading = false;
bool is_save_binary_file = false;
bool is_sigmoid = true;
bool has_header = false;
/*! \brief Index or column name of label, default is the first column
* And add an prefix "name:" while using column name */
std::string label_column = "";
/*! \brief Index or column name of weight, < 0 means not used
* And add an prefix "name:" while using column name */
std::string weight_column = "";
/*! \brief Index or column name of group, < 0 means not used */
std::string group_column = "";
/*! \brief ignored features, separate by ','
* e.g. name:column_name1,column_name2 */
std::string ignore_column = "";
void Set(const std::unordered_map<std::string, std::string>& params) override;
};
......@@ -323,7 +337,15 @@ struct ParameterAlias {
{ "save_binary", "is_save_binary_file" },
{ "early_stopping_rounds", "early_stopping_round"},
{ "early_stopping", "early_stopping_round"},
{ "verbosity", "verbose" }
{ "verbosity", "verbose" },
{ "header", "has_header" },
{ "label", "label_column" },
{ "weight", "weight_column" },
{ "group", "group_column" },
{ "query", "group_column" },
{ "query_column", "group_column" },
{ "ignore_feature", "ignore_column" },
{ "blacklist", "ignore_column" }
});
std::unordered_map<std::string, std::string> tmp_map;
for (const auto& pair : *params) {
......
......@@ -5,11 +5,13 @@
#include <LightGBM/utils/text_reader.h>
#include <LightGBM/meta.h>
#include <LightGBM/config.h>
#include <vector>
#include <utility>
#include <functional>
#include <string>
#include <unordered_set>
namespace LightGBM {
......@@ -56,10 +58,12 @@ public:
~Metadata();
/*!
* \brief Initial work, will auto load weight, inital scores
* \brief Initial work, will allocate space for label, weight(if exists) and query(if exists)
* \param num_data Number of training data
* \param weight_idx Index of weight column, < 0 means doesn't exists
* \param query_idx Index of query id column, < 0 means doesn't exists
*/
void InitLabel(data_size_t num_data);
void Init(data_size_t num_data, int weight_idx, int query_idx);
/*!
* \brief Partition label by used indices
......@@ -109,6 +113,26 @@ public:
label_[idx] = static_cast<float>(value);
}
/*!
* \brief Set Weight for one record
* \param idx Index of this record
* \param value Weight value of this record
*/
inline void SetWeightAt(data_size_t idx, double value)
{
// narrow from the parsed double to float; weights are stored internally as float
weights_[idx] = static_cast<float>(value);
}
/*!
* \brief Set Query Id for one record
* \param idx Index of this record
* \param value Query Id value of this record
*/
inline void SetQueryAt(data_size_t idx, double value)
{
// query ids are parsed as double from the data file; truncate to integral data_size_t
queries_[idx] = static_cast<data_size_t>(value);
}
/*!
* \brief Get weights, if not exists, will return nullptr
* \return Pointer of weights
......@@ -178,41 +202,35 @@ private:
data_size_t num_init_score_;
/*! \brief Initial score */
score_t* init_score_;
/*! \brief Queries data */
data_size_t* queries_;
};
/*! \brief Interface for Parser */
class Parser {
public:
/*! \brief virtual destructor */
virtual ~Parser() {}
/*!
* \brief Parse one line with label
* \param str One line record, string format, should end with '\0'
* \param out_features Output features, store in (feature_idx, feature_value)
* \param out_label Output label
*/
virtual void ParseOneLine(const char* str,
std::vector<std::pair<int, double>>* out_features,
double* out_label) const = 0;
/*!
* \brief Parse one line with label
* \param str One line record, string format, should end with '\0'
* \param out_features Output features, store in (feature_idx, feature_value)
* \param out_label Output label
* \param out_features Output columns, store in (column_idx, values)
* \param out_label Label will store to this if exists
*/
virtual void ParseOneLine(const char* str,
std::vector<std::pair<int, double>>* out_features) const = 0;
std::vector<std::pair<int, double>>* out_features, double* out_label) const = 0;
/*!
* \brief Create a object of parser, will auto choose the format depend on file
* \param filename One Filename of data
* \param num_features Pass num_features of this data file if you know, <=0 means don't know
* \param has_label output, if num_features > 0, will output this data has label or not
* \param label_idx index of label column
* \return Object of parser
*/
static Parser* CreateParser(const char* filename, int num_features, bool* has_label);
static Parser* CreateParser(const char* filename, bool has_header, int num_features, int label_idx);
};
using PredictFunction =
......@@ -227,29 +245,21 @@ public:
* \brief Constructor
* \param data_filename Filename of dataset
* \param init_score_filename Filename of initial score
* \param is_int_label True if label is int type
* \param max_bin The maximal number of bin that feature values will bucket in
* \param random_seed The seed for random generator
* \param is_enable_sparse True for sparse feature
* \param io_config configs for IO
* \param predict_fun Used for initial model, will give a prediction score based on this function, then set as initial score
*/
Dataset(const char* data_filename, const char* init_score_filename,
int max_bin, int random_seed, bool is_enable_sparse, const PredictFunction& predict_fun);
const IOConfig& io_config, const PredictFunction& predict_fun);
/*!
* \brief Constructor
* \param data_filename Filename of dataset
* \param is_int_label True if label is int type
* \param max_bin The maximal number of bin that feature values will bucket in
* \param random_seed The seed for random generator
* \param is_enable_sparse True for sparse feature
* \param io_config configs for IO
* \param predict_fun Used for initial model, will give a prediction score based on this function, then set as initial score
*/
Dataset(const char* data_filename,
int max_bin, int random_seed, bool is_enable_sparse,
const PredictFunction& predict_fun)
: Dataset(data_filename, "", max_bin, random_seed,
is_enable_sparse, predict_fun) {
const IOConfig& io_config, const PredictFunction& predict_fun)
: Dataset(data_filename, "", io_config, predict_fun) {
}
/*! \brief Destructor */
......@@ -304,6 +314,12 @@ public:
/*! \brief Get Number of total features */
inline int num_total_features() const { return num_total_features_; }
/*! \brief Get the index of label column */
inline int label_idx() const { return label_idx_; }
/*! \brief Get names of current data set */
inline std::vector<std::string> feature_names() const { return feature_names_; }
/*! \brief Get Number of data */
inline data_size_t num_data() const { return num_data_; }
......@@ -394,10 +410,20 @@ private:
bool is_loading_from_binfile_;
/*! \brief Number of global data, used for distributed learning */
size_t global_num_data_ = 0;
// used to local used data indices
/*! \brief used to local used data indices */
std::vector<data_size_t> used_data_indices_;
// prediction function for initial model
/*! \brief prediction function for initial model */
const PredictFunction& predict_fun_;
/*! \brief index of label column */
int label_idx_ = 0;
/*! \brief index of weight column */
int weight_idx_ = -1;
/*! \brief index of group column */
int group_idx_ = -1;
/*! \brief Mapper from real feature index to used index*/
std::unordered_set<int> ignore_features_;
/*! \brief store feature names */
std::vector<std::string> feature_names_;
};
} // namespace LightGBM
......
......@@ -43,7 +43,13 @@ inline static std::string& RemoveQuotationSymbol(std::string& str) {
str.erase(0, str.find_first_not_of("'\""));
return str;
}
/*!
* \brief Check whether a string begins with a given prefix
* \param str String to check
* \param prefix Prefix to look for (an empty prefix always matches)
* \return true if str starts with prefix
*/
inline static bool StartsWith(const std::string& str, const std::string& prefix) {
  // compare() avoids the temporary string that substr() would allocate;
  // prefix is taken by const reference to avoid copying it at every call
  return str.compare(0, prefix.size(), prefix) == 0;
}
inline static std::vector<std::string> Split(const char* c_str, char delimiter) {
std::vector<std::string> ret;
std::string str(c_str);
......@@ -58,6 +64,21 @@ inline static std::vector<std::string> Split(const char* c_str, char delimiter)
return ret;
}
/*!
* \brief Split a C string at every occurrence of any character in delimiters
* \param c_str Null-terminated input string
* \param delimiters Set of single-character delimiters
* \return Tokens in order; empty tokens (including a trailing one) are kept
*/
inline static std::vector<std::string> Split(const char* c_str, const char* delimiters) {
  std::vector<std::string> result;
  const std::string text(c_str);
  size_t token_begin = 0;
  for (size_t cut = text.find_first_of(delimiters);
       cut != std::string::npos;
       cut = text.find_first_of(delimiters, token_begin)) {
    result.push_back(text.substr(token_begin, cut - token_begin));
    token_begin = cut + 1;
  }
  // the remainder after the last delimiter (or the whole string if none)
  result.push_back(text.substr(token_begin));
  return result;
}
inline static const char* Atoi(const char* p, int* out) {
int sign, value;
while (*p == ' ') {
......
......@@ -21,7 +21,7 @@ public:
* \param filename Filename of data
* \param skip_bytes Number of bytes to skip at the start of the file
* \param process_fun Process function
*/
static size_t Read(const char* filename, const std::function<size_t (const char*, size_t)>& process_fun) {
static size_t Read(const char* filename, int skip_bytes, const std::function<size_t (const char*, size_t)>& process_fun) {
FILE* file;
#ifdef _MSC_VER
......@@ -38,8 +38,13 @@ public:
char* buffer_process = new char[buffer_size];
// buffer used for the file reading
char* buffer_read = new char[buffer_size];
size_t read_cnt = 0;
if (skip_bytes > 0) {
// skip first k bytes
read_cnt = fread(buffer_process, 1, skip_bytes, file);
}
// read first block
size_t read_cnt = fread(buffer_process, 1, buffer_size, file);
read_cnt = fread(buffer_process, 1, buffer_size, file);
size_t last_read_cnt = 0;
while (read_cnt > 0) {
// start read thread
......
......@@ -6,6 +6,7 @@
#include <LightGBM/utils/random.h>
#include <cstdio>
#include <sstream>
#include <vector>
#include <string>
......@@ -22,9 +23,41 @@ public:
/*!
* \brief Constructor
* \param filename Filename of data
* \param is_skip_first_line True if need to skip header
*/
TextReader(const char* filename):
filename_(filename){
/*!
* \brief Constructor. When is_skip_first_line is true, the header line is read
*        here, stored in first_line_, and skip_bytes_ is set so that later bulk
*        reads start just past the header (including its line terminator).
* \param filename Filename of data
* \param is_skip_first_line True if the first line is a header that should be skipped
*/
TextReader(const char* filename, bool is_skip_first_line):
  filename_(filename), is_skip_first_line_(is_skip_first_line){
  if (is_skip_first_line_) {
    FILE* file;
#ifdef _MSC_VER
    fopen_s(&file, filename, "r");
#else
    file = fopen(filename, "r");
#endif
    if (file == NULL) {
      // fail fast with a clear message instead of crashing in fgetc below
      Log::Fatal("Data file: %s doesn't exist", filename);
    }
    std::stringstream str_buf;
    int read_c = -1;
    // collect header characters up to (but not including) the line terminator
    read_c = fgetc(file);
    while (read_c != EOF) {
      char tmp_ch = static_cast<char>(read_c);
      if (tmp_ch == '\n' || tmp_ch == '\r') {
        break;
      }
      str_buf << tmp_ch;
      ++skip_bytes_;
      read_c = fgetc(file);
    }
    // consume the line terminator: '\r', '\n', or the Windows pair "\r\n"
    if (static_cast<char>(read_c) == '\r') {
      read_c = fgetc(file);
      ++skip_bytes_;
    }
    if (static_cast<char>(read_c) == '\n') {
      read_c = fgetc(file);
      ++skip_bytes_;
    }
    fclose(file);
    first_line_ = str_buf.str();
    Log::Info("skip header:\"%s\" in file %s", first_line_.c_str(), filename_);
  }
}
/*!
* \brief Destructor
......@@ -40,6 +73,12 @@ public:
lines_.shrink_to_fit();
}
/*!
* \brief Get the header line that was skipped while constructing the reader
* \return Copy of the first line of the file; empty when no header was skipped
*/
inline std::string first_line() {
return first_line_;
}
/*!
* \brief Get text data that read from file
* \return Text data, store in std::vector by line
*/
......@@ -48,7 +87,7 @@ public:
INDEX_T ReadAllAndProcess(const std::function<void(INDEX_T, const char*, size_t)>& process_fun) {
last_line_ = "";
INDEX_T total_cnt = 0;
PipelineReader::Read(filename_,
PipelineReader::Read(filename_, skip_bytes_,
[this, &total_cnt, &process_fun]
(const char* buffer_process, size_t read_cnt) {
size_t cnt = 0;
......@@ -176,7 +215,7 @@ public:
last_line_ = "";
INDEX_T total_cnt = 0;
INDEX_T used_cnt = 0;
PipelineReader::Read(filename_,
PipelineReader::Read(filename_, skip_bytes_,
[this, &total_cnt, &process_fun,&used_cnt, &filter_fun]
(const char* buffer_process, size_t read_cnt) {
size_t cnt = 0;
......@@ -260,6 +299,12 @@ private:
std::vector<std::string> lines_;
/*! \brief Buffer for last line */
std::string last_line_;
/*! \brief first line */
std::string first_line_="";
/*! \brief is skip first line */
bool is_skip_first_line_ = false;
/*! \brief is skip first line */
int skip_bytes_ = 0;
};
} // namespace LightGBM
......
......@@ -76,7 +76,7 @@ void Application::LoadParameters(int argc, char** argv) {
ParameterAlias::KeyAliasTransform(&params);
// read parameters from config file
if (params.count("config_file") > 0) {
TextReader<size_t> config_reader(params["config_file"].c_str());
TextReader<size_t> config_reader(params["config_file"].c_str(), false);
config_reader.ReadAllLines();
if (config_reader.Lines().size() > 0) {
for (auto& line : config_reader.Lines()) {
......@@ -139,9 +139,7 @@ void Application::LoadData() {
}
train_data_ = new Dataset(config_.io_config.data_filename.c_str(),
config_.io_config.input_init_score.c_str(),
config_.io_config.max_bin,
config_.io_config.data_random_seed,
config_.io_config.is_enable_sparse,
config_.io_config,
predict_fun);
// load Training data
if (config_.is_parallel_find_bin) {
......@@ -173,9 +171,7 @@ void Application::LoadData() {
// add
valid_datas_.push_back(
new Dataset(config_.io_config.valid_data_filenames[i].c_str(),
config_.io_config.max_bin,
config_.io_config.data_random_seed,
config_.io_config.is_enable_sparse,
config_.io_config,
predict_fun));
// load validation data like train data
valid_datas_.back()->LoadValidationData(train_data_,
......@@ -253,7 +249,8 @@ void Application::Train() {
void Application::Predict() {
// create predictor
Predictor predictor(boosting_, config_.io_config.is_sigmoid, config_.predict_leaf_index);
predictor.Predict(config_.io_config.data_filename.c_str(), config_.io_config.output_result.c_str());
predictor.Predict(config_.io_config.data_filename.c_str(),
config_.io_config.output_result.c_str(), config_.io_config.has_header);
Log::Info("Finish predict.");
}
......@@ -265,7 +262,7 @@ void Application::InitPredict() {
}
void Application::LoadModel() {
TextReader<size_t> model_reader(config_.io_config.input_model.c_str());
TextReader<size_t> model_reader(config_.io_config.input_model.c_str(), false);
model_reader.ReadAllLines();
std::stringstream ss;
for (auto& line : model_reader.Lines()) {
......
......@@ -92,7 +92,7 @@ public:
* \param has_header True if the data file has a header line that should be skipped
* \param result_filename Filename of output result
*/
void Predict(const char* data_filename, const char* result_filename) {
void Predict(const char* data_filename, const char* result_filename, bool has_header) {
FILE* result_file;
#ifdef _MSC_VER
......@@ -104,8 +104,7 @@ public:
if (result_file == NULL) {
Log::Fatal("Predition result file %s doesn't exists", data_filename);
}
bool has_label = false;
Parser* parser = Parser::CreateParser(data_filename, num_features_, &has_label);
Parser* parser = Parser::CreateParser(data_filename, has_header, num_features_, boosting_->LabelIdx());
if (parser == nullptr) {
Log::Fatal("Recongnizing input data format failed, filename %s", data_filename);
......@@ -114,21 +113,12 @@ public:
// function for parse data
std::function<void(const char*, std::vector<std::pair<int, double>>*)> parser_fun;
double tmp_label;
if (has_label) {
// parse function with label
parser_fun = [this, &parser, &tmp_label]
(const char* buffer, std::vector<std::pair<int, double>>* feature) {
parser->ParseOneLine(buffer, feature, &tmp_label);
};
Log::Info("Start prediction for data %s with labels", data_filename);
} else {
// parse function without label
parser_fun = [this, &parser]
(const char* buffer, std::vector<std::pair<int, double>>* feature) {
parser->ParseOneLine(buffer, feature);
};
Log::Info("Start prediction for data %s without label", data_filename);
}
parser_fun = [this, &parser, &tmp_label]
(const char* buffer, std::vector<std::pair<int, double>>* feature) {
parser->ParseOneLine(buffer, feature, &tmp_label);
};
std::function<std::string(const std::vector<std::pair<int, double>>&)> predict_fun;
if (predict_leaf_index) {
predict_fun = [this](const std::vector<std::pair<int, double>>& features){
......@@ -173,7 +163,7 @@ public:
fprintf(result_file, "%s\n", pred_result[i].c_str());
}
};
TextReader<data_size_t> predict_data_reader(data_filename);
TextReader<data_size_t> predict_data_reader(data_filename, has_header);
predict_data_reader.ReadAllAndProcessParallel(process_fun);
fclose(result_file);
......
......@@ -61,7 +61,8 @@ void GBDT::Init(const Dataset* train_data, const ObjectiveFunction* object_funct
// get max feature index
max_feature_idx_ = train_data_->num_total_features() - 1;
// get label index
label_idx_ = train_data_->label_idx();
// if need bagging, create buffer
if (gbdt_config_->bagging_fraction < 1.0 && gbdt_config_->bagging_freq > 0) {
out_of_bag_data_indices_ = new data_size_t[num_data_];
......@@ -276,19 +277,21 @@ void GBDT::Boosting() {
std::string GBDT::ModelsToString() const {
// serialize this object to string
std::stringstream ss;
std::stringstream str_buf;
// output label index
str_buf << "label_index=" << label_idx_ << std::endl;
// output max_feature_idx
ss << "max_feature_idx=" << max_feature_idx_ << std::endl;
str_buf << "max_feature_idx=" << max_feature_idx_ << std::endl;
// output sigmoid parameter
ss << "sigmoid=" << object_function_->GetSigmoid() << std::endl;
ss << std::endl;
str_buf << "sigmoid=" << object_function_->GetSigmoid() << std::endl;
str_buf << std::endl;
// output tree models
for (size_t i = 0; i < models_.size(); ++i) {
ss << "Tree=" << i << std::endl;
ss << models_[i]->ToString() << std::endl;
str_buf << "Tree=" << i << std::endl;
str_buf << models_[i]->ToString() << std::endl;
}
return ss.str();
return str_buf.str();
}
void GBDT::ModelsFromString(const std::string& model_str, int num_used_model) {
......@@ -296,7 +299,26 @@ void GBDT::ModelsFromString(const std::string& model_str, int num_used_model) {
models_.clear();
std::vector<std::string> lines = Common::Split(model_str.c_str(), '\n');
size_t i = 0;
// get index of label
while (i < lines.size()) {
size_t find_pos = lines[i].find("label_index=");
if (find_pos != std::string::npos) {
std::vector<std::string> strs = Common::Split(lines[i].c_str(), '=');
Common::Atoi(strs[1].c_str(), &label_idx_);
++i;
break;
} else {
++i;
}
}
if (i == lines.size()) {
Log::Fatal("Model file doesn't contain label index");
return;
}
// get max_feature_idx first
i = 0;
while (i < lines.size()) {
size_t find_pos = lines[i].find("max_feature_idx=");
if (find_pos != std::string::npos) {
......
......@@ -82,6 +82,13 @@ public:
* \return Max feature index of this model
*/
inline int MaxFeatureIdx() const override { return max_feature_idx_; }
/*!
* \brief Get index of label column
* \return index of label column
*/
inline int LabelIdx() const override { return label_idx_; }
/*!
* \brief Get number of weak sub-models
* \return Number of weak sub-models
......@@ -173,6 +180,9 @@ private:
* if > 0 meas output score will transform by sigmoid function
*/
double sigmoid_;
/*! \brief Index of label column */
data_size_t label_idx_;
};
} // namespace LightGBM
......
......@@ -163,6 +163,11 @@ void IOConfig::Set(const std::unordered_map<std::string, std::string>& params) {
if (GetString(params, "valid_data", &tmp_str)) {
valid_data_filenames = Common::Split(tmp_str.c_str(), ',');
}
GetBool(params, "has_header", &has_header);
GetString(params, "label_column", &label_column);
GetString(params, "weight_column", &weight_column);
GetString(params, "group_column", &group_column);
GetString(params, "ignore_column", &ignore_column);
}
......
......@@ -11,13 +11,14 @@
#include <vector>
#include <utility>
#include <string>
#include <sstream>
namespace LightGBM {
Dataset::Dataset(const char* data_filename, const char* init_score_filename,
int max_bin, int random_seed, bool is_enable_sparse, const PredictFunction& predict_fun)
:data_filename_(data_filename), random_(random_seed),
max_bin_(max_bin), is_enable_sparse_(is_enable_sparse), predict_fun_(predict_fun) {
const IOConfig& io_config, const PredictFunction& predict_fun)
:data_filename_(data_filename), random_(io_config.data_random_seed),
max_bin_(io_config.max_bin), is_enable_sparse_(io_config.is_enable_sparse), predict_fun_(predict_fun) {
CheckCanLoadFromBin();
if (is_loading_from_binfile_ && predict_fun != nullptr) {
......@@ -28,13 +29,134 @@ Dataset::Dataset(const char* data_filename, const char* init_score_filename,
if (!is_loading_from_binfile_) {
// load weight, query information and initilize score
metadata_.Init(data_filename, init_score_filename);
// create text reader
text_reader_ = new TextReader<data_size_t>(data_filename, io_config.has_header);
std::unordered_map<std::string, int> name2idx;
// get column names
if (io_config.has_header) {
std::string first_line = text_reader_->first_line();
feature_names_ = Common::Split(first_line.c_str(), "\t ,");
for (size_t i = 0; i < feature_names_.size(); ++i) {
name2idx[feature_names_[i]] = static_cast<int>(i);
}
}
std::string name_prefix("name:");
// load label idx
if (io_config.label_column.size() > 0) {
if (Common::StartsWith(io_config.label_column, name_prefix)) {
std::string name = io_config.label_column.substr(name_prefix.size());
if (name2idx.count(name) > 0) {
label_idx_ = name2idx[name];
Log::Info("use %s column as label", name.c_str());
} else {
Log::Fatal("cannot find label column: %s in data file", name.c_str());
}
} else {
size_t pos = 0;
label_idx_ = std::stoi(io_config.label_column, &pos);
if (pos != io_config.label_column.size()) {
Log::Fatal("label_column is not a number, \
if you want to use column name, \
please add prefix \"name:\" before column name");
}
Log::Info("use %d-th column as label", label_idx_);
}
}
if (feature_names_.size() > 0) {
// erase label column name
feature_names_.erase(feature_names_.begin() + label_idx_);
}
// load ignore columns
if (io_config.ignore_column.size() > 0) {
if (Common::StartsWith(io_config.ignore_column, name_prefix)) {
std::string names = io_config.ignore_column.substr(name_prefix.size());
for (auto name : Common::Split(names.c_str(), ',')) {
if (name2idx.count(name) > 0) {
int tmp = name2idx[name];
// skip for label column
if (tmp > label_idx_) { tmp -= 1; }
ignore_features_.emplace(tmp);
} else {
Log::Fatal("cannot find column: %s in data file", name.c_str());
}
}
} else {
for (auto token : Common::Split(io_config.ignore_column.c_str(), ',')) {
size_t pos = 0;
int tmp = std::stoi(token, &pos);
if (pos != token.size()) {
Log::Fatal("ignore_column is not a number, \
if you want to use column name, \
please add prefix \"name:\" before column name");
}
// skip for label column
if (tmp > label_idx_) { tmp -= 1; }
ignore_features_.emplace(tmp);
}
}
}
// load weight idx
if (io_config.weight_column.size() > 0) {
if (Common::StartsWith(io_config.weight_column, name_prefix)) {
std::string name = io_config.weight_column.substr(name_prefix.size());
if (name2idx.count(name) > 0) {
weight_idx_ = name2idx[name];
Log::Info("use %s column as weight", name.c_str());
} else {
Log::Fatal("cannot find weight column: %s in data file", name.c_str());
}
} else {
size_t pos = 0;
weight_idx_ = std::stoi(io_config.weight_column, &pos);
if (pos != io_config.weight_column.size()) {
Log::Fatal("weight_column is not a number, \
if you want to use column name, \
please add prefix \"name:\" before column name");
}
Log::Info("use %d-th column as weight", weight_idx_);
}
// skip for label column
if (weight_idx_ > label_idx_) {
weight_idx_ -= 1;
}
ignore_features_.emplace(weight_idx_);
}
if (io_config.group_column.size() > 0) {
if (Common::StartsWith(io_config.group_column, name_prefix)) {
std::string name = io_config.group_column.substr(name_prefix.size());
if (name2idx.count(name) > 0) {
group_idx_ = name2idx[name];
Log::Info("use %s column as group/query id", name.c_str());
} else {
Log::Fatal("cannot find group/query column: %s in data file", name.c_str());
}
} else {
size_t pos = 0;
group_idx_ = std::stoi(io_config.group_column, &pos);
if (pos != io_config.group_column.size()) {
Log::Fatal("group_column is not a number, \
if you want to use column name, \
please add prefix \"name:\" before column name");
}
Log::Info("use %d-th column as group/query id", group_idx_);
}
// skip for label column
if (group_idx_ > label_idx_) {
group_idx_ -= 1;
}
ignore_features_.emplace(group_idx_);
}
// create text parser
parser_ = Parser::CreateParser(data_filename_, 0, nullptr);
parser_ = Parser::CreateParser(data_filename_, io_config.has_header, 0, label_idx_);
if (parser_ == nullptr) {
Log::Fatal("Cannot recognising input data format, filename: %s", data_filename_);
}
// create text reader
text_reader_ = new TextReader<data_size_t>(data_filename);
} else {
// only need to load initilize score, other meta data will be loaded from bin flie
metadata_.Init(init_score_filename);
......@@ -190,18 +312,40 @@ void Dataset::ConstructBinMappers(int rank, int num_machines, const std::vector<
// -1 means doesn't use this feature
used_feature_map_ = std::vector<int>(sample_values.size(), -1);
num_total_features_ = static_cast<int>(sample_values.size());
// check the range of label_idx, weight_idx and group_idx
CHECK(label_idx_ >= 0 && label_idx_ <= num_total_features_);
CHECK(weight_idx_ < 0 || weight_idx_ < num_total_features_);
CHECK(group_idx_ < 0 || group_idx_ < num_total_features_);
// fill feature_names_ if not header
if (feature_names_.size() <= 0) {
for (int i = 0; i < num_total_features_; ++i) {
std::stringstream str_buf;
str_buf << "Column_" << i;
feature_names_.push_back(str_buf.str());
}
}
// start find bins
if (num_machines == 1) {
std::vector<BinMapper*> bin_mappers(sample_values.size());
// if only 1 machines, find bin locally
#pragma omp parallel for schedule(guided)
for (int i = 0; i < static_cast<int>(sample_values.size()); ++i) {
if (ignore_features_.count(i) > 0) {
bin_mappers[i] = nullptr;
continue;
}
bin_mappers[i] = new BinMapper();
bin_mappers[i]->FindBin(&sample_values[i], max_bin_);
}
for (size_t i = 0; i < sample_values.size(); ++i) {
if (!bin_mappers[i]->is_trival()) {
if (bin_mappers[i] == nullptr) {
Log::Error("Ignore Feature %s ", feature_names_[i].c_str());
}
else if (!bin_mappers[i]->is_trival()) {
// map real feature index to used feature index
used_feature_map_[i] = static_cast<int>(features_.size());
// push new feature
......@@ -209,7 +353,7 @@ void Dataset::ConstructBinMappers(int rank, int num_machines, const std::vector<
num_data_, is_enable_sparse_));
} else {
// if feature is trival(only 1 bin), free spaces
Log::Error("Feature %d only contains one value, will be ignored", i);
Log::Error("Feature %s only contains one value, will be ignored", feature_names_[i].c_str());
delete bin_mappers[i];
}
}
......@@ -256,12 +400,17 @@ void Dataset::ConstructBinMappers(int rank, int num_machines, const std::vector<
Network::Allgather(input_buffer, buffer_size, start, len, output_buffer);
// restore features bins from buffer
for (int i = 0; i < total_num_feature; ++i) {
if (ignore_features_.count(i) > 0) {
Log::Error("Ignore Feature %s ", feature_names_[i].c_str());
continue;
}
BinMapper* bin_mapper = new BinMapper();
bin_mapper->CopyFrom(output_buffer + i * type_size);
if (!bin_mapper->is_trival()) {
used_feature_map_[i] = static_cast<int>(features_.size());
features_.push_back(new Feature(static_cast<int>(i), bin_mapper, num_data_, is_enable_sparse_));
} else {
Log::Error("Feature %s only contains one value, will be ignored", feature_names_[i].c_str());
delete bin_mapper;
}
}
......@@ -276,6 +425,13 @@ void Dataset::ConstructBinMappers(int rank, int num_machines, const std::vector<
void Dataset::LoadTrainData(int rank, int num_machines, bool is_pre_partition, bool use_two_round_loading) {
// don't support query id in data file when training in parallel
if (num_machines > 1 && !is_pre_partition) {
if (group_idx_ > 0) {
Log::Fatal("Don't support query id in data file when training parallel without pre-partition. \
Please use an additional query file or pre-partition your data");
}
}
used_data_indices_.clear();
if (!is_loading_from_binfile_ ) {
if (!use_two_round_loading) {
......@@ -287,7 +443,7 @@ void Dataset::LoadTrainData(int rank, int num_machines, bool is_pre_partition, b
// construct feature bin mappers
ConstructBinMappers(rank, num_machines, sample_data);
// initialize label
metadata_.InitLabel(num_data_);
metadata_.Init(num_data_, weight_idx_, group_idx_);
// extract features
ExtractFeaturesFromMemory();
} else {
......@@ -297,7 +453,7 @@ void Dataset::LoadTrainData(int rank, int num_machines, bool is_pre_partition, b
// construct feature bin mappers
ConstructBinMappers(rank, num_machines, sample_data);
// initialize label
metadata_.InitLabel(num_data_);
metadata_.Init(num_data_, weight_idx_, group_idx_);
// extract features
ExtractFeaturesFromFile();
......@@ -322,7 +478,7 @@ void Dataset::LoadValidationData(const Dataset* train_set, bool use_two_round_lo
// read data in memory
LoadDataToMemory(0, 1, false);
// initialize label
metadata_.InitLabel(num_data_);
metadata_.Init(num_data_, weight_idx_, group_idx_);
features_.clear();
// copy feature bin mapper data
for (Feature* feature : train_set->features_) {
......@@ -336,7 +492,7 @@ void Dataset::LoadValidationData(const Dataset* train_set, bool use_two_round_lo
// Get number of lines of data file
num_data_ = static_cast<data_size_t>(text_reader_->CountLine());
// initialize label
metadata_.InitLabel(num_data_);
metadata_.Init(num_data_, weight_idx_, group_idx_);
features_.clear();
// copy feature bin mapper data
for (Feature* feature : train_set->features_) {
......@@ -381,6 +537,13 @@ void Dataset::ExtractFeaturesFromMemory() {
// if is used feature
features_[feature_idx]->PushData(tid, i, inner_data.second);
}
else {
if (inner_data.first == weight_idx_) {
metadata_.SetWeightAt(i, inner_data.second);
} else if (inner_data.first == group_idx_) {
metadata_.SetQueryAt(i, inner_data.second);
}
}
}
}
} else {
......@@ -407,6 +570,13 @@ void Dataset::ExtractFeaturesFromMemory() {
// if is used feature
features_[feature_idx]->PushData(tid, i, inner_data.second);
}
else {
if (inner_data.first == weight_idx_) {
metadata_.SetWeightAt(i, inner_data.second);
} else if (inner_data.first == group_idx_) {
metadata_.SetQueryAt(i, inner_data.second);
}
}
}
}
// metadata_ will manage space of init_score
......@@ -451,6 +621,13 @@ void Dataset::ExtractFeaturesFromFile() {
// if is used feature
features_[feature_idx]->PushData(tid, start_idx + i, inner_data.second);
}
else {
if (inner_data.first == weight_idx_) {
metadata_.SetWeightAt(start_idx + i, inner_data.second);
} else if (inner_data.first == group_idx_) {
metadata_.SetQueryAt(start_idx + i, inner_data.second);
}
}
}
}
};
......
......@@ -10,7 +10,7 @@ namespace LightGBM {
Metadata::Metadata()
:label_(nullptr), label_int_(nullptr), weights_(nullptr),
query_boundaries_(nullptr),
query_weights_(nullptr), init_score_(nullptr) {
query_weights_(nullptr), init_score_(nullptr), queries_(nullptr){
}
......@@ -36,12 +36,31 @@ Metadata::~Metadata() {
if (query_boundaries_ != nullptr) { delete[] query_boundaries_; }
if (query_weights_ != nullptr) { delete[] query_weights_; }
if (init_score_ != nullptr) { delete[] init_score_; }
if (queries_ != nullptr) { delete[] queries_; }
}
void Metadata::InitLabel(data_size_t num_data) {
/*!
* \brief Allocate per-record storage: label always; weights / query ids only
*        when the corresponding column exists in the data file
* \param num_data Number of records
* \param weight_idx Column index of weight in the data file, < 0 means absent
* \param query_idx Column index of query id in the data file, < 0 means absent
*/
void Metadata::Init(data_size_t num_data, int weight_idx, int query_idx) {
  num_data_ = num_data;
  // guard against a leak if Init is called more than once
  if (label_ != nullptr) { delete[] label_; }
  label_ = new float[num_data_];
  if (weight_idx >= 0) {
    if (weights_ != nullptr) {
      // per-record weights in the data file take precedence over the .weight file
      Log::Info("using weight in data file, and ignore additional weight file");
      delete[] weights_;
      weights_ = nullptr;  // avoid a dangling pointer between delete and re-assignment
    }
    weights_ = new float[num_data_];
    num_weights_ = num_data_;
    memset(weights_, 0, sizeof(float) * num_data_);
  }
  if (query_idx >= 0) {
    if (query_boundaries_ != nullptr) {
      // query ids in the data file take precedence over the .query file
      Log::Info("using query id in data file, and ignore additional query file");
      delete[] query_boundaries_;
      query_boundaries_ = nullptr;  // else the destructor could double-delete
    }
    if (query_weights_ != nullptr) {
      delete[] query_weights_;
      query_weights_ = nullptr;
    }
    queries_ = new data_size_t[num_data_];
    memset(queries_, 0, sizeof(data_size_t) * num_data_);
  }
}
void Metadata::PartitionLabel(const std::vector<data_size_t>& used_indices) {
......@@ -59,6 +78,32 @@ void Metadata::PartitionLabel(const std::vector<data_size_t>& used_indices) {
void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector<data_size_t>& used_data_indices) {
if (used_data_indices.size() == 0) {
if (queries_ != nullptr) {
// need convert query_id to boundaries
std::vector<data_size_t> tmp_buffer;
data_size_t last_qid = -1;
data_size_t cur_cnt = 0;
for (data_size_t i = 0; i < num_data_; ++i) {
if (last_qid != queries_[i]) {
if (cur_cnt > 0) {
tmp_buffer.push_back(cur_cnt);
}
cur_cnt = 0;
last_qid = queries_[i];
}
++cur_cnt;
}
tmp_buffer.push_back(cur_cnt);
query_boundaries_ = new data_size_t[tmp_buffer.size() + 1];
num_queries_ = static_cast<data_size_t>(tmp_buffer.size());
query_boundaries_[0] = 0;
for (size_t i = 0; i < tmp_buffer.size(); ++i) {
query_boundaries_[i + 1] = query_boundaries_[i] + tmp_buffer[i];
}
LoadQueryWeights();
delete[] queries_;
queries_ = nullptr;
}
// check weights
if (weights_ != nullptr && num_weights_ != num_data_) {
Log::Error("Initial weight size doesn't equal to data, weights will be ignored");
......@@ -131,10 +176,10 @@ void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector<data
used_query.push_back(qid);
data_idx += len;
} else {
Log::Fatal("Data partition error, data didn't match queies");
Log::Fatal("Data partition error, data didn't match queries");
}
} else {
Log::Fatal("Data partition error, data didn't match queies");
Log::Fatal("Data partition error, data didn't match queries");
}
}
data_size_t * old_query_boundaries = query_boundaries_;
......@@ -177,7 +222,7 @@ void Metadata::LoadWeights() {
std::string weight_filename(data_filename_);
// default weight file name
weight_filename.append(".weight");
TextReader<size_t> reader(weight_filename.c_str());
TextReader<size_t> reader(weight_filename.c_str(), false);
reader.ReadAllLines();
if (reader.Lines().size() <= 0) {
return;
......@@ -195,7 +240,7 @@ void Metadata::LoadWeights() {
void Metadata::LoadInitialScore() {
num_init_score_ = 0;
if (init_score_filename_[0] == '\0') { return; }
TextReader<size_t> reader(init_score_filename_);
TextReader<size_t> reader(init_score_filename_, false);
reader.ReadAllLines();
Log::Info("Start loading initial scores");
......@@ -213,7 +258,7 @@ void Metadata::LoadQueryBoundaries() {
std::string query_filename(data_filename_);
// default query file name
query_filename.append(".query");
TextReader<size_t> reader(query_filename.c_str());
TextReader<size_t> reader(query_filename.c_str(), false);
reader.ReadAllLines();
if (reader.Lines().size() <= 0) {
return;
......
......@@ -2,6 +2,7 @@
#include <iostream>
#include <fstream>
#include <functional>
namespace LightGBM {
......@@ -20,44 +21,65 @@ void GetStatistic(const char* str, int* comma_cnt, int* tab_cnt, int* colon_cnt)
}
}
bool CheckHasLabelForLibsvm(std::string& str) {
int GetLabelIdxForLibsvm(std::string& str, int num_features, int label_idx) {
if (num_features <= 0) {
return label_idx;
}
str = Common::Trim(str);
auto pos_space = str.find_first_of(" \f\n\r\t\v");
auto pos_colon = str.find_first_of(":");
if (pos_colon == std::string::npos || pos_colon > pos_space) {
return true;
return -1;
} else {
return false;
return label_idx;
}
}
bool CheckHasLabelForTSV(std::string& str, int num_features) {
int GetLabelIdxForTSV(std::string& str, int num_features, int label_idx) {
if (num_features <= 0) {
return label_idx;
}
str = Common::Trim(str);
auto tokens = Common::Split(str.c_str(), '\t');
if (static_cast<int>(tokens.size()) == num_features) {
return false;
return -1;
} else {
return true;
return label_idx;
}
}
bool CheckHasLabelForCSV(std::string& str, int num_features) {
int GetLabelIdxForCSV(std::string& str, int num_features, int label_idx) {
if (num_features <= 0) {
return label_idx;
}
str = Common::Trim(str);
auto tokens = Common::Split(str.c_str(), ',');
if (static_cast<int>(tokens.size()) == num_features) {
return false;
return -1;
} else {
return true;
return label_idx;
}
}
Parser* Parser::CreateParser(const char* filename, int num_features, bool* has_label) {
// File format detected from the first one or two lines of the training data;
// used by Parser::CreateParser to pick the concrete parser class.
enum DataType {
  INVALID,  // format could not be determined
  CSV,      // comma-separated columns
  TSV,      // tab-separated columns
  LIBSVM    // sparse "label index:value ..." format
};
Parser* Parser::CreateParser(const char* filename, bool has_header, int num_features, int label_idx) {
std::ifstream tmp_file;
tmp_file.open(filename);
if (!tmp_file.is_open()) {
Log::Fatal("Data file: %s doesn't exist", filename);
}
std::string line1, line2;
if (has_header) {
if (!tmp_file.eof()) {
std::getline(tmp_file, line1);
}
}
if (!tmp_file.eof()) {
std::getline(tmp_file, line1);
} else {
......@@ -75,44 +97,48 @@ Parser* Parser::CreateParser(const char* filename, int num_features, bool* has_l
// Get some statistic from 2 line
GetStatistic(line1.c_str(), &comma_cnt, &tab_cnt, &colon_cnt);
GetStatistic(line2.c_str(), &comma_cnt2, &tab_cnt2, &colon_cnt2);
Parser* ret = nullptr;
DataType type = DataType::INVALID;
if (line2.size() == 0) {
// if only have one line on file
if (colon_cnt > 0) {
ret = new LibSVMParser();
if (num_features > 0 && has_label != nullptr) {
*has_label = CheckHasLabelForLibsvm(line1);
}
type = DataType::LIBSVM;
} else if (tab_cnt > 0) {
ret = new TSVParser();
if (num_features > 0 && has_label != nullptr) {
*has_label = CheckHasLabelForTSV(line1, num_features);
}
type = DataType::TSV;
} else if (comma_cnt > 0) {
ret = new CSVParser();
if (num_features > 0 && has_label != nullptr) {
*has_label = CheckHasLabelForCSV(line1, num_features);
}
}
type = DataType::CSV;
}
} else {
if (colon_cnt > 0 || colon_cnt2 > 0) {
ret = new LibSVMParser();
if (num_features > 0 && has_label != nullptr) {
*has_label = CheckHasLabelForLibsvm(line1);
}
}
else if (tab_cnt == tab_cnt2 && tab_cnt > 0) {
ret = new TSVParser();
if (num_features > 0 && has_label != nullptr) {
*has_label = CheckHasLabelForTSV(line1, num_features);
}
type = DataType::LIBSVM;
} else if (tab_cnt == tab_cnt2 && tab_cnt > 0) {
type = DataType::TSV;
} else if (comma_cnt == comma_cnt2 && comma_cnt > 0) {
ret = new CSVParser();
if (num_features > 0 && has_label != nullptr) {
*has_label = CheckHasLabelForCSV(line1, num_features);
}
type = DataType::CSV;
}
}
if (type == DataType::INVALID) {
Log::Fatal("Unkown format of training data");
}
Parser* ret = nullptr;
if (type == DataType::LIBSVM) {
label_idx = GetLabelIdxForLibsvm(line1, num_features, label_idx);
ret = new LibSVMParser(label_idx);
}
else if (type == DataType::TSV) {
label_idx = GetLabelIdxForTSV(line1, num_features, label_idx);
ret = new TSVParser(label_idx);
}
else if (type == DataType::CSV) {
label_idx = GetLabelIdxForCSV(line1, num_features, label_idx);
ret = new CSVParser(label_idx);
}
if (label_idx < 0) {
Log::Info("Data file: %s doesn't contain label column", filename);
}
return ret;
}
......
......@@ -14,14 +14,23 @@ namespace LightGBM {
class CSVParser: public Parser {
public:
explicit CSVParser(int label_idx)
:label_idx_(label_idx) {
}
inline void ParseOneLine(const char* str,
std::vector<std::pair<int, double>>* out_features) const override {
std::vector<std::pair<int, double>>* out_features, double* out_label) const override {
int idx = 0;
double val = 0.0;
int bias = 0;
*out_label = 0.0f;
while (*str != '\0') {
str = Common::Atof(str, &val);
if (fabs(val) > 1e-10) {
out_features->emplace_back(idx, val);
if (idx == label_idx_) {
*out_label = val;
bias = -1;
}
else if (fabs(val) > 1e-10) {
out_features->emplace_back(idx + bias, val);
}
++idx;
if (*str == ',') {
......@@ -31,28 +40,27 @@ public:
}
}
}
inline void ParseOneLine(const char* str, std::vector<std::pair<int, double>>* out_features,
double* out_label) const override {
// first column is label
str = Common::Atof(str, out_label);
if (*str == ',') {
++str;
} else if (*str != '\0') {
Log::Fatal("input format error, should be CSV");
}
return ParseOneLine(str, out_features);
}
private:
int label_idx_ = 0;
};
class TSVParser: public Parser {
public:
inline void ParseOneLine(const char* str, std::vector<std::pair<int, double>>* out_features) const override {
explicit TSVParser(int label_idx)
:label_idx_(label_idx) {
}
inline void ParseOneLine(const char* str,
std::vector<std::pair<int, double>>* out_features, double* out_label) const override {
int idx = 0;
double val = 0.0;
int bias = 0;
while (*str != '\0') {
str = Common::Atof(str, &val);
if (fabs(val) > 1e-10) {
out_features->emplace_back(idx, val);
if (idx == label_idx_) {
*out_label = val;
bias = -1;
} else if (fabs(val) > 1e-10) {
out_features->emplace_back(idx + bias, val);
}
++idx;
if (*str == '\t') {
......@@ -62,24 +70,27 @@ public:
}
}
}
inline void ParseOneLine(const char* str, std::vector<std::pair<int, double>>* out_features,
double* out_label) const override {
// first column is label
str = Common::Atof(str, out_label);
if (*str == '\t') {
++str;
} else if (*str != '\0') {
Log::Fatal("input format error, should be TSV");
}
return ParseOneLine(str, out_features);
}
private:
int label_idx_ = 0;
};
class LibSVMParser: public Parser {
public:
inline void ParseOneLine(const char* str, std::vector<std::pair<int, double>>* out_features) const override {
explicit LibSVMParser(int label_idx)
:label_idx_(label_idx) {
if (label_idx > 0) {
Log::Fatal("label should be the first column in Libsvm file");
}
}
inline void ParseOneLine(const char* str,
std::vector<std::pair<int, double>>* out_features, double* out_label) const override {
int idx = 0;
double val = 0.0;
if (label_idx_ == 0) {
str = Common::Atof(str, &val);
*out_label = val;
str = Common::SkipSpaceAndTab(str);
}
while (*str != '\0') {
str = Common::Atoi(str, &idx);
str = Common::SkipSpaceAndTab(str);
......@@ -93,13 +104,9 @@ public:
str = Common::SkipSpaceAndTab(str);
}
}
inline void ParseOneLine(const char* str, std::vector<std::pair<int, double>>* out_features,
double* out_label) const override {
// first column is label
str = Common::Atof(str, out_label);
str = Common::SkipSpaceAndTab(str);
return ParseOneLine(str, out_features);
}
private:
int label_idx_ = 0;
};
} // namespace LightGBM
#endif // LightGBM_IO_PARSER_HPP_
......@@ -77,7 +77,7 @@ Linkers::~Linkers() {
}
void Linkers::ParseMachineList(const char * filename) {
TextReader<size_t> machine_list_reader(filename);
TextReader<size_t> machine_list_reader(filename, false);
machine_list_reader.ReadAllLines();
if (machine_list_reader.Lines().size() <= 0) {
Log::Fatal("Machine list file:%s doesn't exist", filename);
......
......@@ -47,7 +47,7 @@ public:
// get boundries
query_boundaries_ = metadata.query_boundaries();
if (query_boundaries_ == nullptr) {
Log::Fatal("For NDCG metric, should have query information");
Log::Fatal("For lambdarank tasks, should have query information");
}
num_queries_ = metadata.num_queries();
// cache inverse max DCG, avoid computation many times
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.