Commit 2a8d38c5 authored by Qiwei Ye's avatar Qiwei Ye
Browse files

Merge branches 'master' and 'master' of https://github.com/Microsoft/LightGBM

parents 351b3d7e ed958eb2
This source diff could not be displayed because it is too large. You can view the blob instead.
This diff is collapsed.
This diff is collapsed.
task = predict
data = multiclass.test
input_model = LightGBM_model.txt
# task type, support train and predict
task = train
# boosting type, support gbdt for now, alias: boosting, boost
boosting_type = gbdt
# application type, support following application
# regression , regression task
# binary , binary classification task
# lambdarank , lambdarank task
# multiclass
# alias: application, app
objective = multiclass
# eval metrics, support multiple metrics, delimited by ',' , support following metrics
# l1
# l2 , default metric for regression
# ndcg , default metric for lambdarank
# auc
# binary_logloss , default metric for binary
# binary_error
# multi_logloss
# multi_error
metric = multi_logloss
# number of class, for multiclass classification
num_class = 5
# frequency for metric output
metric_freq = 1
# true if need output metric for training data, alias: tranining_metric, train_metric
is_training_metric = true
# number of bins for feature bucket, 255 is a recommended setting; it can save memory and also gives good accuracy.
max_bin = 255
# training data
# if a weight file exists, it should be named "regression.train.weight"
# alias: train_data, train
data = multiclass.train
# valid data
valid_data = multiclass.test
# round for early stopping
early_stopping = 10
# number of trees(iterations), alias: num_tree, num_iteration, num_iterations, num_round, num_rounds
num_trees = 100
# shrinkage rate , alias: shrinkage_rate
learning_rate = 0.05
# number of leaves for one tree, alias: num_leaf
num_leaves = 31
...@@ -54,9 +54,6 @@ private: ...@@ -54,9 +54,6 @@ private:
/*! \brief Initializations before prediction */ /*! \brief Initializations before prediction */
void InitPredict(); void InitPredict();
/*! \brief Load model from local disk */
void LoadModel();
/*! \brief Main predicting logic */ /*! \brief Main predicting logic */
void Predict(); void Predict();
......
...@@ -13,9 +13,9 @@ namespace LightGBM { ...@@ -13,9 +13,9 @@ namespace LightGBM {
struct HistogramBinEntry { struct HistogramBinEntry {
public: public:
/*! \brief Sum of gradients on this bin */ /*! \brief Sum of gradients on this bin */
score_t sum_gradients = 0.0; double sum_gradients = 0.0;
/*! \brief Sum of hessians on this bin */ /*! \brief Sum of hessians on this bin */
score_t sum_hessians = 0.0; double sum_hessians = 0.0;
/*! \brief Number of data on this bin */ /*! \brief Number of data on this bin */
data_size_t cnt = 0; data_size_t cnt = 0;
...@@ -56,7 +56,7 @@ public: ...@@ -56,7 +56,7 @@ public:
/*! \brief True if bin is trival (contains only one bin) */ /*! \brief True if bin is trival (contains only one bin) */
inline bool is_trival() const { return is_trival_; } inline bool is_trival() const { return is_trival_; }
/*! \brief Sparsity of this bin ( num_zero_bins / num_data ) */ /*! \brief Sparsity of this bin ( num_zero_bins / num_data ) */
inline double sparse_rate() const { return sparse_rate_; } inline float sparse_rate() const { return sparse_rate_; }
/*! /*!
* \brief Save binary data to file * \brief Save binary data to file
* \param file File want to write * \param file File want to write
...@@ -67,7 +67,7 @@ public: ...@@ -67,7 +67,7 @@ public:
* \param bin * \param bin
* \return Feature value of this bin * \return Feature value of this bin
*/ */
inline double BinToValue(unsigned int bin) const { inline float BinToValue(unsigned int bin) const {
return bin_upper_bound_[bin]; return bin_upper_bound_[bin];
} }
/*! /*!
...@@ -79,14 +79,14 @@ public: ...@@ -79,14 +79,14 @@ public:
* \param value * \param value
* \return bin for this feature value * \return bin for this feature value
*/ */
inline unsigned int ValueToBin(double value) const; inline unsigned int ValueToBin(float value) const;
/*! /*!
* \brief Construct feature value to bin mapper according feature values * \brief Construct feature value to bin mapper according feature values
* \param values (Sampled) values of this feature * \param values (Sampled) values of this feature
* \param max_bin The maximal number of bin * \param max_bin The maximal number of bin
*/ */
void FindBin(std::vector<double>* values, int max_bin); void FindBin(std::vector<float>* values, int max_bin);
/*! /*!
* \brief Use specific number of bin to calculate the size of this class * \brief Use specific number of bin to calculate the size of this class
...@@ -111,11 +111,11 @@ private: ...@@ -111,11 +111,11 @@ private:
/*! \brief Number of bins */ /*! \brief Number of bins */
int num_bin_; int num_bin_;
/*! \brief Store upper bound for each bin */ /*! \brief Store upper bound for each bin */
double* bin_upper_bound_; float* bin_upper_bound_;
/*! \brief True if this feature is trival */ /*! \brief True if this feature is trival */
bool is_trival_; bool is_trival_;
/*! \brief Sparse rate of this bins( num_bin0/num_data ) */ /*! \brief Sparse rate of this bins( num_bin0/num_data ) */
double sparse_rate_; float sparse_rate_;
}; };
/*! /*!
...@@ -271,7 +271,7 @@ public: ...@@ -271,7 +271,7 @@ public:
* \return The bin data object * \return The bin data object
*/ */
static Bin* CreateBin(data_size_t num_data, int num_bin, static Bin* CreateBin(data_size_t num_data, int num_bin,
double sparse_rate, bool is_enable_sparse, bool* is_sparse, int default_bin); float sparse_rate, bool is_enable_sparse, bool* is_sparse, int default_bin);
/*! /*!
* \brief Create object for bin data of one feature, used for dense feature * \brief Create object for bin data of one feature, used for dense feature
...@@ -293,7 +293,7 @@ public: ...@@ -293,7 +293,7 @@ public:
int num_bin, int default_bin); int num_bin, int default_bin);
}; };
inline unsigned int BinMapper::ValueToBin(double value) const { inline unsigned int BinMapper::ValueToBin(float value) const {
// binary search to find bin // binary search to find bin
int l = 0; int l = 0;
int r = num_bin_ - 1; int r = num_bin_ - 1;
......
...@@ -28,12 +28,12 @@ public: ...@@ -28,12 +28,12 @@ public:
* \param train_data Training data * \param train_data Training data
* \param object_function Training objective function * \param object_function Training objective function
* \param training_metrics Training metric * \param training_metrics Training metric
* \param output_model_filename Filename of output model
*/ */
virtual void Init(const Dataset* train_data, virtual void Init(
const BoostingConfig* config,
const Dataset* train_data,
const ObjectiveFunction* object_function, const ObjectiveFunction* object_function,
const std::vector<const Metric*>& training_metrics, const std::vector<const Metric*>& training_metrics) = 0;
const char* output_model_filename) = 0;
/*! /*!
* \brief Add a validation data * \brief Add a validation data
...@@ -44,40 +44,59 @@ public: ...@@ -44,40 +44,59 @@ public:
const std::vector<const Metric*>& valid_metrics) = 0; const std::vector<const Metric*>& valid_metrics) = 0;
/*! \brief Training logic */ /*! \brief Training logic */
virtual void Train() = 0; virtual bool TrainOneIter(const score_t* gradient, const score_t* hessian, bool is_eval) = 0;
/*! \brief Get eval result */
virtual std::vector<std::string> EvalCurrent(bool is_eval_train) const = 0 ;
/*! \brief Get prediction result */
virtual const std::vector<const score_t*> PredictCurrent(bool is_predict_train) const = 0;
/*! /*!
* \brief Prediction for one record, not sigmoid transform * \brief Prediction for one record, not sigmoid transform
* \param feature_values Feature value on this record * \param feature_values Feature value on this record
* \param num_used_model Number of used model
* \return Prediction result for this record * \return Prediction result for this record
*/ */
virtual double PredictRaw(const double * feature_values) const = 0; virtual float PredictRaw(const float* feature_values,
int num_used_model) const = 0;
/*! /*!
* \brief Prediction for one record, sigmoid transformation will be used if needed * \brief Prediction for one record, sigmoid transformation will be used if needed
* \param feature_values Feature value on this record * \param feature_values Feature value on this record
* \param num_used_model Number of used model
* \return Prediction result for this record * \return Prediction result for this record
*/ */
virtual double Predict(const double * feature_values) const = 0; virtual float Predict(const float* feature_values,
int num_used_model) const = 0;
/*! /*!
* \brief Predtion for one record with leaf index * \brief Predtion for one record with leaf index
* \param feature_values Feature value on this record * \param feature_values Feature value on this record
* \param num_used_model Number of used model
* \return Predicted leaf index for this record * \return Predicted leaf index for this record
*/ */
virtual std::vector<int> PredictLeafIndex(const double * feature_values) const = 0; virtual std::vector<int> PredictLeafIndex(
const float* feature_values,
int num_used_model) const = 0;
/*!
* \brief Predtion for multiclass classification
* \param feature_values Feature value on this record
* \return Prediction result, num_class numbers per line
*/
virtual std::vector<float> PredictMulticlass(const float* value, int num_used_model) const = 0;
/*! /*!
* \brief Serialize models by string * \brief save model to file
* \return String output of tranined model
*/ */
virtual std::string ModelsToString() const = 0; virtual void SaveModelToFile(bool is_finish, const char* filename) = 0;
/*! /*!
* \brief Restore from a serialized string * \brief Restore from a serialized string
* \param model_str The string of model * \param model_str The string of model
*/ */
virtual void ModelsFromString(const std::string& model_str, int num_used_model) = 0; virtual void ModelsFromString(const std::string& model_str) = 0;
/*! /*!
* \brief Get max feature index of this model * \brief Get max feature index of this model
...@@ -85,19 +104,45 @@ public: ...@@ -85,19 +104,45 @@ public:
*/ */
virtual int MaxFeatureIdx() const = 0; virtual int MaxFeatureIdx() const = 0;
/*!
* \brief Get index of label column
* \return index of label column
*/
virtual int LabelIdx() const = 0;
/*! /*!
* \brief Get number of weak sub-models * \brief Get number of weak sub-models
* \return Number of weak sub-models * \return Number of weak sub-models
*/ */
virtual int NumberOfSubModels() const = 0; virtual int NumberOfSubModels() const = 0;
/*!
* \brief Get number of classes
* \return Number of classes
*/
virtual int NumberOfClass() const = 0;
/*!
* \brief Get Type name of this boosting object
*/
virtual const char* Name() const = 0;
/*! /*!
* \brief Create boosting object * \brief Create boosting object
* \param type Type of boosting * \param type Type of boosting
* \param config config for boosting
* \param filename name of model file, if existing will continue to train from this model
* \return The boosting object * \return The boosting object
*/ */
static Boosting* CreateBoosting(BoostingType type, static Boosting* CreateBoosting(BoostingType type, const char* filename);
const BoostingConfig* config);
/*!
* \brief Create boosting object from model file
* \param filename name of model file
* \return The boosting object
*/
static Boosting* CreateBoosting(const char* filename);
}; };
} // namespace LightGBM } // namespace LightGBM
......
...@@ -49,15 +49,15 @@ public: ...@@ -49,15 +49,15 @@ public:
const std::string& name, int* out); const std::string& name, int* out);
/*! /*!
* \brief Get double value by specific name of key * \brief Get float value by specific name of key
* \param params Store the key and value for params * \param params Store the key and value for params
* \param name Name of key * \param name Name of key
* \param out Value will assign to out if key exists * \param out Value will assign to out if key exists
* \return True if key exists * \return True if key exists
*/ */
inline bool GetDouble( inline bool GetFloat(
const std::unordered_map<std::string, std::string>& params, const std::unordered_map<std::string, std::string>& params,
const std::string& name, double* out); const std::string& name, float* out);
/*! /*!
* \brief Get bool value by specific name of key * \brief Get bool value by specific name of key
...@@ -73,7 +73,7 @@ public: ...@@ -73,7 +73,7 @@ public:
/*! \brief Types of boosting */ /*! \brief Types of boosting */
enum BoostingType { enum BoostingType {
kGBDT kGBDT, kUnknow
}; };
...@@ -94,13 +94,26 @@ public: ...@@ -94,13 +94,26 @@ public:
std::string input_model = ""; std::string input_model = "";
std::string input_init_score = ""; std::string input_init_score = "";
int verbosity = 1; int verbosity = 1;
std::string log_file = "";
int num_model_predict = -1; int num_model_predict = -1;
bool is_pre_partition = false; bool is_pre_partition = false;
bool is_enable_sparse = true; bool is_enable_sparse = true;
bool use_two_round_loading = false; bool use_two_round_loading = false;
bool is_save_binary_file = false; bool is_save_binary_file = false;
bool is_sigmoid = true; bool is_sigmoid = true;
bool has_header = false;
/*! \brief Index or column name of label, default is the first column
* And add an prefix "name:" while using column name */
std::string label_column = "";
/*! \brief Index or column name of weight, < 0 means not used
* And add an prefix "name:" while using column name */
std::string weight_column = "";
/*! \brief Index or column name of group, < 0 means not used */
std::string group_column = "";
/*! \brief ignored features, separate by ','
* e.g. name:column_name1,column_name2 */
std::string ignore_column = "";
void Set(const std::unordered_map<std::string, std::string>& params) override; void Set(const std::unordered_map<std::string, std::string>& params) override;
}; };
...@@ -108,13 +121,15 @@ public: ...@@ -108,13 +121,15 @@ public:
struct ObjectiveConfig: public ConfigBase { struct ObjectiveConfig: public ConfigBase {
public: public:
virtual ~ObjectiveConfig() {} virtual ~ObjectiveConfig() {}
double sigmoid = 1; float sigmoid = 1.0f;
// for lambdarank // for lambdarank
std::vector<double> label_gain; std::vector<float> label_gain;
// for lambdarank // for lambdarank
int max_position = 20; int max_position = 20;
// for binary // for binary
bool is_unbalance = false; bool is_unbalance = false;
// for multiclass
int num_class = 1;
void Set(const std::unordered_map<std::string, std::string>& params) override; void Set(const std::unordered_map<std::string, std::string>& params) override;
}; };
...@@ -122,11 +137,9 @@ public: ...@@ -122,11 +137,9 @@ public:
struct MetricConfig: public ConfigBase { struct MetricConfig: public ConfigBase {
public: public:
virtual ~MetricConfig() {} virtual ~MetricConfig() {}
int early_stopping_round = 0; int num_class = 1;
int output_freq = 1; float sigmoid = 1.0f;
double sigmoid = 1; std::vector<float> label_gain;
bool is_provide_training_metric = false;
std::vector<double> label_gain;
std::vector<int> eval_at; std::vector<int> eval_at;
void Set(const std::unordered_map<std::string, std::string>& params) override; void Set(const std::unordered_map<std::string, std::string>& params) override;
}; };
...@@ -136,12 +149,18 @@ public: ...@@ -136,12 +149,18 @@ public:
struct TreeConfig: public ConfigBase { struct TreeConfig: public ConfigBase {
public: public:
int min_data_in_leaf = 100; int min_data_in_leaf = 100;
double min_sum_hessian_in_leaf = 10.0f; float min_sum_hessian_in_leaf = 10.0f;
// should > 1, only one leaf means not need to learning
int num_leaves = 127; int num_leaves = 127;
int feature_fraction_seed = 2; int feature_fraction_seed = 2;
double feature_fraction = 1.0; float feature_fraction = 1.0f;
// max cache size(unit:MB) for historical histogram. < 0 means not limit // max cache size(unit:MB) for historical histogram. < 0 means not limit
double histogram_pool_size = -1; float histogram_pool_size = -1.0f;
// max depth of tree model.
// Still grow tree by leaf-wise, but limit the max depth to avoid over-fitting
// And the max leaves will be min(num_leaves, pow(2, max_depth - 1))
// max_depth < 0 means not limit
int max_depth = -1;
void Set(const std::unordered_map<std::string, std::string>& params) override; void Set(const std::unordered_map<std::string, std::string>& params) override;
}; };
...@@ -155,12 +174,15 @@ enum TreeLearnerType { ...@@ -155,12 +174,15 @@ enum TreeLearnerType {
struct BoostingConfig: public ConfigBase { struct BoostingConfig: public ConfigBase {
public: public:
virtual ~BoostingConfig() {} virtual ~BoostingConfig() {}
int output_freq = 1;
bool is_provide_training_metric = false;
int num_iterations = 10; int num_iterations = 10;
double learning_rate = 0.1; float learning_rate = 0.1f;
double bagging_fraction = 1.0; float bagging_fraction = 1.0f;
int bagging_seed = 3; int bagging_seed = 3;
int bagging_freq = 0; int bagging_freq = 0;
int early_stopping_round = 0; int early_stopping_round = 0;
int num_class = 1;
void Set(const std::unordered_map<std::string, std::string>& params) override; void Set(const std::unordered_map<std::string, std::string>& params) override;
}; };
...@@ -207,7 +229,7 @@ public: ...@@ -207,7 +229,7 @@ public:
delete boosting_config; delete boosting_config;
} }
void Set(const std::unordered_map<std::string, std::string>& params) override; void Set(const std::unordered_map<std::string, std::string>& params) override;
void LoadFromString(const char* str);
private: private:
void GetBoostingType(const std::unordered_map<std::string, std::string>& params); void GetBoostingType(const std::unordered_map<std::string, std::string>& params);
...@@ -235,17 +257,23 @@ inline bool ConfigBase::GetInt( ...@@ -235,17 +257,23 @@ inline bool ConfigBase::GetInt(
const std::unordered_map<std::string, std::string>& params, const std::unordered_map<std::string, std::string>& params,
const std::string& name, int* out) { const std::string& name, int* out) {
if (params.count(name) > 0) { if (params.count(name) > 0) {
Common::Atoi(params.at(name).c_str(), out); if (!Common::AtoiAndCheck(params.at(name).c_str(), out)) {
Log::Fatal("Parameter %s should be int type, passed is [%s]",
name.c_str(), params.at(name).c_str());
}
return true; return true;
} }
return false; return false;
} }
inline bool ConfigBase::GetDouble( inline bool ConfigBase::GetFloat(
const std::unordered_map<std::string, std::string>& params, const std::unordered_map<std::string, std::string>& params,
const std::string& name, double* out) { const std::string& name, float* out) {
if (params.count(name) > 0) { if (params.count(name) > 0) {
Common::Atof(params.at(name).c_str(), out); if (!Common::AtofAndCheck(params.at(name).c_str(), out)) {
Log::Fatal("Parameter %s should be float type, passed is [%s]",
name.c_str(), params.at(name).c_str());
}
return true; return true;
} }
return false; return false;
...@@ -257,10 +285,13 @@ inline bool ConfigBase::GetBool( ...@@ -257,10 +285,13 @@ inline bool ConfigBase::GetBool(
if (params.count(name) > 0) { if (params.count(name) > 0) {
std::string value = params.at(name); std::string value = params.at(name);
std::transform(value.begin(), value.end(), value.begin(), ::tolower); std::transform(value.begin(), value.end(), value.begin(), ::tolower);
if (value == std::string("false")) { if (value == std::string("false") || value == std::string("-")) {
*out = false; *out = false;
} else { } else if (value == std::string("true") || value == std::string("+")) {
*out = true; *out = true;
} else {
Log::Fatal("Parameter %s should be \"true\"/\"+\" or \"false\"/\"-\", passed is [%s]",
name.c_str(), params.at(name).c_str());
} }
return true; return true;
} }
...@@ -318,7 +349,15 @@ struct ParameterAlias { ...@@ -318,7 +349,15 @@ struct ParameterAlias {
{ "save_binary", "is_save_binary_file" }, { "save_binary", "is_save_binary_file" },
{ "early_stopping_rounds", "early_stopping_round"}, { "early_stopping_rounds", "early_stopping_round"},
{ "early_stopping", "early_stopping_round"}, { "early_stopping", "early_stopping_round"},
{ "verbosity", "verbose" } { "verbosity", "verbose" },
{ "header", "has_header" },
{ "label", "label_column" },
{ "weight", "weight_column" },
{ "group", "group_column" },
{ "query", "group_column" },
{ "query_column", "group_column" },
{ "ignore_feature", "ignore_column" },
{ "blacklist", "ignore_column" }
}); });
std::unordered_map<std::string, std::string> tmp_map; std::unordered_map<std::string, std::string> tmp_map;
for (const auto& pair : *params) { for (const auto& pair : *params) {
......
...@@ -5,11 +5,13 @@ ...@@ -5,11 +5,13 @@
#include <LightGBM/utils/text_reader.h> #include <LightGBM/utils/text_reader.h>
#include <LightGBM/meta.h> #include <LightGBM/meta.h>
#include <LightGBM/config.h>
#include <vector> #include <vector>
#include <utility> #include <utility>
#include <functional> #include <functional>
#include <string> #include <string>
#include <unordered_set>
namespace LightGBM { namespace LightGBM {
...@@ -56,10 +58,12 @@ public: ...@@ -56,10 +58,12 @@ public:
~Metadata(); ~Metadata();
/*! /*!
* \brief Initial work, will auto load weight, inital scores * \brief Initial work, will allocate space for label, weight(if exists) and query(if exists)
* \param num_data Number of training data * \param num_data Number of training data
* \param weight_idx Index of weight column, < 0 means doesn't exists
* \param query_idx Index of query id column, < 0 means doesn't exists
*/ */
void InitLabel(data_size_t num_data); void Init(data_size_t num_data, int weight_idx, int query_idx);
/*! /*!
* \brief Partition label by used indices * \brief Partition label by used indices
...@@ -79,7 +83,7 @@ public: ...@@ -79,7 +83,7 @@ public:
* \brief Set initial scores * \brief Set initial scores
* \param init_score Initial scores, this class will manage memory for init_score. * \param init_score Initial scores, this class will manage memory for init_score.
*/ */
void SetInitScore(score_t* init_score); void SetInitScore(const float* init_score, data_size_t len);
/*! /*!
...@@ -104,9 +108,29 @@ public: ...@@ -104,9 +108,29 @@ public:
* \param idx Index of this record * \param idx Index of this record
* \param value Label value of this record * \param value Label value of this record
*/ */
inline void SetLabelAt(data_size_t idx, double value) inline void SetLabelAt(data_size_t idx, float value)
{ {
label_[idx] = static_cast<float>(value); label_[idx] = value;
}
/*!
* \brief Set Weight for one record
* \param idx Index of this record
* \param value Weight value of this record
*/
inline void SetWeightAt(data_size_t idx, float value)
{
weights_[idx] = value;
}
/*!
* \brief Set Query Id for one record
* \param idx Index of this record
* \param value Query Id value of this record
*/
inline void SetQueryAt(data_size_t idx, float value)
{
queries_[idx] = static_cast<data_size_t>(value);
} }
/*! /*!
...@@ -142,7 +166,7 @@ public: ...@@ -142,7 +166,7 @@ public:
* \brief Get initial scores, if not exists, will return nullptr * \brief Get initial scores, if not exists, will return nullptr
* \return Pointer of initial scores * \return Pointer of initial scores
*/ */
inline const score_t* init_score() const { return init_score_; } inline const float* init_score() const { return init_score_; }
/*! \brief Load initial scores from file */ /*! \brief Load initial scores from file */
void LoadInitialScore(); void LoadInitialScore();
...@@ -177,46 +201,40 @@ private: ...@@ -177,46 +201,40 @@ private:
/*! \brief Number of Initial score, used to check correct weight file */ /*! \brief Number of Initial score, used to check correct weight file */
data_size_t num_init_score_; data_size_t num_init_score_;
/*! \brief Initial score */ /*! \brief Initial score */
score_t* init_score_; float* init_score_;
/*! \brief Queries data */
data_size_t* queries_;
}; };
/*! \brief Interface for Parser */ /*! \brief Interface for Parser */
class Parser { class Parser {
public: public:
/*! \brief virtual destructor */ /*! \brief virtual destructor */
virtual ~Parser() {} virtual ~Parser() {}
/*!
* \brief Parse one line with label
* \param str One line record, string format, should end with '\0'
* \param out_features Output features, store in (feature_idx, feature_value)
* \param out_label Output label
*/
virtual void ParseOneLine(const char* str,
std::vector<std::pair<int, double>>* out_features,
double* out_label) const = 0;
/*! /*!
* \brief Parse one line with label * \brief Parse one line with label
* \param str One line record, string format, should end with '\0' * \param str One line record, string format, should end with '\0'
* \param out_features Output features, store in (feature_idx, feature_value) * \param out_features Output columns, store in (column_idx, values)
* \param out_label Output label * \param out_label Label will store to this if exists
*/ */
virtual void ParseOneLine(const char* str, virtual void ParseOneLine(const char* str,
std::vector<std::pair<int, double>>* out_features) const = 0; std::vector<std::pair<int, float>>* out_features, float* out_label) const = 0;
/*! /*!
* \brief Create a object of parser, will auto choose the format depend on file * \brief Create a object of parser, will auto choose the format depend on file
* \param filename One Filename of data * \param filename One Filename of data
* \param num_features Pass num_features of this data file if you know, <=0 means don't know * \param num_features Pass num_features of this data file if you know, <=0 means don't know
* \param has_label output, if num_features > 0, will output this data has label or not * \param label_idx index of label column
* \return Object of parser * \return Object of parser
*/ */
static Parser* CreateParser(const char* filename, int num_features, bool* has_label); static Parser* CreateParser(const char* filename, bool has_header, int num_features, int label_idx);
}; };
using PredictFunction = using PredictFunction =
std::function<double(const std::vector<std::pair<int, double>>&)>; std::function<float(const std::vector<std::pair<int, float>>&)>;
/*! \brief The main class of data set, /*! \brief The main class of data set,
* which are used to traning or validation * which are used to traning or validation
...@@ -227,29 +245,21 @@ public: ...@@ -227,29 +245,21 @@ public:
* \brief Constructor * \brief Constructor
* \param data_filename Filename of dataset * \param data_filename Filename of dataset
* \param init_score_filename Filename of initial score * \param init_score_filename Filename of initial score
* \param is_int_label True if label is int type * \param io_config configs for IO
* \param max_bin The maximal number of bin that feature values will bucket in
* \param random_seed The seed for random generator
* \param is_enable_sparse True for sparse feature
* \param predict_fun Used for initial model, will give a prediction score based on this function, then set as initial score * \param predict_fun Used for initial model, will give a prediction score based on this function, then set as initial score
*/ */
Dataset(const char* data_filename, const char* init_score_filename, Dataset(const char* data_filename, const char* init_score_filename,
int max_bin, int random_seed, bool is_enable_sparse, const PredictFunction& predict_fun); const IOConfig& io_config, const PredictFunction& predict_fun);
/*! /*!
* \brief Constructor * \brief Constructor
* \param data_filename Filename of dataset * \param data_filename Filename of dataset
* \param is_int_label True if label is int type * \param io_config configs for IO
* \param max_bin The maximal number of bin that feature values will bucket in
* \param random_seed The seed for random generator
* \param is_enable_sparse True for sparse feature
* \param predict_fun Used for initial model, will give a prediction score based on this function, then set as initial score * \param predict_fun Used for initial model, will give a prediction score based on this function, then set as initial score
*/ */
Dataset(const char* data_filename, Dataset(const char* data_filename,
int max_bin, int random_seed, bool is_enable_sparse, const IOConfig& io_config, const PredictFunction& predict_fun)
const PredictFunction& predict_fun) : Dataset(data_filename, "", io_config, predict_fun) {
: Dataset(data_filename, "", max_bin, random_seed,
is_enable_sparse, predict_fun) {
} }
/*! \brief Destructor */ /*! \brief Destructor */
...@@ -304,6 +314,12 @@ public: ...@@ -304,6 +314,12 @@ public:
/*! \brief Get Number of total features */ /*! \brief Get Number of total features */
inline int num_total_features() const { return num_total_features_; } inline int num_total_features() const { return num_total_features_; }
/*! \brief Get the index of label column */
inline int label_idx() const { return label_idx_; }
/*! \brief Get names of current data set */
inline std::vector<std::string> feature_names() const { return feature_names_; }
/*! \brief Get Number of data */ /*! \brief Get Number of data */
inline data_size_t num_data() const { return num_data_; } inline data_size_t num_data() const { return num_data_; }
...@@ -394,10 +410,20 @@ private: ...@@ -394,10 +410,20 @@ private:
bool is_loading_from_binfile_; bool is_loading_from_binfile_;
/*! \brief Number of global data, used for distributed learning */ /*! \brief Number of global data, used for distributed learning */
size_t global_num_data_ = 0; size_t global_num_data_ = 0;
// used to local used data indices /*! \brief used to local used data indices */
std::vector<data_size_t> used_data_indices_; std::vector<data_size_t> used_data_indices_;
// prediction function for initial model /*! \brief prediction function for initial model */
const PredictFunction& predict_fun_; const PredictFunction& predict_fun_;
/*! \brief index of label column */
int label_idx_ = 0;
/*! \brief index of weight column */
int weight_idx_ = -1;
/*! \brief index of group column */
int group_idx_ = -1;
/*! \brief Mapper from real feature index to used index*/
std::unordered_set<int> ignore_features_;
/*! \brief store feature names */
std::vector<std::string> feature_names_;
}; };
} // namespace LightGBM } // namespace LightGBM
......
...@@ -71,7 +71,7 @@ public: ...@@ -71,7 +71,7 @@ public:
* \param idx Index of record * \param idx Index of record
* \param value feature value of record * \param value feature value of record
*/ */
inline void PushData(int tid, data_size_t line_idx, double value) { inline void PushData(int tid, data_size_t line_idx, float value) {
unsigned int bin = bin_mapper_->ValueToBin(value); unsigned int bin = bin_mapper_->ValueToBin(value);
bin_data_->Push(tid, line_idx, bin); bin_data_->Push(tid, line_idx, bin);
} }
...@@ -89,7 +89,7 @@ public: ...@@ -89,7 +89,7 @@ public:
* \param bin * \param bin
* \return Feature value of this bin * \return Feature value of this bin
*/ */
inline double BinToValue(unsigned int bin) inline float BinToValue(unsigned int bin)
const { return bin_mapper_->BinToValue(bin); } const { return bin_mapper_->BinToValue(bin); }
/*! /*!
......
...@@ -12,7 +12,7 @@ namespace LightGBM { ...@@ -12,7 +12,7 @@ namespace LightGBM {
/*! \brief Type of data size, it is better to use signed type*/ /*! \brief Type of data size, it is better to use signed type*/
typedef int32_t data_size_t; typedef int32_t data_size_t;
/*! \brief Type of score, and gradients */ /*! \brief Type of score, and gradients */
typedef double score_t; typedef float score_t;
const score_t kMinScore = -std::numeric_limits<score_t>::infinity(); const score_t kMinScore = -std::numeric_limits<score_t>::infinity();
......
...@@ -11,7 +11,7 @@ namespace LightGBM { ...@@ -11,7 +11,7 @@ namespace LightGBM {
/*! /*!
* \brief The interface of metric. * \brief The interface of metric.
* Metric is used to calculate and output metric result on training / validation data. * Metric is used to calculate metric result
*/ */
class Metric { class Metric {
public: public:
...@@ -27,12 +27,14 @@ public: ...@@ -27,12 +27,14 @@ public:
virtual void Init(const char* test_name, virtual void Init(const char* test_name,
const Metadata& metadata, data_size_t num_data) = 0; const Metadata& metadata, data_size_t num_data) = 0;
virtual const char* GetName() const = 0;
virtual bool is_bigger_better() const = 0;
/*! /*!
* \brief Calcaluting and printing metric result * \brief Calcaluting and printing metric result
* \param iter Current iteration
* \param score Current prediction score * \param score Current prediction score
*/ */
virtual score_t PrintAndGetLoss(int iter, const score_t* score) const = 0; virtual std::vector<float> Eval(const score_t* score) const = 0;
/*! /*!
* \brief Create object of metrics * \brief Create object of metrics
...@@ -41,8 +43,6 @@ public: ...@@ -41,8 +43,6 @@ public:
*/ */
static Metric* CreateMetric(const std::string& type, const MetricConfig& config); static Metric* CreateMetric(const std::string& type, const MetricConfig& config);
bool the_bigger_the_better = false;
int early_stopping_round_ = 0;
}; };
/*! /*!
...@@ -54,7 +54,7 @@ public: ...@@ -54,7 +54,7 @@ public:
* \brief Initial logic * \brief Initial logic
* \param label_gain Gain for labels, default is 2^i - 1 * \param label_gain Gain for labels, default is 2^i - 1
*/ */
static void Init(std::vector<double> label_gain); static void Init(std::vector<float> label_gain);
/*! /*!
* \brief Calculate the DCG score at position k * \brief Calculate the DCG score at position k
...@@ -64,7 +64,7 @@ public: ...@@ -64,7 +64,7 @@ public:
* \param num_data Number of data * \param num_data Number of data
* \return The DCG score * \return The DCG score
*/ */
static double CalDCGAtK(data_size_t k, const float* label, static float CalDCGAtK(data_size_t k, const float* label,
const score_t* score, data_size_t num_data); const score_t* score, data_size_t num_data);
/*! /*!
...@@ -77,7 +77,7 @@ public: ...@@ -77,7 +77,7 @@ public:
*/ */
static void CalDCG(const std::vector<data_size_t>& ks, static void CalDCG(const std::vector<data_size_t>& ks,
const float* label, const score_t* score, const float* label, const score_t* score,
data_size_t num_data, std::vector<double>* out); data_size_t num_data, std::vector<float>* out);
/*! /*!
* \brief Calculate the Max DCG score at position k * \brief Calculate the Max DCG score at position k
...@@ -86,7 +86,7 @@ public: ...@@ -86,7 +86,7 @@ public:
* \param num_data Number of data * \param num_data Number of data
* \return The max DCG score * \return The max DCG score
*/ */
static double CalMaxDCGAtK(data_size_t k, static float CalMaxDCGAtK(data_size_t k,
const float* label, data_size_t num_data); const float* label, data_size_t num_data);
/*! /*!
...@@ -97,22 +97,22 @@ public: ...@@ -97,22 +97,22 @@ public:
* \param out Output result * \param out Output result
*/ */
static void CalMaxDCG(const std::vector<data_size_t>& ks, static void CalMaxDCG(const std::vector<data_size_t>& ks,
const float* label, data_size_t num_data, std::vector<double>* out); const float* label, data_size_t num_data, std::vector<float>* out);
/*! /*!
* \brief Get discount score of position k * \brief Get discount score of position k
* \param k The position * \param k The position
* \return The discount of this position * \return The discount of this position
*/ */
inline static double GetDiscount(data_size_t k) { return discount_[k]; } inline static float GetDiscount(data_size_t k) { return discount_[k]; }
private: private:
/*! \brief True if inited, avoid init multi times */ /*! \brief True if inited, avoid init multi times */
static bool is_inited_; static bool is_inited_;
/*! \brief store gains for different label */ /*! \brief store gains for different label */
static std::vector<double> label_gain_; static std::vector<float> label_gain_;
/*! \brief store discount score for different position */ /*! \brief store discount score for different position */
static std::vector<double> discount_; static std::vector<float> discount_;
/*! \brief max position for eval */ /*! \brief max position for eval */
static const data_size_t kMaxPosition; static const data_size_t kMaxPosition;
}; };
......
...@@ -36,7 +36,7 @@ public: ...@@ -36,7 +36,7 @@ public:
* This function is used for prediction task, if has sigmoid param, the prediction value will be transform by sigmoid function. * This function is used for prediction task, if has sigmoid param, the prediction value will be transform by sigmoid function.
* \return Sigmoid param, if <=0.0 means don't use sigmoid transform on this objective. * \return Sigmoid param, if <=0.0 means don't use sigmoid transform on this objective.
*/ */
virtual double GetSigmoid() const = 0; virtual float GetSigmoid() const = 0;
/*! /*!
* \brief Create object of objective function * \brief Create object of objective function
......
...@@ -36,18 +36,18 @@ public: ...@@ -36,18 +36,18 @@ public:
* \param feature Index of feature; the converted index after removing useless features * \param feature Index of feature; the converted index after removing useless features
* \param threshold Threshold(bin) of split * \param threshold Threshold(bin) of split
* \param real_feature Index of feature, the original index on data * \param real_feature Index of feature, the original index on data
* \param threshold_double Threshold on feature value * \param threshold_float Threshold on feature value
* \param left_value Model Left child output * \param left_value Model Left child output
* \param right_value Model Right child output * \param right_value Model Right child output
* \param gain Split gain * \param gain Split gain
* \return The index of new leaf. * \return The index of new leaf.
*/ */
int Split(int leaf, int feature, unsigned int threshold, int real_feature, int Split(int leaf, int feature, unsigned int threshold, int real_feature,
double threshold_double, score_t left_value, float threshold_float, float left_value,
score_t right_value, double gain); float right_value, float gain);
/*! \brief Get the output of one leave */ /*! \brief Get the output of one leave */
inline score_t LeafOutput(int leaf) const { return leaf_value_[leaf]; } inline float LeafOutput(int leaf) const { return leaf_value_[leaf]; }
/*! /*!
* \brief Adding prediction value of this tree model to scores * \brief Adding prediction value of this tree model to scores
...@@ -74,20 +74,26 @@ public: ...@@ -74,20 +74,26 @@ public:
* \param feature_values Feature value of this record * \param feature_values Feature value of this record
* \return Prediction result * \return Prediction result
*/ */
inline score_t Predict(const double* feature_values) const; inline float Predict(const float* feature_values) const;
inline int PredictLeafIndex(const double* feature_values) const; inline int PredictLeafIndex(const float* feature_values) const;
/*! \brief Get Number of leaves*/ /*! \brief Get Number of leaves*/
inline int num_leaves() const { return num_leaves_; } inline int num_leaves() const { return num_leaves_; }
/*! \brief Get depth of specific leaf*/
inline int leaf_depth(int leaf_idx) const { return leaf_depth_[leaf_idx]; }
/*! \brief Get feature of specific split*/
inline int split_feature_real(int split_idx) const { return split_feature_real_[split_idx]; }
/*! /*!
* \brief Shrinkage for the tree's output * \brief Shrinkage for the tree's output
* shrinkage rate (a.k.a learning rate) is used to tune the traning process * shrinkage rate (a.k.a learning rate) is used to tune the traning process
* \param rate The factor of shrinkage * \param rate The factor of shrinkage
*/ */
inline void Shrinkage(double rate) { inline void Shrinkage(float rate) {
for (int i = 0; i < num_leaves_; ++i) { for (int i = 0; i < num_leaves_; ++i) {
leaf_value_[i] = static_cast<score_t>(leaf_value_[i] * rate); leaf_value_[i] = leaf_value_[i] * rate;
} }
} }
...@@ -113,7 +119,7 @@ private: ...@@ -113,7 +119,7 @@ private:
* \param feature_values Feature value of this record * \param feature_values Feature value of this record
* \return Leaf index * \return Leaf index
*/ */
inline int GetLeaf(const double* feature_values) const; inline int GetLeaf(const float* feature_values) const;
/*! \brief Number of max leaves*/ /*! \brief Number of max leaves*/
int max_leaves_; int max_leaves_;
...@@ -131,23 +137,25 @@ private: ...@@ -131,23 +137,25 @@ private:
/*! \brief A non-leaf node's split threshold in bin */ /*! \brief A non-leaf node's split threshold in bin */
unsigned int* threshold_in_bin_; unsigned int* threshold_in_bin_;
/*! \brief A non-leaf node's split threshold in feature value */ /*! \brief A non-leaf node's split threshold in feature value */
double* threshold_; float* threshold_;
/*! \brief A non-leaf node's split gain */ /*! \brief A non-leaf node's split gain */
double* split_gain_; float* split_gain_;
// used for leaf node // used for leaf node
/*! \brief The parent of leaf */ /*! \brief The parent of leaf */
int* leaf_parent_; int* leaf_parent_;
/*! \brief Output of leaves */ /*! \brief Output of leaves */
score_t* leaf_value_; float* leaf_value_;
/*! \brief Depth for leaves */
int* leaf_depth_;
}; };
inline score_t Tree::Predict(const double* feature_values) const { inline float Tree::Predict(const float* feature_values) const {
int leaf = GetLeaf(feature_values); int leaf = GetLeaf(feature_values);
return LeafOutput(leaf); return LeafOutput(leaf);
} }
inline int Tree::PredictLeafIndex(const double* feature_values) const { inline int Tree::PredictLeafIndex(const float* feature_values) const {
int leaf = GetLeaf(feature_values); int leaf = GetLeaf(feature_values);
return leaf; return leaf;
} }
...@@ -166,7 +174,7 @@ inline int Tree::GetLeaf(const std::vector<BinIterator*>& iterators, ...@@ -166,7 +174,7 @@ inline int Tree::GetLeaf(const std::vector<BinIterator*>& iterators,
return ~node; return ~node;
} }
inline int Tree::GetLeaf(const double* feature_values) const { inline int Tree::GetLeaf(const float* feature_values) const {
int node = 0; int node = 0;
while (node >= 0) { while (node >= 0) {
if (feature_values[split_feature_real_[node]] <= threshold_[node]) { if (feature_values[split_feature_real_[node]] <= threshold_[node]) {
......
...@@ -43,14 +43,39 @@ inline static std::string& RemoveQuotationSymbol(std::string& str) { ...@@ -43,14 +43,39 @@ inline static std::string& RemoveQuotationSymbol(std::string& str) {
str.erase(0, str.find_first_not_of("'\"")); str.erase(0, str.find_first_not_of("'\""));
return str; return str;
} }
inline static bool StartsWith(const std::string& str, const std::string prefix) {
if (str.substr(0, prefix.size()) == prefix) {
return true;
} else {
return false;
}
}
inline static std::vector<std::string> Split(const char* c_str, char delimiter) {
std::vector<std::string> ret;
std::string str(c_str);
size_t i = 0;
size_t pos = str.find(delimiter);
while (pos != std::string::npos) {
ret.push_back(str.substr(i, pos - i));
i = ++pos;
pos = str.find(delimiter, pos);
}
ret.push_back(str.substr(i));
return ret;
}
inline static std::vector<std::string> Split(const char* str, char delimiter) { inline static std::vector<std::string> Split(const char* c_str, const char* delimiters) {
std::stringstream ss(str); // will split when met any chars in delimiters
std::string tmp_str;
std::vector<std::string> ret; std::vector<std::string> ret;
while (std::getline(ss, tmp_str, delimiter)) { std::string str(c_str);
ret.push_back(tmp_str); size_t i = 0;
size_t pos = str.find_first_of(delimiters);
while (pos != std::string::npos) {
ret.push_back(str.substr(i, pos - i));
i = ++pos;
pos = str.find_first_of(delimiters, pos);
} }
ret.push_back(str.substr(i));
return ret; return ret;
} }
...@@ -78,9 +103,9 @@ inline static const char* Atoi(const char* p, int* out) { ...@@ -78,9 +103,9 @@ inline static const char* Atoi(const char* p, int* out) {
} }
//ref to http://www.leapsecond.com/tools/fast_atof.c //ref to http://www.leapsecond.com/tools/fast_atof.c
inline static const char* Atof(const char* p, double* out) { inline static const char* Atof(const char* p, float* out) {
int frac; int frac;
double sign, value, scale; float sign, value, scale;
*out = 0; *out = 0;
// Skip leading white space, if any. // Skip leading white space, if any.
while (*p == ' ') { while (*p == ' ') {
...@@ -88,9 +113,9 @@ inline static const char* Atof(const char* p, double* out) { ...@@ -88,9 +113,9 @@ inline static const char* Atof(const char* p, double* out) {
} }
// Get sign, if any. // Get sign, if any.
sign = 1.0; sign = 1.0f;
if (*p == '-') { if (*p == '-') {
sign = -1.0; sign = -1.0f;
++p; ++p;
} }
else if (*p == '+') { else if (*p == '+') {
...@@ -100,24 +125,24 @@ inline static const char* Atof(const char* p, double* out) { ...@@ -100,24 +125,24 @@ inline static const char* Atof(const char* p, double* out) {
// is a number // is a number
if ((*p >= '0' && *p <= '9') || *p == '.' || *p == 'e' || *p == 'E') { if ((*p >= '0' && *p <= '9') || *p == '.' || *p == 'e' || *p == 'E') {
// Get digits before decimal point or exponent, if any. // Get digits before decimal point or exponent, if any.
for (value = 0.0; *p >= '0' && *p <= '9'; ++p) { for (value = 0.0f; *p >= '0' && *p <= '9'; ++p) {
value = value * 10.0 + (*p - '0'); value = value * 10.0f + (*p - '0');
} }
// Get digits after decimal point, if any. // Get digits after decimal point, if any.
if (*p == '.') { if (*p == '.') {
double pow10 = 10.0; float pow10 = 10.0f;
++p; ++p;
while (*p >= '0' && *p <= '9') { while (*p >= '0' && *p <= '9') {
value += (*p - '0') / pow10; value += (*p - '0') / pow10;
pow10 *= 10.0; pow10 *= 10.0f;
++p; ++p;
} }
} }
// Handle exponent, if any. // Handle exponent, if any.
frac = 0; frac = 0;
scale = 1.0; scale = 1.0f;
if ((*p == 'e') || (*p == 'E')) { if ((*p == 'e') || (*p == 'E')) {
unsigned int expon; unsigned int expon;
// Get sign of exponent, if any. // Get sign of exponent, if any.
...@@ -132,11 +157,9 @@ inline static const char* Atof(const char* p, double* out) { ...@@ -132,11 +157,9 @@ inline static const char* Atof(const char* p, double* out) {
for (expon = 0; *p >= '0' && *p <= '9'; ++p) { for (expon = 0; *p >= '0' && *p <= '9'; ++p) {
expon = expon * 10 + (*p - '0'); expon = expon * 10 + (*p - '0');
} }
if (expon > 308) expon = 308; if (expon > 38) expon = 38;
// Calculate scaling factor.
while (expon >= 50) { scale *= 1E50; expon -= 50; }
while (expon >= 8) { scale *= 1E8; expon -= 8; } while (expon >= 8) { scale *= 1E8; expon -= 8; }
while (expon > 0) { scale *= 10.0; expon -= 1; } while (expon > 0) { scale *= 10.0f; expon -= 1; }
} }
// Return signed and scaled floating point result. // Return signed and scaled floating point result.
*out = sign * (frac ? (value / scale) : (value * scale)); *out = sign * (frac ? (value / scale) : (value * scale));
...@@ -152,9 +175,9 @@ inline static const char* Atof(const char* p, double* out) { ...@@ -152,9 +175,9 @@ inline static const char* Atof(const char* p, double* out) {
std::string tmp_str(p, cnt); std::string tmp_str(p, cnt);
std::transform(tmp_str.begin(), tmp_str.end(), tmp_str.begin(), ::tolower); std::transform(tmp_str.begin(), tmp_str.end(), tmp_str.begin(), ::tolower);
if (tmp_str == std::string("na") || tmp_str == std::string("nan")) { if (tmp_str == std::string("na") || tmp_str == std::string("nan")) {
*out = 0; *out = 0.0f;
} else if( tmp_str == std::string("inf") || tmp_str == std::string("infinity")) { } else if( tmp_str == std::string("inf") || tmp_str == std::string("infinity")) {
*out = sign * 1e308; *out = sign * static_cast<float>(1e38);
} }
else { else {
Log::Fatal("Unknow token %s in data file", tmp_str.c_str()); Log::Fatal("Unknow token %s in data file", tmp_str.c_str());
...@@ -170,6 +193,22 @@ inline static const char* Atof(const char* p, double* out) { ...@@ -170,6 +193,22 @@ inline static const char* Atof(const char* p, double* out) {
return p; return p;
} }
inline bool AtoiAndCheck(const char* p, int* out) {
const char* after = Atoi(p, out);
if (*after != '\0') {
return false;
}
return true;
}
inline bool AtofAndCheck(const char* p, float* out) {
const char* after = Atof(p, out);
if (*after != '\0') {
return false;
}
return true;
}
inline static const char* SkipSpaceAndTab(const char* p) { inline static const char* SkipSpaceAndTab(const char* p) {
while (*p == ' ' || *p == '\t') { while (*p == ' ' || *p == '\t') {
++p; ++p;
...@@ -189,56 +228,57 @@ inline static std::string ArrayToString(const T* arr, int n, char delimiter) { ...@@ -189,56 +228,57 @@ inline static std::string ArrayToString(const T* arr, int n, char delimiter) {
if (n <= 0) { if (n <= 0) {
return std::string(""); return std::string("");
} }
std::stringstream ss; std::stringstream str_buf;
ss << arr[0]; str_buf << arr[0];
for (int i = 1; i < n; ++i) { for (int i = 1; i < n; ++i) {
ss << delimiter; str_buf << delimiter;
ss << arr[i]; str_buf << arr[i];
} }
return ss.str(); return str_buf.str();
} }
inline static void StringToIntArray(const std::string& str, char delimiter, size_t n, int* out) { template<typename T>
std::vector<std::string> strs = Split(str.c_str(), delimiter); inline static std::string ArrayToString(std::vector<T> arr, char delimiter) {
if (strs.size() != n) { if (arr.size() <= 0) {
Log::Fatal("StringToIntArray error, size doesn't matched."); return std::string("");
} }
for (size_t i = 0; i < strs.size(); ++i) { std::stringstream str_buf;
strs[i] = Trim(strs[i]); str_buf << arr[0];
Atoi(strs[i].c_str(), &out[i]); for (size_t i = 1; i < arr.size(); ++i) {
str_buf << delimiter;
str_buf << arr[i];
} }
return str_buf.str();
} }
inline static void StringToDoubleArray(const std::string& str, char delimiter, size_t n, double* out) { inline static void StringToIntArray(const std::string& str, char delimiter, size_t n, int* out) {
std::vector<std::string> strs = Split(str.c_str(), delimiter); std::vector<std::string> strs = Split(str.c_str(), delimiter);
if (strs.size() != n) { if (strs.size() != n) {
Log::Fatal("StringToDoubleArray error, size doesn't matched."); Log::Fatal("StringToIntArray error, size doesn't matched.");
} }
for (size_t i = 0; i < strs.size(); ++i) { for (size_t i = 0; i < strs.size(); ++i) {
strs[i] = Trim(strs[i]); strs[i] = Trim(strs[i]);
Atof(strs[i].c_str(), &out[i]); Atoi(strs[i].c_str(), &out[i]);
} }
} }
inline static void StringToDoubleArray(const std::string& str, char delimiter, size_t n, float* out) { inline static void StringToFloatArray(const std::string& str, char delimiter, size_t n, float* out) {
std::vector<std::string> strs = Split(str.c_str(), delimiter); std::vector<std::string> strs = Split(str.c_str(), delimiter);
if (strs.size() != n) { if (strs.size() != n) {
Log::Fatal("StringToDoubleArray error, size doesn't matched."); Log::Fatal("StringToFloatArray error, size doesn't matched.");
} }
double tmp;
for (size_t i = 0; i < strs.size(); ++i) { for (size_t i = 0; i < strs.size(); ++i) {
strs[i] = Trim(strs[i]); strs[i] = Trim(strs[i]);
Atof(strs[i].c_str(), &tmp); Atof(strs[i].c_str(), &out[i]);
out[i] = static_cast<float>(tmp);
} }
} }
inline static std::vector<double> StringToDoubleArray(const std::string& str, char delimiter) { inline static std::vector<float> StringToFloatArray(const std::string& str, char delimiter) {
std::vector<std::string> strs = Split(str.c_str(), delimiter); std::vector<std::string> strs = Split(str.c_str(), delimiter);
std::vector<double> ret; std::vector<float> ret;
for (size_t i = 0; i < strs.size(); ++i) { for (size_t i = 0; i < strs.size(); ++i) {
strs[i] = Trim(strs[i]); strs[i] = Trim(strs[i]);
double val = 0.0; float val = 0.0f;
Atof(strs[i].c_str(), &val); Atof(strs[i].c_str(), &val);
ret.push_back(val); ret.push_back(val);
} }
...@@ -296,6 +336,26 @@ static inline int64_t Pow2RoundUp(int64_t x) { ...@@ -296,6 +336,26 @@ static inline int64_t Pow2RoundUp(int64_t x) {
return 0; return 0;
} }
/*!
* \brief Do inplace softmax transformaton on p_rec
* \param p_rec The input/output vector of the values.
*/
inline void Softmax(std::vector<float>* p_rec) {
std::vector<float> &rec = *p_rec;
float wmax = rec[0];
for (size_t i = 1; i < rec.size(); ++i) {
wmax = std::max(rec[i], wmax);
}
float wsum = 0.0f;
for (size_t i = 0; i < rec.size(); ++i) {
rec[i] = std::exp(rec[i] - wmax);
wsum += rec[i];
}
for (size_t i = 0; i < rec.size(); ++i) {
rec[i] /= static_cast<float>(wsum);
}
}
} // namespace Common } // namespace Common
} // namespace LightGBM } // namespace LightGBM
......
...@@ -85,11 +85,8 @@ private: ...@@ -85,11 +85,8 @@ private:
// a trick to use static variable in header file. // a trick to use static variable in header file.
// May be not good, but avoid to use an additional cpp file // May be not good, but avoid to use an additional cpp file
static LogLevel& GetLevel() { static LogLevel& GetLevel() { static LogLevel level; return level; }
static LogLevel level;
return level;
};
}; };
} // namespace LightGBM } // namespace LightGBM
......
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
#include <LightGBM/utils/log.h> #include <LightGBM/utils/log.h>
#include <cstring> #include <cstring>
#include <functional>
namespace LightGBM { namespace LightGBM {
...@@ -38,40 +39,41 @@ public: ...@@ -38,40 +39,41 @@ public:
cache_size_ = cache_size; cache_size_ = cache_size;
// at least need 2 bucket to store smaller leaf and larger leaf // at least need 2 bucket to store smaller leaf and larger leaf
CHECK(cache_size_ >= 2); CHECK(cache_size_ >= 2);
total_size_ = total_size; total_size_ = total_size;
if (cache_size_ > total_size_) {
pool_ = new T[cache_size]; cache_size_ = total_size_;
mapper_ = new int[total_size_]; }
inverse_mapper_ = new int[cache_size_]; is_enough_ = (cache_size_ == total_size_);
last_used_time_ = new int[cache_size_]; pool_ = new T[cache_size_];
ResetMap(); if (!is_enough_) {
mapper_ = new int[total_size_];
inverse_mapper_ = new int[cache_size_];
last_used_time_ = new int[cache_size_];
ResetMap();
}
} }
/*!
* \brief Return true if this pool is enough to store all data
*/
bool IsEnough() {
return cache_size_ == total_size_;
}
/*! /*!
* \brief Reset mapper * \brief Reset mapper
*/ */
void ResetMap() { void ResetMap() {
cur_time_ = 0; if (!is_enough_) {
memset(mapper_, -1, sizeof(int)*total_size_); cur_time_ = 0;
memset(inverse_mapper_, -1, sizeof(int)*cache_size_); memset(mapper_, -1, sizeof(int)*total_size_);
memset(last_used_time_, 0, sizeof(int)*cache_size_); memset(inverse_mapper_, -1, sizeof(int)*cache_size_);
memset(last_used_time_, 0, sizeof(int)*cache_size_);
}
} }
/*! /*!
* \brief Set data for the pool for specific index * \brief Fill the pool
* \param idx which index want to set to * \param obj_create_fun that used to generate object
* \param data
*/ */
void Set(int idx, const T& data) { void Fill(std::function<T()> obj_create_fun) {
pool_[idx] = data; for (int i = 0; i < cache_size_; ++i) {
pool_[i] = obj_create_fun();
}
} }
/*! /*!
...@@ -81,7 +83,11 @@ public: ...@@ -81,7 +83,11 @@ public:
* \return True if this index is in the pool, False if this index is not in the pool * \return True if this index is in the pool, False if this index is not in the pool
*/ */
bool Get(int idx, T* out) { bool Get(int idx, T* out) {
if (mapper_[idx] >= 0) { if (is_enough_) {
*out = pool_[idx];
return true;
}
else if (mapper_[idx] >= 0) {
int slot = mapper_[idx]; int slot = mapper_[idx];
*out = pool_[slot]; *out = pool_[slot];
last_used_time_[slot] = ++cur_time_; last_used_time_[slot] = ++cur_time_;
...@@ -108,6 +114,10 @@ public: ...@@ -108,6 +114,10 @@ public:
* \param dst_idx * \param dst_idx
*/ */
void Move(int src_idx, int dst_idx) { void Move(int src_idx, int dst_idx) {
if (is_enough_) {
std::swap(pool_[src_idx], pool_[dst_idx]);
return;
}
if (mapper_[src_idx] < 0) { if (mapper_[src_idx] < 0) {
return; return;
} }
...@@ -122,6 +132,7 @@ public: ...@@ -122,6 +132,7 @@ public:
inverse_mapper_[slot] = dst_idx; inverse_mapper_[slot] = dst_idx;
} }
private: private:
void FreeAll(){ void FreeAll(){
if (pool_ != nullptr) { if (pool_ != nullptr) {
delete[] pool_; delete[] pool_;
...@@ -139,6 +150,7 @@ private: ...@@ -139,6 +150,7 @@ private:
T* pool_ = nullptr; T* pool_ = nullptr;
int cache_size_; int cache_size_;
int total_size_; int total_size_;
bool is_enough_ = false;
int* mapper_ = nullptr; int* mapper_ = nullptr;
int* inverse_mapper_ = nullptr; int* inverse_mapper_ = nullptr;
int* last_used_time_ = nullptr; int* last_used_time_ = nullptr;
......
...@@ -21,7 +21,7 @@ public: ...@@ -21,7 +21,7 @@ public:
* \param filename Filename of data * \param filename Filename of data
* \process_fun Process function * \process_fun Process function
*/ */
static size_t Read(const char* filename, const std::function<size_t (const char*, size_t)>& process_fun) { static size_t Read(const char* filename, int skip_bytes, const std::function<size_t (const char*, size_t)>& process_fun) {
FILE* file; FILE* file;
#ifdef _MSC_VER #ifdef _MSC_VER
...@@ -38,8 +38,13 @@ public: ...@@ -38,8 +38,13 @@ public:
char* buffer_process = new char[buffer_size]; char* buffer_process = new char[buffer_size];
// buffer used for the file reading // buffer used for the file reading
char* buffer_read = new char[buffer_size]; char* buffer_read = new char[buffer_size];
size_t read_cnt = 0;
if (skip_bytes > 0) {
// skip first k bytes
read_cnt = fread(buffer_process, 1, skip_bytes, file);
}
// read first block // read first block
size_t read_cnt = fread(buffer_process, 1, buffer_size, file); read_cnt = fread(buffer_process, 1, buffer_size, file);
size_t last_read_cnt = 0; size_t last_read_cnt = 0;
while (read_cnt > 0) { while (read_cnt > 0) {
// strat read thread // strat read thread
......
...@@ -6,6 +6,7 @@ ...@@ -6,6 +6,7 @@
#include <LightGBM/utils/random.h> #include <LightGBM/utils/random.h>
#include <cstdio> #include <cstdio>
#include <sstream>
#include <vector> #include <vector>
#include <string> #include <string>
...@@ -22,9 +23,41 @@ public: ...@@ -22,9 +23,41 @@ public:
/*! /*!
* \brief Constructor * \brief Constructor
* \param filename Filename of data * \param filename Filename of data
* \param is_skip_first_line True if need to skip header
*/ */
TextReader(const char* filename): TextReader(const char* filename, bool is_skip_first_line):
filename_(filename){ filename_(filename), is_skip_first_line_(is_skip_first_line){
if (is_skip_first_line_) {
FILE* file;
#ifdef _MSC_VER
fopen_s(&file, filename, "r");
#else
file = fopen(filename, "r");
#endif
std::stringstream str_buf;
int read_c = -1;
read_c = fgetc(file);
while (read_c != EOF) {
char tmp_ch = static_cast<char>(read_c);
if (tmp_ch == '\n' || tmp_ch == '\r') {
break;
}
str_buf << tmp_ch;
++skip_bytes_;
read_c = fgetc(file);
}
if (static_cast<char>(read_c) == '\r') {
read_c = fgetc(file);
++skip_bytes_;
}
if (static_cast<char>(read_c) == '\n') {
read_c = fgetc(file);
++skip_bytes_;
}
fclose(file);
first_line_ = str_buf.str();
Log::Debug("skip header:\"%s\" in file %s", first_line_.c_str(), filename_);
}
} }
/*! /*!
* \brief Destructor * \brief Destructor
...@@ -40,6 +73,12 @@ public: ...@@ -40,6 +73,12 @@ public:
lines_.shrink_to_fit(); lines_.shrink_to_fit();
} }
/*! /*!
* \brief return first line of data
*/
inline std::string first_line() {
return first_line_;
}
/*!
* \brief Get text data that read from file * \brief Get text data that read from file
* \return Text data, store in std::vector by line * \return Text data, store in std::vector by line
*/ */
...@@ -48,7 +87,7 @@ public: ...@@ -48,7 +87,7 @@ public:
INDEX_T ReadAllAndProcess(const std::function<void(INDEX_T, const char*, size_t)>& process_fun) { INDEX_T ReadAllAndProcess(const std::function<void(INDEX_T, const char*, size_t)>& process_fun) {
last_line_ = ""; last_line_ = "";
INDEX_T total_cnt = 0; INDEX_T total_cnt = 0;
PipelineReader::Read(filename_, PipelineReader::Read(filename_, skip_bytes_,
[this, &total_cnt, &process_fun] [this, &total_cnt, &process_fun]
(const char* buffer_process, size_t read_cnt) { (const char* buffer_process, size_t read_cnt) {
size_t cnt = 0; size_t cnt = 0;
...@@ -73,7 +112,7 @@ public: ...@@ -73,7 +112,7 @@ public:
++i; ++i;
++total_cnt; ++total_cnt;
// skip end of line // skip end of line
while (buffer_process[i] == '\n' || buffer_process[i] == '\r') { ++i; } while ((buffer_process[i] == '\n' || buffer_process[i] == '\r') && i < read_cnt) { ++i; }
last_i = i; last_i = i;
} }
else { else {
...@@ -176,7 +215,7 @@ public: ...@@ -176,7 +215,7 @@ public:
last_line_ = ""; last_line_ = "";
INDEX_T total_cnt = 0; INDEX_T total_cnt = 0;
INDEX_T used_cnt = 0; INDEX_T used_cnt = 0;
PipelineReader::Read(filename_, PipelineReader::Read(filename_, skip_bytes_,
[this, &total_cnt, &process_fun,&used_cnt, &filter_fun] [this, &total_cnt, &process_fun,&used_cnt, &filter_fun]
(const char* buffer_process, size_t read_cnt) { (const char* buffer_process, size_t read_cnt) {
size_t cnt = 0; size_t cnt = 0;
...@@ -208,7 +247,7 @@ public: ...@@ -208,7 +247,7 @@ public:
++i; ++i;
++total_cnt; ++total_cnt;
// skip end of line // skip end of line
while (buffer_process[i] == '\n' || buffer_process[i] == '\r') { ++i; } while ((buffer_process[i] == '\n' || buffer_process[i] == '\r') && i < read_cnt) { ++i; }
last_i = i; last_i = i;
} }
else { else {
...@@ -242,7 +281,7 @@ public: ...@@ -242,7 +281,7 @@ public:
} }
INDEX_T ReadPartAndProcessParallel(const std::vector<INDEX_T>& used_data_indices, const std::function<void(INDEX_T, const std::vector<std::string>&)>& process_fun) { INDEX_T ReadPartAndProcessParallel(const std::vector<INDEX_T>& used_data_indices, const std::function<void(INDEX_T, const std::vector<std::string>&)>& process_fun) {
return ReadAllAndProcessParallelWithFilter(process_fun, return ReadAllAndProcessParallelWithFilter(process_fun,
[&used_data_indices](INDEX_T used_cnt ,INDEX_T total_cnt) { [&used_data_indices](INDEX_T used_cnt ,INDEX_T total_cnt) {
if (used_cnt < used_data_indices.size() && total_cnt == used_data_indices[used_cnt]) { if (used_cnt < used_data_indices.size() && total_cnt == used_data_indices[used_cnt]) {
return true; return true;
...@@ -260,6 +299,12 @@ private: ...@@ -260,6 +299,12 @@ private:
std::vector<std::string> lines_; std::vector<std::string> lines_;
/*! \brief Buffer for last line */ /*! \brief Buffer for last line */
std::string last_line_; std::string last_line_;
/*! \brief first line */
std::string first_line_="";
/*! \brief is skip first line */
bool is_skip_first_line_ = false;
/*! \brief is skip first line */
int skip_bytes_ = 0;
}; };
} // namespace LightGBM } // namespace LightGBM
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment