Commit 1c774687 authored by Guolin Ke's avatar Guolin Ke
Browse files

first commit

parents
#ifndef LIGHTGBM_TREE_LEARNER_H_
#define LIGHTGBM_TREE_LEARNER_H_
#include <LightGBM/meta.h>
#include <LightGBM/config.h>
#include <vector>
namespace LightGBM {
/*! \brief forward declaration */
class Tree;
class Dataset;
/*!
* \brief Interface for tree learner
*/
class TreeLearner {
 public:
  /*! \brief virtual destructor */
  virtual ~TreeLearner() {}
  /*!
  * \brief Init tree learner with the training data set
  * \param train_data The training data
  */
  virtual void Init(const Dataset* train_data) = 0;
  /*!
  * \brief Fit the training data and return a trained tree
  * \param gradients The first order gradients
  * \param hessians The second order gradients
  * \return A trained tree
  */
  virtual Tree* Train(const score_t* gradients, const score_t* hessians) = 0;
  /*!
  * \brief Set the data subset used for bagging
  * \param used_indices Used data indices
  * \param num_data Number of used data
  */
  virtual void SetBaggingData(const data_size_t* used_indices,
    data_size_t num_data) = 0;
  /*!
  * \brief Use the last trained tree to predict the training score, and add it to out_score
  * \param out_score Output score buffer
  */
  virtual void AddPredictionToScore(score_t *out_score) const = 0;
  /*!
  * \brief Factory: create an object of a concrete tree learner
  * \param type Type of tree learner
  * \param tree_config The tree settings
  */
  static TreeLearner* CreateTreeLearner(TreeLearnerType type,
    const TreeConfig& tree_config);
};
} // namespace LightGBM
#endif  // LightGBM_TREE_LEARNER_H_
#ifndef LIGHTGBM_UTILS_ARRAY_AGRS_H_
#define LIGHTGBM_UTILS_ARRAY_AGRS_H_
#include <vector>
#include <algorithm>
namespace LightGBM {
/*!
* \brief Contains some operation for a array, e.g. ArgMax, TopK.
*/
template<typename VAL_T>
class ArrayArgs {
 public:
  /*! \brief Index of the maximum element; returns 0 for an empty array. */
  inline static size_t ArgMax(const std::vector<VAL_T>& array) {
    if (array.empty()) {
      return 0;
    }
    size_t arg_max = 0;
    for (size_t i = 1; i < array.size(); ++i) {
      if (array[i] > array[arg_max]) {
        arg_max = i;
      }
    }
    return arg_max;
  }
  /*! \brief Index of the minimum element; returns 0 for an empty array. */
  inline static size_t ArgMin(const std::vector<VAL_T>& array) {
    if (array.empty()) {
      return 0;
    }
    size_t arg_min = 0;
    for (size_t i = 1; i < array.size(); ++i) {
      if (array[i] < array[arg_min]) {
        arg_min = i;
      }
    }
    return arg_min;
  }
  /*! \brief Index of the maximum element in array[0..n); returns 0 when n == 0. */
  inline static size_t ArgMax(const VAL_T* array, size_t n) {
    if (n == 0) {
      return 0;
    }
    size_t arg_max = 0;
    for (size_t i = 1; i < n; ++i) {
      if (array[i] > array[arg_max]) {
        arg_max = i;
      }
    }
    return arg_max;
  }
  /*! \brief Index of the minimum element in array[0..n); returns 0 when n == 0. */
  inline static size_t ArgMin(const VAL_T* array, size_t n) {
    if (n == 0) {
      return 0;
    }
    size_t arg_min = 0;
    for (size_t i = 1; i < n; ++i) {
      if (array[i] < array[arg_min]) {
        arg_min = i;
      }
    }
    return arg_min;
  }
  /*!
  * \brief Partition [start, end) around the last element so that values
  *        greater than the pivot come first; returns the pivot's final index.
  */
  inline static size_t Partition(std::vector<VAL_T>* array, size_t start, size_t end) {
    VAL_T& pivot = (*array)[end - 1];
    size_t p_idx = start;
    for (size_t i = start; i < end - 1; ++i) {
      if ((*array)[i] > pivot) {
        std::swap((*array)[p_idx], (*array)[i]);
        ++p_idx;
      }
    }
    std::swap((*array)[p_idx], (*array)[end - 1]);
    return p_idx;
  }
  /*!
  * \brief Quickselect on the descending order: afterwards positions
  *        [start, k] hold the k - start + 1 largest values of the range.
  * \return k (the index where the (k+1)-th largest value now sits)
  */
  inline static size_t ArgMaxAtK(std::vector<VAL_T>* array, size_t start, size_t end, size_t k) {
    if (start == end - 1) {
      return start;
    }
    size_t p_idx = Partition(array, start, end);
    if (p_idx == k) {
      return p_idx;
    } else if (k < p_idx) {
      return ArgMaxAtK(array, start, p_idx, k);
    } else {
      return ArgMaxAtK(array, p_idx + 1, end, k);
    }
  }
  /*!
  * \brief Copy the k largest values of array into out (order unspecified).
  *        Returns the whole array when k >= array.size(), nothing when k == 0.
  */
  inline static void MaxK(const std::vector<VAL_T>& array, size_t k, std::vector<VAL_T>* out) {
    out->clear();
    if (k == 0) {
      return;
    }
    // copy in one shot instead of element-wise push_back
    out->assign(array.begin(), array.end());
    if (k >= array.size()) {
      return;
    }
    // move the k largest values to the front, then drop the rest
    ArgMaxAtK(out, 0, out->size(), k - 1);
    out->erase(out->begin() + k, out->end());
  }
};
} // namespace LightGBM
#endif  // LightGBM_UTILS_ARRAY_AGRS_H_
#ifndef LIGHTGBM_UTILS_COMMON_FUN_H_
#define LIGHTGBM_UTILS_COMMON_FUN_H_
#include <LightGBM/utils/log.h>
#include <cstdio>
#include <string>
#include <vector>
#include <sstream>
#include <cstdint>
namespace LightGBM {
namespace Common {
/*! \brief Return the larger of a and b (b when they compare equal). */
template<typename T>
inline static T Max(const T& a, const T& b) {
  if (a > b) {
    return a;
  }
  return b;
}
/*! \brief Return the smaller of a and b (b when they compare equal). */
template<typename T>
inline static T Min(const T& a, const T& b) {
  if (a < b) {
    return a;
  }
  return b;
}
/*! \brief Strip leading and trailing whitespace from str in place; returns str. */
inline static std::string& Trim(std::string& str) {
  const char* kWhitespace = " \f\n\r\t\v";
  if (!str.empty()) {
    // drop the trailing run first, then the leading one
    str.erase(str.find_last_not_of(kWhitespace) + 1);
    str.erase(0, str.find_first_not_of(kWhitespace));
  }
  return str;
}
/*! \brief Split str on delimiter; empty fields between delimiters are kept. */
inline static std::vector<std::string> Split(const char* str, char delimiter) {
  std::vector<std::string> pieces;
  std::stringstream stream(str);
  for (std::string piece; std::getline(stream, piece, delimiter);) {
    pieces.push_back(piece);
  }
  return pieces;
}
/*!
* \brief Parse a decimal integer (optional sign, surrounding spaces allowed).
* \param p Input text
* \param out Parsed value
* \return Pointer just past the number and any trailing spaces
*/
inline static const char* Atoi(const char* p, int* out) {
  while (*p == ' ') { ++p; }
  int sign = 1;
  if (*p == '-') {
    sign = -1;
    ++p;
  } else if (*p == '+') {
    ++p;
  }
  int result = 0;
  while (*p >= '0' && *p <= '9') {
    result = result * 10 + (*p - '0');
    ++p;
  }
  *out = sign * result;
  while (*p == ' ') { ++p; }
  return p;
}
//ref to http://www.leapsecond.com/tools/fast_atof.c
/*!
* \brief Fast string-to-double parse (based on leapsecond.com fast_atof).
* \param p Input text; optional sign, fraction and e/E exponent,
*        surrounding spaces allowed
* \param out Parsed value
* \return Pointer just past the parsed number and any trailing spaces
*/
inline static const char* Atof(const char* p, double* out) {
  int frac;
  double sign, value, scale;
  // Skip leading white space, if any.
  while (*p == ' ') {
    ++p;
  }
  // Get sign, if any.
  sign = 1.0;
  if (*p == '-') {
    sign = -1.0;
    ++p;
  }
  else if (*p == '+') {
    ++p;
  }
  // Get digits before decimal point or exponent, if any.
  for (value = 0.0; *p >= '0' && *p <= '9'; ++p) {
    value = value * 10.0 + (*p - '0');
  }
  // Get digits after decimal point, if any.
  if (*p == '.') {
    double pow10 = 10.0;
    ++p;
    while (*p >= '0' && *p <= '9') {
      value += (*p - '0') / pow10;
      pow10 *= 10.0;
      ++p;
    }
  }
  // Handle exponent, if any.
  frac = 0;  // set when the exponent is negative (divide instead of multiply)
  scale = 1.0;
  if ((*p == 'e') || (*p == 'E')) {
    unsigned int expon;
    // Get sign of exponent, if any.
    ++p;
    if (*p == '-') {
      frac = 1;
      ++p;
    }
    else if (*p == '+') {
      ++p;
    }
    // Get digits of exponent, if any.
    for (expon = 0; *p >= '0' && *p <= '9'; ++p) {
      expon = expon * 10 + (*p - '0');
    }
    // Clamp to the largest double decimal exponent to avoid overflowing scale.
    if (expon > 308) expon = 308;
    // Calculate scaling factor in big steps to keep the loop count small.
    while (expon >= 50) { scale *= 1E50; expon -= 50; }
    while (expon >= 8) { scale *= 1E8; expon -= 8; }
    while (expon > 0) { scale *= 10.0; expon -= 1; }
  }
  // Return signed and scaled floating point result.
  *out = sign * (frac ? (value / scale) : (value * scale));
  // Skip trailing spaces so the caller resumes at the next token.
  while (*p == ' ') {
    ++p;
  }
  return p;
}
/*! \brief Advance p past any run of spaces and tabs. */
inline static const char* SkipSpaceAndTab(const char* p) {
  for (; *p == ' ' || *p == '\t'; ++p) {}
  return p;
}
/*! \brief Advance p past any run of newline, carriage-return and space characters. */
inline static const char* SkipReturn(const char* p) {
  for (; *p == '\n' || *p == '\r' || *p == ' '; ++p) {}
  return p;
}
/*! \brief Join the first n elements of arr into a delimiter-separated string. */
template<typename T>
inline static std::string ArrayToString(const T* arr, int n, char delimiter) {
  std::stringstream buf;
  for (int i = 0; i < n; ++i) {
    if (i > 0) {
      buf << delimiter;
    }
    buf << arr[i];
  }
  return buf.str();
}
/*!
* \brief Parse a delimited string into a pre-allocated int array of size n.
*        Aborts via Log::Stderr when the field count differs from n.
*/
inline static void StringToIntArray(const std::string& str, char delimiter, size_t n, int* out) {
  std::vector<std::string> parts = Split(str.c_str(), delimiter);
  if (parts.size() != n) {
    Log::Stderr("StringToIntArray error, size don't equal.");
  }
  size_t idx = 0;
  for (auto& part : parts) {
    part = Trim(part);
    Atoi(part.c_str(), &out[idx]);
    ++idx;
  }
}
/*!
* \brief Parse a delimited string into a pre-allocated double array of size n.
*        Aborts via Log::Stderr when the field count differs from n.
*/
inline static void StringToDoubleArray(const std::string& str, char delimiter, size_t n, double* out) {
  std::vector<std::string> parts = Split(str.c_str(), delimiter);
  if (parts.size() != n) {
    Log::Stderr("StringToDoubleArray error, size don't equal");
  }
  size_t idx = 0;
  for (auto& part : parts) {
    part = Trim(part);
    Atof(part.c_str(), &out[idx]);
    ++idx;
  }
}
/*!
* \brief Parse a delimited string into a pre-allocated float array of size n.
*        Aborts via Log::Stderr when the field count differs from n.
*/
inline static void StringToDoubleArray(const std::string& str, char delimiter, size_t n, float* out) {
  std::vector<std::string> parts = Split(str.c_str(), delimiter);
  if (parts.size() != n) {
    Log::Stderr("StringToDoubleArray error, size don't equal");
  }
  for (size_t i = 0; i < parts.size(); ++i) {
    parts[i] = Trim(parts[i]);
    double parsed;
    Atof(parts[i].c_str(), &parsed);
    // values are parsed as double, then narrowed to float
    out[i] = static_cast<float>(parsed);
  }
}
/*!
* \brief Parse a delimited string into a vector of doubles.
* \param str Input text, fields separated by delimiter
* \return One parsed value per field
*/
inline static std::vector<double> StringToDoubleArray(const std::string& str, char delimiter) {
  std::vector<std::string> strs = Split(str.c_str(), delimiter);
  std::vector<double> ret;
  ret.reserve(strs.size());  // field count is known: allocate once
  for (size_t i = 0; i < strs.size(); ++i) {
    strs[i] = Trim(strs[i]);
    double val = 0.0;
    Atof(strs[i].c_str(), &val);
    ret.push_back(val);
  }
  return ret;
}
/*!
* \brief Parse a delimited string into a vector of ints.
* \param str Input text, fields separated by delimiter
* \return One parsed value per field
*/
inline static std::vector<int> StringToIntArray(const std::string& str, char delimiter) {
  std::vector<std::string> strs = Split(str.c_str(), delimiter);
  std::vector<int> ret;
  ret.reserve(strs.size());  // field count is known: allocate once
  for (size_t i = 0; i < strs.size(); ++i) {
    strs[i] = Trim(strs[i]);
    int val = 0;
    Atoi(strs[i].c_str(), &val);
    ret.push_back(val);
  }
  return ret;
}
/*! \brief Join all strings with delimiter; empty input yields an empty string. */
inline static std::string Join(const std::vector<std::string>& strs, char delimiter) {
  std::stringstream result;
  bool first = true;
  for (const auto& s : strs) {
    if (!first) {
      result << delimiter;
    }
    result << s;
    first = false;
  }
  return result.str();
}
/*!
* \brief Join strs[start..end) with delimiter; end is clamped to strs.size().
* \return Empty string for an empty/invalid range.
*
* Fix: the old guard `end - start <= 0` underflowed on size_t when
* end < start, and an empty vector (or start >= size) reached
* strs[start] out of bounds. The range is now validated up front.
*/
inline static std::string Join(const std::vector<std::string>& strs, size_t start, size_t end, char delimiter) {
  if (end > strs.size()) {
    end = strs.size();
  }
  if (start >= end) {
    return std::string("");
  }
  std::stringstream ss;
  ss << strs[start];
  for (size_t i = start + 1; i < end; ++i) {
    ss << delimiter;
    ss << strs[i];
  }
  return ss.str();
}
/*!
* \brief Smallest power of two >= x (returns 1 for x <= 1,
*        0 when x exceeds 2^62, the largest power representable safely).
*
* Fix: the old loop shifted t up to 2^63, which overflows signed
* int64_t (undefined behavior). The shift now stops at 2^62.
*/
static inline int64_t Pow2RoundUp(int64_t x) {
  int64_t t = 1;
  for (int i = 0; i < 62; ++i) {
    if (t >= x) {
      return t;
    }
    t <<= 1;
  }
  // t == 2^62 here; one last check without shifting further
  if (t >= x) {
    return t;
  }
  return 0;
}
} // namespace Common
} // namespace LightGBM
#endif  // LightGBM_UTILS_COMMON_FUN_H_
#ifndef LIGHTGBM_UTILS_LOG_H_
#define LIGHTGBM_UTILS_LOG_H_
#include <cstdio>
#include <cstdlib>
#include <cstdarg>
#include <cstring>
namespace LightGBM {
/*!
* \brief Minimal logging helper: wraps the caller's printf-style format
*        in a LightGBM prefix and prints via vfprintf.
*/
class Log {
 public:
  /*!
  * \brief Print an error message to stderr and terminate the process.
  * \param format printf-style format for the message body
  */
  inline static void Stderr(const char *format, ...) {
    va_list argptr;
    char fixed[512];
    // Fix: snprintf truncates instead of overflowing the fixed buffer
    // (sprintf/sprintf_s could overflow / abort on a long format string);
    // it is standard since C++11, so the MSVC fork is no longer needed.
    snprintf(fixed, sizeof(fixed), "[LightGBM Error] %s \n", format);
    va_start(argptr, format);
    vfprintf(stderr, fixed, argptr);
    va_end(argptr);
    fflush(stderr);
    std::exit(1);
  }
  /*!
  * \brief Print an informational message to stdout.
  * \param format printf-style format for the message body
  */
  inline static void Stdout(const char *format, ...) {
    va_list argptr;
    char fixed[512];
    snprintf(fixed, sizeof(fixed), "[LightGBM] %s\n", format);
    va_start(argptr, format);
    vfprintf(stdout, fixed, argptr);
    va_end(argptr);
    fflush(stdout);
  }
};
/*!
* \brief Abort (via Log::Stderr) with file/line info when condition is false.
* NOTE(review): expands to a bare `if` with a trailing `;`, so callers may
* rely on `CHECK(x)` without a semicolon; beware the dangling-else hazard
* when using it inside an `if`/`else`.
*/
#define CHECK(condition) \
if (!(condition)) Log::Stderr("Check failed: " #condition \
" at %s, line %d .\n", __FILE__, __LINE__);
} // namespace LightGBM
#endif  // LightGBM_UTILS_LOG_H_
#ifndef LIGHTGBM_UTILS_PIPELINE_READER_H_
#define LIGHTGBM_UTILS_PIPELINE_READER_H_
#include <LightGBM/utils/log.h>
#include <cstdio>
#include <algorithm>
#include <functional>
#include <thread>
namespace LightGBM{
/*!
* \brief A pipeline file reader, use 2 threads, one read block from file, the other process the block
*/
class PipelineReader {
 public:
  /*!
  * \brief Read a file using a two-thread pipeline: while this thread runs
  *        process_fun on the current block, a worker thread reads the next
  *        block; the two buffers are then swapped.
  * \param filename Filename of data
  * \param process_fun Callback per block; receives (buffer, byte count) and
  *        returns a count that is summed into the overall return value
  * \return Accumulated return values of process_fun, or 0 if the file
  *         cannot be opened
  */
  static size_t Read(const char* filename, const std::function<size_t (const char*, size_t)>& process_fun) {
    FILE* file;
#ifdef _MSC_VER
    fopen_s(&file, filename, "rb");
#else
    file = fopen(filename, "rb");
#endif
    if (file == NULL) {
      return 0;
    }
    size_t cnt = 0;
    // 16MB per buffer; two buffers alternate between reading and processing
    const size_t buffer_size = 16 * 1024 * 1024;
    // buffer used for the process_fun
    char* buffer_process = new char[buffer_size];
    // buffer used for the file reading
    char* buffer_read = new char[buffer_size];
    // read the first block synchronously
    size_t read_cnt = fread(buffer_process, 1, buffer_size, file);
    size_t last_read_cnt = 0;
    while (read_cnt > 0) {
      // start the read thread: prefetch the next block into buffer_read
      std::thread read_worker = std::thread(
        [file, buffer_read, buffer_size, &last_read_cnt] {
        last_read_cnt = fread(buffer_read, 1, buffer_size, file);
      }
      );
      // process the current block on this thread, in parallel with the read
      cnt += process_fun(buffer_process, read_cnt);
      // wait for the read thread
      read_worker.join();
      // exchange the buffers for the next round
      std::swap(buffer_process, buffer_read);
      read_cnt = last_read_cnt;
    }
    delete[] buffer_process;
    delete[] buffer_read;
    // close file
    fclose(file);
    return cnt;
  }
};
} // namespace LightGBM
#endif  // LightGBM_UTILS_PIPELINE_READER_H_
#ifndef LIGHTGBM_UTILS_RANDOM_H_
#define LIGHTGBM_UTILS_RANDOM_H_
#include <cstdint>
#include <random>
#include <vector>
namespace LightGBM {
/*!
* \brief A wrapper for random generator
*/
/*!
* \brief A wrapper for random generator
*/
class Random {
 public:
  /*! \brief Constructor, seeded from the system random device */
  Random()
    : distribution_zero_to_one_(0.0, 1.0) {
    std::random_device rd;
    generator_ = std::mt19937(rd());
  }
  /*! \brief Constructor with a specific seed (deterministic sequence) */
  Random(int seed)
    : generator_(seed), distribution_zero_to_one_(0.0, 1.0) {
  }
  /*!
  * \brief Generate a random integer
  * \param lower_bound lower bound
  * \param upper_bound upper bound
  * \return A random integer in [lower_bound, upper_bound)
  * \note Requires lower_bound < upper_bound; otherwise the underlying
  *       distribution is constructed with an inverted range (undefined).
  */
  inline int64_t NextInt(int64_t lower_bound, int64_t upper_bound) {
    std::uniform_int_distribution<int64_t> distribution(lower_bound, upper_bound - 1);
    return distribution(generator_);
  }
  /*!
  * \brief Generate random float data
  * \return A random float in [0.0, 1.0)
  */
  inline double NextDouble() {
    return distribution_zero_to_one_(generator_);
  }
  /*!
  * \brief Sample K values from {0, 1, ..., N-1} without replacement
  * \param N Population size
  * \param K Sample size
  * \return K ordered sampled values; empty when K > N
  */
  inline std::vector<size_t> Sample(size_t N, size_t K) {
    std::vector<size_t> ret;
    // fix: removed the tautological "K < 0" test (K is unsigned)
    if (K > N) {
      return ret;
    }
    ret.reserve(K);
    // selection sampling: keep i with probability
    // (still needed) / (still available); this yields exactly K values
    for (size_t i = 0; i < N; ++i) {
      double prob = (K - ret.size()) / static_cast<double>(N - i);
      if (NextDouble() < prob) {
        ret.push_back(i);
      }
    }
    return ret;
  }

 private:
  /*! \brief Random generator */
  std::mt19937 generator_;
  /*! \brief Cached distribution for [0.0, 1.0) */
  std::uniform_real_distribution<double> distribution_zero_to_one_;
};
} // namespace LightGBM
#endif  // LightGBM_UTILS_RANDOM_H_
#ifndef LIGHTGBM_UTILS_TEXT_READER_H_
#define LIGHTGBM_UTILS_TEXT_READER_H_
#include <LightGBM/utils/pipeline_reader.h>
#include <LightGBM/utils/log.h>
#include <LightGBM/utils/random.h>
#include <cstdio>
#include <vector>
#include <string>
#include <functional>
namespace LightGBM {
/*!
* \brief Read text data from file
*/
template<typename INDEX_T>
class TextReader {
 public:
  /*!
  * \brief Constructor
  * \param filename Filename of data
  */
  TextReader(const char* filename)
    : filename_(filename) {
  }
  /*! \brief Destructor: releases the cached lines */
  ~TextReader() {
    Clear();
  }
  /*! \brief Clear cached data and release its memory */
  inline void Clear() {
    lines_.clear();
    lines_.shrink_to_fit();
  }
  /*!
  * \brief Get text data that was read from file
  * \return Text data, stored in std::vector by line
  */
  inline std::vector<std::string>& Lines() { return lines_; }
  /*!
  * \brief Stream the file via PipelineReader and call process_fun once per
  *        line ('\n', '\r' or "\r\n" terminated). Lines may span chunk
  *        boundaries; last_line_ buffers the partial tail between chunks.
  * \return Total number of lines processed
  */
  INDEX_T ReadAllAndProcess(const std::function<void(INDEX_T, const char*, size_t)>& process_fun) {
    last_line_ = "";
    INDEX_T total_cnt = 0;
    PipelineReader::Read(filename_,
      [this, &total_cnt, &process_fun]
    (const char* buffer_process, size_t read_cnt) {
      size_t cnt = 0;
      size_t i = 0;
      size_t last_i = 0;
      // skip the break between '\r' and '\n' when it was split across chunks
      if (last_line_.size() == 0 && buffer_process[0] == '\n') {
        i = 1;
        last_i = i;
      }
      while (i < read_cnt) {
        if (buffer_process[i] == '\n' || buffer_process[i] == '\r') {
          if (last_line_.size() > 0) {
            // current line started in the previous chunk
            last_line_.append(buffer_process + last_i, i - last_i);
            process_fun(total_cnt, last_line_.c_str(), last_line_.size());
            last_line_ = "";
          }
          else {
            process_fun(total_cnt, buffer_process + last_i, i - last_i);
          }
          ++cnt;
          ++i;
          ++total_cnt;
          // skip the rest of the line break, staying inside the chunk
          // (fix: this scan was unbounded and could read past read_cnt)
          while (i < read_cnt && (buffer_process[i] == '\n' || buffer_process[i] == '\r')) { ++i; }
          last_i = i;
        }
        else {
          ++i;
        }
      }
      if (last_i != read_cnt) {
        // buffer the unterminated tail for the next chunk
        last_line_ = std::string(buffer_process + last_i, read_cnt - last_i);
      }
      return cnt;
    });
    // if last line of file doesn't contain end of line
    if (last_line_.size() > 0) {
      Log::Stdout("Warning: last line of file %s doesn't contain end of line, application will still use this line", filename_);
      process_fun(total_cnt, last_line_.c_str(), last_line_.size());
      ++total_cnt;
      last_line_ = "";
    }
    return total_cnt;
  }
  /*!
  * \brief Read all text data from file into memory
  * \return Number of lines
  */
  INDEX_T ReadAllLines() {
    return ReadAllAndProcess(
      [this](INDEX_T, const char* buffer, size_t size) {
      lines_.emplace_back(buffer, size);
    });
  }
  /*!
  * \brief Reservoir-sample up to sample_cnt lines from the file
  * \return Total number of lines in the file
  */
  INDEX_T SampleFromFile(Random& random, INDEX_T sample_cnt, std::vector<std::string>* out_sampled_data) {
    INDEX_T cur_sample_cnt = 0;
    return ReadAllAndProcess(
      [this, &random, &cur_sample_cnt, &sample_cnt, &out_sampled_data]
    (INDEX_T line_idx, const char* buffer, size_t size) {
      if (cur_sample_cnt < sample_cnt) {
        out_sampled_data->emplace_back(buffer, size);
        ++cur_sample_cnt;
      }
      else {
        // reservoir sampling: replace an existing sample with
        // probability sample_cnt / (line_idx + 1)
        const size_t idx = random.NextInt(0, line_idx + 1);
        if (idx < sample_cnt) {
          out_sampled_data->operator[](idx) = std::string(buffer, size);
        }
      }
    });
  }
  /*!
  * \brief Read part of text data from file in memory, use filter_fun to filter data
  * \param filter_fun Returns true for lines that should be kept
  * \param out_used_data_indices Stores the indices of kept lines
  * \return The number of total data
  */
  INDEX_T ReadAndFilterLines(const std::function<bool(INDEX_T)>& filter_fun, std::vector<INDEX_T>* out_used_data_indices) {
    out_used_data_indices->clear();
    INDEX_T total_cnt = ReadAllAndProcess(
      [this, &out_used_data_indices, &filter_fun]
    (INDEX_T line_idx, const char* buffer, size_t size) {
      bool is_used = filter_fun(line_idx);
      if (is_used) { out_used_data_indices->push_back(line_idx); }
      if (is_used) { lines_.emplace_back(buffer, size); }
    });
    return total_cnt;
  }
  /*!
  * \brief Filter lines with filter_fun and reservoir-sample up to
  *        sample_cnt of the kept lines
  * \return The number of total data
  */
  INDEX_T SampleAndFilterFromFile(const std::function<bool(INDEX_T)>& filter_fun, std::vector<INDEX_T>* out_used_data_indices,
    Random& random, size_t sample_cnt, std::vector<std::string>* out_sampled_data) {
    INDEX_T cur_sample_cnt = 0;
    out_used_data_indices->clear();
    INDEX_T total_cnt = ReadAllAndProcess(
      [this, &out_used_data_indices, &filter_fun, &random, &cur_sample_cnt, &sample_cnt, &out_sampled_data]
    (INDEX_T line_idx, const char* buffer, size_t size) {
      bool is_used = filter_fun(line_idx);
      if (is_used) { out_used_data_indices->push_back(line_idx); }
      if (is_used) {
        if (cur_sample_cnt < sample_cnt) {
          out_sampled_data->emplace_back(buffer, size);
          ++cur_sample_cnt;
        }
        else {
          const size_t idx = random.NextInt(0, out_used_data_indices->size());
          if (idx < sample_cnt) {
            out_sampled_data->operator[](idx) = std::string(buffer, size);
          }
        }
      }
    });
    return total_cnt;
  }
  /*! \brief Count the number of lines in the file without storing them */
  INDEX_T CountLine() {
    return ReadAllAndProcess(
      [this](INDEX_T, const char*, size_t) {
    });
  }
  /*!
  * \brief Stream the file in chunks: keep the lines of each chunk that pass
  *        filter_fun(used_cnt, total_cnt), hand them to process_fun with the
  *        starting used-index, then drop them (lines_ is reused as a
  *        per-chunk buffer and is left empty).
  * \return Total number of lines in the file
  */
  INDEX_T ReadAllAndProcessParallelWithFilter(const std::function<void(INDEX_T, const std::vector<std::string>&)>& process_fun, const std::function<bool(INDEX_T, INDEX_T)>& filter_fun) {
    last_line_ = "";
    INDEX_T total_cnt = 0;
    INDEX_T used_cnt = 0;
    PipelineReader::Read(filename_,
      [this, &total_cnt, &process_fun, &used_cnt, &filter_fun]
    (const char* buffer_process, size_t read_cnt) {
      size_t cnt = 0;
      size_t i = 0;
      size_t last_i = 0;
      INDEX_T start_idx = used_cnt;
      // skip the break between '\r' and '\n' when it was split across chunks
      if (last_line_.size() == 0 && buffer_process[0] == '\n') {
        i = 1;
        last_i = i;
      }
      while (i < read_cnt) {
        if (buffer_process[i] == '\n' || buffer_process[i] == '\r') {
          if (last_line_.size() > 0) {
            last_line_.append(buffer_process + last_i, i - last_i);
            if (filter_fun(used_cnt, total_cnt)) {
              lines_.push_back(last_line_);
              ++used_cnt;
            }
            last_line_ = "";
          }
          else {
            if (filter_fun(used_cnt, total_cnt)) {
              lines_.emplace_back(buffer_process + last_i, i - last_i);
              ++used_cnt;
            }
          }
          ++cnt;
          ++i;
          ++total_cnt;
          // skip the rest of the line break, staying inside the chunk
          // (fix: this scan was unbounded and could read past read_cnt)
          while (i < read_cnt && (buffer_process[i] == '\n' || buffer_process[i] == '\r')) { ++i; }
          last_i = i;
        }
        else {
          ++i;
        }
      }
      process_fun(start_idx, lines_);
      lines_.clear();
      if (last_i != read_cnt) {
        // buffer the unterminated tail for the next chunk
        last_line_ = std::string(buffer_process + last_i, read_cnt - last_i);
      }
      return cnt;
    });
    // if last line of file doesn't contain end of line
    if (last_line_.size() > 0) {
      Log::Stdout("Warning: last line of file %s doesn't contain end of line, application will still use this line", filename_);
      if (filter_fun(used_cnt, total_cnt)) {
        lines_.push_back(last_line_);
        process_fun(used_cnt, lines_);
      }
      lines_.clear();
      ++total_cnt;
      ++used_cnt;
      last_line_ = "";
    }
    return total_cnt;
  }
  /*! \brief Like ReadAllAndProcessParallelWithFilter, but keeps every line */
  INDEX_T ReadAllAndProcessParallel(const std::function<void(INDEX_T, const std::vector<std::string>&)>& process_fun) {
    return ReadAllAndProcessParallelWithFilter(process_fun, [](INDEX_T, INDEX_T) { return true; });
  }
  /*!
  * \brief Process only the lines whose index appears in used_data_indices
  *        (indices are matched in read order, so they are assumed sorted
  *        ascending -- TODO(review): confirm with callers)
  */
  INDEX_T ReadPartAndProcessParallel(const std::vector<INDEX_T>& used_data_indices, const std::function<void(INDEX_T, const std::vector<std::string>&)>& process_fun) {
    return ReadAllAndProcessParallelWithFilter(process_fun,
      [&used_data_indices](INDEX_T used_cnt, INDEX_T total_cnt) {
      if (used_cnt < used_data_indices.size() && total_cnt == used_data_indices[used_cnt]) {
        return true;
      }
      else {
        return false;
      }
    });
  }

 private:
  /*! \brief Filename of text data */
  const char* filename_;
  /*! \brief Cache of the read text data */
  std::vector<std::string> lines_;
  /*! \brief Buffer for an unterminated line at a chunk boundary */
  std::string last_line_;
};
} // namespace LightGBM
#endif  // LightGBM_UTILS_TEXT_READER_H_
#ifndef LIGHTGBM_UTILS_THREADING_H_
#define LIGHTGBM_UTILS_THREADING_H_
#include <omp.h>
#include <vector>
#include <functional>
namespace LightGBM {
class Threading {
 public:
  /*!
  * \brief Split the range [start, end) into one contiguous chunk per OpenMP
  *        thread and run inner_fun(thread_id, chunk_start, chunk_end) on
  *        each chunk in parallel.
  */
  template<typename INDEX_T>
  static inline void For(INDEX_T start, INDEX_T end, const std::function<void(int, INDEX_T, INDEX_T)>& inner_fun) {
    int num_threads = 1;
    // query the OpenMP team size once, from the master thread
#pragma omp parallel
#pragma omp master
    {
      num_threads = omp_get_num_threads();
    }
    // chunk size, rounded up so the whole range is covered
    INDEX_T num_inner = (end - start + num_threads - 1) / num_threads;
    if (num_inner <= 0) { num_inner = 1; }
#pragma omp parallel for schedule(static,1)
    for (int i = 0; i < num_threads; ++i) {
      INDEX_T inner_start = start + num_inner * i;
      INDEX_T inner_end = inner_start + num_inner;
      // clamp the last chunk; threads whose chunk starts past the end do nothing
      if (inner_end > end) { inner_end = end; }
      if (inner_start < end) {
        inner_fun(i, inner_start, inner_end);
      }
    }
  }
};
} // namespace LightGBM
#endif  // LightGBM_UTILS_THREADING_H_
# Expose the public LightGBM headers to every target below.
include_directories(${LightGBM_HEADER_DIR})
# MPI headers are only needed for the distributed (parallel) build.
if(USE_MPI)
include_directories(${MPI_CXX_INCLUDE_PATH})
endif()
# Collect the sources of each submodule.
# NOTE(review): AUX_SOURCE_DIRECTORY only globs at configure time;
# re-run CMake after adding new source files.
AUX_SOURCE_DIRECTORY(./application/ APPLICATION_SRC)
AUX_SOURCE_DIRECTORY(./boosting/ BOOSTING_SRC)
AUX_SOURCE_DIRECTORY(./io/ IO_SRC)
AUX_SOURCE_DIRECTORY(./metric/ METRIC_SRC)
AUX_SOURCE_DIRECTORY(./objective/ OBJECTIVE_SRC)
AUX_SOURCE_DIRECTORY(./network/ NETWORK_SRC)
AUX_SOURCE_DIRECTORY(./treelearner/ TREELEARNER_SRC)
add_executable(LightGBM main.cpp ${APPLICATION_SRC} ${BOOSTING_SRC} ${IO_SRC} ${METRIC_SRC} ${OBJECTIVE_SRC} ${NETWORK_SRC} ${TREELEARNER_SRC})
# Link against MPI for the distributed build.
if(USE_MPI)
TARGET_LINK_LIBRARIES(LightGBM ${MPI_CXX_LIBRARIES})
endif(USE_MPI)
#include <LightGBM/application.h>
#include <LightGBM/utils/common.h>
#include <LightGBM/utils/text_reader.h>
#include <LightGBM/network.h>
#include <LightGBM/dataset.h>
#include <LightGBM/boosting.h>
#include <LightGBM/objective_function.h>
#include <LightGBM/metric.h>
#include "predictor.hpp"
#include <omp.h>
#include <cstdio>
#include <ctime>
#include <chrono>
#include <fstream>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
namespace LightGBM {
/*!
* \brief Constructor: parses command line / config file parameters and
*        configures the OpenMP thread count.
*/
Application::Application(int argc, char** argv)
  : train_data_(nullptr), boosting_(nullptr), objective_fun_(nullptr) {
  LoadParameters(argc, argv);
  // set number of threads for openmp
  if (config_.num_threads > 0) {
    omp_set_num_threads(config_.num_threads);
  }
}
/*!
* \brief Destructor: releases datasets, metrics, boosting and objective
*        objects, and shuts the network down in parallel mode.
*/
Application::~Application() {
  if (train_data_ != nullptr) { delete train_data_; }
  for (auto& data : valid_datas_) {
    if (data != nullptr) { delete data; }
  }
  valid_datas_.clear();
  for (auto& metric : train_metric_) {
    if (metric != nullptr) { delete metric; }
  }
  // valid_metrics_ holds one metric vector per validation dataset
  for (auto& metric : valid_metrics_) {
    for (auto& sub_metric : metric) {
      if (sub_metric != nullptr) { delete sub_metric; }
    }
  }
  valid_metrics_.clear();
  if (boosting_ != nullptr) { delete boosting_; }
  if (objective_fun_ != nullptr) { delete objective_fun_; }
  if (config_.is_parallel) {
    // release the network used for distributed training
    Network::Dispose();
  }
}
/*!
* \brief Load parameters from the command line, then from the optional
*        config file; command line values win on conflict.
* \param argc Argument count (argv[0] carries no '=' and is skipped)
* \param argv Arguments of the form key=value
*/
void Application::LoadParameters(int argc, char** argv) {
  std::unordered_map<std::string, std::string> params;
  // Parse one "key=value" token into params. When only_if_absent is set,
  // an existing key is kept (used so the command line overrides the
  // config file). Tokens without exactly one '=' or an empty key are
  // ignored. This lambda replaces the previously duplicated parsing code.
  auto parse_one = [&params](const std::string& token, bool only_if_absent) {
    std::vector<std::string> kv = Common::Split(token.c_str(), '=');
    if (kv.size() != 2) { return; }
    std::string key = Common::Trim(kv[0]);
    std::string value = Common::Trim(kv[1]);
    if (key.size() <= 0) { return; }
    if (!only_if_absent || params.count(key) == 0) {
      params[key] = value;
    }
  };
  for (int i = 0; i < argc; ++i) {
    parse_one(argv[i], false);
  }
  // check for alias
  ParameterAlias::KeyAliasTransform(&params);
  // read parameters from config file
  if (params.count("config_file") > 0) {
    TextReader<size_t> config_reader(params["config_file"].c_str());
    config_reader.ReadAllLines();
    if (config_reader.Lines().size() > 0) {
      for (auto& line : config_reader.Lines()) {
        line = Common::Trim(line);
        // skip blank lines and comments
        if (line.size() == 0 || line[0] == '#') {
          continue;
        }
        // command line has higher priority, so keep existing keys
        parse_one(line, true);
      }
    } else {
      Log::Stdout("config file: %s doesn't exist, will ignore",
        params["config_file"].c_str());
    }
  }
  // check for alias again
  ParameterAlias::KeyAliasTransform(&params);
  // load configs
  config_.Set(params);
  Log::Stdout("finished load parameters");
}
/*!
* \brief Load training and validation data, create their metrics, and save
*        binary versions of the datasets when configured.
*/
void Application::LoadData() {
  auto start_time = std::chrono::high_resolution_clock::now();
  // prediction is needed if using an input initial model (continued training)
  PredictFunction predict_fun = nullptr;
  Predictor* predictor = nullptr;
  // load init model
  if (config_.io_config.input_model.size() > 0) {
    LoadModel();
    if (boosting_->NumberOfSubModels() > 0) {
      predictor = new Predictor(boosting_, config_.io_config.is_sigmoid);
      predict_fun =
        [&predictor](const std::vector<std::pair<int, double>>& features) {
        return predictor->PredictRawOneLine(features);
      };
    }
  }
  // sync up random seed for data partition
  if (config_.is_parallel_find_bin) {
    config_.io_config.data_random_seed =
      GlobalSyncUpByMin<int>(config_.io_config.data_random_seed);
  }
  train_data_ = new Dataset(config_.io_config.data_filename.c_str(),
    config_.io_config.input_init_score.c_str(),
    config_.io_config.max_bin,
    config_.io_config.data_random_seed,
    config_.io_config.is_enable_sparse,
    predict_fun);
  // load training data
  if (config_.is_parallel_find_bin) {
    // load data for parallel training
    train_data_->LoadTrainData(Network::rank(), Network::num_machines(),
      config_.io_config.is_pre_partition,
      config_.io_config.use_two_round_loading);
  } else {
    // load data for single machine
    train_data_->LoadTrainData(config_.io_config.use_two_round_loading);
  }
  // save binary file if requested
  if (config_.io_config.is_save_binary_file) {
    train_data_->SaveBinaryFile();
  }
  // create training metric
  if (config_.metric_config.is_provide_training_metric) {
    for (auto metric_type : config_.metric_types) {
      Metric* metric =
        Metric::CreateMetric(metric_type, config_.metric_config);
      if (metric == nullptr) { continue; }
      metric->Init("training", train_data_->metadata(),
        train_data_->num_data());
      train_metric_.push_back(metric);
    }
  }
  // add validation data, if it exists
  for (size_t i = 0; i < config_.io_config.valid_data_filenames.size(); ++i) {
    // add
    valid_datas_.push_back(
      new Dataset(config_.io_config.valid_data_filenames[i].c_str(),
        config_.io_config.max_bin,
        config_.io_config.data_random_seed,
        config_.io_config.is_enable_sparse,
        predict_fun));
    // load validation data like train data
    valid_datas_.back()->LoadValidationData(train_data_,
      config_.io_config.use_two_round_loading);
    // save binary file if requested
    if (config_.io_config.is_save_binary_file) {
      valid_datas_.back()->SaveBinaryFile();
    }
    // add metrics for validation data
    valid_metrics_.emplace_back();
    for (auto metric_type : config_.metric_types) {
      Metric* metric = Metric::CreateMetric(metric_type, config_.metric_config);
      if (metric == nullptr) { continue; }
      metric->Init(config_.io_config.valid_data_filenames[i].c_str(),
        valid_datas_.back()->metadata(),
        valid_datas_.back()->num_data());
      valid_metrics_.back().push_back(metric);
    }
  }
  // the predictor is only needed while loading
  if (predictor != nullptr) {
    delete predictor;
  }
  auto end_time = std::chrono::high_resolution_clock::now();
  // output time spent on loading data
  // fix: pass a plain double through the "%f" varargs, not a
  // std::chrono::duration object (passing a non-trivial class through
  // varargs is undefined behavior)
  Log::Stdout("Finish loading data, use %f seconds ",
    std::chrono::duration<double>(end_time - start_time).count());
}
/*!
* \brief Initialize network (if parallel), boosting, objective function,
*        data and metrics before training starts.
*/
void Application::InitTrain() {
  if (config_.is_parallel) {
    // need init network
    Network::Init(config_.network_config);
    Log::Stdout("finish network initialization");
    // sync global random seed for feature partition
    if (config_.boosting_type == BoostingType::kGBDT) {
      GBDTConfig* gbdt_config =
        dynamic_cast<GBDTConfig*>(config_.boosting_config);
      gbdt_config->tree_config.feature_fraction_seed =
        GlobalSyncUpByMin<int>(gbdt_config->tree_config.feature_fraction_seed);
      gbdt_config->tree_config.feature_fraction =
        GlobalSyncUpByMin<double>(gbdt_config->tree_config.feature_fraction);
    }
  }
  // create boosting
  boosting_ =
    Boosting::CreateBoosting(config_.boosting_type, config_.boosting_config);
  // create objective function
  objective_fun_ =
    ObjectiveFunction::CreateObjectiveFunction(config_.objective_type,
      config_.objective_config);
  // load training data
  LoadData();
  // initialize the objective function
  objective_fun_->Init(train_data_->metadata(), train_data_->num_data());
  // initialize the boosting
  boosting_->Init(train_data_, objective_fun_,
    ConstPtrInVectorWarpper<Metric>(train_metric_),
    config_.io_config.output_model.c_str());
  // add validation data into boosting
  for (size_t i = 0; i < valid_datas_.size(); ++i) {
    boosting_->AddDataset(valid_datas_[i],
      ConstPtrInVectorWarpper<Metric>(valid_metrics_[i]));
  }
  Log::Stdout("finish training init");
}
/*! \brief Run the training loop of the boosting model. */
void Application::Train() {
  Log::Stdout("start train");
  boosting_->Train();
  Log::Stdout("finish train");
}
/*!
* \brief Predict on the configured data file and write the results to the
*        configured output file.
*/
void Application::Predict() {
  // create predictor
  Predictor predictor(boosting_, config_.io_config.is_sigmoid);
  predictor.Predict(config_.io_config.data_filename.c_str(),
    config_.io_config.data_has_label, config_.io_config.output_result.c_str());
  Log::Stdout("finish predict");
}
/*! \brief Create the boosting object and load the input model for prediction. */
void Application::InitPredict() {
  boosting_ =
    Boosting::CreateBoosting(config_.boosting_type, config_.boosting_config);
  LoadModel();
  Log::Stdout("finish predict init");
}
/*!
* \brief Read the input model file and restore the boosting model from its
*        newline-joined content.
*/
void Application::LoadModel() {
  TextReader<size_t> model_reader(config_.io_config.input_model.c_str());
  model_reader.ReadAllLines();
  std::string model_str;
  for (const auto& line : model_reader.Lines()) {
    model_str += line;
    model_str += '\n';
  }
  boosting_->ModelsFromString(model_str, config_.io_config.num_model_predict);
}
/*!
* \brief All-reduce helper: returns the minimum of `local` across all
*        machines (or `local` itself when not running in parallel).
* NOTE(review): `local` is passed by non-const reference because it is used
* as the Allreduce input buffer; it is not modified here.
*/
template<typename T>
T Application::GlobalSyncUpByMin(T& local) {
  T global = local;
  if (!config_.is_parallel) {
    // no need to sync when not running parallel learning
    return global;
  }
  // element-wise min reduction over the raw bytes of T
  Network::Allreduce(reinterpret_cast<char*>(&local),
    sizeof(local), sizeof(local),
    reinterpret_cast<char*>(&global),
    [](const char* src, char* dst, int len) {
    int used_size = 0;
    const int type_size = sizeof(T);
    const T *p1;
    T *p2;
    while (used_size < len) {
      p1 = reinterpret_cast<const T *>(src);
      p2 = reinterpret_cast<T *>(dst);
      // keep the smaller value in the destination buffer
      if (*p1 < *p2) {
        std::memcpy(dst, src, type_size);
      }
      src += type_size;
      dst += type_size;
      used_size += type_size;
    }
  });
  return global;
}
} // namespace LightGBM
#ifndef LIGHTGBM_PREDICTOR_HPP_
#define LIGHTGBM_PREDICTOR_HPP_
#include <LightGBM/meta.h>
#include <LightGBM/boosting.h>
#include <LightGBM/utils/text_reader.h>
#include <LightGBM/dataset.h>
#include <omp.h>
#include <cstring>
#include <cstdio>
#include <vector>
#include <utility>
#include <functional>
#include <string>
namespace LightGBM {
/*!
* \brief Used to prediction data with input model
*/
class Predictor {
public:
/*!
* \brief Constructor
* \param boosting Input boosting model
* \param is_simgoid True if prediction should apply the sigmoid transform
*        (e.g. binary classification). NOTE(review): "simgoid" is a typo
*        for "sigmoid"; the member is declared elsewhere with the same
*        spelling, so it is kept as-is here.
*/
Predictor(const Boosting* boosting, bool is_simgoid)
  : is_simgoid_(is_simgoid) {
  boosting_ = boosting;
  // dense feature buffer size: highest feature index used by the model + 1
  num_features_ = boosting_->MaxFeatureIdx() + 1;
  // query the OpenMP team size once, from the master thread
#pragma omp parallel
#pragma omp master
  {
    num_threads_ = omp_get_num_threads();
  }
  // one dense feature buffer per thread, so predictions can run in parallel
  features_ = new double*[num_threads_];
  for (int i = 0; i < num_threads_; ++i) {
    features_[i] = new double[num_features_];
  }
}
/*!
* \brief Destructor: releases the per-thread feature buffers
*/
~Predictor() {
  if (features_ != nullptr) {
    for (int i = 0; i < num_threads_; ++i) {
      delete[] features_[i];
    }
    delete[] features_;
  }
}
/*!
* \brief Predict one record, returning the raw score (no sigmoid transform)
* \param features Sparse (index, value) features of this record
* \return Raw prediction result
*/
double PredictRawOneLine(const std::vector<std::pair<int, double>>& features) {
  const int tid = omp_get_thread_num();
  double* buffer = features_[tid];
  // reset this thread's dense buffer, then scatter the sparse input into it
  std::memset(buffer, 0, sizeof(double) * num_features_);
  for (size_t i = 0; i < features.size(); ++i) {
    const int feature_idx = features[i].first;
    // features beyond what the model knows are ignored
    if (feature_idx < num_features_) {
      buffer[feature_idx] = features[i].second;
    }
  }
  // raw score, without the sigmoid transform
  return boosting_->PredictRaw(buffer);
}
/*!
* \brief prediction for one record, will use sigmoid transform if needed(only needs in binary classification now)
* \param features Feature for this record
* \return Prediction result
*/
double PredictOneLine(const std::vector<std::pair<int, double>>& features) {
const int tid = omp_get_thread_num();
// init feature value
std::memset(features_[tid], 0, sizeof(double)*num_features_);
// put feature value
for (const auto& p : features) {
if (p.first < num_features_) {
features_[tid][p.first] = p.second;
}
}
// get result with sigmoid transform
return boosting_->Predict(features_[tid]);
}
/*!
* \brief prediction for a data, and save result
* \param data_filename Filename of data
* \param has_label True if this data contains label
* \param result_filename Filename of output result
*/
void Predict(const char* data_filename, bool has_label, const char* result_filename) {
FILE* result_file;
#ifdef _MSC_VER
fopen_s(&result_file, result_filename, "w");
#else
result_file = fopen(result_filename, "w");
#endif
if (result_file == NULL) {
Log::Stderr("predition result file %s doesn't exists", data_filename);
}
Parser* parser = Parser::CreateParser(data_filename);
if (parser == nullptr) {
Log::Stderr("can regonise input data format, filename %s", data_filename);
}
// function for parse data
std::function<void(const char*, std::vector<std::pair<int, double>>*)> parser_fun;
double tmp_label;
if (has_label) {
// parse function with label
parser_fun = [this, &parser, &tmp_label]
(const char* buffer, std::vector<std::pair<int, double>>* feature) {
parser->ParseOneLine(buffer, feature, &tmp_label);
};
Log::Stdout("start prediction for data %s, and data has label", data_filename);
} else {
// parse function without label
parser_fun = [this, &parser]
(const char* buffer, std::vector<std::pair<int, double>>* feature) {
parser->ParseOneLine(buffer, feature);
};
Log::Stdout("start prediction for data %s, and data doesn't has label", data_filename);
}
std::function<double(const std::vector<std::pair<int, double>>&)> predict_fun;
if (is_simgoid_) {
predict_fun = [this](const std::vector<std::pair<int, double>>& features) {
return PredictOneLine(features);
};
} else {
predict_fun = [this](const std::vector<std::pair<int, double>>& features) {
return PredictRawOneLine(features);
};
}
std::function<void(data_size_t, const std::vector<std::string>&)> process_fun =
[this, &parser_fun, &predict_fun, &result_file]
(data_size_t, const std::vector<std::string>& lines) {
std::vector<std::pair<int, double>> oneline_features;
std::vector<double> pred_result(lines.size(), 0.0f);
#pragma omp parallel for schedule(static) private(oneline_features)
for (data_size_t i = 0; i < static_cast<data_size_t>(lines.size()); i++) {
oneline_features.clear();
// parser
parser_fun(lines[i].c_str(), &oneline_features);
// predict
pred_result[i] = predict_fun(oneline_features);
}
for (size_t i = 0; i < pred_result.size(); ++i) {
fprintf(result_file, "%f\n", pred_result[i]);
}
};
TextReader<data_size_t> predict_data_reader(data_filename);
predict_data_reader.ReadAllAndProcessParallel(process_fun);
fclose(result_file);
delete parser;
}
private:
/*! \brief Boosting model */
const Boosting* boosting_;
/*! \brief Buffer for feature values */
double** features_;
/*! \brief Number of features */
int num_features_;
/*! \brief True if need to predict result with sigmoid transform */
bool is_simgoid_;
/*! \brief Number of threads */
int num_threads_;
};
} // namespace LightGBM
#endif #endif // LightGBM_PREDICTOR_HPP_
#include <LightGBM/boosting.h>
#include "gbdt.h"
namespace LightGBM {
// Factory for boosting objects; GBDT is the only implementation for now.
// Returns nullptr for an unknown type.
Boosting* Boosting::CreateBoosting(BoostingType type,
  const BoostingConfig* config) {
  switch (type) {
  case BoostingType::kGBDT:
    return new GBDT(config);
  default:
    return nullptr;
  }
}
} // namespace LightGBM
#include "gbdt.h"

#include <LightGBM/utils/common.h>
#include <LightGBM/feature.h>
#include <LightGBM/metric.h>
#include <LightGBM/objective_function.h>

#include <chrono>
#include <cmath>
#include <cstdio>
#include <ctime>
#include <sstream>
#include <string>
#include <vector>
namespace LightGBM {
// Constructor: all owned pointers start null; buffers are allocated in Init().
GBDT::GBDT(const BoostingConfig* config)
  : tree_learner_(nullptr), train_score_updater_(nullptr),
  gradients_(nullptr), hessians_(nullptr),
  out_of_bag_data_indices_(nullptr), bag_data_indices_(nullptr) {
  // keep a typed view of the boosting configuration
  gbdt_config_ = dynamic_cast<const GBDTConfig*>(config);
  max_feature_idx_ = 0;
}
// Destructor: release everything this object owns.
// delete / delete[] on a null pointer is a no-op, so no null checks needed.
GBDT::~GBDT() {
  delete tree_learner_;
  delete[] gradients_;
  delete[] hessians_;
  delete[] out_of_bag_data_indices_;
  delete[] bag_data_indices_;
  for (auto& tree : models_) {
    delete tree;
  }
  delete train_score_updater_;
  for (auto& score_tracker : valid_score_updater_) {
    delete score_tracker;
  }
}
// Initialize training state: tree learner, score updater, gradient buffers,
// bagging buffers, RNG and the output model file.
void GBDT::Init(const Dataset* train_data, const ObjectiveFunction* object_function,
  const std::vector<const Metric*>& training_metrics, const char* output_model_filename) {
  train_data_ = train_data;
  // create tree learner
  tree_learner_ =
    TreeLearner::CreateTreeLearner(gbdt_config_->tree_learner_type, gbdt_config_->tree_config);
  // init tree learner
  tree_learner_->Init(train_data_);
  object_function_ = object_function;
  // push training metrics
  for (const auto& metric : training_metrics) {
    training_metrics_.push_back(metric);
  }
  // create score tracker
  train_score_updater_ = new ScoreUpdater(train_data_);
  num_data_ = train_data_->num_data();
  // create buffer for gradients and hessians
  gradients_ = new score_t[num_data_];
  hessians_ = new score_t[num_data_];
  // get max feature index
  for (int i = 0; i < train_data->num_features(); ++i) {
    max_feature_idx_ = Common::Max<int>(max_feature_idx_,
      train_data->FeatureAt(i)->feature_index());
  }
  // if need bagging, create buffer
  if (gbdt_config_->bagging_fraction < 1.0 && gbdt_config_->bagging_freq > 0) {
    out_of_bag_data_indices_ = new data_size_t[num_data_];
    bag_data_indices_ = new data_size_t[num_data_];
  } else {
    out_of_bag_data_cnt_ = 0;
    out_of_bag_data_indices_ = nullptr;
    bag_data_cnt_ = num_data_;
    bag_data_indices_ = nullptr;
  }
  // initialize random generator
  random_ = Random(gbdt_config_->bagging_seed);
  // open model output file
#ifdef _MSC_VER
  fopen_s(&output_model_file, output_model_filename, "w");
#else
  output_model_file = fopen(output_model_filename, "w");
#endif
  // fail loudly instead of crashing on fprintf(NULL, ...) below / in Train()
  if (output_model_file == NULL) {
    Log::Stderr("cannot create output model file %s", output_model_filename);
    return;
  }
  // write the (currently empty) model header
  fprintf(output_model_file, "%s", this->ModelsToString().c_str());
}
// Register a validation data set: one score updater plus its metric list.
void GBDT::AddDataset(const Dataset* valid_data,
  const std::vector<const Metric*>& valid_metrics) {
  valid_score_updater_.push_back(new ScoreUpdater(valid_data));
  // copy-construct the per-dataset metric vector from the given range
  valid_metrics_.emplace_back(valid_metrics.begin(), valid_metrics.end());
}
// Re-sample the in-bag / out-of-bag partition for this iteration.
// Only runs when bagging buffers were allocated in Init() and the iteration
// is on the bagging schedule. The exact sequence of random_.NextDouble()
// calls defines reproducibility for a given bagging_seed.
void GBDT::Bagging(int iter) {
  // if need bagging
  if (out_of_bag_data_indices_ != nullptr && iter % gbdt_config_->bagging_freq == 0) {
    // if doesn't have query data
    if (train_data_->metadata().query_boundaries() == nullptr) {
      bag_data_cnt_ =
        static_cast<data_size_t>(gbdt_config_->bagging_fraction * num_data_);
      out_of_bag_data_cnt_ = num_data_ - bag_data_cnt_;
      data_size_t cur_left_cnt = 0;
      data_size_t cur_right_cnt = 0;
      // random bagging, minimal unit is one record.
      // one-pass sampling: keep record i with probability
      // (#still needed) / (#still available), which selects exactly
      // bag_data_cnt_ records uniformly without replacement
      for (data_size_t i = 0; i < num_data_; ++i) {
        double prob =
          (bag_data_cnt_ - cur_left_cnt) / static_cast<double>(num_data_ - i);
        if (random_.NextDouble() < prob) {
          bag_data_indices_[cur_left_cnt++] = i;
        } else {
          out_of_bag_data_indices_[cur_right_cnt++] = i;
        }
      }
    } else {
      // if have query data: sample whole queries so ranking groups stay intact
      const data_size_t* query_boundaries = train_data_->metadata().query_boundaries();
      data_size_t num_query = train_data_->metadata().num_queries();
      data_size_t bag_query_cnt =
        static_cast<data_size_t>(num_query * gbdt_config_->bagging_fraction);
      data_size_t cur_left_query_cnt = 0;
      data_size_t cur_left_cnt = 0;
      data_size_t cur_right_cnt = 0;
      // random bagging, minimal unit is one query
      for (data_size_t i = 0; i < num_query; ++i) {
        double prob =
          (bag_query_cnt - cur_left_query_cnt) / static_cast<double>(num_query - i);
        if (random_.NextDouble() < prob) {
          // all records of the selected query go in-bag
          for (data_size_t j = query_boundaries[i]; j < query_boundaries[i + 1]; ++j) {
            bag_data_indices_[cur_left_cnt++] = j;
          }
          cur_left_query_cnt++;
        } else {
          for (data_size_t j = query_boundaries[i]; j < query_boundaries[i + 1]; ++j) {
            out_of_bag_data_indices_[cur_right_cnt++] = j;
          }
        }
      }
      bag_data_cnt_ = cur_left_cnt;
      out_of_bag_data_cnt_ = num_data_ - bag_data_cnt_;
    }
    Log::Stdout("re-bagging, using %d data to train", bag_data_cnt_);
    // set bagging data to tree learner
    tree_learner_->SetBaggingData(bag_data_indices_, bag_data_cnt_);
  }
}
// Refresh scores of out-of-bag records: they were skipped during training,
// but must be up to date before the next re-bagging.
void GBDT::UpdateScoreOutOfBag(const Tree* tree) {
  if (out_of_bag_data_indices_ == nullptr) {
    return;  // bagging disabled, nothing to update
  }
  train_score_updater_->AddScore(tree, out_of_bag_data_indices_, out_of_bag_data_cnt_);
}
void GBDT::Train() {
// training start time
auto start_time = std::chrono::high_resolution_clock::now();
for (int iter = 0; iter < gbdt_config_->num_iterations; ++iter) {
// boosting first
Boosting();
// bagging logic
Bagging(iter);
// train a new tree
Tree * new_tree = TrainOneTree();
// if cannon learn a new tree, stop
if (new_tree->num_leaves() <= 1) {
Log::Stdout("Cannot do any boosting for tree cannot split");
break;
}
// Shrinkage by learning rate
new_tree->Shrinkage(gbdt_config_->learning_rate);
// update score
UpdateScore(new_tree);
UpdateScoreOutOfBag(new_tree);
// print message for metric
OutputMetric(iter + 1);
// add model
models_.push_back(new_tree);
// write model to file on every iteration
fprintf(output_model_file, "Tree=%d\n", iter);
fprintf(output_model_file, "%s\n", new_tree->ToString().c_str());
fflush(output_model_file);
auto end_time = std::chrono::high_resolution_clock::now();
// output used time on each iteration
Log::Stdout("%f seconds elapsed, finished %d iteration", std::chrono::duration<double,
std::milli>(end_time - start_time) * 1e-3, iter + 1);
}
// close file
fclose(output_model_file);
}
// Train one tree on the pre-computed first/second order derivative buffers.
Tree* GBDT::TrainOneTree() {
  Tree* trained_tree = tree_learner_->Train(gradients_, hessians_);
  return trained_tree;
}
void GBDT::UpdateScore(const Tree* tree) {
// update training score
train_score_updater_->AddScore(tree_learner_);
// update validation score
for (auto& score_tracker : valid_score_updater_) {
score_tracker->AddScore(tree);
}
}
void GBDT::OutputMetric(int iter) {
// print training metric
for (auto& sub_metric : training_metrics_) {
sub_metric->Print(iter, train_score_updater_->score());
}
// print validation metric
for (size_t i = 0; i < valid_metrics_.size(); ++i) {
for (auto& sub_metric : valid_metrics_[i]) {
sub_metric->Print(iter, valid_score_updater_[i]->score());
}
}
}
void GBDT::Boosting() {
// objective function will calculation gradients and hessians
object_function_->
GetGradients(train_score_updater_->score(), gradients_, hessians_);
}
std::string GBDT::ModelsToString() const {
// serialize this object to string
std::stringstream ss;
// output max_feature_idx
ss << "max_feature_idx=" << max_feature_idx_ << std::endl;
// output sigmoid parameter
ss << "sigmoid=" << object_function_->GetSigmoid() << std::endl;
ss << std::endl;
// output tree models
for (size_t i = 0; i < models_.size(); ++i) {
ss << "Tree=" << i << std::endl;
ss << models_[i]->ToString() << std::endl;
}
return ss.str();
}
void GBDT::ModelsFromString(const std::string& model_str, int num_used_model) {
// use serialized string to restore this object
models_.clear();
std::vector<std::string> lines = Common::Split(model_str.c_str(), '\n');
size_t i = 0;
// get max_feature_idx first
while (i < lines.size()) {
size_t find_pos = lines[i].find("max_feature_idx=");
if (find_pos != std::string::npos) {
std::vector<std::string> strs = Common::Split(lines[i].c_str(), '=');
Common::Atoi(strs[1].c_str(), &max_feature_idx_);
++i;
break;
} else {
++i;
}
}
if (i == lines.size()) {
Log::Stderr("The model doesn't contain max_feature_idx");
return;
}
// get sigmoid parameter
i = 0;
while (i < lines.size()) {
size_t find_pos = lines[i].find("sigmoid=");
if (find_pos != std::string::npos) {
std::vector<std::string> strs = Common::Split(lines[i].c_str(), '=');
Common::Atof(strs[1].c_str(), &sigmoid_);
++i;
break;
} else {
++i;
}
}
// if sigmoid doesn't exists
if (i == lines.size()) {
sigmoid_ = -1.0;
}
// get tree models
i = 0;
while (i < lines.size()) {
size_t find_pos = lines[i].find("Tree=");
if (find_pos != std::string::npos) {
++i;
int start = static_cast<int>(i);
while (i < lines.size() && lines[i].find("Tree=") == std::string::npos) { ++i; }
int end = static_cast<int>(i);
std::string tree_str = Common::Join(lines, start, end, '\n');
models_.push_back(new Tree(tree_str));
if (num_used_model > 0 && models_.size() >= num_used_model) {
break;
}
} else {
++i;
}
}
Log::Stdout("Loaded %d modles\n", models_.size());
}
double GBDT::PredictRaw(const double* value) const {
double ret = 0.0;
for (size_t i = 0; i < models_.size(); ++i) {
ret += models_[i]->Predict(value);
}
return ret;
}
double GBDT::Predict(const double* value) const {
double ret = 0.0;
for (size_t i = 0; i < models_.size(); ++i) {
ret += models_[i]->Predict(value);
}
// if need sigmoid transform
if (sigmoid_ > 0) {
ret = 1.0 / (1.0 + std::exp(-sigmoid_ * ret));
}
return ret;
}
} // namespace LightGBM
#ifndef LIGHTGBM_BOOSTING_GBDT_H_
#define LIGHTGBM_BOOSTING_GBDT_H_
#include <LightGBM/boosting.h>
#include "score_updater.hpp"
#include <cstdio>
#include <vector>
#include <string>
namespace LightGBM {
/*!
* \brief GBDT algorithm implementation, including training, prediction and bagging.
*/
class GBDT: public Boosting {
public:
  /*!
  * \brief Constructor
  * \param config Config of GBDT
  */
  explicit GBDT(const BoostingConfig* config);
  /*!
  * \brief Destructor
  */
  ~GBDT();
  /*!
  * \brief Initialization logic
  * \param train_data Training data
  * \param object_function Training objective function
  * \param training_metrics Training metrics
  * \param output_model_filename Filename of output model
  */
  void Init(const Dataset* train_data, const ObjectiveFunction* object_function,
    const std::vector<const Metric*>& training_metrics,
    const char* output_model_filename)
    override;
  /*!
  * \brief Add a validation data set
  * \param valid_data Validation data
  * \param valid_metrics Metrics for this validation data
  */
  void AddDataset(const Dataset* valid_data,
    const std::vector<const Metric*>& valid_metrics) override;
  /*!
  * \brief Run the training loop over all boosting iterations
  */
  void Train() override;
  /*!
  * \brief Prediction for one record, without sigmoid transform
  * \param feature_values Feature values of this record
  * \return Prediction result for this record
  */
  double PredictRaw(const double * feature_values) const override;
  /*!
  * \brief Prediction for one record, with sigmoid transform if needed
  * \param feature_values Feature values of this record
  * \return Prediction result for this record
  */
  double Predict(const double * feature_values) const override;
  /*!
  * \brief Serialize the model to a string
  * \return String representation of the trained model
  */
  std::string ModelsToString() const override;
  /*!
  * \brief Restore the model from a serialized string
  * \param model_str The string of model
  * \param num_used_model If > 0, load at most this many leading trees
  */
  void ModelsFromString(const std::string& model_str, int num_used_model) override;
  /*!
  * \brief Get max feature index of this model
  * \return Max feature index of this model
  */
  inline int MaxFeatureIdx() const override { return max_feature_idx_; }
  /*!
  * \brief Get number of weak sub-models (trees)
  * \return Number of weak sub-models
  */
  inline int NumberOfSubModels() const override { return static_cast<int>(models_.size()); }
private:
  /*!
  * \brief Implement bagging logic
  * \param iter Current iteration
  */
  void Bagging(int iter);
  /*!
  * \brief Update score for out-of-bag data.
  * Necessary because data may be re-bagged during training.
  * \param tree Trained tree of this iteration
  */
  void UpdateScoreOutOfBag(const Tree* tree);
  /*!
  * \brief Calculate gradients/hessians via the objective function
  */
  void Boosting();
  /*!
  * \brief Train one tree
  * \return Trained tree of this iteration
  */
  Tree* TrainOneTree();
  /*!
  * \brief Update scores after a tree was trained
  * \param tree Trained tree of this iteration
  */
  void UpdateScore(const Tree* tree);
  /*!
  * \brief Print metric result of the current iteration
  * \param iter Current iteration
  */
  void OutputMetric(int iter);
  /*! \brief Pointer to training data */
  const Dataset* train_data_;
  /*! \brief Config of gbdt */
  const GBDTConfig* gbdt_config_;
  /*! \brief Tree learner, will use this class to learn trees */
  TreeLearner* tree_learner_;
  /*! \brief Objective function */
  const ObjectiveFunction* object_function_;
  /*! \brief Store and update training data's score */
  ScoreUpdater* train_score_updater_;
  /*! \brief Metrics for training data */
  std::vector<const Metric*> training_metrics_;
  /*! \brief Store and update validation data's scores */
  std::vector<ScoreUpdater*> valid_score_updater_;
  /*! \brief Metric for validation data */
  std::vector<std::vector<const Metric*>> valid_metrics_;
  /*! \brief Trained models(trees) */
  std::vector<Tree*> models_;
  /*! \brief Max feature index of training data */
  int max_feature_idx_;
  /*! \brief First order derivative of training data */
  score_t* gradients_;
  /*! \brief Second order derivative of training data */
  score_t* hessians_;
  /*! \brief Store the data indices of out-of-bag */
  data_size_t* out_of_bag_data_indices_;
  /*! \brief Number of out-of-bag data */
  data_size_t out_of_bag_data_cnt_;
  /*! \brief Store the indices of in-bag data */
  data_size_t* bag_data_indices_;
  /*! \brief Number of in-bag data */
  data_size_t bag_data_cnt_;
  /*! \brief Number of training data */
  data_size_t num_data_;
  /*! \brief Random generator, used for bagging */
  Random random_;
  /*! \brief File handle the model is written to during training (opened in Init) */
  FILE * output_model_file;
  /*!
  * \brief Sigmoid parameter, used for prediction.
  * If > 0, output scores are transformed by the sigmoid function
  */
  double sigmoid_;
};
} // namespace LightGBM
#endif #endif // LightGBM_BOOSTING_GBDT_H_
#ifndef LIGHTGBM_BOOSTING_SCORE_UPDATER_HPP_
#define LIGHTGBM_BOOSTING_SCORE_UPDATER_HPP_
#include <LightGBM/meta.h>
#include <LightGBM/dataset.h>
#include <LightGBM/tree.h>
#include <LightGBM/tree_learner.h>
#include <cstring>
namespace LightGBM {
/*!
* \brief Used to store and update scores for a data set
*/
class ScoreUpdater {
public:
  /*!
  * \brief Constructor
  * \param data Data set this updater is bound to (not owned)
  */
  explicit ScoreUpdater(const Dataset* data)
    :data_(data) {
    num_data_ = data->num_data();
    score_ = new score_t[num_data_];
    // default start score is zero
    std::memset(score_, 0, sizeof(score_t)*num_data_);
    const score_t* init_score = data->metadata().init_score();
    // if an initial score exists, start from it
    if (init_score != nullptr) {
      for (data_size_t i = 0; i < num_data_; ++i) {
        score_[i] = init_score[i];
      }
    }
  }
  /*! \brief Destructor */
  ~ScoreUpdater() {
    delete[] score_;
  }
  // This class owns score_ through a raw pointer: forbid copying so the
  // implicitly generated copy operations cannot cause a double free.
  ScoreUpdater(const ScoreUpdater&) = delete;
  ScoreUpdater& operator=(const ScoreUpdater&) = delete;
  /*!
  * \brief Use a tree model to predict and add to the score of all data.
  * Note: generally used for validation data.
  * \param tree Trained tree model
  */
  inline void AddScore(const Tree* tree) {
    tree->AddPredictionToScore(data_, num_data_, score_);
  }
  /*!
  * \brief Add prediction score, only used for training data.
  * After a tree is trained, the training data is already partitioned into
  * its leaves, so the learner can add predictions much faster.
  * \param tree_learner Learner that holds the leaf partition
  */
  inline void AddScore(const TreeLearner* tree_learner) {
    tree_learner->AddPredictionToScore(score_);
  }
  /*!
  * \brief Like AddScore(const Tree*), but only for a subset of the data.
  * Used to score the out-of-bag part of the training data.
  * \param tree Trained tree model
  * \param data_indices Indices of the records to process
  * \param data_cnt Number of records to process
  */
  inline void AddScore(const Tree* tree, const data_size_t* data_indices,
    data_size_t data_cnt) {
    tree->AddPredictionToScore(data_, data_indices, data_cnt, score_);
  }
  /*! \brief Pointer to the current scores */
  inline const score_t * score() { return score_; }
private:
  /*! \brief Number of total data */
  data_size_t num_data_;
  /*! \brief Pointer to the bound data set (not owned) */
  const Dataset* data_;
  /*! \brief Scores for the data set (owned) */
  score_t* score_;
};
} // namespace LightGBM
#endif #endif // LightGBM_BOOSTING_SCORE_UPDATER_HPP_
#include <LightGBM/bin.h>
#include "dense_bin.hpp"
#include "sparse_bin.hpp"
#include <cmath>
#include <cstring>
#include <cstdint>
#include <limits>
#include <vector>
#include <algorithm>
namespace LightGBM {
// Default constructor: no bounds until FindBin()/CopyFrom() fills them in.
BinMapper::BinMapper() {
  bin_upper_bound_ = nullptr;
}
// Copy constructor: deep-copies the bound array so both mappers
// own independent storage.
BinMapper::BinMapper(const BinMapper& other)
  : bin_upper_bound_(nullptr) {
  num_bin_ = other.num_bin_;
  is_trival_ = other.is_trival_;
  sparse_rate_ = other.sparse_rate_;
  bin_upper_bound_ = new double[num_bin_];
  std::copy(other.bin_upper_bound_, other.bin_upper_bound_ + num_bin_, bin_upper_bound_);
}
// Deserializing constructor: restore state from a buffer written by CopyTo().
BinMapper::BinMapper(const void* memory)
  : bin_upper_bound_(nullptr) {
  const char* mem_ptr = reinterpret_cast<const char*>(memory);
  CopyFrom(mem_ptr);
}
// Destructor: bin_upper_bound_ is either nullptr or owned by this object.
BinMapper::~BinMapper() {
  delete[] bin_upper_bound_;
}
// Build the bin boundaries for one feature from a sample of its values.
// Sets num_bin_, bin_upper_bound_, is_trival_ and sparse_rate_.
// `values` is sorted in place.
void BinMapper::FindBin(std::vector<double>* values, int max_bin) {
  size_t sample_size = values->size();
  // guard: an empty sample previously indexed (*values)[0] (undefined
  // behavior); produce a single trivial bin instead
  if (sample_size == 0) {
    num_bin_ = 1;
    bin_upper_bound_ = new double[1];
    bin_upper_bound_[0] = std::numeric_limits<double>::infinity();
    is_trival_ = true;
    sparse_rate_ = 0.0;
    return;
  }
  // find distinct values and their counts (RAII buffers, no manual delete)
  std::vector<double> distinct_values(sample_size);
  std::vector<int> counts(sample_size);
  int num_values = 1;
  std::sort(values->begin(), values->end());
  distinct_values[0] = (*values)[0];
  counts[0] = 1;
  for (size_t i = 1; i < values->size(); ++i) {
    if ((*values)[i] != (*values)[i - 1]) {
      distinct_values[num_values] = (*values)[i];
      counts[num_values] = 1;
      ++num_values;
    } else {
      ++counts[num_values - 1];
    }
  }
  int cnt_in_bin0 = 0;
  if (num_values <= max_bin) {
    // few enough distinct values: one bin per distinct value
    num_bin_ = num_values;
    bin_upper_bound_ = new double[num_values];
    for (int i = 0; i < num_values - 1; ++i) {
      // boundary is the midpoint between adjacent distinct values
      bin_upper_bound_[i] = (distinct_values[i] + distinct_values[i + 1]) / 2;
    }
    cnt_in_bin0 = counts[0];
    bin_upper_bound_[num_values - 1] = std::numeric_limits<double>::infinity();
  } else {
    // need to merge distinct values into at most max_bin bins
    num_bin_ = max_bin;
    bin_upper_bound_ = new double[max_bin];
    std::vector<double> bin_lower_bound(max_bin);
    // mean size for one bin, re-estimated after every split
    double mean_bin_size = sample_size / static_cast<double>(max_bin);
    int rest_sample_cnt = static_cast<int>(sample_size);
    int cur_cnt_inbin = 0;
    int bin_cnt = 0;
    bin_lower_bound[0] = distinct_values[0];
    for (int i = 0; i < num_values - 1; ++i) {
      rest_sample_cnt -= counts[i];
      cur_cnt_inbin += counts[i];
      // close the current bin when it is full; the bin_cnt < max_bin - 1
      // guard keeps the last bin open for all remaining values (and fixes
      // an out-of-range write to bin_lower_bound / division by zero that
      // could occur when every iteration triggered a split)
      if (cur_cnt_inbin >= mean_bin_size && bin_cnt < max_bin - 1) {
        bin_upper_bound_[bin_cnt] = distinct_values[i];
        if (bin_cnt == 0) { cnt_in_bin0 = cur_cnt_inbin; }
        ++bin_cnt;
        bin_lower_bound[bin_cnt] = distinct_values[i + 1];
        cur_cnt_inbin = 0;
        mean_bin_size = rest_sample_cnt / static_cast<double>(max_bin - bin_cnt);
      }
    }
    cur_cnt_inbin += counts[num_values - 1];
    // final bound of each closed bin: midpoint between its last value
    // and the next bin's first value
    for (int i = 0; i < bin_cnt; ++i) {
      bin_upper_bound_[i] = (bin_upper_bound_[i] + bin_lower_bound[i + 1]) / 2.0;
    }
    // last bin is unbounded above
    bin_upper_bound_[bin_cnt] = std::numeric_limits<double>::infinity();
    ++bin_cnt;
    // shrink the array if fewer bins were produced than requested
    if (bin_cnt < max_bin) {
      double* tmp_bin_upper_bound = bin_upper_bound_;
      num_bin_ = bin_cnt;
      bin_upper_bound_ = new double[num_bin_];
      for (int i = 0; i < num_bin_; ++i) {
        bin_upper_bound_[i] = tmp_bin_upper_bound[i];
      }
      delete[] tmp_bin_upper_bound;
    }
  }
  // trivial feature: everything falls into one bin
  is_trival_ = (num_bin_ <= 1);
  // sparse rate = fraction of samples in the first bin
  sparse_rate_ = static_cast<double>(cnt_in_bin0) / static_cast<double>(sample_size);
}
// Serialized size of a mapper with `bin` bins:
// num_bin_(int) + is_trival_(bool) + sparse_rate_(double) + one double per bin.
int BinMapper::SizeForSpecificBin(int bin) {
  return static_cast<int>(sizeof(int) + sizeof(bool) + sizeof(double)
    + bin * sizeof(double));
}
// Serialize into a caller-provided buffer; the field order must match
// CopyFrom() and SaveBinaryToFile().
void BinMapper::CopyTo(char * buffer) {
  char* cur = buffer;
  std::memcpy(cur, &num_bin_, sizeof(num_bin_));
  cur += sizeof(num_bin_);
  std::memcpy(cur, &is_trival_, sizeof(is_trival_));
  cur += sizeof(is_trival_);
  std::memcpy(cur, &sparse_rate_, sizeof(sparse_rate_));
  cur += sizeof(sparse_rate_);
  std::memcpy(cur, bin_upper_bound_, num_bin_ * sizeof(double));
}
// Deserialize from a buffer written by CopyTo(); field order must match.
void BinMapper::CopyFrom(const char * buffer) {
  const char* cur = buffer;
  std::memcpy(&num_bin_, cur, sizeof(num_bin_));
  cur += sizeof(num_bin_);
  std::memcpy(&is_trival_, cur, sizeof(is_trival_));
  cur += sizeof(is_trival_);
  std::memcpy(&sparse_rate_, cur, sizeof(sparse_rate_));
  cur += sizeof(sparse_rate_);
  // drop any previous bounds before allocating the restored array
  // (delete[] on nullptr is a no-op)
  delete[] bin_upper_bound_;
  bin_upper_bound_ = new double[num_bin_];
  std::memcpy(bin_upper_bound_, cur, num_bin_ * sizeof(double));
}
// Write the mapper to a binary file. The field order must match
// CopyTo()/CopyFrom(), and SizesInByte() must stay in sync with this layout.
void BinMapper::SaveBinaryToFile(FILE* file) const {
  fwrite(&num_bin_, sizeof(num_bin_), 1, file);
  fwrite(&is_trival_, sizeof(is_trival_), 1, file);
  fwrite(&sparse_rate_, sizeof(sparse_rate_), 1, file);
  fwrite(bin_upper_bound_, sizeof(double), num_bin_, file);
}
// Total serialized size: fixed header plus one double per bin.
// Mirrors the layout of CopyTo()/CopyFrom()/SaveBinaryToFile().
size_t BinMapper::SizesInByte() const {
  const size_t fixed_part = sizeof(num_bin_) + sizeof(is_trival_) + sizeof(sparse_rate_);
  return fixed_part + sizeof(double) * num_bin_;
}
// Explicit instantiations: the bin implementations are class templates
// defined in the included .hpp files; emit the uint8/uint16/uint32 variants
// (matching the widths chosen by CreateDenseBin/CreateSparseBin) in this TU.
template class DenseBin<uint8_t>;
template class DenseBin<uint16_t>;
template class DenseBin<uint32_t>;
template class SparseBin<uint8_t>;
template class SparseBin<uint16_t>;
template class SparseBin<uint32_t>;
template class OrderedSparseBin<uint8_t>;
template class OrderedSparseBin<uint16_t>;
template class OrderedSparseBin<uint32_t>;
// Choose a sparse or dense bin container for one feature.
// Sparse storage is used only when enabled and at least 80% of the sampled
// values fall into bin 0; *is_sparse reports the decision to the caller.
Bin* Bin::CreateBin(data_size_t num_data, int num_bin, double sparse_rate, bool is_enable_sparse, bool* is_sparse) {
  const double kSparseThreshold = 0.8;
  const bool use_sparse = is_enable_sparse && (sparse_rate >= kSparseThreshold);
  *is_sparse = use_sparse;
  return use_sparse ? CreateSparseBin(num_data, num_bin)
                    : CreateDenseBin(num_data, num_bin);
}
// Pick the narrowest unsigned element type that can hold bin ids [0, num_bin).
Bin* Bin::CreateDenseBin(data_size_t num_data, int num_bin) {
  if (num_bin <= 256) {
    return new DenseBin<uint8_t>(num_data);
  }
  if (num_bin <= 65536) {
    return new DenseBin<uint16_t>(num_data);
  }
  return new DenseBin<uint32_t>(num_data);
}
// Pick the narrowest unsigned element type that can hold bin ids [0, num_bin).
Bin* Bin::CreateSparseBin(data_size_t num_data, int num_bin) {
  if (num_bin <= 256) {
    return new SparseBin<uint8_t>(num_data);
  }
  if (num_bin <= 65536) {
    return new SparseBin<uint16_t>(num_data);
  }
  return new SparseBin<uint32_t>(num_data);
}
} // namespace LightGBM
#include <LightGBM/config.h>
#include <LightGBM/utils/common.h>
#include <LightGBM/utils/log.h>
#include <vector>
#include <string>
#include <unordered_map>
#include <algorithm>
namespace LightGBM {
// Top-level entry: parse all parameters into the main config and the
// network / io / boosting / objective / metric sub-configs, then validate.
void OverallConfig::Set(const std::unordered_map<std::string, std::string>& params) {
  // load main config types
  GetInt(params, "num_threads", &num_threads);
  GetTaskType(params);
  // prediction task: data has no label column by default
  if (task_type == TaskType::kPredict) {
    io_config.data_has_label = false;
  }
  GetBoostingType(params);
  GetObjectiveType(params);
  GetMetricType(params);
  // construct boosting configs
  // NOTE(review): boosting_config is only assigned for kGBDT. If an
  // unrecognized boosting type got past GetBoostingType, the
  // boosting_config->Set(params) call below would dereference an unset
  // pointer — confirm that Log::Stderr aborts on bad input.
  if (boosting_type == BoostingType::kGBDT) {
    boosting_config = new GBDTConfig();
  }
  // sub-config setup
  network_config.Set(params);
  io_config.Set(params);
  boosting_config->Set(params);
  objective_config.Set(params);
  metric_config.Set(params);
  // check for conflicts
  CheckParamConflict();
}
// Parse the "boosting_type" parameter (case-insensitive).
// "gbdt" and "gbrt" both select gradient boosted trees.
void OverallConfig::GetBoostingType(const std::unordered_map<std::string, std::string>& params) {
  std::string value;
  if (!GetString(params, "boosting_type", &value)) {
    return;  // keep the default when the key is absent
  }
  std::transform(value.begin(), value.end(), value.begin(), ::tolower);
  if (value == std::string("gbdt") || value == std::string("gbrt")) {
    boosting_type = BoostingType::kGBDT;
  } else {
    Log::Stderr("boosting type %s error", value.c_str());
  }
}
// Parse the "objective" parameter; objective names are matched lower-case.
void OverallConfig::GetObjectiveType(const std::unordered_map<std::string, std::string>& params) {
  std::string value;
  if (!GetString(params, "objective", &value)) {
    return;  // keep the default when the key is absent
  }
  std::transform(value.begin(), value.end(), value.begin(), ::tolower);
  objective_type = value;
}
// Parse the comma-separated "metric" parameter into metric_types,
// lower-cased and de-duplicated.
void OverallConfig::GetMetricType(const std::unordered_map<std::string, std::string>& params) {
  std::string value;
  if (GetString(params, "metric", &value)) {
    // clear old metrics
    metric_types.clear();
    // to lower
    std::transform(value.begin(), value.end(), value.begin(), ::tolower);
    // split
    std::vector<std::string> metrics = Common::Split(value.c_str(), ',');
    // de-duplicate while preserving the user-specified order.
    // (previously the metrics were pushed in unordered_map iteration order,
    // which made the metric output order nondeterministic)
    std::unordered_map<std::string, int> seen;
    for (auto& metric : metrics) {
      std::transform(metric.begin(), metric.end(), metric.begin(), ::tolower);
      if (seen.count(metric) <= 0) {
        seen[metric] = 1;
        metric_types.push_back(metric);
      }
    }
  }
}
// Parse the "task" parameter (case-insensitive): training vs prediction.
void OverallConfig::GetTaskType(const std::unordered_map<std::string, std::string>& params) {
  std::string value;
  if (!GetString(params, "task", &value)) {
    return;  // keep the default when the key is absent
  }
  std::transform(value.begin(), value.end(), value.begin(), ::tolower);
  if (value == std::string("train") || value == std::string("training")) {
    task_type = TaskType::kTrain;
  } else if (value == std::string("predict")
    || value == std::string("prediction")
    || value == std::string("test")) {
    task_type = TaskType::kPredict;
  } else {
    Log::Stderr("task type error");
  }
}
void OverallConfig::CheckParamConflict() {
if (network_config.num_machines > 1) {
is_parallel = true;
} else {
is_parallel = false;
dynamic_cast<GBDTConfig*>(boosting_config)->tree_learner_type =
TreeLearnerType::kSerialTreeLearner;
}
if (dynamic_cast<GBDTConfig*>(boosting_config)->tree_learner_type ==
TreeLearnerType::kSerialTreeLearner) {
is_parallel = false;
network_config.num_machines = 1;
}
if (dynamic_cast<GBDTConfig*>(boosting_config)->tree_learner_type ==
TreeLearnerType::kSerialTreeLearner ||
dynamic_cast<GBDTConfig*>(boosting_config)->tree_learner_type ==
TreeLearnerType::kFeatureParallelTreelearner) {
is_parallel_find_bin = false;
} else if (dynamic_cast<GBDTConfig*>(boosting_config)->tree_learner_type ==
TreeLearnerType::kDataParallelTreeLearner) {
is_parallel_find_bin = true;
}
if (task_type == TaskType::kTrain && io_config.data_has_label == false) {
Log::Stderr("Data should have label in training task");
}
}
// Parse all I/O related parameters.
void IOConfig::Set(const std::unordered_map<std::string, std::string>& params) {
  // binning / loading options
  GetInt(params, "max_bin", &max_bin);
  CHECK(max_bin > 0);
  GetInt(params, "data_random_seed", &data_random_seed);
  GetInt(params, "num_model_predict", &num_model_predict);
  // input data file is mandatory
  if (!GetString(params, "data", &data_filename)) {
    Log::Stderr("No training/prediction data, application quit");
  }
  // boolean switches
  GetBool(params, "data_has_label", &data_has_label);
  GetBool(params, "is_pre_partition", &is_pre_partition);
  GetBool(params, "is_enable_sparse", &is_enable_sparse);
  GetBool(params, "use_two_round_loading", &use_two_round_loading);
  GetBool(params, "is_save_binary_file", &is_save_binary_file);
  GetBool(params, "is_sigmoid", &is_sigmoid);
  // file paths
  GetString(params, "output_model", &output_model);
  GetString(params, "input_model", &input_model);
  GetString(params, "output_result", &output_result);
  GetString(params, "input_init_score", &input_init_score);
  // comma-separated list of validation data files
  std::string valid_str;
  if (GetString(params, "valid_data", &valid_str)) {
    valid_data_filenames = Common::Split(valid_str.c_str(), ',');
  }
}
// Parse all objective-function parameters.
void ObjectiveConfig::Set(const std::unordered_map<std::string, std::string>& params) {
  GetBool(params, "is_unbalance", &is_unbalance);
  GetDouble(params, "sigmoid", &sigmoid);
  GetInt(params, "max_position", &max_position);
  CHECK(max_position > 0);
  // relevance-label gains for ranking: user supplied or default 2^i - 1
  std::string gain_str;
  if (GetString(params, "label_gain", &gain_str)) {
    label_gain = Common::StringToDoubleArray(gain_str, ',');
  } else {
    // default gain = 2^i - 1, capped at 31 labels so (1 << i) cannot overflow
    const int max_label = 31;
    label_gain.push_back(0.0);
    for (int i = 1; i < max_label; ++i) {
      label_gain.push_back(static_cast<double>((1 << i) - 1));
    }
  }
}
void MetricConfig::Set(const std::unordered_map<std::string, std::string>& params) {
GetInt(params, "metric_freq", &output_freq);
CHECK(output_freq >= 0);
GetDouble(params, "sigmoid", &sigmoid);
GetBool(params, "is_training_metric", &is_provide_training_metric);
std::string tmp_str = "";
if (GetString(params, "label_gain", &tmp_str)) {
label_gain = Common::StringToDoubleArray(tmp_str, ',');
} else {
// label_gain = 2^i - 1, may overflow, so we use 31 here
const int max_label = 31;
label_gain.push_back(0.0);
for (int i = 1; i < max_label; ++i) {
label_gain.push_back((1 << i) - 1);
}
}
if (GetString(params, "ndcg_eval_at", &tmp_str)) {
eval_at = Common::StringToIntArray(tmp_str, ',');
std::sort(eval_at.begin(), eval_at.end());
for (size_t i = 0; i < eval_at.size(); ++i) {
CHECK(eval_at[i] > 0);
}
} else {
// default eval ndcg @[1-5]
for (int i = 1; i <= 5; ++i) {
eval_at.push_back(i);
}
}
}
/*!
* \brief Parse tree-growth parameters out of the key/value map and
*        validate their ranges. CHECK aborts on the first invalid value,
*        so the read/validate order here is part of the observable behavior.
*/
void TreeConfig::Set(const std::unordered_map<std::string, std::string>& params) {
  // minimum number of records a leaf may hold
  GetInt(params, "min_data_in_leaf", &min_data_in_leaf);
  CHECK(min_data_in_leaf > 0);
  // minimum total hessian a leaf may hold (regularization on leaf size)
  GetDouble(params, "min_sum_hessian_in_leaf", &min_sum_hessian_in_leaf);
  CHECK(min_sum_hessian_in_leaf >= 0.0);
  GetInt(params, "num_leaves", &num_leaves);
  CHECK(num_leaves > 0);
  GetInt(params, "feature_fraction_seed", &feature_fraction_seed);
  // fraction of features sampled per tree; must be in (0, 1]
  GetDouble(params, "feature_fraction", &feature_fraction);
  CHECK(feature_fraction > 0.0 && feature_fraction <= 1.0);
}
/*!
* \brief Parse shared boosting parameters (iterations, bagging, learning
*        rate) from the key/value map and validate them. CHECK aborts on
*        the first invalid value, so validation order matters.
*/
void BoostingConfig::Set(const std::unordered_map<std::string, std::string>& params) {
  GetInt(params, "num_iterations", &num_iterations);
  CHECK(num_iterations >= 0);
  GetInt(params, "bagging_seed", &bagging_seed);
  // bagging_freq == 0 disables bagging entirely
  GetInt(params, "bagging_freq", &bagging_freq);
  CHECK(bagging_freq >= 0);
  // fraction of data sampled when bagging; must be in (0, 1]
  GetDouble(params, "bagging_fraction", &bagging_fraction);
  CHECK(bagging_fraction > 0.0 && bagging_fraction <= 1.0);
  GetDouble(params, "learning_rate", &learning_rate);
  CHECK(learning_rate > 0.0);
}
void GBDTConfig::GetTreeLearnerType(const std::unordered_map<std::string, std::string>& params) {
  // when "tree_learner" is absent the current value is left untouched
  std::string value;
  if (!GetString(params, "tree_learner", &value)) {
    return;
  }
  // match case-insensitively
  std::transform(value.begin(), value.end(), value.begin(), ::tolower);
  if (value == "serial") {
    tree_learner_type = TreeLearnerType::kSerialTreeLearner;
  } else if (value == "feature" || value == "feature_parallel") {
    tree_learner_type = TreeLearnerType::kFeatureParallelTreelearner;
  } else if (value == "data" || value == "data_parallel") {
    tree_learner_type = TreeLearnerType::kDataParallelTreeLearner;
  } else {
    // unrecognized learner name is a hard error
    Log::Stderr("tree learner type error");
  }
}
/*!
* \brief Parse all GBDT parameters: base boosting settings, the tree
*        learner type, then the nested tree configuration. Each step reads
*        from the same key/value map.
*/
void GBDTConfig::Set(const std::unordered_map<std::string, std::string>& params) {
  // shared boosting parameters (iterations, bagging, learning rate)
  BoostingConfig::Set(params);
  // map the "tree_learner" string onto tree_learner_type
  GetTreeLearnerType(params);
  // tree-growth parameters (leaves, min data, feature fraction, ...)
  tree_config.Set(params);
}
/*!
* \brief Parse parallel-learning network parameters and validate them.
*        CHECK aborts on the first invalid value, so the order of the
*        read/validate pairs is part of the observable behavior.
*/
void NetworkConfig::Set(const std::unordered_map<std::string, std::string>& params) {
  GetInt(params, "num_machines", &num_machines);
  CHECK(num_machines >= 1);
  // TCP port this machine listens on for peer connections
  GetInt(params, "local_listen_port", &local_listen_port);
  CHECK(local_listen_port > 0);
  // socket timeout; units not visible here — presumably minutes or seconds, confirm against Network code
  GetInt(params, "time_out", &time_out);
  CHECK(time_out > 0);
  // file listing ip:port of all machines in the cluster
  GetString(params, "machine_list_file", &machine_list_filename);
}
} // namespace LightGBM
#include <LightGBM/dataset.h>
#include <LightGBM/feature.h>
#include <LightGBM/network.h>
#include <omp.h>
#include <cstdio>
#include <unordered_map>
#include <limits>
#include <vector>
#include <utility>
#include <string>
namespace LightGBM {
/*!
* \brief Construct a Dataset bound to a text data file. Decides up front
*        whether a ".bin" cache of the file exists and wires up either the
*        text-parsing path (parser + text reader) or the binary-loading
*        path. Actual data loading happens later in LoadTrainData /
*        LoadValidationData.
*/
Dataset::Dataset(const char* data_filename, const char* init_score_filename,
  int max_bin, int random_seed, bool is_enable_sparse, const PredictFunction& predict_fun)
  :data_filename_(data_filename), random_(random_seed),
  max_bin_(max_bin), is_enable_sparse_(is_enable_sparse), predict_fun_(predict_fun) {
  // sets is_loading_from_binfile_ based on whether "<data>.bin" can be opened
  CheckCanLoadFromBin();
  // initial-score prediction needs the raw text features, so the binary
  // fast path is disabled when a predict function is supplied
  if (is_loading_from_binfile_ && predict_fun != nullptr) {
    Log::Stdout("cannot perform initial prediction for binary file, will use text file instead");
    is_loading_from_binfile_ = false;
  }
  if (!is_loading_from_binfile_) {
    // load weight, query information and initialize score
    metadata_.Init(data_filename, init_score_filename);
    // create text parser
    parser_ = Parser::CreateParser(data_filename_);
    if (parser_ == nullptr) {
      Log::Stderr("cannot recognise input data format, filename: %s", data_filename_);
    }
    // create text reader
    text_reader_ = new TextReader<data_size_t>(data_filename);
  } else {
    // only need to load initial score; other meta data will load from the bin file
    metadata_.Init(init_score_filename);
    Log::Stdout("will load data set from binary file");
    parser_ = nullptr;
    text_reader_ = nullptr;
  }
}
/*!
* \brief Release everything the Dataset owns: the parser, the text reader
*        and all Feature objects.
*/
Dataset::~Dataset() {
  // delete on a null pointer is a no-op, so no null checks are needed
  delete parser_;
  delete text_reader_;
  for (auto& feature : features_) {
    delete feature;
  }
  features_.clear();
}
/*!
* \brief Read the text data file fully into the text reader's line buffer.
*        With one machine (or pre-partitioned data) every line is kept;
*        otherwise each machine keeps a random ~1/num_machines subset,
*        sampling whole queries at a time when query boundaries exist.
*        Sets num_data_ (local rows), global_num_data_ (total rows) and
*        used_data_indices_ (kept line indices, when partitioning).
* \param rank This machine's id in [0, num_machines)
* \param num_machines Number of machines in the cluster
* \param is_pre_partition True if the file already holds only this machine's share
*/
void Dataset::LoadDataToMemory(int rank, int num_machines, bool is_pre_partition) {
  used_data_indices_.clear();
  if (num_machines == 1 || is_pre_partition) {
    // read all lines
    num_data_ = text_reader_->ReadAllLines();
    global_num_data_ = num_data_;
  } else { // need partition data
    // get query data
    const data_size_t* query_boundaries = metadata_.query_boundaries();
    if (query_boundaries == nullptr) {
      // if not contain query data, minimal sample unit is one record:
      // keep each line with probability 1/num_machines
      global_num_data_ = text_reader_->ReadAndFilterLines([this, rank, num_machines](data_size_t) {
        if (random_.NextInt(0, num_machines) == rank) {
          return true;
        } else {
          return false;
        }
      }, &used_data_indices_);
    } else {
      // if contain query data, minimal sample unit is one query so that a
      // query's records never get split across machines
      data_size_t num_queries = metadata_.num_queries();
      // qid starts at -1: the first line triggers the boundary check
      // against query_boundaries[0] and bumps qid to 0
      data_size_t qid = -1;
      bool is_query_used = false;
      global_num_data_ = text_reader_->ReadAndFilterLines(
        [this, rank, num_machines, &qid, &query_boundaries, &is_query_used, num_queries]
      (data_size_t line_idx) {
        if (qid >= num_queries) {
          Log::Stderr("current query is exceed the range of query file, please ensure your query file is correct");
        }
        if (line_idx >= query_boundaries[qid + 1]) {
          // if is new query: re-draw whether this machine keeps it
          is_query_used = false;
          if (random_.NextInt(0, num_machines) == rank) {
            is_query_used = true;
          }
          ++qid;
        }
        return is_query_used;
      }, &used_data_indices_);
    }
    // set number of data
    num_data_ = static_cast<data_size_t>(used_data_indices_.size());
  }
}
void Dataset::SampleDataFromMemory(std::vector<std::string>* out_data) {
const size_t sample_cnt = static_cast<size_t>(num_data_ < 50000 ? num_data_ : 50000);
std::vector<size_t> sample_indices = random_.Sample(num_data_, sample_cnt);
out_data->clear();
for (size_t i = 0; i < sample_indices.size(); ++i) {
const size_t idx = sample_indices[i];
out_data->push_back(text_reader_->Lines()[idx]);
}
}
/*!
* \brief Two-round loading counterpart of LoadDataToMemory +
*        SampleDataFromMemory: stream the file, keep only this machine's
*        partition (whole queries when boundaries exist), and collect up to
*        50000 sampled lines into out_data without holding the full file in
*        memory. Sets num_data_, global_num_data_ and used_data_indices_.
* \param rank This machine's id in [0, num_machines)
* \param num_machines Number of machines in the cluster
* \param is_pre_partition True if the file already holds only this machine's share
* \param out_data Receives the sampled raw text lines
*/
void Dataset::SampleDataFromFile(int rank, int num_machines, bool is_pre_partition,
  std::vector<std::string>* out_data) {
  used_data_indices_.clear();
  const size_t sample_cnt = 50000;
  if (num_machines == 1 || is_pre_partition) {
    num_data_ = static_cast<data_size_t>(text_reader_->SampleFromFile(random_, sample_cnt, out_data));
    global_num_data_ = num_data_;
  } else { // need partition data
    // get query data
    const data_size_t* query_boundaries = metadata_.query_boundaries();
    if (query_boundaries == nullptr) {
      // if not contain query file, minimal sample unit is one record:
      // keep each line with probability 1/num_machines
      global_num_data_ = text_reader_->SampleAndFilterFromFile([this, rank, num_machines]
      (data_size_t) {
        if (random_.NextInt(0, num_machines) == rank) {
          return true;
        } else {
          return false;
        }
      }, &used_data_indices_, random_, sample_cnt, out_data);
    } else {
      // if contain query file, minimal sample unit is one query so a
      // query's records never get split across machines
      data_size_t num_queries = metadata_.num_queries();
      // qid starts at -1: the first line triggers the boundary check
      // against query_boundaries[0] and bumps qid to 0
      data_size_t qid = -1;
      bool is_query_used = false;
      global_num_data_ = text_reader_->SampleAndFilterFromFile(
        [this, rank, num_machines, &qid, &query_boundaries, &is_query_used, num_queries]
      (data_size_t line_idx) {
        if (qid >= num_queries) {
          Log::Stderr("current query is exceed the range of query file, \
                please ensure your query file is correct");
        }
        if (line_idx >= query_boundaries[qid + 1]) {
          // if is new query: re-draw whether this machine keeps it
          is_query_used = false;
          if (random_.NextInt(0, num_machines) == rank) {
            is_query_used = true;
          }
          ++qid;
        }
        return is_query_used;
      }, &used_data_indices_, random_, sample_cnt, out_data);
    }
    num_data_ = static_cast<data_size_t>(used_data_indices_.size());
  }
}
/*!
* \brief Build one BinMapper per raw feature from the sampled rows, then
*        keep only non-trivial features (more than one bin) as Feature
*        objects. With one machine every mapper is found locally; with
*        several machines each machine bins a disjoint slice of features
*        and the results are exchanged via Network::Allgather. Fills
*        used_feature_map_ (raw index -> used index or -1), features_ and
*        num_features_.
* \param rank This machine's id in [0, num_machines)
* \param num_machines Number of machines in the cluster
* \param sample_data Sampled raw text lines used to estimate bin boundaries
*/
void Dataset::ConstructBinMappers(int rank, int num_machines, const std::vector<std::string>& sample_data) {
  // sample_values[i][j], means the value of j-th sample on i-th feature
  std::vector<std::vector<double>> sample_values;
  // temp buffer for one line features and label
  std::vector<std::pair<int, double>> oneline_features;
  double label;
  for (size_t i = 0; i < sample_data.size(); ++i) {
    oneline_features.clear();
    // parse features
    parser_->ParseOneLine(sample_data[i].c_str(), &oneline_features, &label);
    // push 0 first, then edit the value according existing feature values
    for (auto& feature_values : sample_values) {
      feature_values.push_back(0.0);
    }
    for (std::pair<int, double>& inner_data : oneline_features) {
      if (static_cast<size_t>(inner_data.first) >= sample_values.size()) {
        // if need expand feature set (sparse input may reveal new feature
        // indices at any row)
        size_t need_size = inner_data.first - sample_values.size() + 1;
        for (size_t j = 0; j < need_size; ++j) {
          // new feature columns are back-filled with i+1 zeros, one per
          // row seen so far including the current one
          sample_values.emplace_back(i + 1, 0.0);
        }
      }
      // edit the feature value
      sample_values[inner_data.first][i] = inner_data.second;
    }
  }
  features_.clear();
  // -1 means doesn't use this feature
  used_feature_map_ = std::vector<int>(sample_values.size(), -1);
  // start find bins
  if (num_machines == 1) {
    std::vector<BinMapper*> bin_mappers(sample_values.size());
    // if only 1 machines, find bin locally
    #pragma omp parallel for schedule(guided)
    for (int i = 0; i < static_cast<int>(sample_values.size()); ++i) {
      bin_mappers[i] = new BinMapper();
      bin_mappers[i]->FindBin(&sample_values[i], max_bin_);
    }
    for (size_t i = 0; i < sample_values.size(); ++i) {
      if (!bin_mappers[i]->is_trival()) {
        // map real feature index to used feature index
        used_feature_map_[i] = static_cast<int>(features_.size());
        // push new feature; the Feature takes ownership of the mapper
        features_.push_back(new Feature(static_cast<int>(i), bin_mappers[i],
                                            num_data_, is_enable_sparse_));
      } else {
        // if feature is trival(only 1 bin), free spaces
        delete bin_mappers[i];
      }
    }
  } else {
    // if have multi-machines, need find bin distributed
    // different machines will find bin for different features
    // start and len will store the process feature indices for different machines
    // machine i will find bins for features in [ strat[i], start[i] + len[i] )
    int* start = new int[num_machines];
    int* len = new int[num_machines];
    int total_num_feature = static_cast<int>(sample_values.size());
    // ceiling division so every feature is assigned to some machine
    int step = (total_num_feature + num_machines - 1) / num_machines;
    if (step < 1) { step = 1; }
    start[0] = 0;
    for (int i = 0; i < num_machines - 1; ++i) {
      len[i] = Common::Min<int>(step, total_num_feature - start[i]);
      start[i + 1] = start[i] + len[i];
    }
    len[num_machines - 1] = total_num_feature - start[num_machines - 1];
    // get size of bin mapper with max_bin_ size
    int type_size = BinMapper::SizeForSpecificBin(max_bin_);
    // since sizes of different feature may not be same, we expand all bin mapper to type_size
    int buffer_size = type_size * total_num_feature;
    char* input_buffer = new char[buffer_size];
    char* output_buffer = new char[buffer_size];
    // find local feature bins and copy to buffer
    #pragma omp parallel for schedule(guided)
    for (int i = 0; i < len[rank]; ++i) {
      BinMapper* bin_mapper = new BinMapper();
      bin_mapper->FindBin(&sample_values[start[rank] + i], max_bin_);
      bin_mapper->CopyTo(input_buffer + i * type_size);
      // don't need this any more
      delete bin_mapper;
    }
    // convert counts to byte sizes/offsets for the Allgather call
    for (int i = 0; i < num_machines; ++i) {
      start[i] *= type_size;
      len[i] *= type_size;
    }
    // gather global feature bin mappers
    Network::Allgather(input_buffer, buffer_size, start, len, output_buffer);
    // restore features bins from buffer
    for (int i = 0; i < total_num_feature; ++i) {
      BinMapper* bin_mapper = new BinMapper();
      bin_mapper->CopyFrom(output_buffer + i * type_size);
      if (!bin_mapper->is_trival()) {
        used_feature_map_[i] = static_cast<int>(features_.size());
        features_.push_back(new Feature(static_cast<int>(i), bin_mapper, num_data_, is_enable_sparse_));
      } else {
        delete bin_mapper;
      }
    }
    // free buffer
    delete[] start;
    delete[] len;
    delete[] input_buffer;
    delete[] output_buffer;
  }
  num_features_ = static_cast<int>(features_.size());
}
/*!
* \brief Top-level training-data load. Either goes through the text path
*        (one-round: whole file in memory; two-round: sample first, then
*        re-read for feature extraction) or loads a previously saved
*        binary file. Afterwards partitions/validates the metadata and
*        sanity-checks the loaded dataset.
* \param rank This machine's id in [0, num_machines)
* \param num_machines Number of machines in the cluster
* \param is_pre_partition True if the file already holds only this machine's share
* \param use_two_round_loading True to trade speed for lower peak memory
*/
void Dataset::LoadTrainData(int rank, int num_machines, bool is_pre_partition, bool use_two_round_loading) {
  used_data_indices_.clear();
  if (!is_loading_from_binfile_ ) {
    if (!use_two_round_loading) {
      // read data to memory
      LoadDataToMemory(rank, num_machines, is_pre_partition);
      std::vector<std::string> sample_data;
      // sample data
      SampleDataFromMemory(&sample_data);
      // construct feature bin mappers
      ConstructBinMappers(rank, num_machines, sample_data);
      // initialize label
      metadata_.InitLabel(num_data_);
      // extract features from the lines already held in memory
      ExtractFeaturesFromMemory();
    } else {
      std::vector<std::string> sample_data;
      // sample data from file (also performs the partition)
      SampleDataFromFile(rank, num_machines, is_pre_partition, &sample_data);
      // construct feature bin mappers
      ConstructBinMappers(rank, num_machines, sample_data);
      // initialize label
      metadata_.InitLabel(num_data_);
      // extract features by re-reading the file (second round)
      ExtractFeaturesFromFile();
    }
  } else {
    // load data from binary file
    LoadDataFromBinFile(rank, num_machines, is_pre_partition);
  }
  // check meta data sizes against the data, and cut metadata down to the
  // local partition when used_data_indices_ is non-empty
  metadata_.CheckOrPartition(static_cast<data_size_t>(global_num_data_), used_data_indices_);
  // free memory
  used_data_indices_.clear();
  used_data_indices_.shrink_to_fit();
  // need to check training data
  CheckDataset();
}
/*!
* \brief Load a validation set. Bin mappers are NOT re-estimated: every
*        Feature copies the BinMapper of the corresponding training-set
*        feature so validation data is discretized identically. Validation
*        data is never partitioned across machines (rank 0 / 1 machine).
* \param train_set Trained dataset whose bin mappers and feature map are copied
* \param use_two_round_loading True to trade speed for lower peak memory
*/
void Dataset::LoadValidationData(const Dataset* train_set, bool use_two_round_loading) {
  used_data_indices_.clear();
  if (!is_loading_from_binfile_ ) {
    if (!use_two_round_loading) {
      // read data in memory
      LoadDataToMemory(0, 1, false);
      // initialize label
      metadata_.InitLabel(num_data_);
      features_.clear();
      // copy feature bin mapper data from the training set
      for (Feature* feature : train_set->features_) {
        features_.push_back(new Feature(feature->feature_index(), new BinMapper(*feature->bin_mapper()), num_data_, is_enable_sparse_));
      }
      used_feature_map_ = train_set->used_feature_map_;
      num_features_ = static_cast<int>(features_.size());
      // extract features
      ExtractFeaturesFromMemory();
    } else {
      // Get number of lines of data file
      num_data_ = static_cast<data_size_t>(text_reader_->CountLine());
      // initialize label
      metadata_.InitLabel(num_data_);
      features_.clear();
      // copy feature bin mapper data from the training set
      for (Feature* feature : train_set->features_) {
        features_.push_back(new Feature(feature->feature_index(), new BinMapper(*feature->bin_mapper()), num_data_, is_enable_sparse_));
      }
      used_feature_map_ = train_set->used_feature_map_;
      num_features_ = static_cast<int>(features_.size());
      // extract features by streaming the file
      ExtractFeaturesFromFile();
    }
  } else {
    // load from binary file
    LoadDataFromBinFile(0, 1, false);
  }
  // not need to check validation data
  // check meta data
  metadata_.CheckOrPartition(static_cast<data_size_t>(global_num_data_), used_data_indices_);
  // CheckDataset();
}
/*!
* \brief Parse every buffered text line in parallel, write labels into
*        metadata_, push feature values into the per-feature bins, and
*        (when an initial-prediction function was supplied) compute the
*        initial score per row. Frees each text line as it is consumed and
*        clears the text reader at the end.
*        OpenMP note: private(oneline_features) gives each thread its own
*        default-constructed vector; firstprivate(tmp_label) copies the 0.0
*        initializer into each thread.
*/
void Dataset::ExtractFeaturesFromMemory() {
  std::vector<std::pair<int, double>> oneline_features;
  double tmp_label = 0.0;
  if (predict_fun_ == nullptr) {
    // if doesn't need to prediction with initial model
    #pragma omp parallel for schedule(guided) private(oneline_features) firstprivate(tmp_label)
    for (data_size_t i = 0; i < num_data_; ++i) {
      const int tid = omp_get_thread_num();
      oneline_features.clear();
      // parser
      parser_->ParseOneLine(text_reader_->Lines()[i].c_str(), &oneline_features, &tmp_label);
      // set label
      metadata_.SetLabelAt(i, tmp_label);
      // free processed line:
      text_reader_->Lines()[i].clear();
      // shrink_to_fit will be very slow in linux, and seems not free memory, disable for now
      // text_reader_->Lines()[i].shrink_to_fit();
      // push data
      for (auto& inner_data : oneline_features) {
        int feature_idx = used_feature_map_[inner_data.first];
        if (feature_idx >= 0) {
          // if is used feature (trivial features map to -1 and are dropped)
          features_[feature_idx]->PushData(tid, i, inner_data.second);
        }
      }
    }
  } else {
    // if need to prediction with initial model
    score_t* init_score = new score_t[num_data_];
    #pragma omp parallel for schedule(guided) private(oneline_features) firstprivate(tmp_label)
    for (data_size_t i = 0; i < num_data_; ++i) {
      const int tid = omp_get_thread_num();
      oneline_features.clear();
      // parser
      parser_->ParseOneLine(text_reader_->Lines()[i].c_str(), &oneline_features, &tmp_label);
      // set initial score
      init_score[i] = static_cast<score_t>(predict_fun_(oneline_features));
      // set label
      metadata_.SetLabelAt(i, tmp_label);
      // free processed line:
      text_reader_->Lines()[i].clear();
      // shrink_to_fit will be very slow in linux, and seems not free memory, disable for now
      // text_reader_->Lines()[i].shrink_to_fit();
      // push data
      for (auto& inner_data : oneline_features) {
        int feature_idx = used_feature_map_[inner_data.first];
        if (feature_idx >= 0) {
          // if is used feature (trivial features map to -1 and are dropped)
          features_[feature_idx]->PushData(tid, i, inner_data.second);
        }
      }
    }
    // metadata_ will manage space of init_score
    metadata_.SetInitScore(init_score);
  }
  #pragma omp parallel for schedule(guided)
  for (int i = 0; i < num_features_; i++) {
    features_[i]->FinishLoad();
  }
  // text data can be free after loaded feature values
  text_reader_->Clear();
}
/*!
* \brief Second round of two-round loading: stream the data file again in
*        chunks and, for each chunk of lines, parse in parallel, set
*        labels, optionally compute initial scores, and push feature
*        values into the bins. Uses only the previously recorded
*        used_data_indices_ when the data was partitioned.
*/
void Dataset::ExtractFeaturesFromFile() {
  score_t* init_score = nullptr;
  if (predict_fun_ != nullptr) {
    init_score = new score_t[num_data_];
  }
  // callback invoked by the text reader for each chunk of lines;
  // start_idx is the row index of lines[0] within the local data
  std::function<void(data_size_t, const std::vector<std::string>&)> process_fun =
    [this, &init_score]
  (data_size_t start_idx, const std::vector<std::string>& lines) {
    std::vector<std::pair<int, double>> oneline_features;
    double tmp_label = 0.0;
    // private(oneline_features) gives each thread its own vector;
    // firstprivate(tmp_label) copies the 0.0 initializer per thread
    #pragma omp parallel for schedule(static) private(oneline_features) firstprivate(tmp_label)
    for (data_size_t i = 0; i < static_cast<data_size_t>(lines.size()); i++) {
      const int tid = omp_get_thread_num();
      oneline_features.clear();
      // parser
      parser_->ParseOneLine(lines[i].c_str(), &oneline_features, &tmp_label);
      // set initial score
      if (init_score != nullptr) {
        init_score[start_idx + i] = static_cast<score_t>(predict_fun_(oneline_features));
      }
      // set label
      metadata_.SetLabelAt(start_idx + i, tmp_label);
      // push data
      for (auto& inner_data : oneline_features) {
        int feature_idx = used_feature_map_[inner_data.first];
        if (feature_idx >= 0) {
          // if is used feature (trivial features map to -1 and are dropped)
          features_[feature_idx]->PushData(tid, start_idx + i, inner_data.second);
        }
      }
    }
  };
  if (used_data_indices_.size() > 0) {
    // only need part of data
    text_reader_->ReadPartAndProcessParallel(used_data_indices_, process_fun);
  } else {
    // need full data
    text_reader_->ReadAllAndProcessParallel(process_fun);
  }
  // metadata_ will manage space of init_score
  if (init_score != nullptr) {
    metadata_.SetInitScore(init_score);
  }
  #pragma omp parallel for schedule(guided)
  for (int i = 0; i < num_features_; i++) {
    features_[i]->FinishLoad();
  }
}
/*!
* \brief Serialize the dataset to "<data>.bin" so later runs can skip text
*        parsing. Layout: [header size][header fields + used_feature_map]
*        [metadata size][metadata][per feature: size + payload]. The field
*        order here must stay in byte-for-byte agreement with
*        LoadDataFromBinFile. No-op when the dataset itself came from a
*        binary file.
*/
void Dataset::SaveBinaryFile() {
  // if is loaded from binary file, not need to save
  if (!is_loading_from_binfile_) {
    std::string bin_filename(data_filename_);
    bin_filename.append(".bin");
    FILE* file;
#ifdef _MSC_VER
    fopen_s(&file, bin_filename.c_str(), "wb");
#else
    file = fopen(bin_filename.c_str(), "wb");
#endif
    if (file == NULL) {
      Log::Stderr("cannot write binary data to %s ", bin_filename.c_str());
    }
    Log::Stdout("start save binary file for data %s", data_filename_);
    // get size of header; the final two terms are the used_feature_map_
    // length prefix (size_t) plus its int payload
    size_t size_of_header = sizeof(global_num_data_) + sizeof(is_enable_sparse_)
      + sizeof(max_bin_) + sizeof(num_data_) + sizeof(num_features_) + sizeof(size_t) + sizeof(int) * used_feature_map_.size();
    fwrite(&size_of_header, sizeof(size_of_header), 1, file);
    // write header
    fwrite(&global_num_data_, sizeof(global_num_data_), 1, file);
    fwrite(&is_enable_sparse_, sizeof(is_enable_sparse_), 1, file);
    fwrite(&max_bin_, sizeof(max_bin_), 1, file);
    fwrite(&num_data_, sizeof(num_data_), 1, file);
    fwrite(&num_features_, sizeof(num_features_), 1, file);
    size_t num_used_feature_map = used_feature_map_.size();
    fwrite(&num_used_feature_map, sizeof(num_used_feature_map), 1, file);
    fwrite(used_feature_map_.data(), sizeof(int), num_used_feature_map, file);
    // get size of meta data
    size_t size_of_metadata = metadata_.SizesInByte();
    fwrite(&size_of_metadata, sizeof(size_of_metadata), 1, file);
    // write meta data
    metadata_.SaveBinaryToFile(file);
    // write feature data, each feature prefixed with its byte size
    for (int i = 0; i < num_features_; ++i) {
      // get size of feature
      size_t size_of_feature = features_[i]->SizesInByte();
      fwrite(&size_of_feature, sizeof(size_of_feature), 1, file);
      // write feature
      features_[i]->SaveBinaryToFile(file);
    }
    fclose(file);
  }
}
void Dataset::CheckCanLoadFromBin() {
  // the binary cache lives next to the text file with a ".bin" suffix
  std::string bin_filename(data_filename_);
  bin_filename.append(".bin");
  FILE* file;
#ifdef _MSC_VER
  fopen_s(&file, bin_filename.c_str(), "rb");
#else
  file = fopen(bin_filename.c_str(), "rb");
#endif
  // the cache is usable exactly when the file opens for reading
  is_loading_from_binfile_ = (file != NULL);
  if (file != NULL) {
    fclose(file);
  }
}
/*!
* \brief Load the dataset from "<data>.bin", mirroring SaveBinaryFile's
*        layout exactly: [header size][header + used_feature_map]
*        [metadata size][metadata][per feature: size + payload]. When
*        running on several machines without pre-partitioned data, a random
*        subset of rows (whole queries when boundaries exist) is selected
*        for this machine and the labels/features are restricted to it.
*        A single reusable buffer is grown as needed for the largest chunk.
* \param rank This machine's id in [0, num_machines)
* \param num_machines Number of machines in the cluster
* \param is_pre_partition True if the file already holds only this machine's share
*/
void Dataset::LoadDataFromBinFile(int rank, int num_machines, bool is_pre_partition) {
  std::string bin_filename(data_filename_);
  bin_filename.append(".bin");
  FILE* file;
#ifdef _MSC_VER
  fopen_s(&file, bin_filename.c_str(), "rb");
#else
  file = fopen(bin_filename.c_str(), "rb");
#endif
  if (file == NULL) {
    Log::Stderr("cannot read binary data from %s", bin_filename.c_str());
  }
  // buffer to read binary file; starts at 16MB and grows on demand
  size_t buffer_size = 16 * 1024 * 1024;
  char* buffer = new char[buffer_size];
  // read size of header
  size_t read_cnt = fread(buffer, sizeof(size_t), 1, file);
  if (read_cnt != 1) {
    Log::Stderr("binary file format error at header size");
  }
  size_t size_of_head = *(reinterpret_cast<size_t*>(buffer));
  // re-allocate space if not enough
  if (size_of_head > buffer_size) {
    delete[] buffer;
    buffer_size = size_of_head;
    buffer = new char[buffer_size];
  }
  // read header
  read_cnt = fread(buffer, 1, size_of_head, file);
  if (read_cnt != size_of_head) {
    Log::Stderr("binary file format error at header");
  }
  // get header fields in the exact order SaveBinaryFile wrote them
  const char* mem_ptr = buffer;
  global_num_data_ = *(reinterpret_cast<const size_t*>(mem_ptr));
  mem_ptr += sizeof(global_num_data_);
  is_enable_sparse_ = *(reinterpret_cast<const bool*>(mem_ptr));
  mem_ptr += sizeof(is_enable_sparse_);
  max_bin_ = *(reinterpret_cast<const int*>(mem_ptr));
  mem_ptr += sizeof(max_bin_);
  num_data_ = *(reinterpret_cast<const data_size_t*>(mem_ptr));
  mem_ptr += sizeof(num_data_);
  num_features_ = *(reinterpret_cast<const int*>(mem_ptr));
  mem_ptr += sizeof(num_features_);
  size_t num_used_feature_map = *(reinterpret_cast<const size_t*>(mem_ptr));
  mem_ptr += sizeof(num_used_feature_map);
  const int* tmp_feature_map = reinterpret_cast<const int*>(mem_ptr);
  used_feature_map_.clear();
  for (size_t i = 0; i < num_used_feature_map; ++i) {
    used_feature_map_.push_back(tmp_feature_map[i]);
  }
  // read size of meta data
  read_cnt = fread(buffer, sizeof(size_t), 1, file);
  if (read_cnt != 1) {
    Log::Stderr("binary file format error at size of meta data");
  }
  size_t size_of_metadata = *(reinterpret_cast<size_t*>(buffer));
  // re-allocate space if not enough
  if (size_of_metadata > buffer_size) {
    delete[] buffer;
    buffer_size = size_of_metadata;
    buffer = new char[buffer_size];
  }
  // read meta data
  read_cnt = fread(buffer, 1, size_of_metadata, file);
  if (read_cnt != size_of_metadata) {
    Log::Stderr("binary file format error at meta data");
  }
  // load meta data
  metadata_.LoadFromMemory(buffer);
  used_data_indices_.clear();
  global_num_data_ = num_data_;
  // sample local used data if need to partition
  if (num_machines > 1 && !is_pre_partition) {
    const data_size_t* query_boundaries = metadata_.query_boundaries();
    if (query_boundaries == nullptr) {
      // if not contain query file, minimal sample unit is one record:
      // keep each row with probability 1/num_machines
      for (data_size_t i = 0; i < num_data_; i++) {
        if (random_.NextInt(0, num_machines) == rank) {
          used_data_indices_.push_back(i);
        }
      }
    } else {
      // if contain query file, minimal sample unit is one query
      data_size_t num_queries = metadata_.num_queries();
      // qid starts at -1: row 0 triggers the boundary check against
      // query_boundaries[0] and bumps qid to 0
      data_size_t qid = -1;
      bool is_query_used = false;
      for (data_size_t i = 0; i < num_data_; i++) {
        if (qid >= num_queries) {
          Log::Stderr("current query is exceed the range of query file, please ensure your query file is correct");
        }
        if (i >= query_boundaries[qid + 1]) {
          // if is new query: re-draw whether this machine keeps it
          is_query_used = false;
          if (random_.NextInt(0, num_machines) == rank) {
            is_query_used = true;
          }
          ++qid;
        }
        if (is_query_used) {
          used_data_indices_.push_back(i);
        }
      }
    }
    num_data_ = static_cast<data_size_t>(used_data_indices_.size());
  }
  // restrict labels to the locally kept rows (no-op when not partitioned)
  metadata_.PartitionLabel(used_data_indices_);
  // read feature data
  for (int i = 0; i < num_features_; ++i) {
    // read feature size
    read_cnt = fread(buffer, sizeof(size_t), 1, file);
    if (read_cnt != 1) {
      Log::Stderr("binary file format error at feature %d's size", i);
    }
    size_t size_of_feature = *(reinterpret_cast<size_t*>(buffer));
    // re-allocate space if not enough
    if (size_of_feature > buffer_size) {
      delete[] buffer;
      buffer_size = size_of_feature;
      buffer = new char[buffer_size];
    }
    read_cnt = fread(buffer, 1, size_of_feature, file);
    if (read_cnt != size_of_feature) {
      // NOTE(review): read_cnt is size_t but printed with %d — confirm Log handles this
      Log::Stderr("binary file format error at feature %d loading , read count %d", i, read_cnt);
    }
    features_.push_back(new Feature(buffer, static_cast<data_size_t>(global_num_data_), used_data_indices_));
  }
  delete[] buffer;
  fclose(file);
}
/*!
* \brief Sanity-check the loaded training data: it must contain at least
*        one record and at least one usable (non-trivial) feature.
*/
void Dataset::CheckDataset() {
  if (num_data_ <= 0) {
    Log::Stderr("data size of %s is zero", data_filename_);
  }
  // empty() states the intent directly; size() is unsigned, so "<= 0"
  // could only ever mean "== 0" and draws sign-compare warnings
  if (features_.empty()) {
    Log::Stderr("not useful feature of data %s", data_filename_);
  }
}
} // namespace LightGBM
#ifndef LIGHTGBM_IO_DENSE_BIN_HPP_
#define LIGHTGBM_IO_DENSE_BIN_HPP_
#include <LightGBM/bin.h>
#include <vector>
#include <cstring>
#include <cstdint>
namespace LightGBM {
/*!
* \brief Used to Store bins for dense feature
* Use template to reduce memory cost
*/
template <typename VAL_T>
class DenseBin: public Bin {
public:
  /*!
  * \brief Allocate one VAL_T per record, zero-initialized
  * \param num_data Number of records this bin column will hold
  */
  explicit DenseBin(data_size_t num_data)
    : num_data_(num_data) {
    data_ = new VAL_T[num_data_];
    std::memset(data_, 0, sizeof(VAL_T)*num_data_);
  }
  ~DenseBin() {
    delete[] data_;
  }
  /*!
  * \brief Store the bin value for one record; the unnamed thread-id
  *        parameter is unused because writes go to distinct indices
  */
  void Push(int, data_size_t idx, uint32_t value) override {
    data_[idx] = static_cast<VAL_T>(value);
  }
  /*! \brief Read back the bin value of record idx */
  inline uint32_t Get(data_size_t idx) const {
    return static_cast<uint32_t>(data_[idx]);
  }
  BinIterator* GetIterator(data_size_t start_idx) const override;
  /*!
  * \brief Accumulate gradient/hessian/count sums per bin. When
  *        data_indices is non-null only those records are used, and
  *        ordered_gradients/ordered_hessians are indexed by position i
  *        (not by the record index). The body is manually unrolled 4-way.
  */
  void ConstructHistogram(data_size_t* data_indices, data_size_t num_data,
    const score_t* ordered_gradients, const score_t* ordered_hessians,
    HistogramBinEntry* out) const override {
    // use 4-way unrolling, will be faster
    if (data_indices != nullptr) {  // if use part of data
      data_size_t rest = num_data % 4;
      data_size_t i = 0;
      for (; i < num_data - rest; i += 4) {
        VAL_T bin0 = data_[data_indices[i]];
        VAL_T bin1 = data_[data_indices[i + 1]];
        VAL_T bin2 = data_[data_indices[i + 2]];
        VAL_T bin3 = data_[data_indices[i + 3]];
        out[bin0].sum_gradients += ordered_gradients[i];
        out[bin1].sum_gradients += ordered_gradients[i + 1];
        out[bin2].sum_gradients += ordered_gradients[i + 2];
        out[bin3].sum_gradients += ordered_gradients[i + 3];
        out[bin0].sum_hessians += ordered_hessians[i];
        out[bin1].sum_hessians += ordered_hessians[i + 1];
        out[bin2].sum_hessians += ordered_hessians[i + 2];
        out[bin3].sum_hessians += ordered_hessians[i + 3];
        ++out[bin0].cnt;
        ++out[bin1].cnt;
        ++out[bin2].cnt;
        ++out[bin3].cnt;
      }
      // handle the up-to-3 leftover records
      for (; i < num_data; ++i) {
        VAL_T bin = data_[data_indices[i]];
        out[bin].sum_gradients += ordered_gradients[i];
        out[bin].sum_hessians += ordered_hessians[i];
        ++out[bin].cnt;
      }
    }
    else {  // use full data
      data_size_t rest = num_data % 4;
      data_size_t i = 0;
      for (; i < num_data - rest; i += 4) {
        VAL_T bin0 = data_[i];
        VAL_T bin1 = data_[i + 1];
        VAL_T bin2 = data_[i + 2];
        VAL_T bin3 = data_[i + 3];
        out[bin0].sum_gradients += ordered_gradients[i];
        out[bin1].sum_gradients += ordered_gradients[i + 1];
        out[bin2].sum_gradients += ordered_gradients[i + 2];
        out[bin3].sum_gradients += ordered_gradients[i + 3];
        out[bin0].sum_hessians += ordered_hessians[i];
        out[bin1].sum_hessians += ordered_hessians[i + 1];
        out[bin2].sum_hessians += ordered_hessians[i + 2];
        out[bin3].sum_hessians += ordered_hessians[i + 3];
        ++out[bin0].cnt;
        ++out[bin1].cnt;
        ++out[bin2].cnt;
        ++out[bin3].cnt;
      }
      // handle the up-to-3 leftover records
      for (; i < num_data; ++i) {
        VAL_T bin = data_[i];
        out[bin].sum_gradients += ordered_gradients[i];
        out[bin].sum_hessians += ordered_hessians[i];
        ++out[bin].cnt;
      }
    }
  }
  /*!
  * \brief Partition data_indices by bin value: records with bin value
  *        <= threshold go to lte_indices, the rest to gt_indices
  * \return Number of records placed in lte_indices
  */
  data_size_t Split(unsigned int threshold, data_size_t* data_indices, data_size_t num_data,
    data_size_t* lte_indices, data_size_t* gt_indices) const override {
    data_size_t lte_count = 0;
    data_size_t gt_count = 0;
    for (data_size_t i = 0; i < num_data; ++i) {
      data_size_t idx = data_indices[i];
      if (data_[idx] > threshold) {
        gt_indices[gt_count++] = idx;
      } else {
        lte_indices[lte_count++] = idx;
      }
    }
    return lte_count;
  }
  data_size_t num_data() const override { return num_data_; }
  /*! \brief not ordered bin for dense feature */
  OrderedBin* CreateOrderedBin() const override { return nullptr; }
  /*! \brief Dense storage needs no post-load finalization */
  void FinishLoad() override {}
  /*!
  * \brief Restore bin values from a memory image written by
  *        SaveBinaryToFile. When local_used_indices is non-empty, only
  *        those rows of the (larger) saved image are kept, compacted to
  *        the front.
  */
  void LoadFromMemory(const void* memory, const std::vector<data_size_t>& local_used_indices) override {
    const VAL_T* mem_data = reinterpret_cast<const VAL_T*>(memory);
    if (local_used_indices.size() > 0) {
      for (int i = 0; i < num_data_; ++i) {
        data_[i] = mem_data[local_used_indices[i]];
      }
    } else {
      for (int i = 0; i < num_data_; ++i) {
        data_[i] = mem_data[i];
      }
    }
  }
  /*! \brief Write the raw bin array; must match SizesInByte() below */
  void SaveBinaryToFile(FILE* file) const override {
    fwrite(data_, sizeof(VAL_T), num_data_, file);
  }
  size_t SizesInByte() const override {
    return sizeof(VAL_T) * num_data_;
  }
private:
  // number of records
  data_size_t num_data_;
  // owned flat array of one bin value per record
  VAL_T* data_;
};
template <typename VAL_T>
class DenseBinIterator: public BinIterator {
public:
explicit DenseBinIterator(const DenseBin<VAL_T>* bin_data)
: bin_data_(bin_data) {
}
uint32_t Get(data_size_t idx) override {
return bin_data_->Get(idx);
}
private:
const DenseBin<VAL_T>* bin_data_;
};
/*!
* \brief Create an iterator over this bin; the start index is ignored
*        because dense storage supports random access. Caller owns the
*        returned iterator.
*/
template <typename VAL_T>
BinIterator* DenseBin<VAL_T>::GetIterator(data_size_t) const {
  return new DenseBinIterator<VAL_T>(this);
}
} // namespace LightGBM
#endif #endif // LightGBM_IO_DENSE_BIN_HPP_
#include <LightGBM/dataset.h>
#include <LightGBM/utils/common.h>
#include <vector>
#include <string>
namespace LightGBM {
/*!
* \brief Default-construct with every owned array pointer null; the actual
*        buffers are allocated later by Init/InitLabel/load routines.
*/
Metadata::Metadata()
  :label_(nullptr), label_int_(nullptr), weights_(nullptr),
  query_boundaries_(nullptr),
  query_weights_(nullptr), init_score_(nullptr) {
}
/*!
* \brief Full initialization for the text-loading path: remember the file
*        names and load all auxiliary inputs (query boundaries, weights,
*        per-query weights, initial scores) from their side files.
* \param data_filename Path of the main data file (side files derive from it)
* \param init_score_filename Path of the initial-score file
*/
void Metadata::Init(const char * data_filename, const char* init_score_filename) {
  data_filename_ = data_filename;
  init_score_filename_ = init_score_filename;
  // for lambdarank, it needs query data for partition data in parallel learning
  LoadQueryBoundaries();
  LoadWeights();
  // query weights depend on record weights + boundaries, so load them after
  LoadQueryWeights();
  LoadInitialScore();
}
/*!
* \brief Partial initialization for the binary-loading path: only the
*        initial score comes from a side file; all other metadata is read
*        from the binary image later.
* \param init_score_filename Path of the initial-score file
*/
void Metadata::Init(const char* init_score_filename) {
  init_score_filename_ = init_score_filename;
  LoadInitialScore();
}
/*!
* \brief Free every owned metadata array. delete[] on a null pointer is a
*        no-op, so the previous per-pointer null checks were redundant.
*/
Metadata::~Metadata() {
  delete[] label_;
  delete[] weights_;
  delete[] query_boundaries_;
  delete[] query_weights_;
  delete[] init_score_;
  // NOTE(review): label_int_ is initialized to nullptr in the constructor
  // but never freed here — if it ever owns a heap allocation this leaks;
  // confirm its ownership where it is assigned.
}
/*!
* \brief Allocate the (uninitialized) label array for num_data records;
*        values are filled in later via SetLabelAt during feature extraction
* \param num_data Number of records
*/
void Metadata::InitLabel(data_size_t num_data) {
  num_data_ = num_data;
  label_ = new float[num_data_];
}
void Metadata::PartitionLabel(const std::vector<data_size_t>& used_indices) {
if (used_indices.size() <= 0) {
return;
}
float* old_label = label_;
num_data_ = static_cast<data_size_t>(used_indices.size());
label_ = new float[num_data_];
for (data_size_t i = 0; i < num_data_; ++i) {
label_[i] = old_label[used_indices[i]];
}
delete[] old_label;
}
/*!
* \brief Check side metadata sizes against the data set and, when a subset
*        of data is used (bagging / parallel learning), rebuild weights,
*        query boundaries and initial scores for that subset only.
* \param num_all_data Number of records in the full data set
* \param used_data_indices Ordered indices of locally used records; empty
*        means all data are used and only size checks are performed
*/
void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector<data_size_t>& used_data_indices) {
  if (used_data_indices.size() == 0) {
    // check weights
    if (weights_ != nullptr && num_weights_ != num_data_) {
      Log::Stdout("init weight size doesn't equal with data file, will ignore");
      delete[] weights_;
      num_weights_ = 0;
      weights_ = nullptr;
    }
    // check query boundries
    if (query_boundaries_ != nullptr && query_boundaries_[num_queries_] != num_data_) {
      Log::Stdout("init query size doesn't equal with data file, will ignore");
      delete[] query_boundaries_;
      num_queries_ = 0;
      query_boundaries_ = nullptr;
    }
    // contain initial score file
    if (init_score_ != nullptr && num_init_score_ != num_data_) {
      delete[] init_score_;
      Log::Stdout("init score size doesn't equal with data file, will ignore");
      num_init_score_ = 0;
      // fix: reset the pointer, otherwise the destructor frees it again
      init_score_ = nullptr;
    }
  } else {
    data_size_t num_used_data = static_cast<data_size_t>(used_data_indices.size());
    // check weights
    if (weights_ != nullptr && num_weights_ != num_all_data) {
      Log::Stdout("init weight size doesn't equal with data file, will ignore");
      delete[] weights_;
      num_weights_ = 0;
      weights_ = nullptr;
    }
    // check query boundries
    if (query_boundaries_ != nullptr && query_boundaries_[num_queries_] != num_all_data) {
      Log::Stdout("init query size doesn't equal with data file, will ignore");
      delete[] query_boundaries_;
      num_queries_ = 0;
      query_boundaries_ = nullptr;
    }
    // contain initial score file
    if (init_score_ != nullptr && num_init_score_ != num_all_data) {
      Log::Stdout("init score size doesn't equal with data file, will ignore");
      delete[] init_score_;
      num_init_score_ = 0;
      // fix: reset the pointer; the "get local initial scores" step below
      // and the destructor both test init_score_ against nullptr
      init_score_ = nullptr;
    }
    // get local weights (num_data_ holds the local size at this point —
    // presumably PartitionLabel already shrank it; verify against caller)
    if (weights_ != nullptr) {
      float* old_weights = weights_;
      num_weights_ = num_data_;
      weights_ = new float[num_data_];
      for (size_t i = 0; i < used_data_indices.size(); ++i) {
        weights_[i] = old_weights[used_data_indices[i]];
      }
      delete[] old_weights;
    }
    // get local query boundaries: a query must be kept entirely or not at all
    if (query_boundaries_ != nullptr) {
      std::vector<data_size_t> used_query;
      data_size_t data_idx = 0;
      for (data_size_t qid = 0; qid < num_queries_ && data_idx < num_used_data; ++qid) {
        data_size_t start = query_boundaries_[qid];
        data_size_t end = query_boundaries_[qid + 1];
        data_size_t len = end - start;
        if (used_data_indices[data_idx] > start) {
          // this whole query was dropped locally; skip it
          continue;
        } else if (used_data_indices[data_idx] == start) {
          if (num_used_data >= data_idx + len && used_data_indices[data_idx + len - 1] == end - 1) {
            // the complete query is present in the local subset
            used_query.push_back(qid);
            data_idx += len;
          } else {
            Log::Stderr("data partition error, not according to query");
          }
        } else {
          Log::Stderr("data partition error, not according to query");
        }
      }
      data_size_t * old_query_boundaries = query_boundaries_;
      query_boundaries_ = new data_size_t[used_query.size() + 1];
      num_queries_ = static_cast<data_size_t>(used_query.size());
      query_boundaries_[0] = 0;
      for (data_size_t i = 0; i < num_queries_; ++i) {
        data_size_t qid = used_query[i];
        data_size_t len = old_query_boundaries[qid + 1] - old_query_boundaries[qid];
        query_boundaries_[i + 1] = query_boundaries_[i] + len;
      }
      delete[] old_query_boundaries;
    }
    // get local initial scores
    if (init_score_ != nullptr) {
      score_t* old_scores = init_score_;
      num_init_score_ = num_data_;
      init_score_ = new score_t[num_init_score_];
      for (size_t i = 0; i < used_data_indices.size(); ++i) {
        init_score_[i] = old_scores[used_data_indices[i]];
      }
      delete[] old_scores;
    }
    // re-load query weight
    LoadQueryWeights();
  }
}
/*!
* \brief Take ownership of an externally allocated initial-score array.
* \param init_score Array of num_data_ scores; ownership transfers here
*/
void Metadata::SetInitScore(score_t* init_score) {
  delete[] init_score_;  // delete[] nullptr is a no-op
  init_score_ = init_score;
  num_init_score_ = num_data_;
}
/*!
* \brief Load per-record weights from "<data_filename>.weight" (one float
*        per line); no-op when the file is missing or empty.
*/
void Metadata::LoadWeights() {
  num_weights_ = 0;
  std::string weight_filename(data_filename_);
  // default weight file name
  weight_filename.append(".weight");
  TextReader<size_t> reader(weight_filename.c_str());
  reader.ReadAllLines();
  if (reader.Lines().size() <= 0) {
    return;
  }
  Log::Stdout("Start to load weights");
  num_weights_ = static_cast<data_size_t>(reader.Lines().size());
  // free any previously loaded weights so a re-load does not leak
  delete[] weights_;
  weights_ = new float[num_weights_];
  for (data_size_t i = 0; i < num_weights_; ++i) {
    double tmp_weight;
    Common::Atof(reader.Lines()[i].c_str(), &tmp_weight);
    weights_[i] = static_cast<float>(tmp_weight);
  }
}
/*!
* \brief Load initial scores from init_score_filename_ (one value per
*        line); no-op when no file name was configured.
*/
void Metadata::LoadInitialScore() {
  num_init_score_ = 0;
  if (init_score_filename_[0] == '\0') { return; }
  TextReader<size_t> reader(init_score_filename_);
  reader.ReadAllLines();
  Log::Stdout("Start to load initial score");
  num_init_score_ = static_cast<data_size_t>(reader.Lines().size());
  // free any previously loaded scores so a re-load does not leak
  delete[] init_score_;
  init_score_ = new score_t[num_init_score_];
  double tmp;
  for (data_size_t i = 0; i < num_init_score_; ++i) {
    Common::Atof(reader.Lines()[i].c_str(), &tmp);
    init_score_[i] = static_cast<score_t>(tmp);
  }
}
/*!
* \brief Load query sizes from "<data_filename>.query" (one count per line)
*        and convert them to cumulative boundaries; boundary[i]..boundary[i+1]
*        is query i's record range. No-op when the file is missing or empty.
*/
void Metadata::LoadQueryBoundaries() {
  num_queries_ = 0;
  std::string query_filename(data_filename_);
  // default query file name
  query_filename.append(".query");
  TextReader<size_t> reader(query_filename.c_str());
  reader.ReadAllLines();
  if (reader.Lines().size() <= 0) {
    return;
  }
  Log::Stdout("Start to load query boundries");
  // free any previously loaded boundaries so a re-load does not leak
  delete[] query_boundaries_;
  query_boundaries_ = new data_size_t[reader.Lines().size() + 1];
  num_queries_ = static_cast<data_size_t>(reader.Lines().size());
  query_boundaries_[0] = 0;
  for (size_t i = 0; i < reader.Lines().size(); ++i) {
    int tmp_cnt;
    Common::Atoi(reader.Lines()[i].c_str(), &tmp_cnt);
    // prefix-sum of per-query counts yields the start offset of each query
    query_boundaries_[i + 1] = query_boundaries_[i] + static_cast<data_size_t>(tmp_cnt);
  }
}
/*!
* \brief Derive one weight per query as the mean of its records' weights.
*        Requires both weights_ and query_boundaries_ to be loaded.
*/
void Metadata::LoadQueryWeights() {
  if (weights_ == nullptr || query_boundaries_ == nullptr) {
    return;
  }
  Log::Stdout("Start to load query weights");
  // fix: this is called from both Init() and CheckOrPartition(); free the
  // previous allocation so the second call does not leak it
  delete[] query_weights_;
  query_weights_ = new float[num_queries_];
  for (data_size_t i = 0; i < num_queries_; ++i) {
    query_weights_[i] = 0.0f;
    for (data_size_t j = query_boundaries_[i]; j < query_boundaries_[i + 1]; ++j) {
      query_weights_[i] += weights_[j];
    }
    // average over the query's record count
    query_weights_[i] /= (query_boundaries_[i + 1] - query_boundaries_[i]);
  }
}
/*!
* \brief Restore metadata from a binary image with the layout produced by
*        SaveBinaryToFile: three counters, labels, then optional weights,
*        query boundaries and query weights.
* \param memory Pointer to the serialized metadata block
*/
void Metadata::LoadFromMemory(const void* memory) {
  const char* mem_ptr = reinterpret_cast<const char*>(memory);
  num_data_ = *(reinterpret_cast<const data_size_t*>(mem_ptr));
  mem_ptr += sizeof(num_data_);
  num_weights_ = *(reinterpret_cast<const data_size_t*>(mem_ptr));
  mem_ptr += sizeof(num_weights_);
  num_queries_ = *(reinterpret_cast<const data_size_t*>(mem_ptr));
  mem_ptr += sizeof(num_queries_);
  if (label_ != nullptr) { delete[] label_; }
  label_ = new float[num_data_];
  std::memcpy(label_, mem_ptr, sizeof(float)*num_data_);
  // fix: advance past the labels just copied (num_data_ floats); the old
  // code advanced by num_weights_, corrupting every following section
  // whenever num_weights_ != num_data_ (e.g. no weight file => 0)
  mem_ptr += sizeof(float)*num_data_;
  if (num_weights_ > 0) {
    if (weights_ != nullptr) { delete[] weights_; }
    weights_ = new float[num_weights_];
    std::memcpy(weights_, mem_ptr, sizeof(float)*num_weights_);
    mem_ptr += sizeof(float)*num_weights_;
  }
  if (num_queries_ > 0) {
    if (query_boundaries_ != nullptr) { delete[] query_boundaries_; }
    query_boundaries_ = new data_size_t[num_queries_ + 1];
    std::memcpy(query_boundaries_, mem_ptr, sizeof(data_size_t)*(num_queries_ + 1));
    mem_ptr += sizeof(data_size_t)*(num_queries_ + 1);
  }
  if (num_weights_ > 0 && num_queries_ > 0) {
    if (query_weights_ != nullptr) { delete[] query_weights_; }
    query_weights_ = new float[num_queries_];
    std::memcpy(query_weights_, mem_ptr, sizeof(float)*num_queries_);
    mem_ptr += sizeof(float)*num_queries_;
  }
}
/*!
* \brief Serialize metadata to a binary file. The write order must stay in
*        sync with LoadFromMemory: three counters, labels, then optional
*        weights, query boundaries and query weights.
* \param file Open writable file handle; fwrite errors are not checked
*/
void Metadata::SaveBinaryToFile(FILE* file) const {
  // fixed-size header: element counters
  fwrite(&num_data_, sizeof(num_data_), 1, file);
  fwrite(&num_weights_, sizeof(num_weights_), 1, file);
  fwrite(&num_queries_, sizeof(num_queries_), 1, file);
  // labels are always present
  fwrite(label_, sizeof(float), num_data_, file);
  // optional sections; the reader infers presence from the counters above
  if (weights_ != nullptr) {
    fwrite(weights_, sizeof(float), num_weights_, file);
  }
  if (query_boundaries_ != nullptr) {
    // num_queries_ + 1 cumulative boundaries, starting at 0
    fwrite(query_boundaries_, sizeof(data_size_t), num_queries_ + 1, file);
  }
  if (query_weights_ != nullptr) {
    fwrite(query_weights_, sizeof(float), num_queries_, file);
  }
}
/*!
* \brief Number of bytes SaveBinaryToFile will emit for this metadata.
* \return Serialized size in bytes
*/
size_t Metadata::SizesInByte() const {
  // header: the three element counters
  size_t total = sizeof(num_data_) + sizeof(num_weights_) + sizeof(num_queries_);
  // labels are always written
  total += sizeof(float) * num_data_;
  if (weights_ != nullptr) {
    total += sizeof(float) * num_weights_;
  }
  if (query_boundaries_ != nullptr) {
    // num_queries_ + 1 cumulative boundaries
    total += sizeof(data_size_t) * (num_queries_ + 1);
  }
  if (query_weights_ != nullptr) {
    total += sizeof(float) * num_queries_;
  }
  return total;
}
} // namespace LightGBM
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment