Commit 1c774687 authored by Guolin Ke

first commit
#ifndef LIGHTGBM_OBJECTIVE_REGRESSION_OBJECTIVE_HPP_
#define LIGHTGBM_OBJECTIVE_REGRESSION_OBJECTIVE_HPP_
#include <LightGBM/objective_function.h>
namespace LightGBM {
/*!
* \brief Objective function for regression
*/
class RegressionL2loss: public ObjectiveFunction {
public:
explicit RegressionL2loss(const ObjectiveConfig&) {
}
~RegressionL2loss() {
}
void Init(const Metadata& metadata, data_size_t num_data) override {
num_data_ = num_data;
label_ = metadata.label();
weights_ = metadata.weights();
}
void GetGradients(const score_t* score, score_t* gradients,
score_t* hessians) const override {
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
gradients[i] = (score[i] - label_[i]);
hessians[i] = 1.0;
}
} else {
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
gradients[i] = (score[i] - label_[i]) * weights_[i];
hessians[i] = weights_[i];
}
}
}
double GetSigmoid() const override {
// no sigmoid transform, return -1.0
return -1.0;
}
private:
/*! \brief Number of data */
data_size_t num_data_;
/*! \brief Pointer of label */
const float* label_;
/*! \brief Pointer of weights */
const float* weights_;
};
} // namespace LightGBM
#endif  // LIGHTGBM_OBJECTIVE_REGRESSION_OBJECTIVE_HPP_
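// ---------------------------------------------------------------------------
// Illustration (not part of the original commit): a minimal standalone sketch
// of the math RegressionL2loss::GetGradients implements. For squared loss
// 0.5 * (score - label)^2, the first derivative w.r.t. the score is
// (score - label) and the second derivative is 1, which is exactly what the
// gradient/hessian arrays above are filled with (scaled by the weight when
// weights are present).
#include <cstdio>

int main() {
const float labels[] = {1.0f, 2.0f, 3.0f};
const float scores[] = {0.5f, 2.5f, 3.0f};
for (int i = 0; i < 3; ++i) {
const float gradient = scores[i] - labels[i];  // d/ds 0.5*(s-y)^2
const float hessian = 1.0f;                    // d^2/ds^2 0.5*(s-y)^2
std::printf("i=%d gradient=%.2f hessian=%.2f\n", i, gradient, hessian);
}
return 0;
}
// ---------------------------------------------------------------------------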
#include "parallel_tree_learner.h"
#include <cstring>
#include <tuple>
#include <vector>
namespace LightGBM {
DataParallelTreeLearner::DataParallelTreeLearner(const TreeConfig& tree_config)
:SerialTreeLearner(tree_config), input_buffer_(nullptr),
output_buffer_(nullptr), is_feature_aggregated_(nullptr),
block_start_(nullptr), block_len_(nullptr),
buffer_write_start_pos_(nullptr), buffer_read_start_pos_(nullptr),
global_data_count_in_leaf_(nullptr) {
}
DataParallelTreeLearner::~DataParallelTreeLearner() {
if (input_buffer_ != nullptr) { delete[] input_buffer_; }
if (output_buffer_ != nullptr) { delete[] output_buffer_; }
if (is_feature_aggregated_ != nullptr) { delete[] is_feature_aggregated_; }
if (block_start_ != nullptr) { delete[] block_start_; }
if (block_len_ != nullptr) { delete[] block_len_; }
if (buffer_write_start_pos_ != nullptr) { delete[] buffer_write_start_pos_; }
if (buffer_read_start_pos_ != nullptr) { delete[] buffer_read_start_pos_; }
if (global_data_count_in_leaf_ != nullptr) { delete[] global_data_count_in_leaf_; }
}
void DataParallelTreeLearner::Init(const Dataset* train_data) {
// initialize SerialTreeLearner
SerialTreeLearner::Init(train_data);
// Get local rank and global machine size
rank_ = Network::rank();
num_machines_ = Network::num_machines();
// allocate buffer for communication
size_t buffer_size = 0;
for (int i = 0; i < num_features_; ++i) {
buffer_size += train_data_->FeatureAt(i)->num_bin() * sizeof(HistogramBinEntry);
}
input_buffer_ = new char[buffer_size];
output_buffer_ = new char[buffer_size];
is_feature_aggregated_ = new bool[num_features_];
block_start_ = new int[num_machines_];
block_len_ = new int[num_machines_];
buffer_write_start_pos_ = new int[num_features_];
buffer_read_start_pos_ = new int[num_features_];
global_data_count_in_leaf_ = new data_size_t[num_leaves_];
}
void DataParallelTreeLearner::BeforeTrain() {
SerialTreeLearner::BeforeTrain();
// generate feature partition for current tree
std::vector<std::vector<int>> feature_distribution(num_machines_, std::vector<int>());
std::vector<int> num_bins_distributed(num_machines_, 0);
for (int i = 0; i < train_data_->num_features(); ++i) {
if (is_feature_used_[i]) {
int cur_min_machine = static_cast<int>(ArrayArgs<int>::ArgMin(num_bins_distributed));
feature_distribution[cur_min_machine].push_back(i);
num_bins_distributed[cur_min_machine] += train_data_->FeatureAt(i)->num_bin();
}
is_feature_aggregated_[i] = false;
}
// get local used feature
for (auto fid : feature_distribution[rank_]) {
is_feature_aggregated_[fid] = true;
}
// get block start and block len for reduce scatter
reduce_scatter_size_ = 0;
for (int i = 0; i < num_machines_; ++i) {
block_len_[i] = 0;
for (auto fid : feature_distribution[i]) {
block_len_[i] += train_data_->FeatureAt(fid)->num_bin() * sizeof(HistogramBinEntry);
}
reduce_scatter_size_ += block_len_[i];
}
block_start_[0] = 0;
for (int i = 1; i < num_machines_; ++i) {
block_start_[i] = block_start_[i - 1] + block_len_[i - 1];
}
// get buffer_write_start_pos_
int bin_size = 0;
for (int i = 0; i < num_machines_; ++i) {
for (auto fid : feature_distribution[i]) {
buffer_write_start_pos_[fid] = bin_size;
bin_size += train_data_->FeatureAt(fid)->num_bin() * sizeof(HistogramBinEntry);
}
}
// get buffer_read_start_pos_
bin_size = 0;
for (auto fid : feature_distribution[rank_]) {
buffer_read_start_pos_[fid] = bin_size;
bin_size += train_data_->FeatureAt(fid)->num_bin() * sizeof(HistogramBinEntry);
}
// sync global data sumup info
std::tuple<data_size_t, score_t, score_t> data(smaller_leaf_splits_->num_data_in_leaf(),
smaller_leaf_splits_->sum_gradients(), smaller_leaf_splits_->sum_hessians());
int size = sizeof(data);
std::memcpy(input_buffer_, &data, size);
// global sumup reduce
Network::Allreduce(input_buffer_, size, size, output_buffer_, [](const char *src, char *dst, int len) {
int used_size = 0;
int type_size = sizeof(std::tuple<data_size_t, score_t, score_t>);
const std::tuple<data_size_t, score_t, score_t> *p1;
std::tuple<data_size_t, score_t, score_t> *p2;
while (used_size < len) {
p1 = reinterpret_cast<const std::tuple<data_size_t, score_t, score_t> *>(src);
p2 = reinterpret_cast<std::tuple<data_size_t, score_t, score_t> *>(dst);
std::get<0>(*p2) = std::get<0>(*p2) + std::get<0>(*p1);
std::get<1>(*p2) = std::get<1>(*p2) + std::get<1>(*p1);
std::get<2>(*p2) = std::get<2>(*p2) + std::get<2>(*p1);
src += type_size;
dst += type_size;
used_size += type_size;
}
});
// copy back
std::memcpy(&data, output_buffer_, size);
// set global sumup info
smaller_leaf_splits_->Init(std::get<1>(data), std::get<2>(data));
// init global data count in leaf
global_data_count_in_leaf_[0] = std::get<0>(data);
}
void DataParallelTreeLearner::FindBestThresholds() {
// construct local histograms
#pragma omp parallel for schedule(guided)
for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
if ((is_feature_used_ != nullptr && is_feature_used_[feature_index] == false)) continue;
// construct histograms for smaller leaf
if (ordered_bins_[feature_index] == nullptr) {
smaller_leaf_histogram_array_[feature_index].Construct(smaller_leaf_splits_->data_indices(),
smaller_leaf_splits_->num_data_in_leaf(),
smaller_leaf_splits_->sum_gradients(),
smaller_leaf_splits_->sum_hessians(),
ptr_to_ordered_gradients_,
ptr_to_ordered_hessians_);
} else {
smaller_leaf_histogram_array_[feature_index].Construct(ordered_bins_[feature_index],
smaller_leaf_splits_->LeafIndex(),
smaller_leaf_splits_->num_data_in_leaf(),
smaller_leaf_splits_->sum_gradients(),
smaller_leaf_splits_->sum_hessians(),
gradients_,
hessians_);
}
// copy to buffer
std::memcpy(input_buffer_ + buffer_write_start_pos_[feature_index],
smaller_leaf_histogram_array_[feature_index].HistogramData(),
smaller_leaf_histogram_array_[feature_index].SizeOfHistgram());
}
// Reduce scatter for histogram
Network::ReduceScatter(input_buffer_, reduce_scatter_size_, block_start_,
block_len_, output_buffer_, &HistogramBinEntry::SumReducer);
#pragma omp parallel for schedule(guided)
for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
if (!is_feature_aggregated_[feature_index]) continue;
// copy global sumup info
smaller_leaf_histogram_array_[feature_index].SetSumup(
GetGlobalDataCountInLeaf(smaller_leaf_splits_->LeafIndex()),
smaller_leaf_splits_->sum_gradients(),
smaller_leaf_splits_->sum_hessians());
// restore global histograms from buffer
smaller_leaf_histogram_array_[feature_index].FromMemory(
output_buffer_ + buffer_read_start_pos_[feature_index]);
// find best threshold for smaller child
smaller_leaf_histogram_array_[feature_index].FindBestThreshold(
&smaller_leaf_splits_->BestSplitPerFeature()[feature_index]);
// skip if there is only the root leaf
if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->LeafIndex() < 0) continue;
// construct histograms for the larger leaf: we initialized the larger leaf as the parent, so we can just subtract the smaller leaf's histograms
larger_leaf_histogram_array_[feature_index].Subtract(
smaller_leaf_histogram_array_[feature_index]);
// set sumup info for histogram
larger_leaf_histogram_array_[feature_index].SetSumup(
GetGlobalDataCountInLeaf(larger_leaf_splits_->LeafIndex()),
larger_leaf_splits_->sum_gradients(), larger_leaf_splits_->sum_hessians());
// find best threshold for larger child
larger_leaf_histogram_array_[feature_index].FindBestThreshold(
&larger_leaf_splits_->BestSplitPerFeature()[feature_index]);
}
}
void DataParallelTreeLearner::FindBestSplitsForLeaves() {
int smaller_best_feature = -1, larger_best_feature = -1;
SplitInfo smaller_best, larger_best;
std::vector<double> gains;
// find local best split for smaller leaf
for (size_t i = 0; i < smaller_leaf_splits_->BestSplitPerFeature().size(); ++i) {
gains.push_back(smaller_leaf_splits_->BestSplitPerFeature()[i].gain);
}
smaller_best_feature = static_cast<int>(ArrayArgs<double>::ArgMax(gains));
smaller_best = smaller_leaf_splits_->BestSplitPerFeature()[smaller_best_feature];
// find local best split for larger leaf
if (larger_leaf_splits_->LeafIndex() >= 0) {
gains.clear();
for (size_t i = 0; i < larger_leaf_splits_->BestSplitPerFeature().size(); ++i) {
gains.push_back(larger_leaf_splits_->BestSplitPerFeature()[i].gain);
}
larger_best_feature = static_cast<int>(ArrayArgs<double>::ArgMax(gains));
larger_best = larger_leaf_splits_->BestSplitPerFeature()[larger_best_feature];
}
// sync global best info
std::memcpy(input_buffer_, &smaller_best, sizeof(SplitInfo));
std::memcpy(input_buffer_ + sizeof(SplitInfo), &larger_best, sizeof(SplitInfo));
Network::Allreduce(input_buffer_, sizeof(SplitInfo) * 2, sizeof(SplitInfo),
output_buffer_, &SplitInfo::MaxReducer);
std::memcpy(&smaller_best, output_buffer_, sizeof(SplitInfo));
std::memcpy(&larger_best, output_buffer_ + sizeof(SplitInfo), sizeof(SplitInfo));
// set best split
best_split_per_leaf_[smaller_leaf_splits_->LeafIndex()] = smaller_best;
if (larger_leaf_splits_->LeafIndex() >= 0) {
best_split_per_leaf_[larger_leaf_splits_->LeafIndex()] = larger_best;
}
}
void DataParallelTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) {
SerialTreeLearner::Split(tree, best_Leaf, left_leaf, right_leaf);
const SplitInfo& best_split_info = best_split_per_leaf_[best_Leaf];
// need to update the global number of data in the leaf
global_data_count_in_leaf_[*left_leaf] = best_split_info.left_count;
global_data_count_in_leaf_[*right_leaf] = best_split_info.right_count;
}
} // namespace LightGBM
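// ---------------------------------------------------------------------------
// Illustration (not part of the original commit): a minimal sketch of how
// block_start_/block_len_ are derived for the histogram ReduceScatter above.
// Assumed toy setup: 2 machines, features {0, 2} assigned to machine 0 and
// feature {1} to machine 1, bin counts {4, 3, 5}, and an 8-byte entry size
// standing in for sizeof(HistogramBinEntry).
#include <cstdio>
#include <vector>

int main() {
const int num_machines = 2;
const std::vector<std::vector<int>> feature_distribution = {{0, 2}, {1}};
const int num_bins[] = {4, 3, 5};
const int entry_size = 8;  // stand-in for sizeof(HistogramBinEntry)
std::vector<int> block_len(num_machines, 0);
std::vector<int> block_start(num_machines, 0);
// each machine's block holds the histogram bytes of its assigned features
for (int i = 0; i < num_machines; ++i) {
for (int fid : feature_distribution[i]) {
block_len[i] += num_bins[fid] * entry_size;
}
}
// block starts are the running sum of the block lengths
for (int i = 1; i < num_machines; ++i) {
block_start[i] = block_start[i - 1] + block_len[i - 1];
}
for (int i = 0; i < num_machines; ++i) {
std::printf("machine %d: start=%d len=%d\n", i, block_start[i], block_len[i]);
}
return 0;  // prints: machine 0: start=0 len=72, machine 1: start=72 len=24
}
// ---------------------------------------------------------------------------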
#ifndef LIGHTGBM_TREELEARNER_DATA_PARTITION_HPP_
#define LIGHTGBM_TREELEARNER_DATA_PARTITION_HPP_
#include <LightGBM/meta.h>
#include <LightGBM/feature.h>
#include <omp.h>
#include <cstring>
#include <vector>
namespace LightGBM {
/*!
* \brief DataPartition is used to store the partition of data on the tree.
*/
class DataPartition {
public:
DataPartition(data_size_t num_data, int num_leafs)
:num_data_(num_data), num_leaves_(num_leafs) {
leaf_begin_ = new data_size_t[num_leaves_];
leaf_count_ = new data_size_t[num_leaves_];
indices_ = new data_size_t[num_data_];
temp_left_indices_ = new data_size_t[num_data_];
temp_right_indices_ = new data_size_t[num_data_];
used_data_indices_ = nullptr;
#pragma omp parallel
#pragma omp master
{
num_threads_ = omp_get_num_threads();
}
offsets_buf_ = new data_size_t[num_threads_];
left_cnts_buf_ = new data_size_t[num_threads_];
right_cnts_buf_ = new data_size_t[num_threads_];
left_write_pos_buf_ = new data_size_t[num_threads_];
right_write_pos_buf_ = new data_size_t[num_threads_];
}
~DataPartition() {
delete[] leaf_begin_;
delete[] leaf_count_;
delete[] indices_;
delete[] temp_left_indices_;
delete[] temp_right_indices_;
delete[] offsets_buf_;
delete[] left_cnts_buf_;
delete[] right_cnts_buf_;
delete[] left_write_pos_buf_;
delete[] right_write_pos_buf_;
}
/*!
* \brief Init, will put all data on the root (leaf_idx = 0)
*/
void Init() {
for (int i = 0; i < num_leaves_; ++i) {
leaf_count_[i] = 0;
}
leaf_begin_[0] = 0;
if (used_data_indices_ == nullptr) {
// if using all data
leaf_count_[0] = num_data_;
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
indices_[i] = i;
}
} else {
// if bagging
leaf_count_[0] = used_data_count_;
std::memcpy(indices_, used_data_indices_, used_data_count_ * sizeof(data_size_t));
}
}
/*!
* \brief Get the data indices of one leaf
* \param leaf index of leaf
* \param indices output data indices
* \return number of data on this leaf
*/
data_size_t GetIndexOnLeaf(int leaf, data_size_t** indices) const {
// copy reference, maybe unsafe, but faster
data_size_t begin = leaf_begin_[leaf];
(*indices) = static_cast<data_size_t*>(indices_ + begin);
return leaf_count_[leaf];
}
/*!
* \brief Split the data
* \param leaf index of leaf
* \param feature_bins feature bin data
* \param threshold threshold to split at
* \param right_leaf index of right leaf
*/
void Split(int leaf, const Bin* feature_bins, unsigned int threshold, int right_leaf) {
const data_size_t min_inner_size = 1000;
// get leaf boundary
const data_size_t begin = leaf_begin_[leaf];
const data_size_t cnt = leaf_count_[leaf];
data_size_t inner_size = (cnt + num_threads_ - 1) / num_threads_;
if (inner_size < min_inner_size) { inner_size = min_inner_size; }
// split data with multi-threading
#pragma omp parallel for schedule(static, 1)
for (int i = 0; i < num_threads_; ++i) {
left_cnts_buf_[i] = 0;
right_cnts_buf_[i] = 0;
data_size_t cur_start = i * inner_size;
if (cur_start > cnt) { continue; }
data_size_t cur_cnt = inner_size;
if (cur_start + cur_cnt > cnt) { cur_cnt = cnt - cur_start; }
// split data within this chunk, to reduce the number of function calls
data_size_t cur_left_count = feature_bins->Split(threshold, indices_ + begin + cur_start, cur_cnt,
temp_left_indices_ + cur_start, temp_right_indices_ + cur_start);
offsets_buf_[i] = cur_start;
left_cnts_buf_[i] = cur_left_count;
right_cnts_buf_[i] = cur_cnt - cur_left_count;
}
data_size_t left_cnt = 0;
left_write_pos_buf_[0] = 0;
right_write_pos_buf_[0] = 0;
for (int i = 1; i < num_threads_; ++i) {
left_write_pos_buf_[i] = left_write_pos_buf_[i - 1] + left_cnts_buf_[i - 1];
right_write_pos_buf_[i] = right_write_pos_buf_[i - 1] + right_cnts_buf_[i - 1];
}
left_cnt = left_write_pos_buf_[num_threads_ - 1] + left_cnts_buf_[num_threads_ - 1];
// copy back indices of left and right leaves to indices_
#pragma omp parallel for schedule(static, 1)
for (int i = 0; i < num_threads_; ++i) {
if (left_cnts_buf_[i] > 0) {
std::memcpy(indices_ + begin + left_write_pos_buf_[i], temp_left_indices_ + offsets_buf_[i], left_cnts_buf_[i] * sizeof(data_size_t));
}
if (right_cnts_buf_[i] > 0) {
std::memcpy(indices_ + begin + left_cnt + right_write_pos_buf_[i], temp_right_indices_ + offsets_buf_[i], right_cnts_buf_[i] * sizeof(data_size_t));
}
}
// update leaf boundary
leaf_count_[leaf] = left_cnt;
leaf_begin_[right_leaf] = left_cnt + begin;
leaf_count_[right_leaf] = cnt - left_cnt;
}
/*!
* \brief Set used data indices before training; used for bagging
* \param used_data_indices indices of used data
* \param num_used_data number of used data
*/
void SetUsedDataIndices(const data_size_t * used_data_indices, data_size_t num_used_data) {
used_data_indices_ = used_data_indices;
used_data_count_ = num_used_data;
}
/*!
* \brief Get number of data on one leaf
* \param leaf index of leaf
* \return number of data of this leaf
*/
data_size_t leaf_count(int leaf) const { return leaf_count_[leaf]; }
/*!
* \brief Get leaf begin
* \param leaf index of leaf
* \return begin index of this leaf
*/
data_size_t leaf_begin(int leaf) const { return leaf_begin_[leaf]; }
const data_size_t* indices() const { return indices_; }
/*! \brief Get number of leaves */
int num_leaves() const { return num_leaves_; }
private:
/*! \brief Number of all data */
data_size_t num_data_;
/*! \brief Number of all leaves */
int num_leaves_;
/*! \brief start index of data on one leaf */
data_size_t* leaf_begin_;
/*! \brief number of data on one leaf */
data_size_t* leaf_count_;
/*! \brief Stores all data indices, ordered by leaf: [data_in_leaf0, ..., data_in_leaf1, ...] */
data_size_t* indices_;
/*! \brief temp indices buffer for split */
data_size_t* temp_left_indices_;
/*! \brief temp indices buffer for split */
data_size_t* temp_right_indices_;
/*! \brief used data indices, used for bagging */
const data_size_t* used_data_indices_;
/*! \brief used data count, used for bagging */
data_size_t used_data_count_;
/*! \brief number of threads */
int num_threads_;
/*! \brief Buffer for multi-threading data partition, used to store offset for different threads */
data_size_t* offsets_buf_;
/*! \brief Buffer for multi-threading data partition, used to store left count after split for different threads */
data_size_t* left_cnts_buf_;
/*! \brief Buffer for multi-threading data partition, used to store right count after split for different threads */
data_size_t* right_cnts_buf_;
/*! \brief Buffer for multi-threading data partition, used to store write position of left leaf for different threads */
data_size_t* left_write_pos_buf_;
/*! \brief Buffer for multi-threading data partition, used to store write position of right leaf for different threads */
data_size_t* right_write_pos_buf_;
};
} // namespace LightGBM
#endif  // LIGHTGBM_TREELEARNER_DATA_PARTITION_HPP_
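// ---------------------------------------------------------------------------
// Illustration (not part of the original commit): a minimal single-threaded
// sketch of the two-pass partition used by DataPartition::Split above. Each
// "thread" first partitions its own chunk into temp buffers and records the
// counts; a prefix sum over the counts then gives every chunk a disjoint
// write position, so the copy-back is race free and the relative order inside
// the left and right parts is preserved.
#include <cstdio>
#include <vector>

int main() {
const std::vector<int> values = {5, 1, 7, 2, 9, 3, 8, 4};
const int threshold = 4;    // "left" means value <= threshold
const int num_chunks = 2;   // stands in for the number of threads
const int chunk = static_cast<int>(values.size()) / num_chunks;
std::vector<int> tmp_left(values.size()), tmp_right(values.size());
std::vector<int> left_cnt(num_chunks), right_cnt(num_chunks);
// pass 1: per-chunk partition into temp buffers (parallel in the real code)
for (int t = 0; t < num_chunks; ++t) {
const int begin = t * chunk;
int lc = 0, rc = 0;
for (int i = begin; i < begin + chunk; ++i) {
if (values[i] <= threshold) tmp_left[begin + lc++] = values[i];
else tmp_right[begin + rc++] = values[i];
}
left_cnt[t] = lc;
right_cnt[t] = rc;
}
// pass 2: prefix sums give disjoint write positions, then copy back
const int left_total = left_cnt[0] + left_cnt[1];
const int left_pos[2] = {0, left_cnt[0]};
const int right_pos[2] = {left_total, left_total + right_cnt[0]};
std::vector<int> out(values.size());
for (int t = 0; t < num_chunks; ++t) {
const int begin = t * chunk;
for (int i = 0; i < left_cnt[t]; ++i) out[left_pos[t] + i] = tmp_left[begin + i];
for (int i = 0; i < right_cnt[t]; ++i) out[right_pos[t] + i] = tmp_right[begin + i];
}
for (int v : out) std::printf("%d ", v);  // 1 2 3 4 5 7 9 8
std::printf("\n");
return 0;
}
// ---------------------------------------------------------------------------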
#ifndef LIGHTGBM_TREELEARNER_FEATURE_HISTOGRAM_HPP_
#define LIGHTGBM_TREELEARNER_FEATURE_HISTOGRAM_HPP_
#include "split_info.hpp"
#include <LightGBM/feature.h>
#include <cstring>
namespace LightGBM {
/*!
* \brief FeatureHistogram is used to construct and store a histogram for a feature.
*/
class FeatureHistogram {
public:
FeatureHistogram()
:data_(nullptr) {
}
~FeatureHistogram() {
if (data_ != nullptr) { delete[] data_; }
}
/*!
* \brief Init the feature histogram
* \param feature the feature data for this histogram
* \param feature_idx index of this feature
* \param min_num_data_one_leaf minimal number of data in one leaf
* \param min_sum_hessian_one_leaf minimal sum of hessians in one leaf
*/
void Init(const Feature* feature, int feature_idx, data_size_t min_num_data_one_leaf,
score_t min_sum_hessian_one_leaf) {
feature_idx_ = feature_idx;
min_num_data_one_leaf_ = min_num_data_one_leaf;
min_sum_hessian_one_leaf_ = min_sum_hessian_one_leaf;
bin_data_ = feature->bin_data();
num_bins_ = feature->num_bin();
data_ = new HistogramBinEntry[num_bins_];
}
/*!
* \brief Construct a histogram
* \param data_indices data indices of current leaf
* \param num_data number of data in current leaf
* \param sum_gradients sum of gradients of current leaf
* \param sum_hessians sum of hessians of current leaf
* \param ordered_gradients ordered gradients
* \param ordered_hessians ordered hessians
*/
void Construct(data_size_t* data_indices, data_size_t num_data, score_t sum_gradients,
score_t sum_hessians, const score_t* ordered_gradients, const score_t* ordered_hessians) {
std::memset(data_, 0, sizeof(HistogramBinEntry)* num_bins_);
num_data_ = num_data;
sum_gradients_ = sum_gradients;
sum_hessians_ = sum_hessians + 2 * kEpsilon;
bin_data_->ConstructHistogram(data_indices, num_data, ordered_gradients, ordered_hessians, data_);
}
/*!
* \brief Construct a histogram by ordered bin
* \param ordered_bin the ordered bin data
* \param leaf current leaf
* \param num_data number of data in current leaf
* \param sum_gradients sum of gradients of current leaf
* \param sum_hessians sum of hessians of current leaf
* \param gradients gradients of all data
* \param hessians hessians of all data
*/
void Construct(const OrderedBin* ordered_bin, int leaf, data_size_t num_data, score_t sum_gradients,
score_t sum_hessians, const score_t* gradients, const score_t* hessians) {
std::memset(data_, 0, sizeof(HistogramBinEntry)* num_bins_);
num_data_ = num_data;
sum_gradients_ = sum_gradients;
sum_hessians_ = sum_hessians + 2 * kEpsilon;
ordered_bin->ConstructHistogram(leaf, gradients, hessians, data_);
}
/*!
* \brief Set sumup information for current histogram
* \param num_data number of data in current leaf
* \param sum_gradients sum of gradients of current leaf
* \param sum_hessians sum of hessians of current leaf
*/
void SetSumup(data_size_t num_data, score_t sum_gradients, score_t sum_hessians) {
num_data_ = num_data;
sum_gradients_ = sum_gradients;
sum_hessians_ = sum_hessians + 2 * kEpsilon;
}
/*!
* \brief Subtract another histogram from the current one
* \param other The histogram to subtract
*/
void Subtract(const FeatureHistogram& other) {
num_data_ -= other.num_data_;
sum_gradients_ -= other.sum_gradients_;
sum_hessians_ -= other.sum_hessians_;
for (unsigned int i = 0; i < num_bins_; ++i) {
data_[i].cnt -= other.data_[i].cnt;
data_[i].sum_gradients -= other.data_[i].sum_gradients;
data_[i].sum_hessians -= other.data_[i].sum_hessians;
}
}
/*!
* \brief Find best threshold for this histogram
* \param output The best split result
*/
void FindBestThreshold(SplitInfo* output) {
score_t best_sum_left_gradient = NAN;
score_t best_sum_left_hessian = NAN;
score_t best_gain = kMinScore;
data_size_t best_left_count = 0;
unsigned int best_threshold = static_cast<unsigned int>(num_bins_);
score_t sum_right_gradient = 0.0f;
score_t sum_right_hessian = kEpsilon;
data_size_t right_count = 0;
score_t gain_shift = GetLeafSplitGain(sum_gradients_, sum_hessians_);
is_splittable_ = false;
// from right to left, and we don't need data in bin0
for (unsigned int t = num_bins_ - 1; t > 0; --t) {
sum_right_gradient += data_[t].sum_gradients;
sum_right_hessian += data_[t].sum_hessians;
right_count += data_[t].cnt;
// if data not enough, or sum hessian too small
if (right_count < min_num_data_one_leaf_ || sum_right_hessian < min_sum_hessian_one_leaf_) continue;
data_size_t left_count = num_data_ - right_count;
// if data not enough
if (left_count < min_num_data_one_leaf_) break;
score_t sum_left_hessian = sum_hessians_ - sum_right_hessian;
// if sum hessian too small
if (sum_left_hessian < min_sum_hessian_one_leaf_) {
break;
}
score_t sum_left_gradient = sum_gradients_ - sum_right_gradient;
// current split gain
score_t current_gain = GetLeafSplitGain(sum_left_gradient, sum_left_hessian) + GetLeafSplitGain(sum_right_gradient, sum_right_hessian);
// gain is worse than not splitting
if (current_gain < gain_shift) {
continue;
}
// mark as splittable
is_splittable_ = true;
// better split point
if (current_gain > best_gain) {
best_left_count = left_count;
best_sum_left_gradient = sum_left_gradient;
best_sum_left_hessian = sum_left_hessian;
// left is <= threshold, right is > threshold. so this is t-1
best_threshold = t - 1;
best_gain = current_gain;
}
}
// update split information
output->feature = feature_idx_;
output->threshold = best_threshold;
output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian);
output->left_count = best_left_count;
output->left_sum_gradient = best_sum_left_gradient;
output->left_sum_hessian = best_sum_left_hessian;
output->right_output = CalculateSplittedLeafOutput(sum_gradients_ - best_sum_left_gradient,
sum_hessians_ - best_sum_left_hessian);
output->right_count = num_data_ - best_left_count;
output->right_sum_gradient = sum_gradients_ - best_sum_left_gradient;
output->right_sum_hessian = sum_hessians_ - best_sum_left_hessian;
output->gain = best_gain - gain_shift;
}
/*!
* \brief Binary size of this histogram
*/
int SizeOfHistgram() const {
return num_bins_ * sizeof(HistogramBinEntry);
}
/*!
* \brief Memory pointer to histogram data
*/
const HistogramBinEntry* HistogramData() const {
return data_;
}
/*!
* \brief Restore histogram from memory
*/
void FromMemory(char* memory_data) {
std::memcpy(data_, memory_data, num_bins_ * sizeof(HistogramBinEntry));
}
/*!
* \brief Set min number data in one leaf
*/
void SetMinNumDataOneLeaf(data_size_t new_val) {
min_num_data_one_leaf_ = new_val;
}
/*!
* \brief Set min sum hessian in one leaf
*/
void SetMinSumHessianOneLeaf(score_t new_val) {
min_sum_hessian_one_leaf_ = new_val;
}
/*!
* \brief True if this histogram can be split
*/
bool is_splittable() { return is_splittable_; }
/*!
* \brief Set splittable to this histogram
*/
void set_is_splittable(bool val) { is_splittable_ = val; }
private:
/*!
* \brief Calculate the split gain based on sum_gradients and sum_hessians
* \param sum_gradients
* \param sum_hessians
* \return split gain
*/
score_t GetLeafSplitGain(score_t sum_gradients, score_t sum_hessians) const {
return (sum_gradients * sum_gradients) / (sum_hessians);
}
/*!
* \brief Calculate the output of a leaf based on sum_gradients and sum_hessians
* \param sum_gradients
* \param sum_hessians
* \return leaf output
*/
score_t CalculateSplittedLeafOutput(score_t sum_gradients, score_t sum_hessians) const {
return -(sum_gradients) / (sum_hessians);
}
int feature_idx_;
/*! \brief minimal number of data in one leaf */
data_size_t min_num_data_one_leaf_;
/*! \brief minimal sum hessian of data in one leaf */
score_t min_sum_hessian_one_leaf_;
/*! \brief the bin data of current feature */
const Bin* bin_data_;
/*! \brief number of bin of histogram */
unsigned int num_bins_;
/*! \brief sum of gradient of each bin */
HistogramBinEntry* data_;
/*! \brief number of all data */
data_size_t num_data_;
/*! \brief sum of gradient of current leaf */
score_t sum_gradients_;
/*! \brief sum of hessians of current leaf */
score_t sum_hessians_;
/*! \brief False if this histogram cannot split */
bool is_splittable_ = true;
};
} // namespace LightGBM
#endif  // LIGHTGBM_TREELEARNER_FEATURE_HISTOGRAM_HPP_
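// ---------------------------------------------------------------------------
// Illustration (not part of the original commit): a minimal sketch of the two
// ideas FeatureHistogram relies on. First, the gain of a leaf is
// sum_gradients^2 / sum_hessians (see GetLeafSplitGain), so a split is kept
// only when left gain + right gain beats the parent's gain. Second, a child's
// histogram equals the parent's histogram minus the sibling's, which is why
// Subtract() reconstructs the larger leaf almost for free.
#include <cstdio>

struct BinEntry { double sum_gradients; double sum_hessians; int cnt; };

static double LeafGain(double g, double h) { return g * g / h; }

int main() {
BinEntry parent[3] = {{-4.0, 2.0, 2}, {1.0, 3.0, 3}, {3.0, 2.0, 2}};
BinEntry smaller[3] = {{-4.0, 2.0, 2}, {0.5, 1.0, 1}, {0.0, 0.0, 0}};
BinEntry larger[3];
for (int i = 0; i < 3; ++i) {  // larger child = parent - smaller child
larger[i].sum_gradients = parent[i].sum_gradients - smaller[i].sum_gradients;
larger[i].sum_hessians = parent[i].sum_hessians - smaller[i].sum_hessians;
larger[i].cnt = parent[i].cnt - smaller[i].cnt;
std::printf("bin %d: larger g=%.1f h=%.1f cnt=%d\n", i,
larger[i].sum_gradients, larger[i].sum_hessians, larger[i].cnt);
}
// gain of splitting the parent at "bin 0 vs the rest":
const double gl = -4.0, hl = 2.0;  // left = bin 0
const double gr = 4.0, hr = 5.0;   // right = bins 1 and 2
const double gain = LeafGain(gl, hl) + LeafGain(gr, hr) - LeafGain(gl + gr, hl + hr);
std::printf("split gain = %.3f\n", gain);  // positive => the split is worth it
return 0;
}
// ---------------------------------------------------------------------------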
#include "parallel_tree_learner.h"
#include <cstring>
#include <vector>
namespace LightGBM {
FeatureParallelTreeLearner::FeatureParallelTreeLearner(const TreeConfig& tree_config)
:SerialTreeLearner(tree_config), input_buffer_(nullptr), output_buffer_(nullptr) {
}
FeatureParallelTreeLearner::~FeatureParallelTreeLearner() {
if (input_buffer_ != nullptr) { delete[] input_buffer_; }
if (output_buffer_ != nullptr) { delete[] output_buffer_; }
}
void FeatureParallelTreeLearner::Init(const Dataset* train_data) {
SerialTreeLearner::Init(train_data);
rank_ = Network::rank();
num_machines_ = Network::num_machines();
input_buffer_ = new char[sizeof(SplitInfo) * 2];
output_buffer_ = new char[sizeof(SplitInfo) * 2];
}
void FeatureParallelTreeLearner::BeforeTrain() {
SerialTreeLearner::BeforeTrain();
// get feature partition
std::vector<std::vector<int>> feature_distribution(num_machines_, std::vector<int>());
std::vector<int> num_bins_distributed(num_machines_, 0);
for (int i = 0; i < train_data_->num_features(); ++i) {
if (is_feature_used_[i]) {
int cur_min_machine = static_cast<int>(ArrayArgs<int>::ArgMin(num_bins_distributed));
feature_distribution[cur_min_machine].push_back(i);
num_bins_distributed[cur_min_machine] += train_data_->FeatureAt(i)->num_bin();
is_feature_used_[i] = false;
}
}
// get local used features
for (auto fid : feature_distribution[rank_]) {
is_feature_used_[fid] = true;
}
}
void FeatureParallelTreeLearner::FindBestSplitsForLeaves() {
int smaller_best_feature = -1, larger_best_feature = -1;
SplitInfo smaller_best, larger_best;
// get best split at smaller leaf
std::vector<double> gains;
for (size_t i = 0; i < smaller_leaf_splits_->BestSplitPerFeature().size(); ++i) {
gains.push_back(smaller_leaf_splits_->BestSplitPerFeature()[i].gain);
}
smaller_best_feature = static_cast<int>(ArrayArgs<double>::ArgMax(gains));
smaller_best = smaller_leaf_splits_->BestSplitPerFeature()[smaller_best_feature];
// get best split at larger leaf
if (larger_leaf_splits_->LeafIndex() >= 0) {
gains.clear();
for (size_t i = 0; i < larger_leaf_splits_->BestSplitPerFeature().size(); ++i) {
gains.push_back(larger_leaf_splits_->BestSplitPerFeature()[i].gain);
}
larger_best_feature = static_cast<int>(ArrayArgs<double>::ArgMax(gains));
larger_best = larger_leaf_splits_->BestSplitPerFeature()[larger_best_feature];
}
// sync global best info
std::memcpy(input_buffer_, &smaller_best, sizeof(SplitInfo));
std::memcpy(input_buffer_ + sizeof(SplitInfo), &larger_best, sizeof(SplitInfo));
Network::Allreduce(input_buffer_, sizeof(SplitInfo) * 2, sizeof(SplitInfo),
output_buffer_, &SplitInfo::MaxReducer);
// copy back
std::memcpy(&smaller_best, output_buffer_, sizeof(SplitInfo));
std::memcpy(&larger_best, output_buffer_ + sizeof(SplitInfo), sizeof(SplitInfo));
// update best split
best_split_per_leaf_[smaller_leaf_splits_->LeafIndex()] = smaller_best;
if (larger_leaf_splits_->LeafIndex() >= 0) {
best_split_per_leaf_[larger_leaf_splits_->LeafIndex()] = larger_best;
}
}
} // namespace LightGBM
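// ---------------------------------------------------------------------------
// Illustration (not part of the original commit): a minimal sketch of the
// greedy feature assignment used in BeforeTrain above. Each feature goes to
// the machine that currently holds the fewest total bins (the ArgMin in the
// real code), which roughly balances the histogram work across machines.
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
const int num_machines = 2;
const std::vector<int> feature_bins = {255, 16, 64, 255, 8};  // toy bin counts
std::vector<std::vector<int>> assignment(num_machines);
std::vector<int> load(num_machines, 0);  // total bins per machine
for (size_t fid = 0; fid < feature_bins.size(); ++fid) {
const int target = static_cast<int>(
std::min_element(load.begin(), load.end()) - load.begin());
assignment[target].push_back(static_cast<int>(fid));
load[target] += feature_bins[fid];
}
for (int m = 0; m < num_machines; ++m) {
std::printf("machine %d (bins=%d): features", m, load[m]);
for (int fid : assignment[m]) std::printf(" %d", fid);
std::printf("\n");
}
return 0;
}
// ---------------------------------------------------------------------------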
#ifndef LIGHTGBM_TREELEARNER_LEAF_SPLITS_HPP_
#define LIGHTGBM_TREELEARNER_LEAF_SPLITS_HPP_
#include <LightGBM/meta.h>
#include "data_partition.hpp"
#include "split_info.hpp"
#include <vector>
namespace LightGBM {
/*!
* \brief Used to find split candidates for a leaf
*/
class LeafSplits {
public:
LeafSplits(int num_feature, data_size_t num_data)
:num_data_in_leaf_(num_data), num_data_(num_data), num_features_(num_feature),
data_indices_(nullptr) {
for (int i = 0; i < num_features_; ++i) {
best_split_per_feature_.push_back(SplitInfo());
best_split_per_feature_[i].feature = i;
}
}
~LeafSplits() {
}
/*!
* \brief Init splits on current leaf; no need to traverse all data
* \param leaf Index of current leaf
* \param data_partition current data partition
* \param sum_gradients
* \param sum_hessians
*/
void Init(int leaf, const DataPartition* data_partition, score_t sum_gradients, score_t sum_hessians) {
leaf_index_ = leaf;
num_data_in_leaf_ = data_partition->GetIndexOnLeaf(leaf, &data_indices_);
sum_gradients_ = sum_gradients;
sum_hessians_ = sum_hessians;
for (SplitInfo& split_info : best_split_per_feature_) {
split_info.Reset();
}
}
/*!
* \brief Init splits on current leaf; need to traverse all data to sum up
* \param gradients
* \param hessians
*/
void Init(const score_t* gradients, const score_t *hessians) {
num_data_in_leaf_ = num_data_;
leaf_index_ = 0;
data_indices_ = nullptr;
score_t tmp_sum_gradients = 0.0;
score_t tmp_sum_hessians = 0.0;
#pragma omp parallel for schedule(static) reduction(+:tmp_sum_gradients, tmp_sum_hessians)
for (data_size_t i = 0; i < num_data_in_leaf_; ++i) {
tmp_sum_gradients += gradients[i];
tmp_sum_hessians += hessians[i];
}
sum_gradients_ = tmp_sum_gradients;
sum_hessians_ = tmp_sum_hessians;
for (SplitInfo& split_info : best_split_per_feature_) {
split_info.Reset();
}
}
/*!
* \brief Init splits on current leaf; need to traverse all data to sum up
* \param leaf Index of current leaf
* \param data_partition current data partition
* \param gradients
* \param hessians
*/
void Init(int leaf, const DataPartition* data_partition, const score_t* gradients, const score_t *hessians) {
leaf_index_ = leaf;
num_data_in_leaf_ = data_partition->GetIndexOnLeaf(leaf, &data_indices_);
score_t tmp_sum_gradients = 0.0;
score_t tmp_sum_hessians = 0.0;
#pragma omp parallel for schedule(static) reduction(+:tmp_sum_gradients, tmp_sum_hessians)
for (data_size_t i = 0; i < num_data_in_leaf_; ++i) {
data_size_t idx = data_indices_[i];
tmp_sum_gradients += gradients[idx];
tmp_sum_hessians += hessians[idx];
}
sum_gradients_ = tmp_sum_gradients;
sum_hessians_ = tmp_sum_hessians;
for (SplitInfo& split_info : best_split_per_feature_) {
split_info.Reset();
}
}
/*!
* \brief Init splits on current leaf, only update sum_gradients and sum_hessians
* \param sum_gradients
* \param sum_hessians
*/
void Init(score_t sum_gradients, score_t sum_hessians) {
leaf_index_ = 0;
sum_gradients_ = sum_gradients;
sum_hessians_ = sum_hessians;
for (SplitInfo& split_info : best_split_per_feature_) {
split_info.Reset();
}
}
/*!
* \brief Init splits on current leaf
*/
void Init() {
leaf_index_ = -1;
for (SplitInfo& split_info : best_split_per_feature_) {
split_info.Reset();
}
}
/*! \brief Get best splits on all features */
std::vector<SplitInfo>& BestSplitPerFeature() { return best_split_per_feature_;}
/*! \brief Get current leaf index */
int LeafIndex() const { return leaf_index_; }
/*! \brief Get number of data in current leaf */
data_size_t num_data_in_leaf() const { return num_data_in_leaf_; }
/*! \brief Get sum of gradients of current leaf */
score_t sum_gradients() const { return sum_gradients_; }
/*! \brief Get sum of hessians of current leaf */
score_t sum_hessians() const { return sum_hessians_; }
/*! \brief Get indices of data of current leaf */
data_size_t * data_indices() const { return data_indices_; }
private:
/*! \brief stores the best split of each feature on the current leaf */
std::vector<SplitInfo> best_split_per_feature_;
/*! \brief current leaf index */
int leaf_index_;
/*! \brief number of data on current leaf */
data_size_t num_data_in_leaf_;
/*! \brief number of all training data */
data_size_t num_data_;
/*! \brief number of features */
int num_features_;
/*! \brief sum of gradients of current leaf */
score_t sum_gradients_;
/*! \brief sum of hessians of current leaf */
score_t sum_hessians_;
/*! \brief indices of data of current leaf */
data_size_t* data_indices_;
};
} // namespace LightGBM
#endif  // LIGHTGBM_TREELEARNER_LEAF_SPLITS_HPP_
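// ---------------------------------------------------------------------------
// Illustration (not part of the original commit): a minimal sketch of the
// OpenMP reduction pattern LeafSplits::Init uses to sum gradients and
// hessians. Each thread accumulates private partial sums, and the partial
// sums are combined when the parallel region ends.
#include <cstdio>
#include <vector>

int main() {
const int n = 1000;
std::vector<double> gradients(n, 0.5), hessians(n, 1.0);
double sum_g = 0.0, sum_h = 0.0;
#pragma omp parallel for schedule(static) reduction(+:sum_g, sum_h)
for (int i = 0; i < n; ++i) {
sum_g += gradients[i];
sum_h += hessians[i];
}
std::printf("sum_g=%.1f sum_h=%.1f\n", sum_g, sum_h);  // 500.0 and 1000.0
return 0;
}
// ---------------------------------------------------------------------------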
#ifndef LIGHTGBM_TREELEARNER_PARALLEL_TREE_LEARNER_H_
#define LIGHTGBM_TREELEARNER_PARALLEL_TREE_LEARNER_H_
#include <LightGBM/utils/array_args.h>
#include <LightGBM/network.h>
#include "serial_tree_learner.h"
#include <cstring>
#include <vector>
namespace LightGBM {
/*!
* \brief Feature parallel learning algorithm.
* Different machines find the best split on different features, then sync the global best split.
* When #data is small or #feature is large, this can give a better speed-up.
*/
class FeatureParallelTreeLearner: public SerialTreeLearner {
public:
explicit FeatureParallelTreeLearner(const TreeConfig& tree_config);
~FeatureParallelTreeLearner();
virtual void Init(const Dataset* train_data);
protected:
void BeforeTrain() override;
void FindBestSplitsForLeaves() override;
private:
/*! \brief rank of local machine */
int rank_;
/*! \brief Number of machines of this parallel task */
int num_machines_;
/*! \brief Buffer for network send */
char* input_buffer_;
/*! \brief Buffer for network receive */
char* output_buffer_;
};
/*!
* \brief Data parallel learning algorithm.
* Workers use local data to construct histograms locally, then sync up global histograms.
* When #data is large or #feature is small, this can give a better speed-up.
*/
class DataParallelTreeLearner: public SerialTreeLearner {
public:
explicit DataParallelTreeLearner(const TreeConfig& tree_config);
~DataParallelTreeLearner();
void Init(const Dataset* train_data) override;
protected:
void BeforeTrain() override;
void FindBestThresholds() override;
void FindBestSplitsForLeaves() override;
void Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) override;
inline data_size_t GetGlobalDataCountInLeaf(int leaf_idx) const override {
if (leaf_idx >= 0) {
return global_data_count_in_leaf_[leaf_idx];
} else {
return 0;
}
}
private:
/*! \brief Rank of local machine */
int rank_;
/*! \brief Number of machines of this parallel task */
int num_machines_;
/*! \brief Buffer for network send */
char* input_buffer_;
/*! \brief Buffer for network receive */
char* output_buffer_;
/*! \brief Different machines aggregate histograms for different features;
used to mark the features aggregated locally */
bool* is_feature_aggregated_;
/*! \brief Block start index for reduce scatter */
int* block_start_;
/*! \brief Block size for reduce scatter */
int* block_len_;
/*! \brief Write positions for feature histograms */
int* buffer_write_start_pos_;
/*! \brief Read positions for local feature histograms */
int* buffer_read_start_pos_;
/*! \brief Size for reduce scatter */
int reduce_scatter_size_;
/*! \brief Store global number of data in leaves */
data_size_t* global_data_count_in_leaf_;
};
} // namespace LightGBM
#endif  // LIGHTGBM_TREELEARNER_PARALLEL_TREE_LEARNER_H_
#include "serial_tree_learner.h"
#include <LightGBM/utils/array_args.h>
#include <algorithm>
#include <vector>
namespace LightGBM {
SerialTreeLearner::SerialTreeLearner(const TreeConfig& tree_config)
:data_partition_(nullptr), is_feature_used_(nullptr),
historical_histogram_array_(nullptr), smaller_leaf_histogram_array_(nullptr),
larger_leaf_histogram_array_(nullptr),
smaller_leaf_splits_(nullptr), larger_leaf_splits_(nullptr),
ordered_gradients_(nullptr), ordered_hessians_(nullptr), is_data_in_leaf_(nullptr) {
// initialize with nullptr
num_leaves_ = tree_config.num_leaves;
min_num_data_one_leaf_ = static_cast<data_size_t>(tree_config.min_data_in_leaf);
min_sum_hessian_one_leaf_ = static_cast<float>(tree_config.min_sum_hessian_in_leaf);
feature_fraction_ = tree_config.feature_fraction;
random_ = Random(tree_config.feature_fraction_seed);
}
SerialTreeLearner::~SerialTreeLearner() {
if (data_partition_ != nullptr) { delete data_partition_; }
if (smaller_leaf_splits_ != nullptr) { delete smaller_leaf_splits_; }
if (larger_leaf_splits_ != nullptr) { delete larger_leaf_splits_; }
for (int i = 0; i < num_leaves_; ++i) {
if (historical_histogram_array_[i] != nullptr) {
delete[] historical_histogram_array_[i];
}
}
if (historical_histogram_array_ != nullptr) { delete[] historical_histogram_array_; }
if (is_feature_used_ != nullptr) { delete[] is_feature_used_; }
if (ordered_gradients_ != nullptr) { delete[] ordered_gradients_; }
if (ordered_hessians_ != nullptr) { delete[] ordered_hessians_; }
for (auto& bin : ordered_bins_) {
delete bin;
}
if (is_data_in_leaf_ != nullptr) {
delete[] is_data_in_leaf_;
}
}
void SerialTreeLearner::Init(const Dataset* train_data) {
train_data_ = train_data;
num_data_ = train_data_->num_data();
num_features_ = train_data_->num_features();
// allocate the space for historical_histogram_array_
historical_histogram_array_ = new FeatureHistogram*[num_leaves_];
for (int i = 0; i < num_leaves_; ++i) {
historical_histogram_array_[i] = new FeatureHistogram[train_data_->num_features()];
for (int j = 0; j < train_data_->num_features(); ++j) {
historical_histogram_array_[i][j].Init(train_data_->FeatureAt(j),
j, min_num_data_one_leaf_,
min_sum_hessian_one_leaf_);
}
}
// push split information for all leaves
for (int i = 0; i < num_leaves_; ++i) {
best_split_per_leaf_.push_back(SplitInfo());
}
// initialize ordered_bins_ with nullptr
for (int i = 0; i < num_features_; ++i) {
ordered_bins_.push_back(nullptr);
}
// get ordered bin
#pragma omp parallel for schedule(guided)
for (int i = 0; i < num_features_; ++i) {
ordered_bins_[i] = train_data_->FeatureAt(i)->bin_data()->CreateOrderedBin();
}
// check whether any ordered bin exists
for (int i = 0; i < num_features_; ++i) {
if (ordered_bins_[i] != nullptr) {
has_ordered_bin_ = true;
break;
}
}
// initialize splits for leaf
smaller_leaf_splits_ = new LeafSplits(train_data_->num_features(), train_data_->num_data());
larger_leaf_splits_ = new LeafSplits(train_data_->num_features(), train_data_->num_data());
// initialize data partition
data_partition_ = new DataPartition(num_data_, num_leaves_);
is_feature_used_ = new bool[num_features_];
// initialize ordered gradients and hessians
ordered_gradients_ = new score_t[num_data_];
ordered_hessians_ = new score_t[num_data_];
// if there are ordered bins, allocate a buffer for fast splitting
if (has_ordered_bin_) {
is_data_in_leaf_ = new char[num_data_];
}
Log::Stdout("#data:%d #feature:%d\n", num_data_, num_features_);
}
Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians) {
gradients_ = gradients;
hessians_ = hessians;
// some initial work before training
BeforeTrain();
Tree *tree = new Tree(num_leaves_);
// root leaf
int left_leaf = 0;
// only the root leaf can be split the first time
int right_leaf = -1;
for (int split = 0; split < num_leaves_ - 1; split++) {
// some initial work before finding the best split
if (BeforeFindBestSplit(left_leaf, right_leaf)) {
// find best threshold for every feature
FindBestThresholds();
// find best split from all features
FindBestSplitsForLeaves();
}
// Get a leaf with max split gain
int best_leaf = static_cast<int>(ArrayArgs<SplitInfo>::ArgMax(best_split_per_leaf_));
// Get split information for best leaf
const SplitInfo& best_leaf_SplitInfo = best_split_per_leaf_[best_leaf];
// cannot split, quit
if (best_leaf_SplitInfo.gain <= 0.0) {
Log::Stdout("cannot find more split with gain = %f , current #leaves=%d\n",
best_leaf_SplitInfo.gain, split + 1);
break;
}
// split tree with best leaf
Split(tree, best_leaf, &left_leaf, &right_leaf);
}
// save pointer to last trained tree
last_trained_tree_ = tree;
return tree;
}
void SerialTreeLearner::BeforeTrain() {
// initialize used features
for (int i = 0; i < num_features_; ++i) {
is_feature_used_[i] = false;
}
// get the used features for the current tree
size_t used_feature_cnt = static_cast<size_t>(num_features_*feature_fraction_);
std::vector<size_t> used_feature_indices = random_.Sample(num_features_, used_feature_cnt);
for (auto idx : used_feature_indices) {
is_feature_used_[idx] = true;
}
// set all histogram to splittable
#pragma omp parallel for schedule(static)
for (int i = 0; i < num_leaves_; ++i) {
for (int j = 0; j < train_data_->num_features(); ++j) {
historical_histogram_array_[i][j].set_is_splittable(true);
}
}
// initialize data partition
data_partition_->Init();
// reset the splits for leaves
for (int i = 0; i < num_leaves_; ++i) {
best_split_per_leaf_[i].Reset();
}
// Sumup for root
if (data_partition_->leaf_count(0) == num_data_) {
// use all data
smaller_leaf_splits_->Init(gradients_, hessians_);
// point to gradients, avoid copy
ptr_to_ordered_gradients_ = gradients_;
ptr_to_ordered_hessians_ = hessians_;
} else {
// use bagging, only use part of data
smaller_leaf_splits_->Init(0, data_partition_, gradients_, hessians_);
// copy used gradients and hessians to ordered buffer
const data_size_t* indices = data_partition_->indices();
data_size_t cnt = data_partition_->leaf_count(0);
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < cnt; ++i) {
ordered_gradients_[i] = gradients_[indices[i]];
ordered_hessians_[i] = hessians_[indices[i]];
}
// point to ordered_gradients_ and ordered_hessians_
ptr_to_ordered_gradients_ = ordered_gradients_;
ptr_to_ordered_hessians_ = ordered_hessians_;
}
larger_leaf_splits_->Init();
// if there are ordered bins, initialize them
if (has_ordered_bin_) {
if (data_partition_->leaf_count(0) == num_data_) {
// use all data, pass nullptr
#pragma omp parallel for schedule(guided)
for (int i = 0; i < num_features_; ++i) {
if (ordered_bins_[i] != nullptr) {
ordered_bins_[i]->Init(nullptr, num_leaves_);
}
}
} else {
// bagging, only use part of data
// mark used data
std::memset(is_data_in_leaf_, 0, sizeof(char)*num_data_);
const data_size_t* indices = data_partition_->indices();
data_size_t begin = data_partition_->leaf_begin(0);
data_size_t end = begin + data_partition_->leaf_count(0);
#pragma omp parallel for schedule(static)
for (data_size_t i = begin; i < end; ++i) {
is_data_in_leaf_[indices[i]] = 1;
}
// initialize ordered bin
#pragma omp parallel for schedule(guided)
for (int i = 0; i < num_features_; ++i) {
if (ordered_bins_[i] != nullptr) {
ordered_bins_[i]->Init(is_data_in_leaf_, num_leaves_);
}
}
}
}
}
bool SerialTreeLearner::BeforeFindBestSplit(int left_leaf, int right_leaf) {
data_size_t num_data_in_left_child = GetGlobalDataCountInLeaf(left_leaf);
data_size_t num_data_in_right_child = GetGlobalDataCountInLeaf(right_leaf);
// not enough data to continue
if (num_data_in_right_child < static_cast<data_size_t>(min_num_data_one_leaf_ * 2)
&& num_data_in_left_child < static_cast<data_size_t>(min_num_data_one_leaf_ * 2)) {
best_split_per_leaf_[left_leaf].gain = kMinScore;
if (right_leaf >= 0) {
best_split_per_leaf_[right_leaf].gain = kMinScore;
}
return false;
}
// -1 if there is only one leaf; otherwise the index of the smaller leaf
int smaller_leaf = -1;
// only have root
if (right_leaf < 0) {
smaller_leaf_histogram_array_ = historical_histogram_array_[left_leaf];
larger_leaf_histogram_array_ = nullptr;
} else if (num_data_in_left_child < num_data_in_right_child) {
smaller_leaf = left_leaf;
// put the parent (left) leaf's histograms into the larger leaf's histograms
larger_leaf_histogram_array_ = historical_histogram_array_[left_leaf];
smaller_leaf_histogram_array_ = historical_histogram_array_[right_leaf];
// We will construct histograms for the smaller leaf, and smaller_leaf = left_leaf = parent.
// If we don't swap the cache, we will overwrite the parent's histogram cache.
std::swap(historical_histogram_array_[left_leaf], historical_histogram_array_[right_leaf]);
} else {
smaller_leaf = right_leaf;
// put the parent (left) leaf's histograms into the larger leaf's histograms
larger_leaf_histogram_array_ = historical_histogram_array_[left_leaf];
smaller_leaf_histogram_array_ = historical_histogram_array_[right_leaf];
}
// init the ordered gradients; only needed when there are 2 leaves
if (smaller_leaf >= 0) {
// only need to initialize for smaller leaf
// Get leaf boundary
const data_size_t* indices = data_partition_->indices();
data_size_t begin = data_partition_->leaf_begin(smaller_leaf);
data_size_t end = begin + data_partition_->leaf_count(smaller_leaf);
// copy
#pragma omp parallel for schedule(static)
for (data_size_t i = begin; i < end; ++i) {
ordered_gradients_[i - begin] = gradients_[indices[i]];
ordered_hessians_[i - begin] = hessians_[indices[i]];
}
// assign pointer
ptr_to_ordered_gradients_ = ordered_gradients_;
ptr_to_ordered_hessians_ = ordered_hessians_;
}
// split for the ordered bin
if (has_ordered_bin_ && right_leaf >= 0) {
// mark the data in the left leaf
std::memset(is_data_in_leaf_, 0, sizeof(char)*num_data_);
const data_size_t* indices = data_partition_->indices();
data_size_t begin = data_partition_->leaf_begin(left_leaf);
data_size_t end = begin + data_partition_->leaf_count(left_leaf);
#pragma omp parallel for schedule(static)
for (data_size_t i = begin; i < end; ++i) {
is_data_in_leaf_[indices[i]] = 1;
}
// split the ordered bin
#pragma omp parallel for schedule(guided)
for (int i = 0; i < num_features_; ++i) {
if (ordered_bins_[i] != nullptr) {
ordered_bins_[i]->Split(left_leaf, right_leaf, is_data_in_leaf_);
}
}
}
return true;
}
void SerialTreeLearner::FindBestThresholds() {
#pragma omp parallel for schedule(guided)
for (int feature_index = 0; feature_index < num_features_; feature_index++) {
// feature is not used
if ((is_feature_used_ != nullptr && is_feature_used_[feature_index] == false)) continue;
// if the parent (larger) leaf cannot split on the current feature
if (larger_leaf_histogram_array_ != nullptr && !larger_leaf_histogram_array_[feature_index].is_splittable()) {
smaller_leaf_histogram_array_[feature_index].set_is_splittable(false);
continue;
}
// construct histograms for smaller leaf
if (ordered_bins_[feature_index] == nullptr) {
// if not using an ordered bin
smaller_leaf_histogram_array_[feature_index].Construct(smaller_leaf_splits_->data_indices(),
smaller_leaf_splits_->num_data_in_leaf(),
smaller_leaf_splits_->sum_gradients(),
smaller_leaf_splits_->sum_hessians(),
ptr_to_ordered_gradients_,
ptr_to_ordered_hessians_);
} else {
// using an ordered bin
smaller_leaf_histogram_array_[feature_index].Construct(ordered_bins_[feature_index],
smaller_leaf_splits_->LeafIndex(),
smaller_leaf_splits_->num_data_in_leaf(),
smaller_leaf_splits_->sum_gradients(),
smaller_leaf_splits_->sum_hessians(),
gradients_,
hessians_);
}
// find best threshold for smaller child
smaller_leaf_histogram_array_[feature_index].FindBestThreshold(&smaller_leaf_splits_->BestSplitPerFeature()[feature_index]);
// skip if there is only the root leaf
if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->LeafIndex() < 0) continue;
// construct histograms for the larger leaf: we initialized the larger leaf as the parent,
// so we can just subtract the smaller leaf's histograms
larger_leaf_histogram_array_[feature_index].Subtract(smaller_leaf_histogram_array_[feature_index]);
// find best threshold for larger child
larger_leaf_histogram_array_[feature_index].FindBestThreshold(&larger_leaf_splits_->BestSplitPerFeature()[feature_index]);
}
}
void SerialTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) {
const SplitInfo& best_split_info = best_split_per_leaf_[best_Leaf];
// left = parent
*left_leaf = best_Leaf;
// split tree, will return right leaf
*right_leaf = tree->Split(best_Leaf, best_split_info.feature, best_split_info.threshold,
train_data_->FeatureAt(best_split_info.feature)->feature_index(),
train_data_->FeatureAt(best_split_info.feature)->BinToValue(best_split_info.threshold),
best_split_info.left_output, best_split_info.right_output, best_split_info.gain);
// split data partition
data_partition_->Split(best_Leaf, train_data_->FeatureAt(best_split_info.feature)->bin_data(),
best_split_info.threshold, *right_leaf);
// init the leaves used in the next iteration
if (best_split_info.left_count < best_split_info.right_count) {
smaller_leaf_splits_->Init(*left_leaf, data_partition_,
best_split_info.left_sum_gradient,
best_split_info.left_sum_hessian);
larger_leaf_splits_->Init(*right_leaf, data_partition_,
best_split_info.right_sum_gradient,
best_split_info.right_sum_hessian);
} else {
smaller_leaf_splits_->Init(*right_leaf, data_partition_, best_split_info.right_sum_gradient, best_split_info.right_sum_hessian);
larger_leaf_splits_->Init(*left_leaf, data_partition_, best_split_info.left_sum_gradient, best_split_info.left_sum_hessian);
}
}
} // namespace LightGBM
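// ---------------------------------------------------------------------------
// Illustration (not part of the original commit): the leaf-wise (best-first)
// growth loop in Train() above, stripped to its control flow. Each round
// picks the leaf whose best split has the largest gain and splits it; growth
// stops at num_leaves leaves or when no leaf has a positive-gain split left.
// The gains below are made-up numbers; the real learner recomputes them from
// histograms after every split.
#include <cstdio>
#include <vector>

int main() {
const int num_leaves = 4;
std::vector<double> best_gain = {5.0};  // gain of the best split per leaf
for (int split = 0; split < num_leaves - 1; ++split) {
// pick the leaf with the maximal split gain (ArgMax in the real code)
int best_leaf = 0;
for (size_t i = 1; i < best_gain.size(); ++i) {
if (best_gain[i] > best_gain[best_leaf]) best_leaf = static_cast<int>(i);
}
if (best_gain[best_leaf] <= 0.0) break;  // cannot split, quit
std::printf("split %d: leaf %d (gain=%.2f)\n", split, best_leaf,
best_gain[best_leaf]);
// pretend the split produced two children with smaller best gains
best_gain[best_leaf] = best_gain[best_leaf] / 2.0 - 1.0;
best_gain.push_back(best_gain[best_leaf] / 2.0);
}
return 0;
}
// ---------------------------------------------------------------------------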
#ifndef LIGHTGBM_TREELEARNER_SERIAL_TREE_LEARNER_H_
#define LIGHTGBM_TREELEARNER_SERIAL_TREE_LEARNER_H_
#include <LightGBM/utils/random.h>
#include <LightGBM/utils/array_args.h>
#include <LightGBM/tree_learner.h>
#include <LightGBM/dataset.h>
#include <LightGBM/tree.h>
#include <LightGBM/feature.h>
#include "feature_histogram.hpp"
#include "data_partition.hpp"
#include "split_info.hpp"
#include "leaf_splits.hpp"
#include <cstdio>
#include <vector>
#include <random>
#include <cmath>
namespace LightGBM {
/*!
* \brief Used for learning a tree by single machine
*/
class SerialTreeLearner: public TreeLearner {
public:
explicit SerialTreeLearner(const TreeConfig& tree_config);
~SerialTreeLearner();
void Init(const Dataset* train_data) override;
Tree* Train(const score_t* gradients, const score_t *hessians) override;
void SetBaggingData(const data_size_t* used_indices, data_size_t num_data) override {
data_partition_->SetUsedDataIndices(used_indices, num_data);
}
void AddPredictionToScore(score_t *out_score) const override {
#pragma omp parallel for schedule(guided)
for (int i = 0; i < data_partition_->num_leaves(); ++i) {
double output = last_trained_tree_->LeafOutput(i);
data_size_t* tmp_idx = nullptr;
data_size_t cnt_leaf_data = data_partition_->GetIndexOnLeaf(i, &tmp_idx);
for (data_size_t j = 0; j < cnt_leaf_data; ++j) {
out_score[tmp_idx[j]] += static_cast<score_t>(output);
}
}
}
protected:
/*!
* \brief Some initial work before training
*/
virtual void BeforeTrain();
/*!
* \brief Some initial work before FindBestSplit
*/
virtual bool BeforeFindBestSplit(int left_leaf, int right_leaf);
/*!
* \brief Find best thresholds for all features, using multi-threading.
* The result will be stored in smaller_leaf_splits_ and larger_leaf_splits_.
* This function will be called in FindBestSplit.
*/
virtual void FindBestThresholds();
/*!
* \brief Find the best split for each leaf from smaller_leaf_splits_ and larger_leaf_splits_.
* This function will be called after FindBestThresholds.
*/
inline virtual void FindBestSplitsForLeaves();
/*!
* \brief Partition tree and data according to the best split.
* \param tree Current tree, will be split in this function.
* \param best_leaf The index of the leaf that will be split.
* \param left_leaf The index of the left leaf after the split.
* \param right_leaf The index of the right leaf after the split.
*/
virtual void Split(Tree* tree, int best_leaf, int* left_leaf, int* right_leaf);
/*!
* \brief Get the number of data in a leaf
* \param leaf_idx The index of leaf
* \return The number of data in the leaf_idx leaf
*/
inline virtual data_size_t GetGlobalDataCountInLeaf(int leaf_idx) const;
/*!
* \brief Find the best split for a leaf from leaf_splits
* \param leaf_splits
*/
inline void FindBestSplitForLeaf(LeafSplits* leaf_splits);
/*! \brief Last trained decision tree */
const Tree* last_trained_tree_;
/*! \brief number of data */
data_size_t num_data_;
/*! \brief number of features */
int num_features_;
/*! \brief training data */
const Dataset* train_data_;
/*! \brief gradients of current iteration */
const score_t* gradients_;
/*! \brief hessians of current iteration */
const score_t* hessians_;
/*! \brief number of total leaves */
int num_leaves_;
/*! \brief minimal number of data in one leaf */
data_size_t min_num_data_one_leaf_;
/*! \brief minimal sum of hessians in one leaf */
score_t min_sum_hessian_one_leaf_;
/*! \brief sub-feature fraction rate */
double feature_fraction_;
/*! \brief training data partition on leaves */
DataPartition* data_partition_;
/*! \brief used to sample the used features */
Random random_;
/*! \brief used for sub-feature training; is_feature_used_[i] = false means feature i is not used */
bool* is_feature_used_;
/*! \brief cache of historical histograms to speed up training */
FeatureHistogram** historical_histogram_array_;
/*! \brief pointer to histograms array of smaller leaf */
FeatureHistogram* smaller_leaf_histogram_array_;
/*! \brief pointer to histograms array of larger leaf */
FeatureHistogram* larger_leaf_histogram_array_;
/*! \brief store best split points for all leaves */
std::vector<SplitInfo> best_split_per_leaf_;
/*! \brief stores the best threshold of each feature for the smaller leaf */
LeafSplits* smaller_leaf_splits_;
/*! \brief stores the best threshold of each feature for the larger leaf */
LeafSplits* larger_leaf_splits_;
/*! \brief gradients of current iteration, reordered for cache optimization */
score_t* ordered_gradients_;
/*! \brief hessians of current iteration, reordered for cache optimization */
score_t* ordered_hessians_;
/*! \brief Pointer to ordered_gradients_, use this to avoid copy at BeforeTrain */
const score_t* ptr_to_ordered_gradients_;
/*! \brief Pointer to ordered_hessians_, use this to avoid copy at BeforeTrain */
const score_t* ptr_to_ordered_hessians_;
/*! \brief Store ordered bin */
std::vector<OrderedBin*> ordered_bins_;
/*! \brief True if has ordered bin */
bool has_ordered_bin_ = false;
/*! \brief is_data_in_leaf_[i] != 0 means i-th data is marked */
char* is_data_in_leaf_;
};
inline void SerialTreeLearner::FindBestSplitsForLeaves() {
FindBestSplitForLeaf(smaller_leaf_splits_);
FindBestSplitForLeaf(larger_leaf_splits_);
}
inline data_size_t SerialTreeLearner::GetGlobalDataCountInLeaf(int leafIdx) const {
if (leafIdx >= 0) {
return data_partition_->leaf_count(leafIdx);
} else {
return 0;
}
}
inline void SerialTreeLearner::FindBestSplitForLeaf(LeafSplits* leaf_splits) {
if (leaf_splits == nullptr || leaf_splits->LeafIndex() < 0) {
return;
}
std::vector<double> gains;
for (size_t i = 0; i < leaf_splits->BestSplitPerFeature().size(); ++i) {
gains.push_back(leaf_splits->BestSplitPerFeature()[i].gain);
}
int best_feature = static_cast<int>(ArrayArgs<double>::ArgMax(gains));
int leaf = leaf_splits->LeafIndex();
best_split_per_leaf_[leaf] = leaf_splits->BestSplitPerFeature()[best_feature];
best_split_per_leaf_[leaf].feature = best_feature;
}
} // namespace LightGBM
#endif  // LIGHTGBM_TREELEARNER_SERIAL_TREE_LEARNER_H_
#ifndef LIGHTGBM_TREELEARNER_SPLIT_INFO_HPP_
#define LIGHTGBM_TREELEARNER_SPLIT_INFO_HPP_
#include <LightGBM/meta.h>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <functional>
namespace LightGBM {
/*!
* \brief Used to store some information for gain split point
*/
struct SplitInfo {
public:
/*! \brief Feature index */
int feature;
/*! \brief Split threshold */
unsigned int threshold;
/*! \brief Left output after split */
score_t left_output;
/*! \brief Right output after split */
score_t right_output;
/*! \brief Split gain */
score_t gain;
/*! \brief Left number of data after split */
data_size_t left_count;
/*! \brief Right number of data after split */
data_size_t right_count;
/*! \brief Left sum gradient after split */
score_t left_sum_gradient;
/*! \brief Left sum hessian after split */
score_t left_sum_hessian;
/*! \brief Right sum gradient after split */
score_t right_sum_gradient;
/*! \brief Right sum hessian after split */
score_t right_sum_hessian;
SplitInfo() {
// initialize with -1 and -inf gain
feature = -1;
gain = kMinScore;
}
inline void Reset() {
// initialize with -1 and -inf gain
feature = -1;
gain = kMinScore;
}
inline bool operator > (const SplitInfo &si) const;
inline static void MaxReducer(const char* src, char* dst, int len) {
const int type_size = sizeof(SplitInfo);
int used_size = 0;
const SplitInfo* p1;
SplitInfo* p2;
while (used_size < len) {
p1 = reinterpret_cast<const SplitInfo*>(src);
p2 = reinterpret_cast<SplitInfo*>(dst);
if (*p1 > *p2) {
// copy
std::memcpy(dst, src, type_size);
}
src += type_size;
dst += type_size;
used_size += type_size;
}
}
};
inline bool SplitInfo::operator > (const SplitInfo& si) const {
score_t local_gain = this->gain;
score_t other_gain = si.gain;
// replace NaN with -inf ("gain == NAN" is always false for NaN, so use std::isnan)
if (std::isnan(local_gain)) {
local_gain = kMinScore;
}
// replace NaN with -inf
if (std::isnan(other_gain)) {
other_gain = kMinScore;
}
int local_feature = this->feature;
int other_feature = si.feature;
// replace -1 with max int
if (local_feature == -1) {
local_feature = INT32_MAX;
}
// replace -1 with max int
if (other_feature == -1) {
other_feature = INT32_MAX;
}
if (local_gain != other_gain) {
return local_gain > other_gain;
} else {
// if same gain, use smaller feature
return local_feature < other_feature;
}
}
} // namespace LightGBM
#endif  // LIGHTGBM_TREELEARNER_SPLIT_INFO_HPP_
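To see how MaxReducer combines per-machine candidates, here is a minimal sketch, assuming the split_info.hpp above is reachable on the include path; each slot of the destination buffer ends up holding the higher-gain SplitInfo:
#include <vector>
#include "split_info.hpp"  // assumed include path for the header above

int main() {
  using LightGBM::SplitInfo;
  // Two machines propose best splits for the same two slots.
  std::vector<SplitInfo> local(2), remote(2);
  local[0].feature = 0;  local[0].gain = 0.5f;
  local[1].feature = 1;  local[1].gain = 2.0f;
  remote[0].feature = 0; remote[0].gain = 1.5f;
  remote[1].feature = 1; remote[1].gain = 0.1f;
  // Reduce remote into local: slot 0 takes remote's split (1.5 > 0.5),
  // slot 1 keeps the local one (2.0 > 0.1).
  SplitInfo::MaxReducer(reinterpret_cast<const char*>(remote.data()),
                        reinterpret_cast<char*>(local.data()),
                        static_cast<int>(2 * sizeof(SplitInfo)));
  return 0;
}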
#include <LightGBM/tree_learner.h>
#include "serial_tree_learner.h"
#include "parallel_tree_learner.h"
namespace LightGBM {
TreeLearner* TreeLearner::CreateTreeLearner(TreeLearnerType type, const TreeConfig& tree_config) {
if (type == TreeLearnerType::kSerialTreeLearner) {
return new SerialTreeLearner(tree_config);
} else if (type == TreeLearnerType::kFeatureParallelTreelearner) {
return new FeatureParallelTreeLearner(tree_config);
} else if (type == TreeLearnerType::kDataParallelTreeLearner) {
return new DataParallelTreeLearner(tree_config);
}
return nullptr;
}
} // namespace LightGBM
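A hedged sketch of a typical call site for the factory above. The header and Init(const Dataset*) match declarations seen elsewhere in this commit; the default-constructed TreeConfig, the TrainSketch wrapper, and the elided training loop are assumptions for illustration:
#include <LightGBM/tree_learner.h>

void TrainSketch(const LightGBM::Dataset* train_data) {
  LightGBM::TreeConfig tree_config;  // assumed default-constructible
  LightGBM::TreeLearner* learner = LightGBM::TreeLearner::CreateTreeLearner(
      LightGBM::TreeLearnerType::kSerialTreeLearner, tree_config);
  if (learner == nullptr) { return; }  // unrecognized learner type
  learner->Init(train_data);  // interface shown in the learners above
  // ... per-iteration Train/gradient calls would follow in the GBDT loop ...
  delete learner;
}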

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 14
VisualStudioVersion = 14.0.25123.0
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LightGBM", "LightGBM.vcxproj", "{F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug_mpi|x64 = Debug_mpi|x64
Debug|x64 = Debug|x64
Release_mpi|x64 = Release_mpi|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Debug_mpi|x64.ActiveCfg = Debug_mpi|x64
{F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Debug_mpi|x64.Build.0 = Debug_mpi|x64
{F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Debug|x64.ActiveCfg = Debug|x64
{F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Debug|x64.Build.0 = Debug|x64
{F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Release_mpi|x64.ActiveCfg = Release_mpi|x64
{F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Release_mpi|x64.Build.0 = Release_mpi|x64
{F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Release|x64.ActiveCfg = Release|x64
{F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal
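The solution above defines four x64 configurations: Debug/Release (socket-based parallel learning, USE_SOCKET) and Debug_mpi/Release_mpi (MS-MPI, USE_MPI), per the preprocessor definitions in the project file below. An illustrative build invocation from a Visual Studio developer command prompt (the command line is a sketch, not part of the commit) might be:

msbuild LightGBM.sln /p:Configuration=Release /p:Platform=x64

The _mpi configurations additionally expect the MS-MPI SDK to be installed, since the project reads $(MSMPI_INC) and $(MSMPI_LIB64).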
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="Projectconfigurations">
<ProjectConfiguration Include="Debug_mpi|x64">
<Configuration>Debug_mpi</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release_mpi|x64">
<Configuration>Release_mpi</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}</ProjectGuid>
<RootNamespace>LightGBM</RootNamespace>
<SccProjectName>SAK</SccProjectName>
<SccAuxPath>SAK</SccAuxPath>
<SccLocalPath>SAK</SccLocalPath>
<SccProvider>SAK</SccProvider>
<ProjectName>LightGBM</ProjectName>
<WindowsTargetPlatformVersion>8.1</WindowsTargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Label="Configuration" Condition="'$(Configuration)|$(Platform)'=='Debug_mpi|x64'">
<PlatformToolset>v120</PlatformToolset>
</PropertyGroup>
<PropertyGroup Label="Configuration" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<PlatformToolset>v120</PlatformToolset>
</PropertyGroup>
<PropertyGroup Label="Configuration" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<PlatformToolset>v120</PlatformToolset>
</PropertyGroup>
<PropertyGroup Label="Configuration" Condition="'$(Configuration)|$(Platform)'=='Release_mpi|x64'">
<PlatformToolset>v120</PlatformToolset>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
</ImportGroup>
<ImportGroup Condition="'$(configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Condition="'$(configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(configuration)|$(Platform)'=='Debug|x64'">
<IncludePath>..\include;$(VC_IncludePath);$(WindowsSDK_IncludePath);</IncludePath>
<LibraryPath>$(VC_LibraryPath_x64);$(WindowsSDK_LibraryPath_x64);</LibraryPath>
</PropertyGroup>
<PropertyGroup Condition="'$(configuration)|$(Platform)'=='Debug_mpi|x64'">
<IncludePath>$(MSMPI_INC);..\include;$(VC_IncludePath);$(WindowsSDK_IncludePath);</IncludePath>
<LibraryPath>$(MSMPI_LIB64);$(VC_LibraryPath_x64);$(WindowsSDK_LibraryPath_x64);</LibraryPath>
</PropertyGroup>
<PropertyGroup Condition="'$(configuration)|$(Platform)'=='Release|x64'">
<IncludePath>..\include;$(VC_IncludePath);$(WindowsSDK_IncludePath);</IncludePath>
<LibraryPath>$(VC_LibraryPath_x64);$(WindowsSDK_LibraryPath_x64);</LibraryPath>
</PropertyGroup>
<PropertyGroup Condition="'$(configuration)|$(Platform)'=='Release_mpi|x64'">
<IncludePath>$(MSMPI_INC);..\include;$(VC_IncludePath);$(WindowsSDK_IncludePath);</IncludePath>
<LibraryPath>$(MSMPI_LIB64);$(VC_LibraryPath_x64);$(WindowsSDK_LibraryPath_x64);</LibraryPath>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug_mpi|x64'">
<ClCompile>
<PreprocessorDefinitions>USE_MPI</PreprocessorDefinitions>
<WarningLevel>Level4</WarningLevel>
<OpenMPSupport>true</OpenMPSupport>
<FavorSizeOrSpeed>Neither</FavorSizeOrSpeed>
<InlineFunctionExpansion>Default</InlineFunctionExpansion>
<IntrinsicFunctions>false</IntrinsicFunctions>
<EnableFiberSafeOptimizations>false</EnableFiberSafeOptimizations>
<WholeProgramOptimization>false</WholeProgramOptimization>
<Optimization>Disabled</Optimization>
</ClCompile>
<Link>
<AdditionalLibraryDirectories>
</AdditionalLibraryDirectories>
</Link>
<Link>
<AdditionalDependencies>msmpi.lib</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<PreprocessorDefinitions>USE_SOCKET</PreprocessorDefinitions>
<WarningLevel>Level4</WarningLevel>
<OpenMPSupport>true</OpenMPSupport>
<FavorSizeOrSpeed>Neither</FavorSizeOrSpeed>
<InlineFunctionExpansion>Default</InlineFunctionExpansion>
<IntrinsicFunctions>false</IntrinsicFunctions>
<EnableFiberSafeOptimizations>false</EnableFiberSafeOptimizations>
<WholeProgramOptimization>false</WholeProgramOptimization>
<Optimization>Disabled</Optimization>
</ClCompile>
<Link>
<AdditionalDependencies>
</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(configuration)|$(Platform)'=='Release_mpi|x64'">
<ClCompile>
<WarningLevel>Level4</WarningLevel>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<SDLCheck>true</SDLCheck>
<PreprocessorDefinitions>USE_MPI;_MBCS;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<OpenMPSupport>true</OpenMPSupport>
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
<InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>
<EnableFiberSafeOptimizations>false</EnableFiberSafeOptimizations>
<WholeProgramOptimization>true</WholeProgramOptimization>
<OmitFramePointers>true</OmitFramePointers>
<RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
</ClCompile>
<Link>
<AdditionalLibraryDirectories>
</AdditionalLibraryDirectories>
</Link>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalDependencies>msmpi.lib</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile>
<PreprocessorDefinitions>USE_SOCKET;_MBCS;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<WarningLevel>Level4</WarningLevel>
<OpenMPSupport>true</OpenMPSupport>
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
<InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>
<IntrinsicFunctions>true</IntrinsicFunctions>
<EnableFiberSafeOptimizations>false</EnableFiberSafeOptimizations>
<WholeProgramOptimization>true</WholeProgramOptimization>
<RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
<OmitFramePointers>true</OmitFramePointers>
<FunctionLevelLinking>true</FunctionLevelLinking>
</ClCompile>
<Link>
<AdditionalDependencies />
<OptimizeReferences>true</OptimizeReferences>
</Link>
</ItemDefinitionGroup>
<ItemGroup>
<ClInclude Include="..\include\LightGBM\application.h" />
<ClInclude Include="..\include\LightGBM\bin.h" />
<ClInclude Include="..\include\LightGBM\boosting.h" />
<ClInclude Include="..\include\LightGBM\config.h" />
<ClInclude Include="..\include\LightGBM\dataset.h" />
<ClInclude Include="..\include\LightGBM\feature.h" />
<ClInclude Include="..\include\LightGBM\meta.h" />
<ClInclude Include="..\include\LightGBM\metric.h" />
<ClInclude Include="..\include\LightGBM\network.h" />
<ClInclude Include="..\include\LightGBM\objective_function.h" />
<ClInclude Include="..\include\LightGBM\tree.h" />
<ClInclude Include="..\include\LightGBM\tree_learner.h" />
<ClInclude Include="..\include\LightGBM\utils\array_args.h" />
<ClInclude Include="..\include\LightGBM\utils\common.h" />
<ClInclude Include="..\include\LightGBM\utils\log.h" />
<ClInclude Include="..\include\LightGBM\utils\pipeline_reader.h" />
<ClInclude Include="..\include\LightGBM\utils\random.h" />
<ClInclude Include="..\include\LightGBM\utils\text_reader.h" />
<ClInclude Include="..\include\LightGBM\utils\threading.h" />
<ClInclude Include="..\src\application\predictor.hpp" />
<ClInclude Include="..\src\boosting\gbdt.h" />
<ClInclude Include="..\src\boosting\score_updater.hpp" />
<ClInclude Include="..\src\io\dense_bin.hpp" />
<ClInclude Include="..\src\io\ordered_sparse_bin.hpp" />
<ClInclude Include="..\src\io\parser.hpp" />
<ClInclude Include="..\src\io\sparse_bin.hpp" />
<ClInclude Include="..\src\metric\binary_metric.hpp" />
<ClInclude Include="..\src\metric\rank_metric.hpp" />
<ClInclude Include="..\src\metric\regression_metric.hpp" />
<ClInclude Include="..\src\network\linkers.h" />
<ClInclude Include="..\src\network\socket_wrapper.hpp" />
<ClInclude Include="..\src\objective\binary_objective.hpp" />
<ClInclude Include="..\src\objective\rank_objective.hpp" />
<ClInclude Include="..\src\objective\regression_objective.hpp" />
<ClInclude Include="..\src\treelearner\data_partition.hpp" />
<ClInclude Include="..\src\treelearner\feature_histogram.hpp" />
<ClInclude Include="..\src\treelearner\leaf_splits.hpp" />
<ClInclude Include="..\src\treelearner\parallel_tree_learner.h" />
<ClInclude Include="..\src\treelearner\serial_tree_learner.h" />
<ClInclude Include="..\src\treelearner\split_info.hpp" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\src\application\application.cpp" />
<ClCompile Include="..\src\boosting\boosting.cpp" />
<ClCompile Include="..\src\boosting\gbdt.cpp" />
<ClCompile Include="..\src\io\bin.cpp" />
<ClCompile Include="..\src\io\config.cpp" />
<ClCompile Include="..\src\io\dataset.cpp" />
<ClCompile Include="..\src\io\metadata.cpp" />
<ClCompile Include="..\src\io\parser.cpp" />
<ClCompile Include="..\src\io\tree.cpp" />
<ClCompile Include="..\src\metric\dcg_calculator.cpp" />
<ClCompile Include="..\src\metric\metric.cpp" />
<ClCompile Include="..\src\network\network.cpp" />
<ClCompile Include="..\src\network\linkers_mpi.cpp" />
<ClCompile Include="..\src\network\linkers_socket.cpp" />
<ClCompile Include="..\src\network\linker_topo.cpp" />
<ClCompile Include="..\src\objective\objective_function.cpp" />
<ClCompile Include="..\src\main.cpp" />
<ClCompile Include="..\src\treelearner\data_parallel_tree_learner.cpp" />
<ClCompile Include="..\src\treelearner\feature_parallel_tree_learner.cpp" />
<ClCompile Include="..\src\treelearner\serial_tree_learner.cpp" />
<ClCompile Include="..\src\treelearner\tree_learner.cpp" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
</Project>
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<Filter Include="src">
<UniqueIdentifier>{6e213f6b-b843-4469-bc8c-56c1ffe7f195}</UniqueIdentifier>
</Filter>
<Filter Include="include">
<UniqueIdentifier>{29082261-e6cd-40b2-b30c-c4cb70f23339}</UniqueIdentifier>
</Filter>
<Filter Include="src\application">
<UniqueIdentifier>{3a703e42-6f06-4ab1-8e46-0dfb07407d9e}</UniqueIdentifier>
</Filter>
<Filter Include="src\boosting">
<UniqueIdentifier>{43be32f9-227b-4a15-9c0e-38dbf9747aeb}</UniqueIdentifier>
</Filter>
<Filter Include="src\io">
<UniqueIdentifier>{6fcdaf19-880a-45b0-80db-344be9498017}</UniqueIdentifier>
</Filter>
<Filter Include="src\metric">
<UniqueIdentifier>{8bacb16c-7f31-494f-94df-8ccc6c3e3894}</UniqueIdentifier>
</Filter>
<Filter Include="src\network">
<UniqueIdentifier>{93db474b-4ab8-406b-99ec-eb8e40f97593}</UniqueIdentifier>
</Filter>
<Filter Include="src\objective">
<UniqueIdentifier>{34d576af-dec6-4cad-90bd-f8d0e95ec614}</UniqueIdentifier>
</Filter>
<Filter Include="src\treelearner">
<UniqueIdentifier>{16638c37-41bd-4124-8b80-befbca2f969f}</UniqueIdentifier>
</Filter>
<Filter Include="include\LightGBM">
<UniqueIdentifier>{37b41659-26e2-4b2f-ac0c-7b52d8bd53da}</UniqueIdentifier>
</Filter>
<Filter Include="include\LightGBM\utils">
<UniqueIdentifier>{bf66b9f7-015e-404d-8098-4353abc46956}</UniqueIdentifier>
</Filter>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\src\boosting\gbdt.h">
<Filter>src\boosting</Filter>
</ClInclude>
<ClInclude Include="..\src\network\linkers.h">
<Filter>src\network</Filter>
</ClInclude>
<ClInclude Include="..\src\treelearner\parallel_tree_learner.h">
<Filter>src\treelearner</Filter>
</ClInclude>
<ClInclude Include="..\src\treelearner\serial_tree_learner.h">
<Filter>src\treelearner</Filter>
</ClInclude>
<ClInclude Include="..\src\application\predictor.hpp">
<Filter>src\application</Filter>
</ClInclude>
<ClInclude Include="..\src\boosting\score_updater.hpp">
<Filter>src\boosting</Filter>
</ClInclude>
<ClInclude Include="..\src\io\dense_bin.hpp">
<Filter>src\io</Filter>
</ClInclude>
<ClInclude Include="..\src\io\ordered_sparse_bin.hpp">
<Filter>src\io</Filter>
</ClInclude>
<ClInclude Include="..\src\io\parser.hpp">
<Filter>src\io</Filter>
</ClInclude>
<ClInclude Include="..\src\io\sparse_bin.hpp">
<Filter>src\io</Filter>
</ClInclude>
<ClInclude Include="..\src\metric\binary_metric.hpp">
<Filter>src\metric</Filter>
</ClInclude>
<ClInclude Include="..\src\metric\rank_metric.hpp">
<Filter>src\metric</Filter>
</ClInclude>
<ClInclude Include="..\src\metric\regression_metric.hpp">
<Filter>src\metric</Filter>
</ClInclude>
<ClInclude Include="..\src\network\socket_wrapper.hpp">
<Filter>src\network</Filter>
</ClInclude>
<ClInclude Include="..\src\objective\binary_objective.hpp">
<Filter>src\objective</Filter>
</ClInclude>
<ClInclude Include="..\src\objective\rank_objective.hpp">
<Filter>src\objective</Filter>
</ClInclude>
<ClInclude Include="..\src\objective\regression_objective.hpp">
<Filter>src\objective</Filter>
</ClInclude>
<ClInclude Include="..\src\treelearner\data_partition.hpp">
<Filter>src\treelearner</Filter>
</ClInclude>
<ClInclude Include="..\src\treelearner\feature_histogram.hpp">
<Filter>src\treelearner</Filter>
</ClInclude>
<ClInclude Include="..\src\treelearner\leaf_splits.hpp">
<Filter>src\treelearner</Filter>
</ClInclude>
<ClInclude Include="..\src\treelearner\split_info.hpp">
<Filter>src\treelearner</Filter>
</ClInclude>
<ClInclude Include="..\include\LightGBM\application.h">
<Filter>include\LightGBM</Filter>
</ClInclude>
<ClInclude Include="..\include\LightGBM\bin.h">
<Filter>include\LightGBM</Filter>
</ClInclude>
<ClInclude Include="..\include\LightGBM\boosting.h">
<Filter>include\LightGBM</Filter>
</ClInclude>
<ClInclude Include="..\include\LightGBM\config.h">
<Filter>include\LightGBM</Filter>
</ClInclude>
<ClInclude Include="..\include\LightGBM\dataset.h">
<Filter>include\LightGBM</Filter>
</ClInclude>
<ClInclude Include="..\include\LightGBM\feature.h">
<Filter>include\LightGBM</Filter>
</ClInclude>
<ClInclude Include="..\include\LightGBM\meta.h">
<Filter>include\LightGBM</Filter>
</ClInclude>
<ClInclude Include="..\include\LightGBM\metric.h">
<Filter>include\LightGBM</Filter>
</ClInclude>
<ClInclude Include="..\include\LightGBM\network.h">
<Filter>include\LightGBM</Filter>
</ClInclude>
<ClInclude Include="..\include\LightGBM\objective_function.h">
<Filter>include\LightGBM</Filter>
</ClInclude>
<ClInclude Include="..\include\LightGBM\tree.h">
<Filter>include\LightGBM</Filter>
</ClInclude>
<ClInclude Include="..\include\LightGBM\tree_learner.h">
<Filter>include\LightGBM</Filter>
</ClInclude>
<ClInclude Include="..\include\LightGBM\utils\array_args.h">
<Filter>include\LightGBM\utils</Filter>
</ClInclude>
<ClInclude Include="..\include\LightGBM\utils\common.h">
<Filter>include\LightGBM\utils</Filter>
</ClInclude>
<ClInclude Include="..\include\LightGBM\utils\log.h">
<Filter>include\LightGBM\utils</Filter>
</ClInclude>
<ClInclude Include="..\include\LightGBM\utils\pipeline_reader.h">
<Filter>include\LightGBM\utils</Filter>
</ClInclude>
<ClInclude Include="..\include\LightGBM\utils\random.h">
<Filter>include\LightGBM\utils</Filter>
</ClInclude>
<ClInclude Include="..\include\LightGBM\utils\text_reader.h">
<Filter>include\LightGBM\utils</Filter>
</ClInclude>
<ClInclude Include="..\include\LightGBM\utils\threading.h">
<Filter>include\LightGBM\utils</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\src\application\application.cpp">
<Filter>src\application</Filter>
</ClCompile>
<ClCompile Include="..\src\network\linkers_socket.cpp">
<Filter>src\network</Filter>
</ClCompile>
<ClCompile Include="..\src\network\linkers_mpi.cpp">
<Filter>src\network</Filter>
</ClCompile>
<ClCompile Include="..\src\treelearner\serial_tree_learner.cpp">
<Filter>src\treelearner</Filter>
</ClCompile>
<ClCompile Include="..\src\treelearner\tree_learner.cpp">
<Filter>src\treelearner</Filter>
</ClCompile>
<ClCompile Include="..\src\Boosting\gbdt.cpp">
<Filter>src\boosting</Filter>
</ClCompile>
<ClCompile Include="..\src\io\dataset.cpp">
<Filter>src\io</Filter>
</ClCompile>
<ClCompile Include="..\src\io\bin.cpp">
<Filter>src\io</Filter>
</ClCompile>
<ClCompile Include="..\src\io\tree.cpp">
<Filter>src\io</Filter>
</ClCompile>
<ClCompile Include="..\src\objective\objective_function.cpp">
<Filter>src\objective</Filter>
</ClCompile>
<ClCompile Include="..\src\Boosting\boosting.cpp">
<Filter>src\boosting</Filter>
</ClCompile>
<ClCompile Include="..\src\io\parser.cpp">
<Filter>src\io</Filter>
</ClCompile>
<ClCompile Include="..\src\metric\metric.cpp">
<Filter>src\metric</Filter>
</ClCompile>
<ClCompile Include="..\src\treelearner\data_parallel_tree_learner.cpp">
<Filter>src\treelearner</Filter>
</ClCompile>
<ClCompile Include="..\src\treelearner\feature_parallel_tree_learner.cpp">
<Filter>src\treelearner</Filter>
</ClCompile>
<ClCompile Include="..\src\network\linker_topo.cpp">
<Filter>src\network</Filter>
</ClCompile>
<ClCompile Include="..\src\network\network.cpp">
<Filter>src\network</Filter>
</ClCompile>
<ClCompile Include="..\src\io\config.cpp">
<Filter>src\io</Filter>
</ClCompile>
<ClCompile Include="..\src\metric\dcg_calculator.cpp">
<Filter>src\metric</Filter>
</ClCompile>
<ClCompile Include="..\src\io\metadata.cpp">
<Filter>src\io</Filter>
</ClCompile>
<ClCompile Include="..\src\main.cpp">
<Filter>src</Filter>
</ClCompile>
</ItemGroup>
</Project>