Commit eade219e authored by Qiwei Ye

merge conflict

parents f23e6083 060bd316
......@@ -4,7 +4,6 @@
#include <LightGBM/utils/common.h>
#include <LightGBM/dataset.h>
#include <LightGBM/feature.h>
#include <sstream>
#include <unordered_map>
......@@ -16,11 +15,10 @@
namespace LightGBM {
std::vector<std::function<bool(unsigned int, unsigned int)>> Tree::inner_decision_funs =
{Tree::NumericalDecision<unsigned int>, Tree::CategoricalDecision<unsigned int> };
std::vector<std::function<bool(double, double)>> Tree::decision_funs =
{ Tree::NumericalDecision<double>, Tree::CategoricalDecision<double> };
std::vector<bool(*)(uint32_t, uint32_t)> Tree::inner_decision_funs =
{ Tree::NumericalDecision<uint32_t>, Tree::CategoricalDecision<uint32_t> };
std::vector<bool(*)(double, double)> Tree::decision_funs =
{ Tree::NumericalDecision<double>, Tree::CategoricalDecision<double> };
Tree::Tree(int max_leaves)
:max_leaves_(max_leaves) {
......@@ -28,9 +26,9 @@ Tree::Tree(int max_leaves)
num_leaves_ = 0;
left_child_ = std::vector<int>(max_leaves_ - 1);
right_child_ = std::vector<int>(max_leaves_ - 1);
split_feature_inner = std::vector<int>(max_leaves_ - 1);
split_feature_ = std::vector<int>(max_leaves_ - 1);
split_feature_real_ = std::vector<int>(max_leaves_ - 1);
threshold_in_bin_ = std::vector<unsigned int>(max_leaves_ - 1);
threshold_in_bin_ = std::vector<uint32_t>(max_leaves_ - 1);
threshold_ = std::vector<double>(max_leaves_ - 1);
decision_type_ = std::vector<int8_t>(max_leaves_ - 1);
split_gain_ = std::vector<double>(max_leaves_ - 1);
......@@ -44,12 +42,14 @@ Tree::Tree(int max_leaves)
leaf_depth_[0] = 0;
num_leaves_ = 1;
leaf_parent_[0] = -1;
shrinkage_ = 1.0f;
has_categorical_ = false;
}
Tree::~Tree() {
}
int Tree::Split(int leaf, int feature, BinType bin_type, unsigned int threshold_bin, int real_feature,
int Tree::Split(int leaf, int feature, BinType bin_type, uint32_t threshold_bin, int real_feature,
double threshold_double, double left_value,
double right_value, data_size_t left_cnt, data_size_t right_cnt, double gain) {
int new_node_idx = num_leaves_ - 1;
......@@ -64,15 +64,16 @@ int Tree::Split(int leaf, int feature, BinType bin_type, unsigned int threshold_
}
}
// add new node
split_feature_[new_node_idx] = feature;
split_feature_real_[new_node_idx] = real_feature;
threshold_in_bin_[new_node_idx] = threshold_bin;
threshold_[new_node_idx] = threshold_double;
split_feature_inner[new_node_idx] = feature;
split_feature_[new_node_idx] = real_feature;
if (bin_type == BinType::NumericalBin) {
decision_type_[new_node_idx] = 0;
} else {
has_categorical_ = true;
decision_type_[new_node_idx] = 1;
}
threshold_in_bin_[new_node_idx] = threshold_bin;
threshold_[new_node_idx] = threshold_double;
split_gain_[new_node_idx] = gain;
// add two new leaves
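// children store leaf indices as bitwise complements: child >= 0 means an internal node,
// child < 0 means a leaf, and ~child recovers the leaf index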
left_child_[new_node_idx] = ~leaf;
......@@ -96,36 +97,206 @@ int Tree::Split(int leaf, int feature, BinType bin_type, unsigned int threshold_
}
void Tree::AddPredictionToScore(const Dataset* data, data_size_t num_data, double* score) const {
Threading::For<data_size_t>(0, num_data, [this, data, score](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iterators(data->num_features());
for (int i = 0; i < data->num_features(); ++i) {
iterators[i].reset(data->FeatureAt(i)->bin_data()->GetIterator(start));
if (num_leaves_ <= 1) { return; }
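// four specializations below: with/without categorical splits, and one BinIterator per
// internal node (cheaper when the tree has fewer nodes than the dataset has features)
// versus one BinIterator per feature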
if (has_categorical_) {
if (data->num_features() > num_leaves_ - 1) {
Threading::For<data_size_t>(0, num_data,
[this, &data, score](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(num_leaves_ - 1);
for (int i = 0; i < num_leaves_ - 1; ++i) {
const int fidx = split_feature_inner[i];
iter[i].reset(data->FeatureIterator(fidx));
iter[i]->Reset(start);
}
for (data_size_t i = start; i < end; ++i) {
int node = 0;
while (node >= 0) {
if (inner_decision_funs[decision_type_[node]](
iter[node]->Get(i),
threshold_in_bin_[node])) {
node = left_child_[node];
} else {
node = right_child_[node];
}
}
score[i] += static_cast<double>(leaf_value_[~node]);
}
});
} else {
Threading::For<data_size_t>(0, num_data,
[this, &data, score](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(data->num_features());
for (int i = 0; i < data->num_features(); ++i) {
iter[i].reset(data->FeatureIterator(i));
iter[i]->Reset(start);
}
for (data_size_t i = start; i < end; ++i) {
int node = 0;
while (node >= 0) {
if (inner_decision_funs[decision_type_[node]](
iter[split_feature_inner[node]]->Get(i),
threshold_in_bin_[node])) {
node = left_child_[node];
} else {
node = right_child_[node];
}
}
score[i] += static_cast<double>(leaf_value_[~node]);
}
});
}
for (data_size_t i = start; i < end; ++i) {
score[i] += static_cast<double>(leaf_value_[GetLeaf(iterators, i)]);
} else {
if (data->num_features() > num_leaves_ - 1) {
Threading::For<data_size_t>(0, num_data,
[this, &data, score](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(num_leaves_ - 1);
for (int i = 0; i < num_leaves_ - 1; ++i) {
const int fidx = split_feature_inner[i];
iter[i].reset(data->FeatureIterator(fidx));
iter[i]->Reset(start);
}
for (data_size_t i = start; i < end; ++i) {
int node = 0;
while (node >= 0) {
if (iter[node]->Get(i) <= threshold_in_bin_[node]) {
node = left_child_[node];
} else {
node = right_child_[node];
}
}
score[i] += static_cast<double>(leaf_value_[~node]);
}
});
} else {
Threading::For<data_size_t>(0, num_data,
[this, &data, score](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(data->num_features());
for (int i = 0; i < data->num_features(); ++i) {
iter[i].reset(data->FeatureIterator(i));
iter[i]->Reset(start);
}
for (data_size_t i = start; i < end; ++i) {
int node = 0;
while (node >= 0) {
if (iter[split_feature_inner[node]]->Get(i) <= threshold_in_bin_[node]) {
node = left_child_[node];
} else {
node = right_child_[node];
}
}
score[i] += static_cast<double>(leaf_value_[~node]);
}
});
}
});
}
}
void Tree::AddPredictionToScore(const Dataset* data, const data_size_t* used_data_indices,
data_size_t num_data, double* score) const {
Threading::For<data_size_t>(0, num_data,
[this, data, used_data_indices, score](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iterators(data->num_features());
for (int i = 0; i < data->num_features(); ++i) {
iterators[i].reset(data->FeatureAt(i)->bin_data()->GetIterator(used_data_indices[start]));
void Tree::AddPredictionToScore(const Dataset* data,
const data_size_t* used_data_indices,
data_size_t num_data, double* score) const {
if (num_leaves_ <= 1) { return; }
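// same four-way specialization as the full-data overload above, but only the rows
// listed in used_data_indices are scored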
if (has_categorical_) {
if (data->num_features() > num_leaves_ - 1) {
Threading::For<data_size_t>(0, num_data,
[this, data, used_data_indices, score](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(num_leaves_ - 1);
for (int i = 0; i < num_leaves_ - 1; ++i) {
const int fidx = split_feature_inner[i];
iter[i].reset(data->FeatureIterator(fidx));
iter[i]->Reset(used_data_indices[start]);
}
for (data_size_t i = start; i < end; ++i) {
int node = 0;
const data_size_t idx = used_data_indices[i];
while (node >= 0) {
if (inner_decision_funs[decision_type_[node]](
iter[node]->Get(idx),
threshold_in_bin_[node])) {
node = left_child_[node];
} else {
node = right_child_[node];
}
}
score[idx] += static_cast<double>(leaf_value_[~node]);
}
});
} else {
Threading::For<data_size_t>(0, num_data,
[this, data, used_data_indices, score](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(data->num_features());
for (int i = 0; i < data->num_features(); ++i) {
iter[i].reset(data->FeatureIterator(i));
iter[i]->Reset(used_data_indices[start]);
}
for (data_size_t i = start; i < end; ++i) {
const data_size_t idx = used_data_indices[i];
int node = 0;
while (node >= 0) {
if (inner_decision_funs[decision_type_[node]](
iter[split_feature_inner[node]]->Get(idx),
threshold_in_bin_[node])) {
node = left_child_[node];
} else {
node = right_child_[node];
}
}
score[idx] += static_cast<double>(leaf_value_[~node]);
}
});
}
for (data_size_t i = start; i < end; ++i) {
score[used_data_indices[i]] += static_cast<double>(leaf_value_[GetLeaf(iterators, used_data_indices[i])]);
} else {
if (data->num_features() > num_leaves_ - 1) {
Threading::For<data_size_t>(0, num_data,
[this, data, used_data_indices, score](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(num_leaves_ - 1);
for (int i = 0; i < num_leaves_ - 1; ++i) {
const int fidx = split_feature_inner[i];
iter[i].reset(data->FeatureIterator(fidx));
iter[i]->Reset(used_data_indices[start]);
}
for (data_size_t i = start; i < end; ++i) {
int node = 0;
const data_size_t idx = used_data_indices[i];
while (node >= 0) {
if (iter[node]->Get(idx) <= threshold_in_bin_[node]) {
node = left_child_[node];
} else {
node = right_child_[node];
}
}
score[idx] += static_cast<double>(leaf_value_[~node]);
}
});
} else {
Threading::For<data_size_t>(0, num_data,
[this, data, used_data_indices, score](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(data->num_features());
for (int i = 0; i < data->num_features(); ++i) {
iter[i].reset(data->FeatureIterator(i));
iter[i]->Reset(used_data_indices[start]);
}
for (data_size_t i = start; i < end; ++i) {
const data_size_t idx = used_data_indices[i];
int node = 0;
while (node >= 0) {
if (iter[split_feature_inner[node]]->Get(idx) <= threshold_in_bin_[node]) {
node = left_child_[node];
} else {
node = right_child_[node];
}
}
score[idx] += static_cast<double>(leaf_value_[~node]);
}
});
}
});
}
}
std::string Tree::ToString() {
std::stringstream str_buf;
str_buf << "num_leaves=" << num_leaves_ << std::endl;
str_buf << "split_feature="
<< Common::ArrayToString<int>(split_feature_real_, num_leaves_ - 1, ' ') << std::endl;
<< Common::ArrayToString<int>(split_feature_, num_leaves_ - 1, ' ') << std::endl;
str_buf << "split_gain="
<< Common::ArrayToString<double>(split_gain_, num_leaves_ - 1, ' ') << std::endl;
str_buf << "threshold="
......@@ -146,6 +317,7 @@ std::string Tree::ToString() {
<< Common::ArrayToString<double>(internal_value_, num_leaves_ - 1, ' ') << std::endl;
str_buf << "internal_count="
<< Common::ArrayToString<data_size_t>(internal_count_, num_leaves_ - 1, ' ') << std::endl;
str_buf << "shrinkage=" << shrinkage_ << std::endl;
str_buf << std::endl;
return str_buf.str();
}
......@@ -154,7 +326,7 @@ std::string Tree::ToJSON() {
std::stringstream str_buf;
str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2);
str_buf << "\"num_leaves\":" << num_leaves_ << "," << std::endl;
str_buf << "\"shrinkage\":" << shrinkage_ << "," << std::endl;
str_buf << "\"tree_structure\":" << NodeToJSON(0) << std::endl;
return str_buf.str();
......@@ -167,7 +339,7 @@ std::string Tree::NodeToJSON(int index) {
// non-leaf
str_buf << "{" << std::endl;
str_buf << "\"split_index\":" << index << "," << std::endl;
str_buf << "\"split_feature\":" << split_feature_real_[index] << "," << std::endl;
str_buf << "\"split_feature\":" << split_feature_[index] << "," << std::endl;
str_buf << "\"split_gain\":" << split_gain_[index] << "," << std::endl;
str_buf << "\"threshold\":" << threshold_[index] << "," << std::endl;
str_buf << "\"decision_type\":\"" << Tree::GetDecisionTypeName(decision_type_[index]) << "\"," << std::endl;
......@@ -208,7 +380,8 @@ Tree::Tree(const std::string& str) {
|| key_vals.count("left_child") <= 0 || key_vals.count("right_child") <= 0
|| key_vals.count("leaf_parent") <= 0 || key_vals.count("leaf_value") <= 0
|| key_vals.count("internal_value") <= 0 || key_vals.count("internal_count") <= 0
|| key_vals.count("leaf_count") <= 0 || key_vals.count("decision_type") <= 0
|| key_vals.count("leaf_count") <= 0 || key_vals.count("shrinkage") <= 0
|| key_vals.count("decision_type") <= 0
) {
Log::Fatal("Tree model string format error");
}
......@@ -217,17 +390,17 @@ Tree::Tree(const std::string& str) {
left_child_ = Common::StringToArray<int>(key_vals["left_child"], ' ', num_leaves_ - 1);
right_child_ = Common::StringToArray<int>(key_vals["right_child"], ' ', num_leaves_ - 1);
split_feature_real_ = Common::StringToArray<int>(key_vals["split_feature"], ' ', num_leaves_ - 1);
split_feature_ = Common::StringToArray<int>(key_vals["split_feature"], ' ', num_leaves_ - 1);
threshold_ = Common::StringToArray<double>(key_vals["threshold"], ' ', num_leaves_ - 1);
decision_type_ = Common::StringToArray<int8_t>(key_vals["decision_type"], ' ', num_leaves_ - 1);
split_gain_ = Common::StringToArray<double>(key_vals["split_gain"], ' ', num_leaves_ - 1);
internal_count_ = Common::StringToArray<data_size_t>(key_vals["internal_count"], ' ', num_leaves_ - 1);
internal_value_ = Common::StringToArray<double>(key_vals["internal_value"], ' ', num_leaves_ - 1);
decision_type_ = Common::StringToArray<int8_t>(key_vals["decision_type"], ' ', num_leaves_ - 1);
leaf_count_ = Common::StringToArray<data_size_t>(key_vals["leaf_count"], ' ', num_leaves_);
leaf_parent_ = Common::StringToArray<int>(key_vals["leaf_parent"], ' ', num_leaves_);
leaf_value_ = Common::StringToArray<double>(key_vals["leaf_value"], ' ', num_leaves_);
Common::Atof(key_vals["shrinkage"].c_str(), &shrinkage_);
}
} // namespace LightGBM
......@@ -2,22 +2,6 @@
#include <LightGBM/application.h>
int main(int argc, char** argv) {
try {
LightGBM::Application app(argc, argv);
app.Run();
}
catch (const std::exception& ex) {
std::cerr << "Met Exceptions:" << std::endl;
std::cerr << ex.what() << std::endl;
exit(-1);
}
catch (const std::string& ex) {
std::cerr << "Met Exceptions:" << std::endl;
std::cerr << ex << std::endl;
exit(-1);
}
catch (...) {
std::cerr << "Unknown Exceptions" << std::endl;
exit(-1);
}
LightGBM::Application app(argc, argv);
app.Run();
}
......@@ -63,7 +63,7 @@ public:
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
// sigmoid transform
double prob = 1.0f / (1.0f + std::exp(-2.0f * sigmoid_ * score[i]));
double prob = 1.0f / (1.0f + std::exp(-sigmoid_ * score[i]));
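// the former 2x factor inside the exponent is now folded into sigmoid_ itself,
// matching the updated binary objective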
// add loss
sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], prob);
}
......@@ -71,7 +71,7 @@ public:
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
// sigmoid transform
double prob = 1.0f / (1.0f + std::exp(-2.0f * sigmoid_ * score[i]));
double prob = 1.0f / (1.0f + std::exp(-sigmoid_ * score[i]));
// add loss
sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], prob) * weights_[i];
}
......@@ -103,7 +103,7 @@ public:
explicit BinaryLoglossMetric(const MetricConfig& config) :BinaryMetric<BinaryLoglossMetric>(config) {}
inline static double LossOnPoint(float label, double prob) {
if (label == 0) {
if (label <= 0) {
if (1.0f - prob > kEpsilon) {
return -std::log(1.0f - prob);
}
......@@ -128,9 +128,9 @@ public:
inline static double LossOnPoint(float label, double prob) {
if (prob <= 0.5f) {
return label;
return label > 0;
} else {
return 1.0f - label;
return label <= 0;
}
}
......@@ -207,8 +207,8 @@ public:
// reset
cur_neg = cur_pos = 0.0f;
}
cur_neg += 1.0f - cur_label;
cur_pos += cur_label;
cur_neg += (cur_label <= 0);
cur_pos += (cur_label > 0);
}
} else { // has weights
for (data_size_t i = 0; i < num_data_; ++i) {
......@@ -224,8 +224,8 @@ public:
// reset
cur_neg = cur_pos = 0.0f;
}
cur_neg += (1.0f - cur_label)*cur_weight;
cur_pos += cur_label*cur_weight;
cur_neg += (cur_label <= 0)*cur_weight;
cur_pos += (cur_label > 0)*cur_weight;
}
}
accum += cur_neg*(cur_pos * 0.5f + sum_pos);
......
#ifndef LIGHTGBM_METRIC_MAP_METRIC_HPP_
#define LIGHTGBM_METRIC_MAP_METRIC_HPP_
#include <LightGBM/utils/common.h>
#include <LightGBM/utils/log.h>
#include <LightGBM/metric.h>
#include <LightGBM/utils/openmp_wrapper.h>
#include <sstream>
#include <vector>
namespace LightGBM {
class MapMetric:public Metric {
public:
explicit MapMetric(const MetricConfig& config) {
// get eval position
for (auto k : config.eval_at) {
eval_at_.push_back(static_cast<data_size_t>(k));
}
// get number of threads
#pragma omp parallel
#pragma omp master
{
num_threads_ = omp_get_num_threads();
}
}
~MapMetric() {
}
void Init(const Metadata& metadata, data_size_t num_data) override {
std::stringstream str_buf;
for (auto k : eval_at_) {
name_.emplace_back(std::string("map@") + std::to_string(k));
}
num_data_ = num_data;
// get label
label_ = metadata.label();
// get query boundaries
query_boundaries_ = metadata.query_boundaries();
if (query_boundaries_ == nullptr) {
Log::Fatal("For MAP metric, there should be query information");
}
num_queries_ = metadata.num_queries();
Log::Info("total groups: %d , total data: %d", num_queries_, num_data_);
// get query weights
query_weights_ = metadata.query_weights();
if (query_weights_ == nullptr) {
sum_query_weights_ = static_cast<double>(num_queries_);
} else {
sum_query_weights_ = 0.0f;
for (data_size_t i = 0; i < num_queries_; ++i) {
sum_query_weights_ += query_weights_[i];
}
}
}
const std::vector<std::string>& GetName() const override {
return name_;
}
double factor_to_bigger_better() const override {
return 1.0f;
}
void CalMapAtK(std::vector<int> ks, const float* label,
const double* score, data_size_t num_data, std::vector<double>* out) const {
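// assumes ks is ascending: the score-sorted list is walked once, each cut-off k
// extending the scan from the previous one while precision is accumulated at every hit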
// get sorted indices by score
std::vector<data_size_t> sorted_idx;
for (data_size_t i = 0; i < num_data; ++i) {
sorted_idx.emplace_back(i);
}
std::sort(sorted_idx.begin(), sorted_idx.end(),
[score](data_size_t a, data_size_t b) {return score[a] > score[b]; });
int num_hit = 0;
double sum_ap = 0.0f;
data_size_t cur_left = 0;
for (size_t i = 0; i < ks.size(); ++i) {
data_size_t cur_k = ks[i];
if (cur_k > num_data) { cur_k = num_data; }
for (data_size_t j = cur_left; j < cur_k; ++j) {
data_size_t idx = sorted_idx[j];
if (label[idx] > 0.5f) {
++num_hit;
sum_ap += static_cast<double>(num_hit) / (j + 1.0f);
}
}
(*out)[i] = sum_ap / cur_k;
cur_left = cur_k;
}
}
std::vector<double> Eval(const double* score) const override {
// per-thread buffers for multi-threaded accumulation
std::vector<std::vector<double>> result_buffer_;
for (int i = 0; i < num_threads_; ++i) {
result_buffer_.emplace_back(eval_at_.size(), 0.0f);
}
std::vector<double> tmp_map(eval_at_.size(), 0.0f);
if (query_weights_ == nullptr) {
#pragma omp parallel for schedule(guided) firstprivate(tmp_map)
for (data_size_t i = 0; i < num_queries_; ++i) {
const int tid = omp_get_thread_num();
CalMapAtK(eval_at_, label_ + query_boundaries_[i],
score + query_boundaries_[i], query_boundaries_[i + 1] - query_boundaries_[i], &tmp_map);
for (size_t j = 0; j < eval_at_.size(); ++j) {
result_buffer_[tid][j] += tmp_map[j];
}
}
} else {
#pragma omp parallel for schedule(guided) firstprivate(tmp_map)
for (data_size_t i = 0; i < num_queries_; ++i) {
const int tid = omp_get_thread_num();
CalMapAtK(eval_at_, label_ + query_boundaries_[i],
score + query_boundaries_[i], query_boundaries_[i + 1] - query_boundaries_[i], &tmp_map);
for (size_t j = 0; j < eval_at_.size(); ++j) {
result_buffer_[tid][j] += tmp_map[j] * query_weights_[i];
}
}
}
// Get final average MAP
std::vector<double> result(eval_at_.size(), 0.0f);
for (size_t j = 0; j < result.size(); ++j) {
for (int i = 0; i < num_threads_; ++i) {
result[j] += result_buffer_[i][j];
}
result[j] /= sum_query_weights_;
}
return result;
}
private:
/*! \brief Number of data */
data_size_t num_data_;
/*! \brief Pointer of label */
const float* label_;
/*! \brief Query boundaries information */
const data_size_t* query_boundaries_;
/*! \brief Number of queries */
data_size_t num_queries_;
/*! \brief Weights of queries */
const float* query_weights_;
/*! \brief Sum weights of queries */
double sum_query_weights_;
/*! \brief Evaluation positions for MAP */
std::vector<data_size_t> eval_at_;
/*! \brief Number of threads */
int num_threads_;
std::vector<std::string> name_;
};
} // namespace LightGBM
#endif // LIGHTGBM_METRIC_MAP_METRIC_HPP_
......@@ -2,6 +2,7 @@
#include "regression_metric.hpp"
#include "binary_metric.hpp"
#include "rank_metric.hpp"
#include "map_metric.hpp"
#include "multiclass_metric.hpp"
namespace LightGBM {
......@@ -15,6 +16,8 @@ Metric* Metric::CreateMetric(const std::string& type, const MetricConfig& config
return new HuberLossMetric(config);
} else if (type == std::string("fair")) {
return new FairLossMetric(config);
} else if (type == std::string("poisson")) {
return new PoissonMetric(config);
} else if (type == std::string("binary_logloss")) {
return new BinaryLoglossMetric(config);
} else if (type == std::string("binary_error")) {
......@@ -23,6 +26,8 @@ Metric* Metric::CreateMetric(const std::string& type, const MetricConfig& config
return new AUCMetric(config);
} else if (type == std::string("ndcg")) {
return new NDCGMetric(config);
} else if (type == std::string("map")) {
return new MapMetric(config);
} else if (type == std::string("multi_logloss")) {
return new MultiLoglossMetric(config);
} else if (type == std::string("multi_error")) {
......
......@@ -6,7 +6,7 @@
#include <LightGBM/metric.h>
#include <omp.h>
#include <LightGBM/utils/openmp_wrapper.h>
#include <sstream>
#include <vector>
......@@ -90,7 +90,7 @@ public:
}
std::vector<double> tmp_dcg(eval_at_.size(), 0.0f);
if (query_weights_ == nullptr) {
#pragma omp parallel for schedule(guided) firstprivate(tmp_dcg)
#pragma omp parallel for schedule(static) firstprivate(tmp_dcg)
for (data_size_t i = 0; i < num_queries_; ++i) {
const int tid = omp_get_thread_num();
// if all documents in this query are negative, set its NDCG to 1
......@@ -110,7 +110,7 @@ public:
}
}
} else {
#pragma omp parallel for schedule(guided) firstprivate(tmp_dcg)
#pragma omp parallel for schedule(static) firstprivate(tmp_dcg)
for (data_size_t i = 0; i < num_queries_; ++i) {
const int tid = omp_get_thread_num();
// if all documents in this query are negative, set its NDCG to 1
......
......@@ -162,5 +162,23 @@ public:
}
};
/*! \brief Poisson regression loss for regression task */
class PoissonMetric: public RegressionMetric<PoissonMetric> {
public:
explicit PoissonMetric(const MetricConfig& config) :RegressionMetric<PoissonMetric>(config) {
}
inline static double LossOnPoint(float label, double score, double, double) {
const double eps = 1e-10f;
if (score < eps) {
score = eps;
}
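// negative Poisson log-likelihood with the predicted mean clamped to eps;
// the score-independent log(label!) term is constant and omitted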
return score - label * std::log(score);
}
inline static const char* Name() {
return "poisson";
}
};
} // namespace LightGBM
#endif   // LIGHTGBM_METRIC_REGRESSION_METRIC_HPP_
......@@ -25,7 +25,7 @@ Linkers::Linkers(NetworkConfig config) {
local_listen_port_ = config.local_listen_port;
socket_timeout_ = config.time_out;
rank_ = -1;
// parser clients from file
// parse clients from file
ParseMachineList(config.machine_list_filename.c_str());
if (rank_ == -1) {
......
......@@ -28,14 +28,15 @@ public:
data_size_t cnt_positive = 0;
data_size_t cnt_negative = 0;
// count for positive and negative samples
#pragma omp parallel for schedule(static) reduction(+:cnt_positive, cnt_negative)
for (data_size_t i = 0; i < num_data_; ++i) {
if (label_[i] == 1) {
if (label_[i] > 0) {
++cnt_positive;
} else {
++cnt_negative;
}
}
Log::Info("Number of postive: %d, number of negative: %d", cnt_positive, cnt_negative);
Log::Info("Number of positive: %d, number of negative: %d", cnt_positive, cnt_negative);
// cannot continue if all samples are of the same class
if (cnt_positive == 0 || cnt_negative == 0) {
Log::Fatal("Training data only contains one class");
......@@ -64,25 +65,27 @@ public:
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
// get label and label weights
const int label = label_val_[static_cast<int>(label_[i])];
const double label_weight = label_weights_[static_cast<int>(label_[i])];
const int is_pos = label_[i] > 0;
const int label = label_val_[is_pos];
const double label_weight = label_weights_[is_pos];
// calculate gradients and hessians
const double response = -2.0f * label * sigmoid_ / (1.0f + std::exp(2.0f * label * sigmoid_ * score[i]));
const double response = -label * sigmoid_ / (1.0f + std::exp(label * sigmoid_ * score[i]));
const double abs_response = fabs(response);
gradients[i] = static_cast<score_t>(response * label_weight);
hessians[i] = static_cast<score_t>(abs_response * (2.0f * sigmoid_ - abs_response) * label_weight);
hessians[i] = static_cast<score_t>(abs_response * (sigmoid_ - abs_response) * label_weight);
}
} else {
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
// get label and label weights
const int label = label_val_[static_cast<int>(label_[i])];
const double label_weight = label_weights_[static_cast<int>(label_[i])];
const int is_pos = label_[i] > 0;
const int label = label_val_[is_pos];
const double label_weight = label_weights_[is_pos];
// calculate gradients and hessians
const double response = -2.0f * label * sigmoid_ / (1.0f + std::exp(2.0f * label * sigmoid_ * score[i]));
const double response = -label * sigmoid_ / (1.0f + std::exp(label * sigmoid_ * score[i]));
const double abs_response = fabs(response);
gradients[i] = static_cast<score_t>(response * label_weight * weights_[i]);
hessians[i] = static_cast<score_t>(abs_response * (2.0f * sigmoid_ - abs_response) * label_weight * weights_[i]);
hessians[i] = static_cast<score_t>(abs_response * (sigmoid_ - abs_response) * label_weight * weights_[i]);
}
}
}
......
......@@ -14,6 +14,7 @@ class MulticlassLogloss: public ObjectiveFunction {
public:
explicit MulticlassLogloss(const ObjectiveConfig& config) {
num_class_ = config.num_class;
is_unbalance_ = config.is_unbalance;
}
~MulticlassLogloss() {
......@@ -24,12 +25,25 @@ public:
label_ = metadata.label();
weights_ = metadata.weights();
label_int_.resize(num_data_);
for (int i = 0; i < num_data_; ++i){
label_int_[i] = static_cast<int>(label_[i]);
if (label_int_[i] < 0 || label_int_[i] >= num_class_) {
Log::Fatal("Label must be in [0, %d), but found %d in label", num_class_, label_int_[i]);
}
#pragma omp parallel for schedule(static)
for (int i = 0; i < num_data_; ++i) {
label_int_[i] = static_cast<int>(label_[i]);
if (label_int_[i] < 0 || label_int_[i] >= num_class_) {
Log::Fatal("Label must be in [0, %d), but found %d in label", num_class_, label_int_[i]);
}
}
label_pos_weights_ = std::vector<float>(num_class_, 1);
if (is_unbalance_) {
std::vector<int> cnts(num_class_, 0);
for (int i = 0; i < num_data_; ++i) {
++cnts[label_int_[i]];
}
for (int i = 0; i < num_class_; ++i) {
int cnt_cur = cnts[i];
int cnt_other = (num_data_ - cnts[i]);
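// inverse-frequency weighting: classes with fewer samples receive proportionally larger weights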
label_pos_weights_[i] = static_cast<float>(cnt_other) / cnt_cur;
}
}
}
void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override {
......@@ -46,11 +60,12 @@ public:
auto p = rec[k];
size_t idx = static_cast<size_t>(num_data_) * k + i;
if (label_int_[i] == k) {
gradients[idx] = static_cast<score_t>(p - 1.0f);
gradients[idx] = static_cast<score_t>(p - 1.0f) * label_pos_weights_[k];
hessians[idx] = static_cast<score_t>(2.0f * p * (1.0f - p))* label_pos_weights_[k];
} else {
gradients[idx] = static_cast<score_t>(p);
hessians[idx] = static_cast<score_t>(2.0f * p * (1.0f - p));
}
hessians[idx] = static_cast<score_t>(2.0f * p * (1.0f - p));
}
}
} else {
......@@ -66,11 +81,13 @@ public:
auto p = rec[k];
size_t idx = static_cast<size_t>(num_data_) * k + i;
if (label_int_[i] == k) {
gradients[idx] = static_cast<score_t>((p - 1.0f) * weights_[i]);
gradients[idx] = static_cast<score_t>((p - 1.0f) * weights_[i]) * label_pos_weights_[k];
hessians[idx] = static_cast<score_t>(2.0f * p * (1.0f - p) * weights_[i]) * label_pos_weights_[k];
} else {
gradients[idx] = static_cast<score_t>(p * weights_[i]);
hessians[idx] = static_cast<score_t>(2.0f * p * (1.0f - p) * weights_[i]);
}
hessians[idx] = static_cast<score_t>(2.0f * p * (1.0f - p) * weights_[i]);
}
}
}
......@@ -91,6 +108,9 @@ private:
std::vector<int> label_int_;
/*! \brief Weights for data */
const float* weights_;
/*! \brief Weights for label */
std::vector<float> label_pos_weights_;
bool is_unbalance_;
};
} // namespace LightGBM
......
......@@ -16,6 +16,8 @@ ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string&
return new RegressionHuberLoss(config);
} else if (type == std::string("fair")) {
return new RegressionFairLoss(config);
} else if (type == std::string("poisson")) {
return new RegressionPoissonLoss(config);
} else if (type == std::string("binary")) {
return new BinaryLogloss(config);
} else if (type == std::string("lambdarank")) {
......
......@@ -52,6 +52,7 @@ public:
num_queries_ = metadata.num_queries();
// cache the inverse max DCG to avoid repeated computation
inverse_max_dcgs_.resize(num_queries_);
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_queries_; ++i) {
inverse_max_dcgs_[i] = DCGCalculator::CalMaxDCGAtK(optimize_pos_at_,
label_ + query_boundaries_[i],
......
......@@ -236,5 +236,55 @@ private:
double c_;
};
/*!
* \brief Objective function for Poisson regression
*/
class RegressionPoissonLoss: public ObjectiveFunction {
public:
explicit RegressionPoissonLoss(const ObjectiveConfig& config) {
max_delta_step_ = static_cast<double>(config.poisson_max_delta_step);
}
~RegressionPoissonLoss() {}
void Init(const Metadata& metadata, data_size_t num_data) override {
num_data_ = num_data;
label_ = metadata.label();
weights_ = metadata.weights();
}
void GetGradients(const double* score, score_t* gradients,
score_t* hessians) const override {
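// max_delta_step_ pads the hessian so the leaf output (-G / H) stays bounded,
// safeguarding the optimization when the curvature is tiny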
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
gradients[i] = static_cast<score_t>(score[i] - label_[i]);
hessians[i] = static_cast<score_t>(score[i] + max_delta_step_);
}
} else {
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
gradients[i] = static_cast<score_t>((score[i] - label_[i]) * weights_[i]);
hessians[i] = static_cast<score_t>((score[i] + max_delta_step_) * weights_[i]);
}
}
}
const char* GetName() const override {
return "poisson";
}
private:
/*! \brief Number of data */
data_size_t num_data_;
/*! \brief Pointer of label */
const float* label_;
/*! \brief Pointer of weights */
const float* weights_;
/*! \brief used to safeguard optimization */
double max_delta_step_;
};
} // namespace LightGBM
#endif  // LIGHTGBM_OBJECTIVE_REGRESSION_OBJECTIVE_HPP_
......@@ -22,10 +22,7 @@ void DataParallelTreeLearner::Init(const Dataset* train_data) {
rank_ = Network::rank();
num_machines_ = Network::num_machines();
// allocate buffer for communication
size_t buffer_size = 0;
for (int i = 0; i < num_features_; ++i) {
buffer_size += train_data_->FeatureAt(i)->num_bin() * sizeof(HistogramBinEntry);
}
size_t buffer_size = train_data_->NumTotalBin() * sizeof(HistogramBinEntry);
input_buffer_.resize(buffer_size);
output_buffer_.resize(buffer_size);
......@@ -50,13 +47,19 @@ void DataParallelTreeLearner::BeforeTrain() {
// generate feature partition for current tree
std::vector<std::vector<int>> feature_distribution(num_machines_, std::vector<int>());
std::vector<int> num_bins_distributed(num_machines_, 0);
for (int i = 0; i < train_data_->num_features(); ++i) {
if (is_feature_used_[i]) {
for (int i = 0; i < train_data_->num_total_features(); ++i) {
int inner_feature_index = train_data_->InnerFeatureIndex(i);
if (inner_feature_index == -1) { continue; }
if (is_feature_used_[inner_feature_index]) {
int cur_min_machine = static_cast<int>(ArrayArgs<int>::ArgMin(num_bins_distributed));
feature_distribution[cur_min_machine].push_back(i);
num_bins_distributed[cur_min_machine] += train_data_->FeatureAt(i)->num_bin();
feature_distribution[cur_min_machine].push_back(inner_feature_index);
auto num_bin = train_data_->FeatureNumBin(inner_feature_index);
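// a default bin of 0 is not materialized in the histogram, so it is excluded
// when budgeting bins per machine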
if (train_data_->FeatureBinMapper(inner_feature_index)->GetDefaultBin() == 0) {
num_bin -= 1;
}
num_bins_distributed[cur_min_machine] += num_bin;
}
is_feature_aggregated_[i] = false;
is_feature_aggregated_[inner_feature_index] = false;
}
// get local used feature
for (auto fid : feature_distribution[rank_]) {
......@@ -68,7 +71,11 @@ void DataParallelTreeLearner::BeforeTrain() {
for (int i = 0; i < num_machines_; ++i) {
block_len_[i] = 0;
for (auto fid : feature_distribution[i]) {
block_len_[i] += train_data_->FeatureAt(fid)->num_bin() * sizeof(HistogramBinEntry);
auto num_bin = train_data_->FeatureNumBin(fid);
if (train_data_->FeatureBinMapper(fid)->GetDefaultBin() == 0) {
num_bin -= 1;
}
block_len_[i] += num_bin * sizeof(HistogramBinEntry);
}
reduce_scatter_size_ += block_len_[i];
}
......@@ -83,7 +90,11 @@ void DataParallelTreeLearner::BeforeTrain() {
for (int i = 0; i < num_machines_; ++i) {
for (auto fid : feature_distribution[i]) {
buffer_write_start_pos_[fid] = bin_size;
bin_size += train_data_->FeatureAt(fid)->num_bin() * sizeof(HistogramBinEntry);
auto num_bin = train_data_->FeatureNumBin(fid);
if (train_data_->FeatureBinMapper(fid)->GetDefaultBin() == 0) {
num_bin -= 1;
}
bin_size += num_bin * sizeof(HistogramBinEntry);
}
}
......@@ -91,12 +102,16 @@ void DataParallelTreeLearner::BeforeTrain() {
bin_size = 0;
for (auto fid : feature_distribution[rank_]) {
buffer_read_start_pos_[fid] = bin_size;
bin_size += train_data_->FeatureAt(fid)->num_bin() * sizeof(HistogramBinEntry);
auto num_bin = train_data_->FeatureNumBin(fid);
if (train_data_->FeatureBinMapper(fid)->GetDefaultBin() == 0) {
num_bin -= 1;
}
bin_size += num_bin * sizeof(HistogramBinEntry);
}
// sync global data sumup info
std::tuple<data_size_t, double, double> data(smaller_leaf_splits_->num_data_in_leaf(),
smaller_leaf_splits_->sum_gradients(), smaller_leaf_splits_->sum_hessians());
smaller_leaf_splits_->sum_gradients(), smaller_leaf_splits_->sum_hessians());
int size = sizeof(data);
std::memcpy(input_buffer_.data(), &data, size);
// global sumup reduce
......@@ -125,88 +140,88 @@ void DataParallelTreeLearner::BeforeTrain() {
}
void DataParallelTreeLearner::FindBestThresholds() {
train_data_->ConstructHistograms(is_feature_used_,
smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(),
smaller_leaf_splits_->LeafIndex(),
ordered_bins_, gradients_, hessians_,
ordered_gradients_.data(), ordered_hessians_.data(),
smaller_leaf_histogram_array_[0].RawData() - 1);
// construct local histograms
#pragma omp parallel for schedule(guided)
#pragma omp parallel for schedule(static)
for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
if ((!is_feature_used_.empty() && is_feature_used_[feature_index] == false)) continue;
// construct histograms for smaller leaf
if (ordered_bins_[feature_index] == nullptr) {
smaller_leaf_histogram_array_[feature_index].Construct(smaller_leaf_splits_->data_indices(),
smaller_leaf_splits_->num_data_in_leaf(),
smaller_leaf_splits_->sum_gradients(),
smaller_leaf_splits_->sum_hessians(),
ptr_to_ordered_gradients_smaller_leaf_,
ptr_to_ordered_hessians_smaller_leaf_);
} else {
smaller_leaf_histogram_array_[feature_index].Construct(ordered_bins_[feature_index].get(),
smaller_leaf_splits_->LeafIndex(),
smaller_leaf_splits_->num_data_in_leaf(),
smaller_leaf_splits_->sum_gradients(),
smaller_leaf_splits_->sum_hessians(),
gradients_,
hessians_);
}
// copy to buffer
std::memcpy(input_buffer_.data() + buffer_write_start_pos_[feature_index],
smaller_leaf_histogram_array_[feature_index].HistogramData(),
smaller_leaf_histogram_array_[feature_index].SizeOfHistgram());
smaller_leaf_histogram_array_[feature_index].RawData(),
smaller_leaf_histogram_array_[feature_index].SizeOfHistgram());
}
// Reduce scatter for histogram
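// each machine owns a disjoint block of features: ReduceScatter sums the per-bin
// entries across machines and leaves every machine with the global histograms for
// its own feature block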
Network::ReduceScatter(input_buffer_.data(), reduce_scatter_size_, block_start_.data(),
block_len_.data(), output_buffer_.data(), &HistogramBinEntry::SumReducer);
#pragma omp parallel for schedule(guided)
block_len_.data(), output_buffer_.data(), &HistogramBinEntry::SumReducer);
std::vector<SplitInfo> smaller_best(num_threads_, SplitInfo());
std::vector<SplitInfo> larger_best(num_threads_, SplitInfo());
#pragma omp parallel for schedule(static)
for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
if (!is_feature_aggregated_[feature_index]) continue;
// copy global sumup info
smaller_leaf_histogram_array_[feature_index].SetSumup(
GetGlobalDataCountInLeaf(smaller_leaf_splits_->LeafIndex()),
smaller_leaf_splits_->sum_gradients(),
smaller_leaf_splits_->sum_hessians());
const int tid = omp_get_thread_num();
// restore global histograms from buffer
smaller_leaf_histogram_array_[feature_index].FromMemory(
output_buffer_.data() + buffer_read_start_pos_[feature_index]);
output_buffer_.data() + buffer_read_start_pos_[feature_index]);
train_data_->FixHistogram(feature_index,
smaller_leaf_splits_->sum_gradients(), smaller_leaf_splits_->sum_hessians(),
GetGlobalDataCountInLeaf(smaller_leaf_splits_->LeafIndex()),
smaller_leaf_histogram_array_[feature_index].RawData());
SplitInfo smaller_split;
// find best threshold for smaller child
smaller_leaf_histogram_array_[feature_index].FindBestThreshold(
&smaller_leaf_splits_->BestSplitPerFeature()[feature_index]);
smaller_leaf_splits_->sum_gradients(),
smaller_leaf_splits_->sum_hessians(),
GetGlobalDataCountInLeaf(smaller_leaf_splits_->LeafIndex()),
&smaller_split);
if (smaller_split.gain > smaller_best[tid].gain) {
smaller_best[tid] = smaller_split;
smaller_best[tid].feature = train_data_->RealFeatureIndex(feature_index);
}
// only root leaf
if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->LeafIndex() < 0) continue;
// construct histograms for the larger leaf; since the larger leaf was initialized as the parent, we can simply subtract the smaller leaf's histograms
larger_leaf_histogram_array_[feature_index].Subtract(
smaller_leaf_histogram_array_[feature_index]);
// set sumup info for histogram
larger_leaf_histogram_array_[feature_index].SetSumup(
GetGlobalDataCountInLeaf(larger_leaf_splits_->LeafIndex()),
larger_leaf_splits_->sum_gradients(), larger_leaf_splits_->sum_hessians());
smaller_leaf_histogram_array_[feature_index]);
SplitInfo larger_split;
// find best threshold for larger child
larger_leaf_histogram_array_[feature_index].FindBestThreshold(
&larger_leaf_splits_->BestSplitPerFeature()[feature_index]);
larger_leaf_splits_->sum_gradients(),
larger_leaf_splits_->sum_hessians(),
GetGlobalDataCountInLeaf(larger_leaf_splits_->LeafIndex()),
&larger_split);
if (larger_split.gain > larger_best[tid].gain) {
larger_best[tid] = larger_split;
larger_best[tid].feature = train_data_->RealFeatureIndex(feature_index);
}
}
auto smaller_best_idx = ArrayArgs<SplitInfo>::ArgMax(smaller_best);
int leaf = smaller_leaf_splits_->LeafIndex();
best_split_per_leaf_[leaf] = smaller_best[smaller_best_idx];
if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->LeafIndex() < 0) { return; }
leaf = larger_leaf_splits_->LeafIndex();
auto larger_best_idx = ArrayArgs<SplitInfo>::ArgMax(larger_best);
best_split_per_leaf_[leaf] = larger_best[larger_best_idx];
}
void DataParallelTreeLearner::FindBestSplitsForLeaves() {
int smaller_best_feature = -1, larger_best_feature = -1;
SplitInfo smaller_best, larger_best;
std::vector<double> gains;
// find local best split for smaller leaf
for (size_t i = 0; i < smaller_leaf_splits_->BestSplitPerFeature().size(); ++i) {
gains.push_back(smaller_leaf_splits_->BestSplitPerFeature()[i].gain);
}
smaller_best_feature = static_cast<int>(ArrayArgs<double>::ArgMax(gains));
smaller_best = smaller_leaf_splits_->BestSplitPerFeature()[smaller_best_feature];
smaller_best = best_split_per_leaf_[smaller_leaf_splits_->LeafIndex()];
// find local best split for larger leaf
if (larger_leaf_splits_->LeafIndex() >= 0) {
gains.clear();
for (size_t i = 0; i < larger_leaf_splits_->BestSplitPerFeature().size(); ++i) {
gains.push_back(larger_leaf_splits_->BestSplitPerFeature()[i].gain);
}
larger_best_feature = static_cast<int>(ArrayArgs<double>::ArgMax(gains));
larger_best = larger_leaf_splits_->BestSplitPerFeature()[larger_best_feature];
larger_best = best_split_per_leaf_[larger_leaf_splits_->LeafIndex()];
}
// sync global best info
......@@ -214,7 +229,7 @@ void DataParallelTreeLearner::FindBestSplitsForLeaves() {
std::memcpy(input_buffer_.data() + sizeof(SplitInfo), &larger_best, sizeof(SplitInfo));
Network::Allreduce(input_buffer_.data(), sizeof(SplitInfo) * 2, sizeof(SplitInfo),
output_buffer_.data(), &SplitInfo::MaxReducer);
output_buffer_.data(), &SplitInfo::MaxReducer);
std::memcpy(&smaller_best, output_buffer_.data(), sizeof(SplitInfo));
std::memcpy(&larger_best, output_buffer_.data() + sizeof(SplitInfo), sizeof(SplitInfo));
......
......@@ -2,9 +2,9 @@
#define LIGHTGBM_TREELEARNER_DATA_PARTITION_HPP_
#include <LightGBM/meta.h>
#include <LightGBM/feature.h>
#include <LightGBM/dataset.h>
#include <omp.h>
#include <LightGBM/utils/openmp_wrapper.h>
#include <cstring>
......@@ -41,7 +41,12 @@ public:
leaf_begin_.resize(num_leaves_);
leaf_count_.resize(num_leaves_);
}
void ResetNumData(int num_data) {
num_data_ = num_data;
indices_.resize(num_data_);
temp_left_indices_.resize(num_data_);
temp_right_indices_.resize(num_data_);
}
~DataPartition() {
}
......@@ -88,7 +93,7 @@ public:
* \param threshold threshold that want to split
* \param right_leaf index of right leaf
*/
void Split(int leaf, const Bin* feature_bins, unsigned int threshold, int right_leaf) {
void Split(int leaf, const Dataset* dataset, int feature, uint32_t threshold, int right_leaf) {
const data_size_t min_inner_size = 1000;
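// the leaf's index range is processed in chunks of at least min_inner_size rows;
// each thread partitions its chunk into the temp left/right buffers, and the
// recorded per-chunk offsets and counts let the pieces be stitched back in order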
// get leaf boundary
const data_size_t begin = leaf_begin_[leaf];
......@@ -106,7 +111,7 @@ public:
data_size_t cur_cnt = inner_size;
if (cur_start + cur_cnt > cnt) { cur_cnt = cnt - cur_start; }
// split the data chunk by chunk to reduce the number of function calls
data_size_t cur_left_count = feature_bins->Split(threshold, indices_.data() + begin + cur_start, cur_cnt,
data_size_t cur_left_count = dataset->Split(feature, threshold, indices_.data() + begin + cur_start, cur_cnt,
temp_left_indices_.data() + cur_start, temp_right_indices_.data() + cur_start);
offsets_buf_[i] = cur_start;
left_cnts_buf_[i] = cur_left_count;
......
......@@ -2,19 +2,31 @@
#define LIGHTGBM_TREELEARNER_FEATURE_HISTOGRAM_HPP_
#include "split_info.hpp"
#include <LightGBM/feature.h>
#include <LightGBM/utils/array_args.h>
#include <LightGBM/dataset.h>
#include <cstring>
namespace LightGBM {
class FeatureMetainfo {
public:
int num_bin;
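// bias is 1 when the feature's default bin is 0: that bin is not stored, so
// histogram indices shift by one and only (num_bin - bias) entries are kept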
int bias = 0;
/*! \brief pointer of tree config */
const TreeConfig* tree_config;
};
/*!
* \brief FeatureHistogram is used to construct and store a histogram for a feature.
*/
class FeatureHistogram {
public:
FeatureHistogram() {
data_ = nullptr;
}
~FeatureHistogram() {
}
......@@ -28,125 +40,76 @@ public:
* \param data external histogram storage for this feature
* \param meta shared per-feature meta information
* \param bin_type numerical or categorical bin type
*/
void Init(const Feature* feature, int feature_idx, const TreeConfig* tree_config) {
feature_idx_ = feature_idx;
tree_config_ = tree_config;
bin_data_ = feature->bin_data();
num_bins_ = feature->num_bin();
data_.resize(num_bins_);
if (feature->bin_type() == BinType::NumericalBin) {
find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdForNumerical, this, std::placeholders::_1);
void Init(HistogramBinEntry* data, const FeatureMetainfo* meta, BinType bin_type) {
meta_ = meta;
data_ = data;
if (bin_type == BinType::NumericalBin) {
find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdNumerical, this, std::placeholders::_1
, std::placeholders::_2, std::placeholders::_3, std::placeholders::_4);
} else {
find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdForCategorical, this, std::placeholders::_1);
find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdCategorical, this, std::placeholders::_1
, std::placeholders::_2, std::placeholders::_3, std::placeholders::_4);
}
}
/*!
* \brief Construct a histogram
* \param num_data number of data in current leaf
* \param sum_gradients sum of gradients of current leaf
* \param sum_hessians sum of hessians of current leaf
* \param ordered_gradients Ordered gradients
* \param ordered_hessians Ordered hessians
* \param data_indices data indices of current leaf
*/
void Construct(const data_size_t* data_indices, data_size_t num_data, double sum_gradients,
double sum_hessians, const score_t* ordered_gradients, const score_t* ordered_hessians) {
std::memset(data_.data(), 0, sizeof(HistogramBinEntry)* num_bins_);
num_data_ = num_data;
sum_gradients_ = sum_gradients;
sum_hessians_ = sum_hessians + 2 * kEpsilon;
bin_data_->ConstructHistogram(data_indices, num_data, ordered_gradients, ordered_hessians, data_.data());
}
/*!
* \brief Construct a histogram by ordered bin
* \param leaf current leaf
* \param num_data number of data in current leaf
* \param sum_gradients sum of gradients of current leaf
* \param sum_hessians sum of hessians of current leaf
* \param gradients
* \param hessians
*/
void Construct(const OrderedBin* ordered_bin, int leaf, data_size_t num_data, double sum_gradients,
double sum_hessians, const score_t* gradients, const score_t* hessians) {
std::memset(data_.data(), 0, sizeof(HistogramBinEntry)* num_bins_);
num_data_ = num_data;
sum_gradients_ = sum_gradients;
sum_hessians_ = sum_hessians + 2 * kEpsilon;
ordered_bin->ConstructHistogram(leaf, gradients, hessians, data_.data());
HistogramBinEntry* RawData() {
return data_;
}
/*!
* \brief Set sumup information for current histogram
* \param num_data number of data in current leaf
* \param sum_gradients sum of gradients of current leaf
* \param sum_hessians sum of hessians of current leaf
*/
void SetSumup(data_size_t num_data, double sum_gradients, double sum_hessians) {
num_data_ = num_data;
sum_gradients_ = sum_gradients;
sum_hessians_ = sum_hessians + 2 * kEpsilon;
}
/*!
* \brief Subtract another histogram from this one
* \param other The histogram to subtract
*/
void Subtract(const FeatureHistogram& other) {
num_data_ -= other.num_data_;
sum_gradients_ -= other.sum_gradients_;
sum_hessians_ -= other.sum_hessians_;
for (unsigned int i = 0; i < num_bins_; ++i) {
for (int i = 0; i < meta_->num_bin - meta_->bias; ++i) {
data_[i].cnt -= other.data_[i].cnt;
data_[i].sum_gradients -= other.data_[i].sum_gradients;
data_[i].sum_hessians -= other.data_[i].sum_hessians;
}
}
/*!
* \brief Find best threshold for this histogram
* \param sum_gradient sum of gradients in current leaf
* \param sum_hessian sum of hessians in current leaf
* \param num_data number of data in current leaf
* \param output The best split result
*/
void FindBestThreshold(SplitInfo* output) {
find_best_threshold_fun_(output);
void FindBestThreshold(double sum_gradient, double sum_hessian, data_size_t num_data,
SplitInfo* output) {
find_best_threshold_fun_(sum_gradient, sum_hessian + 2 * kEpsilon, num_data, output);
}
void FindBestThresholdForNumerical(SplitInfo* output) {
void FindBestThresholdNumerical(double sum_gradient, double sum_hessian, data_size_t num_data,
SplitInfo* output) {
double best_sum_left_gradient = NAN;
double best_sum_left_hessian = NAN;
double best_gain = kMinScore;
data_size_t best_left_count = 0;
unsigned int best_threshold = static_cast<unsigned int>(num_bins_);
uint32_t best_threshold = static_cast<uint32_t>(meta_->num_bin);
double sum_right_gradient = 0.0f;
double sum_right_hessian = kEpsilon;
data_size_t right_count = 0;
double gain_shift = GetLeafSplitGain(sum_gradients_, sum_hessians_);
double min_gain_shift = gain_shift + tree_config_->min_gain_to_split;
double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian);
double min_gain_shift = gain_shift + meta_->tree_config->min_gain_to_split;
is_splittable_ = false;
const int bias = meta_->bias;
int t = meta_->num_bin - 1 - bias;
const int t_end = 1 - bias;
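// accumulate the right-side sums while scanning thresholds from the largest bin
// down; the left side follows by subtracting from the leaf totals, so one pass
// evaluates every candidate split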
// from right to left, and we don't need data in bin0
for (unsigned int t = num_bins_ - 1; t > 0; --t) {
for (; t >= t_end; --t) {
sum_right_gradient += data_[t].sum_gradients;
sum_right_hessian += data_[t].sum_hessians;
right_count += data_[t].cnt;
// if data not enough, or sum hessian too small
if (right_count < tree_config_->min_data_in_leaf
|| sum_right_hessian < tree_config_->min_sum_hessian_in_leaf) continue;
data_size_t left_count = num_data_ - right_count;
if (right_count < meta_->tree_config->min_data_in_leaf
|| sum_right_hessian < meta_->tree_config->min_sum_hessian_in_leaf) continue;
data_size_t left_count = num_data - right_count;
// if data not enough
if (left_count < tree_config_->min_data_in_leaf) break;
if (left_count < meta_->tree_config->min_data_in_leaf) break;
double sum_left_hessian = sum_hessians_ - sum_right_hessian;
double sum_left_hessian = sum_hessian - sum_right_hessian;
// if sum hessian too small
if (sum_left_hessian < tree_config_->min_sum_hessian_in_leaf) break;
if (sum_left_hessian < meta_->tree_config->min_sum_hessian_in_leaf) break;
double sum_left_gradient = sum_gradients_ - sum_right_gradient;
double sum_left_gradient = sum_gradient - sum_right_gradient;
// current split gain
double current_gain = GetLeafSplitGain(sum_left_gradient, sum_left_hessian)
+ GetLeafSplitGain(sum_right_gradient, sum_right_hessian);
// gain with split is worse than without split
if (current_gain < min_gain_shift) continue;
if (current_gain <= min_gain_shift) continue;
// mark to is splittable
is_splittable_ = true;
......@@ -156,91 +119,119 @@ public:
best_sum_left_gradient = sum_left_gradient;
best_sum_left_hessian = sum_left_hessian;
// left is <= threshold, right is > threshold. so this is t-1
best_threshold = t - 1;
best_threshold = static_cast<uint32_t>(t - 1 + bias);
best_gain = current_gain;
}
}
if (is_splittable_) {
// update split information
output->feature = feature_idx_;
output->threshold = best_threshold;
output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian);
output->left_count = best_left_count;
output->left_sum_gradient = best_sum_left_gradient;
output->left_sum_hessian = best_sum_left_hessian;
output->right_output = CalculateSplittedLeafOutput(sum_gradients_ - best_sum_left_gradient,
sum_hessians_ - best_sum_left_hessian);
output->right_count = num_data_ - best_left_count;
output->right_sum_gradient = sum_gradients_ - best_sum_left_gradient;
output->right_sum_hessian = sum_hessians_ - best_sum_left_hessian;
output->left_sum_hessian = best_sum_left_hessian - kEpsilon;
output->right_output = CalculateSplittedLeafOutput(sum_gradient - best_sum_left_gradient,
sum_hessian - best_sum_left_hessian);
output->right_count = num_data - best_left_count;
output->right_sum_gradient = sum_gradient - best_sum_left_gradient;
output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon;
output->gain = best_gain - gain_shift;
} else {
output->feature = feature_idx_;
output->gain = kMinScore;
}
}
/*!
* \brief Find best threshold for this histogram (categorical feature)
* \param sum_gradient sum of gradients in current leaf
* \param sum_hessian sum of hessians in current leaf
* \param num_data number of data in current leaf
* \param output The best split result
*/
void FindBestThresholdForCategorical(SplitInfo* output) {
void FindBestThresholdCategorical(double sum_gradient, double sum_hessian, data_size_t num_data,
SplitInfo* output) {
double best_gain = kMinScore;
unsigned int best_threshold = static_cast<unsigned int>(num_bins_);
double gain_shift = GetLeafSplitGain(sum_gradients_, sum_hessians_);
double min_gain_shift = gain_shift + tree_config_->min_gain_to_split;
uint32_t best_threshold = static_cast<uint32_t>(meta_->num_bin);
data_size_t best_left_count = 0;
double best_sum_left_gradient = 0.0f;
double best_sum_left_hessian = 0.0f;
double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian);
double min_gain_shift = gain_shift + meta_->tree_config->min_gain_to_split;
is_splittable_ = false;
for (int t = num_bins_ - 1; t >= 0; --t) {
double sum_current_gradient = data_[t].sum_gradients;
double sum_current_hessian = data_[t].sum_hessians;
data_size_t current_count = data_[t].cnt;
const int bias = meta_->bias;
int t = meta_->num_bin - 1 - bias;
const int t_end = 0;
// from right to left, and we don't need data in bin0
for (; t >= t_end; --t) {
// if data not enough, or sum hessian too small
if (current_count < tree_config_->min_data_in_leaf
|| sum_current_hessian < tree_config_->min_sum_hessian_in_leaf) continue;
data_size_t other_count = num_data_ - current_count;
if (data_[t].cnt < meta_->tree_config->min_data_in_leaf
|| data_[t].sum_hessians < meta_->tree_config->min_sum_hessian_in_leaf) continue;
data_size_t other_count = num_data - data_[t].cnt;
// if data not enough
if (other_count < tree_config_->min_data_in_leaf) continue;
if (other_count < meta_->tree_config->min_data_in_leaf) continue;
double sum_other_hessian = sum_hessians_ - sum_current_hessian;
double sum_other_hessian = sum_hessian - data_[t].sum_hessians - kEpsilon;
// if sum hessian too small
if (sum_other_hessian < tree_config_->min_sum_hessian_in_leaf) continue;
if (sum_other_hessian < meta_->tree_config->min_sum_hessian_in_leaf) continue;
double sum_other_gradient = sum_gradients_ - sum_current_gradient;
double sum_other_gradient = sum_gradient - data_[t].sum_gradients;
// current split gain
double current_gain = GetLeafSplitGain(sum_other_gradient, sum_other_hessian)
+ GetLeafSplitGain(sum_current_gradient, sum_current_hessian);
double current_gain = GetLeafSplitGain(sum_other_gradient, sum_other_hessian)
+ GetLeafSplitGain(data_[t].sum_gradients, data_[t].sum_hessians + kEpsilon);
// gain with split is worse than without split
if (current_gain < min_gain_shift) continue;
if (current_gain <= min_gain_shift) continue;
// mark to is splittable
is_splittable_ = true;
// better split point
if (current_gain > best_gain) {
best_threshold = static_cast<unsigned int>(t);
best_threshold = static_cast<uint32_t>(t + bias);
best_sum_left_gradient = data_[t].sum_gradients;
best_sum_left_hessian = data_[t].sum_hessians + kEpsilon;
best_left_count = data_[t].cnt;
best_gain = current_gain;
}
}
// update split information
// need to restore the zero bin
if (bias == 1) {
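// the default bin 0 is not stored, so its statistics are recovered by subtracting
// every stored bin from the leaf totals, then evaluated as its own one-vs-rest split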
t = meta_->num_bin - 1 - bias;
double sum_bin0_gradient = sum_gradient;
double sum_bin0_hessian = sum_hessian - 2 * kEpsilon;
data_size_t cnt_bin0 = num_data;
for (; t >= 0; --t) {
sum_bin0_gradient -= data_[t].sum_gradients;
sum_bin0_hessian -= data_[t].sum_hessians;
cnt_bin0 -= data_[t].cnt;
}
data_size_t other_count = num_data - cnt_bin0;
double sum_other_hessian = sum_hessian - sum_bin0_hessian - kEpsilon;
if (cnt_bin0 >= meta_->tree_config->min_data_in_leaf
&& sum_bin0_hessian >= meta_->tree_config->min_sum_hessian_in_leaf
&& other_count >= meta_->tree_config->min_data_in_leaf
&& sum_other_hessian >= meta_->tree_config->min_sum_hessian_in_leaf) {
double sum_other_gradient = sum_gradient - sum_bin0_gradient;
double current_gain = GetLeafSplitGain(sum_other_gradient, sum_other_hessian)
+ GetLeafSplitGain(sum_bin0_gradient, sum_bin0_hessian + kEpsilon);
if (current_gain > min_gain_shift) {
is_splittable_ = true;
// better split point
if (current_gain > best_gain) {
best_threshold = static_cast<uint32_t>(0);
best_sum_left_gradient = sum_bin0_gradient;
best_sum_left_hessian = sum_bin0_hessian + kEpsilon;
best_left_count = cnt_bin0;
best_gain = current_gain;
}
}
}
}
if (is_splittable_) {
output->feature = feature_idx_;
// update split information
output->threshold = best_threshold;
output->left_output = CalculateSplittedLeafOutput(data_[best_threshold].sum_gradients,
data_[best_threshold].sum_hessians);
output->left_count = data_[best_threshold].cnt;
output->left_sum_gradient = data_[best_threshold].sum_gradients;
output->left_sum_hessian = data_[best_threshold].sum_hessians;
output->right_output = CalculateSplittedLeafOutput(sum_gradients_ - data_[best_threshold].sum_gradients,
sum_hessians_ - data_[best_threshold].sum_hessians);
output->right_count = num_data_ - data_[best_threshold].cnt;
output->right_sum_gradient = sum_gradients_ - data_[best_threshold].sum_gradients;
output->right_sum_hessian = sum_hessians_ - data_[best_threshold].sum_hessians;
output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian);
output->left_count = best_left_count;
output->left_sum_gradient = best_sum_left_gradient;
output->left_sum_hessian = best_sum_left_hessian - kEpsilon;
output->right_output = CalculateSplittedLeafOutput(sum_gradient - best_sum_left_gradient,
sum_hessian - best_sum_left_hessian);
output->right_count = num_data - best_left_count;
output->right_sum_gradient = sum_gradient - best_sum_left_gradient;
output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon;
output->gain = best_gain - gain_shift;
} else {
output->feature = feature_idx_;
output->gain = kMinScore;
}
}
......@@ -249,21 +240,14 @@ public:
* \brief Binary size of this histogram
*/
int SizeOfHistgram() const {
return num_bins_ * sizeof(HistogramBinEntry);
}
/*!
* \brief Memory pointer to histogram data
*/
const HistogramBinEntry* HistogramData() const {
return data_.data();
return (meta_->num_bin - meta_->bias) * sizeof(HistogramBinEntry);
}
/*!
* \brief Restore histogram from memory
*/
void FromMemory(char* memory_data) {
std::memcpy(data_, memory_data, (meta_->num_bin - meta_->bias) * sizeof(HistogramBinEntry));
}
/*!
......@@ -276,10 +260,6 @@ public:
*/
void set_is_splittable(bool val) { is_splittable_ = val; }
private:
/*!
* \brief Calculate the split gain based on regularized sum_gradients and sum_hessians
......@@ -289,12 +269,10 @@ private:
*/
double GetLeafSplitGain(double sum_gradients, double sum_hessians) const {
double abs_sum_gradients = std::fabs(sum_gradients);
double reg_abs_sum_gradients = std::max(0.0, abs_sum_gradients - meta_->tree_config->lambda_l1);
return (reg_abs_sum_gradients * reg_abs_sum_gradients)
/ (sum_hessians + meta_->tree_config->lambda_l2);
}
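// The gain above is the L1/L2-regularized second-order score:
//   gain(G, H) = max(0, |G| - lambda_l1)^2 / (H + lambda_l2).
// Worked example (hypothetical numbers): G = -4.0, H = 3.0, lambda_l1 = 1.0,
// lambda_l2 = 2.0 gives max(0, 4.0 - 1.0)^2 / (3.0 + 2.0) = 9.0 / 5.0 = 1.8.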
/*!
......@@ -305,35 +283,19 @@ private:
*/
double CalculateSplittedLeafOutput(double sum_gradients, double sum_hessians) const {
double abs_sum_gradients = std::fabs(sum_gradients);
double reg_abs_sum_gradients = std::max(0.0, abs_sum_gradients - meta_->tree_config->lambda_l1);
return -std::copysign(reg_abs_sum_gradients, sum_gradients)
/ (sum_hessians + meta_->tree_config->lambda_l2);
}
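// The leaf output is the matching regularized Newton step:
//   output(G, H) = -sign(G) * max(0, |G| - lambda_l1) / (H + lambda_l2).
// Continuing the example above, G = -4.0 and H = 3.0 give
// output = -(-3.0) / 5.0 = 0.6, i.e. the leaf moves against the gradient.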
int feature_idx_;
const FeatureMetainfo* meta_;
/*! \brief sum of gradient of each bin */
HistogramBinEntry* data_;
//std::vector<HistogramBinEntry> data_;
/*! \brief False if this histogram cannot split */
bool is_splittable_ = true;
/*! \brief function used to find the best threshold */
std::function<void(double, double, data_size_t, SplitInfo*)> find_best_threshold_fun_;
};
class HistogramPool {
public:
/*!
......@@ -343,7 +305,6 @@ public:
cache_size_ = 0;
total_size_ = 0;
}
/*!
* \brief Destructor
*/
......@@ -370,7 +331,6 @@ public:
ResetMap();
}
}
/*!
* \brief Reset mapper
*/
......@@ -383,34 +343,48 @@ public:
}
}
void DynamicChangeSize(const Dataset* train_data, const TreeConfig* tree_config, int cache_size, int total_size) {
if (feature_metas_.empty()) {
feature_metas_.resize(train_data->num_features());
#pragma omp parallel for schedule(static)
for (int i = 0; i < train_data->num_features(); ++i) {
feature_metas_[i].num_bin = train_data->FeatureNumBin(i);
if (train_data->FeatureBinMapper(i)->GetDefaultBin() == 0) {
feature_metas_[i].bias = 1;
} else {
feature_metas_[i].bias = 0;
}
feature_metas_[i].tree_config = tree_config;
}
}
uint64_t num_total_bin = train_data->NumTotalBin();
Log::Info("Total Bins %d", static_cast<int>(num_total_bin));
int old_cache_size = cache_size_;
Reset(cache_size, total_size);
pool_.resize(cache_size);
data_.resize(cache_size);
#pragma omp parallel for schedule(static)
for (int i = old_cache_size; i < cache_size_; ++i) {
pool_[i].reset(new FeatureHistogram[train_data->num_features()]);
data_[i].resize(num_total_bin);
uint64_t offset = 0;
for (int j = 0; j < train_data->num_features(); ++j) {
offset += static_cast<uint64_t>(train_data->SubFeatureBinOffset(j));
pool_[i][j].Init(data_[i].data() + offset, &feature_metas_[j], train_data->FeatureBinMapper(j)->bin_type());
auto num_bin = train_data->FeatureNumBin(j);
if (train_data->FeatureBinMapper(j)->GetDefaultBin() == 0) {
num_bin -= 1;
}
offset += static_cast<uint64_t>(num_bin);
}
CHECK(offset == num_total_bin);
}
}
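// Layout sketch for the loop above (assuming SubFeatureBinOffset(j) returns
// the per-feature bias): each cache slot owns one contiguous buffer of
// num_total_bin entries, and histogram j views a slice of it. With three
// hypothetical features of 10, 256 and 32 bins, all with default bin 0, the
// slices hold 9, 255 and 31 entries at offsets 1, 11 and 267, and the final
// offset is 10 + 256 + 32 = 298 == NumTotalBin(), as the CHECK asserts.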
void ResetConfig(const TreeConfig* tree_config) {
#pragma omp parallel for schedule(static)
for (int i = 0; i < static_cast<int>(feature_metas_.size()); ++i) {
feature_metas_[i].tree_config = tree_config;
}
}
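// Because every pooled FeatureHistogram keeps a pointer into feature_metas_,
// swapping the tree_config pointer here reconfigures all cached histograms
// at once; no per-slot traversal is needed.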
/*!
......@@ -468,9 +442,9 @@ public:
inverse_mapper_[slot] = dst_idx;
}
private:
std::vector<std::unique_ptr<FeatureHistogram[]>> pool_;
std::vector<std::vector<HistogramBinEntry>> data_;
std::vector<FeatureMetainfo> feature_metas_;
int cache_size_;
int total_size_;
bool is_enough_ = false;
......@@ -480,7 +454,5 @@ private:
int cur_time_ = 0;
};
} // namespace LightGBM
#endif // LightGBM_TREELEARNER_FEATURE_HISTOGRAM_HPP_
......@@ -28,12 +28,14 @@ void FeatureParallelTreeLearner::BeforeTrain() {
// get feature partition
std::vector<std::vector<int>> feature_distribution(num_machines_, std::vector<int>());
std::vector<int> num_bins_distributed(num_machines_, 0);
for (int i = 0; i < train_data_->num_total_features(); ++i) {
int inner_feature_index = train_data_->InnerFeatureIndex(i);
if (inner_feature_index == -1) { continue; }
if (is_feature_used_[inner_feature_index]) {
int cur_min_machine = static_cast<int>(ArrayArgs<int>::ArgMin(num_bins_distributed));
feature_distribution[cur_min_machine].push_back(inner_feature_index);
num_bins_distributed[cur_min_machine] += train_data_->FeatureNumBin(inner_feature_index);
is_feature_used_[inner_feature_index] = false;
}
}
// get local used features
......@@ -43,23 +45,12 @@ void FeatureParallelTreeLearner::BeforeTrain() {
}
void FeatureParallelTreeLearner::FindBestSplitsForLeaves() {
SplitInfo smaller_best, larger_best;
smaller_best = best_split_per_leaf_[smaller_leaf_splits_->LeafIndex()];
// find local best split for larger leaf
if (larger_leaf_splits_->LeafIndex() >= 0) {
larger_best = best_split_per_leaf_[larger_leaf_splits_->LeafIndex()];
}
// sync global best info
std::memcpy(input_buffer_.data(), &smaller_best, sizeof(SplitInfo));
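// SplitInfo is copied as a plain byte blob into input_buffer_ so the local
// best split can be synchronized with the other machines in one fixed-size
// network exchange.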
......
......@@ -3,7 +3,6 @@
#include <LightGBM/meta.h>
#include "data_partition.hpp"
#include "split_info.hpp"
#include <vector>
......@@ -14,13 +13,13 @@ namespace LightGBM {
*/
class LeafSplits {
public:
LeafSplits(data_size_t num_data)
:num_data_in_leaf_(num_data), num_data_(num_data),
data_indices_(nullptr) {
}
void ResetNumData(data_size_t num_data) {
num_data_ = num_data;
num_data_in_leaf_ = num_data;
}
~LeafSplits() {
}
......@@ -38,9 +37,6 @@ public:
data_indices_ = data_partition->GetIndexOnLeaf(leaf, &num_data_in_leaf_);
sum_gradients_ = sum_gradients;
sum_hessians_ = sum_hessians;
}
/*!
......@@ -61,9 +57,6 @@ public:
}
sum_gradients_ = tmp_sum_gradients;
sum_hessians_ = tmp_sum_hessians;
}
/*!
......@@ -86,9 +79,6 @@ public:
}
sum_gradients_ = tmp_sum_gradients;
sum_hessians_ = tmp_sum_hessians;
}
......@@ -101,9 +91,6 @@ public:
leaf_index_ = 0;
sum_gradients_ = sum_gradients;
sum_hessians_ = sum_hessians;
}
/*!
......@@ -111,13 +98,10 @@ public:
*/
void Init() {
leaf_index_ = -1;
data_indices_ = nullptr;
num_data_in_leaf_ = 0;
}
/*! \brief Get current leaf index */
int LeafIndex() const { return leaf_index_; }
......@@ -136,16 +120,12 @@ public:
private:
/*! \brief current leaf index */
int leaf_index_;
/*! \brief number of data on current leaf */
data_size_t num_data_in_leaf_;
/*! \brief number of all training data */
data_size_t num_data_;
/*! \brief sum of gradients of current leaf */
double sum_gradients_;
/*! \brief sum of hessians of current leaf */
......
......@@ -22,7 +22,7 @@ class FeatureParallelTreeLearner: public SerialTreeLearner {
public:
explicit FeatureParallelTreeLearner(const TreeConfig* tree_config);
~FeatureParallelTreeLearner();
void Init(const Dataset* train_data) override;
protected:
void BeforeTrain() override;
......@@ -170,6 +170,10 @@ private:
std::unique_ptr<FeatureHistogram[]> smaller_leaf_histogram_array_global_;
/*! \brief Store global histogram for larger leaf */
std::unique_ptr<FeatureHistogram[]> larger_leaf_histogram_array_global_;
std::vector<HistogramBinEntry> smaller_leaf_histogram_data_;
std::vector<HistogramBinEntry> larger_leaf_histogram_data_;
std::vector<FeatureMetainfo> feature_metas_;
};
} // namespace LightGBM
......
......@@ -7,13 +7,34 @@
namespace LightGBM {
#ifdef TIMETAG
std::chrono::duration<double, std::milli> init_train_time;
std::chrono::duration<double, std::milli> init_split_time;
std::chrono::duration<double, std::milli> hist_time;
std::chrono::duration<double, std::milli> find_split_time;
std::chrono::duration<double, std::milli> split_time;
std::chrono::duration<double, std::milli> ordered_bin_time;
#endif // TIMETAG
SerialTreeLearner::SerialTreeLearner(const TreeConfig* tree_config)
:tree_config_(tree_config) {
random_ = Random(tree_config_->feature_fraction_seed);
#pragma omp parallel
#pragma omp master
{
num_threads_ = omp_get_num_threads();
}
}
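// The parallel/master pair above is the standard OpenMP idiom for caching the
// team size once: omp_get_num_threads() returns 1 outside a parallel region,
// so it has to be queried from inside one. Minimal equivalent sketch:
//   int n = 1;
//   #pragma omp parallel
//   #pragma omp master
//   { n = omp_get_num_threads(); }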
SerialTreeLearner::~SerialTreeLearner() {
#ifdef TIMETAG
Log::Info("SerialTreeLearner::init_train costs %f", init_train_time * 1e-3);
Log::Info("SerialTreeLearner::init_split costs %f", init_split_time * 1e-3);
Log::Info("SerialTreeLearner::hist_build costs %f", hist_time * 1e-3);
Log::Info("SerialTreeLearner::find_split costs %f", find_split_time * 1e-3);
Log::Info("SerialTreeLearner::split costs %f", split_time * 1e-3);
Log::Info("SerialTreeLearner::ordered_bin costs %f", ordered_bin_time * 1e-3);
#endif
}
void SerialTreeLearner::Init(const Dataset* train_data) {
......@@ -27,49 +48,74 @@ void SerialTreeLearner::Init(const Dataset* train_data) {
} else {
size_t total_histogram_size = 0;
for (int i = 0; i < train_data_->num_features(); ++i) {
total_histogram_size += sizeof(HistogramBinEntry) * train_data_->FeatureNumBin(i);
}
max_cache_size = static_cast<int>(tree_config_->histogram_pool_size * 1024 * 1024 / total_histogram_size);
}
// at least need 2 leaves
max_cache_size = std::max(2, max_cache_size);
max_cache_size = std::min(max_cache_size, tree_config_->num_leaves);
histogram_pool_.DynamicChangeSize(train_data_, tree_config_, max_cache_size, tree_config_->num_leaves);
// push split information for all leaves
best_split_per_leaf_.resize(tree_config_->num_leaves);
// get ordered bin
train_data_->CreateOrderedBins(&ordered_bins_);
// check whether any ordered bin exists
for (int i = 0; i < static_cast<int>(ordered_bins_.size()); ++i) {
if (ordered_bins_[i] != nullptr) {
has_ordered_bin_ = true;
break;
}
}
// initialize splits for leaf
smaller_leaf_splits_.reset(new LeafSplits(train_data_->num_data()));
larger_leaf_splits_.reset(new LeafSplits(train_data_->num_data()));
// initialize data partition
data_partition_.reset(new DataPartition(num_data_, tree_config_->num_leaves));
is_feature_used_.resize(num_features_);
// initialize ordered gradients and hessians
ordered_gradients_.resize(num_data_);
ordered_hessians_.resize(num_data_);
// if there are ordered bins, allocate a buffer for fast splitting
if (has_ordered_bin_) {
is_data_in_leaf_.resize(num_data_);
std::fill(is_data_in_leaf_.begin(), is_data_in_leaf_.end(), 0);
ordered_bin_indices_.clear();
for (int i = 0; i < static_cast<int>(ordered_bins_.size()); i++) {
if (ordered_bins_[i] != nullptr) {
ordered_bin_indices_.push_back(i);
}
}
}
Log::Info("Number of data: %d, number of used features: %d", num_data_, num_features_);
}
void SerialTreeLearner::ResetTrainingData(const Dataset* train_data) {
train_data_ = train_data;
num_data_ = train_data_->num_data();
num_features_ = train_data_->num_features();
// get ordered bin
train_data_->CreateOrderedBins(&ordered_bins_);
has_ordered_bin_ = false;
// check whether any ordered bin exists
for (int i = 0; i < static_cast<int>(ordered_bins_.size()); ++i) {
if (ordered_bins_[i] != nullptr) {
has_ordered_bin_ = true;
break;
}
}
// initialize splits for leaf
smaller_leaf_splits_->ResetNumData(num_data_);
larger_leaf_splits_->ResetNumData(num_data_);
// initialize data partition
data_partition_->ResetNumData(num_data_);
is_feature_used_.resize(num_features_);
......@@ -79,11 +125,16 @@ void SerialTreeLearner::Init(const Dataset* train_data) {
// if there are ordered bins, allocate a buffer for fast splitting
if (has_ordered_bin_) {
is_data_in_leaf_.resize(num_data_);
std::fill(is_data_in_leaf_.begin(), is_data_in_leaf_.end(), 0);
ordered_bin_indices_.clear();
for (int i = 0; i < static_cast<int>(ordered_bins_.size()); i++) {
if (ordered_bins_[i] != nullptr) {
ordered_bin_indices_.push_back(i);
}
}
}
Log::Info("Number of data: %d, number of features: %d", num_data_, num_features_);
}
void SerialTreeLearner::ResetConfig(const TreeConfig* tree_config) {
if (tree_config_->num_leaves != tree_config->num_leaves) {
tree_config_ = tree_config;
......@@ -94,14 +145,14 @@ void SerialTreeLearner::ResetConfig(const TreeConfig* tree_config) {
} else {
size_t total_histogram_size = 0;
for (int i = 0; i < train_data_->num_features(); ++i) {
total_histogram_size += sizeof(HistogramBinEntry) * train_data_->FeatureNumBin(i);
}
max_cache_size = static_cast<int>(tree_config_->histogram_pool_size * 1024 * 1024 / total_histogram_size);
}
// at least need 2 leaves
max_cache_size = std::max(2, max_cache_size);
max_cache_size = std::min(max_cache_size, tree_config_->num_leaves);
histogram_pool_.DynamicChangeSize(train_data_, tree_config_, max_cache_size, tree_config_->num_leaves);
// push split information for all leaves
best_split_per_leaf_.resize(tree_config_->num_leaves);
......@@ -110,24 +161,40 @@ void SerialTreeLearner::ResetConfig(const TreeConfig* tree_config) {
tree_config_ = tree_config;
}
histogram_pool_.ResetConfig(tree_config_);
}
Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians) {
gradients_ = gradients;
hessians_ = hessians;
#ifdef TIMETAG
auto start_time = std::chrono::steady_clock::now();
#endif
// some initial works before training
BeforeTrain();
#ifdef TIMETAG
init_train_time += std::chrono::steady_clock::now() - start_time;
#endif
auto tree = std::unique_ptr<Tree>(new Tree(tree_config_->num_leaves));
// save pointer to last trained tree
last_trained_tree_ = tree.get();
// root leaf
int left_leaf = 0;
int cur_depth = 1;
// only the root leaf can be split the first time
int right_leaf = -1;
for (int split = 0; split < tree_config_->num_leaves - 1; ++split) {
#ifdef TIMETAG
start_time = std::chrono::steady_clock::now();
#endif
// some initial works before finding best split
if (BeforeFindBestSplit(left_leaf, right_leaf)) {
#ifdef TIMETAG
init_split_time += std::chrono::steady_clock::now() - start_time;
#endif
// find best threshold for every feature
FindBestThresholds();
// find best split from all features
......@@ -139,13 +206,20 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians
const SplitInfo& best_leaf_SplitInfo = best_split_per_leaf_[best_leaf];
// cannot split, quit
if (best_leaf_SplitInfo.gain <= 0.0) {
Log::Info("No further splits with positive gain, best gain: %f, leaves: %d",
best_leaf_SplitInfo.gain, split + 1);
Log::Info("No further splits with positive gain, best gain: %f", best_leaf_SplitInfo.gain);
break;
}
#ifdef TIMETAG
start_time = std::chrono::steady_clock::now();
#endif
// split tree with best leaf
Split(tree.get(), best_leaf, &left_leaf, &right_leaf);
#ifdef TIMETAG
split_time += std::chrono::steady_clock::now() - start_time;
#endif
cur_depth = std::max(cur_depth, tree->leaf_depth(left_leaf));
}
Log::Info("Trained a tree with leaves=%d and max_depth=%d", tree->num_leaves(), cur_depth);
return tree.release();
}
......@@ -153,15 +227,24 @@ void SerialTreeLearner::BeforeTrain() {
// reset histogram pool
histogram_pool_.ResetMap();
if (tree_config_->feature_fraction < 1) {
int used_feature_cnt = static_cast<int>(train_data_->num_total_features()*tree_config_->feature_fraction);
// initialize used features
std::memset(is_feature_used_.data(), 0, sizeof(int8_t) * num_features_);
// Get used feature at current tree
auto used_feature_indices = random_.Sample(train_data_->num_total_features(), used_feature_cnt);
#pragma omp parallel for schedule(static)
for (int i = 0; i < static_cast<int>(used_feature_indices.size()); ++i) {
int inner_feature_index = train_data_->InnerFeatureIndex(used_feature_indices[i]);
if (inner_feature_index < 0) { continue; }
is_feature_used_[inner_feature_index] = 1;
}
} else {
#pragma omp parallel for schedule(static)
for (int i = 0; i < num_features_; ++i) {
is_feature_used_[i] = 1;
}
}
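// Example of the sampling above (hypothetical numbers): with
// num_total_features() == 100 and feature_fraction == 0.8, Sample() draws 80
// distinct raw indices; raw features that were pruned from the Dataset map to
// InnerFeatureIndex() == -1 and are skipped, so slightly fewer than 80 inner
// features may end up marked as used.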
// initialize data partition
......@@ -176,60 +259,49 @@ void SerialTreeLearner::BeforeTrain() {
if (data_partition_->leaf_count(0) == num_data_) {
// use all data
smaller_leaf_splits_->Init(gradients_, hessians_);
} else {
// use bagging, only use part of data
smaller_leaf_splits_->Init(0, data_partition_.get(), gradients_, hessians_);
}
larger_leaf_splits_->Init();
// if there are ordered bins, initialize them
if (has_ordered_bin_) {
#ifdef TIMETAG
auto start_time = std::chrono::steady_clock::now();
#endif
if (data_partition_->leaf_count(0) == num_data_) {
// use all data, pass nullptr
#pragma omp parallel for schedule(static)
for (int i = 0; i < static_cast<int>(ordered_bin_indices_.size()); ++i) {
ordered_bins_[ordered_bin_indices_[i]]->Init(nullptr, tree_config_->num_leaves);
}
} else {
// bagging, only use part of data
// mark used data
const data_size_t* indices = data_partition_->indices();
data_size_t begin = data_partition_->leaf_begin(0);
data_size_t end = begin + data_partition_->leaf_count(0);
#pragma omp parallel for schedule(static)
for (data_size_t i = begin; i < end; ++i) {
is_data_in_leaf_[indices[i]] = 1;
}
// initialize ordered bin
#pragma omp parallel for schedule(static)
for (int i = 0; i < static_cast<int>(ordered_bin_indices_.size()); ++i) {
ordered_bins_[ordered_bin_indices_[i]]->Init(is_data_in_leaf_.data(), tree_config_->num_leaves);
}
#pragma omp parallel for schedule(static)
for (data_size_t i = begin; i < end; ++i) {
is_data_in_leaf_[indices[i]] = 0;
}
}
#ifdef TIMETAG
ordered_bin_time += std::chrono::steady_clock::now() - start_time;
#endif
}
}
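// Note the mark/unmark pattern above: is_data_in_leaf_ is a persistent
// num_data_-sized byte buffer that is zeroed once at (re)initialization; each
// use sets only the touched indices to 1 and clears exactly those indices
// afterwards, keeping the buffer all-zero without a full memset per call.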
......@@ -249,7 +321,7 @@ bool SerialTreeLearner::BeforeFindBestSplit(int left_leaf, int right_leaf) {
data_size_t num_data_in_right_child = GetGlobalDataCountInLeaf(right_leaf);
// not enough data to continue
if (num_data_in_right_child < static_cast<data_size_t>(tree_config_->min_data_in_leaf * 2)
&& num_data_in_left_child < static_cast<data_size_t>(tree_config_->min_data_in_leaf * 2)) {
best_split_per_leaf_[left_leaf].gain = kMinScore;
if (right_leaf >= 0) {
best_split_per_leaf_[right_leaf].gain = kMinScore;
......@@ -257,172 +329,184 @@ bool SerialTreeLearner::BeforeFindBestSplit(int left_leaf, int right_leaf) {
return false;
}
parent_leaf_histogram_array_ = nullptr;
// only have root
if (right_leaf < 0) {
histogram_pool_.Get(left_leaf, &smaller_leaf_histogram_array_);
larger_leaf_histogram_array_ = nullptr;
} else if (num_data_in_left_child < num_data_in_right_child) {
// put parent(left) leaf's histograms into larger leaf's histograms
if (histogram_pool_.Get(left_leaf, &larger_leaf_histogram_array_)) { parent_leaf_histogram_array_ = larger_leaf_histogram_array_; }
histogram_pool_.Move(left_leaf, right_leaf);
histogram_pool_.Get(left_leaf, &smaller_leaf_histogram_array_);
} else {
// put parent(left) leaf's histograms to larger leaf's histograms
if (histogram_pool_.Get(left_leaf, &larger_leaf_histogram_array_)) { parent_leaf_histogram_array_ = larger_leaf_histogram_array_; }
histogram_pool_.Get(right_leaf, &smaller_leaf_histogram_array_);
}
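// Histogram caching scheme above: if the parent's histograms are still in the
// pool they are re-used for the larger child (parent_leaf_histogram_array_),
// so only the smaller child's histograms must be built from data; the larger
// child's are then derived as parent minus smaller via Subtract(). Move()
// transfers the cached slot because the larger child keeps the parent's data.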
// split for the ordered bin
if (has_ordered_bin_ && right_leaf >= 0) {
#ifdef TIMETAG
auto start_time = std::chrono::steady_clock::now();
#endif
// mark the data that fall in the smaller child
const data_size_t* indices = data_partition_->indices();
const auto left_cnt = data_partition_->leaf_count(left_leaf);
const auto right_cnt = data_partition_->leaf_count(right_leaf);
char mark = 1;
data_size_t begin = data_partition_->leaf_begin(left_leaf);
data_size_t end = begin + left_cnt;
if (left_cnt > right_cnt) {
begin = data_partition_->leaf_begin(right_leaf);
end = begin + right_cnt;
mark = 0;
}
#pragma omp parallel for schedule(static)
for (data_size_t i = begin; i < end; ++i) {
is_data_in_leaf_[indices[i]] = 1;
}
// split the ordered bin
#pragma omp parallel for schedule(static)
for (int i = 0; i < static_cast<int>(ordered_bin_indices_.size()); ++i) {
ordered_bins_[ordered_bin_indices_[i]]->Split(left_leaf, right_leaf, is_data_in_leaf_.data(), mark);
}
#pragma omp parallel for schedule(static)
for (data_size_t i = begin; i < end; ++i) {
is_data_in_leaf_[indices[i]] = 0;
}
#ifdef TIMETAG
ordered_bin_time += std::chrono::steady_clock::now() - start_time;
#endif
}
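// The mark flag above is a small optimization: only the smaller of the two
// children is marked (mark == 1 when that child is the left one, 0 when it is
// the right one), so the marking pass always walks the cheaper side and
// OrderedBin::Split is told which polarity the marks carry.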
return true;
}
void SerialTreeLearner::FindBestThresholds() {
#ifdef TIMETAG
auto start_time = std::chrono::steady_clock::now();
#endif
std::vector<int8_t> is_feature_used(num_features_, 0);
#pragma omp parallel for schedule(static)
for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
if (!is_feature_used_[feature_index]) continue;
if (parent_leaf_histogram_array_ != nullptr
&& !parent_leaf_histogram_array_[feature_index].is_splittable()) {
smaller_leaf_histogram_array_[feature_index].set_is_splittable(false);
continue;
}
is_feature_used[feature_index] = 1;
}
bool use_subtract = true;
if (parent_leaf_histogram_array_ == nullptr) {
use_subtract = false;
}
// construct smaller leaf
HistogramBinEntry* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - 1;
train_data_->ConstructHistograms(is_feature_used,
smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(),
smaller_leaf_splits_->LeafIndex(),
ordered_bins_, gradients_, hessians_,
ordered_gradients_.data(), ordered_hessians_.data(),
ptr_smaller_leaf_hist_data);
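// The "- 1" on RawData() above appears to undo the one-entry bias offset
// applied when the histograms were laid out in DynamicChangeSize, recovering
// the base pointer that ConstructHistograms indexes with absolute bin ids;
// this reading is inferred from the layout code, not spelled out here.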
if (larger_leaf_histogram_array_ != nullptr && !use_subtract) {
// construct larger leaf
HistogramBinEntry* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - 1;
train_data_->ConstructHistograms(is_feature_used,
larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(),
larger_leaf_splits_->LeafIndex(),
ordered_bins_, gradients_, hessians_,
ordered_gradients_.data(), ordered_hessians_.data(),
ptr_larger_leaf_hist_data);
}
#ifdef TIMETAG
hist_time += std::chrono::steady_clock::now() - start_time;
#endif
#ifdef TIMETAG
start_time = std::chrono::steady_clock::now();
#endif
std::vector<SplitInfo> smaller_best(num_threads_);
std::vector<SplitInfo> larger_best(num_threads_);
// find splits
#pragma omp parallel for schedule(static)
for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
if (!is_feature_used[feature_index]) { continue; }
const int tid = omp_get_thread_num();
SplitInfo smaller_split;
train_data_->FixHistogram(feature_index,
smaller_leaf_splits_->sum_gradients(), smaller_leaf_splits_->sum_hessians(),
smaller_leaf_splits_->num_data_in_leaf(),
smaller_leaf_histogram_array_[feature_index].RawData());
smaller_leaf_histogram_array_[feature_index].FindBestThreshold(
smaller_leaf_splits_->sum_gradients(),
smaller_leaf_splits_->sum_hessians(),
smaller_leaf_splits_->num_data_in_leaf(),
&smaller_split);
if (smaller_split.gain > smaller_best[tid].gain) {
smaller_best[tid] = smaller_split;
smaller_best[tid].feature = train_data_->RealFeatureIndex(feature_index);
}
// only has root leaf
if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->LeafIndex() < 0) { continue; }
if (use_subtract) {
larger_leaf_histogram_array_[feature_index].Subtract(smaller_leaf_histogram_array_[feature_index]);
} else {
train_data_->FixHistogram(feature_index, larger_leaf_splits_->sum_gradients(), larger_leaf_splits_->sum_hessians(),
larger_leaf_splits_->num_data_in_leaf(),
larger_leaf_histogram_array_[feature_index].RawData());
}
SplitInfo larger_split;
// find best threshold for larger child
larger_leaf_histogram_array_[feature_index].FindBestThreshold(
larger_leaf_splits_->sum_gradients(),
larger_leaf_splits_->sum_hessians(),
larger_leaf_splits_->num_data_in_leaf(),
&larger_split);
if (larger_split.gain > larger_best[tid].gain) {
larger_best[tid] = larger_split;
larger_best[tid].feature = train_data_->RealFeatureIndex(feature_index);
}
}
auto smaller_best_idx = ArrayArgs<SplitInfo>::ArgMax(smaller_best);
int leaf = smaller_leaf_splits_->LeafIndex();
best_split_per_leaf_[leaf] = smaller_best[smaller_best_idx];
if (larger_leaf_splits_ != nullptr && larger_leaf_splits_->LeafIndex() >= 0) {
leaf = larger_leaf_splits_->LeafIndex();
auto larger_best_idx = ArrayArgs<SplitInfo>::ArgMax(larger_best);
best_split_per_leaf_[leaf] = larger_best[larger_best_idx];
}
#ifdef TIMETAG
find_split_time += std::chrono::steady_clock::now() - start_time;
#endif
}
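// Reduction pattern above: each OpenMP thread keeps its running best in
// smaller_best[tid] / larger_best[tid], so the parallel loop needs no locks;
// a single ArgMax pass over the num_threads_ candidates then selects the
// global best split for each leaf.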
void SerialTreeLearner::FindBestSplitsForLeaves() {
}
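// Intentionally empty in the serial learner: this is the hook the parallel
// learners override (e.g. FeatureParallelTreeLearner above syncs the local
// bests across machines).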
void SerialTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) {
const SplitInfo& best_split_info = best_split_per_leaf_[best_Leaf];
const int inner_feature_index = train_data_->InnerFeatureIndex(best_split_info.feature);
// left = parent
*left_leaf = best_Leaf;
// split tree, will return right leaf
*right_leaf = tree->Split(best_Leaf,
inner_feature_index,
train_data_->FeatureBinMapper(inner_feature_index)->bin_type(),
best_split_info.threshold,
best_split_info.feature,
train_data_->RealThreshold(inner_feature_index, best_split_info.threshold),
static_cast<double>(best_split_info.left_output),
static_cast<double>(best_split_info.right_output),
static_cast<data_size_t>(best_split_info.left_count),
static_cast<data_size_t>(best_split_info.right_count),
static_cast<double>(best_split_info.gain));
// split data partition
data_partition_->Split(best_Leaf, train_data_, inner_feature_index,
best_split_info.threshold, *right_leaf);
// init the leaves that used on next iteration
......@@ -431,8 +515,8 @@ void SerialTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* ri
best_split_info.left_sum_gradient,
best_split_info.left_sum_hessian);
larger_leaf_splits_->Init(*right_leaf, data_partition_.get(),
best_split_info.right_sum_gradient,
best_split_info.right_sum_hessian);
} else {
smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), best_split_info.right_sum_gradient, best_split_info.right_sum_hessian);
larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), best_split_info.left_sum_gradient, best_split_info.left_sum_hessian);
......