Repository: tianlh/LightGBM-DCU

Commit e984b0d6, authored May 15, 2017 by Guolin Ke, committed by GitHub on May 15, 2017

Handle for missing values (#516)

parent e8cc6ab9

Showing 18 changed files with 620 additions and 216 deletions (+620, -216)
include/LightGBM/bin.h                              +2    -1
include/LightGBM/dataset.h                          +2    -2
include/LightGBM/feature_group.h                    +2    -1
include/LightGBM/meta.h                             +2    -0
include/LightGBM/tree.h                             +33   -6
include/LightGBM/utils/common.h                     +10   -0
src/boosting/gbdt.cpp                               +2    -2
src/io/bin.cpp                                      +122  -71
src/io/dataset.cpp                                  +195  -13
src/io/dense_bin.hpp                                +7    -5
src/io/dense_nbits_bin.hpp                          +7    -5
src/io/sparse_bin.hpp                               +7    -5
src/io/tree.cpp                                     +62   -32
src/treelearner/data_partition.hpp                  +3    -2
src/treelearner/feature_histogram.hpp               +151  -68
src/treelearner/serial_tree_learner.cpp             +9    -2
src/treelearner/split_info.hpp                      +3    -1
src/treelearner/voting_parallel_tree_learner.cpp    +1    -0
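Taken together, these changes give zero-valued (and thus missing) feature entries their own bin and let the split finder decide, per tree node, which side of a split that bin should fall on; the choice is recorded as `default_bin_for_zero` in bin space and as `default_value` in raw-value space. The snippet below is a standalone illustration of the routing rule, copying the `DefaultValueForZero` helper this commit adds to include/LightGBM/tree.h; the per-node default and threshold values are invented for the example.

```cpp
#include <iostream>

// Mirrors kMissingValueRange from include/LightGBM/meta.h.
const double kMissingValueRange = 1e-20;

// Mirrors the double overload of Tree::DefaultValueForZero added in this
// commit: values inside (-zero, zero] are replaced by the per-node default.
double DefaultValueForZero(double fval, double zero, double out) {
  return (fval > -zero && fval <= zero) ? out : fval;
}

int main() {
  const double node_default = 2.5;    // hypothetical per-node default_value_
  const double node_threshold = 1.0;  // hypothetical split threshold
  for (double fval : {0.0, 0.5, 3.0}) {
    double routed = DefaultValueForZero(fval, kMissingValueRange, node_default);
    std::cout << "fval=" << fval << " goes "
              << (routed <= node_threshold ? "left" : "right") << "\n";
  }
  return 0;
}
```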
include/LightGBM/bin.h

@@ -360,6 +360,7 @@ public:
   * \param min_bin min_bin of current used feature
   * \param max_bin max_bin of current used feature
   * \param default_bin default bin if bin not in [min_bin, max_bin]
+  * \param default_bin_for_zero default bin for the zero (missing) bin
   * \param threshold The split threshold.
   * \param data_indices Used data indices. After this function is called, the less-than-or-equal data indices are stored in this object.
   * \param num_data Number of used data
...
@@ -369,7 +370,7 @@ public:
   * \return The number of less-than-or-equal data.
   */
-  virtual data_size_t Split(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, uint32_t threshold,
+  virtual data_size_t Split(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, uint32_t default_bin_for_zero, uint32_t threshold,
     data_size_t* data_indices, data_size_t num_data,
     data_size_t* lte_indices, data_size_t* gt_indices, BinType bin_type) const = 0;
...
include/LightGBM/dataset.h

@@ -402,12 +402,12 @@ public:
     HistogramBinEntry* data) const;

-  inline data_size_t Split(int feature, uint32_t threshold,
+  inline data_size_t Split(int feature, uint32_t threshold, uint32_t default_bin_for_zero,
     data_size_t* data_indices, data_size_t num_data,
     data_size_t* lte_indices, data_size_t* gt_indices) const {
     const int group = feature2group_[feature];
     const int sub_feature = feature2subfeature_[feature];
-    return feature_groups_[group]->Split(sub_feature, threshold, data_indices, num_data, lte_indices, gt_indices);
+    return feature_groups_[group]->Split(sub_feature, threshold, default_bin_for_zero, data_indices, num_data, lte_indices, gt_indices);
   }

   inline int SubFeatureBinOffset(int i) const {
...
include/LightGBM/feature_group.h

@@ -161,13 +161,14 @@ public:
   inline data_size_t Split(int sub_feature, uint32_t threshold,
+    uint32_t default_bin_for_zero,
     data_size_t* data_indices, data_size_t num_data,
     data_size_t* lte_indices, data_size_t* gt_indices) const {
     uint32_t min_bin = bin_offsets_[sub_feature];
     uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
     uint32_t default_bin = bin_mappers_[sub_feature]->GetDefaultBin();
-    return bin_data_->Split(min_bin, max_bin, default_bin,
+    return bin_data_->Split(min_bin, max_bin, default_bin, default_bin_for_zero,
       threshold, data_indices, num_data, lte_indices, gt_indices,
       bin_mappers_[sub_feature]->bin_type());
   }
   /*!
...
include/LightGBM/meta.h

@@ -19,6 +19,8 @@ const score_t kMinScore = -std::numeric_limits<score_t>::infinity();
 const score_t kEpsilon = 1e-15f;

+const double kMissingValueRange = 1e-20f;
+
 using ReduceFunction = std::function<void(const char*, char*, int)>;

 using PredictFunction =
...
include/LightGBM/tree.h

@@ -44,11 +44,15 @@ public:
   * \param left_cnt Count of left child
   * \param right_cnt Count of right child
   * \param gain Split gain
+  * \param zero_bin bin value for value==0 (the missing value)
+  * \param default_bin_for_zero default conversion for the missing value, in bin
+  * \param default_value default conversion for the missing value, in float value
   * \return The index of new leaf.
   */
-  int Split(int leaf, int feature, BinType bin_type, uint32_t threshold, int real_feature,
-            double threshold_double, double left_value, double right_value,
-            data_size_t left_cnt, data_size_t right_cnt, double gain);
+  int Split(int leaf, int feature, BinType bin_type, uint32_t threshold, int real_feature,
+            double threshold_double, double left_value, double right_value,
+            data_size_t left_cnt, data_size_t right_cnt, double gain,
+            uint32_t zero_bin, uint32_t default_bin_for_zero, double default_value);

   /*! \brief Get the output of one leaf */
   inline double LeafOutput(int leaf) const { return leaf_value_[leaf]; }
...
@@ -140,6 +144,23 @@ public:
     }
   }

+  static double DefaultValueForZero(double fval, double zero, double out) {
+    if (fval > -zero && fval <= zero) {
+      return out;
+    } else {
+      return fval;
+    }
+  }
+
+  static uint32_t DefaultValueForZero(uint32_t fval, uint32_t zero, uint32_t out) {
+    if (fval == zero) {
+      return out;
+    } else {
+      return fval;
+    }
+  }
+
   static const char* GetDecisionTypeName(int8_t type) {
     if (type == 0) {
       return "no_greater";
...
@@ -176,7 +197,7 @@ private:
   /*! \brief A non-leaf node's right child */
   std::vector<int> right_child_;
   /*! \brief A non-leaf node's split feature */
-  std::vector<int> split_feature_inner;
+  std::vector<int> split_feature_inner_;
   /*! \brief A non-leaf node's split feature, the original index */
   std::vector<int> split_feature_;
   /*! \brief A non-leaf node's split threshold in bin */
...
@@ -185,6 +206,10 @@ private:
   std::vector<double> threshold_;
   /*! \brief Decision type, 0 for '<=' (numerical feature), 1 for 'is' (categorical feature) */
   std::vector<int8_t> decision_type_;
+  /*! \brief Default values for the na/0 feature values */
+  std::vector<double> default_value_;
+  std::vector<uint32_t> zero_bin_;
+  std::vector<uint32_t> default_bin_for_zero_;
   /*! \brief A non-leaf node's split gain */
   std::vector<double> split_gain_;
   // used for leaf node
...
@@ -226,8 +251,9 @@ inline int Tree::GetLeaf(const double* feature_values) const {
   int node = 0;
   if (has_categorical_) {
     while (node >= 0) {
-      if (decision_funs[decision_type_[node]](feature_values[split_feature_[node]],
+      double fval = DefaultValueForZero(feature_values[split_feature_[node]], kMissingValueRange, default_value_[node]);
+      if (decision_funs[decision_type_[node]](fval,
           threshold_[node])) {
         node = left_child_[node];
       } else {
...
@@ -236,8 +262,9 @@ inline int Tree::GetLeaf(const double* feature_values) const {
     }
   } else {
     while (node >= 0) {
-      if (NumericalDecision<double>(feature_values[split_feature_[node]],
+      double fval = DefaultValueForZero(feature_values[split_feature_[node]], kMissingValueRange, default_value_[node]);
+      if (NumericalDecision<double>(fval,
           threshold_[node])) {
         node = left_child_[node];
       } else {
...
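On binned data the same substitution happens in index space: the `uint32_t` overload above replaces a bin equal to `zero_bin_` with `default_bin_for_zero_` before comparing against the in-bin threshold. A minimal self-contained sketch, with bin indices and the threshold invented for illustration:

```cpp
#include <cstdint>
#include <iostream>

// Mirrors the uint32_t overload of Tree::DefaultValueForZero.
uint32_t DefaultValueForZero(uint32_t fval, uint32_t zero, uint32_t out) {
  return (fval == zero) ? out : fval;
}

int main() {
  const uint32_t zero_bin = 0;              // hypothetical bin holding value==0
  const uint32_t default_bin_for_zero = 4;  // side chosen by the split finder
  const uint32_t threshold_in_bin = 2;
  for (uint32_t bin : {0u, 1u, 3u}) {
    uint32_t fval = DefaultValueForZero(bin, zero_bin, default_bin_for_zero);
    std::cout << "bin " << bin << " -> "
              << (fval <= threshold_in_bin ? "left" : "right") << "\n";
  }
  return 0;
}
```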
include/LightGBM/utils/common.h

@@ -462,6 +462,16 @@ inline static std::vector<int> VectorSize(const std::vector<std::vector<T>>& dat
   return ret;
 }

+inline static double AvoidInf(double x) {
+  if (x >= std::numeric_limits<double>::max()) {
+    return std::numeric_limits<double>::max();
+  } else if (x <= std::numeric_limits<double>::min()) {
+    return std::numeric_limits<double>::min();
+  } else {
+    return x;
+  }
+}
+
 }  // namespace Common

 }  // namespace LightGBM
...
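`Common::AvoidInf` clamps unbounded values so split gains and default values can be stored and serialized safely. One detail worth noticing: `std::numeric_limits<double>::min()` is the smallest positive normal double, not the most negative value, so the lower clamp as written triggers for anything at or below roughly 2.2e-308, including zero. The sketch below just exercises the function exactly as the diff defines it:

```cpp
#include <iostream>
#include <limits>

// Copy of Common::AvoidInf as added in include/LightGBM/utils/common.h.
inline double AvoidInf(double x) {
  if (x >= std::numeric_limits<double>::max()) {
    return std::numeric_limits<double>::max();
  } else if (x <= std::numeric_limits<double>::min()) {
    return std::numeric_limits<double>::min();
  } else {
    return x;
  }
}

int main() {
  std::cout << AvoidInf(std::numeric_limits<double>::infinity()) << "\n";  // clamped to max()
  std::cout << AvoidInf(1.5) << "\n";  // unchanged
  std::cout << AvoidInf(0.0) << "\n";  // 0 <= min(), so clamped up to min()
  return 0;
}
```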
src/boosting/gbdt.cpp

@@ -353,7 +353,7 @@ bool GBDT::TrainOneIter(const score_t* gradient, const score_t* hessian, bool is
     }
     init_score /= num_data_;
     std::unique_ptr<Tree> new_tree(new Tree(2));
-    new_tree->Split(0, 0, BinType::NumericalBin, 0, 0, 0, init_score, init_score, 0, num_data_, -1);
+    new_tree->Split(0, 0, BinType::NumericalBin, 0, 0, 0, init_score, init_score, 0, num_data_, -1, 0, 0, 0);
     train_score_updater_->AddScore(init_score, 0);
     for (auto& score_updater : valid_score_updater_) {
       score_updater->AddScore(init_score, 0);
...
@@ -432,7 +432,7 @@ bool GBDT::TrainOneIter(const score_t* gradient, const score_t* hessian, bool is
     if (!class_need_train_[cur_tree_id] && models_.size() < static_cast<size_t>(num_tree_per_iteration_)) {
       auto output = class_default_output_[cur_tree_id];
       new_tree->Split(0, 0, BinType::NumericalBin, 0, 0, 0,
-                      output, output, 0, num_data_, -1);
+                      output, output, 0, num_data_, -1, 0, 0, 0);
       train_score_updater_->AddScore(output, cur_tree_id);
       for (auto& score_updater : valid_score_updater_) {
         score_updater->AddScore(output, cur_tree_id);
...
src/io/bin.cpp

@@ -63,6 +63,76 @@ bool NeedFilter(std::vector<int>& cnt_in_bin, int total_cnt, int filter_cnt, Bin
   }
   return true;
 }

+std::vector<double> GreedyFindBin(const double* distinct_values, const int* counts,
+  int num_distinct_values, int max_bin, int total_cnt, int min_data_in_bin) {
+  std::vector<double> bin_upper_bound;
+  if (num_distinct_values <= max_bin) {
+    bin_upper_bound.clear();
+    int cur_cnt_inbin = 0;
+    for (int i = 0; i < num_distinct_values - 1; ++i) {
+      cur_cnt_inbin += counts[i];
+      if (cur_cnt_inbin >= min_data_in_bin) {
+        bin_upper_bound.push_back((distinct_values[i] + distinct_values[i + 1]) / 2);
+        cur_cnt_inbin = 0;
+      }
+    }
+    cur_cnt_inbin += counts[num_distinct_values - 1];
+    bin_upper_bound.push_back(std::numeric_limits<double>::infinity());
+  } else {
+    if (min_data_in_bin > 0) {
+      max_bin = std::min(max_bin, static_cast<int>(total_cnt / min_data_in_bin));
+      max_bin = std::max(max_bin, 1);
+    }
+    double mean_bin_size = static_cast<double>(total_cnt) / max_bin;
+    // mean size for one bin
+    int rest_bin_cnt = max_bin;
+    int rest_sample_cnt = static_cast<int>(total_cnt);
+    std::vector<bool> is_big_count_value(num_distinct_values, false);
+    for (int i = 0; i < num_distinct_values; ++i) {
+      if (counts[i] >= mean_bin_size) {
+        is_big_count_value[i] = true;
+        --rest_bin_cnt;
+        rest_sample_cnt -= counts[i];
+      }
+    }
+    mean_bin_size = static_cast<double>(rest_sample_cnt) / rest_bin_cnt;
+    std::vector<double> upper_bounds(max_bin, std::numeric_limits<double>::infinity());
+    std::vector<double> lower_bounds(max_bin, std::numeric_limits<double>::infinity());
+    int bin_cnt = 0;
+    lower_bounds[bin_cnt] = distinct_values[0];
+    int cur_cnt_inbin = 0;
+    for (int i = 0; i < num_distinct_values - 1; ++i) {
+      if (!is_big_count_value[i]) {
+        rest_sample_cnt -= counts[i];
+      }
+      cur_cnt_inbin += counts[i];
+      // need a new bin
+      if (is_big_count_value[i] || cur_cnt_inbin >= mean_bin_size ||
+        (is_big_count_value[i + 1] && cur_cnt_inbin >= std::max(1.0, mean_bin_size * 0.5f))) {
+        upper_bounds[bin_cnt] = distinct_values[i];
+        ++bin_cnt;
+        lower_bounds[bin_cnt] = distinct_values[i + 1];
+        if (bin_cnt >= max_bin - 1) { break; }
+        cur_cnt_inbin = 0;
+        if (!is_big_count_value[i]) {
+          --rest_bin_cnt;
+          mean_bin_size = rest_sample_cnt / static_cast<double>(rest_bin_cnt);
+        }
+      }
+    }
+    ++bin_cnt;
+    // update bin upper bound
+    bin_upper_bound.resize(bin_cnt);
+    for (int i = 0; i < bin_cnt - 1; ++i) {
+      bin_upper_bound[i] = (upper_bounds[i] + lower_bounds[i + 1]) / 2.0f;
+    }
+    // last bin upper bound
+    bin_upper_bound[bin_cnt - 1] = std::numeric_limits<double>::infinity();
+  }
+  return bin_upper_bound;
+}
+
 void BinMapper::FindBin(double* values, int num_sample_values, size_t total_sample_cnt,
   int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type) {
...
@@ -109,81 +179,62 @@ void BinMapper::FindBin(double* values, int num_sample_values, size_t total_samp
   std::vector<int> cnt_in_bin;
   int num_distinct_values = static_cast<int>(distinct_values.size());
   if (bin_type_ == BinType::NumericalBin) {
-    if (num_distinct_values <= max_bin) {
-      // use distinct value is enough
-      bin_upper_bound_.clear();
-      int cur_cnt_inbin = 0;
-      for (int i = 0; i < num_distinct_values - 1; ++i) {
-        cur_cnt_inbin += counts[i];
-        if (cur_cnt_inbin >= min_data_in_bin) {
-          bin_upper_bound_.push_back((distinct_values[i] + distinct_values[i + 1]) / 2);
-          cnt_in_bin.push_back(cur_cnt_inbin);
-          cur_cnt_inbin = 0;
-        }
-      }
-      cur_cnt_inbin += counts.back();
-      cnt_in_bin.push_back(cur_cnt_inbin);
-      bin_upper_bound_.push_back(std::numeric_limits<double>::infinity());
-      num_bin_ = static_cast<int>(bin_upper_bound_.size());
-    } else {
-      if (min_data_in_bin > 0) {
-        max_bin = std::min(max_bin, static_cast<int>(total_sample_cnt / min_data_in_bin));
-        max_bin = std::max(max_bin, 1);
-      }
-      double mean_bin_size = static_cast<double>(total_sample_cnt) / max_bin;
-      if (zero_cnt > mean_bin_size) {
-        int non_zero_cnt = num_sample_values;
-        max_bin = std::min(max_bin, 1 + static_cast<int>(non_zero_cnt / min_data_in_bin));
-      }
-      // mean size for one bin
-      int rest_bin_cnt = max_bin;
-      int rest_sample_cnt = static_cast<int>(total_sample_cnt);
-      std::vector<bool> is_big_count_value(num_distinct_values, false);
-      for (int i = 0; i < num_distinct_values; ++i) {
-        if (counts[i] >= mean_bin_size) {
-          is_big_count_value[i] = true;
-          --rest_bin_cnt;
-          rest_sample_cnt -= counts[i];
-        }
-      }
-      mean_bin_size = static_cast<double>(rest_sample_cnt) / rest_bin_cnt;
-      std::vector<double> upper_bounds(max_bin, std::numeric_limits<double>::infinity());
-      std::vector<double> lower_bounds(max_bin, std::numeric_limits<double>::infinity());
-      int bin_cnt = 0;
-      lower_bounds[bin_cnt] = distinct_values[0];
-      int cur_cnt_inbin = 0;
-      for (int i = 0; i < num_distinct_values - 1; ++i) {
-        if (!is_big_count_value[i]) {
-          rest_sample_cnt -= counts[i];
-        }
-        cur_cnt_inbin += counts[i];
-        // need a new bin
-        if (is_big_count_value[i] || cur_cnt_inbin >= mean_bin_size ||
-          (is_big_count_value[i + 1] && cur_cnt_inbin >= std::max(1.0, mean_bin_size * 0.5f))) {
-          upper_bounds[bin_cnt] = distinct_values[i];
-          cnt_in_bin.push_back(cur_cnt_inbin);
-          ++bin_cnt;
-          lower_bounds[bin_cnt] = distinct_values[i + 1];
-          if (bin_cnt >= max_bin - 1) { break; }
-          cur_cnt_inbin = 0;
-          if (!is_big_count_value[i]) {
-            --rest_bin_cnt;
-            mean_bin_size = rest_sample_cnt / static_cast<double>(rest_bin_cnt);
-          }
-        }
-      }
-      cur_cnt_inbin += counts.back();
-      cnt_in_bin.push_back(cur_cnt_inbin);
-      ++bin_cnt;
-      // update bin upper bound
-      bin_upper_bound_ = std::vector<double>(bin_cnt);
-      num_bin_ = bin_cnt;
-      for (int i = 0; i < bin_cnt - 1; ++i) {
-        bin_upper_bound_[i] = (upper_bounds[i] + lower_bounds[i + 1]) / 2.0f;
-      }
-      // last bin upper bound
-      bin_upper_bound_[bin_cnt - 1] = std::numeric_limits<double>::infinity();
-    }
+    bin_upper_bound_.clear();
+    int left_cnt_data = 0;
+    int missing_cnt_data = 0;
+    int right_cnt_data = 0;
+    for (int i = 0; i < num_distinct_values; ++i) {
+      if (distinct_values[i] <= -kMissingValueRange) {
+        left_cnt_data += counts[i];
+      } else if (distinct_values[i] > kMissingValueRange) {
+        right_cnt_data += counts[i];
+      } else {
+        missing_cnt_data += counts[i];
+      }
+    }
+    int left_cnt = 0;
+    for (int i = 0; i < num_distinct_values; ++i) {
+      if (distinct_values[i] > -kMissingValueRange) {
+        left_cnt = i;
+        break;
+      }
+    }
+    if (left_cnt > 0) {
+      int left_max_bin = static_cast<int>(static_cast<double>(left_cnt_data) / (total_sample_cnt - missing_cnt_data) * (max_bin - 1));
+      bin_upper_bound_ = GreedyFindBin(distinct_values.data(), counts.data(), left_cnt, left_max_bin, left_cnt_data, min_data_in_bin);
+      bin_upper_bound_.back() = -kMissingValueRange;
+    }
+    int right_start = -1;
+    for (int i = left_cnt; i < num_distinct_values; ++i) {
+      if (distinct_values[i] > kMissingValueRange) {
+        right_start = i;
+        break;
+      }
+    }
+    if (right_start >= 0) {
+      int right_max_bin = max_bin - 1 - static_cast<int>(bin_upper_bound_.size());
+      auto right_bounds = GreedyFindBin(distinct_values.data() + right_start, counts.data() + right_start,
+        num_distinct_values - right_start, right_max_bin, right_cnt_data, min_data_in_bin);
+      bin_upper_bound_.push_back(kMissingValueRange);
+      bin_upper_bound_.insert(bin_upper_bound_.end(), right_bounds.begin(), right_bounds.end());
+    } else {
+      bin_upper_bound_.push_back(std::numeric_limits<double>::infinity());
+    }
+    num_bin_ = static_cast<int>(bin_upper_bound_.size());
+    {
+      cnt_in_bin.resize(num_bin_, 0);
+      int i_bin = 0;
+      for (int i = 0; i < num_distinct_values; ++i) {
+        if (distinct_values[i] > bin_upper_bound_[i_bin]) {
+          ++i_bin;
+        }
+        cnt_in_bin[i_bin] += counts[i];
+      }
+    }
+    CHECK(num_bin_ <= max_bin);
   } else {
...
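The rewritten `FindBin` lays the bins out in three regions: strictly negative values (closed off at `-kMissingValueRange`), a dedicated zero/missing bin covering `(-kMissingValueRange, kMissingValueRange]`, and positive values ending at `+infinity`, calling `GreedyFindBin` on each side. The toy program below illustrates that layout; the sample bounds and values are invented, and the lookup lambda is a stand-in for the real bin search.

```cpp
#include <iostream>
#include <limits>
#include <vector>

int main() {
  const double kMissingValueRange = 1e-20;
  // Hypothetical upper bounds FindBin might produce: two bins on the
  // negative side, the zero bin, then two bins on the positive side.
  std::vector<double> bin_upper_bound = {
    -3.0,                 // bin 0: values in (-inf, -3.0]
    -kMissingValueRange,  // bin 1: values in (-3.0, -1e-20]
    kMissingValueRange,   // bin 2: the zero/missing bin (-1e-20, 1e-20]
    5.0,                  // bin 3: values in (1e-20, 5.0]
    std::numeric_limits<double>::infinity()  // bin 4: everything above
  };
  // A value's bin is the first upper bound it does not exceed.
  auto value_to_bin = [&](double v) {
    size_t b = 0;
    while (v > bin_upper_bound[b]) ++b;
    return b;
  };
  for (double v : {-4.0, -1.0, 0.0, 2.0, 7.0}) {
    std::cout << v << " -> bin " << value_to_bin(v) << "\n";
  }
  return 0;
}
```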
src/io/dataset.cpp

@@ -15,6 +15,10 @@
 namespace LightGBM {

+#ifdef USE_GPU
+const int kMaxBinPerGroup = 256;
+#endif  // USE_GPU
+
 const char* Dataset::binary_file_token = "______LightGBM_Binary_File_Token______\n";

 Dataset::Dataset() {
...
@@ -43,12 +47,180 @@ std::vector<std::vector<int>> NoGroup(
   return features_in_group;
 }

+int GetConfilctCount(const std::vector<bool>& mark, const int* indices, int num_indices, int max_cnt) {
+  int ret = 0;
+  for (int i = 0; i < num_indices; ++i) {
+    if (mark[indices[i]]) {
+      ++ret;
+      if (ret > max_cnt) {
+        return -1;
+      }
+    }
+  }
+  return ret;
+}
+
+void MarkUsed(std::vector<bool>& mark, const int* indices, int num_indices) {
+  for (int i = 0; i < num_indices; ++i) {
+    mark[indices[i]] = true;
+  }
+}
+
+std::vector<std::vector<int>> FindGroups(const std::vector<std::unique_ptr<BinMapper>>& bin_mappers,
+  const std::vector<int>& find_order, int** sample_indices, const int* num_per_col,
+  size_t total_sample_cnt, data_size_t max_error_cnt, data_size_t filter_cnt, data_size_t num_data) {
+  const int max_search_group = 100;
+  Random rand(num_data);
+  std::vector<std::vector<int>> features_in_group;
+  std::vector<std::vector<bool>> conflict_marks;
+  std::vector<int> group_conflict_cnt;
+  std::vector<size_t> group_non_zero_cnt;
+#ifdef USE_GPU
+  std::vector<int> group_num_bin;
+#endif  // USE_GPU
+  for (auto fidx : find_order) {
+    const size_t cur_non_zero_cnt = num_per_col[fidx];
+    bool need_new_group = true;
+    std::vector<int> available_groups;
+    for (int gid = 0; gid < static_cast<int>(features_in_group.size()); ++gid) {
+      if (group_non_zero_cnt[gid] + cur_non_zero_cnt <= total_sample_cnt + max_error_cnt
+#ifdef USE_GPU
+          && group_num_bin[gid] + bin_mappers[fidx]->num_bin()
+             + (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0) <= kMaxBinPerGroup
+#endif  // USE_GPU
+          ) {
+        available_groups.push_back(gid);
+      }
+    }
+    std::vector<int> search_groups;
+    if (!available_groups.empty()) {
+      int last = static_cast<int>(available_groups.size()) - 1;
+      auto indices = rand.Sample(last, std::min(last, max_search_group - 1));
+      search_groups.push_back(available_groups.back());
+      for (auto idx : indices) {
+        search_groups.push_back(available_groups[idx]);
+      }
+    }
+    for (auto gid : search_groups) {
+      const int rest_max_cnt = max_error_cnt - group_conflict_cnt[gid];
+      int cnt = GetConfilctCount(conflict_marks[gid], sample_indices[fidx], num_per_col[fidx], rest_max_cnt);
+      if (cnt >= 0 && cnt <= rest_max_cnt) {
+        data_size_t rest_non_zero_data = static_cast<data_size_t>(
+          static_cast<double>(cur_non_zero_cnt - cnt) * num_data / total_sample_cnt);
+        if (rest_non_zero_data < filter_cnt) { continue; }
+        need_new_group = false;
+        features_in_group[gid].push_back(fidx);
+        group_conflict_cnt[gid] += cnt;
+        group_non_zero_cnt[gid] += cur_non_zero_cnt - cnt;
+        MarkUsed(conflict_marks[gid], sample_indices[fidx], num_per_col[fidx]);
+#ifdef USE_GPU
+        group_num_bin[gid] += bin_mappers[fidx]->num_bin()
+          + (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0);
+#endif  // USE_GPU
+        break;
+      }
+    }
+    if (need_new_group) {
+      features_in_group.emplace_back();
+      features_in_group.back().push_back(fidx);
+      group_conflict_cnt.push_back(0);
+      conflict_marks.emplace_back(total_sample_cnt, false);
+      MarkUsed(conflict_marks.back(), sample_indices[fidx], num_per_col[fidx]);
+      group_non_zero_cnt.emplace_back(cur_non_zero_cnt);
+#ifdef USE_GPU
+      group_num_bin.push_back(1 + bin_mappers[fidx]->num_bin()
+        + (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0));
+#endif  // USE_GPU
+    }
+  }
+  return features_in_group;
+}
+
+std::vector<std::vector<int>> FastFeatureBundling(std::vector<std::unique_ptr<BinMapper>>& bin_mappers,
+  int** sample_indices, const int* num_per_col, size_t total_sample_cnt,
+  const std::vector<int>& used_features, double max_conflict_rate,
+  data_size_t num_data, data_size_t min_data, double sparse_threshold, bool is_enable_sparse) {
+  // filter is based on sampled data, so decrease its range
+  const data_size_t filter_cnt = static_cast<data_size_t>(static_cast<double>(0.95 * min_data) / num_data * total_sample_cnt);
+  const data_size_t max_error_cnt = static_cast<data_size_t>(total_sample_cnt * max_conflict_rate);
+  int cur_used_feature_cnt = 0;
+  std::vector<size_t> feature_non_zero_cnt;
+  // put dense features first
+  for (auto fidx : used_features) {
+    feature_non_zero_cnt.emplace_back(num_per_col[fidx]);
+    ++cur_used_feature_cnt;
+  }
+  std::vector<int> sorted_idx;
+  for (int i = 0; i < cur_used_feature_cnt; ++i) {
+    sorted_idx.emplace_back(i);
+  }
+  // sort by non-zero count, bigger first
+  std::sort(sorted_idx.begin(), sorted_idx.end(),
+    [&feature_non_zero_cnt](int a, int b) {
+      return feature_non_zero_cnt[a] > feature_non_zero_cnt[b];
+    });
+  std::vector<int> feature_order_by_cnt;
+  for (auto sidx : sorted_idx) {
+    feature_order_by_cnt.push_back(used_features[sidx]);
+  }
+  auto features_in_group = FindGroups(bin_mappers, used_features, sample_indices, num_per_col, total_sample_cnt, max_error_cnt, filter_cnt, num_data);
+  auto group2 = FindGroups(bin_mappers, feature_order_by_cnt, sample_indices, num_per_col, total_sample_cnt, max_error_cnt, filter_cnt, num_data);
+  if (features_in_group.size() > group2.size()) {
+    features_in_group = group2;
+  }
+  std::vector<std::vector<int>> ret;
+  for (size_t i = 0; i < features_in_group.size(); ++i) {
+    if (features_in_group[i].size() <= 1 || features_in_group[i].size() >= 5) {
+      ret.push_back(features_in_group[i]);
+    } else {
+      int cnt_non_zero = 0;
+      for (size_t j = 0; j < features_in_group[i].size(); ++j) {
+        const int fidx = features_in_group[i][j];
+        cnt_non_zero += static_cast<int>(num_data * (1.0f - bin_mappers[fidx]->sparse_rate()));
+      }
+      double sparse_rate = 1.0f - static_cast<double>(cnt_non_zero) / (num_data);
+      // take apart small sparse groups, since they will not gain on speed
+      if (sparse_rate >= sparse_threshold && is_enable_sparse) {
+        for (size_t j = 0; j < features_in_group[i].size(); ++j) {
+          const int fidx = features_in_group[i][j];
+          ret.emplace_back();
+          ret.back().push_back(fidx);
+        }
+      } else {
+        ret.push_back(features_in_group[i]);
+      }
+    }
+  }
+  // shuffle groups
+  int num_group = static_cast<int>(ret.size());
+  Random tmp_rand(12);
+  for (int i = 0; i < num_group - 1; ++i) {
+    int j = tmp_rand.NextShort(i + 1, num_group);
+    std::swap(ret[i], ret[j]);
+  }
+  return ret;
+}
+
 void Dataset::Construct(
   std::vector<std::unique_ptr<BinMapper>>& bin_mappers,
-  int**, const int*, size_t,
+  int** sample_non_zero_indices,
+  const int* num_per_col,
+  size_t total_sample_cnt,
   const IOConfig& io_config) {
   num_total_features_ = static_cast<int>(bin_mappers.size());
+  sparse_threshold_ = io_config.sparse_threshold;
   // get num_features
...
@@ -61,6 +233,15 @@ void Dataset::Construct(
   auto features_in_group = NoGroup(used_features);

+  if (io_config.enable_bundle) {
+    std::chrono::duration<double, std::milli> bundling_time_;
+    features_in_group = FastFeatureBundling(bin_mappers,
+      sample_non_zero_indices, num_per_col, total_sample_cnt,
+      used_features, io_config.max_conflict_rate,
+      num_data_, io_config.min_data_in_leaf,
+      sparse_threshold_, io_config.is_enable_sparse);
+  }
   num_features_ = 0;
   for (const auto& fs : features_in_group) {
     num_features_ += static_cast<int>(fs.size());
...
@@ -86,7 +267,8 @@ void Dataset::Construct(
       ++cur_fidx;
     }
-    feature_groups_.emplace_back(std::unique_ptr<FeatureGroup>(
-      new FeatureGroup(cur_cnt_features, cur_bin_mappers, num_data_, io_config.is_enable_sparse)));
+    feature_groups_.emplace_back(std::unique_ptr<FeatureGroup>(
+      new FeatureGroup(cur_cnt_features, cur_bin_mappers, num_data_, sparse_threshold_, io_config.is_enable_sparse)));
   }
   feature_groups_.shrink_to_fit();
   group_bin_boundaries_.clear();
...
@@ -116,7 +298,7 @@ void Dataset::Construct(
 void Dataset::FinishLoad() {
   if (is_finish_load_) { return; }
   OMP_INIT_EX();
 #pragma omp parallel for schedule(guided)
   for (int i = 0; i < num_groups_; ++i) {
     OMP_LOOP_EX_BEGIN();
     feature_groups_[i]->bin_data_->FinishLoad();
...
@@ -212,7 +394,7 @@ void Dataset::ReSize(data_size_t num_data) {
   if (num_data_ != num_data) {
     num_data_ = num_data;
     OMP_INIT_EX();
 #pragma omp parallel for schedule(static)
     for (int group = 0; group < num_groups_; ++group) {
       OMP_LOOP_EX_BEGIN();
       feature_groups_[group]->bin_data_->ReSize(num_data_);
...
@@ -314,7 +496,7 @@ bool Dataset::GetIntField(const char* field_name, data_size_t* out_len, const in
 void Dataset::SaveBinaryFile(const char* bin_filename) {
-  if (bin_filename != nullptr && std::string(bin_filename) == std::string(data_filename_)) {
+  if (bin_filename != nullptr
+      && std::string(bin_filename) == std::string(data_filename_)) {
     Log::Warning("Binary file %s already exists", bin_filename);
     return;
   }
...
@@ -326,11 +508,11 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
   }
   bool is_file_existed = false;
   FILE* file;
 #ifdef _MSC_VER
   fopen_s(&file, bin_filename, "rb");
 #else
   file = fopen(bin_filename, "rb");
 #endif
   if (file != NULL) {
     is_file_existed = true;
...
@@ -339,11 +521,11 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
   }
   if (!is_file_existed) {
 #ifdef _MSC_VER
     fopen_s(&file, bin_filename, "wb");
 #else
     file = fopen(bin_filename, "wb");
 #endif
     if (file == NULL) {
       Log::Fatal("Cannot write binary data to %s ", bin_filename);
     }
...
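`FindGroups` bundles mutually sparse features into one group when their non-zero rows rarely overlap; `GetConfilctCount` (the misspelling is the commit's own identifier) counts overlapping rows against a budget and bails out early with -1 once it is exceeded. A self-contained sketch of that conflict test, with invented row indices:

```cpp
#include <iostream>
#include <vector>

// Mirrors GetConfilctCount from src/io/dataset.cpp: returns -1 once the
// number of already-marked rows exceeds max_cnt.
int GetConfilctCount(const std::vector<bool>& mark, const int* indices,
                     int num_indices, int max_cnt) {
  int ret = 0;
  for (int i = 0; i < num_indices; ++i) {
    if (mark[indices[i]]) {
      ++ret;
      if (ret > max_cnt) return -1;
    }
  }
  return ret;
}

// Mirrors MarkUsed: records which rows a group already occupies.
void MarkUsed(std::vector<bool>& mark, const int* indices, int num_indices) {
  for (int i = 0; i < num_indices; ++i) mark[indices[i]] = true;
}

int main() {
  std::vector<bool> mark(10, false);
  std::vector<int> feat_a = {0, 2, 4};  // rows where feature A is non-zero
  std::vector<int> feat_b = {1, 2, 9};  // overlaps feature A only at row 2
  MarkUsed(mark, feat_a.data(), static_cast<int>(feat_a.size()));
  int cnt = GetConfilctCount(mark, feat_b.data(),
                             static_cast<int>(feat_b.size()), /*max_cnt=*/1);
  std::cout << "conflicts: " << cnt << "\n";  // 1, within budget, so bundle
  return 0;
}
```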
src/io/dense_bin.hpp

@@ -188,29 +188,31 @@ public:
   }

-  virtual data_size_t Split(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin,
+  virtual data_size_t Split(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, uint32_t default_bin_for_zero,
     uint32_t threshold, data_size_t* data_indices, data_size_t num_data,
     data_size_t* lte_indices, data_size_t* gt_indices, BinType bin_type) const override {
     if (num_data <= 0) { return 0; }
     VAL_T th = static_cast<VAL_T>(threshold + min_bin);
     VAL_T minb = static_cast<VAL_T>(min_bin);
     VAL_T maxb = static_cast<VAL_T>(max_bin);
+    VAL_T t_default_bin = static_cast<VAL_T>(min_bin + default_bin);
+    if (default_bin == 0) { th -= 1; t_default_bin -= 1; }
     data_size_t lte_count = 0;
     data_size_t gt_count = 0;
     data_size_t* default_indices = gt_indices;
     data_size_t* default_count = &gt_count;
     if (bin_type == BinType::NumericalBin) {
-      if (default_bin <= threshold) {
+      if (default_bin_for_zero <= threshold) {
         default_indices = lte_indices;
         default_count = &lte_count;
       }
       for (data_size_t i = 0; i < num_data; ++i) {
         const data_size_t idx = data_indices[i];
         VAL_T bin = data_[idx];
-        if (bin > maxb || bin < minb) {
+        if (bin < minb || bin > maxb || t_default_bin == bin) {
           default_indices[(*default_count)++] = idx;
         } else if (bin > th) {
           gt_indices[gt_count++] = idx;
...
@@ -219,14 +221,14 @@ public:
         }
       }
     } else {
-      if (default_bin == threshold) {
+      if (default_bin_for_zero == threshold) {
        default_indices = lte_indices;
        default_count = &lte_count;
       }
       for (data_size_t i = 0; i < num_data; ++i) {
         const data_size_t idx = data_indices[i];
         VAL_T bin = data_[idx];
-        if (bin > maxb || bin < minb) {
+        if (bin < minb || bin > maxb || t_default_bin == bin) {
           default_indices[(*default_count)++] = idx;
         } else if (bin != th) {
           gt_indices[gt_count++] = idx;
...
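Inside each bin container, `Split` now routes three cases to the "default" side: bins below `min_bin`, bins above `max_bin`, and the bin equal to `t_default_bin` (the feature's zero bin offset into group coordinates); which side counts as default is decided by comparing `default_bin_for_zero` with the threshold. The same rule is mirrored in dense_nbits_bin.hpp and sparse_bin.hpp below. Here is a compact, simplified sketch of the numerical branch, classifying single bin values rather than index buffers; all concrete values are invented:

```cpp
#include <cstdint>
#include <iostream>

// Simplified sketch of DenseBin::Split's numerical branch; the real code
// partitions data_indices into lte/gt arrays instead of returning a label.
const char* Route(uint32_t bin, uint32_t min_bin, uint32_t max_bin,
                  uint32_t default_bin, uint32_t default_bin_for_zero,
                  uint32_t threshold) {
  uint32_t th = threshold + min_bin;
  uint32_t t_default_bin = min_bin + default_bin;
  if (default_bin == 0) { th -= 1; t_default_bin -= 1; }
  const char* default_side =
      (default_bin_for_zero <= threshold) ? "left (default)" : "right (default)";
  if (bin < min_bin || bin > max_bin || bin == t_default_bin) return default_side;
  return (bin <= th) ? "left" : "right";
}

int main() {
  // Hypothetical feature occupying bins [3, 8] of its group, zero bin at
  // offset 0, split threshold at offset 2, zeros routed to the left side.
  for (uint32_t bin : {0u, 3u, 4u, 7u}) {
    std::cout << "bin " << bin << " -> " << Route(bin, 3, 8, 0, 0, 2) << "\n";
  }
  return 0;
}
```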
src/io/dense_nbits_bin.hpp

@@ -227,29 +227,31 @@ public:
   }

-  virtual data_size_t Split(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin,
+  virtual data_size_t Split(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, uint32_t default_bin_for_zero,
     uint32_t threshold, data_size_t* data_indices, data_size_t num_data,
     data_size_t* lte_indices, data_size_t* gt_indices, BinType bin_type) const override {
     if (num_data <= 0) { return 0; }
     uint8_t th = static_cast<uint8_t>(threshold + min_bin);
     uint8_t minb = static_cast<uint8_t>(min_bin);
     uint8_t maxb = static_cast<uint8_t>(max_bin);
+    uint8_t t_default_bin = static_cast<uint8_t>(min_bin + default_bin);
+    if (default_bin == 0) { th -= 1; t_default_bin -= 1; }
     data_size_t lte_count = 0;
     data_size_t gt_count = 0;
     data_size_t* default_indices = gt_indices;
     data_size_t* default_count = &gt_count;
     if (bin_type == BinType::NumericalBin) {
-      if (default_bin <= threshold) {
+      if (default_bin_for_zero <= threshold) {
         default_indices = lte_indices;
         default_count = &lte_count;
       }
       for (data_size_t i = 0; i < num_data; ++i) {
         const data_size_t idx = data_indices[i];
         const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
-        if (bin > maxb || bin < minb) {
+        if (bin < minb || bin > maxb || t_default_bin == bin) {
           default_indices[(*default_count)++] = idx;
         } else if (bin > th) {
           gt_indices[gt_count++] = idx;
...
@@ -258,14 +260,14 @@ public:
         }
       }
     } else {
-      if (default_bin == threshold) {
+      if (default_bin_for_zero == threshold) {
        default_indices = lte_indices;
        default_count = &lte_count;
       }
       for (data_size_t i = 0; i < num_data; ++i) {
         const data_size_t idx = data_indices[i];
         const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
-        if (bin > maxb || bin < minb) {
+        if (bin < minb || bin > maxb || t_default_bin == bin) {
           default_indices[(*default_count)++] = idx;
         } else if (bin != th) {
           gt_indices[gt_count++] = idx;
...
src/io/sparse_bin.hpp

@@ -142,7 +142,7 @@ public:
   }

-  virtual data_size_t Split(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin,
+  virtual data_size_t Split(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, uint32_t default_bin_for_zero,
     uint32_t threshold, data_size_t* data_indices, data_size_t num_data,
     data_size_t* lte_indices, data_size_t* gt_indices, BinType bin_type) const override {
     // no need to split
...
@@ -150,8 +150,10 @@ public:
     VAL_T th = static_cast<VAL_T>(threshold + min_bin);
     VAL_T minb = static_cast<VAL_T>(min_bin);
     VAL_T maxb = static_cast<VAL_T>(max_bin);
+    VAL_T t_default_bin = static_cast<VAL_T>(min_bin + default_bin);
+    if (default_bin == 0) { th -= 1; t_default_bin -= 1; }
     SparseBinIterator<VAL_T> iterator(this, data_indices[0]);
     data_size_t lte_count = 0;
...
@@ -159,14 +161,14 @@ public:
     data_size_t* default_indices = gt_indices;
     data_size_t* default_count = &gt_count;
     if (bin_type == BinType::NumericalBin) {
-      if (default_bin <= threshold) {
+      if (default_bin_for_zero <= threshold) {
         default_indices = lte_indices;
         default_count = &lte_count;
       }
       for (data_size_t i = 0; i < num_data; ++i) {
         const data_size_t idx = data_indices[i];
         VAL_T bin = iterator.InnerRawGet(idx);
-        if (bin > maxb || bin < minb) {
+        if (bin < minb || bin > maxb || t_default_bin == bin) {
           default_indices[(*default_count)++] = idx;
         } else if (bin > th) {
           gt_indices[gt_count++] = idx;
...
@@ -175,14 +177,14 @@ public:
         }
       }
     } else {
-      if (default_bin == threshold) {
+      if (default_bin_for_zero == threshold) {
        default_indices = lte_indices;
        default_count = &lte_count;
       }
       for (data_size_t i = 0; i < num_data; ++i) {
         const data_size_t idx = data_indices[i];
         VAL_T bin = iterator.InnerRawGet(idx);
-        if (bin > maxb || bin < minb) {
+        if (bin < minb || bin > maxb || t_default_bin == bin) {
           default_indices[(*default_count)++] = idx;
         } else if (bin != th) {
           gt_indices[gt_count++] = idx;
...
src/io/tree.cpp

@@ -24,20 +24,23 @@ Tree::Tree(int max_leaves)
   : max_leaves_(max_leaves) {
   num_leaves_ = 0;
-  left_child_ = std::vector<int>(max_leaves_ - 1);
-  right_child_ = std::vector<int>(max_leaves_ - 1);
-  split_feature_inner = std::vector<int>(max_leaves_ - 1);
-  split_feature_ = std::vector<int>(max_leaves_ - 1);
-  threshold_in_bin_ = std::vector<uint32_t>(max_leaves_ - 1);
-  threshold_ = std::vector<double>(max_leaves_ - 1);
-  decision_type_ = std::vector<int8_t>(max_leaves_ - 1);
-  split_gain_ = std::vector<double>(max_leaves_ - 1);
-  leaf_parent_ = std::vector<int>(max_leaves_);
-  leaf_value_ = std::vector<double>(max_leaves_);
-  leaf_count_ = std::vector<data_size_t>(max_leaves_);
-  internal_value_ = std::vector<double>(max_leaves_ - 1);
-  internal_count_ = std::vector<data_size_t>(max_leaves_ - 1);
-  leaf_depth_ = std::vector<int>(max_leaves_);
+  left_child_.resize(max_leaves_ - 1);
+  right_child_.resize(max_leaves_ - 1);
+  split_feature_inner_.resize(max_leaves_ - 1);
+  split_feature_.resize(max_leaves_ - 1);
+  threshold_in_bin_.resize(max_leaves_ - 1);
+  threshold_.resize(max_leaves_ - 1);
+  decision_type_.resize(max_leaves_ - 1);
+  default_value_.resize(max_leaves_ - 1);
+  zero_bin_.resize(max_leaves_ - 1);
+  default_bin_for_zero_.resize(max_leaves_ - 1);
+  split_gain_.resize(max_leaves_ - 1);
+  leaf_parent_.resize(max_leaves_);
+  leaf_value_.resize(max_leaves_);
+  leaf_count_.resize(max_leaves_);
+  internal_value_.resize(max_leaves_ - 1);
+  internal_count_.resize(max_leaves_ - 1);
+  leaf_depth_.resize(max_leaves_);
   // root is in the depth 0
   leaf_depth_[0] = 0;
   num_leaves_ = 1;
...
@@ -49,9 +52,9 @@ Tree::~Tree() {
 }

-int Tree::Split(int leaf, int feature, BinType bin_type, uint32_t threshold_bin, int real_feature,
-                double threshold_double, double left_value, double right_value,
-                data_size_t left_cnt, data_size_t right_cnt, double gain) {
+int Tree::Split(int leaf, int feature, BinType bin_type, uint32_t threshold_bin, int real_feature,
+                double threshold_double, double left_value, double right_value,
+                data_size_t left_cnt, data_size_t right_cnt, double gain,
+                uint32_t zero_bin, uint32_t default_bin_for_zero, double default_value) {
   int new_node_idx = num_leaves_ - 1;
   // update parent info
   int parent = leaf_parent_[leaf];
...
@@ -64,17 +67,23 @@ int Tree::Split(int leaf, int feature, BinType bin_type, uint32_t threshold_bin,
     }
   }
   // add new node
-  split_feature_inner[new_node_idx] = feature;
+  split_feature_inner_[new_node_idx] = feature;
   split_feature_[new_node_idx] = real_feature;
+  zero_bin_[new_node_idx] = zero_bin;
+  default_bin_for_zero_[new_node_idx] = default_bin_for_zero;
+  default_value_[new_node_idx] = Common::AvoidInf(default_value);
   if (bin_type == BinType::NumericalBin) {
     decision_type_[new_node_idx] = 0;
   } else {
     has_categorical_ = true;
     decision_type_[new_node_idx] = 1;
   }
   threshold_in_bin_[new_node_idx] = threshold_bin;
   threshold_[new_node_idx] = threshold_double;
-  split_gain_[new_node_idx] = gain == std::numeric_limits<double>::infinity() ? std::numeric_limits<double>::max() : gain;
+  split_gain_[new_node_idx] = Common::AvoidInf(gain);
   // add two new leaves
   left_child_[new_node_idx] = ~leaf;
   right_child_[new_node_idx] = ~num_leaves_;
...
@@ -104,15 +113,16 @@ void Tree::AddPredictionToScore(const Dataset* data, data_size_t num_data, doubl
     [this, &data, score](int, data_size_t start, data_size_t end) {
     std::vector<std::unique_ptr<BinIterator>> iter(num_leaves_ - 1);
     for (int i = 0; i < num_leaves_ - 1; ++i) {
-      const int fidx = split_feature_inner[i];
+      const int fidx = split_feature_inner_[i];
       iter[i].reset(data->FeatureIterator(fidx));
       iter[i]->Reset(start);
     }
     for (data_size_t i = start; i < end; ++i) {
       int node = 0;
       while (node >= 0) {
+        uint32_t fval = DefaultValueForZero(iter[node]->Get(i), zero_bin_[node], default_bin_for_zero_[node]);
-        if (inner_decision_funs[decision_type_[node]](iter[node]->Get(i), threshold_in_bin_[node])) {
+        if (inner_decision_funs[decision_type_[node]](fval, threshold_in_bin_[node])) {
           node = left_child_[node];
         } else {
...
@@ -133,8 +143,9 @@ void Tree::AddPredictionToScore(const Dataset* data, data_size_t num_data, doubl
     for (data_size_t i = start; i < end; ++i) {
       int node = 0;
       while (node >= 0) {
+        uint32_t fval = DefaultValueForZero(iter[split_feature_inner_[node]]->Get(i), zero_bin_[node], default_bin_for_zero_[node]);
-        if (inner_decision_funs[decision_type_[node]](iter[split_feature_inner[node]]->Get(i), threshold_in_bin_[node])) {
+        if (inner_decision_funs[decision_type_[node]](fval, threshold_in_bin_[node])) {
           node = left_child_[node];
         } else {
...
@@ -151,14 +162,15 @@ void Tree::AddPredictionToScore(const Dataset* data, data_size_t num_data, doubl
     [this, &data, score](int, data_size_t start, data_size_t end) {
     std::vector<std::unique_ptr<BinIterator>> iter(num_leaves_ - 1);
     for (int i = 0; i < num_leaves_ - 1; ++i) {
-      const int fidx = split_feature_inner[i];
+      const int fidx = split_feature_inner_[i];
       iter[i].reset(data->FeatureIterator(fidx));
       iter[i]->Reset(start);
     }
     for (data_size_t i = start; i < end; ++i) {
       int node = 0;
       while (node >= 0) {
-        if (iter[node]->Get(i) <= threshold_in_bin_[node]) {
+        uint32_t fval = DefaultValueForZero(iter[node]->Get(i), zero_bin_[node], default_bin_for_zero_[node]);
+        if (fval <= threshold_in_bin_[node]) {
           node = left_child_[node];
         } else {
           node = right_child_[node];
...
@@ -178,7 +190,8 @@ void Tree::AddPredictionToScore(const Dataset* data, data_size_t num_data, doubl
     for (data_size_t i = start; i < end; ++i) {
       int node = 0;
       while (node >= 0) {
-        if (iter[split_feature_inner[node]]->Get(i) <= threshold_in_bin_[node]) {
+        uint32_t fval = DefaultValueForZero(iter[split_feature_inner_[node]]->Get(i), zero_bin_[node], default_bin_for_zero_[node]);
+        if (fval <= threshold_in_bin_[node]) {
           node = left_child_[node];
         } else {
           node = right_child_[node];
...
@@ -201,7 +214,7 @@ void Tree::AddPredictionToScore(const Dataset* data,
     [this, data, used_data_indices, score](int, data_size_t start, data_size_t end) {
     std::vector<std::unique_ptr<BinIterator>> iter(num_leaves_ - 1);
     for (int i = 0; i < num_leaves_ - 1; ++i) {
-      const int fidx = split_feature_inner[i];
+      const int fidx = split_feature_inner_[i];
       iter[i].reset(data->FeatureIterator(fidx));
       iter[i]->Reset(used_data_indices[start]);
     }
...
@@ -209,8 +222,9 @@ void Tree::AddPredictionToScore(const Dataset* data,
       int node = 0;
       const data_size_t idx = used_data_indices[i];
       while (node >= 0) {
+        uint32_t fval = DefaultValueForZero(iter[node]->Get(idx), zero_bin_[node], default_bin_for_zero_[node]);
-        if (inner_decision_funs[decision_type_[node]](iter[node]->Get(idx), threshold_in_bin_[node])) {
+        if (inner_decision_funs[decision_type_[node]](fval, threshold_in_bin_[node])) {
           node = left_child_[node];
         } else {
...
@@ -232,8 +246,9 @@ void Tree::AddPredictionToScore(const Dataset* data,
       const data_size_t idx = used_data_indices[i];
       int node = 0;
       while (node >= 0) {
+        uint32_t fval = DefaultValueForZero(iter[split_feature_inner_[node]]->Get(idx), zero_bin_[node], default_bin_for_zero_[node]);
-        if (inner_decision_funs[decision_type_[node]](iter[split_feature_inner[node]]->Get(idx), threshold_in_bin_[node])) {
+        if (inner_decision_funs[decision_type_[node]](fval, threshold_in_bin_[node])) {
           node = left_child_[node];
         } else {
...
@@ -250,7 +265,7 @@ void Tree::AddPredictionToScore(const Dataset* data,
     [this, data, used_data_indices, score](int, data_size_t start, data_size_t end) {
     std::vector<std::unique_ptr<BinIterator>> iter(num_leaves_ - 1);
     for (int i = 0; i < num_leaves_ - 1; ++i) {
-      const int fidx = split_feature_inner[i];
+      const int fidx = split_feature_inner_[i];
       iter[i].reset(data->FeatureIterator(fidx));
       iter[i]->Reset(used_data_indices[start]);
     }
...
@@ -258,7 +273,8 @@ void Tree::AddPredictionToScore(const Dataset* data,
       int node = 0;
       const data_size_t idx = used_data_indices[i];
       while (node >= 0) {
-        if (iter[node]->Get(idx) <= threshold_in_bin_[node]) {
+        uint32_t fval = DefaultValueForZero(iter[node]->Get(idx), zero_bin_[node], default_bin_for_zero_[node]);
+        if (fval <= threshold_in_bin_[node]) {
           node = left_child_[node];
         } else {
           node = right_child_[node];
...
@@ -279,7 +295,8 @@ void Tree::AddPredictionToScore(const Dataset* data,
       const data_size_t idx = used_data_indices[i];
       int node = 0;
       while (node >= 0) {
-        if (iter[split_feature_inner[node]]->Get(idx) <= threshold_in_bin_[node]) {
+        uint32_t fval = DefaultValueForZero(iter[split_feature_inner_[node]]->Get(idx), zero_bin_[node], default_bin_for_zero_[node]);
+        if (fval <= threshold_in_bin_[node]) {
           node = left_child_[node];
         } else {
           node = right_child_[node];
...
@@ -303,6 +320,8 @@ std::string Tree::ToString() {
     << Common::ArrayToString<double>(threshold_, num_leaves_ - 1, ' ') << std::endl;
   str_buf << "decision_type=" << Common::ArrayToString<int>(Common::ArrayCast<int8_t, int>(decision_type_), num_leaves_ - 1, ' ') << std::endl;
+  str_buf << "default_value=" << Common::ArrayToString<double>(default_value_, num_leaves_ - 1, ' ') << std::endl;
   str_buf << "left_child=" << Common::ArrayToString<int>(left_child_, num_leaves_ - 1, ' ') << std::endl;
   str_buf << "right_child="
...
@@ -349,6 +368,7 @@ std::string Tree::NodeToJSON(int index) {
   str_buf << "\"split_gain\":" << split_gain_[index] << "," << std::endl;
   str_buf << "\"threshold\":" << threshold_[index] << "," << std::endl;
   str_buf << "\"decision_type\":\"" << Tree::GetDecisionTypeName(decision_type_[index]) << "\"," << std::endl;
+  str_buf << "\"default_value\":" << default_value_[index] << "," << std::endl;
   str_buf << "\"internal_value\":" << internal_value_[index] << "," << std::endl;
   str_buf << "\"internal_count\":" << internal_count_[index] << "," << std::endl;
   str_buf << "\"left_child\":" << NodeToJSON(left_child_[index]) << "," << std::endl;
...
@@ -389,7 +409,11 @@ std::string Tree::NodeToIfElse(int index, bool is_predict_leaf_index) {
   str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2);
   if (index >= 0) {
     // non-leaf
-    str_buf << "if ( arr[" << split_feature_[index] << "] ";
+    std::stringstream tmp_str_buf;
+    tmp_str_buf << "arr[" << split_feature_[index] << "]";
+    std::string str_fval = tmp_str_buf.str();
+    str_buf << "if( ( " << str_fval << " <= " << kMissingValueRange << " && " << str_fval
+      << " > -" << kMissingValueRange << " ? " << default_value_[index] << " : " << str_fval << " ) ";
     if (decision_type_[index] == 0) {
       str_buf << "<";
     } else {
...
@@ -461,6 +485,12 @@ Tree::Tree(const std::string& str) {
     Log::Fatal("Tree model string format error, should contain threshold field");
   }

+  if (key_vals.count("default_value")) {
+    default_value_ = Common::StringToArray<double>(key_vals["default_value"], ' ', num_leaves_ - 1);
+  } else {
+    Log::Fatal("Tree model string format error, should contain default_value field");
+  }
+
   if (key_vals.count("leaf_value")) {
     leaf_value_ = Common::StringToArray<double>(key_vals["leaf_value"], ' ', num_leaves_);
   } else {
...
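`NodeToIfElse` now wraps each feature read in the zero-substitution expression, so the exported standalone code applies the same default-value rule at prediction time. The snippet below shows roughly what an emitted condition looks like, wrapped in a runnable harness; the feature index, default value, and threshold are invented for the example:

```cpp
#include <iostream>

int main() {
  // Sketch of the condition Tree::NodeToIfElse emits after this commit.
  double arr[4] = {0.0, 0.0, 0.0, 0.0};  // arr[3] holds a raw zero
  if ( ( arr[3] <= 1e-20 && arr[3] > -1e-20 ? 2.5 : arr[3] ) < 1.3 ) {
    std::cout << "left child\n";
  } else {
    std::cout << "right child\n";  // taken here: zero maps to 2.5, not < 1.3
  }
  return 0;
}
```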
src/treelearner/data_partition.hpp

@@ -91,7 +91,7 @@ public:
   * \param threshold threshold to split at
   * \param right_leaf index of right leaf
   */
-  void Split(int leaf, const Dataset* dataset, int feature, uint32_t threshold, int right_leaf) {
+  void Split(int leaf, const Dataset* dataset, int feature, uint32_t threshold, uint32_t default_bin_for_zero, int right_leaf, int expected_left_cnt) {
     const data_size_t min_inner_size = 512;
     // get leaf boundary
     const data_size_t begin = leaf_begin_[leaf];
...
@@ -111,7 +111,7 @@ public:
       data_size_t cur_cnt = inner_size;
       if (cur_start + cur_cnt > cnt) { cur_cnt = cnt - cur_start; }
       // split data inner, to reduce the number of function calls
-      data_size_t cur_left_count = dataset->Split(feature, threshold, indices_.data() + begin + cur_start, cur_cnt,
+      data_size_t cur_left_count = dataset->Split(feature, threshold, default_bin_for_zero, indices_.data() + begin + cur_start, cur_cnt,
         temp_left_indices_.data() + cur_start, temp_right_indices_.data() + cur_start);
       offsets_buf_[i] = cur_start;
       left_cnts_buf_[i] = cur_left_count;
...
@@ -141,6 +141,7 @@ public:
     }
     // update leaf boundary
     leaf_count_[leaf] = left_cnt;
+    CHECK(left_cnt == expected_left_cnt);
     leaf_begin_[right_leaf] = left_cnt + begin;
     leaf_count_[right_leaf] = cnt - left_cnt;
   }
...
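`DataPartition::Split` now threads `default_bin_for_zero` down to `Dataset::Split` and asserts that the number of rows actually routed left matches the count the histogram-based split finder predicted, a useful consistency check now that zeros can be re-routed. A toy stable partition with the same shape; the predicate and data are invented:

```cpp
#include <cassert>
#include <iostream>
#include <vector>

int main() {
  // Toy analogue of DataPartition::Split: stably partition row indices by a
  // predicate, then check the left count against an expected value.
  std::vector<int> indices = {0, 1, 2, 3, 4, 5};
  std::vector<int> lte, gt;
  auto goes_left = [](int row) { return row % 2 == 0; };  // invented predicate
  for (int row : indices) (goes_left(row) ? lte : gt).push_back(row);
  int expected_left_cnt = 3;  // what the split finder counted from histograms
  assert(static_cast<int>(lte.size()) == expected_left_cnt);
  std::cout << "left=" << lte.size() << " right=" << gt.size() << "\n";
  return 0;
}
```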
src/treelearner/feature_histogram.hpp

@@ -15,6 +15,7 @@ class FeatureMetainfo {
 public:
   int num_bin;
   int bias = 0;
+  uint32_t default_bin;
   /*! \brief pointer of tree config */
   const TreeConfig* tree_config;
 };
...
@@ -69,81 +70,28 @@ public:
   void FindBestThreshold(double sum_gradient, double sum_hessian, data_size_t num_data, SplitInfo* output) {
+    output->default_bin_for_zero = meta_->default_bin;
+    output->gain = kMinScore;
     find_best_threshold_fun_(sum_gradient, sum_hessian + 2 * kEpsilon, num_data, output);
   }

   void FindBestThresholdNumerical(double sum_gradient, double sum_hessian, data_size_t num_data, SplitInfo* output) {
-    double best_sum_left_gradient = NAN;
-    double best_sum_left_hessian = NAN;
-    double best_gain = kMinScore;
-    data_size_t best_left_count = 0;
-    uint32_t best_threshold = static_cast<uint32_t>(meta_->num_bin);
-    double sum_right_gradient = 0.0f;
-    double sum_right_hessian = kEpsilon;
-    data_size_t right_count = 0;
-    is_splittable_ = false;
     double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian, meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
     double min_gain_shift = gain_shift + meta_->tree_config->min_gain_to_split;
-    const int bias = meta_->bias;
-    int t = meta_->num_bin - 1 - bias;
-    const int t_end = 1 - bias;
-    // from right to left, and we don't need data in bin0
-    for (; t >= t_end; --t) {
-      sum_right_gradient += data_[t].sum_gradients;
-      sum_right_hessian += data_[t].sum_hessians;
-      right_count += data_[t].cnt;
-      // if data not enough, or sum hessian too small
-      if (right_count < meta_->tree_config->min_data_in_leaf
-          || sum_right_hessian < meta_->tree_config->min_sum_hessian_in_leaf) continue;
-      data_size_t left_count = num_data - right_count;
-      // if data not enough
-      if (left_count < meta_->tree_config->min_data_in_leaf) break;
-      double sum_left_hessian = sum_hessian - sum_right_hessian;
-      // if sum hessian too small
-      if (sum_left_hessian < meta_->tree_config->min_sum_hessian_in_leaf) break;
-      double sum_left_gradient = sum_gradient - sum_right_gradient;
-      // current split gain
-      double current_gain = GetLeafSplitGain(sum_left_gradient, sum_left_hessian, meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2)
-        + GetLeafSplitGain(sum_right_gradient, sum_right_hessian, meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
-      // gain with split is worse than without split
-      if (current_gain <= min_gain_shift) continue;
-      // mark as splittable
-      is_splittable_ = true;
-      // better split point
-      if (current_gain > best_gain) {
-        best_left_count = left_count;
-        best_sum_left_gradient = sum_left_gradient;
-        best_sum_left_hessian = sum_left_hessian;
-        // left is <= threshold, right is > threshold, so this is t - 1
-        best_threshold = static_cast<uint32_t>(t - 1 + bias);
-        best_gain = current_gain;
-      }
-    }
-    if (is_splittable_) {
-      // update split information
-      output->threshold = best_threshold;
-      output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian, meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
-      output->left_count = best_left_count;
-      output->left_sum_gradient = best_sum_left_gradient;
-      output->left_sum_hessian = best_sum_left_hessian - kEpsilon;
-      output->right_output = CalculateSplittedLeafOutput(sum_gradient - best_sum_left_gradient, sum_hessian - best_sum_left_hessian, meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
-      output->right_count = num_data - best_left_count;
-      output->right_sum_gradient = sum_gradient - best_sum_left_gradient;
-      output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon;
-      output->gain = best_gain - gain_shift;
-    } else {
-      output->gain = kMinScore;
-    }
+    is_splittable_ = false;
+    FindBestThresholdSequence(sum_gradient, sum_hessian, num_data, min_gain_shift, output, 0);
+    // Zero is not in leftmost or rightmost
+    if (static_cast<int>(meta_->default_bin) > 0 && static_cast<int>(meta_->default_bin) < meta_->num_bin - 1) {
+      FindBestThresholdSequence(sum_gradient, sum_hessian, num_data, min_gain_shift, output, meta_->default_bin);
+    }
+    if (meta_->num_bin > 2) {
+      FindBestThresholdSequence(sum_gradient, sum_hessian, num_data, min_gain_shift, output, meta_->num_bin - 1);
+    }
+    output->gain -= min_gain_shift;
   }

   void FindBestThresholdCategorical(double sum_gradient, double sum_hessian, data_size_t num_data,
...
@@ -242,10 +190,8 @@ public:
       output->right_count = num_data - best_left_count;
       output->right_sum_gradient = sum_gradient - best_sum_left_gradient;
       output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon;
-      output->gain = best_gain - gain_shift;
-    } else {
-      output->gain = kMinScore;
-    }
+      output->gain = best_gain - min_gain_shift;
+    }
   }
   /*!
...
@@ -301,6 +247,142 @@ public:
 private:
+  void FindBestThresholdSequence(double sum_gradient, double sum_hessian, data_size_t num_data,
+                                 double min_gain_shift, SplitInfo* output, uint32_t default_bin_for_zero) {
+    int dir = -1;
+    if (static_cast<int>(default_bin_for_zero) == meta_->num_bin - 1) {
+      dir = 1;
+    };
+    bool skip_default_bin = true;
+    if (static_cast<int>(default_bin_for_zero) > 0 && static_cast<int>(default_bin_for_zero) < meta_->num_bin - 1) {
+      skip_default_bin = false;
+    }
+    const int bias = meta_->bias;
+    double best_sum_left_gradient = NAN;
+    double best_sum_left_hessian = NAN;
+    double best_gain = kMinScore;
+    data_size_t best_left_count = 0;
+    uint32_t best_threshold = static_cast<uint32_t>(meta_->num_bin);
+    if (dir == -1) {
+      double sum_right_gradient = 0.0f;
+      double sum_right_hessian = kEpsilon;
+      data_size_t right_count = 0;
+      int t = meta_->num_bin - 1 - bias;
+      const int t_end = 1 - bias;
+      // from right to left, and we don't need data in bin0
+      for (; t >= t_end; --t) {
+        // need to skip the default bin
+        if (skip_default_bin && (t + bias) == static_cast<int>(meta_->default_bin)) { continue; }
+        sum_right_gradient += data_[t].sum_gradients;
+        sum_right_hessian += data_[t].sum_hessians;
+        right_count += data_[t].cnt;
+        // if data not enough, or sum hessian too small
+        if (right_count < meta_->tree_config->min_data_in_leaf
+            || sum_right_hessian < meta_->tree_config->min_sum_hessian_in_leaf) continue;
+        data_size_t left_count = num_data - right_count;
+        // if data not enough
+        if (left_count < meta_->tree_config->min_data_in_leaf) break;
+        double sum_left_hessian = sum_hessian - sum_right_hessian;
+        // if sum hessian too small
+        if (sum_left_hessian < meta_->tree_config->min_sum_hessian_in_leaf) break;
+        double sum_left_gradient = sum_gradient - sum_right_gradient;
+        // current split gain
+        double current_gain = GetLeafSplitGain(sum_left_gradient, sum_left_hessian, meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2)
+          + GetLeafSplitGain(sum_right_gradient, sum_right_hessian, meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
+        // gain with split is worse than without split
+        if (current_gain <= min_gain_shift) continue;
+        // mark as splittable
+        is_splittable_ = true;
+        // better split point
+        if (current_gain > best_gain) {
+          best_left_count = left_count;
+          best_sum_left_gradient = sum_left_gradient;
+          best_sum_left_hessian = sum_left_hessian;
+          // left is <= threshold, right is > threshold, so this is t - 1
+          best_threshold = static_cast<uint32_t>(t - 1 + bias);
+          best_gain = current_gain;
+        }
+      }
+    } else {
+      double sum_left_gradient = 0.0f;
+      double sum_left_hessian = kEpsilon;
+      data_size_t left_count = 0;
+      int t = 0;
+      const int t_end = meta_->num_bin - 2 - bias;
+      // from left to right
+      for (; t <= t_end; ++t) {
+        // need to skip the default bin
+        if (skip_default_bin && (t + bias) == static_cast<int>(meta_->default_bin)) { continue; }
+        sum_left_gradient += data_[t].sum_gradients;
+        sum_left_hessian += data_[t].sum_hessians;
+        left_count += data_[t].cnt;
+        // if data not enough, or sum hessian too small
+        if (left_count < meta_->tree_config->min_data_in_leaf
+            || sum_left_hessian < meta_->tree_config->min_sum_hessian_in_leaf) continue;
+        data_size_t right_count = num_data - left_count;
+        // if data not enough
+        if (right_count < meta_->tree_config->min_data_in_leaf) break;
+        double sum_right_hessian = sum_hessian - sum_left_hessian;
+        // if sum hessian too small
+        if (sum_right_hessian < meta_->tree_config->min_sum_hessian_in_leaf) break;
+        double sum_right_gradient = sum_gradient - sum_left_gradient;
+        // current split gain
+        double current_gain = GetLeafSplitGain(sum_left_gradient, sum_left_hessian, meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2)
+          + GetLeafSplitGain(sum_right_gradient, sum_right_hessian, meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
+        // gain with split is worse than without split
+        if (current_gain <= min_gain_shift) continue;
+        // mark as splittable
+        is_splittable_ = true;
+        // better split point
+        if (current_gain > best_gain) {
+          best_left_count = left_count;
+          best_sum_left_gradient = sum_left_gradient;
+          best_sum_left_hessian = sum_left_hessian;
+          best_threshold = static_cast<uint32_t>(t + bias);
+          best_gain = current_gain;
+        }
+      }
+    }
+    if (is_splittable_ && best_gain > output->gain) {
+      // update split information
+      output->threshold = best_threshold;
+      output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian, meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
+      output->left_count = best_left_count;
+      output->left_sum_gradient = best_sum_left_gradient;
+      output->left_sum_hessian = best_sum_left_hessian - kEpsilon;
+      output->right_output = CalculateSplittedLeafOutput(sum_gradient - best_sum_left_gradient, sum_hessian - best_sum_left_hessian, meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
+      output->right_count = num_data - best_left_count;
+      output->right_sum_gradient = sum_gradient - best_sum_left_gradient;
+      output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon;
+      output->gain = best_gain;
+      output->default_bin_for_zero = default_bin_for_zero;
+    }
+  }
+
   const FeatureMetainfo* meta_;
   /*! \brief sum of gradient of each bin */
   HistogramBinEntry* data_;
...
@@ -364,6 +446,7 @@ public:
 #pragma omp parallel for schedule(static, 512) if(num_feature >= 1024)
   for (int i = 0; i < num_feature; ++i) {
     feature_metas_[i].num_bin = train_data->FeatureNumBin(i);
+    feature_metas_[i].default_bin = train_data->FeatureBinMapper(i)->GetDefaultBin();
     if (train_data->FeatureBinMapper(i)->GetDefaultBin() == 0) {
       feature_metas_[i].bias = 1;
     } else {
...
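`FindBestThresholdNumerical` now runs `FindBestThresholdSequence` up to three times, once per candidate placement of the zero bin: on the far left (`default_bin_for_zero = 0`), at the feature's own default bin when that bin is interior, and on the far right (`num_bin - 1`); the best pass wins and records its placement in the `SplitInfo`. Below is a schematic driver of that control flow, not the library's code: the stand-in function and the pretend gains are invented to show how the best placement is kept.

```cpp
#include <cstdint>
#include <iostream>

struct SplitInfo { double gain; uint32_t threshold; uint32_t default_bin_for_zero; };

// Hypothetical stand-in for FeatureHistogram::FindBestThresholdSequence:
// pretends each zero-bin placement yields some gain and keeps the best.
void FindBestThresholdSequence(SplitInfo* out, uint32_t default_bin_for_zero,
                               double pretend_gain) {
  if (pretend_gain > out->gain) {
    out->gain = pretend_gain;
    out->default_bin_for_zero = default_bin_for_zero;
  }
}

int main() {
  const int num_bin = 16;
  const int default_bin = 5;  // interior zero bin for this toy feature
  SplitInfo out{-1e308, 0, 0};
  FindBestThresholdSequence(&out, 0, 0.7);              // zeros to far left
  if (default_bin > 0 && default_bin < num_bin - 1)
    FindBestThresholdSequence(&out, default_bin, 0.9);  // zeros stay put
  if (num_bin > 2)
    FindBestThresholdSequence(&out, num_bin - 1, 0.4);  // zeros to far right
  std::cout << "best placement: bin " << out.default_bin_for_zero
            << " (gain " << out.gain << ")\n";  // bin 5 wins here
  return 0;
}
```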
src/treelearner/serial_tree_learner.cpp

@@ -543,6 +543,10 @@ void SerialTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* ri
   const int inner_feature_index = train_data_->InnerFeatureIndex(best_split_info.feature);
   // left = parent
   *left_leaf = best_Leaf;
+  double default_value = 0.0f;
+  if (train_data_->FeatureBinMapper(inner_feature_index)->GetDefaultBin() != best_split_info.default_bin_for_zero) {
+    default_value = train_data_->RealThreshold(inner_feature_index, best_split_info.default_bin_for_zero);
+  }
   // split tree, will return right leaf
   *right_leaf = tree->Split(best_Leaf,
                             inner_feature_index,
...
@@ -554,10 +558,13 @@ void SerialTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* ri
                             static_cast<double>(best_split_info.right_output),
                             static_cast<data_size_t>(best_split_info.left_count),
                             static_cast<data_size_t>(best_split_info.right_count),
-                            static_cast<double>(best_split_info.gain));
+                            static_cast<double>(best_split_info.gain),
+                            train_data_->FeatureBinMapper(inner_feature_index)->GetDefaultBin(),
+                            best_split_info.default_bin_for_zero,
+                            default_value);
   // split data partition
-  data_partition_->Split(best_Leaf, train_data_, inner_feature_index, best_split_info.threshold, *right_leaf);
+  data_partition_->Split(best_Leaf, train_data_, inner_feature_index,
+                         best_split_info.threshold, best_split_info.default_bin_for_zero,
+                         *right_leaf, best_split_info.left_count);
   // init the leaves that used on next iteration
   if (best_split_info.left_count < best_split_info.right_count) {
...
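When the chosen `default_bin_for_zero` differs from the feature's actual zero bin, the learner converts it back into value space via `train_data_->RealThreshold` so the tree can store a `double` default_value for raw (unbinned) prediction; otherwise `default_value` stays 0.0 and zeros route naturally. A sketch of that decision; the `RealThreshold` stand-in here is invented and simply returns the bin's upper bound:

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// Stand-in for Dataset::RealThreshold: maps a bin index back to a raw value.
// Invented for illustration; the real method consults the BinMapper.
double RealThreshold(const std::vector<double>& upper_bounds, uint32_t bin) {
  return upper_bounds[bin];
}

int main() {
  std::vector<double> upper_bounds = {-1e-20, 1e-20, 4.0, 1e308};
  uint32_t zero_bin = 1;              // where value==0 actually lives
  uint32_t default_bin_for_zero = 2;  // where the split finder sent zeros
  double default_value = 0.0;
  if (zero_bin != default_bin_for_zero) {
    default_value = RealThreshold(upper_bounds, default_bin_for_zero);
  }
  std::cout << "default_value = " << default_value << "\n";  // 4.0
  return 0;
}
```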
src/treelearner/split_info.hpp

@@ -19,7 +19,9 @@ public:
   /*! \brief Feature index */
   int feature;
   /*! \brief Split threshold */
-  unsigned int threshold;
+  uint32_t threshold;
+  uint32_t default_bin_for_zero;
   /*! \brief Left output after split */
   double left_output;
   /*! \brief Right output after split */
...
src/treelearner/voting_parallel_tree_learner.cpp

@@ -67,6 +67,7 @@ void VotingParallelTreeLearner<TREELEARNER_T>::Init(const Dataset* train_data, b
 #pragma omp parallel for schedule(static)
   for (int i = 0; i < train_data->num_features(); ++i) {
     feature_metas_[i].num_bin = train_data->FeatureNumBin(i);
+    feature_metas_[i].default_bin = train_data->FeatureBinMapper(i)->GetDefaultBin();
     if (train_data->FeatureBinMapper(i)->GetDefaultBin() == 0) {
       feature_metas_[i].bias = 1;
     } else {
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment