Unverified Commit 509c2e50 authored by Guolin Ke, committed by GitHub

Support both row-wise and col-wise multi-threading (#2699)



* commit

* fix a bug

* fix bug

* reset to track changes

* refine the auto choose logic

* sort the time stats output

* fix include

* change  multi_val_bin_sparse_threshold

* add cmake

* add _mm_malloc and _mm_free for cross platform

* fix cmake bug

* timer for split

* try to fix cmake

* fix tests

* refactor DataPartition::Split

* fix test

* typo

* formating

* Revert "formating"

This reverts commit 5b8de4f7fb9d975ee23701d276a66d40ee6d4222.

* add document

* [R-package] Added tests on use of force_col_wise and force_row_wise in training (#2719)

* naming

* fix gpu code

* Update include/LightGBM/bin.h
Co-Authored-By: James Lamb <jaylamb20@gmail.com>

* Update src/treelearner/ocl/histogram16.cl

* test: swap compilers for CI

* fix omp

* not avx2

* no aligned for feature histogram

* Revert "refactor DataPartition::Split"

This reverts commit 256e6d9641ade966a1f54da1752e998a1149b6f8.

* slightly refactor data partition

* reduce the memory cost
Co-authored-by: James Lamb <jaylamb20@gmail.com>
Co-authored-by: Nikita Titov <nekit94-08@mail.ru>
parent bc7bc4a1
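
The headline change of this commit: histogram construction can now run either column-wise (per feature group) or row-wise (over a shared multi-val bin), and by default LightGBM times both on the first iteration and keeps the faster path. The new `force_col_wise` / `force_row_wise` parameters override that choice. A minimal sketch of pinning the strategy through the C API — the file name and objective are placeholders, and error handling is trimmed:

#include <LightGBM/c_api.h>
#include <cstdio>

int main() {
  DatasetHandle train = nullptr;
  // Pin the histogram strategy up front instead of letting the
  // built-in timing test decide.
  const char* params = "objective=binary force_row_wise=true";
  if (LGBM_DatasetCreateFromFile("train.csv", params, nullptr, &train) != 0) {
    std::printf("failed to load dataset\n");
    return 1;
  }
  BoosterHandle booster = nullptr;
  LGBM_BoosterCreate(train, params, &booster);
  int is_finished = 0;
  for (int iter = 0; iter < 10 && !is_finished; ++iter) {
    LGBM_BoosterUpdateOneIter(booster, &is_finished);
  }
  LGBM_BoosterFree(booster);
  LGBM_DatasetFree(train);
  return 0;
}

Setting both flags to true is rejected (see `TestMultiThreadingMethod` further down); setting neither triggers the automatic benchmark.
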
......@@ -1065,7 +1065,7 @@ int LGBM_DatasetAddFeaturesFrom(DatasetHandle target,
API_BEGIN();
auto target_d = reinterpret_cast<Dataset*>(target);
auto source_d = reinterpret_cast<Dataset*>(source);
target_d->addFeaturesFrom(source_d);
target_d->AddFeaturesFrom(source_d);
API_END();
}
......
......@@ -15,7 +15,8 @@
#include "dense_bin.hpp"
#include "dense_nbits_bin.hpp"
#include "ordered_sparse_bin.hpp"
#include "multi_val_dense_bin.hpp"
#include "multi_val_sparse_bin.hpp"
#include "sparse_bin.hpp"
namespace LightGBM {
......@@ -636,21 +637,10 @@ namespace LightGBM {
template class SparseBin<uint16_t>;
template class SparseBin<uint32_t>;
template class OrderedSparseBin<uint8_t>;
template class OrderedSparseBin<uint16_t>;
template class OrderedSparseBin<uint32_t>;
template class MultiValDenseBin<uint8_t>;
template class MultiValDenseBin<uint16_t>;
template class MultiValDenseBin<uint32_t>;
Bin* Bin::CreateBin(data_size_t num_data, int num_bin, double sparse_rate,
bool is_enable_sparse, double sparse_threshold, bool* is_sparse) {
// sparse threshold
if (sparse_rate >= sparse_threshold && is_enable_sparse) {
*is_sparse = true;
return CreateSparseBin(num_data, num_bin);
} else {
*is_sparse = false;
return CreateDenseBin(num_data, num_bin);
}
}
Bin* Bin::CreateDenseBin(data_size_t num_data, int num_bin) {
if (num_bin <= 16) {
......@@ -674,4 +664,25 @@ namespace LightGBM {
}
}
MultiValBin* MultiValBin::CreateMultiValBin(data_size_t num_data, int num_bin, int num_feature, double sparse_rate) {
const double multi_val_bin_sparse_threshold = 0.25f;
if (sparse_rate >= multi_val_bin_sparse_threshold) {
if (num_bin <= 256) {
return new MultiValSparseBin<uint8_t>(num_data, num_bin);
} else if (num_bin <= 65536) {
return new MultiValSparseBin<uint16_t>(num_data, num_bin);
} else {
return new MultiValSparseBin<uint32_t>(num_data, num_bin);
}
} else {
if (num_bin <= 256) {
return new MultiValDenseBin<uint8_t>(num_data, num_bin, num_feature);
} else if (num_bin <= 65536) {
return new MultiValDenseBin<uint16_t>(num_data, num_bin, num_feature);
} else {
return new MultiValDenseBin<uint32_t>(num_data, num_bin, num_feature);
}
}
}
} // namespace LightGBM
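
For reference, the factory above picks the narrowest storage type that can hold every bin index — 8-bit up to 256 bins, 16-bit up to 65536, 32-bit beyond — and switches to the sparse layout once the sparse rate reaches 0.25. A toy standalone version of the width dispatch (the `Store` types are illustrative, not LightGBM's):

#include <cstdint>
#include <cstdio>
#include <memory>

struct Store {
  virtual ~Store() = default;
  virtual int width() const = 0;
};
template <typename T>
struct TypedStore : Store {
  int width() const override { return 8 * static_cast<int>(sizeof(T)); }
};

// Same three-way dispatch as CreateMultiValBin above: bin indices are
// 0-based, so 256 bins still fit in a single byte.
std::unique_ptr<Store> MakeStore(int num_bin) {
  if (num_bin <= 256) return std::make_unique<TypedStore<uint8_t>>();
  if (num_bin <= 65536) return std::make_unique<TypedStore<uint16_t>>();
  return std::make_unique<TypedStore<uint32_t>>();
}

int main() {
  std::printf("%d %d %d\n", MakeStore(255)->width(),
              MakeStore(1024)->width(), MakeStore(70000)->width());  // 8 16 32
}
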
......@@ -312,6 +312,11 @@ void Config::CheckParamConflict() {
num_leaves = static_cast<int>(full_num_leaves);
}
}
// force col-wise for gpu
if (device_type == std::string("gpu")) {
force_col_wise = true;
force_row_wise = false;
}
}
std::string Config::ToString() const {
......
......@@ -116,9 +116,6 @@ std::unordered_map<std::string, std::string> Config::alias_table({
{"is_pre_partition", "pre_partition"},
{"is_enable_bundle", "enable_bundle"},
{"bundle", "enable_bundle"},
{"is_sparse", "is_enable_sparse"},
{"enable_sparse", "is_enable_sparse"},
{"sparse", "is_enable_sparse"},
{"two_round_loading", "two_round"},
{"use_two_round_loading", "two_round"},
{"is_save_binary", "save_binary"},
......@@ -181,6 +178,8 @@ std::unordered_set<std::string> Config::parameter_set({
"num_threads",
"device_type",
"seed",
"force_col_wise",
"force_row_wise",
"max_depth",
"min_data_in_leaf",
"min_sum_hessian_in_leaf",
......@@ -236,9 +235,6 @@ std::unordered_set<std::string> Config::parameter_set({
"valid_data_initscores",
"pre_partition",
"enable_bundle",
"max_conflict_rate",
"is_enable_sparse",
"sparse_threshold",
"use_missing",
"zero_as_missing",
"two_round",
......@@ -309,6 +305,10 @@ void Config::GetMembersFromString(const std::unordered_map<std::string, std::str
GetInt(params, "num_threads", &num_threads);
GetBool(params, "force_col_wise", &force_col_wise);
GetBool(params, "force_row_wise", &force_row_wise);
GetInt(params, "max_depth", &max_depth);
GetInt(params, "min_data_in_leaf", &min_data_in_leaf);
......@@ -467,16 +467,6 @@ void Config::GetMembersFromString(const std::unordered_map<std::string, std::str
GetBool(params, "enable_bundle", &enable_bundle);
GetDouble(params, "max_conflict_rate", &max_conflict_rate);
CHECK(max_conflict_rate >=0.0);
CHECK(max_conflict_rate <1.0);
GetBool(params, "is_enable_sparse", &is_enable_sparse);
GetDouble(params, "sparse_threshold", &sparse_threshold);
CHECK(sparse_threshold >0.0);
CHECK(sparse_threshold <=1.0);
GetBool(params, "use_missing", &use_missing);
GetBool(params, "zero_as_missing", &zero_as_missing);
......@@ -600,6 +590,8 @@ std::string Config::SaveMembersToString() const {
str_buf << "[learning_rate: " << learning_rate << "]\n";
str_buf << "[num_leaves: " << num_leaves << "]\n";
str_buf << "[num_threads: " << num_threads << "]\n";
str_buf << "[force_col_wise: " << force_col_wise << "]\n";
str_buf << "[force_row_wise: " << force_row_wise << "]\n";
str_buf << "[max_depth: " << max_depth << "]\n";
str_buf << "[min_data_in_leaf: " << min_data_in_leaf << "]\n";
str_buf << "[min_sum_hessian_in_leaf: " << min_sum_hessian_in_leaf << "]\n";
......@@ -655,9 +647,6 @@ std::string Config::SaveMembersToString() const {
str_buf << "[valid_data_initscores: " << Common::Join(valid_data_initscores, ",") << "]\n";
str_buf << "[pre_partition: " << pre_partition << "]\n";
str_buf << "[enable_bundle: " << enable_bundle << "]\n";
str_buf << "[max_conflict_rate: " << max_conflict_rate << "]\n";
str_buf << "[is_enable_sparse: " << is_enable_sparse << "]\n";
str_buf << "[sparse_threshold: " << sparse_threshold << "]\n";
str_buf << "[use_missing: " << use_missing << "]\n";
str_buf << "[zero_as_missing: " << zero_as_missing << "]\n";
str_buf << "[two_round: " << two_round << "]\n";
......
......@@ -36,6 +36,7 @@ Dataset::Dataset(data_size_t num_data) {
}
Dataset::~Dataset() {
}
std::vector<std::vector<int>> NoGroup(
......@@ -48,19 +49,20 @@ std::vector<std::vector<int>> NoGroup(
return features_in_group;
}
int GetConfilctCount(const std::vector<bool>& mark, const int* indices, int num_indices, int max_cnt) {
int GetConfilctCount(const std::vector<bool>& mark, const int* indices, int num_indices, data_size_t max_cnt) {
int ret = 0;
for (int i = 0; i < num_indices; ++i) {
if (mark[indices[i]]) {
++ret;
if (ret > max_cnt) {
return -1;
}
}
if (ret > max_cnt) {
return -1;
}
}
return ret;
}
void MarkUsed(std::vector<bool>* mark, const int* indices, int num_indices) {
void MarkUsed(std::vector<bool>* mark, const int* indices, data_size_t num_indices) {
auto& ref_mark = *mark;
for (int i = 0; i < num_indices; ++i) {
ref_mark[indices[i]] = true;
......@@ -93,29 +95,31 @@ std::vector<std::vector<int>> FindGroups(const std::vector<std::unique_ptr<BinMa
int** sample_indices,
const int* num_per_col,
int num_sample_col,
size_t total_sample_cnt,
data_size_t max_error_cnt,
data_size_t filter_cnt,
data_size_t total_sample_cnt,
data_size_t num_data,
bool is_use_gpu) {
bool is_use_gpu,
std::vector<int8_t>* multi_val_group) {
const int max_search_group = 100;
const int gpu_max_bin_per_group = 256;
const int max_bin_per_group = 256;
const data_size_t single_val_max_conflict_cnt = static_cast<data_size_t>(total_sample_cnt / 10000);
multi_val_group->clear();
Random rand(num_data);
std::vector<std::vector<int>> features_in_group;
std::vector<std::vector<bool>> conflict_marks;
std::vector<int> group_conflict_cnt;
std::vector<size_t> group_non_zero_cnt;
std::vector<data_size_t> group_used_row_cnt;
std::vector<data_size_t> group_total_data_cnt;
std::vector<int> group_num_bin;
// first round: fill the single val group
for (auto fidx : find_order) {
bool is_filtered_feature = fidx >= num_sample_col;
const size_t cur_non_zero_cnt = is_filtered_feature ? 0: num_per_col[fidx];
bool need_new_group = true;
const data_size_t cur_non_zero_cnt = is_filtered_feature ? 0 : num_per_col[fidx];
std::vector<int> available_groups;
for (int gid = 0; gid < static_cast<int>(features_in_group.size()); ++gid) {
if (group_non_zero_cnt[gid] + cur_non_zero_cnt <= total_sample_cnt + max_error_cnt) {
if (!is_use_gpu || group_num_bin[gid] + bin_mappers[fidx]->num_bin() + (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0)
<= gpu_max_bin_per_group) {
auto cur_num_bin = group_num_bin[gid] + bin_mappers[fidx]->num_bin() + (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0);
if (group_total_data_cnt[gid] + cur_non_zero_cnt <= total_sample_cnt + single_val_max_conflict_cnt) {
if (!is_use_gpu || cur_num_bin <= max_bin_per_group) {
available_groups.push_back(gid);
}
}
......@@ -124,44 +128,82 @@ std::vector<std::vector<int>> FindGroups(const std::vector<std::unique_ptr<BinMa
if (!available_groups.empty()) {
int last = static_cast<int>(available_groups.size()) - 1;
auto indices = rand.Sample(last, std::min(last, max_search_group - 1));
// always push the last group
search_groups.push_back(available_groups.back());
for (auto idx : indices) {
search_groups.push_back(available_groups[idx]);
}
}
int best_gid = -1;
int best_conflict_cnt = -1;
for (auto gid : search_groups) {
const int rest_max_cnt = max_error_cnt - group_conflict_cnt[gid];
const int cnt = is_filtered_feature ? 0 : GetConfilctCount(conflict_marks[gid], sample_indices[fidx], num_per_col[fidx], rest_max_cnt);
if (cnt >= 0 && cnt <= rest_max_cnt) {
data_size_t rest_non_zero_data = static_cast<data_size_t>(
static_cast<double>(cur_non_zero_cnt - cnt) * num_data / total_sample_cnt);
if (rest_non_zero_data < filter_cnt) { continue; }
need_new_group = false;
features_in_group[gid].push_back(fidx);
group_conflict_cnt[gid] += cnt;
group_non_zero_cnt[gid] += cur_non_zero_cnt - cnt;
if (!is_filtered_feature) {
MarkUsed(&conflict_marks[gid], sample_indices[fidx], num_per_col[fidx]);
}
if (is_use_gpu) {
group_num_bin[gid] += bin_mappers[fidx]->num_bin() + (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0);
}
const data_size_t rest_max_cnt = single_val_max_conflict_cnt - group_total_data_cnt[gid] + group_used_row_cnt[gid];
const data_size_t cnt = is_filtered_feature ? 0 : GetConfilctCount(conflict_marks[gid], sample_indices[fidx], num_per_col[fidx], rest_max_cnt);
if (cnt >= 0 && cnt <= rest_max_cnt && cnt <= cur_non_zero_cnt / 2) {
best_gid = gid;
best_conflict_cnt = cnt;
break;
}
}
if (need_new_group) {
if (best_gid >= 0) {
features_in_group[best_gid].push_back(fidx);
group_total_data_cnt[best_gid] += cur_non_zero_cnt;
group_used_row_cnt[best_gid] += cur_non_zero_cnt - best_conflict_cnt;
if (!is_filtered_feature) {
MarkUsed(&conflict_marks[best_gid], sample_indices[fidx], num_per_col[fidx]);
}
group_num_bin[best_gid] += bin_mappers[fidx]->num_bin() + (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0);
} else {
features_in_group.emplace_back();
features_in_group.back().push_back(fidx);
group_conflict_cnt.push_back(0);
conflict_marks.emplace_back(total_sample_cnt, false);
if (!is_filtered_feature) {
MarkUsed(&(conflict_marks.back()), sample_indices[fidx], num_per_col[fidx]);
}
group_non_zero_cnt.emplace_back(cur_non_zero_cnt);
if (is_use_gpu) {
group_num_bin.push_back(1 + bin_mappers[fidx]->num_bin() + (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0));
group_total_data_cnt.emplace_back(cur_non_zero_cnt);
group_used_row_cnt.emplace_back(cur_non_zero_cnt);
group_num_bin.push_back(1 + bin_mappers[fidx]->num_bin() + (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0));
}
}
std::vector<int> second_round_features;
std::vector<std::vector<int>> features_in_group2;
std::vector<std::vector<bool>> conflict_marks2;
const double dense_threshold = 0.4;
for (int gid = 0; gid < static_cast<int>(features_in_group.size()); ++gid) {
const double dense_rate = static_cast<double>(group_used_row_cnt[gid]) / total_sample_cnt;
if (dense_rate >= dense_threshold) {
features_in_group2.push_back(std::move(features_in_group[gid]));
conflict_marks2.push_back(std::move(conflict_marks[gid]));
} else {
for (auto fidx : features_in_group[gid]) {
second_round_features.push_back(fidx);
}
}
}
features_in_group = features_in_group2;
conflict_marks = conflict_marks2;
multi_val_group->resize(features_in_group.size(), false);
if (!second_round_features.empty()) {
features_in_group.emplace_back();
conflict_marks.emplace_back(total_sample_cnt, false);
bool is_multi_val = is_use_gpu ? true : false;
int conflict_cnt = 0;
for (auto fidx : second_round_features) {
features_in_group.back().push_back(fidx);
if (!is_multi_val) {
const int rest_max_cnt = single_val_max_conflict_cnt - conflict_cnt;
const auto cnt = GetConfilctCount(conflict_marks.back(), sample_indices[fidx], num_per_col[fidx], rest_max_cnt);
conflict_cnt += cnt;
if (cnt < 0 || conflict_cnt > single_val_max_conflict_cnt) {
is_multi_val = true;
continue;
}
MarkUsed(&(conflict_marks.back()), sample_indices[fidx], num_per_col[fidx]);
}
}
multi_val_group->push_back(is_multi_val);
}
return features_in_group;
}
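
FindGroups is a greedy pass: for each feature it looks for an existing group whose non-zero rows rarely collide with the feature's own, and opens a new group otherwise; a second round then merges the groups that stayed too sparse into one multi-val group. A self-contained toy of the first-round conflict test — the data and conflict budget below are made up:

#include <cstdio>
#include <vector>

// Count rows where the feature is non-zero AND the group already has a
// non-zero value there; bail out with -1 once the budget is exceeded.
int ConflictCount(const std::vector<bool>& mark,
                  const std::vector<int>& rows, int max_cnt) {
  int cnt = 0;
  for (int r : rows) {
    if (mark[r] && ++cnt > max_cnt) return -1;
  }
  return cnt;
}

int main() {
  const int num_rows = 8;
  // Non-zero row indices of three sparse features.
  const std::vector<std::vector<int>> features = {
      {0, 1, 2}, {4, 5, 6}, {1, 2, 3}};
  const int max_conflict = 1;            // tolerated overlaps per group
  std::vector<std::vector<int>> groups;  // feature ids per group
  std::vector<std::vector<bool>> marks;  // per-group non-zero row marks
  for (int fid = 0; fid < static_cast<int>(features.size()); ++fid) {
    int best = -1;
    for (int gid = 0; gid < static_cast<int>(groups.size()); ++gid) {
      if (ConflictCount(marks[gid], features[fid], max_conflict) >= 0) {
        best = gid;
        break;
      }
    }
    if (best < 0) {  // no compatible group: open a new one
      groups.emplace_back();
      marks.emplace_back(num_rows, false);
      best = static_cast<int>(groups.size()) - 1;
    }
    groups[best].push_back(fid);
    for (int r : features[fid]) marks[best][r] = true;
  }
  for (size_t g = 0; g < groups.size(); ++g) {
    std::printf("group %zu:", g);
    for (int fid : groups[g]) std::printf(" f%d", fid);
    std::printf("\n");  // group 0: f0 f1 / group 1: f2
  }
}
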
......@@ -171,17 +213,12 @@ std::vector<std::vector<int>> FastFeatureBundling(const std::vector<std::unique_
double** sample_values,
const int* num_per_col,
int num_sample_col,
size_t total_sample_cnt,
data_size_t total_sample_cnt,
const std::vector<int>& used_features,
double max_conflict_rate,
data_size_t num_data,
data_size_t min_data,
double sparse_threshold,
bool is_enable_sparse,
bool is_use_gpu) {
// filter is based on sampling data, so decrease its range
const data_size_t filter_cnt = static_cast<data_size_t>(static_cast<double>(0.95 * min_data) / num_data * total_sample_cnt);
const data_size_t max_error_cnt = static_cast<data_size_t>(total_sample_cnt * max_conflict_rate);
bool is_use_gpu,
std::vector<int8_t>* multi_val_group) {
Common::FunctionTimer fun_timer("Dataset::FastFeatureBundling", global_timer);
std::vector<size_t> feature_non_zero_cnt;
feature_non_zero_cnt.reserve(used_features.size());
// put dense feature first
......@@ -209,6 +246,7 @@ std::vector<std::vector<int>> FastFeatureBundling(const std::vector<std::unique_
for (auto sidx : sorted_idx) {
feature_order_by_cnt.push_back(used_features[sidx]);
}
std::vector<std::vector<int>> tmp_indices;
std::vector<int> tmp_num_per_col(num_sample_col, 0);
for (auto fidx : used_features) {
......@@ -224,42 +262,25 @@ std::vector<std::vector<int>> FastFeatureBundling(const std::vector<std::unique_
tmp_num_per_col[fidx] = num_per_col[fidx];
}
}
auto features_in_group = FindGroups(bin_mappers, used_features, sample_indices, tmp_num_per_col.data(), num_sample_col, total_sample_cnt, max_error_cnt, filter_cnt, num_data, is_use_gpu);
auto group2 = FindGroups(bin_mappers, feature_order_by_cnt, sample_indices, tmp_num_per_col.data(), num_sample_col, total_sample_cnt, max_error_cnt, filter_cnt, num_data, is_use_gpu);
std::vector<int8_t> group_is_multi_val, group_is_multi_val2;
auto features_in_group = FindGroups(bin_mappers, used_features, sample_indices, tmp_num_per_col.data(), num_sample_col, total_sample_cnt, num_data, is_use_gpu, &group_is_multi_val);
auto group2 = FindGroups(bin_mappers, feature_order_by_cnt, sample_indices, tmp_num_per_col.data(), num_sample_col, total_sample_cnt, num_data, is_use_gpu, &group_is_multi_val2);
if (features_in_group.size() > group2.size()) {
features_in_group = group2;
}
std::vector<std::vector<int>> ret;
for (size_t i = 0; i < features_in_group.size(); ++i) {
if (features_in_group[i].size() <= 1 || features_in_group[i].size() >= 5) {
ret.push_back(features_in_group[i]);
} else {
int cnt_non_zero = 0;
for (size_t j = 0; j < features_in_group[i].size(); ++j) {
const int fidx = features_in_group[i][j];
cnt_non_zero += static_cast<int>(num_data * (1.0f - bin_mappers[fidx]->sparse_rate()));
}
double sparse_rate = 1.0f - static_cast<double>(cnt_non_zero) / (num_data);
// take apart small sparse group, due it will not gain on speed
if (sparse_rate >= sparse_threshold && is_enable_sparse) {
for (size_t j = 0; j < features_in_group[i].size(); ++j) {
const int fidx = features_in_group[i][j];
ret.emplace_back();
ret.back().push_back(fidx);
}
} else {
ret.push_back(features_in_group[i]);
}
}
group_is_multi_val = group_is_multi_val2;
}
// shuffle groups
int num_group = static_cast<int>(ret.size());
Random tmp_rand(12);
int num_group = static_cast<int>(features_in_group.size());
Random tmp_rand(num_data);
for (int i = 0; i < num_group - 1; ++i) {
int j = tmp_rand.NextShort(i + 1, num_group);
std::swap(ret[i], ret[j]);
std::swap(features_in_group[i], features_in_group[j]);
// Note: std::swap on vector<bool> elements gives the wrong result
// (its elements are bit proxies), which is why int8_t is used here.
std::swap(group_is_multi_val[i], group_is_multi_val[j]);
}
return ret;
*multi_val_group = group_is_multi_val;
return features_in_group;
}
void Dataset::Construct(
......@@ -274,7 +295,6 @@ void Dataset::Construct(
const Config& io_config) {
num_total_features_ = num_total_features;
CHECK(num_total_features_ == static_cast<int>(bin_mappers->size()));
sparse_threshold_ = io_config.sparse_threshold;
// get num_features
std::vector<int> used_features;
auto& ref_bin_mappers = *bin_mappers;
......@@ -287,13 +307,11 @@ void Dataset::Construct(
Log::Warning("There are no meaningful features, as all feature values are constant.");
}
auto features_in_group = NoGroup(used_features);
std::vector<int8_t> group_is_multi_val(used_features.size(), 0);
if (io_config.enable_bundle && !used_features.empty()) {
features_in_group = FastFeatureBundling(*bin_mappers,
sample_non_zero_indices, sample_values, num_per_col, num_sample_col, total_sample_cnt,
used_features, io_config.max_conflict_rate,
num_data_, io_config.min_data_in_leaf,
sparse_threshold_, io_config.is_enable_sparse, io_config.device_type == std::string("gpu"));
sample_non_zero_indices, sample_values, num_per_col, num_sample_col, static_cast<data_size_t>(total_sample_cnt),
used_features, num_data_, io_config.device_type == std::string("gpu"), &group_is_multi_val);
}
num_features_ = 0;
......@@ -306,10 +324,14 @@ void Dataset::Construct(
real_feature_idx_.resize(num_features_);
feature2group_.resize(num_features_);
feature2subfeature_.resize(num_features_);
int num_multi_val_group = 0;
feature_need_push_zeros_.clear();
for (int i = 0; i < num_groups_; ++i) {
auto cur_features = features_in_group[i];
int cur_cnt_features = static_cast<int>(cur_features.size());
if (group_is_multi_val[i]) {
++num_multi_val_group;
}
// get bin_mappers
std::vector<std::unique_ptr<BinMapper>> cur_bin_mappers;
for (int j = 0; j < cur_cnt_features; ++j) {
......@@ -325,8 +347,7 @@ void Dataset::Construct(
++cur_fidx;
}
feature_groups_.emplace_back(std::unique_ptr<FeatureGroup>(
new FeatureGroup(cur_cnt_features, &cur_bin_mappers, num_data_, sparse_threshold_,
io_config.is_enable_sparse)));
new FeatureGroup(cur_cnt_features, group_is_multi_val[i], &cur_bin_mappers, num_data_)));
}
feature_groups_.shrink_to_fit();
group_bin_boundaries_.clear();
......@@ -414,9 +435,6 @@ void Dataset::ResetConfig(const char* parameters) {
if (param.count("zero_as_missing") && io_config.zero_as_missing != zero_as_missing_) {
Log::Warning("Cannot change zero_as_missing after constructed Dataset handle.");
}
if (param.count("sparse_threshold") && io_config.sparse_threshold != sparse_threshold_) {
Log::Warning("Cannot change sparse_threshold after constructed Dataset handle.");
}
if (param.count("forcedbins_filename")) {
Log::Warning("Cannot change forced bins after constructed Dataset handle.");
}
......@@ -452,23 +470,229 @@ void Dataset::ResetConfig(const char* parameters) {
void Dataset::FinishLoad() {
if (is_finish_load_) { return; }
if (num_groups_ > 0) {
OMP_INIT_EX();
#pragma omp parallel for schedule(guided)
for (int i = 0; i < num_groups_; ++i) {
OMP_LOOP_EX_BEGIN();
feature_groups_[i]->bin_data_->FinishLoad();
OMP_LOOP_EX_END();
feature_groups_[i]->FinishLoad();
}
OMP_THROW_EX();
}
is_finish_load_ = true;
}
void PushDataToMultiValBin(int num_threads, data_size_t num_data, const std::vector<uint32_t> most_freq_bins,
const std::vector<uint32_t> offsets, std::vector<std::vector<std::unique_ptr<BinIterator>>>& iters, MultiValBin* ret) {
Common::FunctionTimer fun_time("Dataset::PushDataToMultiValBin", global_timer);
const data_size_t min_block_size = 4096;
const int n_block = std::min(num_threads, (num_data + min_block_size - 1) / min_block_size);
const data_size_t block_size = (num_data + n_block - 1) / n_block;
if (ret->IsSparse()) {
#pragma omp parallel for schedule(static)
for (int tid = 0; tid < n_block; ++tid) {
std::vector<uint32_t> cur_data;
data_size_t start = tid * block_size;
data_size_t end = std::min(num_data, start + block_size);
for (size_t j = 0; j < most_freq_bins.size(); ++j) {
iters[tid][j]->Reset(start);
}
for (data_size_t i = start; i < end; ++i) {
cur_data.clear();
for (size_t j = 0; j < most_freq_bins.size(); ++j) {
auto cur_bin = iters[tid][j]->Get(i);
if (cur_bin == most_freq_bins[j]) {
continue;
}
cur_bin += offsets[j];
if (most_freq_bins[j] == 0) {
cur_bin -= 1;
}
cur_data.push_back(cur_bin);
}
ret->PushOneRow(tid, i, cur_data);
}
}
} else {
#pragma omp parallel for schedule(static)
for (int tid = 0; tid < n_block; ++tid) {
std::vector<uint32_t> cur_data;
data_size_t start = tid * block_size;
data_size_t end = std::min(num_data, start + block_size);
for (size_t j = 0; j < most_freq_bins.size(); ++j) {
iters[tid][j]->Reset(start);
}
for (data_size_t i = start; i < end; ++i) {
cur_data.clear();
for (size_t j = 0; j < most_freq_bins.size(); ++j) {
auto cur_bin = iters[tid][j]->Get(i);
if (cur_bin == most_freq_bins[j]) {
cur_bin = 0;
} else {
cur_bin += offsets[j];
if (most_freq_bins[j] == 0) {
cur_bin -= 1;
}
}
cur_data.push_back(cur_bin);
}
ret->PushOneRow(tid, i, cur_data);
}
}
}
}
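
The push above relies on each feature owning a contiguous slice of the shared bin range: `offsets[j]` shifts feature `j`'s local bin into the global space, the most-frequent bin is never stored in the sparse case, and when that bin is 0 the slice shrinks by one. A small sketch of the encoding for one row — the two features and their offsets are hand-computed toy values:

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // Feature 0: 4 bins, most-frequent bin 0. Feature 1: 3 bins,
  // most-frequent bin 2. Offsets follow the scheme used by
  // GetMultiBinFromAllFeatures (global bin 0 is reserved).
  const std::vector<uint32_t> most_freq = {0, 2};
  const std::vector<uint32_t> offsets = {1, 4};
  const std::vector<uint32_t> row = {2, 1};  // raw per-feature bins, one row
  std::vector<uint32_t> encoded;             // sparse multi-val encoding
  for (size_t j = 0; j < row.size(); ++j) {
    uint32_t bin = row[j];
    if (bin == most_freq[j]) continue;  // most-frequent bin stays implicit
    bin += offsets[j];
    if (most_freq[j] == 0) bin -= 1;    // bin 0 was removed from the slice
    encoded.push_back(bin);
  }
  for (uint32_t b : encoded) std::printf("%u ", b);  // prints "2 5"
  std::printf("\n");
}
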
MultiValBin* Dataset::GetMultiBinFromSparseFeatures() const {
Common::FunctionTimer fun_time("Dataset::GetMultiBinFromSparseFeatures", global_timer);
int multi_group_id = -1;
for (int i = 0; i < num_groups_; ++i) {
if (feature_groups_[i]->is_multi_val_) {
if (multi_group_id < 0) {
multi_group_id = i;
} else {
Log::Fatal("Bug. There should be only one multi-val group.");
}
}
}
if (multi_group_id < 0) {
return nullptr;
}
const auto& offsets = feature_groups_[multi_group_id]->bin_offsets_;
const int num_feature = feature_groups_[multi_group_id]->num_feature_;
int num_threads = 1;
#pragma omp parallel
#pragma omp master
{
num_threads = omp_get_num_threads();
}
std::vector<std::vector<std::unique_ptr<BinIterator>>> iters(num_threads);
std::vector<uint32_t> most_freq_bins;
double sum_sparse_rate = 0;
for (int i = 0; i < num_feature; ++i) {
for (int tid = 0; tid < num_threads; ++tid) {
iters[tid].emplace_back(feature_groups_[multi_group_id]->SubFeatureIterator(i));
}
most_freq_bins.push_back(feature_groups_[multi_group_id]->bin_mappers_[i]->GetMostFreqBin());
sum_sparse_rate += feature_groups_[multi_group_id]->bin_mappers_[i]->sparse_rate();
}
sum_sparse_rate /= num_feature;
Log::Debug("GetMultiBinFromSparseFeatures:: sparse rate %f", sum_sparse_rate);
std::unique_ptr<MultiValBin> ret;
ret.reset(MultiValBin::CreateMultiValBin(num_data_, offsets.back(), num_feature, sum_sparse_rate));
PushDataToMultiValBin(num_threads, num_data_, most_freq_bins, offsets, iters, ret.get());
ret->FinishLoad();
return ret.release();
}
MultiValBin* Dataset::GetMultiBinFromAllFeatures() const {
Common::FunctionTimer fun_time("Dataset::GetMultiBinFromAllFeatures", global_timer);
int num_threads = 1;
#pragma omp parallel
#pragma omp master
{
num_threads = omp_get_num_threads();
}
double sum_dense_ratio = 0;
std::unique_ptr<MultiValBin> ret;
std::vector<std::vector<std::unique_ptr<BinIterator>>> iters(num_threads);
std::vector<uint32_t> most_freq_bins;
std::vector<uint32_t> offsets;
int num_total_bin = 1;
offsets.push_back(num_total_bin);
for (int gid = 0; gid < num_groups_; ++gid) {
if (feature_groups_[gid]->is_multi_val_) {
for (int fid = 0; fid < feature_groups_[gid]->num_feature_; ++fid) {
const auto& bin_mapper = feature_groups_[gid]->bin_mappers_[fid];
sum_dense_ratio += 1.0f - bin_mapper->sparse_rate();
most_freq_bins.push_back(bin_mapper->GetMostFreqBin());
num_total_bin += bin_mapper->num_bin();
if (most_freq_bins.back() == 0) {
num_total_bin -= 1;
}
offsets.push_back(num_total_bin);
for (int tid = 0; tid < num_threads; ++tid) {
iters[tid].emplace_back(feature_groups_[gid]->SubFeatureIterator(fid));
}
}
} else {
most_freq_bins.push_back(0);
num_total_bin += feature_groups_[gid]->bin_offsets_.back() - 1;
for (int tid = 0; tid < num_threads; ++tid) {
iters[tid].emplace_back(feature_groups_[gid]->FeatureGroupIterator());
}
offsets.push_back(num_total_bin);
for (int fid = 0; fid < feature_groups_[gid]->num_feature_; ++fid) {
const auto& bin_mapper = feature_groups_[gid]->bin_mappers_[fid];
sum_dense_ratio += 1.0f - bin_mapper->sparse_rate();
}
}
}
sum_dense_ratio /= static_cast<double>(most_freq_bins.size());
Log::Debug("GetMultiBinFromAllFeatures:: sparse rate %f", 1.0 - sum_dense_ratio);
ret.reset(MultiValBin::CreateMultiValBin(num_data_, num_total_bin, static_cast<int>(most_freq_bins.size()), 1.0 - sum_dense_ratio));
PushDataToMultiValBin(num_threads, num_data_, most_freq_bins, offsets, iters, ret.get());
ret->FinishLoad();
return ret.release();
}
MultiValBin* Dataset::TestMultiThreadingMethod(score_t* gradients, score_t* hessians, const std::vector<int8_t>& is_feature_used, bool is_constant_hessian,
bool force_colwise, bool force_rowwise, bool* is_hist_col_wise) const {
int num_threads = 1;
#pragma omp parallel
#pragma omp master
{ num_threads = omp_get_num_threads(); }
Common::FunctionTimer fun_timer("Dataset::TestMultiThreadingMethod", global_timer);
if (force_colwise && force_rowwise) {
Log::Fatal("cannot set both `force_col_wise` and `force_row_wise` to `true`.");
}
if (num_groups_ <= 0) {
return nullptr;
}
if (force_colwise) {
*is_hist_col_wise = true;
return GetMultiBinFromSparseFeatures();
} else if (force_rowwise) {
*is_hist_col_wise = false;
auto ret = GetMultiBinFromAllFeatures();
const int num_bin_aligned =
(ret->num_bin() + kAlignedSize - 1) / kAlignedSize * kAlignedSize;
hist_buf_.resize(static_cast<size_t>(num_bin_aligned) * 2 * num_threads);
return ret;
} else {
std::unique_ptr<MultiValBin> sparse_bin;
std::unique_ptr<MultiValBin> all_bin;
sparse_bin.reset(GetMultiBinFromSparseFeatures());
all_bin.reset(GetMultiBinFromAllFeatures());
std::vector<hist_t, Common::AlignmentAllocator<hist_t, kAlignedSize>> hist_data(NumTotalBin() * 2);
const int num_bin_aligned =
(all_bin->num_bin() + kAlignedSize - 1) / kAlignedSize * kAlignedSize;
hist_buf_.resize(static_cast<size_t>(num_bin_aligned) * 2 * num_threads);
std::chrono::duration<double, std::milli> col_wise_time, row_wise_time;
auto start_time = std::chrono::steady_clock::now();
ConstructHistograms(is_feature_used, nullptr, num_data_, gradients, hessians, gradients, hessians, is_constant_hessian, sparse_bin.get(), true, hist_data.data());
col_wise_time = std::chrono::steady_clock::now() - start_time;
start_time = std::chrono::steady_clock::now();
ConstructHistogramsMultiVal(all_bin.get(), nullptr, num_data_, gradients, hessians, is_constant_hessian, hist_data.data());
row_wise_time = std::chrono::steady_clock::now() - start_time;
Log::Debug("colwise cost %f seconds, rowwise cost %f seconds", col_wise_time * 1e-3, row_wise_time * 1e-3);
if (col_wise_time < row_wise_time) {
*is_hist_col_wise = true;
hist_buf_.clear();
return sparse_bin.release();
} else {
*is_hist_col_wise = false;
Log::Info("Use row-wise multi-threading, may increase memory usage. If memory is not enough, you can set `force_col_wise=true`.");
if (all_bin->IsSparse()) {
Log::Debug("Use Sparse Multi-Val Bin");
} else {
Log::Debug("Use Dense Multi-Val Bin");
}
return all_bin.release();
}
}
}
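
When neither flag is forced, the method literally benchmarks both code paths on the real gradients and keeps the winner. That pattern, reduced to a standalone sketch (the two workloads here are dummies standing in for the col-wise and row-wise histogram kernels):

#include <chrono>
#include <cstdio>

// Times one call of a callable in milliseconds.
template <typename Fn>
double TimeMs(Fn&& fn) {
  const auto start = std::chrono::steady_clock::now();
  fn();
  const std::chrono::duration<double, std::milli> elapsed =
      std::chrono::steady_clock::now() - start;
  return elapsed.count();
}

int main() {
  volatile double sink = 0;
  auto col_wise = [&] { for (int i = 0; i < (1 << 24); ++i) sink = sink + i; };
  auto row_wise = [&] { for (int i = 0; i < (1 << 22); ++i) sink = sink + i; };
  const double col_ms = TimeMs(col_wise);
  const double row_ms = TimeMs(row_wise);
  std::printf("col-wise %.2f ms, row-wise %.2f ms -> keep %s\n", col_ms,
              row_ms, col_ms < row_ms ? "col-wise" : "row-wise");
}
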
void Dataset::CopyFeatureMapperFrom(const Dataset* dataset) {
feature_groups_.clear();
num_features_ = dataset->num_features_;
num_groups_ = dataset->num_groups_;
sparse_threshold_ = dataset->sparse_threshold_;
// copy feature bin mapper data
for (int i = 0; i < num_groups_; ++i) {
std::vector<std::unique_ptr<BinMapper>> bin_mappers;
......@@ -477,9 +701,9 @@ void Dataset::CopyFeatureMapperFrom(const Dataset* dataset) {
}
feature_groups_.emplace_back(new FeatureGroup(
dataset->feature_groups_[i]->num_feature_,
dataset->feature_groups_[i]->is_multi_val_,
&bin_mappers,
num_data_,
dataset->feature_groups_[i]->is_sparse_));
num_data_));
}
feature_groups_.shrink_to_fit();
used_feature_map_ = dataset->used_feature_map_;
......@@ -502,8 +726,6 @@ void Dataset::CreateValid(const Dataset* dataset) {
feature_groups_.clear();
num_features_ = dataset->num_features_;
num_groups_ = num_features_;
sparse_threshold_ = dataset->sparse_threshold_;
bool is_enable_sparse = true;
feature2group_.clear();
feature2subfeature_.clear();
// copy feature bin mapper data
......@@ -514,12 +736,8 @@ void Dataset::CreateValid(const Dataset* dataset) {
if (bin_mappers.back()->GetDefaultBin() != bin_mappers.back()->GetMostFreqBin()) {
feature_need_push_zeros_.push_back(i);
}
feature_groups_.emplace_back(new FeatureGroup(
1,
&bin_mappers,
num_data_,
sparse_threshold_,
is_enable_sparse));
feature_groups_.emplace_back(new FeatureGroup(&bin_mappers,
num_data_));
feature2group_.push_back(i);
feature2subfeature_.push_back(0);
}
......@@ -721,7 +939,7 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
writer->Write(binary_file_token, size_of_token);
// get size of header
size_t size_of_header = sizeof(num_data_) + sizeof(num_features_) + sizeof(num_total_features_)
+ sizeof(int) * num_total_features_ + sizeof(label_idx_) + sizeof(num_groups_) + sizeof(sparse_threshold_)
+ sizeof(int) * num_total_features_ + sizeof(label_idx_) + sizeof(num_groups_)
+ 3 * sizeof(int) * num_features_ + sizeof(uint64_t) * (num_groups_ + 1) + 2 * sizeof(int) * num_groups_ + sizeof(int8_t) * num_features_
+ sizeof(double) * num_features_ + sizeof(int32_t) * num_total_features_ + sizeof(int) * 3 + sizeof(bool) * 2;
// size of feature names
......@@ -743,7 +961,6 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
writer->Write(&min_data_in_bin_, sizeof(min_data_in_bin_));
writer->Write(&use_missing_, sizeof(use_missing_));
writer->Write(&zero_as_missing_, sizeof(zero_as_missing_));
writer->Write(&sparse_threshold_, sizeof(sparse_threshold_));
writer->Write(used_feature_map_.data(), sizeof(int) * num_total_features_);
writer->Write(&num_groups_, sizeof(num_groups_));
writer->Write(real_feature_idx_.data(), sizeof(int) * num_features_);
......@@ -866,20 +1083,110 @@ void Dataset::DumpTextFile(const char* text_filename) {
fclose(file);
}
void Dataset::ConstructHistogramsMultiVal(const MultiValBin* multi_val_bin, const data_size_t* data_indices, data_size_t num_data,
const score_t* gradients, const score_t* hessians,
bool is_constant_hessian,
hist_t* hist_data) const {
Common::FunctionTimer fun_time("Dataset::ConstructHistogramsMultiVal", global_timer);
if (multi_val_bin == nullptr) { return; }
int num_threads = 1;
#pragma omp parallel
#pragma omp master
{
num_threads = omp_get_num_threads();
}
global_timer.Start("Dataset::sparse_bin_histogram");
const int num_bin = multi_val_bin->num_bin();
const int num_bin_aligned = (num_bin + kAlignedSize - 1) / kAlignedSize * kAlignedSize;
const int min_data_block_size = 1024;
const int n_data_block = std::min(num_threads, (num_data + min_data_block_size - 1) / min_data_block_size);
const int data_block_size = (num_data + n_data_block - 1) / n_data_block;
const size_t buf_size = static_cast<size_t>(n_data_block - 1)* num_bin_aligned * 2;
if (hist_buf_.size() < buf_size) {
hist_buf_.resize(buf_size);
}
#pragma omp parallel for schedule(static)
for (int tid = 0; tid < n_data_block; ++tid) {
data_size_t start = tid * data_block_size;
data_size_t end = std::min(start + data_block_size, num_data);
auto data_ptr = hist_data;
if (tid > 0) {
data_ptr = hist_buf_.data() + static_cast<size_t>(num_bin_aligned) * 2 * (tid - 1);
}
std::memset(reinterpret_cast<void*>(data_ptr), 0, num_bin* KHistEntrySize);
if (data_indices != nullptr && num_data < num_data_) {
if (!is_constant_hessian) {
multi_val_bin->ConstructHistogram(data_indices, start, end, gradients, hessians, data_ptr);
} else {
multi_val_bin->ConstructHistogram(data_indices, start, end, gradients, data_ptr);
}
} else {
if (!is_constant_hessian) {
multi_val_bin->ConstructHistogram(start, end, gradients, hessians, data_ptr);
} else {
multi_val_bin->ConstructHistogram(start, end, gradients, data_ptr);
}
}
}
global_timer.Stop("Dataset::sparse_bin_histogram");
global_timer.Start("Dataset::sparse_bin_histogram_merge");
const int min_bin_block_size = 512;
const int n_bin_block = std::min(num_threads, (num_bin + min_bin_block_size - 1) / min_bin_block_size);
const int bin_block_size = (num_bin + n_bin_block - 1) / n_bin_block;
if (!is_constant_hessian) {
#pragma omp parallel for schedule(static)
for (int t = 0; t < n_bin_block; ++t) {
const int start = t * bin_block_size;
const int end = std::min(start + bin_block_size, num_bin);
for (int tid = 1; tid < n_data_block; ++tid) {
auto src_ptr = hist_buf_.data() + static_cast<size_t>(num_bin_aligned) * 2 * (tid - 1);
for (int i = start * 2; i < end * 2; ++i) {
hist_data[i] += src_ptr[i];
}
}
}
} else {
#pragma omp parallel for schedule(static)
for (int t = 0; t < n_bin_block; ++t) {
const int start = t * bin_block_size;
const int end = std::min(start + bin_block_size, num_bin);
for (int tid = 1; tid < n_data_block; ++tid) {
auto src_ptr = hist_buf_.data() + static_cast<size_t>(num_bin_aligned) * 2 * (tid - 1);
for (int i = start * 2; i < end * 2; ++i) {
hist_data[i] += src_ptr[i];
}
}
for (int i = start; i < end; i++) {
GET_HESS(hist_data, i) = GET_HESS(hist_data, i) * hessians[0];
}
}
}
global_timer.Stop("Dataset::sparse_bin_histogram_merge");
}
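
The structure above — per-thread histogram buffers filled over disjoint data blocks, then a merge phase parallelized over bins so no two threads ever write the same entry — is the core of the row-wise path. A minimal self-contained OpenMP sketch of the same two-phase scheme (plain counts instead of gradient/hessian pairs):

#include <omp.h>

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  const int num_data = 1 << 16, num_bin = 8;
  std::vector<int> bins(num_data);
  for (int i = 0; i < num_data; ++i) bins[i] = i % num_bin;
  int num_threads = 1;
#pragma omp parallel
#pragma omp master
  {
    num_threads = omp_get_num_threads();
  }
  const int block = (num_data + num_threads - 1) / num_threads;
  // Phase 1: one private histogram per data block, no synchronization.
  std::vector<std::vector<int>> buf(num_threads, std::vector<int>(num_bin, 0));
#pragma omp parallel for schedule(static)
  for (int tid = 0; tid < num_threads; ++tid) {
    const int start = tid * block;
    const int end = std::min(num_data, start + block);
    for (int i = start; i < end; ++i) ++buf[tid][bins[i]];
  }
  // Phase 2: merge, parallelized over bins so writes never collide.
  std::vector<int> hist(num_bin, 0);
#pragma omp parallel for schedule(static)
  for (int b = 0; b < num_bin; ++b) {
    for (int tid = 0; tid < num_threads; ++tid) hist[b] += buf[tid][b];
  }
  for (int b = 0; b < num_bin; ++b) std::printf("bin %d: %d\n", b, hist[b]);
}
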
void Dataset::ConstructHistograms(const std::vector<int8_t>& is_feature_used,
const data_size_t* data_indices, data_size_t num_data,
int leaf_idx,
std::vector<std::unique_ptr<OrderedBin>>* ordered_bins,
const score_t* gradients, const score_t* hessians,
score_t* ordered_gradients, score_t* ordered_hessians,
bool is_constant_hessian,
HistogramBinEntry* hist_data) const {
if (leaf_idx < 0 || num_data < 0 || hist_data == nullptr) {
const MultiValBin* multi_val_bin, bool is_colwise,
hist_t* hist_data) const {
Common::FunctionTimer fun_timer("Dataset::ConstructHistograms", global_timer);
if (num_data < 0 || hist_data == nullptr) {
return;
}
std::vector<int> used_group;
used_group.reserve(num_groups_);
if (!is_colwise) {
return ConstructHistogramsMultiVal(multi_val_bin, data_indices, num_data, gradients, hessians, is_constant_hessian, hist_data);
}
global_timer.Start("Dataset::Get used group");
std::vector<int> used_dense_group;
int multi_val_group_id = -1;
used_dense_group.reserve(num_groups_);
for (int group = 0; group < num_groups_; ++group) {
const int f_cnt = group_feature_cnt_[group];
bool is_group_used = false;
......@@ -891,172 +1198,137 @@ void Dataset::ConstructHistograms(const std::vector<int8_t>& is_feature_used,
}
}
if (is_group_used) {
used_group.push_back(group);
}
}
int num_used_group = static_cast<int>(used_group.size());
auto ptr_ordered_grad = gradients;
auto ptr_ordered_hess = hessians;
auto& ref_ordered_bins = *ordered_bins;
if (data_indices != nullptr && num_data < num_data_) {
if (!is_constant_hessian) {
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data; ++i) {
ordered_gradients[i] = gradients[data_indices[i]];
ordered_hessians[i] = hessians[data_indices[i]];
}
} else {
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data; ++i) {
ordered_gradients[i] = gradients[data_indices[i]];
if (feature_groups_[group]->is_multi_val_) {
multi_val_group_id = group;
} else {
used_dense_group.push_back(group);
}
}
ptr_ordered_grad = ordered_gradients;
ptr_ordered_hess = ordered_hessians;
if (!is_constant_hessian) {
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
for (int gi = 0; gi < num_used_group; ++gi) {
OMP_LOOP_EX_BEGIN();
int group = used_group[gi];
// feature is not used
auto data_ptr = hist_data + group_bin_boundaries_[group];
const int num_bin = feature_groups_[group]->num_total_bin_;
std::memset(reinterpret_cast<void*>(data_ptr + 1), 0, (num_bin - 1) * sizeof(HistogramBinEntry));
// construct histograms for smaller leaf
if (ref_ordered_bins[group] == nullptr) {
// if not use ordered bin
feature_groups_[group]->bin_data_->ConstructHistogram(
data_indices,
0,
num_data,
ptr_ordered_grad,
ptr_ordered_hess,
data_ptr);
} else {
// used ordered bin
ref_ordered_bins[group]->ConstructHistogram(leaf_idx,
gradients,
hessians,
data_ptr);
}
int num_used_dense_group = static_cast<int>(used_dense_group.size());
global_timer.Stop("Dataset::Get used group");
global_timer.Start("Dataset::dense_bin_histogram");
if (num_used_dense_group > 0) {
auto ptr_ordered_grad = gradients;
auto ptr_ordered_hess = hessians;
if (data_indices != nullptr && num_data < num_data_) {
if (!is_constant_hessian) {
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data; ++i) {
ordered_gradients[i] = gradients[data_indices[i]];
ordered_hessians[i] = hessians[data_indices[i]];
}
} else {
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data; ++i) {
ordered_gradients[i] = gradients[data_indices[i]];
}
OMP_LOOP_EX_END();
}
OMP_THROW_EX();
} else {
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
for (int gi = 0; gi < num_used_group; ++gi) {
OMP_LOOP_EX_BEGIN();
int group = used_group[gi];
// feature is not used
auto data_ptr = hist_data + group_bin_boundaries_[group];
const int num_bin = feature_groups_[group]->num_total_bin_;
std::memset(reinterpret_cast<void*>(data_ptr + 1), 0, (num_bin - 1) * sizeof(HistogramBinEntry));
// construct histograms for smaller leaf
if (ref_ordered_bins[group] == nullptr) {
// if not use ordered bin
ptr_ordered_grad = ordered_gradients;
ptr_ordered_hess = ordered_hessians;
if (!is_constant_hessian) {
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
for (int gi = 0; gi < num_used_dense_group; ++gi) {
OMP_LOOP_EX_BEGIN();
int group = used_dense_group[gi];
// feature is not used
auto data_ptr = hist_data + group_bin_boundaries_[group] * 2;
const int num_bin = feature_groups_[group]->num_total_bin_;
std::memset(reinterpret_cast<void*>(data_ptr), 0,
num_bin * KHistEntrySize);
// construct histograms for smaller leaf
feature_groups_[group]->bin_data_->ConstructHistogram(
data_indices,
0,
num_data,
ptr_ordered_grad,
data_ptr);
} else {
// used ordered bin
ref_ordered_bins[group]->ConstructHistogram(leaf_idx,
gradients,
data_ptr);
}
// fixed hessian.
for (int i = 0; i < num_bin; ++i) {
data_ptr[i].sum_hessians = data_ptr[i].cnt * hessians[0];
data_indices, 0, num_data, ptr_ordered_grad, ptr_ordered_hess,
data_ptr);
OMP_LOOP_EX_END();
}
OMP_LOOP_EX_END();
}
OMP_THROW_EX();
}
} else {
if (!is_constant_hessian) {
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
for (int gi = 0; gi < num_used_group; ++gi) {
OMP_LOOP_EX_BEGIN();
int group = used_group[gi];
// feature is not used
auto data_ptr = hist_data + group_bin_boundaries_[group];
const int num_bin = feature_groups_[group]->num_total_bin_;
std::memset(reinterpret_cast<void*>(data_ptr + 1), 0, (num_bin - 1) * sizeof(HistogramBinEntry));
// construct histograms for smaller leaf
if (ref_ordered_bins[group] == nullptr) {
// if not use ordered bin
OMP_THROW_EX();
} else {
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
for (int gi = 0; gi < num_used_dense_group; ++gi) {
OMP_LOOP_EX_BEGIN();
int group = used_dense_group[gi];
// feature is not used
auto data_ptr = hist_data + group_bin_boundaries_[group] * 2;
const int num_bin = feature_groups_[group]->num_total_bin_;
std::memset(reinterpret_cast<void*>(data_ptr), 0,
num_bin * KHistEntrySize);
// construct histograms for smaller leaf
feature_groups_[group]->bin_data_->ConstructHistogram(
0,
num_data,
ptr_ordered_grad,
ptr_ordered_hess,
data_ptr);
} else {
// used ordered bin
ref_ordered_bins[group]->ConstructHistogram(leaf_idx,
gradients,
hessians,
data_ptr);
data_indices, 0, num_data, ptr_ordered_grad, data_ptr);
// fixed hessian.
for (int i = 0; i < num_bin; ++i) {
GET_HESS(data_ptr, i) = GET_HESS(data_ptr, i) * hessians[0];
}
OMP_LOOP_EX_END();
}
OMP_LOOP_EX_END();
OMP_THROW_EX();
}
OMP_THROW_EX();
} else {
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
for (int gi = 0; gi < num_used_group; ++gi) {
OMP_LOOP_EX_BEGIN();
int group = used_group[gi];
// feature is not used
auto data_ptr = hist_data + group_bin_boundaries_[group];
const int num_bin = feature_groups_[group]->num_total_bin_;
std::memset(reinterpret_cast<void*>(data_ptr + 1), 0, (num_bin - 1) * sizeof(HistogramBinEntry));
// construct histograms for smaller leaf
if (ref_ordered_bins[group] == nullptr) {
// if not use ordered bin
if (!is_constant_hessian) {
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
for (int gi = 0; gi < num_used_dense_group; ++gi) {
OMP_LOOP_EX_BEGIN();
int group = used_dense_group[gi];
// feature is not used
auto data_ptr = hist_data + group_bin_boundaries_[group] * 2;
const int num_bin = feature_groups_[group]->num_total_bin_;
std::memset(reinterpret_cast<void*>(data_ptr), 0,
num_bin * KHistEntrySize);
// construct histograms for smaller leaf
feature_groups_[group]->bin_data_->ConstructHistogram(
0,
num_data,
ptr_ordered_grad,
data_ptr);
} else {
// used ordered bin
ref_ordered_bins[group]->ConstructHistogram(leaf_idx,
gradients,
data_ptr);
0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_ptr);
OMP_LOOP_EX_END();
}
// fixed hessian.
for (int i = 0; i < num_bin; ++i) {
data_ptr[i].sum_hessians = data_ptr[i].cnt * hessians[0];
OMP_THROW_EX();
} else {
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
for (int gi = 0; gi < num_used_dense_group; ++gi) {
OMP_LOOP_EX_BEGIN();
int group = used_dense_group[gi];
// feature is not used
auto data_ptr = hist_data + group_bin_boundaries_[group] * 2;
const int num_bin = feature_groups_[group]->num_total_bin_;
std::memset(reinterpret_cast<void*>(data_ptr), 0,
num_bin * KHistEntrySize);
// construct histograms for smaller leaf
feature_groups_[group]->bin_data_->ConstructHistogram(
0, num_data, ptr_ordered_grad, data_ptr);
// fixed hessian.
for (int i = 0; i < num_bin; ++i) {
GET_HESS(data_ptr, i) = GET_HESS(data_ptr, i) * hessians[0];
}
OMP_LOOP_EX_END();
}
OMP_LOOP_EX_END();
OMP_THROW_EX();
}
OMP_THROW_EX();
}
}
global_timer.Stop("Dataset::dense_bin_histogram");
if (multi_val_groud_id >= 0) {
ConstructHistogramsMultiVal(multi_val_bin, data_indices, num_data, gradients, hessians, is_constant_hessian,
hist_data + group_bin_boundaries_[multi_val_group_id] * 2);
}
}
void Dataset::FixHistogram(int feature_idx, double sum_gradient, double sum_hessian, data_size_t num_data,
HistogramBinEntry* data) const {
void Dataset::FixHistogram(int feature_idx, double sum_gradient, double sum_hessian, hist_t* data) const {
const int group = feature2group_[feature_idx];
const int sub_feature = feature2subfeature_[feature_idx];
const BinMapper* bin_mapper = feature_groups_[group]->bin_mappers_[sub_feature].get();
const int most_freq_bin = bin_mapper->GetMostFreqBin();
if (most_freq_bin > 0) {
const int num_bin = bin_mapper->num_bin();
data[most_freq_bin].sum_gradients = sum_gradient;
data[most_freq_bin].sum_hessians = sum_hessian;
data[most_freq_bin].cnt = num_data;
GET_GRAD(data, most_freq_bin) = sum_gradient;
GET_HESS(data, most_freq_bin) = sum_hessian;
for (int i = 0; i < num_bin; ++i) {
if (i != most_freq_bin) {
data[most_freq_bin].sum_gradients -= data[i].sum_gradients;
data[most_freq_bin].sum_hessians -= data[i].sum_hessians;
data[most_freq_bin].cnt -= data[i].cnt;
GET_GRAD(data, most_freq_bin) -= GET_GRAD(data, i);
GET_HESS(data, most_freq_bin) -= GET_HESS(data, i);
}
}
}
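
FixHistogram recovers the most-frequent bin's entry by subtraction: rows in that bin are never pushed by the kernels, but the leaf-level totals are known from the parent, so subtracting every other bin from the totals yields the missing entry. In miniature (the numbers are invented):

#include <cstdio>

int main() {
  const int num_bin = 4, most_freq = 0;
  // Per-bin gradient sums as accumulated by the histogram kernels; rows
  // falling in the most-frequent bin were never pushed, so bin 0 is empty.
  double grad[num_bin] = {0.0, 1.5, -0.5, 2.0};
  const double sum_gradient = 4.0;  // known total over the whole leaf
  grad[most_freq] = sum_gradient;
  for (int i = 0; i < num_bin; ++i) {
    if (i != most_freq) grad[most_freq] -= grad[i];
  }
  std::printf("most-freq bin gradient: %g\n", grad[most_freq]);  // 1
}
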
......@@ -1094,7 +1366,7 @@ void PushClearIfEmpty(std::vector<T>* dest, const size_t dest_len, const std::ve
}
}
void Dataset::addFeaturesFrom(Dataset* other) {
void Dataset::AddFeaturesFrom(Dataset* other) {
if (other->num_data_ != num_data_) {
throw std::runtime_error("Cannot add features from other Dataset with a different number of rows");
}
......
......@@ -335,8 +335,6 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b
mem_ptr += sizeof(dataset->use_missing_);
dataset->zero_as_missing_ = *(reinterpret_cast<const bool*>(mem_ptr));
mem_ptr += sizeof(dataset->zero_as_missing_);
dataset->sparse_threshold_ = *(reinterpret_cast<const double*>(mem_ptr));
mem_ptr += sizeof(dataset->sparse_threshold_);
const int* tmp_feature_map = reinterpret_cast<const int*>(mem_ptr);
dataset->used_feature_map_.clear();
for (int i = 0; i < dataset->num_total_features_; ++i) {
......
......@@ -31,9 +31,9 @@ class DenseBinIterator: public BinIterator {
}
inline uint32_t RawGet(data_size_t idx) override;
inline uint32_t Get(data_size_t idx) override;
inline void Reset(data_size_t) override { }
inline void Reset(data_size_t) override {}
private:
private:
const DenseBin<VAL_T>* bin_data_;
VAL_T min_bin_;
VAL_T max_bin_;
......@@ -46,7 +46,7 @@ class DenseBinIterator: public BinIterator {
*/
template <typename VAL_T>
class DenseBin: public Bin {
public:
public:
friend DenseBinIterator<VAL_T>;
explicit DenseBin(data_size_t num_data)
: num_data_(num_data), data_(num_data_, static_cast<VAL_T>(0)) {
......@@ -68,84 +68,65 @@ class DenseBin: public Bin {
BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override;
void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians,
HistogramBinEntry* out) const override {
const data_size_t pf_offset = 64 / sizeof(VAL_T);
const data_size_t pf_end = end - pf_offset - kCacheLineSize / sizeof(VAL_T);
#define ACC_GH(hist, i, g, h) \
const auto ti = static_cast<int>(i) << 1; \
hist[ti] += g; \
hist[ti + 1] += h; \
template<bool use_indices, bool use_prefetch, bool use_hessians>
void ConstructHistogramInner(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians, hist_t* out) const {
data_size_t i = start;
for (; i < pf_end; i++) {
PREFETCH_T0(data_.data() + data_indices[i + pf_offset]);
const VAL_T bin = data_[data_indices[i]];
out[bin].sum_gradients += ordered_gradients[i];
out[bin].sum_hessians += ordered_hessians[i];
++out[bin].cnt;
if (use_prefetch) {
const data_size_t pf_offset = 64 / sizeof(VAL_T);
const data_size_t pf_end = end - pf_offset;
for (; i < pf_end; ++i) {
const auto idx = use_indices ? data_indices[i] : i;
const auto pf_idx = use_indices ? data_indices[i + pf_offset] : i + pf_offset;
PREFETCH_T0(data_.data() + pf_idx);
const VAL_T bin = data_[idx];
if (use_hessians) {
ACC_GH(out, bin, ordered_gradients[i], ordered_hessians[i]);
} else {
ACC_GH(out, bin, ordered_gradients[i], 1.0f);
}
}
}
for (; i < end; i++) {
const VAL_T bin = data_[data_indices[i]];
out[bin].sum_gradients += ordered_gradients[i];
out[bin].sum_hessians += ordered_hessians[i];
++out[bin].cnt;
for (; i < end; ++i) {
const auto idx = use_indices ? data_indices[i] : i;
const VAL_T bin = data_[idx];
if (use_hessians) {
ACC_GH(out, bin, ordered_gradients[i], ordered_hessians[i]);
} else {
ACC_GH(out, bin, ordered_gradients[i], 1.0f);
}
}
}
#undef ACC_GH
void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians,
hist_t* out) const override {
ConstructHistogramInner<true, true, true>(data_indices, start, end, ordered_gradients, ordered_hessians, out);
}
void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians,
HistogramBinEntry* out) const override {
const data_size_t pf_offset = 64 / sizeof(VAL_T);
const data_size_t pf_end = end - pf_offset - kCacheLineSize / sizeof(VAL_T);
data_size_t i = start;
for (; i < pf_end; i++) {
PREFETCH_T0(data_.data() + i + pf_offset);
const VAL_T bin = data_[i];
out[bin].sum_gradients += ordered_gradients[i];
out[bin].sum_hessians += ordered_hessians[i];
++out[bin].cnt;
}
for (; i < end; i++) {
const VAL_T bin = data_[i];
out[bin].sum_gradients += ordered_gradients[i];
out[bin].sum_hessians += ordered_hessians[i];
++out[bin].cnt;
}
hist_t* out) const override {
ConstructHistogramInner<false, false, true>(nullptr, start, end, ordered_gradients, ordered_hessians, out);
}
void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients,
HistogramBinEntry* out) const override {
const data_size_t pf_offset = 64 / sizeof(VAL_T);
const data_size_t pf_end = end - pf_offset - kCacheLineSize / sizeof(VAL_T);
data_size_t i = start;
for (; i < pf_end; i++) {
PREFETCH_T0(data_.data() + data_indices[i + pf_offset]);
const VAL_T bin = data_[data_indices[i]];
out[bin].sum_gradients += ordered_gradients[i];
++out[bin].cnt;
}
for (; i < end; i++) {
const VAL_T bin = data_[data_indices[i]];
out[bin].sum_gradients += ordered_gradients[i];
++out[bin].cnt;
}
hist_t* out) const override {
ConstructHistogramInner<true, true, false>(data_indices, start, end, ordered_gradients, nullptr, out);
}
void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* ordered_gradients,
HistogramBinEntry* out) const override {
const data_size_t pf_offset = 64 / sizeof(VAL_T);
const data_size_t pf_end = end - pf_offset - kCacheLineSize / sizeof(VAL_T);
data_size_t i = start;
for (; i < pf_end; i++) {
PREFETCH_T0(data_.data() + i + pf_offset);
const VAL_T bin = data_[i];
out[bin].sum_gradients += ordered_gradients[i];
++out[bin].cnt;
}
for (; i < end; i++) {
const VAL_T bin = data_[i];
out[bin].sum_gradients += ordered_gradients[i];
++out[bin].cnt;
}
hist_t* out) const override {
ConstructHistogramInner<false, false, false>(nullptr, start, end, ordered_gradients, nullptr, out);
}
data_size_t Split(
......@@ -257,9 +238,6 @@ class DenseBin: public Bin {
data_size_t num_data() const override { return num_data_; }
/*! \brief not ordered bin for dense feature */
OrderedBin* CreateOrderedBin() const override { return nullptr; }
void FinishLoad() override {}
void LoadFromMemory(const void* memory, const std::vector<data_size_t>& local_used_indices) override {
......@@ -287,17 +265,18 @@ class DenseBin: public Bin {
}
size_t SizesInByte() const override {
return sizeof(VAL_T) * num_data_;
return sizeof(VAL_T)* num_data_;
}
DenseBin<VAL_T>* Clone() override;
private:
private:
data_size_t num_data_;
std::vector<VAL_T> data_;
std::vector<VAL_T, Common::AlignmentAllocator<VAL_T, kAlignedSize>> data_;
DenseBin<VAL_T>(const DenseBin<VAL_T>& other)
: num_data_(other.num_data_), data_(other.data_){}
: num_data_(other.num_data_), data_(other.data_) {
}
};
template<typename VAL_T>
......
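
`ConstructHistogramInner` replaces four nearly identical hand-written loops with one body whose `use_indices` / `use_prefetch` / `use_hessians` template flags are resolved at compile time, so every instantiation is as tight as the original specialized loop. The same trick on a trivial reduction — everything in this sketch is illustrative:

#include <cstdio>
#include <vector>

// One loop body, four compile-time variants: the bools are template
// parameters, so the compiler folds the branches away in each instance.
template <bool use_indices, bool use_weights>
double SumInner(const int* indices, int start, int end,
                const std::vector<double>& values,
                const std::vector<double>& weights) {
  double acc = 0.0;
  for (int i = start; i < end; ++i) {
    const int idx = use_indices ? indices[i] : i;
    acc += use_weights ? values[idx] * weights[idx] : values[idx];
  }
  return acc;
}

int main() {
  const std::vector<double> v = {1, 2, 3, 4}, w = {0.5, 0.5, 0.5, 0.5};
  const int idx[] = {3, 1};
  std::printf("%g %g\n",
              SumInner<false, false>(nullptr, 0, 4, v, w),  // 10
              SumInner<true, true>(idx, 0, 2, v, w));       // 3
}

The Dense4bitsBin diff below applies the identical refactor to the 4-bit-packed bin.
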
......@@ -16,7 +16,7 @@ namespace LightGBM {
class Dense4bitsBin;
class Dense4bitsBinIterator : public BinIterator {
public:
public:
explicit Dense4bitsBinIterator(const Dense4bitsBin* bin_data, uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin)
: bin_data_(bin_data), min_bin_(static_cast<uint8_t>(min_bin)),
max_bin_(static_cast<uint8_t>(max_bin)),
......@@ -31,7 +31,7 @@ class Dense4bitsBinIterator : public BinIterator {
inline uint32_t Get(data_size_t idx) override;
inline void Reset(data_size_t) override {}
private:
private:
const Dense4bitsBin* bin_data_;
uint8_t min_bin_;
uint8_t max_bin_;
......@@ -40,12 +40,12 @@ class Dense4bitsBinIterator : public BinIterator {
};
class Dense4bitsBin : public Bin {
public:
public:
friend Dense4bitsBinIterator;
explicit Dense4bitsBin(data_size_t num_data)
: num_data_(num_data) {
int len = (num_data_ + 1) / 2;
data_ = std::vector<uint8_t>(len, static_cast<uint8_t>(0));
data_.resize(len, static_cast<uint8_t>(0));
buf_ = std::vector<uint8_t>(len, static_cast<uint8_t>(0));
}
......@@ -73,88 +73,65 @@ class Dense4bitsBin : public Bin {
inline BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override;
void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians,
HistogramBinEntry* out) const override {
const data_size_t pf_offset = 64;
const data_size_t pf_end = end - pf_offset - kCacheLineSize;
#define ACC_GH(hist, i, g, h) \
const auto ti = (i) << 1; \
hist[ti] += g; \
hist[ti + 1] += h; \
template<bool use_indices, bool use_prefetch, bool use_hessians>
void ConstructHistogramInner(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians, hist_t* out) const {
data_size_t i = start;
for (; i < pf_end; i++) {
PREFETCH_T0(data_.data() + (data_indices[i + pf_offset] >> 1));
const data_size_t idx = data_indices[i];
const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
out[bin].sum_gradients += ordered_gradients[i];
out[bin].sum_hessians += ordered_hessians[i];
++out[bin].cnt;
if (use_prefetch) {
const data_size_t pf_offset = 64;
const data_size_t pf_end = end - pf_offset;
for (; i < pf_end; ++i) {
const auto idx = use_indices ? data_indices[i] : i;
const auto pf_idx = use_indices ? data_indices[i + pf_offset] : i + pf_offset;
PREFETCH_T0(data_.data() + (pf_idx >> 1));
const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
if (use_hessians) {
ACC_GH(out, bin, ordered_gradients[i], ordered_hessians[i]);
} else {
ACC_GH(out, bin, ordered_gradients[i], 1.0f);
}
}
}
for (; i < end; i++) {
const data_size_t idx = data_indices[i];
for (; i < end; ++i) {
const auto idx = use_indices ? data_indices[i] : i;
const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
out[bin].sum_gradients += ordered_gradients[i];
out[bin].sum_hessians += ordered_hessians[i];
++out[bin].cnt;
if (use_hessians) {
ACC_GH(out, bin, ordered_gradients[i], ordered_hessians[i]);
} else {
ACC_GH(out, bin, ordered_gradients[i], 1.0f);
}
}
}
#undef ACC_GH
void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians,
hist_t* out) const override {
ConstructHistogramInner<true, true, true>(data_indices, start, end, ordered_gradients, ordered_hessians, out);
}
void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians,
HistogramBinEntry* out) const override {
const data_size_t pf_offset = 64;
const data_size_t pf_end = end - pf_offset - kCacheLineSize;
data_size_t i = start;
for (; i < pf_end; i++) {
PREFETCH_T0(data_.data() + ((i + pf_offset) >> 1));
const auto bin = (data_[i >> 1] >> ((i & 1) << 2)) & 0xf;
out[bin].sum_gradients += ordered_gradients[i];
out[bin].sum_hessians += ordered_hessians[i];
++out[bin].cnt;
}
for (; i < end; i++) {
const auto bin = (data_[i >> 1] >> ((i & 1) << 2)) & 0xf;
out[bin].sum_gradients += ordered_gradients[i];
out[bin].sum_hessians += ordered_hessians[i];
++out[bin].cnt;
}
hist_t* out) const override {
ConstructHistogramInner<false, false, true>(nullptr, start, end, ordered_gradients, ordered_hessians, out);
}
void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients,
HistogramBinEntry* out) const override {
const data_size_t pf_offset = 64;
const data_size_t pf_end = end - pf_offset - kCacheLineSize;
data_size_t i = start;
for (; i < pf_end; i++) {
PREFETCH_T0(data_.data() + (data_indices[i + pf_offset] >> 1));
const data_size_t idx = data_indices[i];
const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
out[bin].sum_gradients += ordered_gradients[i];
++out[bin].cnt;
}
for (; i < end; i++) {
const data_size_t idx = data_indices[i];
const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
out[bin].sum_gradients += ordered_gradients[i];
++out[bin].cnt;
}
hist_t* out) const override {
ConstructHistogramInner<true, true, false>(data_indices, start, end, ordered_gradients, nullptr, out);
}
void ConstructHistogram(data_size_t start, data_size_t end,
                        const score_t* ordered_gradients,
                        hist_t* out) const override {
  ConstructHistogramInner<false, false, false>(nullptr, start, end, ordered_gradients, nullptr, out);
}
data_size_t Split(
......@@ -266,8 +243,6 @@ class Dense4bitsBin : public Bin {
data_size_t num_data() const override { return num_data_; }
/*! \brief no ordered bin for dense features */
OrderedBin* CreateOrderedBin() const override { return nullptr; }
void FinishLoad() override {
if (buf_.empty()) { return; }
......@@ -325,19 +300,20 @@ class Dense4bitsBin : public Bin {
}
size_t SizesInByte() const override {
  return sizeof(uint8_t) * data_.size();
}
Dense4bitsBin* Clone() override {
return new Dense4bitsBin(*this);
}
 protected:
  Dense4bitsBin(const Dense4bitsBin& other)
    : num_data_(other.num_data_), data_(other.data_), buf_(other.buf_) {
  }
data_size_t num_data_;
std::vector<uint8_t, Common::AlignmentAllocator<uint8_t, kAlignedSize>> data_;
std::vector<uint8_t> buf_;
};
......
/*!
* Copyright (c) 2020 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*/
#ifndef LIGHTGBM_IO_MULTI_VAL_DENSE_BIN_HPP_
#define LIGHTGBM_IO_MULTI_VAL_DENSE_BIN_HPP_
#include <LightGBM/bin.h>
#include <LightGBM/utils/openmp_wrapper.h>
#include <cstdint>
#include <cstring>
#include <vector>
namespace LightGBM {
template <typename VAL_T>
class MultiValDenseBin : public MultiValBin {
public:
explicit MultiValDenseBin(data_size_t num_data, int num_bin, int num_feature)
: num_data_(num_data), num_bin_(num_bin), num_feature_(num_feature) {
data_.resize(static_cast<size_t>(num_data_) * num_feature_, static_cast<VAL_T>(0));
}
~MultiValDenseBin() {
}
data_size_t num_data() const override {
return num_data_;
}
int num_bin() const override {
return num_bin_;
}
void PushOneRow(int , data_size_t idx, const std::vector<uint32_t>& values) override {
auto start = RowPtr(idx);
CHECK(num_feature_ == static_cast<int>(values.size()));
for (auto i = 0; i < num_feature_; ++i) {
data_[start + i] = static_cast<VAL_T>(values[i]);
}
}
void FinishLoad() override {
}
bool IsSparse() override {
return false;
}
void ReSize(data_size_t num_data) override {
if (num_data_ != num_data) {
num_data_ = num_data;
}
}
#define ACC_GH(hist, i, g, h) \
const auto ti = static_cast<int>(i) << 1; \
hist[ti] += g; \
hist[ti + 1] += h; \
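// hist_t buffers interleave per-bin sums: slot 2*i holds the gradient sum and
// slot 2*i + 1 the hessian sum of bin i, which is why ACC_GH shifts the bin
// index left by one.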
template<bool use_indices, bool use_prefetch, bool use_hessians>
void ConstructHistogramInner(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* gradients, const score_t* hessians, hist_t* out) const {
data_size_t i = start;
if (use_prefetch) {
const data_size_t pf_offset = 32 / sizeof(VAL_T);
const data_size_t pf_end = end - pf_offset;
for (; i < pf_end; ++i) {
const auto idx = use_indices ? data_indices[i] : i;
const auto pf_idx = use_indices ? data_indices[i + pf_offset] : i + pf_offset;
PREFETCH_T0(gradients + pf_idx);
if (use_hessians) {
PREFETCH_T0(hessians + pf_idx);
}
PREFETCH_T0(data_.data() + RowPtr(pf_idx));
const auto j_start = RowPtr(idx);
for (auto j = j_start; j < j_start + num_feature_; ++j) {
const VAL_T bin = data_[j];
if (use_hessians) {
ACC_GH(out, bin, gradients[idx], hessians[idx]);
} else {
ACC_GH(out, bin, gradients[idx], 1.0f);
}
}
}
}
for (; i < end; ++i) {
const auto idx = use_indices ? data_indices[i] : i;
const auto j_start = RowPtr(idx);
for (auto j = j_start; j < j_start + num_feature_; ++j) {
const VAL_T bin = data_[j];
if (use_hessians) {
ACC_GH(out, bin, gradients[idx], hessians[idx]);
} else {
ACC_GH(out, bin, gradients[idx], 1.0f);
}
}
}
}
#undef ACC_GH
void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* gradients, const score_t* hessians,
hist_t* out) const override {
ConstructHistogramInner<true, true, true>(data_indices, start, end, gradients, hessians, out);
}
void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* gradients, const score_t* hessians,
hist_t* out) const override {
ConstructHistogramInner<false, false, true>(nullptr, start, end, gradients, hessians, out);
}
void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* gradients,
hist_t* out) const override {
ConstructHistogramInner<true, true, false>(data_indices, start, end, gradients, nullptr, out);
}
void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* gradients,
hist_t* out) const override {
ConstructHistogramInner<false, false, false>(nullptr, start, end, gradients, nullptr, out);
}
void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override {
auto other_bin = dynamic_cast<const MultiValDenseBin<VAL_T>*>(full_bin);
data_.clear();
for (data_size_t i = 0; i < num_used_indices; ++i) {
for (int64_t j = other_bin->RowPtr(used_indices[i]); j < other_bin->RowPtr(used_indices[i] + 1); ++j) {
data_.push_back(other_bin->data_[j]);
}
}
}
inline int64_t RowPtr(data_size_t idx) const {
return static_cast<int64_t>(idx) * num_feature_;
}
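// Dense layout: every row stores exactly num_feature_ bin values back to
// back, so the row offset is idx * num_feature_ and no per-row index array is
// needed (contrast with the CSR-style row_ptr_ in MultiValSparseBin).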
MultiValDenseBin<VAL_T>* Clone() override;
private:
data_size_t num_data_;
int num_bin_;
int num_feature_;
std::vector<VAL_T, Common::AlignmentAllocator<VAL_T, 32>> data_;
MultiValDenseBin<VAL_T>(const MultiValDenseBin<VAL_T>& other)
: num_data_(other.num_data_), num_bin_(other.num_bin_), num_feature_(other.num_feature_), data_(other.data_) {
}
};
template<typename VAL_T>
MultiValDenseBin<VAL_T>* MultiValDenseBin<VAL_T>::Clone() {
return new MultiValDenseBin<VAL_T>(*this);
}
} // namespace LightGBM
#endif // LIGHTGBM_IO_MULTI_VAL_DENSE_BIN_HPP_
/*!
* Copyright (c) 2020 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*/
#ifndef LIGHTGBM_IO_MULTI_VAL_SPARSE_BIN_HPP_
#define LIGHTGBM_IO_MULTI_VAL_SPARSE_BIN_HPP_
#include <LightGBM/bin.h>
#include <LightGBM/utils/openmp_wrapper.h>
#include <cstdint>
#include <cstring>
#include <vector>
namespace LightGBM {
template <typename VAL_T>
class MultiValSparseBin : public MultiValBin {
public:
explicit MultiValSparseBin(data_size_t num_data, int num_bin)
: num_data_(num_data), num_bin_(num_bin) {
row_ptr_.resize(num_data_ + 1, 0);
data_.reserve(num_data_);
int num_threads = 1;
#pragma omp parallel
#pragma omp master
{
num_threads = omp_get_num_threads();
}
if (num_threads > 1) {
t_data_.resize(num_threads - 1);
}
}
~MultiValSparseBin() {
}
data_size_t num_data() const override {
return num_data_;
}
int num_bin() const override {
return num_bin_;
}
void PushOneRow(int tid, data_size_t idx, const std::vector<uint32_t>& values) override {
row_ptr_[idx + 1] = static_cast<data_size_t>(values.size());
if (tid == 0) {
for (auto val : values) {
data_.push_back(static_cast<VAL_T>(val));
}
} else {
for (auto val : values) {
t_data_[tid - 1].push_back(static_cast<VAL_T>(val));
}
}
}
void FinishLoad() override {
for (data_size_t i = 0; i < num_data_; ++i) {
row_ptr_[i + 1] += row_ptr_[i];
}
if (t_data_.size() > 0) {
size_t offset = data_.size();
data_.resize(row_ptr_[num_data_]);
for (size_t tid = 0; tid < t_data_.size(); ++tid) {
std::memcpy(data_.data() + offset, t_data_[tid].data(), t_data_[tid].size() * sizeof(VAL_T));
offset += t_data_[tid].size();
t_data_[tid].clear();
}
}
row_ptr_.shrink_to_fit();
data_.shrink_to_fit();
t_data_.clear();
t_data_.shrink_to_fit();
}
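// FinishLoad (above) turns the per-row counts accumulated in row_ptr_ into a
// CSR-style prefix sum, then appends each thread-local buffer behind the rows
// pushed by thread 0. This assumes the loader hands threads contiguous,
// in-order row blocks, so concatenating t_data_ in tid order preserves row
// order.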
bool IsSparse() override {
return true;
}
void ReSize(data_size_t num_data) override {
if (num_data_ != num_data) {
num_data_ = num_data;
}
}
#define ACC_GH(hist, i, g, h) \
const auto ti = static_cast<int>(i) << 1; \
hist[ti] += g; \
hist[ti + 1] += h; \
template<bool use_indices, bool use_prefetch, bool use_hessians>
void ConstructHistogramInner(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* gradients, const score_t* hessians, hist_t* out) const {
data_size_t i = start;
if (use_prefetch) {
const data_size_t pf_offset = 32 / sizeof(VAL_T);
const data_size_t pf_end = end - pf_offset;
for (; i < pf_end; ++i) {
const auto idx = use_indices ? data_indices[i] : i;
const auto pf_idx = use_indices ? data_indices[i + pf_offset] : i + pf_offset;
PREFETCH_T0(gradients + pf_idx);
if (use_hessians) {
PREFETCH_T0(hessians + pf_idx);
}
PREFETCH_T0(row_ptr_.data() + pf_idx);
PREFETCH_T0(data_.data() + row_ptr_[pf_idx]);
const auto j_start = RowPtr(idx);
const auto j_end = RowPtr(idx + 1);
for (auto j = j_start; j < j_end; ++j) {
const VAL_T bin = data_[j];
if (use_hessians) {
ACC_GH(out, bin, gradients[idx], hessians[idx]);
} else {
ACC_GH(out, bin, gradients[idx], 1.0f);
}
}
}
}
for (; i < end; ++i) {
const auto idx = use_indices ? data_indices[i] : i;
const auto j_start = RowPtr(idx);
const auto j_end = RowPtr(idx + 1);
for (auto j = j_start; j < j_end; ++j) {
const VAL_T bin = data_[j];
if (use_hessians) {
ACC_GH(out, bin, gradients[idx], hessians[idx]);
} else {
ACC_GH(out, bin, gradients[idx], 1.0f);
}
}
}
}
#undef ACC_GH
void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* gradients, const score_t* hessians,
hist_t* out) const override {
ConstructHistogramInner<true, true, true>(data_indices, start, end, gradients, hessians, out);
}
void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* gradients, const score_t* hessians,
hist_t* out) const override {
ConstructHistogramInner<false, false, true>(nullptr, start, end, gradients, hessians, out);
}
void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* gradients,
hist_t* out) const override {
ConstructHistogramInner<true, true, false>(data_indices, start, end, gradients, nullptr, out);
}
void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* gradients,
hist_t* out) const override {
ConstructHistogramInner<false, false, false>(nullptr, start, end, gradients, nullptr, out);
}
void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override {
auto other_bin = dynamic_cast<const MultiValSparseBin<VAL_T>*>(full_bin);
row_ptr_.resize(num_data_ + 1, 0);
data_.clear();
for (data_size_t i = 0; i < num_used_indices; ++i) {
for (data_size_t j = other_bin->row_ptr_[used_indices[i]]; j < other_bin->row_ptr_[used_indices[i] + 1]; ++j) {
data_.push_back(other_bin->data_[j]);
}
row_ptr_[i + 1] = row_ptr_[i] + other_bin->row_ptr_[used_indices[i] + 1] - other_bin->row_ptr_[used_indices[i]];
}
}
inline data_size_t RowPtr(data_size_t idx) const {
return row_ptr_[idx];
}
MultiValSparseBin<VAL_T>* Clone() override;
private:
data_size_t num_data_;
int num_bin_;
std::vector<VAL_T, Common::AlignmentAllocator<VAL_T, 32>> data_;
std::vector<data_size_t, Common::AlignmentAllocator<data_size_t, 32>> row_ptr_;
std::vector<std::vector<VAL_T>> t_data_;
MultiValSparseBin<VAL_T>(const MultiValSparseBin<VAL_T>& other)
: num_data_(other.num_data_), num_bin_(other.num_bin_), data_(other.data_), row_ptr_(other.row_ptr_) {
}
};
template<typename VAL_T>
MultiValSparseBin<VAL_T>* MultiValSparseBin<VAL_T>::Clone() {
return new MultiValSparseBin<VAL_T>(*this);
}
} // namespace LightGBM
#endif // LIGHTGBM_IO_MULTI_VAL_SPARSE_BIN_HPP_
/*!
* Copyright (c) 2016 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*/
#ifndef LIGHTGBM_IO_ORDERED_SPARSE_BIN_HPP_
#define LIGHTGBM_IO_ORDERED_SPARSE_BIN_HPP_
#include <LightGBM/bin.h>
#include <algorithm>
#include <cstdint>
#include <cstring>
#include <mutex>
#include <utility>
#include <vector>
#include "sparse_bin.hpp"
namespace LightGBM {
/*!
* \brief Interface for ordered bin data, efficient for histogram construction, especially for sparse bins.
* Using an ordered bin has two advantages:
* 1. data are grouped by leaf, which improves cache hits;
* 2. only non-zero bins are stored, which speeds up histogram construction for sparse features.
* However, it brings an additional cost: the bins must be re-ordered after every split, which is expensive for dense features.
* So ordered bins are only used in sparse situations.
*/
template <typename VAL_T>
class OrderedSparseBin: public OrderedBin {
public:
/*! \brief Pair to store one bin entry */
struct SparsePair {
data_size_t ridx; // data(row) index
VAL_T bin; // bin for this data
SparsePair() : ridx(0), bin(0) {}
};
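// Layout sketch: ordered_pair_ holds one SparsePair per non-zero row, kept
// contiguous per leaf, so a histogram pass over a leaf touches only
// leaf_cnt_[leaf] entries instead of scanning all num_data rows.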
explicit OrderedSparseBin(const SparseBin<VAL_T>* bin_data)
:bin_data_(bin_data) {
data_size_t cur_pos = 0;
data_size_t i_delta = -1;
int non_zero_cnt = 0;
while (bin_data_->NextNonzero(&i_delta, &cur_pos)) {
++non_zero_cnt;
}
ordered_pair_.resize(non_zero_cnt);
leaf_cnt_.push_back(non_zero_cnt);
}
~OrderedSparseBin() {
}
void Init(const char* used_indices, int num_leaves) override {
// initialize the leaf information
leaf_start_ = std::vector<data_size_t>(num_leaves, 0);
leaf_cnt_ = std::vector<data_size_t>(num_leaves, 0);
if (used_indices == nullptr) {
// if using all data, copy all non-zero pairs
data_size_t j = 0;
data_size_t cur_pos = 0;
data_size_t i_delta = -1;
while (bin_data_->NextNonzero(&i_delta, &cur_pos)) {
ordered_pair_[j].ridx = cur_pos;
ordered_pair_[j].bin = bin_data_->vals_[i_delta];
++j;
}
leaf_cnt_[0] = static_cast<data_size_t>(j);
} else {
// if using part of the data (bagging)
data_size_t j = 0;
data_size_t cur_pos = 0;
data_size_t i_delta = -1;
while (bin_data_->NextNonzero(&i_delta, &cur_pos)) {
if (used_indices[cur_pos]) {
ordered_pair_[j].ridx = cur_pos;
ordered_pair_[j].bin = bin_data_->vals_[i_delta];
++j;
}
}
leaf_cnt_[0] = j;
}
}
void ConstructHistogram(int leaf, const score_t* gradient, const score_t* hessian,
HistogramBinEntry* out) const override {
// get current leaf boundary
const data_size_t start = leaf_start_[leaf];
const data_size_t end = start + leaf_cnt_[leaf];
for (data_size_t i = start; i < end; ++i) {
const VAL_T bin = ordered_pair_[i].bin;
const auto g = gradient[ordered_pair_[i].ridx];
const auto h = hessian[ordered_pair_[i].ridx];
out[bin].sum_gradients += g;
out[bin].sum_hessians += h;
++out[bin].cnt;
}
}
void ConstructHistogram(int leaf, const score_t* gradient,
HistogramBinEntry* out) const override {
// get current leaf boundary
const data_size_t start = leaf_start_[leaf];
const data_size_t end = start + leaf_cnt_[leaf];
for (data_size_t i = start; i < end; ++i) {
const VAL_T bin = ordered_pair_[i].bin;
const auto g = gradient[ordered_pair_[i].ridx];
out[bin].sum_gradients += g;
++out[bin].cnt;
}
}
void Split(int leaf, int right_leaf, const char* is_in_leaf, char mark) override {
// get current leaf boundary
const data_size_t l_start = leaf_start_[leaf];
const data_size_t l_end = l_start + leaf_cnt_[leaf];
// new left leaf end after split
data_size_t new_left_end = l_start;
for (data_size_t i = l_start; i < l_end; ++i) {
if (is_in_leaf[ordered_pair_[i].ridx] == mark) {
std::swap(ordered_pair_[new_left_end], ordered_pair_[i]);
++new_left_end;
}
}
leaf_start_[right_leaf] = new_left_end;
leaf_cnt_[leaf] = new_left_end - l_start;
leaf_cnt_[right_leaf] = l_end - new_left_end;
}
data_size_t NonZeroCount(int leaf) const override {
return static_cast<data_size_t>(leaf_cnt_[leaf]);
}
/*! \brief Disable copy */
OrderedSparseBin<VAL_T>& operator=(const OrderedSparseBin<VAL_T>&) = delete;
/*! \brief Disable copy */
OrderedSparseBin<VAL_T>(const OrderedSparseBin<VAL_T>&) = delete;
private:
const SparseBin<VAL_T>* bin_data_;
/*! \brief Stores the non-zero pairs, grouped by leaf */
std::vector<SparsePair> ordered_pair_;
/*! \brief leaf_start_[i] is the start position of the i-th leaf's data in ordered_pair_ */
std::vector<data_size_t> leaf_start_;
/*! \brief leaf_cnt_[i] means number of data in i-th leaf */
std::vector<data_size_t> leaf_cnt_;
};
template <typename VAL_T>
OrderedBin* SparseBin<VAL_T>::CreateOrderedBin() const {
return new OrderedSparseBin<VAL_T>(this);
}
} // namespace LightGBM
#endif  // LIGHTGBM_IO_ORDERED_SPARSE_BIN_HPP_
......@@ -24,7 +24,7 @@ const size_t kNumFastIndex = 64;
template <typename VAL_T>
class SparseBinIterator: public BinIterator {
 public:
SparseBinIterator(const SparseBin<VAL_T>* bin_data,
uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin)
: bin_data_(bin_data), min_bin_(static_cast<VAL_T>(min_bin)),
......@@ -56,7 +56,7 @@ class SparseBinIterator: public BinIterator {
inline void Reset(data_size_t idx) override;
 private:
const SparseBin<VAL_T>* bin_data_;
data_size_t cur_pos_;
data_size_t i_delta_;
......@@ -66,20 +66,16 @@ class SparseBinIterator: public BinIterator {
uint8_t offset_;
};
template <typename VAL_T>
class OrderedSparseBin;
template <typename VAL_T>
class SparseBin: public Bin {
 public:
friend class SparseBinIterator<VAL_T>;
friend class OrderedSparseBin<VAL_T>;
explicit SparseBin(data_size_t num_data)
: num_data_(num_data) {
int num_threads = 1;
#pragma omp parallel
#pragma omp master
{
num_threads = omp_get_num_threads();
}
......@@ -102,41 +98,97 @@ class SparseBin: public Bin {
BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override;
#define ACC_GH(hist, i, g, h) \
const auto ti = static_cast<int>(i) << 1; \
hist[ti] += g; \
hist[ti + 1] += h; \
void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians,
hist_t* out) const override {
data_size_t i_delta, cur_pos;
InitIndex(data_indices[start], &i_delta, &cur_pos);
data_size_t i = start;
for (;;) {
if (cur_pos < data_indices[i]) {
cur_pos += deltas_[++i_delta];
if (i_delta >= num_vals_) { break; }
} else if (cur_pos > data_indices[i]) {
if (++i >= end) { break; }
} else {
const VAL_T bin = vals_[i_delta];
ACC_GH(out, bin, ordered_gradients[i], ordered_hessians[i]);
if (++i >= end) { break; }
cur_pos += deltas_[++i_delta];
if (i_delta >= num_vals_) { break; }
}
}
}
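// The loop above merge-joins the sorted data_indices window against the
// delta-encoded non-zero positions: cur_pos advances while it trails the
// current index, i advances while it leads, and ACC_GH accumulates only on an
// exact match (a non-zero bin for a row inside the leaf).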
void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians,
hist_t* out) const override {
data_size_t i_delta, cur_pos;
InitIndex(start, &i_delta, &cur_pos);
while (cur_pos < start && i_delta < num_vals_) {
cur_pos += deltas_[++i_delta];
}
while (cur_pos < end && i_delta < num_vals_) {
const VAL_T bin = vals_[i_delta];
ACC_GH(out, bin, ordered_gradients[cur_pos], ordered_hessians[cur_pos]);
cur_pos += deltas_[++i_delta];
}
}
void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients,
hist_t* out) const override {
data_size_t i_delta, cur_pos;
InitIndex(data_indices[start], &i_delta, &cur_pos);
data_size_t i = start;
for (;;) {
if (cur_pos < data_indices[i]) {
cur_pos += deltas_[++i_delta];
if (i_delta >= num_vals_) { break; }
} else if (cur_pos > data_indices[i]) {
if (++i >= end) { break; }
} else {
const VAL_T bin = vals_[i_delta];
ACC_GH(out, bin, ordered_gradients[i], 1.0f);
if (++i >= end) { break; }
cur_pos += deltas_[++i_delta];
if (i_delta >= num_vals_) { break; }
}
}
}
void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* ordered_gradients,
hist_t* out) const override {
data_size_t i_delta, cur_pos;
InitIndex(start, &i_delta, &cur_pos);
while (cur_pos < start && i_delta < num_vals_) {
cur_pos += deltas_[++i_delta];
}
while (cur_pos < end && i_delta < num_vals_) {
const VAL_T bin = vals_[i_delta];
ACC_GH(out, bin, ordered_gradients[cur_pos], 1.0f);
cur_pos += deltas_[++i_delta];
}
}
#undef ACC_GH
inline void NextNonzeroFast(data_size_t* i_delta,
                            data_size_t* cur_pos) const {
  *cur_pos += deltas_[++(*i_delta)];
  if (*i_delta >= num_vals_) {
    *cur_pos = num_data_;
  }
}
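// When i_delta runs past num_vals_, cur_pos is pinned to num_data_, an index
// no valid row can match, so callers' comparisons terminate naturally.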
inline bool NextNonzero(data_size_t* i_delta,
data_size_t* cur_pos) const {
*cur_pos += deltas_[++(*i_delta)];
if (*i_delta < num_vals_) {
return true;
} else {
......@@ -257,8 +309,6 @@ class SparseBin: public Bin {
data_size_t num_data() const override { return num_data_; }
OrderedBin* CreateOrderedBin() const override;
void FinishLoad() override {
// get total non zero size
size_t pair_cnt = 0;
......@@ -276,8 +326,8 @@ class SparseBin: public Bin {
// sort by data index
std::sort(idx_val_pairs.begin(), idx_val_pairs.end(),
[](const std::pair<data_size_t, VAL_T>& a, const std::pair<data_size_t, VAL_T>& b) {
return a.first < b.first;
});
return a.first < b.first;
});
// load delta array
LoadFromPair(idx_val_pairs);
}
......@@ -291,11 +341,12 @@ class SparseBin: public Bin {
const data_size_t cur_idx = idx_val_pairs[i].first;
const VAL_T bin = idx_val_pairs[i].second;
data_size_t cur_delta = cur_idx - last_idx;
// disallow multiple values in one row
if (i > 0 && cur_delta == 0) { continue; }
while (cur_delta >= 256) {
deltas_.push_back(255);
vals_.push_back(0);
cur_delta -= 255;
}
deltas_.push_back(static_cast<uint8_t>(cur_delta));
vals_.push_back(bin);
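// Example: a gap of 600 rows between consecutive non-zeros becomes three
// entries, (delta 255, val 0), (delta 255, val 0), (delta 90, bin), so each
// stored delta fits in one uint8_t.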
......@@ -384,7 +435,7 @@ class SparseBin: public Bin {
while (cur_pos < idx && j < num_vals_) {
NextNonzero(&j, &cur_pos);
}
if (cur_pos == idx && j < num_vals_ && vals_[j] > 0) {
// new row index is i
tmp_pair.emplace_back(i, vals_[j]);
}
......@@ -405,13 +456,13 @@ class SparseBin: public Bin {
// transform to delta array
data_size_t last_idx = 0;
for (data_size_t i = 0; i < num_used_indices; ++i) {
auto bin = iterator.InnerRawGet(used_indices[i]);
if (bin > 0) {
data_size_t cur_delta = i - last_idx;
while (cur_delta >= 256) {
deltas_.push_back(255);
vals_.push_back(0);
cur_delta -= 255;
}
deltas_.push_back(static_cast<uint8_t>(cur_delta));
vals_.push_back(bin);
......@@ -432,15 +483,29 @@ class SparseBin: public Bin {
SparseBin<VAL_T>* Clone() override;
protected:
SparseBin<VAL_T>(const SparseBin<VAL_T>& other)
: num_data_(other.num_data_), deltas_(other.deltas_), vals_(other.vals_),
num_vals_(other.num_vals_), push_buffers_(other.push_buffers_),
fast_index_(other.fast_index_), fast_index_shift_(other.fast_index_shift_) {
}
void InitIndex(data_size_t start_idx, data_size_t* i_delta, data_size_t* cur_pos) const {
auto idx = start_idx >> fast_index_shift_;
if (static_cast<size_t>(idx) < fast_index_.size()) {
const auto fast_pair = fast_index_[start_idx >> fast_index_shift_];
*i_delta = fast_pair.first;
*cur_pos = fast_pair.second;
} else {
*i_delta = -1;
*cur_pos = 0;
}
}
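// fast_index_ holds periodic (i_delta, cur_pos) checkpoints, one per
// (1 << fast_index_shift_) rows, so iteration can resume near start_idx
// instead of replaying the delta stream from the beginning.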
private:
data_size_t num_data_;
std::vector<uint8_t, Common::AlignmentAllocator<uint8_t, kAlignedSize>> deltas_;
std::vector<VAL_T, Common::AlignmentAllocator<VAL_T, kAlignedSize>> vals_;
data_size_t num_vals_;
std::vector<std::vector<std::pair<data_size_t, VAL_T>>> push_buffers_;
std::vector<std::pair<data_size_t, data_size_t>> fast_index_;
......@@ -460,7 +525,7 @@ inline uint32_t SparseBinIterator<VAL_T>::RawGet(data_size_t idx) {
template <typename VAL_T>
inline VAL_T SparseBinIterator<VAL_T>::InnerRawGet(data_size_t idx) {
while (cur_pos_ < idx) {
bin_data_->NextNonzeroFast(&i_delta_, &cur_pos_);
}
if (cur_pos_ == idx) {
return bin_data_->vals_[i_delta_];
......@@ -471,15 +536,7 @@ inline VAL_T SparseBinIterator<VAL_T>::InnerRawGet(data_size_t idx) {
template <typename VAL_T>
inline void SparseBinIterator<VAL_T>::Reset(data_size_t start_idx) {
bin_data_->InitIndex(start_idx, &i_delta_, &cur_pos_);
}
template <typename VAL_T>
......
......@@ -73,9 +73,9 @@ class RankXENDCG: public ObjectiveFunction {
// Skip query if sum of labels is 0.
float sum_labels = 0;
for (data_size_t i = 0; i < cnt; ++i) {
sum_labels += static_cast<float>(phi(label[i], gammas[i]));
}
if (std::fabs(sum_labels) < kEpsilon) {
return;
}
......@@ -111,7 +111,7 @@ class RankXENDCG: public ObjectiveFunction {
}
double phi(const label_t l, double g) const {
return Common::Pow(2, static_cast<int>(l)) - g;
}
const char* GetName() const override {
......
......@@ -27,7 +27,7 @@ void DataParallelTreeLearner<TREELEARNER_T>::Init(const Dataset* train_data, boo
rank_ = Network::rank();
num_machines_ = Network::num_machines();
// allocate buffer for communication
size_t buffer_size = this->train_data_->NumTotalBin() * KHistEntrySize;
input_buffer_.resize(buffer_size);
output_buffer_.resize(buffer_size);
......@@ -82,7 +82,7 @@ void DataParallelTreeLearner<TREELEARNER_T>::BeforeTrain() {
if (this->train_data_->FeatureBinMapper(fid)->GetMostFreqBin() == 0) {
num_bin -= 1;
}
block_len_[i] += num_bin * sizeof(HistogramBinEntry);
block_len_[i] += num_bin * KHistEntrySize;
}
reduce_scatter_size_ += block_len_[i];
}
......@@ -101,7 +101,7 @@ void DataParallelTreeLearner<TREELEARNER_T>::BeforeTrain() {
if (this->train_data_->FeatureBinMapper(fid)->GetMostFreqBin() == 0) {
num_bin -= 1;
}
bin_size += num_bin * sizeof(HistogramBinEntry);
bin_size += num_bin * KHistEntrySize;
}
}
......@@ -113,7 +113,7 @@ void DataParallelTreeLearner<TREELEARNER_T>::BeforeTrain() {
if (this->train_data_->FeatureBinMapper(fid)->GetMostFreqBin() == 0) {
num_bin -= 1;
}
bin_size += num_bin * sizeof(HistogramBinEntry);
bin_size += num_bin * KHistEntrySize;
}
// sync global data sumup info
......@@ -158,8 +158,8 @@ void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplits() {
this->smaller_leaf_histogram_array_[feature_index].SizeOfHistgram());
}
// Reduce scatter for histogram
Network::ReduceScatter(input_buffer_.data(), reduce_scatter_size_, sizeof(hist_t), block_start_.data(),
    block_len_.data(), output_buffer_.data(), static_cast<comm_size_t>(output_buffer_.size()), &HistogramSumReducer);
this->FindBestSplitsFromHistograms(this->is_feature_used_, true);
}
......@@ -186,7 +186,6 @@ void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(const
this->train_data_->FixHistogram(feature_index,
this->smaller_leaf_splits_->sum_gradients(), this->smaller_leaf_splits_->sum_hessians(),
GetGlobalDataCountInLeaf(this->smaller_leaf_splits_->LeafIndex()),
this->smaller_leaf_histogram_array_[feature_index].RawData());
SplitInfo smaller_split;
// find best threshold for smaller child
......
......@@ -108,58 +108,70 @@ class DataPartition {
* \param threshold thresholds used for the split
* \param right_leaf index of right leaf
*/
void Split(int leaf, const Dataset* dataset, int feature,
           const uint32_t* threshold, int num_threshold, bool default_left,
           int right_leaf) {
Common::FunctionTimer fun_timer("DataPartition::Split", global_timer);
const data_size_t min_inner_size = 512;
// get leaf boundary
const data_size_t begin = leaf_begin_[leaf];
const data_size_t cnt = leaf_count_[leaf];
const int nblock =
std::min(num_threads_, (cnt + min_inner_size - 1) / min_inner_size);
data_size_t inner_size = SIZE_ALIGNED((cnt + nblock - 1) / nblock);
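// Cap the number of blocks so each has at least min_inner_size rows, then
// round the block length with SIZE_ALIGNED (assumed to align it upward) so
// each block writes to an aligned region of the temp index buffers.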
auto left_start = indices_.data() + begin;
global_timer.Start("DataPartition::Split.MT");
// split data multi-threading
OMP_INIT_EX();
#pragma omp parallel for schedule(static, 1)
for (int i = 0; i < nblock; ++i) {
OMP_LOOP_EX_BEGIN();
data_size_t cur_start = i * inner_size;
data_size_t cur_cnt = std::min(inner_size, cnt - cur_start);
if (cur_cnt <= 0) {
left_cnts_buf_[i] = 0;
right_cnts_buf_[i] = 0;
continue;
}
// split data inner, reduce the times of function called
data_size_t cur_left_count =
dataset->Split(feature, threshold, num_threshold, default_left,
left_start + cur_start, cur_cnt,
temp_left_indices_.data() + cur_start,
temp_right_indices_.data() + cur_start);
offsets_buf_[i] = cur_start;
left_cnts_buf_[i] = cur_left_count;
right_cnts_buf_[i] = cur_cnt - cur_left_count;
OMP_LOOP_EX_END();
}
OMP_THROW_EX();
global_timer.Stop("DataPartition::Split.MT");
global_timer.Start("DataPartition::Split.Merge");
left_write_pos_buf_[0] = 0;
right_write_pos_buf_[0] = 0;
for (int i = 1; i < nblock; ++i) {
left_write_pos_buf_[i] =
left_write_pos_buf_[i - 1] + left_cnts_buf_[i - 1];
right_write_pos_buf_[i] =
right_write_pos_buf_[i - 1] + right_cnts_buf_[i - 1];
}
data_size_t left_cnt =
left_write_pos_buf_[nblock - 1] + left_cnts_buf_[nblock - 1];
auto right_start = left_start + left_cnt;
#pragma omp parallel for schedule(static)
for (int i = 0; i < nblock; ++i) {
std::copy_n(temp_left_indices_.data() + offsets_buf_[i],
left_cnts_buf_[i], left_start + left_write_pos_buf_[i]);
std::copy_n(temp_right_indices_.data() + offsets_buf_[i],
right_cnts_buf_[i], right_start + right_write_pos_buf_[i]);
}
// update leaf boundary
leaf_count_[leaf] = left_cnt;
leaf_begin_[right_leaf] = left_cnt + begin;
leaf_count_[right_leaf] = cnt - left_cnt;
global_timer.Stop("DataPartition::Split.Merge");
}
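// The split above runs in two phases: each block partitions its slice into
// temp_left_indices_/temp_right_indices_ and records per-block counts, then
// prefix sums over the counts give every block a disjoint write position, so
// the copy-back loop can run fully in parallel without synchronization.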
/*!
......@@ -201,11 +213,11 @@ class DataPartition {
/*! \brief number of data on one leaf */
std::vector<data_size_t> leaf_count_;
/*! \brief Store all data's indices, order by leaf[data_in_leaf0,..,data_leaf1,..] */
std::vector<data_size_t, Common::AlignmentAllocator<data_size_t, kAlignedSize>> indices_;
/*! \brief temp indices buffer for split */
std::vector<data_size_t, Common::AlignmentAllocator<data_size_t, kAlignedSize>> temp_left_indices_;
/*! \brief temp indices buffer for split */
std::vector<data_size_t, Common::AlignmentAllocator<data_size_t, kAlignedSize>> temp_right_indices_;
/*! \brief used data indices, used for bagging */
const data_size_t* used_data_indices_;
/*! \brief used data count, used for bagging */
......
......@@ -5,6 +5,7 @@
#ifndef LIGHTGBM_TREELEARNER_FEATURE_HISTOGRAM_HPP_
#define LIGHTGBM_TREELEARNER_FEATURE_HISTOGRAM_HPP_
#include <LightGBM/bin.h>
#include <LightGBM/dataset.h>
#include <LightGBM/utils/array_args.h>
......@@ -20,7 +21,7 @@
namespace LightGBM {
class FeatureMetainfo {
 public:
int num_bin;
MissingType missing_type;
int8_t offset = 0;
......@@ -35,7 +36,7 @@ class FeatureMetainfo {
* \brief FeatureHistogram is used to construct and store a histogram for a feature.
*/
class FeatureHistogram {
 public:
FeatureHistogram() {
data_ = nullptr;
}
......@@ -53,19 +54,19 @@ class FeatureHistogram {
* \param feature the feature data for this histogram
* \param min_num_data_one_leaf minimal number of data in one leaf
*/
void Init(hist_t* data, const FeatureMetainfo* meta) {
meta_ = meta;
data_ = data;
if (meta_->bin_type == BinType::NumericalBin) {
find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdNumerical, this, std::placeholders::_1
, std::placeholders::_2, std::placeholders::_3, std::placeholders::_4, std::placeholders::_5, std::placeholders::_6);
} else {
find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdCategorical, this, std::placeholders::_1
, std::placeholders::_2, std::placeholders::_3, std::placeholders::_4, std::placeholders::_5, std::placeholders::_6);
}
}
hist_t* RawData() {
return data_;
}
/*!
......@@ -73,15 +74,13 @@ class FeatureHistogram {
* \param other The histogram that want to subtract
*/
void Subtract(const FeatureHistogram& other) {
for (int i = 0; i < (meta_->num_bin - meta_->offset) * 2; ++i) {
  data_[i] -= other.data_[i];
}
}
void FindBestThreshold(double sum_gradient, double sum_hessian, data_size_t num_data, double min_constraint, double max_constraint,
SplitInfo* output) {
output->default_left = true;
output->gain = kMinScore;
find_best_threshold_fun_(sum_gradient, sum_hessian + 2 * kEpsilon, num_data, min_constraint, max_constraint, output);
......@@ -89,10 +88,10 @@ class FeatureHistogram {
}
void FindBestThresholdNumerical(double sum_gradient, double sum_hessian, data_size_t num_data, double min_constraint, double max_constraint,
SplitInfo* output) {
is_splittable_ = false;
double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian,
meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step);
double min_gain_shift = gain_shift + meta_->config->min_gain_to_split;
if (meta_->num_bin > 2 && meta_->missing_type != MissingType::None) {
if (meta_->missing_type == MissingType::Zero) {
......@@ -116,8 +115,8 @@ class FeatureHistogram {
}
void FindBestThresholdCategorical(double sum_gradient, double sum_hessian, data_size_t num_data,
double min_constraint, double max_constraint,
SplitInfo* output) {
output->default_left = false;
double best_gain = kMinScore;
data_size_t best_left_count = 0;
......@@ -134,25 +133,28 @@ class FeatureHistogram {
bool use_onehot = meta_->num_bin <= meta_->config->max_cat_to_onehot;
int best_threshold = -1;
int best_dir = 1;
const double cnt_factor = num_data / sum_hessian;
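// hist_t entries no longer carry per-bin counts; counts are reconstructed as
// RoundInt(hess * cnt_factor) with cnt_factor = num_data / sum_hessian. This
// is exact when all hessians are equal (e.g. unit hessians) and an estimate
// otherwise.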
if (use_onehot) {
for (int t = 0; t < used_bin; ++t) {
const auto grad = GET_GRAD(data_, t);
const auto hess = GET_HESS(data_, t);
data_size_t cnt = static_cast<data_size_t>(Common::RoundInt(hess * cnt_factor));
// if data not enough, or sum hessian too small
if (cnt < meta_->config->min_data_in_leaf
    || hess < meta_->config->min_sum_hessian_in_leaf) continue;
data_size_t other_count = num_data - cnt;
// if data not enough
if (other_count < meta_->config->min_data_in_leaf) continue;
double sum_other_hessian = sum_hessian - hess - kEpsilon;
// if sum hessian too small
if (sum_other_hessian < meta_->config->min_sum_hessian_in_leaf) continue;
double sum_other_gradient = sum_gradient - grad;
// current split gain
double current_gain = GetSplitGains(sum_other_gradient, sum_other_hessian, grad, hess + kEpsilon,
                                    meta_->config->lambda_l1, l2, meta_->config->max_delta_step,
                                    min_constraint, max_constraint, 0);
// gain with split is worse than without split
if (current_gain <= min_gain_shift) continue;
......@@ -161,15 +163,15 @@ class FeatureHistogram {
// better split point
if (current_gain > best_gain) {
best_threshold = t;
best_sum_left_gradient = grad;
best_sum_left_hessian = hess + kEpsilon;
best_left_count = cnt;
best_gain = current_gain;
}
}
} else {
for (int i = 0; i < used_bin; ++i) {
if (Common::RoundInt(GET_HESS(data_, i) * cnt_factor) >= meta_->config->cat_smooth) {
sorted_idx.push_back(i);
}
}
......@@ -181,9 +183,9 @@ class FeatureHistogram {
return (sum_grad) / (sum_hess + meta_->config->cat_smooth);
};
std::sort(sorted_idx.begin(), sorted_idx.end(),
[this, &ctr_fun](int i, int j) {
  return ctr_fun(GET_GRAD(data_, i), GET_HESS(data_, i)) < ctr_fun(GET_GRAD(data_, j), GET_HESS(data_, j));
});
std::vector<int> find_direction(1, 1);
std::vector<int> start_position(1, 0);
......@@ -203,14 +205,17 @@ class FeatureHistogram {
for (int i = 0; i < used_bin && i < max_num_cat; ++i) {
auto t = sorted_idx[start_pos];
start_pos += dir;
const auto grad = GET_GRAD(data_, t);
const auto hess = GET_HESS(data_, t);
data_size_t cnt = static_cast<data_size_t>(Common::RoundInt(hess * cnt_factor));
sum_left_gradient += grad;
sum_left_hessian += hess;
left_count += cnt;
cnt_cur_group += cnt;
if (left_count < meta_->config->min_data_in_leaf
|| sum_left_hessian < meta_->config->min_sum_hessian_in_leaf) continue;
data_size_t right_count = num_data - left_count;
if (right_count < meta_->config->min_data_in_leaf || right_count < min_data_per_group) break;
......@@ -223,8 +228,8 @@ class FeatureHistogram {
double sum_right_gradient = sum_gradient - sum_left_gradient;
double current_gain = GetSplitGains(sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian,
meta_->config->lambda_l1, l2, meta_->config->max_delta_step,
min_constraint, max_constraint, 0);
if (current_gain <= min_gain_shift) continue;
is_splittable_ = true;
if (current_gain > best_gain) {
......@@ -241,15 +246,15 @@ class FeatureHistogram {
if (is_splittable_) {
output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian,
meta_->config->lambda_l1, l2, meta_->config->max_delta_step,
min_constraint, max_constraint);
output->left_count = best_left_count;
output->left_sum_gradient = best_sum_left_gradient;
output->left_sum_hessian = best_sum_left_hessian - kEpsilon;
output->right_output = CalculateSplittedLeafOutput(sum_gradient - best_sum_left_gradient,
sum_hessian - best_sum_left_hessian,
meta_->config->lambda_l1, l2, meta_->config->max_delta_step,
min_constraint, max_constraint);
output->right_count = num_data - best_left_count;
output->right_sum_gradient = sum_gradient - best_sum_left_gradient;
output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon;
......@@ -279,22 +284,22 @@ class FeatureHistogram {
}
void GatherInfoForThreshold(double sum_gradient, double sum_hessian,
uint32_t threshold, data_size_t num_data, SplitInfo* output) {
if (meta_->bin_type == BinType::NumericalBin) {
GatherInfoForThresholdNumerical(sum_gradient, sum_hessian, threshold,
num_data, output);
} else {
GatherInfoForThresholdCategorical(sum_gradient, sum_hessian, threshold,
num_data, output);
}
}
void GatherInfoForThresholdNumerical(double sum_gradient, double sum_hessian,
uint32_t threshold, data_size_t num_data,
SplitInfo* output) {
double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian,
meta_->config->lambda_l1, meta_->config->lambda_l2,
meta_->config->max_delta_step);
double min_gain_shift = gain_shift + meta_->config->min_gain_to_split;
// do stuff here
......@@ -315,27 +320,29 @@ class FeatureHistogram {
int t = meta_->num_bin - 1 - offset - use_na_as_missing;
const int t_end = 1 - offset;
const double cnt_factor = num_data / sum_hessian;
// from right to left, and we don't need data in bin0
for (; t >= t_end; --t) {
if (static_cast<uint32_t>(t + offset) < threshold) { break; }
// need to skip default bin
if (skip_default_bin && (t + offset) == static_cast<int>(meta_->default_bin)) { continue; }
const auto grad = GET_GRAD(data_, t);
const auto hess = GET_HESS(data_, t);
data_size_t cnt = static_cast<data_size_t>(Common::RoundInt(hess * cnt_factor));
sum_right_gradient += grad;
sum_right_hessian += hess;
right_count += cnt;
}
double sum_left_gradient = sum_gradient - sum_right_gradient;
double sum_left_hessian = sum_hessian - sum_right_hessian;
data_size_t left_count = num_data - right_count;
double current_gain = GetLeafSplitGain(sum_left_gradient, sum_left_hessian,
meta_->config->lambda_l1, meta_->config->lambda_l2,
meta_->config->max_delta_step)
+ GetLeafSplitGain(sum_right_gradient, sum_right_hessian,
                   meta_->config->lambda_l1, meta_->config->lambda_l2,
                   meta_->config->max_delta_step);
// gain with split is worse than without split
if (std::isnan(current_gain) || current_gain <= min_gain_shift) {
......@@ -347,15 +354,15 @@ class FeatureHistogram {
// update split information
output->threshold = threshold;
output->left_output = CalculateSplittedLeafOutput(sum_left_gradient, sum_left_hessian,
meta_->config->lambda_l1, meta_->config->lambda_l2,
meta_->config->max_delta_step);
output->left_count = left_count;
output->left_sum_gradient = sum_left_gradient;
output->left_sum_hessian = sum_left_hessian - kEpsilon;
output->right_output = CalculateSplittedLeafOutput(sum_gradient - sum_left_gradient,
sum_hessian - sum_left_hessian,
meta_->config->lambda_l1, meta_->config->lambda_l2,
meta_->config->max_delta_step);
output->right_count = num_data - left_count;
output->right_sum_gradient = sum_gradient - sum_left_gradient;
output->right_sum_hessian = sum_hessian - sum_left_hessian - kEpsilon;
......@@ -365,13 +372,13 @@ class FeatureHistogram {
}
void GatherInfoForThresholdCategorical(double sum_gradient, double sum_hessian,
uint32_t threshold, data_size_t num_data, SplitInfo* output) {
// get SplitInfo for a given one-hot categorical split.
output->default_left = false;
double gain_shift = GetLeafSplitGain(
sum_gradient, sum_hessian,
meta_->config->lambda_l1, meta_->config->lambda_l2,
meta_->config->max_delta_step);
double min_gain_shift = gain_shift + meta_->config->min_gain_to_split;
bool is_full_categorical = meta_->missing_type == MissingType::None;
int used_bin = meta_->num_bin - 1 + is_full_categorical;
......@@ -380,21 +387,25 @@ class FeatureHistogram {
Log::Warning("Invalid categorical threshold split");
return;
}
const double cnt_factor = num_data / sum_hessian;
const auto grad = GET_GRAD(data_, threshold);
const auto hess = GET_HESS(data_, threshold);
data_size_t cnt = static_cast<data_size_t>(Common::RoundInt(hess * cnt_factor));
double l2 = meta_->config->lambda_l2;
data_size_t left_count = cnt;
data_size_t right_count = num_data - left_count;
double sum_left_hessian = hess + kEpsilon;
double sum_right_hessian = sum_hessian - sum_left_hessian;
double sum_left_gradient = grad;
double sum_right_gradient = sum_gradient - sum_left_gradient;
// current split gain
double current_gain = GetLeafSplitGain(sum_right_gradient, sum_right_hessian,
meta_->config->lambda_l1, l2,
meta_->config->max_delta_step)
+ GetLeafSplitGain(sum_left_gradient, sum_left_hessian,
                   meta_->config->lambda_l1, l2,
                   meta_->config->max_delta_step);
if (std::isnan(current_gain) || current_gain <= min_gain_shift) {
output->gain = kMinScore;
Log::Warning("'Forced Split' will be ignored since the gain is getting worse.");
......@@ -402,14 +413,14 @@ class FeatureHistogram {
}
output->left_output = CalculateSplittedLeafOutput(sum_left_gradient, sum_left_hessian,
meta_->config->lambda_l1, l2,
meta_->config->max_delta_step);
output->left_count = left_count;
output->left_sum_gradient = sum_left_gradient;
output->left_sum_hessian = sum_left_hessian - kEpsilon;
output->right_output = CalculateSplittedLeafOutput(sum_right_gradient, sum_right_hessian,
meta_->config->lambda_l1, l2,
meta_->config->max_delta_step);
output->right_count = right_count;
output->right_sum_gradient = sum_gradient - sum_left_gradient;
output->right_sum_hessian = sum_right_hessian - kEpsilon;
......@@ -423,14 +434,14 @@ class FeatureHistogram {
* \brief Binary size of this histogram
*/
int SizeOfHistgram() const {
return (meta_->num_bin - meta_->offset) * KHistEntrySize;
}
/*!
* \brief Restore histogram from memory
*/
void FromMemory(char* memory_data) {
std::memcpy(data_, memory_data, (meta_->num_bin - meta_->offset) * KHistEntrySize);
}
/*!
......@@ -457,11 +468,11 @@ class FeatureHistogram {
}
}
 private:
static double GetSplitGains(double sum_left_gradients, double sum_left_hessians,
double sum_right_gradients, double sum_right_hessians,
double l1, double l2, double max_delta_step,
double min_constraint, double max_constraint, int8_t monotone_constraint) {
double left_output = CalculateSplittedLeafOutput(sum_left_gradients, sum_left_hessians, l1, l2, max_delta_step, min_constraint, max_constraint);
double right_output = CalculateSplittedLeafOutput(sum_right_gradients, sum_right_hessians, l1, l2, max_delta_step, min_constraint, max_constraint);
if (((monotone_constraint > 0) && (left_output > right_output)) ||
......@@ -479,7 +490,7 @@ class FeatureHistogram {
* \return leaf output
*/
static double CalculateSplittedLeafOutput(double sum_gradients, double sum_hessians, double l1, double l2, double max_delta_step,
double min_constraint, double max_constraint) {
double ret = CalculateSplittedLeafOutput(sum_gradients, sum_hessians, l1, l2, max_delta_step);
if (ret < min_constraint) {
ret = min_constraint;
......@@ -506,7 +517,7 @@ class FeatureHistogram {
}
void FindBestThresholdSequence(double sum_gradient, double sum_hessian, data_size_t num_data, double min_constraint, double max_constraint,
double min_gain_shift, SplitInfo* output, int dir, bool skip_default_bin, bool use_na_as_missing) {
const int8_t offset = meta_->offset;
double best_sum_left_gradient = NAN;
......@@ -514,7 +525,7 @@ class FeatureHistogram {
double best_gain = kMinScore;
data_size_t best_left_count = 0;
uint32_t best_threshold = static_cast<uint32_t>(meta_->num_bin);
const double cnt_factor = num_data / sum_hessian;
if (dir == -1) {
double sum_right_gradient = 0.0f;
double sum_right_hessian = kEpsilon;
......@@ -528,12 +539,15 @@ class FeatureHistogram {
// need to skip default bin
if (skip_default_bin && (t + offset) == static_cast<int>(meta_->default_bin)) { continue; }
const auto grad = GET_GRAD(data_, t);
const auto hess = GET_HESS(data_, t);
data_size_t cnt = static_cast<data_size_t>(Common::RoundInt(hess * cnt_factor));
sum_right_gradient += grad;
sum_right_hessian += hess;
right_count += cnt;
// if data not enough, or sum hessian too small
if (right_count < meta_->config->min_data_in_leaf
|| sum_right_hessian < meta_->config->min_sum_hessian_in_leaf) continue;
data_size_t left_count = num_data - right_count;
// if data not enough
if (left_count < meta_->config->min_data_in_leaf) break;
......@@ -545,8 +559,8 @@ class FeatureHistogram {
double sum_left_gradient = sum_gradient - sum_right_gradient;
// current split gain
double current_gain = GetSplitGains(sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian,
meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step,
min_constraint, max_constraint, meta_->monotone_type);
// gain with split is worse than without split
if (current_gain <= min_gain_shift) continue;
......@@ -575,9 +589,12 @@ class FeatureHistogram {
sum_left_hessian = sum_hessian - kEpsilon;
left_count = num_data;
for (int i = 0; i < meta_->num_bin - offset; ++i) {
const auto grad = GET_GRAD(data_, i);
const auto hess = GET_HESS(data_, i);
data_size_t cnt = static_cast<data_size_t>(Common::RoundInt(hess * cnt_factor));
sum_left_gradient -= grad;
sum_left_hessian -= hess;
left_count -= cnt;
}
t = -1;
}
......@@ -586,13 +603,13 @@ class FeatureHistogram {
// need to skip default bin
if (skip_default_bin && (t + offset) == static_cast<int>(meta_->default_bin)) { continue; }
if (t >= 0) {
sum_left_gradient += GET_GRAD(data_, t);
sum_left_hessian += GET_HESS(data_, t);
left_count += static_cast<data_size_t>(Common::RoundInt(GET_HESS(data_, t) * cnt_factor));
}
// if data not enough, or sum hessian too small
if (left_count < meta_->config->min_data_in_leaf
|| sum_left_hessian < meta_->config->min_sum_hessian_in_leaf) continue;
data_size_t right_count = num_data - left_count;
// if data not enough
if (right_count < meta_->config->min_data_in_leaf) break;
......@@ -604,8 +621,8 @@ class FeatureHistogram {
double sum_right_gradient = sum_gradient - sum_left_gradient;
// current split gain
double current_gain = GetSplitGains(sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian,
meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step,
min_constraint, max_constraint, meta_->monotone_type);
// gain with split is worse than without split
if (current_gain <= min_gain_shift) continue;
......@@ -626,15 +643,15 @@ class FeatureHistogram {
// update split information
output->threshold = best_threshold;
output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian,
meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step,
min_constraint, max_constraint);
output->left_count = best_left_count;
output->left_sum_gradient = best_sum_left_gradient;
output->left_sum_hessian = best_sum_left_hessian - kEpsilon;
output->right_output = CalculateSplittedLeafOutput(sum_gradient - best_sum_left_gradient,
sum_hessian - best_sum_left_hessian,
meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step,
min_constraint, max_constraint);
output->right_count = num_data - best_left_count;
output->right_sum_gradient = sum_gradient - best_sum_left_gradient;
output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon;
......@@ -645,14 +662,13 @@ class FeatureHistogram {
const FeatureMetainfo* meta_;
/*! \brief interleaved per-bin sums of gradients and hessians */
hist_t* data_;
bool is_splittable_ = true;
std::function<void(double, double, data_size_t, double, double, SplitInfo*)> find_best_threshold_fun_;
};
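data_ is now a flat hist_t buffer with two consecutive slots per bin (gradient, then hessian) instead of an array of HistogramBinEntry structs; this is why sizes and offsets throughout the diff are multiplied by 2. A sketch of accessors consistent with that layout (the real GET_GRAD/GET_HESS macros may be spelled differently):

using hist_t = double;

// layout: [grad(bin 0), hess(bin 0), grad(bin 1), hess(bin 1), ...]
inline hist_t& GetGrad(hist_t* hist, int bin) { return hist[2 * bin]; }
inline hist_t& GetHess(hist_t* hist, int bin) { return hist[2 * bin + 1]; }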
class HistogramPool {
public:
public:
/*!
* \brief Constructor
*/
......@@ -698,7 +714,7 @@ class HistogramPool {
}
}
void DynamicChangeSize(const Dataset* train_data, const Config* config, int cache_size, int total_size) {
void DynamicChangeSize(const Dataset* train_data, bool is_hist_colwise, const Config* config, int cache_size, int total_size) {
if (feature_metas_.empty()) {
uint64_t bin_cnt_over_features = 0;
int num_feature = train_data->num_features();
......@@ -720,7 +736,6 @@ class HistogramPool {
}
Log::Info("Total Bins %d", static_cast<int>(bin_cnt_over_features));
}
uint64_t num_total_bin = train_data->NumTotalBin();
int old_cache_size = static_cast<int>(pool_.size());
Reset(cache_size, total_size);
......@@ -728,24 +743,39 @@ class HistogramPool {
pool_.resize(cache_size);
data_.resize(cache_size);
}
int num_total_bin = static_cast<int>(train_data->NumTotalBin());
std::vector<int> offsets;
if (is_hist_colwise) {
int offset = 0;
for (int j = 0; j < train_data->num_features(); ++j) {
offset += train_data->SubFeatureBinOffset(j);
offsets.push_back(offset);
auto num_bin = train_data->FeatureNumBin(j);
if (train_data->FeatureBinMapper(j)->GetMostFreqBin() == 0) {
num_bin -= 1;
}
offset += num_bin;
}
} else {
num_total_bin = 1;
for (int j = 0; j < train_data->num_features(); ++j) {
offsets.push_back(num_total_bin);
num_total_bin += train_data->FeatureBinMapper(j)->num_bin();
if (train_data->FeatureBinMapper(j)->GetMostFreqBin() == 0) {
num_total_bin -= 1;
}
}
}
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
for (int i = old_cache_size; i < cache_size; ++i) {
OMP_LOOP_EX_BEGIN();
pool_[i].reset(new FeatureHistogram[train_data->num_features()]);
data_[i].resize(num_total_bin);
uint64_t offset = 0;
data_[i].resize(num_total_bin * 2);
for (int j = 0; j < train_data->num_features(); ++j) {
offset += static_cast<uint64_t>(train_data->SubFeatureBinOffset(j));
pool_[i][j].Init(data_[i].data() + offset, &feature_metas_[j]);
auto num_bin = train_data->FeatureNumBin(j);
if (train_data->FeatureBinMapper(j)->GetMostFreqBin() == 0) {
num_bin -= 1;
}
offset += static_cast<uint64_t>(num_bin);
pool_[i][j].Init(data_[i].data() + offsets[j] * 2, &feature_metas_[j]);
}
CHECK(offset == num_total_bin);
OMP_LOOP_EX_END();
}
OMP_THROW_EX();
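The two offset branches above differ only in how features are packed into the shared buffer: the col-wise path builds offsets from the dataset's per-feature bin offsets, while the row-wise path repacks all features densely, reserving bin 0 and dropping each feature's most-frequent bin when it sits at index 0. A standalone sketch of the row-wise packing, with a hypothetical FeatureInfo in place of the Dataset accessors:

#include <vector>

struct FeatureInfo {
  int num_bin;
  bool most_freq_bin_is_zero;  // that bin is not materialized
};

// Row-wise packing: slot 0 of the shared buffer is reserved, then each
// feature gets a contiguous range; a feature whose most frequent bin is
// bin 0 contributes one bin less.
std::vector<int> RowWiseOffsets(const std::vector<FeatureInfo>& features,
                                int* num_total_bin_out) {
  std::vector<int> offsets;
  int num_total_bin = 1;
  for (const auto& f : features) {
    offsets.push_back(num_total_bin);
    num_total_bin += f.num_bin - (f.most_freq_bin_is_zero ? 1 : 0);
  }
  *num_total_bin_out = num_total_bin;
  return offsets;
}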
......@@ -816,9 +846,9 @@ class HistogramPool {
inverse_mapper_[slot] = dst_idx;
}
private:
private:
std::vector<std::unique_ptr<FeatureHistogram[]>> pool_;
std::vector<std::vector<HistogramBinEntry>> data_;
std::vector<std::vector<hist_t, Common::AlignmentAllocator<hist_t, kAlignedSize>>> data_;
std::vector<FeatureMetainfo> feature_metas_;
int cache_size_;
int total_size_;
......
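The pool's backing storage switches to Common::AlignmentAllocator so each histogram buffer starts on a kAlignedSize boundary, which helps vectorized accumulation. A minimal C++ allocator in the same spirit, assuming a power-of-two alignment on a POSIX system; this is a sketch, not the library's implementation:

#include <cstddef>
#include <cstdlib>
#include <new>
#include <vector>

template <typename T, std::size_t Alignment>
struct AlignedAllocator {
  using value_type = T;
  AlignedAllocator() = default;
  template <typename U>
  AlignedAllocator(const AlignedAllocator<U, Alignment>&) noexcept {}

  T* allocate(std::size_t n) {
    void* p = nullptr;
    // posix_memalign keeps this sketch simple; a cross-platform version
    // would dispatch to the platform's aligned-allocation primitive.
    if (posix_memalign(&p, Alignment, n * sizeof(T)) != 0) throw std::bad_alloc();
    return static_cast<T*>(p);
  }
  void deallocate(T* p, std::size_t) noexcept { std::free(p); }
};

template <typename T, typename U, std::size_t A>
bool operator==(const AlignedAllocator<T, A>&, const AlignedAllocator<U, A>&) { return true; }
template <typename T, typename U, std::size_t A>
bool operator!=(const AlignedAllocator<T, A>&, const AlignedAllocator<U, A>&) { return false; }

// usage, mirroring the pool's member:
// std::vector<double, AlignedAllocator<double, 32>> hist(num_total_bin * 2);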
......@@ -49,15 +49,15 @@ void GPUTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) {
// some functions used for debugging the GPU histogram construction
#if GPU_DEBUG > 0
void PrintHistograms(HistogramBinEntry* h, size_t size) {
size_t total = 0;
void PrintHistograms(hist_t* h, size_t size) {
double total_hess = 0;
for (size_t i = 0; i < size; ++i) {
printf("%03lu=%9.3g,%9.3g,%7d\t", i, h[i].sum_gradients, h[i].sum_hessians, h[i].cnt);
total += h[i].cnt;
if ((i & 3) == 3)
printf("%03lu=%9.3g,%9.3g\t", i, GET_GRAD(h, i), GET_HESS(h, i));
if ((i & 3) == 3)
printf("\n");
total_hess += GET_HESS(h, i);
}
printf("\nTotal examples: %lu\n", total);
printf("\nSum hessians: %9.3g\n", total_hess);
}
union Float_t {
......@@ -69,27 +69,23 @@ union Float_t {
};
void CompareHistograms(HistogramBinEntry* h1, HistogramBinEntry* h2, size_t size, int feature_id) {
void CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id) {
size_t i;
Float_t a, b;
for (i = 0; i < size; ++i) {
a.f = h1[i].sum_gradients;
b.f = h2[i].sum_gradients;
a.f = GET_GRAD(h1, i);
b.f = GET_GRAD(h2, i);
int32_t ulps = Float_t::ulp_diff(a, b);
if (fabs(h1[i].cnt - h2[i].cnt != 0)) {
printf("%d != %d\n", h1[i].cnt, h2[i].cnt);
goto err;
}
if (ulps > 0) {
// printf("grad %g != %g (%d ULPs)\n", h1[i].sum_gradients, h2[i].sum_gradients, ulps);
// printf("grad %g != %g (%d ULPs)\n", GET_GRAD(h1, i), GET_GRAD(h2, i), ulps);
// goto err;
}
a.f = h1[i].sum_hessians;
b.f = h2[i].sum_hessians;
a.f = GET_HESS(h1, i);
b.f = GET_HESS(h2, i);
ulps = Float_t::ulp_diff(a, b);
if (ulps > 0) {
// printf("hessian %g != %g (%d ULPs)\n", h1[i].sum_hessians, h2[i].sum_hessians, ulps);
// goto err;
if (std::fabs(a.f - b.f) >= 1e-20) {
printf("hessian %g != %g (%d ULPs)\n", GET_HESS(h1, i), GET_HESS(h2, i), ulps);
goto err;
}
}
return;
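CompareHistograms measures disagreement in units in the last place (ULPs) rather than with a fixed absolute tolerance, which scales naturally with magnitude. A self-contained sketch of the idea for single precision; the real Float_t union may differ, and memcpy would be the strictly conforming alternative to union type-punning:

#include <cstdint>

union FloatBits {
  float f;
  int32_t i;
};

// Number of representable floats between a and b. For same-signed,
// finite IEEE-754 values the difference of the bit patterns counts ULPs.
inline int32_t UlpDiff(FloatBits a, FloatBits b) {
  if ((a.i < 0) != (b.i < 0)) {
    // opposite signs: treat as equal only for +0.0f vs -0.0f
    return (a.f == b.f) ? 0 : INT32_MAX;
  }
  int32_t d = a.i - b.i;
  return d < 0 ? -d : d;
}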
......@@ -191,7 +187,7 @@ void GPUTreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featur
}
template <typename HistType>
void GPUTreeLearner::WaitAndGetHistograms(HistogramBinEntry* histograms) {
void GPUTreeLearner::WaitAndGetHistograms(hist_t* histograms) {
HistType* hist_outputs = reinterpret_cast<HistType*>(host_histogram_outputs_);
// when the output is ready, the computation is done
histograms_wait_obj_.wait();
......@@ -201,29 +197,25 @@ void GPUTreeLearner::WaitAndGetHistograms(HistogramBinEntry* histograms) {
continue;
}
int dense_group_index = dense_feature_group_map_[i];
auto old_histogram_array = histograms + train_data_->GroupBinBoundary(dense_group_index);
auto old_histogram_array = histograms + train_data_->GroupBinBoundary(dense_group_index) * 2;
int bin_size = train_data_->FeatureGroupNumBin(dense_group_index);
if (device_bin_mults_[i] == 1) {
for (int j = 0; j < bin_size; ++j) {
old_histogram_array[j].sum_gradients = hist_outputs[i * device_bin_size_+ j].sum_gradients;
old_histogram_array[j].sum_hessians = hist_outputs[i * device_bin_size_ + j].sum_hessians;
old_histogram_array[j].cnt = (data_size_t)hist_outputs[i * device_bin_size_ + j].cnt;
GET_GRAD(old_histogram_array, j) = GET_GRAD(hist_outputs, i * device_bin_size_+ j);
GET_HESS(old_histogram_array, j) = GET_HESS(hist_outputs, i * device_bin_size_+ j);
}
} else {
// values of this feature have been redistributed to multiple bins; need a reduction here
int ind = 0;
for (int j = 0; j < bin_size; ++j) {
double sum_g = 0.0, sum_h = 0.0;
size_t cnt = 0;
for (int k = 0; k < device_bin_mults_[i]; ++k) {
sum_g += hist_outputs[i * device_bin_size_+ ind].sum_gradients;
sum_h += hist_outputs[i * device_bin_size_+ ind].sum_hessians;
cnt += hist_outputs[i * device_bin_size_ + ind].cnt;
sum_g += GET_GRAD(hist_outputs, i * device_bin_size_+ ind);
sum_h += GET_HESS(hist_outputs, i * device_bin_size_+ ind);
ind++;
}
old_histogram_array[j].sum_gradients = sum_g;
old_histogram_array[j].sum_hessians = sum_h;
old_histogram_array[j].cnt = (data_size_t)cnt;
GET_GRAD(old_histogram_array, j) = sum_g;
GET_HESS(old_histogram_array, j) = sum_h;
}
}
}
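When device_bin_mults_[i] > 1, a feature's values were spread over several device bins to reduce atomic contention, and the else-branch above folds them back. The same fold, as a standalone sketch over the interleaved grad/hess layout:

using hist_t = double;

// Reduce a device histogram where each logical bin was split into
// `mult` device bins, writing grad/hess pairs into `out`.
void ReduceBinMultiples(const hist_t* dev, int bin_size, int mult, hist_t* out) {
  int ind = 0;
  for (int j = 0; j < bin_size; ++j) {
    double sum_g = 0.0, sum_h = 0.0;
    for (int k = 0; k < mult; ++k, ++ind) {
      sum_g += dev[2 * ind];      // gradient slot
      sum_h += dev[2 * ind + 1];  // hessian slot
    }
    out[2 * j] = sum_g;
    out[2 * j + 1] = sum_h;
  }
}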
......@@ -233,7 +225,7 @@ void GPUTreeLearner::WaitAndGetHistograms(HistogramBinEntry* histograms) {
void GPUTreeLearner::AllocateGPUMemory() {
num_dense_feature_groups_ = 0;
for (int i = 0; i < num_feature_groups_; ++i) {
if (ordered_bins_[i] == nullptr) {
if (!train_data_->IsMultiGroup(i)) {
num_dense_feature_groups_++;
}
}
......@@ -303,7 +295,7 @@ void GPUTreeLearner::AllocateGPUMemory() {
device_data_indices_ = std::unique_ptr<boost::compute::vector<data_size_t>>(new boost::compute::vector<data_size_t>(allocated_num_data_, ctx_));
boost::compute::fill(device_data_indices_->begin(), device_data_indices_->end(), 0, queue_);
// histogram bin entry size depends on the precision (single/double)
hist_bin_entry_sz_ = config_->gpu_use_dp ? sizeof(HistogramBinEntry) : sizeof(GPUHistogramBinEntry);
hist_bin_entry_sz_ = config_->gpu_use_dp ? sizeof(hist_t) * 2 : sizeof(gpu_hist_t) * 2;
Log::Info("Size of histogram bin entry: %d", hist_bin_entry_sz_);
// create output buffer, each feature has a histogram with device_bin_size_ bins,
// each work group generates a sub-histogram of dword_features_ features.
......@@ -326,7 +318,7 @@ void GPUTreeLearner::AllocateGPUMemory() {
std::vector<int> dense_dword_ind(dword_features_);
for (int i = 0; i < num_feature_groups_; ++i) {
// looking for dword_features_ non-sparse feature-groups
if (ordered_bins_[i] == nullptr) {
if (!train_data_->IsMultiGroup(i)) {
dense_dword_ind[k] = i;
// decide if we need to redistribute the bin
double t = device_bin_size_ / static_cast<double>(train_data_->FeatureGroupNumBin(i));
......@@ -682,6 +674,9 @@ void GPUTreeLearner::InitGPU(int platform_id, int device_id) {
printf("bin size: ");
#endif
for (int i = 0; i < num_feature_groups_; ++i) {
if (train_data_->IsMultiGroup(i)) {
continue;
}
#if GPU_DEBUG >= 1
printf("%d, ", train_data_->FeatureGroupNumBin(i));
#endif
......@@ -960,35 +955,34 @@ void GPUTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_u
for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
if (!is_feature_used_[feature_index]) continue;
if (!is_feature_used[feature_index]) continue;
if (ordered_bins_[train_data_->Feature2Group(feature_index)]) {
if (train_data_->IsMultiGroup(train_data_->Feature2Group(feature_index))) {
is_sparse_feature_used[feature_index] = 1;
} else {
is_dense_feature_used[feature_index] = 1;
}
}
// construct smaller leaf
HistogramBinEntry* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - 1;
hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - KHistOffset;
// ConstructGPUHistogramsAsync will return true if there are available feature groups dispatched to the GPU
bool is_gpu_used = ConstructGPUHistogramsAsync(is_feature_used,
nullptr, smaller_leaf_splits_->num_data_in_leaf(),
nullptr, nullptr,
nullptr, nullptr);
// then construct sparse features on CPU
// We set data_indices to null to avoid rebuilding ordered gradients/hessians
train_data_->ConstructHistograms(is_sparse_feature_used,
nullptr, smaller_leaf_splits_->num_data_in_leaf(),
smaller_leaf_splits_->LeafIndex(),
&ordered_bins_, gradients_, hessians_,
smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(),
gradients_, hessians_,
ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_,
multi_val_bin_.get(), is_hist_colwise_,
ptr_smaller_leaf_hist_data);
// wait for GPU to finish, only if GPU is actually used
if (is_gpu_used) {
if (config_->gpu_use_dp) {
// use double precision
WaitAndGetHistograms<HistogramBinEntry>(ptr_smaller_leaf_hist_data);
WaitAndGetHistograms<hist_t>(ptr_smaller_leaf_hist_data);
} else {
// use single precision
WaitAndGetHistograms<GPUHistogramBinEntry>(ptr_smaller_leaf_hist_data);
WaitAndGetHistograms<gpu_hist_t>(ptr_smaller_leaf_hist_data);
}
}
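The control flow above is a simple overlap pattern: dense feature groups are dispatched to the GPU asynchronously, sparse groups are built on the CPU while the GPU works, and the wait happens only if GPU work was actually queued. A schematic with hypothetical stand-ins for the learner's calls:

#include <cstdint>
#include <vector>

// Hypothetical stand-ins for the learner's members, for illustration only.
static bool LaunchGpuHistogramsAsync(const std::vector<int8_t>& dense_mask) {
  // returns true iff at least one dense group was dispatched
  for (int8_t used : dense_mask) if (used) return true;
  return false;
}
static void BuildCpuHistograms(const std::vector<int8_t>& /*sparse_mask*/) {
  // CPU-side histogram construction runs here, overlapping the GPU
}
static void WaitForGpuAndMerge() {
  // block on the GPU event and copy results into the leaf histogram
}

void ConstructBothSides(const std::vector<int8_t>& is_dense,
                        const std::vector<int8_t>& is_sparse) {
  bool gpu_used = LaunchGpuHistogramsAsync(is_dense);  // 1) non-blocking
  BuildCpuHistograms(is_sparse);                       // 2) overlap on CPU
  if (gpu_used) WaitForGpuAndMerge();                  // 3) wait only if needed
}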
......@@ -1000,48 +994,58 @@ void GPUTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_u
continue;
int dense_feature_group_index = dense_feature_group_map_[i];
size_t size = train_data_->FeatureGroupNumBin(dense_feature_group_index);
HistogramBinEntry* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - 1;
HistogramBinEntry* current_histogram = ptr_smaller_leaf_hist_data + train_data_->GroupBinBoundary(dense_feature_group_index);
HistogramBinEntry* gpu_histogram = new HistogramBinEntry[size];
hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - KHistOffset;
hist_t* current_histogram = ptr_smaller_leaf_hist_data + train_data_->GroupBinBoundary(dense_feature_group_index) * 2;
hist_t* gpu_histogram = new hist_t[size * 2];
data_size_t num_data = smaller_leaf_splits_->num_data_in_leaf();
printf("Comparing histogram for feature %d size %d, %lu bins\n", dense_feature_group_index, num_data, size);
std::copy(current_histogram, current_histogram + size, gpu_histogram);
std::memset(current_histogram, 0, train_data_->FeatureGroupNumBin(dense_feature_group_index) * sizeof(HistogramBinEntry));
train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram(
num_data != num_data_ ? smaller_leaf_splits_->data_indices() : nullptr,
num_data,
num_data != num_data_ ? ordered_gradients_.data() : gradients_,
num_data != num_data_ ? ordered_hessians_.data() : hessians_,
current_histogram);
std::copy(current_histogram, current_histogram + size * 2, gpu_histogram);
std::memset(current_histogram, 0, size * sizeof(hist_t) * 2);
if (train_data_->FeatureGroupBin(dense_feature_group_index) == nullptr) {
  delete [] gpu_histogram;  // avoid leaking the comparison buffer
  continue;
}
if (num_data != num_data_) {
train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram(
smaller_leaf_splits_->data_indices(),
0,
num_data,
ordered_gradients_.data(),
ordered_hessians_.data(),
current_histogram);
} else {
train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram(
0,
num_data,
gradients_,
hessians_,
current_histogram);
}
CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index);
std::copy(gpu_histogram, gpu_histogram + size, current_histogram);
std::copy(gpu_histogram, gpu_histogram + size * 2, current_histogram);
delete [] gpu_histogram;
}
#endif
if (larger_leaf_histogram_array_ != nullptr && !use_subtract) {
// construct larger leaf
HistogramBinEntry* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - 1;
hist_t* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - KHistOffset;
is_gpu_used = ConstructGPUHistogramsAsync(is_feature_used,
larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(),
gradients_, hessians_,
ordered_gradients_.data(), ordered_hessians_.data());
// then construct sparse features on CPU
// We set data_indices to null to avoid rebuilding ordered gradients/hessians
train_data_->ConstructHistograms(is_sparse_feature_used,
nullptr, larger_leaf_splits_->num_data_in_leaf(),
larger_leaf_splits_->LeafIndex(),
&ordered_bins_, gradients_, hessians_,
larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(),
gradients_, hessians_,
ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_,
multi_val_bin_.get(), is_hist_colwise_,
ptr_larger_leaf_hist_data);
// wait for GPU to finish, only if GPU is actually used
if (is_gpu_used) {
if (config_->gpu_use_dp) {
// use double precision
WaitAndGetHistograms<HistogramBinEntry>(ptr_larger_leaf_hist_data);
WaitAndGetHistograms<hist_t>(ptr_larger_leaf_hist_data);
} else {
// use single precision
WaitAndGetHistograms<GPUHistogramBinEntry>(ptr_larger_leaf_hist_data);
WaitAndGetHistograms<gpu_hist_t>(ptr_larger_leaf_hist_data);
}
}
}
......
......@@ -76,12 +76,7 @@ class GPUTreeLearner: public SerialTreeLearner {
uint8_t s[4];
};
/*! \brief Single precision histogram entry for GPU */
struct GPUHistogramBinEntry {
score_t sum_gradients;
score_t sum_hessians;
uint32_t cnt;
};
typedef float gpu_hist_t;
/*!
* \brief Find the best number of workgroups processing one feature for maximizing efficiency
......@@ -133,7 +128,7 @@ class GPUTreeLearner: public SerialTreeLearner {
* \param histograms Destination of histogram results from GPU.
*/
template <typename HistType>
void WaitAndGetHistograms(HistogramBinEntry* histograms);
void WaitAndGetHistograms(hist_t* histograms);
/*!
* \brief Construct GPU histogram asynchronously.
......
......@@ -163,7 +163,7 @@ R""()
void within_kernel_reduction16x8(uchar8 feature_mask,
__global const acc_type* restrict feature4_sub_hist,
const uint skip_id,
acc_type stat_val, uint cnt_val,
acc_type stat_val,
const ushort num_sub_hist,
__global acc_type* restrict output_buf,
__local acc_type * restrict local_hist) {
......@@ -181,33 +181,21 @@ void within_kernel_reduction16x8(uchar8 feature_mask,
// 256 threads working on 8 features' 16 bins, gradient and hessian
stat_val += *p;
p += NUM_BINS * DWORD_FEATURES * 2;
if (ltid < LOCAL_SIZE_0 / 2) {
cnt_val += as_acc_int_type(*p);
}
p += NUM_BINS * DWORD_FEATURES;
}
// skip the sub-histogram we already have in local memory
p += 3 * DWORD_FEATURES * NUM_BINS;
p += 2 * DWORD_FEATURES * NUM_BINS;
for (i = i + 1; i < num_sub_hist; ++i) {
stat_val += *p;
p += NUM_BINS * DWORD_FEATURES * 2;
if (ltid < LOCAL_SIZE_0 / 2) {
cnt_val += as_acc_int_type(*p);
}
p += NUM_BINS * DWORD_FEATURES;
}
#endif
// printf("thread %d:feature=%d, bin_id=%d, hessian=%d, stat_val=%f, cnt=%d", ltid, feature_id, bin_id, is_hessian_first, stat_val, cnt_val);
// now overwrite the local_hist for final reduction and output
// reverse the f7...f0 order to match the real order
feature_id = DWORD_FEATURES_MASK - feature_id;
local_hist[feature_id * 3 * NUM_BINS + bin_id * 3 + is_hessian_first] = stat_val;
bin_id = ltid >> (LOG2_DWORD_FEATURES); // range 0 - 16, for counter
if (ltid < LOCAL_SIZE_0 / 2) {
local_hist[feature_id * 3 * NUM_BINS + bin_id * 3 + 2] = as_acc_type((acc_int_type)cnt_val);
}
local_hist[feature_id * 2 * NUM_BINS + bin_id * 2 + is_hessian_first] = stat_val;
barrier(CLK_LOCAL_MEM_FENCE);
for (i = ltid; i < DWORD_FEATURES * 3 * NUM_BINS; i += lsize) {
for (i = ltid; i < DWORD_FEATURES * 2 * NUM_BINS; i += lsize) {
output_buf[i] = local_hist[i];
}
}
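With the counter slot gone, every (feature, bin) pair in local and output memory occupies two entries instead of three, which is the source of all the 3-to-2 stride changes in this kernel. The addressing, written out as a tiny helper:

// slot = 0 for gradient, 1 for hessian; num_bins = NUM_BINS per feature
inline int HistIndex(int feature_id, int bin_id, int slot, int num_bins) {
  return feature_id * 2 * num_bins + bin_id * 2 + slot;
}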
......@@ -335,7 +323,9 @@ __kernel void histogram16(__global const uchar4* feature_data_base,
bk7_c_f0_bin16 bk7_c_f1_bin16 bk7_c_f2_bin16 bk7_c_f3_bin16 bk7_c_f4_bin16 bk7_c_f5_bin16 bk7_c_f6_bin16 bk7_c_f7_bin0
-----------------------------------------------
*/
#if CONST_HESSIAN == 1
__local uint * cnt_hist = (__local uint *)(gh_hist + 2 * DWORD_FEATURES * NUM_BINS * NUM_BANKS);
#endif
// thread 0, 1, 2, 3, 4, 5, 6, 7 compute histograms for gradients first
// thread 8, 9, 10, 11, 12, 13, 14, 15 compute histograms for hessians first
......@@ -547,7 +537,7 @@ R""()
atomic_local_add_f(gh_hist + addr2, stat2);
#endif
}
#if CONST_HESSIAN == 1
// STAGE 3: accumulate counter
// there are 8 counters for 8 features
// thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 0, 1, 2, 3, 4, 5, 6, 7's counts for example 0, 1, 2, 3, 4, 5, 6, 7
......@@ -614,6 +604,7 @@ R""()
// printf("thread %x add counter %d feature %d (7)\n", ltid, bin, offset);
atom_inc(cnt_hist + addr);
}
#endif
stat1 = stat1_next;
stat2 = stat2_next;
feature4 = feature4_next;
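The counter histogram is now compiled only under CONST_HESSIAN == 1: when every example shares one hessian value, the kernel accumulates integer counts instead of per-example hessians, and the hessian sums can be reconstructed afterwards as count times the constant. A host-side sketch of that reconstruction, assuming the interleaved layout:

#include <cstdint>

using hist_t = double;

// With constant hessian h, the device only tracks integer bin counts;
// the per-bin hessian sums follow as cnt * h.
void FillHessiansFromCounts(const uint32_t* cnt_hist, int num_bins,
                            double const_hessian, hist_t* hist) {
  for (int b = 0; b < num_bins; ++b) {
    hist[2 * b + 1] = cnt_hist[b] * const_hessian;  // hessian slot
  }
}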
......@@ -642,6 +633,7 @@ R""()
ushort bank_id = (i + offset) & BANK_MASK;
stat_val += gh_hist[bin_id * HG_BIN_MULT + bank_id * 2 * DWORD_FEATURES + is_hessian_first * DWORD_FEATURES + feature_id];
}
#if CONST_HESSIAN == 1
if (ltid < LOCAL_SIZE_0 / 2) {
// first 128 threads accumulate the 8 * 16 = 128 counter values
bin_id = ltid >> LOG2_DWORD_FEATURES; // bits 3 - 6 range 0 - 16 is bin ID
......@@ -651,6 +643,7 @@ R""()
cnt_val += cnt_hist[bin_id * CNT_BIN_MULT + bank_id * DWORD_FEATURES + feature_id];
}
}
#endif
// now thread 0 - 7 holds feature 0 - 7's gradient for bin 0 and counter bin 0
// now thread 8 - 15 holds feature 0 - 7's hessian for bin 0 and counter bin 1
......@@ -687,7 +680,7 @@ R""()
// write to output
// write gradients and hessians histogram for all 4 features
// output data in linear order for further reduction
// output size = 4 (features) * 3 (counters) * 64 (bins) * sizeof(float)
// output size = 4 (features) * 2 (grad/hess values) * 64 (bins) * sizeof(float)
/* memory layout of output:
g_f0_bin0 g_f1_bin0 g_f2_bin0 g_f3_bin0 g_f4_bin0 g_f5_bin0 g_f6_bin0 g_f7_bin0
h_f0_bin0 h_f1_bin0 h_f2_bin0 h_f3_bin0 h_f4_bin0 h_f5_bin0 h_f6_bin0 h_f7_bin0
......@@ -705,14 +698,10 @@ R""()
// if there is only one workgroup processing this feature4, don't even need to write
uint feature4_id = (group_id >> POWER_FEATURE_WORKGROUPS);
#if POWER_FEATURE_WORKGROUPS != 0
__global acc_type * restrict output = (__global acc_type * restrict)output_buf + group_id * DWORD_FEATURES * 3 * NUM_BINS;
__global acc_type * restrict output = (__global acc_type * restrict)output_buf + group_id * DWORD_FEATURES * 2 * NUM_BINS;
// if g_val and h_val are double, they are converted to float here
// write gradients and hessians for 8 features
output[0 * DWORD_FEATURES * NUM_BINS + ltid] = stat_val;
// write counts for 8 features
if (ltid < LOCAL_SIZE_0 / 2) {
output[2 * DWORD_FEATURES * NUM_BINS + ltid] = as_acc_type((acc_int_type)cnt_val);
}
barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
mem_fence(CLK_GLOBAL_MEM_FENCE);
// To avoid the cost of an extra reduction kernel, we have to deal with some
......@@ -738,7 +727,7 @@ R""()
// This is done by using a global atomic counter.
// On AMD GPUs ideally this should be done in GDS,
// but currently there is no easy way to access it via OpenCL.
__local uint * counter_val = cnt_hist;
__local uint * counter_val = (__local uint *)(gh_hist + 2 * DWORD_FEATURES * NUM_BINS * NUM_BANKS);
if (ltid == 0) {
// all workgroups processing the same feature add this counter
*counter_val = atom_inc(sync_counters + feature4_id);
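This is the classic last-workgroup-reduces idiom: each workgroup publishes its sub-histogram to global memory, atomically increments a per-feature counter, and the group that receives the final ticket knows all others have finished and performs the reduction. The same idiom sketched in C++ with std::atomic, for illustration (the OpenCL version relies on atom_inc plus the fences above):

#include <atomic>
#include <vector>

// Each worker adds its partial histogram, then the last one to arrive
// reduces all partials into `out`. Returns true for the reducing worker.
bool PublishAndMaybeReduce(std::atomic<int>& sync_counter, int num_workers,
                           const std::vector<std::vector<double>>& partials,
                           std::vector<double>* out) {
  // fetch_add with acq_rel orders the publication of our partial
  // (written before this call) against the reducer's reads below.
  int ticket = sync_counter.fetch_add(1, std::memory_order_acq_rel);
  if (ticket != num_workers - 1) return false;
  for (const auto& p : partials) {
    for (std::size_t i = 0; i < out->size(); ++i) (*out)[i] += p[i];
  }
  return true;
}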
......@@ -762,12 +751,12 @@ R""()
// locate our feature4's block in output memory
uint output_offset = (feature4_id << POWER_FEATURE_WORKGROUPS);
__global acc_type const * restrict feature4_subhists =
(__global acc_type *)output_buf + output_offset * DWORD_FEATURES * 3 * NUM_BINS;
(__global acc_type *)output_buf + output_offset * DWORD_FEATURES * 2 * NUM_BINS;
// skip reading the data already in local memory
uint skip_id = group_id ^ output_offset;
// locate output histogram location for this feature4
__global acc_type* restrict hist_buf = hist_buf_base + feature4_id * DWORD_FEATURES * 3 * NUM_BINS;
within_kernel_reduction16x8(feature_mask, feature4_subhists, skip_id, stat_val, cnt_val,
__global acc_type* restrict hist_buf = hist_buf_base + feature4_id * DWORD_FEATURES * 2 * NUM_BINS;
within_kernel_reduction16x8(feature_mask, feature4_subhists, skip_id, stat_val,
1 << POWER_FEATURE_WORKGROUPS, hist_buf, (__local acc_type *)shared_array);
}
}
......@@ -776,4 +765,3 @@ R""()
// the +9 skips extra characters ")", newline, "#endif" and newline at the beginning
// )"" "\n#endif" + 9
#endif
......@@ -155,15 +155,6 @@ void within_kernel_reduction256x4(uchar4 feature_mask,
acc_type f1_hess_bin = local_hist[ltid * 8 + 5];
acc_type f2_hess_bin = local_hist[ltid * 8 + 6];
acc_type f3_hess_bin = local_hist[ltid * 8 + 7];
__local uint* restrict local_cnt = (__local uint *)(local_hist + 4 * 2 * NUM_BINS);
#if POWER_FEATURE_WORKGROUPS != 0
uint f0_cont_bin = ltid ? local_cnt[ltid * 4] : old_val_f0_cont_bin0;
#else
uint f0_cont_bin = local_cnt[ltid * 4];
#endif
uint f1_cont_bin = local_cnt[ltid * 4 + 1];
uint f2_cont_bin = local_cnt[ltid * 4 + 2];
uint f3_cont_bin = local_cnt[ltid * 4 + 3];
ushort i;
// printf("%d-pre(skip %d): %f %f %f %f %f %f %f %f %d %d %d %d", ltid, skip_id, f0_grad_bin, f1_grad_bin, f2_grad_bin, f3_grad_bin, f0_hess_bin, f1_hess_bin, f2_hess_bin, f3_hess_bin, f0_cont_bin, f1_cont_bin, f2_cont_bin, f3_cont_bin);
#if POWER_FEATURE_WORKGROUPS != 0
......@@ -173,70 +164,62 @@ void within_kernel_reduction256x4(uchar4 feature_mask,
if (feature_mask.s3) {
f0_grad_bin += *p; p += NUM_BINS;
f0_hess_bin += *p; p += NUM_BINS;
f0_cont_bin += as_acc_int_type(*p); p += NUM_BINS;
}
else {
p += 3 * NUM_BINS;
p += 2 * NUM_BINS;
}
if (feature_mask.s2) {
f1_grad_bin += *p; p += NUM_BINS;
f1_hess_bin += *p; p += NUM_BINS;
f1_cont_bin += as_acc_int_type(*p); p += NUM_BINS;
}
else {
p += 3 * NUM_BINS;
p += 2 * NUM_BINS;
}
if (feature_mask.s1) {
f2_grad_bin += *p; p += NUM_BINS;
f2_hess_bin += *p; p += NUM_BINS;
f2_cont_bin += as_acc_int_type(*p); p += NUM_BINS;
}
else {
p += 3 * NUM_BINS;
p += 2 * NUM_BINS;
}
if (feature_mask.s0) {
f3_grad_bin += *p; p += NUM_BINS;
f3_hess_bin += *p; p += NUM_BINS;
f3_cont_bin += as_acc_int_type(*p); p += NUM_BINS;
}
else {
p += 3 * NUM_BINS;
p += 2 * NUM_BINS;
}
}
// skip the sub-histogram we already have in local memory
p += 3 * 4 * NUM_BINS;
p += 2 * 4 * NUM_BINS;
for (i = i + 1; i < num_sub_hist; ++i) {
if (feature_mask.s3) {
f0_grad_bin += *p; p += NUM_BINS;
f0_hess_bin += *p; p += NUM_BINS;
f0_cont_bin += as_acc_int_type(*p); p += NUM_BINS;
}
else {
p += 3 * NUM_BINS;
p += 2 * NUM_BINS;
}
if (feature_mask.s2) {
f1_grad_bin += *p; p += NUM_BINS;
f1_hess_bin += *p; p += NUM_BINS;
f1_cont_bin += as_acc_int_type(*p); p += NUM_BINS;
}
else {
p += 3 * NUM_BINS;
p += 2 * NUM_BINS;
}
if (feature_mask.s1) {
f2_grad_bin += *p; p += NUM_BINS;
f2_hess_bin += *p; p += NUM_BINS;
f2_cont_bin += as_acc_int_type(*p); p += NUM_BINS;
}
else {
p += 3 * NUM_BINS;
p += 2 * NUM_BINS;
}
if (feature_mask.s0) {
f3_grad_bin += *p; p += NUM_BINS;
f3_hess_bin += *p; p += NUM_BINS;
f3_cont_bin += as_acc_int_type(*p); p += NUM_BINS;
}
else {
p += 3 * NUM_BINS;
p += 2 * NUM_BINS;
}
}
// printf("%d-aft: %f %f %f %f %f %f %f %f %d %d %d %d", ltid, f0_grad_bin, f1_grad_bin, f2_grad_bin, f3_grad_bin, f0_hess_bin, f1_hess_bin, f2_hess_bin, f3_hess_bin, f0_cont_bin, f1_cont_bin, f2_cont_bin, f3_cont_bin);
......@@ -245,18 +228,14 @@ void within_kernel_reduction256x4(uchar4 feature_mask,
barrier(CLK_LOCAL_MEM_FENCE);
#if USE_DP_FLOAT == 0
// reverse the f3...f0 order to match the real order
local_hist[0 * 3 * NUM_BINS + ltid * 3 + 0] = f3_grad_bin;
local_hist[0 * 3 * NUM_BINS + ltid * 3 + 1] = f3_hess_bin;
local_hist[0 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f3_cont_bin);
local_hist[1 * 3 * NUM_BINS + ltid * 3 + 0] = f2_grad_bin;
local_hist[1 * 3 * NUM_BINS + ltid * 3 + 1] = f2_hess_bin;
local_hist[1 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f2_cont_bin);
local_hist[2 * 3 * NUM_BINS + ltid * 3 + 0] = f1_grad_bin;
local_hist[2 * 3 * NUM_BINS + ltid * 3 + 1] = f1_hess_bin;
local_hist[2 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f1_cont_bin);
local_hist[3 * 3 * NUM_BINS + ltid * 3 + 0] = f0_grad_bin;
local_hist[3 * 3 * NUM_BINS + ltid * 3 + 1] = f0_hess_bin;
local_hist[3 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f0_cont_bin);
local_hist[0 * 2 * NUM_BINS + ltid * 2 + 0] = f3_grad_bin;
local_hist[0 * 2 * NUM_BINS + ltid * 2 + 1] = f3_hess_bin;
local_hist[1 * 2 * NUM_BINS + ltid * 2 + 0] = f2_grad_bin;
local_hist[1 * 2 * NUM_BINS + ltid * 2 + 1] = f2_hess_bin;
local_hist[2 * 2 * NUM_BINS + ltid * 2 + 0] = f1_grad_bin;
local_hist[2 * 2 * NUM_BINS + ltid * 2 + 1] = f1_hess_bin;
local_hist[3 * 2 * NUM_BINS + ltid * 2 + 0] = f0_grad_bin;
local_hist[3 * 2 * NUM_BINS + ltid * 2 + 1] = f0_hess_bin;
barrier(CLK_LOCAL_MEM_FENCE);
/*
for (ushort i = ltid; i < 4 * 3 * NUM_BINS; i += lsize) {
......@@ -267,34 +246,28 @@ void within_kernel_reduction256x4(uchar4 feature_mask,
if (feature_mask.s0) {
output_buf[i] = local_hist[i];
output_buf[i + NUM_BINS] = local_hist[i + NUM_BINS];
output_buf[i + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS];
}
i += 1 * 3 * NUM_BINS;
i += 1 * 2 * NUM_BINS;
if (feature_mask.s1) {
output_buf[i] = local_hist[i];
output_buf[i + NUM_BINS] = local_hist[i + NUM_BINS];
output_buf[i + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS];
}
i += 1 * 3 * NUM_BINS;
i += 1 * 2 * NUM_BINS;
if (feature_mask.s2) {
output_buf[i] = local_hist[i];
output_buf[i + NUM_BINS] = local_hist[i + NUM_BINS];
output_buf[i + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS];
}
i += 1 * 3 * NUM_BINS;
if (feature_mask.s3 && i < 4 * 3 * NUM_BINS) {
i += 1 * 2 * NUM_BINS;
if (feature_mask.s3 && i < 4 * 2 * NUM_BINS) {
output_buf[i] = local_hist[i];
output_buf[i + NUM_BINS] = local_hist[i + NUM_BINS];
output_buf[i + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS];
}
#else
// when double precision is used, we need to write twice, because local memory size is not enough
local_hist[0 * 3 * NUM_BINS + ltid * 3 + 0] = f3_grad_bin;
local_hist[0 * 3 * NUM_BINS + ltid * 3 + 1] = f3_hess_bin;
local_hist[0 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f3_cont_bin);
local_hist[1 * 3 * NUM_BINS + ltid * 3 + 0] = f2_grad_bin;
local_hist[1 * 3 * NUM_BINS + ltid * 3 + 1] = f2_hess_bin;
local_hist[1 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f2_cont_bin);
local_hist[0 * 2 * NUM_BINS + ltid * 2 + 0] = f3_grad_bin;
local_hist[0 * 2 * NUM_BINS + ltid * 2 + 1] = f3_hess_bin;
local_hist[1 * 2 * NUM_BINS + ltid * 2 + 0] = f2_grad_bin;
local_hist[1 * 2 * NUM_BINS + ltid * 2 + 1] = f2_hess_bin;
barrier(CLK_LOCAL_MEM_FENCE);
/*
for (ushort i = ltid; i < 2 * 3 * NUM_BINS; i += lsize) {
......@@ -305,21 +278,17 @@ void within_kernel_reduction256x4(uchar4 feature_mask,
if (feature_mask.s0) {
output_buf[i] = local_hist[i];
output_buf[i + NUM_BINS] = local_hist[i + NUM_BINS];
output_buf[i + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS];
}
i += 1 * 3 * NUM_BINS;
i += 1 * 2 * NUM_BINS;
if (feature_mask.s1) {
output_buf[i] = local_hist[i];
output_buf[i + NUM_BINS] = local_hist[i + NUM_BINS];
output_buf[i + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS];
}
barrier(CLK_LOCAL_MEM_FENCE);
local_hist[0 * 3 * NUM_BINS + ltid * 3 + 0] = f1_grad_bin;
local_hist[0 * 3 * NUM_BINS + ltid * 3 + 1] = f1_hess_bin;
local_hist[0 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f1_cont_bin);
local_hist[1 * 3 * NUM_BINS + ltid * 3 + 0] = f0_grad_bin;
local_hist[1 * 3 * NUM_BINS + ltid * 3 + 1] = f0_hess_bin;
local_hist[1 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f0_cont_bin);
local_hist[0 * 2 * NUM_BINS + ltid * 2 + 0] = f1_grad_bin;
local_hist[0 * 2 * NUM_BINS + ltid * 2 + 1] = f1_hess_bin;
local_hist[1 * 2 * NUM_BINS + ltid * 2 + 0] = f0_grad_bin;
local_hist[1 * 2 * NUM_BINS + ltid * 2 + 1] = f0_hess_bin;
barrier(CLK_LOCAL_MEM_FENCE);
/*
for (ushort i = ltid; i < 2 * 3 * NUM_BINS; i += lsize) {
......@@ -328,15 +297,13 @@ void within_kernel_reduction256x4(uchar4 feature_mask,
*/
i = ltid;
if (feature_mask.s2) {
output_buf[i + 2 * 3 * NUM_BINS] = local_hist[i];
output_buf[i + 2 * 3 * NUM_BINS + NUM_BINS] = local_hist[i + NUM_BINS];
output_buf[i + 2 * 3 * NUM_BINS + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS];
output_buf[i + 2 * 2 * NUM_BINS] = local_hist[i];
output_buf[i + 2 * 2 * NUM_BINS + NUM_BINS] = local_hist[i + NUM_BINS];
}
i += 1 * 3 * NUM_BINS;
i += 1 * 2 * NUM_BINS;
if (feature_mask.s3) {
output_buf[i + 2 * 3 * NUM_BINS] = local_hist[i];
output_buf[i + 2 * 3 * NUM_BINS + NUM_BINS] = local_hist[i + NUM_BINS];
output_buf[i + 2 * 3 * NUM_BINS + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS];
output_buf[i + 2 * 2 * NUM_BINS] = local_hist[i];
output_buf[i + 2 * 2 * NUM_BINS + NUM_BINS] = local_hist[i + NUM_BINS];
}
#endif
}
......@@ -401,7 +368,9 @@ __kernel void histogram256(__global const uchar4* feature_data_base,
__local acc_type * gh_hist = (__local acc_type *)shared_array;
// counter histogram
// total size: 4 * 256 * sizeof(uint) = 4 KB
#if CONST_HESSIAN == 1
__local uint * cnt_hist = (__local uint *)(gh_hist + 2 * 4 * NUM_BINS);
#endif
// thread 0, 1, 2, 3 compute histograms for gradients first
// thread 4, 5, 6, 7 compute histograms for hessians first
......@@ -602,7 +571,7 @@ R""()
s0_stat1 += stat1;
s0_stat2 += stat2;
}
#if CONST_HESSIAN == 1
// STAGE 3: accumulate counter
// there are 4 counters for 4 features
// thread 0, 1, 2, 3 now process feature 0, 1, 2, 3's counts for example 0, 1, 2, 3
......@@ -633,6 +602,7 @@ R""()
addr = bin * 4 + offset;
atom_inc(cnt_hist + addr);
}
#endif
stat1 = stat1_next;
stat2 = stat2_next;
feature4 = feature4_next;
......@@ -741,7 +711,7 @@ R""()
uint feature4_id = (group_id >> POWER_FEATURE_WORKGROUPS);
// if there is only one workgroup processing this feature4, don't even need to write
#if POWER_FEATURE_WORKGROUPS != 0
__global acc_type * restrict output = (__global acc_type * restrict)output_buf + group_id * 4 * 3 * NUM_BINS;
__global acc_type * restrict output = (__global acc_type * restrict)output_buf + group_id * 4 * 2 * NUM_BINS;
// write gradients and hessians
__global acc_type * restrict ptr_f = output;
for (ushort j = 0; j < 4; ++j) {
......@@ -751,17 +721,7 @@ R""()
acc_type value = gh_hist[i * 4 + j];
ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value;
}
ptr_f += 3 * NUM_BINS;
}
// write counts
__global acc_int_type * restrict ptr_i = (__global acc_int_type * restrict)(output + 2 * NUM_BINS);
for (ushort j = 0; j < 4; ++j) {
for (ushort i = ltid; i < NUM_BINS; i += lsize) {
// FIXME: 2-way bank conflict
uint value = cnt_hist[i * 4 + j];
ptr_i[i] = value;
}
ptr_i += 3 * NUM_BINS;
ptr_f += 2 * NUM_BINS;
}
barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
mem_fence(CLK_GLOBAL_MEM_FENCE);
......@@ -788,7 +748,7 @@ R""()
// This is done by using a global atomic counter.
// On AMD GPUs ideally this should be done in GDS,
// but currently there is no easy way to access it via OpenCL.
__local uint * counter_val = cnt_hist;
__local uint * counter_val = (__local uint *)(gh_hist + 2 * 4 * NUM_BINS);
// backup the old value
uint old_val = *counter_val;
if (ltid == 0) {
......@@ -814,11 +774,11 @@ R""()
// locate our feature4's block in output memory
uint output_offset = (feature4_id << POWER_FEATURE_WORKGROUPS);
__global acc_type const * restrict feature4_subhists =
(__global acc_type *)output_buf + output_offset * 4 * 3 * NUM_BINS;
(__global acc_type *)output_buf + output_offset * 4 * 2 * NUM_BINS;
// skip reading the data already in local memory
uint skip_id = group_id ^ output_offset;
// locate output histogram location for this feature4
__global acc_type* restrict hist_buf = hist_buf_base + feature4_id * 4 * 3 * NUM_BINS;
__global acc_type* restrict hist_buf = hist_buf_base + feature4_id * 4 * 2 * NUM_BINS;
within_kernel_reduction256x4(feature_mask, feature4_subhists, skip_id, old_val, 1 << POWER_FEATURE_WORKGROUPS,
hist_buf, (__local acc_type *)shared_array);
// if (ltid == 0)
......