Unverified Commit 509c2e50 authored by Guolin Ke, committed by GitHub

Support both row-wise and col-wise multi-threading (#2699)
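For context on the change itself: col-wise multi-threading parallelizes histogram construction over features (one thread owns one feature's histogram), while row-wise parallelizes over data rows, with each thread filling a private copy of all histograms that is merged at the end. A minimal conceptual sketch of the two loops (names are illustrative, not LightGBM's internals):

#include <omp.h>

#include <cstddef>
#include <cstdint>
#include <vector>

struct HistEntry {
  double grad_sum = 0.0;
  double hess_sum = 0.0;
};

// Col-wise: one thread per feature; each thread owns that feature's whole
// histogram, so bins need no synchronization, but the gradient/hessian
// arrays are re-read once per feature.
void ConstructColWise(const std::vector<std::vector<uint8_t>>& bins,
                      const std::vector<float>& grad,
                      const std::vector<float>& hess,
                      std::vector<std::vector<HistEntry>>* hists) {
  const int num_features = static_cast<int>(bins.size());
  #pragma omp parallel for schedule(static)
  for (int f = 0; f < num_features; ++f) {
    auto& hist = (*hists)[f];
    for (size_t i = 0; i < bins[f].size(); ++i) {
      hist[bins[f][i]].grad_sum += grad[i];
      hist[bins[f][i]].hess_sum += hess[i];
    }
  }
}

// Row-wise: threads split the rows and fill thread-local histogram copies,
// merged afterwards; each row is read once (cache friendly) at the cost of
// num_threads extra histogram copies.
void ConstructRowWise(const std::vector<std::vector<uint8_t>>& bins,
                      const std::vector<float>& grad,
                      const std::vector<float>& hess, int num_bins,
                      std::vector<std::vector<HistEntry>>* hists) {
  const int num_features = static_cast<int>(bins.size());
  const int num_data = static_cast<int>(grad.size());
  const int num_threads = omp_get_max_threads();
  std::vector<std::vector<HistEntry>> local(
      static_cast<size_t>(num_threads) * num_features,
      std::vector<HistEntry>(num_bins));
  #pragma omp parallel
  {
    const int tid = omp_get_thread_num();
    #pragma omp for schedule(static)
    for (int i = 0; i < num_data; ++i) {
      for (int f = 0; f < num_features; ++f) {
        HistEntry& e = local[tid * num_features + f][bins[f][i]];
        e.grad_sum += grad[i];
        e.hess_sum += hess[i];
      }
    }
  }
  for (int t = 0; t < num_threads; ++t) {  // merge thread-local copies
    for (int f = 0; f < num_features; ++f) {
      for (int b = 0; b < num_bins; ++b) {
        (*hists)[f][b].grad_sum += local[t * num_features + f][b].grad_sum;
        (*hists)[f][b].hess_sum += local[t * num_features + f][b].hess_sum;
      }
    }
  }
}

Which strategy wins depends on the shape of the data, hence the timing-based "auto choose logic" refined in the commits below.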



* commit

* fix a bug

* fix bug

* reset to track changes

* refine the auto choose logic

* sort the time stats output

* fix include

* change  multi_val_bin_sparse_threshold

* add cmake

* add _mm_malloc and _mm_free for cross platform

* fix cmake bug

* timer for split

* try to fix cmake

* fix tests

* refactor DataPartition::Split

* fix test

* typo

* formating

* Revert "formating"

This reverts commit 5b8de4f7fb9d975ee23701d276a66d40ee6d4222.

* add document

* [R-package] Added tests on use of force_col_wise and force_row_wise in training (#2719)

* naming

* fix gpu code

* Update include/LightGBM/bin.h
Co-Authored-By: James Lamb <jaylamb20@gmail.com>

* Update src/treelearner/ocl/histogram16.cl

* test: swap compilers for CI

* fix omp

* not avx2

* no aligned for feature histogram

* Revert "refactor DataPartition::Split"

This reverts commit 256e6d9641ade966a1f54da1752e998a1149b6f8.

* slightly refactor data partition

* reduce the memory cost
Co-authored-by: James Lamb <jaylamb20@gmail.com>
Co-authored-by: Nikita Titov <nekit94-08@mail.ru>
parent bc7bc4a1
......@@ -157,7 +157,7 @@ R""()
void within_kernel_reduction64x4(uchar4 feature_mask,
__global const acc_type* restrict feature4_sub_hist,
const uint skip_id,
acc_type g_val, acc_type h_val, uint cnt_val,
acc_type g_val, acc_type h_val,
const ushort num_sub_hist,
__global acc_type* restrict output_buf,
__local acc_type * restrict local_hist) {
......@@ -173,38 +173,35 @@ void within_kernel_reduction64x4(uchar4 feature_mask,
for (i = 0; i < skip_id; ++i) {
g_val += *p; p += NUM_BINS * 4; // 256 threads working on 4 features' 64 bins
h_val += *p; p += NUM_BINS * 4;
cnt_val += as_acc_int_type(*p); p += NUM_BINS * 4;
}
// skip the counters we already have
p += 3 * 4 * NUM_BINS;
p += 2 * 4 * NUM_BINS;
for (i = i + 1; i < num_sub_hist; ++i) {
g_val += *p; p += NUM_BINS * 4;
h_val += *p; p += NUM_BINS * 4;
cnt_val += as_acc_int_type(*p); p += NUM_BINS * 4;
}
#endif
// printf("thread %d: g_val=%f, h_val=%f cnt=%d", ltid, g_val, h_val, cnt_val);
// now overwrite the local_hist for final reduction and output
// reverse the f3...f0 order to match the real order
feature_id = 3 - feature_id;
local_hist[feature_id * 3 * NUM_BINS + bin_id * 3 + 0] = g_val;
local_hist[feature_id * 3 * NUM_BINS + bin_id * 3 + 1] = h_val;
local_hist[feature_id * 3 * NUM_BINS + bin_id * 3 + 2] = as_acc_type((acc_int_type)cnt_val);
local_hist[feature_id * 2 * NUM_BINS + bin_id * 2 + 0] = g_val;
local_hist[feature_id * 2 * NUM_BINS + bin_id * 2 + 1] = h_val;
barrier(CLK_LOCAL_MEM_FENCE);
i = ltid;
if (feature_mask.s0 && i < 1 * 3 * NUM_BINS) {
if (feature_mask.s0 && i < 1 * 2 * NUM_BINS) {
output_buf[i] = local_hist[i];
}
i += 1 * 3 * NUM_BINS;
if (feature_mask.s1 && i < 2 * 3 * NUM_BINS) {
i += 1 * 2 * NUM_BINS;
if (feature_mask.s1 && i < 2 * 2 * NUM_BINS) {
output_buf[i] = local_hist[i];
}
i += 1 * 3 * NUM_BINS;
if (feature_mask.s2 && i < 3 * 3 * NUM_BINS) {
i += 1 * 2 * NUM_BINS;
if (feature_mask.s2 && i < 3 * 2 * NUM_BINS) {
output_buf[i] = local_hist[i];
}
i += 1 * 3 * NUM_BINS;
if (feature_mask.s3 && i < 4 * 3 * NUM_BINS) {
i += 1 * 2 * NUM_BINS;
if (feature_mask.s3 && i < 4 * 2 * NUM_BINS) {
output_buf[i] = local_hist[i];
}
}
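The 3-to-2 stride changes in this reduction come from dropping the per-bin count slot: after this PR a histogram bin carries only a gradient sum and a hessian sum. A sketch of the resulting layout (illustrative; hist_t and kHistEntrySize match the names used elsewhere in this diff):

#include <cstddef>

typedef double hist_t;

// two interleaved slots per bin: [g0, h0, g1, h1, ...]
const size_t kHistEntrySize = 2 * sizeof(hist_t);

inline hist_t GradSum(const hist_t* hist, int bin) { return hist[2 * bin]; }
inline hist_t HessSum(const hist_t* hist, int bin) { return hist[2 * bin + 1]; }

inline void AddToBin(hist_t* hist, int bin, hist_t g, hist_t h) {
  hist[2 * bin] += g;
  hist[2 * bin + 1] += h;
}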
......@@ -306,7 +303,9 @@ __kernel void histogram64(__global const uchar4* feature_data_base,
bk3_c_f0_bin64 bk3_c_f1_bin64 bk3_c_f2_bin64 bk3_c_f3_bin64
-----------------------------------------------
*/
#if CONST_HESSIAN == 1
__local uint * cnt_hist = (__local uint *)(gh_hist + 2 * 4 * NUM_BINS * NUM_BANKS);
#endif
// thread 0, 1, 2, 3 compute histograms for gradients first
// thread 4, 5, 6, 7 compute histograms for hessians first
......@@ -509,7 +508,7 @@ R""()
s0_stat1 += stat1;
s0_stat2 += stat2;
}
#if CONST_HESSIAN == 1
// STAGE 3: accumulate counter
// there are 4 counters for 4 features
// thread 0, 1, 2, 3 now process feature 0, 1, 2, 3's counts for example 0, 1, 2, 3
......@@ -540,6 +539,7 @@ R""()
addr = bin * CNT_BIN_MULT + bank * 4 + offset;
atom_inc(cnt_hist + addr);
}
#endif
stat1 = stat1_next;
stat2 = stat2_next;
feature4 = feature4_next;
......@@ -639,7 +639,9 @@ R""()
ushort bank_id = (i + offset) & BANK_MASK;
g_val += gh_hist[bin_id * HG_BIN_MULT + bank_id * 8 + feature_id];
h_val += gh_hist[bin_id * HG_BIN_MULT + bank_id * 8 + feature_id + 4];
#if CONST_HESSIAN == 1
cnt_val += cnt_hist[bin_id * CNT_BIN_MULT + bank_id * 4 + feature_id];
#endif
}
// now thread 0 - 3 holds feature 0, 1, 2, 3's gradient, hessian and count bin 0
// now thread 4 - 7 holds feature 0, 1, 2, 3's gradient, hessian and count bin 1
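A note on the CONST_HESSIAN guards added throughout this kernel: when every row has the same hessian, a bin's hessian sum is just its count times that constant, so the kernel keeps cheap integer counters only in that case and skips count bookkeeping entirely otherwise. Illustrative sketch, not kernel code:

#include <cstdint>

inline double HessSumFromCount(uint32_t cnt, double const_hessian) {
  // integer atomics on counts are cheaper than floating-point atomics
  return static_cast<double>(cnt) * const_hessian;
}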
......@@ -670,14 +672,12 @@ R""()
// if there is only one workgroup processing this feature4, don't even need to write
uint feature4_id = (group_id >> POWER_FEATURE_WORKGROUPS);
#if POWER_FEATURE_WORKGROUPS != 0
__global acc_type * restrict output = (__global acc_type * restrict)output_buf + group_id * 4 * 3 * NUM_BINS;
__global acc_type * restrict output = (__global acc_type * restrict)output_buf + group_id * 4 * 2 * NUM_BINS;
// if g_val and h_val are double, they are converted to float here
// write gradients for 4 features
output[0 * 4 * NUM_BINS + ltid] = g_val;
// write hessians for 4 features
output[1 * 4 * NUM_BINS + ltid] = h_val;
// write counts for 4 features
output[2 * 4 * NUM_BINS + ltid] = as_acc_type((acc_int_type)cnt_val);
barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
mem_fence(CLK_GLOBAL_MEM_FENCE);
// To avoid the cost of an extra reduction kernel, we have to deal with some
......@@ -703,7 +703,7 @@ R""()
// This is done by using a global atomic counter.
// On AMD GPUs ideally this should be done in GDS,
// but currently there is no easy way to access it via OpenCL.
__local uint * counter_val = cnt_hist;
__local uint * counter_val = (__local uint *)(gh_hist + 2 * 4 * NUM_BINS * NUM_BANKS);
if (ltid == 0) {
// all workgroups processing the same feature add this counter
*counter_val = atom_inc(sync_counters + feature4_id);
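The comment block above describes a standard device-wide pattern: every workgroup publishes its partial histogram, increments a global atomic counter, and whichever group observes the final value performs the reduction for the whole feature. A host-side C++ analogue of the idea (illustrative, assumed names):

#include <atomic>

std::atomic<unsigned> sync_counter{0};

// Call after a group's partial results are globally visible; only the last
// of num_groups callers runs reduce_all. fetch_add returns the old value.
void OnGroupDone(unsigned num_groups, void (*reduce_all)()) {
  if (sync_counter.fetch_add(1) == num_groups - 1) {
    reduce_all();           // the last finisher owns the final reduction
    sync_counter.store(0);  // reset for the next round
  }
}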
......@@ -727,12 +727,12 @@ R""()
// locate our feature4's block in output memory
uint output_offset = (feature4_id << POWER_FEATURE_WORKGROUPS);
__global acc_type const * restrict feature4_subhists =
(__global acc_type *)output_buf + output_offset * 4 * 3 * NUM_BINS;
(__global acc_type *)output_buf + output_offset * 4 * 2 * NUM_BINS;
// skip reading the data already in local memory
uint skip_id = group_id ^ output_offset;
// locate output histogram location for this feature4
__global acc_type* restrict hist_buf = hist_buf_base + feature4_id * 4 * 3 * NUM_BINS;
within_kernel_reduction64x4(feature_mask, feature4_subhists, skip_id, g_val, h_val, cnt_val,
__global acc_type* restrict hist_buf = hist_buf_base + feature4_id * 4 * 2 * NUM_BINS;
within_kernel_reduction64x4(feature_mask, feature4_subhists, skip_id, g_val, h_val,
1 << POWER_FEATURE_WORKGROUPS, hist_buf, (__local acc_type *)shared_array);
}
}
......
......@@ -181,8 +181,8 @@ class VotingParallelTreeLearner: public TREELEARNER_T {
/*! \brief Store global histogram for larger leaf */
std::unique_ptr<FeatureHistogram[]> larger_leaf_histogram_array_global_;
std::vector<HistogramBinEntry> smaller_leaf_histogram_data_;
std::vector<HistogramBinEntry> larger_leaf_histogram_data_;
std::vector<hist_t> smaller_leaf_histogram_data_;
std::vector<hist_t> larger_leaf_histogram_data_;
std::vector<FeatureMetainfo> feature_metas_;
};
......
......@@ -18,14 +18,6 @@
namespace LightGBM {
#ifdef TIMETAG
std::chrono::duration<double, std::milli> init_train_time;
std::chrono::duration<double, std::milli> init_split_time;
std::chrono::duration<double, std::milli> hist_time;
std::chrono::duration<double, std::milli> find_split_time;
std::chrono::duration<double, std::milli> split_time;
std::chrono::duration<double, std::milli> ordered_bin_time;
#endif // TIMETAG
SerialTreeLearner::SerialTreeLearner(const Config* config)
:config_(config) {
......@@ -38,14 +30,7 @@ SerialTreeLearner::SerialTreeLearner(const Config* config)
}
SerialTreeLearner::~SerialTreeLearner() {
#ifdef TIMETAG
Log::Info("SerialTreeLearner::init_train costs %f", init_train_time * 1e-3);
Log::Info("SerialTreeLearner::init_split costs %f", init_split_time * 1e-3);
Log::Info("SerialTreeLearner::hist_build costs %f", hist_time * 1e-3);
Log::Info("SerialTreeLearner::find_split costs %f", find_split_time * 1e-3);
Log::Info("SerialTreeLearner::split costs %f", split_time * 1e-3);
Log::Info("SerialTreeLearner::ordered_bin costs %f", ordered_bin_time * 1e-3);
#endif
}
void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) {
......@@ -60,7 +45,7 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian
} else {
size_t total_histogram_size = 0;
for (int i = 0; i < train_data_->num_features(); ++i) {
total_histogram_size += sizeof(HistogramBinEntry) * train_data_->FeatureNumBin(i);
total_histogram_size += kHistEntrySize * train_data_->FeatureNumBin(i);
}
max_cache_size = static_cast<int>(config_->histogram_pool_size * 1024 * 1024 / total_histogram_size);
}
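To make the sizing above concrete, a small worked sketch (numbers illustrative; with double hist_t, kHistEntrySize is 2 * sizeof(double) = 16 bytes):

#include <algorithm>
#include <cstddef>

int MaxCacheSize(double pool_size_mb, int num_features, int bins_per_feature,
                 int num_leaves) {
  const size_t kHistEntrySize = 2 * sizeof(double);
  const size_t total_histogram_size =
      kHistEntrySize * static_cast<size_t>(num_features) * bins_per_feature;
  int max_cache_size =
      static_cast<int>(pool_size_mb * 1024 * 1024 / total_histogram_size);
  max_cache_size = std::max(2, max_cache_size);  // need at least 2 leaves
  return std::min(max_cache_size, num_leaves);
}
// e.g. MaxCacheSize(64.0, 100, 255, 31): 64 MiB / ~408 KB per cached leaf
// is about 164 leaves, then clamped down to num_leaves = 31.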
......@@ -68,19 +53,10 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian
max_cache_size = std::max(2, max_cache_size);
max_cache_size = std::min(max_cache_size, config_->num_leaves);
histogram_pool_.DynamicChangeSize(train_data_, config_, max_cache_size, config_->num_leaves);
// push split information for all leaves
best_split_per_leaf_.resize(config_->num_leaves);
// get ordered bin
train_data_->CreateOrderedBins(&ordered_bins_);
// check existing for ordered bin
for (int i = 0; i < static_cast<int>(ordered_bins_.size()); ++i) {
if (ordered_bins_[i] != nullptr) {
has_ordered_bin_ = true;
break;
}
}
// initialize splits for leaf
smaller_leaf_splits_.reset(new LeafSplits(train_data_->num_data()));
larger_leaf_splits_.reset(new LeafSplits(train_data_->num_data()));
......@@ -92,17 +68,10 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian
// initialize ordered gradients and hessians
ordered_gradients_.resize(num_data_);
ordered_hessians_.resize(num_data_);
// if has ordered bin, need to allocate a buffer to fast split
if (has_ordered_bin_) {
is_data_in_leaf_.resize(num_data_);
std::fill(is_data_in_leaf_.begin(), is_data_in_leaf_.end(), static_cast<char>(0));
ordered_bin_indices_.clear();
for (int i = 0; i < static_cast<int>(ordered_bins_.size()); i++) {
if (ordered_bins_[i] != nullptr) {
ordered_bin_indices_.push_back(i);
}
}
}
GetMultiValBin(train_data_, true);
histogram_pool_.DynamicChangeSize(train_data_, is_hist_colwise_, config_, max_cache_size, config_->num_leaves);
Log::Info("Number of data points in the train set: %d, number of used features: %d", num_data_, num_features_);
if (CostEfficientGradientBoosting::IsEnable(config_)) {
cegb_.reset(new CostEfficientGradientBoosting(this));
......@@ -110,14 +79,23 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian
}
}
void SerialTreeLearner::GetMultiValBin(const Dataset* dataset, bool is_first_time) {
if (is_first_time) {
auto used_feature = GetUsedFeatures(true);
multi_val_bin_.reset(dataset->TestMultiThreadingMethod(ordered_gradients_.data(), ordered_hessians_.data(), used_feature,
is_constant_hessian_, config_->force_col_wise, config_->force_row_wise, &is_hist_colwise_));
} else {
// cannot change is_hist_colwise_ during training
multi_val_bin_.reset(dataset->TestMultiThreadingMethod(ordered_gradients_.data(), ordered_hessians_.data(), is_feature_used_,
is_constant_hessian_, is_hist_colwise_, !is_hist_colwise_, &is_hist_colwise_));
}
}
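GetMultiValBin is where the new auto-selection lives: on the first call, Dataset::TestMultiThreadingMethod may try both layouts and pin is_hist_colwise_ for the rest of training; on later calls the force flags are filled from the pinned value, since the histogram layout cannot change mid-training. A rough sketch of the selection contract (helper names assumed, not the actual implementation):

#include <chrono>

enum class HistMethod { kColWise, kRowWise };

template <typename F>
double TimeMs(F&& fn) {
  const auto t0 = std::chrono::steady_clock::now();
  fn();
  const std::chrono::duration<double, std::milli> d =
      std::chrono::steady_clock::now() - t0;
  return d.count();
}

// Honor the force flags if set; otherwise time one histogram construction
// with each method and keep the faster one.
template <typename FCol, typename FRow>
HistMethod ChooseMethod(bool force_col, bool force_row,
                        FCol&& col_trial, FRow&& row_trial) {
  if (force_col) return HistMethod::kColWise;
  if (force_row) return HistMethod::kRowWise;
  return TimeMs(col_trial) <= TimeMs(row_trial) ? HistMethod::kColWise
                                                : HistMethod::kRowWise;
}
// usage: ChooseMethod(false, false, [&] { BuildColWise(); },
//                     [&] { BuildRowWise(); });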
void SerialTreeLearner::ResetTrainingData(const Dataset* train_data) {
train_data_ = train_data;
num_data_ = train_data_->num_data();
CHECK(num_features_ == train_data_->num_features());
// get ordered bin
train_data_->CreateOrderedBins(&ordered_bins_);
// initialize splits for leaf
smaller_leaf_splits_->ResetNumData(num_data_);
larger_leaf_splits_->ResetNumData(num_data_);
......@@ -125,14 +103,12 @@ void SerialTreeLearner::ResetTrainingData(const Dataset* train_data) {
// initialize data partition
data_partition_->ResetNumData(num_data_);
GetMultiValBin(train_data_, false);
// initialize ordered gradients and hessians
ordered_gradients_.resize(num_data_);
ordered_hessians_.resize(num_data_);
// if has ordered bin, need to allocate a buffer to fast split
if (has_ordered_bin_) {
is_data_in_leaf_.resize(num_data_);
std::fill(is_data_in_leaf_.begin(), is_data_in_leaf_.end(), static_cast<char>(0));
}
if (cegb_ != nullptr) {
cegb_->Init();
}
......@@ -148,14 +124,14 @@ void SerialTreeLearner::ResetConfig(const Config* config) {
} else {
size_t total_histogram_size = 0;
for (int i = 0; i < train_data_->num_features(); ++i) {
total_histogram_size += sizeof(HistogramBinEntry) * train_data_->FeatureNumBin(i);
total_histogram_size += kHistEntrySize * train_data_->FeatureNumBin(i);
}
max_cache_size = static_cast<int>(config_->histogram_pool_size * 1024 * 1024 / total_histogram_size);
}
// at least need 2 leaves
max_cache_size = std::max(2, max_cache_size);
max_cache_size = std::min(max_cache_size, config_->num_leaves);
histogram_pool_.DynamicChangeSize(train_data_, config_, max_cache_size, config_->num_leaves);
histogram_pool_.DynamicChangeSize(train_data_, is_hist_colwise_, config_, max_cache_size, config_->num_leaves);
// push split information for all leaves
best_split_per_leaf_.resize(config_->num_leaves);
......@@ -171,19 +147,14 @@ void SerialTreeLearner::ResetConfig(const Config* config) {
}
Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians, bool is_constant_hessian, const Json& forced_split_json) {
Common::FunctionTimer fun_timer("SerialTreeLearner::Train", global_timer);
gradients_ = gradients;
hessians_ = hessians;
is_constant_hessian_ = is_constant_hessian;
#ifdef TIMETAG
auto start_time = std::chrono::steady_clock::now();
#endif
// some initial work before training
BeforeTrain();
#ifdef TIMETAG
init_train_time += std::chrono::steady_clock::now() - start_time;
#endif
auto tree = std::unique_ptr<Tree>(new Tree(config_->num_leaves));
// root leaf
int left_leaf = 0;
......@@ -199,14 +170,8 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians
}
for (int split = init_splits; split < config_->num_leaves - 1; ++split) {
#ifdef TIMETAG
start_time = std::chrono::steady_clock::now();
#endif
// some initial work before finding the best split
if (!aborted_last_force_split && BeforeFindBestSplit(tree.get(), left_leaf, right_leaf)) {
#ifdef TIMETAG
init_split_time += std::chrono::steady_clock::now() - start_time;
#endif
// find best threshold for every feature
FindBestSplits();
} else if (aborted_last_force_split) {
......@@ -222,14 +187,8 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians
Log::Warning("No further splits with positive gain, best gain: %f", best_leaf_SplitInfo.gain);
break;
}
#ifdef TIMETAG
start_time = std::chrono::steady_clock::now();
#endif
// split tree with best leaf
Split(tree.get(), best_leaf, &left_leaf, &right_leaf);
#ifdef TIMETAG
split_time += std::chrono::steady_clock::now() - start_time;
#endif
cur_depth = std::max(cur_depth, tree->leaf_depth(left_leaf));
}
Log::Debug("Trained a tree with leaves = %d and max_depth = %d", tree->num_leaves(), cur_depth);
......@@ -319,6 +278,7 @@ std::vector<int8_t> SerialTreeLearner::GetUsedFeatures(bool is_tree_level) {
}
void SerialTreeLearner::BeforeTrain() {
Common::FunctionTimer fun_timer("SerialTreeLearner::BeforeTrain", global_timer);
// reset histogram pool
histogram_pool_.ResetMap();
......@@ -350,54 +310,10 @@ void SerialTreeLearner::BeforeTrain() {
}
larger_leaf_splits_->Init();
// if has ordered bin, need to initialize the ordered bin
if (has_ordered_bin_) {
#ifdef TIMETAG
auto start_time = std::chrono::steady_clock::now();
#endif
if (data_partition_->leaf_count(0) == num_data_) {
// use all data, pass nullptr
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
for (int i = 0; i < static_cast<int>(ordered_bin_indices_.size()); ++i) {
OMP_LOOP_EX_BEGIN();
ordered_bins_[ordered_bin_indices_[i]]->Init(nullptr, config_->num_leaves);
OMP_LOOP_EX_END();
}
OMP_THROW_EX();
} else {
// bagging, only use part of data
// mark used data
const data_size_t* indices = data_partition_->indices();
data_size_t begin = data_partition_->leaf_begin(0);
data_size_t end = begin + data_partition_->leaf_count(0);
#pragma omp parallel for schedule(static, 512) if (end - begin >= 1024)
for (data_size_t i = begin; i < end; ++i) {
is_data_in_leaf_[indices[i]] = 1;
}
OMP_INIT_EX();
// initialize ordered bin
#pragma omp parallel for schedule(static)
for (int i = 0; i < static_cast<int>(ordered_bin_indices_.size()); ++i) {
OMP_LOOP_EX_BEGIN();
ordered_bins_[ordered_bin_indices_[i]]->Init(is_data_in_leaf_.data(), config_->num_leaves);
OMP_LOOP_EX_END();
}
OMP_THROW_EX();
#pragma omp parallel for schedule(static, 512) if (end - begin >= 1024)
for (data_size_t i = begin; i < end; ++i) {
is_data_in_leaf_[indices[i]] = 0;
}
}
#ifdef TIMETAG
ordered_bin_time += std::chrono::steady_clock::now() - start_time;
#endif
}
}
bool SerialTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) {
Common::FunctionTimer fun_timer("SerialTreeLearner::BeforeFindBestSplit", global_timer);
// check depth of current leaf
if (config_->max_depth > 0) {
// only need to check the left leaf, since the right leaf is at the same level
......@@ -435,44 +351,6 @@ bool SerialTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int
if (histogram_pool_.Get(left_leaf, &larger_leaf_histogram_array_)) { parent_leaf_histogram_array_ = larger_leaf_histogram_array_; }
histogram_pool_.Get(right_leaf, &smaller_leaf_histogram_array_);
}
// split for the ordered bin
if (has_ordered_bin_ && right_leaf >= 0) {
#ifdef TIMETAG
auto start_time = std::chrono::steady_clock::now();
#endif
// mark data that at left-leaf
const data_size_t* indices = data_partition_->indices();
const auto left_cnt = data_partition_->leaf_count(left_leaf);
const auto right_cnt = data_partition_->leaf_count(right_leaf);
char mark = 1;
data_size_t begin = data_partition_->leaf_begin(left_leaf);
data_size_t end = begin + left_cnt;
if (left_cnt > right_cnt) {
begin = data_partition_->leaf_begin(right_leaf);
end = begin + right_cnt;
mark = 0;
}
#pragma omp parallel for schedule(static, 512) if (end - begin >= 1024)
for (data_size_t i = begin; i < end; ++i) {
is_data_in_leaf_[indices[i]] = 1;
}
OMP_INIT_EX();
// split the ordered bin
#pragma omp parallel for schedule(static)
for (int i = 0; i < static_cast<int>(ordered_bin_indices_.size()); ++i) {
OMP_LOOP_EX_BEGIN();
ordered_bins_[ordered_bin_indices_[i]]->Split(left_leaf, right_leaf, is_data_in_leaf_.data(), mark);
OMP_LOOP_EX_END();
}
OMP_THROW_EX();
#pragma omp parallel for schedule(static, 512) if (end - begin >= 1024)
for (data_size_t i = begin; i < end; ++i) {
is_data_in_leaf_[indices[i]] = 0;
}
#ifdef TIMETAG
ordered_bin_time += std::chrono::steady_clock::now() - start_time;
#endif
}
return true;
}
......@@ -494,37 +372,30 @@ void SerialTreeLearner::FindBestSplits() {
}
void SerialTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract) {
#ifdef TIMETAG
auto start_time = std::chrono::steady_clock::now();
#endif
Common::FunctionTimer fun_timer("SerialTreeLearner::ConstructHistograms", global_timer);
// construct smaller leaf
HistogramBinEntry* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - 1;
hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - kHistOffset;
train_data_->ConstructHistograms(is_feature_used,
smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(),
smaller_leaf_splits_->LeafIndex(),
&ordered_bins_, gradients_, hessians_,
gradients_, hessians_,
ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_,
multi_val_bin_.get(), is_hist_colwise_,
ptr_smaller_leaf_hist_data);
if (larger_leaf_histogram_array_ != nullptr && !use_subtract) {
// construct larger leaf
HistogramBinEntry* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - 1;
hist_t* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - kHistOffset;
train_data_->ConstructHistograms(is_feature_used,
larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(),
larger_leaf_splits_->LeafIndex(),
&ordered_bins_, gradients_, hessians_,
gradients_, hessians_,
ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_,
multi_val_bin_.get(), is_hist_colwise_,
ptr_larger_leaf_hist_data);
}
#ifdef TIMETAG
hist_time += std::chrono::steady_clock::now() - start_time;
#endif
}
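The use_subtract path relies on the identity parent = left child + right child, bin by bin: only the smaller child's histogram is built from data, and the larger child's is recovered by subtraction. A sketch of that step (FeatureHistogram::Subtract is the real method; this is illustrative):

typedef double hist_t;

// the parent histogram becomes the larger child's, in place
void Subtract(hist_t* parent, const hist_t* smaller, int num_bins) {
  for (int i = 0; i < 2 * num_bins; ++i) {  // 2 slots (grad, hess) per bin
    parent[i] -= smaller[i];
  }
}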
void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract) {
#ifdef TIMETAG
auto start_time = std::chrono::steady_clock::now();
#endif
Common::FunctionTimer fun_timer("SerialTreeLearner::FindBestSplitsFromHistograms", global_timer);
std::vector<SplitInfo> smaller_best(num_threads_);
std::vector<SplitInfo> larger_best(num_threads_);
std::vector<int8_t> smaller_node_used_features(num_features_, 1);
......@@ -534,7 +405,7 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector<int8_t>&
larger_node_used_features = GetUsedFeatures(false);
}
OMP_INIT_EX();
// find splits
#pragma omp parallel for schedule(static)
for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
OMP_LOOP_EX_BEGIN();
......@@ -543,7 +414,6 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector<int8_t>&
SplitInfo smaller_split;
train_data_->FixHistogram(feature_index,
smaller_leaf_splits_->sum_gradients(), smaller_leaf_splits_->sum_hessians(),
smaller_leaf_splits_->num_data_in_leaf(),
smaller_leaf_histogram_array_[feature_index].RawData());
int real_fidx = train_data_->RealFeatureIndex(feature_index);
smaller_leaf_histogram_array_[feature_index].FindBestThreshold(
......@@ -567,7 +437,6 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector<int8_t>&
larger_leaf_histogram_array_[feature_index].Subtract(smaller_leaf_histogram_array_[feature_index]);
} else {
train_data_->FixHistogram(feature_index, larger_leaf_splits_->sum_gradients(), larger_leaf_splits_->sum_hessians(),
larger_leaf_splits_->num_data_in_leaf(),
larger_leaf_histogram_array_[feature_index].RawData());
}
SplitInfo larger_split;
......@@ -589,7 +458,6 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector<int8_t>&
OMP_LOOP_EX_END();
}
OMP_THROW_EX();
auto smaller_best_idx = ArrayArgs<SplitInfo>::ArgMax(smaller_best);
int leaf = smaller_leaf_splits_->LeafIndex();
best_split_per_leaf_[leaf] = smaller_best[smaller_best_idx];
......@@ -599,9 +467,6 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector<int8_t>&
auto larger_best_idx = ArrayArgs<SplitInfo>::ArgMax(larger_best);
best_split_per_leaf_[leaf] = larger_best[larger_best_idx];
}
#ifdef TIMETAG
find_split_time += std::chrono::steady_clock::now() - start_time;
#endif
}
int32_t SerialTreeLearner::ForceSplits(Tree* tree, const Json& forced_split_json, int* left_leaf,
......@@ -769,69 +634,80 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, const Json& forced_split_json
}
void SerialTreeLearner::Split(Tree* tree, int best_leaf, int* left_leaf, int* right_leaf) {
const SplitInfo& best_split_info = best_split_per_leaf_[best_leaf];
Common::FunctionTimer fun_timer("SerialTreeLearner::Split", global_timer);
SplitInfo& best_split_info = best_split_per_leaf_[best_leaf];
const int inner_feature_index = train_data_->InnerFeatureIndex(best_split_info.feature);
if (cegb_ != nullptr) {
cegb_->UpdateLeafBestSplits(tree, best_leaf, &best_split_info, &best_split_per_leaf_);
}
// left = parent
*left_leaf = best_leaf;
auto next_leaf_id = tree->NextLeafId();
bool is_numerical_split = train_data_->FeatureBinMapper(inner_feature_index)->bin_type() == BinType::NumericalBin;
if (is_numerical_split) {
auto threshold_double = train_data_->RealThreshold(inner_feature_index, best_split_info.threshold);
data_partition_->Split(best_leaf, train_data_, inner_feature_index,
&best_split_info.threshold, 1, best_split_info.default_left, next_leaf_id);
best_split_info.left_count = data_partition_->leaf_count(*left_leaf);
best_split_info.right_count = data_partition_->leaf_count(next_leaf_id);
// split tree, will return right leaf
*right_leaf = tree->Split(best_leaf,
inner_feature_index,
best_split_info.feature,
best_split_info.threshold,
threshold_double,
static_cast<double>(best_split_info.left_output),
static_cast<double>(best_split_info.right_output),
static_cast<data_size_t>(best_split_info.left_count),
static_cast<data_size_t>(best_split_info.right_count),
static_cast<double>(best_split_info.left_sum_hessian),
static_cast<double>(best_split_info.right_sum_hessian),
static_cast<float>(best_split_info.gain),
train_data_->FeatureBinMapper(inner_feature_index)->missing_type(),
best_split_info.default_left);
data_partition_->Split(best_leaf, train_data_, inner_feature_index,
&best_split_info.threshold, 1, best_split_info.default_left, *right_leaf);
inner_feature_index,
best_split_info.feature,
best_split_info.threshold,
threshold_double,
static_cast<double>(best_split_info.left_output),
static_cast<double>(best_split_info.right_output),
static_cast<data_size_t>(best_split_info.left_count),
static_cast<data_size_t>(best_split_info.right_count),
static_cast<double>(best_split_info.left_sum_hessian),
static_cast<double>(best_split_info.right_sum_hessian),
static_cast<float>(best_split_info.gain),
train_data_->FeatureBinMapper(inner_feature_index)->missing_type(),
best_split_info.default_left);
} else {
std::vector<uint32_t> cat_bitset_inner = Common::ConstructBitset(best_split_info.cat_threshold.data(), best_split_info.num_cat_threshold);
std::vector<int> threshold_int(best_split_info.num_cat_threshold);
for (int i = 0; i < best_split_info.num_cat_threshold; ++i) {
threshold_int[i] = static_cast<int>(train_data_->RealThreshold(inner_feature_index, best_split_info.cat_threshold[i]));
}
std::vector<uint32_t> cat_bitset = Common::ConstructBitset(threshold_int.data(), best_split_info.num_cat_threshold);
*right_leaf = tree->SplitCategorical(best_leaf,
inner_feature_index,
best_split_info.feature,
cat_bitset_inner.data(),
static_cast<int>(cat_bitset_inner.size()),
cat_bitset.data(),
static_cast<int>(cat_bitset.size()),
static_cast<double>(best_split_info.left_output),
static_cast<double>(best_split_info.right_output),
static_cast<data_size_t>(best_split_info.left_count),
static_cast<data_size_t>(best_split_info.right_count),
static_cast<double>(best_split_info.left_sum_hessian),
static_cast<double>(best_split_info.right_sum_hessian),
static_cast<float>(best_split_info.gain),
train_data_->FeatureBinMapper(inner_feature_index)->missing_type());
data_partition_->Split(best_leaf, train_data_, inner_feature_index,
cat_bitset_inner.data(), static_cast<int>(cat_bitset_inner.size()), best_split_info.default_left, *right_leaf);
}
cat_bitset_inner.data(), static_cast<int>(cat_bitset_inner.size()), best_split_info.default_left, next_leaf_id);
best_split_info.left_count = data_partition_->leaf_count(*left_leaf);
best_split_info.right_count = data_partition_->leaf_count(next_leaf_id);
*right_leaf = tree->SplitCategorical(best_leaf,
inner_feature_index,
best_split_info.feature,
cat_bitset_inner.data(),
static_cast<int>(cat_bitset_inner.size()),
cat_bitset.data(),
static_cast<int>(cat_bitset.size()),
static_cast<double>(best_split_info.left_output),
static_cast<double>(best_split_info.right_output),
static_cast<data_size_t>(best_split_info.left_count),
static_cast<data_size_t>(best_split_info.right_count),
static_cast<double>(best_split_info.left_sum_hessian),
static_cast<double>(best_split_info.right_sum_hessian),
static_cast<float>(best_split_info.gain),
train_data_->FeatureBinMapper(inner_feature_index)->missing_type());
}
CHECK(*right_leaf == next_leaf_id);
#ifdef DEBUG
CHECK(best_split_info.left_count == data_partition_->leaf_count(best_leaf));
#endif
auto p_left = smaller_leaf_splits_.get();
auto p_right = larger_leaf_splits_.get();
// init the leaves that are used in the next iteration
if (best_split_info.left_count < best_split_info.right_count) {
CHECK(best_split_info.left_count > 0);
smaller_leaf_splits_->Init(*left_leaf, data_partition_.get(), best_split_info.left_sum_gradient, best_split_info.left_sum_hessian);
larger_leaf_splits_->Init(*right_leaf, data_partition_.get(), best_split_info.right_sum_gradient, best_split_info.right_sum_hessian);
} else {
CHECK(best_split_info.right_count > 0);
smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), best_split_info.right_sum_gradient, best_split_info.right_sum_hessian);
larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), best_split_info.left_sum_gradient, best_split_info.left_sum_hessian);
p_right = smaller_leaf_splits_.get();
......
......@@ -79,7 +79,12 @@ class SerialTreeLearner: public TreeLearner {
void RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj, std::function<double(const label_t*, int)> residual_getter,
data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const override;
bool IsHistColWise() const override { return is_hist_colwise_; }
protected:
void GetMultiValBin(const Dataset* dataset, bool is_first_time);
virtual std::vector<int8_t> GetUsedFeatures(bool is_tree_level);
/*!
* \brief Some initial work before training
......@@ -161,17 +166,13 @@ class SerialTreeLearner: public TreeLearner {
std::vector<score_t, boost::alignment::aligned_allocator<score_t, 4096>> ordered_hessians_;
#else
/*! \brief gradients of current iteration, ordered for cache optimized */
std::vector<score_t> ordered_gradients_;
std::vector<score_t, Common::AlignmentAllocator<score_t, kAlignedSize>> ordered_gradients_;
/*! \brief hessians of current iteration, ordered for cache optimized */
std::vector<score_t> ordered_hessians_;
std::vector<score_t, Common::AlignmentAllocator<score_t, kAlignedSize>> ordered_hessians_;
#endif
/*! \brief Store ordered bin */
std::vector<std::unique_ptr<OrderedBin>> ordered_bins_;
/*! \brief True if has ordered bin */
bool has_ordered_bin_ = false;
/*! \brief is_data_in_leaf_[i] != 0 means i-th data is marked */
std::vector<char> is_data_in_leaf_;
std::vector<char, Common::AlignmentAllocator<char, kAlignedSize>> is_data_in_leaf_;
/*! \brief used to cache historical histogram to speed up*/
HistogramPool histogram_pool_;
/*! \brief config of tree learner*/
......@@ -179,6 +180,8 @@ class SerialTreeLearner: public TreeLearner {
int num_threads_;
std::vector<int> ordered_bin_indices_;
bool is_constant_hessian_;
std::unique_ptr<MultiValBin> multi_val_bin_;
bool is_hist_colwise_;
std::unique_ptr<CostEfficientGradientBoosting> cegb_;
};
......
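The allocator swap above (together with the "_mm_malloc and _mm_free for cross platform" commit in the message list) keeps the gradient/hessian buffers aligned for vectorized histogram code. A minimal C++17 sketch in the same spirit (illustrative; the real one is Common::AlignmentAllocator):

#include <cstddef>
#include <new>
#include <vector>

template <typename T, std::size_t Alignment>
struct AlignedAllocator {
  using value_type = T;
  template <typename U>
  struct rebind { using other = AlignedAllocator<U, Alignment>; };

  AlignedAllocator() noexcept = default;
  template <typename U>
  AlignedAllocator(const AlignedAllocator<U, Alignment>&) noexcept {}

  T* allocate(std::size_t n) {
    // C++17 aligned operator new; the PR uses _mm_malloc/_mm_free instead
    // for portability across older toolchains.
    return static_cast<T*>(
        ::operator new(n * sizeof(T), std::align_val_t(Alignment)));
  }
  void deallocate(T* p, std::size_t) noexcept {
    ::operator delete(p, std::align_val_t(Alignment));
  }
};

template <typename T, typename U, std::size_t A>
bool operator==(const AlignedAllocator<T, A>&,
                const AlignedAllocator<U, A>&) noexcept { return true; }
template <typename T, typename U, std::size_t A>
bool operator!=(const AlignedAllocator<T, A>&,
                const AlignedAllocator<U, A>&) noexcept { return false; }

// usage: a 32-byte-aligned buffer of gradients
using score_t = float;
std::vector<score_t, AlignedAllocator<score_t, 32>> ordered_gradients(1024);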
......@@ -36,7 +36,7 @@ void VotingParallelTreeLearner<TREELEARNER_T>::Init(const Dataset* train_data, b
}
}
// calculate buffer size
size_t buffer_size = 2 * top_k_ * std::max(max_bin * sizeof(HistogramBinEntry), sizeof(LightSplitInfo) * num_machines_);
size_t buffer_size = 2 * top_k_ * std::max(max_bin * kHistEntrySize, sizeof(LightSplitInfo) * num_machines_);
// left and right at the same time, so need double the size
input_buffer_.resize(buffer_size);
output_buffer_.resize(buffer_size);
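For a sense of scale (illustrative numbers, assuming sizeof(LightSplitInfo) stays in the tens of bytes): with max_bin = 255, kHistEntrySize = 16, top_k_ = 20, and 4 machines, the histogram term dominates, giving buffer_size = 2 * 20 * max(255 * 16, 4 * sizeof(LightSplitInfo)) = 2 * 20 * 4080 = 163,200 bytes per buffer.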
......@@ -290,7 +290,6 @@ void VotingParallelTreeLearner<TREELEARNER_T>::FindBestSplits() {
const int real_feature_index = this->train_data_->RealFeatureIndex(feature_index);
this->train_data_->FixHistogram(feature_index,
this->smaller_leaf_splits_->sum_gradients(), this->smaller_leaf_splits_->sum_hessians(),
this->smaller_leaf_splits_->num_data_in_leaf(),
this->smaller_leaf_histogram_array_[feature_index].RawData());
this->smaller_leaf_histogram_array_[feature_index].FindBestThreshold(
......@@ -308,7 +307,6 @@ void VotingParallelTreeLearner<TREELEARNER_T>::FindBestSplits() {
this->larger_leaf_histogram_array_[feature_index].Subtract(this->smaller_leaf_histogram_array_[feature_index]);
} else {
this->train_data_->FixHistogram(feature_index, this->larger_leaf_splits_->sum_gradients(), this->larger_leaf_splits_->sum_hessians(),
this->larger_leaf_splits_->num_data_in_leaf(),
this->larger_leaf_histogram_array_[feature_index].RawData());
}
// find best threshold for larger child
......@@ -367,8 +365,8 @@ void VotingParallelTreeLearner<TREELEARNER_T>::FindBestSplits() {
CopyLocalHistogram(smaller_top_features, larger_top_features);
// Reduce scatter for histogram
Network::ReduceScatter(input_buffer_.data(), reduce_scatter_size_, sizeof(HistogramBinEntry), block_start_.data(), block_len_.data(),
output_buffer_.data(), static_cast<comm_size_t>(output_buffer_.size()), &HistogramBinEntry::SumReducer);
Network::ReduceScatter(input_buffer_.data(), reduce_scatter_size_, sizeof(hist_t), block_start_.data(), block_len_.data(),
output_buffer_.data(), static_cast<comm_size_t>(output_buffer_.size()), &HistogramSumReducer);
this->FindBestSplitsFromHistograms(is_feature_used, false);
}
......@@ -399,7 +397,6 @@ void VotingParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(cons
this->train_data_->FixHistogram(feature_index,
smaller_leaf_splits_global_->sum_gradients(), smaller_leaf_splits_global_->sum_hessians(),
GetGlobalDataCountInLeaf(smaller_leaf_splits_global_->LeafIndex()),
smaller_leaf_histogram_array_global_[feature_index].RawData());
// find best threshold
......@@ -423,7 +420,6 @@ void VotingParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(cons
this->train_data_->FixHistogram(feature_index,
larger_leaf_splits_global_->sum_gradients(), larger_leaf_splits_global_->sum_hessians(),
GetGlobalDataCountInLeaf(larger_leaf_splits_global_->LeafIndex()),
larger_leaf_histogram_array_global_[feature_index].RawData());
// find best threshold
......
......@@ -38,7 +38,9 @@ class FileLoader(object):
return np.loadtxt(os.path.join(self.directory, result_file))
def train_predict_check(self, lgb_train, X_test, X_test_fn, sk_pred):
gbm = lgb.train(self.params, lgb_train)
params = dict(self.params)
params['force_row_wise'] = True
gbm = lgb.train(params, lgb_train)
y_pred = gbm.predict(X_test)
cpp_pred = gbm.predict(X_test_fn)
np.testing.assert_allclose(y_pred, cpp_pred)
......@@ -105,7 +107,9 @@ class TestEngine(unittest.TestCase):
X_test, _, X_test_fn = fd.load_dataset('.test', is_sparse=True)
group_train = fd.load_field('.train.query')
lgb_train = lgb.Dataset(X_train, y_train, group=group_train)
gbm = lgb.LGBMRanker(**fd.params)
params = dict(fd.params)
params['force_col_wise'] = True
gbm = lgb.LGBMRanker(**params)
gbm.fit(X_train, y_train, group=group_train)
sk_pred = gbm.predict(X_test)
fd.train_predict_check(lgb_train, X_test, X_test_fn, sk_pred)
......
......@@ -66,7 +66,7 @@ class TestEngine(unittest.TestCase):
verbose_eval=False,
evals_result=evals_result)
ret = log_loss(y_test, gbm.predict(X_test))
self.assertLess(ret, 0.11)
self.assertLess(ret, 0.14)
self.assertEqual(len(evals_result['valid_0']['binary_logloss']), 50)
self.assertAlmostEqual(evals_result['valid_0']['binary_logloss'][-1], ret, places=5)
......@@ -328,7 +328,7 @@ class TestEngine(unittest.TestCase):
verbose_eval=False,
evals_result=evals_result)
ret = multi_logloss(y_test, gbm.predict(X_test))
self.assertLess(ret, 0.15)
self.assertLess(ret, 0.16)
self.assertAlmostEqual(evals_result['valid_0']['multi_logloss'][-1], ret, places=5)
def test_multiclass_rf(self):
......@@ -518,7 +518,7 @@ class TestEngine(unittest.TestCase):
valid_names=valid_set_name,
verbose_eval=False,
early_stopping_rounds=5)
self.assertLessEqual(gbm.best_iteration, 31)
self.assertLessEqual(gbm.best_iteration, 39)
self.assertIn(valid_set_name, gbm.best_score)
self.assertIn('binary_logloss', gbm.best_score[valid_set_name])
......@@ -1740,7 +1740,7 @@ class TestEngine(unittest.TestCase):
verbose_eval=False,
evals_result=evals_result)
ret = log_loss(y_test, gbm.predict(X_test))
self.assertLess(ret, 0.13)
self.assertLess(ret, 0.14)
self.assertAlmostEqual(evals_result['valid_0']['binary_logloss'][-1], ret, places=5)
params['feature_fraction'] = 0.5
gbm2 = lgb.train(params, lgb_train, num_boost_round=25)
......
......@@ -77,7 +77,7 @@ class TestSklearn(unittest.TestCase):
gbm = lgb.LGBMClassifier(n_estimators=50, silent=True)
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False)
ret = log_loss(y_test, gbm.predict_proba(X_test))
self.assertLess(ret, 0.11)
self.assertLess(ret, 0.12)
self.assertAlmostEqual(ret, gbm.evals_result_['valid_0']['binary_logloss'][gbm.best_iteration_ - 1], places=5)
def test_regression(self):
......@@ -97,7 +97,7 @@ class TestSklearn(unittest.TestCase):
ret = multi_error(y_test, gbm.predict(X_test))
self.assertLess(ret, 0.05)
ret = multi_logloss(y_test, gbm.predict_proba(X_test))
self.assertLess(ret, 0.15)
self.assertLess(ret, 0.16)
self.assertAlmostEqual(ret, gbm.evals_result_['valid_0']['multi_logloss'][gbm.best_iteration_ - 1], places=5)
def test_lambdarank(self):
......@@ -114,8 +114,8 @@ class TestSklearn(unittest.TestCase):
eval_group=[q_test], eval_at=[1, 3], early_stopping_rounds=10, verbose=False,
callbacks=[lgb.reset_parameter(learning_rate=lambda x: max(0.01, 0.1 - 0.01 * x))])
self.assertLessEqual(gbm.best_iteration_, 24)
self.assertGreater(gbm.best_score_['valid_0']['ndcg@1'], 0.6333)
self.assertGreater(gbm.best_score_['valid_0']['ndcg@3'], 0.6048)
self.assertGreater(gbm.best_score_['valid_0']['ndcg@1'], 0.5769)
self.assertGreater(gbm.best_score_['valid_0']['ndcg@3'], 0.5920)
def test_xendcg(self):
dir_path = os.path.dirname(os.path.realpath(__file__))
......@@ -129,7 +129,7 @@ class TestSklearn(unittest.TestCase):
eval_metric='ndcg',
callbacks=[lgb.reset_parameter(learning_rate=lambda x: max(0.01, 0.1 - 0.01 * x))])
self.assertLessEqual(gbm.best_iteration_, 24)
self.assertGreater(gbm.best_score_['valid_0']['ndcg@1'], 0.6579)
self.assertGreater(gbm.best_score_['valid_0']['ndcg@1'], 0.6559)
self.assertGreater(gbm.best_score_['valid_0']['ndcg@3'], 0.6421)
def test_regression_with_custom_objective(self):
......
......@@ -30,24 +30,24 @@
<SccLocalPath>SAK</SccLocalPath>
<SccProvider>SAK</SccProvider>
<ProjectName>LightGBM</ProjectName>
<WindowsTargetPlatformVersion>8.1</WindowsTargetPlatformVersion>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Label="Configuration" Condition="'$(Configuration)|$(Platform)'=='Debug_mpi|x64'">
<PlatformToolset>v140</PlatformToolset>
<PlatformToolset>v142</PlatformToolset>
</PropertyGroup>
<PropertyGroup Label="Configuration" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<PlatformToolset>v140</PlatformToolset>
<PlatformToolset>v142</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='DLL|x64'" Label="Configuration">
<PlatformToolset>v140</PlatformToolset>
<PlatformToolset>v142</PlatformToolset>
<ConfigurationType>DynamicLibrary</ConfigurationType>
</PropertyGroup>
<PropertyGroup Label="Configuration" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<PlatformToolset>v140</PlatformToolset>
<PlatformToolset>v142</PlatformToolset>
</PropertyGroup>
<PropertyGroup Label="Configuration" Condition="'$(Configuration)|$(Platform)'=='Release_mpi|x64'">
<PlatformToolset>v140</PlatformToolset>
<PlatformToolset>v142</PlatformToolset>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
......@@ -95,6 +95,8 @@
<WholeProgramOptimization>false</WholeProgramOptimization>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<EnableEnhancedInstructionSet>AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
</ClCompile>
<Link>
<AdditionalLibraryDirectories>
......@@ -116,6 +118,8 @@
<WholeProgramOptimization>false</WholeProgramOptimization>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<EnableEnhancedInstructionSet>AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
</ClCompile>
<Link>
<AdditionalDependencies>
......@@ -137,6 +141,8 @@
<WholeProgramOptimization>true</WholeProgramOptimization>
<OmitFramePointers>true</OmitFramePointers>
<RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<EnableEnhancedInstructionSet>AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
</ClCompile>
<Link>
<AdditionalLibraryDirectories>
......@@ -162,6 +168,8 @@
<RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
<OmitFramePointers>true</OmitFramePointers>
<FunctionLevelLinking>true</FunctionLevelLinking>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<EnableEnhancedInstructionSet>AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
</ClCompile>
<Link>
<AdditionalDependencies />
......@@ -181,6 +189,8 @@
<RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
<OmitFramePointers>true</OmitFramePointers>
<FunctionLevelLinking>true</FunctionLevelLinking>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<EnableEnhancedInstructionSet>AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
</ClCompile>
<Link>
<AdditionalDependencies>
......@@ -224,7 +234,8 @@
<ClInclude Include="..\src\boosting\score_updater.hpp" />
<ClInclude Include="..\src\io\dense_bin.hpp" />
<ClInclude Include="..\src\io\dense_nbits_bin.hpp" />
<ClInclude Include="..\src\io\ordered_sparse_bin.hpp" />
<ClInclude Include="..\src\io\multi_val_dense_bin.hpp" />
<ClInclude Include="..\src\io\multi_val_sparse_bin.hpp" />
<ClInclude Include="..\src\io\parser.hpp" />
<ClInclude Include="..\src\io\sparse_bin.hpp" />
<ClInclude Include="..\src\metric\binary_metric.hpp" />
......
......@@ -57,9 +57,6 @@
<ClInclude Include="..\src\io\dense_bin.hpp">
<Filter>src\io</Filter>
</ClInclude>
<ClInclude Include="..\src\io\ordered_sparse_bin.hpp">
<Filter>src\io</Filter>
</ClInclude>
<ClInclude Include="..\src\io\parser.hpp">
<Filter>src\io</Filter>
</ClInclude>
......@@ -213,6 +210,12 @@
<ClInclude Include="..\src\treelearner\cost_effective_gradient_boosting.hpp">
<Filter>src\treelearner</Filter>
</ClInclude>
<ClInclude Include="..\src\io\multi_val_dense_bin.hpp">
<Filter>src\io</Filter>
</ClInclude>
<ClInclude Include="..\src\io\multi_val_sparse_bin.hpp">
<Filter>src\io</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\src\application\application.cpp">
......