Unverified Commit 509c2e50 authored by Guolin Ke's avatar Guolin Ke Committed by GitHub
Browse files

Support both row-wise and col-wise multi-threading (#2699)



* commit

* fix a bug

* fix bug

* reset to track changes

* refine the auto choose logic

* sort the time stats output

* fix include

* change  multi_val_bin_sparse_threshold

* add cmake

* add _mm_malloc and _mm_free for cross platform

* fix cmake bug

* timer for split

* try to fix cmake

* fix tests

* refactor DataPartition::Split

* fix test

* typo

* formating

* Revert "formating"

This reverts commit 5b8de4f7fb9d975ee23701d276a66d40ee6d4222.

* add document

* [R-package] Added tests on use of force_col_wise and force_row_wise in training (#2719)

* naming

* fix gpu code

* Update include/LightGBM/bin.h
Co-Authored-By: default avatarJames Lamb <jaylamb20@gmail.com>

* Update src/treelearner/ocl/histogram16.cl

* test: swap compilers for CI

* fix omp

* not avx2

* no aligned for feature histogram

* Revert "refactor DataPartition::Split"

This reverts commit 256e6d9641ade966a1f54da1752e998a1149b6f8.

* slightly refactor data partition

* reduce the memory cost
Co-authored-by: default avatarJames Lamb <jaylamb20@gmail.com>
Co-authored-by: default avatarNikita Titov <nekit94-08@mail.ru>
parent bc7bc4a1
......@@ -44,11 +44,11 @@ before_install:
- export BUILD_DIRECTORY="$TRAVIS_BUILD_DIR"
- if [[ $TRAVIS_OS_NAME == "osx" ]]; then
export OS_NAME="macos";
export COMPILER="gcc";
export COMPILER="clang";
export R_MAC_VERSION=3.6.1;
else
export OS_NAME="linux";
export COMPILER="clang";
export COMPILER="gcc";
export R_TRAVIS_LINUX_VERSION=3.6.1-3bionic;
fi
- export CONDA="$HOME/miniconda"
......
......@@ -17,7 +17,7 @@ jobs:
- job: Linux
###########################################
variables:
COMPILER: gcc
COMPILER: clang
pool:
vmImage: 'ubuntu-16.04'
container: ubuntu1404
......@@ -72,7 +72,7 @@ jobs:
- job: MacOS
###########################################
variables:
COMPILER: clang
COMPILER: gcc
pool:
vmImage: 'macOS-10.13'
strategy:
......
......@@ -68,6 +68,10 @@ if(USE_R35)
ADD_DEFINITIONS(-DR_VER_ABOVE_35)
endif(USE_R35)
if(USE_TIMETAG)
ADD_DEFINITIONS(-DTIMETAG)
endif(USE_TIMETAG)
if(USE_MPI)
find_package(MPI REQUIRED)
ADD_DEFINITIONS(-DUSE_MPI)
......@@ -130,6 +134,21 @@ if(${MM_PREFETCH})
ADD_DEFINITIONS(-DMM_PREFETCH)
endif()
include(CheckCXXSourceCompiles)
check_cxx_source_compiles("
#include <mm_malloc.h>
int main() {
char *a = (char*)_mm_malloc(8, 16);
_mm_free(a);
return 0;
}
" MM_MALLOC)
if(${MM_MALLOC})
message(STATUS "Use _mm_malloc")
ADD_DEFINITIONS(-DMM_MALLOC)
endif()
if(UNIX OR MINGW OR CYGWIN)
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -pthread -O3 -Wextra -Wall -Wno-ignored-attributes -Wno-unknown-pragmas -Wno-return-type")
if(USE_SWIG)
......
......@@ -252,3 +252,46 @@ test_that("lgb.train() throws an informative error if 'valids' contains lgb.Data
)
}, regexp = "each element of valids must have a name")
})
test_that("lgb.train() works with force_col_wise and force_row_wise", {
  set.seed(1234L)
  nrounds <- 10L
  dtrain <- lgb.Dataset(
    train$data
    , label = train$label
  )
  # train once with col-wise histogram construction forced
  params <- list(
    objective = "binary"
    , metric = "binary_error"
    , force_col_wise = TRUE
  )
  bst_col_wise <- lgb.train(
    params = params
    , data = dtrain
    , nrounds = nrounds
  )
  # ...and once with row-wise histogram construction forced
  params <- list(
    objective = "binary"
    , metric = "binary_error"
    , force_row_wise = TRUE
  )
  bst_row_wise <- lgb.train(
    params = params
    , data = dtrain
    , nrounds = nrounds
  )
  # both strategies should produce the same model, so training error must match
  expected_error <- 0.003070782
  expect_equal(bst_col_wise$eval_train()[[1L]][["value"]], expected_error)
  expect_equal(bst_row_wise$eval_train()[[1L]][["value"]], expected_error)

  # check some basic details of the boosters just to be sure force_col_wise
  # and force_row_wise are not causing any weird side effects
  for (bst in list(bst_row_wise, bst_col_wise)) {
    expect_equal(bst$current_iter(), nrounds)
    parsed_model <- jsonlite::fromJSON(bst$dump_model())
    expect_equal(parsed_model$objective, "binary sigmoid:1")
    expect_false(parsed_model$average_output)
  }
})
......@@ -47,8 +47,8 @@ test_that("learning-to-rank with lgb.train() works as expected", {
}
expect_identical(sapply(eval_results, function(x) {x$name}), eval_names)
expect_equal(eval_results[[1L]][["value"]], 0.825)
expect_true(abs(eval_results[[2L]][["value"]] - 0.795986) < TOLERANCE)
expect_true(abs(eval_results[[3L]][["value"]] - 0.7734639) < TOLERANCE)
expect_true(abs(eval_results[[2L]][["value"]] - 0.7766434) < TOLERANCE)
expect_true(abs(eval_results[[3L]][["value"]] - 0.7527939) < TOLERANCE)
})
test_that("learning-to-rank with lgb.cv() works as expected", {
......
......@@ -190,6 +190,38 @@ Core Parameters
Learning Control Parameters
---------------------------
- ``force_col_wise`` :raw-html:`<a id="force_col_wise" title="Permalink to this parameter" href="#force_col_wise">&#x1F517;&#xFE0E;</a>`, default = ``false``, type = bool
- set ``force_col_wise=true`` will force LightGBM to use col-wise histogram build
- Recommend ``force_col_wise=true`` when:
- the number of columns is large, or the total number of bin is large
- when ``num_threads`` is large, e.g. ``>20``
- want to use small ``feature_fraction``, e.g. ``0.5``, to speed-up
- want to reduce memory cost
- when both ``force_col_wise`` and ``force_row_wise`` are ``false``, LightGBM will first try both of them, and use the faster one
- ``force_row_wise`` :raw-html:`<a id="force_row_wise" title="Permalink to this parameter" href="#force_row_wise">&#x1F517;&#xFE0E;</a>`, default = ``false``, type = bool
- set ``force_row_wise=true`` will force LightGBM to use row-wise histogram build
- Recommend ``force_row_wise=true`` when:
- the number of data is large, and the number of total bin is relatively small
- want to use small ``bagging``, or ``goss``, to speed-up
- when ``num_threads`` is relatively small, e.g. ``<=16``
- set ``force_row_wise=true`` will double the memory cost for Dataset object, if your memory is not enough, you can try ``force_col_wise=true``
- when both ``force_col_wise`` and ``force_row_wise`` are ``false``, LightGBM will first try both of them, and use the faster one.
- ``max_depth`` :raw-html:`<a id="max_depth" title="Permalink to this parameter" href="#max_depth">&#x1F517;&#xFE0E;</a>`, default = ``-1``, type = int
- limit the max depth for tree model. This is used to deal with over-fitting when ``#data`` is small. Tree still grows leaf-wise
......@@ -559,22 +591,6 @@ IO Parameters
- **Note**: disabling this may cause the slow training speed for sparse datasets
- ``max_conflict_rate`` :raw-html:`<a id="max_conflict_rate" title="Permalink to this parameter" href="#max_conflict_rate">&#x1F517;&#xFE0E;</a>`, default = ``0.0``, type = double, constraints: ``0.0 <= max_conflict_rate < 1.0``
- max conflict rate for bundles in EFB
- set this to ``0.0`` to disallow the conflict and provide more accurate results
- set this to a larger value to achieve faster speed
- ``is_enable_sparse`` :raw-html:`<a id="is_enable_sparse" title="Permalink to this parameter" href="#is_enable_sparse">&#x1F517;&#xFE0E;</a>`, default = ``true``, type = bool, aliases: ``is_sparse``, ``enable_sparse``, ``sparse``
- used to enable/disable sparse optimization
- ``sparse_threshold`` :raw-html:`<a id="sparse_threshold" title="Permalink to this parameter" href="#sparse_threshold">&#x1F517;&#xFE0E;</a>`, default = ``0.8``, type = double, constraints: ``0.0 < sparse_threshold <= 1.0``
- the threshold of zero elements percentage for treating a feature as a sparse one
- ``use_missing`` :raw-html:`<a id="use_missing" title="Permalink to this parameter" href="#use_missing">&#x1F517;&#xFE0E;</a>`, default = ``true``, type = bool
- set this to ``false`` to disable the special handle of missing value
......
......@@ -29,36 +29,29 @@ enum MissingType {
NaN
};
/*! \brief Store data for one histogram bin */
struct HistogramBinEntry {
public:
/*! \brief Sum of gradients on this bin */
double sum_gradients = 0.0f;
/*! \brief Sum of hessians on this bin */
double sum_hessians = 0.0f;
/*! \brief Number of data on this bin */
data_size_t cnt = 0;
/*!
* \brief Sum up (reducers) functions for histogram bin
*/
inline static void SumReducer(const char *src, char *dst, int type_size, comm_size_t len) {
comm_size_t used_size = 0;
const HistogramBinEntry* p1;
HistogramBinEntry* p2;
while (used_size < len) {
// convert
p1 = reinterpret_cast<const HistogramBinEntry*>(src);
p2 = reinterpret_cast<HistogramBinEntry*>(dst);
// add
p2->cnt += p1->cnt;
p2->sum_gradients += p1->sum_gradients;
p2->sum_hessians += p1->sum_hessians;
src += type_size;
dst += type_size;
used_size += type_size;
}
typedef double hist_t;
const size_t KHistEntrySize = 2 * sizeof(hist_t);
const int KHistOffset = 2;
const double kSparseThreshold = 0.7;
#define GET_GRAD(hist, i) hist[(i) << 1]
#define GET_HESS(hist, i) hist[((i) << 1) + 1]
// Element-wise sum reducer for histogram buffers (used when aggregating
// histograms, e.g. across machines): adds each hist_t element of `src`
// into the corresponding element of `dst`.
// `type_size` is the byte stride of one element — the loop advances both
// pointers by `type_size` and reads exactly one hist_t per step, so it is
// expected that type_size == sizeof(hist_t); `len` is the total byte length.
inline static void HistogramSumReducer(const char* src, char* dst, int type_size, comm_size_t len) {
  comm_size_t used_size = 0;
  const hist_t* p1;
  hist_t* p2;
  while (used_size < len) {
    // convert raw byte pointers to typed pointers for the current element
    p1 = reinterpret_cast<const hist_t*>(src);
    p2 = reinterpret_cast<hist_t*>(dst);
    // accumulate src into dst
    *p2 += *p1;
    src += type_size;
    dst += type_size;
    used_size += type_size;
  }
};
}
/*! \brief This class used to convert feature values into bin,
* and store some meta information for bin*/
......@@ -252,7 +245,7 @@ class OrderedBin {
* \param out Output Result
*/
virtual void ConstructHistogram(int leaf, const score_t* gradients,
const score_t* hessians, HistogramBinEntry* out) const = 0;
const score_t* hessians, hist_t* out) const = 0;
/*!
* \brief Construct histogram by using this bin
......@@ -262,7 +255,7 @@ class OrderedBin {
* \param gradients Gradients, Note:non-ordered by leaf
* \param out Output Result
*/
virtual void ConstructHistogram(int leaf, const score_t* gradients, HistogramBinEntry* out) const = 0;
virtual void ConstructHistogram(int leaf, const score_t* gradients, hist_t* out) const = 0;
/*!
* \brief Split current bin, and perform re-order by leaf
......@@ -360,11 +353,11 @@ class Bin {
virtual void ConstructHistogram(
const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians,
HistogramBinEntry* out) const = 0;
hist_t* out) const = 0;
virtual void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians,
HistogramBinEntry* out) const = 0;
hist_t* out) const = 0;
/*!
* \brief Construct histogram of this feature,
......@@ -380,10 +373,10 @@ class Bin {
* \param out Output Result
*/
virtual void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, HistogramBinEntry* out) const = 0;
const score_t* ordered_gradients, hist_t* out) const = 0;
virtual void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* ordered_gradients, HistogramBinEntry* out) const = 0;
const score_t* ordered_gradients, hist_t* out) const = 0;
/*!
* \brief Split data according to threshold, if bin <= threshold, will put into left(lte_indices), else put into right(gt_indices)
......@@ -423,30 +416,11 @@ class Bin {
data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const = 0;
/*!
* \brief Create the ordered bin for this bin
* \return Pointer to ordered bin
*/
virtual OrderedBin* CreateOrderedBin() const = 0;
/*!
* \brief After pushed all feature data, call this could have better refactor for bin data
*/
virtual void FinishLoad() = 0;
/*!
* \brief Create object for bin data of one feature, will call CreateDenseBin or CreateSparseBin according to "is_sparse"
* \param num_data Total number of data
* \param num_bin Number of bin
* \param sparse_rate Sparse rate of this bins( num_bin0/num_data )
* \param is_enable_sparse True if enable sparse feature
* \param sparse_threshold Threshold for treating a feature as a sparse feature
* \param is_sparse Will set to true if this bin is sparse
* \return The bin data object
*/
static Bin* CreateBin(data_size_t num_data, int num_bin,
double sparse_rate, bool is_enable_sparse, double sparse_threshold, bool* is_sparse);
/*!
* \brief Create object for bin data of one feature, used for dense feature
* \param num_data Total number of data
......@@ -469,6 +443,46 @@ class Bin {
virtual Bin* Clone() = 0;
};
class MultiValBin {
public:
virtual ~MultiValBin() {}
virtual data_size_t num_data() const = 0;
virtual int32_t num_bin() const = 0;
virtual void ReSize(data_size_t num_data) = 0;
virtual void PushOneRow(int tid, data_size_t idx, const std::vector<uint32_t>& values) = 0;
virtual void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) = 0;
virtual void ConstructHistogram(
const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* gradients, const score_t* hessians,
hist_t* out) const = 0;
virtual void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* gradients, const score_t* hessians,
hist_t* out) const = 0;
virtual void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, hist_t* out) const = 0;
virtual void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* ordered_gradients, hist_t* out) const = 0;
virtual void FinishLoad() = 0;
virtual bool IsSparse() = 0;
static MultiValBin* CreateMultiValBin(data_size_t num_data, int num_bin, int num_feature, double sparse_rate);
virtual MultiValBin* Clone() = 0;
};
inline uint32_t BinMapper::ValueToBin(double value) const {
if (std::isnan(value)) {
if (missing_type_ == MissingType::NaN) {
......
......@@ -214,6 +214,24 @@ struct Config {
#pragma region Learning Control Parameters
// desc = set ``force_col_wise=true`` will force LightGBM to use col-wise histogram build
// desc = Recommend ``force_col_wise=true`` when:
// descl2 = the number of columns is large, or the total number of bin is large
// descl2 = when ``num_threads`` is large, e.g. ``>20``
// descl2 = want to use small ``feature_fraction``, e.g. ``0.5``, to speed-up
// descl2 = want to reduce memory cost
// desc = when both ``force_col_wise`` and ``force_row_wise`` are ``false``, LightGBM will first try both of them, and use the faster one
bool force_col_wise = false;
// desc = set ``force_row_wise=true`` will force LightGBM to use row-wise histogram build
// desc = Recommend ``force_row_wise=true`` when:
// descl2 = the number of data is large, and the number of total bin is relatively small
// descl2 = want to use small ``bagging``, or ``goss``, to speed-up
// descl2 = when ``num_threads`` is relatively small, e.g. ``<=16``
// desc = set ``force_row_wise=true`` will double the memory cost for Dataset object, if your memory is not enough, you can try ``force_col_wise=true``
// desc = when both ``force_col_wise`` and ``force_row_wise`` are ``false``, LightGBM will first try both of them, and use the faster one.
bool force_row_wise = false;
// desc = limit the max depth for tree model. This is used to deal with over-fitting when ``#data`` is small. Tree still grows leaf-wise
// desc = ``<= 0`` means no limit
int max_depth = -1;
......@@ -534,22 +552,6 @@ struct Config {
// desc = **Note**: disabling this may cause the slow training speed for sparse datasets
bool enable_bundle = true;
// check = >=0.0
// check = <1.0
// desc = max conflict rate for bundles in EFB
// desc = set this to ``0.0`` to disallow the conflict and provide more accurate results
// desc = set this to a larger value to achieve faster speed
double max_conflict_rate = 0.0;
// alias = is_sparse, enable_sparse, sparse
// desc = used to enable/disable sparse optimization
bool is_enable_sparse = true;
// check = >0.0
// check = <=1.0
// desc = the threshold of zero elements percentage for treating a feature as a sparse one
double sparse_threshold = 0.8;
// desc = set this to ``false`` to disable the special handle of missing value
bool use_missing = true;
......
......@@ -8,6 +8,7 @@
#include <LightGBM/config.h>
#include <LightGBM/feature_group.h>
#include <LightGBM/meta.h>
#include <LightGBM/utils/array_args.h>
#include <LightGBM/utils/common.h>
#include <LightGBM/utils/openmp_wrapper.h>
#include <LightGBM/utils/random.h>
......@@ -381,6 +382,7 @@ class Dataset {
inline uint64_t NumTotalBin() const {
return group_bin_boundaries_.back();
}
inline std::vector<int> ValidFeatureIndices() const {
std::vector<int> ret;
for (int i = 0; i < num_total_features_; ++i) {
......@@ -394,6 +396,13 @@ class Dataset {
void CopySubset(const Dataset* fullset, const data_size_t* used_indices, data_size_t num_used_indices, bool need_meta_data);
MultiValBin* GetMultiBinFromSparseFeatures() const;
MultiValBin* GetMultiBinFromAllFeatures() const;
MultiValBin* TestMultiThreadingMethod(score_t* gradients, score_t* hessians, const std::vector<int8_t>& is_feature_used, bool is_constant_hessian,
bool force_colwise, bool force_rowwise, bool* is_hist_col_wise) const;
LIGHTGBM_EXPORT void FinishLoad();
LIGHTGBM_EXPORT bool SetFloatField(const char* field_name, const float* field_data, data_size_t num_element);
......@@ -423,15 +432,18 @@ class Dataset {
void ConstructHistograms(const std::vector<int8_t>& is_feature_used,
const data_size_t* data_indices, data_size_t num_data,
int leaf_idx,
std::vector<std::unique_ptr<OrderedBin>>* ordered_bins,
const score_t* gradients, const score_t* hessians,
score_t* ordered_gradients, score_t* ordered_hessians,
bool is_constant_hessian,
HistogramBinEntry* histogram_data) const;
const MultiValBin* multi_val_bin, bool is_colwise,
hist_t* histogram_data) const;
void ConstructHistogramsMultiVal(const MultiValBin* multi_val_bin, const data_size_t* data_indices, data_size_t num_data,
const score_t* gradients, const score_t* hessians,
bool is_constant_hessian,
hist_t* histogram_data) const;
void FixHistogram(int feature_idx, double sum_gradient, double sum_hessian, data_size_t num_data,
HistogramBinEntry* data) const;
void FixHistogram(int feature_idx, double sum_gradient, double sum_hessian, hist_t* data) const;
inline data_size_t Split(int feature,
const uint32_t* threshold, int num_threshold, bool default_left,
......@@ -496,19 +508,10 @@ class Dataset {
return feature_groups_[group]->bin_mappers_[sub_feature].get();
}
inline const Bin* FeatureBin(int i) const {
const int group = feature2group_[i];
return feature_groups_[group]->bin_data_.get();
}
inline const Bin* FeatureGroupBin(int group) const {
return feature_groups_[group]->bin_data_.get();
}
inline bool FeatureGroupIsSparse(int group) const {
return feature_groups_[group]->is_sparse_;
}
inline BinIterator* FeatureIterator(int i) const {
const int group = feature2group_[i];
const int sub_feature = feature2subfeature_[i];
......@@ -519,6 +522,10 @@ class Dataset {
return feature_groups_[group]->FeatureGroupIterator();
}
inline bool IsMultiGroup(int i) const {
return feature_groups_[i]->is_multi_val_;
}
inline double RealThreshold(int i, uint32_t threshold) const {
const int group = feature2group_[i];
const int sub_feature = feature2subfeature_[i];
......@@ -532,18 +539,6 @@ class Dataset {
return feature_groups_[group]->bin_mappers_[sub_feature]->ValueToBin(threshold_double);
}
inline void CreateOrderedBins(std::vector<std::unique_ptr<OrderedBin>>* ordered_bins) const {
ordered_bins->resize(num_groups_);
OMP_INIT_EX();
#pragma omp parallel for schedule(guided)
for (int i = 0; i < num_groups_; ++i) {
OMP_LOOP_EX_BEGIN();
ordered_bins->at(i).reset(feature_groups_[i]->bin_data_->CreateOrderedBin());
OMP_LOOP_EX_END();
}
OMP_THROW_EX();
}
/*!
* \brief Get meta data pointer
* \return Pointer of meta data
......@@ -620,7 +615,7 @@ class Dataset {
/*! \brief Disable copy */
Dataset(const Dataset&) = delete;
void addFeaturesFrom(Dataset* other);
void AddFeaturesFrom(Dataset* other);
private:
std::string data_filename_;
......@@ -638,8 +633,6 @@ class Dataset {
Metadata metadata_;
/*! \brief index of label column */
int label_idx_ = 0;
/*! \brief Threshold for treating a feature as a sparse feature */
double sparse_threshold_;
/*! \brief store feature names */
std::vector<std::string> feature_names_;
/*! \brief store feature names */
......@@ -662,6 +655,8 @@ class Dataset {
bool use_missing_;
bool zero_as_missing_;
std::vector<int> feature_need_push_zeros_;
mutable std::vector<hist_t, Common::AlignmentAllocator<hist_t, kAlignedSize>> hist_buf_;
};
} // namespace LightGBM
......
......@@ -30,14 +30,13 @@ class FeatureGroup {
* \param is_enable_sparse True if enable sparse feature
* \param sparse_threshold Threshold for treating a feature as a sparse feature
*/
FeatureGroup(int num_feature,
FeatureGroup(int num_feature, bool is_multi_val,
std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
data_size_t num_data, double sparse_threshold, bool is_enable_sparse) : num_feature_(num_feature) {
data_size_t num_data) : num_feature_(num_feature), is_multi_val_(is_multi_val), is_sparse_(false) {
CHECK(static_cast<int>(bin_mappers->size()) == num_feature);
// use bin at zero to store most_freq_bin
num_total_bin_ = 1;
bin_offsets_.emplace_back(num_total_bin_);
int cnt_non_zero = 0;
for (int i = 0; i < num_feature_; ++i) {
bin_mappers_.emplace_back(bin_mappers->at(i).release());
auto num_bin = bin_mappers_[i]->num_bin();
......@@ -46,18 +45,26 @@ class FeatureGroup {
}
num_total_bin_ += num_bin;
bin_offsets_.emplace_back(num_total_bin_);
cnt_non_zero += static_cast<int>(num_data * (1.0f - bin_mappers_[i]->sparse_rate()));
}
double sparse_rate = 1.0f - static_cast<double>(cnt_non_zero) / (num_data);
bin_data_.reset(Bin::CreateBin(num_data, num_total_bin_,
sparse_rate, is_enable_sparse, sparse_threshold, &is_sparse_));
if (is_multi_val_) {
multi_bin_data_.clear();
for (int i = 0; i < num_feature_; ++i) {
int addi = bin_mappers_[i]->GetMostFreqBin() == 0 ? 0 : 1;
if (bin_mappers_[i]->sparse_rate() >= kSparseThreshold) {
multi_bin_data_.emplace_back(Bin::CreateSparseBin(num_data, bin_mappers_[i]->num_bin() + addi));
} else {
multi_bin_data_.emplace_back(Bin::CreateDenseBin(num_data, bin_mappers_[i]->num_bin() + addi));
}
}
} else {
bin_data_.reset(Bin::CreateDenseBin(num_data, num_total_bin_));
}
}
FeatureGroup(int num_feature,
std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
data_size_t num_data, bool is_sparse) : num_feature_(num_feature) {
CHECK(static_cast<int>(bin_mappers->size()) == num_feature);
// use bin at zero to store most_freq_bin
FeatureGroup(std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
data_size_t num_data) : num_feature_(1), is_multi_val_(false) {
CHECK(static_cast<int>(bin_mappers->size()) == 1);
// use bin at zero to store default_bin
num_total_bin_ = 1;
bin_offsets_.emplace_back(num_total_bin_);
for (int i = 0; i < num_feature_; ++i) {
......@@ -69,13 +76,15 @@ class FeatureGroup {
num_total_bin_ += num_bin;
bin_offsets_.emplace_back(num_total_bin_);
}
is_sparse_ = is_sparse;
if (is_sparse_) {
if (bin_mappers_[0]->sparse_rate() >= kSparseThreshold) {
is_sparse_ = true;
bin_data_.reset(Bin::CreateSparseBin(num_data, num_total_bin_));
} else {
is_sparse_ = false;
bin_data_.reset(Bin::CreateDenseBin(num_data, num_total_bin_));
}
}
/*!
* \brief Constructor from memory
* \param memory Pointer of memory
......@@ -86,6 +95,8 @@ class FeatureGroup {
const std::vector<data_size_t>& local_used_indices) {
const char* memory_ptr = reinterpret_cast<const char*>(memory);
// get is_sparse
is_multi_val_ = *(reinterpret_cast<const bool*>(memory_ptr));
memory_ptr += sizeof(is_multi_val_);
is_sparse_ = *(reinterpret_cast<const bool*>(memory_ptr));
memory_ptr += sizeof(is_sparse_);
num_feature_ = *(reinterpret_cast<const int*>(memory_ptr));
......@@ -110,13 +121,26 @@ class FeatureGroup {
if (!local_used_indices.empty()) {
num_data = static_cast<data_size_t>(local_used_indices.size());
}
if (is_sparse_) {
bin_data_.reset(Bin::CreateSparseBin(num_data, num_total_bin_));
if (is_multi_val_) {
for (int i = 0; i < num_feature_; ++i) {
int addi = bin_mappers_[i]->GetMostFreqBin() == 0 ? 0 : 1;
if (bin_mappers_[i]->sparse_rate() >= kSparseThreshold) {
multi_bin_data_.emplace_back(Bin::CreateSparseBin(num_data, bin_mappers_[i]->num_bin() + addi));
} else {
multi_bin_data_.emplace_back(Bin::CreateDenseBin(num_data, bin_mappers_[i]->num_bin() + addi));
}
multi_bin_data_.back()->LoadFromMemory(memory_ptr, local_used_indices);
memory_ptr += multi_bin_data_.back()->SizesInByte();
}
} else {
bin_data_.reset(Bin::CreateDenseBin(num_data, num_total_bin_));
if (is_sparse_) {
bin_data_.reset(Bin::CreateSparseBin(num_data, num_total_bin_));
} else {
bin_data_.reset(Bin::CreateDenseBin(num_data, num_total_bin_));
}
// get bin data
bin_data_->LoadFromMemory(memory_ptr, local_used_indices);
}
// get bin data
bin_data_->LoadFromMemory(memory_ptr, local_used_indices);
}
/*! \brief Destructor */
~FeatureGroup() {
......@@ -131,22 +155,54 @@ class FeatureGroup {
inline void PushData(int tid, int sub_feature_idx, data_size_t line_idx, double value) {
uint32_t bin = bin_mappers_[sub_feature_idx]->ValueToBin(value);
if (bin == bin_mappers_[sub_feature_idx]->GetMostFreqBin()) { return; }
bin += bin_offsets_[sub_feature_idx];
if (bin_mappers_[sub_feature_idx]->GetMostFreqBin() == 0) {
bin -= 1;
}
bin_data_->Push(tid, line_idx, bin);
if (is_multi_val_) {
multi_bin_data_[sub_feature_idx]->Push(tid, line_idx, bin + 1);
} else {
bin += bin_offsets_[sub_feature_idx];
bin_data_->Push(tid, line_idx, bin);
}
}
inline void CopySubset(const FeatureGroup* full_feature, const data_size_t* used_indices, data_size_t num_used_indices) {
bin_data_->CopySubset(full_feature->bin_data_.get(), used_indices, num_used_indices);
if (!is_multi_val_) {
bin_data_->CopySubset(full_feature->bin_data_.get(), used_indices, num_used_indices);
} else {
for (int i = 0; i < num_feature_; ++i) {
multi_bin_data_[i]->CopySubset(full_feature->multi_bin_data_[i].get(), used_indices, num_used_indices);
}
}
}
inline BinIterator* SubFeatureIterator(int sub_feature) {
uint32_t min_bin = bin_offsets_[sub_feature];
uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
uint32_t most_freq_bin = bin_mappers_[sub_feature]->GetMostFreqBin();
return bin_data_->GetIterator(min_bin, max_bin, most_freq_bin);
if (!is_multi_val_) {
uint32_t min_bin = bin_offsets_[sub_feature];
uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
return bin_data_->GetIterator(min_bin, max_bin, most_freq_bin);
} else {
int addi = bin_mappers_[sub_feature]->GetMostFreqBin() == 0 ? 0 : 1;
uint32_t min_bin = 1;
uint32_t max_bin = bin_mappers_[sub_feature]->num_bin() - 1 + addi;
return multi_bin_data_[sub_feature]->GetIterator(min_bin, max_bin, most_freq_bin);
}
}
inline void FinishLoad() {
if (is_multi_val_) {
OMP_INIT_EX();
#pragma omp parallel for schedule(guided)
for (int i = 0; i < num_feature_; ++i) {
OMP_LOOP_EX_BEGIN();
multi_bin_data_[i]->FinishLoad();
OMP_LOOP_EX_END();
}
OMP_THROW_EX();
} else {
bin_data_->FinishLoad();
}
}
/*!
......@@ -155,6 +211,9 @@ class FeatureGroup {
* \return A pointer to the BinIterator object
*/
inline BinIterator* FeatureGroupIterator() {
if (is_multi_val_) {
return nullptr;
}
uint32_t min_bin = bin_offsets_[0];
uint32_t max_bin = bin_offsets_.back() - 1;
uint32_t most_freq_bin = 0;
......@@ -168,17 +227,29 @@ class FeatureGroup {
bool default_left,
data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const {
uint32_t min_bin = bin_offsets_[sub_feature];
uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
uint32_t default_bin = bin_mappers_[sub_feature]->GetDefaultBin();
uint32_t most_freq_bin = bin_mappers_[sub_feature]->GetMostFreqBin();
if (bin_mappers_[sub_feature]->bin_type() == BinType::NumericalBin) {
auto missing_type = bin_mappers_[sub_feature]->missing_type();
return bin_data_->Split(min_bin, max_bin, default_bin, most_freq_bin, missing_type, default_left,
*threshold, data_indices, num_data, lte_indices, gt_indices);
if (!is_multi_val_) {
uint32_t min_bin = bin_offsets_[sub_feature];
uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
if (bin_mappers_[sub_feature]->bin_type() == BinType::NumericalBin) {
auto missing_type = bin_mappers_[sub_feature]->missing_type();
return bin_data_->Split(min_bin, max_bin, default_bin, most_freq_bin, missing_type, default_left,
*threshold, data_indices, num_data, lte_indices, gt_indices);
} else {
return bin_data_->SplitCategorical(min_bin, max_bin, most_freq_bin, threshold, num_threshold, data_indices, num_data, lte_indices, gt_indices);
}
} else {
return bin_data_->SplitCategorical(min_bin, max_bin, most_freq_bin, threshold, num_threshold, data_indices, num_data, lte_indices, gt_indices);
int addi = bin_mappers_[sub_feature]->GetMostFreqBin() == 0 ? 0 : 1;
uint32_t min_bin = 1;
uint32_t max_bin = bin_mappers_[sub_feature]->num_bin() - 1 + addi;
if (bin_mappers_[sub_feature]->bin_type() == BinType::NumericalBin) {
auto missing_type = bin_mappers_[sub_feature]->missing_type();
return multi_bin_data_[sub_feature]->Split(min_bin, max_bin, default_bin, most_freq_bin, missing_type, default_left,
*threshold, data_indices, num_data, lte_indices, gt_indices);
} else {
return multi_bin_data_[sub_feature]->SplitCategorical(min_bin, max_bin, most_freq_bin, threshold, num_threshold, data_indices, num_data, lte_indices, gt_indices);
}
}
}
/*!
......@@ -195,22 +266,35 @@ class FeatureGroup {
* \param file File want to write
*/
void SaveBinaryToFile(const VirtualFileWriter* writer) const {
writer->Write(&is_multi_val_, sizeof(is_multi_val_));
writer->Write(&is_sparse_, sizeof(is_sparse_));
writer->Write(&num_feature_, sizeof(num_feature_));
for (int i = 0; i < num_feature_; ++i) {
bin_mappers_[i]->SaveBinaryToFile(writer);
}
bin_data_->SaveBinaryToFile(writer);
if (is_multi_val_) {
for (int i = 0; i < num_feature_; ++i) {
multi_bin_data_[i]->SaveBinaryToFile(writer);
}
} else {
bin_data_->SaveBinaryToFile(writer);
}
}
/*!
* \brief Get sizes in byte of this object
*/
size_t SizesInByte() const {
size_t ret = sizeof(is_sparse_) + sizeof(num_feature_);
size_t ret = sizeof(is_multi_val_) + sizeof(is_sparse_) + sizeof(num_feature_);
for (int i = 0; i < num_feature_; ++i) {
ret += bin_mappers_[i]->SizesInByte();
}
ret += bin_data_->SizesInByte();
if (!is_multi_val_) {
ret += bin_data_->SizesInByte();
} else {
for (int i = 0; i < num_feature_; ++i) {
ret += multi_bin_data_[i]->SizesInByte();
}
}
return ret;
}
/*! \brief Disable copy */
......@@ -218,6 +302,7 @@ class FeatureGroup {
/*! \brief Deep copy */
FeatureGroup(const FeatureGroup& other) {
num_feature_ = other.num_feature_;
is_multi_val_ = other.is_multi_val_;
is_sparse_ = other.is_sparse_;
num_total_bin_ = other.num_total_bin_;
bin_offsets_ = other.bin_offsets_;
......@@ -226,8 +311,14 @@ class FeatureGroup {
for (auto& bin_mapper : other.bin_mappers_) {
bin_mappers_.emplace_back(new BinMapper(*bin_mapper));
}
bin_data_.reset(other.bin_data_->Clone());
if (!is_multi_val_) {
bin_data_.reset(other.bin_data_->Clone());
} else {
multi_bin_data_.clear();
for (int i = 0; i < num_feature_; ++i) {
multi_bin_data_.emplace_back(other.multi_bin_data_[i]->Clone());
}
}
}
private:
......@@ -239,7 +330,9 @@ class FeatureGroup {
std::vector<uint32_t> bin_offsets_;
/*! \brief Bin data of this feature */
std::unique_ptr<Bin> bin_data_;
std::vector<std::unique_ptr<Bin>> multi_bin_data_;
/*! \brief True if this feature is sparse */
bool is_multi_val_;
bool is_sparse_;
int num_total_bin_;
};
......
......@@ -71,8 +71,9 @@ typedef void(*AllgatherFunction)(char* input, comm_size_t input_size, const comm
#define NO_SPECIFIC (-1)
// Prefetch size is usually 64 bytes
const int kCacheLineSize = 64;
const int kAlignedSize = 32;
#define SIZE_ALIGNED(t) ((t) + kAlignedSize - 1) / kAlignedSize * kAlignedSize
} // namespace LightGBM
......
......@@ -213,6 +213,7 @@ class Tree {
void RecomputeMaxDepth();
int NextLeafId() const { return num_leaves_; }
private:
std::string NumericalDecisionIfElse(int node) const;
......
......@@ -71,6 +71,8 @@ class TreeLearner {
virtual void SetBaggingData(const data_size_t* used_indices,
data_size_t num_data) = 0;
virtual bool IsHistColWise() const = 0;
/*!
* \brief Using last trained tree to predict score then adding to out_score;
* \param out_score output score
......
......@@ -11,22 +11,36 @@
#include <limits>
#include <string>
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <functional>
#include <iomanip>
#include <iterator>
#include <map>
#include <memory>
#include <sstream>
#include <type_traits>
#include <unordered_map>
#include <utility>
#include <vector>
// Cross-platform aligned allocation pair (_mm_malloc/_mm_free).
// (The stale `#ifdef _MSC_VER` / `#include "intrin.h"` lines left over from the
// old version created an unterminated conditional and are removed.)
#if defined(_MSC_VER)
#include <malloc.h>
#elif MM_MALLOC
#include <mm_malloc.h>
#elif defined(__GNUC__)
// glibc's memalign has the same contract with swapped argument order.
#include <malloc.h>
#define _mm_malloc(a, b) memalign(b, a)
#define _mm_free(a) free(a)
#else
// Fallback: plain malloc; alignment beyond malloc's default is NOT guaranteed here.
#include <stdlib.h>
#define _mm_malloc(a, b) malloc(a)
#define _mm_free(a) free(a)
#endif
namespace LightGBM {
namespace Common {
......@@ -946,8 +960,133 @@ inline bool CheckAllowedJSON(const std::string& s) {
return true;
}
/*!
 * \brief Round a double to the nearest int (halves round away from zero).
 * \param x value to round
 * \return nearest integer as int
 */
inline int RoundInt(double x) {
  // std::lround handles negative inputs correctly; the previous
  // static_cast<int>(x + 0.5f) truncated toward zero, so e.g. -2.6
  // rounded to -2 instead of -3.
  return static_cast<int>(std::lround(x));
}
/*!
 * \brief STL-compatible allocator returning N-byte aligned storage via
 *        _mm_malloc/_mm_free. Stateless: every instance compares equal, so
 *        storage from one instance may be freed by any other.
 */
template <typename T, std::size_t N = 32>
class AlignmentAllocator {
 public:
  typedef T value_type;
  typedef std::size_t size_type;
  typedef std::ptrdiff_t difference_type;
  typedef T* pointer;
  typedef const T* const_pointer;
  typedef T& reference;
  typedef const T& const_reference;

 public:
  inline AlignmentAllocator() throw () {}

  template <typename T2>
  inline AlignmentAllocator(const AlignmentAllocator<T2, N>&) throw () {}

  inline ~AlignmentAllocator() throw () {}

  // Correct spelling expected by the (pre-C++20) Allocator interface.
  inline pointer address(reference r) {
    return &r;
  }

  inline const_pointer address(const_reference r) const {
    return &r;
  }

  // Misspelled variants kept for backward compatibility with existing callers.
  inline pointer adress(reference r) {
    return &r;
  }

  inline const_pointer adress(const_reference r) const {
    return &r;
  }

  /*! \brief Allocate raw storage for n objects, aligned to N bytes. */
  inline pointer allocate(size_type n) {
    return (pointer)_mm_malloc(n * sizeof(value_type), N);
  }

  /*! \brief Release storage previously obtained from allocate(). */
  inline void deallocate(pointer p, size_type) {
    _mm_free(p);
  }

  /*! \brief Copy-construct an object in already-allocated storage. */
  inline void construct(pointer p, const value_type& wert) {
    new (p) value_type(wert);
  }

  /*! \brief Destroy an object without releasing its storage. */
  inline void destroy(pointer p) {
    p->~value_type();
  }

  inline size_type max_size() const throw () {
    return size_type(-1) / sizeof(value_type);
  }

  template <typename T2>
  struct rebind {
    typedef AlignmentAllocator<T2, N> other;
  };

  bool operator!=(const AlignmentAllocator<T, N>& other) const {
    return !(*this == other);
  }

  // Returns true if and only if storage allocated from *this
  // can be deallocated from other, and vice versa.
  // Always returns true for stateless allocators.
  bool operator==(const AlignmentAllocator<T, N>&) const {
    return true;
  }
};
// Accumulates wall-clock time per named tag; prints all stats on destruction.
// Compiles to no-ops unless TIMETAG is defined.
// Note: this class is not thread-safe, don't use it inside omp blocks
class Timer {
 public:
  Timer() {}
  ~Timer() {
    Print();
  }

#ifdef TIMETAG
  /*! \brief Record the starting time point for the given tag. */
  void Start(const std::string& name) {
    auto cur_time = std::chrono::steady_clock::now();
    start_time_[name] = cur_time;
  }

  /*! \brief Accumulate elapsed time since the matching Start call. */
  void Stop(const std::string& name) {
    if (stats_.find(name) == stats_.end()) {
      stats_[name] = std::chrono::duration<double, std::milli>(0);
    }
    stats_[name] += std::chrono::steady_clock::now() - start_time_[name];
  }
#else
  void Start(const std::string&) { }
  void Stop(const std::string&) { }
#endif  // TIMETAG

  /*! \brief Print accumulated timings in seconds, sorted by tag name. */
  void Print() const {
#ifdef TIMETAG
    // Copy into an ordered map so output is sorted by tag.
    std::map<std::string, std::chrono::duration<double, std::milli>> ordered(stats_.begin(), stats_.end());
    for (auto it = ordered.begin(); it != ordered.end(); ++it) {
      // .count() is required: passing a chrono::duration object through C
      // varargs for %f is undefined behavior; count() yields ms, * 1e-3 -> s.
      Log::Info("%s costs:\t %f ", it->first.c_str(), it->second.count() * 1e-3);
    }
#endif
  }

  std::unordered_map<std::string, std::chrono::steady_clock::time_point> start_time_;
  std::unordered_map<std::string, std::chrono::duration<double, std::milli>> stats_;
};
// RAII scope guard: starts the given timer tag on construction and stops it
// when the scope exits.
// Note: this class is not thread-safe, don't use it inside omp blocks
class FunctionTimer {
 public:
  FunctionTimer(const std::string& name, Timer& timer) : timer_(timer) {
    timer.Start(name);
#ifdef TIMETAG
    // The tag is only stored when timing is enabled; otherwise Stop("") in
    // the destructor is a harmless no-op.
    tag_ = name;
#endif  // TIMETAG
  }

  ~FunctionTimer() { timer_.Stop(tag_); }

 private:
  std::string tag_;
  Timer& timer_;
};
} // namespace Common
extern Common::Timer global_timer;
} // namespace LightGBM
#endif // LightGBM_UTILS_COMMON_FUN_H_
......@@ -27,6 +27,8 @@
namespace LightGBM {
Common::Timer global_timer;
Application::Application(int argc, char** argv) {
LoadParameters(argc, argv);
// set number of threads for openmp
......
......@@ -62,7 +62,7 @@ class Predictor {
boosting_ = boosting;
num_pred_one_row_ = boosting_->NumPredictOneRow(num_iteration, predict_leaf_index, predict_contrib);
num_feature_ = boosting_->MaxFeatureIdx() + 1;
predict_buf_ = std::vector<std::vector<double>>(num_threads_, std::vector<double>(num_feature_, 0.0f));
predict_buf_.resize(num_threads_, std::vector<double, Common::AlignmentAllocator<double, kAlignedSize>>(num_feature_, 0.0f));
const int kFeatureThreshold = 100000;
const size_t KSparseThreshold = static_cast<size_t>(0.01 * num_feature_);
if (predict_leaf_index) {
......@@ -263,7 +263,7 @@ class Predictor {
int num_feature_;
int num_pred_one_row_;
int num_threads_;
std::vector<std::vector<double>> predict_buf_;
std::vector<std::vector<double, Common::AlignmentAllocator<double, kAlignedSize>>> predict_buf_;
};
} // namespace LightGBM
......
......@@ -17,7 +17,6 @@
namespace LightGBM {
GBDT::GBDT() : iter_(0),
train_data_(nullptr),
objective_function_(nullptr),
......@@ -41,6 +40,7 @@ balanced_bagging_(false) {
}
GBDT::~GBDT() {
}
void GBDT::Init(const Config* config, const Dataset* train_data, const ObjectiveFunction* objective_function,
......@@ -148,6 +148,7 @@ void GBDT::AddValidDataset(const Dataset* valid_data,
}
void GBDT::Boosting() {
Common::FunctionTimer fun_timer("GBDT::Boosting", global_timer);
if (objective_function_ == nullptr) {
Log::Fatal("No object function provided");
}
......@@ -208,23 +209,26 @@ data_size_t GBDT::BalancedBaggingHelper(Random* cur_rand, data_size_t start, dat
}
void GBDT::Bagging(int iter) {
Common::FunctionTimer fun_timer("GBDT::Bagging", global_timer);
// if need bagging
if ((bag_data_cnt_ < num_data_ && iter % config_->bagging_freq == 0)
|| need_re_bagging_) {
need_re_bagging_ = false;
const data_size_t min_inner_size = 1000;
data_size_t inner_size = (num_data_ + num_threads_ - 1) / num_threads_;
if (inner_size < min_inner_size) { inner_size = min_inner_size; }
const data_size_t min_inner_size = 1024;
const int n_block = std::min(
num_threads_, (num_data_ + min_inner_size - 1) / min_inner_size);
data_size_t inner_size = SIZE_ALIGNED((num_data_ + n_block - 1) / n_block);
OMP_INIT_EX();
#pragma omp parallel for schedule(static, 1)
for (int i = 0; i < num_threads_; ++i) {
for (int i = 0; i < n_block; ++i) {
OMP_LOOP_EX_BEGIN();
left_cnts_buf_[i] = 0;
right_cnts_buf_[i] = 0;
data_size_t cur_start = i * inner_size;
if (cur_start > num_data_) { continue; }
data_size_t cur_cnt = inner_size;
if (cur_start + cur_cnt > num_data_) { cur_cnt = num_data_ - cur_start; }
data_size_t cur_cnt = std::min(inner_size, num_data_ - cur_start);
if (cur_cnt <= 0) {
left_cnts_buf_[i] = 0;
right_cnts_buf_[i] = 0;
continue;
}
Random cur_rand(config_->bagging_seed + iter * num_threads_ + i);
data_size_t cur_left_count = 0;
if (balanced_bagging_) {
......@@ -241,15 +245,14 @@ void GBDT::Bagging(int iter) {
data_size_t left_cnt = 0;
left_write_pos_buf_[0] = 0;
right_write_pos_buf_[0] = 0;
for (int i = 1; i < num_threads_; ++i) {
for (int i = 1; i < n_block; ++i) {
left_write_pos_buf_[i] = left_write_pos_buf_[i - 1] + left_cnts_buf_[i - 1];
right_write_pos_buf_[i] = right_write_pos_buf_[i - 1] + right_cnts_buf_[i - 1];
}
left_cnt = left_write_pos_buf_[num_threads_ - 1] + left_cnts_buf_[num_threads_ - 1];
left_cnt = left_write_pos_buf_[n_block - 1] + left_cnts_buf_[n_block - 1];
#pragma omp parallel for schedule(static, 1)
for (int i = 0; i < num_threads_; ++i) {
OMP_LOOP_EX_BEGIN();
for (int i = 0; i < n_block; ++i) {
if (left_cnts_buf_[i] > 0) {
std::memcpy(bag_data_indices_.data() + left_write_pos_buf_[i],
tmp_indices_.data() + offsets_buf_[i], left_cnts_buf_[i] * sizeof(data_size_t));
......@@ -258,9 +261,7 @@ void GBDT::Bagging(int iter) {
std::memcpy(bag_data_indices_.data() + left_cnt + right_write_pos_buf_[i],
tmp_indices_.data() + offsets_buf_[i] + left_cnts_buf_[i], right_cnts_buf_[i] * sizeof(data_size_t));
}
OMP_LOOP_EX_END();
}
OMP_THROW_EX();
bag_data_cnt_ = left_cnt;
Log::Debug("Re-bagging, using %d data to train", bag_data_cnt_);
// set bagging data to tree learner
......@@ -276,6 +277,7 @@ void GBDT::Bagging(int iter) {
}
void GBDT::Train(int snapshot_freq, const std::string& model_output_path) {
Common::FunctionTimer fun_timer("GBDT::Train", global_timer);
bool is_finished = false;
auto start_time = std::chrono::steady_clock::now();
for (int iter = 0; iter < config_->num_iterations && !is_finished; ++iter) {
......@@ -342,6 +344,7 @@ double ObtainAutomaticInitialScore(const ObjectiveFunction* fobj, int class_id)
}
double GBDT::BoostFromAverage(int class_id, bool update_scorer) {
Common::FunctionTimer fun_timer("GBDT::BoostFromAverage", global_timer);
// boosting from average label; or customized "average" if implemented for the current objective
if (models_.empty() && !train_score_updater_->has_init_score() && objective_function_ != nullptr) {
if (config_->boost_from_average || (train_data_ != nullptr && train_data_->num_features() == 0)) {
......@@ -366,6 +369,7 @@ double GBDT::BoostFromAverage(int class_id, bool update_scorer) {
}
bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) {
Common::FunctionTimer fun_timer("GBDT::TrainOneIter", global_timer);
std::vector<double> init_scores(num_tree_per_iteration_, 0.0);
// boosting first
if (gradients == nullptr || hessians == nullptr) {
......@@ -486,6 +490,7 @@ bool GBDT::EvalAndCheckEarlyStopping() {
}
void GBDT::UpdateScore(const Tree* tree, const int cur_tree_id) {
Common::FunctionTimer fun_timer("GBDT::UpdateScore", global_timer);
// update training score
if (!is_use_subset_) {
train_score_updater_->AddScore(tree_learner_.get(), tree, cur_tree_id);
......@@ -755,17 +760,10 @@ void GBDT::ResetBaggingConfig(const Config* config, bool is_change_dataset) {
right_write_pos_buf_.resize(num_threads_);
double average_bag_rate = (bag_data_cnt_ / num_data_) / config->bagging_freq;
int sparse_group = 0;
for (int i = 0; i < train_data_->num_feature_groups(); ++i) {
if (train_data_->FeatureGroupIsSparse(i)) {
++sparse_group;
}
}
is_use_subset_ = false;
const int group_threshold_usesubset = 100;
const int sparse_group_threshold_usesubset = train_data_->num_feature_groups() / 4;
if (average_bag_rate <= 0.5
&& (train_data_->num_feature_groups() < group_threshold_usesubset || sparse_group < sparse_group_threshold_usesubset)) {
if (tree_learner_->IsHistColWise() && average_bag_rate <= 0.5
&& (train_data_->num_feature_groups() < group_threshold_usesubset)) {
if (tmp_subset_ == nullptr || is_change_dataset) {
tmp_subset_.reset(new Dataset(bag_data_cnt_));
tmp_subset_->CopyFeatureMapperFrom(train_data_);
......
......@@ -457,11 +457,11 @@ class GBDT : public GBDTBase {
/*! \brief Max feature index of training data*/
int max_feature_idx_;
/*! \brief First order derivative of training data */
std::vector<score_t> gradients_;
std::vector<score_t, Common::AlignmentAllocator<score_t, kAlignedSize>> gradients_;
/*! \brief Secend order derivative of training data */
std::vector<score_t> hessians_;
std::vector<score_t, Common::AlignmentAllocator<score_t, kAlignedSize>> hessians_;
/*! \brief Store the indices of in-bag data */
std::vector<data_size_t> bag_data_indices_;
std::vector<data_size_t, Common::AlignmentAllocator<data_size_t, kAlignedSize>> bag_data_indices_;
/*! \brief Number of in-bag data */
data_size_t bag_data_cnt_;
/*! \brief Store the indices of in-bag data */
......
......@@ -22,10 +22,6 @@
namespace LightGBM {
#ifdef TIMETAG
std::chrono::duration<double, std::milli> subset_time;
std::chrono::duration<double, std::milli> re_init_tree_time;
#endif
class GOSS: public GBDT {
public:
......@@ -36,10 +32,7 @@ class GOSS: public GBDT {
}
~GOSS() {
#ifdef TIMETAG
Log::Info("GOSS::subset costs %f", subset_time * 1e-3);
Log::Info("GOSS::re_init_tree costs %f", re_init_tree_time * 1e-3);
#endif
}
void Init(const Config* config, const Dataset* train_data, const ObjectiveFunction* objective_function,
......@@ -143,19 +136,21 @@ class GOSS: public GBDT {
// not subsample for first iterations
if (iter < static_cast<int>(1.0f / config_->learning_rate)) { return; }
const data_size_t min_inner_size = 100;
data_size_t inner_size = (num_data_ + num_threads_ - 1) / num_threads_;
if (inner_size < min_inner_size) { inner_size = min_inner_size; }
const data_size_t min_inner_size = 128;
const int n_block = std::min(
num_threads_, (num_data_ + min_inner_size - 1) / min_inner_size);
data_size_t inner_size = SIZE_ALIGNED((num_data_ + n_block - 1) / n_block);
OMP_INIT_EX();
#pragma omp parallel for schedule(static, 1)
for (int i = 0; i < num_threads_; ++i) {
for (int i = 0; i < n_block; ++i) {
OMP_LOOP_EX_BEGIN();
left_cnts_buf_[i] = 0;
right_cnts_buf_[i] = 0;
data_size_t cur_start = i * inner_size;
if (cur_start > num_data_) { continue; }
data_size_t cur_cnt = inner_size;
if (cur_start + cur_cnt > num_data_) { cur_cnt = num_data_ - cur_start; }
data_size_t cur_cnt = std::min(inner_size, num_data_ - cur_start);
if (cur_cnt <= 0) {
left_cnts_buf_[i] = 0;
right_cnts_buf_[i] = 0;
continue;
}
Random cur_rand(config_->bagging_seed + iter * num_threads_ + i);
data_size_t cur_left_count = BaggingHelper(&cur_rand, cur_start, cur_cnt,
tmp_indices_.data() + cur_start, tmp_indice_right_.data() + cur_start);
......@@ -168,14 +163,14 @@ class GOSS: public GBDT {
data_size_t left_cnt = 0;
left_write_pos_buf_[0] = 0;
right_write_pos_buf_[0] = 0;
for (int i = 1; i < num_threads_; ++i) {
for (int i = 1; i < n_block; ++i) {
left_write_pos_buf_[i] = left_write_pos_buf_[i - 1] + left_cnts_buf_[i - 1];
right_write_pos_buf_[i] = right_write_pos_buf_[i - 1] + right_cnts_buf_[i - 1];
}
left_cnt = left_write_pos_buf_[num_threads_ - 1] + left_cnts_buf_[num_threads_ - 1];
left_cnt = left_write_pos_buf_[n_block - 1] + left_cnts_buf_[n_block - 1];
#pragma omp parallel for schedule(static, 1)
for (int i = 0; i < num_threads_; ++i) {
for (int i = 0; i < n_block; ++i) {
OMP_LOOP_EX_BEGIN();
if (left_cnts_buf_[i] > 0) {
std::memcpy(bag_data_indices_.data() + left_write_pos_buf_[i],
......@@ -193,22 +188,10 @@ class GOSS: public GBDT {
if (!is_use_subset_) {
tree_learner_->SetBaggingData(bag_data_indices_.data(), bag_data_cnt_);
} else {
// get subset
#ifdef TIMETAG
auto start_time = std::chrono::steady_clock::now();
#endif
tmp_subset_->ReSize(bag_data_cnt_);
tmp_subset_->CopySubset(train_data_, bag_data_indices_.data(), bag_data_cnt_, false);
#ifdef TIMETAG
subset_time += std::chrono::steady_clock::now() - start_time;
#endif
#ifdef TIMETAG
start_time = std::chrono::steady_clock::now();
#endif
tree_learner_->ResetTrainingData(tmp_subset_.get());
#ifdef TIMETAG
re_init_tree_time += std::chrono::steady_clock::now() - start_time;
#endif
}
}
......
......@@ -55,6 +55,7 @@ class ScoreUpdater {
inline bool has_init_score() const { return has_init_score_; }
inline void AddScore(double val, int cur_tree_id) {
Common::FunctionTimer fun_timer("ScoreUpdater::AddScore", global_timer);
const size_t offset = static_cast<size_t>(num_data_) * cur_tree_id;
#pragma omp parallel for schedule(static)
for (int i = 0; i < num_data_; ++i) {
......@@ -76,6 +77,7 @@ class ScoreUpdater {
* \param cur_tree_id Current tree for multiclass training
*/
inline void AddScore(const Tree* tree, int cur_tree_id) {
Common::FunctionTimer fun_timer("ScoreUpdater::AddScore", global_timer);
const size_t offset = static_cast<size_t>(num_data_) * cur_tree_id;
tree->AddPredictionToScore(data_, num_data_, score_.data() + offset);
}
......@@ -87,6 +89,7 @@ class ScoreUpdater {
* \param cur_tree_id Current tree for multiclass training
*/
inline void AddScore(const TreeLearner* tree_learner, const Tree* tree, int cur_tree_id) {
Common::FunctionTimer fun_timer("ScoreUpdater::AddScore", global_timer);
const size_t offset = static_cast<size_t>(num_data_) * cur_tree_id;
tree_learner->AddPredictionToScore(tree, score_.data() + offset);
}
......@@ -100,6 +103,7 @@ class ScoreUpdater {
*/
inline void AddScore(const Tree* tree, const data_size_t* data_indices,
data_size_t data_cnt, int cur_tree_id) {
Common::FunctionTimer fun_timer("ScoreUpdater::AddScore", global_timer);
const size_t offset = static_cast<size_t>(num_data_) * cur_tree_id;
tree->AddPredictionToScore(data_, data_indices, data_cnt, score_.data() + offset);
}
......@@ -119,7 +123,7 @@ class ScoreUpdater {
/*! \brief Pointer of data set */
const Dataset* data_;
/*! \brief Scores for data set */
std::vector<double> score_;
std::vector<double, Common::AlignmentAllocator<double, kAlignedSize>> score_;
bool has_init_score_;
};
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment