"src/vscode:/vscode.git/clone" did not exist on "0f0eb69e3b86cfd2ca159a628654cf1f907e4cb3"
Unverified Commit d0bec9e9 authored by Guolin Ke's avatar Guolin Ke Committed by GitHub
Browse files

speed up multi-val bin subset for bagging (#2827)

* speed up multi-val bin subset for bagging

* remove the duplicated codes

* code refine

* some codes refactoring

* move `is_constant_hessian` into `TrainingShareStates`

* refine

* fix bug

* fix bug when num_groups_ < 0

* fix gpu

* fix gpu bagging

* fix gpu bug

* typo

* Update src/treelearner/serial_tree_learner.h
parent 0aa7bfee
......@@ -303,7 +303,7 @@ class Bin {
virtual void Push(int tid, data_size_t idx, uint32_t value) = 0;
virtual void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) = 0;
virtual void CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) = 0;
/*!
* \brief Get bin iterator of this bin for specific feature
* \param min_bin min_bin of current used feature
......@@ -453,23 +453,34 @@ class MultiValBin {
virtual int32_t num_bin() const = 0;
virtual void ReSize(data_size_t num_data) = 0;
virtual double num_element_per_row() const = 0;
virtual void PushOneRow(int tid, data_size_t idx, const std::vector<uint32_t>& values) = 0;
virtual void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) = 0;
virtual void PushOneRow(int tid, data_size_t idx, const std::vector<uint32_t>& values) = 0;
virtual void ReSizeForSubFeature(int num_bin, int num_feature,
double estimate_element_per_row) = 0;
virtual void CopySubrow(const MultiValBin* full_bin,
const data_size_t* used_indices,
data_size_t num_used_indices) = 0;
virtual MultiValBin* CreateLike(int num_bin, int num_feature,
virtual MultiValBin* CreateLike(data_size_t num_data, int num_bin,
int num_feature,
double estimate_element_per_row) const = 0;
virtual void CopySubFeature(const MultiValBin* full_bin,
const std::vector<int>& used_feature_index,
const std::vector<uint32_t>& lower,
const std::vector<uint32_t>& upper,
const std::vector<uint32_t>& delta) = 0;
virtual void CopySubcol(const MultiValBin* full_bin,
const std::vector<int>& used_feature_index,
const std::vector<uint32_t>& lower,
const std::vector<uint32_t>& upper,
const std::vector<uint32_t>& delta) = 0;
virtual void ReSize(data_size_t num_data, int num_bin, int num_feature,
double estimate_element_per_row) = 0;
virtual void CopySubrowAndSubcol(
const MultiValBin* full_bin, const data_size_t* used_indices,
data_size_t num_used_indices, const std::vector<int>& used_feature_index,
const std::vector<uint32_t>& lower, const std::vector<uint32_t>& upper,
const std::vector<uint32_t>& delta) = 0;
virtual void ConstructHistogram(
const data_size_t* data_indices, data_size_t start, data_size_t end,
......
......@@ -276,16 +276,22 @@ class Parser {
static Parser* CreateParser(const char* filename, bool header, int num_features, int label_idx);
};
struct TrainingTempState {
std::vector<hist_t, Common::AlignmentAllocator<hist_t, kAlignedSize>>
hist_buf;
struct TrainingShareStates {
bool is_colwise = true;
bool is_use_subcol = false;
bool is_use_subrow = false;
bool is_subrow_copied = false;
bool is_constant_hessian = true;
const data_size_t* bagging_use_indices;
data_size_t bagging_indices_cnt;
int num_bin_aligned;
bool use_subfeature;
std::unique_ptr<MultiValBin> multi_val_bin;
std::unique_ptr<MultiValBin> multi_val_bin_subfeature;
std::unique_ptr<MultiValBin> multi_val_bin_subset;
std::vector<uint32_t> hist_move_src;
std::vector<uint32_t> hist_move_dest;
std::vector<uint32_t> hist_move_size;
std::vector<hist_t, Common::AlignmentAllocator<hist_t, kAlignedSize>>
hist_buf;
void SetMultiValBin(MultiValBin* bin) {
if (bin == nullptr) {
......@@ -302,14 +308,14 @@ struct TrainingTempState {
}
hist_t* TempBuf() {
if (!use_subfeature) {
if (!is_use_subcol) {
return nullptr;
}
return hist_buf.data() + hist_buf.size() - num_bin_aligned * 2;
}
void HistMove(const hist_t* src, hist_t* dest) {
if (!use_subfeature) {
if (!is_use_subcol) {
return;
}
#pragma omp parallel for schedule(static)
......@@ -436,16 +442,16 @@ class Dataset {
}
void ReSize(data_size_t num_data);
void CopySubset(const Dataset* fullset, const data_size_t* used_indices, data_size_t num_used_indices, bool need_meta_data);
void CopySubrow(const Dataset* fullset, const data_size_t* used_indices, data_size_t num_used_indices, bool need_meta_data);
MultiValBin* GetMultiBinFromSparseFeatures() const;
MultiValBin* GetMultiBinFromAllFeatures() const;
TrainingTempState* TestMultiThreadingMethod(
score_t* gradients, score_t* hessians,
const std::vector<int8_t>& is_feature_used, bool is_constant_hessian,
bool force_colwise, bool force_rowwise, bool* is_hist_col_wise) const;
TrainingShareStates* GetShareStates(
score_t* gradients, score_t* hessians,
const std::vector<int8_t>& is_feature_used, bool is_constant_hessian,
bool force_colwise, bool force_rowwise) const;
LIGHTGBM_EXPORT void FinishLoad();
......@@ -473,23 +479,21 @@ class Dataset {
LIGHTGBM_EXPORT void CreateValid(const Dataset* dataset);
void InitTrain(const std::vector<int8_t>& is_feature_used,
bool is_colwise,
TrainingTempState* temp_state) const;
TrainingShareStates* share_state) const;
void ConstructHistograms(const std::vector<int8_t>& is_feature_used,
const data_size_t* data_indices,
data_size_t num_data, const score_t* gradients,
const score_t* hessians, score_t* ordered_gradients,
score_t* ordered_hessians, bool is_constant_hessian,
bool is_colwise, TrainingTempState* temp_state,
score_t* ordered_hessians,
TrainingShareStates* share_state,
hist_t* histogram_data) const;
void ConstructHistogramsMultiVal(const data_size_t* data_indices,
data_size_t num_data,
const score_t* gradients,
const score_t* hessians,
bool is_constant_hessian,
TrainingTempState* temp_state,
TrainingShareStates* share_state,
hist_t* histogram_data) const;
void FixHistogram(int feature_idx, double sum_gradient, double sum_hessian, hist_t* data) const;
......
......@@ -174,12 +174,12 @@ class FeatureGroup {
}
}
inline void CopySubset(const FeatureGroup* full_feature, const data_size_t* used_indices, data_size_t num_used_indices) {
inline void CopySubrow(const FeatureGroup* full_feature, const data_size_t* used_indices, data_size_t num_used_indices) {
if (!is_multi_val_) {
bin_data_->CopySubset(full_feature->bin_data_.get(), used_indices, num_used_indices);
bin_data_->CopySubrow(full_feature->bin_data_.get(), used_indices, num_used_indices);
} else {
for (int i = 0; i < num_feature_; ++i) {
multi_bin_data_[i]->CopySubset(full_feature->multi_bin_data_[i].get(), used_indices, num_used_indices);
multi_bin_data_[i]->CopySubrow(full_feature->multi_bin_data_[i].get(), used_indices, num_used_indices);
}
}
}
......
......@@ -37,7 +37,10 @@ class TreeLearner {
*/
virtual void Init(const Dataset* train_data, bool is_constant_hessian) = 0;
virtual void ResetTrainingData(const Dataset* train_data) = 0;
virtual void ResetIsConstantHessian(bool is_constant_hessian) = 0;
virtual void ResetTrainingData(const Dataset* train_data,
bool is_constant_hessian) = 0;
/*!
* \brief Reset tree configs
......@@ -52,7 +55,7 @@ class TreeLearner {
* \param is_constant_hessian True if all hessians share the same value
* \return A trained tree
*/
virtual Tree* Train(const score_t* gradients, const score_t* hessians, bool is_constant_hessian,
virtual Tree* Train(const score_t* gradients, const score_t* hessians,
const Json& forced_split_json) = 0;
/*!
......@@ -65,13 +68,13 @@ class TreeLearner {
/*!
* \brief Set bagging data
* \param subset subset of bagging
* \param used_indices Used data indices
* \param num_data Number of used data
*/
virtual void SetBaggingData(const data_size_t* used_indices,
data_size_t num_data) = 0;
virtual bool IsHistColWise() const = 0;
virtual void SetBaggingData(const Dataset* subset,
const data_size_t* used_indices,
data_size_t num_data) = 0;
/*!
* \brief Using last trained tree to predict score then adding to out_score;
......
......@@ -231,13 +231,14 @@ void GBDT::Bagging(int iter) {
Log::Debug("Re-bagging, using %d data to train", bag_data_cnt_);
// set bagging data to tree learner
if (!is_use_subset_) {
tree_learner_->SetBaggingData(bag_data_indices_.data(), bag_data_cnt_);
tree_learner_->SetBaggingData(nullptr, bag_data_indices_.data(), bag_data_cnt_);
} else {
// get subset
tmp_subset_->ReSize(bag_data_cnt_);
tmp_subset_->CopySubset(train_data_, bag_data_indices_.data(),
tmp_subset_->CopySubrow(train_data_, bag_data_indices_.data(),
bag_data_cnt_, false);
tree_learner_->ResetTrainingData(tmp_subset_.get());
tree_learner_->SetBaggingData(tmp_subset_.get(), bag_data_indices_.data(),
bag_data_cnt_);
}
}
}
......@@ -365,7 +366,7 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) {
grad = gradients_.data() + offset;
hess = hessians_.data() + offset;
}
new_tree.reset(tree_learner_->Train(grad, hess, is_constant_hessian_, forced_splits_json_));
new_tree.reset(tree_learner_->Train(grad, hess, forced_splits_json_));
}
if (new_tree->num_leaves() > 1) {
......@@ -693,8 +694,10 @@ void GBDT::ResetTrainingData(const Dataset* train_data, const ObjectiveFunction*
feature_names_ = train_data_->feature_names();
feature_infos_ = train_data_->feature_infos();
tree_learner_->ResetTrainingData(train_data);
tree_learner_->ResetTrainingData(train_data, is_constant_hessian_);
ResetBaggingConfig(config_.get(), true);
} else {
tree_learner_->ResetIsConstantHessian(is_constant_hessian_);
}
}
......@@ -750,7 +753,7 @@ void GBDT::ResetBaggingConfig(const Config* config, bool is_change_dataset) {
(static_cast<double>(bag_data_cnt_) / num_data_) / config->bagging_freq;
is_use_subset_ = false;
const int group_threshold_usesubset = 100;
if (tree_learner_->IsHistColWise() && average_bag_rate <= 0.5
if (average_bag_rate <= 0.5
&& (train_data_->num_feature_groups() < group_threshold_usesubset)) {
if (tmp_subset_ == nullptr || is_change_dataset) {
tmp_subset_.reset(new Dataset(bag_data_cnt_));
......
......@@ -125,8 +125,7 @@ class RF : public GBDT {
hess = tmp_hess_.data();
}
new_tree.reset(tree_learner_->Train(grad, hess, is_constant_hessian_,
forced_splits_json_));
new_tree.reset(tree_learner_->Train(grad, hess, forced_splits_json_));
}
if (new_tree->num_leaves() > 1) {
......
......@@ -1059,7 +1059,7 @@ int LGBM_DatasetGetSubset(
}
auto ret = std::unique_ptr<Dataset>(new Dataset(num_used_row_indices));
ret->CopyFeatureMapperFrom(full_dataset);
ret->CopySubset(full_dataset, used_row_indices, num_used_row_indices, true);
ret->CopySubrow(full_dataset, used_row_indices, num_used_row_indices, true);
*out = ret.release();
API_END();
}
......
......@@ -586,10 +586,10 @@ MultiValBin* Dataset::GetMultiBinFromAllFeatures() const {
return ret.release();
}
TrainingTempState* Dataset::TestMultiThreadingMethod(
TrainingShareStates* Dataset::GetShareStates(
score_t* gradients, score_t* hessians,
const std::vector<int8_t>& is_feature_used, bool is_constant_hessian,
bool force_colwise, bool force_rowwise, bool* is_hist_col_wise) const {
bool force_colwise, bool force_rowwise) const {
Common::FunctionTimer fun_timer("Dataset::TestMultiThreadingMethod",
global_timer);
if (force_colwise && force_rowwise) {
......@@ -598,25 +598,30 @@ TrainingTempState* Dataset::TestMultiThreadingMethod(
"the same time");
}
if (num_groups_ <= 0) {
return nullptr;
TrainingShareStates* share_state = new TrainingShareStates();
share_state->is_colwise = true;
share_state->is_constant_hessian = is_constant_hessian;
return share_state;
}
if (force_colwise) {
*is_hist_col_wise = true;
TrainingTempState* temp_state = new TrainingTempState();
temp_state->SetMultiValBin(GetMultiBinFromSparseFeatures());
return temp_state;
TrainingShareStates* share_state = new TrainingShareStates();
share_state->SetMultiValBin(GetMultiBinFromSparseFeatures());
share_state->is_colwise = true;
share_state->is_constant_hessian = is_constant_hessian;
return share_state;
} else if (force_rowwise) {
*is_hist_col_wise = false;
TrainingTempState* temp_state = new TrainingTempState();
temp_state->SetMultiValBin(GetMultiBinFromAllFeatures());
return temp_state;
TrainingShareStates* share_state = new TrainingShareStates();
share_state->SetMultiValBin(GetMultiBinFromAllFeatures());
share_state->is_colwise = false;
share_state->is_constant_hessian = is_constant_hessian;
return share_state;
} else {
std::unique_ptr<MultiValBin> sparse_bin;
std::unique_ptr<MultiValBin> all_bin;
std::unique_ptr<TrainingTempState> colwise_state;
std::unique_ptr<TrainingTempState> rowwise_state;
colwise_state.reset(new TrainingTempState());
rowwise_state.reset(new TrainingTempState());
std::unique_ptr<TrainingShareStates> colwise_state;
std::unique_ptr<TrainingShareStates> rowwise_state;
colwise_state.reset(new TrainingShareStates());
rowwise_state.reset(new TrainingShareStates());
std::chrono::duration<double, std::milli> col_wise_init_time,
row_wise_init_time;
......@@ -633,23 +638,25 @@ TrainingTempState* Dataset::TestMultiThreadingMethod(
Log::Debug(
"init for col-wise cost %f seconds, init for row-wise cost %f seconds",
col_wise_init_time * 1e-3, row_wise_init_time * 1e-3);
InitTrain(is_feature_used, true, colwise_state.get());
InitTrain(is_feature_used, false, rowwise_state.get());
colwise_state->is_colwise = true;
colwise_state->is_constant_hessian = is_constant_hessian;
InitTrain(is_feature_used, colwise_state.get());
rowwise_state->is_colwise = false;
rowwise_state->is_constant_hessian = is_constant_hessian;
InitTrain(is_feature_used, rowwise_state.get());
std::chrono::duration<double, std::milli> col_wise_time, row_wise_time;
start_time = std::chrono::steady_clock::now();
ConstructHistograms(is_feature_used, nullptr, num_data_, gradients,
hessians, gradients, hessians, is_constant_hessian,
true, colwise_state.get(), hist_data.data());
hessians, gradients, hessians, colwise_state.get(),
hist_data.data());
col_wise_time = std::chrono::steady_clock::now() - start_time;
start_time = std::chrono::steady_clock::now();
ConstructHistogramsMultiVal(nullptr, num_data_, gradients, hessians,
is_constant_hessian, rowwise_state.get(),
hist_data.data());
rowwise_state.get(), hist_data.data());
row_wise_time = std::chrono::steady_clock::now() - start_time;
Log::Debug("col-wise cost %f seconds, row-wise cost %f seconds",
col_wise_time * 1e-3, row_wise_time * 1e-3);
if (col_wise_time < row_wise_time) {
*is_hist_col_wise = true;
auto overhead_cost = row_wise_init_time + row_wise_time + col_wise_time;
Log::Warning(
"Auto-choosing col-wise multi-threading, the overhead of testing was "
......@@ -658,7 +665,6 @@ TrainingTempState* Dataset::TestMultiThreadingMethod(
overhead_cost * 1e-3);
return colwise_state.release();
} else {
*is_hist_col_wise = false;
auto overhead_cost = col_wise_init_time + row_wise_time + col_wise_time;
Log::Warning(
"Auto-choosing row-wise multi-threading, the overhead of testing was "
......@@ -765,7 +771,7 @@ void Dataset::ReSize(data_size_t num_data) {
}
}
void Dataset::CopySubset(const Dataset* fullset,
void Dataset::CopySubrow(const Dataset* fullset,
const data_size_t* used_indices,
data_size_t num_used_indices, bool need_meta_data) {
CHECK(num_used_indices == num_data_);
......@@ -773,7 +779,7 @@ void Dataset::CopySubset(const Dataset* fullset,
#pragma omp parallel for schedule(static)
for (int group = 0; group < num_groups_; ++group) {
OMP_LOOP_EX_BEGIN();
feature_groups_[group]->CopySubset(fullset->feature_groups_[group].get(),
feature_groups_[group]->CopySubrow(fullset->feature_groups_[group].get(),
used_indices, num_used_indices);
OMP_LOOP_EX_END();
}
......@@ -1037,13 +1043,13 @@ void Dataset::DumpTextFile(const char* text_filename) {
}
void Dataset::InitTrain(const std::vector<int8_t>& is_feature_used,
bool is_colwise, TrainingTempState* temp_state) const {
TrainingShareStates* share_state) const {
Common::FunctionTimer fun_time("Dataset::InitTrain", global_timer);
temp_state->use_subfeature = false;
if (temp_state->multi_val_bin == nullptr) {
share_state->is_use_subcol = false;
if (share_state->multi_val_bin == nullptr) {
return;
}
global_timer.Start("Dataset::InitTrain.Prep");
const auto multi_val_bin = share_state->multi_val_bin.get();
double sum_used_dense_ratio = 0.0;
double sum_dense_ratio = 0.0;
int num_used = 0;
......@@ -1063,7 +1069,7 @@ void Dataset::InitTrain(const std::vector<int8_t>& is_feature_used,
sum_dense_ratio += dense_rate;
++total;
}
} else if (!is_colwise) {
} else if (!share_state->is_colwise) {
bool is_group_used = false;
double dense_rate = 0;
for (int j = 0; j < feature_groups_[i]->num_feature_; ++j) {
......@@ -1081,101 +1087,124 @@ void Dataset::InitTrain(const std::vector<int8_t>& is_feature_used,
++total;
}
}
global_timer.Stop("Dataset::InitTrain.Prep");
const double k_subfeature_threshold = 0.6;
if (sum_used_dense_ratio >= sum_dense_ratio * k_subfeature_threshold) {
return;
}
temp_state->use_subfeature = true;
global_timer.Start("Dataset::InitTrain.Prep");
std::vector<uint32_t> upper_bound;
std::vector<uint32_t> lower_bound;
std::vector<uint32_t> delta;
temp_state->hist_move_src.clear();
temp_state->hist_move_dest.clear();
temp_state->hist_move_size.clear();
// only need to copy subset
if (share_state->is_use_subrow && !share_state->is_subrow_copied) {
if (share_state->multi_val_bin_subset == nullptr) {
share_state->multi_val_bin_subset.reset(multi_val_bin->CreateLike(
share_state->bagging_indices_cnt, multi_val_bin->num_bin(), total,
multi_val_bin->num_element_per_row()));
} else {
share_state->multi_val_bin_subset->ReSize(
share_state->bagging_indices_cnt, multi_val_bin->num_bin(), total,
multi_val_bin->num_element_per_row());
}
share_state->multi_val_bin_subset->CopySubrow(
multi_val_bin, share_state->bagging_use_indices,
share_state->bagging_indices_cnt);
// avoid to copy subset many times
share_state->is_subrow_copied = true;
}
} else {
share_state->is_use_subcol = true;
std::vector<uint32_t> upper_bound;
std::vector<uint32_t> lower_bound;
std::vector<uint32_t> delta;
share_state->hist_move_src.clear();
share_state->hist_move_dest.clear();
share_state->hist_move_size.clear();
int num_total_bin = 1;
int new_num_total_bin = 1;
int num_total_bin = 1;
int new_num_total_bin = 1;
for (int i = 0; i < num_groups_; ++i) {
int f_start = group_feature_start_[i];
if (feature_groups_[i]->is_multi_val_) {
for (int j = 0; j < feature_groups_[i]->num_feature_; ++j) {
const auto& bin_mapper = feature_groups_[i]->bin_mappers_[j];
int cur_num_bin = bin_mapper->num_bin();
if (bin_mapper->GetMostFreqBin() == 0) {
cur_num_bin -= 1;
for (int i = 0; i < num_groups_; ++i) {
int f_start = group_feature_start_[i];
if (feature_groups_[i]->is_multi_val_) {
for (int j = 0; j < feature_groups_[i]->num_feature_; ++j) {
const auto& bin_mapper = feature_groups_[i]->bin_mappers_[j];
int cur_num_bin = bin_mapper->num_bin();
if (bin_mapper->GetMostFreqBin() == 0) {
cur_num_bin -= 1;
}
num_total_bin += cur_num_bin;
if (is_feature_used[f_start + j]) {
new_num_total_bin += cur_num_bin;
lower_bound.push_back(num_total_bin - cur_num_bin);
upper_bound.push_back(num_total_bin);
share_state->hist_move_src.push_back(
(new_num_total_bin - cur_num_bin) * 2);
share_state->hist_move_dest.push_back((num_total_bin - cur_num_bin) *
2);
share_state->hist_move_size.push_back(cur_num_bin * 2);
delta.push_back(num_total_bin - new_num_total_bin);
}
}
} else if (!share_state->is_colwise) {
bool is_group_used = false;
for (int j = 0; j < feature_groups_[i]->num_feature_; ++j) {
if (is_feature_used[f_start + j]) {
is_group_used = true;
break;
}
}
int cur_num_bin = feature_groups_[i]->bin_offsets_.back() - 1;
num_total_bin += cur_num_bin;
if (is_feature_used[f_start + j]) {
if (is_group_used) {
new_num_total_bin += cur_num_bin;
lower_bound.push_back(num_total_bin - cur_num_bin);
upper_bound.push_back(num_total_bin);
temp_state->hist_move_src.push_back(
share_state->hist_move_src.push_back(
(new_num_total_bin - cur_num_bin) * 2);
temp_state->hist_move_dest.push_back(
(num_total_bin - cur_num_bin) * 2);
temp_state->hist_move_size.push_back(cur_num_bin * 2);
share_state->hist_move_dest.push_back((num_total_bin - cur_num_bin) *
2);
share_state->hist_move_size.push_back(cur_num_bin * 2);
delta.push_back(num_total_bin - new_num_total_bin);
}
}
} else if (!is_colwise) {
bool is_group_used = false;
for (int j = 0; j < feature_groups_[i]->num_feature_; ++j) {
if (is_feature_used[f_start + j]) {
is_group_used = true;
break;
}
}
int cur_num_bin = feature_groups_[i]->bin_offsets_.back() - 1;
num_total_bin += cur_num_bin;
if (is_group_used) {
new_num_total_bin += cur_num_bin;
lower_bound.push_back(num_total_bin - cur_num_bin);
upper_bound.push_back(num_total_bin);
temp_state->hist_move_src.push_back(
(new_num_total_bin - cur_num_bin) * 2);
temp_state->hist_move_dest.push_back((num_total_bin - cur_num_bin) * 2);
temp_state->hist_move_size.push_back(cur_num_bin * 2);
delta.push_back(num_total_bin - new_num_total_bin);
}
}
// avoid out of range
lower_bound.push_back(num_total_bin);
upper_bound.push_back(num_total_bin);
data_size_t num_data =
share_state->is_use_subrow ? share_state->bagging_indices_cnt : num_data_;
if (share_state->multi_val_bin_subset == nullptr) {
share_state->multi_val_bin_subset.reset(multi_val_bin->CreateLike(
num_data, new_num_total_bin, num_used, sum_used_dense_ratio));
} else {
share_state->multi_val_bin_subset->ReSize(num_data, new_num_total_bin,
num_used, sum_used_dense_ratio);
}
if (share_state->is_use_subrow) {
share_state->multi_val_bin_subset->CopySubrowAndSubcol(
multi_val_bin, share_state->bagging_use_indices,
share_state->bagging_indices_cnt, used_feature_index, lower_bound,
upper_bound, delta);
// may need to recopy subset
share_state->is_subrow_copied = false;
} else {
share_state->multi_val_bin_subset->CopySubcol(
multi_val_bin, used_feature_index, lower_bound, upper_bound, delta);
}
}
// avoid out of range
lower_bound.push_back(num_total_bin);
upper_bound.push_back(num_total_bin);
global_timer.Stop("Dataset::InitTrain.Prep");
global_timer.Start("Dataset::InitTrain.Resize");
if (temp_state->multi_val_bin_subfeature == nullptr) {
temp_state->multi_val_bin_subfeature.reset(
temp_state->multi_val_bin->CreateLike(new_num_total_bin, num_used,
sum_used_dense_ratio));
} else {
temp_state->multi_val_bin_subfeature->ReSizeForSubFeature(
new_num_total_bin, num_used, sum_used_dense_ratio);
}
global_timer.Stop("Dataset::InitTrain.Resize");
global_timer.Start("Dataset::InitTrain.CopySubFeature");
temp_state->multi_val_bin_subfeature->CopySubFeature(
temp_state->multi_val_bin.get(), used_feature_index, lower_bound,
upper_bound, delta);
global_timer.Stop("Dataset::InitTrain.CopySubFeature");
}
void Dataset::ConstructHistogramsMultiVal(
const data_size_t* data_indices, data_size_t num_data,
const score_t* gradients, const score_t* hessians, bool is_constant_hessian,
TrainingTempState* temp_state, hist_t* hist_data) const {
void Dataset::ConstructHistogramsMultiVal(const data_size_t* data_indices,
data_size_t num_data,
const score_t* gradients,
const score_t* hessians,
TrainingShareStates* share_state,
hist_t* hist_data) const {
Common::FunctionTimer fun_time("Dataset::ConstructHistogramsMultiVal",
global_timer);
const auto multi_val_bin = temp_state->use_subfeature
? temp_state->multi_val_bin_subfeature.get()
: temp_state->multi_val_bin.get();
const auto multi_val_bin =
(share_state->is_use_subcol || share_state->is_use_subrow)
? share_state->multi_val_bin_subset.get()
: share_state->multi_val_bin.get();
if (multi_val_bin == nullptr) {
return;
}
......@@ -1191,12 +1220,12 @@ void Dataset::ConstructHistogramsMultiVal(
&n_data_block, &data_block_size);
const size_t buf_size =
static_cast<size_t>(n_data_block - 1) * num_bin_aligned * 2;
if (temp_state->hist_buf.size() < buf_size) {
temp_state->hist_buf.resize(buf_size);
if (share_state->hist_buf.size() < buf_size) {
share_state->hist_buf.resize(buf_size);
}
auto origin_hist_data = hist_data;
if (temp_state->use_subfeature) {
hist_data = temp_state->TempBuf();
if (share_state->is_use_subcol) {
hist_data = share_state->TempBuf();
}
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
......@@ -1206,12 +1235,12 @@ void Dataset::ConstructHistogramsMultiVal(
data_size_t end = std::min(start + data_block_size, num_data);
auto data_ptr = hist_data;
if (tid > 0) {
data_ptr = temp_state->hist_buf.data() +
data_ptr = share_state->hist_buf.data() +
static_cast<size_t>(num_bin_aligned) * 2 * (tid - 1);
}
std::memset(reinterpret_cast<void*>(data_ptr), 0, num_bin * kHistEntrySize);
if (data_indices != nullptr && num_data < num_data_) {
if (!is_constant_hessian) {
if (!share_state->is_constant_hessian) {
multi_val_bin->ConstructHistogram(data_indices, start, end, gradients,
hessians, data_ptr);
} else {
......@@ -1219,7 +1248,7 @@ void Dataset::ConstructHistogramsMultiVal(
data_ptr);
}
} else {
if (!is_constant_hessian) {
if (!share_state->is_constant_hessian) {
multi_val_bin->ConstructHistogram(start, end, gradients, hessians,
data_ptr);
} else {
......@@ -1236,13 +1265,13 @@ void Dataset::ConstructHistogramsMultiVal(
int bin_block_size = num_bin;
Threading::BlockInfo<data_size_t>(num_threads, num_bin, 512, &n_bin_block,
&bin_block_size);
if (!is_constant_hessian) {
if (!share_state->is_constant_hessian) {
#pragma omp parallel for schedule(static)
for (int t = 0; t < n_bin_block; ++t) {
const int start = t * bin_block_size;
const int end = std::min(start + bin_block_size, num_bin);
for (int tid = 1; tid < n_data_block; ++tid) {
auto src_ptr = temp_state->hist_buf.data() +
auto src_ptr = share_state->hist_buf.data() +
static_cast<size_t>(num_bin_aligned) * 2 * (tid - 1);
for (int i = start * 2; i < end * 2; ++i) {
hist_data[i] += src_ptr[i];
......@@ -1255,7 +1284,7 @@ void Dataset::ConstructHistogramsMultiVal(
const int start = t * bin_block_size;
const int end = std::min(start + bin_block_size, num_bin);
for (int tid = 1; tid < n_data_block; ++tid) {
auto src_ptr = temp_state->hist_buf.data() +
auto src_ptr = share_state->hist_buf.data() +
static_cast<size_t>(num_bin_aligned) * 2 * (tid - 1);
for (int i = start * 2; i < end * 2; ++i) {
hist_data[i] += src_ptr[i];
......@@ -1268,7 +1297,7 @@ void Dataset::ConstructHistogramsMultiVal(
}
global_timer.Stop("Dataset::sparse_bin_histogram_merge");
global_timer.Start("Dataset::sparse_bin_histogram_move");
temp_state->HistMove(hist_data, origin_hist_data);
share_state->HistMove(hist_data, origin_hist_data);
global_timer.Stop("Dataset::sparse_bin_histogram_move");
}
......@@ -1276,16 +1305,14 @@ void Dataset::ConstructHistograms(
const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices,
data_size_t num_data, const score_t* gradients, const score_t* hessians,
score_t* ordered_gradients, score_t* ordered_hessians,
bool is_constant_hessian, bool is_colwise, TrainingTempState* temp_state,
hist_t* hist_data) const {
TrainingShareStates* share_state, hist_t* hist_data) const {
Common::FunctionTimer fun_timer("Dataset::ConstructHistograms", global_timer);
if (num_data < 0 || hist_data == nullptr) {
return;
}
if (!is_colwise) {
if (!share_state->is_colwise) {
return ConstructHistogramsMultiVal(data_indices, num_data, gradients,
hessians, is_constant_hessian,
temp_state, hist_data);
hessians, share_state, hist_data);
}
global_timer.Start("Dataset::Get used group");
std::vector<int> used_dense_group;
......@@ -1316,7 +1343,7 @@ void Dataset::ConstructHistograms(
auto ptr_ordered_grad = gradients;
auto ptr_ordered_hess = hessians;
if (data_indices != nullptr && num_data < num_data_) {
if (!is_constant_hessian) {
if (!share_state->is_constant_hessian) {
#pragma omp parallel for schedule(static, 512) if (num_data >= 1024)
for (data_size_t i = 0; i < num_data; ++i) {
ordered_gradients[i] = gradients[data_indices[i]];
......@@ -1330,7 +1357,7 @@ void Dataset::ConstructHistograms(
}
ptr_ordered_grad = ordered_gradients;
ptr_ordered_hess = ordered_hessians;
if (!is_constant_hessian) {
if (!share_state->is_constant_hessian) {
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
for (int gi = 0; gi < num_used_dense_group; ++gi) {
......@@ -1372,7 +1399,7 @@ void Dataset::ConstructHistograms(
OMP_THROW_EX();
}
} else {
if (!is_constant_hessian) {
if (!share_state->is_constant_hessian) {
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
for (int gi = 0; gi < num_used_dense_group; ++gi) {
......@@ -1416,8 +1443,8 @@ void Dataset::ConstructHistograms(
global_timer.Stop("Dataset::dense_bin_histogram");
if (multi_val_groud_id >= 0) {
ConstructHistogramsMultiVal(
data_indices, num_data, gradients, hessians, is_constant_hessian,
temp_state, hist_data + group_bin_boundaries_[multi_val_groud_id] * 2);
data_indices, num_data, gradients, hessians, share_state,
hist_data + group_bin_boundaries_[multi_val_groud_id] * 2);
}
}
......
......@@ -267,7 +267,7 @@ class DenseBin: public Bin {
}
}
void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override {
void CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override {
auto other_bin = dynamic_cast<const DenseBin<VAL_T>*>(full_bin);
for (int i = 0; i < num_used_indices; ++i) {
data_[i] = other_bin->data_[used_indices[i]];
......
......@@ -292,7 +292,7 @@ class Dense4bitsBin : public Bin {
}
}
void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override {
void CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override {
auto other_bin = dynamic_cast<const Dense4bitsBin*>(full_bin);
const data_size_t rest = num_used_indices & 1;
for (int i = 0; i < num_used_indices - rest; i += 2) {
......
......@@ -34,6 +34,8 @@ class MultiValDenseBin : public MultiValBin {
return num_bin_;
}
double num_element_per_row() const override { return num_feature_; }
void PushOneRow(int , data_size_t idx, const std::vector<uint32_t>& values) override {
auto start = RowPtr(idx);
for (auto i = 0; i < num_feature_; ++i) {
......@@ -48,12 +50,6 @@ class MultiValDenseBin : public MultiValBin {
return false;
}
void ReSize(data_size_t num_data) override {
if (num_data_ != num_data) {
num_data_ = num_data;
}
}
#define ACC_GH(hist, i, g, h) \
const auto ti = static_cast<int>(i) << 1; \
hist[ti] += g; \
......@@ -125,24 +121,13 @@ class MultiValDenseBin : public MultiValBin {
ConstructHistogramInner<false, false, false>(nullptr, start, end, gradients, nullptr, out);
}
void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override {
auto other_bin = dynamic_cast<const MultiValDenseBin<VAL_T>*>(full_bin);
data_.resize(num_feature_ * num_used_indices);
for (data_size_t i = 0; i < num_used_indices; ++i) {
auto j_start = RowPtr(i);
auto other_j_start = other_bin->RowPtr(used_indices[i]);
for (auto j = other_j_start;
j < other_bin->RowPtr(used_indices[i] + 1); ++j) {
data_[j - other_j_start + j_start] = other_bin->data_[j];
}
}
MultiValBin* CreateLike(data_size_t num_data, int num_bin, int num_feature, double) const override {
return new MultiValDenseBin<VAL_T>(num_data, num_bin, num_feature);
}
MultiValBin* CreateLike(int num_bin, int num_feature, double) const override {
return new MultiValDenseBin<VAL_T>(num_data_, num_bin, num_feature);
}
void ReSizeForSubFeature(int num_bin, int num_feature, double) override {
void ReSize(data_size_t num_data, int num_bin, int num_feature,
double) override {
num_data_ = num_data;
num_bin_ = num_bin;
num_feature_ = num_feature;
size_t new_size = static_cast<size_t>(num_feature_) * num_data_;
......@@ -151,35 +136,73 @@ class MultiValDenseBin : public MultiValBin {
}
}
void CopySubFeature(const MultiValBin* full_bin,
const std::vector<int>& used_feature_index,
const std::vector<uint32_t>&,
const std::vector<uint32_t>&,
const std::vector<uint32_t>& delta) override {
const auto other =
template <bool SUBROW, bool SUBCOL>
void CopyInner(const MultiValBin* full_bin, const data_size_t* used_indices,
data_size_t num_used_indices,
const std::vector<int>& used_feature_index,
const std::vector<uint32_t>& delta) {
const auto other_bin =
reinterpret_cast<const MultiValDenseBin<VAL_T>*>(full_bin);
if (SUBROW) {
CHECK(num_data_ == num_used_indices);
}
int n_block = 1;
data_size_t block_size = num_data_;
Threading::BlockInfo<data_size_t>(num_data_, 1024, &n_block, &block_size);
Threading::BlockInfo<data_size_t>(num_data_, 1024, &n_block,
&block_size);
#pragma omp parallel for schedule(static, 1)
for (int tid = 0; tid < n_block; ++tid) {
data_size_t start = tid * block_size;
data_size_t end = std::min(num_data_, start + block_size);
for (data_size_t i = start; i < end; ++i) {
const auto j_start = RowPtr(i);
const auto other_j_start = other->RowPtr(i);
const auto other_j_start =
SUBROW ? other_bin->RowPtr(used_indices[i]) : other_bin->RowPtr(i);
for (int j = 0; j < num_feature_; ++j) {
if (other->data_[other_j_start + used_feature_index[j]] > 0) {
data_[j_start + j] = static_cast<VAL_T>(
other->data_[other_j_start + used_feature_index[j]] - delta[j]);
if (SUBCOL) {
if (other_bin->data_[other_j_start + used_feature_index[j]] > 0) {
data_[j_start + j] = static_cast<VAL_T>(
other_bin->data_[other_j_start + used_feature_index[j]] -
delta[j]);
} else {
data_[j_start + j] = 0;
}
} else {
data_[j_start + j] = 0;
data_[j_start + j] =
static_cast<VAL_T>(other_bin->data_[other_j_start + j]);
}
}
}
}
}
void CopySubrow(const MultiValBin* full_bin, const data_size_t* used_indices,
data_size_t num_used_indices) override {
CopyInner<true, false>(full_bin, used_indices, num_used_indices,
std::vector<int>(), std::vector<uint32_t>());
}
// Column-subset copy: keep every row, but only the features listed in
// `used_feature_index`, rebasing bin values by `delta`. The lower/upper
// bin-boundary vectors are unused in the dense layout.
void CopySubcol(const MultiValBin* full_bin,
                const std::vector<int>& used_feature_index,
                const std::vector<uint32_t>& /*lower*/,
                const std::vector<uint32_t>& /*upper*/,
                const std::vector<uint32_t>& delta) override {
  CopyInner<false, true>(full_bin, nullptr, num_data_, used_feature_index,
                         delta);
}
// Combined row- and column-subset copy: rows from `used_indices`, columns
// from `used_feature_index` (rebased by `delta`). Boundary vectors unused.
void CopySubrowAndSubcol(const MultiValBin* full_bin,
                         const data_size_t* used_indices,
                         data_size_t num_used_indices,
                         const std::vector<int>& used_feature_index,
                         const std::vector<uint32_t>& /*lower*/,
                         const std::vector<uint32_t>& /*upper*/,
                         const std::vector<uint32_t>& delta) override {
  CopyInner<true, true>(full_bin, used_indices, num_used_indices,
                        used_feature_index, delta);
}
// Rows are fixed-width in the dense layout: row `idx` begins at
// idx * num_feature_ within data_.
inline size_t RowPtr(data_size_t idx) const {
  const size_t row = static_cast<size_t>(idx);
  return row * num_feature_;
}
......
......@@ -42,6 +42,10 @@ class MultiValSparseBin : public MultiValBin {
// Total number of bins across all features packed into this multi-value bin.
int num_bin() const override { return num_bin_; }
// Estimated average number of stored (non-zero) elements per row; used to
// pre-size buffers for the sparse layout.
double num_element_per_row() const override {
  return estimate_element_per_row_;
}
void PushOneRow(int tid, data_size_t idx,
const std::vector<uint32_t>& values) override {
const int pre_alloc_size = 50;
......@@ -102,12 +106,6 @@ class MultiValSparseBin : public MultiValBin {
bool IsSparse() override { return true; }
// Change the number of rows this sparse multi-value bin represents.
// NOTE(review): only the row count is updated; row_ptr_/data_ are left as-is
// and presumably rebuilt by a subsequent Copy*/Push call — confirm callers.
void ReSize(data_size_t num_data) override {
  if (num_data_ != num_data) {
    num_data_ = num_data;
  }
}
#define ACC_GH(hist, i, g, h) \
const auto ti = static_cast<int>(i) << 1; \
hist[ti] += g; \
......@@ -189,32 +187,15 @@ class MultiValSparseBin : public MultiValBin {
nullptr, out);
}
// Legacy row-subset copy: rebuild this sparse bin from the rows of `full_bin`
// listed in `used_indices`. Single-threaded; row_ptr_ is filled as a running
// prefix sum so row i's values occupy data_[row_ptr_[i], row_ptr_[i+1]).
void CopySubset(const Bin* full_bin, const data_size_t* used_indices,
                data_size_t num_used_indices) override {
  auto other_bin = dynamic_cast<const MultiValSparseBin<INDEX_T, VAL_T>*>(full_bin);
  row_ptr_.resize(num_data_ + 1, 0);
  // Reserve with 10% headroom over the density estimate to limit reallocs.
  INDEX_T estimate_num_data =
      static_cast<INDEX_T>(estimate_element_per_row_ * 1.1 * num_data_);
  data_.clear();
  data_.reserve(estimate_num_data);
  for (data_size_t i = 0; i < num_used_indices; ++i) {
    // Append the selected row's packed values verbatim.
    for (auto j = other_bin->row_ptr_[used_indices[i]];
         j < other_bin->row_ptr_[used_indices[i] + 1]; ++j) {
      data_.push_back(other_bin->data_[j]);
    }
    // Extend the prefix sum by the copied row's element count.
    row_ptr_[i + 1] = row_ptr_[i] + other_bin->row_ptr_[used_indices[i] + 1] -
                      other_bin->row_ptr_[used_indices[i]];
  }
}
// Factory: build an empty sparse multi-value bin with the same bin count and
// density estimate but the requested number of rows (used when materializing
// a bagging subset). The feature count is implicit in the sparse layout.
// NOTE: this span previously mixed the deleted and the replacement overload
// (two return statements); reconstructed as the post-refactor definition.
MultiValBin* CreateLike(data_size_t num_data, int num_bin, int /*num_feature*/,
                        double estimate_element_per_row) const override {
  return new MultiValSparseBin<INDEX_T, VAL_T>(num_data, num_bin,
                                               estimate_element_per_row);
}
void ReSizeForSubFeature(int num_bin, int,
double estimate_element_per_row) override {
void ReSize(data_size_t num_data, int num_bin, int,
double estimate_element_per_row) override {
num_data_ = num_data;
num_bin_ = num_bin;
estimate_element_per_row_ = estimate_element_per_row;
INDEX_T estimate_num_data =
......@@ -229,14 +210,22 @@ class MultiValSparseBin : public MultiValBin {
t_data_[i].resize(avg_num_data, 0);
}
}
if (num_data_ + 1 > static_cast<data_size_t>(row_ptr_.size())) {
row_ptr_.resize(num_data_ + 1);
}
}
void CopySubFeature(const MultiValBin* full_bin, const std::vector<int>&,
const std::vector<uint32_t>& lower,
const std::vector<uint32_t>& upper,
const std::vector<uint32_t>& delta) override {
template <bool SUBROW, bool SUBCOL>
void CopyInner(const MultiValBin* full_bin, const data_size_t* used_indices,
data_size_t num_used_indices,
const std::vector<uint32_t>& lower,
const std::vector<uint32_t>& upper,
const std::vector<uint32_t>& delta) {
const auto other =
reinterpret_cast<const MultiValSparseBin<INDEX_T, VAL_T>*>(full_bin);
if (SUBROW) {
CHECK(num_data_ == num_used_indices);
}
int n_block = 1;
data_size_t block_size = num_data_;
Threading::BlockInfo<data_size_t>(static_cast<int>(t_data_.size() + 1),
......@@ -250,20 +239,26 @@ class MultiValSparseBin : public MultiValBin {
auto& buf = (tid == 0) ? data_ : t_data_[tid - 1];
INDEX_T size = 0;
for (data_size_t i = start; i < end; ++i) {
const auto j_start = other->RowPtr(i);
const auto j_end = other->RowPtr(i + 1);
const auto j_start =
SUBROW ? other->RowPtr(used_indices[i]) : other->RowPtr(i);
const auto j_end =
SUBROW ? other->RowPtr(used_indices[i] + 1) : other->RowPtr(i + 1);
if (size + (j_end - j_start) > static_cast<INDEX_T>(buf.size())) {
buf.resize(size + (j_end - j_start) * pre_alloc_size);
}
int k = 0;
const auto pre_size = size;
for (auto j = j_start; j < j_end; ++j) {
auto val = other->data_[j];
while (val >= upper[k]) {
++k;
}
if (val >= lower[k]) {
buf[size++] = static_cast<VAL_T>(val - delta[k]);
const auto val = other->data_[j];
if (SUBCOL) {
while (val >= upper[k]) {
++k;
}
if (val >= lower[k]) {
buf[size++] = static_cast<VAL_T>(val - delta[k]);
}
} else {
buf[size++] = val;
}
}
row_ptr_[i + 1] = size - pre_size;
......@@ -273,6 +268,31 @@ class MultiValSparseBin : public MultiValBin {
MergeData(sizes.data());
}
void CopySubrow(const MultiValBin* full_bin, const data_size_t* used_indices,
data_size_t num_used_indices) override {
CopyInner<true, false>(full_bin, used_indices, num_used_indices,
std::vector<uint32_t>(), std::vector<uint32_t>(),
std::vector<uint32_t>());
}
// Column-subset copy: keep every row, retaining only values that fall inside
// the per-feature [lower, upper) bin ranges, rebased by `delta`.
void CopySubcol(const MultiValBin* full_bin,
                const std::vector<int>& /*used_feature_index*/,
                const std::vector<uint32_t>& lower,
                const std::vector<uint32_t>& upper,
                const std::vector<uint32_t>& delta) override {
  CopyInner<false, true>(full_bin, nullptr, num_data_, lower, upper, delta);
}
// Combined row- and column-subset copy: rows from `used_indices`, columns
// selected via the [lower, upper) bin ranges and rebased by `delta`.
void CopySubrowAndSubcol(const MultiValBin* full_bin,
                         const data_size_t* used_indices,
                         data_size_t num_used_indices,
                         const std::vector<int>& /*used_feature_index*/,
                         const std::vector<uint32_t>& lower,
                         const std::vector<uint32_t>& upper,
                         const std::vector<uint32_t>& delta) override {
  CopyInner<true, true>(full_bin, used_indices, num_used_indices, lower,
                        upper, delta);
}
inline INDEX_T RowPtr(data_size_t idx) const { return row_ptr_[idx]; }
MultiValSparseBin<INDEX_T, VAL_T>* Clone() override;
......
......@@ -451,7 +451,7 @@ class SparseBin: public Bin {
}
}
void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override {
void CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override {
auto other_bin = dynamic_cast<const SparseBin<VAL_T>*>(full_bin);
deltas_.clear();
vals_.clear();
......
......@@ -158,7 +158,7 @@ void GPUTreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featur
indices_future_.wait();
}
// for constant hessian, hessians are not copied except for the root node
if (!is_constant_hessian_) {
if (!share_state_->is_constant_hessian) {
hessians_future_.wait();
}
gradients_future_.wait();
......@@ -581,7 +581,7 @@ void GPUTreeLearner::BuildGPUKernels() {
// compile the GPU kernel depending if double precision is used, constant hessian is used, etc.
opts << " -D POWER_FEATURE_WORKGROUPS=" << i
<< " -D USE_CONSTANT_BUF=" << use_constants << " -D USE_DP_FLOAT=" << int(config_->gpu_use_dp)
<< " -D CONST_HESSIAN=" << int(is_constant_hessian_)
<< " -D CONST_HESSIAN=" << int(share_state_->is_constant_hessian)
<< " -cl-mad-enable -cl-no-signed-zeros -cl-fast-relaxed-math";
#if GPU_DEBUG >= 1
std::cout << "Building GPU kernels with options: " << opts.str() << std::endl;
......@@ -642,7 +642,7 @@ void GPUTreeLearner::SetupKernelArguments() {
}
for (int i = 0; i <= kMaxLogWorkgroupsPerFeature; ++i) {
// The only argument that needs to be changed later is num_data_
if (is_constant_hessian_) {
if (share_state_->is_constant_hessian) {
// hessian is passed as a parameter, but it is not available now.
// hessian will be set in BeforeTrain()
histogram_kernels_[i].set_args(*device_features_, device_feature_masks_, num_data_,
......@@ -736,20 +736,12 @@ void GPUTreeLearner::InitGPU(int platform_id, int device_id) {
}
// Train one tree on the GPU. Hessian-constness changes are now handled in
// ResetIsConstantHessian (which rebuilds the GPU kernels), so Train simply
// delegates to the serial learner, whose virtual hooks call back into the
// GPU implementation.
// NOTE: this span previously interleaved the deleted 4-argument overload and
// its kernel-recompile block with the new body; reconstructed here.
Tree* GPUTreeLearner::Train(const score_t* gradients, const score_t *hessians,
                            const Json& forced_split_json) {
  return SerialTreeLearner::Train(gradients, hessians, forced_split_json);
}
void GPUTreeLearner::ResetTrainingData(const Dataset* train_data) {
SerialTreeLearner::ResetTrainingData(train_data);
void GPUTreeLearner::ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) {
SerialTreeLearner::ResetTrainingDataInner(train_data, is_constant_hessian, reset_multi_val_bin);
num_feature_groups_ = train_data_->num_feature_groups();
// GPU memory has to been reallocated because data may have been changed
AllocateGPUMemory();
......@@ -757,6 +749,14 @@ void GPUTreeLearner::ResetTrainingData(const Dataset* train_data) {
SetupKernelArguments();
}
// Update the constant-hessian flag. The GPU kernels are compiled with
// CONST_HESSIAN baked in, so a flip requires rebuilding them; an unchanged
// flag is a no-op.
void GPUTreeLearner::ResetIsConstantHessian(bool is_constant_hessian) {
  if (is_constant_hessian == share_state_->is_constant_hessian) {
    return;  // nothing changed; keep the existing kernels
  }
  SerialTreeLearner::ResetIsConstantHessian(is_constant_hessian);
  BuildGPUKernels();
  SetupKernelArguments();
}
void GPUTreeLearner::BeforeTrain() {
#if GPU_DEBUG >= 2
printf("Copying intial full gradients and hessians to device\n");
......@@ -764,7 +764,7 @@ void GPUTreeLearner::BeforeTrain() {
// Copy initial full hessians and gradients to GPU.
// We start copying as early as possible, instead of at ConstructHistogram().
if (!use_bagging_ && num_dense_feature_groups_) {
if (!is_constant_hessian_) {
if (!share_state_->is_constant_hessian) {
hessians_future_ = queue_.enqueue_write_buffer_async(device_hessians_, 0, num_data_ * sizeof(score_t), hessians_);
} else {
// setup hessian parameters only
......@@ -792,7 +792,7 @@ void GPUTreeLearner::BeforeTrain() {
#endif
// transfer the indices to GPU
indices_future_ = boost::compute::copy_async(indices, indices + cnt, device_data_indices_->begin(), queue_);
if (!is_constant_hessian_) {
if (!share_state_->is_constant_hessian) {
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < cnt; ++i) {
ordered_hessians_[i] = hessians_[indices[i]];
......@@ -846,7 +846,7 @@ bool GPUTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int ri
#endif
indices_future_ = boost::compute::copy_async(indices + begin, indices + end, device_data_indices_->begin(), queue_);
if (!is_constant_hessian_) {
if (!share_state_->is_constant_hessian) {
#pragma omp parallel for schedule(static)
for (data_size_t i = begin; i < end; ++i) {
ordered_hessians_[i - begin] = hessians_[indices[i]];
......@@ -899,7 +899,7 @@ bool GPUTreeLearner::ConstructGPUHistogramsAsync(
}
}
// generate and copy ordered_hessians if hessians is not null
if (hessians != nullptr && !is_constant_hessian_) {
if (hessians != nullptr && !share_state_->is_constant_hessian) {
if (num_data != num_data_) {
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data; ++i) {
......@@ -976,8 +976,8 @@ void GPUTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_u
train_data_->ConstructHistograms(is_sparse_feature_used,
smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(),
gradients_, hessians_,
ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_,
is_hist_colwise_, temp_state_.get(),
ordered_gradients_.data(), ordered_hessians_.data(),
share_state_.get(),
ptr_smaller_leaf_hist_data);
// wait for GPU to finish, only if GPU is actually used
if (is_gpu_used) {
......@@ -1041,8 +1041,8 @@ void GPUTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_u
train_data_->ConstructHistograms(is_sparse_feature_used,
larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(),
gradients_, hessians_,
ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_,
is_hist_colwise_, temp_state_.get(),
ordered_gradients_.data(), ordered_hessians_.data(),
share_state_.get(),
ptr_larger_leaf_hist_data);
// wait for GPU to finish, only if GPU is actually used
if (is_gpu_used) {
......
......@@ -46,15 +46,16 @@ class GPUTreeLearner: public SerialTreeLearner {
explicit GPUTreeLearner(const Config* tree_config);
~GPUTreeLearner();
void Init(const Dataset* train_data, bool is_constant_hessian) override;
void ResetTrainingData(const Dataset* train_data) override;
void ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) override;
void ResetIsConstantHessian(bool is_constant_hessian);
Tree* Train(const score_t* gradients, const score_t *hessians,
bool is_constant_hessian, const Json& forced_split_json) override;
const Json& forced_split_json) override;
void SetBaggingData(const data_size_t* used_indices, data_size_t num_data) override {
SerialTreeLearner::SetBaggingData(used_indices, num_data);
// determine if we are using bagging before we construct the data partition
// thus we can start data movement to GPU earlier
if (used_indices != nullptr) {
void SetBaggingData(const Dataset* subset, const data_size_t* used_indices, data_size_t num_data) override {
SerialTreeLearner::SetBaggingData(subset, used_indices, num_data);
if (subset == nullptr && used_indices != nullptr) {
// determine if we are using bagging before we construct the data partition
// thus we can start data movement to GPU earlier
if (num_data != num_data_) {
use_bagging_ = true;
return;
......
......@@ -30,7 +30,6 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian
train_data_ = train_data;
num_data_ = train_data_->num_data();
num_features_ = train_data_->num_features();
is_constant_hessian_ = is_constant_hessian;
int max_cache_size = 0;
// Get the max size of pool
if (config_->histogram_pool_size <= 0) {
......@@ -62,9 +61,8 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian
ordered_gradients_.resize(num_data_);
ordered_hessians_.resize(num_data_);
GetMultiValBin(train_data_, true);
histogram_pool_.DynamicChangeSize(train_data_, is_hist_colwise_, config_, max_cache_size, config_->num_leaves);
GetShareStates(train_data_, is_constant_hessian, true);
histogram_pool_.DynamicChangeSize(train_data_, share_state_->is_colwise, config_, max_cache_size, config_->num_leaves);
Log::Info("Number of data points in the train set: %d, number of used features: %d", num_data_, num_features_);
if (CostEfficientGradientBoosting::IsEnable(config_)) {
cegb_.reset(new CostEfficientGradientBoosting(this));
......@@ -72,22 +70,28 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian
}
}
void SerialTreeLearner::GetMultiValBin(const Dataset* dataset, bool is_first_time) {
void SerialTreeLearner::GetShareStates(const Dataset* dataset,
bool is_constant_hessian,
bool is_first_time) {
if (is_first_time) {
auto used_feature = GetUsedFeatures(true);
temp_state_.reset(dataset->TestMultiThreadingMethod(
ordered_gradients_.data(), ordered_hessians_.data(), used_feature,
is_constant_hessian_, config_->force_col_wise, config_->force_row_wise, &is_hist_colwise_));
share_state_.reset(dataset->GetShareStates(
ordered_gradients_.data(), ordered_hessians_.data(), used_feature,
is_constant_hessian, config_->force_col_wise, config_->force_row_wise));
} else {
CHECK(share_state_ != nullptr);
// cannot change is_hist_col_wise during training
temp_state_.reset(dataset->TestMultiThreadingMethod(
ordered_gradients_.data(), ordered_hessians_.data(), is_feature_used_,
is_constant_hessian_, is_hist_colwise_, !is_hist_colwise_, &is_hist_colwise_));
share_state_.reset(dataset->GetShareStates(
ordered_gradients_.data(), ordered_hessians_.data(), is_feature_used_,
is_constant_hessian, share_state_->is_colwise,
!share_state_->is_colwise));
}
CHECK(share_state_ != nullptr);
}
// Todo: optimized bagging for multi-val bin
void SerialTreeLearner::ResetTrainingData(const Dataset* train_data) {
void SerialTreeLearner::ResetTrainingDataInner(const Dataset* train_data,
bool is_constant_hessian,
bool reset_multi_val_bin) {
train_data_ = train_data;
num_data_ = train_data_->num_data();
CHECK_EQ(num_features_, train_data_->num_features());
......@@ -99,7 +103,9 @@ void SerialTreeLearner::ResetTrainingData(const Dataset* train_data) {
// initialize data partition
data_partition_->ResetNumData(num_data_);
GetMultiValBin(train_data_, false);
if (reset_multi_val_bin) {
GetShareStates(train_data_, is_constant_hessian, false);
}
// initialize ordered gradients and hessians
ordered_gradients_.resize(num_data_);
......@@ -127,7 +133,7 @@ void SerialTreeLearner::ResetConfig(const Config* config) {
// at least need 2 leaves
max_cache_size = std::max(2, max_cache_size);
max_cache_size = std::min(max_cache_size, config_->num_leaves);
histogram_pool_.DynamicChangeSize(train_data_, is_hist_colwise_, config_, max_cache_size, config_->num_leaves);
histogram_pool_.DynamicChangeSize(train_data_, share_state_->is_colwise, config_, max_cache_size, config_->num_leaves);
// push split information for all leaves
best_split_per_leaf_.resize(config_->num_leaves);
......@@ -142,11 +148,10 @@ void SerialTreeLearner::ResetConfig(const Config* config) {
}
}
Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians, bool is_constant_hessian, const Json& forced_split_json) {
Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians, const Json& forced_split_json) {
Common::FunctionTimer fun_timer("SerialTreeLearner::Train", global_timer);
gradients_ = gradients;
hessians_ = hessians;
is_constant_hessian_ = is_constant_hessian;
// some initial works before training
BeforeTrain();
......@@ -286,7 +291,7 @@ void SerialTreeLearner::BeforeTrain() {
is_feature_used_[i] = 1;
}
}
train_data_->InitTrain(is_feature_used_, is_hist_colwise_, temp_state_.get());
train_data_->InitTrain(is_feature_used_, share_state_.get());
// initialize data partition
data_partition_->Init();
......@@ -369,24 +374,27 @@ void SerialTreeLearner::FindBestSplits() {
FindBestSplitsFromHistograms(is_feature_used, use_subtract);
}
void SerialTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract) {
Common::FunctionTimer fun_timer("SerialTreeLearner::ConstructHistograms", global_timer);
void SerialTreeLearner::ConstructHistograms(
const std::vector<int8_t>& is_feature_used, bool use_subtract) {
Common::FunctionTimer fun_timer("SerialTreeLearner::ConstructHistograms",
global_timer);
// construct smaller leaf
hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - kHistOffset;
hist_t* ptr_smaller_leaf_hist_data =
smaller_leaf_histogram_array_[0].RawData() - kHistOffset;
train_data_->ConstructHistograms(
is_feature_used, smaller_leaf_splits_->data_indices(),
smaller_leaf_splits_->num_data_in_leaf(), gradients_, hessians_,
ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_,
is_hist_colwise_, temp_state_.get(), ptr_smaller_leaf_hist_data);
ordered_gradients_.data(), ordered_hessians_.data(), share_state_.get(),
ptr_smaller_leaf_hist_data);
if (larger_leaf_histogram_array_ != nullptr && !use_subtract) {
// construct larger leaf
hist_t* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - kHistOffset;
hist_t* ptr_larger_leaf_hist_data =
larger_leaf_histogram_array_[0].RawData() - kHistOffset;
train_data_->ConstructHistograms(
is_feature_used, larger_leaf_splits_->data_indices(),
larger_leaf_splits_->num_data_in_leaf(), gradients_, hessians_,
ordered_gradients_.data(), ordered_hessians_.data(),
is_constant_hessian_, is_hist_colwise_, temp_state_.get(),
ordered_gradients_.data(), ordered_hessians_.data(), share_state_.get(),
ptr_larger_leaf_hist_data);
}
}
......
......@@ -48,11 +48,22 @@ class SerialTreeLearner: public TreeLearner {
void Init(const Dataset* train_data, bool is_constant_hessian) override;
void ResetTrainingData(const Dataset* train_data) override;
// Reset the training data and (via the `true` flag) rebuild the shared
// multi-val-bin state for the new dataset.
void ResetTrainingData(const Dataset* train_data,
                       bool is_constant_hessian) override {
  ResetTrainingDataInner(train_data, is_constant_hessian, true);
}
// Record whether hessians are constant this iteration in the shared training
// state. The GPU learner overrides this to also rebuild its kernels.
void ResetIsConstantHessian(bool is_constant_hessian) override {
  share_state_->is_constant_hessian = is_constant_hessian;
}
virtual void ResetTrainingDataInner(const Dataset* train_data,
bool is_constant_hessian,
bool reset_multi_val_bin);
void ResetConfig(const Config* config) override;
Tree* Train(const score_t* gradients, const score_t *hessians, bool is_constant_hessian,
Tree* Train(const score_t* gradients, const score_t *hessians,
const Json& forced_split_json) override;
Tree* FitByExistingTree(const Tree* old_tree, const score_t* gradients, const score_t* hessians) const override;
......@@ -60,8 +71,17 @@ class SerialTreeLearner: public TreeLearner {
Tree* FitByExistingTree(const Tree* old_tree, const std::vector<int>& leaf_pred,
const score_t* gradients, const score_t* hessians) override;
void SetBaggingData(const data_size_t* used_indices, data_size_t num_data) override {
data_partition_->SetUsedDataIndices(used_indices, num_data);
void SetBaggingData(const Dataset* subset, const data_size_t* used_indices, data_size_t num_data) override {
if (subset == nullptr) {
data_partition_->SetUsedDataIndices(used_indices, num_data);
share_state_->is_use_subrow = false;
} else {
ResetTrainingDataInner(subset, share_state_->is_constant_hessian, false);
share_state_->is_use_subrow = true;
share_state_->is_subrow_copied = false;
share_state_->bagging_use_indices = used_indices;
share_state_->bagging_indices_cnt = num_data;
}
}
void AddPredictionToScore(const Tree* tree,
......@@ -84,8 +104,6 @@ class SerialTreeLearner: public TreeLearner {
void RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj, std::function<double(const label_t*, int)> residual_getter,
data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const override;
bool IsHistColWise() const override { return is_hist_colwise_; }
protected:
void ComputeBestSplitForFeature(FeatureHistogram* histogram_array_,
int feature_index, int real_fidx,
......@@ -93,7 +111,7 @@ class SerialTreeLearner: public TreeLearner {
const LeafSplits* leaf_splits,
SplitInfo* best_split);
void GetMultiValBin(const Dataset* dataset, bool is_first_time);
void GetShareStates(const Dataset* dataset, bool is_constant_hessian, bool is_first_time);
virtual std::vector<int8_t> GetUsedFeatures(bool is_tree_level);
/*!
......@@ -182,17 +200,11 @@ class SerialTreeLearner: public TreeLearner {
/*! \brief hessians of current iteration, ordered for cache optimized */
std::vector<score_t, Common::AlignmentAllocator<score_t, kAlignedSize>> ordered_hessians_;
#endif
/*! \brief is_data_in_leaf_[i] != 0 means i-th data is marked */
std::vector<char, Common::AlignmentAllocator<char, kAlignedSize>> is_data_in_leaf_;
/*! \brief used to cache historical histogram to speed up*/
HistogramPool histogram_pool_;
/*! \brief config of tree learner*/
const Config* config_;
std::vector<int> ordered_bin_indices_;
bool is_constant_hessian_;
std::unique_ptr<TrainingTempState> temp_state_;
bool is_hist_colwise_;
std::unique_ptr<TrainingShareStates> share_state_;
std::unique_ptr<CostEfficientGradientBoosting> cegb_;
};
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment