Commit ecc8b8cd authored by Guolin Ke's avatar Guolin Ke
Browse files

Slightly reduce the communication cost in the parallel tree learners.

parent 6c4a9750
......@@ -233,14 +233,7 @@ void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(const
}
// sync global best info
std::memcpy(input_buffer_.data(), &smaller_best_split, sizeof(SplitInfo));
std::memcpy(input_buffer_.data() + sizeof(SplitInfo), &larger_best_split, sizeof(SplitInfo));
Network::Allreduce(input_buffer_.data(), sizeof(SplitInfo) * 2, sizeof(SplitInfo),
output_buffer_.data(), &SplitInfo::MaxReducer);
std::memcpy(&smaller_best_split, output_buffer_.data(), sizeof(SplitInfo));
std::memcpy(&larger_best_split, output_buffer_.data() + sizeof(SplitInfo), sizeof(SplitInfo));
SyncUpGlobalBestSplit(input_buffer_.data(), input_buffer_.data(), &smaller_best_split, &larger_best_split);
// set best split
this->best_split_per_leaf_[this->smaller_leaf_splits_->LeafIndex()] = smaller_best_split;
......
......@@ -60,14 +60,7 @@ void FeatureParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(con
larger_best_split = this->best_split_per_leaf_[this->larger_leaf_splits_->LeafIndex()];
}
// sync global best info
std::memcpy(input_buffer_.data(), &smaller_best_split, sizeof(SplitInfo));
std::memcpy(input_buffer_.data() + sizeof(SplitInfo), &larger_best_split, sizeof(SplitInfo));
Network::Allreduce(input_buffer_.data(), sizeof(SplitInfo) * 2, sizeof(SplitInfo),
output_buffer_.data(), &SplitInfo::MaxReducer);
// copy back
std::memcpy(&smaller_best_split, output_buffer_.data(), sizeof(SplitInfo));
std::memcpy(&larger_best_split, output_buffer_.data() + sizeof(SplitInfo), sizeof(SplitInfo));
SyncUpGlobalBestSplit(input_buffer_.data(), input_buffer_.data(), &smaller_best_split, &larger_best_split);
// update best split
this->best_split_per_leaf_[this->smaller_leaf_splits_->LeafIndex()] = smaller_best_split;
if (this->larger_leaf_splits_->LeafIndex() >= 0) {
......
......@@ -125,7 +125,7 @@ protected:
* \param splits All splits from local voting
* \param out Result of global voting; only stores feature indices
*/
void GlobalVoting(int leaf_idx, const std::vector<SplitInfo>& splits,
void GlobalVoting(int leaf_idx, const std::vector<LightSplitInfo>& splits,
std::vector<int>* out);
/*!
* \brief Copy local histogram to buffer
......@@ -180,6 +180,32 @@ private:
std::vector<FeatureMetainfo> feature_metas_;
};
// TODO: reduce the communication cost further by using a bitset for communication.
inline void SyncUpGlobalBestSplit(char* input_buffer_, char* output_buffer_, SplitInfo* smaller_best_split, SplitInfo* larger_best_split) {
// sync global best info
int size = SplitInfo::Size();
smaller_best_split->CopyTo(input_buffer_);
larger_best_split->CopyTo(input_buffer_ + size);
Network::Allreduce(input_buffer_, size * 2, size, output_buffer_,
[&size] (const char* src, char* dst, int len) {
int used_size = 0;
LightSplitInfo p1, p2;
while (used_size < len) {
p1.CopyFrom(src);
p2.CopyFrom(dst);
if (p1 > p2) {
std::memcpy(dst, src, size);
}
src += size;
dst += size;
used_size += size;
}
});
// copy back
smaller_best_split->CopyFrom(output_buffer_);
larger_best_split->CopyFrom(output_buffer_ + size);
}
} // namespace LightGBM
#endif // LightGBM_TREELEARNER_PARALLEL_TREE_LEARNER_H_
......@@ -17,34 +17,86 @@ namespace LightGBM {
struct SplitInfo {
public:
/*! \brief Feature index */
int feature;
int feature = -1;
/*! \brief Split threshold */
uint32_t threshold;
/*! \brief True if default split is left */
bool default_left;
uint32_t threshold = 0;
/*! \brief Left number of data after split */
data_size_t left_count = 0;
/*! \brief Right number of data after split */
data_size_t right_count = 0;
/*! \brief Left output after split */
double left_output;
double left_output = 0.0;
/*! \brief Right output after split */
double right_output;
double right_output = 0.0;
/*! \brief Split gain */
double gain;
/*! \brief Left number of data after split */
data_size_t left_count;
/*! \brief Right number of data after split */
data_size_t right_count;
double gain = kMinScore;
/*! \brief Left sum gradient after split */
double left_sum_gradient;
double left_sum_gradient = 0;
/*! \brief Left sum hessian after split */
double left_sum_hessian;
double left_sum_hessian = 0;
/*! \brief Right sum gradient after split */
double right_sum_gradient;
double right_sum_gradient = 0;
/*! \brief Right sum hessian after split */
double right_sum_hessian;
double right_sum_hessian = 0;
/*! \brief True if default split is left */
bool default_left = true;
SplitInfo() {
// initialize with -1 and -inf gain
feature = -1;
gain = kMinScore;
/*!
* \brief Number of bytes one SplitInfo occupies when serialized by
*        CopyTo / CopyFrom. Summed per-field, NOT sizeof(SplitInfo),
*        because the wire format is packed without struct padding:
*        feature(int) + threshold(uint32_t) + default_left(bool)
*        + 7 doubles + 2 counts(data_size_t).
*/
inline static int Size() {
  return sizeof(int) + sizeof(uint32_t) + sizeof(bool)
      + 7 * sizeof(double) + 2 * sizeof(data_size_t);
}
/*!
* \brief Serialize this split into `buffer`, field by field (packed, no padding).
*        The first four fields — feature, left_count, right_count, gain — form
*        the prefix that LightSplitInfo::CopyFrom(const char*) reads, so this
*        field order is part of the network wire format: do NOT reorder.
* \param buffer destination; must hold at least SplitInfo::Size() bytes
*/
inline void CopyTo(char* buffer) const {
// light prefix (also read by LightSplitInfo::CopyFrom)
std::memcpy(buffer, &feature, sizeof(feature));
buffer += sizeof(feature);
std::memcpy(buffer, &left_count, sizeof(left_count));
buffer += sizeof(left_count);
std::memcpy(buffer, &right_count, sizeof(right_count));
buffer += sizeof(right_count);
std::memcpy(buffer, &gain, sizeof(gain));
buffer += sizeof(gain);
// full payload, only read back by SplitInfo::CopyFrom
std::memcpy(buffer, &threshold, sizeof(threshold));
buffer += sizeof(threshold);
std::memcpy(buffer, &left_output, sizeof(left_output));
buffer += sizeof(left_output);
std::memcpy(buffer, &right_output, sizeof(right_output));
buffer += sizeof(right_output);
std::memcpy(buffer, &left_sum_gradient, sizeof(left_sum_gradient));
buffer += sizeof(left_sum_gradient);
std::memcpy(buffer, &left_sum_hessian, sizeof(left_sum_hessian));
buffer += sizeof(left_sum_hessian);
std::memcpy(buffer, &right_sum_gradient, sizeof(right_sum_gradient));
buffer += sizeof(right_sum_gradient);
std::memcpy(buffer, &right_sum_hessian, sizeof(right_sum_hessian));
buffer += sizeof(right_sum_hessian);
std::memcpy(buffer, &default_left, sizeof(default_left));
buffer += sizeof(default_left);
}
/*!
* \brief Deserialize a split from `buffer`; the exact inverse of CopyTo.
*        Field order must stay byte-for-byte identical to CopyTo, since the
*        buffer may have been produced on another machine.
* \param buffer source; must hold at least SplitInfo::Size() bytes
*/
void CopyFrom(const char* buffer) {
// light prefix (shared with LightSplitInfo::CopyFrom)
std::memcpy(&feature, buffer, sizeof(feature));
buffer += sizeof(feature);
std::memcpy(&left_count, buffer, sizeof(left_count));
buffer += sizeof(left_count);
std::memcpy(&right_count, buffer, sizeof(right_count));
buffer += sizeof(right_count);
std::memcpy(&gain, buffer, sizeof(gain));
buffer += sizeof(gain);
// full payload
std::memcpy(&threshold, buffer, sizeof(threshold));
buffer += sizeof(threshold);
std::memcpy(&left_output, buffer, sizeof(left_output));
buffer += sizeof(left_output);
std::memcpy(&right_output, buffer, sizeof(right_output));
buffer += sizeof(right_output);
std::memcpy(&left_sum_gradient, buffer, sizeof(left_sum_gradient));
buffer += sizeof(left_sum_gradient);
std::memcpy(&left_sum_hessian, buffer, sizeof(left_sum_hessian));
buffer += sizeof(left_sum_hessian);
std::memcpy(&right_sum_gradient, buffer, sizeof(right_sum_gradient));
buffer += sizeof(right_sum_gradient);
std::memcpy(&right_sum_hessian, buffer, sizeof(right_sum_hessian));
buffer += sizeof(right_sum_hessian);
std::memcpy(&default_left, buffer, sizeof(default_left));
buffer += sizeof(default_left);
}
inline void Reset() {
......@@ -53,32 +105,102 @@ public:
gain = kMinScore;
}
inline bool operator > (const SplitInfo &si) const;
inline bool operator == (const SplitInfo &si) const;
/*!
* \brief Rank two splits: higher gain wins; on equal gain the smaller
*        feature index wins (so results are deterministic across machines).
*        NaN gains are treated as -inf and an unset feature (-1) as the
*        largest index, so invalid splits always lose.
*/
inline bool operator > (const SplitInfo& si) const {
double local_gain = this->gain;
double other_gain = si.gain;
// replace nan with -inf.
// BUGFIX: the former `gain == NAN` test was always false — NaN compares
// unequal to everything, including NAN. `x != x` is true only for NaN.
if (local_gain != local_gain) {
local_gain = kMinScore;
}
// replace nan with -inf
if (other_gain != other_gain) {
other_gain = kMinScore;
}
int local_feature = this->feature;
int other_feature = si.feature;
// replace -1 with max int, so an unset feature loses gain ties
if (local_feature == -1) {
local_feature = INT32_MAX;
}
// replace -1 with max int
if (other_feature == -1) {
other_feature = INT32_MAX;
}
if (local_gain != other_gain) {
return local_gain > other_gain;
} else {
// if same gain, use smaller feature
return local_feature < other_feature;
}
}
inline static void MaxReducer(const char* src, char* dst, int len) {
const int type_size = sizeof(SplitInfo);
int used_size = 0;
const SplitInfo* p1;
SplitInfo* p2;
while (used_size < len) {
p1 = reinterpret_cast<const SplitInfo*>(src);
p2 = reinterpret_cast<SplitInfo*>(dst);
if (*p1 > *p2) {
// copy
std::memcpy(dst, src, type_size);
inline bool operator == (const SplitInfo& si) const {
double local_gain = this->gain;
double other_gain = si.gain;
// replace nan with -inf
if (local_gain == NAN) {
local_gain = kMinScore;
}
src += type_size;
dst += type_size;
used_size += type_size;
// replace nan with -inf
if (other_gain == NAN) {
other_gain = kMinScore;
}
int local_feature = this->feature;
int other_feature = si.feature;
// replace -1 with max int
if (local_feature == -1) {
local_feature = INT32_MAX;
}
// replace -1 with max int
if (other_feature == -1) {
other_feature = INT32_MAX;
}
if (local_gain != other_gain) {
return local_gain == other_gain;
} else {
// if same gain, use smaller feature
return local_feature == other_feature;
}
}
};
struct LightSplitInfo {
public:
/*! \brief Feature index */
int feature = -1;
/*! \brief Split gain */
double gain = kMinScore;
/*! \brief Left number of data after split */
data_size_t left_count = 0;
/*! \brief Right number of data after split */
data_size_t right_count = 0;
/*!
* \brief Mark this split as unset: feature -1 and -inf gain.
*        Note: left_count/right_count are not cleared here — callers detect
*        an unset split via `gain == kMinScore || feature == -1`.
*/
inline void Reset() {
// initialize with -1 and -inf gain
feature = -1;
gain = kMinScore;
}
/*!
* \brief Take just the voting-relevant fields (gain, feature, data counts)
*        from a full SplitInfo; everything else is dropped to keep the
*        network payload small.
* \param other full split to summarize
*/
void CopyFrom(const SplitInfo& other) {
  gain = other.gain;
  feature = other.feature;
  right_count = other.right_count;
  left_count = other.left_count;
}
/*!
* \brief Deserialize the light prefix written by SplitInfo::CopyTo:
*        feature, left_count, right_count, gain — in that exact byte order.
*        Must stay in sync with SplitInfo's serialization layout.
* \param buffer source; reads the first four serialized fields
*/
void CopyFrom(const char* buffer) {
std::memcpy(&feature, buffer, sizeof(feature));
buffer += sizeof(feature);
std::memcpy(&left_count, buffer, sizeof(left_count));
buffer += sizeof(left_count);
std::memcpy(&right_count, buffer, sizeof(right_count));
buffer += sizeof(right_count);
std::memcpy(&gain, buffer, sizeof(gain));
buffer += sizeof(gain);
}
inline bool SplitInfo::operator > (const SplitInfo& si) const {
inline bool operator > (const LightSplitInfo& si) const {
double local_gain = this->gain;
double other_gain = si.gain;
// replace nan with -inf
......@@ -105,9 +227,9 @@ inline bool SplitInfo::operator > (const SplitInfo& si) const {
// if same gain, use smaller feature
return local_feature < other_feature;
}
}
}
inline bool SplitInfo::operator == (const SplitInfo& si) const {
inline bool operator == (const LightSplitInfo& si) const {
double local_gain = this->gain;
double other_gain = si.gain;
// replace nan with -inf
......@@ -134,7 +256,9 @@ inline bool SplitInfo::operator == (const SplitInfo& si) const {
// if same gain, use smaller feature
return local_feature == other_feature;
}
}
}
};
} // namespace LightGBM
#endif // LightGBM_TREELEARNER_SPLIT_INFO_HPP_
......@@ -33,7 +33,7 @@ void VotingParallelTreeLearner<TREELEARNER_T>::Init(const Dataset* train_data, b
}
}
// calculate buffer size
size_t buffer_size = 2 * top_k_ * std::max(max_bin * sizeof(HistogramBinEntry), sizeof(SplitInfo) * num_machines_);
size_t buffer_size = 2 * top_k_ * std::max(max_bin * sizeof(HistogramBinEntry), sizeof(LightSplitInfo) * num_machines_);
// left and right on same time, so need double size
input_buffer_.resize(buffer_size);
output_buffer_.resize(buffer_size);
......@@ -162,14 +162,14 @@ bool VotingParallelTreeLearner<TREELEARNER_T>::BeforeFindBestSplit(const Tree* t
}
template <typename TREELEARNER_T>
void VotingParallelTreeLearner<TREELEARNER_T>::GlobalVoting(int leaf_idx, const std::vector<SplitInfo>& splits, std::vector<int>* out) {
void VotingParallelTreeLearner<TREELEARNER_T>::GlobalVoting(int leaf_idx, const std::vector<LightSplitInfo>& splits, std::vector<int>* out) {
out->clear();
if (leaf_idx < 0) {
return;
}
// get mean number on machines
score_t mean_num_data = GetGlobalDataCountInLeaf(leaf_idx) / static_cast<score_t>(num_machines_);
std::vector<SplitInfo> feature_best_split(this->num_features_, SplitInfo());
std::vector<LightSplitInfo> feature_best_split(this->num_features_, LightSplitInfo());
for (auto & split : splits) {
int fid = split.feature;
if (fid < 0) {
......@@ -183,8 +183,8 @@ void VotingParallelTreeLearner<TREELEARNER_T>::GlobalVoting(int leaf_idx, const
}
}
// get top k
std::vector<SplitInfo> top_k_splits;
ArrayArgs<SplitInfo>::MaxK(feature_best_split, top_k_, &top_k_splits);
std::vector<LightSplitInfo> top_k_splits;
ArrayArgs<LightSplitInfo>::MaxK(feature_best_split, top_k_, &top_k_splits);
for (auto& split : top_k_splits) {
if (split.gain == kMinScore || split.feature == -1) {
continue;
......@@ -318,27 +318,35 @@ void VotingParallelTreeLearner<TREELEARNER_T>::FindBestSplits() {
// local voting
ArrayArgs<SplitInfo>::MaxK(smaller_bestsplit_per_features, top_k_, &smaller_top_k_splits);
ArrayArgs<SplitInfo>::MaxK(larger_bestsplit_per_features, top_k_, &larger_top_k_splits);
std::vector<LightSplitInfo> smaller_top_k_light_splits(top_k_);
std::vector<LightSplitInfo> larger_top_k_light_splits(top_k_);
for (int i = 0; i < top_k_; ++i) {
smaller_top_k_light_splits[i].CopyFrom(smaller_top_k_splits[i]);
larger_top_k_light_splits[i].CopyFrom(larger_top_k_splits[i]);
}
// gather
int offset = 0;
for (int i = 0; i < top_k_; ++i) {
std::memcpy(input_buffer_.data() + offset, &smaller_top_k_splits[i], sizeof(SplitInfo));
offset += sizeof(SplitInfo);
std::memcpy(input_buffer_.data() + offset, &larger_top_k_splits[i], sizeof(SplitInfo));
offset += sizeof(SplitInfo);
std::memcpy(input_buffer_.data() + offset, &smaller_top_k_light_splits[i], sizeof(LightSplitInfo));
offset += sizeof(LightSplitInfo);
std::memcpy(input_buffer_.data() + offset, &larger_top_k_light_splits[i], sizeof(LightSplitInfo));
offset += sizeof(LightSplitInfo);
}
Network::Allgather(input_buffer_.data(), offset, output_buffer_.data());
// get all top-k from all machines
std::vector<SplitInfo> smaller_top_k_splits_global;
std::vector<SplitInfo> larger_top_k_splits_global;
std::vector<LightSplitInfo> smaller_top_k_splits_global;
std::vector<LightSplitInfo> larger_top_k_splits_global;
offset = 0;
for (int i = 0; i < num_machines_; ++i) {
for (int j = 0; j < top_k_; ++j) {
smaller_top_k_splits_global.push_back(SplitInfo());
std::memcpy(&smaller_top_k_splits_global.back(), output_buffer_.data() + offset, sizeof(SplitInfo));
offset += sizeof(SplitInfo);
larger_top_k_splits_global.push_back(SplitInfo());
std::memcpy(&larger_top_k_splits_global.back(), output_buffer_.data() + offset, sizeof(SplitInfo));
offset += sizeof(SplitInfo);
smaller_top_k_splits_global.push_back(LightSplitInfo());
std::memcpy(&smaller_top_k_splits_global.back(), output_buffer_.data() + offset, sizeof(LightSplitInfo));
offset += sizeof(LightSplitInfo);
larger_top_k_splits_global.push_back(LightSplitInfo());
std::memcpy(&larger_top_k_splits_global.back(), output_buffer_.data() + offset, sizeof(LightSplitInfo));
offset += sizeof(LightSplitInfo);
}
}
// global voting
......@@ -434,13 +442,7 @@ void VotingParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(cons
larger_best_split = this->best_split_per_leaf_[this->larger_leaf_splits_->LeafIndex()];
}
// sync global best info
std::memcpy(input_buffer_.data(), &smaller_best_split, sizeof(SplitInfo));
std::memcpy(input_buffer_.data() + sizeof(SplitInfo), &larger_best_split, sizeof(SplitInfo));
Network::Allreduce(input_buffer_.data(), sizeof(SplitInfo) * 2, sizeof(SplitInfo), output_buffer_.data(), &SplitInfo::MaxReducer);
std::memcpy(&smaller_best_split, output_buffer_.data(), sizeof(SplitInfo));
std::memcpy(&larger_best_split, output_buffer_.data() + sizeof(SplitInfo), sizeof(SplitInfo));
SyncUpGlobalBestSplit(input_buffer_.data(), input_buffer_.data(), &smaller_best_split, &larger_best_split);
// copy back
this->best_split_per_leaf_[smaller_leaf_splits_global_->LeafIndex()] = smaller_best_split;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment