Commit dce329e5 authored by Hui Xue's avatar Hui Xue
Browse files

Merge remote-tracking branch 'upstream/master'

# Conflicts:
#	src/io/dataset.cpp
#	src/io/ordered_sparse_bin.hpp
#	src/treelearner/leaf_splits.hpp
#	src/treelearner/serial_tree_learner.cpp
parents 0b9fe27a a6a75fe9
......@@ -31,7 +31,7 @@ public:
optimize_pos_at_ = config.max_position;
sigmoid_table_ = nullptr;
if (sigmoid_ <= 0.0) {
Log::Stderr("sigmoid param %f should greater than zero", sigmoid_);
Log::Fatal("sigmoid param %f should greater than zero", sigmoid_);
}
}
~LambdarankNDCG() {
......@@ -47,7 +47,7 @@ public:
// get boundaries
query_boundaries_ = metadata.query_boundaries();
if (query_boundaries_ == nullptr) {
Log::Stderr("For NDCG metric, should have query information");
Log::Fatal("For NDCG metric, should have query information");
}
num_queries_ = metadata.num_queries();
// cache inverse max DCG, avoid computation many times
......
......@@ -143,8 +143,8 @@ void DataParallelTreeLearner::FindBestThresholds() {
smaller_leaf_splits_->num_data_in_leaf(),
smaller_leaf_splits_->sum_gradients(),
smaller_leaf_splits_->sum_hessians(),
ptr_to_ordered_gradients_,
ptr_to_ordered_hessians_);
ptr_to_ordered_gradients_smaller_leaf_,
ptr_to_ordered_hessians_smaller_leaf_);
} else {
smaller_leaf_histogram_array_[feature_index].Construct(ordered_bins_[feature_index],
smaller_leaf_splits_->LeafIndex(),
......
......@@ -10,7 +10,7 @@
namespace LightGBM {
/*!
* \brief used to find splits candidates for a leaf
* \brief used to find split candidates for a leaf
*/
class LeafSplits {
public:
......@@ -26,7 +26,8 @@ public:
}
/*!
* \brief Init splits on current leaf, don't need to traverse all data
* \brief Init split on current leaf on partial data.
* \param leaf Index of current leaf
* \param data_partition current data partition
* \param sum_gradients
......@@ -43,7 +44,7 @@ public:
}
/*!
* \brief Init splits on current leaf, need to traverse all data to sum up
* \brief Init splits on current leaf, it will traverse all data to sum up the results
* \param gradients
* \param hessians
*/
......@@ -66,7 +67,7 @@ public:
}
/*!
* \brief Init splits on current leaf, need to travesal all data to sum up
* \brief Init splits on current leaf of partial data.
* \param leaf Index of current leaf
* \param data_partition current data partition
* \param gradients
......
......@@ -14,8 +14,8 @@ namespace LightGBM {
/*!
* \brief Feature parallel learning algorithm.
* Different machine will find best split on different features, then sync global best split
* When #data is small or #feature is large, you can use this to have better speed-up
* Different machine will find best split on different features, then sync global best split
* It is recommended to use this when #data is small or #feature is large
*/
class FeatureParallelTreeLearner: public SerialTreeLearner {
public:
......@@ -39,8 +39,8 @@ private:
/*!
* \brief Data parallel learning algorithm.
* Workers use local data to construct histograms locally, then sync up global histograms.
* When #data is large or #feature is small, you can use this to have better speed-up
* Workers use local data to construct histograms locally, then sync up global histograms.
* It is recommended to use this when #data is large or #feature is small
*/
class DataParallelTreeLearner: public SerialTreeLearner {
public:
......
......@@ -8,8 +8,7 @@
namespace LightGBM {
SerialTreeLearner::SerialTreeLearner(const TreeConfig& tree_config)
:data_partition_(nullptr), is_feature_used_(nullptr),
historical_histogram_array_(nullptr), smaller_leaf_histogram_array_(nullptr),
:data_partition_(nullptr), is_feature_used_(nullptr), smaller_leaf_histogram_array_(nullptr),
larger_leaf_histogram_array_(nullptr),
smaller_leaf_splits_(nullptr), larger_leaf_splits_(nullptr),
ordered_gradients_(nullptr), ordered_hessians_(nullptr), is_data_in_leaf_(nullptr) {
......@@ -19,6 +18,8 @@ SerialTreeLearner::SerialTreeLearner(const TreeConfig& tree_config)
min_sum_hessian_one_leaf_ = static_cast<float>(tree_config.min_sum_hessian_in_leaf);
feature_fraction_ = tree_config.feature_fraction;
random_ = Random(tree_config.feature_fraction_seed);
histogram_pool_size_ = tree_config.histogram_pool_size;
max_depth_ = tree_config.max_depth;
}
SerialTreeLearner::~SerialTreeLearner() {
......@@ -26,11 +27,11 @@ SerialTreeLearner::~SerialTreeLearner() {
if (smaller_leaf_splits_ != nullptr) { delete smaller_leaf_splits_; }
if (larger_leaf_splits_ != nullptr) { delete larger_leaf_splits_; }
for (int i = 0; i < num_leaves_; ++i) {
if (historical_histogram_array_[i] != nullptr) {
delete[] historical_histogram_array_[i];
FeatureHistogram* ptr = nullptr;
if (histogram_pool_.Get(i, &ptr)) {
delete[] ptr;
}
}
if (historical_histogram_array_ != nullptr) { delete[] historical_histogram_array_; }
if (is_feature_used_ != nullptr) { delete[] is_feature_used_; }
if (ordered_gradients_ != nullptr) { delete[] ordered_gradients_; }
if (ordered_hessians_ != nullptr) { delete[] ordered_hessians_; }
......@@ -46,16 +47,31 @@ void SerialTreeLearner::Init(const Dataset* train_data) {
train_data_ = train_data;
num_data_ = train_data_->num_data();
num_features_ = train_data_->num_features();
int max_cache_size = 0;
// Get the max size of pool
if (histogram_pool_size_ < 0) {
max_cache_size = num_leaves_;
} else {
size_t total_histogram_size = 0;
for (int i = 0; i < train_data_->num_features(); ++i) {
total_histogram_size += sizeof(HistogramBinEntry) * train_data_->FeatureAt(i)->num_bin();
}
max_cache_size = static_cast<int>(histogram_pool_size_ * 1024 * 1024 / total_histogram_size);
}
// at least need 2 leaves
max_cache_size = Common::Max(2, max_cache_size);
max_cache_size = Common::Min(max_cache_size, num_leaves_);
histogram_pool_.ResetSize(max_cache_size, num_leaves_);
// allocate the space for historical_histogram_array_
historical_histogram_array_ = new FeatureHistogram*[num_leaves_];
for (int i = 0; i < num_leaves_; ++i) {
historical_histogram_array_[i] = new FeatureHistogram[train_data_->num_features()];
for (int i = 0; i < max_cache_size; ++i) {
FeatureHistogram* tmp_histogram_array = new FeatureHistogram[train_data_->num_features()];
for (int j = 0; j < train_data_->num_features(); ++j) {
historical_histogram_array_[i][j].Init(train_data_->FeatureAt(j),
tmp_histogram_array[j].Init(train_data_->FeatureAt(j),
j, min_num_data_one_leaf_,
min_sum_hessian_one_leaf_);
}
// set data at i-th position
histogram_pool_.Set(i, tmp_histogram_array);
}
// push split information for all leaves
for (int i = 0; i < num_leaves_; ++i) {
......@@ -79,7 +95,7 @@ void SerialTreeLearner::Init(const Dataset* train_data) {
break;
}
}
// initialize splits for leaf
// initialize splits for leaf
smaller_leaf_splits_ = new LeafSplits(train_data_->num_features(), train_data_->num_data());
larger_leaf_splits_ = new LeafSplits(train_data_->num_features(), train_data_->num_data());
......@@ -95,7 +111,7 @@ void SerialTreeLearner::Init(const Dataset* train_data) {
if (has_ordered_bin_) {
is_data_in_leaf_ = new char[num_data_];
}
Log::Stdout("#data:%d #feature:%d\n", num_data_, num_features_);
Log::Info("Number of data:%d, Number of features:%d", num_data_, num_features_);
}
......@@ -105,6 +121,8 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians
// some initial works before training
BeforeTrain();
Tree *tree = new Tree(num_leaves_);
// save pointer to last trained tree
last_trained_tree_ = tree;
// root leaf
int left_leaf = 0;
// only root leaf can be splitted on first time
......@@ -123,19 +141,19 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians
const SplitInfo& best_leaf_SplitInfo = best_split_per_leaf_[best_leaf];
// cannot split, quit
if (best_leaf_SplitInfo.gain <= 0.0) {
Log::Stdout("cannot find more split with gain = %f , current #leaves=%d\n",
Log::Info("cannot find more split with gain = %f , current #leaves=%d",
best_leaf_SplitInfo.gain, split + 1);
break;
}
// split tree with best leaf
Split(tree, best_leaf, &left_leaf, &right_leaf);
}
// save pointer to last trained tree
last_trained_tree_ = tree;
return tree;
}
void SerialTreeLearner::BeforeTrain() {
// reset histogram pool
histogram_pool_.ResetMap();
// initialize used features
for (int i = 0; i < num_features_; ++i) {
is_feature_used_[i] = false;
......@@ -146,13 +164,7 @@ void SerialTreeLearner::BeforeTrain() {
for (auto idx : used_feature_indices) {
is_feature_used_[idx] = true;
}
// set all histogram to splittable
#pragma omp parallel for schedule(static)
for (int i = 0; i < num_leaves_; ++i) {
for (int j = 0; j < train_data_->num_features(); ++j) {
historical_histogram_array_[i][j].set_is_splittable(true);
}
}
// initialize data partition
data_partition_->Init();
......@@ -166,8 +178,8 @@ void SerialTreeLearner::BeforeTrain() {
// use all data
smaller_leaf_splits_->Init(gradients_, hessians_);
// point to gradients, avoid copy
ptr_to_ordered_gradients_ = gradients_;
ptr_to_ordered_hessians_ = hessians_;
ptr_to_ordered_gradients_smaller_leaf_ = gradients_;
ptr_to_ordered_hessians_smaller_leaf_ = hessians_;
} else {
// use bagging, only use part of data
smaller_leaf_splits_->Init(0, data_partition_, gradients_, hessians_);
......@@ -180,10 +192,13 @@ void SerialTreeLearner::BeforeTrain() {
ordered_hessians_[i] = hessians_[indices[i]];
}
// point to ordered_gradients_ and ordered_hessians_
ptr_to_ordered_gradients_ = ordered_gradients_;
ptr_to_ordered_hessians_ = ordered_hessians_;
ptr_to_ordered_gradients_smaller_leaf_ = ordered_gradients_;
ptr_to_ordered_hessians_smaller_leaf_ = ordered_hessians_;
}
ptr_to_ordered_gradients_larger_leaf_ = nullptr;
ptr_to_ordered_hessians_larger_leaf_ = nullptr;
larger_leaf_splits_->Init();
// if has ordered bin, need to initialize the ordered bin
......@@ -220,6 +235,17 @@ void SerialTreeLearner::BeforeTrain() {
}
bool SerialTreeLearner::BeforeFindBestSplit(int left_leaf, int right_leaf) {
// check depth of current leaf
if (max_depth_ > 0) {
// only need to check the left leaf, since the right leaf is at the same level as the left leaf
if (last_trained_tree_->leaf_depth(left_leaf) >= max_depth_) {
best_split_per_leaf_[left_leaf].gain = kMinScore;
if (right_leaf >= 0) {
best_split_per_leaf_[right_leaf].gain = kMinScore;
}
return false;
}
}
data_size_t num_data_in_left_child = GetGlobalDataCountInLeaf(left_leaf);
data_size_t num_data_in_right_child = GetGlobalDataCountInLeaf(right_leaf);
// no enough data to continue
......@@ -231,26 +257,28 @@ bool SerialTreeLearner::BeforeFindBestSplit(int left_leaf, int right_leaf) {
}
return false;
}
parent_leaf_histogram_array_ = nullptr;
// -1 if there is only one leaf; otherwise, the index of the smaller leaf
int smaller_leaf = -1;
int larger_leaf = -1;
// only have root
if (right_leaf < 0) {
smaller_leaf_histogram_array_ = historical_histogram_array_[left_leaf];
histogram_pool_.Get(left_leaf, &smaller_leaf_histogram_array_);
larger_leaf_histogram_array_ = nullptr;
} else if (num_data_in_left_child < num_data_in_right_child) {
smaller_leaf = left_leaf;
larger_leaf = right_leaf;
// put parent(left) leaf's histograms into larger leaf's histograms
larger_leaf_histogram_array_ = historical_histogram_array_[left_leaf];
smaller_leaf_histogram_array_ = historical_histogram_array_[right_leaf];
// We will construct histograms for the smaller leaf, and smaller_leaf = left_leaf = parent.
// If we don't swap the cache, we would overwrite the parent's histogram cache.
std::swap(historical_histogram_array_[left_leaf], historical_histogram_array_[right_leaf]);
if (histogram_pool_.Get(left_leaf, &larger_leaf_histogram_array_)) { parent_leaf_histogram_array_ = larger_leaf_histogram_array_; }
histogram_pool_.Move(left_leaf, right_leaf);
histogram_pool_.Get(left_leaf, &smaller_leaf_histogram_array_);
} else {
smaller_leaf = right_leaf;
larger_leaf = left_leaf;
// put parent(left) leaf's histograms to larger leaf's histograms
larger_leaf_histogram_array_ = historical_histogram_array_[left_leaf];
smaller_leaf_histogram_array_ = historical_histogram_array_[right_leaf];
if (histogram_pool_.Get(left_leaf, &larger_leaf_histogram_array_)) { parent_leaf_histogram_array_ = larger_leaf_histogram_array_; }
histogram_pool_.Get(right_leaf, &smaller_leaf_histogram_array_);
}
// init for the ordered gradients, only initialize when have 2 leaves
......@@ -268,8 +296,23 @@ bool SerialTreeLearner::BeforeFindBestSplit(int left_leaf, int right_leaf) {
ordered_hessians_[i - begin] = hessians_[indices[i]];
}
// assign pointer
ptr_to_ordered_gradients_ = ordered_gradients_;
ptr_to_ordered_hessians_ = ordered_hessians_;
ptr_to_ordered_gradients_smaller_leaf_ = ordered_gradients_;
ptr_to_ordered_hessians_smaller_leaf_ = ordered_hessians_;
if (parent_leaf_histogram_array_ == nullptr) {
// need ordered gradients for the larger leaf
data_size_t smaller_size = end - begin;
data_size_t larger_begin = data_partition_->leaf_begin(larger_leaf);
data_size_t larger_end = larger_begin + data_partition_->leaf_count(larger_leaf);
// copy
#pragma omp parallel for schedule(static)
for (data_size_t i = larger_begin; i < larger_end; ++i) {
ordered_gradients_[smaller_size + i - larger_begin] = gradients_[indices[i]];
ordered_hessians_[smaller_size + i - larger_begin] = hessians_[indices[i]];
}
ptr_to_ordered_gradients_larger_leaf_ = ordered_gradients_ + smaller_size;
ptr_to_ordered_hessians_larger_leaf_ = ordered_hessians_ + smaller_size;
}
}
// split for the ordered bin
......@@ -301,7 +344,7 @@ void SerialTreeLearner::FindBestThresholds() {
// feature is not used
if ((is_feature_used_ != nullptr && is_feature_used_[feature_index] == false)) continue;
// if parent(larger) leaf cannot split at current feature
if (larger_leaf_histogram_array_ != nullptr && !larger_leaf_histogram_array_[feature_index].is_splittable()) {
if (parent_leaf_histogram_array_ != nullptr && !parent_leaf_histogram_array_[feature_index].is_splittable()) {
smaller_leaf_histogram_array_[feature_index].set_is_splittable(false);
continue;
}
......@@ -313,8 +356,8 @@ void SerialTreeLearner::FindBestThresholds() {
smaller_leaf_splits_->num_data_in_leaf(),
smaller_leaf_splits_->sum_gradients(),
smaller_leaf_splits_->sum_hessians(),
ptr_to_ordered_gradients_,
ptr_to_ordered_hessians_);
ptr_to_ordered_gradients_smaller_leaf_,
ptr_to_ordered_hessians_smaller_leaf_);
} else {
// used ordered bin
smaller_leaf_histogram_array_[feature_index].Construct(ordered_bins_[feature_index],
......@@ -331,9 +374,30 @@ void SerialTreeLearner::FindBestThresholds() {
// only has root leaf
if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->LeafIndex() < 0) continue;
// construct histograms for large leaf, we initialize larger leaf as the parent,
// so we can just subtract the smaller leaf's histograms
larger_leaf_histogram_array_[feature_index].Subtract(smaller_leaf_histogram_array_[feature_index]);
if (parent_leaf_histogram_array_ != nullptr) {
// construct histograms for the larger leaf; we initialize the larger leaf as the parent,
// so we can just subtract the smaller leaf's histograms
larger_leaf_histogram_array_[feature_index].Subtract(smaller_leaf_histogram_array_[feature_index]);
} else {
if (ordered_bins_[feature_index] == nullptr) {
// if not use ordered bin
larger_leaf_histogram_array_[feature_index].Construct(larger_leaf_splits_->data_indices(),
larger_leaf_splits_->num_data_in_leaf(),
larger_leaf_splits_->sum_gradients(),
larger_leaf_splits_->sum_hessians(),
ptr_to_ordered_gradients_larger_leaf_,
ptr_to_ordered_hessians_larger_leaf_);
} else {
// used ordered bin
larger_leaf_histogram_array_[feature_index].Construct(ordered_bins_[feature_index],
larger_leaf_splits_->LeafIndex(),
larger_leaf_splits_->num_data_in_leaf(),
larger_leaf_splits_->sum_gradients(),
larger_leaf_splits_->sum_hessians(),
gradients_,
hessians_);
}
}
// find best threshold for larger child
larger_leaf_histogram_array_[feature_index].FindBestThreshold(&larger_leaf_splits_->BestSplitPerFeature()[feature_index]);
......
......@@ -3,6 +3,7 @@
#include <LightGBM/utils/random.h>
#include <LightGBM/utils/array_args.h>
#include <LightGBM/utils/lru_pool.h>
#include <LightGBM/tree_learner.h>
#include <LightGBM/dataset.h>
......@@ -122,8 +123,8 @@ protected:
Random random_;
/*! \brief used for sub feature training, is_feature_used_[i] = false means don't used feature i */
bool* is_feature_used_;
/*! \brief cache historical histogram to speed up */
FeatureHistogram** historical_histogram_array_;
/*! \brief pointer to histograms array of parent of current leaves */
FeatureHistogram* parent_leaf_histogram_array_;
/*! \brief pointer to histograms array of smaller leaf */
FeatureHistogram* smaller_leaf_histogram_array_;
/*! \brief pointer to histograms array of larger leaf */
......@@ -143,15 +144,27 @@ protected:
score_t* ordered_hessians_;
/*! \brief Pointer to ordered_gradients_, use this to avoid copy at BeforeTrain */
const score_t* ptr_to_ordered_gradients_;
const score_t* ptr_to_ordered_gradients_smaller_leaf_;
/*! \brief Pointer to ordered_hessians_, use this to avoid copy at BeforeTrain*/
const score_t* ptr_to_ordered_hessians_;
const score_t* ptr_to_ordered_hessians_smaller_leaf_;
/*! \brief Pointer to ordered_gradients_, use this to avoid copy at BeforeTrain */
const score_t* ptr_to_ordered_gradients_larger_leaf_;
/*! \brief Pointer to ordered_hessians_, use this to avoid copy at BeforeTrain*/
const score_t* ptr_to_ordered_hessians_larger_leaf_;
/*! \brief Store ordered bin */
std::vector<OrderedBin*> ordered_bins_;
/*! \brief True if has ordered bin */
bool has_ordered_bin_ = false;
/*! \brief is_data_in_leaf_[i] != 0 means i-th data is marked */
char* is_data_in_leaf_;
/*! \brief max cache size (unit: MB) for historical histograms; < 0 means no limit */
double histogram_pool_size_;
/*! \brief used to cache historical histogram to speed up*/
LRUPool<FeatureHistogram*> histogram_pool_;
/*! \brief max depth of tree model */
int max_depth_;
};
......
......@@ -170,6 +170,7 @@
<ClInclude Include="..\include\LightGBM\utils\array_args.h" />
<ClInclude Include="..\include\LightGBM\utils\common.h" />
<ClInclude Include="..\include\LightGBM\utils\log.h" />
<ClInclude Include="..\include\LightGBM\utils\lru_pool.h" />
<ClInclude Include="..\include\LightGBM\utils\pipeline_reader.h" />
<ClInclude Include="..\include\LightGBM\utils\random.h" />
<ClInclude Include="..\include\LightGBM\utils\text_reader.h" />
......
......@@ -156,6 +156,9 @@
<ClInclude Include="..\include\LightGBM\utils\threading.h">
<Filter>include\LightGBM\utils</Filter>
</ClInclude>
<ClInclude Include="..\include\LightGBM\utils\lru_pool.h">
<Filter>include\LightGBM\utils</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\src\application\application.cpp">
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment