Commit dce329e5 authored by Hui Xue's avatar Hui Xue
Browse files

Merge remote-tracking branch 'upstream/master'

# Conflicts:
#	src/io/dataset.cpp
#	src/io/ordered_sparse_bin.hpp
#	src/treelearner/leaf_splits.hpp
#	src/treelearner/serial_tree_learner.cpp
parents 0b9fe27a a6a75fe9
......@@ -31,7 +31,7 @@ public:
optimize_pos_at_ = config.max_position;
sigmoid_table_ = nullptr;
if (sigmoid_ <= 0.0) {
Log::Stderr("sigmoid param %f should greater than zero", sigmoid_);
Log::Fatal("sigmoid param %f should greater than zero", sigmoid_);
}
}
~LambdarankNDCG() {
......@@ -47,7 +47,7 @@ public:
// get boundaries
query_boundaries_ = metadata.query_boundaries();
if (query_boundaries_ == nullptr) {
Log::Stderr("For NDCG metric, should have query information");
Log::Fatal("For NDCG metric, should have query information");
}
num_queries_ = metadata.num_queries();
// cache inverse max DCG, avoid computation many times
......
......@@ -143,8 +143,8 @@ void DataParallelTreeLearner::FindBestThresholds() {
smaller_leaf_splits_->num_data_in_leaf(),
smaller_leaf_splits_->sum_gradients(),
smaller_leaf_splits_->sum_hessians(),
ptr_to_ordered_gradients_,
ptr_to_ordered_hessians_);
ptr_to_ordered_gradients_smaller_leaf_,
ptr_to_ordered_hessians_smaller_leaf_);
} else {
smaller_leaf_histogram_array_[feature_index].Construct(ordered_bins_[feature_index],
smaller_leaf_splits_->LeafIndex(),
......
......@@ -10,7 +10,7 @@
namespace LightGBM {
/*!
* \brief used to find splits candidates for a leaf
* \brief used to find split candidates for a leaf
*/
class LeafSplits {
public:
......@@ -26,7 +26,8 @@ public:
}
/*!
* \brief Init splits on current leaf, don't need to traverse all data
* \brief Init split on current leaf on partial data.
* \param leaf Index of current leaf
* \param data_partition current data partition
* \param sum_gradients
......@@ -43,7 +44,7 @@ public:
}
/*!
* \brief Init splits on current leaf, need to traverse all data to sum up
* \brief Init splits on current leaf, it will traverse all data to sum up the results
* \param gradients
* \param hessians
*/
......@@ -66,7 +67,7 @@ public:
}
/*!
* \brief Init splits on current leaf, need to travesal all data to sum up
* \brief Init splits on current leaf of partial data.
* \param leaf Index of current leaf
* \param data_partition current data partition
* \param gradients
......
......@@ -14,8 +14,8 @@ namespace LightGBM {
/*!
* \brief Feature parallel learning algorithm.
* Different machine will find best split on different features, then sync global best split
* When #data is small or #feature is large, you can use this to have better speed-up
* Different machine will find best split on different features, then sync global best split
* It is recommended to use this when #data is small or #feature is large
*/
class FeatureParallelTreeLearner: public SerialTreeLearner {
public:
......@@ -39,8 +39,8 @@ private:
/*!
* \brief Data parallel learning algorithm.
* Workers use local data to construct histograms locally, then sync up global histograms.
* When #data is large or #feature is small, you can use this to have better speed-up
* Workers use local data to construct histograms locally, then sync up global histograms.
* It is recommended to use this when #data is large or #feature is small
*/
class DataParallelTreeLearner: public SerialTreeLearner {
public:
......
......@@ -8,8 +8,7 @@
namespace LightGBM {
SerialTreeLearner::SerialTreeLearner(const TreeConfig& tree_config)
:data_partition_(nullptr), is_feature_used_(nullptr),
historical_histogram_array_(nullptr), smaller_leaf_histogram_array_(nullptr),
:data_partition_(nullptr), is_feature_used_(nullptr), smaller_leaf_histogram_array_(nullptr),
larger_leaf_histogram_array_(nullptr),
smaller_leaf_splits_(nullptr), larger_leaf_splits_(nullptr),
ordered_gradients_(nullptr), ordered_hessians_(nullptr), is_data_in_leaf_(nullptr) {
......@@ -19,6 +18,8 @@ SerialTreeLearner::SerialTreeLearner(const TreeConfig& tree_config)
min_sum_hessian_one_leaf_ = static_cast<float>(tree_config.min_sum_hessian_in_leaf);
feature_fraction_ = tree_config.feature_fraction;
random_ = Random(tree_config.feature_fraction_seed);
histogram_pool_size_ = tree_config.histogram_pool_size;
max_depth_ = tree_config.max_depth;
}
SerialTreeLearner::~SerialTreeLearner() {
......@@ -26,11 +27,11 @@ SerialTreeLearner::~SerialTreeLearner() {
if (smaller_leaf_splits_ != nullptr) { delete smaller_leaf_splits_; }
if (larger_leaf_splits_ != nullptr) { delete larger_leaf_splits_; }
for (int i = 0; i < num_leaves_; ++i) {
if (historical_histogram_array_[i] != nullptr) {
delete[] historical_histogram_array_[i];
FeatureHistogram* ptr = nullptr;
if (histogram_pool_.Get(i, &ptr)) {
delete[] ptr;
}
}
if (historical_histogram_array_ != nullptr) { delete[] historical_histogram_array_; }
if (is_feature_used_ != nullptr) { delete[] is_feature_used_; }
if (ordered_gradients_ != nullptr) { delete[] ordered_gradients_; }
if (ordered_hessians_ != nullptr) { delete[] ordered_hessians_; }
......@@ -46,16 +47,31 @@ void SerialTreeLearner::Init(const Dataset* train_data) {
train_data_ = train_data;
num_data_ = train_data_->num_data();
num_features_ = train_data_->num_features();
int max_cache_size = 0;
// Get the max size of pool
if (histogram_pool_size_ < 0) {
max_cache_size = num_leaves_;
} else {
size_t total_histogram_size = 0;
for (int i = 0; i < train_data_->num_features(); ++i) {
total_histogram_size += sizeof(HistogramBinEntry) * train_data_->FeatureAt(i)->num_bin();
}
max_cache_size = static_cast<int>(histogram_pool_size_ * 1024 * 1024 / total_histogram_size);
}
// at least need 2 leaves
max_cache_size = Common::Max(2, max_cache_size);
max_cache_size = Common::Min(max_cache_size, num_leaves_);
histogram_pool_.ResetSize(max_cache_size, num_leaves_);
// allocate the space for historical_histogram_array_
historical_histogram_array_ = new FeatureHistogram*[num_leaves_];
for (int i = 0; i < num_leaves_; ++i) {
historical_histogram_array_[i] = new FeatureHistogram[train_data_->num_features()];
for (int i = 0; i < max_cache_size; ++i) {
FeatureHistogram* tmp_histogram_array = new FeatureHistogram[train_data_->num_features()];
for (int j = 0; j < train_data_->num_features(); ++j) {
historical_histogram_array_[i][j].Init(train_data_->FeatureAt(j),
tmp_histogram_array[j].Init(train_data_->FeatureAt(j),
j, min_num_data_one_leaf_,
min_sum_hessian_one_leaf_);
}
// set data at i-th position
histogram_pool_.Set(i, tmp_histogram_array);
}
// push split information for all leaves
for (int i = 0; i < num_leaves_; ++i) {
......@@ -79,7 +95,7 @@ void SerialTreeLearner::Init(const Dataset* train_data) {
break;
}
}
// initialize splits for leaf
// initialize splits for leaf
smaller_leaf_splits_ = new LeafSplits(train_data_->num_features(), train_data_->num_data());
larger_leaf_splits_ = new LeafSplits(train_data_->num_features(), train_data_->num_data());
......@@ -95,7 +111,7 @@ void SerialTreeLearner::Init(const Dataset* train_data) {
if (has_ordered_bin_) {
is_data_in_leaf_ = new char[num_data_];
}
Log::Stdout("#data:%d #feature:%d\n", num_data_, num_features_);
Log::Info("Number of data:%d, Number of features:%d", num_data_, num_features_);
}
......@@ -105,6 +121,8 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians
// some initial works before training
BeforeTrain();
Tree *tree = new Tree(num_leaves_);
// save pointer to last trained tree
last_trained_tree_ = tree;
// root leaf
int left_leaf = 0;
// only root leaf can be splitted on first time
......@@ -123,19 +141,19 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians
const SplitInfo& best_leaf_SplitInfo = best_split_per_leaf_[best_leaf];
// cannot split, quit
if (best_leaf_SplitInfo.gain <= 0.0) {
Log::Stdout("cannot find more split with gain = %f , current #leaves=%d\n",
Log::Info("cannot find more split with gain = %f , current #leaves=%d",
best_leaf_SplitInfo.gain, split + 1);
break;
}
// split tree with best leaf
Split(tree, best_leaf, &left_leaf, &right_leaf);
}
// save pointer to last trained tree
last_trained_tree_ = tree;
return tree;
}
void SerialTreeLearner::BeforeTrain() {
// reset histogram pool
histogram_pool_.ResetMap();
// initialize used features
for (int i = 0; i < num_features_; ++i) {
is_feature_used_[i] = false;
......@@ -146,13 +164,7 @@ void SerialTreeLearner::BeforeTrain() {
for (auto idx : used_feature_indices) {
is_feature_used_[idx] = true;
}
// set all histogram to splittable
#pragma omp parallel for schedule(static)
for (int i = 0; i < num_leaves_; ++i) {
for (int j = 0; j < train_data_->num_features(); ++j) {
historical_histogram_array_[i][j].set_is_splittable(true);
}
}
// initialize data partition
data_partition_->Init();
......@@ -166,8 +178,8 @@ void SerialTreeLearner::BeforeTrain() {
// use all data
smaller_leaf_splits_->Init(gradients_, hessians_);
// point to gradients, avoid copy
ptr_to_ordered_gradients_ = gradients_;
ptr_to_ordered_hessians_ = hessians_;
ptr_to_ordered_gradients_smaller_leaf_ = gradients_;
ptr_to_ordered_hessians_smaller_leaf_ = hessians_;
} else {
// use bagging, only use part of data
smaller_leaf_splits_->Init(0, data_partition_, gradients_, hessians_);
......@@ -180,10 +192,13 @@ void SerialTreeLearner::BeforeTrain() {
ordered_hessians_[i] = hessians_[indices[i]];
}
// point to ordered_gradients_ and ordered_hessians_
ptr_to_ordered_gradients_ = ordered_gradients_;
ptr_to_ordered_hessians_ = ordered_hessians_;
ptr_to_ordered_gradients_smaller_leaf_ = ordered_gradients_;
ptr_to_ordered_hessians_smaller_leaf_ = ordered_hessians_;
}
ptr_to_ordered_gradients_larger_leaf_ = nullptr;
ptr_to_ordered_hessians_larger_leaf_ = nullptr;
larger_leaf_splits_->Init();
// if has ordered bin, need to initialize the ordered bin
......@@ -220,6 +235,17 @@ void SerialTreeLearner::BeforeTrain() {
}
bool SerialTreeLearner::BeforeFindBestSplit(int left_leaf, int right_leaf) {
// check depth of current leaf
if (max_depth_ > 0) {
// only need to check the left leaf, since the right leaf is at the same level as the left leaf
if (last_trained_tree_->leaf_depth(left_leaf) >= max_depth_) {
best_split_per_leaf_[left_leaf].gain = kMinScore;
if (right_leaf >= 0) {
best_split_per_leaf_[right_leaf].gain = kMinScore;
}
return false;
}
}
data_size_t num_data_in_left_child = GetGlobalDataCountInLeaf(left_leaf);
data_size_t num_data_in_right_child = GetGlobalDataCountInLeaf(right_leaf);
// no enough data to continue
......@@ -231,26 +257,28 @@ bool SerialTreeLearner::BeforeFindBestSplit(int left_leaf, int right_leaf) {
}
return false;
}
parent_leaf_histogram_array_ = nullptr;
// -1 if there is only one leaf; otherwise, the index of the smaller leaf
int smaller_leaf = -1;
int larger_leaf = -1;
// only have root
if (right_leaf < 0) {
smaller_leaf_histogram_array_ = historical_histogram_array_[left_leaf];
histogram_pool_.Get(left_leaf, &smaller_leaf_histogram_array_);
larger_leaf_histogram_array_ = nullptr;
} else if (num_data_in_left_child < num_data_in_right_child) {
smaller_leaf = left_leaf;
larger_leaf = right_leaf;
// put parent(left) leaf's histograms into larger leaf's histograms
larger_leaf_histogram_array_ = historical_histogram_array_[left_leaf];
smaller_leaf_histogram_array_ = historical_histogram_array_[right_leaf];
// We will construct histograms for the smaller leaf, and smaller_leaf = left_leaf = parent.
// If we don't swap the cache, we would overwrite the parent's histogram cache.
std::swap(historical_histogram_array_[left_leaf], historical_histogram_array_[right_leaf]);
if (histogram_pool_.Get(left_leaf, &larger_leaf_histogram_array_)) { parent_leaf_histogram_array_ = larger_leaf_histogram_array_; }
histogram_pool_.Move(left_leaf, right_leaf);
histogram_pool_.Get(left_leaf, &smaller_leaf_histogram_array_);
} else {
smaller_leaf = right_leaf;
larger_leaf = left_leaf;
// put parent(left) leaf's histograms to larger leaf's histograms
larger_leaf_histogram_array_ = historical_histogram_array_[left_leaf];
smaller_leaf_histogram_array_ = historical_histogram_array_[right_leaf];
if (histogram_pool_.Get(left_leaf, &larger_leaf_histogram_array_)) { parent_leaf_histogram_array_ = larger_leaf_histogram_array_; }
histogram_pool_.Get(right_leaf, &smaller_leaf_histogram_array_);
}
// init for the ordered gradients, only initialize when have 2 leaves
......@@ -268,8 +296,23 @@ bool SerialTreeLearner::BeforeFindBestSplit(int left_leaf, int right_leaf) {
ordered_hessians_[i - begin] = hessians_[indices[i]];
}
// assign pointer
ptr_to_ordered_gradients_ = ordered_gradients_;
ptr_to_ordered_hessians_ = ordered_hessians_;
ptr_to_ordered_gradients_smaller_leaf_ = ordered_gradients_;
ptr_to_ordered_hessians_smaller_leaf_ = ordered_hessians_;
if (parent_leaf_histogram_array_ == nullptr) {
// need ordered gradients for the larger leaf
data_size_t smaller_size = end - begin;
data_size_t larger_begin = data_partition_->leaf_begin(larger_leaf);
data_size_t larger_end = larger_begin + data_partition_->leaf_count(larger_leaf);
// copy
#pragma omp parallel for schedule(static)
for (data_size_t i = larger_begin; i < larger_end; ++i) {
ordered_gradients_[smaller_size + i - larger_begin] = gradients_[indices[i]];
ordered_hessians_[smaller_size + i - larger_begin] = hessians_[indices[i]];
}
ptr_to_ordered_gradients_larger_leaf_ = ordered_gradients_ + smaller_size;
ptr_to_ordered_hessians_larger_leaf_ = ordered_hessians_ + smaller_size;
}
}
// split for the ordered bin
......@@ -301,7 +344,7 @@ void SerialTreeLearner::FindBestThresholds() {
// feature is not used
if ((is_feature_used_ != nullptr && is_feature_used_[feature_index] == false)) continue;
// if parent(larger) leaf cannot split at current feature
if (larger_leaf_histogram_array_ != nullptr && !larger_leaf_histogram_array_[feature_index].is_splittable()) {
if (parent_leaf_histogram_array_ != nullptr && !parent_leaf_histogram_array_[feature_index].is_splittable()) {
smaller_leaf_histogram_array_[feature_index].set_is_splittable(false);
continue;
}
......@@ -313,8 +356,8 @@ void SerialTreeLearner::FindBestThresholds() {
smaller_leaf_splits_->num_data_in_leaf(),
smaller_leaf_splits_->sum_gradients(),
smaller_leaf_splits_->sum_hessians(),
ptr_to_ordered_gradients_,
ptr_to_ordered_hessians_);
ptr_to_ordered_gradients_smaller_leaf_,
ptr_to_ordered_hessians_smaller_leaf_);
} else {
// used ordered bin
smaller_leaf_histogram_array_[feature_index].Construct(ordered_bins_[feature_index],
......@@ -331,9 +374,30 @@ void SerialTreeLearner::FindBestThresholds() {
// only has root leaf
if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->LeafIndex() < 0) continue;
// construct histograms for large leaf, we initialize larger leaf as the parent,
// so we can just subtract the smaller leaf's histograms
larger_leaf_histogram_array_[feature_index].Subtract(smaller_leaf_histogram_array_[feature_index]);
if (parent_leaf_histogram_array_ != nullptr) {
// construct histograms for the larger leaf; we initialize the larger leaf as the parent,
// so we can just subtract the smaller leaf's histograms
larger_leaf_histogram_array_[feature_index].Subtract(smaller_leaf_histogram_array_[feature_index]);
} else {
if (ordered_bins_[feature_index] == nullptr) {
// if not use ordered bin
larger_leaf_histogram_array_[feature_index].Construct(larger_leaf_splits_->data_indices(),
larger_leaf_splits_->num_data_in_leaf(),
larger_leaf_splits_->sum_gradients(),
larger_leaf_splits_->sum_hessians(),
ptr_to_ordered_gradients_larger_leaf_,
ptr_to_ordered_hessians_larger_leaf_);
} else {
// used ordered bin
larger_leaf_histogram_array_[feature_index].Construct(ordered_bins_[feature_index],
larger_leaf_splits_->LeafIndex(),
larger_leaf_splits_->num_data_in_leaf(),
larger_leaf_splits_->sum_gradients(),
larger_leaf_splits_->sum_hessians(),
gradients_,
hessians_);
}
}
// find best threshold for larger child
larger_leaf_histogram_array_[feature_index].FindBestThreshold(&larger_leaf_splits_->BestSplitPerFeature()[feature_index]);
......
......@@ -3,6 +3,7 @@
#include <LightGBM/utils/random.h>
#include <LightGBM/utils/array_args.h>
#include <LightGBM/utils/lru_pool.h>
#include <LightGBM/tree_learner.h>
#include <LightGBM/dataset.h>
......@@ -122,8 +123,8 @@ protected:
Random random_;
/*! \brief used for sub feature training, is_feature_used_[i] = false means don't used feature i */
bool* is_feature_used_;
/*! \brief cache historical histogram to speed up */
FeatureHistogram** historical_histogram_array_;
/*! \brief pointer to histograms array of parent of current leaves */
FeatureHistogram* parent_leaf_histogram_array_;
/*! \brief pointer to histograms array of smaller leaf */
FeatureHistogram* smaller_leaf_histogram_array_;
/*! \brief pointer to histograms array of larger leaf */
......@@ -143,15 +144,27 @@ protected:
score_t* ordered_hessians_;
/*! \brief Pointer to ordered_gradients_, use this to avoid copy at BeforeTrain */
const score_t* ptr_to_ordered_gradients_;
const score_t* ptr_to_ordered_gradients_smaller_leaf_;
/*! \brief Pointer to ordered_hessians_, use this to avoid copy at BeforeTrain*/
const score_t* ptr_to_ordered_hessians_;
const score_t* ptr_to_ordered_hessians_smaller_leaf_;
/*! \brief Pointer to ordered_gradients_, use this to avoid copy at BeforeTrain */
const score_t* ptr_to_ordered_gradients_larger_leaf_;
/*! \brief Pointer to ordered_hessians_, use this to avoid copy at BeforeTrain*/
const score_t* ptr_to_ordered_hessians_larger_leaf_;
/*! \brief Store ordered bin */
std::vector<OrderedBin*> ordered_bins_;
/*! \brief True if has ordered bin */
bool has_ordered_bin_ = false;
/*! \brief is_data_in_leaf_[i] != 0 means i-th data is marked */
char* is_data_in_leaf_;
/*! \brief max cache size (unit: MB) for historical histograms; < 0 means no limit */
double histogram_pool_size_;
/*! \brief used to cache historical histogram to speed up*/
LRUPool<FeatureHistogram*> histogram_pool_;
/*! \brief max depth of tree model */
int max_depth_;
};
......
......@@ -170,6 +170,7 @@
<ClInclude Include="..\include\LightGBM\utils\array_args.h" />
<ClInclude Include="..\include\LightGBM\utils\common.h" />
<ClInclude Include="..\include\LightGBM\utils\log.h" />
<ClInclude Include="..\include\LightGBM\utils\lru_pool.h" />
<ClInclude Include="..\include\LightGBM\utils\pipeline_reader.h" />
<ClInclude Include="..\include\LightGBM\utils\random.h" />
<ClInclude Include="..\include\LightGBM\utils\text_reader.h" />
......
......@@ -156,6 +156,9 @@
<ClInclude Include="..\include\LightGBM\utils\threading.h">
<Filter>include\LightGBM\utils</Filter>
</ClInclude>
<ClInclude Include="..\include\LightGBM\utils\lru_pool.h">
<Filter>include\LightGBM\utils</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\src\application\application.cpp">
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment