Commit 623ac048 authored by Guolin Ke

add lru pool to reduce memory usage when #feature is large

parent 2ec7274a
@@ -140,6 +140,8 @@ public:
  int num_leaves = 127;
  int feature_fraction_seed = 2;
  double feature_fraction = 1.0;
+  // max cache size (unit: GB) for historical histograms. < 0 means no limit
+  double histogram_pool_size = -1;
  void Set(const std::unordered_map<std::string, std::string>& params) override;
};
......
@@ -5,7 +5,6 @@
 #include <cstdlib>
 #include <cstdarg>
 #include <cstring>
-#include <fstream>
 namespace LightGBM {
......
#ifndef LIGHTGBM_UTILS_LRU_POOL_H_
#define LIGHTGBM_UTILS_LRU_POOL_H_
#include <LightGBM/utils/array_args.h>
#include <LightGBM/utils/log.h>
#include <cstring>
namespace LightGBM {
/*!
* \brief An LRU-cached object pool, used to store historical histograms
*/
template<typename T>
class LRUPool {
public:
/*!
* \brief Constructor
*/
LRUPool() {
}
/*!
* \brief Destructor
*/
~LRUPool() {
if (pool_ != nullptr) {
delete[] pool_;
}
if (mapper_ != nullptr) {
delete[] mapper_;
}
if (inverse_mapper_ != nullptr) {
delete[] inverse_mapper_;
}
if (last_used_time_ != nullptr) {
delete[] last_used_time_;
}
}
/*!
* \brief Reset pool size
* \param cache_size Max cache size
* \param total_size Total size that will be used
*/
void ResetSize(int cache_size, int total_size) {
// free old memory
if (pool_ != nullptr) {
delete[] pool_;
}
if (mapper_ != nullptr) {
delete[] mapper_;
}
if (inverse_mapper_ != nullptr) {
delete[] inverse_mapper_;
}
if (last_used_time_ != nullptr) {
delete[] last_used_time_;
}
cache_size_ = cache_size;
// need at least 2 buckets, to store the smaller and the larger leaf
CHECK(cache_size_ >= 2);
total_size_ = total_size;
pool_ = new T[cache_size];
mapper_ = new int[total_size_];
inverse_mapper_ = new int[cache_size_];
last_used_time_ = new int[cache_size_];
ResetMap();
}
/*!
* \brief Return true if this pool is large enough to store all data
*/
bool IsEnough() {
return cache_size_ == total_size_;
}
/*!
* \brief Reset mapper
*/
void ResetMap() {
cur_time_ = 0;
memset(mapper_, -1, sizeof(int)*total_size_);
memset(inverse_mapper_, -1, sizeof(int)*cache_size_);
memset(last_used_time_, 0, sizeof(int)*cache_size_);
}
/*!
* \brief Set data in the pool at a specific index
* \param idx index to set the data at
* \param data data to store
*/
void Set(int idx, const T& data) {
pool_[idx] = data;
}
/*!
* \brief Get data at a specific index
* \param idx index to get the data from
* \param out the data will be stored here
* \return True if this index is in the pool, false otherwise
*/
bool Get(int idx, T* out) {
if (mapper_[idx] >= 0) {
int slot = mapper_[idx];
*out = pool_[slot];
last_used_time_[slot] = ++cur_time_;
return true;
} else {
// choose the least recently used slot and evict it
int slot = static_cast<int>(ArrayArgs<int>::ArgMin(last_used_time_, cache_size_));
*out = pool_[slot];
last_used_time_[slot] = ++cur_time_;
// reset previous mapper
if (inverse_mapper_[slot] >= 0) mapper_[inverse_mapper_[slot]] = -1;
// update current mapper
mapper_[idx] = slot;
inverse_mapper_[slot] = idx;
return false;
}
}
/*!
* \brief Move data from one index to another
* \param src_idx source index
* \param dst_idx destination index
*/
void Move(int src_idx, int dst_idx) {
if (mapper_[src_idx] < 0) {
return;
}
// get slot of src idx
int slot = mapper_[src_idx];
// reset src_idx
mapper_[src_idx] = -1;
// move to dst idx
mapper_[dst_idx] = slot;
last_used_time_[slot] = ++cur_time_;
inverse_mapper_[slot] = dst_idx;
}
private:
T* pool_ = nullptr;
int cache_size_;
int total_size_;
int* mapper_ = nullptr;
int* inverse_mapper_ = nullptr;
int* last_used_time_ = nullptr;
int cur_time_ = 0;
};
}
#endif // LIGHTGBM_UTILS_LRU_POOL_H_
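A minimal usage sketch of the pool (using int payloads instead of FeatureHistogram*; the sizes and indices are illustrative, and the include path assumes the LightGBM source tree):

  #include <LightGBM/utils/lru_pool.h>

  void LRUPoolSketch() {
    LightGBM::LRUPool<int> pool;
    // 2 physical slots serving 4 logical indices (e.g. tree leaves)
    pool.ResetSize(2, 4);
    // Set fills physical slots directly, as SerialTreeLearner::Init does below
    pool.Set(0, 100);
    pool.Set(1, 200);
    int out;
    pool.Get(3, &out);  // miss: returns false; out receives the payload of the
                        // evicted least-recently-used slot, so its memory can be reused
    pool.Get(3, &out);  // hit: index 3 is mapped now; returns true
    pool.Move(3, 2);    // logical index 2 takes over index 3's slot
  }

Note the design choice: eviction scans last_used_time_ with ArgMin, which is O(cache_size) per miss; for a pool of at most num_leaves slots this linear scan is cheap next to rebuilding a histogram array.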
@@ -110,28 +110,30 @@ void OverallConfig::GetTaskType(const std::unordered_map<std::string, std::string>& params)
}
void OverallConfig::CheckParamConflict() {
+  GBDTConfig* gbdt_config = dynamic_cast<GBDTConfig*>(boosting_config);
  if (network_config.num_machines > 1) {
    is_parallel = true;
  } else {
    is_parallel = false;
-    dynamic_cast<GBDTConfig*>(boosting_config)->tree_learner_type =
-      TreeLearnerType::kSerialTreeLearner;
+    gbdt_config->tree_learner_type = TreeLearnerType::kSerialTreeLearner;
  }
-  if (dynamic_cast<GBDTConfig*>(boosting_config)->tree_learner_type ==
-    TreeLearnerType::kSerialTreeLearner) {
+  if (gbdt_config->tree_learner_type == TreeLearnerType::kSerialTreeLearner) {
    is_parallel = false;
    network_config.num_machines = 1;
  }
-  if (dynamic_cast<GBDTConfig*>(boosting_config)->tree_learner_type ==
-    TreeLearnerType::kSerialTreeLearner ||
-    dynamic_cast<GBDTConfig*>(boosting_config)->tree_learner_type ==
-    TreeLearnerType::kFeatureParallelTreelearner) {
+  if (gbdt_config->tree_learner_type == TreeLearnerType::kSerialTreeLearner ||
+      gbdt_config->tree_learner_type == TreeLearnerType::kFeatureParallelTreelearner) {
    is_parallel_find_bin = false;
-  } else if (dynamic_cast<GBDTConfig*>(boosting_config)->tree_learner_type ==
-    TreeLearnerType::kDataParallelTreeLearner) {
+  } else if (gbdt_config->tree_learner_type == TreeLearnerType::kDataParallelTreeLearner) {
    is_parallel_find_bin = true;
+    if (gbdt_config->tree_config.histogram_pool_size >= 0) {
+      Log::Error("Auto set histogram_pool_size to no limit, to reduce communication cost");
+      // to reduce communication cost, don't limit pool size when using data parallel
+      gbdt_config->tree_config.histogram_pool_size = -1;
+    }
  }
}
@@ -218,10 +220,11 @@ void TreeConfig::Set(const std::unordered_map<std::string, std::string>& params)
  GetDouble(params, "min_sum_hessian_in_leaf", &min_sum_hessian_in_leaf);
  CHECK(min_sum_hessian_in_leaf > 1.0f || min_data_in_leaf > 0);
  GetInt(params, "num_leaves", &num_leaves);
-  CHECK(num_leaves > 0);
+  CHECK(num_leaves > 1);
  GetInt(params, "feature_fraction_seed", &feature_fraction_seed);
  GetDouble(params, "feature_fraction", &feature_fraction);
  CHECK(feature_fraction > 0.0 && feature_fraction <= 1.0);
+  GetDouble(params, "histogram_pool_size", &histogram_pool_size);
}
......
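For reference, a hedged sketch of how the new parameter flows in through TreeConfig::Set (TreeConfig and the parameter names come from this commit; the include path and values are illustrative):

  #include <LightGBM/config.h>
  #include <string>
  #include <unordered_map>

  void ConfigureTreeSketch(LightGBM::TreeConfig* tree_config) {
    std::unordered_map<std::string, std::string> params;
    params["num_leaves"] = "255";           // must be > 1 after this commit
    params["histogram_pool_size"] = "0.5";  // cap the histogram cache at 0.5 GB
    tree_config->Set(params);               // parses and CHECKs the values
  }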
@@ -143,8 +143,8 @@ void DataParallelTreeLearner::FindBestThresholds() {
        smaller_leaf_splits_->num_data_in_leaf(),
        smaller_leaf_splits_->sum_gradients(),
        smaller_leaf_splits_->sum_hessians(),
-        ptr_to_ordered_gradients_,
-        ptr_to_ordered_hessians_);
+        ptr_to_ordered_gradients_smaller_leaf_,
+        ptr_to_ordered_hessians_smaller_leaf_);
    } else {
      smaller_leaf_histogram_array_[feature_index].Construct(ordered_bins_[feature_index],
        smaller_leaf_splits_->LeafIndex(),
......
@@ -8,8 +8,7 @@
namespace LightGBM {
SerialTreeLearner::SerialTreeLearner(const TreeConfig& tree_config)
-  :data_partition_(nullptr), is_feature_used_(nullptr),
-  historical_histogram_array_(nullptr), smaller_leaf_histogram_array_(nullptr),
+  :data_partition_(nullptr), is_feature_used_(nullptr), smaller_leaf_histogram_array_(nullptr),
  larger_leaf_histogram_array_(nullptr),
  smaller_leaf_splits_(nullptr), larger_leaf_splits_(nullptr),
  ordered_gradients_(nullptr), ordered_hessians_(nullptr), is_data_in_leaf_(nullptr) {
@@ -19,6 +18,7 @@ SerialTreeLearner::SerialTreeLearner(const TreeConfig& tree_config)
  min_sum_hessian_one_leaf_ = static_cast<float>(tree_config.min_sum_hessian_in_leaf);
  feature_fraction_ = tree_config.feature_fraction;
  random_ = Random(tree_config.feature_fraction_seed);
+  histogram_pool_size_ = tree_config.histogram_pool_size;
}
SerialTreeLearner::~SerialTreeLearner() {
@@ -26,11 +26,11 @@ SerialTreeLearner::~SerialTreeLearner() {
  if (smaller_leaf_splits_ != nullptr) { delete smaller_leaf_splits_; }
  if (larger_leaf_splits_ != nullptr) { delete larger_leaf_splits_; }
  for (int i = 0; i < num_leaves_; ++i) {
-    if (historical_histogram_array_[i] != nullptr) {
-      delete[] historical_histogram_array_[i];
+    FeatureHistogram* ptr = nullptr;
+    if (histogram_pool_.Get(i, &ptr)) {
+      delete[] ptr;
    }
  }
-  if (historical_histogram_array_ != nullptr) { delete[] historical_histogram_array_; }
  if (is_feature_used_ != nullptr) { delete[] is_feature_used_; }
  if (ordered_gradients_ != nullptr) { delete[] ordered_gradients_; }
  if (ordered_hessians_ != nullptr) { delete[] ordered_hessians_; }
@@ -46,16 +46,32 @@ void SerialTreeLearner::Init(const Dataset* train_data) {
  train_data_ = train_data;
  num_data_ = train_data_->num_data();
  num_features_ = train_data_->num_features();
+  int max_cache_size = 0;
+  // get the max size of the pool
+  if (histogram_pool_size_ < 0) {
+    max_cache_size = num_leaves_;
+  } else {
+    size_t total_histogram_size = 0;
+    for (int i = 0; i < train_data_->num_features(); ++i) {
+      total_histogram_size += sizeof(HistogramBinEntry) * train_data_->FeatureAt(i)->num_bin();
+    }
+    max_cache_size = static_cast<int>(histogram_pool_size_ * 1024 * 1024 * 1024 / total_histogram_size);
+  }
+  // need at least 2 leaves
+  max_cache_size = Common::Max(2, max_cache_size);
+  max_cache_size = Common::Min(max_cache_size, num_leaves_);
+  histogram_pool_.ResetSize(max_cache_size, num_leaves_);
-  // allocate the space for historical_histogram_array_
-  historical_histogram_array_ = new FeatureHistogram*[num_leaves_];
-  for (int i = 0; i < num_leaves_; ++i) {
-    historical_histogram_array_[i] = new FeatureHistogram[train_data_->num_features()];
+  for (int i = 0; i < max_cache_size; ++i) {
+    FeatureHistogram* tmp_histogram_array = new FeatureHistogram[train_data_->num_features()];
    for (int j = 0; j < train_data_->num_features(); ++j) {
-      historical_histogram_array_[i][j].Init(train_data_->FeatureAt(j),
+      tmp_histogram_array[j].Init(train_data_->FeatureAt(j),
        j, min_num_data_one_leaf_,
        min_sum_hessian_one_leaf_);
    }
+    // set data at the i-th position
+    histogram_pool_.Set(i, tmp_histogram_array);
  }
  // push split information for all leaves
  for (int i = 0; i < num_leaves_; ++i) {
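A hedged worked example of the cache-size arithmetic above; the 20-byte entry size is an assumption for illustration, not LightGBM's actual sizeof(HistogramBinEntry):

  #include <algorithm>

  int MaxCacheSizeSketch(double pool_size_gb, int num_features,
                         int bins_per_feature, int num_leaves) {
    const size_t entry_size = 20;  // assumed bytes per HistogramBinEntry
    size_t total_histogram_size =
      entry_size * static_cast<size_t>(num_features) * bins_per_feature;
    int max_cache_size = static_cast<int>(
      pool_size_gb * 1024 * 1024 * 1024 / total_histogram_size);
    max_cache_size = std::max(2, max_cache_size);  // at least 2 leaves
    return std::min(max_cache_size, num_leaves);   // never more than num_leaves
  }

For example, MaxCacheSizeSketch(1.0, 10000, 255, 127): one leaf's histograms take 10000 * 255 * 20 B = 51 MB, so a 1 GB pool caches about 21 of the 127 leaves.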
@@ -136,6 +152,8 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians
}
void SerialTreeLearner::BeforeTrain() {
+  // reset histogram pool
+  histogram_pool_.ResetMap();
  // initialize used features
  for (int i = 0; i < num_features_; ++i) {
    is_feature_used_[i] = false;
@@ -146,13 +164,7 @@ void SerialTreeLearner::BeforeTrain() {
  for (auto idx : used_feature_indices) {
    is_feature_used_[idx] = true;
  }
-  // set all histogram to splittable
-#pragma omp parallel for schedule(static)
-  for (int i = 0; i < num_leaves_; ++i) {
-    for (int j = 0; j < train_data_->num_features(); ++j) {
-      historical_histogram_array_[i][j].set_is_splittable(true);
-    }
-  }
  // initialize data partition
  data_partition_->Init();
@@ -166,8 +178,8 @@ void SerialTreeLearner::BeforeTrain() {
    // use all data
    smaller_leaf_splits_->Init(gradients_, hessians_);
    // point to gradients, avoid copy
-    ptr_to_ordered_gradients_ = gradients_;
-    ptr_to_ordered_hessians_ = hessians_;
+    ptr_to_ordered_gradients_smaller_leaf_ = gradients_;
+    ptr_to_ordered_hessians_smaller_leaf_ = hessians_;
  } else {
    // use bagging, only use part of data
    smaller_leaf_splits_->Init(0, data_partition_, gradients_, hessians_);
@@ -180,10 +192,13 @@ void SerialTreeLearner::BeforeTrain() {
      ordered_hessians_[i] = hessians_[indices[i]];
    }
    // point to ordered_gradients_ and ordered_hessians_
-    ptr_to_ordered_gradients_ = ordered_gradients_;
-    ptr_to_ordered_hessians_ = ordered_hessians_;
+    ptr_to_ordered_gradients_smaller_leaf_ = ordered_gradients_;
+    ptr_to_ordered_hessians_smaller_leaf_ = ordered_hessians_;
  }
+  ptr_to_ordered_gradients_larger_leaf_ = nullptr;
+  ptr_to_ordered_hessians_larger_leaf_ = nullptr;
  larger_leaf_splits_->Init();
// if has ordered bin, need to initialize the ordered bin
@@ -231,26 +246,28 @@ bool SerialTreeLearner::BeforeFindBestSplit(int left_leaf, int right_leaf) {
    }
    return false;
  }
+  parent_leaf_histogram_array_ = nullptr;
  // -1 if only has one leaf, else equal to the index of the smaller leaf
  int smaller_leaf = -1;
  int larger_leaf = -1;
  // only have root
  if (right_leaf < 0) {
-    smaller_leaf_histogram_array_ = historical_histogram_array_[left_leaf];
+    histogram_pool_.Get(left_leaf, &smaller_leaf_histogram_array_);
    larger_leaf_histogram_array_ = nullptr;
  } else if (num_data_in_left_child < num_data_in_right_child) {
    smaller_leaf = left_leaf;
    larger_leaf = right_leaf;
-    // put parent(left) leaf's histograms into larger leaf's histgrams
-    larger_leaf_histogram_array_ = historical_histogram_array_[left_leaf];
-    smaller_leaf_histogram_array_ = historical_histogram_array_[right_leaf];
-    // We will construc histograms for smaller leaf, and smaller_leaf=left_leaf = parent.
-    // if we don't swap the cache, we will overwrite the parent's hisogram cache.
-    std::swap(historical_histogram_array_[left_leaf], historical_histogram_array_[right_leaf]);
+    if (histogram_pool_.Get(left_leaf, &larger_leaf_histogram_array_)) { parent_leaf_histogram_array_ = larger_leaf_histogram_array_; }
+    histogram_pool_.Move(left_leaf, right_leaf);
+    histogram_pool_.Get(left_leaf, &smaller_leaf_histogram_array_);
  } else {
    smaller_leaf = right_leaf;
    larger_leaf = left_leaf;
-    // put parent(left) leaf's histograms to larger leaf's histgrams
-    larger_leaf_histogram_array_ = historical_histogram_array_[left_leaf];
-    smaller_leaf_histogram_array_ = historical_histogram_array_[right_leaf];
+    if (histogram_pool_.Get(left_leaf, &larger_leaf_histogram_array_)) { parent_leaf_histogram_array_ = larger_leaf_histogram_array_; }
+    histogram_pool_.Get(right_leaf, &smaller_leaf_histogram_array_);
}
// init for the ordered gradients, only initialize when have 2 leaves
@@ -268,8 +285,23 @@ bool SerialTreeLearner::BeforeFindBestSplit(int left_leaf, int right_leaf) {
      ordered_hessians_[i - begin] = hessians_[indices[i]];
    }
    // assign pointer
-    ptr_to_ordered_gradients_ = ordered_gradients_;
-    ptr_to_ordered_hessians_ = ordered_hessians_;
+    ptr_to_ordered_gradients_smaller_leaf_ = ordered_gradients_;
+    ptr_to_ordered_hessians_smaller_leaf_ = ordered_hessians_;
+    if (parent_leaf_histogram_array_ == nullptr) {
+      // the parent's histograms were evicted, so the larger leaf also needs ordered gradients
+      data_size_t smaller_size = end - begin;
+      data_size_t larger_begin = data_partition_->leaf_begin(larger_leaf);
+      data_size_t larger_end = larger_begin + data_partition_->leaf_count(larger_leaf);
+      // copy
+#pragma omp parallel for schedule(static)
+      for (data_size_t i = larger_begin; i < larger_end; ++i) {
+        ordered_gradients_[smaller_size + i - larger_begin] = gradients_[indices[i]];
+        ordered_hessians_[smaller_size + i - larger_begin] = hessians_[indices[i]];
+      }
+      ptr_to_ordered_gradients_larger_leaf_ = ordered_gradients_ + smaller_size;
+      ptr_to_ordered_hessians_larger_leaf_ = ordered_hessians_ + smaller_size;
+    }
  }
// split for the ordered bin
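When parent_leaf_histogram_array_ is nullptr the parent's cache was evicted, so the larger leaf cannot be derived by subtraction and both children's gradients are gathered into the same buffers. A sketch of the layout this produces (the function and sizes are illustrative, not LightGBM's actual code):

  #include <vector>

  void GatherBothLeaves(const std::vector<int>& indices,      // data_partition order
                        const std::vector<float>& gradients,  // per-datum gradients
                        int smaller_begin, int smaller_count,
                        int larger_begin, int larger_count,
                        std::vector<float>* ordered_gradients) {
    // [0, smaller_count)                            -> smaller leaf's gradients
    // [smaller_count, smaller_count + larger_count) -> larger leaf's gradients
    for (int i = 0; i < smaller_count; ++i)
      (*ordered_gradients)[i] = gradients[indices[smaller_begin + i]];
    for (int i = 0; i < larger_count; ++i)
      (*ordered_gradients)[smaller_count + i] = gradients[indices[larger_begin + i]];
  }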
@@ -301,7 +333,7 @@ void SerialTreeLearner::FindBestThresholds() {
    // feature is not used
    if ((is_feature_used_ != nullptr && is_feature_used_[feature_index] == false)) continue;
    // if parent(larger) leaf cannot split at current feature
-    if (larger_leaf_histogram_array_ != nullptr && !larger_leaf_histogram_array_[feature_index].is_splittable()) {
+    if (parent_leaf_histogram_array_ != nullptr && !parent_leaf_histogram_array_[feature_index].is_splittable()) {
      smaller_leaf_histogram_array_[feature_index].set_is_splittable(false);
      continue;
    }
@@ -313,8 +345,8 @@ void SerialTreeLearner::FindBestThresholds() {
      smaller_leaf_splits_->num_data_in_leaf(),
      smaller_leaf_splits_->sum_gradients(),
      smaller_leaf_splits_->sum_hessians(),
-      ptr_to_ordered_gradients_,
-      ptr_to_ordered_hessians_);
+      ptr_to_ordered_gradients_smaller_leaf_,
+      ptr_to_ordered_hessians_smaller_leaf_);
} else {
// used ordered bin
smaller_leaf_histogram_array_[feature_index].Construct(ordered_bins_[feature_index],
@@ -331,9 +363,30 @@ void SerialTreeLearner::FindBestThresholds() {
    // only has root leaf
    if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->LeafIndex() < 0) continue;
-    // construct histgroms for large leaf, we initialize larger leaf as the parent,
-    // so we can just subtract the smaller leaf's histograms
-    larger_leaf_histogram_array_[feature_index].Subtract(smaller_leaf_histogram_array_[feature_index]);
+    if (parent_leaf_histogram_array_ != nullptr) {
+      // construct histograms for the larger leaf: it was initialized as the parent,
+      // so we can just subtract the smaller leaf's histograms
+      larger_leaf_histogram_array_[feature_index].Subtract(smaller_leaf_histogram_array_[feature_index]);
+    } else {
+      if (ordered_bins_[feature_index] == nullptr) {
+        // if not using ordered bin
+        larger_leaf_histogram_array_[feature_index].Construct(larger_leaf_splits_->data_indices(),
+          larger_leaf_splits_->num_data_in_leaf(),
+          larger_leaf_splits_->sum_gradients(),
+          larger_leaf_splits_->sum_hessians(),
+          ptr_to_ordered_gradients_larger_leaf_,
+          ptr_to_ordered_hessians_larger_leaf_);
+      } else {
+        // use ordered bin
+        larger_leaf_histogram_array_[feature_index].Construct(ordered_bins_[feature_index],
+          larger_leaf_splits_->LeafIndex(),
+          larger_leaf_splits_->num_data_in_leaf(),
+          larger_leaf_splits_->sum_gradients(),
+          larger_leaf_splits_->sum_hessians(),
+          gradients_,
+          hessians_);
+      }
+    }
    // find best threshold for larger child
    larger_leaf_histogram_array_[feature_index].FindBestThreshold(&larger_leaf_splits_->BestSplitPerFeature()[feature_index]);
......
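The Subtract path above works because histograms are additive: parent = smaller child + larger child, bin by bin. A minimal sketch of the trick (the struct's field names are assumptions, not LightGBM's actual HistogramBinEntry):

  struct BinEntrySketch {
    double sum_gradients;
    double sum_hessians;
    int count;
  };

  // larger child = parent - smaller child, computed in place bin by bin
  void SubtractSketch(BinEntrySketch* parent_becomes_larger,
                      const BinEntrySketch* smaller, int num_bin) {
    for (int i = 0; i < num_bin; ++i) {
      parent_becomes_larger[i].sum_gradients -= smaller[i].sum_gradients;
      parent_becomes_larger[i].sum_hessians  -= smaller[i].sum_hessians;
      parent_becomes_larger[i].count         -= smaller[i].count;
    }
  }

Constructing only the smaller child and subtracting costs O(num_bin) per feature instead of a pass over the larger child's data; losing the parent's cached histograms to eviction is what forces the full Construct fallback above.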
@@ -3,6 +3,7 @@
 #include <LightGBM/utils/random.h>
 #include <LightGBM/utils/array_args.h>
+#include <LightGBM/utils/lru_pool.h>
 #include <LightGBM/tree_learner.h>
 #include <LightGBM/dataset.h>
@@ -122,8 +123,8 @@ protected:
  Random random_;
  /*! \brief used for sub-feature training; is_feature_used_[i] = false means feature i is not used */
  bool* is_feature_used_;
-  /*! \brief cache historical histogram to speed up */
-  FeatureHistogram** historical_histogram_array_;
+  /*! \brief pointer to histograms array of parent of current leaves */
+  FeatureHistogram* parent_leaf_histogram_array_;
  /*! \brief pointer to histograms array of smaller leaf */
  FeatureHistogram* smaller_leaf_histogram_array_;
  /*! \brief pointer to histograms array of larger leaf */
@@ -143,15 +144,25 @@ protected:
  score_t* ordered_hessians_;
  /*! \brief Pointer to ordered_gradients_, use this to avoid copying at BeforeTrain */
-  const score_t* ptr_to_ordered_gradients_;
+  const score_t* ptr_to_ordered_gradients_smaller_leaf_;
  /*! \brief Pointer to ordered_hessians_, use this to avoid copying at BeforeTrain */
-  const score_t* ptr_to_ordered_hessians_;
+  const score_t* ptr_to_ordered_hessians_smaller_leaf_;
+  /*! \brief Pointer to ordered gradients of the larger leaf */
+  const score_t* ptr_to_ordered_gradients_larger_leaf_;
+  /*! \brief Pointer to ordered hessians of the larger leaf */
+  const score_t* ptr_to_ordered_hessians_larger_leaf_;
  /*! \brief Store ordered bin */
  std::vector<OrderedBin*> ordered_bins_;
  /*! \brief True if has ordered bin */
  bool has_ordered_bin_ = false;
  /*! \brief is_data_in_leaf_[i] != 0 means i-th data is marked */
  char* is_data_in_leaf_;
+  /*! \brief max cache size (unit: GB) for historical histograms. < 0 means no limit */
+  double histogram_pool_size_;
+  /*! \brief used to cache historical histograms to speed up */
+  LRUPool<FeatureHistogram*> histogram_pool_;
};
......
@@ -170,6 +170,7 @@
    <ClInclude Include="..\include\LightGBM\utils\array_args.h" />
    <ClInclude Include="..\include\LightGBM\utils\common.h" />
    <ClInclude Include="..\include\LightGBM\utils\log.h" />
+    <ClInclude Include="..\include\LightGBM\utils\lru_pool.h" />
    <ClInclude Include="..\include\LightGBM\utils\pipeline_reader.h" />
    <ClInclude Include="..\include\LightGBM\utils\random.h" />
    <ClInclude Include="..\include\LightGBM\utils\text_reader.h" />
......
@@ -156,6 +156,9 @@
    <ClInclude Include="..\include\LightGBM\utils\threading.h">
      <Filter>include\LightGBM\utils</Filter>
    </ClInclude>
+    <ClInclude Include="..\include\LightGBM\utils\lru_pool.h">
+      <Filter>include\LightGBM\utils</Filter>
+    </ClInclude>
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\src\application\application.cpp">
......