"src/vscode:/vscode.git/clone" did not exist on "0f0eb69e3b86cfd2ca159a628654cf1f907e4cb3"
Unverified Commit d0bec9e9 authored by Guolin Ke's avatar Guolin Ke Committed by GitHub
Browse files

speed up multi-val bin subset for bagging (#2827)

* speed up multi-val bin subset for bagging

* remove the duplicated codes

* code refine

* some codes refactoring

* move `is_constant_hessian` into `TrainingShareStates`

* refine

* fix bug

* fix bug when num_groups_ < 0

* fix gpu

* fix gpu bagging

* fix gpu bug

* typo

* Update src/treelearner/serial_tree_learner.h
parent 0aa7bfee
......@@ -303,7 +303,7 @@ class Bin {
virtual void Push(int tid, data_size_t idx, uint32_t value) = 0;
virtual void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) = 0;
virtual void CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) = 0;
/*!
* \brief Get bin iterator of this bin for specific feature
* \param min_bin min_bin of current used feature
......@@ -453,23 +453,34 @@ class MultiValBin {
virtual int32_t num_bin() const = 0;
virtual void ReSize(data_size_t num_data) = 0;
virtual double num_element_per_row() const = 0;
virtual void PushOneRow(int tid, data_size_t idx, const std::vector<uint32_t>& values) = 0;
virtual void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) = 0;
virtual void PushOneRow(int tid, data_size_t idx, const std::vector<uint32_t>& values) = 0;
virtual void ReSizeForSubFeature(int num_bin, int num_feature,
double estimate_element_per_row) = 0;
virtual void CopySubrow(const MultiValBin* full_bin,
const data_size_t* used_indices,
data_size_t num_used_indices) = 0;
virtual MultiValBin* CreateLike(int num_bin, int num_feature,
virtual MultiValBin* CreateLike(data_size_t num_data, int num_bin,
int num_feature,
double estimate_element_per_row) const = 0;
virtual void CopySubFeature(const MultiValBin* full_bin,
const std::vector<int>& used_feature_index,
const std::vector<uint32_t>& lower,
const std::vector<uint32_t>& upper,
const std::vector<uint32_t>& delta) = 0;
virtual void CopySubcol(const MultiValBin* full_bin,
const std::vector<int>& used_feature_index,
const std::vector<uint32_t>& lower,
const std::vector<uint32_t>& upper,
const std::vector<uint32_t>& delta) = 0;
virtual void ReSize(data_size_t num_data, int num_bin, int num_feature,
double estimate_element_per_row) = 0;
virtual void CopySubrowAndSubcol(
const MultiValBin* full_bin, const data_size_t* used_indices,
data_size_t num_used_indices, const std::vector<int>& used_feature_index,
const std::vector<uint32_t>& lower, const std::vector<uint32_t>& upper,
const std::vector<uint32_t>& delta) = 0;
virtual void ConstructHistogram(
const data_size_t* data_indices, data_size_t start, data_size_t end,
......
......@@ -276,16 +276,22 @@ class Parser {
static Parser* CreateParser(const char* filename, bool header, int num_features, int label_idx);
};
struct TrainingTempState {
std::vector<hist_t, Common::AlignmentAllocator<hist_t, kAlignedSize>>
hist_buf;
struct TrainingShareStates {
bool is_colwise = true;
bool is_use_subcol = false;
bool is_use_subrow = false;
bool is_subrow_copied = false;
bool is_constant_hessian = true;
const data_size_t* bagging_use_indices;
data_size_t bagging_indices_cnt;
int num_bin_aligned;
bool use_subfeature;
std::unique_ptr<MultiValBin> multi_val_bin;
std::unique_ptr<MultiValBin> multi_val_bin_subfeature;
std::unique_ptr<MultiValBin> multi_val_bin_subset;
std::vector<uint32_t> hist_move_src;
std::vector<uint32_t> hist_move_dest;
std::vector<uint32_t> hist_move_size;
std::vector<hist_t, Common::AlignmentAllocator<hist_t, kAlignedSize>>
hist_buf;
void SetMultiValBin(MultiValBin* bin) {
if (bin == nullptr) {
......@@ -302,14 +308,14 @@ struct TrainingTempState {
}
hist_t* TempBuf() {
if (!use_subfeature) {
if (!is_use_subcol) {
return nullptr;
}
return hist_buf.data() + hist_buf.size() - num_bin_aligned * 2;
}
void HistMove(const hist_t* src, hist_t* dest) {
if (!use_subfeature) {
if (!is_use_subcol) {
return;
}
#pragma omp parallel for schedule(static)
......@@ -436,16 +442,16 @@ class Dataset {
}
void ReSize(data_size_t num_data);
void CopySubset(const Dataset* fullset, const data_size_t* used_indices, data_size_t num_used_indices, bool need_meta_data);
void CopySubrow(const Dataset* fullset, const data_size_t* used_indices, data_size_t num_used_indices, bool need_meta_data);
MultiValBin* GetMultiBinFromSparseFeatures() const;
MultiValBin* GetMultiBinFromAllFeatures() const;
TrainingTempState* TestMultiThreadingMethod(
score_t* gradients, score_t* hessians,
const std::vector<int8_t>& is_feature_used, bool is_constant_hessian,
bool force_colwise, bool force_rowwise, bool* is_hist_col_wise) const;
TrainingShareStates* GetShareStates(
score_t* gradients, score_t* hessians,
const std::vector<int8_t>& is_feature_used, bool is_constant_hessian,
bool force_colwise, bool force_rowwise) const;
LIGHTGBM_EXPORT void FinishLoad();
......@@ -473,23 +479,21 @@ class Dataset {
LIGHTGBM_EXPORT void CreateValid(const Dataset* dataset);
void InitTrain(const std::vector<int8_t>& is_feature_used,
bool is_colwise,
TrainingTempState* temp_state) const;
TrainingShareStates* share_state) const;
void ConstructHistograms(const std::vector<int8_t>& is_feature_used,
const data_size_t* data_indices,
data_size_t num_data, const score_t* gradients,
const score_t* hessians, score_t* ordered_gradients,
score_t* ordered_hessians, bool is_constant_hessian,
bool is_colwise, TrainingTempState* temp_state,
score_t* ordered_hessians,
TrainingShareStates* share_state,
hist_t* histogram_data) const;
void ConstructHistogramsMultiVal(const data_size_t* data_indices,
data_size_t num_data,
const score_t* gradients,
const score_t* hessians,
bool is_constant_hessian,
TrainingTempState* temp_state,
TrainingShareStates* share_state,
hist_t* histogram_data) const;
void FixHistogram(int feature_idx, double sum_gradient, double sum_hessian, hist_t* data) const;
......
......@@ -174,12 +174,12 @@ class FeatureGroup {
}
}
inline void CopySubset(const FeatureGroup* full_feature, const data_size_t* used_indices, data_size_t num_used_indices) {
inline void CopySubrow(const FeatureGroup* full_feature, const data_size_t* used_indices, data_size_t num_used_indices) {
if (!is_multi_val_) {
bin_data_->CopySubset(full_feature->bin_data_.get(), used_indices, num_used_indices);
bin_data_->CopySubrow(full_feature->bin_data_.get(), used_indices, num_used_indices);
} else {
for (int i = 0; i < num_feature_; ++i) {
multi_bin_data_[i]->CopySubset(full_feature->multi_bin_data_[i].get(), used_indices, num_used_indices);
multi_bin_data_[i]->CopySubrow(full_feature->multi_bin_data_[i].get(), used_indices, num_used_indices);
}
}
}
......
......@@ -37,7 +37,10 @@ class TreeLearner {
*/
virtual void Init(const Dataset* train_data, bool is_constant_hessian) = 0;
virtual void ResetTrainingData(const Dataset* train_data) = 0;
virtual void ResetIsConstantHessian(bool is_constant_hessian) = 0;
virtual void ResetTrainingData(const Dataset* train_data,
bool is_constant_hessian) = 0;
/*!
* \brief Reset tree configs
......@@ -52,7 +55,7 @@ class TreeLearner {
* \param is_constant_hessian True if all hessians share the same value
* \return A trained tree
*/
virtual Tree* Train(const score_t* gradients, const score_t* hessians, bool is_constant_hessian,
virtual Tree* Train(const score_t* gradients, const score_t* hessians,
const Json& forced_split_json) = 0;
/*!
......@@ -65,13 +68,13 @@ class TreeLearner {
/*!
* \brief Set bagging data
* \param subset subset of bagging
* \param used_indices Used data indices
* \param num_data Number of used data
*/
virtual void SetBaggingData(const data_size_t* used_indices,
data_size_t num_data) = 0;
virtual bool IsHistColWise() const = 0;
virtual void SetBaggingData(const Dataset* subset,
const data_size_t* used_indices,
data_size_t num_data) = 0;
/*!
* \brief Using last trained tree to predict score then adding to out_score;
......
......@@ -231,13 +231,14 @@ void GBDT::Bagging(int iter) {
Log::Debug("Re-bagging, using %d data to train", bag_data_cnt_);
// set bagging data to tree learner
if (!is_use_subset_) {
tree_learner_->SetBaggingData(bag_data_indices_.data(), bag_data_cnt_);
tree_learner_->SetBaggingData(nullptr, bag_data_indices_.data(), bag_data_cnt_);
} else {
// get subset
tmp_subset_->ReSize(bag_data_cnt_);
tmp_subset_->CopySubset(train_data_, bag_data_indices_.data(),
tmp_subset_->CopySubrow(train_data_, bag_data_indices_.data(),
bag_data_cnt_, false);
tree_learner_->ResetTrainingData(tmp_subset_.get());
tree_learner_->SetBaggingData(tmp_subset_.get(), bag_data_indices_.data(),
bag_data_cnt_);
}
}
}
......@@ -365,7 +366,7 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) {
grad = gradients_.data() + offset;
hess = hessians_.data() + offset;
}
new_tree.reset(tree_learner_->Train(grad, hess, is_constant_hessian_, forced_splits_json_));
new_tree.reset(tree_learner_->Train(grad, hess, forced_splits_json_));
}
if (new_tree->num_leaves() > 1) {
......@@ -693,8 +694,10 @@ void GBDT::ResetTrainingData(const Dataset* train_data, const ObjectiveFunction*
feature_names_ = train_data_->feature_names();
feature_infos_ = train_data_->feature_infos();
tree_learner_->ResetTrainingData(train_data);
tree_learner_->ResetTrainingData(train_data, is_constant_hessian_);
ResetBaggingConfig(config_.get(), true);
} else {
tree_learner_->ResetIsConstantHessian(is_constant_hessian_);
}
}
......@@ -750,7 +753,7 @@ void GBDT::ResetBaggingConfig(const Config* config, bool is_change_dataset) {
(static_cast<double>(bag_data_cnt_) / num_data_) / config->bagging_freq;
is_use_subset_ = false;
const int group_threshold_usesubset = 100;
if (tree_learner_->IsHistColWise() && average_bag_rate <= 0.5
if (average_bag_rate <= 0.5
&& (train_data_->num_feature_groups() < group_threshold_usesubset)) {
if (tmp_subset_ == nullptr || is_change_dataset) {
tmp_subset_.reset(new Dataset(bag_data_cnt_));
......
......@@ -125,8 +125,7 @@ class RF : public GBDT {
hess = tmp_hess_.data();
}
new_tree.reset(tree_learner_->Train(grad, hess, is_constant_hessian_,
forced_splits_json_));
new_tree.reset(tree_learner_->Train(grad, hess, forced_splits_json_));
}
if (new_tree->num_leaves() > 1) {
......
......@@ -1059,7 +1059,7 @@ int LGBM_DatasetGetSubset(
}
auto ret = std::unique_ptr<Dataset>(new Dataset(num_used_row_indices));
ret->CopyFeatureMapperFrom(full_dataset);
ret->CopySubset(full_dataset, used_row_indices, num_used_row_indices, true);
ret->CopySubrow(full_dataset, used_row_indices, num_used_row_indices, true);
*out = ret.release();
API_END();
}
......
......@@ -586,10 +586,10 @@ MultiValBin* Dataset::GetMultiBinFromAllFeatures() const {
return ret.release();
}
TrainingTempState* Dataset::TestMultiThreadingMethod(
TrainingShareStates* Dataset::GetShareStates(
score_t* gradients, score_t* hessians,
const std::vector<int8_t>& is_feature_used, bool is_constant_hessian,
bool force_colwise, bool force_rowwise, bool* is_hist_col_wise) const {
bool force_colwise, bool force_rowwise) const {
Common::FunctionTimer fun_timer("Dataset::TestMultiThreadingMethod",
global_timer);
if (force_colwise && force_rowwise) {
......@@ -598,25 +598,30 @@ TrainingTempState* Dataset::TestMultiThreadingMethod(
"the same time");
}
if (num_groups_ <= 0) {
return nullptr;
TrainingShareStates* share_state = new TrainingShareStates();
share_state->is_colwise = true;
share_state->is_constant_hessian = is_constant_hessian;
return share_state;
}
if (force_colwise) {
*is_hist_col_wise = true;
TrainingTempState* temp_state = new TrainingTempState();
temp_state->SetMultiValBin(GetMultiBinFromSparseFeatures());
return temp_state;
TrainingShareStates* share_state = new TrainingShareStates();
share_state->SetMultiValBin(GetMultiBinFromSparseFeatures());
share_state->is_colwise = true;
share_state->is_constant_hessian = is_constant_hessian;
return share_state;
} else if (force_rowwise) {
*is_hist_col_wise = false;
TrainingTempState* temp_state = new TrainingTempState();
temp_state->SetMultiValBin(GetMultiBinFromAllFeatures());
return temp_state;
TrainingShareStates* share_state = new TrainingShareStates();
share_state->SetMultiValBin(GetMultiBinFromAllFeatures());
share_state->is_colwise = false;
share_state->is_constant_hessian = is_constant_hessian;
return share_state;
} else {
std::unique_ptr<MultiValBin> sparse_bin;
std::unique_ptr<MultiValBin> all_bin;
std::unique_ptr<TrainingTempState> colwise_state;
std::unique_ptr<TrainingTempState> rowwise_state;
colwise_state.reset(new TrainingTempState());
rowwise_state.reset(new TrainingTempState());
std::unique_ptr<TrainingShareStates> colwise_state;
std::unique_ptr<TrainingShareStates> rowwise_state;
colwise_state.reset(new TrainingShareStates());
rowwise_state.reset(new TrainingShareStates());
std::chrono::duration<double, std::milli> col_wise_init_time,
row_wise_init_time;
......@@ -633,23 +638,25 @@ TrainingTempState* Dataset::TestMultiThreadingMethod(
Log::Debug(
"init for col-wise cost %f seconds, init for row-wise cost %f seconds",
col_wise_init_time * 1e-3, row_wise_init_time * 1e-3);
InitTrain(is_feature_used, true, colwise_state.get());
InitTrain(is_feature_used, false, rowwise_state.get());
colwise_state->is_colwise = true;
colwise_state->is_constant_hessian = is_constant_hessian;
InitTrain(is_feature_used, colwise_state.get());
rowwise_state->is_colwise = false;
rowwise_state->is_constant_hessian = is_constant_hessian;
InitTrain(is_feature_used, rowwise_state.get());
std::chrono::duration<double, std::milli> col_wise_time, row_wise_time;
start_time = std::chrono::steady_clock::now();
ConstructHistograms(is_feature_used, nullptr, num_data_, gradients,
hessians, gradients, hessians, is_constant_hessian,
true, colwise_state.get(), hist_data.data());
hessians, gradients, hessians, colwise_state.get(),
hist_data.data());
col_wise_time = std::chrono::steady_clock::now() - start_time;
start_time = std::chrono::steady_clock::now();
ConstructHistogramsMultiVal(nullptr, num_data_, gradients, hessians,
is_constant_hessian, rowwise_state.get(),
hist_data.data());
rowwise_state.get(), hist_data.data());
row_wise_time = std::chrono::steady_clock::now() - start_time;
Log::Debug("col-wise cost %f seconds, row-wise cost %f seconds",
col_wise_time * 1e-3, row_wise_time * 1e-3);
if (col_wise_time < row_wise_time) {
*is_hist_col_wise = true;
auto overhead_cost = row_wise_init_time + row_wise_time + col_wise_time;
Log::Warning(
"Auto-choosing col-wise multi-threading, the overhead of testing was "
......@@ -658,7 +665,6 @@ TrainingTempState* Dataset::TestMultiThreadingMethod(
overhead_cost * 1e-3);
return colwise_state.release();
} else {
*is_hist_col_wise = false;
auto overhead_cost = col_wise_init_time + row_wise_time + col_wise_time;
Log::Warning(
"Auto-choosing row-wise multi-threading, the overhead of testing was "
......@@ -765,7 +771,7 @@ void Dataset::ReSize(data_size_t num_data) {
}
}
void Dataset::CopySubset(const Dataset* fullset,
void Dataset::CopySubrow(const Dataset* fullset,
const data_size_t* used_indices,
data_size_t num_used_indices, bool need_meta_data) {
CHECK(num_used_indices == num_data_);
......@@ -773,7 +779,7 @@ void Dataset::CopySubset(const Dataset* fullset,
#pragma omp parallel for schedule(static)
for (int group = 0; group < num_groups_; ++group) {
OMP_LOOP_EX_BEGIN();
feature_groups_[group]->CopySubset(fullset->feature_groups_[group].get(),
feature_groups_[group]->CopySubrow(fullset->feature_groups_[group].get(),
used_indices, num_used_indices);
OMP_LOOP_EX_END();
}
......@@ -1037,13 +1043,13 @@ void Dataset::DumpTextFile(const char* text_filename) {
}
void Dataset::InitTrain(const std::vector<int8_t>& is_feature_used,
bool is_colwise, TrainingTempState* temp_state) const {
TrainingShareStates* share_state) const {
Common::FunctionTimer fun_time("Dataset::InitTrain", global_timer);
temp_state->use_subfeature = false;
if (temp_state->multi_val_bin == nullptr) {
share_state->is_use_subcol = false;
if (share_state->multi_val_bin == nullptr) {
return;
}
global_timer.Start("Dataset::InitTrain.Prep");
const auto multi_val_bin = share_state->multi_val_bin.get();
double sum_used_dense_ratio = 0.0;
double sum_dense_ratio = 0.0;
int num_used = 0;
......@@ -1063,7 +1069,7 @@ void Dataset::InitTrain(const std::vector<int8_t>& is_feature_used,
sum_dense_ratio += dense_rate;
++total;
}
} else if (!is_colwise) {
} else if (!share_state->is_colwise) {
bool is_group_used = false;
double dense_rate = 0;
for (int j = 0; j < feature_groups_[i]->num_feature_; ++j) {
......@@ -1081,101 +1087,124 @@ void Dataset::InitTrain(const std::vector<int8_t>& is_feature_used,
++total;
}
}
global_timer.Stop("Dataset::InitTrain.Prep");
const double k_subfeature_threshold = 0.6;
if (sum_used_dense_ratio >= sum_dense_ratio * k_subfeature_threshold) {
return;
}
temp_state->use_subfeature = true;
global_timer.Start("Dataset::InitTrain.Prep");
std::vector<uint32_t> upper_bound;
std::vector<uint32_t> lower_bound;
std::vector<uint32_t> delta;
temp_state->hist_move_src.clear();
temp_state->hist_move_dest.clear();
temp_state->hist_move_size.clear();
// only need to copy subset
if (share_state->is_use_subrow && !share_state->is_subrow_copied) {
if (share_state->multi_val_bin_subset == nullptr) {
share_state->multi_val_bin_subset.reset(multi_val_bin->CreateLike(
share_state->bagging_indices_cnt, multi_val_bin->num_bin(), total,
multi_val_bin->num_element_per_row()));
} else {
share_state->multi_val_bin_subset->ReSize(
share_state->bagging_indices_cnt, multi_val_bin->num_bin(), total,
multi_val_bin->num_element_per_row());
}
share_state->multi_val_bin_subset->CopySubrow(
multi_val_bin, share_state->bagging_use_indices,
share_state->bagging_indices_cnt);
// avoid to copy subset many times
share_state->is_subrow_copied = true;
}
} else {
share_state->is_use_subcol = true;
std::vector<uint32_t> upper_bound;
std::vector<uint32_t> lower_bound;
std::vector<uint32_t> delta;
share_state->hist_move_src.clear();
share_state->hist_move_dest.clear();
share_state->hist_move_size.clear();
int num_total_bin = 1;
int new_num_total_bin = 1;
int num_total_bin = 1;
int new_num_total_bin = 1;
for (int i = 0; i < num_groups_; ++i) {
int f_start = group_feature_start_[i];
if (feature_groups_[i]->is_multi_val_) {
for (int j = 0; j < feature_groups_[i]->num_feature_; ++j) {
const auto& bin_mapper = feature_groups_[i]->bin_mappers_[j];
int cur_num_bin = bin_mapper->num_bin();
if (bin_mapper->GetMostFreqBin() == 0) {
cur_num_bin -= 1;
for (int i = 0; i < num_groups_; ++i) {
int f_start = group_feature_start_[i];
if (feature_groups_[i]->is_multi_val_) {
for (int j = 0; j < feature_groups_[i]->num_feature_; ++j) {
const auto& bin_mapper = feature_groups_[i]->bin_mappers_[j];
int cur_num_bin = bin_mapper->num_bin();
if (bin_mapper->GetMostFreqBin() == 0) {
cur_num_bin -= 1;
}
num_total_bin += cur_num_bin;
if (is_feature_used[f_start + j]) {
new_num_total_bin += cur_num_bin;
lower_bound.push_back(num_total_bin - cur_num_bin);
upper_bound.push_back(num_total_bin);
share_state->hist_move_src.push_back(
(new_num_total_bin - cur_num_bin) * 2);
share_state->hist_move_dest.push_back((num_total_bin - cur_num_bin) *
2);
share_state->hist_move_size.push_back(cur_num_bin * 2);
delta.push_back(num_total_bin - new_num_total_bin);
}
}
} else if (!share_state->is_colwise) {
bool is_group_used = false;
for (int j = 0; j < feature_groups_[i]->num_feature_; ++j) {
if (is_feature_used[f_start + j]) {
is_group_used = true;
break;
}
}
int cur_num_bin = feature_groups_[i]->bin_offsets_.back() - 1;
num_total_bin += cur_num_bin;
if (is_feature_used[f_start + j]) {
if (is_group_used) {
new_num_total_bin += cur_num_bin;
lower_bound.push_back(num_total_bin - cur_num_bin);
upper_bound.push_back(num_total_bin);
temp_state->hist_move_src.push_back(
share_state->hist_move_src.push_back(
(new_num_total_bin - cur_num_bin) * 2);
temp_state->hist_move_dest.push_back(
(num_total_bin - cur_num_bin) * 2);
temp_state->hist_move_size.push_back(cur_num_bin * 2);
share_state->hist_move_dest.push_back((num_total_bin - cur_num_bin) *
2);
share_state->hist_move_size.push_back(cur_num_bin * 2);
delta.push_back(num_total_bin - new_num_total_bin);
}
}
} else if (!is_colwise) {
bool is_group_used = false;
for (int j = 0; j < feature_groups_[i]->num_feature_; ++j) {
if (is_feature_used[f_start + j]) {
is_group_used = true;
break;
}
}
int cur_num_bin = feature_groups_[i]->bin_offsets_.back() - 1;
num_total_bin += cur_num_bin;
if (is_group_used) {
new_num_total_bin += cur_num_bin;
lower_bound.push_back(num_total_bin - cur_num_bin);
upper_bound.push_back(num_total_bin);
temp_state->hist_move_src.push_back(
(new_num_total_bin - cur_num_bin) * 2);
temp_state->hist_move_dest.push_back((num_total_bin - cur_num_bin) * 2);
temp_state->hist_move_size.push_back(cur_num_bin * 2);
delta.push_back(num_total_bin - new_num_total_bin);
}
}
// avoid out of range
lower_bound.push_back(num_total_bin);
upper_bound.push_back(num_total_bin);
data_size_t num_data =
share_state->is_use_subrow ? share_state->bagging_indices_cnt : num_data_;
if (share_state->multi_val_bin_subset == nullptr) {
share_state->multi_val_bin_subset.reset(multi_val_bin->CreateLike(
num_data, new_num_total_bin, num_used, sum_used_dense_ratio));
} else {
share_state->multi_val_bin_subset->ReSize(num_data, new_num_total_bin,
num_used, sum_used_dense_ratio);
}
if (share_state->is_use_subrow) {
share_state->multi_val_bin_subset->CopySubrowAndSubcol(
multi_val_bin, share_state->bagging_use_indices,
share_state->bagging_indices_cnt, used_feature_index, lower_bound,
upper_bound, delta);
// may need to recopy subset
share_state->is_subrow_copied = false;
} else {
share_state->multi_val_bin_subset->CopySubcol(
multi_val_bin, used_feature_index, lower_bound, upper_bound, delta);
}
}
// avoid out of range
lower_bound.push_back(num_total_bin);
upper_bound.push_back(num_total_bin);
global_timer.Stop("Dataset::InitTrain.Prep");
global_timer.Start("Dataset::InitTrain.Resize");
if (temp_state->multi_val_bin_subfeature == nullptr) {
temp_state->multi_val_bin_subfeature.reset(
temp_state->multi_val_bin->CreateLike(new_num_total_bin, num_used,
sum_used_dense_ratio));
} else {
temp_state->multi_val_bin_subfeature->ReSizeForSubFeature(
new_num_total_bin, num_used, sum_used_dense_ratio);
}
global_timer.Stop("Dataset::InitTrain.Resize");
global_timer.Start("Dataset::InitTrain.CopySubFeature");
temp_state->multi_val_bin_subfeature->CopySubFeature(
temp_state->multi_val_bin.get(), used_feature_index, lower_bound,
upper_bound, delta);
global_timer.Stop("Dataset::InitTrain.CopySubFeature");
}
void Dataset::ConstructHistogramsMultiVal(
const data_size_t* data_indices, data_size_t num_data,
const score_t* gradients, const score_t* hessians, bool is_constant_hessian,
TrainingTempState* temp_state, hist_t* hist_data) const {
void Dataset::ConstructHistogramsMultiVal(const data_size_t* data_indices,
data_size_t num_data,
const score_t* gradients,
const score_t* hessians,
TrainingShareStates* share_state,
hist_t* hist_data) const {
Common::FunctionTimer fun_time("Dataset::ConstructHistogramsMultiVal",
global_timer);
const auto multi_val_bin = temp_state->use_subfeature
? temp_state->multi_val_bin_subfeature.get()
: temp_state->multi_val_bin.get();
const auto multi_val_bin =
(share_state->is_use_subcol || share_state->is_use_subrow)
? share_state->multi_val_bin_subset.get()
: share_state->multi_val_bin.get();
if (multi_val_bin == nullptr) {
return;
}
......@@ -1191,12 +1220,12 @@ void Dataset::ConstructHistogramsMultiVal(
&n_data_block, &data_block_size);
const size_t buf_size =
static_cast<size_t>(n_data_block - 1) * num_bin_aligned * 2;
if (temp_state->hist_buf.size() < buf_size) {
temp_state->hist_buf.resize(buf_size);
if (share_state->hist_buf.size() < buf_size) {
share_state->hist_buf.resize(buf_size);
}
auto origin_hist_data = hist_data;
if (temp_state->use_subfeature) {
hist_data = temp_state->TempBuf();
if (share_state->is_use_subcol) {
hist_data = share_state->TempBuf();
}
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
......@@ -1206,12 +1235,12 @@ void Dataset::ConstructHistogramsMultiVal(
data_size_t end = std::min(start + data_block_size, num_data);
auto data_ptr = hist_data;
if (tid > 0) {
data_ptr = temp_state->hist_buf.data() +
data_ptr = share_state->hist_buf.data() +
static_cast<size_t>(num_bin_aligned) * 2 * (tid - 1);
}
std::memset(reinterpret_cast<void*>(data_ptr), 0, num_bin * kHistEntrySize);
if (data_indices != nullptr && num_data < num_data_) {
if (!is_constant_hessian) {
if (!share_state->is_constant_hessian) {
multi_val_bin->ConstructHistogram(data_indices, start, end, gradients,
hessians, data_ptr);
} else {
......@@ -1219,7 +1248,7 @@ void Dataset::ConstructHistogramsMultiVal(
data_ptr);
}
} else {
if (!is_constant_hessian) {
if (!share_state->is_constant_hessian) {
multi_val_bin->ConstructHistogram(start, end, gradients, hessians,
data_ptr);
} else {
......@@ -1236,13 +1265,13 @@ void Dataset::ConstructHistogramsMultiVal(
int bin_block_size = num_bin;
Threading::BlockInfo<data_size_t>(num_threads, num_bin, 512, &n_bin_block,
&bin_block_size);
if (!is_constant_hessian) {
if (!share_state->is_constant_hessian) {
#pragma omp parallel for schedule(static)
for (int t = 0; t < n_bin_block; ++t) {
const int start = t * bin_block_size;
const int end = std::min(start + bin_block_size, num_bin);
for (int tid = 1; tid < n_data_block; ++tid) {
auto src_ptr = temp_state->hist_buf.data() +
auto src_ptr = share_state->hist_buf.data() +
static_cast<size_t>(num_bin_aligned) * 2 * (tid - 1);
for (int i = start * 2; i < end * 2; ++i) {
hist_data[i] += src_ptr[i];
......@@ -1255,7 +1284,7 @@ void Dataset::ConstructHistogramsMultiVal(
const int start = t * bin_block_size;
const int end = std::min(start + bin_block_size, num_bin);
for (int tid = 1; tid < n_data_block; ++tid) {
auto src_ptr = temp_state->hist_buf.data() +
auto src_ptr = share_state->hist_buf.data() +
static_cast<size_t>(num_bin_aligned) * 2 * (tid - 1);
for (int i = start * 2; i < end * 2; ++i) {
hist_data[i] += src_ptr[i];
......@@ -1268,7 +1297,7 @@ void Dataset::ConstructHistogramsMultiVal(
}
global_timer.Stop("Dataset::sparse_bin_histogram_merge");
global_timer.Start("Dataset::sparse_bin_histogram_move");
temp_state->HistMove(hist_data, origin_hist_data);
share_state->HistMove(hist_data, origin_hist_data);
global_timer.Stop("Dataset::sparse_bin_histogram_move");
}
......@@ -1276,16 +1305,14 @@ void Dataset::ConstructHistograms(
const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices,
data_size_t num_data, const score_t* gradients, const score_t* hessians,
score_t* ordered_gradients, score_t* ordered_hessians,
bool is_constant_hessian, bool is_colwise, TrainingTempState* temp_state,
hist_t* hist_data) const {
TrainingShareStates* share_state, hist_t* hist_data) const {
Common::FunctionTimer fun_timer("Dataset::ConstructHistograms", global_timer);
if (num_data < 0 || hist_data == nullptr) {
return;
}
if (!is_colwise) {
if (!share_state->is_colwise) {
return ConstructHistogramsMultiVal(data_indices, num_data, gradients,
hessians, is_constant_hessian,
temp_state, hist_data);
hessians, share_state, hist_data);
}
global_timer.Start("Dataset::Get used group");
std::vector<int> used_dense_group;
......@@ -1316,7 +1343,7 @@ void Dataset::ConstructHistograms(
auto ptr_ordered_grad = gradients;
auto ptr_ordered_hess = hessians;
if (data_indices != nullptr && num_data < num_data_) {
if (!is_constant_hessian) {
if (!share_state->is_constant_hessian) {
#pragma omp parallel for schedule(static, 512) if (num_data >= 1024)
for (data_size_t i = 0; i < num_data; ++i) {
ordered_gradients[i] = gradients[data_indices[i]];
......@@ -1330,7 +1357,7 @@ void Dataset::ConstructHistograms(
}
ptr_ordered_grad = ordered_gradients;
ptr_ordered_hess = ordered_hessians;
if (!is_constant_hessian) {
if (!share_state->is_constant_hessian) {
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
for (int gi = 0; gi < num_used_dense_group; ++gi) {
......@@ -1372,7 +1399,7 @@ void Dataset::ConstructHistograms(
OMP_THROW_EX();
}
} else {
if (!is_constant_hessian) {
if (!share_state->is_constant_hessian) {
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
for (int gi = 0; gi < num_used_dense_group; ++gi) {
......@@ -1416,8 +1443,8 @@ void Dataset::ConstructHistograms(
global_timer.Stop("Dataset::dense_bin_histogram");
if (multi_val_groud_id >= 0) {
ConstructHistogramsMultiVal(
data_indices, num_data, gradients, hessians, is_constant_hessian,
temp_state, hist_data + group_bin_boundaries_[multi_val_groud_id] * 2);
data_indices, num_data, gradients, hessians, share_state,
hist_data + group_bin_boundaries_[multi_val_groud_id] * 2);
}
}
......
......@@ -267,7 +267,7 @@ class DenseBin: public Bin {
}
}
void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override {
void CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override {
auto other_bin = dynamic_cast<const DenseBin<VAL_T>*>(full_bin);
for (int i = 0; i < num_used_indices; ++i) {
data_[i] = other_bin->data_[used_indices[i]];
......
......@@ -292,7 +292,7 @@ class Dense4bitsBin : public Bin {
}
}
void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override {
void CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override {
auto other_bin = dynamic_cast<const Dense4bitsBin*>(full_bin);
const data_size_t rest = num_used_indices & 1;
for (int i = 0; i < num_used_indices - rest; i += 2) {
......
......@@ -34,6 +34,8 @@ class MultiValDenseBin : public MultiValBin {
return num_bin_;
}
double num_element_per_row() const override { return num_feature_; }
void PushOneRow(int , data_size_t idx, const std::vector<uint32_t>& values) override {
auto start = RowPtr(idx);
for (auto i = 0; i < num_feature_; ++i) {
......@@ -48,12 +50,6 @@ class MultiValDenseBin : public MultiValBin {
return false;
}
void ReSize(data_size_t num_data) override {
if (num_data_ != num_data) {
num_data_ = num_data;
}
}
#define ACC_GH(hist, i, g, h) \
const auto ti = static_cast<int>(i) << 1; \
hist[ti] += g; \
......@@ -125,24 +121,13 @@ class MultiValDenseBin : public MultiValBin {
ConstructHistogramInner<false, false, false>(nullptr, start, end, gradients, nullptr, out);
}
void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override {
auto other_bin = dynamic_cast<const MultiValDenseBin<VAL_T>*>(full_bin);
data_.resize(num_feature_ * num_used_indices);
for (data_size_t i = 0; i < num_used_indices; ++i) {
auto j_start = RowPtr(i);
auto other_j_start = other_bin->RowPtr(used_indices[i]);
for (auto j = other_j_start;
j < other_bin->RowPtr(used_indices[i] + 1); ++j) {
data_[j - other_j_start + j_start] = other_bin->data_[j];
}
}
MultiValBin* CreateLike(data_size_t num_data, int num_bin, int num_feature, double) const override {
return new MultiValDenseBin<VAL_T>(num_data, num_bin, num_feature);
}
MultiValBin* CreateLike(int num_bin, int num_feature, double) const override {
return new MultiValDenseBin<VAL_T>(num_data_, num_bin, num_feature);
}
void ReSizeForSubFeature(int num_bin, int num_feature, double) override {
void ReSize(data_size_t num_data, int num_bin, int num_feature,
double) override {
num_data_ = num_data;
num_bin_ = num_bin;
num_feature_ = num_feature;
size_t new_size = static_cast<size_t>(num_feature_) * num_data_;
......@@ -151,35 +136,73 @@ class MultiValDenseBin : public MultiValBin {
}
}
void CopySubFeature(const MultiValBin* full_bin,
const std::vector<int>& used_feature_index,
const std::vector<uint32_t>&,
const std::vector<uint32_t>&,
const std::vector<uint32_t>& delta) override {
const auto other =
template <bool SUBROW, bool SUBCOL>
void CopyInner(const MultiValBin* full_bin, const data_size_t* used_indices,
data_size_t num_used_indices,
const std::vector<int>& used_feature_index,
const std::vector<uint32_t>& delta) {
const auto other_bin =
reinterpret_cast<const MultiValDenseBin<VAL_T>*>(full_bin);
if (SUBROW) {
CHECK(num_data_ == num_used_indices);
}
int n_block = 1;
data_size_t block_size = num_data_;
Threading::BlockInfo<data_size_t>(num_data_, 1024, &n_block, &block_size);
Threading::BlockInfo<data_size_t>(num_data_, 1024, &n_block,
&block_size);
#pragma omp parallel for schedule(static, 1)
for (int tid = 0; tid < n_block; ++tid) {
data_size_t start = tid * block_size;
data_size_t end = std::min(num_data_, start + block_size);
for (data_size_t i = start; i < end; ++i) {
const auto j_start = RowPtr(i);
const auto other_j_start = other->RowPtr(i);
const auto other_j_start =
SUBROW ? other_bin->RowPtr(used_indices[i]) : other_bin->RowPtr(i);
for (int j = 0; j < num_feature_; ++j) {
if (other->data_[other_j_start + used_feature_index[j]] > 0) {
data_[j_start + j] = static_cast<VAL_T>(
other->data_[other_j_start + used_feature_index[j]] - delta[j]);
if (SUBCOL) {
if (other_bin->data_[other_j_start + used_feature_index[j]] > 0) {
data_[j_start + j] = static_cast<VAL_T>(
other_bin->data_[other_j_start + used_feature_index[j]] -
delta[j]);
} else {
data_[j_start + j] = 0;
}
} else {
data_[j_start + j] = 0;
data_[j_start + j] =
static_cast<VAL_T>(other_bin->data_[other_j_start + j]);
}
}
}
}
}
void CopySubrow(const MultiValBin* full_bin, const data_size_t* used_indices,
data_size_t num_used_indices) override {
CopyInner<true, false>(full_bin, used_indices, num_used_indices,
std::vector<int>(), std::vector<uint32_t>());
}
// Column-subset copy: keep every row, but only the features listed in
// `used_feature_index`, rebasing bin values by `delta`. The lower/upper
// bin-boundary vectors are unused in the dense layout.
void CopySubcol(const MultiValBin* full_bin,
                const std::vector<int>& used_feature_index,
                const std::vector<uint32_t>& /*lower*/,
                const std::vector<uint32_t>& /*upper*/,
                const std::vector<uint32_t>& delta) override {
  CopyInner<false, true>(full_bin, nullptr, num_data_, used_feature_index,
                         delta);
}
// Combined row- and column-subset copy: rows from `used_indices`, columns
// from `used_feature_index` (rebased by `delta`). Boundary vectors unused.
void CopySubrowAndSubcol(const MultiValBin* full_bin,
                         const data_size_t* used_indices,
                         data_size_t num_used_indices,
                         const std::vector<int>& used_feature_index,
                         const std::vector<uint32_t>& /*lower*/,
                         const std::vector<uint32_t>& /*upper*/,
                         const std::vector<uint32_t>& delta) override {
  CopyInner<true, true>(full_bin, used_indices, num_used_indices,
                        used_feature_index, delta);
}
// Rows are fixed-width in the dense layout: row `idx` begins at
// idx * num_feature_ within data_.
inline size_t RowPtr(data_size_t idx) const {
  const size_t row = static_cast<size_t>(idx);
  return row * num_feature_;
}
......
......@@ -42,6 +42,10 @@ class MultiValSparseBin : public MultiValBin {
// Total number of bins across all features packed into this multi-value bin.
int num_bin() const override { return num_bin_; }
// Estimated average number of stored (non-zero) elements per row; used to
// pre-size buffers for the sparse layout.
double num_element_per_row() const override {
  return estimate_element_per_row_;
}
void PushOneRow(int tid, data_size_t idx,
const std::vector<uint32_t>& values) override {
const int pre_alloc_size = 50;
......@@ -102,12 +106,6 @@ class MultiValSparseBin : public MultiValBin {
bool IsSparse() override { return true; }
// Change the number of rows this sparse multi-value bin represents.
// NOTE(review): only the row count is updated; row_ptr_/data_ are left as-is
// and presumably rebuilt by a subsequent Copy*/Push call — confirm callers.
void ReSize(data_size_t num_data) override {
  if (num_data_ != num_data) {
    num_data_ = num_data;
  }
}
#define ACC_GH(hist, i, g, h) \
const auto ti = static_cast<int>(i) << 1; \
hist[ti] += g; \
......@@ -189,32 +187,15 @@ class MultiValSparseBin : public MultiValBin {
nullptr, out);
}
// Legacy row-subset copy: rebuild this sparse bin from the rows of `full_bin`
// listed in `used_indices`. Single-threaded; row_ptr_ is filled as a running
// prefix sum so row i's values occupy data_[row_ptr_[i], row_ptr_[i+1]).
void CopySubset(const Bin* full_bin, const data_size_t* used_indices,
                data_size_t num_used_indices) override {
  auto other_bin = dynamic_cast<const MultiValSparseBin<INDEX_T, VAL_T>*>(full_bin);
  row_ptr_.resize(num_data_ + 1, 0);
  // Reserve with 10% headroom over the density estimate to limit reallocs.
  INDEX_T estimate_num_data =
      static_cast<INDEX_T>(estimate_element_per_row_ * 1.1 * num_data_);
  data_.clear();
  data_.reserve(estimate_num_data);
  for (data_size_t i = 0; i < num_used_indices; ++i) {
    // Append the selected row's packed values verbatim.
    for (auto j = other_bin->row_ptr_[used_indices[i]];
         j < other_bin->row_ptr_[used_indices[i] + 1]; ++j) {
      data_.push_back(other_bin->data_[j]);
    }
    // Extend the prefix sum by the copied row's element count.
    row_ptr_[i + 1] = row_ptr_[i] + other_bin->row_ptr_[used_indices[i] + 1] -
                      other_bin->row_ptr_[used_indices[i]];
  }
}
// Factory: build an empty sparse multi-value bin with the same bin count and
// density estimate but the requested number of rows (used when materializing
// a bagging subset). The feature count is implicit in the sparse layout.
// NOTE: this span previously mixed the deleted and the replacement overload
// (two return statements); reconstructed as the post-refactor definition.
MultiValBin* CreateLike(data_size_t num_data, int num_bin, int /*num_feature*/,
                        double estimate_element_per_row) const override {
  return new MultiValSparseBin<INDEX_T, VAL_T>(num_data, num_bin,
                                               estimate_element_per_row);
}
void ReSizeForSubFeature(int num_bin, int,
double estimate_element_per_row) override {
void ReSize(data_size_t num_data, int num_bin, int,
double estimate_element_per_row) override {
num_data_ = num_data;
num_bin_ = num_bin;
estimate_element_per_row_ = estimate_element_per_row;
INDEX_T estimate_num_data =
......@@ -229,14 +210,22 @@ class MultiValSparseBin : public MultiValBin {
t_data_[i].resize(avg_num_data, 0);
}
}
if (num_data_ + 1 > static_cast<data_size_t>(row_ptr_.size())) {
row_ptr_.resize(num_data_ + 1);
}
}
void CopySubFeature(const MultiValBin* full_bin, const std::vector<int>&,
const std::vector<uint32_t>& lower,
const std::vector<uint32_t>& upper,
const std::vector<uint32_t>& delta) override {
template <bool SUBROW, bool SUBCOL>
void CopyInner(const MultiValBin* full_bin, const data_size_t* used_indices,
data_size_t num_used_indices,
const std::vector<uint32_t>& lower,
const std::vector<uint32_t>& upper,
const std::vector<uint32_t>& delta) {
const auto other =
reinterpret_cast<const MultiValSparseBin<INDEX_T, VAL_T>*>(full_bin);
if (SUBROW) {
CHECK(num_data_ == num_used_indices);
}
int n_block = 1;
data_size_t block_size = num_data_;
Threading::BlockInfo<data_size_t>(static_cast<int>(t_data_.size() + 1),
......@@ -250,20 +239,26 @@ class MultiValSparseBin : public MultiValBin {
auto& buf = (tid == 0) ? data_ : t_data_[tid - 1];
INDEX_T size = 0;
for (data_size_t i = start; i < end; ++i) {
const auto j_start = other->RowPtr(i);
const auto j_end = other->RowPtr(i + 1);
const auto j_start =
SUBROW ? other->RowPtr(used_indices[i]) : other->RowPtr(i);
const auto j_end =
SUBROW ? other->RowPtr(used_indices[i] + 1) : other->RowPtr(i + 1);
if (size + (j_end - j_start) > static_cast<INDEX_T>(buf.size())) {
buf.resize(size + (j_end - j_start) * pre_alloc_size);
}
int k = 0;
const auto pre_size = size;
for (auto j = j_start; j < j_end; ++j) {
auto val = other->data_[j];
while (val >= upper[k]) {
++k;
}
if (val >= lower[k]) {
buf[size++] = static_cast<VAL_T>(val - delta[k]);
const auto val = other->data_[j];
if (SUBCOL) {
while (val >= upper[k]) {
++k;
}
if (val >= lower[k]) {
buf[size++] = static_cast<VAL_T>(val - delta[k]);
}
} else {
buf[size++] = val;
}
}
row_ptr_[i + 1] = size - pre_size;
......@@ -273,6 +268,31 @@ class MultiValSparseBin : public MultiValBin {
MergeData(sizes.data());
}
void CopySubrow(const MultiValBin* full_bin, const data_size_t* used_indices,
data_size_t num_used_indices) override {
CopyInner<true, false>(full_bin, used_indices, num_used_indices,
std::vector<uint32_t>(), std::vector<uint32_t>(),
std::vector<uint32_t>());
}
// Column-subset copy: keep every row, retaining only values that fall inside
// the per-feature [lower, upper) bin ranges, rebased by `delta`.
void CopySubcol(const MultiValBin* full_bin,
                const std::vector<int>& /*used_feature_index*/,
                const std::vector<uint32_t>& lower,
                const std::vector<uint32_t>& upper,
                const std::vector<uint32_t>& delta) override {
  CopyInner<false, true>(full_bin, nullptr, num_data_, lower, upper, delta);
}
// Combined row- and column-subset copy: rows from `used_indices`, columns
// selected via the [lower, upper) bin ranges and rebased by `delta`.
void CopySubrowAndSubcol(const MultiValBin* full_bin,
                         const data_size_t* used_indices,
                         data_size_t num_used_indices,
                         const std::vector<int>& /*used_feature_index*/,
                         const std::vector<uint32_t>& lower,
                         const std::vector<uint32_t>& upper,
                         const std::vector<uint32_t>& delta) override {
  CopyInner<true, true>(full_bin, used_indices, num_used_indices, lower,
                        upper, delta);
}
inline INDEX_T RowPtr(data_size_t idx) const { return row_ptr_[idx]; }
MultiValSparseBin<INDEX_T, VAL_T>* Clone() override;
......
......@@ -451,7 +451,7 @@ class SparseBin: public Bin {
}
}
void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override {
void CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override {
auto other_bin = dynamic_cast<const SparseBin<VAL_T>*>(full_bin);
deltas_.clear();
vals_.clear();
......
......@@ -158,7 +158,7 @@ void GPUTreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featur
indices_future_.wait();
}
// for constant hessian, hessians are not copied except for the root node
if (!is_constant_hessian_) {
if (!share_state_->is_constant_hessian) {
hessians_future_.wait();
}
gradients_future_.wait();
......@@ -581,7 +581,7 @@ void GPUTreeLearner::BuildGPUKernels() {
// compile the GPU kernel depending if double precision is used, constant hessian is used, etc.
opts << " -D POWER_FEATURE_WORKGROUPS=" << i
<< " -D USE_CONSTANT_BUF=" << use_constants << " -D USE_DP_FLOAT=" << int(config_->gpu_use_dp)
<< " -D CONST_HESSIAN=" << int(is_constant_hessian_)
<< " -D CONST_HESSIAN=" << int(share_state_->is_constant_hessian)
<< " -cl-mad-enable -cl-no-signed-zeros -cl-fast-relaxed-math";
#if GPU_DEBUG >= 1
std::cout << "Building GPU kernels with options: " << opts.str() << std::endl;
......@@ -642,7 +642,7 @@ void GPUTreeLearner::SetupKernelArguments() {
}
for (int i = 0; i <= kMaxLogWorkgroupsPerFeature; ++i) {
// The only argument that needs to be changed later is num_data_
if (is_constant_hessian_) {
if (share_state_->is_constant_hessian) {
// hessian is passed as a parameter, but it is not available now.
// hessian will be set in BeforeTrain()
histogram_kernels_[i].set_args(*device_features_, device_feature_masks_, num_data_,
......@@ -736,20 +736,12 @@ void GPUTreeLearner::InitGPU(int platform_id, int device_id) {
}
// Train one tree on the GPU. Hessian-constness changes are now handled in
// ResetIsConstantHessian (which rebuilds the GPU kernels), so Train simply
// delegates to the serial learner, whose virtual hooks call back into the
// GPU implementation.
// NOTE: this span previously interleaved the deleted 4-argument overload and
// its kernel-recompile block with the new body; reconstructed here.
Tree* GPUTreeLearner::Train(const score_t* gradients, const score_t *hessians,
                            const Json& forced_split_json) {
  return SerialTreeLearner::Train(gradients, hessians, forced_split_json);
}
void GPUTreeLearner::ResetTrainingData(const Dataset* train_data) {
SerialTreeLearner::ResetTrainingData(train_data);
void GPUTreeLearner::ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) {
SerialTreeLearner::ResetTrainingDataInner(train_data, is_constant_hessian, reset_multi_val_bin);
num_feature_groups_ = train_data_->num_feature_groups();
// GPU memory has to been reallocated because data may have been changed
AllocateGPUMemory();
......@@ -757,6 +749,14 @@ void GPUTreeLearner::ResetTrainingData(const Dataset* train_data) {
SetupKernelArguments();
}
// Update the constant-hessian flag. The GPU kernels are compiled with
// CONST_HESSIAN baked in, so a flip requires rebuilding them; an unchanged
// flag is a no-op.
void GPUTreeLearner::ResetIsConstantHessian(bool is_constant_hessian) {
  if (is_constant_hessian == share_state_->is_constant_hessian) {
    return;  // nothing changed; keep the existing kernels
  }
  SerialTreeLearner::ResetIsConstantHessian(is_constant_hessian);
  BuildGPUKernels();
  SetupKernelArguments();
}
void GPUTreeLearner::BeforeTrain() {
#if GPU_DEBUG >= 2
printf("Copying intial full gradients and hessians to device\n");
......@@ -764,7 +764,7 @@ void GPUTreeLearner::BeforeTrain() {
// Copy initial full hessians and gradients to GPU.
// We start copying as early as possible, instead of at ConstructHistogram().
if (!use_bagging_ && num_dense_feature_groups_) {
if (!is_constant_hessian_) {
if (!share_state_->is_constant_hessian) {
hessians_future_ = queue_.enqueue_write_buffer_async(device_hessians_, 0, num_data_ * sizeof(score_t), hessians_);
} else {
// setup hessian parameters only
......@@ -792,7 +792,7 @@ void GPUTreeLearner::BeforeTrain() {
#endif
// transfer the indices to GPU
indices_future_ = boost::compute::copy_async(indices, indices + cnt, device_data_indices_->begin(), queue_);
if (!is_constant_hessian_) {
if (!share_state_->is_constant_hessian) {
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < cnt; ++i) {
ordered_hessians_[i] = hessians_[indices[i]];
......@@ -846,7 +846,7 @@ bool GPUTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int ri
#endif
indices_future_ = boost::compute::copy_async(indices + begin, indices + end, device_data_indices_->begin(), queue_);
if (!is_constant_hessian_) {
if (!share_state_->is_constant_hessian) {
#pragma omp parallel for schedule(static)
for (data_size_t i = begin; i < end; ++i) {
ordered_hessians_[i - begin] = hessians_[indices[i]];
......@@ -899,7 +899,7 @@ bool GPUTreeLearner::ConstructGPUHistogramsAsync(
}
}
// generate and copy ordered_hessians if hessians is not null
if (hessians != nullptr && !is_constant_hessian_) {
if (hessians != nullptr && !share_state_->is_constant_hessian) {
if (num_data != num_data_) {
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data; ++i) {
......@@ -976,8 +976,8 @@ void GPUTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_u
train_data_->ConstructHistograms(is_sparse_feature_used,
smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(),
gradients_, hessians_,
ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_,
is_hist_colwise_, temp_state_.get(),
ordered_gradients_.data(), ordered_hessians_.data(),
share_state_.get(),
ptr_smaller_leaf_hist_data);
// wait for GPU to finish, only if GPU is actually used
if (is_gpu_used) {
......@@ -1041,8 +1041,8 @@ void GPUTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_u
train_data_->ConstructHistograms(is_sparse_feature_used,
larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(),
gradients_, hessians_,
ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_,
is_hist_colwise_, temp_state_.get(),
ordered_gradients_.data(), ordered_hessians_.data(),
share_state_.get(),
ptr_larger_leaf_hist_data);
// wait for GPU to finish, only if GPU is actually used
if (is_gpu_used) {
......
......@@ -46,15 +46,16 @@ class GPUTreeLearner: public SerialTreeLearner {
explicit GPUTreeLearner(const Config* tree_config);
~GPUTreeLearner();
void Init(const Dataset* train_data, bool is_constant_hessian) override;
void ResetTrainingData(const Dataset* train_data) override;
void ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) override;
void ResetIsConstantHessian(bool is_constant_hessian);
Tree* Train(const score_t* gradients, const score_t *hessians,
bool is_constant_hessian, const Json& forced_split_json) override;
const Json& forced_split_json) override;
void SetBaggingData(const data_size_t* used_indices, data_size_t num_data) override {
SerialTreeLearner::SetBaggingData(used_indices, num_data);
// determine if we are using bagging before we construct the data partition
// thus we can start data movement to GPU earlier
if (used_indices != nullptr) {
void SetBaggingData(const Dataset* subset, const data_size_t* used_indices, data_size_t num_data) override {
SerialTreeLearner::SetBaggingData(subset, used_indices, num_data);
if (subset == nullptr && used_indices != nullptr) {
// determine if we are using bagging before we construct the data partition
// thus we can start data movement to GPU earlier
if (num_data != num_data_) {
use_bagging_ = true;
return;
......
......@@ -30,7 +30,6 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian
train_data_ = train_data;
num_data_ = train_data_->num_data();
num_features_ = train_data_->num_features();
is_constant_hessian_ = is_constant_hessian;
int max_cache_size = 0;
// Get the max size of pool
if (config_->histogram_pool_size <= 0) {
......@@ -62,9 +61,8 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian
ordered_gradients_.resize(num_data_);
ordered_hessians_.resize(num_data_);
GetMultiValBin(train_data_, true);
histogram_pool_.DynamicChangeSize(train_data_, is_hist_colwise_, config_, max_cache_size, config_->num_leaves);
GetShareStates(train_data_, is_constant_hessian, true);
histogram_pool_.DynamicChangeSize(train_data_, share_state_->is_colwise, config_, max_cache_size, config_->num_leaves);
Log::Info("Number of data points in the train set: %d, number of used features: %d", num_data_, num_features_);
if (CostEfficientGradientBoosting::IsEnable(config_)) {
cegb_.reset(new CostEfficientGradientBoosting(this));
......@@ -72,22 +70,28 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian
}
}
void SerialTreeLearner::GetMultiValBin(const Dataset* dataset, bool is_first_time) {
void SerialTreeLearner::GetShareStates(const Dataset* dataset,
bool is_constant_hessian,
bool is_first_time) {
if (is_first_time) {
auto used_feature = GetUsedFeatures(true);
temp_state_.reset(dataset->TestMultiThreadingMethod(
ordered_gradients_.data(), ordered_hessians_.data(), used_feature,
is_constant_hessian_, config_->force_col_wise, config_->force_row_wise, &is_hist_colwise_));
share_state_.reset(dataset->GetShareStates(
ordered_gradients_.data(), ordered_hessians_.data(), used_feature,
is_constant_hessian, config_->force_col_wise, config_->force_row_wise));
} else {
CHECK(share_state_ != nullptr);
// cannot change is_hist_col_wise during training
temp_state_.reset(dataset->TestMultiThreadingMethod(
ordered_gradients_.data(), ordered_hessians_.data(), is_feature_used_,
is_constant_hessian_, is_hist_colwise_, !is_hist_colwise_, &is_hist_colwise_));
share_state_.reset(dataset->GetShareStates(
ordered_gradients_.data(), ordered_hessians_.data(), is_feature_used_,
is_constant_hessian, share_state_->is_colwise,
!share_state_->is_colwise));
}
CHECK(share_state_ != nullptr);
}
// Todo: optimized bagging for multi-val bin
void SerialTreeLearner::ResetTrainingData(const Dataset* train_data) {
void SerialTreeLearner::ResetTrainingDataInner(const Dataset* train_data,
bool is_constant_hessian,
bool reset_multi_val_bin) {
train_data_ = train_data;
num_data_ = train_data_->num_data();
CHECK_EQ(num_features_, train_data_->num_features());
......@@ -99,7 +103,9 @@ void SerialTreeLearner::ResetTrainingData(const Dataset* train_data) {
// initialize data partition
data_partition_->ResetNumData(num_data_);
GetMultiValBin(train_data_, false);
if (reset_multi_val_bin) {
GetShareStates(train_data_, is_constant_hessian, false);
}
// initialize ordered gradients and hessians
ordered_gradients_.resize(num_data_);
......@@ -127,7 +133,7 @@ void SerialTreeLearner::ResetConfig(const Config* config) {
// at least need 2 leaves
max_cache_size = std::max(2, max_cache_size);
max_cache_size = std::min(max_cache_size, config_->num_leaves);
histogram_pool_.DynamicChangeSize(train_data_, is_hist_colwise_, config_, max_cache_size, config_->num_leaves);
histogram_pool_.DynamicChangeSize(train_data_, share_state_->is_colwise, config_, max_cache_size, config_->num_leaves);
// push split information for all leaves
best_split_per_leaf_.resize(config_->num_leaves);
......@@ -142,11 +148,10 @@ void SerialTreeLearner::ResetConfig(const Config* config) {
}
}
Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians, bool is_constant_hessian, const Json& forced_split_json) {
Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians, const Json& forced_split_json) {
Common::FunctionTimer fun_timer("SerialTreeLearner::Train", global_timer);
gradients_ = gradients;
hessians_ = hessians;
is_constant_hessian_ = is_constant_hessian;
// some initial works before training
BeforeTrain();
......@@ -286,7 +291,7 @@ void SerialTreeLearner::BeforeTrain() {
is_feature_used_[i] = 1;
}
}
train_data_->InitTrain(is_feature_used_, is_hist_colwise_, temp_state_.get());
train_data_->InitTrain(is_feature_used_, share_state_.get());
// initialize data partition
data_partition_->Init();
......@@ -369,24 +374,27 @@ void SerialTreeLearner::FindBestSplits() {
FindBestSplitsFromHistograms(is_feature_used, use_subtract);
}
void SerialTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract) {
Common::FunctionTimer fun_timer("SerialTreeLearner::ConstructHistograms", global_timer);
void SerialTreeLearner::ConstructHistograms(
const std::vector<int8_t>& is_feature_used, bool use_subtract) {
Common::FunctionTimer fun_timer("SerialTreeLearner::ConstructHistograms",
global_timer);
// construct smaller leaf
hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - kHistOffset;
hist_t* ptr_smaller_leaf_hist_data =
smaller_leaf_histogram_array_[0].RawData() - kHistOffset;
train_data_->ConstructHistograms(
is_feature_used, smaller_leaf_splits_->data_indices(),
smaller_leaf_splits_->num_data_in_leaf(), gradients_, hessians_,
ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_,
is_hist_colwise_, temp_state_.get(), ptr_smaller_leaf_hist_data);
ordered_gradients_.data(), ordered_hessians_.data(), share_state_.get(),
ptr_smaller_leaf_hist_data);
if (larger_leaf_histogram_array_ != nullptr && !use_subtract) {
// construct larger leaf
hist_t* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - kHistOffset;
hist_t* ptr_larger_leaf_hist_data =
larger_leaf_histogram_array_[0].RawData() - kHistOffset;
train_data_->ConstructHistograms(
is_feature_used, larger_leaf_splits_->data_indices(),
larger_leaf_splits_->num_data_in_leaf(), gradients_, hessians_,
ordered_gradients_.data(), ordered_hessians_.data(),
is_constant_hessian_, is_hist_colwise_, temp_state_.get(),
ordered_gradients_.data(), ordered_hessians_.data(), share_state_.get(),
ptr_larger_leaf_hist_data);
}
}
......
......@@ -48,11 +48,22 @@ class SerialTreeLearner: public TreeLearner {
void Init(const Dataset* train_data, bool is_constant_hessian) override;
void ResetTrainingData(const Dataset* train_data) override;
// Reset the training data and (via the `true` flag) rebuild the shared
// multi-val-bin state for the new dataset.
void ResetTrainingData(const Dataset* train_data,
                       bool is_constant_hessian) override {
  ResetTrainingDataInner(train_data, is_constant_hessian, true);
}
// Record whether hessians are constant this iteration in the shared training
// state. The GPU learner overrides this to also rebuild its kernels.
void ResetIsConstantHessian(bool is_constant_hessian) override {
  share_state_->is_constant_hessian = is_constant_hessian;
}
virtual void ResetTrainingDataInner(const Dataset* train_data,
bool is_constant_hessian,
bool reset_multi_val_bin);
void ResetConfig(const Config* config) override;
Tree* Train(const score_t* gradients, const score_t *hessians, bool is_constant_hessian,
Tree* Train(const score_t* gradients, const score_t *hessians,
const Json& forced_split_json) override;
Tree* FitByExistingTree(const Tree* old_tree, const score_t* gradients, const score_t* hessians) const override;
......@@ -60,8 +71,17 @@ class SerialTreeLearner: public TreeLearner {
Tree* FitByExistingTree(const Tree* old_tree, const std::vector<int>& leaf_pred,
const score_t* gradients, const score_t* hessians) override;
void SetBaggingData(const data_size_t* used_indices, data_size_t num_data) override {
data_partition_->SetUsedDataIndices(used_indices, num_data);
void SetBaggingData(const Dataset* subset, const data_size_t* used_indices, data_size_t num_data) override {
if (subset == nullptr) {
data_partition_->SetUsedDataIndices(used_indices, num_data);
share_state_->is_use_subrow = false;
} else {
ResetTrainingDataInner(subset, share_state_->is_constant_hessian, false);
share_state_->is_use_subrow = true;
share_state_->is_subrow_copied = false;
share_state_->bagging_use_indices = used_indices;
share_state_->bagging_indices_cnt = num_data;
}
}
void AddPredictionToScore(const Tree* tree,
......@@ -84,8 +104,6 @@ class SerialTreeLearner: public TreeLearner {
void RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj, std::function<double(const label_t*, int)> residual_getter,
data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const override;
bool IsHistColWise() const override { return is_hist_colwise_; }
protected:
void ComputeBestSplitForFeature(FeatureHistogram* histogram_array_,
int feature_index, int real_fidx,
......@@ -93,7 +111,7 @@ class SerialTreeLearner: public TreeLearner {
const LeafSplits* leaf_splits,
SplitInfo* best_split);
void GetMultiValBin(const Dataset* dataset, bool is_first_time);
void GetShareStates(const Dataset* dataset, bool is_constant_hessian, bool is_first_time);
virtual std::vector<int8_t> GetUsedFeatures(bool is_tree_level);
/*!
......@@ -182,17 +200,11 @@ class SerialTreeLearner: public TreeLearner {
/*! \brief hessians of current iteration, ordered for cache optimized */
std::vector<score_t, Common::AlignmentAllocator<score_t, kAlignedSize>> ordered_hessians_;
#endif
/*! \brief is_data_in_leaf_[i] != 0 means i-th data is marked */
std::vector<char, Common::AlignmentAllocator<char, kAlignedSize>> is_data_in_leaf_;
/*! \brief used to cache historical histogram to speed up*/
HistogramPool histogram_pool_;
/*! \brief config of tree learner*/
const Config* config_;
std::vector<int> ordered_bin_indices_;
bool is_constant_hessian_;
std::unique_ptr<TrainingTempState> temp_state_;
bool is_hist_colwise_;
std::unique_ptr<TrainingShareStates> share_state_;
std::unique_ptr<CostEfficientGradientBoosting> cegb_;
};
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment