"vscode:/vscode.git/clone" did not exist on "75e486a6fa2a02a76024b4622d7aba3e13084ad4"
Unverified Commit d0bec9e9 authored by Guolin Ke, committed by GitHub
Browse files

speed up multi-val bin subset for bagging (#2827)

* speed up multi-val bin subset for bagging

* remove duplicated code

* code refine

* some code refactoring

* move `is_constant_hessian` into `TrainingShareStates`

* refine

* fix bug

* fix bug when num_groups_ <= 0

* fix gpu

* fix gpu bagging

* fix gpu bug

* typo

* Update src/treelearner/serial_tree_learner.h
parent 0aa7bfee
...@@ -303,7 +303,7 @@ class Bin { ...@@ -303,7 +303,7 @@ class Bin {
virtual void Push(int tid, data_size_t idx, uint32_t value) = 0; virtual void Push(int tid, data_size_t idx, uint32_t value) = 0;
virtual void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) = 0; virtual void CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) = 0;
/*! /*!
* \brief Get bin iterator of this bin for specific feature * \brief Get bin iterator of this bin for specific feature
* \param min_bin min_bin of current used feature * \param min_bin min_bin of current used feature
...@@ -453,24 +453,35 @@ class MultiValBin { ...@@ -453,24 +453,35 @@ class MultiValBin {
virtual int32_t num_bin() const = 0; virtual int32_t num_bin() const = 0;
virtual void ReSize(data_size_t num_data) = 0; virtual double num_element_per_row() const = 0;
virtual void PushOneRow(int tid, data_size_t idx, const std::vector<uint32_t>& values) = 0; virtual void PushOneRow(int tid, data_size_t idx, const std::vector<uint32_t>& values) = 0;
virtual void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) = 0; virtual void CopySubrow(const MultiValBin* full_bin,
const data_size_t* used_indices,
data_size_t num_used_indices) = 0;
virtual void ReSizeForSubFeature(int num_bin, int num_feature, virtual MultiValBin* CreateLike(data_size_t num_data, int num_bin,
double estimate_element_per_row) = 0; int num_feature,
virtual MultiValBin* CreateLike(int num_bin, int num_feature,
double estimate_element_per_row) const = 0; double estimate_element_per_row) const = 0;
virtual void CopySubFeature(const MultiValBin* full_bin,
virtual void CopySubcol(const MultiValBin* full_bin,
const std::vector<int>& used_feature_index, const std::vector<int>& used_feature_index,
const std::vector<uint32_t>& lower, const std::vector<uint32_t>& lower,
const std::vector<uint32_t>& upper, const std::vector<uint32_t>& upper,
const std::vector<uint32_t>& delta) = 0; const std::vector<uint32_t>& delta) = 0;
virtual void ReSize(data_size_t num_data, int num_bin, int num_feature,
double estimate_element_per_row) = 0;
virtual void CopySubrowAndSubcol(
const MultiValBin* full_bin, const data_size_t* used_indices,
data_size_t num_used_indices, const std::vector<int>& used_feature_index,
const std::vector<uint32_t>& lower, const std::vector<uint32_t>& upper,
const std::vector<uint32_t>& delta) = 0;
virtual void ConstructHistogram( virtual void ConstructHistogram(
const data_size_t* data_indices, data_size_t start, data_size_t end, const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* gradients, const score_t* hessians, const score_t* gradients, const score_t* hessians,
......
...@@ -276,16 +276,22 @@ class Parser { ...@@ -276,16 +276,22 @@ class Parser {
static Parser* CreateParser(const char* filename, bool header, int num_features, int label_idx); static Parser* CreateParser(const char* filename, bool header, int num_features, int label_idx);
}; };
struct TrainingTempState { struct TrainingShareStates {
std::vector<hist_t, Common::AlignmentAllocator<hist_t, kAlignedSize>> bool is_colwise = true;
hist_buf; bool is_use_subcol = false;
bool is_use_subrow = false;
bool is_subrow_copied = false;
bool is_constant_hessian = true;
const data_size_t* bagging_use_indices;
data_size_t bagging_indices_cnt;
int num_bin_aligned; int num_bin_aligned;
bool use_subfeature;
std::unique_ptr<MultiValBin> multi_val_bin; std::unique_ptr<MultiValBin> multi_val_bin;
std::unique_ptr<MultiValBin> multi_val_bin_subfeature; std::unique_ptr<MultiValBin> multi_val_bin_subset;
std::vector<uint32_t> hist_move_src; std::vector<uint32_t> hist_move_src;
std::vector<uint32_t> hist_move_dest; std::vector<uint32_t> hist_move_dest;
std::vector<uint32_t> hist_move_size; std::vector<uint32_t> hist_move_size;
std::vector<hist_t, Common::AlignmentAllocator<hist_t, kAlignedSize>>
hist_buf;
void SetMultiValBin(MultiValBin* bin) { void SetMultiValBin(MultiValBin* bin) {
if (bin == nullptr) { if (bin == nullptr) {
...@@ -302,14 +308,14 @@ struct TrainingTempState { ...@@ -302,14 +308,14 @@ struct TrainingTempState {
} }
hist_t* TempBuf() { hist_t* TempBuf() {
if (!use_subfeature) { if (!is_use_subcol) {
return nullptr; return nullptr;
} }
return hist_buf.data() + hist_buf.size() - num_bin_aligned * 2; return hist_buf.data() + hist_buf.size() - num_bin_aligned * 2;
} }
void HistMove(const hist_t* src, hist_t* dest) { void HistMove(const hist_t* src, hist_t* dest) {
if (!use_subfeature) { if (!is_use_subcol) {
return; return;
} }
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
...@@ -436,16 +442,16 @@ class Dataset { ...@@ -436,16 +442,16 @@ class Dataset {
} }
void ReSize(data_size_t num_data); void ReSize(data_size_t num_data);
void CopySubset(const Dataset* fullset, const data_size_t* used_indices, data_size_t num_used_indices, bool need_meta_data); void CopySubrow(const Dataset* fullset, const data_size_t* used_indices, data_size_t num_used_indices, bool need_meta_data);
MultiValBin* GetMultiBinFromSparseFeatures() const; MultiValBin* GetMultiBinFromSparseFeatures() const;
MultiValBin* GetMultiBinFromAllFeatures() const; MultiValBin* GetMultiBinFromAllFeatures() const;
TrainingTempState* TestMultiThreadingMethod( TrainingShareStates* GetShareStates(
score_t* gradients, score_t* hessians, score_t* gradients, score_t* hessians,
const std::vector<int8_t>& is_feature_used, bool is_constant_hessian, const std::vector<int8_t>& is_feature_used, bool is_constant_hessian,
bool force_colwise, bool force_rowwise, bool* is_hist_col_wise) const; bool force_colwise, bool force_rowwise) const;
LIGHTGBM_EXPORT void FinishLoad(); LIGHTGBM_EXPORT void FinishLoad();
...@@ -473,23 +479,21 @@ class Dataset { ...@@ -473,23 +479,21 @@ class Dataset {
LIGHTGBM_EXPORT void CreateValid(const Dataset* dataset); LIGHTGBM_EXPORT void CreateValid(const Dataset* dataset);
void InitTrain(const std::vector<int8_t>& is_feature_used, void InitTrain(const std::vector<int8_t>& is_feature_used,
bool is_colwise, TrainingShareStates* share_state) const;
TrainingTempState* temp_state) const;
void ConstructHistograms(const std::vector<int8_t>& is_feature_used, void ConstructHistograms(const std::vector<int8_t>& is_feature_used,
const data_size_t* data_indices, const data_size_t* data_indices,
data_size_t num_data, const score_t* gradients, data_size_t num_data, const score_t* gradients,
const score_t* hessians, score_t* ordered_gradients, const score_t* hessians, score_t* ordered_gradients,
score_t* ordered_hessians, bool is_constant_hessian, score_t* ordered_hessians,
bool is_colwise, TrainingTempState* temp_state, TrainingShareStates* share_state,
hist_t* histogram_data) const; hist_t* histogram_data) const;
void ConstructHistogramsMultiVal(const data_size_t* data_indices, void ConstructHistogramsMultiVal(const data_size_t* data_indices,
data_size_t num_data, data_size_t num_data,
const score_t* gradients, const score_t* gradients,
const score_t* hessians, const score_t* hessians,
bool is_constant_hessian, TrainingShareStates* share_state,
TrainingTempState* temp_state,
hist_t* histogram_data) const; hist_t* histogram_data) const;
void FixHistogram(int feature_idx, double sum_gradient, double sum_hessian, hist_t* data) const; void FixHistogram(int feature_idx, double sum_gradient, double sum_hessian, hist_t* data) const;
......
...@@ -174,12 +174,12 @@ class FeatureGroup { ...@@ -174,12 +174,12 @@ class FeatureGroup {
} }
} }
inline void CopySubset(const FeatureGroup* full_feature, const data_size_t* used_indices, data_size_t num_used_indices) { inline void CopySubrow(const FeatureGroup* full_feature, const data_size_t* used_indices, data_size_t num_used_indices) {
if (!is_multi_val_) { if (!is_multi_val_) {
bin_data_->CopySubset(full_feature->bin_data_.get(), used_indices, num_used_indices); bin_data_->CopySubrow(full_feature->bin_data_.get(), used_indices, num_used_indices);
} else { } else {
for (int i = 0; i < num_feature_; ++i) { for (int i = 0; i < num_feature_; ++i) {
multi_bin_data_[i]->CopySubset(full_feature->multi_bin_data_[i].get(), used_indices, num_used_indices); multi_bin_data_[i]->CopySubrow(full_feature->multi_bin_data_[i].get(), used_indices, num_used_indices);
} }
} }
} }
......
...@@ -37,7 +37,10 @@ class TreeLearner { ...@@ -37,7 +37,10 @@ class TreeLearner {
*/ */
virtual void Init(const Dataset* train_data, bool is_constant_hessian) = 0; virtual void Init(const Dataset* train_data, bool is_constant_hessian) = 0;
virtual void ResetTrainingData(const Dataset* train_data) = 0; virtual void ResetIsConstantHessian(bool is_constant_hessian) = 0;
virtual void ResetTrainingData(const Dataset* train_data,
bool is_constant_hessian) = 0;
/*! /*!
* \brief Reset tree configs * \brief Reset tree configs
...@@ -52,7 +55,7 @@ class TreeLearner { ...@@ -52,7 +55,7 @@ class TreeLearner {
* \param is_constant_hessian True if all hessians share the same value * \param is_constant_hessian True if all hessians share the same value
* \return A trained tree * \return A trained tree
*/ */
virtual Tree* Train(const score_t* gradients, const score_t* hessians, bool is_constant_hessian, virtual Tree* Train(const score_t* gradients, const score_t* hessians,
const Json& forced_split_json) = 0; const Json& forced_split_json) = 0;
/*! /*!
...@@ -65,14 +68,14 @@ class TreeLearner { ...@@ -65,14 +68,14 @@ class TreeLearner {
/*! /*!
* \brief Set bagging data * \brief Set bagging data
* \param subset subset of bagging
* \param used_indices Used data indices * \param used_indices Used data indices
* \param num_data Number of used data * \param num_data Number of used data
*/ */
virtual void SetBaggingData(const data_size_t* used_indices, virtual void SetBaggingData(const Dataset* subset,
const data_size_t* used_indices,
data_size_t num_data) = 0; data_size_t num_data) = 0;
virtual bool IsHistColWise() const = 0;
/*! /*!
* \brief Using last trained tree to predict score then adding to out_score; * \brief Using last trained tree to predict score then adding to out_score;
* \param out_score output score * \param out_score output score
......
...@@ -231,13 +231,14 @@ void GBDT::Bagging(int iter) { ...@@ -231,13 +231,14 @@ void GBDT::Bagging(int iter) {
Log::Debug("Re-bagging, using %d data to train", bag_data_cnt_); Log::Debug("Re-bagging, using %d data to train", bag_data_cnt_);
// set bagging data to tree learner // set bagging data to tree learner
if (!is_use_subset_) { if (!is_use_subset_) {
tree_learner_->SetBaggingData(bag_data_indices_.data(), bag_data_cnt_); tree_learner_->SetBaggingData(nullptr, bag_data_indices_.data(), bag_data_cnt_);
} else { } else {
// get subset // get subset
tmp_subset_->ReSize(bag_data_cnt_); tmp_subset_->ReSize(bag_data_cnt_);
tmp_subset_->CopySubset(train_data_, bag_data_indices_.data(), tmp_subset_->CopySubrow(train_data_, bag_data_indices_.data(),
bag_data_cnt_, false); bag_data_cnt_, false);
tree_learner_->ResetTrainingData(tmp_subset_.get()); tree_learner_->SetBaggingData(tmp_subset_.get(), bag_data_indices_.data(),
bag_data_cnt_);
} }
} }
} }
...@@ -365,7 +366,7 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { ...@@ -365,7 +366,7 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) {
grad = gradients_.data() + offset; grad = gradients_.data() + offset;
hess = hessians_.data() + offset; hess = hessians_.data() + offset;
} }
new_tree.reset(tree_learner_->Train(grad, hess, is_constant_hessian_, forced_splits_json_)); new_tree.reset(tree_learner_->Train(grad, hess, forced_splits_json_));
} }
if (new_tree->num_leaves() > 1) { if (new_tree->num_leaves() > 1) {
...@@ -693,8 +694,10 @@ void GBDT::ResetTrainingData(const Dataset* train_data, const ObjectiveFunction* ...@@ -693,8 +694,10 @@ void GBDT::ResetTrainingData(const Dataset* train_data, const ObjectiveFunction*
feature_names_ = train_data_->feature_names(); feature_names_ = train_data_->feature_names();
feature_infos_ = train_data_->feature_infos(); feature_infos_ = train_data_->feature_infos();
tree_learner_->ResetTrainingData(train_data); tree_learner_->ResetTrainingData(train_data, is_constant_hessian_);
ResetBaggingConfig(config_.get(), true); ResetBaggingConfig(config_.get(), true);
} else {
tree_learner_->ResetIsConstantHessian(is_constant_hessian_);
} }
} }
...@@ -750,7 +753,7 @@ void GBDT::ResetBaggingConfig(const Config* config, bool is_change_dataset) { ...@@ -750,7 +753,7 @@ void GBDT::ResetBaggingConfig(const Config* config, bool is_change_dataset) {
(static_cast<double>(bag_data_cnt_) / num_data_) / config->bagging_freq; (static_cast<double>(bag_data_cnt_) / num_data_) / config->bagging_freq;
is_use_subset_ = false; is_use_subset_ = false;
const int group_threshold_usesubset = 100; const int group_threshold_usesubset = 100;
if (tree_learner_->IsHistColWise() && average_bag_rate <= 0.5 if (average_bag_rate <= 0.5
&& (train_data_->num_feature_groups() < group_threshold_usesubset)) { && (train_data_->num_feature_groups() < group_threshold_usesubset)) {
if (tmp_subset_ == nullptr || is_change_dataset) { if (tmp_subset_ == nullptr || is_change_dataset) {
tmp_subset_.reset(new Dataset(bag_data_cnt_)); tmp_subset_.reset(new Dataset(bag_data_cnt_));
......
...@@ -125,8 +125,7 @@ class RF : public GBDT { ...@@ -125,8 +125,7 @@ class RF : public GBDT {
hess = tmp_hess_.data(); hess = tmp_hess_.data();
} }
new_tree.reset(tree_learner_->Train(grad, hess, is_constant_hessian_, new_tree.reset(tree_learner_->Train(grad, hess, forced_splits_json_));
forced_splits_json_));
} }
if (new_tree->num_leaves() > 1) { if (new_tree->num_leaves() > 1) {
......
...@@ -1059,7 +1059,7 @@ int LGBM_DatasetGetSubset( ...@@ -1059,7 +1059,7 @@ int LGBM_DatasetGetSubset(
} }
auto ret = std::unique_ptr<Dataset>(new Dataset(num_used_row_indices)); auto ret = std::unique_ptr<Dataset>(new Dataset(num_used_row_indices));
ret->CopyFeatureMapperFrom(full_dataset); ret->CopyFeatureMapperFrom(full_dataset);
ret->CopySubset(full_dataset, used_row_indices, num_used_row_indices, true); ret->CopySubrow(full_dataset, used_row_indices, num_used_row_indices, true);
*out = ret.release(); *out = ret.release();
API_END(); API_END();
} }
......
...@@ -586,10 +586,10 @@ MultiValBin* Dataset::GetMultiBinFromAllFeatures() const { ...@@ -586,10 +586,10 @@ MultiValBin* Dataset::GetMultiBinFromAllFeatures() const {
return ret.release(); return ret.release();
} }
TrainingTempState* Dataset::TestMultiThreadingMethod( TrainingShareStates* Dataset::GetShareStates(
score_t* gradients, score_t* hessians, score_t* gradients, score_t* hessians,
const std::vector<int8_t>& is_feature_used, bool is_constant_hessian, const std::vector<int8_t>& is_feature_used, bool is_constant_hessian,
bool force_colwise, bool force_rowwise, bool* is_hist_col_wise) const { bool force_colwise, bool force_rowwise) const {
Common::FunctionTimer fun_timer("Dataset::TestMultiThreadingMethod", Common::FunctionTimer fun_timer("Dataset::TestMultiThreadingMethod",
global_timer); global_timer);
if (force_colwise && force_rowwise) { if (force_colwise && force_rowwise) {
...@@ -598,25 +598,30 @@ TrainingTempState* Dataset::TestMultiThreadingMethod( ...@@ -598,25 +598,30 @@ TrainingTempState* Dataset::TestMultiThreadingMethod(
"the same time"); "the same time");
} }
if (num_groups_ <= 0) { if (num_groups_ <= 0) {
return nullptr; TrainingShareStates* share_state = new TrainingShareStates();
share_state->is_colwise = true;
share_state->is_constant_hessian = is_constant_hessian;
return share_state;
} }
if (force_colwise) { if (force_colwise) {
*is_hist_col_wise = true; TrainingShareStates* share_state = new TrainingShareStates();
TrainingTempState* temp_state = new TrainingTempState(); share_state->SetMultiValBin(GetMultiBinFromSparseFeatures());
temp_state->SetMultiValBin(GetMultiBinFromSparseFeatures()); share_state->is_colwise = true;
return temp_state; share_state->is_constant_hessian = is_constant_hessian;
return share_state;
} else if (force_rowwise) { } else if (force_rowwise) {
*is_hist_col_wise = false; TrainingShareStates* share_state = new TrainingShareStates();
TrainingTempState* temp_state = new TrainingTempState(); share_state->SetMultiValBin(GetMultiBinFromAllFeatures());
temp_state->SetMultiValBin(GetMultiBinFromAllFeatures()); share_state->is_colwise = false;
return temp_state; share_state->is_constant_hessian = is_constant_hessian;
return share_state;
} else { } else {
std::unique_ptr<MultiValBin> sparse_bin; std::unique_ptr<MultiValBin> sparse_bin;
std::unique_ptr<MultiValBin> all_bin; std::unique_ptr<MultiValBin> all_bin;
std::unique_ptr<TrainingTempState> colwise_state; std::unique_ptr<TrainingShareStates> colwise_state;
std::unique_ptr<TrainingTempState> rowwise_state; std::unique_ptr<TrainingShareStates> rowwise_state;
colwise_state.reset(new TrainingTempState()); colwise_state.reset(new TrainingShareStates());
rowwise_state.reset(new TrainingTempState()); rowwise_state.reset(new TrainingShareStates());
std::chrono::duration<double, std::milli> col_wise_init_time, std::chrono::duration<double, std::milli> col_wise_init_time,
row_wise_init_time; row_wise_init_time;
...@@ -633,23 +638,25 @@ TrainingTempState* Dataset::TestMultiThreadingMethod( ...@@ -633,23 +638,25 @@ TrainingTempState* Dataset::TestMultiThreadingMethod(
Log::Debug( Log::Debug(
"init for col-wise cost %f seconds, init for row-wise cost %f seconds", "init for col-wise cost %f seconds, init for row-wise cost %f seconds",
col_wise_init_time * 1e-3, row_wise_init_time * 1e-3); col_wise_init_time * 1e-3, row_wise_init_time * 1e-3);
InitTrain(is_feature_used, true, colwise_state.get()); colwise_state->is_colwise = true;
InitTrain(is_feature_used, false, rowwise_state.get()); colwise_state->is_constant_hessian = is_constant_hessian;
InitTrain(is_feature_used, colwise_state.get());
rowwise_state->is_colwise = false;
rowwise_state->is_constant_hessian = is_constant_hessian;
InitTrain(is_feature_used, rowwise_state.get());
std::chrono::duration<double, std::milli> col_wise_time, row_wise_time; std::chrono::duration<double, std::milli> col_wise_time, row_wise_time;
start_time = std::chrono::steady_clock::now(); start_time = std::chrono::steady_clock::now();
ConstructHistograms(is_feature_used, nullptr, num_data_, gradients, ConstructHistograms(is_feature_used, nullptr, num_data_, gradients,
hessians, gradients, hessians, is_constant_hessian, hessians, gradients, hessians, colwise_state.get(),
true, colwise_state.get(), hist_data.data()); hist_data.data());
col_wise_time = std::chrono::steady_clock::now() - start_time; col_wise_time = std::chrono::steady_clock::now() - start_time;
start_time = std::chrono::steady_clock::now(); start_time = std::chrono::steady_clock::now();
ConstructHistogramsMultiVal(nullptr, num_data_, gradients, hessians, ConstructHistogramsMultiVal(nullptr, num_data_, gradients, hessians,
is_constant_hessian, rowwise_state.get(), rowwise_state.get(), hist_data.data());
hist_data.data());
row_wise_time = std::chrono::steady_clock::now() - start_time; row_wise_time = std::chrono::steady_clock::now() - start_time;
Log::Debug("col-wise cost %f seconds, row-wise cost %f seconds", Log::Debug("col-wise cost %f seconds, row-wise cost %f seconds",
col_wise_time * 1e-3, row_wise_time * 1e-3); col_wise_time * 1e-3, row_wise_time * 1e-3);
if (col_wise_time < row_wise_time) { if (col_wise_time < row_wise_time) {
*is_hist_col_wise = true;
auto overhead_cost = row_wise_init_time + row_wise_time + col_wise_time; auto overhead_cost = row_wise_init_time + row_wise_time + col_wise_time;
Log::Warning( Log::Warning(
"Auto-choosing col-wise multi-threading, the overhead of testing was " "Auto-choosing col-wise multi-threading, the overhead of testing was "
...@@ -658,7 +665,6 @@ TrainingTempState* Dataset::TestMultiThreadingMethod( ...@@ -658,7 +665,6 @@ TrainingTempState* Dataset::TestMultiThreadingMethod(
overhead_cost * 1e-3); overhead_cost * 1e-3);
return colwise_state.release(); return colwise_state.release();
} else { } else {
*is_hist_col_wise = false;
auto overhead_cost = col_wise_init_time + row_wise_time + col_wise_time; auto overhead_cost = col_wise_init_time + row_wise_time + col_wise_time;
Log::Warning( Log::Warning(
"Auto-choosing row-wise multi-threading, the overhead of testing was " "Auto-choosing row-wise multi-threading, the overhead of testing was "
...@@ -765,7 +771,7 @@ void Dataset::ReSize(data_size_t num_data) { ...@@ -765,7 +771,7 @@ void Dataset::ReSize(data_size_t num_data) {
} }
} }
void Dataset::CopySubset(const Dataset* fullset, void Dataset::CopySubrow(const Dataset* fullset,
const data_size_t* used_indices, const data_size_t* used_indices,
data_size_t num_used_indices, bool need_meta_data) { data_size_t num_used_indices, bool need_meta_data) {
CHECK(num_used_indices == num_data_); CHECK(num_used_indices == num_data_);
...@@ -773,7 +779,7 @@ void Dataset::CopySubset(const Dataset* fullset, ...@@ -773,7 +779,7 @@ void Dataset::CopySubset(const Dataset* fullset,
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (int group = 0; group < num_groups_; ++group) { for (int group = 0; group < num_groups_; ++group) {
OMP_LOOP_EX_BEGIN(); OMP_LOOP_EX_BEGIN();
feature_groups_[group]->CopySubset(fullset->feature_groups_[group].get(), feature_groups_[group]->CopySubrow(fullset->feature_groups_[group].get(),
used_indices, num_used_indices); used_indices, num_used_indices);
OMP_LOOP_EX_END(); OMP_LOOP_EX_END();
} }
...@@ -1037,13 +1043,13 @@ void Dataset::DumpTextFile(const char* text_filename) { ...@@ -1037,13 +1043,13 @@ void Dataset::DumpTextFile(const char* text_filename) {
} }
void Dataset::InitTrain(const std::vector<int8_t>& is_feature_used, void Dataset::InitTrain(const std::vector<int8_t>& is_feature_used,
bool is_colwise, TrainingTempState* temp_state) const { TrainingShareStates* share_state) const {
Common::FunctionTimer fun_time("Dataset::InitTrain", global_timer); Common::FunctionTimer fun_time("Dataset::InitTrain", global_timer);
temp_state->use_subfeature = false; share_state->is_use_subcol = false;
if (temp_state->multi_val_bin == nullptr) { if (share_state->multi_val_bin == nullptr) {
return; return;
} }
global_timer.Start("Dataset::InitTrain.Prep"); const auto multi_val_bin = share_state->multi_val_bin.get();
double sum_used_dense_ratio = 0.0; double sum_used_dense_ratio = 0.0;
double sum_dense_ratio = 0.0; double sum_dense_ratio = 0.0;
int num_used = 0; int num_used = 0;
...@@ -1063,7 +1069,7 @@ void Dataset::InitTrain(const std::vector<int8_t>& is_feature_used, ...@@ -1063,7 +1069,7 @@ void Dataset::InitTrain(const std::vector<int8_t>& is_feature_used,
sum_dense_ratio += dense_rate; sum_dense_ratio += dense_rate;
++total; ++total;
} }
} else if (!is_colwise) { } else if (!share_state->is_colwise) {
bool is_group_used = false; bool is_group_used = false;
double dense_rate = 0; double dense_rate = 0;
for (int j = 0; j < feature_groups_[i]->num_feature_; ++j) { for (int j = 0; j < feature_groups_[i]->num_feature_; ++j) {
...@@ -1081,19 +1087,33 @@ void Dataset::InitTrain(const std::vector<int8_t>& is_feature_used, ...@@ -1081,19 +1087,33 @@ void Dataset::InitTrain(const std::vector<int8_t>& is_feature_used,
++total; ++total;
} }
} }
global_timer.Stop("Dataset::InitTrain.Prep");
const double k_subfeature_threshold = 0.6; const double k_subfeature_threshold = 0.6;
if (sum_used_dense_ratio >= sum_dense_ratio * k_subfeature_threshold) { if (sum_used_dense_ratio >= sum_dense_ratio * k_subfeature_threshold) {
return; // only need to copy subset
if (share_state->is_use_subrow && !share_state->is_subrow_copied) {
if (share_state->multi_val_bin_subset == nullptr) {
share_state->multi_val_bin_subset.reset(multi_val_bin->CreateLike(
share_state->bagging_indices_cnt, multi_val_bin->num_bin(), total,
multi_val_bin->num_element_per_row()));
} else {
share_state->multi_val_bin_subset->ReSize(
share_state->bagging_indices_cnt, multi_val_bin->num_bin(), total,
multi_val_bin->num_element_per_row());
}
share_state->multi_val_bin_subset->CopySubrow(
multi_val_bin, share_state->bagging_use_indices,
share_state->bagging_indices_cnt);
// avoid to copy subset many times
share_state->is_subrow_copied = true;
} }
temp_state->use_subfeature = true; } else {
global_timer.Start("Dataset::InitTrain.Prep"); share_state->is_use_subcol = true;
std::vector<uint32_t> upper_bound; std::vector<uint32_t> upper_bound;
std::vector<uint32_t> lower_bound; std::vector<uint32_t> lower_bound;
std::vector<uint32_t> delta; std::vector<uint32_t> delta;
temp_state->hist_move_src.clear(); share_state->hist_move_src.clear();
temp_state->hist_move_dest.clear(); share_state->hist_move_dest.clear();
temp_state->hist_move_size.clear(); share_state->hist_move_size.clear();
int num_total_bin = 1; int num_total_bin = 1;
int new_num_total_bin = 1; int new_num_total_bin = 1;
...@@ -1114,15 +1134,15 @@ void Dataset::InitTrain(const std::vector<int8_t>& is_feature_used, ...@@ -1114,15 +1134,15 @@ void Dataset::InitTrain(const std::vector<int8_t>& is_feature_used,
lower_bound.push_back(num_total_bin - cur_num_bin); lower_bound.push_back(num_total_bin - cur_num_bin);
upper_bound.push_back(num_total_bin); upper_bound.push_back(num_total_bin);
temp_state->hist_move_src.push_back( share_state->hist_move_src.push_back(
(new_num_total_bin - cur_num_bin) * 2); (new_num_total_bin - cur_num_bin) * 2);
temp_state->hist_move_dest.push_back( share_state->hist_move_dest.push_back((num_total_bin - cur_num_bin) *
(num_total_bin - cur_num_bin) * 2); 2);
temp_state->hist_move_size.push_back(cur_num_bin * 2); share_state->hist_move_size.push_back(cur_num_bin * 2);
delta.push_back(num_total_bin - new_num_total_bin); delta.push_back(num_total_bin - new_num_total_bin);
} }
} }
} else if (!is_colwise) { } else if (!share_state->is_colwise) {
bool is_group_used = false; bool is_group_used = false;
for (int j = 0; j < feature_groups_[i]->num_feature_; ++j) { for (int j = 0; j < feature_groups_[i]->num_feature_; ++j) {
if (is_feature_used[f_start + j]) { if (is_feature_used[f_start + j]) {
...@@ -1138,10 +1158,11 @@ void Dataset::InitTrain(const std::vector<int8_t>& is_feature_used, ...@@ -1138,10 +1158,11 @@ void Dataset::InitTrain(const std::vector<int8_t>& is_feature_used,
lower_bound.push_back(num_total_bin - cur_num_bin); lower_bound.push_back(num_total_bin - cur_num_bin);
upper_bound.push_back(num_total_bin); upper_bound.push_back(num_total_bin);
temp_state->hist_move_src.push_back( share_state->hist_move_src.push_back(
(new_num_total_bin - cur_num_bin) * 2); (new_num_total_bin - cur_num_bin) * 2);
temp_state->hist_move_dest.push_back((num_total_bin - cur_num_bin) * 2); share_state->hist_move_dest.push_back((num_total_bin - cur_num_bin) *
temp_state->hist_move_size.push_back(cur_num_bin * 2); 2);
share_state->hist_move_size.push_back(cur_num_bin * 2);
delta.push_back(num_total_bin - new_num_total_bin); delta.push_back(num_total_bin - new_num_total_bin);
} }
} }
...@@ -1149,33 +1170,41 @@ void Dataset::InitTrain(const std::vector<int8_t>& is_feature_used, ...@@ -1149,33 +1170,41 @@ void Dataset::InitTrain(const std::vector<int8_t>& is_feature_used,
// avoid out of range // avoid out of range
lower_bound.push_back(num_total_bin); lower_bound.push_back(num_total_bin);
upper_bound.push_back(num_total_bin); upper_bound.push_back(num_total_bin);
global_timer.Stop("Dataset::InitTrain.Prep"); data_size_t num_data =
global_timer.Start("Dataset::InitTrain.Resize"); share_state->is_use_subrow ? share_state->bagging_indices_cnt : num_data_;
if (temp_state->multi_val_bin_subfeature == nullptr) { if (share_state->multi_val_bin_subset == nullptr) {
temp_state->multi_val_bin_subfeature.reset( share_state->multi_val_bin_subset.reset(multi_val_bin->CreateLike(
temp_state->multi_val_bin->CreateLike(new_num_total_bin, num_used, num_data, new_num_total_bin, num_used, sum_used_dense_ratio));
sum_used_dense_ratio));
} else { } else {
temp_state->multi_val_bin_subfeature->ReSizeForSubFeature( share_state->multi_val_bin_subset->ReSize(num_data, new_num_total_bin,
new_num_total_bin, num_used, sum_used_dense_ratio); num_used, sum_used_dense_ratio);
} }
global_timer.Stop("Dataset::InitTrain.Resize"); if (share_state->is_use_subrow) {
global_timer.Start("Dataset::InitTrain.CopySubFeature"); share_state->multi_val_bin_subset->CopySubrowAndSubcol(
temp_state->multi_val_bin_subfeature->CopySubFeature( multi_val_bin, share_state->bagging_use_indices,
temp_state->multi_val_bin.get(), used_feature_index, lower_bound, share_state->bagging_indices_cnt, used_feature_index, lower_bound,
upper_bound, delta); upper_bound, delta);
global_timer.Stop("Dataset::InitTrain.CopySubFeature"); // may need to recopy subset
share_state->is_subrow_copied = false;
} else {
share_state->multi_val_bin_subset->CopySubcol(
multi_val_bin, used_feature_index, lower_bound, upper_bound, delta);
}
}
} }
void Dataset::ConstructHistogramsMultiVal( void Dataset::ConstructHistogramsMultiVal(const data_size_t* data_indices,
const data_size_t* data_indices, data_size_t num_data, data_size_t num_data,
const score_t* gradients, const score_t* hessians, bool is_constant_hessian, const score_t* gradients,
TrainingTempState* temp_state, hist_t* hist_data) const { const score_t* hessians,
TrainingShareStates* share_state,
hist_t* hist_data) const {
Common::FunctionTimer fun_time("Dataset::ConstructHistogramsMultiVal", Common::FunctionTimer fun_time("Dataset::ConstructHistogramsMultiVal",
global_timer); global_timer);
const auto multi_val_bin = temp_state->use_subfeature const auto multi_val_bin =
? temp_state->multi_val_bin_subfeature.get() (share_state->is_use_subcol || share_state->is_use_subrow)
: temp_state->multi_val_bin.get(); ? share_state->multi_val_bin_subset.get()
: share_state->multi_val_bin.get();
if (multi_val_bin == nullptr) { if (multi_val_bin == nullptr) {
return; return;
} }
...@@ -1191,12 +1220,12 @@ void Dataset::ConstructHistogramsMultiVal( ...@@ -1191,12 +1220,12 @@ void Dataset::ConstructHistogramsMultiVal(
&n_data_block, &data_block_size); &n_data_block, &data_block_size);
const size_t buf_size = const size_t buf_size =
static_cast<size_t>(n_data_block - 1) * num_bin_aligned * 2; static_cast<size_t>(n_data_block - 1) * num_bin_aligned * 2;
if (temp_state->hist_buf.size() < buf_size) { if (share_state->hist_buf.size() < buf_size) {
temp_state->hist_buf.resize(buf_size); share_state->hist_buf.resize(buf_size);
} }
auto origin_hist_data = hist_data; auto origin_hist_data = hist_data;
if (temp_state->use_subfeature) { if (share_state->is_use_subcol) {
hist_data = temp_state->TempBuf(); hist_data = share_state->TempBuf();
} }
OMP_INIT_EX(); OMP_INIT_EX();
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
...@@ -1206,12 +1235,12 @@ void Dataset::ConstructHistogramsMultiVal( ...@@ -1206,12 +1235,12 @@ void Dataset::ConstructHistogramsMultiVal(
data_size_t end = std::min(start + data_block_size, num_data); data_size_t end = std::min(start + data_block_size, num_data);
auto data_ptr = hist_data; auto data_ptr = hist_data;
if (tid > 0) { if (tid > 0) {
data_ptr = temp_state->hist_buf.data() + data_ptr = share_state->hist_buf.data() +
static_cast<size_t>(num_bin_aligned) * 2 * (tid - 1); static_cast<size_t>(num_bin_aligned) * 2 * (tid - 1);
} }
std::memset(reinterpret_cast<void*>(data_ptr), 0, num_bin * kHistEntrySize); std::memset(reinterpret_cast<void*>(data_ptr), 0, num_bin * kHistEntrySize);
if (data_indices != nullptr && num_data < num_data_) { if (data_indices != nullptr && num_data < num_data_) {
if (!is_constant_hessian) { if (!share_state->is_constant_hessian) {
multi_val_bin->ConstructHistogram(data_indices, start, end, gradients, multi_val_bin->ConstructHistogram(data_indices, start, end, gradients,
hessians, data_ptr); hessians, data_ptr);
} else { } else {
...@@ -1219,7 +1248,7 @@ void Dataset::ConstructHistogramsMultiVal( ...@@ -1219,7 +1248,7 @@ void Dataset::ConstructHistogramsMultiVal(
data_ptr); data_ptr);
} }
} else { } else {
if (!is_constant_hessian) { if (!share_state->is_constant_hessian) {
multi_val_bin->ConstructHistogram(start, end, gradients, hessians, multi_val_bin->ConstructHistogram(start, end, gradients, hessians,
data_ptr); data_ptr);
} else { } else {
...@@ -1236,13 +1265,13 @@ void Dataset::ConstructHistogramsMultiVal( ...@@ -1236,13 +1265,13 @@ void Dataset::ConstructHistogramsMultiVal(
int bin_block_size = num_bin; int bin_block_size = num_bin;
Threading::BlockInfo<data_size_t>(num_threads, num_bin, 512, &n_bin_block, Threading::BlockInfo<data_size_t>(num_threads, num_bin, 512, &n_bin_block,
&bin_block_size); &bin_block_size);
if (!is_constant_hessian) { if (!share_state->is_constant_hessian) {
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (int t = 0; t < n_bin_block; ++t) { for (int t = 0; t < n_bin_block; ++t) {
const int start = t * bin_block_size; const int start = t * bin_block_size;
const int end = std::min(start + bin_block_size, num_bin); const int end = std::min(start + bin_block_size, num_bin);
for (int tid = 1; tid < n_data_block; ++tid) { for (int tid = 1; tid < n_data_block; ++tid) {
auto src_ptr = temp_state->hist_buf.data() + auto src_ptr = share_state->hist_buf.data() +
static_cast<size_t>(num_bin_aligned) * 2 * (tid - 1); static_cast<size_t>(num_bin_aligned) * 2 * (tid - 1);
for (int i = start * 2; i < end * 2; ++i) { for (int i = start * 2; i < end * 2; ++i) {
hist_data[i] += src_ptr[i]; hist_data[i] += src_ptr[i];
...@@ -1255,7 +1284,7 @@ void Dataset::ConstructHistogramsMultiVal( ...@@ -1255,7 +1284,7 @@ void Dataset::ConstructHistogramsMultiVal(
const int start = t * bin_block_size; const int start = t * bin_block_size;
const int end = std::min(start + bin_block_size, num_bin); const int end = std::min(start + bin_block_size, num_bin);
for (int tid = 1; tid < n_data_block; ++tid) { for (int tid = 1; tid < n_data_block; ++tid) {
auto src_ptr = temp_state->hist_buf.data() + auto src_ptr = share_state->hist_buf.data() +
static_cast<size_t>(num_bin_aligned) * 2 * (tid - 1); static_cast<size_t>(num_bin_aligned) * 2 * (tid - 1);
for (int i = start * 2; i < end * 2; ++i) { for (int i = start * 2; i < end * 2; ++i) {
hist_data[i] += src_ptr[i]; hist_data[i] += src_ptr[i];
...@@ -1268,7 +1297,7 @@ void Dataset::ConstructHistogramsMultiVal( ...@@ -1268,7 +1297,7 @@ void Dataset::ConstructHistogramsMultiVal(
} }
global_timer.Stop("Dataset::sparse_bin_histogram_merge"); global_timer.Stop("Dataset::sparse_bin_histogram_merge");
global_timer.Start("Dataset::sparse_bin_histogram_move"); global_timer.Start("Dataset::sparse_bin_histogram_move");
temp_state->HistMove(hist_data, origin_hist_data); share_state->HistMove(hist_data, origin_hist_data);
global_timer.Stop("Dataset::sparse_bin_histogram_move"); global_timer.Stop("Dataset::sparse_bin_histogram_move");
} }
...@@ -1276,16 +1305,14 @@ void Dataset::ConstructHistograms( ...@@ -1276,16 +1305,14 @@ void Dataset::ConstructHistograms(
const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices, const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices,
data_size_t num_data, const score_t* gradients, const score_t* hessians, data_size_t num_data, const score_t* gradients, const score_t* hessians,
score_t* ordered_gradients, score_t* ordered_hessians, score_t* ordered_gradients, score_t* ordered_hessians,
bool is_constant_hessian, bool is_colwise, TrainingTempState* temp_state, TrainingShareStates* share_state, hist_t* hist_data) const {
hist_t* hist_data) const {
Common::FunctionTimer fun_timer("Dataset::ConstructHistograms", global_timer); Common::FunctionTimer fun_timer("Dataset::ConstructHistograms", global_timer);
if (num_data < 0 || hist_data == nullptr) { if (num_data < 0 || hist_data == nullptr) {
return; return;
} }
if (!is_colwise) { if (!share_state->is_colwise) {
return ConstructHistogramsMultiVal(data_indices, num_data, gradients, return ConstructHistogramsMultiVal(data_indices, num_data, gradients,
hessians, is_constant_hessian, hessians, share_state, hist_data);
temp_state, hist_data);
} }
global_timer.Start("Dataset::Get used group"); global_timer.Start("Dataset::Get used group");
std::vector<int> used_dense_group; std::vector<int> used_dense_group;
...@@ -1316,7 +1343,7 @@ void Dataset::ConstructHistograms( ...@@ -1316,7 +1343,7 @@ void Dataset::ConstructHistograms(
auto ptr_ordered_grad = gradients; auto ptr_ordered_grad = gradients;
auto ptr_ordered_hess = hessians; auto ptr_ordered_hess = hessians;
if (data_indices != nullptr && num_data < num_data_) { if (data_indices != nullptr && num_data < num_data_) {
if (!is_constant_hessian) { if (!share_state->is_constant_hessian) {
#pragma omp parallel for schedule(static, 512) if (num_data >= 1024) #pragma omp parallel for schedule(static, 512) if (num_data >= 1024)
for (data_size_t i = 0; i < num_data; ++i) { for (data_size_t i = 0; i < num_data; ++i) {
ordered_gradients[i] = gradients[data_indices[i]]; ordered_gradients[i] = gradients[data_indices[i]];
...@@ -1330,7 +1357,7 @@ void Dataset::ConstructHistograms( ...@@ -1330,7 +1357,7 @@ void Dataset::ConstructHistograms(
} }
ptr_ordered_grad = ordered_gradients; ptr_ordered_grad = ordered_gradients;
ptr_ordered_hess = ordered_hessians; ptr_ordered_hess = ordered_hessians;
if (!is_constant_hessian) { if (!share_state->is_constant_hessian) {
OMP_INIT_EX(); OMP_INIT_EX();
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (int gi = 0; gi < num_used_dense_group; ++gi) { for (int gi = 0; gi < num_used_dense_group; ++gi) {
...@@ -1372,7 +1399,7 @@ void Dataset::ConstructHistograms( ...@@ -1372,7 +1399,7 @@ void Dataset::ConstructHistograms(
OMP_THROW_EX(); OMP_THROW_EX();
} }
} else { } else {
if (!is_constant_hessian) { if (!share_state->is_constant_hessian) {
OMP_INIT_EX(); OMP_INIT_EX();
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (int gi = 0; gi < num_used_dense_group; ++gi) { for (int gi = 0; gi < num_used_dense_group; ++gi) {
...@@ -1416,8 +1443,8 @@ void Dataset::ConstructHistograms( ...@@ -1416,8 +1443,8 @@ void Dataset::ConstructHistograms(
global_timer.Stop("Dataset::dense_bin_histogram"); global_timer.Stop("Dataset::dense_bin_histogram");
if (multi_val_groud_id >= 0) { if (multi_val_groud_id >= 0) {
ConstructHistogramsMultiVal( ConstructHistogramsMultiVal(
data_indices, num_data, gradients, hessians, is_constant_hessian, data_indices, num_data, gradients, hessians, share_state,
temp_state, hist_data + group_bin_boundaries_[multi_val_groud_id] * 2); hist_data + group_bin_boundaries_[multi_val_groud_id] * 2);
} }
} }
......
...@@ -267,7 +267,7 @@ class DenseBin: public Bin { ...@@ -267,7 +267,7 @@ class DenseBin: public Bin {
} }
} }
void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override { void CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override {
auto other_bin = dynamic_cast<const DenseBin<VAL_T>*>(full_bin); auto other_bin = dynamic_cast<const DenseBin<VAL_T>*>(full_bin);
for (int i = 0; i < num_used_indices; ++i) { for (int i = 0; i < num_used_indices; ++i) {
data_[i] = other_bin->data_[used_indices[i]]; data_[i] = other_bin->data_[used_indices[i]];
......
...@@ -292,7 +292,7 @@ class Dense4bitsBin : public Bin { ...@@ -292,7 +292,7 @@ class Dense4bitsBin : public Bin {
} }
} }
void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override { void CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override {
auto other_bin = dynamic_cast<const Dense4bitsBin*>(full_bin); auto other_bin = dynamic_cast<const Dense4bitsBin*>(full_bin);
const data_size_t rest = num_used_indices & 1; const data_size_t rest = num_used_indices & 1;
for (int i = 0; i < num_used_indices - rest; i += 2) { for (int i = 0; i < num_used_indices - rest; i += 2) {
......
...@@ -34,6 +34,8 @@ class MultiValDenseBin : public MultiValBin { ...@@ -34,6 +34,8 @@ class MultiValDenseBin : public MultiValBin {
return num_bin_; return num_bin_;
} }
double num_element_per_row() const override { return num_feature_; }
void PushOneRow(int , data_size_t idx, const std::vector<uint32_t>& values) override { void PushOneRow(int , data_size_t idx, const std::vector<uint32_t>& values) override {
auto start = RowPtr(idx); auto start = RowPtr(idx);
for (auto i = 0; i < num_feature_; ++i) { for (auto i = 0; i < num_feature_; ++i) {
...@@ -48,12 +50,6 @@ class MultiValDenseBin : public MultiValBin { ...@@ -48,12 +50,6 @@ class MultiValDenseBin : public MultiValBin {
return false; return false;
} }
void ReSize(data_size_t num_data) override {
if (num_data_ != num_data) {
num_data_ = num_data;
}
}
#define ACC_GH(hist, i, g, h) \ #define ACC_GH(hist, i, g, h) \
const auto ti = static_cast<int>(i) << 1; \ const auto ti = static_cast<int>(i) << 1; \
hist[ti] += g; \ hist[ti] += g; \
...@@ -125,24 +121,13 @@ class MultiValDenseBin : public MultiValBin { ...@@ -125,24 +121,13 @@ class MultiValDenseBin : public MultiValBin {
ConstructHistogramInner<false, false, false>(nullptr, start, end, gradients, nullptr, out); ConstructHistogramInner<false, false, false>(nullptr, start, end, gradients, nullptr, out);
} }
void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override { MultiValBin* CreateLike(data_size_t num_data, int num_bin, int num_feature, double) const override {
auto other_bin = dynamic_cast<const MultiValDenseBin<VAL_T>*>(full_bin); return new MultiValDenseBin<VAL_T>(num_data, num_bin, num_feature);
data_.resize(num_feature_ * num_used_indices);
for (data_size_t i = 0; i < num_used_indices; ++i) {
auto j_start = RowPtr(i);
auto other_j_start = other_bin->RowPtr(used_indices[i]);
for (auto j = other_j_start;
j < other_bin->RowPtr(used_indices[i] + 1); ++j) {
data_[j - other_j_start + j_start] = other_bin->data_[j];
}
}
}
MultiValBin* CreateLike(int num_bin, int num_feature, double) const override {
return new MultiValDenseBin<VAL_T>(num_data_, num_bin, num_feature);
} }
void ReSizeForSubFeature(int num_bin, int num_feature, double) override { void ReSize(data_size_t num_data, int num_bin, int num_feature,
double) override {
num_data_ = num_data;
num_bin_ = num_bin; num_bin_ = num_bin;
num_feature_ = num_feature; num_feature_ = num_feature;
size_t new_size = static_cast<size_t>(num_feature_) * num_data_; size_t new_size = static_cast<size_t>(num_feature_) * num_data_;
...@@ -151,35 +136,73 @@ class MultiValDenseBin : public MultiValBin { ...@@ -151,35 +136,73 @@ class MultiValDenseBin : public MultiValBin {
} }
} }
void CopySubFeature(const MultiValBin* full_bin, template <bool SUBROW, bool SUBCOL>
void CopyInner(const MultiValBin* full_bin, const data_size_t* used_indices,
data_size_t num_used_indices,
const std::vector<int>& used_feature_index, const std::vector<int>& used_feature_index,
const std::vector<uint32_t>&, const std::vector<uint32_t>& delta) {
const std::vector<uint32_t>&, const auto other_bin =
const std::vector<uint32_t>& delta) override {
const auto other =
reinterpret_cast<const MultiValDenseBin<VAL_T>*>(full_bin); reinterpret_cast<const MultiValDenseBin<VAL_T>*>(full_bin);
if (SUBROW) {
CHECK(num_data_ == num_used_indices);
}
int n_block = 1; int n_block = 1;
data_size_t block_size = num_data_; data_size_t block_size = num_data_;
Threading::BlockInfo<data_size_t>(num_data_, 1024, &n_block, &block_size); Threading::BlockInfo<data_size_t>(num_data_, 1024, &n_block,
&block_size);
#pragma omp parallel for schedule(static, 1) #pragma omp parallel for schedule(static, 1)
for (int tid = 0; tid < n_block; ++tid) { for (int tid = 0; tid < n_block; ++tid) {
data_size_t start = tid * block_size; data_size_t start = tid * block_size;
data_size_t end = std::min(num_data_, start + block_size); data_size_t end = std::min(num_data_, start + block_size);
for (data_size_t i = start; i < end; ++i) { for (data_size_t i = start; i < end; ++i) {
const auto j_start = RowPtr(i); const auto j_start = RowPtr(i);
const auto other_j_start = other->RowPtr(i); const auto other_j_start =
SUBROW ? other_bin->RowPtr(used_indices[i]) : other_bin->RowPtr(i);
for (int j = 0; j < num_feature_; ++j) { for (int j = 0; j < num_feature_; ++j) {
if (other->data_[other_j_start + used_feature_index[j]] > 0) { if (SUBCOL) {
if (other_bin->data_[other_j_start + used_feature_index[j]] > 0) {
data_[j_start + j] = static_cast<VAL_T>( data_[j_start + j] = static_cast<VAL_T>(
other->data_[other_j_start + used_feature_index[j]] - delta[j]); other_bin->data_[other_j_start + used_feature_index[j]] -
delta[j]);
} else { } else {
data_[j_start + j] = 0; data_[j_start + j] = 0;
} }
} else {
data_[j_start + j] =
static_cast<VAL_T>(other_bin->data_[other_j_start + j]);
}
} }
} }
} }
} }
void CopySubrow(const MultiValBin* full_bin, const data_size_t* used_indices,
data_size_t num_used_indices) override {
CopyInner<true, false>(full_bin, used_indices, num_used_indices,
std::vector<int>(), std::vector<uint32_t>());
}
void CopySubcol(const MultiValBin* full_bin,
const std::vector<int>& used_feature_index,
const std::vector<uint32_t>&,
const std::vector<uint32_t>&,
const std::vector<uint32_t>& delta) override {
CopyInner<false, true>(full_bin, nullptr, num_data_, used_feature_index,
delta);
}
void CopySubrowAndSubcol(const MultiValBin* full_bin,
const data_size_t* used_indices,
data_size_t num_used_indices,
const std::vector<int>& used_feature_index,
const std::vector<uint32_t>&,
const std::vector<uint32_t>&,
const std::vector<uint32_t>& delta) override {
CopyInner<true, true>(full_bin, used_indices, num_used_indices,
used_feature_index, delta);
}
inline size_t RowPtr(data_size_t idx) const { inline size_t RowPtr(data_size_t idx) const {
return static_cast<size_t>(idx) * num_feature_; return static_cast<size_t>(idx) * num_feature_;
} }
......
...@@ -42,6 +42,10 @@ class MultiValSparseBin : public MultiValBin { ...@@ -42,6 +42,10 @@ class MultiValSparseBin : public MultiValBin {
int num_bin() const override { return num_bin_; } int num_bin() const override { return num_bin_; }
double num_element_per_row() const override {
return estimate_element_per_row_;
}
void PushOneRow(int tid, data_size_t idx, void PushOneRow(int tid, data_size_t idx,
const std::vector<uint32_t>& values) override { const std::vector<uint32_t>& values) override {
const int pre_alloc_size = 50; const int pre_alloc_size = 50;
...@@ -102,12 +106,6 @@ class MultiValSparseBin : public MultiValBin { ...@@ -102,12 +106,6 @@ class MultiValSparseBin : public MultiValBin {
bool IsSparse() override { return true; } bool IsSparse() override { return true; }
void ReSize(data_size_t num_data) override {
if (num_data_ != num_data) {
num_data_ = num_data;
}
}
#define ACC_GH(hist, i, g, h) \ #define ACC_GH(hist, i, g, h) \
const auto ti = static_cast<int>(i) << 1; \ const auto ti = static_cast<int>(i) << 1; \
hist[ti] += g; \ hist[ti] += g; \
...@@ -189,32 +187,15 @@ class MultiValSparseBin : public MultiValBin { ...@@ -189,32 +187,15 @@ class MultiValSparseBin : public MultiValBin {
nullptr, out); nullptr, out);
} }
void CopySubset(const Bin* full_bin, const data_size_t* used_indices, MultiValBin* CreateLike(data_size_t num_data, int num_bin, int,
data_size_t num_used_indices) override {
auto other_bin = dynamic_cast<const MultiValSparseBin<INDEX_T, VAL_T>*>(full_bin);
row_ptr_.resize(num_data_ + 1, 0);
INDEX_T estimate_num_data =
static_cast<INDEX_T>(estimate_element_per_row_ * 1.1 * num_data_);
data_.clear();
data_.reserve(estimate_num_data);
for (data_size_t i = 0; i < num_used_indices; ++i) {
for (auto j = other_bin->row_ptr_[used_indices[i]];
j < other_bin->row_ptr_[used_indices[i] + 1]; ++j) {
data_.push_back(other_bin->data_[j]);
}
row_ptr_[i + 1] = row_ptr_[i] + other_bin->row_ptr_[used_indices[i] + 1] -
other_bin->row_ptr_[used_indices[i]];
}
}
MultiValBin* CreateLike(int num_bin, int,
double estimate_element_per_row) const override { double estimate_element_per_row) const override {
return new MultiValSparseBin<INDEX_T, VAL_T>(num_data_, num_bin, return new MultiValSparseBin<INDEX_T, VAL_T>(num_data, num_bin,
estimate_element_per_row); estimate_element_per_row);
} }
void ReSizeForSubFeature(int num_bin, int, void ReSize(data_size_t num_data, int num_bin, int,
double estimate_element_per_row) override { double estimate_element_per_row) override {
num_data_ = num_data;
num_bin_ = num_bin; num_bin_ = num_bin;
estimate_element_per_row_ = estimate_element_per_row; estimate_element_per_row_ = estimate_element_per_row;
INDEX_T estimate_num_data = INDEX_T estimate_num_data =
...@@ -229,14 +210,22 @@ class MultiValSparseBin : public MultiValBin { ...@@ -229,14 +210,22 @@ class MultiValSparseBin : public MultiValBin {
t_data_[i].resize(avg_num_data, 0); t_data_[i].resize(avg_num_data, 0);
} }
} }
if (num_data_ + 1 > static_cast<data_size_t>(row_ptr_.size())) {
row_ptr_.resize(num_data_ + 1);
}
} }
void CopySubFeature(const MultiValBin* full_bin, const std::vector<int>&, template <bool SUBROW, bool SUBCOL>
void CopyInner(const MultiValBin* full_bin, const data_size_t* used_indices,
data_size_t num_used_indices,
const std::vector<uint32_t>& lower, const std::vector<uint32_t>& lower,
const std::vector<uint32_t>& upper, const std::vector<uint32_t>& upper,
const std::vector<uint32_t>& delta) override { const std::vector<uint32_t>& delta) {
const auto other = const auto other =
reinterpret_cast<const MultiValSparseBin<INDEX_T, VAL_T>*>(full_bin); reinterpret_cast<const MultiValSparseBin<INDEX_T, VAL_T>*>(full_bin);
if (SUBROW) {
CHECK(num_data_ == num_used_indices);
}
int n_block = 1; int n_block = 1;
data_size_t block_size = num_data_; data_size_t block_size = num_data_;
Threading::BlockInfo<data_size_t>(static_cast<int>(t_data_.size() + 1), Threading::BlockInfo<data_size_t>(static_cast<int>(t_data_.size() + 1),
...@@ -250,21 +239,27 @@ class MultiValSparseBin : public MultiValBin { ...@@ -250,21 +239,27 @@ class MultiValSparseBin : public MultiValBin {
auto& buf = (tid == 0) ? data_ : t_data_[tid - 1]; auto& buf = (tid == 0) ? data_ : t_data_[tid - 1];
INDEX_T size = 0; INDEX_T size = 0;
for (data_size_t i = start; i < end; ++i) { for (data_size_t i = start; i < end; ++i) {
const auto j_start = other->RowPtr(i); const auto j_start =
const auto j_end = other->RowPtr(i + 1); SUBROW ? other->RowPtr(used_indices[i]) : other->RowPtr(i);
const auto j_end =
SUBROW ? other->RowPtr(used_indices[i] + 1) : other->RowPtr(i + 1);
if (size + (j_end - j_start) > static_cast<INDEX_T>(buf.size())) { if (size + (j_end - j_start) > static_cast<INDEX_T>(buf.size())) {
buf.resize(size + (j_end - j_start) * pre_alloc_size); buf.resize(size + (j_end - j_start) * pre_alloc_size);
} }
int k = 0; int k = 0;
const auto pre_size = size; const auto pre_size = size;
for (auto j = j_start; j < j_end; ++j) { for (auto j = j_start; j < j_end; ++j) {
auto val = other->data_[j]; const auto val = other->data_[j];
if (SUBCOL) {
while (val >= upper[k]) { while (val >= upper[k]) {
++k; ++k;
} }
if (val >= lower[k]) { if (val >= lower[k]) {
buf[size++] = static_cast<VAL_T>(val - delta[k]); buf[size++] = static_cast<VAL_T>(val - delta[k]);
} }
} else {
buf[size++] = val;
}
} }
row_ptr_[i + 1] = size - pre_size; row_ptr_[i + 1] = size - pre_size;
} }
...@@ -273,6 +268,31 @@ class MultiValSparseBin : public MultiValBin { ...@@ -273,6 +268,31 @@ class MultiValSparseBin : public MultiValBin {
MergeData(sizes.data()); MergeData(sizes.data());
} }
void CopySubrow(const MultiValBin* full_bin, const data_size_t* used_indices,
data_size_t num_used_indices) override {
CopyInner<true, false>(full_bin, used_indices, num_used_indices,
std::vector<uint32_t>(), std::vector<uint32_t>(),
std::vector<uint32_t>());
}
void CopySubcol(const MultiValBin* full_bin, const std::vector<int>&,
const std::vector<uint32_t>& lower,
const std::vector<uint32_t>& upper,
const std::vector<uint32_t>& delta) override {
CopyInner<false, true>(full_bin, nullptr, num_data_, lower, upper, delta);
}
void CopySubrowAndSubcol(const MultiValBin* full_bin,
const data_size_t* used_indices,
data_size_t num_used_indices,
const std::vector<int>&,
const std::vector<uint32_t>& lower,
const std::vector<uint32_t>& upper,
const std::vector<uint32_t>& delta) override {
CopyInner<true, true>(full_bin, used_indices, num_used_indices, lower,
upper, delta);
}
inline INDEX_T RowPtr(data_size_t idx) const { return row_ptr_[idx]; } inline INDEX_T RowPtr(data_size_t idx) const { return row_ptr_[idx]; }
MultiValSparseBin<INDEX_T, VAL_T>* Clone() override; MultiValSparseBin<INDEX_T, VAL_T>* Clone() override;
......
...@@ -451,7 +451,7 @@ class SparseBin: public Bin { ...@@ -451,7 +451,7 @@ class SparseBin: public Bin {
} }
} }
void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override { void CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override {
auto other_bin = dynamic_cast<const SparseBin<VAL_T>*>(full_bin); auto other_bin = dynamic_cast<const SparseBin<VAL_T>*>(full_bin);
deltas_.clear(); deltas_.clear();
vals_.clear(); vals_.clear();
......
...@@ -158,7 +158,7 @@ void GPUTreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featur ...@@ -158,7 +158,7 @@ void GPUTreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featur
indices_future_.wait(); indices_future_.wait();
} }
// for constant hessian, hessians are not copied except for the root node // for constant hessian, hessians are not copied except for the root node
if (!is_constant_hessian_) { if (!share_state_->is_constant_hessian) {
hessians_future_.wait(); hessians_future_.wait();
} }
gradients_future_.wait(); gradients_future_.wait();
...@@ -581,7 +581,7 @@ void GPUTreeLearner::BuildGPUKernels() { ...@@ -581,7 +581,7 @@ void GPUTreeLearner::BuildGPUKernels() {
// compile the GPU kernel depending if double precision is used, constant hessian is used, etc. // compile the GPU kernel depending if double precision is used, constant hessian is used, etc.
opts << " -D POWER_FEATURE_WORKGROUPS=" << i opts << " -D POWER_FEATURE_WORKGROUPS=" << i
<< " -D USE_CONSTANT_BUF=" << use_constants << " -D USE_DP_FLOAT=" << int(config_->gpu_use_dp) << " -D USE_CONSTANT_BUF=" << use_constants << " -D USE_DP_FLOAT=" << int(config_->gpu_use_dp)
<< " -D CONST_HESSIAN=" << int(is_constant_hessian_) << " -D CONST_HESSIAN=" << int(share_state_->is_constant_hessian)
<< " -cl-mad-enable -cl-no-signed-zeros -cl-fast-relaxed-math"; << " -cl-mad-enable -cl-no-signed-zeros -cl-fast-relaxed-math";
#if GPU_DEBUG >= 1 #if GPU_DEBUG >= 1
std::cout << "Building GPU kernels with options: " << opts.str() << std::endl; std::cout << "Building GPU kernels with options: " << opts.str() << std::endl;
...@@ -642,7 +642,7 @@ void GPUTreeLearner::SetupKernelArguments() { ...@@ -642,7 +642,7 @@ void GPUTreeLearner::SetupKernelArguments() {
} }
for (int i = 0; i <= kMaxLogWorkgroupsPerFeature; ++i) { for (int i = 0; i <= kMaxLogWorkgroupsPerFeature; ++i) {
// The only argument that needs to be changed later is num_data_ // The only argument that needs to be changed later is num_data_
if (is_constant_hessian_) { if (share_state_->is_constant_hessian) {
// hessian is passed as a parameter, but it is not available now. // hessian is passed as a parameter, but it is not available now.
// hessian will be set in BeforeTrain() // hessian will be set in BeforeTrain()
histogram_kernels_[i].set_args(*device_features_, device_feature_masks_, num_data_, histogram_kernels_[i].set_args(*device_features_, device_feature_masks_, num_data_,
...@@ -736,20 +736,12 @@ void GPUTreeLearner::InitGPU(int platform_id, int device_id) { ...@@ -736,20 +736,12 @@ void GPUTreeLearner::InitGPU(int platform_id, int device_id) {
} }
Tree* GPUTreeLearner::Train(const score_t* gradients, const score_t *hessians, Tree* GPUTreeLearner::Train(const score_t* gradients, const score_t *hessians,
bool is_constant_hessian, const Json& forced_split_json) { const Json& forced_split_json) {
// check if we need to recompile the GPU kernel (is_constant_hessian changed) return SerialTreeLearner::Train(gradients, hessians, forced_split_json);
// this should rarely occur
if (is_constant_hessian != is_constant_hessian_) {
Log::Info("Recompiling GPU kernel because hessian is %sa constant now", is_constant_hessian ? "" : "not ");
is_constant_hessian_ = is_constant_hessian;
BuildGPUKernels();
SetupKernelArguments();
}
return SerialTreeLearner::Train(gradients, hessians, is_constant_hessian, forced_split_json);
} }
void GPUTreeLearner::ResetTrainingData(const Dataset* train_data) { void GPUTreeLearner::ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) {
SerialTreeLearner::ResetTrainingData(train_data); SerialTreeLearner::ResetTrainingDataInner(train_data, is_constant_hessian, reset_multi_val_bin);
num_feature_groups_ = train_data_->num_feature_groups(); num_feature_groups_ = train_data_->num_feature_groups();
// GPU memory has to been reallocated because data may have been changed // GPU memory has to been reallocated because data may have been changed
AllocateGPUMemory(); AllocateGPUMemory();
...@@ -757,6 +749,14 @@ void GPUTreeLearner::ResetTrainingData(const Dataset* train_data) { ...@@ -757,6 +749,14 @@ void GPUTreeLearner::ResetTrainingData(const Dataset* train_data) {
SetupKernelArguments(); SetupKernelArguments();
} }
void GPUTreeLearner::ResetIsConstantHessian(bool is_constant_hessian) {
if (is_constant_hessian != share_state_->is_constant_hessian) {
SerialTreeLearner::ResetIsConstantHessian(is_constant_hessian);
BuildGPUKernels();
SetupKernelArguments();
}
}
void GPUTreeLearner::BeforeTrain() { void GPUTreeLearner::BeforeTrain() {
#if GPU_DEBUG >= 2 #if GPU_DEBUG >= 2
printf("Copying intial full gradients and hessians to device\n"); printf("Copying intial full gradients and hessians to device\n");
...@@ -764,7 +764,7 @@ void GPUTreeLearner::BeforeTrain() { ...@@ -764,7 +764,7 @@ void GPUTreeLearner::BeforeTrain() {
// Copy initial full hessians and gradients to GPU. // Copy initial full hessians and gradients to GPU.
// We start copying as early as possible, instead of at ConstructHistogram(). // We start copying as early as possible, instead of at ConstructHistogram().
if (!use_bagging_ && num_dense_feature_groups_) { if (!use_bagging_ && num_dense_feature_groups_) {
if (!is_constant_hessian_) { if (!share_state_->is_constant_hessian) {
hessians_future_ = queue_.enqueue_write_buffer_async(device_hessians_, 0, num_data_ * sizeof(score_t), hessians_); hessians_future_ = queue_.enqueue_write_buffer_async(device_hessians_, 0, num_data_ * sizeof(score_t), hessians_);
} else { } else {
// setup hessian parameters only // setup hessian parameters only
...@@ -792,7 +792,7 @@ void GPUTreeLearner::BeforeTrain() { ...@@ -792,7 +792,7 @@ void GPUTreeLearner::BeforeTrain() {
#endif #endif
// transfer the indices to GPU // transfer the indices to GPU
indices_future_ = boost::compute::copy_async(indices, indices + cnt, device_data_indices_->begin(), queue_); indices_future_ = boost::compute::copy_async(indices, indices + cnt, device_data_indices_->begin(), queue_);
if (!is_constant_hessian_) { if (!share_state_->is_constant_hessian) {
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < cnt; ++i) { for (data_size_t i = 0; i < cnt; ++i) {
ordered_hessians_[i] = hessians_[indices[i]]; ordered_hessians_[i] = hessians_[indices[i]];
...@@ -846,7 +846,7 @@ bool GPUTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int ri ...@@ -846,7 +846,7 @@ bool GPUTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int ri
#endif #endif
indices_future_ = boost::compute::copy_async(indices + begin, indices + end, device_data_indices_->begin(), queue_); indices_future_ = boost::compute::copy_async(indices + begin, indices + end, device_data_indices_->begin(), queue_);
if (!is_constant_hessian_) { if (!share_state_->is_constant_hessian) {
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (data_size_t i = begin; i < end; ++i) { for (data_size_t i = begin; i < end; ++i) {
ordered_hessians_[i - begin] = hessians_[indices[i]]; ordered_hessians_[i - begin] = hessians_[indices[i]];
...@@ -899,7 +899,7 @@ bool GPUTreeLearner::ConstructGPUHistogramsAsync( ...@@ -899,7 +899,7 @@ bool GPUTreeLearner::ConstructGPUHistogramsAsync(
} }
} }
// generate and copy ordered_hessians if hessians is not null // generate and copy ordered_hessians if hessians is not null
if (hessians != nullptr && !is_constant_hessian_) { if (hessians != nullptr && !share_state_->is_constant_hessian) {
if (num_data != num_data_) { if (num_data != num_data_) {
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data; ++i) { for (data_size_t i = 0; i < num_data; ++i) {
...@@ -976,8 +976,8 @@ void GPUTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_u ...@@ -976,8 +976,8 @@ void GPUTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_u
train_data_->ConstructHistograms(is_sparse_feature_used, train_data_->ConstructHistograms(is_sparse_feature_used,
smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(), smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(),
gradients_, hessians_, gradients_, hessians_,
ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_, ordered_gradients_.data(), ordered_hessians_.data(),
is_hist_colwise_, temp_state_.get(), share_state_.get(),
ptr_smaller_leaf_hist_data); ptr_smaller_leaf_hist_data);
// wait for GPU to finish, only if GPU is actually used // wait for GPU to finish, only if GPU is actually used
if (is_gpu_used) { if (is_gpu_used) {
...@@ -1041,8 +1041,8 @@ void GPUTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_u ...@@ -1041,8 +1041,8 @@ void GPUTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_u
train_data_->ConstructHistograms(is_sparse_feature_used, train_data_->ConstructHistograms(is_sparse_feature_used,
larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(),
gradients_, hessians_, gradients_, hessians_,
ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_, ordered_gradients_.data(), ordered_hessians_.data(),
is_hist_colwise_, temp_state_.get(), share_state_.get(),
ptr_larger_leaf_hist_data); ptr_larger_leaf_hist_data);
// wait for GPU to finish, only if GPU is actually used // wait for GPU to finish, only if GPU is actually used
if (is_gpu_used) { if (is_gpu_used) {
......
...@@ -46,15 +46,16 @@ class GPUTreeLearner: public SerialTreeLearner { ...@@ -46,15 +46,16 @@ class GPUTreeLearner: public SerialTreeLearner {
explicit GPUTreeLearner(const Config* tree_config); explicit GPUTreeLearner(const Config* tree_config);
~GPUTreeLearner(); ~GPUTreeLearner();
void Init(const Dataset* train_data, bool is_constant_hessian) override; void Init(const Dataset* train_data, bool is_constant_hessian) override;
void ResetTrainingData(const Dataset* train_data) override; void ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) override;
void ResetIsConstantHessian(bool is_constant_hessian);
Tree* Train(const score_t* gradients, const score_t *hessians, Tree* Train(const score_t* gradients, const score_t *hessians,
bool is_constant_hessian, const Json& forced_split_json) override; const Json& forced_split_json) override;
void SetBaggingData(const data_size_t* used_indices, data_size_t num_data) override { void SetBaggingData(const Dataset* subset, const data_size_t* used_indices, data_size_t num_data) override {
SerialTreeLearner::SetBaggingData(used_indices, num_data); SerialTreeLearner::SetBaggingData(subset, used_indices, num_data);
if (subset == nullptr && used_indices != nullptr) {
// determine if we are using bagging before we construct the data partition // determine if we are using bagging before we construct the data partition
// thus we can start data movement to GPU earlier // thus we can start data movement to GPU earlier
if (used_indices != nullptr) {
if (num_data != num_data_) { if (num_data != num_data_) {
use_bagging_ = true; use_bagging_ = true;
return; return;
......
...@@ -30,7 +30,6 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian ...@@ -30,7 +30,6 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian
train_data_ = train_data; train_data_ = train_data;
num_data_ = train_data_->num_data(); num_data_ = train_data_->num_data();
num_features_ = train_data_->num_features(); num_features_ = train_data_->num_features();
is_constant_hessian_ = is_constant_hessian;
int max_cache_size = 0; int max_cache_size = 0;
// Get the max size of pool // Get the max size of pool
if (config_->histogram_pool_size <= 0) { if (config_->histogram_pool_size <= 0) {
...@@ -62,9 +61,8 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian ...@@ -62,9 +61,8 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian
ordered_gradients_.resize(num_data_); ordered_gradients_.resize(num_data_);
ordered_hessians_.resize(num_data_); ordered_hessians_.resize(num_data_);
GetMultiValBin(train_data_, true); GetShareStates(train_data_, is_constant_hessian, true);
histogram_pool_.DynamicChangeSize(train_data_, share_state_->is_colwise, config_, max_cache_size, config_->num_leaves);
histogram_pool_.DynamicChangeSize(train_data_, is_hist_colwise_, config_, max_cache_size, config_->num_leaves);
Log::Info("Number of data points in the train set: %d, number of used features: %d", num_data_, num_features_); Log::Info("Number of data points in the train set: %d, number of used features: %d", num_data_, num_features_);
if (CostEfficientGradientBoosting::IsEnable(config_)) { if (CostEfficientGradientBoosting::IsEnable(config_)) {
cegb_.reset(new CostEfficientGradientBoosting(this)); cegb_.reset(new CostEfficientGradientBoosting(this));
...@@ -72,22 +70,28 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian ...@@ -72,22 +70,28 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian
} }
} }
void SerialTreeLearner::GetMultiValBin(const Dataset* dataset, bool is_first_time) { void SerialTreeLearner::GetShareStates(const Dataset* dataset,
bool is_constant_hessian,
bool is_first_time) {
if (is_first_time) { if (is_first_time) {
auto used_feature = GetUsedFeatures(true); auto used_feature = GetUsedFeatures(true);
temp_state_.reset(dataset->TestMultiThreadingMethod( share_state_.reset(dataset->GetShareStates(
ordered_gradients_.data(), ordered_hessians_.data(), used_feature, ordered_gradients_.data(), ordered_hessians_.data(), used_feature,
is_constant_hessian_, config_->force_col_wise, config_->force_row_wise, &is_hist_colwise_)); is_constant_hessian, config_->force_col_wise, config_->force_row_wise));
} else { } else {
CHECK(share_state_ != nullptr);
// cannot change is_hist_col_wise during training // cannot change is_hist_col_wise during training
temp_state_.reset(dataset->TestMultiThreadingMethod( share_state_.reset(dataset->GetShareStates(
ordered_gradients_.data(), ordered_hessians_.data(), is_feature_used_, ordered_gradients_.data(), ordered_hessians_.data(), is_feature_used_,
is_constant_hessian_, is_hist_colwise_, !is_hist_colwise_, &is_hist_colwise_)); is_constant_hessian, share_state_->is_colwise,
!share_state_->is_colwise));
} }
CHECK(share_state_ != nullptr);
} }
// Todo: optimized bagging for multi-val bin void SerialTreeLearner::ResetTrainingDataInner(const Dataset* train_data,
void SerialTreeLearner::ResetTrainingData(const Dataset* train_data) { bool is_constant_hessian,
bool reset_multi_val_bin) {
train_data_ = train_data; train_data_ = train_data;
num_data_ = train_data_->num_data(); num_data_ = train_data_->num_data();
CHECK_EQ(num_features_, train_data_->num_features()); CHECK_EQ(num_features_, train_data_->num_features());
...@@ -99,7 +103,9 @@ void SerialTreeLearner::ResetTrainingData(const Dataset* train_data) { ...@@ -99,7 +103,9 @@ void SerialTreeLearner::ResetTrainingData(const Dataset* train_data) {
// initialize data partition // initialize data partition
data_partition_->ResetNumData(num_data_); data_partition_->ResetNumData(num_data_);
GetMultiValBin(train_data_, false); if (reset_multi_val_bin) {
GetShareStates(train_data_, is_constant_hessian, false);
}
// initialize ordered gradients and hessians // initialize ordered gradients and hessians
ordered_gradients_.resize(num_data_); ordered_gradients_.resize(num_data_);
...@@ -127,7 +133,7 @@ void SerialTreeLearner::ResetConfig(const Config* config) { ...@@ -127,7 +133,7 @@ void SerialTreeLearner::ResetConfig(const Config* config) {
// at least need 2 leaves // at least need 2 leaves
max_cache_size = std::max(2, max_cache_size); max_cache_size = std::max(2, max_cache_size);
max_cache_size = std::min(max_cache_size, config_->num_leaves); max_cache_size = std::min(max_cache_size, config_->num_leaves);
histogram_pool_.DynamicChangeSize(train_data_, is_hist_colwise_, config_, max_cache_size, config_->num_leaves); histogram_pool_.DynamicChangeSize(train_data_, share_state_->is_colwise, config_, max_cache_size, config_->num_leaves);
// push split information for all leaves // push split information for all leaves
best_split_per_leaf_.resize(config_->num_leaves); best_split_per_leaf_.resize(config_->num_leaves);
...@@ -142,11 +148,10 @@ void SerialTreeLearner::ResetConfig(const Config* config) { ...@@ -142,11 +148,10 @@ void SerialTreeLearner::ResetConfig(const Config* config) {
} }
} }
Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians, bool is_constant_hessian, const Json& forced_split_json) { Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians, const Json& forced_split_json) {
Common::FunctionTimer fun_timer("SerialTreeLearner::Train", global_timer); Common::FunctionTimer fun_timer("SerialTreeLearner::Train", global_timer);
gradients_ = gradients; gradients_ = gradients;
hessians_ = hessians; hessians_ = hessians;
is_constant_hessian_ = is_constant_hessian;
// some initial works before training // some initial works before training
BeforeTrain(); BeforeTrain();
...@@ -286,7 +291,7 @@ void SerialTreeLearner::BeforeTrain() { ...@@ -286,7 +291,7 @@ void SerialTreeLearner::BeforeTrain() {
is_feature_used_[i] = 1; is_feature_used_[i] = 1;
} }
} }
train_data_->InitTrain(is_feature_used_, is_hist_colwise_, temp_state_.get()); train_data_->InitTrain(is_feature_used_, share_state_.get());
// initialize data partition // initialize data partition
data_partition_->Init(); data_partition_->Init();
...@@ -369,24 +374,27 @@ void SerialTreeLearner::FindBestSplits() { ...@@ -369,24 +374,27 @@ void SerialTreeLearner::FindBestSplits() {
FindBestSplitsFromHistograms(is_feature_used, use_subtract); FindBestSplitsFromHistograms(is_feature_used, use_subtract);
} }
void SerialTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract) { void SerialTreeLearner::ConstructHistograms(
Common::FunctionTimer fun_timer("SerialTreeLearner::ConstructHistograms", global_timer); const std::vector<int8_t>& is_feature_used, bool use_subtract) {
Common::FunctionTimer fun_timer("SerialTreeLearner::ConstructHistograms",
global_timer);
// construct smaller leaf // construct smaller leaf
hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - kHistOffset; hist_t* ptr_smaller_leaf_hist_data =
smaller_leaf_histogram_array_[0].RawData() - kHistOffset;
train_data_->ConstructHistograms( train_data_->ConstructHistograms(
is_feature_used, smaller_leaf_splits_->data_indices(), is_feature_used, smaller_leaf_splits_->data_indices(),
smaller_leaf_splits_->num_data_in_leaf(), gradients_, hessians_, smaller_leaf_splits_->num_data_in_leaf(), gradients_, hessians_,
ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_, ordered_gradients_.data(), ordered_hessians_.data(), share_state_.get(),
is_hist_colwise_, temp_state_.get(), ptr_smaller_leaf_hist_data); ptr_smaller_leaf_hist_data);
if (larger_leaf_histogram_array_ != nullptr && !use_subtract) { if (larger_leaf_histogram_array_ != nullptr && !use_subtract) {
// construct larger leaf // construct larger leaf
hist_t* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - kHistOffset; hist_t* ptr_larger_leaf_hist_data =
larger_leaf_histogram_array_[0].RawData() - kHistOffset;
train_data_->ConstructHistograms( train_data_->ConstructHistograms(
is_feature_used, larger_leaf_splits_->data_indices(), is_feature_used, larger_leaf_splits_->data_indices(),
larger_leaf_splits_->num_data_in_leaf(), gradients_, hessians_, larger_leaf_splits_->num_data_in_leaf(), gradients_, hessians_,
ordered_gradients_.data(), ordered_hessians_.data(), ordered_gradients_.data(), ordered_hessians_.data(), share_state_.get(),
is_constant_hessian_, is_hist_colwise_, temp_state_.get(),
ptr_larger_leaf_hist_data); ptr_larger_leaf_hist_data);
} }
} }
......
...@@ -48,11 +48,22 @@ class SerialTreeLearner: public TreeLearner { ...@@ -48,11 +48,22 @@ class SerialTreeLearner: public TreeLearner {
void Init(const Dataset* train_data, bool is_constant_hessian) override; void Init(const Dataset* train_data, bool is_constant_hessian) override;
void ResetTrainingData(const Dataset* train_data) override; void ResetTrainingData(const Dataset* train_data,
bool is_constant_hessian) override {
ResetTrainingDataInner(train_data, is_constant_hessian, true);
}
void ResetIsConstantHessian(bool is_constant_hessian) override {
share_state_->is_constant_hessian = is_constant_hessian;
}
virtual void ResetTrainingDataInner(const Dataset* train_data,
bool is_constant_hessian,
bool reset_multi_val_bin);
void ResetConfig(const Config* config) override; void ResetConfig(const Config* config) override;
Tree* Train(const score_t* gradients, const score_t *hessians, bool is_constant_hessian, Tree* Train(const score_t* gradients, const score_t *hessians,
const Json& forced_split_json) override; const Json& forced_split_json) override;
Tree* FitByExistingTree(const Tree* old_tree, const score_t* gradients, const score_t* hessians) const override; Tree* FitByExistingTree(const Tree* old_tree, const score_t* gradients, const score_t* hessians) const override;
...@@ -60,8 +71,17 @@ class SerialTreeLearner: public TreeLearner { ...@@ -60,8 +71,17 @@ class SerialTreeLearner: public TreeLearner {
Tree* FitByExistingTree(const Tree* old_tree, const std::vector<int>& leaf_pred, Tree* FitByExistingTree(const Tree* old_tree, const std::vector<int>& leaf_pred,
const score_t* gradients, const score_t* hessians) override; const score_t* gradients, const score_t* hessians) override;
void SetBaggingData(const data_size_t* used_indices, data_size_t num_data) override { void SetBaggingData(const Dataset* subset, const data_size_t* used_indices, data_size_t num_data) override {
if (subset == nullptr) {
data_partition_->SetUsedDataIndices(used_indices, num_data); data_partition_->SetUsedDataIndices(used_indices, num_data);
share_state_->is_use_subrow = false;
} else {
ResetTrainingDataInner(subset, share_state_->is_constant_hessian, false);
share_state_->is_use_subrow = true;
share_state_->is_subrow_copied = false;
share_state_->bagging_use_indices = used_indices;
share_state_->bagging_indices_cnt = num_data;
}
} }
void AddPredictionToScore(const Tree* tree, void AddPredictionToScore(const Tree* tree,
...@@ -84,8 +104,6 @@ class SerialTreeLearner: public TreeLearner { ...@@ -84,8 +104,6 @@ class SerialTreeLearner: public TreeLearner {
void RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj, std::function<double(const label_t*, int)> residual_getter, void RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj, std::function<double(const label_t*, int)> residual_getter,
data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const override; data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const override;
bool IsHistColWise() const override { return is_hist_colwise_; }
protected: protected:
void ComputeBestSplitForFeature(FeatureHistogram* histogram_array_, void ComputeBestSplitForFeature(FeatureHistogram* histogram_array_,
int feature_index, int real_fidx, int feature_index, int real_fidx,
...@@ -93,7 +111,7 @@ class SerialTreeLearner: public TreeLearner { ...@@ -93,7 +111,7 @@ class SerialTreeLearner: public TreeLearner {
const LeafSplits* leaf_splits, const LeafSplits* leaf_splits,
SplitInfo* best_split); SplitInfo* best_split);
void GetMultiValBin(const Dataset* dataset, bool is_first_time); void GetShareStates(const Dataset* dataset, bool is_constant_hessian, bool is_first_time);
virtual std::vector<int8_t> GetUsedFeatures(bool is_tree_level); virtual std::vector<int8_t> GetUsedFeatures(bool is_tree_level);
/*! /*!
...@@ -182,17 +200,11 @@ class SerialTreeLearner: public TreeLearner { ...@@ -182,17 +200,11 @@ class SerialTreeLearner: public TreeLearner {
/*! \brief hessians of current iteration, ordered for cache optimized */ /*! \brief hessians of current iteration, ordered for cache optimized */
std::vector<score_t, Common::AlignmentAllocator<score_t, kAlignedSize>> ordered_hessians_; std::vector<score_t, Common::AlignmentAllocator<score_t, kAlignedSize>> ordered_hessians_;
#endif #endif
/*! \brief is_data_in_leaf_[i] != 0 means i-th data is marked */
std::vector<char, Common::AlignmentAllocator<char, kAlignedSize>> is_data_in_leaf_;
/*! \brief used to cache historical histogram to speed up*/ /*! \brief used to cache historical histogram to speed up*/
HistogramPool histogram_pool_; HistogramPool histogram_pool_;
/*! \brief config of tree learner*/ /*! \brief config of tree learner*/
const Config* config_; const Config* config_;
std::vector<int> ordered_bin_indices_; std::unique_ptr<TrainingShareStates> share_state_;
bool is_constant_hessian_;
std::unique_ptr<TrainingTempState> temp_state_;
bool is_hist_colwise_;
std::unique_ptr<CostEfficientGradientBoosting> cegb_; std::unique_ptr<CostEfficientGradientBoosting> cegb_;
}; };
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment