Commit 5442ed78 authored by Guolin Ke, committed by xuehui

Refactor for RAII (#86)

* RAII for utils, application and c_api (partial)

* raii for class in include folder

* raii for application and boosting

* raii for dataset and dataset loader

* raii for dense bin and parser

* RAII refactor for almost all classes

* RAII for c_api

* clean code

* refine repeated code

* Decouple the "sigmoid" between objective and boosting.

* change std::vector<bool> back to std::vector<char> due to concurrency problems

* slightly reduce some memory cost
parent 3586673a
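The pattern running through the whole commit: owning raw pointers with hand-written cleanup are replaced by std::unique_ptr and std::vector members, so destructors become trivial and early exits cannot leak. A minimal before/after sketch of that shape (the member names are illustrative, not the exact LightGBM fields):

#include <memory>
#include <vector>

class Tree { /* trained tree model */ };

// Before: manual ownership; every owning pointer must be deleted by
// hand, and any missed path (early return, exception) leaks.
class GBDTBefore {
 public:
  ~GBDTBefore() {
    delete[] gradients_;
    for (Tree* t : models_) { delete t; }
  }
 private:
  float* gradients_ = nullptr;  // owned raw buffer
  std::vector<Tree*> models_;   // owned raw pointers
};

// After: ownership is encoded in the member types; the implicitly
// generated destructor releases everything on every code path.
class GBDTAfter {
 private:
  std::vector<float> gradients_;               // owns its buffer
  std::vector<std::unique_ptr<Tree>> models_;  // owns each Tree
};

int main() {
  GBDTBefore before;  // relies on the hand-written destructor
  GBDTAfter after;    // needs no destructor at all
}

The one counter-intuitive bullet above is std::vector<bool>: it is a bit-packed specialization, so two OpenMP threads writing "different" elements can touch the same byte, which is a data race. std::vector<char> gives every element its own addressable byte, which is why the commit switches back to it.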
......@@ -16,74 +16,66 @@
namespace LightGBM {
GBDT::GBDT()
: train_score_updater_(nullptr),
gradients_(nullptr), hessians_(nullptr),
out_of_bag_data_indices_(nullptr), bag_data_indices_(nullptr),
saved_model_size_(-1), num_used_model_(0) {
GBDT::GBDT() : saved_model_size_(-1), num_used_model_(0) {
}
GBDT::~GBDT() {
for (auto& tree_learner: tree_learner_){
if (tree_learner != nullptr) { delete tree_learner; }
}
if (gradients_ != nullptr) { delete[] gradients_; }
if (hessians_ != nullptr) { delete[] hessians_; }
if (out_of_bag_data_indices_ != nullptr) { delete[] out_of_bag_data_indices_; }
if (bag_data_indices_ != nullptr) { delete[] bag_data_indices_; }
for (auto& tree : models_) {
if (tree != nullptr) { delete tree; }
}
if (train_score_updater_ != nullptr) { delete train_score_updater_; }
for (auto& score_tracker : valid_score_updater_) {
if (score_tracker != nullptr) { delete score_tracker; }
}
}
void GBDT::Init(const BoostingConfig* config, const Dataset* train_data, const ObjectiveFunction* object_function,
const std::vector<const Metric*>& training_metrics) {
gbdt_config_ = dynamic_cast<const GBDTConfig*>(config);
gbdt_config_ = config;
iter_ = 0;
saved_model_size_ = -1;
num_used_model_ = 0;
max_feature_idx_ = 0;
early_stopping_round_ = gbdt_config_->early_stopping_round;
shrinkage_rate_ = gbdt_config_->learning_rate;
train_data_ = train_data;
num_class_ = config->num_class;
tree_learner_ = std::vector<TreeLearner*>(num_class_, nullptr);
// create tree learner
for (int i = 0; i < num_class_; ++i){
tree_learner_[i] =
TreeLearner::CreateTreeLearner(gbdt_config_->tree_learner_type, gbdt_config_->tree_config);
// init tree learner
tree_learner_[i]->Init(train_data_);
for (int i = 0; i < num_class_; ++i) {
auto new_tree_learner = std::unique_ptr<TreeLearner>(TreeLearner::CreateTreeLearner(gbdt_config_->tree_learner_type, gbdt_config_->tree_config));
new_tree_learner->Init(train_data_);
// init tree learner
tree_learner_.push_back(std::move(new_tree_learner));
}
tree_learner_.shrink_to_fit();
object_function_ = object_function;
// push training metrics
for (const auto& metric : training_metrics) {
training_metrics_.push_back(metric);
}
training_metrics_.shrink_to_fit();
// create score tracker
train_score_updater_ = new ScoreUpdater(train_data_, num_class_);
train_score_updater_.reset(new ScoreUpdater(train_data_, num_class_));
num_data_ = train_data_->num_data();
// create buffer for gradients and hessians
if (object_function_ != nullptr) {
gradients_ = new score_t[num_data_ * num_class_];
hessians_ = new score_t[num_data_ * num_class_];
gradients_ = std::vector<score_t>(num_data_ * num_class_);
hessians_ = std::vector<score_t>(num_data_ * num_class_);
}
sigmoid_ = -1.0f;
if (object_function_ != nullptr
&& std::string(object_function_->GetName()) == std::string("binary")) {
// only binary classification needs the sigmoid transform
sigmoid_ = gbdt_config_->sigmoid;
}
// get max feature index
max_feature_idx_ = train_data_->num_total_features() - 1;
// get label index
label_idx_ = train_data_->label_idx();
// if need bagging, create buffer
if (gbdt_config_->bagging_fraction < 1.0 && gbdt_config_->bagging_freq > 0) {
out_of_bag_data_indices_ = new data_size_t[num_data_];
bag_data_indices_ = new data_size_t[num_data_];
out_of_bag_data_indices_ = std::vector<data_size_t>(num_data_);
bag_data_indices_ = std::vector<data_size_t>(num_data_);
} else {
out_of_bag_data_cnt_ = 0;
out_of_bag_data_indices_ = nullptr;
out_of_bag_data_indices_.clear();
bag_data_cnt_ = num_data_;
bag_data_indices_ = nullptr;
bag_data_indices_.clear();
}
// initialize random generator
random_ = Random(gbdt_config_->bagging_seed);
......@@ -91,12 +83,13 @@ void GBDT::Init(const BoostingConfig* config, const Dataset* train_data, const O
}
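This is the "decouple the sigmoid" bullet in action: instead of querying the objective for its sigmoid at save time (the old GetSigmoid() call removed further down), GBDT now caches the config value itself and uses sigmoid_ = -1 to mean "no transform". A small sketch of the convention, matching the transform applied in GetPredictAt but not LightGBM's exact code:

#include <cmath>
#include <cstdio>
#include <string>

// sigmoid > 0 marks binary classification; otherwise the raw score
// passes through untransformed.
double Transform(double raw_score, double sigmoid) {
  if (sigmoid > 0.0) {
    return 1.0 / (1.0 + std::exp(-2.0 * sigmoid * raw_score));
  }
  return raw_score;
}

int main() {
  double sigmoid = -1.0;  // default: no transform
  std::string objective = "binary";
  if (objective == "binary") {
    sigmoid = 1.0;  // would come from gbdt_config_->sigmoid
  }
  std::printf("%f\n", Transform(0.5, sigmoid));
}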
void GBDT::AddDataset(const Dataset* valid_data,
const std::vector<const Metric*>& valid_metrics) {
const std::vector<const Metric*>& valid_metrics) {
if (iter_ > 0) {
Log::Fatal("Cannot add validation data after training started");
}
// for a validation dataset, we need its score and metric
valid_score_updater_.push_back(new ScoreUpdater(valid_data, num_class_));
auto new_score_updater = std::unique_ptr<ScoreUpdater>(new ScoreUpdater(valid_data, num_class_));
valid_score_updater_.push_back(std::move(new_score_updater));
valid_metrics_.emplace_back();
if (early_stopping_round_ > 0) {
best_iter_.emplace_back();
......@@ -109,12 +102,13 @@ void GBDT::AddDataset(const Dataset* valid_data,
best_score_.back().push_back(kMinScore);
}
}
valid_metrics_.back().shrink_to_fit();
}
void GBDT::Bagging(int iter, const int curr_class) {
// if need bagging
if (out_of_bag_data_indices_ != nullptr && iter % gbdt_config_->bagging_freq == 0) {
if (out_of_bag_data_indices_.size() > 0 && iter % gbdt_config_->bagging_freq == 0) {
// if there is no query data
if (train_data_->metadata().query_boundaries() == nullptr) {
bag_data_cnt_ =
......@@ -159,72 +153,75 @@ void GBDT::Bagging(int iter, const int curr_class) {
bag_data_cnt_ = cur_left_cnt;
out_of_bag_data_cnt_ = num_data_ - bag_data_cnt_;
}
Log::Info("Re-bagging, using %d data to train", bag_data_cnt_);
Log::Debug("Re-bagging, using %d data to train", bag_data_cnt_);
// set bagging data to tree learner
tree_learner_[curr_class]->SetBaggingData(bag_data_indices_, bag_data_cnt_);
tree_learner_[curr_class]->SetBaggingData(bag_data_indices_.data(), bag_data_cnt_);
}
}
void GBDT::UpdateScoreOutOfBag(const Tree* tree, const int curr_class) {
// we need to predict the out-of-bag scores of data for boosting
if (out_of_bag_data_indices_ != nullptr) {
train_score_updater_->
AddScore(tree, out_of_bag_data_indices_, out_of_bag_data_cnt_, curr_class);
if (out_of_bag_data_indices_.size() > 0) {
train_score_updater_->AddScore(tree, out_of_bag_data_indices_.data(), out_of_bag_data_cnt_, curr_class);
}
}
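Note the idiom that replaces the old nullptr checks in Bagging and UpdateScoreOutOfBag: an empty vector now means "bagging disabled", tested with size() > 0 instead of != nullptr, and handed to raw-pointer APIs via .data(). A hedged sketch of that sentinel pattern, with the bag-related names simplified:

#include <vector>

using data_size_t = int;

struct BaggingState {
  std::vector<data_size_t> bag_data_indices_;  // empty <=> bagging disabled

  void Init(data_size_t num_data, bool use_bagging) {
    if (use_bagging) {
      bag_data_indices_.resize(num_data);  // the buffer doubles as the flag
    } else {
      bag_data_indices_.clear();           // replaces the old nullptr state
    }
  }

  const data_size_t* RawView() const {
    // raw-pointer view for APIs such as SetBaggingData(...)
    return bag_data_indices_.empty() ? nullptr : bag_data_indices_.data();
  }
};

int main() {
  BaggingState s;
  s.Init(100, /*use_bagging=*/true);
  if (!s.bag_data_indices_.empty()) { /* re-bag this iteration */ }
  return s.RawView() == nullptr ? 1 : 0;
}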
bool GBDT::TrainOneIter(const score_t* gradient, const score_t* hessian, bool is_eval) {
// boosting first
if (gradient == nullptr || hessian == nullptr) {
Boosting();
gradient = gradients_;
hessian = hessians_;
// boosting first
if (gradient == nullptr || hessian == nullptr) {
Boosting();
gradient = gradients_.data();
hessian = hessians_.data();
}
for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
// bagging logic
Bagging(iter_, curr_class);
// train a new tree
std::unique_ptr<Tree> new_tree(tree_learner_[curr_class]->Train(gradient + curr_class * num_data_, hessian + curr_class * num_data_));
// if cannot learn a new tree, then stop
if (new_tree->num_leaves() <= 1) {
Log::Info("Stopped training because there are no more leafs that meet the split requirements.");
return true;
}
for (int curr_class = 0; curr_class < num_class_; ++curr_class){
// bagging logic
Bagging(iter_, curr_class);
// train a new tree
Tree * new_tree = tree_learner_[curr_class]->Train(gradient + curr_class * num_data_, hessian+ curr_class * num_data_);
// if cannot learn a new tree, then stop
if (new_tree->num_leaves() <= 1) {
Log::Info("Stopped training because there are no more leafs that meet the split requirements.");
return true;
}
// shrinkage by learning rate
new_tree->Shrinkage(shrinkage_rate_);
// update score
UpdateScore(new_tree.get(), curr_class);
UpdateScoreOutOfBag(new_tree.get(), curr_class);
// shrinkage by learning rate
new_tree->Shrinkage(gbdt_config_->learning_rate);
// update score
UpdateScore(new_tree, curr_class);
UpdateScoreOutOfBag(new_tree, curr_class);
// add model
models_.push_back(std::move(new_tree));
}
++iter_;
if (is_eval) {
return EvalAndCheckEarlyStopping();
} else {
return false;
}
// add model
models_.push_back(new_tree);
}
}
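The rewritten training loop follows one consistent ownership handoff: the learner's Train() returns an owning raw pointer that is wrapped in a unique_ptr immediately, the score updaters observe it through get(), and models_ takes it with std::move. A minimal sketch, assuming only that the factory returns an owning raw pointer as TreeLearner::Train does:

#include <memory>
#include <vector>

struct Tree {
  int num_leaves() const { return 2; }
};

// Stand-in for TreeLearner::Train: returns an owning raw pointer.
Tree* TrainOne() { return new Tree(); }

void Observe(const Tree* t) { (void)t; }  // non-owning use, via get()

int main() {
  std::vector<std::unique_ptr<Tree>> models;
  std::unique_ptr<Tree> tree(TrainOne());  // take ownership at once
  if (tree->num_leaves() <= 1) return 0;   // early exit can no longer leak
  Observe(tree.get());                     // lend the pointer, keep ownership
  models.push_back(std::move(tree));       // the container owns it from here
  models.pop_back();                       // also deletes the Tree
}

The same property is what lets EvalAndCheckEarlyStopping below drop the trailing early-stopping models with a bare pop_back() in the refactored version: destroying the unique_ptr deletes the tree.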
bool GBDT::EvalAndCheckEarlyStopping() {
bool is_met_early_stopping = false;
// print message for metric
if (is_eval) {
is_met_early_stopping = OutputMetric(iter_ + 1);
}
++iter_;
is_met_early_stopping = OutputMetric(iter_);
if (is_met_early_stopping) {
Log::Info("Early stopping at iteration %d, the best iteration round is %d",
iter_, iter_ - early_stopping_round_);
// pop last early_stopping_round_ models
for (int i = 0; i < early_stopping_round_ * num_class_; ++i) {
delete models_.back();
models_.pop_back();
}
}
return is_met_early_stopping;
}
void GBDT::UpdateScore(const Tree* tree, const int curr_class) {
// update training score
train_score_updater_->AddScore(tree_learner_[curr_class], curr_class);
train_score_updater_->AddScore(tree_learner_[curr_class].get(), curr_class);
// update validation score
for (auto& score_updater : valid_score_updater_) {
score_updater->AddScore(tree, curr_class);
......@@ -327,7 +324,7 @@ void GBDT::GetPredictAt(int data_idx, score_t* out_result, data_size_t* out_len)
out_result[j * num_data + i] = static_cast<score_t>(tmp_result[i]);
}
}
} else if(sigmoid_ > 0){
} else if(sigmoid_ > 0.0f){
#pragma omp parallel for schedule(guided)
for (data_size_t i = 0; i < num_data; ++i) {
out_result[i] = static_cast<score_t>(1.0f / (1.0f + std::exp(-2.0f * sigmoid_ * raw_scores[i])));
......@@ -348,11 +345,10 @@ void GBDT::Boosting() {
// objective function will calculate gradients and hessians
int num_score = 0;
object_function_->
GetGradients(GetTrainingScore(&num_score), gradients_, hessians_);
GetGradients(GetTrainingScore(&num_score), gradients_.data(), hessians_.data());
}
void GBDT::SaveModelToFile(int num_used_model, bool is_finish, const char* filename) {
// first time to this function, open file
if (saved_model_size_ < 0) {
model_output_file_.open(filename);
......@@ -364,8 +360,12 @@ void GBDT::SaveModelToFile(int num_used_model, bool is_finish, const char* filen
model_output_file_ << "label_index=" << label_idx_ << std::endl;
// output max_feature_idx
model_output_file_ << "max_feature_idx=" << max_feature_idx_ << std::endl;
// output objective name
if (object_function_ != nullptr) {
model_output_file_ << "objective=" << object_function_->GetName() << std::endl;
}
// output sigmoid parameter
model_output_file_ << "sigmoid=" << object_function_->GetSigmoid() << std::endl;
model_output_file_ << "sigmoid=" << sigmoid_ << std::endl;
model_output_file_ << std::endl;
saved_model_size_ = 0;
}
......@@ -445,7 +445,8 @@ void GBDT::LoadModelFromString(const std::string& model_str) {
while (i < lines.size() && lines[i].find("Tree=") == std::string::npos) { ++i; }
int end = static_cast<int>(i);
std::string tree_str = Common::Join<std::string>(lines, start, end, '\n');
models_.push_back(new Tree(tree_str));
auto new_tree = std::unique_ptr<Tree>(new Tree(tree_str));
models_.push_back(std::move(new_tree));
} else {
++i;
}
......
......@@ -8,6 +8,7 @@
#include <vector>
#include <string>
#include <fstream>
#include <memory>
namespace LightGBM {
/*!
......@@ -50,6 +51,8 @@ public:
*/
virtual bool TrainOneIter(const score_t* gradient, const score_t* hessian, bool is_eval) override;
bool EvalAndCheckEarlyStopping() override;
/*!
* \brief Get evaluation result at data_idx data
* \param data_idx 0: training data, 1: 1st validation data
......@@ -59,7 +62,7 @@ public:
/*!
* \brief Get current training score
* \param out_len lenght of returned score
* \param out_len length of returned score
* \return training score
*/
virtual const score_t* GetTrainingScore(data_size_t* out_len) override;
......@@ -94,8 +97,10 @@ public:
std::vector<int> PredictLeafIndex(const double* value) const override;
/*!
* \brief Serialize models by string
* \return String output of tranined model
* \brief Save model to file
* \param num_used_model number of models to save, -1 means save all
* \param is_finish whether training has finished
* \param filename name of the file to save to
*/
virtual void SaveModelToFile(int num_used_model, bool is_finish, const char* filename) override;
/*!
......@@ -179,17 +184,17 @@ protected:
/*! \brief Pointer to training data */
const Dataset* train_data_;
/*! \brief Config of gbdt */
const GBDTConfig* gbdt_config_;
const BoostingConfig* gbdt_config_;
/*! \brief Tree learner, will use this class to learn trees */
std::vector<TreeLearner*> tree_learner_;
std::vector<std::unique_ptr<TreeLearner>> tree_learner_;
/*! \brief Objective function */
const ObjectiveFunction* object_function_;
/*! \brief Store and update training data's score */
ScoreUpdater* train_score_updater_;
std::unique_ptr<ScoreUpdater> train_score_updater_;
/*! \brief Metrics for training data */
std::vector<const Metric*> training_metrics_;
/*! \brief Store and update validation data's scores */
std::vector<ScoreUpdater*> valid_score_updater_;
std::vector<std::unique_ptr<ScoreUpdater>> valid_score_updater_;
/*! \brief Metric for validation data */
std::vector<std::vector<const Metric*>> valid_metrics_;
/*! \brief Number of rounds for early stopping */
......@@ -198,19 +203,19 @@ protected:
std::vector<std::vector<int>> best_iter_;
std::vector<std::vector<double>> best_score_;
/*! \brief Trained models(trees) */
std::vector<Tree*> models_;
std::vector<std::unique_ptr<Tree>> models_;
/*! \brief Max feature index of training data*/
int max_feature_idx_;
/*! \brief First order derivative of training data */
score_t* gradients_;
std::vector<score_t> gradients_;
/*! \brief Second order derivative of training data */
score_t* hessians_;
std::vector<score_t> hessians_;
/*! \brief Store the data indices of out-of-bag */
data_size_t* out_of_bag_data_indices_;
std::vector<data_size_t> out_of_bag_data_indices_;
/*! \brief Number of out-of-bag data */
data_size_t out_of_bag_data_cnt_;
/*! \brief Store the indices of in-bag data */
data_size_t* bag_data_indices_;
std::vector<data_size_t> bag_data_indices_;
/*! \brief Number of in-bag data */
data_size_t bag_data_cnt_;
/*! \brief Number of training data */
......@@ -232,6 +237,8 @@ protected:
std::ofstream model_output_file_;
/*! \brief number of used model */
int num_used_model_;
/*! \brief Shrinkage rate for one iteration */
double shrinkage_rate_;
};
} // namespace LightGBM
......
......@@ -18,12 +18,11 @@ public:
* \brief Constructor, will pass a const pointer of dataset
* \param data This class will bind with this data set
*/
explicit ScoreUpdater(const Dataset* data, int num_class)
:data_(data) {
ScoreUpdater(const Dataset* data, int num_class) : data_(data) {
num_data_ = data->num_data();
score_ = new score_t[num_data_ * num_class];
score_ = std::vector<score_t>(num_data_ * num_class);
// default start score is zero
std::memset(score_, 0, sizeof(score_t) * num_data_ * num_class);
std::fill(score_.begin(), score_.end(), 0.0f);
const float* init_score = data->metadata().init_score();
// if exists initial score, will start from it
if (init_score != nullptr) {
......@@ -34,7 +33,7 @@ public:
}
/*! \brief Destructor */
~ScoreUpdater() {
delete[] score_;
}
/*!
* \brief Using tree model to get prediction number, then adding to scores for all data
......@@ -43,7 +42,7 @@ public:
* \param curr_class Current class for multiclass training
*/
inline void AddScore(const Tree* tree, int curr_class) {
tree->AddPredictionToScore(data_, num_data_, score_ + curr_class * num_data_);
tree->AddPredictionToScore(data_, num_data_, score_.data() + curr_class * num_data_);
}
/*!
* \brief Adding prediction score, only used for training data.
......@@ -53,7 +52,7 @@ public:
* \param curr_class Current class for multiclass training
*/
inline void AddScore(const TreeLearner* tree_learner, int curr_class) {
tree_learner->AddPredictionToScore(score_ + curr_class * num_data_);
tree_learner->AddPredictionToScore(score_.data() + curr_class * num_data_);
}
/*!
* \brief Using tree model to get prediction number, then adding to scores for parts of data
......@@ -65,18 +64,23 @@ public:
*/
inline void AddScore(const Tree* tree, const data_size_t* data_indices,
data_size_t data_cnt, int curr_class) {
tree->AddPredictionToScore(data_, data_indices, data_cnt, score_ + curr_class * num_data_);
tree->AddPredictionToScore(data_, data_indices, data_cnt, score_.data() + curr_class * num_data_);
}
/*! \brief Pointer of score */
inline const score_t* score() { return score_; }
inline const data_size_t num_data() { return num_data_; }
inline const score_t* score() const { return score_.data(); }
inline const data_size_t num_data() const { return num_data_; }
/*! \brief Disable copy */
ScoreUpdater& operator=(const ScoreUpdater&) = delete;
/*! \brief Disable copy */
ScoreUpdater(const ScoreUpdater&) = delete;
private:
/*! \brief Number of total data */
data_size_t num_data_;
/*! \brief Pointer of data set */
const Dataset* data_;
/*! \brief Scores for data set */
score_t* score_;
std::vector<score_t> score_;
};
} // namespace LightGBM
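Two details of the new ScoreUpdater are worth calling out: the copy operations are deleted because each updater is bound to one dataset's score buffer, and multiclass scores live in a single flat vector addressed as score_[curr_class * num_data_ + i]. A simplified sketch of that class-striped layout (types reduced; score_t is a float or double alias in LightGBM):

#include <vector>

using score_t = double;  // stand-in for LightGBM's score_t alias

class Scores {
 public:
  Scores(int num_data, int num_class)
      : num_data_(num_data),
        score_(static_cast<size_t>(num_data) * num_class, 0.0) {}

  // Class-major stripes: [class 0 scores][class 1 scores]...
  score_t* ClassBegin(int curr_class) {
    return score_.data() + static_cast<size_t>(curr_class) * num_data_;
  }

  Scores(const Scores&) = delete;             // one owner per buffer
  Scores& operator=(const Scores&) = delete;

 private:
  int num_data_;
  std::vector<score_t> score_;
};

int main() {
  Scores s(/*num_data=*/10, /*num_class=*/3);
  *s.ClassBegin(2) = 1.0;  // first score of class 2
}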
......
......@@ -14,29 +14,26 @@
namespace LightGBM {
BinMapper::BinMapper()
:bin_upper_bound_(nullptr) {
BinMapper::BinMapper() {
}
// deep copy function for BinMapper
BinMapper::BinMapper(const BinMapper& other)
: bin_upper_bound_(nullptr) {
BinMapper::BinMapper(const BinMapper& other) {
num_bin_ = other.num_bin_;
is_trival_ = other.is_trival_;
sparse_rate_ = other.sparse_rate_;
bin_upper_bound_ = new double[num_bin_];
bin_upper_bound_ = std::vector<double>(num_bin_);
for (int i = 0; i < num_bin_; ++i) {
bin_upper_bound_[i] = other.bin_upper_bound_[i];
}
}
BinMapper::BinMapper(const void* memory)
:bin_upper_bound_(nullptr) {
BinMapper::BinMapper(const void* memory) {
CopyFrom(reinterpret_cast<const char*>(memory));
}
BinMapper::~BinMapper() {
delete[] bin_upper_bound_;
}
void BinMapper::FindBin(std::vector<double>* values, size_t total_sample_cnt, int max_bin) {
......@@ -87,7 +84,7 @@ void BinMapper::FindBin(std::vector<double>* values, size_t total_sample_cnt, in
std::sort(distinct_values.begin(), distinct_values.end());
// use distinct value is enough
num_bin_ = num_values;
bin_upper_bound_ = new double[num_values];
bin_upper_bound_ = std::vector<double>(num_values);
for (int i = 0; i < num_values - 1; ++i) {
bin_upper_bound_[i] = (distinct_values[i] + distinct_values[i + 1]) / 2;
}
......@@ -124,7 +121,7 @@ void BinMapper::FindBin(std::vector<double>* values, size_t total_sample_cnt, in
//
++bin_cnt;
// update bin upper bound
bin_upper_bound_ = new double[bin_cnt];
bin_upper_bound_ = std::vector<double>(bin_cnt);
num_bin_ = bin_cnt;
for (int i = 0; i < bin_cnt - 1; ++i) {
bin_upper_bound_[i] = (upper_bounds[i] + lower_bounds[i + 1]) / 2.0f;
......@@ -159,7 +156,7 @@ void BinMapper::CopyTo(char * buffer) {
buffer += sizeof(is_trival_);
std::memcpy(buffer, &sparse_rate_, sizeof(sparse_rate_));
buffer += sizeof(sparse_rate_);
std::memcpy(buffer, bin_upper_bound_, num_bin_ * sizeof(double));
std::memcpy(buffer, bin_upper_bound_.data(), num_bin_ * sizeof(double));
}
void BinMapper::CopyFrom(const char * buffer) {
......@@ -169,16 +166,15 @@ void BinMapper::CopyFrom(const char * buffer) {
buffer += sizeof(is_trival_);
std::memcpy(&sparse_rate_, buffer, sizeof(sparse_rate_));
buffer += sizeof(sparse_rate_);
if (bin_upper_bound_ != nullptr) { delete[] bin_upper_bound_; }
bin_upper_bound_ = new double[num_bin_];
std::memcpy(bin_upper_bound_, buffer, num_bin_ * sizeof(double));
bin_upper_bound_ = std::vector<double>(num_bin_);
std::memcpy(bin_upper_bound_.data(), buffer, num_bin_ * sizeof(double));
}
void BinMapper::SaveBinaryToFile(FILE* file) const {
fwrite(&num_bin_, sizeof(num_bin_), 1, file);
fwrite(&is_trival_, sizeof(is_trival_), 1, file);
fwrite(&sparse_rate_, sizeof(sparse_rate_), 1, file);
fwrite(bin_upper_bound_, sizeof(double), num_bin_, file);
fwrite(bin_upper_bound_.data(), sizeof(double), num_bin_, file);
}
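BinMapper keeps its byte layout across the refactor; only the backing store changes from new double[] to std::vector<double>, with .data() handed to memcpy and fwrite. A sketch of the same round trip under that fixed layout (fields abbreviated to one count plus one array):

#include <cstring>
#include <vector>

struct Mapper {
  int num_bin_ = 0;
  std::vector<double> bin_upper_bound_;  // num_bin_ entries

  size_t SizesInByte() const {
    return sizeof(num_bin_) + bin_upper_bound_.size() * sizeof(double);
  }

  void CopyTo(char* buffer) const {
    std::memcpy(buffer, &num_bin_, sizeof(num_bin_));
    buffer += sizeof(num_bin_);
    std::memcpy(buffer, bin_upper_bound_.data(), num_bin_ * sizeof(double));
  }

  void CopyFrom(const char* buffer) {
    std::memcpy(&num_bin_, buffer, sizeof(num_bin_));
    buffer += sizeof(num_bin_);
    bin_upper_bound_.assign(num_bin_, 0.0);  // reallocation replaces delete[]/new[]
    std::memcpy(bin_upper_bound_.data(), buffer, num_bin_ * sizeof(double));
  }
};

int main() {
  Mapper a;
  a.num_bin_ = 2;
  a.bin_upper_bound_ = {0.5, 1.5};
  std::vector<char> buffer(a.SizesInByte());
  a.CopyTo(buffer.data());
  Mapper b;
  b.CopyFrom(buffer.data());  // b now mirrors a
}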
size_t BinMapper::SizesInByte() const {
......
......@@ -38,16 +38,12 @@ void OverallConfig::Set(const std::unordered_map<std::string, std::string>& para
GetObjectiveType(params);
GetMetricType(params);
// construct boosting configs
if (boosting_type == BoostingType::kGBDT || boosting_type == BoostingType::kDART) {
boosting_config = new GBDTConfig();
}
// sub-config setup
network_config.Set(params);
io_config.Set(params);
boosting_config->Set(params);
boosting_config.Set(params);
objective_config.Set(params);
metric_config.Set(params);
// check for conflicts
......@@ -110,6 +106,7 @@ void OverallConfig::GetMetricType(const std::unordered_map<std::string, std::str
std::string sub_metric_str = pair.first;
metric_types.push_back(sub_metric_str);
}
metric_types.shrink_to_fit();
}
}
......@@ -130,11 +127,10 @@ void OverallConfig::GetTaskType(const std::unordered_map<std::string, std::strin
}
void OverallConfig::CheckParamConflict() {
GBDTConfig* gbdt_config = dynamic_cast<GBDTConfig*>(boosting_config);
// check if objective_type, metric_type, and num_class match
bool objective_type_multiclass = (objective_type == std::string("multiclass"));
int num_class_check = gbdt_config->num_class;
int num_class_check = boosting_config.num_class;
if (objective_type_multiclass){
if (num_class_check <= 1){
Log::Fatal("Number of classes should be specified and greater than 1 for multiclass training");
......@@ -157,24 +153,24 @@ void OverallConfig::CheckParamConflict() {
is_parallel = true;
} else {
is_parallel = false;
gbdt_config->tree_learner_type = TreeLearnerType::kSerialTreeLearner;
boosting_config.tree_learner_type = TreeLearnerType::kSerialTreeLearner;
}
if (gbdt_config->tree_learner_type == TreeLearnerType::kSerialTreeLearner) {
if (boosting_config.tree_learner_type == TreeLearnerType::kSerialTreeLearner) {
is_parallel = false;
network_config.num_machines = 1;
}
if (gbdt_config->tree_learner_type == TreeLearnerType::kSerialTreeLearner ||
gbdt_config->tree_learner_type == TreeLearnerType::kFeatureParallelTreelearner) {
if (boosting_config.tree_learner_type == TreeLearnerType::kSerialTreeLearner ||
boosting_config.tree_learner_type == TreeLearnerType::kFeatureParallelTreelearner) {
is_parallel_find_bin = false;
} else if (gbdt_config->tree_learner_type == TreeLearnerType::kDataParallelTreeLearner) {
} else if (boosting_config.tree_learner_type == TreeLearnerType::kDataParallelTreeLearner) {
is_parallel_find_bin = true;
if (gbdt_config->tree_config.histogram_pool_size >= 0) {
if (boosting_config.tree_config.histogram_pool_size >= 0) {
Log::Warning("Histogram LRU queue was enabled (histogram_pool_size=%f). Will disable this to reduce communication costs"
, gbdt_config->tree_config.histogram_pool_size);
, boosting_config.tree_config.histogram_pool_size);
// Change pool size to -1 (not limit) when using data parallel to reduce communication costs
gbdt_config->tree_config.histogram_pool_size = -1;
boosting_config.tree_config.histogram_pool_size = -1;
}
}
......@@ -229,6 +225,7 @@ void ObjectiveConfig::Set(const std::unordered_map<std::string, std::string>& pa
label_gain.push_back(static_cast<double>((1 << i) - 1));
}
}
label_gain.shrink_to_fit();
}
......@@ -246,6 +243,7 @@ void MetricConfig::Set(const std::unordered_map<std::string, std::string>& param
label_gain.push_back(static_cast<double>((1 << i) - 1));
}
}
label_gain.shrink_to_fit();
if (GetString(params, "ndcg_eval_at", &tmp_str)) {
eval_at = Common::StringToIntArray(tmp_str, ',');
std::sort(eval_at.begin(), eval_at.end());
......@@ -258,6 +256,7 @@ void MetricConfig::Set(const std::unordered_map<std::string, std::string>& param
eval_at.push_back(i);
}
}
eval_at.shrink_to_fit();
}
......@@ -284,6 +283,7 @@ void TreeConfig::Set(const std::unordered_map<std::string, std::string>& params)
void BoostingConfig::Set(const std::unordered_map<std::string, std::string>& params) {
GetInt(params, "num_iterations", &num_iterations);
GetDouble(params, "sigmoid", &sigmoid);
CHECK(num_iterations >= 0);
GetInt(params, "bagging_seed", &bagging_seed);
GetInt(params, "bagging_freq", &bagging_freq);
......@@ -301,9 +301,11 @@ void BoostingConfig::Set(const std::unordered_map<std::string, std::string>& par
GetInt(params, "drop_seed", &drop_seed);
GetDouble(params, "drop_rate", &drop_rate);
CHECK(drop_rate <= 1.0 && drop_rate >= 0.0);
GetTreeLearnerType(params);
tree_config.Set(params);
}
void GBDTConfig::GetTreeLearnerType(const std::unordered_map<std::string, std::string>& params) {
void BoostingConfig::GetTreeLearnerType(const std::unordered_map<std::string, std::string>& params) {
std::string value;
if (GetString(params, "tree_learner", &value)) {
std::transform(value.begin(), value.end(), value.begin(), ::tolower);
......@@ -320,12 +322,6 @@ void GBDTConfig::GetTreeLearnerType(const std::unordered_map<std::string, std::s
}
}
void GBDTConfig::Set(const std::unordered_map<std::string, std::string>& params) {
BoostingConfig::Set(params);
GetTreeLearnerType(params);
tree_config.Set(params);
}
void NetworkConfig::Set(const std::unordered_map<std::string, std::string>& params) {
GetInt(params, "num_machines", &num_machines);
CHECK(num_machines >= 1);
......
......@@ -29,10 +29,7 @@ Dataset::Dataset(data_size_t num_data, int num_class) {
}
Dataset::~Dataset() {
for (auto& feature : features_) {
delete feature;
}
features_.clear();
}
void Dataset::FinishLoad() {
......@@ -45,10 +42,15 @@ void Dataset::FinishLoad() {
void Dataset::CopyFeatureMapperFrom(const Dataset* dataset, bool is_enable_sparse) {
features_.clear();
// copy feature bin mapper data
for (Feature* feature : dataset->features_) {
features_.push_back(new Feature(feature->feature_index(),
new BinMapper(*feature->bin_mapper()), num_data_, is_enable_sparse));
for (const auto& feature : dataset->features_) {
features_.emplace_back(std::unique_ptr<Feature>(
new Feature(feature->feature_index(),
new BinMapper(*feature->bin_mapper()),
num_data_,
is_enable_sparse)
));
}
features_.shrink_to_fit();
num_class_ = dataset->num_class_;
used_feature_map_ = dataset->used_feature_map_;
num_features_ = static_cast<int>(features_.size());
......@@ -56,14 +58,6 @@ void Dataset::CopyFeatureMapperFrom(const Dataset* dataset, bool is_enable_spars
feature_names_ = dataset->feature_names_;
}
std::vector<const BinMapper*> Dataset::GetBinMappers() const {
std::vector<const BinMapper*> ret(num_total_features_, nullptr);
for (const auto feature : features_) {
ret[feature->feature_index()] = feature->bin_mapper();
}
return ret;
}
bool Dataset::SetFloatField(const char* field_name, const float* field_data, data_size_t num_element) {
std::string name(field_name);
name = Common::Trim(name);
......
......@@ -142,13 +142,13 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac
Please use an additional query file or pre-partition the data");
}
}
auto parser = Parser::CreateParser(filename, io_config_.has_header, 0, label_idx_);
auto parser = std::unique_ptr<Parser>(Parser::CreateParser(filename, io_config_.has_header, 0, label_idx_));
if (parser == nullptr) {
Log::Fatal("Could not recognize data format of %s", filename);
}
data_size_t num_global_data = 0;
std::vector<data_size_t> used_data_indices;
Dataset* dataset = new Dataset();
auto dataset = std::unique_ptr<Dataset>(new Dataset());
dataset->data_filename_ = filename;
dataset->num_class_ = io_config_.num_class;
dataset->metadata_.Init(filename, dataset->num_class_);
......@@ -161,11 +161,11 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac
// sample data
auto sample_data = SampleTextDataFromMemory(text_data);
// construct feature bin mappers
ConstructBinMappersFromTextData(rank, num_machines, sample_data, parser, dataset);
ConstructBinMappersFromTextData(rank, num_machines, sample_data, parser.get(), dataset.get());
// initialize label
dataset->metadata_.Init(dataset->num_data_, io_config_.num_class, weight_idx_, group_idx_);
// extract features
ExtractFeaturesFromMemory(text_data, parser, dataset);
ExtractFeaturesFromMemory(text_data, parser.get(), dataset.get());
text_data.clear();
} else {
// sample data from file
......@@ -176,38 +176,36 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac
dataset->num_data_ = num_global_data;
}
// construct feature bin mappers
ConstructBinMappersFromTextData(rank, num_machines, sample_data, parser, dataset);
ConstructBinMappersFromTextData(rank, num_machines, sample_data, parser.get(), dataset.get());
// initialize label
dataset->metadata_.Init(dataset->num_data_, dataset->num_class_, weight_idx_, group_idx_);
// extract features
ExtractFeaturesFromFile(filename, parser, used_data_indices, dataset);
ExtractFeaturesFromFile(filename, parser.get(), used_data_indices, dataset.get());
}
} else {
// load data from binary file
delete dataset;
std::string bin_filename(filename);
bin_filename.append(".bin");
dataset = LoadFromBinFile(bin_filename.c_str(), rank, num_machines);
dataset.reset(LoadFromBinFile(bin_filename.c_str(), rank, num_machines));
}
// check meta data
dataset->metadata_.CheckOrPartition(num_global_data, used_data_indices);
// need to check training data
CheckDataset(dataset);
delete parser;
return dataset;
CheckDataset(dataset.get());
return dataset.release();
}
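LoadFromFile shows the transitional idiom for an API that still returns a raw pointer: the dataset is built under a unique_ptr so every early path is cleaned up, and release() hands ownership out at the boundary. A sketch of the pattern, assuming the caller continues to own and delete the result:

#include <memory>

struct Dataset { /* parsed data */ };

// The unique_ptr guards all intermediate failure paths; release()
// preserves the caller-owns contract of the existing raw-pointer API.
Dataset* LoadDataset(bool from_binary) {
  auto dataset = std::unique_ptr<Dataset>(new Dataset());
  if (from_binary) {
    dataset.reset(new Dataset());  // swap in another source; old one is freed
  }
  // ... parse and validate; any early return frees dataset ...
  return dataset.release();  // ownership passes to the caller
}

int main() {
  std::unique_ptr<Dataset> d(LoadDataset(false));  // caller re-wraps it
}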
Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename, const Dataset* train_data) {
auto parser = Parser::CreateParser(filename, io_config_.has_header, 0, label_idx_);
auto parser = std::unique_ptr<Parser>(Parser::CreateParser(filename, io_config_.has_header, 0, label_idx_));
if (parser == nullptr) {
Log::Fatal("Could not recognize data format of %s", filename);
}
data_size_t num_global_data = 0;
std::vector<data_size_t> used_data_indices;
Dataset* dataset = new Dataset();
auto dataset = std::unique_ptr<Dataset>(new Dataset());
dataset->data_filename_ = filename;
dataset->num_class_ = io_config_.num_class;
dataset->metadata_.Init(filename, dataset->num_class_);
......@@ -221,7 +219,7 @@ Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename,
dataset->metadata_.Init(dataset->num_data_, dataset->num_class_, weight_idx_, group_idx_);
dataset->CopyFeatureMapperFrom(train_data, io_config_.is_enable_sparse);
// extract features
ExtractFeaturesFromMemory(text_data, parser, dataset);
ExtractFeaturesFromMemory(text_data, parser.get(), dataset.get());
text_data.clear();
} else {
TextReader<data_size_t> text_reader(filename, io_config_.has_header);
......@@ -232,24 +230,22 @@ Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename,
dataset->metadata_.Init(dataset->num_data_, dataset->num_class_, weight_idx_, group_idx_);
dataset->CopyFeatureMapperFrom(train_data, io_config_.is_enable_sparse);
// extract features
ExtractFeaturesFromFile(filename, parser, used_data_indices, dataset);
ExtractFeaturesFromFile(filename, parser.get(), used_data_indices, dataset.get());
}
} else {
// load data from binary file
delete dataset;
std::string bin_filename(filename);
bin_filename.append(".bin");
dataset = LoadFromBinFile(bin_filename.c_str(), 0, 1);
dataset.reset(LoadFromBinFile(bin_filename.c_str(), 0, 1));
}
// no need to check validation data
// check meta data
dataset->metadata_.CheckOrPartition(num_global_data, used_data_indices);
delete parser;
return dataset;
return dataset.release();
}
Dataset* DatasetLoader::LoadFromBinFile(const char* bin_filename, int rank, int num_machines) {
Dataset* dataset = new Dataset();
auto dataset = std::unique_ptr<Dataset>(new Dataset());
FILE* file;
#ifdef _MSC_VER
fopen_s(&file, bin_filename, "rb");
......@@ -263,31 +259,30 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* bin_filename, int rank, int
// buffer to read binary file
size_t buffer_size = 16 * 1024 * 1024;
char* buffer = new char[buffer_size];
auto buffer = std::vector<char>(buffer_size);
// read size of header
size_t read_cnt = fread(buffer, sizeof(size_t), 1, file);
size_t read_cnt = fread(buffer.data(), sizeof(size_t), 1, file);
if (read_cnt != 1) {
Log::Fatal("Binary file error: header has the wrong size");
}
size_t size_of_head = *(reinterpret_cast<size_t*>(buffer));
size_t size_of_head = *(reinterpret_cast<size_t*>(buffer.data()));
// re-allocate space if not enough
if (size_of_head > buffer_size) {
delete[] buffer;
buffer_size = size_of_head;
buffer = new char[buffer_size];
buffer.resize(buffer_size);
}
// read header
read_cnt = fread(buffer, 1, size_of_head, file);
read_cnt = fread(buffer.data(), 1, size_of_head, file);
if (read_cnt != size_of_head) {
Log::Fatal("Binary file error: header is incorrect");
}
// get header
const char* mem_ptr = buffer;
const char* mem_ptr = buffer.data();
dataset->num_data_ = *(reinterpret_cast<const data_size_t*>(mem_ptr));
mem_ptr += sizeof(dataset->num_data_);
dataset->num_class_ = *(reinterpret_cast<const int*>(mem_ptr));
......@@ -320,28 +315,27 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* bin_filename, int rank, int
}
// read size of meta data
read_cnt = fread(buffer, sizeof(size_t), 1, file);
read_cnt = fread(buffer.data(), sizeof(size_t), 1, file);
if (read_cnt != 1) {
Log::Fatal("Binary file error: meta data has the wrong size");
}
size_t size_of_metadata = *(reinterpret_cast<size_t*>(buffer));
size_t size_of_metadata = *(reinterpret_cast<size_t*>(buffer.data()));
// re-allocate space if not enough
if (size_of_metadata > buffer_size) {
delete[] buffer;
buffer_size = size_of_metadata;
buffer = new char[buffer_size];
buffer.resize(buffer_size);
}
// read meta data
read_cnt = fread(buffer, 1, size_of_metadata, file);
read_cnt = fread(buffer.data(), 1, size_of_metadata, file);
if (read_cnt != size_of_metadata) {
Log::Fatal("Binary file error: meta data is incorrect");
}
// load meta data
dataset->metadata_.LoadFromMemory(buffer);
dataset->metadata_.LoadFromMemory(buffer.data());
std::vector<data_size_t> used_data_indices;
data_size_t num_global_data = dataset->num_data_;
......@@ -383,40 +377,43 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* bin_filename, int rank, int
// read feature data
for (int i = 0; i < dataset->num_features_; ++i) {
// read feature size
read_cnt = fread(buffer, sizeof(size_t), 1, file);
read_cnt = fread(buffer.data(), sizeof(size_t), 1, file);
if (read_cnt != 1) {
Log::Fatal("Binary file error: feature %d has the wrong size", i);
}
size_t size_of_feature = *(reinterpret_cast<size_t*>(buffer));
size_t size_of_feature = *(reinterpret_cast<size_t*>(buffer.data()));
// re-allocate space if not enough
if (size_of_feature > buffer_size) {
delete[] buffer;
buffer_size = size_of_feature;
buffer = new char[buffer_size];
buffer.resize(buffer_size);
}
read_cnt = fread(buffer, 1, size_of_feature, file);
read_cnt = fread(buffer.data(), 1, size_of_feature, file);
if (read_cnt != size_of_feature) {
Log::Fatal("Binary file error: feature %d is incorrect, read count: %d", i, read_cnt);
}
dataset->features_.push_back(new Feature(buffer, num_global_data, used_data_indices));
dataset->features_.emplace_back(std::unique_ptr<Feature>(
new Feature(buffer.data(),
num_global_data,
used_data_indices)
));
}
delete[] buffer;
dataset->features_.shrink_to_fit();
fclose(file);
dataset->is_loading_from_binfile_ = true;
return dataset;
return dataset.release();
}
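The binary reader's scratch buffer is now a std::vector<char> that is resized on demand, replacing three separate delete[]/new[] pairs. A sketch of the read-loop shape for one length-prefixed record (file format details omitted; the std::fread usage mirrors the diff):

#include <cstdio>
#include <vector>

// Read one length-prefixed record, growing the shared buffer as needed.
// Returns false on a short read.
bool ReadRecord(FILE* file, std::vector<char>* buffer) {
  size_t size = 0;
  if (std::fread(&size, sizeof(size), 1, file) != 1) return false;
  if (size > buffer->size()) {
    buffer->resize(size);  // no manual free; capacity is kept for reuse
  }
  return std::fread(buffer->data(), 1, size, file) == size;
}

int main() {
  FILE* file = std::tmpfile();
  if (file == nullptr) return 1;
  size_t n = 3;
  std::fwrite(&n, sizeof(n), 1, file);
  std::fwrite("abc", 1, 3, file);
  std::rewind(file);
  std::vector<char> buffer(2);          // deliberately too small
  bool ok = ReadRecord(file, &buffer);  // buffer grows to 3 bytes
  std::fclose(file);
  return ok ? 0 : 1;
}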
Dataset* DatasetLoader::CostructFromSampleData(std::vector<std::vector<double>>& sample_values, size_t total_sample_size, data_size_t num_data) {
std::vector<BinMapper*> bin_mappers(sample_values.size());
std::vector<std::unique_ptr<BinMapper>> bin_mappers(sample_values.size());
#pragma omp parallel for schedule(guided)
for (int i = 0; i < static_cast<int>(sample_values.size()); ++i) {
bin_mappers[i] = new BinMapper();
bin_mappers[i].reset(new BinMapper());
bin_mappers[i]->FindBin(&sample_values[i], total_sample_size, io_config_.max_bin);
}
Dataset* dataset = new Dataset();
auto dataset = std::unique_ptr<Dataset>(new Dataset());
dataset->num_class_ = io_config_.num_class;
dataset->features_.clear();
dataset->num_data_ = num_data;
......@@ -429,14 +426,18 @@ Dataset* DatasetLoader::CostructFromSampleData(std::vector<std::vector<double>>&
// map real feature index to used feature index
dataset->used_feature_map_[i] = static_cast<int>(dataset->features_.size());
// push new feature
dataset->features_.push_back(new Feature(static_cast<int>(i), bin_mappers[i],
dataset->num_data_, io_config_.is_enable_sparse));
dataset->features_.emplace_back(std::unique_ptr<Feature>(
new Feature(static_cast<int>(i),
bin_mappers[i].release(),
dataset->num_data_,
io_config_.is_enable_sparse)
));
} else {
// if feature is trivial (only one bin), free space
Log::Warning("Ignoring Column_%d , only has one value", i);
delete bin_mappers[i];
}
}
dataset->features_.shrink_to_fit();
// fill feature_names_ if not header
if (feature_names_.size() <= 0) {
for (int i = 0; i < dataset->num_total_features_; ++i) {
......@@ -448,7 +449,7 @@ Dataset* DatasetLoader::CostructFromSampleData(std::vector<std::vector<double>>&
dataset->feature_names_ = feature_names_;
dataset->num_features_ = static_cast<int>(dataset->features_.size());
dataset->metadata_.Init(dataset->num_data_, dataset->num_class_, NO_SPECIFIC, NO_SPECIFIC);
return dataset;
return dataset.release();
}
......@@ -516,10 +517,10 @@ std::vector<std::string> DatasetLoader::SampleTextDataFromMemory(const std::vect
sample_cnt = data.size();
}
std::vector<size_t> sample_indices = random_.Sample(data.size(), sample_cnt);
std::vector<std::string> out;
std::vector<std::string> out(sample_indices.size());
for (size_t i = 0; i < sample_indices.size(); ++i) {
const size_t idx = sample_indices[i];
out.push_back(data[idx]);
out[i] = data[idx];
}
return out;
}
......@@ -616,15 +617,15 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
dataset->feature_names_ = feature_names_;
// start find bins
if (num_machines == 1) {
std::vector<BinMapper*> bin_mappers(sample_values.size());
std::vector<std::unique_ptr<BinMapper>> bin_mappers(sample_values.size());
// if only one machine, find bin locally
#pragma omp parallel for schedule(guided)
for (int i = 0; i < static_cast<int>(sample_values.size()); ++i) {
if (ignore_features_.count(i) > 0) {
bin_mappers[i] = nullptr;
bin_mappers[i].reset(nullptr);
continue;
}
bin_mappers[i] = new BinMapper();
bin_mappers[i].reset(new BinMapper());
bin_mappers[i]->FindBin(&sample_values[i], sample_data.size(), io_config_.max_bin);
}
......@@ -635,12 +636,15 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
// map real feature index to used feature index
dataset->used_feature_map_[i] = static_cast<int>(dataset->features_.size());
// push new feature
dataset->features_.push_back(new Feature(static_cast<int>(i), bin_mappers[i],
dataset->num_data_, io_config_.is_enable_sparse));
dataset->features_.emplace_back(std::unique_ptr<Feature>(
new Feature(static_cast<int>(i),
bin_mappers[i].release(),
dataset->num_data_,
io_config_.is_enable_sparse)
));
} else {
// if feature is trivial (only one bin), free space
Log::Warning("Ignoring feature %s, only has one value", feature_names_[i].c_str());
delete bin_mappers[i];
}
}
} else {
......@@ -649,8 +653,8 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
// start and len will store the process feature indices for different machines
// machine i will find bins for features in [ start[i], start[i] + len[i] )
int* start = new int[num_machines];
int* len = new int[num_machines];
std::vector<int> start(num_machines);
std::vector<int> len(num_machines);
int total_num_feature = static_cast<int>(sample_values.size());
int step = (total_num_feature + num_machines - 1) / num_machines;
if (step < 1) { step = 1; }
......@@ -665,17 +669,15 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
int type_size = BinMapper::SizeForSpecificBin(io_config_.max_bin);
// since sizes of different feature may not be same, we expand all bin mapper to type_size
int buffer_size = type_size * total_num_feature;
char* input_buffer = new char[buffer_size];
char* output_buffer = new char[buffer_size];
auto input_buffer = std::vector<char>(buffer_size);
auto output_buffer = std::vector<char>(buffer_size);
// find local feature bins and copy to buffer
#pragma omp parallel for schedule(guided)
for (int i = 0; i < len[rank]; ++i) {
BinMapper* bin_mapper = new BinMapper();
bin_mapper->FindBin(&sample_values[start[rank] + i], sample_data.size(), io_config_.max_bin);
bin_mapper->CopyTo(input_buffer + i * type_size);
// don't need this any more
delete bin_mapper;
BinMapper bin_mapper;
bin_mapper.FindBin(&sample_values[start[rank] + i], sample_data.size(), io_config_.max_bin);
bin_mapper.CopyTo(input_buffer.data() + i * type_size);
}
// convert to binary size
for (int i = 0; i < num_machines; ++i) {
......@@ -683,29 +685,29 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
len[i] *= type_size;
}
// gather global feature bin mappers
Network::Allgather(input_buffer, buffer_size, start, len, output_buffer);
Network::Allgather(input_buffer.data(), buffer_size, start.data(), len.data(), output_buffer.data());
// restore features bins from buffer
for (int i = 0; i < total_num_feature; ++i) {
if (ignore_features_.count(i) > 0) {
Log::Warning("Ignoring feature %s", feature_names_[i].c_str());
continue;
}
BinMapper* bin_mapper = new BinMapper();
bin_mapper->CopyFrom(output_buffer + i * type_size);
auto bin_mapper = std::unique_ptr<BinMapper>(new BinMapper());
bin_mapper->CopyFrom(output_buffer.data() + i * type_size);
if (!bin_mapper->is_trival()) {
dataset->used_feature_map_[i] = static_cast<int>(dataset->features_.size());
dataset->features_.push_back(new Feature(static_cast<int>(i), bin_mapper, dataset->num_data_, io_config_.is_enable_sparse));
dataset->features_.emplace_back(std::unique_ptr<Feature>(
new Feature(static_cast<int>(i),
bin_mapper.release(),
dataset->num_data_,
io_config_.is_enable_sparse)
));
} else {
Log::Warning("Ignoring feature %s, only has one value", feature_names_[i].c_str());
delete bin_mapper;
}
}
// free buffer
delete[] start;
delete[] len;
delete[] input_buffer;
delete[] output_buffer;
}
dataset->features_.shrink_to_fit();
dataset->num_features_ = static_cast<int>(dataset->features_.size());
}
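In the multi-machine branch, each rank serializes its local BinMappers into fixed type_size slots of one flat char vector, converts feature counts into byte offsets, and lets Network::Allgather (LightGBM's own primitive, not reproduced here) concatenate the slices. A single-rank sketch of just the slot arithmetic, with the network call stubbed:

#include <cstring>
#include <vector>

// Stub for Network::Allgather: with one machine the gathered buffer
// is simply a copy of the local one.
void AllgatherStub(const char* input, int size, char* output) {
  std::memcpy(output, input, size);
}

int main() {
  const int total_num_feature = 4;
  const int type_size = 32;  // fixed serialized size per BinMapper
  std::vector<char> input_buffer(total_num_feature * type_size);
  std::vector<char> output_buffer(input_buffer.size());
  for (int i = 0; i < total_num_feature; ++i) {
    // each local mapper writes into its own fixed-size slot, as
    // bin_mapper.CopyTo(input_buffer.data() + i * type_size) does
    std::memcpy(input_buffer.data() + i * type_size, &i, sizeof(i));
  }
  AllgatherStub(input_buffer.data(),
                static_cast<int>(input_buffer.size()),
                output_buffer.data());
  int slot0 = -1;
  std::memcpy(&slot0, output_buffer.data(), sizeof(slot0));
  return slot0;  // 0: slot 0 round-tripped through the gather
}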
......@@ -745,7 +747,7 @@ void DatasetLoader::ExtractFeaturesFromMemory(std::vector<std::string>& text_dat
}
} else {
// if need to prediction with initial model
float* init_score = new float[dataset->num_data_ * dataset->num_class_];
std::vector<score_t> init_score(dataset->num_data_ * dataset->num_class_);
#pragma omp parallel for schedule(guided) private(oneline_features) firstprivate(tmp_label)
for (data_size_t i = 0; i < dataset->num_data_; ++i) {
const int tid = omp_get_thread_num();
......@@ -780,8 +782,7 @@ void DatasetLoader::ExtractFeaturesFromMemory(std::vector<std::string>& text_dat
}
}
// metadata_ will manage space of init_score
dataset->metadata_.SetInitScore(init_score, dataset->num_data_ * dataset->num_class_);
delete[] init_score;
dataset->metadata_.SetInitScore(init_score.data(), dataset->num_data_ * dataset->num_class_);
}
dataset->FinishLoad();
// text data can be free after loaded feature values
......@@ -790,9 +791,9 @@ void DatasetLoader::ExtractFeaturesFromMemory(std::vector<std::string>& text_dat
/*! \brief Extract local features from file */
void DatasetLoader::ExtractFeaturesFromFile(const char* filename, const Parser* parser, const std::vector<data_size_t>& used_data_indices, Dataset* dataset) {
float* init_score = nullptr;
std::vector<score_t> init_score;
if (predict_fun_ != nullptr) {
init_score = new float[dataset->num_data_ * dataset->num_class_];
init_score = std::vector<score_t>(dataset->num_data_ * dataset->num_class_);
}
std::function<void(data_size_t, const std::vector<std::string>&)> process_fun =
[this, &init_score, &parser, &dataset]
......@@ -806,7 +807,7 @@ void DatasetLoader::ExtractFeaturesFromFile(const char* filename, const Parser*
// parser
parser->ParseOneLine(lines[i].c_str(), &oneline_features, &tmp_label);
// set initial score
if (init_score != nullptr) {
if (init_score.size() > 0) {
std::vector<double> oneline_init_score = predict_fun_(oneline_features);
for (int k = 0; k < dataset->num_class_; ++k) {
init_score[k * dataset->num_data_ + start_idx + i] = static_cast<float>(oneline_init_score[k]);
......@@ -841,9 +842,8 @@ void DatasetLoader::ExtractFeaturesFromFile(const char* filename, const Parser*
}
// metadata_ will manage space of init_score
if (init_score != nullptr) {
dataset->metadata_.SetInitScore(init_score, dataset->num_data_ * dataset->num_class_);
delete[] init_score;
if (init_score.size() > 0) {
dataset->metadata_.SetInitScore(init_score.data(), dataset->num_data_ * dataset->num_class_);
}
dataset->FinishLoad();
}
......
......@@ -18,19 +18,12 @@ class DenseBin: public Bin {
public:
explicit DenseBin(data_size_t num_data, int default_bin)
: num_data_(num_data) {
data_ = new VAL_T[num_data_];
if (default_bin == 0) {
std::memset(data_, 0, sizeof(VAL_T)*num_data_);
} else {
VAL_T default_bin_T = static_cast<VAL_T>(default_bin);
for (data_size_t i = 0; i < num_data_; ++i) {
data_[i] = default_bin_T;
}
}
data_.resize(num_data_);
VAL_T default_bin_T = static_cast<VAL_T>(default_bin);
std::fill(data_.begin(), data_.end(), default_bin_T);
}
~DenseBin() {
delete[] data_;
}
void Push(int, data_size_t idx, uint32_t value) override {
......@@ -43,7 +36,7 @@ public:
BinIterator* GetIterator(data_size_t start_idx) const override;
void ConstructHistogram(data_size_t* data_indices, data_size_t num_data,
void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data,
const score_t* ordered_gradients, const score_t* ordered_hessians,
HistogramBinEntry* out) const override {
// use 4-way unrolling, will be faster
......@@ -146,7 +139,7 @@ public:
}
void SaveBinaryToFile(FILE* file) const override {
fwrite(data_, sizeof(VAL_T), num_data_, file);
fwrite(data_.data(), sizeof(VAL_T), num_data_, file);
}
size_t SizesInByte() const override {
......@@ -155,7 +148,7 @@ public:
private:
data_size_t num_data_;
VAL_T* data_;
std::vector<VAL_T> data_;
};
template <typename VAL_T>
......
......@@ -7,11 +7,7 @@
namespace LightGBM {
Metadata::Metadata()
:label_(nullptr), weights_(nullptr),
query_boundaries_(nullptr),
query_weights_(nullptr), init_score_(nullptr), queries_(nullptr){
Metadata::Metadata() {
}
void Metadata::Init(const char * data_filename, const int num_class) {
......@@ -27,36 +23,30 @@ void Metadata::Init(const char * data_filename, const int num_class) {
Metadata::~Metadata() {
if (label_ != nullptr) { delete[] label_; }
if (weights_ != nullptr) { delete[] weights_; }
if (query_boundaries_ != nullptr) { delete[] query_boundaries_; }
if (query_weights_ != nullptr) { delete[] query_weights_; }
if (init_score_ != nullptr) { delete[] init_score_; }
if (queries_ != nullptr) { delete[] queries_; }
}
void Metadata::Init(data_size_t num_data, int num_class, int weight_idx, int query_idx) {
num_data_ = num_data;
num_class_ = num_class;
label_ = new float[num_data_];
label_ = std::vector<float>(num_data_);
if (weight_idx >= 0) {
if (weights_ != nullptr) {
if (weights_.size() > 0) {
Log::Info("Using weights in data file, ignoring the additional weights file");
delete[] weights_;
weights_.clear();
}
weights_ = new float[num_data_];
weights_ = std::vector<float>(num_data_);
num_weights_ = num_data_;
memset(weights_, 0, sizeof(float) * num_data_);
std::fill(weights_.begin(), weights_.end(), 0.0f);
}
if (query_idx >= 0) {
if (query_boundaries_ != nullptr) {
if (query_boundaries_.size() > 0) {
Log::Info("Using query id in data file, ignoring the additional query file");
delete[] query_boundaries_;
query_boundaries_.clear();
}
if (query_weights_ != nullptr) { delete[] query_weights_; }
queries_ = new data_size_t[num_data_];
memset(queries_, 0, sizeof(data_size_t) * num_data_);
if (query_weights_.size() > 0) { query_weights_.clear(); }
queries_ = std::vector<data_size_t>(num_data_);
std::fill(queries_.begin(), queries_.end(), 0);
}
}
......@@ -64,18 +54,18 @@ void Metadata::PartitionLabel(const std::vector<data_size_t>& used_indices) {
if (used_indices.size() <= 0) {
return;
}
float* old_label = label_;
auto old_label = label_;
num_data_ = static_cast<data_size_t>(used_indices.size());
label_ = new float[num_data_];
label_ = std::vector<float>(num_data_);
for (data_size_t i = 0; i < num_data_; ++i) {
label_[i] = old_label[used_indices[i]];
}
delete[] old_label;
old_label.clear();
}
void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector<data_size_t>& used_data_indices) {
if (used_data_indices.size() == 0) {
if (queries_ != nullptr) {
if (queries_.size() > 0) {
// need to convert query_id to boundaries
std::vector<data_size_t> tmp_buffer;
data_size_t last_qid = -1;
......@@ -91,77 +81,70 @@ void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector<data
++cur_cnt;
}
tmp_buffer.push_back(cur_cnt);
query_boundaries_ = new data_size_t[tmp_buffer.size() + 1];
query_boundaries_ = std::vector<data_size_t>(tmp_buffer.size() + 1);
num_queries_ = static_cast<data_size_t>(tmp_buffer.size());
query_boundaries_[0] = 0;
for (size_t i = 0; i < tmp_buffer.size(); ++i) {
query_boundaries_[i + 1] = query_boundaries_[i] + tmp_buffer[i];
}
LoadQueryWeights();
delete[] queries_;
queries_ = nullptr;
queries_.clear();
}
// check weights
if (weights_ != nullptr && num_weights_ != num_data_) {
delete[] weights_;
if (weights_.size() > 0 && num_weights_ != num_data_) {
weights_.clear();
num_weights_ = 0;
weights_ = nullptr;
Log::Fatal("Weights size doesn't match data size");
}
// check query boundaries
if (query_boundaries_ != nullptr && query_boundaries_[num_queries_] != num_data_) {
delete[] query_boundaries_;
if (query_boundaries_.size() > 0 && query_boundaries_[num_queries_] != num_data_) {
query_boundaries_.clear();
num_queries_ = 0;
query_boundaries_ = nullptr;
Log::Fatal("Query size doesn't match data size");
}
// contain initial score file
if (init_score_ != nullptr && num_init_score_ != num_data_) {
delete[] init_score_;
init_score_ = nullptr;
if (init_score_.size() > 0 && num_init_score_ != num_data_) {
init_score_.clear();
num_init_score_ = 0;
Log::Fatal("Initial score size doesn't match data size");
}
} else {
data_size_t num_used_data = static_cast<data_size_t>(used_data_indices.size());
// check weights
if (weights_ != nullptr && num_weights_ != num_all_data) {
delete[] weights_;
if (weights_.size() > 0 && num_weights_ != num_all_data) {
weights_.clear();
num_weights_ = 0;
weights_ = nullptr;
Log::Fatal("Weights size doesn't match data size");
}
// check query boundaries
if (query_boundaries_ != nullptr && query_boundaries_[num_queries_] != num_all_data) {
delete[] query_boundaries_;
if (query_boundaries_.size() > 0 && query_boundaries_[num_queries_] != num_all_data) {
query_boundaries_.clear();
num_queries_ = 0;
query_boundaries_ = nullptr;
Log::Fatal("Query size doesn't match data size");
}
// contain initial score file
if (init_score_ != nullptr && num_init_score_ != num_all_data) {
delete[] init_score_;
if (init_score_.size() > 0 && num_init_score_ != num_all_data) {
init_score_.clear();
num_init_score_ = 0;
init_score_ = nullptr;
Log::Fatal("Initial score size doesn't match data size");
}
// get local weights
if (weights_ != nullptr) {
float* old_weights = weights_;
if (weights_.size() > 0) {
auto old_weights = weights_;
num_weights_ = num_data_;
weights_ = new float[num_data_];
weights_ = std::vector<float>(num_data_);
for (size_t i = 0; i < used_data_indices.size(); ++i) {
weights_[i] = old_weights[used_data_indices[i]];
}
delete[] old_weights;
old_weights.clear();
}
// get local query boundaries
if (query_boundaries_ != nullptr) {
if (query_boundaries_.size() > 0) {
std::vector<data_size_t> used_query;
data_size_t data_idx = 0;
for (data_size_t qid = 0; qid < num_queries_ && data_idx < num_used_data; ++qid) {
......@@ -181,8 +164,8 @@ void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector<data
Log::Fatal("Data partition error, data didn't match queries");
}
}
data_size_t * old_query_boundaries = query_boundaries_;
query_boundaries_ = new data_size_t[used_query.size() + 1];
auto old_query_boundaries = query_boundaries_;
query_boundaries_ = std::vector<data_size_t>(used_query.size() + 1);
num_queries_ = static_cast<data_size_t>(used_query.size());
query_boundaries_[0] = 0;
for (data_size_t i = 0; i < num_queries_; ++i) {
......@@ -190,20 +173,20 @@ void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector<data
data_size_t len = old_query_boundaries[qid + 1] - old_query_boundaries[qid];
query_boundaries_[i + 1] = query_boundaries_[i] + len;
}
delete[] old_query_boundaries;
old_query_boundaries.clear();
}
// get local initial scores
if (init_score_ != nullptr) {
float* old_scores = init_score_;
if (init_score_.size() > 0) {
auto old_scores = init_score_;
num_init_score_ = num_data_;
init_score_ = new float[num_init_score_ * num_class_];
init_score_ = std::vector<float>(num_init_score_ * num_class_);
for (int k = 0; k < num_class_; ++k){
for (size_t i = 0; i < used_data_indices.size(); ++i) {
init_score_[k * num_data_ + i] = old_scores[k * num_all_data + used_data_indices[i]];
}
}
delete[] old_scores;
old_scores.clear();
}
// re-load query weight
......@@ -216,9 +199,9 @@ void Metadata::SetInitScore(const float* init_score, data_size_t len) {
if (len != num_data_ * num_class_) {
Log::Fatal("Initial score size doesn't match data size");
}
if (init_score_ != nullptr) { delete[] init_score_; }
if (init_score_.size() > 0) { init_score_.clear(); }
num_init_score_ = num_data_;
init_score_ = new float[len];
init_score_ = std::vector<float>(len);
for (data_size_t i = 0; i < len; ++i) {
init_score_[i] = init_score[i];
}
......@@ -228,8 +211,8 @@ void Metadata::SetLabel(const float* label, data_size_t len) {
if (num_data_ != len) {
Log::Fatal("len of label is not same with #data");
}
if (label_ != nullptr) { delete[] label_; }
label_ = new float[num_data_];
if (label_.size() > 0) { label_.clear(); }
label_ = std::vector<float>(num_data_);
for (data_size_t i = 0; i < num_data_; ++i) {
label_[i] = label[i];
}
......@@ -239,9 +222,9 @@ void Metadata::SetWeights(const float* weights, data_size_t len) {
if (num_data_ != len) {
Log::Fatal("len of weights is not same with #data");
}
if (weights_ != nullptr) { delete[] weights_; }
if (weights_.size() > 0) { weights_.clear(); }
num_weights_ = num_data_;
weights_ = new float[num_weights_];
weights_ = std::vector<float>(num_weights_);
for (data_size_t i = 0; i < num_weights_; ++i) {
weights_[i] = weights[i];
}
......@@ -256,9 +239,9 @@ void Metadata::SetQueryBoundaries(const data_size_t* query_boundaries, data_size
if (num_data_ != sum) {
Log::Fatal("sum of query counts is not same with #data");
}
if (query_boundaries_ != nullptr) { delete[] query_boundaries_; }
if (query_boundaries_.size() > 0) { query_boundaries_.clear(); }
num_queries_ = len;
query_boundaries_ = new data_size_t[num_queries_];
query_boundaries_ = std::vector<data_size_t>(num_queries_);
for (data_size_t i = 0; i < num_queries_; ++i) {
query_boundaries_[i] = query_boundaries[i];
}
......@@ -278,7 +261,7 @@ void Metadata::LoadWeights() {
}
Log::Info("Loading weights...");
num_weights_ = static_cast<data_size_t>(reader.Lines().size());
weights_ = new float[num_weights_];
weights_ = std::vector<float>(num_weights_);
for (data_size_t i = 0; i < num_weights_; ++i) {
double tmp_weight = 0.0f;
Common::Atof(reader.Lines()[i].c_str(), &tmp_weight);
......@@ -299,7 +282,7 @@ void Metadata::LoadInitialScore() {
Log::Info("Loading initial scores...");
num_init_score_ = static_cast<data_size_t>(reader.Lines().size());
init_score_ = new float[num_init_score_ * num_class_];
init_score_ = std::vector<float>(num_init_score_ * num_class_);
double tmp = 0.0f;
if (num_class_ == 1){
......@@ -333,7 +316,7 @@ void Metadata::LoadQueryBoundaries() {
return;
}
Log::Info("Loading query boundaries...");
query_boundaries_ = new data_size_t[reader.Lines().size() + 1];
query_boundaries_ = std::vector<data_size_t>(reader.Lines().size() + 1);
num_queries_ = static_cast<data_size_t>(reader.Lines().size());
query_boundaries_[0] = 0;
for (size_t i = 0; i < reader.Lines().size(); ++i) {
......@@ -344,11 +327,12 @@ void Metadata::LoadQueryBoundaries() {
}
void Metadata::LoadQueryWeights() {
if (weights_ == nullptr || query_boundaries_ == nullptr) {
if (weights_.size() == 0 || query_boundaries_.size() == 0) {
return;
}
query_weights_.clear();
Log::Info("Loading query weights...");
query_weights_ = new float[num_queries_];
query_weights_ = std::vector<float>(num_queries_);
for (data_size_t i = 0; i < num_queries_; ++i) {
query_weights_[i] = 0.0f;
for (data_size_t j = query_boundaries_[i]; j < query_boundaries_[i + 1]; ++j) {
......@@ -368,44 +352,36 @@ void Metadata::LoadFromMemory(const void* memory) {
num_queries_ = *(reinterpret_cast<const data_size_t*>(mem_ptr));
mem_ptr += sizeof(num_queries_);
if (label_ != nullptr) { delete[] label_; }
label_ = new float[num_data_];
std::memcpy(label_, mem_ptr, sizeof(float)*num_data_);
if (label_.size() > 0) { label_.clear(); }
label_ = std::vector<float>(num_data_);
std::memcpy(label_.data(), mem_ptr, sizeof(float)*num_data_);
mem_ptr += sizeof(float)*num_data_;
if (num_weights_ > 0) {
if (weights_ != nullptr) { delete[] weights_; }
weights_ = new float[num_weights_];
std::memcpy(weights_, mem_ptr, sizeof(float)*num_weights_);
if (weights_.size() > 0) { weights_.clear(); }
weights_ = std::vector<float>(num_weights_);
std::memcpy(weights_.data(), mem_ptr, sizeof(float)*num_weights_);
mem_ptr += sizeof(float)*num_weights_;
}
if (num_queries_ > 0) {
if (query_boundaries_ != nullptr) { delete[] query_boundaries_; }
query_boundaries_ = new data_size_t[num_queries_ + 1];
std::memcpy(query_boundaries_, mem_ptr, sizeof(data_size_t)*(num_queries_ + 1));
if (query_boundaries_.size() > 0) { query_boundaries_.clear(); }
query_boundaries_ = std::vector<data_size_t>(num_queries_ + 1);
std::memcpy(query_boundaries_.data(), mem_ptr, sizeof(data_size_t)*(num_queries_ + 1));
mem_ptr += sizeof(data_size_t)*(num_queries_ + 1);
}
if (num_weights_ > 0 && num_queries_ > 0) {
if (query_weights_ != nullptr) { delete[] query_weights_; }
query_weights_ = new float[num_queries_];
std::memcpy(query_weights_, mem_ptr, sizeof(float)*num_queries_);
mem_ptr += sizeof(float)*num_queries_;
}
LoadQueryWeights();
}
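Two things happen in this hunk: raw buffers become vectors fed through .data(), and the query_weights_ block disappears from the binary layout, since the retained LoadQueryWeights() call rebuilds it from weights_ and query_boundaries_. A sketch of the memcpy-into-vector idiom, as an invented helper rather than actual source:

#include <cstring>
#include <vector>

// Illustrative helper: copy n floats out of a binary blob into an
// owning vector, advancing the cursor the way the loader does.
std::vector<float> ReadFloatBlock(const char*& mem_ptr, int n) {
  std::vector<float> out(n);
  std::memcpy(out.data(), mem_ptr, sizeof(float) * n);  // .data() replaces the raw pointer
  mem_ptr += sizeof(float) * n;
  return out;
}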
void Metadata::SaveBinaryToFile(FILE* file) const {
fwrite(&num_data_, sizeof(num_data_), 1, file);
fwrite(&num_weights_, sizeof(num_weights_), 1, file);
fwrite(&num_queries_, sizeof(num_queries_), 1, file);
fwrite(label_, sizeof(float), num_data_, file);
if (weights_ != nullptr) {
fwrite(weights_, sizeof(float), num_weights_, file);
fwrite(label_.data(), sizeof(float), num_data_, file);
if (weights_.size() > 0) {
fwrite(weights_.data(), sizeof(float), num_weights_, file);
}
if (query_boundaries_ != nullptr) {
fwrite(query_boundaries_, sizeof(data_size_t), num_queries_ + 1, file);
}
if (query_weights_ != nullptr) {
fwrite(query_weights_, sizeof(float), num_queries_, file);
if (query_boundaries_.size() > 0) {
fwrite(query_boundaries_.data(), sizeof(data_size_t), num_queries_ + 1, file);
}
}
......@@ -414,15 +390,12 @@ size_t Metadata::SizesInByte() const {
size_t size = sizeof(num_data_) + sizeof(num_weights_)
+ sizeof(num_queries_);
size += sizeof(float) * num_data_;
if (weights_ != nullptr) {
if (weights_.size() > 0) {
size += sizeof(float) * num_weights_;
}
if (query_boundaries_ != nullptr) {
if (query_boundaries_.size() > 0) {
size += sizeof(data_size_t) * (num_queries_ + 1);
}
if (query_weights_ != nullptr) {
size += sizeof(float) * num_queries_;
}
return size;
}
......
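In the save path and the size accounting, the presence test switches from a null-pointer check to a size check, and query_weights_ drops out of both, consistent with the load path recomputing it. A small sketch of the emptiness idiom, with an invented function name:

#include <cstdio>
#include <vector>

// Sketch: an empty vector now plays the role the null pointer used to.
void WriteOptional(FILE* file, const std::vector<float>& weights) {
  if (!weights.empty()) {                       // was: weights_ != nullptr
    fwrite(weights.data(), sizeof(float), weights.size(), file);
  }
}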
......@@ -68,7 +68,7 @@ public:
data_size_t cur_pos = 0;
for (size_t i = 0; i < vals_.size(); ++i) {
cur_pos += delta_[i];
if (vals_[i] > 0 && used_idices[cur_pos] != 0) {
if (vals_[i] > 0 && used_idices[cur_pos]) {
ordered_pair_[j].ridx = cur_pos;
ordered_pair_[j].bin = vals_[i];
++j;
......@@ -101,7 +101,7 @@ public:
data_size_t new_left_end = l_start;
for (data_size_t i = l_start; i < l_end; ++i) {
if (left_indices[ordered_pair_[i].ridx] != 0) {
if (left_indices[ordered_pair_[i].ridx]) {
std::swap(ordered_pair_[new_left_end], ordered_pair_[i]);
++new_left_end;
}
......
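The dropped "!= 0" comparisons work because the flag arrays are std::vector<char>, and a char converts to bool directly. A plausible reason for char over std::vector<bool>, stated as an assumption: vector<bool> packs eight flags per byte, so two threads writing adjacent flags touch the same byte, which is a data race. A self-contained illustration with invented names and sizes:

#include <vector>

int main() {
  // One char per flag keeps concurrent writers on separate bytes,
  // at the cost of eight times the memory of a packed bitset.
  std::vector<char> left_indices(1024, 0);
  left_indices[42] = 1;
  if (left_indices[42]) {   // a char tests directly; no "!= 0" needed
    // ...
  }
}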
......@@ -3,6 +3,7 @@
#include <iostream>
#include <fstream>
#include <functional>
#include <memory>
namespace LightGBM {
......@@ -122,24 +123,24 @@ Parser* Parser::CreateParser(const char* filename, bool has_header, int num_feat
if (type == DataType::INVALID) {
Log::Fatal("Unknown format of training data");
}
Parser* ret = nullptr;
std::unique_ptr<Parser> ret;
if (type == DataType::LIBSVM) {
label_idx = GetLabelIdxForLibsvm(line1, num_features, label_idx);
ret = new LibSVMParser(label_idx);
ret.reset(new LibSVMParser(label_idx));
}
else if (type == DataType::TSV) {
label_idx = GetLabelIdxForTSV(line1, num_features, label_idx);
ret = new TSVParser(label_idx);
ret.reset(new TSVParser(label_idx));
}
else if (type == DataType::CSV) {
label_idx = GetLabelIdxForCSV(line1, num_features, label_idx);
ret = new CSVParser(label_idx);
ret.reset(new CSVParser(label_idx));
}
if (label_idx < 0) {
Log::Info("Data file %s doesn't contain a label column", filename);
}
return ret;
return ret.release();
}
} // namespace LightGBM
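CreateParser keeps its raw Parser* return type, so existing callers are untouched; only the body becomes leak-safe, by building under a unique_ptr and handing ownership back with release(). A generic sketch of that factory pattern, with invented types standing in for the parser hierarchy:

#include <memory>

struct Base { virtual ~Base() = default; };
struct DerivedA : Base {};
struct DerivedB : Base {};

// Illustrative factory: no early return or thrown exception can leak
// the allocation, yet the raw-pointer signature is preserved.
Base* Create(bool want_a) {
  std::unique_ptr<Base> ret;
  if (want_a) {
    ret.reset(new DerivedA());
  } else {
    ret.reset(new DerivedB());
  }
  return ret.release();   // ownership transfers back to the caller
}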
......@@ -51,7 +51,7 @@ public:
BinIterator* GetIterator(data_size_t start_idx) const override;
void ConstructHistogram(data_size_t*, data_size_t , const score_t* ,
void ConstructHistogram(const data_size_t*, data_size_t , const score_t* ,
const score_t* , HistogramBinEntry*) const override {
// Will use OrderedSparseBin->ConstructHistogram() instead
Log::Fatal("Using OrderedSparseBin->ConstructHistogram() instead");
......
......@@ -11,42 +11,32 @@
#include <functional>
#include <vector>
#include <string>
#include <memory>
namespace LightGBM {
Tree::Tree(int max_leaves)
:max_leaves_(max_leaves) {
num_leaves_ = 0;
left_child_ = new int[max_leaves_ - 1];
right_child_ = new int[max_leaves_ - 1];
split_feature_ = new int[max_leaves_ - 1];
split_feature_real_ = new int[max_leaves_ - 1];
threshold_in_bin_ = new unsigned int[max_leaves_ - 1];
threshold_ = new double[max_leaves_ - 1];
split_gain_ = new double[max_leaves_ - 1];
leaf_parent_ = new int[max_leaves_];
leaf_value_ = new double[max_leaves_];
internal_value_ = new double[max_leaves_ - 1];
leaf_depth_ = new int[max_leaves_];
num_leaves_ = 0;
left_child_ = std::vector<int>(max_leaves_ - 1);
right_child_ = std::vector<int>(max_leaves_ - 1);
split_feature_ = std::vector<int>(max_leaves_ - 1);
split_feature_real_ = std::vector<int>(max_leaves_ - 1);
threshold_in_bin_ = std::vector<unsigned int>(max_leaves_ - 1);
threshold_ = std::vector<double>(max_leaves_ - 1);
split_gain_ = std::vector<double>(max_leaves_ - 1);
leaf_parent_ = std::vector<int>(max_leaves_);
leaf_value_ = std::vector<double>(max_leaves_);
internal_value_ = std::vector<double>(max_leaves_ - 1);
leaf_depth_ = std::vector<int>(max_leaves_);
// root is in the depth 1
leaf_depth_[0] = 1;
num_leaves_ = 1;
leaf_parent_[0] = -1;
}
Tree::~Tree() {
if (leaf_parent_ != nullptr) { delete[] leaf_parent_; }
if (left_child_ != nullptr) { delete[] left_child_; }
if (right_child_ != nullptr) { delete[] right_child_; }
if (split_feature_ != nullptr) { delete[] split_feature_; }
if (split_feature_real_ != nullptr) { delete[] split_feature_real_; }
if (threshold_in_bin_ != nullptr) { delete[] threshold_in_bin_; }
if (threshold_ != nullptr) { delete[] threshold_; }
if (split_gain_ != nullptr) { delete[] split_gain_; }
if (leaf_value_ != nullptr) { delete[] leaf_value_; }
if (internal_value_ != nullptr) { delete[] internal_value_; }
if (leaf_depth_ != nullptr) { delete[] leaf_depth_; }
}
int Tree::Split(int leaf, int feature, unsigned int threshold_bin, int real_feature,
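Once every Tree member manages its own storage, the user-declared destructor has nothing left to do and is deleted outright, the "rule of zero". A toy sketch of the shape of the change (ToyTree is illustrative, not LightGBM's Tree):

#include <vector>

class ToyTree {
 public:
  explicit ToyTree(int max_leaves)
      : left_child_(max_leaves - 1),
        right_child_(max_leaves - 1),
        leaf_value_(max_leaves) {}
  // No user-declared destructor: the implicit one releases all three
  // vectors, which is exactly why ~Tree() could be removed above.
 private:
  std::vector<int> left_child_;
  std::vector<int> right_child_;
  std::vector<double> leaf_value_;
};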
......@@ -88,9 +78,9 @@ int Tree::Split(int leaf, int feature, unsigned int threshold_bin, int real_feat
void Tree::AddPredictionToScore(const Dataset* data, data_size_t num_data, score_t* score) const {
Threading::For<data_size_t>(0, num_data, [this, data, score](int, data_size_t start, data_size_t end) {
std::vector<BinIterator*> iterators;
std::vector<std::unique_ptr<BinIterator>> iterators(data->num_features());
for (int i = 0; i < data->num_features(); ++i) {
iterators.push_back(data->FeatureAt(i)->bin_data()->GetIterator(start));
iterators[i].reset(data->FeatureAt(i)->bin_data()->GetIterator(start));
}
for (data_size_t i = start; i < end; ++i) {
score[i] += static_cast<score_t>(leaf_value_[GetLeaf(iterators, i)]);
......@@ -102,9 +92,9 @@ void Tree::AddPredictionToScore(const Dataset* data, const data_size_t* used_dat
data_size_t num_data, score_t* score) const {
Threading::For<data_size_t>(0, num_data,
[this, data, used_data_indices, score](int, data_size_t start, data_size_t end) {
std::vector<BinIterator*> iterators;
std::vector<std::unique_ptr<BinIterator>> iterators(data->num_features());
for (int i = 0; i < data->num_features(); ++i) {
iterators.push_back(data->FeatureAt(i)->bin_data()->GetIterator(used_data_indices[start]));
iterators[i].reset(data->FeatureAt(i)->bin_data()->GetIterator(used_data_indices[start]));
}
for (data_size_t i = start; i < end; ++i) {
score[used_data_indices[i]] += static_cast<score_t>(leaf_value_[GetLeaf(iterators, used_data_indices[i])]);
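In both AddPredictionToScore overloads, GetIterator() returns a heap-allocated BinIterator that the old code collected as raw pointers; adopting each into a pre-sized vector of unique_ptrs inside the per-thread lambda guarantees cleanup when the lambda exits. A self-contained sketch with an invented Iter type and factory:

#include <memory>
#include <vector>

struct Iter { virtual ~Iter() = default; };
Iter* MakeIter() { return new Iter(); }   // factory returning an owning raw pointer

int main() {
  const int n = 4;
  std::vector<std::unique_ptr<Iter>> iters(n);  // pre-sized, all null
  for (int i = 0; i < n; ++i) {
    iters[i].reset(MakeIter());  // adopt; freed when the vector dies
  }
}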
......@@ -116,21 +106,21 @@ std::string Tree::ToString() {
std::stringstream ss;
ss << "num_leaves=" << num_leaves_ << std::endl;
ss << "split_feature="
<< Common::ArrayToString<int>(split_feature_real_, num_leaves_ - 1, ' ') << std::endl;
<< Common::ArrayToString<int>(split_feature_real_.data(), num_leaves_ - 1, ' ') << std::endl;
ss << "split_gain="
<< Common::ArrayToString<double>(split_gain_, num_leaves_ - 1, ' ') << std::endl;
<< Common::ArrayToString<double>(split_gain_.data(), num_leaves_ - 1, ' ') << std::endl;
ss << "threshold="
<< Common::ArrayToString<double>(threshold_, num_leaves_ - 1, ' ') << std::endl;
<< Common::ArrayToString<double>(threshold_.data(), num_leaves_ - 1, ' ') << std::endl;
ss << "left_child="
<< Common::ArrayToString<int>(left_child_, num_leaves_ - 1, ' ') << std::endl;
<< Common::ArrayToString<int>(left_child_.data(), num_leaves_ - 1, ' ') << std::endl;
ss << "right_child="
<< Common::ArrayToString<int>(right_child_, num_leaves_ - 1, ' ') << std::endl;
<< Common::ArrayToString<int>(right_child_.data(), num_leaves_ - 1, ' ') << std::endl;
ss << "leaf_parent="
<< Common::ArrayToString<int>(leaf_parent_, num_leaves_, ' ') << std::endl;
<< Common::ArrayToString<int>(leaf_parent_.data(), num_leaves_, ' ') << std::endl;
ss << "leaf_value="
<< Common::ArrayToString<double>(leaf_value_, num_leaves_, ' ') << std::endl;
<< Common::ArrayToString<double>(leaf_value_.data(), num_leaves_, ' ') << std::endl;
ss << "internal_value="
<< Common::ArrayToString<double>(internal_value_, num_leaves_ - 1, ' ') << std::endl;
<< Common::ArrayToString<double>(internal_value_.data(), num_leaves_ - 1, ' ') << std::endl;
ss << std::endl;
return ss.str();
}
......@@ -158,35 +148,31 @@ Tree::Tree(const std::string& str) {
Common::Atoi(key_vals["num_leaves"].c_str(), &num_leaves_);
left_child_ = new int[num_leaves_ - 1];
right_child_ = new int[num_leaves_ - 1];
split_feature_real_ = new int[num_leaves_ - 1];
threshold_ = new double[num_leaves_ - 1];
split_gain_ = new double[num_leaves_ - 1];
leaf_parent_ = new int[num_leaves_];
leaf_value_ = new double[num_leaves_];
internal_value_ = new double[num_leaves_ - 1];
split_feature_ = nullptr;
threshold_in_bin_ = nullptr;
leaf_depth_ = nullptr;
left_child_ = std::vector<int>(num_leaves_ - 1);
right_child_ = std::vector<int>(num_leaves_ - 1);
split_feature_real_ = std::vector<int>(num_leaves_ - 1);
threshold_ = std::vector<double>(num_leaves_ - 1);
split_gain_ = std::vector<double>(num_leaves_ - 1);
leaf_parent_ = std::vector<int>(num_leaves_);
leaf_value_ = std::vector<double>(num_leaves_);
internal_value_ = std::vector<double>(num_leaves_ - 1);
Common::StringToIntArray(key_vals["split_feature"], ' ',
num_leaves_ - 1, split_feature_real_);
num_leaves_ - 1, split_feature_real_.data());
Common::StringToDoubleArray(key_vals["split_gain"], ' ',
num_leaves_ - 1, split_gain_);
num_leaves_ - 1, split_gain_.data());
Common::StringToDoubleArray(key_vals["threshold"], ' ',
num_leaves_ - 1, threshold_);
num_leaves_ - 1, threshold_.data());
Common::StringToIntArray(key_vals["left_child"], ' ',
num_leaves_ - 1, left_child_);
num_leaves_ - 1, left_child_.data());
Common::StringToIntArray(key_vals["right_child"], ' ',
num_leaves_ - 1, right_child_);
num_leaves_ - 1, right_child_.data());
Common::StringToIntArray(key_vals["leaf_parent"], ' ',
num_leaves_ , leaf_parent_);
num_leaves_ , leaf_parent_.data());
Common::StringToDoubleArray(key_vals["leaf_value"], ' ',
num_leaves_ , leaf_value_);
num_leaves_ , leaf_value_.data());
Common::StringToDoubleArray(key_vals["internal_value"], ' ',
num_leaves_ - 1 , internal_value_);
num_leaves_ - 1 , internal_value_.data());
}
} // namespace LightGBM
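Two details in the string constructor are worth noting: the pointer-based helpers (StringToIntArray and friends) keep working unchanged via .data(), and the members that used to be nulled out (split_feature_, threshold_in_bin_, leaf_depth_) need no assignment at all, because a default-constructed vector is already empty. A small illustration, names invented:

#include <cassert>
#include <vector>

int main() {
  std::vector<int> leaf_depth;     // replaces "leaf_depth_ = nullptr;"
  assert(leaf_depth.empty());      // emptiness is the new null sentinel
  leaf_depth.assign(8, 1);         // allocate on demand; freed automatically
  int* raw = leaf_depth.data();    // pointer view for legacy helpers
  assert(raw[0] == 1);
}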
......@@ -52,7 +52,7 @@ public:
}
}
std::vector<std::string> GetName() const override {
const std::vector<std::string>& GetName() const override {
return name_;
}
......@@ -154,7 +154,7 @@ public:
virtual ~AUCMetric() {
}
std::vector<std::string> GetName() const override {
const std::vector<std::string>& GetName() const override {
return name_;
}
......
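GetName() now returns a const reference instead of a by-value vector, saving a vector<string> copy on every call; this is safe because name_ is a member that outlives any reasonable use of the result. The same signature change repeats in the ranking, multiclass, and regression metrics below. A sketch with an invented stand-in class:

#include <string>
#include <vector>

class ToyMetric {                    // illustrative, not the real metric
 public:
  // Const reference avoids the copy; callers must not keep the
  // reference past the lifetime of the metric object itself.
  const std::vector<std::string>& GetName() const { return name_; }
 private:
  std::vector<std::string> name_{"auc"};
};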
......@@ -22,10 +22,12 @@ void DCGCalculator::Init(std::vector<double> input_label_gain) {
for(size_t i = 0;i < input_label_gain.size();++i){
label_gain_.push_back(static_cast<score_t>(input_label_gain[i]));
}
label_gain_.shrink_to_fit();
discount_.clear();
for (data_size_t i = 0; i < kMaxPosition; ++i) {
discount_.emplace_back(1.0f / std::log2(2.0f + i));
}
discount_.shrink_to_fit();
is_inited_ = true;
}
......
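The shrink_to_fit() calls added after the fill loops trim the geometric over-allocation that push_back leaves behind; the standard makes shrink_to_fit a non-binding request, so this is a small memory saving on long-lived vectors rather than a guarantee. A minimal demonstration:

#include <vector>

int main() {
  std::vector<double> discount;
  for (int i = 0; i < 10000; ++i) {
    discount.push_back(1.0 / (2.0 + i));
  }
  // Drop the growth slack once the final size is known.
  discount.shrink_to_fit();
}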
......@@ -42,7 +42,7 @@ public:
}
}
std::vector<std::string> GetName() const override {
const std::vector<std::string>& GetName() const override {
return name_;
}
......
......@@ -20,6 +20,7 @@ public:
for (auto k : config.eval_at) {
eval_at_.push_back(static_cast<data_size_t>(k));
}
eval_at_.shrink_to_fit();
// initialize DCG calculator
DCGCalculator::Init(config.label_gain);
// get number of threads
......@@ -76,7 +77,7 @@ public:
}
}
std::vector<std::string> GetName() const override {
const std::vector<std::string>& GetName() const override {
return name_;
}
......
......@@ -23,7 +23,7 @@ public:
}
std::vector<std::string> GetName() const override {
const std::vector<std::string>& GetName() const override {
return name_;
}
......
......@@ -15,6 +15,7 @@
#include <thread>
#include <vector>
#include <string>
#include <memory>
#endif
#ifdef USE_MPI
......@@ -144,9 +145,9 @@ private:
/*! \brief Local listen ports */
int local_listen_port_;
/*! \brief Linkers */
std::vector<TcpSocket*> linkers_;
std::vector<std::unique_ptr<TcpSocket>> linkers_;
/*! \brief Local socket listener */
TcpSocket* listener_;
std::unique_ptr<TcpSocket> listener_;
#endif // USE_SOCKET
};
......
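The <memory> include added above supports the last pattern: socket members held as unique_ptrs, so each TcpSocket is destroyed exactly once when the owning object goes away and the destructor no longer loops over raw pointers calling delete. A sketch of the shape of the change, with a stub TcpSocket and an invented class name:

#include <memory>
#include <vector>

struct TcpSocket {};                 // stand-in for the real socket class

class Linkers {                      // illustrative, not the real class
  // Ownership is explicit in the types; no manual delete bookkeeping.
  std::vector<std::unique_ptr<TcpSocket>> linkers_;
  std::unique_ptr<TcpSocket> listener_;
};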