Commit 16d1853d authored by Guolin Ke, committed by GitHub
Browse files

Merge pull request #94 from Microsoft/python-package

Python package (#11)
parents 65e711a2 29cf97e9
# pylint: disable=invalid-name, exec-used
"""Setup lightgbm package."""
from __future__ import absolute_import
import sys
import os
from setuptools import setup, find_packages
# import subprocess
sys.path.insert(0, '.')
CURRENT_DIR = os.path.dirname(__file__)
libpath_py = os.path.join(CURRENT_DIR, 'lightgbm/libpath.py')
libpath = {'__file__': libpath_py}
exec(compile(open(libpath_py, "rb").read(), libpath_py, 'exec'), libpath, libpath)
LIB_PATH = libpath['find_lib_path']()
print("Install lib_lightgbm from: %s" % LIB_PATH)
setup(name='lightgbm',
version=0.1,
description="LightGBM Python Package",
install_requires=[
'numpy',
'scipy',
],
maintainer='Guolin Ke',
maintainer_email='guolin.ke@microsoft.com',
zip_safe=False,
packages=find_packages(),
include_package_data=True,
data_files=[('lightgbm', LIB_PATH)],
url='https://github.com/Microsoft/LightGBM')
......@@ -108,7 +108,7 @@ void Application::LoadData() {
// prediction is needed if using input initial model(continued train)
PredictFunction predict_fun = nullptr;
// need to continue training
if (boosting_->NumberOfSubModels() > 0) {
if (boosting_->NumberOfTotalModel() > 0) {
Predictor predictor(boosting_.get(), true, false);
predict_fun = predictor.GetPredictFunction();
}
......@@ -139,40 +139,44 @@ void Application::LoadData() {
for (auto metric_type : config_.metric_types) {
auto metric = std::unique_ptr<Metric>(Metric::CreateMetric(metric_type, config_.metric_config));
if (metric == nullptr) { continue; }
metric->Init("training", train_data_->metadata(),
train_data_->num_data());
metric->Init(train_data_->metadata(), train_data_->num_data());
train_metric_.push_back(std::move(metric));
}
}
train_metric_.shrink_to_fit();
// Add validation data, if it exists
for (size_t i = 0; i < config_.io_config.valid_data_filenames.size(); ++i) {
// add
auto new_dataset = std::unique_ptr<Dataset>(
dataset_loader.LoadFromFileAlignWithOtherDataset(
config_.io_config.valid_data_filenames[i].c_str(),
train_data_.get())
);
valid_datas_.push_back(std::move(new_dataset));
// need save binary file
if (config_.io_config.is_save_binary_file) {
valid_datas_.back()->SaveBinaryFile(nullptr);
}
// add metric for validation data
valid_metrics_.emplace_back();
for (auto metric_type : config_.metric_types) {
auto metric = std::unique_ptr<Metric>(Metric::CreateMetric(metric_type, config_.metric_config));
if (metric == nullptr) { continue; }
metric->Init(config_.io_config.valid_data_filenames[i].c_str(),
valid_datas_.back()->metadata(),
valid_datas_.back()->num_data());
valid_metrics_.back().push_back(std::move(metric));
if (config_.metric_types.size() > 0) {
// only when have metrics then need to construct validation data
// Add validation data, if it exists
for (size_t i = 0; i < config_.io_config.valid_data_filenames.size(); ++i) {
// add
auto new_dataset = std::unique_ptr<Dataset>(
dataset_loader.LoadFromFileAlignWithOtherDataset(
config_.io_config.valid_data_filenames[i].c_str(),
train_data_.get())
);
valid_datas_.push_back(std::move(new_dataset));
// need save binary file
if (config_.io_config.is_save_binary_file) {
valid_datas_.back()->SaveBinaryFile(nullptr);
}
// add metric for validation data
valid_metrics_.emplace_back();
for (auto metric_type : config_.metric_types) {
auto metric = std::unique_ptr<Metric>(Metric::CreateMetric(metric_type, config_.metric_config));
if (metric == nullptr) { continue; }
metric->Init(valid_datas_.back()->metadata(),
valid_datas_.back()->num_data());
valid_metrics_.back().push_back(std::move(metric));
}
valid_metrics_.back().shrink_to_fit();
}
valid_metrics_.back().shrink_to_fit();
valid_datas_.shrink_to_fit();
valid_metrics_.shrink_to_fit();
}
valid_datas_.shrink_to_fit();
valid_metrics_.shrink_to_fit();
auto end_time = std::chrono::high_resolution_clock::now();
// output used time on each iteration
Log::Info("Finished loading data in %f seconds",
......@@ -209,7 +213,7 @@ void Application::InitTrain() {
Common::ConstPtrInVectorWrapper<Metric>(train_metric_));
// add validation data into boosting
for (size_t i = 0; i < valid_datas_.size(); ++i) {
boosting_->AddDataset(valid_datas_[i].get(),
boosting_->AddValidDataset(valid_datas_[i].get(),
Common::ConstPtrInVectorWrapper<Metric>(valid_metrics_[i]));
}
Log::Info("Finished initializing training");
......@@ -227,17 +231,15 @@ void Application::Train() {
// output used time per iteration
Log::Info("%f seconds elapsed, finished iteration %d", std::chrono::duration<double,
std::milli>(end_time - start_time) * 1e-3, iter + 1);
boosting_->SaveModelToFile(NO_LIMIT, is_finished, config_.io_config.output_model.c_str());
}
is_finished = true;
// save model to file
boosting_->SaveModelToFile(NO_LIMIT, is_finished, config_.io_config.output_model.c_str());
boosting_->SaveModelToFile(-1, config_.io_config.output_model.c_str());
Log::Info("Finished training");
}
void Application::Predict() {
boosting_->SetNumUsedModel(config_.io_config.num_model_predict);
boosting_->SetNumIterationForPred(config_.io_config.num_iteration_predict);
// create predictor
Predictor predictor(boosting_.get(), config_.io_config.is_predict_raw_score,
config_.io_config.is_predict_leaf_index);
......
......@@ -15,7 +15,7 @@ BoostingType GetBoostingTypeFromModelFile(const char* filename) {
return BoostingType::kUnknow;
}
void LoadFileToBoosting(Boosting* boosting, const char* filename) {
void Boosting::LoadFileToBoosting(Boosting* boosting, const char* filename) {
if (boosting != nullptr) {
TextReader<size_t> model_reader(filename, true);
model_reader.ReadAllLines();
......
......@@ -43,6 +43,7 @@ public:
* \brief one training iteration
*/
bool TrainOneIter(const score_t* gradient, const score_t* hessian, bool is_eval) override {
is_update_score_cur_iter_ = false;
GBDT::TrainOneIter(gradient, hessian, false);
// normalize
Normalize();
......@@ -58,22 +59,15 @@ public:
* \return training score
*/
const score_t* GetTrainingScore(data_size_t* out_len) override {
DroppingTrees();
if (!is_update_score_cur_iter_) {
// only drop one time in one iteration
DroppingTrees();
is_update_score_cur_iter_ = true;
}
*out_len = train_score_updater_->num_data() * num_class_;
return train_score_updater_->score();
}
/*!
* \brief save model to file
* \param num_used_model number of model that want to save, -1 means save all
* \param is_finish is training finished or not
* \param filename filename that want to save to
*/
void SaveModelToFile(int num_used_model, bool is_finish, const char* filename) override {
// only save model once when is_finish = true
if (is_finish && saved_model_size_ < 0) {
GBDT::SaveModelToFile(num_used_model, is_finish, filename);
}
}
/*!
* \brief Get Type name of this boosting object
*/
......@@ -133,6 +127,8 @@ private:
double drop_rate_;
/*! \brief Random generator, used to select dropping trees */
Random random_for_drop_;
/*! \brief Flag that the score is update on current iter or not*/
bool is_update_score_cur_iter_;
};
} // namespace LightGBM
......
......@@ -16,7 +16,9 @@
namespace LightGBM {
GBDT::GBDT() : saved_model_size_(-1), num_used_model_(0) {
GBDT::GBDT()
:num_iteration_for_pred_(0),
num_init_iteration_(0) {
}
......@@ -26,69 +28,94 @@ GBDT::~GBDT() {
void GBDT::Init(const BoostingConfig* config, const Dataset* train_data, const ObjectiveFunction* object_function,
const std::vector<const Metric*>& training_metrics) {
gbdt_config_ = config;
iter_ = 0;
saved_model_size_ = -1;
num_used_model_ = 0;
num_iteration_for_pred_ = 0;
max_feature_idx_ = 0;
num_class_ = config->num_class;
train_data_ = nullptr;
ResetTrainingData(config, train_data, object_function, training_metrics);
}
void GBDT::ResetTrainingData(const BoostingConfig* config, const Dataset* train_data, const ObjectiveFunction* object_function,
const std::vector<const Metric*>& training_metrics) {
if (train_data_ != nullptr && !train_data_->CheckAlign(*train_data)) {
Log::Fatal("cannot reset training data, since new training data has different bin mappers");
}
gbdt_config_ = config;
early_stopping_round_ = gbdt_config_->early_stopping_round;
shrinkage_rate_ = gbdt_config_->learning_rate;
train_data_ = train_data;
num_class_ = config->num_class;
random_ = Random(gbdt_config_->bagging_seed);
// create tree learner
tree_learner_.clear();
for (int i = 0; i < num_class_; ++i) {
auto new_tree_learner = std::unique_ptr<TreeLearner>(TreeLearner::CreateTreeLearner(gbdt_config_->tree_learner_type, gbdt_config_->tree_config));
new_tree_learner->Init(train_data_);
new_tree_learner->Init(train_data);
// init tree learner
tree_learner_.push_back(std::move(new_tree_learner));
}
tree_learner_.shrink_to_fit();
object_function_ = object_function;
// push training metrics
training_metrics_.clear();
for (const auto& metric : training_metrics) {
training_metrics_.push_back(metric);
}
training_metrics_.shrink_to_fit();
// create score tracker
train_score_updater_.reset(new ScoreUpdater(train_data_, num_class_));
num_data_ = train_data_->num_data();
// create buffer for gradients and hessians
if (object_function_ != nullptr) {
gradients_ = std::vector<score_t>(num_data_ * num_class_);
hessians_ = std::vector<score_t>(num_data_ * num_class_);
}
sigmoid_ = -1.0f;
if (object_function_ != nullptr
if (object_function_ != nullptr
&& std::string(object_function_->GetName()) == std::string("binary")) {
// only binary classification need sigmoid transform
sigmoid_ = gbdt_config_->sigmoid;
}
// get max feature index
max_feature_idx_ = train_data_->num_total_features() - 1;
// get label index
label_idx_ = train_data_->label_idx();
// if need bagging, create buffer
if (gbdt_config_->bagging_fraction < 1.0 && gbdt_config_->bagging_freq > 0) {
out_of_bag_data_indices_ = std::vector<data_size_t>(num_data_);
bag_data_indices_ = std::vector<data_size_t>(num_data_);
} else {
out_of_bag_data_cnt_ = 0;
out_of_bag_data_indices_.clear();
bag_data_cnt_ = num_data_;
bag_data_indices_.clear();
if (train_data_ != train_data) {
// not same training data, need reset score and others
// create score tracker
train_score_updater_.reset(new ScoreUpdater(train_data, num_class_));
// update score
for (int i = 0; i < iter_; ++i) {
for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
auto curr_tree = (i + num_init_iteration_) * num_class_ + curr_class;
train_score_updater_->AddScore(models_[curr_tree].get(), curr_class);
}
}
num_data_ = train_data->num_data();
// create buffer for gradients and hessians
if (object_function_ != nullptr) {
gradients_ = std::vector<score_t>(num_data_ * num_class_);
hessians_ = std::vector<score_t>(num_data_ * num_class_);
}
// get max feature index
max_feature_idx_ = train_data->num_total_features() - 1;
// get label index
label_idx_ = train_data->label_idx();
// if need bagging, create buffer
if (gbdt_config_->bagging_fraction < 1.0 && gbdt_config_->bagging_freq > 0) {
out_of_bag_data_indices_ = std::vector<data_size_t>(num_data_);
bag_data_indices_ = std::vector<data_size_t>(num_data_);
} else {
out_of_bag_data_cnt_ = 0;
out_of_bag_data_indices_.clear();
bag_data_cnt_ = num_data_;
bag_data_indices_.clear();
}
}
// initialize random generator
random_ = Random(gbdt_config_->bagging_seed);
train_data_ = train_data;
}
void GBDT::AddDataset(const Dataset* valid_data,
void GBDT::AddValidDataset(const Dataset* valid_data,
const std::vector<const Metric*>& valid_metrics) {
if (iter_ > 0) {
Log::Fatal("Cannot add validation data after training started");
if (!train_data_->CheckAlign(*valid_data)) {
Log::Fatal("cannot add validation data, since it has different bin mappers with training data");
}
// for a validation dataset, we need its score and metric
auto new_score_updater = std::unique_ptr<ScoreUpdater>(new ScoreUpdater(valid_data, num_class_));
// update score
for (int i = 0; i < iter_; ++i) {
for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
auto curr_tree = (i + num_init_iteration_) * num_class_ + curr_class;
new_score_updater->AddScore(models_[curr_tree].get(), curr_class);
}
}
valid_score_updater_.push_back(std::move(new_score_updater));
valid_metrics_.emplace_back();
if (early_stopping_round_ > 0) {
......@@ -204,6 +231,25 @@ bool GBDT::TrainOneIter(const score_t* gradient, const score_t* hessian, bool is
}
/*!
* \brief Undo the most recent training iteration: remove the newest tree of
*        every class and subtract its contribution from the cached training
*        and validation scores.  No-op when nothing has been trained yet.
*        Trees are stored iteration-major, one tree per class per iteration.
*/
void GBDT::RollbackOneIter() {
if (iter_ == 0) { return; }
// absolute index of the last trained iteration, counting any iterations
// that were loaded from an initial model
int cur_iter = iter_ + num_init_iteration_ - 1;
// reset score
for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
auto curr_tree = cur_iter * num_class_ + curr_class;
// NOTE(review): Shrinkage(-1.0) presumably negates the tree's leaf outputs,
// so the AddScore calls below effectively subtract its contribution from
// the cached scores -- confirm against Tree::Shrinkage
models_[curr_tree]->Shrinkage(-1.0);
train_score_updater_->AddScore(models_[curr_tree].get(), curr_class);
for (auto& score_updater : valid_score_updater_) {
score_updater->AddScore(models_[curr_tree].get(), curr_class);
}
}
// remove model
// the rolled-back iteration added exactly one tree per class
for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
models_.pop_back();
}
--iter_;
}
bool GBDT::EvalAndCheckEarlyStopping() {
bool is_met_early_stopping = false;
// print message for metric
......@@ -236,7 +282,7 @@ bool GBDT::OutputMetric(int iter) {
auto name = sub_metric->GetName();
auto scores = sub_metric->Eval(train_score_updater_->score());
for (size_t k = 0; k < name.size(); ++k) {
Log::Info("Iteration: %d, %s : %f", iter, name[k].c_str(), scores[k]);
Log::Info("Iteration:%d, training %s : %f", iter, name[k].c_str(), scores[k]);
}
}
}
......@@ -248,7 +294,7 @@ bool GBDT::OutputMetric(int iter) {
if ((iter % gbdt_config_->output_freq) == 0) {
auto name = valid_metrics_[i][j]->GetName();
for (size_t k = 0; k < name.size(); ++k) {
Log::Info("Iteration: %d, %s : %f", iter, name[k].c_str(), test_scores[k]);
Log::Info("Iteration:%d, valid_%d %s : %f", iter, i + 1, name[k].c_str(), test_scores[k]);
}
}
if (!ret && early_stopping_round_ > 0) {
......@@ -296,24 +342,23 @@ const score_t* GBDT::GetTrainingScore(data_size_t* out_len) {
return train_score_updater_->score();
}
void GBDT::GetPredictAt(int data_idx, score_t* out_result, data_size_t* out_len) const {
void GBDT::GetPredictAt(int data_idx, score_t* out_result, data_size_t* out_len) {
CHECK(data_idx >= 0 && data_idx <= static_cast<int>(valid_metrics_.size()));
std::vector<double> ret;
const score_t* raw_scores = nullptr;
data_size_t num_data = 0;
if (data_idx == 0) {
raw_scores = train_score_updater_->score();
raw_scores = GetTrainingScore(out_len);
num_data = train_score_updater_->num_data();
} else {
auto used_idx = data_idx - 1;
raw_scores = valid_score_updater_[used_idx]->score();
num_data = valid_score_updater_[used_idx]->num_data();
*out_len = num_data * num_class_;
}
*out_len = num_data * num_class_;
if (num_class_ > 1) {
#pragma omp parallel for schedule(guided)
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data; ++i) {
std::vector<double> tmp_result;
for (int j = 0; j < num_class_; ++j) {
......@@ -325,12 +370,12 @@ void GBDT::GetPredictAt(int data_idx, score_t* out_result, data_size_t* out_len)
}
}
} else if(sigmoid_ > 0.0f){
#pragma omp parallel for schedule(guided)
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data; ++i) {
out_result[i] = static_cast<score_t>(1.0f / (1.0f + std::exp(-2.0f * sigmoid_ * raw_scores[i])));
}
} else {
#pragma omp parallel for schedule(guided)
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data; ++i) {
out_result[i] = raw_scores[i];
}
......@@ -348,55 +393,41 @@ void GBDT::Boosting() {
GetGradients(GetTrainingScore(&num_score), gradients_.data(), hessians_.data());
}
void GBDT::SaveModelToFile(int num_used_model, bool is_finish, const char* filename) {
// first time to this function, open file
if (saved_model_size_ < 0) {
model_output_file_.open(filename);
// output model type
model_output_file_ << Name() << std::endl;
// output number of class
model_output_file_ << "num_class=" << num_class_ << std::endl;
// output label index
model_output_file_ << "label_index=" << label_idx_ << std::endl;
// output max_feature_idx
model_output_file_ << "max_feature_idx=" << max_feature_idx_ << std::endl;
// output objective name
if (object_function_ != nullptr) {
model_output_file_ << "objective=" << object_function_->GetName() << std::endl;
}
// output sigmoid parameter
model_output_file_ << "sigmoid=" << sigmoid_ << std::endl;
model_output_file_ << std::endl;
saved_model_size_ = 0;
}
// already saved
if (!model_output_file_.is_open()) {
return;
void GBDT::SaveModelToFile(int num_iteration, const char* filename) const {
/*! \brief File to write models */
std::ofstream output_file;
output_file.open(filename);
// output model type
output_file << Name() << std::endl;
// output number of class
output_file << "num_class=" << num_class_ << std::endl;
// output label index
output_file << "label_index=" << label_idx_ << std::endl;
// output max_feature_idx
output_file << "max_feature_idx=" << max_feature_idx_ << std::endl;
// output objective name
if (object_function_ != nullptr) {
output_file << "objective=" << object_function_->GetName() << std::endl;
}
if (num_used_model == NO_LIMIT) {
// output sigmoid parameter
output_file << "sigmoid=" << sigmoid_ << std::endl;
output_file << std::endl;
int num_used_model = 0;
if (num_iteration <= 0) {
num_used_model = static_cast<int>(models_.size());
} else {
num_used_model = num_used_model * num_class_;
num_used_model = num_iteration * num_class_;
}
int rest = num_used_model - early_stopping_round_ * num_class_;
num_used_model = std::min(num_used_model, static_cast<int>(models_.size()));
// output tree models
for (int i = saved_model_size_; i < rest; ++i) {
model_output_file_ << "Tree=" << i << std::endl;
model_output_file_ << models_[i]->ToString() << std::endl;
for (int i = 0; i < num_used_model; ++i) {
output_file << "Tree=" << i << std::endl;
output_file << models_[i]->ToString() << std::endl;
}
saved_model_size_ = std::max(saved_model_size_, rest);
model_output_file_.flush();
// training finished, can close file
if (is_finish) {
for (int i = saved_model_size_; i < num_used_model; ++i) {
model_output_file_ << "Tree=" << i << std::endl;
model_output_file_ << models_[i]->ToString() << std::endl;
}
model_output_file_ << std::endl << FeatureImportance() << std::endl;
model_output_file_.close();
}
output_file << std::endl << FeatureImportance() << std::endl;
output_file.close();
}
void GBDT::LoadModelFromString(const std::string& model_str) {
......@@ -452,7 +483,8 @@ void GBDT::LoadModelFromString(const std::string& model_str) {
}
}
Log::Info("Finished loading %d models", models_.size());
num_used_model_ = static_cast<int>(models_.size()) / num_class_;
num_iteration_for_pred_ = static_cast<int>(models_.size()) / num_class_;
num_init_iteration_ = num_iteration_for_pred_;
}
std::string GBDT::FeatureImportance() const {
......@@ -486,7 +518,7 @@ std::string GBDT::FeatureImportance() const {
std::vector<double> GBDT::PredictRaw(const double* value) const {
std::vector<double> ret(num_class_, 0.0f);
for (int i = 0; i < num_used_model_; ++i) {
for (int i = 0; i < num_iteration_for_pred_; ++i) {
for (int j = 0; j < num_class_; ++j) {
ret[j] += models_[i * num_class_ + j]->Predict(value);
}
......@@ -496,7 +528,7 @@ std::vector<double> GBDT::PredictRaw(const double* value) const {
std::vector<double> GBDT::Predict(const double* value) const {
std::vector<double> ret(num_class_, 0.0f);
for (int i = 0; i < num_used_model_; ++i) {
for (int i = 0; i < num_iteration_for_pred_; ++i) {
for (int j = 0; j < num_class_; ++j) {
ret[j] += models_[i * num_class_ + j]->Predict(value);
}
......@@ -512,7 +544,7 @@ std::vector<double> GBDT::Predict(const double* value) const {
std::vector<int> GBDT::PredictLeafIndex(const double* value) const {
std::vector<int> ret;
for (int i = 0; i < num_used_model_; ++i) {
for (int i = 0; i < num_iteration_for_pred_; ++i) {
for (int j = 0; j < num_class_; ++j) {
ret.push_back(models_[i * num_class_ + j]->PredictLeafIndex(value));
}
......
......@@ -35,12 +35,53 @@ public:
void Init(const BoostingConfig* gbdt_config, const Dataset* train_data, const ObjectiveFunction* object_function,
const std::vector<const Metric*>& training_metrics)
override;
/*!
* \brief Merge the models of another boosting object into this one.
*        The other object's trees are deep-copied and inserted IN FRONT of
*        the current trees, after which they count as "initial" iterations.
* \param other Boosting object to merge from; must actually be a GBDT
*              (or subclass) using the same number of classes.
*/
void MergeFrom(const Boosting* other) override {
  // Downcast within the known class hierarchy: static_cast is the correct
  // named cast here; the previous reinterpret_cast is not guaranteed to
  // perform the required pointer adjustment.
  auto other_gbdt = static_cast<const GBDT*>(other);
  // move the current trees aside, then rebuild models_ with the other
  // object's trees first
  auto original_models = std::move(models_);
  models_ = std::vector<std::unique_ptr<Tree>>();
  // deep-copy the trees from the other object
  for (const auto& tree : other_gbdt->models_) {
    auto new_tree = std::unique_ptr<Tree>(new Tree(*(tree.get())));
    models_.push_back(std::move(new_tree));
  }
  // everything merged in front counts as loaded/initial iterations
  num_init_iteration_ = static_cast<int>(models_.size()) / num_class_;
  // re-append this object's own trees; they can be moved (not copied)
  // because original_models is a local about to be destroyed
  for (auto& tree : original_models) {
    models_.push_back(std::move(tree));
  }
  // after the merge, prediction uses all trees
  num_iteration_for_pred_ = static_cast<int>(models_.size()) / num_class_;
}
/*!
* \brief Reset training data for current boosting
* \param train_data Training data
* \param object_function Training objective function
* \param training_metrics Training metric
*/
void ResetTrainingData(const BoostingConfig* config, const Dataset* train_data, const ObjectiveFunction* object_function, const std::vector<const Metric*>& training_metrics) override;
/*!
* \brief Reset shrinkage_rate data for current boosting
* \param shrinkage_rate Configs for boosting
*/
void ResetShrinkageRate(double shrinkage_rate) override {
// Just stores the new rate; shrinkage_rate_ presumably scales trees added
// by subsequent training iterations -- confirm against TrainOneIter
shrinkage_rate_ = shrinkage_rate;
}
/*!
* \brief Adding a validation dataset
* \param valid_data Validation dataset
* \param valid_metrics Metrics for validation dataset
*/
void AddDataset(const Dataset* valid_data,
void AddValidDataset(const Dataset* valid_data,
const std::vector<const Metric*>& valid_metrics) override;
/*!
* \brief Training logic
......@@ -51,6 +92,13 @@ public:
*/
virtual bool TrainOneIter(const score_t* gradient, const score_t* hessian, bool is_eval) override;
/*!
* \brief Rollback one iteration
*/
void RollbackOneIter() override;
/*!
* \brief Total iteration count: iterations trained by this object (iter_)
*        plus iterations loaded from an initial model (num_init_iteration_)
*/
int GetCurrentIteration() const override { return iter_ + num_init_iteration_; }
/*!
* \brief Evaluate all metrics and check the early-stopping condition
* \return true if early stopping was triggered
*/
bool EvalAndCheckEarlyStopping() override;
/*!
......@@ -73,7 +121,7 @@ public:
* \param result used to store prediction result, should allocate memory before call this function
* \param out_len length of the returned score
*/
void GetPredictAt(int data_idx, score_t* out_result, data_size_t* out_len) const override;
void GetPredictAt(int data_idx, score_t* out_result, data_size_t* out_len) override;
/*!
* \brief Prediction for one record without sigmoid transformation
......@@ -98,11 +146,11 @@ public:
/*!
* \brief save model to file
* \param num_used_model number of model that want to save, -1 means save all
* \param is_finish is training finished or not
* \param num_iterations Iterations that want to save, -1 means save all
* \param filename filename that want to save to
*/
virtual void SaveModelToFile(int num_used_model, bool is_finish, const char* filename) override;
virtual void SaveModelToFile(int num_iterations, const char* filename) const override ;
/*!
* \brief Restore from a serialized string
*/
......@@ -119,11 +167,12 @@ public:
*/
inline int LabelIdx() const override { return label_idx_; }
/*!
* \brief Get number of weak sub-models
* \return Number of weak sub-models
*/
inline int NumberOfSubModels() const override { return static_cast<int>(models_.size()); }
inline int NumberOfTotalModel() const override { return static_cast<int>(models_.size()); }
/*!
* \brief Get number of classes
......@@ -132,14 +181,16 @@ public:
inline int NumberOfClasses() const override { return num_class_; }
/*!
* \brief Set number of used model for prediction
* \brief Set number of iterations for prediction
*/
inline void SetNumUsedModel(int num_used_model) {
if (num_used_model >= 0) {
num_used_model_ = static_cast<int>(num_used_model / num_class_);
inline void SetNumIterationForPred(int num_iteration) override {
if (num_iteration > 0) {
num_iteration_for_pred_ = num_iteration;
} else {
num_used_model_ = static_cast<int>(models_.size()) / num_class_;
num_iteration_for_pred_ = static_cast<int>(models_.size()) / num_class_;
}
num_iteration_for_pred_ = std::min(num_iteration_for_pred_,
static_cast<int>(models_.size()) / num_class_);
}
/*!
......@@ -233,14 +284,12 @@ protected:
double sigmoid_;
/*! \brief Index of label column */
data_size_t label_idx_;
/*! \brief Saved number of models */
int saved_model_size_;
/*! \brief File to write models */
std::ofstream model_output_file_;
/*! \brief number of used model */
int num_used_model_;
int num_iteration_for_pred_;
/*! \brief Shrinkage rate for one iteration */
double shrinkage_rate_;
/*! \brief Number of loaded initial models */
int num_init_iteration_;
};
} // namespace LightGBM
......
This diff is collapsed.
......@@ -5,14 +5,14 @@
#include <vector>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <algorithm>
namespace LightGBM {
void OverallConfig::LoadFromString(const char* str) {
std::unordered_map<std::string, std::string> ConfigBase::Str2Map(const char* parameters) {
std::unordered_map<std::string, std::string> params;
auto args = Common::Split(str, " \t\n\r");
auto args = Common::Split(parameters, " \t\n\r");
for (auto arg : args) {
std::vector<std::string> tmp_strs = Common::Split(arg.c_str(), '=');
if (tmp_strs.size() == 2) {
......@@ -27,7 +27,7 @@ void OverallConfig::LoadFromString(const char* str) {
}
}
ParameterAlias::KeyAliasTransform(&params);
Set(params);
return params;
}
void OverallConfig::Set(const std::unordered_map<std::string, std::string>& params) {
......@@ -95,16 +95,15 @@ void OverallConfig::GetMetricType(const std::unordered_map<std::string, std::str
// split
std::vector<std::string> metrics = Common::Split(value.c_str(), ',');
// remove duplicate
std::unordered_map<std::string, int> metric_maps;
std::unordered_set<std::string> metric_sets;
for (auto& metric : metrics) {
std::transform(metric.begin(), metric.end(), metric.begin(), Common::tolower);
if (metric_maps.count(metric) <= 0) {
metric_maps[metric] = 1;
if (metric_sets.count(metric) <= 0) {
metric_sets.insert(metric);
}
}
for (auto& pair : metric_maps) {
std::string sub_metric_str = pair.first;
metric_types.push_back(sub_metric_str);
for (auto& metric : metric_sets) {
metric_types.push_back(metric);
}
metric_types.shrink_to_fit();
}
......@@ -183,7 +182,7 @@ void IOConfig::Set(const std::unordered_map<std::string, std::string>& params) {
GetInt(params, "data_random_seed", &data_random_seed);
GetString(params, "data", &data_filename);
GetInt(params, "verbose", &verbosity);
GetInt(params, "num_model_predict", &num_model_predict);
GetInt(params, "num_iteration_predict", &num_iteration_predict);
GetInt(params, "bin_construct_sample_cnt", &bin_construct_sample_cnt);
GetBool(params, "is_pre_partition", &is_pre_partition);
GetBool(params, "is_enable_sparse", &is_enable_sparse);
......@@ -214,6 +213,7 @@ void ObjectiveConfig::Set(const std::unordered_map<std::string, std::string>& pa
CHECK(max_position > 0);
GetInt(params, "num_class", &num_class);
CHECK(num_class >= 1);
GetDouble(params, "scale_pos_weight", &scale_pos_weight);
std::string tmp_str = "";
if (GetString(params, "label_gain", &tmp_str)) {
label_gain = Common::StringToDoubleArray(tmp_str, ',');
......
......@@ -14,17 +14,16 @@
namespace LightGBM {
const char* Dataset::binary_file_token = "______LightGBM_Binary_File_Token______\n";
/*! \brief Default constructor: an empty, single-class dataset that was not
*   loaded from a binary file */
Dataset::Dataset() {
num_class_ = 1;
num_data_ = 0;
is_loading_from_binfile_ = false;
}
/*!
* \brief Construct a dataset pre-sized for a known number of rows and classes
* \param num_data Number of data rows
* \param num_class Number of classes
*/
Dataset::Dataset(data_size_t num_data, int num_class) {
num_class_ = num_class;
num_data_ = num_data;
is_loading_from_binfile_ = false;
// NOTE(review): the two -1 arguments presumably mean "no weight column" /
// "no query column" -- confirm against Metadata::Init
metadata_.Init(num_data_, num_class_, -1, -1);
}
......@@ -56,6 +55,21 @@ void Dataset::CopyFeatureMapperFrom(const Dataset* dataset, bool is_enable_spars
num_features_ = static_cast<int>(features_.size());
num_total_features_ = dataset->num_total_features_;
feature_names_ = dataset->feature_names_;
label_idx_ = dataset->label_idx_;
}
/*!
* \brief Create a new Dataset containing only the given rows of this one.
*        Feature/bin mappers are copied from this dataset, then every
*        feature's bin values are pushed for the selected rows only.
* \param used_indices Row indices (into this dataset) to keep
* \param num_used_indices Number of entries in used_indices
* \param is_enable_sparse Whether sparse feature storage is enabled in the copy
* \return Raw OWNING pointer (released from a unique_ptr); the caller is
*         responsible for deleting it
*/
Dataset* Dataset::Subset(const data_size_t* used_indices, data_size_t num_used_indices, bool is_enable_sparse) const {
auto ret = std::unique_ptr<Dataset>(new Dataset(num_used_indices, num_class_));
ret->CopyFeatureMapperFrom(this, is_enable_sparse);
// parallel over features; each thread uses its own feature's bin iterator,
// so there is no shared mutable state across iterations
#pragma omp parallel for schedule(guided)
for (int fidx = 0; fidx < num_features_; ++fidx) {
auto iterator = features_[fidx]->bin_data()->GetIterator(0);
for (data_size_t i = 0; i < num_used_indices; ++i) {
ret->features_[fidx]->PushBin(0, i, iterator->Get(used_indices[i]));
}
}
// copy the per-row metadata for the selected rows
ret->metadata_.Init(metadata_, used_indices, num_used_indices);
return ret.release();
}
bool Dataset::SetFloatField(const char* field_name, const float* field_data, data_size_t num_element) {
......@@ -78,6 +92,8 @@ bool Dataset::SetIntField(const char* field_name, const int* field_data, data_si
name = Common::Trim(name);
if (name == std::string("query") || name == std::string("group")) {
metadata_.SetQueryBoundaries(field_data, num_element);
} else if (name == std::string("query_id") || name == std::string("group_id")) {
metadata_.SetQueryId(field_data, num_element);
} else {
return false;
}
......@@ -107,7 +123,7 @@ bool Dataset::GetIntField(const char* field_name, int64_t* out_len, const int**
name = Common::Trim(name);
if (name == std::string("query") || name == std::string("group")) {
*out_ptr = metadata_.query_boundaries();
*out_len = num_data_;
*out_len = metadata_.num_queries();
} else {
return false;
}
......@@ -115,15 +131,27 @@ bool Dataset::GetIntField(const char* field_name, int64_t* out_len, const int**
}
void Dataset::SaveBinaryFile(const char* bin_filename) {
bool is_file_existed = false;
FILE* file;
#ifdef _MSC_VER
fopen_s(&file, bin_filename, "rb");
#else
file = fopen(bin_filename, "rb");
#endif
if (file != NULL) {
is_file_existed = true;
Log::Warning("File %s existed, cannot save binary to it", bin_filename);
fclose(file);
}
if (!is_loading_from_binfile_) {
if (!is_file_existed) {
std::string bin_filename_str(data_filename_);
// if not pass a filename, just append ".bin" of original file
if (bin_filename == nullptr || bin_filename[0] == '\0') {
bin_filename_str.append(".bin");
bin_filename = bin_filename_str.c_str();
}
FILE* file;
#ifdef _MSC_VER
fopen_s(&file, bin_filename, "wb");
#else
......@@ -133,7 +161,8 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
Log::Fatal("Cannot write binary data to %s ", bin_filename);
}
Log::Info("Saving data to binary file %s", bin_filename);
size_t size_of_token = std::strlen(binary_file_token);
fwrite(binary_file_token, sizeof(char), size_of_token, file);
// get size of header
size_t size_of_header = sizeof(num_data_) + sizeof(num_class_) + sizeof(num_features_) + sizeof(num_total_features_)
+ sizeof(size_t) + sizeof(int) * used_feature_map_.size();
......
......@@ -142,18 +142,18 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac
Please use an additional query file or pre-partition the data");
}
}
auto parser = std::unique_ptr<Parser>(Parser::CreateParser(filename, io_config_.has_header, 0, label_idx_));
if (parser == nullptr) {
Log::Fatal("Could not recognize data format of %s", filename);
}
auto dataset = std::unique_ptr<Dataset>(new Dataset());
data_size_t num_global_data = 0;
std::vector<data_size_t> used_data_indices;
auto dataset = std::unique_ptr<Dataset>(new Dataset());
dataset->data_filename_ = filename;
dataset->num_class_ = io_config_.num_class;
dataset->metadata_.Init(filename, dataset->num_class_);
bool is_loading_from_binfile = CheckCanLoadFromBin(filename);
if (!is_loading_from_binfile) {
auto bin_filename = CheckCanLoadFromBin(filename);
if (bin_filename.size() == 0) {
auto parser = std::unique_ptr<Parser>(Parser::CreateParser(filename, io_config_.has_header, 0, label_idx_));
if (parser == nullptr) {
Log::Fatal("Could not recognize data format of %s", filename);
}
dataset->data_filename_ = filename;
dataset->num_class_ = io_config_.num_class;
dataset->metadata_.Init(filename, dataset->num_class_);
if (!io_config_.use_two_round_loading) {
// read data to memory
auto text_data = LoadTextDataToMemory(filename, dataset->metadata_, rank, num_machines,&num_global_data, &used_data_indices);
......@@ -185,8 +185,6 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac
}
} else {
// load data from binary file
std::string bin_filename(filename);
bin_filename.append(".bin");
dataset.reset(LoadFromBinFile(bin_filename.c_str(), rank, num_machines));
}
// check meta data
......@@ -199,18 +197,18 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac
Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename, const Dataset* train_data) {
auto parser = std::unique_ptr<Parser>(Parser::CreateParser(filename, io_config_.has_header, 0, label_idx_));
if (parser == nullptr) {
Log::Fatal("Could not recognize data format of %s", filename);
}
data_size_t num_global_data = 0;
std::vector<data_size_t> used_data_indices;
auto dataset = std::unique_ptr<Dataset>(new Dataset());
dataset->data_filename_ = filename;
dataset->num_class_ = io_config_.num_class;
dataset->metadata_.Init(filename, dataset->num_class_);
bool is_loading_from_binfile = CheckCanLoadFromBin(filename);
if (!is_loading_from_binfile) {
auto bin_filename = CheckCanLoadFromBin(filename);
if (bin_filename.size() == 0) {
auto parser = std::unique_ptr<Parser>(Parser::CreateParser(filename, io_config_.has_header, 0, label_idx_));
if (parser == nullptr) {
Log::Fatal("Could not recognize data format of %s", filename);
}
dataset->data_filename_ = filename;
dataset->num_class_ = io_config_.num_class;
dataset->metadata_.Init(filename, dataset->num_class_);
if (!io_config_.use_two_round_loading) {
// read data in memory
auto text_data = LoadTextDataToMemory(filename, dataset->metadata_, 0, 1, &num_global_data, &used_data_indices);
......@@ -234,8 +232,6 @@ Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename,
}
} else {
// load data from binary file
std::string bin_filename(filename);
bin_filename.append(".bin");
dataset.reset(LoadFromBinFile(bin_filename.c_str(), 0, 1));
}
// not need to check validation data
......@@ -260,9 +256,19 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* bin_filename, int rank, int
// buffer to read binary file
size_t buffer_size = 16 * 1024 * 1024;
auto buffer = std::vector<char>(buffer_size);
// check token
size_t size_of_token = std::strlen(Dataset::binary_file_token);
size_t read_cnt = fread(buffer.data(), sizeof(char), size_of_token, file);
if (read_cnt != size_of_token) {
Log::Fatal("Binary file error: token has the wrong size");
}
if (std::string(buffer.data()) != std::string(Dataset::binary_file_token)) {
Log::Fatal("input file is not LightGBM binary file");
}
// read size of header
size_t read_cnt = fread(buffer.data(), sizeof(size_t), 1, file);
read_cnt = fread(buffer.data(), sizeof(size_t), 1, file);
if (read_cnt != 1) {
Log::Fatal("Binary file error: header has the wrong size");
......@@ -401,7 +407,6 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* bin_filename, int rank, int
}
dataset->features_.shrink_to_fit();
fclose(file);
dataset->is_loading_from_binfile_ = true;
return dataset.release();
}
......@@ -849,7 +854,7 @@ void DatasetLoader::ExtractFeaturesFromFile(const char* filename, const Parser*
}
/*! \brief Check can load from binary file */
bool DatasetLoader::CheckCanLoadFromBin(const char* filename) {
std::string DatasetLoader::CheckCanLoadFromBin(const char* filename) {
std::string bin_filename(filename);
bin_filename.append(".bin");
......@@ -860,12 +865,32 @@ bool DatasetLoader::CheckCanLoadFromBin(const char* filename) {
#else
file = fopen(bin_filename.c_str(), "rb");
#endif
if (file == NULL) {
return false;
bin_filename = std::string(filename);
#ifdef _MSC_VER
fopen_s(&file, bin_filename.c_str(), "rb");
#else
file = fopen(bin_filename.c_str(), "rb");
#endif
if (file == NULL) {
Log::Fatal("cannot open data file %s", bin_filename.c_str());
}
}
size_t buffer_size = 256;
auto buffer = std::vector<char>(buffer_size);
// read size of token
size_t size_of_token = std::strlen(Dataset::binary_file_token);
size_t read_cnt = fread(buffer.data(), sizeof(char), size_of_token, file);
fclose(file);
if (read_cnt == size_of_token
&& std::string(buffer.data()) == std::string(Dataset::binary_file_token)) {
return bin_filename;
} else {
fclose(file);
return true;
return std::string();
}
}
}
\ No newline at end of file
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -33,12 +33,9 @@ public:
~NDCGMetric() {
}
void Init(const char* test_name, const Metadata& metadata, data_size_t num_data) override {
void Init(const Metadata& metadata, data_size_t num_data) override {
for (auto k : eval_at_) {
std::stringstream str_buf;
str_buf << test_name << "'s : ";
str_buf << "NDCG@" + std::to_string(k) + " ";
name_.emplace_back(str_buf.str());
name_.emplace_back(std::string("ndcg@") + std::to_string(k));
}
num_data_ = num_data;
// get label
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment