Unverified Commit 3c394c8d authored by Guolin Ke's avatar Guolin Ke Committed by GitHub
Browse files

remove init-score parameter (#2776)



* remove related cpp codes

* removed more mentions of init_score_filename params
Co-authored-by: default avatarNikita Titov <nekit94-08@mail.ru>
parent 18e7de4f
......@@ -591,24 +591,6 @@ IO Parameters
- **Note**: can be used only in CLI version
- ``initscore_filename`` :raw-html:`<a id="initscore_filename" title="Permalink to this parameter" href="#initscore_filename">&#x1F517;&#xFE0E;</a>`, default = ``""``, type = string, aliases: ``init_score_filename``, ``init_score_file``, ``init_score``, ``input_init_score``
- path of file with training initial scores
- if ``""``, will use ``train_data_file`` + ``.init`` (if exists)
- **Note**: works only in case of loading data directly from file
- ``valid_data_initscores`` :raw-html:`<a id="valid_data_initscores" title="Permalink to this parameter" href="#valid_data_initscores">&#x1F517;&#xFE0E;</a>`, default = ``""``, type = string, aliases: ``valid_data_init_scores``, ``valid_init_score_file``, ``valid_init_score``
- path(s) of file(s) with validation initial scores
- if ``""``, will use ``valid_data_file`` + ``.init`` (if exists)
- separate by ``,`` for multi-validation data
- **Note**: works only in case of loading data directly from file
- ``pre_partition`` :raw-html:`<a id="pre_partition" title="Permalink to this parameter" href="#pre_partition">&#x1F517;&#xFE0E;</a>`, default = ``false``, type = bool, aliases: ``is_pre_partition``
- used for parallel learning (excluding the ``feature_parallel`` mode)
......@@ -1077,11 +1059,9 @@ LightGBM supports continued training with initial scores. It uses an additional
It means the initial score of the first data row is ``0.5``, second is ``-0.1``, and so on.
The initial score file corresponds with data file line by line, and has one score per line.
And if the name of data file is ``train.txt``, the initial score file should be named as ``train.txt.init`` and in the same folder as the data file.
And if the name of data file is ``train.txt``, the initial score file should be named as ``train.txt.init`` and placed in the same folder as the data file.
In this case, LightGBM will auto load initial score file if it exists.
Otherwise, you should specify the path to the custom named file with initial scores by the ``initscore_filename`` `parameter <#initscore_filename>`__.
Weight Data
~~~~~~~~~~~
......
......@@ -97,9 +97,6 @@ output_model = LightGBM_model.txt
# output prediction file for predict task
# output_result= prediction.txt
# support continuous train from initial score file
# input_init_score= init_score.txt
# number of machines in parallel training, alias: num_machine
num_machines = 1
......@@ -110,5 +107,5 @@ local_listen_port = 12400
# machines list file for parallel training, alias: mlist
machine_list_file = mlist.txt
# # force splits
# force splits
# forced_splits = forced_splits.json
task = predict
data = rank.test
......
......@@ -102,9 +102,6 @@ output_model = LightGBM_model.txt
# output prediction file for predict task
# output_result= prediction.txt
# support continuous train from initial score file
# input_init_score= init_score.txt
# number of machines in parallel training, alias: num_machine
num_machines = 1
......
task = predict
data = binary.test
input_model= LightGBM_model.txt
......@@ -97,9 +97,6 @@ output_model = LightGBM_model.txt
# output prediction file for predict task
# output_result= prediction.txt
# support continuous train from initial score file
# input_init_score= init_score.txt
# number of machines in parallel training, alias: num_machine
num_machines = 2
......
task = predict
data = regression.test
......
......@@ -100,9 +100,6 @@ output_model = LightGBM_model.txt
# output prediction file for predict task
# output_result= prediction.txt
# support continuous train from initial score file
# input_init_score= init_score.txt
# number of machines in parallel training, alias: num_machine
num_machines = 1
......
......@@ -103,9 +103,6 @@ output_model = LightGBM_model.txt
# output prediction file for predict task
# output_result= prediction.txt
# support continuous train from initial score file
# input_init_score= init_score.txt
# number of machines in parallel training, alias: num_machine
num_machines = 1
......
......@@ -549,20 +549,6 @@ struct Config {
// desc = **Note**: can be used only in CLI version
std::string output_result = "LightGBM_predict_result.txt";
// alias = init_score_filename, init_score_file, init_score, input_init_score
// desc = path of file with training initial scores
// desc = if ``""``, will use ``train_data_file`` + ``.init`` (if exists)
// desc = **Note**: works only in case of loading data directly from file
std::string initscore_filename = "";
// alias = valid_data_init_scores, valid_init_score_file, valid_init_score
// default = ""
// desc = path(s) of file(s) with validation initial scores
// desc = if ``""``, will use ``valid_data_file`` + ``.init`` (if exists)
// desc = separate by ``,`` for multi-validation data
// desc = **Note**: works only in case of loading data directly from file
std::vector<std::string> valid_data_initscores;
// alias = is_pre_partition
// desc = used for parallel learning (excluding the ``feature_parallel`` mode)
// desc = ``true`` if training data are pre-partitioned, and different machines use different partitions
......
......@@ -47,9 +47,8 @@ class Metadata {
/*!
* \brief Initialization will load query level information, since it is needed for sampling data
* \param data_filename Filename of data
* \param init_score_filename Filename of initial score
*/
void Init(const char* data_filename, const char* initscore_file);
void Init(const char* data_filename);
/*!
* \brief init as subset
* \param metadata Source metadata to initialize the subset from
......@@ -213,7 +212,7 @@ class Metadata {
private:
/*! \brief Load initial scores from file */
void LoadInitialScore(const char* initscore_file);
void LoadInitialScore();
/*! \brief Load weights from file */
void LoadWeights();
/*! \brief Load query boundaries from file */
......
......@@ -19,13 +19,13 @@ class DatasetLoader {
LIGHTGBM_EXPORT ~DatasetLoader();
LIGHTGBM_EXPORT Dataset* LoadFromFile(const char* filename, const char* initscore_file, int rank, int num_machines);
LIGHTGBM_EXPORT Dataset* LoadFromFile(const char* filename, int rank, int num_machines);
LIGHTGBM_EXPORT Dataset* LoadFromFile(const char* filename, const char* initscore_file) {
return LoadFromFile(filename, initscore_file, 0, 1);
LIGHTGBM_EXPORT Dataset* LoadFromFile(const char* filename) {
return LoadFromFile(filename, 0, 1);
}
LIGHTGBM_EXPORT Dataset* LoadFromFileAlignWithOtherDataset(const char* filename, const char* initscore_file, const Dataset* train_data);
LIGHTGBM_EXPORT Dataset* LoadFromFileAlignWithOtherDataset(const char* filename, const Dataset* train_data);
LIGHTGBM_EXPORT Dataset* CostructFromSampleData(double** sample_values,
int** sample_indices, int num_col, const int* num_per_col,
......
......@@ -104,12 +104,10 @@ void Application::LoadData() {
if (config_.is_parallel_find_bin) {
// load data for parallel training
train_data_.reset(dataset_loader.LoadFromFile(config_.data.c_str(),
config_.initscore_filename.c_str(),
Network::rank(), Network::num_machines()));
} else {
// load data for single machine
train_data_.reset(dataset_loader.LoadFromFile(config_.data.c_str(), config_.initscore_filename.c_str(),
0, 1));
train_data_.reset(dataset_loader.LoadFromFile(config_.data.c_str(), 0, 1));
}
// need save binary file
if (config_.save_binary) {
......@@ -136,7 +134,6 @@ void Application::LoadData() {
auto new_dataset = std::unique_ptr<Dataset>(
dataset_loader.LoadFromFileAlignWithOtherDataset(
config_.valid[i].c_str(),
config_.valid_data_initscores[i].c_str(),
train_data_.get()));
valid_datas_.push_back(std::move(new_dataset));
// need save binary file
......@@ -228,8 +225,7 @@ void Application::Predict() {
}
DatasetLoader dataset_loader(config_, nullptr,
config_.num_class, config_.data.c_str());
train_data_.reset(dataset_loader.LoadFromFile(config_.data.c_str(), config_.initscore_filename.c_str(),
0, 1));
train_data_.reset(dataset_loader.LoadFromFile(config_.data.c_str(), 0, 1));
train_metric_.clear();
objective_fun_.reset(ObjectiveFunction::CreateObjectiveFunction(config_.objective,
config_));
......
......@@ -597,12 +597,12 @@ int LGBM_DatasetCreateFromFile(const char* filename,
DatasetLoader loader(config, nullptr, 1, filename);
if (reference == nullptr) {
if (Network::num_machines() == 1) {
*out = loader.LoadFromFile(filename, "");
*out = loader.LoadFromFile(filename);
} else {
*out = loader.LoadFromFile(filename, "", Network::rank(), Network::num_machines());
*out = loader.LoadFromFile(filename, Network::rank(), Network::num_machines());
}
} else {
*out = loader.LoadFromFileAlignWithOtherDataset(filename, "",
*out = loader.LoadFromFileAlignWithOtherDataset(filename,
reinterpret_cast<const Dataset*>(reference));
}
API_END();
......
......@@ -210,23 +210,16 @@ void Config::Set(const std::unordered_map<std::string, std::string>& params) {
// sort eval_at
std::sort(eval_at.begin(), eval_at.end());
if (valid_data_initscores.size() == 0 && valid.size() > 0) {
valid_data_initscores = std::vector<std::string>(valid.size(), "");
}
CHECK(valid.size() == valid_data_initscores.size());
if (valid_data_initscores.empty()) {
std::vector<std::string> new_valid;
for (size_t i = 0; i < valid.size(); ++i) {
if (valid[i] != data) {
// Only push the non-training data
new_valid.push_back(valid[i]);
} else {
is_provide_training_metric = true;
}
std::vector<std::string> new_valid;
for (size_t i = 0; i < valid.size(); ++i) {
if (valid[i] != data) {
// Only push the non-training data
new_valid.push_back(valid[i]);
} else {
is_provide_training_metric = true;
}
valid = new_valid;
}
valid = new_valid;
// check for conflicts
CheckParamConflict();
......
......@@ -110,13 +110,6 @@ const std::unordered_map<std::string, std::string>& Config::alias_table() {
{"prediction_name", "output_result"},
{"pred_name", "output_result"},
{"name_pred", "output_result"},
{"init_score_filename", "initscore_filename"},
{"init_score_file", "initscore_filename"},
{"init_score", "initscore_filename"},
{"input_init_score", "initscore_filename"},
{"valid_data_init_scores", "valid_data_initscores"},
{"valid_init_score_file", "valid_data_initscores"},
{"valid_init_score", "valid_data_initscores"},
{"is_pre_partition", "pre_partition"},
{"is_enable_bundle", "enable_bundle"},
{"bundle", "enable_bundle"},
......@@ -242,8 +235,6 @@ const std::unordered_set<std::string>& Config::parameter_set() {
"snapshot_freq",
"input_model",
"output_result",
"initscore_filename",
"valid_data_initscores",
"pre_partition",
"enable_bundle",
"use_missing",
......@@ -478,12 +469,6 @@ void Config::GetMembersFromString(const std::unordered_map<std::string, std::str
GetString(params, "output_result", &output_result);
GetString(params, "initscore_filename", &initscore_filename);
if (GetString(params, "valid_data_initscores", &tmp_str)) {
valid_data_initscores = Common::Split(tmp_str.c_str(), ',');
}
GetBool(params, "pre_partition", &pre_partition);
GetBool(params, "enable_bundle", &enable_bundle);
......@@ -668,8 +653,6 @@ std::string Config::SaveMembersToString() const {
str_buf << "[snapshot_freq: " << snapshot_freq << "]\n";
str_buf << "[input_model: " << input_model << "]\n";
str_buf << "[output_result: " << output_result << "]\n";
str_buf << "[initscore_filename: " << initscore_filename << "]\n";
str_buf << "[valid_data_initscores: " << Common::Join(valid_data_initscores, ",") << "]\n";
str_buf << "[pre_partition: " << pre_partition << "]\n";
str_buf << "[enable_bundle: " << enable_bundle << "]\n";
str_buf << "[use_missing: " << use_missing << "]\n";
......
......@@ -165,7 +165,7 @@ void DatasetLoader::SetHeader(const char* filename) {
}
}
Dataset* DatasetLoader::LoadFromFile(const char* filename, const char* initscore_file, int rank, int num_machines) {
Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_machines) {
// don't support query id in data file when training in parallel
if (num_machines > 1 && !config_.pre_partition) {
if (group_idx_ > 0) {
......@@ -184,7 +184,7 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, const char* initscore
}
dataset->data_filename_ = filename;
dataset->label_idx_ = label_idx_;
dataset->metadata_.Init(filename, initscore_file);
dataset->metadata_.Init(filename);
if (!config_.two_round) {
// read data to memory
auto text_data = LoadTextDataToMemory(filename, dataset->metadata_, rank, num_machines, &num_global_data, &used_data_indices);
......@@ -227,7 +227,7 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, const char* initscore
Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename, const char* initscore_file, const Dataset* train_data) {
Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename, const Dataset* train_data) {
data_size_t num_global_data = 0;
std::vector<data_size_t> used_data_indices;
auto dataset = std::unique_ptr<Dataset>(new Dataset());
......@@ -239,7 +239,7 @@ Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename,
}
dataset->data_filename_ = filename;
dataset->label_idx_ = label_idx_;
dataset->metadata_.Init(filename, initscore_file);
dataset->metadata_.Init(filename);
if (!config_.two_round) {
// read data in memory
auto text_data = LoadTextDataToMemory(filename, dataset->metadata_, 0, 1, &num_global_data, &used_data_indices);
......
......@@ -20,13 +20,13 @@ Metadata::Metadata() {
init_score_load_from_file_ = false;
}
void Metadata::Init(const char* data_filename, const char* initscore_file) {
void Metadata::Init(const char* data_filename) {
data_filename_ = data_filename;
// for lambdarank, it needs query data for partition data in parallel learning
LoadQueryBoundaries();
LoadWeights();
LoadQueryWeights();
LoadInitialScore(initscore_file);
LoadInitialScore();
}
Metadata::~Metadata() {
......@@ -386,14 +386,12 @@ void Metadata::LoadWeights() {
weight_load_from_file_ = true;
}
void Metadata::LoadInitialScore(const char* initscore_file) {
void Metadata::LoadInitialScore() {
num_init_score_ = 0;
std::string init_score_filename(initscore_file);
if (init_score_filename.size() <= 0) {
init_score_filename = std::string(data_filename_);
// default weight file name
init_score_filename.append(".init");
}
std::string init_score_filename(data_filename_);
init_score_filename = std::string(data_filename_);
// default init_score file name
init_score_filename.append(".init");
TextReader<size_t> reader(init_score_filename.c_str(), false);
reader.ReadAllLines();
if (reader.Lines().empty()) {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment