Unverified Commit 111d0c80 authored by Chen Yufei's avatar Chen Yufei Committed by GitHub
Browse files

Add new task type: "save_binary" (#3651)

* Add new task type: "save_binary".

* Document for task "save_binary".
parent c3ac77b5
...@@ -49,6 +49,8 @@ Core Parameters ...@@ -49,6 +49,8 @@ Core Parameters
- ``refit``, for refitting existing models with new data, aliases: ``refit_tree`` - ``refit``, for refitting existing models with new data, aliases: ``refit_tree``
- ``save_binary``, load train (and validation) data then save dataset to binary file. Typical usage: ``save_binary`` first, then run multiple ``train`` tasks in parallel using the saved binary file
- **Note**: can be used only in CLI version; for language-specific packages you can use the correspondent functions - **Note**: can be used only in CLI version; for language-specific packages you can use the correspondent functions
- ``objective`` :raw-html:`<a id="objective" title="Permalink to this parameter" href="#objective">&#x1F517;&#xFE0E;</a>`, default = ``regression``, type = enum, options: ``regression``, ``regression_l1``, ``huber``, ``fair``, ``poisson``, ``quantile``, ``mape``, ``gamma``, ``tweedie``, ``binary``, ``multiclass``, ``multiclassova``, ``cross_entropy``, ``cross_entropy_lambda``, ``lambdarank``, ``rank_xendcg``, aliases: ``objective_type``, ``app``, ``application`` - ``objective`` :raw-html:`<a id="objective" title="Permalink to this parameter" href="#objective">&#x1F517;&#xFE0E;</a>`, default = ``regression``, type = enum, options: ``regression``, ``regression_l1``, ``huber``, ``fair``, ``poisson``, ``quantile``, ``mape``, ``gamma``, ``tweedie``, ``binary``, ``multiclass``, ``multiclassova``, ``cross_entropy``, ``cross_entropy_lambda``, ``lambdarank``, ``rank_xendcg``, aliases: ``objective_type``, ``app``, ``application``
......
...@@ -27,7 +27,7 @@ namespace LightGBM { ...@@ -27,7 +27,7 @@ namespace LightGBM {
/*! \brief Types of tasks the CLI application can run. */
enum TaskType {
  // Order matters: enumerator values are implicit (kTrain == 0, ...).
  // NOTE(review): "KRefitTree" keeps its historical capital-K spelling for
  // source compatibility with existing references (e.g. GetTaskType).
  kTrain, kPredict, kConvertModel, KRefitTree, kSaveBinary
};

/*! \brief Default number of leaves for tree models (alias of num_leaves). */
const int kDefaultNumLeaves = 31;
...@@ -102,6 +102,7 @@ struct Config { ...@@ -102,6 +102,7 @@ struct Config {
// desc = ``predict``, for prediction, aliases: ``prediction``, ``test`` // desc = ``predict``, for prediction, aliases: ``prediction``, ``test``
// desc = ``convert_model``, for converting model file into if-else format, see more information in `Convert Parameters <#convert-parameters>`__ // desc = ``convert_model``, for converting model file into if-else format, see more information in `Convert Parameters <#convert-parameters>`__
// desc = ``refit``, for refitting existing models with new data, aliases: ``refit_tree`` // desc = ``refit``, for refitting existing models with new data, aliases: ``refit_tree``
// desc = ``save_binary``, load train (and validation) data then save dataset to binary file. Typical usage: ``save_binary`` first, then run multiple ``train`` tasks in parallel using the saved binary file
// desc = **Note**: can be used only in CLI version; for language-specific packages you can use the correspondent functions // desc = **Note**: can be used only in CLI version; for language-specific packages you can use the correspondent functions
TaskType task = TaskType::kTrain; TaskType task = TaskType::kTrain;
......
...@@ -187,6 +187,10 @@ void Application::InitTrain() { ...@@ -187,6 +187,10 @@ void Application::InitTrain() {
config_)); config_));
// load training data // load training data
LoadData(); LoadData();
if (config_.task == TaskType::kSaveBinary) {
Log::Info("Save data as binary finished, exit");
exit(0);
}
// initialize the objective function // initialize the objective function
objective_fun_->Init(train_data_->metadata(), train_data_->num_data()); objective_fun_->Init(train_data_->metadata(), train_data_->num_data());
// initialize the boosting // initialize the boosting
......
...@@ -113,6 +113,8 @@ void GetTaskType(const std::unordered_map<std::string, std::string>& params, Tas ...@@ -113,6 +113,8 @@ void GetTaskType(const std::unordered_map<std::string, std::string>& params, Tas
*task = TaskType::kConvertModel; *task = TaskType::kConvertModel;
} else if (value == std::string("refit") || value == std::string("refit_tree")) { } else if (value == std::string("refit") || value == std::string("refit_tree")) {
*task = TaskType::KRefitTree; *task = TaskType::KRefitTree;
} else if (value == std::string("save_binary")) {
*task = TaskType::kSaveBinary;
} else { } else {
Log::Fatal("Unknown task type %s", value.c_str()); Log::Fatal("Unknown task type %s", value.c_str());
} }
...@@ -234,6 +236,11 @@ void Config::Set(const std::unordered_map<std::string, std::string>& params) { ...@@ -234,6 +236,11 @@ void Config::Set(const std::unordered_map<std::string, std::string>& params) {
} }
valid = new_valid; valid = new_valid;
if ((task == TaskType::kSaveBinary) && !save_binary) {
Log::Info("save_binary parameter set to true because task is save_binary");
save_binary = true;
}
if (verbosity == 1) { if (verbosity == 1) {
LightGBM::Log::ResetLogLevel(LightGBM::LogLevel::Info); LightGBM::Log::ResetLogLevel(LightGBM::LogLevel::Info);
} else if (verbosity == 0) { } else if (verbosity == 0) {
......
...@@ -10,6 +10,7 @@ ...@@ -10,6 +10,7 @@
#include <LightGBM/utils/log.h> #include <LightGBM/utils/log.h>
#include <LightGBM/utils/openmp_wrapper.h> #include <LightGBM/utils/openmp_wrapper.h>
#include <chrono>
#include <fstream> #include <fstream>
namespace LightGBM { namespace LightGBM {
...@@ -237,7 +238,7 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac ...@@ -237,7 +238,7 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac
} }
// initialize label // initialize label
dataset->metadata_.Init(dataset->num_data_, weight_idx_, group_idx_); dataset->metadata_.Init(dataset->num_data_, weight_idx_, group_idx_);
Log::Debug("Making second pass..."); Log::Info("Making second pass...");
// extract features // extract features
ExtractFeaturesFromFile(filename, parser.get(), used_data_indices, dataset.get()); ExtractFeaturesFromFile(filename, parser.get(), used_data_indices, dataset.get());
} }
...@@ -950,6 +951,7 @@ std::vector<std::string> DatasetLoader::SampleTextDataFromFile(const char* filen ...@@ -950,6 +951,7 @@ std::vector<std::string> DatasetLoader::SampleTextDataFromFile(const char* filen
void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines, void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
const std::vector<std::string>& sample_data, const std::vector<std::string>& sample_data,
const Parser* parser, Dataset* dataset) { const Parser* parser, Dataset* dataset) {
auto t1 = std::chrono::high_resolution_clock::now();
std::vector<std::vector<double>> sample_values; std::vector<std::vector<double>> sample_values;
std::vector<std::vector<int>> sample_indices; std::vector<std::vector<int>> sample_indices;
std::vector<std::pair<int, double>> oneline_features; std::vector<std::pair<int, double>> oneline_features;
...@@ -1127,6 +1129,10 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines, ...@@ -1127,6 +1129,10 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
if (dataset->has_raw()) { if (dataset->has_raw()) {
dataset->ResizeRaw(static_cast<int>(sample_data.size())); dataset->ResizeRaw(static_cast<int>(sample_data.size()));
} }
auto t2 = std::chrono::high_resolution_clock::now();
  Log::Info("Construct bin mappers from text data time %.2f seconds",
            std::chrono::duration<double, std::milli>(t2 - t1).count() * 1e-3);
} }
/*! \brief Extract local features from memory */ /*! \brief Extract local features from memory */
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment