"tests/vscode:/vscode.git/clone" did not exist on "dd54a4b026455f728f9d5945eca369b2be7b12f9"
Unverified commit 111d0c80, authored by Chen Yufei, committed by GitHub
Browse files

Add new task type: "save_binary" (#3651)

* Add new task type: "save_binary".

* Document for task "save_binary".
parent c3ac77b5
......@@ -49,6 +49,8 @@ Core Parameters
- ``refit``, for refitting existing models with new data, aliases: ``refit_tree``
- ``save_binary``, load train (and validation) data then save dataset to binary file. Typical usage: ``save_binary`` first, then run multiple ``train`` tasks in parallel using the saved binary file
- **Note**: can be used only in the CLI version; for language-specific packages you can use the corresponding functions
- ``objective`` :raw-html:`<a id="objective" title="Permalink to this parameter" href="#objective">&#x1F517;&#xFE0E;</a>`, default = ``regression``, type = enum, options: ``regression``, ``regression_l1``, ``huber``, ``fair``, ``poisson``, ``quantile``, ``mape``, ``gamma``, ``tweedie``, ``binary``, ``multiclass``, ``multiclassova``, ``cross_entropy``, ``cross_entropy_lambda``, ``lambdarank``, ``rank_xendcg``, aliases: ``objective_type``, ``app``, ``application``
......
......@@ -27,7 +27,7 @@ namespace LightGBM {
/*! \brief Types of tasks */
enum TaskType {
kTrain, kPredict, kConvertModel, KRefitTree
kTrain, kPredict, kConvertModel, KRefitTree, kSaveBinary
};
const int kDefaultNumLeaves = 31;
......@@ -102,6 +102,7 @@ struct Config {
// desc = ``predict``, for prediction, aliases: ``prediction``, ``test``
// desc = ``convert_model``, for converting model file into if-else format, see more information in `Convert Parameters <#convert-parameters>`__
// desc = ``refit``, for refitting existing models with new data, aliases: ``refit_tree``
// desc = ``save_binary``, load train (and validation) data then save dataset to binary file. Typical usage: ``save_binary`` first, then run multiple ``train`` tasks in parallel using the saved binary file
// desc = **Note**: can be used only in the CLI version; for language-specific packages you can use the corresponding functions
TaskType task = TaskType::kTrain;
......
......@@ -187,6 +187,10 @@ void Application::InitTrain() {
config_));
// load training data
LoadData();
if (config_.task == TaskType::kSaveBinary) {
Log::Info("Save data as binary finished, exit");
exit(0);
}
// initialize the objective function
objective_fun_->Init(train_data_->metadata(), train_data_->num_data());
// initialize the boosting
......
......@@ -113,6 +113,8 @@ void GetTaskType(const std::unordered_map<std::string, std::string>& params, Tas
*task = TaskType::kConvertModel;
} else if (value == std::string("refit") || value == std::string("refit_tree")) {
*task = TaskType::KRefitTree;
} else if (value == std::string("save_binary")) {
*task = TaskType::kSaveBinary;
} else {
Log::Fatal("Unknown task type %s", value.c_str());
}
......@@ -234,6 +236,11 @@ void Config::Set(const std::unordered_map<std::string, std::string>& params) {
}
valid = new_valid;
if ((task == TaskType::kSaveBinary) && !save_binary) {
Log::Info("save_binary parameter set to true because task is save_binary");
save_binary = true;
}
if (verbosity == 1) {
LightGBM::Log::ResetLogLevel(LightGBM::LogLevel::Info);
} else if (verbosity == 0) {
......
......@@ -10,6 +10,7 @@
#include <LightGBM/utils/log.h>
#include <LightGBM/utils/openmp_wrapper.h>
#include <chrono>
#include <fstream>
namespace LightGBM {
......@@ -237,7 +238,7 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac
}
// initialize label
dataset->metadata_.Init(dataset->num_data_, weight_idx_, group_idx_);
Log::Debug("Making second pass...");
Log::Info("Making second pass...");
// extract features
ExtractFeaturesFromFile(filename, parser.get(), used_data_indices, dataset.get());
}
......@@ -950,6 +951,7 @@ std::vector<std::string> DatasetLoader::SampleTextDataFromFile(const char* filen
void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
const std::vector<std::string>& sample_data,
const Parser* parser, Dataset* dataset) {
auto t1 = std::chrono::high_resolution_clock::now();
std::vector<std::vector<double>> sample_values;
std::vector<std::vector<int>> sample_indices;
std::vector<std::pair<int, double>> oneline_features;
......@@ -1127,6 +1129,10 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
if (dataset->has_raw()) {
dataset->ResizeRaw(static_cast<int>(sample_data.size()));
}
auto t2 = std::chrono::high_resolution_clock::now();
Log::Info("Construct bin mappers from text data time %.2f seconds",
std::chrono::duration<double, std::milli>(t2 - t1) * 1e-3);
}
/*! \brief Extract local features from memory */
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment