Unverified Commit dc02dcaf authored by Zhiyuan He, committed by GitHub
Browse files

Fix some parameter hints when loading from binary file (#4701)


Co-authored-by: hzy46 <email@example.com>
parent 51cd8fc2
...@@ -746,13 +746,13 @@ Dataset Parameters ...@@ -746,13 +746,13 @@ Dataset Parameters
- by default, LightGBM will map data file to memory and load features from memory. This will provide faster data loading speed, but may cause run out of memory error when the data file is very big - by default, LightGBM will map data file to memory and load features from memory. This will provide faster data loading speed, but may cause run out of memory error when the data file is very big
- **Note**: works only in case of loading data directly from file - **Note**: works only in case of loading data directly from text file
- ``header`` :raw-html:`<a id="header" title="Permalink to this parameter" href="#header">&#x1F517;&#xFE0E;</a>`, default = ``false``, type = bool, aliases: ``has_header`` - ``header`` :raw-html:`<a id="header" title="Permalink to this parameter" href="#header">&#x1F517;&#xFE0E;</a>`, default = ``false``, type = bool, aliases: ``has_header``
- set this to ``true`` if input data has header - set this to ``true`` if input data has header
- **Note**: works only in case of loading data directly from file - **Note**: works only in case of loading data directly from text file
- ``label_column`` :raw-html:`<a id="label_column" title="Permalink to this parameter" href="#label_column">&#x1F517;&#xFE0E;</a>`, default = ``""``, type = int or string, aliases: ``label`` - ``label_column`` :raw-html:`<a id="label_column" title="Permalink to this parameter" href="#label_column">&#x1F517;&#xFE0E;</a>`, default = ``""``, type = int or string, aliases: ``label``
...@@ -764,7 +764,7 @@ Dataset Parameters ...@@ -764,7 +764,7 @@ Dataset Parameters
- if omitted, the first column in the training data is used as the label - if omitted, the first column in the training data is used as the label
- **Note**: works only in case of loading data directly from file - **Note**: works only in case of loading data directly from text file
- ``weight_column`` :raw-html:`<a id="weight_column" title="Permalink to this parameter" href="#weight_column">&#x1F517;&#xFE0E;</a>`, default = ``""``, type = int or string, aliases: ``weight`` - ``weight_column`` :raw-html:`<a id="weight_column" title="Permalink to this parameter" href="#weight_column">&#x1F517;&#xFE0E;</a>`, default = ``""``, type = int or string, aliases: ``weight``
...@@ -774,7 +774,7 @@ Dataset Parameters ...@@ -774,7 +774,7 @@ Dataset Parameters
- add a prefix ``name:`` for column name, e.g. ``weight=name:weight`` - add a prefix ``name:`` for column name, e.g. ``weight=name:weight``
- **Note**: works only in case of loading data directly from file - **Note**: works only in case of loading data directly from text file
- **Note**: index starts from ``0`` and it doesn't count the label column when passing type is ``int``, e.g. when label is column\_0, and weight is column\_1, the correct parameter is ``weight=0`` - **Note**: index starts from ``0`` and it doesn't count the label column when passing type is ``int``, e.g. when label is column\_0, and weight is column\_1, the correct parameter is ``weight=0``
...@@ -786,7 +786,7 @@ Dataset Parameters ...@@ -786,7 +786,7 @@ Dataset Parameters
- add a prefix ``name:`` for column name, e.g. ``query=name:query_id`` - add a prefix ``name:`` for column name, e.g. ``query=name:query_id``
- **Note**: works only in case of loading data directly from file - **Note**: works only in case of loading data directly from text file
- **Note**: data should be grouped by query\_id, for more information, see `Query Data <#query-data>`__ - **Note**: data should be grouped by query\_id, for more information, see `Query Data <#query-data>`__
...@@ -800,7 +800,7 @@ Dataset Parameters ...@@ -800,7 +800,7 @@ Dataset Parameters
- add a prefix ``name:`` for column name, e.g. ``ignore_column=name:c1,c2,c3`` means c1, c2 and c3 will be ignored - add a prefix ``name:`` for column name, e.g. ``ignore_column=name:c1,c2,c3`` means c1, c2 and c3 will be ignored
- **Note**: works only in case of loading data directly from file - **Note**: works only in case of loading data directly from text file
- **Note**: index starts from ``0`` and it doesn't count the label column when passing type is ``int`` - **Note**: index starts from ``0`` and it doesn't count the label column when passing type is ``int``
......
...@@ -645,12 +645,12 @@ struct Config { ...@@ -645,12 +645,12 @@ struct Config {
// alias = two_round_loading, use_two_round_loading // alias = two_round_loading, use_two_round_loading
// desc = set this to ``true`` if data file is too big to fit in memory // desc = set this to ``true`` if data file is too big to fit in memory
// desc = by default, LightGBM will map data file to memory and load features from memory. This will provide faster data loading speed, but may cause run out of memory error when the data file is very big // desc = by default, LightGBM will map data file to memory and load features from memory. This will provide faster data loading speed, but may cause run out of memory error when the data file is very big
// desc = **Note**: works only in case of loading data directly from file // desc = **Note**: works only in case of loading data directly from text file
bool two_round = false; bool two_round = false;
// alias = has_header // alias = has_header
// desc = set this to ``true`` if input data has header // desc = set this to ``true`` if input data has header
// desc = **Note**: works only in case of loading data directly from file // desc = **Note**: works only in case of loading data directly from text file
bool header = false; bool header = false;
// type = int or string // type = int or string
...@@ -659,7 +659,7 @@ struct Config { ...@@ -659,7 +659,7 @@ struct Config {
// desc = use number for index, e.g. ``label=0`` means column\_0 is the label // desc = use number for index, e.g. ``label=0`` means column\_0 is the label
// desc = add a prefix ``name:`` for column name, e.g. ``label=name:is_click`` // desc = add a prefix ``name:`` for column name, e.g. ``label=name:is_click``
// desc = if omitted, the first column in the training data is used as the label // desc = if omitted, the first column in the training data is used as the label
// desc = **Note**: works only in case of loading data directly from file // desc = **Note**: works only in case of loading data directly from text file
std::string label_column = ""; std::string label_column = "";
// type = int or string // type = int or string
...@@ -667,7 +667,7 @@ struct Config { ...@@ -667,7 +667,7 @@ struct Config {
// desc = used to specify the weight column // desc = used to specify the weight column
// desc = use number for index, e.g. ``weight=0`` means column\_0 is the weight // desc = use number for index, e.g. ``weight=0`` means column\_0 is the weight
// desc = add a prefix ``name:`` for column name, e.g. ``weight=name:weight`` // desc = add a prefix ``name:`` for column name, e.g. ``weight=name:weight``
// desc = **Note**: works only in case of loading data directly from file // desc = **Note**: works only in case of loading data directly from text file
// desc = **Note**: index starts from ``0`` and it doesn't count the label column when passing type is ``int``, e.g. when label is column\_0, and weight is column\_1, the correct parameter is ``weight=0`` // desc = **Note**: index starts from ``0`` and it doesn't count the label column when passing type is ``int``, e.g. when label is column\_0, and weight is column\_1, the correct parameter is ``weight=0``
std::string weight_column = ""; std::string weight_column = "";
...@@ -676,7 +676,7 @@ struct Config { ...@@ -676,7 +676,7 @@ struct Config {
// desc = used to specify the query/group id column // desc = used to specify the query/group id column
// desc = use number for index, e.g. ``query=0`` means column\_0 is the query id // desc = use number for index, e.g. ``query=0`` means column\_0 is the query id
// desc = add a prefix ``name:`` for column name, e.g. ``query=name:query_id`` // desc = add a prefix ``name:`` for column name, e.g. ``query=name:query_id``
// desc = **Note**: works only in case of loading data directly from file // desc = **Note**: works only in case of loading data directly from text file
// desc = **Note**: data should be grouped by query\_id, for more information, see `Query Data <#query-data>`__ // desc = **Note**: data should be grouped by query\_id, for more information, see `Query Data <#query-data>`__
// desc = **Note**: index starts from ``0`` and it doesn't count the label column when passing type is ``int``, e.g. when label is column\_0 and query\_id is column\_1, the correct parameter is ``query=0`` // desc = **Note**: index starts from ``0`` and it doesn't count the label column when passing type is ``int``, e.g. when label is column\_0 and query\_id is column\_1, the correct parameter is ``query=0``
std::string group_column = ""; std::string group_column = "";
...@@ -686,7 +686,7 @@ struct Config { ...@@ -686,7 +686,7 @@ struct Config {
// desc = used to specify some ignoring columns in training // desc = used to specify some ignoring columns in training
// desc = use number for index, e.g. ``ignore_column=0,1,2`` means column\_0, column\_1 and column\_2 will be ignored // desc = use number for index, e.g. ``ignore_column=0,1,2`` means column\_0, column\_1 and column\_2 will be ignored
// desc = add a prefix ``name:`` for column name, e.g. ``ignore_column=name:c1,c2,c3`` means c1, c2 and c3 will be ignored // desc = add a prefix ``name:`` for column name, e.g. ``ignore_column=name:c1,c2,c3`` means c1, c2 and c3 will be ignored
// desc = **Note**: works only in case of loading data directly from file // desc = **Note**: works only in case of loading data directly from text file
// desc = **Note**: index starts from ``0`` and it doesn't count the label column when passing type is ``int`` // desc = **Note**: index starts from ``0`` and it doesn't count the label column when passing type is ``int``
// desc = **Note**: despite the fact that specified columns will be completely ignored during the training, they still should have a valid format allowing LightGBM to load file successfully // desc = **Note**: despite the fact that specified columns will be completely ignored during the training, they still should have a valid format allowing LightGBM to load file successfully
std::string ignore_column = ""; std::string ignore_column = "";
......
...@@ -35,7 +35,7 @@ DatasetLoader::~DatasetLoader() { ...@@ -35,7 +35,7 @@ DatasetLoader::~DatasetLoader() {
void DatasetLoader::SetHeader(const char* filename) { void DatasetLoader::SetHeader(const char* filename) {
std::unordered_map<std::string, int> name2idx; std::unordered_map<std::string, int> name2idx;
std::string name_prefix("name:"); std::string name_prefix("name:");
if (filename != nullptr) { if (filename != nullptr && CheckCanLoadFromBin(filename) == "") {
TextReader<data_size_t> text_reader(filename, config_.header); TextReader<data_size_t> text_reader(filename, config_.header);
// get column names // get column names
...@@ -837,6 +837,19 @@ void DatasetLoader::CheckDataset(const Dataset* dataset, bool is_load_from_binar ...@@ -837,6 +837,19 @@ void DatasetLoader::CheckDataset(const Dataset* dataset, bool is_load_from_binar
} else { } else {
Log::Info("Recommend use integer for label index when loading data from binary for sanity check."); Log::Info("Recommend use integer for label index when loading data from binary for sanity check.");
} }
if (config_.label_column != "") {
Log::Warning("Config label_column works only in case of loading data directly from text file. It will be ignored when loading from binary file.");
}
if (config_.weight_column != "") {
Log::Warning("Config weight_column works only in case of loading data directly from text file. It will be ignored when loading from binary file.");
}
if (config_.group_column != "") {
Log::Warning("Config group_column works only in case of loading data directly from text file. It will be ignored when loading from binary file.");
}
if (config_.ignore_column != "") {
Log::Warning("Config ignore_column works only in case of loading data directly from text file. It will be ignored when loading from binary file.");
}
} }
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment