Fix some parameter hints when loading from binary file (#4701)

Co-authored-by: hzy46 <email@example.com>

Fix some parameter hints when loading from binary file (#4701)
Co-authored-by: hzy46 <email@example.com>
dc02dcaf · Zhiyuan He · GitHub · 51cd8fc2 · dc02dcaf · dc02dcaf
Unverified Commit dc02dcaf authored Oct 25, 2021 by Zhiyuan He Committed by GitHub Oct 25, 2021
Show whitespace changes
Inline Side-by-side

Showing with 26 additions and 13 deletions

docs/Parameters.rst docs/Parameters.rst +6 -6

include/LightGBM/config.h include/LightGBM/config.h +6 -6

src/io/dataset_loader.cpp src/io/dataset_loader.cpp +14 -1

No files found.
--- a/docs/Parameters.rst
+++ b/docs/Parameters.rst
@@ -746,13 +746,13 @@ Dataset Parameters

   -  by default, LightGBM will map data file to memory and load features from memory. This will provide faster data loading speed, but may cause run out of memory error when the data file is very big

-   -  **Note**: works only in case of loading data directly from file
+   -  **Note**: works only in case of loading data directly from text file

 -  ``header`` :raw-html:`<a id="header" title="Permalink to this parameter" href="#header">&#x1F517;&#xFE0E;</a>`, default = ``false``, type = bool, aliases: ``has_header``

   -  set this to ``true`` if input data has header

-   -  **Note**: works only in case of loading data directly from file
+   -  **Note**: works only in case of loading data directly from text file

 -  ``label_column`` :raw-html:`<a id="label_column" title="Permalink to this parameter" href="#label_column">&#x1F517;&#xFE0E;</a>`, default = ``""``, type = int or string, aliases: ``label``

@@ -764,7 +764,7 @@ Dataset Parameters

   -  if omitted, the first column in the training data is used as the label

-   -  **Note**: works only in case of loading data directly from file
+   -  **Note**: works only in case of loading data directly from text file

 -  ``weight_column`` :raw-html:`<a id="weight_column" title="Permalink to this parameter" href="#weight_column">&#x1F517;&#xFE0E;</a>`, default = ``""``, type = int or string, aliases: ``weight``

@@ -774,7 +774,7 @@ Dataset Parameters

   -  add a prefix ``name:`` for column name, e.g. ``weight=name:weight``

-   -  **Note**: works only in case of loading data directly from file
+   -  **Note**: works only in case of loading data directly from text file

   -  **Note**: index starts from ``0`` and it doesn't count the label column when passing type is ``int``, e.g. when label is column\_0, and weight is column\_1, the correct parameter is ``weight=0``

@@ -786,7 +786,7 @@ Dataset Parameters

   -  add a prefix ``name:`` for column name, e.g. ``query=name:query_id``

-   -  **Note**: works only in case of loading data directly from file
+   -  **Note**: works only in case of loading data directly from text file

   -  **Note**: data should be grouped by query\_id, for more information, see `Query Data <#query-data>`__

@@ -800,7 +800,7 @@ Dataset Parameters

   -  add a prefix ``name:`` for column name, e.g. ``ignore_column=name:c1,c2,c3`` means c1, c2 and c3 will be ignored

-   -  **Note**: works only in case of loading data directly from file
+   -  **Note**: works only in case of loading data directly from text file

   -  **Note**: index starts from ``0`` and it doesn't count the label column when passing type is ``int``


--- a/include/LightGBM/config.h
+++ b/include/LightGBM/config.h
@@ -645,12 +645,12 @@ struct Config {
  // alias = two_round_loading, use_two_round_loading
  // desc = set this to ``true`` if data file is too big to fit in memory
  // desc = by default, LightGBM will map data file to memory and load features from memory. This will provide faster data loading speed, but may cause run out of memory error when the data file is very big
-  // desc = **Note**: works only in case of loading data directly from file
+  // desc = **Note**: works only in case of loading data directly from text file
  bool two_round = false;

  // alias = has_header
  // desc = set this to ``true`` if input data has header
-  // desc = **Note**: works only in case of loading data directly from file
+  // desc = **Note**: works only in case of loading data directly from text file
  bool header = false;

  // type = int or string
@@ -659,7 +659,7 @@ struct Config {
  // desc = use number for index, e.g. ``label=0`` means column\_0 is the label
  // desc = add a prefix ``name:`` for column name, e.g. ``label=name:is_click``
  // desc = if omitted, the first column in the training data is used as the label
-  // desc = **Note**: works only in case of loading data directly from file
+  // desc = **Note**: works only in case of loading data directly from text file
  std::string label_column = "";

  // type = int or string
@@ -667,7 +667,7 @@ struct Config {
  // desc = used to specify the weight column
  // desc = use number for index, e.g. ``weight=0`` means column\_0 is the weight
  // desc = add a prefix ``name:`` for column name, e.g. ``weight=name:weight``
-  // desc = **Note**: works only in case of loading data directly from file
+  // desc = **Note**: works only in case of loading data directly from text file
  // desc = **Note**: index starts from ``0`` and it doesn't count the label column when passing type is ``int``, e.g. when label is column\_0, and weight is column\_1, the correct parameter is ``weight=0``
  std::string weight_column = "";

@@ -676,7 +676,7 @@ struct Config {
  // desc = used to specify the query/group id column
  // desc = use number for index, e.g. ``query=0`` means column\_0 is the query id
  // desc = add a prefix ``name:`` for column name, e.g. ``query=name:query_id``
-  // desc = **Note**: works only in case of loading data directly from file
+  // desc = **Note**: works only in case of loading data directly from text file
  // desc = **Note**: data should be grouped by query\_id, for more information, see `Query Data <#query-data>`__
  // desc = **Note**: index starts from ``0`` and it doesn't count the label column when passing type is ``int``, e.g. when label is column\_0 and query\_id is column\_1, the correct parameter is ``query=0``
  std::string group_column = "";
@@ -686,7 +686,7 @@ struct Config {
  // desc = used to specify some ignoring columns in training
  // desc = use number for index, e.g. ``ignore_column=0,1,2`` means column\_0, column\_1 and column\_2 will be ignored
  // desc = add a prefix ``name:`` for column name, e.g. ``ignore_column=name:c1,c2,c3`` means c1, c2 and c3 will be ignored
-  // desc = **Note**: works only in case of loading data directly from file
+  // desc = **Note**: works only in case of loading data directly from text file
  // desc = **Note**: index starts from ``0`` and it doesn't count the label column when passing type is ``int``
  // desc = **Note**: despite the fact that specified columns will be completely ignored during the training, they still should have a valid format allowing LightGBM to load file successfully
  std::string ignore_column = "";

--- a/src/io/dataset_loader.cpp
+++ b/src/io/dataset_loader.cpp
@@ -35,7 +35,7 @@ DatasetLoader::~DatasetLoader() {
 void DatasetLoader::SetHeader(const char* filename) {
  std::unordered_map<std::string, int> name2idx;
  std::string name_prefix("name:");
-  if (filename != nullptr) {
+  if (filename != nullptr && CheckCanLoadFromBin(filename) == "") {
    TextReader<data_size_t> text_reader(filename, config_.header);

    // get column names
@@ -837,6 +837,19 @@ void DatasetLoader::CheckDataset(const Dataset* dataset, bool is_load_from_binar
    } else {
      Log::Info("Recommend use integer for label index when loading data from binary for sanity check.");
    }
+
+    if (config_.label_column != "") {
+      Log::Warning("Config label_column works only in case of loading data directly from text file. It will be ignored when loading from binary file.");
+    }
+    if (config_.weight_column != "") {
+      Log::Warning("Config weight_column works only in case of loading data directly from text file. It will be ignored when loading from binary file.");
+    }
+    if (config_.group_column != "") {
+      Log::Warning("Config group_column works only in case of loading data directly from text file. It will be ignored when loading from binary file.");
+    }
+    if (config_.ignore_column != "") {
+      Log::Warning("Config ignore_column works only in case of loading data directly from text file. It will be ignored when loading from binary file.");
+    }
  }
 }