Unverified commit 057ba078, authored by Nikita Titov, committed by GitHub

[docs] Document rounding behavior of floating point numbers in categorical features (#5009)

parent d31346f6
......@@ -25,6 +25,7 @@ Categorical Feature Support
- Categorical features must be encoded as non-negative integers (``int``) less than ``Int32.MaxValue`` (2147483647).
  It is best to use a contiguous range of integers starting from zero.
Floating point numbers in categorical features will be rounded towards 0.
- Use ``min_data_per_group``, ``cat_smooth`` to deal with over-fitting (when ``#data`` is small or ``#category`` is large).
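The rounding rule documented above ("rounded towards 0") is truncation, not rounding to the nearest integer. A minimal sketch of the documented behavior, using a hypothetical helper name (`encode_categorical` is for illustration only, not a LightGBM API):

```python
import math

# Sketch of how LightGBM treats float values in categorical features
# per the docs added in this commit: the value is rounded towards 0
# (truncated), and negative results are treated as missing values.
# `encode_categorical` is a hypothetical name, not part of LightGBM.
def encode_categorical(value: float):
    code = math.trunc(value)  # rounds towards 0: 2.9 -> 2, -0.5 -> 0
    return None if code < 0 else code  # None stands in for "missing"

print(encode_categorical(2.9))   # truncates to category 2
print(encode_categorical(-0.5))  # truncates to category 0
print(encode_categorical(-1.5))  # truncates to -1 -> negative -> missing
```

Note that a value such as ``-0.5`` truncates to ``0`` and is therefore a valid category, while ``-1.5`` truncates to ``-1`` and is treated as missing.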
......
......@@ -1159,6 +1159,7 @@ class Dataset:
Large values could be memory consuming. Consider using consecutive integers starting from zero.
All negative values in categorical features will be treated as missing values.
The output cannot be monotonically constrained with respect to a categorical feature.
Floating point numbers in categorical features will be rounded towards 0.
params : dict or None, optional (default=None)
Other parameters for Dataset.
free_raw_data : bool, optional (default=True)
......@@ -3563,6 +3564,7 @@ class Booster:
Large values could be memory consuming. Consider using consecutive integers starting from zero.
All negative values in categorical features will be treated as missing values.
The output cannot be monotonically constrained with respect to a categorical feature.
Floating point numbers in categorical features will be rounded towards 0.
dataset_params : dict or None, optional (default=None)
Other parameters for Dataset ``data``.
free_raw_data : bool, optional (default=True)
......
......@@ -109,6 +109,7 @@ def train(
Large values could be memory consuming. Consider using consecutive integers starting from zero.
All negative values in categorical features will be treated as missing values.
The output cannot be monotonically constrained with respect to a categorical feature.
Floating point numbers in categorical features will be rounded towards 0.
keep_training_booster : bool, optional (default=False)
Whether the returned Booster will be used to keep training.
If False, the returned value will be converted into _InnerPredictor before returning.
......@@ -463,6 +464,7 @@ def cv(params, train_set, num_boost_round=100,
Large values could be memory consuming. Consider using consecutive integers starting from zero.
All negative values in categorical features will be treated as missing values.
The output cannot be monotonically constrained with respect to a categorical feature.
Floating point numbers in categorical features will be rounded towards 0.
fpreproc : callable or None, optional (default=None)
Preprocessing function that takes (dtrain, dtest, params)
and returns transformed versions of those.
......
......@@ -262,6 +262,7 @@ _lgbmmodel_doc_fit = (
Large values could be memory consuming. Consider using consecutive integers starting from zero.
All negative values in categorical features will be treated as missing values.
The output cannot be monotonically constrained with respect to a categorical feature.
Floating point numbers in categorical features will be rounded towards 0.
callbacks : list of callable, or None, optional (default=None)
List of callback functions that are applied at each iteration.
See Callbacks in Python API for more information.
......