Unverified commit 06ed4337 authored by James Lamb, committed by GitHub
Browse files

[dask] [docs] Fix inaccuracies in API docs for Dask module (fixes #3871) (#3930)



* got fit() working

* add predict()

* predict_proba()

* remove custom objective docs

* Apply suggestions from code review
Co-authored-by: Nikita Titov <nekit94-08@mail.ru>

* fix capitalization

* Update tests/python_package_test/test_dask.py

Co-authored-by: Nikita Titov <nekit94-08@mail.ru>
parent 846b512d
# coding: utf-8
"""Distributed training with LightGBM and Dask.distributed.
"""Distributed training with LightGBM and dask.distributed.
This module enables you to perform distributed training with LightGBM on
Dask.Array and Dask.DataFrame collections.
dask.Array and dask.DataFrame collections.
It is based on dask-lightgbm, which was based on dask-xgboost.
"""
......@@ -19,7 +19,14 @@ from .basic import _choose_param_value, _ConfigAliases, _LIB, _log_warning, _saf
from .compat import (PANDAS_INSTALLED, pd_DataFrame, pd_Series, concat,
SKLEARN_INSTALLED, LGBMNotFittedError,
DASK_INSTALLED, dask_DataFrame, dask_Array, dask_Series, delayed, Client, default_client, get_worker, wait)
from .sklearn import LGBMClassifier, LGBMModel, LGBMRegressor, LGBMRanker
from .sklearn import (
_lgbmmodel_doc_fit,
_lgbmmodel_doc_predict,
LGBMClassifier,
LGBMModel,
LGBMRegressor,
LGBMRanker
)
_DaskCollection = Union[dask_Array, dask_DataFrame, dask_Series]
_DaskMatrixLike = Union[dask_Array, dask_DataFrame]
......@@ -216,17 +223,17 @@ def _train(
----------
client : dask.distributed.Client
Dask client.
data : dask Array or dask DataFrame of shape = [n_samples, n_features]
data : Dask Array or Dask DataFrame of shape = [n_samples, n_features]
Input feature matrix.
label : dask Array, dask DataFrame or dask Series of shape = [n_samples]
label : Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]
The target values (class labels in classification, real numbers in regression).
params : dict
Parameters passed to constructor of the local underlying model.
model_factory : lightgbm.LGBMClassifier, lightgbm.LGBMRegressor, or lightgbm.LGBMRanker class
Class of the local underlying model.
sample_weight : dask Array, dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)
sample_weight : Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)
Weights of training data.
group : dask Array, dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)
group : Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)
Group/query data.
Only used in the learning-to-rank task.
sum(group) = n_samples.
......@@ -396,7 +403,7 @@ def _predict(
----------
model : lightgbm.LGBMClassifier, lightgbm.LGBMRegressor, or lightgbm.LGBMRanker class
Fitted underlying model.
data : dask Array or dask DataFrame of shape = [n_samples, n_features]
data : Dask Array or Dask DataFrame of shape = [n_samples, n_features]
Input feature matrix.
raw_score : bool, optional (default=False)
Whether to predict raw scores.
......@@ -413,11 +420,11 @@ def _predict(
Returns
-------
predicted_result : dask Array of shape = [n_samples] or shape = [n_samples, n_classes]
predicted_result : Dask Array of shape = [n_samples] or shape = [n_samples, n_classes]
The predicted values.
X_leaves : dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]
X_leaves : Dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]
If ``pred_leaf=True``, the predicted leaf of every tree for each sample.
X_SHAP_values : dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes]
X_SHAP_values : Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes]
If ``pred_contrib=True``, the feature contributions for each sample.
"""
if not all((DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED)):
......@@ -448,7 +455,7 @@ def _predict(
**kwargs
)
else:
raise TypeError('Data must be either dask Array or dask DataFrame. Got %s.' % str(type(data)))
raise TypeError('Data must be either Dask Array or Dask DataFrame. Got %s.' % str(type(data)))
class _DaskLGBMModel:
......@@ -578,13 +585,17 @@ class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel):
_base_doc = LGBMClassifier.__init__.__doc__
_before_kwargs, _kwargs, _after_kwargs = _base_doc.partition('**kwargs')
__init__.__doc__ = (
_base_doc = (
_before_kwargs
+ 'client : dask.distributed.Client or None, optional (default=None)\n'
+ ' ' * 12 + 'Dask client. If ``None``, ``distributed.default_client()`` will be used at runtime. The Dask client used by this class will not be saved if the model object is pickled.\n'
+ ' ' * 8 + _kwargs + _after_kwargs
)
# the note on custom objective functions in LGBMModel.__init__ is not
# currently relevant for the Dask estimators
__init__.__doc__ = _base_doc[:_base_doc.find('Note\n')]
def __getstate__(self) -> Dict[Any, Any]:
return self._lgb_getstate()
......@@ -604,7 +615,23 @@ class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel):
**kwargs
)
fit.__doc__ = LGBMClassifier.fit.__doc__
_base_doc = _lgbmmodel_doc_fit.format(
X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
y_shape="Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]",
sample_weight_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)",
group_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)"
)
# DaskLGBMClassifier does not support init_score, evaluation data, or early stopping
_base_doc = (_base_doc[:_base_doc.find('init_score :')]
+ _base_doc[_base_doc.find('verbose :'):])
# DaskLGBMClassifier support for callbacks and init_model is not tested
fit.__doc__ = (
_base_doc[:_base_doc.find('callbacks :')]
+ '**kwargs\n'
+ ' ' * 12 + 'Other parameters passed through to ``LGBMClassifier.fit()``\n'
)
def predict(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array:
"""Docstring is inherited from the lightgbm.LGBMClassifier.predict."""
......@@ -615,7 +642,14 @@ class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel):
**kwargs
)
predict.__doc__ = LGBMClassifier.predict.__doc__
predict.__doc__ = _lgbmmodel_doc_predict.format(
description="Return the predicted value for each sample.",
X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
output_name="predicted_result",
predicted_result_shape="Dask Array of shape = [n_samples] or shape = [n_samples, n_classes]",
X_leaves_shape="Dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]",
X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes]"
)
def predict_proba(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array:
"""Docstring is inherited from the lightgbm.LGBMClassifier.predict_proba."""
......@@ -626,7 +660,14 @@ class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel):
**kwargs
)
predict_proba.__doc__ = LGBMClassifier.predict_proba.__doc__
predict_proba.__doc__ = _lgbmmodel_doc_predict.format(
description="Return the predicted probability for each class for each sample.",
X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
output_name="predicted_probability",
predicted_result_shape="Dask Array of shape = [n_samples, n_classes]",
X_leaves_shape="Dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]",
X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes]"
)
def to_local(self) -> LGBMClassifier:
"""Create regular version of lightgbm.LGBMClassifier from the distributed version.
......@@ -695,13 +736,17 @@ class DaskLGBMRegressor(LGBMRegressor, _DaskLGBMModel):
_base_doc = LGBMRegressor.__init__.__doc__
_before_kwargs, _kwargs, _after_kwargs = _base_doc.partition('**kwargs')
__init__.__doc__ = (
_base_doc = (
_before_kwargs
+ 'client : dask.distributed.Client or None, optional (default=None)\n'
+ ' ' * 12 + 'Dask client. If ``None``, ``distributed.default_client()`` will be used at runtime. The Dask client used by this class will not be saved if the model object is pickled.\n'
+ ' ' * 8 + _kwargs + _after_kwargs
)
# the note on custom objective functions in LGBMModel.__init__ is not
# currently relevant for the Dask estimators
__init__.__doc__ = _base_doc[:_base_doc.find('Note\n')]
def __getstate__(self) -> Dict[Any, Any]:
return self._lgb_getstate()
......@@ -721,7 +766,23 @@ class DaskLGBMRegressor(LGBMRegressor, _DaskLGBMModel):
**kwargs
)
fit.__doc__ = LGBMRegressor.fit.__doc__
_base_doc = _lgbmmodel_doc_fit.format(
X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
y_shape="Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]",
sample_weight_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)",
group_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)"
)
# DaskLGBMRegressor does not support init_score, evaluation data, or early stopping
_base_doc = (_base_doc[:_base_doc.find('init_score :')]
+ _base_doc[_base_doc.find('verbose :'):])
# DaskLGBMRegressor support for callbacks and init_model is not tested
fit.__doc__ = (
_base_doc[:_base_doc.find('callbacks :')]
+ '**kwargs\n'
+ ' ' * 12 + 'Other parameters passed through to ``LGBMRegressor.fit()``\n'
)
def predict(self, X: _DaskMatrixLike, **kwargs) -> dask_Array:
"""Docstring is inherited from the lightgbm.LGBMRegressor.predict."""
......@@ -731,7 +792,14 @@ class DaskLGBMRegressor(LGBMRegressor, _DaskLGBMModel):
**kwargs
)
predict.__doc__ = LGBMRegressor.predict.__doc__
predict.__doc__ = _lgbmmodel_doc_predict.format(
description="Return the predicted value for each sample.",
X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
output_name="predicted_result",
predicted_result_shape="Dask Array of shape = [n_samples]",
X_leaves_shape="Dask Array of shape = [n_samples, n_trees]",
X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1]"
)
def to_local(self) -> LGBMRegressor:
"""Create regular version of lightgbm.LGBMRegressor from the distributed version.
......@@ -800,13 +868,17 @@ class DaskLGBMRanker(LGBMRanker, _DaskLGBMModel):
_base_doc = LGBMRanker.__init__.__doc__
_before_kwargs, _kwargs, _after_kwargs = _base_doc.partition('**kwargs')
__init__.__doc__ = (
_base_doc = (
_before_kwargs
+ 'client : dask.distributed.Client or None, optional (default=None)\n'
+ ' ' * 12 + 'Dask client. If ``None``, ``distributed.default_client()`` will be used at runtime. The Dask client used by this class will not be saved if the model object is pickled.\n'
+ ' ' * 8 + _kwargs + _after_kwargs
)
# the note on custom objective functions in LGBMModel.__init__ is not
# currently relevant for the Dask estimators
__init__.__doc__ = _base_doc[:_base_doc.find('Note\n')]
def __getstate__(self) -> Dict[Any, Any]:
return self._lgb_getstate()
......@@ -832,13 +904,39 @@ class DaskLGBMRanker(LGBMRanker, _DaskLGBMModel):
**kwargs
)
fit.__doc__ = LGBMRanker.fit.__doc__
_base_doc = _lgbmmodel_doc_fit.format(
X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
y_shape="Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]",
sample_weight_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)",
group_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)"
)
# DaskLGBMRanker does not support init_score, evaluation data, or early stopping
_base_doc = (_base_doc[:_base_doc.find('init_score :')]
+ _base_doc[_base_doc.find('init_score :'):])
_base_doc = (_base_doc[:_base_doc.find('eval_set :')]
+ _base_doc[_base_doc.find('verbose :'):])
# DaskLGBMRanker support for callbacks and init_model is not tested
fit.__doc__ = (
_base_doc[:_base_doc.find('callbacks :')]
+ '**kwargs\n'
+ ' ' * 12 + 'Other parameters passed through to ``LGBMRanker.fit()``\n'
)
def predict(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array:
"""Docstring is inherited from the lightgbm.LGBMRanker.predict."""
return _predict(self.to_local(), X, **kwargs)
predict.__doc__ = LGBMRanker.predict.__doc__
predict.__doc__ = _lgbmmodel_doc_predict.format(
description="Return the predicted value for each sample.",
X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
output_name="predicted_result",
predicted_result_shape="Dask Array of shape = [n_samples]",
X_leaves_shape="Dask Array of shape = [n_samples, n_trees]",
X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1]"
)
def to_local(self) -> LGBMRanker:
"""Create regular version of lightgbm.LGBMRanker from the distributed version.
......
This diff is collapsed.
......@@ -575,7 +575,7 @@ def test_ranker(output, client, listen_port, group):
group=group,
)
# rebalance small dask.array dataset for better performance.
# rebalance small dask.Array dataset for better performance.
if output == 'array':
dX = dX.persist()
dy = dy.persist()
......@@ -584,7 +584,7 @@ def test_ranker(output, client, listen_port, group):
_ = wait([dX, dy, dw, dg])
client.rebalance()
# use many trees + leaves to overfit, help ensure that dask data-parallel strategy matches that of
# use many trees + leaves to overfit, help ensure that Dask data-parallel strategy matches that of
# serial learner. See https://github.com/microsoft/LightGBM/issues/3292#issuecomment-671288210.
params = {
"random_state": 42,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.