Unverified commit 06ed4337, authored by James Lamb and committed by GitHub
Browse files

[dask] [docs] Fix inaccuracies in API docs for Dask module (fixes #3871) (#3930)



* got fit() working

* add predict()

* predict_proba()

* remove custom objective docs

* Apply suggestions from code review
Co-authored-by: Nikita Titov <nekit94-08@mail.ru>

* fix capitalization

* Update tests/python_package_test/test_dask.py
Co-authored-by: Nikita Titov <nekit94-08@mail.ru>
Co-authored-by: Nikita Titov <nekit94-08@mail.ru>
parent 846b512d
# coding: utf-8 # coding: utf-8
"""Distributed training with LightGBM and Dask.distributed. """Distributed training with LightGBM and dask.distributed.
This module enables you to perform distributed training with LightGBM on This module enables you to perform distributed training with LightGBM on
Dask.Array and Dask.DataFrame collections. dask.Array and dask.DataFrame collections.
It is based on dask-lightgbm, which was based on dask-xgboost. It is based on dask-lightgbm, which was based on dask-xgboost.
""" """
...@@ -19,7 +19,14 @@ from .basic import _choose_param_value, _ConfigAliases, _LIB, _log_warning, _saf ...@@ -19,7 +19,14 @@ from .basic import _choose_param_value, _ConfigAliases, _LIB, _log_warning, _saf
from .compat import (PANDAS_INSTALLED, pd_DataFrame, pd_Series, concat, from .compat import (PANDAS_INSTALLED, pd_DataFrame, pd_Series, concat,
SKLEARN_INSTALLED, LGBMNotFittedError, SKLEARN_INSTALLED, LGBMNotFittedError,
DASK_INSTALLED, dask_DataFrame, dask_Array, dask_Series, delayed, Client, default_client, get_worker, wait) DASK_INSTALLED, dask_DataFrame, dask_Array, dask_Series, delayed, Client, default_client, get_worker, wait)
from .sklearn import LGBMClassifier, LGBMModel, LGBMRegressor, LGBMRanker from .sklearn import (
_lgbmmodel_doc_fit,
_lgbmmodel_doc_predict,
LGBMClassifier,
LGBMModel,
LGBMRegressor,
LGBMRanker
)
_DaskCollection = Union[dask_Array, dask_DataFrame, dask_Series] _DaskCollection = Union[dask_Array, dask_DataFrame, dask_Series]
_DaskMatrixLike = Union[dask_Array, dask_DataFrame] _DaskMatrixLike = Union[dask_Array, dask_DataFrame]
...@@ -216,17 +223,17 @@ def _train( ...@@ -216,17 +223,17 @@ def _train(
---------- ----------
client : dask.distributed.Client client : dask.distributed.Client
Dask client. Dask client.
data : dask Array or dask DataFrame of shape = [n_samples, n_features] data : Dask Array or Dask DataFrame of shape = [n_samples, n_features]
Input feature matrix. Input feature matrix.
label : dask Array, dask DataFrame or dask Series of shape = [n_samples] label : Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]
The target values (class labels in classification, real numbers in regression). The target values (class labels in classification, real numbers in regression).
params : dict params : dict
Parameters passed to constructor of the local underlying model. Parameters passed to constructor of the local underlying model.
model_factory : lightgbm.LGBMClassifier, lightgbm.LGBMRegressor, or lightgbm.LGBMRanker class model_factory : lightgbm.LGBMClassifier, lightgbm.LGBMRegressor, or lightgbm.LGBMRanker class
Class of the local underlying model. Class of the local underlying model.
sample_weight : dask Array, dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None) sample_weight : Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)
Weights of training data. Weights of training data.
group : dask Array, dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None) group : Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)
Group/query data. Group/query data.
Only used in the learning-to-rank task. Only used in the learning-to-rank task.
sum(group) = n_samples. sum(group) = n_samples.
...@@ -396,7 +403,7 @@ def _predict( ...@@ -396,7 +403,7 @@ def _predict(
---------- ----------
model : lightgbm.LGBMClassifier, lightgbm.LGBMRegressor, or lightgbm.LGBMRanker class model : lightgbm.LGBMClassifier, lightgbm.LGBMRegressor, or lightgbm.LGBMRanker class
Fitted underlying model. Fitted underlying model.
data : dask Array or dask DataFrame of shape = [n_samples, n_features] data : Dask Array or Dask DataFrame of shape = [n_samples, n_features]
Input feature matrix. Input feature matrix.
raw_score : bool, optional (default=False) raw_score : bool, optional (default=False)
Whether to predict raw scores. Whether to predict raw scores.
...@@ -413,11 +420,11 @@ def _predict( ...@@ -413,11 +420,11 @@ def _predict(
Returns Returns
------- -------
predicted_result : dask Array of shape = [n_samples] or shape = [n_samples, n_classes] predicted_result : Dask Array of shape = [n_samples] or shape = [n_samples, n_classes]
The predicted values. The predicted values.
X_leaves : dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes] X_leaves : Dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]
If ``pred_leaf=True``, the predicted leaf of every tree for each sample. If ``pred_leaf=True``, the predicted leaf of every tree for each sample.
X_SHAP_values : dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] X_SHAP_values : Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes]
If ``pred_contrib=True``, the feature contributions for each sample. If ``pred_contrib=True``, the feature contributions for each sample.
""" """
if not all((DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED)): if not all((DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED)):
...@@ -448,7 +455,7 @@ def _predict( ...@@ -448,7 +455,7 @@ def _predict(
**kwargs **kwargs
) )
else: else:
raise TypeError('Data must be either dask Array or dask DataFrame. Got %s.' % str(type(data))) raise TypeError('Data must be either Dask Array or Dask DataFrame. Got %s.' % str(type(data)))
class _DaskLGBMModel: class _DaskLGBMModel:
...@@ -578,13 +585,17 @@ class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel): ...@@ -578,13 +585,17 @@ class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel):
_base_doc = LGBMClassifier.__init__.__doc__ _base_doc = LGBMClassifier.__init__.__doc__
_before_kwargs, _kwargs, _after_kwargs = _base_doc.partition('**kwargs') _before_kwargs, _kwargs, _after_kwargs = _base_doc.partition('**kwargs')
__init__.__doc__ = ( _base_doc = (
_before_kwargs _before_kwargs
+ 'client : dask.distributed.Client or None, optional (default=None)\n' + 'client : dask.distributed.Client or None, optional (default=None)\n'
+ ' ' * 12 + 'Dask client. If ``None``, ``distributed.default_client()`` will be used at runtime. The Dask client used by this class will not be saved if the model object is pickled.\n' + ' ' * 12 + 'Dask client. If ``None``, ``distributed.default_client()`` will be used at runtime. The Dask client used by this class will not be saved if the model object is pickled.\n'
+ ' ' * 8 + _kwargs + _after_kwargs + ' ' * 8 + _kwargs + _after_kwargs
) )
# the note on custom objective functions in LGBMModel.__init__ is not
# currently relevant for the Dask estimators
__init__.__doc__ = _base_doc[:_base_doc.find('Note\n')]
def __getstate__(self) -> Dict[Any, Any]: def __getstate__(self) -> Dict[Any, Any]:
return self._lgb_getstate() return self._lgb_getstate()
...@@ -604,7 +615,23 @@ class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel): ...@@ -604,7 +615,23 @@ class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel):
**kwargs **kwargs
) )
fit.__doc__ = LGBMClassifier.fit.__doc__ _base_doc = _lgbmmodel_doc_fit.format(
X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
y_shape="Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]",
sample_weight_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)",
group_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)"
)
# DaskLGBMClassifier does not support init_score, evaluation data, or early stopping
_base_doc = (_base_doc[:_base_doc.find('init_score :')]
+ _base_doc[_base_doc.find('verbose :'):])
# DaskLGBMClassifier support for callbacks and init_model is not tested
fit.__doc__ = (
_base_doc[:_base_doc.find('callbacks :')]
+ '**kwargs\n'
+ ' ' * 12 + 'Other parameters passed through to ``LGBMClassifier.fit()``\n'
)
def predict(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array: def predict(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array:
"""Docstring is inherited from the lightgbm.LGBMClassifier.predict.""" """Docstring is inherited from the lightgbm.LGBMClassifier.predict."""
...@@ -615,7 +642,14 @@ class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel): ...@@ -615,7 +642,14 @@ class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel):
**kwargs **kwargs
) )
predict.__doc__ = LGBMClassifier.predict.__doc__ predict.__doc__ = _lgbmmodel_doc_predict.format(
description="Return the predicted value for each sample.",
X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
output_name="predicted_result",
predicted_result_shape="Dask Array of shape = [n_samples] or shape = [n_samples, n_classes]",
X_leaves_shape="Dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]",
X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes]"
)
def predict_proba(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array: def predict_proba(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array:
"""Docstring is inherited from the lightgbm.LGBMClassifier.predict_proba.""" """Docstring is inherited from the lightgbm.LGBMClassifier.predict_proba."""
...@@ -626,7 +660,14 @@ class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel): ...@@ -626,7 +660,14 @@ class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel):
**kwargs **kwargs
) )
predict_proba.__doc__ = LGBMClassifier.predict_proba.__doc__ predict_proba.__doc__ = _lgbmmodel_doc_predict.format(
description="Return the predicted probability for each class for each sample.",
X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
output_name="predicted_probability",
predicted_result_shape="Dask Array of shape = [n_samples, n_classes]",
X_leaves_shape="Dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]",
X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes]"
)
def to_local(self) -> LGBMClassifier: def to_local(self) -> LGBMClassifier:
"""Create regular version of lightgbm.LGBMClassifier from the distributed version. """Create regular version of lightgbm.LGBMClassifier from the distributed version.
...@@ -695,13 +736,17 @@ class DaskLGBMRegressor(LGBMRegressor, _DaskLGBMModel): ...@@ -695,13 +736,17 @@ class DaskLGBMRegressor(LGBMRegressor, _DaskLGBMModel):
_base_doc = LGBMRegressor.__init__.__doc__ _base_doc = LGBMRegressor.__init__.__doc__
_before_kwargs, _kwargs, _after_kwargs = _base_doc.partition('**kwargs') _before_kwargs, _kwargs, _after_kwargs = _base_doc.partition('**kwargs')
__init__.__doc__ = ( _base_doc = (
_before_kwargs _before_kwargs
+ 'client : dask.distributed.Client or None, optional (default=None)\n' + 'client : dask.distributed.Client or None, optional (default=None)\n'
+ ' ' * 12 + 'Dask client. If ``None``, ``distributed.default_client()`` will be used at runtime. The Dask client used by this class will not be saved if the model object is pickled.\n' + ' ' * 12 + 'Dask client. If ``None``, ``distributed.default_client()`` will be used at runtime. The Dask client used by this class will not be saved if the model object is pickled.\n'
+ ' ' * 8 + _kwargs + _after_kwargs + ' ' * 8 + _kwargs + _after_kwargs
) )
# the note on custom objective functions in LGBMModel.__init__ is not
# currently relevant for the Dask estimators
__init__.__doc__ = _base_doc[:_base_doc.find('Note\n')]
def __getstate__(self) -> Dict[Any, Any]: def __getstate__(self) -> Dict[Any, Any]:
return self._lgb_getstate() return self._lgb_getstate()
...@@ -721,7 +766,23 @@ class DaskLGBMRegressor(LGBMRegressor, _DaskLGBMModel): ...@@ -721,7 +766,23 @@ class DaskLGBMRegressor(LGBMRegressor, _DaskLGBMModel):
**kwargs **kwargs
) )
fit.__doc__ = LGBMRegressor.fit.__doc__ _base_doc = _lgbmmodel_doc_fit.format(
X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
y_shape="Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]",
sample_weight_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)",
group_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)"
)
# DaskLGBMRegressor does not support init_score, evaluation data, or early stopping
_base_doc = (_base_doc[:_base_doc.find('init_score :')]
+ _base_doc[_base_doc.find('verbose :'):])
# DaskLGBMRegressor support for callbacks and init_model is not tested
fit.__doc__ = (
_base_doc[:_base_doc.find('callbacks :')]
+ '**kwargs\n'
+ ' ' * 12 + 'Other parameters passed through to ``LGBMRegressor.fit()``\n'
)
def predict(self, X: _DaskMatrixLike, **kwargs) -> dask_Array: def predict(self, X: _DaskMatrixLike, **kwargs) -> dask_Array:
"""Docstring is inherited from the lightgbm.LGBMRegressor.predict.""" """Docstring is inherited from the lightgbm.LGBMRegressor.predict."""
...@@ -731,7 +792,14 @@ class DaskLGBMRegressor(LGBMRegressor, _DaskLGBMModel): ...@@ -731,7 +792,14 @@ class DaskLGBMRegressor(LGBMRegressor, _DaskLGBMModel):
**kwargs **kwargs
) )
predict.__doc__ = LGBMRegressor.predict.__doc__ predict.__doc__ = _lgbmmodel_doc_predict.format(
description="Return the predicted value for each sample.",
X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
output_name="predicted_result",
predicted_result_shape="Dask Array of shape = [n_samples]",
X_leaves_shape="Dask Array of shape = [n_samples, n_trees]",
X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1]"
)
def to_local(self) -> LGBMRegressor: def to_local(self) -> LGBMRegressor:
"""Create regular version of lightgbm.LGBMRegressor from the distributed version. """Create regular version of lightgbm.LGBMRegressor from the distributed version.
...@@ -800,13 +868,17 @@ class DaskLGBMRanker(LGBMRanker, _DaskLGBMModel): ...@@ -800,13 +868,17 @@ class DaskLGBMRanker(LGBMRanker, _DaskLGBMModel):
_base_doc = LGBMRanker.__init__.__doc__ _base_doc = LGBMRanker.__init__.__doc__
_before_kwargs, _kwargs, _after_kwargs = _base_doc.partition('**kwargs') _before_kwargs, _kwargs, _after_kwargs = _base_doc.partition('**kwargs')
__init__.__doc__ = ( _base_doc = (
_before_kwargs _before_kwargs
+ 'client : dask.distributed.Client or None, optional (default=None)\n' + 'client : dask.distributed.Client or None, optional (default=None)\n'
+ ' ' * 12 + 'Dask client. If ``None``, ``distributed.default_client()`` will be used at runtime. The Dask client used by this class will not be saved if the model object is pickled.\n' + ' ' * 12 + 'Dask client. If ``None``, ``distributed.default_client()`` will be used at runtime. The Dask client used by this class will not be saved if the model object is pickled.\n'
+ ' ' * 8 + _kwargs + _after_kwargs + ' ' * 8 + _kwargs + _after_kwargs
) )
# the note on custom objective functions in LGBMModel.__init__ is not
# currently relevant for the Dask estimators
__init__.__doc__ = _base_doc[:_base_doc.find('Note\n')]
def __getstate__(self) -> Dict[Any, Any]: def __getstate__(self) -> Dict[Any, Any]:
return self._lgb_getstate() return self._lgb_getstate()
...@@ -832,13 +904,39 @@ class DaskLGBMRanker(LGBMRanker, _DaskLGBMModel): ...@@ -832,13 +904,39 @@ class DaskLGBMRanker(LGBMRanker, _DaskLGBMModel):
**kwargs **kwargs
) )
fit.__doc__ = LGBMRanker.fit.__doc__ _base_doc = _lgbmmodel_doc_fit.format(
X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
y_shape="Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]",
sample_weight_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)",
group_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)"
)
# DaskLGBMRanker does not support init_score, evaluation data, or early stopping
_base_doc = (_base_doc[:_base_doc.find('init_score :')]
+ _base_doc[_base_doc.find('init_score :'):])
_base_doc = (_base_doc[:_base_doc.find('eval_set :')]
+ _base_doc[_base_doc.find('verbose :'):])
# DaskLGBMRanker support for callbacks and init_model is not tested
fit.__doc__ = (
_base_doc[:_base_doc.find('callbacks :')]
+ '**kwargs\n'
+ ' ' * 12 + 'Other parameters passed through to ``LGBMRanker.fit()``\n'
)
def predict(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array: def predict(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array:
"""Docstring is inherited from the lightgbm.LGBMRanker.predict.""" """Docstring is inherited from the lightgbm.LGBMRanker.predict."""
return _predict(self.to_local(), X, **kwargs) return _predict(self.to_local(), X, **kwargs)
predict.__doc__ = LGBMRanker.predict.__doc__ predict.__doc__ = _lgbmmodel_doc_predict.format(
description="Return the predicted value for each sample.",
X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
output_name="predicted_result",
predicted_result_shape="Dask Array of shape = [n_samples]",
X_leaves_shape="Dask Array of shape = [n_samples, n_trees]",
X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1]"
)
def to_local(self) -> LGBMRanker: def to_local(self) -> LGBMRanker:
"""Create regular version of lightgbm.LGBMRanker from the distributed version. """Create regular version of lightgbm.LGBMRanker from the distributed version.
......
This diff is collapsed.
...@@ -575,7 +575,7 @@ def test_ranker(output, client, listen_port, group): ...@@ -575,7 +575,7 @@ def test_ranker(output, client, listen_port, group):
group=group, group=group,
) )
# rebalance small dask.array dataset for better performance. # rebalance small dask.Array dataset for better performance.
if output == 'array': if output == 'array':
dX = dX.persist() dX = dX.persist()
dy = dy.persist() dy = dy.persist()
...@@ -584,7 +584,7 @@ def test_ranker(output, client, listen_port, group): ...@@ -584,7 +584,7 @@ def test_ranker(output, client, listen_port, group):
_ = wait([dX, dy, dw, dg]) _ = wait([dX, dy, dw, dg])
client.rebalance() client.rebalance()
# use many trees + leaves to overfit, help ensure that dask data-parallel strategy matches that of # use many trees + leaves to overfit, help ensure that Dask data-parallel strategy matches that of
# serial learner. See https://github.com/microsoft/LightGBM/issues/3292#issuecomment-671288210. # serial learner. See https://github.com/microsoft/LightGBM/issues/3292#issuecomment-671288210.
params = { params = {
"random_state": 42, "random_state": 42,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment