Unverified commit 06ed4337 authored by James Lamb, committed by GitHub
Browse files

[dask] [docs] Fix inaccuracies in API docs for Dask module (fixes #3871) (#3930)



* got fit() working

* add predict()

* predict_proba()

* remove custom objective docs

* Apply suggestions from code review
Co-authored-by: Nikita Titov <nekit94-08@mail.ru>

* fix capitalization

* Update tests/python_package_test/test_dask.py

Co-authored-by: Nikita Titov <nekit94-08@mail.ru>
parent 846b512d
# coding: utf-8
"""Distributed training with LightGBM and Dask.distributed.
"""Distributed training with LightGBM and dask.distributed.
This module enables you to perform distributed training with LightGBM on
Dask.Array and Dask.DataFrame collections.
dask.Array and dask.DataFrame collections.
It is based on dask-lightgbm, which was based on dask-xgboost.
"""
......@@ -19,7 +19,14 @@ from .basic import _choose_param_value, _ConfigAliases, _LIB, _log_warning, _saf
from .compat import (PANDAS_INSTALLED, pd_DataFrame, pd_Series, concat,
SKLEARN_INSTALLED, LGBMNotFittedError,
DASK_INSTALLED, dask_DataFrame, dask_Array, dask_Series, delayed, Client, default_client, get_worker, wait)
from .sklearn import LGBMClassifier, LGBMModel, LGBMRegressor, LGBMRanker
from .sklearn import (
_lgbmmodel_doc_fit,
_lgbmmodel_doc_predict,
LGBMClassifier,
LGBMModel,
LGBMRegressor,
LGBMRanker
)
_DaskCollection = Union[dask_Array, dask_DataFrame, dask_Series]
_DaskMatrixLike = Union[dask_Array, dask_DataFrame]
......@@ -216,17 +223,17 @@ def _train(
----------
client : dask.distributed.Client
Dask client.
data : dask Array or dask DataFrame of shape = [n_samples, n_features]
data : Dask Array or Dask DataFrame of shape = [n_samples, n_features]
Input feature matrix.
label : dask Array, dask DataFrame or dask Series of shape = [n_samples]
label : Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]
The target values (class labels in classification, real numbers in regression).
params : dict
Parameters passed to constructor of the local underlying model.
model_factory : lightgbm.LGBMClassifier, lightgbm.LGBMRegressor, or lightgbm.LGBMRanker class
Class of the local underlying model.
sample_weight : dask Array, dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)
sample_weight : Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)
Weights of training data.
group : dask Array, dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)
group : Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)
Group/query data.
Only used in the learning-to-rank task.
sum(group) = n_samples.
......@@ -396,7 +403,7 @@ def _predict(
----------
model : lightgbm.LGBMClassifier, lightgbm.LGBMRegressor, or lightgbm.LGBMRanker class
Fitted underlying model.
data : dask Array or dask DataFrame of shape = [n_samples, n_features]
data : Dask Array or Dask DataFrame of shape = [n_samples, n_features]
Input feature matrix.
raw_score : bool, optional (default=False)
Whether to predict raw scores.
......@@ -413,11 +420,11 @@ def _predict(
Returns
-------
predicted_result : dask Array of shape = [n_samples] or shape = [n_samples, n_classes]
predicted_result : Dask Array of shape = [n_samples] or shape = [n_samples, n_classes]
The predicted values.
X_leaves : dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]
X_leaves : Dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]
If ``pred_leaf=True``, the predicted leaf of every tree for each sample.
X_SHAP_values : dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes]
X_SHAP_values : Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes]
If ``pred_contrib=True``, the feature contributions for each sample.
"""
if not all((DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED)):
......@@ -448,7 +455,7 @@ def _predict(
**kwargs
)
else:
raise TypeError('Data must be either dask Array or dask DataFrame. Got %s.' % str(type(data)))
raise TypeError('Data must be either Dask Array or Dask DataFrame. Got %s.' % str(type(data)))
class _DaskLGBMModel:
......@@ -578,13 +585,17 @@ class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel):
_base_doc = LGBMClassifier.__init__.__doc__
_before_kwargs, _kwargs, _after_kwargs = _base_doc.partition('**kwargs')
__init__.__doc__ = (
_base_doc = (
_before_kwargs
+ 'client : dask.distributed.Client or None, optional (default=None)\n'
+ ' ' * 12 + 'Dask client. If ``None``, ``distributed.default_client()`` will be used at runtime. The Dask client used by this class will not be saved if the model object is pickled.\n'
+ ' ' * 8 + _kwargs + _after_kwargs
)
# the note on custom objective functions in LGBMModel.__init__ is not
# currently relevant for the Dask estimators
__init__.__doc__ = _base_doc[:_base_doc.find('Note\n')]
def __getstate__(self) -> Dict[Any, Any]:
return self._lgb_getstate()
......@@ -604,7 +615,23 @@ class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel):
**kwargs
)
fit.__doc__ = LGBMClassifier.fit.__doc__
_base_doc = _lgbmmodel_doc_fit.format(
X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
y_shape="Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]",
sample_weight_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)",
group_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)"
)
# DaskLGBMClassifier does not support init_score, evaluation data, or early stopping
_base_doc = (_base_doc[:_base_doc.find('init_score :')]
+ _base_doc[_base_doc.find('verbose :'):])
# DaskLGBMClassifier support for callbacks and init_model is not tested
fit.__doc__ = (
_base_doc[:_base_doc.find('callbacks :')]
+ '**kwargs\n'
+ ' ' * 12 + 'Other parameters passed through to ``LGBMClassifier.fit()``\n'
)
def predict(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array:
"""Docstring is inherited from the lightgbm.LGBMClassifier.predict."""
......@@ -615,7 +642,14 @@ class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel):
**kwargs
)
predict.__doc__ = LGBMClassifier.predict.__doc__
predict.__doc__ = _lgbmmodel_doc_predict.format(
description="Return the predicted value for each sample.",
X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
output_name="predicted_result",
predicted_result_shape="Dask Array of shape = [n_samples] or shape = [n_samples, n_classes]",
X_leaves_shape="Dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]",
X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes]"
)
def predict_proba(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array:
"""Docstring is inherited from the lightgbm.LGBMClassifier.predict_proba."""
......@@ -626,7 +660,14 @@ class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel):
**kwargs
)
predict_proba.__doc__ = LGBMClassifier.predict_proba.__doc__
predict_proba.__doc__ = _lgbmmodel_doc_predict.format(
description="Return the predicted probability for each class for each sample.",
X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
output_name="predicted_probability",
predicted_result_shape="Dask Array of shape = [n_samples, n_classes]",
X_leaves_shape="Dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]",
X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes]"
)
def to_local(self) -> LGBMClassifier:
"""Create regular version of lightgbm.LGBMClassifier from the distributed version.
......@@ -695,13 +736,17 @@ class DaskLGBMRegressor(LGBMRegressor, _DaskLGBMModel):
_base_doc = LGBMRegressor.__init__.__doc__
_before_kwargs, _kwargs, _after_kwargs = _base_doc.partition('**kwargs')
__init__.__doc__ = (
_base_doc = (
_before_kwargs
+ 'client : dask.distributed.Client or None, optional (default=None)\n'
+ ' ' * 12 + 'Dask client. If ``None``, ``distributed.default_client()`` will be used at runtime. The Dask client used by this class will not be saved if the model object is pickled.\n'
+ ' ' * 8 + _kwargs + _after_kwargs
)
# the note on custom objective functions in LGBMModel.__init__ is not
# currently relevant for the Dask estimators
__init__.__doc__ = _base_doc[:_base_doc.find('Note\n')]
def __getstate__(self) -> Dict[Any, Any]:
return self._lgb_getstate()
......@@ -721,7 +766,23 @@ class DaskLGBMRegressor(LGBMRegressor, _DaskLGBMModel):
**kwargs
)
fit.__doc__ = LGBMRegressor.fit.__doc__
_base_doc = _lgbmmodel_doc_fit.format(
X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
y_shape="Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]",
sample_weight_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)",
group_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)"
)
# DaskLGBMRegressor does not support init_score, evaluation data, or early stopping
_base_doc = (_base_doc[:_base_doc.find('init_score :')]
+ _base_doc[_base_doc.find('verbose :'):])
# DaskLGBMRegressor support for callbacks and init_model is not tested
fit.__doc__ = (
_base_doc[:_base_doc.find('callbacks :')]
+ '**kwargs\n'
+ ' ' * 12 + 'Other parameters passed through to ``LGBMRegressor.fit()``\n'
)
def predict(self, X: _DaskMatrixLike, **kwargs) -> dask_Array:
"""Docstring is inherited from the lightgbm.LGBMRegressor.predict."""
......@@ -731,7 +792,14 @@ class DaskLGBMRegressor(LGBMRegressor, _DaskLGBMModel):
**kwargs
)
predict.__doc__ = LGBMRegressor.predict.__doc__
predict.__doc__ = _lgbmmodel_doc_predict.format(
description="Return the predicted value for each sample.",
X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
output_name="predicted_result",
predicted_result_shape="Dask Array of shape = [n_samples]",
X_leaves_shape="Dask Array of shape = [n_samples, n_trees]",
X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1]"
)
def to_local(self) -> LGBMRegressor:
"""Create regular version of lightgbm.LGBMRegressor from the distributed version.
......@@ -800,13 +868,17 @@ class DaskLGBMRanker(LGBMRanker, _DaskLGBMModel):
_base_doc = LGBMRanker.__init__.__doc__
_before_kwargs, _kwargs, _after_kwargs = _base_doc.partition('**kwargs')
__init__.__doc__ = (
_base_doc = (
_before_kwargs
+ 'client : dask.distributed.Client or None, optional (default=None)\n'
+ ' ' * 12 + 'Dask client. If ``None``, ``distributed.default_client()`` will be used at runtime. The Dask client used by this class will not be saved if the model object is pickled.\n'
+ ' ' * 8 + _kwargs + _after_kwargs
)
# the note on custom objective functions in LGBMModel.__init__ is not
# currently relevant for the Dask estimators
__init__.__doc__ = _base_doc[:_base_doc.find('Note\n')]
def __getstate__(self) -> Dict[Any, Any]:
return self._lgb_getstate()
......@@ -832,13 +904,39 @@ class DaskLGBMRanker(LGBMRanker, _DaskLGBMModel):
**kwargs
)
fit.__doc__ = LGBMRanker.fit.__doc__
_base_doc = _lgbmmodel_doc_fit.format(
X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
y_shape="Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]",
sample_weight_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)",
group_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)"
)
# DaskLGBMRanker does not support init_score, evaluation data, or early stopping
_base_doc = (_base_doc[:_base_doc.find('init_score :')]
+ _base_doc[_base_doc.find('init_score :'):])
_base_doc = (_base_doc[:_base_doc.find('eval_set :')]
+ _base_doc[_base_doc.find('verbose :'):])
# DaskLGBMRanker support for callbacks and init_model is not tested
fit.__doc__ = (
_base_doc[:_base_doc.find('callbacks :')]
+ '**kwargs\n'
+ ' ' * 12 + 'Other parameters passed through to ``LGBMRanker.fit()``\n'
)
def predict(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array:
"""Docstring is inherited from the lightgbm.LGBMRanker.predict."""
return _predict(self.to_local(), X, **kwargs)
predict.__doc__ = LGBMRanker.predict.__doc__
predict.__doc__ = _lgbmmodel_doc_predict.format(
description="Return the predicted value for each sample.",
X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
output_name="predicted_result",
predicted_result_shape="Dask Array of shape = [n_samples]",
X_leaves_shape="Dask Array of shape = [n_samples, n_trees]",
X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1]"
)
def to_local(self) -> LGBMRanker:
"""Create regular version of lightgbm.LGBMRanker from the distributed version.
......
This diff is collapsed.
......@@ -575,7 +575,7 @@ def test_ranker(output, client, listen_port, group):
group=group,
)
# rebalance small dask.array dataset for better performance.
# rebalance small dask.Array dataset for better performance.
if output == 'array':
dX = dX.persist()
dy = dy.persist()
......@@ -584,7 +584,7 @@ def test_ranker(output, client, listen_port, group):
_ = wait([dX, dy, dw, dg])
client.rebalance()
# use many trees + leaves to overfit, help ensure that dask data-parallel strategy matches that of
# use many trees + leaves to overfit, help ensure that Dask data-parallel strategy matches that of
# serial learner. See https://github.com/microsoft/LightGBM/issues/3292#issuecomment-671288210.
params = {
"random_state": 42,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.