Unverified Commit 37e98782 authored by jmoralez's avatar jmoralez Committed by GitHub
Browse files

[dask] Include support for init_score (#3950)

* include support for init_score

* use dataframe from init_score and test difference with and without init_score in local model

* revert refactoring

* initial docs. test between distributed models with and without init_score

* remove ranker from tests

* test value for root node and change docs

* comma

* re-include parametrize

* fix incorrect merge

* use single init_score and the booster_ attribute

* use np.float64 instead of float
parent 19f35772
...@@ -105,12 +105,17 @@ def _train_part( ...@@ -105,12 +105,17 @@ def _train_part(
else: else:
group = None group = None
if 'init_score' in list_of_parts[0]:
init_score = _concat([x['init_score'] for x in list_of_parts])
else:
init_score = None
try: try:
model = model_factory(**params) model = model_factory(**params)
if is_ranker: if is_ranker:
model.fit(data, label, sample_weight=weight, group=group, **kwargs) model.fit(data, label, sample_weight=weight, init_score=init_score, group=group, **kwargs)
else: else:
model.fit(data, label, sample_weight=weight, **kwargs) model.fit(data, label, sample_weight=weight, init_score=init_score, **kwargs)
finally: finally:
_safe_call(_LIB.LGBM_NetworkFree()) _safe_call(_LIB.LGBM_NetworkFree())
...@@ -168,6 +173,7 @@ def _train( ...@@ -168,6 +173,7 @@ def _train(
params: Dict[str, Any], params: Dict[str, Any],
model_factory: Type[LGBMModel], model_factory: Type[LGBMModel],
sample_weight: Optional[_DaskCollection] = None, sample_weight: Optional[_DaskCollection] = None,
init_score: Optional[_DaskCollection] = None,
group: Optional[_DaskCollection] = None, group: Optional[_DaskCollection] = None,
**kwargs: Any **kwargs: Any
) -> LGBMModel: ) -> LGBMModel:
...@@ -187,6 +193,8 @@ def _train( ...@@ -187,6 +193,8 @@ def _train(
Class of the local underlying model. Class of the local underlying model.
sample_weight : Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None) sample_weight : Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)
Weights of training data. Weights of training data.
init_score : Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)
Init score of training data.
group : Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None) group : Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)
Group/query data. Group/query data.
Only used in the learning-to-rank task. Only used in the learning-to-rank task.
...@@ -289,6 +297,11 @@ def _train( ...@@ -289,6 +297,11 @@ def _train(
for i in range(n_parts): for i in range(n_parts):
parts[i]['group'] = group_parts[i] parts[i]['group'] = group_parts[i]
if init_score is not None:
init_score_parts = _split_to_parts(data=init_score, is_matrix=False)
for i in range(n_parts):
parts[i]['init_score'] = init_score_parts[i]
# Start computation in the background # Start computation in the background
parts = list(map(delayed, parts)) parts = list(map(delayed, parts))
parts = client.compute(parts) parts = client.compute(parts)
...@@ -540,6 +553,7 @@ class _DaskLGBMModel: ...@@ -540,6 +553,7 @@ class _DaskLGBMModel:
X: _DaskMatrixLike, X: _DaskMatrixLike,
y: _DaskCollection, y: _DaskCollection,
sample_weight: Optional[_DaskCollection] = None, sample_weight: Optional[_DaskCollection] = None,
init_score: Optional[_DaskCollection] = None,
group: Optional[_DaskCollection] = None, group: Optional[_DaskCollection] = None,
**kwargs: Any **kwargs: Any
) -> "_DaskLGBMModel": ) -> "_DaskLGBMModel":
...@@ -556,6 +570,7 @@ class _DaskLGBMModel: ...@@ -556,6 +570,7 @@ class _DaskLGBMModel:
params=params, params=params,
model_factory=model_factory, model_factory=model_factory,
sample_weight=sample_weight, sample_weight=sample_weight,
init_score=init_score,
group=group, group=group,
**kwargs **kwargs
) )
...@@ -657,6 +672,7 @@ class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel): ...@@ -657,6 +672,7 @@ class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel):
X: _DaskMatrixLike, X: _DaskMatrixLike,
y: _DaskCollection, y: _DaskCollection,
sample_weight: Optional[_DaskCollection] = None, sample_weight: Optional[_DaskCollection] = None,
init_score: Optional[_DaskCollection] = None,
**kwargs: Any **kwargs: Any
) -> "DaskLGBMClassifier": ) -> "DaskLGBMClassifier":
"""Docstring is inherited from the lightgbm.LGBMClassifier.fit.""" """Docstring is inherited from the lightgbm.LGBMClassifier.fit."""
...@@ -665,6 +681,7 @@ class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel): ...@@ -665,6 +681,7 @@ class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel):
X=X, X=X,
y=y, y=y,
sample_weight=sample_weight, sample_weight=sample_weight,
init_score=init_score,
**kwargs **kwargs
) )
...@@ -672,11 +689,12 @@ class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel): ...@@ -672,11 +689,12 @@ class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel):
X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]", X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
y_shape="Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]", y_shape="Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]",
sample_weight_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)", sample_weight_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)",
init_score_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)",
group_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)" group_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)"
) )
# DaskLGBMClassifier does not support init_score, evaluation data, or early stopping # DaskLGBMClassifier does not support evaluation data, or early stopping
_base_doc = (_base_doc[:_base_doc.find('init_score :')] _base_doc = (_base_doc[:_base_doc.find('group :')]
+ _base_doc[_base_doc.find('verbose :'):]) + _base_doc[_base_doc.find('verbose :'):])
# DaskLGBMClassifier support for callbacks and init_model is not tested # DaskLGBMClassifier support for callbacks and init_model is not tested
...@@ -808,6 +826,7 @@ class DaskLGBMRegressor(LGBMRegressor, _DaskLGBMModel): ...@@ -808,6 +826,7 @@ class DaskLGBMRegressor(LGBMRegressor, _DaskLGBMModel):
X: _DaskMatrixLike, X: _DaskMatrixLike,
y: _DaskCollection, y: _DaskCollection,
sample_weight: Optional[_DaskCollection] = None, sample_weight: Optional[_DaskCollection] = None,
init_score: Optional[_DaskCollection] = None,
**kwargs: Any **kwargs: Any
) -> "DaskLGBMRegressor": ) -> "DaskLGBMRegressor":
"""Docstring is inherited from the lightgbm.LGBMRegressor.fit.""" """Docstring is inherited from the lightgbm.LGBMRegressor.fit."""
...@@ -816,6 +835,7 @@ class DaskLGBMRegressor(LGBMRegressor, _DaskLGBMModel): ...@@ -816,6 +835,7 @@ class DaskLGBMRegressor(LGBMRegressor, _DaskLGBMModel):
X=X, X=X,
y=y, y=y,
sample_weight=sample_weight, sample_weight=sample_weight,
init_score=init_score,
**kwargs **kwargs
) )
...@@ -823,11 +843,12 @@ class DaskLGBMRegressor(LGBMRegressor, _DaskLGBMModel): ...@@ -823,11 +843,12 @@ class DaskLGBMRegressor(LGBMRegressor, _DaskLGBMModel):
X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]", X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
y_shape="Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]", y_shape="Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]",
sample_weight_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)", sample_weight_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)",
init_score_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)",
group_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)" group_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)"
) )
# DaskLGBMRegressor does not support init_score, evaluation data, or early stopping # DaskLGBMRegressor does not support evaluation data, or early stopping
_base_doc = (_base_doc[:_base_doc.find('init_score :')] _base_doc = (_base_doc[:_base_doc.find('group :')]
+ _base_doc[_base_doc.find('verbose :'):]) + _base_doc[_base_doc.find('verbose :'):])
# DaskLGBMRegressor support for callbacks and init_model is not tested # DaskLGBMRegressor support for callbacks and init_model is not tested
...@@ -945,14 +966,12 @@ class DaskLGBMRanker(LGBMRanker, _DaskLGBMModel): ...@@ -945,14 +966,12 @@ class DaskLGBMRanker(LGBMRanker, _DaskLGBMModel):
**kwargs: Any **kwargs: Any
) -> "DaskLGBMRanker": ) -> "DaskLGBMRanker":
"""Docstring is inherited from the lightgbm.LGBMRanker.fit.""" """Docstring is inherited from the lightgbm.LGBMRanker.fit."""
if init_score is not None:
raise RuntimeError('init_score is not currently supported in lightgbm.dask')
return self._lgb_dask_fit( return self._lgb_dask_fit(
model_factory=LGBMRanker, model_factory=LGBMRanker,
X=X, X=X,
y=y, y=y,
sample_weight=sample_weight, sample_weight=sample_weight,
init_score=init_score,
group=group, group=group,
**kwargs **kwargs
) )
...@@ -961,13 +980,11 @@ class DaskLGBMRanker(LGBMRanker, _DaskLGBMModel): ...@@ -961,13 +980,11 @@ class DaskLGBMRanker(LGBMRanker, _DaskLGBMModel):
X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]", X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
y_shape="Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]", y_shape="Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]",
sample_weight_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)", sample_weight_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)",
init_score_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)",
group_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)" group_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)"
) )
# DaskLGBMRanker does not support init_score, evaluation data, or early stopping # DaskLGBMRanker does not support evaluation data, or early stopping
_base_doc = (_base_doc[:_base_doc.find('init_score :')]
+ _base_doc[_base_doc.find('init_score :'):])
_base_doc = (_base_doc[:_base_doc.find('eval_set :')] _base_doc = (_base_doc[:_base_doc.find('eval_set :')]
+ _base_doc[_base_doc.find('verbose :'):]) + _base_doc[_base_doc.find('verbose :'):])
......
...@@ -189,7 +189,7 @@ _lgbmmodel_doc_fit = ( ...@@ -189,7 +189,7 @@ _lgbmmodel_doc_fit = (
The target values (class labels in classification, real numbers in regression). The target values (class labels in classification, real numbers in regression).
sample_weight : {sample_weight_shape} sample_weight : {sample_weight_shape}
Weights of training data. Weights of training data.
init_score : array-like of shape = [n_samples] or None, optional (default=None) init_score : {init_score_shape}
Init score of training data. Init score of training data.
group : {group_shape} group : {group_shape}
Group/query data. Group/query data.
...@@ -706,6 +706,7 @@ class LGBMModel(_LGBMModelBase): ...@@ -706,6 +706,7 @@ class LGBMModel(_LGBMModelBase):
X_shape="array-like or sparse matrix of shape = [n_samples, n_features]", X_shape="array-like or sparse matrix of shape = [n_samples, n_features]",
y_shape="array-like of shape = [n_samples]", y_shape="array-like of shape = [n_samples]",
sample_weight_shape="array-like of shape = [n_samples] or None, optional (default=None)", sample_weight_shape="array-like of shape = [n_samples] or None, optional (default=None)",
init_score_shape="array-like of shape = [n_samples] or None, optional (default=None)",
group_shape="array-like or None, optional (default=None)" group_shape="array-like or None, optional (default=None)"
) + "\n\n" + _lgbmmodel_doc_custom_eval_note ) + "\n\n" + _lgbmmodel_doc_custom_eval_note
......
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
import inspect import inspect
import pickle import pickle
import random
import socket import socket
from itertools import groupby from itertools import groupby
from os import getenv from os import getenv
...@@ -1228,6 +1229,50 @@ def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array( ...@@ -1228,6 +1229,50 @@ def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(
client.close(timeout=CLIENT_CLOSE_TIMEOUT) client.close(timeout=CLIENT_CLOSE_TIMEOUT)
@pytest.mark.parametrize('task', tasks)
@pytest.mark.parametrize('output', data_output)
def test_init_score(
    task,
    output,
    client
):
    """Fitting with a constant init_score should leave the root node's value at 0.

    When an init_score is supplied, the boosting process starts from that score
    instead of from the label mean, so the first tree's root output is expected
    to be exactly 0.
    """
    if task == 'ranking' and output == 'scipy_csr_matrix':
        pytest.skip('LGBMRanker is not currently tested on sparse matrices')

    # Build the distributed training data and pick the matching Dask estimator.
    if task == 'ranking':
        _, _, _, _, dX, dy, dw, dg = _create_ranking_data(output=output, group=None)
        dask_model_cls = lgb.DaskLGBMRanker
    else:
        _, _, _, dX, dy, dw = _create_data(objective=task, output=output)
        dg = None
        if task == 'classification':
            dask_model_cls = lgb.DaskLGBMClassifier
        elif task == 'regression':
            dask_model_cls = lgb.DaskLGBMRegressor

    # Tiny model: one tree with two leaves keeps the test fast.
    params = {
        'n_estimators': 1,
        'num_leaves': 2,
        'time_out': 5
    }

    # A single random constant, broadcast to one init_score entry per sample,
    # shaped to match the collection type of the labels.
    base_score = random.random()
    if output.startswith('dataframe'):
        init_scores = dy.map_partitions(
            lambda part: pd.Series([base_score] * part.size)
        )
    else:
        init_scores = da.full_like(dy, fill_value=base_score, dtype=np.float64)

    dask_model = dask_model_cls(client=client, **params)
    dask_model.fit(dX, dy, sample_weight=dw, init_score=init_scores, group=dg)

    # value of the root node is 0 when init_score is set
    assert dask_model.booster_.trees_to_dataframe()['value'][0] == 0

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
def sklearn_checks_to_run(): def sklearn_checks_to_run():
check_names = [ check_names = [
"check_estimator_get_tags_default_keys", "check_estimator_get_tags_default_keys",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment