Unverified Commit 37e98782 authored by jmoralez's avatar jmoralez Committed by GitHub
Browse files

[dask] Include support for init_score (#3950)

* include support for init_score

* use dataframe from init_score and test difference with and without init_score in local model

* revert refactoring

* initial docs. test between distributed models with and without init_score

* remove ranker from tests

* test value for root node and change docs

* comma

* re-include parametrize

* fix incorrect merge

* use single init_score and the booster_ attribute

* use np.float64 instead of float
parent 19f35772
......@@ -105,12 +105,17 @@ def _train_part(
else:
group = None
if 'init_score' in list_of_parts[0]:
init_score = _concat([x['init_score'] for x in list_of_parts])
else:
init_score = None
try:
model = model_factory(**params)
if is_ranker:
model.fit(data, label, sample_weight=weight, group=group, **kwargs)
model.fit(data, label, sample_weight=weight, init_score=init_score, group=group, **kwargs)
else:
model.fit(data, label, sample_weight=weight, **kwargs)
model.fit(data, label, sample_weight=weight, init_score=init_score, **kwargs)
finally:
_safe_call(_LIB.LGBM_NetworkFree())
......@@ -168,6 +173,7 @@ def _train(
params: Dict[str, Any],
model_factory: Type[LGBMModel],
sample_weight: Optional[_DaskCollection] = None,
init_score: Optional[_DaskCollection] = None,
group: Optional[_DaskCollection] = None,
**kwargs: Any
) -> LGBMModel:
......@@ -187,6 +193,8 @@ def _train(
Class of the local underlying model.
sample_weight : Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)
Weights of training data.
init_score : Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)
Init score of training data.
group : Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)
Group/query data.
Only used in the learning-to-rank task.
......@@ -289,6 +297,11 @@ def _train(
for i in range(n_parts):
parts[i]['group'] = group_parts[i]
if init_score is not None:
init_score_parts = _split_to_parts(data=init_score, is_matrix=False)
for i in range(n_parts):
parts[i]['init_score'] = init_score_parts[i]
# Start computation in the background
parts = list(map(delayed, parts))
parts = client.compute(parts)
......@@ -540,6 +553,7 @@ class _DaskLGBMModel:
X: _DaskMatrixLike,
y: _DaskCollection,
sample_weight: Optional[_DaskCollection] = None,
init_score: Optional[_DaskCollection] = None,
group: Optional[_DaskCollection] = None,
**kwargs: Any
) -> "_DaskLGBMModel":
......@@ -556,6 +570,7 @@ class _DaskLGBMModel:
params=params,
model_factory=model_factory,
sample_weight=sample_weight,
init_score=init_score,
group=group,
**kwargs
)
......@@ -657,6 +672,7 @@ class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel):
X: _DaskMatrixLike,
y: _DaskCollection,
sample_weight: Optional[_DaskCollection] = None,
init_score: Optional[_DaskCollection] = None,
**kwargs: Any
) -> "DaskLGBMClassifier":
"""Docstring is inherited from the lightgbm.LGBMClassifier.fit."""
......@@ -665,6 +681,7 @@ class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel):
X=X,
y=y,
sample_weight=sample_weight,
init_score=init_score,
**kwargs
)
......@@ -672,11 +689,12 @@ class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel):
X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
y_shape="Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]",
sample_weight_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)",
init_score_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)",
group_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)"
)
# DaskLGBMClassifier does not support init_score, evaluation data, or early stopping
_base_doc = (_base_doc[:_base_doc.find('init_score :')]
# DaskLGBMClassifier does not support evaluation data, or early stopping
_base_doc = (_base_doc[:_base_doc.find('group :')]
+ _base_doc[_base_doc.find('verbose :'):])
# DaskLGBMClassifier support for callbacks and init_model is not tested
......@@ -808,6 +826,7 @@ class DaskLGBMRegressor(LGBMRegressor, _DaskLGBMModel):
X: _DaskMatrixLike,
y: _DaskCollection,
sample_weight: Optional[_DaskCollection] = None,
init_score: Optional[_DaskCollection] = None,
**kwargs: Any
) -> "DaskLGBMRegressor":
"""Docstring is inherited from the lightgbm.LGBMRegressor.fit."""
......@@ -816,6 +835,7 @@ class DaskLGBMRegressor(LGBMRegressor, _DaskLGBMModel):
X=X,
y=y,
sample_weight=sample_weight,
init_score=init_score,
**kwargs
)
......@@ -823,11 +843,12 @@ class DaskLGBMRegressor(LGBMRegressor, _DaskLGBMModel):
X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
y_shape="Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]",
sample_weight_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)",
init_score_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)",
group_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)"
)
# DaskLGBMRegressor does not support init_score, evaluation data, or early stopping
_base_doc = (_base_doc[:_base_doc.find('init_score :')]
# DaskLGBMRegressor does not support evaluation data, or early stopping
_base_doc = (_base_doc[:_base_doc.find('group :')]
+ _base_doc[_base_doc.find('verbose :'):])
# DaskLGBMRegressor support for callbacks and init_model is not tested
......@@ -945,14 +966,12 @@ class DaskLGBMRanker(LGBMRanker, _DaskLGBMModel):
**kwargs: Any
) -> "DaskLGBMRanker":
"""Docstring is inherited from the lightgbm.LGBMRanker.fit."""
if init_score is not None:
raise RuntimeError('init_score is not currently supported in lightgbm.dask')
return self._lgb_dask_fit(
model_factory=LGBMRanker,
X=X,
y=y,
sample_weight=sample_weight,
init_score=init_score,
group=group,
**kwargs
)
......@@ -961,13 +980,11 @@ class DaskLGBMRanker(LGBMRanker, _DaskLGBMModel):
X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
y_shape="Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]",
sample_weight_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)",
init_score_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)",
group_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)"
)
# DaskLGBMRanker does not support init_score, evaluation data, or early stopping
_base_doc = (_base_doc[:_base_doc.find('init_score :')]
+ _base_doc[_base_doc.find('init_score :'):])
# DaskLGBMRanker does not support evaluation data, or early stopping
_base_doc = (_base_doc[:_base_doc.find('eval_set :')]
+ _base_doc[_base_doc.find('verbose :'):])
......
......@@ -189,7 +189,7 @@ _lgbmmodel_doc_fit = (
The target values (class labels in classification, real numbers in regression).
sample_weight : {sample_weight_shape}
Weights of training data.
init_score : array-like of shape = [n_samples] or None, optional (default=None)
init_score : {init_score_shape}
Init score of training data.
group : {group_shape}
Group/query data.
......@@ -706,6 +706,7 @@ class LGBMModel(_LGBMModelBase):
X_shape="array-like or sparse matrix of shape = [n_samples, n_features]",
y_shape="array-like of shape = [n_samples]",
sample_weight_shape="array-like of shape = [n_samples] or None, optional (default=None)",
init_score_shape="array-like of shape = [n_samples] or None, optional (default=None)",
group_shape="array-like or None, optional (default=None)"
) + "\n\n" + _lgbmmodel_doc_custom_eval_note
......
......@@ -3,6 +3,7 @@
import inspect
import pickle
import random
import socket
from itertools import groupby
from os import getenv
......@@ -1228,6 +1229,50 @@ def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(
client.close(timeout=CLIENT_CLOSE_TIMEOUT)
@pytest.mark.parametrize('task', tasks)
@pytest.mark.parametrize('output', data_output)
def test_init_score(
task,
output,
client):
if task == 'ranking' and output == 'scipy_csr_matrix':
pytest.skip('LGBMRanker is not currently tested on sparse matrices')
if task == 'ranking':
_, _, _, _, dX, dy, dw, dg = _create_ranking_data(
output=output,
group=None
)
model_factory = lgb.DaskLGBMRanker
else:
_, _, _, dX, dy, dw = _create_data(
objective=task,
output=output,
)
dg = None
if task == 'classification':
model_factory = lgb.DaskLGBMClassifier
elif task == 'regression':
model_factory = lgb.DaskLGBMRegressor
params = {
'n_estimators': 1,
'num_leaves': 2,
'time_out': 5
}
init_score = random.random()
if output.startswith('dataframe'):
init_scores = dy.map_partitions(lambda x: pd.Series([init_score] * x.size))
else:
init_scores = da.full_like(dy, fill_value=init_score, dtype=np.float64)
model = model_factory(client=client, **params)
model.fit(dX, dy, sample_weight=dw, init_score=init_scores, group=dg)
# value of the root node is 0 when init_score is set
assert model.booster_.trees_to_dataframe()['value'][0] == 0
client.close(timeout=CLIENT_CLOSE_TIMEOUT)
def sklearn_checks_to_run():
check_names = [
"check_estimator_get_tags_default_keys",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment