[dask] Include support for init_score (#3950)

* include support for init_score * use dataframe from init_score and test difference with and without init_score in local model * revert refactoring * initial docs. test between distributed models with and without init_score * remove ranker from tests * test value for root node and change docs * comma * re-include parametrize * fix incorrect merge * use single init_score and the booster_ attribute * use np.float64 instead of float

[dask] Include support for init_score (#3950)
* include support for init_score * use dataframe from init_score and test difference with and without init_score in local model * revert refactoring * initial docs. test between distributed models with and without init_score * remove ranker from tests * test value for root node and change docs * comma * re-include parametrize * fix incorrect merge * use single init_score and the booster_ attribute * use np.float64 instead of float
37e98782 · jmoralez · GitHub · 19f35772 · 37e98782 · 37e98782
Unverified Commit 37e98782 authored Mar 04, 2021 by jmoralez Committed by GitHub Mar 04, 2021
3 changed files
--- a/python-package/lightgbm/dask.py
+++ b/python-package/lightgbm/dask.py
@@ -105,12 +105,17 @@ def _train_part(
    else:
        group = None

+    if 'init_score' in list_of_parts[0]:
+        init_score = _concat([x['init_score'] for x in list_of_parts])
+    else:
+        init_score = None
+
    try:
        model = model_factory(**params)
        if is_ranker:
-            model.fit(data, label, sample_weight=weight, group=group, **kwargs)
+            model.fit(data, label, sample_weight=weight, init_score=init_score, group=group, **kwargs)
        else:
-            model.fit(data, label, sample_weight=weight, **kwargs)
+            model.fit(data, label, sample_weight=weight, init_score=init_score, **kwargs)

    finally:
        _safe_call(_LIB.LGBM_NetworkFree())
@@ -168,6 +173,7 @@ def _train(
    params: Dict[str, Any],
    model_factory: Type[LGBMModel],
    sample_weight: Optional[_DaskCollection] = None,
+    init_score: Optional[_DaskCollection] = None,
    group: Optional[_DaskCollection] = None,
    **kwargs: Any
 ) -> LGBMModel:
@@ -187,6 +193,8 @@ def _train(
        Class of the local underlying model.
    sample_weight : Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)
        Weights of training data.
+    init_score : Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)
+        Init score of training data.
    group : Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)
        Group/query data.
        Only used in the learning-to-rank task.
@@ -289,6 +297,11 @@ def _train(
        for i in range(n_parts):
            parts[i]['group'] = group_parts[i]

+    if init_score is not None:
+        init_score_parts = _split_to_parts(data=init_score, is_matrix=False)
+        for i in range(n_parts):
+            parts[i]['init_score'] = init_score_parts[i]
+
    # Start computation in the background
    parts = list(map(delayed, parts))
    parts = client.compute(parts)
@@ -540,6 +553,7 @@ class _DaskLGBMModel:
        X: _DaskMatrixLike,
        y: _DaskCollection,
        sample_weight: Optional[_DaskCollection] = None,
+        init_score: Optional[_DaskCollection] = None,
        group: Optional[_DaskCollection] = None,
        **kwargs: Any
    ) -> "_DaskLGBMModel":
@@ -556,6 +570,7 @@ class _DaskLGBMModel:
            params=params,
            model_factory=model_factory,
            sample_weight=sample_weight,
+            init_score=init_score,
            group=group,
            **kwargs
        )
@@ -657,6 +672,7 @@ class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel):
        X: _DaskMatrixLike,
        y: _DaskCollection,
        sample_weight: Optional[_DaskCollection] = None,
+        init_score: Optional[_DaskCollection] = None,
        **kwargs: Any
    ) -> "DaskLGBMClassifier":
        """Docstring is inherited from the lightgbm.LGBMClassifier.fit."""
@@ -665,6 +681,7 @@ class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel):
            X=X,
            y=y,
            sample_weight=sample_weight,
+            init_score=init_score,
            **kwargs
        )

@@ -672,11 +689,12 @@ class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel):
        X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
        y_shape="Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]",
        sample_weight_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)",
+        init_score_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)",
        group_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)"
    )

-    # DaskLGBMClassifier does not support init_score, evaluation data, or early stopping
-    _base_doc = (_base_doc[:_base_doc.find('init_score :')]
+    # DaskLGBMClassifier does not support group, evaluation data, or early stopping
+    _base_doc = (_base_doc[:_base_doc.find('group :')]
                 + _base_doc[_base_doc.find('verbose :'):])

    # DaskLGBMClassifier support for callbacks and init_model is not tested
@@ -808,6 +826,7 @@ class DaskLGBMRegressor(LGBMRegressor, _DaskLGBMModel):
        X: _DaskMatrixLike,
        y: _DaskCollection,
        sample_weight: Optional[_DaskCollection] = None,
+        init_score: Optional[_DaskCollection] = None,
        **kwargs: Any
    ) -> "DaskLGBMRegressor":
        """Docstring is inherited from the lightgbm.LGBMRegressor.fit."""
@@ -816,6 +835,7 @@ class DaskLGBMRegressor(LGBMRegressor, _DaskLGBMModel):
            X=X,
            y=y,
            sample_weight=sample_weight,
+            init_score=init_score,
            **kwargs
        )

@@ -823,11 +843,12 @@ class DaskLGBMRegressor(LGBMRegressor, _DaskLGBMModel):
        X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
        y_shape="Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]",
        sample_weight_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)",
+        init_score_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)",
        group_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)"
    )

-    # DaskLGBMRegressor does not support init_score, evaluation data, or early stopping
-    _base_doc = (_base_doc[:_base_doc.find('init_score :')]
+    # DaskLGBMRegressor does not support group, evaluation data, or early stopping
+    _base_doc = (_base_doc[:_base_doc.find('group :')]
                 + _base_doc[_base_doc.find('verbose :'):])

    # DaskLGBMRegressor support for callbacks and init_model is not tested
@@ -945,14 +966,12 @@ class DaskLGBMRanker(LGBMRanker, _DaskLGBMModel):
        **kwargs: Any
    ) -> "DaskLGBMRanker":
        """Docstring is inherited from the lightgbm.LGBMRanker.fit."""
-        if init_score is not None:
-            raise RuntimeError('init_score is not currently supported in lightgbm.dask')
-
        return self._lgb_dask_fit(
            model_factory=LGBMRanker,
            X=X,
            y=y,
            sample_weight=sample_weight,
+            init_score=init_score,
            group=group,
            **kwargs
        )
@@ -961,13 +980,11 @@ class DaskLGBMRanker(LGBMRanker, _DaskLGBMModel):
        X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
        y_shape="Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]",
        sample_weight_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)",
+        init_score_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)",
        group_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)"
    )

-    # DaskLGBMRanker does not support init_score, evaluation data, or early stopping
-    _base_doc = (_base_doc[:_base_doc.find('init_score :')]
-                 + _base_doc[_base_doc.find('init_score :'):])
-
+    # DaskLGBMRanker does not support evaluation data or early stopping
    _base_doc = (_base_doc[:_base_doc.find('eval_set :')]
                 + _base_doc[_base_doc.find('verbose :'):])


--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -189,7 +189,7 @@ _lgbmmodel_doc_fit = (
        The target values (class labels in classification, real numbers in regression).
    sample_weight : {sample_weight_shape}
        Weights of training data.
-    init_score : array-like of shape = [n_samples] or None, optional (default=None)
+    init_score : {init_score_shape}
        Init score of training data.
    group : {group_shape}
        Group/query data.
@@ -706,6 +706,7 @@ class LGBMModel(_LGBMModelBase):
        X_shape="array-like or sparse matrix of shape = [n_samples, n_features]",
        y_shape="array-like of shape = [n_samples]",
        sample_weight_shape="array-like of shape = [n_samples] or None, optional (default=None)",
+        init_score_shape="array-like of shape = [n_samples] or None, optional (default=None)",
        group_shape="array-like or None, optional (default=None)"
    ) + "\n\n" + _lgbmmodel_doc_custom_eval_note


--- a/tests/python_package_test/test_dask.py
+++ b/tests/python_package_test/test_dask.py
@@ -3,6 +3,7 @@

 import inspect
 import pickle
+import random
 import socket
 from itertools import groupby
 from os import getenv
@@ -1228,6 +1229,50 @@ def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(
    client.close(timeout=CLIENT_CLOSE_TIMEOUT)


+@pytest.mark.parametrize('task', tasks)
+@pytest.mark.parametrize('output', data_output)
+def test_init_score(
+        task,
+        output,
+        client):
+    if task == 'ranking' and output == 'scipy_csr_matrix':
+        pytest.skip('LGBMRanker is not currently tested on sparse matrices')
+
+    if task == 'ranking':
+        _, _, _, _, dX, dy, dw, dg = _create_ranking_data(
+            output=output,
+            group=None
+        )
+        model_factory = lgb.DaskLGBMRanker
+    else:
+        _, _, _, dX, dy, dw = _create_data(
+            objective=task,
+            output=output,
+        )
+        dg = None
+        if task == 'classification':
+            model_factory = lgb.DaskLGBMClassifier
+        elif task == 'regression':
+            model_factory = lgb.DaskLGBMRegressor
+
+    params = {
+        'n_estimators': 1,
+        'num_leaves': 2,
+        'time_out': 5
+    }
+    init_score = random.random()
+    if output.startswith('dataframe'):
+        init_scores = dy.map_partitions(lambda x: pd.Series([init_score] * x.size))
+    else:
+        init_scores = da.full_like(dy, fill_value=init_score, dtype=np.float64)
+    model = model_factory(client=client, **params)
+    model.fit(dX, dy, sample_weight=dw, init_score=init_scores, group=dg)
+    # value of the root node is 0 when init_score is set
+    assert model.booster_.trees_to_dataframe()['value'][0] == 0
+
+    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
+
+
 def sklearn_checks_to_run():
    check_names = [
        "check_estimator_get_tags_default_keys",