[python] fixed picklability of sklearn models with custom obj and updated...

[python] fixed picklability of sklearn models with custom obj and updated docstrings for custom obj (#2191) * refactored joblib test * fixed picklability of sklearn models with custom obj and updated docstrings for custom obj * pickled model should be able to predict without refitting

[python] fixed picklability of sklearn models with custom obj and updated...
[python] fixed picklability of sklearn models with custom obj and updated docstrings for custom obj (#2191) * refactored joblib test * fixed picklability of sklearn models with custom obj and updated docstrings for custom obj * pickled model should be able to predict without refitting
2459362a · Nikita Titov · Guolin Ke · e5b6e50e · 2459362a · 2459362a
Commit 2459362a authored May 27, 2019 by Nikita Titov Committed by Guolin Ke May 27, 2019
4 changed files
--- a/python-package/lightgbm/basic.py
+++ b/python-package/lightgbm/basic.py
@@ -59,7 +59,7 @@ def is_numeric(obj):


 def is_numpy_1d_array(data):
-    """Check whether data is a 1-D numpy array."""
+    """Check whether data is a numpy 1-D array."""
    return isinstance(data, np.ndarray) and len(data.shape) == 1


@@ -69,7 +69,7 @@ def is_1d_list(data):


 def list_to_1d_numpy(data, dtype=np.float32, name='list'):
-    """Convert data to 1-D numpy array."""
+    """Convert data to numpy 1-D array."""
    if is_numpy_1d_array(data):
        if data.dtype == dtype:
            return data
@@ -1853,9 +1853,20 @@ class Booster(object):
            If None, last training data is used.
        fobj : callable or None, optional (default=None)
            Customized objective function.
+            Should accept two parameters: preds, train_data,
+            and return (grad, hess).
+
+                preds : list or numpy 1-D array
+                    The predicted values.
+                train_data : Dataset
+                    The training dataset.
+                grad : list or numpy 1-D array
+                    The value of the first order derivative (gradient) for each sample point.
+                hess : list or numpy 1-D array
+                    The value of the second order derivative (Hessian) for each sample point.

-            For multi-class task, the score is group by class_id first, then group by row_id.
-            If you want to get i-th row score in j-th class, the access way is score[j * num_data + i]
+            For multi-class task, the preds is group by class_id first, then group by row_id.
+            If you want to get i-th row preds in j-th class, the access way is preds[j * num_data + i]
            and you should group grad and hess in this way as well.

        Returns
@@ -1902,9 +1913,9 @@ class Booster(object):

        Parameters
        ----------
-        grad : 1-D numpy array or 1-D list
+        grad : list or numpy 1-D array
            The first order derivative (gradient).
-        hess : 1-D numpy array or 1-D list
+        hess : list or numpy 1-D array
            The second order derivative (Hessian).

        Returns
@@ -1994,8 +2005,20 @@ class Booster(object):
            Name of the data.
        feval : callable or None, optional (default=None)
            Customized evaluation function.
-            Should accept two parameters: preds, train_data,
+            Should accept two parameters: preds, eval_data,
            and return (eval_name, eval_result, is_higher_better) or list of such tuples.
+
+                preds : list or numpy 1-D array
+                    The predicted values.
+                eval_data : Dataset
+                    The evaluation dataset.
+                eval_name : string
+                    The name of evaluation function.
+                eval_result : float
+                    The eval result.
+                is_higher_better : bool
+                    Is eval result higher better, e.g. AUC is ``is_higher_better``.
+
            For multi-class task, the preds is group by class_id first, then group by row_id.
            If you want to get i-th row preds in j-th class, the access way is preds[j * num_data + i].

@@ -2030,6 +2053,18 @@ class Booster(object):
            Customized evaluation function.
            Should accept two parameters: preds, train_data,
            and return (eval_name, eval_result, is_higher_better) or list of such tuples.
+
+                preds : list or numpy 1-D array
+                    The predicted values.
+                train_data : Dataset
+                    The training dataset.
+                eval_name : string
+                    The name of evaluation function.
+                eval_result : float
+                    The eval result.
+                is_higher_better : bool
+                    Is eval result higher better, e.g. AUC is ``is_higher_better``.
+
            For multi-class task, the preds is group by class_id first, then group by row_id.
            If you want to get i-th row preds in j-th class, the access way is preds[j * num_data + i].

@@ -2047,8 +2082,20 @@ class Booster(object):
        ----------
        feval : callable or None, optional (default=None)
            Customized evaluation function.
-            Should accept two parameters: preds, train_data,
+            Should accept two parameters: preds, valid_data,
            and return (eval_name, eval_result, is_higher_better) or list of such tuples.
+
+                preds : list or numpy 1-D array
+                    The predicted values.
+                valid_data : Dataset
+                    The validation dataset.
+                eval_name : string
+                    The name of evaluation function.
+                eval_result : float
+                    The eval result.
+                is_higher_better : bool
+                    Is eval result higher better, e.g. AUC is ``is_higher_better``.
+
            For multi-class task, the preds is group by class_id first, then group by row_id.
            If you want to get i-th row preds in j-th class, the access way is preds[j * num_data + i].


--- a/python-package/lightgbm/engine.py
+++ b/python-package/lightgbm/engine.py
@@ -39,10 +39,38 @@ def train(params, train_set, num_boost_round=100,
        Names of ``valid_sets``.
    fobj : callable or None, optional (default=None)
        Customized objective function.
+        Should accept two parameters: preds, train_data,
+        and return (grad, hess).
+
+            preds : list or numpy 1-D array
+                The predicted values.
+            train_data : Dataset
+                The training dataset.
+            grad : list or numpy 1-D array
+                The value of the first order derivative (gradient) for each sample point.
+            hess : list or numpy 1-D array
+                The value of the second order derivative (Hessian) for each sample point.
+
+        For multi-class task, the preds is group by class_id first, then group by row_id.
+        If you want to get i-th row preds in j-th class, the access way is preds[j * num_data + i]
+        and you should group grad and hess in this way as well.
+
    feval : callable or None, optional (default=None)
        Customized evaluation function.
        Should accept two parameters: preds, train_data,
        and return (eval_name, eval_result, is_higher_better) or list of such tuples.
+
+            preds : list or numpy 1-D array
+                The predicted values.
+            train_data : Dataset
+                The training dataset.
+            eval_name : string
+                The name of evaluation function.
+            eval_result : float
+                The eval result.
+            is_higher_better : bool
+                Is eval result higher better, e.g. AUC is ``is_higher_better``.
+
        For multi-class task, the preds is group by class_id first, then group by row_id.
        If you want to get i-th row preds in j-th class, the access way is preds[j * num_data + i].
        To ignore the default metric corresponding to the used objective,
@@ -373,11 +401,39 @@ def cv(params, train_set, num_boost_round=100,
        Evaluation metrics to be monitored while CV.
        If not None, the metric in ``params`` will be overridden.
    fobj : callable or None, optional (default=None)
-        Custom objective function.
+        Customized objective function.
+        Should accept two parameters: preds, train_data,
+        and return (grad, hess).
+
+            preds : list or numpy 1-D array
+                The predicted values.
+            train_data : Dataset
+                The training dataset.
+            grad : list or numpy 1-D array
+                The value of the first order derivative (gradient) for each sample point.
+            hess : list or numpy 1-D array
+                The value of the second order derivative (Hessian) for each sample point.
+
+        For multi-class task, the preds is group by class_id first, then group by row_id.
+        If you want to get i-th row preds in j-th class, the access way is preds[j * num_data + i]
+        and you should group grad and hess in this way as well.
+
    feval : callable or None, optional (default=None)
        Customized evaluation function.
        Should accept two parameters: preds, train_data,
        and return (eval_name, eval_result, is_higher_better) or list of such tuples.
+
+            preds : list or numpy 1-D array
+                The predicted values.
+            train_data : Dataset
+                The training dataset.
+            eval_name : string
+                The name of evaluation function.
+            eval_result : float
+                The eval result.
+            is_higher_better : bool
+                Is eval result higher better, e.g. AUC is ``is_higher_better``.
+
        For multi-class task, the preds is group by class_id first, then group by row_id.
        If you want to get i-th row preds in j-th class, the access way is preds[j * num_data + i].
        To ignore the default metric corresponding to the used objective,

--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -15,46 +15,63 @@ from .compat import (SKLEARN_INSTALLED, _LGBMClassifierBase,
 from .engine import train


-def _objective_function_wrapper(func):
-    """Decorate an objective function.
+class _ObjectiveFunctionWrapper(object):
+    """Proxy class for objective function."""

-    Note
-    ----
-    For multi-class task, the y_pred is group by class_id first, then group by row_id.
-    If you want to get i-th row y_pred in j-th class, the access way is y_pred[j * num_data + i]
-    and you should group grad and hess in this way as well.
+    def __init__(self, func):
+        """Construct a proxy class.

-    Parameters
-    ----------
-    func : callable
-        Expects a callable with signature ``func(y_true, y_pred)`` or ``func(y_true, y_pred, group):
+        This class transforms objective function to match objective function with signature ``new_func(preds, dataset)``
+        as expected by ``lightgbm.engine.train``.

-            y_true : array-like of shape = [n_samples]
-                The target values.
-            y_pred : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
-                The predicted values.
-            group : array-like
-                Group/query data, used for ranking task.
+        Parameters
+        ----------
+        func : callable
+            Expects a callable with signature ``func(y_true, y_pred)`` or ``func(y_true, y_pred, group)``
+            and returns (grad, hess):
+
+                y_true : array-like of shape = [n_samples]
+                    The target values.
+                y_pred : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
+                    The predicted values.
+                group : array-like
+                    Group/query data, used for ranking task.
+                grad : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
+                    The value of the first order derivative (gradient) for each sample point.
+                hess : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
+                    The value of the second order derivative (Hessian) for each sample point.
+
+        Note
+        ----
+        For multi-class task, the y_pred is group by class_id first, then group by row_id.
+        If you want to get i-th row y_pred in j-th class, the access way is y_pred[j * num_data + i]
+        and you should group grad and hess in this way as well.
+        """
+        self.func = func

-    Returns
-    -------
-    new_func : callable
-        The new objective function as expected by ``lightgbm.engine.train``.
-        The signature is ``new_func(preds, dataset)``:
+    def __call__(self, preds, dataset):
+        """Call passed function with appropriate arguments.

-            preds : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
-                The predicted values.
-            dataset : Dataset
-                The training set from which the labels will be extracted using ``dataset.get_label()``.
-    """
-    def inner(preds, dataset):
-        """Call passed function with appropriate arguments."""
+        Parameters
+        ----------
+        preds : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
+            The predicted values.
+        dataset : Dataset
+            The training dataset.
+
+        Returns
+        -------
+        grad : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
+            The value of the first order derivative (gradient) for each sample point.
+        hess : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
+            The value of the second order derivative (Hessian) for each sample point.
+        """
        labels = dataset.get_label()
-        argc = argc_(func)
+        argc = argc_(self.func)
        if argc == 2:
-            grad, hess = func(labels, preds)
+            grad, hess = self.func(labels, preds)
        elif argc == 3:
-            grad, hess = func(labels, preds, dataset.get_group())
+            grad, hess = self.func(labels, preds, dataset.get_group())
        else:
            raise TypeError("Self-defined objective function should have 2 or 3 arguments, got %d" % argc)
        """weighted for objective"""
@@ -75,59 +92,78 @@ def _objective_function_wrapper(func):
                        grad[idx] *= weight[i]
                        hess[idx] *= weight[i]
        return grad, hess
-    return inner


-def _eval_function_wrapper(func):
-    """Decorate an eval function.
+class _EvalFunctionWrapper(object):
+    """Proxy class for evaluation function."""

-    Note
-    ----
-    For multi-class task, the y_pred is group by class_id first, then group by row_id.
-    If you want to get i-th row y_pred in j-th class, the access way is y_pred[j * num_data + i].
+    def __init__(self, func):
+        """Construct a proxy class.

-    Parameters
-    ----------
-    func : callable
-        Expects a callable with following signatures:
-        ``func(y_true, y_pred)``,
-        ``func(y_true, y_pred, weight)``
-        or ``func(y_true, y_pred, weight, group)``
-        and returns (eval_name->string, eval_result->float, is_bigger_better->bool):
+        This class transforms evaluation function to match evaluation function with signature ``new_func(preds, dataset)``
+        as expected by ``lightgbm.engine.train``.

-            y_true : array-like of shape = [n_samples]
-                The target values.
-            y_pred : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
-                The predicted values.
-            weight : array-like of shape = [n_samples]
-                The weight of samples.
-            group : array-like
-                Group/query data, used for ranking task.
+        Parameters
+        ----------
+        func : callable
+            Expects a callable with following signatures:
+            ``func(y_true, y_pred)``,
+            ``func(y_true, y_pred, weight)``
+            or ``func(y_true, y_pred, weight, group)``
+            and returns (eval_name, eval_result, is_higher_better) or
+            list of (eval_name, eval_result, is_higher_better):
+
+                y_true : array-like of shape = [n_samples]
+                    The target values.
+                y_pred : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
+                    The predicted values.
+                weight : array-like of shape = [n_samples]
+                    The weight of samples.
+                group : array-like
+                    Group/query data, used for ranking task.
+                eval_name : string
+                    The name of evaluation function.
+                eval_result : float
+                    The eval result.
+                is_higher_better : bool
+                    Is eval result higher better, e.g. AUC is ``is_higher_better``.

-    Returns
-    -------
-    new_func : callable
-        The new eval function as expected by ``lightgbm.engine.train``.
-        The signature is ``new_func(preds, dataset)``:
+        Note
+        ----
+        For multi-class task, the y_pred is group by class_id first, then group by row_id.
+        If you want to get i-th row y_pred in j-th class, the access way is y_pred[j * num_data + i].
+        """
+        self.func = func

-            preds : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
-                The predicted values.
-            dataset : Dataset
-                The training set from which the labels will be extracted using ``dataset.get_label()``.
-    """
-    def inner(preds, dataset):
-        """Call passed function with appropriate arguments."""
+    def __call__(self, preds, dataset):
+        """Call passed function with appropriate arguments.
+
+        Parameters
+        ----------
+        preds : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
+            The predicted values.
+        dataset : Dataset
+            The training dataset.
+
+        Returns
+        -------
+        eval_name : string
+            The name of evaluation function.
+        eval_result : float
+            The eval result.
+        is_higher_better : bool
+            Is eval result higher better, e.g. AUC is ``is_higher_better``.
+        """
        labels = dataset.get_label()
-        argc = argc_(func)
+        argc = argc_(self.func)
        if argc == 2:
-            return func(labels, preds)
+            return self.func(labels, preds)
        elif argc == 3:
-            return func(labels, preds, dataset.get_weight())
+            return self.func(labels, preds, dataset.get_weight())
        elif argc == 4:
-            return func(labels, preds, dataset.get_weight(), dataset.get_group())
+            return self.func(labels, preds, dataset.get_weight(), dataset.get_group())
        else:
            raise TypeError("Self-defined eval function should have 2, 3 or 4 arguments, got %d" % argc)
-    return inner


 class LGBMModel(_LGBMModelBase):
@@ -248,9 +284,9 @@ class LGBMModel(_LGBMModelBase):
            group : array-like
                Group/query data, used for ranking task.
            grad : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
-                The value of the gradient for each sample point.
+                The value of the first order derivative (gradient) for each sample point.
            hess : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
-                The value of the second derivative for each sample point.
+                The value of the second order derivative (Hessian) for each sample point.

        For multi-class task, the y_pred is group by class_id first, then group by row_id.
        If you want to get i-th row y_pred in j-th class, the access way is y_pred[j * num_data + i]
@@ -414,8 +450,8 @@ class LGBMModel(_LGBMModelBase):
        Custom eval function expects a callable with following signatures:
        ``func(y_true, y_pred)``, ``func(y_true, y_pred, weight)`` or
        ``func(y_true, y_pred, weight, group)``
-        and returns (eval_name, eval_result, is_bigger_better) or
-        list of (eval_name, eval_result, is_bigger_better):
+        and returns (eval_name, eval_result, is_higher_better) or
+        list of (eval_name, eval_result, is_higher_better):

            y_true : array-like of shape = [n_samples]
                The target values.
@@ -426,11 +462,11 @@ class LGBMModel(_LGBMModelBase):
            group : array-like
                Group/query data, used for ranking task.
            eval_name : string
-                The name of evaluation.
+                The name of evaluation function.
            eval_result : float
                The eval result.
-            is_bigger_better : bool
-                Is eval result bigger better, e.g. AUC is bigger_better.
+            is_higher_better : bool
+                Is eval result higher better, e.g. AUC is ``is_higher_better``.

        For multi-class task, the y_pred is group by class_id first, then group by row_id.
        If you want to get i-th row y_pred in j-th class, the access way is y_pred[j * num_data + i].
@@ -445,7 +481,7 @@ class LGBMModel(_LGBMModelBase):
            else:
                raise ValueError("Unknown LGBMModel type.")
        if callable(self._objective):
-            self._fobj = _objective_function_wrapper(self._objective)
+            self._fobj = _ObjectiveFunctionWrapper(self._objective)
        else:
            self._fobj = None
        evals_result = {}
@@ -466,7 +502,7 @@ class LGBMModel(_LGBMModelBase):
            params['objective'] = 'None'  # objective = nullptr for unknown objective

        if callable(eval_metric):
-            feval = _eval_function_wrapper(eval_metric)
+            feval = _EvalFunctionWrapper(eval_metric)
        else:
            feval = None
            # register default metric for consistency with callable eval_metric case

--- a/tests/python_package_test/test_sklearn.py
+++ b/tests/python_package_test/test_sklearn.py
@@ -26,6 +26,17 @@ def multi_logloss(y_true, y_pred):
    return np.mean([-math.log(y_pred[i][y]) for i, y in enumerate(y_true)])


+def custom_asymmetric_obj(y_true, y_pred):
+    residual = (y_true - y_pred).astype("float")
+    grad = np.where(residual < 0, -2 * 10.0 * residual, -2 * residual)
+    hess = np.where(residual < 0, 2 * 10.0, 2.0)
+    return grad, hess
+
+
+def mse(y_true, y_pred):
+    return 'custom MSE', mean_squared_error(y_true, y_pred), False
+
+
 class TestSklearn(unittest.TestCase):

    def test_binary(self):
@@ -143,27 +154,27 @@ class TestSklearn(unittest.TestCase):
    def test_joblib(self):
        X, y = load_boston(True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
-        gbm = lgb.LGBMRegressor(n_estimators=100, silent=True)
-        gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=10, verbose=False)
+        gbm = lgb.LGBMRegressor(n_estimators=10, objective=custom_asymmetric_obj,
+                                silent=True, importance_type='split')
+        gbm.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)],
+                eval_metric=mse, early_stopping_rounds=5, verbose=False,
+                callbacks=[lgb.reset_parameter(learning_rate=list(np.arange(1, 0, -0.1)))])

-        joblib.dump(gbm, 'lgb.pkl')
+        joblib.dump(gbm, 'lgb.pkl')  # test model with custom functions
        gbm_pickle = joblib.load('lgb.pkl')
        self.assertIsInstance(gbm_pickle.booster_, lgb.Booster)
        self.assertDictEqual(gbm.get_params(), gbm_pickle.get_params())
-        self.assertListEqual(list(gbm.feature_importances_), list(gbm_pickle.feature_importances_))
-
-        X, y = load_boston(True)
-        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
-        gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
-        gbm_pickle.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
-        for key in gbm.evals_result_:
-            for evals in zip(gbm.evals_result_[key], gbm_pickle.evals_result_[key]):
-                self.assertAlmostEqual(*evals, places=5)
+        np.testing.assert_array_equal(gbm.feature_importances_, gbm_pickle.feature_importances_)
+        self.assertAlmostEqual(gbm_pickle.learning_rate, 0.1)
+        self.assertTrue(callable(gbm_pickle.objective))
+
+        for eval_set in gbm.evals_result_:
+            for metric in gbm.evals_result_[eval_set]:
+                np.testing.assert_array_almost_equal(gbm.evals_result_[eval_set][metric],
+                                                     gbm_pickle.evals_result_[eval_set][metric])
        pred_origin = gbm.predict(X_test)
        pred_pickle = gbm_pickle.predict(X_test)
-        self.assertEqual(len(pred_origin), len(pred_pickle))
-        for preds in zip(pred_origin, pred_pickle):
-            self.assertAlmostEqual(*preds, places=5)
+        np.testing.assert_array_almost_equal(pred_origin, pred_pickle)

    def test_feature_importances_single_leaf(self):
        clf = lgb.LGBMClassifier(n_estimators=100)