Commit 2459362a authored by Nikita Titov's avatar Nikita Titov Committed by Guolin Ke
Browse files

[python] fixed picklability of sklearn models with custom obj and updated...

[python] fixed picklability of sklearn models with custom obj and updated docstrings for custom obj (#2191)

* refactored joblib test

* fixed picklability of sklearn models with custom obj and updated docstrings for custom obj

* pickled model should be able to predict without refitting
parent e5b6e50e
......@@ -59,7 +59,7 @@ def is_numeric(obj):
def is_numpy_1d_array(data):
"""Check whether data is a 1-D numpy array."""
"""Check whether data is a numpy 1-D array."""
return isinstance(data, np.ndarray) and len(data.shape) == 1
......@@ -69,7 +69,7 @@ def is_1d_list(data):
def list_to_1d_numpy(data, dtype=np.float32, name='list'):
"""Convert data to 1-D numpy array."""
"""Convert data to numpy 1-D array."""
if is_numpy_1d_array(data):
if data.dtype == dtype:
return data
......@@ -1853,9 +1853,20 @@ class Booster(object):
If None, last training data is used.
fobj : callable or None, optional (default=None)
Customized objective function.
Should accept two parameters: preds, train_data,
and return (grad, hess).
preds : list or numpy 1-D array
The predicted values.
train_data : Dataset
The training dataset.
grad : list or numpy 1-D array
The value of the first order derivative (gradient) for each sample point.
hess : list or numpy 1-D array
The value of the second order derivative (Hessian) for each sample point.
For multi-class task, the score is grouped by class_id first, then by row_id.
If you want to get i-th row score in j-th class, the access way is score[j * num_data + i]
For multi-class task, the preds are grouped by class_id first, then by row_id.
If you want to get i-th row preds in j-th class, the access way is preds[j * num_data + i]
and you should group grad and hess in this way as well.
Returns
......@@ -1902,9 +1913,9 @@ class Booster(object):
Parameters
----------
grad : 1-D numpy array or 1-D list
grad : list or numpy 1-D array
The first order derivative (gradient).
hess : 1-D numpy array or 1-D list
hess : list or numpy 1-D array
The second order derivative (Hessian).
Returns
......@@ -1994,8 +2005,20 @@ class Booster(object):
Name of the data.
feval : callable or None, optional (default=None)
Customized evaluation function.
Should accept two parameters: preds, train_data,
Should accept two parameters: preds, eval_data,
and return (eval_name, eval_result, is_higher_better) or list of such tuples.
preds : list or numpy 1-D array
The predicted values.
eval_data : Dataset
The evaluation dataset.
eval_name : string
The name of evaluation function.
eval_result : float
The eval result.
is_higher_better : bool
Whether a higher eval result is better, e.g. AUC is ``is_higher_better``.
For multi-class task, the preds are grouped by class_id first, then by row_id.
If you want to get i-th row preds in j-th class, the access way is preds[j * num_data + i].
......@@ -2030,6 +2053,18 @@ class Booster(object):
Customized evaluation function.
Should accept two parameters: preds, train_data,
and return (eval_name, eval_result, is_higher_better) or list of such tuples.
preds : list or numpy 1-D array
The predicted values.
train_data : Dataset
The training dataset.
eval_name : string
The name of evaluation function.
eval_result : float
The eval result.
is_higher_better : bool
Whether a higher eval result is better, e.g. AUC is ``is_higher_better``.
For multi-class task, the preds are grouped by class_id first, then by row_id.
If you want to get i-th row preds in j-th class, the access way is preds[j * num_data + i].
......@@ -2047,8 +2082,20 @@ class Booster(object):
----------
feval : callable or None, optional (default=None)
Customized evaluation function.
Should accept two parameters: preds, train_data,
Should accept two parameters: preds, valid_data,
and return (eval_name, eval_result, is_higher_better) or list of such tuples.
preds : list or numpy 1-D array
The predicted values.
valid_data : Dataset
The validation dataset.
eval_name : string
The name of evaluation function.
eval_result : float
The eval result.
is_higher_better : bool
Whether a higher eval result is better, e.g. AUC is ``is_higher_better``.
For multi-class task, the preds are grouped by class_id first, then by row_id.
If you want to get i-th row preds in j-th class, the access way is preds[j * num_data + i].
......
......@@ -39,10 +39,38 @@ def train(params, train_set, num_boost_round=100,
Names of ``valid_sets``.
fobj : callable or None, optional (default=None)
Customized objective function.
Should accept two parameters: preds, train_data,
and return (grad, hess).
preds : list or numpy 1-D array
The predicted values.
train_data : Dataset
The training dataset.
grad : list or numpy 1-D array
The value of the first order derivative (gradient) for each sample point.
hess : list or numpy 1-D array
The value of the second order derivative (Hessian) for each sample point.
For multi-class task, the preds are grouped by class_id first, then by row_id.
If you want to get i-th row preds in j-th class, the access way is preds[j * num_data + i]
and you should group grad and hess in this way as well.
feval : callable or None, optional (default=None)
Customized evaluation function.
Should accept two parameters: preds, train_data,
and return (eval_name, eval_result, is_higher_better) or list of such tuples.
preds : list or numpy 1-D array
The predicted values.
train_data : Dataset
The training dataset.
eval_name : string
The name of evaluation function.
eval_result : float
The eval result.
is_higher_better : bool
Whether a higher eval result is better, e.g. AUC is ``is_higher_better``.
For multi-class task, the preds are grouped by class_id first, then by row_id.
If you want to get i-th row preds in j-th class, the access way is preds[j * num_data + i].
To ignore the default metric corresponding to the used objective,
......@@ -373,11 +401,39 @@ def cv(params, train_set, num_boost_round=100,
Evaluation metrics to be monitored while CV.
If not None, the metric in ``params`` will be overridden.
fobj : callable or None, optional (default=None)
Custom objective function.
Customized objective function.
Should accept two parameters: preds, train_data,
and return (grad, hess).
preds : list or numpy 1-D array
The predicted values.
train_data : Dataset
The training dataset.
grad : list or numpy 1-D array
The value of the first order derivative (gradient) for each sample point.
hess : list or numpy 1-D array
The value of the second order derivative (Hessian) for each sample point.
For multi-class task, the preds are grouped by class_id first, then by row_id.
If you want to get i-th row preds in j-th class, the access way is preds[j * num_data + i]
and you should group grad and hess in this way as well.
feval : callable or None, optional (default=None)
Customized evaluation function.
Should accept two parameters: preds, train_data,
and return (eval_name, eval_result, is_higher_better) or list of such tuples.
preds : list or numpy 1-D array
The predicted values.
train_data : Dataset
The training dataset.
eval_name : string
The name of evaluation function.
eval_result : float
The eval result.
is_higher_better : bool
Whether a higher eval result is better, e.g. AUC is ``is_higher_better``.
For multi-class task, the preds are grouped by class_id first, then by row_id.
If you want to get i-th row preds in j-th class, the access way is preds[j * num_data + i].
To ignore the default metric corresponding to the used objective,
......
......@@ -15,46 +15,63 @@ from .compat import (SKLEARN_INSTALLED, _LGBMClassifierBase,
from .engine import train
def _objective_function_wrapper(func):
"""Decorate an objective function.
class _ObjectiveFunctionWrapper(object):
"""Proxy class for objective function."""
Note
----
For multi-class task, the y_pred is grouped by class_id first, then by row_id.
If you want to get i-th row y_pred in j-th class, the access way is y_pred[j * num_data + i]
and you should group grad and hess in this way as well.
def __init__(self, func):
"""Construct a proxy class.
Parameters
----------
func : callable
Expects a callable with signature ``func(y_true, y_pred)`` or ``func(y_true, y_pred, group)``:
This class transforms objective function to match objective function with signature ``new_func(preds, dataset)``
as expected by ``lightgbm.engine.train``.
y_true : array-like of shape = [n_samples]
The target values.
y_pred : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
The predicted values.
group : array-like
Group/query data, used for ranking task.
Parameters
----------
func : callable
Expects a callable with signature ``func(y_true, y_pred)`` or ``func(y_true, y_pred, group)``
and returns (grad, hess):
y_true : array-like of shape = [n_samples]
The target values.
y_pred : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
The predicted values.
group : array-like
Group/query data, used for ranking task.
grad : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
The value of the first order derivative (gradient) for each sample point.
hess : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
The value of the second order derivative (Hessian) for each sample point.
Note
----
For multi-class task, the y_pred is grouped by class_id first, then by row_id.
If you want to get i-th row y_pred in j-th class, the access way is y_pred[j * num_data + i]
and you should group grad and hess in this way as well.
"""
self.func = func
Returns
-------
new_func : callable
The new objective function as expected by ``lightgbm.engine.train``.
The signature is ``new_func(preds, dataset)``:
def __call__(self, preds, dataset):
"""Call passed function with appropriate arguments.
preds : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
The predicted values.
dataset : Dataset
The training set from which the labels will be extracted using ``dataset.get_label()``.
"""
def inner(preds, dataset):
"""Call passed function with appropriate arguments."""
Parameters
----------
preds : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
The predicted values.
dataset : Dataset
The training dataset.
Returns
-------
grad : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
The value of the first order derivative (gradient) for each sample point.
hess : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
The value of the second order derivative (Hessian) for each sample point.
"""
labels = dataset.get_label()
argc = argc_(func)
argc = argc_(self.func)
if argc == 2:
grad, hess = func(labels, preds)
grad, hess = self.func(labels, preds)
elif argc == 3:
grad, hess = func(labels, preds, dataset.get_group())
grad, hess = self.func(labels, preds, dataset.get_group())
else:
raise TypeError("Self-defined objective function should have 2 or 3 arguments, got %d" % argc)
"""weighted for objective"""
......@@ -75,59 +92,78 @@ def _objective_function_wrapper(func):
grad[idx] *= weight[i]
hess[idx] *= weight[i]
return grad, hess
return inner
def _eval_function_wrapper(func):
"""Decorate an eval function.
class _EvalFunctionWrapper(object):
"""Proxy class for evaluation function."""
Note
----
For multi-class task, the y_pred is grouped by class_id first, then by row_id.
If you want to get i-th row y_pred in j-th class, the access way is y_pred[j * num_data + i].
def __init__(self, func):
"""Construct a proxy class.
Parameters
----------
func : callable
Expects a callable with following signatures:
``func(y_true, y_pred)``,
``func(y_true, y_pred, weight)``
or ``func(y_true, y_pred, weight, group)``
and returns (eval_name->string, eval_result->float, is_bigger_better->bool):
This class transforms evaluation function to match evaluation function with signature ``new_func(preds, dataset)``
as expected by ``lightgbm.engine.train``.
y_true : array-like of shape = [n_samples]
The target values.
y_pred : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
The predicted values.
weight : array-like of shape = [n_samples]
The weight of samples.
group : array-like
Group/query data, used for ranking task.
Parameters
----------
func : callable
Expects a callable with following signatures:
``func(y_true, y_pred)``,
``func(y_true, y_pred, weight)``
or ``func(y_true, y_pred, weight, group)``
and returns (eval_name, eval_result, is_higher_better) or
list of (eval_name, eval_result, is_higher_better):
y_true : array-like of shape = [n_samples]
The target values.
y_pred : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
The predicted values.
weight : array-like of shape = [n_samples]
The weight of samples.
group : array-like
Group/query data, used for ranking task.
eval_name : string
The name of evaluation function.
eval_result : float
The eval result.
is_higher_better : bool
Whether a higher eval result is better, e.g. AUC is ``is_higher_better``.
Returns
-------
new_func : callable
The new eval function as expected by ``lightgbm.engine.train``.
The signature is ``new_func(preds, dataset)``:
Note
----
For multi-class task, the y_pred is grouped by class_id first, then by row_id.
If you want to get i-th row y_pred in j-th class, the access way is y_pred[j * num_data + i].
"""
self.func = func
preds : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
The predicted values.
dataset : Dataset
The training set from which the labels will be extracted using ``dataset.get_label()``.
"""
def inner(preds, dataset):
"""Call passed function with appropriate arguments."""
def __call__(self, preds, dataset):
"""Call passed function with appropriate arguments.
Parameters
----------
preds : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
The predicted values.
dataset : Dataset
The training dataset.
Returns
-------
eval_name : string
The name of evaluation function.
eval_result : float
The eval result.
is_higher_better : bool
Whether a higher eval result is better, e.g. AUC is ``is_higher_better``.
"""
labels = dataset.get_label()
argc = argc_(func)
argc = argc_(self.func)
if argc == 2:
return func(labels, preds)
return self.func(labels, preds)
elif argc == 3:
return func(labels, preds, dataset.get_weight())
return self.func(labels, preds, dataset.get_weight())
elif argc == 4:
return func(labels, preds, dataset.get_weight(), dataset.get_group())
return self.func(labels, preds, dataset.get_weight(), dataset.get_group())
else:
raise TypeError("Self-defined eval function should have 2, 3 or 4 arguments, got %d" % argc)
return inner
class LGBMModel(_LGBMModelBase):
......@@ -248,9 +284,9 @@ class LGBMModel(_LGBMModelBase):
group : array-like
Group/query data, used for ranking task.
grad : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
The value of the gradient for each sample point.
The value of the first order derivative (gradient) for each sample point.
hess : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
The value of the second derivative for each sample point.
The value of the second order derivative (Hessian) for each sample point.
For multi-class task, the y_pred is grouped by class_id first, then by row_id.
If you want to get i-th row y_pred in j-th class, the access way is y_pred[j * num_data + i]
......@@ -414,8 +450,8 @@ class LGBMModel(_LGBMModelBase):
Custom eval function expects a callable with following signatures:
``func(y_true, y_pred)``, ``func(y_true, y_pred, weight)`` or
``func(y_true, y_pred, weight, group)``
and returns (eval_name, eval_result, is_bigger_better) or
list of (eval_name, eval_result, is_bigger_better):
and returns (eval_name, eval_result, is_higher_better) or
list of (eval_name, eval_result, is_higher_better):
y_true : array-like of shape = [n_samples]
The target values.
......@@ -426,11 +462,11 @@ class LGBMModel(_LGBMModelBase):
group : array-like
Group/query data, used for ranking task.
eval_name : string
The name of evaluation.
The name of evaluation function.
eval_result : float
The eval result.
is_bigger_better : bool
Is eval result bigger better, e.g. AUC is bigger_better.
is_higher_better : bool
Whether a higher eval result is better, e.g. AUC is ``is_higher_better``.
For multi-class task, the y_pred is grouped by class_id first, then by row_id.
If you want to get i-th row y_pred in j-th class, the access way is y_pred[j * num_data + i].
......@@ -445,7 +481,7 @@ class LGBMModel(_LGBMModelBase):
else:
raise ValueError("Unknown LGBMModel type.")
if callable(self._objective):
self._fobj = _objective_function_wrapper(self._objective)
self._fobj = _ObjectiveFunctionWrapper(self._objective)
else:
self._fobj = None
evals_result = {}
......@@ -466,7 +502,7 @@ class LGBMModel(_LGBMModelBase):
params['objective'] = 'None' # objective = nullptr for unknown objective
if callable(eval_metric):
feval = _eval_function_wrapper(eval_metric)
feval = _EvalFunctionWrapper(eval_metric)
else:
feval = None
# register default metric for consistency with callable eval_metric case
......
......@@ -26,6 +26,17 @@ def multi_logloss(y_true, y_pred):
return np.mean([-math.log(y_pred[i][y]) for i, y in enumerate(y_true)])
def custom_asymmetric_obj(y_true, y_pred):
    """Asymmetric squared-error objective used to exercise custom objectives in tests.

    The residual is ``y_true - y_pred``. Samples with a negative residual
    (prediction above the target) get their gradient and Hessian scaled by 10
    compared with the plain squared-error values used for the other samples.

    Parameters
    ----------
    y_true : array-like
        The target values.
    y_pred : array-like
        The predicted values.

    Returns
    -------
    grad : numpy array
        The value of the first order derivative (gradient) for each sample point.
    hess : numpy array
        The value of the second order derivative (Hessian) for each sample point.
    """
    # Single named constant instead of the literal 10.0 repeated in grad and hess.
    over_prediction_penalty = 10.0
    # np.asarray lets plain lists be passed as well; ndarray inputs are used as-is.
    residual = (np.asarray(y_true) - np.asarray(y_pred)).astype("float")
    over_predicted = residual < 0
    grad = np.where(over_predicted, -2 * over_prediction_penalty * residual, -2 * residual)
    hess = np.where(over_predicted, 2 * over_prediction_penalty, 2.0)
    return grad, hess
def mse(y_true, y_pred):
    """Custom evaluation function used in tests.

    Returns a tuple ``(eval_name, eval_result, is_higher_better)`` where
    ``eval_result`` is computed by ``mean_squared_error(y_true, y_pred)``
    and ``is_higher_better`` is ``False``.
    """
    score = mean_squared_error(y_true, y_pred)
    return 'custom MSE', score, False
class TestSklearn(unittest.TestCase):
def test_binary(self):
......@@ -143,27 +154,27 @@ class TestSklearn(unittest.TestCase):
def test_joblib(self):
X, y = load_boston(True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
gbm = lgb.LGBMRegressor(n_estimators=100, silent=True)
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=10, verbose=False)
gbm = lgb.LGBMRegressor(n_estimators=10, objective=custom_asymmetric_obj,
silent=True, importance_type='split')
gbm.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)],
eval_metric=mse, early_stopping_rounds=5, verbose=False,
callbacks=[lgb.reset_parameter(learning_rate=list(np.arange(1, 0, -0.1)))])
joblib.dump(gbm, 'lgb.pkl')
joblib.dump(gbm, 'lgb.pkl') # test model with custom functions
gbm_pickle = joblib.load('lgb.pkl')
self.assertIsInstance(gbm_pickle.booster_, lgb.Booster)
self.assertDictEqual(gbm.get_params(), gbm_pickle.get_params())
self.assertListEqual(list(gbm.feature_importances_), list(gbm_pickle.feature_importances_))
X, y = load_boston(True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
gbm_pickle.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
for key in gbm.evals_result_:
for evals in zip(gbm.evals_result_[key], gbm_pickle.evals_result_[key]):
self.assertAlmostEqual(*evals, places=5)
np.testing.assert_array_equal(gbm.feature_importances_, gbm_pickle.feature_importances_)
self.assertAlmostEqual(gbm_pickle.learning_rate, 0.1)
self.assertTrue(callable(gbm_pickle.objective))
for eval_set in gbm.evals_result_:
for metric in gbm.evals_result_[eval_set]:
np.testing.assert_array_almost_equal(gbm.evals_result_[eval_set][metric],
gbm_pickle.evals_result_[eval_set][metric])
pred_origin = gbm.predict(X_test)
pred_pickle = gbm_pickle.predict(X_test)
self.assertEqual(len(pred_origin), len(pred_pickle))
for preds in zip(pred_origin, pred_pickle):
self.assertAlmostEqual(*preds, places=5)
np.testing.assert_array_almost_equal(pred_origin, pred_pickle)
def test_feature_importances_single_leaf(self):
clf = lgb.LGBMClassifier(n_estimators=100)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment