"src/git@developer.sourcefind.cn:tianlh/lightgbm-dcu.git" did not exist on "9f78cceee4911dd56f4635dfd36d4482363db5aa"
Unverified Commit 1d59a045 authored by momijiame's avatar momijiame Committed by GitHub
Browse files

[python] add return_cvbooster flag to cv func and publish _CVBooster (#283,#2105,#1445) (#3204)



* [python] add return_cvbooster flag to cv function and rename _CVBooster to make public (#283,#2105)

* [python] Reduce expected metric of unit testing

* [docs] add the CVBooster to the documentation

* [python] reflect the review comments

- Add some clarifications to the documentation
- Rename CVBooster.append to make private
- Decrease iteration rounds of testing to save CI time
- Use CVBooster as root member of lgb

* [python] add more checks in testing for cv
Co-authored-by: default avatarNikita Titov <nekit94-08@mail.ru>

* [python] add docstring for instance attributes of CVBooster
Co-authored-by: default avatarNikita Titov <nekit94-08@mail.ru>

* [python] fix docstring
Co-authored-by: default avatarNikita Titov <nekit94-08@mail.ru>
Co-authored-by: default avatarNikita Titov <nekit94-08@mail.ru>
parent 66600b29
...@@ -11,6 +11,7 @@ Data Structure API ...@@ -11,6 +11,7 @@ Data Structure API
Dataset Dataset
Booster Booster
CVBooster
Training API Training API
------------ ------------
......
...@@ -8,7 +8,7 @@ from __future__ import absolute_import ...@@ -8,7 +8,7 @@ from __future__ import absolute_import
from .basic import Booster, Dataset from .basic import Booster, Dataset
from .callback import (early_stopping, print_evaluation, record_evaluation, from .callback import (early_stopping, print_evaluation, record_evaluation,
reset_parameter) reset_parameter)
from .engine import cv, train from .engine import cv, train, CVBooster
import os import os
...@@ -29,7 +29,7 @@ if os.path.isfile(os.path.join(dir_path, 'VERSION.txt')): ...@@ -29,7 +29,7 @@ if os.path.isfile(os.path.join(dir_path, 'VERSION.txt')):
with open(os.path.join(dir_path, 'VERSION.txt')) as version_file: with open(os.path.join(dir_path, 'VERSION.txt')) as version_file:
__version__ = version_file.read().strip() __version__ = version_file.read().strip()
__all__ = ['Dataset', 'Booster', __all__ = ['Dataset', 'Booster', 'CVBooster',
'train', 'cv', 'train', 'cv',
'LGBMModel', 'LGBMRegressor', 'LGBMClassifier', 'LGBMRanker', 'LGBMModel', 'LGBMRegressor', 'LGBMClassifier', 'LGBMRanker',
'print_evaluation', 'record_evaluation', 'reset_parameter', 'early_stopping', 'print_evaluation', 'record_evaluation', 'reset_parameter', 'early_stopping',
......
...@@ -276,19 +276,35 @@ def train(params, train_set, num_boost_round=100, ...@@ -276,19 +276,35 @@ def train(params, train_set, num_boost_round=100,
return booster return booster
class _CVBooster(object): class CVBooster(object):
"""Auxiliary data struct to hold all boosters of CV.""" """CVBooster in LightGBM.
Auxiliary data structure to hold and redirect all boosters of ``cv`` function.
This class has the same methods as Booster class.
All method calls are actually performed for underlying Boosters and then all returned results are returned in a list.
Attributes
----------
boosters : list of Booster
The list of underlying fitted models.
best_iteration : int
The best iteration of fitted model.
"""
def __init__(self): def __init__(self):
"""Initialize the CVBooster.
Generally, no need to instantiate manually.
"""
self.boosters = [] self.boosters = []
self.best_iteration = -1 self.best_iteration = -1
def append(self, booster): def _append(self, booster):
"""Add a booster to _CVBooster.""" """Add a booster to CVBooster."""
self.boosters.append(booster) self.boosters.append(booster)
def __getattr__(self, name): def __getattr__(self, name):
"""Redirect methods call of _CVBooster.""" """Redirect methods call of CVBooster."""
def handler_function(*args, **kwargs): def handler_function(*args, **kwargs):
"""Call methods with each booster, and concatenate their results.""" """Call methods with each booster, and concatenate their results."""
ret = [] ret = []
...@@ -341,7 +357,7 @@ def _make_n_folds(full_data, folds, nfold, params, seed, fpreproc=None, stratifi ...@@ -341,7 +357,7 @@ def _make_n_folds(full_data, folds, nfold, params, seed, fpreproc=None, stratifi
train_id = [np.concatenate([test_id[i] for i in range_(nfold) if k != i]) for k in range_(nfold)] train_id = [np.concatenate([test_id[i] for i in range_(nfold) if k != i]) for k in range_(nfold)]
folds = zip_(train_id, test_id) folds = zip_(train_id, test_id)
ret = _CVBooster() ret = CVBooster()
for train_idx, test_idx in folds: for train_idx, test_idx in folds:
train_set = full_data.subset(sorted(train_idx)) train_set = full_data.subset(sorted(train_idx))
valid_set = full_data.subset(sorted(test_idx)) valid_set = full_data.subset(sorted(test_idx))
...@@ -354,7 +370,7 @@ def _make_n_folds(full_data, folds, nfold, params, seed, fpreproc=None, stratifi ...@@ -354,7 +370,7 @@ def _make_n_folds(full_data, folds, nfold, params, seed, fpreproc=None, stratifi
if eval_train_metric: if eval_train_metric:
cvbooster.add_valid(train_set, 'train') cvbooster.add_valid(train_set, 'train')
cvbooster.add_valid(valid_set, 'valid') cvbooster.add_valid(valid_set, 'valid')
ret.append(cvbooster) ret._append(cvbooster)
return ret return ret
...@@ -380,7 +396,8 @@ def cv(params, train_set, num_boost_round=100, ...@@ -380,7 +396,8 @@ def cv(params, train_set, num_boost_round=100,
feature_name='auto', categorical_feature='auto', feature_name='auto', categorical_feature='auto',
early_stopping_rounds=None, fpreproc=None, early_stopping_rounds=None, fpreproc=None,
verbose_eval=None, show_stdv=True, seed=0, verbose_eval=None, show_stdv=True, seed=0,
callbacks=None, eval_train_metric=False): callbacks=None, eval_train_metric=False,
return_cvbooster=False):
"""Perform the cross-validation with given parameters. """Perform the cross-validation with given parameters.
Parameters Parameters
...@@ -486,6 +503,8 @@ def cv(params, train_set, num_boost_round=100, ...@@ -486,6 +503,8 @@ def cv(params, train_set, num_boost_round=100,
eval_train_metric : bool, optional (default=False) eval_train_metric : bool, optional (default=False)
Whether to display the train metric in progress. Whether to display the train metric in progress.
The score of the metric is calculated again after each training step, so there is some impact on performance. The score of the metric is calculated again after each training step, so there is some impact on performance.
return_cvbooster : bool, optional (default=False)
Whether to return Booster models trained on each fold through ``CVBooster``.
Returns Returns
------- -------
...@@ -495,6 +514,7 @@ def cv(params, train_set, num_boost_round=100, ...@@ -495,6 +514,7 @@ def cv(params, train_set, num_boost_round=100,
{'metric1-mean': [values], 'metric1-stdv': [values], {'metric1-mean': [values], 'metric1-stdv': [values],
'metric2-mean': [values], 'metric2-stdv': [values], 'metric2-mean': [values], 'metric2-stdv': [values],
...}. ...}.
If ``return_cvbooster=True``, also returns trained boosters via ``cvbooster`` key.
""" """
if not isinstance(train_set, Dataset): if not isinstance(train_set, Dataset):
raise TypeError("Training only accepts Dataset object") raise TypeError("Training only accepts Dataset object")
...@@ -586,4 +606,8 @@ def cv(params, train_set, num_boost_round=100, ...@@ -586,4 +606,8 @@ def cv(params, train_set, num_boost_round=100,
for k in results: for k in results:
results[k] = results[k][:cvfolds.best_iteration] results[k] = results[k][:cvfolds.best_iteration]
break break
if return_cvbooster:
results['cvbooster'] = cvfolds
return dict(results) return dict(results)
...@@ -735,6 +735,50 @@ class TestEngine(unittest.TestCase): ...@@ -735,6 +735,50 @@ class TestEngine(unittest.TestCase):
verbose_eval=False) verbose_eval=False)
np.testing.assert_allclose(cv_res_lambda['ndcg@3-mean'], cv_res_lambda_obj['ndcg@3-mean']) np.testing.assert_allclose(cv_res_lambda['ndcg@3-mean'], cv_res_lambda_obj['ndcg@3-mean'])
def test_cvbooster(self):
    """lgb.cv(return_cvbooster=True) should expose the fitted fold boosters."""
    data, target = load_breast_cancer(True)
    X_fit, X_eval, y_fit, y_eval = train_test_split(data, target, test_size=0.1, random_state=42)
    booster_params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbose': -1,
    }
    dtrain = lgb.Dataset(X_fit, y_fit)

    # case 1: early stopping enabled -> a positive best_iteration is recorded
    result = lgb.cv(booster_params, dtrain,
                    num_boost_round=25,
                    early_stopping_rounds=5,
                    verbose_eval=False,
                    nfold=3,
                    return_cvbooster=True)
    self.assertIn('cvbooster', result)
    wrapper = result['cvbooster']
    self.assertIsInstance(wrapper, lgb.CVBooster)
    self.assertIsInstance(wrapper.boosters, list)
    self.assertEqual(len(wrapper.boosters), 3)
    self.assertTrue(all(isinstance(member, lgb.Booster) for member in wrapper.boosters))
    self.assertGreater(wrapper.best_iteration, 0)

    # method calls are redirected to every underlying fold booster
    fold_preds = wrapper.predict(X_eval, num_iteration=wrapper.best_iteration)
    self.assertIsInstance(fold_preds, list)
    self.assertEqual(len(fold_preds), 3)

    # averaging the folds should yield a reasonable ensemble score
    blended = np.mean(fold_preds, axis=0)
    holdout_loss = log_loss(y_eval, blended)
    self.assertLess(holdout_loss, 0.13)

    # case 2: no early stopping -> best_iteration keeps its -1 sentinel
    result = lgb.cv(booster_params, dtrain,
                    num_boost_round=20,
                    verbose_eval=False,
                    nfold=3,
                    return_cvbooster=True)
    wrapper = result['cvbooster']
    self.assertEqual(wrapper.best_iteration, -1)
    fold_preds = wrapper.predict(X_eval)
    blended = np.mean(fold_preds, axis=0)
    holdout_loss = log_loss(y_eval, blended)
    self.assertLess(holdout_loss, 0.15)
def test_feature_name(self): def test_feature_name(self):
X_train, y_train = load_boston(True) X_train, y_train = load_boston(True)
params = {'verbose': -1} params = {'verbose': -1}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment