Commit 35440b9c authored by wxchan's avatar wxchan Committed by Guolin Ke
Browse files

[python-package] change default best_iteration to 0 (#495)

* make test fail

* change default best_iteration to 0

* fix test

* change data_splitter to folds in cv

* update docs
parent e18c7856
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
* [Training API](Python-API.md#training-api) * [Training API](Python-API.md#training-api)
- [train](Python-API.md#trainparams-train_set-num_boost_round100-valid_setsnone-valid_namesnone-fobjnone-fevalnone-init_modelnone-feature_nameauto-categorical_featureauto-early_stopping_roundsnone-evals_resultnone-verbose_evaltrue-learning_ratesnone-callbacksnone) - [train](Python-API.md#trainparams-train_set-num_boost_round100-valid_setsnone-valid_namesnone-fobjnone-fevalnone-init_modelnone-feature_nameauto-categorical_featureauto-early_stopping_roundsnone-evals_resultnone-verbose_evaltrue-learning_ratesnone-callbacksnone)
- [cv](Python-API.md#cvparams-train_set-num_boost_round10-data_splitternone-nfold5-stratifiedfalse-shuffletrue-metricsnone-fobjnone-fevalnone-init_modelnone-feature_nameauto-categorical_featureauto-early_stopping_roundsnone-fpreprocnone-verbose_evalnone-show_stdvtrue-seed0-callbacksnone) - [cv](Python-API.md#cvparams-train_set-num_boost_round10-foldsnone-nfold5-stratifiedfalse-shuffletrue-metricsnone-fobjnone-fevalnone-init_modelnone-feature_nameauto-categorical_featureauto-early_stopping_roundsnone-fpreprocnone-verbose_evalnone-show_stdvtrue-seed0-callbacksnone)
* [Scikit-learn API](Python-API.md#scikit-learn-api) * [Scikit-learn API](Python-API.md#scikit-learn-api)
- [Common Methods](Python-API.md#common-methods) - [Common Methods](Python-API.md#common-methods)
...@@ -538,7 +538,7 @@ The methods of each Class is in alphabetical order. ...@@ -538,7 +538,7 @@ The methods of each Class is in alphabetical order.
booster : a trained booster model booster : a trained booster model
#### cv(params, train_set, num_boost_round=10, data_splitter=None, nfold=5, stratified=False, shuffle=True, metrics=None, fobj=None, feval=None, init_model=None, feature_name='auto', categorical_feature='auto', early_stopping_rounds=None, fpreproc=None, verbose_eval=None, show_stdv=True, seed=0, callbacks=None) #### cv(params, train_set, num_boost_round=10, folds=None, nfold=5, stratified=False, shuffle=True, metrics=None, fobj=None, feval=None, init_model=None, feature_name='auto', categorical_feature='auto', early_stopping_rounds=None, fpreproc=None, verbose_eval=None, show_stdv=True, seed=0, callbacks=None)
Cross-validation with given parameters. Cross-validation with given parameters.
...@@ -550,8 +550,9 @@ The methods of each Class is in alphabetical order. ...@@ -550,8 +550,9 @@ The methods of each Class is in alphabetical order.
Data to be trained. Data to be trained.
num_boost_round : int num_boost_round : int
Number of boosting iterations. Number of boosting iterations.
data_splitter : an instance with split(X) method folds : a generator or iterator of (train_idx, test_idx) tuples
Instance with split(X) method. The train indices and test indices for each fold.
This argument has the highest priority over other data split arguments.
nfold : int nfold : int
Number of folds in CV. Number of folds in CV.
stratified : bool stratified : bool
......
...@@ -165,7 +165,7 @@ def train(params, train_set, num_boost_round=100, ...@@ -165,7 +165,7 @@ def train(params, train_set, num_boost_round=100,
booster.set_train_data_name(train_data_name) booster.set_train_data_name(train_data_name)
for valid_set, name_valid_set in zip(reduced_valid_sets, name_valid_sets): for valid_set, name_valid_set in zip(reduced_valid_sets, name_valid_sets):
booster.add_valid(valid_set, name_valid_set) booster.add_valid(valid_set, name_valid_set)
booster.best_iteration = -1 booster.best_iteration = 0
"""start training""" """start training"""
for i in range_(init_iteration, init_iteration + num_boost_round): for i in range_(init_iteration, init_iteration + num_boost_round):
...@@ -224,16 +224,15 @@ class CVBooster(object): ...@@ -224,16 +224,15 @@ class CVBooster(object):
return handlerFunction return handlerFunction
def _make_n_folds(full_data, data_splitter, nfold, params, seed, fpreproc=None, stratified=False, shuffle=True): def _make_n_folds(full_data, folds, nfold, params, seed, fpreproc=None, stratified=False, shuffle=True):
""" """
Make an n-fold list of Booster from random indices. Make an n-fold list of Booster from random indices.
""" """
full_data = full_data.construct() full_data = full_data.construct()
num_data = full_data.num_data() num_data = full_data.num_data()
if data_splitter is not None: if folds is not None:
if not hasattr(data_splitter, 'split'): if not hasattr(folds, '__iter__'):
raise AttributeError("data_splitter has no method 'split'") raise AttributeError("folds should be a generator or iterator of (train_idx, test_idx)")
folds = data_splitter.split(np.arange(num_data))
else: else:
if 'objective' in params and params['objective'] == 'lambdarank': if 'objective' in params and params['objective'] == 'lambdarank':
if not SKLEARN_INSTALLED: if not SKLEARN_INSTALLED:
...@@ -287,7 +286,7 @@ def _agg_cv_result(raw_results): ...@@ -287,7 +286,7 @@ def _agg_cv_result(raw_results):
def cv(params, train_set, num_boost_round=10, def cv(params, train_set, num_boost_round=10,
data_splitter=None, nfold=5, stratified=False, shuffle=True, folds=None, nfold=5, stratified=False, shuffle=True,
metrics=None, fobj=None, feval=None, init_model=None, metrics=None, fobj=None, feval=None, init_model=None,
feature_name='auto', categorical_feature='auto', feature_name='auto', categorical_feature='auto',
early_stopping_rounds=None, fpreproc=None, early_stopping_rounds=None, fpreproc=None,
...@@ -304,8 +303,9 @@ def cv(params, train_set, num_boost_round=10, ...@@ -304,8 +303,9 @@ def cv(params, train_set, num_boost_round=10,
Data to be trained. Data to be trained.
num_boost_round : int num_boost_round : int
Number of boosting iterations. Number of boosting iterations.
data_splitter : an instance with split(X) method folds : a generator or iterator of (train_idx, test_idx) tuples
Instance with split(X) method. The train indices and test indices for each fold.
This argument has the highest priority over other data split arguments.
nfold : int nfold : int
Number of folds in CV. Number of folds in CV.
stratified : bool stratified : bool
...@@ -373,10 +373,9 @@ def cv(params, train_set, num_boost_round=10, ...@@ -373,10 +373,9 @@ def cv(params, train_set, num_boost_round=10,
params['metric'] = metrics params['metric'] = metrics
results = collections.defaultdict(list) results = collections.defaultdict(list)
cvfolds = _make_n_folds(train_set, data_splitter=data_splitter, cvfolds = _make_n_folds(train_set, folds=folds, nfold=nfold,
nfold=nfold, params=params, seed=seed, params=params, seed=seed, fpreproc=fpreproc,
fpreproc=fpreproc, stratified=stratified, stratified=stratified, shuffle=shuffle)
shuffle=shuffle)
# setup callbacks # setup callbacks
if callbacks is None: if callbacks is None:
......
...@@ -108,7 +108,7 @@ class TestEngine(unittest.TestCase): ...@@ -108,7 +108,7 @@ class TestEngine(unittest.TestCase):
valid_names=valid_set_name, valid_names=valid_set_name,
verbose_eval=False, verbose_eval=False,
early_stopping_rounds=5) early_stopping_rounds=5)
self.assertEqual(gbm.best_iteration, -1) self.assertEqual(gbm.best_iteration, 0)
self.assertIn(valid_set_name, gbm.best_score) self.assertIn(valid_set_name, gbm.best_score)
self.assertIn('binary_logloss', gbm.best_score[valid_set_name]) self.assertIn('binary_logloss', gbm.best_score[valid_set_name])
# early stopping occurs # early stopping occurs
...@@ -189,10 +189,10 @@ class TestEngine(unittest.TestCase): ...@@ -189,10 +189,10 @@ class TestEngine(unittest.TestCase):
lgb.cv(params, lgb_train, num_boost_round=10, nfold=3, shuffle=True, lgb.cv(params, lgb_train, num_boost_round=10, nfold=3, shuffle=True,
metrics='l1', verbose_eval=False, metrics='l1', verbose_eval=False,
callbacks=[lgb.reset_parameter(learning_rate=lambda i: 0.1 - 0.001 * i)]) callbacks=[lgb.reset_parameter(learning_rate=lambda i: 0.1 - 0.001 * i)])
# self defined data_splitter # self defined folds
tss = TimeSeriesSplit(3) tss = TimeSeriesSplit(3)
lgb.cv(params, lgb_train, num_boost_round=10, data_splitter=tss, nfold=5, # test if wrong nfold is ignored folds = tss.split(X_train)
metrics='l2', verbose_eval=False) lgb.cv(params_with_metric, lgb_train, num_boost_round=10, folds=folds, verbose_eval=False)
# lambdarank # lambdarank
X_train, y_train = load_svmlight_file('../../examples/lambdarank/rank.train') X_train, y_train = load_svmlight_file('../../examples/lambdarank/rank.train')
q_train = np.loadtxt('../../examples/lambdarank/rank.train.query') q_train = np.loadtxt('../../examples/lambdarank/rank.train.query')
......
...@@ -27,29 +27,29 @@ class TestSklearn(unittest.TestCase): ...@@ -27,29 +27,29 @@ class TestSklearn(unittest.TestCase):
X, y = load_breast_cancer(True) X, y = load_breast_cancer(True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
gbm = lgb.LGBMClassifier(n_estimators=50, silent=True) gbm = lgb.LGBMClassifier(n_estimators=50, silent=True)
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False) gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False)
ret = log_loss(y_test, gbm.predict_proba(X_test)) ret = log_loss(y_test, gbm.predict_proba(X_test))
self.assertLess(ret, 0.15) self.assertLess(ret, 0.15)
self.assertAlmostEqual(ret, gbm.evals_result['valid_0']['binary_logloss'][-1], places=5) self.assertAlmostEqual(ret, gbm.evals_result['valid_0']['binary_logloss'][gbm.best_iteration - 1], places=5)
def test_regreesion(self): def test_regreesion(self):
X, y = load_boston(True) X, y = load_boston(True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
gbm = lgb.LGBMRegressor(n_estimators=50, silent=True) gbm = lgb.LGBMRegressor(n_estimators=50, silent=True)
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False) gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False)
ret = mean_squared_error(y_test, gbm.predict(X_test)) ret = mean_squared_error(y_test, gbm.predict(X_test))
self.assertLess(ret, 16) self.assertLess(ret, 16)
self.assertAlmostEqual(ret, gbm.evals_result['valid_0']['l2'][-1], places=5) self.assertAlmostEqual(ret, gbm.evals_result['valid_0']['l2'][gbm.best_iteration - 1], places=5)
def test_multiclass(self): def test_multiclass(self):
X, y = load_digits(10, True) X, y = load_digits(10, True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
gbm = lgb.LGBMClassifier(n_estimators=50, silent=True) gbm = lgb.LGBMClassifier(n_estimators=50, silent=True)
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False) gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False)
ret = multi_error(y_test, gbm.predict(X_test)) ret = multi_error(y_test, gbm.predict(X_test))
self.assertLess(ret, 0.2) self.assertLess(ret, 0.2)
ret = multi_logloss(y_test, gbm.predict_proba(X_test)) ret = multi_logloss(y_test, gbm.predict_proba(X_test))
self.assertAlmostEqual(ret, gbm.evals_result['valid_0']['multi_logloss'][-1], places=5) self.assertAlmostEqual(ret, gbm.evals_result['valid_0']['multi_logloss'][gbm.best_iteration - 1], places=5)
def test_lambdarank(self): def test_lambdarank(self):
X_train, y_train = load_svmlight_file('../../examples/lambdarank/rank.train') X_train, y_train = load_svmlight_file('../../examples/lambdarank/rank.train')
...@@ -58,7 +58,7 @@ class TestSklearn(unittest.TestCase): ...@@ -58,7 +58,7 @@ class TestSklearn(unittest.TestCase):
q_test = np.loadtxt('../../examples/lambdarank/rank.test.query') q_test = np.loadtxt('../../examples/lambdarank/rank.test.query')
gbm = lgb.LGBMRanker() gbm = lgb.LGBMRanker()
gbm.fit(X_train, y_train, group=q_train, eval_set=[(X_test, y_test)], gbm.fit(X_train, y_train, group=q_train, eval_set=[(X_test, y_test)],
eval_group=[q_test], eval_at=[1, 3], verbose=False, eval_group=[q_test], eval_at=[1, 3], early_stopping_rounds=5, verbose=False,
callbacks=[lgb.reset_parameter(learning_rate=lambda x: 0.95 ** x * 0.1)]) callbacks=[lgb.reset_parameter(learning_rate=lambda x: 0.95 ** x * 0.1)])
def test_regression_with_custom_objective(self): def test_regression_with_custom_objective(self):
...@@ -69,10 +69,10 @@ class TestSklearn(unittest.TestCase): ...@@ -69,10 +69,10 @@ class TestSklearn(unittest.TestCase):
X, y = load_boston(True) X, y = load_boston(True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
gbm = lgb.LGBMRegressor(n_estimators=50, silent=True, objective=objective_ls) gbm = lgb.LGBMRegressor(n_estimators=50, silent=True, objective=objective_ls)
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False) gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False)
ret = mean_squared_error(y_test, gbm.predict(X_test)) ret = mean_squared_error(y_test, gbm.predict(X_test))
self.assertLess(ret, 100) self.assertLess(ret, 100)
self.assertAlmostEqual(ret, gbm.evals_result['valid_0']['l2'][-1], places=5) self.assertAlmostEqual(ret, gbm.evals_result['valid_0']['l2'][gbm.best_iteration - 1], places=5)
def test_binary_classification_with_custom_objective(self): def test_binary_classification_with_custom_objective(self):
def logregobj(y_true, y_pred): def logregobj(y_true, y_pred):
...@@ -86,7 +86,7 @@ class TestSklearn(unittest.TestCase): ...@@ -86,7 +86,7 @@ class TestSklearn(unittest.TestCase):
return np.mean([int(p > 0.5) != y for y, p in zip(y_test, y_pred)]) return np.mean([int(p > 0.5) != y for y, p in zip(y_test, y_pred)])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
gbm = lgb.LGBMClassifier(n_estimators=50, silent=True, objective=logregobj) gbm = lgb.LGBMClassifier(n_estimators=50, silent=True, objective=logregobj)
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False) gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False)
ret = binary_error(y_test, gbm.predict(X_test)) ret = binary_error(y_test, gbm.predict(X_test))
self.assertLess(ret, 0.1) self.assertLess(ret, 0.1)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment