Unverified commit 439c721a authored by Thomas J. Fan, committed by GitHub

[python-package] migrate test_sklearn.py to pytest (#3844)

* TST Migrates test_sklearn.py to pytest

* STY Fixes linting

* FIX Adds reason

* ENH Address comments
parent 113da3af
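
The change is mostly mechanical: each `TestCase` method becomes a module-level function, and every `self.assert*` call becomes a bare `assert` or a pytest helper. A minimal, standalone sketch of the recurring substitutions (illustrative code written for this summary, not taken from the PR):

    import unittest

    import numpy as np
    import pytest


    class TestBefore(unittest.TestCase):
        # unittest style: assertions are TestCase methods
        def test_values(self):
            self.assertLess(0.1, 0.12)
            self.assertAlmostEqual(0.1, 0.100001, places=5)
            self.assertRaises(AssertionError, np.testing.assert_allclose, [1.0], [2.0])


    def test_values():
        # pytest style: bare asserts in a module-level function
        assert 0.1 < 0.12
        # assertAlmostEqual(..., places=5) maps to an absolute tolerance of 1e-5
        assert 0.1 == pytest.approx(0.100001, abs=1e-5)
        # assertRaises maps to the pytest.raises context manager
        with pytest.raises(AssertionError):
            np.testing.assert_allclose([1.0], [2.0])

    # Other mappings used throughout this diff:
    #   @unittest.skipIf(cond, msg)   ->  @pytest.mark.skipif(cond, reason=msg)
    #   guarded 'import pandas as pd' ->  pd = pytest.importorskip("pandas")
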
@@ -3,7 +3,6 @@ import itertools
 import joblib
 import math
 import os
-import unittest
 import lightgbm as lgb
 import numpy as np
@@ -80,38 +79,39 @@ def multi_logloss(y_true, y_pred):
     return np.mean([-math.log(y_pred[i][y]) for i, y in enumerate(y_true)])
 
-class TestSklearn(unittest.TestCase):
-    def test_binary(self):
+def test_binary():
     X, y = load_breast_cancer(return_X_y=True)
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
     gbm = lgb.LGBMClassifier(n_estimators=50, silent=True)
     gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False)
     ret = log_loss(y_test, gbm.predict_proba(X_test))
-        self.assertLess(ret, 0.12)
-        self.assertAlmostEqual(ret, gbm.evals_result_['valid_0']['binary_logloss'][gbm.best_iteration_ - 1], places=5)
+    assert ret < 0.12
+    assert ret == pytest.approx(gbm.evals_result_['valid_0']['binary_logloss'][gbm.best_iteration_ - 1], abs=1e-5)
 
-    def test_regression(self):
+def test_regression():
     X, y = load_boston(return_X_y=True)
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
     gbm = lgb.LGBMRegressor(n_estimators=50, silent=True)
     gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False)
     ret = mean_squared_error(y_test, gbm.predict(X_test))
-        self.assertLess(ret, 7)
-        self.assertAlmostEqual(ret, gbm.evals_result_['valid_0']['l2'][gbm.best_iteration_ - 1], places=5)
+    assert ret < 7
+    assert ret == pytest.approx(gbm.evals_result_['valid_0']['l2'][gbm.best_iteration_ - 1], abs=1e-5)
 
-    def test_multiclass(self):
+def test_multiclass():
     X, y = load_digits(n_class=10, return_X_y=True)
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
     gbm = lgb.LGBMClassifier(n_estimators=50, silent=True)
     gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False)
     ret = multi_error(y_test, gbm.predict(X_test))
-        self.assertLess(ret, 0.05)
+    assert ret < 0.05
     ret = multi_logloss(y_test, gbm.predict_proba(X_test))
-        self.assertLess(ret, 0.16)
-        self.assertAlmostEqual(ret, gbm.evals_result_['valid_0']['multi_logloss'][gbm.best_iteration_ - 1], places=5)
+    assert ret < 0.16
+    assert ret == pytest.approx(gbm.evals_result_['valid_0']['multi_logloss'][gbm.best_iteration_ - 1], abs=1e-5)
 
-    def test_lambdarank(self):
+def test_lambdarank():
     X_train, y_train = load_svmlight_file(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                                        '../../examples/lambdarank/rank.train'))
     X_test, y_test = load_svmlight_file(os.path.join(os.path.dirname(os.path.realpath(__file__)),
@@ -124,11 +124,12 @@ class TestSklearn(unittest.TestCase):
     gbm.fit(X_train, y_train, group=q_train, eval_set=[(X_test, y_test)],
             eval_group=[q_test], eval_at=[1, 3], early_stopping_rounds=10, verbose=False,
             callbacks=[lgb.reset_parameter(learning_rate=lambda x: max(0.01, 0.1 - 0.01 * x))])
-        self.assertLessEqual(gbm.best_iteration_, 24)
-        self.assertGreater(gbm.best_score_['valid_0']['ndcg@1'], 0.5674)
-        self.assertGreater(gbm.best_score_['valid_0']['ndcg@3'], 0.578)
+    assert gbm.best_iteration_ <= 24
+    assert gbm.best_score_['valid_0']['ndcg@1'] > 0.5674
+    assert gbm.best_score_['valid_0']['ndcg@3'] > 0.578
 
-    def test_xendcg(self):
+def test_xendcg():
     dir_path = os.path.dirname(os.path.realpath(__file__))
     X_train, y_train = load_svmlight_file(os.path.join(dir_path, '../../examples/xendcg/rank.train'))
     X_test, y_test = load_svmlight_file(os.path.join(dir_path, '../../examples/xendcg/rank.test'))
@@ -139,43 +140,47 @@ class TestSklearn(unittest.TestCase):
             eval_group=[q_test], eval_at=[1, 3], early_stopping_rounds=10, verbose=False,
             eval_metric='ndcg',
             callbacks=[lgb.reset_parameter(learning_rate=lambda x: max(0.01, 0.1 - 0.01 * x))])
-        self.assertLessEqual(gbm.best_iteration_, 24)
-        self.assertGreater(gbm.best_score_['valid_0']['ndcg@1'], 0.6211)
-        self.assertGreater(gbm.best_score_['valid_0']['ndcg@3'], 0.6253)
+    assert gbm.best_iteration_ <= 24
+    assert gbm.best_score_['valid_0']['ndcg@1'] > 0.6211
+    assert gbm.best_score_['valid_0']['ndcg@3'] > 0.6253
 
-    def test_regression_with_custom_objective(self):
+def test_regression_with_custom_objective():
     X, y = load_boston(return_X_y=True)
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
     gbm = lgb.LGBMRegressor(n_estimators=50, silent=True, objective=objective_ls)
     gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False)
     ret = mean_squared_error(y_test, gbm.predict(X_test))
-        self.assertLess(ret, 7.0)
-        self.assertAlmostEqual(ret, gbm.evals_result_['valid_0']['l2'][gbm.best_iteration_ - 1], places=5)
+    assert ret < 7.0
+    assert ret == pytest.approx(gbm.evals_result_['valid_0']['l2'][gbm.best_iteration_ - 1], abs=1e-5)
 
-    def test_binary_classification_with_custom_objective(self):
+def test_binary_classification_with_custom_objective():
     X, y = load_digits(n_class=2, return_X_y=True)
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
     gbm = lgb.LGBMClassifier(n_estimators=50, silent=True, objective=logregobj)
     gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False)
     # prediction result is actually not transformed (is raw) due to custom objective
     y_pred_raw = gbm.predict_proba(X_test)
-        self.assertFalse(np.all(y_pred_raw >= 0))
+    assert not np.all(y_pred_raw >= 0)
     y_pred = 1.0 / (1.0 + np.exp(-y_pred_raw))
     ret = binary_error(y_test, y_pred)
-        self.assertLess(ret, 0.05)
+    assert ret < 0.05
 
-    def test_dart(self):
+def test_dart():
     X, y = load_boston(return_X_y=True)
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
     gbm = lgb.LGBMRegressor(boosting_type='dart', n_estimators=50)
     gbm.fit(X_train, y_train)
     score = gbm.score(X_test, y_test)
-        self.assertGreaterEqual(score, 0.8)
-        self.assertLessEqual(score, 1.)
+    assert score >= 0.8
+    assert score <= 1.
 
 # sklearn <0.23 does not have a stacking classifier and n_features_in_ property
-    @unittest.skipIf(sk_version < parse_version("0.23"), 'scikit-learn version is less than 0.23')
-    def test_stacking_classifier(self):
+@pytest.mark.skipif(sk_version < parse_version("0.23"), reason='scikit-learn version is less than 0.23')
+def test_stacking_classifier():
     from sklearn.ensemble import StackingClassifier
     X, y = load_iris(return_X_y=True)
@@ -187,22 +192,22 @@ class TestSklearn(unittest.TestCase):
                              passthrough=True)
     clf.fit(X_train, y_train)
     score = clf.score(X_test, y_test)
-        self.assertGreaterEqual(score, 0.8)
-        self.assertLessEqual(score, 1.)
-        self.assertEqual(clf.n_features_in_, 4)  # number of input features
-        self.assertEqual(len(clf.named_estimators_['gbm1'].feature_importances_), 4)
-        self.assertEqual(clf.named_estimators_['gbm1'].n_features_in_,
-                         clf.named_estimators_['gbm2'].n_features_in_)
-        self.assertEqual(clf.final_estimator_.n_features_in_, 10)  # number of concatenated features
-        self.assertEqual(len(clf.final_estimator_.feature_importances_), 10)
+    assert score >= 0.8
+    assert score <= 1.
+    assert clf.n_features_in_ == 4  # number of input features
+    assert len(clf.named_estimators_['gbm1'].feature_importances_) == 4
+    assert clf.named_estimators_['gbm1'].n_features_in_ == clf.named_estimators_['gbm2'].n_features_in_
+    assert clf.final_estimator_.n_features_in_ == 10  # number of concatenated features
+    assert len(clf.final_estimator_.feature_importances_) == 10
     classes = clf.named_estimators_['gbm1'].classes_ == clf.named_estimators_['gbm2'].classes_
-        self.assertTrue(all(classes))
+    assert all(classes)
     classes = clf.classes_ == clf.named_estimators_['gbm1'].classes_
-        self.assertTrue(all(classes))
+    assert all(classes)
 
 # sklearn <0.23 does not have a stacking regressor and n_features_in_ property
-    @unittest.skipIf(sk_version < parse_version('0.23'), 'scikit-learn version is less than 0.23')
-    def test_stacking_regressor(self):
+@pytest.mark.skipif(sk_version < parse_version('0.23'), reason='scikit-learn version is less than 0.23')
+def test_stacking_regressor():
     from sklearn.ensemble import StackingRegressor
     X, y = load_boston(return_X_y=True)
@@ -214,16 +219,16 @@ class TestSklearn(unittest.TestCase):
                             passthrough=True)
     reg.fit(X_train, y_train)
     score = reg.score(X_test, y_test)
-        self.assertGreaterEqual(score, 0.2)
-        self.assertLessEqual(score, 1.)
-        self.assertEqual(reg.n_features_in_, 13)  # number of input features
-        self.assertEqual(len(reg.named_estimators_['gbm1'].feature_importances_), 13)
-        self.assertEqual(reg.named_estimators_['gbm1'].n_features_in_,
-                         reg.named_estimators_['gbm2'].n_features_in_)
-        self.assertEqual(reg.final_estimator_.n_features_in_, 15)  # number of concatenated features
-        self.assertEqual(len(reg.final_estimator_.feature_importances_), 15)
+    assert score >= 0.2
+    assert score <= 1.
+    assert reg.n_features_in_ == 13  # number of input features
+    assert len(reg.named_estimators_['gbm1'].feature_importances_) == 13
+    assert reg.named_estimators_['gbm1'].n_features_in_ == reg.named_estimators_['gbm2'].n_features_in_
+    assert reg.final_estimator_.n_features_in_ == 15  # number of concatenated features
+    assert len(reg.final_estimator_.feature_importances_) == 15
 
-    def test_grid_search(self):
+def test_grid_search():
     X, y = load_iris(return_X_y=True)
     y = y.astype(str)  # utilize label encoder at it's max power
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,
@@ -243,17 +248,18 @@ class TestSklearn(unittest.TestCase):
                         cv=2)
     grid.fit(X_train, y_train, **fit_params)
     score = grid.score(X_test, y_test)  # utilizes GridSearchCV default refit=True
-        self.assertIn(grid.best_params_['boosting_type'], ['rf', 'gbdt'])
-        self.assertIn(grid.best_params_['n_estimators'], [4, 6])
-        self.assertIn(grid.best_params_['reg_alpha'], [0.01, 0.005])
-        self.assertLessEqual(grid.best_score_, 1.)
-        self.assertEqual(grid.best_estimator_.best_iteration_, 1)
-        self.assertLess(grid.best_estimator_.best_score_['valid_0']['multi_logloss'], 0.25)
-        self.assertEqual(grid.best_estimator_.best_score_['valid_0']['error'], 0)
-        self.assertGreaterEqual(score, 0.2)
-        self.assertLessEqual(score, 1.)
+    assert grid.best_params_['boosting_type'] in ['rf', 'gbdt']
+    assert grid.best_params_['n_estimators'] in [4, 6]
+    assert grid.best_params_['reg_alpha'] in [0.01, 0.005]
+    assert grid.best_score_ <= 1.
+    assert grid.best_estimator_.best_iteration_ == 1
+    assert grid.best_estimator_.best_score_['valid_0']['multi_logloss'] < 0.25
+    assert grid.best_estimator_.best_score_['valid_0']['error'] == 0
+    assert score >= 0.2
+    assert score <= 1.
 
-    def test_random_search(self):
+def test_random_search():
     X, y = load_iris(return_X_y=True)
     y = y.astype(str)  # utilize label encoder at it's max power
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,
@@ -275,19 +281,20 @@ class TestSklearn(unittest.TestCase):
                               n_iter=n_iter, random_state=42)
     rand.fit(X_train, y_train, **fit_params)
     score = rand.score(X_test, y_test)  # utilizes RandomizedSearchCV default refit=True
-        self.assertIn(rand.best_params_['boosting_type'], ['rf', 'gbdt'])
-        self.assertIn(rand.best_params_['n_estimators'], list(range(3, 10)))
-        self.assertGreaterEqual(rand.best_params_['reg_alpha'], 0.01)  # Left-closed boundary point
-        self.assertLessEqual(rand.best_params_['reg_alpha'], 0.06)  # Right-closed boundary point
-        self.assertLessEqual(rand.best_score_, 1.)
-        self.assertLess(rand.best_estimator_.best_score_['valid_0']['multi_logloss'], 0.25)
-        self.assertEqual(rand.best_estimator_.best_score_['valid_0']['error'], 0)
-        self.assertGreaterEqual(score, 0.2)
-        self.assertLessEqual(score, 1.)
+    assert rand.best_params_['boosting_type'] in ['rf', 'gbdt']
+    assert rand.best_params_['n_estimators'] in list(range(3, 10))
+    assert rand.best_params_['reg_alpha'] >= 0.01  # Left-closed boundary point
+    assert rand.best_params_['reg_alpha'] <= 0.06  # Right-closed boundary point
+    assert rand.best_score_ <= 1.
+    assert rand.best_estimator_.best_score_['valid_0']['multi_logloss'] < 0.25
+    assert rand.best_estimator_.best_score_['valid_0']['error'] == 0
+    assert score >= 0.2
+    assert score <= 1.
 
 # sklearn < 0.22 does not have the post fit attribute: classes_
-    @unittest.skipIf(sk_version < parse_version('0.22'), 'scikit-learn version is less than 0.22')
-    def test_multioutput_classifier(self):
+@pytest.mark.skipif(sk_version < parse_version('0.22'), reason='scikit-learn version is less than 0.22')
+def test_multioutput_classifier():
     n_outputs = 3
     X, y = make_multilabel_classification(n_samples=100, n_features=20,
                                           n_classes=n_outputs, random_state=0)
@@ -297,17 +304,18 @@ class TestSklearn(unittest.TestCase):
     clf = MultiOutputClassifier(estimator=lgb.LGBMClassifier(n_estimators=10))
     clf.fit(X_train, y_train)
     score = clf.score(X_test, y_test)
-        self.assertGreaterEqual(score, 0.2)
-        self.assertLessEqual(score, 1.)
+    assert score >= 0.2
+    assert score <= 1.
     np.testing.assert_array_equal(np.tile(np.unique(y_train), n_outputs),
                                   np.concatenate(clf.classes_))
     for classifier in clf.estimators_:
-            self.assertIsInstance(classifier, lgb.LGBMClassifier)
-            self.assertIsInstance(classifier.booster_, lgb.Booster)
+        assert isinstance(classifier, lgb.LGBMClassifier)
+        assert isinstance(classifier.booster_, lgb.Booster)
 
 # sklearn < 0.23 does not have as_frame parameter
-    @unittest.skipIf(sk_version < parse_version('0.23'), 'scikit-learn version is less than 0.23')
-    def test_multioutput_regressor(self):
+@pytest.mark.skipif(sk_version < parse_version('0.23'), reason='scikit-learn version is less than 0.23')
+def test_multioutput_regressor():
     bunch = load_linnerud(as_frame=True)  # returns a Bunch instance
     X, y = bunch['data'], bunch['target']
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,
@@ -316,15 +324,16 @@ class TestSklearn(unittest.TestCase):
     reg.fit(X_train, y_train)
     y_pred = reg.predict(X_test)
     _, score, _ = mse(y_test, y_pred)
-        self.assertGreaterEqual(score, 0.2)
-        self.assertLessEqual(score, 120.)
+    assert score >= 0.2
+    assert score <= 120.
     for regressor in reg.estimators_:
-            self.assertIsInstance(regressor, lgb.LGBMRegressor)
-            self.assertIsInstance(regressor.booster_, lgb.Booster)
+        assert isinstance(regressor, lgb.LGBMRegressor)
+        assert isinstance(regressor.booster_, lgb.Booster)
 
 # sklearn < 0.22 does not have the post fit attribute: classes_
-    @unittest.skipIf(sk_version < parse_version('0.22'), 'scikit-learn version is less than 0.22')
-    def test_classifier_chain(self):
+@pytest.mark.skipif(sk_version < parse_version('0.22'), reason='scikit-learn version is less than 0.22')
+def test_classifier_chain():
     n_outputs = 3
     X, y = make_multilabel_classification(n_samples=100, n_features=20,
                                           n_classes=n_outputs, random_state=0)
@@ -335,18 +344,19 @@ class TestSklearn(unittest.TestCase):
                           order=order, random_state=42)
     clf.fit(X_train, y_train)
     score = clf.score(X_test, y_test)
-        self.assertGreaterEqual(score, 0.2)
-        self.assertLessEqual(score, 1.)
+    assert score >= 0.2
+    assert score <= 1.
     np.testing.assert_array_equal(np.tile(np.unique(y_train), n_outputs),
                                   np.concatenate(clf.classes_))
-        self.assertListEqual(order, clf.order_)
+    assert order == clf.order_
     for classifier in clf.estimators_:
-            self.assertIsInstance(classifier, lgb.LGBMClassifier)
-            self.assertIsInstance(classifier.booster_, lgb.Booster)
+        assert isinstance(classifier, lgb.LGBMClassifier)
+        assert isinstance(classifier.booster_, lgb.Booster)
 
 # sklearn < 0.23 does not have as_frame parameter
-    @unittest.skipIf(sk_version < parse_version('0.23'), 'scikit-learn version is less than 0.23')
-    def test_regressor_chain(self):
+@pytest.mark.skipif(sk_version < parse_version('0.23'), reason='scikit-learn version is less than 0.23')
+def test_regressor_chain():
     bunch = load_linnerud(as_frame=True)  # returns a Bunch instance
     X, y = bunch['data'], bunch['target']
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
@@ -356,31 +366,33 @@ class TestSklearn(unittest.TestCase):
     reg.fit(X_train, y_train)
     y_pred = reg.predict(X_test)
     _, score, _ = mse(y_test, y_pred)
-        self.assertGreaterEqual(score, 0.2)
-        self.assertLessEqual(score, 120.)
-        self.assertListEqual(order, reg.order_)
+    assert score >= 0.2
+    assert score <= 120.
+    assert order == reg.order_
     for regressor in reg.estimators_:
-            self.assertIsInstance(regressor, lgb.LGBMRegressor)
-            self.assertIsInstance(regressor.booster_, lgb.Booster)
+        assert isinstance(regressor, lgb.LGBMRegressor)
+        assert isinstance(regressor.booster_, lgb.Booster)
 
-    def test_clone_and_property(self):
+def test_clone_and_property():
     X, y = load_boston(return_X_y=True)
     gbm = lgb.LGBMRegressor(n_estimators=10, silent=True)
     gbm.fit(X, y, verbose=False)
     gbm_clone = clone(gbm)
-        self.assertIsInstance(gbm.booster_, lgb.Booster)
-        self.assertIsInstance(gbm.feature_importances_, np.ndarray)
+    assert isinstance(gbm.booster_, lgb.Booster)
+    assert isinstance(gbm.feature_importances_, np.ndarray)
     X, y = load_digits(n_class=2, return_X_y=True)
     clf = lgb.LGBMClassifier(n_estimators=10, silent=True)
     clf.fit(X, y, verbose=False)
-        self.assertListEqual(sorted(clf.classes_), [0, 1])
-        self.assertEqual(clf.n_classes_, 2)
-        self.assertIsInstance(clf.booster_, lgb.Booster)
-        self.assertIsInstance(clf.feature_importances_, np.ndarray)
+    assert sorted(clf.classes_) == [0, 1]
+    assert clf.n_classes_ == 2
+    assert isinstance(clf.booster_, lgb.Booster)
+    assert isinstance(clf.feature_importances_, np.ndarray)
 
-    def test_joblib(self):
+def test_joblib():
     X, y = load_boston(return_X_y=True)
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
     gbm = lgb.LGBMRegressor(n_estimators=10, objective=custom_asymmetric_obj,
@@ -391,11 +403,11 @@ class TestSklearn(unittest.TestCase):
     joblib.dump(gbm, 'lgb.pkl')  # test model with custom functions
     gbm_pickle = joblib.load('lgb.pkl')
-        self.assertIsInstance(gbm_pickle.booster_, lgb.Booster)
-        self.assertDictEqual(gbm.get_params(), gbm_pickle.get_params())
+    assert isinstance(gbm_pickle.booster_, lgb.Booster)
+    assert gbm.get_params() == gbm_pickle.get_params()
     np.testing.assert_array_equal(gbm.feature_importances_, gbm_pickle.feature_importances_)
-        self.assertAlmostEqual(gbm_pickle.learning_rate, 0.1)
-        self.assertTrue(callable(gbm_pickle.objective))
+    assert gbm_pickle.learning_rate == pytest.approx(0.1)
+    assert callable(gbm_pickle.objective)
     for eval_set in gbm.evals_result_:
         for metric in gbm.evals_result_[eval_set]:
@@ -405,7 +417,8 @@ class TestSklearn(unittest.TestCase):
     pred_pickle = gbm_pickle.predict(X_test)
     np.testing.assert_allclose(pred_origin, pred_pickle)
 
-    def test_random_state_object(self):
+def test_random_state_object():
     X, y = load_iris(return_X_y=True)
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
     state1 = np.random.RandomState(123)
@@ -413,8 +426,8 @@ class TestSklearn(unittest.TestCase):
     clf1 = lgb.LGBMClassifier(n_estimators=10, subsample=0.5, subsample_freq=1, random_state=state1)
     clf2 = lgb.LGBMClassifier(n_estimators=10, subsample=0.5, subsample_freq=1, random_state=state2)
     # Test if random_state is properly stored
-        self.assertIs(clf1.random_state, state1)
-        self.assertIs(clf2.random_state, state2)
+    assert clf1.random_state is state1
+    assert clf2.random_state is state2
     # Test if two random states produce identical models
     clf1.fit(X_train, y_train)
     clf2.fit(X_train, y_train)
@@ -424,28 +437,27 @@ class TestSklearn(unittest.TestCase):
     np.testing.assert_array_equal(clf1.feature_importances_, clf2.feature_importances_)
     df1 = clf1.booster_.model_to_string(num_iteration=0)
     df2 = clf2.booster_.model_to_string(num_iteration=0)
-        self.assertMultiLineEqual(df1, df2)
+    assert df1 == df2
     # Test if subsequent fits sample from random_state object and produce different models
     clf1.fit(X_train, y_train)
     y_pred1_refit = clf1.predict(X_test, raw_score=True)
     df3 = clf1.booster_.model_to_string(num_iteration=0)
-        self.assertIs(clf1.random_state, state1)
-        self.assertIs(clf2.random_state, state2)
-        self.assertRaises(AssertionError,
-                          np.testing.assert_allclose,
-                          y_pred1, y_pred1_refit)
-        self.assertRaises(AssertionError,
-                          self.assertMultiLineEqual,
-                          df1, df3)
+    assert clf1.random_state is state1
+    assert clf2.random_state is state2
+    with pytest.raises(AssertionError):
+        np.testing.assert_allclose(y_pred1, y_pred1_refit)
+    assert df1 != df3
 
-    def test_feature_importances_single_leaf(self):
+def test_feature_importances_single_leaf():
     data = load_iris(return_X_y=False)
     clf = lgb.LGBMClassifier(n_estimators=10)
     clf.fit(data.data, data.target)
     importances = clf.feature_importances_
-        self.assertEqual(len(importances), 4)
+    assert len(importances) == 4
 
-    def test_feature_importances_type(self):
+def test_feature_importances_type():
     data = load_iris(return_X_y=False)
     clf = lgb.LGBMClassifier(n_estimators=10)
     clf.fit(data.data, data.target)
@@ -456,11 +468,11 @@ class TestSklearn(unittest.TestCase):
     # Test that the largest element is NOT the same, the smallest can be the same, i.e. zero
     importance_split_top1 = sorted(importances_split, reverse=True)[0]
     importance_gain_top1 = sorted(importances_gain, reverse=True)[0]
-        self.assertNotEqual(importance_split_top1, importance_gain_top1)
+    assert importance_split_top1 != importance_gain_top1
 
-    @unittest.skipIf(not lgb.compat.PANDAS_INSTALLED, 'pandas is not installed')
-    def test_pandas_categorical(self):
-        import pandas as pd
+def test_pandas_categorical():
+    pd = pytest.importorskip("pandas")
     np.random.seed(42)  # sometimes there is no difference how cols are treated (cat or not cat)
     X = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'c', 'd'] * 75),  # str
                       "B": np.random.permutation([1, 2, 3] * 100),  # int
@@ -497,32 +509,28 @@ class TestSklearn(unittest.TestCase):
     pred5 = gbm5.predict(X_test, raw_score=True)
     gbm6 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y, categorical_feature=[])
     pred6 = gbm6.predict(X_test, raw_score=True)
-        self.assertRaises(AssertionError,
-                          np.testing.assert_allclose,
-                          pred0, pred1)
-        self.assertRaises(AssertionError,
-                          np.testing.assert_allclose,
-                          pred0, pred2)
+    with pytest.raises(AssertionError):
+        np.testing.assert_allclose(pred0, pred1)
+    with pytest.raises(AssertionError):
+        np.testing.assert_allclose(pred0, pred2)
     np.testing.assert_allclose(pred1, pred2)
     np.testing.assert_allclose(pred0, pred3)
     np.testing.assert_allclose(pred_prob, pred4)
-        self.assertRaises(AssertionError,
-                          np.testing.assert_allclose,
-                          pred0, pred5)  # ordered cat features aren't treated as cat features by default
-        self.assertRaises(AssertionError,
-                          np.testing.assert_allclose,
-                          pred0, pred6)
-        self.assertListEqual(gbm0.booster_.pandas_categorical, cat_values)
-        self.assertListEqual(gbm1.booster_.pandas_categorical, cat_values)
-        self.assertListEqual(gbm2.booster_.pandas_categorical, cat_values)
-        self.assertListEqual(gbm3.booster_.pandas_categorical, cat_values)
-        self.assertListEqual(gbm4.pandas_categorical, cat_values)
-        self.assertListEqual(gbm5.booster_.pandas_categorical, cat_values)
-        self.assertListEqual(gbm6.booster_.pandas_categorical, cat_values)
+    with pytest.raises(AssertionError):
+        np.testing.assert_allclose(pred0, pred5)  # ordered cat features aren't treated as cat features by default
+    with pytest.raises(AssertionError):
+        np.testing.assert_allclose(pred0, pred6)
+    assert gbm0.booster_.pandas_categorical == cat_values
+    assert gbm1.booster_.pandas_categorical == cat_values
+    assert gbm2.booster_.pandas_categorical == cat_values
+    assert gbm3.booster_.pandas_categorical == cat_values
+    assert gbm4.pandas_categorical == cat_values
+    assert gbm5.booster_.pandas_categorical == cat_values
+    assert gbm6.booster_.pandas_categorical == cat_values
 
-    @unittest.skipIf(not lgb.compat.PANDAS_INSTALLED, 'pandas is not installed')
-    def test_pandas_sparse(self):
-        import pandas as pd
+def test_pandas_sparse():
+    pd = pytest.importorskip("pandas")
     try:
         from pandas.arrays import SparseArray
     except ImportError:  # support old versions
@@ -536,7 +544,7 @@ class TestSklearn(unittest.TestCase):
                       "C": SparseArray(np.random.permutation([True, False] * 30))})
     if pd.__version__ >= '0.24.0':
         for dtype in pd.concat([X.dtypes, X_test.dtypes, pd.Series(y.dtypes)]):
-                self.assertTrue(pd.api.types.is_sparse(dtype))
+            assert pd.api.types.is_sparse(dtype)
     gbm = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y)
     pred_sparse = gbm.predict(X_test, raw_score=True)
     if hasattr(X_test, 'sparse'):
@@ -545,7 +553,8 @@ class TestSklearn(unittest.TestCase):
         pred_dense = gbm.predict(X_test.to_dense(), raw_score=True)
     np.testing.assert_allclose(pred_sparse, pred_dense)
 
-    def test_predict(self):
+def test_predict():
     # With default params
     iris = load_iris(return_X_y=False)
     X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target,
@@ -587,9 +596,8 @@ class TestSklearn(unittest.TestCase):
     res_sklearn_params = clf.predict_proba(X_test,
                                            pred_early_stop=True,
                                            pred_early_stop_margin=1.0)
-        self.assertRaises(AssertionError,
-                          np.testing.assert_allclose,
-                          res_engine, res_sklearn_params)
+    with pytest.raises(AssertionError):
+        np.testing.assert_allclose(res_engine, res_sklearn_params)
     # Tests start_iteration
     # Tests same probabilities, starting from iteration 10
@@ -622,24 +630,25 @@ class TestSklearn(unittest.TestCase):
     res_sklearn_params = clf.predict_proba(X_test,
                                            pred_early_stop=True,
                                            pred_early_stop_margin=1.0, start_iteration=10)
-        self.assertRaises(AssertionError,
-                          np.testing.assert_allclose,
-                          res_engine, res_sklearn_params)
+    with pytest.raises(AssertionError):
+        np.testing.assert_allclose(res_engine, res_sklearn_params)
 
-    def test_evaluate_train_set(self):
+def test_evaluate_train_set():
     X, y = load_boston(return_X_y=True)
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
     gbm = lgb.LGBMRegressor(n_estimators=10, silent=True)
     gbm.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=False)
-        self.assertEqual(len(gbm.evals_result_), 2)
-        self.assertIn('training', gbm.evals_result_)
-        self.assertEqual(len(gbm.evals_result_['training']), 1)
-        self.assertIn('l2', gbm.evals_result_['training'])
-        self.assertIn('valid_1', gbm.evals_result_)
-        self.assertEqual(len(gbm.evals_result_['valid_1']), 1)
-        self.assertIn('l2', gbm.evals_result_['valid_1'])
+    assert len(gbm.evals_result_) == 2
+    assert 'training' in gbm.evals_result_
+    assert len(gbm.evals_result_['training']) == 1
+    assert 'l2' in gbm.evals_result_['training']
+    assert 'valid_1' in gbm.evals_result_
+    assert len(gbm.evals_result_['valid_1']) == 1
+    assert 'l2' in gbm.evals_result_['valid_1']
 
-    def test_metrics(self):
+def test_metrics():
     X, y = load_boston(return_X_y=True)
     params = {'n_estimators': 2, 'verbose': -1}
     params_fit = {'X': X, 'y': y, 'eval_set': (X, y), 'verbose': False}
@@ -647,37 +656,37 @@ class TestSklearn(unittest.TestCase):
     # no custom objective, no custom metric
     # default metric
     gbm = lgb.LGBMRegressor(**params).fit(**params_fit)
-        self.assertEqual(len(gbm.evals_result_['training']), 1)
-        self.assertIn('l2', gbm.evals_result_['training'])
+    assert len(gbm.evals_result_['training']) == 1
+    assert 'l2' in gbm.evals_result_['training']
     # non-default metric
     gbm = lgb.LGBMRegressor(metric='mape', **params).fit(**params_fit)
-        self.assertEqual(len(gbm.evals_result_['training']), 1)
-        self.assertIn('mape', gbm.evals_result_['training'])
+    assert len(gbm.evals_result_['training']) == 1
+    assert 'mape' in gbm.evals_result_['training']
     # no metric
     gbm = lgb.LGBMRegressor(metric='None', **params).fit(**params_fit)
-        self.assertIs(gbm.evals_result_, None)
+    assert gbm.evals_result_ is None
     # non-default metric in eval_metric
     gbm = lgb.LGBMRegressor(**params).fit(eval_metric='mape', **params_fit)
-        self.assertEqual(len(gbm.evals_result_['training']), 2)
-        self.assertIn('l2', gbm.evals_result_['training'])
-        self.assertIn('mape', gbm.evals_result_['training'])
+    assert len(gbm.evals_result_['training']) == 2
+    assert 'l2' in gbm.evals_result_['training']
+    assert 'mape' in gbm.evals_result_['training']
     # non-default metric with non-default metric in eval_metric
    gbm = lgb.LGBMRegressor(metric='gamma', **params).fit(eval_metric='mape', **params_fit)
-        self.assertEqual(len(gbm.evals_result_['training']), 2)
-        self.assertIn('gamma', gbm.evals_result_['training'])
-        self.assertIn('mape', gbm.evals_result_['training'])
+    assert len(gbm.evals_result_['training']) == 2
+    assert 'gamma' in gbm.evals_result_['training']
+    assert 'mape' in gbm.evals_result_['training']
     # non-default metric with multiple metrics in eval_metric
     gbm = lgb.LGBMRegressor(metric='gamma',
                             **params).fit(eval_metric=['l2', 'mape'], **params_fit)
-        self.assertEqual(len(gbm.evals_result_['training']), 3)
-        self.assertIn('gamma', gbm.evals_result_['training'])
-        self.assertIn('l2', gbm.evals_result_['training'])
-        self.assertIn('mape', gbm.evals_result_['training'])
+    assert len(gbm.evals_result_['training']) == 3
+    assert 'gamma' in gbm.evals_result_['training']
+    assert 'l2' in gbm.evals_result_['training']
+    assert 'mape' in gbm.evals_result_['training']
     # non-default metric with multiple metrics in eval_metric for LGBMClassifier
     X_classification, y_classification = load_breast_cancer(return_X_y=True)
@@ -688,224 +697,225 @@ class TestSklearn(unittest.TestCase):
                                   'verbose': False}
     gbm = lgb.LGBMClassifier(**params_classification).fit(eval_metric=['fair', 'error'],
                                                           **params_fit_classification)
-        self.assertEqual(len(gbm.evals_result_['training']), 3)
-        self.assertIn('fair', gbm.evals_result_['training'])
-        self.assertIn('binary_error', gbm.evals_result_['training'])
-        self.assertIn('binary_logloss', gbm.evals_result_['training'])
+    assert len(gbm.evals_result_['training']) == 3
+    assert 'fair' in gbm.evals_result_['training']
+    assert 'binary_error' in gbm.evals_result_['training']
+    assert 'binary_logloss' in gbm.evals_result_['training']
     # default metric for non-default objective
     gbm = lgb.LGBMRegressor(objective='regression_l1', **params).fit(**params_fit)
-        self.assertEqual(len(gbm.evals_result_['training']), 1)
-        self.assertIn('l1', gbm.evals_result_['training'])
+    assert len(gbm.evals_result_['training']) == 1
+    assert 'l1' in gbm.evals_result_['training']
     # non-default metric for non-default objective
     gbm = lgb.LGBMRegressor(objective='regression_l1', metric='mape',
                             **params).fit(**params_fit)
-        self.assertEqual(len(gbm.evals_result_['training']), 1)
-        self.assertIn('mape', gbm.evals_result_['training'])
+    assert len(gbm.evals_result_['training']) == 1
+    assert 'mape' in gbm.evals_result_['training']
     # no metric
     gbm = lgb.LGBMRegressor(objective='regression_l1', metric='None',
                             **params).fit(**params_fit)
-        self.assertIs(gbm.evals_result_, None)
+    assert gbm.evals_result_ is None
     # non-default metric in eval_metric for non-default objective
     gbm = lgb.LGBMRegressor(objective='regression_l1',
                             **params).fit(eval_metric='mape', **params_fit)
-        self.assertEqual(len(gbm.evals_result_['training']), 2)
-        self.assertIn('l1', gbm.evals_result_['training'])
-        self.assertIn('mape', gbm.evals_result_['training'])
+    assert len(gbm.evals_result_['training']) == 2
+    assert 'l1' in gbm.evals_result_['training']
+    assert 'mape' in gbm.evals_result_['training']
     # non-default metric with non-default metric in eval_metric for non-default objective
     gbm = lgb.LGBMRegressor(objective='regression_l1', metric='gamma',
                             **params).fit(eval_metric='mape', **params_fit)
-        self.assertEqual(len(gbm.evals_result_['training']), 2)
-        self.assertIn('gamma', gbm.evals_result_['training'])
-        self.assertIn('mape', gbm.evals_result_['training'])
+    assert len(gbm.evals_result_['training']) == 2
+    assert 'gamma' in gbm.evals_result_['training']
+    assert 'mape' in gbm.evals_result_['training']
     # non-default metric with multiple metrics in eval_metric for non-default objective
     gbm = lgb.LGBMRegressor(objective='regression_l1', metric='gamma',
                             **params).fit(eval_metric=['l2', 'mape'], **params_fit)
-        self.assertEqual(len(gbm.evals_result_['training']), 3)
-        self.assertIn('gamma', gbm.evals_result_['training'])
-        self.assertIn('l2', gbm.evals_result_['training'])
-        self.assertIn('mape', gbm.evals_result_['training'])
+    assert len(gbm.evals_result_['training']) == 3
+    assert 'gamma' in gbm.evals_result_['training']
+    assert 'l2' in gbm.evals_result_['training']
+    assert 'mape' in gbm.evals_result_['training']
     # custom objective, no custom metric
     # default regression metric for custom objective
     gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, **params).fit(**params_fit)
-        self.assertEqual(len(gbm.evals_result_['training']), 1)
-        self.assertIn('l2', gbm.evals_result_['training'])
+    assert len(gbm.evals_result_['training']) == 1
+    assert 'l2' in gbm.evals_result_['training']
     # non-default regression metric for custom objective
     gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric='mape', **params).fit(**params_fit)
-        self.assertEqual(len(gbm.evals_result_['training']), 1)
-        self.assertIn('mape', gbm.evals_result_['training'])
+    assert len(gbm.evals_result_['training']) == 1
+    assert 'mape' in gbm.evals_result_['training']
     # multiple regression metrics for custom objective
     gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric=['l1', 'gamma'],
                             **params).fit(**params_fit)
-        self.assertEqual(len(gbm.evals_result_['training']), 2)
-        self.assertIn('l1', gbm.evals_result_['training'])
-        self.assertIn('gamma', gbm.evals_result_['training'])
+    assert len(gbm.evals_result_['training']) == 2
+    assert 'l1' in gbm.evals_result_['training']
+    assert 'gamma' in gbm.evals_result_['training']
     # no metric
     gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric='None',
                             **params).fit(**params_fit)
-        self.assertIs(gbm.evals_result_, None)
+    assert gbm.evals_result_ is None
     # default regression metric with non-default metric in eval_metric for custom objective
     gbm = lgb.LGBMRegressor(objective=custom_dummy_obj,
                             **params).fit(eval_metric='mape', **params_fit)
-        self.assertEqual(len(gbm.evals_result_['training']), 2)
-        self.assertIn('l2', gbm.evals_result_['training'])
-        self.assertIn('mape', gbm.evals_result_['training'])
+    assert len(gbm.evals_result_['training']) == 2
+    assert 'l2' in gbm.evals_result_['training']
+    assert 'mape' in gbm.evals_result_['training']
     # non-default regression metric with metric in eval_metric for custom objective
     gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric='mape',
                             **params).fit(eval_metric='gamma', **params_fit)
-        self.assertEqual(len(gbm.evals_result_['training']), 2)
-        self.assertIn('mape', gbm.evals_result_['training'])
-        self.assertIn('gamma', gbm.evals_result_['training'])
+    assert len(gbm.evals_result_['training']) == 2
+    assert 'mape' in gbm.evals_result_['training']
+    assert 'gamma' in gbm.evals_result_['training']
     # multiple regression metrics with metric in eval_metric for custom objective
     gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric=['l1', 'gamma'],
                             **params).fit(eval_metric='l2', **params_fit)
-        self.assertEqual(len(gbm.evals_result_['training']), 3)
-        self.assertIn('l1', gbm.evals_result_['training'])
-        self.assertIn('gamma', gbm.evals_result_['training'])
-        self.assertIn('l2', gbm.evals_result_['training'])
+    assert len(gbm.evals_result_['training']) == 3
+    assert 'l1' in gbm.evals_result_['training']
+    assert 'gamma' in gbm.evals_result_['training']
+    assert 'l2' in gbm.evals_result_['training']
     # multiple regression metrics with multiple metrics in eval_metric for custom objective
     gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric=['l1', 'gamma'],
                             **params).fit(eval_metric=['l2', 'mape'], **params_fit)
-        self.assertEqual(len(gbm.evals_result_['training']), 4)
-        self.assertIn('l1', gbm.evals_result_['training'])
-        self.assertIn('gamma', gbm.evals_result_['training'])
-        self.assertIn('l2', gbm.evals_result_['training'])
-        self.assertIn('mape', gbm.evals_result_['training'])
+    assert len(gbm.evals_result_['training']) == 4
+    assert 'l1' in gbm.evals_result_['training']
+    assert 'gamma' in gbm.evals_result_['training']
+    assert 'l2' in gbm.evals_result_['training']
+    assert 'mape' in gbm.evals_result_['training']
     # no custom objective, custom metric
     # default metric with custom metric
     gbm = lgb.LGBMRegressor(**params).fit(eval_metric=constant_metric, **params_fit)
-        self.assertEqual(len(gbm.evals_result_['training']), 2)
-        self.assertIn('l2', gbm.evals_result_['training'])
-        self.assertIn('error', gbm.evals_result_['training'])
+    assert len(gbm.evals_result_['training']) == 2
+    assert 'l2' in gbm.evals_result_['training']
+    assert 'error' in gbm.evals_result_['training']
     # non-default metric with custom metric
     gbm = lgb.LGBMRegressor(metric='mape',
                             **params).fit(eval_metric=constant_metric, **params_fit)
-        self.assertEqual(len(gbm.evals_result_['training']), 2)
-        self.assertIn('mape', gbm.evals_result_['training'])
-        self.assertIn('error', gbm.evals_result_['training'])
+    assert len(gbm.evals_result_['training']) == 2
+    assert 'mape' in gbm.evals_result_['training']
+    assert 'error' in gbm.evals_result_['training']
     # multiple metrics with custom metric
     gbm = lgb.LGBMRegressor(metric=['l1', 'gamma'],
                             **params).fit(eval_metric=constant_metric, **params_fit)
-        self.assertEqual(len(gbm.evals_result_['training']), 3)
-        self.assertIn('l1', gbm.evals_result_['training'])
-        self.assertIn('gamma', gbm.evals_result_['training'])
-        self.assertIn('error', gbm.evals_result_['training'])
+    assert len(gbm.evals_result_['training']) == 3
+    assert 'l1' in gbm.evals_result_['training']
+    assert 'gamma' in gbm.evals_result_['training']
+    assert 'error' in gbm.evals_result_['training']
     # custom metric (disable default metric)
     gbm = lgb.LGBMRegressor(metric='None',
                             **params).fit(eval_metric=constant_metric, **params_fit)
-        self.assertEqual(len(gbm.evals_result_['training']), 1)
-        self.assertIn('error', gbm.evals_result_['training'])
+    assert len(gbm.evals_result_['training']) == 1
+    assert 'error' in gbm.evals_result_['training']
     # default metric for non-default objective with custom metric
     gbm = lgb.LGBMRegressor(objective='regression_l1',
                             **params).fit(eval_metric=constant_metric, **params_fit)
-        self.assertEqual(len(gbm.evals_result_['training']), 2)
-        self.assertIn('l1', gbm.evals_result_['training'])
-        self.assertIn('error', gbm.evals_result_['training'])
+    assert len(gbm.evals_result_['training']) == 2
+    assert 'l1' in gbm.evals_result_['training']
+    assert 'error' in gbm.evals_result_['training']
    # non-default metric for non-default objective with custom metric
     gbm = lgb.LGBMRegressor(objective='regression_l1', metric='mape',
                             **params).fit(eval_metric=constant_metric, **params_fit)
-        self.assertEqual(len(gbm.evals_result_['training']), 2)
-        self.assertIn('mape', gbm.evals_result_['training'])
-        self.assertIn('error', gbm.evals_result_['training'])
+    assert len(gbm.evals_result_['training']) == 2
+    assert 'mape' in gbm.evals_result_['training']
+    assert 'error' in gbm.evals_result_['training']
     # multiple metrics for non-default objective with custom metric
     gbm = lgb.LGBMRegressor(objective='regression_l1', metric=['l1', 'gamma'],
                             **params).fit(eval_metric=constant_metric, **params_fit)
-        self.assertEqual(len(gbm.evals_result_['training']), 3)
-        self.assertIn('l1', gbm.evals_result_['training'])
-        self.assertIn('gamma', gbm.evals_result_['training'])
-        self.assertIn('error', gbm.evals_result_['training'])
+    assert len(gbm.evals_result_['training']) == 3
+    assert 'l1' in gbm.evals_result_['training']
+    assert 'gamma' in gbm.evals_result_['training']
+    assert 'error' in gbm.evals_result_['training']
     # custom metric (disable default metric for non-default objective)
     gbm = lgb.LGBMRegressor(objective='regression_l1', metric='None',
                             **params).fit(eval_metric=constant_metric, **params_fit)
-        self.assertEqual(len(gbm.evals_result_['training']), 1)
-        self.assertIn('error', gbm.evals_result_['training'])
+    assert len(gbm.evals_result_['training']) == 1
+    assert 'error' in gbm.evals_result_['training']
     # custom objective, custom metric
     # custom metric for custom objective
     gbm = lgb.LGBMRegressor(objective=custom_dummy_obj,
                             **params).fit(eval_metric=constant_metric, **params_fit)
-        self.assertEqual(len(gbm.evals_result_['training']), 2)
-        self.assertIn('error', gbm.evals_result_['training'])
+    assert len(gbm.evals_result_['training']) == 2
+    assert 'error' in gbm.evals_result_['training']
     # non-default regression metric with custom metric for custom objective
     gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric='mape',
                             **params).fit(eval_metric=constant_metric, **params_fit)
-        self.assertEqual(len(gbm.evals_result_['training']), 2)
-        self.assertIn('mape', gbm.evals_result_['training'])
-        self.assertIn('error', gbm.evals_result_['training'])
+    assert len(gbm.evals_result_['training']) == 2
+    assert 'mape' in gbm.evals_result_['training']
+    assert 'error' in gbm.evals_result_['training']
     # multiple regression metrics with custom metric for custom objective
     gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric=['l2', 'mape'],
                             **params).fit(eval_metric=constant_metric, **params_fit)
-        self.assertEqual(len(gbm.evals_result_['training']), 3)
-        self.assertIn('l2', gbm.evals_result_['training'])
-        self.assertIn('mape', gbm.evals_result_['training'])
-        self.assertIn('error', gbm.evals_result_['training'])
+    assert len(gbm.evals_result_['training']) == 3
+    assert 'l2' in gbm.evals_result_['training']
+    assert 'mape' in gbm.evals_result_['training']
+    assert 'error' in gbm.evals_result_['training']
     X, y = load_digits(n_class=3, return_X_y=True)
     params_fit = {'X': X, 'y': y, 'eval_set': (X, y), 'verbose': False}
     # default metric and invalid binary metric is replaced with multiclass alternative
     gbm = lgb.LGBMClassifier(**params).fit(eval_metric='binary_error', **params_fit)
-        self.assertEqual(len(gbm.evals_result_['training']), 2)
-        self.assertIn('multi_logloss', gbm.evals_result_['training'])
-        self.assertIn('multi_error', gbm.evals_result_['training'])
+    assert len(gbm.evals_result_['training']) == 2
+    assert 'multi_logloss' in gbm.evals_result_['training']
+    assert 'multi_error' in gbm.evals_result_['training']
     # invalid objective is replaced with default multiclass one
     # and invalid binary metric is replaced with multiclass alternative
     gbm = lgb.LGBMClassifier(objective='invalid_obj',
                              **params).fit(eval_metric='binary_error', **params_fit)
-        self.assertEqual(gbm.objective_, 'multiclass')
-        self.assertEqual(len(gbm.evals_result_['training']), 2)
-        self.assertIn('multi_logloss', gbm.evals_result_['training'])
-        self.assertIn('multi_error', gbm.evals_result_['training'])
+    assert gbm.objective_ == 'multiclass'
+    assert len(gbm.evals_result_['training']) == 2
+    assert 'multi_logloss' in gbm.evals_result_['training']
+    assert 'multi_error' in gbm.evals_result_['training']
     # default metric for non-default multiclass objective
     # and invalid binary metric is replaced with multiclass alternative
     gbm = lgb.LGBMClassifier(objective='ovr',
                              **params).fit(eval_metric='binary_error', **params_fit)
-        self.assertEqual(gbm.objective_, 'ovr')
-        self.assertEqual(len(gbm.evals_result_['training']), 2)
-        self.assertIn('multi_logloss', gbm.evals_result_['training'])
-        self.assertIn('multi_error', gbm.evals_result_['training'])
+    assert gbm.objective_ == 'ovr'
+    assert len(gbm.evals_result_['training']) == 2
+    assert 'multi_logloss' in gbm.evals_result_['training']
+    assert 'multi_error' in gbm.evals_result_['training']
     X, y = load_digits(n_class=2, return_X_y=True)
     params_fit = {'X': X, 'y': y, 'eval_set': (X, y), 'verbose': False}
     # default metric and invalid multiclass metric is replaced with binary alternative
gbm = lgb.LGBMClassifier(**params).fit(eval_metric='multi_error', **params_fit) gbm = lgb.LGBMClassifier(**params).fit(eval_metric='multi_error', **params_fit)
self.assertEqual(len(gbm.evals_result_['training']), 2) assert len(gbm.evals_result_['training']) == 2
self.assertIn('binary_logloss', gbm.evals_result_['training']) assert 'binary_logloss' in gbm.evals_result_['training']
self.assertIn('binary_error', gbm.evals_result_['training']) assert 'binary_error' in gbm.evals_result_['training']
# invalid multiclass metric is replaced with binary alternative for custom objective # invalid multiclass metric is replaced with binary alternative for custom objective
gbm = lgb.LGBMClassifier(objective=custom_dummy_obj, gbm = lgb.LGBMClassifier(objective=custom_dummy_obj,
**params).fit(eval_metric='multi_logloss', **params_fit) **params).fit(eval_metric='multi_logloss', **params_fit)
self.assertEqual(len(gbm.evals_result_['training']), 1) assert len(gbm.evals_result_['training']) == 1
self.assertIn('binary_logloss', gbm.evals_result_['training']) assert 'binary_logloss' in gbm.evals_result_['training']
-def test_multiple_eval_metrics(self):
+def test_multiple_eval_metrics():
     X, y = load_breast_cancer(return_X_y=True)
@@ -914,35 +924,36 @@ class TestSklearn(unittest.TestCase):
     # Verify that can receive a list of metrics, only callable
     gbm = lgb.LGBMClassifier(**params).fit(eval_metric=[constant_metric, decreasing_metric], **params_fit)
-    self.assertEqual(len(gbm.evals_result_['training']), 3)
-    self.assertIn('error', gbm.evals_result_['training'])
-    self.assertIn('decreasing_metric', gbm.evals_result_['training'])
-    self.assertIn('binary_logloss', gbm.evals_result_['training'])
+    assert len(gbm.evals_result_['training']) == 3
+    assert 'error' in gbm.evals_result_['training']
+    assert 'decreasing_metric' in gbm.evals_result_['training']
+    assert 'binary_logloss' in gbm.evals_result_['training']
     # Verify that can receive a list of custom and built-in metrics
     gbm = lgb.LGBMClassifier(**params).fit(eval_metric=[constant_metric, decreasing_metric, 'fair'], **params_fit)
-    self.assertEqual(len(gbm.evals_result_['training']), 4)
-    self.assertIn('error', gbm.evals_result_['training'])
-    self.assertIn('decreasing_metric', gbm.evals_result_['training'])
-    self.assertIn('binary_logloss', gbm.evals_result_['training'])
-    self.assertIn('fair', gbm.evals_result_['training'])
+    assert len(gbm.evals_result_['training']) == 4
+    assert 'error' in gbm.evals_result_['training']
+    assert 'decreasing_metric' in gbm.evals_result_['training']
+    assert 'binary_logloss' in gbm.evals_result_['training']
+    assert 'fair' in gbm.evals_result_['training']
     # Verify that works as expected when eval_metric is empty
     gbm = lgb.LGBMClassifier(**params).fit(eval_metric=[], **params_fit)
-    self.assertEqual(len(gbm.evals_result_['training']), 1)
-    self.assertIn('binary_logloss', gbm.evals_result_['training'])
+    assert len(gbm.evals_result_['training']) == 1
+    assert 'binary_logloss' in gbm.evals_result_['training']
     # Verify that can receive a list of metrics, only built-in
     gbm = lgb.LGBMClassifier(**params).fit(eval_metric=['fair', 'error'], **params_fit)
-    self.assertEqual(len(gbm.evals_result_['training']), 3)
-    self.assertIn('binary_logloss', gbm.evals_result_['training'])
+    assert len(gbm.evals_result_['training']) == 3
+    assert 'binary_logloss' in gbm.evals_result_['training']
     # Verify that eval_metric is robust to receiving a list with None
     gbm = lgb.LGBMClassifier(**params).fit(eval_metric=['fair', 'error', None], **params_fit)
-    self.assertEqual(len(gbm.evals_result_['training']), 3)
-    self.assertIn('binary_logloss', gbm.evals_result_['training'])
+    assert len(gbm.evals_result_['training']) == 3
+    assert 'binary_logloss' in gbm.evals_result_['training']
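The list handling exercised above composes custom callables with built-in metric names, and the default objective metric is always appended. A self-contained sketch, assuming a LightGBM release contemporary with this commit (where fit() still accepts a verbose flag):

    import lightgbm as lgb
    from sklearn.datasets import load_breast_cancer

    X, y = load_breast_cancer(return_X_y=True)

    def constant_metric(y_true, y_pred):
        # toy custom metric following the (name, value, is_higher_better) convention
        return 'error', 0.0, False

    gbm = lgb.LGBMClassifier(n_estimators=5)
    gbm.fit(X, y, eval_set=[(X, y)], eval_metric=[constant_metric, 'fair'], verbose=False)
    # training results hold 'error', 'fair' and the default 'binary_logloss'
    print(sorted(gbm.evals_result_['training']))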
-def test_inf_handle(self):
+def test_inf_handle():
     nrows = 100
     ncols = 10
     X = np.random.randn(nrows, ncols)
@@ -954,7 +965,8 @@ class TestSklearn(unittest.TestCase):
     gbm = lgb.LGBMRegressor(**params).fit(**params_fit)
     np.testing.assert_allclose(gbm.evals_result_['training']['l2'], np.inf)
-def test_nan_handle(self):
+def test_nan_handle():
     nrows = 100
     ncols = 10
     X = np.random.randn(nrows, ncols)
@@ -966,25 +978,27 @@ class TestSklearn(unittest.TestCase):
     gbm = lgb.LGBMRegressor(**params).fit(**params_fit)
     np.testing.assert_allclose(gbm.evals_result_['training']['l2'], np.nan)
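The two checks above lean on numpy broadcasting a scalar against the whole metric history, and on assert_allclose treating NaNs as equal by default (equal_nan=True). A quick illustration of both behaviors:

    import numpy as np

    np.testing.assert_allclose([np.inf, np.inf], np.inf)  # passes: same-sign infs compare equal
    np.testing.assert_allclose([np.nan, np.nan], np.nan)  # passes: equal_nan defaults to True
    # np.testing.assert_allclose([np.nan], np.nan, equal_nan=False) would raise AssertionError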
-def test_first_metric_only(self):
+def test_first_metric_only():
     def fit_and_check(eval_set_names, metric_names, assumed_iteration, first_metric_only):
         params['first_metric_only'] = first_metric_only
         gbm = lgb.LGBMRegressor(**params).fit(**params_fit)
-        self.assertEqual(len(gbm.evals_result_), len(eval_set_names))
+        assert len(gbm.evals_result_) == len(eval_set_names)
         for eval_set_name in eval_set_names:
-            self.assertIn(eval_set_name, gbm.evals_result_)
-            self.assertEqual(len(gbm.evals_result_[eval_set_name]), len(metric_names))
+            assert eval_set_name in gbm.evals_result_
+            assert len(gbm.evals_result_[eval_set_name]) == len(metric_names)
             for metric_name in metric_names:
-                self.assertIn(metric_name, gbm.evals_result_[eval_set_name])
+                assert metric_name in gbm.evals_result_[eval_set_name]
                 actual = len(gbm.evals_result_[eval_set_name][metric_name])
                 expected = assumed_iteration + (params_fit['early_stopping_rounds']
                                                 if eval_set_name != 'training'
                                                 and assumed_iteration != gbm.n_estimators else 0)
-                self.assertEqual(expected, actual)
-            self.assertEqual(assumed_iteration if eval_set_name != 'training' else gbm.n_estimators,
-                             gbm.best_iteration_)
+                assert expected == actual
+            if eval_set_name != 'training':
+                assert assumed_iteration == gbm.best_iteration_
+            else:
+                assert gbm.n_estimators == gbm.best_iteration_
     X, y = load_boston(return_X_y=True)
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
@@ -1003,7 +1017,7 @@ class TestSklearn(unittest.TestCase):
     iter_valid1_l2 = 18
     iter_valid2_l1 = 11
     iter_valid2_l2 = 7
-    self.assertEqual(len(set([iter_valid1_l1, iter_valid1_l2, iter_valid2_l1, iter_valid2_l2])), 4)
+    assert len(set([iter_valid1_l1, iter_valid1_l2, iter_valid2_l1, iter_valid2_l2])) == 4
     iter_min_l1 = min([iter_valid1_l1, iter_valid2_l1])
     iter_min_l2 = min([iter_valid1_l2, iter_valid2_l2])
     iter_min = min([iter_min_l1, iter_min_l2])
@@ -1066,7 +1080,8 @@ class TestSklearn(unittest.TestCase):
     fit_and_check(['valid_0', 'valid_1'], ['l1', 'l2'], iter_min, False)
     fit_and_check(['valid_0', 'valid_1'], ['l1', 'l2'], iter_min_l2, True)
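As the final two calls show, first_metric_only restricts early stopping to the first metric in the list; later metrics are still recorded but can neither trigger nor delay stopping. A minimal sketch on the same dataset this file uses (a toy run; the exact best_iteration_ depends on the split):

    import lightgbm as lgb
    from sklearn.datasets import load_boston
    from sklearn.model_selection import train_test_split

    X, y = load_boston(return_X_y=True)
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
    gbm = lgb.LGBMRegressor(n_estimators=200, metric=['l2', 'l1'], first_metric_only=True)
    gbm.fit(X_train, y_train, eval_set=[(X_valid, y_valid)],
            early_stopping_rounds=5, verbose=False)
    # stopping was decided by 'l2' alone, even though 'l1' was also evaluated
    print(gbm.best_iteration_)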
-def test_class_weight(self):
+def test_class_weight():
     X, y = load_digits(n_class=10, return_X_y=True)
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
     y_train_str = y_train.astype('str')
@@ -1100,22 +1115,22 @@ class TestSklearn(unittest.TestCase):
         np.testing.assert_allclose(gbm.evals_result_[eval_set][metric],
                                    gbm_str.evals_result_[eval_set][metric])
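The allclose comparison above asserts that class_weight behaves identically whether the labels (and weight-dict keys) are ints or their string forms. A compact sketch of that equivalence, under an assumed, simplified setup:

    import lightgbm as lgb
    from sklearn.datasets import load_digits

    X, y = load_digits(n_class=10, return_X_y=True)
    weights = {i: (i + 1) / 10 for i in range(10)}         # int-keyed class weights
    weights_str = {str(k): v for k, v in weights.items()}  # same weights, string keys

    gbm = lgb.LGBMClassifier(n_estimators=5, class_weight=weights).fit(X, y)
    gbm_str = lgb.LGBMClassifier(n_estimators=5, class_weight=weights_str).fit(X, y.astype('str'))
    # both fits see identical per-sample weights, so their evaluation histories should match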
-def test_continue_training_with_model(self):
+def test_continue_training_with_model():
     X, y = load_digits(n_class=3, return_X_y=True)
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
     init_gbm = lgb.LGBMClassifier(n_estimators=5).fit(X_train, y_train, eval_set=(X_test, y_test),
                                                       verbose=False)
     gbm = lgb.LGBMClassifier(n_estimators=5).fit(X_train, y_train, eval_set=(X_test, y_test),
                                                  verbose=False, init_model=init_gbm)
-    self.assertEqual(len(init_gbm.evals_result_['valid_0']['multi_logloss']),
-                     len(gbm.evals_result_['valid_0']['multi_logloss']))
-    self.assertEqual(len(init_gbm.evals_result_['valid_0']['multi_logloss']), 5)
-    self.assertLess(gbm.evals_result_['valid_0']['multi_logloss'][-1],
-                    init_gbm.evals_result_['valid_0']['multi_logloss'][-1])
+    assert len(init_gbm.evals_result_['valid_0']['multi_logloss']) == len(gbm.evals_result_['valid_0']['multi_logloss'])
+    assert len(init_gbm.evals_result_['valid_0']['multi_logloss']) == 5
+    assert gbm.evals_result_['valid_0']['multi_logloss'][-1] < init_gbm.evals_result_['valid_0']['multi_logloss'][-1]
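init_model warm-starts the second fit from the first model's booster, so the second estimator trains n_estimators additional rounds and its final loss should not be worse. A minimal sketch with assumed data (the tree-count comment assumes LightGBM's usual one-tree-per-class-per-round layout for multiclass):

    import lightgbm as lgb
    from sklearn.datasets import load_digits

    X, y = load_digits(n_class=3, return_X_y=True)
    first = lgb.LGBMClassifier(n_estimators=5).fit(X, y)
    # continue training: 5 more boosting rounds on top of the existing 5
    second = lgb.LGBMClassifier(n_estimators=5).fit(X, y, init_model=first)
    print(second.booster_.num_trees())  # expected (5 + 5) rounds * 3 classes = 30 trees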
 # sklearn < 0.22 requires passing "attributes" argument
-@unittest.skipIf(sk_version < parse_version('0.22'), 'scikit-learn version is less than 0.22')
-def test_check_is_fitted(self):
+@pytest.mark.skipif(sk_version < parse_version('0.22'), reason='scikit-learn version is less than 0.22')
+def test_check_is_fitted():
     X, y = load_digits(n_class=2, return_X_y=True)
     est = lgb.LGBMModel(n_estimators=5, objective="binary")
     clf = lgb.LGBMClassifier(n_estimators=5)
@@ -1123,9 +1138,8 @@ class TestSklearn(unittest.TestCase):
     rnk = lgb.LGBMRanker(n_estimators=5)
     models = (est, clf, reg, rnk)
     for model in models:
-        self.assertRaises(lgb.compat.LGBMNotFittedError,
-                          check_is_fitted,
-                          model)
+        with pytest.raises(lgb.compat.LGBMNotFittedError):
+            check_is_fitted(model)
     est.fit(X, y)
     clf.fit(X, y)
     reg.fit(X, y)
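The assertRaises → pytest.raises rewrite above is the pattern applied throughout this commit; shown in isolation on a toy function:

    import pytest

    def divide(a, b):
        return a / b

    def test_divide_by_zero():
        # unittest style: self.assertRaises(ZeroDivisionError, divide, 1, 0)
        with pytest.raises(ZeroDivisionError):
            divide(1, 0)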
...