Unverified Commit 6dbe736e authored by Thomas J. Fan, committed by GitHub

[python][tests] migrate test_engine.py to pytest (#3800)

* TST Migrates test_engine.py to pytest

* ENH Apply suggestions

* ENH Uses temp path

* ENH Fixes typos
parent 9cc3777c
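The conversion is largely mechanical: each TestEngine method becomes a module-level function, and unittest assertion methods become plain assert statements, which pytest rewrites to report the compared values on failure. A minimal sketch of the mapping, using illustrative values rather than ones taken from this diff:

import pytest

def test_mapping_sketch():
    ret = 0.123456
    # self.assertLess(ret, 0.14) becomes a bare comparison:
    assert ret < 0.14
    # self.assertAlmostEqual(a, b, places=5) becomes pytest.approx(). The
    # semantics differ slightly: places=5 checks round(a - b, 5) == 0, while
    # pytest.approx defaults to a relative tolerance of 1e-6, which is why one
    # call site below passes abs=1e-5 explicitly.
    assert ret + 1e-8 == pytest.approx(ret)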
@@ -6,7 +6,6 @@ import os
 import pickle
 import psutil
 import random
-import unittest
 import lightgbm as lgb
 import numpy as np
@@ -14,6 +13,7 @@ from scipy.sparse import csr_matrix, isspmatrix_csr, isspmatrix_csc
 from sklearn.datasets import load_svmlight_file, make_multilabel_classification
 from sklearn.metrics import log_loss, mean_absolute_error, mean_squared_error, roc_auc_score, average_precision_score
 from sklearn.model_selection import train_test_split, TimeSeriesSplit, GroupKFold
+import pytest
 from .utils import load_boston, load_breast_cancer, load_digits, load_iris
@@ -48,8 +48,7 @@ def categorize(continuous_x):
     return np.digitize(continuous_x, bins=np.arange(0, 1, 0.01))
-class TestEngine(unittest.TestCase):
-    def test_binary(self):
+def test_binary():
     X, y = load_breast_cancer(return_X_y=True)
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
     params = {
@@ -67,11 +66,12 @@ class TestEngine(unittest.TestCase):
                     verbose_eval=False,
                     evals_result=evals_result)
     ret = log_loss(y_test, gbm.predict(X_test))
-        self.assertLess(ret, 0.14)
-        self.assertEqual(len(evals_result['valid_0']['binary_logloss']), 50)
-        self.assertAlmostEqual(evals_result['valid_0']['binary_logloss'][-1], ret, places=5)
-    def test_rf(self):
+    assert ret < 0.14
+    assert len(evals_result['valid_0']['binary_logloss']) == 50
+    assert evals_result['valid_0']['binary_logloss'][-1] == pytest.approx(ret)
+
+def test_rf():
     X, y = load_breast_cancer(return_X_y=True)
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
     params = {
@@ -93,10 +93,11 @@ class TestEngine(unittest.TestCase):
                     verbose_eval=False,
                     evals_result=evals_result)
     ret = log_loss(y_test, gbm.predict(X_test))
-        self.assertLess(ret, 0.19)
-        self.assertAlmostEqual(evals_result['valid_0']['binary_logloss'][-1], ret, places=5)
-    def test_regression(self):
+    assert ret < 0.19
+    assert evals_result['valid_0']['binary_logloss'][-1] == pytest.approx(ret)
+
+def test_regression():
     X, y = load_boston(return_X_y=True)
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
     params = {
@@ -112,10 +113,11 @@ class TestEngine(unittest.TestCase):
                     verbose_eval=False,
                     evals_result=evals_result)
     ret = mean_squared_error(y_test, gbm.predict(X_test))
-        self.assertLess(ret, 7)
-        self.assertAlmostEqual(evals_result['valid_0']['l2'][-1], ret, places=5)
-    def test_missing_value_handle(self):
+    assert ret < 7
+    assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret)
+
+def test_missing_value_handle():
     X_train = np.zeros((100, 1))
     y_train = np.zeros(100)
     trues = random.sample(range(100), 20)
@@ -137,10 +139,11 @@ class TestEngine(unittest.TestCase):
                     verbose_eval=False,
                     evals_result=evals_result)
     ret = mean_squared_error(y_train, gbm.predict(X_train))
-        self.assertLess(ret, 0.005)
-        self.assertAlmostEqual(evals_result['valid_0']['l2'][-1], ret, places=5)
-    def test_missing_value_handle_more_na(self):
+    assert ret < 0.005
+    assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret)
+
+def test_missing_value_handle_more_na():
     X_train = np.ones((100, 1))
     y_train = np.ones(100)
     trues = random.sample(range(100), 80)
@@ -162,10 +165,11 @@ class TestEngine(unittest.TestCase):
                     verbose_eval=False,
                     evals_result=evals_result)
     ret = mean_squared_error(y_train, gbm.predict(X_train))
-        self.assertLess(ret, 0.005)
-        self.assertAlmostEqual(evals_result['valid_0']['l2'][-1], ret, places=5)
-    def test_missing_value_handle_na(self):
+    assert ret < 0.005
+    assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret)
+
+def test_missing_value_handle_na():
     x = [0, 1, 2, 3, 4, 5, 6, 7, np.nan]
     y = [1, 1, 1, 1, 0, 0, 0, 0, 1]
@@ -194,10 +198,11 @@ class TestEngine(unittest.TestCase):
     pred = gbm.predict(X_train)
     np.testing.assert_allclose(pred, y)
     ret = roc_auc_score(y_train, pred)
-        self.assertGreater(ret, 0.999)
-        self.assertAlmostEqual(evals_result['valid_0']['auc'][-1], ret, places=5)
-    def test_missing_value_handle_zero(self):
+    assert ret > 0.999
+    assert evals_result['valid_0']['auc'][-1] == pytest.approx(ret)
+
+def test_missing_value_handle_zero():
     x = [0, 1, 2, 3, 4, 5, 6, 7, np.nan]
     y = [0, 1, 1, 1, 0, 0, 0, 0, 0]
@@ -226,10 +231,11 @@ class TestEngine(unittest.TestCase):
     pred = gbm.predict(X_train)
     np.testing.assert_allclose(pred, y)
     ret = roc_auc_score(y_train, pred)
-        self.assertGreater(ret, 0.999)
-        self.assertAlmostEqual(evals_result['valid_0']['auc'][-1], ret, places=5)
-    def test_missing_value_handle_none(self):
+    assert ret > 0.999
+    assert evals_result['valid_0']['auc'][-1] == pytest.approx(ret)
+
+def test_missing_value_handle_none():
     x = [0, 1, 2, 3, 4, 5, 6, 7, np.nan]
     y = [0, 1, 1, 1, 0, 0, 0, 0, 0]
@@ -256,13 +262,14 @@ class TestEngine(unittest.TestCase):
                     verbose_eval=False,
                     evals_result=evals_result)
     pred = gbm.predict(X_train)
-        self.assertAlmostEqual(pred[0], pred[1])
-        self.assertAlmostEqual(pred[-1], pred[0])
+    assert pred[0] == pytest.approx(pred[1])
+    assert pred[-1] == pytest.approx(pred[0])
     ret = roc_auc_score(y_train, pred)
-        self.assertGreater(ret, 0.83)
-        self.assertAlmostEqual(evals_result['valid_0']['auc'][-1], ret, places=5)
-    def test_categorical_handle(self):
+    assert ret > 0.83
+    assert evals_result['valid_0']['auc'][-1] == pytest.approx(ret)
+
+def test_categorical_handle():
     x = [0, 1, 2, 3, 4, 5, 6, 7]
     y = [0, 1, 0, 1, 0, 1, 0, 1]
@@ -296,10 +303,11 @@ class TestEngine(unittest.TestCase):
     pred = gbm.predict(X_train)
     np.testing.assert_allclose(pred, y)
     ret = roc_auc_score(y_train, pred)
-        self.assertGreater(ret, 0.999)
-        self.assertAlmostEqual(evals_result['valid_0']['auc'][-1], ret, places=5)
-    def test_categorical_handle_na(self):
+    assert ret > 0.999
+    assert evals_result['valid_0']['auc'][-1] == pytest.approx(ret)
+
+def test_categorical_handle_na():
     x = [0, np.nan, 0, np.nan, 0, np.nan]
     y = [0, 1, 0, 1, 0, 1]
@@ -333,10 +341,11 @@ class TestEngine(unittest.TestCase):
     pred = gbm.predict(X_train)
     np.testing.assert_allclose(pred, y)
     ret = roc_auc_score(y_train, pred)
-        self.assertGreater(ret, 0.999)
-        self.assertAlmostEqual(evals_result['valid_0']['auc'][-1], ret, places=5)
-    def test_categorical_non_zero_inputs(self):
+    assert ret > 0.999
+    assert evals_result['valid_0']['auc'][-1] == pytest.approx(ret)
+
+def test_categorical_non_zero_inputs():
     x = [1, 1, 1, 1, 1, 1, 2, 2]
     y = [1, 1, 1, 1, 1, 1, 0, 0]
@@ -370,10 +379,11 @@ class TestEngine(unittest.TestCase):
     pred = gbm.predict(X_train)
     np.testing.assert_allclose(pred, y)
     ret = roc_auc_score(y_train, pred)
-        self.assertGreater(ret, 0.999)
-        self.assertAlmostEqual(evals_result['valid_0']['auc'][-1], ret, places=5)
-    def test_multiclass(self):
+    assert ret > 0.999
+    assert evals_result['valid_0']['auc'][-1] == pytest.approx(ret)
+
+def test_multiclass():
     X, y = load_digits(n_class=10, return_X_y=True)
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
     params = {
@@ -391,10 +401,11 @@ class TestEngine(unittest.TestCase):
                     verbose_eval=False,
                     evals_result=evals_result)
     ret = multi_logloss(y_test, gbm.predict(X_test))
-        self.assertLess(ret, 0.16)
-        self.assertAlmostEqual(evals_result['valid_0']['multi_logloss'][-1], ret, places=5)
-    def test_multiclass_rf(self):
+    assert ret < 0.16
+    assert evals_result['valid_0']['multi_logloss'][-1] == pytest.approx(ret)
+
+def test_multiclass_rf():
     X, y = load_digits(n_class=10, return_X_y=True)
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
     params = {
@@ -419,10 +430,11 @@ class TestEngine(unittest.TestCase):
                     verbose_eval=False,
                     evals_result=evals_result)
     ret = multi_logloss(y_test, gbm.predict(X_test))
-        self.assertLess(ret, 0.23)
-        self.assertAlmostEqual(evals_result['valid_0']['multi_logloss'][-1], ret, places=5)
-    def test_multiclass_prediction_early_stopping(self):
+    assert ret < 0.23
+    assert evals_result['valid_0']['multi_logloss'][-1] == pytest.approx(ret)
+
+def test_multiclass_prediction_early_stopping():
     X, y = load_digits(n_class=10, return_X_y=True)
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
     params = {
@@ -439,16 +451,17 @@ class TestEngine(unittest.TestCase):
                       "pred_early_stop_freq": 5,
                       "pred_early_stop_margin": 1.5}
     ret = multi_logloss(y_test, gbm.predict(X_test, **pred_parameter))
-        self.assertLess(ret, 0.8)
-        self.assertGreater(ret, 0.6)  # loss will be higher than when evaluating the full model
+    assert ret < 0.8
+    assert ret > 0.6  # loss will be higher than when evaluating the full model
     pred_parameter = {"pred_early_stop": True,
                       "pred_early_stop_freq": 5,
                       "pred_early_stop_margin": 5.5}
     ret = multi_logloss(y_test, gbm.predict(X_test, **pred_parameter))
-        self.assertLess(ret, 0.2)
-    def test_multi_class_error(self):
+    assert ret < 0.2
+
+def test_multi_class_error():
     X, y = load_digits(n_class=10, return_X_y=True)
     params = {'objective': 'multiclass', 'num_classes': 10, 'metric': 'multi_error',
               'num_leaves': 4, 'verbose': -1}
@@ -463,21 +476,21 @@ class TestEngine(unittest.TestCase):
     np.testing.assert_allclose(predict_1, predict_default)
     # check against independent calculation for k = 1
     err = top_k_error(y, predict_1, 1)
-        self.assertAlmostEqual(results['training']['multi_error'][-1], err)
+    assert results['training']['multi_error'][-1] == pytest.approx(err)
     # check against independent calculation for k = 2
     results = {}
     est = lgb.train(dict(params, multi_error_top_k=2), lgb_data, num_boost_round=10,
                     valid_sets=[lgb_data], evals_result=results, verbose_eval=False)
     predict_2 = est.predict(X)
     err = top_k_error(y, predict_2, 2)
-        self.assertAlmostEqual(results['training']['multi_error@2'][-1], err)
+    assert results['training']['multi_error@2'][-1] == pytest.approx(err)
     # check against independent calculation for k = 10
     results = {}
     est = lgb.train(dict(params, multi_error_top_k=10), lgb_data, num_boost_round=10,
                     valid_sets=[lgb_data], evals_result=results, verbose_eval=False)
     predict_3 = est.predict(X)
     err = top_k_error(y, predict_3, 10)
-        self.assertAlmostEqual(results['training']['multi_error@10'][-1], err)
+    assert results['training']['multi_error@10'][-1] == pytest.approx(err)
     # check cases where predictions are equal
     X = np.array([[0, 0], [0, 0]])
     y = np.array([0, 1])
@@ -486,13 +499,14 @@ class TestEngine(unittest.TestCase):
     results = {}
     lgb.train(params, lgb_data, num_boost_round=10,
              valid_sets=[lgb_data], evals_result=results, verbose_eval=False)
-        self.assertAlmostEqual(results['training']['multi_error'][-1], 1)
+    assert results['training']['multi_error'][-1] == pytest.approx(1)
     results = {}
     lgb.train(dict(params, multi_error_top_k=2), lgb_data, num_boost_round=10,
              valid_sets=[lgb_data], evals_result=results, verbose_eval=False)
-        self.assertAlmostEqual(results['training']['multi_error@2'][-1], 0)
-    def test_auc_mu(self):
+    assert results['training']['multi_error@2'][-1] == pytest.approx(0)
+
+def test_auc_mu():
     # should give same result as binary auc for 2 classes
     X, y = load_digits(n_class=10, return_X_y=True)
     y_new = np.zeros((len(y)))
@@ -522,7 +536,7 @@ class TestEngine(unittest.TestCase):
              'seed': 0}
     results_auc_mu = {}
     lgb.train(params, lgb_X, num_boost_round=10, valid_sets=[lgb_X], evals_result=results_auc_mu)
-        self.assertAlmostEqual(results_auc_mu['training']['auc_mu'][-1], 0.5)
+    assert results_auc_mu['training']['auc_mu'][-1] == pytest.approx(0.5)
     # test that weighted data gives different auc_mu
     lgb_X = lgb.Dataset(X, label=y)
     lgb_X_weighted = lgb.Dataset(X, label=y, weight=np.abs(np.random.normal(size=y.shape)))
@@ -532,14 +546,14 @@ class TestEngine(unittest.TestCase):
     lgb.train(params, lgb_X, num_boost_round=10, valid_sets=[lgb_X], evals_result=results_unweighted)
     lgb.train(params, lgb_X_weighted, num_boost_round=10, valid_sets=[lgb_X_weighted],
               evals_result=results_weighted)
-        self.assertLess(results_weighted['training']['auc_mu'][-1], 1)
-        self.assertNotEqual(results_unweighted['training']['auc_mu'][-1], results_weighted['training']['auc_mu'][-1])
+    assert results_weighted['training']['auc_mu'][-1] < 1
+    assert results_unweighted['training']['auc_mu'][-1] != results_weighted['training']['auc_mu'][-1]
     # test that equal data weights give same auc_mu as unweighted data
     lgb_X_weighted = lgb.Dataset(X, label=y, weight=np.ones(y.shape) * 0.5)
     lgb.train(params, lgb_X_weighted, num_boost_round=10, valid_sets=[lgb_X_weighted],
               evals_result=results_weighted)
-        self.assertAlmostEqual(results_unweighted['training']['auc_mu'][-1], results_weighted['training']['auc_mu'][-1],
-                               places=5)
+    assert results_unweighted['training']['auc_mu'][-1] == pytest.approx(
+        results_weighted['training']['auc_mu'][-1], abs=1e-5)
     # should give 1 when accuracy = 1
     X = X[:10, :]
     y = y[:10]
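The hunk above keeps an explicit absolute tolerance instead of relying on the pytest.approx default. A small sketch of the difference, with illustrative numbers:

import pytest

assert 1.000004 == pytest.approx(1.0, abs=1e-5)  # passes: |1.000004 - 1.0| <= 1e-5
# assert 1.000004 == pytest.approx(1.0)  # would fail: the default is rel=1e-6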
@@ -551,7 +565,7 @@ class TestEngine(unittest.TestCase):
              'verbose': -1}
     results = {}
     lgb.train(params, lgb_X, num_boost_round=100, valid_sets=[lgb_X], evals_result=results)
-        self.assertAlmostEqual(results['training']['auc_mu'][-1], 1)
+    assert results['training']['auc_mu'][-1] == pytest.approx(1)
     # test loading class weights
     Xy = np.loadtxt(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                  '../../examples/multiclass_classification/multiclass.train'))
@@ -569,9 +583,10 @@ class TestEngine(unittest.TestCase):
     params['auc_mu_weights'] = []
     results_no_weight = {}
     lgb.train(params, lgb_X, num_boost_round=5, valid_sets=[lgb_X], evals_result=results_no_weight)
-        self.assertNotEqual(results_weight['training']['auc_mu'][-1], results_no_weight['training']['auc_mu'][-1])
-    def test_early_stopping(self):
+    assert results_weight['training']['auc_mu'][-1] != results_no_weight['training']['auc_mu'][-1]
+
+def test_early_stopping():
     X, y = load_breast_cancer(return_X_y=True)
     params = {
         'objective': 'binary',
@@ -589,9 +604,9 @@ class TestEngine(unittest.TestCase):
                     valid_names=valid_set_name,
                     verbose_eval=False,
                     early_stopping_rounds=5)
-        self.assertEqual(gbm.best_iteration, 10)
-        self.assertIn(valid_set_name, gbm.best_score)
-        self.assertIn('binary_logloss', gbm.best_score[valid_set_name])
+    assert gbm.best_iteration == 10
+    assert valid_set_name in gbm.best_score
+    assert 'binary_logloss' in gbm.best_score[valid_set_name]
     # early stopping occurs
     gbm = lgb.train(params, lgb_train,
                     num_boost_round=40,
@@ -599,11 +614,12 @@ class TestEngine(unittest.TestCase):
                     valid_names=valid_set_name,
                     verbose_eval=False,
                     early_stopping_rounds=5)
-        self.assertLessEqual(gbm.best_iteration, 39)
-        self.assertIn(valid_set_name, gbm.best_score)
-        self.assertIn('binary_logloss', gbm.best_score[valid_set_name])
-    def test_continue_train(self):
+    assert gbm.best_iteration <= 39
+    assert valid_set_name in gbm.best_score
+    assert 'binary_logloss' in gbm.best_score[valid_set_name]
+
+def test_continue_train():
     X, y = load_boston(return_X_y=True)
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
     params = {
@@ -626,12 +642,13 @@ class TestEngine(unittest.TestCase):
                     evals_result=evals_result,
                     init_model='model.txt')
     ret = mean_absolute_error(y_test, gbm.predict(X_test))
-        self.assertLess(ret, 2.0)
-        self.assertAlmostEqual(evals_result['valid_0']['l1'][-1], ret, places=5)
+    assert ret < 2.0
+    assert evals_result['valid_0']['l1'][-1] == pytest.approx(ret)
     np.testing.assert_allclose(evals_result['valid_0']['l1'], evals_result['valid_0']['custom_mae'])
     os.remove(model_name)
-    def test_continue_train_reused_dataset(self):
+
+def test_continue_train_reused_dataset():
     X, y = load_boston(return_X_y=True)
     params = {
         'objective': 'regression',
@@ -642,9 +659,10 @@ class TestEngine(unittest.TestCase):
     init_gbm_2 = lgb.train(params, lgb_train, num_boost_round=5, init_model=init_gbm)
     init_gbm_3 = lgb.train(params, lgb_train, num_boost_round=5, init_model=init_gbm_2)
     gbm = lgb.train(params, lgb_train, num_boost_round=5, init_model=init_gbm_3)
-        self.assertEqual(gbm.current_iteration(), 20)
-    def test_continue_train_dart(self):
+    assert gbm.current_iteration() == 20
+
+def test_continue_train_dart():
     X, y = load_boston(return_X_y=True)
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
     params = {
@@ -664,10 +682,11 @@ class TestEngine(unittest.TestCase):
                     evals_result=evals_result,
                     init_model=init_gbm)
     ret = mean_absolute_error(y_test, gbm.predict(X_test))
-        self.assertLess(ret, 2.0)
-        self.assertAlmostEqual(evals_result['valid_0']['l1'][-1], ret, places=5)
-    def test_continue_train_multiclass(self):
+    assert ret < 2.0
+    assert evals_result['valid_0']['l1'][-1] == pytest.approx(ret)
+
+def test_continue_train_multiclass():
     X, y = load_iris(return_X_y=True)
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
     params = {
@@ -687,10 +706,11 @@ class TestEngine(unittest.TestCase):
                     evals_result=evals_result,
                     init_model=init_gbm)
     ret = multi_logloss(y_test, gbm.predict(X_test))
-        self.assertLess(ret, 0.1)
-        self.assertAlmostEqual(evals_result['valid_0']['multi_logloss'][-1], ret, places=5)
-    def test_cv(self):
+    assert ret < 0.1
+    assert evals_result['valid_0']['multi_logloss'][-1] == pytest.approx(ret)
+
+def test_cv():
     X_train, y_train = load_boston(return_X_y=True)
     params = {'verbose': -1}
     lgb_train = lgb.Dataset(X_train, y_train)
@@ -699,25 +719,25 @@ class TestEngine(unittest.TestCase):
     cv_res = lgb.cv(params_with_metric, lgb_train, num_boost_round=10,
                     nfold=3, stratified=False, shuffle=False,
                     metrics='l1', verbose_eval=False)
-        self.assertIn('l1-mean', cv_res)
-        self.assertNotIn('l2-mean', cv_res)
-        self.assertEqual(len(cv_res['l1-mean']), 10)
+    assert 'l1-mean' in cv_res
+    assert 'l2-mean' not in cv_res
+    assert len(cv_res['l1-mean']) == 10
     # shuffle = True, callbacks
     cv_res = lgb.cv(params, lgb_train, num_boost_round=10, nfold=3, stratified=False, shuffle=True,
                     metrics='l1', verbose_eval=False,
                     callbacks=[lgb.reset_parameter(learning_rate=lambda i: 0.1 - 0.001 * i)])
-        self.assertIn('l1-mean', cv_res)
-        self.assertEqual(len(cv_res['l1-mean']), 10)
+    assert 'l1-mean' in cv_res
+    assert len(cv_res['l1-mean']) == 10
     # enable display training loss
     cv_res = lgb.cv(params_with_metric, lgb_train, num_boost_round=10,
                     nfold=3, stratified=False, shuffle=False,
                     metrics='l1', verbose_eval=False, eval_train_metric=True)
-        self.assertIn('train l1-mean', cv_res)
-        self.assertIn('valid l1-mean', cv_res)
-        self.assertNotIn('train l2-mean', cv_res)
-        self.assertNotIn('valid l2-mean', cv_res)
-        self.assertEqual(len(cv_res['train l1-mean']), 10)
-        self.assertEqual(len(cv_res['valid l1-mean']), 10)
+    assert 'train l1-mean' in cv_res
+    assert 'valid l1-mean' in cv_res
+    assert 'train l2-mean' not in cv_res
+    assert 'valid l2-mean' not in cv_res
+    assert len(cv_res['train l1-mean']) == 10
+    assert len(cv_res['valid l1-mean']) == 10
     # self defined folds
     tss = TimeSeriesSplit(3)
     folds = tss.split(X_train)
@@ -736,20 +756,21 @@ class TestEngine(unittest.TestCase):
     # ... with l2 metric
     cv_res_lambda = lgb.cv(params_lambdarank, lgb_train, num_boost_round=10, nfold=3,
                            metrics='l2', verbose_eval=False)
-        self.assertEqual(len(cv_res_lambda), 2)
-        self.assertFalse(np.isnan(cv_res_lambda['l2-mean']).any())
+    assert len(cv_res_lambda) == 2
+    assert not np.isnan(cv_res_lambda['l2-mean']).any()
     # ... with NDCG (default) metric
     cv_res_lambda = lgb.cv(params_lambdarank, lgb_train, num_boost_round=10, nfold=3,
                            verbose_eval=False)
-        self.assertEqual(len(cv_res_lambda), 2)
-        self.assertFalse(np.isnan(cv_res_lambda['ndcg@3-mean']).any())
+    assert len(cv_res_lambda) == 2
+    assert not np.isnan(cv_res_lambda['ndcg@3-mean']).any()
     # self defined folds with lambdarank
     cv_res_lambda_obj = lgb.cv(params_lambdarank, lgb_train, num_boost_round=10,
                                folds=GroupKFold(n_splits=3),
                                verbose_eval=False)
     np.testing.assert_allclose(cv_res_lambda['ndcg@3-mean'], cv_res_lambda_obj['ndcg@3-mean'])
-    def test_cvbooster(self):
+
+def test_cvbooster():
     X, y = load_breast_cancer(return_X_y=True)
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
     params = {
@@ -765,21 +786,21 @@ class TestEngine(unittest.TestCase):
                     verbose_eval=False,
                     nfold=3,
                     return_cvbooster=True)
-        self.assertIn('cvbooster', cv_res)
+    assert 'cvbooster' in cv_res
     cvb = cv_res['cvbooster']
-        self.assertIsInstance(cvb, lgb.CVBooster)
-        self.assertIsInstance(cvb.boosters, list)
-        self.assertEqual(len(cvb.boosters), 3)
-        self.assertTrue(all(isinstance(bst, lgb.Booster) for bst in cvb.boosters))
-        self.assertGreater(cvb.best_iteration, 0)
+    assert isinstance(cvb, lgb.CVBooster)
+    assert isinstance(cvb.boosters, list)
+    assert len(cvb.boosters) == 3
+    assert all(isinstance(bst, lgb.Booster) for bst in cvb.boosters)
+    assert cvb.best_iteration > 0
     # predict by each fold booster
     preds = cvb.predict(X_test, num_iteration=cvb.best_iteration)
-        self.assertIsInstance(preds, list)
-        self.assertEqual(len(preds), 3)
+    assert isinstance(preds, list)
+    assert len(preds) == 3
     # fold averaging
     avg_pred = np.mean(preds, axis=0)
     ret = log_loss(y_test, avg_pred)
-        self.assertLess(ret, 0.13)
+    assert ret < 0.13
     # without early stopping
     cv_res = lgb.cv(params, lgb_train,
                     num_boost_round=20,
@@ -787,25 +808,27 @@ class TestEngine(unittest.TestCase):
                     nfold=3,
                     return_cvbooster=True)
     cvb = cv_res['cvbooster']
-        self.assertEqual(cvb.best_iteration, -1)
+    assert cvb.best_iteration == -1
     preds = cvb.predict(X_test)
     avg_pred = np.mean(preds, axis=0)
     ret = log_loss(y_test, avg_pred)
-        self.assertLess(ret, 0.15)
-    def test_feature_name(self):
+    assert ret < 0.15
+
+def test_feature_name():
     X_train, y_train = load_boston(return_X_y=True)
     params = {'verbose': -1}
     lgb_train = lgb.Dataset(X_train, y_train)
     feature_names = ['f_' + str(i) for i in range(X_train.shape[-1])]
     gbm = lgb.train(params, lgb_train, num_boost_round=5, feature_name=feature_names)
-        self.assertListEqual(feature_names, gbm.feature_name())
+    assert feature_names == gbm.feature_name()
     # test feature_names with whitespaces
     feature_names_with_space = ['f ' + str(i) for i in range(X_train.shape[-1])]
     gbm = lgb.train(params, lgb_train, num_boost_round=5, feature_name=feature_names_with_space)
-        self.assertListEqual(feature_names, gbm.feature_name())
-    def test_feature_name_with_non_ascii(self):
+    assert feature_names == gbm.feature_name()
+
+def test_feature_name_with_non_ascii():
     X_train = np.random.normal(size=(100, 4))
     y_train = np.random.random(100)
     # This has non-ascii strings.
@@ -814,13 +837,14 @@ class TestEngine(unittest.TestCase):
     lgb_train = lgb.Dataset(X_train, y_train)
     gbm = lgb.train(params, lgb_train, num_boost_round=5, feature_name=feature_names)
-        self.assertListEqual(feature_names, gbm.feature_name())
+    assert feature_names == gbm.feature_name()
     gbm.save_model('lgb.model')
     gbm2 = lgb.Booster(model_file='lgb.model')
-        self.assertListEqual(feature_names, gbm2.feature_name())
-    def test_save_load_copy_pickle(self):
+    assert feature_names == gbm2.feature_name()
+
+def test_save_load_copy_pickle():
     def train_and_predict(init_model=None, return_model=False):
         X, y = load_boston(return_X_y=True)
         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
@@ -838,7 +862,7 @@ class TestEngine(unittest.TestCase):
     other_ret = []
     gbm.save_model('lgb.model')
     with open('lgb.model') as f:  # check all params are logged into model file correctly
-            self.assertNotEqual(f.read().find("[num_iterations: 10]"), -1)
+        assert f.read().find("[num_iterations: 10]") != -1
     other_ret.append(train_and_predict(init_model='lgb.model'))
     gbm_load = lgb.Booster(model_file='lgb.model')
     other_ret.append(train_and_predict(init_model=gbm_load))
@@ -852,11 +876,11 @@ class TestEngine(unittest.TestCase):
     gbm_pickles = pickle.loads(pickle.dumps(gbm))
     other_ret.append(train_and_predict(init_model=gbm_pickles))
     for ret in other_ret:
-            self.assertAlmostEqual(ret_origin, ret, places=5)
+        assert ret_origin == pytest.approx(ret)
-    @unittest.skipIf(not lgb.compat.PANDAS_INSTALLED, 'pandas is not installed')
-    def test_pandas_categorical(self):
-        import pandas as pd
+
+def test_pandas_categorical():
+    pd = pytest.importorskip("pandas")
     np.random.seed(42)  # sometimes there is no difference how cols are treated (cat or not cat)
     X = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'c', 'd'] * 75),  # str
                       "B": np.random.permutation([1, 2, 3] * 100),  # int
@@ -885,19 +909,19 @@ class TestEngine(unittest.TestCase):
     lgb_train = lgb.Dataset(X, y)
     gbm0 = lgb.train(params, lgb_train, num_boost_round=10)
     pred0 = gbm0.predict(X_test)
-        self.assertEqual(lgb_train.categorical_feature, 'auto')
+    assert lgb_train.categorical_feature == 'auto'
     lgb_train = lgb.Dataset(X, pd.DataFrame(y))  # also test that label can be one-column pd.DataFrame
     gbm1 = lgb.train(params, lgb_train, num_boost_round=10, categorical_feature=[0])
     pred1 = gbm1.predict(X_test)
-        self.assertListEqual(lgb_train.categorical_feature, [0])
+    assert lgb_train.categorical_feature == [0]
     lgb_train = lgb.Dataset(X, pd.Series(y))  # also test that label can be pd.Series
     gbm2 = lgb.train(params, lgb_train, num_boost_round=10, categorical_feature=['A'])
     pred2 = gbm2.predict(X_test)
-        self.assertListEqual(lgb_train.categorical_feature, ['A'])
+    assert lgb_train.categorical_feature == ['A']
     lgb_train = lgb.Dataset(X, y)
     gbm3 = lgb.train(params, lgb_train, num_boost_round=10, categorical_feature=['A', 'B', 'C', 'D'])
     pred3 = gbm3.predict(X_test)
-        self.assertListEqual(lgb_train.categorical_feature, ['A', 'B', 'C', 'D'])
+    assert lgb_train.categorical_feature == ['A', 'B', 'C', 'D']
     gbm3.save_model('categorical.model')
     gbm4 = lgb.Booster(model_file='categorical.model')
     pred4 = gbm4.predict(X_test)
@@ -909,40 +933,36 @@ class TestEngine(unittest.TestCase):
     lgb_train = lgb.Dataset(X, y)
     gbm6 = lgb.train(params, lgb_train, num_boost_round=10, categorical_feature=['A', 'B', 'C', 'D', 'E'])
     pred7 = gbm6.predict(X_test)
-        self.assertListEqual(lgb_train.categorical_feature, ['A', 'B', 'C', 'D', 'E'])
+    assert lgb_train.categorical_feature == ['A', 'B', 'C', 'D', 'E']
     lgb_train = lgb.Dataset(X, y)
     gbm7 = lgb.train(params, lgb_train, num_boost_round=10, categorical_feature=[])
     pred8 = gbm7.predict(X_test)
-        self.assertListEqual(lgb_train.categorical_feature, [])
-        self.assertRaises(AssertionError,
-                          np.testing.assert_allclose,
-                          pred0, pred1)
-        self.assertRaises(AssertionError,
-                          np.testing.assert_allclose,
-                          pred0, pred2)
+    assert lgb_train.categorical_feature == []
+    with pytest.raises(AssertionError):
+        np.testing.assert_allclose(pred0, pred1)
+    with pytest.raises(AssertionError):
+        np.testing.assert_allclose(pred0, pred2)
     np.testing.assert_allclose(pred1, pred2)
     np.testing.assert_allclose(pred0, pred3)
     np.testing.assert_allclose(pred0, pred4)
     np.testing.assert_allclose(pred0, pred5)
     np.testing.assert_allclose(pred0, pred6)
-        self.assertRaises(AssertionError,
-                          np.testing.assert_allclose,
-                          pred0, pred7)  # ordered cat features aren't treated as cat features by default
-        self.assertRaises(AssertionError,
-                          np.testing.assert_allclose,
-                          pred0, pred8)  # ordered cat features aren't treated as cat features by default
-        self.assertListEqual(gbm0.pandas_categorical, cat_values)
-        self.assertListEqual(gbm1.pandas_categorical, cat_values)
-        self.assertListEqual(gbm2.pandas_categorical, cat_values)
-        self.assertListEqual(gbm3.pandas_categorical, cat_values)
-        self.assertListEqual(gbm4.pandas_categorical, cat_values)
-        self.assertListEqual(gbm5.pandas_categorical, cat_values)
-        self.assertListEqual(gbm6.pandas_categorical, cat_values)
-        self.assertListEqual(gbm7.pandas_categorical, cat_values)
+    with pytest.raises(AssertionError):
+        np.testing.assert_allclose(pred0, pred7)  # ordered cat features aren't treated as cat features by default
+    with pytest.raises(AssertionError):
+        np.testing.assert_allclose(pred0, pred8)  # ordered cat features aren't treated as cat features by default
+    assert gbm0.pandas_categorical == cat_values
+    assert gbm1.pandas_categorical == cat_values
+    assert gbm2.pandas_categorical == cat_values
+    assert gbm3.pandas_categorical == cat_values
+    assert gbm4.pandas_categorical == cat_values
+    assert gbm5.pandas_categorical == cat_values
+    assert gbm6.pandas_categorical == cat_values
+    assert gbm7.pandas_categorical == cat_values
-    @unittest.skipIf(not lgb.compat.PANDAS_INSTALLED, 'pandas is not installed')
-    def test_pandas_sparse(self):
-        import pandas as pd
+
+def test_pandas_sparse():
+    pd = pytest.importorskip("pandas")
     try:
         from pandas.arrays import SparseArray
     except ImportError:  # support old versions
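In the hunk above, each three-line self.assertRaises(AssertionError, func, *args) call collapses into a with pytest.raises(AssertionError): block, so the failing call is written as an ordinary expression rather than a callable plus arguments. A minimal sketch with illustrative arrays:

import numpy as np
import pytest

def test_arrays_differ():
    with pytest.raises(AssertionError):  # assert_allclose raises AssertionError on mismatch
        np.testing.assert_allclose(np.array([1.0]), np.array([2.0]))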
@@ -956,7 +976,7 @@ class TestEngine(unittest.TestCase):
                       "C": SparseArray(np.random.permutation([True, False] * 30))})
     if pd.__version__ >= '0.24.0':
         for dtype in pd.concat([X.dtypes, X_test.dtypes, pd.Series(y.dtypes)]):
-                self.assertTrue(pd.api.types.is_sparse(dtype))
+            assert pd.api.types.is_sparse(dtype)
     params = {
         'objective': 'binary',
         'verbose': -1
@@ -970,7 +990,8 @@ class TestEngine(unittest.TestCase):
     pred_dense = gbm.predict(X_test.to_dense(), raw_score=True)
     np.testing.assert_allclose(pred_sparse, pred_dense)
-    def test_reference_chain(self):
+
+def test_reference_chain():
     X = np.random.normal(size=(100, 2))
     y = np.random.normal(size=100)
     tmp_dat = lgb.Dataset(X, y)
@@ -979,13 +1000,14 @@ class TestEngine(unittest.TestCase):
     tmp_dat_val = tmp_dat.subset(np.arange(80, 100)).subset(np.arange(18))
     params = {'objective': 'regression_l2', 'metric': 'rmse'}
     evals_result = {}
-        gbm = lgb.train(params, tmp_dat_train, num_boost_round=20,
-                        valid_sets=[tmp_dat_train, tmp_dat_val],
-                        verbose_eval=False, evals_result=evals_result)
-        self.assertEqual(len(evals_result['training']['rmse']), 20)
-        self.assertEqual(len(evals_result['valid_1']['rmse']), 20)
-    def test_contribs(self):
+    lgb.train(params, tmp_dat_train, num_boost_round=20,
+              valid_sets=[tmp_dat_train, tmp_dat_val],
+              verbose_eval=False, evals_result=evals_result)
+    assert len(evals_result['training']['rmse']) == 20
+    assert len(evals_result['valid_1']['rmse']) == 20
+
+def test_contribs():
     X, y = load_breast_cancer(return_X_y=True)
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
     params = {
@@ -996,10 +1018,11 @@ class TestEngine(unittest.TestCase):
     lgb_train = lgb.Dataset(X_train, y_train)
     gbm = lgb.train(params, lgb_train, num_boost_round=20)
-        self.assertLess(np.linalg.norm(gbm.predict(X_test, raw_score=True)
-                                       - np.sum(gbm.predict(X_test, pred_contrib=True), axis=1)), 1e-4)
-    def test_contribs_sparse(self):
+    assert (np.linalg.norm(gbm.predict(X_test, raw_score=True)
+                           - np.sum(gbm.predict(X_test, pred_contrib=True), axis=1)) < 1e-4)
+
+def test_contribs_sparse():
     n_features = 20
     n_samples = 100
     # generate CSR sparse dataset
@@ -1017,21 +1040,22 @@ class TestEngine(unittest.TestCase):
     lgb_train = lgb.Dataset(X_train, y_train)
     gbm = lgb.train(params, lgb_train, num_boost_round=20)
     contribs_csr = gbm.predict(X_test, pred_contrib=True)
-        self.assertTrue(isspmatrix_csr(contribs_csr))
+    assert isspmatrix_csr(contribs_csr)
     # convert data to dense and get back same contribs
     contribs_dense = gbm.predict(X_test.toarray(), pred_contrib=True)
     # validate the values are the same
     np.testing.assert_allclose(contribs_csr.toarray(), contribs_dense)
-        self.assertLess(np.linalg.norm(gbm.predict(X_test, raw_score=True)
-                                       - np.sum(contribs_dense, axis=1)), 1e-4)
+    assert (np.linalg.norm(gbm.predict(X_test, raw_score=True)
+                           - np.sum(contribs_dense, axis=1)) < 1e-4)
     # validate using CSC matrix
     X_test_csc = X_test.tocsc()
     contribs_csc = gbm.predict(X_test_csc, pred_contrib=True)
-        self.assertTrue(isspmatrix_csc(contribs_csc))
+    assert isspmatrix_csc(contribs_csc)
     # validate the values are the same
     np.testing.assert_allclose(contribs_csc.toarray(), contribs_dense)
-    def test_contribs_sparse_multiclass(self):
+
+def test_contribs_sparse_multiclass():
     n_features = 20
     n_samples = 100
     n_labels = 4
@@ -1051,9 +1075,9 @@ class TestEngine(unittest.TestCase):
     lgb_train = lgb.Dataset(X_train, y_train)
     gbm = lgb.train(params, lgb_train, num_boost_round=20)
     contribs_csr = gbm.predict(X_test, pred_contrib=True)
-        self.assertTrue(isinstance(contribs_csr, list))
+    assert isinstance(contribs_csr, list)
     for perclass_contribs_csr in contribs_csr:
-            self.assertTrue(isspmatrix_csr(perclass_contribs_csr))
+        assert isspmatrix_csr(perclass_contribs_csr)
     # convert data to dense and get back same contribs
     contribs_dense = gbm.predict(X_test.toarray(), pred_contrib=True)
     # validate the values are the same
@@ -1062,22 +1086,22 @@ class TestEngine(unittest.TestCase):
                                                      contribs_csr_array.shape[1] * contribs_csr_array.shape[2]))
     np.testing.assert_allclose(contribs_csr_arr_re, contribs_dense)
     contribs_dense_re = contribs_dense.reshape(contribs_csr_array.shape)
-        self.assertLess(np.linalg.norm(gbm.predict(X_test, raw_score=True)
-                                       - np.sum(contribs_dense_re, axis=2)), 1e-4)
+    assert np.linalg.norm(gbm.predict(X_test, raw_score=True) - np.sum(contribs_dense_re, axis=2)) < 1e-4
     # validate using CSC matrix
     X_test_csc = X_test.tocsc()
     contribs_csc = gbm.predict(X_test_csc, pred_contrib=True)
-        self.assertTrue(isinstance(contribs_csc, list))
+    assert isinstance(contribs_csc, list)
     for perclass_contribs_csc in contribs_csc:
-            self.assertTrue(isspmatrix_csc(perclass_contribs_csc))
+        assert isspmatrix_csc(perclass_contribs_csc)
     # validate the values are the same
     contribs_csc_array = np.swapaxes(np.array([sparse_array.todense() for sparse_array in contribs_csc]), 0, 1)
     contribs_csc_array = contribs_csc_array.reshape((contribs_csc_array.shape[0],
                                                      contribs_csc_array.shape[1] * contribs_csc_array.shape[2]))
     np.testing.assert_allclose(contribs_csc_array, contribs_dense)
-    @unittest.skipIf(psutil.virtual_memory().available / 1024 / 1024 / 1024 < 3, 'not enough RAM')
-    def test_int32_max_sparse_contribs(self):
+
+@pytest.mark.skipif(psutil.virtual_memory().available / 1024 / 1024 / 1024 < 3, reason='not enough RAM')
+def test_int32_max_sparse_contribs():
     params = {
         'objective': 'binary'
     }
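@pytest.mark.skipif takes the same condition as unittest.skipIf, but the message must be passed as the reason keyword. A minimal sketch reusing the RAM check from the hunk above; the body is a placeholder, not the real test:

import psutil
import pytest

@pytest.mark.skipif(psutil.virtual_memory().available / 1024 / 1024 / 1024 < 3, reason='not enough RAM')
def test_memory_hungry():
    assert psutil.virtual_memory().total > 0  # placeholder body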
@@ -1093,12 +1117,13 @@ class TestEngine(unittest.TestCase):
     y_pred_csr = gbm.predict(test_features, pred_contrib=True)
     # Note there is an extra column added to the output for the expected value
     csr_output_shape = (csr_input_shape[0], csr_input_shape[1] + 1)
-        self.assertTupleEqual(y_pred_csr.shape, csr_output_shape)
+    assert y_pred_csr.shape == csr_output_shape
     y_pred_csc = gbm.predict(test_features.tocsc(), pred_contrib=True)
     # Note output CSC shape should be same as CSR output shape
-        self.assertTupleEqual(y_pred_csc.shape, csr_output_shape)
-    def test_sliced_data(self):
+    assert y_pred_csc.shape == csr_output_shape
+
+def test_sliced_data():
     def train_and_get_predictions(features, labels):
         dataset = lgb.Dataset(features, label=labels)
         lgb_params = {
@@ -1136,17 +1161,18 @@ class TestEngine(unittest.TestCase):
     stacked_features = np.concatenate((stacked_features, np.ones(9, dtype=np.float32).reshape((1, 9))), axis=0)
     # test sliced 2d matrix
     sliced_features = stacked_features[2:102, 2:7]
-        self.assertTrue(np.all(sliced_features == features))
+    assert np.all(sliced_features == features)
     sliced_pred = train_and_get_predictions(sliced_features, sliced_labels)
     np.testing.assert_allclose(origin_pred, sliced_pred)
     # test sliced CSR
     stacked_csr = csr_matrix(stacked_features)
     sliced_csr = stacked_csr[2:102, 2:7]
-        self.assertTrue(np.all(sliced_csr == features))
+    assert np.all(sliced_csr == features)
     sliced_pred = train_and_get_predictions(sliced_csr, sliced_labels)
     np.testing.assert_allclose(origin_pred, sliced_pred)
-    def test_init_with_subset(self):
+
+def test_init_with_subset():
     data = np.random.random((50, 2))
     y = [1] * 25 + [0] * 25
     lgb_train = lgb.Dataset(data, y, free_raw_data=False)
@@ -1162,13 +1188,13 @@ class TestEngine(unittest.TestCase):
                         train_set=subset_data_1,
                         num_boost_round=10,
                         keep_training_booster=True)
-        gbm = lgb.train(params=params,
-                        train_set=subset_data_2,
-                        num_boost_round=10,
-                        init_model=init_gbm)
-        self.assertEqual(lgb_train.get_data().shape[0], 50)
-        self.assertEqual(subset_data_1.get_data().shape[0], 30)
-        self.assertEqual(subset_data_2.get_data().shape[0], 20)
+    lgb.train(params=params,
+              train_set=subset_data_2,
+              num_boost_round=10,
+              init_model=init_gbm)
+    assert lgb_train.get_data().shape[0] == 50
+    assert subset_data_1.get_data().shape[0] == 30
+    assert subset_data_2.get_data().shape[0] == 20
     lgb_train.save_binary("lgb_train_data.bin")
     lgb_train_from_file = lgb.Dataset('lgb_train_data.bin', free_raw_data=False)
     subset_data_3 = lgb_train_from_file.subset(subset_index_1)
@@ -1178,15 +1204,16 @@ class TestEngine(unittest.TestCase):
                             num_boost_round=10,
                             keep_training_booster=True)
     with np.testing.assert_raises_regex(lgb.basic.LightGBMError, "Unknown format of training data"):
-            gbm = lgb.train(params=params,
-                            train_set=subset_data_4,
-                            num_boost_round=10,
-                            init_model=init_gbm_2)
+        lgb.train(params=params,
+                  train_set=subset_data_4,
+                  num_boost_round=10,
+                  init_model=init_gbm_2)
-        self.assertEqual(lgb_train_from_file.get_data(), "lgb_train_data.bin")
-        self.assertEqual(subset_data_3.get_data(), "lgb_train_data.bin")
-        self.assertEqual(subset_data_4.get_data(), "lgb_train_data.bin")
+    assert lgb_train_from_file.get_data() == "lgb_train_data.bin"
+    assert subset_data_3.get_data() == "lgb_train_data.bin"
+    assert subset_data_4.get_data() == "lgb_train_data.bin"
-    def generate_trainset_for_monotone_constraints_tests(self, x3_to_category=True):
+
+def generate_trainset_for_monotone_constraints_tests(x3_to_category=True):
     number_of_dpoints = 3000
     x1_positively_correlated_with_y = np.random.random(size=number_of_dpoints)
     x2_negatively_correlated_with_y = np.random.random(size=number_of_dpoints)
@@ -1211,7 +1238,8 @@ class TestEngine(unittest.TestCase):
     trainset = lgb.Dataset(x, label=y, categorical_feature=categorical_features, free_raw_data=False)
     return trainset
-    def test_monotone_constraints(self):
+
+def test_monotone_constraints():
     def is_increasing(y):
         return (np.diff(y) >= 0.0).all()
...@@ -1243,7 +1271,7 @@ class TestEngine(unittest.TestCase): ...@@ -1243,7 +1271,7 @@ class TestEngine(unittest.TestCase):
return True return True
for test_with_categorical_variable in [True, False]: for test_with_categorical_variable in [True, False]:
trainset = self.generate_trainset_for_monotone_constraints_tests(test_with_categorical_variable) trainset = generate_trainset_for_monotone_constraints_tests(test_with_categorical_variable)
for monotone_constraints_method in ["basic", "intermediate", "advanced"]: for monotone_constraints_method in ["basic", "intermediate", "advanced"]:
params = { params = {
'min_data': 20, 'min_data': 20,
...@@ -1253,9 +1281,10 @@ class TestEngine(unittest.TestCase): ...@@ -1253,9 +1281,10 @@ class TestEngine(unittest.TestCase):
"use_missing": False, "use_missing": False,
} }
constrained_model = lgb.train(params, trainset) constrained_model = lgb.train(params, trainset)
self.assertTrue(is_correctly_constrained(constrained_model, test_with_categorical_variable)) assert is_correctly_constrained(constrained_model, test_with_categorical_variable)
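As a standalone illustration of the parameters under test (toy data assumed, not taken from the suite): monotone constraints are given per feature, 1 for non-decreasing, -1 for non-increasing, 0 for unconstrained, and monotone_constraints_method picks the enforcement strategy.

import numpy as np
import lightgbm as lgb

X = np.random.random((500, 3))
y = X[:, 0] - X[:, 1] + 0.1 * np.random.random(500)
params = {
    'objective': 'regression',
    'monotone_constraints': [1, -1, 0],         # per-feature direction
    'monotone_constraints_method': 'advanced',  # or 'basic' / 'intermediate'
    'verbose': -1,
}
model = lgb.train(params, lgb.Dataset(X, label=y), num_boost_round=10)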
def test_monotone_penalty(self): def test_monotone_penalty():
def are_first_splits_non_monotone(tree, n, monotone_constraints): def are_first_splits_non_monotone(tree, n, monotone_constraints):
if n <= 0: if n <= 0:
return True return True
...@@ -1277,7 +1306,7 @@ class TestEngine(unittest.TestCase): ...@@ -1277,7 +1306,7 @@ class TestEngine(unittest.TestCase):
max_depth = 5 max_depth = 5
monotone_constraints = [1, -1, 0] monotone_constraints = [1, -1, 0]
penalization_parameter = 2.0 penalization_parameter = 2.0
trainset = self.generate_trainset_for_monotone_constraints_tests(x3_to_category=False) trainset = generate_trainset_for_monotone_constraints_tests(x3_to_category=False)
for monotone_constraints_method in ["basic", "intermediate", "advanced"]: for monotone_constraints_method in ["basic", "intermediate", "advanced"]:
params = { params = {
'max_depth': max_depth, 'max_depth': max_depth,
...@@ -1288,16 +1317,17 @@ class TestEngine(unittest.TestCase): ...@@ -1288,16 +1317,17 @@ class TestEngine(unittest.TestCase):
constrained_model = lgb.train(params, trainset, 10) constrained_model = lgb.train(params, trainset, 10)
dumped_model = constrained_model.dump_model()["tree_info"] dumped_model = constrained_model.dump_model()["tree_info"]
for tree in dumped_model: for tree in dumped_model:
self.assertTrue(are_first_splits_non_monotone(tree["tree_structure"], int(penalization_parameter), assert are_first_splits_non_monotone(tree["tree_structure"], int(penalization_parameter),
monotone_constraints)) monotone_constraints)
self.assertTrue(are_there_monotone_splits(tree["tree_structure"], monotone_constraints)) assert are_there_monotone_splits(tree["tree_structure"], monotone_constraints)
# test if a penalty as high as the depth indeed prohibits all monotone splits
def test_monotone_penalty_max(self): # test if a penalty as high as the depth indeed prohibits all monotone splits
def test_monotone_penalty_max():
max_depth = 5 max_depth = 5
monotone_constraints = [1, -1, 0] monotone_constraints = [1, -1, 0]
penalization_parameter = max_depth penalization_parameter = max_depth
trainset_constrained_model = self.generate_trainset_for_monotone_constraints_tests(x3_to_category=False) trainset_constrained_model = generate_trainset_for_monotone_constraints_tests(x3_to_category=False)
x = trainset_constrained_model.data x = trainset_constrained_model.data
y = trainset_constrained_model.label y = trainset_constrained_model.label
x3_negatively_correlated_with_y = x[:, 2] x3_negatively_correlated_with_y = x[:, 2]
...@@ -1325,7 +1355,8 @@ class TestEngine(unittest.TestCase): ...@@ -1325,7 +1355,8 @@ class TestEngine(unittest.TestCase):
# Check that a very high penalization is the same as not using the features at all # Check that a very high penalization is the same as not using the features at all
np.testing.assert_array_equal(constrained_model.predict(x), unconstrained_model_predictions) np.testing.assert_array_equal(constrained_model.predict(x), unconstrained_model_predictions)
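For context on the check above, a sketch (assumed toy data) of the penalty parameter: monotone_penalty de-prioritizes splits on constrained features near the root, and once it reaches max_depth those features cannot be split on at all, which is what the equal-predictions assertion verifies.

import numpy as np
import lightgbm as lgb

X = np.random.random((500, 3))
y = X[:, 2] + 0.1 * np.random.random(500)
params = {
    'objective': 'regression',
    'max_depth': 5,
    'monotone_constraints': [1, -1, 0],
    'monotone_penalty': 5,  # == max_depth, so features 0 and 1 are never split on
    'verbose': -1,
}
model = lgb.train(params, lgb.Dataset(X, label=y), num_boost_round=10)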
def test_max_bin_by_feature(self):
def test_max_bin_by_feature():
col1 = np.arange(0, 100)[:, np.newaxis] col1 = np.arange(0, 100)[:, np.newaxis]
col2 = np.zeros((100, 1)) col2 = np.zeros((100, 1))
col2[20:] = 1 col2[20:] = 1
...@@ -1342,13 +1373,14 @@ class TestEngine(unittest.TestCase): ...@@ -1342,13 +1373,14 @@ class TestEngine(unittest.TestCase):
} }
lgb_data = lgb.Dataset(X, label=y) lgb_data = lgb.Dataset(X, label=y)
est = lgb.train(params, lgb_data, num_boost_round=1) est = lgb.train(params, lgb_data, num_boost_round=1)
self.assertEqual(len(np.unique(est.predict(X))), 100) assert len(np.unique(est.predict(X))) == 100
params['max_bin_by_feature'] = [2, 100] params['max_bin_by_feature'] = [2, 100]
lgb_data = lgb.Dataset(X, label=y) lgb_data = lgb.Dataset(X, label=y)
est = lgb.train(params, lgb_data, num_boost_round=1) est = lgb.train(params, lgb_data, num_boost_round=1)
self.assertEqual(len(np.unique(est.predict(X))), 3) assert len(np.unique(est.predict(X))) == 3
def test_small_max_bin(self): def test_small_max_bin():
np.random.seed(0) np.random.seed(0)
y = np.random.choice([0, 1], 100) y = np.random.choice([0, 1], 100)
x = np.zeros((100, 1)) x = np.zeros((100, 1))
...@@ -1368,7 +1400,8 @@ class TestEngine(unittest.TestCase): ...@@ -1368,7 +1400,8 @@ class TestEngine(unittest.TestCase):
lgb.train(params, lgb_x, num_boost_round=5) lgb.train(params, lgb_x, num_boost_round=5)
np.random.seed() # reset seed np.random.seed() # reset seed
def test_refit(self):
def test_refit():
X, y = load_breast_cancer(return_X_y=True) X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
params = { params = {
...@@ -1382,9 +1415,10 @@ class TestEngine(unittest.TestCase): ...@@ -1382,9 +1415,10 @@ class TestEngine(unittest.TestCase):
err_pred = log_loss(y_test, gbm.predict(X_test)) err_pred = log_loss(y_test, gbm.predict(X_test))
new_gbm = gbm.refit(X_test, y_test) new_gbm = gbm.refit(X_test, y_test)
new_err_pred = log_loss(y_test, new_gbm.predict(X_test)) new_err_pred = log_loss(y_test, new_gbm.predict(X_test))
self.assertGreater(err_pred, new_err_pred) assert err_pred > new_err_pred
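The assertion above holds because Booster.refit() keeps the already-learned tree structures and only recomputes leaf values on the new data, so refitting on the test set necessarily tightens the fit there. A minimal sketch, reusing the same sklearn dataset:

import lightgbm as lgb
from sklearn.datasets import load_breast_cancer

X, y = load_breast_cancer(return_X_y=True)
booster = lgb.train({'objective': 'binary', 'verbose': -1},
                    lgb.Dataset(X, y), num_boost_round=20)
refitted = booster.refit(X, y)  # same splits, new leaf outputs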
def test_mape_rf(self): def test_mape_rf():
X, y = load_boston(return_X_y=True) X, y = load_boston(return_X_y=True)
params = { params = {
'boosting_type': 'rf', 'boosting_type': 'rf',
...@@ -1399,9 +1433,10 @@ class TestEngine(unittest.TestCase): ...@@ -1399,9 +1433,10 @@ class TestEngine(unittest.TestCase):
gbm = lgb.train(params, lgb_train, num_boost_round=20) gbm = lgb.train(params, lgb_train, num_boost_round=20)
pred = gbm.predict(X) pred = gbm.predict(X)
pred_mean = pred.mean() pred_mean = pred.mean()
self.assertGreater(pred_mean, 20) assert pred_mean > 20
def test_mape_dart(self):
def test_mape_dart():
X, y = load_boston(return_X_y=True) X, y = load_boston(return_X_y=True)
params = { params = {
'boosting_type': 'dart', 'boosting_type': 'dart',
...@@ -1416,9 +1451,10 @@ class TestEngine(unittest.TestCase): ...@@ -1416,9 +1451,10 @@ class TestEngine(unittest.TestCase):
gbm = lgb.train(params, lgb_train, num_boost_round=40) gbm = lgb.train(params, lgb_train, num_boost_round=40)
pred = gbm.predict(X) pred = gbm.predict(X)
pred_mean = pred.mean() pred_mean = pred.mean()
self.assertGreater(pred_mean, 18) assert pred_mean > 18
def check_constant_features(self, y_true, expected_pred, more_params): def check_constant_features(y_true, expected_pred, more_params):
X_train = np.ones((len(y_true), 1)) X_train = np.ones((len(y_true), 1))
y_train = np.array(y_true) y_train = np.array(y_true)
params = { params = {
...@@ -1435,40 +1471,45 @@ class TestEngine(unittest.TestCase): ...@@ -1435,40 +1471,45 @@ class TestEngine(unittest.TestCase):
lgb_train = lgb.Dataset(X_train, y_train, params=params) lgb_train = lgb.Dataset(X_train, y_train, params=params)
gbm = lgb.train(params, lgb_train, num_boost_round=2) gbm = lgb.train(params, lgb_train, num_boost_round=2)
pred = gbm.predict(X_train) pred = gbm.predict(X_train)
self.assertTrue(np.allclose(pred, expected_pred)) assert np.allclose(pred, expected_pred)
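The expected values in the tests below are just the optimal constants for each objective: with a single constant feature no split is possible, so the booster converges to the label mean for regression and the class frequencies for the (one-vs-all) multiclass objectives. The arithmetic, spelled out:

import numpy as np

y_reg = np.array([0.0, 10.0, 0.0, 10.0])
print(y_reg.mean())                     # regression -> 5.0

y_cls = np.array([0, 1, 2, 0])
print(np.bincount(y_cls) / len(y_cls))  # multiclass -> [0.5, 0.25, 0.25]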
def test_constant_features_regression(self):
def test_constant_features_regression():
params = { params = {
'objective': 'regression' 'objective': 'regression'
} }
self.check_constant_features([0.0, 10.0, 0.0, 10.0], 5.0, params) check_constant_features([0.0, 10.0, 0.0, 10.0], 5.0, params)
self.check_constant_features([0.0, 1.0, 2.0, 3.0], 1.5, params) check_constant_features([0.0, 1.0, 2.0, 3.0], 1.5, params)
self.check_constant_features([-1.0, 1.0, -2.0, 2.0], 0.0, params) check_constant_features([-1.0, 1.0, -2.0, 2.0], 0.0, params)
def test_constant_features_binary(self): def test_constant_features_binary():
params = { params = {
'objective': 'binary' 'objective': 'binary'
} }
self.check_constant_features([0.0, 10.0, 0.0, 10.0], 0.5, params) check_constant_features([0.0, 10.0, 0.0, 10.0], 0.5, params)
self.check_constant_features([0.0, 1.0, 2.0, 3.0], 0.75, params) check_constant_features([0.0, 1.0, 2.0, 3.0], 0.75, params)
def test_constant_features_multiclass(self):
def test_constant_features_multiclass():
params = { params = {
'objective': 'multiclass', 'objective': 'multiclass',
'num_class': 3 'num_class': 3
} }
self.check_constant_features([0.0, 1.0, 2.0, 0.0], [0.5, 0.25, 0.25], params) check_constant_features([0.0, 1.0, 2.0, 0.0], [0.5, 0.25, 0.25], params)
self.check_constant_features([0.0, 1.0, 2.0, 1.0], [0.25, 0.5, 0.25], params) check_constant_features([0.0, 1.0, 2.0, 1.0], [0.25, 0.5, 0.25], params)
def test_constant_features_multiclassova(self): def test_constant_features_multiclassova():
params = { params = {
'objective': 'multiclassova', 'objective': 'multiclassova',
'num_class': 3 'num_class': 3
} }
self.check_constant_features([0.0, 1.0, 2.0, 0.0], [0.5, 0.25, 0.25], params) check_constant_features([0.0, 1.0, 2.0, 0.0], [0.5, 0.25, 0.25], params)
self.check_constant_features([0.0, 1.0, 2.0, 1.0], [0.25, 0.5, 0.25], params) check_constant_features([0.0, 1.0, 2.0, 1.0], [0.25, 0.5, 0.25], params)
def test_fpreproc(self): def test_fpreproc():
def preprocess_data(dtrain, dtest, params): def preprocess_data(dtrain, dtest, params):
train_data = dtrain.construct().get_data() train_data = dtrain.construct().get_data()
test_data = dtest.construct().get_data() test_data = dtest.construct().get_data()
...@@ -1485,10 +1526,11 @@ class TestEngine(unittest.TestCase): ...@@ -1485,10 +1526,11 @@ class TestEngine(unittest.TestCase):
dataset = lgb.Dataset(X, y, free_raw_data=False) dataset = lgb.Dataset(X, y, free_raw_data=False)
params = {'objective': 'multiclass', 'num_class': 3, 'verbose': -1} params = {'objective': 'multiclass', 'num_class': 3, 'verbose': -1}
results = lgb.cv(params, dataset, num_boost_round=10, fpreproc=preprocess_data) results = lgb.cv(params, dataset, num_boost_round=10, fpreproc=preprocess_data)
self.assertIn('multi_logloss-mean', results) assert 'multi_logloss-mean' in results
self.assertEqual(len(results['multi_logloss-mean']), 10) assert len(results['multi_logloss-mean']) == 10
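fpreproc, as used above, is applied by lgb.cv to every fold before training and must return the (possibly modified) train set, validation set, and params. A sketch of the contract; the learning-rate tweak is an arbitrary illustrative change, not what the test does:

def halve_learning_rate(dtrain, dtest, params):
    # receives the fold's train/validation Datasets plus the params dict
    params = dict(params, learning_rate=params.get('learning_rate', 0.1) / 2)
    return dtrain, dtest, params

# usage: lgb.cv(params, dataset, num_boost_round=10, fpreproc=halve_learning_rate)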
def test_metrics(self):
def test_metrics():
X, y = load_digits(n_class=2, return_X_y=True) X, y = load_digits(n_class=2, return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
lgb_train = lgb.Dataset(X_train, y_train, silent=True) lgb_train = lgb.Dataset(X_train, y_train, silent=True)
...@@ -1523,277 +1565,277 @@ class TestEngine(unittest.TestCase): ...@@ -1523,277 +1565,277 @@ class TestEngine(unittest.TestCase):
# no fobj, no feval # no fobj, no feval
# default metric # default metric
res = get_cv_result() res = get_cv_result()
self.assertEqual(len(res), 2) assert len(res) == 2
self.assertIn('binary_logloss-mean', res) assert 'binary_logloss-mean' in res
# non-default metric in params # non-default metric in params
res = get_cv_result(params=params_obj_metric_err_verbose) res = get_cv_result(params=params_obj_metric_err_verbose)
self.assertEqual(len(res), 2) assert len(res) == 2
self.assertIn('binary_error-mean', res) assert 'binary_error-mean' in res
# default metric in args # default metric in args
res = get_cv_result(metrics='binary_logloss') res = get_cv_result(metrics='binary_logloss')
self.assertEqual(len(res), 2) assert len(res) == 2
self.assertIn('binary_logloss-mean', res) assert 'binary_logloss-mean' in res
# non-default metric in args # non-default metric in args
res = get_cv_result(metrics='binary_error') res = get_cv_result(metrics='binary_error')
self.assertEqual(len(res), 2) assert len(res) == 2
self.assertIn('binary_error-mean', res) assert 'binary_error-mean' in res
# metric in args overwrites one in params # metric in args overwrites one in params
res = get_cv_result(params=params_obj_metric_inv_verbose, metrics='binary_error') res = get_cv_result(params=params_obj_metric_inv_verbose, metrics='binary_error')
self.assertEqual(len(res), 2) assert len(res) == 2
self.assertIn('binary_error-mean', res) assert 'binary_error-mean' in res
# multiple metrics in params # multiple metrics in params
res = get_cv_result(params=params_obj_metric_multi_verbose) res = get_cv_result(params=params_obj_metric_multi_verbose)
self.assertEqual(len(res), 4) assert len(res) == 4
self.assertIn('binary_logloss-mean', res) assert 'binary_logloss-mean' in res
self.assertIn('binary_error-mean', res) assert 'binary_error-mean' in res
# multiple metrics in args # multiple metrics in args
res = get_cv_result(metrics=['binary_logloss', 'binary_error']) res = get_cv_result(metrics=['binary_logloss', 'binary_error'])
self.assertEqual(len(res), 4) assert len(res) == 4
self.assertIn('binary_logloss-mean', res) assert 'binary_logloss-mean' in res
self.assertIn('binary_error-mean', res) assert 'binary_error-mean' in res
# remove the default metric by passing 'None' in a list # remove the default metric by passing 'None' in a list
res = get_cv_result(metrics=['None']) res = get_cv_result(metrics=['None'])
self.assertEqual(len(res), 0) assert len(res) == 0
# remove the default metric via 'None' aliases # remove the default metric via 'None' aliases
for na_alias in ('None', 'na', 'null', 'custom'): for na_alias in ('None', 'na', 'null', 'custom'):
res = get_cv_result(metrics=na_alias) res = get_cv_result(metrics=na_alias)
self.assertEqual(len(res), 0) assert len(res) == 0
# fobj, no feval # fobj, no feval
# no default metric # no default metric
res = get_cv_result(params=params_verbose, fobj=dummy_obj) res = get_cv_result(params=params_verbose, fobj=dummy_obj)
self.assertEqual(len(res), 0) assert len(res) == 0
# metric in params # metric in params
res = get_cv_result(params=params_metric_err_verbose, fobj=dummy_obj) res = get_cv_result(params=params_metric_err_verbose, fobj=dummy_obj)
self.assertEqual(len(res), 2) assert len(res) == 2
self.assertIn('binary_error-mean', res) assert 'binary_error-mean' in res
# metric in args # metric in args
res = get_cv_result(params=params_verbose, fobj=dummy_obj, metrics='binary_error') res = get_cv_result(params=params_verbose, fobj=dummy_obj, metrics='binary_error')
self.assertEqual(len(res), 2) assert len(res) == 2
self.assertIn('binary_error-mean', res) assert 'binary_error-mean' in res
# metric in args overwrites its alias in params # metric in args overwrites its alias in params
res = get_cv_result(params=params_metric_inv_verbose, fobj=dummy_obj, metrics='binary_error') res = get_cv_result(params=params_metric_inv_verbose, fobj=dummy_obj, metrics='binary_error')
self.assertEqual(len(res), 2) assert len(res) == 2
self.assertIn('binary_error-mean', res) assert 'binary_error-mean' in res
# multiple metrics in params # multiple metrics in params
res = get_cv_result(params=params_metric_multi_verbose, fobj=dummy_obj) res = get_cv_result(params=params_metric_multi_verbose, fobj=dummy_obj)
self.assertEqual(len(res), 4) assert len(res) == 4
self.assertIn('binary_logloss-mean', res) assert 'binary_logloss-mean' in res
self.assertIn('binary_error-mean', res) assert 'binary_error-mean' in res
# multiple metrics in args # multiple metrics in args
res = get_cv_result(params=params_verbose, fobj=dummy_obj, res = get_cv_result(params=params_verbose, fobj=dummy_obj,
metrics=['binary_logloss', 'binary_error']) metrics=['binary_logloss', 'binary_error'])
self.assertEqual(len(res), 4) assert len(res) == 4
self.assertIn('binary_logloss-mean', res) assert 'binary_logloss-mean' in res
self.assertIn('binary_error-mean', res) assert 'binary_error-mean' in res
# no fobj, feval # no fobj, feval
# default metric with custom one # default metric with custom one
res = get_cv_result(feval=constant_metric) res = get_cv_result(feval=constant_metric)
self.assertEqual(len(res), 4) assert len(res) == 4
self.assertIn('binary_logloss-mean', res) assert 'binary_logloss-mean' in res
self.assertIn('error-mean', res) assert 'error-mean' in res
# non-default metric in params with custom one # non-default metric in params with custom one
res = get_cv_result(params=params_obj_metric_err_verbose, feval=constant_metric) res = get_cv_result(params=params_obj_metric_err_verbose, feval=constant_metric)
self.assertEqual(len(res), 4) assert len(res) == 4
self.assertIn('binary_error-mean', res) assert 'binary_error-mean' in res
self.assertIn('error-mean', res) assert 'error-mean' in res
# default metric in args with custom one # default metric in args with custom one
res = get_cv_result(metrics='binary_logloss', feval=constant_metric) res = get_cv_result(metrics='binary_logloss', feval=constant_metric)
self.assertEqual(len(res), 4) assert len(res) == 4
self.assertIn('binary_logloss-mean', res) assert 'binary_logloss-mean' in res
self.assertIn('error-mean', res) assert 'error-mean' in res
# non-default metric in args with custom one # non-default metric in args with custom one
res = get_cv_result(metrics='binary_error', feval=constant_metric) res = get_cv_result(metrics='binary_error', feval=constant_metric)
self.assertEqual(len(res), 4) assert len(res) == 4
self.assertIn('binary_error-mean', res) assert 'binary_error-mean' in res
self.assertIn('error-mean', res) assert 'error-mean' in res
# metric in args overwrites one in params, custom one is evaluated too # metric in args overwrites one in params, custom one is evaluated too
res = get_cv_result(params=params_obj_metric_inv_verbose, metrics='binary_error', feval=constant_metric) res = get_cv_result(params=params_obj_metric_inv_verbose, metrics='binary_error', feval=constant_metric)
self.assertEqual(len(res), 4) assert len(res) == 4
self.assertIn('binary_error-mean', res) assert 'binary_error-mean' in res
self.assertIn('error-mean', res) assert 'error-mean' in res
# multiple metrics in params with custom one # multiple metrics in params with custom one
res = get_cv_result(params=params_obj_metric_multi_verbose, feval=constant_metric) res = get_cv_result(params=params_obj_metric_multi_verbose, feval=constant_metric)
self.assertEqual(len(res), 6) assert len(res) == 6
self.assertIn('binary_logloss-mean', res) assert 'binary_logloss-mean' in res
self.assertIn('binary_error-mean', res) assert 'binary_error-mean' in res
self.assertIn('error-mean', res) assert 'error-mean' in res
# multiple metrics in args with custom one # multiple metrics in args with custom one
res = get_cv_result(metrics=['binary_logloss', 'binary_error'], feval=constant_metric) res = get_cv_result(metrics=['binary_logloss', 'binary_error'], feval=constant_metric)
self.assertEqual(len(res), 6) assert len(res) == 6
self.assertIn('binary_logloss-mean', res) assert 'binary_logloss-mean' in res
self.assertIn('binary_error-mean', res) assert 'binary_error-mean' in res
self.assertIn('error-mean', res) assert 'error-mean' in res
# custom metric is evaluated despite 'None' being passed # custom metric is evaluated despite 'None' being passed
res = get_cv_result(metrics=['None'], feval=constant_metric) res = get_cv_result(metrics=['None'], feval=constant_metric)
self.assertEqual(len(res), 2) assert len(res) == 2
self.assertIn('error-mean', res) assert 'error-mean' in res
# fobj, feval # fobj, feval
# no default metric, only custom one # no default metric, only custom one
res = get_cv_result(params=params_verbose, fobj=dummy_obj, feval=constant_metric) res = get_cv_result(params=params_verbose, fobj=dummy_obj, feval=constant_metric)
self.assertEqual(len(res), 2) assert len(res) == 2
self.assertIn('error-mean', res) assert 'error-mean' in res
# metric in params with custom one # metric in params with custom one
res = get_cv_result(params=params_metric_err_verbose, fobj=dummy_obj, feval=constant_metric) res = get_cv_result(params=params_metric_err_verbose, fobj=dummy_obj, feval=constant_metric)
self.assertEqual(len(res), 4) assert len(res) == 4
self.assertIn('binary_error-mean', res) assert 'binary_error-mean' in res
self.assertIn('error-mean', res) assert 'error-mean' in res
# metric in args with custom one # metric in args with custom one
res = get_cv_result(params=params_verbose, fobj=dummy_obj, res = get_cv_result(params=params_verbose, fobj=dummy_obj,
feval=constant_metric, metrics='binary_error') feval=constant_metric, metrics='binary_error')
self.assertEqual(len(res), 4) assert len(res) == 4
self.assertIn('binary_error-mean', res) assert 'binary_error-mean' in res
self.assertIn('error-mean', res) assert 'error-mean' in res
# metric in args overwrites one in params, custom one is evaluated too # metric in args overwrites one in params, custom one is evaluated too
res = get_cv_result(params=params_metric_inv_verbose, fobj=dummy_obj, res = get_cv_result(params=params_metric_inv_verbose, fobj=dummy_obj,
feval=constant_metric, metrics='binary_error') feval=constant_metric, metrics='binary_error')
self.assertEqual(len(res), 4) assert len(res) == 4
self.assertIn('binary_error-mean', res) assert 'binary_error-mean' in res
self.assertIn('error-mean', res) assert 'error-mean' in res
# multiple metrics in params with custom one # multiple metrics in params with custom one
res = get_cv_result(params=params_metric_multi_verbose, fobj=dummy_obj, feval=constant_metric) res = get_cv_result(params=params_metric_multi_verbose, fobj=dummy_obj, feval=constant_metric)
self.assertEqual(len(res), 6) assert len(res) == 6
self.assertIn('binary_logloss-mean', res) assert 'binary_logloss-mean' in res
self.assertIn('binary_error-mean', res) assert 'binary_error-mean' in res
self.assertIn('error-mean', res) assert 'error-mean' in res
# multiple metrics in args with custom one # multiple metrics in args with custom one
res = get_cv_result(params=params_verbose, fobj=dummy_obj, feval=constant_metric, res = get_cv_result(params=params_verbose, fobj=dummy_obj, feval=constant_metric,
metrics=['binary_logloss', 'binary_error']) metrics=['binary_logloss', 'binary_error'])
self.assertEqual(len(res), 6) assert len(res) == 6
self.assertIn('binary_logloss-mean', res) assert 'binary_logloss-mean' in res
self.assertIn('binary_error-mean', res) assert 'binary_error-mean' in res
self.assertIn('error-mean', res) assert 'error-mean' in res
# custom metric is evaluated despite 'None' being passed # custom metric is evaluated despite 'None' being passed
res = get_cv_result(params=params_metric_none_verbose, fobj=dummy_obj, feval=constant_metric) res = get_cv_result(params=params_metric_none_verbose, fobj=dummy_obj, feval=constant_metric)
self.assertEqual(len(res), 2) assert len(res) == 2
self.assertIn('error-mean', res) assert 'error-mean' in res
# no fobj, no feval # no fobj, no feval
# default metric # default metric
train_booster() train_booster()
self.assertEqual(len(evals_result['valid_0']), 1) assert len(evals_result['valid_0']) == 1
self.assertIn('binary_logloss', evals_result['valid_0']) assert 'binary_logloss' in evals_result['valid_0']
# default metric in params # default metric in params
train_booster(params=params_obj_metric_log_verbose) train_booster(params=params_obj_metric_log_verbose)
self.assertEqual(len(evals_result['valid_0']), 1) assert len(evals_result['valid_0']) == 1
self.assertIn('binary_logloss', evals_result['valid_0']) assert 'binary_logloss' in evals_result['valid_0']
# non-default metric in params # non-default metric in params
train_booster(params=params_obj_metric_err_verbose) train_booster(params=params_obj_metric_err_verbose)
self.assertEqual(len(evals_result['valid_0']), 1) assert len(evals_result['valid_0']) == 1
self.assertIn('binary_error', evals_result['valid_0']) assert 'binary_error' in evals_result['valid_0']
# multiple metrics in params # multiple metrics in params
train_booster(params=params_obj_metric_multi_verbose) train_booster(params=params_obj_metric_multi_verbose)
self.assertEqual(len(evals_result['valid_0']), 2) assert len(evals_result['valid_0']) == 2
self.assertIn('binary_logloss', evals_result['valid_0']) assert 'binary_logloss' in evals_result['valid_0']
self.assertIn('binary_error', evals_result['valid_0']) assert 'binary_error' in evals_result['valid_0']
# remove the default metric via 'None' aliases # remove the default metric via 'None' aliases
for na_alias in ('None', 'na', 'null', 'custom'): for na_alias in ('None', 'na', 'null', 'custom'):
params = {'objective': 'binary', 'metric': na_alias, 'verbose': -1} params = {'objective': 'binary', 'metric': na_alias, 'verbose': -1}
train_booster(params=params) train_booster(params=params)
self.assertEqual(len(evals_result), 0) assert len(evals_result) == 0
# fobj, no feval # fobj, no feval
# no default metric # no default metric
train_booster(params=params_verbose, fobj=dummy_obj) train_booster(params=params_verbose, fobj=dummy_obj)
self.assertEqual(len(evals_result), 0) assert len(evals_result) == 0
# metric in params # metric in params
train_booster(params=params_metric_log_verbose, fobj=dummy_obj) train_booster(params=params_metric_log_verbose, fobj=dummy_obj)
self.assertEqual(len(evals_result['valid_0']), 1) assert len(evals_result['valid_0']) == 1
self.assertIn('binary_logloss', evals_result['valid_0']) assert 'binary_logloss' in evals_result['valid_0']
# multiple metrics in params # multiple metrics in params
train_booster(params=params_metric_multi_verbose, fobj=dummy_obj) train_booster(params=params_metric_multi_verbose, fobj=dummy_obj)
self.assertEqual(len(evals_result['valid_0']), 2) assert len(evals_result['valid_0']) == 2
self.assertIn('binary_logloss', evals_result['valid_0']) assert 'binary_logloss' in evals_result['valid_0']
self.assertIn('binary_error', evals_result['valid_0']) assert 'binary_error' in evals_result['valid_0']
# no fobj, feval # no fobj, feval
# default metric with custom one # default metric with custom one
train_booster(feval=constant_metric) train_booster(feval=constant_metric)
self.assertEqual(len(evals_result['valid_0']), 2) assert len(evals_result['valid_0']) == 2
self.assertIn('binary_logloss', evals_result['valid_0']) assert 'binary_logloss' in evals_result['valid_0']
self.assertIn('error', evals_result['valid_0']) assert 'error' in evals_result['valid_0']
# default metric in params with custom one # default metric in params with custom one
train_booster(params=params_obj_metric_log_verbose, feval=constant_metric) train_booster(params=params_obj_metric_log_verbose, feval=constant_metric)
self.assertEqual(len(evals_result['valid_0']), 2) assert len(evals_result['valid_0']) == 2
self.assertIn('binary_logloss', evals_result['valid_0']) assert 'binary_logloss' in evals_result['valid_0']
self.assertIn('error', evals_result['valid_0']) assert 'error' in evals_result['valid_0']
# non-default metric in params with custom one # non-default metric in params with custom one
train_booster(params=params_obj_metric_err_verbose, feval=constant_metric) train_booster(params=params_obj_metric_err_verbose, feval=constant_metric)
self.assertEqual(len(evals_result['valid_0']), 2) assert len(evals_result['valid_0']) == 2
self.assertIn('binary_error', evals_result['valid_0']) assert 'binary_error' in evals_result['valid_0']
self.assertIn('error', evals_result['valid_0']) assert 'error' in evals_result['valid_0']
# multiple metrics in params with custom one # multiple metrics in params with custom one
train_booster(params=params_obj_metric_multi_verbose, feval=constant_metric) train_booster(params=params_obj_metric_multi_verbose, feval=constant_metric)
self.assertEqual(len(evals_result['valid_0']), 3) assert len(evals_result['valid_0']) == 3
self.assertIn('binary_logloss', evals_result['valid_0']) assert 'binary_logloss' in evals_result['valid_0']
self.assertIn('binary_error', evals_result['valid_0']) assert 'binary_error' in evals_result['valid_0']
self.assertIn('error', evals_result['valid_0']) assert 'error' in evals_result['valid_0']
# custom metric is evaluated despite 'None' being passed # custom metric is evaluated despite 'None' being passed
train_booster(params=params_obj_metric_none_verbose, feval=constant_metric) train_booster(params=params_obj_metric_none_verbose, feval=constant_metric)
self.assertEqual(len(evals_result), 1) assert len(evals_result) == 1
self.assertIn('error', evals_result['valid_0']) assert 'error' in evals_result['valid_0']
# fobj, feval # fobj, feval
# no default metric, only custom one # no default metric, only custom one
train_booster(params=params_verbose, fobj=dummy_obj, feval=constant_metric) train_booster(params=params_verbose, fobj=dummy_obj, feval=constant_metric)
self.assertEqual(len(evals_result['valid_0']), 1) assert len(evals_result['valid_0']) == 1
self.assertIn('error', evals_result['valid_0']) assert 'error' in evals_result['valid_0']
# metric in params with custom one # metric in params with custom one
train_booster(params=params_metric_log_verbose, fobj=dummy_obj, feval=constant_metric) train_booster(params=params_metric_log_verbose, fobj=dummy_obj, feval=constant_metric)
self.assertEqual(len(evals_result['valid_0']), 2) assert len(evals_result['valid_0']) == 2
self.assertIn('binary_logloss', evals_result['valid_0']) assert 'binary_logloss' in evals_result['valid_0']
self.assertIn('error', evals_result['valid_0']) assert 'error' in evals_result['valid_0']
# multiple metrics in params with custom one # multiple metrics in params with custom one
train_booster(params=params_metric_multi_verbose, fobj=dummy_obj, feval=constant_metric) train_booster(params=params_metric_multi_verbose, fobj=dummy_obj, feval=constant_metric)
self.assertEqual(len(evals_result['valid_0']), 3) assert len(evals_result['valid_0']) == 3
self.assertIn('binary_logloss', evals_result['valid_0']) assert 'binary_logloss' in evals_result['valid_0']
self.assertIn('binary_error', evals_result['valid_0']) assert 'binary_error' in evals_result['valid_0']
self.assertIn('error', evals_result['valid_0']) assert 'error' in evals_result['valid_0']
# custom metric is evaluated despite 'None' being passed # custom metric is evaluated despite 'None' being passed
train_booster(params=params_metric_none_verbose, fobj=dummy_obj, feval=constant_metric) train_booster(params=params_metric_none_verbose, fobj=dummy_obj, feval=constant_metric)
self.assertEqual(len(evals_result), 1) assert len(evals_result) == 1
self.assertIn('error', evals_result['valid_0']) assert 'error' in evals_result['valid_0']
X, y = load_digits(n_class=3, return_X_y=True) X, y = load_digits(n_class=3, return_X_y=True)
lgb_train = lgb.Dataset(X, y, silent=True) lgb_train = lgb.Dataset(X, y, silent=True)
...@@ -1805,64 +1847,65 @@ class TestEngine(unittest.TestCase): ...@@ -1805,64 +1847,65 @@ class TestEngine(unittest.TestCase):
params_obj_verbose = {'objective': obj_multi_alias, 'verbose': -1} params_obj_verbose = {'objective': obj_multi_alias, 'verbose': -1}
# multiclass default metric # multiclass default metric
res = get_cv_result(params_obj_class_3_verbose) res = get_cv_result(params_obj_class_3_verbose)
self.assertEqual(len(res), 2) assert len(res) == 2
self.assertIn('multi_logloss-mean', res) assert 'multi_logloss-mean' in res
# multiclass default metric with custom one # multiclass default metric with custom one
res = get_cv_result(params_obj_class_3_verbose, feval=constant_metric) res = get_cv_result(params_obj_class_3_verbose, feval=constant_metric)
self.assertEqual(len(res), 4) assert len(res) == 4
self.assertIn('multi_logloss-mean', res) assert 'multi_logloss-mean' in res
self.assertIn('error-mean', res) assert 'error-mean' in res
# multiclass metric alias with custom one for custom objective # multiclass metric alias with custom one for custom objective
res = get_cv_result(params_obj_class_3_verbose, fobj=dummy_obj, feval=constant_metric) res = get_cv_result(params_obj_class_3_verbose, fobj=dummy_obj, feval=constant_metric)
self.assertEqual(len(res), 2) assert len(res) == 2
self.assertIn('error-mean', res) assert 'error-mean' in res
# no metric for invalid class_num # no metric for invalid class_num
res = get_cv_result(params_obj_class_1_verbose, fobj=dummy_obj) res = get_cv_result(params_obj_class_1_verbose, fobj=dummy_obj)
self.assertEqual(len(res), 0) assert len(res) == 0
# custom metric for invalid class_num # custom metric for invalid class_num
res = get_cv_result(params_obj_class_1_verbose, fobj=dummy_obj, feval=constant_metric) res = get_cv_result(params_obj_class_1_verbose, fobj=dummy_obj, feval=constant_metric)
self.assertEqual(len(res), 2) assert len(res) == 2
self.assertIn('error-mean', res) assert 'error-mean' in res
# multiclass metric alias with custom one for invalid class_num # multiclass metric alias with custom one for invalid class_num
self.assertRaises(lgb.basic.LightGBMError, get_cv_result, with pytest.raises(lgb.basic.LightGBMError):
params_obj_class_1_verbose, metrics=obj_multi_alias, get_cv_result(params_obj_class_1_verbose, metrics=obj_multi_alias,
fobj=dummy_obj, feval=constant_metric) fobj=dummy_obj, feval=constant_metric)
# multiclass default metric without num_class # multiclass default metric without num_class
self.assertRaises(lgb.basic.LightGBMError, get_cv_result, with pytest.raises(lgb.basic.LightGBMError):
params_obj_verbose) get_cv_result(params_obj_verbose)
for metric_multi_alias in obj_multi_aliases + ['multi_logloss']: for metric_multi_alias in obj_multi_aliases + ['multi_logloss']:
# multiclass metric alias # multiclass metric alias
res = get_cv_result(params_obj_class_3_verbose, metrics=metric_multi_alias) res = get_cv_result(params_obj_class_3_verbose, metrics=metric_multi_alias)
self.assertEqual(len(res), 2) assert len(res) == 2
self.assertIn('multi_logloss-mean', res) assert 'multi_logloss-mean' in res
# multiclass metric # multiclass metric
res = get_cv_result(params_obj_class_3_verbose, metrics='multi_error') res = get_cv_result(params_obj_class_3_verbose, metrics='multi_error')
self.assertEqual(len(res), 2) assert len(res) == 2
self.assertIn('multi_error-mean', res) assert 'multi_error-mean' in res
# invalid metric for multiclass objective # invalid metric for multiclass objective
self.assertRaises(lgb.basic.LightGBMError, get_cv_result, with pytest.raises(lgb.basic.LightGBMError):
params_obj_class_3_verbose, metrics='binary_logloss') get_cv_result(params_obj_class_3_verbose, metrics='binary_logloss')
params_class_3_verbose = {'num_class': 3, 'verbose': -1} params_class_3_verbose = {'num_class': 3, 'verbose': -1}
# non-default num_class for default objective # non-default num_class for default objective
self.assertRaises(lgb.basic.LightGBMError, get_cv_result, with pytest.raises(lgb.basic.LightGBMError):
params_class_3_verbose) get_cv_result(params_class_3_verbose)
# no metric with non-default num_class for custom objective # no metric with non-default num_class for custom objective
res = get_cv_result(params_class_3_verbose, fobj=dummy_obj) res = get_cv_result(params_class_3_verbose, fobj=dummy_obj)
self.assertEqual(len(res), 0) assert len(res) == 0
for metric_multi_alias in obj_multi_aliases + ['multi_logloss']: for metric_multi_alias in obj_multi_aliases + ['multi_logloss']:
# multiclass metric alias for custom objective # multiclass metric alias for custom objective
res = get_cv_result(params_class_3_verbose, metrics=metric_multi_alias, fobj=dummy_obj) res = get_cv_result(params_class_3_verbose, metrics=metric_multi_alias, fobj=dummy_obj)
self.assertEqual(len(res), 2) assert len(res) == 2
self.assertIn('multi_logloss-mean', res) assert 'multi_logloss-mean' in res
# multiclass metric for custom objective # multiclass metric for custom objective
res = get_cv_result(params_class_3_verbose, metrics='multi_error', fobj=dummy_obj) res = get_cv_result(params_class_3_verbose, metrics='multi_error', fobj=dummy_obj)
self.assertEqual(len(res), 2) assert len(res) == 2
self.assertIn('multi_error-mean', res) assert 'multi_error-mean' in res
# binary metric with non-default num_class for custom objective # binary metric with non-default num_class for custom objective
self.assertRaises(lgb.basic.LightGBMError, get_cv_result, with pytest.raises(lgb.basic.LightGBMError):
params_class_3_verbose, metrics='binary_error', fobj=dummy_obj) get_cv_result(params_class_3_verbose, metrics='binary_error', fobj=dummy_obj)
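All of the fobj/feval combinations above rest on two callback contracts: a custom objective returns per-row gradients and hessians, and a custom metric returns (name, value, is_higher_better). A sketch of both, with the function names chosen here purely for illustration:

import numpy as np

def l2_objective(preds, train_data):  # passed as fobj
    residual = preds - train_data.get_label()
    return 2.0 * residual, 2.0 * np.ones_like(residual)

def rmse_feval(preds, train_data):    # passed as feval
    residual = preds - train_data.get_label()
    return 'rmse_custom', float(np.sqrt(np.mean(residual ** 2))), False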
def test_multiple_feval_train(self): def test_multiple_feval_train():
X, y = load_breast_cancer(return_X_y=True) X, y = load_breast_cancer(return_X_y=True)
params = {'verbose': -1, 'objective': 'binary', 'metric': 'binary_logloss'} params = {'verbose': -1, 'objective': 'binary', 'metric': 'binary_logloss'}
...@@ -1880,12 +1923,13 @@ class TestEngine(unittest.TestCase): ...@@ -1880,12 +1923,13 @@ class TestEngine(unittest.TestCase):
feval=[constant_metric, decreasing_metric], feval=[constant_metric, decreasing_metric],
evals_result=evals_result) evals_result=evals_result)
self.assertEqual(len(evals_result['valid_0']), 3) assert len(evals_result['valid_0']) == 3
self.assertIn('binary_logloss', evals_result['valid_0']) assert 'binary_logloss' in evals_result['valid_0']
self.assertIn('error', evals_result['valid_0']) assert 'error' in evals_result['valid_0']
self.assertIn('decreasing_metric', evals_result['valid_0']) assert 'decreasing_metric' in evals_result['valid_0']
def test_multiple_feval_cv(self):
def test_multiple_feval_cv():
X, y = load_breast_cancer(return_X_y=True) X, y = load_breast_cancer(return_X_y=True)
params = {'verbose': -1, 'objective': 'binary', 'metric': 'binary_logloss'} params = {'verbose': -1, 'objective': 'binary', 'metric': 'binary_logloss'}
...@@ -1899,16 +1943,17 @@ class TestEngine(unittest.TestCase): ...@@ -1899,16 +1943,17 @@ class TestEngine(unittest.TestCase):
feval=[constant_metric, decreasing_metric]) feval=[constant_metric, decreasing_metric])
# Expect three metrics, with mean and stdv reported for each # Expect three metrics, with mean and stdv reported for each
self.assertEqual(len(cv_results), 6) assert len(cv_results) == 6
self.assertIn('binary_logloss-mean', cv_results) assert 'binary_logloss-mean' in cv_results
self.assertIn('error-mean', cv_results) assert 'error-mean' in cv_results
self.assertIn('decreasing_metric-mean', cv_results) assert 'decreasing_metric-mean' in cv_results
self.assertIn('binary_logloss-stdv', cv_results) assert 'binary_logloss-stdv' in cv_results
self.assertIn('error-stdv', cv_results) assert 'error-stdv' in cv_results
self.assertIn('decreasing_metric-stdv', cv_results) assert 'decreasing_metric-stdv' in cv_results
@unittest.skipIf(psutil.virtual_memory().available / 1024 / 1024 / 1024 < 3, 'not enough RAM')
def test_model_size(self): @pytest.mark.skipif(psutil.virtual_memory().available / 1024 / 1024 / 1024 < 3, reason='not enough RAM')
def test_model_size():
X, y = load_boston(return_X_y=True) X, y = load_boston(return_X_y=True)
data = lgb.Dataset(X, y) data = lgb.Dataset(X, y)
bst = lgb.train({'verbose': -1}, data, num_boost_round=2) bst = lgb.train({'verbose': -1}, data, num_boost_round=2)
...@@ -1926,28 +1971,29 @@ class TestEngine(unittest.TestCase): ...@@ -1926,28 +1971,29 @@ class TestEngine(unittest.TestCase):
+ (one_tree * multiplier).format(*range(2, total_trees)) + (one_tree * multiplier).format(*range(2, total_trees))
+ model_str[model_str.find('end of trees'):] + model_str[model_str.find('end of trees'):]
+ ' ' * (2**31 - one_tree_size * total_trees)) + ' ' * (2**31 - one_tree_size * total_trees))
self.assertGreater(len(new_model_str), 2**31) assert len(new_model_str) > 2**31
bst.model_from_string(new_model_str, verbose=False) bst.model_from_string(new_model_str, verbose=False)
self.assertEqual(bst.num_trees(), total_trees) assert bst.num_trees() == total_trees
y_pred_new = bst.predict(X, num_iteration=2) y_pred_new = bst.predict(X, num_iteration=2)
np.testing.assert_allclose(y_pred, y_pred_new) np.testing.assert_allclose(y_pred, y_pred_new)
except MemoryError: except MemoryError:
self.skipTest('not enough RAM') pytest.skip('not enough RAM')
def test_get_split_value_histogram(self): def test_get_split_value_histogram():
X, y = load_boston(return_X_y=True) X, y = load_boston(return_X_y=True)
lgb_train = lgb.Dataset(X, y, categorical_feature=[2]) lgb_train = lgb.Dataset(X, y, categorical_feature=[2])
gbm = lgb.train({'verbose': -1}, lgb_train, num_boost_round=20) gbm = lgb.train({'verbose': -1}, lgb_train, num_boost_round=20)
# test XGBoost-style return value # test XGBoost-style return value
params = {'feature': 0, 'xgboost_style': True} params = {'feature': 0, 'xgboost_style': True}
self.assertTupleEqual(gbm.get_split_value_histogram(**params).shape, (9, 2)) assert gbm.get_split_value_histogram(**params).shape == (9, 2)
self.assertTupleEqual(gbm.get_split_value_histogram(bins=999, **params).shape, (9, 2)) assert gbm.get_split_value_histogram(bins=999, **params).shape == (9, 2)
self.assertTupleEqual(gbm.get_split_value_histogram(bins=-1, **params).shape, (1, 2)) assert gbm.get_split_value_histogram(bins=-1, **params).shape == (1, 2)
self.assertTupleEqual(gbm.get_split_value_histogram(bins=0, **params).shape, (1, 2)) assert gbm.get_split_value_histogram(bins=0, **params).shape == (1, 2)
self.assertTupleEqual(gbm.get_split_value_histogram(bins=1, **params).shape, (1, 2)) assert gbm.get_split_value_histogram(bins=1, **params).shape == (1, 2)
self.assertTupleEqual(gbm.get_split_value_histogram(bins=2, **params).shape, (2, 2)) assert gbm.get_split_value_histogram(bins=2, **params).shape == (2, 2)
self.assertTupleEqual(gbm.get_split_value_histogram(bins=6, **params).shape, (5, 2)) assert gbm.get_split_value_histogram(bins=6, **params).shape == (5, 2)
self.assertTupleEqual(gbm.get_split_value_histogram(bins=7, **params).shape, (6, 2)) assert gbm.get_split_value_histogram(bins=7, **params).shape == (6, 2)
if lgb.compat.PANDAS_INSTALLED: if lgb.compat.PANDAS_INSTALLED:
np.testing.assert_allclose( np.testing.assert_allclose(
gbm.get_split_value_histogram(0, xgboost_style=True).values, gbm.get_split_value_histogram(0, xgboost_style=True).values,
...@@ -1968,25 +2014,27 @@ class TestEngine(unittest.TestCase): ...@@ -1968,25 +2014,27 @@ class TestEngine(unittest.TestCase):
) )
# test numpy-style return value # test numpy-style return value
hist, bins = gbm.get_split_value_histogram(0) hist, bins = gbm.get_split_value_histogram(0)
self.assertEqual(len(hist), 23) assert len(hist) == 23
self.assertEqual(len(bins), 24) assert len(bins) == 24
hist, bins = gbm.get_split_value_histogram(0, bins=999) hist, bins = gbm.get_split_value_histogram(0, bins=999)
self.assertEqual(len(hist), 999) assert len(hist) == 999
self.assertEqual(len(bins), 1000) assert len(bins) == 1000
self.assertRaises(ValueError, gbm.get_split_value_histogram, 0, bins=-1) with pytest.raises(ValueError):
self.assertRaises(ValueError, gbm.get_split_value_histogram, 0, bins=0) gbm.get_split_value_histogram(0, bins=-1)
with pytest.raises(ValueError):
gbm.get_split_value_histogram(0, bins=0)
hist, bins = gbm.get_split_value_histogram(0, bins=1) hist, bins = gbm.get_split_value_histogram(0, bins=1)
self.assertEqual(len(hist), 1) assert len(hist) == 1
self.assertEqual(len(bins), 2) assert len(bins) == 2
hist, bins = gbm.get_split_value_histogram(0, bins=2) hist, bins = gbm.get_split_value_histogram(0, bins=2)
self.assertEqual(len(hist), 2) assert len(hist) == 2
self.assertEqual(len(bins), 3) assert len(bins) == 3
hist, bins = gbm.get_split_value_histogram(0, bins=6) hist, bins = gbm.get_split_value_histogram(0, bins=6)
self.assertEqual(len(hist), 6) assert len(hist) == 6
self.assertEqual(len(bins), 7) assert len(bins) == 7
hist, bins = gbm.get_split_value_histogram(0, bins=7) hist, bins = gbm.get_split_value_histogram(0, bins=7)
self.assertEqual(len(hist), 7) assert len(hist) == 7
self.assertEqual(len(bins), 8) assert len(bins) == 8
hist_idx, bins_idx = gbm.get_split_value_histogram(0) hist_idx, bins_idx = gbm.get_split_value_histogram(0)
hist_name, bins_name = gbm.get_split_value_histogram(gbm.feature_name()[0]) hist_name, bins_name = gbm.get_split_value_histogram(gbm.feature_name()[0])
np.testing.assert_array_equal(hist_idx, hist_name) np.testing.assert_array_equal(hist_idx, hist_name)
...@@ -2008,9 +2056,11 @@ class TestEngine(unittest.TestCase): ...@@ -2008,9 +2056,11 @@ class TestEngine(unittest.TestCase):
np.testing.assert_array_equal(hist_vals[mask], hist[:, 1]) np.testing.assert_array_equal(hist_vals[mask], hist[:, 1])
np.testing.assert_allclose(bin_edges[1:][mask], hist[:, 0]) np.testing.assert_allclose(bin_edges[1:][mask], hist[:, 0])
# test histogram is disabled for categorical features # test histogram is disabled for categorical features
self.assertRaises(lgb.basic.LightGBMError, gbm.get_split_value_histogram, 2) with pytest.raises(lgb.basic.LightGBMError):
gbm.get_split_value_histogram(2)
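To summarize the shapes being asserted: with xgboost_style=True the histogram comes back as an (n_bins, 2) table of (split value, count), while the default numpy style returns the (counts, bin_edges) pair of np.histogram, so there is always one more edge than counts. A sketch with assumed random data (and assuming feature 0 receives at least one split):

import numpy as np
import lightgbm as lgb

X = np.random.random((200, 3))
y = np.random.random(200)
booster = lgb.train({'verbose': -1}, lgb.Dataset(X, y), num_boost_round=20)
hist, bin_edges = booster.get_split_value_histogram(0, bins=5)
assert len(bin_edges) == len(hist) + 1  # np.histogram convention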
def test_early_stopping_for_only_first_metric(self):
def test_early_stopping_for_only_first_metric():
def metrics_combination_train_regression(valid_sets, metric_list, assumed_iteration, def metrics_combination_train_regression(valid_sets, metric_list, assumed_iteration,
first_metric_only, feval=None): first_metric_only, feval=None):
...@@ -2025,7 +2075,7 @@ class TestEngine(unittest.TestCase): ...@@ -2025,7 +2075,7 @@ class TestEngine(unittest.TestCase):
gbm = lgb.train(dict(params, first_metric_only=first_metric_only), lgb_train, gbm = lgb.train(dict(params, first_metric_only=first_metric_only), lgb_train,
num_boost_round=25, valid_sets=valid_sets, feval=feval, num_boost_round=25, valid_sets=valid_sets, feval=feval,
early_stopping_rounds=5, verbose_eval=False) early_stopping_rounds=5, verbose_eval=False)
self.assertEqual(assumed_iteration, gbm.best_iteration) assert assumed_iteration == gbm.best_iteration
def metrics_combination_cv_regression(metric_list, assumed_iteration, def metrics_combination_cv_regression(metric_list, assumed_iteration,
first_metric_only, eval_train_metric, feval=None): first_metric_only, eval_train_metric, feval=None):
...@@ -2043,7 +2093,7 @@ class TestEngine(unittest.TestCase): ...@@ -2043,7 +2093,7 @@ class TestEngine(unittest.TestCase):
stratified=False, feval=feval, stratified=False, feval=feval,
early_stopping_rounds=5, verbose_eval=False, early_stopping_rounds=5, verbose_eval=False,
eval_train_metric=eval_train_metric) eval_train_metric=eval_train_metric)
self.assertEqual(assumed_iteration, len(ret[list(ret.keys())[0]])) assert assumed_iteration == len(ret[list(ret.keys())[0]])
X, y = load_boston(return_X_y=True) X, y = load_boston(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
...@@ -2056,14 +2106,14 @@ class TestEngine(unittest.TestCase): ...@@ -2056,14 +2106,14 @@ class TestEngine(unittest.TestCase):
iter_valid1_l2 = 14 iter_valid1_l2 = 14
iter_valid2_l1 = 2 iter_valid2_l1 = 2
iter_valid2_l2 = 15 iter_valid2_l2 = 15
self.assertEqual(len(set([iter_valid1_l1, iter_valid1_l2, iter_valid2_l1, iter_valid2_l2])), 4) assert len(set([iter_valid1_l1, iter_valid1_l2, iter_valid2_l1, iter_valid2_l2])) == 4
iter_min_l1 = min([iter_valid1_l1, iter_valid2_l1]) iter_min_l1 = min([iter_valid1_l1, iter_valid2_l1])
iter_min_l2 = min([iter_valid1_l2, iter_valid2_l2]) iter_min_l2 = min([iter_valid1_l2, iter_valid2_l2])
iter_min_valid1 = min([iter_valid1_l1, iter_valid1_l2]) iter_min_valid1 = min([iter_valid1_l1, iter_valid1_l2])
iter_cv_l1 = 4 iter_cv_l1 = 4
iter_cv_l2 = 12 iter_cv_l2 = 12
self.assertEqual(len(set([iter_cv_l1, iter_cv_l2])), 2) assert len(set([iter_cv_l1, iter_cv_l2])) == 2
iter_cv_min = min([iter_cv_l1, iter_cv_l2]) iter_cv_min = min([iter_cv_l1, iter_cv_l2])
# test for lgb.train # test for lgb.train
...@@ -2122,7 +2172,8 @@ class TestEngine(unittest.TestCase): ...@@ -2122,7 +2172,8 @@ class TestEngine(unittest.TestCase):
feval=lambda preds, train_data: [constant_metric(preds, train_data), feval=lambda preds, train_data: [constant_metric(preds, train_data),
decreasing_metric(preds, train_data)]) decreasing_metric(preds, train_data)])
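The combinations above all hinge on first_metric_only: when set, early stopping tracks only the first metric listed in params['metric'], so the ordering of that list decides the stopping round. A minimal sketch with assumed random data:

import numpy as np
import lightgbm as lgb

X, y = np.random.random((200, 5)), np.random.random(200)
train = lgb.Dataset(X[:150], y[:150])
valid = lgb.Dataset(X[150:], y[150:], reference=train)
params = {'objective': 'regression', 'metric': ['l1', 'l2'],
          'first_metric_only': True, 'verbose': -1}
booster = lgb.train(params, train, num_boost_round=50, valid_sets=[valid],
                    early_stopping_rounds=5, verbose_eval=False)  # stops on l1 only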
def test_node_level_subcol(self):
def test_node_level_subcol():
X, y = load_breast_cancer(return_X_y=True) X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
params = { params = {
...@@ -2141,14 +2192,15 @@ class TestEngine(unittest.TestCase): ...@@ -2141,14 +2192,15 @@ class TestEngine(unittest.TestCase):
verbose_eval=False, verbose_eval=False,
evals_result=evals_result) evals_result=evals_result)
ret = log_loss(y_test, gbm.predict(X_test)) ret = log_loss(y_test, gbm.predict(X_test))
self.assertLess(ret, 0.14) assert ret < 0.14
self.assertAlmostEqual(evals_result['valid_0']['binary_logloss'][-1], ret, places=5) assert evals_result['valid_0']['binary_logloss'][-1] == pytest.approx(ret)
params['feature_fraction'] = 0.5 params['feature_fraction'] = 0.5
gbm2 = lgb.train(params, lgb_train, num_boost_round=25) gbm2 = lgb.train(params, lgb_train, num_boost_round=25)
ret2 = log_loss(y_test, gbm2.predict(X_test)) ret2 = log_loss(y_test, gbm2.predict(X_test))
self.assertNotEqual(ret, ret2) assert ret != ret2
def test_forced_bins(self): def test_forced_bins():
x = np.zeros((100, 2)) x = np.zeros((100, 2))
x[:, 0] = np.arange(0, 1, 0.01) x[:, 0] = np.arange(0, 1, 0.01)
x[:, 1] = -np.arange(0, 1, 0.01) x[:, 1] = -np.arange(0, 1, 0.01)
...@@ -2167,16 +2219,16 @@ class TestEngine(unittest.TestCase): ...@@ -2167,16 +2219,16 @@ class TestEngine(unittest.TestCase):
new_x[:, 0] = [0.31, 0.37, 0.41] new_x[:, 0] = [0.31, 0.37, 0.41]
new_x[:, 1] = [0, 0, 0] new_x[:, 1] = [0, 0, 0]
predicted = est.predict(new_x) predicted = est.predict(new_x)
self.assertEqual(len(np.unique(predicted)), 3) assert len(np.unique(predicted)) == 3
new_x[:, 0] = [0, 0, 0] new_x[:, 0] = [0, 0, 0]
new_x[:, 1] = [-0.9, -0.6, -0.3] new_x[:, 1] = [-0.9, -0.6, -0.3]
predicted = est.predict(new_x) predicted = est.predict(new_x)
self.assertEqual(len(np.unique(predicted)), 1) assert len(np.unique(predicted)) == 1
params['forcedbins_filename'] = '' params['forcedbins_filename'] = ''
lgb_x = lgb.Dataset(x, label=y) lgb_x = lgb.Dataset(x, label=y)
est = lgb.train(params, lgb_x, num_boost_round=20) est = lgb.train(params, lgb_x, num_boost_round=20)
predicted = est.predict(new_x) predicted = est.predict(new_x)
self.assertEqual(len(np.unique(predicted)), 3) assert len(np.unique(predicted)) == 3
params['forcedbins_filename'] = os.path.join(os.path.dirname(os.path.realpath(__file__)), params['forcedbins_filename'] = os.path.join(os.path.dirname(os.path.realpath(__file__)),
'../../examples/regression/forced_bins2.json') '../../examples/regression/forced_bins2.json')
params['max_bin'] = 11 params['max_bin'] = 11
...@@ -2184,10 +2236,11 @@ class TestEngine(unittest.TestCase): ...@@ -2184,10 +2236,11 @@ class TestEngine(unittest.TestCase):
est = lgb.train(params, lgb_x, num_boost_round=50) est = lgb.train(params, lgb_x, num_boost_round=50)
predicted = est.predict(x[1:, :1]) predicted = est.predict(x[1:, :1])
_, counts = np.unique(predicted, return_counts=True) _, counts = np.unique(predicted, return_counts=True)
self.assertGreaterEqual(min(counts), 9) assert min(counts) >= 9
self.assertLessEqual(max(counts), 11) assert max(counts) <= 11
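forcedbins_filename points at a JSON list with one entry per constrained feature, each giving the bin upper bounds the binner must use. The values below are illustrative, not the contents of the repo's forced_bins files; a sketch that writes such a file:

import json, os, tempfile

forced_bins = [{"feature": 0, "bin_upper_bound": [0.3, 0.35, 0.4]}]  # assumed values
path = os.path.join(tempfile.gettempdir(), "forced_bins_example.json")
with open(path, "w") as f:
    json.dump(forced_bins, f)
# then: params['forcedbins_filename'] = path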
def test_binning_same_sign(self):
def test_binning_same_sign():
# test that binning works properly for features with only positive or only negative values # test that binning works properly for features with only positive or only negative values
x = np.zeros((99, 2)) x = np.zeros((99, 2))
x[:, 0] = np.arange(0.01, 1, 0.01) x[:, 0] = np.arange(0.01, 1, 0.01)
...@@ -2204,15 +2257,16 @@ class TestEngine(unittest.TestCase): ...@@ -2204,15 +2257,16 @@ class TestEngine(unittest.TestCase):
new_x = np.zeros((3, 2)) new_x = np.zeros((3, 2))
new_x[:, 0] = [-1, 0, 1] new_x[:, 0] = [-1, 0, 1]
predicted = est.predict(new_x) predicted = est.predict(new_x)
self.assertAlmostEqual(predicted[0], predicted[1]) assert predicted[0] == pytest.approx(predicted[1])
self.assertNotAlmostEqual(predicted[1], predicted[2]) assert predicted[1] != pytest.approx(predicted[2])
new_x = np.zeros((3, 2)) new_x = np.zeros((3, 2))
new_x[:, 1] = [-1, 0, 1] new_x[:, 1] = [-1, 0, 1]
predicted = est.predict(new_x) predicted = est.predict(new_x)
self.assertNotAlmostEqual(predicted[0], predicted[1]) assert predicted[0] != pytest.approx(predicted[1])
self.assertAlmostEqual(predicted[1], predicted[2]) assert predicted[1] == pytest.approx(predicted[2])
def test_dataset_update_params(self): def test_dataset_update_params():
default_params = {"max_bin": 100, default_params = {"max_bin": 100,
"max_bin_by_feature": [20, 10], "max_bin_by_feature": [20, 10],
"bin_construct_sample_cnt": 10000, "bin_construct_sample_cnt": 10000,
...@@ -2292,7 +2346,8 @@ class TestEngine(unittest.TestCase): ...@@ -2292,7 +2346,8 @@ class TestEngine(unittest.TestCase):
with np.testing.assert_raises_regex(lgb.basic.LightGBMError, err_msg): with np.testing.assert_raises_regex(lgb.basic.LightGBMError, err_msg):
lgb.train(new_params, lgb_data, num_boost_round=3) lgb.train(new_params, lgb_data, num_boost_round=3)
def test_dataset_params_with_reference(self):
def test_dataset_params_with_reference():
default_params = {"max_bin": 100} default_params = {"max_bin": 100}
X = np.random.random((100, 2)) X = np.random.random((100, 2))
y = np.random.random(100) y = np.random.random(100)
...@@ -2300,11 +2355,12 @@ class TestEngine(unittest.TestCase): ...@@ -2300,11 +2355,12 @@ class TestEngine(unittest.TestCase):
y_val = np.random.random(100) y_val = np.random.random(100)
lgb_train = lgb.Dataset(X, y, params=default_params, free_raw_data=False).construct() lgb_train = lgb.Dataset(X, y, params=default_params, free_raw_data=False).construct()
lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train, free_raw_data=False).construct() lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train, free_raw_data=False).construct()
self.assertDictEqual(lgb_train.get_params(), default_params) assert lgb_train.get_params() == default_params
self.assertDictEqual(lgb_val.get_params(), default_params) assert lgb_val.get_params() == default_params
model = lgb.train(default_params, lgb_train, valid_sets=[lgb_val]) lgb.train(default_params, lgb_train, valid_sets=[lgb_val])
def test_extra_trees(self): def test_extra_trees():
# check extra trees increases regularization # check extra trees increases regularization
X, y = load_boston(return_X_y=True) X, y = load_boston(return_X_y=True)
lgb_x = lgb.Dataset(X, label=y) lgb_x = lgb.Dataset(X, label=y)
...@@ -2320,9 +2376,10 @@ class TestEngine(unittest.TestCase): ...@@ -2320,9 +2376,10 @@ class TestEngine(unittest.TestCase):
est = lgb.train(params, lgb_x, num_boost_round=10) est = lgb.train(params, lgb_x, num_boost_round=10)
predicted_new = est.predict(X) predicted_new = est.predict(X)
err_new = mean_squared_error(y, predicted_new) err_new = mean_squared_error(y, predicted_new)
self.assertLess(err, err_new) assert err < err_new
def test_path_smoothing(self):
def test_path_smoothing():
# check path smoothing increases regularization # check path smoothing increases regularization
X, y = load_boston(return_X_y=True) X, y = load_boston(return_X_y=True)
lgb_x = lgb.Dataset(X, label=y) lgb_x = lgb.Dataset(X, label=y)
...@@ -2337,10 +2394,11 @@ class TestEngine(unittest.TestCase): ...@@ -2337,10 +2394,11 @@ class TestEngine(unittest.TestCase):
est = lgb.train(params, lgb_x, num_boost_round=10) est = lgb.train(params, lgb_x, num_boost_round=10)
predicted_new = est.predict(X) predicted_new = est.predict(X)
err_new = mean_squared_error(y, predicted_new) err_new = mean_squared_error(y, predicted_new)
self.assertLess(err, err_new) assert err < err_new
@unittest.skipIf(not lgb.compat.PANDAS_INSTALLED, 'pandas is not installed') def test_trees_to_dataframe():
def test_trees_to_dataframe(self): pytest.importorskip("pandas")
def _imptcs_to_numpy(X, impcts_dict): def _imptcs_to_numpy(X, impcts_dict):
cols = ['Column_' + str(i) for i in range(X.shape[1])] cols = ['Column_' + str(i) for i in range(X.shape[1])]
...@@ -2370,7 +2428,7 @@ class TestEngine(unittest.TestCase): ...@@ -2370,7 +2428,7 @@ class TestEngine(unittest.TestCase):
np.testing.assert_equal(tree_split, mod_split) np.testing.assert_equal(tree_split, mod_split)
np.testing.assert_allclose(tree_gains, mod_gains) np.testing.assert_allclose(tree_gains, mod_gains)
self.assertEqual(num_trees_from_df, num_trees) assert num_trees_from_df == num_trees
np.testing.assert_equal(obs_counts_from_df, len(y)) np.testing.assert_equal(obs_counts_from_df, len(y))
# test edge case with one leaf # test edge case with one leaf
...@@ -2380,17 +2438,18 @@ class TestEngine(unittest.TestCase): ...@@ -2380,17 +2438,18 @@ class TestEngine(unittest.TestCase):
bst = lgb.train({"objective": "binary", "verbose": -1}, data, num_trees) bst = lgb.train({"objective": "binary", "verbose": -1}, data, num_trees)
tree_df = bst.trees_to_dataframe() tree_df = bst.trees_to_dataframe()
self.assertEqual(len(tree_df), 1) assert len(tree_df) == 1
self.assertEqual(tree_df.loc[0, 'tree_index'], 0) assert tree_df.loc[0, 'tree_index'] == 0
self.assertEqual(tree_df.loc[0, 'node_depth'], 1) assert tree_df.loc[0, 'node_depth'] == 1
self.assertEqual(tree_df.loc[0, 'node_index'], "0-L0") assert tree_df.loc[0, 'node_index'] == "0-L0"
self.assertIsNotNone(tree_df.loc[0, 'value']) assert tree_df.loc[0, 'value'] is not None
for col in ('left_child', 'right_child', 'parent_index', 'split_feature', for col in ('left_child', 'right_child', 'parent_index', 'split_feature',
'split_gain', 'threshold', 'decision_type', 'missing_direction', 'split_gain', 'threshold', 'decision_type', 'missing_direction',
'missing_type', 'weight', 'count'): 'missing_type', 'weight', 'count'):
self.assertIsNone(tree_df.loc[0, col]) assert tree_df.loc[0, col] is None
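`pytest.importorskip` replaces the `@unittest.skipIf(not lgb.compat.PANDAS_INSTALLED, ...)` decorator: it imports the module at call time and skips the test from inside its body when the import fails. A minimal sketch:

    import pytest

    def test_needs_pandas():
        # Skips (rather than fails) the test when pandas is absent;
        # on success it returns the imported module.
        pd = pytest.importorskip("pandas")
        df = pd.DataFrame({"a": [1, 2]})
        assert len(df) == 2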
def test_interaction_constraints(self):
def test_interaction_constraints():
X, y = load_boston(return_X_y=True) X, y = load_boston(return_X_y=True)
num_features = X.shape[1] num_features = X.shape[1]
train_data = lgb.Dataset(X, label=y) train_data = lgb.Dataset(X, label=y)
...@@ -2408,12 +2467,12 @@ class TestEngine(unittest.TestCase): ...@@ -2408,12 +2467,12 @@ class TestEngine(unittest.TestCase):
list(range(num_features // 2, num_features))]), list(range(num_features // 2, num_features))]),
train_data, num_boost_round=10) train_data, num_boost_round=10)
pred3 = est.predict(X) pred3 = est.predict(X)
self.assertLess(mean_squared_error(y, pred1), mean_squared_error(y, pred3)) assert mean_squared_error(y, pred1) < mean_squared_error(y, pred3)
# check that constraints consisting of single features reduce accuracy further # check that constraints consisting of single features reduce accuracy further
est = lgb.train(dict(params, interaction_constraints=[[i] for i in range(num_features)]), train_data, est = lgb.train(dict(params, interaction_constraints=[[i] for i in range(num_features)]), train_data,
num_boost_round=10) num_boost_round=10)
pred4 = est.predict(X) pred4 = est.predict(X)
self.assertLess(mean_squared_error(y, pred3), mean_squared_error(y, pred4)) assert mean_squared_error(y, pred3) < mean_squared_error(y, pred4)
# test that interaction constraints work when not all features are used # test that interaction constraints work when not all features are used
X = np.concatenate([np.zeros((X.shape[0], 1)), X], axis=1) X = np.concatenate([np.zeros((X.shape[0], 1)), X], axis=1)
num_features = X.shape[1] num_features = X.shape[1]
...@@ -2422,7 +2481,8 @@ class TestEngine(unittest.TestCase): ...@@ -2422,7 +2481,8 @@ class TestEngine(unittest.TestCase):
[1] + list(range(2, num_features))]), [1] + list(range(2, num_features))]),
train_data, num_boost_round=10) train_data, num_boost_round=10)
def test_linear_trees(self):
def test_linear_trees(tmp_path):
# check that setting linear_tree=True fits better than ordinary trees when data has a linear relationship # check that setting linear_tree=True fits better than ordinary trees when data has a linear relationship
np.random.seed(0) np.random.seed(0)
x = np.arange(0, 100, 0.1) x = np.arange(0, 100, 0.1)
...@@ -2441,7 +2501,7 @@ class TestEngine(unittest.TestCase): ...@@ -2441,7 +2501,7 @@ class TestEngine(unittest.TestCase):
valid_sets=[lgb_train], valid_names=['train']) valid_sets=[lgb_train], valid_names=['train'])
pred2 = est.predict(x) pred2 = est.predict(x)
np.testing.assert_allclose(res['train']['l2'][-1], mean_squared_error(y, pred2), atol=10**(-1)) np.testing.assert_allclose(res['train']['l2'][-1], mean_squared_error(y, pred2), atol=10**(-1))
self.assertLess(mean_squared_error(y, pred2), mean_squared_error(y, pred1)) assert mean_squared_error(y, pred2) < mean_squared_error(y, pred1)
# test again with nans in data # test again with nans in data
x[:10] = np.nan x[:10] = np.nan
lgb_train = lgb.Dataset(x, label=y) lgb_train = lgb.Dataset(x, label=y)
...@@ -2453,7 +2513,7 @@ class TestEngine(unittest.TestCase): ...@@ -2453,7 +2513,7 @@ class TestEngine(unittest.TestCase):
valid_sets=[lgb_train], valid_names=['train']) valid_sets=[lgb_train], valid_names=['train'])
pred2 = est.predict(x) pred2 = est.predict(x)
np.testing.assert_allclose(res['train']['l2'][-1], mean_squared_error(y, pred2), atol=10**(-1)) np.testing.assert_allclose(res['train']['l2'][-1], mean_squared_error(y, pred2), atol=10**(-1))
self.assertLess(mean_squared_error(y, pred2), mean_squared_error(y, pred1)) assert mean_squared_error(y, pred2) < mean_squared_error(y, pred1)
# test again with bagging # test again with bagging
res = {} res = {}
est = lgb.train(dict(params, linear_tree=True, subsample=0.8, bagging_freq=1), lgb_train, est = lgb.train(dict(params, linear_tree=True, subsample=0.8, bagging_freq=1), lgb_train,
...@@ -2480,18 +2540,20 @@ class TestEngine(unittest.TestCase): ...@@ -2480,18 +2540,20 @@ class TestEngine(unittest.TestCase):
est2 = est.refit(x, label=y) est2 = est.refit(x, label=y)
p1 = est.predict(x) p1 = est.predict(x)
p2 = est2.predict(x) p2 = est2.predict(x)
self.assertLess(np.mean(np.abs(p1 - p2)), 2) assert np.mean(np.abs(p1 - p2)) < 2
# test refit with save and load # test refit with save and load
est.save_model('temp_model.txt') temp_model = str(tmp_path / "temp_model.txt")
est2 = lgb.Booster(model_file='temp_model.txt') est.save_model(temp_model)
est2 = lgb.Booster(model_file=temp_model)
est2 = est2.refit(x, label=y) est2 = est2.refit(x, label=y)
p1 = est.predict(x) p1 = est.predict(x)
p2 = est2.predict(x) p2 = est2.predict(x)
self.assertLess(np.mean(np.abs(p1 - p2)), 2) assert np.mean(np.abs(p1 - p2)) < 2
# test refit: different results training on different data # test refit: different results training on different data
est3 = est.refit(x[:100, :], label=y[:100]) est3 = est.refit(x[:100, :], label=y[:100])
p3 = est3.predict(x) p3 = est3.predict(x)
self.assertGreater(np.mean(np.abs(p2 - p1)), np.abs(np.max(p3 - p1))) assert np.mean(np.abs(p2 - p1)) > np.abs(np.max(p3 - p1))
# test when num_leaves - 1 < num_features and when num_leaves - 1 > num_features # test when num_leaves - 1 < num_features and when num_leaves - 1 > num_features
X_train, _, y_train, _ = train_test_split(*load_breast_cancer(return_X_y=True), test_size=0.1, random_state=2) X_train, _, y_train, _ = train_test_split(*load_breast_cancer(return_X_y=True), test_size=0.1, random_state=2)
params = {'linear_tree': True, params = {'linear_tree': True,
...@@ -2503,12 +2565,14 @@ class TestEngine(unittest.TestCase): ...@@ -2503,12 +2565,14 @@ class TestEngine(unittest.TestCase):
train_data = lgb.Dataset(X_train, label=y_train, params=dict(params, num_leaves=60)) train_data = lgb.Dataset(X_train, label=y_train, params=dict(params, num_leaves=60))
est = lgb.train(params, train_data, num_boost_round=10, categorical_feature=[0]) est = lgb.train(params, train_data, num_boost_round=10, categorical_feature=[0])
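`tmp_path` is a built-in pytest fixture: each test receives a `pathlib.Path` to a fresh per-test temporary directory, which is why the hard-coded 'temp_model.txt' above could be dropped. A minimal sketch of the save/reload pattern (the test name and data are illustrative):

    import numpy as np
    import lightgbm as lgb

    def test_save_and_reload(tmp_path):
        X = np.random.random((100, 2))
        y = np.random.random(100)
        bst = lgb.train({'objective': 'regression', 'verbose': -1},
                        lgb.Dataset(X, label=y), num_boost_round=5)
        model_file = str(tmp_path / "model.txt")  # str() for APIs predating pathlib support
        bst.save_model(model_file)
        reloaded = lgb.Booster(model_file=model_file)
        np.testing.assert_allclose(bst.predict(X), reloaded.predict(X))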
def test_predict_with_start_iteration(self):
def test_predict_with_start_iteration():
def inner_test(X, y, params, early_stopping_rounds): def inner_test(X, y, params, early_stopping_rounds):
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
train_data = lgb.Dataset(X_train, label=y_train) train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_test, label=y_test) valid_data = lgb.Dataset(X_test, label=y_test)
booster = lgb.train(params, train_data, num_boost_round=50, early_stopping_rounds=early_stopping_rounds, valid_sets=[valid_data]) booster = lgb.train(params, train_data, num_boost_round=50, early_stopping_rounds=early_stopping_rounds,
valid_sets=[valid_data])
# test that a single predict call over all iterations equals the summed results of predicting with start_iteration and num_iteration # test that a single predict call over all iterations equals the summed results of predicting with start_iteration and num_iteration
all_pred = booster.predict(X, raw_score=True) all_pred = booster.predict(X, raw_score=True)
...@@ -2565,7 +2629,6 @@ class TestEngine(unittest.TestCase): ...@@ -2565,7 +2629,6 @@ class TestEngine(unittest.TestCase):
X, y = load_iris(return_X_y=True) X, y = load_iris(return_X_y=True)
params = { params = {
'objective': 'multiclass', 'objective': 'multiclass',
'metric': 'multi_logloss',
'num_class': 3, 'num_class': 3,
'verbose': -1, 'verbose': -1,
'metric': 'multi_error' 'metric': 'multi_error'
...@@ -2579,7 +2642,6 @@ class TestEngine(unittest.TestCase): ...@@ -2579,7 +2642,6 @@ class TestEngine(unittest.TestCase):
X, y = load_breast_cancer(return_X_y=True) X, y = load_breast_cancer(return_X_y=True)
params = { params = {
'objective': 'binary', 'objective': 'binary',
'metric': 'binary_logloss',
'verbose': -1, 'verbose': -1,
'metric': 'auc' 'metric': 'auc'
} }
...@@ -2588,7 +2650,8 @@ class TestEngine(unittest.TestCase): ...@@ -2588,7 +2650,8 @@ class TestEngine(unittest.TestCase):
inner_test(X, y, params, early_stopping_rounds=5) inner_test(X, y, params, early_stopping_rounds=5)
inner_test(X, y, params, early_stopping_rounds=None) inner_test(X, y, params, early_stopping_rounds=None)
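The removed 'metric' entries above were dead code: in a Python dict literal a repeated key is legal but only the last value survives, so 'multi_logloss' and 'binary_logloss' were silently discarded in favor of the later entries. For example:

    params = {'metric': 'multi_logloss', 'metric': 'multi_error'}
    # The literal keeps only the last duplicate; no error or warning is raised.
    assert params == {'metric': 'multi_error'}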
def test_average_precision_metric(self):
def test_average_precision_metric():
# test against sklearn average precision metric # test against sklearn average precision metric
X, y = load_breast_cancer(return_X_y=True) X, y = load_breast_cancer(return_X_y=True)
params = { params = {
...@@ -2602,15 +2665,16 @@ class TestEngine(unittest.TestCase): ...@@ -2602,15 +2665,16 @@ class TestEngine(unittest.TestCase):
ap = res['training']['average_precision'][-1] ap = res['training']['average_precision'][-1]
pred = est.predict(X) pred = est.predict(X)
sklearn_ap = average_precision_score(y, pred) sklearn_ap = average_precision_score(y, pred)
self.assertAlmostEqual(ap, sklearn_ap) assert ap == pytest.approx(sklearn_ap)
# test that average precision is 1 where model predicts perfectly # test that average precision is 1 where model predicts perfectly
y = y.copy() y = y.copy()
y[:] = 1 y[:] = 1
lgb_X = lgb.Dataset(X, label=y) lgb_X = lgb.Dataset(X, label=y)
lgb.train(params, lgb_X, num_boost_round=1, valid_sets=[lgb_X], evals_result=res) lgb.train(params, lgb_X, num_boost_round=1, valid_sets=[lgb_X], evals_result=res)
self.assertAlmostEqual(res['training']['average_precision'][-1], 1) assert res['training']['average_precision'][-1] == pytest.approx(1)
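As a sanity check on the assertion above: scikit-learn's `average_precision_score` is exactly 1.0 whenever every positive is ranked above every negative, independent of the raw score scale. For example:

    from sklearn.metrics import average_precision_score

    # Perfect ranking -> AP == 1.0 regardless of the score values themselves.
    assert average_precision_score([0, 0, 1, 1], [0.1, 0.2, 0.7, 0.9]) == 1.0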
def test_reset_params_works_with_metric_num_class_and_boosting(self): def test_reset_params_works_with_metric_num_class_and_boosting():
X, y = load_breast_cancer(return_X_y=True) X, y = load_breast_cancer(return_X_y=True)
dataset_params = {"max_bin": 150} dataset_params = {"max_bin": 150}
booster_params = { booster_params = {
...@@ -2628,11 +2692,11 @@ class TestEngine(unittest.TestCase): ...@@ -2628,11 +2692,11 @@ class TestEngine(unittest.TestCase):
) )
expected_params = dict(dataset_params, **booster_params) expected_params = dict(dataset_params, **booster_params)
self.assertDictEqual(bst.params, expected_params) assert bst.params == expected_params
booster_params['bagging_fraction'] += 0.1 booster_params['bagging_fraction'] += 0.1
new_bst = bst.reset_parameter(booster_params) new_bst = bst.reset_parameter(booster_params)
expected_params = dict(dataset_params, **booster_params) expected_params = dict(dataset_params, **booster_params)
self.assertDictEqual(bst.params, expected_params) assert bst.params == expected_params
self.assertDictEqual(new_bst.params, expected_params) assert new_bst.params == expected_params