Unverified Commit 108e80f2 authored by Nikita Titov's avatar Nikita Titov Committed by GitHub
Browse files

[tests] fixed codestyle, removed unused code and added several new checks (#1688)

* break huge lines in sklearn tests

* break huge line in plotting tests

* break huge lines in basic tests

* multiple enhancements in engine tests

* multiple enhancements in sklearn tests

* hotfixes

* break huge lines and use with statement in C API test

* make NDCG test more strict
parent 5d744197
......@@ -75,7 +75,7 @@ def load_from_file(filename, reference):
LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data))
num_feature = ctypes.c_long()
LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature))
print('#data:%d #feature:%d' % (num_data.value, num_feature.value))
print('#data: %d #feature: %d' % (num_data.value, num_feature.value))
return handle
......@@ -86,11 +86,10 @@ def save_to_binary(handle, filename):
def load_from_csr(filename, reference):
data = []
label = []
inp = open(filename, 'r')
for line in inp.readlines():
data.append([float(x) for x in line.split('\t')[1:]])
label.append(float(line.split('\t')[0]))
inp.close()
with open(filename, 'r') as inp:
for line in inp.readlines():
data.append([float(x) for x in line.split('\t')[1:]])
label.append(float(line.split('\t')[0]))
mat = np.array(data)
label = np.array(label, dtype=np.float32)
csr = sparse.csr_matrix(mat)
......@@ -116,18 +115,17 @@ def load_from_csr(filename, reference):
num_feature = ctypes.c_long()
LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature))
LIB.LGBM_DatasetSetField(handle, c_str('label'), c_array(ctypes.c_float, label), len(label), 0)
print('#data:%d #feature:%d' % (num_data.value, num_feature.value))
print('#data: %d #feature: %d' % (num_data.value, num_feature.value))
return handle
def load_from_csc(filename, reference):
data = []
label = []
inp = open(filename, 'r')
for line in inp.readlines():
data.append([float(x) for x in line.split('\t')[1:]])
label.append(float(line.split('\t')[0]))
inp.close()
with open(filename, 'r') as inp:
for line in inp.readlines():
data.append([float(x) for x in line.split('\t')[1:]])
label.append(float(line.split('\t')[0]))
mat = np.array(data)
label = np.array(label, dtype=np.float32)
csr = sparse.csc_matrix(mat)
......@@ -153,18 +151,17 @@ def load_from_csc(filename, reference):
num_feature = ctypes.c_long()
LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature))
LIB.LGBM_DatasetSetField(handle, c_str('label'), c_array(ctypes.c_float, label), len(label), 0)
print('#data:%d #feature:%d' % (num_data.value, num_feature.value))
print('#data: %d #feature: %d' % (num_data.value, num_feature.value))
return handle
def load_from_mat(filename, reference):
data = []
label = []
inp = open(filename, 'r')
for line in inp.readlines():
data.append([float(x) for x in line.split('\t')[1:]])
label.append(float(line.split('\t')[0]))
inp.close()
with open(filename, 'r') as inp:
for line in inp.readlines():
data.append([float(x) for x in line.split('\t')[1:]])
label.append(float(line.split('\t')[0]))
mat = np.array(data)
data = np.array(mat.reshape(mat.size), copy=False)
label = np.array(label, dtype=np.float32)
......@@ -173,8 +170,8 @@ def load_from_mat(filename, reference):
if reference is not None:
ref = reference
LIB.LGBM_DatasetCreateFromMat(data.ctypes.data_as(
ctypes.POINTER(ctypes.c_void_p)),
LIB.LGBM_DatasetCreateFromMat(
data.ctypes.data_as(ctypes.POINTER(ctypes.c_void_p)),
dtype_float64,
mat.shape[0],
mat.shape[1],
......@@ -187,7 +184,7 @@ def load_from_mat(filename, reference):
num_feature = ctypes.c_long()
LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature))
LIB.LGBM_DatasetSetField(handle, c_str('label'), c_array(ctypes.c_float, label), len(label), 0)
print('#data:%d #feature:%d' % (num_data.value, num_feature.value))
print('#data: %d #feature: %d' % (num_data.value, num_feature.value))
return handle
......@@ -196,12 +193,16 @@ def free_dataset(handle):
def test_dataset():
train = load_from_file(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/binary_classification/binary.train'), None)
test = load_from_mat(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/binary_classification/binary.test'), train)
train = load_from_file(os.path.join(os.path.dirname(os.path.realpath(__file__)),
'../../examples/binary_classification/binary.train'), None)
test = load_from_mat(os.path.join(os.path.dirname(os.path.realpath(__file__)),
'../../examples/binary_classification/binary.test'), train)
free_dataset(test)
test = load_from_csr(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/binary_classification/binary.test'), train)
test = load_from_csr(os.path.join(os.path.dirname(os.path.realpath(__file__)),
'../../examples/binary_classification/binary.test'), train)
free_dataset(test)
test = load_from_csc(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/binary_classification/binary.test'), train)
test = load_from_csc(os.path.join(os.path.dirname(os.path.realpath(__file__)),
'../../examples/binary_classification/binary.test'), train)
free_dataset(test)
save_to_binary(train, 'train.binary.bin')
free_dataset(train)
......@@ -210,31 +211,43 @@ def test_dataset():
def test_booster():
train = load_from_mat(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/binary_classification/binary.train'), None)
test = load_from_mat(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/binary_classification/binary.test'), train)
train = load_from_mat(os.path.join(os.path.dirname(os.path.realpath(__file__)),
'../../examples/binary_classification/binary.train'), None)
test = load_from_mat(os.path.join(os.path.dirname(os.path.realpath(__file__)),
'../../examples/binary_classification/binary.test'), train)
booster = ctypes.c_void_p()
LIB.LGBM_BoosterCreate(train, c_str("app=binary metric=auc num_leaves=31 verbose=0"), ctypes.byref(booster))
LIB.LGBM_BoosterCreate(
train,
c_str("app=binary metric=auc num_leaves=31 verbose=0"),
ctypes.byref(booster))
LIB.LGBM_BoosterAddValidData(booster, test)
is_finished = ctypes.c_int(0)
for i in range(1, 101):
LIB.LGBM_BoosterUpdateOneIter(booster, ctypes.byref(is_finished))
result = np.array([0.0], dtype=np.float64)
out_len = ctypes.c_ulong(0)
LIB.LGBM_BoosterGetEval(booster, 0, ctypes.byref(out_len), result.ctypes.data_as(ctypes.POINTER(ctypes.c_double)))
LIB.LGBM_BoosterGetEval(
booster,
0,
ctypes.byref(out_len),
result.ctypes.data_as(ctypes.POINTER(ctypes.c_double)))
if i % 10 == 0:
print('%d Iteration test AUC %f' % (i, result[0]))
print('%d iteration test AUC %f' % (i, result[0]))
LIB.LGBM_BoosterSaveModel(booster, 0, -1, c_str('model.txt'))
LIB.LGBM_BoosterFree(booster)
free_dataset(train)
free_dataset(test)
booster2 = ctypes.c_void_p()
num_total_model = ctypes.c_long()
LIB.LGBM_BoosterCreateFromModelfile(c_str('model.txt'), ctypes.byref(num_total_model), ctypes.byref(booster2))
LIB.LGBM_BoosterCreateFromModelfile(
c_str('model.txt'),
ctypes.byref(num_total_model),
ctypes.byref(booster2))
data = []
inp = open(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/binary_classification/binary.test'), 'r')
for line in inp.readlines():
data.append([float(x) for x in line.split('\t')[1:]])
inp.close()
with open(os.path.join(os.path.dirname(os.path.realpath(__file__)),
'../../examples/binary_classification/binary.test'), 'r') as inp:
for line in inp.readlines():
data.append([float(x) for x in line.split('\t')[1:]])
mat = np.array(data)
preb = np.zeros(mat.shape[0], dtype=np.float64)
num_preb = ctypes.c_long()
......@@ -253,7 +266,8 @@ def test_booster():
preb.ctypes.data_as(ctypes.POINTER(ctypes.c_double)))
LIB.LGBM_BoosterPredictForFile(
booster2,
c_str(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/binary_classification/binary.test')),
c_str(os.path.join(os.path.dirname(os.path.realpath(__file__)),
'../../examples/binary_classification/binary.test')),
0,
0,
50,
......
......@@ -13,7 +13,8 @@ from sklearn.model_selection import train_test_split
class TestBasic(unittest.TestCase):
def test(self):
X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(True), test_size=0.1, random_state=2)
X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(True),
test_size=0.1, random_state=2)
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = train_data.create_valid(X_test, label=y_test)
......@@ -80,8 +81,10 @@ class TestBasic(unittest.TestCase):
valid_data.construct()
def test_subset_group(self):
X_train, y_train = load_svmlight_file(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/lambdarank/rank.train'))
q_train = np.loadtxt(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/lambdarank/rank.train.query'))
X_train, y_train = load_svmlight_file(os.path.join(os.path.dirname(os.path.realpath(__file__)),
'../../examples/lambdarank/rank.train'))
q_train = np.loadtxt(os.path.join(os.path.dirname(os.path.realpath(__file__)),
'../../examples/lambdarank/rank.train.query'))
lgb_train = lgb.Dataset(X_train, y_train, group=q_train)
self.assertEqual(len(lgb_train.get_group()), 201)
subset = lgb_train.subset(list(lgb.compat.range_(10))).construct()
......
......@@ -10,7 +10,7 @@ import random
import numpy as np
from sklearn.datasets import (load_boston, load_breast_cancer, load_digits,
load_iris, load_svmlight_file)
from sklearn.metrics import log_loss, mean_absolute_error, mean_squared_error
from sklearn.metrics import log_loss, mean_absolute_error, mean_squared_error, roc_auc_score
from sklearn.model_selection import train_test_split, TimeSeriesSplit, GroupKFold
from scipy.sparse import csr_matrix
......@@ -25,7 +25,6 @@ def multi_logloss(y_true, y_pred):
class TestEngine(unittest.TestCase):
def test_binary(self):
X, y = load_breast_cancer(True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
......@@ -145,6 +144,9 @@ class TestEngine(unittest.TestCase):
evals_result=evals_result)
pred = gbm.predict(X_train)
np.testing.assert_almost_equal(pred, y)
ret = roc_auc_score(y_train, pred)
self.assertGreater(ret, 0.999)
self.assertAlmostEqual(evals_result['valid_0']['auc'][-1], ret, places=5)
def test_missing_value_handle_zero(self):
x = [0, 1, 2, 3, 4, 5, 6, 7, np.nan]
......@@ -174,6 +176,9 @@ class TestEngine(unittest.TestCase):
evals_result=evals_result)
pred = gbm.predict(X_train)
np.testing.assert_almost_equal(pred, y)
ret = roc_auc_score(y_train, pred)
self.assertGreater(ret, 0.999)
self.assertAlmostEqual(evals_result['valid_0']['auc'][-1], ret, places=5)
def test_missing_value_handle_none(self):
x = [0, 1, 2, 3, 4, 5, 6, 7, np.nan]
......@@ -204,6 +209,9 @@ class TestEngine(unittest.TestCase):
pred = gbm.predict(X_train)
self.assertAlmostEqual(pred[0], pred[1], places=5)
self.assertAlmostEqual(pred[-1], pred[0], places=5)
ret = roc_auc_score(y_train, pred)
self.assertGreater(ret, 0.83)
self.assertAlmostEqual(evals_result['valid_0']['auc'][-1], ret, places=5)
def test_categorical_handle(self):
x = [0, 1, 2, 3, 4, 5, 6, 7]
......@@ -238,8 +246,11 @@ class TestEngine(unittest.TestCase):
evals_result=evals_result)
pred = gbm.predict(X_train)
np.testing.assert_almost_equal(pred, y)
ret = roc_auc_score(y_train, pred)
self.assertGreater(ret, 0.999)
self.assertAlmostEqual(evals_result['valid_0']['auc'][-1], ret, places=5)
def test_categorical_handle2(self):
def test_categorical_handle_na(self):
x = [0, np.nan, 0, np.nan, 0, np.nan]
y = [0, 1, 0, 1, 0, 1]
......@@ -272,6 +283,9 @@ class TestEngine(unittest.TestCase):
evals_result=evals_result)
pred = gbm.predict(X_train)
np.testing.assert_almost_equal(pred, y)
ret = roc_auc_score(y_train, pred)
self.assertGreater(ret, 0.999)
self.assertAlmostEqual(evals_result['valid_0']['auc'][-1], ret, places=5)
def test_multiclass(self):
X, y = load_digits(10, True)
......@@ -331,20 +345,19 @@ class TestEngine(unittest.TestCase):
'verbose': -1
}
lgb_train = lgb.Dataset(X_train, y_train, params=params)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, params=params)
evals_result = {}
gbm = lgb.train(params, lgb_train,
num_boost_round=50,
valid_sets=lgb_eval,
verbose_eval=False,
evals_result=evals_result)
num_boost_round=50)
pred_parameter = {"pred_early_stop": True, "pred_early_stop_freq": 5, "pred_early_stop_margin": 1.5}
pred_parameter = {"pred_early_stop": True,
"pred_early_stop_freq": 5,
"pred_early_stop_margin": 1.5}
ret = multi_logloss(y_test, gbm.predict(X_test, **pred_parameter))
self.assertLess(ret, 0.8)
self.assertGreater(ret, 0.5) # loss will be higher than when evaluating the full model
pred_parameter = {"pred_early_stop": True, "pred_early_stop_freq": 5, "pred_early_stop_margin": 5.5}
pred_parameter = {"pred_early_stop": True,
"pred_early_stop_freq": 5,
"pred_early_stop_margin": 5.5}
ret = multi_logloss(y_test, gbm.predict(X_test, **pred_parameter))
self.assertLess(ret, 0.2)
......@@ -486,23 +499,23 @@ class TestEngine(unittest.TestCase):
X_train, _, y_train, _ = train_test_split(X, y, test_size=0.1, random_state=42)
params = {'verbose': -1}
lgb_train = lgb.Dataset(X_train, y_train)
feature_names = ['f_' + str(i) for i in range(13)]
feature_names = ['f_' + str(i) for i in range(X_train.shape[-1])]
gbm = lgb.train(params, lgb_train, num_boost_round=5, feature_name=feature_names)
self.assertListEqual(feature_names, gbm.feature_name())
# test feature_names with whitespaces
feature_names_with_space = ['f ' + str(i) for i in range(13)]
feature_names_with_space = ['f ' + str(i) for i in range(X_train.shape[-1])]
gbm = lgb.train(params, lgb_train, num_boost_round=5, feature_name=feature_names_with_space)
self.assertListEqual(feature_names, gbm.feature_name())
def test_save_load_copy_pickle(self):
def test_template(init_model=None, return_model=False):
X, y = load_boston(True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
params = {
'objective': 'regression',
'metric': 'l2',
'verbose': -1
}
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
lgb_train = lgb.Dataset(X_train, y_train)
gbm_template = lgb.train(params, lgb_train, num_boost_round=10, init_model=init_model)
return gbm_template if return_model else mean_squared_error(y_test, gbm_template.predict(X_test))
......@@ -576,7 +589,11 @@ class TestEngine(unittest.TestCase):
tmp_dat_train = tmp_dat.subset(np.arange(80))
tmp_dat_val = tmp_dat.subset(np.arange(80, 100)).subset(np.arange(18))
params = {'objective': 'regression_l2', 'metric': 'rmse'}
gbm = lgb.train(params, tmp_dat_train, num_boost_round=20, valid_sets=[tmp_dat_train, tmp_dat_val])
evals_result = {}
gbm = lgb.train(params, tmp_dat_train, num_boost_round=20,
valid_sets=[tmp_dat_train, tmp_dat_val], evals_result=evals_result)
self.assertEqual(len(evals_result['training']['rmse']), 20)
self.assertEqual(len(evals_result['valid_1']['rmse']), 20)
def test_contribs(self):
X, y = load_breast_cancer(True)
......@@ -587,15 +604,11 @@ class TestEngine(unittest.TestCase):
'verbose': -1,
}
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
evals_result = {}
gbm = lgb.train(params, lgb_train,
num_boost_round=20,
valid_sets=lgb_eval,
verbose_eval=False,
evals_result=evals_result)
num_boost_round=20)
self.assertLess(np.linalg.norm(gbm.predict(X_test, raw_score=True) - np.sum(gbm.predict(X_test, pred_contrib=True), axis=1)), 1e-4)
self.assertLess(np.linalg.norm(gbm.predict(X_test, raw_score=True)
- np.sum(gbm.predict(X_test, pred_contrib=True), axis=1)), 1e-4)
def test_sliced_data(self):
def train_and_get_predictions(features, labels):
......@@ -605,13 +618,13 @@ class TestEngine(unittest.TestCase):
'verbose': -1,
'min_data': 5,
}
lgbm_model = lgb.train(
gbm = lgb.train(
params=lgb_params,
train_set=dataset,
num_boost_round=10,
)
predictions = lgbm_model.predict(features)
return predictions
return gbm.predict(features)
num_samples = 100
features = np.random.rand(num_samples, 5)
positive_samples = int(num_samples * 0.25)
......@@ -649,10 +662,10 @@ class TestEngine(unittest.TestCase):
def test_monotone_constraint(self):
def is_increasing(y):
return np.count_nonzero(np.diff(y) < 0.0) == 0
return (np.diff(y) >= 0.0).all()
def is_decreasing(y):
return np.count_nonzero(np.diff(y) > 0.0) == 0
return (np.diff(y) <= 0.0).all()
def is_correctly_constrained(learner):
n = 200
......@@ -697,8 +710,7 @@ class TestEngine(unittest.TestCase):
'min_data': 10
}
lgb_train = lgb.Dataset(X_train, y_train)
gbm = lgb.train(params, lgb_train,
num_boost_round=20)
gbm = lgb.train(params, lgb_train, num_boost_round=20)
err_pred = log_loss(y_test, gbm.predict(X_test))
new_gbm = gbm.refit(X_test, y_test)
new_err_pred = log_loss(y_test, new_gbm.predict(X_test))
......@@ -716,8 +728,7 @@ class TestEngine(unittest.TestCase):
'boost_from_average': False
}
lgb_train = lgb.Dataset(X, y)
gbm = lgb.train(params, lgb_train,
num_boost_round=20)
gbm = lgb.train(params, lgb_train, num_boost_round=20)
pred = gbm.predict(X)
pred_mean = pred.mean()
self.assertGreater(pred_mean, 20)
......@@ -734,8 +745,7 @@ class TestEngine(unittest.TestCase):
'boost_from_average': False
}
lgb_train = lgb.Dataset(X, y)
gbm = lgb.train(params, lgb_train,
num_boost_round=40)
gbm = lgb.train(params, lgb_train, num_boost_round=40)
pred = gbm.predict(X)
pred_mean = pred.mean()
self.assertGreater(pred_mean, 18)
......
......@@ -17,7 +17,8 @@ if GRAPHVIZ_INSTALLED:
class TestBasic(unittest.TestCase):
def setUp(self):
self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(*load_breast_cancer(True), test_size=0.1, random_state=1)
self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(*load_breast_cancer(True),
test_size=0.1, random_state=1)
self.train_data = lgb.Dataset(self.X_train, self.y_train)
self.params = {
"objective": "binary",
......
......@@ -57,10 +57,14 @@ class TestSklearn(unittest.TestCase):
self.assertAlmostEqual(ret, gbm.evals_result_['valid_0']['multi_logloss'][gbm.best_iteration_ - 1], places=5)
def test_lambdarank(self):
X_train, y_train = load_svmlight_file(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/lambdarank/rank.train'))
X_test, y_test = load_svmlight_file(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/lambdarank/rank.test'))
q_train = np.loadtxt(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/lambdarank/rank.train.query'))
q_test = np.loadtxt(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/lambdarank/rank.test.query'))
X_train, y_train = load_svmlight_file(os.path.join(os.path.dirname(os.path.realpath(__file__)),
'../../examples/lambdarank/rank.train'))
X_test, y_test = load_svmlight_file(os.path.join(os.path.dirname(os.path.realpath(__file__)),
'../../examples/lambdarank/rank.test'))
q_train = np.loadtxt(os.path.join(os.path.dirname(os.path.realpath(__file__)),
'../../examples/lambdarank/rank.train.query'))
q_test = np.loadtxt(os.path.join(os.path.dirname(os.path.realpath(__file__)),
'../../examples/lambdarank/rank.test.query'))
gbm = lgb.LGBMRanker()
gbm.fit(X_train, y_train, group=q_train, eval_set=[(X_test, y_test)],
eval_group=[q_test], eval_at=[1, 3], early_stopping_rounds=5, verbose=False,
......@@ -74,6 +78,7 @@ class TestSklearn(unittest.TestCase):
grad = (y_pred - y_true)
hess = np.ones(len(y_true))
return grad, hess
X, y = load_boston(True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
gbm = lgb.LGBMRegressor(n_estimators=50, silent=True, objective=objective_ls)
......@@ -88,10 +93,11 @@ class TestSklearn(unittest.TestCase):
grad = y_pred - y_true
hess = y_pred * (1.0 - y_pred)
return grad, hess
X, y = load_digits(2, True)
def binary_error(y_test, y_pred):
return np.mean([int(p > 0.5) != y for y, p in zip(y_test, y_pred)])
X, y = load_digits(2, True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
gbm = lgb.LGBMClassifier(n_estimators=50, silent=True, objective=logregobj)
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False)
......
Markdown is supported
Attach a file by drag & drop or click to upload.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.