Unverified commit 108e80f2, authored by Nikita Titov, committed by GitHub
Browse files

[tests] fixed codestyle, removed unused code and added several new checks (#1688)

* break huge lines in sklearn tests

* break huge line in plotting tests

* break huge lines in basic tests

* multiple enhancements in engine tests

* multiple enhancements in sklearn tests

* hotfixes

* break huge lines and use a `with` statement in C API test

* make NDCG test more strict
parent 5d744197
...@@ -75,7 +75,7 @@ def load_from_file(filename, reference): ...@@ -75,7 +75,7 @@ def load_from_file(filename, reference):
LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data)) LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data))
num_feature = ctypes.c_long() num_feature = ctypes.c_long()
LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature)) LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature))
print('#data:%d #feature:%d' % (num_data.value, num_feature.value)) print('#data: %d #feature: %d' % (num_data.value, num_feature.value))
return handle return handle
...@@ -86,11 +86,10 @@ def save_to_binary(handle, filename): ...@@ -86,11 +86,10 @@ def save_to_binary(handle, filename):
def load_from_csr(filename, reference): def load_from_csr(filename, reference):
data = [] data = []
label = [] label = []
inp = open(filename, 'r') with open(filename, 'r') as inp:
for line in inp.readlines(): for line in inp.readlines():
data.append([float(x) for x in line.split('\t')[1:]]) data.append([float(x) for x in line.split('\t')[1:]])
label.append(float(line.split('\t')[0])) label.append(float(line.split('\t')[0]))
inp.close()
mat = np.array(data) mat = np.array(data)
label = np.array(label, dtype=np.float32) label = np.array(label, dtype=np.float32)
csr = sparse.csr_matrix(mat) csr = sparse.csr_matrix(mat)
...@@ -116,18 +115,17 @@ def load_from_csr(filename, reference): ...@@ -116,18 +115,17 @@ def load_from_csr(filename, reference):
num_feature = ctypes.c_long() num_feature = ctypes.c_long()
LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature)) LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature))
LIB.LGBM_DatasetSetField(handle, c_str('label'), c_array(ctypes.c_float, label), len(label), 0) LIB.LGBM_DatasetSetField(handle, c_str('label'), c_array(ctypes.c_float, label), len(label), 0)
print('#data:%d #feature:%d' % (num_data.value, num_feature.value)) print('#data: %d #feature: %d' % (num_data.value, num_feature.value))
return handle return handle
def load_from_csc(filename, reference): def load_from_csc(filename, reference):
data = [] data = []
label = [] label = []
inp = open(filename, 'r') with open(filename, 'r') as inp:
for line in inp.readlines(): for line in inp.readlines():
data.append([float(x) for x in line.split('\t')[1:]]) data.append([float(x) for x in line.split('\t')[1:]])
label.append(float(line.split('\t')[0])) label.append(float(line.split('\t')[0]))
inp.close()
mat = np.array(data) mat = np.array(data)
label = np.array(label, dtype=np.float32) label = np.array(label, dtype=np.float32)
csr = sparse.csc_matrix(mat) csr = sparse.csc_matrix(mat)
...@@ -153,18 +151,17 @@ def load_from_csc(filename, reference): ...@@ -153,18 +151,17 @@ def load_from_csc(filename, reference):
num_feature = ctypes.c_long() num_feature = ctypes.c_long()
LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature)) LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature))
LIB.LGBM_DatasetSetField(handle, c_str('label'), c_array(ctypes.c_float, label), len(label), 0) LIB.LGBM_DatasetSetField(handle, c_str('label'), c_array(ctypes.c_float, label), len(label), 0)
print('#data:%d #feature:%d' % (num_data.value, num_feature.value)) print('#data: %d #feature: %d' % (num_data.value, num_feature.value))
return handle return handle
def load_from_mat(filename, reference): def load_from_mat(filename, reference):
data = [] data = []
label = [] label = []
inp = open(filename, 'r') with open(filename, 'r') as inp:
for line in inp.readlines(): for line in inp.readlines():
data.append([float(x) for x in line.split('\t')[1:]]) data.append([float(x) for x in line.split('\t')[1:]])
label.append(float(line.split('\t')[0])) label.append(float(line.split('\t')[0]))
inp.close()
mat = np.array(data) mat = np.array(data)
data = np.array(mat.reshape(mat.size), copy=False) data = np.array(mat.reshape(mat.size), copy=False)
label = np.array(label, dtype=np.float32) label = np.array(label, dtype=np.float32)
...@@ -173,8 +170,8 @@ def load_from_mat(filename, reference): ...@@ -173,8 +170,8 @@ def load_from_mat(filename, reference):
if reference is not None: if reference is not None:
ref = reference ref = reference
LIB.LGBM_DatasetCreateFromMat(data.ctypes.data_as( LIB.LGBM_DatasetCreateFromMat(
ctypes.POINTER(ctypes.c_void_p)), data.ctypes.data_as(ctypes.POINTER(ctypes.c_void_p)),
dtype_float64, dtype_float64,
mat.shape[0], mat.shape[0],
mat.shape[1], mat.shape[1],
...@@ -187,7 +184,7 @@ def load_from_mat(filename, reference): ...@@ -187,7 +184,7 @@ def load_from_mat(filename, reference):
num_feature = ctypes.c_long() num_feature = ctypes.c_long()
LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature)) LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature))
LIB.LGBM_DatasetSetField(handle, c_str('label'), c_array(ctypes.c_float, label), len(label), 0) LIB.LGBM_DatasetSetField(handle, c_str('label'), c_array(ctypes.c_float, label), len(label), 0)
print('#data:%d #feature:%d' % (num_data.value, num_feature.value)) print('#data: %d #feature: %d' % (num_data.value, num_feature.value))
return handle return handle
...@@ -196,12 +193,16 @@ def free_dataset(handle): ...@@ -196,12 +193,16 @@ def free_dataset(handle):
def test_dataset(): def test_dataset():
train = load_from_file(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/binary_classification/binary.train'), None) train = load_from_file(os.path.join(os.path.dirname(os.path.realpath(__file__)),
test = load_from_mat(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/binary_classification/binary.test'), train) '../../examples/binary_classification/binary.train'), None)
test = load_from_mat(os.path.join(os.path.dirname(os.path.realpath(__file__)),
'../../examples/binary_classification/binary.test'), train)
free_dataset(test) free_dataset(test)
test = load_from_csr(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/binary_classification/binary.test'), train) test = load_from_csr(os.path.join(os.path.dirname(os.path.realpath(__file__)),
'../../examples/binary_classification/binary.test'), train)
free_dataset(test) free_dataset(test)
test = load_from_csc(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/binary_classification/binary.test'), train) test = load_from_csc(os.path.join(os.path.dirname(os.path.realpath(__file__)),
'../../examples/binary_classification/binary.test'), train)
free_dataset(test) free_dataset(test)
save_to_binary(train, 'train.binary.bin') save_to_binary(train, 'train.binary.bin')
free_dataset(train) free_dataset(train)
...@@ -210,31 +211,43 @@ def test_dataset(): ...@@ -210,31 +211,43 @@ def test_dataset():
def test_booster(): def test_booster():
train = load_from_mat(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/binary_classification/binary.train'), None) train = load_from_mat(os.path.join(os.path.dirname(os.path.realpath(__file__)),
test = load_from_mat(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/binary_classification/binary.test'), train) '../../examples/binary_classification/binary.train'), None)
test = load_from_mat(os.path.join(os.path.dirname(os.path.realpath(__file__)),
'../../examples/binary_classification/binary.test'), train)
booster = ctypes.c_void_p() booster = ctypes.c_void_p()
LIB.LGBM_BoosterCreate(train, c_str("app=binary metric=auc num_leaves=31 verbose=0"), ctypes.byref(booster)) LIB.LGBM_BoosterCreate(
train,
c_str("app=binary metric=auc num_leaves=31 verbose=0"),
ctypes.byref(booster))
LIB.LGBM_BoosterAddValidData(booster, test) LIB.LGBM_BoosterAddValidData(booster, test)
is_finished = ctypes.c_int(0) is_finished = ctypes.c_int(0)
for i in range(1, 101): for i in range(1, 101):
LIB.LGBM_BoosterUpdateOneIter(booster, ctypes.byref(is_finished)) LIB.LGBM_BoosterUpdateOneIter(booster, ctypes.byref(is_finished))
result = np.array([0.0], dtype=np.float64) result = np.array([0.0], dtype=np.float64)
out_len = ctypes.c_ulong(0) out_len = ctypes.c_ulong(0)
LIB.LGBM_BoosterGetEval(booster, 0, ctypes.byref(out_len), result.ctypes.data_as(ctypes.POINTER(ctypes.c_double))) LIB.LGBM_BoosterGetEval(
booster,
0,
ctypes.byref(out_len),
result.ctypes.data_as(ctypes.POINTER(ctypes.c_double)))
if i % 10 == 0: if i % 10 == 0:
print('%d Iteration test AUC %f' % (i, result[0])) print('%d iteration test AUC %f' % (i, result[0]))
LIB.LGBM_BoosterSaveModel(booster, 0, -1, c_str('model.txt')) LIB.LGBM_BoosterSaveModel(booster, 0, -1, c_str('model.txt'))
LIB.LGBM_BoosterFree(booster) LIB.LGBM_BoosterFree(booster)
free_dataset(train) free_dataset(train)
free_dataset(test) free_dataset(test)
booster2 = ctypes.c_void_p() booster2 = ctypes.c_void_p()
num_total_model = ctypes.c_long() num_total_model = ctypes.c_long()
LIB.LGBM_BoosterCreateFromModelfile(c_str('model.txt'), ctypes.byref(num_total_model), ctypes.byref(booster2)) LIB.LGBM_BoosterCreateFromModelfile(
c_str('model.txt'),
ctypes.byref(num_total_model),
ctypes.byref(booster2))
data = [] data = []
inp = open(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/binary_classification/binary.test'), 'r') with open(os.path.join(os.path.dirname(os.path.realpath(__file__)),
'../../examples/binary_classification/binary.test'), 'r') as inp:
for line in inp.readlines(): for line in inp.readlines():
data.append([float(x) for x in line.split('\t')[1:]]) data.append([float(x) for x in line.split('\t')[1:]])
inp.close()
mat = np.array(data) mat = np.array(data)
preb = np.zeros(mat.shape[0], dtype=np.float64) preb = np.zeros(mat.shape[0], dtype=np.float64)
num_preb = ctypes.c_long() num_preb = ctypes.c_long()
...@@ -253,7 +266,8 @@ def test_booster(): ...@@ -253,7 +266,8 @@ def test_booster():
preb.ctypes.data_as(ctypes.POINTER(ctypes.c_double))) preb.ctypes.data_as(ctypes.POINTER(ctypes.c_double)))
LIB.LGBM_BoosterPredictForFile( LIB.LGBM_BoosterPredictForFile(
booster2, booster2,
c_str(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/binary_classification/binary.test')), c_str(os.path.join(os.path.dirname(os.path.realpath(__file__)),
'../../examples/binary_classification/binary.test')),
0, 0,
0, 0,
50, 50,
......
...@@ -13,7 +13,8 @@ from sklearn.model_selection import train_test_split ...@@ -13,7 +13,8 @@ from sklearn.model_selection import train_test_split
class TestBasic(unittest.TestCase): class TestBasic(unittest.TestCase):
def test(self): def test(self):
X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(True), test_size=0.1, random_state=2) X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(True),
test_size=0.1, random_state=2)
train_data = lgb.Dataset(X_train, label=y_train) train_data = lgb.Dataset(X_train, label=y_train)
valid_data = train_data.create_valid(X_test, label=y_test) valid_data = train_data.create_valid(X_test, label=y_test)
...@@ -80,8 +81,10 @@ class TestBasic(unittest.TestCase): ...@@ -80,8 +81,10 @@ class TestBasic(unittest.TestCase):
valid_data.construct() valid_data.construct()
def test_subset_group(self): def test_subset_group(self):
X_train, y_train = load_svmlight_file(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/lambdarank/rank.train')) X_train, y_train = load_svmlight_file(os.path.join(os.path.dirname(os.path.realpath(__file__)),
q_train = np.loadtxt(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/lambdarank/rank.train.query')) '../../examples/lambdarank/rank.train'))
q_train = np.loadtxt(os.path.join(os.path.dirname(os.path.realpath(__file__)),
'../../examples/lambdarank/rank.train.query'))
lgb_train = lgb.Dataset(X_train, y_train, group=q_train) lgb_train = lgb.Dataset(X_train, y_train, group=q_train)
self.assertEqual(len(lgb_train.get_group()), 201) self.assertEqual(len(lgb_train.get_group()), 201)
subset = lgb_train.subset(list(lgb.compat.range_(10))).construct() subset = lgb_train.subset(list(lgb.compat.range_(10))).construct()
......
...@@ -10,7 +10,7 @@ import random ...@@ -10,7 +10,7 @@ import random
import numpy as np import numpy as np
from sklearn.datasets import (load_boston, load_breast_cancer, load_digits, from sklearn.datasets import (load_boston, load_breast_cancer, load_digits,
load_iris, load_svmlight_file) load_iris, load_svmlight_file)
from sklearn.metrics import log_loss, mean_absolute_error, mean_squared_error from sklearn.metrics import log_loss, mean_absolute_error, mean_squared_error, roc_auc_score
from sklearn.model_selection import train_test_split, TimeSeriesSplit, GroupKFold from sklearn.model_selection import train_test_split, TimeSeriesSplit, GroupKFold
from scipy.sparse import csr_matrix from scipy.sparse import csr_matrix
...@@ -25,7 +25,6 @@ def multi_logloss(y_true, y_pred): ...@@ -25,7 +25,6 @@ def multi_logloss(y_true, y_pred):
class TestEngine(unittest.TestCase): class TestEngine(unittest.TestCase):
def test_binary(self): def test_binary(self):
X, y = load_breast_cancer(True) X, y = load_breast_cancer(True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
...@@ -145,6 +144,9 @@ class TestEngine(unittest.TestCase): ...@@ -145,6 +144,9 @@ class TestEngine(unittest.TestCase):
evals_result=evals_result) evals_result=evals_result)
pred = gbm.predict(X_train) pred = gbm.predict(X_train)
np.testing.assert_almost_equal(pred, y) np.testing.assert_almost_equal(pred, y)
ret = roc_auc_score(y_train, pred)
self.assertGreater(ret, 0.999)
self.assertAlmostEqual(evals_result['valid_0']['auc'][-1], ret, places=5)
def test_missing_value_handle_zero(self): def test_missing_value_handle_zero(self):
x = [0, 1, 2, 3, 4, 5, 6, 7, np.nan] x = [0, 1, 2, 3, 4, 5, 6, 7, np.nan]
...@@ -174,6 +176,9 @@ class TestEngine(unittest.TestCase): ...@@ -174,6 +176,9 @@ class TestEngine(unittest.TestCase):
evals_result=evals_result) evals_result=evals_result)
pred = gbm.predict(X_train) pred = gbm.predict(X_train)
np.testing.assert_almost_equal(pred, y) np.testing.assert_almost_equal(pred, y)
ret = roc_auc_score(y_train, pred)
self.assertGreater(ret, 0.999)
self.assertAlmostEqual(evals_result['valid_0']['auc'][-1], ret, places=5)
def test_missing_value_handle_none(self): def test_missing_value_handle_none(self):
x = [0, 1, 2, 3, 4, 5, 6, 7, np.nan] x = [0, 1, 2, 3, 4, 5, 6, 7, np.nan]
...@@ -204,6 +209,9 @@ class TestEngine(unittest.TestCase): ...@@ -204,6 +209,9 @@ class TestEngine(unittest.TestCase):
pred = gbm.predict(X_train) pred = gbm.predict(X_train)
self.assertAlmostEqual(pred[0], pred[1], places=5) self.assertAlmostEqual(pred[0], pred[1], places=5)
self.assertAlmostEqual(pred[-1], pred[0], places=5) self.assertAlmostEqual(pred[-1], pred[0], places=5)
ret = roc_auc_score(y_train, pred)
self.assertGreater(ret, 0.83)
self.assertAlmostEqual(evals_result['valid_0']['auc'][-1], ret, places=5)
def test_categorical_handle(self): def test_categorical_handle(self):
x = [0, 1, 2, 3, 4, 5, 6, 7] x = [0, 1, 2, 3, 4, 5, 6, 7]
...@@ -238,8 +246,11 @@ class TestEngine(unittest.TestCase): ...@@ -238,8 +246,11 @@ class TestEngine(unittest.TestCase):
evals_result=evals_result) evals_result=evals_result)
pred = gbm.predict(X_train) pred = gbm.predict(X_train)
np.testing.assert_almost_equal(pred, y) np.testing.assert_almost_equal(pred, y)
ret = roc_auc_score(y_train, pred)
self.assertGreater(ret, 0.999)
self.assertAlmostEqual(evals_result['valid_0']['auc'][-1], ret, places=5)
def test_categorical_handle2(self): def test_categorical_handle_na(self):
x = [0, np.nan, 0, np.nan, 0, np.nan] x = [0, np.nan, 0, np.nan, 0, np.nan]
y = [0, 1, 0, 1, 0, 1] y = [0, 1, 0, 1, 0, 1]
...@@ -272,6 +283,9 @@ class TestEngine(unittest.TestCase): ...@@ -272,6 +283,9 @@ class TestEngine(unittest.TestCase):
evals_result=evals_result) evals_result=evals_result)
pred = gbm.predict(X_train) pred = gbm.predict(X_train)
np.testing.assert_almost_equal(pred, y) np.testing.assert_almost_equal(pred, y)
ret = roc_auc_score(y_train, pred)
self.assertGreater(ret, 0.999)
self.assertAlmostEqual(evals_result['valid_0']['auc'][-1], ret, places=5)
def test_multiclass(self): def test_multiclass(self):
X, y = load_digits(10, True) X, y = load_digits(10, True)
...@@ -331,20 +345,19 @@ class TestEngine(unittest.TestCase): ...@@ -331,20 +345,19 @@ class TestEngine(unittest.TestCase):
'verbose': -1 'verbose': -1
} }
lgb_train = lgb.Dataset(X_train, y_train, params=params) lgb_train = lgb.Dataset(X_train, y_train, params=params)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, params=params)
evals_result = {}
gbm = lgb.train(params, lgb_train, gbm = lgb.train(params, lgb_train,
num_boost_round=50, num_boost_round=50)
valid_sets=lgb_eval,
verbose_eval=False,
evals_result=evals_result)
pred_parameter = {"pred_early_stop": True, "pred_early_stop_freq": 5, "pred_early_stop_margin": 1.5} pred_parameter = {"pred_early_stop": True,
"pred_early_stop_freq": 5,
"pred_early_stop_margin": 1.5}
ret = multi_logloss(y_test, gbm.predict(X_test, **pred_parameter)) ret = multi_logloss(y_test, gbm.predict(X_test, **pred_parameter))
self.assertLess(ret, 0.8) self.assertLess(ret, 0.8)
self.assertGreater(ret, 0.5) # loss will be higher than when evaluating the full model self.assertGreater(ret, 0.5) # loss will be higher than when evaluating the full model
pred_parameter = {"pred_early_stop": True, "pred_early_stop_freq": 5, "pred_early_stop_margin": 5.5} pred_parameter = {"pred_early_stop": True,
"pred_early_stop_freq": 5,
"pred_early_stop_margin": 5.5}
ret = multi_logloss(y_test, gbm.predict(X_test, **pred_parameter)) ret = multi_logloss(y_test, gbm.predict(X_test, **pred_parameter))
self.assertLess(ret, 0.2) self.assertLess(ret, 0.2)
...@@ -486,23 +499,23 @@ class TestEngine(unittest.TestCase): ...@@ -486,23 +499,23 @@ class TestEngine(unittest.TestCase):
X_train, _, y_train, _ = train_test_split(X, y, test_size=0.1, random_state=42) X_train, _, y_train, _ = train_test_split(X, y, test_size=0.1, random_state=42)
params = {'verbose': -1} params = {'verbose': -1}
lgb_train = lgb.Dataset(X_train, y_train) lgb_train = lgb.Dataset(X_train, y_train)
feature_names = ['f_' + str(i) for i in range(13)] feature_names = ['f_' + str(i) for i in range(X_train.shape[-1])]
gbm = lgb.train(params, lgb_train, num_boost_round=5, feature_name=feature_names) gbm = lgb.train(params, lgb_train, num_boost_round=5, feature_name=feature_names)
self.assertListEqual(feature_names, gbm.feature_name()) self.assertListEqual(feature_names, gbm.feature_name())
# test feature_names with whitespaces # test feature_names with whitespaces
feature_names_with_space = ['f ' + str(i) for i in range(13)] feature_names_with_space = ['f ' + str(i) for i in range(X_train.shape[-1])]
gbm = lgb.train(params, lgb_train, num_boost_round=5, feature_name=feature_names_with_space) gbm = lgb.train(params, lgb_train, num_boost_round=5, feature_name=feature_names_with_space)
self.assertListEqual(feature_names, gbm.feature_name()) self.assertListEqual(feature_names, gbm.feature_name())
def test_save_load_copy_pickle(self): def test_save_load_copy_pickle(self):
def test_template(init_model=None, return_model=False): def test_template(init_model=None, return_model=False):
X, y = load_boston(True) X, y = load_boston(True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
params = { params = {
'objective': 'regression', 'objective': 'regression',
'metric': 'l2', 'metric': 'l2',
'verbose': -1 'verbose': -1
} }
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
lgb_train = lgb.Dataset(X_train, y_train) lgb_train = lgb.Dataset(X_train, y_train)
gbm_template = lgb.train(params, lgb_train, num_boost_round=10, init_model=init_model) gbm_template = lgb.train(params, lgb_train, num_boost_round=10, init_model=init_model)
return gbm_template if return_model else mean_squared_error(y_test, gbm_template.predict(X_test)) return gbm_template if return_model else mean_squared_error(y_test, gbm_template.predict(X_test))
...@@ -576,7 +589,11 @@ class TestEngine(unittest.TestCase): ...@@ -576,7 +589,11 @@ class TestEngine(unittest.TestCase):
tmp_dat_train = tmp_dat.subset(np.arange(80)) tmp_dat_train = tmp_dat.subset(np.arange(80))
tmp_dat_val = tmp_dat.subset(np.arange(80, 100)).subset(np.arange(18)) tmp_dat_val = tmp_dat.subset(np.arange(80, 100)).subset(np.arange(18))
params = {'objective': 'regression_l2', 'metric': 'rmse'} params = {'objective': 'regression_l2', 'metric': 'rmse'}
gbm = lgb.train(params, tmp_dat_train, num_boost_round=20, valid_sets=[tmp_dat_train, tmp_dat_val]) evals_result = {}
gbm = lgb.train(params, tmp_dat_train, num_boost_round=20,
valid_sets=[tmp_dat_train, tmp_dat_val], evals_result=evals_result)
self.assertEqual(len(evals_result['training']['rmse']), 20)
self.assertEqual(len(evals_result['valid_1']['rmse']), 20)
def test_contribs(self): def test_contribs(self):
X, y = load_breast_cancer(True) X, y = load_breast_cancer(True)
...@@ -587,15 +604,11 @@ class TestEngine(unittest.TestCase): ...@@ -587,15 +604,11 @@ class TestEngine(unittest.TestCase):
'verbose': -1, 'verbose': -1,
} }
lgb_train = lgb.Dataset(X_train, y_train) lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
evals_result = {}
gbm = lgb.train(params, lgb_train, gbm = lgb.train(params, lgb_train,
num_boost_round=20, num_boost_round=20)
valid_sets=lgb_eval,
verbose_eval=False,
evals_result=evals_result)
self.assertLess(np.linalg.norm(gbm.predict(X_test, raw_score=True) - np.sum(gbm.predict(X_test, pred_contrib=True), axis=1)), 1e-4) self.assertLess(np.linalg.norm(gbm.predict(X_test, raw_score=True)
- np.sum(gbm.predict(X_test, pred_contrib=True), axis=1)), 1e-4)
def test_sliced_data(self): def test_sliced_data(self):
def train_and_get_predictions(features, labels): def train_and_get_predictions(features, labels):
...@@ -605,13 +618,13 @@ class TestEngine(unittest.TestCase): ...@@ -605,13 +618,13 @@ class TestEngine(unittest.TestCase):
'verbose': -1, 'verbose': -1,
'min_data': 5, 'min_data': 5,
} }
lgbm_model = lgb.train( gbm = lgb.train(
params=lgb_params, params=lgb_params,
train_set=dataset, train_set=dataset,
num_boost_round=10, num_boost_round=10,
) )
predictions = lgbm_model.predict(features) return gbm.predict(features)
return predictions
num_samples = 100 num_samples = 100
features = np.random.rand(num_samples, 5) features = np.random.rand(num_samples, 5)
positive_samples = int(num_samples * 0.25) positive_samples = int(num_samples * 0.25)
...@@ -649,10 +662,10 @@ class TestEngine(unittest.TestCase): ...@@ -649,10 +662,10 @@ class TestEngine(unittest.TestCase):
def test_monotone_constraint(self): def test_monotone_constraint(self):
def is_increasing(y): def is_increasing(y):
return np.count_nonzero(np.diff(y) < 0.0) == 0 return (np.diff(y) >= 0.0).all()
def is_decreasing(y): def is_decreasing(y):
return np.count_nonzero(np.diff(y) > 0.0) == 0 return (np.diff(y) <= 0.0).all()
def is_correctly_constrained(learner): def is_correctly_constrained(learner):
n = 200 n = 200
...@@ -697,8 +710,7 @@ class TestEngine(unittest.TestCase): ...@@ -697,8 +710,7 @@ class TestEngine(unittest.TestCase):
'min_data': 10 'min_data': 10
} }
lgb_train = lgb.Dataset(X_train, y_train) lgb_train = lgb.Dataset(X_train, y_train)
gbm = lgb.train(params, lgb_train, gbm = lgb.train(params, lgb_train, num_boost_round=20)
num_boost_round=20)
err_pred = log_loss(y_test, gbm.predict(X_test)) err_pred = log_loss(y_test, gbm.predict(X_test))
new_gbm = gbm.refit(X_test, y_test) new_gbm = gbm.refit(X_test, y_test)
new_err_pred = log_loss(y_test, new_gbm.predict(X_test)) new_err_pred = log_loss(y_test, new_gbm.predict(X_test))
...@@ -716,8 +728,7 @@ class TestEngine(unittest.TestCase): ...@@ -716,8 +728,7 @@ class TestEngine(unittest.TestCase):
'boost_from_average': False 'boost_from_average': False
} }
lgb_train = lgb.Dataset(X, y) lgb_train = lgb.Dataset(X, y)
gbm = lgb.train(params, lgb_train, gbm = lgb.train(params, lgb_train, num_boost_round=20)
num_boost_round=20)
pred = gbm.predict(X) pred = gbm.predict(X)
pred_mean = pred.mean() pred_mean = pred.mean()
self.assertGreater(pred_mean, 20) self.assertGreater(pred_mean, 20)
...@@ -734,8 +745,7 @@ class TestEngine(unittest.TestCase): ...@@ -734,8 +745,7 @@ class TestEngine(unittest.TestCase):
'boost_from_average': False 'boost_from_average': False
} }
lgb_train = lgb.Dataset(X, y) lgb_train = lgb.Dataset(X, y)
gbm = lgb.train(params, lgb_train, gbm = lgb.train(params, lgb_train, num_boost_round=40)
num_boost_round=40)
pred = gbm.predict(X) pred = gbm.predict(X)
pred_mean = pred.mean() pred_mean = pred.mean()
self.assertGreater(pred_mean, 18) self.assertGreater(pred_mean, 18)
......
...@@ -17,7 +17,8 @@ if GRAPHVIZ_INSTALLED: ...@@ -17,7 +17,8 @@ if GRAPHVIZ_INSTALLED:
class TestBasic(unittest.TestCase): class TestBasic(unittest.TestCase):
def setUp(self): def setUp(self):
self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(*load_breast_cancer(True), test_size=0.1, random_state=1) self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(*load_breast_cancer(True),
test_size=0.1, random_state=1)
self.train_data = lgb.Dataset(self.X_train, self.y_train) self.train_data = lgb.Dataset(self.X_train, self.y_train)
self.params = { self.params = {
"objective": "binary", "objective": "binary",
......
...@@ -57,10 +57,14 @@ class TestSklearn(unittest.TestCase): ...@@ -57,10 +57,14 @@ class TestSklearn(unittest.TestCase):
self.assertAlmostEqual(ret, gbm.evals_result_['valid_0']['multi_logloss'][gbm.best_iteration_ - 1], places=5) self.assertAlmostEqual(ret, gbm.evals_result_['valid_0']['multi_logloss'][gbm.best_iteration_ - 1], places=5)
def test_lambdarank(self): def test_lambdarank(self):
X_train, y_train = load_svmlight_file(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/lambdarank/rank.train')) X_train, y_train = load_svmlight_file(os.path.join(os.path.dirname(os.path.realpath(__file__)),
X_test, y_test = load_svmlight_file(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/lambdarank/rank.test')) '../../examples/lambdarank/rank.train'))
q_train = np.loadtxt(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/lambdarank/rank.train.query')) X_test, y_test = load_svmlight_file(os.path.join(os.path.dirname(os.path.realpath(__file__)),
q_test = np.loadtxt(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/lambdarank/rank.test.query')) '../../examples/lambdarank/rank.test'))
q_train = np.loadtxt(os.path.join(os.path.dirname(os.path.realpath(__file__)),
'../../examples/lambdarank/rank.train.query'))
q_test = np.loadtxt(os.path.join(os.path.dirname(os.path.realpath(__file__)),
'../../examples/lambdarank/rank.test.query'))
gbm = lgb.LGBMRanker() gbm = lgb.LGBMRanker()
gbm.fit(X_train, y_train, group=q_train, eval_set=[(X_test, y_test)], gbm.fit(X_train, y_train, group=q_train, eval_set=[(X_test, y_test)],
eval_group=[q_test], eval_at=[1, 3], early_stopping_rounds=5, verbose=False, eval_group=[q_test], eval_at=[1, 3], early_stopping_rounds=5, verbose=False,
...@@ -74,6 +78,7 @@ class TestSklearn(unittest.TestCase): ...@@ -74,6 +78,7 @@ class TestSklearn(unittest.TestCase):
grad = (y_pred - y_true) grad = (y_pred - y_true)
hess = np.ones(len(y_true)) hess = np.ones(len(y_true))
return grad, hess return grad, hess
X, y = load_boston(True) X, y = load_boston(True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
gbm = lgb.LGBMRegressor(n_estimators=50, silent=True, objective=objective_ls) gbm = lgb.LGBMRegressor(n_estimators=50, silent=True, objective=objective_ls)
...@@ -88,10 +93,11 @@ class TestSklearn(unittest.TestCase): ...@@ -88,10 +93,11 @@ class TestSklearn(unittest.TestCase):
grad = y_pred - y_true grad = y_pred - y_true
hess = y_pred * (1.0 - y_pred) hess = y_pred * (1.0 - y_pred)
return grad, hess return grad, hess
X, y = load_digits(2, True)
def binary_error(y_test, y_pred): def binary_error(y_test, y_pred):
return np.mean([int(p > 0.5) != y for y, p in zip(y_test, y_pred)]) return np.mean([int(p > 0.5) != y for y, p in zip(y_test, y_pred)])
X, y = load_digits(2, True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
gbm = lgb.LGBMClassifier(n_estimators=50, silent=True, objective=logregobj) gbm = lgb.LGBMClassifier(n_estimators=50, silent=True, objective=logregobj)
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False) gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment