Unverified Commit 1f1dc452 authored by Nikita Titov, committed by GitHub

[tests][python] refined python tests (#2483)

* speed up tests

* more updates

* fixed pylint

* updated tests

* Update test_sklearn.py

* test that indices are sorted internally
parent 00d1e693
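
For context on the last commit-message bullet, a minimal sketch (not code from this diff) of what "indices are sorted internally" means for Dataset.subset: passing unsorted row indices should behave the same as passing pre-sorted ones, because LightGBM sorts them on its side.

import numpy as np
import lightgbm as lgb

X = np.random.random((50, 2))
ds = lgb.Dataset(X, label=np.random.random(50), free_raw_data=False)
idx = np.random.choice(np.arange(50), 30, replace=False)  # deliberately unsorted
sub_unsorted = ds.subset(idx).construct()
sub_sorted = ds.subset(np.sort(idx)).construct()
assert sub_unsorted.num_data() == sub_sorted.num_data() == 30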
......@@ -69,7 +69,8 @@ def load_from_file(filename, reference):
LIB.LGBM_DatasetCreateFromFile(
c_str(filename),
c_str('max_bin=15'),
ref, ctypes.byref(handle))
ref,
ctypes.byref(handle))
print(LIB.LGBM_GetLastError())
num_data = ctypes.c_long()
LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data))
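
These tests drive LightGBM's C API directly through ctypes; c_str is a helper defined elsewhere in this test module. A likely minimal definition, stated as an assumption rather than copied from the diff:

import ctypes

def c_str(string):
    # LGBM_* entry points take C strings, so encode the Python str to bytes
    return ctypes.c_char_p(string.encode('utf-8'))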
......@@ -88,8 +89,9 @@ def load_from_csr(filename, reference):
label = []
with open(filename, 'r') as inp:
for line in inp.readlines():
data.append([float(x) for x in line.split('\t')[1:]])
label.append(float(line.split('\t')[0]))
values = line.split('\t')
data.append([float(x) for x in values[1:]])
label.append(float(values[0]))
mat = np.array(data)
label = np.array(label, dtype=np.float32)
csr = sparse.csr_matrix(mat)
......@@ -124,8 +126,9 @@ def load_from_csc(filename, reference):
label = []
with open(filename, 'r') as inp:
for line in inp.readlines():
data.append([float(x) for x in line.split('\t')[1:]])
label.append(float(line.split('\t')[0]))
values = line.split('\t')
data.append([float(x) for x in values[1:]])
label.append(float(values[0]))
mat = np.array(data)
label = np.array(label, dtype=np.float32)
csr = sparse.csc_matrix(mat)
......@@ -160,8 +163,9 @@ def load_from_mat(filename, reference):
label = []
with open(filename, 'r') as inp:
for line in inp.readlines():
data.append([float(x) for x in line.split('\t')[1:]])
label.append(float(line.split('\t')[0]))
values = line.split('\t')
data.append([float(x) for x in values[1:]])
label.append(float(values[0]))
mat = np.array(data)
data = np.array(mat.reshape(mat.size), copy=False)
label = np.array(label, dtype=np.float32)
......@@ -222,7 +226,7 @@ def test_booster():
ctypes.byref(booster))
LIB.LGBM_BoosterAddValidData(booster, test)
is_finished = ctypes.c_int(0)
for i in range(1, 101):
for i in range(1, 51):
LIB.LGBM_BoosterUpdateOneIter(booster, ctypes.byref(is_finished))
result = np.array([0.0], dtype=np.float64)
out_len = ctypes.c_ulong(0)
......@@ -260,7 +264,7 @@ def test_booster():
mat.shape[1],
1,
1,
50,
25,
c_str(''),
ctypes.byref(num_preb),
preb.ctypes.data_as(ctypes.POINTER(ctypes.c_double)))
......@@ -270,7 +274,7 @@ def test_booster():
'../../examples/binary_classification/binary.test')),
0,
0,
50,
25,
c_str(''),
c_str('preb.txt'))
LIB.LGBM_BoosterFree(booster2)
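
For reference on the bare positional arguments above, a hedged annotation of the LGBM_BoosterPredictForFile call (parameter names per the C API header of that era, stated as an assumption); the 50 → 25 changes halve num_iteration to match the halved training loop:

LIB.LGBM_BoosterPredictForFile(booster2,
                               c_str(data_file),    # path to the test data (assumed name)
                               0,                   # data_has_header
                               0,                   # predict_type: normal prediction
                               25,                  # num_iteration to use
                               c_str(''),           # extra parameter string
                               c_str('preb.txt'))   # result file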
......@@ -31,13 +31,13 @@ class TestBasic(unittest.TestCase):
bst = lgb.Booster(params, train_data)
bst.add_valid(valid_data, "valid_1")
for i in range(30):
for i in range(20):
bst.update()
if i % 10 == 0:
print(bst.eval_train(), bst.eval_valid())
self.assertEqual(bst.current_iteration(), 30)
self.assertEqual(bst.num_trees(), 30)
self.assertEqual(bst.current_iteration(), 20)
self.assertEqual(bst.num_trees(), 20)
self.assertEqual(bst.num_model_per_iteration(), 1)
bst.save_model("model.txt")
......@@ -48,26 +48,20 @@ class TestBasic(unittest.TestCase):
dump_svmlight_file(X_test, y_test, f)
pred_from_file = bst.predict(tname)
os.remove(tname)
self.assertEqual(len(pred_from_matr), len(pred_from_file))
for preds in zip(pred_from_matr, pred_from_file):
self.assertAlmostEqual(*preds, places=15)
np.testing.assert_allclose(pred_from_matr, pred_from_file)
# check saved model persistence
bst = lgb.Booster(params, model_file="model.txt")
os.remove("model.txt")
pred_from_model_file = bst.predict(X_test)
self.assertEqual(len(pred_from_matr), len(pred_from_model_file))
for preds in zip(pred_from_matr, pred_from_model_file):
# we need to check the consistency of model file here, so test for exact equal
self.assertEqual(*preds)
np.testing.assert_array_equal(pred_from_matr, pred_from_model_file)
# check that early stopping is working. Make it stop very early, so the scores should be very close to zero
pred_parameter = {"pred_early_stop": True, "pred_early_stop_freq": 5, "pred_early_stop_margin": 1.5}
pred_early_stopping = bst.predict(X_test, **pred_parameter)
self.assertEqual(len(pred_from_matr), len(pred_early_stopping))
for preds in zip(pred_early_stopping, pred_from_matr):
# scores likely to be different, but prediction should still be the same
self.assertEqual(preds[0] > 0, preds[1] > 0)
np.testing.assert_array_equal(np.sign(pred_from_matr), np.sign(pred_early_stopping))
# test that shape is checked during prediction
bad_X_test = X_test[:, 1:]
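
The per-element loops above were replaced with NumPy's vectorized test helpers. For reference: assert_allclose compares with a relative tolerance (default rtol=1e-7), while assert_array_equal demands exact element-wise equality, which is what the model-file consistency check needs.

import numpy as np

a = np.array([1.0, 2.0])
np.testing.assert_allclose(a, a + 1e-9)       # passes: within default rtol=1e-7
np.testing.assert_array_equal(a, a.copy())    # passes: exact equality
np.testing.assert_array_equal(np.sign(a), np.sign(a * 2.0))  # same sign pattern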
......@@ -97,7 +91,6 @@ class TestBasic(unittest.TestCase):
train_data = lgb.Dataset(X_train, label=y_train, params={"bin_construct_sample_cnt": 100})
valid_data = train_data.create_valid(X_test, label=y_test, params={"bin_construct_sample_cnt": 100})
train_data.construct()
valid_data.construct()
......@@ -108,23 +101,23 @@ class TestBasic(unittest.TestCase):
'../../examples/lambdarank/rank.train.query'))
lgb_train = lgb.Dataset(X_train, y_train, group=q_train)
self.assertEqual(len(lgb_train.get_group()), 201)
subset = lgb_train.subset(list(lgb.compat.range_(10))).construct()
subset = lgb_train.subset(list(range(10))).construct()
subset_group = subset.get_group()
self.assertEqual(len(subset_group), 2)
self.assertEqual(subset_group[0], 1)
self.assertEqual(subset_group[1], 9)
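
For readers unfamiliar with ranking datasets: group holds per-query document counts and get_group returns those counts, so a subset taken across query boundaries yields new counts, as asserted above. A toy sketch with assumed data, not from the diff:

import numpy as np
import lightgbm as lgb

X = np.random.random((6, 3))
ds = lgb.Dataset(X, label=np.zeros(6), group=[2, 4]).construct()  # two queries: 2 and 4 docs
assert list(ds.get_group()) == [2, 4]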
def test_add_features_throws_if_num_data_unequal(self):
X1 = np.random.random((1000, 1))
X2 = np.random.random((100, 1))
X1 = np.random.random((100, 1))
X2 = np.random.random((10, 1))
d1 = lgb.Dataset(X1).construct()
d2 = lgb.Dataset(X2).construct()
with self.assertRaises(lgb.basic.LightGBMError):
d1.add_features_from(d2)
def test_add_features_throws_if_datasets_unconstructed(self):
X1 = np.random.random((1000, 1))
X2 = np.random.random((1000, 1))
X1 = np.random.random((100, 1))
X2 = np.random.random((100, 1))
with self.assertRaises(ValueError):
d1 = lgb.Dataset(X1)
d2 = lgb.Dataset(X2)
......@@ -139,7 +132,8 @@ class TestBasic(unittest.TestCase):
d1.add_features_from(d2)
def test_add_features_equal_data_on_alternating_used_unused(self):
X = np.random.random((1000, 5))
self.maxDiff = None
X = np.random.random((100, 5))
X[:, [1, 3]] = 0
names = ['col_%d' % i for i in range(5)]
for j in range(1, 5):
......@@ -162,7 +156,8 @@ class TestBasic(unittest.TestCase):
self.assertEqual(dtxt, d1txt)
def test_add_features_same_booster_behaviour(self):
X = np.random.random((1000, 5))
self.maxDiff = None
X = np.random.random((100, 5))
X[:, [1, 3]] = 0
names = ['col_%d' % i for i in range(5)]
for j in range(1, 5):
......@@ -170,7 +165,7 @@ class TestBasic(unittest.TestCase):
d2 = lgb.Dataset(X[:, j:], feature_name=names[j:]).construct()
d1.add_features_from(d2)
d = lgb.Dataset(X, feature_name=names).construct()
y = np.random.random(1000)
y = np.random.random(100)
d1.set_label(y)
d.set_label(y)
b1 = lgb.Booster(train_set=d1)
......@@ -191,7 +186,7 @@ class TestBasic(unittest.TestCase):
self.assertEqual(dtxt, d1txt)
def test_get_feature_penalty_and_monotone_constraints(self):
X = np.random.random((1000, 1))
X = np.random.random((100, 1))
d = lgb.Dataset(X, params={'feature_penalty': [0.5],
'monotone_constraints': [1]}).construct()
np.testing.assert_allclose(d.get_feature_penalty(), [0.5])
......@@ -201,7 +196,7 @@ class TestBasic(unittest.TestCase):
self.assertIsNone(d.get_monotone_constraints())
def test_add_features_feature_penalty(self):
X = np.random.random((1000, 2))
X = np.random.random((100, 2))
test_cases = [
(None, None, None),
([0.5], None, [0.5, 1]),
......@@ -220,7 +215,7 @@ class TestBasic(unittest.TestCase):
np.testing.assert_allclose(actual, expected)
def test_add_features_monotone_types(self):
X = np.random.random((1000, 2))
X = np.random.random((100, 2))
test_cases = [
(None, None, None),
([1], None, [1, 0]),
......@@ -239,9 +234,9 @@ class TestBasic(unittest.TestCase):
np.testing.assert_array_equal(actual, expected)
def test_cegb_affects_behavior(self):
X = np.random.random((1000, 5))
X = np.random.random((100, 5))
X[:, [1, 3]] = 0
y = np.random.random(1000)
y = np.random.random(100)
names = ['col_%d' % i for i in range(5)]
ds = lgb.Dataset(X, feature_name=names).construct()
ds.set_label(y)
......@@ -269,9 +264,9 @@ class TestBasic(unittest.TestCase):
self.assertNotEqual(basetxt, casetxt)
def test_cegb_scaling_equalities(self):
X = np.random.random((1000, 5))
X = np.random.random((100, 5))
X[:, [1, 3]] = 0
y = np.random.random(1000)
y = np.random.random(100)
names = ['col_%d' % i for i in range(5)]
ds = lgb.Dataset(X, feature_name=names).construct()
ds.set_label(y)
......@@ -298,9 +293,9 @@ class TestBasic(unittest.TestCase):
with tempfile.NamedTemporaryFile() as f:
p2name = f.name
booster2.save_model(p2name)
self.maxDiff = None
with open(p2name, 'rt') as f:
p2txt = f.read()
self.maxDiff = None
self.assertEqual(p1txt, p2txt)
def test_consistent_state_for_dataset_fields(self):
......
......@@ -21,6 +21,13 @@ except ImportError:
import pickle
decreasing_generator = itertools.count(0, -1)
def dummy_obj(preds, train_data):
return np.ones(preds.shape), np.ones(preds.shape)
def multi_logloss(y_true, y_pred):
return np.mean([-math.log(y_pred[i][y]) for i, y in enumerate(y_true)])
......@@ -32,6 +39,14 @@ def top_k_error(y_true, y_pred, k):
return 1 - np.mean((y_pred[np.arange(len(y_true)), y_true] > max_rest))
def constant_metric(preds, train_data):
return ('error', 0.0, False)
def decreasing_metric(preds, train_data):
return ('decreasing_metric', next(decreasing_generator), False)
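
dummy_obj and constant_metric above follow LightGBM's custom objective and metric signatures: fobj returns per-sample (grad, hess), and feval returns (name, value, is_higher_better). A hedged usage sketch:

import numpy as np
import lightgbm as lgb

X, y = np.random.random((100, 3)), np.random.random(100)
train_set = lgb.Dataset(X, y)
booster = lgb.train({'verbose': -1}, train_set, num_boost_round=2,
                    valid_sets=[train_set],
                    fobj=dummy_obj,          # custom objective: all-ones grad/hess
                    feval=constant_metric)   # custom metric: ('error', 0.0, False)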
class TestEngine(unittest.TestCase):
def test_binary(self):
X, y = load_breast_cancer(True)
......@@ -51,7 +66,7 @@ class TestEngine(unittest.TestCase):
verbose_eval=False,
evals_result=evals_result)
ret = log_loss(y_test, gbm.predict(X_test))
self.assertLess(ret, 0.15)
self.assertLess(ret, 0.11)
self.assertEqual(len(evals_result['valid_0']['binary_logloss']), 50)
self.assertAlmostEqual(evals_result['valid_0']['binary_logloss'][-1], ret, places=5)
......@@ -77,7 +92,7 @@ class TestEngine(unittest.TestCase):
verbose_eval=False,
evals_result=evals_result)
ret = log_loss(y_test, gbm.predict(X_test))
self.assertLess(ret, 0.25)
self.assertLess(ret, 0.19)
self.assertAlmostEqual(evals_result['valid_0']['binary_logloss'][-1], ret, places=5)
def test_regression(self):
......@@ -96,13 +111,13 @@ class TestEngine(unittest.TestCase):
verbose_eval=False,
evals_result=evals_result)
ret = mean_squared_error(y_test, gbm.predict(X_test))
self.assertLess(ret, 16)
self.assertLess(ret, 7)
self.assertAlmostEqual(evals_result['valid_0']['l2'][-1], ret, places=5)
def test_missing_value_handle(self):
X_train = np.zeros((1000, 1))
y_train = np.zeros(1000)
trues = random.sample(range(1000), 200)
X_train = np.zeros((100, 1))
y_train = np.zeros(100)
trues = random.sample(range(100), 20)
for idx in trues:
X_train[idx, 0] = np.nan
y_train[idx] = 1
......@@ -118,7 +133,7 @@ class TestEngine(unittest.TestCase):
gbm = lgb.train(params, lgb_train,
num_boost_round=20,
valid_sets=lgb_eval,
verbose_eval=True,
verbose_eval=False,
evals_result=evals_result)
ret = mean_squared_error(y_train, gbm.predict(X_train))
self.assertLess(ret, 0.005)
......@@ -148,7 +163,7 @@ class TestEngine(unittest.TestCase):
gbm = lgb.train(params, lgb_train,
num_boost_round=1,
valid_sets=lgb_eval,
verbose_eval=True,
verbose_eval=False,
evals_result=evals_result)
pred = gbm.predict(X_train)
np.testing.assert_allclose(pred, y)
......@@ -180,7 +195,7 @@ class TestEngine(unittest.TestCase):
gbm = lgb.train(params, lgb_train,
num_boost_round=1,
valid_sets=lgb_eval,
verbose_eval=True,
verbose_eval=False,
evals_result=evals_result)
pred = gbm.predict(X_train)
np.testing.assert_allclose(pred, y)
......@@ -212,11 +227,11 @@ class TestEngine(unittest.TestCase):
gbm = lgb.train(params, lgb_train,
num_boost_round=1,
valid_sets=lgb_eval,
verbose_eval=True,
verbose_eval=False,
evals_result=evals_result)
pred = gbm.predict(X_train)
self.assertAlmostEqual(pred[0], pred[1], places=5)
self.assertAlmostEqual(pred[-1], pred[0], places=5)
self.assertAlmostEqual(pred[0], pred[1])
self.assertAlmostEqual(pred[-1], pred[0])
ret = roc_auc_score(y_train, pred)
self.assertGreater(ret, 0.83)
self.assertAlmostEqual(evals_result['valid_0']['auc'][-1], ret, places=5)
......@@ -250,7 +265,7 @@ class TestEngine(unittest.TestCase):
gbm = lgb.train(params, lgb_train,
num_boost_round=1,
valid_sets=lgb_eval,
verbose_eval=True,
verbose_eval=False,
evals_result=evals_result)
pred = gbm.predict(X_train)
np.testing.assert_allclose(pred, y)
......@@ -287,7 +302,7 @@ class TestEngine(unittest.TestCase):
gbm = lgb.train(params, lgb_train,
num_boost_round=1,
valid_sets=lgb_eval,
verbose_eval=True,
verbose_eval=False,
evals_result=evals_result)
pred = gbm.predict(X_train)
np.testing.assert_allclose(pred, y)
......@@ -313,7 +328,7 @@ class TestEngine(unittest.TestCase):
verbose_eval=False,
evals_result=evals_result)
ret = multi_logloss(y_test, gbm.predict(X_test))
self.assertLess(ret, 0.2)
self.assertLess(ret, 0.15)
self.assertAlmostEqual(evals_result['valid_0']['multi_logloss'][-1], ret, places=5)
def test_multiclass_rf(self):
......@@ -335,12 +350,12 @@ class TestEngine(unittest.TestCase):
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, params=params)
evals_result = {}
gbm = lgb.train(params, lgb_train,
num_boost_round=100,
num_boost_round=50,
valid_sets=lgb_eval,
verbose_eval=False,
evals_result=evals_result)
ret = multi_logloss(y_test, gbm.predict(X_test))
self.assertLess(ret, 0.4)
self.assertLess(ret, 0.23)
self.assertAlmostEqual(evals_result['valid_0']['multi_logloss'][-1], ret, places=5)
def test_multiclass_prediction_early_stopping(self):
......@@ -361,7 +376,7 @@ class TestEngine(unittest.TestCase):
"pred_early_stop_margin": 1.5}
ret = multi_logloss(y_test, gbm.predict(X_test, **pred_parameter))
self.assertLess(ret, 0.8)
self.assertGreater(ret, 0.5) # loss will be higher than when evaluating the full model
self.assertGreater(ret, 0.6) # loss will be higher than when evaluating the full model
pred_parameter = {"pred_early_stop": True,
"pred_early_stop_freq": 5,
......@@ -436,11 +451,12 @@ class TestEngine(unittest.TestCase):
self.assertIn('binary_logloss', gbm.best_score[valid_set_name])
# early stopping occurs
gbm = lgb.train(params, lgb_train,
num_boost_round=40,
valid_sets=lgb_eval,
valid_names=valid_set_name,
verbose_eval=False,
early_stopping_rounds=5)
self.assertLessEqual(gbm.best_iteration, 100)
self.assertLessEqual(gbm.best_iteration, 31)
self.assertIn(valid_set_name, gbm.best_score)
self.assertIn('binary_logloss', gbm.best_score[valid_set_name])
......@@ -463,14 +479,13 @@ class TestEngine(unittest.TestCase):
valid_sets=lgb_eval,
verbose_eval=False,
# test custom eval metrics
feval=(lambda p, d: ('mae', mean_absolute_error(p, d.get_label()), False)),
feval=(lambda p, d: ('custom_mae', mean_absolute_error(p, d.get_label()), False)),
evals_result=evals_result,
init_model='model.txt')
ret = mean_absolute_error(y_test, gbm.predict(X_test))
self.assertLess(ret, 3.5)
self.assertLess(ret, 2.0)
self.assertAlmostEqual(evals_result['valid_0']['l1'][-1], ret, places=5)
for l1, mae in zip(evals_result['valid_0']['l1'], evals_result['valid_0']['mae']):
self.assertAlmostEqual(l1, mae, places=5)
np.testing.assert_allclose(evals_result['valid_0']['l1'], evals_result['valid_0']['custom_mae'])
os.remove(model_name)
def test_continue_train_dart(self):
......@@ -493,7 +508,7 @@ class TestEngine(unittest.TestCase):
evals_result=evals_result,
init_model=init_gbm)
ret = mean_absolute_error(y_test, gbm.predict(X_test))
self.assertLess(ret, 2.5)
self.assertLess(ret, 2.0)
self.assertAlmostEqual(evals_result['valid_0']['l1'][-1], ret, places=5)
def test_continue_train_multiclass(self):
......@@ -516,12 +531,11 @@ class TestEngine(unittest.TestCase):
evals_result=evals_result,
init_model=init_gbm)
ret = multi_logloss(y_test, gbm.predict(X_test))
self.assertLess(ret, 1.5)
self.assertLess(ret, 0.1)
self.assertAlmostEqual(evals_result['valid_0']['multi_logloss'][-1], ret, places=5)
def test_cv(self):
X, y = load_boston(True)
X_train, _, y_train, _ = train_test_split(X, y, test_size=0.1, random_state=42)
X_train, y_train = load_boston(True)
params = {'verbose': -1}
lgb_train = lgb.Dataset(X_train, y_train)
# shuffle = False, override metric in params
......@@ -580,8 +594,7 @@ class TestEngine(unittest.TestCase):
np.testing.assert_allclose(cv_res_lambda['ndcg@3-mean'], cv_res_lambda_obj['ndcg@3-mean'])
def test_feature_name(self):
X, y = load_boston(True)
X_train, _, y_train, _ = train_test_split(X, y, test_size=0.1, random_state=42)
X_train, y_train = load_boston(True)
params = {'verbose': -1}
lgb_train = lgb.Dataset(X_train, y_train)
feature_names = ['f_' + str(i) for i in range(X_train.shape[-1])]
......@@ -593,7 +606,7 @@ class TestEngine(unittest.TestCase):
self.assertListEqual(feature_names, gbm.feature_name())
def test_save_load_copy_pickle(self):
def test_template(init_model=None, return_model=False):
def train_and_predict(init_model=None, return_model=False):
X, y = load_boston(True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
params = {
......@@ -604,22 +617,23 @@ class TestEngine(unittest.TestCase):
lgb_train = lgb.Dataset(X_train, y_train)
gbm_template = lgb.train(params, lgb_train, num_boost_round=10, init_model=init_model)
return gbm_template if return_model else mean_squared_error(y_test, gbm_template.predict(X_test))
gbm = test_template(return_model=True)
ret_origin = test_template(init_model=gbm)
gbm = train_and_predict(return_model=True)
ret_origin = train_and_predict(init_model=gbm)
other_ret = []
gbm.save_model('lgb.model')
other_ret.append(test_template(init_model='lgb.model'))
other_ret.append(train_and_predict(init_model='lgb.model'))
gbm_load = lgb.Booster(model_file='lgb.model')
other_ret.append(test_template(init_model=gbm_load))
other_ret.append(test_template(init_model=copy.copy(gbm)))
other_ret.append(test_template(init_model=copy.deepcopy(gbm)))
other_ret.append(train_and_predict(init_model=gbm_load))
other_ret.append(train_and_predict(init_model=copy.copy(gbm)))
other_ret.append(train_and_predict(init_model=copy.deepcopy(gbm)))
with open('lgb.pkl', 'wb') as f:
pickle.dump(gbm, f)
with open('lgb.pkl', 'rb') as f:
gbm_pickle = pickle.load(f)
other_ret.append(test_template(init_model=gbm_pickle))
other_ret.append(train_and_predict(init_model=gbm_pickle))
gbm_pickles = pickle.loads(pickle.dumps(gbm))
other_ret.append(test_template(init_model=gbm_pickles))
other_ret.append(train_and_predict(init_model=gbm_pickles))
for ret in other_ret:
self.assertAlmostEqual(ret_origin, ret, places=5)
......@@ -670,16 +684,15 @@ class TestEngine(unittest.TestCase):
gbm3.save_model('categorical.model')
gbm4 = lgb.Booster(model_file='categorical.model')
pred4 = gbm4.predict(X_test)
self.assertListEqual(lgb_train.categorical_feature, ['A', 'B', 'C', 'D'])
model_str = gbm4.model_to_string()
gbm4.model_from_string(model_str, False)
pred5 = gbm4.predict(X_test)
gbm5 = lgb.Booster(model_str=model_str)
pred6 = gbm5.predict(X_test)
lgb_train = lgb.Dataset(X, y)
gbm6 = lgb.train(params, lgb_train, num_boost_round=10, categorical_feature=['E'])
gbm6 = lgb.train(params, lgb_train, num_boost_round=10, categorical_feature=['A', 'B', 'C', 'D', 'E'])
pred7 = gbm6.predict(X_test)
self.assertListEqual(lgb_train.categorical_feature, ['E'])
self.assertListEqual(lgb_train.categorical_feature, ['A', 'B', 'C', 'D', 'E'])
lgb_train = lgb.Dataset(X, y)
gbm7 = lgb.train(params, lgb_train, num_boost_round=10, categorical_feature=[])
pred8 = gbm7.predict(X_test)
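
A hedged sketch of declaring categorical features at training time with pandas (column names assumed, mirroring the test above):

import numpy as np
import pandas as pd
import lightgbm as lgb

X = pd.DataFrame({'A': pd.Categorical(np.random.choice(['x', 'y', 'z'], 100)),
                  'B': np.random.random(100)})
y = np.random.randint(0, 2, 100)
ds = lgb.Dataset(X, label=y)
booster = lgb.train({'objective': 'binary', 'verbose': -1}, ds,
                    num_boost_round=5, categorical_feature=['A'])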
......@@ -746,7 +759,8 @@ class TestEngine(unittest.TestCase):
params = {'objective': 'regression_l2', 'metric': 'rmse'}
evals_result = {}
gbm = lgb.train(params, tmp_dat_train, num_boost_round=20,
valid_sets=[tmp_dat_train, tmp_dat_val], evals_result=evals_result)
valid_sets=[tmp_dat_train, tmp_dat_val],
verbose_eval=False, evals_result=evals_result)
self.assertEqual(len(evals_result['training']['rmse']), 20)
self.assertEqual(len(evals_result['valid_1']['rmse']), 20)
......@@ -759,8 +773,7 @@ class TestEngine(unittest.TestCase):
'verbose': -1,
}
lgb_train = lgb.Dataset(X_train, y_train)
gbm = lgb.train(params, lgb_train,
num_boost_round=20)
gbm = lgb.train(params, lgb_train, num_boost_round=20)
self.assertLess(np.linalg.norm(gbm.predict(X_test, raw_score=True)
- np.sum(gbm.predict(X_test, pred_contrib=True), axis=1)), 1e-4)
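
The assertion above relies on the SHAP-style property of pred_contrib output: one contribution per feature plus a final bias column, summing (up to numerical error) to the raw score. Restated with the names from the test above:

contrib = gbm.predict(X_test, pred_contrib=True)  # shape (n_samples, n_features + 1)
raw = gbm.predict(X_test, raw_score=True)
np.testing.assert_allclose(contrib.sum(axis=1), raw, atol=1e-4)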
......@@ -783,10 +796,8 @@ class TestEngine(unittest.TestCase):
num_samples = 100
features = np.random.rand(num_samples, 5)
positive_samples = int(num_samples * 0.25)
labels = np.append(
np.ones(positive_samples, dtype=np.float32),
np.zeros(num_samples - positive_samples, dtype=np.float32),
)
labels = np.append(np.ones(positive_samples, dtype=np.float32),
np.zeros(num_samples - positive_samples, dtype=np.float32))
# test sliced labels
origin_pred = train_and_get_predictions(features, labels)
stacked_labels = np.column_stack((labels, np.ones(num_samples, dtype=np.float32)))
......@@ -816,12 +827,12 @@ class TestEngine(unittest.TestCase):
np.testing.assert_allclose(origin_pred, sliced_pred)
def test_init_with_subset(self):
data = np.random.random((500, 2))
y = [1] * 250 + [0] * 250
data = np.random.random((50, 2))
y = [1] * 25 + [0] * 25
lgb_train = lgb.Dataset(data, y, free_raw_data=False)
subset_index_1 = sorted(np.random.choice(np.arange(500), 300, replace=False))
subset_index_1 = np.random.choice(np.arange(50), 30, replace=False)
subset_data_1 = lgb_train.subset(subset_index_1)
subset_index_2 = sorted(np.random.choice(np.arange(500), 200, replace=False))
subset_index_2 = np.random.choice(np.arange(50), 20, replace=False)
subset_data_2 = lgb_train.subset(subset_index_2)
params = {
'objective': 'binary',
......@@ -835,9 +846,9 @@ class TestEngine(unittest.TestCase):
train_set=subset_data_2,
num_boost_round=10,
init_model=init_gbm)
self.assertEqual(lgb_train.get_data().shape[0], 500)
self.assertEqual(subset_data_1.get_data().shape[0], 300)
self.assertEqual(subset_data_2.get_data().shape[0], 200)
self.assertEqual(lgb_train.get_data().shape[0], 50)
self.assertEqual(subset_data_1.get_data().shape[0], 30)
self.assertEqual(subset_data_2.get_data().shape[0], 20)
lgb_train.save_binary("lgb_train_data.bin")
lgb_train_from_file = lgb.Dataset('lgb_train_data.bin', free_raw_data=False)
subset_data_3 = lgb_train_from_file.subset(subset_index_1)
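
save_binary writes the constructed Dataset in LightGBM's binary format; passing that path back to lgb.Dataset reloads it, as the lines above do. A minimal round-trip sketch:

import numpy as np
import lightgbm as lgb

ds = lgb.Dataset(np.random.random((50, 2)), label=np.random.random(50)).construct()
ds.save_binary('train.bin')
reloaded = lgb.Dataset('train.bin', free_raw_data=False).construct()
assert reloaded.num_data() == 50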
......@@ -876,7 +887,7 @@ class TestEngine(unittest.TestCase):
return False
return True
number_of_dpoints = 3000
number_of_dpoints = 2000
x1_positively_correlated_with_y = np.random.random(size=number_of_dpoints)
x2_negatively_correlated_with_y = np.random.random(size=number_of_dpoints)
x = np.column_stack((x1_positively_correlated_with_y, x2_negatively_correlated_with_y))
......@@ -931,11 +942,11 @@ class TestEngine(unittest.TestCase):
'verbose': -1,
'max_bin': 2}
lgb_x = lgb.Dataset(x, label=y)
est = lgb.train(params, lgb_x, num_boost_round=5)
lgb.train(params, lgb_x, num_boost_round=5)
x[0, 0] = np.nan
params['max_bin'] = 3
lgb_x = lgb.Dataset(x, label=y)
est = lgb.train(params, lgb_x, num_boost_round=5)
lgb.train(params, lgb_x, num_boost_round=5)
np.random.seed() # reset seed
def test_refit(self):
......@@ -1059,12 +1070,6 @@ class TestEngine(unittest.TestCase):
self.assertEqual(len(results['multi_logloss-mean']), 10)
def test_metrics(self):
def custom_obj(preds, train_data):
return np.zeros(preds.shape), np.zeros(preds.shape)
def custom_metric(preds, train_data):
return 'error', 0, False
X, y = load_digits(2, True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
lgb_train = lgb.Dataset(X_train, y_train, silent=True)
......@@ -1087,11 +1092,11 @@ class TestEngine(unittest.TestCase):
params_metric_none_verbose = {'metric': 'None', 'verbose': -1}
def get_cv_result(params=params_obj_verbose, **kwargs):
return lgb.cv(params, lgb_train, num_boost_round=5, verbose_eval=False, **kwargs)
return lgb.cv(params, lgb_train, num_boost_round=2, verbose_eval=False, **kwargs)
def train_booster(params=params_obj_verbose, **kwargs):
lgb.train(params, lgb_train,
num_boost_round=5,
num_boost_round=2,
valid_sets=[lgb_valid],
evals_result=evals_result,
verbose_eval=False, **kwargs)
......@@ -1145,32 +1150,32 @@ class TestEngine(unittest.TestCase):
# fobj, no feval
# no default metric
res = get_cv_result(params=params_verbose, fobj=custom_obj)
res = get_cv_result(params=params_verbose, fobj=dummy_obj)
self.assertEqual(len(res), 0)
# metric in params
res = get_cv_result(params=params_metric_err_verbose, fobj=custom_obj)
res = get_cv_result(params=params_metric_err_verbose, fobj=dummy_obj)
self.assertEqual(len(res), 2)
self.assertIn('binary_error-mean', res)
# metric in args
res = get_cv_result(params=params_verbose, fobj=custom_obj, metrics='binary_error')
res = get_cv_result(params=params_verbose, fobj=dummy_obj, metrics='binary_error')
self.assertEqual(len(res), 2)
self.assertIn('binary_error-mean', res)
# metric in args overwrites its alias in params
res = get_cv_result(params=params_metric_inv_verbose, fobj=custom_obj, metrics='binary_error')
res = get_cv_result(params=params_metric_inv_verbose, fobj=dummy_obj, metrics='binary_error')
self.assertEqual(len(res), 2)
self.assertIn('binary_error-mean', res)
# multiple metrics in params
res = get_cv_result(params=params_metric_multi_verbose, fobj=custom_obj)
res = get_cv_result(params=params_metric_multi_verbose, fobj=dummy_obj)
self.assertEqual(len(res), 4)
self.assertIn('binary_logloss-mean', res)
self.assertIn('binary_error-mean', res)
# multiple metrics in args
res = get_cv_result(params=params_verbose, fobj=custom_obj,
res = get_cv_result(params=params_verbose, fobj=dummy_obj,
metrics=['binary_logloss', 'binary_error'])
self.assertEqual(len(res), 4)
self.assertIn('binary_logloss-mean', res)
......@@ -1178,89 +1183,89 @@ class TestEngine(unittest.TestCase):
# no fobj, feval
# default metric with custom one
res = get_cv_result(feval=custom_metric)
res = get_cv_result(feval=constant_metric)
self.assertEqual(len(res), 4)
self.assertIn('binary_logloss-mean', res)
self.assertIn('error-mean', res)
# non-default metric in params with custom one
res = get_cv_result(params=params_obj_metric_err_verbose, feval=custom_metric)
res = get_cv_result(params=params_obj_metric_err_verbose, feval=constant_metric)
self.assertEqual(len(res), 4)
self.assertIn('binary_error-mean', res)
self.assertIn('error-mean', res)
# default metric in args with custom one
res = get_cv_result(metrics='binary_logloss', feval=custom_metric)
res = get_cv_result(metrics='binary_logloss', feval=constant_metric)
self.assertEqual(len(res), 4)
self.assertIn('binary_logloss-mean', res)
self.assertIn('error-mean', res)
# non-default metric in args with custom one
res = get_cv_result(metrics='binary_error', feval=custom_metric)
res = get_cv_result(metrics='binary_error', feval=constant_metric)
self.assertEqual(len(res), 4)
self.assertIn('binary_error-mean', res)
self.assertIn('error-mean', res)
# metric in args overwrites one in params, custom one is evaluated too
res = get_cv_result(params=params_obj_metric_inv_verbose, metrics='binary_error', feval=custom_metric)
res = get_cv_result(params=params_obj_metric_inv_verbose, metrics='binary_error', feval=constant_metric)
self.assertEqual(len(res), 4)
self.assertIn('binary_error-mean', res)
self.assertIn('error-mean', res)
# multiple metrics in params with custom one
res = get_cv_result(params=params_obj_metric_multi_verbose, feval=custom_metric)
res = get_cv_result(params=params_obj_metric_multi_verbose, feval=constant_metric)
self.assertEqual(len(res), 6)
self.assertIn('binary_logloss-mean', res)
self.assertIn('binary_error-mean', res)
self.assertIn('error-mean', res)
# multiple metrics in args with custom one
res = get_cv_result(metrics=['binary_logloss', 'binary_error'], feval=custom_metric)
res = get_cv_result(metrics=['binary_logloss', 'binary_error'], feval=constant_metric)
self.assertEqual(len(res), 6)
self.assertIn('binary_logloss-mean', res)
self.assertIn('binary_error-mean', res)
self.assertIn('error-mean', res)
# custom metric is evaluated despite 'None' being passed
res = get_cv_result(metrics=['None'], feval=custom_metric)
res = get_cv_result(metrics=['None'], feval=constant_metric)
self.assertEqual(len(res), 2)
self.assertIn('error-mean', res)
# fobj, feval
# no default metric, only custom one
res = get_cv_result(params=params_verbose, fobj=custom_obj, feval=custom_metric)
res = get_cv_result(params=params_verbose, fobj=dummy_obj, feval=constant_metric)
self.assertEqual(len(res), 2)
self.assertIn('error-mean', res)
# metric in params with custom one
res = get_cv_result(params=params_metric_err_verbose, fobj=custom_obj, feval=custom_metric)
res = get_cv_result(params=params_metric_err_verbose, fobj=dummy_obj, feval=constant_metric)
self.assertEqual(len(res), 4)
self.assertIn('binary_error-mean', res)
self.assertIn('error-mean', res)
# metric in args with custom one
res = get_cv_result(params=params_verbose, fobj=custom_obj,
feval=custom_metric, metrics='binary_error')
res = get_cv_result(params=params_verbose, fobj=dummy_obj,
feval=constant_metric, metrics='binary_error')
self.assertEqual(len(res), 4)
self.assertIn('binary_error-mean', res)
self.assertIn('error-mean', res)
# metric in args overwrites one in params, custom one is evaluated too
res = get_cv_result(params=params_metric_inv_verbose, fobj=custom_obj,
feval=custom_metric, metrics='binary_error')
res = get_cv_result(params=params_metric_inv_verbose, fobj=dummy_obj,
feval=constant_metric, metrics='binary_error')
self.assertEqual(len(res), 4)
self.assertIn('binary_error-mean', res)
self.assertIn('error-mean', res)
# multiple metrics in params with custom one
res = get_cv_result(params=params_metric_multi_verbose, fobj=custom_obj, feval=custom_metric)
res = get_cv_result(params=params_metric_multi_verbose, fobj=dummy_obj, feval=constant_metric)
self.assertEqual(len(res), 6)
self.assertIn('binary_logloss-mean', res)
self.assertIn('binary_error-mean', res)
self.assertIn('error-mean', res)
# multiple metrics in args with custom one
res = get_cv_result(params=params_verbose, fobj=custom_obj, feval=custom_metric,
res = get_cv_result(params=params_verbose, fobj=dummy_obj, feval=constant_metric,
metrics=['binary_logloss', 'binary_error'])
self.assertEqual(len(res), 6)
self.assertIn('binary_logloss-mean', res)
......@@ -1268,7 +1273,7 @@ class TestEngine(unittest.TestCase):
self.assertIn('error-mean', res)
# custom metric is evaluated despite 'None' being passed
res = get_cv_result(params=params_metric_none_verbose, fobj=custom_obj, feval=custom_metric)
res = get_cv_result(params=params_metric_none_verbose, fobj=dummy_obj, feval=constant_metric)
self.assertEqual(len(res), 2)
self.assertIn('error-mean', res)
......@@ -1302,72 +1307,72 @@ class TestEngine(unittest.TestCase):
# fobj, no feval
# no default metric
train_booster(params=params_verbose, fobj=custom_obj)
train_booster(params=params_verbose, fobj=dummy_obj)
self.assertEqual(len(evals_result), 0)
# metric in params
train_booster(params=params_metric_log_verbose, fobj=custom_obj)
train_booster(params=params_metric_log_verbose, fobj=dummy_obj)
self.assertEqual(len(evals_result['valid_0']), 1)
self.assertIn('binary_logloss', evals_result['valid_0'])
# multiple metrics in params
train_booster(params=params_metric_multi_verbose, fobj=custom_obj)
train_booster(params=params_metric_multi_verbose, fobj=dummy_obj)
self.assertEqual(len(evals_result['valid_0']), 2)
self.assertIn('binary_logloss', evals_result['valid_0'])
self.assertIn('binary_error', evals_result['valid_0'])
# no fobj, feval
# default metric with custom one
train_booster(feval=custom_metric)
train_booster(feval=constant_metric)
self.assertEqual(len(evals_result['valid_0']), 2)
self.assertIn('binary_logloss', evals_result['valid_0'])
self.assertIn('error', evals_result['valid_0'])
# default metric in params with custom one
train_booster(params=params_obj_metric_log_verbose, feval=custom_metric)
train_booster(params=params_obj_metric_log_verbose, feval=constant_metric)
self.assertEqual(len(evals_result['valid_0']), 2)
self.assertIn('binary_logloss', evals_result['valid_0'])
self.assertIn('error', evals_result['valid_0'])
# non-default metric in params with custom one
train_booster(params=params_obj_metric_err_verbose, feval=custom_metric)
train_booster(params=params_obj_metric_err_verbose, feval=constant_metric)
self.assertEqual(len(evals_result['valid_0']), 2)
self.assertIn('binary_error', evals_result['valid_0'])
self.assertIn('error', evals_result['valid_0'])
# multiple metrics in params with custom one
train_booster(params=params_obj_metric_multi_verbose, feval=custom_metric)
train_booster(params=params_obj_metric_multi_verbose, feval=constant_metric)
self.assertEqual(len(evals_result['valid_0']), 3)
self.assertIn('binary_logloss', evals_result['valid_0'])
self.assertIn('binary_error', evals_result['valid_0'])
self.assertIn('error', evals_result['valid_0'])
# custom metric is evaluated despite 'None' being passed
train_booster(params=params_obj_metric_none_verbose, feval=custom_metric)
train_booster(params=params_obj_metric_none_verbose, feval=constant_metric)
self.assertEqual(len(evals_result), 1)
self.assertIn('error', evals_result['valid_0'])
# fobj, feval
# no default metric, only custom one
train_booster(params=params_verbose, fobj=custom_obj, feval=custom_metric)
train_booster(params=params_verbose, fobj=dummy_obj, feval=constant_metric)
self.assertEqual(len(evals_result['valid_0']), 1)
self.assertIn('error', evals_result['valid_0'])
# metric in params with custom one
train_booster(params=params_metric_log_verbose, fobj=custom_obj, feval=custom_metric)
train_booster(params=params_metric_log_verbose, fobj=dummy_obj, feval=constant_metric)
self.assertEqual(len(evals_result['valid_0']), 2)
self.assertIn('binary_logloss', evals_result['valid_0'])
self.assertIn('error', evals_result['valid_0'])
# multiple metrics in params with custom one
train_booster(params=params_metric_multi_verbose, fobj=custom_obj, feval=custom_metric)
train_booster(params=params_metric_multi_verbose, fobj=dummy_obj, feval=constant_metric)
self.assertEqual(len(evals_result['valid_0']), 3)
self.assertIn('binary_logloss', evals_result['valid_0'])
self.assertIn('binary_error', evals_result['valid_0'])
self.assertIn('error', evals_result['valid_0'])
# custom metric is evaluated despite 'None' being passed
train_booster(params=params_metric_none_verbose, fobj=custom_obj, feval=custom_metric)
train_booster(params=params_metric_none_verbose, fobj=dummy_obj, feval=constant_metric)
self.assertEqual(len(evals_result), 1)
self.assertIn('error', evals_result['valid_0'])
......@@ -1384,25 +1389,25 @@ class TestEngine(unittest.TestCase):
self.assertEqual(len(res), 2)
self.assertIn('multi_logloss-mean', res)
# multiclass default metric with custom one
res = get_cv_result(params_obj_class_3_verbose, feval=custom_metric)
res = get_cv_result(params_obj_class_3_verbose, feval=constant_metric)
self.assertEqual(len(res), 4)
self.assertIn('multi_logloss-mean', res)
self.assertIn('error-mean', res)
# multiclass metric alias with custom one for custom objective
res = get_cv_result(params_obj_class_3_verbose, fobj=custom_obj, feval=custom_metric)
res = get_cv_result(params_obj_class_3_verbose, fobj=dummy_obj, feval=constant_metric)
self.assertEqual(len(res), 2)
self.assertIn('error-mean', res)
# no metric for invalid class_num
res = get_cv_result(params_obj_class_1_verbose, fobj=custom_obj)
res = get_cv_result(params_obj_class_1_verbose, fobj=dummy_obj)
self.assertEqual(len(res), 0)
# custom metric for invalid class_num
res = get_cv_result(params_obj_class_1_verbose, fobj=custom_obj, feval=custom_metric)
res = get_cv_result(params_obj_class_1_verbose, fobj=dummy_obj, feval=constant_metric)
self.assertEqual(len(res), 2)
self.assertIn('error-mean', res)
# multiclass metric alias with custom one with invalid class_num
self.assertRaises(lgb.basic.LightGBMError, get_cv_result,
params_obj_class_1_verbose, metrics=obj_multi_alias,
fobj=custom_obj, feval=custom_metric)
fobj=dummy_obj, feval=constant_metric)
# multiclass default metric without num_class
self.assertRaises(lgb.basic.LightGBMError, get_cv_result,
params_obj_verbose)
......@@ -1423,20 +1428,20 @@ class TestEngine(unittest.TestCase):
self.assertRaises(lgb.basic.LightGBMError, get_cv_result,
params_class_3_verbose)
# no metric with non-default num_class for custom objective
res = get_cv_result(params_class_3_verbose, fobj=custom_obj)
res = get_cv_result(params_class_3_verbose, fobj=dummy_obj)
self.assertEqual(len(res), 0)
for metric_multi_alias in obj_multi_aliases + ['multi_logloss']:
# multiclass metric alias for custom objective
res = get_cv_result(params_class_3_verbose, metrics=metric_multi_alias, fobj=custom_obj)
res = get_cv_result(params_class_3_verbose, metrics=metric_multi_alias, fobj=dummy_obj)
self.assertEqual(len(res), 2)
self.assertIn('multi_logloss-mean', res)
# multiclass metric for custom objective
res = get_cv_result(params_class_3_verbose, metrics='multi_error', fobj=custom_obj)
res = get_cv_result(params_class_3_verbose, metrics='multi_error', fobj=dummy_obj)
self.assertEqual(len(res), 2)
self.assertIn('multi_error-mean', res)
# binary metric with non-default num_class for custom objective
self.assertRaises(lgb.basic.LightGBMError, get_cv_result,
params_class_3_verbose, metrics='binary_error', fobj=custom_obj)
params_class_3_verbose, metrics='binary_error', fobj=dummy_obj)
@unittest.skipIf(psutil.virtual_memory().available / 1024 / 1024 / 1024 < 3, 'not enough RAM')
def test_model_size(self):
......@@ -1543,12 +1548,6 @@ class TestEngine(unittest.TestCase):
def test_early_stopping_for_only_first_metric(self):
def decreasing_metric(preds, train_data):
return ('decreasing_metric', next(decreasing_generator), False)
def constant_metric(preds, train_data):
return ('constant_metric', 0.0, False)
def metrics_combination_train_regression(valid_sets, metric_list, assumed_iteration,
first_metric_only, feval=None):
params = {
......@@ -1582,7 +1581,6 @@ class TestEngine(unittest.TestCase):
eval_train_metric=eval_train_metric)
self.assertEqual(assumed_iteration, len(ret[list(ret.keys())[0]]))
decreasing_generator = itertools.count(0, -1)
X, y = load_boston(True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_test1, X_test2, y_test1, y_test2 = train_test_split(X_test, y_test, test_size=0.5, random_state=73)
......@@ -1683,8 +1681,7 @@ class TestEngine(unittest.TestCase):
self.assertLess(ret, 0.13)
self.assertAlmostEqual(evals_result['valid_0']['binary_logloss'][-1], ret, places=5)
params['feature_fraction'] = 0.5
gbm2 = lgb.train(params, lgb_train,
num_boost_round=25)
gbm2 = lgb.train(params, lgb_train, num_boost_round=25)
ret2 = log_loss(y_test, gbm2.predict(X_test))
self.assertNotEqual(ret, ret2)
......@@ -1700,10 +1697,9 @@ class TestEngine(unittest.TestCase):
'forcedbins_filename': forcedbins_filename,
'num_leaves': 2,
'min_data_in_leaf': 1,
'verbose': -1,
'seed': 0}
'verbose': -1}
lgb_x = lgb.Dataset(x, label=y)
est = lgb.train(params, lgb_x, num_boost_round=100)
est = lgb.train(params, lgb_x, num_boost_round=20)
new_x = np.zeros((3, x.shape[1]))
new_x[:, 0] = [0.31, 0.37, 0.41]
new_x[:, 1] = [0, 0, 0]
......@@ -1715,16 +1711,16 @@ class TestEngine(unittest.TestCase):
self.assertEqual(len(np.unique(predicted)), 1)
params['forcedbins_filename'] = ''
lgb_x = lgb.Dataset(x, label=y)
est = lgb.train(params, lgb_x, num_boost_round=100)
est = lgb.train(params, lgb_x, num_boost_round=20)
predicted = est.predict(new_x)
self.assertEqual(len(np.unique(predicted)), 3)
params['forcedbins_filename'] = os.path.join(os.path.dirname(os.path.realpath(__file__)),
'../../examples/regression/forced_bins2.json')
params['max_bin'] = 11
lgb_x = lgb.Dataset(x[:, :1], label=y)
est = lgb.train(params, lgb_x, num_boost_round=100)
est = lgb.train(params, lgb_x, num_boost_round=50)
predicted = est.predict(x[1:, :1])
vals, counts = np.unique(predicted, return_counts=True)
_, counts = np.unique(predicted, return_counts=True)
self.assertGreaterEqual(min(counts), 9)
self.assertLessEqual(max(counts), 11)
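
forcedbins_filename points LightGBM at a JSON file listing per-feature forced bin upper bounds. A hedged sketch of generating such a file (schema assumed from the bundled forced_bins.json examples referenced above):

import json

forced_bins = [{"feature": 0, "bin_upper_bound": [0.3, 0.35, 0.4]}]  # assumed schema
with open('my_forced_bins.json', 'w') as f:
    json.dump(forced_bins, f)
params = {'forcedbins_filename': 'my_forced_bins.json', 'max_bin': 5, 'verbose': -1}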
......@@ -1741,7 +1737,7 @@ class TestEngine(unittest.TestCase):
'verbose': -1,
'seed': 0}
lgb_x = lgb.Dataset(x, label=y)
est = lgb.train(params, lgb_x, num_boost_round=100)
est = lgb.train(params, lgb_x, num_boost_round=20)
new_x = np.zeros((3, 2))
new_x[:, 0] = [-1, 0, 1]
predicted = est.predict(new_x)
......
......@@ -18,12 +18,7 @@ from sklearn.utils.estimator_checks import (_yield_all_checks, SkipTest,
check_parameters_default_constructible)
def multi_error(y_true, y_pred):
return np.mean(y_true != y_pred)
def multi_logloss(y_true, y_pred):
return np.mean([-math.log(y_pred[i][y]) for i, y in enumerate(y_true)])
decreasing_generator = itertools.count(0, -1)
def custom_asymmetric_obj(y_true, y_pred):
......@@ -33,10 +28,47 @@ def custom_asymmetric_obj(y_true, y_pred):
return grad, hess
def objective_ls(y_true, y_pred):
grad = (y_pred - y_true)
hess = np.ones(len(y_true))
return grad, hess
def logregobj(y_true, y_pred):
y_pred = 1.0 / (1.0 + np.exp(-y_pred))
grad = y_pred - y_true
hess = y_pred * (1.0 - y_pred)
return grad, hess
def custom_dummy_obj(y_true, y_pred):
return np.ones(y_true.shape), np.ones(y_true.shape)
def constant_metric(y_true, y_pred):
return 'error', 0, False
def decreasing_metric(y_true, y_pred):
return ('decreasing_metric', next(decreasing_generator), False)
def mse(y_true, y_pred):
return 'custom MSE', mean_squared_error(y_true, y_pred), False
def binary_error(y_true, y_pred):
return np.mean((y_pred > 0.5) != y_true)
def multi_error(y_true, y_pred):
return np.mean(y_true != y_pred)
def multi_logloss(y_true, y_pred):
return np.mean([-math.log(y_pred[i][y]) for i, y in enumerate(y_true)])
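
These module-level helpers follow the sklearn-wrapper convention: custom eval metrics take (y_true, y_pred) and return (name, value, is_higher_better). A hedged usage sketch with the mse helper above:

import numpy as np
import lightgbm as lgb

X, y = np.random.random((100, 3)), np.random.random(100)
reg = lgb.LGBMRegressor(n_estimators=5)
reg.fit(X, y, eval_set=[(X, y)], eval_metric=mse, verbose=False)
print(reg.evals_result_['training']['custom MSE'])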
class TestSklearn(unittest.TestCase):
def test_binary(self):
......@@ -45,7 +77,7 @@ class TestSklearn(unittest.TestCase):
gbm = lgb.LGBMClassifier(n_estimators=50, silent=True)
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False)
ret = log_loss(y_test, gbm.predict_proba(X_test))
self.assertLess(ret, 0.15)
self.assertLess(ret, 0.11)
self.assertAlmostEqual(ret, gbm.evals_result_['valid_0']['binary_logloss'][gbm.best_iteration_ - 1], places=5)
def test_regression(self):
......@@ -54,7 +86,7 @@ class TestSklearn(unittest.TestCase):
gbm = lgb.LGBMRegressor(n_estimators=50, silent=True)
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False)
ret = mean_squared_error(y_test, gbm.predict(X_test))
self.assertLess(ret, 16)
self.assertLess(ret, 7)
self.assertAlmostEqual(ret, gbm.evals_result_['valid_0']['l2'][gbm.best_iteration_ - 1], places=5)
def test_multiclass(self):
......@@ -63,8 +95,9 @@ class TestSklearn(unittest.TestCase):
gbm = lgb.LGBMClassifier(n_estimators=50, silent=True)
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False)
ret = multi_error(y_test, gbm.predict(X_test))
self.assertLess(ret, 0.2)
self.assertLess(ret, 0.05)
ret = multi_logloss(y_test, gbm.predict_proba(X_test))
self.assertLess(ret, 0.15)
self.assertAlmostEqual(ret, gbm.evals_result_['valid_0']['multi_logloss'][gbm.best_iteration_ - 1], places=5)
def test_lambdarank(self):
......@@ -76,76 +109,64 @@ class TestSklearn(unittest.TestCase):
'../../examples/lambdarank/rank.train.query'))
q_test = np.loadtxt(os.path.join(os.path.dirname(os.path.realpath(__file__)),
'../../examples/lambdarank/rank.test.query'))
gbm = lgb.LGBMRanker()
gbm = lgb.LGBMRanker(n_estimators=50)
gbm.fit(X_train, y_train, group=q_train, eval_set=[(X_test, y_test)],
eval_group=[q_test], eval_at=[1, 3], early_stopping_rounds=10, verbose=False,
callbacks=[lgb.reset_parameter(learning_rate=lambda x: max(0.01, 0.1 - 0.01 * x))])
self.assertLessEqual(gbm.best_iteration_, 25)
self.assertLessEqual(gbm.best_iteration_, 24)
self.assertGreater(gbm.best_score_['valid_0']['ndcg@1'], 0.6333)
self.assertGreater(gbm.best_score_['valid_0']['ndcg@3'], 0.6048)
def test_regression_with_custom_objective(self):
def objective_ls(y_true, y_pred):
grad = (y_pred - y_true)
hess = np.ones(len(y_true))
return grad, hess
X, y = load_boston(True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
gbm = lgb.LGBMRegressor(n_estimators=50, silent=True, objective=objective_ls)
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False)
ret = mean_squared_error(y_test, gbm.predict(X_test))
self.assertLess(ret, 100)
self.assertLess(ret, 7.0)
self.assertAlmostEqual(ret, gbm.evals_result_['valid_0']['l2'][gbm.best_iteration_ - 1], places=5)
def test_binary_classification_with_custom_objective(self):
def logregobj(y_true, y_pred):
y_pred = 1.0 / (1.0 + np.exp(-y_pred))
grad = y_pred - y_true
hess = y_pred * (1.0 - y_pred)
return grad, hess
def binary_error(y_test, y_pred):
return np.mean([int(p > 0.5) != y for y, p in zip(y_test, y_pred)])
X, y = load_digits(2, True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
gbm = lgb.LGBMClassifier(n_estimators=50, silent=True, objective=logregobj)
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False)
ret = binary_error(y_test, gbm.predict(X_test))
self.assertLess(ret, 0.1)
self.assertLess(ret, 0.05)
def test_dart(self):
X, y = load_boston(True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
gbm = lgb.LGBMRegressor(boosting_type='dart')
gbm = lgb.LGBMRegressor(boosting_type='dart', n_estimators=50)
gbm.fit(X_train, y_train)
self.assertLessEqual(gbm.score(X_train, y_train), 1.)
score = gbm.score(X_test, y_test)
self.assertGreaterEqual(score, 0.8)
self.assertLessEqual(score, 1.)
def test_grid_search(self):
X, y = load_boston(True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
params = {'boosting_type': ['dart', 'gbdt'],
'n_estimators': [5, 8],
'drop_rate': [0.05, 0.1]}
gbm = GridSearchCV(lgb.LGBMRegressor(), params, cv=3)
gbm.fit(X_train, y_train)
self.assertIn(gbm.best_params_['n_estimators'], [5, 8])
grid = GridSearchCV(lgb.LGBMRegressor(n_estimators=10), params, cv=3)
grid.fit(X, y)
self.assertIn(grid.best_params_['boosting_type'], ['dart', 'gbdt'])
self.assertIn(grid.best_params_['n_estimators'], [5, 8])
self.assertIn(grid.best_params_['drop_rate'], [0.05, 0.1])
self.assertLess(grid.best_score_, 0.3)
def test_clone_and_property(self):
X, y = load_boston(True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
gbm = lgb.LGBMRegressor(n_estimators=100, silent=True)
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=10, verbose=False)
gbm = lgb.LGBMRegressor(n_estimators=10, silent=True)
gbm.fit(X, y, verbose=False)
gbm_clone = clone(gbm)
self.assertIsInstance(gbm.booster_, lgb.Booster)
self.assertIsInstance(gbm.feature_importances_, np.ndarray)
X, y = load_digits(2, True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
clf = lgb.LGBMClassifier()
clf.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=10, verbose=False)
clf = lgb.LGBMClassifier(n_estimators=10, silent=True)
clf.fit(X, y, verbose=False)
self.assertListEqual(sorted(clf.classes_), [0, 1])
self.assertEqual(clf.n_classes_, 2)
self.assertIsInstance(clf.booster_, lgb.Booster)
......@@ -177,15 +198,15 @@ class TestSklearn(unittest.TestCase):
np.testing.assert_allclose(pred_origin, pred_pickle)
def test_feature_importances_single_leaf(self):
clf = lgb.LGBMClassifier(n_estimators=100)
data = load_iris()
clf = lgb.LGBMClassifier(n_estimators=10)
clf.fit(data.data, data.target)
importances = clf.feature_importances_
self.assertEqual(len(importances), 4)
def test_feature_importances_type(self):
clf = lgb.LGBMClassifier(n_estimators=100)
data = load_iris()
clf = lgb.LGBMClassifier(n_estimators=10)
clf.fit(data.data, data.target)
clf.set_params(importance_type='split')
importances_split = clf.feature_importances_
......@@ -237,21 +258,21 @@ class TestSklearn(unittest.TestCase):
X[cat_cols_actual] = X[cat_cols_actual].astype('category')
X_test[cat_cols_actual] = X_test[cat_cols_actual].astype('category')
cat_values = [X[col].cat.categories.tolist() for col in cat_cols_to_store]
gbm0 = lgb.sklearn.LGBMClassifier().fit(X, y)
gbm0 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y)
pred0 = gbm0.predict(X_test, raw_score=True)
pred_prob = gbm0.predict_proba(X_test)[:, 1]
gbm1 = lgb.sklearn.LGBMClassifier().fit(X, pd.Series(y), categorical_feature=[0])
gbm1 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, pd.Series(y), categorical_feature=[0])
pred1 = gbm1.predict(X_test, raw_score=True)
gbm2 = lgb.sklearn.LGBMClassifier().fit(X, y, categorical_feature=['A'])
gbm2 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y, categorical_feature=['A'])
pred2 = gbm2.predict(X_test, raw_score=True)
gbm3 = lgb.sklearn.LGBMClassifier().fit(X, y, categorical_feature=['A', 'B', 'C', 'D'])
gbm3 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y, categorical_feature=['A', 'B', 'C', 'D'])
pred3 = gbm3.predict(X_test, raw_score=True)
gbm3.booster_.save_model('categorical.model')
gbm4 = lgb.Booster(model_file='categorical.model')
pred4 = gbm4.predict(X_test)
gbm5 = lgb.sklearn.LGBMClassifier().fit(X, y, categorical_feature=['E'])
gbm5 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y, categorical_feature=['A', 'B', 'C', 'D', 'E'])
pred5 = gbm5.predict(X_test, raw_score=True)
gbm6 = lgb.sklearn.LGBMClassifier().fit(X, y, categorical_feature=[])
gbm6 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y, categorical_feature=[])
pred6 = gbm6.predict(X_test, raw_score=True)
self.assertRaises(AssertionError,
np.testing.assert_allclose,
......@@ -289,7 +310,7 @@ class TestSklearn(unittest.TestCase):
if pd.__version__ >= '0.24.0':
for dtype in pd.concat([X.dtypes, X_test.dtypes, pd.Series(y.dtypes)]):
self.assertTrue(pd.api.types.is_sparse(dtype))
gbm = lgb.sklearn.LGBMClassifier().fit(X, y)
gbm = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y)
pred_sparse = gbm.predict(X_test, raw_score=True)
if hasattr(X_test, 'sparse'):
pred_dense = gbm.predict(X_test.sparse.to_dense(), raw_score=True)
......@@ -298,6 +319,7 @@ class TestSklearn(unittest.TestCase):
np.testing.assert_allclose(pred_sparse, pred_dense)
def test_predict(self):
# With default params
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target,
test_size=0.2, random_state=42)
......@@ -356,14 +378,8 @@ class TestSklearn(unittest.TestCase):
self.assertIn('l2', gbm.evals_result_['valid_1'])
def test_metrics(self):
def custom_obj(y_true, y_pred):
return np.zeros(y_true.shape), np.zeros(y_true.shape)
def custom_metric(y_true, y_pred):
return 'error', 0, False
X, y = load_boston(True)
params = {'n_estimators': 5, 'verbose': -1}
params = {'n_estimators': 2, 'verbose': -1}
params_fit = {'X': X, 'y': y, 'eval_set': (X, y), 'verbose': False}
# no custom objective, no custom metric
......@@ -441,43 +457,43 @@ class TestSklearn(unittest.TestCase):
# custom objective, no custom metric
# default regression metric for custom objective
gbm = lgb.LGBMRegressor(objective=custom_obj, **params).fit(**params_fit)
gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, **params).fit(**params_fit)
self.assertEqual(len(gbm.evals_result_['training']), 1)
self.assertIn('l2', gbm.evals_result_['training'])
# non-default regression metric for custom objective
gbm = lgb.LGBMRegressor(objective=custom_obj, metric='mape', **params).fit(**params_fit)
gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric='mape', **params).fit(**params_fit)
self.assertEqual(len(gbm.evals_result_['training']), 1)
self.assertIn('mape', gbm.evals_result_['training'])
# multiple regression metrics for custom objective
gbm = lgb.LGBMRegressor(objective=custom_obj, metric=['l1', 'gamma'],
gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric=['l1', 'gamma'],
**params).fit(**params_fit)
self.assertEqual(len(gbm.evals_result_['training']), 2)
self.assertIn('l1', gbm.evals_result_['training'])
self.assertIn('gamma', gbm.evals_result_['training'])
# no metric
gbm = lgb.LGBMRegressor(objective=custom_obj, metric='None',
gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric='None',
**params).fit(**params_fit)
self.assertIs(gbm.evals_result_, None)
# default regression metric with non-default metric in eval_metric for custom objective
gbm = lgb.LGBMRegressor(objective=custom_obj,
gbm = lgb.LGBMRegressor(objective=custom_dummy_obj,
**params).fit(eval_metric='mape', **params_fit)
self.assertEqual(len(gbm.evals_result_['training']), 2)
self.assertIn('l2', gbm.evals_result_['training'])
self.assertIn('mape', gbm.evals_result_['training'])
# non-default regression metric with metric in eval_metric for custom objective
gbm = lgb.LGBMRegressor(objective=custom_obj, metric='mape',
gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric='mape',
**params).fit(eval_metric='gamma', **params_fit)
self.assertEqual(len(gbm.evals_result_['training']), 2)
self.assertIn('mape', gbm.evals_result_['training'])
self.assertIn('gamma', gbm.evals_result_['training'])
# multiple regression metrics with metric in eval_metric for custom objective
gbm = lgb.LGBMRegressor(objective=custom_obj, metric=['l1', 'gamma'],
gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric=['l1', 'gamma'],
**params).fit(eval_metric='l2', **params_fit)
self.assertEqual(len(gbm.evals_result_['training']), 3)
self.assertIn('l1', gbm.evals_result_['training'])
......@@ -485,7 +501,7 @@ class TestSklearn(unittest.TestCase):
self.assertIn('l2', gbm.evals_result_['training'])
# multiple regression metrics with multiple metrics in eval_metric for custom objective
gbm = lgb.LGBMRegressor(objective=custom_obj, metric=['l1', 'gamma'],
gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric=['l1', 'gamma'],
**params).fit(eval_metric=['l2', 'mape'], **params_fit)
self.assertEqual(len(gbm.evals_result_['training']), 4)
self.assertIn('l1', gbm.evals_result_['training'])
......@@ -495,21 +511,21 @@ class TestSklearn(unittest.TestCase):
# no custom objective, custom metric
# default metric with custom metric
gbm = lgb.LGBMRegressor(**params).fit(eval_metric=custom_metric, **params_fit)
gbm = lgb.LGBMRegressor(**params).fit(eval_metric=constant_metric, **params_fit)
self.assertEqual(len(gbm.evals_result_['training']), 2)
self.assertIn('l2', gbm.evals_result_['training'])
self.assertIn('error', gbm.evals_result_['training'])
# non-default metric with custom metric
gbm = lgb.LGBMRegressor(metric='mape',
**params).fit(eval_metric=custom_metric, **params_fit)
**params).fit(eval_metric=constant_metric, **params_fit)
self.assertEqual(len(gbm.evals_result_['training']), 2)
self.assertIn('mape', gbm.evals_result_['training'])
self.assertIn('error', gbm.evals_result_['training'])
# multiple metrics with custom metric
gbm = lgb.LGBMRegressor(metric=['l1', 'gamma'],
**params).fit(eval_metric=custom_metric, **params_fit)
**params).fit(eval_metric=constant_metric, **params_fit)
self.assertEqual(len(gbm.evals_result_['training']), 3)
self.assertIn('l1', gbm.evals_result_['training'])
self.assertIn('gamma', gbm.evals_result_['training'])
......@@ -517,27 +533,27 @@ class TestSklearn(unittest.TestCase):
# custom metric (disable default metric)
gbm = lgb.LGBMRegressor(metric='None',
**params).fit(eval_metric=custom_metric, **params_fit)
**params).fit(eval_metric=constant_metric, **params_fit)
self.assertEqual(len(gbm.evals_result_['training']), 1)
self.assertIn('error', gbm.evals_result_['training'])
# default metric for non-default objective with custom metric
gbm = lgb.LGBMRegressor(objective='regression_l1',
**params).fit(eval_metric=custom_metric, **params_fit)
**params).fit(eval_metric=constant_metric, **params_fit)
self.assertEqual(len(gbm.evals_result_['training']), 2)
self.assertIn('l1', gbm.evals_result_['training'])
self.assertIn('error', gbm.evals_result_['training'])
# non-default metric for non-default objective with custom metric
gbm = lgb.LGBMRegressor(objective='regression_l1', metric='mape',
**params).fit(eval_metric=custom_metric, **params_fit)
**params).fit(eval_metric=constant_metric, **params_fit)
self.assertEqual(len(gbm.evals_result_['training']), 2)
self.assertIn('mape', gbm.evals_result_['training'])
self.assertIn('error', gbm.evals_result_['training'])
# multiple metrics for non-default objective with custom metric
gbm = lgb.LGBMRegressor(objective='regression_l1', metric=['l1', 'gamma'],
**params).fit(eval_metric=custom_metric, **params_fit)
**params).fit(eval_metric=constant_metric, **params_fit)
self.assertEqual(len(gbm.evals_result_['training']), 3)
self.assertIn('l1', gbm.evals_result_['training'])
self.assertIn('gamma', gbm.evals_result_['training'])
......@@ -545,27 +561,27 @@ class TestSklearn(unittest.TestCase):
# custom metric (disable default metric for non-default objective)
gbm = lgb.LGBMRegressor(objective='regression_l1', metric='None',
**params).fit(eval_metric=custom_metric, **params_fit)
**params).fit(eval_metric=constant_metric, **params_fit)
self.assertEqual(len(gbm.evals_result_['training']), 1)
self.assertIn('error', gbm.evals_result_['training'])
# custom objective, custom metric
# custom metric for custom objective
gbm = lgb.LGBMRegressor(objective=custom_obj,
**params).fit(eval_metric=custom_metric, **params_fit)
gbm = lgb.LGBMRegressor(objective=custom_dummy_obj,
**params).fit(eval_metric=constant_metric, **params_fit)
self.assertEqual(len(gbm.evals_result_['training']), 1)
self.assertIn('error', gbm.evals_result_['training'])
# non-default regression metric with custom metric for custom objective
gbm = lgb.LGBMRegressor(objective=custom_obj, metric='mape',
**params).fit(eval_metric=custom_metric, **params_fit)
gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric='mape',
**params).fit(eval_metric=constant_metric, **params_fit)
self.assertEqual(len(gbm.evals_result_['training']), 2)
self.assertIn('mape', gbm.evals_result_['training'])
self.assertIn('error', gbm.evals_result_['training'])
# multiple regression metrics with custom metric for custom objective
gbm = lgb.LGBMRegressor(objective=custom_obj, metric=['l2', 'mape'],
**params).fit(eval_metric=custom_metric, **params_fit)
gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric=['l2', 'mape'],
**params).fit(eval_metric=constant_metric, **params_fit)
self.assertEqual(len(gbm.evals_result_['training']), 3)
self.assertIn('l2', gbm.evals_result_['training'])
self.assertIn('mape', gbm.evals_result_['training'])
......@@ -608,13 +624,13 @@ class TestSklearn(unittest.TestCase):
self.assertIn('binary_error', gbm.evals_result_['training'])
# invalid multiclass metric is replaced with binary alternative for custom objective
gbm = lgb.LGBMClassifier(objective=custom_obj,
gbm = lgb.LGBMClassifier(objective=custom_dummy_obj,
**params).fit(eval_metric='multi_logloss', **params_fit)
self.assertEqual(len(gbm.evals_result_['training']), 1)
self.assertIn('binary_logloss', gbm.evals_result_['training'])
def test_inf_handle(self):
nrows = 1000
nrows = 100
ncols = 10
X = np.random.randn(nrows, ncols)
y = np.random.randn(nrows) + np.full(nrows, 1e30)
......@@ -626,7 +642,7 @@ class TestSklearn(unittest.TestCase):
np.testing.assert_allclose(gbm.evals_result_['training']['l2'], np.inf)
def test_nan_handle(self):
nrows = 1000
nrows = 100
ncols = 10
X = np.random.randn(nrows, ncols)
y = np.random.randn(nrows) + np.full(nrows, 1e30)
......@@ -639,12 +655,6 @@ class TestSklearn(unittest.TestCase):
def test_first_metric_only(self):
def decreasing_metric(y_true, y_pred):
return ('decreasing_metric', next(decreasing_generator), False)
def constant_metric(y_true, y_pred):
return ('constant_metric', 0.0, False)
def fit_and_check(eval_set_names, metric_names, assumed_iteration, first_metric_only):
params['first_metric_only'] = first_metric_only
gbm = lgb.LGBMRegressor(**params).fit(**params_fit)
......@@ -660,10 +670,9 @@ class TestSklearn(unittest.TestCase):
if eval_set_name != 'training'
and assumed_iteration != gbm.n_estimators else 0)
self.assertEqual(expected, actual)
self.assertEqual(assumed_iteration if eval_set_name != 'training' else params['n_estimators'],
self.assertEqual(assumed_iteration if eval_set_name != 'training' else gbm.n_estimators,
gbm.best_iteration_)
decreasing_generator = itertools.count(0, -1)
X, y = load_boston(True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_test1, X_test2, y_test1, y_test2 = train_test_split(X_test, y_test, test_size=0.5, random_state=72)
......@@ -697,11 +706,11 @@ class TestSklearn(unittest.TestCase):
params_fit['eval_metric'] = lambda preds, train_data: [decreasing_metric(preds, train_data),
constant_metric(preds, train_data)]
params_fit['eval_set'] = (X_test1, y_test1)
fit_and_check(['valid_0'], ['decreasing_metric', 'constant_metric'], 1, False)
fit_and_check(['valid_0'], ['decreasing_metric', 'constant_metric'], 30, True)
fit_and_check(['valid_0'], ['decreasing_metric', 'error'], 1, False)
fit_and_check(['valid_0'], ['decreasing_metric', 'error'], 30, True)
params_fit['eval_metric'] = lambda preds, train_data: [constant_metric(preds, train_data),
decreasing_metric(preds, train_data)]
fit_and_check(['valid_0'], ['decreasing_metric', 'constant_metric'], 1, True)
fit_and_check(['valid_0'], ['decreasing_metric', 'error'], 1, True)
# single eval_set
params.pop('metric')
......