Unverified Commit 9bacf03c authored by Thomas J. Fan's avatar Thomas J. Fan Committed by GitHub
Browse files

[python][tests] Migrates test_basic.py to use pytest (#3764)

* TST Migrates test_basic.py to use pytest

* STY Linting

* CI Force CI to run
parent 53639f4a
# coding: utf-8
import os
import tempfile

import lightgbm as lgb
import numpy as np
import pytest
from scipy import sparse
from sklearn.datasets import dump_svmlight_file, load_svmlight_file
from sklearn.model_selection import train_test_split

from .utils import load_breast_cancer
def test_basic(tmp_path):
    """Train a small binary-classification Booster and check core Booster API behavior.

    Covers: training/eval iteration counts, bound queries, model save/load
    round-trips (matrix vs. svmlight file vs. reloaded model file), prediction
    early stopping, and shape validation on predict inputs.
    """
    X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(return_X_y=True),
                                                        test_size=0.1, random_state=2)
    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = train_data.create_valid(X_test, label=y_test)

    params = {
        "objective": "binary",
        "metric": "auc",
        "min_data": 10,
        "num_leaves": 15,
        "verbose": -1,
        "num_threads": 1,
        "max_bin": 255,
        "gpu_use_dp": True
    }
    bst = lgb.Booster(params, train_data)
    bst.add_valid(valid_data, "valid_1")

    for i in range(20):
        bst.update()
        if i % 10 == 0:
            print(bst.eval_train(), bst.eval_valid())

    assert bst.current_iteration() == 20
    assert bst.num_trees() == 20
    assert bst.num_model_per_iteration() == 1
    assert bst.lower_bound() == pytest.approx(-2.9040190126976606)
    assert bst.upper_bound() == pytest.approx(3.3182142872462883)

    # tmp_path is a pytest fixture; files written here are cleaned up automatically.
    tname = str(tmp_path / "svm_light.dat")
    model_file = str(tmp_path / "model.txt")

    bst.save_model(model_file)
    pred_from_matr = bst.predict(X_test)
    with open(tname, "w+b") as f:
        dump_svmlight_file(X_test, y_test, f)
    pred_from_file = bst.predict(tname)
    np.testing.assert_allclose(pred_from_matr, pred_from_file)

    # check saved model persistence
    bst = lgb.Booster(params, model_file=model_file)
    pred_from_model_file = bst.predict(X_test)
    # we need to check the consistency of model file here, so test for exact equal
    np.testing.assert_array_equal(pred_from_matr, pred_from_model_file)

    # check early stopping is working. Make it stop very early, so the scores should be very close to zero
    pred_parameter = {"pred_early_stop": True, "pred_early_stop_freq": 5, "pred_early_stop_margin": 1.5}
    pred_early_stopping = bst.predict(X_test, **pred_parameter)
    # scores likely to be different, but prediction should still be the same
    np.testing.assert_array_equal(np.sign(pred_from_matr), np.sign(pred_early_stopping))

    # test that shape is checked during prediction
    bad_X_test = X_test[:, 1:]
    bad_shape_error_msg = "The number of features in data*"
    np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg,
                                   bst.predict, bad_X_test)
    np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg,
                                   bst.predict, sparse.csr_matrix(bad_X_test))
    np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg,
                                   bst.predict, sparse.csc_matrix(bad_X_test))
    with open(tname, "w+b") as f:
        dump_svmlight_file(bad_X_test, y_test, f)
    np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg,
                                   bst.predict, tname)
    with open(tname, "w+b") as f:
        # zero_based=False shifts feature indices by one, which changes the
        # apparent feature count and must also be rejected.
        dump_svmlight_file(X_test, y_test, f, zero_based=False)
    np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg,
                                   bst.predict, tname)
def test_chunked_dataset():
    """Dataset construction must accept data supplied as a list of row chunks."""
    X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(return_X_y=True), test_size=0.1,
                                                        random_state=2)

    # Split the matrices into ~10 row-wise chunks; LightGBM should reassemble them.
    chunk_size = X_train.shape[0] // 10 + 1
    X_train = [X_train[i * chunk_size:(i + 1) * chunk_size, :] for i in range(X_train.shape[0] // chunk_size + 1)]
    X_test = [X_test[i * chunk_size:(i + 1) * chunk_size, :] for i in range(X_test.shape[0] // chunk_size + 1)]

    train_data = lgb.Dataset(X_train, label=y_train, params={"bin_construct_sample_cnt": 100})
    valid_data = train_data.create_valid(X_test, label=y_test, params={"bin_construct_sample_cnt": 100})
    train_data.construct()
    valid_data.construct()
def test_chunked_dataset_linear():
    """Chunked Dataset construction must also work with ``linear_tree`` enabled."""
    X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(return_X_y=True), test_size=0.1,
                                                        random_state=2)
    chunk_size = X_train.shape[0] // 10 + 1
    X_train = [X_train[i * chunk_size:(i + 1) * chunk_size, :] for i in range(X_train.shape[0] // chunk_size + 1)]
    X_test = [X_test[i * chunk_size:(i + 1) * chunk_size, :] for i in range(X_test.shape[0] // chunk_size + 1)]
    params = {"bin_construct_sample_cnt": 100, 'linear_tree': True}
    train_data = lgb.Dataset(X_train, label=y_train, params=params)
    valid_data = train_data.create_valid(X_test, label=y_test, params=params)
    train_data.construct()
    valid_data.construct()
def test_save_and_load_linear(tmp_path):
    """Linear-tree models must survive binary-Dataset and text-model round trips.

    Trains on raw data, on a Dataset reloaded from ``save_binary``, and from a
    Booster reloaded from ``save_model``; all three must predict identically.
    """
    X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(return_X_y=True), test_size=0.1,
                                                        random_state=2)
    # Prepend a binary column treated as categorical (categorical_feature=[0]).
    X_train = np.concatenate([np.ones((X_train.shape[0], 1)), X_train], 1)
    X_train[:X_train.shape[0] // 2, 0] = 0
    y_train[:X_train.shape[0] // 2] = 1
    params = {'linear_tree': True}
    train_data_1 = lgb.Dataset(X_train, label=y_train, params=params)
    est_1 = lgb.train(params, train_data_1, num_boost_round=10, categorical_feature=[0])
    pred_1 = est_1.predict(X_train)

    tmp_dataset = str(tmp_path / 'temp_dataset.bin')
    train_data_1.save_binary(tmp_dataset)
    train_data_2 = lgb.Dataset(tmp_dataset)
    est_2 = lgb.train(params, train_data_2, num_boost_round=10)
    pred_2 = est_2.predict(X_train)
    np.testing.assert_allclose(pred_1, pred_2)

    model_file = str(tmp_path / 'model.txt')
    est_2.save_model(model_file)
    est_3 = lgb.Booster(model_file=model_file)
    pred_3 = est_3.predict(X_train)
    np.testing.assert_allclose(pred_2, pred_3)
def test_subset_group():
    """Subsetting a ranking Dataset must recompute query-group boundaries.

    Uses the bundled lambdarank example data (201 query groups); taking the
    first 10 rows must yield exactly two groups of sizes 1 and 9.
    """
    X_train, y_train = load_svmlight_file(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                                       '../../examples/lambdarank/rank.train'))
    q_train = np.loadtxt(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                      '../../examples/lambdarank/rank.train.query'))
    lgb_train = lgb.Dataset(X_train, y_train, group=q_train)
    assert len(lgb_train.get_group()) == 201
    subset = lgb_train.subset(list(range(10))).construct()
    subset_group = subset.get_group()
    assert len(subset_group) == 2
    assert subset_group[0] == 1
    assert subset_group[1] == 9
def test_add_features_throws_if_num_data_unequal():
    """``add_features_from`` must reject Datasets with different row counts."""
    X1 = np.random.random((100, 1))
    X2 = np.random.random((10, 1))
    d1 = lgb.Dataset(X1).construct()
    d2 = lgb.Dataset(X2).construct()
    with pytest.raises(lgb.basic.LightGBMError):
        d1.add_features_from(d2)
def test_add_features_throws_if_datasets_unconstructed():
    """``add_features_from`` must raise ValueError unless BOTH Datasets are constructed."""
    X1 = np.random.random((100, 1))
    X2 = np.random.random((100, 1))
    # neither side constructed
    with pytest.raises(ValueError):
        d1 = lgb.Dataset(X1)
        d2 = lgb.Dataset(X2)
        d1.add_features_from(d2)
    # only the target constructed
    with pytest.raises(ValueError):
        d1 = lgb.Dataset(X1).construct()
        d2 = lgb.Dataset(X2)
        d1.add_features_from(d2)
    # only the source constructed
    with pytest.raises(ValueError):
        d1 = lgb.Dataset(X1)
        d2 = lgb.Dataset(X2).construct()
        d1.add_features_from(d2)
def test_add_features_equal_data_on_alternating_used_unused(tmp_path):
    """Splitting features, re-merging with ``add_features_from``, and dumping to
    text must produce the same dump as the unsplit Dataset.

    Columns 1 and 3 are zeroed so the check covers both used and unused features.
    """
    X = np.random.random((100, 5))
    X[:, [1, 3]] = 0
    names = ['col_%d' % i for i in range(5)]
    for j in range(1, 5):
        d1 = lgb.Dataset(X[:, :j], feature_name=names[:j]).construct()
        d2 = lgb.Dataset(X[:, j:], feature_name=names[j:]).construct()
        d1.add_features_from(d2)
        d1name = str(tmp_path / "d1.txt")
        d1._dump_text(d1name)
        d = lgb.Dataset(X, feature_name=names).construct()
        dname = str(tmp_path / "d.txt")
        d._dump_text(dname)
        with open(d1name, 'rt') as d1f:
            d1txt = d1f.read()
        with open(dname, 'rt') as df:
            dtxt = df.read()
        assert dtxt == d1txt
def test_add_features_same_booster_behaviour(tmp_path):
    """Training on a merged Dataset (via ``add_features_from``) must produce a
    model identical to training on the equivalent unsplit Dataset.

    Compared via the saved text-model files after 10 boosting rounds at every
    split point j.
    """
    X = np.random.random((100, 5))
    X[:, [1, 3]] = 0
    names = ['col_%d' % i for i in range(5)]
    for j in range(1, 5):
        d1 = lgb.Dataset(X[:, :j], feature_name=names[:j]).construct()
        d2 = lgb.Dataset(X[:, j:], feature_name=names[j:]).construct()
        d1.add_features_from(d2)
        d = lgb.Dataset(X, feature_name=names).construct()
        y = np.random.random(100)
        d1.set_label(y)
        d.set_label(y)
        b1 = lgb.Booster(train_set=d1)
        b = lgb.Booster(train_set=d)
        for k in range(10):
            b.update()
            b1.update()
        dname = str(tmp_path / "d.txt")
        d1name = str(tmp_path / "d1.txt")
        b1.save_model(d1name)
        b.save_model(dname)
        with open(dname, 'rt') as df:
            dtxt = df.read()
        with open(d1name, 'rt') as d1f:
            d1txt = d1f.read()
        assert dtxt == d1txt
def test_add_features_from_different_sources():
    """``add_features_from`` across ndarray / CSR / DataFrame sources.

    Checks raw-data handling (``data`` becomes None with free_raw_data=True or
    immergeable types), type preservation of merged raw data, resulting shape,
    and the ``D{idx}_{name}`` renaming of merged feature names.
    """
    pd = pytest.importorskip("pandas")  # skip cleanly when pandas is absent
    n_row = 100
    n_col = 5
    X = np.random.random((n_row, n_col))
    xxs = [X, sparse.csr_matrix(X), pd.DataFrame(X)]
    names = ['col_%d' % i for i in range(n_col)]
    for x_1 in xxs:
        # test that method works even with free_raw_data=True
        d1 = lgb.Dataset(x_1, feature_name=names, free_raw_data=True).construct()
        d2 = lgb.Dataset(x_1, feature_name=names, free_raw_data=True).construct()
        d1.add_features_from(d2)
        assert d1.data is None

        # test that method works but sets raw data to None in case of immergeable data types
        d1 = lgb.Dataset(x_1, feature_name=names, free_raw_data=False).construct()
        d2 = lgb.Dataset([X[:n_row // 2, :], X[n_row // 2:, :]],
                         feature_name=names, free_raw_data=False).construct()
        d1.add_features_from(d2)
        assert d1.data is None

        # test that method works for different data types
        d1 = lgb.Dataset(x_1, feature_name=names, free_raw_data=False).construct()
        res_feature_names = [name for name in names]
        for idx, x_2 in enumerate(xxs, 2):
            original_type = type(d1.get_data())
            d2 = lgb.Dataset(x_2, feature_name=names, free_raw_data=False).construct()
            d1.add_features_from(d2)
            assert isinstance(d1.get_data(), original_type)
            assert d1.get_data().shape == (n_row, n_col * idx)
            res_feature_names += ['D{}_{}'.format(idx, name) for name in names]
        assert d1.feature_name == res_feature_names
def test_cegb_affects_behavior(tmp_path):
    """Harsh CEGB penalties must change the trained model vs. an unpenalized baseline.

    Compared via saved text-model files. Uses the ``tmp_path`` fixture for all
    file paths (the previous ``tempfile.NamedTemporaryFile``-name trick reused a
    closed temp file's name, which is unreliable on Windows).
    """
    X = np.random.random((100, 5))
    X[:, [1, 3]] = 0
    y = np.random.random(100)
    names = ['col_%d' % i for i in range(5)]
    ds = lgb.Dataset(X, feature_name=names).construct()
    ds.set_label(y)
    base = lgb.Booster(train_set=ds)
    for k in range(10):
        base.update()
    basename = str(tmp_path / "basename.txt")
    base.save_model(basename)
    with open(basename, 'rt') as f:
        basetxt = f.read()
    # Set extremely harsh penalties, so CEGB will block most splits.
    cases = [{'cegb_penalty_feature_coupled': [50, 100, 10, 25, 30]},
             {'cegb_penalty_feature_lazy': [1, 2, 3, 4, 5]},
             {'cegb_penalty_split': 1}]
    for case in cases:
        booster = lgb.Booster(train_set=ds, params=case)
        for k in range(10):
            booster.update()
        casename = str(tmp_path / "casename.txt")
        booster.save_model(casename)
        with open(casename, 'rt') as f:
            casetxt = f.read()
        assert basetxt != casetxt
def test_cegb_scaling_equalities(tmp_path):
    """Halving CEGB penalties while doubling ``cegb_tradeoff`` (and vice versa)
    must yield byte-identical trained models."""
    X = np.random.random((100, 5))
    X[:, [1, 3]] = 0
    y = np.random.random(100)
    names = ['col_%d' % i for i in range(5)]
    ds = lgb.Dataset(X, feature_name=names).construct()
    ds.set_label(y)
    # Compare pairs of penalties, to ensure scaling works as intended
    pairs = [({'cegb_penalty_feature_coupled': [1, 2, 1, 2, 1]},
              {'cegb_penalty_feature_coupled': [0.5, 1, 0.5, 1, 0.5], 'cegb_tradeoff': 2}),
             ({'cegb_penalty_feature_lazy': [0.01, 0.02, 0.03, 0.04, 0.05]},
              {'cegb_penalty_feature_lazy': [0.005, 0.01, 0.015, 0.02, 0.025], 'cegb_tradeoff': 2}),
             ({'cegb_penalty_split': 1},
              {'cegb_penalty_split': 2, 'cegb_tradeoff': 0.5})]
    for (p1, p2) in pairs:
        booster1 = lgb.Booster(train_set=ds, params=p1)
        booster2 = lgb.Booster(train_set=ds, params=p2)
        for k in range(10):
            booster1.update()
            booster2.update()
        p1name = str(tmp_path / "p1.txt")
        # Reset booster1's parameters to p2, so the parameter section of the file matches.
        booster1.reset_parameter(p2)
        booster1.save_model(p1name)
        with open(p1name, 'rt') as f:
            p1txt = f.read()
        p2name = str(tmp_path / "p2.txt")
        booster2.save_model(p2name)
        with open(p2name, 'rt') as f:
            p2txt = f.read()
        assert p1txt == p2txt
def test_consistent_state_for_dataset_fields():
    """Dataset label/weight/init_score/feature_name must stay consistent whether
    set at construction time or via the ``set_*`` mutators.

    The sentinel vector carries NaN at index 0 and inf at index 1; LightGBM is
    expected to sanitize both when storing the fields.
    """
    def check_asserts(data):
        # attribute, getter, and raw-field views must all agree
        np.testing.assert_allclose(data.label, data.get_label())
        np.testing.assert_allclose(data.label, data.get_field('label'))
        assert not np.isnan(data.label[0])
        assert not np.isinf(data.label[1])
        np.testing.assert_allclose(data.weight, data.get_weight())
        np.testing.assert_allclose(data.weight, data.get_field('weight'))
        assert not np.isnan(data.weight[0])
        assert not np.isinf(data.weight[1])
        np.testing.assert_allclose(data.init_score, data.get_init_score())
        np.testing.assert_allclose(data.init_score, data.get_field('init_score'))
        assert not np.isnan(data.init_score[0])
        assert not np.isinf(data.init_score[1])
        assert np.all(np.isclose([data.label[0], data.weight[0], data.init_score[0]],
                                 data.label[0]))
        assert data.label[1] == pytest.approx(data.weight[1])
        assert data.feature_name == data.get_feature_name()

    X, y = load_breast_cancer(return_X_y=True)
    sequence = np.ones(y.shape[0])
    sequence[0] = np.nan
    sequence[1] = np.inf
    feature_names = ['f{0}'.format(i) for i in range(X.shape[1])]
    lgb_data = lgb.Dataset(X, sequence,
                           weight=sequence, init_score=sequence,
                           feature_name=feature_names).construct()
    check_asserts(lgb_data)
    lgb_data = lgb.Dataset(X, y).construct()
    lgb_data.set_label(sequence)
    lgb_data.set_weight(sequence)
    lgb_data.set_init_score(sequence)
    lgb_data.set_feature_name(feature_names)
    check_asserts(lgb_data)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment